Example #1
    def start_requests(self):
        """
            default Scrapy method to send requests
        """

        # if spider already active
        if self.settings['active'] == 'T':
            log.msg('[OVERLAP] - at %s EST' % (datetime.now(timezone('US/Eastern')).strftime("%Y-%m-%d %H:%M:%S")), level=log.INFO)
            # Close the spider
            raise exceptions.CloseSpider('Recon Spider already active')

        # Set spider is activating
        ReconSpiderSettings(self.site).write_active('T')

        log.msg('[START_ID] - %s at %s EST' % (str(self.settings['recon_startid']), datetime.now(timezone('US/Eastern'))
                .strftime("%Y-%m-%d %H:%M:%S")), level=log.INFO)
        log.msg('[CYCLES] - %s at %s EST' % (
            str(self.settings['cycles']), datetime.now(timezone('US/Eastern')).strftime("%Y-%m-%d %H:%M:%S")), level=log.INFO)

        # a valid recon_startid is required; if it is missing, close the spider
        if self.settings['recon_startid'] == -1:
            # Close the spider and ask for an initial start_id
            raise exceptions.CloseSpider('Provide a start_id value via the start_id parameter for initializing')

        # Generate ids list for reconnoitering
        url_ids = generate_ids(self.site)
        
        # Send URL requests
        for id in url_ids:
            req = Request("".join([self.base_url, str(id)]), dont_filter=True, callback=self.parse)
            # save url_id for calling back
            req.meta['url_id'] = id
            yield req
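
This snippet is written against the old `scrapy.log` module, which has since been removed. As a point of comparison only, here is a minimal sketch of the same "already active" guard using the current Scrapy API; the `SPIDER_ACTIVE` flag and the id list are hypothetical stand-ins for the settings-backed state used above:

    import logging
    from datetime import datetime

    import scrapy
    from pytz import timezone
    from scrapy.exceptions import CloseSpider

    logger = logging.getLogger(__name__)
    SPIDER_ACTIVE = False  # hypothetical stand-in for settings['active']

    class ReconSpider(scrapy.Spider):
        name = 'recon'
        base_url = 'http://example.com/item/'  # placeholder

        def start_requests(self):
            # refuse to run two overlapping crawls, as in the snippet above
            if SPIDER_ACTIVE:
                logger.info('[OVERLAP] - at %s EST', datetime.now(
                    timezone('US/Eastern')).strftime('%Y-%m-%d %H:%M:%S'))
                raise CloseSpider('Recon Spider already active')
            for url_id in (1, 2, 3):  # placeholder id list
                request = scrapy.Request(self.base_url + str(url_id),
                                         dont_filter=True,
                                         callback=self.parse)
                request.meta['url_id'] = url_id
                yield request

        def parse(self, response):
            pass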
Example #2
    def start_requests(self):
        """
            default Scrapy method to send requests
        """

        # if spider already active
        if self.settings['active'] == 'T':
            log.msg('[OVERLAP] - at %s EST' % (datetime.now(
                timezone('US/Eastern')).strftime("%Y-%m-%d %H:%M:%S")),
                    level=log.INFO)
            # Close the spider
            raise exceptions.CloseSpider('Main Spider already active')

        # Set spider is activating
        MainSpiderSettings(self.site).write_active('T')

        # A recon_startid is required; close the spider if it is missing
        if self.settings['recon_startid'] == -1:
            raise exceptions.CloseSpider(
                'Run the Recon Spider to look for new ids before running the Main Spider.'
            )

        # If main_startid is not set yet
        if self.settings['main_startid'] == -1:
            # set a temporary start_id
            self.settings['main_startid'] = self.settings[
                'recon_startid'] - self.settings['block_size']

        log.msg('[START_ID] - %s at %s EST' % (str(
            self.settings['main_startid']), datetime.now(
                timezone('US/Eastern')).strftime("%Y-%m-%d %H:%M:%S")),
                level=log.INFO)
        log.msg('[FOUND_GAP] - %s at %s EST' %
                (str(self.settings['recon_startid'] -
                     self.settings['main_startid']),
                 datetime.now(
                     timezone('US/Eastern')).strftime("%Y-%m-%d %H:%M:%S")),
                level=log.INFO)

        # generate an array of url_ids
        url_ids = array('i', (xrange(
            self.settings['main_startid'],
            self.settings['main_startid'] + self.settings['block_size'])))

        # Shuffle the id list before requesting
        random.shuffle(url_ids)

        for id in url_ids:
            req = Request("".join([self.base_url, str(id)]),
                          dont_filter=True,
                          callback=self.parse)
            req.meta['url_id'] = id
            yield req
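
For concreteness, the id window computed above works out as follows (all numbers are invented):

    # Hypothetical values: the Recon Spider last found id 1000 and the
    # configured block size is 50.
    recon_startid = 1000
    block_size = 50

    # Fallback used above when main_startid is still -1:
    main_startid = recon_startid - block_size        # 950
    url_ids = list(range(main_startid, main_startid + block_size))
    # -> ids 950..999: the gap between the last main crawl and the
    #    newest id reported by the Recon Spider; FOUND_GAP logs 50 here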
Example #3
    def build_check_recipient(self, ip, port, scheme,
                              user=None, password=None):
        """
        1. build a request for availability checking
        2. drop it if it already exists

        :return: Request
        """

        if self.complete_condition():
            raise exceptions.CloseSpider('Enough items')

        spec = dict(ip=ip, port=port, scheme=scheme)

        if self.already_exists(spec):
            self.logger.debug('Dropped duplicate: %s' % spec.values())

            return {}  # drop it

        proxy_url = utils.build_proxy_url(ip, port, scheme, user, password)
        need_auth = bool(user and password)
        item = Proxy(
            ip=ip,
            scheme=scheme,
            port=port,
            need_auth=need_auth,
            url=proxy_url,
        )

        if need_auth:
            item['user'], item['password'] = user, password

        return self.build_check_request(item)
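
The `utils.build_proxy_url` helper is not shown in this snippet. A plausible sketch, assuming the conventional `scheme://user:password@host:port` proxy URL form (the exact format used by the project is an assumption):

    def build_proxy_url(ip, port, scheme, user=None, password=None):
        # Credentials are embedded only when both are present, mirroring
        # the need_auth check above; this URL format is an assumption.
        if user and password:
            return '%s://%s:%s@%s:%s' % (scheme, user, password, ip, port)
        return '%s://%s:%s' % (scheme, ip, port)

    # build_proxy_url('10.0.0.1', 8080, 'http') -> 'http://10.0.0.1:8080'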
Example #4
    def start_requests(self):
        """
            default Scrapy method to send requests
        """

        # if spider already active
        if self.settings['active'] == 'T':
            log.msg('[OVERLAP] - at %s EST' % (datetime.now(
                timezone('US/Eastern')).strftime("%Y-%m-%d %H:%M:%S")),
                    level=log.INFO)
            # Close the spider
            raise exceptions.CloseSpider('Finalcheck Spider already active')

        # if there are no URLs to check
        if len(self.settings['url_ids']) == 0:
            log.msg('[FINALCHECK_IS_EMPTY] - at %s EST' % (datetime.now(
                timezone('US/Eastern')).strftime("%Y-%m-%d %H:%M:%S")),
                    level=log.INFO)
            # Close the spider
            raise exceptions.CloseSpider('Finalcheck URL list is empty')

        # Set spider is activating
        FinalcheckSpiderSettings(self.site).write_active('T')

        log.msg(
            '[URL_QUANTITY] - %s at %s EST' %
            (len(self.settings['url_ids']), datetime.now(
                timezone('US/Eastern')).strftime("%Y-%m-%d %H:%M:%S")),
            level=log.INFO)

        # Enqueue URLs
        for id in self.settings['url_ids']:
            req = Request("".join((self.base_url, str(id))),
                          dont_filter=True,
                          callback=self.parse)
            req.meta['url_id'] = id
            yield req
Example #5
    def __init__(self,
                 # government numbers to scrape
                 # 0-6 are all available governments
                 gov_indexes="0,1,2,3,4,5,6",
                 *args, **kwargs):

        super(ResolutionSpider, self).__init__(*args, **kwargs)

        # convert gov. indexes string to integer list
        self.gov_indexes = [int(i) for i in gov_indexes.split(',')]

        # validate values
        for i in self.gov_indexes:
            if i < 0 or i > 6:
                raise exceptions.CloseSpider(
                    "gov_indexes may only include integers in the range 0-6")
Example #6
    def _check_stop_criteria(self):
        if len(self.visited_pages) >= self.pages_max:
            raise exceptions.CloseSpider(
                'Maximum visited pages number exceeded')
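
Scrapy's built-in CloseSpider extension covers this exact case without a hand-rolled counter. A minimal sketch using its `CLOSESPIDER_PAGECOUNT` setting (spider name and URL are placeholders):

    import scrapy

    class LimitedSpider(scrapy.Spider):
        name = 'limited'                      # placeholder
        start_urls = ['http://example.com/']  # placeholder
        # the CloseSpider extension stops the crawl after this many
        # responses have been downloaded
        custom_settings = {'CLOSESPIDER_PAGECOUNT': 100}

        def parse(self, response):
            pass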