Exemple #1
0
def build_saved_cache():
    """Register every (method, url) pair returned by ``m.query`` on ``r``.

    The same projection query is issued twice: once with the default
    arguments of ``m.query`` and once with ``is_target=False``.  Each
    returned record is wrapped in a ``URL`` and handed to
    ``r.set_url_saved`` together with its method.
    """
    # Pass 1 relies on query()'s defaults, pass 2 asks for is_target=False.
    for extra_args in ({}, {"is_target": False}):
        projection = {"_id": 0, "method": 1, "url": 1}
        for record in m.query({}, projection, **extra_args):
            saved_url = URL(record['url'])
            r.set_url_saved(record['method'], saved_url)
Exemple #2
0
    def proc_req(self, req):
        """Handle one raw request message: store it and queue crawl tasks.

        :param req: JSON text expected to decode to an object carrying at
            least the ``url`` and ``method`` keys.
        :return: None.  Undecodable, non-object, incomplete or non-target
            messages are logged and dropped.
        """
        try:
            data = json.loads(req)
        except Exception:
            # Best effort: a bad message must not kill the caller, but
            # unlike a bare ``except`` this lets KeyboardInterrupt and
            # SystemExit propagate.
            logger.exception('json loads req error: %s', req)
            return
        if not isinstance(data, dict):
            # Valid JSON that is not an object (list, number, string, ...)
            # has no .get()/.update(); reject it instead of crashing below.
            logger.error('json loads req error: %s', req)
            return
        urlstring = data.get('url', '')
        if not urlstring:
            logger.error('empty url found!')
            return
        url = URL(urlstring)

        method = data.get('method', '')
        if not method:
            logger.error('not method found!')
            return
        # save to mongodb
        data.update({
            'pattern': url.pattern,
            'hostname': url.hostname,
            'domain': url.domain
        })
        target = self.redis_handle.is_target(url)

        # Only store a (method, url) pair that redis has not seen yet.
        if not self.redis_handle.is_url_saved(method, url):
            logger.debug('redis saved pattern not found!')
            self.mongo_handle.save(data, is_target=target)
            self.redis_handle.set_url_saved(method, url)
        else:
            logger.debug('redis saved pattern found!')

        # Non-target requests are recorded above but never crawled.
        if not target:
            logger.debug('%s is not target', url.hostname)
            return

        # todo post req
        if method == 'POST':
            logger.debug('POST not support now')
        elif method == 'GET':
            # new host found, add index page to task queue
            if self.redis_handle.get_hostname_reqcount(url.hostname) == 0:
                self.create_task_from_url(URL(url.index_page),
                                          add_whitelist=False)
            # check url validation inside create_url_task
            self.create_task_from_url(url, add_whitelist=False)
        else:
            # not GET nor POST
            logger.error('HTTP Verb %s found!', method)
            logger.debug(data)
Exemple #3
0
 def create_task_from_file(self, fileobj):
     """
     Create one task per non-blank line of an open file.

     :param fileobj: open file-like object yielding one URL string per
         line; it is used as a context manager, so it is closed on return.
     :return: None
     """
     with fileobj:
         for raw_line in fileobj:
             stripped = raw_line.strip()
             # Blank lines carry no URL and are ignored.
             if stripped:
                 self.create_task_from_url(URL(stripped))
def process(filename):
    """Group the URLs listed in a text file by netloc and pattern.

    :param filename: path of a text file with one URL per line.  Blank
        lines and URLs whose ``valid`` attribute is falsy are ignored.
    :return: ``defaultdict(dict)`` mapping ``netloc -> {pattern: urlstring}``;
        for each pattern only the first URL encountered is kept.
    """
    grouped = defaultdict(dict)
    with open(filename) as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if text:
                parsed = URL(text)
                if parsed.valid:
                    netloc = parsed.netloc
                    pattern = parsed.pattern
                    bucket = grouped[netloc]
                    # First URL seen for a pattern wins; later ones are dropped.
                    if pattern not in bucket:
                        bucket[pattern] = parsed.urlstring
    return grouped
Exemple #5
0
 def consume(self):
     """
     Fetch tasks from redis forever and run a spider for each one.

     Returns immediately (after logging) when redis is not connected at
     entry; otherwise loops without end.  For every fetched task the shared
     ``live_spider_counts`` counter is raised while the task is handled and
     ``task_counts`` is lowered by one.  Errors are logged, a redis
     reconnect is attempted if the connection was lost, and the loop
     resumes after a 10 second pause.

     :return: None
     """
     if not self.redis_handle.connected:
         logger.error('no redis connection found in consumer! exit.')
         return
     while True:
         # Becomes True only once this iteration has raised
         # live_spider_counts, so the finally clause never decrements a
         # counter it did not increment (e.g. when fetch_one_task fails).
         counted = False
         try:
             url = self.redis_handle.fetch_one_task()
             with self.context['lock']:
                 self.context['live_spider_counts'].value += 1
                 counted = True
                 self.context['task_counts'].value -= 1
             logger.info('get task url: %s', url)
             logger.info('%d tasks left',
                         self.context['task_counts'].value)
             if not self.redis_handle.is_blocked(URL(url)):
                 self.start_spider(url, self.__cookie_file)
         except Exception:
             # Keep the worker alive on ordinary errors, but let
             # KeyboardInterrupt / SystemExit end the loop.
             logger.exception('consumer exception!')
             if not self.redis_handle.connected:
                 logger.error('redis disconnected! reconnecting...')
                 self.redis_handle.connect()
             time.sleep(10)
         finally:
             if counted:
                 with self.context['lock']:
                     self.context['live_spider_counts'].value -= 1
Exemple #6
0
        'mongo_db': args.mongo_db
    }
    # Start args.consumer daemon processes; each one runs the consume()
    # method of its own Consumer built from the shared kwargs.
    for _ in range(args.consumer):
        worker = Consumer(**kwargs).consume
        proc = Process(name='consumer-%d' % _, target=worker)
        proc.daemon = True
        proc.start()
    # Likewise start args.producer daemon processes running produce().
    for _ in range(args.producer):
        worker = Producer(**kwargs).produce
        proc = Process(name='producer-%d' % _, target=worker)
        proc.daemon = True
        proc.start()

    # Unless args.keepon is set: flush the redis db, store the startup
    # parameters and seed the first task(s) from args.url or args.file.
    # NOTE(review): the worker processes above are already running when
    # flushdb() is called -- confirm this ordering is intended.
    if not args.keepon:
        redis_handle.flushdb()
        redis_handle.save_startup_params(args)
        target = args.url or args.file
        producer = Producer(**kwargs)
        # A string target is treated as a single seed URL and must be
        # valid and not blocked, otherwise the program exits with -1.
        if isinstance(target, basestring):
            url = URL(target)
            if not url.valid or url.blocked:
                logger.error('not valid url, exit.')
                sys.exit(-1)
            producer.create_task_from_url(url)
        # file object
        else:
            producer.create_task_from_file(target)

    # This handle is closed here; then block until the shared 'task_done'
    # object's wait() returns.
    redis_handle.close()
    tspider_context['task_done'].wait()
Exemple #7
0
    def create_task_from_url(self, url, **kwargs):
        """Queue a task for ``url`` and count it when redis accepts it.

        :param url: URL object forwarded to
            ``redis_handle.create_task_from_url``.
        :param kwargs: extra keyword arguments forwarded unchanged.
        :return: None
        """
        # The whole check-and-count runs under the shared lock.
        with self.context['lock']:
            accepted = self.redis_handle.create_task_from_url(url, **kwargs)
            if not accepted:
                return
            self.context['task_counts'].value += 1

    def create_task_from_file(self, fileobj):
        """
        Create one task per non-blank line of an open file.

        :param fileobj: open file-like object yielding one URL string per
            line; it is used as a context manager, so it is closed on return.
        :return: None
        """
        with fileobj:
            for raw_line in fileobj:
                stripped = raw_line.strip()
                # Blank lines carry no URL and are ignored.
                if stripped:
                    self.create_task_from_url(URL(stripped))


if __name__ == '__main__':
    # Demo run.  With tld=False only links inside demo.aisec.cn are
    # scanned; www.aisec.cn is not scanned even when demo.aisec.cn links
    # to it.
    producer = Producer(tld=False)
    seed_url = URL('http://demo.aisec.cn/demo/aisec/')
    producer.create_task_from_url(seed_url)
    producer.produce()

    # Alternative: seed the queue from a file of URLs instead.
    # with open('test.txt') as f:
    #     producer.create_task_from_file(f)