def build_saved_cache():
    """Replay every stored (method, url) pair into the saved-URL cache.

    Runs the same projection query twice through ``m.query`` -- once with
    its default arguments and once with ``is_target=False`` -- and passes
    each returned document's ``method`` and ``url`` to ``r.set_url_saved``.
    ``m`` and ``r`` are module-level handles defined outside this function
    (presumably the MongoDB and Redis wrappers -- confirm at their
    definition).
    """
    # An empty kwargs dict reproduces the plain call without ``is_target``.
    for query_kwargs in ({}, {'is_target': False}):
        # Build a fresh projection for each query, as each call did before.
        projection = {"_id": 0, "method": 1, "url": 1}
        for record in m.query({}, projection, **query_kwargs):
            saved_url = URL(record['url'])
            r.set_url_saved(record['method'], saved_url)
def proc_req(self, req):
    """Handle one captured request: store it and queue follow-up tasks.

    The request is decoded from JSON, enriched with the URL's ``pattern``,
    ``hostname`` and ``domain``, saved through ``self.mongo_handle`` when
    its (method, url) pair is not yet marked as saved in
    ``self.redis_handle``, and -- for GET requests to target hosts --
    turned into crawl tasks.

    :param req: JSON text expected to decode to an object carrying at
        least the ``url`` and ``method`` keys.
    :return: None. Malformed or incomplete input is logged and dropped.
    """
    try:
        data = json.loads(req)
    except (ValueError, TypeError):
        # ValueError covers invalid JSON, TypeError a non-text ``req``.
        # A bare ``except`` would also swallow KeyboardInterrupt/SystemExit.
        logger.exception('json loads req error: %s' % (req,))
        return

    if not isinstance(data, dict):
        # Valid JSON that is not an object (null, list, number) has no .get().
        logger.error('json loads req error: %s' % (req,))
        return

    urlstring = data.get('url', '')
    if not urlstring:
        logger.error('empty url found!')
        return
    url = URL(urlstring)

    method = data.get('method', '')
    if not method:
        logger.error('not method found!')
        return

    # save to mongodb
    data.update({
        'pattern': url.pattern,
        'hostname': url.hostname,
        'domain': url.domain
    })
    target = self.redis_handle.is_target(url)
    if not self.redis_handle.is_url_saved(method, url):
        logger.debug('redis saved pattern not found!')
        self.mongo_handle.save(data, is_target=target)
        self.redis_handle.set_url_saved(method, url)
    else:
        logger.debug('redis saved pattern found!')

    # Requests to non-target hosts are recorded above but never crawled.
    if not target:
        logger.debug('%s is not target' % url.hostname)
        return

    # todo post req
    if method == 'POST':
        logger.debug('POST not support now')
    elif method == 'GET':
        # new host found, add index page to task queue
        if self.redis_handle.get_hostname_reqcount(url.hostname) == 0:
            self.create_task_from_url(URL(url.index_page), add_whitelist=False)
        # check url validation inside create_url_task
        self.create_task_from_url(url, add_whitelist=False)
    else:
        # not GET nor POST
        logger.error('HTTP Verb %s found!' % method)
        # NOTE(review): the source layout was collapsed; this dump is assumed
        # to belong to the unsupported-verb branch -- confirm.
        logger.debug(data)
def create_task_from_file(self, fileobj):
    """
    Create one task for every non-blank line of an open file.

    :param fileobj: open file-like object yielding one URL string per line;
        it is used as a context manager, so it is closed when this returns.
    :return: None
    """
    with fileobj:
        # Lazy generator: lines are still read and handled one at a time.
        stripped_lines = (raw.strip() for raw in fileobj)
        for text in stripped_lines:
            if text:
                self.create_task_from_url(URL(text))
def process(filename):
    """Group the valid URLs listed in a text file by netloc and pattern.

    :param filename: path of a file holding one URL per line; blank lines
        and URLs whose ``valid`` attribute is falsy are skipped.
    :return: ``defaultdict(dict)`` mapping each ``netloc`` to a dict of
        ``pattern -> urlstring``. Only the first URL seen for a pattern
        is kept.
    """
    grouped = defaultdict(dict)
    with open(filename) as source:
        for raw in source:
            text = raw.strip()
            if text:
                parsed = URL(text)
                if parsed.valid:
                    netloc = parsed.netloc
                    pattern = parsed.pattern
                    # Indexing the defaultdict creates the per-host dict.
                    per_host = grouped[netloc]
                    if pattern not in per_host:
                        per_host[pattern] = parsed.urlstring
    return grouped
def consume(self):
    """Fetch tasks forever and run a spider on each one that is not blocked.

    Returns immediately, after logging an error, when
    ``self.redis_handle`` is not connected. Otherwise loops without end:
    takes one task URL, updates the shared counters in ``self.context``
    under its lock, and starts a spider unless the URL is blocked. Any
    ordinary exception is logged, a reconnect is attempted when the
    connection was lost, and the loop continues after a 10 second pause.

    :return: None (only on the initial "not connected" check).
    """
    if not self.redis_handle.connected:
        logger.error('no redis connection found in consumer! exit.')
        return

    while True:
        # Set only after this iteration has incremented live_spider_counts,
        # so the ``finally`` clause never undoes an increment that did not
        # happen (e.g. when fetch_one_task itself raises).
        counted = False
        try:
            url = self.redis_handle.fetch_one_task()
            with self.context['lock']:
                self.context['live_spider_counts'].value += 1
                counted = True
                self.context['task_counts'].value -= 1
            logger.info('get task url: %s' % url)
            logger.info('%d tasks left' % self.context['task_counts'].value)

            if not self.redis_handle.is_blocked(URL(url)):
                self.start_spider(url, self.__cookie_file)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt/SystemExit can still stop the worker.
            logger.exception('consumer exception!')
            if not self.redis_handle.connected:
                logger.error('redis disconnected! reconnecting...')
                self.redis_handle.connect()
            # NOTE(review): the source layout was collapsed; the pause is
            # assumed to apply after every failure -- confirm.
            time.sleep(10)
        finally:
            if counted:
                with self.context['lock']:
                    self.context['live_spider_counts'].value -= 1
'mongo_db': args.mongo_db } for _ in range(args.consumer): worker = Consumer(**kwargs).consume proc = Process(name='consumer-%d' % _, target=worker) proc.daemon = True proc.start() for _ in range(args.producer): worker = Producer(**kwargs).produce proc = Process(name='producer-%d' % _, target=worker) proc.daemon = True proc.start() if not args.keepon: redis_handle.flushdb() redis_handle.save_startup_params(args) target = args.url or args.file producer = Producer(**kwargs) if isinstance(target, basestring): url = URL(target) if not url.valid or url.blocked: logger.error('not valid url, exit.') sys.exit(-1) producer.create_task_from_url(url) # file object else: producer.create_task_from_file(target) redis_handle.close() tspider_context['task_done'].wait()
def create_task_from_url(self, url, **kwargs):
    """Queue a task for ``url`` and count it when one was really created.

    :param url: URL object forwarded to
        ``self.redis_handle.create_task_from_url``.
    :param kwargs: extra keyword arguments forwarded unchanged.
    :return: None
    """
    shared = self.context
    # The queue insert and the counter update happen under the same lock.
    with shared['lock']:
        queued = self.redis_handle.create_task_from_url(url, **kwargs)
        if queued:
            shared['task_counts'].value += 1


def create_task_from_file(self, fileobj):
    """
    Create one task for every non-blank line of an open file.

    :param fileobj: open file-like object yielding one URL string per line;
        it is used as a context manager, so it is closed when this returns.
    :return: None
    """
    with fileobj:
        for raw in fileobj:
            text = raw.strip()
            if text:
                self.create_task_from_url(URL(text))


if __name__ == '__main__':
    # Manual demo run. With tld=False only links inside demo.aisec.cn are
    # scanned; www.aisec.cn is not scanned even when demo.aisec.cn links
    # to it.
    p = Producer(tld=False)
    url = URL('http://demo.aisec.cn/demo/aisec/')
    p.create_task_from_url(url)
    p.produce()
    # Seeding from a file instead would look like:
    # with open('test.txt') as f:
    #     p.create_task_from_file(f)