def test_main(self):
    queue = JoinableQueue()
    print dir(queue)

    queue.put(1)
    queue.put(3)
    queue.put(2)
    queue.put(6)

    print queue.qsize()
    print '1', queue.get(), queue.get()
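# A minimal, self-contained sketch (names assumed, not taken from the suite above) of the
# task_done()/join() contract that the JoinableQueue examples in this listing rely on:
# join() only returns once every put() item has been matched by a task_done() call.
import gevent
from gevent.queue import JoinableQueue


def demo_join_contract():
    queue = JoinableQueue()

    def consumer():
        while True:
            item = queue.get()
            try:
                print('consumed', item)
            finally:
                queue.task_done()

    gevent.spawn(consumer)
    for i in range(5):
        queue.put(i)
    queue.join()  # blocks until task_done() was called once per item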
def test_api(self):
    queue = JoinableQueue()
    task_group = self.api.search('terminator', queue)

    while True:
        finished = all([greenlet.ready() for greenlet in task_group.greenlets])
        try:
            item = queue.get(timeout=1.0)
        except Empty:
            if finished:
                log.info('queue is empty and all jobs are done, quitting')
                break
            log.info('queue was empty and jobs are still running, retrying')
            continue
        try:
            log.info('%r', item)
        finally:
            queue.task_done()

    task_group.join()
    queue.join()
    log.info('joined everything')
class Dispatcher(gevent.Greenlet):
    """
    The Dispatcher class handles routing communications to and from the
    Gateway. It implements an Actor interface as made popular by Erlang.
    """

    def __init__(self):
        self._gw_inbox = JoinableQueue()
        super().__init__()

    def _run(self):
        while True:
            try:
                event = self._gw_inbox.get(block=False)
                # Dispatch the event back to interface
                self._gw_inbox.task_done()
            except Empty:
                # Nothing queued yet; check again after the sleep.
                pass
            finally:
                gevent.sleep(1)

    @property
    def gw_inbox(self):
        """
        This is the inbox for the Gateway. It's not accessible outside
        the class methods.

        :return: None
        """
        return None

    @gw_inbox.setter
    def gw_inbox(self, message):
        self._gw_inbox.put(message)
def process_24_network(net, port):
    q = JoinableQueue()
    r = JoinableQueue()
    gevent.spawn(prepare_list, q, net)
    tasks = []
    for x in range(0, CONCURRENT_GROUPS):
        # print "spawning %i" % x
        tasks += [gevent.spawn(scan_network, q, r, port)]
    q.join()
    gevent.joinall(tasks)
    if not r.empty():
        with open(str(net.ip) + '_' + str(port) + ".m3u", "w+") as f:
            f.write("#EXTM3U\n")
            while not r.empty():
                try:
                    group = r.get(timeout=10)
                    f.write('#EXTINF:-1 tvg-logo="" tvg-name="" group-title="",ChannelName' + "\n")
                    f.write('udp://@' + str(group) + ':' + str(port) + "\n")
                    logging.info("Ok ====> %s" % group)
                except gevent.queue.Empty:
                    break
def on_search(self, query):
    log.debug('search for %r', query)
    queue = JoinableQueue()
    task_group = g.api.search(query, queue)

    while True:
        finished = all([t.ready() for t in task_group])
        try:
            item = queue.get(timeout=1.0)
        except Empty:
            if finished:
                break
            continue
        try:
            self.emit('result', item._asdict())
        finally:
            queue.task_done()

    queue.join()
    task_group.join()
    self.emit('done', query)
def test_service():
    "Service() should implement the basic needs of an async service"

    # Given the following service
    class MyService(Service):
        def __init__(self, my_mock, result_queue=None):
            self.my_mock = my_mock
            super(MyService, self).__init__(
                callback=self.run,
                result_queue=result_queue,
            )

        def run(self, package, sender_data):
            self.my_mock.ran = package

    my_mock = Mock()
    queue = JoinableQueue()
    service = MyService(my_mock, result_queue=queue)

    # When I queue a package to be processed by my service and start the
    # service with 1 concurrent worker
    service.queue('gherkin==0.1.0', 'main')
    service.consume()

    # Then I see that the package was processed
    package = queue.get()
    package.should.equal('gherkin==0.1.0')
    my_mock.ran.should.equal('gherkin==0.1.0')
class GeventPoolExecutor2(LoggerMixin):
    def __init__(self, max_works):
        check_gevent_monkey_patch()
        self._q = JoinableQueue(maxsize=max_works)
        # self._q = Queue(maxsize=max_works)
        for _ in range(max_works):
            # self.logger.debug('yyyyyy')
            gevent.spawn(self.__worker)
        atexit.register(self.__atexit)

    def __worker(self):
        while True:
            fn, args, kwargs = self._q.get()
            # noinspection PyBroadException
            try:
                fn(*args, **kwargs)
            except Exception as exc:
                self.logger.exception(
                    f'Error in function {fn.__name__}; the cause was {type(exc)} {exc}')
            finally:
                self._q.task_done()

    def submit(self, fn: Callable, *args, **kwargs):
        # self.logger.debug(self._q.qsize())
        self._q.put((fn, args, kwargs))

    def __atexit(self):
        self.logger.critical('The program is about to exit.')
        self._q.join()
class GeventPoolExecutor2(LoggerMixin):
    def __init__(self, max_works):
        self._q = JoinableQueue(maxsize=max_works)
        # self._q = Queue(maxsize=max_works)
        for _ in range(max_works):
            gevent.spawn(self.__worker)
        # atexit.register(self.__atexit)
        self._q.join(timeout=100)

    def __worker(self):
        while True:
            fn, args, kwargs = self._q.get()
            try:
                fn(*args, **kwargs)
            except Exception as exc:
                self.logger.exception(
                    f'Error in function {fn.__name__}; the cause was {type(exc)} {exc}')
            finally:
                self._q.task_done()

    def submit(self, fn: Callable, *args, **kwargs):
        self._q.put((fn, args, kwargs))

    def __atexit(self):
        self.logger.critical('The program is about to exit.')
        self._q.join()
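# A hedged usage sketch for the GeventPoolExecutor2 variants above (it assumes gevent
# monkey patching is applied and that the LoggerMixin / check_gevent_monkey_patch helpers
# they rely on are importable). Because the JoinableQueue is bounded by max_works,
# submit() blocks once that many tasks are pending, which gives natural backpressure;
# the atexit hook in the first variant then joins the queue so queued work finishes
# before the interpreter exits.
def demo_executor():
    import time

    def slow_task(n):
        time.sleep(0.1)  # cooperative only if gevent monkey patching is applied
        print('done', n)

    pool = GeventPoolExecutor2(max_works=10)
    for n in range(50):
        pool.submit(slow_task, n)  # blocks while 10 tasks are already queued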
class GQueue(object):
    def __init__(self):
        self.__QUEUE = JoinableQueue()

    def job(self, func):
        @functools.wraps(func)
        def f(*args, **kwargs):
            self.__QUEUE.put([func, args, kwargs])
        return f

    def join(self):
        self.__QUEUE.join()

    def work(self):
        while True:
            func, args, kwargs = self.__QUEUE.get()
            try:
                func(*args, **kwargs)
            finally:
                self.__QUEUE.task_done()

    def run_worker(self, num=1):
        for i in range(num):
            gevent.spawn(self.work)
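# A hedged usage sketch for GQueue above: the @gq.job decorator turns direct calls into
# queued work items, run_worker() starts greenlets that drain the queue, and join()
# blocks until every queued call has run and marked itself task_done().
def demo_gqueue():
    gq = GQueue()

    @gq.job
    def greet(name):
        print('hello', name)

    gq.run_worker(num=2)
    for name in ('alice', 'bob', 'carol'):
        greet(name)  # enqueued, not executed immediately
    gq.join()        # returns once all three calls have completed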
def get_movie_id():
    baidu_tool = MysqlCurd('douban_movie')
    baidu_tool.connect_mysql()
    result = baidu_tool.query_mysql_condition('movie_name', [{'version': 0}, ['name']])
    q = JoinableQueue()
    for temp in result:
        if not baidu_tool.query_mysql_condition('name_id', [{'movie_name': temp[0]}, ['movie_id']]):
            q.put(temp[0])
    baidu_tool.close_connect()
    error_q = JoinableQueue()

    def crawl(time):
        while not q.empty():
            tool = MysqlCurd('douban_movie')
            tool.connect_mysql()
            name = q.get()
            try:
                page = super_downloader('https://movie.douban.com/subject_search?',
                                        params={'search_text': name}, cookies=True, proxy=True)
            except requests.exceptions.RequestException:
                print('get movie id ' + name + 'download error!')
                return False
            page = etree.HTML(page)
            gevent.sleep(random.uniform(time[0], time[1]))
            try:
                count = 0
                count1 = 0
                for _ in page.xpath('//*[@id="content"]/div/div[1]/div[2]/table[@width="100%"]'):
                    try:
                        mark = _.xpath('tr/td[2]/div')[0]
                        id = mark.xpath('a')[0].get('href')[33:-1]
                        _name = mark.xpath('a')[0].text.split('/')[0].strip()
                        # score = mark.xpath('div/span[2]')[0].text
                        # comment_num = mark.xpath('div/span[3]')[0].text[1:-4]
                        tool.replace_mysql('name_id', {'movie_id': id, 'movie_name': _name})
                        count1 += 1
                        print('get movie id ' + _name + 'completed!!!')
                    except IndexError as e:
                        print('get movie id sub error!!!' + repr(e))
                        continue
                    count += 1
                    if count == 3:
                        break
                if count1 > 0:
                    # tool.replace_mysql('movie_name', {'version': 1, 'name': name})
                    tool.close_connect()
                print('get movie id ' + name + ' completed!')
            except Exception as e:
                error_q.put(name)
                print('get movie id ' + name + ' error!')
                print(e)

    worker = SleepFunction()
    worker.run(crawl)
    with open('errorlist//movie_id.txt', 'a', encoding='utf8') as f:
        if not error_q.empty():
            print(get_time(), file=f)
            while not error_q.empty():
                print(error_q.get(), file=f)
def fetch_worker(fetch_queue: JoinableQueue, save_queue: JoinableQueue, direction: str):
    while True:
        word = fetch_queue.get()
        print(word)
        res = fetch(word)
        if res:
            save_queue.put((word, direction, res))
            fetch_queue.task_done()
        else:
            fetch_queue.put(word)
def save_worker(dsn: str, save_queue: JoinableQueue):
    conn = psycopg2.connect(dsn)
    while True:
        word, direction, data = save_queue.get()
        try:
            with conn:
                with conn.cursor() as cur:
                    psycopg2.extensions.register_type(psycopg2.extensions.UNICODE, cur)
                    cur.execute(
                        "INSERT INTO youdao_bilingual (keyword, direction, data) VALUES (%s, %s, %s)",
                        (word, direction, data))
            save_queue.task_done()
        except Exception as e:
            print(e)
            save_queue.put((word, direction, data))
    conn.close()
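# A hedged wiring sketch for the two workers above (fetch(), the table name and the DSN
# are assumptions carried over from the originals): fetch greenlets feed save_queue, a
# single saver writes to Postgres, and joining the queues waits for the pipeline to drain.
import gevent
from gevent.queue import JoinableQueue


def run_pipeline(words, dsn, direction='en', workers=4):
    fetch_queue = JoinableQueue()
    save_queue = JoinableQueue()
    for word in words:
        fetch_queue.put(word)
    for _ in range(workers):
        gevent.spawn(fetch_worker, fetch_queue, save_queue, direction)
    gevent.spawn(save_worker, dsn, save_queue)
    fetch_queue.join()  # wait for the fetch queue to drain
    save_queue.join()   # wait for the saver to finish writing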
def _run(self):
    paths = glob.glob(self.path)
    while not paths:
        gevent.sleep(0.01)
        paths = glob.glob(self.path)

    q = JoinableQueue()
    self.logger.debug('Tailing %s' % ', '.join(paths))
    self.tails = [Tail(p, q, self.statedir) for p in paths]

    while True:
        data = q.get()
        if data:
            if data.endswith('\n'):
                data = data[0:-1]
            self.logger.debug('Received: %r' % data)
            self.output.put(Event(data=data))
        q.task_done()
def get_person_info():
    baidu_tool = MysqlCurd('douban_person')
    baidu_tool.connect_mysql()
    result = baidu_tool.query_mysql_condition('person_name_id',
                                              [{'version': 0}, ['person_id', 'person_name']])
    print(result)
    print(result.__len__())
    q = JoinableQueue()
    for _ in result:
        if not baidu_tool.query_mysql_condition('person_info',
                                                [{'person_id': _[0]}, ['person_name']]):
            q.put(_)
    error_q = JoinableQueue()
    baidu_tool.close_connect()

    def temp(param):
        while not q.empty():
            i = q.get()
            p = Person(id=i[0], name=i[1])
            flag = p.analysis_person_info()
            if flag:
                name_id_tool = MysqlCurd('douban_person')
                name_id_tool.connect_mysql()
                name_id_tool.replace_mysql('person_name_id',
                                           {'person_id': p.id, 'person_name': p.name, 'version': 1})
                name_id_tool.close_connect()
            else:
                error_q.put((p.id, p.name))

    worker = SleepFunction()
    worker.run(temp)
    with open('errorlist//person_id.txt', 'a', encoding='utf8') as f:
        if not error_q.empty():
            print(get_time(), file=f)
            while not error_q.empty():
                print(error_q.get(), file=f)
def get_movie_info(name=None):
    q = JoinableQueue()
    tool = MysqlCurd('douban_movie')
    tool.connect_mysql()
    if name:
        try:
            movie_id = tool.query_mysql_condition('name_id', [{'movie_name': name}, ['movie_id']])[0][0]
            q.put((movie_id, name))
        except IndexError:
            print('no id!')
    else:
        result = tool.query_mysql_condition('name_id', [{'version': 0}, ['movie_id', 'movie_name']])
        for temp in result:
            if not tool.query_mysql_condition('movie_info', [{'movie_id': temp[0]}, ['movie_name']]):
                q.put(temp)
    tool.close_connect()
    error_q = JoinableQueue()

    def temp(time):
        while not q.empty():
            data = q.get()
            m = Movie(data[0], data[1])
            try:
                # show on the console which movie is currently being processed
                print('analysis movie info ' + data[1] + 'started')
                m.analysis_movie_info()
                gevent.sleep(random.uniform(time[0], time[1]))
            except Exception as e:
                print(e)
                print('analysis movie info ' + data[1] + 'error')
                error_q.put(data[1])
            m.tool.close_connect()
            print(len(q), 'remain!')

    worker = SleepFunction()
    worker.run(temp)
    with open('errorlist//movie_info.txt', 'a', encoding='utf8') as f:
        if not error_q.empty():
            print(get_time(), file=f)
            while not error_q.empty():
                print(error_q.get(), file=f)
class MassGet(FastGet):
    def __init__(self, urls, dic, threads=10, report_db=False, keepalive=None, each_threads=10):
        self.dic = dic
        self.report_db = report_db
        self.table = None
        if report_db:
            self.sql_conn(report_db)
        self.keepalive = keepalive
        self.each_threads = each_threads
        self.queue = JoinableQueue()
        [self.queue.put(x.strip()) for x in urls]
        [spawn(self.worker) for _ in xrange(threads)]
        self.queue.join()

    def worker(self):
        while not self.queue.empty():
            url = self.queue.get()
            try:
                FastGet(url, self.dic, self.each_threads, self.report_db, self.keepalive, self.table)
            except Exception as e:
                logging.error('Worker global exception for %s: %s' % (url, e))
            finally:
                self.queue.task_done()
def get(self, *args, **kw):
    result = JoinableQueue.get(self, *args, **kw)
    log.debug("operating on {}".format(result))
    return result
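# A hedged sketch of the kind of subclass the get() override above belongs to (the class
# name and module-level logger are assumptions): every item pulled from the queue is
# logged before being handed back to the caller, which helps trace which greenlet
# consumed which task.
import logging
from gevent.queue import JoinableQueue

log = logging.getLogger(__name__)


class LoggingJoinableQueue(JoinableQueue):
    def get(self, *args, **kw):
        result = JoinableQueue.get(self, *args, **kw)
        log.debug("operating on {}".format(result))
        return result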
class Migrator:
    def __init__(self, scheme, create_devices=True, write_data=True,
                 start_date="2000-01-01T00:00:00Z", end_date="2014-12-31T00:00:00Z",
                 pool_size=3):
        self.scheme = scheme
        self.create_devices = create_devices
        self.should_write_data = write_data
        self.start_date = start_date
        self.end_date = end_date
        self.tdb = TDBClient(scheme.db_key, scheme.db_key, scheme.db_secret,
                             base_url=scheme.db_baseurl)

        iq_endpoint = HTTPEndpoint(scheme.iq_baseurl, scheme.iq_key, scheme.iq_secret)
        self.tiq = TIQClient(iq_endpoint)
        self.queue = JoinableQueue()
        self.lock = Lock()
        self.dp_count = 0
        self.req_count = 0
        self.dp_reset = time.time()
        for i in range(pool_size):
            gevent.spawn(self.worker)

    def worker(self):
        while True:
            series = self.queue.get()
            try:
                self.migrate_series(series)
            finally:
                self.queue.task_done()

    def migrate_all_series(self, start_key="", limit=None):
        start_time = time.time()

        (keys, tags, attrs) = self.scheme.identity_series_filter()
        series_set = self.tdb.list_series(keys, tags, attrs)

        # Keep our own state of whether we passed the resume point, so we don't
        # need to assume client and server sort strings the same.
        found_first_series = False

        series_count = 0
        for series in series_set:
            if not found_first_series and series.key < start_key:
                continue
            else:
                found_first_series = True

            if limit and series_count >= limit:
                print("Reached limit of %d devices, stopping." % (limit))
                break

            if self.scheme.identity_series_client_filter(series):
                # If the series looks like an identity series,
                # queue it to be processed by the threadpool
                self.queue.put(series)
                series_count += 1

        self.queue.join()

        end_time = time.time()
        print("Exporting {} devices took {} seconds".format(series_count, end_time - start_time))

    def migrate_series(self, series):
        print("  Beginning to migrate series: %s" % (series.key))
        error = False
        try:
            if self.create_devices:
                error = self.create_device(series)

            if self.should_write_data and not error:
                error = self.write_data(series)
        except Exception as e:
            logging.exception(e)
            error = True

        if not error:
            print("COMPLETED migrating for series %s" % (series.key))
        else:
            print("ERROR migrating series %s" % (series.key))
class Service(object):
    def __init__(self, callback, **args):
        self.callback = callback
        self.result_queue = args.get('result_queue')
        self.package_queue = JoinableQueue()
        self.failed_queue = []
        self.env = args.get('env')

        self.main_greenlet = None
        self.pool = Pool(args.get('concurrency'))
        self.should_run = True

        self.subscribers = []
        self.logger = Logger(self.name, args.get('log_level'))

    @property
    def name(self):
        return self.__class__.__name__.lower()

    def queue(self, package, sender_name, **data):
        assert (sender_name == 'downloadmanager' and data.get('path')) or True
        self.package_queue.put((package, (sender_name, data)))
        self.logger.level(3, ' * queue(from=%s, to=%s, package=%s, data=%s)',
                          sender_name, self.name, package, data)

    def consume(self):
        package, sender_data = self.package_queue.get()
        self.pool.spawn(self._run_service, package, sender_data)
        self.logger.level(3, ' * %s.run(package=%s, sender_data=%s)',
                          self.name, package, sender_data)

    def subscribe(self, other):
        other.subscribers.append(self)

    def loop(self):
        while self.should_run:
            self.consume()

    def start(self):
        self.main_greenlet = gevent.spawn(self.loop)

    def stop(self, force=False):
        # This will force the current iteration on `loop()` to be the last one,
        # so the thing we're processing will be able to finish;
        self.should_run = False

        # if the caller is in a hurry, we'll just kill everything mercilessly
        if force and self.main_greenlet:
            self.main_greenlet.kill()

    def _run_service(self, package, sender_data):
        try:
            data = self.callback(package, sender_data)
        except NotForMe:
            return
        except ReportableError as exc:
            self.failed_queue.append((package, exc))
            self.logger.level(0, "Error: %s", exc)
        except BaseException as exc:
            self.failed_queue.append((package, exc))
            self.logger.traceback(4,
                'failed to run %s (requested by:%s) for package %s:',
                self.name, sender_data[0], package, exc=exc)
        else:
            # Let's notify our subscribers
            for subscriber in self.subscribers:
                subscriber.queue(package, self.name, **(data or {}))

            # If the callback worked, let's go ahead and tell the world. If and
            # only if requested by the caller, of course.
            if self.result_queue:
                self.result_queue.put(package)
def get_person_id():
    baidu_tool = MysqlCurd('douban_person')
    baidu_tool.connect_mysql()
    result = baidu_tool.query_mysql_condition('person_name', [{'version': 0}, ['name']])
    print(result)
    print(result.__len__())
    q = JoinableQueue()
    for _ in result:
        if not baidu_tool.query_mysql_condition('person_name_id',
                                                [{'person_name': _[0]}, ['person_id']]):
            q.put(_[0].strip('\n'))
    error_q = JoinableQueue()

    def crawl(param):
        while not q.empty():
            tool = MysqlCurd('douban_person')
            tool.connect_mysql()
            name = q.get()
            try:
                result = super_downloader('https://movie.douban.com/subject_search?',
                                          params={'search_text': name}, proxy=True, cookies=True)
                gevent.sleep(random.uniform(2, 6.5))
            except requests.exceptions.RequestException as e:
                print(name + 'download error!')
                continue
            try:
                page = etree.HTML(result)
                basic = page.xpath('//*[@id="content"]/div/div[@class="article"]/div[1]/'
                                   'div[@class="result-item"]/div[@class="content"]/h3/a')[0]
                id = basic.get('href')[35:-1]
                name = basic.text.split()[0]
                tool.replace_mysql('person_name_id', {'person_id': id, 'person_name': name})
                baidu_tool = MysqlCurd('douban_person')
                baidu_tool.connect_mysql()
                baidu_tool.replace_mysql('person_name', {'name': name, 'version': 1})
                baidu_tool.close_connect()
                tool.close_connect()
                print(name + 'completed')
            except IndexError:
                error_q.put(name)
                print(name + 'error!')

    worker = SleepFunction()
    worker.run(crawl)
    with open('errorlist//person_id.txt', 'a', encoding='utf8') as f:
        if not error_q.empty():
            print(get_time(), file=f)
            while not error_q.empty():
                print(error_q.get(), file=f)
class DriverPool(object): """ Create a pool of available Selenium containers for processing. Args: size (int): maximum concurrent tasks. Must be at least ``2``. driver_cls (WebDriver): driver_cls_args (tuple): driver_cls_kw (dict): use_proxy (bool): factory (:obj:`~selenium_docker.base.ContainerFactory`): name (str): logger (:obj:`logging.Logger`): Example:: pool = DriverPool(size=2) urls = [ 'https://google.com', 'https://reddit.com', 'https://yahoo.com', 'http://ksl.com', 'http://cnn.com' ] def get_title(driver, url): driver.get(url) return driver.title for result in pool.execute(get_title, urls): print(result) """ INNER_THREAD_SLEEP = 0.5 """float: essentially our polling interval between tasks and checking when tasks have completed. """ PROXY_CLS = SquidProxy """:obj:`~selenium_docker.proxy.AbstractProxy`: created for the pool when ``use_proxy=True`` during pool instantiation. """ def __init__(self, size, driver_cls=ChromeDriver, driver_cls_args=None, driver_cls_kw=None, use_proxy=True, factory=None, name=None, logger=None): self.size = max(2, size) self.name = name or gen_uuid(6) self.factory = factory or ContainerFactory.get_default_factory() self.logger = logger or getLogger( '%s.DriverPool.%s' % (__name__, self.name)) self._driver_cls = driver_cls self._driver_cls_args = driver_cls_args or tuple() self._driver_cls_kw = driver_cls_kw or dict() self._drivers = Queue(maxsize=self.size) # post init inspections if not hasattr(self._driver_cls, 'CONTAINER'): raise DriverPoolValueError('driver_cls must extend DockerDriver') if not isiterable(self._driver_cls_args): raise DriverPoolValueError( '%s is not iterable' % self._driver_cls_args) if not isinstance(self._driver_cls_kw, Mapping): raise DriverPoolValueError( '%s is not a valid mapping' % self._driver_cls_kw) # determine proxy usage self.proxy = None self._use_proxy = use_proxy # type: bool # deferred instantiation self._pool = None # type: Pool self._results = None # type: Queue self._tasks = None # type: JoinableQueue self._processing = False # type: bool self.__feeder_green = None # type: gevent.Greenlet def __repr__(self): return '<DriverPool-%s(size=%d,driver=%s,proxy=%s,async=%s)>' % ( self.name, self.size, self._driver_cls.BROWSER, self._use_proxy, self.is_async) def __iter__(self): return self.results(block=self.is_async) def __del__(self): try: self.close() except Exception as e: if hasattr(self, 'logger'): self.logger.exection(e, exc_info=False) @property def is_processing(self): """bool: whether or not we're currently processing tasks. """ return self._processing @property def is_async(self): """bool: returns True when asynchronous processing is happening. """ return self.__feeder_green is not None def __bootstrap(self): """ Prepare this driver pool instance to batch execute task items. """ if self.is_processing: # cannot run two executions simultaneously raise DriverPoolRuntimeException( 'cannot bootstrap pool, already running') if self._results and self._results.qsize(): # pragma: no cover self.logger.debug('pending results being discarded') if self._tasks and self._tasks.qsize(): # pragma: no cover self.logger.debug('pending tasks being discarded') if self._pool: # pragma: no cover self.logger.debug('killing processing pool') self._pool.join(timeout=10.0) self._pool.kill() self._pool = None if self._use_proxy and not self.proxy: # defer proxy instantiation -- since spinning up a squid proxy # docker container is surprisingly time consuming. 
self.logger.debug('bootstrapping squid proxy') self.proxy = self.PROXY_CLS(factory=self.factory) self.logger.debug('bootstrapping pool processing') self._processing = True self._results = Queue() self._tasks = JoinableQueue() self._load_drivers() # create our processing pool with headroom over the number of drivers # requested for this processing pool. self._pool = Pool(size=self.size + math.ceil(self.size * 0.25)) def __cleanup(self, force=False): """ Stop and remove the web drivers and their containers. This function should not remove pending tasks or results. It should be possible to cleanup all the external resources of a driver pool and still extract the results of the work that was completed. Raises: DriverPoolRuntimeException: when attempting to cleanup an environment while processing is still happening, and forcing the cleanup is set to ``False``. SeleniumDockerException: when a driver instance or container cannot be closed properly. Returns: None """ if self.is_processing and not force: # pragma: no cover raise DriverPoolRuntimeException( 'cannot cleanup driver pool while executing') self._processing = False squid = None # type: gevent.Greenlet error = None # type: SeleniumDockerException if self.proxy: self.logger.debug('closing squid proxy') squid = gevent.spawn(self.proxy.quit) if self._pool: # pragma: no cover self.logger.debug('emptying task pool') if not force: self._pool.join(timeout=10.0) self._pool.kill(block=False, timeout=10.0) self._pool = None self.logger.debug('closing all driver containers') while not self._drivers.empty(): d = self._drivers.get(block=True) try: d.quit() except SeleniumDockerException as e: # pragma: no cover self.logger.exception(e, exc_info=True) if not force: error = e if self.proxy: squid.join() self.proxy = None if error: # pragma: no cover raise error def _load_driver(self, and_add=True): """ Load a single web driver instance and container. """ args = self._driver_cls_args kw = dict(self._driver_cls_kw) kw.update({ 'proxy': self.proxy, 'factory': self.factory, }) driver = self._driver_cls(*args, **kw) if and_add: self._drivers.put(driver) return driver def _load_drivers(self): """ Load the web driver instances and containers. Raises: DriverPoolRuntimeException: when the requested number of drivers for the given pool size cannot be created for some reason. Returns: None """ if not self._drivers.empty(): # pragma: no cover return threads = [] for o in range(self.size): self.logger.debug('creating driver %d of %d', o + 1, self.size) thread = gevent.spawn(self._load_driver) threads.append(thread) for t in reversed(threads): t.join() if not self._drivers.full(): raise DriverPoolRuntimeException( 'unable to fulfill required concurrent drivers, %d of %d' % ( self._drivers.qsize(), self.size)) def _recycle_driver(self, driver): if not driver: return try: driver.quit() except Exception as e: self.logger.exception(e, exc_info=True) # do NOT add the new driver container to the drivers queue, # instead this will be handled in the recycle logic that requested # the driver in the first place. Instead of returning the one it # received this "new" instance will be put in its placed. print('RECYCLED!!!!!!') return self._load_driver(and_add=False) def add_async(self, *items): """ Add additional items to the asynchronous processing queue. Args: items (list(Any)): list of items that need processing. Each item is applied one at a time to an available driver from the pool. Raises: StopIteration: when all items have been added. 
""" if len(items) == 1 and isinstance(items[0], list): items = iter(items[0]) if not items: raise DriverPoolValueError( 'cannot add items with value: %s' % str(items)) item_count = count(items) self.logger.debug('adding %d additional items to tasks', item_count) for o in items: self._tasks.put(o) def close(self): """ Force close all the drivers and cleanup their containers. Returns: None """ self.__cleanup(force=True) def execute(self, fn, items, preserve_order=False, auto_clean=True, no_wait=False): """ Execute a fixed function, blocking for results. Args: fn (Callable): function that takes two parameters, ``driver`` and ``task``. items (list(Any)): list of items that need processing. Each item is applied one at a time to an available driver from the pool. preserve_order (bool): should the results be returned in the order they were supplied via ``items``. It's more performant to allow results to return in any order. auto_clean (bool): cleanup docker containers after executing. If multiple processing tasks are going to be used, it's more performant to leave the containers running and reuse them. no_wait (bool): forgo a small sleep interval between finishing a task and putting the driver back in the available drivers pool. Yields: results: the result for each item as they're finished. """ def worker(o): job_num, item = o self.logger.debug('doing work on item %d' % job_num) driver = self._drivers.get(block=True) ret_val = fn(driver, item) if not no_wait: gevent.sleep(self.INNER_THREAD_SLEEP) self._drivers.put(driver) return ret_val if self.__feeder_green: raise DriverPoolRuntimeException( 'cannot perform a blocking execute while async processing') self.__bootstrap() self.logger.debug('starting sync processing') if preserve_order: ittr = self._pool.imap else: ittr = self._pool.imap_unordered self.logger.debug('yielding processed results') for o in ittr(worker, enumerate(items)): self._results.put(o) self._results.put(StopIteration) self.logger.debug('stopping sync processing') if auto_clean: self.logger.debug('auto cleanup pool environment') self.__cleanup(force=True) return self.results(block=False) def execute_async(self, fn, items=None, callback=None, catch=(WebDriverException,), requeue_task=False): """ Execute a fixed function in the background, streaming results. Args: fn (Callable): function that takes two parameters, ``driver`` and ``task``. items (list(Any)): list of items that need processing. Each item is applied one at a time to an available driver from the pool. callback (Callable): function that takes a single parameter, the return value of ``fn`` when its finished processing and has returned the driver to the queue. catch (tuple[Exception]): tuple of Exception classes to catch during task execution. If one of these Exception classes is caught during ``fn`` execution the driver that crashed will attempt to be recycled. requeue_task (bool): in the event of an Exception being caught should the task/item that was being worked on be re-added to the queue of items being processed. Raises: DriverPoolValueError: if ``callback`` is not ``None`` or ``callable``. 
Returns: None """ def worker(fn, task): ret_val = None async_task_id = gen_uuid(12) self.logger.debug('starting async task %s', async_task_id) driver = self._drivers.get(block=True) if isinstance(driver, Exception): raise driver try: ret_val = fn(driver, task) except catch as e: self.logger.exception('hihi') if self.is_processing: driver = self._recycle_driver(driver) if requeue_task: self._tasks.put(task) finally: self._results.put(ret_val) self._drivers.put(driver) gevent.sleep(self.INNER_THREAD_SLEEP) return ret_val def feeder(): self.logger.debug('starting async feeder thread') while True: while not self._tasks.empty(): task = self._tasks.get() if self._pool is None: break self._pool.apply_async( worker, args=(fn, task,), callback=greenlet_callback) gevent.sleep(self.INNER_THREAD_SLEEP) if self._pool is None and not self.is_processing: break return if callback is None: def logger(value): self.logger.debug('%s', value) callback = logger def real_callback(cb, value): if isinstance(value, gevent.GreenletExit): raise value else: cb(value) greenlet_callback = partial(real_callback, callback) for f in [fn, callback]: if not callable(f): raise DriverPoolValueError( 'cannot use %s, is not callable' % callback) self.logger.debug('starting async processing') self.__bootstrap() if not self.__feeder_green: self.__feeder_green = gevent.spawn(feeder) if items: self.add_async(*items) def quit(self): """ Alias for :func:`~DriverPool.close()`. Included for consistency with driver instances that generally call ``quit`` when they're no longer needed. Returns: None """ if self.__feeder_green: return self.stop_async() return self.close() def results(self, block=True): """ Iterate over available results from processed tasks. Args: block (bool): when ``True``, block this call until all tasks have been processed and all results have been returned. Otherwise this will continue indefinitely while tasks are dynamically added to the async processing queue. Yields: results: one result at a time as they're finished. Raises: StopIteration: when the processing is finished. """ est_size = self._results.qsize() self.logger.debug('there are an estimated %d results', est_size) if block: self.logger.debug('blocking for results to finish processing') while self.is_processing: while not self._results.empty(): yield self._results.get() gevent.sleep(self.INNER_THREAD_SLEEP) if self._tasks.empty() and self._results.empty(): break raise StopIteration else: if est_size > 0: self.logger.debug('returning as many results as have finished') self._results.put(StopIteration) for result in self._results: yield result def stop_async(self, timeout=None, auto_clean=True): """ Stop all the async worker processing from executing. Args: timeout (float): number of seconds to wait for pool to finish processing before killing and closing out the execution. auto_clean (bool): cleanup docker containers after executing. If multiple processing tasks are going to be used, it's more performant to leave the containers running and reuse them. 
Returns: None """ self.logger.debug('stopping async processing') if self.__feeder_green: self.logger.debug('killing async feeder thread') gevent.kill(self.__feeder_green) self.__feeder_green = None if self._pool: self.logger.debug('joining async pool before kill') self._pool.join(timeout=timeout or 1.0) self._pool.kill(block=False) tasks_count = self._tasks.qsize() self.logger.info('%d tasks remained unprocessed', tasks_count) if auto_clean: self.logger.debug('auto cleanup pool environment') self.__cleanup(force=True)
class FastGet:
    def __init__(self, url, dic, threads=100, report_db=False, keepalive=None, table_name=None):
        self.url = url
        parts = urlparse(url)
        self.scheme, self.host, self.port = parts.scheme, parts.hostname, parts.port
        if not self.port:
            self.port = 443 if self.scheme == 'https' else 80
        self.keepalive = keepalive
        try:
            instance = HehReq(self.host, int(self.port), self.scheme, self.keepalive)
        except Exception as e:
            logging.error('Init exception for %s: %s' % (self.url, e))
            return
        if not keepalive:
            self.keepalive = instance.detect_keepalive()
        if self.keepalive == 0:
            logging.error('Keep-Alive value for %s appears to be 0, check the connection' % url)
            return
        logging.warning('Calculated Keep-Alive for %s: %s' % (url, self.keepalive))
        self.report_db = report_db
        if report_db:
            self.table = table_name
            self.sql_conn(report_db)
        self.queue = JoinableQueue()
        [self.queue.put(dic[i:i + self.keepalive]) for i in xrange(0, len(dic), self.keepalive)]
        [spawn(self.worker) for _ in xrange(threads)]
        self.queue.join()

    def sql_conn(self, report_db):
        self.conn = MySQLdb.connect(report_db['host'], report_db['user'], report_db['passwd'], report_db['db'])
        self.cur = self.conn.cursor()
        if not self.table:
            self.table = 'scan_%s' % datetime.strftime(datetime.now(), '%Y_%m_%d_%H%M%S')
            self.cur.execute('create table %s(scheme varchar(16), host varchar(128), port smallint, uri varchar(128), code smallint, size int, type varchar(128))' % self.table)

    def report(self, result):
        if result[1] not in [302, 404]:
            logging.warning('Path %s://%s:%s/%s, response code %s, content-length %s, content-type %s' % (
                self.scheme, self.host, self.port, result[0], result[1], result[2], result[3]))
        if self.report_db:
            p = [self.scheme, self.host, self.port] + list(result)
            self.cur.execute('insert into %s values(%%s,%%s,%%s,%%s,%%s,%%s,%%s)' % self.table, p)

    def worker(self):
        try:
            instance = HehReq(self.host, int(self.port), self.scheme, self.keepalive)
        except Exception as e:
            logging.error('Worker init exception for %s: %s' % (self.url, e))
            return
        while not self.queue.empty():
            paths = self.queue.get()
            try:
                for x in instance.bulk_get(paths):
                    self.report(x)
            except Exception as e:
                logging.error('Worker loop exception for %s: %s' % (self.url, e))
            finally:
                if self.report_db:
                    self.conn.commit()
                self.queue.task_done()
class GeventConsumer(object): def __init__( self, consumer_config=None, topic=None, parse_func=None, num=8, auto_commit_offset=False, is_debug=False, ): if not parse_func: raise Exception("not parse func, system exit") self.parse = parse_func self.queue = Queue(100) self.stop_flag = Event() self.num = num self.debug = is_debug if not self.debug: self.auto_commit_offset = auto_commit_offset if isinstance(consumer_config, dict): consumer_config.update({'enable.auto.commit':self.auto_commit_offset}) self.consumer = Consumer(consumer_config) self.topic = topic self.consumer.subscribe(self.topic) def sign_handler(self, sig, frame): print(" >>> Termination_signal:[{}] to stop".format(sig)) self.stop_flag.set() def kafka_to_queue(self): logger.info("Start Producer thread") m = 0 time_diff = 0 start_time = time.time() while not self.stop_flag.is_set(): msg = self.consumer.poll(1) if msg is None: time.sleep(0.001) return err = msg.error() if err: if err.code() == KafkaError._PARTITION_EOF: logger.debug( '%s [%s] reached end at offset %s', msg.topic(), msg.partition(), msg.offset() ) else: logger.error('kafka failed, system exit') self.stop_flag.set() self.queue.put(msg) # 消费速度统计 m += 1 current_time = time.time() time_diff = current_time - start_time if time_diff > 10: rate = m / time_diff start_time = current_time m = 0 logger.info('consumer_rate:[%.2f]p/s, queue_size:[%d]' % (rate, self.queue.qsize())) logger.info("Producer thread has stopped") def consume(self): logger.info('Start Thread To Consumer') data = dict() stop = False while True: stop = self.stop_flag.is_set() if stop and self.queue.empty(): break msg = self.queue.get() try: data = self.parse(msg.value()) if data: self.handle_data(data, stop) finally: self.queue.task_done() if not stop and not self.auto_commit_offset: self.consumer.commit(msg) logger.info('Thread Consumer has stopped') def handle_data(self, data, stop): raise NotImplementedError def consume_forever(self): """ start consume forever """ signal(SIGTERM, self.sign_handler) signal(SIGINT, self.sign_handler) if self.debug: consume_func = self.mock_consume produce_func = self.mock_kafka else: consume_func = self.consume produce_func = self.kafka_to_queue task_list = [] for _ in range(self.num): task_list.append(gevent.spawn(consume_func)) produce_func() self.queue.join() if not self.debug: logger.info("closing kafka...") self.consumer.close() gevent.joinall(task_list, timeout=5) logger.info('Exiting with qsize:%d' % self.queue.qsize()) # ===========mock kafka and consumer======================= def mock_kafka(self): logger.info("Start Producer thread") m = 0 time_diff = 0 start_time = time.time() # jing5 msg msg = "23230254455354325631393046433232323232320101008e14080b0e0c38426e0101008422551354455354325631393046433232323232323131313131313131313131313131313131313131313131313131313131313131313130010000000002803365818a91eb00010002fffe050018fffe2eeb596f50830005e91efd02649c6b7eb1ac0d80000043c497fd0022f90a3d057b2403032581373635343332310082e99f008a06".decode('hex') while not self.stop_flag.is_set(): self.queue.put(msg) m += 1 # 消费速度统计 current_time = time.time() time_diff = current_time - start_time if time_diff > 5: rate = m / time_diff start_time = current_time m = 0 logger.info('consumer_rate:[%.2f]p/s, queue_size:[%d]' % (rate, self.queue.qsize())) logger.info("closing produce...") logger.info("Producer thread has stopped") def mock_consume(self): logger.info('Start Thread To Consumer') data = dict() stop = False while True: stop = self.stop_flag.is_set() if stop and 
self.queue.empty(): break msg = self.queue.get() try: data = self.parse(msg) self.handle_data(data, stop) except Exception as err: logger.error("consumer:{}".format(getcurrent())) finally: self.queue.task_done() logger.info('Thread Consumer has stopped')
class AsynSpiderWithGevent(MySpider):
    def __init__(self, out=BasicAnalysis(), **kwargs):
        super(AsynSpiderWithGevent, self).__init__(out, **kwargs)
        self.q = JoinableQueue()
        self.fetching, self.fetched = set(), set()

    def assign_jobs(self, jobs):
        for job in jobs:
            self.q.put(job)

    def run(self):
        if self.q.empty():
            url = LIST_URL + urllib.urlencode(self.list_query)
            self.q.put(url)
        for _ in range(CONCURRENCY):
            gevent.spawn(self.worker)
        self.q.join()
        assert self.fetching == self.fetched
        self._out.finish()

    def worker(self):
        while True:
            self.fetch_url()

    def fetch_url(self):
        current_url = self.q.get()
        try:
            if current_url in self.fetching:
                return
            self.fetching.add(current_url)
            resp = requests.get(current_url, headers=HEADERS)
            self.fetched.add(current_url)
            xml = etree.fromstring(resp.content)
            has_total_count = xml.xpath("//totalcount/text()")
            if has_total_count:  # non-empty means this is a list page, otherwise a detail page
                total_count = int(has_total_count[0])
                if total_count == 0:
                    return  # list page out of range
                if self.list_query["pageno"] == 1:
                    pageno = 2
                    # while pageno < 10:
                    while pageno <= total_count / PAGE_SIZE:
                        self.list_query["pageno"] = pageno
                        next_list_url = LIST_URL + urllib.urlencode(self.list_query)
                        self.q.put(next_list_url)
                        # logging.info(next_list_url)
                        pageno += 1
                job_ids = xml.xpath("//jobid/text()")
                job_detail_urls = []
                for ID in job_ids:
                    new_detail_query = DETAIL_QUERY.copy()
                    new_detail_query["jobid"] = ID
                    job_detail_urls.append(DETAIL_URL + urllib.urlencode(new_detail_query))
                for detail_url in job_detail_urls:
                    self.q.put(detail_url)
                    # logging.info(detail_url)
            else:
                self._out.collect(xml)
        finally:
            self.q.task_done()
class HttpScanner(object): def __init__(self, args): """ Initialise HTTP scanner :param args: :return: """ self.args = args self.output = HttpScannerOutput(args) self._init_scan_options() # Reading files self.output.write_log("Reading files and deduplicating.", logging.INFO) self.hosts = self._file_to_list(args.hosts) self.urls = self._file_to_list(args.urls) # self._calc_urls() out = 'Loaded %i hosts %i urls' % (self.hosts_count, self.urls_count) if self.args.ports is not None: out += ' %i ports' % len(self.args.ports) self.output.print_and_log(out) if self.args.ports is not None and not self.args.syn: new_hosts = [] for host in self.hosts: for port in self.args.ports: # print(host, port) new_hosts.append(helper.generate_url(host, port)) self.hosts = new_hosts # self._calc_urls() self.output.print_and_log('%i full urls to scan' % self.full_urls_count) # Queue and workers self.hosts_queue = JoinableQueue() self.workers = [] def _file_to_list(self, filename, dedup=True): """ Get list from file :param filename: file to read :return: list of lines """ if not path.exists(filename) or not path.isfile(filename): self.output.print_and_log('File %s not found!' % filename, logging.ERROR) exit(-1) # Preparing lines list lines = filter(lambda line: line is not None and len(line) > 0, open(filename).read().split('\n')) if len(lines) == 0: self.output.print_and_log('File %s is empty!' % filename, logging.ERROR) exit(-1) return helper.deduplicate(lines) if dedup else lines def _init_scan_options(self): # Session self.session = session() self.session.timeout = self.args.timeout self.session.verify = False # TODO: debug and check # self.session.mount("http://", HTTPAdapter(max_retries=self.args.max_retries)) # self.session.mount("https://", HTTPAdapter(max_retries=self.args.max_retries)) # http://stackoverflow.com/questions/15431044/can-i-set-max-retries-for-requests-request # Max retries adapters.DEFAULT_RETRIES = self.args.max_retries # TOR if self.args.tor: self.output.write_log("TOR usage detected. Making some checks.") self.session.proxies = { 'http': 'socks5://127.0.0.1:9050', 'https': 'socks5://127.0.0.1:9050' } url = 'http://ifconfig.me/ip' real_ip, tor_ip = None, None # Ger real IP address try: real_ip = get(url).text.strip() except Exception as exception: self.output.print_and_log("Couldn't get real IP address. Check yout internet connection.", logging.ERROR) self.output.write_log(str(exception), logging.ERROR) exit(-1) # Get TOR IP address try: tor_ip = self.session.get(url).text.strip() except Exception as exception: self.output.print_and_log("TOR socks proxy doesn't seem to be working.", logging.ERROR) self.output.write_log(str(exception), logging.ERROR) exit(-1) # Show IP addresses self.output.print_and_log('Real IP: %s TOR IP: %s' % (real_ip, tor_ip)) if real_ip == tor_ip: self.output.print_and_log("TOR doesn't work! 
Stop to be secure.", logging.ERROR) exit(-1) # Proxy if self.args.proxy is not None: self.session.proxies = {"https": self.args.proxy, "http": self.args.proxy} # Auth if self.args.auth is not None: items = self.args.auth.split(':') self.session.auth = (items[0], items[1]) # Cookies self.cookies = {} if self.args.cookies is not None: self.cookies = Cookies.from_request(self.args.cookies) # Cookies from file if self.args.load_cookies is not None: if not path.exists(self.args.load_cookies) or not path.isfile(self.args.load_cookies): self.output.print_and_log('Could not find cookie file: %s' % self.args.load_cookies, logging.ERROR) exit(-1) self.cookies = MozillaCookieJar(self.args.load_cookies) self.cookies.load() self.session.cookies = self.cookies # User-Agent self.ua = UserAgent() if self.args.random_agent else None def worker(self, worker_id): self.output.write_log('Worker %i started.' % worker_id) while not self.hosts_queue.empty(): host = self.hosts_queue.get() try: self.scan_host(worker_id, host) finally: self.output.write_log('Worker %i finished.' % worker_id) self.hosts_queue.task_done() def _head_available(self, host): """ Determine if HEAD requests is allowed :param host: :return: """ # Trying to use OPTIONS request try: response = self.session.options(host, headers=self._fill_headers()) o = response.headers['allow'] if 'allow' in response.headers else None if o is not None and o.find('HEAD') != -1: return True except: # TODO: fix pass try: return False if self.session.head(host, headers=self._fill_headers()).status_code == 405 else True except: # TODO: fix return False def scan_host(self, worker_id, host): # check if resolvable ip = helper.url_to_ip(host) if ip is None: self.output.write_log('Could not resolve %s Skipping...' % host, logging.WARNING) self.output.urls_scanned += len(self.urls) return # Check for HEAD host_url = helper.host_to_url(host) head_available = False if self.args.head: head_available = self._head_available(host) if head_available: self.output.write_log('HEAD is supported for %s' % host) errors_count, urls_scanned = 0, 0 for url in self.urls: full_url = urljoin(host_url, url) r = self.scan_url(full_url, head_available) urls_scanned += 1 self.output.urls_scanned += 1 # Output r['worker'] = worker_id self.output.write(**r) if r['exception'] is not None: errors_count += 1 # Skip host on errors if self.args.skip is not None and errors_count == self.args.skip: self.output.write_log('Errors limit reached on %s Skipping other urls.' % host, logging.WARNING) self.output.urls_scanned += len(self.urls) - urls_scanned break # cookies bugfix? 
self.session.cookies.clear() def _fill_headers(self): # Fill UserAgent in headers headers = {} if self.args.user_agent is not None: headers['User-agent'] = self.args.user_agent elif self.args.random_agent: headers['User-agent'] = self.ua.random # Fill Referer in headers if self.args.referer is not None: headers['Referer'] = self.args.referer return headers def _parse_response(self, url, response, exception): res = {'url': url, 'response': response, 'exception': exception} if response is None or exception is not None: res.update({ 'status': -1, 'length': -1, }) return res try: length = int(response.headers['content-length']) if 'content-length' in response.headers else len( response.text) except Exception as exception: self.output.write_log( "Exception while getting content length for URL: %s Exception: %s" % (url, str(exception)), logging.ERROR) length = 0 res.update({ 'status': response.status_code, 'length': length, }) return res def scan_url(self, url, use_head=False): self.output.write_log('Scanning %s' % url, logging.DEBUG) # Query URL and handle exceptions response, exception = None, None method = 'HEAD' if use_head else 'GET' try: # TODO: add support for user:password in URL response = self.session.request(method, url, headers=self._fill_headers(), allow_redirects=self.args.allow_redirects) except ConnectionError as ex: self.output.write_log('Connection error while quering %s' % url, logging.ERROR) exception = ex except HTTPError as ex: self.output.write_log('HTTP error while quering %s' % url, logging.ERROR) exception = ex except Timeout as ex: self.output.write_log('Timeout while quering %s' % url, logging.ERROR) exception = ex except TooManyRedirects as ex: self.output.write_log('Too many redirects while quering %s' % url, logging.ERROR) exception = ex except Exception as ex: self.output.write_log('Unknown exception while quering %s' % url, logging.ERROR) exception = ex # print('cookies: %s' % self.cookies) print('session.cookies: %s' % self.session.cookies) # self.session.cookies = self.cookies return self._parse_response(url, response, exception) def signal_handler(self): """ Signal hdndler :return: """ # TODO: add saving status via pickle self.output.print_and_log('Signal caught. Stopping...', logging.WARNING) self.stop() exit(signal.SIGINT) def _calc_urls(self): # Calculations self.urls_count = len(self.urls) self.hosts_count = len(self.hosts) self.full_urls_count = len(self.urls) * len(self.hosts) self.output.args.urls_count = self.full_urls_count def start(self): """ Start mulithreaded scan :return: """ # Set signal handler gevent.signal(signal.SIGTERM, self.signal_handler) gevent.signal(signal.SIGINT, self.signal_handler) gevent.signal(signal.SIGQUIT, self.signal_handler) # ICMP scan if self.args.icmp: if geteuid() != 0: self.output.print_and_log('To use ICMP scan option you must run as root. Skipping ICMP scan', logging.WARNING) else: self.output.print_and_log('Starting ICMP scan.') self.hosts = helper.icmp_scan(self.hosts, self.args.timeout) self._calc_urls() self.output.print_and_log('After ICMP scan %i hosts %i urls loaded, %i urls to scan' % (self.hosts_count, self.urls_count, self.full_urls_count)) # SYN scan if self.args.syn: if self.args.tor or self.args.proxy is not None: self.output.print_and_log('SYN scan via tor or proxy is impossible!', logging.WARNING) self.output.print_and_log('Stopping to prevent deanonymization!', logging.WARNING) exit(-1) if geteuid() != 0: self.output.print_and_log('To use SYN scan option you must run as root. 
Skipping SYN scan', logging.WARNING) else: self.output.print_and_log('Starting SYN scan.') self.hosts = helper.syn_scan(self.hosts, self.args.ports, self.args.timeout) self._calc_urls() self.output.print_and_log('After SYN scan %i hosts %i urls loaded, %i urls to scan' % (self.hosts_count, self.urls_count, self.full_urls_count)) # Check threds count vs hosts count if self.args.threads > self.hosts_count: self.output.write_log('Too many threads! Fixing threads count to %i' % self.hosts_count, logging.WARNING) threads_count = self.hosts_count else: threads_count = self.args.threads # Output urls count self.output.args.urls_count = self.full_urls_count # Start workers self.workers = [spawn(self.worker, i) for i in range(threads_count)] # Fill and join queue [self.hosts_queue.put(host) for host in self.hosts] self.hosts_queue.join() def stop(self): """ Stop scan :return: """ # TODO: stop correctly gevent.killall(self.workers)
class BaseCrawler(object): def __init__(self, requestHandler=BaseRequestHandler(), parseHandler=BaseParseHandler(), sheduler=BaseScheduler(), pipeline=BasePipeline()): self.requestHandler = requestHandler self.parseHandler = parseHandler self.sheduler = sheduler self.pipeline = pipeline self.task_queue = JoinableQueue() self.response_queue = JoinableQueue() self.tasks_cnt = 0 self.result_queue = JoinableQueue() self.jobs_cnt = config.num_threads self.start_time = time.time() self.stop = False def doScheduler(self): """Generate tasks, one thread """ logging.info('scheduler started!') for task in self.sheduler.init_generator(): self.task_queue.put(task) self.tasks_cnt += 1 while self.tasks_cnt > 0 and not self.stop: gevent.sleep(config.new_task_check_time) logging.info('scheduler finished! All task done.') for i in xrange(config.num_threads): self.task_queue.put(StopIteration) def worker(self): """Fetch url and parse, config.num_threads threads """ task = self.task_queue.get() cnt = config.error_retry_cnt while task != StopIteration: try: #timeout = gevent.Timeout(config.TASK_TIMEOUT) #timeout.start() response = self.requestHandler.handle(task) result, new_tasks = self.parseHandler.handle(response) #timeout.cancel() #if isinstance(result, collections.Iterable): #if isinstance(result, list): # for ret in result: # self.result_queue.put(ret) #else: if result: self.result_queue.put(result) for task in new_tasks: self.task_queue.put(task) self.tasks_cnt += 1 #self.task_queue.task_done() self.tasks_cnt -= 1 task = self.task_queue.get() cnt = config.error_retry_cnt except Exception as e: try: #timeout.cancel() cnt -= 1 logging.exception(e) if cnt <= 0: #self.task_queue.task_done() self.tasks_cnt -= 1 task = self.task_queue.get() logging.error( 'task failed, try \033[31m%d\033[0m times! will not try' % (config.error_retry_cnt - cnt)) cnt = config.error_retry_cnt #logging.exception('task failed!') else: logging.error( 'task failed, try \033[31m%d\033[0m times!' % (config.error_retry_cnt - cnt)) except Exception as e: self.tasks_cnt -= 1 #self.jobs_cnt -= 1 raise finally: #timeout.cancel() pass self.jobs_cnt -= 1 def doPipeline(self): while self.jobs_cnt > 0 or not self.result_queue.empty(): gevent.sleep(config.pipeline_sleeptime) results = [] try: while 1: results.append(self.result_queue.get_nowait()) if len(results) > 100: raise gevent.queue.Empty except gevent.queue.Empty: if results: try: self.pipeline.process(results) except: logging.exception('') #logging.exception('') except: logging.exception('') def run(self): jobs = [ gevent.spawn(self.doScheduler), gevent.spawn(self.doPipeline), ] for i in xrange(config.num_threads): job = gevent.spawn(self.worker) jobs.append(job) #thread.start_new_thread(self.worker) try: timeout = gevent.Timeout(config.CRAWLER_TIMEOUT) timeout.start() #self.task_queue.join() gevent.joinall(jobs) except: logging.exception('pipeline error!') finally: timeout.cancel() self.end_time = time.time() logging.info('run times: %f s' % (self.end_time - self.start_time))
class ScoringService(Service): """A service that assigns a score to submission results. A submission result is ready to be scored when its compilation is unsuccessful (in this case, no evaluation will be performed) or after it has been evaluated. The goal of scoring is to use the evaluations to determine score, score_details, public_score, public_score_details and ranking_score_details (all non-null). Scoring is done by the compute_score method of the ScoreType defined by the dataset of the result. ScoringService keeps a queue of (submission_id, dataset_id) pairs identifying submission results to score. A greenlet is spawned to consume this queue, one item at a time. The queue is filled by the new_evaluation and the invalidate_submissions RPC methods, and by a sweeper greenlet, whose duty is to regularly check all submissions in the database and put the unscored ones in the queue (this check can also be forced by the search_jobs_not_done RPC method). """ # How often we look for submission results not scored. SWEEPER_TIMEOUT = 347.0 def __init__(self, shard): """Initialize the ScoringService. """ Service.__init__(self, shard) # Set up communication with ProxyService. self.proxy_service = self.connect_to(ServiceCoord("ProxyService", 0)) # Set up and spawn the scorer. # TODO Link to greenlet: when it dies, log CRITICAL and exit. self._scorer_queue = JoinableQueue() gevent.spawn(self._scorer_loop) # Set up and spawn the sweeper. # TODO Link to greenlet: when it dies, log CRITICAL and exit. self._sweeper_start = None self._sweeper_event = Event() gevent.spawn(self._sweeper_loop) def _scorer_loop(self): """Monitor the queue, scoring its top element. This is an infinite loop that, at each iteration, gets an item from the queue (blocking until there is one, if the queue is empty) and scores it. Any error during the scoring is sent to the logger and then suppressed, because the loop must go on. """ while True: submission_id, dataset_id = self._scorer_queue.get() try: self._score(submission_id, dataset_id) except Exception: logger.error("Unexpected error when scoring submission %d on " "dataset %d.", submission_id, dataset_id, exc_info=True) finally: self._scorer_queue.task_done() def _score(self, submission_id, dataset_id): """Assign a score to a submission result. This is the core of ScoringService: here we retrieve the result from the database, check if it is in the correct status, instantiate its ScoreType, compute its score, store it back in the database and tell ProxyService to update RWS if needed. submission_id (int): the id of the submission that has to be scored. dataset_id (int): the id of the dataset to use. """ with SessionGen() as session: # Obtain submission. submission = Submission.get_from_id(submission_id, session) if submission is None: raise ValueError("Submission %d not found in the database." % submission_id) # Obtain dataset. dataset = Dataset.get_from_id(dataset_id, session) if dataset is None: raise ValueError("Dataset %d not found in the database." % dataset_id) # Obtain submission result. submission_result = submission.get_result(dataset) # It means it was not even compiled (for some reason). if submission_result is None: raise ValueError("Submission result %d(%d) was not found." % (submission_id, dataset_id)) # Check if it's ready to be scored. 
if not submission_result.needs_scoring(): if submission_result.scored(): logger.info("Submission result %d(%d) is already scored.", submission_id, dataset_id) return else: raise ValueError("The state of the submission result " "%d(%d) doesn't allow scoring." % (submission_id, dataset_id)) # Instantiate the score type. score_type = get_score_type(dataset=dataset) # Compute score and fill it in the database. submission_result.score, \ submission_result.score_details, \ submission_result.public_score, \ submission_result.public_score_details, \ submission_result.ranking_score_details = \ score_type.compute_score(submission_result) # Store it. session.commit() # If dataset is the active one, update RWS. if dataset is submission.task.active_dataset: self.proxy_service.submission_scored( submission_id=submission.id) def _sweeper_loop(self): """Regularly check the database for unscored results. Try to sweep the database once every SWEEPER_TIMEOUT seconds but make sure that no two sweeps run simultaneously. That is, start a new sweep SWEEPER_TIMEOUT seconds after the previous one started or when the previous one finished, whatever comes last. The search_jobs_not_done RPC method can interfere with this regularity, as it tries to run a sweeper as soon as possible: immediately, if no sweeper is running, or as soon as the current one terminates. Any error during the sweep is sent to the logger and then suppressed, because the loop must go on. """ while True: self._sweeper_start = monotonic_time() self._sweeper_event.clear() try: self._sweep() except Exception: logger.error("Unexpected error when searching for unscored " "submissions.", exc_info=True) self._sweeper_event.wait(max(self._sweeper_start + self.SWEEPER_TIMEOUT - monotonic_time(), 0)) def _sweep(self): """Check the database for unscored submission results. Obtain a list of all the submission results in the database, check each of them to see if it's still unscored and, in case, put it in the queue. """ counter = 0 with SessionGen() as session: for sr in get_submission_results(session=session): if sr is not None and sr.needs_scoring(): self._scorer_queue.put((sr.submission_id, sr.dataset_id)) counter += 1 if counter > 0: logger.info("Found %d unscored submissions.", counter) @rpc_method def search_jobs_not_done(self): """Make the sweeper loop fire the sweeper as soon as possible. """ self._sweeper_event.set() @rpc_method def new_evaluation(self, submission_id, dataset_id): """Schedule the given submission result for scoring. Put it in the queue to have it scored, sooner or later. Usually called by EvaluationService when it's done with a result. submission_id (int): the id of the submission that has to be scored. dataset_id (int): the id of the dataset to use. """ self._scorer_queue.put((submission_id, dataset_id)) @rpc_method def invalidate_submission(self, submission_id=None, dataset_id=None, user_id=None, task_id=None, contest_id=None): """Invalidate (and re-score) some submission results. Invalidate the scores of the submission results that: - belong to submission_id or, if None, to any submission of user_id and/or task_id or, if both None, to any submission of contest_id or, if None, to any submission in the database. - belong to dataset_id or, if None, to any dataset of task_id or, if None, to any dataset of contest_id or, if None, to any dataset in the database. submission_id (int|None): id of the submission whose results should be invalidated, or None. 
dataset_id (int|None): id of the dataset whose results should be invalidated, or None. user_id (int|None): id of the user whose results should be invalidated, or None. task_id (int|None): id of the task whose results should be invalidated, or None. contest_id (int|None): id of the contest whose results should be invalidated, or None. """ logger.info("Invalidation request received.") # We can put results in the scorer queue only after they have # been invalidated (and committed to the database). Therefore # we temporarily save them somewhere else. temp_queue = list() with SessionGen() as session: submission_results = \ get_submission_results(contest_id, user_id, task_id, submission_id, dataset_id, session=session) for sr in submission_results: if sr.scored(): sr.invalidate_score() temp_queue.append((sr.submission_id, sr.dataset_id)) session.commit() for item in temp_queue: self._scorer_queue.put(item) logger.info("Invalidated %d submissions.", len(temp_queue))
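The scorer loop above is a single-consumer pattern: one greenlet blocks on get(), scores the item, logs and suppresses any error, and always calls task_done() so that join() on the queue stays meaningful. A stripped-down sketch of that loop, with score() as a hypothetical stand-in for the real database and score-type work:

import logging
import gevent
from gevent.queue import JoinableQueue

logger = logging.getLogger(__name__)
scorer_queue = JoinableQueue()

def score(submission_id, dataset_id):
    # hypothetical stand-in for the real scoring work
    logger.info("scoring submission %d on dataset %d", submission_id, dataset_id)

def scorer_loop():
    while True:
        submission_id, dataset_id = scorer_queue.get()
        try:
            score(submission_id, dataset_id)
        except Exception:
            # errors are logged and suppressed so the loop keeps running
            logger.exception("scoring %d/%d failed", submission_id, dataset_id)
        finally:
            scorer_queue.task_done()

gevent.spawn(scorer_loop)
scorer_queue.put((1, 1))
scorer_queue.put((2, 1))
scorer_queue.join()   # returns once both items have been scored (or failed)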
class LeakQueue(object): def __init__(self, maxsize=0, workers=10): """ Set up the gevent queue and the workers. :param int maxsize: the max length of the queue; by default the queue size is infinite. :param int workers: the number of workers, default=10. """ self.queue = JoinableQueue(maxsize=maxsize) [spawn(self.worker) for x in xrange(workers)] def __repr__(self): return u'{} items in queue'.format(self.queue.qsize()) def put(self, operation, item, date=None): """ Each item is queued for later processing. :param str operation: the operation name. :param item: the item to queue. :param date date: when the item is triggered. :returns: True if insertion succeeds, False otherwise. """ try: self.queue.put({ "operation": operation, "item": item, "date": date or datetime.utcnow() }) self.flush() except Exception as e: logger.critical( 'unable to put an item in the queue :: {}'.format(e)) return False else: return True def flush(self, force=False): """ Flush the queue and block until all tasks are done. :param boolean force: force the queue flush. :returns: True if the flush occurs, False otherwise. """ if self.queue.full() or force: logger.info('queue is full ({} items) :: flush it !'.format( self.queue.qsize())) self.queue.join() return True return False def worker(self): while True: try: item = self.queue.get() logger.info('get item :: {}'.format(item)) if not self.worker_process(item): logger.info('re-queue item :: {}'.format(item)) self.queue.put(item) except Empty: logger.info('queue is empty') else: self.queue.task_done() def worker_process(self, item): """ Default action executed by each worker. Must return a truthy value to remove the item; otherwise the worker puts the item back into the queue. """ g_sleep() return item
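One caveat worth noting: queue.get() with no arguments blocks until an item arrives, so the except Empty branch in worker() above can only fire if the call is made non-blocking or given a timeout. A small sketch of a worker written that way (the one-second timeout is an arbitrary illustrative choice):

import gevent
from gevent.queue import JoinableQueue, Empty

queue = JoinableQueue()

def worker():
    while True:
        try:
            item = queue.get(timeout=1.0)   # Empty can now actually be raised
        except Empty:
            gevent.sleep(0)                 # nothing to do; yield and retry
            continue
        try:
            print('processing {}'.format(item))
        finally:
            queue.task_done()               # exactly once per item taken

gevent.spawn(worker)
queue.put('ping')
queue.join()   # returns once 'ping' has been processed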
class InterceptedStreamsMixin(object): """ Mixin class for GethProcess instances that feeds all of the stdout and stderr lines into some set of provided callback functions. """ stdout_callbacks = None stderr_callbacks = None def __init__(self, *args, **kwargs): super(InterceptedStreamsMixin, self).__init__(*args, **kwargs) self.stdout_callbacks = [] self.stdout_queue = JoinableQueue() self.stderr_callbacks = [] self.stderr_queue = JoinableQueue() def register_stdout_callback(self, callback_fn): self.stdout_callbacks.append(callback_fn) def register_stderr_callback(self, callback_fn): self.stderr_callbacks.append(callback_fn) def produce_stdout_queue(self): for line in iter(self.proc.stdout.readline, b''): self.stdout_queue.put(line) gevent.sleep(0) def produce_stderr_queue(self): for line in iter(self.proc.stderr.readline, b''): self.stderr_queue.put(line) gevent.sleep(0) def consume_stdout_queue(self): while True: line = self.stdout_queue.get() for fn in self.stdout_callbacks: fn(line.strip()) gevent.sleep(0) def consume_stderr_queue(self): while True: line = self.stderr_queue.get() for fn in self.stderr_callbacks: fn(line.strip()) gevent.sleep(0) def start(self): super(InterceptedStreamsMixin, self).start() gevent.spawn(self.produce_stdout_queue) gevent.spawn(self.produce_stderr_queue) gevent.spawn(self.consume_stdout_queue) gevent.spawn(self.consume_stderr_queue) def stop(self): super(InterceptedStreamsMixin, self).stop() try: self.stdout_queue.join(5) except Timeout: pass try: self.stderr_queue.join(5) except Timeout: pass
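The mixin pairs a producer greenlet (reading one pipe) with a consumer greenlet (fanning each line out to callbacks) per stream; because the consumer never calls task_done(), stop() has to join each queue with a timeout. Below is a self-contained sketch of one such pair, assuming gevent's cooperative subprocess module and an echo command chosen purely for illustration; unlike the mixin, it calls task_done() so the final join() returns cleanly.

import gevent
from gevent import subprocess           # cooperative Popen keeps the hub responsive
from gevent.queue import JoinableQueue

def on_line(line):
    print(line)

def produce(pipe, queue):
    for line in iter(pipe.readline, b''):
        queue.put(line)
        gevent.sleep(0)                  # give the consumer a chance to run

def consume(queue, callbacks):
    while True:
        line = queue.get()
        for fn in callbacks:
            fn(line.strip())
        queue.task_done()

proc = subprocess.Popen(['echo', 'hello'], stdout=subprocess.PIPE)
stdout_queue = JoinableQueue()
producer = gevent.spawn(produce, proc.stdout, stdout_queue)
gevent.spawn(consume, stdout_queue, [on_line])
producer.join()        # the pipe is exhausted once echo exits
stdout_queue.join()    # every captured line has reached the callbacks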
class WebServer(Flask): def __init__(self, *args, **kwargs): super(WebServer, self).__init__(*args, **kwargs) print 'Webserver started' self.debug = True self.cmd_queue = JoinableQueue() self.event_queue = JoinableQueue() self.cmd_id = 0 self.cmd_results = {} gevent.spawn(self.send_commands_to_debugger) gevent.spawn(self.receive_events_from_debugger) def do_command(self, cmd, args=''): cmd_id = self.generate_cmd_id() self.cmd_results[cmd_id] = AsyncResult() self.cmd_queue.put(( cmd_id, json.dumps({ 'cmd' : cmd, 'args' : args, })) ) result = self.cmd_results[cmd_id].wait() return json.loads(result) def generate_cmd_id(self): self.cmd_id += 1 return self.cmd_id def send_commands_to_debugger(self): print 'start send_commands_to_debugger' conn = None while True: cmd_id, cmd = self.cmd_queue.get() if not cmd: break print 'send command', cmd conn = socket.create_connection(config.command_socket_addr) conn.send(cmd) result = '' while True: data = conn.recv(4096) if not data: break result += data cmd_result = self.cmd_results.pop(cmd_id) cmd_result.set(result) conn.close() def receive_events_from_debugger(self): print 'start receive_events_from_debugger' self.event_server = socket.socket(socket.AF_INET, socket.SOCK_STREAM) self.event_server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) self.event_server.bind(config.event_socket_addr) self.event_server.listen(16) conn, _ = self.event_server.accept() while True: self.event_queue.put(conn.recv(4096)) def clear_event_queue(self): self.event_queue = JoinableQueue() def shutdown(self): self.event_server.close()
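do_command() above pairs the JoinableQueue with per-command AsyncResult objects: the web handler enqueues (cmd_id, payload) and blocks on its own result slot, while a background greenlet talks to the debugger and fills that slot in. A minimal sketch of that request/response pairing, with the socket work replaced by a stub:

import gevent
from gevent.queue import JoinableQueue
from gevent.event import AsyncResult

cmd_queue = JoinableQueue()
cmd_results = {}

def sender():
    while True:
        cmd_id, cmd = cmd_queue.get()
        try:
            # the real code would send cmd over a socket and read the reply here
            cmd_results.pop(cmd_id).set('ok:{}'.format(cmd))
        finally:
            cmd_queue.task_done()

def do_command(cmd_id, cmd):
    result = cmd_results[cmd_id] = AsyncResult()
    cmd_queue.put((cmd_id, cmd))
    return result.wait()   # blocks only this greenlet until the reply arrives

gevent.spawn(sender)
print(do_command(1, 'step'))   # -> 'ok:step'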
class Importer(object): def __init__(self, creds, pool_size=POOL_SIZE): self.client = get_session(creds['host'], creds['key'], creds['secret']) self.queue = JoinableQueue(maxsize=POOL_SIZE*2) for i in range(pool_size): gevent.spawn(self.worker) def worker(self): while True: job = self.queue.get() typ = job.get('type') try: if typ == 'device': self._process_device(job['data']) elif typ == 'datapoints': self._process_datapoints(job['data']) finally: self.queue.task_done() def write_devices(self, devices): for device in devices: self.queue.put({'type': 'device', 'data': device}) self.queue.join() def write_datapoints_from_file(self, infile): points = {} lineno = 0 for line in infile: lineno += 1 (device, sensor, ts, val) = line.split('\t') pts = points.setdefault(device, {}).setdefault(sensor, []) pts.append({'t': ts, 'v': float(val)}) if lineno % 1000 == 0: self.queue.put({'type': 'datapoints', 'data': points}) points = {} if points: self.queue.put({'type': 'datapoints', 'data': points}) self.queue.join() def _process_device(self, device, retries=5): res = self.client.create_device(device) if res.successful != tempoiq.response.SUCCESS: if 'A device with that key already exists' in res.body: print("Skipping creating existing device {}" .format(device['key'])) return if retries > 0: print("Retrying device create {}, error {}" .format(device['key'], res.body)) self._process_device(device, retries - 1) else: print("Retries exceeded; couldn't create device {}" .format(device['key'])) def _process_datapoints(self, write_request, retries=5): try: res = self.client.write(write_request) except Exception, e: print("ERROR with request: --->") print(json.dumps(write_request, default=WriteEncoder().default)) raise e if res.successful != tempoiq.response.SUCCESS: if retries > 0: print("Retrying write, error was: {}".format(res.body)) return self._process_datapoints(write_request, retries - 1) else: print("Retries exceeded; lost data!") print(json.dumps(write_request, default=WriteEncoder().default)) return True return False
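The importer leans on two JoinableQueue features: maxsize (POOL_SIZE * 2 here) applies back-pressure, so put() blocks whenever the workers fall behind, and join() lets write_devices() and write_datapoints_from_file() return only once everything queued has been processed. A reduced sketch of that shape, with process() standing in for the TempoIQ calls:

import gevent
from gevent.queue import JoinableQueue

POOL_SIZE = 4                      # hypothetical worker count

def process(job):
    gevent.sleep(0.01)             # stand-in for the HTTP write

def worker(queue):
    while True:
        job = queue.get()          # blocks while the queue is empty
        try:
            process(job)
        finally:
            queue.task_done()

queue = JoinableQueue(maxsize=POOL_SIZE * 2)   # put() blocks once this fills up
for _ in range(POOL_SIZE):
    gevent.spawn(worker, queue)

for i in range(100):
    queue.put({'type': 'device', 'data': i})   # back-pressure applies here
queue.join()                                   # return only when every job is done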
class HttpScanner(object): def __init__(self, args): """ Initialise HTTP scanner :param args: :return: """ self.args = args self.output = HttpScannerOutput(args) self._init_scan_options() # Reading files self.output.write_log("Reading files and deduplicating.", logging.INFO) self.hosts = self._file_to_list(args.hosts) self.urls = self._file_to_list(args.urls) # self._calc_urls() out = 'Loaded %i hosts %i urls' % (self.hosts_count, self.urls_count) if self.args.ports is not None: out += ' %i ports' % len(self.args.ports) self.output.print_and_log(out) if self.args.ports is not None and not self.args.syn: new_hosts = [] for host in self.hosts: for port in self.args.ports: # print(host, port) new_hosts.append(helper.generate_url(host, port)) self.hosts = new_hosts # self._calc_urls() self.output.print_and_log('%i full urls to scan' % self.full_urls_count) # Queue and workers self.hosts_queue = JoinableQueue() self.workers = [] def _file_to_list(self, filename, dedup=True): """ Get list from file :param filename: file to read :return: list of lines """ if not path.exists(filename) or not path.isfile(filename): self.output.print_and_log('File %s not found!' % filename, logging.ERROR) exit(-1) # Preparing lines list lines = filter(lambda line: line is not None and len(line) > 0, open(filename).read().split('\n')) if len(lines) == 0: self.output.print_and_log('File %s is empty!' % filename, logging.ERROR) exit(-1) return helper.deduplicate(lines) if dedup else lines def _init_scan_options(self): # Session self.session = session() self.session.timeout = self.args.timeout self.session.verify = False # TODO: debug and check # self.session.mount("http://", HTTPAdapter(max_retries=self.args.max_retries)) # self.session.mount("https://", HTTPAdapter(max_retries=self.args.max_retries)) # http://stackoverflow.com/questions/15431044/can-i-set-max-retries-for-requests-request # Max retries adapters.DEFAULT_RETRIES = self.args.max_retries # TOR if self.args.tor: self.output.write_log("TOR usage detected. Making some checks.") self.session.proxies = { 'http': 'socks5://127.0.0.1:9050', 'https': 'socks5://127.0.0.1:9050' } url = 'http://ifconfig.me/ip' real_ip, tor_ip = None, None # Ger real IP address try: real_ip = get(url).text.strip() except Exception as exception: self.output.print_and_log( "Couldn't get real IP address. Check yout internet connection.", logging.ERROR) self.output.write_log(str(exception), logging.ERROR) exit(-1) # Get TOR IP address try: tor_ip = self.session.get(url).text.strip() except Exception as exception: self.output.print_and_log( "TOR socks proxy doesn't seem to be working.", logging.ERROR) self.output.write_log(str(exception), logging.ERROR) exit(-1) # Show IP addresses self.output.print_and_log('Real IP: %s TOR IP: %s' % (real_ip, tor_ip)) if real_ip == tor_ip: self.output.print_and_log( "TOR doesn't work! 
Stop to be secure.", logging.ERROR) exit(-1) # Proxy if self.args.proxy is not None: self.session.proxies = { "https": self.args.proxy, "http": self.args.proxy } # Auth if self.args.auth is not None: items = self.args.auth.split(':') self.session.auth = (items[0], items[1]) # Cookies self.cookies = {} if self.args.cookies is not None: self.cookies = Cookies.from_request(self.args.cookies) # Cookies from file if self.args.load_cookies is not None: if not path.exists(self.args.load_cookies) or not path.isfile( self.args.load_cookies): self.output.print_and_log( 'Could not find cookie file: %s' % self.args.load_cookies, logging.ERROR) exit(-1) self.cookies = MozillaCookieJar(self.args.load_cookies) self.cookies.load() self.session.cookies = self.cookies # User-Agent self.ua = UserAgent() if self.args.random_agent else None def worker(self, worker_id): self.output.write_log('Worker %i started.' % worker_id) while not self.hosts_queue.empty(): host = self.hosts_queue.get() try: self.scan_host(worker_id, host) finally: self.output.write_log('Worker %i finished.' % worker_id) self.hosts_queue.task_done() def _head_available(self, host): """ Determine if HEAD requests is allowed :param host: :return: """ # Trying to use OPTIONS request try: response = self.session.options(host, headers=self._fill_headers()) o = response.headers[ 'allow'] if 'allow' in response.headers else None if o is not None and o.find('HEAD') != -1: return True except: # TODO: fix pass try: return False if self.session.head( host, headers=self._fill_headers()).status_code == 405 else True except: # TODO: fix return False def scan_host(self, worker_id, host): # check if resolvable ip = helper.url_to_ip(host) if ip is None: self.output.write_log('Could not resolve %s Skipping...' % host, logging.WARNING) self.output.urls_scanned += len(self.urls) return # Check for HEAD host_url = helper.host_to_url(host) head_available = False if self.args.head: head_available = self._head_available(host) if head_available: self.output.write_log('HEAD is supported for %s' % host) errors_count, urls_scanned = 0, 0 for url in self.urls: full_url = urljoin(host_url, url) r = self.scan_url(full_url, head_available) urls_scanned += 1 self.output.urls_scanned += 1 # Output r['worker'] = worker_id self.output.write(**r) if r['exception'] is not None: errors_count += 1 # Skip host on errors if self.args.skip is not None and errors_count == self.args.skip: self.output.write_log( 'Errors limit reached on %s Skipping other urls.' % host, logging.WARNING) self.output.urls_scanned += len(self.urls) - urls_scanned break # cookies bugfix? 
self.session.cookies.clear() def _fill_headers(self): # Fill UserAgent in headers headers = {} if self.args.user_agent is not None: headers['User-agent'] = self.args.user_agent elif self.args.random_agent: headers['User-agent'] = self.ua.random # Fill Referer in headers if self.args.referer is not None: headers['Referer'] = self.args.referer return headers def _parse_response(self, url, response, exception): res = {'url': url, 'response': response, 'exception': exception} if response is None or exception is not None: res.update({ 'status': -1, 'length': -1, }) return res try: length = int(response.headers['content-length'] ) if 'content-length' in response.headers else len( response.text) except Exception as exception: self.output.write_log( "Exception while getting content length for URL: %s Exception: %s" % (url, str(exception)), logging.ERROR) length = 0 res.update({ 'status': response.status_code, 'length': length, }) return res def scan_url(self, url, use_head=False): self.output.write_log('Scanning %s' % url, logging.DEBUG) # Query URL and handle exceptions response, exception = None, None method = 'HEAD' if use_head else 'GET' try: # TODO: add support for user:password in URL response = self.session.request( method, url, headers=self._fill_headers(), allow_redirects=self.args.allow_redirects) except ConnectionError as ex: self.output.write_log('Connection error while quering %s' % url, logging.ERROR) exception = ex except HTTPError as ex: self.output.write_log('HTTP error while quering %s' % url, logging.ERROR) exception = ex except Timeout as ex: self.output.write_log('Timeout while quering %s' % url, logging.ERROR) exception = ex except TooManyRedirects as ex: self.output.write_log('Too many redirects while quering %s' % url, logging.ERROR) exception = ex except Exception as ex: self.output.write_log('Unknown exception while quering %s' % url, logging.ERROR) exception = ex # print('cookies: %s' % self.cookies) print('session.cookies: %s' % self.session.cookies) # self.session.cookies = self.cookies return self._parse_response(url, response, exception) def signal_handler(self): """ Signal hdndler :return: """ # TODO: add saving status via pickle self.output.print_and_log('Signal caught. Stopping...', logging.WARNING) self.stop() exit(signal.SIGINT) def _calc_urls(self): # Calculations self.urls_count = len(self.urls) self.hosts_count = len(self.hosts) self.full_urls_count = len(self.urls) * len(self.hosts) self.output.args.urls_count = self.full_urls_count def start(self): """ Start mulithreaded scan :return: """ # Set signal handler gevent.signal(signal.SIGTERM, self.signal_handler) gevent.signal(signal.SIGINT, self.signal_handler) gevent.signal(signal.SIGQUIT, self.signal_handler) # ICMP scan if self.args.icmp: if geteuid() != 0: self.output.print_and_log( 'To use ICMP scan option you must run as root. Skipping ICMP scan', logging.WARNING) else: self.output.print_and_log('Starting ICMP scan.') self.hosts = helper.icmp_scan(self.hosts, self.args.timeout) self._calc_urls() self.output.print_and_log( 'After ICMP scan %i hosts %i urls loaded, %i urls to scan' % (self.hosts_count, self.urls_count, self.full_urls_count)) # SYN scan if self.args.syn: if self.args.tor or self.args.proxy is not None: self.output.print_and_log( 'SYN scan via tor or proxy is impossible!', logging.WARNING) self.output.print_and_log( 'Stopping to prevent deanonymization!', logging.WARNING) exit(-1) if geteuid() != 0: self.output.print_and_log( 'To use SYN scan option you must run as root. 
Skipping SYN scan', logging.WARNING) else: self.output.print_and_log('Starting SYN scan.') self.hosts = helper.syn_scan(self.hosts, self.args.ports, self.args.timeout) self._calc_urls() self.output.print_and_log( 'After SYN scan %i hosts %i urls loaded, %i urls to scan' % (self.hosts_count, self.urls_count, self.full_urls_count)) # Check threds count vs hosts count if self.args.threads > self.hosts_count: self.output.write_log( 'Too many threads! Fixing threads count to %i' % self.hosts_count, logging.WARNING) threads_count = self.hosts_count else: threads_count = self.args.threads # Output urls count self.output.args.urls_count = self.full_urls_count # Start workers self.workers = [spawn(self.worker, i) for i in range(threads_count)] # Fill and join queue [self.hosts_queue.put(host) for host in self.hosts] self.hosts_queue.join() def stop(self): """ Stop scan :return: """ # TODO: stop correctly gevent.killall(self.workers)
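start() above spawns the workers before filling hosts_queue; that works because greenlets created with spawn() do not begin running until the current greenlet blocks (here, on hosts_queue.join()), so each worker first checks empty() against an already-populated queue. A compact sketch of that fill-then-drain pool follows, with scan_host() reduced to a stub and the host list invented for illustration; the killall() at the end mirrors HttpScanner.stop() and cleans up any worker still blocked in get().

import gevent
from gevent import spawn
from gevent.queue import JoinableQueue

def scan_host(worker_id, host):
    gevent.sleep(0.01)              # stand-in for the real HTTP scanning

def worker(worker_id, queue):
    while not queue.empty():        # the queue is already full when this first runs
        host = queue.get()
        try:
            scan_host(worker_id, host)
        finally:
            queue.task_done()

hosts = ['10.0.0.{}'.format(i) for i in range(1, 21)]   # invented host list
queue = JoinableQueue()
workers = [spawn(worker, i, queue) for i in range(min(5, len(hosts)))]
for host in hosts:
    queue.put(host)
queue.join()                        # every host has been scanned
gevent.killall(workers)             # clean up any worker still blocked in get()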
def init(): global patch_loop_greenlet global core_source # add core source sig = [ 14493609762890313342166277786717882067186706504725349899906780741747713356290787356528733464152980047783620946593111196306463577744063955815402148552860145629259653950818107505393643383587083768290613402372295707034951885912924020308782786221888333312179957359121890467597304281160325135791414295786807436357, 1836340799499544967344676626569366761238237327637553699677615341837866857178638560803752775147141401436473176143062386392930849127511639810150938435062071285028855634164277748937448362731305104091415548874264676030905340846245037152836818535938439214826659048244377315288514582697466079356264083762738266643, 89884656743115795873895609296394864029741047392531316591432509289601210992615631812974174607675153482641606235553368183778569185786977952044726620763937252233940116059625337686768538445873713070762889839480360220508177637118657209098549890835520224254015051271431737736621385544038152276933973262030194906397, 1224239220300762038953555488069442663256999688439 ] with transaction: core_source = CoreSource(id=platform, url=settings.patchserver, sig=sig, contact='*****@*****.**') # load sources with transaction, db.Cursor() as c: aa = c.execute("SELECT * FROM patch_source") for a in aa.fetchall(): try: id = json.loads(a['id']) data = json.loads(a['data']) # update old repo urls if 'url' in data and data['url'].startswith('http://patch.download.am'): data['url'] = data['url'].replace('http://patch.download.am', 'http://repo.download.am') if 'url' in data and data['url'].endswith('.git'): source = GitSource(id=id, **data) else: source = PatchSource(id=id, **data) if source.enabled: patch_group.spawn(source.check) except TypeError: log.critical("broken row: {}".format(a)) traceback.print_exc() # delete useless repos for extern in os.listdir(settings.external_plugins): if extern not in sources or not sources[extern].enabled: path = os.path.join(settings.external_plugins, extern) if os.path.isdir(path) and not os.path.exists(os.path.join(path, '.git')): log.info('deleting useless external repo {}'.format(path)) try: really_clean_repo(path) except: pass default_sources = dict( downloadam='http://community.download.am/dlam-config.yaml' ) if not test_mode: for id, url in default_sources.iteritems(): if id not in sources and url not in config_urls: yield 'adding default repo {}'.format(id) try: source = add_source(url) if source is None: continue except: traceback.print_exc() else: if isinstance(source, BasicSource) and source.enabled: patch_group.spawn(source.check) # check and apply updates from gevent.queue import JoinableQueue y = JoinableQueue() complete = list() def source_complete_callback(source): complete.append(source) if len(complete) == len(sources): y.put('updating {} / {}'.format(len(complete), len(sources))) gevent.spawn(patch_all, 30, False, source_complete_callback=source_complete_callback) gevent.sleep(0.2) yield 'updating {} / {}'.format(len(complete), len(sources)) while len(patch_group): try: x = y.get(timeout=1) except: continue yield x patch_group.join() execute_restart() # start the patch loop patch_loop_greenlet = gevent.spawn(patch_loop)
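The update loop at the end of init() uses the JoinableQueue purely as a progress channel: the completion callback pushes a status string onto it, and the generator polls with get(timeout=1) so it can keep yielding progress lines while the patch group is still working. A reduced sketch of that progress-channel idea (each fake source below reports individually, purely for illustration):

import gevent
from gevent.pool import Group
from gevent.queue import JoinableQueue, Empty

progress = JoinableQueue()
group = Group()

def check_source(name):
    gevent.sleep(0.1)                       # stand-in for a repository update
    progress.put('updated {}'.format(name))

def status_messages(sources):
    for name in sources:
        group.spawn(check_source, name)
    done = 0
    while done < len(sources):
        try:
            msg = progress.get(timeout=1)   # poll so reporting continues while work runs
        except Empty:
            continue
        done += 1
        yield msg
    group.join()

for line in status_messages(['core', 'downloadam']):
    print(line)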