def generator():
    import gevent.local
    lo = gevent.local.local()
    lo.tid = 1
    with store.begin(write=True):
        Host.by_unchecked.find()
    with store.begin():
        for host in Host.by_unchecked.values(max=1000):
            pool.wait_available()
            pool.add(gevent.spawn(worker, host))
def crawl(start_url, concurrency_level, visited_link_limit):
    """
    Main crawling function. Uses a pool of greenlets to get the job done.
    :param start_url: URL to start crawling from
    :param concurrency_level: number of concurrent downloads
    :param visited_link_limit: maximum number of links to crawl
    :return: None
    """
    print('start crawling from %s' % start_url)
    print('concurrency level: %s, visited link limit: %s'
          % (concurrency_level, visited_link_limit))

    # init our pending links with start_url
    pending_links.append(start_url)
    pool = gevent.pool.Pool(concurrency_level)

    # limit number of visited links, just for testing purposes
    while len(visited_links) < visited_link_limit and (
            len(pending_links) > 0 or len(crawlin_links) > 0):
        # if there is nothing more to schedule, wait for the current jobs
        # to complete and try again
        if not pending_links:
            pool.join()
            continue
        link = pending_links.pop(0)
        crawlin_links.add(link)
        pool.wait_available()
        pool.add(gevent.spawn(crawl_one, link))
        # print('%s - current visited: %s' % (threading.currentThread(), visited_links))

    pool.join()
    # print('%s - visited links: %s' % (threading.currentThread(), visited_links))
    # print('%s - pending links: %s' % (threading.currentThread(), pending_links))
    print('Done. %s links visited.' % len(visited_links))
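# crawl() above spawns a crawl_one() worker that the excerpt does not show.
# A minimal sketch of what such a worker might look like, assuming the same
# module-level sets (visited_links, crawlin_links, pending_links) and the
# `requests` library for fetching; the body is an illustration, not the
# original implementation.
import re
import requests

def crawl_one(link):
    try:
        response = requests.get(link, timeout=10)
        # Naive href extraction; a real crawler would use an HTML parser.
        for found in re.findall(r'href="(https?://[^"]+)"', response.text):
            if found not in visited_links and found not in crawlin_links:
                pending_links.append(found)
    finally:
        # Mirror the bookkeeping that crawl() relies on.
        crawlin_links.discard(link)
        visited_links.add(link)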
def wait_available(pool, pool_name):
    statsd = stats.get_statsd_client()
    if pool.full():
        statsd.incr('%s.pool.full' % pool_name)
    pool.wait_available()
def schedule():
    while True:
        # Block until the pool has a free slot so the loop cannot
        # outrun the pool's concurrency limit.
        pool.wait_available()
        print('Starting greenlet')
        pool.apply_async(main)
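# schedule() relies on a module-level `pool` and a `main` job function that
# the excerpt does not show. A minimal sketch of that surrounding setup,
# with the pool size and the body of main() as assumptions:
import gevent
import gevent.pool

pool = gevent.pool.Pool(10)  # bounds concurrency at 10 greenlets

def main():
    gevent.sleep(1)  # placeholder for the real unit of work

# With this in place, schedule() blocks in wait_available() whenever all
# ten slots are busy, so the infinite loop never over-spawns.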
def _run(ToolImplementation, tool_reads_stdin):
    '''
        tool_reads_stdin - tool uses stdin for purposes other than
                           obtaining startup arguments
    '''
    args = sys.argv[:]
    action_name = args[0]
    #args = list(args)[1:]
    _E_INVALID_ARGUMENT_STDIN = \
        '-: Invalid argument, target hosts may not be specified on stdin.'
    _usage = __usage__.replace('{{tool}}', action_name)
    if hasattr(ToolImplementation, '__itemname__'):
        itemname = ToolImplementation.__itemname__
        _usage = _usage.replace('item', itemname)
    parser = XNetOptionParser(ToolImplementation.cmdline_options, usage=_usage)
    (options, args) = parser.parse_args(args)
    SignalHandler.setup(options)
    cmdlineinputs = args[1:]
    #vprint = VerbosityPrinter(options)
    vprint_stderr = VerbosityPrinter(options, sys.stderr)
    #
    # Chain input sources, handle options.
    # Precedence order: cmdline, -r, stdin.
    #
    # If --nr-processes=n and n > 1, then let this process
    # produce for a number of child processes.
    #
    _inputs = []
    if '-' in cmdlineinputs:
        if tool_reads_stdin:
            raise Exception(_E_INVALID_ARGUMENT_STDIN)
        cmdlineinputs.remove('-')
        _inputs.append(sys.stdin)
    if options.read:
        f = open(options.read)
        _inputs.insert(0, f)
    if len(cmdlineinputs):
        _inputs.insert(0, cmdlineinputs)
    inputchain = itertools.chain(*_inputs)
    _nr_processes = 1
    if options.nr_processes:
        _nr_processes = int(options.nr_processes)
    if options.pdb:
        import xnet.debug
        xnet.debug.interactive_debugger_on_exception(True)
    if options.print_source:
        ToolImplementation.print_source()
        return
    _nr_microthreads = 256
    if options.nr_microthreads is not None:
        _nr_microthreads = int(options.nr_microthreads)
    _wait = None
    if options.wait is not None:
        _wait = float(options.wait)
    _interval = 0.0
    if options.interval is not None:
        _interval = float(options.interval)
    _repeat = 1
    if options.repeat is not None:
        _repeat = int(options.repeat)
    if options.format_help:
        print(ToolImplementation.__format_help__())
        sys.exit(0)
    if options.split_tee:
        options.split_output = options.split_tee
    _outfile = sys.stdout
    #
    # Handle SSH-distributed execution.
    # If SSH-dist., main process returns on do_ssh() here.
    #
    if options.ssh_nodes_file:
        #
        # __massage__() expands wildcard IP-ranges such as 10.0.0.*.
        # This should be done in the parent in order to split the expanded
        # set of IPs across its children.
        #
        inputchain = ToolImplementation.__massage__(inputchain, options)
        if _repeat != 1:
            inputchain = repeaterator(inputchain, _repeat)
        return do_ssh(options, ToolImplementation, args, inputchain)
    #
    # Handle multiproc.
    # If multiproc, main process returns on do_fork() here.
    #
    if _nr_processes == 1:
        pass
    elif _nr_processes < 1:
        errmsg = 'invalid number of processes: {0}'.format(_nr_processes)
        sys.stderr.write(errmsg)
        sys.exit(1)
    else:
        #
        # __massage__() expands wildcard IP-ranges such as 10.0.0.*.
        # This should be done in the parent in order to split the expanded
        # set of IPs across its children.
        #
        inputchain = ToolImplementation.__massage__(inputchain, options)
        if _repeat != 1:
            inputchain = repeaterator(inputchain, _repeat)
        return do_fork(options, ToolImplementation, tool_reads_stdin, args,
                       _nr_processes, _nr_microthreads, inputchain)
    #
    # Verify command-line options and do preparations.
    #
    ToolImplementation.__setup__(options)
    #
    # Errors if pool is too small.
    #
    pool = gevent.pool.Pool(_nr_microthreads)
    #pool = gevent.pool.Pool(options.nr_microthreads or DEFAULT_NR_MICROTHREADS)
    greenlets = []
    wkpool = None
    if _wait:
        wkpool = gevent.pool.Pool(_nr_microthreads)

    def waitkill(g, killtime):
        from xnet.tools import WaitTimeout
        sleeptime = killtime - time.time()
        if sleeptime > 0:
            gevent.sleep(sleeptime)
        if not g.ready():
            vprint_stderr(3, '[*] xx - kill {0}:{1}\n'.format(
                os.getpid(), g.action._greenlet_id
            ))
            gevent.kill(g, WaitTimeout)
            tstart = time.time()
            g.join()
            tdiff = time.time() - tstart
            vprint_stderr(3, '[*] yy - joined {0}:{1} ({2:.1f}s)\n'.format(
                os.getpid(), g.action._greenlet_id, tdiff
            ))
        assert g.ready()

    inputchain = ToolImplementation.__massage__(inputchain, options)
    kwargs = {}
    if tool_reads_stdin:
        gevent.spawn(stdin_disperser_greenlet, pool)
    _t = 0.0
    killtime = 0
    if _repeat != 1:
        inputchain = repeaterator(inputchain, _repeat)
    greenlet_id = 0
    for (i, line) in enumerate(inputchain):
        action = ToolImplementation(options, greenlet_id=greenlet_id, **kwargs)
        greenlet_id += 1
        pool.wait_available()
        if _wait:
            killtime = time.time() + _wait
        vprint_stderr(2, '[*] ++ spawning greenlet {0}:{1}\n'.format(
            os.getpid(), greenlet_id
        ))
        g = pool.spawn(action, line, inputchain)
        g.action = action
        greenlets.append(g)
        #
        # Timeout seems unreliable on Debian squeeze, use waitkill instead.
        #
        if _wait:
            wkpool.wait_available()
            wkpool.spawn(waitkill, g, killtime)
        #
        # Handle finished actions.
        #
        vprint_stderr(3, '[*] ii len(greenlets) = {0}'.format(len(greenlets)))
        vprint_stderr(3, ', not_ready={0}\n'.format(len(
            [g for g in greenlets if not g.ready()]
        )))
        #while len(greenlets) and greenlets[0].ready():
        #    action = greenlets[0].action
        #    vprint_stderr(2, '[*] -- collecting greenlet: {0}:{1} (running: {2})\n'.format(
        #        os.getpid(), action._greenlet_id, len(greenlets)
        #    ))
        #    output_action_result(action, options, _outfile)
        #    del action
        #    del greenlets[0]
        del_indexes = []
        for (g_index, g) in enumerate(greenlets[:]):
            if not g.ready():
                continue
            vprint_stderr(2, '[*] -- collecting greenlet: {0}:{1} (running: {2})\n'.format(
                os.getpid(), g.action._greenlet_id, len(greenlets)
            ))
            output_action_result(g.action, options, _outfile)
            del_indexes.append(g_index)
        while len(del_indexes):
            di = del_indexes.pop()
            del greenlets[di]
        #
        # Handle interval.
        #
        _this_interval = _interval - (time.time() - _t)
        if _this_interval > 0:
            gevent.sleep(_this_interval)
        _t = time.time()
    if wkpool:
        wkpool.join()
    gevent.joinall(greenlets)  # timeout=timeout
    not_done = [g for g in greenlets if not g.ready()]
    if len(not_done) > 0 and _wait is not None:
        vprint_stderr(0, 'ERROR: not_done has contents in spite of _wait and grace time\n')
        vprint_stderr(0, not_done)
    #
    # Force-kill greenlets that didn't die in spite of 1 sec of grace time.
    #
    gevent.killall(not_done, block=True)
    gevent.joinall(not_done)
    #
    # Cleanups.
    #
    ToolImplementation.__teardown__(options)
    #
    # Print results.
    #
    for g in greenlets:
        action = g.action
        output_action_result(action, options, _outfile)
        del action
def traverse_delete(up, start_path, datelist, authstr):
    """
    Walk the given bucket starting from start_path and delete every file
    or empty directory underneath it.
    """
    global job_files, jobs, deleted_bytes, deleted_files, pool, logger
    # First list the subdirectories and files of the current directory.
    children = getlist(up, start_path)
    if children is None:
        return
    if len(children) == 0:
        # The directory is empty. If it is not the root, delete the
        # directory itself.
        if start_path != u'/':
            pool.spawn(up_delete, up, start_path)
        return
    files = []
    for f in children:
        try:
            if f['type'] == u'N':
                files.append(f)
        except KeyError:
            logger.error('KeyError: no type in f %s' % str(f))
            sys.exit(-1)
    if len(files) != 0:
        # The directory contains files; delete them in batch.
        if start_path == u'/':
            jobs.extend(
                [pool.spawn(up_delete, up, '/%s' % f['name']) for f in files])
        else:
            jobs.extend([
                pool.spawn(async_delete_file, up, authstr, start_path, f)
                for f in files
            ])
        job_files += len(files)
        try:
            deleted_bytes += sum([int(f['size']) for f in files])
        except ValueError:
            # {'time': u'1491871202', 'type': u'N', 'name': u'1491871201.853000.jpg', 'size': u'undefined'}
            traceback.print_exc()
            print(f)
        deleted_files += len(files)
        if job_files >= 5000:
            # More than 5000 queued jobs: wait for the pool, then reset
            # the counters.
            logger.warning(u'deleted %d MB, usage %d MB' %
                           (deleted_bytes / 1024.0 / 1024,
                            int(up.usage()) / 1024.0 / 1024))
            pool.wait_available()
            job_files = 0
            jobs = []
    # Recurse into subdirectories.
    # Note: at the root we only descend into subdirectories whose names
    # appear in datelist, the list of dates to delete.
    if start_path == u'/':
        folders = [
            f for f in children
            if f['type'] == u'F' and f['name'] in datelist
        ]
    else:
        folders = [f for f in children if f['type'] == u'F']
    for folder in folders:
        if start_path == u'/':
            traverse_delete(up, '/%s' % folder['name'], datelist, authstr)
        else:
            traverse_delete(up, '%s/%s' % (start_path, folder['name']),
                            datelist, authstr)
    # Any other node types?
    others = [f for f in children if f['type'] not in (u'F', u'N')]
    if len(others) != 0:
        logger.error('others: %s' % str(others))
        sys.exit(0)
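# traverse_delete() above calls an up_delete() helper that the excerpt does
# not show. A minimal sketch under the assumption that `up` is an
# upyun.UpYun client whose delete() raises on failure; this is illustrative,
# not the original implementation.
def up_delete(up, path):
    try:
        up.delete(path)
        logger.info('deleted %s' % path)
    except Exception:
        logger.exception('failed to delete %s' % path)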
def wait_available(pool, pool_name):
    statsd = stats.get_statsd_client()
    if pool.full():
        statsd.incr('%s.pool.full' % pool_name)
    pool.wait_available()
    return not STATE['shutdown']
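# One way the shutdown-aware wrapper above might be used by a spawning
# loop; `dispatch`, `handle`, and `tasks` are illustrative names, not from
# the original code.
def dispatch(pool, tasks):
    for task in tasks:
        if not wait_available(pool, 'dispatch'):
            break  # STATE['shutdown'] was set; stop scheduling new work
        pool.spawn(handle, task)
    pool.join()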