def __init__(self): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument('pid', help='Taget project id') parser.add_argument('spider', help='Spider name') parser.add_argument('frontier', help='Frontier name') parser.add_argument('prefix', help='Slot prefix') parser.add_argument('--max-jobs', help='Max number of jobs for the given spider allowed to run in parallel.\ Default is %(default)s.', type=int, default=1) parser.add_argument('--apikey', help='API key to use for HCF access. Uses SH_APIKEY environment variable if not given') parser.add_argument('--spider-args', help='Spider arguments dict in json format', default='{}') parser.add_argument('--loop-mode', help='If provided, manager will run in loop mode, with a cycle each given\ number of seconds.', type=int, metavar='SECONDS') self.args = parser.parse_args() client = ScrapinghubClient(self.args.apikey) self.project = client.get_project(self.args.pid) self.hcfpal = HCFPal(client._hsclient.get_project(self.args.pid))
class HCFSpiderManager(object): def __init__(self): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument('pid', help='Taget project id') parser.add_argument('spider', help='Spider name') parser.add_argument('frontier', help='Frontier name') parser.add_argument('prefix', help='Slot prefix') parser.add_argument('--max-jobs', help='Max number of jobs for the given spider allowed to run in parallel.\ Default is %(default)s.', type=int, default=1) parser.add_argument('--apikey', help='API key to use for HCF access. Uses SH_APIKEY environment variable if not given') parser.add_argument('--spider-args', help='Spider arguments dict in json format', default='{}') parser.add_argument('--loop-mode', help='If provided, manager will run in loop mode, with a cycle each given\ number of seconds.', type=int, metavar='SECONDS') self.args = parser.parse_args() client = ScrapinghubClient(self.args.apikey) self.project = client.get_project(self.args.pid) self.hcfpal = HCFPal(client._hsclient.get_project(self.args.pid)) def run(self): if self.args.loop_mode: while True: if not self.loop(): break time.sleep(self.args.loop_mode) else: self.loop() def loop(self): slot_re = re.compile(rf'{self.args.prefix}\d+') available_slots = [slot for slot in self.hcfpal.get_slots(self.args.frontier) if slot_re.match(slot)] running_jobs = 0 for job in self.project.jobs.list(spider=self.args.spider, state='running', meta='spider_args'): frontera_settings_json = json.loads(job['spider_args'].get('frontera_settings_json', '{}')) if 'HCF_CONSUMER_SLOT' in frontera_settings_json: available_slots.remove(frontera_settings_json['HCF_CONSUMER_SLOT']) running_jobs += 1 available_slots = [slot for slot in available_slots if self.hcfpal.get_slot_count(self.args.frontier, slot) > 0] logger.info(f"Available slots: {available_slots!r}") if available_slots: random.shuffle(available_slots) if self.args.max_jobs: max_jobs = self.args.max_jobs - running_jobs available_slots = available_slots[:max_jobs] if not available_slots: logger.info(f"Already running max number of jobs.") for slot in available_slots: frontera_settings_json = json.dumps({ 'HCF_CONSUMER_SLOT': slot }) spider_args = json.loads(self.args.spider_args) spider_args.update({'frontera_settings_json': frontera_settings_json}) job = self.project.jobs.run(self.args.spider, job_args=spider_args) logger.info(f"Scheduled job {job.key} with frontera settings {frontera_settings_json}") return True return bool(running_jobs)
def __init__(self): super().__init__() self.hcfpal = HCFPal(self.client._hsclient.get_project( self.project_id))
class HCFCrawlManager(CrawlManager): default_max_jobs = 1 def __init__(self): super().__init__() self.hcfpal = HCFPal(self.client._hsclient.get_project( self.project_id)) def add_argparser_options(self): super().add_argparser_options() self.argparser.add_argument("frontier", help="Frontier name") self.argparser.add_argument("prefix", help="Slot prefix") self.argparser.add_argument("--frontera-settings-json") @property def description(self): return __doc__ def print_frontier_status(self): result = self.hcfpal.get_slots_count(self.args.frontier, self.args.prefix) logger.info(f"Frontier '{self.args.frontier}' status:") for slot in sorted(result["slots"].keys()): cnt_text = "\t{}: {}".format(slot, result["slots"][slot]) logger.info(cnt_text) logger.info("\tTotal count: {}".format( humanize.intcomma(result["total"]))) return set(result["slots"].keys()) def workflow_loop(self): available_slots = self.print_frontier_status() running_jobs = 0 states = "running", "pending" for state in states: for job in self.get_project().jobs.list(spider=self.args.spider, state=state, meta="spider_args"): frontera_settings_json = json.loads(job["spider_args"].get( "frontera_settings_json", "{}")) if "HCF_CONSUMER_SLOT" in frontera_settings_json: slot = frontera_settings_json["HCF_CONSUMER_SLOT"] if slot in available_slots: available_slots.discard(slot) running_jobs += 1 available_slots = [ slot for slot in available_slots if self.hcfpal.get_slot_count(self.args.frontier, slot) > 0 ] logger.info(f"Available slots: {available_slots!r}") if available_slots: random.shuffle(available_slots) if self.max_running_jobs: max_jobs = self.max_running_jobs - running_jobs available_slots = available_slots[:max_jobs] if not available_slots: logger.info("Already running max number of jobs.") base_frontera_settings = {} if self.args.frontera_settings_json: base_frontera_settings = json.loads( self.args.frontera_settings_json) for slot in available_slots: frontera_settings = base_frontera_settings.copy() frontera_settings.update({ "HCF_CONSUMER_SLOT": slot, "HCF_CONSUMER_FRONTIER": self.args.frontier, }) frontera_settings_json = json.dumps(frontera_settings) logger.info( f"Will schedule spider job with frontera settings {frontera_settings_json}" ) jobkey = self.schedule_spider( spider_args_override={ "frontera_settings_json": frontera_settings_json }) logger.info( f"Scheduled job {jobkey} with frontera settings {frontera_settings_json}" ) return True return bool(running_jobs)
def __init__(self): parser = argparse.ArgumentParser( description='Helper script for accessing HubCrawlFrontier.') parser.add_argument( '--apikey', help= 'API key to use for HCF access. Uses SH_APIKEY environment variable if not given' ) subparsers = parser.add_subparsers(dest='cmd') parser_list = subparsers.add_parser( 'list', help='List project frontiers or slots in a frontier') parser_list.add_argument('pid', type=int, help='Project ID') parser_list.add_argument('frontier', nargs='?', help='Define frontier to list it\'s slots') parser_list.add_argument('--all', action='store_true', help='List all frontiers and their slots') parser_count = subparsers.add_parser( 'count', help='Count requests in frontier slots') parser_count.add_argument('pid', type=int, help='Project ID') parser_count.add_argument('frontier', help='Frontier for which to count') parser_count.add_argument('--prefix', help='Count only slots with a given prefix', default='') parser_count.add_argument( '--regex', help='Count only slots that matches given regex', default='') parser_count.add_argument( '--num-slots', type=int, help="Specify number of slots instead of autodetect \ (much faster in most cases)" ) parser_delete = subparsers.add_parser( 'delete', help='Delete slots from frontier') parser_delete.add_argument('pid', type=int, help='Project ID') parser_delete.add_argument('frontier', help='Frontier to delete slots from') parser_delete.add_argument( '--prefix', help='Delete only slots with a given prefix') parser_dump = subparsers.add_parser( 'dump', help='Dump next requests in queue of a frontier slot') parser_dump.add_argument('pid', type=int, help='Project ID') parser_dump.add_argument('frontier', help='Frontier name from where to dump') parser_dump.add_argument('slot', help='Slot from where to dump') parser_dump.add_argument( '--num-requests', help='Number of requests to dump. Defaults to %(default)d.', type=int, default=100) parser_move = subparsers.add_parser( 'move', help='Move requests from slots of given prefix, into the given \ number of slots on another prefix.' ) parser_move.add_argument('pid', type=int, help='Project ID') parser_move.add_argument('frontier', help='Frontier name') parser_move.add_argument('prefix', help='Prefix name of the source slots') parser_move.add_argument('dest_prefix', help='Prefix name of the destination slots') parser_move.add_argument('dest_num_slots', help='Number of destination slots', type=int) parser_move.add_argument( '--num-slots', type=int, help='If given, source slots are computed using given prefix \ and this number instead of list api (sometimes list \ api works very slow)' ) parser_move.add_argument( '--uniform', action='store_true', help= 'Distribute requests uniformly among slots. By default uses standard \ assignation mapping.') parser_move_batch = subparsers.add_parser( 'move_batch', help='Move requests from given batch id into a new slot.') parser_move_batch.add_argument('pid', type=int, help='Project ID') parser_move_batch.add_argument('frontier', help='Frontier name') parser_move_batch.add_argument( 'source_slot', help='Source slot where to find the batch id') parser_move_batch.add_argument('batchid', help='Id of the target batch') parser_move_batch.add_argument('dest_slot', help='Destination slot') parser_move_batch.add_argument( '--max-scan-batches', default=100, type=int, help= 'Max number of batches to scan in order to find target batch id in the \ source slot') self.args = parser.parse_args() client = ScrapinghubClient(self.args.apikey) hsc = client._hsclient self.hsp = hsc.get_project(self.args.pid) self.hcf = HCFPal(self.hsp)
class HCFPalScript(object): def __init__(self): parser = argparse.ArgumentParser( description='Helper script for accessing HubCrawlFrontier.') parser.add_argument( '--apikey', help= 'API key to use for HCF access. Uses SH_APIKEY environment variable if not given' ) subparsers = parser.add_subparsers(dest='cmd') parser_list = subparsers.add_parser( 'list', help='List project frontiers or slots in a frontier') parser_list.add_argument('pid', type=int, help='Project ID') parser_list.add_argument('frontier', nargs='?', help='Define frontier to list it\'s slots') parser_list.add_argument('--all', action='store_true', help='List all frontiers and their slots') parser_count = subparsers.add_parser( 'count', help='Count requests in frontier slots') parser_count.add_argument('pid', type=int, help='Project ID') parser_count.add_argument('frontier', help='Frontier for which to count') parser_count.add_argument('--prefix', help='Count only slots with a given prefix', default='') parser_count.add_argument( '--regex', help='Count only slots that matches given regex', default='') parser_count.add_argument( '--num-slots', type=int, help="Specify number of slots instead of autodetect \ (much faster in most cases)" ) parser_delete = subparsers.add_parser( 'delete', help='Delete slots from frontier') parser_delete.add_argument('pid', type=int, help='Project ID') parser_delete.add_argument('frontier', help='Frontier to delete slots from') parser_delete.add_argument( '--prefix', help='Delete only slots with a given prefix') parser_dump = subparsers.add_parser( 'dump', help='Dump next requests in queue of a frontier slot') parser_dump.add_argument('pid', type=int, help='Project ID') parser_dump.add_argument('frontier', help='Frontier name from where to dump') parser_dump.add_argument('slot', help='Slot from where to dump') parser_dump.add_argument( '--num-requests', help='Number of requests to dump. Defaults to %(default)d.', type=int, default=100) parser_move = subparsers.add_parser( 'move', help='Move requests from slots of given prefix, into the given \ number of slots on another prefix.' ) parser_move.add_argument('pid', type=int, help='Project ID') parser_move.add_argument('frontier', help='Frontier name') parser_move.add_argument('prefix', help='Prefix name of the source slots') parser_move.add_argument('dest_prefix', help='Prefix name of the destination slots') parser_move.add_argument('dest_num_slots', help='Number of destination slots', type=int) parser_move.add_argument( '--num-slots', type=int, help='If given, source slots are computed using given prefix \ and this number instead of list api (sometimes list \ api works very slow)' ) parser_move.add_argument( '--uniform', action='store_true', help= 'Distribute requests uniformly among slots. By default uses standard \ assignation mapping.') parser_move_batch = subparsers.add_parser( 'move_batch', help='Move requests from given batch id into a new slot.') parser_move_batch.add_argument('pid', type=int, help='Project ID') parser_move_batch.add_argument('frontier', help='Frontier name') parser_move_batch.add_argument( 'source_slot', help='Source slot where to find the batch id') parser_move_batch.add_argument('batchid', help='Id of the target batch') parser_move_batch.add_argument('dest_slot', help='Destination slot') parser_move_batch.add_argument( '--max-scan-batches', default=100, type=int, help= 'Max number of batches to scan in order to find target batch id in the \ source slot') self.args = parser.parse_args() client = ScrapinghubClient(self.args.apikey) hsc = client._hsclient self.hsp = hsc.get_project(self.args.pid) self.hcf = HCFPal(self.hsp) def run(self): if self.args.cmd == 'list': self.list_hcf() elif self.args.cmd == 'count': self.count_slots() elif self.args.cmd == 'delete': self.delete_slots() elif self.args.cmd == 'dump': self.dump_slot() elif self.args.cmd == 'move': self.move_slots() elif self.args.cmd == 'move_batch': self.move_batch() def delete_slots(self): prefix_note = ' (with prefix "{}")'.format( self.args.prefix) if self.args.prefix else '' print('Deleting slots{} from frontier "{}", project {}...'.format( prefix_note, self.args.frontier, self.args.pid)) slots = [ slot for slot in self.hcf.get_slots(self.args.frontier) if slot.startswith(self.args.prefix) ] self.hcf.delete_slots(self.args.frontier, slots) print('Slots deleted: {}'.format(slots)) def list_hcf(self): if self.args.all: print( 'Listing all frontiers and their slots in project {}:'.format( self.args.pid)) print(self.hcf.list_all(prettyprint=True)) elif self.args.frontier: print('Listing slots for frontier "{}" in project {}:'.format( self.args.frontier, self.args.pid)) for slot in self.hcf.get_slots(self.args.frontier): print('\t{}'.format(slot)) else: print('Listing frontiers in project {}:'.format(self.args.pid)) for front in self.hcf.get_frontiers(): print('\t{}'.format(front)) def count_slots(self): note = '' if self.args.prefix: note = ' (with prefix "{}")'.format(self.args.prefix) elif self.args.regex: note = ' (with regex "{}")'.format(self.args.regex) print('Counting requests in slots{} for frontier "{}", project {}:'. format(note, self.args.frontier, self.args.pid)) total = 0 not_empty_slots = 0 slots = ['{}{}'.format(self.args.prefix, slot) for slot in range(self.args.num_slots)] if \ self.args.num_slots else self.hcf.get_slots(self.args.frontier) for slot in slots: if not slot.startswith(self.args.prefix): continue if not re.search(self.args.regex, slot): continue cnt = self.hcf.get_slot_count(self.args.frontier, slot) if cnt: not_empty_slots += 1 total += cnt cnt_text = '\t{}: {}'.format(slot, cnt) print(cnt_text) print('\t' + '-' * 25) print('\tTotal count: {}'.format(humanize.intcomma(total))) print('\tNot-empty slots: {}'.format(not_empty_slots)) def dump_slot(self): print('Dumping next {} requests from slot {}, frontier {}, pid {}:'. format(self.args.num_requests, self.args.slot, self.args.frontier, self.args.pid)) count = 0 for batch in self.hsp.frontier.read(self.args.frontier, self.args.slot, self.args.num_requests): print("Batch id:", batch['id']) for request in batch['requests']: print(request) count += 1 if count == self.args.num_requests: return def move_slots(self): print( "Moving requests from frontier {}, pid {}, prefix {} into {} slots of prefix {}" .format(self.args.frontier, self.args.pid, self.args.prefix, self.args.dest_num_slots, self.args.dest_prefix)) if self.args.num_slots: source_slots = [ self.args.prefix + str(slotno) for slotno in range(self.args.num_slots) ] else: # use list api source_slots = [ slot for slot in self.hcf.get_slots(self.args.frontier) if slot.startswith(self.args.prefix) ] cyclic_gen = cycle(range(self.args.dest_num_slots)) for slot in source_slots: print("Reading slot %s" % slot) while True: # read each one batch for batch in self.hsp.frontier.read(self.args.frontier, slot, 1): for fp, qdata in batch['requests']: if self.args.uniform: dslotno = next(cyclic_gen) else: dslotno = assign_slotno(fp, self.args.dest_num_slots) dslot = self.args.dest_prefix + str(dslotno) self.hsp.frontier.add(self.args.frontier, dslot, [{ 'fp': fp, 'qdata': qdata }]) # we don't want to generate batches bigger than source ones self.hsp.frontier.flush() count = len(batch['requests']) self.hsp.frontier.delete(self.args.frontier, slot, [batch['id']]) print("Moved batch %s (%d requests) from slot %s" % (batch['id'], count, slot)) break else: break self.hcf.delete_slots(self.args.frontier, [slot]) def move_batch(self): print( "Moving requests from frontier {}, pid {}, slot {}, batch {} to slot {}" .format(self.args.frontier, self.args.pid, self.args.source_slot, self.args.batchid, self.args.dest_slot)) for batch in self.hsp.frontier.read(self.args.frontier, self.args.source_slot, self.args.max_scan_batches): if batch['id'] == self.args.batchid: frequests = [] for fp, qdata in batch['requests']: frequests.append({'fp': fp, 'qdata': qdata}) self.hsp.frontier.add(self.args.frontier, self.args.dest_slot, frequests) self.hsp.frontier.delete(self.args.frontier, self.args.source_slot, [batch['id']]) self.hsp.frontier.flush() break
class HCFCrawlManager(CrawlManager): default_max_jobs = 1 def __init__(self): super().__init__() self.hcfpal = HCFPal(self.client._hsclient.get_project( self.project_id)) def add_argparser_options(self): super().add_argparser_options() self.argparser.add_argument('frontier', help='Frontier name') self.argparser.add_argument('prefix', help='Slot prefix') @property def description(self): return __doc__ def workflow_loop(self): slot_re = re.compile(rf"{self.args.prefix}\d+") available_slots = set( slot for slot in self.hcfpal.get_slots(self.args.frontier) if slot_re.match(slot)) running_jobs = 0 states = 'running', 'pending' for state in states: for job in self.get_project().jobs.list(spider=self.args.spider, state=state, meta='spider_args'): frontera_settings_json = json.loads(job['spider_args'].get( 'frontera_settings_json', '{}')) if 'HCF_CONSUMER_SLOT' in frontera_settings_json: slot = frontera_settings_json['HCF_CONSUMER_SLOT'] if slot_re.match(slot): available_slots.discard(slot) running_jobs += 1 available_slots = [ slot for slot in available_slots if self.hcfpal.get_slot_count(self.args.frontier, slot) > 0 ] logger.info(f"Available slots: {available_slots!r}") if available_slots: random.shuffle(available_slots) if self.max_running_jobs: max_jobs = self.max_running_jobs - running_jobs available_slots = available_slots[:max_jobs] if not available_slots: logger.info(f"Already running max number of jobs.") for slot in available_slots: frontera_settings_json = json.dumps({ 'HCF_CONSUMER_SLOT': slot, 'HCF_CONSUMER_FRONTIER': self.args.frontier, }) logger.info( f"Will schedule spider job with frontera settings {frontera_settings_json}" ) jobkey = self.schedule_spider( spider_args_override={ 'frontera_settings_json': frontera_settings_json }) logger.info( f"Scheduled job {jobkey} with frontera settings {frontera_settings_json}" ) return True return bool(running_jobs)