def _run_spider(project_id, job_spec, tail):
    """Connect to Scrapinghub and run the job."""
    spider = job_spec.get('spider_name')
    info(f'running {spider} on scrapinghub: {project_id}: '
         f'args={job_spec.get("args")} tags={job_spec.get("tags")}')

    from kp_scrapers.lib.services.shub import shub_conn
    job = (shub_conn().get_project(project_id).jobs.run(
        spider,
        add_tag=job_spec.get('tags'),
        job_args=job_spec.get('args'),
        priority=job_spec.get('priority'),
    ))

    url = f'https://app.scrapinghub.com/p/{job.key}'
    success(f'{spider} is scraping at {url}')

    if tail:
        info("starting to tail logs...")
        # poll until the job finishes, then dump its logs
        while job.metadata.get("state") != "finished":
            time.sleep(5)
        for line in job.logs.list():
            info("[ {} ] {}".format(job_spec["spider"], line))
        sys.exit(0 if job.metadata.get("close_reason") == "finished" else 1)
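# Illustrative `job_spec` accepted by `_run_spider` (a sketch: the keys mirror what
# the function reads above, the values are made up). Note the tailing branch also
# reads `job_spec["spider"]` for its log prefix.
#
#     {
#         'spider_name': 'ExampleSpider',
#         'args': {'fleet': 'lng'},
#         'tags': ['manual-run'],
#         'priority': 2,
#     }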
def search(cache, imo):
    ui.info(f"loading static fleet (caching: {cache})")
    fleet = vessels(disable_cache=not cache)

    ui.info(f"searching for vessel with imo: {imo}")
    vessel = fleet.get(imo)
    if vessel:
        ui.success("on Kpler radar")
        pp(vessel, indent=4)
def display(data_type, cache):
    """Display who produces what."""
    info(f'looking for data sources exposing `{data_type}` information...')
    for crawler in lookup_providers(data_type, cache):
        success(
            f"Provider {crawler['provider']} (powered by spider={crawler['name']})"
        )
def schedule(**kwargs):
    """Register a single periodic job."""
    # parse `key=value` CLI arguments into a dict of spider arguments
    spider_args = dict(arg.split("=") for arg in kwargs["args"])
    kwargs["args"] = spider_args

    job_id, err = shub.create_periodic_job(**kwargs)
    if job_id is None:
        fail(f"failed to create job: {err}")
    else:
        success("successfully created periodic job (id={})".format(job_id))
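# Illustrative parsing of the `--args` option (values made up): an invocation
# passing `args=("env=production", "max_items=100")` yields
# `{"env": "production", "max_items": "100"}` before the call to
# `shub.create_periodic_job`.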
def export(bucket, filename):
    """Export spider attributes to S3.

    This is especially useful for downstream processing where Kpler-aware
    components want to process specific data types, but the datalake only
    knows about sources.

    Usage: ./who-produces.py export --bucket kp-datalake

    """
    info("exporting spider meta ...")
    key = f'__meta/{filename}'
    upload_blob(bucket, key, spiders(use_cache=False))
    success(f"spider meta is located at `s3://{bucket}/{key}`")
def delete(project):
    """Remove project from scrapinghub org.

    It is expected to be used with default arguments so it can find the same
    project created by the `create` command. This way one can use them at the
    start and the end of a feature branch, with naming already set up.

    """
    project = project or default_project_name()

    with shub.Config() as conf:
        # crash when not found...
        project_id = conf['projects'].pop(project)
        res = shub.delete_project(project_id)
        if res.ok:
            success('project {} successfully deleted ({})'.format(project, project_id))
            # overwrite content on success
            conf.commit()
        else:
            fail('failed to delete project: {}'.format(res.text))
def doctor(project_id, jobs_path, ignore, checks, filters):
    """Run sanity checks on spiders: local scheduling, Scrapinghub jobs, Airtable docs and spider metadata."""
    info("loading local jobs path={}".format(jobs_path))
    local_jobs = list(load_jobs(jobs_path, blacklist=['settings.yml']))

    # TODO merge `sensible` table
    info("loading airtable base table={}".format('Overview'))
    records = list(kp_airtable.retrieve_all_records('Data Sourcing', 'Overview'))

    info("loading Scrapinghub periodic jobs project={}".format(project_id))
    shub_jobs = []
    disabled_jobs = []
    for job in shub.periodic_jobs(project_id).json().get('results', []):
        if not job['disabled']:
            shub_jobs.extend(job.get('spiders'))
        else:
            disabled_jobs.extend(job.get('spiders'))

    filteropts = ''
    for _filter in filters:
        filteropts += '--filter {} '.format(_filter)

    info("filtering spiders by attributes: {}".format(filters))
    for spider in run_scrapy_command('describe', '{} --silent'.format(filteropts)):
        if spider['name'] in ignore:
            info("ignoring spider {}".format(spider['name']))
            continue

        if _should_run('scheduling', checks):
            if not lookup(local_jobs, value=spider['name'], key='spider'):
                fail("spider {name} is not scheduled locally".format(**spider))

        if _should_run('scrapinghub', checks):
            if lookup(disabled_jobs, value=spider['name'], key='name'):
                info("spider {name} is disabled on Scrapinghub".format(**spider))
            elif not lookup(shub_jobs, value=spider['name'], key='name'):
                fail("spider {name} is not scheduled on Scrapinghub".format(**spider))

        if _should_run('airtable', checks) and spider.get('provider'):
            # TODO compare records['State'] with last job run
            if not lookup(records, value=spider['provider'], key='Name'):
                fail("spider {name} is not documented on Airtable".format(**spider))

        if _should_run('spider', checks):
            if not spider['version']:
                fail("spider {name} is not versioned".format(**spider))
            if not spider['provider']:
                fail("spider {name} doesn't define a data provider".format(**spider))
            if not spider['produces']:
                fail("spider {name} doesn't define the data types produced".format(**spider))

    hr('-')
    success("diagnostic done - patient in trouble")
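# A minimal sketch of the `_should_run` helper used above (an assumption, not the
# actual implementation): a check runs either when no explicit `checks` list is
# given or when its name is part of it.
#
#     def _should_run(check, checks):
#         return not checks or check in checks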
def create(project, copy_from, org):
    """Create a new project on Scrapinghub.

    One is expected to use the default arguments so we all share the same
    development workflow: create a feature branch, create a project, test on
    this project, merge and delete the environment.

    This wrapper targets the Kpler staging env by default to avoid eating
    production (paid) resources. Yet it also tries to reproduce that env as
    closely as possible, reading settings from the env and pushing them to the
    new project.

    """
    project = project or default_project_name()

    info('creating project {} on {}'.format(project, org))
    res = shub.create_project(project, org)
    # Since it is not documented, here is the `res.json()` format:
    #
    #     {
    #         'info': {'description': ''},
    #         'owners': [],
    #         'ui_settings': {},
    #         'visual_project_type': '',
    #         'settings': {},
    #         'deleted': False,
    #         'description': '',
    #         'has_eggs': False,
    #         'monitoring_enabled': False,
    #         'organization_name': 'kpler.com',
    #         'last_activity': None,
    #         'version': None,
    #         'data_retention': 0,
    #         'organization': 288,
    #         'addons': [],
    #         'default_kumo_units': None,
    #         'id': 169023,
    #         'csv_fields': None,
    #         'name': 'testing-tmp3'
    #     }
    if not res.ok:
        fail('failed to create project: {}'.format(res.text))
        sys.exit(1)

    feedback = res.json()
    success('project {} successfully created ({})'.format(project, feedback['id']))

    with shub.Config() as conf:
        info('updating config: {}'.format(conf.path))
        conf['projects'][project] = feedback['id']
        conf.commit()

    if copy_from:
        if copy_from.endswith('yml'):
            with open(copy_from, 'r') as fd:
                project_settings = {}
                for k, v in yaml.load(fd).items():
                    if isinstance(v, dict):
                        # the only case is when using `secret: 'vault.decrypt("....")'`
                        project_settings[k] = eval(v['secret'])
                    else:
                        project_settings[k] = v
        else:
            copy_from = shub.to_project_id(None, None, copy_from)
            info("populating project settings from project {}".format(copy_from))
            # TODO could be safer
            project_settings = shub.project_settings(copy_from).json().get('settings')
    else:
        info("populating project settings from env")
        project_settings = {}
        for k, v in PROJECT_SETTINGS.items():
            from_env = os.getenv(k)
            if from_env is None and v is None:
                info('no value defined for setting `{}`'.format(k))
            elif from_env:
                info('overwriting setting with env value: {}={}'.format(k, from_env))
                project_settings[k] = from_env

    res = shub.update_project_settings(feedback['id'], project_settings)
    if res.ok:
        success('successfully updated project settings')
    else:
        fail('failed to update project settings: {}'.format(res.text))
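# Illustrative `--copy-from` YAML layout (a sketch; the setting names are made up,
# only the `secret: 'vault.decrypt(...)'` convention comes from the code above):
#
#     HTTP_PROXY: 'http://proxy.example.com:3128'
#     DATABASE_URL:
#       secret: 'vault.decrypt("ciphertext...")'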
def export(
    spider, transform, output, markers, deduplicate_on, filter_file, session_id, retry, **opts
):
    """Export Scrapinghub items."""
    if retry:
        # TODO append to session file?
        info("resuming session `{}`".format(retry))
        session_id = retry
        opts['skip_tags'].append(session_id)
    else:
        info('starting session `{}`'.format(session_id))

    transform = TRANSFORMERS.get(transform) or noop
    markers = list(markers + DEFAULT_MARKERS + (session_id,))
    raw_data = []
    unique_keys = []
    fails = 0
    constraint_keys = []

    if filter_file:
        info("loading constraints file {}".format(filter_file))
        with open(filter_file) as csvfile:
            reader = csv.DictReader(csvfile)
            constraint_keys = [row[deduplicate_on] for row in reader]

    opts['spider'] = spider
    for job in filter(has_scraped_data, fetch_jobs(**opts)):
        try:
            # tag it to remember we processed it (ETL style)
            shub.update_tags(job, add=markers)
            info("processing job {}".format(job.key))
            for item in job.items.iter():
                if deduplicate_on:
                    if item[deduplicate_on] not in unique_keys:
                        if constraint_keys:
                            if item[deduplicate_on] in constraint_keys:
                                # individual items can be nested and store multiple types
                                # so `transform` is a generic generator we consume here
                                raw_data.extend([partial for partial in transform(item)])
                                unique_keys.append(item[deduplicate_on])
                        else:
                            raw_data.extend([partial for partial in transform(item)])
                            unique_keys.append(item[deduplicate_on])
                else:
                    raw_data.extend([partial for partial in transform(item)])
        except Exception as e:
            fail('fetching jobs crashed: {}, going on'.format(e))
            # dumping the data we got so far
            # one might be able to resume execution
            # NOTE could dump data so far in output = '/tmp/' + session_id + '.' + output
            fails += 1

    success("done ({} exceptions), exporting data".format(fails))
    if raw_data:
        output_format = output.split('.')[-1]
        success('exporting data raw={} to {}'.format(len(raw_data), output))
        # TODO support sqlite
        # TODO support stdout
        # TODO support gdrive export - might be better to wait for another PR bringing
        # kp-gdrive and its high-level interface
        with open(output, 'w') as fd:
            if output_format == 'csv':
                csv_export(raw_data, fd)
            elif output_format == 'jl':
                jl_export(raw_data, fd)
    else:
        fail("no data was fetched")
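# Illustrative resume flow (a sketch; the exact CLI flags come from click options
# not shown here, and the session id is made up):
#
#     export --spider ExampleSpider --deduplicate-on id --output items.csv
#     # crashes halfway; processed jobs were already tagged with the session marker
#     export --spider ExampleSpider --deduplicate-on id --output items.csv --retry 20190612-abcdef
#
# On retry the previous session id is appended to `skip_tags`, so jobs tagged
# during the first run are filtered out of `fetch_jobs`.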
def batch_schedule(project_id, spiders, root_conf, conf_files, dry_run, reset, monitor):
    """Register several periodic jobs from a yaml conf."""
    datadog.initialize(api_key=os.getenv("DATADOG_API_KEY"),
                       app_key=os.getenv("DATADOG_APP_KEY"))

    # force yielding all the configs so we can merge spider names
    configs = list(
        walk_configs(conf_root=root_conf, paths=conf_files, blacklist=["settings.yml"]))

    spider_names = spiders or []
    if not spiders:
        for conf in configs:
            spider_names.extend([job_spec["spider"] for job_spec in conf["jobs"]])

    if reset:
        # be gentle with the api and batch delete the jobs upfront
        shub.reset_periodic_jobs(project_id, set(spider_names), dry_run)

    # TODO validate configuration (a separate command?)
    for specs in configs:
        # init with global settings that apply for all
        g_tags = GLOBAL_JOB_TAGS + specs.get("global_tags", [])
        g_settings = specs.get('global_settings', {})
        d_crons = specs.get("default_crons", [])

        for job_spec in specs["jobs"]:
            # one can limit spiders from the command line
            if job_spec["spider"] not in spider_names:
                info('spider "{}" not required to schedule'.format(job_spec["spider"]))
                continue

            if job_spec.get("disabled"):
                info('spider "{}" disabled by config'.format(job_spec["spider"]))
                continue

            if not (job_spec.get("crons") or d_crons):
                fail('spider "{}" contains no cron'.format(job_spec["spider"]))
                continue

            # overwrite spider-level settings on scrapinghub
            if job_spec.get("settings") or g_settings:
                # use default settings if there exists any
                job_spec["settings"] = {**job_spec.get("settings", {}), **g_settings}
                for key, value in job_spec["settings"].items():
                    if isinstance(value, dict):
                        # the only case is when using `secret: 'vault.decrypt("....")'`
                        job_spec["settings"][key] = eval(value['secret'])

                spider_id = shub.to_spider_id(project_id, job_spec["spider"])
                res = shub.update_spider_settings(project_id, spider_id, job_spec["settings"])
                if not res.ok:
                    fail(f'failed to update settings for spider "{job_spec["spider"]}"')
                    # skip since we don't want scheduled jobs to fail due to incorrect settings
                    continue

            # fill defaults
            # NOTE probably better done merging a hierarchy of dicts
            job_spec["priority"] = job_spec.get("priority", DEFAULT_PRIORITY)
            job_spec["tags"] = g_tags + job_spec.get("tags", [])
            job_spec["crons"] = job_spec.get("crons") or d_crons

            for combination in render_args(job_spec.get("dynamic_args", {})):
                # add static arguments for every combination generated
                combination.update(job_spec.get("args", {}))
                for cron_tpl in job_spec["crons"]:
                    cron = generate_cron(cron_tpl)
                    info(f"creating job on project {project_id}: {cron}")
                    job_id, err = shub.create_periodic_job(
                        job_spec["spider"],
                        project_id,
                        cron,
                        combination,
                        job_spec["tags"],
                        job_spec.get("description", DEFAULT_DESCRIPTION),
                        job_spec["priority"],
                        dry_run,
                    )
                    if job_id or dry_run:
                        success(f"done: {job_id}")
                    else:
                        fail(f"failed to schedule job: {err}")

            if job_spec.get("monitoring") and monitor:
                job_spec["monitoring"]["tags"] = job_spec["monitoring"].get("tags", [])
                if "creator:bot" not in job_spec["monitoring"]["tags"]:
                    job_spec["monitoring"]["tags"].append("creator:bot")

                # TODO and only if the command above worked fine
                # Create a new monitor (don't care about the dynamic nature of arguments)
                try:
                    info("creating new datadog monitor: {}".format(job_spec["monitoring"]))
                    if not dry_run:
                        feedback = datadog.api.Monitor.create(
                            type="metric alert", **job_spec["monitoring"])
                        if feedback.get("errors"):
                            fail("failed to create monitor: {}".format(feedback["errors"]))
                        else:
                            success("successfully created alert on {}".format(feedback["created"]))
                except ValueError as e:
                    # usually error 403 forbidden that returns an HTML page instead of json
                    fail("failed to create monitor: {}".format(e))