Esempio n. 1
0
def _run_spider(project_id, job_spec, tail):
    """Connect to Scrapinghub and run the job.

    Args:
        project_id: Scrapinghub project the job is started on.
        job_spec: mapping describing the job. The spider name is read from
            `spider_name` (falling back to `spider`); `args`, `tags` and
            `priority` are forwarded to the job when present.
        tail: when truthy, poll the job every 5 seconds, print the log lines
            not printed yet and, once the job state is `finished`, exit the
            process (status 0 if `close_reason` is `finished`, 1 otherwise).

    """
    # the name used to be read from two different keys in this function;
    # resolve it once, accepting either, and use it everywhere below
    spider = job_spec.get('spider_name') or job_spec.get('spider')

    info(f'running {spider} on scrapinghub: {project_id}: '
         f'args={job_spec.get("args")} tags={job_spec.get("tags")}')

    from kp_scrapers.lib.services.shub import shub_conn

    project = shub_conn().get_project(project_id)
    job = project.jobs.run(
        spider,
        add_tag=job_spec.get('tags'),
        job_args=job_spec.get('args'),
        priority=job_spec.get('priority'),
    )
    url = f'https://app.scrapinghub.com/p/{job.key}'
    success(f'{spider} is scraping at {url}')

    if not tail:
        return

    info("starting to tail logs...")
    printed = 0  # number of log lines already shown
    while True:
        finished = job.metadata.get("state") == "finished"
        if not finished:
            time.sleep(5)

        # the whole log is listed on every poll: only show the unseen tail,
        # otherwise every line would be printed again every 5 seconds
        lines = job.logs.list()
        for line in lines[printed:]:
            info("[ {} ] {}".format(spider, line))
        printed = len(lines)

        # leave only after one last drain so the final lines are not lost
        if finished:
            break

    sys.exit(0 if job.metadata.get("close_reason") == "finished" else 1)
Esempio n. 2
0
def search(cache, imo):
    """Look up `imo` in the static fleet and pretty-print the match, if any.

    `cache` is negated and passed as `disable_cache` when loading the fleet.
    Nothing is printed past the two progress messages when no vessel matches.

    """
    ui.info(f"loading static fleet (caching: {cache})")
    static_fleet = vessels(disable_cache=not cache)

    ui.info(f"searching for vessel with imo: {imo}")
    match = static_fleet.get(imo)
    if not match:
        return

    ui.success("on Kpler radar")
    pp(match, indent=4)
Esempio n. 3
0
def display(data_type, cache):
    """Display who produces what.

    Prints one line per entry returned by `lookup_providers(data_type, cache)`,
    showing its `provider` and the `name` of the spider behind it.

    """
    info(f'looking for data sources exposing `{data_type}` information...')

    for spider_meta in lookup_providers(data_type, cache):
        provider, spider_name = spider_meta['provider'], spider_meta['name']
        success(f"Provider {provider} (powered by spider={spider_name})")
Esempio n. 4
0
def schedule(**kwargs):
    """Register a single periodic job.

    `kwargs["args"]` is an iterable of `key=value` strings. It is converted to
    a dict, then all keyword arguments are forwarded to
    `shub.create_periodic_job`, which returns a `(job_id, err)` pair.

    Raises:
        ValueError: if one of the arguments contains no `=` at all.

    """
    # split on the first `=` only, so that values may themselves contain `=`
    # (urls with query strings, base64 payloads, ...)
    kwargs["args"] = dict(arg.split("=", 1) for arg in kwargs["args"])

    job_id, err = shub.create_periodic_job(**kwargs)
    if job_id is None:
        fail(f"failed to create job: {err}")
    else:
        success("successfully created periodic job (id={})".format(job_id))
Esempio n. 5
0
def export(bucket, filename):
    """Export spider attributes to S3.

    Downstream, Kpler-aware components want to process specific data types
    while the datalake only knows about sources; publishing the spider
    attributes lets them bridge the two.

    The blob is written under the `__meta/` prefix of `bucket`, and the spider
    attributes are always recomputed (`use_cache=False`).

    Usage:

            ./who-produces.py export --bucket kp-datalake

    """
    info("exporting spider meta ...")

    destination = '__meta/{}'.format(filename)
    spider_meta = spiders(use_cache=False)
    upload_blob(bucket, destination, spider_meta)

    success("spider meta is located at `s3://{}/{}`".format(bucket, destination))
Esempio n. 6
0
def delete(project):
    """Remove project from scrapinghub org.

    Meant to be called with default arguments so that it targets the same
    project the `create` command made: both can then bracket the life of a
    feature branch without having to come up with names.

    The local shub configuration is only committed when the remote deletion
    succeeded.

    """
    name = project or default_project_name()

    with shub.Config() as conf:
        # no fallback given to `pop`: an unknown project is expected to crash
        # here, before anything is deleted remotely
        project_id = conf['projects'].pop(name)

        res = shub.delete_project(project_id)
        if not res.ok:
            fail('failed to delete project: {}'.format(res.text))
            return

        success('project {} successfully deleted ({})'.format(name, project_id))
        # persist the removal now that the remote project is gone
        conf.commit()
0
def doctor(project_id, jobs_path, ignore, checks, filters):
    """Run consistency checks on every spider and report what is missing.

    Spiders are listed through the `describe` scrapy command (restricted by
    `filters`); those whose name is in `ignore` are skipped. For each remaining
    spider, the checks enabled by `_should_run(<check>, checks)` are:

    - `scheduling`: the spider appears in the local jobs loaded from
      `jobs_path`
    - `scrapinghub`: the spider belongs to an enabled periodic job of
      `project_id` (a disabled one is only reported as information)
    - `airtable`: the spider's provider has a record in the `Overview` table
      (only when the spider defines a provider)
    - `spider`: `version`, `provider` and `produces` are all set

    """
    info("loading local jobs path={}".format(jobs_path))
    local_jobs = list(load_jobs(jobs_path, blacklist=['settings.yml']))

    # TODO merge `sensible` table
    info("loading airtable base table={}".format('Overview'))
    records = list(
        kp_airtable.retrieve_all_records('Data Sourcing', 'Overview'))

    info("loading Scrapinghub periodic jobs project={}".format(project_id))
    shub_jobs, disabled_jobs = [], []
    periodic_jobs = shub.periodic_jobs(project_id).json().get('results', [])
    for job in periodic_jobs:
        bucket = disabled_jobs if job['disabled'] else shub_jobs
        bucket.extend(job.get('spiders'))

    # one `--filter <attr> ` option per requested filter
    filteropts = ''.join('--filter {} '.format(_filter) for _filter in filters)
    info("filtering spiders by attributes: {}".format(filters))

    # attributes every spider must define, with the matching complaint
    metadata_checks = (
        ('version', "spider {name} is not versioned"),
        ('provider', "spider {name} doesn't define a data provider"),
        ('produces', "spider {name} doesn't define the data types produced"),
    )

    described = run_scrapy_command('describe', '{} --silent'.format(filteropts))
    for spider in described:
        if spider['name'] in ignore:
            info("ignoring spider {}".format(spider['name']))
            continue

        if _should_run('scheduling', checks) and not lookup(
                local_jobs, value=spider['name'], key='spider'):
            fail("spider {name} is not scheduled locally".format(**spider))

        if _should_run('scrapinghub', checks):
            if lookup(disabled_jobs, value=spider['name'], key='name'):
                info("spider {name} is disabled on Scrapinghub".format(
                    **spider))
            elif not lookup(shub_jobs, value=spider['name'], key='name'):
                fail("spider {name} is not scheduled on Scrapinghub".format(
                    **spider))

        # TODO compare records['State'] with last job run
        if (_should_run('airtable', checks) and spider.get('provider')
                and not lookup(records, value=spider['provider'], key='Name')):
            fail("spider {name} is not documented on Airtable".format(
                **spider))

        if _should_run('spider', checks):
            for attribute, complaint in metadata_checks:
                if not spider[attribute]:
                    fail(complaint.format(**spider))

        hr('-')

    success("diagnostic done - patient in trouble")
Esempio n. 8
0
def create(project, copy_from, org):
    """Create a new project on Scrapinghub.

    One is expected to use default arguments so we can share the same
    development workflow.  Basically create a feature-branch, create a project,
    test on this project, merge and delete env.

    This wrapper uses by default the Kpler staging env to avoid eating
    production (paid) resources. Yet it also tries to reproduce as closely as
    possible this env, reading settings from env and pushing them on the new
    project.

    Args:
        project: name of the project; falls back to `default_project_name()`.
        copy_from: where to read the project settings from. A value ending
            with `yml` is opened as a local YAML file, any other truthy value
            is resolved to an existing Scrapinghub project, and a falsy value
            means "read the settings from environment variables".
        org: Scrapinghub organization the project is created in.

    Exits the process with status 1 when the project cannot be created.

    """
    project = project or default_project_name()

    info('creating project {} on {}'.format(project, org))
    res = shub.create_project(project, org)
    # Since it is not documented, here is the `res.json()` format:
    # {
    #     'info': {
    #         'description': ''
    #     },
    #     'owners': [],
    #     'ui_settings': {},
    #     'visual_project_type': '',
    #     'settings': {},
    #     'deleted': False,
    #     'description': '',
    #     'has_eggs': False,
    #     'monitoring_enabled': False,
    #     'organization_name': 'kpler.com',
    #     'last_activity': None,
    #     'version': None,
    #     'data_retention': 0,
    #     'organization': 288,
    #     'addons': [],
    #     'default_kumo_units': None,
    #     'id': 169023,
    #     'csv_fields': None,
    #     'name': 'testing-tmp3'
    # }
    if not res.ok:
        fail('failed to create project: {}'.format(res.text))
        sys.exit(1)

    feedback = res.json()
    success('project {} successfully created ({})'.format(
        project, feedback['id']))
    with shub.Config() as conf:
        info('updating config: {}'.format(conf.path))
        conf['projects'][project] = feedback['id']
        conf.commit()

    if copy_from:
        if copy_from.endswith('yml'):
            with open(copy_from, 'r') as fd:
                # `yaml.load` without an explicit Loader is deprecated (and an
                # error with PyYAML 6); plain mappings only need the safe
                # loader. An empty file parses to `None`, hence the fallback.
                raw_settings = yaml.safe_load(fd) or {}

            project_settings = {}
            for k, v in raw_settings.items():
                if isinstance(v, dict):
                    # the only case is when using `secret: 'vault.decrypt("....")'`
                    # SECURITY: this evaluates a string coming from the
                    # settings file - only ever use trusted files here.
                    project_settings[k] = eval(v['secret'])
                else:
                    project_settings[k] = v
        else:
            copy_from = shub.to_project_id(None, None, copy_from)
            info("populating project settings from project {}".format(
                copy_from))
            # TODO could be safer
            project_settings = shub.project_settings(copy_from).json().get(
                'settings')
    else:
        info("populating project settings from env")
        project_settings = {}
        # NOTE(review): the default values held by PROJECT_SETTINGS are never
        # pushed, only values found in the environment are - confirm this is
        # intended.
        for k, v in PROJECT_SETTINGS.items():
            from_env = os.getenv(k)
            if from_env is None and v is None:
                info('no value defined for setting `{}`'.format(k))
            elif from_env:
                info('overwriting setting with env value: {}={}'.format(
                    k, from_env))
                project_settings[k] = from_env

    res = shub.update_project_settings(feedback['id'], project_settings)
    if res.ok:
        success('successfully updated project settings')
    else:
        fail('failed to update project settings: {}'.format(res.text))
Esempio n. 9
0
def export(
    spider, transform, output, markers, deduplicate_on, filter_file, session_id, retry, **opts
):
    """Export Scrapinghub items.

    Args:
        spider: spider whose jobs are fetched (forwarded to `fetch_jobs`).
        transform: key into `TRANSFORMERS`; unknown keys fall back to `noop`.
        output: path of the file to write; its extension (`csv` or `jl`)
            selects the export format.
        markers: tuple of tags added to every processed job, on top of
            `DEFAULT_MARKERS` and the session id.
        deduplicate_on: item field used to keep only the first item seen for
            each value; falsy to disable deduplication. Values must be
            hashable.
        filter_file: optional CSV file with a `deduplicate_on` column; when
            deduplicating, only items whose key is listed there are kept.
        session_id: identifier of this run, used as a job tag.
        retry: id of a previous session to resume; it replaces `session_id`
            and is appended to `opts['skip_tags']`.
        **opts: extra options forwarded to `fetch_jobs`.

    """
    if retry:
        # TODO append to session file?
        info("resuming session `{}`".format(retry))
        session_id = retry
        opts['skip_tags'].append(session_id)
    else:
        info('starting session `{}`'.format(session_id))

    transform = TRANSFORMERS.get(transform) or noop
    markers = list(markers + DEFAULT_MARKERS + (session_id,))
    raw_data = []
    fails = 0

    # sets: both collections are only used for membership tests, and are
    # probed once per scraped item
    seen_keys = set()
    constraint_keys = set()
    if filter_file:
        info("loading constraints file {}".format(filter_file))
        with open(filter_file) as csvfile:
            reader = csv.DictReader(csvfile)
            constraint_keys = {row[deduplicate_on] for row in reader}

    opts['spider'] = spider
    for job in filter(has_scraped_data, fetch_jobs(**opts)):
        try:
            # tag it to remember we processed it (ETL style)
            shub.update_tags(job, add=markers)

            info("processing job {}".format(job.key))
            for item in job.items.iter():
                if deduplicate_on:
                    key = item[deduplicate_on]
                    if key in seen_keys:
                        continue
                    if constraint_keys and key not in constraint_keys:
                        continue

                # individual items can be nested and store multiple types
                # so `transform` is a generic generator we consume here
                # (fully, before touching `raw_data`, so that a crash does
                # not leave a half-transformed item behind)
                partials = list(transform(item))
                raw_data.extend(partials)
                if deduplicate_on:
                    seen_keys.add(key)
        except Exception as e:
            fail('fetching jobs crashed: {}, going on'.format(e))
            # dumping the data we got so far
            # one might be able to resume execution
            # NOTE could dump data so far in output = '/tmp/' + session_id + '.' + output
            fails += 1

    success("done ({} exceptions), exporting data".format(fails))

    if raw_data:
        output_format = output.split('.')[-1]
        success('exporting data raw={} to {}'.format(len(raw_data), output))

        # TODO support sqlite
        # TODO support stdout
        # TODO support gdrive export - might be better to wait for another PR bringing
        #      kp-gdrive and its high-level interface
        with open(output, 'w') as fd:
            if output_format == 'csv':
                csv_export(raw_data, fd)
            elif output_format == 'jl':
                jl_export(raw_data, fd)
            else:
                # nothing was written: say so instead of silently leaving an
                # empty file behind
                fail('unsupported output format: {}'.format(output_format))
    else:
        fail("no data was fetched")
Esempio n. 10
0
def batch_schedule(project_id, spiders, root_conf, conf_files, dry_run, reset,
                   monitor):
    """Register several periodic jobs from a yaml conf.

    Configurations are collected with `walk_configs` from `root_conf` /
    `conf_files`. When `spiders` is empty, every spider found in those
    configurations is scheduled, otherwise only the ones listed.

    For each selected job that is not disabled and has at least one cron:

    - its settings (merged with the file's `global_settings`) are pushed to
      Scrapinghub; the job is skipped if that update fails
    - one periodic job is created per cron and per combination of arguments
      rendered from `dynamic_args`
    - when `monitor` is truthy and the job defines `monitoring`, a Datadog
      monitor is created (skipped with `dry_run`)

    With `reset`, existing periodic jobs of the selected spiders are removed
    upfront. `dry_run` is forwarded to the Scrapinghub helpers.

    """
    datadog.initialize(
        api_key=os.getenv("DATADOG_API_KEY"),
        app_key=os.getenv("DATADOG_APP_KEY"),
    )

    # materialise all the configs: they are walked twice, first to gather
    # spider names and then to schedule
    configs = list(
        walk_configs(conf_root=root_conf,
                     paths=conf_files,
                     blacklist=["settings.yml"]))

    spider_names = spiders or []
    if not spiders:
        for conf in configs:
            spider_names.extend(
                [declared["spider"] for declared in conf["jobs"]])

    if reset:
        # be gentle with the api and batch delete the jobs upfront
        shub.reset_periodic_jobs(project_id, set(spider_names), dry_run)

    # TODO validate configuration (a separate command ?)
    for specs in configs:
        # file-level values shared by every job of this configuration
        global_tags = GLOBAL_JOB_TAGS + specs.get("global_tags", [])
        global_settings = specs.get('global_settings', {})
        default_crons = specs.get("default_crons", [])

        for job_spec in specs["jobs"]:
            name = job_spec["spider"]

            # one can limit spiders from the command line
            if name not in spider_names:
                info('spider "{}" not required to schedule'.format(name))
                continue

            if job_spec.get("disabled"):
                info('spider "{}" disabled by config'.format(name))
                continue

            crons = job_spec.get("crons") or default_crons
            if not crons:
                fail('spider "{}" contains no cron'.format(name))
                continue

            # overwrite spider-level settings on scrapinghub
            if job_spec.get("settings") or global_settings:
                # NOTE(review): global settings are unpacked last, so they
                # win over the job's own values on key conflicts - confirm
                # this is the intended precedence.
                merged = {**job_spec.get("settings", {}), **global_settings}
                job_spec["settings"] = merged
                for key, value in merged.items():
                    if isinstance(value, dict):
                        # the only case is when using `secret: 'vault.decrypt("....")'`
                        merged[key] = eval(value['secret'])

                spider_id = shub.to_spider_id(project_id, name)
                res = shub.update_spider_settings(project_id, spider_id,
                                                  merged)
                if not res.ok:
                    fail(f'failed to update settings for spider "{name}"')
                    # skip since we don't want scheduled jobs to fail due to incorrect settings
                    continue

            # fill defaults
            # NOTE probably better done merging a hierarchy of dicts
            job_spec["priority"] = job_spec.get("priority", DEFAULT_PRIORITY)
            job_spec["tags"] = global_tags + job_spec.get("tags", [])
            job_spec["crons"] = crons

            for combination in render_args(job_spec.get("dynamic_args", {})):
                # add static arguments for every combination generated
                combination.update(job_spec.get("args", {}))

                for cron_tpl in job_spec["crons"]:
                    cron = generate_cron(cron_tpl)
                    info(f"creating job on project {project_id}: {cron}")
                    job_id, err = shub.create_periodic_job(
                        name,
                        project_id,
                        cron,
                        combination,
                        job_spec["tags"],
                        job_spec.get("description", DEFAULT_DESCRIPTION),
                        job_spec["priority"],
                        dry_run,
                    )
                    if not (job_id or dry_run):
                        fail(f"failed to schedule job: {err}")
                    else:
                        success(f"done: {job_id}")

            monitoring = job_spec.get("monitoring")
            if not (monitoring and monitor):
                continue

            # make sure monitors created from here can be told apart
            monitor_tags = monitoring.setdefault("tags", [])
            if "creator:bot" not in monitor_tags:
                monitor_tags.append("creator:bot")

            # TODO and only if the command above worked fine
            # Create a new monitor (don't care about the dynamic nature of arguments)
            try:
                info("creating new datadog monitor: {}".format(monitoring))
                if not dry_run:
                    feedback = datadog.api.Monitor.create(
                        type="metric alert", **monitoring)
                    errors = feedback.get("errors")
                    if errors:
                        fail("failed to create monitor: {}".format(errors))
                    else:
                        success("successfully created alert on {}".format(
                            feedback["created"]))
            except ValueError as e:
                # usually error 403 forbidden that return an HTML page instead of json
                fail("failed to create monitor: {}".format(e))