Example #1
def create_periodic_job(spider, project_id, cron, args, tags, description, priority, dry_run):
    # TODO support dry_run
    cron = SHCronType(*cron.replace(' ', '').split(','))
    payload = {
        'month': cron.month,
        'dayofmonth': cron.day_of_month,
        'day': cron.day_of_week,
        'hour': cron.hour,
        'minutes_shift': cron.min,
        "spiders": [{'name': spider, 'spider_args': args, 'priority': priority}],
        'description': description,
        'addtags': tags,
    }

    if dry_run:
        ui.info('job schedule: {}'.format(payload))
        # fake job id, no error
        return 222, None

    resrc = 'projects/{}/periodicjobs'.format(project_id)
    url = SH_API_BASE.format(v=SH_API_VERSION, resrc=resrc)
    # NOTE no headers ?
    res = requests.post(url, auth=HTTPBasicAuth(SH_API_KEY, ''), headers=HEADERS, json=payload)
    if res.status_code != 201:
        # TODO rollback the whole deployment ?
        return None, res.text

    return res.json().get('id'), None
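`SHCronType` itself is not shown here; a minimal sketch, assuming it is a plain namedtuple in standard crontab field order (this definition is an assumption, not the project's actual type):

# Assumed sketch only: the real SHCronType is defined elsewhere in the project.
from collections import namedtuple

SHCronType = namedtuple('SHCronType', ['min', 'hour', 'day_of_month', 'month', 'day_of_week'])

# "30, 4, *, *, 1" -> every Monday at 04:30
cron = SHCronType(*'30, 4, *, *, 1'.replace(' ', '').split(','))
assert cron.hour == '4' and cron.day_of_week == '1'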
Example #2
def search(cache, imo):
    ui.info(f"loading static fleet (caching: {cache})")
    fleet = vessels(disable_cache=not cache)
    ui.info(f"searching for vessel with imo: {imo}")
    vessel = fleet.get(imo)
    if vessel:
        ui.success("on Kpler radar")
        pp(vessel, indent=4)
Example #3
def display(data_type, cache):
    """Display who produces what.

    """
    info(f'looking for data sources exposing `{data_type}` information...')
    for crawler in lookup_providers(data_type, cache):
        success(
            f"Provider {crawler['provider']} (powered by spider={crawler['name']})"
        )
Example #4
def scrape(project_id, spiders, root_conf, conf_files, tail, dry_run, args):
    """Run spiders on Scrapinghub based on configure files or command line argument"""
    spider_names = spiders or []

    if conf_files:
        # force yielding all the configs so we can merge spider names
        configs = list(
            walk_configs(conf_root=root_conf,
                         paths=conf_files,
                         blacklist=["settings.yml"]))

        if not spider_names:
            # do we really want to scrape all spiders when none are specified?
            for conf in configs:
                spider_names.extend(
                    [job_spec["spider"] for job_spec in conf["jobs"]])

        for specs in configs:
            global_tags = GLOBAL_JOB_TAGS + specs.get("global_tags", [])

            for raw_job_spec in specs['jobs']:
                spider_name = raw_job_spec['spider']
                if spider_name not in spider_names:
                    info(f'spider {spider_name} not required to schedule')
                    continue

                if raw_job_spec.get('disabled'):
                    info(f'spider {spider_name} disabled by configuration file')
                    continue

                args = raw_job_spec.get('args', {})
                tags = global_tags + raw_job_spec.get('tags', [])
                priority = raw_job_spec.get('priority', None)

                # dynamic arguments
                for combination in render_args(
                        raw_job_spec.get("dynamic_args", {})):
                    args.update(combination)

                    if not dry_run:
                        job_spec = _build_job_spec(spider_name, args, tags,
                                                   priority)
                        _run_spider(project_id, job_spec, tail)
    else:
        if not spider_names:
            fail('Please specify spider name.')
            return

        job_args = {}
        for key_value in args:
            key, _, value = key_value.partition('=')
            job_args.update({key: value})

        if not dry_run:
            for spider in spider_names:
                job_spec = _build_job_spec(spider, args=job_args)
                _run_spider(project_id, job_spec, tail)
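`_build_job_spec` is not shown in these examples; judging from how `_run_spider` reads the spec (`spider_name`, `args`, `tags`, `priority`), a hypothetical version could look like this:

# Hypothetical helper, inferred from how _run_spider consumes the spec;
# the real implementation may differ.
def _build_job_spec(spider_name, args=None, tags=None, priority=None):
    return {
        'spider_name': spider_name,
        'args': args or {},
        'tags': tags or [],
        'priority': priority,
    }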
Example #5
def reset_periodic_jobs(project_id, job_names, dry_run=False):
    res = periodic_jobs(project_id)
    if res.status_code != 200:
        raise ValueError('failed to retrieve jobs list: {}'.format(res.text))

    for job in res.json().get('results', []):
        spiders = [s['name'] for s in job['spiders']]
        if set(job_names).intersection(spiders) and job['type'] == 'spider':
            ui.info('existing job found ({}), resetting'.format(job['id']))
            if not dry_run:
                res = delete_periodic_job(project_id, job['id'])
                assert res.status_code == 204
Example #6
def walk_configs(conf_root=None, paths=None, blacklist=None):
    assert conf_root or paths

    blacklist = blacklist or []
    list_confs = paths or _walk_files(conf_root, filetype='yml')

    for conf_file in list_confs:
        if any(to_avoid in conf_file for to_avoid in blacklist):
            continue

        with open(conf_file, 'r') as fd:
            info(f'loading jobs in {conf_file}')
            yield yaml.safe_load(fd)
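A usage sketch, assuming a directory of job YAML files (the `conf/jobs` path is illustrative):

# Illustrative only: the directory layout is an assumption.
for conf in walk_configs(conf_root='conf/jobs', blacklist=['settings.yml']):
    for job_spec in conf.get('jobs', []):
        print(job_spec['spider'], job_spec.get('crons', []))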
Example #7
def batch_update(base, table, data):
    """Update multiple rows.

    Args:
        base (str): Airtable base name.
        table (str): Airtable table name.
        data (List[Tuple[dict, str]]): pairs of (row fields, record id).

    Returns:
        None
    """
    for row, record_id in data:
        info(f'Updating row {row["Spider"]}')
        update(base, table, record_id, row)
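A usage sketch showing the expected shape of `data`, i.e. `(row, record id)` pairs (the field values and record id below are illustrative):

# Illustrative values only: the record id and field values are made up.
rows = [
    ({'Spider': 'ExampleSpider', 'State': 'enabled'}, 'recXXXXXXXXXXXXXX'),
]
batch_update('Data Sourcing', 'Overview', rows)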
Example #8
def batch_create(base, table, data):
    """Create multiple records within rate limit.

    Args:
        base (str): Airtable base name.
        table (str): Airtable table name.
        data (List[dict]): rows to create, keyed by column name.

    Returns:
        None

    """
    for row in data:
        info(f'Creating row {row["Spider"]}')
        create(base, table, row)
Example #9
def export(bucket, filename):
    """Export spider attributes to S3.

    This is especially useful for downstream processing where Kpler-aware
    components want to process specific data types, but the datalake only knows
    about sources.

    Usage:

            ./who-produces.py export --bucket kp-datalake

    """
    info(f"exporting spider meta ...")
    key = f'__meta/{filename}'
    upload_blob(bucket, key, spiders(use_cache=False))
    success(f"spider meta is located at `s3://{bucket}/{key}`")
Example #10
def fetch_jobs(project_id, spider, with_tags, skip_tags, with_args, max_jobs,
               start_date, end_date, job_keys):
    """Basically remap scrapinghub expectations with Kpler cli conventions."""
    search_opts = {'spider': spider, 'job_keys': job_keys}
    if max_jobs:
        search_opts['count'] = max_jobs
    if with_tags:
        search_opts['has_tag'] = list(with_tags)
    if skip_tags:
        search_opts['lacks_tag'] = list(skip_tags)
    if start_date:
        # shub client expects unix millisecond timestamps
        search_opts['startts'] = int(
            dateutil.parser.parse(start_date).timestamp() * 1000)
    if end_date:
        search_opts['endts'] = int(
            dateutil.parser.parse(end_date).timestamp() * 1000)

    info("fetching jobs project={}".format(project_id))
    for job in shub.spider_jobs(project_id, **search_opts):
        info("inspecting new job {}".format(job.key))
        # reformat like cli args
        # NOTE `job.metadata` is an object and its `get` method does not
        #       allow usage like so `.get('spider_args', {})`
        job_args = [
            f'{k}={v}'
            for k, v in (job.metadata.get('spider_args') or {}).items()
        ]
        if not all([t in job_args for t in with_args]):
            info("skipping job {} with args: {}".format(job.key, job_args))
            continue

        yield job
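A usage sketch of the generator above (the project id, spider name and tag values are illustrative):

# Illustrative call only: project id, spider name and tags are made up.
jobs = fetch_jobs(
    project_id=12345,
    spider='ExampleSpider',
    with_tags=('daily',),
    skip_tags=(),
    with_args=[],
    max_jobs=10,
    start_date='2020-01-01',
    end_date=None,
    job_keys=None,
)
for job in jobs:
    print(job.key)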
Example #11
def _run_spider(project_id, job_spec, tail):
    """Connect to Scrapinghub and run the job."""
    spider = job_spec.get('spider_name')

    info(f'running {spider} on scrapinghub: {project_id}: '
         f'args={job_spec.get("args")} tags={job_spec.get("tags")}')

    from kp_scrapers.lib.services.shub import shub_conn

    job = (shub_conn().get_project(project_id).jobs.run(
        spider,
        add_tag=job_spec.get('tags'),
        job_args=job_spec.get('args'),
        priority=job_spec.get('priority'),
    ))
    url = f'https://app.scrapinghub.com/p/{job.key}'
    success(f'{spider} is scraping at {url}')

    if tail:
        # poll until the job finishes, tailing its logs along the way
        info("starting to tail logs...")
        while job.metadata.get("state") != "finished":
            time.sleep(5)
            for line in job.logs.list():
                info("[ {} ] {}".format(spider, line))
        sys.exit(0 if job.metadata.get("close_reason") == "finished" else 1)
Example #12
def doctor(project_id, jobs_path, ignore, checks, filters):
    """Register several periodic jobs from a yaml conf."""
    info("loading local jobs path={}".format(jobs_path))
    local_jobs = list(load_jobs(jobs_path, blacklist=['settings.yml']))

    # TODO merge `sensible` table
    info("loading airtable base table={}".format('Overview'))
    records = list(
        kp_airtable.retrieve_all_records('Data Sourcing', 'Overview'))

    info("loading Scrapinghub periodic jobs project={}".format(project_id))
    shub_jobs = []
    disabled_jobs = []
    for job in shub.periodic_jobs(project_id).json().get('results', []):
        if not job['disabled']:
            shub_jobs.extend(job.get('spiders'))
        else:
            disabled_jobs.extend(job.get('spiders'))

    filteropts = ''
    for _filter in filters:
        filteropts += '--filter {} '.format(_filter)
    info("filtering spiders by attributes: {}".format(filters))

    for spider in run_scrapy_command('describe',
                                     '{} --silent'.format(filteropts)):
        if spider['name'] in ignore:
            info("ignoring spider {}".format(spider['name']))
            continue

        if _should_run('scheduling', checks):
            if not lookup(local_jobs, value=spider['name'], key='spider'):
                fail("spider {name} is not scheduled locally".format(**spider))

        if _should_run('scrapinghub', checks):
            if lookup(disabled_jobs, value=spider['name'], key='name'):
                info("spider {name} is disabled on Scrapinghub".format(
                    **spider))
            elif not lookup(shub_jobs, value=spider['name'], key='name'):
                fail("spider {name} is not scheduled on Scrapinghub".format(
                    **spider))

        if _should_run('airtable', checks) and spider.get('provider'):
            # TODO compare records['State'] with last job run
            if not lookup(records, value=spider['provider'], key='Name'):
                fail("spider {name} is not documented on Airtable".format(
                    **spider))

        if _should_run('spider', checks):
            if not spider['version']:
                fail("spider {name} is not versioned".format(**spider))
            if not spider['provider']:
                fail("spider {name} doesn't define a data provider".format(
                    **spider))
            if not spider['produces']:
                fail("spider {name} doesn't define the data types produced".
                     format(**spider))

        hr('-')

    success("diagnostic done - patient in trouble")
Example #13
def create(project, copy_from, org):
    """Create a new project on Scrapinghub.

    One is expected to use the default arguments so we all share the same
    development workflow: create a feature branch, create a project, test on
    that project, then merge and delete the environment.

    By default this wrapper uses the Kpler staging env to avoid eating
    production (paid) resources, yet it tries to reproduce that env as closely
    as possible by reading settings from the environment and pushing them to
    the new project.

    """
    project = project or default_project_name()

    info('creating project {} on {}'.format(project, org))
    res = shub.create_project(project, org)
    """Since it is not documented , here is `res.json()` format:
    {
        'info': {
            'description': ''
        },
        'owners': [],
        'ui_settings': {},
        'visual_project_type': '',
        'settings': {},
        'deleted': False,
        'description': '',
        'has_eggs': False,
        'monitoring_enabled': False,
        'organization_name': 'kpler.com',
        'last_activity': None,
        'version': None,
        'data_retention': 0,
        'organization': 288,
        'addons': [],
        'default_kumo_units': None,
        'id': 169023,
        'csv_fields': None,
        'name': 'testing-tmp3'
    }
    """
    if not res.ok:
        fail('failed to create project: {}'.format(res.text))
        sys.exit(1)

    feedback = res.json()
    success('project {} successfully created ({})'.format(
        project, feedback['id']))
    with shub.Config() as conf:
        info('updating config: {}'.format(conf.path))
        conf['projects'][project] = feedback['id']
        conf.commit()

    if copy_from:
        if copy_from.endswith('yml'):
            with open(copy_from, 'r') as fd:
                project_settings = {}
                for k, v in yaml.safe_load(fd).items():
                    if isinstance(v, dict):
                        # the only case is when using `secret: 'vault.decrypt("....")'`
                        project_settings[k] = eval(v['secret'])
                    else:
                        project_settings[k] = v
        else:
            copy_from = shub.to_project_id(None, None, copy_from)
            info("populating project settings from project {}".format(
                copy_from))
            # TODO could be safer
            project_settings = shub.project_settings(copy_from).json().get(
                'settings')
    else:
        info("populating project settings from env")
        project_settings = {}
        for k, v in PROJECT_SETTINGS.items():
            from_env = os.getenv(k)
            if from_env is None and v is None:
                info('no value defined for setting `{}`'.format(k))
            elif from_env:
                info('overwriting setting with env value: {}={}'.format(
                    k, from_env))
                project_settings[k] = from_env

    res = shub.update_project_settings(feedback['id'], project_settings)
    if res.ok:
        success('successfully updated project settings')
    else:
        fail('failed to update project settings: {}'.format(res.text))
Example #14
def export(
    spider, transform, output, markers, deduplicate_on, filter_file, session_id, retry, **opts
):
    """Export Scrapinghub items."""
    if retry:
        # TODO append to session file?
        info("resuming session `{}`".format(retry))
        session_id = retry
        opts['skip_tags'].append(session_id)
    else:
        info('starting session `{}`'.format(session_id))

    transform = TRANSFORMERS.get(transform) or noop
    markers = list(markers + DEFAULT_MARKERS + (session_id,))
    raw_data = []
    unique_keys = []
    fails = 0

    constraint_keys = []
    if filter_file:
        info("loading constraints file {}".format(filter_file))
        with open(filter_file) as csvfile:
            reader = csv.DictReader(csvfile)
            constraint_keys = [row[deduplicate_on] for row in reader]

    opts['spider'] = spider
    for job in filter(has_scraped_data, fetch_jobs(**opts)):
        try:
            # tag it to remember we processed it (ETL style)
            shub.update_tags(job, add=markers)

            info("processing job {}".format(job.key))
            for item in job.items.iter():
                if deduplicate_on:
                    key = item[deduplicate_on]
                    if key in unique_keys:
                        continue
                    if constraint_keys and key not in constraint_keys:
                        continue
                    # individual items can be nested and store multiple types
                    # so `transform` is a generic generator we consume here
                    raw_data.extend(transform(item))
                    unique_keys.append(key)
                else:
                    raw_data.extend(transform(item))
        except Exception as e:
            fail('fetching jobs crashed: {}, going on'.format(e))
            # dumping the data we got so far
            # one might be able to resume execution
            # NOTE could dump data so far in output = '/tmp/' + session_id + '.' + output
            fails += 1

    success("done ({} exceptions), exporting data".format(fails))

    if raw_data:
        output_format = output.split('.')[-1]
        success('exporting data raw={} to {}'.format(len(raw_data), output))

        # TODO support sqlite
        # TODO support stdout
        # TODO support gdrive export - might be better to wait for another PR bringing
        #      kp-gdrive and its high-level interface
        with open(output, 'w') as fd:
            if output_format == 'csv':
                csv_export(raw_data, fd)
            elif output_format == 'jl':
                jl_export(raw_data, fd)
    else:
        fail("no data was fetched")
Example #15
def batch_schedule(project_id, spiders, root_conf, conf_files, dry_run, reset,
                   monitor):
    """Register several periodic jobs from a yaml conf."""
    datadog.initialize(api_key=os.getenv("DATADOG_API_KEY"),
                       app_key=os.getenv("DATADOG_APP_KEY"))

    # force yielding all the configs so we can merge spider names
    configs = list(
        walk_configs(conf_root=root_conf,
                     paths=conf_files,
                     blacklist=["settings.yml"]))
    spider_names = spiders or []
    if not spiders:
        for conf in configs:
            spider_names.extend(
                [job_spec["spider"] for job_spec in conf["jobs"]])

    if reset:
        # be gentle with the api and batch delete the jobs upfront
        shub.reset_periodic_jobs(project_id, set(spider_names), dry_run)

    # TODO validate configuration (a separate command?)
    for specs in configs:
        # init with global settings that apply for all
        g_tags = GLOBAL_JOB_TAGS + specs.get("global_tags", [])
        g_settings = specs.get('global_settings', {})
        d_crons = specs.get("default_crons", [])

        for job_spec in specs["jobs"]:
            # one can limit spiders from the command line
            if job_spec["spider"] not in spider_names:
                info('spider "{}" not required to schedule'.format(
                    job_spec["spider"]))
                continue

            if job_spec.get("disabled"):
                info('spider "{}" disabled by config'.format(
                    job_spec["spider"]))
                continue

            if not (job_spec.get("crons") or d_crons):
                fail('spider "{}" contains no cron'.format(job_spec["spider"]))
                continue

            # overwrite spider-level settings on scrapinghub
            if job_spec.get("settings") or g_settings:
                # use default settings if there exists any
                job_spec["settings"] = {
                    **job_spec.get("settings", {}),
                    **g_settings
                }
                for key, value in job_spec["settings"].items():
                    if isinstance(value, dict):
                        # the only case is when using `secret: 'vault.decrypt("....")'`
                        job_spec["settings"][key] = eval(value['secret'])

                spider_id = shub.to_spider_id(project_id, job_spec["spider"])
                res = shub.update_spider_settings(project_id, spider_id,
                                                  job_spec["settings"])
                if not res.ok:
                    fail(
                        f'failed to update settings for spider "{job_spec["spider"]}"'
                    )
                    # skip since we don't want scheduled jobs to fail due to incorrect settings
                    continue

            # fill defaults
            # NOTE probably better done by merging a hierarchy of dicts
            job_spec["priority"] = job_spec.get("priority", DEFAULT_PRIORITY)
            job_spec["tags"] = g_tags + job_spec.get("tags", [])
            job_spec["crons"] = job_spec.get("crons") or d_crons

            for combination in render_args(job_spec.get("dynamic_args", {})):
                # add static arguments for every combination generated
                combination.update(job_spec.get("args", {}))

                for cron_tpl in job_spec["crons"]:
                    cron = generate_cron(cron_tpl)
                    info(f"creating job on project {project_id}: {cron}")
                    job_id, err = shub.create_periodic_job(
                        job_spec["spider"],
                        project_id,
                        cron,
                        combination,
                        job_spec["tags"],
                        job_spec.get("description", DEFAULT_DESCRIPTION),
                        job_spec["priority"],
                        dry_run,
                    )
                    if job_id or dry_run:
                        success(f"done: {job_id}")
                    else:
                        fail(f"failed to schedule job: {err}")

            if job_spec.get("monitoring") and monitor:
                job_spec["monitoring"]["tags"] = job_spec["monitoring"].get(
                    "tags", [])
                if "creator:bot" not in job_spec["monitoring"]["tags"]:
                    job_spec["monitoring"]["tags"].append("creator:bot")

                # TODO and only if the command above worked fine
                # Create a new monitor (don't care about the dynamic nature of arguments)
                try:
                    info("creating new datadog monitor: {}".format(
                        job_spec["monitoring"]))
                    if not dry_run:
                        feedback = datadog.api.Monitor.create(
                            type="metric alert", **job_spec["monitoring"])
                        if feedback.get("errors"):
                            fail("failed to create monitor: {}".format(
                                feedback["errors"]))
                        else:
                            success("successfully created alert on {}".format(
                                feedback["created"]))
                except ValueError as e:
                    # usually a 403 Forbidden error that returns an HTML page instead of JSON
                    fail("failed to create monitor: {}".format(e))