Example #1
0
def scrape(project_id, spiders, root_conf, conf_files, tail, dry_run, args):
    """Run spiders on Scrapinghub from configuration files or command line arguments.

    With `conf_files`, every job found in the configurations is run once per
    combination produced by `render_args` over its `dynamic_args`. Jobs are
    skipped when their spider was not requested or when they are flagged
    `disabled`. Without `conf_files`, each spider in `spiders` is run once
    with the `key=value` pairs given in `args`.

    Args:
        project_id: project identifier forwarded to `_run_spider`.
        spiders: names of the spiders to run; when empty and `conf_files` is
            given, every spider listed in the configurations is selected.
        root_conf: forwarded to `walk_configs` as `conf_root`.
        conf_files: forwarded to `walk_configs` as `paths`.
        tail: forwarded to `_run_spider`.
        dry_run: when truthy, no job is built nor run.
        args: iterable of `key=value` strings; only used without `conf_files`.
    """
    # copy so the caller's collection is never extended in place
    spider_names = list(spiders or [])

    if not conf_files:
        if not spider_names:
            fail('Please specify spider name.')
            return

        job_args = {}
        for key_value in args:
            # only the first `=` separates key and value
            key, _, value = key_value.partition('=')
            job_args[key] = value

        if not dry_run:
            for spider in spider_names:
                job_spec = _build_job_spec(spider, args=job_args)
                _run_spider(project_id, job_spec, tail)
        return

    # force yielding all the configs so we can merge spider names
    configs = list(
        walk_configs(conf_root=root_conf,
                     paths=conf_files,
                     blacklist=["settings.yml"]))

    if not spider_names:
        # do we really want to scrape all spiders if spider not specified?
        for conf in configs:
            spider_names.extend(
                job_spec["spider"] for job_spec in conf["jobs"])

    for specs in configs:
        global_tags = GLOBAL_JOB_TAGS + specs.get("global_tags", [])

        for raw_job_spec in specs['jobs']:
            spider_name = raw_job_spec['spider']
            if spider_name not in spider_names:
                info(f'spider {spider_name} not required to schedule')
                continue

            if raw_job_spec.get('disabled'):
                info(f'spider {spider_name} disabled by configuration file')
                continue

            static_args = raw_job_spec.get('args', {})
            tags = global_tags + raw_job_spec.get('tags', [])
            priority = raw_job_spec.get('priority', None)

            # dynamic arguments
            for combination in render_args(
                    raw_job_spec.get("dynamic_args", {})):
                # fresh dict per combination: dynamic values override static
                # ones, and neither the configuration nor the previous
                # combination is mutated
                job_args = {**static_args, **combination}

                if not dry_run:
                    job_spec = _build_job_spec(spider_name, job_args, tags,
                                               priority)
                    _run_spider(project_id, job_spec, tail)
Example #2
0
def schedule(**kwargs):
    """Register a single periodic job.

    `kwargs["args"]` is an iterable of `key=value` strings. It is converted
    to a dict before all keyword arguments are forwarded to
    `shub.create_periodic_job`.

    Raises:
        ValueError: if one of the arguments contains no `=`.
    """
    # split on the first `=` only so values may themselves contain `=`
    kwargs["args"] = dict(raw.split("=", 1) for raw in kwargs["args"])

    job_id, err = shub.create_periodic_job(**kwargs)
    if job_id is None:
        fail(f"failed to create job: {err}")
    else:
        success("successfully created periodic job (id={})".format(job_id))
Example #3
0
def has_scraped_data(job):
    """Tell whether `job` reports at least one scraped item in its stats.

    Reports through `fail` and returns False when the job metadata has no
    `scrapystats` entry, or when its `item_scraped_count` is missing, None
    or 0. Returns True otherwise.
    """
    scrapy_stats = job.metadata.get('scrapystats')

    if scrapy_stats is not None:
        # TODO check why count is sometimes None
        if scrapy_stats.get('item_scraped_count'):
            return True
        fail("jobs didn't yield any data")
    else:
        fail('no stats available, skipping useless job {}'.format(job.key))

    return False
Example #4
0
def delete(project):
    """Delete a project from the scrapinghub org.

    Meant to be called with default arguments, so that it targets the very
    project registered by the `create` command. That way both commands can
    bracket the life of a feature-branch without having to repeat names.

    The local configuration is only committed once the remote deletion
    succeeded.

    """
    target = project or default_project_name()

    with shub.Config() as conf:
        # crashes when the project is not found in the configuration
        project_id = conf['projects'].pop(target)

        response = shub.delete_project(project_id)
        if not response.ok:
            fail('failed to delete project: {}'.format(response.text))
            return

        success('project {} successfully deleted ({})'.format(
            target, project_id))
        # overwrite content on success
        conf.commit()
Example #5
0
def doctor(project_id, jobs_path, ignore, checks, filters):
    """Diagnose spiders against local jobs, Scrapinghub and Airtable.

    Every spider yielded by the scrapy `describe` command, except those
    named in `ignore`, goes through the checks enabled by `checks`:

    - `scheduling`: the spider appears in the local jobs loaded from
      `jobs_path`.
    - `scrapinghub`: the spider appears in an enabled periodic job of
      `project_id` (a disabled one is only reported as information).
    - `airtable`: the spider's provider appears in the Airtable records.
    - `spider`: the spider defines `version`, `provider` and `produces`.

    Problems are reported through `fail`; nothing is returned.

    Args:
        project_id: Scrapinghub project whose periodic jobs are inspected.
        jobs_path: forwarded to `load_jobs`.
        ignore: container of spider names to skip.
        checks: forwarded to `_should_run` to decide which checks apply.
        filters: iterable of values, each passed as a `--filter` option to
            the `describe` command.
    """
    info("loading local jobs path={}".format(jobs_path))
    local_jobs = list(load_jobs(jobs_path, blacklist=['settings.yml']))

    # TODO merge `sensible` table
    info("loading airtable base table={}".format('Overview'))
    records = list(
        kp_airtable.retrieve_all_records('Data Sourcing', 'Overview'))

    info("loading Scrapinghub periodic jobs project={}".format(project_id))
    shub_jobs = []
    disabled_jobs = []
    for job in shub.periodic_jobs(project_id).json().get('results', []):
        bucket = disabled_jobs if job['disabled'] else shub_jobs
        # a job without `spiders` contributes nothing instead of crashing
        bucket.extend(job.get('spiders') or [])

    # the trailing space after each option is kept so the command line is
    # exactly what it has always been
    filteropts = ''.join(
        '--filter {} '.format(_filter) for _filter in filters)
    info("filtering spiders by attributes: {}".format(filters))

    for spider in run_scrapy_command('describe',
                                     '{} --silent'.format(filteropts)):
        if spider['name'] in ignore:
            info("ignoring spider {}".format(spider['name']))
            continue

        if _should_run('scheduling', checks):
            if not lookup(local_jobs, value=spider['name'], key='spider'):
                fail("spider {name} is not scheduled locally".format(**spider))

        if _should_run('scrapinghub', checks):
            if lookup(disabled_jobs, value=spider['name'], key='name'):
                info("spider {name} is disabled on Scrapinghub".format(
                    **spider))
            elif not lookup(shub_jobs, value=spider['name'], key='name'):
                fail("spider {name} is not scheduled on Scrapinghub".format(
                    **spider))

        if _should_run('airtable', checks) and spider.get('provider'):
            # TODO compare records['State'] with last job run
            if not lookup(records, value=spider['provider'], key='Name'):
                fail("spider {name} is not documented on Airtable".format(
                    **spider))

        if _should_run('spider', checks):
            # `.get` so a missing attribute is reported like an empty one
            if not spider.get('version'):
                fail("spider {name} is not versioned".format(**spider))
            if not spider.get('provider'):
                fail("spider {name} doesn't define a data provider".format(
                    **spider))
            if not spider.get('produces'):
                fail("spider {name} doesn't define the data types produced".
                     format(**spider))

        hr('-')

    success("diagnostic done - patient in trouble")
Example #6
0
def create(project, copy_from, org):
    """Create a new project on Scrapinghub.

    One is expected to use default arguments so we can share the same
    development workflow.  Basically create a feature-branch, create a project,
    test on this project, merge and delete env.

    This wrapper uses by default the Kpler staging env to avoid eating
    production (paid) resources. Yet it also tries to reproduce as closely as
    possible this env, reading settings from env and pushing them on the new
    project.

    Args:
        project: name of the project; falls back to `default_project_name()`.
        copy_from: source of the project settings. A value ending with `yml`
            is read as a YAML file, any other truthy value is resolved to an
            existing project through `shub.to_project_id`. When falsy, the
            settings are read from environment variables named after the
            keys of `PROJECT_SETTINGS`.
        org: organisation the project is created on.

    The process exits with status 1 when the project cannot be created.

    """
    project = project or default_project_name()

    info('creating project {} on {}'.format(project, org))
    res = shub.create_project(project, org)
    # Since it is not documented, here is `res.json()` format:
    # {
    #     'info': {
    #         'description': ''
    #     },
    #     'owners': [],
    #     'ui_settings': {},
    #     'visual_project_type': '',
    #     'settings': {},
    #     'deleted': False,
    #     'description': '',
    #     'has_eggs': False,
    #     'monitoring_enabled': False,
    #     'organization_name': 'kpler.com',
    #     'last_activity': None,
    #     'version': None,
    #     'data_retention': 0,
    #     'organization': 288,
    #     'addons': [],
    #     'default_kumo_units': None,
    #     'id': 169023,
    #     'csv_fields': None,
    #     'name': 'testing-tmp3'
    # }
    if not res.ok:
        fail('failed to create project: {}'.format(res.text))
        sys.exit(1)

    feedback = res.json()
    success('project {} successfully created ({})'.format(
        project, feedback['id']))
    with shub.Config() as conf:
        info('updating config: {}'.format(conf.path))
        conf['projects'][project] = feedback['id']
        conf.commit()

    if copy_from:
        if copy_from.endswith('yml'):
            with open(copy_from, 'r') as fd:
                # `safe_load` only builds plain Python objects, and unlike a
                # bare `yaml.load(fd)` it needs no explicit Loader; an empty
                # file yields None, hence the fallback
                raw_settings = yaml.safe_load(fd) or {}

            project_settings = {}
            for k, v in raw_settings.items():
                if isinstance(v, dict):
                    # the only case is when using `secret: 'vault.decrypt("....")'`
                    # NOTE(review): `eval` executes whatever expression the
                    # file contains - only use trusted settings files
                    project_settings[k] = eval(v['secret'])
                else:
                    project_settings[k] = v
        else:
            copy_from = shub.to_project_id(None, None, copy_from)
            info("populating project settings from project {}".format(
                copy_from))
            # TODO could be safer
            project_settings = shub.project_settings(copy_from).json().get(
                'settings')
    else:
        info("populating project settings from env")
        project_settings = {}
        for k, v in PROJECT_SETTINGS.items():
            from_env = os.getenv(k)
            if from_env is None and v is None:
                info('no value defined for setting `{}`'.format(k))
            elif from_env:
                info('overwriting setting with env value: {}={}'.format(
                    k, from_env))
                project_settings[k] = from_env
            # NOTE(review): a non-None default `v` is never pushed when the
            # env variable is unset - confirm this is intended

    res = shub.update_project_settings(feedback['id'], project_settings)
    if res.ok:
        success('successfully updated project settings')
    else:
        fail('failed to update project settings: {}'.format(res.text))
Example #7
0
def protect_projects(ctx, param, value):
    """Refuse to go on when `value` designates a protected project.

    The `(ctx, param, value)` signature is the one of a click parameter
    callback (presumably how this function is registered - confirm against
    the command declaration). Such a callback must return the value to use
    for the parameter, so `value` is handed back untouched when accepted.

    Args:
        ctx: context whose `exit()` is called on a protected project.
        param: unused.
        value: project id to validate; falsy values are accepted as is.

    Returns:
        `value`, unchanged.
    """
    if value and int(value) in [
            SH_PLAYGROUND_PROJECT, SH_PRODUCTION_PROJECT, SH_LEGACY_PROJECT
    ]:
        fail(f"deleting project #{value} is forbidden")
        ctx.exit()

    # without this the parameter would always be replaced by None
    return value
Example #8
0
def export(
    spider, transform, output, markers, deduplicate_on, filter_file, session_id, retry, **opts
):
    """Export Scrapinghub items.

    Jobs returned by `fetch_jobs(**opts)` that hold scraped data are tagged
    with `markers`, then their items are transformed and accumulated before
    being written to `output`.

    Args:
        spider: spider name, handed to `fetch_jobs` through `opts['spider']`.
        transform: key into `TRANSFORMERS`; unknown keys fall back to `noop`.
        output: destination path; its extension (`csv` or `jl`) selects the
            export format.
        markers: tuple of tags added to every processed job, together with
            `DEFAULT_MARKERS` and the session id.
        deduplicate_on: item field used to keep only the first item seen for
            each value; falsy to disable de-duplication.
        filter_file: optional CSV file; when given, only items whose
            `deduplicate_on` value appears in its `deduplicate_on` column are
            kept.
        session_id: identifier of this export session.
        retry: identifier of a previous session to resume; it replaces
            `session_id` and is appended to `opts['skip_tags']`.
        **opts: forwarded to `fetch_jobs`.
    """
    if retry:
        # TODO append to session file?
        info("resuming session `{}`".format(retry))
        session_id = retry
        opts['skip_tags'].append(session_id)
    else:
        info('starting session `{}`'.format(session_id))

    transform = TRANSFORMERS.get(transform) or noop
    markers = list(markers + DEFAULT_MARKERS + (session_id,))
    raw_data = []
    # sets give constant-time membership tests; keys are assumed hashable
    # (they are compared against strings read from a CSV file)
    seen_keys = set()
    fails = 0

    constraint_keys = set()
    if filter_file:
        info("loading constraints file {}".format(filter_file))
        with open(filter_file) as csvfile:
            reader = csv.DictReader(csvfile)
            constraint_keys = {row[deduplicate_on] for row in reader}

    opts['spider'] = spider
    for job in filter(has_scraped_data, fetch_jobs(**opts)):
        try:
            # tag it to remember we processed it (ETL style)
            shub.update_tags(job, add=markers)

            info("processing job {}".format(job.key))
            for item in job.items.iter():
                if deduplicate_on:
                    key = item[deduplicate_on]
                    if key in seen_keys:
                        continue
                    if constraint_keys and key not in constraint_keys:
                        continue

                # individual items can be nested and store multiple types
                # so `transform` is a generic generator we consume here;
                # materialise it first so a failing item adds nothing
                raw_data.extend(list(transform(item)))

                if deduplicate_on:
                    # only remember the key once the item was fully exported
                    seen_keys.add(key)
        except Exception as e:
            fail('fetching jobs crashed: {}, going on'.format(e))
            # dumping the data we got so far
            # one might be able to resume execution
            # NOTE could dump data so far in output = '/tmp/' + session_id + '.' + output
            fails += 1

    success("done ({} exceptions), exporting data".format(fails))

    if not raw_data:
        fail("no data was fetched")
        return

    # TODO support sqlite
    # TODO support stdout
    # TODO support gdrive export - might be better to wait for another PR bringing
    #      kp-gdrive and its high-level interface
    output_format = output.split('.')[-1]
    exporters = {'csv': csv_export, 'jl': jl_export}
    exporter = exporters.get(output_format)
    if exporter is None:
        # do not leave an empty file behind for a format we cannot write
        fail("unsupported output format `{}`".format(output_format))
        return

    success('exporting data raw={} to {}'.format(len(raw_data), output))
    with open(output, 'w') as fd:
        exporter(raw_data, fd)
Example #9
0
def batch_schedule(project_id, spiders, root_conf, conf_files, dry_run, reset,
                   monitor):
    """Register several periodic jobs from a yaml conf.

    Every job of every configuration yielded by `walk_configs` is scheduled
    once per cron and per combination produced by `render_args` over its
    `dynamic_args`. Jobs are skipped when their spider was not requested,
    when they are flagged `disabled`, when no cron applies, or when their
    settings could not be pushed.

    Args:
        project_id: Scrapinghub project the jobs are created on.
        spiders: names of the spiders to schedule; when empty, every spider
            listed in the configurations is selected.
        root_conf: forwarded to `walk_configs` as `conf_root`.
        conf_files: forwarded to `walk_configs` as `paths`.
        dry_run: forwarded to `shub.reset_periodic_jobs` and
            `shub.create_periodic_job`; when truthy, no datadog monitor is
            created either.
        reset: when truthy, `shub.reset_periodic_jobs` is called for the
            selected spiders before scheduling.
        monitor: when truthy, a datadog monitor is created for each job
            holding a `monitoring` section.
    """
    datadog.initialize(api_key=os.getenv("DATADOG_API_KEY"),
                       app_key=os.getenv("DATADOG_APP_KEY"))

    # force yielding all the configs so we can merge spider names
    configs = list(
        walk_configs(conf_root=root_conf,
                     paths=conf_files,
                     blacklist=["settings.yml"]))
    spider_names = spiders or []
    if not spiders:
        # no explicit selection: schedule every spider found in the configs
        for conf in configs:
            spider_names.extend(
                [job_spec["spider"] for job_spec in conf["jobs"]])

    if reset:
        # be gentle with the api and batch delete the jobs upfront
        shub.reset_periodic_jobs(project_id, set(spider_names), dry_run)

    # TODO validate configuration (a separate command ?)
    for specs in configs:
        # init with global settings that apply for all
        g_tags = GLOBAL_JOB_TAGS + specs.get("global_tags", [])
        g_settings = specs.get('global_settings', {})
        d_crons = specs.get("default_crons", [])

        for job_spec in specs["jobs"]:
            # one can limit spiders from the command line
            if job_spec["spider"] not in spider_names:
                info('spider "{}" not required to schedule'.format(
                    job_spec["spider"]))
                continue

            if job_spec.get("disabled"):
                info('spider "{}" disabled by config'.format(
                    job_spec["spider"]))
                continue

            if not (job_spec.get("crons") or d_crons):
                fail('spider "{}" contains no cron'.format(job_spec["spider"]))
                continue

            # overwrite spider-level settings on scrapinghub
            if job_spec.get("settings") or g_settings:
                # use default settings if there exists any
                # NOTE(review): on key conflicts the global settings win over
                # the spider-level ones - confirm this precedence is intended
                job_spec["settings"] = {
                    **job_spec.get("settings", {}),
                    **g_settings
                }
                for key, value in job_spec["settings"].items():
                    if isinstance(value, dict):
                        # the only case is when using `secret: 'vault.decrypt("....")'`
                        # NOTE(review): `eval` executes the expression found
                        # in the configuration - configs must be trusted
                        job_spec["settings"][key] = eval(value['secret'])

                spider_id = shub.to_spider_id(project_id, job_spec["spider"])
                res = shub.update_spider_settings(project_id, spider_id,
                                                  job_spec["settings"])
                if not res.ok:
                    fail(
                        f'failed to update settings for spider "{job_spec["spider"]}"'
                    )
                    # skip since we don't want scheduled jobs to fail due to incorrect settings
                    continue

            # fill defaults
            # NOTE probably better done merging a hierarchy of dicts
            job_spec["priority"] = job_spec.get("priority", DEFAULT_PRIORITY)
            job_spec["tags"] = g_tags + job_spec.get("tags", [])
            job_spec["crons"] = job_spec.get("crons") or d_crons

            for combination in render_args(job_spec.get("dynamic_args", {})):
                # add static arguments for every combination generated
                # (static values override dynamic ones on key conflicts)
                combination.update(job_spec.get("args", {}))

                for cron_tpl in job_spec["crons"]:
                    cron = generate_cron(cron_tpl)
                    info(f"creating job on project {project_id}: {cron}")
                    job_id, err = shub.create_periodic_job(
                        job_spec["spider"],
                        project_id,
                        cron,
                        combination,
                        job_spec["tags"],
                        job_spec.get("description", DEFAULT_DESCRIPTION),
                        job_spec["priority"],
                        dry_run,
                    )
                    # a dry run is reported as a success whatever `job_id` is
                    if job_id or dry_run:
                        success(f"done: {job_id}")
                    else:
                        fail(f"failed to schedule job: {err}")

            if job_spec.get("monitoring") and monitor:
                # make sure monitors created here carry the `creator:bot` tag
                job_spec["monitoring"]["tags"] = job_spec["monitoring"].get(
                    "tags", [])
                if "creator:bot" not in job_spec["monitoring"]["tags"]:
                    job_spec["monitoring"]["tags"].append("creator:bot")

                # TODO and only if the command above worked fine
                # Create a new monitor (don't care about the dynamic nature of arguments)
                try:
                    info("creating new datadog monitor: {}".format(
                        job_spec["monitoring"]))
                    if not dry_run:
                        feedback = datadog.api.Monitor.create(
                            type="metric alert", **job_spec["monitoring"])
                        if feedback.get("errors"):
                            fail("failed to create monitor: {}".format(
                                feedback["errors"]))
                        else:
                            success("successfully created alert on {}".format(
                                feedback["created"]))
                except ValueError as e:
                    # usually error 403 forbidden that return an HTML page instead of json
                    fail("failed to create monitor: {}".format(e))