Code Example #1
def flow(parameters, *_):
    files_dump_to_path = parameters['files_dump_to_path']
    data_dump_to_path = parameters.get('data_dump_to_path')

    def _download_gdrive_data():
        stats = defaultdict(int)
        file_sources = parameters['file_sources']
        folder_id = parameters['google_drive_csv_folder_id']
        files_dir = os.path.join(files_dump_to_path, "files")
        os.makedirs(files_dir, exist_ok=True)
        client = get_client()
        existing_files = {}
        if os.path.exists(os.path.join(files_dump_to_path, "datapackage.json")):
            for row in Flow(load(os.path.join(files_dump_to_path, "datapackage.json"))).results()[0][0]:
                existing_files[row["name"]] = row
        for id, name, version in list_files(client, folder_id):
            source = file_sources.get(name)
            if source:
                assert name.endswith(".csv"), "only csv file sources are supported"
                stats['relevant_source_files'] += 1
                row = {"id": id, "name": name, "version": version, "source": source, "resource_name": "%s__%s" % (source, stats['relevant_source_files'])}
                yield row
                if (
                        os.path.exists(os.path.join(files_dump_to_path, "files", name))
                        and name in existing_files and existing_files[name]["id"] == id and existing_files[name]["version"] == version
                ):
                    logging.info("existing file, will not redownload: %s" % name)
                else:
                    logging.info("downloading file: %s" % name)
                    get_file(client, id, os.path.join(files_dump_to_path, "files", name))
        if stats['relevant_source_files'] != len(file_sources):
            raise Exception("source files mismatch")

    files_flow = Flow(
        _download_gdrive_data(),
        update_resource(-1, name="gdrive_data_files", path="gdrive_data_files.csv", **{"dpp:streaming": True}),
        dump_to_path(files_dump_to_path),
        printer()
    )
    data_flow_args = []
    for file_row in files_flow.results()[0][0]:
        data_flow_args += [
            load(os.path.join(files_dump_to_path, "files", file_row["name"]),
                 strip=False, infer_strategy=load.INFER_STRINGS, deduplicate_headers=True,
                 cast_strategy=load.CAST_TO_STRINGS, on_error=ignore, limit_rows=parameters.get("limit_rows"),
                 encoding="utf-8"),
            update_resource(-1, name=file_row["resource_name"], path=file_row["name"], **{"dpp:streaming": True})
        ]
    if data_dump_to_path:
        data_flow_args += [
            dump_to_path(data_dump_to_path)
        ]
    return Flow(*data_flow_args)
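
A hypothetical invocation sketch (not part of the original source): the parameter keys mirror the lookups inside flow() above, while the folder ID, file names and paths are placeholders.

# Hypothetical usage sketch -- keys follow the parameters read by flow() above;
# the folder ID, file-to-source mapping and paths are placeholders.
params = {
    "files_dump_to_path": "data/gdrive_files_dump",     # placeholder path
    "data_dump_to_path": "data/gdrive_data_dump",       # placeholder path
    "google_drive_csv_folder_id": "FOLDER_ID",          # placeholder Drive folder id
    "file_sources": {"example.csv": "example_source"},  # placeholder name -> source mapping
    "limit_rows": None,
}
flow(params).process()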
Code Example #2
def flow(parameters, *_):
    logging.info('Running COVID19-ISRAEL module %s' % parameters['module'])
    mtimes = {}
    sizes = {}
    hashes = {}
    for path in glob('../COVID19-ISRAEL/**', recursive=True):
        if os.path.isfile(path):
            mtimes[path] = os.path.getmtime(path)
            sizes[path] = os.path.getsize(path)
            hashes[path] = get_hash(path)
    if utils.subprocess_call_log(['python', '-u', '-m', parameters['module']],
                                 log_file=parameters.get('log_file'),
                                 cwd='../COVID19-ISRAEL') != 0:
        raise Exception('Failed to run COVID19-ISRAEL module %s' %
                        parameters['module'])
    resource_name = parameters.get('resource_name',
                                   'covid19_israel_updated_files')
    dump_to_path_name = parameters.get(
        'dump_to_path',
        'data/run_covid19_israel/last_updated_files/%s' % parameters['module'])
    printer_num_rows = parameters.get('printer_num_rows', 999)
    return Flow(
        get_updated_files(mtimes, sizes, hashes),
        update_resource(-1,
                        name=resource_name,
                        path='%s.csv' % resource_name,
                        **{'dpp:streaming': True}),
        *([printer(num_rows=printer_num_rows)] if printer_num_rows > 0 else []),
        *([dump_to_path(dump_to_path_name)] if dump_to_path_name else []))
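
A hypothetical invocation sketch (not part of the original source): the module name and log file path are placeholders; the remaining keys fall back to the defaults computed inside flow() above.

# Hypothetical usage sketch -- 'module' and 'log_file' values are placeholders.
flow({
    'module': 'src.example_preprocessing_module',                 # placeholder module name
    'log_file': 'data/run_covid19_israel/log_files/example.log',  # placeholder log path
}).process()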
Code Example #3
def flow(parameters, *_):
    logging.info('Pulling latest code from COVID19-ISRAEL github repo')
    logging.info('COVID19_ISRAEL_REPOSITORY=%s' %
                 os.environ.get('COVID19_ISRAEL_REPOSITORY'))
    logging.info('pulling from origin/master')
    utils.subprocess_call_log(
        ['git', 'config', 'user.email', 'avid-covider-pipelines@localhost'],
        cwd='../COVID19-ISRAEL')
    utils.subprocess_call_log(
        ['git', 'config', 'user.name', 'avid-covider-pipelines'],
        cwd='../COVID19-ISRAEL')
    if utils.subprocess_call_log(['git', 'pull', 'origin', 'master'],
                                 cwd='../COVID19-ISRAEL') != 0:
        raise Exception('Failed to git pull')
    sha1 = subprocess.check_output(['git', 'rev-parse', 'HEAD'],
                                   cwd='../COVID19-ISRAEL').decode().strip()
    return Flow(
        iter([{'sha1': sha1}]),
        update_resource(-1,
                        name='github_pull_covid19_israel',
                        path='github_pull_covid19_israel.csv',
                        **{'dpp:streaming': True}),
        printer(),
        dump_to_path(
            parameters.get('dump_to_path', 'data/github_pull_covid19_israel')))
Code Example #4
def store_destination_output_package(destination_output, csv_temp_files):
    logging.info("Storing destination output package")
    os.makedirs(destination_output, exist_ok=True)
    logging.info("Writing to destination_output dir: " + destination_output)
    last_package = {}
    if os.path.exists(os.path.join(destination_output, "datapackage.json")):

        def _load_last_package(row):
            last_package[row['name']] = row
            yield row

        Flow(
            load(os.path.join(destination_output, "datapackage.json")),
            _load_last_package
        ).process()

    def _files_list():
        for temp_filepath, name in csv_temp_files.items():
            target_filepath = os.path.join(destination_output, name)
            shutil.move(temp_filepath, target_filepath)
            os.chmod(target_filepath, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH)
            size = os.path.getsize(target_filepath)
            hash = get_hash(target_filepath)
            last_row = last_package.get(name)
            if last_row and hash == last_row.get('hash') and size == last_row['size']:
                mtime = last_row['mtime']
            else:
                mtime = datetime.datetime.fromtimestamp(os.path.getmtime(target_filepath))
            yield {"name": name, "size": size, "mtime": mtime, "hash": hash}

    Flow(
        _files_list(),
        update_resource(-1, name='files_list', path='files_list.csv'),
        dump_to_path(destination_output),
    ).process()
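
A hypothetical usage sketch (not part of the original source): csv_temp_files maps each temporary CSV path to the file name it should receive inside destination_output; both values below are placeholders.

# Hypothetical usage sketch -- the temp path and target file name are placeholders.
csv_temp_files = {
    "/tmp/tmp_1234.csv": "example_output.csv",   # placeholder temp file -> target name
}
store_destination_output_package("data/destination_output", csv_temp_files)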
Code Example #5
def save_cache(parameters, kv):
    if parameters.get("gps_datapackage_path"):
        logging.info('Saving cache to ' + parameters['gps_datapackage_path'])
        Flow(
            ({"k": k, "v": v} for k, v in kv.items()),
            update_resource(-1, name="gps_data", path="gps_data.csv", **{"dpp:streaming": True}),
            dump_to_path(parameters['gps_datapackage_path'])
        ).process()
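
A hypothetical usage sketch (not part of the original source): kv only needs an items() method yielding (key, value) pairs, so a plain dict is enough here; the path and coordinates are placeholders.

# Hypothetical usage sketch -- a plain dict stands in for the kv store.
save_cache(
    {"gps_datapackage_path": "data/gps_cache"},     # placeholder output path
    {"example address, city": [32.0853, 34.7818]},  # placeholder cached entry
)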
Code Example #6
def flow(parameters, *_):
    logging.info('Creating COVID19-ISRAEL files zip')
    os.makedirs(parameters["dump_to_path"], exist_ok=True)
    with zipfile.ZipFile(
            os.path.join(parameters["dump_to_path"],
                         "covid19-israel-data.zip"), "w",
            zipfile.ZIP_LZMA) as zipf:
        Flow(
            zip_files(zipf),
            update_resource(-1,
                            name='files_list',
                            path='files_list.csv',
                            **{"dpp:streaming": True}),
            dump_to_path(parameters['dump_to_path']),
        ).process()
    return Flow(
        load(os.path.join(parameters['dump_to_path'], "datapackage.json")),
        printer(num_rows=5))
Code Example #7
def flow(parameters, *_):
    def _get_last_runs():
        runs_history_last_rows = {}
        for id, path in parameters["check_covid19_israel_id_paths"].items():

            def _process_runs_history(rows):
                for row in rows:
                    yield row
                    runs_history_last_rows[id] = row

            Flow(load("%s/runs_history/datapackage.json" % path),
                 _process_runs_history).process()
        for id, row in runs_history_last_rows.items():
            start_time = row["start_time"]
            end_time = datetime.datetime.strptime(row["end_time"],
                                                  '%Y-%m-%dT%H:%M:%S')
            yield {
                "id": id,
                "github_sha1": row["github_sha1"],
                "error": row["error"],
                "start_time": start_time,
                "end_time": end_time,
                "duration_minutes": (end_time - start_time).total_seconds() / 60,
                "log_file": "https://avidcovider-pipelines-data.odata.org.il/data/%s/log_files/%s.log" % (
                    id, start_time.strftime("%Y%m%dT%H%M%S")),
            }

    def _check_last_runs(rows):
        has_errors = []
        for row in rows:
            yield row
            if row["error"] != "no":
                has_errors.append(row["id"])
        if len(has_errors) > 0:
            raise Exception("pipelines failed: %s" % has_errors)

    Flow(
        _get_last_runs(),
        update_resource(-1,
                        name="last_runs",
                        path="last_runs.csv",
                        schema={
                            "fields": [
                                {"name": "id", "type": "string"},
                                {"name": "github_sha1", "type": "string"},
                                {"name": "error", "type": "string"},
                                {"name": "start_time", "type": "datetime"},
                                {"name": "end_time", "type": "datetime"},
                                {"name": "duration_minutes", "type": "number"},
                                {"name": "log_file", "type": "string"},
                            ]
                        },
                        **{"dpp:streaming": True}),
        printer(num_rows=9999),
        dump_to_path(parameters["output-dir"]),
    ).process()
    return Flow(load("%s/datapackage.json" % parameters["output-dir"]),
                _check_last_runs)
Code Example #8
def flow(parameters, *_):
    logging.info('Pulling latest code from COVID19-ISRAEL github repo')
    logging.info('COVID19_ISRAEL_REPOSITORY=%s' %
                 os.environ.get('COVID19_ISRAEL_REPOSITORY'))
    logging.info('COVID19_ISRAEL_BRANCH=%s' %
                 os.environ.get('COVID19_ISRAEL_BRANCH'))
    if not os.environ.get('COVID19_ISRAEL_REPOSITORY'):
        logging.info(
            'skipping pull because COVID19_ISRAEL_REPOSITORY env var is empty')
        logging.info('using env var COVID19_ISRAEL_SHA1 for the sha1')
        logging.info('COVID19_ISRAEL_SHA1=' +
                     os.environ.get('COVID19_ISRAEL_SHA1', "_"))
        sha1 = os.environ.get('COVID19_ISRAEL_SHA1', "_")
    else:
        utils.subprocess_call_log(
            ['git', 'config', 'user.email', 'avid-covider-pipelines@localhost'],
            cwd='../COVID19-ISRAEL')
        utils.subprocess_call_log(
            ['git', 'config', 'user.name', 'avid-covider-pipelines'],
            cwd='../COVID19-ISRAEL')
        branch = os.environ.get('COVID19_ISRAEL_BRANCH')
        if branch:
            logging.info('Pulling from origin/' + branch)
            if utils.subprocess_call_log(['git', 'fetch', 'origin'],
                                         cwd='../COVID19-ISRAEL') != 0:
                raise Exception('Failed to fetch origin')
            if utils.subprocess_call_log(['git', 'checkout', branch],
                                         cwd='../COVID19-ISRAEL') != 0:
                raise Exception('Failed to switch branch')
            if utils.subprocess_call_log(['git', 'pull', 'origin', branch],
                                         cwd='../COVID19-ISRAEL') != 0:
                raise Exception('Failed to git pull')
        else:
            logging.info('pulling from origin/master')
            if utils.subprocess_call_log(['git', 'pull', 'origin', 'master'],
                                         cwd='../COVID19-ISRAEL') != 0:
                raise Exception('Failed to git pull')
        sha1 = subprocess.check_output(
            ['git', 'rev-parse', 'HEAD'],
            cwd='../COVID19-ISRAEL').decode().strip()
    # sha1 = subprocess.check_output(['cat', '/pipelines/data/fake-sha1'], cwd='../COVID19-ISRAEL').decode().strip()
    if parameters.get('change-run-covid'):
        with open('avid_covider_pipelines/run_covid19_israel.py', 'r') as f:
            lines = f.readlines()
        with open('avid_covider_pipelines/run_covid19_israel.py', 'w') as f:
            for i, line in enumerate(lines):
                if i == 0:
                    if line.startswith('COVID19_ISRAEL_GITHUB_SHA1 = '):
                        line = 'COVID19_ISRAEL_GITHUB_SHA1 = "%s"\n' % sha1
                    else:
                        f.write('COVID19_ISRAEL_GITHUB_SHA1 = "%s"\n' % sha1)
                f.write(line)
    return Flow(
        iter([{'sha1': sha1}]),
        update_resource(-1,
                        name='github_pull_covid19_israel',
                        path='github_pull_covid19_israel.csv',
                        **{'dpp:streaming': True}),
        printer(),
        dump_to_path(
            parameters.get('dump_to_path', 'data/github_pull_covid19_israel')))
Code Example #9
def flow(parameters, *_):
    stats = defaultdict(int)
    kv = KVFile()
    last_id = None
    load_from = parameters.get("load_from", parameters.get('dump_to_path'))
    if load_from and os.path.exists(os.path.join(load_from,
                                                 "datapackage.json")):
        logging.info("Loading from last load_from_db package: " +
                     os.path.join(load_from, "datapackage.json"))
        row = None
        for resource in Flow(
                load(os.path.join(load_from, "datapackage.json"),
                     limit_rows=parameters.get("limit_rows"),
                     resources="db_data")).datastream().res_iter:
            for row in resource:
                stats['loaded from package'] += 1
                last_id = row['__id']
                kv.set("{:0>12}".format(last_id), row)
                if last_id % 10000 == 0:
                    logging.info("Loaded id: %s" % last_id)
        all_data_keys = set(row.keys()) if row else set()
    else:
        all_data_keys = set()
    logging.info('num rows loaded from package: %s' %
                 stats['loaded from package'])
    engine = create_engine(
        "postgresql://{username}:{password}@{host}:5432/reports?sslmode=verify-ca&sslrootcert={sslrootcert}&sslcert={sslcert}&sslkey={sslkey}"
        .format(**config.db_settings))
    engine.update_execution_options(stream_results=True)
    if parameters.get("where"):
        logging.info("Loading from DB, with where clause: " +
                     parameters["where"])
        where = " where " + parameters["where"]
    elif last_id:
        logging.info("Loading from DB, starting at id %s" % last_id)
        where = " where id > %s" % last_id
    else:
        logging.info("Loading all records from DB")
        where = ""
    for id, created, data in engine.execute(
            "select id, created, data from reports%s order by id" % where):
        if parameters.get("filter_db_row_callback"):
            id, created, data = parameters["filter_db_row_callback"](id,
                                                                     created,
                                                                     data)
        if not data or not isinstance(data, dict):
            stats['invalid data'] += 1
            continue
        stats['loaded from db'] += 1
        last_id = id
        row = {
            "__id": id,
            "__created": created,
        }
        for k, v in data.items():
            all_data_keys.add(k)
            row[k] = v
        kv.set("{:0>12}".format(id), row)
        if id % 100000 == 0:
            logging.info("Loaded id: %s" % id)
        if parameters.get("limit_rows") and stats[
                'loaded from db'] > parameters["limit_rows"]:
            break
    logging.info('DB rows with invalid data: %s' % stats['invalid data'])
    logging.info("last_id = %s" % last_id)
    logging.info('num rows loaded from db: %s' % stats['loaded from db'])

    def _yield_from_kv():
        for _, row in kv.items():
            yield {
                "__id": row["__id"],
                "__created": row["__created"],
                **{
                    k: json.dumps(row.get(k))
                    for k in all_data_keys if k not in ["__id", "__created"]
                }
            }

    flow_args = [
        _yield_from_kv(),
        update_resource(
            -1,
            name="db_data",
            path="db_data.csv",
            schema={
                "fields": [
                    {"name": "__id", "type": "integer"},
                    {"name": "__created", "type": "datetime"},
                    *[{"name": k, "type": "string"}
                      for k in all_data_keys if k not in ["__id", "__created"]],
                ]
            },
            **{"dpp:streaming": True}),
    ]
    if parameters.get("dump_to_path"):
        flow_args += [dump_to_path(parameters['dump_to_path'])]
    return Flow(*flow_args)
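
A hypothetical invocation sketch (not part of the original source): the where clause, row limit and dump path are placeholders; database credentials are taken from config.db_settings exactly as in the code above.

# Hypothetical usage sketch -- 'where', 'limit_rows' and 'dump_to_path' are placeholders.
flow({
    "dump_to_path": "data/load_from_db",   # placeholder output path
    "where": "created > '2020-01-01'",     # placeholder SQL filter on the reports table
    "limit_rows": 1000,                    # placeholder row limit
}).process()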
Code Example #10
def add_gps_coordinates(stats, kv, parameters):
    logging.info('adding gps coordinates')

    def _add_gps_coordinates(rows):
        logging.info("resource name = " + rows.res.name)
        if rows.res.name == "db_data":
            source = "db"
        else:
            source = rows.res.name.split("__")[0]
        fields = parameters["source_fields"][source]
        workplace_fields = parameters.get("workplace_source_fields", {}).get(source)
        if workplace_fields and source != "db":
            raise Exception("sorry, wokrplace_fields is only supported for db source")
        for row in rows:
            inputs = {}
            workplace_inputs = {}
            for k, v in row.items():
                input = fields.get(k.strip())
                if input and v and v.strip():
                    if input in inputs:
                        logging.warning("duplicate input %s, %s: %s" % (source, input, row))
                    elif source == "db":
                        inputs[input] = json.loads(v)
                    else:
                        inputs[input] = v
                if workplace_fields:
                    input = workplace_fields.get(k.strip())
                    if input and v and v.strip():
                        if input in workplace_inputs:
                            logging.warning("duplicate workplace_input %s, %s: %s" % (source, input, row))
                        elif source == "db":
                            workplace_inputs[input] = json.loads(v)
                        else:
                            workplace_inputs[input] = v
            lat, lng, accurate = get_coords(stats, kv, inputs, get_coords_callback=parameters.get("get-coords-callback"))
            if workplace_fields:
                workplace_lat, workplace_lng, workplace_accurate = get_coords(stats, kv, workplace_inputs, get_coords_callback=parameters.get("get-coords-callback"))
            yield {
                **row,
                "lat": str(lat),
                "lng": str(lng),
                **({"address_street_accurate": str(accurate)} if source == "db" else {}),
                **({
                    "workplace_lat": str(workplace_lat),
                    "workplace_lng": str(workplace_lng),
                    **({"workplace_street_accurate": str(workplace_accurate)} if source == "db" else {}),
                } if workplace_fields else {}),
            }
        logging.info(str(dict(stats)))

    flow_args = []
    if parameters.get('load_db_data'):
        flow_args += [
            load(os.path.join(parameters['load_db_data'], 'datapackage.json'))
        ]
    if parameters.get('load_gdrive_data'):
        flow_args += [
            load(os.path.join(parameters['load_gdrive_data'], 'datapackage.json'))
        ]
    flow_args += [
        add_field('lat', 'string', default="0"),
        add_field('lng', 'string', default="0"),
        add_field('address_street_accurate', 'string', default="0", resources="db_data"),
        add_field('workplace_lat', 'string', default="0", resources="db_data"),
        add_field('workplace_lng', 'string', default="0", resources="db_data"),
        add_field('workplace_street_accurate', 'string', default="0", resources="db_data"),
        _add_gps_coordinates,
    ]
    if parameters.get('dump_to_path'):
        flow_args += [
            dump_to_path(parameters['dump_to_path'])
        ]
    return Flow(*flow_args)
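
A hypothetical invocation sketch (not part of the original source): source_fields maps a source name to a mapping from raw column names to geocoding input names; the column names, input names and paths below are placeholders, and get_coords() determines which input names are actually honoured.

# Hypothetical usage sketch -- column names, input names and paths are placeholders.
stats = defaultdict(int)
kv = KVFile()
add_gps_coordinates(stats, kv, {
    "load_db_data": "data/load_from_db",              # placeholder package path (output of the DB load flow)
    "source_fields": {
        "db": {"address": "street", "city": "city"},  # placeholder column -> input mapping
    },
    "dump_to_path": "data/with_gps_coordinates",      # placeholder output path
}).process()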
Code Example #11
def flow(*_):
    run_row = None
    last_run_row = Flow(
        load_if_exists('%s/last_run/datapackage.json' % OUTPUT_DIR, 'last_run',
                       [{}])).results()[0][0][0]
    last_run_sha1 = last_run_row.get('COVID19-ISRAEL_github_sha1')
    last_run_time = last_run_row.get('start_time')
    if last_run_time and (datetime.datetime.now() -
                          last_run_time).total_seconds() < 120:
        logging.info('last run was less than 120 seconds ago, not running')
    else:
        new_sha1 = github_pull_covid19_israel.flow({
            'dump_to_path': '%s/last_github_pull' % OUTPUT_DIR
        }).results()[0][0][0]['sha1']
        if last_run_time and (
                datetime.datetime.now() - last_run_time
        ).total_seconds() < 60 * 60 * 24 and last_run_sha1 == new_sha1:
            logging.info(
                "No change detected in COVID19-ISRAEL GitHub, not running")
        else:
            run_row = {
                'start_time': datetime.datetime.now(),
                'COVID19-ISRAEL_github_sha1': new_sha1
            }
            for module in RUN_MODULES:
                try:
                    os.makedirs('data/preprocess_raw_data/log_files/%s' %
                                module['id'],
                                exist_ok=True)
                    run_covid19_israel.flow({
                        'module': module['module'],
                        'resource_name': '%s_last_updated_files' % module['id'],
                        'dump_to_path': 'data/preprocess_raw_data/last_updated_files/%s' % module['id'],
                        'log_file': 'data/preprocess_raw_data/log_files/%s/%s.log' % (
                            module['id'],
                            datetime.datetime.now().strftime('%Y%m%dT%H%M%S')),
                    }).process()
                    run_row['%s_success' % module['id']] = 'yes'
                except Exception:
                    logging.exception('failed to run %s' % module['id'])
                    run_row['%s_success' % module['id']] = 'no'

    if run_row is not None:
        Flow(
            iter([run_row]),
            update_resource(-1,
                            name='last_run',
                            path='last_run.csv',
                            **{'dpp:streaming': True}),
            dump_to_path('%s/last_run' % OUTPUT_DIR)).process()

    def _get_runs_history():
        if os.path.exists('%s/runs_history/datapackage.json' % OUTPUT_DIR):
            for resource in Flow(
                    load('%s/runs_history/datapackage.json' % OUTPUT_DIR)
            ).datastream().res_iter:
                yield from resource
        if run_row is not None:
            yield run_row

    Flow(
        _get_runs_history(),
        update_resource(-1,
                        name='runs_history',
                        path='runs_history.csv',
                        **{'dpp:streaming': True}),
        dump_to_path('%s/runs_history' % OUTPUT_DIR)).process()

    return Flow(load('%s/runs_history/datapackage.json' % OUTPUT_DIR),
                sort_rows('{start_time}', reverse=True), printer(num_rows=10))