def flow(parameters, *_):
    files_dump_to_path = parameters['files_dump_to_path']
    data_dump_to_path = parameters.get('data_dump_to_path')

    def _download_gdrive_data():
        stats = defaultdict(int)
        file_sources = parameters['file_sources']
        folder_id = parameters['google_drive_csv_folder_id']
        files_dir = os.path.join(files_dump_to_path, "files")
        os.makedirs(files_dir, exist_ok=True)
        client = get_client()
        # load metadata of previously downloaded files, if any, to skip unchanged files
        existing_files = {}
        if os.path.exists(os.path.join(files_dump_to_path, "datapackage.json")):
            for row in Flow(load(os.path.join(files_dump_to_path, "datapackage.json"))).results()[0][0]:
                existing_files[row["name"]] = row
        for id, name, version in list_files(client, folder_id):
            source = file_sources.get(name)
            if source:
                assert name.endswith(".csv"), "only csv file sources are supported"
                stats['relevant_source_files'] += 1
                row = {"id": id, "name": name, "version": version, "source": source,
                       "resource_name": "%s__%s" % (source, stats['relevant_source_files'])}
                yield row
                if (
                    os.path.exists(os.path.join(files_dump_to_path, "files", name))
                    and name in existing_files
                    and existing_files[name]["id"] == id
                    and existing_files[name]["version"] == version
                ):
                    logging.info("existing file, will not redownload: %s" % name)
                else:
                    logging.info("downloading file: %s" % name)
                    get_file(client, id, os.path.join(files_dump_to_path, "files", name))
        if stats['relevant_source_files'] != len(file_sources):
            raise Exception("source files mismatch")

    # first flow: download the files and dump their metadata
    files_flow = Flow(
        _download_gdrive_data(),
        update_resource(-1, name="gdrive_data_files", path="gdrive_data_files.csv", **{"dpp:streaming": True}),
        dump_to_path(files_dump_to_path),
        printer()
    )
    # second flow: load each downloaded csv as a string-typed resource
    data_flow_args = []
    for file_row in files_flow.results()[0][0]:
        data_flow_args += [
            load(os.path.join(files_dump_to_path, "files", file_row["name"]),
                 strip=False, infer_strategy=load.INFER_STRINGS, deduplicate_headers=True,
                 cast_strategy=load.CAST_TO_STRINGS, on_error=ignore,
                 limit_rows=parameters.get("limit_rows"), encoding="utf-8"),
            update_resource(-1, name=file_row["resource_name"], path=file_row["name"], **{"dpp:streaming": True})
        ]
    if data_dump_to_path:
        data_flow_args += [
            dump_to_path(data_dump_to_path)
        ]
    return Flow(*data_flow_args)
def flow(parameters, *_):
    logging.info('Running COVID19-ISRAEL module %s' % parameters['module'])
    mtimes = {}
    sizes = {}
    hashes = {}
    for path in glob('../COVID19-ISRAEL/**', recursive=True):
        if os.path.isfile(path):
            mtimes[path] = os.path.getmtime(path)
            sizes[path] = os.path.getsize(path)
            hashes[path] = get_hash(path)
    if utils.subprocess_call_log(['python', '-u', '-m', parameters['module']],
                                 log_file=parameters.get('log_file'),
                                 cwd='../COVID19-ISRAEL') != 0:
        raise Exception('Failed to run COVID19-ISRAEL module %s' % parameters['module'])
    resource_name = parameters.get('resource_name', 'covid19_israel_updated_files')
    dump_to_path_name = parameters.get(
        'dump_to_path',
        'data/run_covid19_israel/last_updated_files/%s' % parameters['module'])
    printer_num_rows = parameters.get('printer_num_rows', 999)
    return Flow(
        get_updated_files(mtimes, sizes, hashes),
        update_resource(-1, name=resource_name, path='%s.csv' % resource_name, **{'dpp:streaming': True}),
        *([printer(num_rows=printer_num_rows)] if printer_num_rows > 0 else []),
        *([dump_to_path(dump_to_path_name)] if dump_to_path_name else []))
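# A minimal usage sketch, not part of the pipeline itself. The module name and paths
# below are hypothetical placeholders; real invocations are made by the
# preprocess_raw_data flow further down, which supplies its own module ids and paths.
#
#     flow({
#         'module': 'example.module_name',  # hypothetical, passed to `python -u -m <module>`
#         'resource_name': 'example_last_updated_files',
#         'dump_to_path': 'data/run_covid19_israel/last_updated_files/example',
#         'log_file': 'data/run_covid19_israel/log_files/example.log',
#     }).process()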
def flow(parameters, *_):
    logging.info('Pulling latest code from COVID19-ISRAEL github repo')
    logging.info('COVID19_ISRAEL_REPOSITORY=%s' % os.environ.get('COVID19_ISRAEL_REPOSITORY'))
    logging.info('pulling from origin/master')
    utils.subprocess_call_log(
        ['git', 'config', 'user.email', 'avid-covider-pipelines@localhost'],
        cwd='../COVID19-ISRAEL')
    utils.subprocess_call_log(
        ['git', 'config', 'user.name', 'avid-covider-pipelines'],
        cwd='../COVID19-ISRAEL')
    if utils.subprocess_call_log(['git', 'pull', 'origin', 'master'],
                                 cwd='../COVID19-ISRAEL') != 0:
        raise Exception('Failed to git pull')
    sha1 = subprocess.check_output(['git', 'rev-parse', 'HEAD'],
                                   cwd='../COVID19-ISRAEL').decode().strip()
    return Flow(
        iter([{'sha1': sha1}]),
        update_resource(-1, name='github_pull_covid19_israel',
                        path='github_pull_covid19_israel.csv',
                        **{'dpp:streaming': True}),
        printer(),
        dump_to_path(
            parameters.get('dump_to_path', 'data/github_pull_covid19_israel')))
def store_destination_output_package(destination_output, csv_temp_files):
    logging.info("Storing destination output package")
    os.makedirs(destination_output, exist_ok=True)
    logging.info("Writing to destination_output dir: " + destination_output)
    last_package = {}
    if os.path.exists(os.path.join(destination_output, "datapackage.json")):

        def _load_last_package(row):
            last_package[row['name']] = row
            yield row

        Flow(
            load(os.path.join(destination_output, "datapackage.json")),
            _load_last_package
        ).process()

    def _files_list():
        for temp_filepath, name in csv_temp_files.items():
            target_filepath = os.path.join(destination_output, name)
            shutil.move(temp_filepath, target_filepath)
            os.chmod(target_filepath, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH)
            size = os.path.getsize(target_filepath)
            hash = get_hash(target_filepath)
            last_row = last_package.get(name)
            if last_row and hash == last_row.get('hash') and size == last_row['size']:
                # file content (hash and size) is unchanged - keep the previous mtime
                mtime = last_row['mtime']
            else:
                mtime = datetime.datetime.fromtimestamp(os.path.getmtime(target_filepath))
            yield {"name": name, "size": size, "mtime": mtime, "hash": hash}

    Flow(
        _files_list(),
        update_resource(-1, name='files_list', path='files_list.csv'),
        dump_to_path(destination_output),
    ).process()
def save_cache(parameters, kv):
    if parameters.get("gps_datapackage_path"):
        logging.info('Saving cache to ' + parameters['gps_datapackage_path'])
        Flow(
            ({"k": k, "v": v} for k, v in kv.items()),
            update_resource(-1, name="gps_data", path="gps_data.csv", **{"dpp:streaming": True}),
            dump_to_path(parameters['gps_datapackage_path'])
        ).process()
def flow(parameters, *_):
    logging.info('Creating COVID19-ISRAEL files zip')
    os.makedirs(parameters["dump_to_path"], exist_ok=True)
    with zipfile.ZipFile(
            os.path.join(parameters["dump_to_path"], "covid19-israel-data.zip"),
            "w", zipfile.ZIP_LZMA) as zipf:
        Flow(
            zip_files(zipf),
            update_resource(-1, name='files_list', path='files_list.csv', **{"dpp:streaming": True}),
            dump_to_path(parameters['dump_to_path']),
        ).process()
    return Flow(
        load(os.path.join(parameters['dump_to_path'], "datapackage.json")),
        printer(num_rows=5))
def flow(parameters, *_):

    def _get_last_runs():
        runs_history_last_rows = {}
        for id, path in parameters["check_covid19_israel_id_paths"].items():

            def _process_runs_history(rows):
                for row in rows:
                    yield row
                runs_history_last_rows[id] = row

            Flow(load("%s/runs_history/datapackage.json" % path), _process_runs_history).process()
        for id, row in runs_history_last_rows.items():
            start_time = row["start_time"]
            end_time = datetime.datetime.strptime(row["end_time"], '%Y-%m-%dT%H:%M:%S')
            yield {
                "id": id,
                "github_sha1": row["github_sha1"],
                "error": row["error"],
                "start_time": start_time,
                "end_time": end_time,
                "duration_minutes": (end_time - start_time).total_seconds() / 60,
                "log_file": "https://avidcovider-pipelines-data.odata.org.il/data/%s/log_files/%s.log" % (
                    id, start_time.strftime("%Y%m%dT%H%M%S")),
            }

    def _check_last_runs(rows):
        has_errors = []
        for row in rows:
            yield row
            if row["error"] != "no":
                has_errors.append(row["id"])
        if len(has_errors) > 0:
            raise Exception("pipelines failed: %s" % has_errors)

    Flow(
        _get_last_runs(),
        update_resource(-1, name="last_runs", path="last_runs.csv", schema={
            "fields": [
                {"name": "id", "type": "string"},
                {"name": "github_sha1", "type": "string"},
                {"name": "error", "type": "string"},
                {"name": "start_time", "type": "datetime"},
                {"name": "end_time", "type": "datetime"},
                {"name": "duration_minutes", "type": "number"},
                {"name": "log_file", "type": "string"},
            ]
        }, **{"dpp:streaming": True}),
        printer(num_rows=9999),
        dump_to_path(parameters["output-dir"]),
    ).process()
    return Flow(load("%s/datapackage.json" % parameters["output-dir"]), _check_last_runs)
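# A sketch of the parameters this check flow expects, based only on how they are used
# above; the pipeline id and paths are illustrative placeholders.
#
#     parameters = {
#         # pipeline id -> data dir containing a runs_history datapackage
#         "check_covid19_israel_id_paths": {
#             "preprocess_raw_data": "data/preprocess_raw_data",
#         },
#         # where the aggregated last_runs datapackage is dumped
#         "output-dir": "data/check_covid19_israel",
#     }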
def flow(parameters, *_):
    logging.info('Pulling latest code from COVID19-ISRAEL github repo')
    logging.info('COVID19_ISRAEL_REPOSITORY=%s' % os.environ.get('COVID19_ISRAEL_REPOSITORY'))
    logging.info('COVID19_ISRAEL_BRANCH=%s' % os.environ.get('COVID19_ISRAEL_BRANCH'))
    if not os.environ.get('COVID19_ISRAEL_REPOSITORY'):
        logging.info('skipping pull because COVID19_ISRAEL_REPOSITORY env var is empty')
        logging.info('using env var COVID19_ISRAEL_SHA1 for the sha1')
        logging.info('COVID19_ISRAEL_SHA1=' + os.environ.get('COVID19_ISRAEL_SHA1', "_"))
        sha1 = os.environ.get('COVID19_ISRAEL_SHA1', "_")
    else:
        utils.subprocess_call_log(
            ['git', 'config', 'user.email', 'avid-covider-pipelines@localhost'],
            cwd='../COVID19-ISRAEL')
        utils.subprocess_call_log(
            ['git', 'config', 'user.name', 'avid-covider-pipelines'],
            cwd='../COVID19-ISRAEL')
        branch = os.environ.get('COVID19_ISRAEL_BRANCH')
        if branch:
            logging.info('Pulling from origin/' + branch)
            if utils.subprocess_call_log(['git', 'fetch', 'origin'],
                                         cwd='../COVID19-ISRAEL') != 0:
                raise Exception('Failed to fetch origin')
            if utils.subprocess_call_log(['git', 'checkout', branch],
                                         cwd='../COVID19-ISRAEL') != 0:
                raise Exception('Failed to switch branch')
            if utils.subprocess_call_log(['git', 'pull', 'origin', branch],
                                         cwd='../COVID19-ISRAEL') != 0:
                raise Exception('Failed to git pull')
        else:
            logging.info('pulling from origin/master')
            if utils.subprocess_call_log(['git', 'pull', 'origin', 'master'],
                                         cwd='../COVID19-ISRAEL') != 0:
                raise Exception('Failed to git pull')
        sha1 = subprocess.check_output(
            ['git', 'rev-parse', 'HEAD'], cwd='../COVID19-ISRAEL').decode().strip()
        # sha1 = subprocess.check_output(['cat', '/pipelines/data/fake-sha1'], cwd='../COVID19-ISRAEL').decode().strip()
    if parameters.get('change-run-covid'):
        # record the pulled sha1 as the first line of run_covid19_israel.py
        with open('avid_covider_pipelines/run_covid19_israel.py', 'r') as f:
            lines = f.readlines()
        with open('avid_covider_pipelines/run_covid19_israel.py', 'w') as f:
            for i, line in enumerate(lines):
                if i == 0:
                    if line.startswith('COVID19_ISRAEL_GITHUB_SHA1 = '):
                        # replace the existing sha1 assignment
                        line = 'COVID19_ISRAEL_GITHUB_SHA1 = "%s"\n' % sha1
                    else:
                        # prepend the sha1 assignment before the original first line
                        f.write('COVID19_ISRAEL_GITHUB_SHA1 = "%s"\n' % sha1)
                f.write(line)
    return Flow(
        iter([{'sha1': sha1}]),
        update_resource(-1, name='github_pull_covid19_israel',
                        path='github_pull_covid19_israel.csv',
                        **{'dpp:streaming': True}),
        printer(),
        dump_to_path(
            parameters.get('dump_to_path', 'data/github_pull_covid19_israel')))
def flow(parameters, *_):
    stats = defaultdict(int)
    kv = KVFile()
    last_id = None
    # resume from a previously dumped package, if available, to avoid re-reading old rows from the DB
    load_from = parameters.get("load_from", parameters.get('dump_to_path'))
    if load_from and os.path.exists(os.path.join(load_from, "datapackage.json")):
        logging.info("Loading from last load_from_db package: " + os.path.join(load_from, "datapackage.json"))
        row = None
        for resource in Flow(
                load(os.path.join(load_from, "datapackage.json"),
                     limit_rows=parameters.get("limit_rows"),
                     resources="db_data")).datastream().res_iter:
            for row in resource:
                stats['loaded from package'] += 1
                last_id = row['__id']
                kv.set("{:0>12}".format(last_id), row)
                if last_id % 10000 == 0:
                    logging.info("Loaded id: %s" % last_id)
        all_data_keys = set(row.keys()) if row else set()
    else:
        all_data_keys = set()
    logging.info('num rows loaded from package: %s' % stats['loaded from package'])
    engine = create_engine(
        "postgresql://{username}:{password}@{host}:5432/reports?sslmode=verify-ca&sslrootcert={sslrootcert}&sslcert={sslcert}&sslkey={sslkey}"
        .format(**config.db_settings))
    engine.update_execution_options(stream_results=True)
    if parameters.get("where"):
        logging.info("Loading from DB, with where clause: " + parameters["where"])
        where = " where " + parameters["where"]
    elif last_id:
        logging.info("Loading from DB, starting at id %s" % last_id)
        where = " where id > %s" % last_id
    else:
        logging.info("Loading all records from DB")
        where = ""
    # stream the remaining rows from the reports table into the kv store
    for id, created, data in engine.execute(
            "select id, created, data from reports%s order by id" % where):
        if parameters.get("filter_db_row_callback"):
            id, created, data = parameters["filter_db_row_callback"](id, created, data)
        if not data or not isinstance(data, dict):
            stats['invalid data'] += 1
            continue
        stats['loaded from db'] += 1
        last_id = id
        row = {
            "__id": id,
            "__created": created,
        }
        for k, v in data.items():
            all_data_keys.add(k)
            row[k] = v
        kv.set("{:0>12}".format(id), row)
        if id % 100000 == 0:
            logging.info("Loaded id: %s" % id)
        if parameters.get("limit_rows") and stats['loaded from db'] > parameters["limit_rows"]:
            break
    logging.info('DB rows with invalid data: %s' % stats['invalid data'])
    logging.info("last_id = %s" % last_id)
    logging.info('num rows loaded from db: %s' % stats['loaded from db'])

    def _yield_from_kv():
        # all data fields are serialized to JSON strings so the output schema stays uniform
        for _, row in kv.items():
            yield {
                "__id": row["__id"],
                "__created": row["__created"],
                **{k: json.dumps(row.get(k)) for k in all_data_keys if k not in ["__id", "__created"]}
            }

    flow_args = [
        _yield_from_kv(),
        update_resource(
            -1, name="db_data", path="db_data.csv",
            schema={"fields": [
                {"name": "__id", "type": "integer"},
                {"name": "__created", "type": "datetime"},
                *[{"name": k, "type": "string"} for k in all_data_keys if k not in ["__id", "__created"]]
            ]},
            **{"dpp:streaming": True}),
    ]
    if parameters.get("dump_to_path"):
        flow_args += [dump_to_path(parameters['dump_to_path'])]
    return Flow(*flow_args)
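# A minimal sketch of the optional filter_db_row_callback parameter (hypothetical
# example): it receives (id, created, data) for every DB row and returns the same
# tuple, optionally with data replaced; returning a falsy or non-dict data value makes
# the flow count the row as invalid and skip it.
def example_filter_db_row_callback(id, created, data):
    if not isinstance(data, dict) or not data:
        return id, created, None  # skipped and counted under stats['invalid data']
    return id, created, data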
def add_gps_coordinates(stats, kv, parameters):
    logging.info('adding gps coordinates')

    def _add_gps_coordinates(rows):
        logging.info("resource name = " + rows.res.name)
        if rows.res.name == "db_data":
            source = "db"
        else:
            # gdrive resources are named "<source>__<number>"
            source = rows.res.name.split("__")[0]
        fields = parameters["source_fields"][source]
        workplace_fields = parameters.get("workplace_source_fields", {}).get(source)
        if workplace_fields and source != "db":
            raise Exception("sorry, workplace_fields is only supported for db source")
        for row in rows:
            inputs = {}
            workplace_inputs = {}
            for k, v in row.items():
                input = fields.get(k.strip())
                if input and v and v.strip():
                    if input in inputs:
                        logging.warning("duplicate input %s, %s: %s" % (source, input, row))
                    elif source == "db":
                        # db values are JSON-encoded strings
                        inputs[input] = json.loads(v)
                    else:
                        inputs[input] = v
                if workplace_fields:
                    input = workplace_fields.get(k.strip())
                    if input and v and v.strip():
                        if input in workplace_inputs:
                            logging.warning("duplicate workplace_input %s, %s: %s" % (source, input, row))
                        elif source == "db":
                            workplace_inputs[input] = json.loads(v)
                        else:
                            workplace_inputs[input] = v
            lat, lng, accurate = get_coords(stats, kv, inputs,
                                            get_coords_callback=parameters.get("get-coords-callback"))
            if workplace_fields:
                workplace_lat, workplace_lng, workplace_accurate = get_coords(
                    stats, kv, workplace_inputs,
                    get_coords_callback=parameters.get("get-coords-callback"))
            yield {
                **row,
                "lat": str(lat),
                "lng": str(lng),
                **({"address_street_accurate": str(accurate)} if source == "db" else {}),
                **({
                    "workplace_lat": str(workplace_lat),
                    "workplace_lng": str(workplace_lng),
                    **({"workplace_street_accurate": str(workplace_accurate)} if source == "db" else {}),
                } if workplace_fields else {}),
            }
        logging.info(str(dict(stats)))

    flow_args = []
    if parameters.get('load_db_data'):
        flow_args += [
            load(os.path.join(parameters['load_db_data'], 'datapackage.json'))
        ]
    if parameters.get('load_gdrive_data'):
        flow_args += [
            load(os.path.join(parameters['load_gdrive_data'], 'datapackage.json'))
        ]
    flow_args += [
        add_field('lat', 'string', default="0"),
        add_field('lng', 'string', default="0"),
        add_field('address_street_accurate', 'string', default="0", resources="db_data"),
        add_field('workplace_lat', 'string', default="0", resources="db_data"),
        add_field('workplace_lng', 'string', default="0", resources="db_data"),
        add_field('workplace_street_accurate', 'string', default="0", resources="db_data"),
        _add_gps_coordinates,
    ]
    if parameters.get('dump_to_path'):
        flow_args += [
            dump_to_path(parameters['dump_to_path'])
        ]
    return Flow(*flow_args)
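# A sketch of the parameters shape expected by add_gps_coordinates. All field names,
# source names, and paths are hypothetical placeholders; only the structure is implied
# by the code above: the "db" mapping translates report-JSON keys (whose values are
# JSON-encoded strings) to geocoding inputs, and other keys match the gdrive resource
# prefix (resource_name.split("__")[0]).
#
#     parameters = {
#         "source_fields": {
#             "db": {"address_city": "city", "address_street": "street"},
#             "some_gdrive_source": {"City": "city", "Street": "street"},
#         },
#         # workplace geocoding is only supported for the "db" source
#         "workplace_source_fields": {
#             "db": {"work_city": "workplace_city", "work_street": "workplace_street"},
#         },
#         "load_db_data": "data/load_from_db",
#         "dump_to_path": "data/gps_data",
#     }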
def flow(*_):
    run_row = None
    last_run_row = Flow(
        load_if_exists('%s/last_run/datapackage.json' % OUTPUT_DIR, 'last_run', [{}])).results()[0][0][0]
    last_run_sha1 = last_run_row.get('COVID19-ISRAEL_github_sha1')
    last_run_time = last_run_row.get('start_time')
    if last_run_time and (datetime.datetime.now() - last_run_time).total_seconds() < 120:
        logging.info('last run was less than 120 seconds ago, not running')
    else:
        new_sha1 = github_pull_covid19_israel.flow({
            'dump_to_path': '%s/last_github_pull' % OUTPUT_DIR
        }).results()[0][0][0]['sha1']
        if last_run_time and (
                datetime.datetime.now() - last_run_time
        ).total_seconds() < 60 * 60 * 24 and last_run_sha1 == new_sha1:
            logging.info("No change detected in COVID19-ISRAEL GitHub, not running")
        else:
            run_row = {'start_time': datetime.datetime.now(), 'COVID19-ISRAEL_github_sha1': new_sha1}
            for module in RUN_MODULES:
                try:
                    os.makedirs('data/preprocess_raw_data/log_files/%s' % module['id'], exist_ok=True)
                    run_covid19_israel.flow({
                        'module': module['module'],
                        'resource_name': '%s_last_updated_files' % module['id'],
                        'dump_to_path': 'data/preprocess_raw_data/last_updated_files/%s' % module['id'],
                        'log_file': 'data/preprocess_raw_data/log_files/%s/%s.log' % (
                            module['id'], datetime.datetime.now().strftime('%Y%m%dT%H%M%S'))
                    }).process()
                    run_row['%s_success' % module['id']] = 'yes'
                except Exception:
                    logging.exception('failed to run %s' % module['id'])
                    run_row['%s_success' % module['id']] = 'no'
    if run_row is not None:
        Flow(
            iter([run_row]),
            update_resource(-1, name='last_run', path='last_run.csv', **{'dpp:streaming': True}),
            dump_to_path('%s/last_run' % OUTPUT_DIR)).process()

    def _get_runs_history():
        # previous runs history, if any, followed by the current run
        if os.path.exists('%s/runs_history/datapackage.json' % OUTPUT_DIR):
            for resource in Flow(
                    load('%s/runs_history/datapackage.json' % OUTPUT_DIR),
            ).datastream().res_iter:
                yield from resource
        if run_row is not None:
            yield run_row

    Flow(
        _get_runs_history(),
        update_resource(-1, name='runs_history', path='runs_history', **{'dpp:streaming': True}),
        dump_to_path('%s/runs_history' % OUTPUT_DIR)).process()
    return Flow(
        load('%s/runs_history/datapackage.json' % OUTPUT_DIR),
        sort_rows('{start_time}', reverse=True),
        printer(num_rows=10))
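# RUN_MODULES is assumed to be a module-level list of dicts, each with an 'id' (used
# for output and log directory names) and a 'module' (passed to run_covid19_israel).
# The entries below are hypothetical placeholders:
#
#     RUN_MODULES = [
#         {'id': 'example_module_a', 'module': 'example.module_a'},
#         {'id': 'example_module_b', 'module': 'example.module_b'},
#     ]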