# Copy all output files to the V2 folder
print("Copying files to public folder...")
for output_file in (ROOT / "output" / "tables").glob("*.csv"):
    shutil.copy(output_file, v2_folder / output_file.name)

# Merge all output files into a single master table
print("Creating master table...")
master = read_file(v2_folder / "index.csv")
for output_file in v2_folder.glob("*.csv"):
    if output_file.name not in ("index.csv", "master.csv"):
        master = master.merge(read_file(output_file, low_memory=False), how="left")

# Drop rows without a single dated record
export_csv(master.dropna(subset=["date"]), v2_folder / "master.csv")

# Create subsets with the last 30, 14 and 7 days of data
print("Creating last N days subsets...")
for n_days in (30, 14, 7):
    n_days_folder = v2_folder / str(n_days)
    n_days_folder.mkdir(exist_ok=True)
    for csv_file in v2_folder.glob("*.csv"):
        data = read_file(csv_file, low_memory=False)
        export_csv(subset_last_days(data, n_days), n_days_folder / csv_file.name)

# Create a subset with the latest known day of data for each key
print("Creating the latest subset...")
latest_folder = v2_folder / "latest"
latest_folder.mkdir(exist_ok=True)
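# The block above relies on a `subset_last_days` helper that is defined elsewhere. Below is a
# minimal sketch of what such a helper could look like, assuming `data` carries a string "date"
# column in ISO format; the real implementation may differ.
import datetime

from pandas import DataFrame


def subset_last_days(data: DataFrame, n_days: int) -> DataFrame:
    """Return only the rows of `data` within `n_days` of its most recent date."""
    # Rows without a date (e.g. static metadata) cannot be subset by date
    dated = data.dropna(subset=["date"])
    # ISO-formatted date strings sort lexicographically, so max() yields the latest date
    last_date = datetime.date.fromisoformat(dated["date"].max())
    first_date = (last_date - datetime.timedelta(days=n_days)).isoformat()
    return dated[dated["date"] > first_date]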
profiler.enable()

# A pipeline chain is any subfolder not starting with "_" in the pipelines folder
all_pipeline_chains = []
for item in (ROOT / "src" / "pipelines").iterdir():
    if not item.name.startswith("_") and not item.is_file():
        all_pipeline_chains.append(item.name)

# Run all the pipelines and place their outputs into the output folder
# The output name for each pipeline chain will be the name of the directory that the chain is in
for pipeline_name in all_pipeline_chains:
    table_name = pipeline_name.replace("_", "-")
    if args.only and table_name not in args.only.split(","):
        continue
    if args.exclude and table_name in args.exclude.split(","):
        continue
    pipeline_chain = DataPipeline.load(pipeline_name)
    show_progress = not args.no_progress
    pipeline_output = pipeline_chain.run(
        pipeline_name,
        verify=args.verify,
        process_count=args.process_count,
        progress=show_progress,
    )
    export_csv(pipeline_output, ROOT / "output" / "tables" / f"{table_name}.csv")

if args.profile:
    stats = Stats(profiler)
    stats.strip_dirs()
    stats.sort_stats("cumtime")
    stats.print_stats(20)
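# The block above references `args` and `profiler` objects that are set up elsewhere in the
# script. A minimal sketch of the kind of setup it assumes is shown below, with flag names
# derived from the attributes used above; the actual argument parsing may differ.
import argparse
from cProfile import Profile
from multiprocessing import cpu_count
from pstats import Stats

parser = argparse.ArgumentParser()
parser.add_argument("--only", type=str, default=None)
parser.add_argument("--exclude", type=str, default=None)
parser.add_argument("--verify", type=str, default=None)
parser.add_argument("--profile", action="store_true")
parser.add_argument("--no-progress", action="store_true")
parser.add_argument("--process-count", type=int, default=cpu_count())
args = parser.parse_args()

# Since profiling statistics are only printed when --profile is passed, the profiler could
# equally be created and enabled conditionally rather than unconditionally as above
profiler = Profile()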
def main(
    output_folder: Path,
    verify: str = None,
    only: List[str] = None,
    exclude: List[str] = None,
    process_count: int = cpu_count(),
    show_progress: bool = True,
) -> None:
    """
    Executes the data pipelines and places all outputs into `output_folder`. This is typically
    followed by publishing of the contents of the output folder to a server.

    Args:
        output_folder: Root folder where snapshot, intermediate and tables will be placed.
        verify: Run anomaly detection on the outputs using this strategy. Value must be one of:
            - None: (default) perform no anomaly detection
            - "simple": perform only fast anomaly detection
            - "full": perform exhaustive anomaly detection (can be very slow)
        only: If provided, only pipelines with a name appearing in this list will be run.
        exclude: If provided, pipelines with a name appearing in this list will not be run.
        process_count: Maximum number of processes to use during the data pipeline execution.
        show_progress: Display progress for the execution of individual DataSources within this
            pipeline.
    """
    assert not (
        only is not None and exclude is not None
    ), "--only and --exclude options cannot be used simultaneously"

    # Ensure that there is an output folder to put the data in
    (output_folder / "snapshot").mkdir(parents=True, exist_ok=True)
    (output_folder / "intermediate").mkdir(parents=True, exist_ok=True)
    (output_folder / "tables").mkdir(parents=True, exist_ok=True)

    # A pipeline chain is any subfolder not starting with "_" in the pipelines folder
    all_pipeline_names = []
    for item in (ROOT / "src" / "pipelines").iterdir():
        if not item.name.startswith("_") and not item.is_file():
            all_pipeline_names.append(item.name)

    # Verify that all of the provided pipeline names exist as pipelines
    for pipeline_name in (only or []) + (exclude or []):
        module_name = pipeline_name.replace("-", "_")
        assert module_name in all_pipeline_names, f'"{pipeline_name}" pipeline does not exist'

    # Run all the pipelines and place their outputs into the output folder
    # The output name for each pipeline chain will be the name of the directory that the chain is in
    for pipeline_name in all_pipeline_names:
        table_name = pipeline_name.replace("_", "-")

        # Skip if `exclude` was provided and this table is in it
        if exclude is not None and table_name in exclude:
            continue

        # Skip if `only` was provided and this table is not in it
        if only is not None and table_name not in only:
            continue

        data_pipeline = DataPipeline.load(pipeline_name)
        pipeline_output = data_pipeline.run(
            pipeline_name,
            output_folder,
            verify=verify,
            process_count=process_count,
            progress=show_progress,
        )
        export_csv(pipeline_output, output_folder / "tables" / f"{table_name}.csv")
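# Example invocation of `main` as documented above, running only a couple of pipelines with
# anomaly detection disabled. The pipeline names are illustrative; any subfolder of
# src/pipelines (with "_" replaced by "-") works the same way, and `ROOT` is assumed to point
# at the repository root as in the function body.
if __name__ == "__main__":
    main(
        output_folder=ROOT / "output",
        verify=None,
        only=["index", "epidemiology"],
        process_count=cpu_count(),
        show_progress=True,
    )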
def main(output_folder: Path, tables_folder: Path, show_progress: bool = True) -> None:
    """
    This script takes the processed outputs located in `tables_folder` and publishes them into
    the output folder by performing the following operations:

        1. Copy all the tables as-is from `tables_folder` to `output_folder`
        2. Produce a main table, created by iteratively performing left outer joins on all other
           tables (with a few exceptions)
        3. Create different slices of data, such as the latest known record for each region,
           files for the last N days of data, files for each individual region
    """
    # TODO: respect disable progress flag
    disable_progress = not show_progress

    # Wipe the output folder first
    for item in output_folder.glob("*"):
        if item.name.startswith("."):
            continue
        if item.is_file():
            item.unlink()
        else:
            shutil.rmtree(item)

    # Create the folder which will be published using a stable schema
    v2_folder = output_folder / "v2"
    v2_folder.mkdir(exist_ok=True, parents=True)

    # Copy all output files to the V2 folder
    for output_file in tqdm([*tables_folder.glob("*.csv")], desc="Copy tables"):
        shutil.copy(output_file, v2_folder / output_file.name)

    # Merge all output files into a single table
    main_table = read_file(v2_folder / "index.csv")

    # Add a date to each region from index to allow iterative left joins
    max_date = (datetime.datetime.now() + datetime.timedelta(days=7)).date().isoformat()
    date_list = [date.date().isoformat() for date in date_range("2020-01-01", max_date)]
    date_table = DataFrame(date_list, columns=["date"], dtype=str)
    main_table = table_cross_product(main_table, date_table)

    # Some tables are not included in the main table
    exclude_from_main_table = (
        "main.csv",
        "index.csv",
        "worldbank.csv",
        "worldpop.csv",
        "by-age.csv",
        "by-sex.csv",
    )

    non_dated_columns = set(main_table.columns)
    for output_file in tqdm([*v2_folder.glob("*.csv")], desc="Main table"):
        if output_file.name not in exclude_from_main_table:
            # Load the table and perform left outer join
            table = read_file(output_file, low_memory=False)
            main_table = main_table.merge(table, how="left")
            # Keep track of columns which are not indexed by date
            if "date" not in table.columns:
                non_dated_columns = non_dated_columns | set(table.columns)

    # There can only be one record per <key, date> pair
    main_table = main_table.groupby(["key", "date"]).first().reset_index()

    # Drop rows with null date or without a single dated record
    main_table = drop_na_records(main_table.dropna(subset=["date"]), non_dated_columns)
    export_csv(main_table, v2_folder / "main.csv")

    # Create subsets with the last 30, 14 and 7 days of data
    map_func = partial(subset_last_days, v2_folder)
    for _ in thread_map(map_func, (30, 14, 7), desc="Last N days subsets"):
        pass

    # Create a subset with the latest known day of data for each key
    map_func = partial(subset_latest, v2_folder)
    for _ in thread_map(map_func, [*v2_folder.glob("*.csv")], desc="Latest subset"):
        pass

    # Create subsets with each known key
    main_indexed = main_table.set_index("key")
    map_func = partial(subset_grouped_key, main_indexed, v2_folder)
    for _ in thread_map(map_func, main_indexed.index.unique(), desc="Grouped key subsets"):
        pass

    # Convert all CSV files to JSON using values format
    map_func = export_json_without_index
    for _ in thread_map(map_func, [*v2_folder.glob("**/*.csv")], desc="JSON conversion"):
        pass
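# `table_cross_product` above pairs every <key> in the index with every date in the date table.
# A minimal in-memory sketch using pandas is shown below; the real helper may be implemented
# differently.
from pandas import DataFrame


def table_cross_product(left: DataFrame, right: DataFrame) -> DataFrame:
    """Return the cartesian product of the rows of `left` and `right`."""
    # Merging on a constant dummy column yields every combination of rows; drop it afterwards
    dummy_column = "_cross_product_key"
    left = left.copy()
    right = right.copy()
    left[dummy_column] = 1
    right[dummy_column] = 1
    return left.merge(right, on=[dummy_column]).drop(columns=[dummy_column])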
def make_main_table(
    tables_folder: Path, output_path: Path, logger: ErrorLogger = ErrorLogger()
) -> None:
    """
    Build a flat view of all tables combined, joined by <key> or <key, date>, and write it to
    `output_path`.

    Arguments:
        tables_folder: Input folder where all CSV files exist.
        output_path: Output path for the resulting main table.
        logger: ErrorLogger instance used to report progress.
    """
    # Use a temporary directory for intermediate files
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        # Merge all output files into a single table
        keys_table_path = workdir / "keys.csv"
        keys_table = read_file(tables_folder / "index.csv", usecols=["key"])
        export_csv(keys_table, keys_table_path)
        logger.log_info("Created keys table")

        # Add a date to each region from index to allow iterative left joins
        max_date = (datetime.datetime.now() + datetime.timedelta(days=1)).date().isoformat()
        date_list = date_range("2020-01-01", max_date)
        date_table_path = workdir / "dates.csv"
        export_csv(DataFrame(date_list, columns=["date"]), date_table_path)
        logger.log_info("Created dates table")

        # Create a temporary working table file which can be used during the steps
        temp_file_path = workdir / "main.tmp.csv"
        table_cross_product(keys_table_path, date_table_path, temp_file_path)
        logger.log_info("Created cross product table")

        # Add all the index columns to seed the main table
        main_table_path = workdir / "main.csv"
        table_join(
            temp_file_path, tables_folder / "index.csv", ["key"], main_table_path, how="outer"
        )
        logger.log_info("Joined with table index")

        non_dated_columns = set(get_table_columns(main_table_path))
        for table_file_path in pbar([*tables_folder.glob("*.csv")], desc="Make main table"):
            table_name = table_file_path.stem
            if table_name not in EXCLUDE_FROM_MAIN_TABLE:

                table_columns = get_table_columns(table_file_path)
                if "date" in table_columns:
                    join_on = ["key", "date"]
                else:
                    join_on = ["key"]

                    # Keep track of columns which are not indexed by date
                    non_dated_columns = non_dated_columns | set(table_columns)

                # Iteratively perform left outer joins on all tables
                table_join(main_table_path, table_file_path, join_on, temp_file_path, how="outer")
                shutil.move(temp_file_path, main_table_path)
                logger.log_info(f"Joined with table {table_name}")

        # Drop rows with null date or without a single dated record
        # TODO: figure out a memory-efficient way to do this

        # Ensure that the table is appropriately sorted and write to output location
        table_sort(main_table_path, output_path)
        logger.log_info("Sorted main table")
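# The file-based helpers used above (table_cross_product, table_join, table_sort) operate on
# CSV paths rather than in-memory DataFrames so the main table can be built without loading
# everything at once. A minimal sketch of the cross product helper using the standard csv
# module is shown below; the real implementations may differ.
import csv
from pathlib import Path


def table_cross_product(left_path: Path, right_path: Path, output_path: Path) -> None:
    """Write the cartesian product of the rows of two CSV files to `output_path`."""
    # The right-hand table (dates) is small enough to hold in memory
    with open(right_path, newline="") as fd:
        right_reader = csv.reader(fd)
        right_header = next(right_reader)
        right_rows = list(right_reader)

    # Stream the left-hand table one row at a time to keep memory usage bounded
    with open(left_path, newline="") as fd_in, open(output_path, "w", newline="") as fd_out:
        left_reader = csv.reader(fd_in)
        writer = csv.writer(fd_out)
        writer.writerow(next(left_reader) + right_header)
        for left_row in left_reader:
            for right_row in right_rows:
                writer.writerow(left_row + right_row)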
def _test_table_merge(self, how_mem: str, how_pandas: str):
    test_data_1 = DataFrame.from_records(
        [
            {"col1": "a", "col2": "1"},
            {"col1": "a", "col2": "2"},
            {"col1": "b", "col2": "3"},
            {"col1": "b", "col2": "4"},
            {"col1": "c", "col2": "5"},
            {"col1": "c", "col2": "6"},
        ]
    )

    test_data_2 = DataFrame.from_records(
        [
            {"col1": "a", "col3": "foo"},
            {"col1": "b", "col3": "bar"},
            {"col1": "c", "col3": "baz"},
        ]
    )

    test_data_3 = DataFrame.from_records(
        [
            {"col1": "a", "col4": "apple"},
            {"col1": "b", "col4": "banana"},
            {"col1": "c", "col4": "orange"},
        ]
    )

    with temporary_directory() as workdir:

        test_file_1 = workdir / "test.1.csv"
        test_file_2 = workdir / "test.2.csv"
        test_file_3 = workdir / "test.3.csv"

        export_csv(test_data_1, test_file_1)
        export_csv(test_data_2, test_file_2)
        export_csv(test_data_3, test_file_3)

        output_file_1 = workdir / "output.1.csv"
        output_file_2 = workdir / "output.2.csv"

        expected = table_merge_pandas(
            [test_data_1, test_data_2, test_data_3], on=["col1"], how=how_pandas
        )
        export_csv(expected, path=output_file_1)

        table_merge_mem(
            [test_file_1, test_file_2, test_file_3], output_file_2, on=["col1"], how=how_mem
        )

        _compare_tables_equal(self, output_file_1, output_file_2)
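# The helper above is parametrized so the memory-efficient merge can be compared against the
# pandas reference for different join strategies. Example test methods calling it are sketched
# below, assuming both implementations accept the same "inner" and "outer" strategy names; the
# actual mapping between `how_mem` and `how_pandas` values may differ.
def test_table_merge_inner(self):
    self._test_table_merge(how_mem="inner", how_pandas="inner")


def test_table_merge_outer(self):
    self._test_table_merge(how_mem="outer", how_pandas="outer")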