Example #1
def convert_tables_to_json(csv_folder: Path,
                           output_folder: Path) -> Iterable[Path]:
    def try_json_covert(schema: Dict[str, str], csv_file: Path) -> Path:
        # JSON output defaults to same as the CSV file but with extension swapped
        json_output = output_folder / str(
            csv_file.relative_to(csv_folder)).replace(".csv", ".json")
        json_output.parent.mkdir(parents=True, exist_ok=True)

        # Converting to JSON is not critical and it may fail in some corner cases
        # As long as the "important" JSON files are created, this should be OK
        try:
            print(f"Converting {csv_file} to JSON")
            convert_csv_to_json_records(schema, csv_file, json_output)
            return json_output
        except Exception as exc:
            print(f"Unable to convert CSV file {csv_file} to JSON: ${exc}",
                  file=sys.stderr)
            traceback.print_exc()
            return None

    # Convert all CSV files to JSON using values format
    map_iter = list(csv_folder.glob("**/*.csv"))
    map_func = partial(try_json_covert, get_schema())
    for json_output in thread_map(map_func,
                                  map_iter,
                                  max_workers=2,
                                  desc="JSON conversion"):
        if json_output is not None:
            yield json_output
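
A minimal usage sketch for this generator, assuming get_schema and convert_csv_to_json_records are importable from the surrounding module; the folder paths below are hypothetical:

from pathlib import Path

# convert_tables_to_json is a generator, so it must be consumed for the conversion to run
for json_path in convert_tables_to_json(Path("output/tables"), Path("output/json")):
    print(f"Wrote {json_path}")
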
Example #2
    def test_table_records_reimport(self):
        with TemporaryDirectory() as workdir:
            workdir = Path(workdir)

            schema = {
                _safe_column_name(col): dtype
                for col, dtype in get_schema().items()
            }
            sqlite_file = workdir / "tmp.sqlite"
            tables_folder = SRC / "test" / "data"
            with create_sqlite_database(db_file=sqlite_file) as conn:
                for table_path in tables_folder.glob("*.csv"):
                    table_name = _safe_table_name(table_path.stem)
                    table_import_from_file(conn, table_path, schema=schema)

                    # Export the records to a list
                    records_output_1 = list(table_select_all(conn, table_name))

                    # Import the list of records
                    table_name_2 = table_name + "_new"
                    table_import_from_records(conn,
                                              table_name_2,
                                              records_output_1,
                                              schema=schema)

                    # Re-export the records as a list
                    records_output_2 = list(
                        table_select_all(conn, table_name_2))

                    for record1, record2 in zip(records_output_1,
                                                records_output_2):
                        self.assertDictEqual(record1, record2)
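
The round-trip test above can be exercised on its own with the standard unittest runner, assuming the method lives in a discoverable test module; the dotted name below is a placeholder, not taken from the original snippet:

import unittest

# Load and run the single test case by dotted name; adjust the path to the actual module and class
suite = unittest.defaultTestLoader.loadTestsFromName(
    "test.test_database.TestDatabase.test_table_records_reimport")
unittest.TextTestRunner().run(suite)
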
Example #3
def convert_tables_to_json(csv_folder: Path, output_folder: Path, **tqdm_kwargs) -> Iterable[Path]:

    # Convert all CSV files to JSON using values format
    map_iter = list(csv_folder.glob("**/*.csv"))
    map_opts = dict(total=len(map_iter), desc="Converting to JSON", **tqdm_kwargs)
    map_func = partial(_try_json_covert, get_schema(), csv_folder, output_folder)
    return list(pbar(map(map_func, map_iter), **map_opts))
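
Unlike Example #1, this variant converts the files sequentially and returns an eager list, forwarding any extra keyword arguments to the pbar progress wrapper. A usage sketch, assuming _try_json_covert, get_schema and pbar are defined in the surrounding module; the paths and the disable flag below are illustrative tqdm-style options:

from pathlib import Path

# Extra tqdm-style keyword arguments (e.g. disable) are passed through via **tqdm_kwargs
json_paths = convert_tables_to_json(
    Path("output/tables"), Path("output/json"), disable=True)
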
Example #4
def import_tables_into_sqlite(table_paths: List[Path],
                              output_path: Path) -> None:
    """
    Build a flat view of all tables combined, joined by <key> or <key, date>.

    Arguments:
        table_paths: List of CSV files to join into a single table.
        output_path: Output path for the resulting SQLite file.
    """
    # Import all tables into a database on disk at the provided path
    with create_sqlite_database(output_path) as conn:

        # Get a list of all tables indexed by <location_key> or by <location_key, date>
        schema = get_schema()
        for table_file_path in table_paths:
            table_name = table_file_path.stem
            _logger.log_info(f"Importing {table_name} into SQLite")
            table_columns = get_table_columns(table_file_path)
            table_schema = {col: schema.get(col, str) for col in table_columns}
            table_import_from_file(conn,
                                   table_file_path,
                                   table_name=table_name,
                                   schema=table_schema)
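
A minimal usage sketch, assuming the CSV tables to import already exist on disk; the paths below are hypothetical:

from pathlib import Path

# Import every CSV table found in the folder into a single SQLite database file
table_files = sorted(Path("output/tables").glob("*.csv"))
import_tables_into_sqlite(table_files, Path("output/database.sqlite"))
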
Example #5
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Callable, Dict, List

from pandas import DataFrame
from lib.constants import SRC
from lib.io import read_lines, read_table, export_csv
from lib.memory_efficient import (
    table_cross_product,
    table_join,
    table_group_tail,
    _convert_csv_to_json_records_fast,
    _convert_csv_to_json_records_slow,
)
from lib.pipeline_tools import get_schema
from lib.utils import agg_last_not_null, pbar
from .profiled_test_case import ProfiledTestCase

# Read the expected dtypes to ensure casting does not throw off test results
SCHEMA = get_schema()


class TestTableJoins(ProfiledTestCase):
    def _test_join_pair(
        self,
        read_table_: Callable,
        schema: Dict[str, str],
        left: Path,
        right: Path,
        on: List[str],
        how: str,
    ):
        with TemporaryDirectory() as workdir:
            workdir = Path(workdir)
            tmpfile = workdir / "tmpfile.csv"


def main():
    schema = get_schema()
    for table_name in tqdm(list(get_table_names())):
        table = fetch_table(table_name)
        table = table.sort_values([col for col in ("key", "date") if col in table.columns])
        export_csv(table, path=SRC / "test" / "data" / f"{table_name}.csv", schema=schema)
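
The main() snippet above regenerates the CSV fixtures under SRC / "test" / "data" but has no entry-point guard; if it is meant to run as a standalone script, a standard guard (an assumption, not part of the original snippet) would be:

if __name__ == "__main__":
    main()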