Example 1
    def test_table_init(self):
        tm = read_table_json("example/meta_data/db1/teams.json")

        self.assertIsNone(tm.database)

        gtd = tm.glue_table_definition("full_db_path")
        self.assertEqual(
            gtd["StorageDescriptor"]["Location"], "full_db_path/teams/")
Example 2
    def test_add_remove_table(self):
        db = read_database_folder('example/meta_data/db1/')
        self.assertRaises(ValueError, db.remove_table, 'not_a_table')
        db.remove_table('employees')
        tns = db.table_names
        self.assertEqual(set(tns), set(['teams', 'pay']))

        emp_table = read_table_json('example/meta_data/db1/employees.json')
        db.add_table(emp_table)
        t = all(t in ['teams', 'employees', 'pay'] for t in db.table_names)
        self.assertTrue(t)

        self.assertRaises(ValueError, db.add_table, 'not a table obj')
        self.assertRaises(ValueError, db.add_table, emp_table)
Example 3
    def test_add_remove_table(self):
        db = read_database_folder("example/meta_data/db1/")
        self.assertRaises(ValueError, db.remove_table, "not_a_table")
        db.remove_table("employees")
        tns = db.table_names
        self.assertEqual(set(tns), set(["teams", "pay"]))

        emp_table = read_table_json("example/meta_data/db1/employees.json")
        db.add_table(emp_table)
        t = all(t in ["teams", "employees", "pay"] for t in db.table_names)
        self.assertTrue(t)

        self.assertRaises(ValueError, db.add_table, "not a table obj")
        self.assertRaises(ValueError, db.add_table, emp_table)
Example 4
import pyarrow as pa
from pyarrow import csv, fs
import pyarrow.parquet as pq

# read_table_json, convert_meta_col_to_arrow_tuple and REGION are assumed to be
# defined or imported elsewhere in this module.


def read_csv_write_to_parquet(local_data_path, s3_path, local_meta_path):
    # pyarrow's S3FileSystem expects "bucket/key" without the scheme
    if s3_path.startswith("s3://"):
        s3_path = s3_path.replace("s3://", "", 1)

    local = fs.LocalFileSystem()
    s3 = fs.S3FileSystem(region=REGION)

    # Read the local CSV into an Arrow table
    with local.open_input_stream(local_data_path) as f:
        tab = csv.read_csv(f)

    # Build an Arrow schema from the table metadata, skipping partition columns
    metadata = read_table_json(local_meta_path)
    arrow_cols = []
    for col in metadata.columns:
        if col["name"] not in metadata.partitions:
            arrow_cols.append(convert_meta_col_to_arrow_tuple(col))

    s = pa.schema(arrow_cols)
    tab = tab.cast(s)

    # Write the cast table straight to S3 as parquet
    with s3.open_output_stream(s3_path) as f:
        pq.write_table(tab, f)
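A minimal usage sketch for the function above; the paths and bucket name are hypothetical placeholders, and REGION must already be defined in the module.

# Hypothetical call, assuming REGION is set (e.g. "eu-west-1") and the meta
# JSON matches the CSV's columns.
read_csv_write_to_parquet(
    local_data_path="data/teams.csv",
    s3_path="s3://example-bucket/db1/teams/teams.parquet",
    local_meta_path="example/meta_data/db1/teams.json",
)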
Example 5
    def create_glue_database(self):
        """Creates glue database"""
        # Create database based on db_schema
        db = DatabaseMeta(**self.db_schema)
        for table_name, data_paths in self.meta_and_files.items():
            tm = read_table_json(data_paths["meta_path"], database=db)
            tm.data_format = "parquet"
            if tm.partitions:
                raise AttributeError("Automated lookup tables can only be "
                                     "partitioned by their GitHub release")
            # Add a release column as the first file partition to every table
            tm.add_column(
                name="release",
                type="character",
                description="github release tag of this lookup",
            )
            tm.partitions = ["release"]
            db.add_table(tm)

        db.create_glue_database(delete_if_exists=True)
        db.refresh_all_table_partitions()
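A minimal sketch of the inputs the method above expects, inferred from its body; the DatabaseMeta keyword names and the exact contents of meta_and_files are assumptions, not the library's documented interface.

# Hypothetical shapes only: db_schema is unpacked into DatabaseMeta(**...),
# and meta_and_files maps table name -> paths, of which only "meta_path"
# is read by create_glue_database above.
db_schema = {
    "name": "lookups_db",        # assumed DatabaseMeta kwargs
    "bucket": "example-bucket",
    "base_folder": "lookups/",
}
meta_and_files = {
    "my_lookup": {"meta_path": "meta_data/my_lookup.json"},
}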
Example 6
    def test_glue_specific_table(self):
        t = read_table_json("example/meta_data/db1/pay.json")
        glue_def = t.glue_table_definition("db_path")
        self.assertEqual(
            glue_def["Parameters"]["skip.header.line.count"], "1")