Example 1
    def test_strip_characters_from_column_name(self):
        """Test that column names are converted properly."""
        bad_str = r"column\one:two-three four,five/six_seven"
        expected = "column_one_two_three_four_five_six_seven"

        result = common_utils.strip_characters_from_column_name(bad_str)
        self.assertEqual(result, expected)
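The test pins down the contract: backslashes, colons, hyphens, spaces, commas, and slashes must all become underscores. A minimal sketch of strip_characters_from_column_name consistent with that expectation follows (an assumption for illustration; the real implementation lives in common_utils and may differ):

    import re

    def strip_characters_from_column_name(column_name):
        """Replace characters that are invalid in Parquet/Presto column names with underscores."""
        # The character class mirrors the test: \ : - whitespace , and / each become "_".
        return re.sub(r"[\\:\-\s,/]", "_", column_name)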
Example 2
    def _generate_create_table_sql(self):
        """Generate SQL to create table."""
        parquet_columns = self._generate_column_list()
        s3_path = f"{settings.S3_BUCKET_NAME}/{self._s3_path}"

        sql = f"CREATE TABLE IF NOT EXISTS {self._schema_name}.{self._table_name} ("

        for idx, col in enumerate(parquet_columns):
            norm_col = strip_characters_from_column_name(col)
            if norm_col in self._column_types["numeric_columns"]:
                col_type = "double"
            elif norm_col in self._column_types["date_columns"]:
                col_type = "timestamp"
            elif norm_col in self._column_types["boolean_columns"]:
                col_type = "boolean"
            else:
                col_type = "varchar"

            sql += f"{norm_col} {col_type}"
            if idx < (len(parquet_columns) - 1):
                sql += ","
        sql += ",source varchar, year varchar, month varchar"

        sql += (
            f") WITH(external_location = 's3a://{s3_path}', format = 'PARQUET',"
            " partitioned_by=ARRAY['source', 'year', 'month'])")
        LOG.info(f"Create Parquet Table SQL: {sql}")
        return sql
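To make the result concrete, suppose hypothetically that _generate_column_list() returned two columns, the first classified as numeric and the second as a date (the schema, table, bucket, and column names below are all invented for illustration). The method would emit a single-line statement like this, wrapped here for readability:

    CREATE TABLE IF NOT EXISTS acct10001.aws_data (lineitem_usageamount double,
    bill_billingperiodstart timestamp,source varchar, year varchar, month varchar)
    WITH(external_location = 's3a://my-bucket/parquet/aws', format = 'PARQUET',
    partitioned_by=ARRAY['source', 'year', 'month'])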
Example 3
def aws_post_processor(data_frame):
    """
    Consume the AWS data and add a column creating a dictionary for the aws tags
    """

    def scrub_resource_col_name(res_col_name):
        return res_col_name.replace("resourceTags/user:", "")

    columns = set(list(data_frame))
    columns = set(PRESTO_REQUIRED_COLUMNS).union(columns)
    columns = sorted(list(columns))

    resource_tag_columns = [column for column in columns if "resourceTags/user:" in column]
    unique_keys = {scrub_resource_col_name(column) for column in resource_tag_columns}
    # Collapse the per-tag columns into a single dictionary per row, skipping empty values.
    resource_tags_dict = data_frame[resource_tag_columns].apply(
        lambda row: {scrub_resource_col_name(column): value for column, value in row.items() if value}, axis=1
    )
    data_frame["resourceTags"] = resource_tags_dict.apply(json.dumps)
    # Make sure we have entries for our required columns
    data_frame = data_frame.reindex(columns=columns)

    columns = list(data_frame)
    column_name_map = {}
    drop_columns = []
    for column in columns:
        new_col_name = strip_characters_from_column_name(column)
        column_name_map[column] = new_col_name
        if "resourceTags/" in column:
            drop_columns.append(column)
    data_frame = data_frame.drop(columns=drop_columns)
    data_frame = data_frame.rename(columns=column_name_map)
    return (data_frame, unique_keys)
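A minimal usage sketch, under stated assumptions: the real PRESTO_REQUIRED_COLUMNS is much longer and is stubbed here with just "resourceTags" so the reindex keeps the newly built column, and strip_characters_from_column_name is assumed importable (e.g. the sketch under Example 1). The frame contents are invented:

    import pandas as pd

    PRESTO_REQUIRED_COLUMNS = ["resourceTags"]  # stub for illustration only

    frame = pd.DataFrame({
        "lineItem/UsageAmount": [1.5],
        "resourceTags/user:app": ["koku"],
        "resourceTags/user:env": [None],
    })
    result, tag_keys = aws_post_processor(frame)
    # tag_keys == {"app", "env"}
    # result["resourceTags"][0] == '{"app": "koku"}' -- the empty "env" tag is dropped
    # the raw "resourceTags/user:*" columns are removed and remaining names normalized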
Example 4
def gcp_post_processor(data_frame):
    """Guarantee column order for Azure parquet files"""
    columns = list(data_frame)
    column_name_map = {}
    for column in columns:
        new_col_name = strip_characters_from_column_name(column)
        column_name_map[column] = new_col_name
    data_frame = data_frame.rename(columns=column_name_map)

    label_set = set()
    unique_labels = data_frame.labels.unique()
    for label in unique_labels:
        label_set.update(json.loads(label).keys())

    return (data_frame, label_set)
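A small usage sketch, assuming hypothetical label values shaped like the JSON strings GCP billing exports carry (strip_characters_from_column_name again assumed importable):

    import pandas as pd

    frame = pd.DataFrame({"labels": ['{"env": "prod"}', '{"env": "dev", "team": "cost"}']})
    result, label_keys = gcp_post_processor(frame)
    # label_keys == {"env", "team"} -- the union of keys across every distinct labels value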
Example 5
def azure_post_processor(data_frame):
    """Guarantee column order for Azure parquet files"""
    columns = list(data_frame)
    column_name_map = {}

    for column in columns:
        new_col_name = strip_characters_from_column_name(column)
        column_name_map[column] = new_col_name

    data_frame = data_frame.rename(columns=column_name_map)

    columns = set(list(data_frame))
    columns = set(PRESTO_COLUMNS).union(columns)
    columns = sorted(columns)

    data_frame = data_frame.reindex(columns=columns)

    return data_frame
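The reindex at the end is what guarantees the stable layout: any required column missing from the frame is created (filled with NaN), and the sorted order makes every file's schema identical. A sketch with PRESTO_COLUMNS stubbed to two hypothetical Azure column names:

    import pandas as pd

    PRESTO_COLUMNS = ["costinbillingcurrency", "subscriptionguid"]  # stub for illustration only

    frame = pd.DataFrame({"subscriptionguid": ["abc-123"]})
    result = azure_post_processor(frame)
    # result.columns: ["costinbillingcurrency", "subscriptionguid"] (sorted)
    # "costinbillingcurrency" was absent from the input, so it is present but all-NaN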