def test_strip_characters_from_column_name(self):
    """Test that column names are converted properly."""
    bad_str = r"column\one:two-three four,five/six_seven"
    expected = "column_one_two_three_four_five_six_seven"
    result = common_utils.strip_characters_from_column_name(bad_str)
    self.assertEqual(result, expected)
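# A minimal sketch of the helper under test, inferred from the expected value
# above; the real common_utils.strip_characters_from_column_name may differ.
import re


def strip_characters_from_column_name(column_name):
    """Replace characters that are invalid in Parquet/Presto column names with underscores."""
    return re.sub(r"[\\:\-\s,/]", "_", column_name)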
def _generate_create_table_sql(self):
    """Generate SQL to create table."""
    parquet_columns = self._generate_column_list()
    s3_path = f"{settings.S3_BUCKET_NAME}/{self._s3_path}"

    sql = f"CREATE TABLE IF NOT EXISTS {self._schema_name}.{self._table_name} ("
    for idx, col in enumerate(parquet_columns):
        # Map each normalized column to its Presto type; anything
        # unrecognized falls back to varchar.
        norm_col = strip_characters_from_column_name(col)
        if norm_col in self._column_types["numeric_columns"]:
            col_type = "double"
        elif norm_col in self._column_types["date_columns"]:
            col_type = "timestamp"
        elif norm_col in self._column_types["boolean_columns"]:
            col_type = "boolean"
        else:
            col_type = "varchar"
        sql += f"{norm_col} {col_type}"
        if idx < (len(parquet_columns) - 1):
            sql += ","
    # The partition columns are always appended after the report columns.
    sql += ",source varchar, year varchar, month varchar"
    sql += (
        f") WITH(external_location = 's3a://{s3_path}', format = 'PARQUET',"
        " partitioned_by=ARRAY['source', 'year', 'month'])"
    )
    LOG.info(f"Create Parquet Table SQL: {sql}")
    return sql
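# For illustration, with a hypothetical schema "acct10001", table
# "aws_line_items", and columns ["lineitem/usageamount" (numeric),
# "bill/billingstart" (date)], the method would emit roughly:
#
#   CREATE TABLE IF NOT EXISTS acct10001.aws_line_items (
#   lineitem_usageamount double,bill_billingstart timestamp,
#   source varchar, year varchar, month varchar)
#   WITH(external_location = 's3a://<bucket>/<path>', format = 'PARQUET',
#   partitioned_by=ARRAY['source', 'year', 'month'])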
def aws_post_processor(data_frame):
    """
    Consume the AWS data and add a column creating a dictionary for the aws tags
    """

    def scrub_resource_col_name(res_col_name):
        return res_col_name.replace("resourceTags/user:", "")

    columns = set(list(data_frame))
    columns = set(PRESTO_REQUIRED_COLUMNS).union(columns)
    columns = sorted(list(columns))

    # Collect the user tag columns into a single per-row dictionary,
    # stored JSON-encoded in the "resourceTags" column.
    resource_tag_columns = [column for column in columns if "resourceTags/user:" in column]
    unique_keys = {scrub_resource_col_name(column) for column in resource_tag_columns}
    resource_tags_dict = data_frame[resource_tag_columns].apply(
        lambda row: {scrub_resource_col_name(column): value for column, value in row.items() if value},
        axis=1,
    )
    data_frame["resourceTags"] = resource_tags_dict.apply(json.dumps)

    # Make sure we have entries for our required columns
    data_frame = data_frame.reindex(columns=columns)

    columns = list(data_frame)
    column_name_map = {}
    drop_columns = []
    for column in columns:
        new_col_name = strip_characters_from_column_name(column)
        column_name_map[column] = new_col_name
        # The individual tag columns are folded into "resourceTags" above, so drop them.
        if "resourceTags/" in column:
            drop_columns.append(column)
    data_frame = data_frame.drop(columns=drop_columns)
    data_frame = data_frame.rename(columns=column_name_map)
    return (data_frame, unique_keys)
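# A small usage sketch for aws_post_processor; the column names and tag data are
# made up, and it assumes "resourceTags" is among PRESTO_REQUIRED_COLUMNS so the
# reindex keeps the generated column.
import json
import pandas as pd

df = pd.DataFrame(
    {
        "lineItem/UsageAmount": [1.0, 2.0],
        "resourceTags/user:app": ["koku", None],
    }
)
df, tag_keys = aws_post_processor(df)
# tag_keys == {"app"}
# df["resourceTags"] holds the JSON-encoded tags: '{"app": "koku"}' and '{}'
# The raw "resourceTags/user:app" column is dropped, the remaining columns are
# renamed via strip_characters_from_column_name (e.g. "lineItem/UsageAmount"
# becomes "lineItem_UsageAmount"), and required columns missing from the data
# arrive as NaN from the reindex.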
def gcp_post_processor(data_frame):
    """Normalize column names for GCP parquet files and collect the unique label keys."""
    columns = list(data_frame)
    column_name_map = {}
    for column in columns:
        new_col_name = strip_characters_from_column_name(column)
        column_name_map[column] = new_col_name
    data_frame = data_frame.rename(columns=column_name_map)

    # Gather the set of label keys used across all rows.
    label_set = set()
    unique_labels = data_frame.labels.unique()
    for label in unique_labels:
        label_set.update(json.loads(label).keys())
    return (data_frame, label_set)
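# Usage sketch for gcp_post_processor; the label JSON mirrors GCP's
# string-encoded "labels" column, and the data is made up.
import json
import pandas as pd

df = pd.DataFrame(
    {
        "usage amount": [1.5, 2.5],
        "labels": ['{"env": "prod"}', '{"team": "cost"}'],
    }
)
df, label_keys = gcp_post_processor(df)
# label_keys == {"env", "team"}
# "usage amount" is renamed to "usage_amount" by strip_characters_from_column_name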
def azure_post_processor(data_frame):
    """Guarantee column order for Azure parquet files"""
    columns = list(data_frame)
    column_name_map = {}
    for column in columns:
        new_col_name = strip_characters_from_column_name(column)
        column_name_map[column] = new_col_name
    data_frame = data_frame.rename(columns=column_name_map)

    columns = set(list(data_frame))
    columns = set(PRESTO_COLUMNS).union(columns)
    columns = sorted(columns)
    data_frame = data_frame.reindex(columns=columns)
    return data_frame
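# Usage sketch for azure_post_processor, assuming PRESTO_COLUMNS is the module's
# canonical Azure column list; the data is made up.
import pandas as pd

df = pd.DataFrame({"MeterCategory": ["Storage"], "UsageQuantity": [3.0]})
df = azure_post_processor(df)
# The frame now carries every column in PRESTO_COLUMNS (missing ones as NaN),
# renamed and sorted, so all Azure parquet files share one column order.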