Example #1
def __init__(
    self,
    csv_rows,
    delimiter='|',
    escapechar='\\',
    quoting=csv.QUOTE_NONE,
    *args,
    **kwargs
):
    SimpleS3Config.__init__(self, *args, **kwargs)
    # Write the supplied rows into an in-memory gzipped CSV buffer.
    self._gzip_csv = BytesIO()
    with get_stream(gzip.GzipFile(fileobj=self._gzip_csv, mode='wb')) as gz:
        writer = csv.writer(gz, delimiter=delimiter, escapechar=escapechar, quoting=quoting)
        for row in csv_rows:
            writer.writerow(row)
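
A minimal standalone sketch of the same pattern as the constructor above, assuming get_stream simply wraps the binary gzip handle in a text stream (io.TextIOWrapper is used here as a stand-in for that helper):

import csv
import gzip
import io

buf = io.BytesIO()
with gzip.GzipFile(fileobj=buf, mode='wb') as gz_binary:
    # io.TextIOWrapper stands in for the get_stream() helper used above.
    with io.TextIOWrapper(gz_binary, encoding='utf-8', newline='') as gz_text:
        writer = csv.writer(gz_text, delimiter='|', escapechar='\\', quoting=csv.QUOTE_NONE)
        writer.writerow(['1', 'hello'])

compressed_bytes = buf.getvalue()  # gzipped CSV bytes, ready to upload to S3
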
def test_query(self):
    expected_query = r"""
        create external table schema.table (
            int_col_1 INTEGER
        )
        ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' WITH SERDEPROPERTIES (
            'separatorChar' = '|',
            'quoteChar' = '\"',
            'escapeChar' = '\\'
        )
        stored as textfile
        location 's3://some_bucket/prefix/csv/manifest'
        table properties (
            'compression_type'='gzip'
        );
    """
    s3_config = SimpleS3Config.from_base_path("s3://some_bucket/prefix")
    sa_meta = sqlalchemy.MetaData()
    sa_table = sqlalchemy.Table(
        'unit_test_table',
        sa_meta,
        sqlalchemy.Column('int_col_1', sqlalchemy.INTEGER),
    )
    open_csv_serde_table_creator = OpenCSVSerdeTableCreator(
        engine=None,
        schema_name="schema",
        table_name="table",
        sa_table=sa_table,
        s3_config=s3_config)
    self.assertEqual(textwrap.dedent(expected_query),
                     textwrap.dedent(open_csv_serde_table_creator.query))
Example #3
def convert(ctx, table, s3_path):
    engine = get_sa_engine(ctx)
    sa_table = SqlAlchemySchemaReader(engine).get_table_schema(table)
    s3_config = SimpleS3Config.from_base_path(s3_path)

    converter = ConcurrentManifestConverter(sa_table, s3_config)
    converter.convert_manifest()
Example #4
def transform(ctx, table, s3_path, dest_schema, dest_table, s3_region):
    dest_table = dest_table or table
    engine = get_sa_engine(ctx)
    s3_config = SimpleS3Config.from_base_path(s3_path, region=s3_region)
    transformer = TableTransformer(engine, table, s3_config, dest_schema,
                                   dest_table)
    transformer.transform()
Example #5
def create_table(ctx, s3_path, source_table, dest_table, dest_schema):
    click.echo('Create Spectrum table')
    engine = get_sa_engine(ctx)
    sa_table = SqlAlchemySchemaReader(engine).get_table_schema(source_table)
    s3_config = SimpleS3Config.from_base_path(s3_path)

    table_creator = SpectrumTableCreator(engine, dest_schema, dest_table,
                                         sa_table, s3_config)
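    # Assumed behavior based on the method names below: log the generated
    # CREATE EXTERNAL TABLE statement, prompt for confirmation, then run it.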
    table_creator.log_query()
    table_creator.confirm()
    table_creator.create()
Example #6
def spectrify_run(start,
                  end,
                  source_schema,
                  source_table,
                  table_name,
                  spectrum_schema,
                  spectrum_table,
                  csv_bucket_name,
                  csv_prefix,
                  parquet_bucket_name,
                  parquet_prefix,
                  sa_engine,
                  export_whole_table=True,
                  timestamp_col=None,
                  iam_role=None):

    start_date = datetime.strptime(start, '%Y-%m-%d %H:%M:%S.%f')
    end_date = datetime.strptime(end, '%Y-%m-%d %H:%M:%S.%f')

    # Construct a S3Config object with the source CSV folder and
    # destination Spectrum/Parquet folder on S3.
    csv_path = csv_path_template.format(start=start_date,
                                        source_schema=source_schema,
                                        source_table=source_table,
                                        csv_bucket_name=csv_bucket_name,
                                        csv_prefix=csv_prefix)
    spectrum_path = spectrum_path_template.format(
        start=start_date,
        source_schema=source_schema,
        source_table=source_table,
        parquet_bucket_name=parquet_bucket_name,
        parquet_prefix=parquet_prefix)
    s3_config = SimpleS3Config(csv_path, spectrum_path)

    transformer = TimedDataTransformer(sa_engine,
                                       table_name,
                                       s3_config,
                                       spectrum_schema,
                                       spectrum_table,
                                       start_date=start_date,
                                       end_date=end_date,
                                       timestamp_col=timestamp_col,
                                       export_whole_table=export_whole_table,
                                       iam_role=iam_role)
    transformer.transform()
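
A hypothetical call to spectrify_run, mainly to illustrate the '%Y-%m-%d %H:%M:%S.%f' format expected for start and end; every value below is a placeholder, and csv_path_template / spectrum_path_template are assumed to be module-level format strings defined elsewhere:

engine = get_redshift_engine()  # assumed helper, as used in the next example
spectrify_run(
    start='2021-01-01 00:00:00.000000',   # must match '%Y-%m-%d %H:%M:%S.%f'
    end='2021-01-08 00:00:00.000000',
    source_schema='public',
    source_table='events',
    table_name='events',
    spectrum_schema='spectrum',
    spectrum_table='events',
    csv_bucket_name='my-csv-bucket',
    csv_prefix='csv',
    parquet_bucket_name='my-parquet-bucket',
    parquet_prefix='parquet',
    sa_engine=engine,
    export_whole_table=False,
    timestamp_col='created_at',
)
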
Example #7
def spectrify_last_week():
    end_date = datetime.utcnow().date()
    start_date = end_date - timedelta(days=7)

    # Replace with your table names (or pass in as parameters)
    source_table = 'my_table'
    dest_table = 'my_table'
    spectrum_schema = 'spectrum'

    sa_engine = get_redshift_engine()

    # Construct a S3Config object with the source CSV folder and
    # destination Spectrum/Parquet folder on S3.
    csv_path = csv_path_template.format(start=start_date)
    spectrum_path = spectrum_path_template.format(start=start_date)
    s3_config = SimpleS3Config(csv_path, spectrum_path)

    transformer = WeeklyDataTransformer(
        sa_engine, source_table, s3_config, spectrum_schema, dest_table,
        start_date=start_date, end_date=end_date)
    transformer.transform()
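
The csv_path_template and spectrum_path_template referenced above are not part of this example; a plausible definition (an assumption, not taken from the source) could be:

csv_path_template = 's3://my-csv-bucket/{start:%Y-%m-%d}/my_table/csv/'
spectrum_path_template = 's3://my-parquet-bucket/{start:%Y-%m-%d}/my_table/parquet/'

The spectrify_run example further up formats its templates with additional fields such as source_schema, source_table, and the bucket names, so its templates would carry the corresponding placeholders.
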
Example #8
def export(ctx, table, s3_path, s3_region):
    engine = get_sa_engine(ctx)
    s3_config = SimpleS3Config.from_base_path(s3_path, region=s3_region)
    RedshiftDataExporter(engine, s3_config).export_to_csv(table)
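
For context, the steps shown across these examples are typically run in order: export the Redshift table to gzipped CSV, convert the CSV manifest to Parquet, then create the Spectrum table. A hedged end-to-end sketch, assuming the spectrify import paths below and using placeholder connection details:

import sqlalchemy
from spectrify.convert import ConcurrentManifestConverter
from spectrify.create import SpectrumTableCreator
from spectrify.export import RedshiftDataExporter
from spectrify.utils.s3 import SimpleS3Config
from spectrify.utils.schema import SqlAlchemySchemaReader

engine = sqlalchemy.create_engine('redshift+psycopg2://user:password@host:5439/db')
s3_config = SimpleS3Config.from_base_path('s3://some-bucket/prefix')

RedshiftDataExporter(engine, s3_config).export_to_csv('my_table')       # as in Example #8
sa_table = SqlAlchemySchemaReader(engine).get_table_schema('my_table')
ConcurrentManifestConverter(sa_table, s3_config).convert_manifest()     # as in Example #3
SpectrumTableCreator(engine, 'spectrum', 'my_table', sa_table, s3_config).create()  # as in Example #5

Example #4's TableTransformer effectively bundles the same flow behind a single transform() call.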