def persist_csv_rows(self, cur, remote_schema, temp_table_name, columns, csv_rows):
    key_prefix = temp_table_name + SEPARATOR

    bucket, key = self.s3.persist(csv_rows, key_prefix=key_prefix)

    credentials = self.s3.credentials()

    copy_sql = sql.SQL(
        'COPY {}.{} ({}) FROM {} CREDENTIALS {} FORMAT AS CSV NULL AS {}'
    ).format(
        sql.Identifier(self.postgres_schema),
        sql.Identifier(temp_table_name),
        sql.SQL(', ').join(map(sql.Identifier, columns)),
        sql.Literal('s3://{}/{}'.format(bucket, key)),
        sql.Literal('aws_access_key_id={};aws_secret_access_key={}'.format(
            credentials.get('aws_access_key_id'),
            credentials.get('aws_secret_access_key'))),
        sql.Literal(RESERVED_NULL_DEFAULT))

    cur.execute(copy_sql)

    pattern = re.compile(SINGER_LEVEL.format('[0-9]+'))

    subkeys = list(
        filter(lambda header: re.match(pattern, header) is not None, columns))

    update_sql = self._get_update_sql(remote_schema['name'],
                                      temp_table_name,
                                      remote_schema['key_properties'],
                                      columns,
                                      subkeys)
    cur.execute(update_sql)

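## The snippets in this section all lean on a handful of shared constants
## (SEPARATOR, RESERVED_NULL_DEFAULT, SINGER_LEVEL, SINGER_SOURCE_PK_PREFIX,
## SINGER_SEQUENCE, SINGER_VALUE). A minimal sketch of plausible definitions
## follows; the concrete values are assumptions modeled on common Singer
## `_sdc_*` column conventions, not taken verbatim from any of these targets.
SEPARATOR = '__'                              # assumed key-prefix separator
RESERVED_NULL_DEFAULT = 'NULL'                # assumed sentinel rendered as SQL NULL
SINGER_LEVEL = '_sdc_level_{}_id'             # assumed per-nesting-level index column
SINGER_SOURCE_PK_PREFIX = '_sdc_source_key_'  # assumed prefix for parent PK columns
SINGER_SEQUENCE = '_sdc_sequence'             # assumed sequence/ordering column
SINGER_VALUE = '_sdc_value'                   # assumed column for scalar array items
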
def _denest_records(table_path, records, records_map, key_properties, pk_fks=None, level=-1):
    row_index = 0
    """ [{...} ...] | [[...] ...] | [literal ...] """
    for record in records:
        if pk_fks:
            record_pk_fks = pk_fks.copy()
            record_pk_fks[SINGER_LEVEL.format(level)] = row_index

            if not isinstance(record, dict):
                """ [...] | literal """
                record = {SINGER_VALUE: record}

            for key, value in record_pk_fks.items():
                record[key] = value
            row_index += 1
        else:  ## top level
            record_pk_fks = {}
            for key in key_properties:
                record_pk_fks[SINGER_SOURCE_PK_PREFIX + key] = record[key]

            if SINGER_SEQUENCE in record:
                record_pk_fks[SINGER_SEQUENCE] = record[SINGER_SEQUENCE]

        """ {...} """
        _denest_record(table_path, record, records_map, key_properties,
                       record_pk_fks, level)

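## Illustrative trace of the bookkeeping above, using the assumed constant
## values sketched earlier (the exact subtable rows ultimately depend on
## _denest_record, which is not shown here):
##   top-level record {'id': 1, '_sdc_sequence': 42, 'tags': ['a', 'b']}
##     -> record_pk_fks == {'_sdc_source_key_id': 1, '_sdc_sequence': 42}
##   each non-dict element of the nested 'tags' list (level 0) is wrapped as
##     {'_sdc_value': 'a', '_sdc_source_key_id': 1,
##      '_sdc_sequence': 42, '_sdc_level_0_id': 0}
##   before being passed on for further denesting.
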
def _create_subtable(table_path, table_json_schema, key_prop_schemas, subtables, level):
    if json_schema.is_object(table_json_schema['items']):
        new_properties = table_json_schema['items']['properties']
    else:
        new_properties = {SINGER_VALUE: table_json_schema['items']}

    key_properties = []
    for pk, item_json_schema in key_prop_schemas.items():
        key_properties.append(SINGER_SOURCE_PK_PREFIX + pk)
        new_properties[SINGER_SOURCE_PK_PREFIX + pk] = item_json_schema

    new_properties[SINGER_SEQUENCE] = {'type': ['null', 'integer']}

    for i in range(0, level + 1):
        new_properties[SINGER_LEVEL.format(i)] = {'type': ['integer']}

    new_schema = {
        'type': [json_schema.OBJECT],
        'properties': new_properties,
        'level': level,
        'key_properties': key_properties
    }

    _denest_schema(table_path, new_schema, key_prop_schemas, subtables, level=level)

    subtables[table_path] = new_schema

def persist_csv_rows(self, cur, remote_schema, temp_table_name, columns, csv_rows):
    copy = sql.SQL(
        'COPY {}.{} ({}) FROM STDIN WITH CSV NULL AS {}').format(
            sql.Identifier(self.postgres_schema),
            sql.Identifier(temp_table_name),
            sql.SQL(', ').join(map(sql.Identifier, columns)),
            sql.Literal(RESERVED_NULL_DEFAULT))

    cur.copy_expert(copy, csv_rows)

    pattern = re.compile(SINGER_LEVEL.format('[0-9]+'))

    subkeys = list(
        filter(lambda header: re.match(pattern, header) is not None, columns))

    canonicalized_key_properties = [
        self.fetch_column_from_path((key_property,), remote_schema)[0]
        for key_property in remote_schema['key_properties']
    ]

    update_sql = self._get_update_sql(remote_schema['name'],
                                      temp_table_name,
                                      canonicalized_key_properties,
                                      columns,
                                      subkeys)
    cur.execute(update_sql)

def persist_rows(self, cur, target_table_name, temp_table_name,
                 target_table_json_schema, key_properties, records):
    headers = list(target_table_json_schema['properties'].keys())

    datetime_fields = [
        k for k, v in target_table_json_schema['properties'].items()
        if v.get('format') == 'date-time'
    ]

    rows = iter(records)

    def transform():
        try:
            row = next(rows)
            with io.StringIO() as out:
                ## Serialize datetime to postgres compatible format
                for prop in datetime_fields:
                    if prop in row:
                        row[prop] = self.get_postgres_datetime(row[prop])

                writer = csv.DictWriter(out, headers)
                writer.writerow(row)
                return out.getvalue()
        except StopIteration:
            return ''

    csv_rows = TransformStream(transform)

    copy = sql.SQL('COPY {}.{} ({}) FROM STDIN CSV').format(
        sql.Identifier(self.postgres_schema),
        sql.Identifier(temp_table_name),
        sql.SQL(', ').join(map(sql.Identifier, headers)))

    cur.copy_expert(copy, csv_rows)

    pattern = re.compile(SINGER_LEVEL.format('[0-9]+'))

    subkeys = list(
        filter(lambda header: re.match(pattern, header) is not None, headers))

    update_sql = self.get_update_sql(target_table_name,
                                     temp_table_name,
                                     key_properties,
                                     subkeys)
    cur.execute(update_sql)

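## persist_rows above (and the variant further below) streams CSV into COPY
## through a TransformStream. A minimal sketch of such a file-like adapter
## follows; the interface is an assumption -- the real helper ships with the
## target and may differ. psycopg2's copy_expert() keeps calling read() until
## it receives an empty string, which is why transform() returns '' once the
## record iterator is exhausted.
class TransformStream:
    """Minimal file-like adapter: each read() yields one transformed CSV line."""

    def __init__(self, transform):
        self.transform = transform

    def read(self, size=-1):
        # copy_expert() passes a size hint, but returning one full CSV line
        # per call (and '' at the end) is enough for it to stream the data.
        return self.transform()
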
def persist_csv_rows(self, cur, remote_schema, temp_table_name, columns, csv_rows):
    copy = sql.SQL(
        'COPY {}.{} ({}) FROM STDIN WITH (FORMAT CSV, NULL {})').format(
            sql.Identifier(self.postgres_schema),
            sql.Identifier(temp_table_name),
            sql.SQL(', ').join(map(sql.Identifier, columns)),
            sql.Literal(RESERVED_NULL_DEFAULT))

    cur.copy_expert(copy, csv_rows)

    pattern = re.compile(SINGER_LEVEL.format('[0-9]+'))

    subkeys = list(
        filter(lambda header: re.match(pattern, header) is not None, columns))

    update_sql = self.get_update_sql(remote_schema['name'],
                                     temp_table_name,
                                     remote_schema['key_properties'],
                                     subkeys)
    cur.execute(update_sql)

def denest_records(self, table_name, records, records_map, key_properties, pk_fks=None, level=-1):
    row_index = 0
    for record in records:
        if pk_fks:
            record_pk_fks = pk_fks.copy()
            record_pk_fks[SINGER_LEVEL.format(level)] = row_index
            for key, value in record_pk_fks.items():
                record[key] = value
            row_index += 1
        else:  ## top level
            record_pk_fks = {}
            for key in key_properties:
                record_pk_fks[SINGER_SOURCE_PK_PREFIX + key] = record[key]
            if SINGER_SEQUENCE in record:
                record_pk_fks[SINGER_SEQUENCE] = record[SINGER_SEQUENCE]

        self.denest_record(table_name, None, record, records_map,
                           key_properties, record_pk_fks, level)

def create_subtable(self, table_name, table_json_schema, key_prop_schemas, subtables, level):
    if json_schema.is_object(table_json_schema['items']):
        new_properties = table_json_schema['items']['properties']
    else:
        new_properties = {'value': table_json_schema['items']}

    for pk, item_json_schema in key_prop_schemas.items():
        new_properties[SINGER_SOURCE_PK_PREFIX + pk] = item_json_schema

    new_properties[SINGER_SEQUENCE] = {'type': ['null', 'integer']}

    for i in range(0, level + 1):
        new_properties[SINGER_LEVEL.format(i)] = {'type': ['integer']}

    new_schema = {'type': ['object'], 'properties': new_properties}

    self.denest_schema(table_name, new_schema, key_prop_schemas, subtables, level=level)

    subtables[table_name] = new_schema

def persist_csv_rows(self, cur, remote_schema, temp_table_name, columns, csv_rows):
    params = []

    if self.s3:
        bucket, key = self.s3.persist(csv_rows,
                                      key_prefix=temp_table_name + SEPARATOR)

        stage_location = "'s3://{bucket}/{key}' credentials=(AWS_KEY_ID=%s AWS_SECRET_KEY=%s)".format(
            bucket=bucket, key=key)
        params = [
            self.s3.credentials()['aws_access_key_id'],
            self.s3.credentials()['aws_secret_access_key']
        ]
    else:
        stage_location = '@{db}.{schema}.%{table}'.format(
            db=sql.identifier(self.connection.configured_database),
            schema=sql.identifier(self.connection.configured_schema),
            table=sql.identifier(temp_table_name))

        rel_path = '/tmp/target-snowflake/'
        file_name = str(uuid.uuid4()).replace('-', '_')

        # Make tmp folder to hold data file
        os.makedirs(rel_path, exist_ok=True)

        # Write readable csv_rows to file
        with open(rel_path + file_name, 'wb') as file:
            line = csv_rows.read()
            while line:
                file.write(line.encode('utf-8'))
                line = csv_rows.read()

        # Upload to internal table stage
        cur.execute('''
            PUT file://{rel_path}{file_name} {stage_location}
        '''.format(rel_path=rel_path,
                   file_name=file_name,
                   stage_location=stage_location))

        # Tidy up and remove tmp staging file
        os.remove(rel_path + file_name)

        stage_location += '/{}'.format(file_name)

    cur.execute('''
        COPY INTO {db}.{schema}.{table} ({cols})
        FROM {stage_location}
        FILE_FORMAT = (TYPE = CSV EMPTY_FIELD_AS_NULL = FALSE)
    '''.format(db=sql.identifier(self.connection.configured_database),
               schema=sql.identifier(self.connection.configured_schema),
               table=sql.identifier(temp_table_name),
               cols=','.join([sql.identifier(x) for x in columns]),
               stage_location=stage_location),
                params=params)

    pattern = re.compile(SINGER_LEVEL.upper().format('[0-9]+'))

    subkeys = list(
        filter(lambda header: re.match(pattern, header) is not None, columns))

    canonicalized_key_properties = [
        self.fetch_column_from_path((key_property,), remote_schema)[0]
        for key_property in remote_schema['key_properties']
    ]

    self.perform_update(cur,
                        remote_schema['name'],
                        temp_table_name,
                        canonicalized_key_properties,
                        columns,
                        subkeys)

def persist_rows(self, cur, target_table_name, temp_table_name,
                 target_table_json_schema, key_properties, records):
    target_schema = self.get_schema(cur, self.postgres_schema, target_table_name)

    headers = list(target_schema['schema']['properties'].keys())

    datetime_fields = [
        k for k, v in target_table_json_schema['properties'].items()
        if v.get('format') == 'date-time'
    ]

    default_fields = {
        k: v.get('default')
        for k, v in target_table_json_schema['properties'].items()
        if v.get('default') is not None
    }

    fields = set(
        headers +
        [v['from'] for k, v in target_schema.get('mappings', {}).items()])

    records_iter = iter(records)

    def transform():
        try:
            record = next(records_iter)
            row = {}

            for field in fields:
                value = record.get(field, None)

                ## Serialize fields which are not present but have default values set
                if field in default_fields \
                        and value is None:
                    value = default_fields[field]

                ## Serialize datetime to postgres compatible format
                if field in datetime_fields \
                        and value is not None:
                    value = self.get_postgres_datetime(value)

                ## Serialize NULL default value
                if value == RESERVED_NULL_DEFAULT:
                    self.logger.warning(
                        'Reserved {} value found in field: {}. Value will be turned into literal null'
                        .format(RESERVED_NULL_DEFAULT, field))

                if value is None:
                    value = RESERVED_NULL_DEFAULT

                field_name = field

                if field in target_table_json_schema['properties']:
                    field_name = self.get_mapping(
                        target_schema, field,
                        target_table_json_schema['properties'][field]) \
                        or field

                if field_name not in row \
                        or row[field_name] is None \
                        or row[field_name] == RESERVED_NULL_DEFAULT:
                    row[field_name] = value

            with io.StringIO() as out:
                writer = csv.DictWriter(out, headers)
                writer.writerow(row)
                return out.getvalue()
        except StopIteration:
            return ''

    csv_rows = TransformStream(transform)

    copy = sql.SQL(
        'COPY {}.{} ({}) FROM STDIN WITH (FORMAT CSV, NULL {})').format(
            sql.Identifier(self.postgres_schema),
            sql.Identifier(temp_table_name),
            sql.SQL(', ').join(map(sql.Identifier, headers)),
            sql.Literal(RESERVED_NULL_DEFAULT))

    cur.copy_expert(copy, csv_rows)

    pattern = re.compile(SINGER_LEVEL.format('[0-9]+'))

    subkeys = list(
        filter(lambda header: re.match(pattern, header) is not None, headers))

    update_sql = self.get_update_sql(target_table_name,
                                     temp_table_name,
                                     key_properties,
                                     subkeys)
    cur.execute(update_sql)