def for_each_time_entry(time_entry, time_extracted, stream_version=None):
    """Emit the external-reference record and pivot row for one time entry.

    Always writes both stream schemas; when the time entry carries an
    external reference, emits that reference plus a
    time_entry/external_reference pivot record.
    """
    reference_schema = load_and_write_schema("external_reference")
    load_and_write_schema(
        "time_entry_external_reference",
        key_properties=["time_entry_id", "external_reference_id"])

    reference = time_entry['external_reference']
    if reference is None:
        return

    with Transformer() as transformer:
        reference = transformer.transform(reference, reference_schema)
        singer.write_message(singer.RecordMessage(
            stream="external_reference",
            record=reference,
            version=stream_version,
            time_extracted=time_extracted))

        # Pivot row linking this time entry to its external reference.
        singer.write_message(singer.RecordMessage(
            stream="time_entry_external_reference",
            record={
                'time_entry_id': time_entry['id'],
                'external_reference_id': reference['id'],
            },
            version=stream_version,
            time_extracted=time_extracted))
def row_to_singer_message(stream, row, version, columns, time_extracted):
    """Convert a raw DB row (tuple) into a singer.RecordMessage for *stream*.

    Each element is coerced by its column's JSON schema: integers via
    int(); numbers with a ``multipleOf`` as Decimal (so fixed-precision
    numerics survive the round trip); plain numbers as float.  None,
    date-time strings and everything else pass through unchanged.
    """
    row_to_persist = ()
    for idx, elem in enumerate(row):
        # Look the schema property up once per element (was three lookups).
        prop = stream.schema.properties[columns[idx]]
        property_type = prop.type
        multiple_of = prop.multipleOf

        if elem is None:
            row_to_persist += (elem,)
        elif 'integer' in property_type or property_type == 'integer':
            row_to_persist += (int(elem),)
        elif ('number' in property_type or property_type == 'number') and multiple_of:
            row_to_persist += (decimal.Decimal(elem),)
        elif 'number' in property_type or property_type == 'number':
            row_to_persist += (float(elem),)
        else:
            # Includes format == 'date-time': those strings pass through
            # as-is (the old explicit branch was identical to this one).
            # Also removes a local that shadowed the builtin `format`.
            row_to_persist += (elem,)

    rec = dict(zip(columns, row_to_persist))

    return singer.RecordMessage(
        stream=stream.stream,
        record=rec,
        version=version,
        time_extracted=time_extracted)
def sync_report(report, stream, config):
    """Sync one report stream; returns the number of records emitted.

    Emits a new table version, then one RecordMessage per report row with
    an ``_sdc_extracted_at`` timestamp stamped onto each record.
    """
    report_url = report['report_url']
    username = config['username']
    password = config['password']

    LOGGER.info('Syncing report "%s".', report_url)

    record_count = 0
    # Removed an unused `record = {}` initialization; `record` is only
    # ever bound by the for loop below.
    stream_version = int(time.time() * 1000)
    extraction_time = utils.now().isoformat()
    singer.write_version(stream.tap_stream_id, stream_version)

    with Transformer() as transformer:
        for record in stream_report(report_url, username, password):
            to_write = transformer.transform(
                record, stream.schema.to_dict(),
                metadata.to_map(stream.metadata))
            to_write['_sdc_extracted_at'] = extraction_time
            record_message = singer.RecordMessage(
                stream.tap_stream_id, to_write, version=stream_version)
            singer.write_message(record_message)
            record_count += 1

    return record_count
def row_to_singer_record(catalog_entry, version, db_column_map, row,
                         time_extracted):
    """Convert a row dict into a singer.RecordMessage for the catalog stream.

    Temporal values become UTC datetime strings, JSON columns are
    re-serialized to JSON text, and boolean-typed schema columns are
    coerced to True/False/None (BIT columns by integer comparison).
    """
    row_to_persist = {}
    for column_name, val in row.items():
        property_type = catalog_entry.schema.properties[column_name].type
        # Physical DB column type; None if the column is not in the map.
        db_column_type = db_column_map.get(column_name)

        if isinstance(val, (datetime.datetime, datetime.date,
                            datetime.timedelta)):
            the_utc_date = common.to_utc_datetime_str(val)
            row_to_persist[column_name] = the_utc_date
        elif db_column_type == FIELD_TYPE.JSON:
            row_to_persist[column_name] = json.dumps(json_bytes_to_string(val))
        elif 'boolean' in property_type or property_type == 'boolean':
            if val is None:
                boolean_representation = None
            elif val == 0:
                boolean_representation = False
            elif db_column_type == FIELD_TYPE.BIT:
                # BIT values: any non-zero bit pattern counts as True.
                boolean_representation = int(val) != 0
            else:
                boolean_representation = True
            row_to_persist[column_name] = boolean_representation
        else:
            row_to_persist[column_name] = val

    return singer.RecordMessage(stream=catalog_entry.stream,
                                record=row_to_persist,
                                version=version,
                                time_extracted=time_extracted)
def sync_file(bucket, s3_path, stream, version=None):
    """Sync one Avro file from S3 into the stream; returns rows emitted."""
    LOGGER.info('Syncing file "%s".', s3_path)

    table_name = stream['stream']
    s3_file_handle = s3.get_file_handle(bucket, s3_path)
    iterator = fastavro.reader(s3_file_handle._raw_stream)

    mdata = metadata.to_map(stream['metadata'])
    schema = generate_schema_from_avro(iterator.schema)
    key_properties = metadata.get(mdata, (), 'table-key-properties')
    singer.write_schema(table_name, schema, key_properties)

    # Activate a version so we execute a full table sync
    if version is not None:
        LOGGER.info('Sending Activate Version Message with version %d',
                    version)
        singer.write_message(
            singer.ActivateVersionMessage(stream=table_name, version=version))

    records_synced = 0
    with Transformer() as transformer:
        for row in iterator:
            filtered = transformer.filter_data_by_metadata(row, mdata)
            singer.write_message(
                singer.RecordMessage(table_name, filtered, version=version))
            records_synced += 1

    return records_synced
def row_to_singer_record(catalog_entry, version, row, columns, time_extracted):
    """Build a singer.RecordMessage from a raw DB row.

    Datetimes, dates and timedeltas become UTC ISO strings; BIT bytes and
    boolean-typed columns become bools; everything else passes through.
    """
    values = []
    for idx, elem in enumerate(row):
        property_type = catalog_entry.schema.properties[columns[idx]].type

        if isinstance(elem, (datetime.datetime, datetime.date,
                             datetime.timedelta)):
            values.append(to_utc_datetime_str(elem))
        elif isinstance(elem, bytes):
            # for BIT value, treat 0 as False and anything else as True
            values.append(elem != b'\x00')
        elif 'boolean' in property_type or property_type == 'boolean':
            values.append(None if elem is None else elem != 0)
        else:
            values.append(elem)

    rec = dict(zip(columns, values))

    return singer.RecordMessage(stream=catalog_entry.stream,
                                record=rec,
                                version=version,
                                time_extracted=time_extracted)
def do_sync(self):
    """Sync the products stream, bookmarking the newest product date."""
    bookmark = str_to_date(self.bookmark_date)
    max_product_date = bookmark
    product_response = self.client.make_request("/products/mine")
    product_ids = []

    with singer.metrics.Counter('record_count',
                                {'endpoint': 'products'}) as counter:
        for product in product_response.json().values():
            product_ids.append(product['id'])

            # Prefer updated_date; fall back to added_date.
            raw_date = product['updated_date'] or product['added_date']
            product_date = str_to_date(raw_date)
            product = tidy_dates(product)

            # Only upsert messages which have changed
            if product_date > bookmark:
                singer.write_message(
                    singer.RecordMessage(
                        stream='products',
                        record=product,
                    ))
            max_product_date = max(max_product_date, product_date)
            counter.increment()

    self.state = singer.write_bookmark(self.state, self.STREAM_NAME,
                                       'last_record',
                                       date_to_str(max_product_date))
    self.product_ids = product_ids
def row_to_singer_message(stream, row, version, columns, time_extracted,
                          md_map, conn_info):
    """Convert a replication row to a RecordMessage, cleaning each value
    according to its sql-datatype from stream metadata.

    Raises if any column has no sql-datatype recorded.
    """
    # Metadata entries for the tap-injected bookkeeping columns.
    md_map[('properties', '_sdc_deleted_at')] = {
        'sql-datatype': 'timestamp with time zone'
    }
    md_map[('properties', '_sdc_lsn')] = {'sql-datatype': "character varying"}

    cleaned_values = ()
    for idx, elem in enumerate(row):
        sql_datatype = md_map.get(
            ('properties', columns[idx])).get('sql-datatype')

        if not sql_datatype:
            LOGGER.info("No sql-datatype found for stream %s: %s",
                        stream, columns[idx])
            raise Exception(
                "Unable to find sql-datatype for stream {}".format(stream))

        cleaned_values += (
            selected_value_to_singer_value(elem, sql_datatype, conn_info),)

    return singer.RecordMessage(
        stream=post_db.calculate_destination_stream_name(stream, md_map),
        record=dict(zip(columns, cleaned_values)),
        version=version,
        time_extracted=time_extracted)
def do_sync(self):
    """Default sync: fetch self.URI from the bookmark date and emit rows.

    Most streams use this implementation; streams with different response
    shapes override it.
    """
    start_date = str_to_date(self.bookmark_date).strftime('%Y-%m-%d')
    try:
        response = self.client.make_request(self.URI.format(start_date))
    except RequestError:
        return

    new_bookmark_date = self.bookmark_date
    with singer.metrics.Counter('record_count',
                                {'endpoint': self.STREAM_NAME}) as counter:
        entries = self.traverse_nested_dicts(response.json(),
                                             self.RESPONSE_LEVELS)
        for entry in entries:
            new_bookmark_date = max(new_bookmark_date, entry['date'])
            singer.write_message(
                singer.RecordMessage(
                    stream=self.STREAM_NAME,
                    record=strings_to_floats(entry),
                ))
            counter.increment()

    self.state = singer.write_bookmark(self.state, self.STREAM_NAME,
                                       'last_record', new_bookmark_date)
def row_to_singer_record(catalog_entry, version, row, columns, time_extracted):
    """Build a RecordMessage, serializing temporal values as UTC ISO strings."""
    values = []
    for idx, elem in enumerate(row):
        property_type = catalog_entry.schema.properties[columns[idx]].type

        # datetime must be tested before date (datetime subclasses date).
        if isinstance(elem, datetime.datetime):
            values.append(elem.isoformat() + '+00:00')
        elif isinstance(elem, datetime.date):
            # Bare dates become midnight-UTC timestamps.
            values.append(elem.isoformat() + 'T00:00:00+00:00')
        elif isinstance(elem, datetime.timedelta):
            # TIME-like values: render as an offset from the Unix epoch.
            epoch = datetime.datetime.utcfromtimestamp(0)
            values.append((epoch + elem).isoformat() + '+00:00')
        elif 'boolean' in property_type or property_type == 'boolean':
            values.append(None if elem is None else elem != 0)
        else:
            values.append(elem)

    return singer.RecordMessage(stream=catalog_entry.stream,
                                record=dict(zip(columns, values)),
                                version=version,
                                time_extracted=time_extracted)
def row_to_singer_record(stream, row, version, time_extracted):
    """Transform every value in *row* and wrap the result in a RecordMessage."""
    transformed = {}
    for key, value in row.items():
        transformed[key] = transform_value(value)

    return singer.RecordMessage(stream=stream['tap_stream_id'],
                                record=transformed,
                                version=version,
                                time_extracted=time_extracted)
def do_sync(self):
    """Sync the products stream, collecting ids/types for later streams.

    Bookmarks the newest product date seen and stashes product_ids and
    product_types on self for dependent streams (e.g. ranks).
    """
    max_product_date = self.bookmark_date
    product_response = self.client.make_request("/products/mine")
    product_ids = []
    product_types = []
    with singer.metrics.Counter('record_count',
                                {'endpoint': 'products'}) as counter:
        for product in product_response.json().values():
            record = ProductRecord(product, self.schema)
            product_ids.append(record.clean_data['id'])
            product_types.append(record.clean_data['type'])
            # Only upsert messages which have changed
            if record.product_date > self.bookmark_date:
                # NOTE(review): emits the raw `product` dict, not
                # `record.clean_data` — looks intentional but verify,
                # since ids/types above are read from clean_data.
                singer.write_message(
                    singer.RecordMessage(
                        stream='products',
                        record=product,
                    ))
            max_product_date = max(max_product_date, record.product_date)
            counter.increment()
    self.state = singer.write_bookmark(self.state, self.STREAM_NAME,
                                       'last_record',
                                       date_to_str(max_product_date))
    self.product_ids = product_ids
    self.product_types = product_types
def test_round_trip(self):
    """format_message -> parse_message must be lossless for each type."""
    record_message = singer.RecordMessage(record={'name': 'foo'},
                                          stream='users')
    schema_message = singer.SchemaMessage(
        stream='users',
        key_properties=['name'],
        schema={
            'type': 'object',
            'properties': {
                'name': {'type': 'string'}
            }
        })
    state_message = singer.StateMessage(value={'seq': 1})

    for message in (record_message, schema_message, state_message):
        self.assertEqual(
            message,
            singer.parse_message(singer.format_message(message)))
def do_sync(self):
    """
    Main sync functionality.

    Allows for differences in schemas between catalog and the actual
    received data to unravel lists.  This permits the user to get more
    granular ratings info (e.g. number of reviews for each rating).
    """
    start_date = str_to_date(self.bookmark_date).strftime('%Y-%m-%d')
    # Page through time in 28-day windows, never past yesterday.
    while str_to_date(start_date).date() < datetime.date.today():
        end_date = min(
            str_to_date(start_date).date() + datetime.timedelta(days=28),
            datetime.date.today() - datetime.timedelta(days=1))
        try:
            response = self.client.make_request(
                self.URI.format(start_date, end_date.strftime('%Y-%m-%d')))
        except RequestError:
            # Abort on request failure; state keeps the bookmark written
            # for windows already completed.
            return
        new_bookmark_date = self.bookmark_date
        with singer.metrics.Counter(
                'record_count', {'endpoint': self.STREAM_NAME}) as counter:
            for entry in self.traverse_nested_dicts(
                    response.json(), self.RESPONSE_LEVELS):
                new_bookmark_date = max(new_bookmark_date, entry['date'])
                # Keys the schema expects but the entry lacks...
                schema_keys = [
                    x for x in self.schema['properties'].keys()
                    if x not in entry.keys()
                ]
                # ...and keys the entry has that the schema doesn't know.
                entry_keys = [
                    x for x in entry.keys()
                    if x not in self.schema['properties'].keys()
                ]
                if schema_keys and entry_keys:
                    # Unravel the list values behind the unknown keys and
                    # assign them positionally to the missing schema keys.
                    entries = list(
                        itertools.chain.from_iterable([
                            entry[entry_item] for entry_item in entry_keys
                        ]))
                    for j, schema_item in enumerate(schema_keys):
                        entry[schema_item] = entries[j]
                    for key in entry_keys:
                        del (entry[key])
                entry = strings_to_floats(entry)
                singer.write_message(
                    singer.RecordMessage(
                        stream=self.STREAM_NAME,
                        record=entry,
                    ))
                counter.increment()
        self.state = singer.write_bookmark(self.state, self.STREAM_NAME,
                                           'last_record', new_bookmark_date)
        # Yesterday was the final window; stop.
        if end_date == datetime.date.today() - datetime.timedelta(days=1):
            break
        start_date = end_date.strftime('%Y-%m-%d')
def sync_stream(kafka_config, stream, state):
    """Consume a Kafka topic and emit each message as a singer record.

    Every record is validated against the stream schema: invalid records
    go to the configured reject_topic, or the sync aborts if none is set.
    Offsets are committed per message and bookmarked in state; state is
    flushed every UPDATE_BOOKMARK_PERIOD rows and once at the end.
    """
    consumer = KafkaConsumer(
        kafka_config['topic'],
        group_id=kafka_config['group_id'],
        enable_auto_commit=False,
        consumer_timeout_ms=kafka_config.get('consumer_timeout_ms', 10000),
        auto_offset_reset='earliest',
        value_deserializer=lambda m: json.loads(m.decode('ascii')),
        bootstrap_servers=kafka_config['bootstrap_servers'])

    send_schema_message(stream)
    # Reuse the bookmarked stream version if present, otherwise mint one.
    stream_version = singer.get_bookmark(state, stream['tap_stream_id'],
                                         'version')
    if stream_version is None:
        stream_version = int(time.time() * 1000)
    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version',
                                  stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
    activate_version_message = singer.ActivateVersionMessage(
        stream=stream['tap_stream_id'], version=stream_version)
    singer.write_message(activate_version_message)

    time_extracted = utils.now()
    rows_saved = 0
    for message in consumer:
        LOGGER.info("%s:%s:%s: key=%s value=%s" %
                    (message.topic, message.partition, message.offset,
                     message.key, message.value))
        # stream['schema']
        record = singer.RecordMessage(stream=stream['tap_stream_id'],
                                      record=message.value,
                                      time_extracted=time_extracted)
        [valid, error] = validate_record(stream['schema'], record)
        rows_saved = rows_saved + 1

        if valid:
            singer.write_message(record)
        elif kafka_config.get('reject_topic'):
            send_reject_message(kafka_config, record, error)
        else:
            raise Exception(
                "record failed validation and no reject_topic was specified")

        state = singer.write_bookmark(state, stream['tap_stream_id'],
                                      'offset', message.offset)

        #commit offsets because we processed the message
        tp = TopicPartition(message.topic, message.partition)
        consumer.commit({tp: OffsetAndMetadata(message.offset + 1, None)})

        if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
            singer.write_message(
                singer.StateMessage(value=copy.deepcopy(state)))

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
def sync_query(config, state, stream): table_name = stream['tap_stream_id'] #before writing the table version to state, check if we had one to begin with first_run = singer.get_bookmark(state, table_name, 'version') is None # last run was interrupted if there is a last_id_fetched bookmark was_interrupted = singer.get_bookmark(state, table_name, 'last_evaluated_key') is not None #pick a new table version if last run wasn't interrupted if was_interrupted: stream_version = singer.get_bookmark(state, table_name, 'version') else: stream_version = int(time.time() * 1000) state = singer.write_bookmark(state, table_name, 'version', stream_version) singer.write_state(state) # For the initial replication, emit an ACTIVATE_VERSION message # at the beginning so the records show up right away. if first_run: singer.write_version(table_name, stream_version) mdata = metadata.to_map(stream['metadata']) queries = metadata.get(mdata, (), "queries") rows_saved = 0 deserializer = Deserializer() for result in query_table(table_name, queries, config): for item in result.get('Items', []): rows_saved += 1 # TODO: Do we actually have to put the item we retreive from # dynamo into a map before we can deserialize? record = deserializer.deserialize_item(item) record_message = singer.RecordMessage(stream=table_name, record=record, version=stream_version) singer.write_message(record_message) if result.get('LastEvaluatedKey'): state = singer.write_bookmark(state, table_name, 'last_evaluated_key', result.get('LastEvaluatedKey')) singer.write_state(state) state = singer.clear_bookmark(state, table_name, 'last_evaluated_key') state = singer.write_bookmark(state, table_name, 'initial_full_table_complete', True) singer.write_state(state) singer.write_version(table_name, stream_version) return rows_saved
def test_extraction_time_strftime(self):
    """
    Test that we're not corrupting timestamps with cross platform parsing.
    (Test case for OSX, specifically)
    """
    extracted = dateutil.parser.parse("1970-01-02T00:00:00.000Z")
    message = singer.RecordMessage(record={'name': 'foo'},
                                   stream='users',
                                   version=2,
                                   time_extracted=extracted)
    self.assertEqual(message.asdict()["time_extracted"],
                     "1970-01-02T00:00:00.000000Z")
def do_sync(self):
    """Sync daily product ranks in 28-day windows from the bookmark date.

    Inapp products are skipped because the ranks endpoint cannot serve
    them; the bookmark advances to the newest rank date seen.
    """
    start_date = self.bookmark_date
    new_bookmark_date = self.bookmark_date
    # Ranks cannot be fetched for inapp
    product_ids = ','.join([
        str(id) for i, id in enumerate(self.product_ids)
        if self.product_types[i] != "inapp"
    ])
    if any(
        [product_type == "inapp" for product_type in self.product_types]):
        LOGGER.info(
            "Skipping id={} since ranks cannot be fetched for inapp purchases."
            .format(','.join([
                str(id) for i, id in enumerate(self.product_ids)
                if self.product_types[i] == "inapp"
            ])))
    while start_date.date() <= date.today():
        end_date = start_date + timedelta(days=28)
        uri = '/ranks/{}/daily/{}/{}'.format(
            product_ids, start_date.strftime('%Y-%m-%d'),
            end_date.strftime('%Y-%m-%d'))
        data = self.client.make_request(uri).json()
        rank_dates = data['dates']
        rank_data = data['data']
        with singer.metrics.Counter('record_count',
                                    {'endpoint': 'ranks'}) as counter:
            for rank_entry in rank_data:
                # Each entry carries parallel positions/deltas arrays,
                # one element per date in rank_dates.
                for i, rank_date in enumerate(rank_dates):
                    record = RankRecord(
                        dict(
                            country=rank_entry['country'],
                            category=rank_entry['category'],
                            product_id=rank_entry['product_id'],
                            position=rank_entry['positions'][i],
                            delta=rank_entry['deltas'][i],
                            date=rank_date,
                        ), self.schema)
                    new_bookmark_date = max(new_bookmark_date,
                                            record.bookmark)
                    singer.write_message(
                        singer.RecordMessage(
                            stream=self.STREAM_NAME,
                            record=record.for_export,
                        ))
                    counter.increment()
        self.state = singer.write_bookmark(self.state, self.STREAM_NAME,
                                           'last_record',
                                           date_to_str(new_bookmark_date))
        start_date = end_date
def test_parse_message_record_with_version_good(self):
    """A RECORD message carrying a version parses into an equal RecordMessage."""
    parsed = singer.parse_message(
        '{"type": "RECORD", "record": {"name": "foo"}, "stream": "users", "version": 2}'
    )
    expected = singer.RecordMessage(record={'name': 'foo'},
                                    stream='users',
                                    version=2)
    self.assertEqual(parsed, expected)
def row_to_record(catalog_entry, version, row, columns, time_extracted):
    """Wrap a DB row in a RecordMessage, ISO-formatting datetimes as Zulu."""
    values = []
    for elem in row:
        if isinstance(elem, datetime.datetime):
            elem = elem.isoformat('T') + 'Z'
        values.append(elem)

    return singer.RecordMessage(stream=catalog_entry.stream,
                                record=dict(zip(columns, values)),
                                version=version,
                                time_extracted=time_extracted)
def resume_syncing_bulk_query(sf, catalog_entry, job_id, state, counter):
    """Resume a previously-started Salesforce Bulk API job.

    Iterates the batch IDs remaining in state, emitting records and
    advancing the JobHighestBookmarkSeen bookmark; each fully-synced
    batch is removed from state so an interrupted run can resume.
    Returns the (mutated) record counter.
    """
    bulk = Bulk(sf)
    current_bookmark = singer.get_bookmark(
        state, catalog_entry['tap_stream_id'],
        'JobHighestBookmarkSeen') or sf.get_start_date(state, catalog_entry)
    current_bookmark = singer_utils.strptime_with_tz(current_bookmark)
    batch_ids = singer.get_bookmark(state, catalog_entry['tap_stream_id'],
                                    'BatchIDs')

    start_time = singer_utils.now()
    stream = catalog_entry['stream']
    stream_id = catalog_entry['tap_stream_id']
    stream_alias = catalog_entry.get('stream_alias')
    catalog_metadata = metadata.to_map(catalog_entry.get('metadata'))
    replication_key = catalog_metadata.get((), {}).get('replication-key')
    stream_version = get_stream_version(catalog_entry, state)
    schema = catalog_entry['schema']

    if not bulk.job_exists(job_id):
        LOGGER.info(
            "Found stored Job ID that no longer exists, resetting bookmark and removing JobID from state."
        )
        return counter

    # Iterate over the remaining batches, removing them once they are synced
    for batch_id in batch_ids[:]:
        with Transformer(pre_hook=transform_bulk_data_hook) as transformer:
            for rec in bulk.get_batch_results(job_id, batch_id, catalog_entry):
                counter.increment()
                rec = transformer.transform(rec, schema)
                rec = fix_record_anytype(rec, schema)
                singer.write_message(
                    singer.RecordMessage(stream=(stream_id or stream_alias
                                                 or stream),
                                         record=rec,
                                         version=stream_version,
                                         time_extracted=start_time))

                # Update bookmark if necessary: only values at or before
                # this sync's start time may advance it (later changes
                # belong to the next run).
                replication_key_value = replication_key and singer_utils.strptime_with_tz(
                    rec[replication_key])
                if replication_key_value and replication_key_value <= start_time and replication_key_value > current_bookmark:
                    current_bookmark = singer_utils.strptime_with_tz(
                        rec[replication_key])

        state = singer.write_bookmark(state, catalog_entry['tap_stream_id'],
                                      'JobHighestBookmarkSeen',
                                      singer_utils.strftime(current_bookmark))
        batch_ids.remove(batch_id)
        # Fix: this log string previously contained a raw newline that
        # broke the literal (unterminated string).
        LOGGER.info("Finished syncing batch %s. Removing batch from state.",
                    batch_id)
        LOGGER.info("Batches to go: %d", len(batch_ids))
        singer.write_state(state)

    return counter
def sync_table_file(config, s3_path, table_spec, stream, last_modified):
    """Sync one CSV file from S3; returns the number of records emitted.

    Each record is augmented with _sdc source-bucket/file/line-number
    columns before being transformed and written with a sequence based on
    *last_modified*.
    """
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config["bucket"]
    table_name = table_spec["table_name"]
    s3_file_handle = s3.get_file_handle(config, s3_path)

    # We observed data who's field size exceeded the default maximum of
    # 131072. We believe the primary consequence of the following setting
    # is that a malformed, wide CSV would potentially parse into a single
    # large field rather than giving this error, but we also think the
    # chances of that are very small and at any rate the source data would
    # need to be fixed. The other consequence of this could be larger
    # memory consumption but that's acceptable as well.
    csv.field_size_limit(sys.maxsize)

    # Optional pluggable encodings module; fall back to singer_encodings.csv.
    encoding_module = singer_encodings_csv
    if 'encoding_module' in config:
        try:
            encoding_module = importlib.import_module(
                config['encoding_module'])
        except ModuleNotFoundError:
            LOGGER.warning(
                f'Failed to load encoding module [{config["encoding_module"]}]. Defaulting to [singer_encodings.csv]'
            )

    iterator = encoding_module.get_row_iterator(
        s3_file_handle._raw_stream, table_spec)  # pylint:disable=protected-access

    records_synced = 0
    # Hoisted out of the loop: one Transformer and one metadata map for the
    # whole file (previously both were rebuilt per row), matching the
    # pattern used by the other sync_* functions in this codebase.
    mdata = metadata.to_map(stream["metadata"])
    with Transformer() as transformer:
        for row in iterator:
            custom_columns = {
                s3.SDC_SOURCE_BUCKET_COLUMN: bucket,
                s3.SDC_SOURCE_FILE_COLUMN: s3_path,
                # index zero, +1 for header row
                s3.SDC_SOURCE_LINENO_COLUMN: records_synced + 2,
            }
            rec = {**row, **custom_columns}
            to_write = transformer.transform(rec, stream["schema"], mdata)
            to_write_with_sequence = RecordMessageWithSequence(
                singer.RecordMessage(stream=table_name, record=to_write),
                last_modified)
            singer.write_message(to_write_with_sequence)
            records_synced += 1

    return records_synced
def row_to_singer_record(catalog_entry, version, db_column_map, row,
                         time_extracted):
    """Convert a binlog row dict into a singer.RecordMessage.

    Handles MySQL type quirks: TIMESTAMP columns are converted from local
    time to UTC, JSON columns are re-serialized, raw bytes are hex-encoded,
    and boolean-typed columns (including BIT) are coerced to bools.
    """
    row_to_persist = {}
    for column_name, val in row.items():
        property_type = catalog_entry.schema.properties[column_name].type
        db_column_type = db_column_map.get(column_name)

        if isinstance(val, datetime.datetime):
            if db_column_type in MYSQL_TIMESTAMP_TYPES:
                # The mysql-replication library creates datetimes from TIMESTAMP columns using fromtimestamp which
                # will use the local timezone thus we must set tzinfo accordingly See:
                # https://github.com/noplay/python-mysql-replication/blob/master/pymysqlreplication/row_event.py#L143
                # -L145
                timezone = tzlocal.get_localzone()
                local_datetime = timezone.localize(val)
                utc_datetime = local_datetime.astimezone(pytz.UTC)
                row_to_persist[column_name] = utc_datetime.isoformat()
            else:
                # DATETIME columns carry no zone; assume UTC.
                row_to_persist[column_name] = val.isoformat() + '+00:00'

        elif isinstance(val, datetime.date):
            # Bare dates become midnight-UTC timestamps.
            row_to_persist[column_name] = val.isoformat() + 'T00:00:00+00:00'

        elif isinstance(val, datetime.timedelta):
            # TIME values: render as an offset from the Unix epoch.
            timedelta_from_epoch = datetime.datetime.utcfromtimestamp(0) + val
            row_to_persist[column_name] = timedelta_from_epoch.isoformat(
            ) + '+00:00'

        elif db_column_type == FIELD_TYPE.JSON:
            row_to_persist[column_name] = json.dumps(json_bytes_to_string(val))

        elif isinstance(val, bytes):
            # encode bytes as hex bytes then to utf8 string
            row_to_persist[column_name] = codecs.encode(val,
                                                        'hex').decode('utf-8')

        elif 'boolean' in property_type or property_type == 'boolean':
            if val is None:
                boolean_representation = None
            elif val == 0:
                boolean_representation = False
            elif db_column_type == FIELD_TYPE.BIT:
                # BIT values: any non-zero bit pattern counts as True.
                boolean_representation = int(val) != 0
            else:
                boolean_representation = True
            row_to_persist[column_name] = boolean_representation

        else:
            row_to_persist[column_name] = val

    return singer.RecordMessage(stream=catalog_entry.stream,
                                record=row_to_persist,
                                version=version,
                                time_extracted=time_extracted)
def test_parse_message_record_aware_extraction_time(self):
    """A RECORD message with a tz-aware time_extracted parses correctly."""
    message = singer.parse_message(
        '{"type": "RECORD", "record": {"name": "foo"}, "stream": "users", "version": 2, "time_extracted": "1970-01-02T00:00:00.000Z"}')
    expected = singer.RecordMessage(
        record={'name': 'foo'},
        stream='users',
        version=2,
        time_extracted=dateutil.parser.parse("1970-01-02T00:00:00.000Z"))
    # Removed leftover debug print() calls that cluttered test output.
    self.assertEqual(message, expected)
def sync(self):
    """
    Perform sync action
    These steps are the same for all streams
    Differences between streams are implemented by overriding .do_sync()
    method

    Returns True when the stream is fully synced, False when it stopped
    early at the batch boundary (the runner should call again).
    """
    if not self.KEEP_IDS and not self.include_stream:
        LOGGER.info('Skipping stream %s - excluded in catalog',
                    self.STREAM_NAME)
        return

    new_bookmark_date = self.bookmark_date = self.starting_bookmark_date()
    # amazon doesn't guarantee that all orders created after the createdafter data that you specify will be returned

    # Will be set to false if we stop early due to reaching the end of a batch
    # to tell the runner to continue with the next batch
    all_done = True

    singer.write_schema(self.STREAM_NAME, self.schema, self.key_properties)
    rows = self.request_list()
    self.ids = []
    with singer.metrics.Counter('record_count',
                                {'endpoint': self.STREAM_NAME}) as counter:
        for row in rows:
            row_as_dict = self.row_to_dict(row)

            # Collect ids for dependent streams when configured.
            if self.KEEP_IDS:
                self.ids.append(row_as_dict[self.ID_FIELD])

            self.remove_excluded_fields(row_as_dict)

            message = singer.RecordMessage(
                stream=self.STREAM_NAME,
                record=row_as_dict,
                time_extracted=singer.utils.now())
            if self.include_stream:
                singer.write_message(message)

            if self.BOOKMARK_FIELD:
                new_bookmark_date = max(new_bookmark_date,
                                        row_as_dict[self.BOOKMARK_FIELD])

            counter.increment()

            # Stop if we've done enough for one batch
            if self.BATCH_SIZE and counter.value >= self.BATCH_SIZE:
                # Sync action stopped due to end of batch - so probably more rows
                # Note that there is a 1/BATCH_SIZE chance that the end of a
                # batch is exactly the end of the whole process. In that case
                # the runner will make one more .sync request, for one more (empty) batch
                all_done = False
                break

    if self.BOOKMARK_FIELD:
        singer.write_bookmark(self.state, self.STREAM_NAME,
                              self.BOOKMARK_FIELD, new_bookmark_date)

    return all_done
def for_each_role(role, time_extracted, stream_version=None):
    """Emit one user_roles pivot record per user attached to *role*."""
    # Extract user_roles
    load_and_write_schema("user_roles", key_properties=["user_id", "role_id"])

    for user_id in role['user_ids']:
        singer.write_message(
            singer.RecordMessage(stream="user_roles",
                                 record={
                                     'role_id': role['id'],
                                     'user_id': user_id,
                                 },
                                 version=stream_version,
                                 time_extracted=time_extracted))
def row_to_singer_record(stream, row, version, time_extracted):
    """Transform a MongoDB document into a RecordMessage.

    MinKey/MaxKey values are dropped (they have no JSON representation).
    """
    # pylint: disable=unidiomatic-typecheck
    excluded_types = [bson.min_key.MinKey, bson.max_key.MaxKey]
    try:
        row_to_persist = {}
        for key, value in row.items():
            if type(value) in excluded_types:
                continue
            row_to_persist[key] = transform_value(value, [key])
    except MongoInvalidDateTimeException as ex:
        raise Exception("Error syncing collection {}, object ID {} - {}".format(stream["tap_stream_id"], row['_id'], ex))

    return singer.RecordMessage(
        stream=calculate_destination_stream_name(stream),
        record=row_to_persist,
        version=version,
        time_extracted=time_extracted)
def row_to_singer_record(catalog_entry, version, row, columns, time_extracted):
    """Transform SQL row to singer compatible record message

    Temporal values become UTC ISO strings, bytes become hex (or bool for
    BIT-backed boolean columns), booleans and JSON objects are coerced per
    the schema; everything else passes through.
    """
    row_to_persist = ()
    for idx, elem in enumerate(row):
        property_type = catalog_entry.schema.properties[columns[idx]].type
        # datetime must be tested before date (datetime subclasses date).
        if isinstance(elem, datetime.datetime):
            row_to_persist += (elem.isoformat() + '+00:00', )

        elif isinstance(elem, datetime.date):
            # Bare dates become midnight-UTC timestamps.
            row_to_persist += (elem.isoformat() + 'T00:00:00+00:00', )

        elif isinstance(elem, datetime.timedelta):
            # TIME values: render as an offset from the Unix epoch.
            epoch = datetime.datetime.utcfromtimestamp(0)
            timedelta_from_epoch = epoch + elem
            row_to_persist += (timedelta_from_epoch.isoformat() + '+00:00', )

        elif isinstance(elem, datetime.time):
            row_to_persist += (str(elem), )

        elif isinstance(elem, bytes):
            # for BIT value, treat 0 as False and anything else as True
            if 'boolean' in property_type:
                boolean_representation = elem != b'\x00'
                row_to_persist += (boolean_representation, )
            else:
                # Non-boolean bytes are emitted as a hex string.
                row_to_persist += (elem.hex(), )

        elif 'boolean' in property_type or property_type == 'boolean':
            if elem is None:
                boolean_representation = None
            elif elem == 0:
                boolean_representation = False
            else:
                boolean_representation = True
            row_to_persist += (boolean_representation, )

        elif 'object' in property_type or property_type == 'object':
            # JSON columns arrive as text; decode to a dict (None stays None).
            obj_rep = None
            if elem:
                obj_rep = json.loads(elem)
            row_to_persist += (obj_rep, )

        else:
            row_to_persist += (elem, )
    rec = dict(zip(columns, row_to_persist))

    return singer.RecordMessage(stream=catalog_entry.stream,
                                record=rec,
                                version=version,
                                time_extracted=time_extracted)
def sync_shard(shard, seq_number_bookmarks, streams_client, stream_arn,
               projection, deserializer, table_name, stream_version, state):
    """Sync one DynamoDB stream shard; returns the number of rows emitted.

    REMOVE events are emitted as the record's keys plus an _sdc deleted-at
    timestamp; other events emit the NewImage.  Per-shard sequence numbers
    are bookmarked in state and flushed every 100 rows.
    """
    seq_number = seq_number_bookmarks.get(shard['ShardId'])

    rows_synced = 0

    for record in get_shard_records(streams_client, stream_arn, shard,
                                    seq_number):
        if record['eventName'] == 'REMOVE':
            record_message = deserializer.deserialize_item(
                record['dynamodb']['Keys'])
            record_message[SDC_DELETED_AT] = singer.utils.strftime(
                record['dynamodb']['ApproximateCreationDateTime'])
        else:
            record_message = deserializer.deserialize_item(
                record['dynamodb'].get('NewImage'))
            if record_message is None:
                # NewImage missing: the stream is not configured with a
                # view type this tap can use.
                LOGGER.fatal(
                    'Dynamo stream view type must be either "NEW_IMAGE" "NEW_AND_OLD_IMAGES"'
                )
                raise RuntimeError(
                    'Dynamo stream view type must be either "NEW_IMAGE" "NEW_AND_OLD_IMAGES"'
                )
            if projection is not None and projection != '':
                try:
                    record_message = deserializer.apply_projection(
                        record_message, projection)
                except:
                    LOGGER.fatal("Projection failed to apply: %s", projection)
                    raise RuntimeError(
                        'Projection failed to apply: {}'.format(projection))

        record_message = singer.RecordMessage(stream=table_name,
                                              record=record_message,
                                              version=stream_version)
        singer.write_message(record_message)

        rows_synced += 1

        seq_number_bookmarks[
            shard['ShardId']] = record['dynamodb']['SequenceNumber']
        state = singer.write_bookmark(state, table_name, 'shard_seq_numbers',
                                      seq_number_bookmarks)

        # Every 100 rows write the state
        if rows_synced % 100 == 0:
            singer.write_state(state)

    singer.write_state(state)
    return rows_synced
def selected_row_to_singer_message(stream, row, version, columns,
                                   time_extracted, md_map):
    """Clean each row value by its sql-datatype and wrap in a RecordMessage."""
    cleaned = tuple(
        selected_value_to_singer_value(
            elem, md_map.get(('properties', columns[idx]))['sql-datatype'])
        for idx, elem in enumerate(row))

    return singer.RecordMessage(
        stream=calculate_destination_stream_name(stream, md_map),
        record=dict(zip(columns, cleaned)),
        version=version,
        time_extracted=time_extracted)