def sync(config, state, stream):
    table_name = stream['tap_stream_id']

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, table_name, 'version') is None

    # last run was interrupted if there is a last_evaluated_key bookmark
    was_interrupted = singer.get_bookmark(state,
                                          table_name,
                                          'last_evaluated_key') is not None

    # pick a new table version if last run wasn't interrupted
    if was_interrupted:
        stream_version = singer.get_bookmark(state, table_name, 'version')
    else:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, table_name, 'version', stream_version)
    singer.write_state(state)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_version(table_name, stream_version)

    last_evaluated_key = singer.get_bookmark(state, table_name, 'last_evaluated_key')

    md_map = metadata.to_map(stream['metadata'])
    projection = metadata.get(md_map, (), 'tap-mongodb.projection')

    rows_saved = 0

    deserializer = Deserializer()
    for result in scan_table(table_name, projection, last_evaluated_key, config):
        for item in result.get('Items', []):
            rows_saved += 1
            # TODO: Do we actually have to put the item we retrieve from
            # dynamo into a map before we can deserialize?
            record = deserializer.deserialize_item(item)
            record_message = singer.RecordMessage(stream=table_name,
                                                  record=record,
                                                  version=stream_version)
            singer.write_message(record_message)
        if result.get('LastEvaluatedKey'):
            state = singer.write_bookmark(state, table_name, 'last_evaluated_key',
                                          result.get('LastEvaluatedKey'))
            singer.write_state(state)

    state = singer.clear_bookmark(state, table_name, 'last_evaluated_key')
    state = singer.write_bookmark(state, table_name, 'initial_full_table_complete', True)
    singer.write_state(state)

    singer.write_version(table_name, stream_version)

    return rows_saved
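
# A minimal sketch of what the scan_table generator used above could look like.
# It assumes dynamodb.get_client(config) returns a boto3 DynamoDB client and
# that the projection metadata is a DynamoDB ProjectionExpression string; the
# projection handling here is simplified for illustration.
def scan_table(table_name, projection, last_evaluated_key, config):
    scan_params = {
        'TableName': table_name,
        'Limit': 1000
    }

    if projection is not None and projection != '':
        scan_params['ProjectionExpression'] = projection

    if last_evaluated_key is not None:
        scan_params['ExclusiveStartKey'] = last_evaluated_key

    client = dynamodb.get_client(config)

    has_more = True
    while has_more:
        result = client.scan(**scan_params)
        yield result

        # Keep paging until DynamoDB stops returning a LastEvaluatedKey
        if result.get('LastEvaluatedKey'):
            scan_params['ExclusiveStartKey'] = result['LastEvaluatedKey']
        has_more = result.get('LastEvaluatedKey', False)
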
def do_discover(sf):
    """Describes a Salesforce instance's objects and generates a JSON schema for each field."""
    global_description = sf.describe()

    objects_to_discover = {o['name'] for o in global_description['sobjects']}
    key_properties = ['Id']

    sf_custom_setting_objects = []
    object_to_tag_references = {}

    # For each SF Object describe it, loop its fields and build a schema
    entries = []

    for sobject_name in objects_to_discover:

        # Skip blacklisted SF objects depending on the api_type in use
        if sobject_name in sf.get_blacklisted_objects():
            continue

        sobject_description = sf.describe(sobject_name)

        # Cache customSetting and Tag objects to check for blacklisting after
        # all objects have been described
        if sobject_description.get("customSetting"):
            sf_custom_setting_objects.append(sobject_name)
        elif sobject_name.endswith("__Tag"):
            relationship_field = next(
                (f for f in sobject_description["fields"]
                 if f.get("relationshipName") == "Item"),
                None)
            if relationship_field:
                # Map {"Object":"Object__Tag"}
                object_to_tag_references[relationship_field["referenceTo"][0]] = sobject_name

        fields = sobject_description['fields']
        replication_key = get_replication_key(sobject_name, fields)

        unsupported_fields = set()
        properties = {}
        mdata = metadata.new()

        found_id_field = False

        # Loop over the object's fields
        for f in fields:
            field_name = f['name']

            if field_name == "Id":
                found_id_field = True

            property_schema, mdata = create_property_schema(f, mdata)

            # Compound Address fields cannot be queried by the Bulk API
            if f['type'] == "address" and sf.api_type == tap_salesforce.salesforce.BULK_API_TYPE:
                unsupported_fields.add(
                    (field_name, 'cannot query compound address fields with bulk API'))

            # Blacklisted fields are dependent on the api_type being used
            field_pair = (sobject_name, field_name)
            if field_pair in sf.get_blacklisted_fields():
                unsupported_fields.add(
                    (field_name, sf.get_blacklisted_fields()[field_pair]))

            inclusion = metadata.get(mdata, ('properties', field_name), 'inclusion')
            if sf.select_fields_by_default and inclusion != 'unsupported':
                mdata = metadata.write(mdata, ('properties', field_name),
                                       'selected-by-default', True)

            properties[field_name] = property_schema

        if replication_key:
            mdata = metadata.write(mdata, ('properties', replication_key),
                                   'inclusion', 'automatic')

        # There are cases where compound fields are referenced by the associated
        # subfields but are not actually present in the field list
        field_name_set = {f['name'] for f in fields}
        filtered_unsupported_fields = [f for f in unsupported_fields
                                       if f[0] in field_name_set]
        missing_unsupported_field_names = [f[0] for f in unsupported_fields
                                           if f[0] not in field_name_set]

        if missing_unsupported_field_names:
            LOGGER.info(
                "Ignoring the following unsupported fields for object %s as they are missing from the field list: %s",
                sobject_name,
                ', '.join(sorted(missing_unsupported_field_names)))

        if filtered_unsupported_fields:
            LOGGER.info(
                "Not syncing the following unsupported fields for object %s: %s",
                sobject_name,
                ', '.join(sorted([k for k, _ in filtered_unsupported_fields])))

        # Salesforce Objects are skipped when they do not have an Id field
        if not found_id_field:
            LOGGER.info("Skipping Salesforce Object %s, as it has no Id field", sobject_name)
            continue

        # Any property added to unsupported_fields has its metadata marked as
        # unsupported and its selected-by-default flag removed
        for prop, description in filtered_unsupported_fields:
            if metadata.get(mdata, ('properties', prop), 'selected-by-default'):
                metadata.delete(mdata, ('properties', prop), 'selected-by-default')

            mdata = metadata.write(mdata, ('properties', prop),
                                   'unsupported-description', description)
            mdata = metadata.write(mdata, ('properties', prop),
                                   'inclusion', 'unsupported')

        if replication_key:
            mdata = metadata.write(mdata, (), 'valid-replication-keys', [replication_key])
        else:
            mdata = metadata.write(
                mdata, (), 'forced-replication-method', {
                    'replication-method': 'FULL_TABLE',
                    'reason': 'No replication keys found from the Salesforce API'
                })

        mdata = metadata.write(mdata, (), 'table-key-properties', key_properties)

        schema = {
            'type': 'object',
            'additionalProperties': False,
            'properties': properties
        }

        entry = {
            'stream': sobject_name,
            'tap_stream_id': sobject_name,
            'schema': schema,
            'metadata': metadata.to_list(mdata)
        }

        entries.append(entry)

    # For each custom setting object, remove its associated tag from entries
    # See Blacklisting.md for more information
    unsupported_tag_objects = [object_to_tag_references[f]
                               for f in sf_custom_setting_objects
                               if f in object_to_tag_references]
    if unsupported_tag_objects:
        LOGGER.info(  # pylint:disable=logging-not-lazy
            "Skipping the following Tag objects, Tags on Custom Settings Salesforce objects " +
            "are not supported by the Bulk API:")
        LOGGER.info(unsupported_tag_objects)
        entries = [e for e in entries
                   if e['stream'] not in unsupported_tag_objects]

    result = {'streams': entries}
    json.dump(result, sys.stdout, indent=4)
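
# For illustration only: the rough shape of a single catalog entry emitted by
# do_discover above. The object name, fields, and values here are made up; the
# keys mirror the `entry` dict built in the loop and singer-python's
# metadata.to_list() output.
EXAMPLE_DISCOVERED_ENTRY = {
    'stream': 'Account',
    'tap_stream_id': 'Account',
    'schema': {
        'type': 'object',
        'additionalProperties': False,
        'properties': {
            'Id': {'type': 'string'},
            'SystemModstamp': {'type': 'string', 'format': 'date-time'}
        }
    },
    'metadata': [
        {'breadcrumb': (),
         'metadata': {'table-key-properties': ['Id'],
                      'valid-replication-keys': ['SystemModstamp']}},
        {'breadcrumb': ('properties', 'SystemModstamp'),
         'metadata': {'inclusion': 'automatic'}}
    ]
}
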
def sync_collection(client, stream, state, projection):
    tap_stream_id = stream['tap_stream_id']
    LOGGER.info('Starting full table sync for %s', tap_stream_id)

    md_map = metadata.to_map(stream['metadata'])
    database_name = metadata.get(md_map, (), 'database-name')

    db = client[database_name]
    collection = db[stream['stream']]

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'], 'version') is None

    # last run was interrupted if there is a last_id_fetched bookmark
    was_interrupted = singer.get_bookmark(state,
                                          stream['tap_stream_id'],
                                          'last_id_fetched') is not None

    # pick a new table version if last run wasn't interrupted
    if was_interrupted:
        stream_version = singer.get_bookmark(state, stream['tap_stream_id'], 'version')
    else:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version', stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_message(activate_version_message)

    if singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_value'):
        # There is a bookmark
        max_id_value = singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_value')
        max_id_type = singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_type')
        max_id_value = common.string_to_class(max_id_value, max_id_type)
    else:
        max_id_value = get_max_id_value(collection)

    last_id_fetched = singer.get_bookmark(state, stream['tap_stream_id'], 'last_id_fetched')

    if max_id_value:
        # Write the bookmark if max_id_value is defined
        state = singer.write_bookmark(
            state, stream['tap_stream_id'], 'max_id_value',
            common.class_to_string(max_id_value, max_id_value.__class__.__name__))
        state = singer.write_bookmark(state, stream['tap_stream_id'], 'max_id_type',
                                      max_id_value.__class__.__name__)

    find_filter = {'$lte': max_id_value}

    if last_id_fetched:
        last_id_fetched_type = singer.get_bookmark(state,
                                                   stream['tap_stream_id'],
                                                   'last_id_fetched_type')
        find_filter['$gte'] = common.string_to_class(last_id_fetched, last_id_fetched_type)

    query_message = 'Querying {} with:\n\tFind Parameters: {}'.format(
        stream['tap_stream_id'], find_filter)
    if projection:
        query_message += '\n\tProjection: {}'.format(projection)
    # pylint: disable=logging-format-interpolation
    LOGGER.info(query_message)

    with collection.find({'_id': find_filter},
                         projection,
                         sort=[("_id", pymongo.ASCENDING)]) as cursor:
        rows_saved = 0
        time_extracted = utils.now()
        start_time = time.time()

        schema = {"type": "object", "properties": {}}
        for row in cursor:
            rows_saved += 1

            schema_build_start_time = time.time()
            if common.row_to_schema(schema, row):
                singer.write_message(singer.SchemaMessage(
                    stream=common.calculate_destination_stream_name(stream),
                    schema=schema,
                    key_properties=['_id']))
                common.SCHEMA_COUNT[stream['tap_stream_id']] += 1
            common.SCHEMA_TIMES[stream['tap_stream_id']] += time.time() - schema_build_start_time

            record_message = common.row_to_singer_record(stream,
                                                         row,
                                                         stream_version,
                                                         time_extracted)
            singer.write_message(record_message)

            state = singer.write_bookmark(
                state, stream['tap_stream_id'], 'last_id_fetched',
                common.class_to_string(row['_id'], row['_id'].__class__.__name__))
            state = singer.write_bookmark(state, stream['tap_stream_id'],
                                          'last_id_fetched_type',
                                          row['_id'].__class__.__name__)

            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

        common.COUNTS[tap_stream_id] += rows_saved
        common.TIMES[tap_stream_id] += time.time() - start_time

    # clear max pk value and last pk fetched upon successful sync
    singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_value')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_type')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'last_id_fetched')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'last_id_fetched_type')

    state = singer.write_bookmark(state,
                                  stream['tap_stream_id'],
                                  'initial_full_table_complete',
                                  True)

    singer.write_message(activate_version_message)

    LOGGER.info('Synced {} records for {}'.format(rows_saved, tap_stream_id))
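
# A minimal sketch of what the get_max_id_value helper used above could look
# like, assuming the goal is simply the largest _id currently in the
# collection (pymongo's find_one with a descending sort on _id).
def get_max_id_value(collection):
    row = collection.find_one(sort=[("_id", pymongo.DESCENDING)])
    if row is None:
        LOGGER.info("No max id found for collection: collection is likely empty")
        return None
    return row['_id']
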
def sync_stream(stream_name):
    """
    Sync each stream, looking for newly created records. Updates are captured by events stream.
    """
    LOGGER.info("Started syncing stream %s", stream_name)

    stream_metadata = metadata.to_map(Context.get_catalog_entry(stream_name)['metadata'])
    stream_field_whitelist = json.loads(Context.config.get('whitelist_map', '{}')).get(stream_name)

    extraction_time = singer.utils.now()
    replication_key = metadata.get(stream_metadata, (), 'valid-replication-keys')[0]
    # Invoice Items bookmarks on `date`, but queries on `created`
    filter_key = 'created' if stream_name == 'invoice_items' else replication_key
    stream_bookmark = singer.get_bookmark(Context.state, stream_name, replication_key) or \
        int(utils.strptime_to_utc(Context.config["start_date"]).timestamp())
    bookmark = stream_bookmark

    # if this stream has a sub_stream, compare the bookmark
    sub_stream_name = SUB_STREAMS.get(stream_name)

    # If there is a sub-stream and it's selected, get its bookmark (or the start date if no bookmark)
    should_sync_sub_stream = sub_stream_name and Context.is_selected(sub_stream_name)
    if should_sync_sub_stream:
        sub_stream_bookmark = singer.get_bookmark(Context.state, sub_stream_name, replication_key) \
            or int(utils.strptime_to_utc(Context.config["start_date"]).timestamp())

        # if there is a sub stream, set bookmark to sub stream's bookmark
        # since we know it must be earlier than the stream's bookmark
        if sub_stream_bookmark != stream_bookmark:
            bookmark = sub_stream_bookmark
    else:
        sub_stream_bookmark = None

    with Transformer(singer.UNIX_SECONDS_INTEGER_DATETIME_PARSING) as transformer:
        end_time = dt_to_epoch(utils.now())

        window_size = int(Context.config.get('date_window_size', DEFAULT_DATE_WINDOW_SIZE))

        if DEFAULT_DATE_WINDOW_SIZE != window_size:
            LOGGER.info('Using non-default date window size of %d', window_size)
        start_window = bookmark

        # NB: Immutable streams are never synced for updates. We've
        # observed a short lag period between when records are created and
        # when they are available via the API, so these streams will need
        # a short lookback window.
        if stream_name in IMMUTABLE_STREAMS:
            # pylint:disable=fixme
            # TODO: This may be an issue for other streams' created_at
            # entries, but to keep the surface small, doing this only for
            # immutable streams at first to confirm the suspicion.
            start_window -= IMMUTABLE_STREAM_LOOKBACK

        # NB: We observed records coming through newest->oldest and so
        # date-windowing was added and the tap only bookmarks after it has
        # gotten through a date window
        while start_window < end_time:
            stop_window = dt_to_epoch(epoch_to_dt(start_window) + timedelta(days=window_size))
            # cut off the last window at the end time
            if stop_window > end_time:
                stop_window = end_time

            for stream_obj in paginate(STREAM_SDK_OBJECTS[stream_name]['sdk_object'],
                                       filter_key, start_window, stop_window):
                # get the replication key value from the object
                rec = unwrap_data_objects(stream_obj.to_dict_recursive())
                rec = reduce_foreign_keys(rec, stream_name)
                stream_obj_created = rec[replication_key]
                rec['updated'] = stream_obj_created

                # sync stream if object is greater than or equal to the bookmark
                if stream_obj_created >= stream_bookmark:
                    rec = transformer.transform(rec,
                                                Context.get_catalog_entry(stream_name)['schema'],
                                                stream_metadata)

                    # At this point, the record has been transformed and so
                    # any de-selected fields have been pruned. Now, prune off
                    # any fields that aren't present in the whitelist.
                    if stream_field_whitelist:
                        rec = apply_whitelist(rec, stream_field_whitelist)

                    singer.write_record(stream_name, rec, time_extracted=extraction_time)
                    Context.new_counts[stream_name] += 1

                # sync sub streams if it's selected and the parent object
                # is greater than its bookmark
                if should_sync_sub_stream and stream_obj_created > sub_stream_bookmark:
                    sync_sub_stream(sub_stream_name, stream_obj)

            # Update stream/sub-stream bookmarks as stop window
            if stop_window > stream_bookmark:
                stream_bookmark = stop_window
                singer.write_bookmark(Context.state, stream_name, replication_key, stream_bookmark)

            # the sub stream bookmarks on its parent
            if should_sync_sub_stream and stop_window > sub_stream_bookmark:
                sub_stream_bookmark = stop_window
                singer.write_bookmark(Context.state, sub_stream_name, replication_key,
                                      sub_stream_bookmark)

            singer.write_state(Context.state)

            # update window for next iteration
            start_window = stop_window

    singer.write_state(Context.state)
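
# Minimal sketches of the dt_to_epoch / epoch_to_dt helpers the date-window
# logic above relies on. These assume the bookmarks are Unix timestamps in
# seconds and treat everything as UTC.
from datetime import datetime, timezone


def epoch_to_dt(epoch_ts):
    # Convert a Unix timestamp (seconds) to an aware UTC datetime
    return datetime.fromtimestamp(epoch_ts, tz=timezone.utc)


def dt_to_epoch(dt):
    # Convert a datetime back to a Unix timestamp in whole seconds
    return int(dt.timestamp())
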
def is_selected(cls, stream_name):
    stream = cls.get_catalog_entry(stream_name)
    stream_metadata = metadata.to_map(stream['metadata'])
    return metadata.get(stream_metadata, (), 'selected')
def sync_log_based(config, state, stream):
    table_name = stream['tap_stream_id']

    client = dynamodb.get_client(config)
    streams_client = dynamodb.get_stream_client(config)

    md_map = metadata.to_map(stream['metadata'])
    projection = metadata.get(md_map, (), 'tap-mongodb.projection')
    if projection is not None:
        projection = [x.strip().split('.') for x in projection.split(',')]

    # Write activate version message
    stream_version = singer.get_bookmark(state, table_name, 'version')
    singer.write_version(table_name, stream_version)

    table = client.describe_table(TableName=table_name)['Table']
    stream_arn = table['LatestStreamArn']

    # get_bookmark returns None if no bookmark exists yet
    seq_number_bookmarks = singer.get_bookmark(state, table_name, 'shard_seq_numbers') or {}

    deserializer = deserialize.Deserializer()

    rows_saved = 0

    for shard in get_shards(streams_client, stream_arn):
        # check for bookmark
        seq_number = seq_number_bookmarks.get(shard['ShardId'])
        if seq_number:
            iterator_type = 'AFTER_SEQUENCE_NUMBER'
        else:
            iterator_type = 'TRIM_HORIZON'

        for record in get_shard_records(streams_client, stream_arn, shard,
                                        iterator_type, seq_number):
            if record['eventName'] == 'REMOVE':
                record_message = deserializer.deserialize_item(record['dynamodb']['Keys'])
                record_message[SDC_DELETED_AT] = singer.utils.strftime(
                    record['dynamodb']['ApproximateCreationDateTime'])
            else:
                record_message = deserializer.deserialize_item(record['dynamodb'].get('NewImage'))
                if record_message is None:
                    LOGGER.fatal('Dynamo stream view type must be either "NEW_IMAGE" or "NEW_AND_OLD_IMAGES"')
                    raise RuntimeError('Dynamo stream view type must be either "NEW_IMAGE" or "NEW_AND_OLD_IMAGES"')
                if projection is not None:
                    try:
                        record_message = deserializer.apply_projection(record_message, projection)
                    except Exception:
                        LOGGER.fatal("Projection failed to apply: %s",
                                     metadata.get(md_map, (), 'tap-mongodb.projection'))
                        raise RuntimeError('Projection failed to apply: {}'.format(
                            metadata.get(md_map, (), 'tap-mongodb.projection')))

            record_message = singer.RecordMessage(stream=table_name,
                                                  record=record_message,
                                                  version=stream_version)
            singer.write_message(record_message)

            rows_saved += 1

            seq_number_bookmarks[shard['ShardId']] = record['dynamodb']['SequenceNumber']
            state = singer.write_bookmark(state, table_name, 'shard_seq_numbers',
                                          seq_number_bookmarks)

            if rows_saved % WRITE_STATE_PERIOD == 0:
                singer.write_state(state)

        # If the shard we just finished syncing is closed (i.e. has an
        # EndingSequenceNumber), pop it off
        if shard['SequenceNumberRange'].get('EndingSequenceNumber'):
            # Must check if the bookmark exists because if a shard has 0
            # records we will never set a bookmark for the shard
            if seq_number_bookmarks.get(shard['ShardId']):
                seq_number_bookmarks.pop(shard['ShardId'])
                state = singer.write_bookmark(state, table_name, 'shard_seq_numbers',
                                              seq_number_bookmarks)

        singer.write_state(state)

    return rows_saved
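
# A minimal sketch of what the get_shards helper used above could look like:
# it pages through describe_stream on the boto3 DynamoDB Streams client,
# yielding each shard until no LastEvaluatedShardId is returned.
def get_shards(streams_client, stream_arn):
    params = {'StreamArn': stream_arn}

    has_more = True
    while has_more:
        stream_info = streams_client.describe_stream(**params)['StreamDescription']

        for shard in stream_info['Shards']:
            yield shard

        # Keep paging until the stream description is exhausted
        last_evaluated_shard_id = stream_info.get('LastEvaluatedShardId')
        if last_evaluated_shard_id:
            params['ExclusiveStartShardId'] = last_evaluated_shard_id
        has_more = last_evaluated_shard_id is not None
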
def stream_is_selected(stream):
    md_map = metadata.to_map(stream.metadata)
    selected_md = metadata.get(md_map, (), "selected")

    return selected_md
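
# Hypothetical usage: given a parsed Singer catalog (e.g. singer.catalog.Catalog),
# keep only the streams whose top-level metadata marks them as selected.
# `catalog` is not defined in this listing and is shown for illustration only.
selected_streams = [s for s in catalog.streams if stream_is_selected(s)]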