def sync(self):
    """Yield records for every parent, resuming at a bookmarked parent.

    This is a generator: nothing runs until it is iterated. Before each
    parent's records are fetched, that parent's id is written to the
    ``parent_id`` bookmark and state is emitted. The bookmark is cleared
    once all parents have been processed.
    """
    self.on_window_started()

    parent_stream = self.parent_class(self.client, self.config, self.state)

    # Get the most recent parent ID and resume from there, if necessary
    resume_from = singer.get_bookmark(self.state, self.stream_id, 'parent_id')
    sorted_parents = self._sort_parent_ids_by_created(
        self.get_parent_ids(parent_stream))
    pending_ids = [entry['id'] for entry in sorted_parents]

    if resume_from and resume_from in pending_ids:
        # NB: This will cause some rework, but it will guarantee the tap
        # doesn't miss records if interrupted.
        # - If there's too much data to sync all parents in a single run,
        #   this API is not appropriate for that data set.
        pending_ids = pending_ids[pending_ids.index(resume_from):]

    for current_id in pending_ids:
        singer.write_bookmark(self.state, self.stream_id, "parent_id",
                              current_id)
        singer.write_state(self.state)
        yield from self.get_records([current_id])

    singer.clear_bookmark(self.state, self.stream_id, "parent_id")
    self.on_window_finished()
def on_window_finished(self):
    """Store the ``window_end`` bookmark as ``last_record`` and emit state.

    The ``window_end`` bookmark is cleared afterwards.
    """
    state = self.state
    stream_id = self.stream_id

    finished_window_end = singer.get_bookmark(state, stream_id, 'window_end')
    singer.write_bookmark(state, stream_id, 'last_record',
                          finished_window_end)
    singer.clear_bookmark(state, stream_id, 'window_end')
    singer.write_state(state)
def sync_table(mysql_conn, catalog_entry, state, columns, stream_version):
    """Run a full-table sync of one stream, resumable when possible.

    Emits an ACTIVATE_VERSION message up front on the initial replication
    and always again once the query has been synced. When
    ``sync_is_resumable`` reports the sync as resumable, a primary-key
    clause is appended to the SELECT. A temp-table based query is then
    attempted; if that fails the previously built SELECT is used.
    """
    stream_id = catalog_entry.tap_stream_id
    common.whitelist_bookmark_keys(generate_bookmark_keys(catalog_entry),
                                   stream_id, state)

    stream_bookmarks = state.get('bookmarks', {}).get(stream_id, {})
    has_version_key = 'version' in stream_bookmarks
    full_table_done = singer.get_bookmark(state, stream_id,
                                          'initial_full_table_complete')
    bookmarked_version = singer.get_bookmark(state, stream_id, 'version')

    activate_version_message = singer.ActivateVersionMessage(
        stream=catalog_entry.stream, version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    version_explicitly_null = has_version_key and bookmarked_version is None
    if not (full_table_done or version_explicitly_null):
        singer.write_message(activate_version_message)

    resumable = sync_is_resumable(mysql_conn, catalog_entry)
    pk_clause = ""

    with connect_with_backoff(mysql_conn) as open_conn, \
            open_conn.cursor() as cur:
        select_sql = common.generate_select_sql(catalog_entry, columns)

        if resumable:
            LOGGER.info(
                "Full table sync is resumable based on primary key definition, will replicate incrementally"
            )
            state = update_incremental_full_table_state(
                catalog_entry, state, cur)
            pk_clause = generate_pk_clause(catalog_entry, state)

        select_sql += pk_clause

        # Best effort: on any failure the SELECT built above stays in use.
        try:
            select_sql = _create_temp_table(mysql_conn, catalog_entry,
                                            columns, pk_clause)
        except Exception as ex:
            logging.warning("creating temp table failed: {}".format(
                str(ex)))

        params = {}
        common.sync_query(cur, catalog_entry, state, select_sql, columns,
                          stream_version, params)

    # clear max pk value and last pk fetched upon successful sync
    for finished_key in ('max_pk_values', 'last_pk_fetched'):
        singer.clear_bookmark(state, stream_id, finished_key)

    singer.write_message(activate_version_message)
def whitelist_bookmark_keys(bookmark_key_set, tap_stream_id, state):
    """Clear every bookmark of a stream whose key is not whitelisted.

    Args:
        bookmark_key_set: Container of bookmark keys that may remain.
        tap_stream_id: Stream whose bookmarks are inspected.
        state: State dict; bookmarks are read from
            ``state['bookmarks'][tap_stream_id]`` when present.
    """
    stream_bookmarks = state.get('bookmarks', {}).get(tap_stream_id, {})

    # Snapshot the stale keys before clearing anything, so the bookmark
    # dict is not being iterated while clear_bookmark runs.
    stale_keys = [
        key for key in stream_bookmarks.keys()
        if key not in bookmark_key_set
    ]

    for stale_key in stale_keys:
        singer.clear_bookmark(state, tap_stream_id, stale_key)
def on_window_finished(self):
    """Advance the window: ``window_end`` becomes the new ``window_start``.

    The ``window_end`` bookmark is cleared and state is emitted.
    """
    state = self.state
    stream_id = self.stream_id

    # Set window_start to current window_end
    next_window_start = singer.get_bookmark(state, stream_id, "window_end")
    singer.write_bookmark(state, stream_id, "window_start",
                          next_window_start)
    singer.clear_bookmark(state, stream_id, "window_end")
    singer.write_state(state)
def sync_table(snowflake_conn, catalog_entry, state, columns):
    """Sync table incrementally.

    Resumes from the bookmarked ``replication_key_value`` when the
    catalog's replication key matches the bookmarked one; otherwise the
    key bookmark is rewritten and the value bookmark cleared. A stream
    version is bookmarked and an ACTIVATE_VERSION message emitted before
    querying.

    Args:
        snowflake_conn: Object exposing ``connect_with_backoff()``.
        catalog_entry: Catalog entry of the stream to sync.
        state: Singer state dict.
        columns: Columns handed to ``common.generate_select_sql``.
    """
    common.whitelist_bookmark_keys(BOOKMARK_KEYS,
                                   catalog_entry.tap_stream_id, state)

    catalog_metadata = metadata.to_map(catalog_entry.metadata)
    stream_metadata = catalog_metadata.get((), {})

    replication_key_metadata = stream_metadata.get('replication-key')
    replication_key_state = singer.get_bookmark(state,
                                                catalog_entry.tap_stream_id,
                                                'replication_key')

    replication_key_value = None
    if replication_key_metadata == replication_key_state:
        replication_key_value = singer.get_bookmark(
            state, catalog_entry.tap_stream_id, 'replication_key_value')
    else:
        state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                      'replication_key',
                                      replication_key_metadata)
        state = singer.clear_bookmark(state, catalog_entry.tap_stream_id,
                                      'replication_key_value')

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id,
                                               state)
    state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                  'version', stream_version)

    activate_version_message = singer.ActivateVersionMessage(
        stream=catalog_entry.stream, version=stream_version)
    singer.write_message(activate_version_message)

    with snowflake_conn.connect_with_backoff() as open_conn:
        with open_conn.cursor() as cur:
            # The query is built once, here; an earlier identical build
            # before the connection was opened was redundant.
            select_sql = common.generate_select_sql(catalog_entry, columns)
            params = {}

            if replication_key_value is not None:
                if catalog_entry.schema.properties[
                        replication_key_metadata].format == 'date-time':
                    replication_key_value = pendulum.parse(
                        replication_key_value)

                # NOTE(review): the bookmark value is interpolated into the
                # SQL text instead of being bound through ``params``.
                # pylint: disable=duplicate-string-formatting-argument
                select_sql += ' WHERE "{}" >= \'{}\' ORDER BY "{}" ASC'.format(
                    replication_key_metadata, replication_key_value,
                    replication_key_metadata)
            elif replication_key_metadata is not None:
                select_sql += ' ORDER BY "{}" ASC'.format(
                    replication_key_metadata)

            common.sync_query(cur, catalog_entry, state, select_sql, columns,
                              stream_version, params)
def sync(config, state, catalog):
    """Sync every selected stream of the catalog.

    For each stream: mark it as currently syncing, write its schema,
    transform and write every record yielded by the stream object, then
    clear the stream's ``cursor`` bookmark. ``currently_syncing`` is reset
    to ``None`` at the end.

    Args:
        config: Tap config; ``token`` and ``company_id`` are read.
        state: Singer state dict.
        catalog: Catalog providing ``get_selected_streams(state)``.
    """
    client = ZenefitsClient(config['token'])
    company_id = config['company_id']

    with Transformer() as transformer:
        for stream in catalog.get_selected_streams(state):
            tap_stream_id = stream.tap_stream_id
            stream_obj = STREAMS[tap_stream_id](client, state)
            stream_schema = stream.schema.to_dict()
            stream_metadata = metadata.to_map(stream.metadata)

            LOGGER.info('Starting sync for stream: %s', tap_stream_id)

            state = singer.set_currently_syncing(state, tap_stream_id)
            singer.write_state(state)

            singer.write_schema(tap_stream_id, stream_schema,
                                stream_obj.key_properties,
                                stream.replication_key)

            for record in stream_obj.sync(company_id=company_id):
                transformed_record = transformer.transform(
                    record, stream_schema, stream_metadata)
                # Lazy %-formatting: the record is only rendered when the
                # INFO level is actually enabled.
                LOGGER.info("Writing record: %s", transformed_record)
                singer.write_record(
                    tap_stream_id,
                    transformed_record,
                )

            state = singer.clear_bookmark(state, tap_stream_id, 'cursor')
            singer.write_state(state)

    state = singer.set_currently_syncing(state, None)
    singer.write_state(state)
def sync(self, state, stream_schema, stream_metadata, config, transformer):
    """Page through the stream, writing records at or after the bookmark.

    The time this sync started is kept in a ``sync_start`` bookmark and the
    pagination cursor in a ``cursor`` bookmark while paging. On completion
    both are cleared and the replication-key bookmark is set to the
    ``sync_start`` value.

    Returns:
        The updated state.
    """
    stream_id = self.tap_stream_id

    start_time = singer.get_bookmark(state, stream_id, self.replication_key,
                                     config['start_date'])

    # Default to "now" unless an earlier run already recorded sync_start.
    now_text = singer.utils.strftime(
        singer.utils.now(), format_str=singer.utils.DATETIME_PARSE)
    sync_start_bookmark = singer.get_bookmark(state, stream_id, 'sync_start',
                                              now_text)
    state = singer.write_bookmark(state, stream_id, 'sync_start',
                                  sync_start_bookmark)

    bookmarked_cursor = singer.get_bookmark(state, stream_id, 'cursor')

    pages = self.get_pages_safe(state, bookmarked_cursor, start_time)
    for page, cursor in pages:
        for record in page:
            if not record[self.replication_key] >= start_time:
                continue
            singer.write_record(
                stream_id,
                transformer.transform(record, stream_schema,
                                      stream_metadata),
            )
        state = singer.write_bookmark(state, stream_id, 'cursor', cursor)
        singer.write_state(state)

    state = singer.clear_bookmark(state, stream_id, 'sync_start')
    state = singer.clear_bookmark(state, stream_id, 'cursor')
    state = singer.write_bookmark(state, stream_id, self.replication_key,
                                  sync_start_bookmark)
    singer.write_state(state)
    return state
def do_sync_full_table(mysql_conn, catalog_entry, state, columns):
    """Write the schema, run the full-table sync, then emit final state.

    After the sync the ``version`` bookmark is cleared and
    ``initial_full_table_complete`` is set to ``True``.
    """
    stream_id = catalog_entry.tap_stream_id

    LOGGER.info("Stream %s is using full table replication",
                catalog_entry.stream)
    write_schema_message(catalog_entry)

    version = common.get_stream_version(stream_id, state)
    full_table.sync_table(mysql_conn, catalog_entry, state, columns, version)

    # Prefer initial_full_table_complete going forward
    singer.clear_bookmark(state, stream_id, "version")
    state = singer.write_bookmark(state, stream_id,
                                  "initial_full_table_complete", True)

    # Emit a deep copy of the state as the STATE message value.
    state_snapshot = copy.deepcopy(state)
    singer.write_message(singer.StateMessage(value=state_snapshot))
def sync_table(mysql_conn, catalog_entry, state, columns,
               original_state_file=''):
    """Sync one MySQL table incrementally on its replication key.

    Resumes from the bookmarked ``replication_key_value`` when the
    catalog's replication key matches the bookmarked one; otherwise the
    key bookmark is rewritten and the value bookmark cleared. Rows are
    always requested in ascending replication-key order when a key is
    configured, including on the first sync when no value is bookmarked.

    Args:
        mysql_conn: Connection handed to ``connect_with_backoff``.
        catalog_entry: Catalog entry of the stream to sync.
        state: Singer state dict.
        columns: Columns handed to ``common.generate_select_sql``.
        original_state_file: Forwarded unchanged to ``common.sync_query``.
    """
    common.whitelist_bookmark_keys(BOOKMARK_KEYS,
                                   catalog_entry.tap_stream_id, state)

    catalog_metadata = metadata.to_map(catalog_entry.metadata)
    stream_metadata = catalog_metadata.get((), {})

    replication_key_metadata = stream_metadata.get('replication-key')
    replication_key_state = singer.get_bookmark(state,
                                                catalog_entry.tap_stream_id,
                                                'replication_key')

    replication_key_value = None
    if replication_key_metadata == replication_key_state:
        replication_key_value = singer.get_bookmark(
            state, catalog_entry.tap_stream_id, 'replication_key_value')
    else:
        state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                      'replication_key',
                                      replication_key_metadata)
        state = singer.clear_bookmark(state, catalog_entry.tap_stream_id,
                                      'replication_key_value')

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id,
                                               state)
    state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                  'version', stream_version)

    # The destination stream name is "<database>_<stream>".
    activate_version_message = singer.ActivateVersionMessage(
        stream='%s_%s' % (common.get_database_name(catalog_entry),
                          catalog_entry.stream),
        version=stream_version)
    singer.write_message(activate_version_message)

    with connect_with_backoff(mysql_conn) as open_conn:
        with open_conn.cursor() as cur:
            select_sql = common.generate_select_sql(catalog_entry, columns)
            params = {}

            if replication_key_value is not None:
                if catalog_entry.schema.properties[
                        replication_key_metadata].format == 'date-time':
                    replication_key_value = pendulum.parse(
                        replication_key_value)

                select_sql += ' WHERE `{}` >= %(replication_key_value)s ORDER BY `{}` ASC'.format(
                    replication_key_metadata, replication_key_metadata)
                params['replication_key_value'] = replication_key_value
            elif replication_key_metadata is not None:
                # First sync for this key: still read in key order so the
                # rows arrive sorted by the replication key.
                select_sql += ' ORDER BY `{}` ASC'.format(
                    replication_key_metadata)

            common.sync_query(cur, catalog_entry, state, select_sql, columns,
                              stream_version, params, original_state_file)
def do_sync_full_table(mssql_conn, config, catalog_entry, state, columns):
    """Write the schema, run the full-table sync, then emit final state.

    After the sync the ``version`` bookmark is cleared and
    ``initial_full_table_complete`` is set to ``True``.

    Args:
        mssql_conn: Ignored; a new connection is built from ``config``.
        config: Tap config used to create the ``MSSQLConnection``.
        catalog_entry: Catalog entry of the stream to sync.
        state: Singer state dict.
        columns: Columns forwarded to ``full_table.sync_table``.
    """
    # NOTE(review): the connection argument is replaced unconditionally;
    # confirm callers do not rely on their own connection being used.
    mssql_conn = MSSQLConnection(config)

    write_schema_message(catalog_entry)

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id,
                                               state)

    full_table.sync_table(mssql_conn, config, catalog_entry, state, columns,
                          stream_version)

    # Prefer initial_full_table_complete going forward
    singer.clear_bookmark(state, catalog_entry.tap_stream_id, "version")

    state = singer.write_bookmark(
        state, catalog_entry.tap_stream_id, "initial_full_table_complete",
        True
    )

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
def sync_query(config, state, stream):
    """Sync all query results of one table and return the row count.

    A ``last_evaluated_key`` bookmark marks an interrupted run, in which
    case the bookmarked version is reused; otherwise a new millisecond
    timestamp version is chosen. The version is activated up front on the
    first run and always again at the end.

    Returns:
        Number of items written.
    """
    table_name = stream['tap_stream_id']

    # before writing the table version to state, check if we had one to
    # begin with
    bookmarked_version = singer.get_bookmark(state, table_name, 'version')
    first_run = bookmarked_version is None

    # last run was interrupted if there is a last_evaluated_key bookmark
    pending_key = singer.get_bookmark(state, table_name, 'last_evaluated_key')
    was_interrupted = pending_key is not None

    # pick a new table version if last run wasn't interrupted
    stream_version = (bookmarked_version if was_interrupted
                      else int(time.time() * 1000))

    state = singer.write_bookmark(state, table_name, 'version',
                                  stream_version)
    singer.write_state(state)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_version(table_name, stream_version)

    mdata = metadata.to_map(stream['metadata'])
    queries = metadata.get(mdata, (), "queries")

    deserializer = Deserializer()
    rows_saved = 0

    for result in query_table(table_name, queries, config):
        for item in result.get('Items', []):
            rows_saved += 1
            # TODO: Do we actually have to put the item we retrieve from
            # dynamo into a map before we can deserialize?
            singer.write_message(
                singer.RecordMessage(
                    stream=table_name,
                    record=deserializer.deserialize_item(item),
                    version=stream_version))

        if result.get('LastEvaluatedKey'):
            state = singer.write_bookmark(state, table_name,
                                          'last_evaluated_key',
                                          result.get('LastEvaluatedKey'))
            singer.write_state(state)

    state = singer.clear_bookmark(state, table_name, 'last_evaluated_key')
    state = singer.write_bookmark(state, table_name,
                                  'initial_full_table_complete', True)
    singer.write_state(state)

    singer.write_version(table_name, stream_version)

    return rows_saved
def sync_table(mssql_conn, config, catalog_entry, state, columns):
    """Sync one MSSQL table incrementally on its replication key.

    Resumes from the bookmarked ``replication_key_value`` when the
    catalog's replication key matches the bookmarked one; otherwise the
    key bookmark is rewritten and the value bookmark cleared.
    """
    # The connection argument is replaced by one built from ``config``.
    mssql_conn = MSSQLConnection(config)

    stream_id = catalog_entry.tap_stream_id
    common.whitelist_bookmark_keys(BOOKMARK_KEYS, stream_id, state)

    stream_metadata = metadata.to_map(catalog_entry.metadata).get((), {})
    configured_key = stream_metadata.get("replication-key")
    bookmarked_key = singer.get_bookmark(state, stream_id, "replication_key")

    if configured_key == bookmarked_key:
        resume_value = singer.get_bookmark(state, stream_id,
                                           "replication_key_value")
    else:
        state = singer.write_bookmark(state, stream_id, "replication_key",
                                      configured_key)
        state = singer.clear_bookmark(state, stream_id,
                                      "replication_key_value")
        resume_value = None

    stream_version = common.get_stream_version(stream_id, state)
    state = singer.write_bookmark(state, stream_id, "version",
                                  stream_version)

    singer.write_message(
        singer.ActivateVersionMessage(stream=catalog_entry.stream,
                                      version=stream_version))

    LOGGER.info("Beginning SQL")
    with connect_with_backoff(mssql_conn) as open_conn:
        with open_conn.cursor() as cur:
            select_sql = common.generate_select_sql(catalog_entry, columns)
            params = {}

            if resume_value is not None:
                key_schema = catalog_entry.schema.properties[configured_key]
                if key_schema.format == "date-time":
                    resume_value = pendulum.parse(resume_value)

                select_sql += " WHERE \"{}\" >= %(replication_key_value)s ORDER BY \"{}\" ASC".format(
                    configured_key, configured_key)
                params["replication_key_value"] = resume_value
            elif configured_key is not None:
                select_sql += " ORDER BY \"{}\" ASC".format(configured_key)

            common.sync_query(cur, catalog_entry, state, select_sql, columns,
                              stream_version, params)
def do_sync_full_table(mssql_conn, config, catalog_entry, state, columns):
    """Write the schema, run the full-table sync, then emit final state.

    After the sync the ``version`` bookmark is cleared and
    ``initial_full_table_complete`` is set to ``True``.

    Args:
        mssql_conn: Connection forwarded to ``full_table.sync_table``.
        config: Not used by this function.
        catalog_entry: Catalog entry of the stream to sync.
        state: Singer state dict.
        columns: Columns forwarded to ``full_table.sync_table``.
    """
    LOGGER.info("Stream %s is using full table replication",
                catalog_entry.stream)

    write_schema_message(catalog_entry)

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id,
                                               state)

    full_table.sync_table(mssql_conn, catalog_entry, state, columns,
                          stream_version)

    # The version bookmark is dropped in favour of the
    # initial_full_table_complete flag.
    singer.clear_bookmark(state, catalog_entry.tap_stream_id, 'version')

    state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                  'initial_full_table_complete', True)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
def sync_table(mysql_conn, config, catalog_entry, state, columns,
               stream_version):
    """Run a plain full-table sync of one stream.

    Emits an ACTIVATE_VERSION message up front on the initial replication
    and always again after the query has been synced. ``config`` is not
    used by this function.
    """
    stream_id = catalog_entry.tap_stream_id
    common.whitelist_bookmark_keys(generate_bookmark_keys(catalog_entry),
                                   stream_id, state)

    stream_bookmarks = state.get('bookmarks', {}).get(stream_id, {})
    has_version_key = 'version' in stream_bookmarks
    full_table_done = singer.get_bookmark(state, stream_id,
                                          'initial_full_table_complete')
    bookmarked_version = singer.get_bookmark(state, stream_id, 'version')

    # The destination stream name is "<database>_<stream>".
    destination_stream = '%s_%s' % (common.get_database_name(catalog_entry),
                                    catalog_entry.stream)
    activate_version_message = singer.ActivateVersionMessage(
        stream=destination_stream, version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    version_explicitly_null = has_version_key and bookmarked_version is None
    if not (full_table_done or version_explicitly_null):
        singer.write_message(activate_version_message)

    with connect_with_backoff(mysql_conn) as open_conn, \
            open_conn.cursor() as cur:
        select_sql = common.generate_select_sql(catalog_entry, columns)
        params = {}
        common.sync_query(cur, catalog_entry, state, select_sql, columns,
                          stream_version, params)

    # clear max pk value and last pk fetched upon successful sync
    for finished_key in ('max_pk_values', 'last_pk_fetched'):
        singer.clear_bookmark(state, stream_id, finished_key)

    singer.write_message(activate_version_message)
def sync_table(connection, catalog_entry, state, columns):
    """Yield the messages of an incremental sync of one table.

    This is a generator: it yields an ACTIVATE_VERSION message followed by
    every message produced by ``common.sync_query``.

    The catalog's replication key takes precedence over the bookmarked one,
    so a key changed in the catalog is adopted (and the value bookmark
    reset) instead of the stale bookmarked key being used forever. The
    bookmarked key is only a fallback when the catalog defines none.

    Args:
        connection: Object exposing ``cursor()`` as a context manager.
        catalog_entry: Catalog entry of the stream to sync.
        state: Singer state dict.
        columns: Columns handed to ``common.generate_select_sql``.
    """
    common.whitelist_bookmark_keys(BOOKMARK_KEYS,
                                   catalog_entry.tap_stream_id, state)

    catalog_metadata = metadata.to_map(catalog_entry.metadata)
    stream_metadata = catalog_metadata.get((), {})

    replication_key_metadata = stream_metadata.get('replication-key')
    replication_key_state = singer.get_bookmark(state,
                                                catalog_entry.tap_stream_id,
                                                'replication_key')

    # Catalog first, bookmark second (see docstring).
    replication_key = replication_key_metadata or replication_key_state

    replication_key_value = None
    if replication_key_metadata == replication_key_state:
        replication_key_value = singer.get_bookmark(
            state, catalog_entry.tap_stream_id, 'replication_key_value')
    else:
        state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                      'replication_key', replication_key)
        state = singer.clear_bookmark(state, catalog_entry.tap_stream_id,
                                      'replication_key_value')

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id,
                                               state)
    state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                  'version', stream_version)

    yield singer.ActivateVersionMessage(stream=catalog_entry.stream,
                                        version=stream_version)

    with connection.cursor() as cursor:
        select_sql = common.generate_select_sql(catalog_entry, columns)
        params = {}

        if replication_key_value is not None:
            if catalog_entry.schema.properties[
                    replication_key].format == 'date-time':
                replication_key_value = pendulum.parse(replication_key_value)

            select_sql += ' WHERE `{}` >= %(replication_key_value)s ORDER BY `{}` ASC'.format(
                replication_key, replication_key)
            params['replication_key_value'] = replication_key_value
        elif replication_key is not None:
            select_sql += ' ORDER BY `{}` ASC'.format(replication_key)

        yield from common.sync_query(cursor, catalog_entry, state,
                                     select_sql, columns, stream_version,
                                     params)
def get_pages_safe(self, state, bookmarked_cursor, start_time):
    """Yield everything from ``get_pages``, dropping the cursor on failure.

    If ``RuntimeError`` or ``RequestException`` is raised while paging,
    the ``cursor`` bookmark is cleared, state is emitted and the exception
    is re-raised.
    """
    pages = self.get_pages(bookmarked_cursor, start_time)
    try:
        yield from pages
    except (RuntimeError, RequestException):
        # NB> If we get a non-retryable error we should delete the
        # pagination cursor bookmark before re-raising the exception.
        LOGGER.fatal(
            "Received fatal exception during syncing of stream %s, Clearing cursor bookmark.",
            self.tap_stream_id)
        singer.write_state(
            singer.clear_bookmark(state, self.tap_stream_id, 'cursor'))
        raise
def clean_state_for_report(config, state, tap_stream_id):
    """Convert a stream-level ``last_report_date`` into per-view bookmarks.

    When the stream has a ``last_report_date`` bookmark, it is copied
    (formatted as ``YYYY-MM-DD``) under every view id returned by
    ``get_view_ids(config)``, the stream-level bookmark is cleared and
    state is emitted. Without such a bookmark the state is left untouched.

    Returns:
        The (possibly updated) state.
    """
    legacy_bookmark = get_bookmark(state, tap_stream_id, 'last_report_date')
    if not legacy_bookmark:
        return state

    report_date = utils.strptime_to_utc(legacy_bookmark).strftime("%Y-%m-%d")
    LOGGER.info("%s - Converting state to multi-profile format.",
                tap_stream_id)

    for view_id in get_view_ids(config):
        state = singer.write_bookmark(state, tap_stream_id, view_id,
                                      {'last_report_date': report_date})

    state = singer.clear_bookmark(state, tap_stream_id, 'last_report_date')
    singer.write_state(state)
    return state
def sync(config, state, catalog):
    """Sync every selected stream of the catalog.

    For each stream: mark it as currently syncing, write its schema, then
    transform and write every record the stream object yields for the
    window from the bookmarked start date (falling back to
    ``config['start_date']``) up to now. ``currently_syncing`` is reset to
    ``None`` at the end.

    Args:
        config: Tap config; ``token``, ``partner_id`` and ``start_date``
            are read.
        state: Singer state dict.
        catalog: Catalog providing ``get_selected_streams(state)``.
    """
    client = PeekClient(config['token'])
    partner_id = config['partner_id']

    with Transformer() as transformer:
        for stream in catalog.get_selected_streams(state):
            tap_stream_id = stream.tap_stream_id
            stream_obj = STREAMS[tap_stream_id](client, state)
            replication_key = stream_obj.replication_key
            stream_schema = stream.schema.to_dict()
            stream_metadata = metadata.to_map(stream.metadata)

            LOGGER.info('Starting sync for stream: %s', tap_stream_id)

            state = singer.set_currently_syncing(state, tap_stream_id)
            singer.write_state(state)

            singer.write_schema(tap_stream_id, stream_schema,
                                stream_obj.key_properties,
                                stream.replication_key)

            start_date = singer.get_bookmark(state, tap_stream_id,
                                             replication_key,
                                             config['start_date'])
            end_date = singer.utils.strftime(
                singer.utils.now(), format_str=singer.utils.DATETIME_PARSE)

            for record in stream_obj.sync(partner_id=partner_id,
                                          start_date=start_date,
                                          end_date=end_date):
                # Lazy %-formatting: the record is only rendered when the
                # INFO level is actually enabled.
                LOGGER.info("Writing record: %s", record)
                transformed_record = transformer.transform(
                    record, stream_schema, stream_metadata)
                singer.write_record(
                    tap_stream_id,
                    transformed_record,
                )

            # NOTE(review): the start date is read from the replication-key
            # bookmark but the bookmark cleared here is 'start_date', and
            # nothing in this function writes either one - confirm intent.
            state = singer.clear_bookmark(state, tap_stream_id, 'start_date')
            singer.write_state(state)

    state = singer.set_currently_syncing(state, None)
    singer.write_state(state)
def sync(config, state, catalog):
    """Sync every selected stream of the catalog (tap template).

    ``CLIENT_CLASS_NAME``, ``CLIENT_PARAMETERS_HERE`` and ``BOOKMARK_KEY``
    are template placeholders that must be filled in per tap.

    For each stream: mark it as currently syncing, write its schema,
    transform and write every record yielded by the stream object, then
    clear the ``BOOKMARK_KEY`` bookmark. ``currently_syncing`` is reset to
    ``None`` at the end.
    """
    # Any client required PARAMETERS to hit the endpoint
    client = CLIENT_CLASS_NAME(CLIENT_PARAMETERS_HERE)

    with Transformer() as transformer:
        for stream in catalog.get_selected_streams(state):
            tap_stream_id = stream.tap_stream_id
            stream_obj = STREAMS[tap_stream_id](client, state)
            stream_schema = stream.schema.to_dict()
            stream_metadata = metadata.to_map(stream.metadata)

            LOGGER.info('Starting sync for stream: %s', tap_stream_id)

            state = singer.set_currently_syncing(state, tap_stream_id)
            singer.write_state(state)

            singer.write_schema(
                tap_stream_id,
                stream_schema,
                stream_obj.key_properties,
                stream.replication_key
            )

            # The stream object already holds the client created above, so
            # no second client is constructed per stream.
            for record in stream_obj.sync(CLIENT_PARAMETERS_HERE):
                transformed_record = transformer.transform(
                    record, stream_schema, stream_metadata)
                LOGGER.info("Writing record: %s", transformed_record)
                singer.write_record(
                    tap_stream_id,
                    transformed_record,
                )

            # If there is a Bookmark or state based key to store
            state = singer.clear_bookmark(state, tap_stream_id, BOOKMARK_KEY)
            # write_state takes the state value only; passing the stream id
            # as a second positional argument raises TypeError.
            singer.write_state(state)

    state = singer.set_currently_syncing(state, None)
    singer.write_state(state)
def sync(self, state, stream_schema, stream_metadata, config, transformer):
    """Page through the stream, writing records updated since start_date.

    The pagination cursor is bookmarked and state emitted after every
    page; the ``cursor`` bookmark is cleared once paging completes.

    Returns:
        The updated state.
    """
    stream_id = self.tap_stream_id
    start_time = config['start_date']
    bookmarked_cursor = singer.get_bookmark(state, stream_id, 'cursor')

    pages = self.get_pages_safe(state, bookmarked_cursor, start_time)
    for page, cursor in pages:
        for record in page:
            if not record['updated_at'] >= start_time:
                continue
            singer.write_record(
                stream_id,
                transformer.transform(record, stream_schema,
                                      stream_metadata),
            )
        # The return value is deliberately not rebound here, exactly as
        # the bookmark is written against the existing state object.
        singer.write_bookmark(state, stream_id, 'cursor', cursor)
        singer.write_state(state)

    state = singer.clear_bookmark(state, stream_id, 'cursor')
    singer.write_state(state)
    return state
def sync_collection(client, stream, state, projection):
    """Full-table sync of one collection, resumable via ``_id`` bookmarks.

    Documents are read in ascending ``_id`` order, bounded above by a
    maximum ``_id`` (bookmarked, or looked up via ``get_max_id_value``) and
    below by the last ``_id`` fetched by an interrupted run. Every row's
    ``_id`` is bookmarked; a STATE message is emitted every
    ``common.UPDATE_BOOKMARK_PERIOD`` rows. On completion the resume
    bookmarks are cleared and the version is activated.

    Args:
        client: Indexed by database name to obtain the database.
        stream: Stream dict; ``tap_stream_id``, ``metadata``, ``stream``
            and ``schema`` are read.
        state: Singer state dict.
        projection: Passed to ``collection.find``; logged when truthy.
    """
    tap_stream_id = stream['tap_stream_id']
    LOGGER.info('Starting full table sync for %s', tap_stream_id)

    md_map = metadata.to_map(stream['metadata'])
    database_name = metadata.get(md_map, (), 'database-name')

    db = client[database_name]
    collection = db[stream['stream']]

    # before writing the table version to state, check if we had one to
    # begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'], 'version') is None

    # last run was interrupted if there is a last_id_fetched bookmark
    was_interrupted = singer.get_bookmark(state, stream['tap_stream_id'], 'last_id_fetched') is not None

    # pick a new table version if last run wasn't interrupted
    if was_interrupted:
        stream_version = singer.get_bookmark(state, stream['tap_stream_id'], 'version')
    else:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version', stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_message(activate_version_message)

    if singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_value'):
        # There is a bookmark: rebuild the typed value from its string form
        # and bookmarked type name.
        max_id_value = singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_value')
        max_id_type = singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_type')
        max_id_value = common.string_to_class(max_id_value, max_id_type)
    else:
        max_id_value = get_max_id_value(collection)

    last_id_fetched = singer.get_bookmark(state, stream['tap_stream_id'], 'last_id_fetched')

    if max_id_value:
        # Write the bookmark if max_id_value is defined
        state = singer.write_bookmark(
            state, stream['tap_stream_id'], 'max_id_value',
            common.class_to_string(max_id_value, max_id_value.__class__.__name__))
        state = singer.write_bookmark(state, stream['tap_stream_id'], 'max_id_type', max_id_value.__class__.__name__)

    # NOTE(review): when max_id_value is falsy the filter still carries it
    # as the '$lte' bound - confirm that is the intended query.
    find_filter = {'$lte': max_id_value}
    if last_id_fetched:
        # Resume: the lower bound is inclusive, so the last fetched
        # document is read again.
        last_id_fetched_type = singer.get_bookmark(state, stream['tap_stream_id'], 'last_id_fetched_type')
        find_filter['$gte'] = common.string_to_class(last_id_fetched, last_id_fetched_type)

    query_message = 'Querying {} with:\n\tFind Parameters: {}'.format(
        stream['tap_stream_id'], find_filter)
    if projection:
        query_message += '\n\tProjection: {}'.format(projection)

    # pylint: disable=logging-format-interpolation
    LOGGER.info(query_message)

    with collection.find({'_id': find_filter}, projection, sort=[("_id", pymongo.ASCENDING)]) as cursor:
        rows_saved = 0
        time_extracted = utils.now()
        start_time = time.time()

        # Fall back to an empty object schema when the stream has none.
        schema = stream['schema'] or {"type": "object", "properties": {}}
        for row in cursor:
            rows_saved += 1

            schema_build_start_time = time.time()
            # A SCHEMA message is emitted whenever row_to_schema returns a
            # truthy value for this row.
            if common.row_to_schema(schema, row):
                singer.write_message(
                    singer.SchemaMessage(
                        stream=common.calculate_destination_stream_name(
                            stream),
                        schema=schema,
                        key_properties=['_id']))
                common.SCHEMA_COUNT[stream['tap_stream_id']] += 1
            common.SCHEMA_TIMES[stream['tap_stream_id']] += time.time(
            ) - schema_build_start_time

            record_message = common.row_to_singer_record(
                stream, row, stream_version, time_extracted)

            singer.write_message(record_message)

            # Bookmark this row's _id (string form plus type name).
            state = singer.write_bookmark(
                state, stream['tap_stream_id'], 'last_id_fetched',
                common.class_to_string(row['_id'], row['_id'].__class__.__name__))
            state = singer.write_bookmark(state, stream['tap_stream_id'], 'last_id_fetched_type', row['_id'].__class__.__name__)

            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(
                    singer.StateMessage(value=copy.deepcopy(state)))

        common.COUNTS[tap_stream_id] += rows_saved
        common.TIMES[tap_stream_id] += time.time() - start_time

    # clear max id value and last id fetched upon successful sync
    singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_value')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_type')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'last_id_fetched')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'last_id_fetched_type')

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'initial_full_table_complete', True)

    singer.write_message(activate_version_message)

    LOGGER.info('Syncd {} records for {}'.format(rows_saved, tap_stream_id))
def _sync_stream(client, stream, transformer, config, state, catalog, **kwargs):
    """Sync one stream page by page, recursing into substreams per record.

    Args:
        client: Forwarded to the stream object's ``sync``.
        stream: Stream class; instantiated here with no arguments.
        transformer: Used to transform each record before writing.
        config: Tap config; ``start_date`` is the bookmark fallback.
        state: Singer state dict.
        catalog: Catalog providing ``get_stream``.
        **kwargs: ``record`` (optional, forwarded to the stream's ``sync``)
            and ``substreams`` (optional mapping whose values are synced
            for every record).
    """
    record = kwargs.get('record', None)
    substreams = kwargs.get('substreams')

    tap_stream_id = stream.tap_stream_id
    stream_obj = stream()
    stream_catalog = catalog.get_stream(stream.tap_stream_id)
    replication_key = stream_obj.replication_key
    stream_schema = stream_catalog.schema.to_dict()
    stream_metadata = metadata.to_map(stream_catalog.metadata)

    # The replication method comes from the catalog metadata.
    replication_method = metadata.get(stream_metadata, (), 'replication-method')
    stream_obj.update_replication_method(replication_method)

    LOGGER.debug('Starting sync for stream: %s', tap_stream_id)

    state = singer.set_currently_syncing(state, tap_stream_id)
    singer.write_state(state)

    # Only write schema once
    if not tap_stream_id in schemas_written:
        singer.write_schema(tap_stream_id, stream_schema, stream_obj.key_properties, stream.replication_key)
        schemas_written.append(tap_stream_id)

    start_date = singer.get_bookmark(state, tap_stream_id, replication_key, config['start_date'])
    offset = singer.get_bookmark(state, tap_stream_id, 'offset', 0)

    # Highest replication-key value seen so far; starts at the bookmark.
    max_record_value = start_date
    for page, cursor in stream_obj.sync(client, config, state, record=record, start_date=start_date, offset=offset):
        # NOTE(review): the loop variable below rebinds the ``record``
        # taken from kwargs above.
        for record in page:
            transformed_record = transformer.transform(record, stream_schema, stream_metadata)

            time_extracted = singer.utils.now()

            singer.write_record(tap_stream_id, transformed_record, time_extracted=time_extracted)

            if stream_obj.replication_method == 'INCREMENTAL':
                current_replication_value = deep_get(record, replication_key)
                if current_replication_value \
                        and current_replication_value > max_record_value:
                    max_record_value = current_replication_value

            # Substreams are synced once per record, with that record
            # passed along.
            if substreams:
                _sync_streams(client, substreams.values(), transformer, config, state, catalog, record=record, start_date=start_date)

        # After each page: bookmark the cursor as the resume offset and,
        # for incremental streams, the highest replication value seen.
        state = singer.write_bookmark(state, tap_stream_id, 'offset', cursor)
        if stream_obj.replication_method == 'INCREMENTAL':
            state = singer.write_bookmark(state, tap_stream_id, replication_key, max_record_value)
        singer.write_state(state)

    # All pages consumed: the offset bookmark is cleared.
    state = singer.clear_bookmark(state, tap_stream_id, 'offset')
    singer.write_state(state)
def sync_table(mysql_conn, catalog_entry, state, columns, stream_version):
    """Run a full-table sync, bounded by primary key when auto-incrementing.

    Emits an ACTIVATE_VERSION message up front on the initial replication
    and always again after the query has been synced. When the primary
    keys are auto-incrementing and a maximum value is available, that
    maximum is bookmarked and a primary-key clause is appended to the
    SELECT.
    """
    stream_id = catalog_entry.tap_stream_id
    common.whitelist_bookmark_keys(generate_bookmark_keys(catalog_entry),
                                   stream_id, state)

    stream_bookmarks = state.get('bookmarks', {}).get(stream_id, {})
    has_version_key = 'version' in stream_bookmarks
    full_table_done = singer.get_bookmark(state, stream_id,
                                          'initial_full_table_complete')
    bookmarked_version = singer.get_bookmark(state, stream_id, 'version')

    activate_version_message = singer.ActivateVersionMessage(
        stream=catalog_entry.stream,
        version=stream_version
    )

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    version_explicitly_null = has_version_key and bookmarked_version is None
    if not (full_table_done or version_explicitly_null):
        singer.write_message(activate_version_message)

    auto_incrementing = pks_are_auto_incrementing(mysql_conn, catalog_entry)

    with connect_with_backoff(mysql_conn) as open_conn, \
            open_conn.cursor() as cur:
        select_sql = common.generate_select_sql(catalog_entry, columns)

        if auto_incrementing:
            LOGGER.info("Detected auto-incrementing primary key(s) - will replicate incrementally")

            # Prefer the bookmarked maximum; only look it up when absent.
            max_pk_values = singer.get_bookmark(state, stream_id, 'max_pk_values')
            if not max_pk_values:
                max_pk_values = get_max_pk_values(cur, catalog_entry)

            if max_pk_values:
                state = singer.write_bookmark(state, stream_id,
                                              'max_pk_values', max_pk_values)
                select_sql += generate_pk_clause(catalog_entry, state)
            else:
                LOGGER.info("No max value for auto-incrementing PK found for table {}".format(catalog_entry.table))

        params = {}
        common.sync_query(cur, catalog_entry, state, select_sql, columns,
                          stream_version, params)

    # clear max pk value and last pk fetched upon successful sync
    for finished_key in ('max_pk_values', 'last_pk_fetched'):
        singer.clear_bookmark(state, stream_id, finished_key)

    singer.write_message(activate_version_message)
def sync_table(mysql_conn, catalog_entry, state, columns, limit=None):
    """Sync one MySQL stream using its replication key, optionally in batches.

    Each pass re-reads the replication key bookmark, writes the stream
    version bookmark, emits an ACTIVATE_VERSION message and runs one SELECT
    through ``common.sync_query``. When ``limit`` is a positive number the
    SELECT carries ``LIMIT <limit>`` and passes repeat until a pass returns
    fewer than ``limit`` rows.

    A falsy ``limit`` (``None`` or ``0``) means "no limit": one unbounded
    query is run and the function returns. Previously ``limit=0`` omitted
    the LIMIT clause but could never satisfy the exit condition, so the loop
    never terminated.

    Args:
        mysql_conn: connection object passed to ``connect_with_backoff``.
        catalog_entry: catalog entry of the stream being synced.
        state: tap state dictionary; bookmarks are read from and written to it.
        columns: columns to select.
        limit: optional maximum number of rows fetched per pass.
    """
    stream_id = catalog_entry.tap_stream_id

    common.whitelist_bookmark_keys(BOOKMARK_KEYS, stream_id, state)

    catalog_metadata = metadata.to_map(catalog_entry.metadata)
    stream_metadata = catalog_metadata.get((), {})

    # The configured replication key does not change between passes.
    replication_key_metadata = stream_metadata.get('replication-key')

    while True:
        replication_key_state = singer.get_bookmark(state, stream_id, 'replication_key')

        replication_key_value = None

        if replication_key_metadata == replication_key_state:
            replication_key_value = singer.get_bookmark(state, stream_id, 'replication_key_value')
        else:
            # The replication key changed: record the new key and drop the
            # value bookmarked for the old one.
            state = singer.write_bookmark(state, stream_id, 'replication_key', replication_key_metadata)
            state = singer.clear_bookmark(state, stream_id, 'replication_key_value')

        stream_version = common.get_stream_version(stream_id, state)
        state = singer.write_bookmark(state, stream_id, 'version', stream_version)

        activate_version_message = singer.ActivateVersionMessage(
            stream=catalog_entry.stream,
            version=stream_version
        )

        singer.write_message(activate_version_message)

        with connect_with_backoff(mysql_conn) as open_conn:
            with open_conn.cursor() as cur:
                select_sql = common.generate_select_sql(catalog_entry, columns)
                params = {}

                if replication_key_value is not None:
                    if catalog_entry.schema.properties[replication_key_metadata].format == 'date-time':
                        replication_key_value = pendulum.parse(replication_key_value)

                    # NOTE(review): the filter is inclusive (>=), so each
                    # pass re-reads the bookmarked row(s). Presumably a run of
                    # `limit` or more rows sharing one key value would keep
                    # the loop from advancing - confirm against sync_query.
                    select_sql += ' WHERE `{}` >= %(replication_key_value)s ORDER BY `{}` ASC'.format(
                        replication_key_metadata,
                        replication_key_metadata)

                    params['replication_key_value'] = replication_key_value
                elif replication_key_metadata is not None:
                    select_sql += ' ORDER BY `{}` ASC'.format(replication_key_metadata)

                if limit:
                    select_sql += ' LIMIT {}'.format(limit)

                num_rows = common.sync_query(cur, catalog_entry, state, select_sql, columns, stream_version, params)

        # Stop after an unbounded pass, or once a bounded pass comes back
        # short. `not limit` mirrors the `if limit:` test above so the two
        # conditions can never disagree (the limit=0 endless loop).
        if not limit or num_rows < limit:
            break
def sync_collection(collection: Collection, stream: Dict, state: Dict) -> None:
    """
    Run a resumable full table sync of a MongoDB collection.

    Documents are read in ascending ``_id`` order up to the maximum ``_id``
    (bookmarked, or looked up with ``get_max_id_value``). If a
    ``last_id_fetched`` bookmark exists the read starts from that id and the
    bookmarked stream version is reused; otherwise a new version is generated
    from the current time in milliseconds.

    Args:
        collection: MongoDB collection instance
        stream: dictionary of all stream details
        state: the tap state
    """
    stream_id = stream['tap_stream_id']

    LOGGER.info('Starting full table sync for %s', stream_id)

    # Check for an existing version before one is written to state below.
    first_run = singer.get_bookmark(state, stream_id, 'version') is None

    # A last_id_fetched bookmark means the previous run was interrupted: keep
    # its table version. Otherwise pick a new one.
    previous_run_interrupted = singer.get_bookmark(state, stream_id, 'last_id_fetched') is not None
    if previous_run_interrupted:
        stream_version = singer.get_bookmark(state, stream_id, 'version')
    else:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream_id, 'version', stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_message(activate_version_message)

    bookmarked_max_id = singer.get_bookmark(state, stream_id, 'max_id_value')
    if bookmarked_max_id:
        # Rebuild the bookmarked value using its recorded type name.
        max_id_value = common.string_to_class(
            bookmarked_max_id,
            singer.get_bookmark(state, stream_id, 'max_id_type'))
    else:
        max_id_value = get_max_id_value(collection)

    last_id_fetched = singer.get_bookmark(state, stream_id, 'last_id_fetched')

    if max_id_value:
        # Persist the upper bound together with its type name.
        max_id_type = max_id_value.__class__.__name__
        state = singer.write_bookmark(state, stream_id, 'max_id_value',
                                      common.class_to_string(max_id_value, max_id_type))
        state = singer.write_bookmark(state, stream_id, 'max_id_type', max_id_type)

    find_filter = {'$lte': max_id_value}
    if last_id_fetched:
        find_filter['$gte'] = common.string_to_class(
            last_id_fetched,
            singer.get_bookmark(state, stream_id, 'last_id_fetched_type'))

    LOGGER.info('Querying %s with: %s', stream_id, {'find': find_filter})

    with collection.find({'_id': find_filter}, sort=[("_id", pymongo.ASCENDING)]) as cursor:
        rows_saved = 0
        start_time = time.time()

        for row in cursor:
            rows_saved += 1

            record_message = common.row_to_singer_record(stream=stream,
                                                         row=row,
                                                         time_extracted=utils.now(),
                                                         time_deleted=None,
                                                         version=stream_version)
            singer.write_message(record_message)

            # Track the last emitted _id (and its type name) for resuming.
            row_id = row['_id']
            row_id_type = row_id.__class__.__name__
            state = singer.write_bookmark(state, stream_id, 'last_id_fetched',
                                          common.class_to_string(row_id, row_id_type))
            state = singer.write_bookmark(state, stream_id, 'last_id_fetched_type', row_id_type)

            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

        common.COUNTS[stream_id] += rows_saved
        common.TIMES[stream_id] += time.time() - start_time

    # Clear the resume bookmarks upon successful sync.
    for bookmark_key in ('max_id_value', 'max_id_type', 'last_id_fetched', 'last_id_fetched_type'):
        singer.clear_bookmark(state, stream_id, bookmark_key)

    singer.write_bookmark(state, stream_id, 'initial_full_table_complete', True)

    singer.write_message(activate_version_message)

    LOGGER.info('Syncd %s records for %s', rows_saved, stream_id)
def sync(config, state, catalog):
    """Sync every selected stream of the catalog and write Singer output.

    For each stream the schema is written, then records are fetched through
    the stream object's ``sync`` method starting from the bookmarked
    replication key value (falling back to ``config['start_date']``) and the
    bookmarked ``cursor``. Three paths exist:

    * the ``shifts`` stream: paged; only records whose replication key is at
      or after the start time are written, and on completion the replication
      key bookmark is set to the ``sync_start`` timestamp;
    * streams whose replication method is ``INCREMENTAL``: paged; the
      replication key bookmark follows the largest value seen;
    * all other streams: every record returned is written.

    Args:
        config: tap configuration; ``start_date`` is read from it.
        state: tap state dictionary.
        catalog: catalog providing ``get_selected_streams``.
    """
    client = SquareClient(config)

    with Transformer() as transformer:
        for stream in catalog.get_selected_streams(state):
            tap_stream_id = stream.tap_stream_id
            stream_obj = STREAMS[tap_stream_id](client, state)
            replication_key = stream_obj.replication_key
            stream_schema = stream.schema.to_dict()
            stream_metadata = metadata.to_map(stream.metadata)

            def emit(raw_record):
                """Transform one record, write it, and return the transformed form."""
                transformed = transformer.transform(raw_record, stream_schema, stream_metadata)
                singer.write_record(tap_stream_id, transformed)
                return transformed

            LOGGER.info('Staring sync for stream: %s', tap_stream_id)

            state = singer.set_currently_syncing(state, tap_stream_id)
            singer.write_state(state)

            singer.write_schema(tap_stream_id, stream_schema,
                                stream_obj.key_properties, stream.replication_key)

            start_time = singer.get_bookmark(state, tap_stream_id, replication_key,
                                             config['start_date'])
            bookmarked_cursor = singer.get_bookmark(state, tap_stream_id, 'cursor')

            if tap_stream_id == 'shifts':
                # Re-read kept from the original implementation.
                replication_key = stream_obj.replication_key

                # Reuse the bookmarked sync start if there is one, otherwise
                # stamp the current time.
                now_string = singer.utils.strftime(singer.utils.now(),
                                                   format_str=singer.utils.DATETIME_PARSE)
                sync_start_bookmark = singer.get_bookmark(state, tap_stream_id,
                                                          'sync_start', now_string)
                state = singer.write_bookmark(state, tap_stream_id, 'sync_start',
                                              sync_start_bookmark)

                for page, cursor in stream_obj.sync(start_time, bookmarked_cursor):
                    for record in page:
                        if record[replication_key] >= start_time:
                            emit(record)

                    # Checkpoint the paging cursor after every page.
                    state = singer.write_bookmark(state, tap_stream_id, 'cursor', cursor)
                    singer.write_state(state)

                state = singer.clear_bookmark(state, tap_stream_id, 'sync_start')
                state = singer.write_bookmark(state, tap_stream_id, replication_key,
                                              sync_start_bookmark)
                singer.write_state(state)

            elif stream_obj.replication_method == 'INCREMENTAL':
                # Re-read kept from the original implementation.
                replication_key = stream_obj.replication_key
                max_record_value = start_time

                for page, cursor in stream_obj.sync(start_time, bookmarked_cursor):
                    for record in page:
                        transformed_record = emit(record)
                        # NOTE(review): the comparison uses the raw record but
                        # the stored value comes from the transformed record -
                        # confirm both share one format.
                        if record[replication_key] > max_record_value:
                            max_record_value = transformed_record[replication_key]

                    state = singer.write_bookmark(state, tap_stream_id, 'cursor', cursor)
                    state = singer.write_bookmark(state, tap_stream_id, replication_key,
                                                  max_record_value)
                    singer.write_state(state)

            else:
                for record in stream_obj.sync(start_time, bookmarked_cursor):
                    emit(record)

            # The stream finished, so its paging cursor is no longer needed.
            state = singer.clear_bookmark(state, tap_stream_id, 'cursor')
            singer.write_state(state)

    state = singer.set_currently_syncing(state, None)
    singer.write_state(state)
def generate_messages(con, config, catalog, state):
    """Yield the Singer messages for every stream of the resolved catalog.

    Streams with no selected columns are skipped with a warning. For every
    other stream a state message marking it as currently syncing is yielded,
    followed by a schema message and the messages of the sync routine chosen
    by the stream's ``replication-method`` metadata:

    * ``INCREMENTAL``: ``incremental.sync_table``.
    * ``LOG_BASED``: ``binlog.sync_table`` when both ``log_file`` and
      ``log_pos`` are bookmarked; otherwise an initial
      ``full_table.sync_table`` after which the binlog position captured
      beforehand is bookmarked. Views are rejected.
    * ``FULL_TABLE``: ``full_table.sync_table``.

    After the last stream a final state message with ``currently_syncing``
    cleared is yielded.

    Args:
        con: database connection, passed to the sync routines.
        config: tap configuration, passed to ``binlog.sync_table``.
        catalog: catalog to resolve with ``resolve_catalog``.
        state: tap state dictionary.

    Raises:
        Exception: for a LOG_BASED view, or an unsupported replication method.
    """
    catalog = resolve_catalog(con, catalog, state)

    for catalog_entry in catalog.streams:
        stream_id = catalog_entry.tap_stream_id
        columns = list(catalog_entry.schema.properties.keys())

        if not columns:
            LOGGER.warning(
                'There are no columns selected for stream %s, skipping it.',
                catalog_entry.stream)
            continue

        state = singer.set_currently_syncing(state, stream_id)

        # Emit a state message to indicate that we've started this stream
        yield singer.StateMessage(value=copy.deepcopy(state))

        # Stream-level metadata lives under the empty-tuple breadcrumb.
        stream_md = metadata.to_map(catalog_entry.metadata).get((), {})

        replication_method = stream_md.get('replication-method')
        replication_key = stream_md.get('replication-key')

        key_properties_field = 'view-key-properties' if catalog_entry.is_view else 'table-key-properties'
        key_properties = stream_md.get(key_properties_field)

        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = catalog_entry.database
            timer.tags['table'] = catalog_entry.table

            log_engine(con, catalog_entry)

            if replication_method == 'INCREMENTAL':
                LOGGER.info("Stream %s is using incremental replication", catalog_entry.stream)

                yield generate_schema_message(catalog_entry, key_properties, [replication_key])

                yield from incremental.sync_table(con, catalog_entry, state, columns)

            elif replication_method == 'LOG_BASED':
                if catalog_entry.is_view:
                    raise Exception(
                        "Unable to replicate stream({}) with binlog because it is a view.".format(
                            catalog_entry.stream))

                LOGGER.info("Stream %s is using binlog replication", catalog_entry.stream)

                log_file = singer.get_bookmark(state, stream_id, 'log_file')
                log_pos = singer.get_bookmark(state, stream_id, 'log_pos')

                yield generate_schema_message(catalog_entry, key_properties, [])

                if log_file and log_pos:
                    columns = binlog.add_automatic_properties(catalog_entry, columns)

                    yield from binlog.sync_table(con, config, catalog_entry, state, columns)
                else:
                    LOGGER.info("Performing initial full table sync")

                    # Capture the binlog position before the full table sync;
                    # it is bookmarked once that sync has finished.
                    log_file, log_pos = binlog.fetch_current_log_file_and_pos(con)

                    stream_version = common.get_stream_version(stream_id, state)
                    state = singer.write_bookmark(state, stream_id, 'version', stream_version)

                    yield from full_table.sync_table(con, catalog_entry, state, columns, stream_version)

                    state = singer.write_bookmark(state, stream_id, 'log_file', log_file)
                    state = singer.write_bookmark(state, stream_id, 'log_pos', log_pos)

                    yield singer.StateMessage(value=copy.deepcopy(state))

            elif replication_method == 'FULL_TABLE':
                LOGGER.info("Stream %s is using full table replication", catalog_entry.stream)

                yield generate_schema_message(catalog_entry, key_properties, [])

                stream_version = common.get_stream_version(stream_id, state)

                yield from full_table.sync_table(con, catalog_entry, state, columns, stream_version)

                # Prefer initial_full_table_complete going forward
                singer.clear_bookmark(state, stream_id, 'version')

                state = singer.write_bookmark(state, stream_id, 'initial_full_table_complete', True)

                yield singer.StateMessage(value=copy.deepcopy(state))

            else:
                raise Exception(
                    "only INCREMENTAL, LOG_BASED, and FULL TABLE replication methods are supported")

    # if we get here, we've finished processing all the streams, so clear
    # currently_syncing from the state and emit a state message.
    state = singer.set_currently_syncing(state, None)
    yield singer.StateMessage(value=copy.deepcopy(state))
def sync_table(client, stream, state, stream_version, columns):
    """Run a resumable full-table sync of one MongoDB collection.

    Documents with ``_id`` up to the maximum id (bookmarked, or looked up
    with ``get_max_id_value``) are read in ascending ``_id`` order and
    written as Singer records containing only the keys listed in
    ``columns``. The ``_id`` of the last written document is bookmarked as
    ``last_id_fetched`` and a state message is emitted every 1000 rows; a
    later run that finds this bookmark continues after that id.

    Fixes relative to the previous implementation:

    * the resume bound was written as a slice subscript
      (``find_filter['$gt':...]``), which raised instead of adding the
      ``$gt`` condition;
    * the cursor was sorted descending, so ``last_id_fetched`` tracked the
      smallest id read and a resume with ``$gt`` would re-read emitted rows
      and skip the rest. The cursor is now sorted ascending.

    Args:
        client: MongoDB client, indexed by database name.
        stream: stream dictionary; ``tap_stream_id``, ``stream`` and
            ``metadata`` are read from it.
        state: tap state dictionary.
        stream_version: version attached to records and ACTIVATE_VERSION.
        columns: collection of document keys to keep in each record.
    """
    stream_id = stream['tap_stream_id']

    common.whitelist_bookmark_keys(generate_bookmark_keys(stream), stream_id, state)

    mdata = metadata.to_map(stream['metadata'])
    stream_metadata = mdata.get(())

    database_name = stream_metadata['database-name']

    db = client[database_name]
    collection = db[stream['stream']]

    activate_version_message = singer.ActivateVersionMessage(
        stream=stream['stream'],
        version=stream_version)

    initial_full_table_complete = singer.get_bookmark(state, stream_id, 'initial_full_table_complete')

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if not initial_full_table_complete:
        singer.write_message(activate_version_message)

    # Upper bound of this sync: reuse the bookmarked one when resuming so
    # the bound stays the same across runs.
    max_id_value = singer.get_bookmark(state, stream_id, 'max_id_value') or get_max_id_value(collection)

    last_id_fetched = singer.get_bookmark(state, stream_id, 'last_id_fetched')

    state = singer.write_bookmark(state, stream_id, 'max_id_value', max_id_value)

    find_filter = {'$lte': objectid.ObjectId(max_id_value)}
    if last_id_fetched:
        # Resume strictly after the last document that was already written.
        find_filter['$gt'] = objectid.ObjectId(last_id_fetched)

    LOGGER.info("Starting full table replication for table {}.{}".format(
        database_name, stream['stream']))

    with metrics.record_counter(None):
        # Ascending order keeps last_id_fetched equal to the largest id read,
        # which is what the `$gt` resume condition relies on.
        with collection.find({'_id': find_filter},
                             sort=[("_id", pymongo.ASCENDING)]) as cursor:
            rows_saved = 0

            time_extracted = utils.now()

            for row in cursor:
                rows_saved += 1

                whitelisted_row = {k: v for k, v in row.items() if k in columns}
                record_message = common.row_to_singer_record(stream,
                                                             whitelisted_row,
                                                             stream_version,
                                                             time_extracted)

                singer.write_message(record_message)

                state = singer.write_bookmark(state, stream_id, 'last_id_fetched', str(row['_id']))

                if rows_saved % 1000 == 0:
                    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    # clear max pk value and last pk fetched upon successful sync
    singer.clear_bookmark(state, stream_id, 'max_id_value')
    singer.clear_bookmark(state, stream_id, 'last_id_fetched')

    singer.write_message(activate_version_message)