def sync_stream(config, state, table_spec, stream):
    table_name = table_spec['table_name']
    bookmark = singer.get_bookmark(state, table_name, 'modified_since')
    modified_since = singer_utils.strptime_with_tz(bookmark or '1990-01-01T00:00:00Z')

    LOGGER.info('Syncing table "%s".', table_name)
    LOGGER.info('Getting files modified since %s.', modified_since)

    s3_files = s3.get_input_files_for_table(config, table_spec, modified_since)

    records_streamed = 0

    # The original implementation sorted by 'modified_since' so that the
    # modified_since bookmark makes sense. We sort by 'key' instead because we
    # import multiple part files generated from Spark, where the names are in
    # incremental order.
    # This means that we can't sync s3 buckets that are larger than
    # we can sort in memory, which is suboptimal. If we could bookmark
    # based on anything else, then we could just sync files as we see them.
    for s3_file in sorted(s3_files, key=lambda item: item['key']):
        records_streamed += sync_table_file(config, s3_file['key'], table_spec, stream)

        state = singer.write_bookmark(state, table_name, 'modified_since',
                                      s3_file['last_modified'].isoformat())
        singer.write_state(state)

    if s3.skipped_files_count:
        LOGGER.warning("%s files got skipped during the last sync.",
                       s3.skipped_files_count)

    LOGGER.info('Wrote %s records for table "%s".', records_streamed, table_name)

    return records_streamed

def do_sync_incremental(conn_config, stream, state, desired_columns, md_map):
    replication_key = md_map.get((), {}).get('replication-key')
    LOGGER.info("Stream %s is using incremental replication with replication key %s",
                stream['tap_stream_id'], replication_key)

    # Default to an empty dict so a stream with no prior state doesn't crash below.
    stream_state = state.get('bookmarks', {}).get(stream['tap_stream_id'], {})
    illegal_bk_keys = set(stream_state.keys()).difference(
        set(['replication_key', 'replication_key_value', 'version',
             'last_replication_method']))
    if len(illegal_bk_keys) != 0:
        raise Exception("invalid keys found in state: {}".format(illegal_bk_keys))

    state = singer.write_bookmark(state, stream['tap_stream_id'],
                                  'replication_key', replication_key)

    sync_common.send_schema_message(stream, [replication_key])
    state = incremental.sync_table(conn_config, stream, state, desired_columns, md_map)

    return state

def sync_owners(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    schema = load_schema("owners")
    bookmark_key = 'updatedAt'

    singer.write_schema("owners", schema, ["ownerId"], [bookmark_key],
                        catalog.get('stream_alias'))
    start = get_start(STATE, "owners", bookmark_key)
    max_bk_value = start

    LOGGER.info("sync_owners from %s", start)

    params = {}
    if CONFIG.get('include_inactives'):
        params['includeInactives'] = "true"
    data = request(get_url("owners"), params).json()
    time_extracted = utils.now()

    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in data:
            record = bumble_bee.transform(lift_properties_and_versions(row), schema, mdata)
            if record[bookmark_key] >= max_bk_value:
                max_bk_value = record[bookmark_key]

            if record[bookmark_key] >= start:
                singer.write_record("owners", record, catalog.get('stream_alias'),
                                    time_extracted=time_extracted)

    STATE = singer.write_bookmark(STATE, 'owners', bookmark_key, max_bk_value)
    singer.write_state(STATE)
    return STATE

def sync(self):
    start_position = 1
    max_results = int(self.config.get('max_results', '200'))

    bookmark = singer.get_bookmark(self.state, self.stream_name,
                                   'LastUpdatedTime', self.config.get('start_date'))

    while True:
        query = query_builder.build_query(self.table_name, bookmark,
                                          start_position, max_results,
                                          additional_where=self.additional_where)

        resp = self.client.get(self.endpoint, params={"query": query}).get('QueryResponse', {})
        results = resp.get(self.table_name, [])

        for rec in results:
            yield rec

        if results:
            # `rec` still refers to the last record yielded above, so this
            # bookmarks the LastUpdatedTime of the final record in the page.
            self.state = singer.write_bookmark(self.state, self.stream_name,
                                               'LastUpdatedTime',
                                               rec.get('MetaData').get('LastUpdatedTime'))
            singer.write_state(self.state)

        if len(results) < max_results:
            break

        start_position += max_results

    singer.write_state(self.state)

def clear_state_on_replication_change(state: Dict,
                                      tap_stream_id: str,
                                      replication_key: str,
                                      replication_method: str) -> Dict:
    """
    Update state if a replication method change is detected.

    Returns: new state dictionary
    """
    # The user changed the replication method: nuke the stream's state.
    last_replication_method = singer.get_bookmark(state, tap_stream_id,
                                                  'last_replication_method')
    if last_replication_method is not None and (replication_method != last_replication_method):
        state = singer.reset_stream(state, tap_stream_id)

    # The replication key changed.
    if replication_method == 'INCREMENTAL' and \
            replication_key != singer.get_bookmark(state, tap_stream_id, 'replication_key'):
        state = singer.reset_stream(state, tap_stream_id)

    state = singer.write_bookmark(state, tap_stream_id,
                                  'last_replication_method', replication_method)

    return state

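# Usage sketch for clear_state_on_replication_change (illustrative, not from
# the original source): assumes the standard Singer state shape
# {'bookmarks': {tap_stream_id: {...}}} and that singer.reset_stream clears a
# stream's bookmark dict. The stream id and replication key are hypothetical.
state = {
    'bookmarks': {
        'db-users': {
            'last_replication_method': 'FULL_TABLE',
            'version': 1700000000000,
        }
    }
}

# The stored method is FULL_TABLE, so switching to INCREMENTAL resets the
# stream's state before the new method is recorded.
state = clear_state_on_replication_change(state, 'db-users', 'updated_at', 'INCREMENTAL')
# state['bookmarks']['db-users'] is now {'last_replication_method': 'INCREMENTAL'}
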
def sync_forms(state: State):
    schema = load_schema("forms")
    bookmark_key = 'updatedAt'

    singer.write_schema("hubspot_forms", schema, ["guid"], [bookmark_key])
    start = get_start(state, "forms", bookmark_key)
    max_bk_value = start

    logger.info("sync_forms from %s", start)

    data = request(get_url("forms")).json()

    for row in data:
        record = build_record(row, schema)

        if record[bookmark_key] >= start:
            write_record('hubspot_forms', record)

        if record[bookmark_key] >= max_bk_value:
            max_bk_value = record[bookmark_key]

    state = singer.write_bookmark(state, 'hubspot_forms', bookmark_key, max_bk_value)
    singer.write_state(state)
    return state

def sync(self, state, stream_schema, stream_metadata, transformer):
    # The bookmark is stored in UTC.
    start_time = self._get_start_time(state, RAW_BOOKMARK_DATE_FORMAT)
    end_time = self._get_end_time(RAW_BOOKMARK_DATE_FORMAT)

    for record in self.client.get_raw_data(self.report_name,
                                           self.report_version,
                                           start_time,
                                           end_time,
                                           RAW_INSTALL_N_IN_APP_FIELDNAMES):
        transformed_record = transformer.transform(xform(record), stream_schema,
                                                   stream_metadata)
        singer.write_record(self.tap_stream_id, transformed_record,
                            time_extracted=end_time)

    # Convert to the bookmark format.
    end_time_str = datetime.strftime(end_time, RAW_BOOKMARK_DATE_FORMAT)
    state = singer.write_bookmark(state, self.tap_stream_id,
                                  self.replication_key, end_time_str)
    singer.write_state(state)
    return state

def build_state(raw_state, catalog): state = {} for catalog_entry in catalog['streams']: tap_stream_id = catalog_entry['tap_stream_id'] catalog_metadata = metadata.to_map(catalog_entry['metadata']) replication_method = catalog_metadata.get((), {}).get('replication-method') version = singer.get_bookmark(raw_state, tap_stream_id, 'version') # Preserve state that deals with resuming an incomplete bulk job if singer.get_bookmark(raw_state, tap_stream_id, 'JobID'): job_id = singer.get_bookmark(raw_state, tap_stream_id, 'JobID') batches = singer.get_bookmark(raw_state, tap_stream_id, 'BatchIDs') current_bookmark = singer.get_bookmark(raw_state, tap_stream_id, 'JobHighestBookmarkSeen') state = singer.write_bookmark(state, tap_stream_id, 'JobID', job_id) state = singer.write_bookmark(state, tap_stream_id, 'BatchIDs', batches) state = singer.write_bookmark(state, tap_stream_id, 'JobHighestBookmarkSeen', current_bookmark) if replication_method == 'INCREMENTAL': replication_key = catalog_metadata.get((), {}).get('replication-key') replication_key_value = singer.get_bookmark( raw_state, tap_stream_id, replication_key) if version is not None: state = singer.write_bookmark(state, tap_stream_id, 'version', version) if replication_key_value is not None: state = singer.write_bookmark(state, tap_stream_id, replication_key, replication_key_value) elif replication_method == 'FULL_TABLE' and version is None: state = singer.write_bookmark(state, tap_stream_id, 'version', version) return state
def build_state(raw_state, catalog):
    state = {}
    replication_method = "INCREMENTAL"

    for catalog_entry in catalog["streams"]:
        tap_stream_id = catalog_entry["tap_stream_id"]
        catalog_metadata = metadata.to_map(catalog_entry["metadata"])

        version = singer.get_bookmark(raw_state, tap_stream_id, "version")

        # Preserve state that deals with resuming an incomplete bulk job
        if singer.get_bookmark(raw_state, tap_stream_id, "JobID"):
            job_id = singer.get_bookmark(raw_state, tap_stream_id, "JobID")
            batches = singer.get_bookmark(raw_state, tap_stream_id, "BatchIDs")
            current_bookmark = singer.get_bookmark(raw_state, tap_stream_id,
                                                   "JobHighestBookmarkSeen")
            state = singer.write_bookmark(state, tap_stream_id, "JobID", job_id)
            state = singer.write_bookmark(state, tap_stream_id, "BatchIDs", batches)
            state = singer.write_bookmark(state, tap_stream_id,
                                          "JobHighestBookmarkSeen", current_bookmark)

        if replication_method == "INCREMENTAL":
            replication_key = catalog_metadata.get((), {}).get("valid-replication-keys")[0]
            replication_key_value = singer.get_bookmark(raw_state, tap_stream_id,
                                                        replication_key)
            if version is not None:
                state = singer.write_bookmark(state, tap_stream_id, "version", version)
            if replication_key_value is not None:
                state = singer.write_bookmark(state, tap_stream_id,
                                              replication_key, replication_key_value)
        elif replication_method == "FULL_TABLE" and version is None:
            state = singer.write_bookmark(state, tap_stream_id, "version", version)

    return state

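# Illustrative input/output for build_state above (hypothetical stream and
# metadata; assumes singer-python's metadata.to_map and bookmark helpers).
raw_state = {
    'bookmarks': {
        'Account': {
            'version': 1700000000000,
            'SystemModstamp': '2023-11-14T00:00:00+00:00',
        }
    }
}
catalog = {
    'streams': [{
        'tap_stream_id': 'Account',
        'metadata': [{
            'breadcrumb': (),
            'metadata': {'valid-replication-keys': ['SystemModstamp']},
        }],
    }]
}

state = build_state(raw_state, catalog)
# Both the table version and the replication key value are carried over:
# {'bookmarks': {'Account': {'version': 1700000000000,
#                            'SystemModstamp': '2023-11-14T00:00:00+00:00'}}}
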
def sync_engagements(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    schema = load_schema("engagements")
    bookmark_key = 'lastUpdated'
    singer.write_schema("engagements", schema, ["engagement_id"],
                        [bookmark_key], catalog.get('stream_alias'))
    start = get_start(STATE, "engagements", bookmark_key)

    # Because this stream doesn't query by `lastUpdated`, it cycles
    # through the data set every time. The issue with this is that there
    # is a race condition by which records may be updated between the
    # start of this table's sync and the end, causing some updates to not
    # be captured. In order to combat this, we must save a lookback window
    # that handles the duration of time that this stream was last syncing,
    # and look back by that amount on the next sync.
    last_sync_duration = get_previous_time_window(STATE, "engagements")
    current_sync_start = utils.now()
    if has_bookmark(STATE, "engagements", bookmark_key) and \
            last_sync_duration is not None:
        LOGGER.info(("Last sync of engagements lasted {} seconds. Adjusting bookmark by this "
                     "amount to account for race conditions with record updates.")
                    .format(last_sync_duration))
        start = utils.strptime_to_utc(start) - datetime.timedelta(seconds=last_sync_duration)
        start = utils.strftime(start)
    max_bk_value = start
    LOGGER.info("sync_engagements from %s", start)

    STATE = singer.write_bookmark(STATE, 'engagements', bookmark_key, start)
    singer.write_state(STATE)

    url = get_url("engagements_all")
    params = {'limit': 250}
    top_level_key = "results"
    engagements = gen_request(STATE, 'engagements', url, params,
                              top_level_key, "hasMore", ["offset"], ["offset"])

    time_extracted = utils.now()

    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for engagement in engagements:
            record = bumble_bee.transform(engagement, schema, mdata)
            if record['engagement'][bookmark_key] >= start:
                # Hoist the PK and bookmark field to the top-level record
                record['engagement_id'] = record['engagement']['id']
                record[bookmark_key] = record['engagement'][bookmark_key]
                singer.write_record("engagements", record, catalog.get('stream_alias'),
                                    time_extracted=time_extracted)
                if record['engagement'][bookmark_key] >= max_bk_value:
                    max_bk_value = record['engagement'][bookmark_key]

    STATE = singer.write_bookmark(STATE, 'engagements', bookmark_key, max_bk_value)

    # Write the duration for the next sync's lookback window
    STATE = write_stream_duration(STATE, 'engagements', current_sync_start, utils.now())
    singer.write_state(STATE)
    return STATE

def sync_table(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    # Before writing the table version to state, check if we had one to begin with.
    first_run = singer.get_bookmark(state, stream.tap_stream_id, 'version') is None

    # Pick a new table version IFF we do not have an xmin in our state.
    # The presence of an xmin indicates that we were interrupted last time through.
    if singer.get_bookmark(state, stream.tap_stream_id, 'xmin') is None:
        nascent_stream_version = int(time.time() * 1000)
    else:
        nascent_stream_version = singer.get_bookmark(state, stream.tap_stream_id, 'version')

    state = singer.write_bookmark(state, stream.tap_stream_id, 'version',
                                  nascent_stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get('schema-name')

    escaped_columns = map(post_db.prepare_columns_sql, desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=stream.stream,
        version=nascent_stream_version)

    if first_run:
        singer.write_message(activate_version_message)

    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
                xmin = singer.get_bookmark(state, stream.tap_stream_id, 'xmin')
                if xmin:
                    LOGGER.info("Resuming Full Table replication %s from xmin %s",
                                nascent_stream_version, xmin)
                    select_sql = """SELECT {}, xmin::text::bigint
                                      FROM {} where age(xmin::xid) < age('{}'::xid)
                                     ORDER BY xmin::text ASC""".format(
                        ','.join(escaped_columns),
                        post_db.fully_qualified_table_name(schema_name, stream.table),
                        xmin)
                else:
                    LOGGER.info("Beginning new Full Table replication %s",
                                nascent_stream_version)
                    select_sql = """SELECT {}, xmin::text::bigint
                                      FROM {}
                                     ORDER BY xmin::text ASC""".format(
                        ','.join(escaped_columns),
                        post_db.fully_qualified_table_name(schema_name, stream.table))

                LOGGER.info("select %s", select_sql)
                cur.execute(select_sql)

                rows_saved = 0
                rec = cur.fetchone()
                while rec is not None:
                    xmin = rec['xmin']
                    rec = rec[:-1]
                    record_message = post_db.selected_row_to_singer_message(
                        stream, rec, nascent_stream_version, desired_columns,
                        time_extracted, md_map)
                    singer.write_message(record_message)
                    state = singer.write_bookmark(state, stream.tap_stream_id, 'xmin', xmin)
                    rows_saved = rows_saved + 1
                    if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

                    counter.increment()
                    rec = cur.fetchone()

    # Once we have completed the full table replication, discard the xmin bookmark.
    # The xmin bookmark only comes into play when a full table replication is interrupted.
    state = singer.write_bookmark(state, stream.tap_stream_id, 'xmin', None)

    # Always send the activate version, whether first run or subsequent.
    singer.write_message(activate_version_message)

    return state

def sync_table(conn_config, stream, state, desired_columns):
    connection = orc_db.open_connection(conn_config)
    connection.outputtypehandler = common.OutputTypeHandler

    cur = connection.cursor()
    cur.execute("ALTER SESSION SET TIME_ZONE = '00:00'")
    cur.execute("""ALTER SESSION SET NLS_DATE_FORMAT = 'YYYY-MM-DD"T"HH24:MI:SS."00+00:00"'""")
    cur.execute("""ALTER SESSION SET NLS_TIMESTAMP_FORMAT='YYYY-MM-DD"T"HH24:MI:SSXFF"+00:00"'""")
    cur.execute("""ALTER SESSION SET NLS_TIMESTAMP_TZ_FORMAT = 'YYYY-MM-DD"T"HH24:MI:SS.FFTZH:TZM'""")
    time_extracted = utils.now()

    stream_version = singer.get_bookmark(state, stream.tap_stream_id, 'version')
    # If there was no bookmark for stream_version, it is the first time
    # this table is being sync'd, so get a new version and write it to state.
    if stream_version is None:
        stream_version = int(time.time() * 1000)
        state = singer.write_bookmark(state, stream.tap_stream_id, 'version',
                                      stream_version)
        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    activate_version_message = singer.ActivateVersionMessage(
        stream=stream.stream,
        version=stream_version)
    singer.write_message(activate_version_message)

    md = metadata.to_map(stream.metadata)
    schema_name = md.get(()).get('schema-name')

    escaped_columns = map(lambda c: common.prepare_columns_sql(stream, c), desired_columns)
    escaped_schema = schema_name
    escaped_table = stream.table
    replication_key = md.get((), {}).get('replication-key')
    #escaped_replication_key = common.prepare_columns_sql(stream, replication_key)
    replication_key_value = singer.get_bookmark(state, stream.tap_stream_id,
                                                'replication_key_value')
    replication_key_sql_datatype = md.get(('properties', replication_key)).get('sql-datatype')

    with metrics.record_counter(None) as counter:
        if replication_key_value:
            LOGGER.info("Resuming Incremental replication from %s = %s",
                        replication_key, replication_key_value)
            casted_where_clause_arg = common.prepare_where_clause_arg(
                replication_key_value, replication_key_sql_datatype)

            select_sql = """SELECT {}
                              FROM {}.{}
                             WHERE {} >= {}
                             ORDER BY {} ASC""".format(','.join(escaped_columns),
                                                       escaped_schema, escaped_table,
                                                       replication_key,
                                                       casted_where_clause_arg,
                                                       replication_key)
        else:
            select_sql = """SELECT {}
                              FROM {}.{}
                             ORDER BY {} ASC""".format(','.join(escaped_columns),
                                                       escaped_schema, escaped_table,
                                                       replication_key)

        rows_saved = 0
        LOGGER.info("select %s", select_sql)
        for row in cur.execute(select_sql):
            record_message = common.row_to_singer_message(stream, row, stream_version,
                                                          desired_columns, time_extracted)
            singer.write_message(record_message)
            rows_saved = rows_saved + 1

            # Picking a replication_key with NULL values will result in it ALWAYS
            # being synced, which is not great. Even worse would be allowing the
            # NULL value to enter into the state.
            if record_message.record[replication_key] is not None:
                state = singer.write_bookmark(state,
                                              stream.tap_stream_id,
                                              'replication_key_value',
                                              record_message.record[replication_key])

            if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

            counter.increment()

    cur.close()
    connection.close()
    return state

def sync_rows(config, state, tap_stream_id, key_properties=[], auth_method=None,
              max_page=None, assume_sorted=True, filter_by_schema=True,
              raw_output=False):
    """
    - max_page: Force sync to end after max_page. Mostly used for debugging.
    - assume_sorted: Trust the data to be presorted by the index/timestamp/datetime
      keys, so it is safe to finish the replication once the last update
      index/timestamp/datetime passes the end.
    """
    schema = load_schema(config["schema_dir"], tap_stream_id)
    params = get_init_endpoint_params(config, state, tap_stream_id)
    bookmark_type = get_bookmark_type(config)
    start = get_start(config, state, tap_stream_id, "last_update")
    end = get_end(config)
    headers = get_http_headers(config)

    if start is None:
        LOGGER.warning("None of timestamp_key, datetime_key, and index_key" +
                       " are set in config. Bookmarking is not available.")

    start_str = human_readable(bookmark_type, start)
    end_str = human_readable(bookmark_type, end)
    # Log the conditions
    LOGGER.info("Stream %s has %s set starting %s and ending %s." %
                (tap_stream_id, bookmark_type, start_str, end_str))
    # I trust that the URL format you set contains those params. The behavior
    # depends on the data source API's spec.
    # I will not filter out the records outside the boundary. Every record
    # received will be written out.

    LOGGER.info("assume_sorted is set to %s" % assume_sorted)
    # I trust the data to be sorted by the index/timestamp/datetime keys,
    # so it is safe to finish the replication once the last
    # update index/timestamp/datetime passes the end.
    # When in doubt, set this to False. Always perform post-replication dedup.

    LOGGER.info("filter_by_schema is set to %s." % filter_by_schema)
    # The fields undefined/not-conforming to the schema will be written out.

    LOGGER.info("auth_method is set to %s" % auth_method)

    # Initialize the counters
    last_update = start
    # Offset is the number of records (vs. page)
    offset_number = params.get("current_offset", 0)
    page_number = params.get("current_page", 0)

    # When we rely on index/datetime/timestamp to parse the next GET URL,
    # we will get the record we have already seen in the current process.
    # When we get last_record_extracted from the state file, we can also
    # compare with the previous process to further avoid duplicated
    # records in the target data store.
    prev_written_record = None
    last_record_extracted = singer.get_bookmark(state, tap_stream_id,
                                                "last_record_extracted")
    if last_record_extracted:
        prev_written_record = json.loads(last_record_extracted)

    # First, write out the schema
    if raw_output is False:
        singer.write_schema(tap_stream_id, schema, key_properties)

    # Fetch and iterate over the pages to write the records
    with metrics.record_counter(tap_stream_id) as counter:
        while True:
            params.update({"current_page": page_number})
            params.update({"current_page_one_base": page_number + 1})
            params.update({"current_offset": offset_number})
            params.update({"last_update": last_update})

            endpoint = get_endpoint(config["url"], tap_stream_id, params)
            LOGGER.info("GET %s", endpoint)
            rows = generate_request(tap_stream_id, endpoint, auth_method, headers,
                                    config.get("username"), config.get("password"))
            rows = get_record_list(rows, config.get("record_list_level"))

            LOGGER.info("Current page %d" % page_number)
            LOGGER.info("Current offset %d" % offset_number)

            for row in rows:
                record = get_record(row, config.get("record_level"))
                if filter_by_schema:
                    record = filter_record(record, schema)
                    if not validate(record, schema):
                        LOGGER.debug("Skipping the schema-invalidated row %s" % record)
                        continue

                # It's important to compare the record before adding
                # EXTRACT_TIMESTAMP
                if record == prev_written_record:
                    LOGGER.debug("Skipping the duplicated row %s" % record)
                    continue

                if EXTRACT_TIMESTAMP in schema["properties"].keys():
                    extract_tstamp = datetime.datetime.utcnow()
                    extract_tstamp = extract_tstamp.replace(tzinfo=datetime.timezone.utc)
                    record[EXTRACT_TIMESTAMP] = extract_tstamp.isoformat()

                next_last_update = get_last_update(config, record, last_update)

                if not end or next_last_update < end:
                    if raw_output:
                        sys.stdout.write(json.dumps(record) + "\n")
                    else:
                        singer.write_record(tap_stream_id, record)

                    counter.increment()  # Increment only when we write
                    last_update = next_last_update

                    # prev_written_record may be persisted for the next run.
                    # EXTRACT_TIMESTAMP will be different, so pop it out before
                    # storing. Use a default so a schema without the timestamp
                    # field does not raise KeyError.
                    record.pop(EXTRACT_TIMESTAMP, None)
                    prev_written_record = record

            # Exit conditions
            if len(rows) < config["items_per_page"]:
                LOGGER.info(("Response is less than the set items per page (%d). " +
                             "Finishing the extraction.") % config["items_per_page"])
                break
            if max_page and page_number + 1 >= max_page:
                LOGGER.info("Max page %d reached. Finishing the extraction." % max_page)
                break
            if assume_sorted and end and next_last_update >= end:
                LOGGER.info(("Record greater than %s and assume_sorted is" +
                             " set. Finishing the extraction.") % end)
                break

            page_number += 1
            offset_number += len(rows)

    state = singer.write_bookmark(state, tap_stream_id, "last_update", last_update)
    if prev_written_record:
        state = singer.write_bookmark(state, tap_stream_id, "last_record_extracted",
                                      json.dumps(prev_written_record))

    if raw_output is False:
        singer.write_state(state)

    return state

def do_sync(self):
    logger.debug('Starting sync')

    # Resume when currently_syncing is within state
    resume_from_stream = False
    if self.state and 'currently_syncing' in self.state:
        resume_from_stream = self.state['currently_syncing']

    for stream in self.streams:
        stream.tap = self

        if resume_from_stream:
            if stream.schema == resume_from_stream:
                logger.info('Resuming from {}'.format(resume_from_stream))
                resume_from_stream = False
            else:
                logger.info('Skipping stream {} as resuming from {}'.format(
                    stream.schema, resume_from_stream))
                continue

        # Stream state, from state/bookmark or start_date
        stream.set_initial_state(self.state, self.config['start_date'])

        # Currently syncing
        if stream.state_field:
            set_currently_syncing(self.state, stream.schema)
            self.state = singer.write_bookmark(self.state, stream.schema,
                                               stream.state_field,
                                               str(stream.initial_state))
            singer.write_state(self.state)

        # Schema
        stream.write_schema()

        # Paginate
        while stream.has_data():
            with singer.metrics.http_request_timer(stream.schema) as timer:
                try:
                    response = self.execute_stream_request(stream)
                except (ConnectionError, RequestException) as e:
                    raise e
                timer.tags[singer.metrics.Tag.http_status_code] = response.status_code

            self.validate_response(response)
            self.rate_throttling(response)
            stream.paginate(response)

            # Records with metrics
            with singer.metrics.record_counter(stream.schema) as counter:
                with singer.Transformer(singer.NO_INTEGER_DATETIME_PARSING) as optimus_prime:
                    for row in self.iterate_response(response):
                        row = stream.process_row(row)
                        row = optimus_prime.transform(row, stream.get_schema())
                        if stream.write_record(row):
                            counter.increment()
                        stream.update_state(row)

        # Update state / bookmarking only when supported by stream
        if stream.state_field:
            self.state = singer.write_bookmark(self.state, stream.schema,
                                               stream.state_field,
                                               str(stream.earliest_state))
            singer.write_state(self.state)

    # Clear currently_syncing
    del self.state['currently_syncing']
    singer.write_state(self.state)

def sync_records(sf, catalog_entry, state, counter):
    chunked_bookmark = singer_utils.strptime_with_tz(sf.get_start_date(state, catalog_entry))
    stream = catalog_entry['stream']
    schema = catalog_entry['schema']
    stream_alias = catalog_entry.get('stream_alias')
    replication_key = catalog_entry.get('replication_key')
    stream_version = get_stream_version(catalog_entry, state)
    activate_version_message = singer.ActivateVersionMessage(stream=(stream_alias or stream),
                                                             version=stream_version)

    start_time = singer_utils.now()

    LOGGER.info('Syncing Salesforce data for stream %s', stream)

    with Transformer(pre_hook=transform_bulk_data_hook) as transformer:
        for rec in sf.query(catalog_entry, state):
            counter.increment()
            rec = transformer.transform(rec, schema)
            rec = fix_record_anytype(rec, schema)
            singer.write_message(
                singer.RecordMessage(
                    stream=(stream_alias or stream),
                    record=rec,
                    version=stream_version,
                    time_extracted=start_time))

            replication_key_value = replication_key and \
                singer_utils.strptime_with_tz(rec[replication_key])

            if sf.pk_chunking:
                if replication_key_value and replication_key_value <= start_time and \
                        replication_key_value > chunked_bookmark:
                    # Replace the highest seen bookmark and save the state
                    # in case we need to resume later
                    chunked_bookmark = singer_utils.strptime_with_tz(rec[replication_key])
                    state = singer.write_bookmark(state,
                                                  catalog_entry['tap_stream_id'],
                                                  'JobHighestBookmarkSeen',
                                                  singer_utils.strftime(chunked_bookmark))
                    singer.write_state(state)
            # Before writing a bookmark, make sure Salesforce has not given us a
            # record with one outside our range
            elif replication_key_value and replication_key_value <= start_time:
                state = singer.write_bookmark(state,
                                              catalog_entry['tap_stream_id'],
                                              replication_key,
                                              rec[replication_key])
                singer.write_state(state)

        # Tables with no replication_key will send an
        # activate_version message for the next sync
        if not replication_key:
            singer.write_message(activate_version_message)
            state = singer.write_bookmark(state,
                                          catalog_entry['tap_stream_id'],
                                          'version',
                                          None)

        # If pk_chunking is set, only write a bookmark at the end
        if sf.pk_chunking:
            # Write a bookmark with the highest value we've seen. The datetime
            # must be serialized back to a string, so strftime (not strptime)
            # is the correct call here.
            state = singer.write_bookmark(state,
                                          catalog_entry['tap_stream_id'],
                                          replication_key,
                                          singer_utils.strftime(chunked_bookmark))

def sync_deals(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_with_tz(get_start(STATE, "deals", bookmark_key))
    max_bk_value = start
    LOGGER.info("sync_deals from %s", start)
    most_recent_modified_time = start
    params = {'limit': 100, 'includeAssociations': False, 'properties': []}

    schema = load_schema("deals")
    singer.write_schema("deals", schema, ["dealId"], [bookmark_key],
                        catalog.get('stream_alias'))

    # Check if we should include associations
    for key in mdata.keys():
        if 'associations' in key:
            assoc_mdata = mdata.get(key)
            if assoc_mdata.get('selected'):
                params['includeAssociations'] = True

    v3_fields = None
    has_selected_properties = mdata.get(('properties', 'properties'), {}).get('selected')
    if has_selected_properties or has_selected_custom_field(mdata):
        # On 2/12/20, HubSpot added a lot of additional properties for
        # deals, and appending all of them to requests ended up leading to
        # 414 (url-too-long) errors. HubSpot recommended we use the
        # `includeAllProperties` and `allPropertiesFetchMode` params
        # instead.
        params['includeAllProperties'] = True
        params['allPropertiesFetchMode'] = 'latest_version'

        # Grab selected `hs_date_entered/exited` fields to call the v3 endpoint with
        v3_fields = [
            breadcrumb[1].replace('property_', '')
            for breadcrumb, mdata_map in mdata.items()
            if breadcrumb
            and (mdata_map.get('selected') == True or has_selected_properties)
            and any(prefix in breadcrumb[1] for prefix in V3_PREFIXES)
        ]

    url = get_url('deals_all')
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request(STATE, 'deals', url, params, 'deals', "hasMore",
                               ["offset"], ["offset"], v3_fields=v3_fields):
            row_properties = row['properties']
            modified_time = None
            if bookmark_key in row_properties:
                # HubSpot returns timestamps in millis
                timestamp_millis = row_properties[bookmark_key]['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis,
                                                                datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # HubSpot returns timestamps in millis
                timestamp_millis = row_properties['createdate']['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis,
                                                                datetime.timezone.utc)

            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = bumble_bee.transform(lift_properties_and_versions(row),
                                              schema, mdata)
                singer.write_record("deals", record, catalog.get('stream_alias'),
                                    time_extracted=utils.now())

    STATE = singer.write_bookmark(STATE, 'deals', bookmark_key, utils.strftime(max_bk_value))
    singer.write_state(STATE)
    return STATE

def sync_entity_chunked(STATE, catalog, entity_name, key_properties, path):
    schema = load_schema(entity_name)
    bookmark_key = 'startTimestamp'

    singer.write_schema(entity_name, schema, key_properties,
                        [bookmark_key], catalog.get('stream_alias'))

    start = get_start(STATE, entity_name, bookmark_key)
    LOGGER.info("sync_%s from %s", entity_name, start)

    now = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC)
    now_ts = int(now.timestamp() * 1000)

    start_ts = int(utils.strptime_with_tz(start).timestamp() * 1000)
    url = get_url(entity_name)

    mdata = metadata.to_map(catalog.get('metadata'))

    if entity_name == 'email_events':
        window_size = int(CONFIG['email_chunk_size'])
    elif entity_name == 'subscription_changes':
        window_size = int(CONFIG['subscription_chunk_size'])

    with metrics.record_counter(entity_name) as counter:
        while start_ts < now_ts:
            end_ts = start_ts + window_size
            params = {
                'startTimestamp': start_ts,
                'endTimestamp': end_ts,
                'limit': 1000,
            }
            with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
                while True:
                    our_offset = singer.get_offset(STATE, entity_name)
                    if bool(our_offset) and our_offset.get('offset') is not None:
                        params[StateFields.offset] = our_offset.get('offset')

                    data = request(url, params).json()
                    time_extracted = utils.now()

                    if data.get(path) is None:
                        raise RuntimeError("Unexpected API response: {} not in {}".format(
                            path, data.keys()))

                    for row in data[path]:
                        counter.increment()
                        record = bumble_bee.transform(lift_properties_and_versions(row),
                                                      schema, mdata)
                        singer.write_record(entity_name, record,
                                            catalog.get('stream_alias'),
                                            time_extracted=time_extracted)

                    if data.get('hasMore'):
                        STATE = singer.set_offset(STATE, entity_name, 'offset',
                                                  data['offset'])
                        singer.write_state(STATE)
                    else:
                        STATE = singer.clear_offset(STATE, entity_name)
                        singer.write_state(STATE)
                        break

            STATE = singer.write_bookmark(
                STATE, entity_name, 'startTimestamp',
                utils.strftime(datetime.datetime.fromtimestamp(
                    start_ts / 1000, datetime.timezone.utc)))
            singer.write_state(STATE)
            start_ts = end_ts

    STATE = singer.clear_offset(STATE, entity_name)
    singer.write_state(STATE)
    return STATE

def get_all_pull_requests(schemas, repo_path, state, mdata):
    '''
    https://developer.github.com/v3/pulls/#list-pull-requests
    '''
    bookmark_value = get_bookmark(state, repo_path, "pull_requests", "since")
    if bookmark_value:
        bookmark_time = singer.utils.strptime_to_utc(bookmark_value)
    else:
        bookmark_time = 0

    with metrics.record_counter('pull_requests') as counter:
        with metrics.record_counter('reviews') as reviews_counter:
            for response in authed_get_all_pages(
                    'pull_requests',
                    'https://api.github.com/repos/{}/pulls?state=all&sort=updated&direction=desc'.format(repo_path)):
                pull_requests = response.json()
                extraction_time = singer.utils.now()
                for pr in pull_requests:
                    # Skip records that haven't been updated since the last run.
                    # The GitHub API doesn't currently allow a ?since param for
                    # pulls; once we find the first piece of old data we can
                    # return, thanks to the sorting.
                    if bookmark_time and \
                            singer.utils.strptime_to_utc(pr.get('updated_at')) < bookmark_time:
                        return state

                    pr_num = pr.get('number')
                    pr_id = pr.get('id')
                    pr['_sdc_repository'] = repo_path

                    # Transform and write the pull_request record
                    with singer.Transformer() as transformer:
                        rec = transformer.transform(pr, schemas['pull_requests'],
                                                    metadata=metadata.to_map(mdata))
                    singer.write_record('pull_requests', rec,
                                        time_extracted=extraction_time)
                    singer.write_bookmark(state, repo_path, 'pull_requests',
                                          {'since': singer.utils.strftime(extraction_time)})
                    counter.increment()

                    # Sync reviews if that schema is present (only there if selected)
                    if schemas.get('reviews'):
                        for review_rec in get_reviews_for_pr(pr_num, schemas['reviews'],
                                                             repo_path, state, mdata):
                            singer.write_record('reviews', review_rec,
                                                time_extracted=extraction_time)
                            singer.write_bookmark(state, repo_path, 'reviews',
                                                  {'since': singer.utils.strftime(extraction_time)})
                            reviews_counter.increment()

                    # Sync review comments if that schema is present (only there if selected)
                    if schemas.get('review_comments'):
                        for review_comment_rec in get_review_comments_for_pr(
                                pr_num, schemas['review_comments'], repo_path, state, mdata):
                            singer.write_record('review_comments', review_comment_rec,
                                                time_extracted=extraction_time)
                            singer.write_bookmark(state, repo_path, 'review_comments',
                                                  {'since': singer.utils.strftime(extraction_time)})

                    if schemas.get('pr_commits'):
                        for pr_commit in get_commits_for_pr(pr_num, pr_id,
                                                            schemas['pr_commits'],
                                                            repo_path, state, mdata):
                            singer.write_record('pr_commits', pr_commit,
                                                time_extracted=extraction_time)
                            singer.write_bookmark(state, repo_path, 'pr_commits',
                                                  {'since': singer.utils.strftime(extraction_time)})

    return state

def do_sync_historical_binlog(mysql_conn, catalog_entry, state, columns,
                              use_gtid: bool, engine: str):
    binlog.verify_binlog_config(mysql_conn)

    if use_gtid and engine == MYSQL_ENGINE:
        binlog.verify_gtid_config(mysql_conn)

    is_view = common.get_is_view(catalog_entry)

    if is_view:
        raise Exception(f"Unable to replicate stream({catalog_entry.stream}) with binlog because it is a view.")

    log_file = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'log_file')
    log_pos = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'log_pos')

    gtid = None
    if use_gtid:
        gtid = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'gtid')

    max_pk_values = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'max_pk_values')

    write_schema_message(catalog_entry)

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id, state)

    if max_pk_values and ((use_gtid and gtid) or (log_file and log_pos)):
        LOGGER.info("Resuming initial full table sync for LOG_BASED stream %s",
                    catalog_entry.tap_stream_id)
        full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)
    else:
        LOGGER.info("Performing initial full table sync for LOG_BASED stream %s",
                    catalog_entry.tap_stream_id)

        state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                      'initial_binlog_complete', False)

        current_log_file, current_log_pos = binlog.fetch_current_log_file_and_pos(mysql_conn)

        current_gtid = None
        if use_gtid:
            current_gtid = binlog.fetch_current_gtid_pos(mysql_conn, engine)

        state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                      'version', stream_version)

        if full_table.pks_are_auto_incrementing(mysql_conn, catalog_entry):
            # We must save log_file, log_pos, gtid across FULL_TABLE syncs when
            # using an incrementing PK
            state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                          'log_file', current_log_file)
            state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                          'log_pos', current_log_pos)
            if current_gtid:
                state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                              'gtid', current_gtid)

            full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)
        else:
            full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)

            state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                          'log_file', current_log_file)
            state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                          'log_pos', current_log_pos)
            if current_gtid:
                state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                              'gtid', current_gtid)

def sync_table(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    stream_version = singer.get_bookmark(state, stream['tap_stream_id'], 'version')
    if stream_version is None:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version', stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get('schema-name')

    escaped_columns = map(partial(post_db.prepare_columns_for_select_sql, md_map=md_map),
                          desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=post_db.calculate_destination_stream_name(stream, md_map),
        version=stream_version)

    singer.write_message(activate_version_message)

    replication_key = md_map.get((), {}).get('replication-key')
    replication_key_value = singer.get_bookmark(state, stream['tap_stream_id'],
                                                'replication_key_value')
    replication_key_sql_datatype = md_map.get(('properties', replication_key)).get('sql-datatype')

    hstore_available = post_db.hstore_available(conn_info)
    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:
            # Client-side character encoding defaults to the value in postgresql.conf
            # under client_encoding. The server / db can also have its own configured
            # encoding.
            with conn.cursor() as cur:
                cur.execute("show server_encoding")
                LOGGER.info("Current Server Encoding: %s", cur.fetchone()[0])
                cur.execute("show client_encoding")
                LOGGER.info("Current Client Encoding: %s", cur.fetchone()[0])

            if hstore_available:
                LOGGER.info("hstore is available")
                psycopg2.extras.register_hstore(conn)
            else:
                LOGGER.info("hstore is UNavailable")

            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor,
                             name='pipelinewise') as cur:
                cur.itersize = post_db.CURSOR_ITER_SIZE
                LOGGER.info("Beginning new incremental replication sync %s", stream_version)
                if replication_key_value:
                    select_sql = """SELECT {}
                                    FROM {}
                                    WHERE {} >= '{}'::{}
                                    ORDER BY {} ASC""".format(
                        ','.join(escaped_columns),
                        post_db.fully_qualified_table_name(schema_name, stream['table_name']),
                        post_db.prepare_columns_sql(replication_key),
                        replication_key_value,
                        replication_key_sql_datatype,
                        post_db.prepare_columns_sql(replication_key))
                else:
                    # If there is no replication_key_value
                    select_sql = """SELECT {}
                                    FROM {}
                                    ORDER BY {} ASC""".format(
                        ','.join(escaped_columns),
                        post_db.fully_qualified_table_name(schema_name, stream['table_name']),
                        post_db.prepare_columns_sql(replication_key))

                LOGGER.info('select statement: %s with itersize %s', select_sql, cur.itersize)
                cur.execute(select_sql)

                rows_saved = 0

                for rec in cur:
                    record_message = post_db.selected_row_to_singer_message(
                        stream, rec, stream_version, desired_columns,
                        time_extracted, md_map)
                    singer.write_message(record_message)
                    rows_saved = rows_saved + 1

                    # Picking a replication_key with NULL values will result in it
                    # ALWAYS being synced, which is not great. Even worse would be
                    # allowing the NULL value to enter into the state.
                    if record_message.record[replication_key] is not None:
                        state = singer.write_bookmark(state,
                                                      stream['tap_stream_id'],
                                                      'replication_key_value',
                                                      record_message.record[replication_key])

                    if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

                    counter.increment()

    return state

def sync(config, state, catalog):
    errors_encountered = False

    selected_stream_ids = get_selected_streams(catalog)
    client = GAClient(config)

    if not state.get('bookmarks'):
        state['bookmarks'] = {}

    # Loop over streams in catalog
    for stream in catalog['streams']:
        stream_id = stream['tap_stream_id']
        stream_schema = stream['schema']
        report_definition = ReportsHelper.get_report_definition(stream)

        stream_metadata = metadata.to_map(stream['metadata'])
        key_properties = metadata.get(stream_metadata, (), "table-key-properties")

        if stream_id in selected_stream_ids:
            LOGGER.info('Syncing stream: ' + stream_id)

            start_date = get_start_date(config, state, stream_id)
            end_date = get_end_date(config)

            date_range = []
            while start_date <= end_date:
                date_range.append(utils.strftime(start_date, '%Y-%m-%d'))
                start_date += timedelta(days=1)

            for date in date_range:
                try:
                    results = client.process_stream(date, report_definition)

                    # We write the schema message after we are sure that we
                    # could fetch records without errors
                    singer.write_schema(stream_id, stream_schema, key_properties)
                    singer.write_records(stream_id, results)
                    singer.write_bookmark(state, stream_id, 'last_report_date', date)
                    singer.write_state(state)
                except TapGaInvalidArgumentError as e:
                    errors_encountered = True
                    LOGGER.error("Skipping stream: '{}' due to invalid report definition.".format(stream_id))
                    LOGGER.debug("Error: '{}'.".format(e))
                except TapGaRateLimitError as e:
                    errors_encountered = True
                    LOGGER.error("Skipping stream: '{}' due to Rate Limit Errors.".format(stream_id))
                    LOGGER.debug("Error: '{}'.".format(e))
                except TapGaQuotaExceededError as e:
                    errors_encountered = True
                    LOGGER.error("Skipping stream: '{}' due to Quota Exceeded Errors.".format(stream_id))
                    LOGGER.debug("Error: '{}'.".format(e))
                except TapGaAuthenticationError as e:
                    LOGGER.error("Stopping execution while processing '{}' due to Authentication Errors.".format(stream_id))
                    LOGGER.debug("Error: '{}'.".format(e))
                    sys.exit(1)
                except TapGaUnknownError as e:
                    LOGGER.error("Stopping execution while processing '{}' due to Unknown Errors.".format(stream_id))
                    LOGGER.debug("Error: '{}'.".format(e))
                    sys.exit(1)
        else:
            LOGGER.info('Skipping unselected stream: ' + stream_id)

    # If we encountered errors, exit with 1
    if errors_encountered:
        sys.exit(1)

    return

def setUp(self):
    self.maxDiff = None
    self.state = {}
    self.conn = test_utils.get_test_connection()

    log_file, log_pos = binlog.fetch_current_log_file_and_pos(self.conn)

    with connect_with_backoff(self.conn) as open_conn:
        with open_conn.cursor() as cursor:
            cursor.execute('CREATE TABLE binlog_1 (id int, updated datetime)')
            cursor.execute('CREATE TABLE binlog_2 (id int, updated datetime)')
            cursor.execute('INSERT INTO binlog_1 (id, updated) VALUES (1, \'2017-06-01\')')
            cursor.execute('INSERT INTO binlog_1 (id, updated) VALUES (2, \'2017-06-20\')')
            cursor.execute('INSERT INTO binlog_1 (id, updated) VALUES (3, \'2017-09-22\')')
            cursor.execute('INSERT INTO binlog_2 (id, updated) VALUES (1, \'2017-10-22\')')
            cursor.execute('INSERT INTO binlog_2 (id, updated) VALUES (2, \'2017-11-10\')')
            cursor.execute('INSERT INTO binlog_2 (id, updated) VALUES (3, \'2017-12-10\')')
            cursor.execute('UPDATE binlog_1 set updated=\'2018-06-18\' WHERE id = 3')
            cursor.execute('UPDATE binlog_2 set updated=\'2018-06-18\' WHERE id = 2')
            cursor.execute('DELETE FROM binlog_1 WHERE id = 2')
            cursor.execute('DELETE FROM binlog_2 WHERE id = 1')

        open_conn.commit()

    self.catalog = test_utils.discover_catalog(self.conn, {})

    for stream in self.catalog.streams:
        stream.stream = stream.table
        stream.metadata = [
            {
                'breadcrumb': (),
                'metadata': {
                    'selected': True,
                    'database-name': 'tap_mysql_test',
                    'table-key-properties': ['id']
                }
            },
            {
                'breadcrumb': ('properties', 'id'),
                'metadata': {'selected': True}
            },
            {
                'breadcrumb': ('properties', 'updated'),
                'metadata': {'selected': True}
            }
        ]

        test_utils.set_replication_method_and_key(stream, 'LOG_BASED', None)

        self.state = singer.write_bookmark(self.state, stream.tap_stream_id,
                                           'log_file', log_file)
        self.state = singer.write_bookmark(self.state, stream.tap_stream_id,
                                           'log_pos', log_pos)
        self.state = singer.write_bookmark(self.state, stream.tap_stream_id,
                                           'version', singer.utils.now())

def sync_table(connection, catalog_entry, state):
    columns = list(catalog_entry.schema.properties.keys())
    start_date = CONFIG.get('start_date')
    formatted_start_date = None

    if not columns:
        LOGGER.warning('There are no columns selected for table {}, skipping it'.format(
            catalog_entry.table))
        return

    tap_stream_id = catalog_entry.tap_stream_id
    LOGGER.info('Beginning sync for {} table'.format(tap_stream_id))

    with connection.cursor() as cursor:
        schema, table = catalog_entry.table.split('.')
        select = 'SELECT {} FROM {}.{}'.format(
            ','.join('"{}"'.format(c) for c in columns),
            '"{}"'.format(schema),
            '"{}"'.format(table))
        params = {}

        if start_date is not None:
            formatted_start_date = datetime.datetime.strptime(
                start_date, '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=pytz.UTC)

        replication_key = metadata.to_map(catalog_entry.metadata).get((), {}).get('replication-key')
        replication_key_value = None
        bookmark_is_empty = state.get('bookmarks', {}).get(tap_stream_id) is None
        stream_version = get_stream_version(tap_stream_id, state)
        state = singer.write_bookmark(state, tap_stream_id, 'version', stream_version)
        activate_version_message = singer.ActivateVersionMessage(
            stream=catalog_entry.stream,
            version=stream_version)

        # If there's a replication key, we want to emit an ACTIVATE_VERSION
        # message at the beginning so the records show up right away. If
        # there's no bookmark at all for this stream, assume it's the very
        # first replication. That is, clients have never seen rows for this
        # stream before, so they can immediately acknowledge the present
        # version.
        if replication_key or bookmark_is_empty:
            yield activate_version_message

        if replication_key:
            # Guard against a missing start_date: formatted_start_date may be
            # None here, so only fall back to it when it exists.
            replication_key_value = (
                singer.get_bookmark(state, tap_stream_id, 'replication_key_value')
                or (formatted_start_date.isoformat() if formatted_start_date else None))

        if replication_key_value is not None:
            entry_schema = catalog_entry.schema

            if entry_schema.properties[replication_key].format == 'date-time':
                replication_key_value = pendulum.parse(replication_key_value)

            select += ' WHERE {} >= %(replication_key_value)s ORDER BY {} ASC'.format(
                replication_key, replication_key)
            params['replication_key_value'] = replication_key_value
        elif replication_key is not None:
            select += ' ORDER BY {} ASC'.format(replication_key)

        time_extracted = utils.now()
        query_string = cursor.mogrify(select, params)
        LOGGER.info('Running {}'.format(query_string))
        cursor.execute(select, params)

        row = cursor.fetchone()
        rows_saved = 0

        with metrics.record_counter(None) as counter:
            counter.tags['database'] = catalog_entry.database
            counter.tags['table'] = catalog_entry.table

            while row:
                counter.increment()
                rows_saved += 1
                record_message = row_to_record(catalog_entry, stream_version,
                                               row, columns, time_extracted)
                yield record_message

                if replication_key is not None:
                    state = singer.write_bookmark(state, tap_stream_id,
                                                  'replication_key_value',
                                                  record_message.record[replication_key])
                if rows_saved % 1000 == 0:
                    yield singer.StateMessage(value=copy.deepcopy(state))
                row = cursor.fetchone()

        if not replication_key:
            yield activate_version_message
            state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                          'version', None)

        yield singer.StateMessage(value=copy.deepcopy(state))

async def sync_report_interval(client, account_id, report_stream,
                               start_date, end_date):
    state_key = '{}_{}'.format(account_id, report_stream.stream)
    report_name = stringcase.pascalcase(report_stream.stream)

    report_schema = get_report_schema(client, report_name)
    singer.write_schema(report_stream.stream, report_schema, [])

    report_time = arrow.get().isoformat()

    # Get a request id to retrieve the report stream
    request_id = get_report_request_id(client, account_id, report_stream,
                                       report_name, start_date, end_date,
                                       state_key)

    singer.write_bookmark(STATE, state_key, 'request_id', request_id)
    singer.write_state(STATE)

    try:
        # Get the success status and download URL
        success, download_url = await poll_report(client, account_id, report_name,
                                                  start_date, end_date, request_id)
    except Exception as some_error:  # pylint: disable=broad-except,unused-variable
        LOGGER.info('The request_id %s for %s is invalid, generating a new one',
                    request_id, state_key)
        request_id = get_report_request_id(client, account_id, report_stream,
                                           report_name, start_date, end_date,
                                           state_key, force_refresh=True)

        singer.write_bookmark(STATE, state_key, 'request_id', request_id)
        singer.write_state(STATE)

        success, download_url = await poll_report(client, account_id, report_name,
                                                  start_date, end_date, request_id)

    if success and download_url:  # pylint: disable=no-else-return
        LOGGER.info('Streaming report: %s for account %s - from %s to %s',
                    report_name, account_id, start_date, end_date)

        stream_report(report_stream.stream, report_name, download_url, report_time)
        singer.write_bookmark(STATE, state_key, 'request_id', None)
        singer.write_bookmark(STATE, state_key, 'date', end_date.isoformat())
        singer.write_state(STATE)
        return True
    elif success and not download_url:
        LOGGER.info('No data for report: %s for account %s - from %s to %s',
                    report_name, account_id, start_date, end_date)
        singer.write_bookmark(STATE, state_key, 'request_id', None)
        singer.write_bookmark(STATE, state_key, 'date', end_date.isoformat())
        singer.write_state(STATE)
        return True
    else:
        LOGGER.info('Unsuccessful request for report: %s for account %s - from %s to %s',
                    report_name, account_id, start_date, end_date)
        singer.write_bookmark(STATE, state_key, 'request_id', None)
        singer.write_state(STATE)
        return False

def sync_query(cursor, catalog_entry, state, select_sql, columns, stream_version, params):
    """..."""
    replication_key = singer.get_bookmark(state, catalog_entry.tap_stream_id,
                                          'replication_key')

    time_extracted = utils.now()

    LOGGER.info('Running %s', select_sql)
    cursor.execute(select_sql, params)

    row = cursor.fetchone()
    rows_saved = 0

    database_name = get_database_name(catalog_entry)

    with metrics.record_counter(None) as counter:
        counter.tags['database'] = database_name
        counter.tags['table'] = catalog_entry.table

        while row:
            counter.increment()
            rows_saved += 1
            record_message = row_to_singer_record(catalog_entry, stream_version,
                                                  row, columns, time_extracted)
            singer.write_message(record_message)

            md_map = metadata.to_map(catalog_entry.metadata)
            stream_metadata = md_map.get((), {})
            replication_method = stream_metadata.get('replication-method')

            if replication_method == 'FULL_TABLE':
                key_properties = get_key_properties(catalog_entry)

                max_pk_values = singer.get_bookmark(state, catalog_entry.tap_stream_id,
                                                    'max_pk_values')

                if max_pk_values:
                    last_pk_fetched = {k: v for k, v in record_message.record.items()
                                       if k in key_properties}

                    state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                                  'last_pk_fetched', last_pk_fetched)

            elif replication_method == 'INCREMENTAL':
                if replication_key is not None:
                    state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                                  'replication_key', replication_key)

                    state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                                  'replication_key_value',
                                                  record_message.record[replication_key])

            if rows_saved % 1000 == 0:
                singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
            row = cursor.fetchone()

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

def sync(config: dict, state: dict, catalog: singer.Catalog):
    """
    Synchronise data from source schemas using the input context.
    """
    session = None

    # Get bookmarks of the state of each stream
    bookmarks = state.get('bookmarks', dict())

    # Parse timestamp and convert to date
    start_date = singer.utils.strptime_to_utc(config['start_date'])

    # selected_stream_ids = get_selected_streams(catalog)
    #
    # if not selected_stream_ids:
    #     singer.log_warning('No streams selected')

    # Iterate over streams in catalog
    for stream in catalog.streams:
        stream_id = stream.tap_stream_id

        # Skip if not selected for sync
        # if stream_id not in selected_stream_ids:
        #     continue

        LOGGER.info('Syncing stream: "%s"', stream_id)

        filter_schema(stream.schema, stream.metadata)

        # Emit schema
        singer.write_schema(
            stream_name=stream_id,
            schema=stream.schema.to_dict(),
            key_properties=stream.key_properties)

        # Initialise the Gemini HTTP API session (only do this once)
        if session is None:
            session = tap_gemini.transport.GeminiSession(
                # Mandatory
                client_id=config['username'],
                client_secret=config['password'],
                refresh_token=config['refresh_token'],
                # Optional
                api_version=config.get('api_version'),
                user_agent=config.get('user_agent'),
                session_options=config.get('session', dict()),
                sandbox=config.get('sandbox'))

        # Get a list of all the account IDs
        advertiser_ids = config.get('advertiser_ids',
                                    [adv['id'] for adv in session.advertisers])

        # Create data stream
        if stream_id in OBJECT_MAP.keys():
            # List API objects
            model = OBJECT_MAP[stream_id]
            write_records(stream=stream,
                          rows=model.list_data(session=session),
                          tags=dict(object=stream_id))
        else:
            # Run report

            # Use the bookmark to continue where we left off
            bookmark = bookmarks.get(stream_id, dict())
            start_date = bookmark.get(tap_gemini.settings.BOOKMARK_KEY, start_date)

            # Define the time range
            try:
                # Is there a maximum look back? (i.e. the earliest start date for a report)
                days = tap_gemini.settings.MAX_LOOK_BACK_DAYS[stream_id]

                # Get the current timestamp and "look back" the specified number of days
                look_back_start_date = singer.utils.now() - datetime.timedelta(days=days)

                # Must we confine the time range to avoid errors?
                if look_back_start_date > start_date:
                    start_date = look_back_start_date
                    singer.log_warning(
                        "\"%s\" enforced maximum look back of %s days, start date set to %s",
                        stream_id, days, start_date)
            except KeyError:
                pass

            # Break into time window chunks, if necessary
            try:
                time_windows = generate_time_windows(
                    start=start_date,
                    size=tap_gemini.settings.MAX_WINDOW_DAYS[stream_id])
            except KeyError:
                # Default time window: just use the specified start/end date
                time_windows = ((start_date,
                                 cast_date_to_datetime(date=datetime.date.today())),)

            # Each report is run within a single time window
            for start, end in time_windows:
                # Build the report definition
                report_params = build_report_params(config=config, stream=stream,
                                                    start_date=start, end_date=end)
                report_params['advertiser_ids'] = advertiser_ids

                # Define the report
                rep = tap_gemini.report.GeminiReport(
                    session=session,
                    poll_interval=config.get('poll_interval'),
                    **report_params)

                # Emit records
                write_records(stream=stream, rows=rep.stream(), tags=rep.tags)

                # Bookmark the progress through the stream:
                # get the time when the data is complete (no further changes will occur)
                bookmark_timestamp = get_books_closed(rep=rep)

                # Preserve state for each stream
                singer.write_bookmark(
                    state=state,
                    tap_stream_id=stream_id,
                    key=tap_gemini.settings.BOOKMARK_KEY,
                    val=cast_date_to_datetime(bookmark_timestamp).isoformat())

                singer.write_state(state)

def write_stream_duration(state, tap_stream_id, start, end):
    duration = (end - start).total_seconds()
    return singer.write_bookmark(state, tap_stream_id, "last_sync_duration", duration)

def write_current_sync_start(state, tap_stream_id, start):
    value = start
    if start is not None:
        value = utils.strftime(start)
    return singer.write_bookmark(state, tap_stream_id, "current_sync_start", value)

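# Minimal sketch (hypothetical stream name and timing) of the lookback
# bookkeeping these two helpers support, as used by sync_engagements above;
# assumes `utils` is singer.utils and `state` follows the Singer bookmark shape.
sync_start = utils.now()
state = write_current_sync_start(state, 'engagements', sync_start)

# ... the stream's records are synced here ...

# Record how long the sync took so the next run can widen its window by the
# same amount, catching records updated while this sync was running.
state = write_stream_duration(state, 'engagements', sync_start, utils.now())
# Clear the marker once the sync has completed cleanly.
state = write_current_sync_start(state, 'engagements', None)
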
def setUp(self):
    self.conn = test_utils.get_test_connection()
    self.state = {}

    log_file, log_pos = binlog.fetch_current_log_file_and_pos(self.conn)

    with connect_with_backoff(self.conn) as open_conn:
        with open_conn.cursor() as cursor:
            cursor.execute(
                'CREATE TABLE datetime_types (id int, datetime_col datetime, timestamp_col timestamp, time_col time, date_col date)')
            cursor.execute(
                'INSERT INTO datetime_types (id, datetime_col, timestamp_col, time_col, date_col) VALUES (1, \'0000-00-00\', \'0000-00-00 00:00:00\', \'00:00:00\', \'0000-00-00\' )')
            cursor.execute(
                'INSERT INTO datetime_types (id, datetime_col, timestamp_col, time_col, date_col) VALUES (2, NULL, NULL, NULL, NULL)')
        open_conn.commit()

    self.catalog = test_utils.discover_catalog(self.conn, {})

    for stream in self.catalog.streams:
        stream.stream = stream.table
        stream.metadata = [
            {
                'breadcrumb': (),
                'metadata': {
                    'selected': True,
                    'database-name': 'tap_mysql_test',
                    'table-key-properties': ['id']
                }
            },
            {
                'breadcrumb': ('properties', 'id'),
                'metadata': {'selected': True}
            },
            {
                'breadcrumb': ('properties', 'datetime_col'),
                'metadata': {'selected': True}
            },
            {
                'breadcrumb': ('properties', 'timestamp_col'),
                'metadata': {'selected': True}
            },
            {
                'breadcrumb': ('properties', 'time_col'),
                'metadata': {'selected': True}
            },
            {
                'breadcrumb': ('properties', 'date_col'),
                'metadata': {'selected': True}
            }
        ]

        test_utils.set_replication_method_and_key(stream, 'LOG_BASED', None)

        self.state = singer.write_bookmark(self.state, stream.tap_stream_id,
                                           'log_file', log_file)
        self.state = singer.write_bookmark(self.state, stream.tap_stream_id,
                                           'log_pos', log_pos)
        self.state = singer.write_bookmark(self.state, stream.tap_stream_id,
                                           'version', singer.utils.now())

def sync_companies(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    bumble_bee = Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING)
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_to_utc(get_start(STATE, "companies", bookmark_key))
    LOGGER.info("sync_companies from %s", start)
    schema = load_schema('companies')
    singer.write_schema("companies", schema, ["companyId"],
                        [bookmark_key], catalog.get('stream_alias'))

    # Because this stream doesn't query by `lastUpdated`, it cycles
    # through the data set every time. The issue with this is that there
    # is a race condition by which records may be updated between the
    # start of this table's sync and the end, causing some updates to not
    # be captured. In order to combat this, we must store the current
    # sync's start in the state and not move the bookmark past this value.
    current_sync_start = get_current_sync_start(STATE, "companies") or utils.now()
    STATE = write_current_sync_start(STATE, "companies", current_sync_start)
    singer.write_state(STATE)

    url = get_url("companies_all")
    max_bk_value = start
    if CONTACTS_BY_COMPANY in ctx.selected_stream_ids:
        contacts_by_company_schema = load_schema(CONTACTS_BY_COMPANY)
        singer.write_schema("contacts_by_company", contacts_by_company_schema,
                            ["company-id", "contact-id"])

    with bumble_bee:
        for row in gen_request(STATE, 'companies', url, default_company_params,
                               'companies', 'has-more', ['offset'], ['offset']):
            row_properties = row['properties']
            modified_time = None
            if bookmark_key in row_properties:
                # HubSpot returns timestamps in millis
                timestamp_millis = row_properties[bookmark_key]['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis,
                                                                datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # HubSpot returns timestamps in millis
                timestamp_millis = row_properties['createdate']['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis,
                                                                datetime.timezone.utc)

            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = request(get_url("companies_detail",
                                         company_id=row['companyId'])).json()
                record = bumble_bee.transform(lift_properties_and_versions(record),
                                              schema, mdata)
                singer.write_record("companies", record, catalog.get('stream_alias'),
                                    time_extracted=utils.now())
                if CONTACTS_BY_COMPANY in ctx.selected_stream_ids:
                    STATE = _sync_contacts_by_company(STATE, ctx, record['companyId'])

    # Don't bookmark past the start of this sync to account for updated records
    # during the sync.
    new_bookmark = min(max_bk_value, current_sync_start)
    STATE = singer.write_bookmark(STATE, 'companies', bookmark_key,
                                  utils.strftime(new_bookmark))
    STATE = write_current_sync_start(STATE, 'companies', None)
    singer.write_state(STATE)
    return STATE