def _run_binlog_sync(mysql_conn, reader, binlog_streams_map, state): time_extracted = utils.now() rows_saved = 0 events_skipped = 0 current_log_file, current_log_pos = fetch_current_log_file_and_pos( mysql_conn) log_file = None log_pos = None for binlog_event in reader: if isinstance(binlog_event, RotateEvent): state = update_bookmarks(state, binlog_streams_map, binlog_event.next_binlog, binlog_event.position) else: tap_stream_id = common.generate_tap_stream_id( binlog_event.schema, binlog_event.table) streams_map_entry = binlog_streams_map.get(tap_stream_id, {}) catalog_entry = streams_map_entry.get('catalog_entry') desired_columns = streams_map_entry.get('desired_columns') if not catalog_entry: events_skipped = events_skipped + 1 if events_skipped % UPDATE_BOOKMARK_PERIOD == 0: LOGGER.info( "Skipped %s events so far as they were not for selected tables; %s rows extracted", events_skipped, rows_saved) elif catalog_entry: if isinstance(binlog_event, WriteRowsEvent): rows_saved = handle_write_rows_event( binlog_event, catalog_entry, state, desired_columns, rows_saved, time_extracted) elif isinstance(binlog_event, UpdateRowsEvent): rows_saved = handle_update_rows_event( binlog_event, catalog_entry, state, desired_columns, rows_saved, time_extracted) elif isinstance(binlog_event, DeleteRowsEvent): rows_saved = handle_delete_rows_event( binlog_event, catalog_entry, state, desired_columns, rows_saved, time_extracted) else: LOGGER.info( "Skipping event for table %s.%s as it is not an INSERT, UPDATE, or DELETE", binlog_event.schema, binlog_event.table) # Update log_file and log_pos after every processed binlog event log_file = reader.log_file log_pos = reader.log_pos # The iterator across python-mysql-replication's fetchone method should ultimately terminate # upon receiving an EOF packet. There seem to be some cases when a MySQL server will not send # one causing binlog replication to hang. if current_log_file == log_file and log_pos >= current_log_pos: break # Update singer bookmark and send STATE message periodically if ((rows_saved and rows_saved % UPDATE_BOOKMARK_PERIOD == 0) or (events_skipped and events_skipped % UPDATE_BOOKMARK_PERIOD == 0)): state = update_bookmarks(state, binlog_streams_map, log_file, log_pos) singer.write_message( singer.StateMessage(value=copy.deepcopy(state))) # Update singer bookmark at the last time to point it the the last processed binlog event if log_file and log_pos: state = update_bookmarks(state, binlog_streams_map, log_file, log_pos)
def sync_deals(STATE, ctx): catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) mdata = metadata.to_map(catalog.get('metadata')) bookmark_key = 'hs_lastmodifieddate' start = utils.strptime_with_tz(get_start(STATE, "deals", bookmark_key)) max_bk_value = start LOGGER.info("sync_deals from %s", start) most_recent_modified_time = start params = {'count': 250, 'includeAssociations': False, 'properties': []} schema = load_schema("deals") singer.write_schema("deals", schema, ["dealId"], [bookmark_key], catalog.get('stream_alias')) # Check if we should include associations for key in mdata.keys(): if 'associations' in key: assoc_mdata = mdata.get(key) if (assoc_mdata.get('selected') and assoc_mdata.get('selected') == True): params['includeAssociations'] = True if mdata.get(('properties', 'properties'), {}).get('selected') or has_selected_custom_field(mdata): # On 2/12/20, hubspot added a lot of additional properties for # deals, and appending all of them to requests ended up leading to # 414 (url-too-long) errors. Hubspot recommended we use the # `includeAllProperties` and `allpropertiesFetchMode` params # instead. params['includeAllProperties'] = True params['allPropertiesFetchMode'] = 'latest_version' url = get_url('deals_all') with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee: for row in gen_request(STATE, 'deals', url, params, 'deals', "hasMore", ["offset"], ["offset"]): row_properties = row['properties'] modified_time = None if bookmark_key in row_properties: # Hubspot returns timestamps in millis timestamp_millis = row_properties[bookmark_key][ 'timestamp'] / 1000.0 modified_time = datetime.datetime.fromtimestamp( timestamp_millis, datetime.timezone.utc) elif 'createdate' in row_properties: # Hubspot returns timestamps in millis timestamp_millis = row_properties['createdate'][ 'timestamp'] / 1000.0 modified_time = datetime.datetime.fromtimestamp( timestamp_millis, datetime.timezone.utc) if modified_time and modified_time >= max_bk_value: max_bk_value = modified_time if not modified_time or modified_time >= start: record = bumble_bee.transform( lift_properties_and_versions(row), schema, mdata) singer.write_record("deals", record, catalog.get('stream_alias'), time_extracted=utils.now()) STATE = singer.write_bookmark(STATE, 'deals', bookmark_key, utils.strftime(max_bk_value)) singer.write_state(STATE) return STATE
def sync_endpoint( schema_name, endpoint=None, path=None, date_fields=None, with_updated_since=True, #pylint: disable=too-many-arguments for_each_handler=None, map_handler=None, object_to_id=None): schema = load_schema(schema_name) bookmark_property = 'updated_at' singer.write_schema(schema_name, schema, ["id"], bookmark_properties=[bookmark_property]) start = get_start(schema_name) start_dt = pendulum.parse(start) updated_since = start_dt.strftime("%Y-%m-%dT%H:%M:%SZ") with Transformer() as transformer: page = 1 while page is not None: url = get_url(endpoint or schema_name) params = { "updated_since": updated_since } if with_updated_since else {} params['page'] = page response = request(url, params) path = path or schema_name data = response[path] time_extracted = utils.now() for row in data: if map_handler is not None: row = map_handler(row) if object_to_id is not None: for key in object_to_id: if row[key] is not None: row[key + '_id'] = row[key]['id'] else: row[key + '_id'] = None remove_empty_date_times(row, schema) item = transformer.transform(row, schema) append_times_to_dates(item, date_fields) if item[bookmark_property] >= start: singer.write_record(schema_name, item, time_extracted=time_extracted) # take any additional actions required for the currently loaded endpoint if for_each_handler is not None: for_each_handler(row, time_extracted=time_extracted) utils.update_state(STATE, schema_name, item[bookmark_property]) page = response['next_page'] singer.write_state(STATE)
def sync_records(sf, catalog_entry, state, counter): chunked_bookmark = singer_utils.strptime_with_tz( sf.get_start_date(state, catalog_entry)) stream = catalog_entry['stream'] schema = catalog_entry['schema'] stream_alias = catalog_entry.get('stream_alias') catalog_metadata = metadata.to_map(catalog_entry['metadata']) replication_key = catalog_metadata.get((), {}).get('replication-key') stream_version = get_stream_version(catalog_entry, state) activate_version_message = singer.ActivateVersionMessage( stream=(stream_alias or stream), version=stream_version) start_time = singer_utils.now() LOGGER.info('Syncing Salesforce data for stream %s', stream) for rec in sf.query(catalog_entry, state): counter.increment() with Transformer(pre_hook=transform_bulk_data_hook) as transformer: rec = transformer.transform(rec, schema) rec = fix_record_anytype(rec, schema) singer.write_message( singer.RecordMessage(stream=(stream_alias or stream), record=rec, version=stream_version, time_extracted=start_time)) replication_key_value = replication_key and singer_utils.strptime_with_tz( rec[replication_key]) if sf.pk_chunking: if replication_key_value and replication_key_value <= start_time and replication_key_value > chunked_bookmark: # Replace the highest seen bookmark and save the state in case we need to resume later chunked_bookmark = singer_utils.strptime_with_tz( rec[replication_key]) state = singer.write_bookmark( state, catalog_entry['tap_stream_id'], 'JobHighestBookmarkSeen', singer_utils.strftime(chunked_bookmark)) singer.write_state(state) # Before writing a bookmark, make sure Salesforce has not given us a # record with one outside our range elif replication_key_value and replication_key_value <= start_time: state = singer.write_bookmark(state, catalog_entry['tap_stream_id'], replication_key, rec[replication_key]) singer.write_state(state) # Tables with no replication_key will send an # activate_version message for the next sync if not replication_key: singer.write_message(activate_version_message) state = singer.write_bookmark(state, catalog_entry['tap_stream_id'], 'version', None) # If pk_chunking is set, only write a bookmark at the end if sf.pk_chunking: # Write a bookmark with the highest value we've seen state = singer.write_bookmark(state, catalog_entry['tap_stream_id'], replication_key, singer_utils.strftime(chunked_bookmark))
def sync_table(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    first_run = singer.get_bookmark(state, stream.tap_stream_id, 'version') is None

    stream_version = singer.get_bookmark(state, stream.tap_stream_id, 'version')
    if stream_version is None:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream.tap_stream_id, 'version', stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get('schema-name')

    escaped_columns = map(post_db.prepare_columns_sql, desired_columns)

    activate_version_message = singer.ActivateVersionMessage(stream=stream.stream,
                                                             version=stream_version)

    if first_run:
        singer.write_message(activate_version_message)

    replication_key = md_map.get((), {}).get('replication-key')
    replication_key_value = singer.get_bookmark(state, stream.tap_stream_id, 'replication_key_value')
    replication_key_sql_datatype = md_map.get(('properties', replication_key)).get('sql-datatype')

    hstore_available = post_db.hstore_available(conn_info)
    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:
            if hstore_available:
                LOGGER.info("hstore is available")
                psycopg2.extras.register_hstore(conn)
            else:
                LOGGER.info("hstore is UNavailable")

            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor, name='stitch_cursor') as cur:
                cur.itersize = post_db.cursor_iter_size
                LOGGER.info("Beginning new incremental replication sync %s", stream_version)

                if replication_key_value:
                    select_sql = """SELECT {}
                                    FROM {}
                                    WHERE {} >= '{}'::{}
                                    ORDER BY {} ASC""".format(
                        ','.join(escaped_columns),
                        post_db.fully_qualified_table_name(schema_name, stream.table),
                        post_db.prepare_columns_sql(replication_key),
                        replication_key_value,
                        replication_key_sql_datatype,
                        post_db.prepare_columns_sql(replication_key))
                else:
                    # if not replication_key_value
                    select_sql = """SELECT {}
                                    FROM {}
                                    ORDER BY {} ASC""".format(
                        ','.join(escaped_columns),
                        post_db.fully_qualified_table_name(schema_name, stream.table),
                        post_db.prepare_columns_sql(replication_key))

                LOGGER.info("SELECT STATEMENT: %s", select_sql)
                cur.execute(select_sql)

                rows_saved = 0
                for rec in cur:
                    record_message = post_db.selected_row_to_singer_message(
                        stream, rec, stream_version, desired_columns, time_extracted, md_map)
                    singer.write_message(record_message)
                    rows_saved = rows_saved + 1

                    # Picking a replication_key with NULL values will result in it ALWAYS being synced,
                    # which is not great. Even worse would be allowing the NULL value to enter into the state.
                    if record_message.record[replication_key] is not None:
                        state = singer.write_bookmark(state,
                                                      stream.tap_stream_id,
                                                      'replication_key_value',
                                                      record_message.record[replication_key])

                    if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

                    counter.increment()

    return state
def sync_endpoint(client,  # pylint: disable=too-many-arguments
                  catalog,
                  state,
                  start_date,
                  stream_name,
                  site,
                  sub_type,
                  dimensions_list,
                  path,
                  endpoint_config,
                  api_method,
                  pagination,
                  static_params,
                  bookmark_field=None,
                  data_key=None,
                  body_params=None,
                  id_fields=None):
    # Get the latest bookmark for the stream and set the last_datetime
    last_datetime = None
    max_bookmark_value = None

    last_datetime = get_bookmark(state, stream_name, site, sub_type, start_date)
    max_bookmark_value = last_datetime

    # Pagination: loop thru all pages of data
    # Pagination types: none, body, params
    # Each page has an offset (starting value) and a limit (batch size, number of records)
    # Increase the "offset" by the "limit" for each batch.
    # Continue until the "offset" exceeds the total_records.
    offset = 0  # Starting offset value for each batch API call
    limit = endpoint_config.get('row_limit', 1000)  # Batch size; Number of records per API call
    total_records = 0
    batch_count = limit
    page = 1

    while limit == batch_count:
        if pagination == 'body':
            body = {
                'startRow': offset,
                'rowLimit': limit,
                **body_params  # adds in endpoint specific, sort, filter body params
            }
            params = static_params
        elif pagination == 'params':
            params = {
                'startRow': offset,
                'rowLimit': limit,
                **static_params  # adds in endpoint specific, sort, filter body params
            }
            body = body_params
        else:
            params = static_params
            body = body_params

        LOGGER.info('Stream: {}, Site: {}, Type: {} - Batch Sync start, Offset: {}'.format(
            stream_name, site, sub_type, offset))

        # Squash params to query-string params
        querystring = None
        if params.items():
            querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()])

        LOGGER.info('URL for Stream: {}, Site: {} ({}): {}/{}{}'.format(
            stream_name, site, api_method, BASE_URL, path,
            '?{}'.format(querystring) if querystring else ''))
        if body and not body == {}:
            LOGGER.info('body = {}'.format(body))

        # API request data, endpoint = stream_name passed to client for metrics logging
        data = {}
        fetch_state = "running"
        wait_time = 4
        while fetch_state != "success":
            try:
                if api_method == 'GET':
                    data = client.get(path=path, params=querystring, endpoint=stream_name)
                elif api_method == 'POST':
                    data = client.post(path=path, params=querystring, endpoint=stream_name,
                                       data=json.dumps(body))
                fetch_state = "success"
            except GoogleError as err:
                LOGGER.info('API quota exceeded, waiting... ' + str(wait_time) + ' seconds')
                time.sleep(wait_time)
                wait_time *= 2

        # time_extracted: datetime when the data was extracted from the API
        time_extracted = utils.now()
        if not data or data is None or data == {}:
            LOGGER.info('xxx NO DATA xxx')
            return 0  # No data results

        # Transform data with transform_json from transform.py
        transformed_data = []  # initialize the record list

        # Sites endpoint returns a single record dictionary (not a list)
        if stream_name == 'sites':
            data_list = []
            data_list.append(data)
            data_dict = {}
            data_dict[data_key] = data_list
            data = data_dict

        if data_key in data:
            transformed_data = transform_json(data, stream_name, data_key, site, sub_type,
                                              dimensions_list)[data_key]
        else:
            LOGGER.info('Number of raw data records: 0')

        if not transformed_data or transformed_data is None:
            LOGGER.info('xxx NO TRANSFORMED DATA xxx')
            return 0  # No data results

        for record in transformed_data:
            for key in id_fields:
                if not record.get(key):
                    primary_keys_only = {id_field: record.get(id_field) for id_field in id_fields}
                    raise ValueError('Missing key {} in record with primary keys {}'.format(
                        key, primary_keys_only))

        batch_count = len(transformed_data)

        # Process records and get the max_bookmark_value and record_count for the set of records
        max_bookmark_value = process_records(catalog=catalog,
                                             stream_name=stream_name,
                                             records=transformed_data,
                                             time_extracted=time_extracted,
                                             bookmark_field=bookmark_field,
                                             max_bookmark_value=max_bookmark_value,
                                             last_datetime=last_datetime)

        # to_rec: to record; ending record for the batch
        to_rec = offset + limit
        if to_rec > total_records:
            to_rec = total_records
        LOGGER.info('Stream: {}, Site: {}, Type: {}, Page: {}, Batch records: {} to {}'.format(
            stream_name, site, sub_type, page, offset, to_rec))

        # Pagination: increment the offset by the limit (batch-size)
        offset = offset + limit
        total_records = total_records + batch_count
        page = page + 1

    # Update the state with the max_bookmark_value for the stream, site, sub_type
    # Reference: https://developers.google.com/webmaster-tools/search-console-api-original/v3/searchanalytics/query
    # NOTE: Results are sorted by click count descending.
    #       If two rows have the same click count, they are sorted in an arbitrary way.
    #       Records are NOT sorted in DATE order.
    # THEREFORE: State is updated after ALL pages of data for stream, site, sub_type, date window
    if bookmark_field:
        write_bookmark(state, stream_name, site, sub_type, max_bookmark_value)

    # Return total_records across all batches
    return total_records
def write_record(entity_type: str, record: dict):
    time_extracted = utils.now()
    singer.write_record(entity_type, record, time_extracted=time_extracted)
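# Usage sketch (hypothetical caller, not part of the tap): write_record stamps each Singer RECORD
# message with its own extraction time, so a sync loop only has to pass the entity name and payload:
#
#     for company in fetch_companies():   # fetch_companies() is an assumed helper
#         write_record('hubspot_companies', company)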
def sync_statistics_for_day( config, state, stream, sdk_client, token, start, report_metrics, report_dimensions, ): # pylint: disable=too-many-locals """Sync and output Criteo Statistics endpoint for one day.""" mdata = metadata.to_map(stream.metadata) stats_query = { "report_type": stream.tap_stream_id, "dimensions": report_dimensions, "metrics": report_metrics, "start_date": start.strftime("%Y-%m-%d"), "end_date": start.strftime("%Y-%m-%d"), "currency": metadata.get(mdata, (), "tap-criteo.currency"), } # Filter advertiser_ids if defined in config advertiser_ids = config.get("advertiser_ids") if advertiser_ids: stats_query["advertiserId"] = advertiser_ids # Add ignore_x_device if defined in metadata ignore_x_device = metadata.get(mdata, (), "tap-criteo.ignoreXDevice") if ignore_x_device: stats_query["tap-criteo.ignoreXDevice"] = ignore_x_device # Fetch the report as a csv string with metrics.http_request_timer(stream.tap_stream_id): result = get_statistics_report(sdk_client, stats_query, token=token) csv_reader = parse_csv_string(mdata, result) with metrics.record_counter(stream.tap_stream_id) as counter: time_extracted = utils.now() with Transformer() as bumble_bee: for row in csv_reader: row["_sdc_report_datetime"] = REPORT_RUN_DATETIME row["_sdc_report_currency"] = metadata.get( mdata, (), "tap-criteo.currency") row = bumble_bee.transform(row, stream.schema.to_dict()) singer.write_record(stream.stream, row, time_extracted=time_extracted) counter.increment() if start > get_start_for_stream(config, state, advertiser_ids, stream.stream): LOGGER.info( "updating bookmark: %s > %s", start, get_start_for_stream(config, state, advertiser_ids, stream.stream), ) bookmarks.write_bookmark( state, state_key_name(advertiser_ids, stream.stream), "date", utils.strftime(start), ) singer.write_state(state) else: LOGGER.info( "not updating bookmark: %s <= %s", start, get_start_for_stream(config, state, advertiser_ids, stream.stream), ) LOGGER.info( "Done syncing %s records for the %s report for " + "advertiser_ids %s on %s", counter.value, stream.stream, advertiser_ids, start, )
def sync_generic_endpoint(config, state, stream, sdk_client, token): """Sync a stream which is backed by a generic Criteo endpoint.""" stream = add_synthetic_keys_to_stream_schema(stream) stream = add_synthetic_keys_to_stream_metadata(stream) mdata = metadata.to_map(stream.metadata) primary_keys = metadata.get(mdata, (), "table-key-properties") or [] LOGGER.info("{} primary keys are {}".format(stream.stream, primary_keys)) singer.write_schema(stream.stream, stream.schema.to_dict(), primary_keys) advertiser_ids = config.get("advertiser_ids", None) if stream.tap_stream_id == "Audiences": if not advertiser_ids: LOGGER.warn( "%s stream needs at least one advertiser_id defined in config" % stream.stream) for advertiser_id in advertiser_ids.split(","): token = refresh_auth_token(sdk_client, token) with metrics.http_request_timer(stream.tap_stream_id): result = get_audiences_endpoint(sdk_client, advertiser_id, token=token) else: module = GENERIC_ENDPOINT_MAPPINGS[stream.tap_stream_id]["module"] method = GENERIC_ENDPOINT_MAPPINGS[stream.tap_stream_id]["method"] if stream.tap_stream_id in ( "Portfolio", "AdvertiserInfo", "Sellers", "SellerBudgets", "SellerCampaigns", ): result = call_generic_endpoint(stream, sdk_client, module, method, token=token) else: result = call_generic_endpoint( stream, sdk_client, module, method, advertiser_ids=advertiser_ids, token=token, ) result = convert_keys_snake_to_camel([_.to_dict() for _ in result]) with metrics.record_counter(stream.tap_stream_id) as counter: time_extracted = utils.now() with Transformer() as bumble_bee: for row in result: row["_sdc_report_datetime"] = REPORT_RUN_DATETIME row = bumble_bee.transform(row, stream.schema.to_dict()) singer.write_record(stream.stream, row, time_extracted=time_extracted) counter.increment() LOGGER.info( "Done syncing %s records for the %s report for advertiser_ids %s", counter.value, stream.stream, advertiser_ids, )
def _query_recur(self, query, catalog_entry, start_date_str, end_date=None, retries=MAX_RETRIES): params = {"q": query} url = "{}/services/data/v41.0/queryAll".format(self.sf.instance_url) headers = self.sf._get_standard_headers() sync_start = singer_utils.now() if end_date is None: end_date = sync_start if retries == 0: raise TapSalesforceException( "Ran out of retries attempting to query Salesforce Object {}". format(catalog_entry['stream'])) retryable = False try: for rec in self._sync_records(url, headers, params): yield rec # If the date range was chunked (an end_date was passed), sync # from the end_date -> now if end_date < sync_start: next_start_date_str = singer_utils.strftime(end_date) query = self.sf._build_query_string(catalog_entry, next_start_date_str) for record in self._query_recur(query, catalog_entry, next_start_date_str, retries=retries): yield record except HTTPError as ex: response = ex.response.json() if isinstance( response, list) and response[0].get("errorCode") == "QUERY_TIMEOUT": start_date = singer_utils.strptime_with_tz(start_date_str) day_range = (end_date - start_date).days LOGGER.info( "Salesforce returned QUERY_TIMEOUT querying %d days of %s", day_range, catalog_entry['stream']) retryable = True else: raise ex if retryable: start_date = singer_utils.strptime_with_tz(start_date_str) half_day_range = (end_date - start_date) // 2 end_date = end_date - half_day_range if half_day_range.days == 0: raise TapSalesforceException( "Attempting to query by 0 day range, this would cause infinite looping." ) query = self.sf._build_query_string( catalog_entry, singer_utils.strftime(start_date), singer_utils.strftime(end_date)) for record in self._query_recur(query, catalog_entry, start_date_str, end_date, retries - 1): yield record
create_sdk_client, get_audiences_endpoint, get_generic_endpoint, get_statistics_report, refresh_auth_token, ) from tap_criteo.endpoints import ( GENERIC_ENDPOINT_MAPPINGS, SELLER_STATS_REPORT_TYPES, STATISTICS_REPORT_TYPES, ) CSV_DELIMITER = ";" LOGGER = singer.get_logger() REPORT_RUN_DATETIME = utils.strftime(utils.now()) def get_attribution_window_bookmark(state, advertiser_ids, stream_name): """Get attribution window for stream from Singer State.""" mid_bk_value = bookmarks.get_bookmark( state, state_key_name(advertiser_ids, stream_name), "last_attribution_window_date", ) return utils.strptime_with_tz(mid_bk_value) if mid_bk_value else None def get_start_for_stream(config, state, advertiser_ids, stream_name): """Get start date for stream sync.""" bk_value = bookmarks.get_bookmark(
def _run_binlog_sync(mysql_conn: MySQLConnection,
                     reader: BinLogStreamReader,
                     binlog_streams_map: Dict,
                     state: Dict,
                     config: Dict,
                     end_log_file: str,
                     end_log_pos: int):
    processed_rows_events = 0
    events_skipped = 0

    log_file = None
    log_pos = None
    gtid_pos = reader.auto_position  # initial gtid, we set this when we created the reader's instance

    # A set to hold all columns that are detected as we sync but should be ignored because they are
    # unsupported types. Saving them here to avoid doing the check if we should ignore a column over
    # and over again
    ignored_columns = set()

    # Exit from the loop when the reader either runs out of streams to return or we reach
    # the end position (which is Master's)
    for binlog_event in reader:
        # get reader current binlog file and position
        log_file = reader.log_file
        log_pos = reader.log_pos

        # The iterator across python-mysql-replication's fetchone method should ultimately terminate
        # upon receiving an EOF packet. There seem to be some cases when a MySQL server will not send
        # one causing binlog replication to hang.
        if (log_file > end_log_file) or (end_log_file == log_file and log_pos >= end_log_pos):
            LOGGER.info('BinLog reader (file: %s, pos:%s) has reached or exceeded end position, exiting!',
                        log_file, log_pos)

            # There are cases when a mass operation (inserts, updates, deletes) starts right after we get
            # the Master binlog file and position above, making the latter behind the stream reader and it
            # causes some data loss in the next run by skipping everything between end_log_file and log_pos
            # so we need to update log_pos back to master's position
            log_file = end_log_file
            log_pos = end_log_pos

            break

        if isinstance(binlog_event, RotateEvent):
            LOGGER.debug('RotateEvent: log_file=%s, log_pos=%d',
                         binlog_event.next_binlog, binlog_event.position)

            state = update_bookmarks(state, binlog_streams_map,
                                     binlog_event.next_binlog, binlog_event.position, gtid_pos)

        elif isinstance(binlog_event, MariadbGtidEvent) or isinstance(binlog_event, GtidEvent):
            gtid_pos = binlog_event.gtid

            LOGGER.debug('%s: gtid=%s', binlog_event.__class__.__name__, gtid_pos)

            state = update_bookmarks(state, binlog_streams_map, log_file, log_pos, gtid_pos)

            # There is strange behavior happening when using GTID in the pymysqlreplication lib,
            # explained here: https://github.com/noplay/python-mysql-replication/issues/367
            # Fix: Updating the reader's auto-position to the newly encountered gtid means we won't
            # have to restart consuming binlog from old GTID pos when connection to server is lost.
            reader.auto_position = gtid_pos

        else:
            time_extracted = utils.now()

            tap_stream_id = common.generate_tap_stream_id(binlog_event.schema, binlog_event.table)
            streams_map_entry = binlog_streams_map.get(tap_stream_id, {})
            catalog_entry = streams_map_entry.get('catalog_entry')
            columns = streams_map_entry.get('desired_columns')

            if not catalog_entry:
                events_skipped += 1

                if events_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.debug("Skipped %s events so far as they were not for selected tables; %s rows extracted",
                                 events_skipped, processed_rows_events)
            else:
                # Compare event's columns to the schema properties
                diff = __get_diff_in_columns_list(binlog_event,
                                                  catalog_entry.schema.properties.keys(),
                                                  ignored_columns)

                # If there are additional cols in the event then run discovery if needed and update the catalog
                if diff:
                    LOGGER.info('Stream `%s`: Difference detected between event and schema: %s',
                                tap_stream_id, diff)

                    md_map = metadata.to_map(catalog_entry.metadata)

                    if not should_run_discovery(diff, md_map):
                        LOGGER.info('Stream `%s`: Not running discovery. Ignoring all detected columns in %s',
                                    tap_stream_id, diff)
                        ignored_columns = ignored_columns.union(diff)
                    else:
                        LOGGER.info('Stream `%s`: Running discovery ... ', tap_stream_id)

                        # run discovery for the current table only
                        new_catalog_entry = discover_catalog(mysql_conn,
                                                             config.get('filter_dbs'),
                                                             catalog_entry.table).streams[0]

                        selected = {k for k, v in new_catalog_entry.schema.properties.items()
                                    if common.property_is_selected(new_catalog_entry, k)}

                        # the new catalog has "stream" property = table name, we need to update that
                        # to make it the same as the result of the "resolve_catalog" function
                        new_catalog_entry.stream = tap_stream_id

                        # These are the columns we need to select
                        new_columns = desired_columns(selected, new_catalog_entry.schema)

                        cols = set(new_catalog_entry.schema.properties.keys())

                        # drop unsupported properties from schema
                        for col in cols:
                            if col not in new_columns:
                                new_catalog_entry.schema.properties.pop(col, None)

                        # Add the _sdc_deleted_at col
                        new_columns = add_automatic_properties(new_catalog_entry, list(new_columns))

                        # send the new schema to target if we have a new schema
                        if new_catalog_entry.schema.properties != catalog_entry.schema.properties:
                            write_schema_message(catalog_entry=new_catalog_entry)
                            catalog_entry = new_catalog_entry

                            # update this dictionary while we're at it
                            binlog_streams_map[tap_stream_id]['catalog_entry'] = new_catalog_entry
                            binlog_streams_map[tap_stream_id]['desired_columns'] = new_columns
                            columns = new_columns

                if isinstance(binlog_event, WriteRowsEvent):
                    processed_rows_events = handle_write_rows_event(binlog_event, catalog_entry, state,
                                                                    columns, processed_rows_events,
                                                                    time_extracted)
                elif isinstance(binlog_event, UpdateRowsEvent):
                    processed_rows_events = handle_update_rows_event(binlog_event, catalog_entry, state,
                                                                     columns, processed_rows_events,
                                                                     time_extracted)
                elif isinstance(binlog_event, DeleteRowsEvent):
                    processed_rows_events = handle_delete_rows_event(binlog_event, catalog_entry, state,
                                                                     columns, processed_rows_events,
                                                                     time_extracted)
                else:
                    LOGGER.debug("Skipping event for table %s.%s as it is not an INSERT, UPDATE, or DELETE",
                                 binlog_event.schema, binlog_event.table)

        # Update singer bookmark and send STATE message periodically
        if ((processed_rows_events and processed_rows_events % UPDATE_BOOKMARK_PERIOD == 0) or
                (events_skipped and events_skipped % UPDATE_BOOKMARK_PERIOD == 0)):
            state = update_bookmarks(state, binlog_streams_map, log_file, log_pos, gtid_pos)
            singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    LOGGER.info('Processed %s rows', processed_rows_events)

    # Update singer bookmark at the last time to point to the last processed binlog event
    if log_file and log_pos:
        state = update_bookmarks(state, binlog_streams_map, log_file, log_pos, gtid_pos)
def sync_query(cursor, catalog_entry, state, select_sql, columns, stream_version, params): replication_key = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'replication_key') query_string = cursor.mogrify(select_sql, params) time_extracted = utils.now() LOGGER.info('Running %s', query_string) cursor.execute(select_sql, params) row = cursor.fetchone() rows_saved = 0 database_name = get_database_name(catalog_entry) with metrics.record_counter(None) as counter: counter.tags['database'] = database_name counter.tags['table'] = catalog_entry.table while row: counter.increment() rows_saved += 1 record_message = row_to_singer_record(catalog_entry, stream_version, row, columns, time_extracted) singer.write_message(record_message) md_map = metadata.to_map(catalog_entry.metadata) stream_metadata = md_map.get((), {}) replication_method = stream_metadata.get('replication-method') if replication_method in {'FULL_TABLE', 'LOG_BASED'}: key_properties = get_key_properties(catalog_entry) max_pk_values = singer.get_bookmark( state, catalog_entry.tap_stream_id, 'max_pk_values') if max_pk_values: last_pk_fetched = { k: v for k, v in record_message.record.items() if k in key_properties } state = singer.write_bookmark(state, catalog_entry.tap_stream_id, 'last_pk_fetched', last_pk_fetched) elif replication_method == 'INCREMENTAL': if replication_key is not None: state = singer.write_bookmark(state, catalog_entry.tap_stream_id, 'replication_key', replication_key) state = singer.write_bookmark( state, catalog_entry.tap_stream_id, 'replication_key_value', record_message.record[replication_key]) if rows_saved % 1000 == 0: singer.write_message( singer.StateMessage(value=copy.deepcopy(state))) row = cursor.fetchone() singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
def sync_milestones(entity, element="project"):
    url = get_url(element + "_milestones", entity['id'])

    with Transformer(pre_hook=format_timestamp) as transformer:
        for row in gen_request(url):
            transformed_row = transformer.transform(row, RESOURCES[element + "_milestones"]["schema"])
            if row["updated_at"] >= get_start(element + "_{}".format(entity["id"])):
                singer.write_record(element + "_milestones", transformed_row, time_extracted=utils.now())
def sync_contact_lists(STATE, ctx): catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) mdata = metadata.to_map(catalog.get('metadata')) schema = load_schema("contact_lists") bookmark_key = 'updatedAt' singer.write_schema("contact_lists", schema, ["listId"], [bookmark_key], catalog.get('stream_alias')) start = get_start(STATE, "contact_lists", bookmark_key) max_bk_value = start LOGGER.info("sync_contact_lists from %s", start) url = get_url("contact_lists") params = {'count': 250} with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee: for row in gen_request(STATE, 'contact_lists', url, params, "lists", "has-more", ["offset"], ["offset"]): record = bumble_bee.transform(lift_properties_and_versions(row), schema, mdata) if record[bookmark_key] >= start: singer.write_record("contact_lists", record, catalog.get('stream_alias'), time_extracted=utils.now()) if record[bookmark_key] >= max_bk_value: max_bk_value = record[bookmark_key] STATE = singer.write_bookmark(STATE, 'contact_lists', bookmark_key, max_bk_value) singer.write_state(STATE) return STATE
def get_end_date(config):
    """Get end date from config file."""
    if config.get("end_date"):
        return utils.strptime_with_tz(config.get("end_date"))
    return utils.now()
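# Illustrative behavior of get_end_date (values are hypothetical): an explicit "end_date" in the
# config is parsed with its timezone, otherwise the sync window runs up to the current UTC time.
#
#     get_end_date({"end_date": "2021-06-30T00:00:00Z"})    # -> 2021-06-30 00:00:00+00:00
#     get_end_date({"start_date": "2021-01-01T00:00:00Z"})  # -> utils.now(), i.e. "now" in UTC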
def sync_deal_pipelines(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    schema = load_schema('deal_pipelines')
    singer.write_schema('deal_pipelines', schema, ['pipelineId'], catalog.get('stream_alias'))
    LOGGER.info('sync_deal_pipelines')
    data = request(get_url('deal_pipelines')).json()
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in data:
            record = bumble_bee.transform(lift_properties_and_versions(row), schema, mdata)
            singer.write_record("deal_pipelines", record, catalog.get('stream_alias'),
                                time_extracted=utils.now())
    singer.write_state(STATE)
    return STATE
def sync_records(qb, catalog_entry, state, counter, state_passed): chunked_bookmark = singer_utils.strptime_with_tz(qb.get_start_date(state, catalog_entry)) stream = catalog_entry['stream'] schema = catalog_entry['schema'] stream_alias = catalog_entry.get('stream_alias') catalog_metadata = metadata.to_map(catalog_entry['metadata']) replication_key = catalog_metadata.get((), {}).get('replication-key') stream_version = get_stream_version(catalog_entry, state) activate_version_message = singer.ActivateVersionMessage(stream=(stream_alias or stream), version=stream_version) start_time = singer_utils.now() LOGGER.info('Syncing Quickbooks data for stream %s', stream) previous_max_replication_key = None; query_func = qb.query if stream.endswith("Report"): query_func = qb.query_report for rec in query_func(catalog_entry, state, state_passed): counter.increment() with Transformer(pre_hook=transform_data_hook) as transformer: rec = transformer.transform(rec, schema) singer.write_message( singer.RecordMessage( stream=( stream_alias or stream), record=rec, version=stream_version, time_extracted=start_time)) if replication_key: jsonpath_expression = parse(f"$.{replication_key}") _rec = {'MetaData': json.loads(rec.get('MetaData', {}))} match = jsonpath_expression.find(_rec) original_replication_key_value = "" if replication_key and len(match) > 0: original_replication_key_value = match[0].value replication_key_value = singer_utils.strptime_with_tz(original_replication_key_value) # Before writing a bookmark, make sure Quickbooks has not given us a # record with one outside our range if previous_max_replication_key is None or ( replication_key_value and replication_key_value <= start_time and replication_key_value > previous_max_replication_key ): state = singer.write_bookmark( state, catalog_entry['tap_stream_id'], replication_key, original_replication_key_value) previous_max_replication_key = replication_key_value # Tables with no replication_key will send an # activate_version message for the next sync if not replication_key: singer.write_message(activate_version_message) state = singer.write_bookmark( state, catalog_entry['tap_stream_id'], 'version', None)
def sync(client, config, catalog, state):
    start_date = config.get('start_date')

    # Get selected_streams from catalog, based on state last_stream
    # last_stream = Previous currently synced stream, if the load was interrupted
    last_stream = singer.get_currently_syncing(state)
    LOGGER.info('last/currently syncing stream: {}'.format(last_stream))
    selected_streams = []
    for stream in catalog.get_selected_streams(state):
        selected_streams.append(stream.stream)
    LOGGER.info('selected_streams: {}'.format(selected_streams))

    if not selected_streams or selected_streams == []:
        return

    # Get current datetime (now_dt_str) for query parameters
    now_dttm = utils.now()
    now_dt_str = strftime(now_dttm)[0:10]
    # Reference: https://support.google.com/webmasters/answer/96568?hl=en
    # There is some delay/lag in Google Search Console results reconciliation
    attribution_start_dttm = now_dttm - timedelta(days=ATTRIBUTION_DAYS)

    # Loop through selected_streams
    for stream_name in selected_streams:
        LOGGER.info('STARTED Syncing: {}'.format(stream_name))
        update_currently_syncing(state, stream_name)
        write_schema(catalog, stream_name)
        endpoint_config = STREAMS[stream_name]
        bookmark_field = next(iter(endpoint_config.get('replication_keys', [])), None)
        body_params = endpoint_config.get('body', {})
        endpoint_total = 0
        # Initialize body
        body = endpoint_config.get('body', {})

        # Loop through sites from config site_urls
        site_list = []
        if 'site_urls' in config:
            site_list = config['site_urls'].replace(" ", "").split(",")
        for site in site_list:
            # Skip/ignore sitemaps for domain property sites
            # Reference issue: https://github.com/googleapis/google-api-php-client/issues/1607
            #   "...sitemaps API does not support domain property urls at this time."
            if stream_name == 'sitemaps' and site[0:9] == 'sc-domain':
                LOGGER.info('Skipping Site: {}'.format(site))
                LOGGER.info('  Sitemaps API does not support domain property urls at this time.')
            else:  # Not sitemaps and sites = sc-domain
                LOGGER.info('STARTED Syncing: {}, Site: {}'.format(stream_name, site))
                site_total = 0
                site_encoded = quote(site, safe='')
                path = endpoint_config.get('path').format(site_encoded)

                # Set dimensions_list for performance_reports
                if stream_name == 'performance_report_custom':
                    dimensions_list = []
                    # Create dimensions_list from catalog breadcrumb
                    stream = catalog.get_stream(stream_name)
                    mdata = metadata.to_map(stream.metadata)
                    dimensions_all = ['date', 'country', 'device', 'page', 'query']
                    for dim in dimensions_all:
                        if singer.should_sync_field(
                                singer.metadata.get(mdata, ('properties', dim), 'inclusion'),
                                singer.metadata.get(mdata, ('properties', dim), 'selected')):
                            # metadata is selected for the dimension
                            dimensions_list.append(dim)
                    body_params['dimensions'] = dimensions_list
                dimensions_list = body_params.get('dimensions')
                LOGGER.info('stream: {}, dimensions_list: {}'.format(stream_name, dimensions_list))

                # loop through each sub type
                sub_types = endpoint_config.get('sub_types', ['self'])
                for sub_type in sub_types:
                    sub_type_total = 0

                    # Initialize date window
                    if stream_name.startswith('performance_report'):
                        reports_dttm_str = get_bookmark(state, stream_name, site, sub_type, start_date)
                        reports_dttm = strptime_to_utc(reports_dttm_str)
                        if reports_dttm < attribution_start_dttm:
                            start_dttm = reports_dttm
                        else:
                            start_dttm = attribution_start_dttm
                        end_dttm = start_dttm + timedelta(days=DATE_WINDOW_SIZE)
                        if end_dttm > now_dttm:
                            end_dttm = now_dttm
                    else:
                        start_dttm = strptime_to_utc(start_date)
                        end_dttm = now_dttm

                    # Date window loop
                    while start_dttm < now_dttm:
                        start_str = strftime(start_dttm)[0:10]
                        end_str = strftime(end_dttm)[0:10]
                        if stream_name.startswith('performance_report'):
                            body = {
                                'searchType': sub_type,
                                'startDate': start_str,
                                'endDate': end_str,
                                **body_params
                            }
                        else:
                            body = None

                        LOGGER.info('START Syncing Stream: {}, Site: {}, Type: {}, {} to {}'.format(
                            stream_name, site, sub_type, start_str, end_str))
                        total_records = sync_endpoint(
                            client=client,
                            catalog=catalog,
                            state=state,
                            start_date=start_date,
                            stream_name=stream_name,
                            site=site,
                            sub_type=sub_type,
                            dimensions_list=dimensions_list,
                            path=path,
                            endpoint_config=endpoint_config,
                            api_method=endpoint_config.get('api_method', 'GET'),
                            pagination=endpoint_config.get('pagination', 'none'),
                            static_params=endpoint_config.get('params', {}),
                            bookmark_field=bookmark_field,
                            data_key=endpoint_config.get('data_key', None),
                            body_params=body,
                            id_fields=endpoint_config.get('key_properties'))

                        # Increment totals
                        endpoint_total = endpoint_total + total_records
                        site_total = site_total + total_records
                        sub_type_total = sub_type_total + total_records

                        LOGGER.info('FINISHED Syncing Stream: {}, Site: {}, Type: {}, {} to {}'.format(
                            stream_name, site, sub_type, start_str, end_str))
                        LOGGER.info('  Records Synced for Date Window: {}'.format(total_records))

                        # Set next date window
                        start_dttm = end_dttm
                        end_dttm = start_dttm + timedelta(days=DATE_WINDOW_SIZE)
                        if end_dttm > now_dttm:
                            end_dttm = now_dttm
                    # End date window loop

                    LOGGER.info('FINISHED Syncing Stream: {}, Site: {}, Type: {}'.format(
                        stream_name, site, sub_type))
                    LOGGER.info('  Records Synced for Type: {}'.format(sub_type_total))
                # End sub-type loop
            # End else: Not sitemaps and sites = sc-domain

            LOGGER.info('FINISHED Syncing Stream: {}, Site: {}'.format(stream_name, site))
            LOGGER.info('  Records Synced for Site: {}'.format(site_total))
        # End site loop

        LOGGER.info('FINISHED Syncing Stream: {}'.format(stream_name))
        LOGGER.info('  Records Synced for Stream: {}'.format(endpoint_total))
        update_currently_syncing(state, None)
def batch_record_success(response, stream=None, transformer=None, schema=None):
    '''A success callback for the FB Batch endpoint used when syncing AdCreatives. Needs the stream
    to resolve schema refs and transform the successful response object.'''
    rec = response.json()
    record = transformer.transform(rec, schema)
    singer.write_record(stream.name, record, stream.stream_alias, utils.now())
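# Usage sketch (assumption, not the tap's verbatim wiring): the batch API hands only the response to
# the success callback, so the stream/transformer/schema keyword arguments above are typically
# pre-bound with functools.partial before the callback is registered on a facebook_business
# FacebookAdsApiBatch, roughly:
#
#     from functools import partial
#     on_success = partial(batch_record_success, stream=stream, transformer=transformer, schema=schema)
#     api_batch.add_request(request, success=on_success, failure=on_failure)
#     api_batch.execute()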
def sync_companies(state: State): bookmark_key = 'hs_lastmodifieddate' start = utils.strptime_to_utc(get_start(state, "companies", bookmark_key)) logger.info("sync_companies from %s", start) schema = load_schema('companies') singer.write_schema("hubspot_companies", schema, ["companyId"], [bookmark_key]) # Because this stream doesn't query by `lastUpdated`, it cycles # through the data set every time. The issue with this is that there # is a race condition by which records may be updated between the # start of this table's sync and the end, causing some updates to not # be captured, in order to combat this, we must store the current # sync's start in the state and not move the bookmark past this value. current_sync_start = get_current_sync_start(state, "companies") or utils.now() state = write_current_sync_start(state, "companies", current_sync_start) singer.write_state(state) url = get_url("companies_all") max_bk_value = start contacts_by_company_schema = load_schema(CONTACTS_BY_COMPANY) singer.write_schema("hubspot_contacts_by_company", contacts_by_company_schema, ["company-id", "contact-id"]) for row in gen_request(state, 'companies', url, default_company_params, 'companies', 'has-more', ['offset'], ['offset']): row_properties = row['properties'] modified_time = None if bookmark_key in row_properties: # Hubspot returns timestamps in millis timestamp_millis = row_properties[bookmark_key][ 'timestamp'] / 1000.0 modified_time = datetime.datetime.fromtimestamp( timestamp_millis, datetime.timezone.utc) elif 'createdate' in row_properties: # Hubspot returns timestamps in millis timestamp_millis = row_properties['createdate'][ 'timestamp'] / 1000.0 modified_time = datetime.datetime.fromtimestamp( timestamp_millis, datetime.timezone.utc) if modified_time and modified_time >= max_bk_value: max_bk_value = modified_time if not modified_time or modified_time >= start: record = request( get_url("companies_detail", company_id=row['companyId'])).json() record = build_record(record, schema) write_record('hubspot_companies', record) state = _sync_contacts_by_company(state, record['companyId']) # Don't bookmark past the start of this sync to account for updated records during the sync. new_bookmark = min(max_bk_value, current_sync_start) state = singer.write_bookmark(state, 'hubspot_companies', bookmark_key, utils.strftime(new_bookmark)) state = write_current_sync_start(state, 'companies', None) singer.write_state(state) return state
def sync_endpoint(client,
                  config,
                  catalog,
                  state,
                  stream_name,
                  endpoint_config,
                  sync_streams,
                  selected_streams,
                  timezone_desc=None,
                  parent_id=None):

    # endpoint_config variables
    base_path = endpoint_config.get('path', stream_name)
    bookmark_field = next(iter(endpoint_config.get('replication_keys', [])), None)
    params = endpoint_config.get('params', {})
    paging = endpoint_config.get('paging', False)
    bookmark_query_field_from = endpoint_config.get('bookmark_query_field_from')
    bookmark_query_field_to = endpoint_config.get('bookmark_query_field_to')
    targeting_group = endpoint_config.get('targeting_group')
    targeting_type = endpoint_config.get('targeting_type')
    targeting_country_ind = endpoint_config.get('targeting_country_ind', False)
    data_key_array = endpoint_config.get('data_key_array')
    data_key_record = endpoint_config.get('data_key_record').format(targeting_type=targeting_type)
    id_fields = endpoint_config.get('key_properties')
    parent = endpoint_config.get('parent')
    date_window_size = int(endpoint_config.get('date_window_size', '1'))

    # tap config variables
    start_date = config.get('start_date')
    swipe_up_attribution_window = config.get('swipe_up_attribution_window', '28_DAY')
    view_attribution_window = config.get('view_attribution_window', '7_DAY')

    swipe_up_attr = int(swipe_up_attribution_window.replace('_DAY', ''))

    if view_attribution_window in ('1_HOUR', '3_HOUR', '6_HOUR'):
        view_attr = 1
    else:
        view_attr = int(view_attribution_window.replace('_DAY', ''))

    attribution_window = max(1, swipe_up_attr, view_attr)

    omit_empty = config.get('omit_empty', 'true')
    if '_stats_' in stream_name:
        params['omit_empty'] = omit_empty

    country_codes = config.get('targeting_country_codes', 'us').replace(' ', '').lower()
    if targeting_country_ind:
        country_code_list = country_codes.split(',')
    else:
        country_code_list = ['none']

    # Get the timezone and latest bookmark for the stream
    if not timezone_desc:
        timezone = pytz.timezone('UTC')
    else:
        timezone = pytz.timezone(timezone_desc)
    LOGGER.info('timezone = {}'.format(timezone))

    last_datetime = get_bookmark(state, stream_name, start_date, bookmark_field, parent, parent_id)
    max_bookmark_value = last_datetime

    # Convert to datetimes in local/ad account timezone
    now_datetime = utils.now()
    last_dttm = strptime_to_utc(last_datetime)

    report_granularity = params.get('granularity', 'HOUR')
    if '_stats_' in stream_name:
        LOGGER.info('report_granularity: {}'.format(report_granularity))

    if bookmark_query_field_from and bookmark_query_field_to:
        # date_window_size: Number of days in each date window
        # Set start window
        start_window = now_datetime - timedelta(days=attribution_window)
        if last_dttm < start_window:
            start_window = last_dttm
        # Set end window
        end_window = start_window + timedelta(days=date_window_size)
    else:
        start_window = last_dttm
        end_window = now_datetime
        diff_sec = (end_window - start_window).seconds
        date_window_size = math.ceil(diff_sec / (3600 * 24))  # round-up difference to days

    endpoint_total = 0
    total_records = 0

    while start_window < now_datetime:
        LOGGER.info('START Sync for Stream: {}{}'.format(
            stream_name,
            ', Date window from: {} to {}'.format(start_window.date(), end_window.date())
            if bookmark_query_field_from else ''))

        if bookmark_query_field_from and bookmark_query_field_to:
            # Query parameter startDate and endDate must be in Eastern time zone
            # API will error if future dates are requested
            if report_granularity == 'DAY':
                window_start_dt_str = remove_hours_local(start_window, timezone)
                window_end_dt_str = remove_hours_local(end_window, timezone)
                if window_start_dt_str == window_end_dt_str:
                    window_end_dt_str = remove_hours_local(end_window + timedelta(days=1), timezone)
            else:
                window_start_dt_str = remove_minutes_local(start_window, timezone)
                window_end_dt_str = remove_minutes_local(end_window, timezone)
                if window_start_dt_str == window_end_dt_str:
                    window_end_dt_str = remove_hours_local(end_window + timedelta(hours=1), timezone)

            params[bookmark_query_field_from] = window_start_dt_str
            params[bookmark_query_field_to] = window_end_dt_str

        # This loop will run once for non-country_code endpoints
        #   and one or more times (for each country) for country_code endpoints
        for country_code in country_code_list:

            # Path
            if stream_name.startswith('targeting_'):
                path = base_path.format(targeting_group=targeting_group,
                                        targeting_type=targeting_type,
                                        country_code=country_code,
                                        parent_id=parent_id)
            else:
                path = base_path.format(country_code=country_code, parent_id=parent_id)

            # pagination: loop thru all pages of data using next (if not None)
            # Reference: https://developers.snapchat.com/api/docs/#pagination
            total_records = 0
            offset = 1
            page = 1
            if paging:
                limit = 500  # Allowed values: 50 - 1000
                params['limit'] = limit
            else:
                limit = None

            for key, val in params.items():
                # Replace variables in params
                new_val = str(val).format(swipe_up_attribution_window=swipe_up_attribution_window,
                                          view_attribution_window=view_attribution_window)
                params[key] = new_val

            # concatenate params into querystring
            querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()])

            # initialize next_url
            next_url = '{}/{}?{}'.format(client.base_url, path, querystring)

            # pagination loop
            while next_url is not None:

                # API request data
                data = {}
                try:
                    data = client.get(url=next_url, endpoint=stream_name)
                except Exception as err:
                    LOGGER.error('{}'.format(err))
                    LOGGER.error('URL for Stream {}: {}'.format(stream_name, next_url))
                    raise Exception(err)

                # time_extracted: datetime when the data was extracted from the API
                time_extracted = utils.now()
                if not data or data is None or data == {}:
                    LOGGER.info('No data results returned')
                    total_records = 0
                    break  # No data results

                request_status = data.get('request_status')
                if request_status != 'SUCCESS':
                    raise RuntimeError(data)

                # Get pagination next_url
                next_url = data.get('paging', {}).get('next_link', None)

                # Transform data with transform_json from transform.py
                # The data_key_array identifies the array/list of records below the <root> element
                # LOGGER.info('data = {}'.format(data))  # TESTING, comment out
                transformed_data = []  # initialize the record list

                # Reports stats streams de-nesting
                if '_stats_' in stream_name:
                    for data_record in data.get(data_key_array, []):
                        base_record = data_record.get(data_key_record, {})
                        records = base_record.get('timeseries', [])
                        for record in records:
                            # Add parent base_record fields to record
                            for key, val in base_record.items():
                                if key not in ('start_time', 'end_time', 'timeseries'):
                                    record[key] = val

                            # De-nest stats
                            stats = record.get('stats', {})
                            for key, val in stats.items():
                                record[key] = val
                            record.pop('stats', None)

                            # transform record
                            try:
                                transformed_record = humps.decamelize(record)
                            except Exception as err:
                                LOGGER.error('{}'.format(err))
                                # LOGGER.error('error record: {}'.format(record))  # COMMENT OUT
                                raise Exception(err)

                            # verify primary_keys are in transformed_record
                            if 'id' not in transformed_record or 'start_time' not in transformed_record:
                                LOGGER.error('Stream: {}, Missing key (id or start_time)'.format(stream_name))
                                LOGGER.error('transformed_record: {}'.format(transformed_record))
                                raise RuntimeError

                            transformed_data.append(transformed_record)
                        # End for record in records
                    # End for data_record in array
                # End stats stream

                # Other streams de-nesting
                else:  # Not stats stream
                    for data_record in data.get(data_key_array, []):
                        sub_request_status = data_record.get('sub_request_status')
                        if sub_request_status != 'SUCCESS':
                            raise RuntimeError(data_record)

                        record = data_record.get(data_key_record, {})

                        # Transforms to align schemas for targeting streams
                        if stream_name.startswith('targeting_'):
                            record['targeting_group'] = targeting_group
                            record['targeting_type'] = targeting_type
                            if country_code != 'none':
                                record['country_code'] = country_code
                            if targeting_group == 'geo':
                                record_id = record.get(targeting_type, {}).get('id')
                                record_name = record.get(targeting_type, {}).get('name')
                                record['id'] = record_id
                                record['name'] = record_name
                            if targeting_type == 'postal_code':
                                record_id = record.get('postalCode')
                                record['id'] = record_id
                                record['name'] = record_id
                                record.pop('postalCode')

                        # Add parent id field/value
                        if parent and parent_id:
                            parent_key = '{}_id'.format(parent)
                            record[parent_key] = parent_id

                        # transform record (remove inconsistent use of CamelCase)
                        try:
                            transformed_record = humps.decamelize(record)
                        except Exception as err:
                            LOGGER.error('{}'.format(err))
                            LOGGER.error('error record: {}'.format(record))
                            raise Exception(err)

                        # verify primary_keys are in transformed_record
                        for key in id_fields:
                            if not transformed_record.get(key):
                                LOGGER.error('Stream: {}, Missing key {}'.format(stream_name, key))
                                LOGGER.info('transformed_record: {}'.format(transformed_record))
                                raise RuntimeError

                        transformed_data.append(transformed_record)
                    # End for data_record in array
                # End non-stats stream

                # LOGGER.info('transformed_data = {}'.format(transformed_data))  # COMMENT OUT
                if not transformed_data or transformed_data is None:
                    LOGGER.info('No transformed data for data = {}'.format(data))
                    total_records = 0
                    break  # No transformed_data results

                # Process records and get the max_bookmark_value and record_count
                if stream_name in sync_streams:
                    max_bookmark_value, record_count = process_records(
                        catalog=catalog,
                        stream_name=stream_name,
                        records=transformed_data,
                        time_extracted=time_extracted,
                        bookmark_field=bookmark_field,
                        max_bookmark_value=max_bookmark_value,
                        last_datetime=last_datetime)
                    LOGGER.info('Stream {}, batch processed {} records'.format(stream_name, record_count))

                # Loop thru parent batch records for each child object (if it should be streamed)
                children = endpoint_config.get('children')
                if children:
                    for child_stream_name, child_endpoint_config in children.items():
                        if child_stream_name in sync_streams:
                            LOGGER.info('START Syncing: {}'.format(child_stream_name))
                            write_schema(catalog, child_stream_name)
                            # For each parent record
                            for record in transformed_data:
                                i = 0
                                # Set parent_id
                                for id_field in id_fields:
                                    if i == 0:
                                        parent_id_field = id_field
                                    if id_field == 'id':
                                        parent_id_field = id_field
                                    i = i + 1
                                parent_id = record.get(parent_id_field)

                                if stream_name == 'ad_accounts':
                                    timezone_desc = record.get('timezone', timezone_desc)

                                # sync_endpoint for child
                                LOGGER.info('START Sync for Stream: {}, parent_stream: {}, parent_id: {}'.format(
                                    child_stream_name, stream_name, parent_id))
                                child_total_records = sync_endpoint(
                                    client=client,
                                    config=config,
                                    catalog=catalog,
                                    state=state,
                                    stream_name=child_stream_name,
                                    endpoint_config=child_endpoint_config,
                                    sync_streams=sync_streams,
                                    selected_streams=selected_streams,
                                    timezone_desc=timezone_desc,
                                    parent_id=parent_id)
                                LOGGER.info('FINISHED Sync for Stream: {}, parent_id: {}, total_records: {}'.format(
                                    child_stream_name, parent_id, child_total_records))
                            # End transformed data record loop
                        # End if child in sync_streams
                    # End child streams for parent
                # End if children

                # Parent record batch
                total_records = total_records + record_count
                endpoint_total = endpoint_total + record_count
                LOGGER.info('Synced Stream: {}, page: {}, records: {} to {}'.format(
                    stream_name, page, offset, total_records))
                # Pagination: increment the offset by the limit (batch-size) and page
                if limit:
                    offset = offset + limit
                page = page + 1
            # End page/batch - while next URL loop
        # End country_code loop

        # Update the state with the max_bookmark_value for the stream date window
        # Snapchat Ads API does not allow page/batch sorting; bookmark written for date window
        if bookmark_field and stream_name in selected_streams:
            write_bookmark(state, stream_name, max_bookmark_value, bookmark_field, parent, parent_id)

        # Increment date window and sum endpoint_total
        start_window = end_window
        next_end_window = end_window + timedelta(days=date_window_size)
        if next_end_window > now_datetime:
            end_window = now_datetime
        else:
            end_window = next_end_window
    # End date window

    # Return total_records (for all pages and date windows)
    return endpoint_total
def sync_endpoint( client, #pylint: disable=too-many-branches catalog, state, start_date, stream_name, path, static_params, bookmark_query_field=None, bookmark_field=None, bookmark_type=None, parent=None, parent_id=None): # Get the latest bookmark for the stream and set the last_integer/datetime last_datetime = None last_integer = None max_bookmark_value = None if bookmark_type == 'integer': last_integer = get_bookmark(state, stream_name, 0) max_bookmark_value = last_integer else: last_datetime = get_bookmark(state, stream_name, start_date) max_bookmark_value = last_datetime # pagination: loop thru all pages of data using next_url (if not None) page = 1 offset = 0 to_rec = 0 limit = 100 # Default per_page limit is 100 total_endpoint_records = 0 next_url = '{}/{}'.format(client.base_url, path) params = { 'page': page, 'per': limit, **static_params # adds in endpoint specific, sort, filter params } total_processed_records = 0 while next_url is not None: # Need URL querystring for 1st page; subsequent pages provided by next_url # querystring: Squash query params into string if page == 1: if bookmark_query_field: if bookmark_type == 'datetime': params[bookmark_query_field] = start_date elif bookmark_type == 'integer': params[bookmark_query_field] = last_integer if params != {}: querystring = '&'.join([ '%s=%s' % (key, value) for (key, value) in params.items() ]) else: querystring = None LOGGER.info('URL for Stream {}: {}{}'.format( stream_name, next_url, '?{}'.format(querystring) if querystring else '')) # API request data # total_endpoint_records: API response for all pages data = {} data, total_endpoint_records, next_url = client.get( url=next_url, path=path, params=querystring, endpoint=stream_name) # time_extracted: datetime when the data was extracted from the API time_extracted = utils.now() if not data or data is None or data == {}: return total_endpoint_records # No data results # Transform data with transform_json from transform.py # The data_key identifies the array/list of records below the <root> element transformed_data = [] # initialize the record list if isinstance(data, list): transformed_data = transform_json(data, stream_name) if not transformed_data or transformed_data is None: LOGGER.info('No transformed data for data = {}'.format(data)) return total_endpoint_records # No data results # Process records and get the max_bookmark_value and record_count for the set of records max_bookmark_value, record_count = process_records( catalog=catalog, stream_name=stream_name, records=transformed_data, time_extracted=time_extracted, bookmark_field=bookmark_field, bookmark_type=bookmark_type, max_bookmark_value=max_bookmark_value, last_datetime=last_datetime, last_integer=last_integer, parent=parent, parent_id=parent_id) total_processed_records = total_processed_records + record_count LOGGER.info( 'Stream {}, batch processed {} records, total processed records {}' .format(stream_name, record_count, total_processed_records)) # Update the state with the max_bookmark_value for the stream if bookmark_field: write_bookmark(state, stream_name, max_bookmark_value) # to_rec: to record; ending record for the batch page to_rec = offset + len(data) LOGGER.info( 'Synced Stream: {}, page: {}, records: {} to {} of {}'.format( stream_name, page, offset, to_rec, total_endpoint_records)) # Pagination: increment the offset by the limit (batch-size) and page if not next_url: offset = offset + len(data) else: offset = offset + limit page = page + 1 # Return total_endpoint_records across all pages 
LOGGER.info('Synced Stream: {}, pages: {}, total records: {}'.format( stream_name, page - 1, total_endpoint_records)) return total_endpoint_records
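# Hedged sketch: the first-page querystring assembly used by sync_endpoint above. The
# manual '&'.join() works for simple values; urllib.parse.urlencode is shown here as the
# equivalent that also percent-encodes values. The helper and its arguments are
# illustrative, not part of the tap.
from urllib.parse import urlencode


def build_first_page_querystring(static_params, page, limit,
                                 bookmark_query_field=None,
                                 bookmark_value=None):
    params = {'page': page, 'per': limit, **static_params}
    if bookmark_query_field and bookmark_value is not None:
        params[bookmark_query_field] = bookmark_value
    return urlencode(params) if params else None


# build_first_page_querystring({'sort': 'updated_at'}, 1, 100,
#                              'updated_since', '2021-01-01T00:00:00Z')
# -> 'page=1&per=100&sort=updated_at&updated_since=2021-01-01T00%3A00%3A00Z'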
def sync_records(ns, catalog_entry, state, counter):
    chunked_bookmark = singer_utils.strptime_with_tz(
        ns.get_start_date(state, catalog_entry))
    stream = catalog_entry['stream']
    schema = catalog_entry['schema']
    stream_alias = catalog_entry.get('stream_alias')
    catalog_metadata = metadata.to_map(catalog_entry['metadata'])
    replication_key = catalog_metadata.get((), {}).get('replication-key')
    stream_version = get_stream_version(catalog_entry, state)
    activate_version_message = singer.ActivateVersionMessage(
        stream=(stream_alias or stream), version=stream_version)

    start_time = singer_utils.now()

    LOGGER.info('Syncing NetSuite data for stream %s', stream)

    previous_max_replication_key = None

    query_func = ns.query

    for rec in query_func(ns, catalog_entry, state):
        counter.increment()
        with Transformer(
                pre_hook=transform_data_hook(ns, stream)) as transformer:
            rec = transformer.transform(rec, schema)

        singer.write_message(
            singer.RecordMessage(stream=(stream_alias or stream),
                                 record=rec,
                                 version=stream_version,
                                 time_extracted=start_time))

        if replication_key:
            _rec = rec.get(replication_key, None)
            original_replication_key_value = ""
            replication_key_value = None
            if replication_key and _rec is not None:
                original_replication_key_value = _rec
                replication_key_value = singer_utils.strptime_with_tz(
                    original_replication_key_value)

            # Before writing a bookmark, make sure NetSuite has not given us a
            # record with one outside our range
            if previous_max_replication_key is None or (
                    replication_key_value
                    and replication_key_value <= start_time
                    and replication_key_value > previous_max_replication_key):
                state = singer.write_bookmark(state,
                                              catalog_entry['tap_stream_id'],
                                              replication_key,
                                              original_replication_key_value)
                previous_max_replication_key = replication_key_value

    # Tables with no replication_key will send an
    # activate_version message for the next sync
    if not replication_key:
        singer.write_message(activate_version_message)
        state = singer.write_bookmark(state, catalog_entry['tap_stream_id'],
                                      'version', None)
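# Hedged sketch: the bookmark-advance guard from sync_records above, isolated. A new
# bookmark is only written when no bookmark has been written yet this sync, or when the
# record's replication-key value is within range (not in the future relative to
# start_time) and greater than the best value seen so far. Names are illustrative.
def should_advance_bookmark(candidate, start_time, previous_max):
    """Mirror of the guard above: True when the bookmark should move forward."""
    return previous_max is None or (candidate is not None
                                    and candidate <= start_time
                                    and candidate > previous_max)


# from datetime import datetime, timezone
# should_advance_bookmark(datetime(2022, 1, 2, tzinfo=timezone.utc),
#                         datetime(2022, 1, 3, tzinfo=timezone.utc),
#                         datetime(2022, 1, 1, tzinfo=timezone.utc))  # -> True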
def sync_companies(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    bumble_bee = Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING)
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_to_utc(get_start(STATE, "companies", bookmark_key))
    LOGGER.info("sync_companies from %s", start)
    schema = load_schema('companies')
    singer.write_schema("companies", schema, ["companyId"], [bookmark_key],
                        catalog.get('stream_alias'))

    # Because this stream doesn't query by `lastUpdated`, it cycles
    # through the data set every time. The issue with this is that there
    # is a race condition by which records may be updated between the
    # start of this table's sync and the end, causing some updates to not
    # be captured. To combat this, we store the current sync's start in
    # the state and do not move the bookmark past this value.
    current_sync_start = get_current_sync_start(STATE,
                                                "companies") or utils.now()
    STATE = write_current_sync_start(STATE, "companies", current_sync_start)
    singer.write_state(STATE)

    url = get_url("companies_all")
    max_bk_value = start
    if CONTACTS_BY_COMPANY in ctx.selected_stream_ids:
        contacts_by_company_schema = load_schema(CONTACTS_BY_COMPANY)
        singer.write_schema("contacts_by_company", contacts_by_company_schema,
                            ["company-id", "contact-id"])

    with bumble_bee:
        for row in gen_request(STATE, 'companies', url,
                               default_company_params, 'companies',
                               'has-more', ['offset'], ['offset']):
            row_properties = row['properties']
            modified_time = None
            if bookmark_key in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties[bookmark_key][
                    'timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(
                    timestamp_millis, datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties['createdate'][
                    'timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(
                    timestamp_millis, datetime.timezone.utc)

            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = request(
                    get_url("companies_detail",
                            company_id=row['companyId'])).json()
                record = bumble_bee.transform(
                    lift_properties_and_versions(record), schema, mdata)
                singer.write_record("companies",
                                    record,
                                    catalog.get('stream_alias'),
                                    time_extracted=utils.now())
                if CONTACTS_BY_COMPANY in ctx.selected_stream_ids:
                    STATE = _sync_contacts_by_company(STATE, ctx,
                                                      record['companyId'])

    # Don't bookmark past the start of this sync to account for updated records during the sync.
    new_bookmark = min(max_bk_value, current_sync_start)
    STATE = singer.write_bookmark(STATE, 'companies', bookmark_key,
                                  utils.strftime(new_bookmark))
    STATE = write_current_sync_start(STATE, 'companies', None)
    singer.write_state(STATE)
    return STATE
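# Hedged sketch: the bookmark clamp at the end of sync_companies above. Because the
# stream re-reads the whole data set, the bookmark is capped at the moment the sync
# began so records updated mid-sync are re-fetched on the next run. Illustrative only.
from datetime import datetime, timezone


def clamp_bookmark(max_bk_value, current_sync_start):
    """Never bookmark past the start of the current sync."""
    return min(max_bk_value, current_sync_start)


# Sync started at 10:00; the newest record seen was modified at 10:05 (a mid-sync write):
# clamp_bookmark(datetime(2022, 5, 1, 10, 5, tzinfo=timezone.utc),
#                datetime(2022, 5, 1, 10, 0, tzinfo=timezone.utc))
# -> 2022-05-01 10:00:00+00:00, so the 10:05 update is picked up again next run.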
def _run_binlog_sync(mysql_conn, reader, binlog_streams_map, state,
                     config: Dict):
    time_extracted = utils.now()

    rows_saved = 0
    events_skipped = 0

    current_log_file, current_log_pos = fetch_current_log_file_and_pos(
        mysql_conn)
    log_file = None
    log_pos = None

    for binlog_event in reader:
        if isinstance(binlog_event, RotateEvent):
            state = update_bookmarks(state, binlog_streams_map,
                                     binlog_event.next_binlog,
                                     binlog_event.position)
        else:
            tap_stream_id = common.generate_tap_stream_id(
                binlog_event.schema, binlog_event.table)
            streams_map_entry = binlog_streams_map.get(tap_stream_id, {})
            catalog_entry = streams_map_entry.get('catalog_entry')
            desired_columns = streams_map_entry.get('desired_columns')

            if not catalog_entry:
                events_skipped = events_skipped + 1

                if events_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.debug(
                        "Skipped %s events so far as they were not for selected tables; %s rows extracted",
                        events_skipped, rows_saved)
            else:
                # Compare event's columns to the schema properties
                diff = set(get_db_column_types(binlog_event).keys()).\
                    difference(catalog_entry.schema.properties.keys())

                # If there are additional cols in the event then run discovery and update the catalog
                if diff:
                    # run discovery for the current table only
                    catalog_entry = discover_catalog(
                        mysql_conn, config.get('filter_dbs'),
                        catalog_entry.table).streams[0]

                    # the new catalog has "stream" property = table name, we need to update that to make it the same as
                    # the result of the "resolve_catalog" function
                    catalog_entry.stream = tap_stream_id

                    desired_columns = list(
                        catalog_entry.schema.properties.keys())

                    # Add the _sdc_deleted_at col
                    add_automatic_properties(catalog_entry, desired_columns)

                    # update this dictionary while we're at it
                    binlog_streams_map[tap_stream_id][
                        'catalog_entry'] = catalog_entry
                    binlog_streams_map[tap_stream_id][
                        'desired_columns'] = desired_columns

                    # send the new schema to the target
                    write_schema_message(catalog_entry=catalog_entry)

                if isinstance(binlog_event, WriteRowsEvent):
                    rows_saved = handle_write_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)

                elif isinstance(binlog_event, UpdateRowsEvent):
                    rows_saved = handle_update_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)

                elif isinstance(binlog_event, DeleteRowsEvent):
                    rows_saved = handle_delete_rows_event(
                        binlog_event, catalog_entry, state, desired_columns,
                        rows_saved, time_extracted)
                else:
                    LOGGER.debug(
                        "Skipping event for table %s.%s as it is not an INSERT, UPDATE, or DELETE",
                        binlog_event.schema, binlog_event.table)

        # Update log_file and log_pos after every processed binlog event
        log_file = reader.log_file
        log_pos = reader.log_pos

        # The iterator across python-mysql-replication's fetchone method should ultimately terminate
        # upon receiving an EOF packet. There seem to be some cases when a MySQL server will not send
        # one causing binlog replication to hang.
        if current_log_file == log_file and log_pos >= current_log_pos:
            break

        # Update singer bookmark and send STATE message periodically
        if ((rows_saved and rows_saved % UPDATE_BOOKMARK_PERIOD == 0) or
                (events_skipped
                 and events_skipped % UPDATE_BOOKMARK_PERIOD == 0)):
            state = update_bookmarks(state, binlog_streams_map, log_file,
                                     log_pos)
            singer.write_message(
                singer.StateMessage(value=copy.deepcopy(state)))

    # Update singer bookmark one last time to point it to the last processed binlog event
    if log_file and log_pos:
        state = update_bookmarks(state, binlog_streams_map, log_file, log_pos)
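# Hedged sketch: the column-drift check inside _run_binlog_sync above, isolated. When a
# binlog event carries columns that are missing from the cached catalog schema, the tap
# re-runs discovery for that table. event_columns/cached_properties stand in for
# get_db_column_types(binlog_event) and catalog_entry.schema.properties; the helper
# itself is illustrative, not part of the tap.
def new_columns_in_event(event_columns, cached_properties):
    """Return column names present in the event but absent from the cached schema."""
    return set(event_columns.keys()) - set(cached_properties.keys())


# new_columns_in_event({'id': 'int', 'name': 'varchar', 'created_at': 'datetime'},
#                      {'id': {}, 'name': {}})
# -> {'created_at'}  # non-empty, so discovery would be re-run for this table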
def sync_entity_chunked(STATE, catalog, entity_name, key_properties, path): schema = load_schema(entity_name) bookmark_key = 'startTimestamp' singer.write_schema(entity_name, schema, key_properties, [bookmark_key], catalog.get('stream_alias')) start = get_start(STATE, entity_name, bookmark_key) LOGGER.info("sync_%s from %s", entity_name, start) now = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC) now_ts = int(now.timestamp() * 1000) start_ts = int(utils.strptime_with_tz(start).timestamp() * 1000) url = get_url(entity_name) mdata = metadata.to_map(catalog.get('metadata')) if entity_name == 'email_events': window_size = int(CONFIG['email_chunk_size']) elif entity_name == 'subscription_changes': window_size = int(CONFIG['subscription_chunk_size']) with metrics.record_counter(entity_name) as counter: while start_ts < now_ts: end_ts = start_ts + window_size params = { 'startTimestamp': start_ts, 'endTimestamp': end_ts, 'limit': 1000, } with Transformer( UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee: while True: our_offset = singer.get_offset(STATE, entity_name) if bool(our_offset) and our_offset.get('offset') != None: params[StateFields.offset] = our_offset.get('offset') data = request(url, params).json() time_extracted = utils.now() for row in data[path]: counter.increment() record = bumble_bee.transform( lift_properties_and_versions(row), schema, mdata) singer.write_record(entity_name, record, catalog.get('stream_alias'), time_extracted=time_extracted) if data.get('hasMore'): STATE = singer.set_offset(STATE, entity_name, 'offset', data['offset']) singer.write_state(STATE) else: STATE = singer.clear_offset(STATE, entity_name) singer.write_state(STATE) break STATE = singer.write_bookmark(STATE, entity_name, 'startTimestamp', utils.strftime(datetime.datetime.fromtimestamp((start_ts / 1000), datetime.timezone.utc))) # pylint: disable=line-too-long singer.write_state(STATE) start_ts = end_ts STATE = singer.clear_offset(STATE, entity_name) singer.write_state(STATE) return STATE
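# Hedged sketch: the epoch-millisecond windowing used by sync_entity_chunked above.
# HubSpot's chunked endpoints take startTimestamp/endTimestamp in milliseconds, so the
# window arithmetic is done on ints; as in the function above, the final window may
# extend past "now". Helper name and values are illustrative.
def iter_ms_windows(start_ts, now_ts, window_size):
    """Yield (start_ts, end_ts) millisecond windows until now_ts is reached."""
    while start_ts < now_ts:
        end_ts = start_ts + window_size
        yield start_ts, end_ts
        start_ts = end_ts


# one_day_ms = 24 * 60 * 60 * 1000
# list(iter_ms_windows(0, 2 * one_day_ms, one_day_ms))
# -> [(0, 86400000), (86400000, 172800000)]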
def sync_campaigns(STATE, ctx): catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) mdata = metadata.to_map(catalog.get('metadata')) schema = load_schema("campaigns") singer.write_schema("campaigns", schema, ["id"], catalog.get('stream_alias')) LOGGER.info("sync_campaigns(NO bookmarks)") url = get_url("campaigns_all") params = {'limit': 500} with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee: for row in gen_request(STATE, 'campaigns', url, params, "campaigns", "hasMore", ["offset"], ["offset"]): record = request(get_url("campaigns_detail", campaign_id=row['id'])).json() record = bumble_bee.transform(lift_properties_and_versions(record), schema, mdata) singer.write_record("campaigns", record, catalog.get('stream_alias'), time_extracted=utils.now()) return STATE
def sync_endpoint(
        client,  # pylint: disable=too-many-branches,too-many-nested-blocks
        catalog,
        state,
        start_date,
        stream_name,
        path,
        static_params,
        endpoint_config,
        bookmark_query_field=None,
        bookmark_field=None,
        bookmark_type=None,
        id_fields=None,
        selected_streams=None,
        parent=None,
        parent_id=None):

    # Get the latest bookmark for the stream and set the last_integer/datetime
    last_datetime = None
    last_integer = None
    data_key = endpoint_config.get('data_key')

    if bookmark_type == 'integer':
        last_integer = get_bookmark(state, stream_name, 0)
        max_bookmark_value = last_integer
    else:
        last_datetime = get_bookmark(state, stream_name, start_date)
        max_bookmark_value = last_datetime

    # pagination: loop thru all pages of data using next_url (if not None)
    page = 1
    offset = 0
    limit = 100  # Default per_page limit is 100
    total_endpoint_records = 0
    url = '{}/{}'.format(client.base_url, path)
    next_token = None
    params = {
        'perPage': limit,
        **static_params  # adds in endpoint specific, sort, filter params
    }
    total_processed_records = 0

    while url is not None:
        # Need URL querystring for 1st page; subsequent pages provided by next_url
        # querystring: Squash query params into string
        if page == 1:
            if bookmark_query_field:
                if bookmark_type == 'datetime':
                    params[bookmark_query_field] = start_date
                elif bookmark_type == 'integer':
                    params[bookmark_query_field] = last_integer
        else:
            if next_token:
                params['nextToken'] = next_token

        if params != {}:
            querystring = '&'.join(
                ['%s=%s' % (key, value) for (key, value) in params.items()])
            querystring = querystring.replace('<parent_id>', str(parent_id))

        LOGGER.info('URL for Stream {}: {}{}'.format(
            stream_name, url,
            '?{}'.format(querystring) if querystring else ''))

        if stream_name == 'recipients' and parent_id is None:
            break

        # API request data
        # total_endpoint_records: accumulated below from each batch's record count
        data, next_token = client.get(url=url,
                                      path=path,
                                      params=querystring,
                                      endpoint=stream_name)

        # time_extracted: datetime when the data was extracted from the API
        time_extracted = utils.now()
        if not data or data is None or data == {}:
            return total_endpoint_records  # No data results

        if stream_name == 'recipients':
            if not data.get(data_key):
                break

        # Transform data with transform_data from transform.py
        # The data_key identifies the array/list of records below the <root> element
        transformed_data = transform_data(data.get(data_key), stream_name,
                                          parent_id)
        record_count = 0

        # Process records and get the max_bookmark_value and record_count for the set of records
        max_bookmark_value, record_count = process_records(
            catalog=catalog,
            stream_name=stream_name,
            records=transformed_data,
            time_extracted=time_extracted,
            bookmark_field=bookmark_field,
            bookmark_type=bookmark_type,
            max_bookmark_value=max_bookmark_value,
            last_datetime=last_datetime,
            last_integer=last_integer,
            parent=parent,
            parent_id=parent_id)
        total_processed_records = total_processed_records + record_count
        total_endpoint_records = total_endpoint_records + record_count
        LOGGER.info(
            'Stream {}, batch processed {} records, total processed records {}'
            .format(stream_name, record_count, total_processed_records))

        # Loop thru parent batch records for each children objects (if should stream)
        children = endpoint_config.get('children')
        if children:
            for child_stream_name, child_endpoint_config in children.items():
                LOGGER.info('Child Stream: %s, endpoint_config: %s',
                            child_stream_name, child_endpoint_config)
                if child_stream_name in selected_streams:
                    write_schema(catalog, child_stream_name)
                    # For each parent record
                    for record in transformed_data:
                        i = 0
                        # Set parent_id
                        for id_field in id_fields:
                            if i == 0:
                                parent_id_field = id_field
                            if id_field == 'id':
parent_id_field = id_field i = i + 1 parent_id = record.get(parent_id_field) # sync_endpoint for child LOGGER.info( 'START Sync for Stream: {}, parent_stream: {}, parent_id: {}' .format(child_stream_name, stream_name, parent_id)) child_path = child_endpoint_config.get( 'path', child_stream_name).format(str(parent_id)) child_bookmark_field = next( iter( child_endpoint_config.get( 'replication_keys', [])), None) child_total_records = sync_endpoint( client=client, catalog=catalog, state=state, start_date=start_date, stream_name=child_stream_name, path=child_path, endpoint_config=child_endpoint_config, static_params=child_endpoint_config.get( 'params', {}), bookmark_query_field=child_endpoint_config.get( 'bookmark_query_field'), bookmark_field=child_bookmark_field, bookmark_type=child_endpoint_config.get( 'bookmark_type'), id_fields=child_endpoint_config.get( 'key_properties'), selected_streams=selected_streams, parent=child_endpoint_config.get('parent'), parent_id=parent_id) LOGGER.info( 'FINISHED Sync for Stream: {}, parent_id: {}, total_records: {}' .format(child_stream_name, parent_id, child_total_records)) # to_rec: to record; ending record for the batch page to_rec = offset + record_count total_processed_records = to_rec LOGGER.info('Synced Stream: {}, page: {}, records: {} to {}'.format( stream_name, page, offset, to_rec)) # Pagination: increment the offset by the limit (batch-size) and page offset = offset + record_count page = page + 1 # If the API doesn't return a next token then that was the last page of results if not next_token: # Update the state with the max_bookmark_value for the stream if bookmark_field: write_bookmark(state, stream_name, max_bookmark_value) url = None # Return total_endpoint_records across all pages LOGGER.info('Synced Stream: {}, pages: {}, total records: {}'.format( stream_name, page - 1, total_endpoint_records)) return total_endpoint_records
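# Hedged sketch: how the parent id field for a child stream is chosen in the loop above.
# The first key property is the default, but a key literally named 'id' wins if present.
# The helper is illustrative, not part of the tap.
def pick_parent_id_field(id_fields):
    parent_id_field = None
    for i, id_field in enumerate(id_fields):
        if i == 0:
            parent_id_field = id_field
        if id_field == 'id':
            parent_id_field = id_field
    return parent_id_field


# pick_parent_id_field(['campaign_id', 'id'])  # -> 'id'
# pick_parent_id_field(['recipient_id'])       # -> 'recipient_id'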
def get_absolute_start_end_time(self, last_dttm, lookback=0): now_dttm = now() abs_start, abs_end = self.round_times( last_dttm - timedelta(days=lookback), now_dttm) return abs_start, abs_end
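# Hedged sketch: the lookback window from get_absolute_start_end_time above, without the
# class context. The absolute window starts `lookback` days before the last bookmark and
# ends now; the rounding applied by round_times in the method above is omitted here.
from datetime import datetime, timedelta, timezone


def absolute_start_end(last_dttm, lookback=0):
    now_dttm = datetime.now(timezone.utc)
    return last_dttm - timedelta(days=lookback), now_dttm


# absolute_start_end(datetime(2022, 6, 1, tzinfo=timezone.utc), lookback=3)
# -> (2022-05-29 00:00:00+00:00, <current UTC time>)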