def sync_contact_lists(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    schema = load_schema("contact_lists")
    bookmark_key = 'updatedAt'
    singer.write_schema("contact_lists", schema, ["listId"],
                        [bookmark_key], catalog.get('stream_alias'))

    start = get_start(STATE, "contact_lists", bookmark_key)
    max_bk_value = start

    LOGGER.info("sync_contact_lists from %s", start)

    url = get_url("contact_lists")
    params = {'count': 250}
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request(STATE, 'contact_lists', url, params, "lists",
                               "has-more", ["offset"], ["offset"]):
            record = bumble_bee.transform(lift_properties_and_versions(row), schema, mdata)

            if record[bookmark_key] >= start:
                singer.write_record("contact_lists", record,
                                    catalog.get('stream_alias'),
                                    time_extracted=utils.now())
                if record[bookmark_key] >= max_bk_value:
                    max_bk_value = record[bookmark_key]

    STATE = singer.write_bookmark(STATE, 'contact_lists', bookmark_key, max_bk_value)
    singer.write_state(STATE)

    return STATE
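
# The sync functions in this file lean on a get_start() helper that is not
# shown here. A minimal sketch of what it plausibly does, assuming a
# module-level CONFIG dict with a 'start_date' key (both names are
# assumptions, not confirmed by this file):
def get_start(STATE, tap_stream_id, bookmark_key):
    # Return the saved bookmark for this stream, falling back to the
    # configured start date on the first-ever sync.
    current_bookmark = singer.get_bookmark(STATE, tap_stream_id, bookmark_key)
    if current_bookmark is None:
        return CONFIG['start_date']  # assumed config key
    return current_bookmark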
def sync_forms(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    schema = load_schema("forms")
    bookmark_key = 'updatedAt'

    singer.write_schema("forms", schema, ["guid"],
                        [bookmark_key], catalog.get('stream_alias'))
    start = get_start(STATE, "forms", bookmark_key)
    max_bk_value = start

    LOGGER.info("sync_forms from %s", start)

    data = request(get_url("forms")).json()
    time_extracted = utils.now()

    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in data:
            record = bumble_bee.transform(lift_properties_and_versions(row), schema, mdata)

            if record[bookmark_key] >= start:
                singer.write_record("forms", record,
                                    catalog.get('stream_alias'),
                                    time_extracted=time_extracted)
                if record[bookmark_key] >= max_bk_value:
                    max_bk_value = record[bookmark_key]

    STATE = singer.write_bookmark(STATE, 'forms', bookmark_key, max_bk_value)
    singer.write_state(STATE)

    return STATE
def sync_owners(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    schema = load_schema("owners")
    bookmark_key = 'updatedAt'

    singer.write_schema("owners", schema, ["ownerId"],
                        [bookmark_key], catalog.get('stream_alias'))
    start = get_start(STATE, "owners", bookmark_key)
    max_bk_value = start

    LOGGER.info("sync_owners from %s", start)

    params = {}
    if CONFIG.get('include_inactives'):
        params['includeInactives'] = "true"
    data = request(get_url("owners"), params).json()

    time_extracted = utils.now()

    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in data:
            record = bumble_bee.transform(row, schema, mdata)
            if record[bookmark_key] >= max_bk_value:
                max_bk_value = record[bookmark_key]

            if record[bookmark_key] >= start:
                singer.write_record("owners", record,
                                    catalog.get('stream_alias'),
                                    time_extracted=time_extracted)

    STATE = singer.write_bookmark(STATE, 'owners', bookmark_key, max_bk_value)
    singer.write_state(STATE)

    return STATE
def get_streams_to_sync(streams, state):
    # get selected streams
    selected_streams = [s for s in streams if is_stream_selected(s)]

    # prioritize streams that have not been processed
    streams_with_state = []
    streams_without_state = []
    for stream in selected_streams:
        if state.get('bookmarks', {}).get(stream['tap_stream_id']):
            streams_with_state.append(stream)
        else:
            streams_without_state.append(stream)

    ordered_streams = streams_without_state + streams_with_state

    # If the state says we were in the middle of processing a stream, skip
    # to that stream. Then process streams without prior state and finally
    # move onto streams with state (i.e. have been synced in the past)
    currently_syncing = singer.get_currently_syncing(state)
    if currently_syncing:
        currently_syncing_stream = list(filter(
            lambda s: s['tap_stream_id'] == currently_syncing,
            ordered_streams))
        non_currently_syncing_streams = list(filter(
            lambda s: s['tap_stream_id'] != currently_syncing,
            ordered_streams))
        streams_to_sync = currently_syncing_stream + non_currently_syncing_streams
    else:
        streams_to_sync = ordered_streams

    return streams_to_sync
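
# get_streams_to_sync() above (and get_non_oplog_streams() below) filter on an
# is_stream_selected() predicate defined elsewhere. A minimal sketch, assuming
# the usual Singer convention of a 'selected' flag in the stream's
# empty-breadcrumb metadata entry:
def is_stream_selected(stream):
    # Look up the top-level metadata entry and read its 'selected' flag.
    stream_metadata = metadata.to_map(stream['metadata'])
    return stream_metadata.get((), {}).get('selected', False)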
def _sync_contacts_by_company(STATE, ctx, company_id):
    schema = load_schema(CONTACTS_BY_COMPANY)
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    url = get_url("contacts_by_company", company_id=company_id)
    path = 'vids'

    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        with metrics.record_counter(CONTACTS_BY_COMPANY) as counter:
            data = request(url, default_contacts_by_company_params).json()

            if data.get(path) is None:
                raise RuntimeError(
                    "Unexpected API response: {} not in {}".format(path, data.keys()))

            for row in data[path]:
                counter.increment()
                record = {'company-id': company_id, 'contact-id': row}
                record = bumble_bee.transform(
                    lift_properties_and_versions(record), schema, mdata)
                singer.write_record("contacts_by_company", record,
                                    time_extracted=utils.now())

    return STATE
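
# Several HubSpot syncs wrap rows in lift_properties_and_versions() before
# transforming them. A plausible sketch of that helper, assuming it hoists
# each nested 'properties' entry to a top-level 'property_<name>' field and
# collects any version history (the exact behavior is not shown in this file):
def lift_properties_and_versions(record):
    for key, value in record.get('properties', {}).items():
        # Hoist the property to a prefixed top-level key.
        record['property_{}'.format(key)] = value
        versions = value.get('versions')
        if versions:
            # Accumulate version history under a single top-level list.
            record.setdefault('properties_versions', []).extend(versions)
    return record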
def do_sync(conn_config, catalog, default_replication_method, state):
    currently_syncing = singer.get_currently_syncing(state)
    streams = list(filter(is_selected_via_metadata, catalog['streams']))
    streams.sort(key=lambda s: s['tap_stream_id'])
    LOGGER.info("Selected streams: %s ", list(map(lambda s: s['tap_stream_id'], streams)))

    if any_logical_streams(streams, default_replication_method):
        LOGGER.info("Use of logical replication requires fetching an lsn...")
        end_lsn = logical_replication.fetch_current_lsn(conn_config)
        LOGGER.info("End LSN: %s ", end_lsn)
    else:
        end_lsn = None

    sync_method_lookup, traditional_streams, logical_streams = \
        sync_method_for_streams(streams, state, default_replication_method)

    if currently_syncing:
        LOGGER.info("found currently_syncing: %s", currently_syncing)
        currently_syncing_stream = list(filter(
            lambda s: s['tap_stream_id'] == currently_syncing, traditional_streams))
        # list(filter(...)) is never None, so test for an empty list instead
        if not currently_syncing_stream:
            LOGGER.warning("unable to locate currently_syncing(%s) amongst selected traditional streams(%s). will ignore",
                           currently_syncing,
                           list(map(lambda s: s['tap_stream_id'], traditional_streams)))
        other_streams = list(filter(
            lambda s: s['tap_stream_id'] != currently_syncing, traditional_streams))
        traditional_streams = currently_syncing_stream + other_streams
    else:
        LOGGER.info("No currently_syncing found")

    for stream in traditional_streams:
        state = sync_traditional_stream(conn_config, stream, state,
                                        sync_method_lookup[stream['tap_stream_id']], end_lsn)

    logical_streams.sort(key=lambda s: metadata.to_map(s['metadata']).get(()).get('database-name'))
    for dbname, streams in itertools.groupby(
            logical_streams,
            lambda s: metadata.to_map(s['metadata']).get(()).get('database-name')):
        conn_config['dbname'] = dbname
        state = sync_logical_streams(conn_config, list(streams), state, end_lsn)

    return state
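
# The database-flavored do_sync() variants gate LSN/SCN fetching on
# any_logical_streams(). A minimal sketch, assuming replication-method
# resolution mirrors the per-stream metadata lookup used elsewhere in this
# file (dict-style streams; the object-style variants would use s.metadata):
def any_logical_streams(streams, default_replication_method):
    # True if at least one selected stream resolves to LOG_BASED replication.
    for stream in streams:
        stream_metadata = metadata.to_map(stream['metadata'])
        replication_method = stream_metadata.get((), {}).get(
            'replication-method', default_replication_method)
        if replication_method == 'LOG_BASED':
            return True
    return False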
def do_sync(conn_config, catalog, default_replication_method, state):
    currently_syncing = singer.get_currently_syncing(state)
    streams = list(filter(is_selected_via_metadata, catalog.streams))
    streams.sort(key=lambda s: s.tap_stream_id)
    LOGGER.info("Selected streams: %s ", list(map(lambda s: s.tap_stream_id, streams)))

    if any_logical_streams(streams, default_replication_method):
        LOGGER.info("Use of log_miner requires fetching current scn...")
        end_scn = log_miner.fetch_current_scn(conn_config)
        LOGGER.info("End SCN: %s ", end_scn)
    else:
        end_scn = None

    sync_method_lookup, traditional_streams, logical_streams = \
        sync_method_for_streams(streams, state, default_replication_method)

    if currently_syncing:
        LOGGER.info("found currently_syncing: %s", currently_syncing)
        currently_syncing_stream = list(filter(
            lambda s: s.tap_stream_id == currently_syncing, traditional_streams))
        # list(filter(...)) is never None, so test for an empty list instead
        if not currently_syncing_stream:
            LOGGER.warning("unable to locate currently_syncing(%s) amongst selected traditional streams(%s). will ignore",
                           currently_syncing,
                           list(map(lambda s: s.tap_stream_id, traditional_streams)))
        else:
            other_streams = list(filter(
                lambda s: s.tap_stream_id != currently_syncing, traditional_streams))
            traditional_streams = currently_syncing_stream + other_streams
    else:
        LOGGER.info("No currently_syncing found")

    for stream in traditional_streams:
        state = sync_traditional_stream(conn_config, stream, state,
                                        sync_method_lookup[stream.tap_stream_id], end_scn)

    state = sync_log_miner_streams(conn_config, list(logical_streams), state, end_scn)
    return state
def sync_deals(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_with_tz(get_start(STATE, "deals", bookmark_key))
    max_bk_value = start
    LOGGER.info("sync_deals from %s", start)
    most_recent_modified_time = start
    params = {'count': 250,
              'includeAssociations': False,
              'properties': []}

    schema = load_schema("deals")
    singer.write_schema("deals", schema, ["dealId"],
                        [bookmark_key], catalog.get('stream_alias'))

    # Check if we should include associations
    for key in mdata.keys():
        if 'associations' in key:
            assoc_mdata = mdata.get(key)
            if assoc_mdata.get('selected') is True:
                params['includeAssociations'] = True

    # Append all the properties fields for deals to the request
    additional_properties = schema.get("properties").get("properties").get("properties")
    for key in additional_properties.keys():
        params['properties'].append(key)

    url = get_url('deals_all')
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request(STATE, 'deals', url, params, 'deals',
                               "hasMore", ["offset"], ["offset"]):
            row_properties = row['properties']
            modified_time = None
            if bookmark_key in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties[bookmark_key]['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis,
                                                                datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties['createdate']['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis,
                                                                datetime.timezone.utc)

            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = bumble_bee.transform(row, schema, mdata)
                singer.write_record("deals", record,
                                    catalog.get('stream_alias'),
                                    time_extracted=utils.now())

    STATE = singer.write_bookmark(STATE, 'deals', bookmark_key, utils.strftime(max_bk_value))
    singer.write_state(STATE)
    return STATE
def do_sync(conn_config, catalog, default_replication_method, state):
    streams = list(filter(is_selected_via_metadata, catalog.streams))
    streams.sort(key=lambda s: s.tap_stream_id)

    currently_syncing = singer.get_currently_syncing(state)
    if currently_syncing:
        streams = dropwhile(lambda s: s.tap_stream_id != currently_syncing, streams)

    for stream in streams:
        md_map = metadata.to_map(stream.metadata)
        conn_config['dbname'] = md_map.get(()).get('database-name')
        state = singer.set_currently_syncing(state, stream.tap_stream_id)

        desired_columns = [c for c in stream.schema.properties.keys()
                           if should_sync_column(md_map, c)]
        desired_columns.sort()

        if len(desired_columns) == 0:
            LOGGER.warning('There are no columns selected for stream %s, skipping it',
                           stream.tap_stream_id)
            continue

        replication_method = md_map.get((), {}).get('replication-method',
                                                    default_replication_method)
        if replication_method not in {'LOG_BASED', 'FULL_TABLE', 'INCREMENTAL'}:
            raise Exception("Unrecognized replication_method {}".format(replication_method))

        replication_key = md_map.get((), {}).get('replication-key')
        state = clear_state_on_replication_change(state, stream.tap_stream_id,
                                                  replication_key, replication_method)

        if replication_method == 'LOG_BASED' and md_map.get((), {}).get('is-view'):
            LOGGER.warning('Logical Replication is NOT supported for views. skipping stream %s',
                           stream.tap_stream_id)
            continue

        if replication_method == 'LOG_BASED':
            state = do_sync_logical_replication(conn_config, stream, state,
                                                desired_columns, md_map)
        elif replication_method == 'FULL_TABLE':
            state = do_sync_full_table(conn_config, stream, state,
                                       desired_columns, md_map)
        elif replication_method == 'INCREMENTAL':
            state = do_sync_incremental(conn_config, stream, state,
                                        desired_columns, md_map)

    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
def do_sync(conn_config, catalog, default_replication_method, state, state_file=None):
    """
    Orchestrates sync of all streams
    """
    currently_syncing = singer.get_currently_syncing(state)
    streams = list(filter(is_selected_via_metadata, catalog['streams']))
    streams.sort(key=lambda s: s['tap_stream_id'])
    LOGGER.info("Selected streams: %s ", [s['tap_stream_id'] for s in streams])

    if any_logical_streams(streams, default_replication_method):
        # Use of logical replication requires fetching an lsn
        end_lsn = logical_replication.fetch_current_lsn(conn_config)
        LOGGER.debug("end_lsn = %s ", end_lsn)
    else:
        end_lsn = None

    refresh_streams_schema(conn_config, streams)

    sync_method_lookup, traditional_streams, logical_streams = \
        sync_method_for_streams(streams, state, default_replication_method)

    if currently_syncing:
        LOGGER.debug("Found currently_syncing: %s", currently_syncing)

        currently_syncing_stream = list(
            filter(lambda s: s['tap_stream_id'] == currently_syncing, traditional_streams))

        if not currently_syncing_stream:
            LOGGER.warning("unable to locate currently_syncing(%s) amongst selected traditional streams(%s). "
                           "Will ignore",
                           currently_syncing,
                           {s['tap_stream_id'] for s in traditional_streams})

        other_streams = list(
            filter(lambda s: s['tap_stream_id'] != currently_syncing, traditional_streams))
        traditional_streams = currently_syncing_stream + other_streams
    else:
        LOGGER.info("No streams marked as currently_syncing in state file")

    for stream in traditional_streams:
        state = sync_traditional_stream(conn_config,
                                        stream,
                                        state,
                                        sync_method_lookup[stream['tap_stream_id']],
                                        end_lsn)

    logical_streams.sort(key=lambda s: metadata.to_map(s['metadata']).get(()).get('database-name'))
    for dbname, streams in itertools.groupby(
            logical_streams,
            lambda s: metadata.to_map(s['metadata']).get(()).get('database-name')):
        conn_config['dbname'] = dbname
        state = sync_logical_streams(conn_config, list(streams), state, end_lsn, state_file)

    return state
def sync(client, config, catalog, state):
    # Get selected_streams from catalog, based on state last_stream
    # last_stream = Previous currently synced stream, if the load was interrupted
    last_stream = singer.get_currently_syncing(state)
    LOGGER.info('last/currently syncing stream: {}'.format(last_stream))
    selected_streams = []
    for stream in catalog.get_selected_streams(state):
        selected_streams.append(stream.stream)
    LOGGER.info('selected_streams: {}'.format(selected_streams))

    if not selected_streams:
        return

    # Get the streams to sync (based on dependencies)
    sync_streams = []
    flat_streams = flatten_streams()
    # Loop thru all streams
    for stream_name, stream_metadata in flat_streams.items():
        # If stream has a parent_stream, then it is a child stream
        parent_stream = stream_metadata.get('parent_stream')
        grandparent_stream = stream_metadata.get('grandparent_stream')
        great_grandparent_stream = stream_metadata.get('great_grandparent_stream')
        if stream_name in selected_streams:
            LOGGER.info('stream: {}, parent: {}, grandparent: {}, great_grandparent: {}'.format(
                stream_name, parent_stream, grandparent_stream, great_grandparent_stream))
            if stream_name not in sync_streams:
                sync_streams.append(stream_name)
            if parent_stream and parent_stream not in sync_streams:
                sync_streams.append(parent_stream)
            if grandparent_stream and grandparent_stream not in sync_streams:
                sync_streams.append(grandparent_stream)
            if great_grandparent_stream and great_grandparent_stream not in sync_streams:
                sync_streams.append(great_grandparent_stream)
    LOGGER.info('Sync Streams: {}'.format(sync_streams))

    # Loop through endpoints in selected_streams
    for stream_name, endpoint_config in STREAMS.items():
        if stream_name in sync_streams:
            LOGGER.info('START Syncing: {}'.format(stream_name))
            write_schema(catalog, stream_name)
            update_currently_syncing(state, stream_name)
            total_records = sync_endpoint(client=client,
                                          config=config,
                                          catalog=catalog,
                                          state=state,
                                          stream_name=stream_name,
                                          endpoint_config=endpoint_config,
                                          sync_streams=sync_streams,
                                          selected_streams=selected_streams)
            update_currently_syncing(state, None)
            LOGGER.info('FINISHED Syncing: {}, total_records: {}'.format(
                stream_name, total_records))
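
# Several sync() orchestrators above and below checkpoint their progress via
# update_currently_syncing(). A minimal sketch, assuming the common tap
# pattern of clearing the key when stream_name is None and emitting state
# immediately:
def update_currently_syncing(state, stream_name):
    # Passing None marks the sync as finished; otherwise record the stream.
    if (stream_name is None) and ('currently_syncing' in state):
        del state['currently_syncing']
    else:
        singer.set_currently_syncing(state, stream_name)
    singer.write_state(state)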
def sync_deal_histories(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_with_tz(get_start(STATE, "deal_histories", bookmark_key))
    max_bk_value = start
    LOGGER.info("sync_deal_histories from %s", start)
    most_recent_modified_time = start
    params = {'count': 250,
              'includeAssociations': False,
              'properties': []}

    schema = load_schema("deal_histories")
    singer.write_schema("deal_histories", schema, ["dealId"],
                        [bookmark_key], catalog.get('stream_alias'))

    # Check if we should include associations
    for key in mdata.keys():
        if 'associations' in key:
            assoc_mdata = mdata.get(key)
            if assoc_mdata.get('selected') is True:
                params['includeAssociations'] = True

    # if mdata.get(('properties', 'properties'), {}).get('selected') or has_selected_custom_field(mdata):
    # On 2/12/20, hubspot added a lot of additional properties for
    # deals, and appending all of them to requests ended up leading to
    # 414 (url-too-long) errors. Hubspot recommended we use the
    # `includeAllProperties` and `allpropertiesFetchMode` params
    # instead.
    # params['includeAllProperties'] = True
    # params['allPropertiesFetchMode'] = 'latest_version'

    url = get_url('deals_all')
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request(STATE, 'deal_histories', url, params, 'deals',
                               "hasMore", ["offset"], ["offset"]):
            row_properties = row['properties']
            modified_time = None
            if bookmark_key in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties[bookmark_key]['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis,
                                                                datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties['createdate']['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis,
                                                                datetime.timezone.utc)

            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = request(
                    url=get_url("deals_detail", deal_id=row['dealId']),
                    params={"includePropertyVersions": "true"}
                ).json()
                record = bumble_bee.transform(lift_properties_and_versions(record), schema, mdata)
                singer.write_record("deal_histories", record,
                                    catalog.get('stream_alias'),
                                    time_extracted=utils.now())

    STATE = singer.write_bookmark(STATE, 'deal_histories', bookmark_key,
                                  utils.strftime(max_bk_value))
    singer.write_state(STATE)
    return STATE
def sync(client, config, catalog, state):
    if 'start_date' in config:
        start_date = config['start_date']

    # Get selected_streams from catalog, based on state last_stream
    # last_stream = Previous currently synced stream, if the load was interrupted
    last_stream = singer.get_currently_syncing(state)
    LOGGER.info('last/currently syncing stream: {}'.format(last_stream))

    selected_streams = []
    flat_streams = flatten_streams()
    for stream in catalog.get_selected_streams(state):
        selected_streams.append(stream.stream)
        parent_stream = flat_streams.get(stream.stream, {}).get('parent_stream')
        if parent_stream:
            if parent_stream not in selected_streams:
                selected_streams.append(parent_stream)
    LOGGER.info('selected_streams: {}'.format(selected_streams))

    if not selected_streams:
        return

    # Loop through selected_streams
    for stream_name, endpoint_config in STREAMS.items():
        if stream_name in selected_streams:
            LOGGER.info('START Syncing: {}'.format(stream_name))
            update_currently_syncing(state, stream_name)
            path = endpoint_config.get('path', stream_name)
            bookmark_field = next(iter(endpoint_config.get('replication_keys', [])), None)
            replication_ind = endpoint_config.get('replication_ind', True)
            if replication_ind:
                selected_fields = get_selected_fields(catalog, stream_name)
                LOGGER.info('Stream: {}, selected_fields: {}'.format(stream_name, selected_fields))
                write_schema(catalog, stream_name)
            else:
                selected_fields = None
            total_records = sync_endpoint(
                client=client,
                catalog=catalog,
                state=state,
                start_date=start_date,
                stream_name=stream_name,
                path=path,
                endpoint_config=endpoint_config,
                static_params=endpoint_config.get('params', {}),
                bookmark_query_field=endpoint_config.get('bookmark_query_field', None),
                bookmark_field=bookmark_field,
                bookmark_type=endpoint_config.get('bookmark_type', None),
                data_key=endpoint_config.get('data_key', stream_name),
                id_fields=endpoint_config.get('key_properties'),
                selected_streams=selected_streams,
                replication_ind=replication_ind)
            update_currently_syncing(state, None)
            LOGGER.info('FINISHED Syncing: {}, total_records: {}'.format(
                stream_name, total_records))
def get_streams(snowflake_conn, catalog, config, state):
    """Returns the Catalog of data we're going to sync for all SELECT-based
    streams (i.e. INCREMENTAL and FULL_TABLE that require a historical sync).

    Using the Catalog provided from the input file, this function will return a
    Catalog representing exactly which tables and columns will be emitted by
    SELECT-based syncs. This is achieved by comparing the input Catalog to a
    freshly discovered Catalog to determine the resulting Catalog.

    The resulting Catalog will include any streams marked as "selected" that
    currently exist in the database. Columns marked as "selected" and those
    labeled "automatic" (e.g. primary keys and replication keys) will be
    included. Streams will be prioritized in the following order:
      1. currently_syncing if it is SELECT-based
      2. any streams that do not have state
      3. any streams that do not have a replication method of LOG_BASED
    """
    discovered = discover_catalog(snowflake_conn, config)

    # Filter catalog to include only selected streams
    # pylint: disable=unnecessary-lambda
    selected_streams = list(filter(lambda s: common.stream_is_selected(s), catalog.streams))
    streams_with_state = []
    streams_without_state = []

    for stream in selected_streams:
        stream_state = state.get('bookmarks', {}).get(stream.tap_stream_id)

        if not stream_state:
            streams_without_state.append(stream)
        else:
            streams_with_state.append(stream)

    # If the state says we were in the middle of processing a stream, skip
    # to that stream. Then process streams without prior state and finally
    # move onto streams with state (i.e. have been synced in the past)
    currently_syncing = singer.get_currently_syncing(state)

    # prioritize streams that have not been processed
    ordered_streams = streams_without_state + streams_with_state

    if currently_syncing:
        currently_syncing_stream = list(
            filter(lambda s: s.tap_stream_id == currently_syncing, streams_with_state))
        non_currently_syncing_streams = list(
            filter(lambda s: s.tap_stream_id != currently_syncing, ordered_streams))
        streams_to_sync = currently_syncing_stream + non_currently_syncing_streams
    else:
        # prioritize streams that have not been processed
        streams_to_sync = ordered_streams

    return resolve_catalog(discovered, streams_to_sync)
def sync_companies(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    bumble_bee = Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING)
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_to_utc(get_start(STATE, "companies", bookmark_key))
    LOGGER.info("sync_companies from %s", start)
    schema = load_schema('companies')
    singer.write_schema("companies", schema, ["companyId"],
                        [bookmark_key], catalog.get('stream_alias'))

    # Because this stream doesn't query by `lastUpdated`, it cycles
    # through the data set every time. The issue with this is that there
    # is a race condition by which records may be updated between the
    # start of this table's sync and the end, causing some updates to not
    # be captured. In order to combat this, we must store the current
    # sync's start in the state and not move the bookmark past this value.
    current_sync_start = get_current_sync_start(STATE, "companies") or utils.now()
    STATE = write_current_sync_start(STATE, "companies", current_sync_start)
    singer.write_state(STATE)

    url = get_url("companies_all")
    max_bk_value = start
    if CONTACTS_BY_COMPANY in ctx.selected_stream_ids:
        contacts_by_company_schema = load_schema(CONTACTS_BY_COMPANY)
        singer.write_schema("contacts_by_company", contacts_by_company_schema, ["contact-id"])

    with bumble_bee:
        for row in gen_request(STATE, 'companies', url, default_company_params,
                               'companies', 'has-more', ['offset'], ['offset']):
            row_properties = row['properties']
            modified_time = None
            if bookmark_key in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties[bookmark_key]['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis,
                                                                datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties['createdate']['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis,
                                                                datetime.timezone.utc)

            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = request(
                    url=get_url("companies_detail", company_id=row['companyId']),
                    params={"includePropertyVersions": "true"}
                ).json()
                record = bumble_bee.transform(lift_properties_and_versions(record), schema, mdata)
                singer.write_record("companies", record,
                                    catalog.get('stream_alias'),
                                    time_extracted=utils.now())

    # Don't bookmark past the start of this sync to account for updated records during the sync.
    new_bookmark = min(max_bk_value, current_sync_start)
    STATE = singer.write_bookmark(STATE, 'companies', bookmark_key, utils.strftime(new_bookmark))
    STATE = write_current_sync_start(STATE, 'companies', None)
    singer.write_state(STATE)
    return STATE
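
# sync_companies() above (and one sync_engagements() variant below) persist
# the wall-clock start of the current sync so the bookmark can be capped at
# it. A minimal sketch of the two helpers, assuming they keep an ISO-8601
# timestamp under a 'current_sync_start' bookmark key (the key name is an
# assumption):
def get_current_sync_start(STATE, tap_stream_id):
    current_sync_start = singer.get_bookmark(STATE, tap_stream_id, 'current_sync_start')
    if current_sync_start is None:
        return None
    return utils.strptime_to_utc(current_sync_start)

def write_current_sync_start(STATE, tap_stream_id, start):
    # Serialize the datetime (or clear the key by writing None when done).
    value = start
    if start is not None:
        value = utils.strftime(start)
    return singer.write_bookmark(STATE, tap_stream_id, 'current_sync_start', value)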
def sync_companies(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    bumble_bee = Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING)
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_with_tz(get_start(STATE, "companies", bookmark_key))
    LOGGER.info("sync_companies from %s", start)
    schema = load_schema('companies')
    singer.write_schema("companies", schema, ["companyId"],
                        [bookmark_key], catalog.get('stream_alias'))

    url = get_url("companies_all")
    max_bk_value = start
    if CONTACTS_BY_COMPANY in ctx.selected_stream_ids:
        contacts_by_company_schema = load_schema(CONTACTS_BY_COMPANY)
        singer.write_schema("contacts_by_company", contacts_by_company_schema,
                            ["company-id", "contact-id"])

    with bumble_bee:
        for row in gen_request(STATE, 'companies', url, default_company_params,
                               'companies', 'has-more', ['offset'], ['offset']):
            row_properties = row['properties']
            modified_time = None
            if bookmark_key in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties[bookmark_key]['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis,
                                                                datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties['createdate']['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis,
                                                                datetime.timezone.utc)

            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = request(get_url("companies_detail",
                                         company_id=row['companyId'])).json()
                record = bumble_bee.transform(record, schema, mdata)
                singer.write_record("companies", record,
                                    catalog.get('stream_alias'),
                                    time_extracted=utils.now())
                if CONTACTS_BY_COMPANY in ctx.selected_stream_ids:
                    # Pass ctx through to match _sync_contacts_by_company's signature above
                    STATE = _sync_contacts_by_company(STATE, ctx, record['companyId'])

    STATE = singer.write_bookmark(STATE, 'companies', bookmark_key,
                                  utils.strftime(max_bk_value))
    singer.write_state(STATE)
    return STATE
def get_streams_to_sync(streams, state):
    target_stream = singer.get_currently_syncing(state)
    result = streams

    if target_stream:
        result = list(itertools.dropwhile(
            lambda x: x.tap_stream_id != target_stream, streams))

    if not result:
        raise Exception('Unknown stream {} in state'.format(target_stream))

    return result
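
# Illustration of the dropwhile-based resume above, using hypothetical stream
# ids (not from any tap in this file): with state pointing at 'b', everything
# before 'b' is skipped and the remainder of the ordered list is kept.
import itertools
from collections import namedtuple

_Stream = namedtuple('Stream', ['tap_stream_id'])
_streams = [_Stream('a'), _Stream('b'), _Stream('c')]
_resumed = list(itertools.dropwhile(lambda x: x.tap_stream_id != 'b', _streams))
assert [s.tap_stream_id for s in _resumed] == ['b', 'c']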
def sync_engagements(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    schema = load_schema("engagements")
    bookmark_key = 'lastUpdated'
    singer.write_schema("engagements", schema, ["engagement_id"],
                        [bookmark_key], catalog.get('stream_alias'))
    start = get_start(STATE, "engagements", bookmark_key)

    # Because this stream doesn't query by `lastUpdated`, it cycles
    # through the data set every time. The issue with this is that there
    # is a race condition by which records may be updated between the
    # start of this table's sync and the end, causing some updates to not
    # be captured. In order to combat this, we must store the current
    # sync's start in the state and not move the bookmark past this value.
    current_sync_start = get_current_sync_start(STATE, "engagements") or utils.now()
    STATE = write_current_sync_start(STATE, "engagements", current_sync_start)
    singer.write_state(STATE)

    max_bk_value = start
    LOGGER.info("sync_engagements from %s", start)

    STATE = singer.write_bookmark(STATE, 'engagements', bookmark_key, start)
    singer.write_state(STATE)

    url = get_url("engagements_all")
    params = {'limit': 250}
    top_level_key = "results"
    engagements = gen_request(STATE, 'engagements', url, params, top_level_key,
                              "hasMore", ["offset"], ["offset"])

    time_extracted = utils.now()

    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for engagement in engagements:
            record = bumble_bee.transform(lift_properties_and_versions(engagement), schema, mdata)
            if record['engagement'][bookmark_key] >= start:
                # hoist PK and bookmark field to top-level record
                record['engagement_id'] = record['engagement']['id']
                record[bookmark_key] = record['engagement'][bookmark_key]
                singer.write_record("engagements", record,
                                    catalog.get('stream_alias'),
                                    time_extracted=time_extracted)
                if record['engagement'][bookmark_key] >= max_bk_value:
                    max_bk_value = record['engagement'][bookmark_key]

    # Don't bookmark past the start of this sync to account for updated records during the sync.
    new_bookmark = min(utils.strptime_to_utc(max_bk_value), current_sync_start)
    STATE = singer.write_bookmark(STATE, 'engagements', bookmark_key, utils.strftime(new_bookmark))
    STATE = write_current_sync_start(STATE, 'engagements', None)
    singer.write_state(STATE)
    return STATE
def sync_subscription_changes(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    STATE = sync_entity_chunked(STATE, catalog, "subscription_changes",
                                ["timestamp", "portalId", "recipient"],
                                "timeline")
    return STATE
def get_streams_to_sync(streams, state):
    '''Get the streams to sync'''
    current_stream = singer.get_currently_syncing(state)
    result = streams
    if current_stream:
        result = list(itertools.dropwhile(
            lambda x: x.tap_stream_id != current_stream, streams))
    if not result:
        raise Exception("Unknown stream {} in state".format(current_stream))
    return result
def get_non_oplog_streams(client, streams, state):
    selected_streams = list(filter(lambda s: is_stream_selected(s), streams))
    streams_with_state = []
    streams_without_state = []

    for stream in selected_streams:
        stream_metadata = metadata.to_map(stream['metadata'])
        replication_method = stream_metadata.get((), {}).get('replication-method')
        state = singer.write_bookmark(state, stream['tap_stream_id'],
                                      'replication_method', replication_method)
        stream_state = state.get('bookmarks', {}).get(stream['tap_stream_id'])

        if not stream_state:
            if replication_method == 'LOG_BASED':
                LOGGER.info("LOG_BASED stream %s requires full historical sync",
                            stream['tap_stream_id'])
            streams_without_state.append(stream)
        elif stream_state and replication_method == 'LOG_BASED' \
                and oplog_stream_requires_historical(stream, state):
            LOGGER.info("LOG_BASED stream %s will resume its historical sync",
                        stream['tap_stream_id'])
            streams_with_state.append(stream)
        elif stream_state and replication_method != 'LOG_BASED':
            streams_with_state.append(stream)

    # If the state says we were in the middle of processing a stream, skip
    # to that stream. Then process streams without prior state and finally
    # move onto streams with state (i.e. have been synced in the past)
    currently_syncing = singer.get_currently_syncing(state)

    # prioritize streams that have not been processed
    ordered_streams = streams_without_state + streams_with_state

    if currently_syncing:
        currently_syncing_stream = list(filter(
            lambda s: s['tap_stream_id'] == currently_syncing
            and is_valid_currently_syncing_stream(s, state),
            streams_with_state))
        non_currently_syncing_streams = list(filter(
            lambda s: s['tap_stream_id'] != currently_syncing, ordered_streams))
        streams_to_sync = currently_syncing_stream + non_currently_syncing_streams
    else:
        # prioritize streams that have not been processed
        streams_to_sync = ordered_streams

    return (streams_to_sync, state)
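
# get_non_oplog_streams() relies on is_valid_currently_syncing_stream() to
# decide whether an interrupted stream is a valid resume target. A plausible
# sketch, assuming a LOG_BASED stream only qualifies while its historical
# (full-table) phase is still incomplete; oplog_stream_requires_historical()
# is defined elsewhere in the tap:
def is_valid_currently_syncing_stream(selected_stream, state):
    stream_metadata = metadata.to_map(selected_stream['metadata'])
    replication_method = stream_metadata.get((), {}).get('replication-method')
    if replication_method != 'LOG_BASED':
        return True
    # LOG_BASED streams resume only if their historical sync is unfinished.
    return oplog_stream_requires_historical(selected_stream, state)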
def do_sync(catalog, sdk_client, state):
    # Get selected_streams from catalog, based on state last_stream
    # last_stream = Previous currently synced stream, if the load was interrupted
    last_stream = singer.get_currently_syncing(state)
    LOGGER.info('last/currently syncing stream: {}'.format(last_stream))

    for stream in catalog.get_selected_streams(state):
        stream_name = stream.stream
        stream_metadata = metadata.to_map(stream.metadata)
        LOGGER.info('Syncing stream %s ...', stream_name)
        sync_stream(stream_name, stream_metadata, sdk_client)
def sync(client, config, catalog, state):
    start_date = config.get('start_date')[:10]
    period_types = config.get('period_types', 'FiscalQuarter')

    # Get selected_streams from catalog, based on state last_stream
    # last_stream = Previous currently synced stream, if the load was interrupted
    last_stream = singer.get_currently_syncing(state)
    LOGGER.info('last/currently syncing stream: {}'.format(last_stream))

    selected_streams = []
    selected_streams_by_name = {}
    for stream in catalog.get_selected_streams(state):
        selected_streams.append(stream.stream)
        selected_streams_by_name[stream.stream] = stream
    LOGGER.info('selected_streams: {}'.format(selected_streams))

    if not selected_streams:
        return

    # Loop through endpoints in selected_streams
    for stream_name, endpoint_config in STREAMS.items():
        if stream_name in selected_streams:
            LOGGER.info('START Syncing: {}'.format(stream_name))
            stream = selected_streams_by_name[stream_name]
            bookmark_field = next(iter(endpoint_config.get('replication_keys', [])), None)
            id_fields = endpoint_config.get('key_properties')
            singer_ops.write_schema(catalog, stream_name)
            total_records = 0
            last_date = singer_ops.get_bookmark(state, stream_name, start_date)

            # Request is made using the current date + 1 as the end period.
            req_state = singer_ops.get_request_state(client=client,
                                                     stream_name=stream_name,
                                                     start_date=start_date,
                                                     last_date=last_date,
                                                     end_date=datetime.now(),
                                                     state=state,
                                                     bookmark_field=bookmark_field,
                                                     id_fields=id_fields,
                                                     period_types=period_types,
                                                     stream=stream,
                                                     catalog=catalog)

            # Main sync routine
            total_records = __sync_endpoint(req_state)

            LOGGER.info('FINISHED Syncing: {}, total_records: {}'.format(
                stream_name, total_records))

    LOGGER.info('sync.py: sync complete')
def sync_deal_pipelines(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    schema = load_schema('deal_pipelines')
    singer.write_schema('deal_pipelines', schema, ['pipelineId'],
                        catalog.get('stream_alias'))
    LOGGER.info('sync_deal_pipelines')
    data = request(get_url('deal_pipelines')).json()
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in data:
            record = bumble_bee.transform(row, schema)
            singer.write_record("deal_pipelines", record,
                                catalog.get('stream_alias'),
                                time_extracted=utils.now())
    singer.write_state(STATE)
    return STATE
def sync(client, config, catalog, state):
    start_date = config.get('start_date')
    git_owner = config.get('git_owner')
    git_repository_list = config['git_repositories'].replace(" ", "").split(",")

    # Get selected_streams from catalog, based on state last_stream
    # last_stream = Previous currently synced stream, if the load was interrupted
    last_stream = singer.get_currently_syncing(state)
    LOGGER.info('last/currently syncing stream: %s', last_stream)

    selected_streams = []
    for stream in catalog.get_selected_streams(state):
        selected_streams.append(stream.stream)
    LOGGER.info('selected_streams: %s', selected_streams)

    if not selected_streams:
        return

    # Loop through selected_streams
    for stream_name, endpoint_config in STREAMS.items():
        if stream_name in selected_streams:
            for git_repository in git_repository_list:
                LOGGER.info('START Syncing Repository: %s, Stream: %s',
                            git_repository, stream_name)
                update_currently_syncing(state, stream_name)
                search_path = endpoint_config.get('search_path', stream_name) \
                    .replace('[GIT_OWNER]', git_owner) \
                    .replace('[GIT_REPOSITORY]', git_repository)
                bookmark_field = next(iter(endpoint_config.get('replication_keys', [])), None)
                total_records = sync_endpoint(
                    client=client,
                    catalog=catalog,
                    state=state,
                    start_date=start_date,
                    stream_name=stream_name,
                    search_path=search_path,
                    endpoint_config=endpoint_config,
                    git_owner=git_owner,
                    git_repository=git_repository,
                    bookmark_query_field=endpoint_config.get('bookmark_query_field', None),
                    bookmark_field=bookmark_field,
                    data_key=endpoint_config.get('data_key', stream_name),
                    id_fields=endpoint_config.get('key_properties'),
                    selected_streams=selected_streams)
                update_currently_syncing(state, None)
                LOGGER.info('FINISHED Syncing Repository: %s, Stream: %s, total_records: %s',
                            git_repository, stream_name, total_records)
def sync_engagements(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    schema = load_schema("engagements")
    bookmark_key = 'lastUpdated'
    singer.write_schema("engagements", schema, ["engagement_id"],
                        [bookmark_key], catalog.get('stream_alias'))
    start = get_start(STATE, "engagements", bookmark_key)

    # Because this stream doesn't query by `lastUpdated`, it cycles
    # through the data set every time. The issue with this is that there
    # is a race condition by which records may be updated between the
    # start of this table's sync and the end, causing some updates to not
    # be captured. In order to combat this, we must save a lookback window
    # that handles the duration of time that this stream was last syncing,
    # and look back by that amount on the next sync.
    last_sync_duration = get_previous_time_window(STATE, "engagements")
    current_sync_start = utils.now()
    if has_bookmark(STATE, "engagements", bookmark_key) and \
            last_sync_duration is not None:
        LOGGER.info(("Last sync of engagements lasted {} seconds. Adjusting bookmark by this "
                     "amount to account for race conditions with record updates.").format(last_sync_duration))
        start = utils.strptime_to_utc(start) - datetime.timedelta(seconds=last_sync_duration)
        start = utils.strftime(start)

    max_bk_value = start
    LOGGER.info("sync_engagements from %s", start)

    STATE = singer.write_bookmark(STATE, 'engagements', bookmark_key, start)
    singer.write_state(STATE)

    url = get_url("engagements_all")
    params = {'limit': 250}
    top_level_key = "results"
    engagements = gen_request(STATE, 'engagements', url, params, top_level_key,
                              "hasMore", ["offset"], ["offset"])

    time_extracted = utils.now()

    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for engagement in engagements:
            record = bumble_bee.transform(engagement, schema, mdata)
            if record['engagement'][bookmark_key] >= start:
                # hoist PK and bookmark field to top-level record
                record['engagement_id'] = record['engagement']['id']
                record[bookmark_key] = record['engagement'][bookmark_key]
                singer.write_record("engagements", record,
                                    catalog.get('stream_alias'),
                                    time_extracted=time_extracted)
                if record['engagement'][bookmark_key] >= max_bk_value:
                    max_bk_value = record['engagement'][bookmark_key]

    STATE = singer.write_bookmark(STATE, 'engagements', bookmark_key, max_bk_value)
    # Write duration for next sync's lookback window
    STATE = write_stream_duration(STATE, 'engagements', current_sync_start, utils.now())
    singer.write_state(STATE)
    return STATE
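
# The lookback-window variant of sync_engagements() above records how long
# the previous sync ran so the next run can rewind its bookmark by that
# amount. A minimal sketch of the two helpers, assuming the duration is
# stored in seconds under a 'last_sync_duration' bookmark key (the key name
# is an assumption):
def write_stream_duration(state, tap_stream_id, start, end):
    # Persist the elapsed wall-clock time of this sync, in seconds.
    duration_seconds = (end - start).total_seconds()
    return singer.write_bookmark(state, tap_stream_id, 'last_sync_duration', duration_seconds)

def get_previous_time_window(state, tap_stream_id):
    # None on the first sync, when no duration has been recorded yet.
    return singer.get_bookmark(state, tap_stream_id, 'last_sync_duration')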
def get_streams_to_sync(streams, state):
    target_stream = singer.get_currently_syncing(state)
    result = streams

    if target_stream:
        skipped = list(itertools.takewhile(
            lambda x: x.tap_stream_id != target_stream, streams))
        rest = list(itertools.dropwhile(
            lambda x: x.tap_stream_id != target_stream, streams))
        result = rest + skipped  # Move skipped streams to end

    if not result:
        raise Exception("Unknown stream {} in state".format(target_stream))

    return result
def get_streams_to_sync(streams, state):
    '''Get the streams to sync'''
    current_stream = singer.get_currently_syncing(state)
    result = streams
    if current_stream:
        # Iterate over a snapshot of the keys: popping entries while
        # iterating result.keys() directly raises a RuntimeError.
        for key in list(result.keys()):
            if result[key].tap_stream_id != current_stream:
                result.pop(key, None)
    if not result:
        raise Exception("Unknown stream {} in state".format(current_stream))
    return result
def sync_contacts(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    bookmark_key = "versionTimestamp"
    start = utils.strptime_with_tz(get_start(STATE, "contacts", bookmark_key))
    LOGGER.info("sync_contacts from %s", start)

    max_bk_value = start
    schema = load_schema("contacts")

    singer.write_schema("contacts", schema, ["vid"],
                        [bookmark_key], catalog.get("stream_alias"))

    url = get_url("contacts_all")

    vids = []
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request(STATE, "contacts", url, default_contact_params,
                               "contacts", "has-more", ["vid-offset"], ["vidOffset"]):
            modified_time = None
            if bookmark_key in row:
                modified_time = utils.strptime_with_tz(
                    _transform_datetime(  # pylint: disable=protected-access
                        row[bookmark_key], UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING))

            if not modified_time or modified_time >= start:
                vids.append(row["vid"])

            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if len(vids) == 100:
                _sync_contact_vids(catalog, vids, schema, bumble_bee)
                vids = []

        _sync_contact_vids(catalog, vids, schema, bumble_bee)

    STATE = singer.write_bookmark(STATE, "contacts", bookmark_key,
                                  utils.strftime(max_bk_value))
    singer.write_state(STATE)
    return STATE
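
# sync_contacts() batches up to 100 vids and flushes them through
# _sync_contact_vids(). A plausible sketch of that helper, assuming a bulk
# contacts-detail endpoint keyed by vid (the endpoint name and params here
# are assumptions, not confirmed by this file):
def _sync_contact_vids(catalog, vids, schema, bumble_bee):
    # Nothing to flush on an empty trailing batch.
    if len(vids) == 0:
        return
    data = request(get_url("contacts_detail"),
                   params={'vid': vids, 'showListMemberships': True}).json()
    time_extracted = utils.now()
    mdata = metadata.to_map(catalog.get('metadata'))
    for record in data.values():
        record = bumble_bee.transform(record, schema, mdata)
        singer.write_record("contacts", record, catalog.get('stream_alias'),
                            time_extracted=time_extracted)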
def sync_campaigns(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    schema = load_schema("campaigns")
    singer.write_schema("campaigns", schema, ["id"], catalog.get('stream_alias'))
    LOGGER.info("sync_campaigns(NO bookmarks)")
    url = get_url("campaigns_all")
    params = {'limit': 500}

    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request(STATE, 'campaigns', url, params, "campaigns",
                               "hasMore", ["offset"], ["offset"]):
            record = request(get_url("campaigns_detail", campaign_id=row['id'])).json()
            record = bumble_bee.transform(record, schema)
            singer.write_record("campaigns", record,
                                catalog.get('stream_alias'),
                                time_extracted=utils.now())

    return STATE