def gen_request(STATE, tap_stream_id, url, params, path, more_key, offset_keys, offset_targets):
    if len(offset_keys) != len(offset_targets):
        raise ValueError("Number of offset_keys must match number of offset_targets")

    if singer.get_offset(STATE, tap_stream_id):
        params.update(singer.get_offset(STATE, tap_stream_id))

    with metrics.record_counter(tap_stream_id) as counter:
        while True:
            data = request(url, params).json()

            for row in data[path]:
                counter.increment()
                yield row

            if not data.get(more_key, False):
                break

            STATE = singer.clear_offset(STATE, tap_stream_id)
            for key, target in zip(offset_keys, offset_targets):
                if key in data:
                    params[target] = data[key]
                    STATE = singer.set_offset(STATE, tap_stream_id, target, data[key])
            singer.write_state(STATE)

    STATE = singer.clear_offset(STATE, tap_stream_id)
    singer.write_state(STATE)

def gen_request(STATE, tap_stream_id, url, params, path, more_key, offset_keys, offset_targets, v3_fields=None):
    if len(offset_keys) != len(offset_targets):
        raise ValueError("Number of offset_keys must match number of offset_targets")

    if singer.get_offset(STATE, tap_stream_id):
        params.update(singer.get_offset(STATE, tap_stream_id))

    with metrics.record_counter(tap_stream_id) as counter:
        while True:
            data = request(url, params).json()

            if v3_fields:
                # Use a dedicated variable for the v3 search endpoint so the v1
                # pagination URL is not overwritten for subsequent iterations
                v3_url = get_url('deals_v3_search')
                body = {"properties": v3_fields, "limit": 100}
                v3_data = post_search_endpoint(v3_url, body)

                additional_fields = {}
                for item in v3_data.json()['results']:
                    # We nest each value under 'value' in order to match the
                    # schema and the shape of other fields in 'properties'
                    additional_fields[int(item['id'])] = {
                        key: {'value': value}
                        for key, value in item['properties'].items()
                        if 'hs_date_entered' in key or 'hs_date_exited' in key
                    }

                for item in data[path]:
                    # Default to an empty dict so deals missing from the v3
                    # response do not raise a TypeError when unpacking
                    item['properties'] = {
                        **item['properties'],
                        **additional_fields.get(item['dealId'], {})
                    }

            for row in data[path]:
                counter.increment()
                yield row

            if not data.get(more_key, False):
                break

            STATE = singer.clear_offset(STATE, tap_stream_id)
            for key, target in zip(offset_keys, offset_targets):
                if key in data:
                    params[target] = data[key]
                    STATE = singer.set_offset(STATE, tap_stream_id, target, data[key])
            singer.write_state(STATE)

    STATE = singer.clear_offset(STATE, tap_stream_id)
    singer.write_state(STATE)

def gen_request(STATE, tap_stream_id, url, params, path, more_key, offset_keys, offset_targets, v3_fields=None):
    if len(offset_keys) != len(offset_targets):
        raise ValueError("Number of offset_keys must match number of offset_targets")

    if singer.get_offset(STATE, tap_stream_id):
        params.update(singer.get_offset(STATE, tap_stream_id))

    with metrics.record_counter(tap_stream_id) as counter:
        while True:
            data = request(url, params).json()

            if data.get(path) is None:
                raise RuntimeError("Unexpected API response: {} not in {}".format(path, data.keys()))

            if v3_fields:
                v3_data = get_v3_deals(v3_fields, data[path])

                # The shape of v3_data is different than the V1 response,
                # so we transform v3 to look like v1
                transformed_v3_data = process_v3_deals_records(v3_data)
                merge_responses(data[path], transformed_v3_data)

            for row in data[path]:
                counter.increment()
                yield row

            if not data.get(more_key, False):
                break

            STATE = singer.clear_offset(STATE, tap_stream_id)
            for key, target in zip(offset_keys, offset_targets):
                if key in data:
                    params[target] = data[key]
                    STATE = singer.set_offset(STATE, tap_stream_id, target, data[key])
            singer.write_state(STATE)

    STATE = singer.clear_offset(STATE, tap_stream_id)
    singer.write_state(STATE)

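# get_v3_deals, process_v3_deals_records, and merge_responses are not defined in
# this section. The sketch below is a minimal guess at their behaviour, inferred
# from the inline variant above (the version that calls post_search_endpoint
# directly); names, signatures, and the 'deals_v3_search' URL key are assumptions,
# not the tap's confirmed implementation.
def get_v3_deals(v3_fields, v1_rows):
    # Fetch the v3 representation of the deals. The inline version above searches
    # with a flat limit rather than filtering by the ids in v1_rows, so the
    # parameter is kept only for parity with the call site.
    v3_url = get_url('deals_v3_search')  # assumed URL key
    body = {"properties": v3_fields, "limit": 100}
    return post_search_endpoint(v3_url, body).json()['results']

def process_v3_deals_records(v3_data):
    # Reshape v3 records to look like v1: key by integer deal id and nest each
    # property value under 'value', keeping only hs_date_entered/hs_date_exited.
    return {
        int(record['id']): {
            key: {'value': value}
            for key, value in record['properties'].items()
            if 'hs_date_entered' in key or 'hs_date_exited' in key
        }
        for record in v3_data
    }

def merge_responses(v1_rows, transformed_v3_data):
    # Merge the reshaped v3 properties into the v1 rows in place, matching on dealId.
    for row in v1_rows:
        row['properties'] = {
            **row['properties'],
            **transformed_v3_data.get(row['dealId'], {})
        }
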
def sync_entity_chunked(STATE, catalog, entity_name, key_properties, path):
    schema = load_schema(entity_name)
    bookmark_key = 'startTimestamp'

    singer.write_schema(entity_name, schema, key_properties, [bookmark_key], catalog.get('stream_alias'))

    start = get_start(STATE, entity_name, bookmark_key)
    LOGGER.info("sync_%s from %s", entity_name, start)

    now = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC)
    now_ts = int(now.timestamp() * 1000)

    start_ts = int(utils.strptime_with_tz(start).timestamp() * 1000)
    url = get_url(entity_name)

    mdata = metadata.to_map(catalog.get('metadata'))

    with metrics.record_counter(entity_name) as counter:
        while start_ts < now_ts:
            end_ts = start_ts + CHUNK_SIZES[entity_name]
            params = {
                'startTimestamp': start_ts,
                'endTimestamp': end_ts,
                'limit': 1000,
            }
            with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
                while True:
                    our_offset = singer.get_offset(STATE, entity_name)
                    if our_offset and our_offset.get('offset') is not None:
                        params[StateFields.offset] = our_offset.get('offset')

                    data = request(url, params).json()
                    time_extracted = utils.now()

                    for row in data[path]:
                        counter.increment()
                        record = bumble_bee.transform(row, schema, mdata)
                        singer.write_record(entity_name,
                                            record,
                                            catalog.get('stream_alias'),
                                            time_extracted=time_extracted)
                    if data.get('hasMore'):
                        STATE = singer.set_offset(STATE, entity_name, 'offset', data['offset'])
                        singer.write_state(STATE)
                    else:
                        STATE = singer.clear_offset(STATE, entity_name)
                        singer.write_state(STATE)
                        break
            STATE = singer.write_bookmark(STATE, entity_name, 'startTimestamp', utils.strftime(datetime.datetime.fromtimestamp((start_ts / 1000), datetime.timezone.utc)))  # pylint: disable=line-too-long
            singer.write_state(STATE)
            start_ts = end_ts

    STATE = singer.clear_offset(STATE, entity_name)
    singer.write_state(STATE)
    return STATE

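# CHUNK_SIZES is referenced above but not defined in this section. A plausible
# shape is a per-stream window size in milliseconds; the values below are
# illustrative assumptions, not confirmed defaults.
CHUNK_SIZES = {
    "email_events": 1000 * 60 * 60 * 24,          # one day per request window
    "subscription_changes": 1000 * 60 * 60 * 24,  # one day per request window
}
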
def gen_request(state: State, tap_stream_id, url, params, path, more_key, offset_keys, offset_targets, v3_fields=None):
    if len(offset_keys) != len(offset_targets):
        raise ValueError("Number of offset_keys must match number of offset_targets")

    offset = singer.get_offset(state, tap_stream_id)
    logger.debug(f'offset={offset}')
    if offset:
        params.update(offset)

    with metrics.record_counter(tap_stream_id) as counter:
        while True:
            data = request(url, params).json()

            rows = data.pop(path, None)
            if rows is None:
                raise RuntimeError("Unexpected API response: {} not in {}".format(path, data.keys()))
            logger.debug(f'Got resp: {len(rows)} {tap_stream_id}, {data}')

            if v3_fields:
                v3_data = get_v3_deals(v3_fields, rows)

                # The shape of v3_data is different than the V1 response,
                # so we transform v3 to look like v1
                transformed_v3_data = process_v3_deals_records(v3_data)
                merge_responses(rows, transformed_v3_data)

            for row in rows:
                counter.increment()
                yield row

            if not data.get(more_key, False):
                logger.debug('No more pages, breaking')
                break

            state = singer.clear_offset(state, tap_stream_id)
            for key, target in zip(offset_keys, offset_targets):
                if key in data:
                    params[target] = data[key]
                    state = singer.set_offset(state, tap_stream_id, target, data[key])
            singer.write_state(state)

    state = singer.clear_offset(state, tap_stream_id)
    singer.write_state(state)

def sync_stream(config, consumer, stream, state):
    write_schema(stream.tap_stream_id, stream.schema.to_dict(), stream.key_properties or [])

    stream_version = singer.get_bookmark(state, stream.tap_stream_id, "version")
    if stream_version is None:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream.tap_stream_id, "version", stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    activate_version_message = singer.ActivateVersionMessage(stream=stream.tap_stream_id,
                                                             version=stream_version)
    singer.write_message(activate_version_message)

    validator = Draft4Validator(stream.schema.to_dict(), format_checker=FormatChecker())

    time_extracted = utils.now()
    rows_saved = 0

    # Assign all the partitions for the topic to this consumer
    topic_partitions = [TopicPartition(config['topic'], partition_id)
                        for partition_id in consumer.partitions_for_topic(config['topic'])]
    consumer.assign(topic_partitions)

    # Seek each partition to its offset from the STATE, or to the beginning otherwise
    offsets = singer.get_offset(state, stream.tap_stream_id, {})
    for topic_partition in topic_partitions:
        if str(topic_partition.partition) in offsets:
            consumer.seek(topic_partition, offsets[str(topic_partition.partition)])
        else:
            consumer.seek_to_beginning(topic_partition)

    for message in consumer:
        record = singer.RecordMessage(stream=stream.tap_stream_id,
                                      record=message.value,
                                      time_extracted=time_extracted)
        validator.validate(record.record)
        singer.write_message(record)
        state = singer.set_offset(state, stream.tap_stream_id, message.partition, message.offset)

        rows_saved = rows_saved + 1
        if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
            singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

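# A minimal driver sketch for sync_stream above, assuming kafka-python's
# KafkaConsumer and a singer Catalog. Only config['topic'] is referenced above;
# the 'bootstrap_servers' and 'group_id' keys, the JSON deserializer, and the
# idle timeout are assumptions for illustration.
import json

from kafka import KafkaConsumer

def do_sync(config, catalog, state):
    for stream in catalog.streams:
        # Construct the consumer without subscribing; sync_stream calls
        # consumer.assign() itself and seeks using the saved offsets.
        consumer = KafkaConsumer(
            bootstrap_servers=config['bootstrap_servers'],
            group_id=config.get('group_id'),
            value_deserializer=lambda v: json.loads(v.decode('utf-8')),
            consumer_timeout_ms=10000)  # end the message loop when the topic goes idle
        sync_stream(config, consumer, stream, state)
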
def sync_entity_chunked(STATE, catalog, entity_name, key_properties, path):
    if "schema" in catalog:
        schema = catalog["schema"]
    else:
        schema = load_schema(entity_name)

    bookmark_key = 'startTimestamp'

    singer.write_schema(entity_name, schema, key_properties, [bookmark_key], catalog.get('stream_alias'))

    start = get_start(STATE, entity_name, bookmark_key)
    LOGGER.info("sync_%s from %s", entity_name, start)

    now = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC)
    now_ts = int(now.timestamp() * 1000)

    start_ts = int(utils.strptime_with_tz(start).timestamp() * 1000)
    url = get_url(entity_name)

    mdata = metadata.to_map(catalog.get('metadata'))

    if entity_name == 'email_events':
        window_size = int(CONFIG['email_chunk_size'])
    elif entity_name == 'subscription_changes':
        window_size = int(CONFIG['subscription_chunk_size'])

    with metrics.record_counter(entity_name) as counter:
        while start_ts < now_ts:
            end_ts = start_ts + window_size
            params = {
                'startTimestamp': start_ts,
                'endTimestamp': end_ts,
                'limit': 1000,
            }
            with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
                while True:
                    our_offset = singer.get_offset(STATE, entity_name)
                    if our_offset and our_offset.get('offset') is not None:
                        params[StateFields.offset] = our_offset.get('offset')

                    data = request(url, params).json()
                    time_extracted = utils.now()

                    if data.get(path) is None:
                        raise RuntimeError("Unexpected API response: {} not in {}".format(path, data.keys()))

                    for row in data[path]:
                        counter.increment()
                        record = bumble_bee.transform(lift_properties_and_versions(row), schema, mdata)
                        singer.write_record(entity_name,
                                            record,
                                            catalog.get('stream_alias'),
                                            time_extracted=time_extracted)
                    if data.get('hasMore'):
                        STATE = singer.set_offset(STATE, entity_name, 'offset', data['offset'])
                        singer.write_state(STATE)
                    else:
                        STATE = singer.clear_offset(STATE, entity_name)
                        singer.write_state(STATE)
                        break
            STATE = singer.write_bookmark(STATE, entity_name, 'startTimestamp', utils.strftime(datetime.datetime.fromtimestamp((start_ts / 1000), datetime.timezone.utc)))  # pylint: disable=line-too-long
            singer.write_state(STATE)
            start_ts = end_ts

    STATE = singer.clear_offset(STATE, entity_name)
    singer.write_state(STATE)
    return STATE

def sync_entity_chunked(state: State, entity_name, key_properties, path):
    schema = load_schema(entity_name)
    bookmark_key = 'startTimestamp'

    singer.write_schema('hubspot_' + entity_name, schema, key_properties, [bookmark_key])

    start = get_start(state, entity_name, bookmark_key)
    logger.info("sync_%s from %s", entity_name, start)

    now = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC)
    now_ts = int(now.timestamp() * 1000)

    start_ts = int(utils.strptime_with_tz(start).timestamp() * 1000)
    url = get_url(entity_name)

    if entity_name == 'email_events':
        window_size = int(CONFIG['email_chunk_size'])
    elif entity_name == 'subscription_changes':
        window_size = int(CONFIG['subscription_chunk_size'])

    with metrics.record_counter(entity_name) as counter:
        while start_ts < now_ts:
            end_ts = start_ts + window_size
            params = {
                'startTimestamp': start_ts,
                'endTimestamp': end_ts,
                'limit': 1000,
            }
            while True:
                our_offset = singer.get_offset(state, entity_name)
                if our_offset and our_offset.get('offset') is not None:
                    params[StateFields.offset] = our_offset.get('offset')

                data = request(url, params).json()

                if data.get(path) is None:
                    raise RuntimeError("Unexpected API response: {} not in {}".format(path, data.keys()))

                for row in data[path]:
                    counter.increment()
                    record = build_record(row, schema)
                    write_record(entity_name, record)

                if data.get('hasMore'):
                    state = singer.set_offset(state, entity_name, 'offset', data['offset'])
                    singer.write_state(state)
                else:
                    state = singer.clear_offset(state, entity_name)
                    singer.write_state(state)
                    break
            state = singer.write_bookmark(state, "hubspot_" + entity_name, 'startTimestamp', utils.strftime(datetime.datetime.fromtimestamp((start_ts / 1000), datetime.timezone.utc)))  # pylint: disable=line-too-long
            singer.write_state(state)
            start_ts = end_ts

    state = singer.clear_offset(state, entity_name)
    singer.write_state(state)
    return state

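# build_record and write_record are referenced above but not defined in this
# section. A minimal sketch under the assumption that they wrap the singer
# Transformer and singer.write_record; the 'hubspot_' stream prefix mirrors the
# write_schema call above, everything else is an assumption.
def build_record(row, schema):
    # Coerce the raw API row to the stream schema (unix-millisecond datetimes, etc.)
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as transformer:
        return transformer.transform(row, schema)

def write_record(entity_name, record):
    # Emit the record on the same stream name the schema was written under
    singer.write_record('hubspot_' + entity_name, record, time_extracted=utils.now())
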
def gen_request_v3(STATE, tap_stream_id, url, params, path, custom_properties_chunks=[], associations=[]):
    # Looping on all the pages until we reach the final one
    while True:
        if singer.get_offset(STATE, tap_stream_id):
            params.update(singer.get_offset(STATE, tap_stream_id))

        # We are doing 1 API call per properties chunk.
        # Each call will return the same deal page but with a subset of the deal custom properties each time
        results = []
        for property_chunk in custom_properties_chunks:
            if property_chunk:
                params['properties'] = ",".join(property_chunk)
                data = request(url, params).json()
                results.append(data[path])

        # Resetting the "properties" param
        del params["properties"]

        # Making an API call to fetch the associations for a given page of deals
        params['associations'] = ",".join(associations)
        data = request(url, params).json()
        results.append(data[path])
        del params["associations"]

        # We use a dict to merge together all the properties for a given deal.
        # We use the dealId as the dict key
        records_map = {}
        for result in results:
            for record in result:
                # We have to init the dict entry if it doesn't exist
                if record["id"] not in records_map:
                    records_map[record["id"]] = {"properties": {}}

                merged_properties = {
                    **records_map[record["id"]]["properties"],
                    **record["properties"]
                }
                records_map[record["id"]]["properties"] = merged_properties

                # If we reached the deal record that contains the association info, we take it
                if "associations" in record:
                    records_map[record["id"]]["associations"] = record["associations"]

                # Keep the deal id in the "dealId" field to preserve the table
                # primary key that was "dealId" before the switch to the v3 API for deals
                records_map[record["id"]]["dealId"] = record["id"]

        with metrics.record_counter(tap_stream_id) as counter:
            for value in records_map.values():
                counter.increment()
                yield value

        # This is the paging break signal from HubSpot
        if "paging" not in data:
            break

        # We update the paging parameter and go to the next page / loop.
        # "after" is the paging offset
        after = data["paging"]["next"]["after"]
        STATE = singer.set_offset(STATE, tap_stream_id, "after", after)
        singer.write_state(STATE)

    # We clear the offset
    STATE = singer.clear_offset(STATE, tap_stream_id)
    singer.write_state(STATE)
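
# An illustrative call site for gen_request_v3 above. The property-chunking
# helper, the 'deals_v3' URL key, the 'results' path, the association names, and
# get_deal_property_names are all hypothetical placeholders, not confirmed values.
def chunk_properties(property_names, chunk_size=100):
    # Split the property list so each request's query string stays a manageable size
    return [property_names[i:i + chunk_size]
            for i in range(0, len(property_names), chunk_size)]

def sync_deals_v3(STATE):
    url = get_url('deals_v3')  # assumed URL key
    params = {'limit': 100}
    property_chunks = chunk_properties(get_deal_property_names())  # hypothetical helper
    for record in gen_request_v3(STATE, 'deals', url, params, 'results',
                                 custom_properties_chunks=property_chunks,
                                 associations=['companies', 'contacts']):
        singer.write_record('deals', record)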