def update_bookmark(self, state, value):
    current_bookmark = self.get_bookmark(state)
    if value and utils.strptime_with_tz(value) > current_bookmark:
        singer.write_bookmark(state, self.name, self.replication_key, value)
def sync_records(ns, catalog_entry, state, counter):
    chunked_bookmark = singer_utils.strptime_with_tz(
        ns.get_start_date(state, catalog_entry))
    stream = catalog_entry['stream']
    schema = catalog_entry['schema']
    stream_alias = catalog_entry.get('stream_alias')
    catalog_metadata = metadata.to_map(catalog_entry['metadata'])
    replication_key = catalog_metadata.get((), {}).get('replication-key')
    stream_version = get_stream_version(catalog_entry, state)
    activate_version_message = singer.ActivateVersionMessage(
        stream=(stream_alias or stream),
        version=stream_version)

    start_time = singer_utils.now()

    LOGGER.info('Syncing NetSuite data for stream %s', stream)

    previous_max_replication_key = None

    query_func = ns.query

    for rec in query_func(ns, catalog_entry, state):
        counter.increment()
        with Transformer(pre_hook=transform_data_hook(ns, stream)) as transformer:
            rec = transformer.transform(rec, schema)

        singer.write_message(
            singer.RecordMessage(stream=(stream_alias or stream),
                                 record=rec,
                                 version=stream_version,
                                 time_extracted=start_time))

        if replication_key:
            _rec = rec.get(replication_key, None)
            original_replication_key_value = ""
            replication_key_value = None
            if replication_key and _rec is not None:
                original_replication_key_value = _rec
                replication_key_value = singer_utils.strptime_with_tz(
                    original_replication_key_value)

            # Before writing a bookmark, make sure NetSuite has not given us a
            # record with one outside our range
            if previous_max_replication_key is None or (
                    replication_key_value and
                    replication_key_value <= start_time and
                    replication_key_value > previous_max_replication_key):
                state = singer.write_bookmark(state,
                                              catalog_entry['tap_stream_id'],
                                              replication_key,
                                              original_replication_key_value)
                previous_max_replication_key = replication_key_value

    # Tables with no replication_key will send an
    # activate_version message for the next sync
    if not replication_key:
        singer.write_message(activate_version_message)
        state = singer.write_bookmark(state,
                                      catalog_entry['tap_stream_id'],
                                      'version',
                                      None)
def test_run(self):
    # Connect to stitch service.
    runner.run_check_job_and_check_status(self)

    # Get and check streams.
    self.found_catalogs = menagerie.get_catalogs(self.conn_id)
    self.assertEqual(len(self.found_catalogs), 9,
                     msg="unable to locate schemas for connection {}".format(self.conn_id))

    # Match streams.
    our_catalogs = [c for c in self.found_catalogs
                    if c.get('tap_stream_id') in self.expected_sync_streams()]
    for c in our_catalogs:
        c_annotated = menagerie.get_annotated_schema(self.conn_id, c['stream_id'])
        c_metadata = metadata.to_map(c_annotated['metadata'])
        connections.select_catalog_and_fields_via_metadata(self.conn_id, c, c_annotated, [], [])

    # Clear state before our run
    menagerie.set_state(self.conn_id, {})

    # Run a sync job using orchestrator, verify tap and target exit codes
    # and verify actual rows were synced.
    first_sync_record_count = self.run_sync(self.conn_id)
    replicated_row_count = reduce(lambda accum, c: accum + c,
                                  first_sync_record_count.values())
    self.assertGreater(replicated_row_count, 0,
                       msg="failed to replicate any data: {}".format(first_sync_record_count))
    print("total replicated row count: {}".format(replicated_row_count))

    # Get incremental vs. non-incremental streams.
    non_incremental_streams = {key for key, value in self.expected_replication_method().items()
                               if value != 'INCREMENTAL'}
    incremental_streams = {key for key, value in self.expected_replication_method().items()
                           if value == 'INCREMENTAL'}

    # Get bookmark and state data for first sync, excluding full table streams.
    first_sync_state = menagerie.get_state(self.conn_id)
    first_sync_records = runner.get_records_from_target_output()
    for v in non_incremental_streams:
        first_sync_records.pop(v, None)
    first_max_bookmarks = self.max_bookmarks_by_stream(first_sync_records)
    first_min_bookmarks = self.min_bookmarks_by_stream(first_sync_records)

    # Run a second sync job using orchestrator.
    second_sync_record_count = self.run_sync(self.conn_id)

    # Get data about rows synced, excluding full table streams.
    second_sync_records = runner.get_records_from_target_output()
    for v in non_incremental_streams:
        second_sync_records.pop(v, None)
    second_min_bookmarks = self.min_bookmarks_by_stream(second_sync_records)

    for stream in incremental_streams:
        if stream in {'tasks', 'groups'}:
            continue
        with self.subTest(stream=stream):
            # get bookmark values from state and target data
            stream_bookmark_key = self.expected_rks().get(stream, set())
            assert len(stream_bookmark_key) == 1  # There shouldn't be a compound replication key
            stream_bookmark_key = stream_bookmark_key.pop()

            if not first_sync_state.get("bookmarks", {}).get(stream, None):
                # Some streams require more than a free tier plan (tasks)
                continue

            state_value = first_sync_state.get("bookmarks", {}).get(
                stream, {None: None}).get(stream_bookmark_key)
            target_value = first_max_bookmarks.get(
                stream, {None: None}).get(stream_bookmark_key)
            target_min_value = first_min_bookmarks.get(
                stream, {None: None}).get(stream_bookmark_key)

            # Convert everything to datetime.
            state_value = utils.strptime_with_tz(state_value)
            target_value = utils.strptime_with_tz(target_value)
            target_min_value = utils.strptime_with_tz(target_min_value)

            # verify that there is data with different bookmark values - setup necessary
            self.assertTrue(target_value >= target_min_value,
                            msg="Data isn't set up to be able to test bookmarks")

            # verify state agrees with target data after 1st sync
            self.assertEqual(state_value, target_value,
                             msg="The bookmark value isn't correct based on target data")

            # verify that you get less data the 2nd time around
            self.assertGreater(
                first_sync_record_count.get(stream, 0),
                second_sync_record_count.get(stream, 0),
                msg="second sync didn't have less records, bookmark usage not verified")

            if len(second_sync_records) > 0 and len(second_min_bookmarks) > 0:
                # verify all data from 2nd sync >= 1st bookmark
                target_value = second_min_bookmarks.get(
                    stream, {None: None}).get(stream_bookmark_key)
                target_value = utils.strptime_with_tz(target_value)
                # verify that the minimum bookmark sent to the target for the second sync
                # is greater than or equal to the bookmark from the first sync
                self.assertTrue(target_value >= state_value)
def get_attribution_window_bookmark(customer_id, stream_name):
    mid_bk_value = bookmarks.get_bookmark(STATE,
                                          state_key_name(customer_id, stream_name),
                                          'last_attribution_window_date')
    return utils.strptime_with_tz(mid_bk_value) if mid_bk_value else None
def is_session_bookmark_old(self, value):
    if self.session_bookmark is None:
        return True
    return utils.strptime_with_tz(value) > utils.strptime_with_tz(self.session_bookmark)
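# A standalone sketch of the timezone-aware comparison used above (an illustration,
# assuming `utils` is singer.utils; not part of the original tap code). Both strings
# parse to aware datetimes, so ordering is correct across UTC offsets.
from singer import utils as singer_utils_cmp

assert (singer_utils_cmp.strptime_with_tz("2022-06-01T12:00:00+02:00")
        < singer_utils_cmp.strptime_with_tz("2022-06-01T11:00:00Z"))  # 10:00 UTC < 11:00 UTC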
def sync_deals(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_with_tz(get_start(STATE, "deals", bookmark_key))
    max_bk_value = start
    LOGGER.info("sync_deals from %s", start)
    most_recent_modified_time = start
    params = {'count': 250,
              'includeAssociations': False,
              'properties': []}

    schema = load_schema("deals")
    singer.write_schema("deals", schema, ["dealId"], [bookmark_key],
                        catalog.get('stream_alias'))

    # Check if we should include associations
    for key in mdata.keys():
        if 'associations' in key:
            assoc_mdata = mdata.get(key)
            if assoc_mdata.get('selected') and assoc_mdata.get('selected') == True:
                params['includeAssociations'] = True

    v3_fields = None
    has_selected_properties = mdata.get(('properties', 'properties'), {}).get('selected')
    if has_selected_properties or has_selected_custom_field(mdata):
        # On 2/12/20, Hubspot added a lot of additional properties for
        # deals, and appending all of them to requests ended up leading to
        # 414 (url-too-long) errors. Hubspot recommended we use the
        # `includeAllProperties` and `allPropertiesFetchMode` params
        # instead.
        params['includeAllProperties'] = True
        params['allPropertiesFetchMode'] = 'latest_version'

        # Grab selected `hs_date_entered/exited` fields to call the v3 endpoint with
        v3_fields = [
            x[1].replace('property_', '')
            for x, y in mdata.items()
            if x and (y.get('selected') == True or has_selected_properties)
            and ('hs_date_entered' in x[1] or 'hs_date_exited' in x[1])
        ]

    url = get_url('deals_all')
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request(STATE, 'deals', url, params, 'deals', "hasMore",
                               ["offset"], ["offset"], v3_fields=v3_fields):
            row_properties = row['properties']
            modified_time = None
            if bookmark_key in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties[bookmark_key]['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis,
                                                                datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties['createdate']['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis,
                                                                datetime.timezone.utc)

            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = bumble_bee.transform(lift_properties_and_versions(row), schema, mdata)
                singer.write_record("deals", record, catalog.get('stream_alias'),
                                    time_extracted=utils.now())

    STATE = singer.write_bookmark(STATE, 'deals', bookmark_key, utils.strftime(max_bk_value))
    singer.write_state(STATE)
    return STATE
def sync_deals(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_with_tz(get_start(STATE, "deals", bookmark_key))
    max_bk_value = start
    LOGGER.info("sync_deals from %s", start)
    most_recent_modified_time = start
    params = {'limit': 100}

    schema = load_schema("deals")
    singer.write_schema("deals", schema, ["dealId"], [bookmark_key],
                        catalog.get('stream_alias'))

    # We fetch all the deals properties
    deals_v3_custom_schema = get_v3_schema("deals")
    properties = []
    for key, value in deals_v3_custom_schema.items():
        properties.append(key)

    # Split properties into chunks of at most 100 properties to avoid asking
    # for too many at once: property names are passed in the URL, and URLs
    # have a safe length limit of about 2000 characters, so 100 properties
    # per request should do it.
    property_chunks = []
    while len(properties) > 100:
        head, tail = head_100(properties)
        property_chunks.append(head)
        properties = tail
    property_chunks.append(properties)

    # TODO: Make it configurable through the singer catalog metadata selection pattern
    associations = ["contacts"]

    url = get_url('deals_v3_all')
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request_v3(STATE, "deals", url, params, "results",
                                  custom_properties_chunks=property_chunks,
                                  associations=associations):
            modified_time = None
            if 'updatedAt' in row:
                # Hubspot returns timestamps in ISO 8601
                modified_time = dateutil.parser.isoparse(row['updatedAt'])

            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = bumble_bee.transform(lift_properties_and_versions_v3(row), schema, mdata)
                singer.write_record("deals", record, catalog.get('stream_alias'),
                                    time_extracted=utils.now())

    STATE = singer.write_bookmark(STATE, 'deals', bookmark_key, utils.strftime(max_bk_value))
    singer.write_state(STATE)
    return STATE
def test_run(self):
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(
        len(found_catalogs), 0,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
    self.assertEqual(
        len(diff), 0,
        msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are kosher")

    # Select all catalogs
    for catalog in found_catalogs:
        connections.select_catalog_and_fields_via_metadata(
            conn_id, catalog,
            menagerie.get_annotated_schema(conn_id, catalog['stream_id']))

    # clear state
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = reduce(lambda accum, c: accum + c,
                                  record_count_by_stream.values())
    self.assertGreater(replicated_row_count, 0,
                       msg="failed to replicate any data: {}".format(record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    max_bookmarks_from_records = runner.get_most_recent_records_from_target(
        self, self.expected_bookmarks(), self.get_properties()['start_date'])

    start_of_today = utils.strftime(
        datetime.datetime(datetime.datetime.utcnow().year,
                          datetime.datetime.utcnow().month,
                          datetime.datetime.utcnow().day,
                          0, 0, 0, 0,
                          datetime.timezone.utc))
    max_bookmarks_from_records['subscription_changes'] = start_of_today
    max_bookmarks_from_records['email_events'] = start_of_today

    # if we didn't replicate data, the bookmark should be the start_date
    for k in self.expected_bookmarks().keys():
        if max_bookmarks_from_records.get(k) is None:
            max_bookmarks_from_records[k] = utils.strftime(
                datetime.datetime(2017, 5, 1, 0, 0, 0, 0, datetime.timezone.utc))

    state = menagerie.get_state(conn_id)
    bookmarks = state.get('bookmarks')
    bookmark_streams = set(state.get('bookmarks').keys())

    # verify bookmarks and offsets
    for k, v in sorted(list(self.expected_bookmarks().items())):
        for w in v:
            bk_value = bookmarks.get(k, {}).get(w)
            self.assertEqual(
                utils.strptime_with_tz(bk_value),
                utils.strptime_with_tz(max_bookmarks_from_records[k]),
                "Bookmark {} ({}) for stream {} should have been updated to {}".format(
                    bk_value, w, k, max_bookmarks_from_records[k]))
            print("bookmark {}({}) updated to {} from max record value {}".format(
                k, w, bk_value, max_bookmarks_from_records[k]))

    for k, v in self.expected_offsets().items():
        self.assertEqual(
            bookmarks.get(k, {}).get('offset', {}), v,
            msg="unexpected offset found for stream {} {}. state: {}".format(k, v, state))
        print("offsets {} cleared".format(k))

    diff = bookmark_streams.difference(self.acceptable_bookmarks())
    self.assertEqual(
        len(diff), 0,
        msg="Unexpected bookmarks: {} Expected: {} Actual: {}".format(
            diff, self.acceptable_bookmarks(), bookmarks))

    self.assertEqual(
        state.get('currently_syncing'), None,
        "Unexpected `currently_syncing` bookmark value: {} Expected: None".format(
            state.get('currently_syncing')))
def _datetime_string_to_epoch(self, datetime_string):
    return utils.strptime_with_tz(datetime_string).timestamp() * 1000
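# A minimal standalone check of the millisecond-epoch conversion above (a sketch,
# assuming singer-python is installed; not part of the original tap code):
from singer import utils as singer_utils_check

assert singer_utils_check.strptime_with_tz("2021-01-01T00:00:00Z").timestamp() * 1000 == 1609459200000.0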
def sync_deals(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_with_tz(get_start(STATE, "deals", bookmark_key))
    max_bk_value = start
    LOGGER.info("sync_deals from %s", start)
    most_recent_modified_time = start
    params = {'count': 250,
              'includeAssociations': False,
              'properties': []}

    schema = load_schema("deals")
    singer.write_schema("deals", schema, ["dealId"], [bookmark_key],
                        catalog.get('stream_alias'))

    # Check if we should include associations
    for key in mdata.keys():
        if 'associations' in key:
            assoc_mdata = mdata.get(key)
            if assoc_mdata.get('selected') and assoc_mdata.get('selected') == True:
                params['includeAssociations'] = True

    # Append all the properties fields for deals to the request if
    # properties is selected
    if mdata.get(('properties', 'properties'), {}).get('selected'):
        additional_properties = schema.get("properties").get("properties").get("properties")
        for key in additional_properties.keys():
            params['properties'].append(key)

    url = get_url('deals_all')
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request(STATE, 'deals', url, params, 'deals', "hasMore",
                               ["offset"], ["offset"]):
            row_properties = row['properties']
            modified_time = None
            if bookmark_key in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties[bookmark_key]['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis,
                                                                datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties['createdate']['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis,
                                                                datetime.timezone.utc)

            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = bumble_bee.transform(row, schema, mdata)
                singer.write_record("deals", record, catalog.get('stream_alias'),
                                    time_extracted=utils.now())

    STATE = singer.write_bookmark(STATE, 'deals', bookmark_key, utils.strftime(max_bk_value))
    singer.write_state(STATE)
    return STATE
def _query_recur(self,
                 query,
                 catalog_entry,
                 start_date_str,
                 end_date=None,
                 retries=MAX_RETRIES):
    params = {"q": query}
    url = "{}/services/data/v41.0/queryAll".format(self.sf.instance_url)
    headers = self.sf._get_standard_headers()

    if end_date is None:
        end_date = singer_utils.now()

    if retries == 0:
        raise TapSalesforceException(
            "Ran out of retries attempting to query Salesforce Object {}".format(
                catalog_entry['stream']))

    retryable = False
    try:
        while True:
            resp = self.sf._make_request('GET', url, headers=headers, params=params)
            resp_json = resp.json()

            for rec in resp_json.get('records'):
                yield rec

            next_records_url = resp_json.get('nextRecordsUrl')
            if next_records_url is None:
                break
            else:
                url = "{}{}".format(self.sf.instance_url, next_records_url)

    except HTTPError as ex:
        response = ex.response.json()
        if isinstance(response, list) and response[0].get("errorCode") == "QUERY_TIMEOUT":
            start_date = singer_utils.strptime_with_tz(start_date_str)
            day_range = (end_date - start_date).days
            LOGGER.info("Salesforce returned QUERY_TIMEOUT querying %d days of %s",
                        day_range, catalog_entry['stream'])
            retryable = True
        else:
            raise ex

    if retryable:
        start_date = singer_utils.strptime_with_tz(start_date_str)
        half_day_range = (end_date - start_date) // 2
        end_date = end_date - half_day_range

        if half_day_range.days == 0:
            raise TapSalesforceException(
                "Attempting to query by 0 day range, this would cause infinite looping.")

        query = self.sf._build_query_string(catalog_entry,
                                            singer_utils.strftime(start_date),
                                            singer_utils.strftime(end_date))
        for record in self._query_recur(query, catalog_entry, start_date_str,
                                        end_date, retries - 1):
            yield record
def get_end_date():
    if CONFIG.get('end_date'):
        return utils.strptime_with_tz(CONFIG.get('end_date'))
    return utils.now()
def get_start_for_stream(customer_id, stream_name):
    bk_value = bookmarks.get_bookmark(STATE,
                                      state_key_name(customer_id, stream_name),
                                      'date')
    bk_start_date = utils.strptime_with_tz(bk_value or CONFIG['start_date'])
    return bk_start_date
def sync_deals(state: State):
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_with_tz(get_start(state, "deals", bookmark_key))
    max_bk_value = start
    logger.info("sync_deals: start from %s", start)
    params = {
        'limit': 100,
        'includeAssociations': True,
        # On 2/12/20, Hubspot added a lot of additional properties for
        # deals, and appending all of them to requests ended up leading to
        # 414 (url-too-long) errors. Hubspot recommended we use the
        # `includeAllProperties` and `allPropertiesFetchMode` params
        # instead.
        'includeAllProperties': True,
        'allPropertiesFetchMode': 'latest_version',
    }
    schema = load_schema("deals")
    singer.write_schema("hubspot_deals", schema, ["dealId"], [bookmark_key])

    # Grab selected `hs_date_entered/exited` fields to call the v3 endpoint with
    v3_fields = []
    for field_name in schema['properties']:
        hs_field_name = field_name.replace('property_', '')
        if any(hs_field_name.startswith(prefix) for prefix in V3_PREFIXES):
            v3_fields.append(hs_field_name)

    url = get_url('deals_all')
    for row in gen_request(state, 'deals', url, params, 'deals', "hasMore",
                           ["offset"], ["offset"], v3_fields=v3_fields):
        row_properties = row['properties']
        modified_time = None
        if bookmark_key in row_properties:
            # Hubspot returns timestamps in millis
            timestamp_millis = row_properties[bookmark_key]['timestamp'] / 1000.0
            modified_time = datetime.datetime.fromtimestamp(timestamp_millis,
                                                            datetime.timezone.utc)
        elif 'createdate' in row_properties:
            # Hubspot returns timestamps in millis
            timestamp_millis = row_properties['createdate']['timestamp'] / 1000.0
            modified_time = datetime.datetime.fromtimestamp(timestamp_millis,
                                                            datetime.timezone.utc)

        if modified_time and modified_time >= max_bk_value:
            max_bk_value = modified_time

        if not modified_time or modified_time >= start:
            record = build_record(row, schema)
            write_record('hubspot_deals', record)

    state = singer.write_bookmark(state, 'hubspot_deals', bookmark_key,
                                  utils.strftime(max_bk_value))
    singer.write_state(state)
    return state
def _query_recur(self,
                 query,
                 catalog_entry,
                 start_date_str,
                 end_date=None,
                 retries=MAX_RETRIES):
    params = {"q": query}
    url = "{}/services/data/v41.0/queryAll".format(self.sf.instance_url)
    headers = self.sf._get_standard_headers()

    sync_start = singer_utils.now()
    if end_date is None:
        end_date = sync_start

    if retries == 0:
        raise TapSalesforceException(
            "Ran out of retries attempting to query Salesforce Object {}".format(
                catalog_entry['stream']))

    retryable = False
    try:
        for rec in self._sync_records(url, headers, params):
            yield rec

        # If the date range was chunked (an end_date was passed), sync
        # from the end_date -> now
        if end_date < sync_start:
            next_start_date_str = singer_utils.strftime(end_date)
            query = self.sf._build_query_string(catalog_entry, next_start_date_str)
            for record in self._query_recur(query, catalog_entry,
                                            next_start_date_str, retries=retries):
                yield record

    except HTTPError as ex:
        response = ex.response.json()
        if isinstance(response, list) and response[0].get("errorCode") == "QUERY_TIMEOUT":
            start_date = singer_utils.strptime_with_tz(start_date_str)
            day_range = (end_date - start_date).days
            LOGGER.info("Salesforce returned QUERY_TIMEOUT querying %d days of %s",
                        day_range, catalog_entry['stream'])
            retryable = True
        else:
            raise ex

    if retryable:
        start_date = singer_utils.strptime_with_tz(start_date_str)
        half_day_range = (end_date - start_date) // 2
        end_date = end_date - half_day_range

        if half_day_range.days == 0:
            raise TapSalesforceException(
                "Attempting to query by 0 day range, this would cause infinite looping.")

        query = self.sf._build_query_string(catalog_entry,
                                            singer_utils.strftime(start_date),
                                            singer_utils.strftime(end_date))
        for record in self._query_recur(query, catalog_entry, start_date_str,
                                        end_date, retries - 1):
            yield record
def sync_entity_chunked(state: State, entity_name, key_properties, path):
    schema = load_schema(entity_name)
    bookmark_key = 'startTimestamp'

    singer.write_schema('hubspot_' + entity_name, schema, key_properties, [bookmark_key])

    start = get_start(state, entity_name, bookmark_key)
    logger.info("sync_%s from %s", entity_name, start)

    now = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC)
    now_ts = int(now.timestamp() * 1000)

    start_ts = int(utils.strptime_with_tz(start).timestamp() * 1000)
    url = get_url(entity_name)

    if entity_name == 'email_events':
        window_size = int(CONFIG['email_chunk_size'])
    elif entity_name == 'subscription_changes':
        window_size = int(CONFIG['subscription_chunk_size'])

    with metrics.record_counter(entity_name) as counter:
        while start_ts < now_ts:
            end_ts = start_ts + window_size
            params = {
                'startTimestamp': start_ts,
                'endTimestamp': end_ts,
                'limit': 1000,
            }
            while True:
                our_offset = singer.get_offset(state, entity_name)
                if bool(our_offset) and our_offset.get('offset') is not None:
                    params[StateFields.offset] = our_offset.get('offset')
                data = request(url, params).json()

                if data.get(path) is None:
                    raise RuntimeError(
                        "Unexpected API response: {} not in {}".format(path, data.keys()))

                for row in data[path]:
                    counter.increment()
                    record = build_record(row, schema)
                    write_record(entity_name, record)

                if data.get('hasMore'):
                    state = singer.set_offset(state, entity_name, 'offset', data['offset'])
                    singer.write_state(state)
                else:
                    state = singer.clear_offset(state, entity_name)
                    singer.write_state(state)
                    break

            state = singer.write_bookmark(state, "hubspot_" + entity_name, 'startTimestamp',
                                          utils.strftime(datetime.datetime.fromtimestamp(
                                              (start_ts / 1000), datetime.timezone.utc)))
            singer.write_state(state)
            start_ts = end_ts

    state = singer.clear_offset(state, entity_name)
    singer.write_state(state)
    return state
def get_start_for_stream(config, state, advertiser_ids, stream_name):
    """Get start date for stream sync."""
    bk_value = bookmarks.get_bookmark(
        state, state_key_name(advertiser_ids, stream_name), "date")
    bk_start_date = utils.strptime_with_tz(bk_value or config["start_date"])
    return bk_start_date
def append_times_to_dates(item, date_fields):
    if date_fields:
        for date_field in date_fields:
            if item.get(date_field):
                item[date_field] = utils.strftime(
                    utils.strptime_with_tz(item[date_field]))
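# Hypothetical usage of append_times_to_dates (a sketch, assuming `utils` is
# singer.utils): a date-only string gains an explicit UTC time component, and
# fields not listed in date_fields are left untouched.
item = {"due_date": "2023-04-01", "name": "example"}
append_times_to_dates(item, ["due_date"])
# item["due_date"] is now roughly "2023-04-01T00:00:00.000000Z"; item["name"] is unchanged.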
def get_end_date(config):
    """Get end date from config file."""
    if config.get("end_date"):
        return utils.strptime_with_tz(config.get("end_date"))
    return utils.now()
def is_bookmark_old(self, state, value, name=None):
    current_bookmark = self.get_bookmark(state, name)
    return utils.strptime_with_tz(value) > utils.strptime_with_tz(current_bookmark)
def sync_records(qb, catalog_entry, state, counter, state_passed):
    chunked_bookmark = singer_utils.strptime_with_tz(qb.get_start_date(state, catalog_entry))
    stream = catalog_entry['stream']
    schema = catalog_entry['schema']
    stream_alias = catalog_entry.get('stream_alias')
    catalog_metadata = metadata.to_map(catalog_entry['metadata'])
    replication_key = catalog_metadata.get((), {}).get('replication-key')
    stream_version = get_stream_version(catalog_entry, state)
    activate_version_message = singer.ActivateVersionMessage(stream=(stream_alias or stream),
                                                             version=stream_version)

    start_time = singer_utils.now()

    LOGGER.info('Syncing Quickbooks data for stream %s', stream)

    previous_max_replication_key = None

    query_func = qb.query
    if stream.endswith("Report"):
        query_func = qb.query_report

    for rec in query_func(catalog_entry, state, state_passed):
        counter.increment()
        with Transformer(pre_hook=transform_data_hook) as transformer:
            rec = transformer.transform(rec, schema)

        singer.write_message(
            singer.RecordMessage(
                stream=(stream_alias or stream),
                record=rec,
                version=stream_version,
                time_extracted=start_time))

        if replication_key:
            jsonpath_expression = parse(f"$.{replication_key}")
            _rec = {'MetaData': json.loads(rec.get('MetaData', {}))}
            match = jsonpath_expression.find(_rec)
            original_replication_key_value = ""
            replication_key_value = None
            if replication_key and len(match) > 0:
                original_replication_key_value = match[0].value
                replication_key_value = singer_utils.strptime_with_tz(
                    original_replication_key_value)

            # Before writing a bookmark, make sure Quickbooks has not given us a
            # record with one outside our range
            if previous_max_replication_key is None or (
                    replication_key_value and
                    replication_key_value <= start_time and
                    replication_key_value > previous_max_replication_key):
                state = singer.write_bookmark(
                    state,
                    catalog_entry['tap_stream_id'],
                    replication_key,
                    original_replication_key_value)
                previous_max_replication_key = replication_key_value

    # Tables with no replication_key will send an
    # activate_version message for the next sync
    if not replication_key:
        singer.write_message(activate_version_message)
        state = singer.write_bookmark(
            state, catalog_entry['tap_stream_id'], 'version', None)
def sync_entity_chunked(STATE, catalog, entity_name, key_properties, path):
    schema = load_schema(entity_name)
    bookmark_key = 'startTimestamp'

    singer.write_schema(entity_name, schema, key_properties, [bookmark_key],
                        catalog.get('stream_alias'))

    start = get_start(STATE, entity_name, bookmark_key)
    LOGGER.info("sync_%s from %s", entity_name, start)

    now = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC)
    now_ts = int(now.timestamp() * 1000)

    start_ts = int(utils.strptime_with_tz(start).timestamp() * 1000)
    url = get_url(entity_name)

    mdata = metadata.to_map(catalog.get('metadata'))

    if entity_name == 'email_events':
        window_size = int(CONFIG['email_chunk_size'])
    elif entity_name == 'subscription_changes':
        window_size = int(CONFIG['subscription_chunk_size'])

    with metrics.record_counter(entity_name) as counter:
        while start_ts < now_ts:
            end_ts = start_ts + window_size
            params = {
                'startTimestamp': start_ts,
                'endTimestamp': end_ts,
                'limit': 1000,
            }
            with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
                while True:
                    our_offset = singer.get_offset(STATE, entity_name)
                    if bool(our_offset) and our_offset.get('offset') is not None:
                        params[StateFields.offset] = our_offset.get('offset')
                    data = request(url, params).json()
                    time_extracted = utils.now()

                    for row in data[path]:
                        counter.increment()
                        record = bumble_bee.transform(
                            lift_properties_and_versions(row), schema, mdata)
                        singer.write_record(entity_name, record, catalog.get('stream_alias'),
                                            time_extracted=time_extracted)
                    if data.get('hasMore'):
                        STATE = singer.set_offset(STATE, entity_name, 'offset', data['offset'])
                        singer.write_state(STATE)
                    else:
                        STATE = singer.clear_offset(STATE, entity_name)
                        singer.write_state(STATE)
                        break

            STATE = singer.write_bookmark(STATE, entity_name, 'startTimestamp',
                                          utils.strftime(datetime.datetime.fromtimestamp(
                                              (start_ts / 1000), datetime.timezone.utc)))
            singer.write_state(STATE)
            start_ts = end_ts

    STATE = singer.clear_offset(STATE, entity_name)
    singer.write_state(STATE)
    return STATE
def get_bookmark(self, state):
    return utils.strptime_with_tz(
        singer.get_bookmark(state, self.name, self.replication_key))
def sync_records(sf, catalog_entry, state, counter):
    chunked_bookmark = singer_utils.strptime_with_tz(sf.get_start_date(state, catalog_entry))
    stream = catalog_entry['stream']
    schema = catalog_entry['schema']
    stream_alias = catalog_entry.get('stream_alias')
    catalog_metadata = metadata.to_map(catalog_entry['metadata'])
    replication_key = catalog_metadata.get((), {}).get('replication-key')
    stream_version = get_stream_version(catalog_entry, state)
    activate_version_message = singer.ActivateVersionMessage(stream=(stream_alias or stream),
                                                             version=stream_version)

    start_time = singer_utils.now()

    LOGGER.info('Syncing Salesforce data for stream %s', stream)

    for rec in sf.query(catalog_entry, state):
        counter.increment()
        with Transformer(pre_hook=transform_bulk_data_hook) as transformer:
            rec = transformer.transform(rec, schema)
        rec = fix_record_anytype(rec, schema)
        singer.write_message(
            singer.RecordMessage(
                stream=(stream_alias or stream),
                record=rec,
                version=stream_version,
                time_extracted=start_time))

        replication_key_value = replication_key and singer_utils.strptime_with_tz(
            rec[replication_key])

        if sf.pk_chunking:
            if replication_key_value and replication_key_value <= start_time and \
                    replication_key_value > chunked_bookmark:
                # Replace the highest seen bookmark and save the state in case we need to resume later
                chunked_bookmark = singer_utils.strptime_with_tz(rec[replication_key])
                state = singer.write_bookmark(
                    state,
                    catalog_entry['tap_stream_id'],
                    'JobHighestBookmarkSeen',
                    singer_utils.strftime(chunked_bookmark))
                singer.write_state(state)
        # Before writing a bookmark, make sure Salesforce has not given us a
        # record with one outside our range
        elif replication_key_value and replication_key_value <= start_time:
            state = singer.write_bookmark(
                state,
                catalog_entry['tap_stream_id'],
                replication_key,
                rec[replication_key])
            singer.write_state(state)

    # Tables with no replication_key will send an
    # activate_version message for the next sync
    if not replication_key:
        singer.write_message(activate_version_message)
        state = singer.write_bookmark(
            state, catalog_entry['tap_stream_id'], 'version', None)

    # If pk_chunking is set, only write a bookmark at the end
    if sf.pk_chunking:
        # Write a bookmark with the highest value we've seen
        state = singer.write_bookmark(
            state,
            catalog_entry['tap_stream_id'],
            replication_key,
            singer_utils.strftime(chunked_bookmark))
def _get_end_datetime(self, startDateTime):
    endDateTime = utils.strptime_with_tz(startDateTime) + timedelta(
        days=self.api_window_in_days)
    return endDateTime.strftime("%Y-%m-%d %H:%M:%S")
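# A standalone illustration of the window arithmetic in _get_end_datetime (a sketch,
# assuming an api_window_in_days of 30 and that `utils` is singer.utils; not part of
# the original tap code):
from datetime import timedelta
from singer import utils as singer_utils_demo

window_end = singer_utils_demo.strptime_with_tz("2023-01-01 00:00:00") + timedelta(days=30)
print(window_end.strftime("%Y-%m-%d %H:%M:%S"))  # 2023-01-31 00:00:00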