def sync_events():
    schema = load_schema("events")
    singer.write_schema("events", schema, [])

    for export_bundle in request_export_bundles():
        with metrics.record_counter("events") as counter:
            for event in download_events(export_bundle['Id']):
                transform_event(event)
                counter.increment()
                singer.write_record("events", event)
        stop_timestamp = datetime.datetime.utcfromtimestamp(export_bundle['Stop'])
        utils.update_state(STATE, "events", stop_timestamp)
        singer.write_state(STATE)
def load_and_write_schema(ctx, stream):
    singer.write_schema(
        stream.tap_stream_id,
        load_schema(ctx, stream.tap_stream_id),
        stream.pk_fields,
    )
def sync(config, state, catalog):
    """ Sync data from tap source """
    # Loop over selected streams in catalog
    for stream in catalog.get_selected_streams(state):
        LOGGER.info("Syncing stream:" + stream.tap_stream_id)

        schema = stream.schema.to_dict()
        singer.write_schema(
            stream_name=stream.stream,
            schema=schema,
            key_properties=stream.key_properties,
        )

        id_list_ordered = []
        name_list_ordered = []

        if "sale" == stream.tap_stream_id:
            tap_data_types_formulaires = get_sales_data_from_API(config, state, stream.tap_stream_id, "f")
            tap_data_types_ventes = get_sales_data_from_API(config, state, stream.tap_stream_id, "v")

            record_dict = {}
            for row in tap_data_types_formulaires:
                row = html.unescape(row)
                keys = list(stream.schema.properties.keys())
                value = row.split(";")
                record_dict['types'] = "f"
                for i in range(0, len(value)):
                    record_dict[keys[i + 1]] = value[i]
                singer.write_records(stream.tap_stream_id, [record_dict])

            for row in tap_data_types_ventes:
                row = html.unescape(row)
                keys = list(stream.schema.properties.keys())
                value = row.split(";")
                record_dict = {}
                record_dict['types'] = "v"
                for i in range(0, len(value)):
                    record_dict[keys[i + 1]] = value[i]
                singer.write_records(stream.tap_stream_id, [record_dict])

        elif stream.tap_stream_id in ["stats_by_campain", "stats_by_site"]:
            if stream.tap_stream_id == "stats_by_campain":
                select_data_by_campain_or_site = "campain"
            else:
                select_data_by_campain_or_site = "site"

            tap_data = get_stats_data_from_API(config, state, stream.tap_stream_id)
            for row in tap_data:
                value = row.split(";")
                id_value = value[0]
                name_value = value[1]
                if id_value not in id_list_ordered and name_value not in name_list_ordered:
                    id_list_ordered.append(id_value)
                    name_list_ordered.append(name_value)
                    continue

            for id, name in zip(id_list_ordered, name_list_ordered):
                tap_data = get_stats_data_from_API_by_id(config, state, stream.tap_stream_id,
                                                         id, select_data_by_campain_or_site)
                for row in tap_data:
                    row = html.unescape(row)
                    keys = list(stream.schema.properties.keys())
                    value = row.split(";")
                    record_dict = {}
                    if stream.tap_stream_id == "stats_by_site":
                        record_dict['idsite'] = id
                        record_dict['nomsite'] = name
                    else:
                        record_dict['idcamp'] = id
                        record_dict['nomcamp'] = name
                    for j in range(0, len(value)):
                        if j == 0:
                            # init date and skip id and nom
                            last_date = value[0][0:4] + "-" + value[0][4:6] + "-" + value[0][6:8] + " 13:37:42 UTC"
                            record_dict[keys[0]] = last_date
                        else:
                            record_dict[keys[j + 2]] = value[j]
                    singer.write_records(stream.tap_stream_id, [record_dict])

        else:
            tap_data = get_stats_data_from_API(config, state, stream.tap_stream_id)
            for row in tap_data:
                row = html.unescape(row)
                keys = list(stream.schema.properties.keys())
                value = row.split(";")
                record_dict = {}
                for i in range(0, len(value)):
                    record_dict[keys[i]] = value[i]
                if "stats_by_day" in stream.tap_stream_id:
                    last_date = value[0][0:4] + "-" + value[0][4:6] + "-" + value[0][6:8] + " 13:37:42 UTC"
                    record_dict[keys[0]] = last_date
                elif "stats_by_month" in stream.tap_stream_id:
                    last_date = value[0][0:4] + "-" + value[0][4:6] + "-01 13:37:42 UTC"
                    record_dict[keys[0]] = last_date
                singer.write_records(stream.tap_stream_id, [record_dict])

        bookmark_state(stream.tap_stream_id, state)
    return
def sync(config, state, catalog):
    errors_encountered = False

    selected_stream_ids = get_selected_streams(catalog)

    client = GAClient(config)

    # Loop over streams in catalog
    for stream in catalog['streams']:
        stream_id = stream['tap_stream_id']
        stream_schema = stream['schema']

        stream_metadata = metadata.to_map(stream['metadata'])
        key_properties = metadata.get(stream_metadata, (), "table-key-properties")

        if stream_id in selected_stream_ids:
            LOGGER.info('Syncing stream: ' + stream_id)

            try:
                report_definition = ReportsHelper.get_report_definition(stream)
                results = client.process_stream(report_definition)

                # we write the schema message after we are sure that we could
                # fetch records without errors
                singer.write_schema(stream_id, stream_schema, key_properties)
                singer.write_records(stream_id, results)
            except TapGaInvalidArgumentError as e:
                errors_encountered = True
                LOGGER.error("Skipping stream: '{}' due to invalid report definition.".format(stream_id))
                LOGGER.debug("Error: '{}'.".format(e))
            except TapGaRateLimitError as e:
                errors_encountered = True
                LOGGER.error("Skipping stream: '{}' due to Rate Limit Errors.".format(stream_id))
                LOGGER.debug("Error: '{}'.".format(e))
            except TapGaQuotaExceededError as e:
                errors_encountered = True
                LOGGER.error("Skipping stream: '{}' due to Quota Exceeded Errors.".format(stream_id))
                LOGGER.debug("Error: '{}'.".format(e))
            except TapGaAuthenticationError as e:
                LOGGER.error("Stopping execution while processing '{}' due to Authentication Errors.".format(stream_id))
                LOGGER.debug("Error: '{}'.".format(e))
                sys.exit(1)
            except TapGaUnknownError as e:
                LOGGER.error("Stopping execution while processing '{}' due to Unknown Errors.".format(stream_id))
                LOGGER.debug("Error: '{}'.".format(e))
                sys.exit(1)
        else:
            LOGGER.info('Skipping unselected stream: ' + stream_id)

    # If we encountered errors, exit with 1
    if errors_encountered:
        sys.exit(1)

    return
            'type': 'string',
            "format": "date"
        },
        'countrycode': {
            'type': 'string',
            'pattern': "^[A-Z]{2}$"
        },
        'store_name': {
            'type': 'string'
        }
    },
    '$schema': 'http://json-schema.org/draft-07/schema#'
}

# Write the schema to stdout.
singer.write_schema(stream_name='products', schema=schema, key_properties=[])


# Return the set of items scraped from a specific store as a list
def retrieve_store_items(store_name, items_endpoint=items_template):
    return requests.get(f"{items_endpoint}{store_name}").json()["items"]


def main():
    for shop in requests.get(shops_template).json()["shops"]:
        singer.write_records(
            stream_name='products',
            # Add the name of the store to every record.
            records=({
                'store_name': shop,
                **item
def sync_modified_rows(STATE, catalog, schema_name="orders", key_properties=["order_id"]):
    schema = load_schema(schema_name, CONFIG["schema_dir"])
    singer.write_schema(schema_name, schema, key_properties)

    start = get_start(STATE, schema_name, "last_update")
    last_update = start
    offset = 0

    start_at_time = parser.parse(start)
    if start_at_time.tzinfo is None:
        start_at_time = start_at_time.replace(tzinfo=pytz.utc)
    tz_offset = parser.parse("1970-01-02T00:00:00+00:00").replace(tzinfo=pytz.utc) - parser.parse(
        "1970-01-02T00:00:00" + CONFIG["timezone_offset"]) + datetime.timedelta(
            seconds=CONFIG["relative_time_safety_margin"])

    id_set = set()

    start_process_at = datetime.datetime.now()
    LOGGER.info("Starting %s Sync at %s" % (schema_name, str(start_process_at)))
    LOGGER.info("Only syncing %s updated since %s" % (schema_name, start))

    while True:
        # We need to update the relative time inside the loop as the time is moving
        utc_now = datetime.datetime.utcnow().replace(tzinfo=pytz.utc)
        datediff = utc_now - start_at_time + tz_offset

        LOGGER.info("Offset: %d" % offset)

        # First get the list of IDs
        params = {
            "resource": schema_name,
            "days": datediff.days,
            "hours": datediff.seconds / 3600,
            "offset": offset,
            "items_per_page": CONFIG["items_per_page"]
        }
        endpoint = get_endpoint("modified_items", params)
        LOGGER.info("GET %s", endpoint)
        rows = gen_modified_items_request(schema_name, endpoint)

        for row in rows[schema_name]:
            # last_updated is a unix timestamp
            current_timestamp = None
            if row["last_updated"]:
                current_timestamp = datetime.datetime.utcfromtimestamp(
                    int(row["last_updated"])).replace(tzinfo=pytz.utc)
            end_at = parser.parse(CONFIG["end_at"])
            if end_at.tzinfo is None:
                end_at = end_at.replace(tzinfo=pytz.utc)
            if CONFIG.get("end_at") is None or row["last_updated"] is None or (
                    current_timestamp and current_timestamp < end_at):
                id_set.add(row["id"])

        if len(rows[schema_name]) < CONFIG["items_per_page"]:
            LOGGER.info("End of records %d" % len(rows[schema_name]))
            break
        else:
            offset = offset + CONFIG["items_per_page"]

    LOGGER.info("Found %d records" % len(id_set))

    ids = list(id_set)
    with metrics.record_counter(schema_name) as counter:
        current_idx = 0
        while current_idx < len(ids):
            params = {
                "resource": schema_name,
                "ids": ",".join(ids[current_idx:min(len(ids), current_idx + INCREMENTAL_ITEMS_PER_PAGE)])
            }
            endpoint = get_endpoint(schema_name + "_by_id", params)
            LOGGER.info("GET %s", endpoint)
            rows = gen_request(schema_name, endpoint)
            if len(rows) < len(ids[current_idx:min(len(ids), current_idx + INCREMENTAL_ITEMS_PER_PAGE)]):
                LOGGER.warning("Number of items returned from WC API is lower than the ID list size")
            for row in rows:
                counter.increment()
                row = filter_result(row, schema)
                if "_etl_tstamp" in schema["properties"].keys():
                    row["_etl_tstamp"] = time.time()
                singer.write_record(schema_name, row)
            current_idx = current_idx + INCREMENTAL_ITEMS_PER_PAGE

    STATE = singer.write_bookmark(STATE, schema_name, 'last_update', last_update)
    singer.write_state(STATE)

    end_process_at = datetime.datetime.now()
    LOGGER.info("Completed %s Sync at %s" % (schema_name, str(end_process_at)))
    LOGGER.info("Process duration: " + str(end_process_at - start_process_at))
    return STATE
def sync_project(  # pylint: disable=too-many-arguments
    schema_name,
    endpoint=None,
    path=None,
    special_field_name=None,
    special_field_value=None,
    date_fields=None,
    with_updated_since=True,
    for_each_handler=None,
    map_handler=None,
    object_to_id=None,
    is_selected=False,
    selected_sub_stream=[],
):
    schema = load_schema(schema_name)
    if is_selected:
        bookmark_property = 'updated_at'
        LOGGER.info('Loading ' + schema_name)
        singer.write_schema(schema_name, schema, ['id'],
                            bookmark_properties=[bookmark_property])
    start = get_start(schema_name)

    with Transformer() as transformer:
        url = get_url(endpoint or schema_name)
        url = endpoint or url
        response = request(url, None)

        for row in response:
            item = transformer.transform(row, schema)
            time_extracted = utils.now()

            # find related
            if 'expense_items' in selected_sub_stream:
                sync_endpoint('expense_items',
                              BASE_API_URL + 'projects/' + str(row['id']) + '/expense_items',
                              None, 'project_id', str(row['id']))
            if 'invoices' in selected_sub_stream:
                sync_endpoint('invoices',
                              BASE_API_URL + 'projects/' + str(row['id']) + '/invoices',
                              None, 'project_id', str(row['id']))
            if 'milestones' in selected_sub_stream:
                sync_endpoint('milestones',
                              BASE_API_URL + 'projects/' + str(row['id']) + '/milestones',
                              None, 'project_id', str(row['id']))
            if 'project_team' in selected_sub_stream:
                sync_endpoint(
                    'project_team',
                    BASE_API_URL + 'projects/' + str(row['id']) + '/team',
                    None,
                    'project_id',
                    str(row['id']),
                    ['person_id', 'project_id'],
                )
            if 'sprints' in selected_sub_stream:
                sync_endpoint('sprints',
                              BASE_API_URL + 'projects/' + str(row['id']) + '/sprints',
                              None, 'project_id', str(row['id']))
            if 'workflow_columns' in selected_sub_stream:
                sync_endpoint('workflow_columns',
                              BASE_API_URL + 'projects/' + str(row['id']) + '/workflow_columns',
                              None, 'project_id', str(row['id']))
            if 'project_financials' in selected_sub_stream:
                sync_endpoint(
                    'project_financials',
                    BASE_API_URL + 'projects/' + str(row['id']) + '/financials',
                    None,
                    None,
                    None,
                    ['project_id'],
                )

            if is_selected and (bookmark_property in item
                                and item[bookmark_property] >= start):
                singer.write_record(schema_name, item, time_extracted=time_extracted)
                utils.update_state(STATE, schema_name, item[bookmark_property])

    singer.write_state(STATE)
def sync_endpoint(schema_name,
                  endpoint=None,
                  path=None,
                  special_field_name=None,
                  special_field_value=None,
                  keys=None,
                  object_to_id=None,
                  parameter_for_updated=None):
    schema = load_schema(schema_name)
    bookmark_property = 'updated_at'

    LOGGER.info('Loading ' + schema_name)
    if keys is None:
        keys = ['id']
    singer.write_schema(schema_name, schema, keys,
                        bookmark_properties=[bookmark_property])

    start = get_start(schema_name)
    start_dt = datetime.datetime.strptime(start, '%Y-%m-%dT%H:%M:%S.%fZ')
    updated_since = start_dt.strftime("%Y%m%dT%H%M%S")
    LOGGER.info('updated_since ' + updated_since)

    with Transformer() as transformer:
        url = get_url(endpoint or schema_name)
        url = endpoint or url
        if parameter_for_updated is not None:
            url = url + '?' + parameter_for_updated + '=' + updated_since
        response = request(url, None)
        LOGGER.info('URL :' + url)

        # use '==' for string comparison ('is' only checks object identity)
        if schema_name == 'project_financials':
            response = [response]

        time_extracted = utils.now()

        for row in response:
            if special_field_name is not None:
                row[special_field_name] = special_field_value

            if object_to_id is not None:
                for key in object_to_id:
                    if row[key] is not None:
                        row[key + '_id'] = row[key]['id']
                    else:
                        row[key + '_id'] = None

            item = transformer.transform(row, schema)

            if not bookmark_property in item:
                item[bookmark_property] = \
                    datetime.datetime.now().strftime('%Y-%m-%d') \
                    + 'T00:00:00.00Z'

            if datetime.datetime.strptime(item[bookmark_property],
                                          '%Y-%m-%dT%H:%M:%S.%fZ') >= start_dt:
                singer.write_record(schema_name, item, time_extracted=time_extracted)
                utils.update_state(STATE, schema_name, item[bookmark_property])

    singer.write_state(STATE)
def write_schema(stream):
    schema = stream.schema.to_dict()
    singer.write_schema(stream.tap_stream_id, schema, stream.key_properties)
def sync(self):
    key_properties = self.catalog.get('key_properties')
    table = self.TABLE

    singer.write_schema(self.catalog.get('stream'),
                        self.catalog.get('schema'),
                        key_properties=key_properties)

    field_selector = get_field_selector(self.catalog, self.catalog.get('schema'))

    includeGeoIpData = self.any_selected([
        'geoIPCity', 'geoIPStateRegion', 'geoIPZip', 'geoIPCountry',
        'geoIPCountryCode'
    ])

    # Note: the next two lists were missing commas, which silently joined
    # adjacent field names via implicit string concatenation.
    includeTechnologyData = self.any_selected([
        'primaryBrowser', 'mobileBrowser', 'primaryEmailClient',
        'mobileEmailClient', 'operatingSystem'
    ])

    includeRFMData = self.any_selected([
        'firstOrderDate', 'lastOrderDate', 'lastOrderTotal',
        'totalOrders', 'totalRevenue', 'averageOrderValue'
    ])

    includeEngagementData = self.any_selected(
        ['lastDeliveryDate', 'lastOpenDate', 'lastClickDate'])

    if includeGeoIpData:
        LOGGER.info('Including GEOIP data.')
    if includeTechnologyData:
        LOGGER.info('Including technology data.')
    if includeRFMData:
        LOGGER.info('Including RFM data.')
    if includeEngagementData:
        LOGGER.info('Including engagement data.')

    LOGGER.info('Syncing contacts.')

    start = self.get_start_date(table)
    end = start
    interval = timedelta(hours=6)

    def flatten(item):
        read_only_data = item.get('readOnlyContactData', {}) or {}
        item.pop('readOnlyContactData', None)
        return dict(item, **read_only_data)

    while end < datetime.now(pytz.utc):
        start = end
        end = start + interval
        LOGGER.info("Fetching contacts modified from {} to {}".format(start, end))

        _filter = self.make_filter(start, end)

        pageNumber = 1
        hasMore = True

        while hasMore:
            retry_count = 0
            try:
                results = self.client.service.readContacts(
                    filter=_filter,
                    includeLists=True,
                    fields=[],
                    pageNumber=pageNumber,
                    includeSMSKeywords=True,
                    includeGeoIPData=includeGeoIpData,
                    includeTechnologyData=includeTechnologyData,
                    includeRFMData=includeRFMData,
                    includeEngagementData=includeEngagementData)
            except socket.timeout:
                retry_count += 1
                if retry_count >= 5:
                    LOGGER.error("Retried more than five times, moving on!")
                    raise
                LOGGER.warn("Timeout caught, retrying request")
                continue
            except Fault as e:
                if '103' in e.message:
                    LOGGER.warn("Got signed out - logging in again and retrying")
                    self.login()
                    continue
                else:
                    raise

            LOGGER.info("... {} results".format(len(results)))

            extraction_time = singer.utils.now()
            for result in results:
                result_dict = zeep.helpers.serialize_object(result, target_cls=dict)
                flattened = flatten(result_dict)
                singer.write_record(table,
                                    field_selector(flattened),
                                    time_extracted=extraction_time)

            if len(results) == 0:
                hasMore = False

            pageNumber = pageNumber + 1

        self.state = incorporate(self.state, table, self.REPLICATION_KEY,
                                 start.replace(microsecond=0).isoformat())
        save_state(self.state)

    LOGGER.info("Done syncing contacts.")
def sync_invoices():
    messages_schema = load_schema("invoice_messages")
    bookmark_property = 'updated_at'
    singer.write_schema("invoice_messages",
                        messages_schema, ["id"],
                        bookmark_properties=[bookmark_property])

    payments_schema = load_schema("invoice_payments")
    singer.write_schema("invoice_payments",
                        payments_schema, ["id"],
                        bookmark_properties=[bookmark_property])

    schema = load_schema("invoices")
    singer.write_schema("invoices",
                        schema, ["id"],
                        bookmark_properties=[bookmark_property])

    start = get_start("invoices")
    start_dt = pendulum.parse(start)
    updated_since = start_dt.strftime("%Y-%m-%d %H:%M")

    url = get_url("invoices")
    with Transformer() as transformer:
        while True:
            data = request(url, {"updated_since": updated_since})
            invoices_time_extracted = utils.now()

            for row in data:
                item = row["invoices"]
                item = transformer.transform(item, schema)
                append_times_to_dates(item, ["issued_at", "due_at"])

                singer.write_record("invoices", item,
                                    time_extracted=invoices_time_extracted)

                utils.update_state(STATE, "invoices", item['updated_at'])

                suburl = url + "/{}/messages".format(item['id'])
                messages_data = request(suburl)
                messages_time_extracted = utils.now()
                for subrow in messages_data:
                    subitem = subrow["message"]
                    if subitem['updated_at'] >= start:
                        append_times_to_dates(subitem, ["send_reminder_on"])
                        singer.write_record("invoice_messages", subitem,
                                            time_extracted=messages_time_extracted)

                suburl = url + "/{}/payments".format(item['id'])
                payments_data = request(suburl)
                payments_time_extracted = utils.now()
                for subrow in payments_data:
                    subitem = subrow["payment"]
                    subitem = transformer.transform(subitem, payments_schema)
                    if subitem['updated_at'] >= start:
                        singer.write_record("invoice_payments", subitem,
                                            time_extracted=payments_time_extracted)

                singer.write_state(STATE)

            if len(data) < 50:
                break

    singer.write_state(STATE)
def sync_projects():
    bookmark_property = 'updated_at'

    tasks_schema = load_schema("project_tasks")
    singer.write_schema("project_tasks",
                        tasks_schema, ["id"],
                        bookmark_properties=[bookmark_property])

    users_schema = load_schema("project_users")
    singer.write_schema("project_users",
                        users_schema, ["id"],
                        bookmark_properties=[bookmark_property])

    entries_schema = load_schema("time_entries")
    singer.write_schema("time_entries",
                        entries_schema, ["id"],
                        bookmark_properties=[bookmark_property])

    schema = load_schema("projects")
    singer.write_schema("projects",
                        schema, ["id"],
                        bookmark_properties=[bookmark_property])

    start = get_start("projects")
    start_dt = pendulum.parse(start)
    updated_since = start_dt.strftime("%Y-%m-%d %H:%M")

    url = get_url("projects")
    projects_data = request(url)
    projects_time_extracted = utils.now()

    with Transformer() as transformer:
        for row in projects_data:
            item = row["project"]
            item = transformer.transform(item, schema)

            date_fields = [
                "starts_on",
                "ends_on",
                "hint_earliest_record_at",
                "hint_latest_record_at",
            ]
            append_times_to_dates(item, date_fields)

            if item[bookmark_property] >= start:
                singer.write_record("projects", item,
                                    time_extracted=projects_time_extracted)

                utils.update_state(STATE, "projects", item[bookmark_property])

                suburl = url + "/{}/user_assignments".format(item["id"])
                project_users_data = request(suburl,
                                             params={"updated_since": updated_since})
                project_users_time_extracted = utils.now()

                for subrow in project_users_data:
                    subitem = subrow["user_assignment"]
                    subitem = transformer.transform(subitem, users_schema)
                    singer.write_record("project_users", subitem,
                                        time_extracted=project_users_time_extracted)

                suburl = url + "/{}/task_assignments".format(item["id"])
                task_assignments_data = request(suburl,
                                                params={"updated_since": updated_since})
                task_assignments_time_extracted = utils.now()

                for subrow in task_assignments_data:
                    subitem = subrow["task_assignment"]
                    subitem = transformer.transform(subitem, tasks_schema)
                    singer.write_record("project_tasks", subitem,
                                        time_extracted=task_assignments_time_extracted)

                suburl = url + "/{}/entries".format(item["id"])
                subparams = {
                    "from": start_dt.strftime("%Y%m%d"),
                    "to": datetime.datetime.utcnow().strftime("%Y%m%d"),
                    "updated_since": updated_since,
                }
                time_entries_data = request(suburl, params=subparams)
                time_entries_time_extracted = utils.now()

                for subrow in time_entries_data:
                    subitem = subrow["day_entry"]
                    subitem = transformer.transform(subitem, entries_schema)
                    singer.write_record("time_entries", subitem,
                                        time_extracted=time_entries_time_extracted)

    singer.write_state(STATE)
def sync_transactions():
    schema = load_schema("transactions")

    singer.write_schema("transactions", schema, ["id"],
                        bookmark_properties=['created_at'])

    latest_updated_at = utils.strptime_to_utc(
        STATE.get('latest_updated_at', DEFAULT_TIMESTAMP))
    run_maximum_updated_at = latest_updated_at

    latest_disbursement_date = utils.strptime_to_utc(
        STATE.get('latest_disbursment_date', DEFAULT_TIMESTAMP))
    run_maximum_disbursement_date = latest_disbursement_date

    latest_start_date = utils.strptime_to_utc(get_start("transactions"))

    period_start = latest_start_date - TRAILING_DAYS
    period_end = utils.now()

    logger.info("transactions: Syncing from {}".format(period_start))
    logger.info(
        "transactions: latest_updated_at from {}, disbursement_date from {}".format(
            latest_updated_at, latest_disbursement_date))
    logger.info(
        "transactions: latest_start_date from {}".format(latest_start_date))

    # increment through each day (20k results max from api)
    for start, end in daterange(period_start, period_end):
        end = min(end, period_end)

        data = braintree.Transaction.search(
            braintree.TransactionSearch.created_at.between(start, end))
        time_extracted = utils.now()

        logger.info("transactions: Fetched {} records from {} - {}".format(
            data.maximum_size, start, end))

        row_written_count = 0
        row_skipped_count = 0

        for row in data:
            # Ensure updated_at consistency
            if not getattr(row, 'updated_at'):
                row.updated_at = row.created_at

            transformed = transform_row(row, schema)
            updated_at = to_utc(row.updated_at)

            # if disbursement is successful, get disbursement date
            # set disbursement datetime to min if not found
            if row.disbursement_details is None:
                disbursement_date = datetime.min
            else:
                if row.disbursement_details.disbursement_date is None:
                    row.disbursement_details.disbursement_date = datetime.min

                disbursement_date = to_utc(
                    datetime.combine(row.disbursement_details.disbursement_date,
                                     datetime.min.time()))

            # Is this more recent than our past stored value of update_at?
            # Is this more recent than our past stored value of disbursement_date?
            # Use >= for updated_at due to non monotonic updated_at values
            # Use > for disbursement_date - confirming all transactions disbursed
            # at the same time
            # Update our high water mark for updated_at and disbursement_date
            # in this run
            if (updated_at >= latest_updated_at) or (disbursement_date >= latest_disbursement_date):
                if updated_at > run_maximum_updated_at:
                    run_maximum_updated_at = updated_at

                if disbursement_date > run_maximum_disbursement_date:
                    run_maximum_disbursement_date = disbursement_date

                singer.write_record("transactions", transformed,
                                    time_extracted=time_extracted)
                row_written_count += 1
            else:
                row_skipped_count += 1

        logger.info("transactions: Written {} records from {} - {}".format(
            row_written_count, start, end))
        logger.info("transactions: Skipped {} records from {} - {}".format(
            row_skipped_count, start, end))

    # End day loop
    logger.info("transactions: Complete. Last updated record: {}".format(
        run_maximum_updated_at))
    logger.info("transactions: Complete. Last disbursement date: {}".format(
        run_maximum_disbursement_date))

    latest_updated_at = min(run_maximum_updated_at, period_end)
    latest_disbursement_date = min(run_maximum_disbursement_date, period_end)

    STATE['latest_updated_at'] = utils.strftime(latest_updated_at)
    STATE['latest_disbursement_date'] = utils.strftime(latest_disbursement_date)
    utils.update_state(STATE, "transactions", utils.strftime(end))
    singer.write_state(STATE)
def sync_engagements(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    if "schema" in catalog:
        schema = catalog["schema"]
    else:
        schema = load_schema('engagements')
    bookmark_key = 'lastUpdated'
    singer.write_schema("engagements", schema, ["engagement_id"],
                        [bookmark_key], catalog.get('stream_alias'))
    start = get_start(STATE, "engagements", bookmark_key)

    # Because this stream doesn't query by `lastUpdated`, it cycles
    # through the data set every time. The issue with this is that there
    # is a race condition by which records may be updated between the
    # start of this table's sync and the end, causing some updates to not
    # be captured, in order to combat this, we must store the current
    # sync's start in the state and not move the bookmark past this value.
    current_sync_start = get_current_sync_start(STATE, "engagements") or utils.now()
    STATE = write_current_sync_start(STATE, "engagements", current_sync_start)
    singer.write_state(STATE)

    max_bk_value = start
    LOGGER.info("sync_engagements from %s", start)

    STATE = singer.write_bookmark(STATE, 'engagements', bookmark_key, start)
    singer.write_state(STATE)

    url = get_url("engagements_all")
    params = {'limit': 250}
    top_level_key = "results"
    engagements = gen_request(STATE, 'engagements', url, params,
                              top_level_key, "hasMore", ["offset"], ["offset"])

    time_extracted = utils.now()

    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for engagement in engagements:
            record = bumble_bee.transform(
                lift_properties_and_versions(engagement), schema, mdata)
            if record['engagement'][bookmark_key] >= start:
                # hoist PK and bookmark field to top-level record
                record['engagement_id'] = record['engagement']['id']
                record[bookmark_key] = record['engagement'][bookmark_key]
                singer.write_record("engagements", record,
                                    catalog.get('stream_alias'),
                                    time_extracted=time_extracted)
                if record['engagement'][bookmark_key] >= max_bk_value:
                    max_bk_value = record['engagement'][bookmark_key]

    # Don't bookmark past the start of this sync to account for updated records during the sync.
    new_bookmark = min(utils.strptime_to_utc(max_bk_value), current_sync_start)
    STATE = singer.write_bookmark(STATE, 'engagements', bookmark_key,
                                  utils.strftime(new_bookmark))
    STATE = write_current_sync_start(STATE, 'engagements', None)
    singer.write_state(STATE)
    return STATE
def write_schema(self):
    singer.write_schema(self.schema,
                        self.get_schema(),
                        key_properties=self.key_properties)
def sync_substream(self, state, parent, sub_stream, parent_response):
    bookmark_date = self.get_bookmark(state, sub_stream.name,
                                      self.config.get('start_date'),
                                      sub_stream.replication_key)
    # If last sync was interrupted, get last processed parent record
    last_processed = self.get_bookmark(state, sub_stream.name, None,
                                       key="last_processed")
    bookmark_dttm = strptime_to_utc(bookmark_date)
    new_bookmark = bookmark_dttm

    singer.write_schema(sub_stream.name,
                        sub_stream.stream.schema.to_dict(),
                        sub_stream.key_properties)

    # Slice response for >= last processed
    if last_processed:
        for i, e in enumerate(parent_response):
            if e.get(parent.key_properties[0]) == last_processed:
                LOGGER.info("Resuming %s sync with %s", sub_stream.name,
                            e.get(parent.key_properties[0]))
                parent_response = parent_response[i:len(parent_response)]
                continue

    for record in parent_response:
        try:
            with metrics.record_counter(sub_stream.name) as counter, Transformer(
                    integer_datetime_fmt="unix-milliseconds-integer-datetime-parsing"
            ) as transformer:
                stream_events = sub_stream.sync(state, new_bookmark,
                                                record.get(parent.key_properties[0]))
                for event in stream_events:
                    counter.increment()

                    schema_dict = sub_stream.stream.schema.to_dict()
                    stream_metadata = metadata.to_map(sub_stream.stream.metadata)

                    transformed_event = humps.decamelize(event)

                    try:
                        transformed_record = transformer.transform(
                            transformed_event, schema_dict, stream_metadata)
                    except Exception as err:
                        LOGGER.error('Error: %s', err)
                        LOGGER.error(' for schema: %s',
                                     json.dumps(schema_dict, sort_keys=True, indent=2))
                        raise err

                    event_time = strptime_to_utc(
                        transformed_record.get(sub_stream.replication_key))

                    new_bookmark = max(new_bookmark, event_time)
                    singer.write_record(sub_stream.stream.tap_stream_id,
                                        transformed_record)
        except HTTPError:
            LOGGER.warning("Unable to retrieve %s Event for Stream (ID: %s)",
                           sub_stream.name, record[parent.key_properties[0]])

        # All events for this parent processed; record it so an interrupted
        # sync can remove/resume from last processed
        self.update_bookmark(state=state,
                             stream=sub_stream.name,
                             bookmark_value=record.get(parent.key_properties[0]),
                             bookmark_key="last_processed")
        self.update_bookmark(state=state,
                             stream=sub_stream.name,
                             bookmark_value=strftime(new_bookmark),
                             bookmark_key=sub_stream.replication_key)

    # After processing for all parent ids we can remove our resumption state
    state.get('bookmarks').get(sub_stream.name).pop('last_processed')
    update_currently_syncing(state, None)
def write_schema(self):
    singer.write_schema(self.catalog.stream,
                        self.catalog.schema.to_dict(),
                        key_properties=self.stream_metadata.get(
                            'table-key-properties', []))
def load_and_write_schema(tap_stream_id, catalog):
    stream = get_stream_from_catalog(tap_stream_id, catalog)
    singer.write_schema(tap_stream_id, stream['schema'],
                        PK_FIELDS[tap_stream_id])
def sync_allocations(
    schema_name,
    endpoint=None,
    path=None,
    special_field_name=None,
    special_field_value=None,
    keys=None,
    object_to_id=None,
):
    schema = load_schema(schema_name)
    bookmark_property = 'updated_at'

    LOGGER.info('Loading ' + schema_name)
    if keys is None:
        keys = ['id']
    singer.write_schema(schema_name, schema, keys,
                        bookmark_properties=[bookmark_property])

    start = get_start(schema_name)
    weekDays = [
        'monday',
        'tuesday',
        'wednesday',
        'thursday',
        'friday',
        'saturday',
        'sunday',
    ]

    with Transformer() as transformer:
        url = get_url(endpoint or schema_name)
        url = endpoint or url
        response = request(url, None)
        time_extracted = utils.now()

        for row in response:
            # expand each allocation into one record per day between
            # start_date and end_date
            date = datetime.datetime.strptime(row['start_date'], '%Y-%m-%d')
            LOGGER.info("Project" + str(row['project']) + "-" + str(row['person']))
            end_date = datetime.datetime.strptime(row['end_date'], '%Y-%m-%d')
            newRow = {}

            while date <= end_date:
                newRow['allocation'] = row[weekDays[date.weekday()]]
                if not newRow['allocation'] > 0:
                    date = date + timedelta(days=1)
                    continue

                newRow['project'] = row['project']
                newRow['non_project_time'] = row['non_project_time']
                newRow['connected_project'] = row['connected_project']
                newRow['person'] = row['person']
                newRow['date'] = date.strftime('%Y-%m-%d')
                newRow['notes'] = row['notes']
                newRow['created_by'] = row['created_by']
                newRow['updated_by'] = row['updated_by']
                newRow['created_at'] = row['created_at']
                newRow['updated_at'] = row['updated_at']
                newRow['id'] = str(row['id']) + str(date.strftime('%Y%m%d'))
                date = date + timedelta(days=1)

                item = transformer.transform(newRow, schema)
                if not bookmark_property in item:
                    item[bookmark_property] = \
                        datetime.datetime.now().strftime('%Y-%m-%d') \
                        + 'T00:00:00Z'

                if bookmark_property in item and item[bookmark_property] >= start:
                    singer.write_record(schema_name, item,
                                        time_extracted=time_extracted)
                    utils.update_state(STATE, schema_name, item[bookmark_property])
                else:
                    singer.write_record(schema_name, item,
                                        time_extracted=time_extracted)
                    # take any additional actions required for the currently loaded endpoint
                    utils.update_state(STATE, schema_name, item[bookmark_property])

    singer.write_state(STATE)
def sync_companies(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    bumble_bee = Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING)
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_to_utc(get_start(STATE, "companies", bookmark_key))
    LOGGER.info("sync_companies from %s", start)
    schema = load_schema('companies')
    singer.write_schema("companies", schema, ["companyId"],
                        [bookmark_key], catalog.get('stream_alias'))

    # Because this stream doesn't query by `lastUpdated`, it cycles
    # through the data set every time. The issue with this is that there
    # is a race condition by which records may be updated between the
    # start of this table's sync and the end, causing some updates to not
    # be captured, in order to combat this, we must store the current
    # sync's start in the state and not move the bookmark past this value.
    current_sync_start = get_current_sync_start(STATE, "companies") or utils.now()
    STATE = write_current_sync_start(STATE, "companies", current_sync_start)
    singer.write_state(STATE)

    url = get_url("companies_all")
    max_bk_value = start
    if CONTACTS_BY_COMPANY in ctx.selected_stream_ids:
        contacts_by_company_schema = load_schema(CONTACTS_BY_COMPANY)
        singer.write_schema("contacts_by_company", contacts_by_company_schema,
                            ["company-id", "contact-id"])

    with bumble_bee:
        for row in gen_request(STATE, 'companies', url, default_company_params,
                               'companies', 'has-more', ['offset'], ['offset']):
            row_properties = row['properties']
            modified_time = None
            if bookmark_key in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties[bookmark_key]['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(
                    timestamp_millis, datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties['createdate']['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(
                    timestamp_millis, datetime.timezone.utc)

            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = request(
                    get_url("companies_detail", company_id=row['companyId'])).json()
                record = bumble_bee.transform(
                    lift_properties_and_versions(record), schema, mdata)
                singer.write_record("companies", record,
                                    catalog.get('stream_alias'),
                                    time_extracted=utils.now())
                if CONTACTS_BY_COMPANY in ctx.selected_stream_ids:
                    STATE = _sync_contacts_by_company(STATE, ctx, record['companyId'])

    # Don't bookmark past the start of this sync to account for updated records during the sync.
    new_bookmark = min(max_bk_value, current_sync_start)
    STATE = singer.write_bookmark(STATE, 'companies', bookmark_key,
                                  utils.strftime(new_bookmark))
    STATE = write_current_sync_start(STATE, 'companies', None)
    singer.write_state(STATE)
    return STATE
def sync_rate_cards(  # pylint: disable=too-many-arguments
    schema_name,
    endpoint=None,
    path=None,
    special_field_name=None,
    special_field_value=None,
    date_fields=None,
    with_updated_since=True,
    for_each_handler=None,
    map_handler=None,
    object_to_id=None,
):
    schema = load_schema(schema_name)
    bookmark_property = 'updated_at'

    LOGGER.info('Loading ' + schema_name)
    singer.write_schema(schema_name, schema, ['id'],
                        bookmark_properties=[bookmark_property])

    start = get_start(schema_name)

    with Transformer() as transformer:
        url = get_url(endpoint or schema_name)
        url = endpoint or url
        response = request(url, None)
        time_extracted = utils.now()

        for row in response:
            if map_handler is not None:
                row = map_handler(row)

            if object_to_id is not None:
                for key in object_to_id:
                    if row[key] is not None:
                        row[key + '_id'] = row[key]['id']
                    else:
                        row[key + '_id'] = None

            item = transformer.transform(row, schema)
            if not bookmark_property in item:
                item[bookmark_property] = \
                    datetime.datetime.now().strftime('%Y-%m-%d') \
                    + 'T00:00:00Z'

            # sync the rates attached to this rate card
            sync_endpoint(
                'rate_cards_rates',
                BASE_API_URL + 'rate_cards/' + str(row['id']) + '/rates',
                None,
                'rate_card_id',
                str(row['id']),
                ['rate_card_id', 'role'],
            )

            singer.write_record(schema_name, item, time_extracted=time_extracted)

            # take any additional actions required for the currently loaded endpoint
            utils.update_state(STATE, schema_name, item[bookmark_property])

    singer.write_state(STATE)
def sync_deals(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_with_tz(get_start(STATE, "deals", bookmark_key))
    max_bk_value = start
    LOGGER.info("sync_deals from %s", start)
    most_recent_modified_time = start
    params = {'count': 250, 'includeAssociations': False, 'properties': []}

    schema = load_schema("deals")
    singer.write_schema("deals", schema, ["dealId"],
                        [bookmark_key], catalog.get('stream_alias'))

    # Check if we should include associations
    for key in mdata.keys():
        if 'associations' in key:
            assoc_mdata = mdata.get(key)
            if assoc_mdata.get('selected') and assoc_mdata.get('selected') == True:
                params['includeAssociations'] = True

    if mdata.get(('properties', 'properties'), {}).get('selected') or has_selected_custom_field(mdata):
        # On 2/12/20, hubspot added a lot of additional properties for
        # deals, and appending all of them to requests ended up leading to
        # 414 (url-too-long) errors. Hubspot recommended we use the
        # `includeAllProperties` and `allpropertiesFetchMode` params
        # instead.
        params['includeAllProperties'] = True
        params['allPropertiesFetchMode'] = 'latest_version'

    url = get_url('deals_all')

    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request(STATE, 'deals', url, params, 'deals',
                               "hasMore", ["offset"], ["offset"]):
            row_properties = row['properties']
            modified_time = None
            if bookmark_key in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties[bookmark_key]['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(
                    timestamp_millis, datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties['createdate']['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(
                    timestamp_millis, datetime.timezone.utc)

            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = bumble_bee.transform(
                    lift_properties_and_versions(row), schema, mdata)
                singer.write_record("deals", record,
                                    catalog.get('stream_alias'),
                                    time_extracted=utils.now())

    STATE = singer.write_bookmark(STATE, 'deals', bookmark_key,
                                  utils.strftime(max_bk_value))
    singer.write_state(STATE)
    return STATE
def write_schema(self):
    # for /recents/ streams override default (schema name equals to endpoint) with items
    singer.write_schema(self.schema,
                        self.get_schema(),
                        key_properties=self.key_properties)
def sync_entity_chunked(STATE, catalog, entity_name, key_properties, path):
    schema = load_schema(entity_name)
    bookmark_key = 'startTimestamp'

    singer.write_schema(entity_name, schema, key_properties,
                        [bookmark_key], catalog.get('stream_alias'))

    start = get_start(STATE, entity_name, bookmark_key)
    LOGGER.info("sync_%s from %s", entity_name, start)

    now = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC)
    now_ts = int(now.timestamp() * 1000)

    start_ts = int(utils.strptime_with_tz(start).timestamp() * 1000)
    url = get_url(entity_name)

    mdata = metadata.to_map(catalog.get('metadata'))

    if entity_name == 'email_events':
        window_size = int(CONFIG['email_chunk_size'])
    elif entity_name == 'subscription_changes':
        window_size = int(CONFIG['subscription_chunk_size'])

    with metrics.record_counter(entity_name) as counter:
        while start_ts < now_ts:
            end_ts = start_ts + window_size
            params = {
                'startTimestamp': start_ts,
                'endTimestamp': end_ts,
                'limit': 1000,
            }
            with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
                while True:
                    our_offset = singer.get_offset(STATE, entity_name)
                    if bool(our_offset) and our_offset.get('offset') != None:
                        params[StateFields.offset] = our_offset.get('offset')

                    data = request(url, params).json()
                    time_extracted = utils.now()

                    for row in data[path]:
                        counter.increment()
                        record = bumble_bee.transform(
                            lift_properties_and_versions(row), schema, mdata)
                        singer.write_record(entity_name, record,
                                            catalog.get('stream_alias'),
                                            time_extracted=time_extracted)
                    if data.get('hasMore'):
                        STATE = singer.set_offset(STATE, entity_name, 'offset',
                                                  data['offset'])
                        singer.write_state(STATE)
                    else:
                        STATE = singer.clear_offset(STATE, entity_name)
                        singer.write_state(STATE)
                        break
            STATE = singer.write_bookmark(
                STATE, entity_name, 'startTimestamp',
                utils.strftime(datetime.datetime.fromtimestamp(
                    (start_ts / 1000), datetime.timezone.utc)))
            singer.write_state(STATE)
            start_ts = end_ts

    STATE = singer.clear_offset(STATE, entity_name)
    singer.write_state(STATE)
    return STATE
def output_schema(stream):
    schema = schemas.load_schema(stream.tap_stream_id)
    pk_fields = schemas.PK_FIELDS[stream.tap_stream_id]
    singer.write_schema(stream.tap_stream_id, schema, pk_fields)
def write_schema(self):
    singer.write_schema(self.catalog.stream,
                        self.catalog.schema.to_dict(),
                        key_properties=self.KEY_PROPERTIES,
                        bookmark_properties=self.BOOKMARK_PROPERTIES)
def do_sync(config, state, catalog):
    """ Sync data from tap source """
    # Loop over selected streams in catalog
    # pickup_year is the most recent year value in the STATE file
    now = datetime.datetime.now()

    for stream in catalog.get_selected_streams(state):
        LOGGER.info("Syncing stream:" + stream.tap_stream_id)

        bookmark_column = stream.replication_key
        is_sorted = False  # TODO: indicate whether data is sorted ascending on bookmark value

        if "startyear" in config.keys():
            stream_start_year = config['startyear']
        else:
            stream_start_year = "2000"

        if ("endyear" in config.keys()) and (len(config['endyear']) > 3):
            try:
                stream_end_year = config['endyear'] if int(config['endyear']) <= now.year else now.year
            except:
                stream_end_year = now.year
        else:
            stream_end_year = now.year

        if "calculations" in config.keys():
            stream_calculations = config['calculations']
        else:
            stream_calculations = "False"

        if "annualaverage" in config.keys():
            stream_annualaverage = config['annualaverage']
        else:
            stream_annualaverage = "False"

        if "aspects" in config.keys():
            stream_aspects = config['aspects']
        else:
            stream_aspects = "False"

        # check if the STATE.json requests a more recent start date
        if "bookmarks" in state.keys():  # check the state even has bookmarks...
            if stream.stream in state["bookmarks"].keys():  # if this stream has an entry in the state.json file...
                try:
                    pickup_year = int(state["bookmarks"][stream.stream]['year'])
                except:
                    start_year = False
                    year_reset = "There was an error with the year format \"" \
                        + state[stream.stream] + "\" in the State.json file for stream " \
                        + str(stream.stream) + " - picking up at year " \
                        + str(stream_start_year) + "."
                    LOGGER.info(year_reset)
                else:
                    start_year = int(config['startyear'])
                    if start_year < pickup_year and pickup_year <= now.year:
                        stream_start_year = str(pickup_year)
                        year_reset = "As per state, overriding start year for stream " \
                            + str(stream.stream) + " to " + stream_start_year
                        LOGGER.info(year_reset)

        # make the call
        the_call = {
            "seriesid": [stream.tap_stream_id],
            "startyear": stream_start_year,
            "endyear": stream_end_year,
            "calculations": stream_calculations,
            "annualaverage": stream_annualaverage,
            "aspects": stream_aspects
        }
        if 'api-key' in config.keys():
            the_call["registrationkey"] = config['api-key']

        json_data = call_api(the_call)
        if not json_data:
            return

        raw_schema = stream.schema.to_dict()

        # first character of the period assigns 'A' for annual, 'Q' for quarterly and 'M' for monthly.
        series_frequency = json_data['Results']['series'][0]['data'][0]['period'][0]
if series_frequency == "A": # series is annual raw_schema['properties']['year'] = {"type": ["null", "integer"]} if series_frequency == "S": # series is semi-annual raw_schema['properties']['period'] = {"type": ["null", "integer"]} if series_frequency == "Q": # series is quarterly raw_schema['properties']['quarter'] = {"type": ["null", "integer"]} raw_schema['properties']['year'] = {"type": ["null", "integer"]} if series_frequency == "M": # series is monthly raw_schema['properties']['month'] = {"type": ["null", "integer"]} if ("calculations" in config.keys()) and (config['calculations'].lower() == "true"): raw_schema['properties']['net_change_1'] = { "type": ["null", "number"] } raw_schema['properties']['net_change_3'] = { "type": ["null", "number"] } raw_schema['properties']['net_change_6'] = { "type": ["null", "number"] } raw_schema['properties']['net_change_12'] = { "type": ["null", "number"] } raw_schema['properties']['pct_change_1'] = { "type": ["null", "number"] } raw_schema['properties']['pct_change_3'] = { "type": ["null", "number"] } raw_schema['properties']['pct_change_6'] = { "type": ["null", "number"] } raw_schema['properties']['pct_change_12'] = { "type": ["null", "number"] } if ("aspects" in config.keys()) and (config['aspects'].lower() == "true"): raw_schema['properties']['aspects'] = {"type": ["null", "string"]} if ("annualaverage" in config.keys()) and (config['annualaverage'].lower() == "true"): raw_schema['properties']['annualaverage'] = { "type": ["null", "number"] } singer.write_schema( stream_name=stream.tap_stream_id, schema= raw_schema, #the "to_dict()" bit is a change to the current cookiecutter template on Github. key_properties=stream.key_properties, ) max_bookmark = 0 max_year = 0 utc = pytz.timezone('UTC') thetime = utc.localize(datetime.datetime.now()) thetimeformatted = thetime.astimezone().isoformat() for series in json_data['Results']['series']: seriesId = series['seriesID'] time_extracted = utc.localize( datetime.datetime.now()).astimezone().isoformat() for item in series['data']: year = item['year'] if max_year < int(year): max_year = int(year) period = item['period'] if period[0] == 'M': month = int(period[1] + period[2]) quarter = round((int(period[1] + period[2]) / 3) + 0.3) elif period[0] == 'Q': month = 0 quarter = period[2] elif period[0] == 'S': month = 0 quarter = 0 period = period[2] elif period[0] == 'A': month = 0 quarter = 0 else: month = "" quater = "" value = item['value'] # if series_frequency == "A": # next_row['year'] = item['something'] # if series_frequency == "Q": # next_row['quarter'] = item['something'] # next_row['year'] = item['something'] # if series_frequency == "M": # next_row['month'] = item['something'] full_period = str(year) + "-" + str( "{0:0=2d}".format(month)) + "-01T00:00:00-04:00" footnotes = "" for footnote in item['footnotes']: if footnote: footnotes = footnotes + footnote['text'] + ',' next_row = { "type": "RECORD", "stream": seriesId, "time_extracted": time_extracted, "schema": seriesId, "frequency": series_frequency, "record": { "SeriesID": seriesId, "year": year, "period": period, "value": value, "footnotes": footnotes[0:-1], "month": str(month), "quarter": str(quarter), "time_extracted": time_extracted, "full_period": full_period } } if ("calculations" in config.keys()) and (config['calculations'].lower() == "true"): if ("calculations" in item.keys()): if ("net_changes" in item["calculations"].keys()): next_row['net_change_1'] = float( item['calculations']['net_changes'] ['1']) if '1' in item['calculations'][ 
'net_changes'].keys() else None next_row['net_change_3'] = float( item['calculations']['net_changes'] ['3']) if '3' in item['calculations'][ 'net_changes'].keys() else None next_row['net_change_6'] = float( item['calculations']['net_changes'] ['6']) if '6' in item['calculations'][ 'net_changes'].keys() else None next_row['net_change_12'] = float( item['calculations']['net_changes'] ['12']) if '12' in item['calculations'][ 'net_changes'].keys() else None else: next_row['net_change_1'] = next_row[ 'net_change_3'] = next_row[ 'net_change_6'] = next_row[ 'net_change_12'] = None if ("net_changes" in item["calculations"].keys()): next_row['pct_change_1'] = float( item['calculations']['pct_changes'] ['1']) if '1' in item['calculations'][ 'pct_changes'].keys() else None next_row['pct_change_3'] = float( item['calculations']['pct_changes'] ['3']) if '3' in item['calculations'][ 'pct_changes'].keys() else None next_row['pct_change_6'] = float( item['calculations']['pct_changes'] ['6']) if '6' in item['calculations'][ 'pct_changes'].keys() else None next_row['pct_change_12'] = float( item['calculations']['pct_changes'] ['12']) if '12' in item['calculations'][ 'pct_changes'].keys() else None else: next_row['pct_change_1'] = next_row[ 'pct_change_3'] = next_row[ 'pct_change_6'] = next_row[ 'pct_change_12'] = None else: next_row['net_change_1'] = next_row[ 'net_change_3'] = next_row[ 'net_change_6'] = next_row[ 'net_change_12'] = next_row[ 'pct_change_1'] = next_row[ 'pct_change_3'] = next_row[ 'pct_change_6'] = next_row[ 'pct_change_12'] = None if ("aspects" in config.keys()) and (config['aspects'].lower() == "true"): next_row['aspects'] = str(item['aspects']) if ("annualaverage" in config.keys()) and (config['annualaverage'].lower() == "true"): if period == 'M13' or period == 'Q5': next_row['annualaverage'] = float(item['value']) else: next_row['annualaverage'] = None # write one or more rows to the stream: singer.write_records(stream.tap_stream_id, [next_row]) # capture stream state if bookmark_column: if is_sorted: # update bookmark to latest value - this is redundant for tap-bls singer.write_state({ stream.tap_stream_id: next_row["record"][bookmark_column[0]] }) else: # if data unsorted, save max value until end of writes. tap-bls goes by the year and will use this approach max_bookmark = max( max_bookmark, int(next_row["record"][bookmark_column[0]])) if bookmark_column and not is_sorted: singer.write_state({stream.tap_stream_id: max_bookmark}) if (config['update_state'].lower() == 'true') and ( stream_start_year == config['startyear'] ): # if you set 'uptadate_state' in config.json the *tap* will update the STATE file - note this is NOT standard behaviour in Singer data flows as the *target* should handle STATE updates. LOGGER.info(update_state({stream.tap_stream_id: max_bookmark})) return
def load_and_write_schema(tap_stream_id):
    schema = load_schema(tap_stream_id)
    singer.write_schema(tap_stream_id, schema, pk_fields[tap_stream_id])
def do_sync(sf, catalog, state):
    starting_stream = state.get("current_stream")

    if starting_stream:
        LOGGER.info("Resuming sync from %s", starting_stream)
    else:
        LOGGER.info("Starting sync")

    for catalog_entry in catalog["streams"]:
        stream_version = get_stream_version(catalog_entry, state)
        stream = catalog_entry['stream']
        stream_alias = catalog_entry.get('stream_alias')
        stream_name = catalog_entry["tap_stream_id"]
        activate_version_message = singer.ActivateVersionMessage(
            stream=(stream_alias or stream), version=stream_version)

        catalog_metadata = metadata.to_map(catalog_entry['metadata'])
        replication_key = catalog_metadata.get((), {}).get('replication-key')

        mdata = metadata.to_map(catalog_entry['metadata'])

        if not stream_is_selected(mdata):
            LOGGER.info("%s: Skipping - not selected", stream_name)
            continue

        if starting_stream:
            if starting_stream == stream_name:
                LOGGER.info("%s: Resuming", stream_name)
                starting_stream = None
            else:
                LOGGER.info("%s: Skipping - already synced", stream_name)
                continue
        else:
            LOGGER.info("%s: Starting", stream_name)

        state["current_stream"] = stream_name
        singer.write_state(state)
        key_properties = metadata.to_map(catalog_entry['metadata']).get(
            (), {}).get('table-key-properties')
        singer.write_schema(stream, catalog_entry['schema'], key_properties,
                            replication_key, stream_alias)

        job_id = singer.get_bookmark(state, catalog_entry['tap_stream_id'], 'JobID')
        if job_id:
            with metrics.record_counter(stream) as counter:
                LOGGER.info(
                    "Found JobID from previous Bulk Query. Resuming sync for job: %s",
                    job_id)
                # Resuming a sync should clear out the remaining state once finished
                counter = resume_syncing_bulk_query(sf, catalog_entry, job_id, state, counter)
                LOGGER.info("%s: Completed sync (%s rows)", stream_name, counter.value)

                # Remove Job info from state once we complete this resumed query. One of a few cases could have occurred:
                # 1. The job succeeded, in which case make JobHighestBookmarkSeen the new bookmark
                # 2. The job partially completed, in which case make JobHighestBookmarkSeen the new bookmark, or
                #    existing bookmark if no bookmark exists for the Job.
                # 3. The job completely failed, in which case maintain the existing bookmark, or None if no bookmark
                state.get('bookmarks', {}).get(catalog_entry['tap_stream_id'], {}).pop('JobID', None)
                state.get('bookmarks', {}).get(catalog_entry['tap_stream_id'], {}).pop('BatchIDs', None)
                bookmark = state.get('bookmarks', {}).get(catalog_entry['tap_stream_id'], {}) \
                    .pop('JobHighestBookmarkSeen', None)
                existing_bookmark = state.get('bookmarks', {}).get(catalog_entry['tap_stream_id'], {}) \
                    .pop(replication_key, None)
                state = singer.write_bookmark(
                    state,
                    catalog_entry['tap_stream_id'],
                    replication_key,
                    bookmark or existing_bookmark)  # If job is removed, reset to existing bookmark or None
                singer.write_state(state)
        else:
            # Tables with a replication_key or an empty bookmark will emit an
            # activate_version at the beginning of their sync
            bookmark_is_empty = state.get('bookmarks', {}).get(
                catalog_entry['tap_stream_id']) is None

            if replication_key or bookmark_is_empty:
                singer.write_message(activate_version_message)
                state = singer.write_bookmark(state,
                                              catalog_entry['tap_stream_id'],
                                              'version',
                                              stream_version)
            counter = sync_stream(sf, catalog_entry, state)
            LOGGER.info("%s: Completed sync (%s rows)", stream_name, counter.value)

    state["current_stream"] = None
    singer.write_state(state)
    LOGGER.info("Finished sync")
def test_write_schema(self):
    schema = {
        'type': 'object',
        'properties': {
            'name': {'type': 'string'}
        }
    }
    singer.write_schema("users", schema, ["name"])
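# Context for the test above (not part of the original source): per the Singer
# specification, write_schema serializes a SCHEMA message to stdout, which a
# target consumes before any RECORD messages for that stream. A minimal,
# self-contained sketch of the call and the JSON line it emits
# (field ordering may differ):
#
#   {"type": "SCHEMA", "stream": "users",
#    "schema": {"type": "object", "properties": {"name": {"type": "string"}}},
#    "key_properties": ["name"]}
import singer

if __name__ == "__main__":
    singer.write_schema(
        stream_name="users",
        schema={"type": "object", "properties": {"name": {"type": "string"}}},
        key_properties=["name"],
    )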
async def sync_report_interval(client, account_id, report_stream, start_date, end_date):
    state_key = '{}_{}'.format(account_id, report_stream.stream)
    report_name = stringcase.pascalcase(report_stream.stream)

    report_schema = get_report_schema(client, report_name)
    singer.write_schema(report_stream.stream, report_schema, [])

    report_time = arrow.get().isoformat()

    request_id = get_report_request_id(client, account_id, report_stream,
                                       report_name, start_date, end_date,
                                       state_key)

    singer.write_bookmark(STATE, state_key, 'request_id', request_id)
    singer.write_state(STATE)

    try:
        success, download_url = await poll_report(client, account_id, report_name,
                                                  start_date, end_date, request_id)
    except Exception as some_error:
        LOGGER.info('The request_id %s for %s is invalid, generating a new one',
                    request_id, state_key)
        request_id = get_report_request_id(client, account_id, report_stream,
                                           report_name, start_date, end_date,
                                           state_key, force_refresh=True)

        singer.write_bookmark(STATE, state_key, 'request_id', request_id)
        singer.write_state(STATE)

        success, download_url = await poll_report(client, account_id, report_name,
                                                  start_date, end_date, request_id)

    if success and download_url:
        LOGGER.info('Streaming report: {} for account {} - from {} to {}'.format(
            report_name, account_id, start_date, end_date))

        stream_report(report_stream.stream, report_name, download_url, report_time)
        singer.write_bookmark(STATE, state_key, 'request_id', None)
        singer.write_bookmark(STATE, state_key, 'date', end_date.isoformat())
        singer.write_state(STATE)
        return True
    elif success and not download_url:
        LOGGER.info('No data for report: {} for account {} - from {} to {}'.format(
            report_name, account_id, start_date, end_date))

        singer.write_bookmark(STATE, state_key, 'request_id', None)
        singer.write_bookmark(STATE, state_key, 'date', end_date.isoformat())
        singer.write_state(STATE)
        return True
    else:
        LOGGER.info('Unsuccessful request for report: {} for account {} - from {} to {}'.format(
            report_name, account_id, start_date, end_date))

        singer.write_bookmark(STATE, state_key, 'request_id', None)
        singer.write_state(STATE)
        return False
def write_schema(self):
    singer.write_schema(self.catalog.get('stream'),
                        self.catalog.get('schema'),
                        key_properties=self.catalog.get('key_properties'))