def sync_stream(stream_name):
    """
    Sync each stream, looking for newly created records.
    Updates are captured by the events stream.
    """
    LOGGER.info("Started syncing stream %s", stream_name)

    stream_metadata = metadata.to_map(Context.get_catalog_entry(stream_name)['metadata'])
    stream_field_whitelist = json.loads(Context.config.get('whitelist_map', '{}')).get(stream_name)

    extraction_time = singer.utils.now()
    replication_key = metadata.get(stream_metadata, (), 'valid-replication-keys')[0]
    # Invoice Items bookmarks on `date`, but queries on `created`
    filter_key = 'created' if stream_name == 'invoice_items' else replication_key
    stream_bookmark = singer.get_bookmark(Context.state, stream_name, replication_key) or \
        int(utils.strptime_to_utc(Context.config["start_date"]).timestamp())
    bookmark = stream_bookmark

    # if this stream has a sub_stream, compare the bookmark
    sub_stream_name = SUB_STREAMS.get(stream_name)

    # If there is a sub-stream and it's selected, get its bookmark (or the start date if no bookmark)
    should_sync_sub_stream = sub_stream_name and Context.is_selected(sub_stream_name)
    if should_sync_sub_stream:
        sub_stream_bookmark = singer.get_bookmark(Context.state, sub_stream_name, replication_key) \
            or int(utils.strptime_to_utc(Context.config["start_date"]).timestamp())

        # if there is a sub stream, set bookmark to the sub stream's bookmark
        # since we know it must be earlier than the stream's bookmark
        if sub_stream_bookmark != stream_bookmark:
            bookmark = sub_stream_bookmark
    else:
        sub_stream_bookmark = None

    with Transformer(singer.UNIX_SECONDS_INTEGER_DATETIME_PARSING) as transformer:
        end_time = dt_to_epoch(utils.now())

        window_size = float(Context.config.get('date_window_size', DEFAULT_DATE_WINDOW_SIZE))
        if DEFAULT_DATE_WINDOW_SIZE != window_size:
            LOGGER.info('Using non-default date window size of %.2f', window_size)
        start_window = bookmark

        # NB: Immutable streams are never synced for updates. We've
        # observed a short lag period between when records are created and
        # when they are available via the API, so these streams will need
        # a short lookback window.
        if stream_name in IMMUTABLE_STREAMS:
            # pylint:disable=fixme
            # TODO: This may be an issue for other streams' created_at
            # entries, but to keep the surface small, doing this only for
            # immutable streams at first to confirm the suspicion.
            start_window -= IMMUTABLE_STREAM_LOOKBACK

        # NB: We observed records coming through newest->oldest, so
        # date-windowing was added and the tap only bookmarks after it has
        # gotten through a date window
        while start_window < end_time:
            stop_window = dt_to_epoch(epoch_to_dt(start_window) + timedelta(days=window_size))
            # cut off the last window at the end time
            if stop_window > end_time:
                stop_window = end_time

            for stream_obj in paginate(STREAM_SDK_OBJECTS[stream_name]['sdk_object'],
                                       filter_key,
                                       start_window,
                                       stop_window,
                                       STREAM_SDK_OBJECTS[stream_name].get('request_args')):
                # get the replication key value from the object
                rec = unwrap_data_objects(stream_obj.to_dict_recursive())
                rec = reduce_foreign_keys(rec, stream_name)
                stream_obj_created = rec[replication_key]
                rec['updated'] = stream_obj_created

                # sync stream if object is greater than or equal to the bookmark
                if stream_obj_created >= stream_bookmark:
                    rec = transformer.transform(rec,
                                                Context.get_catalog_entry(stream_name)['schema'],
                                                stream_metadata)

                    # At this point, the record has been transformed and so
                    # any de-selected fields have been pruned. Now, prune off
                    # any fields that aren't present in the whitelist.
                    if stream_field_whitelist:
                        rec = apply_whitelist(rec, stream_field_whitelist)

                    singer.write_record(stream_name, rec, time_extracted=extraction_time)
                    Context.new_counts[stream_name] += 1

                # sync the sub stream if it's selected and the parent object
                # is greater than its bookmark
                if should_sync_sub_stream and stream_obj_created > sub_stream_bookmark:
                    sync_sub_stream(sub_stream_name, stream_obj)

            # Update stream/sub-stream bookmarks to the stop window
            if stop_window > stream_bookmark:
                stream_bookmark = stop_window
                singer.write_bookmark(Context.state, stream_name, replication_key, stream_bookmark)

            # the sub stream bookmarks on its parent
            if should_sync_sub_stream and stop_window > sub_stream_bookmark:
                sub_stream_bookmark = stop_window
                singer.write_bookmark(Context.state, sub_stream_name, replication_key, sub_stream_bookmark)

            singer.write_state(Context.state)

            # update window for next iteration
            start_window = stop_window

    singer.write_state(Context.state)
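# A minimal, self-contained sketch of the date-window bookmarking pattern used by
# sync_stream above: records are emitted per window, and the bookmark only advances
# after a whole [start_window, stop_window) range has been processed, so an interrupted
# run never skips a partially synced window. `fetch_records` is a hypothetical fetcher,
# not part of any tap; the singer.* calls are the standard singer-python helpers.
import singer


def sync_with_date_windows(state, stream_name, replication_key, start, end, window_size, fetch_records):
    start_window = start
    while start_window < end:
        stop_window = min(start_window + window_size, end)

        # emit everything created inside this window
        for rec in fetch_records(start_window, stop_window):
            singer.write_record(stream_name, rec)

        # only bookmark once the window is fully processed
        singer.write_bookmark(state, stream_name, replication_key, stop_window)
        singer.write_state(state)

        start_window = stop_window
    return state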
def get_start(state, tap_stream_id, bookmark_key):
    current_bookmark = singer.get_bookmark(state, tap_stream_id, bookmark_key)
    if current_bookmark is None:
        return CONFIG['start_date']
    return current_bookmark
def do_sync(config, state, stream):
    singer.set_currently_syncing(state, stream.tap_stream_id)
    singer.write_state(state)

    client = bigquery.Client()
    metadata = stream.metadata[0]["metadata"]
    tap_stream_id = stream.tap_stream_id

    inclusive_start = True
    start_datetime = singer.get_bookmark(state, tap_stream_id, BOOKMARK_KEY_NAME)
    if start_datetime:
        if not config.get("start_always_inclusive"):
            inclusive_start = False
    else:
        start_datetime = config.get("start_datetime")
    start_datetime = dateutil.parser.parse(start_datetime).strftime("%Y-%m-%d %H:%M:%S.%f")

    if config.get("end_datetime"):
        end_datetime = dateutil.parser.parse(config.get("end_datetime")).strftime("%Y-%m-%d %H:%M:%S.%f")
    else:
        end_datetime = None  # avoid referencing an unbound variable when no end_datetime is configured

    singer.write_schema(tap_stream_id, stream.schema.to_dict(), stream.key_properties)

    keys = {
        "table": metadata["table"],
        "columns": metadata["columns"],
        "datetime_key": metadata.get("datetime_key"),
        "start_datetime": start_datetime,
        "end_datetime": end_datetime
    }

    limit = config.get("limit", None)
    query = _build_query(keys, metadata.get("filters", []), inclusive_start, limit=limit)
    query_job = client.query(query)

    properties = stream.schema.properties
    last_update = start_datetime

    LOGGER.info("Running query:\n    %s" % query)

    extract_tstamp = datetime.datetime.utcnow()
    extract_tstamp = extract_tstamp.replace(tzinfo=datetime.timezone.utc)

    with metrics.record_counter(tap_stream_id) as counter:
        for row in query_job:
            record = {}
            for key in properties.keys():
                prop = properties[key]

                if key in [LEGACY_TIMESTAMP, EXTRACT_TIMESTAMP, BATCH_TIMESTAMP]:
                    continue

                if row[key] is None:
                    if prop.type[0] != "null":
                        raise ValueError("NULL value not allowed by the schema")
                    else:
                        record[key] = None
                elif prop.format == "date-time":
                    if type(row[key]) == str:
                        r = dateutil.parser.parse(row[key])
                    elif type(row[key]) == datetime.date:
                        r = datetime.datetime(year=row[key].year, month=row[key].month, day=row[key].day)
                    elif type(row[key]) == datetime.datetime:
                        r = row[key]
                    record[key] = r.isoformat()
                elif prop.type[1] == "string":
                    record[key] = str(row[key])
                elif prop.type[1] == "number":
                    record[key] = Decimal(row[key])
                elif prop.type[1] == "integer":
                    record[key] = int(row[key])
                else:
                    record[key] = row[key]

            if LEGACY_TIMESTAMP in properties.keys():
                record[LEGACY_TIMESTAMP] = int(round(time.time() * 1000))
            if EXTRACT_TIMESTAMP in properties.keys():
                record[EXTRACT_TIMESTAMP] = extract_tstamp.isoformat()

            singer.write_record(stream.stream, record)

            last_update = record[keys["datetime_key"]]
            counter.increment()

    state = singer.write_bookmark(state, tap_stream_id, BOOKMARK_KEY_NAME, last_update)
    singer.write_state(state)
def sync_query(cursor, catalog_entry, state, select_sql, columns, stream_version, params):
    replication_key = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'replication_key')

    query_string = cursor.mogrify(select_sql, params)

    time_extracted = utils.now()

    LOGGER.info('Running %s', query_string)
    cursor.execute(select_sql, params)

    row = cursor.fetchone()
    rows_saved = 0

    database_name = get_database_name(catalog_entry)

    with metrics.record_counter(None) as counter:
        counter.tags['database'] = database_name
        counter.tags['table'] = catalog_entry.table

        while row:
            counter.increment()
            rows_saved += 1
            record_message = row_to_singer_record(catalog_entry, stream_version, row, columns, time_extracted)
            singer.write_message(record_message)

            md_map = metadata.to_map(catalog_entry.metadata)
            stream_metadata = md_map.get((), {})
            replication_method = stream_metadata.get('replication-method')

            if replication_method in {'FULL_TABLE', 'LOG_BASED'}:
                key_properties = get_key_properties(catalog_entry)

                max_pk_values = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'max_pk_values')

                if max_pk_values:
                    last_pk_fetched = {k: v for k, v in record_message.record.items() if k in key_properties}

                    state = singer.write_bookmark(state, catalog_entry.tap_stream_id, 'last_pk_fetched', last_pk_fetched)

            elif replication_method == 'INCREMENTAL':
                if replication_key is not None:
                    state = singer.write_bookmark(state, catalog_entry.tap_stream_id, 'replication_key', replication_key)

                    state = singer.write_bookmark(state, catalog_entry.tap_stream_id, 'replication_key_value',
                                                  record_message.record[replication_key])

            if rows_saved % 1000 == 0:
                singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

            row = cursor.fetchone()

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
def sync_rows(config, state, tap_stream_id, key_properties=[], auth_method=None,
              max_page=None, assume_sorted=True, filter_by_schema=True, raw_output=False):
    """
    - max_page: Force sync to end after max_page. Mostly used for debugging.
    - assume_sorted: Trust the data to be presorted by the index/timestamp/datetime keys
      so it is safe to finish the replication once the last update index/timestamp/datetime
      passes the end.
    """
    schema = load_schema(config["schema_dir"], tap_stream_id)
    params = get_init_endpoint_params(config, state, tap_stream_id)
    bookmark_type = get_bookmark_type(config)
    start = get_start(config, state, tap_stream_id, "last_update")
    end = get_end(config)
    headers = get_http_headers(config)

    if start is None:
        LOGGER.warning("None of timestamp_key, datetime_key, and index_key" +
                       " are set in config. Bookmarking is not available.")

    start_str = human_readable(bookmark_type, start)
    end_str = human_readable(bookmark_type, end)
    # Log the conditions
    LOGGER.info("Stream %s has %s set starting %s and ending %s." % (tap_stream_id, bookmark_type, start_str, end_str))
    # I trust the URL format you set contains those params. The behavior depends
    # on the data source API's spec.
    # I will not filter out the records outside the boundary. Every record
    # received will be written out.

    LOGGER.info("assume_sorted is set to %s" % assume_sorted)
    # I trust the data to be sorted by the index/timestamp/datetime keys.
    # So it is safe to finish the replication once the last
    # update index/timestamp/datetime passes the end.
    # When in doubt, set this to False. Always perform post-replication dedup.

    LOGGER.info("filter_by_schema is set to %s." % filter_by_schema)
    # The fields undefined/not-conforming to the schema will be written out.

    LOGGER.info("auth_method is set to %s" % auth_method)

    # Initialize the counters
    last_update = start

    # Offset is the number of records (vs. page)
    offset_number = params.get("current_offset", 0)
    page_number = params.get("current_page", 0)

    # When we rely on index/datetime/timestamp to parse the next GET URL,
    # we will get the record we have already seen in the current process.
    # When we get last_record_extracted from the state file, we can also
    # compare with the previous process to further avoid duplicated
    # records in the target data store.
    prev_written_record = None
    last_record_extracted = singer.get_bookmark(state, tap_stream_id, "last_record_extracted")
    if last_record_extracted:
        prev_written_record = json.loads(last_record_extracted)

    # First, write out the schema
    if raw_output is False:
        singer.write_schema(tap_stream_id, schema, key_properties)

    # Fetch and iterate over to write the records
    with metrics.record_counter(tap_stream_id) as counter:
        while True:
            params.update({"current_page": page_number})
            params.update({"current_page_one_base": page_number + 1})
            params.update({"current_offset": offset_number})
            params.update({"last_update": last_update})

            endpoint = get_endpoint(config["url"], tap_stream_id, params)
            LOGGER.info("GET %s", endpoint)
            rows = generate_request(tap_stream_id, endpoint, auth_method, headers,
                                    config.get("username"), config.get("password"))
            rows = get_record_list(rows, config.get("record_list_level"))

            LOGGER.info("Current page %d" % page_number)
            LOGGER.info("Current offset %d" % offset_number)

            for row in rows:
                record = get_record(row, config.get("record_level"))
                if filter_by_schema:
                    record = filter_record(record, schema)

                # It's important to compare the record before adding EXTRACT_TIMESTAMP
                if record == prev_written_record:
                    LOGGER.debug("Skipping the duplicated row %s" % record)
                    continue

                if EXTRACT_TIMESTAMP in schema["properties"].keys():
                    extract_tstamp = datetime.datetime.utcnow()
                    extract_tstamp = extract_tstamp.replace(tzinfo=datetime.timezone.utc)
                    record[EXTRACT_TIMESTAMP] = extract_tstamp.isoformat()

                next_last_update = get_last_update(config, record, last_update)

                if not end or next_last_update < end:
                    if raw_output:
                        sys.stdout.write(json.dumps(record) + "\n")
                    else:
                        singer.write_record(tap_stream_id, record)

                    counter.increment()  # Increment only when we write
                    last_update = next_last_update

                    # prev_written_record may be persisted for the next run.
                    # EXTRACT_TIMESTAMP will be different, so pop it out before storing.
                    record.pop(EXTRACT_TIMESTAMP)
                    prev_written_record = record

            # Exit conditions
            if len(rows) < config["items_per_page"]:
                LOGGER.info(("Response is less than the configured items per page (%d)." +
                             " Finishing the extraction") % config["items_per_page"])
                break

            if max_page and page_number + 1 >= max_page:
                LOGGER.info("Max page %d reached. Finishing the extraction." % max_page)
                break

            if assume_sorted and end and next_last_update >= end:
                LOGGER.info(("Record greater than %s and assume_sorted is" +
                             " set. Finishing the extraction.") % end)
                break

            page_number += 1
            offset_number += len(rows)

    state = singer.write_bookmark(state, tap_stream_id, "last_update", last_update)
    if prev_written_record:
        state = singer.write_bookmark(state, tap_stream_id, "last_record_extracted", json.dumps(prev_written_record))

    if raw_output is False:
        singer.write_state(state)

    return state
def do_sync(sf, catalog, state):
    starting_stream = state.get("current_stream")

    if starting_stream:
        LOGGER.info("Resuming sync from %s", starting_stream)
    else:
        LOGGER.info("Starting sync")

    for catalog_entry in catalog["streams"]:
        stream_version = get_stream_version(catalog_entry, state)
        stream = catalog_entry['stream']
        stream_alias = catalog_entry.get('stream_alias')
        stream_name = catalog_entry["tap_stream_id"]
        activate_version_message = singer.ActivateVersionMessage(
            stream=(stream_alias or stream), version=stream_version)

        catalog_metadata = metadata.to_map(catalog_entry['metadata'])
        replication_key = catalog_metadata.get((), {}).get('replication-key')

        mdata = metadata.to_map(catalog_entry['metadata'])

        if not stream_is_selected(mdata):
            LOGGER.info("%s: Skipping - not selected", stream_name)
            continue

        if starting_stream:
            if starting_stream == stream_name:
                LOGGER.info("%s: Resuming", stream_name)
                starting_stream = None
            else:
                LOGGER.info("%s: Skipping - already synced", stream_name)
                continue
        else:
            LOGGER.info("%s: Starting", stream_name)

        state["current_stream"] = stream_name
        singer.write_state(state)
        key_properties = metadata.to_map(catalog_entry['metadata']).get((), {}).get('table-key-properties')
        singer.write_schema(stream, catalog_entry['schema'], key_properties, replication_key, stream_alias)

        job_id = singer.get_bookmark(state, catalog_entry['tap_stream_id'], 'JobID')
        if job_id:
            with metrics.record_counter(stream) as counter:
                LOGGER.info("Found JobID from previous Bulk Query. Resuming sync for job: %s", job_id)
                # Resuming a sync should clear out the remaining state once finished
                counter = resume_syncing_bulk_query(sf, catalog_entry, job_id, state, counter)
                LOGGER.info("%s: Completed sync (%s rows)", stream_name, counter.value)

                state.get('bookmarks', {}).get(catalog_entry['tap_stream_id'], {}).pop('JobID', None)
                state.get('bookmarks', {}).get(catalog_entry['tap_stream_id'], {}).pop('BatchIDs', None)
                bookmark = state.get('bookmarks', {}).get(catalog_entry['tap_stream_id'], {}).pop('JobHighestBookmarkSeen', None)
                state = singer.write_bookmark(state, catalog_entry['tap_stream_id'], replication_key, bookmark)
                singer.write_state(state)
        else:
            # Tables with a replication_key or an empty bookmark will emit an
            # activate_version at the beginning of their sync
            bookmark_is_empty = state.get('bookmarks', {}).get(catalog_entry['tap_stream_id']) is None

            if replication_key or bookmark_is_empty:
                singer.write_message(activate_version_message)
                state = singer.write_bookmark(state, catalog_entry['tap_stream_id'], 'version', stream_version)

            counter = sync_stream(sf, catalog_entry, state)
            LOGGER.info("%s: Completed sync (%s rows)", stream_name, counter.value)

    state["current_stream"] = None
    singer.write_state(state)
    LOGGER.info("Finished sync")
def sync_tables(conn_info, logical_streams, state, end_lsn, state_file):
    lsn_comitted = min([get_bookmark(state, s['tap_stream_id'], 'lsn') for s in logical_streams])
    start_lsn = lsn_comitted
    lsn_to_flush = None
    time_extracted = utils.now()
    slot = locate_replication_slot(conn_info)
    lsn_last_processed = None
    lsn_currently_processing = None
    lsn_received_timestamp = None
    lsn_processed_count = 0
    logical_poll_total_seconds = conn_info['logical_poll_total_seconds'] or 300
    poll_interval = 10
    poll_timestamp = None

    selected_tables = []
    for s in logical_streams:
        selected_tables.append("{}.{}".format(s['metadata'][0]['metadata']['schema-name'], s['table_name']))

    for s in logical_streams:
        sync_common.send_schema_message(s, ['lsn'])

    # Create replication connection and cursor
    conn = post_db.open_connection(conn_info, True)
    cur = conn.cursor()

    try:
        LOGGER.info("{} : Starting log streaming at {} to {} (slot {})".format(
            datetime.datetime.utcnow(), int_to_lsn(start_lsn), int_to_lsn(end_lsn), slot))
        cur.start_replication(slot_name=slot, decode=True, start_lsn=start_lsn,
                              options={'write-in-chunks': 1, 'add-tables': ','.join(selected_tables)})
    except psycopg2.ProgrammingError:
        raise Exception("Unable to start replication with logical replication (slot {})".format(slot))

    # Emulate some behaviour of pg_recvlogical
    LOGGER.info("{} : Confirming write up to 0/0, flush to 0/0".format(datetime.datetime.utcnow()))
    cur.send_feedback(write_lsn=0, flush_lsn=0, reply=True)
    time.sleep(1)

    lsn_received_timestamp = datetime.datetime.utcnow()
    poll_timestamp = datetime.datetime.utcnow()

    while True:
        # Disconnect when no data received for logical_poll_total_seconds
        # needs to be long enough to wait for the largest single wal payload to avoid unplanned timeouts
        poll_duration = (datetime.datetime.utcnow() - lsn_received_timestamp).total_seconds()
        if poll_duration > logical_poll_total_seconds:
            LOGGER.info("{} : Breaking - {} seconds of polling with no data".format(
                datetime.datetime.utcnow(), poll_duration))
            break

        try:
            msg = cur.read_message()
        except Exception as e:
            LOGGER.error("{} : {}".format(datetime.datetime.utcnow(), e))
            raise

        if msg:
            if msg.data_start > end_lsn:
                LOGGER.info("{} : Breaking - current {} is past end_lsn {}".format(
                    datetime.datetime.utcnow(), int_to_lsn(msg.data_start), int_to_lsn(end_lsn)))
                break

            state = consume_message(logical_streams, state, msg, time_extracted, conn_info, end_lsn)

            # When using wal2json with write-in-chunks, multiple messages can have the same lsn
            # This is to ensure we only flush to an lsn that has completed entirely
            if lsn_currently_processing is None:
                lsn_currently_processing = msg.data_start
                LOGGER.info("{} : First message received is {} at {}".format(
                    datetime.datetime.utcnow(), int_to_lsn(lsn_currently_processing), datetime.datetime.utcnow()))

                # Flush Postgres wal up to lsn comitted in previous run, or first lsn received in this run
                lsn_to_flush = lsn_comitted
                if lsn_currently_processing < lsn_to_flush:
                    lsn_to_flush = lsn_currently_processing
                LOGGER.info("{} : Confirming write up to {}, flush to {}".format(
                    datetime.datetime.utcnow(), int_to_lsn(lsn_to_flush), int_to_lsn(lsn_to_flush)))
                cur.send_feedback(write_lsn=lsn_to_flush, flush_lsn=lsn_to_flush, reply=True)

            elif int(msg.data_start) > lsn_currently_processing:
                lsn_last_processed = lsn_currently_processing
                lsn_currently_processing = msg.data_start
                lsn_received_timestamp = datetime.datetime.utcnow()
                lsn_processed_count = lsn_processed_count + 1
                if lsn_processed_count >= UPDATE_BOOKMARK_PERIOD:
                    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
                    lsn_processed_count = 0

        # When data is received, and when data is not received, a keep-alive poll needs to be returned to PostgreSQL
        if datetime.datetime.utcnow() >= (poll_timestamp + datetime.timedelta(seconds=poll_interval)):
            if lsn_currently_processing is None:
                LOGGER.info("{} : Sending keep-alive message to source server (last message received was {} at {})".format(
                    datetime.datetime.utcnow(), int_to_lsn(lsn_last_processed), lsn_received_timestamp))
                cur.send_feedback()
            elif state_file is None:
                LOGGER.info("{} : Sending keep-alive message to source server (last message received was {} at {})".format(
                    datetime.datetime.utcnow(), int_to_lsn(lsn_last_processed), lsn_received_timestamp))
                cur.send_feedback()
            else:
                # Read lsn_comitted currently captured in state file on disk
                lsn_comitted = min([get_bookmark(utils.load_json(state_file), s['tap_stream_id'], 'lsn')
                                    for s in logical_streams])
                lsn_to_flush = lsn_comitted
                if lsn_currently_processing < lsn_to_flush:
                    lsn_to_flush = lsn_currently_processing
                LOGGER.info("{} : Confirming write up to {}, flush to {} (last message received was {} at {})".format(
                    datetime.datetime.utcnow(), int_to_lsn(lsn_to_flush), int_to_lsn(lsn_to_flush),
                    int_to_lsn(lsn_last_processed), lsn_received_timestamp))
                cur.send_feedback(write_lsn=lsn_to_flush, flush_lsn=lsn_to_flush, reply=True)

            poll_timestamp = datetime.datetime.utcnow()

    # Close replication connection and cursor
    cur.close()
    conn.close()

    if lsn_last_processed:
        if lsn_comitted > lsn_last_processed:
            lsn_last_processed = lsn_comitted
            LOGGER.info("Current lsn_last_processed {} is older than lsn_comitted {}".format(
                int_to_lsn(lsn_last_processed), int_to_lsn(lsn_comitted)))

        for s in logical_streams:
            LOGGER.info("updating bookmark for stream {} to lsn = {} ({})".format(
                s['tap_stream_id'], lsn_last_processed, int_to_lsn(lsn_last_processed)))
            state = singer.write_bookmark(state, s['tap_stream_id'], 'lsn', lsn_last_processed)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
    return state
def sync_report(client, account_id, report_stream):
    report_name = stringcase.pascalcase(report_stream.stream)

    report_schema = get_report_schema(client, report_name)
    singer.write_schema(report_stream.stream, report_schema, [])

    state_key = '{}_{}'.format(account_id, report_stream.stream)
    config_start_date = CONFIG.get('start_date')
    bookmark = singer.get_bookmark(STATE, state_key, 'date')
    conversion_window = int(CONFIG.get('conversion_window', '-30'))
    start_date = arrow.get(bookmark or config_start_date).shift(days=conversion_window)
    end_date = arrow.get(CONFIG.get('end_date'))  # defaults to now

    LOGGER.info('Syncing report: {} - from {} to {}'.format(report_name, start_date, end_date))

    report_request = client.factory.create('{}Request'.format(report_name))
    report_request.Format = 'Csv'
    report_request.Aggregation = 'Daily'
    report_request.Language = 'English'
    report_request.ExcludeReportHeader = True
    report_request.ExcludeReportFooter = True

    scope = client.factory.create('AccountThroughAdGroupReportScope')
    scope.AccountIds = {'long': [account_id]}
    report_request.Scope = scope

    excluded_fields = ['GregorianDate', '_sdc_report_datetime']

    if report_name in reports.EXTRA_FIELDS:
        excluded_fields += reports.EXTRA_FIELDS[report_name]

    selected_fields = get_selected_fields(report_stream, exclude=excluded_fields)
    selected_fields.append('TimePeriod')

    report_columns = client.factory.create('ArrayOf{}Column'.format(report_name))
    getattr(report_columns, '{}Column'.format(report_name)).append(selected_fields)
    report_request.Columns = report_columns

    request_start_date = client.factory.create('Date')
    request_start_date.Day = start_date.day
    request_start_date.Month = start_date.month
    request_start_date.Year = start_date.year

    request_end_date = client.factory.create('Date')
    request_end_date.Day = end_date.day
    request_end_date.Month = end_date.month
    request_end_date.Year = end_date.year

    report_time = client.factory.create('ReportTime')
    report_time.CustomDateRangeStart = request_start_date
    report_time.CustomDateRangeEnd = request_end_date
    report_time.PredefinedTime = None
    report_request.Time = report_time

    report_time = arrow.get().isoformat()

    request_id = client.SubmitGenerateReport(report_request)

    download_url = poll_report(client, report_name, start_date, end_date, request_id)

    if download_url:
        stream_report(report_stream.stream, report_name, download_url, report_time)

    singer.write_bookmark(STATE, state_key, 'date', end_date.isoformat())
    singer.write_state(STATE)
def do_sync(sf, catalog, state):
    starting_stream = state.get("current_stream")

    if starting_stream:
        LOGGER.info("Resuming sync from %s", starting_stream)
    else:
        LOGGER.info("Starting sync")

    for catalog_entry in catalog["streams"]:
        stream_version = get_stream_version(catalog_entry, state)
        stream = catalog_entry['stream']
        stream_alias = catalog_entry.get('stream_alias')
        stream_name = catalog_entry["tap_stream_id"]
        activate_version_message = singer.ActivateVersionMessage(
            stream=(stream_alias or stream), version=stream_version)

        catalog_metadata = metadata.to_map(catalog_entry['metadata'])
        replication_key = catalog_metadata.get((), {}).get('replication-key')

        mdata = metadata.to_map(catalog_entry['metadata'])

        if not stream_is_selected(mdata):
            LOGGER.info("%s: Skipping - not selected", stream_name)
            continue

        if starting_stream:
            if starting_stream == stream_name:
                LOGGER.info("%s: Resuming", stream_name)
                starting_stream = None
            else:
                LOGGER.info("%s: Skipping - already synced", stream_name)
                continue
        else:
            LOGGER.info("%s: Starting", stream_name)

        state["current_stream"] = stream_name
        singer.write_state(state)
        key_properties = metadata.to_map(catalog_entry['metadata']).get((), {}).get('table-key-properties')
        singer.write_schema(stream, catalog_entry['schema'], key_properties, replication_key, stream_alias)

        job_id = singer.get_bookmark(state, catalog_entry['tap_stream_id'], 'JobID')
        batch_ids = singer.get_bookmark(state, catalog_entry['tap_stream_id'], 'BatchIDs')

        # Check that both a JobID and a non-empty list of batch IDs are present
        if job_id and batch_ids:
            with metrics.record_counter(stream) as counter:
                LOGGER.info("Found JobID from previous Bulk Query. Resuming sync for job: %s", job_id)
                # Resuming a sync should clear out the remaining state once finished
                counter = resume_syncing_bulk_query(sf, catalog_entry, job_id, state, counter)
                LOGGER.info("%s: Completed sync (%s rows)", stream_name, counter.value)

                # Remove Job info from state once we complete this resumed query. One of a few cases could have occurred:
                # 1. The job succeeded, in which case make JobHighestBookmarkSeen the new bookmark
                # 2. The job partially completed, in which case make JobHighestBookmarkSeen the new bookmark, or
                #    existing bookmark if no bookmark exists for the Job.
                # 3. The job completely failed, in which case maintain the existing bookmark, or None if no bookmark
                state.get('bookmarks', {}).get(catalog_entry['tap_stream_id'], {}).pop('JobID', None)
                state.get('bookmarks', {}).get(catalog_entry['tap_stream_id'], {}).pop('BatchIDs', None)
                bookmark = state.get('bookmarks', {}).get(catalog_entry['tap_stream_id'], {}) \
                                .pop('JobHighestBookmarkSeen', None)
                existing_bookmark = state.get('bookmarks', {}).get(catalog_entry['tap_stream_id'], {}) \
                                         .pop(replication_key, None)
                state = singer.write_bookmark(
                    state,
                    catalog_entry['tap_stream_id'],
                    replication_key,
                    bookmark or existing_bookmark)  # If job is removed, reset to existing bookmark or None
                singer.write_state(state)
        else:
            # Tables with a replication_key or an empty bookmark will emit an
            # activate_version at the beginning of their sync
            bookmark_is_empty = state.get('bookmarks', {}).get(catalog_entry['tap_stream_id']) is None

            if replication_key or bookmark_is_empty:
                singer.write_message(activate_version_message)
                state = singer.write_bookmark(state, catalog_entry['tap_stream_id'], 'version', stream_version)

            counter = sync_stream(sf, catalog_entry, state)
            LOGGER.info("%s: Completed sync (%s rows)", stream_name, counter.value)

    state["current_stream"] = None
    singer.write_state(state)
    LOGGER.info("Finished sync")
def sync_table(conn_config, stream, state, desired_columns):
    connection = orc_db.open_connection(conn_config)
    connection.outputtypehandler = common.OutputTypeHandler

    cur = connection.cursor()
    cur.execute("ALTER SESSION SET TIME_ZONE = '00:00'")
    cur.execute("""ALTER SESSION SET NLS_DATE_FORMAT = 'YYYY-MM-DD"T"HH24:MI:SS."00+00:00"'""")
    cur.execute("""ALTER SESSION SET NLS_TIMESTAMP_FORMAT='YYYY-MM-DD"T"HH24:MI:SSXFF"+00:00"'""")
    cur.execute("""ALTER SESSION SET NLS_TIMESTAMP_TZ_FORMAT = 'YYYY-MM-DD"T"HH24:MI:SS.FFTZH:TZM'""")
    time_extracted = utils.now()

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream.tap_stream_id, 'version') is None

    # pick a new table version IFF we do not have an ORA_ROWSCN in our state
    # the presence of an ORA_ROWSCN indicates that we were interrupted last time through
    if singer.get_bookmark(state, stream.tap_stream_id, 'ORA_ROWSCN') is None:
        nascent_stream_version = int(time.time() * 1000)
    else:
        nascent_stream_version = singer.get_bookmark(state, stream.tap_stream_id, 'version')

    state = singer.write_bookmark(state, stream.tap_stream_id, 'version', nascent_stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    # cur = connection.cursor()
    md = metadata.to_map(stream.metadata)
    schema_name = md.get(()).get('schema-name')

    escaped_columns = map(lambda c: common.prepare_columns_sql(stream, c), desired_columns)
    escaped_schema = schema_name
    escaped_table = stream.table
    activate_version_message = singer.ActivateVersionMessage(
        stream=stream.stream,
        version=nascent_stream_version)

    if first_run:
        singer.write_message(activate_version_message)

    with metrics.record_counter(None) as counter:
        ora_rowscn = singer.get_bookmark(state, stream.tap_stream_id, 'ORA_ROWSCN')
        if ora_rowscn:
            LOGGER.info("Resuming Full Table replication %s from ORA_ROWSCN %s", nascent_stream_version, ora_rowscn)
            select_sql = """SELECT {}, ORA_ROWSCN
                              FROM {}.{}
                             WHERE ORA_ROWSCN >= {}
                             ORDER BY ORA_ROWSCN ASC""".format(','.join(escaped_columns),
                                                               escaped_schema, escaped_table, ora_rowscn)
        else:
            select_sql = """SELECT {}, ORA_ROWSCN
                              FROM {}.{}
                             ORDER BY ORA_ROWSCN ASC""".format(','.join(escaped_columns),
                                                               escaped_schema, escaped_table)

        rows_saved = 0
        LOGGER.info("select %s", select_sql)
        for row in cur.execute(select_sql):
            ora_rowscn = row[-1]
            row = row[:-1]
            record_message = common.row_to_singer_message(stream, row, nascent_stream_version,
                                                          desired_columns, time_extracted)

            singer.write_message(record_message)
            state = singer.write_bookmark(state, stream.tap_stream_id, 'ORA_ROWSCN', ora_rowscn)
            rows_saved = rows_saved + 1
            if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

            counter.increment()

    state = singer.write_bookmark(state, stream.tap_stream_id, 'ORA_ROWSCN', None)

    # always send the activate version whether first run or subsequent
    singer.write_message(activate_version_message)
    cur.close()
    connection.close()
    return state
def sync_table(client, stream, state, stream_version, blacklist):
    common.whitelist_bookmark_keys(generate_bookmark_keys(stream), stream['tap_stream_id'], state)

    mdata = metadata.to_map(stream['metadata'])
    stream_metadata = mdata.get(())

    database_name = stream_metadata['database-name']

    db = client[database_name]
    collection = db[stream['stream']]

    activate_version_message = singer.ActivateVersionMessage(
        stream=stream['stream'],
        version=stream_version)

    initial_full_table_complete = singer.get_bookmark(state, stream['tap_stream_id'], 'initial_full_table_complete')

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if not initial_full_table_complete:
        singer.write_message(activate_version_message)

    max_id_value = singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_value') or get_max_id_value(collection)

    last_id_fetched = singer.get_bookmark(state, stream['tap_stream_id'], 'last_id_fetched')

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'max_id_value', max_id_value)

    find_filter = {'$lte': objectid.ObjectId(max_id_value)}
    if last_id_fetched:
        find_filter['$gt'] = objectid.ObjectId(last_id_fetched)

    LOGGER.info("Starting full table replication for table {}.{}".format(database_name, stream['stream']))

    with metrics.record_counter(None) as counter:
        with collection.find({'_id': find_filter}, sort=[("_id", pymongo.DESCENDING)]) as cursor:
            rows_saved = 0
            time_extracted = utils.now()

            while cursor.alive:
                try:
                    row = next(cursor)
                    rows_saved += 1

                    whitelisted_row = {k: v for k, v in row.items() if k not in blacklist}
                    record_message = common.row_to_singer_record(stream, whitelisted_row,
                                                                 stream_version, time_extracted)

                    singer.write_message(record_message)

                    state = singer.write_bookmark(state, stream['tap_stream_id'], 'last_id_fetched', str(row['_id']))

                    if rows_saved % 1000 == 0:
                        singer.write_state(state)
                except InvalidBSON as e:
                    LOGGER.info(e)
                    continue

    # clear max pk value and last pk fetched upon successful sync
    singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_value')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'last_id_fetched')

    singer.write_message(activate_version_message)
    singer.write_state(state)
def get_bookmark(name):
    bookmark = singer.get_bookmark(Context.state, name, 'start_date')
    if bookmark is None:
        bookmark = Context.config['start_date']
    return bookmark
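# A small illustrative sketch (not taken from any particular tap) of the round trip the
# helper above supports: read the bookmark with a start_date fallback, emit records, then
# write the new value back and emit STATE. `fetch_since` and the `updated_at` field are
# hypothetical; the singer.* calls are standard singer-python helpers.
import singer


def sync_with_bookmark(state, config, stream_name, fetch_since):
    bookmark = singer.get_bookmark(state, stream_name, 'start_date') or config['start_date']
    max_seen = bookmark

    for rec in fetch_since(bookmark):
        singer.write_record(stream_name, rec)
        if rec['updated_at'] > max_seen:
            max_seen = rec['updated_at']

    # persist the highest replication value seen, then emit STATE
    state = singer.write_bookmark(state, stream_name, 'start_date', max_seen)
    singer.write_state(state)
    return state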
def sync_binlog_stream(mysql_conn, config, binlog_streams, state):
    binlog_streams_map = generate_streams_map(binlog_streams)

    for tap_stream_id in binlog_streams_map.keys():
        common.whitelist_bookmark_keys(BOOKMARK_KEYS, tap_stream_id, state)

    log_file, log_pos = calculate_bookmark(mysql_conn, binlog_streams_map, state)

    verify_log_file_exists(mysql_conn, log_file, log_pos)

    server_id = fetch_server_id(mysql_conn)

    connection_wrapper = make_connection_wrapper(config)

    reader = BinLogStreamReader(
        connection_settings={},
        server_id=server_id,
        log_file=log_file,
        log_pos=log_pos,
        resume_stream=True,
        only_events=[RotateEvent, WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent],
        pymysql_wrapper=connection_wrapper)

    time_extracted = utils.now()

    LOGGER.info("Starting binlog replication with log_file=%s, log_pos=%s", log_file, log_pos)

    rows_saved = 0
    events_skipped = 0

    current_log_file, current_log_pos = fetch_current_log_file_and_pos(mysql_conn)

    for binlog_event in reader:
        if isinstance(binlog_event, RotateEvent):
            state = update_bookmarks(state, binlog_streams_map, binlog_event.next_binlog, binlog_event.position)
        else:
            tap_stream_id = common.generate_tap_stream_id(binlog_event.schema, binlog_event.table)
            streams_map_entry = binlog_streams_map.get(tap_stream_id, {})
            catalog_entry = streams_map_entry.get('catalog_entry')
            desired_columns = streams_map_entry.get('desired_columns')

            if not catalog_entry:
                events_skipped = events_skipped + 1

                if events_skipped % UPDATE_BOOKMARK_PERIOD == 0:
                    LOGGER.info("Skipped %s events so far as they were not for selected tables; %s rows extracted",
                                events_skipped, rows_saved)
            elif catalog_entry:
                initial_binlog_complete = singer.get_bookmark(state, catalog_entry.tap_stream_id,
                                                              'initial_binlog_complete')

                if (initial_binlog_complete and
                        reader.log_file == log_file and
                        reader.log_pos == log_pos):
                    LOGGER.info("Skipping event for stream(%s) log_file=%s and log_pos=%s as it was processed last sync",
                                catalog_entry.tap_stream_id, reader.log_file, reader.log_pos)
                    continue

                if isinstance(binlog_event, WriteRowsEvent):
                    rows_saved = handle_write_rows_event(binlog_event, catalog_entry, state,
                                                         desired_columns, rows_saved, time_extracted)
                elif isinstance(binlog_event, UpdateRowsEvent):
                    rows_saved = handle_update_rows_event(binlog_event, catalog_entry, state,
                                                          desired_columns, rows_saved, time_extracted)
                elif isinstance(binlog_event, DeleteRowsEvent):
                    rows_saved = handle_delete_rows_event(binlog_event, catalog_entry, state,
                                                          desired_columns, rows_saved, time_extracted)
                else:
                    LOGGER.info("Skipping event for table %s.%s as it is not an INSERT, UPDATE, or DELETE",
                                binlog_event.schema, binlog_event.table)

        state = update_bookmarks(state, binlog_streams_map, reader.log_file, reader.log_pos)

        # The iterator across python-mysql-replication's fetchone method should ultimately terminate
        # upon receiving an EOF packet. There seem to be some cases when a MySQL server will not send
        # one causing binlog replication to hang.
        if current_log_file == reader.log_file and reader.log_pos >= current_log_pos:
            break

        if ((rows_saved and rows_saved % UPDATE_BOOKMARK_PERIOD == 0) or
                (events_skipped and events_skipped % UPDATE_BOOKMARK_PERIOD == 0)):
            singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    state = update_initial_binlog_complete(binlog_streams_map, state)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
def sync_event_updates(stream_name):
    '''
    Get updates via the events endpoint

    look at the 'events update' bookmark and pull events after that
    '''
    LOGGER.info("Started syncing event based updates")

    date_window_size = 60 * 60 * 24  # Seconds in a day

    bookmark_value = singer.get_bookmark(Context.state, stream_name + '_events', 'updates_created') or \
        int(utils.strptime_to_utc(Context.config["start_date"]).timestamp())
    max_created = bookmark_value
    date_window_start = max_created
    date_window_end = max_created + date_window_size

    stop_paging = False

    # Create a map to relate event object ids to timestamps
    updated_object_timestamps = {}

    while not stop_paging:
        extraction_time = singer.utils.now()

        response = STREAM_SDK_OBJECTS['events']['sdk_object'].list(**{
            "limit": 100,
            "type": STREAM_TO_TYPE_FILTER[stream_name]['type'],
            "stripe_account": Context.config.get('account_id'),
            # None passed to starting_after appears to retrieve
            # all of them so this should always be safe.
            "created[gte]": date_window_start,
            "created[lt]": date_window_end,
        })

        # If no results, and we are not up to current time
        if not len(response) and date_window_end > extraction_time.timestamp():  # pylint: disable=len-as-condition
            stop_paging = True

        for events_obj in response.auto_paging_iter():
            event_resource_obj = events_obj.data.object
            sub_stream_name = SUB_STREAMS.get(stream_name)

            # Check whether we should sync the event based on its created time
            if not should_sync_event(events_obj, STREAM_TO_TYPE_FILTER[stream_name]['object'],
                                     updated_object_timestamps):
                continue

            # Sync the event since it's either the first time we've seen it or it's the most recent version
            with Transformer(singer.UNIX_SECONDS_INTEGER_DATETIME_PARSING) as transformer:
                event_resource_metadata = metadata.to_map(Context.get_catalog_entry(stream_name)['metadata'])

                # Filter out line items with null ids
                if isinstance(events_obj.get('data').get('object'), stripe.Invoice):
                    invoice_obj = events_obj.get('data', {}).get('object', {})
                    line_items = invoice_obj.get('lines', {}).get('data')

                    if line_items:
                        filtered_line_items = [line_item for line_item in line_items if line_item.get('id')]
                        invoice_obj['lines']['data'] = filtered_line_items

                rec = recursive_to_dict(event_resource_obj)
                rec = unwrap_data_objects(rec)
                rec = reduce_foreign_keys(rec, stream_name)
                rec["updated"] = events_obj.created
                rec = transformer.transform(rec,
                                            Context.get_catalog_entry(stream_name)['schema'],
                                            event_resource_metadata)

                if events_obj.created >= bookmark_value:
                    if rec.get('id') is not None:
                        singer.write_record(stream_name, rec, time_extracted=extraction_time)
                        Context.updated_counts[stream_name] += 1

                        # Delete events should be synced but not their subobjects
                        if events_obj.get('type', '').endswith('.deleted'):
                            continue

                        if sub_stream_name and Context.is_selected(sub_stream_name):
                            if event_resource_obj:
                                sync_sub_stream(sub_stream_name, event_resource_obj, updates=True)

                    if events_obj.created > max_created:
                        max_created = events_obj.created

        # The events stream returns results in descending order, so we
        # cannot bookmark until the entire page is processed
        date_window_start = date_window_end
        date_window_end = date_window_end + date_window_size
        singer.write_bookmark(Context.state, stream_name + '_events', 'updates_created', max_created)
        singer.write_state(Context.state)

    singer.write_state(Context.state)
def sync_table(config, mysql_conn, catalog_entry, state, columns, stream_version):
    common.whitelist_bookmark_keys(generate_bookmark_keys(catalog_entry), catalog_entry.tap_stream_id, state)

    bookmark = state.get('bookmarks', {}).get(catalog_entry.tap_stream_id, {})
    version_exists = 'version' in bookmark

    initial_full_table_complete = singer.get_bookmark(state, catalog_entry.tap_stream_id,
                                                      'initial_full_table_complete')

    state_version = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'version')

    activate_version_message = singer.ActivateVersionMessage(
        stream=catalog_entry.stream,
        version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if not initial_full_table_complete and not (version_exists and state_version is None):
        singer.write_message(activate_version_message)

    key_props_are_auto_incrementing = pks_are_auto_incrementing(mysql_conn, catalog_entry)

    with connect_with_backoff(mysql_conn) as open_conn:
        with open_conn.cursor() as cur:
            select_sql = common.generate_select_sql(catalog_entry, columns)

            if key_props_are_auto_incrementing:
                LOGGER.info("Detected auto-incrementing primary key(s) - will replicate incrementally")
                max_pk_values = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'max_pk_values') or \
                    get_max_pk_values(cur, catalog_entry)

                if not max_pk_values:
                    LOGGER.info("No max value for auto-incrementing PK found for table %s", catalog_entry.table)
                else:
                    state = singer.write_bookmark(state, catalog_entry.tap_stream_id, 'max_pk_values', max_pk_values)

                    pk_clause = generate_pk_clause(catalog_entry, state)

                    select_sql += pk_clause

            params = {}

            common.sync_query(config, cur, catalog_entry, state, select_sql, columns, stream_version, params)

    # clear max pk value and last pk fetched upon successful sync
    singer.clear_bookmark(state, catalog_entry.tap_stream_id, 'max_pk_values')
    singer.clear_bookmark(state, catalog_entry.tap_stream_id, 'last_pk_fetched')

    singer.write_message(activate_version_message)
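# Hedged sketch of why sync_table above stores `max_pk_values` and `last_pk_fetched`:
# bounding a full-table scan by primary key makes an interrupted sync resumable from the
# last emitted row instead of starting over. This is an illustration under assumed state
# shapes, not the tap's actual clause builder; table and column names are hypothetical.
import singer


def build_resumable_select(state, tap_stream_id, table, pk):
    max_pk = singer.get_bookmark(state, tap_stream_id, 'max_pk_values') or {}
    last_pk = singer.get_bookmark(state, tap_stream_id, 'last_pk_fetched') or {}

    sql = "SELECT * FROM {}".format(table)
    clauses = []
    if last_pk.get(pk) is not None:
        clauses.append("{} > {}".format(pk, last_pk[pk]))   # resume after the last row emitted
    if max_pk.get(pk) is not None:
        clauses.append("{} <= {}".format(pk, max_pk[pk]))   # never read past the snapshot upper bound
    if clauses:
        sql += " WHERE " + " AND ".join(clauses)
    return sql + " ORDER BY {} ASC".format(pk)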
def get_bookmark(self, state):
    return singer.get_bookmark(state, self.name, self.replication_key)
def get_start(STATE, tap_stream_id, bookmark_key):
    current_bookmark = singer.get_bookmark(STATE, tap_stream_id, bookmark_key)
    if current_bookmark is None:
        return CONFIG["start_date"]
    return current_bookmark
def sync_table(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    stream_version = singer.get_bookmark(state, stream["tap_stream_id"], "version")
    if stream_version is None:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream["tap_stream_id"], "version", stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get("schema-name")

    escaped_columns = map(post_db.prepare_columns_sql, desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=post_db.calculate_destination_stream_name(stream, md_map),
        version=stream_version)

    singer.write_message(activate_version_message)

    replication_key = md_map.get((), {}).get("replication-key")
    replication_key_value = singer.get_bookmark(state, stream["tap_stream_id"], "replication_key_value")
    replication_key_sql_datatype = md_map.get(("properties", replication_key)).get("sql-datatype")

    hstore_available = post_db.hstore_available(conn_info)
    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:
            # Client side character encoding defaults to client_encoding in postgresql.conf.
            # The server / db can also have its own configured encoding.
            with conn.cursor() as cur:
                cur.execute("show server_encoding")
                LOGGER.info("Current Server Encoding: %s", cur.fetchone()[0])
                cur.execute("show client_encoding")
                LOGGER.info("Current Client Encoding: %s", cur.fetchone()[0])

            if hstore_available:
                LOGGER.info("hstore is available")
                psycopg2.extras.register_hstore(conn)
            else:
                LOGGER.info("hstore is UNavailable")

            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor, name="stitch_cursor") as cur:
                cur.itersize = post_db.cursor_iter_size
                LOGGER.info("Beginning new incremental replication sync %s", stream_version)
                if replication_key_value:
                    select_sql = """SELECT {}
                                      FROM {}
                                     WHERE {} > '{}'::{}
                                     ORDER BY {} ASC""".format(
                        ",".join(escaped_columns),
                        post_db.fully_qualified_table_name(schema_name, stream["table_name"]),
                        post_db.prepare_columns_sql(replication_key),
                        replication_key_value,
                        replication_key_sql_datatype,
                        post_db.prepare_columns_sql(replication_key))
                else:
                    # if there is no replication_key_value, sync the whole table
                    select_sql = """SELECT {}
                                      FROM {}
                                     ORDER BY {} ASC""".format(
                        ",".join(escaped_columns),
                        post_db.fully_qualified_table_name(schema_name, stream["table_name"]),
                        post_db.prepare_columns_sql(replication_key))

                LOGGER.info("select statement: %s with itersize %s", select_sql, cur.itersize)
                cur.execute(select_sql)
                LOGGER.info("Query returned - processing results")

                rows_saved = 0
                for rec in cur:
                    record_message = post_db.selected_row_to_singer_message(stream, rec, stream_version,
                                                                            desired_columns, time_extracted, md_map)
                    singer.write_message(record_message)
                    rows_saved = rows_saved + 1

                    # Picking a replication_key with NULL values will result in it ALWAYS being
                    # synced which is not great. Even worse would be allowing the NULL value to
                    # enter into the state.
                    if record_message.record[replication_key] is not None:
                        state = singer.write_bookmark(state, stream["tap_stream_id"], "replication_key_value",
                                                      record_message.record[replication_key])

                    if rows_saved % conn_info["emit_state_every_n_rows"] == 0:
                        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

                    counter.increment()

    LOGGER.info("Incremental table tap complete")
    return state
def sync_table(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream["tap_stream_id"], "version") is None

    # pick a new table version IFF we do not have an xmin in our state
    # the presence of an xmin indicates that we were interrupted last time through
    if singer.get_bookmark(state, stream["tap_stream_id"], "xmin") is None:
        nascent_stream_version = int(time.time() * 1000)
    else:
        nascent_stream_version = singer.get_bookmark(state, stream["tap_stream_id"], "version")

    state = singer.write_bookmark(state, stream["tap_stream_id"], "version", nascent_stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get("schema-name")

    escaped_columns = map(post_db.prepare_columns_sql, desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=post_db.calculate_destination_stream_name(stream, md_map),
        version=nascent_stream_version)

    if first_run:
        singer.write_message(activate_version_message)

    hstore_available = post_db.hstore_available(conn_info)
    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:
            # Client side character encoding defaults to client_encoding in postgresql.conf.
            # The server / db can also have its own configured encoding.
            with conn.cursor() as cur:
                cur.execute("show server_encoding")
                LOGGER.info("Current Server Encoding: %s", cur.fetchone()[0])
                cur.execute("show client_encoding")
                LOGGER.info("Current Client Encoding: %s", cur.fetchone()[0])

            if hstore_available:
                LOGGER.info("hstore is available")
                psycopg2.extras.register_hstore(conn)
            else:
                LOGGER.info("hstore is UNavailable")

            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor, name="stitch_cursor") as cur:
                cur.itersize = post_db.cursor_iter_size

                xmin = singer.get_bookmark(state, stream["tap_stream_id"], "xmin")
                if xmin:
                    LOGGER.info("Resuming Full Table replication %s from xmin %s", nascent_stream_version, xmin)
                    select_sql = """SELECT {}, xmin::text::bigint
                                      FROM {}
                                     WHERE age(xmin::xid) <= age('{}'::xid)
                                     ORDER BY xmin::text::bigint ASC""".format(
                        ",".join(escaped_columns),
                        post_db.fully_qualified_table_name(schema_name, stream["table_name"]),
                        xmin)
                else:
                    LOGGER.info("Beginning new Full Table replication %s", nascent_stream_version)
                    select_sql = """SELECT {}, xmin::text::bigint
                                      FROM {}
                                     ORDER BY xmin::text::bigint ASC""".format(
                        ",".join(escaped_columns),
                        post_db.fully_qualified_table_name(schema_name, stream["table_name"]))

                LOGGER.info("select %s with itersize %s", select_sql, cur.itersize)
                cur.execute(select_sql)
                LOGGER.info("Query returned - processing results")

                rows_saved = 0
                for rec in cur:
                    xmin = rec["xmin"]
                    rec = rec[:-1]
                    record_message = post_db.selected_row_to_singer_message(stream, rec, nascent_stream_version,
                                                                            desired_columns, time_extracted, md_map)
                    singer.write_message(record_message)
                    state = singer.write_bookmark(state, stream["tap_stream_id"], "xmin", xmin)
                    rows_saved = rows_saved + 1
                    if rows_saved % conn_info["emit_state_every_n_rows"] == 0:
                        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

                    counter.increment()

                LOGGER.info("Processing complete - saved {} rows".format(rows_saved))

    # once we have completed the full table replication, discard the xmin bookmark.
    # the xmin bookmark only comes into play when a full table replication is interrupted
    LOGGER.info("Writing bookmark")
    state = singer.write_bookmark(state, stream["tap_stream_id"], "xmin", None)

    # always send the activate version whether first run or subsequent
    LOGGER.info("Sending activate version message")
    singer.write_message(activate_version_message)

    LOGGER.info("Full table tap complete")
    return state
def get_start(STATE, tap_stream_id, bookmark_key):
    current_bookmark = singer.get_bookmark(STATE, tap_stream_id, bookmark_key)
    if current_bookmark is None:
        return 0
    return current_bookmark
def get_bookmark(self, state):
    return utils.strptime_with_tz(singer.get_bookmark(state, self.name, self.replication_key))
def sync(config, state, catalog):
    client = SquareClient(config)

    with Transformer() as transformer:
        for stream in catalog.get_selected_streams(state):
            tap_stream_id = stream.tap_stream_id
            stream_obj = STREAMS[tap_stream_id](client, state)
            replication_key = stream_obj.replication_key
            stream_schema = stream.schema.to_dict()
            stream_metadata = metadata.to_map(stream.metadata)

            LOGGER.info('Starting sync for stream: %s', tap_stream_id)

            state = singer.set_currently_syncing(state, tap_stream_id)
            singer.write_state(state)

            singer.write_schema(tap_stream_id, stream_schema, stream_obj.key_properties, stream.replication_key)

            start_time = singer.get_bookmark(state, tap_stream_id, replication_key, config['start_date'])
            bookmarked_cursor = singer.get_bookmark(state, tap_stream_id, 'cursor')

            if tap_stream_id == 'shifts':
                replication_key = stream_obj.replication_key

                sync_start_bookmark = singer.get_bookmark(
                    state,
                    tap_stream_id,
                    'sync_start',
                    singer.utils.strftime(singer.utils.now(), format_str=singer.utils.DATETIME_PARSE))
                state = singer.write_bookmark(state, tap_stream_id, 'sync_start', sync_start_bookmark)

                for page, cursor in stream_obj.sync(start_time, bookmarked_cursor):
                    for record in page:
                        if record[replication_key] >= start_time:
                            transformed_record = transformer.transform(record, stream_schema, stream_metadata)
                            singer.write_record(tap_stream_id, transformed_record)
                    state = singer.write_bookmark(state, tap_stream_id, 'cursor', cursor)
                    singer.write_state(state)

                state = singer.clear_bookmark(state, tap_stream_id, 'sync_start')
                state = singer.write_bookmark(state, tap_stream_id, replication_key, sync_start_bookmark)
                singer.write_state(state)

            elif stream_obj.replication_method == 'INCREMENTAL':
                replication_key = stream_obj.replication_key
                max_record_value = start_time
                for page, cursor in stream_obj.sync(start_time, bookmarked_cursor):
                    for record in page:
                        transformed_record = transformer.transform(record, stream_schema, stream_metadata)
                        singer.write_record(tap_stream_id, transformed_record)
                        if record[replication_key] > max_record_value:
                            max_record_value = transformed_record[replication_key]
                    state = singer.write_bookmark(state, tap_stream_id, 'cursor', cursor)
                    state = singer.write_bookmark(state, tap_stream_id, replication_key, max_record_value)
                    singer.write_state(state)

            else:
                for record in stream_obj.sync(start_time, bookmarked_cursor):
                    transformed_record = transformer.transform(record, stream_schema, stream_metadata)
                    singer.write_record(tap_stream_id, transformed_record)

            state = singer.clear_bookmark(state, tap_stream_id, 'cursor')
            singer.write_state(state)

    state = singer.set_currently_syncing(state, None)
    singer.write_state(state)
def do_sync_historical_binlog(mysql_conn, config, catalog_entry, state, columns):
    binlog.verify_binlog_config(mysql_conn)

    is_view = common.get_is_view(catalog_entry)
    key_properties = common.get_key_properties(catalog_entry)

    if is_view:
        raise Exception("Unable to replicate stream({}) with binlog because it is a view.".format(catalog_entry.stream))

    log_file = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'log_file')

    log_pos = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'log_pos')

    max_pk_values = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'max_pk_values')

    last_pk_fetched = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'last_pk_fetched')

    write_schema_message(catalog_entry)

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id, state)

    if log_file and log_pos and max_pk_values:
        LOGGER.info("Resuming initial full table sync for LOG_BASED stream %s", catalog_entry.tap_stream_id)
        full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)
    else:
        LOGGER.info("Performing initial full table sync for LOG_BASED stream %s", catalog_entry.tap_stream_id)

        state = singer.write_bookmark(state, catalog_entry.tap_stream_id, 'initial_binlog_complete', False)

        current_log_file, current_log_pos = binlog.fetch_current_log_file_and_pos(mysql_conn)
        state = singer.write_bookmark(state, catalog_entry.tap_stream_id, 'version', stream_version)

        if full_table.pks_are_auto_incrementing(mysql_conn, catalog_entry):
            # We must save log_file and log_pos across FULL_TABLE syncs when using
            # an incrementing PK
            state = singer.write_bookmark(state, catalog_entry.tap_stream_id, 'log_file', current_log_file)
            state = singer.write_bookmark(state, catalog_entry.tap_stream_id, 'log_pos', current_log_pos)

            full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)
        else:
            full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)
            state = singer.write_bookmark(state, catalog_entry.tap_stream_id, 'log_file', current_log_file)
            state = singer.write_bookmark(state, catalog_entry.tap_stream_id, 'log_pos', current_log_pos)
def get_since_id(self):
    return singer.get_bookmark(Context.state,
                               # name is overridden by some substreams
                               self.name,
                               'since_id')
def get_current_sync_start(state, tap_stream_id):
    current_sync_start_value = singer.get_bookmark(state, tap_stream_id, "current_sync_start")
    if current_sync_start_value is None:
        return current_sync_start_value
    return utils.strptime_to_utc(current_sync_start_value)
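# A hedged companion sketch for get_current_sync_start: the usual pattern is to pin
# `current_sync_start` when a sync begins, use it (or the regular bookmark) as the query
# floor, and clear it only after the run finishes, so a crashed run is retried from where
# it actually started. The helper names below are illustrative, not from any specific tap.
import singer
from singer import utils


def pin_current_sync_start(state, tap_stream_id):
    existing = get_current_sync_start(state, tap_stream_id)
    sync_start = existing or utils.now()
    singer.write_bookmark(state, tap_stream_id, "current_sync_start", utils.strftime(sync_start))
    return sync_start


def finish_sync(state, tap_stream_id, new_bookmark):
    singer.write_bookmark(state, tap_stream_id, "replication_key_value", utils.strftime(new_bookmark))
    # dropping current_sync_start marks the run as complete
    singer.clear_bookmark(state, tap_stream_id, "current_sync_start")
    singer.write_state(state)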
def sync_table(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'], 'version') is None

    # pick a new table version IFF we do not have an xmin in our state
    # the presence of an xmin indicates that we were interrupted last time through
    if singer.get_bookmark(state, stream['tap_stream_id'], 'xmin') is None:
        nascent_stream_version = int(time.time() * 1000)
    else:
        nascent_stream_version = singer.get_bookmark(state, stream['tap_stream_id'], 'version')

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version', nascent_stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get('schema-name')

    escaped_columns = map(partial(post_db.prepare_columns_for_select_sql, md_map=md_map), desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=post_db.calculate_destination_stream_name(stream),
        version=nascent_stream_version)

    if first_run:
        singer.write_message(activate_version_message)

    hstore_available = post_db.hstore_available(conn_info)
    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:
            # Client side character encoding defaults to the value in postgresql.conf under client_encoding.
            # The server / db can also have its own configured encoding.
            with conn.cursor() as cur:
                cur.execute("show server_encoding")
                LOGGER.info("Current Server Encoding: %s", cur.fetchone()[0])
                cur.execute("show client_encoding")
                LOGGER.info("Current Client Encoding: %s", cur.fetchone()[0])

            if hstore_available:
                LOGGER.info("hstore is available")
                psycopg2.extras.register_hstore(conn)
            else:
                LOGGER.info("hstore is UNavailable")

            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor, name='stitch_cursor') as cur:
                cur.itersize = post_db.CURSOR_ITER_SIZE

                fq_table_name = post_db.fully_qualified_table_name(schema_name, stream['table_name'])
                xmin = singer.get_bookmark(state, stream['tap_stream_id'], 'xmin')
                if xmin:
                    LOGGER.info("Resuming Full Table replication %s from xmin %s", nascent_stream_version, xmin)
                    select_sql = """SELECT {}, xmin::text::bigint
                                      FROM {}
                                     WHERE age(xmin::xid) <= age('{}'::xid)
                                     ORDER BY xmin::text ASC""".format(','.join(escaped_columns), fq_table_name, xmin)
                else:
                    LOGGER.info("Beginning new Full Table replication %s", nascent_stream_version)
                    select_sql = """SELECT {}, xmin::text::bigint
                                      FROM {}
                                     ORDER BY xmin::text ASC""".format(','.join(escaped_columns), fq_table_name)

                LOGGER.info("select %s with itersize %s", select_sql, cur.itersize)
                cur.execute(select_sql)

                rows_saved = 0
                for rec in cur:
                    xmin = rec['xmin']
                    rec = rec[:-1]
                    record_message = post_db.selected_row_to_singer_message(stream, rec, nascent_stream_version,
                                                                            desired_columns, time_extracted, md_map)
                    singer.write_message(record_message)
                    state = singer.write_bookmark(state, stream['tap_stream_id'], 'xmin', xmin)
                    rows_saved = rows_saved + 1
                    if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

                    counter.increment()

    # once we have completed the full table replication, discard the xmin bookmark.
    # the xmin bookmark only comes into play when a full table replication is interrupted
    state = singer.write_bookmark(state, stream['tap_stream_id'], 'xmin', None)

    # always send the activate version whether first run or subsequent
    singer.write_message(activate_version_message)

    return state
def sync_stream(client, stream, state):
    tap_stream_id = stream['tap_stream_id']

    common.COUNTS[tap_stream_id] = 0
    common.TIMES[tap_stream_id] = 0
    common.SCHEMA_COUNT[tap_stream_id] = 0
    common.SCHEMA_TIMES[tap_stream_id] = 0

    md_map = metadata.to_map(stream['metadata'])
    replication_method = metadata.get(md_map, (), 'replication-method')
    database_name = metadata.get(md_map, (), 'database-name')

    stream_projection = load_stream_projection(stream)

    # Emit a state message to indicate that we've started this stream
    state = clear_state_on_replication_change(stream, state)
    state = singer.set_currently_syncing(state, stream['tap_stream_id'])
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    write_schema_message(stream)
    common.SCHEMA_COUNT[tap_stream_id] += 1

    with metrics.job_timer('sync_table') as timer:
        timer.tags['database'] = database_name
        timer.tags['table'] = stream['table_name']

        if replication_method == 'LOG_BASED':
            if oplog.oplog_has_aged_out(client, state, tap_stream_id):
                # remove all state for stream
                # then it will do a full sync and start oplog again.
                LOGGER.info("Clearing state because Oplog has aged out")
                state.get('bookmarks', {}).pop(tap_stream_id)

            # make sure initial full table sync has been completed
            if not singer.get_bookmark(state, tap_stream_id, 'initial_full_table_complete'):
                msg = 'Must complete full table sync before starting oplog replication for %s'
                LOGGER.info(msg, tap_stream_id)

                # only mark current ts in oplog on first sync so tap has a
                # starting point after the full table sync
                if singer.get_bookmark(state, tap_stream_id, 'version') is None:
                    collection_oplog_ts = oplog.get_latest_ts(client)
                    oplog.update_bookmarks(state, tap_stream_id, collection_oplog_ts)

                full_table.sync_collection(client, stream, state, stream_projection)

            oplog.sync_collection(client, stream, state, stream_projection)

        elif replication_method == 'FULL_TABLE':
            full_table.sync_collection(client, stream, state, stream_projection)

        elif replication_method == 'INCREMENTAL':
            incremental.sync_collection(client, stream, state, stream_projection)

        else:
            raise Exception(
                "only FULL_TABLE, LOG_BASED, and INCREMENTAL replication "
                "methods are supported (you passed {})".format(replication_method))

    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
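# The function above calls clear_state_on_replication_change() before syncing. A minimal
# sketch of what such a helper could do is shown below, assuming the last replication method
# is remembered under a bookmark key like 'last_replication_method' (an assumed convention,
# not confirmed by the code above).
def clear_state_on_replication_change(stream, state):
    # Sketch only: if the stream's replication method changed since the last run,
    # drop its bookmarks so the next sync starts clean, then record the new method.
    tap_stream_id = stream['tap_stream_id']
    md_map = metadata.to_map(stream['metadata'])
    current_method = metadata.get(md_map, (), 'replication-method')

    last_method = singer.get_bookmark(state, tap_stream_id, 'last_replication_method')
    if last_method is not None and last_method != current_method:
        state.get('bookmarks', {}).pop(tap_stream_id, None)

    return singer.write_bookmark(state, tap_stream_id, 'last_replication_method', current_method)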
def get_bookmark(self, state, name=None):
    name = self.name if not name else name
    return (singer.get_bookmark(state, name, self.replication_key)
            or Context.config["start_date"])
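# Hedged usage sketch for the method above, assuming a stream object that exposes
# `name` and `replication_key` (the stream and key names here are illustrative only):
#
#     state = {"bookmarks": {"invoices": {"updated_at": "2023-01-01T00:00:00Z"}}}
#     invoices_stream.get_bookmark(state)             # -> "2023-01-01T00:00:00Z"
#     invoices_stream.get_bookmark(state, "charges")  # -> Context.config["start_date"]
#
# The fallback to the configured start_date means a stream with no bookmark simply
# starts from the beginning of the configured window.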
async def sync_catalog_entry(sf, catalog_entry, state):
    stream_version = get_stream_version(catalog_entry, state)
    stream = catalog_entry['stream']
    stream_alias = catalog_entry.get('stream_alias')
    stream_name = catalog_entry["tap_stream_id"]
    activate_version_message = singer.ActivateVersionMessage(
        stream=(stream_alias or stream), version=stream_version)

    catalog_metadata = metadata.to_map(catalog_entry['metadata'])
    replication_key = catalog_metadata.get((), {}).get('replication-key')

    if not stream_is_selected(catalog_metadata):
        LOGGER.info("%s: Skipping - not selected", stream_name)
        return

    LOGGER.info("%s: Starting", stream_name)

    singer.write_state(state)
    key_properties = catalog_metadata.get((), {}).get('table-key-properties')
    singer.write_schema(stream, catalog_entry['schema'], key_properties,
                        replication_key, stream_alias)

    loop = asyncio.get_event_loop()

    job_id = singer.get_bookmark(state, catalog_entry['tap_stream_id'], 'JobID')
    if job_id:
        with metrics.record_counter(stream) as counter:
            LOGGER.info("Found JobID from previous Bulk Query. Resuming sync for job: %s", job_id)
            # Resuming a sync should clear out the remaining state once finished
            await loop.run_in_executor(None, resume_syncing_bulk_query, sf,
                                       catalog_entry, job_id, state, counter)
            LOGGER.info("Completed sync for %s", stream_name)
            state.get('bookmarks', {}).get(catalog_entry['tap_stream_id'], {}).pop('JobID', None)
            state.get('bookmarks', {}).get(catalog_entry['tap_stream_id'], {}).pop('BatchIDs', None)
            bookmark = state.get('bookmarks', {}) \
                            .get(catalog_entry['tap_stream_id'], {}) \
                            .pop('JobHighestBookmarkSeen', None)
            state = singer.write_bookmark(state, catalog_entry['tap_stream_id'],
                                          replication_key, bookmark)
            singer.write_state(state)
    else:
        state_msg_threshold = CONFIG.get('state_message_threshold', 1000)

        # Tables with a replication_key or an empty bookmark will emit an
        # activate_version at the beginning of their sync
        bookmark_is_empty = state.get('bookmarks', {}).get(catalog_entry['tap_stream_id']) is None

        if replication_key or bookmark_is_empty:
            singer.write_message(activate_version_message)
            state = singer.write_bookmark(state, catalog_entry['tap_stream_id'],
                                          'version', stream_version)

        await loop.run_in_executor(None, sync_stream, sf, catalog_entry, state,
                                   state_msg_threshold)
        LOGGER.info("Completed sync for %s", stream_name)
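# To illustrate the resume branch above, the bookmarks for a stream with an in-flight
# Bulk Query might look roughly like the sketch below. Illustrative only: the stream name
# and all values are made up; only the keys mirror the ones popped in the function above.
EXAMPLE_BULK_RESUME_STATE = {
    "bookmarks": {
        "Account": {
            "JobID": "7504x00000XXXXXXXX",            # bulk job to resume
            "BatchIDs": ["7514x00000YYYYYYYY"],       # batches still to be downloaded
            "JobHighestBookmarkSeen": "2023-05-01T00:00:00.000000Z"
        }
    }
}
# Once the resumed job finishes, JobID and BatchIDs are removed and
# JobHighestBookmarkSeen is promoted to the stream's replication-key bookmark.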
def sync_query(cursor, catalog_entry, state, select_sql, columns, stream_version, params,
               original_state_file=''):
    replication_key = singer.get_bookmark(state, catalog_entry.tap_stream_id, 'replication_key')

    query_string = cursor.mogrify(select_sql, params)

    time_extracted = utils.now()

    # Reattempt the query once in case of a 2013 MySQL timeout.
    try:
        cursor.execute(select_sql, params)
    except Exception:
        LOGGER.info('Running %s', query_string)
        LOGGER.exception("An Exception has occurred.")
        cursor.execute(select_sql, params)

    row = cursor.fetchone()
    rows_saved = 0
    database_name = get_database_name(catalog_entry)

    with metrics.record_counter(None) as counter:
        # Rename table to include database name so it is included when sent to Stitch.
        catalog_entry.table = str(database_name) + '_' + str(catalog_entry.table)
        catalog_entry.stream = str(database_name) + '_' + str(catalog_entry.stream)

        counter.tags['database'] = database_name
        counter.tags['table'] = catalog_entry.table

        while row:
            counter.increment()
            rows_saved += 1
            record_message = row_to_singer_record(catalog_entry, stream_version, row,
                                                  columns, time_extracted)
            singer.write_message(record_message)

            md_map = metadata.to_map(catalog_entry.metadata)
            stream_metadata = md_map.get((), {})
            replication_method = stream_metadata.get('replication-method')

            if replication_method in {'FULL_TABLE', 'LOG_BASED'}:
                key_properties = get_key_properties(catalog_entry)

                max_pk_values = singer.get_bookmark(state, catalog_entry.tap_stream_id,
                                                    'max_pk_values')
                if max_pk_values:
                    last_pk_fetched = {k: v for k, v in record_message.record.items()
                                       if k in key_properties}
                    state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                                  'last_pk_fetched', last_pk_fetched)

            elif replication_method == 'INCREMENTAL':
                if replication_key is not None:
                    state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                                  'replication_key', replication_key)
                    state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                                  'replication_key_value',
                                                  record_message.record[replication_key])

            if rows_saved % 1000 == 0:
                singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

            row = cursor.fetchone()

    # Update and back up the state file so the current increment survives an error.
    if original_state_file != '':
        os.rename(original_state_file, original_state_file + '_backup')
        with open(original_state_file, 'w') as state_file:
            json.dump(copy.deepcopy(state), state_file)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
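# The retry above catches any Exception before re-running the query. A narrower variant
# that retries only on MySQL error 2013 ("lost connection to server during query") could
# look like the sketch below. This is an assumption-laden sketch: it presumes the pymysql
# driver and is not what the function above does.
import pymysql


def execute_with_retry(cursor, select_sql, params, query_string):
    # Sketch only: retry once on error code 2013; re-raise anything else rather
    # than blindly retrying every failure.
    try:
        cursor.execute(select_sql, params)
    except pymysql.err.OperationalError as exc:
        if exc.args and exc.args[0] == 2013:
            LOGGER.info('Retrying after lost connection: %s', query_string)
            cursor.execute(select_sql, params)
        else:
            raise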