def test_incremental(self, log):
    with metrics.record_counter(endpoint='users') as counter:
        counter.increment(1)
        counter._ready_to_log = lambda: True
        counter.increment(2)
        counter._ready_to_log = lambda: False
        counter.increment(5)
    self.assertEqual(
        [metrics.Point('counter', 'record_count', 3, {'endpoint': 'users'}),
         metrics.Point('counter', 'record_count', 5, {'endpoint': 'users'})],
        logged_points(log))
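For orientation, a minimal, self-contained sketch of the pattern the examples below share. The emit_records helper, stream name, and records are illustrative placeholders, not taken from any of the taps here; the sketch assumes a SCHEMA message for the stream has already been emitted.

# Minimal usage sketch (hypothetical helper; stream name and records are placeholders).
import singer
import singer.metrics as metrics

def emit_records(stream_name, records):
    # record_counter periodically logs a record_count metric and flushes
    # the remaining count when the with-block exits.
    with metrics.record_counter(stream_name) as counter:
        for record in records:
            singer.write_record(stream_name, record)
            counter.increment()

emit_records("users", [{"id": 1}, {"id": 2}])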
def sync_generic_basic_endpoint(sdk_client, stream, stream_metadata):
    discovered_schema = load_schema(stream)
    field_list = get_field_list(discovered_schema, stream, stream_metadata)

    discovered_schema['properties']['_sdc_customer_id'] = {
        'description': 'Profile ID',
        'type': 'string',
        'field': "customer_id"
    }
    primary_keys = GENERIC_ENDPOINT_MAPPINGS[stream]['primary_keys']
    write_schema(stream, discovered_schema, primary_keys)

    LOGGER.info("Syncing %s for customer %s", stream, sdk_client.client_customer_id)

    start_index = 0
    selector = {
        'fields': field_list,
        'paging': {
            'startIndex': str(start_index),
            'numberResults': str(PAGE_SIZE)
        }
    }

    while True:
        page = get_page(sdk_client, selector, stream, start_index)
        if page['totalNumEntries'] > GOOGLE_MAX_START_INDEX:
            raise Exception("Too many {} ({} > {}) for customer {}".format(
                stream,
                page['totalNumEntries'],
                GOOGLE_MAX_START_INDEX,
                sdk_client.client_customer_id))

        if 'entries' in page:
            with metrics.record_counter(stream) as counter:
                time_extracted = utils.now()

                for entry in page['entries']:
                    obj = suds_to_dict(entry)
                    obj['_sdc_customer_id'] = sdk_client.client_customer_id

                    with Transformer(
                            singer.UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING
                    ) as bumble_bee:
                        bumble_bee.pre_hook = transform_pre_hook
                        record = bumble_bee.transform(obj, discovered_schema)
                        singer.write_record(stream, record,
                                            time_extracted=time_extracted)
                        counter.increment()

        start_index += PAGE_SIZE
        if start_index > int(page['totalNumEntries']):
            break

    LOGGER.info("Done syncing %s for customer_id %s",
                stream, sdk_client.client_customer_id)
def process_records(catalog, #pylint: disable=too-many-branches
                    stream_name,
                    records,
                    time_extracted,
                    bookmark_field=None,
                    max_bookmark_value=None,
                    last_datetime=None,
                    parent=None,
                    parent_id=None):
    stream = catalog.get_stream(stream_name)
    schema = stream.schema.to_dict()
    stream_metadata = metadata.to_map(stream.metadata)

    with metrics.record_counter(stream_name) as counter:
        for record in records:
            # If child object, add parent_id to record
            if parent_id and parent:
                record[parent + '_id'] = parent_id

            # Transform record for Singer.io
            with Transformer() as transformer:
                try:
                    transformed_record = transformer.transform(
                        record,
                        schema,
                        stream_metadata)
                except Exception as err:
                    LOGGER.error('Transformer Error: {}'.format(err))
                    LOGGER.error('Stream: {}, record: {}'.format(stream_name, record))
                    raise err

                # Reset max_bookmark_value to new value if higher
                if transformed_record.get(bookmark_field):
                    if max_bookmark_value is None or \
                            transformed_record[bookmark_field] > transform_datetime(max_bookmark_value):
                        max_bookmark_value = transformed_record[bookmark_field]

                if bookmark_field and (bookmark_field in transformed_record):
                    last_dttm = transform_datetime(last_datetime)
                    bookmark_dttm = transform_datetime(transformed_record[bookmark_field])
                    # Keep only records whose bookmark is after the last_datetime
                    if bookmark_dttm:
                        if bookmark_dttm >= last_dttm:
                            write_record(stream_name, transformed_record,
                                         time_extracted=time_extracted)
                            counter.increment()
                else:
                    write_record(stream_name, transformed_record,
                                 time_extracted=time_extracted)
                    counter.increment()

        return max_bookmark_value, counter.value
def sync_offer_stages(offer):
    LOGGER.info("-----/ Syncing offer_stages")
    offer_id = int(offer["id"])
    if "stages" in offer and len(offer["stages"]) > 0:
        with metrics.record_counter("offer_stages") as counter:
            offer_stages = [
                ofr.filter_stage(stage, offer_id) for stage in offer["stages"]
            ]
            for stage in offer_stages:
                counter.increment()
                singer.write_record("offer_stages", stage)
def sync_entity_chunked(STATE, catalog, entity_name, key_properties, path):
    schema = load_schema(entity_name)
    bookmark_key = 'startTimestamp'

    singer.write_schema(entity_name, schema, key_properties,
                        [bookmark_key], catalog.get('stream_alias'))

    start = get_start(STATE, entity_name, bookmark_key)
    LOGGER.info("sync_%s from %s", entity_name, start)

    now = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC)
    now_ts = int(now.timestamp() * 1000)

    start_ts = int(utils.strptime_with_tz(start).timestamp() * 1000)
    url = get_url(entity_name)

    mdata = metadata.to_map(catalog.get('metadata'))

    with metrics.record_counter(entity_name) as counter:
        while start_ts < now_ts:
            end_ts = start_ts + CHUNK_SIZES[entity_name]
            params = {
                'startTimestamp': start_ts,
                'endTimestamp': end_ts,
                'limit': 1000,
            }
            with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
                while True:
                    our_offset = singer.get_offset(STATE, entity_name)
                    if bool(our_offset) and our_offset.get('offset') is not None:
                        params[StateFields.offset] = our_offset.get('offset')

                    data = request(url, params).json()
                    time_extracted = utils.now()

                    for row in data[path]:
                        counter.increment()
                        record = bumble_bee.transform(row, schema, mdata)
                        singer.write_record(entity_name,
                                            record,
                                            catalog.get('stream_alias'),
                                            time_extracted=time_extracted)
                    if data.get('hasMore'):
                        STATE = singer.set_offset(STATE, entity_name, 'offset', data['offset'])
                        singer.write_state(STATE)
                    else:
                        STATE = singer.clear_offset(STATE, entity_name)
                        singer.write_state(STATE)
                        break
            STATE = singer.write_bookmark(STATE, entity_name, 'startTimestamp', utils.strftime(
                datetime.datetime.fromtimestamp((start_ts / 1000), datetime.timezone.utc)))
            singer.write_state(STATE)
            start_ts = end_ts

    STATE = singer.clear_offset(STATE, entity_name)
    singer.write_state(STATE)
    return STATE
def sync_candidate_tags(candidate):
    LOGGER.info("-----/ Syncing candidate_tags")
    candidate_id = int(candidate["id"])
    if "tags" in candidate and len(candidate["tags"]) > 0:
        with metrics.record_counter("candidate_tags") as counter:
            candidate_tags = [
                cand.filter_tag(tag, candidate_id) for tag in candidate["tags"]
            ]
            for tag in candidate_tags:
                counter.increment()
                singer.write_record("candidate_tags", tag)
async def process_lines(self, lines, loop=None):
    loop = loop or asyncio.get_event_loop()
    api = self._api_client
    schemas = {}
    active_versions = {}
    queues = {}
    consumers = {}

    logger.info('Checking network connectivity')
    api.connection_check()  # Fail fast

    logger.info('Ensuring dataset exists and is in good state')
    await self._fix_dataset()

    with metrics.record_counter() as counter:
        for line in lines:
            try:
                msg = singer.parse_message(line)
            except (json.JSONDecodeError, simplejson.JSONDecodeError) as e:
                raise UnparseableMessageError(line, str(e))

            if isinstance(msg, singer.RecordMessage):
                await self._handle_record_msg(msg, schemas, active_versions,
                                              loop, queues, consumers)
                counter.increment()
                logger.debug('Line #{} in {} queued for upload'.format(
                    counter.value, msg.stream))
            elif isinstance(msg, singer.SchemaMessage):
                logger.info('Schema found for {}'.format(msg.stream))
                schemas[msg.stream] = await self._handle_schema_msg(msg)
            elif isinstance(msg, singer.StateMessage):
                logger.info('State message found: {}'.format(msg.value))
                state = await self._handle_state_msg(msg, queues, consumers)
                queues = {}
                yield state
            elif isinstance(msg, singer.ActivateVersionMessage):
                logger.info('Version message found: {}/{}'.format(
                    msg.stream, msg.version))
                current_version = active_versions.get(msg.stream)
                active_version = await self._handle_active_version_msg(
                    msg, current_version, api)
                active_versions[msg.stream] = active_version
            else:
                logger.warning('Unrecognized message ({})'.format(msg))

    await TargetDataDotWorld._drain_queues(queues, consumers)

    self._api_client.sync(self.config['dataset_owner'],
                          self.config['dataset_id'])
def output_responses(stream_id, config: dict, state: dict) -> dict:
    """ Query the api for individual responses and output them """
    while True:
        previous_state_end_datetime = state.get('bookmarks', {}).get(
            stream_id, {}).get('last_record', None)

        # Start where the previous run left off or, on a first run,
        # use the start date from the config.
        start_datetime = arrow.get(previous_state_end_datetime or config.get('start_date'))

        # Request data from the api in blocks of a month
        end_datetime = start_datetime.shift(months=1)

        # Fetch data from the api
        params = {
            "format": "json",
            "project": config["project_id"],
            "startDate": start_datetime.isoformat(),
            "endDate": end_datetime.isoformat(),
        }
        res_json = request('/responses/',
                           params=params,
                           auth=HTTPBasicAuth(config["api_key"], None),
                           user_agent=config.get('user_agent', None)).json()

        # Output items
        bookmark = start_datetime
        with record_counter(endpoint=stream_id) as counter:
            for record in res_json['responses']:
                write_record(stream_id, record)
                counter.increment()
                bookmark = max([arrow.get(record['created']), bookmark])

        # If we're not yet past the current timestamp, set the bookmark to the
        # end_datetime requested, as no new responses will come in for past times.
        if end_datetime < arrow.utcnow():
            bookmark = end_datetime

        # Update and export state
        if 'bookmarks' not in state:
            state['bookmarks'] = {}
        state['bookmarks'][stream_id] = {'last_record': bookmark.isoformat()}
        write_state(state)

        # Stop once we have requested past the current timestamp;
        # there won't be anything more.
        if end_datetime > arrow.utcnow():
            break
    return state
def sync_view(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    # Before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'], 'version') is None

    nascent_stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state,
                                  stream['tap_stream_id'],
                                  'version',
                                  nascent_stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get('schema-name')

    escaped_columns = map(post_db.prepare_columns_sql, desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=post_db.calculate_destination_stream_name(stream, md_map),
        version=nascent_stream_version)

    if first_run:
        singer.write_message(activate_version_message)

    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor,
                             name='stitch_cursor') as cur:
                cur.itersize = post_db.cursor_iter_size
                select_sql = 'SELECT {} FROM {}'.format(
                    ','.join(escaped_columns),
                    post_db.fully_qualified_table_name(schema_name, stream['table_name']))

                LOGGER.info("select %s with itersize %s", select_sql, cur.itersize)
                cur.execute(select_sql)

                rows_saved = 0
                for rec in cur:
                    record_message = post_db.selected_row_to_singer_message(
                        stream,
                        rec,
                        nascent_stream_version,
                        desired_columns,
                        time_extracted,
                        md_map)
                    singer.write_message(record_message)
                    rows_saved = rows_saved + 1
                    if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

                    counter.increment()

    # Always send the activate version, whether first run or subsequent
    singer.write_message(activate_version_message)

    return state
def sync_candidate_sources(candidate):
    LOGGER.info("-----/ Syncing candidate_sources")
    candidate_id = int(candidate["id"])
    if "sources" in candidate and len(candidate["sources"]) > 0:
        with metrics.record_counter("candidate_sources") as counter:
            candidate_sources = [
                cand.filter_source(source, candidate_id) for source in candidate["sources"]
            ]
            for source in candidate_sources:
                counter.increment()
                singer.write_record("candidate_sources", source)
def sync_candidate_placements(candidate):
    LOGGER.info("-----/ Syncing candidate_placements")
    candidate_id = int(candidate["id"])
    if "placements" in candidate and len(candidate["placements"]) > 0:
        with metrics.record_counter("candidate_placements") as counter:
            candidate_placements = [
                cand.filter_placement(placement, candidate_id)
                for placement in candidate["placements"]
            ]
            for placement in candidate_placements:
                counter.increment()
                singer.write_record("candidate_placements", placement)
def persist_records(catalog, stream_id, records):
    stream = catalog.get_stream(stream_id)
    schema = stream.schema.to_dict()
    stream_metadata = metadata.to_map(stream.metadata)
    with metrics.record_counter(stream_id) as counter:
        for record in records:
            with Transformer(
                    integer_datetime_fmt=UNIX_SECONDS_INTEGER_DATETIME_PARSING
            ) as transformer:
                record = transformer.transform(record, schema, stream_metadata)
                singer.write_record(stream_id, record)
                counter.increment()
def process_records(catalog, stream_name, records):
    if records:
        stream = catalog.get_stream(stream_name)
        schema = stream.schema.to_dict()
        stream_metadata = metadata.to_map(stream.metadata)
        with metrics.record_counter(stream_name) as counter:
            for record in records:
                with Transformer() as transformer:
                    record = transformer.transform(record, schema, stream_metadata)
                    singer.write_record(stream_name, record)
                    counter.increment()
def process_records(catalog, #pylint: disable=too-many-branches
                    stream_name,
                    records,
                    time_extracted,
                    bookmark_field=None,
                    bookmark_type=None,
                    max_bookmark_value=None,
                    last_datetime=None,
                    last_integer=None):
    stream = catalog.get_stream(stream_name)
    schema = stream.schema.to_dict()
    stream_metadata = metadata.to_map(stream.metadata)

    with metrics.record_counter(stream_name) as counter:
        for record in records:
            # Transform record for Singer.io
            with Transformer() as transformer:
                transformed_record = transformer.transform(
                    record,
                    schema,
                    stream_metadata)

                # Reset max_bookmark_value to new value if higher
                if bookmark_field and (bookmark_field in transformed_record):
                    if (max_bookmark_value is None) or \
                            (transformed_record[bookmark_field] > transform_datetime(max_bookmark_value)):
                        max_bookmark_value = transformed_record[bookmark_field]

                if bookmark_field and (bookmark_field in transformed_record):
                    if bookmark_type == 'integer':
                        # Keep only records whose bookmark is after the last_integer
                        if transformed_record[bookmark_field] >= last_integer:
                            write_record(stream_name, transformed_record,
                                         time_extracted=time_extracted)
                            counter.increment()
                    elif bookmark_type == 'datetime':
                        last_dttm = transform_datetime(last_datetime)
                        bookmark_dttm = transform_datetime(transformed_record[bookmark_field])
                        # Keep only records whose bookmark is after the last_datetime
                        if bookmark_dttm >= last_dttm:
                            # LOGGER.info('record1: {}'.format(record))  # TESTING, comment out
                            write_record(stream_name, transformed_record,
                                         time_extracted=time_extracted)
                            counter.increment()
                else:
                    # LOGGER.info('record2: {}'.format(record))  # TESTING, comment out
                    write_record(stream_name, transformed_record,
                                 time_extracted=time_extracted)
                    counter.increment()

        LOGGER.info('Stream: {}, Processed {} records'.format(stream_name, counter.value))
        return max_bookmark_value
def persist_records(catalog, stream_id, records):
    if records:  # check for empty array
        stream = catalog.get_stream(stream_id)
        schema = stream.schema.to_dict()
        stream_metadata = metadata.to_map(stream.metadata)
        with metrics.record_counter(stream_id) as counter:
            for record in records:
                with Transformer() as transformer:
                    record = transformer.transform(record, schema, stream_metadata)
                    singer.write_record(stream_id, record)
                    counter.increment()
def process_records(catalog, #pylint: disable=too-many-branches
                    stream_name,
                    records,
                    time_extracted,
                    bookmark_field=None,
                    bookmark_type=None,
                    max_bookmark_value=None,
                    last_datetime=None,
                    last_integer=None,
                    parent=None,
                    parent_id=None):
    stream = catalog.get_stream(stream_name)
    schema = stream.schema.to_dict()
    stream_metadata = metadata.to_map(stream.metadata)

    with metrics.record_counter(stream_name) as counter:
        for record in records:
            # If child object, add parent_id to record
            if parent_id and parent:
                record[parent + '_id'] = parent_id

            # Transform record for Singer.io
            with Transformer() as transformer:
                transformed_record = transformer.transform(record, schema, stream_metadata)

                # Reset max_bookmark_value to new value if higher
                if bookmark_field and (bookmark_field in transformed_record):
                    if (max_bookmark_value is None) or \
                            (transformed_record[bookmark_field] > max_bookmark_value):
                        max_bookmark_value = transformed_record[bookmark_field]

                if bookmark_field:
                    if bookmark_field in transformed_record:
                        if bookmark_type == 'integer':
                            # Keep only records whose bookmark is after the last_integer
                            if transformed_record[bookmark_field] >= last_integer:
                                write_record(stream_name, transformed_record,
                                             time_extracted=time_extracted)
                                counter.increment()
                        elif bookmark_type == 'datetime':
                            last_dttm = transformer._transform_datetime(last_datetime)
                            bookmark_dttm = transformer._transform_datetime(
                                transformed_record[bookmark_field])
                            # Keep only records whose bookmark is after the last_datetime
                            if bookmark_dttm >= last_dttm:
                                write_record(stream_name, transformed_record,
                                             time_extracted=time_extracted)
                                counter.increment()
                else:
                    write_record(stream_name, transformed_record,
                                 time_extracted=time_extracted)
                    counter.increment()

        return max_bookmark_value, counter.value
def sync_campaign_ids_endpoint(sdk_client,
                               campaign_ids,
                               stream_schema,
                               stream,
                               stream_metadata):
    discovered_schema = load_schema(stream)
    field_list = get_field_list(discovered_schema, stream, stream_metadata)

    discovered_schema['properties']['_sdc_customer_id'] = {
        'description': 'Profile ID',
        'type': 'string',
        'field': "customer_id"
    }
    primary_keys = GENERIC_ENDPOINT_MAPPINGS[stream]['primary_keys']
    write_schema(stream, discovered_schema, primary_keys)

    LOGGER.info("Syncing %s for customer %s", stream, sdk_client.client_customer_id)

    for safe_selector in get_campaign_ids_safe_selectors(sdk_client, campaign_ids, stream):
        start_index = 0
        while True:
            page = get_campaign_ids_filtered_page(sdk_client,
                                                  field_list,
                                                  safe_selector,
                                                  stream,
                                                  start_index)
            if page['totalNumEntries'] > GOOGLE_MAX_RESULTSET_SIZE:
                raise Exception("Too many {} ({} > {}) for customer {}, campaigns {}".format(
                    stream,
                    page['totalNumEntries'],
                    GOOGLE_MAX_RESULTSET_SIZE,
                    sdk_client.client_customer_id,
                    campaign_ids))

            if 'entries' in page:
                with metrics.record_counter(stream) as counter:
                    for entry in page['entries']:
                        obj = suds_to_dict(entry)
                        obj['_sdc_customer_id'] = sdk_client.client_customer_id

                        with Transformer(
                                singer.UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
                            bumble_bee.pre_hook = transform_pre_hook
                            record = bumble_bee.transform(obj, discovered_schema)
                            singer.write_record(stream, record)
                            counter.increment()

            start_index += PAGE_SIZE
            if start_index > int(page['totalNumEntries']):
                break

    LOGGER.info("Done syncing %s for customer_id %s", stream, sdk_client.client_customer_id)
def sync_rows(filename, STATE, tap_stream_id, key_properties=[], auth_method=None, max_page=None):
    schema = load_schema(tap_stream_id)
    singer.write_schema(tap_stream_id, schema, key_properties)

    bookmark_type = get_bookmark_type()
    start = get_start(STATE, tap_stream_id, "last_update")
    end = get_end()

    pretty_start = start
    pretty_end = end
    if bookmark_type == "timestamp":
        pretty_start = str(start) + " (" + str(datetime.datetime.fromtimestamp(start)) + ")"
        if end is not None:
            pretty_end = str(end) + " (" + str(datetime.datetime.fromtimestamp(end)) + ")"

    LOGGER.info(
        "Stream %s has %s set, starting at %s and ending at %s. "
        "I trust the URL format you set contains those params. The behavior depends on "
        "the data source API's spec. I will not filter out records outside the boundary; "
        "every record received will be written out." %
        (tap_stream_id, bookmark_type, pretty_start, pretty_end))

    last_update = start
    etl_tstamp = int(time.time())
    with metrics.record_counter(tap_stream_id) as counter:
        data = read_csv_as_dict(filename,
                                skip=CONFIG.get("skip"),
                                lower=True,
                                replace_special="_",
                                snake_case=True)
        LOGGER.info("Read %d records from CSV. ETL timestamp %d" % (len(data), etl_tstamp))
        for row in data:
            counter.increment()
            row = get_record(row, CONFIG.get("record_level"))
            row = filter_result(row, schema)
            if "_etl_tstamp" in schema["properties"].keys():
                row["_etl_tstamp"] = etl_tstamp
            last_update = get_last_update(row, last_update)
            singer.write_record(tap_stream_id, row)

    STATE = singer.write_bookmark(STATE, tap_stream_id, 'last_update', last_update)
    singer.write_state(STATE)
    return STATE
def print_counts(cls):
    # Separate loops for formatting.
    for stream_name, stream_count in Context.new_counts.items():
        with metrics.record_counter(stream_name) as counter:
            updates_count = Context.updated_counts[stream_name]
            total_replicated = stream_count + updates_count
            counter.increment(total_replicated)

    LOGGER.info('------------------')
    for stream_name, stream_count in Context.new_counts.items():
        LOGGER.info('%s: %d new, %d updates',
                    stream_name,
                    stream_count,
                    Context.updated_counts[stream_name])
    LOGGER.info('------------------')
def write_page(self, page):
    stream = Context.get_catalog_entry(self.tap_stream_id)
    stream_metadata = metadata.to_map(stream.metadata)
    extraction_time = singer.utils.now()
    for rec in page:
        with Transformer() as transformer:
            rec = transformer.transform(rec, stream.schema.to_dict(), stream_metadata)
        singer.write_record(self.tap_stream_id, rec, time_extracted=extraction_time)
    with metrics.record_counter(self.tap_stream_id) as counter:
        counter.increment(len(page))
def sync_events():
    schema = load_schema("events")
    singer.write_schema("events", schema, [])

    for export_bundle in request_export_bundles():
        with metrics.record_counter("events") as counter:
            for event in download_events(export_bundle['Id']):
                transform_event(event)
                counter.increment()
                singer.write_record("events", event)
        stop_timestamp = datetime.datetime.utcfromtimestamp(export_bundle['Stop'])
        utils.update_state(STATE, "events", stop_timestamp)
        singer.write_state(STATE)
def gen_request(STATE, tap_stream_id, url, params, path, more_key,
                offset_keys, offset_targets, v3_fields=None):
    if len(offset_keys) != len(offset_targets):
        raise ValueError("Number of offset_keys must match number of offset_targets")

    if singer.get_offset(STATE, tap_stream_id):
        params.update(singer.get_offset(STATE, tap_stream_id))

    with metrics.record_counter(tap_stream_id) as counter:
        while True:
            data = request(url, params).json()

            if data.get(path) is None:
                raise RuntimeError(
                    "Unexpected API response: {} not in {}".format(path, data.keys()))

            if v3_fields:
                v3_data = get_v3_deals(v3_fields, data[path])

                # The shape of v3_data is different than the V1 response,
                # so we transform v3 to look like v1
                transformed_v3_data = process_v3_deals_records(v3_data)
                merge_responses(data[path], transformed_v3_data)

            for row in data[path]:
                counter.increment()
                yield row

            if not data.get(more_key, False):
                break

            STATE = singer.clear_offset(STATE, tap_stream_id)
            for key, target in zip(offset_keys, offset_targets):
                if key in data:
                    params[target] = data[key]
                    STATE = singer.set_offset(STATE, tap_stream_id, target, data[key])

            singer.write_state(STATE)

    STATE = singer.clear_offset(STATE, tap_stream_id)
    singer.write_state(STATE)
def sync_stream(sf, catalog_entry, state):
    stream = catalog_entry['stream']

    with metrics.record_counter(stream) as counter:
        try:
            sync_records(sf, catalog_entry, state, counter)
            singer.write_state(state)
        except RequestException as ex:
            raise Exception("Error syncing {}: {} Response: {}".format(
                stream, ex, ex.response.text))
        except Exception as ex:
            raise Exception("Error syncing {}: {}".format(stream, ex)) from ex

        return counter
def sync_stream(sf, catalog_entry, state, state_msg_threshold):
    stream = catalog_entry['stream']

    with metrics.record_counter(stream) as counter:
        try:
            sync_records(sf, catalog_entry, state, counter, state_msg_threshold)
            # Write the state generated for the last record returned by sf.query
            singer.write_state(state)
        except RequestException as ex:
            raise Exception("Error syncing {}: {} Response: {}".format(
                stream, ex, ex.response.text))
        except Exception as ex:
            raise Exception("Error syncing {}: {}".format(stream, ex)) from ex
def sync_endpoint(url, state):
    '''Syncs the url and paginates through until there are no more "next"
    urls. Yields schema, record, and state messages. Modifies state by
    setting the NEXT field every time we get a next url from Shippo. This
    allows us to resume paginating if we're terminated.
    '''
    stream = parse_stream_from_url(url)
    yield singer.SchemaMessage(stream=stream,
                               schema=load_schema(stream),
                               key_properties=["object_id"])

    if LAST_START_DATE in state:
        start = pendulum.parse(state[LAST_START_DATE]).subtract(days=2)
    else:
        start = pendulum.parse(CONFIG[START_DATE])

    # The Shippo API does not return data from long ago, so we only try to
    # replicate the last 60 days
    sixty_days_ago = pendulum.now().subtract(days=60)
    bounded_start = max(start, sixty_days_ago)

    LOGGER.info("Replicating all %s from %s", stream, bounded_start)

    rows_read = 0
    rows_written = 0
    finished = False
    with metrics.record_counter(parse_stream_from_url(url)) as counter:
        while url and not finished:
            state[NEXT] = url
            yield singer.StateMessage(value=state)

            data = request(url)

            for row in data['results']:
                counter.increment()
                rows_read += 1
                updated = pendulum.parse(row[OBJECT_UPDATED])
                if updated >= bounded_start:
                    row = fix_extra_map(row)
                    yield singer.RecordMessage(stream=stream, record=row)
                    rows_written += 1
                else:
                    finished = True
                    break

            url = data.get(NEXT)

    if rows_read:
        LOGGER.info("Done syncing %s. Read %d records, wrote %d (%.2f%%)",
                    stream, rows_read, rows_written,
                    100.0 * rows_written / float(rows_read))
def get_all_projects(schemas, repo_path, state, mdata):
    bookmark_value = get_bookmark(state, repo_path, "projects", "since")
    if bookmark_value:
        bookmark_time = singer.utils.strptime_to_utc(bookmark_value)
    else:
        bookmark_time = 0

    with metrics.record_counter('projects') as counter:
        #pylint: disable=too-many-nested-blocks
        for response in authed_get_all_pages(
                'projects',
                'https://api.github.com/repos/{}/projects?sort=created_at&direction=desc'.format(repo_path),
                {'Accept': 'application/vnd.github.inertia-preview+json'}
        ):
            projects = response.json()
            extraction_time = singer.utils.now()
            for r in projects:
                r['_sdc_repository'] = repo_path

                # skip records that haven't been updated since the last run
                # the GitHub API doesn't currently allow a ?since param for pulls
                # once we find the first piece of old data we can return, thanks to
                # the sorting
                if bookmark_time and singer.utils.strptime_to_utc(r.get('updated_at')) < bookmark_time:
                    return state

                # transform and write the project record
                with singer.Transformer() as transformer:
                    rec = transformer.transform(r, schemas, metadata=metadata.to_map(mdata))
                singer.write_record('projects', rec, time_extracted=extraction_time)
                singer.write_bookmark(state, repo_path, 'projects',
                                      {'since': singer.utils.strftime(extraction_time)})
                counter.increment()

                project_id = r.get('id')

                # sync project_columns if that schema is present (only there if selected)
                if schemas.get('project_columns'):
                    for project_column_rec in get_all_project_columns(
                            project_id, schemas['project_columns'], repo_path, state, mdata):
                        singer.write_record('project_columns', project_column_rec,
                                            time_extracted=extraction_time)
                        singer.write_bookmark(state, repo_path, 'project_columns',
                                              {'since': singer.utils.strftime(extraction_time)})

                        # sync project_cards if that schema is present (only there if selected)
                        if schemas.get('project_cards'):
                            column_id = project_column_rec['id']
                            for project_card_rec in get_all_project_cards(
                                    column_id, schemas['project_cards'], repo_path, state, mdata):
                                singer.write_record('project_cards', project_card_rec,
                                                    time_extracted=extraction_time)
                                singer.write_bookmark(state, repo_path, 'project_cards',
                                                      {'since': singer.utils.strftime(extraction_time)})
    return state
def process_records(catalog, #pylint: disable=too-many-branches
                    stream_name,
                    records,
                    time_extracted,
                    bookmark_field=None,
                    max_bookmark_value=None,
                    last_datetime=None):
    stream = catalog.get_stream(stream_name)
    schema = stream.schema.to_dict()
    stream_metadata = metadata.to_map(stream.metadata)

    with metrics.record_counter(stream_name) as counter:
        for record in records:
            # Transform record for Singer.io
            # LOGGER.info('Process record = {}'.format(record))  # COMMENT OUT
            with Transformer() as transformer:
                try:
                    transformed_record = transformer.transform(
                        record,
                        schema,
                        stream_metadata)
                except Exception as err:
                    LOGGER.error('Error: {}'.format(err))
                    LOGGER.error('Error record: {}'.format(
                        json.dumps(record, sort_keys=True, indent=2)))
                    LOGGER.error(' for schema: {}'.format(
                        json.dumps(schema, sort_keys=True, indent=2)))
                    raise err

                # Reset max_bookmark_value to new value if higher
                if transformed_record.get(bookmark_field):
                    if max_bookmark_value is None or \
                            transformed_record[bookmark_field] > transform_datetime(max_bookmark_value):
                        max_bookmark_value = transformed_record[bookmark_field]

                if bookmark_field and (bookmark_field in transformed_record):
                    last_dttm = transform_datetime(last_datetime)
                    bookmark_dttm = transform_datetime(transformed_record[bookmark_field])
                    # Keep only records whose bookmark is after the last_datetime
                    if bookmark_dttm >= last_dttm:
                        write_record(stream_name, transformed_record,
                                     time_extracted=time_extracted)
                        counter.increment()
                else:
                    write_record(stream_name, transformed_record,
                                 time_extracted=time_extracted)
                    counter.increment()

        return max_bookmark_value, counter.value
def sync_stream(sf, catalog_entry, state):
    stream = catalog_entry['stream']

    with metrics.record_counter(stream) as counter:
        try:
            sync_records(sf, catalog_entry, state, counter)
            singer.write_state(state)
        except TapSalesforceException as ex:
            raise type(ex)("Error syncing {}: {}".format(stream, ex))
        except Exception as ex:
            raise Exception("Unexpected error syncing {}: {}".format(stream, ex)) from ex

        return counter
def process_records(stream, mdata, max_modified, records):
    schema = stream.schema.to_dict()
    with metrics.record_counter(stream.tap_stream_id) as counter:
        for record in records:
            if record['Modified'] > max_modified:
                max_modified = record['Modified']
            with Transformer() as transformer:
                record = transformer.transform(record, schema, mdata)
            singer.write_record(stream.tap_stream_id, record)
            counter.increment()
    return max_modified
def process_records(catalog, #pylint: disable=too-many-branches
                    stream_name,
                    records,
                    time_extracted,
                    bookmark_field=None,
                    max_bookmark_value=None,
                    last_datetime=None,
                    parent=None,
                    parent_id=None):
    stream = catalog.get_stream(stream_name)
    schema = stream.schema.to_dict()
    stream_metadata = metadata.to_map(stream.metadata)

    with metrics.record_counter(stream_name) as counter:
        for record in records:
            # If child object, add parent_id to record
            if parent_id and parent:
                record[parent + '_id'] = parent_id

            # Transform record for Singer.io
            with Transformer(integer_datetime_fmt=UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) \
                    as transformer:
                transformed_record = transformer.transform(
                    record,
                    schema,
                    stream_metadata)

                # Reset max_bookmark_value to new value if higher
                if bookmark_field and (bookmark_field in transformed_record):
                    if max_bookmark_value is None or \
                            strptime_to_utc(transformed_record[bookmark_field]) > strptime_to_utc(max_bookmark_value):
                        max_bookmark_value = transformed_record[bookmark_field]

                if bookmark_field and (bookmark_field in transformed_record):
                    last_dttm = strptime_to_utc(last_datetime)
                    bookmark_dttm = strptime_to_utc(transformed_record[bookmark_field])
                    # Keep only records whose bookmark is after the last_datetime
                    if bookmark_dttm >= last_dttm:
                        write_record(stream_name, transformed_record,
                                     time_extracted=time_extracted)
                        counter.increment()
                else:
                    write_record(stream_name, transformed_record,
                                 time_extracted=time_extracted)
                    counter.increment()

        return max_bookmark_value, counter.value