def _sync_contacts_by_company(STATE, ctx, company_id):
    schema = load_schema(CONTACTS_BY_COMPANY)
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    url = get_url("contacts_by_company", company_id=company_id)
    path = 'vids'

    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        with metrics.record_counter(CONTACTS_BY_COMPANY) as counter:
            data = request(url, default_contacts_by_company_params).json()
            for row in data[path]:
                counter.increment()
                record = {'company-id': company_id, 'contact-id': row}
                record = bumble_bee.transform(record, schema, mdata)
                singer.write_record("contacts_by_company", record, time_extracted=utils.now())

    return STATE
def find_weekly_emails_received(selected_date=previous_week, page_token=None):
    week = create_week(selected_date)
    start_date = week[0]
    end_date = week[-1]

    for date in week:
        process_received(date, page_token)

    total = total_emails_count(total_weekly_emails_received)
    json_response = create_json_response(start_date, end_date, total, 'weekly emails received')

    with Transformer() as transformer:
        transformed_record = transformer.transform(json_response, gmail_schema)
    singer.write_schema('gmail', gmail_schema, ['timestamp'])
    singer.write_records('gmail', [transformed_record])
    return json_response
def sync_deal_pipelines(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    schema = load_schema('deal_pipelines')
    singer.write_schema('deal_pipelines', schema, ['pipelineId'], catalog.get('stream_alias'))
    LOGGER.info('sync_deal_pipelines')
    data = request(get_url('deal_pipelines')).json()
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in data:
            record = bumble_bee.transform(lift_properties_and_versions(row), schema, mdata)
            singer.write_record("deal_pipelines", record, catalog.get('stream_alias'), time_extracted=utils.now())
    singer.write_state(STATE)
    return STATE
def sync_campaigns(STATE, catalog):
    schema = load_schema("campaigns")
    singer.write_schema("campaigns", schema, ["id"], catalog.get('stream_alias'))
    LOGGER.info("sync_campaigns(NO bookmarks)")
    url = get_url("campaigns_all")
    params = {'limit': 500}

    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request(STATE, 'campaigns', url, params, "campaigns", "hasMore", ["offset"], ["offset"]):
            record = request(get_url("campaigns_detail", campaign_id=row['id'])).json()
            record = bumble_bee.transform(record, schema)
            singer.write_record("campaigns", record, catalog.get('stream_alias'))

    return STATE
def sync(self, client, **kwargs):
    startdate = kwargs['startdate']
    start, end = self.get_absolute_start_end_time(
        startdate, lookback=int(self.config.get('lookback')))
    max_bookmark_dttm = start

    with singer.metrics.record_counter(endpoint=self.name) as counter:
        while start != end:
            start_str = start.strftime(INVOICE_DATETIME_FMT)
            next_window_str = start_str
            results = client.get_paginated_data(
                self.api_method,
                self.version,
                self.endpoint,
                data_key=self.data_key,
                params=self.build_params(),
                body=self.build_body(start_str, next_window_str))

            max_bookmark_value = strftime(max_bookmark_dttm)

            with Transformer(integer_datetime_fmt="no-integer-datetime-parsing") as transformer:
                for page in results:
                    for record in page.get(self.data_key):
                        transformed_record = self.transform(record)
                        record_timestamp = strptime_to_utc(
                            transformed_record[self.replication_key])
                        if record_timestamp > max_bookmark_dttm:
                            max_bookmark_value = strftime(record_timestamp)

                        singer.write_record(
                            stream_name=self.name,
                            record=transformer.transform(
                                data=transformed_record,
                                schema=self.stream_schema,
                                metadata=self.stream_metadata),
                            time_extracted=singer.utils.now())
                        counter.increment()

            start = start + timedelta(days=DATE_WINDOW_SIZE)
            self.update_bookmark(self.name, max_bookmark_value)

    return counter.value
def sync_table_file(config: Dict, s3_path: str, table_spec: Dict, stream: Dict) -> int: """ Sync a given csv found file :param config: tap configuration :param s3_path: file path given by S3 :param table_spec: tables specs :param stream: Stream data :return: number of streamed records """ LOGGER.info('Syncing file "%s".', s3_path) bucket = config['bucket'] table_name = table_spec['table_name'] s3_file_handle = s3.get_file_handle(config, s3_path) # We observed data who's field size exceeded the default maximum of # 131072. We believe the primary consequence of the following setting # is that a malformed, wide CSV would potentially parse into a single # large field rather than giving this error, but we also think the # chances of that are very small and at any rate the source data would # need to be fixed. The other consequence of this could be larger # memory consumption but that's acceptable as well. csv.field_size_limit(sys.maxsize) iterator = get_row_iterator(s3_file_handle._raw_stream, table_spec) # pylint:disable=protected-access records_synced = 0 for row in iterator: custom_columns = { s3.SDC_SOURCE_BUCKET_COLUMN: bucket, s3.SDC_SOURCE_FILE_COLUMN: s3_path, # index zero, +1 for header row s3.SDC_SOURCE_LINENO_COLUMN: records_synced + 2 } rec = {**row, **custom_columns} with Transformer() as transformer: to_write = transformer.transform(rec, stream['schema'], metadata.to_map(stream['metadata'])) write_record(table_name, to_write) records_synced += 1 return records_synced
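# A minimal usage sketch for sync_table_file above. The s3.list_files_in_bucket
# helper and the 'search_pattern' key are assumptions standing in for whatever
# file-discovery code the tap actually provides; everything else reuses names
# from the function above.
def sync_table(config: Dict, table_spec: Dict, stream: Dict) -> int:
    """Sync every matching S3 file for one table and return the total record count."""
    total_records = 0
    for s3_path in s3.list_files_in_bucket(config, table_spec['search_pattern']):  # assumed helper
        total_records += sync_table_file(config, s3_path, table_spec, stream)
    LOGGER.info('Synced %s records for table "%s".', total_records, table_spec['table_name'])
    return total_records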
def process_records(catalog,
                    stream_name,
                    records,
                    time_extracted,
                    bookmark_field=None,
                    max_bookmark_value=None,
                    last_datetime=None,
                    parent=None,
                    parent_id=None):
    stream = catalog.get_stream(stream_name)
    schema = stream.schema.to_dict()
    stream_metadata = metadata.to_map(stream.metadata)

    with metrics.record_counter(stream_name) as counter:
        for record in records:
            # If child object, add parent_id to record
            if parent_id and parent:
                record[parent + '_id'] = parent_id

            # Transform record for Singer.io
            with Transformer(integer_datetime_fmt=UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) \
                    as transformer:
                transformed_record = transformer.transform(record, schema, stream_metadata)

                # Reset max_bookmark_value to new value if higher
                if bookmark_field and (bookmark_field in transformed_record):
                    if max_bookmark_value is None or \
                            strptime_to_utc(transformed_record[bookmark_field]) > strptime_to_utc(max_bookmark_value):
                        max_bookmark_value = transformed_record[bookmark_field]

                if bookmark_field and (bookmark_field in transformed_record):
                    last_dttm = strptime_to_utc(last_datetime)
                    bookmark_dttm = strptime_to_utc(transformed_record[bookmark_field])
                    # Keep only records whose bookmark is after the last_datetime
                    if bookmark_dttm >= last_dttm:
                        write_record(stream_name, transformed_record, time_extracted=time_extracted)
                        counter.increment()
                else:
                    write_record(stream_name, transformed_record, time_extracted=time_extracted)
                    counter.increment()

        return max_bookmark_value, counter.value
def sync_report_for_day(stream_name, stream_schema, sdk_client, start, field_list):  # pylint: disable=too-many-locals
    report_downloader = sdk_client.GetReportDownloader(version=VERSION)
    customer_id = sdk_client.client_customer_id
    report = {
        'reportName': 'Seems this is required',
        'dateRangeType': 'CUSTOM_DATE',
        'reportType': stream_name,
        'downloadFormat': 'CSV',
        'selector': {
            'fields': field_list,
            'dateRange': {'min': start.strftime('%Y%m%d'),
                          'max': start.strftime('%Y%m%d')}}}

    # Fetch the report as a csv string
    with metrics.http_request_timer(stream_name):
        result = attempt_download_report(report_downloader, report)

    headers, values = parse_csv_string(result)
    with metrics.record_counter(stream_name) as counter:
        time_extracted = utils.now()

        for _, val in enumerate(values):
            obj = dict(zip(get_xml_attribute_headers(stream_schema, headers), val))
            obj['_sdc_customer_id'] = customer_id
            obj['_sdc_report_datetime'] = REPORT_RUN_DATETIME

            with Transformer(singer.UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
                bumble_bee.pre_hook = transform_pre_hook
                obj = bumble_bee.transform(obj, stream_schema)

            singer.write_record(stream_name, obj, time_extracted=time_extracted)
            counter.increment()

        if start > get_start_for_stream(sdk_client.client_customer_id, stream_name):
            LOGGER.info('updating bookmark: %s > %s', start,
                        get_start_for_stream(sdk_client.client_customer_id, stream_name))
            bookmarks.write_bookmark(STATE,
                                     state_key_name(sdk_client.client_customer_id, stream_name),
                                     'date',
                                     start.strftime(utils.DATETIME_FMT))
            singer.write_state(STATE)
        else:
            LOGGER.info('not updating bookmark: %s <= %s', start,
                        get_start_for_stream(sdk_client.client_customer_id, stream_name))

        LOGGER.info("Done syncing %s records for the %s report for customer_id %s on %s",
                    counter.value, stream_name, customer_id, start)
def process_records(stream, mdata, max_modified, records, filter_field):
    schema = stream.schema.to_dict()
    with metrics.record_counter(stream.tap_stream_id) as counter:
        for record in records:
            record_flat = {}
            for prop, value in record.items():
                record_flat[prop] = value

            if (filter_field in record_flat and
                    record_flat[filter_field] > max_modified):
                max_modified = record_flat[filter_field]

            with Transformer() as transformer:
                record_typed = transformer.transform(record_flat, schema, mdata)
            singer.write_record(stream.tap_stream_id, record_typed)
            counter.increment()

    return max_modified
def sync(self, state: dict, stream_schema: dict, stream_metadata: dict, config: dict, transformer: Transformer) -> dict:
    """
    The sync logic for a full table stream.

    :param state: A dictionary representing singer state
    :param stream_schema: A dictionary containing the stream schema
    :param stream_metadata: A dictionary containing stream metadata
    :param config: A dictionary containing tap config data
    :param transformer: A singer Transformer object
    :return: State data in the form of a dictionary
    """
    with metrics.record_counter(self.tap_stream_id) as counter:
        for record in self.get_records(config):
            transformed_record = transformer.transform(record, stream_schema, stream_metadata)
            singer.write_record(self.tap_stream_id, transformed_record)
            counter.increment()

    singer.write_state(state)
    return state
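# A minimal sketch of how a tap might drive the full-table sync method above.
# The STREAMS registry (tap_stream_id -> stream class) is an assumption; the
# catalog, metadata, and write_schema calls follow standard singer-python usage.
def do_sync(config, state, catalog):
    with Transformer() as transformer:
        for catalog_entry in catalog.get_selected_streams(state):
            stream = STREAMS[catalog_entry.tap_stream_id]()  # assumed registry
            stream_schema = catalog_entry.schema.to_dict()
            stream_metadata = metadata.to_map(catalog_entry.metadata)
            singer.write_schema(catalog_entry.tap_stream_id, stream_schema, stream.key_properties)
            # The stream's sync method writes records and state, then returns state
            state = stream.sync(state, stream_schema, stream_metadata, config, transformer)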
def sync_file(conn, f, stream, table_spec):
    LOGGER.info('Syncing file "%s".', f["filepath"])

    try:
        file_handle = conn.get_file_handle(f)
    except OSError:
        return 0

    # Add file_name to opts and flag infer_compression to support gzipped files
    opts = {
        'key_properties': table_spec['key_properties'],
        'delimiter': table_spec['delimiter'],
        'file_name': f['filepath']
    }

    readers = csv.get_row_iterators(file_handle, options=opts, infer_compression=True)

    records_synced = 0

    for reader in readers:
        with Transformer() as transformer:
            for row in reader:
                custom_columns = {
                    '_sdc_source_file': f["filepath"],
                    # index zero, +1 for header row
                    '_sdc_source_lineno': records_synced + 2
                }
                rec = {**row, **custom_columns}

                to_write = transformer.transform(
                    rec, stream.schema.to_dict(), metadata.to_map(stream.metadata))

                singer.write_record(stream.tap_stream_id, to_write)
                records_synced += 1

    stats.add_file_data(table_spec, f['filepath'], f['last_modified'], records_synced)

    return records_synced
def process_record(self, record, time_extracted, bookmark_field):
    with Transformer() as transformer:
        transformed_record = transformer.transform(record, self.schema, self.stream_metadata)

    self._update_bookmark(transformed_record, bookmark_field)

    if self._is_record_past_bookmark(transformed_record, bookmark_field):
        try:
            write_record(self.stream_name, transformed_record, time_extracted=time_extracted)
        except OSError as err:
            LOGGER.info(f'OS Error writing record for: {self.stream_name}')
            LOGGER.info(f'record: {transformed_record}')
            raise err
        return True

    return False
def sync_stream(state, instance):
    stream = instance.stream

    with metrics.record_counter(stream.tap_stream_id) as counter:
        for (stream, record) in instance.sync(state):
            counter.increment()

            with Transformer(
                    integer_datetime_fmt="unix-milliseconds-integer-datetime-parsing") as transformer:
                record = transformer.transform(
                    record, stream.schema.to_dict(), metadata.to_map(stream.metadata))

            singer.write_record(stream.tap_stream_id, record)

            if instance.replication_method == "INCREMENTAL":
                singer.write_state(state)

        return counter.value
def sync_channels(client, catalog, channel_ids, endpoint_config):
    stream_name = 'channels'
    stream = catalog.get_stream(stream_name)
    schema = stream.schema.to_dict()
    stream_metadata = metadata.to_map(stream.metadata)
    id_fields = endpoint_config.get('key_properties', ['id'])
    params = endpoint_config.get('params', {})
    params['id'] = channel_ids

    records = get_paginated_data(
        client=client,
        url=DATA_URL,
        path=stream_name,
        endpoint=stream_name,
        params=params,
        data_key='items')

    time_extracted = utils.now()

    with metrics.record_counter(stream_name) as counter:
        for record in records:
            for key in id_fields:
                if not record.get(key):
                    raise ValueError('Stream: {}, Missing key: {}'.format(stream_name, key))

            with Transformer() as transformer:
                try:
                    transformed_record = transformer.transform(
                        transform_data_record(record), schema, stream_metadata)
                except Exception as err:
                    LOGGER.error('Transformer Error: %s', err)
                    LOGGER.error('Stream: %s, record: %s', stream_name, record)
                    raise err

            write_record(stream_name, transformed_record, time_extracted=time_extracted)
            counter.increment()

        LOGGER.info('Stream: {}, Processed {} records'.format(stream_name, counter.value))
        return counter.value
def process_records(stream, mdata, max_modified, records, filter_field, fks):
    schema = stream.schema.to_dict()
    with metrics.record_counter(stream.tap_stream_id) as counter:
        for record in records:
            record_flat = {
                'id': record['id']
            }
            for prop, value in record['attributes'].items():
                if prop == 'id':
                    raise Exception('Error flattening Outreach record - conflict with `id` key')
                record_flat[prop] = value

            if 'relationships' in record:
                for prop, value in record['relationships'].items():
                    if 'data' not in value and 'links' not in value:
                        raise Exception('Only `data` or `links` expected in relationships')

                    fk_field_name = '{}Id'.format(prop)

                    if 'data' in value and fk_field_name in fks:
                        data_value = value['data']
                        if data_value is not None and 'id' not in data_value:
                            raise Exception('null or `id` field expected for `data` relationship')

                        if fk_field_name in record_flat:
                            raise Exception(
                                '`{}` exists as both an attribute and generated relationship name'.format(fk_field_name))

                        if data_value is None:
                            record_flat[fk_field_name] = None
                        else:
                            record_flat[fk_field_name] = data_value['id']

            if filter_field in record_flat and record_flat[filter_field] > max_modified:
                max_modified = record_flat[filter_field]

            with Transformer() as transformer:
                record_typed = transformer.transform(record_flat, schema, mdata)
            singer.write_record(stream.tap_stream_id, record_typed)
            counter.increment()

    return max_modified
def sync_contact_lists(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get("metadata"))
    schema = load_schema("contact_lists")
    bookmark_key = "updatedAt"
    singer.write_schema("contact_lists", schema, ["listId"], [bookmark_key], catalog.get("stream_alias"))

    start = get_start(STATE, "contact_lists", bookmark_key)
    max_bk_value = start

    LOGGER.info("sync_contact_lists from %s", start)

    url = get_url("contact_lists")
    params = {"count": 250}
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request(STATE, "contact_lists", url, params, "lists", "has-more", ["offset"], ["offset"]):
            record = bumble_bee.transform(row, schema, mdata)

            if record[bookmark_key] >= start:
                singer.write_record("contact_lists", record, catalog.get("stream_alias"), time_extracted=utils.now())
            if record[bookmark_key] >= max_bk_value:
                max_bk_value = record[bookmark_key]

    STATE = singer.write_bookmark(STATE, "contact_lists", bookmark_key, max_bk_value)
    singer.write_state(STATE)
    return STATE
def process_records(catalog,  # pylint: disable=too-many-branches
                    stream_name,
                    records,
                    time_extracted,
                    bookmark_field=None,
                    max_bookmark_value=None,
                    last_datetime=None):
    stream = catalog.get_stream(stream_name)
    schema = stream.schema.to_dict()
    stream_metadata = metadata.to_map(stream.metadata)

    with metrics.record_counter(stream_name) as counter:
        for record in records:
            # Transform record for Singer.io
            with Transformer() as transformer:
                transformed_record = transformer.transform(record, schema, stream_metadata)

                # Reset max_bookmark_value to new value if higher
                if transformed_record.get(bookmark_field):
                    if max_bookmark_value is None or \
                            transformed_record[bookmark_field] > transform_datetime(max_bookmark_value):
                        max_bookmark_value = transformed_record[bookmark_field]

                if bookmark_field and (bookmark_field in transformed_record):
                    last_dttm = transform_datetime(last_datetime)
                    bookmark_dttm = transform_datetime(transformed_record[bookmark_field])
                    # Keep only records whose bookmark is after the last_datetime
                    if bookmark_dttm >= last_dttm:
                        write_record(stream_name, transformed_record, time_extracted=time_extracted)
                        counter.increment()
                else:
                    write_record(stream_name, transformed_record, time_extracted=time_extracted)
                    counter.increment()

        return max_bookmark_value, counter.value
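# A minimal sketch of a caller for process_records above that persists the
# returned bookmark. The client.get_all method is an assumption standing in
# for the tap's request code; get_bookmark and write_bookmark are standard
# singer-python helpers.
def sync_endpoint(client, catalog, state, stream_name, bookmark_field, start_date):
    last_datetime = singer.get_bookmark(state, stream_name, bookmark_field, start_date)
    records = client.get_all(stream_name)  # assumed request helper

    max_bookmark_value, record_count = process_records(
        catalog,
        stream_name,
        records,
        time_extracted=utils.now(),
        bookmark_field=bookmark_field,
        max_bookmark_value=last_datetime,
        last_datetime=last_datetime)

    # Persist the newest bookmark seen so the next run resumes from there
    state = singer.write_bookmark(state, stream_name, bookmark_field, max_bookmark_value)
    singer.write_state(state)
    return record_count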
def ORIGINAL_TAP(config):
    shop_url = "https://{k}:{p}@{s}.myshopify.com/admin".format(
        k=config['api_key'], p=config['api_password'], s=config['shop_name'])
    shopify.ShopifyResource.set_site(shop_url)

    start_time = time.time()

    if WRITE_TO_TARGET:
        singer.write_schema(config['stream_id'],
                            config['stream_schema'],
                            config['key_properties'],
                            bookmark_properties=config['replication_key'])

    rec_count = 0
    with Transformer() as transformer:
        for rec in sync(config):
            extraction_time = singer.utils.now()
            record_metadata = metadata.to_map(config['stream_metadata'])
            rec = transformer.transform(rec, config['stream_schema'], record_metadata)
            if WRITE_TO_TARGET:
                singer.write_record(config['stream_id'], rec, time_extracted=extraction_time)
            rec_count += 1

    duration = time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time))
    return (rec_count, duration)
def sync_deal_pipelines(STATE, ctx): catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE)) mdata = metadata.to_map(catalog.get("metadata")) schema = load_schema("deal_pipelines") singer.write_schema("deal_pipelines", schema, ["pipelineId"], catalog.get("stream_alias")) LOGGER.info("sync_deal_pipelines") data = request(get_url("deal_pipelines")).json() with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee: for row in data: record = bumble_bee.transform(row, schema, mdata) singer.write_record( "deal_pipelines", record, catalog.get("stream_alias"), time_extracted=utils.now(), ) singer.write_state(STATE) return STATE
def sync_contacts(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    bookmark_key = 'versionTimestamp'
    start = utils.strptime_with_tz(get_start(STATE, "contacts", bookmark_key))
    LOGGER.info("sync_contacts from %s", start)

    max_bk_value = start
    schema = load_schema("contacts")

    singer.write_schema("contacts", schema, ["vid"], [bookmark_key], catalog.get('stream_alias'))

    url = get_url("contacts_all")

    vids = []
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request(STATE, 'contacts', url, default_contact_params, 'contacts', 'has-more', ['vid-offset'], ['vidOffset']):
            modified_time = None
            if bookmark_key in row:
                modified_time = utils.strptime_with_tz(
                    _transform_datetime(  # pylint: disable=protected-access
                        row[bookmark_key],
                        UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING))

            if not modified_time or modified_time >= start:
                vids.append(row['vid'])

            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if len(vids) == 100:
                _sync_contact_vids(catalog, vids, schema, bumble_bee)
                vids = []

        _sync_contact_vids(catalog, vids, schema, bumble_bee)

    STATE = singer.write_bookmark(STATE, 'contacts', bookmark_key, utils.strftime(max_bk_value))
    singer.write_state(STATE)
    return STATE
def sync_stream(state, start_date, instance):
    stream = instance.stream
    bookmark_date = instance.get_bookmark(state, instance.name, start_date, instance.replication_key)
    bookmark_dttm = strptime_to_utc(bookmark_date)
    new_bookmark = bookmark_dttm

    with metrics.record_counter(stream.tap_stream_id) as counter, Transformer(
            integer_datetime_fmt="unix-milliseconds-integer-datetime-parsing") as transformer:
        (stream, records) = instance.sync(state)
        for record in records:
            schema_dict = stream.schema.to_dict()
            stream_metadata = metadata.to_map(stream.metadata)

            transformed_record = instance.transform(record)

            try:
                transformed_record = transformer.transform(transformed_record, schema_dict, stream_metadata)
            except Exception as err:
                LOGGER.error('Error: %s', err)
                LOGGER.error(' for schema: %s', json.dumps(schema_dict, sort_keys=True, indent=2))
                LOGGER.error('Transform failed for %s', record)
                raise err

            record_timestamp = strptime_to_utc(
                transformed_record.get(humps.decamelize(instance.replication_key)))
            new_bookmark = max(new_bookmark, record_timestamp)

            if record_timestamp > bookmark_dttm:
                singer.write_record(stream.tap_stream_id, transformed_record)
                counter.increment()

        instance.update_bookmark(state, instance.name, strftime(new_bookmark), instance.replication_key)

    singer.write_state(state)
    return counter.value
def sync(client, config, catalog, state):
    LOGGER.info('Starting Sync..')
    selected_streams = catalog.get_selected_streams(state)

    streams = []
    stream_keys = []
    with Transformer() as transformer:
        for catalog_entry in selected_streams:
            streams.append(catalog_entry)
            stream_keys.append(catalog_entry.stream)

        for catalog_entry in streams:
            stream = AVAILABLE_STREAMS[catalog_entry.stream](client=client, config=config, catalog=catalog, state=state)
            LOGGER.info('Syncing stream: %s', catalog_entry.stream)
            stream.write_state()
            stream_schema = catalog_entry.schema.to_dict()
            stream.write_schema()
            stream_metadata = metadata.to_map(catalog_entry.metadata)
            max_bookmark_value = None

            with singer.metrics.record_counter(endpoint=stream.name) as counter:
                for page in stream.sync(catalog_entry.metadata):
                    for records in page:
                        transformed_records = transform(records)
                        for transformed in transformed_records:
                            singer.write_record(
                                catalog_entry.stream,
                                transformer.transform(
                                    transformed,
                                    stream_schema,
                                    stream_metadata,
                                ))
                            counter.increment()
                stream.update_bookmark(stream.name, max_bookmark_value)
                stream.write_state()
            stream.write_state()

    LOGGER.info('Finished Sync..')
def sync(config, state, catalog):
    client = PeekClient(config['token'])
    partner_id = config['partner_id']

    with Transformer() as transformer:
        for stream in catalog.get_selected_streams(state):
            tap_stream_id = stream.tap_stream_id
            stream_obj = STREAMS[tap_stream_id](client, state)
            replication_key = stream_obj.replication_key
            stream_schema = stream.schema.to_dict()
            stream_metadata = metadata.to_map(stream.metadata)

            LOGGER.info('Starting sync for stream: %s', tap_stream_id)

            state = singer.set_currently_syncing(state, tap_stream_id)
            singer.write_state(state)

            singer.write_schema(tap_stream_id, stream_schema, stream_obj.key_properties, stream.replication_key)

            start_date = singer.get_bookmark(state, tap_stream_id, replication_key, config['start_date'])
            end_date = singer.utils.strftime(singer.utils.now(), format_str=singer.utils.DATETIME_PARSE)

            for record in stream_obj.sync(partner_id=partner_id, start_date=start_date, end_date=end_date):
                LOGGER.info(f"Writing record: {record}")
                transformed_record = transformer.transform(record, stream_schema, stream_metadata)
                singer.write_record(tap_stream_id, transformed_record)

            state = singer.clear_bookmark(state, tap_stream_id, 'start_date')
            singer.write_state(state)

    state = singer.set_currently_syncing(state, None)
    singer.write_state(state)
def sync():
    # Emit all schemas first so we have them for child streams
    for stream in Context.catalog["streams"]:
        if Context.is_selected(stream["tap_stream_id"]):
            singer.write_schema(stream["tap_stream_id"], stream["schema"], stream["key_properties"])
            Context.counts[stream["tap_stream_id"]] = 0

    # Loop over streams in catalog
    for catalog_entry in Context.catalog['streams']:
        stream_id = catalog_entry['tap_stream_id']
        stream = Context.stream_objects[stream_id]()

        if not Context.is_selected(stream_id):
            LOGGER.info('Skipping stream: %s', stream_id)
            continue

        LOGGER.info('Syncing stream: %s', stream_id)

        if not Context.state.get('bookmarks'):
            Context.state['bookmarks'] = {}
        Context.state['bookmarks']['currently_sync_stream'] = stream_id

        with Transformer() as transformer:
            for rec in stream.sync():
                extraction_time = singer.utils.now()
                record_schema = catalog_entry['schema']
                record_metadata = metadata.to_map(catalog_entry['metadata'])
                rec = transformer.transform(rec, record_schema, record_metadata)
                singer.write_record(stream_id, rec, time_extracted=extraction_time)
                Context.counts[stream_id] += 1

        Context.state['bookmarks'].pop('currently_sync_stream')
        singer.write_state(Context.state)

    LOGGER.info('----------------------')
    for stream_id, stream_count in Context.counts.items():
        LOGGER.info('%s: %d', stream_id, stream_count)
    LOGGER.info('----------------------')
def do_sync(account, catalog, state):
    streams_to_sync = get_streams_to_sync(account, catalog, state)
    refs = load_shared_schema_refs()
    for stream in streams_to_sync:
        LOGGER.info('Syncing %s, fields %s', stream.name, stream.fields())
        schema = singer.resolve_schema_references(load_schema(stream), refs)
        bookmark_key = BOOKMARK_KEYS.get(stream.name)
        singer.write_schema(stream.name, schema, stream.key_properties, bookmark_key, stream.stream_alias)

        with Transformer(pre_hook=transform_date_hook) as transformer:
            with metrics.record_counter(stream.name) as counter:
                for message in stream:
                    if 'record' in message:
                        counter.increment()
                        time_extracted = utils.now()
                        record = transformer.transform(message['record'], schema)
                        singer.write_record(stream.name, record, stream.stream_alias, time_extracted)
                    elif 'state' in message:
                        singer.write_state(message['state'])
                    else:
                        raise TapFacebookException('Unrecognized message {}'.format(message))
def process_records(catalog, stream_name, records, time_extracted, version=None):
    stream = catalog.get_stream(stream_name)
    schema = stream.schema.to_dict()
    stream_metadata = metadata.to_map(stream.metadata)
    with metrics.record_counter(stream_name) as counter:
        for record in records:
            # Transform record for Singer.io
            with Transformer() as transformer:
                transformed_record = transformer.transform(record, schema, stream_metadata)
                write_record(stream_name=stream_name,
                             record=transformed_record,
                             time_extracted=time_extracted,
                             version=version)
                counter.increment()
        return counter.value
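# A minimal sketch of a full-table caller for the version-aware process_records
# above, following the usual Singer ACTIVATE_VERSION pattern. The client.get_records
# helper is an assumption; write_schema and write_version are standard singer-python calls.
def sync_full_table(client, catalog, stream_name):
    stream = catalog.get_stream(stream_name)
    singer.write_schema(stream_name, stream.schema.to_dict(), stream.key_properties)

    # Use the extraction start time (in millis) as the table version
    version = int(time.time() * 1000)

    record_count = process_records(
        catalog,
        stream_name,
        records=client.get_records(stream_name),  # assumed request helper
        time_extracted=utils.now(),
        version=version)

    # Activate the new table version once all records have been written
    singer.write_version(stream_name, version)
    return record_count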
def for_each_estimate(estimate, time_extracted):
    # Sync estimate messages
    sync_endpoint("estimate_messages",
                  endpoint=("estimates/{}/messages".format(estimate['id'])),
                  path="estimate_messages",
                  with_updated_since=False,
                  date_fields=["send_reminder_on"],
                  map_handler=map_estimate_message)

    # Extract all estimate_line_items
    line_items_schema = load_and_write_schema("estimate_line_items")
    with Transformer() as transformer:
        for line_item in estimate['line_items']:
            line_item['estimate_id'] = estimate['id']
            line_item = transformer.transform(line_item, line_items_schema)
            singer.write_record("estimate_line_items", line_item, time_extracted=time_extracted)
def sync_table_file(config: Dict, s3_path: str, table_spec: Dict, stream: Dict) -> int: """ Sync a given csv found file :param config: tap configuration :param s3_path: file path given by S3 :param table_spec: tables specs :param stream: Stream data :return: number of streamed records """ LOGGER.info('Syncing file "%s".', s3_path) bucket = config['bucket'] table_name = table_spec['table_name'] s3_file_handle = s3.get_file_handle(config, s3_path) iterator = get_row_iterator(s3_file_handle._raw_stream, table_spec) # pylint:disable=protected-access records_synced = 0 for row in iterator: custom_columns = { s3.SDC_SOURCE_BUCKET_COLUMN: bucket, s3.SDC_SOURCE_FILE_COLUMN: s3_path, # index zero, +1 for header row s3.SDC_SOURCE_LINENO_COLUMN: records_synced + 2 } rec = {**row, **custom_columns} with Transformer() as transformer: to_write = transformer.transform( rec, stream['schema'], metadata.to_map(stream['metadata'])) write_record(table_name, to_write) records_synced += 1 return records_synced
def for_each_invoice(invoice, time_extracted):
    def map_invoice_message(message):
        message['invoice_id'] = invoice['id']
        return message

    def map_invoice_payment(payment):
        payment['invoice_id'] = invoice['id']
        payment['payment_gateway_id'] = payment['payment_gateway']['id']
        payment['payment_gateway_name'] = payment['payment_gateway']['name']
        return payment

    # Sync invoice messages
    sync_endpoint("invoice_messages",
                  endpoint=("invoices/{}/messages".format(invoice['id'])),
                  path="invoice_messages",
                  with_updated_since=False,
                  map_handler=map_invoice_message)

    # Sync invoice payments
    sync_endpoint("invoice_payments",
                  endpoint=("invoices/{}/payments".format(invoice['id'])),
                  path="invoice_payments",
                  with_updated_since=False,
                  map_handler=map_invoice_payment,
                  date_fields=["send_reminder_on"])

    # Extract all invoice_line_items
    line_items_schema = load_and_write_schema("invoice_line_items")
    with Transformer() as transformer:
        for line_item in invoice['line_items']:
            line_item['invoice_id'] = invoice['id']
            if line_item['project'] is not None:
                line_item['project_id'] = line_item['project']['id']
            else:
                line_item['project_id'] = None
            line_item = transformer.transform(line_item, line_items_schema)
            singer.write_record("invoice_line_items", line_item, time_extracted=time_extracted)
def sync_deals(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_with_tz(get_start(STATE, "deals", bookmark_key))
    max_bk_value = start
    LOGGER.info("sync_deals from %s", start)
    most_recent_modified_time = start
    params = {'count': 250, 'properties': []}
    schema = load_schema("deals")
    singer.write_schema("deals", schema, ["dealId"], [bookmark_key], catalog.get('stream_alias'))

    # Append all the properties fields for deals to the request
    additional_properties = schema.get("properties").get("properties").get("properties")
    for key in additional_properties.keys():
        params['properties'].append(key)

    url = get_url('deals_all')
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request(STATE, 'deals', url, params, 'deals', "hasMore", ["offset"], ["offset"]):
            row_properties = row['properties']
            modified_time = None
            if bookmark_key in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties[bookmark_key]['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis, datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties['createdate']['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis, datetime.timezone.utc)

            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = bumble_bee.transform(row, schema)
                singer.write_record("deals", record, catalog.get('stream_alias'), time_extracted=utils.now())

    STATE = singer.write_bookmark(STATE, 'deals', bookmark_key, utils.strftime(max_bk_value))
    singer.write_state(STATE)
    return STATE