def sync_events():
    schema = load_schema("events")
    singer.write_schema("events", schema, [])

    for export_bundle in request_export_bundles():
        with metrics.record_counter("events") as counter:
            for event in download_events(export_bundle['Id']):
                transform_event(event)
                counter.increment()
                singer.write_record("events", event)
        stop_timestamp = datetime.datetime.utcfromtimestamp(export_bundle['Stop'])
        utils.update_state(STATE, "events", stop_timestamp)
        singer.write_state(STATE)

def sync_sales_activities():
    """Sync all sales activities, call out to individual filters"""
    bookmark_property = 'updated_at'
    endpoint = 'sales_activities'
    state_entity = endpoint
    start = get_start(state_entity)
    singer.write_schema(endpoint,
                        tap_utils.load_schema(endpoint),
                        ["id"],
                        bookmark_properties=[bookmark_property])
    sales = gen_request(get_url(endpoint))
    for sale in sales:
        if sale[bookmark_property] >= start:
            LOGGER.info("Sale {}: Syncing details".format(sale['id']))
            # Write to the same stream name the schema was emitted for.
            singer.write_record(endpoint, sale, time_extracted=singer.utils.now())

def sync_deals_by_filter(bookmark_prop, fil):
    """Sync all deal data for a single deal filter"""
    endpoint = 'deals'
    fil_id = fil['id']
    state_entity = endpoint + "_" + str(fil_id)
    start = get_start(state_entity)
    deals = gen_request(get_url(endpoint, query='view/' + str(fil_id)))
    for deal in deals:
        if deal[bookmark_prop] >= start:
            # get all sub-entities and save them
            deal['amount'] = float(deal['amount'])  # cast amount to float
            deal['custom_field'] = json.dumps(deal['custom_field'])  # serialize custom fields to a JSON string
            LOGGER.info("Deal {}: Syncing details".format(deal['id']))
            singer.write_record("deals", deal, time_extracted=singer.utils.now())

def get_all_pull_requests(schemas, config, state, mdata):
    '''
    https://developer.github.com/v3/pulls/#list-pull-requests
    '''
    repo_path = config['repository']
    with metrics.record_counter('pull_requests') as counter:
        with metrics.record_counter('reviews') as reviews_counter:
            for response in authed_get_all_pages(
                    'pull_requests',
                    'https://api.github.com/repos/{}/pulls?state=all'.format(repo_path)):
                pull_requests = response.json()
                extraction_time = singer.utils.now()
                for pr in pull_requests:
                    pr_num = pr.get('number')

                    # transform and write pull_request record
                    with singer.Transformer() as transformer:
                        rec = transformer.transform(pr, schemas['pull_requests'],
                                                    metadata=metadata.to_map(mdata))
                    singer.write_record('pull_requests', rec, time_extracted=extraction_time)
                    singer.write_bookmark(state, 'pull_requests', 'since',
                                          singer.utils.strftime(extraction_time))
                    counter.increment()

                    # sync reviews if that schema is present (only there if selected)
                    if schemas.get('reviews'):
                        for review_rec in get_reviews_for_pr(pr_num, schemas['reviews'],
                                                             config, state, mdata):
                            singer.write_record('reviews', review_rec,
                                                time_extracted=extraction_time)
                            singer.write_bookmark(state, 'reviews', 'since',
                                                  singer.utils.strftime(extraction_time))
                            reviews_counter.increment()
    return state

def sync_table_file(config: Dict, s3_path: str, table_spec: Dict, stream: Dict) -> int:
    """
    Sync a given CSV file found in S3
    :param config: tap configuration
    :param s3_path: file path given by S3
    :param table_spec: table specs
    :param stream: stream data
    :return: number of streamed records
    """
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config['bucket']
    table_name = table_spec['table_name']

    s3_file_handle = s3.get_file_handle(config, s3_path)

    # We observed data whose field size exceeded the default maximum of
    # 131072. We believe the primary consequence of the following setting
    # is that a malformed, wide CSV would potentially parse into a single
    # large field rather than raising this error, but we also think the
    # chances of that are very small and at any rate the source data would
    # need to be fixed. The other consequence could be larger memory
    # consumption, but that's acceptable as well.
    csv.field_size_limit(sys.maxsize)

    iterator = get_row_iterator(s3_file_handle._raw_stream, table_spec)  # pylint:disable=protected-access

    records_synced = 0

    for row in iterator:
        custom_columns = {
            s3.SDC_SOURCE_BUCKET_COLUMN: bucket,
            s3.SDC_SOURCE_FILE_COLUMN: s3_path,
            # index zero, +1 for header row
            s3.SDC_SOURCE_LINENO_COLUMN: records_synced + 2
        }
        rec = {**row, **custom_columns}

        with Transformer() as transformer:
            to_write = transformer.transform(rec, stream['schema'],
                                             metadata.to_map(stream['metadata']))

        write_record(table_name, to_write)
        records_synced += 1

    return records_synced

def sync(self):
    current_bookmark_str = singer.bookmarks.get_bookmark(
        state=self.state,
        tap_stream_id=self.tap_stream_id,
        key=self.bookmark_properties)
    if current_bookmark_str is not None:
        self.params.update({self.api_bookmark_param: current_bookmark_str})
    singer.bookmarks.write_bookmark(state=self.state,
                                    tap_stream_id=self.tap_stream_id,
                                    key=self.bookmark_properties,
                                    val=singer.utils.strftime(singer.utils.now()))

    with singer.metrics.job_timer(job_type=f"sync_{self.tap_stream_id}"):
        with singer.metrics.record_counter(endpoint=self.tap_stream_id) as counter:
            project_params = self.config.get("streams", {}).get("projects", {})
            project_params.update({"fields": "id"})
            for project in self._yield_records(entity='projects', params=project_params):
                # Reset the offset after each project iteration.
                self.params.update({"offset": 0})
                for story in self._yield_records(
                        entity=f"projects/{project.get('id')}/stories",
                        params=self.params):
                    for endpoint in self.expand_endpoints:
                        records = [
                            record for record in self._yield_records(
                                entity=f"projects/{project.get('id')}/stories/{story.get('id')}/{endpoint}",
                                params={})
                        ]
                        story[endpoint] = records

                    with singer.Transformer() as transformer:
                        transformed_record = transformer.transform(data=story,
                                                                   schema=self.schema)
                        singer.write_record(stream_name=self.tap_stream_id,
                                            time_extracted=singer.utils.now(),
                                            record=transformed_record)
                        counter.increment()

def sync_contacts(STATE, stream):
    '''Sync contacts from the Autopilot API

    The API returns data in the following format

    {
      "contacts": [{...},{...}],
      "total_contacts": 400,
      "bookmark": "person_9EAF39E4-9AEC-4134-964A-D9D8D54162E7"
    }

    Params:
      STATE - State dictionary
      stream - Stream dictionary from the catalog
    '''
    tap_stream_id = stream['tap_stream_id']
    singer.write_schema(tap_stream_id, stream['schema'], ["contact_id"])

    start = utils.strptime_with_tz(get_start(STATE, tap_stream_id, "updated_at"))
    LOGGER.info("Only syncing contacts updated since " + utils.strftime(start))
    max_updated_at = start

    for row in gen_request(STATE, get_url(tap_stream_id)):
        updated_at = None
        if "updated_at" in row:
            updated_at = utils.strptime_with_tz(
                _transform_datetime(  # pylint: disable=protected-access
                    row["updated_at"],
                    UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING))

        if not updated_at or updated_at >= start:
            singer.write_record(tap_stream_id, transform_contact(row))

        if updated_at and updated_at > max_updated_at:
            max_updated_at = updated_at

    STATE = singer.write_bookmark(STATE, tap_stream_id, "updated_at",
                                  utils.strftime(max_updated_at))
    singer.write_state(STATE)

    LOGGER.info("Completed Contacts Sync")
    return STATE

def get_rate_cards_rates(name, schema, state, url, start_date, replication_key,
                         replication_method, sync=False, stream_rate_cards=None, mdata=None):
    with metrics.record_counter(name) as counter:
        for rate_card in get_data('rate_cards',
                                  stream_rate_cards.schema.to_dict(),
                                  state,
                                  url,
                                  start_date,
                                  stream_rate_cards.replication_key,
                                  stream_rate_cards.replication_method,
                                  mdata=stream_rate_cards.metadata,
                                  sync=False,
                                  by_pass_date=True):
            rate_card_id = rate_card.get('id')
            response = request_get(url + f'rate_cards/{rate_card_id}/{name}')
            if response:
                bookmark = singer.get_bookmark(state, name, replication_key)
                if bookmark is None:
                    bookmark = start_date
                new_bookmark = bookmark

                records = response.json()
                extraction_time = singer.utils.now()
                for record in records:
                    with singer.Transformer() as transformer:
                        record['rate_card_id'] = rate_card_id
                        rec = transformer.transform(record, schema,
                                                    metadata=metadata.to_map(mdata))
                        new_bookmark = max(new_bookmark, rec[replication_key])
                        singer.write_record(name, rec, time_extracted=extraction_time)
                        counter.increment()

                singer.write_bookmark(state, name, replication_key, new_bookmark)
    return state

def sync(self, client, **kwargs):
    startdate = kwargs['startdate']
    start, end = self.get_absolute_start_end_time(
        startdate, lookback=int(self.config.get('lookback')))
    max_bookmark_dttm = start

    with singer.metrics.record_counter(endpoint=self.name) as counter:
        while start != end:
            start_str = start.strftime(INVOICE_DATETIME_FMT)
            next_window_str = start_str
            results = client.get_paginated_data(self.api_method,
                                                self.version,
                                                self.endpoint,
                                                data_key=self.data_key,
                                                params=self.build_params(),
                                                body=self.build_body(start_str, next_window_str))
            max_bookmark_value = strftime(max_bookmark_dttm)

            with Transformer(integer_datetime_fmt="no-integer-datetime-parsing") as transformer:
                for page in results:
                    for record in page.get(self.data_key):
                        transformed_record = self.transform(record)
                        record_timestamp = strptime_to_utc(
                            transformed_record[self.replication_key])
                        if record_timestamp > max_bookmark_dttm:
                            max_bookmark_value = strftime(record_timestamp)
                        singer.write_record(
                            stream_name=self.name,
                            record=transformer.transform(data=transformed_record,
                                                         schema=self.stream_schema,
                                                         metadata=self.stream_metadata),
                            time_extracted=singer.utils.now())
                        counter.increment()

            start = start + timedelta(days=DATE_WINDOW_SIZE)
            self.update_bookmark(self.name, max_bookmark_value)

    return counter.value

def process_worksheet(gsheets_loader, sheet_name, worksheet, config):
    if worksheet is None:
        name_with_worksheet = sheet_name
    else:
        name_with_worksheet = sheet_name + "_" + worksheet

    if 'singular_table_name' in config and config['singular_table_name']:
        stream_name = underscore(parameterize(name_with_worksheet))
    else:
        stream_name = tableize(parameterize(name_with_worksheet))

    schema = gsheets_loader.get_schema(sheet_name, worksheet)
    records = gsheets_loader.get_data(sheet_name, worksheet)

    # additional data transformations
    column_mapping = None
    if 'underscore_columns' in config and config['underscore_columns']:
        column_mapping = {'id': 'id'}
        props = {}
        for k, v in schema['properties'].items():
            kt = underscore(parameterize(k))
            props[kt] = v
            column_mapping[k] = kt
        schema['properties'] = props

    schema['properties']['id'] = {'type': 'integer'}
    for i, record in enumerate(records, start=1):
        record['id'] = i

    # write the schema and records
    singer.write_schema(
        stream_name=stream_name,
        schema=schema,
        key_properties=['id']
    )

    for record in records:
        if column_mapping is not None:
            record_transformed = {column_mapping[k]: v for k, v in record.items()}
        else:
            record_transformed = record
        singer.write_record(stream_name, record_transformed)

def sync(self):
    with singer.metrics.job_timer(job_type=f"sync_{self.tap_stream_id}"):
        with singer.metrics.record_counter(endpoint=self.tap_stream_id) as counter:
            client = self._get_auth_client()
            params = {
                "start_date": singer.utils.strftime(singer.utils.now() - timedelta(days=365),
                                                    format_str='%Y-%m-01'),
                "end_date": singer.utils.strftime(singer.utils.now(), format_str="%Y-%m-%d"),
                "accounting_method": "Accrual",
                "summarize_column_by": "Month"
            }
            resp = self._get(auth_client=client, report_entity='ProfitAndLoss', params=params)
            rows = self._transform_columns_into_rows(resp)
            for i, row in enumerate(rows):
                if row.get("StartDate") is None:
                    continue
                # Renamed from `input` to avoid shadowing the builtin.
                row_input = []
                data = self._get_row_data(resp=resp, column_enum=i + 1, input=row_input)
                new_data = {}
                for line in data:
                    new_data.update(line)
                row["ReportData"] = new_data
                row["SyncTimestampUtc"] = singer.utils.strftime(singer.utils.now(),
                                                                "%Y-%m-%dT%H:%M:%SZ")
                with singer.Transformer() as transformer:
                    transformed_record = transformer.transform(data=row, schema=self.schema)
                    singer.write_record(stream_name=self.stream,
                                        time_extracted=singer.utils.now(),
                                        record=transformed_record)
                    counter.increment()

def sync(self, mdata):
    schema = self.load_schema()
    bookmark = singer.get_bookmark(state=self.state,
                                   tap_stream_id=self.name,
                                   key=self.replication_key)
    if bookmark is None:
        bookmark = self.config.get('start_date')
    new_bookmark = bookmark

    # pylint: disable=unused-variable
    with singer.metrics.job_timer(job_type='list_users') as timer:
        with singer.metrics.record_counter(endpoint=self.name) as counter:
            users_list = self.client.get_users(limit=100)
            for page in users_list:
                users = page.get('members')
                transformed_users = transform_json(stream=self.name,
                                                   data=users,
                                                   date_fields=self.date_fields)
                for user in transformed_users:
                    with singer.Transformer(
                            integer_datetime_fmt="unix-seconds-integer-datetime-parsing") \
                            as transformer:
                        transformed_record = transformer.transform(
                            data=user,
                            schema=schema,
                            metadata=metadata.to_map(mdata))
                        new_bookmark = max(new_bookmark, transformed_record.get('updated'))
                        if transformed_record.get('updated') > bookmark:
                            if self.write_to_singer:
                                singer.write_record(stream_name=self.name,
                                                    time_extracted=singer.utils.now(),
                                                    record=transformed_record)
                                counter.increment()

    self.state = singer.write_bookmark(state=self.state,
                                       tap_stream_id=self.name,
                                       key=self.replication_key,
                                       val=new_bookmark)

def get_all_events(schemas, repo_path, state, mdata):
    # Incremental sync off `created_at`
    # https://developer.github.com/v3/issues/events/#list-events-for-a-repository
    # 'https://api.github.com/repos/{}/issues/events?sort=created_at&direction=desc'.format(repo_path)
    bookmark_value = get_bookmark(state, repo_path, "events", "since")
    if bookmark_value:
        bookmark_time = singer.utils.strptime_to_utc(bookmark_value)
    else:
        bookmark_time = 0

    with metrics.record_counter('events') as counter:
        for response in authed_get_all_pages(
                'events',
                'https://api.github.com/repos/{}/events?sort=created_at&direction=desc'.format(repo_path)):
            events = response.json()
            extraction_time = singer.utils.now()
            for r in events:
                r['_sdc_repository'] = repo_path

                # skip records that haven't been updated since the last run
                # the GitHub API doesn't currently allow a ?since param for pulls
                # once we find the first piece of old data we can return, thanks to
                # the sorting
                updated_at = r.get('created_at') if r.get('updated_at') is None else r.get('updated_at')
                if bookmark_time and singer.utils.strptime_to_utc(updated_at) < bookmark_time:
                    return state

                # transform and write event record
                with singer.Transformer() as transformer:
                    rec = transformer.transform(r, schemas, metadata=metadata.to_map(mdata))
                singer.write_record('events', rec, time_extracted=extraction_time)
                singer.write_bookmark(state, repo_path, 'events',
                                      {'since': singer.utils.strftime(extraction_time)})
                counter.increment()
    return state

def sync_report_for_day(stream_name, stream_schema, sdk_client, start, field_list):  # pylint: disable=too-many-locals
    report_downloader = sdk_client.GetReportDownloader(version=VERSION)
    customer_id = sdk_client.client_customer_id
    report = {
        'reportName': 'Seems this is required',
        'dateRangeType': 'CUSTOM_DATE',
        'reportType': stream_name,
        'downloadFormat': 'CSV',
        'selector': {
            'fields': field_list,
            'dateRange': {'min': start.strftime('%Y%m%d'),
                          'max': start.strftime('%Y%m%d')}}}

    # Fetch the report as a csv string
    with metrics.http_request_timer(stream_name):
        result = attempt_download_report(report_downloader, report)

    headers, values = parse_csv_string(result)
    with metrics.record_counter(stream_name) as counter:
        time_extracted = utils.now()

        for _, val in enumerate(values):
            obj = dict(zip(get_xml_attribute_headers(stream_schema, headers), val))
            obj['_sdc_customer_id'] = customer_id
            obj['_sdc_report_datetime'] = REPORT_RUN_DATETIME

            with Transformer(singer.UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
                bumble_bee.pre_hook = transform_pre_hook
                obj = bumble_bee.transform(obj, stream_schema)

            singer.write_record(stream_name, obj, time_extracted=time_extracted)
            counter.increment()

    if start > get_start_for_stream(sdk_client.client_customer_id, stream_name):
        LOGGER.info('updating bookmark: %s > %s', start,
                    get_start_for_stream(sdk_client.client_customer_id, stream_name))
        bookmarks.write_bookmark(STATE,
                                 state_key_name(sdk_client.client_customer_id, stream_name),
                                 'date',
                                 start.strftime(utils.DATETIME_FMT))
        singer.write_state(STATE)
    else:
        LOGGER.info('not updating bookmark: %s <= %s', start,
                    get_start_for_stream(sdk_client.client_customer_id, stream_name))

    LOGGER.info("Done syncing %s records for the %s report for customer_id %s on %s",
                counter.value, stream_name, customer_id, start)

def paginate(self, offset, count, ext_time, path, stream_id):
    # Use the configured start date on a first run; otherwise resume from the
    # timestamp bookmarked in state for this stream.
    if len(self.state) < 14:
        start_date = singer.utils.strptime_with_tz(self.config['start_date'])
    else:
        first_time = False
        start_date = singer.utils.strptime_with_tz(self.state[stream_id])
    start_date = start_date.strftime('%m/%d/%YT%H:%M:%S')
    ext_time = start_date

    while int(count) > int(offset) and (int(count) - int(offset)) >= -100:
        url = ("https://api.merchantos.com/API/Account/"
               + str(self.config['customer_ids']) + "/"
               + str(stream_id) + ".json?offset=")
        relation = self.create_relation()
        page = self.client.request(stream_id, "GET", url + str(offset) + relation)
        info = page['@attributes']
        count = info['count']
        if int(count) <= 100:
            # Fewer than a full page remains; push the offset past the count
            # so the loop terminates after this iteration.
            offset = 300
        else:
            offset = int(info['offset']) + 100
        data = page[str(stream_id)]

        # The endpoint returns either a single record (a dict) or a list of
        # records; the original branched on the element type, which is the
        # assumption reconstructed here.
        if isinstance(data, dict):
            ext_time = data['timeStamp']
            singer.write_record(stream_id, data)
            with metrics.record_counter(stream_id) as counter:
                counter.increment(len(page))
        else:
            for record in data:
                if record['timeStamp'] >= ext_time:
                    ext_time = record['timeStamp']
                singer.write_record(stream_id, record)
                with metrics.record_counter(stream_id) as counter:
                    counter.increment(len(page))

    path.append(ext_time)
    self.update_start_date_bookmark(path, str(stream_id))

def sync(self, mdata):
    schema = self.load_schema()

    with singer.metrics.job_timer(job_type='list_conversations') as timer:
        with singer.metrics.record_counter(endpoint=self.name) as counter:
            for channel in self.channels():
                with singer.Transformer(
                        integer_datetime_fmt="unix-seconds-integer-datetime-parsing") as transformer:
                    transformed_record = transformer.transform(data=channel,
                                                               schema=schema,
                                                               metadata=metadata.to_map(mdata))
                    singer.write_record(stream_name=self.name,
                                        time_extracted=singer.utils.now(),
                                        record=transformed_record)
                    counter.increment()

async def sync_stats(self, schema, period: pendulum.period = None):
    """Output the stats in the period."""
    stream = "stats"
    loop = asyncio.get_event_loop()
    singer.write_schema(stream, schema, ["service_id", "start_time"])

    bookmark = get_bookmark(self.state, stream, "from")
    if bookmark is not None:
        if "UTC" in bookmark:
            bookmark = datetime.datetime.strptime(
                bookmark, '%Y-%m-%d %H:%M:%S UTC').isoformat()
        start_date = pendulum.parse(bookmark).int_timestamp
    else:
        start_date = pendulum.parse(self._config['start_date']).int_timestamp
    end_date = pendulum.now().int_timestamp

    result = await loop.run_in_executor(None, self.client.stats, start_date, end_date)
    LOGGER.info("stats results: %s", result)
    if result:
        for n in result['data']:
            service_result = await loop.run_in_executor(None, self.client.service, n)
            for i in result['data'][n]:
                i['service_name'] = service_result['name']
                i['service_versions'] = json.dumps(service_result['versions'])
                i['service_customer_id'] = service_result['customer_id']
                i['service_publish_key'] = service_result['publish_key']
                i['service_comment'] = service_result['comment']
                i['service_deleted_at'] = service_result['deleted_at']
                i['service_updated_at'] = service_result['updated_at']
                i['service_created_at'] = service_result['created_at']
                singer.write_record(stream, i)

        try:
            end_temp = datetime.datetime.strptime(result['meta']["to"],
                                                  '%Y-%m-%d %H:%M:%S UTC')
            end = end_temp.isoformat()
            self.state = write_bookmark(self.state, stream, "from", end)
        except ValueError:
            sys.stderr.write("Could not parse 'to' timestamp: " + result['meta']["to"] + "\n")

def sync_deal_pipelines(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get("metadata"))
    schema = load_schema("deal_pipelines")
    singer.write_schema("deal_pipelines", schema, ["pipelineId"], catalog.get("stream_alias"))
    LOGGER.info("sync_deal_pipelines")
    data = request(get_url("deal_pipelines")).json()
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in data:
            record = bumble_bee.transform(row, schema, mdata)
            singer.write_record(
                "deal_pipelines",
                record,
                catalog.get("stream_alias"),
                time_extracted=utils.now(),
            )
    singer.write_state(STATE)
    return STATE

def sync_contact_lists(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get("metadata"))
    schema = load_schema("contact_lists")
    bookmark_key = "updatedAt"
    singer.write_schema("contact_lists", schema, ["listId"],
                        [bookmark_key], catalog.get("stream_alias"))

    start = get_start(STATE, "contact_lists", bookmark_key)
    max_bk_value = start
    LOGGER.info("sync_contact_lists from %s", start)

    url = get_url("contact_lists")
    params = {"count": 250}
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request(STATE, "contact_lists", url, params,
                               "lists", "has-more", ["offset"], ["offset"]):
            record = bumble_bee.transform(row, schema, mdata)

            if record[bookmark_key] >= start:
                singer.write_record(
                    "contact_lists",
                    record,
                    catalog.get("stream_alias"),
                    time_extracted=utils.now(),
                )
            if record[bookmark_key] >= max_bk_value:
                max_bk_value = record[bookmark_key]

    STATE = singer.write_bookmark(STATE, "contact_lists", bookmark_key, max_bk_value)
    singer.write_state(STATE)
    return STATE

def sync_time_filtered(entity):
    bookmark_property = 'updated_at'

    singer.write_schema(entity,
                        utils.load_schema(entity),
                        ["id"],
                        bookmark_properties=[bookmark_property])
    start = get_start(entity)

    logger.info("Syncing {} from {}".format(entity, start))
    for row in gen_request(get_url(entity)):
        if row[bookmark_property] >= start:
            if 'custom_fields' in row:
                row['custom_fields'] = transform_dict(row['custom_fields'], force_str=True)

            utils.update_state(STATE, entity, row[bookmark_property])
            singer.write_record(entity, row, time_extracted=singer.utils.now())
    singer.write_state(STATE)

async def sync_custom_usage(self, schema):
    """Get hourly usage for custom metrics."""
    stream = "custom_usage"
    loop = asyncio.get_event_loop()
    singer.write_schema(stream, schema, ["hour", "account"])

    custom_usage = await loop.run_in_executor(None, self.client.hourly_request,
                                              self.state, self.config,
                                              "timeseries", stream)
    if custom_usage:
        for c in custom_usage['usage']:
            c['account'] = self.config['account']
            singer.write_record(stream, c)

        if custom_usage['usage'] is not None and len(custom_usage['usage']) > 0:
            self.state = write_bookmark(self.state, stream, "since",
                                        custom_usage['usage'][-1]['hour'])

def sync(self):
    most_recent_date = self.params["last_seen_at"]
    record_metadata = singer.metadata.to_map(self.metadata)

    with singer.metrics.job_timer(job_type=f"list_{self.tap_stream_id}"), \
            singer.metrics.record_counter(endpoint=self.tap_stream_id) as counter, \
            singer.Transformer() as transformer:
        for page in self._list_resource(url_suffix="/customers/all", params=self.params):
            for record in page.get(self.tap_stream_id):
                transformed_record = transformer.transform(data=record,
                                                           schema=self.schema,
                                                           metadata=record_metadata)
                singer.write_record(stream_name=self.stream,
                                    time_extracted=singer.utils.now(),
                                    record=transformed_record)
                counter.increment()
                if transformed_record["last_seen_at"] > most_recent_date:
                    most_recent_date = transformed_record["last_seen_at"]

            singer.bookmarks.write_bookmark(state=self.state,
                                            tap_stream_id=self.tap_stream_id,
                                            key="last_seen_at",
                                            val=most_recent_date)
            singer.write_state(self.state)

    singer.bookmarks.write_bookmark(state=self.state,
                                    tap_stream_id=self.tap_stream_id,
                                    key="last_seen_at",
                                    val=most_recent_date)

def do_sync(account, annotated_schemas, state):
    for stream in get_streams_to_sync(account, annotated_schemas, state):
        LOGGER.info('Syncing %s, fields %s', stream.name, stream.fields())
        schema = load_schema(stream)
        singer.write_schema(stream.name, schema, stream.key_properties)

        with singer.stats.Counter(source=stream.name) as stats:
            for message in stream:
                if 'record' in message:
                    stats.add(record_count=1)
                    record = singer.transform.transform(message['record'], schema)
                    singer.write_record(stream.name, record)
                elif 'state' in message:
                    singer.write_state(message['state'])
                else:
                    raise Exception('Unrecognized message {}'.format(message))

def getaccount():
    accounts = requests.get('https://api.awin.com/accounts?accessToken=' + AUTH['accessToken'],
                            headers={"User-Agent": AUTH['user_agent']})
    if accounts.status_code == 200:
        singer.write_schema("Accounts", ACCOUNT_SCHEMA, ["accountId"])
        for account in accounts.json()['accounts']:
            if account['accountType'] == 'advertiser':
                ADVERTISERS.append(account['accountId'])
            if account['accountType'] == 'publisher':
                # Publisher accounts are also identified by accountId; this ID is
                # used by the publisher report endpoints.
                PUBLISHERS.append(account['accountId'])
            account["startDate"] = str(parse(STATE['last_fetched']) + timedelta(days=1))
            account["endDate"] = str(parse(STATE['last_fetched']) +
                                     timedelta(days=AUTH['increment']))
            singer.write_record("Accounts", account)
    else:
        LOGGER.error(accounts.json()['error'])
        sys.exit(1)

def getcampaign():
    campaigns = CLIENT.service.getCampaigns()
    singer.write_schema('campaigns', CAMPAIGN_SCHEMA, ['campaignID'])
    campaigns_list = []
    for acmp in campaigns:
        for cmp in acmp[1:]:
            camp = dict(cmp[0])
            if camp['campaignBid'] is not None:
                camp['biddingStrategy'] = camp['campaignBid']['biddingStrategy']
                if camp['campaignBid']['cpcBid'] is not None:
                    camp['cpc'] = camp['campaignBid']['cpcBid']['cpc']
                if camp['campaignBid']['cpaBid'] is not None:
                    camp['cpa'] = camp['campaignBid']['cpaBid']['cpa']
            camp['categoryBids'] = str(camp['categoryBids'])
            camp.pop('campaignBid')
            campaigns_list.append(camp['campaignID'])
            singer.write_record('campaigns', camp)
    return campaigns_list

def sync(self, state, stream_schema, stream_metadata, config, transformer):
    start_time = singer.get_bookmark(state, self.tap_stream_id,
                                     self.replication_key, config['start_date'])
    for window_start, window_end in get_date_windows(start_time):
        LOGGER.info("Searching for customers from %s to %s", window_start, window_end)
        for page, _ in self.client.get_customers(window_start, window_end):
            for record in page:
                transformed_record = transformer.transform(record, stream_schema, stream_metadata)
                singer.write_record(
                    self.tap_stream_id,
                    transformed_record,
                )
        state = singer.write_bookmark(state, self.tap_stream_id,
                                      self.replication_key, window_end)
        singer.write_state(state)
    return state

async def sync_trace_search(self, schema):
    stream = "trace_search"
    loop = asyncio.get_event_loop()
    singer.write_schema(stream, schema, ["hour", "account"])

    trace_search = await loop.run_in_executor(None, self.client.hourly_request,
                                              self.state, self.config,
                                              "traces", stream)
    if trace_search:
        for trace in trace_search['usage']:
            trace['account'] = self.config['account']
            singer.write_record(stream, trace)

        if trace_search['usage'] is not None and len(trace_search['usage']) > 0:
            self.state = write_bookmark(self.state, stream, "since",
                                        trace_search['usage'][-1]['hour'])

def _sync_contact_vids(catalog, vids, schema, bumble_bee):
    if len(vids) == 0:
        return

    data = request(get_url("contacts_detail"),
                   params={'vid': vids,
                           'showListMemberships': True,
                           "formSubmissionMode": "all"}).json()
    time_extracted = utils.now()
    mdata = metadata.to_map(catalog.get('metadata'))

    for record in data.values():
        record = bumble_bee.transform(record, schema, mdata)
        singer.write_record("contacts",
                            record,
                            catalog.get('stream_alias'),
                            time_extracted=time_extracted)

def getaggreportcreative():
    singer.write_schema("AggReport", AGGREGATED_CREATIVE_SCHEMA,
                        ["advertiserId", "publisherId", "region"])
    for advertiser in ADVERTISERS:
        reportdataset = requests.get(
            'https://api.awin.com/advertisers/' + str(advertiser) + '/reports/creative',
            params=STATE['aggregatedByCreative'],
            headers={"User-Agent": AUTH['user_agent']})
        if reportdataset.status_code == 200:
            for data in reportdataset.json():
                data["startDate"] = str(parse(STATE['last_fetched']) + timedelta(days=1))
                data["endDate"] = str(parse(STATE['last_fetched']) +
                                      timedelta(days=AUTH['increment']))
                singer.write_record("AggReport", data)
        else:
            LOGGER.error(
                "Error " + str(reportdataset.content).replace('\n', ' ') +
                " while extracting data in report creative for advertiser: " + str(advertiser))
            sys.exit(1)
        time.sleep(5)

    for publisher in PUBLISHERS:
        reportdataset = requests.get(
            'https://api.awin.com/publishers/' + str(publisher) + '/reports/creative',
            params=STATE['aggregatedByCreative'],
            headers={"User-Agent": AUTH['user_agent']})
        if reportdataset.status_code == 200:
            for data in reportdataset.json():
                data["startDate"] = str(parse(STATE['last_fetched']) + timedelta(days=1))
                data["endDate"] = str(parse(STATE['last_fetched']) +
                                      timedelta(days=AUTH['increment']))
                singer.write_record("AggReport", data)
        else:
            LOGGER.error(
                "Error " + str(reportdataset.content).replace('\n', ' ') +
                " while extracting data in report creative for publisher: " + str(publisher))
            sys.exit(1)
        time.sleep(5)

def downloadcategoryreport():
    filters = STATE
    cate = getcategory()
    if not cate:
        LOGGER.info('0 rows for category report')
    else:
        filters['reportSelector'] = {'CategoryIDs': cate}
        filters['reportType'] = 'Category'
        jobid = CLIENT.service.scheduleReportJob(filters)
        while True:
            if CLIENT.service.getJobStatus(jobid) == 'Completed':
                tab = ET.parse(urlopen(CLIENT.service.getReportDownloadUrl(jobid))) \
                    .getroot()[0]
                break
        rows = [i for i in tab if i.tag == 'rows'][0]
        singer.write_schema('categoriemetrics', CATEGORY_METRICS_SCHEMA,
                            ['categoryID', 'dateTime'])
        for row in rows:
            # Write to the same stream the schema was emitted for.
            singer.write_record('categoriemetrics', row.attrib)

def process_records(stream, mdata, max_modified, records, filter_field, fks):
    schema = stream.schema.to_dict()
    with metrics.record_counter(stream.tap_stream_id) as counter:
        for record in records:
            record_flat = {
                'id': record['id']
            }
            for prop, value in record['attributes'].items():
                if prop == 'id':
                    raise Exception('Error flattening Outreach record - conflict with `id` key')
                record_flat[prop] = value

            if 'relationships' in record:
                for prop, value in record['relationships'].items():
                    if 'data' not in value and 'links' not in value:
                        raise Exception('Only `data` or `links` expected in relationships')

                    fk_field_name = '{}Id'.format(prop)
                    if 'data' in value and fk_field_name in fks:
                        data_value = value['data']
                        if data_value is not None and 'id' not in data_value:
                            raise Exception('null or `id` field expected for `data` relationship')

                        if fk_field_name in record_flat:
                            raise Exception(
                                '`{}` exists as both an attribute and generated relationship name'.format(fk_field_name))

                        if data_value is None:
                            record_flat[fk_field_name] = None
                        else:
                            record_flat[fk_field_name] = data_value['id']

            if filter_field in record_flat and record_flat[filter_field] > max_modified:
                max_modified = record_flat[filter_field]

            with Transformer() as transformer:
                record_typed = transformer.transform(record_flat, schema, mdata)
                singer.write_record(stream.tap_stream_id, record_typed)
                counter.increment()

    return max_modified