def refresh_token(self):
    # http://developers.marketo.com/rest-api/authentication/#creating_an_access_token
    params = {
        "grant_type": "client_credentials",
        "client_id": self.client_id,
        "client_secret": self.client_secret,
    }
    singer.log_info("Refreshing token")

    try:
        url = self.get_url("identity/oauth/token")
        resp = requests.get(url, params=params)
        resp_time = pendulum.utcnow()
    except requests.exceptions.ConnectionError as e:
        raise ApiException(
            "Connection error while refreshing token at {}.".format(url)) from e

    if resp.status_code != 200:
        raise ApiException("Error refreshing token [{}]: {}".format(
            resp.status_code, resp.content))

    data = resp.json()
    if "error" in data:
        if data["error"] == "unauthorized":
            msg = "Authorization failed: "
        else:
            msg = "Marketo API returned an error: "
        msg += data.get("error_description", "No message from api")
        raise ApiException(msg)

    self.access_token = data["access_token"]
    self.token_expires = resp_time.add(seconds=data["expires_in"] - 15)
    singer.log_info("Token valid until %s", self.token_expires)
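# A minimal sketch (not part of the tap) of how a caller might keep the token
# fresh before issuing requests. `ensure_token` is a hypothetical helper; only
# `access_token`, `token_expires`, and `refresh_token` come from the method above.
def ensure_token(client):
    # Refresh when no token has been fetched yet or the stored expiry has passed.
    if getattr(client, "access_token", None) is None or pendulum.utcnow() >= client.token_expires:
        client.refresh_token()
    return client.access_token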
def get_or_create_export_for_leads(client, state, stream, export_start, config):
    export_id = bookmarks.get_bookmark(state, "leads", "export_id")
    # check if export is still valid
    if export_id is not None and not client.export_available("leads", export_id):
        singer.log_info("Export %s no longer available.", export_id)
        export_id = None

    if export_id is None:
        # Corona mode is required to query by "updatedAt", otherwise a full
        # sync is required using "createdAt".
        query_field = "updatedAt" if client.use_corona else "createdAt"
        max_export_days = int(config.get('max_export_days', MAX_EXPORT_DAYS))
        export_end = get_export_end(export_start, end_days=max_export_days)
        query = {query_field: {"startAt": export_start.isoformat(),
                               "endAt": export_end.isoformat()}}

        # Create the new export and store the id and end date in state.
        # Does not start the export (must POST to the "enqueue" endpoint).
        fields = []
        for entry in stream['metadata']:
            if len(entry['breadcrumb']) > 0 and (entry['metadata'].get('selected')
                                                 or entry['metadata'].get('inclusion') == 'automatic'):
                fields.append(entry['breadcrumb'][-1])

        export_id = client.create_export("leads", fields, query)
        state = update_state_with_export_info(
            state, stream, export_id=export_id, export_end=export_end.isoformat())
    else:
        export_end = pendulum.parse(bookmarks.get_bookmark(state, "leads", "export_end"))

    return export_id, export_end
def wait_for_export(self, stream_type, export_id):
    # Poll the export status until it enters a finalized state or
    # exceeds the job timeout time.
    with metrics.job_timer('Export {} for {}'.format(export_id, stream_type)):
        timeout_time = pendulum.utcnow().add(seconds=self.job_timeout)
        while pendulum.utcnow() < timeout_time:
            status = self.poll_export(stream_type, export_id)
            singer.log_info("export %s status is %s", export_id, status)

            if status == "Created":
                # If the status is created, the export has been made but
                # not started, so enqueue the export.
                self.enqueue_export(stream_type, export_id)
            elif status in ["Cancelled", "Failed"]:
                # Cancelled and failed exports fail the current sync.
                raise ExportFailed(status)
            elif status == "Completed":
                return True

            time.sleep(self.poll_interval)

    raise ExportFailed("Export timed out after {} minutes".format(self.job_timeout / 60))
def update_calls_today(self):
    # http://developers.marketo.com/rest-api/endpoint-reference/lead-database-endpoint-reference/#!/Usage/getDailyUsageUsingGET
    data = self._request("GET", "rest/v1/stats/usage.json").json()
    if "result" not in data:
        raise ApiException(data)

    self.calls_today = int(data["result"][0]["total"])
    singer.log_info("Used %s of %s requests", self.calls_today, self.max_daily_calls)
def get_or_create_export_for_activities(client, state, stream, export_start, config):
    export_id = bookmarks.get_bookmark(state, stream["tap_stream_id"], "export_id")
    if export_id is not None and not client.export_available("activities", export_id):
        singer.log_info("Export %s no longer available.", export_id)
        export_id = None

    if export_id is None:
        # The activity id is in the top-most breadcrumb of the metadata.
        # Activity ids correspond to activity type ids in Marketo, and the
        # activity type id is needed to build the query.
        activity_metadata = metadata.to_map(stream["metadata"])
        activity_type_id = metadata.get(activity_metadata, (), 'marketo.activity-id')

        # Activities must be queried by `createdAt` even though
        # that is not a real field. `createdAt` proxies `activityDate`.
        # The activity type id must also be included in the query. The
        # largest date range that can be used for activities is 30 days.
        max_export_days = int(config.get('max_export_days', MAX_EXPORT_DAYS))
        export_end = get_export_end(export_start, end_days=max_export_days)
        query = {"createdAt": {"startAt": export_start.isoformat(),
                               "endAt": export_end.isoformat()},
                 "activityTypeIds": [activity_type_id]}

        # Create the new export and store the id and end date in state.
        # Does not start the export (must POST to the "enqueue" endpoint).
        try:
            export_id = client.create_export("activities", ACTIVITY_FIELDS, query)
        except ApiQuotaExceeded as e:
            # Re-raise ApiQuotaExceeded so the message can tell the customer
            # what their configured max_export_days is.
            raise ApiQuotaExceeded(
                ("You may wish to consider changing the "
                 "`max_export_days` config value to a lower number if "
                 "you're unable to sync a single {} day window within "
                 "your current API quota.").format(max_export_days)) from e

        state = update_state_with_export_info(
            state, stream, export_id=export_id, export_end=export_end.isoformat())
    else:
        export_end = pendulum.parse(
            bookmarks.get_bookmark(state, stream["tap_stream_id"], "export_end"))

    return export_id, export_end
def discover(client):
    singer.log_info("Starting discover")
    streams = []
    streams.append(discover_leads(client))
    streams.append(discover_catalog("activity_types",
                                    ACTIVITY_TYPES_AUTOMATIC_INCLUSION,
                                    unsupported=ACTIVITY_TYPES_UNSUPPORTED,
                                    stream_automatic_inclusion=True))
    streams.extend(discover_activities(client))
    streams.append(discover_catalog("campaigns", CAMPAIGNS_AUTOMATIC_INCLUSION))
    streams.append(discover_catalog("lists", LISTS_AUTOMATIC_INCLUSION))
    streams.append(discover_catalog("programs", PROGRAMS_AUTOMATIC_INCLUSION))
    json.dump({"streams": streams}, sys.stdout, indent=2)
    singer.log_info("Finished discover")
def create_export(self, stream_type, fields, query):
    # http://developers.marketo.com/rest-api/bulk-extract/#creating_a_job
    payload = {"format": "CSV",
               "fields": fields,
               "filter": query}
    endpoint = self.get_bulk_endpoint(stream_type, "create")
    endpoint_name = "{}_create".format(stream_type)
    singer.log_info('Scheduling export job with query %s', query)
    data = self.request("POST", endpoint, endpoint_name=endpoint_name, json=payload)
    return data["result"][0]["exportId"]
def get_selected_streams(remaining_streams):
    selected_streams = []
    for stream in remaining_streams:
        mdata = metadata.to_map(stream.get('metadata'))
        if metadata.get(mdata, (), 'selected') is True:
            selected_streams.append(stream)
        else:
            singer.log_info("%s: not selected", stream["tap_stream_id"])
    return selected_streams
def stream_rows(client, stream_type, export_id):
    with tempfile.NamedTemporaryFile(mode="w+", encoding="utf8") as csv_file:
        singer.log_info("Download starting.")
        resp = client.stream_export(stream_type, export_id)
        for chunk in resp.iter_content(chunk_size=1024, decode_unicode=True):
            if chunk:
                csv_file.write(chunk)

        singer.log_info("Download completed. Begin streaming rows.")
        csv_file.seek(0)

        reader = csv.reader(csv_file, delimiter=',', quotechar='"')
        headers = next(reader)
        for line in reader:
            yield dict(zip(headers, line))
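# A minimal sketch of how the bulk-export pieces above are typically composed
# for a single export window: create the job, wait for it to finish, then
# stream the CSV rows. `write_record` is a placeholder for whatever the caller
# does with each row; the query shape mirrors the filters built elsewhere in
# this tap.
def sync_export_window(client, stream_type, fields, query, write_record):
    export_id = client.create_export(stream_type, fields, query)
    # Raises ExportFailed if the job is cancelled, fails, or times out.
    client.wait_for_export(stream_type, export_id)
    for row in stream_rows(client, stream_type, export_id):
        write_record(row)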
def _request(self, method, url, endpoint_name=None, stream=False, **kwargs):
    endpoint_name = endpoint_name or url
    url = self.get_url(url)
    headers = kwargs.pop("headers", {})
    headers.update(self.headers)
    req = requests.Request(method, url, headers=headers, **kwargs).prepare()
    singer.log_info("%s: %s", method, req.url)
    with singer.metrics.http_request_timer(endpoint_name):
        resp = self._session.send(req, stream=stream)
    resp.raise_for_status()
    return resp
def excute_graph_ql(self, query: str) -> dict:
    try:
        # shopify.GraphQL().execute sometimes prints to stdout, which would
        # corrupt the stream sent to the target, so suppress printing here.
        with HiddenPrints():
            response = json.loads(shopify.GraphQL().execute(query))
    except Exception as exc:
        raise GraphQLGeneralError("Execution failed", code=500) from exc

    if 'data' in response and response['data'] is not None:
        return response['data']

    if "errors" in response:
        errors = response["errors"]
        singer.log_info(errors)
        if errors[0]["extensions"]["code"] == "THROTTLED":
            raise GraphQLThrottledError("THROTTLED", code=429)

    raise GraphQLGeneralError("Failed", code=500)
def get_graph_ql_data(self, replication_obj: Stream):
    LOGGER.info("Getting data with GraphQL")
    updated_at_min = replication_obj.get_bookmark()
    stop_time = singer.utils.now().replace(microsecond=0)
    date_window_size = float(Context.config.get("date_window_size", DATE_WINDOW_SIZE))

    # Page through till the end of the result set
    while updated_at_min < stop_time:
        after = None
        updated_at_max = updated_at_min + datetime.timedelta(days=date_window_size)
        if updated_at_max > stop_time:
            updated_at_max = stop_time
        singer.log_info("getting from %s - %s", updated_at_min, updated_at_max)

        while True:
            query = self.get_graph_query(updated_at_min, updated_at_max,
                                         replication_obj.name, after=after)
            with metrics.http_request_timer(replication_obj.name):
                data = self.excute_graph_ql(query)

            data = data[replication_obj.name]
            page_info = data['pageInfo']
            edges = data["edges"]
            for edge in edges:
                after = edge["cursor"]
                node = edge["node"]
                yield node

            if not page_info["hasNextPage"]:
                Context.state.get('bookmarks', {}).get(replication_obj.name, {}).pop('since_id', None)
                replication_obj.update_bookmark(
                    utils.strftime(updated_at_max + datetime.timedelta(seconds=1)))
                break

        updated_at_min = updated_at_max + datetime.timedelta(seconds=1)
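# For reference, the query built by `get_graph_query` is assumed to follow the
# Relay-style connection shape the pagination above relies on
# (`pageInfo.hasNextPage`, `edges[].cursor`, `edges[].node`). An illustrative
# shape, with placeholder object and field names, might look like:
#
#   {
#     orders(first: 100, after: "<cursor>", query: "updated_at:>=... AND updated_at:<=...") {
#       pageInfo { hasNextPage }
#       edges {
#         cursor
#         node { id updatedAt }
#       }
#     }
#   }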
def get_or_create_export_for_leads(client, state, stream, export_start):
    export_id = bookmarks.get_bookmark(state, "leads", "export_id")
    # check if export is still valid
    if export_id is not None and not client.export_available("leads", export_id):
        singer.log_info("Export %s no longer available.", export_id)
        export_id = None

    if export_id is None:
        # Corona mode is required to query by "updatedAt", otherwise a full
        # sync is required using "createdAt".
        query_field = "updatedAt" if client.use_corona else "createdAt"
        export_end = get_export_end(export_start)
        query = {query_field: {"startAt": export_start.isoformat(),
                               "endAt": export_end.isoformat()}}

        # Create the new export and store the id and end date in state.
        # Does not start the export (must POST to the "enqueue" endpoint).
        fields = [f for f, s in stream["schema"]["properties"].items()
                  if s.get("selected") or (s.get("inclusion") == "automatic")]
        export_id = client.create_export("leads", fields, query)
        state = update_state_with_export_info(
            state, stream, export_id=export_id, export_end=export_end.isoformat())
    else:
        export_end = pendulum.parse(bookmarks.get_bookmark(state, "leads", "export_end"))

    return export_id, export_end
def test_corona(self):
    # http://developers.marketo.com/rest-api/bulk-extract/#limits
    # Corona allows us to do bulk queries for Leads using updatedAt
    # as a filter. Clients without Corona (should only be clients
    # with < 50,000 Leads) must do a full bulk export every sync.
    # We test for Corona by requesting a one-second export of leads
    # using the updatedAt filter.
    singer.log_info("Testing for Corona support")
    start_pen = pendulum.utcnow().subtract(days=1).replace(microsecond=0)
    end_pen = start_pen.add(seconds=1)
    payload = {
        "format": "CSV",
        "fields": ["id"],
        "filter": {
            "updatedAt": {
                "startAt": start_pen.isoformat(),
                "endAt": end_pen.isoformat(),
            },
        },
    }
    endpoint = self.get_bulk_endpoint("leads", "create")
    data = self._request("POST", endpoint, endpoint_name="leads_create", json=payload).json()

    # If the error code indicating no Corona support is present,
    # Corona is not supported. If we don't get that error code,
    # Corona is supported and we need to clean up by cancelling the
    # test export we requested.
    err_codes = set(err["code"] for err in data.get("errors", []))
    if NO_CORONA_CODE in err_codes:
        singer.log_info("Corona not supported.")
        return False
    elif API_QUOTA_EXCEEDED in err_codes:
        raise ApiQuotaExceeded(API_QUOTA_EXCEEDED_MESSAGE.format(data['errors']))
    else:
        singer.log_info("Corona is supported.")
        singer.log_info(data)
        self.cancel_export("leads", data["result"][0]["exportId"])
        return True
def get_objects(self):
    updated_at_min = self.get_bookmark()
    stop_time = singer.utils.now().replace(microsecond=0)

    # Retrieve at most one year of data. If the window is larger, cap it and
    # log that further incremental syncs are needed for the remaining rows.
    diff_days = (stop_time - updated_at_min).days
    yearly = False
    if diff_days > 365:
        yearly = True
        stop_time = updated_at_min + datetime.timedelta(days=365)
        LOGGER.info("This import will only import the first year of historical data. "
                    "You need to trigger further incremental imports to get the missing rows.")

    date_window_size = float(Context.config.get("date_window_size", DATE_WINDOW_SIZE))
    results_per_page = Context.get_results_per_page(RESULTS_PER_PAGE)

    # Page through till the end of the result set
    while updated_at_min < stop_time:
        # Bookmarking can also occur on the since_id
        since_id = self.get_since_id() or 1
        if since_id != 1:
            LOGGER.info("Resuming sync from since_id %d", since_id)

        # It's important that `updated_at_min` has microseconds
        # truncated. Why has been lost to the mists of time but we
        # think it has something to do with how the API treats
        # microseconds on its date windows. Maybe it's possible to
        # drop data due to rounding errors or something like that?
        updated_at_max = updated_at_min + datetime.timedelta(days=date_window_size)
        if updated_at_max > stop_time:
            updated_at_max = stop_time
        singer.log_info("getting from %s - %s", updated_at_min, updated_at_max)

        min_filter_key = self.get_min_replication_key()
        max_filter_key = self.get_max_replication_key()
        while True:
            status_key = self.status_key or "status"
            query_params = {
                "since_id": since_id,
                min_filter_key: updated_at_min,
                max_filter_key: updated_at_max,
                "limit": results_per_page,
            }
            if self.add_status:
                query_params[status_key] = "any"

            with metrics.http_request_timer(self.name):
                objects = self.call_api(query_params)

            for obj in objects:
                if obj.id < since_id:
                    # This verifies the api behavior expectation we
                    # have that all results actually honor the
                    # since_id parameter.
                    raise OutOfOrderIdsError("obj.id < since_id: {} < {}".format(
                        obj.id, since_id))
                yield obj

            # You know you're at the end when the current page has
            # less than the request size limit you set.
            singer.log_info("Got %s records", len(objects))
            if len(objects) < results_per_page:
                # Save the updated_at_max as our bookmark as we've synced all rows up in our
                # window and can move forward. Also remove the since_id because we want to
                # restart at 1.
                Context.state.get('bookmarks', {}).get(self.name, {}).pop('since_id', None)
                state_val = updated_at_max
                if self.skip_day:
                    state_val = state_val + datetime.timedelta(days=1)
                self.update_bookmark(utils.strftime(state_val))
                break

            if objects[-1].id != max([o.id for o in objects]):
                # This verifies the api behavior expectation we have
                # that all pages are internally ordered by the
                # `since_id`.
                raise OutOfOrderIdsError("{} is not the max id in objects ({})".format(
                    objects[-1].id, max([o.id for o in objects])))
            since_id = objects[-1].id

            # Put since_id into the state.
            self.update_bookmark(since_id, bookmark_key='since_id')

        updated_at_min = updated_at_max + datetime.timedelta(seconds=1)
        if self.skip_day:
            updated_at_min = updated_at_min + datetime.timedelta(days=1)

    if yearly:
        LOGGER.info("This import only imported one year of historical data. "
                    "Please trigger further incremental imports to get the missing rows.")
def sync(client, catalog, config, state):
    starting_stream = bookmarks.get_currently_syncing(state)
    if starting_stream:
        singer.log_info("Resuming sync from %s", starting_stream)
    else:
        singer.log_info("Starting sync")

    for stream in catalog["streams"]:
        # Skip unselected streams.
        if not stream["schema"].get("selected"):
            singer.log_info("%s: not selected", stream["tap_stream_id"])
            continue

        # Skip streams that have already been synced when resuming.
        if starting_stream and stream["tap_stream_id"] != starting_stream:
            singer.log_info("%s: already synced", stream["tap_stream_id"])
            continue

        singer.log_info("%s: starting sync", stream["tap_stream_id"])

        # Now that we've started, there's no more "starting stream". Set
        # the current stream to resume on next run.
        starting_stream = None
        state = bookmarks.set_currently_syncing(state, stream["tap_stream_id"])
        singer.write_state(state)

        # Sync stream based on type.
        if stream["tap_stream_id"] == "activity_types":
            state, record_count = sync_activity_types(client, state, stream)
        elif stream["tap_stream_id"] == "leads":
            state, record_count = sync_leads(client, state, stream)
        elif stream["tap_stream_id"].startswith("activities_"):
            state, record_count = sync_activities(client, state, stream, config)
        elif stream["tap_stream_id"] in ["campaigns", "lists"]:
            state, record_count = sync_paginated(client, state, stream)
        elif stream["tap_stream_id"] == "programs":
            state, record_count = sync_programs(client, state, stream)
        else:
            raise Exception("Stream %s not implemented" % stream["tap_stream_id"])

        # Emit metric for record count.
        counter = singer.metrics.record_counter(stream["tap_stream_id"])
        counter.value = record_count
        counter._pop()  # pylint: disable=protected-access

        # Unset current stream.
        state = bookmarks.set_currently_syncing(state, None)
        singer.write_state(state)
        singer.log_info("%s: finished sync", stream["tap_stream_id"])

    singer.log_info("Finished sync.")

    # If Corona is not supported, log a warning near the end of the tap
    # log with instructions on how to get Corona supported.
    if not client.use_corona:
        singer.log_warning(NO_CORONA_WARNING)
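# For reference, a minimal sketch of the catalog entry shape this sync loop
# expects; only the keys read above ("streams", "tap_stream_id", and the
# schema-level "selected" flag) are shown, everything else is omitted.
#
#   {
#     "streams": [
#       {
#         "tap_stream_id": "campaigns",
#         "schema": {"type": "object", "selected": true, "properties": {}}
#       }
#     ]
#   }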
def sync_report(stream_name, stream_metadata, sdk_client):
    report_window_days = CONFIG.get("MAX_REPORT_TIME_WINDOW", 365)

    is_incremental = False
    if metadata.get(stream_metadata, (), "replication-method") == "INCREMENTAL":
        is_incremental = True

    customer_id = sdk_client.client_customer_id
    stream_schema, _ = create_schema_for_report(stream_name, sdk_client)
    stream_schema = add_synthetic_keys_to_stream_schema(stream_schema)
    xml_attribute_list = get_fields_to_sync(stream_schema, stream_metadata)

    primary_keys = metadata.get(stream_metadata, (), 'tap-adwords.report-key-properties') or []
    LOGGER.info("{} primary keys are {}".format(stream_name, primary_keys))
    write_schema(stream_name, stream_schema, primary_keys, bookmark_properties=['day'])

    field_list = []
    for field in xml_attribute_list:
        field_list.append(stream_metadata[('properties', field)]['adwords.fieldName'])
    check_selected_fields(stream_name, field_list, sdk_client)

    # If an attribution window sync is interrupted, start where it left off
    start_date = get_attribution_window_bookmark(customer_id, stream_name)
    if start_date is not None:
        start_date = start_date + relativedelta(days=1)
    if start_date is None:
        start_date = apply_conversion_window(get_start_for_stream(customer_id, stream_name))

    if stream_name in REPORTS_WITH_90_DAY_MAX:
        cutoff = utils.now() + relativedelta(days=-90)
        if start_date < cutoff:
            LOGGER.warning("report only supports up to 90 days, will start at {}".format(cutoff))
            start_date = cutoff

    start_date = start_date.replace(hour=0, minute=0, second=0, microsecond=0)
    LOGGER.info('Selected fields: %s', field_list)

    max_end_date = utils.now() - relativedelta(days=1)
    required_end_date = get_end_date()
    report_end_date = min(max_end_date, required_end_date)
    report_end_date = report_end_date.replace(hour=23, minute=59, second=59, microsecond=0)

    next_start_date = start_date
    is_single_day_report = stream_name in REPORTS_REQUIRING_DAILY_REPORTS
    start_plus_window = next_start_date
    if not is_single_day_report:
        start_plus_window += relativedelta(days=report_window_days)
    end_date = min(start_plus_window, report_end_date)

    while next_start_date <= report_end_date:
        singer.log_info("syncing %s for %s - %s", stream_name,
                        next_start_date.strftime("%Y-%m-%d"),
                        end_date.strftime("%Y-%m-%d"))
        actual_end_date = min(end_date, report_end_date)
        sync_report_for_day(stream_name, stream_schema, sdk_client,
                            next_start_date, field_list, actual_end_date)

        next_start_date = end_date + relativedelta(days=1)
        start_plus_window = next_start_date
        if not is_single_day_report:
            start_plus_window += relativedelta(days=report_window_days)
        end_date = start_plus_window

        bookmarks.write_bookmark(STATE,
                                 state_key_name(customer_id, stream_name),
                                 'last_attribution_window_date',
                                 actual_end_date.strftime(utils.DATETIME_FMT))
        singer.write_state(STATE)

    if not is_incremental:
        bookmarks.clear_bookmark(STATE,
                                 state_key_name(customer_id, stream_name),
                                 'last_attribution_window_date')
        singer.write_state(STATE)

    LOGGER.info("Done syncing the %s report for customer_id %s", stream_name, customer_id)