def request(endpoint, params=None):
    url = BASE_URL + endpoint
    params = params or {}
    headers = {}
    if 'user_agent' in CONFIG:
        headers['User-Agent'] = CONFIG['user_agent']
    headers['Authorization'] = "Basic " + CONFIG['api_key']

    req = requests.Request("GET", url, params=params, headers=headers).prepare()
    LOGGER.info("GET {}".format(req.url))

    with metrics.http_request_timer(url) as timer:
        resp = SESSION.send(req)
        timer.tags[metrics.Tag.http_status_code] = resp.status_code

    # Raise on HTTP errors before attempting to parse the body
    resp.raise_for_status()

    if resp.headers.get('Content-Type') == "application/gzip":
        json_body = unzip_to_json(resp.content)
    else:
        json_body = resp.json()

    return json_body
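# The `unzip_to_json` helper referenced above is not shown in this collection. A minimal
# sketch of what such a helper might look like, assuming the payload is a gzip-compressed
# JSON document (the implementation below is an assumption, not the original code):

import gzip
import io
import json


def unzip_to_json(content):
    """Decompress a gzip-compressed HTTP body and parse it as JSON (illustrative sketch)."""
    with gzip.GzipFile(fileobj=io.BytesIO(content)) as gz:
        return json.loads(gz.read().decode('utf-8'))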
def request(self, method, path=None, url=None, **kwargs):
    self.get_access_token()

    if not url and self.base_url is None:
        self.base_url = '{}/{}'.format(API_URL, API_VERSION)

    if not url and path:
        url = '{}/{}'.format(self.base_url, path)

    # endpoint = stream_name (from sync.py API call)
    if 'endpoint' in kwargs:
        endpoint = kwargs['endpoint']
        del kwargs['endpoint']
    else:
        endpoint = None

    if 'headers' not in kwargs:
        kwargs['headers'] = {}
    kwargs['headers']['Authorization'] = 'Bearer {}'.format(self.__access_token)

    if self.__user_agent:
        kwargs['headers']['User-Agent'] = self.__user_agent

    if method == 'POST':
        kwargs['headers']['Content-Type'] = 'application/json'

    with metrics.http_request_timer(endpoint) as timer:
        response = self.__session.request(method, url, **kwargs)
        timer.tags[metrics.Tag.http_status_code] = response.status_code

    if response.status_code >= 500:
        raise Server5xxError()

    # Rate limits: https://developers.snapchat.com/api/docs/#rate-limits
    # Use retry functionality in backoff to wait and retry if
    # response code equals 429 because rate limit has been exceeded
    # LOGGER.info('headers = {}'.format(response.headers))
    rate_limit = int(response.headers.get('X-Rate-Limit-Limit', 0))
    rate_limit_remaining = int(response.headers.get('X-Rate-Limit-Remaining', 0))
    rate_limit_reset = int(response.headers.get('X-Rate-Limit-Reset', 0))
    if rate_limit == 0:
        rate_limit_percent_remaining = 100
    else:
        rate_limit_percent_remaining = 100 * (
            Decimal(rate_limit_remaining) / Decimal(rate_limit))

    # Wait for reset if remaining calls are less than 5%
    if rate_limit_percent_remaining < 5:
        LOGGER.warning(
            'Rate Limit Warning: {}; remaining calls: {}; remaining %: {}%'.format(
                rate_limit, rate_limit_remaining, int(rate_limit_percent_remaining)))
        wait_time = rate_limit_reset - int(time.time())
        LOGGER.warning('Waiting for {} seconds.'.format(wait_time))
        time.sleep(int(wait_time))

    if response.status_code == 429:
        raise Server429Error()
    elif response.status_code >= 500:
        raise Server5xxError()

    if response.status_code != 200:
        LOGGER.error('{}: {}'.format(response.status_code, response.text))
        raise_for_error(response)

    # Catch invalid json response
    try:
        response_json = response.json()
    except Exception as err:
        LOGGER.error('{}'.format(err))
        LOGGER.error('response.headers = {}'.format(response.headers))
        LOGGER.error('response.reason = {}'.format(response.reason))
        raise Exception(err)

    return response_json
def request(self, method, url=None, path=None, headers=None, json=None, version=None, **kwargs):
    if not self.__verified:
        self.__verified = self.check_access()

    if not url and path:
        url = '{}/{}'.format(self.base_url, path)

    if 'endpoint' in kwargs:
        endpoint = kwargs['endpoint']
        del kwargs['endpoint']
    else:
        endpoint = None

    if not headers:
        headers = {}

    # API Version: https://developer.github.com/v3/#current-version
    if not version:
        version = 'v3'
    headers['Accept'] = 'application/vnd.github.{}+json'.format(version)

    # Authentication: https://developer.github.com/v3/#authentication
    headers['Authorization'] = 'Token {}'.format(self.__api_token)

    if self.__user_agent:
        headers['User-Agent'] = self.__user_agent

    if method == 'POST':
        headers['Content-Type'] = 'application/json'

    with metrics.http_request_timer(endpoint) as timer:
        response = self.__session.request(method=method, url=url, headers=headers, json=json, **kwargs)
        timer.tags[metrics.Tag.http_status_code] = response.status_code

    if response.status_code >= 500:
        raise Server5xxError()

    # 304: File Not Modified status_code
    if response.status_code == 304:
        return None, None

    if response.status_code != 200:
        raise_for_error(response)

    last_modified = response.headers.get('Last-Modified')
    response_json = response.json()

    # last-modified: https://developer.github.com/v3/#conditional-requests
    if last_modified:
        last_modified_dttm = datetime.strptime(last_modified, '%a, %d %b %Y %H:%M:%S %Z')
        response_json['last_modified'] = last_modified_dttm.strftime("%Y-%m-%dT%H:%M:%SZ")

    # Pagination: https://developer.github.com/v3/guides/traversing-with-pagination/
    links_header = response.headers.get('Link')
    links = []
    next_url = None
    if links_header:
        links = links_header.split(',')
    for link in links:
        try:
            url, rel = re.search(r'^\<(https.*)\>; rel\=\"(.*)\"$', link.strip()).groups()
            if rel == 'next':
                next_url = url
        except AttributeError:
            next_url = None

    return response_json, next_url
def request(self, method, url=None, path=None, data=None, params=None, **kwargs):
    if not url and path:
        url = '{}/{}'.format(self.base_url, path)

    if 'endpoint' in kwargs:
        endpoint = kwargs['endpoint']
        del kwargs['endpoint']
    else:
        endpoint = None

    if 'headers' not in kwargs:
        kwargs['headers'] = {}
    kwargs['headers']['Accept'] = 'application/json'

    if self.__user_agent:
        kwargs['headers']['User-Agent'] = self.__user_agent

    if method == 'POST':
        kwargs['headers']['Content-Type'] = 'application/json'

    with metrics.http_request_timer(endpoint) as timer:
        response = self.__session.request(
            method,
            url,
            auth=self.__auth_header,
            data=data,
            params=params,
            timeout=(DEFAULT_CONNECTION_TIMEOUT, DEFAULT_REST_TIMEOUT),
            **kwargs)
        timer.tags[metrics.Tag.http_status_code] = response.status_code

    # Rate Limit reference: https://developer.twitter.com/en/docs/basics/rate-limiting
    # LOGGER.info('headers = {}'.format(response.headers))
    rate_limit = int(response.headers.get('x-rate-limit-limit'))
    rate_limit_remaining = int(response.headers.get('x-rate-limit-remaining'))
    rate_limit_reset = int(response.headers.get('x-rate-limit-reset'))
    rate_limit_percent_remaining = 100 * (Decimal(rate_limit_remaining) / Decimal(rate_limit))

    # Wait for reset if remaining calls are less than 5%
    if rate_limit_percent_remaining < 5:
        LOGGER.warning(
            'Rate Limit Warning: {}; remaining calls: {}; remaining %: {}%'.format(
                rate_limit, rate_limit_remaining, int(rate_limit_percent_remaining)))
        wait_time = rate_limit_reset - int(time.time())
        LOGGER.warning('Waiting for {} seconds.'.format(wait_time))
        time.sleep(int(wait_time))

    if response.status_code in (420, 429):
        raise Server42xRateLimitError()
    elif response.status_code >= 500:
        raise Server5xxError()
    elif response.status_code == 400:
        error_combined = raise_for_error(response)
        if 'INVALID_ACCOUNT_SERVICE_LEVEL' in error_combined:
            return None
    elif response.status_code != 200:
        error_combined = raise_for_error(response)

    return response.json()
def authed_get(source, url, headers=None):
    with metrics.http_request_timer(source) as timer:
        # Avoid a mutable default argument; fall back to an empty dict
        session.headers.update(headers or {})
        resp = session.request(method='get', url=url)
        timer.tags[metrics.Tag.http_status_code] = resp.status_code
        return resp
def gen_request(stream_id, url):
    with metrics.http_request_timer(stream_id) as timer:
        resp = requests.get(url, auth=HTTPBasicAuth(CONFIG["consumer_key"], CONFIG["consumer_secret"]))
        timer.tags[metrics.Tag.http_status_code] = resp.status_code
        resp.raise_for_status()
        return resp.json()
def rest_request(self, method, url, **kwargs):
    with metrics.http_request_timer(url):
        url = self.get_url(url, rest=True)
        return self._request(method, url, headers=self.rest_headers, **kwargs)
def gen_request(stream_id, url):
    with metrics.http_request_timer(stream_id) as timer:
        resp = requests.get(url, headers={"token": CONFIG["api_key"]})
        timer.tags[metrics.Tag.http_status_code] = resp.status_code
        resp.raise_for_status()
        return resp.json()
def authed_get(source, url, params):
    with metrics.http_request_timer(source) as timer:
        resp = session.request(method='get', url=url, params=params)
        timer.tags[metrics.Tag.http_status_code] = resp.status_code
        return resp
def __process_periodic_data_calcs(req_state, scenario_name='Actual', currency_code='USD'):
    # pylint: disable=too-many-statements
    entity_types = ['assets']  # Currently: assets only (not funds)
    period_types = req_state.period_types.strip().replace(' ', '').split(',')
    batch_size = 10000
    end_dttm = req_state.end_date
    max_bookmark_value = req_state.last_date

    # Init params_list and results
    i_get_params_list = req_state.client.factory.create('ArrayOfBaseRequestParameters')
    results = []
    req_id = 1
    batch = 1
    update_count = 0

    # Base objects
    data_value_types = req_state.client.factory.create('DataValueTypes')

    # scenario_id for scenario_name
    scenarios = req_state.client.service.GetScenarios()
    scenario = [i for i in scenarios.NamedEntity if i.Name == scenario_name][0]
    scenario_id = scenario.Id

    # current_date
    date_types = req_state.client.factory.create('DateTypes')
    current_date = req_state.client.factory.create('Date')
    current_date.Type = date_types.Current

    # latest_date
    latest_date = req_state.client.factory.create('Date')
    latest_date.Type = date_types.Latest

    # Get all calc data items
    data_item_search_criteria = req_state.client.factory.create('DataItemsSearchCriteria')
    data_item_search_criteria.GetGlobalDataItemsOnly = True  # Global Data Items ONLY
    data_items = req_state.client.service.GetDataItems(data_item_search_criteria)
    calc_data_items = [
        i for i in data_items.DataItemObjectEx if i.FormulaTypeIDsString
    ]  # TESTING (add): and 'Gross Margin' in i.Name
    calc_data_items_len = len(calc_data_items)

    # entity_type loop
    for entity_type in entity_types:  # funds, assets pylint: disable=too-many-nested-blocks
        LOGGER.info('entity_type = %s', entity_type)  # COMMENT OUT

        # entity_ids for funds_or_assets
        if entity_type == 'funds':
            entities = req_state.client.service.GetFunds()
            entity_objs = entities.Fund
            # entity_objs = [i for i in entity_objs if 'IV, L.P.' in i.ExcelName] # COMMENT OUT
        else:  # assets
            entities = req_state.client.service.GetAssets()
            entity_objs = entities.Asset
            # entity_objs = [i for i in entity_objs if 'Guild Education' in i.Name] # TESTING: COMMENT OUT
        entity_objs_len = len(entity_objs)

        # calc_data_items loop
        cdi = 1
        for data_item in calc_data_items:
            data_item_id = data_item.Id
            data_item_name = data_item.Name
            LOGGER.info('data_item_name = %s (%s)', data_item_name, data_item_id)  # COMMENT OUT

            # data_value_type for data_item
            data_value_type_id = data_item.DataValueType
            data_value_type = data_value_types[data_value_type_id]

            # entity loop
            ent = 1
            for entity in entity_objs:
                entity_dict = ilevel.sobject_to_dict(entity)
                entity_id = entity_dict.get('Id')
                # LOGGER.info('entity = {} ({})'.format(entity_name, entity_id)) # COMMENT OUT
                entity_initial_dttm = datetime.strptime(entity_dict.get('InitialPeriod')[:10], '%Y-%m-%d')
                start_dttm = datetime.strptime(req_state.last_date, '%Y-%m-%d')
                max_dttm = [start_dttm, entity_initial_dttm]
                # Choose the earliest date for which there is data for an entity
                start_dttm = max(i for i in max_dttm if i is not None)
                # LOGGER.info('periodic_data_calculated: {}, {}: {} ({})'.format(
                #     data_item_name, entity_type, entity_name, entity_id)) # COMMENT OUT
                entity_path = ilevel.create_entity_path(req_state, [entity_id])

                # period_type loop
                last_period_type = period_types[-1]
                for period_type in period_types:
                    period, period_diff = ilevel.get_periods(req_state, start_dttm, end_dttm, period_type)

                    # offset_period loop (0, -1, -2, ...) look-back
                    pd = 0
                    while pd <= period_diff + 1:
                        # LOGGER.info('{}: periodic_data_calculated: {}, Period Type: {}, Offset: {}'.format(
                        #     req_id, data_item_name, period_type, -pd)) # COMMENT OUT
                        offset_period = copy.copy(period)
                        offset_period.IsOffset = True
                        offset_period.Quantity = int(-1 * pd)

                        i_get_params = req_state.client.factory.create('AssetAndFundGetRequestParameters')
                        i_get_params.RequestIdentifier = req_id
                        i_get_params.DataValueType = data_value_type
                        i_get_params.EntitiesPath = entity_path
                        i_get_params.DataItemId = data_item_id
                        i_get_params.ScenarioId = scenario_id
                        i_get_params.Period = period
                        i_get_params.Offset = offset_period
                        i_get_params.EndOfPeriod = latest_date
                        i_get_params.ReportedDate = current_date
                        i_get_params.CurrencyCode = currency_code
                        i_get_params_list.BaseRequestParameters.append(i_get_params)
                        # LOGGER.info('i_get_params = {}'.format(i_get_params)) # COMMENT OUT

                        # run iGetBatch
                        end_of_batches = False
                        if (pd == (period_diff + 1) and period_type == last_period_type
                                and ent == entity_objs_len and cdi == calc_data_items_len
                                and entity_type == 'assets'):
                            end_of_batches = True
                            LOGGER.info('xxx END OF BATCHES xxx')

                        if (req_id % batch_size == 0) or end_of_batches:
                            LOGGER.info('xxx BATCH: %s xxx', batch)
                            i_get_count = len(i_get_params_list)

                            i_get_request = req_state.client.factory.create('DataServiceRequest')
                            i_get_request.IncludeStandardizedDataInfo = True
                            i_get_request.IncludeExcelFormula = True
                            i_get_request.ParametersList = i_get_params_list
                            # LOGGER.info('i_get_request = {}'.format(i_get_request)) # COMMENT OUT

                            # pylint: disable=unused-variable
                            metrics_string = (
                                'periodic_data_calculated, iGetBatch #{}: {} requests'.format(
                                    batch, i_get_count))
                            with metrics.http_request_timer(metrics_string) as timer:
                                data_values = req_state.client.service.iGetBatch(i_get_request)
                            # LOGGER.info('data_values = {}'.format(data_values)) # COMMENT OUT

                            if isinstance(data_values, str):
                                continue

                            try:
                                periodic_data_records = data_values.DataValue
                            except Exception as err:
                                LOGGER.error('%s', err)
                                LOGGER.error('data_values dict = %s', ilevel.sobject_to_dict(data_values))
                                raise err

                            for periodic_data_record in periodic_data_records:
                                if "Error" in periodic_data_record:
                                    continue
                                if "NoDataAvailable" in periodic_data_record:
                                    continue
                                periodic_data_record_dict = ilevel.sobject_to_dict(periodic_data_record)
                                # LOGGER.info('period_data_record_dict = {}'.format(periodic_data_record_dict)) # COMMENT OUT
                                transformed_record = transform_json(periodic_data_record_dict)
                                # LOGGER.info('transformed_record = {}'.format(transformed_record)) # COMMENT OUT

                                if 'value' in transformed_record:
                                    value = transformed_record.get('value')
                                    value_string = str(value)
                                    if type(value) in (int, float):
                                        value_numeric = float(value)
                                    else:
                                        value_numeric = None
                                    if value == 'No Data Available':
                                        LOGGER.info('No Data Available, skipping record')
                                        continue

                                    sd_parameters = transformed_record.get('sd_parameters', {})
                                    excel_formula = transformed_record.get('excel_formula')
                                    currency_code = sd_parameters.get('currency_code')
                                    data_item_id = sd_parameters.get('data_item_id')
                                    data_value_type = sd_parameters.get('data_value_type')
                                    detail_id = sd_parameters.get('detail_id')
                                    entity_id = next(
                                        iter(sd_parameters.get('entities_path', {}).get('path', {}).get('int', [])),
                                        None)
                                    scenario_id = sd_parameters.get('scenario_id')
                                    period_type = sd_parameters.get('period', {}).get('type')
                                    end_of_period_value = sd_parameters.get('end_of_period', {}).get('value')
                                    reported_date_value = sd_parameters.get('reported_date', {}).get('value')
                                    exchange_rate_type = sd_parameters.get('exchange_rate', {}).get('type')
                                    request_id = sd_parameters.get('request_identifier')
                                    standardized_data_id = sd_parameters.get('standardized_data_id')

                                    # Primary key dimensions, create md5 hash key
                                    dimensions = {
                                        'data_item_id': data_item_id,
                                        'entity_id': entity_id,
                                        'scenario_id': scenario_id,
                                        'period_type': period_type,
                                        'end_of_period_value': end_of_period_value,
                                        'currency_code': currency_code,
                                        'exchange_rate_type': exchange_rate_type,
                                        'data_value_type': data_value_type
                                    }
                                    hash_key = str(hash_data(json.dumps(dimensions, sort_keys=True)))

                                    new_record = {
                                        'hash_key': hash_key,
                                        'excel_formula': excel_formula,
                                        'currency_code': currency_code,
                                        'data_item_id': data_item_id,
                                        'data_value_type': data_value_type,
                                        'detail_id': detail_id,
                                        'entity_id': entity_id,
                                        'scenario_id': scenario_id,
                                        'period_type': period_type,
                                        'end_of_period_value': end_of_period_value,
                                        'reported_date_value': reported_date_value,
                                        'exchange_rate_type': exchange_rate_type,
                                        'request_id': request_id,
                                        'standardized_data_id': standardized_data_id,
                                        'value': value,
                                        'value_string': value_string,
                                        'value_numeric': value_numeric
                                    }
                                    results.append(new_record)
                            # end for rec in period_data_records

                            # Process batch records
                            max_bookmark_value, process_record_count = process_records(
                                result_records=results,
                                req_state=req_state,
                                deletion_flag=False,
                                max_bookmark_value=max_bookmark_value)
                            update_count = update_count + process_record_count

                            # Init new params_list and results
                            i_get_params_list = req_state.client.factory.create('ArrayOfBaseRequestParameters')
                            results = []
                            batch = batch + 1
                        # end iGetBatch

                        req_id = req_id + 1
                        pd = pd + 1
                    # end offset_period loop
                # end period_type loop
                ent = ent + 1
            # end entity_id loop
            cdi = cdi + 1
        # end calc_data_items loop
    # end entity_type loop

    # Update the state with the max_bookmark_value for the stream after ALL records
    # Always process past year of calculated data (Subtract 365 days from max_bookmark_value)
    max_bookmark_dttm = datetime.strptime(max_bookmark_value[:10], "%Y-%m-%d") - timedelta(days=365)
    max_bookmark_value = max_bookmark_dttm.strftime("%Y-%m-%d")
    singer_ops.write_bookmark(req_state.state, req_state.stream_name, max_bookmark_value)
    return update_count
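# The `hash_data` helper used to build `hash_key` above is not part of this collection.
# A minimal sketch, assuming (per the "create md5 hash key" comment) it returns an md5
# digest of the serialized dimensions string; the implementation is an assumption:

import hashlib


def hash_data(serialized):
    """Return an md5 hex digest of a serialized primary-key dimensions string (assumed implementation)."""
    return hashlib.md5(serialized.encode('utf-8')).hexdigest()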
def request_metrics_patch(self, method, url, **kwargs):
    with singer_metrics.http_request_timer(None):
        return request(self, method, url, **kwargs)
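# `request_metrics_patch` assumes a module-level `request` that holds the original,
# unpatched method. A hedged sketch of how such a patch might be wired onto
# `requests.Session` (the target class here is an assumption for illustration):

import requests
from singer import metrics as singer_metrics

# Capture the original method before replacing it, so the wrapper can delegate to it.
request = requests.Session.request


def request_metrics_patch(self, method, url, **kwargs):
    with singer_metrics.http_request_timer(None):
        return request(self, method, url, **kwargs)


# Every Session.request call is now wrapped in an http_request_timer.
requests.Session.request = request_metrics_patch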
def get_objects(self):
    updated_at_min = self.get_bookmark()
    stop_time = singer.utils.now().replace(microsecond=0)

    # Retrieve data for max 1 year. Otherwise log incremental needed.
    diff_days = (stop_time - updated_at_min).days
    yearly = False
    if diff_days > 365:
        yearly = True
        stop_time = updated_at_min + datetime.timedelta(days=365)
        LOGGER.info("This import will only import the first year of historical data. "
                    "You need to trigger further incremental imports to get the missing rows.")

    date_window_size = float(Context.config.get("date_window_size", DATE_WINDOW_SIZE))
    results_per_page = Context.get_results_per_page(RESULTS_PER_PAGE)

    # Page through till the end of the resultset
    while updated_at_min < stop_time:
        # Bookmarking can also occur on the since_id
        since_id = self.get_since_id() or 1
        if since_id != 1:
            LOGGER.info("Resuming sync from since_id %d", since_id)

        # It's important that `updated_at_min` has microseconds
        # truncated. Why has been lost to the mists of time but we
        # think it has something to do with how the API treats
        # microseconds on its date windows. Maybe it's possible to
        # drop data due to rounding errors or something like that?
        updated_at_max = updated_at_min + datetime.timedelta(days=date_window_size)
        if updated_at_max > stop_time:
            updated_at_max = stop_time
        singer.log_info("getting from %s - %s", updated_at_min, updated_at_max)

        min_filer_key = self.get_min_replication_key()
        max_filer_key = self.get_max_replication_key()

        while True:
            status_key = self.status_key or "status"
            query_params = {
                "since_id": since_id,
                min_filer_key: updated_at_min,
                max_filer_key: updated_at_max,
                "limit": results_per_page,
            }
            if self.add_status:
                query_params[status_key] = "any"

            with metrics.http_request_timer(self.name):
                objects = self.call_api(query_params)

            for obj in objects:
                if obj.id < since_id:
                    # This verifies the api behavior expectation we
                    # have that all results actually honor the
                    # since_id parameter.
                    raise OutOfOrderIdsError("obj.id < since_id: {} < {}".format(obj.id, since_id))
                yield obj

            # You know you're at the end when the current page has
            # less than the request size limits you set.
            singer.log_info(f"Got {len(objects)} records")
            if len(objects) < results_per_page:
                # Save the updated_at_max as our bookmark as we've synced all rows up in our
                # window and can move forward. Also remove the since_id because we want to
                # restart at 1.
                Context.state.get('bookmarks', {}).get(self.name, {}).pop('since_id', None)
                state_val = updated_at_max
                if self.skip_day:
                    state_val = state_val + datetime.timedelta(days=1)
                self.update_bookmark(utils.strftime(state_val))
                break

            if objects[-1].id != max([o.id for o in objects]):
                # This verifies the api behavior expectation we have
                # that all pages are internally ordered by the
                # `since_id`.
                raise OutOfOrderIdsError("{} is not the max id in objects ({})".format(
                    objects[-1].id, max([o.id for o in objects])))
            since_id = objects[-1].id

            # Put since_id into the state.
            self.update_bookmark(since_id, bookmark_key='since_id')

        updated_at_min = updated_at_max + datetime.timedelta(seconds=1)
        if self.skip_day:
            updated_at_min = updated_at_min + datetime.timedelta(days=1)

    if yearly:
        LOGGER.info("This import only imported one year of historical data. "
                    "Please trigger further incremental data to get the missing rows.")
def request(self, method, path=None, url=None, **kwargs):
    # pylint: disable=too-many-branches,too-many-statements
    if not self.__verified:
        self.__verified = self.check_access_token()

    if not url and self.__base_url is None:
        self.__base_url = 'https://api.rechargeapps.com/'

    if not url and path:
        url = self.__base_url + path

    if 'endpoint' in kwargs:
        endpoint = kwargs['endpoint']
        del kwargs['endpoint']
    else:
        endpoint = None

    if 'headers' not in kwargs:
        kwargs['headers'] = {}
    kwargs['headers']['X-Recharge-Access-Token'] = self.__access_token
    kwargs['headers']['Accept'] = 'application/json'

    # If no API Version is specified for the call, Recharge uses the store's default
    # API Version. The 'collections' endpoint was added in API Version '2021-11'; older
    # versions return empty records, so set 'X-Recharge-Version' for 'collections' calls.
    if path == 'collections':
        kwargs['headers']['X-Recharge-Version'] = '2021-11'

    if self.__user_agent:
        kwargs['headers']['User-Agent'] = self.__user_agent

    if method == 'POST':
        kwargs['headers']['Content-Type'] = 'application/json'

    # Intermittent JSONDecodeErrors when parsing JSON; Adding 2 attempts
    # FIRST ATTEMPT
    with metrics.http_request_timer(endpoint) as timer:
        response = self.__session.request(method, url, stream=True, timeout=self.request_timeout, **kwargs)
        timer.tags[metrics.Tag.http_status_code] = response.status_code

    if response.status_code >= 500:
        raise Server5xxError()

    if response.status_code == 429:
        # Delay for 5 seconds for leaky bucket rate limit algorithm
        time.sleep(5)
        raise Server429Error()

    if response.status_code != 200:
        raise_for_error(response)

    # Catch invalid JSON (e.g. unterminated string errors)
    try:
        response_json = response.json()
        return response_json, response.links
    except ValueError as err:  # includes simplejson.decoder.JSONDecodeError
        LOGGER.warning(err)

    # SECOND ATTEMPT, if there is a ValueError (unterminated string error)
    with metrics.http_request_timer(endpoint) as timer:
        response = self.__session.request(method, url, stream=True, timeout=self.request_timeout, **kwargs)
        timer.tags[metrics.Tag.http_status_code] = response.status_code

    if response.status_code >= 500:
        raise Server5xxError()

    if response.status_code == 429:
        raise Server429Error()

    if response.status_code != 200:
        raise_for_error(response)

    # Log invalid JSON (e.g. unterminated string errors)
    try:
        response_json = response.json()
        return response_json, response.links
    except ValueError as err:  # includes simplejson.decoder.JSONDecodeError
        LOGGER.error(err)
        raise Exception(err)
def get_gzip_json(self, url, endpoint):
    resp = None
    with metrics.http_request_timer(endpoint) as timer:
        resp = self.__session.request(method='GET', url=url, timeout=60)
        timer.tags[metrics.Tag.http_status_code] = resp.status_code
    return self.unzip(resp.content)
def download_request(self, start_date, end_date):
    # returns a datastream for the csv file
    with metrics.http_request_timer("create_report") as timer:
        url = BASE_URL + "api/v1/shops/" + self.shop_id + "/" + "click-reports"
        headers = {
            "Content-Type": "application/json",
            "Authorization": "Bearer " + self.access_token,
        }
        data = {"from": start_date, "to": end_date, "site": self.site}
        response = requests.post(url, headers=headers, data=json.dumps(data))
        timer.tags[metrics.Tag.http_status_code] = response.status_code

    if response.status_code in [429, 502]:
        raise RateLimitException()
    if "The date should be in the past" in response.text:
        # checks whether the end date was set correctly
        return -1
    response.raise_for_status()

    LOGGER.debug(response.json())
    status = response.json()["status"]
    report_id = response.json()["id"]

    # file was requested, now poll for availability
    while status == "PROCESSING":  # maybe add a timeout of 1 minute or similar
        LOGGER.info("Check whether the report is online")
        with metrics.http_request_timer("poll_report status") as timer:
            url = BASE_URL + "api/v1/shops/" + self.shop_id + "/" + "click-reports" + "/" + report_id
            headers = {
                "Authorization": "Bearer " + self.access_token,
            }
            if self.user_agent:
                headers["User-Agent"] = self.user_agent
            request = requests.Request("GET", url, headers=headers)
            response = self.session.send(request.prepare())
            timer.tags[metrics.Tag.http_status_code] = response.status_code

        if response.status_code in [429, 502]:
            raise RateLimitException()
        response.raise_for_status()

        LOGGER.debug(response.json())
        status = response.json()["status"]
        if status == "PROCESSING":
            LOGGER.info("waiting 1 sec")
            time.sleep(1)

    # it is no longer processing - so either FAILED or SUCCESSFUL
    if status == "FAILED":
        raise Exception()
    else:
        # if successful, download the report
        with metrics.http_request_timer("download_status") as timer:
            url = BASE_URL + "api/v1/shops/" + self.shop_id + "/" + "click-reports" + "/" + report_id + "/download"
            headers = {"Authorization": "Bearer " + self.access_token}
            request = requests.Request("GET", url, headers=headers)
            response = self.session.send(request.prepare())
            timer.tags[metrics.Tag.http_status_code] = response.status_code

        if response.status_code in [429, 502]:
            raise RateLimitException()
        response.raise_for_status()

        # extract the files inside the zipfile
        zf = zipfile.ZipFile(io.BytesIO(response.content), 'r')
        for filename in zf.namelist():
            try:
                LOGGER.info("read File %s" % filename)
                return io.StringIO(zf.read(filename).decode())
            except KeyError:
                LOGGER.critical('ERROR: Did not find %s in zip file' % filename)
def perform_igetbatch_operation_for_standardized_id_set(id_set, req_state):
    data_value_types = req_state.client.factory.create('DataValueTypes')

    # current_date
    date_types = req_state.client.factory.create('DateTypes')
    current_date = req_state.client.factory.create('Date')
    current_date.Type = date_types.Current

    # latest_date
    latest_date = req_state.client.factory.create('Date')
    latest_date.Type = date_types.Latest

    req_id = 1
    id_set_len = len(id_set)
    i_get_params_list = req_state.client.factory.create('ArrayOfBaseRequestParameters')
    for cur_id in id_set:
        req_id = req_id + 1
        i_get_params = req_state.client.factory.create('AssetAndFundGetRequestParameters')
        i_get_params.StandardizedDataId = cur_id
        i_get_params.RequestIdentifier = req_id
        i_get_params.DataValueType = getattr(data_value_types, 'ObjectId')
        i_get_params.EndOfPeriod = latest_date
        i_get_params.ReportedDate = current_date
        i_get_params_list.BaseRequestParameters.append(i_get_params)

    i_get_request = req_state.client.factory.create('DataServiceRequest')
    i_get_request.IncludeStandardizedDataInfo = True
    i_get_request.IncludeExcelFormula = True
    i_get_request.ParametersList = i_get_params_list

    # pylint: disable=unused-variable
    metrics_string = ('Standardized Data Item iGetBatch: {} requests'.format(id_set_len))
    with metrics.http_request_timer(metrics_string) as timer:
        data_values = req_state.client.service.iGetBatch(i_get_request)
    # LOGGER.info('data_values dict = {}'.format(sobject_to_dict(data_values))) # COMMENT OUT

    if isinstance(data_values, str):
        return []

    try:
        periodic_data_records = data_values.DataValue
    except Exception as err:
        LOGGER.error('{}'.format(err))
        LOGGER.error('data_values dict = {}'.format(sobject_to_dict(data_values)))
        raise err

    results = []
    for periodic_data_record in periodic_data_records:
        if "Error" in periodic_data_record:
            continue
        if "NoDataAvailable" in periodic_data_record:
            continue
        periodic_data_record_dict = sobject_to_dict(periodic_data_record)
        # LOGGER.info('period_data_record_dict = {}'.format(periodic_data_record_dict)) # COMMENT OUT
        transformed_record = transform_json(periodic_data_record_dict)
        # LOGGER.info('transformed_record = {}'.format(transformed_record)) # COMMENT OUT

        if 'value' in transformed_record:
            value = transformed_record.get('value')
            value_string = str(value)
            if type(value) in (int, float):
                value_numeric = float(value)
            else:
                value_numeric = None
            if value == 'No Data Available':
                continue

            sd_parameters = transformed_record.get('sd_parameters', {})
            excel_formula = transformed_record.get('excel_formula')
            currency_code = sd_parameters.get('currency_code')
            data_item_id = sd_parameters.get('data_item_id')
            data_value_type = sd_parameters.get('data_value_type')
            detail_id = sd_parameters.get('detail_id')
            scenario_id = sd_parameters.get('scenario_id')
            period_type = sd_parameters.get('period', {}).get('type')
            end_of_period_value = sd_parameters.get('end_of_period', {}).get('value')
            reported_date_value = sd_parameters.get('reported_date', {}).get('value')
            exchange_rate_type = sd_parameters.get('exchange_rate', {}).get('type')
            request_id = sd_parameters.get('request_identifier')
            standardized_data_id = sd_parameters.get('standardized_data_id')
            entity_ids = sd_parameters.get('entities_path', {}).get('path', {}).get('int', [])

            for entity_id in entity_ids:
                # Primary key dimensions, create md5 hash key
                dimensions = {
                    'data_item_id': data_item_id,
                    'entity_id': entity_id,
                    'scenario_id': scenario_id,
                    'period_type': period_type,
                    'end_of_period_value': end_of_period_value,
                    'currency_code': currency_code,
                    'exchange_rate_type': exchange_rate_type,
                    'data_value_type': data_value_type
                }
                hash_key = str(hash_data(json.dumps(dimensions, sort_keys=True)))

                new_record = {
                    'hash_key': hash_key,
                    'excel_formula': excel_formula,
                    'currency_code': currency_code,
                    'data_item_id': data_item_id,
                    'data_value_type': data_value_type,
                    'detail_id': detail_id,
                    'entity_id': entity_id,
                    'scenario_id': scenario_id,
                    'period_type': period_type,
                    'end_of_period_value': end_of_period_value,
                    'reported_date_value': reported_date_value,
                    'exchange_rate_type': exchange_rate_type,
                    'request_id': request_id,
                    'standardized_data_id': standardized_data_id,
                    'value': value,
                    'value_string': value_string,
                    'value_numeric': value_numeric
                }
                results.append(new_record)
    # end for rec in periodic_data_records
    # LOGGER.info('results = {}'.format(results)) # COMMENT OUT
    return results
def gen_request(stream_id, url):
    with metrics.http_request_timer(stream_id) as timer:
        resp = requests.get(url)
        timer.tags[metrics.Tag.http_status_code] = resp.status_code
        resp.raise_for_status()
        return resp.json()
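# None of the simple gen_request helpers above retry on transient failures. A hedged
# sketch of how one could be wrapped with the backoff library (the retry policy and the
# gen_request_with_retry name are illustrative additions, not part of the original snippets):

import backoff
import requests
from singer import metrics


@backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_tries=5, factor=2)
def gen_request_with_retry(stream_id, url):
    # Same shape as gen_request above, retried with exponential backoff on request errors.
    with metrics.http_request_timer(stream_id) as timer:
        resp = requests.get(url)
        timer.tags[metrics.Tag.http_status_code] = resp.status_code
        resp.raise_for_status()
        return resp.json()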
def request(self, method, url=None, path=None, headers=None, json=None, version=None, **kwargs):
    if not self.__verified:
        self.__verified = self.check_access()

    if not url and path:
        url = '{}/{}'.format(self.base_url, path)

    if 'endpoint' in kwargs:
        endpoint = kwargs['endpoint']
        del kwargs['endpoint']
    else:
        endpoint = None

    if not headers:
        headers = {}

    # API Version: https://developer.github.com/v3/#current-version
    if not version:
        version = 'v3'
    headers['Accept'] = 'application/vnd.github.{}+json'.format(version)

    # Authentication: https://developer.github.com/v3/#authentication
    headers['Authorization'] = 'Token {}'.format(self.__api_token)

    if self.__user_agent:
        headers['User-Agent'] = self.__user_agent

    if method == 'POST':
        headers['Content-Type'] = 'application/json'

    with metrics.http_request_timer(endpoint) as timer:
        response = self.__session.request(method=method, url=url, headers=headers, json=json, **kwargs)
        timer.tags[metrics.Tag.http_status_code] = response.status_code

    if response.status_code >= 500:
        raise Server5xxError()

    # Pagination: https://developer.github.com/v3/guides/traversing-with-pagination/
    links_header = response.headers.get('Link')
    links = []
    next_url = None
    if links_header:
        links = links_header.split(',')
    for link in links:
        try:
            url, rel = re.search(r'^\<(https.*)\>; rel\=\"(.*)\"$', link.strip()).groups()
            if rel == 'next':
                next_url = url
        except AttributeError:
            next_url = None

    # last-modified: https://developer.github.com/v3/#conditional-requests
    last_modified = response.headers.get('Last-Modified')
    last_modified_str = None
    if last_modified:
        last_modified_dttm = datetime.strptime(last_modified, '%a, %d %b %Y %H:%M:%S %Z')
        last_modified_str = last_modified_dttm.strftime("%Y-%m-%dT%H:%M:%SZ")

    # 304: File Not Modified status_code
    if response.status_code == 304:
        LOGGER.warning('304: FILE NOT UPDATED, Stream: {}, URL: {}'.format(endpoint, url))
        return None, next_url, last_modified_str

    # Catch 403 error with message:
    # "You have triggered an abuse detection mechanism. Please wait a few minutes before you try again."
    # Reference: https://developer.github.com/v3/#abuse-rate-limits
    if response.status_code == 403:
        response_json = response.json()
        response_message = response_json.get('message', '')
        if 'abuse detection mechanism.' in response_message:
            # Wait 3 minutes
            LOGGER.warning('Abuse Detection 403 Error: API triggered an abuse detection mechanism. '
                           'Waiting 3 mins and trying again.')
            time.sleep(180)  # Wait for 3 minutes
            raise AbuseDetection403Error(response)

    if response.status_code != 200:
        raise_for_error(response)

    response_json = response.json()
    return response_json, next_url, last_modified_str
def aqua_request(self, method, url, **kwargs):
    with metrics.http_request_timer(url):
        url = self.get_url(url, rest=False)
        return self._request(method, url, auth=self.aqua_auth, **kwargs)
def sync_statistics_for_day(
    config,
    state,
    stream,
    sdk_client,
    token,
    start,
    report_metrics,
    report_dimensions,
):  # pylint: disable=too-many-locals
    """Sync and output Criteo Statistics endpoint for one day."""
    mdata = metadata.to_map(stream.metadata)
    stats_query = {
        "report_type": stream.tap_stream_id,
        "dimensions": report_dimensions,
        "metrics": report_metrics,
        "start_date": start.strftime("%Y-%m-%d"),
        "end_date": start.strftime("%Y-%m-%d"),
        "currency": metadata.get(mdata, (), "tap-criteo.currency"),
    }

    # Filter advertiser_ids if defined in config
    advertiser_ids = config.get("advertiser_ids")
    if advertiser_ids:
        stats_query["advertiserId"] = advertiser_ids

    # Add ignore_x_device if defined in metadata
    ignore_x_device = metadata.get(mdata, (), "tap-criteo.ignoreXDevice")
    if ignore_x_device:
        stats_query["tap-criteo.ignoreXDevice"] = ignore_x_device

    # Fetch the report as a csv string
    with metrics.http_request_timer(stream.tap_stream_id):
        result = get_statistics_report(sdk_client, stats_query, token=token)
    csv_reader = parse_csv_string(mdata, result)

    with metrics.record_counter(stream.tap_stream_id) as counter:
        time_extracted = utils.now()
        with Transformer() as bumble_bee:
            for row in csv_reader:
                row["_sdc_report_datetime"] = REPORT_RUN_DATETIME
                row["_sdc_report_currency"] = metadata.get(mdata, (), "tap-criteo.currency")
                row = bumble_bee.transform(row, stream.schema.to_dict())
                singer.write_record(stream.stream, row, time_extracted=time_extracted)
                counter.increment()

    if start > get_start_for_stream(config, state, advertiser_ids, stream.stream):
        LOGGER.info(
            "updating bookmark: %s > %s",
            start,
            get_start_for_stream(config, state, advertiser_ids, stream.stream),
        )
        bookmarks.write_bookmark(
            state,
            state_key_name(advertiser_ids, stream.stream),
            "date",
            utils.strftime(start),
        )
        singer.write_state(state)
    else:
        LOGGER.info(
            "not updating bookmark: %s <= %s",
            start,
            get_start_for_stream(config, state, advertiser_ids, stream.stream),
        )

    LOGGER.info(
        "Done syncing %s records for the %s report for advertiser_ids %s on %s",
        counter.value,
        stream.stream,
        advertiser_ids,
        start,
    )
def get_objects(self):
    updated_at_min = self.get_bookmark()
    stop_time = singer.utils.now().replace(microsecond=0)
    date_window_size = float(Context.config.get("date_window_size", DATE_WINDOW_SIZE))

    # Page through till the end of the resultset
    while updated_at_min < stop_time:
        # Bookmarking can also occur on the since_id
        since_id = self.get_since_id() or 1
        if since_id != 1:
            LOGGER.info("Resuming sync from since_id %d", since_id)

        # It's important that `updated_at_min` has microseconds
        # truncated. Why has been lost to the mists of time but we
        # think it has something to do with how the API treats
        # microseconds on its date windows. Maybe it's possible to
        # drop data due to rounding errors or something like that?
        updated_at_max = updated_at_min + datetime.timedelta(days=date_window_size)
        if updated_at_max > stop_time:
            updated_at_max = stop_time

        while True:
            status_key = self.status_key or "status"
            query_params = self.get_query_params(since_id, status_key, updated_at_min, updated_at_max)

            with metrics.http_request_timer(self.name):
                objects = self.call_api(query_params)

            for obj in objects:
                if obj.id < since_id:
                    # This verifies the api behavior expectation we
                    # have that all results actually honor the
                    # since_id parameter.
                    raise OutOfOrderIdsError("obj.id < since_id: {} < {}".format(obj.id, since_id))
                yield obj

            # You know you're at the end when the current page has
            # less than the request size limits you set.
            if len(objects) < self.results_per_page:
                # Save the updated_at_max as our bookmark as we've synced all rows up in our
                # window and can move forward. Also remove the since_id because we want to
                # restart at 1.
                Context.state.get('bookmarks', {}).get(self.name, {}).pop('since_id', None)
                self.update_bookmark(utils.strftime(updated_at_max))
                break

            if objects[-1].id != max([o.id for o in objects]):
                # This verifies the api behavior expectation we have
                # that all pages are internally ordered by the
                # `since_id`.
                raise OutOfOrderIdsError("{} is not the max id in objects ({})".format(
                    objects[-1].id, max([o.id for o in objects])))
            since_id = objects[-1].id

            # Put since_id into the state.
            self.update_bookmark(since_id, bookmark_key='since_id')

        updated_at_min = updated_at_max
def sync_generic_endpoint(config, state, stream, sdk_client, token):
    """Sync a stream which is backed by a generic Criteo endpoint."""
    stream = add_synthetic_keys_to_stream_schema(stream)
    stream = add_synthetic_keys_to_stream_metadata(stream)
    mdata = metadata.to_map(stream.metadata)
    primary_keys = metadata.get(mdata, (), "table-key-properties") or []
    LOGGER.info("{} primary keys are {}".format(stream.stream, primary_keys))
    singer.write_schema(stream.stream, stream.schema.to_dict(), primary_keys)

    advertiser_ids = config.get("advertiser_ids", None)
    if stream.tap_stream_id == "Audiences":
        if not advertiser_ids:
            LOGGER.warn("%s stream needs at least one advertiser_id defined in config" % stream.stream)
        for advertiser_id in advertiser_ids.split(","):
            token = refresh_auth_token(sdk_client, token)
            with metrics.http_request_timer(stream.tap_stream_id):
                result = get_audiences_endpoint(sdk_client, advertiser_id, token=token)
    else:
        module = GENERIC_ENDPOINT_MAPPINGS[stream.tap_stream_id]["module"]
        method = GENERIC_ENDPOINT_MAPPINGS[stream.tap_stream_id]["method"]
        if stream.tap_stream_id in (
                "Portfolio",
                "AdvertiserInfo",
                "Sellers",
                "SellerBudgets",
                "SellerCampaigns",
        ):
            result = call_generic_endpoint(stream, sdk_client, module, method, token=token)
        else:
            result = call_generic_endpoint(
                stream,
                sdk_client,
                module,
                method,
                advertiser_ids=advertiser_ids,
                token=token,
            )

    result = convert_keys_snake_to_camel([_.to_dict() for _ in result])

    with metrics.record_counter(stream.tap_stream_id) as counter:
        time_extracted = utils.now()
        with Transformer() as bumble_bee:
            for row in result:
                row["_sdc_report_datetime"] = REPORT_RUN_DATETIME
                row = bumble_bee.transform(row, stream.schema.to_dict())
                singer.write_record(stream.stream, row, time_extracted=time_extracted)
                counter.increment()

    LOGGER.info(
        "Done syncing %s records for the %s report for advertiser_ids %s",
        counter.value,
        stream.stream,
        advertiser_ids,
    )
def sync_report_for_day(stream_name, stream_schema, sdk_client, start, field_list):
    # pylint: disable=too-many-locals
    report_downloader = sdk_client.GetReportDownloader(version=VERSION)
    customer_id = sdk_client.client_customer_id
    report = {
        'reportName': 'Seems this is required',
        'dateRangeType': 'CUSTOM_DATE',
        'reportType': stream_name,
        'downloadFormat': 'CSV',
        'selector': {
            'fields': field_list,
            'dateRange': {
                'min': start.strftime('%Y%m%d'),
                'max': start.strftime('%Y%m%d')
            }
        }
    }

    # Fetch the report as a csv string
    with metrics.http_request_timer(stream_name):
        result = attempt_download_report(report_downloader, report)

    headers, csv_reader = parse_csv_stream(result)
    with metrics.record_counter(stream_name) as counter:
        time_extracted = utils.now()

        with Transformer(singer.UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
            for row in csv_reader:
                obj = dict(zip(get_xml_attribute_headers(stream_schema, headers), row))
                obj['_sdc_customer_id'] = customer_id
                obj['_sdc_report_datetime'] = REPORT_RUN_DATETIME
                bumble_bee.pre_hook = transform_pre_hook
                obj = bumble_bee.transform(obj, stream_schema)

                singer.write_record(stream_name, obj, time_extracted=time_extracted)
                counter.increment()

    if start > get_start_for_stream(sdk_client.client_customer_id, stream_name):
        LOGGER.info('updating bookmark: %s > %s', start,
                    get_start_for_stream(sdk_client.client_customer_id, stream_name))
        bookmarks.write_bookmark(STATE,
                                 state_key_name(sdk_client.client_customer_id, stream_name),
                                 'date',
                                 start.strftime(utils.DATETIME_FMT))
        singer.write_state(STATE)
    else:
        LOGGER.info('not updating bookmark: %s <= %s', start,
                    get_start_for_stream(sdk_client.client_customer_id, stream_name))

    LOGGER.info("Done syncing %s records for the %s report for customer_id %s on %s",
                counter.value, stream_name, customer_id, start)
def request(self, method, path=None, url=None, json=None, version=None, **kwargs):
    if not self.__verified:
        self.__verified = self.check_access()

    if not version:
        version = 'v2'

    if not url and path:
        url = '{}/{}'.format(self.base_url, path)

    if 'endpoint' in kwargs:
        endpoint = kwargs['endpoint']
        del kwargs['endpoint']
    else:
        endpoint = None

    if 'headers' not in kwargs:
        kwargs['headers'] = {}

    # Version represents API version (e.g. v2): https://api.Ujet.com/?http#versioning
    kwargs['headers']['Accept'] = 'application/vnd.json'.format(version)

    if self.__user_agent:
        kwargs['headers']['User-Agent'] = self.__user_agent

    if method == 'POST':
        kwargs['headers']['Content-Type'] = 'application/json'

    with metrics.http_request_timer(endpoint) as timer:
        response = self.__session.request(method=method,
                                          url=url,
                                          auth=(self.__company_key, self.__company_secret),
                                          json=json,
                                          **kwargs)
        timer.tags[metrics.Tag.http_status_code] = response.status_code

    if response.status_code >= 500:
        raise Server5xxError()

    if response.status_code != 200:
        raise_for_error(response)

    # pagination details are returned in the header: total, per-page, next url
    total_records = int(response.headers.get('total', 0))
    # Not returning currently due to client API bug
    per_page = total_records = int(response.headers.get('per-page', 0))

    next_url = None
    if (response.headers.get('link') is not None) and ('link' in response.headers):
        links = response.headers.get('link').split(',')
        next_url = None
        for link in links:
            try:
                url, rel = re.search(r'^\<(https.*)\>; rel\=\"(.*)\"$', link.strip()).groups()
                if rel == 'next':
                    next_url = url
            except AttributeError:
                next_url = None

    return response.json(), total_records, next_url
def wrapped_request(*args, **kwargs):
    url = args[1]
    match = re.match(r'http[s]?://api\.stripe\.com/v1/(\w+)\??', url)
    stream_name = match.groups()[0]
    with metrics.http_request_timer(stream_name):
        return _original_request(*args, **kwargs)
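# `wrapped_request` assumes a module-level `_original_request` captured before patching.
# A hedged sketch of how it might be installed over a bound `requests.Session.request`,
# whose positional signature (method, url, ...) matches the args[1] lookup above; the
# SESSION name and patch target are assumptions for illustration:

import re

import requests
from singer import metrics

SESSION = requests.Session()
_original_request = SESSION.request  # bound method: positional args are (method, url, ...)


def wrapped_request(*args, **kwargs):
    url = args[1]
    match = re.match(r'http[s]?://api\.stripe\.com/v1/(\w+)\??', url)
    stream_name = match.groups()[0]
    with metrics.http_request_timer(stream_name):
        return _original_request(*args, **kwargs)


# Every request made through SESSION is now timed and tagged with its stream name.
SESSION.request = wrapped_request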
def request_export(self, method, url=None, path=None, params=None, json=None, **kwargs):
    if not self.__verified:
        self.__verified = self.check_access()

    if url and path:
        url = '{}/{}'.format(url, path)
    elif path and not url:
        url = 'https://data.mixpanel.com/api/2.0/{}'.format(path)

    if 'endpoint' in kwargs:
        endpoint = kwargs['endpoint']
        del kwargs['endpoint']
    else:
        endpoint = 'export'

    if 'headers' not in kwargs:
        kwargs['headers'] = {}
    kwargs['headers']['Accept'] = 'application/json'

    if self.__user_agent:
        kwargs['headers']['User-Agent'] = self.__user_agent

    if method == 'POST':
        kwargs['headers']['Content-Type'] = 'application/json'

    kwargs['headers']['Authorization'] = 'Basic {}'.format(
        str(base64.urlsafe_b64encode(self.__api_secret.encode("utf-8")), "utf-8"))

    with metrics.http_request_timer(endpoint) as timer:
        with self.__session.request(method=method,
                                    url=url,
                                    params=params,
                                    json=json,
                                    stream=True,
                                    timeout=180,
                                    **kwargs) as response:
            if response.status_code >= 500:
                raise Server5xxError()

            if response.status_code != 200:
                raise_for_error(response)

            # export endpoint returns jsonl results;
            # other endpoints return json with array of results
            # jsonlines reference: https://jsonlines.readthedocs.io/en/latest/
            if response.text == '':
                LOGGER.warning('/export API response empty')
                yield None
            else:
                file_like_object = io.StringIO(response.text)
                reader = jsonlines.Reader(file_like_object)
                for record in reader.iter(allow_none=True, skip_empty=True):
                    yield record

            timer.tags[metrics.Tag.http_status_code] = response.status_code