def save_data(api_request, part):
    '''Loads data chunk from Logs API and saves to ClickHouse.

    Downloads one part of the log request described by ``api_request``,
    drops every line whose number of tab-separated columns differs from
    that of the first line, hands the remaining text to
    ``clickhouse.save_data`` and sets ``api_request.status`` to ``'saved'``.

    Args:
        api_request: object exposing ``request_id``, a writable ``status``
            and ``user_request`` (with ``counter_id``, ``token``, ``source``
            and ``fields``).
        part: part identifier substituted into the download URL.

    Raises:
        ValueError: if the response status code is not 200; the message is
            the response body.
    '''
    url = '{host}/management/v1/counter/{counter_id}/logrequest/{request_id}/part/{part}/download' \
        .format(
            host=HOST,
            counter_id=api_request.user_request.counter_id,
            request_id=api_request.request_id,
            part=part
        )
    # The token travels in a header instead of the query string so that it
    # does not become part of the request URL.
    auth_headers = {'Authorization': 'OAuth ' + api_request.user_request.token}

    r = requests.get(url, headers=auth_headers)
    if r.status_code != 200:
        logger.debug(r.text)
        raise ValueError(r.text)

    splitted_text = r.text.split('\n')
    logger.info('\n'.join(splitted_text[:5]))

    # The first line defines the expected column count for every row.
    headers_num = len(splitted_text[0].split('\t'))
    # A real list (not a lazy ``filter`` object) is required because its
    # length is taken below.
    splitted_text_filtered = [
        line for line in splitted_text
        if len(line.split('\t')) == headers_num
    ]

    num_filtered = len(splitted_text) - len(splitted_text_filtered)
    if num_filtered != 0:
        # The count must be passed as an argument, otherwise the literal
        # '%d' placeholder is what ends up in the log.
        logger.warning('%d rows were filtered out', num_filtered)

    output_data = '\n'.join(splitted_text_filtered)

    clickhouse.save_data(api_request.user_request.source,
                         api_request.user_request.fields,
                         output_data)

    api_request.status = 'saved'
def save_data(api_request, part):
    '''Loads data chunk from Logs API and saves to ClickHouse.

    Downloads one part of the log request described by ``api_request`` and
    drops every line whose number of tab-separated columns differs from
    that of the first line. When at least one data row remains after the
    header line, the header names are mapped through
    ``clickhouse.get_ch_field_name`` and the text is passed to
    ``clickhouse.save_data``; otherwise only a warning is logged. In both
    cases ``api_request.status`` is set to ``'saved'``.

    Args:
        api_request: object exposing ``request_id``, a writable ``status``
            and ``user_request`` (with ``counter_id``, ``token``, ``source``
            and ``fields``).
        part: part identifier substituted into the download URL.

    Raises:
        ValueError: if the response status code is not 200; the message is
            the response body.
    '''
    # Kept as a function-level import, exactly where the original had it.
    import clickhouse

    url = '{host}/management/v1/counter/{counter_id}/logrequest/{request_id}/part/{part}/download' \
        .format(
            host=HOST,
            counter_id=api_request.user_request.counter_id,
            request_id=api_request.request_id,
            part=part
        )
    headers = {'Authorization': 'OAuth ' + api_request.user_request.token}

    r = requests.get(url, headers=headers)
    if r.status_code != 200:
        logger.debug(r.text)
        raise ValueError(r.text)

    splitted_text = r.text.split('\n')
    logger.info('### DATA SAMPLE')
    logger.info('\n'.join(splitted_text[:5]))

    # The first line defines the expected column count for every row.
    headers_num = len(splitted_text[0].split('\t'))
    splitted_text_filtered = [
        line for line in splitted_text
        if len(line.split('\t')) == headers_num
    ]

    num_filtered = len(splitted_text) - len(splitted_text_filtered)
    if num_filtered != 0:
        logger.warning('%d rows were filtered out', num_filtered)

    if len(splitted_text_filtered) > 1:
        # Convert headers to CH column names.
        header_line = '\t'.join(
            map(clickhouse.get_ch_field_name,
                splitted_text_filtered[0].split('\t'))
        )
        output_data = header_line + '\n' + '\n'.join(splitted_text_filtered[1:])
        # To correct escapes in params.
        output_data = output_data.replace(r"\'", "'")

        clickhouse.save_data(api_request.user_request.source,
                             api_request.user_request.fields,
                             output_data)
    else:
        logger.warning('### No data to upload')

    api_request.status = 'saved'
def save_data(api_request, part):
    '''Loads data chunk from Logs API and saves to ClickHouse.

    Downloads one part of the log request described by ``api_request`` and
    drops every line whose number of tab-separated columns differs from
    that of the first line. The extra field names configured under
    ``'<source>_fields'`` in ``utils.get_ch_fields_config()`` are appended
    to the header line. When both the ``params`` and ``URL`` columns are
    present, one extra cell per configured field is appended to every data
    row, filled from ``parsing_params.get_data_from_params`` when the
    ``params`` cell holds non-empty JSON and left empty otherwise. The
    result is UTF-8 encoded, passed to ``clickhouse.save_data`` and
    ``api_request.status`` is set to ``'saved'``.

    Args:
        api_request: object exposing ``request_id``, a writable ``status``
            and ``user_request`` (with ``counter_id``, ``token``, ``source``
            and ``fields``).
        part: part identifier substituted into the download URL.

    Raises:
        ValueError: if the response status code is not 200; the message is
            the response body.
    '''
    # ``unicode`` exists only on Python 2; fall back to ``str`` elsewhere.
    try:
        text_type = unicode
    except NameError:
        text_type = str

    url = '{host}/management/v1/counter/{counter_id}/logrequest/{request_id}/part/{part}/download' \
        .format(
            host=HOST,
            counter_id=api_request.user_request.counter_id,
            request_id=api_request.request_id,
            part=part
        )
    # The token travels in a header instead of the query string so that it
    # does not become part of the request URL.
    auth_headers = {'Authorization': 'OAuth ' + api_request.user_request.token}

    r = requests.get(url, headers=auth_headers)
    if r.status_code != 200:
        logger.debug(r.text)
        raise ValueError(r.text)

    splitted_text = r.text.split('\n')
    logger.info('### DATA SAMPLE')
    logger.info('\n'.join(splitted_text[:5]))

    # The first line defines the expected column count for every row.
    headers_num = len(splitted_text[0].split('\t'))
    # A real list is required: its length is taken and its items are
    # modified in place below.
    rows = [
        line for line in splitted_text
        if len(line.split('\t')) == headers_num
    ]

    num_filtered = len(splitted_text) - len(rows)
    if num_filtered != 0:
        logger.warning('%d rows were filtered out', num_filtered)

    # Get additional fields for ClickHouse.
    ch_fields_config = utils.get_ch_fields_config()
    ch_fields = ch_fields_config['{source}_fields'.format(
        source=api_request.user_request.source)]

    prefix = 'ym:pv:'
    if api_request.user_request.source == 'visits':
        prefix = 'ym:s:'

    # Adds additional fields to the end.
    if len(ch_fields) > 0:
        column_names = splitted_text[0].split('\t')
        rows[0] += '\t' + '\t'.join(ch_fields)

        # NOTE(review): when either column is missing the header line has
        # already been extended but the data rows are not -- confirm that
        # the configuration always requests both columns.
        if prefix + 'params' in column_names and prefix + 'URL' in column_names:
            params_index = column_names.index(prefix + 'params')
            url_index = column_names.index(prefix + 'URL')

            # Parse the params of every data row (index 0 is the header).
            for i in range(1, len(rows)):
                value = rows[i].split('\t')
                # These depend only on the row, so compute them once
                # instead of once per extra field.
                params_json = clear_json(value[params_index])
                page_url = clear_json(value[url_index])
                params_is_json = is_json(params_json)

                extra_cells = []
                for field in ch_fields:
                    cell = ''
                    if params_is_json:
                        # Parsed afresh for each field so every helper call
                        # receives its own object, as in the original.
                        params = json.loads(params_json)
                        if len(params) > 0:
                            if isinstance(params, list):
                                params = params[0]
                            data = parsing_params.get_data_from_params(
                                prefix, params, field, page_url)
                            cell = text_type(data)
                    extra_cells.append(cell)

                rows[i] += '\t' + '\t'.join(extra_cells)

    # To correct escapes in params. The replacement runs before encoding so
    # both arguments are text; the bytes produced are the same because only
    # ASCII characters are replaced.
    output_data = '\n'.join(rows).replace(r"\'", "'").encode('utf-8')

    clickhouse.save_data(api_request.user_request.source,
                         api_request.user_request.fields,
                         output_data)

    api_request.status = 'saved'