def create_table(source, fields):
    '''Creates table in ClickHouse for hits/visits with particular fields.

    source -- 'hits' or 'visits'; any other value raises ValueError.
    fields -- list of Logs API field names (e.g. 'ym:pv:date'); each must be a
              key in the config returned by utils.get_ch_fields_config().

    NOTE(review): a second create_table definition later in this file shadows
    this one at import time — confirm which version is intended to survive.
    '''
    tmpl = '''
        CREATE TABLE {table_name}
        (
            {fields}
        ) ENGINE = {engine}
    '''
    field_tmpl = '{name} {type}'
    table_name = get_source_table_name(source)

    # MergeTree needs the date/client-id columns for its primary key and
    # sampling expression; fall back to the keyless Log engine otherwise.
    engine = 'Log'
    if source == 'hits':
        if ('ym:pv:date' in fields) and ('ym:pv:clientID' in fields):
            engine = 'MergeTree(Date, intHash32(ClientID), (CounterID, Date, intHash32(ClientID)), 8192)'
    elif source == 'visits':
        if ('ym:s:date' in fields) and ('ym:s:clientID' in fields):
            engine = 'MergeTree(Date, intHash32(ClientID), (CounterID, Date, intHash32(ClientID)), 8192)'
    else:
        # Original code left `engine` unbound here and crashed with a
        # NameError at format time; fail explicitly instead.
        raise ValueError('unknown source: %r' % (source,))

    ch_field_types = utils.get_ch_fields_config()
    # BUG FIX: the original sorted the ClickHouse column names *before*
    # pairing them by index with the unsorted `fields` list, so a name could
    # be matched with another field's type. Pair first, sort the finished
    # statements afterwards.
    field_statements = [
        field_tmpl.format(name=get_ch_field_name(field),
                          type=ch_field_types[field])
        for field in fields
    ]
    query = tmpl.format(table_name=table_name,
                        engine=engine,
                        fields=',\n'.join(sorted(field_statements)))
    get_clickhouse_data(query)
def create_table(source, fields):
    '''Creates table in ClickHouse for hits/visits with particular fields.

    source -- 'hits' or 'visits'; any other value raises ValueError.
    fields -- list of Logs API field names (e.g. 'ym:pv:date'); each must be a
              key in the config returned by utils.get_ch_fields_config().
    '''
    tmpl = '''
        CREATE TABLE {table_name}
        (
            {fields}
        ) ENGINE = {engine}
    '''
    field_tmpl = '{name} {type}'
    table_name = get_source_table_name(source)

    # MergeTree needs the date/client-id columns for its ORDER BY and
    # SAMPLE BY clauses; fall back to the keyless Log engine otherwise.
    engine = 'Log'
    if source == 'hits':
        if ('ym:pv:date' in fields) and ('ym:pv:clientID' in fields):
            engine = 'MergeTree() PARTITION BY toYYYYMM(Date) ORDER BY (ClientID, Date, intHash64(ClientID)) SAMPLE BY intHash64(ClientID) SETTINGS index_granularity=8192'
    elif source == 'visits':
        if ('ym:s:date' in fields) and ('ym:s:clientID' in fields):
            engine = 'MergeTree() PARTITION BY toYYYYMM(Date) ORDER BY (ClientID, Date, intHash64(ClientID)) SAMPLE BY intHash64(ClientID) SETTINGS index_granularity=8192'
    else:
        # Original code left `engine` unbound here and crashed with a
        # NameError at format time; fail explicitly instead.
        raise ValueError('unknown source: %r' % (source,))

    ch_field_types = utils.get_ch_fields_config()
    # Keep name/type pairing by building each statement from its own field;
    # the original also sorted field_statements twice — sort once at the join.
    field_statements = [
        field_tmpl.format(name=get_ch_field_name(field),
                          type=ch_field_types[field])
        for field in fields
    ]
    query = tmpl.format(table_name=table_name,
                        engine=engine,
                        fields=',\n'.join(sorted(field_statements)))
    get_clickhouse_data(query)
def save_data(api_request, part):
    '''Loads data chunk from Logs API and saves to ClickHouse.

    api_request -- object carrying request_id, status and a user_request with
                   counter_id / token / source / fields (project type —
                   schema inferred from attribute access here).
    part        -- integer part index of the Logs API download.

    Side effects: HTTP GET to the Logs API, writes rows into ClickHouse via
    clickhouse.save_data, and sets api_request.status to 'saved'.
    Raises ValueError when the API responds with a non-200 status.
    '''
    # Build the per-part download URL; token is passed as a query parameter.
    url = '{host}/management/v1/counter/{counter_id}/logrequest/{request_id}/part/{part}/download?oauth_token={token}' \
        .format(
            host=HOST,
            counter_id=api_request.user_request.counter_id,
            request_id=api_request.request_id,
            part=part,
            token=api_request.user_request.token
        )
    r = requests.get(url)
    if r.status_code != 200:
        logger.debug(r.text)
        raise ValueError(r.text)
    # Response is TSV: first line is the header row.
    splitted_text = r.text.split('\n')
    logger.info('### DATA SAMPLE')
    logger.info('\n'.join(splitted_text[:5]))
    headers_num = len(splitted_text[0].split('\t'))
    # Drop malformed rows whose column count differs from the header.
    # NOTE(review): `filter` returns a list only on Python 2 (this file also
    # uses the `unicode` builtin below); under Python 3 the len() and item
    # assignment on this value would fail.
    splitted_text_filtered = filter(
        lambda x: len(x.split('\t')) == headers_num, r.text.split('\n'))
    num_filtered = len(splitted_text) - len(splitted_text_filtered)
    if num_filtered != 0:
        logger.warning('%d rows were filtered out' % num_filtered)
    # Get additional (computed) fields for ClickHouse from the fields config.
    ch_fields_config = utils.get_ch_fields_config()
    ch_fields = ch_fields_config['{source}_fields'.format(
        source=api_request.user_request.source)]
    # Logs API field prefix depends on the source: pageviews vs sessions.
    prefix = 'ym:pv:'
    if api_request.user_request.source == 'visits':
        prefix = 'ym:s:'
    # Append the additional field columns to the end of each row.
    if len(ch_fields) > 0:
        # Extend the header row first.
        splitted_text_filtered[0] += '\t' + '\t'.join(ch_fields)
        headers = splitted_text[0].split('\t')
        # Extra columns are derived from the 'params' JSON and the URL,
        # so both source columns must be present.
        if prefix + 'params' in headers and prefix + 'URL' in headers:
            params_index = headers.index(prefix + 'params')
            url_index = headers.index(prefix + 'URL')
            # Parse the params for every data row (skip header at index 0).
            i = 1
            while i < len(splitted_text_filtered):
                value = splitted_text_filtered[i].split('\t')
                for field in ch_fields:
                    # Always append the tab so column counts stay aligned
                    # even when the value below is left empty.
                    splitted_text_filtered[i] += "\t"
                    params_json = clear_json(value[params_index])
                    url = clear_json(value[url_index])
                    if not is_json(params_json):
                        # Unparseable params: leave this extra column empty.
                        continue
                    params = json.loads(params_json)
                    if len(params) > 0:
                        # params may arrive wrapped in a one-element list;
                        # unwrap to the first element in that case.
                        if type(params) is list:
                            params = params[0]
                        data = parsing_params.get_data_from_params(
                            prefix, params, field, url)
                        splitted_text_filtered[i] += unicode(data)
                i += 1
    output_data = '\n'.join(splitted_text_filtered).encode('utf-8')
    # to correct escapes in params
    output_data = output_data.replace(r"\'", "'")
    clickhouse.save_data(api_request.user_request.source,
                         api_request.user_request.fields,
                         output_data)
    api_request.status = 'saved'
def get_field_type_list(dimension_list):
    '''Return the ClickHouse field type for each dimension in dimension_list.

    Looks every dimension name up in the mapping produced by
    utils.get_ch_fields_config(); raises KeyError for unknown dimensions.
    '''
    dimension_to_field_type = utils.get_ch_fields_config()
    return [dimension_to_field_type[dimension] for dimension in dimension_list]