# Example #1 (originally "Beispiel #1", score: 0 — scraped example separator)
def create_table(source, fields):
    '''Creates table in ClickHouse for hits/visits with particular fields.

    Args:
        source: either 'hits' or 'visits'; selects the table name and the
            date/clientID fields required for a MergeTree engine.
        fields: iterable of Logs API field names (e.g. 'ym:pv:date').

    Raises:
        ValueError: if source is not 'hits' or 'visits'.
    '''
    tmpl = '''
        CREATE TABLE {table_name} (
            {fields}
        ) ENGINE = {engine}
    '''
    field_tmpl = '{name} {type}'

    table_name = get_source_table_name(source)

    # MergeTree needs a date column and a sampling key; fall back to the
    # plain Log engine when the required fields are absent.  Raise early
    # for an unknown source instead of hitting a NameError on `engine`.
    if source == 'hits':
        required = ('ym:pv:date', 'ym:pv:clientID')
    elif source == 'visits':
        required = ('ym:s:date', 'ym:s:clientID')
    else:
        raise ValueError("source must be 'hits' or 'visits', got %r" % (source,))

    if all(f in fields for f in required):
        engine = 'MergeTree(Date, intHash32(ClientID), (CounterID, Date, intHash32(ClientID)), 8192)'
    else:
        engine = 'Log'

    ch_field_types = utils.get_ch_fields_config()
    # Derive each column name directly from its field so name and type stay
    # paired.  (Sorting the names separately from `fields`, as before, could
    # attach the wrong type to a column.)  The statements themselves are
    # still sorted for a deterministic column order.
    field_statements = [
        field_tmpl.format(name=get_ch_field_name(field),
                          type=ch_field_types[field])
        for field in fields
    ]

    query = tmpl.format(table_name=table_name,
                        engine=engine,
                        fields=',\n'.join(sorted(field_statements)))

    get_clickhouse_data(query)
def create_table(source, fields):
    '''Creates table in ClickHouse for hits/visits with particular fields.

    Args:
        source: either 'hits' or 'visits'; selects the table name and the
            date/clientID fields required for a MergeTree engine.
        fields: iterable of Logs API field names (e.g. 'ym:pv:date').

    Raises:
        ValueError: if source is not 'hits' or 'visits'.
    '''
    tmpl = '''
        CREATE TABLE {table_name} (
            {fields}
        ) ENGINE = {engine}
    '''
    field_tmpl = '{name} {type}'

    table_name = get_source_table_name(source)

    # MergeTree needs a date column and a sampling key; fall back to the
    # plain Log engine when the required fields are absent.  Raise early
    # for an unknown source instead of hitting a NameError on `engine`.
    if source == 'hits':
        required = ('ym:pv:date', 'ym:pv:clientID')
    elif source == 'visits':
        required = ('ym:s:date', 'ym:s:clientID')
    else:
        raise ValueError("source must be 'hits' or 'visits', got %r" % (source,))

    if all(f in fields for f in required):
        engine = 'MergeTree() PARTITION BY toYYYYMM(Date) ORDER BY (ClientID, Date, intHash64(ClientID)) SAMPLE BY intHash64(ClientID) SETTINGS index_granularity=8192'
    else:
        engine = 'Log'

    ch_field_types = utils.get_ch_fields_config()
    # Build one "Name Type" statement per field, keeping name and type paired.
    field_statements = [
        field_tmpl.format(name=get_ch_field_name(field),
                          type=ch_field_types[field])
        for field in fields
    ]

    # Sort once (the original sorted twice) for a deterministic column order.
    query = tmpl.format(table_name=table_name,
                        engine=engine,
                        fields=',\n'.join(sorted(field_statements)))

    get_clickhouse_data(query)
# Example #3 (originally "Beispiel #3", score: 0 — scraped example separator)
def save_data(api_request, part):
    '''Loads one data chunk of a Logs API request and saves it to ClickHouse.

    Downloads part `part` of the prepared log request, drops malformed rows,
    appends the extra computed ClickHouse fields to every row, and hands the
    result to clickhouse.save_data.  Marks the request 'saved' on success.

    NOTE(review): this module is Python 2 code (uses `unicode`); keep that in
    mind before porting.

    Args:
        api_request: Logs API request object carrying user_request
            (counter_id, token, source, fields) and request_id.
        part: integer index of the downloadable part.

    Raises:
        ValueError: if the download endpoint returns a non-200 status.
    '''
    url = '{host}/management/v1/counter/{counter_id}/logrequest/{request_id}/part/{part}/download?oauth_token={token}' \
        .format(
            host=HOST,
            counter_id=api_request.user_request.counter_id,
            request_id=api_request.request_id,
            part=part,
            token=api_request.user_request.token
        )

    r = requests.get(url)
    if r.status_code != 200:
        logger.debug(r.text)
        raise ValueError(r.text)

    splitted_text = r.text.split('\n')
    logger.info('### DATA SAMPLE')
    logger.info('\n'.join(splitted_text[:5]))

    # Drop rows whose column count differs from the header row.
    # list() keeps this working whether filter returns a list (Py2) or an
    # iterator (Py3) — the result is indexed and len()'d below.
    headers_num = len(splitted_text[0].split('\t'))
    splitted_text_filtered = list(filter(
        lambda x: len(x.split('\t')) == headers_num, splitted_text))
    num_filtered = len(splitted_text) - len(splitted_text_filtered)
    if num_filtered != 0:
        logger.warning('%d rows were filtered out' % num_filtered)

    # Get additional computed fields for ClickHouse.
    ch_fields_config = utils.get_ch_fields_config()
    ch_fields = ch_fields_config['{source}_fields'.format(
        source=api_request.user_request.source)]
    prefix = 'ym:s:' if api_request.user_request.source == 'visits' else 'ym:pv:'

    # Adds the additional field names to the end of the header row.
    if len(ch_fields) > 0:
        splitted_text_filtered[0] += '\t' + '\t'.join(ch_fields)

    headers = splitted_text[0].split('\t')

    if prefix + 'params' in headers and prefix + 'URL' in headers:
        params_index = headers.index(prefix + 'params')
        url_index = headers.index(prefix + 'URL')

        # Fill the extra columns row by row.  The params JSON is cleaned,
        # validated and parsed ONCE per row (the original redid all of that
        # for every appended field).  `row_url` avoids clobbering the
        # download URL above.
        for i in range(1, len(splitted_text_filtered)):
            value = splitted_text_filtered[i].split('\t')
            params_json = clear_json(value[params_index])
            row_url = clear_json(value[url_index])
            params_ok = is_json(params_json)
            if params_ok:
                params = json.loads(params_json)
                # Mirror the original checks: non-empty payload, and a list
                # payload is unwrapped to its first element.
                params_ok = len(params) > 0
                if params_ok and type(params) is list:
                    params = params[0]
            for field in ch_fields:
                # A tab is appended for every field so columns stay aligned
                # even when the params are unusable for this row.
                splitted_text_filtered[i] += "\t"
                if params_ok:
                    data = parsing_params.get_data_from_params(
                        prefix, params, field, row_url)
                    splitted_text_filtered[i] += unicode(data)

    output_data = '\n'.join(splitted_text_filtered).encode('utf-8')
    output_data = output_data.replace(r"\'",
                                      "'")  # to correct escapes in params

    clickhouse.save_data(api_request.user_request.source,
                         api_request.user_request.fields, output_data)

    api_request.status = 'saved'
def get_field_type_list(dimension_list):
    '''Returns the ClickHouse field type for each dimension, in order.

    Args:
        dimension_list: iterable of dimension names present as keys in the
            ClickHouse fields config.

    Returns:
        list of field types, one per dimension, in input order.
    '''
    dimension_to_field_type = utils.get_ch_fields_config()
    # Comprehension replaces the manual append loop (same KeyError behavior
    # for unknown dimensions).
    return [dimension_to_field_type[dimension] for dimension in dimension_list]