Example #1
def save_data(api_request, part):
    '''Loads data chunk from Logs API and saves to ClickHouse'''
    url = '{host}/management/v1/counter/{counter_id}/logrequest/{request_id}/part/{part}/download?oauth_token={token}' \
        .format(
            host=HOST,
            counter_id=api_request.user_request.counter_id,
            request_id=api_request.request_id,
            part=part,
            token=api_request.user_request.token
        )

    r = requests.get(url)
    if r.status_code != 200:
        logger.debug(r.text)
        raise ValueError(r.text)

    splitted_text = r.text.split('\n')
    logger.info('\n'.join(splitted_text[:5]))

    headers_num = len(splitted_text[0].split('\t'))
    splitted_text_filtered = list(
        filter(lambda x: len(x.split('\t')) == headers_num,
               r.text.split('\n')))  # list() so len() works on Python 3
    num_filtered = len(splitted_text) - len(splitted_text_filtered)
    if num_filtered != 0:
        logger.warning('%d rows were filtered out' % num_filtered)

    output_data = '\n'.join(splitted_text_filtered)

    clickhouse.save_data(api_request.user_request.source,
                         api_request.user_request.fields, output_data)

    api_request.status = 'saved'
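
All three examples assume module-level names that are not shown on this page: HOST, logger, the requests library, and a project-local clickhouse helper module. A minimal sketch of that surrounding module, with placeholder values, could look like this (the endpoint URL and logger setup are assumptions, not part of the original source):

import logging

import requests

import clickhouse  # project-local helper module, assumed

HOST = 'https://api-metrika.yandex.net'  # assumed Logs API endpoint
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)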
Example #2
def save_data(api_request, part):
    '''Loads data chunk from Logs API and saves to ClickHouse'''
    import clickhouse  # imported here so the docstring stays the first statement
    url = '{host}/management/v1/counter/{counter_id}/logrequest/{request_id}/part/{part}/download' \
        .format(
            host=HOST,
            counter_id=api_request.user_request.counter_id,
            request_id=api_request.request_id,
            part=part
        )

    headers = {'Authorization': 'OAuth ' + api_request.user_request.token}

    r = requests.get(url, headers=headers)
    if r.status_code != 200:
        logger.debug(r.text)
        raise ValueError(r.text)

    splitted_text = r.text.split('\n')
    logger.info('### DATA SAMPLE')
    logger.info('\n'.join(splitted_text[:5]))

    headers_num = len(splitted_text[0].split('\t'))
    splitted_text_filtered = list(
        filter(lambda x: len(x.split('\t')) == headers_num,
               r.text.split('\n')))
    num_filtered = len(splitted_text) - len(splitted_text_filtered)
    if num_filtered != 0:
        logger.warning('%d rows were filtered out' % num_filtered)

    if len(splitted_text_filtered) > 1:
        output_data = '\n'.join(splitted_text_filtered[1:])  #.encode('utf-8')
        output_data = '\t'.join(
            map(clickhouse.get_ch_field_name,
                splitted_text_filtered[0].split('\t'))
        ) + '\n' + output_data  # convert headers to CH column names
        output_data = output_data.replace(r"\'",
                                          "'")  # to correct escapes in params

        clickhouse.save_data(api_request.user_request.source,
                             api_request.user_request.fields, output_data)
    else:
        logger.warning('### No data to upload')

    api_request.status = 'saved'
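
This variant sends the OAuth token in an Authorization header rather than in the URL, which keeps it out of server access logs. To try the function in isolation, the nested request objects can be stubbed with simple namespaces; the attribute values below are hypothetical and only mirror the attributes the snippets actually read:

from types import SimpleNamespace

# hypothetical stand-ins for the real request objects
user_request = SimpleNamespace(counter_id='12345', token='<oauth-token>',
                               source='hits', fields=['ym:pv:URL'])
api_request = SimpleNamespace(user_request=user_request,
                              request_id='67890', status='created')

save_data(api_request, part=0)  # download part 0 and write it to ClickHouse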
Example #3
def save_data(api_request, part):
    '''Loads data chunk from Logs API and saves to ClickHouse'''
    url = '{host}/management/v1/counter/{counter_id}/logrequest/{request_id}/part/{part}/download?oauth_token={token}' \
        .format(
            host=HOST,
            counter_id=api_request.user_request.counter_id,
            request_id=api_request.request_id,
            part=part,
            token=api_request.user_request.token
        )

    r = requests.get(url)
    if r.status_code != 200:
        logger.debug(r.text)
        raise ValueError(r.text)

    splitted_text = r.text.split('\n')
    logger.info('### DATA SAMPLE')
    logger.info('\n'.join(splitted_text[:5]))

    headers_num = len(splitted_text[0].split('\t'))
    splitted_text_filtered = list(
        filter(lambda x: len(x.split('\t')) == headers_num,
               r.text.split('\n')))  # list() so rows can be indexed and updated below
    num_filtered = len(splitted_text) - len(splitted_text_filtered)
    if num_filtered != 0:
        logger.warning('%d rows were filtered out' % num_filtered)

    # Get additional fields for ClickHouse
    ch_fields_config = utils.get_ch_fields_config()
    ch_fields = ch_fields_config['{source}_fields'.format(
        source=api_request.user_request.source)]
    prefix = 'ym:pv:'
    if api_request.user_request.source == 'visits':
        prefix = 'ym:s:'

    # add the additional field names to the end of the header row
    if len(ch_fields) > 0:
        splitted_text_filtered[0] += '\t' + '\t'.join(ch_fields)

    headers = splitted_text[0].split('\t')

    if prefix + 'params' in headers and prefix + 'URL' in headers:
        params_index = headers.index(prefix + 'params')
        url_index = headers.index(prefix + 'URL')

        # parse the params column and append a value for each extra field
        for i in range(1, len(splitted_text_filtered)):
            value = splitted_text_filtered[i].split('\t')
            params_json = clear_json(value[params_index])
            url = clear_json(value[url_index])

            for field in ch_fields:
                splitted_text_filtered[i] += '\t'
                if not is_json(params_json):
                    continue  # leave the field empty if params is not valid JSON
                params = json.loads(params_json)
                if len(params) > 0:
                    if type(params) is list:
                        params = params[0]
                    data = parsing_params.get_data_from_params(
                        prefix, params, field, url)
                    splitted_text_filtered[i] += str(data)

    output_data = '\n'.join(splitted_text_filtered)
    output_data = output_data.replace(r"\'",
                                      "'")  # to correct escapes in params
    output_data = output_data.encode('utf-8')  # encode only after the str-level replace

    clickhouse.save_data(api_request.user_request.source,
                         api_request.user_request.fields, output_data)

    api_request.status = 'saved'
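
This version additionally depends on helpers that are not shown here: utils.get_ch_fields_config, parsing_params.get_data_from_params, clear_json, and is_json. The first two are project-specific, but plausible minimal stand-ins for the two JSON helpers might look like the sketch below; their exact behavior in the original project is an assumption:

import json

def is_json(text):
    # hypothetical helper: True if the string parses as JSON
    try:
        json.loads(text)
        return True
    except ValueError:
        return False

def clear_json(text):
    # hypothetical helper: normalize the TSV-escaped newlines/tabs that
    # Logs API embeds in the params column before it is parsed as JSON
    return text.replace('\\n', ' ').replace('\\t', ' ')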