def get_pandora_flow_graph(limit, period):
    """
    :type limit: int
    :type period: int
    :rtype: list[dict]
    """
    # https://kibana.wikia-inc.com/goto/3aef04fa1f9e55df5cc4c3031671ecab
    # k8s-ingress access logs, internal traffic
    rows = ElasticsearchQuery(
        es_host=ELASTICSEARCH_HOST,
        period=period,
        index_prefix='logstash-k8s-ingress-controller'
    ).query_by_string(
        query='NOT request_Fastly-Client-Ip: * AND request_User-Agent: * '
              'AND RequestHost: "prod.sjc.k8s.wikia.net"',
        fields=[
            'request_User-Agent',
            'RequestPath',
        ],
        limit=limit
    )

    # extract required fields only
    # ('mediawiki', 'pandora:helios::info')
    # ('swagger-codegen', 'pandora:user-attribute::user')
    # ('node-fetch', 'pandora:discussion::threads')
    rows = [
        (
            str(row.get('request_User-Agent')).split('/')[0].lower(),
            normalize_pandora_url(row.get('RequestPath')),
        )
        for row in rows
    ]

    # process the logs
    def _map(item):
        return '{}-{}'.format(item[0], item[1])

    def _reduce(items):
        first = items[0]
        source = first[0]
        target = first[1]

        # normalize the source
        if source == 'swagger-codegen':
            source = 'mediawiki'
        elif source == 'node-fetch':
            source = 'mobile-wiki'

        return {
            'source': source,
            'edge': 'http',
            'target': target,
            # the following is optional
            'metadata': '{:.3f} reqs per sec'.format(1. * len(items) / period)
        }

    return logs_map_and_reduce(rows, _map, _reduce)
def get_mediawiki_flow_graph(limit, period):
    """
    :type limit: int
    :type period: int
    :rtype: list[dict]
    """
    # https://kibana5.wikia-inc.com/goto/e6ab16f694b625d5b87833ae794f5989
    # goreplay is running in RES (check SJC logs only)
    rows = ElasticsearchQuery(
        es_host=ELASTICSEARCH_HOST,
        period=period,
        index_prefix='logstash-mediawiki'
    ).query_by_string(
        query='"Wikia internal request" AND @fields.environment: "prod" '
              'AND @fields.datacenter: "sjc" '
              'AND @fields.http_url_path: *',
        fields=[
            '@context.source',
            '@fields.http_url_path',
        ],
        limit=limit
    )

    # extract required fields only
    # (u'user-permissions', 'api:query::users')
    # (u'1', 'nirvana:EmailControllerDiscussionReply::handle')
    rows = [
        (
            row.get('@context', {})['source'],
            normalize_mediawiki_url(row.get('@fields', {})['http_url_path'])
        )
        for row in rows
        if row.get('@context', {}).get('source') is not None
    ]

    # process the logs
    def _map(item):
        return '{}-{}'.format(item[0], item[1])

    def _reduce(items):
        first = items[0]
        source = first[0]
        target = first[1]

        return {
            'source': source if source != '1' else 'internal',
            'edge': 'http',
            'target': target,
            # the following is optional
            'metadata': '{:.3f} reqs per sec'.format(1. * len(items) / period)
        }

    return logs_map_and_reduce(rows, _map, _reduce)
def test_logs_grouping():
    logs = _get_logs()

    # group logs using source name and URL, ignore user agent
    def _map(entry):
        return '{}-{}'.format(entry[0], entry[1])

    # this will be called for each group of logs
    def _reduce(items):
        first = items[0]
        host = str(first[1]).split('/')[2]

        return {
            'source': first[0],
            'edge': 'http',
            'target': host,
            # the following is optional
            'metadata': '{} requests'.format(len(items))
        }

    grouped = logs_map_and_reduce(logs, _map, _reduce)
    # print(grouped)

    assert len(grouped) == 3

    assert grouped[0]['source'] == 'web'
    assert grouped[0]['edge'] == 'http'
    assert grouped[0]['target'] == 'serviceA'
    assert grouped[0]['metadata'] == '15 requests'
    assert grouped[0]['value'] == 0.75

    assert grouped[1]['source'] == 'web'
    assert grouped[1]['edge'] == 'http'
    assert grouped[1]['target'] == 'serviceB'
    assert grouped[1]['metadata'] == '20 requests'
    assert grouped[1]['value'] == 1

    assert grouped[2]['source'] == 'cron'
    assert grouped[2]['edge'] == 'http'
    assert grouped[2]['target'] == 'serviceA'
    assert grouped[2]['metadata'] == '5 requests'
    assert grouped[2]['value'] == 0.25

    assert format_tsv_line(**grouped[0]) == 'web\thttp\tserviceA\t0.7500\t15 requests'
    assert format_tsv_line(**grouped[1]) == 'web\thttp\tserviceB\t1.0000\t20 requests'
    assert format_tsv_line(**grouped[2]) == 'cron\thttp\tserviceA\t0.2500\t5 requests'
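# A minimal sketch of the _get_logs() fixture that the test above assumes (the real
# fixture is defined elsewhere in the test suite and its exact data is not shown here):
# 15 web -> serviceA entries, 20 web -> serviceB entries and 5 cron -> serviceA entries,
# each as a (source, url, user_agent) tuple. The assertion order additionally assumes
# that logs_map_and_reduce() emits groups in first-occurrence order.
def _get_logs_sketch():
    return (
        [('web', 'http://serviceA/foo', 'ua')] * 15 +
        [('web', 'http://serviceB/bar', 'ua')] * 20 +
        [('cron', 'http://serviceA/bar', 'ua')] * 5
    )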
def get_flow_graph(limit, period):
    """
    :type limit: int
    :type period: int
    :rtype: list[dict]
    """
    rows = ElasticsearchQuery(
        es_host=ELASTICSEARCH_HOST,
        period=period,
        index_prefix='logstash-other'
    ).query_by_string(
        query='kubernetes.labels.job-name:* AND '
              'kubernetes.container_name: "portability-metric" AND ("SELECT" OR "UPDATE")',
        fields=[
            'log',
            'kubernetes.labels.job-name'
        ],
        limit=limit
    )

    entries = []

    for row in rows:
        for entry in get_portability_metrics_query(
                row['log'], row['kubernetes']['labels']['job-name']):
            entries.append(entry)

    # print(entries)

    # process the logs
    def _map(item):
        return '{}'.join(item)

    def _reduce(items):
        # ('MetricArticleProvider.py', 'UPDATE', 'articledata')
        first = items[0]

        script = 'cron:{}'.format(first[0])
        query_type = first[1]
        table_name = 'db:{}'.format(first[2])

        return {
            'source': table_name if query_type == 'SELECT' else script,
            'edge': query_type,
            'target': table_name if query_type != 'SELECT' else script,
        }

    return logs_map_and_reduce(entries, _map, _reduce)
def get_solr_flow_graph(limit, period):
    """
    :type limit: int
    :type period: int
    :rtype: list[dict]
    """
    rows = ElasticsearchQuery(
        es_host=ELASTICSEARCH_HOST,
        period=period,
        index_prefix='logstash-solr'
    ).query_by_string(
        query='@source_host.keyword: /search-s.*/ AND @message: "webapp"',
        fields=[
            '@message',
        ],
        limit=limit
    )

    # extract required fields only
    # core name and method name
    rows = [
        (
            get_solr_core_name(row.get('@message')),
            str(get_solr_parameters(row.get('@message')).get('path', '')).strip('/'),
        )
        for row in rows
    ]

    # process the logs
    def _map(item):
        return '{}'.join(item)

    def _reduce(items):
        first = items[0]
        index = first[0]
        method = first[1]

        client = 'client'  # add a user agent to the logs and identify the client based on it

        return {
            'source': 'solr:{}'.format(index) if method == 'select' else 'indexer',
            'edge': 'http',
            'target': 'solr:{}'.format(index) if method != 'select' else client,
            # the following is optional
            'metadata': '{:.3f} /{} reqs per sec'.format(1. * len(items) / period, method)
        }

    return logs_map_and_reduce(rows, _map, _reduce)
def get_mobile_apps_flow_graph(limit, period):
    """
    :type limit: int
    :type period: int
    :rtype: list[dict]
    """
    rows = ElasticsearchQuery(
        es_host=ELASTICSEARCH_HOST,
        period=period,
        index_prefix='logstash-apache-access-log'
    ).query_by_string(
        query='(agent: "Android" OR agent: "iOS") AND NOT agent: "Chrome" '
              'AND @source_host.keyword: /ap-s.*/',
        fields=[
            'agent',
            'request',
        ],
        limit=limit
    )

    # extract the request URL only
    # and filter out non-mobile app requests
    rows = [
        normalize_mediawiki_url(row.get('request'))
        for row in rows
        if is_mobile_app_user_agent(row.get('agent'))
    ]

    # process the logs
    def _map(item):
        return item

    def _reduce(items):
        target = items[0]

        return {
            'source': 'mobile-app',
            'edge': 'http',
            'target': target,
            # the following is optional
            'metadata': '{:.3f} reqs per sec'.format(1. * len(items) / period)
        }

    return logs_map_and_reduce(rows, _map, _reduce)
def get_celery_tasks_flow_graph(limit, period):
    """
    :type limit: int
    :type period: int
    :rtype: list[dict]
    """
    # @see https://kibana5.wikia-inc.com/goto/d877bf3caf4204b9b5fdc5f8864f4ce2
    rows = ElasticsearchQuery(
        es_host=ELASTICSEARCH_HOST,
        period=period,
        index_prefix='logstash-mediawiki'
    ).query_by_string(
        query='@message: "BaseTask::execute" AND @fields.datacenter: "sjc" '
              'AND @fields.environment: "prod"',
        fields=[
            '@context.task_call',
        ],
        limit=limit
    )

    # extract the task type
    rows = [
        row.get('@context').get('task_call')
        for row in rows
    ]

    # process the logs
    def _map(item):
        return item

    def _reduce(items):
        target = items[0]

        return {
            'source': 'celery',
            'edge': 'http',
            'target': 'task:{}'.format(target),
            # the following is optional
            'metadata': '{:.3f} calls per minute'.format(60. * len(items) / period)
        }

    return logs_map_and_reduce(rows, _map, _reduce)
def get_flow(period, limit):
    logger = logging.getLogger('get_flow')
    kibana = get_kibana(period)

    # fetch DB queries
    def _map_query(row):
        query = generalize_sql(re.sub(r'^SQL ', '', row['@message']))
        database = row['@fields']['database']['name']

        if database in ['uportal.mysql', 'default']:
            database = 'mysql'

        # print(query, kind, tables)
        return (
            database,
            query,
            'php:{}'.format(row['@context']['method']),
        )

    logs = map(
        _map_query,
        kibana.query_by_string(
            '@context.rows: *',
            fields=['@message', '@fields.database.name', '@context.method'],
            limit=limit
        )
    )

    logs = [log for log in logs if log is not None]
    # print(list(logs))

    # group logs using source name and URL, ignore user agent
    def _map(entry):
        return '{}-{}'.format(entry[0], entry[1])

    # this will be called for each group of logs
    def _reduce(items):
        first = items[0]
        logger.info(first)

        sql = str(first[1])
        tables = get_query_tables(sql) or ['unknown']
        kind = sql.split(' ')[0]

        table = '{}:{}'.format(first[0], tables[0])
        method = first[2]

        ret = {
            'source': table,
            'edge': 'SQL {}'.format(kind),
            'target': method,
            'metadata': '{:.3f} QPS'.format(1. * len(items) / period)
        }

        # reverse the direction of the graph
        # from method (code) to table (database)
        if kind not in ['SELECT']:
            ret['target'] = table
            ret['source'] = method

        return ret

    logger.info('Mapping %d log entries...', len(logs))
    return logs_map_and_reduce(logs, _map, _reduce)
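# A minimal usage sketch, assuming the edge dicts produced by the helpers above are
# meant to be dumped as TSV via format_tsv_line() (as test_logs_grouping() suggests).
# The function name, limit and period values here are illustrative only.
def print_mediawiki_flow_graph_example():
    for edge in get_mediawiki_flow_graph(limit=100000, period=3600):
        # each edge carries source / edge / target / value / metadata keys,
        # with 'value' attached by logs_map_and_reduce()
        print(format_tsv_line(**edge))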