def test_get_timestamp_filer():
    es_query = ElasticsearchQuery(es_host='foo', since=123456, period=60)
    res = es_query._get_timestamp_filer()

    print(res)

    assert res['range']['@timestamp'] is not None
    assert res['range']['@timestamp']['gte'] == '1970-01-02T10:17:37.000Z'
    assert res['range']['@timestamp']['lte'] is not None
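
# The gte/lte values checked above are ISO-8601 renderings of UNIX timestamps.
# A minimal sketch of such a formatter (an illustrative assumption, not necessarily
# how the library implements it):
from datetime import datetime, timezone

def format_es_timestamp(unix_timestamp):
    """e.g. 0 -> '1970-01-01T00:00:00.000Z'"""
    return datetime.fromtimestamp(unix_timestamp, tz=timezone.utc).strftime('%Y-%m-%dT%H:%M:%S.000Z')
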
def test_format_index():
    assert ElasticsearchQuery.format_index(
        prefix='logstash', timestamp=1) == 'logstash-1970.01.01'
    assert ElasticsearchQuery.format_index(
        prefix='logstash', timestamp=1408450795) == 'logstash-2014.08.19'
    assert ElasticsearchQuery.format_index(
        prefix='logstash-foo',
        timestamp=1408450795) == 'logstash-foo-2014.08.19'
    assert ElasticsearchQuery.format_index(prefix='syslog-ng',
                                           timestamp=1408450795,
                                           sep="_") == 'syslog-ng_2014.08.19'
def get_pandora_flow_graph(limit, period):
    """
    :type limit int
    :type period int
    :rtype: list[dict]
    """
    # https://kibana.wikia-inc.com/goto/3aef04fa1f9e55df5cc4c3031671ecab
    # k8s-ingress access logs, internal traffic
    rows = ElasticsearchQuery(
        es_host=ELASTICSEARCH_HOST,
        period=period,
        index_prefix='logstash-k8s-ingress-controller'
    ).query_by_string(
        query='NOT request_Fastly-Client-Ip: * AND request_User-Agent: * '
              'AND RequestHost: "prod.sjc.k8s.wikia.net"',
        fields=[
            'request_User-Agent',
            'RequestPath',
        ],
        limit=limit
    )

    # extract required fields only
    # ('mediawiki', 'pandora:helios::info')
    # ('swagger-codegen', 'pandora:user-attribute::user')
    # ('node-fetch', 'pandora:discussion::threads')
    rows = [
        (
            str(row.get('request_User-Agent')).split('/')[0].lower(),
            normalize_pandora_url(row.get('RequestPath')),
        )
        for row in rows
    ]

    # process the logs
    def _map(item):
        return '{}-{}'.format(item[0], item[1])

    def _reduce(items):
        first = items[0]
        source = first[0]
        target = first[1]

        # normalize the source
        if source == 'swagger-codegen':
            source = 'mediawiki'
        elif source == 'node-fetch':
            source = 'mobile-wiki'

        return {
            'source': source,
            'edge': 'http',
            'target': target,
            # the following is optional
            'metadata': '{:.3f} reqs per sec'.format(1. * len(items) / period)
        }

    return logs_map_and_reduce(rows, _map, _reduce)
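
# logs_map_and_reduce is not defined in this snippet. A plausible sketch of it,
# inferred only from how it is used above (group entries by the key produced by
# _map, then pass each group of original entries to _reduce) -- an assumption,
# not the original helper:
from collections import defaultdict

def logs_map_and_reduce_sketch(entries, map_fn, reduce_fn):
    groups = defaultdict(list)
    for entry in entries:
        groups[map_fn(entry)].append(entry)
    return [reduce_fn(items) for items in groups.values()]
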
def get_mediawiki_flow_graph(limit, period):
    """
    :type limit int
    :type period int
    :rtype: list[dict]
    """
    # https://kibana5.wikia-inc.com/goto/e6ab16f694b625d5b87833ae794f5989
    # goreplay is running in RES (check SJC logs only)
    rows = ElasticsearchQuery(
        es_host=ELASTICSEARCH_HOST,
        period=period,
        index_prefix='logstash-mediawiki'
    ).query_by_string(
        query='"Wikia internal request" AND @fields.environment: "prod" '
              'AND @fields.datacenter: "sjc" '
              'AND @fields.http_url_path: *',
        fields=[
            '@context.source',
            '@fields.http_url_path',
        ],
        limit=limit
    )

    # extract required fields only
    # (u'user-permissions', 'api:query::users')
    # (u'1', 'nirvana:EmailControllerDiscussionReply::handle')
    rows = [
        (
            row.get('@context', {})['source'],
            normalize_mediawiki_url(row.get('@fields', {})['http_url_path'])
        )
        for row in rows
        if row.get('@context', {}).get('source') is not None
    ]

    # process the logs
    def _map(item):
        return '{}-{}'.format(item[0], item[1])

    def _reduce(items):
        first = items[0]
        source = first[0]
        target = first[1]

        return {
            'source': source if source != '1' else 'internal',
            'edge': 'http',
            'target': target,
            # the following is optional
            'metadata': '{:.3f} reqs per sec'.format(1. * len(items) / period)
        }

    return logs_map_and_reduce(rows, _map, _reduce)
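
# A hypothetical way to use the two flow-graph helpers above (the one-hour window
# and the limit are illustrative values only):
import json

if __name__ == '__main__':
    graph = get_pandora_flow_graph(limit=1000, period=3600) \
        + get_mediawiki_flow_graph(limit=1000, period=3600)
    print(json.dumps(graph, indent=2))
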
Example #5
    def fetch(self):
        logger = logging.getLogger(__name__)

        while True:
            es = ElasticsearchQuery(es_host=self.ELASTICSEARCH_HOST,
                                    period=self.INTERVAL,
                                    index_prefix=self.ES_INDEX)

            res = es.query_by_string(self.QUERY,
                                     fields=self.FIELDS,
                                     limit=self.BATCH)
            urls = map(self.format_log_entry, res)

            for url in urls:
                if self.filter_out(url):
                    logger.info('Filtered out <%s>', url)
                    continue

                yield url

            time.sleep(self.INTERVAL)
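
# fetch() above relies on class-level configuration and two hooks. A hypothetical
# minimal class it could live on (every value below is an illustrative assumption,
# not taken from the original code):
import logging
import time

class AccessLogSource:  # hypothetical host class for fetch()
    ELASTICSEARCH_HOST = 'elasticsearch.example.net'
    ES_INDEX = 'logstash-apache-access-log'
    INTERVAL = 60      # seconds between Elasticsearch queries
    BATCH = 1000       # entries fetched per query
    QUERY = 'request: *'
    FIELDS = ['request']

    @staticmethod
    def format_log_entry(entry):
        # turn a raw log entry into a URL
        return entry.get('request')

    @staticmethod
    def filter_out(url):
        # drop empty entries
        return not url
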
Example #6
def get_log_entries(query,
                    period,
                    fields,
                    limit,
                    index_prefix='logstash-other'):
    """
    Get log entries from elasticsearch that match given query

    :type query str
    :type period int
    :type fields list[str] or None
    :type limit int
    :type index_prefix str
    :rtype tuple
    """
    logger = logging.getLogger('get_log_entries')
    source = ElasticsearchQuery(es_host=LOGS_ES_HOST,
                                period=period,
                                index_prefix=index_prefix)

    logger.info('Query: \'%s\' for the last %d hour(s)', query, period / 3600)

    return source.query_by_string(query, fields, limit)
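
# A hypothetical call to the helper above (the query string and index prefix are
# illustrative assumptions):
entries = get_log_entries(
    query='severity: "error"',
    period=3600,
    fields=['@message'],
    limit=500,
    index_prefix='logstash-other'
)
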
Example #7
    def client(self):
        """
        Connect to elasticsearch lazily

        :rtype: ElasticsearchQuery
        """
        if not self._client:
            self.logger.info(
                'Setting up elasticsearch client for %s host ("%s" index)',
                self._server, self._index)
            self._client = ElasticsearchQuery(es_host=self._server,
                                              period=self._period,
                                              index_prefix=self._index)

        return self._client
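
# A hypothetical skeleton of the class around client() above, showing the
# lazy-initialisation pattern it relies on (attribute names are inferred from the
# snippet, everything else is an illustrative assumption):
import logging

class LazyEsSource:  # hypothetical
    def __init__(self, server, index, period):
        self.logger = logging.getLogger(self.__class__.__name__)
        self._server = server
        self._index = index
        self._period = period
        self._client = None  # built on first access via client()
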
Example #8
def get_flow_graph(limit, period):
    """
    :type limit int
    :type period int
    :rtype: list[dict]
    """
    rows = ElasticsearchQuery(
        es_host=ELASTICSEARCH_HOST,
        period=period,
        index_prefix='logstash-other'
    ).query_by_string(
        query='kubernetes.labels.job-name:* AND '
              'kubernetes.container_name: "portability-metric" AND ("SELECT" OR "UPDATE")',
        fields=[
            'log',
            'kubernetes.labels.job-name'
        ],
        limit=limit
    )

    entries = []

    for row in rows:
        for entry in get_portability_metrics_query(
                row['log'], row['kubernetes']['labels']['job-name']):
            entries.append(entry)

    # print(entries)

    # process the logs
    def _map(item):
        return '{}'.join(item)

    def _reduce(items):
        #  ('MetricArticleProvider.py', 'UPDATE', 'articledata')
        first = items[0]

        script = 'cron:{}'.format(first[0])
        query_type = first[1]
        table_name = 'db:{}'.format(first[2])

        return {
            'source': table_name if query_type == 'SELECT' else script,
            'edge': query_type,
            'target': table_name if query_type != 'SELECT' else script,
        }

    return logs_map_and_reduce(entries, _map, _reduce)
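
# get_portability_metrics_query is not defined in this snippet. A rough sketch based
# only on how it is called above and on the sample tuple shown in _reduce
# (('MetricArticleProvider.py', 'UPDATE', 'articledata')) -- the regular expressions
# and the use of the job name as the script identifier are assumptions:
import re

def get_portability_metrics_query_sketch(log, job_name):
    """Yield (script, query_type, table) tuples found in a single log line."""
    match = re.search(r'\b(SELECT)\b.+?\bFROM\s+`?(\w+)`?', log, re.IGNORECASE) \
        or re.search(r'\b(UPDATE)\s+`?(\w+)`?', log, re.IGNORECASE)
    if match:
        yield job_name, match.group(1).upper(), match.group(2)
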
Example #9
def get_solr_flow_graph(limit, period):
    """
    :type limit int
    :type period int
    :rtype: list[dict]
    """
    rows = ElasticsearchQuery(
        es_host=ELASTICSEARCH_HOST,
        period=period,
        index_prefix='logstash-solr'
    ).query_by_string(
        query='@source_host.keyword: /search-s.*/ AND @message: "webapp"',
        fields=[
            '@message',
        ],
        limit=limit
    )

    # extract required fields only
    # core name and method name
    rows = [
        (
            get_solr_core_name(row.get('@message')),
            str(get_solr_parameters(row.get('@message')).get('path', '')).strip('/'),
        )
        for row in rows
    ]

    # process the logs
    def _map(item):
        return '{}'.join(item)

    def _reduce(items):
        first = items[0]
        index = first[0]
        method = first[1]
        client = 'client'  # add a user agent to the logs and identify the client based on it

        return {
            'source': 'solr:{}'.format(index) if method == 'select' else 'indexer',
            'edge': 'http',
            'target': 'solr:{}'.format(index) if method != 'select' else client,
            # the following is optional
            'metadata': '{:.3f} /{} reqs per sec'.format(1. * len(items) / period, method)
        }

    return logs_map_and_reduce(rows, _map, _reduce)
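
# get_solr_core_name and get_solr_parameters are not defined in this snippet.
# Rough sketches, assuming the usual Solr request log format, e.g.
#   "[main_index] webapp=/solr path=/select params={q=foo&wt=json} hits=1 status=0 QTime=3"
# (the regular expressions below are illustrative assumptions):
import re

def get_solr_core_name_sketch(message):
    match = re.search(r'\[([^\]]+)\]', str(message))
    return match.group(1) if match else ''

def get_solr_parameters_sketch(message):
    """Return a dict holding e.g. the request 'path' ('/select', '/update', ...)."""
    match = re.search(r'path=(\S+)', str(message))
    return {'path': match.group(1)} if match else {}
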
def get_mobile_apps_flow_graph(limit, period):
    """
    :type limit int
    :type period int
    :rtype: list[dict]
    """
    rows = ElasticsearchQuery(
        es_host=ELASTICSEARCH_HOST,
        period=period,
        index_prefix='logstash-apache-access-log'
    ).query_by_string(
        query='(agent: "Android" OR agent: "iOS") AND NOT agent: "Chrome" '
              'AND @source_host.keyword: /ap-s.*/',
        fields=[
            'agent',
            'request',
        ],
        limit=limit
    )

    # extract the request URL only
    # and filter out non-mobile app requests
    rows = [
        normalize_mediawiki_url(row.get('request'))
        for row in rows if is_mobile_app_user_agent(row.get('agent'))
    ]

    # process the logs
    def _map(item):
        return item

    def _reduce(items):
        target = items[0]

        return {
            'source': 'mobile-app',
            'edge': 'http',
            'target': target,
            # the following is optional
            'metadata': '{:.3f} reqs per sec'.format(1. * len(items) / period)
        }

    return logs_map_and_reduce(rows, _map, _reduce)
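
# is_mobile_app_user_agent is not defined in this snippet. A naive sketch that
# mirrors the intent of the query above (Android/iOS agents minus mobile Chrome) --
# an assumption, not the original helper:
def is_mobile_app_user_agent_sketch(agent):
    agent = str(agent)
    return ('Android' in agent or 'iOS' in agent) and 'Chrome' not in agent
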
def get_celery_tasks_flow_graph(limit, period):
    """
    :type limit int
    :type period int
    :rtype: list[dict]
    """
    # @see https://kibana5.wikia-inc.com/goto/d877bf3caf4204b9b5fdc5f8864f4ce2
    rows = ElasticsearchQuery(
        es_host=ELASTICSEARCH_HOST,
        period=period,
        index_prefix='logstash-mediawiki'
    ).query_by_string(
        query='@message: "BaseTask::execute" AND @fields.datacenter: "sjc" '
              'AND @fields.environment: "prod"',
        fields=[
            '@context.task_call',
        ],
        limit=limit
    )

    # extract the task type
    rows = [
        row.get('@context', {}).get('task_call')
        for row in rows
    ]

    # process the logs
    def _map(item):
        return item

    def _reduce(items):
        target = items[0]

        return {
            'source': 'celery',
            'edge': 'http',
            'target': 'task:{}'.format(target),
            # the following is optional
            'metadata': '{:.3f} calls per minute'.format(60. * len(items) / period)
        }

    return logs_map_and_reduce(rows, _map, _reduce)
def test_indexes():
    es_query = ElasticsearchQuery(es_host='foo')
    assert es_query._index.startswith('logstash-')
def check_time(since, expected_since, expected_to, period):
    es_query = ElasticsearchQuery('foo.host.net', since, period)

    assert es_query._since == expected_since
    assert es_query.get_to_timestamp() == expected_to
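
# check_time is a parametrised helper used by the actual test cases; a hypothetical
# invocation (the expectation that an explicit `since` is kept as-is and that the
# "to" timestamp equals since + period is an assumption about the library's behaviour):
def test_explicit_since():
    check_time(since=10000, expected_since=10000, expected_to=10000 + 3600, period=3600)
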
def test_indexes_prefix_with_separator():
    es_query = ElasticsearchQuery(es_host='foo',
                                  index_prefix='syslog-ng',
                                  index_sep="_")
    assert es_query._index.startswith('syslog-ng_')
    assert ',syslog-ng_' in es_query._index
def test_indexes_prefix():
    es_query = ElasticsearchQuery(es_host='foo', index_prefix='syslog-ng')
    assert es_query._index.startswith('syslog-ng-')