def test_get_timestamp_filer():
    es_query = ElasticsearchQuery(es_host='foo', since=123456, period=60)

    res = es_query._get_timestamp_filer()
    print(res)

    assert res['range']['@timestamp'] is not None
    assert res['range']['@timestamp']['gte'] == '1970-01-02T10:17:37.000Z'
    assert res['range']['@timestamp']['lte'] is not None
def test_format_index():
    assert ElasticsearchQuery.format_index(
        prefix='logstash', timestamp=1) == 'logstash-1970.01.01'
    assert ElasticsearchQuery.format_index(
        prefix='logstash', timestamp=1408450795) == 'logstash-2014.08.19'
    assert ElasticsearchQuery.format_index(
        prefix='logstash-foo', timestamp=1408450795) == 'logstash-foo-2014.08.19'
    assert ElasticsearchQuery.format_index(
        prefix='syslog-ng', timestamp=1408450795, sep="_") == 'syslog-ng_2014.08.19'
def get_pandora_flow_graph(limit, period):
    """
    :type limit int
    :type period int
    :rtype: list[dict]
    """
    # https://kibana.wikia-inc.com/goto/3aef04fa1f9e55df5cc4c3031671ecab
    # k8s-ingress access logs, internal traffic
    rows = ElasticsearchQuery(
        es_host=ELASTICSEARCH_HOST,
        period=period,
        index_prefix='logstash-k8s-ingress-controller'
    ).query_by_string(
        query='NOT request_Fastly-Client-Ip: * AND request_User-Agent: * '
              'AND RequestHost: "prod.sjc.k8s.wikia.net"',
        fields=[
            'request_User-Agent',
            'RequestPath',
        ],
        limit=limit
    )

    # extract required fields only
    # ('mediawiki', 'pandora:helios::info')
    # ('swagger-codegen', 'pandora:user-attribute::user')
    # ('node-fetch', 'pandora:discussion::threads')
    rows = [
        (
            str(row.get('request_User-Agent')).split('/')[0].lower(),
            normalize_pandora_url(row.get('RequestPath')),
        )
        for row in rows
    ]

    # process the logs
    def _map(item):
        return '{}-{}'.format(item[0], item[1])

    def _reduce(items):
        first = items[0]
        source = first[0]
        target = first[1]

        # normalize the source
        if source == 'swagger-codegen':
            source = 'mediawiki'
        elif source == 'node-fetch':
            source = 'mobile-wiki'

        return {
            'source': source,
            'edge': 'http',
            'target': target,
            # the following is optional
            'metadata': '{:.3f} reqs per sec'.format(1. * len(items) / period)
        }

    return logs_map_and_reduce(rows, _map, _reduce)
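# A minimal sketch (not part of the original code) of the shape of a single
# edge dictionary produced by the _reduce callback above; the concrete values
# below are illustrative placeholders, not real log data.
example_pandora_edge = {
    'source': 'mediawiki',              # normalized client taken from request_User-Agent
    'edge': 'http',
    'target': 'pandora:helios::info',   # normalized Pandora endpoint (see normalize_pandora_url)
    'metadata': '0.125 reqs per sec',   # len(items) / period, formatted to three decimal places
}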
def get_mediawiki_flow_graph(limit, period):
    """
    :type limit int
    :type period int
    :rtype: list[dict]
    """
    # https://kibana5.wikia-inc.com/goto/e6ab16f694b625d5b87833ae794f5989
    # goreplay is running in RES (check SJC logs only)
    rows = ElasticsearchQuery(
        es_host=ELASTICSEARCH_HOST,
        period=period,
        index_prefix='logstash-mediawiki'
    ).query_by_string(
        query='"Wikia internal request" AND @fields.environment: "prod" '
              'AND @fields.datacenter: "sjc" '
              'AND @fields.http_url_path: *',
        fields=[
            '@context.source',
            '@fields.http_url_path',
        ],
        limit=limit
    )

    # extract required fields only
    # (u'user-permissions', 'api:query::users')
    # (u'1', 'nirvana:EmailControllerDiscussionReply::handle')
    rows = [
        (
            row.get('@context', {})['source'],
            normalize_mediawiki_url(row.get('@fields', {})['http_url_path'])
        )
        for row in rows
        if row.get('@context', {}).get('source') is not None
    ]

    # process the logs
    def _map(item):
        return '{}-{}'.format(item[0], item[1])

    def _reduce(items):
        first = items[0]
        source = first[0]
        target = first[1]

        return {
            'source': source if source != '1' else 'internal',
            'edge': 'http',
            'target': target,
            # the following is optional
            'metadata': '{:.3f} reqs per sec'.format(1. * len(items) / period)
        }

    return logs_map_and_reduce(rows, _map, _reduce)
def fetch(self):
    logger = logging.getLogger(__name__)

    while True:
        es = ElasticsearchQuery(es_host=self.ELASTICSEARCH_HOST,
                                period=self.INTERVAL,
                                index_prefix=self.ES_INDEX)

        res = es.query_by_string(self.QUERY, fields=self.FIELDS, limit=self.BATCH)
        urls = map(self.format_log_entry, res)

        for url in urls:
            if self.filter_out(url):
                logger.info('Filtered out <%s>', url)
                continue

            yield url

        time.sleep(self.INTERVAL)
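# A hedged usage sketch (assumed names: consume_urls, crawler, handler) showing
# how the fetch() generator above might be consumed; it is not part of the
# original code.
def consume_urls(crawler, handler):
    # fetch() yields formatted log URLs batch by batch and sleeps for INTERVAL
    # seconds between Elasticsearch queries, so this loop runs until the caller
    # stops iterating
    for url in crawler.fetch():
        handler(url)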
def get_log_entries(query, period, fields, limit, index_prefix='logstash-other'):
    """
    Get log entries from Elasticsearch that match the given query

    :type query str
    :type period int
    :type fields list[str] or None
    :type limit int
    :type index_prefix str
    :rtype tuple
    """
    logger = logging.getLogger('get_log_entries')

    source = ElasticsearchQuery(es_host=LOGS_ES_HOST, period=period, index_prefix=index_prefix)

    logger.info('Query: \'%s\' for the last %d hour(s)', query, period / 3600)

    return source.query_by_string(query, fields, limit)
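# A minimal usage sketch for get_log_entries() above; the query string, period
# and limit are illustrative values only, not taken from the original code.
def example_get_log_entries_usage():
    # fetch up to 1000 entries matching the query from the last hour of logs
    return get_log_entries(
        query='@message: "error"',
        period=3600,
        fields=['@message', '@timestamp'],
        limit=1000,
    )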
def client(self):
    """
    Connect to elasticsearch lazily

    :rtype: ElasticsearchQuery
    """
    if not self._client:
        self.logger.info(
            'Setting up elasticsearch client for %s host ("%s" index)',
            self._server, self._index)

        self._client = ElasticsearchQuery(
            es_host=self._server, period=self._period, index_prefix=self._index)

    return self._client
def get_flow_graph(limit, period):
    """
    :type limit int
    :type period int
    :rtype: list[dict]
    """
    rows = ElasticsearchQuery(
        es_host=ELASTICSEARCH_HOST,
        period=period,
        index_prefix='logstash-other'
    ).query_by_string(
        query='kubernetes.labels.job-name:* AND '
              'kubernetes.container_name: "portability-metric" AND ("SELECT" OR "UPDATE")',
        fields=[
            'log',
            'kubernetes.labels.job-name'
        ],
        limit=limit
    )

    entries = []

    for row in rows:
        for entry in get_portability_metrics_query(
                row['log'], row['kubernetes']['labels']['job-name']):
            entries.append(entry)

    # print(entries)

    # process the logs
    def _map(item):
        return '{}'.join(item)

    def _reduce(items):
        # ('MetricArticleProvider.py', 'UPDATE', 'articledata')
        first = items[0]
        script = 'cron:{}'.format(first[0])
        query_type = first[1]
        table_name = 'db:{}'.format(first[2])

        return {
            'source': table_name if query_type == 'SELECT' else script,
            'edge': query_type,
            'target': table_name if query_type != 'SELECT' else script,
        }

    return logs_map_and_reduce(entries, _map, _reduce)
def get_solr_flow_graph(limit, period):
    """
    :type limit int
    :type period int
    :rtype: list[dict]
    """
    rows = ElasticsearchQuery(
        es_host=ELASTICSEARCH_HOST,
        period=period,
        index_prefix='logstash-solr'
    ).query_by_string(
        query='@source_host.keyword: /search-s.*/ AND @message: "webapp"',
        fields=[
            '@message',
        ],
        limit=limit
    )

    # extract required fields only
    # core name and method name
    rows = [
        (
            get_solr_core_name(row.get('@message')),
            str(get_solr_parameters(row.get('@message')).get('path', '')).strip('/'),
        )
        for row in rows
    ]

    # process the logs
    def _map(item):
        return '{}'.join(item)

    def _reduce(items):
        first = items[0]
        index = first[0]
        method = first[1]

        client = 'client'  # add a user agent to the logs and identify the client based on it

        return {
            'source': 'solr:{}'.format(index) if method == 'select' else 'indexer',
            'edge': 'http',
            'target': 'solr:{}'.format(index) if method != 'select' else client,
            # the following is optional
            'metadata': '{:.3f} /{} reqs per sec'.format(1. * len(items) / period, method)
        }

    return logs_map_and_reduce(rows, _map, _reduce)
def get_mobile_apps_flow_graph(limit, period):
    """
    :type limit int
    :type period int
    :rtype: list[dict]
    """
    rows = ElasticsearchQuery(
        es_host=ELASTICSEARCH_HOST,
        period=period,
        index_prefix='logstash-apache-access-log'
    ).query_by_string(
        query='(agent: "Android" OR agent: "iOS") AND NOT agent: "Chrome" '
              'AND @source_host.keyword: /ap-s.*/',
        fields=[
            'agent',
            'request',
        ],
        limit=limit
    )

    # extract the request URL only
    # and filter out non-mobile app requests
    rows = [
        normalize_mediawiki_url(row.get('request'))
        for row in rows
        if is_mobile_app_user_agent(row.get('agent'))
    ]

    # process the logs
    def _map(item):
        return item

    def _reduce(items):
        target = items[0]

        return {
            'source': 'mobile-app',
            'edge': 'http',
            'target': target,
            # the following is optional
            'metadata': '{:.3f} reqs per sec'.format(1. * len(items) / period)
        }

    return logs_map_and_reduce(rows, _map, _reduce)
def get_celery_tasks_flow_graph(limit, period):
    """
    :type limit int
    :type period int
    :rtype: list[dict]
    """
    # @see https://kibana5.wikia-inc.com/goto/d877bf3caf4204b9b5fdc5f8864f4ce2
    rows = ElasticsearchQuery(
        es_host=ELASTICSEARCH_HOST,
        period=period,
        index_prefix='logstash-mediawiki'
    ).query_by_string(
        query='@message: "BaseTask::execute" AND @fields.datacenter: "sjc" '
              'AND @fields.environment: "prod"',
        fields=[
            '@context.task_call',
        ],
        limit=limit
    )

    # extract the task type
    rows = [
        row.get('@context').get('task_call')
        for row in rows
    ]

    # process the logs
    def _map(item):
        return item

    def _reduce(items):
        target = items[0]

        return {
            'source': 'celery',
            'edge': 'http',
            'target': 'task:{}'.format(target),
            # the following is optional
            'metadata': '{:.3f} calls per minute'.format(60. * len(items) / period)
        }

    return logs_map_and_reduce(rows, _map, _reduce)
def test_indexes():
    es_query = ElasticsearchQuery(es_host='foo')
    assert es_query._index.startswith('logstash-')
def check_time(since, expected_since, expected_to, period):
    es_query = ElasticsearchQuery('foo.host.net', since, period)

    assert es_query._since == expected_since
    assert es_query.get_to_timestamp() == expected_to
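# A hedged sketch (not from the original suite) of how the check_time() helper
# above might be invoked; the assumed semantics are noted inline and may need
# adjusting to match the library's actual behaviour.
def test_check_time_example():
    # assumption: an explicit "since" is kept as-is and the "to" timestamp
    # equals since + period
    check_time(since=1408450795, expected_since=1408450795,
               expected_to=1408450795 + 60, period=60)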
def test_indexes_prefix_with_separator():
    es_query = ElasticsearchQuery(es_host='foo', index_prefix='syslog-ng', index_sep="_")
    assert es_query._index.startswith('syslog-ng_')
    assert ',syslog-ng_' in es_query._index
def test_indexes_prefix():
    es_query = ElasticsearchQuery(es_host='foo', index_prefix='syslog-ng')
    assert es_query._index.startswith('syslog-ng-')