def test_duration_analysis(self):
    """Test that the duration analysis study works correctly"""

    study, ocean_backend, enrich_backend = self._test_study('enrich_duration_analysis')
    with self.assertLogs(logger, level='INFO') as cm:
        if study.__name__ == "enrich_duration_analysis":
            study(ocean_backend, enrich_backend,
                  start_event_type="UnlabeledEvent",
                  target_attr="label",
                  fltr_attr="label",
                  fltr_event_types=["LabeledEvent"])

        self.assertEqual(cm.output[0],
                         'INFO:grimoire_elk.enriched.githubql:[githubql] Duration analysis '
                         'starting study %s/test_githubql_enrich' % anonymize_url(self.es_con))
        self.assertEqual(cm.output[-1],
                         'INFO:grimoire_elk.enriched.githubql:[githubql] Duration analysis '
                         'ending study %s/test_githubql_enrich' % anonymize_url(self.es_con))

    time.sleep(5)  # HACK: Wait until github enrich index has been written
    items = [item for item in enrich_backend.fetch() if item['event_type'] == 'UnlabeledEvent']
    self.assertEqual(len(items), 1)
    for item in items:
        self.assertEqual(item['previous_event_uuid'], 'f371d54454d297f86f08ab52a440ae5f9e4afeb1')
        self.assertEqual(item['duration_from_previous_event'], 2.0)
def __init__(self, url, index, mappings=None, clean=False,
             insecure=True, analyzers=None, aliases=None):
    """Class to handle the operations with the ElasticSearch database,
    such as creating indexes, mappings, setting up aliases and
    uploading documents.

    :param url: ES url
    :param index: index name
    :param mappings: an instance of the Mapping class
    :param clean: if True, deletes an existing index and creates it again
    :param insecure: support https with invalid certificates
    :param analyzers: analyzers for ElasticSearch
    :param aliases: list of aliases, defined as strings, to be added to the index
    """
    # Get major version of Elasticsearch instance
    self.major = self.check_instance(url, insecure)
    logger.debug("Found version of ES instance at {}: {}.".format(anonymize_url(url), self.major))

    self.url = url
    # Valid index for elastic
    self.index = self.safe_index(index)
    self.aliases = aliases
    self.index_url = self.url + "/" + self.index
    self.wait_bulk_seconds = 2  # time to wait to complete a bulk operation
    self.requests = grimoire_con(insecure)

    analyzer_settings = None
    if analyzers:
        analyzers_dict = analyzers.get_elastic_analyzers(es_major=self.major)
        analyzer_settings = analyzers_dict['items']

    self.create_index(analyzer_settings, clean)
    if analyzers:
        self.update_analyzers(analyzer_settings)

    if mappings:
        map_dict = mappings.get_elastic_mappings(es_major=self.major)
        self.create_mappings(map_dict)

    if aliases:
        for alias in aliases:
            if self.alias_in_use(alias):
                logger.debug("Alias {} won't be set on {}, it already exists on {}".format(
                             alias, anonymize_url(self.index_url), anonymize_url(self.url)))
                continue
            self.add_alias(alias)
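
# Illustrative usage sketch (not part of the original module): how a caller would typically
# build this handler. The endpoint, index name and alias below are hypothetical, and it
# assumes the class is importable as grimoire_elk.elastic.ElasticSearch with an ES instance
# reachable at the given URL.
def _example_create_handler():
    from grimoire_elk.elastic import ElasticSearch

    es = ElasticSearch(url="http://localhost:9200",   # hypothetical ES endpoint
                       index="git_demo_raw",          # hypothetical index name
                       clean=False,                   # reuse the index if it already exists
                       insecure=True,                 # accept self-signed certificates
                       aliases=["git_demo"])          # alias added right after index creation
    return es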
def create_index(self, analyzers=None, clean=False):
    """Create an index. If clean is `True`, the target index will be
    deleted and recreated.

    :param analyzers: set index analyzers
    :param clean: if True, the index is deleted and recreated
    """
    res = self.requests.get(self.index_url)
    headers = {"Content-Type": "application/json"}

    if res.status_code != 200:
        # The index does not exist, create it
        res = self.requests.put(self.index_url, data=analyzers, headers=headers)
        if res.status_code != 200:
            msg = "Can't create index {} ({})".format(anonymize_url(self.index_url), res.status_code)
            logger.error(msg)
            raise ElasticError(cause=msg)
        else:
            logger.info("Created index {}".format(anonymize_url(self.index_url)))
    else:
        if clean:
            res = self.requests.delete(self.index_url)
            res.raise_for_status()
            res = self.requests.put(self.index_url, data=analyzers, headers=headers)
            res.raise_for_status()
            logger.info("Deleted and created index {}".format(anonymize_url(self.index_url)))
def add_alias(self, alias):
    """Add an alias to the index set in the elastic obj

    :param alias: alias to add

    :returns: None
    """
    aliases = self.list_aliases()

    alias_dict = alias
    if isinstance(alias, str):
        alias_dict = {"alias": alias}

    if aliases and alias_dict['alias'] in aliases:
        logger.debug("Alias {} already exists on {}.".format(
                     alias_dict['alias'], anonymize_url(self.index_url)))
        return

    # add alias
    alias_dict['index'] = self.index
    alias_action = {"actions": [{"add": alias_dict}]}
    r = self.requests.post(self.url + "/_aliases",
                           headers=HEADER_JSON,
                           verify=False,
                           data=json.dumps(alias_action))
    try:
        r.raise_for_status()
    except requests.exceptions.HTTPError as ex:
        logger.warning("Something went wrong when adding an alias on {}. Alias not set. {}".format(
                       anonymize_url(self.index_url), ex))
        return

    logger.info("Alias {} created on {}.".format(alias, anonymize_url(self.index_url)))
def test_geolocation_study(self):
    """Test that the geolocation study works correctly"""

    study, ocean_backend, enrich_backend = self._test_study('enrich_geolocation')
    with self.assertLogs(logger, level='INFO') as cm:
        if study.__name__ == "enrich_geolocation":
            study(ocean_backend, enrich_backend,
                  location_field="user_location",
                  geolocation_field="user_geolocation")

        self.assertEqual(cm.output[0],
                         'INFO:grimoire_elk.enriched.enrich:[github] Geolocation '
                         'starting study %s/test_github_enrich' % anonymize_url(self.es_con))
        self.assertEqual(cm.output[-1],
                         'INFO:grimoire_elk.enriched.enrich:[github] Geolocation '
                         'end %s/test_github_enrich' % anonymize_url(self.es_con))

    time.sleep(5)  # HACK: Wait until github enrich index has been written
    items = [item for item in enrich_backend.fetch() if 'user_location' in item]
    self.assertEqual(len(items), 4)
    for item in items:
        self.assertIn('user_geolocation', item)
def test_reference_analysis(self):
    """Test that the cross reference study works correctly"""

    study, ocean_backend, enrich_backend = self._test_study('enrich_reference_analysis')
    with self.assertLogs(logger, level='INFO') as cm:
        if study.__name__ == "enrich_reference_analysis":
            study(ocean_backend, enrich_backend)

        self.assertEqual(cm.output[0],
                         'INFO:grimoire_elk.enriched.githubql:[githubql] Cross reference analysis '
                         'starting study %s/test_githubql_enrich' % anonymize_url(self.es_con))
        self.assertEqual(cm.output[-1],
                         'INFO:grimoire_elk.enriched.githubql:[githubql] Cross reference analysis '
                         'ending study %s/test_githubql_enrich' % anonymize_url(self.es_con))

    time.sleep(5)  # HACK: Wait until github enrich index has been written
    referenced_items = []
    for item in enrich_backend.fetch():
        # Check both keys explicitly; `('a' or 'b') in keys` would only test the first one
        if 'referenced_by_issues' in item or 'referenced_by_prs' in item:
            referenced_items.append(item)

    self.assertEqual(len(referenced_items), 7)
    for item in referenced_items:
        self.assertIn('referenced_by_issues', item)
        self.assertIn('referenced_by_prs', item)
        self.assertIn('referenced_by_merged_prs', item)
        self.assertIn('referenced_by_external_issues', item)
        self.assertIn('referenced_by_external_prs', item)
        self.assertIn('referenced_by_external_merged_prs', item)

        ref_issues = item['referenced_by_issues']
        self.assertEqual(len(ref_issues), 1)
        ref = ref_issues[0]
        self.assertEqual(ref, 'https://github.com/valeriocos/test-issues-update/issues/2')

        ref_prs = item['referenced_by_prs']
        self.assertEqual(len(ref_prs), 1)
        ref = ref_prs[0]
        self.assertEqual(ref, 'https://github.com/valeriocos/test-issues-update/pull/3')

        ref_merged_prs = item['referenced_by_merged_prs']
        self.assertEqual(len(ref_merged_prs), 1)
        ref = ref_merged_prs[0]
        self.assertEqual(ref, 'https://github.com/valeriocos/test-issues-update/pull/3')

        ref_ext_issues = item['referenced_by_external_issues']
        self.assertEqual(len(ref_ext_issues), 0)

        ref_ext_prs = item['referenced_by_external_prs']
        self.assertEqual(len(ref_ext_prs), 0)

        ref_ext_merged_prs = item['referenced_by_external_merged_prs']
        self.assertEqual(len(ref_ext_merged_prs), 0)
def delete_items(self, retention_time, time_field="metadata__updated_on"):
    """Delete documents updated before a given date

    :param retention_time: maximum number of minutes wrt the current date to retain the data
    :param time_field: time field used to select the data to delete
    """
    if retention_time is None:
        logger.debug("[items retention] Retention policy disabled, no items will be deleted.")
        return

    if retention_time <= 0:
        logger.debug("[items retention] Minutes to retain must be greater than 0.")
        return

    before_date = get_diff_current_date(minutes=retention_time)
    before_date_str = before_date.isoformat()

    es_query = '''
    {
        "query": {
            "range": {
                "%s": {
                    "lte": "%s"
                }
            }
        }
    }
    ''' % (time_field, before_date_str)

    r = self.requests.post(self.index_url + "/_delete_by_query?refresh",
                           data=es_query, headers=HEADER_JSON, verify=False)
    try:
        r.raise_for_status()
        r_json = r.json()
        logger.debug("[items retention] {} items deleted from {} before {}.".format(
                     r_json['deleted'], anonymize_url(self.index_url), before_date))
    except requests.exceptions.HTTPError as ex:
        logger.error("[items retention] Error deleting items from {}. {}".format(
                     anonymize_url(self.index_url), ex))
        return
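
# Illustrative sketch (an assumption, not in the original source): applying the retention
# policy above. The 30-day figure is made up; `es` stands for an ElasticSearch handler built
# as in _example_create_handler().
def _example_apply_retention(es):
    # Remove every document whose metadata__updated_on is older than 30 days
    es.delete_items(retention_time=60 * 24 * 30)

    # Passing None disables the policy: nothing is deleted
    es.delete_items(retention_time=None)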
def check_instance(url, insecure):
    """Checks if there is an instance of Elasticsearch in url.

    Actually, it checks if GET on the url returns a JSON document
    with a field tagline "You know, for search",
    and a field version.number.

    :param url: url of the instance to check
    :param insecure: don't verify ssl connection (boolean)

    :returns: major version of Elasticsearch, as string.
    """
    res = grimoire_con(insecure).get(url)
    if res.status_code != 200:
        msg = "Got {} from url {}".format(res.status_code, url)
        logger.error(msg)
        raise ElasticError(cause=msg)
    else:
        try:
            version_str = res.json()['version']['number']
            version_major = version_str.split('.')[0]
            return version_major
        except Exception:
            msg = "Could not read proper welcome message from url {}, {}".format(
                  anonymize_url(url), res.text)
            logger.error(msg)
            raise ElasticError(cause=msg)
def safe_put_bulk(self, url, bulk_json):
    """Bulk items to a target index `url`. In case of UnicodeEncodeError,
    the bulk is encoded with iso-8859-1.

    :param url: target index where to bulk the items
    :param bulk_json: str representation of the items to upload
    """
    headers = {"Content-Type": "application/x-ndjson"}

    try:
        res = self.requests.put(url + '?refresh=true', data=bulk_json, headers=headers)
        res.raise_for_status()
    except UnicodeEncodeError:
        # Related to body.encode('iso-8859-1'). mbox data
        logger.warning("Encoding error ... converting bulk to iso-8859-1")
        bulk_json = bulk_json.encode('iso-8859-1', 'ignore')
        res = self.requests.put(url, data=bulk_json, headers=headers)
        res.raise_for_status()

    result = res.json()

    failed_items = []
    error = ""
    if result['errors']:
        # Multiple errors may be thrown when inserting bulk data, only the first one is reported
        failed_items = [item['index'] for item in result['items'] if 'error' in item['index']]
        error = str(failed_items[0]['error'])
        logger.error("Failed to insert data to ES: {}, {}".format(error, anonymize_url(url)))

    inserted_items = len(result['items']) - len(failed_items)

    # The exception is currently not thrown to avoid stopping ocean uploading processes
    try:
        if failed_items:
            raise ELKError(cause=error)
    except ELKError:
        pass

    logger.debug("{} items uploaded to ES ({})".format(inserted_items, anonymize_url(url)))
    return inserted_items
def create_mappings(self, mappings):
    """Create the mappings for a given index. It includes the
    index pattern plus dynamic templates.

    :param mappings: elastic_mapping.Mapping object
    """
    headers = {"Content-Type": "application/json"}

    for _type in mappings:
        url_map = self.get_mapping_url(_type)

        # First create the manual mappings
        if mappings[_type] != '{}':
            res = self.requests.put(url_map, data=mappings[_type], headers=headers)
            try:
                res.raise_for_status()
            except requests.exceptions.HTTPError:
                logger.error("Error creating ES mappings {}. Mapping: {}".format(
                             res.text, str(mappings[_type])))

        # After version 6, strings are keywords (not analyzed)
        not_analyze_strings = """
        {
          "dynamic_templates": [
            { "notanalyzed": {
                  "match": "*",
                  "match_mapping_type": "string",
                  "mapping": {
                      "type": "keyword"
                  }
              }
            },
            { "formatdate": {
                  "match": "*",
                  "match_mapping_type": "date",
                  "mapping": {
                      "type": "date",
                      "format": "strict_date_optional_time||epoch_millis"
                  }
              }
            }
          ]
        }
        """
        res = self.requests.put(url_map, data=not_analyze_strings, headers=headers)
        try:
            res.raise_for_status()
        except requests.exceptions.HTTPError:
            logger.error("Can't add mapping {}: {}".format(
                         anonymize_url(url_map), not_analyze_strings))
def test_demography_study(self):
    """Test that the demography study works correctly"""

    study, ocean_backend, enrich_backend = self._test_study('enrich_demography')
    with self.assertLogs(logger, level='INFO') as cm:
        if study.__name__ == "enrich_demography":
            study(ocean_backend, enrich_backend)

        self.assertEqual(cm.output[0],
                         'INFO:grimoire_elk.enriched.enrich:[github] Demography '
                         'starting study %s/test_github_enrich' % anonymize_url(self.es_con))
        self.assertEqual(cm.output[-1],
                         'INFO:grimoire_elk.enriched.enrich:[github] Demography '
                         'end %s/test_github_enrich' % anonymize_url(self.es_con))

    time.sleep(5)  # HACK: Wait until github enrich index has been written
    items = [item for item in enrich_backend.fetch()]
    self.assertEqual(len(items), 7)
    for item in items:
        self.assertNotIn('username:password', item['origin'])
        self.assertNotIn('username:password', item['tag'])
        if 'author_uuid' in item:
            self.assertIn('demography_min_date', item)
            self.assertIn('demography_max_date', item)

    r = enrich_backend.elastic.requests.get(enrich_backend.elastic.index_url + "/_alias",
                                            headers=HEADER_JSON, verify=False)
    self.assertIn(DEMOGRAPHICS_ALIAS, r.json()[enrich_backend.elastic.index]['aliases'])
def list_aliases(self):
    """List aliases linked to the index"""

    # Get the aliases currently set on the index
    r = self.requests.get(self.index_url + "/_alias", headers=HEADER_JSON, verify=False)
    try:
        r.raise_for_status()
    except requests.exceptions.HTTPError as ex:
        logger.warning("Something went wrong when retrieving aliases on {}, {}".format(
                       anonymize_url(self.index_url), ex))
        return

    aliases = r.json()[self.index]['aliases']
    return aliases
def bulk_upload(self, items, field_id):
    """Upload items to ES in controlled packs using the bulk API

    :param items: list of items to be uploaded
    :param field_id: unique ID attribute used to differentiate the items
    """
    current = 0
    new_items = 0  # total items added with bulk
    bulk_json = ""

    if not items:
        return new_items

    url = self.get_bulk_url()

    logger.debug("Adding items to {} (in {} packs)".format(anonymize_url(url), self.max_items_bulk))
    task_init = time()

    for item in items:
        if current >= self.max_items_bulk:
            task_init = time()
            new_items += self.safe_put_bulk(url, bulk_json)
            current = 0
            json_size = sys.getsizeof(bulk_json) / (1024 * 1024)
            logger.debug("bulk packet sent ({:.2f} sec, {} total, {:.2f} MB)".format(
                         time() - task_init, new_items, json_size))
            bulk_json = ""
        data_json = json.dumps(item)
        bulk_json += '{{"index" : {{"_id" : "{}" }} }}\n'.format(item[field_id])
        bulk_json += data_json + "\n"  # Bulk document
        current += 1

    if current > 0:
        new_items += self.safe_put_bulk(url, bulk_json)
        json_size = sys.getsizeof(bulk_json) / (1024 * 1024)
        logger.debug("bulk packet sent ({:.2f} sec prev, {} total, {:.2f} MB)".format(
                     time() - task_init, new_items, json_size))

    return new_items
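
# Illustrative sketch (an assumption, not in the original source): feeding bulk_upload with
# items keyed by a unique field. The documents and the 'uuid' field are made up; `es` stands
# for an ElasticSearch handler built as in _example_create_handler().
def _example_bulk_upload(es):
    items = [
        {"uuid": "doc-1", "title": "first document"},
        {"uuid": "doc-2", "title": "second document"},
    ]
    # Each item is indexed with _id = item['uuid']; the return value counts the documents
    # ES accepted (failures are only logged, see safe_put_bulk above).
    uploaded = es.bulk_upload(items, field_id="uuid")
    return uploaded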
def all_es_aliases(self):
    """List all aliases used in ES"""

    r = self.requests.get(self.url + "/_aliases", headers=HEADER_JSON, verify=False)
    try:
        r.raise_for_status()
    except requests.exceptions.HTTPError as ex:
        logger.warning("Something went wrong when retrieving aliases on {}, {}".format(
                       anonymize_url(self.index_url), ex))
        return

    aliases = []
    for index in r.json().keys():
        aliases.extend(list(r.json()[index]['aliases'].keys()))

    aliases = list(set(aliases))
    return aliases
def all_properties(self):
    """Get all properties of a given index"""

    url = self.get_mapping_url(_type='items')
    r = self.requests.get(url, headers=HEADER_JSON, verify=False)
    try:
        r.raise_for_status()
        r_json = r.json()
        # ES 7.x
        properties = r_json[self.index]['mappings'].get('properties', {})
        # ES 6.x
        if not properties:
            items_mapping = r_json[self.index]['mappings'].get('items', {})
            properties = items_mapping.get('properties', {}) if items_mapping else {}
    except requests.exceptions.HTTPError as ex:
        logger.error("Error getting all attributes for {}. {}".format(
                     anonymize_url(self.index_url), ex))
        return

    return properties
def get_last_item_field(self, field, filters_=[], offset=False):
    """Find the offset/date of the last item stored in the index.

    :param field: field with the data
    :param filters_: additional filters to find the date
    :param offset: if True, returns the offset field instead of date field
    """
    last_value = None

    url = self.index_url
    url += "/_search"

    if filters_ is None:
        filters_ = []

    terms = []
    for filter_ in filters_:
        if not filter_:
            continue
        term = '''{"term" : { "%s" : "%s"}}''' % (filter_['name'], filter_['value'])
        terms.append(term)

    data_query = '''"query": {"bool": {"filter": [%s]}},''' % (','.join(terms))

    data_agg = '''
        "aggs": {
            "1": {
              "max": {
                "field": "%s"
              }
            }
        }
    ''' % field

    data_json = '''
    { "size": 0, %s  %s
    } ''' % (data_query, data_agg)

    logger.debug("{} {}".format(anonymize_url(url), data_json))
    headers = {"Content-Type": "application/json"}

    res = self.requests.post(url, data=data_json, headers=headers)
    res.raise_for_status()
    res_json = res.json()

    if 'aggregations' in res_json:
        last_value = res_json["aggregations"]["1"]["value"]

        if offset:
            if last_value is not None:
                last_value = int(last_value)
        else:
            if "value_as_string" in res_json["aggregations"]["1"]:
                last_value = res_json["aggregations"]["1"]["value_as_string"]
                last_value = str_to_datetime(last_value)
            else:
                last_value = res_json["aggregations"]["1"]["value"]
                if last_value:
                    try:
                        last_value = unixtime_to_datetime(last_value)
                    except InvalidDateError:
                        # last_value is in microsecs
                        last_value = unixtime_to_datetime(last_value / 1000)

    return last_value
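
# Illustrative sketch (an assumption, not in the original source): resuming incremental
# collection from the newest item already stored. The field names and the origin URL filter
# are made up; `es` stands for an ElasticSearch handler built as in _example_create_handler().
def _example_last_update(es):
    # Newest update date across the whole index, returned as a datetime
    last_date = es.get_last_item_field("metadata__updated_on")

    # Newest offset for a single origin, returned as an integer
    last_offset = es.get_last_item_field("offset",
                                         filters_=[{"name": "origin",
                                                    "value": "https://github.com/chaoss/grimoirelab-elk"}],
                                         offset=True)
    return last_date, last_offset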