def __init_index(elastic_url, index, wait_time):
    """Wait until the ElasticSearch index is ready and return the client.

    Retries the connection every `wait_time` seconds until the index can
    be created/opened on the server.

    :param elastic_url: URL of the ElasticSearch server
    :param index: name of the index to initialize
    :param wait_time: seconds to sleep between connection attempts
    :returns: an ElasticSearch client bound to `index`
    """
    mapping = Mapping

    while True:
        try:
            elastic = ElasticSearch(elastic_url, index, mappings=mapping)
            break
        except Exception as e:
            # Log the `index` argument, not the global ARGS.index, so the
            # message always matches the index actually being initialized.
            logging.info("Index %s not ready: %s", index, str(e))
            time.sleep(wait_time)

    return elastic
def test_get_last_date(self):
    """Test whether the last date is correctly returned"""
    items = json.loads(read_file('data/git.json'))
    elastic = ElasticSearch(self.es_con, self.target_index, GitOcean.mapping)
    new_items = elastic.bulk_upload(items, field_id="uuid")
    self.assertEqual(new_items, 11)

    # no filter
    last_date = elastic.get_last_date('updated_on')
    self.assertEqual(last_date.isoformat(), '2019-10-01T18:05:52+00:00')

    # filter including all items
    fltr = {
        'name': 'origin',
        'value': '/tmp/perceval_mc84igfc/gittest'
    }
    last_date = elastic.get_last_date('updated_on', filters_=[fltr])
    self.assertEqual(last_date.isoformat(), '2014-02-12T06:11:12+00:00')

    # filter including a sub-set of items
    fltr = {
        'name': 'perceval_version',
        'value': '0.9.11'
    }
    last_date = elastic.get_last_date('updated_on', filters_=[fltr])
    self.assertEqual(last_date.isoformat(), '2014-02-12T06:09:04+00:00')
def test_get_last_offset(self):
    """Test whether the last offset is correctly returned"""
    items = json.loads(read_file('data/kitsune.json'))
    elastic = ElasticSearch(self.es_con, self.target_index, KitsuneOcean.mapping)
    new_items = elastic.bulk_upload(items, field_id="uuid")
    self.assertEqual(new_items, 4)

    # no filter
    last_offset = elastic.get_last_offset('offset')
    self.assertEqual(last_offset, 3)

    # filter including all items
    fltr = {
        'name': 'origin',
        'value': 'http://example.com'
    }
    last_offset = elastic.get_last_offset('offset', filters_=[fltr])
    self.assertEqual(last_offset, 3)

    # filter including a sub-set of items
    fltr = {
        'name': 'perceval_version',
        'value': '0.9.11'
    }
    last_offset = elastic.get_last_offset('offset', filters_=[fltr])
    self.assertEqual(last_offset, 1)
def create_index_pattern(elastic_url, dashboard, enrich_index, es_index=None):
    """ Create an index pattern using as template the index pattern
        in the dashboard template vis

    :param elastic_url: URL for ElasticSearch (ES) server
    :param dashboard: kibana dashboard to be used as template
    :param enrich_index: ES enriched index used in the new dashboard
    :param es_index: kibana index (defaults to ".kibana")
    :returns: the title of the new index pattern (enrich_index)
    :raises RuntimeError: when no vis or index pattern can be found
    """
    index_pattern = None

    if not es_index:
        es_index = ".kibana"

    elastic = ElasticSearch(elastic_url, es_index)

    dash_data = get_dashboard_json(elastic, dashboard)

    # First vis
    if "panelsJSON" not in dash_data:
        logger.error("Can not find vis in dashboard: %s", dashboard)
        # A bare `raise` here has no active exception to re-raise and would
        # fail with an unrelated RuntimeError; raise an explicit one instead.
        raise RuntimeError("Can not find vis in dashboard: %s" % dashboard)

    # Get the index pattern from the first vis in the panel
    # that has index pattern data
    for panel in json.loads(dash_data["panelsJSON"]):
        panel_id = panel["id"]
        logger.debug("Checking index pattern in %s vis", panel_id)

        index_pattern = get_index_pattern_from_vis(elastic, panel_id)
        if index_pattern:
            break

    # And now time to create the index pattern found
    if not index_pattern:
        logger.error("Can't find index pattern for %s", dashboard)
        raise RuntimeError("Can't find index pattern for %s" % dashboard)

    logger.debug("Found %s template index pattern", index_pattern)

    new_index_pattern_json = get_index_pattern_json(elastic, index_pattern)

    new_index_pattern_json['title'] = enrich_index
    url = elastic.index_url + "/index-pattern/" + enrich_index
    headers = {"Content-Type": "application/json"}
    res = requests_ses.post(url, data=json.dumps(new_index_pattern_json),
                            verify=False, headers=headers)
    res.raise_for_status()

    logger.debug("New index pattern created: %s", url)

    return enrich_index
def test_safe_put_bulk_errors(self):
    """Test whether an error message is logged when an item isn't inserted"""
    items = json.loads(read_file('data/git.json'))

    # Build a single-item bulk payload whose 'origin' field is far too
    # large to be indexed, so the insertion is rejected.
    oversized = items[0]
    oversized['origin'] = ''.join(random.choice(string.ascii_letters)
                                  for _ in range(66000))

    payload = '{{"index" : {{"_id" : "{}" }} }}\n'.format(oversized['uuid'])
    payload += json.dumps(oversized) + "\n"

    elastic = ElasticSearch(self.es_con, self.target_index, GitOcean.mapping)
    bulk_url = elastic.get_bulk_url()

    with self.assertLogs(logger, level='ERROR') as cm:
        inserted_items = elastic.safe_put_bulk(bulk_url, payload)

    self.assertRegex(cm.output[0],
                     "ERROR:grimoire_elk.elastic:Failed to insert data to ES*")
    self.assertEqual(inserted_items, 0)
def export_items(elastic_url, in_index, out_index, elastic_url_out=None,
                 search_after=False, search_after_value=None, limit=None,
                 copy=False):
    """ Export items from in_index to out_index using the correct mapping

    :param elastic_url: URL of the ElasticSearch server holding in_index
    :param in_index: index to read the items from
    :param out_index: index to write the items to
    :param elastic_url_out: URL of the server holding out_index
        (defaults to elastic_url)
    :param search_after: use search_after pagination instead of scroll
    :param search_after_value: [timestamp, uuid] pair to resume the export from
    :param limit: max number of items per fetch (DEFAULT_LIMIT when None)
    :param copy: reuse the mapping of in_index instead of building one from
        the data sources detected in it
    """

    if not limit:
        limit = DEFAULT_LIMIT

    if search_after_value:
        # Normalize the resume point: the timestamp arrives as a string
        search_after_value_timestamp = int(search_after_value[0])
        search_after_value_uuid = search_after_value[1]
        search_after_value = [search_after_value_timestamp, search_after_value_uuid]

    logging.info("Exporting items from %s/%s to %s", elastic_url, in_index, out_index)

    count_res = requests.get('%s/%s/_count' % (elastic_url, in_index))
    try:
        count_res.raise_for_status()
    except requests.exceptions.HTTPError:
        # Abort the whole export if the source index can't even be counted
        if count_res.status_code == 404:
            logging.error("The index does not exists: %s", in_index)
        else:
            logging.error(count_res.text)
        sys.exit(1)

    logging.info("Total items to copy: %i", count_res.json()['count'])

    # Time to upload the items with the correct mapping
    elastic_in = ElasticSearch(elastic_url, in_index)
    if not copy:
        # Create the correct mapping for the data sources detected from in_index
        ds_mapping = find_mapping(elastic_url, in_index)
    else:
        logging.debug('Using the input index mapping')
        ds_mapping = extract_mapping(elastic_url, in_index)

    if not elastic_url_out:
        elastic_out = ElasticSearch(elastic_url, out_index, mappings=ds_mapping)
    else:
        elastic_out = ElasticSearch(elastic_url_out, out_index, mappings=ds_mapping)

    # Time to just copy from in_index to our_index
    uid_field = find_uuid(elastic_url, in_index)
    backend = find_perceval_backend(elastic_url, in_index)
    if search_after:
        total = elastic_out.bulk_upload(fetch(elastic_in, backend, limit,
                                              search_after_value, scroll=False),
                                        uid_field)
    else:
        total = elastic_out.bulk_upload(fetch(elastic_in, backend, limit), uid_field)

    logging.info("Total items copied: %i", total)
def test_init_duplicated_aliases(self):
    """Test whether duplicated aliases are ignored"""
    expected_aliases = {'A': {}, 'B': {}}

    # 'A' appears twice on purpose; only one alias must be created
    elastic = ElasticSearch(self.es_con, self.target_index,
                            GitOcean.mapping, aliases=["A", "B", "A"])

    self.assertEqual(elastic.url, self.es_con)
    self.assertEqual(elastic.index, self.target_index)
    self.assertEqual(elastic.index_url, self.es_con + "/" + self.target_index)

    response = elastic.requests.get(elastic.index_url + '/_alias')
    current_aliases = response.json()[self.target_index]['aliases']
    self.assertDictEqual(current_aliases, expected_aliases)
def test_fetch_filter_raw(self):
    """Test whether the fetch with filter raw properly works"""
    perceval_backend = Git('/tmp/perceval_mc84igfc/gittest', '/tmp/foo')
    elastic = ElasticSearch(self.es_con, self.target_index, GitOcean.mapping)

    # Feed the raw index with the git test items
    ocean = GitOcean(perceval_backend)
    ocean.elastic = elastic
    ocean.feed_items(json.loads(read_file('data/git.json')))

    # Restrict the fetch to a single commit through a raw filter
    eitems = ElasticItems(perceval_backend)
    eitems.set_filter_raw("data.commit:87783129c3f00d2c81a3a8e585eb86a47e39891a")
    eitems.elastic = elastic

    fetched = list(eitems.fetch())
    self.assertEqual(len(fetched), 1)
def feed_dashboard(dashboard, elastic_url, es_index=None, data_sources=None,
                   add_vis_studies=False):
    """ Import a dashboard. If data_sources are defined, just include items
        for this data source.

    :param dashboard: dict with the dashboard and (optionally) its searches,
        index patterns and visualizations to import
    :param elastic_url: URL of the ElasticSearch hosting the kibana index
    :param es_index: kibana index (defaults to ".kibana")
    :param data_sources: when set, only items for these data sources are imported
    :param add_vis_studies: also include visualizations belonging to studies
    """

    if not es_index:
        es_index = ".kibana"

    elastic = ElasticSearch(elastic_url, es_index)

    if 'dashboard' in dashboard:
        import_item_json(elastic, "dashboard", dashboard['dashboard']['id'],
                         dashboard['dashboard']['value'], data_sources,
                         add_vis_studies)

    if 'searches' in dashboard:
        for search in dashboard['searches']:
            import_item_json(elastic, "search", search['id'], search['value'],
                             data_sources)

    if 'index_patterns' in dashboard:
        for index in dashboard['index_patterns']:
            # Import the index pattern only when no data source filter is
            # given or the pattern belongs to one of the requested sources
            if not data_sources or \
                    is_index_pattern_from_data_sources(index, data_sources):
                import_item_json(elastic, "index-pattern",
                                 index['id'], index['value'])
            else:
                logger.debug("Index pattern %s not for %s. Not included.",
                             index['id'], data_sources)

    if 'visualizations' in dashboard:
        for vis in dashboard['visualizations']:
            # Study visualizations are skipped unless explicitly requested
            if not add_vis_studies and is_vis_study(vis):
                logger.debug("Vis %s is for an study. Not included.", vis['id'])
            elif not data_sources or is_vis_from_data_sources(vis, data_sources):
                import_item_json(elastic, "visualization",
                                 vis['id'], vis['value'])
            else:
                logger.debug("Vis %s not for %s. Not included.",
                             vis['id'], data_sources)
def test_fetch(self):
    """Test whether the fetch method properly works"""
    perceval_backend = Git('/tmp/perceval_mc84igfc/gittest', '/tmp/foo')
    elastic = ElasticSearch(self.es_con, self.target_index, GitOcean.mapping)

    # Feed the raw index with the git test items
    ocean = GitOcean(perceval_backend)
    ocean.elastic = elastic
    ocean.feed_items(json.loads(read_file('data/git.json')))

    # Use a small scroll size so pagination is exercised
    eitems = ElasticItems(perceval_backend)
    eitems.scroll_size = 2
    eitems.elastic = elastic

    fetched = list(eitems.fetch())
    self.assertEqual(len(fetched), 9)
def get_rich_items(self, item):
    """Return one enriched item per file analysis contained in `item`."""
    # The real data
    commit = item['data']

    enriched = []
    for file_analysis in commit["analysis"]:
        eitem = self.get_rich_item(file_analysis)

        # Copy the raw metadata fields, defaulting to None when absent
        for field in self.RAW_FIELDS_COPY:
            eitem[field] = item[field] if field in item else None

        # common attributes
        eitem['commit_sha'] = commit['commit']
        eitem['author'] = commit['Author']
        eitem['committer'] = commit['Commit']
        eitem['message'] = commit.get('message', None)
        eitem['author_date'] = fix_field_date(commit['AuthorDate'])
        eitem['commit_date'] = fix_field_date(commit['CommitDate'])

        # Other enrichment
        eitem["repo_url"] = item["origin"]
        if eitem["repo_url"].startswith('http'):
            eitem["repo_url"] = ElasticSearch.anonymize_url(eitem["repo_url"])

        if self.prjs_map:
            eitem.update(self.get_item_project(eitem))

        # uuid
        eitem['id'] = "{}_{}".format(eitem['commit_sha'], eitem['file_path'])
        eitem.update(self.get_grimoire_fields(commit["AuthorDate"], "file"))

        self.add_repository_labels(eitem)
        self.add_metadata_filter_raw(eitem)
        enriched.append(eitem)

    return enriched
def test_get_elastic_items(self):
    """Test whether the elastic method works properly"""
    perceval_backend = Git('/tmp/perceval_mc84igfc/gittest', '/tmp/foo')
    elastic = ElasticSearch(self.es_con, self.target_index, GitOcean.mapping)

    # Feed the raw index with the git test items
    ocean = GitOcean(perceval_backend)
    ocean.elastic = elastic
    ocean.feed_items(json.loads(read_file('data/git.json')))

    eitems = ElasticItems(perceval_backend)
    eitems.elastic = elastic
    response = eitems.get_elastic_items()

    # Newer ES versions report the total as a dict, older ones as an int
    hits_total = response['hits']['total']
    if isinstance(hits_total, dict):
        hits_total = hits_total['value']
    self.assertEqual(hits_total, 9)
def search_dashboards(elastic_url, es_index=None):
    """Return the dashboards stored in the kibana index.

    :param elastic_url: URL of the ElasticSearch hosting the kibana index
    :param es_index: kibana index (defaults to ".kibana")
    :returns: a list of dicts with the "_id" and "title" of each dashboard
    :raises RuntimeError: when the search response contains no hits
    """
    dashboards = []

    if not es_index:
        es_index = ".kibana"

    elastic = ElasticSearch(elastic_url, es_index)
    elastic_ver = find_elasticsearch_version(elastic)

    if elastic_ver < 6:
        # ES < 6: kibana objects live under their own doc type
        dash_json_url = elastic.index_url + "/dashboard/_search?size=10000"
        res = requests_ses.get(dash_json_url, verify=False)
    else:
        # ES >= 6: single doc type; select dashboards via the "type" field
        items_json_url = elastic.index_url + "/_search?size=10000"
        query = '''
        {
            "query" : {
                "term" : { "type" : "dashboard" }
            }
        }'''
        res = requests_ses.post(items_json_url, data=query, verify=False,
                                headers=HEADERS_JSON)
    res.raise_for_status()

    res_json = res.json()

    if "hits" not in res_json:
        logger.error("Can't find dashboards")
        raise RuntimeError("Can't find dashboards")

    for dash in res_json["hits"]["hits"]:
        if elastic_ver < 6:
            dash_json = dash["_source"]
        else:
            dash_json = dash["_source"]["dashboard"]
        dashboards.append({"_id": dash["_id"], "title": dash_json["title"]})

    return dashboards
def test_get_last_item_field(self):
    """Test whether the date/offset of the last item is correctly returned"""
    items = json.loads(read_file('data/git.json'))
    elastic = ElasticSearch(self.es_con, self.target_index, GitOcean.mapping)
    self.assertEqual(elastic.bulk_upload(items, field_id="uuid"), 11)

    # no filter
    last_date = elastic.get_last_item_field('updated_on')
    self.assertEqual(last_date.isoformat(), '2019-10-01T18:05:52+00:00')

    # a None filter list behaves like no filter at all
    last_date = elastic.get_last_item_field('updated_on', filters_=None)
    self.assertEqual(last_date.isoformat(), '2019-10-01T18:05:52+00:00')

    origin_fltr = {
        'name': 'origin',
        'value': '/tmp/perceval_mc84igfc/gittest'
    }
    version_fltr = {
        'name': 'perceval_version',
        'value': '0.9.11'
    }

    # Multiple filters
    last_date = elastic.get_last_item_field(
        'updated_on', filters_=[origin_fltr, version_fltr])
    self.assertEqual(last_date.isoformat(), '2014-02-12T06:09:04+00:00')

    # None entries inside the filter list must be ignored
    last_date = elastic.get_last_item_field(
        'updated_on', filters_=[origin_fltr, version_fltr, None])
    self.assertEqual(last_date.isoformat(), '2014-02-12T06:09:04+00:00')
def get_elastic(url, es_index, clean=None, backend=None, es_aliases=None,
                mapping=None, insecure=True):
    """Create an ElasticSearch client for `es_index`, exiting on failure.

    :param url: ElasticSearch URL
    :param es_index: name of the index
    :param clean: clean the index contents if it already exists
    :param backend: ocean backend; when given, its mapping and analyzers
        are used, overriding the `mapping` argument
    :param es_aliases: aliases to attach to the index
    :param mapping: mapping to use when no backend is given
    :param insecure: skip SSL certificate verification; defaults to True,
        preserving the previously hard-coded behavior
    :returns: an ElasticSearch client
    """
    analyzers = None

    if backend:
        backend.set_elastic_url(url)
        mapping = backend.mapping
        analyzers = backend.get_elastic_analyzers()
    try:
        elastic = ElasticSearch(url=url, index=es_index, mappings=mapping,
                                clean=clean, insecure=insecure,
                                analyzers=analyzers, aliases=es_aliases)
    except ElasticConnectException:
        logger.error("Can't connect to Elastic Search. Is it running?")
        sys.exit(1)

    return elastic
def test_check_instance_es_major_error(self):
    """Test whether an exception is thrown when the ElasticSearch version
    number is not retrieved"""
    # The "version" object deliberately lacks the "number" field
    body = """{
        "name" : "Amber Hunt",
        "cluster_name" : "jgbarah",
        "version" : {
            "build_hash" : "e3126df",
            "build_date" : "2016-04-26T12:08:58.960Z",
            "build_snapshot" : false,
            "lucene_version" : "6.0.0"
        },
        "tagline" : "You Know, for Search"
    }"""
    es_con = "http://es_err.com"
    httpretty.register_uri(httpretty.GET, es_con, body=body, status=200)

    with self.assertRaises(ElasticError):
        _, _ = ElasticSearch.check_instance(es_con, insecure=False)
def test_fetch_no_results(self):
    """Test whether a message is logged when no results are found"""
    # The backend points at a repository different from the fed items,
    # so the fetch must match nothing
    perceval_backend = Git('/tmp/perceval_mc84igfc/gittest-not_found', '/tmp/foo')
    elastic = ElasticSearch(self.es_con, self.target_index, GitOcean.mapping)

    ocean = GitOcean(perceval_backend)
    ocean.elastic = elastic
    ocean.feed_items(json.loads(read_file('data/git.json')))

    eitems = ElasticItems(perceval_backend)
    eitems.elastic = elastic

    with self.assertLogs(logger, level='DEBUG') as cm:
        fetched = list(eitems.fetch())

    self.assertEqual(len(fetched), 0)
    self.assertRegex(cm.output[-2],
                     'DEBUG:grimoire_elk.elastic_items:No results found.*')
    self.assertRegex(cm.output[-1],
                     'DEBUG:grimoire_elk.elastic_items:Releasing scroll_id=*')
def test_add_aliases_duplicated(self):
    """Test whether an alias isn't added when already present in a given index"""
    elastic = ElasticSearch(self.es_con, self.target_index,
                            GitOcean.mapping, aliases=['A', 'B', 'C'])

    expected = {'A': {}, 'B': {}, 'C': {}}
    self.assertDictEqual(elastic.list_aliases(), expected)

    # Adding an alias that already exists must leave the set untouched
    elastic.add_alias('C')
    self.assertDictEqual(elastic.list_aliases(), expected)
def test_check_instance_es_major_5(self):
    """Test whether the major version is correctly calculated for ElasticSearch 5.x"""
    body = """{
        "name" : "Amber Hunt",
        "cluster_name" : "jgbarah",
        "version" : {
            "number" : "5.0.0-alpha2",
            "build_hash" : "e3126df",
            "build_date" : "2016-04-26T12:08:58.960Z",
            "build_snapshot" : false,
            "lucene_version" : "6.0.0"
        },
        "tagline" : "You Know, for Search"
    }"""
    es_con = "http://es5.com"
    httpretty.register_uri(httpretty.GET, es_con, body=body, status=200)

    # Only the major version digit is expected back
    self.assertEqual(ElasticSearch.check_instance(es_con, insecure=False), '5')
def fetch_index_pattern(elastic_url, ip_id, es_index=None):
    """ Fetch an index pattern JSON definition from Kibana and return it.

    :param elastic_url: Elasticsearch URL
    :param ip_id: index pattern identifier
    :param es_index: kibana index
    :return: a dict with index pattern data
    """
    logger.debug("Fetching index pattern %s", ip_id)

    if not es_index:
        es_index = ".kibana"

    elastic = ElasticSearch(elastic_url, es_index)
    ip_json = get_index_pattern_json(elastic, ip_id)

    return {"id": ip_id, "value": ip_json}
def test_add_aliases(self):
    """Test whether an alias is added to a given index"""
    elastic = ElasticSearch(self.es_con, self.target_index,
                            GitOcean.mapping, aliases=['A', 'B', 'C'])

    self.assertDictEqual(elastic.list_aliases(),
                         {'A': {}, 'B': {}, 'C': {}})

    # A new alias is appended to the existing set
    elastic.add_alias('D')
    self.assertDictEqual(elastic.list_aliases(),
                         {'A': {}, 'B': {}, 'C': {}, 'D': {}})
def test_delete_items_wrong_retention(self):
    """Test whether no items are deleted if retention isn't defined or negative"""
    items = json.loads(read_file('data/git.json'))
    # Store the timestamps as ISO 8601 strings
    for item in items:
        item['timestamp'] = unixtime_to_datetime(item['timestamp']).isoformat()

    elastic = ElasticSearch(self.es_con, self.target_index, GitOcean.mapping)
    self.assertEqual(elastic.bulk_upload(items, field_id="uuid"), 11)

    count_url = self.es_con + '/' + self.target_index + '/_count'

    # No retention defined: nothing is removed
    elastic.delete_items(retention_time=None, time_field='timestamp')
    self.assertEqual(elastic.requests.get(count_url).json()['count'], 11)

    # Negative retention: nothing is removed either
    elastic.delete_items(retention_time=-1, time_field='timestamp')
    self.assertEqual(elastic.requests.get(count_url).json()['count'], 11)
def test_fetch_from_offset(self):
    """Test whether the fetch method with offset properly works"""
    perceval_backend = Kitsune('http://example.com')
    elastic = ElasticSearch(self.es_con, self.target_index, KitsuneOcean.mapping)

    # Feed the raw index with the kitsune test items
    ocean = KitsuneOcean(perceval_backend)
    ocean.elastic = elastic
    ocean.feed_items(json.loads(read_file('data/kitsune.json')))

    # Fetch total items
    eitems = ElasticItems(perceval_backend)
    eitems.elastic = elastic
    self.assertEqual(len(list(eitems.fetch())), 4)

    # Fetch with offset
    eitems = ElasticItems(perceval_backend, offset=2)
    eitems.elastic = elastic
    self.assertEqual(len(list(eitems.fetch())), 2)
def test_delete_items(self):
    """Test whether items are correctly deleted"""
    items = json.loads(read_file('data/git.json'))
    # Store the timestamps as ISO 8601 strings
    for item in items:
        item['timestamp'] = unixtime_to_datetime(item['timestamp']).isoformat()

    elastic = ElasticSearch(self.es_con, self.target_index, GitOcean.mapping)
    self.assertEqual(elastic.bulk_upload(items, field_id="uuid"), 9)

    count_url = self.es_con + '/' + self.target_index + '/_count'

    # A huge retention keeps every item
    elastic.delete_items(retention_time=90000000, time_field='timestamp')
    self.assertEqual(elastic.requests.get(count_url).json()['count'], 9)

    # A tiny retention removes them all
    elastic.delete_items(retention_time=1, time_field='timestamp')
    self.assertEqual(elastic.requests.get(count_url).json()['count'], 0)
def test_check_instance_es_major_6(self):
    """Test whether the major version is correctly calculated for ElasticSearch 6.x"""
    body = """{
        "name" : "44BPNNH",
        "cluster_name" : "elasticsearch",
        "cluster_uuid" : "fIa1j8AQRfSrmuhTwb9a0Q",
        "version" : {
            "number" : "6.1.0",
            "build_hash" : "c0c1ba0",
            "build_date" : "2017-12-12T12:32:54.550Z",
            "build_snapshot" : false,
            "lucene_version" : "7.1.0",
            "minimum_wire_compatibility_version" : "5.6.0",
            "minimum_index_compatibility_version" : "5.0.0"
        },
        "tagline" : "You Know, for Search"
    }"""
    es_con = "http://es6.com"
    httpretty.register_uri(httpretty.GET, es_con, body=body, status=200)

    # Only the major version digit is expected back
    self.assertEqual(ElasticSearch.check_instance(es_con, insecure=False), '6')
def test_fetch_from_date(self):
    """Test whether the fetch method with from_date properly works"""
    perceval_backend = Git('/tmp/perceval_mc84igfc/gittest', '/tmp/foo')
    elastic = ElasticSearch(self.es_con, self.target_index, GitOcean.mapping)

    # Feed the raw index with the git test items
    ocean = GitOcean(perceval_backend)
    ocean.elastic = elastic
    ocean.feed_items(json.loads(read_file('data/git.json')))

    # Fetch total items
    eitems = ElasticItems(perceval_backend)
    eitems.elastic = elastic
    self.assertEqual(len(list(eitems.fetch())), 9)

    # Fetch only the items updated after the given date
    from_date = str_to_datetime("2018-02-09T08:33:22.699+00:00")
    eitems = ElasticItems(perceval_backend, from_date=from_date)
    eitems.elastic = elastic
    self.assertEqual(len(list(eitems.fetch())), 2)
def enrich_cocom_analysis(self, ocean_backend, enrich_backend,
                          no_incremental=False,
                          out_index="cocom_enrich_graal_repo",
                          interval_months=[3],
                          date_field="grimoire_creation_date"):
    """Aggregate the CoCom per-file metrics of every repository into a
    study index with one item per repository/month-interval.

    :param ocean_backend: ocean backend (unused here, part of the study API)
    :param enrich_backend: enrich backend whose index holds the CoCom items
    :param no_incremental: part of the study API (not used in the body)
    :param out_index: index where the study items are written
    :param interval_months: list of month intervals to compute evolutions for
    :param date_field: part of the study API (not used in the body)
    """
    logger.info("[cocom] study enrich-cocom-analysis start")

    es_in = ES([enrich_backend.elastic_url], retry_on_timeout=True,
               timeout=100, verify_certs=self.elastic.requests.verify,
               connection_class=RequestsHttpConnection)
    in_index = enrich_backend.elastic.index
    # intervals may arrive as strings from the config file
    interval_months = list(map(int, interval_months))

    unique_repos = es_in.search(index=in_index, body=get_unique_repository())

    repositories = [repo['key'] for repo in
                    unique_repos['aggregations']['unique_repos'].get('buckets', [])]
    current_month = datetime_utcnow().replace(day=1, hour=0, minute=0, second=0)

    logger.info("[cocom] study enrich-cocom-analysis {} repositories to process".format(
                len(repositories)))
    es_out = ElasticSearch(enrich_backend.elastic.url, out_index, mappings=Mapping)
    es_out.add_alias("cocom_study")

    num_items = 0
    ins_items = 0

    for repository_url in repositories:
        logger.info("[cocom] study enrich-cocom-analysis start analysis for {}".format(
                    repository_url))
        evolution_items = []

        for interval in interval_months:
            # Resume from the last month already present in out_index
            to_month = get_to_date(es_in, in_index, out_index,
                                   repository_url, interval)
            to_month = to_month.replace(month=int(interval), day=1,
                                        hour=0, minute=0, second=0)

            # Walk month buckets up to the current month
            while to_month < current_month:
                files_at_time = es_in.search(
                    index=in_index,
                    body=get_files_at_time(repository_url, to_month.isoformat())
                )['aggregations']['file_stats'].get("buckets", [])

                if not len(files_at_time):
                    # No data for this month; move to the next interval step
                    to_month = to_month + relativedelta(months=+interval)
                    continue

                repository_name = repository_url.split("/")[-1]
                evolution_item = {
                    "id": "{}_{}_{}".format(to_month.isoformat(),
                                            repository_name, interval),
                    "origin": repository_url,
                    "interval_months": interval,
                    "study_creation_date": to_month.isoformat(),
                    "total_files": len(files_at_time)
                }

                # Sum each metric over the latest version of every file
                for file_ in files_at_time:
                    file_details = file_["1"]["hits"]["hits"][0]["_source"]

                    for metric in self.metrics:
                        total_metric = "total_" + metric
                        evolution_item[total_metric] = evolution_item.get(total_metric, 0)
                        evolution_item[total_metric] += file_details[metric] \
                            if file_details[metric] is not None else 0

                # TODO: Fix Logic: None rather than 1
                evolution_item["total_comments_per_loc"] = round(
                    evolution_item["total_comments"] / max(evolution_item["total_loc"], 1), 2)
                evolution_item["total_blanks_per_loc"] = round(
                    evolution_item["total_blanks"] / max(evolution_item["total_loc"], 1), 2)
                evolution_item["total_loc_per_function"] = round(
                    evolution_item["total_loc"] / max(evolution_item["total_num_funs"], 1), 2)

                evolution_item.update(self.get_grimoire_fields(
                    evolution_item["study_creation_date"], "stats"))
                evolution_items.append(evolution_item)

                # Flush in bulk-sized batches
                if len(evolution_items) >= self.elastic.max_items_bulk:
                    num_items += len(evolution_items)
                    ins_items += es_out.bulk_upload(evolution_items,
                                                    self.get_field_unique_id())
                    evolution_items = []

                to_month = to_month + relativedelta(months=+interval)

            # Flush whatever is left for this interval
            if len(evolution_items) > 0:
                num_items += len(evolution_items)
                ins_items += es_out.bulk_upload(evolution_items,
                                                self.get_field_unique_id())

            if num_items != ins_items:
                missing = num_items - ins_items
                logger.error("[cocom] study enrich-cocom-analysis {}/{} missing items for Graal CoCom Analysis "
                             "Study".format(missing, num_items))
            else:
                logger.info("[cocom] study enrich-cocom-analysis {} items inserted for Graal CoCom Analysis "
                            "Study".format(num_items))

        logger.info("[cocom] study enrich-cocom-analysis End analysis for {} with month interval"
                    .format(repository_url))

    logger.info("[cocom] study enrich-cocom-analysis End")
def __create_arthur_json(self, repo, backend_args):
    """ Create the JSON for configuring arthur to collect data

    https://github.com/grimoirelab/arthur#adding-tasks
    Sample for git:

    {
    "tasks": [
        {
            "task_id": "arthur.git",
            "backend": "git",
            "backend_args": {
                "gitpath": "/tmp/arthur_git/",
                "uri": "https://github.com/grimoirelab/arthur.git"
            },
            "category": "commit",
            "archive_args": {
                "archive_path": '/tmp/test_archives',
                "fetch_from_archive": false,
                "archive_after": None
            },
            "scheduler_args": {
                "delay": 10
            }
        }
    ]
    }

    :param repo: repository to collect data from
    :param backend_args: ignored; rebuilt from the backend section and repo
    :returns: a dict with the arthur task definition
    """

    # NOTE: the incoming backend_args parameter is overwritten here
    backend_args = self._compose_arthur_params(self.backend_section, repo)
    if self.backend_section == 'git':
        backend_args['gitpath'] = os.path.join(self.REPOSITORY_DIR, repo)
    backend_args['tag'] = self.backend_tag(repo)

    ajson = {"tasks": [{}]}
    # This is the perceval tag
    ajson["tasks"][0]['task_id'] = self.backend_tag(repo)
    ajson["tasks"][0]['backend'] = self.backend_section.split(":")[0]
    ajson["tasks"][0]['backend_args'] = backend_args
    ajson["tasks"][0]['category'] = backend_args['category']
    ajson["tasks"][0]['archive'] = {}
    ajson["tasks"][0]['scheduler'] = {"delay": self.ARTHUR_TASK_DELAY}

    # from-date or offset param must be added
    es_col_url = self._get_collection_url()
    es_index = self.conf[self.backend_section]['raw_index']
    # Get the last activity for the data source
    es = ElasticSearch(es_col_url, es_index)
    connector = get_connector_from_name(self.backend_section)

    klass = connector[0]  # Backend for the connector
    signature = inspect.signature(klass.fetch)

    last_activity = None
    filter_ = {"name": "tag", "value": backend_args['tag']}
    # Resume incrementally: pick from_date or offset depending on what
    # the perceval backend's fetch() signature supports
    if 'from_date' in signature.parameters:
        last_activity = es.get_last_item_field('metadata__updated_on', [filter_])
        if last_activity:
            ajson["tasks"][0]['backend_args']['from_date'] = last_activity.isoformat()
    elif 'offset' in signature.parameters:
        last_activity = es.get_last_item_field('offset', [filter_])
        if last_activity:
            ajson["tasks"][0]['backend_args']['offset'] = last_activity

    if last_activity:
        logging.info("Getting raw item with arthur since %s", last_activity)

    return (ajson)
def enrich_backlog_analysis(self, ocean_backend, enrich_backend,
                            no_incremental=False,
                            out_index="github_enrich_backlog",
                            date_field="grimoire_creation_date",
                            interval_days=1, reduced_labels=["bug"],
                            map_label=["others", "bugs"]):
    """
    The purpose of this study is to add additional index to compute the
    chronological evolution of opened issues and average opened time issues.

    For each repository and label, we start the study on repository
    creation date until today with a day interval (default). For each date
    we retrieve the number of open issues at this date by difference between
    number of opened issues and number of closed issues. In addition, we
    compute the average opened time for all issues open at this date.

    To differentiate by label, we compute evolution for bugs and all others
    labels (like "enhancement","good first issue" ... ), we call this
    "reduced labels". We need to use theses reduced labels because the
    complexity to compute evolution for each combination of labels would be
    too big. In addition, we can rename "bug" label to "bugs" with map_label.

    Entry example in setup.cfg :

    [github]
    raw_index = github_issues_raw
    enriched_index = github_issues_enriched
    ...
    studies = [enrich_backlog_analysis]

    [enrich_backlog_analysis]
    out_index = github_enrich_backlog
    interval_days = 7
    reduced_labels = [bug,enhancement]
    map_label = [others, bugs, enhancements]
    """
    logger.info("[github] Start enrich_backlog_analysis study")

    # combine two lists to create the dict to map labels
    map_label = dict(zip([""] + reduced_labels, map_label))

    # connect to ES
    es_in = ES([enrich_backend.elastic_url], retry_on_timeout=True,
               timeout=100, verify_certs=self.elastic.requests.verify,
               connection_class=RequestsHttpConnection)
    in_index = enrich_backend.elastic.index

    # get all repositories
    unique_repos = es_in.search(
        index=in_index, body=get_unique_repository_with_project_name())
    repositories = [repo['key'] for repo in
                    unique_repos['aggregations']['unique_repos'].get('buckets', [])]

    logger.debug("[enrich-backlog-analysis] {} repositories to process".format(
                 len(repositories)))

    # create the index
    es_out = ElasticSearch(enrich_backend.elastic.url, out_index, mappings=Mapping)
    es_out.add_alias("backlog_study")

    # analysis for each repositories
    num_items = 0
    ins_items = 0
    for repository in repositories:
        repository_url = repository["origin"]
        project = repository["project"]
        org_name = repository["organization"]
        repository_name = repository_url.split("/")[-1]
        logger.debug("[enrich-backlog-analysis] Start analysis for {}".format(
                     repository_url))

        # get each day since repository creation
        dates = es_in.search(
            index=in_index,
            body=get_issues_dates(interval_days, repository_url)
        )['aggregations']['created_per_interval'].get("buckets", [])

        # for each selected label + others labels
        for label, other in [("", True)] + [(l, False) for l in reduced_labels]:
            # compute metrics for each day (ES request for each day)
            evolution_items = []
            for date in map(lambda i: i['key_as_string'], dates):
                evolution_item = self.__create_backlog_item(
                    repository_url, repository_name, project, date, org_name,
                    interval_days, label, map_label,
                    self.__get_opened_issues(es_in, in_index, repository_url,
                                             date, interval_days, other,
                                             label, reduced_labels))
                evolution_items.append(evolution_item)

            # complete until today (no ES request needed, just extrapol)
            # NOTE(review): if `dates` is empty, `evolution_item` below is
            # undefined — presumably every repository has at least one
            # creation bucket; confirm with the aggregation query.
            today = datetime.now().replace(hour=0, minute=0, second=0,
                                           tzinfo=None)
            last_item = evolution_item
            last_date = str_to_datetime(
                evolution_item['study_creation_date']).replace(tzinfo=None) \
                + relativedelta(days=interval_days)
            average_opened_time = evolution_item['average_opened_time'] \
                + float(interval_days)
            while last_date < today:
                date = last_date.strftime('%Y-%m-%dT%H:%M:%S.000Z')
                evolution_item = {}
                evolution_item.update(last_item)
                evolution_item.update({
                    "average_opened_time": average_opened_time,
                    "study_creation_date": date,
                    "uuid": "{}_{}_{}".format(date, repository_name, label),
                })
                evolution_item.update(self.get_grimoire_fields(date, "stats"))
                evolution_items.append(evolution_item)
                last_date = last_date + relativedelta(days=interval_days)
                average_opened_time = average_opened_time + float(interval_days)

            # upload items to ES
            if len(evolution_items) > 0:
                num_items += len(evolution_items)
                ins_items += es_out.bulk_upload(evolution_items,
                                                self.get_field_unique_id())

    if num_items != ins_items:
        missing = num_items - ins_items
        # The message must be a single format string: passing a tuple as the
        # first argument breaks logging's %-formatting of the arguments.
        logger.error("[enrich-backlog-analysis] %s/%s missing items "
                     "for Graal Backlog Analysis Study",
                     str(missing), str(num_items))
    else:
        logger.debug("[enrich-backlog-analysis] %s items inserted "
                     "for Graal Backlog Analysis Study",
                     str(num_items))

    logger.info("[github] End enrich_backlog_analysis study")
connector = get_connector_from_name(backend_name, connectors) backend = connector[0](**vars(args)) ocean_backend = connector[1](backend, **vars(args)) enrich_backend = connector[2](backend, **vars(args)) es_index = backend.get_name() + "_" + backend.get_id() clean = args.no_incremental if args.cache: clean = True try: # Ocean elastic_state = ElasticSearch(args.elastic_url, es_index, ocean_backend.get_elastic_mappings(), clean) # Enriched ocean enrich_index = es_index + "_enrich" elastic = ElasticSearch(args.elastic_url, enrich_index, enrich_backend.get_elastic_mappings(), clean) except ElasticConnectException: logging.error("Can't connect to Elastic Search. Is it running?") sys.exit(1) ocean_backend.set_elastic(elastic_state) enrich_backend.set_elastic(elastic) try: