def export_items(elastic_url, in_index, out_index, elastic_url_out=None,
                 search_after=False, search_after_value=None, limit=None, copy=False):
    """Export items from in_index to out_index using the correct mapping"""

    if not limit:
        limit = DEFAULT_LIMIT

    if search_after_value:
        search_after_value_timestamp = int(search_after_value[0])
        search_after_value_uuid = search_after_value[1]
        search_after_value = [search_after_value_timestamp, search_after_value_uuid]

    logging.info("Exporting items from %s/%s to %s", elastic_url, in_index, out_index)

    count_res = requests.get('%s/%s/_count' % (elastic_url, in_index))
    try:
        count_res.raise_for_status()
    except requests.exceptions.HTTPError:
        if count_res.status_code == 404:
            logging.error("The index does not exist: %s", in_index)
        else:
            logging.error(count_res.text)
        sys.exit(1)

    logging.info("Total items to copy: %i", count_res.json()['count'])

    # Time to upload the items with the correct mapping
    elastic_in = ElasticSearch(elastic_url, in_index)
    if not copy:
        # Create the correct mapping for the data sources detected from in_index
        ds_mapping = find_mapping(elastic_url, in_index)
    else:
        logging.debug('Using the input index mapping')
        ds_mapping = extract_mapping(elastic_url, in_index)

    if not elastic_url_out:
        elastic_out = ElasticSearch(elastic_url, out_index, mappings=ds_mapping)
    else:
        elastic_out = ElasticSearch(elastic_url_out, out_index, mappings=ds_mapping)

    # Time to just copy from in_index to out_index
    uid_field = find_uuid(elastic_url, in_index)
    backend = find_perceval_backend(elastic_url, in_index)
    if search_after:
        total = elastic_out.bulk_upload(fetch(elastic_in, backend, limit,
                                              search_after_value, scroll=False), uid_field)
    else:
        total = elastic_out.bulk_upload(fetch(elastic_in, backend, limit), uid_field)

    logging.info("Total items copied: %i", total)
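# Usage sketch (an assumption for illustration, not part of the original module): copy a
# raw index into a new index on a second cluster, paginating with search_after. The URLs
# and index names below are hypothetical placeholders.
if __name__ == '__main__':
    export_items('http://localhost:9200', 'git_demo_raw', 'git_demo_raw_copy',
                 elastic_url_out='http://localhost:9201',
                 search_after=True, limit=500)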
def test_get_last_date(self):
    """Test whether the last date is correctly returned"""

    items = json.loads(read_file('data/git.json'))
    elastic = ElasticSearch(self.es_con, self.target_index, GitOcean.mapping)
    new_items = elastic.bulk_upload(items, field_id="uuid")
    self.assertEqual(new_items, 11)

    # no filter
    last_date = elastic.get_last_date('updated_on')
    self.assertEqual(last_date.isoformat(), '2019-10-01T18:05:52+00:00')

    # filter including all items
    fltr = {
        'name': 'origin',
        'value': '/tmp/perceval_mc84igfc/gittest'
    }
    last_date = elastic.get_last_date('updated_on', filters_=[fltr])
    self.assertEqual(last_date.isoformat(), '2014-02-12T06:11:12+00:00')

    # filter including a subset of items
    fltr = {
        'name': 'perceval_version',
        'value': '0.9.11'
    }
    last_date = elastic.get_last_date('updated_on', filters_=[fltr])
    self.assertEqual(last_date.isoformat(), '2014-02-12T06:09:04+00:00')
def test_get_elastic_items_error(self):
    """Test whether a message is logged if an error occurs when getting items from an index"""

    items = json.loads(read_file('data/git.json'))
    perceval_backend = Git('/tmp/perceval_mc84igfc/gittest', '/tmp/foo')
    elastic = ElasticSearch(self.es_con, self.target_index, GitOcean.mapping)
    elastic.bulk_upload(items, field_id="uuid")  # Load items

    eitems = ElasticItems(perceval_backend)
    eitems.elastic = elastic

    with self.assertLogs(logger, level='DEBUG') as cm:
        r_json = eitems.get_elastic_items()
        self.assertIsNone(r_json)
        self.assertRegex(cm.output[-1],
                         'DEBUG:grimoire_elk.elastic_items:No results found from*')
def test_bulk_upload_no_items(self):
    """Test whether nothing is uploaded when the list of items is empty"""

    elastic = ElasticSearch(self.es_con, self.target_index, GitOcean.mapping)

    new_items = elastic.bulk_upload([], field_id="uuid")
    self.assertEqual(new_items, 0)
def test_get_last_offset(self):
    """Test whether the last offset is correctly returned"""

    items = json.loads(read_file('data/kitsune.json'))
    elastic = ElasticSearch(self.es_con, self.target_index, KitsuneOcean.mapping)
    new_items = elastic.bulk_upload(items, field_id="uuid")
    self.assertEqual(new_items, 4)

    # no filter
    last_offset = elastic.get_last_offset('offset')
    self.assertEqual(last_offset, 3)

    # filter including all items
    fltr = {
        'name': 'origin',
        'value': 'http://example.com'
    }
    last_offset = elastic.get_last_offset('offset', filters_=[fltr])
    self.assertEqual(last_offset, 3)

    # filter including a subset of items
    fltr = {
        'name': 'perceval_version',
        'value': '0.9.11'
    }
    last_offset = elastic.get_last_offset('offset', filters_=[fltr])
    self.assertEqual(last_offset, 1)
def test_bulk_upload(self):
    """Test whether items are correctly uploaded to an index"""

    items = json.loads(read_file('data/git.json'))
    elastic = ElasticSearch(self.es_con, self.target_index, GitOcean.mapping)

    new_items = elastic.bulk_upload(items, field_id="uuid")
    self.assertEqual(new_items, 11)
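# Usage sketch (an assumption, not part of the test suite): bulk_upload takes an iterable
# of dicts plus the name of the field that uniquely identifies each document, and returns
# the number of items written. The connection URL, index name and item are hypothetical.
#
#   es = ElasticSearch('http://localhost:9200', 'demo_index', GitOcean.mapping)
#   uploaded = es.bulk_upload([{'uuid': 'a1b2c3', 'updated_on': 1392185472}], field_id='uuid')
#   assert uploaded == 1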
def test_get_last_item_field_handle_invalid_date_error(self):
    """Test whether long timestamps are properly handled"""

    items = json.loads(read_file('data/git.json'))
    items[-1]['updated_on'] = items[-1]['updated_on'] * 1000

    elastic = ElasticSearch(self.es_con, self.target_index, GitOcean.mapping)
    new_items = elastic.bulk_upload(items, field_id="uuid")
    self.assertEqual(new_items, 11)

    last_date = elastic.get_last_item_field('updated_on')
    self.assertEqual(last_date.isoformat(), '2019-10-01T18:05:53.024000+00:00')
def test_delete_items_error(self):
    """Test whether an error message is logged if the items aren't deleted"""

    items = json.loads(read_file('data/git.json'))
    elastic = ElasticSearch(self.es_con, self.target_index, GitOcean.mapping)
    new_items = elastic.bulk_upload(items, field_id="uuid")
    self.assertEqual(new_items, 11)

    with self.assertLogs(logger, level='ERROR') as cm:
        elastic.delete_items(retention_time=1, time_field='timestamp')
        self.assertRegex(cm.output[0],
                         'ERROR:grimoire_elk.elastic:\\[items retention\\] Error deleted items*')
def test_get_last_item_field(self):
    """Test whether the date/offset of the last item is correctly returned"""

    items = json.loads(read_file('data/git.json'))
    elastic = ElasticSearch(self.es_con, self.target_index, GitOcean.mapping)
    new_items = elastic.bulk_upload(items, field_id="uuid")
    self.assertEqual(new_items, 11)

    # no filter
    last_date = elastic.get_last_item_field('updated_on')
    self.assertEqual(last_date.isoformat(), '2019-10-01T18:05:52+00:00')

    # None filter
    last_date = elastic.get_last_item_field('updated_on', filters_=None)
    self.assertEqual(last_date.isoformat(), '2019-10-01T18:05:52+00:00')

    # Multiple filters
    fltrs = [
        {
            'name': 'origin',
            'value': '/tmp/perceval_mc84igfc/gittest'
        },
        {
            'name': 'perceval_version',
            'value': '0.9.11'
        }
    ]
    last_date = elastic.get_last_item_field('updated_on', filters_=fltrs)
    self.assertEqual(last_date.isoformat(), '2014-02-12T06:09:04+00:00')

    # Handle None filter
    fltrs = [
        {
            'name': 'origin',
            'value': '/tmp/perceval_mc84igfc/gittest'
        },
        {
            'name': 'perceval_version',
            'value': '0.9.11'
        },
        None
    ]
    last_date = elastic.get_last_item_field('updated_on', filters_=fltrs)
    self.assertEqual(last_date.isoformat(), '2014-02-12T06:09:04+00:00')
def test_delete_items_wrong_retention(self):
    """Test whether no items are deleted if retention isn't defined or negative"""

    items = json.loads(read_file('data/git.json'))
    for item in items:
        timestamp = unixtime_to_datetime(item['timestamp'])
        item['timestamp'] = timestamp.isoformat()

    elastic = ElasticSearch(self.es_con, self.target_index, GitOcean.mapping)
    new_items = elastic.bulk_upload(items, field_id="uuid")
    self.assertEqual(new_items, 11)

    url = self.es_con + '/' + self.target_index + '/_count'

    elastic.delete_items(retention_time=None, time_field='timestamp')
    left_items = elastic.requests.get(url).json()['count']
    self.assertEqual(left_items, 11)

    elastic.delete_items(retention_time=-1, time_field='timestamp')
    left_items = elastic.requests.get(url).json()['count']
    self.assertEqual(left_items, 11)
def test_delete_items(self):
    """Test whether items are correctly deleted"""

    items = json.loads(read_file('data/git.json'))
    for item in items:
        timestamp = unixtime_to_datetime(item['timestamp'])
        item['timestamp'] = timestamp.isoformat()

    elastic = ElasticSearch(self.es_con, self.target_index, GitOcean.mapping)
    new_items = elastic.bulk_upload(items, field_id="uuid")
    self.assertEqual(new_items, 9)

    url = self.es_con + '/' + self.target_index + '/_count'

    elastic.delete_items(retention_time=90000000, time_field='timestamp')
    left_items = elastic.requests.get(url).json()['count']
    self.assertEqual(left_items, 9)

    elastic.delete_items(retention_time=1, time_field='timestamp')
    left_items = elastic.requests.get(url).json()['count']
    self.assertEqual(left_items, 0)
def enrich_cocom_analysis(self, ocean_backend, enrich_backend, no_incremental=False,
                          out_index="cocom_enrich_graal_repo", interval_months=[3],
                          date_field="grimoire_creation_date"):
    logger.info("[cocom] study enrich-cocom-analysis start")

    es_in = ES([enrich_backend.elastic_url], retry_on_timeout=True, timeout=100,
               verify_certs=self.elastic.requests.verify,
               connection_class=RequestsHttpConnection)
    in_index = enrich_backend.elastic.index
    interval_months = list(map(int, interval_months))

    unique_repos = es_in.search(index=in_index, body=get_unique_repository())
    repositories = [repo['key'] for repo in
                    unique_repos['aggregations']['unique_repos'].get('buckets', [])]
    current_month = datetime_utcnow().replace(day=1, hour=0, minute=0, second=0)

    logger.info("[cocom] study enrich-cocom-analysis {} repositories to process".format(
        len(repositories)))
    es_out = ElasticSearch(enrich_backend.elastic.url, out_index, mappings=Mapping)
    es_out.add_alias("cocom_study")

    num_items = 0
    ins_items = 0

    for repository_url in repositories:
        logger.info("[cocom] study enrich-cocom-analysis start analysis for {}".format(
            repository_url))
        evolution_items = []

        for interval in interval_months:
            to_month = get_to_date(es_in, in_index, out_index, repository_url, interval)
            to_month = to_month.replace(month=int(interval), day=1, hour=0, minute=0, second=0)

            while to_month < current_month:
                files_at_time = es_in.search(
                    index=in_index,
                    body=get_files_at_time(repository_url, to_month.isoformat())
                )['aggregations']['file_stats'].get("buckets", [])

                if not len(files_at_time):
                    to_month = to_month + relativedelta(months=+interval)
                    continue

                repository_name = repository_url.split("/")[-1]
                evolution_item = {
                    "id": "{}_{}_{}".format(to_month.isoformat(), repository_name, interval),
                    "origin": repository_url,
                    "interval_months": interval,
                    "study_creation_date": to_month.isoformat(),
                    "total_files": len(files_at_time)
                }

                for file_ in files_at_time:
                    file_details = file_["1"]["hits"]["hits"][0]["_source"]

                    for metric in self.metrics:
                        total_metric = "total_" + metric
                        evolution_item[total_metric] = evolution_item.get(total_metric, 0)
                        evolution_item[total_metric] += file_details[metric] \
                            if file_details[metric] is not None else 0

                # TODO: Fix Logic: None rather than 1
                evolution_item["total_comments_per_loc"] = round(
                    evolution_item["total_comments"] / max(evolution_item["total_loc"], 1), 2)
                evolution_item["total_blanks_per_loc"] = round(
                    evolution_item["total_blanks"] / max(evolution_item["total_loc"], 1), 2)
                evolution_item["total_loc_per_function"] = round(
                    evolution_item["total_loc"] / max(evolution_item["total_num_funs"], 1), 2)

                evolution_item.update(self.get_grimoire_fields(
                    evolution_item["study_creation_date"], "stats"))
                evolution_items.append(evolution_item)

                if len(evolution_items) >= self.elastic.max_items_bulk:
                    num_items += len(evolution_items)
                    ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())
                    evolution_items = []

                to_month = to_month + relativedelta(months=+interval)

            if len(evolution_items) > 0:
                num_items += len(evolution_items)
                ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())

            if num_items != ins_items:
                missing = num_items - ins_items
                logger.error("[cocom] study enrich-cocom-analysis {}/{} missing items "
                             "for Graal CoCom Analysis Study".format(missing, num_items))
            else:
                logger.info("[cocom] study enrich-cocom-analysis {} items inserted "
                            "for Graal CoCom Analysis Study".format(num_items))

        logger.info("[cocom] study enrich-cocom-analysis End analysis for {} "
                    "with month interval".format(repository_url))

    logger.info("[cocom] study enrich-cocom-analysis End")
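# Standalone sketch (an illustration, not part of the enricher): the study walks each
# repository month by month with dateutil's relativedelta, starting from the bucket
# returned by get_to_date and stopping at the current month. The dates are hypothetical.
from datetime import datetime

from dateutil.relativedelta import relativedelta


def iter_study_months(start, current_month, interval=3):
    """Yield the start of every interval-month bucket strictly before current_month."""
    to_month = start.replace(day=1, hour=0, minute=0, second=0)
    while to_month < current_month:
        yield to_month
        to_month = to_month + relativedelta(months=+interval)


# With a 3-month interval, the buckets between January and July are January and April.
assert list(iter_study_months(datetime(2020, 1, 15), datetime(2020, 7, 1))) == \
    [datetime(2020, 1, 1), datetime(2020, 4, 1)]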
def enrich_backlog_analysis(self, ocean_backend, enrich_backend, no_incremental=False,
                            out_index="github_enrich_backlog",
                            date_field="grimoire_creation_date", interval_days=1,
                            reduced_labels=["bug"], map_label=["others", "bugs"]):
    """
    The purpose of this study is to add an additional index to compute the chronological
    evolution of open issues and the average time issues stay open.

    For each repository and label, the study runs from the repository creation date until
    today with a one-day interval (default). For each date, the number of open issues is
    obtained as the difference between the number of issues opened and the number of
    issues closed up to that date. In addition, the average open time is computed for all
    issues open at that date.

    To differentiate by label, the evolution is computed for bugs and for all other labels
    (such as "enhancement", "good first issue", ...); these are called "reduced labels".
    Reduced labels are needed because computing the evolution for every combination of
    labels would be too expensive. In addition, the "bug" label can be renamed to "bugs"
    with map_label.

    Entry example in setup.cfg:

    [github]
    raw_index = github_issues_raw
    enriched_index = github_issues_enriched
    ...
    studies = [enrich_backlog_analysis]

    [enrich_backlog_analysis]
    out_index = github_enrich_backlog
    interval_days = 7
    reduced_labels = [bug,enhancement]
    map_label = [others, bugs, enhancements]
    """
    logger.info("[github] Start enrich_backlog_analysis study")

    # combine the two lists to create the dict used to map labels
    map_label = dict(zip([""] + reduced_labels, map_label))

    # connect to ES
    es_in = ES([enrich_backend.elastic_url], retry_on_timeout=True, timeout=100,
               verify_certs=self.elastic.requests.verify,
               connection_class=RequestsHttpConnection)
    in_index = enrich_backend.elastic.index

    # get all repositories
    unique_repos = es_in.search(index=in_index,
                                body=get_unique_repository_with_project_name())
    repositories = [repo['key'] for repo in
                    unique_repos['aggregations']['unique_repos'].get('buckets', [])]
    logger.debug("[enrich-backlog-analysis] {} repositories to process".format(
        len(repositories)))

    # create the output index
    es_out = ElasticSearch(enrich_backend.elastic.url, out_index, mappings=Mapping)
    es_out.add_alias("backlog_study")

    # run the analysis for each repository
    num_items = 0
    ins_items = 0
    for repository in repositories:
        repository_url = repository["origin"]
        project = repository["project"]
        org_name = repository["organization"]
        repository_name = repository_url.split("/")[-1]
        logger.debug("[enrich-backlog-analysis] Start analysis for {}".format(
            repository_url))

        # get each day since the repository creation
        dates = es_in.search(
            index=in_index,
            body=get_issues_dates(interval_days, repository_url)
        )['aggregations']['created_per_interval'].get("buckets", [])

        # for each reduced label, plus all other labels
        for label, other in [("", True)] + [(l, False) for l in reduced_labels]:
            # compute metrics for each day (ES request for each day)
            evolution_items = []
            for date in map(lambda i: i['key_as_string'], dates):
                evolution_item = self.__create_backlog_item(
                    repository_url, repository_name, project, date, org_name,
                    interval_days, label, map_label,
                    self.__get_opened_issues(es_in, in_index, repository_url, date,
                                             interval_days, other, label,
                                             reduced_labels))
                evolution_items.append(evolution_item)

            # complete until today (no ES request needed, just extrapolate)
            today = datetime.now().replace(hour=0, minute=0, second=0, tzinfo=None)
            last_item = evolution_item
            last_date = str_to_datetime(
                evolution_item['study_creation_date']).replace(tzinfo=None) \
                + relativedelta(days=interval_days)
            average_opened_time = evolution_item['average_opened_time'] \
                + float(interval_days)
            while last_date < today:
                date = last_date.strftime('%Y-%m-%dT%H:%M:%S.000Z')
                evolution_item = {}
                evolution_item.update(last_item)
                evolution_item.update({
                    "average_opened_time": average_opened_time,
                    "study_creation_date": date,
                    "uuid": "{}_{}_{}".format(date, repository_name, label),
                })
                evolution_item.update(self.get_grimoire_fields(date, "stats"))
                evolution_items.append(evolution_item)
                last_date = last_date + relativedelta(days=interval_days)
                average_opened_time = average_opened_time + float(interval_days)

            # upload items to ES
            if len(evolution_items) > 0:
                num_items += len(evolution_items)
                ins_items += es_out.bulk_upload(evolution_items,
                                                self.get_field_unique_id())

    if num_items != ins_items:
        missing = num_items - ins_items
        logger.error("[enrich-backlog-analysis] %s/%s missing items "
                     "for Graal Backlog Analysis Study", str(missing), str(num_items))
    else:
        logger.debug("[enrich-backlog-analysis] %s items inserted "
                     "for Graal Backlog Analysis Study", str(num_items))

    logger.info("[github] End enrich_backlog_analysis study")
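# Illustrative sketch (not part of the study code): how reduced_labels and map_label are
# combined into the renaming dict used above. With the defaults, the empty label (issues
# with any other label) maps to "others" and "bug" maps to "bugs".
reduced_labels = ["bug"]
map_label = ["others", "bugs"]
assert dict(zip([""] + reduced_labels, map_label)) == {"": "others", "bug": "bugs"}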
def enrich_colic_analysis(self, ocean_backend, enrich_backend, no_incremental=False,
                          out_index="colic_enrich_graal_repo", interval_months=[3],
                          date_field="grimoire_creation_date"):
    logger.info("[colic] study enrich-colic-analysis start")

    es_in = ES([enrich_backend.elastic_url], retry_on_timeout=True, timeout=100,
               verify_certs=self.elastic.requests.verify,
               connection_class=RequestsHttpConnection)
    in_index = enrich_backend.elastic.index
    interval_months = list(map(int, interval_months))

    unique_repos = es_in.search(index=in_index, body=get_unique_repository())
    repositories = [repo['key'] for repo in
                    unique_repos['aggregations']['unique_repos'].get('buckets', [])]

    logger.info("[colic] study enrich-colic-analysis {} repositories to process".format(
        len(repositories)))
    es_out = ElasticSearch(enrich_backend.elastic.url, out_index, mappings=Mapping)
    es_out.add_alias("colic_study")

    current_month = datetime_utcnow().replace(day=1, hour=0, minute=0, second=0)
    num_items = 0
    ins_items = 0

    for repository_url in repositories:
        repository_url_anonymized = repository_url
        if repository_url_anonymized.startswith('http'):
            repository_url_anonymized = anonymize_url(repository_url_anonymized)

        logger.info("[colic] study enrich-colic-analysis start analysis for {}".format(
            repository_url_anonymized))
        evolution_items = []

        for interval in interval_months:
            to_month = get_to_date(es_in, in_index, out_index, repository_url, interval)
            to_month = to_month.replace(month=int(interval), day=1, hour=0, minute=0, second=0)

            while to_month < current_month:
                copyrighted_files_at_time = es_in.search(
                    index=in_index,
                    body=self.__get_copyrighted_files(repository_url, to_month.isoformat()))
                licensed_files_at_time = es_in.search(
                    index=in_index,
                    body=self.__get_licensed_files(repository_url, to_month.isoformat()))
                files_at_time = es_in.search(
                    index=in_index,
                    body=self.__get_total_files(repository_url, to_month.isoformat()))

                licensed_files = int(licensed_files_at_time["aggregations"]["1"]["value"])
                copyrighted_files = int(copyrighted_files_at_time["aggregations"]["1"]["value"])
                total_files = int(files_at_time["aggregations"]["1"]["value"])

                if not total_files:
                    to_month = to_month + relativedelta(months=+interval)
                    continue

                evolution_item = {
                    "id": "{}_{}_{}".format(to_month.isoformat(),
                                            hash(repository_url_anonymized), interval),
                    "repo_url": repository_url_anonymized,
                    "origin": repository_url,
                    "interval_months": interval,
                    "study_creation_date": to_month.isoformat(),
                    "licensed_files": licensed_files,
                    "copyrighted_files": copyrighted_files,
                    "total_files": total_files
                }
                evolution_item.update(self.get_grimoire_fields(
                    evolution_item["study_creation_date"], "stats"))
                evolution_items.append(evolution_item)

                if len(evolution_items) >= self.elastic.max_items_bulk:
                    num_items += len(evolution_items)
                    ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())
                    evolution_items = []

                to_month = to_month + relativedelta(months=+interval)

            if len(evolution_items) > 0:
                num_items += len(evolution_items)
                ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())

            if num_items != ins_items:
                missing = num_items - ins_items
                logger.error("[colic] study enrich-colic-analysis {}/{} missing items "
                             "for Graal CoLic Analysis Study".format(missing, num_items))
            else:
                logger.info("[colic] study enrich-colic-analysis {} items inserted "
                            "for Graal CoLic Analysis Study".format(num_items))

        logger.info("[colic] study enrich-colic-analysis end analysis for {} "
                    "with month interval".format(repository_url_anonymized))

    logger.info("[colic] study enrich-colic-analysis end")
                          args.index, mappings=TwitterOcean.mapping)

total = 0
first_date = None
last_date = None
ids = []
tweets = []

for tweet in fetch_tweets(args.json_dir):
    # Check first and last dates
    tweet_date = parser.parse(tweet['created_at'])
    if not first_date or tweet_date <= first_date:
        first_date = tweet_date
    if not last_date or tweet_date >= last_date:
        last_date = tweet_date
    total += 1
    tweets.append(tweet)
    ids.append(tweet["id_str"])

logging.info("%s -> %s", first_date, last_date)
logging.info("Total tweets to be imported: %i", len(ids))
logging.info("Total unique tweets to be imported: %i", len(set(ids)))

# Upload data to ES. The id is "id_str" and the type "items"
total = elastic.bulk_upload(tweets, "id_str")
logging.info("Total tweets imported: %i", total)
    eitem.update(item_meta)
    yield eitem


if __name__ == '__main__':
    ARGS = get_params()

    if ARGS.debug:
        logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(message)s')
        logging.debug("Debug mode activated")
    else:
        logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')

    logging.info("Importing items from %s to %s/%s",
                 ARGS.collection, ARGS.elastic_url, ARGS.index)

    elastic = ElasticSearch(ARGS.elastic_url, ARGS.index)

    if ARGS.collection:
        mongo_items = fetch_mongodb_collection(ARGS.collection,
                                               ARGS.mongo_host, ARGS.mongo_port)
    elif ARGS.project:
        mongo_items = fetch_mongodb_project(ARGS.project,
                                            ARGS.mongo_host, ARGS.mongo_port)
    elif ARGS.all_collections:
        mongo_items = fetch_mongodb_all(ARGS.mongo_host, ARGS.mongo_port)
    else:
        raise RuntimeError('Collection to be processed not provided')

    if mongo_items:
        logging.info("Loading collections in Elasticsearch")
        elastic.bulk_upload(mongo_items, "id")
def execute(self):
    cfg = self.config.get_conf()

    if 'gerrit' not in cfg or 'git' not in cfg:
        logger.error("gerrit and git are needed for track items.")
        return

    # We need to track the items in all git repositories from OPNFV
    git_repos = []
    repos_raw = TaskProjects.get_repos_by_backend_section("git")
    # git://git.opnfv.org/apex -> https://git.opnfv.org/apex/plain/UPSTREAM
    for repo in repos_raw:
        repo = repo.replace("git://", "https://")
        repo += "/plain/UPSTREAM"
        git_repos.append(repo)

    project = cfg['track_items']['project']
    elastic_url_enrich = cfg['es_enrichment']['url']

    # The raw data comes from the upstream project
    elastic_url_raw = cfg['track_items']['upstream_raw_es_url']
    index_gerrit_raw = cfg['track_items']['raw_index_gerrit']
    index_git_raw = cfg['track_items']['raw_index_git']

    index_gerrit_enrich = cfg['gerrit']['enriched_index']
    index_git_enrich = cfg['git']['enriched_index']

    db_config = {
        "database": cfg['sortinghat']['database'],
        "user": cfg['sortinghat']['user'],
        "password": cfg['sortinghat']['password'],
        "host": cfg['sortinghat']['host']
    }

    logger.debug("Importing track items from %s", git_repos)

    #
    # Gerrit Reviews
    #
    gerrit_uris = []
    for git_repo in git_repos:
        gerrit_uris += fetch_track_items(git_repo, self.ITEMS_DATA_SOURCE)

    gerrit_numbers = get_gerrit_numbers(gerrit_uris)
    logger.info("Total gerrit track items to be imported: %i", len(gerrit_numbers))
    enriched_items = enrich_gerrit_items(elastic_url_raw, index_gerrit_raw,
                                         gerrit_numbers, project, db_config)
    logger.info("Total gerrit track items enriched: %i", len(enriched_items))
    elastic = ElasticSearch(elastic_url_enrich, index_gerrit_enrich)
    total = elastic.bulk_upload(enriched_items, "uuid")

    #
    # Git Commits
    #
    commits_sha = get_commits_from_gerrit(elastic_url_raw, index_gerrit_raw, gerrit_numbers)
    logger.info("Total git track items to be checked: %i", len(commits_sha))
    enriched_items = enrich_git_items(elastic_url_raw, index_git_raw,
                                      commits_sha, project, db_config)
    logger.info("Total git track items enriched: %i", len(enriched_items))
    elastic = ElasticSearch(elastic_url_enrich, index_git_enrich)
    total = elastic.bulk_upload(enriched_items, "uuid")
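# Illustrative sketch (not part of the task): the transformation applied above to each raw
# git repository URL before its UPSTREAM file is fetched, using the example from the comment.
repo = "git://git.opnfv.org/apex"
repo = repo.replace("git://", "https://") + "/plain/UPSTREAM"
assert repo == "https://git.opnfv.org/apex/plain/UPSTREAM"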