def __enrich_items(self):
    """Enrich the raw items of every repository of this backend section.

    The configuration is read through ``self.config.get_conf()``. When the
    ``general`` section defines ``scroll_size`` or ``bulk_size`` they are
    copied to ``ElasticItems.scroll_size`` and
    ``ElasticSearch.max_items_bulk`` respectively. ``enrich_backend`` is
    then called once per repository returned by
    ``TaskProjects.get_repos_by_backend_section``.

    Once a repository has been enriched and ``self.enrich_aliases`` is
    still falsy, the ``TaskPanelsAliases`` task is executed and
    ``self.enrich_aliases`` is set to ``True`` so it is not run again.

    Raises:
        DataEnrichmentError: if any exception is raised while enriching a
            repository. The original exception is attached as the cause.
    """
    # NOTE(review): the source of this method had lost its indentation; the
    # alias-creation step is assumed to live inside the per-repository loop
    # (after the try/except) -- confirm against the upstream file.
    time_start = time.time()

    logger.info('[%s] enrichment starts', self.backend_section)

    cfg = self.config.get_conf()
    general_cfg = cfg['general']

    # These are class-level settings: they affect every user of the classes.
    if 'scroll_size' in general_cfg:
        ElasticItems.scroll_size = general_cfg['scroll_size']

    if 'bulk_size' in general_cfg:
        ElasticSearch.max_items_bulk = general_cfg['bulk_size']

    no_incremental = False
    only_studies = False
    only_identities = False

    github_token = None
    if 'github' in cfg and 'backend_token' in cfg['github']:
        github_token = cfg['github']['backend_token']

    pair_programming = False
    if 'git' in cfg and 'pair-programming' in cfg['git']:
        pair_programming = cfg['git']['pair-programming']

    # repos could change between executions because changes in projects.
    # Fall back to an empty list so a None result is reported by the
    # warning below instead of crashing the loop with a TypeError.
    repos = TaskProjects.get_repos_by_backend_section(self.backend_section) or []

    if not repos:
        logger.warning("No enrich repositories for %s", self.backend_section)

    for repo in repos:
        # First process p2o params from repo
        p2o_args = self._compose_p2o_params(self.backend_section, repo)
        filter_raw = p2o_args.get('filter-raw')
        filters_raw_prefix = p2o_args.get('filters-raw-prefix')
        jenkins_rename_file = p2o_args.get('jenkins-rename-file')
        url = p2o_args['url']

        # Second process perceval params from repo
        backend_args = self._compose_perceval_params(self.backend_section, url)

        # The configuration lookups stay inside the try block on purpose:
        # a missing key is reported as a DataEnrichmentError like any
        # other enrichment failure.
        try:
            es_col_url = self._get_collection_url()
            logger.debug('[%s] enrichment starts for %s',
                         self.backend_section, repo)
            backend = self.get_backend(self.backend_section)
            enrich_backend(
                es_col_url,
                self.clean,
                backend,
                backend_args,
                cfg[self.backend_section]['raw_index'],
                cfg[self.backend_section]['enriched_index'],
                None,  # projects_db is deprecated
                cfg['projects']['projects_file'],
                cfg['sortinghat']['database'],
                no_incremental,
                only_identities,
                github_token,
                False,  # studies are executed in its own Task
                only_studies,
                cfg['es_enrichment']['url'],
                None,  # args.events_enrich
                cfg['sortinghat']['user'],
                cfg['sortinghat']['password'],
                cfg['sortinghat']['host'],
                None,  # args.refresh_projects,
                None,  # args.refresh_identities,
                author_id=None,
                author_uuid=None,
                filter_raw=filter_raw,
                filters_raw_prefix=filters_raw_prefix,
                jenkins_rename_file=jenkins_rename_file,
                unaffiliated_group=cfg['sortinghat']['unaffiliated_group'],
                pair_programming=pair_programming)
        except Exception as ex:
            logger.error(
                "Something went wrong producing enriched data for %s . "
                "Using the backend_args: %s ",
                self.backend_section, backend_args)
            logger.error("Exception: %s", ex)
            # Chain the original exception so its traceback is not lost.
            raise DataEnrichmentError(
                'Failed to produce enriched data for ' + self.backend_section
            ) from ex

        # Let's try to create the aliases for the enriched index
        if not self.enrich_aliases:
            logger.debug("Creating aliases after enrich")
            task_aliases = TaskPanelsAliases(self.config)
            task_aliases.set_backend_section(self.backend_section)
            task_aliases.execute()
            logger.debug("Done creating aliases after enrich")
            self.enrich_aliases = True

    spent_time = time.strftime("%H:%M:%S",
                               time.gmtime(time.time() - time_start))
    logger.info('[%s] enrichment finished in %s',
                self.backend_section, spent_time)
def __enrich_items(self):
    """Run ``enrich_backend`` for each repository in ``self.repos``.

    Settings are read from ``self.conf``. A ``KeyError`` raised while
    enriching one repository is logged with its traceback and the loop
    carries on with the next repository. The elapsed wall-clock time is
    logged at the end.
    """
    # NOTE(review): the source of this method had lost its indentation; the
    # five-second sleep is assumed to run once after the loop rather than
    # once per repository -- confirm against the upstream file.
    started_at = time.time()

    logger.info('[%s] enrichment starts', self.backend_name)

    conf = self.conf

    # Fixed positional flags handed to enrich_backend below.
    incremental_disabled = False
    studies_only = False
    identities_only = False

    token = None
    if 'github' in self.conf and 'backend_token' in self.conf['github']:
        token = self.conf['github']['backend_token']

    for repo in self.repos:
        # p2o parameters come first: they carry the URL and optional filter
        p2o_params = self.compose_p2o_params(self.backend_name, repo)
        raw_filter = None
        if 'filter-raw' in p2o_params:
            raw_filter = p2o_params['filter-raw']
        repo_url = p2o_params['url']

        # perceval parameters are derived from the repository URL
        perceval_params = self.compose_perceval_params(self.backend_name,
                                                       repo_url)

        try:
            collection_url = self._get_collection_url()
            logger.debug('[%s] enrichment starts for %s',
                         self.backend_name, repo)
            enrich_backend(
                collection_url,
                self.clean,
                self.backend_name,
                perceval_params,
                conf[self.backend_name]['raw_index'],
                conf[self.backend_name]['enriched_index'],
                None,  # projects_db is deprecated
                conf['projects_file'],
                conf['sh_database'],
                incremental_disabled,
                identities_only,
                token,
                False,  # studies are executed in its own Task
                studies_only,
                conf['es_enrichment'],
                None,  # args.events_enrich
                conf['sh_user'],
                conf['sh_password'],
                conf['sh_host'],
                None,  # args.refresh_projects
                None,  # args.refresh_identities
                author_id=None,
                author_uuid=None,
                filter_raw=raw_filter)
        except KeyError as err:
            logger.exception(err)

    time.sleep(5)  # Safety sleep to avoid too quick execution

    elapsed_seconds = time.time() - started_at
    elapsed_text = time.strftime("%H:%M:%S", time.gmtime(elapsed_seconds))
    logger.info('[%s] enrichment finished in %s',
                self.backend_name, elapsed_text)
args.project, args.arthur) # Wait for one second, to ensure bulk write reflects in searches # https://www.elastic.co/guide/en/elasticsearch/reference/6.1/docs-refresh.html # (there are better ways of doing this, but for now...) time.sleep(1) logging.info("Backend feed completed") if args.enrich or args.enrich_only: unaffiliated_group = None enrich_backend( url, clean, args.backend, args.backend_args, args.index, args.index_enrich, args.db_projects_map, args.json_projects_map, args.db_sortinghat, args.no_incremental, args.only_identities, args.github_token, args.studies, args.only_studies, args.elastic_url_enrich, args.events_enrich, args.db_user, args.db_password, args.db_host, args.refresh_projects, args.refresh_identities, args.author_id, args.author_uuid, args.filter_raw, args.filters_raw_prefix, args.jenkins_rename_file, unaffiliated_group, args.pair_programming) logging.info("Enrich backend completed") elif args.events_enrich: logging.info("Enrich option is needed for events_enrich") else: logging.error("You must configure a backend") except KeyboardInterrupt: logging.info("\n\nReceived Ctrl-C or other break signal. Exiting.\n") sys.exit(0)
ElasticItems.scroll_size = args.scroll_size if not args.enrich_only: feed_backend(url, clean, args.fetch_cache, args.backend, args.backend_args, args.index, args.index_enrich, args.project, args.arthur) logging.info("Backed feed completed") if args.enrich or args.enrich_only: enrich_backend(url, clean, args.backend, args.backend_args, args.index, args.index_enrich, args.db_projects_map, args.json_projects_map, args.db_sortinghat, args.no_incremental, args.only_identities, args.github_token, args.studies, args.only_studies, args.elastic_url_enrich, args.events_enrich, args.db_user, args.db_password, args.db_host, args.refresh_projects, args.refresh_identities, args.author_id, args.author_uuid, args.filter_raw, args.filters_raw_prefix, args.jenkins_rename_file) logging.info("Enrich backend completed") elif args.events_enrich: logging.info("Enrich option is needed for events_enrich") else: logging.error("You must configure a backend") except KeyboardInterrupt: logging.info("\n\nReceived Ctrl-C or other break signal. Exiting.\n") sys.exit(0)