Example #1
    def test_get_repos_by_backend_sections_unknown(self):
        """Test whether the repos of each section are properly loaded when the unknown section is present"""

        config = Config(CONF_FILE_UNKNOWN)
        task = TaskProjects(config)
        self.assertEqual(task.execute(), None)

        # repos not in unknown
        expected_list = ["https://github.com/chaoss/grimoirelab-perceval"]

        repos = task.get_repos_by_backend_section("git")
        self.assertListEqual(repos, expected_list)

        repos = task.get_repos_by_backend_section("git", raw=False)
        self.assertListEqual(repos, expected_list)

        # repos only in unknown
        expected_list = ["https://bugzilla.mozilla.org"]

        repos = task.get_repos_by_backend_section("bugzillarest")
        self.assertListEqual(repos, expected_list)

        repos = task.get_repos_by_backend_section("bugzillarest", raw=False)
        self.assertListEqual(repos, expected_list)

        # repos in unknown and other section
        expected_list = ["gerrit.onosproject.org"]

        repos = task.get_repos_by_backend_section("gerrit:onos")
        self.assertListEqual(repos, expected_list)

        expected_list = [
            "gerrit.onosproject.org --filter-raw=data.project:OnosSystemTest",
            "gerrit.onosproject.org --filter-raw=data.project:OnosSystemTestJenkins",
            "gerrit.onosproject.org --filter-raw=data.project:cord-openwrt",
            "gerrit.onosproject.org --filter-raw=data.project:fabric-control",
            "gerrit.onosproject.org --filter-raw=data.project:manifest"
        ]

        repos = task.get_repos_by_backend_section("gerrit:onos", raw=False)
        repos.sort()
        expected_list.sort()
        self.assertListEqual(repos, expected_list)
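
To make the expectations above concrete, here is a minimal sketch of the projects.json layout this test appears to assume; the project names and exact nesting are illustrative, not the real test fixture. Repos listed under the special "unknown" section are collected as a whole, while a project entry can point at the same source with --filter-raw so that only a slice of it is enriched.

# Illustrative sketch only: project names ("grimoirelab", "onos") are assumptions.
projects = {
    "unknown": {
        "bugzillarest": ["https://bugzilla.mozilla.org"],
        "gerrit:onos": ["gerrit.onosproject.org"]
    },
    "grimoirelab": {
        "git": ["https://github.com/chaoss/grimoirelab-perceval"]
    },
    "onos": {
        "gerrit:onos": [
            "gerrit.onosproject.org --filter-raw=data.project:OnosSystemTest"
        ]
    }
}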
Example #2
    def test_get_repos_by_backend_section(self):
        """Test whether the repos of each section are properly loaded"""

        config = Config(CONF_FILE)
        task = TaskProjects(config)
        self.assertEqual(task.execute(), None)

        backend_sections = list({sect for sect in config.conf.keys()
                                 for backend_section in Config.get_backend_sections()
                                 if sect and sect.startswith(backend_section)})
        backend_sections.sort()
        backend = backend_sections[0]

        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'askbot')
        self.assertEqual(repos, ['https://ask.puppet.com'])

        backend = backend_sections[1]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'bugzilla')
        self.assertEqual(repos, ['https://bugs.eclipse.org/bugs/'])

        backend = backend_sections[2]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'bugzillarest')
        self.assertEqual(repos, ['https://bugzilla.mozilla.org'])

        backend = backend_sections[3]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'confluence')
        self.assertEqual(repos, ['https://wiki.open-o.org/'])

        backend = backend_sections[4]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'discourse')
        self.assertEqual(repos, ['https://foro.mozilla-hispano.org/'])

        backend = backend_sections[5]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'dockerhub')
        self.assertEqual(repos, ['bitergia kibiter'])

        backend = backend_sections[6]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'functest')
        self.assertEqual(repos, ['http://testresults.opnfv.org/test/'])

        backend = backend_sections[7]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'gerrit')
        self.assertEqual(repos, ['review.openstack.org'])

        backend = backend_sections[8]
        repos = task.get_repos_by_backend_section(backend)
        repos.sort()
        expected_list = [
            "https://github.com/VizGrimoire/GrimoireLib "
            "--filter-raw-prefix=data.files.file:grimoirelib_alch,data.files.file:README.md",
            "https://github.com/MetricsGrimoire/CMetrics"]
        expected_list.sort()
        self.assertEqual(backend, 'git')
        self.assertEqual(repos, expected_list)

        backend = backend_sections[9]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'github')
        self.assertEqual(repos, ['https://github.com/grimoirelab/perceval'])

        backend = backend_sections[10]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'github:pull')
        self.assertEqual(repos, ['https://github.com/grimoirelab/perceval'])

        backend = backend_sections[11]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'gitlab')
        self.assertEqual(repos, ['https://gitlab.com/inkscape/inkscape-web'])

        backend = backend_sections[12]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'google_hits')
        self.assertEqual(repos, ['bitergia grimoirelab'])

        backend = backend_sections[13]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'hyperkitty')
        self.assertEqual(repos,
                         ['https://lists.mailman3.org/archives/list/[email protected]'])

        backend = backend_sections[14]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'jenkins')
        self.assertEqual(repos, ['https://build.opnfv.org/ci'])

        backend = backend_sections[15]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'jira')
        self.assertEqual(repos, ['https://jira.opnfv.org'])

        backend = backend_sections[16]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'mattermost')
        self.assertEqual(repos, ['https://chat.openshift.io 8j366ft5affy3p36987pcugaoa'])

        backend = backend_sections[17]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'mattermost:group1')
        self.assertEqual(repos, ['https://chat.openshift.io 8j366ft5affy3p36987cip'])

        backend = backend_sections[18]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'mattermost:group2')
        self.assertEqual(repos, ['https://chat.openshift.io 8j366ft5affy3p36987ciop'])

        backend = backend_sections[19]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'mbox')
        self.assertEqual(repos, ['metrics-grimoire ~/.perceval/mbox'])

        backend = backend_sections[20]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'mediawiki')
        self.assertEqual(repos, ['https://wiki.mozilla.org'])

        backend = backend_sections[21]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'meetup')
        self.assertEqual(repos, ['South-East-Puppet-User-Group'])

        backend = backend_sections[22]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'mozillaclub')
        self.assertEqual(repos,
                         ['https://spreadsheets.google.com/feeds/cells/'
                          '1QHl2bjBhMslyFzR5XXPzMLdzzx7oeSKTbgR5PM8qp64/ohaibtm/public/values?alt=json'])

        backend = backend_sections[23]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'nntp')
        self.assertEqual(repos, ['news.mozilla.org mozilla.dev.project-link'])

        backend = backend_sections[24]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'phabricator')
        self.assertEqual(repos, ['https://phabricator.wikimedia.org'])

        backend = backend_sections[25]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'pipermail')
        self.assertEqual(repos, ['https://mail.gnome.org/archives/libart-hackers/'])

        backend = backend_sections[26]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'puppetforge')
        self.assertEqual(repos, [''])

        backend = backend_sections[27]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'redmine')
        self.assertEqual(repos, ['http://tracker.ceph.com/'])

        backend = backend_sections[28]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'remo')
        self.assertEqual(repos, ['https://reps.mozilla.org'])

        backend = backend_sections[29]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'remo:activities')
        self.assertEqual(repos, ['https://reps.mozilla.org'])

        backend = backend_sections[30]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'rss')
        self.assertEqual(repos, ['https://blog.bitergia.com/feed/'])

        backend = backend_sections[31]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'slack')
        self.assertEqual(repos, ['C7LSGB0AU'])

        backend = backend_sections[32]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'stackexchange')

        repos.sort()
        expected_list = [
            "https://stackoverflow.com/questions/tagged/ovirt",
            "https://stackoverflow.com/questions/tagged/rdo",
            "https://stackoverflow.com/questions/tagged/kibana"
        ]
        expected_list.sort()
        self.assertEqual(repos, expected_list)

        backend = backend_sections[33]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'supybot')
        self.assertEqual(repos,
                         ['openshift ~/.perceval/irc/percevalbot/logs/ChannelLogger/freenode/#openshift/'])

        backend = backend_sections[34]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'telegram')
        self.assertEqual(repos, ['Mozilla_analytics'])

        backend = backend_sections[35]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'twitter')
        self.assertEqual(repos, ['bitergia'])
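
As a side note, the comprehension at the top of this test can be hard to parse. The standalone sketch below (stand-in data; known_backends plays the role of Config.get_backend_sections()) shows what it computes: a section qualifies when its name starts with a known backend name, so plain sections ("git") and sub-sections ("gerrit:onos", "mattermost:group1") both match.

conf_keys = ["general", "git", "gerrit:onos", "mattermost:group1"]
known_backends = ["git", "gerrit", "mattermost"]  # stand-in for Config.get_backend_sections()
backend_sections = sorted({sect for sect in conf_keys
                           for backend in known_backends
                           if sect.startswith(backend)})
print(backend_sections)  # ['gerrit:onos', 'git', 'mattermost:group1']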
Example #3
    def execute(self):
        def check_arthur_task(repo, backend_args):
            """ Check if a task exists in arthur and if not, create it """
            arthur_repo_json = self.__create_arthur_json(repo, backend_args)
            logger.debug('JSON config for arthur %s',
                         json.dumps(arthur_repo_json, indent=True))

            # First check whether the task already exists
            try:
                r = requests.post(self.arthur_url + "/tasks")
            except requests.exceptions.ConnectionError as ex:
                logger.error("Cannot connect to %s", self.arthur_url)
                raise RuntimeError("Cannot connect to " + self.arthur_url) from ex

            task_ids = [task['task_id'] for task in r.json()['tasks']]
            new_task_ids = [
                task['task_id'] for task in arthur_repo_json['tasks']
            ]
            # TODO: if a task already exists maybe we should delete and re-add it
            already_tasks = list(set(task_ids).intersection(set(new_task_ids)))
            if already_tasks:
                logger.warning(
                    "Tasks not added to arthur because they already exist: %s",
                    already_tasks)
            else:
                r = requests.post(self.arthur_url + "/add",
                                  json=arthur_repo_json)
                r.raise_for_status()
                logger.info('[%s] collection configured in arthur for %s',
                            self.backend_section, repo)

        def collect_arthur_items(repo):
            aitems = self.__feed_backend_arthur(repo)
            if not aitems:
                return
            connector = get_connector_from_name(self.backend_section)
            klass = connector[1]  # Ocean backend for the connector
            ocean_backend = klass(None)
            es_col_url = self._get_collection_url()
            es_index = self.conf[self.backend_section]['raw_index']
            clean = False
            elastic_ocean = get_elastic(es_col_url, es_index, clean,
                                        ocean_backend)
            ocean_backend.set_elastic(elastic_ocean)
            ocean_backend.feed(arthur_items=aitems)

        cfg = self.config.get_conf()

        if 'collect' in cfg[self.backend_section] and not cfg[
                self.backend_section]['collect']:
            logger.info('%s collect disabled', self.backend_section)
            return

        if 'scroll_size' in cfg['general']:
            ElasticItems.scroll_size = cfg['general']['scroll_size']

        if 'bulk_size' in cfg['general']:
            ElasticSearch.max_items_bulk = cfg['general']['bulk_size']

        logger.info('Programming arthur for [%s] raw data collection',
                    self.backend_section)
        clean = False

        fetch_archive = False
        if 'fetch-archive' in self.conf[self.backend_section] and self.conf[
                self.backend_section]['fetch-archive']:
            fetch_archive = True

        # repos could change between executions because of changes in projects.json
        repos = TaskProjects.get_repos_by_backend_section(self.backend_section)

        if not repos:
            logger.warning("No repositories to collect for %s",
                           self.backend_section)

        for repo in repos:
            # If the repo already exists don't try to add it to arthur
            tag = self.backend_tag(repo)
            if tag not in self.arthur_items:
                self.arthur_items[tag] = []
                repo, repo_labels = self._extract_repo_labels(
                    self.backend_section, repo)
                p2o_args = self._compose_p2o_params(self.backend_section, repo)
                filter_raw = p2o_args[
                    'filter-raw'] if 'filter-raw' in p2o_args else None
                if filter_raw:
                    # If filter-raw exists it means that there is an equivalent URL
                    # in the `unknown` section of the projects.json. Thus the URL with
                    # filter-raw is ignored in the collection phase, while the URL
                    # in `unknown` is considered in this phase.
                    logging.warning("Not collecting filter raw repository: %s",
                                    repo)
                    continue
                backend_args = self._compose_perceval_params(
                    self.backend_section, repo)
                logger.debug(backend_args)

                check_arthur_task(repo, backend_args)

            collect_arthur_items(repo)
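
The code above only reads the 'tasks' list and each task's 'task_id' from arthur's responses, so the payload built by __create_arthur_json presumably looks roughly like this sketch; every key other than 'tasks' and 'task_id' is an assumption for illustration.

arthur_repo_json = {
    "tasks": [
        {
            "task_id": "https://github.com/chaoss/grimoirelab-perceval",
            "backend": "git",  # assumed key
            "backend_args": {   # assumed key
                "uri": "https://github.com/chaoss/grimoirelab-perceval"
            }
        }
    ]
}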
Example #4
    def execute(self):

        errors = []
        cfg = self.config.get_conf()

        if 'scroll_size' in cfg['general']:
            ElasticItems.scroll_size = cfg['general']['scroll_size']

        if 'bulk_size' in cfg['general']:
            ElasticSearch.max_items_bulk = cfg['general']['bulk_size']

        if 'collect' in cfg[self.backend_section] and not cfg[
                self.backend_section]['collect']:
            logger.info('%s collect disabled', self.backend_section)
            return errors

        t2 = time.time()
        logger.info('[%s] collection phase starts', self.backend_section)
        print("Collection for {}: starting...".format(self.backend_section))
        clean = False

        fetch_archive = False
        if 'fetch-archive' in cfg[self.backend_section] and cfg[
                self.backend_section]['fetch-archive']:
            fetch_archive = True

        # repos could change between executions because of changes in projects.json
        repos = TaskProjects.get_repos_by_backend_section(self.backend_section)

        if not repos:
            logger.warning("No repositories to collect for %s",
                           self.backend_section)

        for repo in repos:
            repo, repo_labels = self._extract_repo_labels(
                self.backend_section, repo)
            p2o_args = self._compose_p2o_params(self.backend_section, repo)
            filter_raw = p2o_args[
                'filter-raw'] if 'filter-raw' in p2o_args else None

            if filter_raw:
                # If filter-raw exists it means that there is an equivalent URL
                # in the `unknown` section of the projects.json. Thus the URL with
                # filter-raw is ignored in the collection phase, while the URL
                # in `unknown` is considered in this phase.
                logging.warning("Not collecting filter raw repository: %s",
                                repo)
                continue

            url = p2o_args['url']
            backend_args = self._compose_perceval_params(
                self.backend_section, repo)
            logger.debug(backend_args)
            logger.info('[%s] collection starts for %s', self.backend_section,
                        repo)
            es_col_url = self._get_collection_url()
            ds = self.backend_section
            backend = self.get_backend(self.backend_section)
            project = None  # just used for github in cauldron

            es_aliases = self.select_aliases(cfg, self.backend_section)

            try:
                error_msg = feed_backend(es_col_url,
                                         clean,
                                         fetch_archive,
                                         backend,
                                         backend_args,
                                         cfg[ds]['raw_index'],
                                         cfg[ds]['enriched_index'],
                                         project,
                                         es_aliases=es_aliases,
                                         projects_json_repo=repo,
                                         repo_labels=repo_labels)
                error = {'backend': backend, 'repo': repo, 'error': error_msg}

                errors.append(error)
            except Exception:
                logger.error(
                    "Something went wrong collecting data from this %s repo: %s. "
                    "Using the backend_args: %s",
                    ds, url, str(backend_args))
                traceback.print_exc()
                raise DataCollectionError('Failed to collect data from %s' %
                                          url)
            logger.info('[%s] collection finished for %s',
                        self.backend_section, repo)

        t3 = time.time()
        spent_time = time.strftime("%H:%M:%S", time.gmtime(t3 - t2))
        logger.info('[%s] collection phase finished in %s',
                    self.backend_section, spent_time)
        print("Collection for {}: finished after {} hours".format(
            self.backend_section, spent_time))

        self.retain_data(cfg['general']['retention_time'],
                         self.conf['es_collection']['url'],
                         self.conf[self.backend_section]['raw_index'])

        return errors
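
A caller can use the returned list to report failures. This sketch assumes `task` is an instance of the class above and that feed_backend returns a falsy error_msg when collection succeeds:

errors = task.execute()
failed = [e for e in errors if e['error']]
for e in failed:
    print("{repo} ({backend}): {error}".format(**e))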
Example #5
    def __enrich_items(self):

        time_start = datetime.now()

        logger.info('[%s] enrichment phase starts', self.backend_section)

        cfg = self.config.get_conf()

        if 'scroll_size' in cfg['general']:
            ElasticItems.scroll_size = cfg['general']['scroll_size']

        if 'bulk_size' in cfg['general']:
            ElasticSearch.max_items_bulk = cfg['general']['bulk_size']

        no_incremental = False
        # not used due to https://github.com/chaoss/grimoirelab-elk/pull/773
        github_token = None
        pair_programming = False
        node_regex = None
        if 'git' in cfg and 'pair-programming' in cfg['git']:
            pair_programming = cfg['git']['pair-programming']
        if 'jenkins' in cfg and 'node_regex' in cfg['jenkins']:
            node_regex = cfg['jenkins']['node_regex']
        only_studies = False
        only_identities = False

        # repos could change between executions because of changes in projects.json
        repos = TaskProjects.get_repos_by_backend_section(self.backend_section,
                                                          raw=False)

        if not repos:
            logger.warning("No repositories to enrich for %s",
                           self.backend_section)

        # Get the metadata__timestamp value of the last item inserted in the enriched index before
        # looping over the repos whose data is stored in the same index. This is needed to make sure
        # that the incremental enrichment works for data sources that are collected globally but only
        # partially enriched.
        elastic_enrich = get_elastic(
            cfg['es_enrichment']['url'],
            cfg[self.backend_section]['enriched_index'])
        last_enrich_date = elastic_enrich.get_last_item_field(
            "metadata__timestamp")
        if last_enrich_date:
            last_enrich_date = last_enrich_date.replace(tzinfo=None)

        for repo in repos:
            repo, repo_labels = self._extract_repo_labels(
                self.backend_section, repo)
            p2o_args = self._compose_p2o_params(self.backend_section, repo)
            filter_raw = p2o_args[
                'filter-raw'] if 'filter-raw' in p2o_args else None
            jenkins_rename_file = p2o_args[
                'jenkins-rename-file'] if 'jenkins-rename-file' in p2o_args else None
            url = p2o_args['url']
            # Second, process the perceval params for the repo
            backend_args = self._compose_perceval_params(
                self.backend_section, url)
            studies_args = None

            backend = self.get_backend(self.backend_section)
            if 'studies' in self.conf[self.backend_section] and \
                    self.conf[self.backend_section]['studies']:
                studies_args = self.__load_studies()

            logger.info('[%s] enrichment starts for %s', self.backend_section,
                        self.anonymize_url(repo))
            es_enrich_aliases = self.select_aliases(cfg, self.backend_section)

            try:
                es_col_url = self._get_collection_url()
                enrich_backend(
                    es_col_url,
                    self.clean,
                    backend,
                    backend_args,
                    self.backend_section,
                    cfg[self.backend_section]['raw_index'],
                    cfg[self.backend_section]['enriched_index'],
                    None,  # projects_db is deprecated
                    cfg['projects']['projects_file'],
                    self.db_sh,
                    no_incremental,
                    only_identities,
                    github_token,
                    False,  # studies are executed in their own Task
                    only_studies,
                    cfg['es_enrichment']['url'],
                    None,  # args.events_enrich
                    self.db_user,
                    self.db_password,
                    self.db_host,
                    None,  # args.refresh_projects,
                    None,  # args.refresh_identities,
                    author_id=None,
                    author_uuid=None,
                    filter_raw=filter_raw,
                    jenkins_rename_file=jenkins_rename_file,
                    unaffiliated_group=self.db_unaffiliate_group,
                    pair_programming=pair_programming,
                    node_regex=node_regex,
                    studies_args=studies_args,
                    es_enrich_aliases=es_enrich_aliases,
                    last_enrich_date=last_enrich_date,
                    projects_json_repo=repo,
                    repo_labels=repo_labels)
            except Exception as ex:
                logger.error(
                    "Something went wrong producing enriched data for %s. "
                    "Using the backend_args: %s", self.backend_section,
                    str(backend_args))
                logger.error("Exception: %s", ex)
                raise DataEnrichmentError(
                    'Failed to produce enriched data for ' +
                    self.backend_section)

            logger.info('[%s] enrichment finished for %s',
                        self.backend_section, self.anonymize_url(repo))

        spent_time = str(datetime.now() - time_start).split('.')[0]
        logger.info('[%s] enrichment phase finished in %s',
                    self.backend_section, spent_time)
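
One detail worth noting: last_enrich_date is made naive with replace(tzinfo=None), presumably because get_last_item_field returns an aware (UTC) datetime while the enrichment code compares against naive ones. A stdlib-only illustration:

from datetime import datetime, timezone

aware = datetime(2020, 1, 1, 12, 30, tzinfo=timezone.utc)
naive = aware.replace(tzinfo=None)   # same wall-clock values, offset dropped
assert naive == datetime(2020, 1, 1, 12, 30)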
Example #6
    def __enrich_items(self):

        time_start = time.time()

        logger.info('[%s] enrichment starts', self.backend_section)
        print("Enrichment for {}: starting...".format(self.backend_section))

        cfg = self.config.get_conf()

        if 'scroll_size' in cfg['general']:
            ElasticItems.scroll_size = cfg['general']['scroll_size']

        if 'bulk_size' in cfg['general']:
            ElasticSearch.max_items_bulk = cfg['general']['bulk_size']

        no_incremental = False
        github_token = None
        pair_programming = False
        if 'github' in cfg and 'backend_token' in cfg['github']:
            github_token = cfg['github']['backend_token']
        if 'git' in cfg and 'pair-programming' in cfg['git']:
            pair_programming = cfg['git']['pair-programming']
        only_studies = False
        only_identities = False

        # repos could change between executions because of changes in projects.json
        repos = TaskProjects.get_repos_by_backend_section(self.backend_section)

        if not repos:
            logger.warning("No repositories to enrich for %s",
                           self.backend_section)

        for repo in repos:
            # First, process the p2o params from the repo
            p2o_args = self._compose_p2o_params(self.backend_section, repo)
            filter_raw = p2o_args[
                'filter-raw'] if 'filter-raw' in p2o_args else None
            filters_raw_prefix = p2o_args[
                'filters-raw-prefix'] if 'filters-raw-prefix' in p2o_args else None
            jenkins_rename_file = p2o_args[
                'jenkins-rename-file'] if 'jenkins-rename-file' in p2o_args else None
            url = p2o_args['url']
            # Second, process the perceval params for the repo
            backend_args = self._compose_perceval_params(
                self.backend_section, url)
            studies_args = None

            if 'studies' in self.conf[self.backend_section] and \
                    self.conf[self.backend_section]['studies']:
                studies_args = self.__load_studies()

            try:
                es_col_url = self._get_collection_url()
                logger.debug('[%s] enrichment starts for %s',
                             self.backend_section, repo)
                backend = self.get_backend(self.backend_section)
                enrich_backend(
                    es_col_url,
                    self.clean,
                    backend,
                    backend_args,
                    cfg[self.backend_section]['raw_index'],
                    cfg[self.backend_section]['enriched_index'],
                    None,  # projects_db is deprecated
                    cfg['projects']['projects_file'],
                    cfg['sortinghat']['database'],
                    no_incremental,
                    only_identities,
                    github_token,
                    False,  # studies are executed in their own Task
                    only_studies,
                    cfg['es_enrichment']['url'],
                    None,  # args.events_enrich
                    cfg['sortinghat']['user'],
                    cfg['sortinghat']['password'],
                    cfg['sortinghat']['host'],
                    None,  # args.refresh_projects,
                    None,  # args.refresh_identities,
                    author_id=None,
                    author_uuid=None,
                    filter_raw=filter_raw,
                    filters_raw_prefix=filters_raw_prefix,
                    jenkins_rename_file=jenkins_rename_file,
                    unaffiliated_group=cfg['sortinghat']['unaffiliated_group'],
                    pair_programming=pair_programming,
                    studies_args=studies_args)
            except Exception as ex:
                logger.error(
                    "Something went wrong producing enriched data for %s. "
                    "Using the backend_args: %s", self.backend_section,
                    str(backend_args))
                logger.error("Exception: %s", ex)
                raise DataEnrichmentError(
                    'Failed to produce enriched data for ' +
                    self.backend_section)

            # Let's try to create the aliases for the enriched index
            if not self.enrich_aliases:
                logger.debug("Creating aliases after enrich")
                task_aliases = TaskPanelsAliases(self.config)
                task_aliases.set_backend_section(self.backend_section)
                task_aliases.execute()
                logger.debug("Done creating aliases after enrich")
                self.enrich_aliases = True

        spent_time = time.strftime("%H:%M:%S",
                                   time.gmtime(time.time() - time_start))
        logger.info('[%s] enrichment finished in %s', self.backend_section,
                    spent_time)
        print("Enrichment for {}: finished after {} hours".format(
            self.backend_section, spent_time))
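
A small style note on the examples above: every `p2o_args[k] if k in p2o_args else None` lookup is equivalent to dict.get, which already returns None for missing keys (sample data below is illustrative):

p2o_args = {"url": "https://example.org", "filter-raw": "data.project:demo"}
filter_raw = p2o_args.get("filter-raw")                    # 'data.project:demo'
jenkins_rename_file = p2o_args.get("jenkins-rename-file")  # None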
Example #7
    def execute(self):
        cfg = self.config.get_conf()

        if 'scroll_size' in cfg['general']:
            ElasticItems.scroll_size = cfg['general']['scroll_size']

        if 'bulk_size' in cfg['general']:
            ElasticSearch.max_items_bulk = cfg['general']['bulk_size']

        if ('collect' in cfg[self.backend_section]
                and not cfg[self.backend_section]['collect']):
            logger.info('%s collect disabled', self.backend_section)
            return

        t2 = time.time()
        logger.info('[%s] raw data collection starts', self.backend_section)
        print("Collection for {}: starting...".format(self.backend_section))
        clean = False

        fetch_archive = False
        if ('fetch-archive' in cfg[self.backend_section]
                and cfg[self.backend_section]['fetch-archive']):
            fetch_archive = True

        # repos could change between executions because of changes in projects.json
        repos = TaskProjects.get_repos_by_backend_section(self.backend_section)

        if not repos:
            logger.warning("No repositories to collect for %s",
                           self.backend_section)

        for repo in repos:
            p2o_args = self._compose_p2o_params(self.backend_section, repo)
            filter_raw = p2o_args[
                'filter-raw'] if 'filter-raw' in p2o_args else None

            if filter_raw:
                # If filter-raw exists the goal is to enrich already collected
                # data, so don't collect anything
                logging.warning("Not collecting filter raw repository: %s",
                                repo)
                continue

            url = p2o_args['url']
            backend_args = self._compose_perceval_params(
                self.backend_section, repo)
            logger.debug(backend_args)
            logger.debug('[%s] collection starts for %s', self.backend_section,
                         repo)
            es_col_url = self._get_collection_url()
            ds = self.backend_section
            backend = self.get_backend(self.backend_section)
            project = None  # just used for github in cauldron
            try:
                feed_backend(es_col_url, clean, fetch_archive, backend,
                             backend_args, cfg[ds]['raw_index'],
                             cfg[ds]['enriched_index'], project)
            except Exception:
                logger.error(
                    "Something went wrong collecting data from this %s repo: %s. "
                    "Using the backend_args: %s",
                    ds, url, str(backend_args))
                traceback.print_exc()
                raise DataCollectionError('Failed to collect data from %s' %
                                          url)

        t3 = time.time()

        spent_time = time.strftime("%H:%M:%S", time.gmtime(t3 - t2))
        logger.info('[%s] data collection finished in %s',
                    self.backend_section, spent_time)
        print("Collection for {}: finished in {}".format(
            self.backend_section, spent_time))
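
Note that time.strftime("%H:%M:%S", time.gmtime(seconds)) silently wraps once a phase runs past 24 hours; if that matters, datetime.timedelta keeps the full duration (as Example #5 does by subtracting datetime objects):

import time
from datetime import timedelta

seconds = 26 * 3600 + 90
print(time.strftime("%H:%M:%S", time.gmtime(seconds)))  # '02:01:30' (wrapped)
print(timedelta(seconds=seconds))                       # '1 day, 2:01:30'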
Example #8
    def execute(self):
        cfg = self.config.get_conf()

        if 'gerrit' not in cfg or 'git' not in cfg:
            logger.error("gerrit and git are needed for track items.")
            return

        # We need to track the items in all git repositories from OPNFV
        git_repos = []

        repos_raw = TaskProjects.get_repos_by_backend_section("git")

        # git://git.opnfv.org/apex -> https://git.opnfv.org/apex/plain/UPSTREAM
        for repo in repos_raw:
            repo = repo.replace("git://", "https://")
            repo += "/plain/UPSTREAM"
            git_repos.append(repo)

        project = cfg['track_items']['project']
        elastic_url_enrich = cfg['es_enrichment']['url']

        # The raw data comes from upstream project
        elastic_url_raw = cfg['track_items']['upstream_raw_es_url']
        index_gerrit_raw = cfg['track_items']['raw_index_gerrit']
        index_git_raw = cfg['track_items']['raw_index_git']

        index_gerrit_enrich = cfg['gerrit']['enriched_index']
        index_git_enrich = cfg['git']['enriched_index']

        db_config = {
            "database": cfg['sortinghat']['database'],
            "user": cfg['sortinghat']['user'],
            "password": cfg['sortinghat']['password'],
            "host": cfg['sortinghat']['host']
        }

        logger.debug("Importing track items from %s ", git_repos)

        #
        # Gerrit Reviews
        #
        gerrit_uris = []
        for git_repo in git_repos:
            gerrit_uris += fetch_track_items(git_repo, self.ITEMS_DATA_SOURCE)

        gerrit_numbers = get_gerrit_numbers(gerrit_uris)
        logger.info("Total gerrit track items to be imported: %i",
                    len(gerrit_numbers))
        enriched_items = enrich_gerrit_items(elastic_url_raw, index_gerrit_raw,
                                             gerrit_numbers, project,
                                             db_config)
        logger.info("Total gerrit track items enriched: %i",
                    len(enriched_items))
        elastic = ElasticSearch(elastic_url_enrich, index_gerrit_enrich)
        elastic.bulk_upload(enriched_items, "uuid")

        #
        # Git Commits
        #
        commits_sha = get_commits_from_gerrit(elastic_url_raw,
                                              index_gerrit_raw, gerrit_numbers)
        logger.info("Total git track items to be checked: %i",
                    len(commits_sha))
        enriched_items = enrich_git_items(elastic_url_raw, index_git_raw,
                                          commits_sha, project, db_config)
        logger.info("Total git track items enriched: %i", len(enriched_items))
        elastic = ElasticSearch(elastic_url_enrich, index_git_enrich)
        elastic.bulk_upload(enriched_items, "uuid")
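
The git URL rewrite near the top of this example is easy to get wrong; pulled out as a small helper (the function name is hypothetical), it can be sanity-checked in isolation:

def upstream_file_url(git_repo):
    """Map an OPNFV git:// URL to the UPSTREAM file served over HTTPS."""
    return git_repo.replace("git://", "https://") + "/plain/UPSTREAM"

assert (upstream_file_url("git://git.opnfv.org/apex")
        == "https://git.opnfv.org/apex/plain/UPSTREAM")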
Example #9
    def __enrich_items(self):

        time_start = time.time()

        logger.info('[%s] enrichment phase starts', self.backend_section)
        print("Enrichment for {}: starting...".format(self.backend_section))

        cfg = self.config.get_conf()

        if 'scroll_size' in cfg['general']:
            ElasticItems.scroll_size = cfg['general']['scroll_size']

        if 'bulk_size' in cfg['general']:
            ElasticSearch.max_items_bulk = cfg['general']['bulk_size']

        no_incremental = False
        github_token = None
        pair_programming = False
        node_regex = None
        if 'github' in cfg and 'backend_token' in cfg['github']:
            github_token = cfg['github']['backend_token']
        if 'git' in cfg and 'pair-programming' in cfg['git']:
            pair_programming = cfg['git']['pair-programming']
        if 'jenkins' in cfg and 'node_regex' in cfg['jenkins']:
            node_regex = cfg['jenkins']['node_regex']
        only_studies = False
        only_identities = False

        # repos could change between executions because of changes in projects.json
        repos = TaskProjects.get_repos_by_backend_section(self.backend_section, raw=False)

        if not repos:
            logger.warning("No repositories to enrich for %s", self.backend_section)

        # Get the metadata__timestamp value of the last item inserted in the enriched index before
        # looping over the repos whose data is stored in the same index. This is needed to make sure
        # that the incremental enrichment works for data sources that are collected globally but only
        # partially enriched.
        elastic_enrich = get_elastic(cfg['es_enrichment']['url'], cfg[self.backend_section]['enriched_index'])
        last_enrich_date = elastic_enrich.get_last_item_field("metadata__timestamp")
        if last_enrich_date:
            last_enrich_date = last_enrich_date.replace(second=0, microsecond=0, tzinfo=None)

        for repo in repos:
            # First, process the p2o params from the repo
            p2o_args = self._compose_p2o_params(self.backend_section, repo)
            filter_raw = p2o_args['filter-raw'] if 'filter-raw' in p2o_args else None
            filters_raw_prefix = p2o_args['filter-raw-prefix'] if 'filter-raw-prefix' in p2o_args else None
            jenkins_rename_file = p2o_args['jenkins-rename-file'] if 'jenkins-rename-file' in p2o_args else None
            url = p2o_args['url']
            # Second, process the perceval params for the repo
            backend_args = self._compose_perceval_params(self.backend_section, url)
            studies_args = None

            backend = self.get_backend(self.backend_section)
            if 'studies' in self.conf[self.backend_section] and \
                    self.conf[self.backend_section]['studies']:
                studies_args = self.__load_studies()

            logger.info('[%s] enrichment starts for %s', self.backend_section, repo)
            es_enrich_aliases = self.select_aliases(cfg, self.backend_section)

            try:
                es_col_url = self._get_collection_url()
                enrich_backend(es_col_url, self.clean, backend, backend_args,
                               self.backend_section,
                               cfg[self.backend_section]['raw_index'],
                               cfg[self.backend_section]['enriched_index'],
                               None,  # projects_db is deprecated
                               cfg['projects']['projects_file'],
                               cfg['sortinghat']['database'],
                               no_incremental, only_identities,
                               github_token,
                               False,  # studies are executed in their own Task
                               only_studies,
                               cfg['es_enrichment']['url'],
                               None,  # args.events_enrich
                               cfg['sortinghat']['user'],
                               cfg['sortinghat']['password'],
                               cfg['sortinghat']['host'],
                               None,  # args.refresh_projects,
                               None,  # args.refresh_identities,
                               author_id=None,
                               author_uuid=None,
                               filter_raw=filter_raw,
                               filters_raw_prefix=filters_raw_prefix,
                               jenkins_rename_file=jenkins_rename_file,
                               unaffiliated_group=cfg['sortinghat']['unaffiliated_group'],
                               pair_programming=pair_programming,
                               node_regex=node_regex,
                               studies_args=studies_args,
                               es_enrich_aliases=es_enrich_aliases,
                               last_enrich_date=last_enrich_date)
            except Exception as ex:
                logger.error("Something went wrong producing enriched data for %s . "
                             "Using the backend_args: %s ", self.backend_section, str(backend_args))
                logger.error("Exception: %s", ex)
                raise DataEnrichmentError('Failed to produce enriched data for ' + self.backend_section)

            logger.info('[%s] enrichment finished for %s', self.backend_section, repo)

        spent_time = time.strftime("%H:%M:%S", time.gmtime(time.time() - time_start))
        logger.info('[%s] enrichment phase finished in %s', self.backend_section, spent_time)
        print("Enrichment for {}: finished after {} hours".format(self.backend_section,
                                                                  spent_time))