Example #1
    def setUp(self):
        self.config = Config(CONF_FILE)
        mordred = SirMordred(self.config)

        task = TaskProjects(self.config)
        self.assertEqual(task.execute(), None)

        self.backends = mordred._get_repos_by_backend()
        self.backend_tasks = [TaskRawDataCollection, TaskEnrich]
        self.stopper = threading.Event()
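
The setUp above pairs each backend with raw-collection and enrichment tasks and keeps a threading.Event as a stop signal. A minimal standalone sketch (names are illustrative, not from SirMordred) of how such a stopper can gate a worker loop:

import threading
import time

def run_until_stopped(stopper, poll_seconds=0.1):
    # Toy worker loop: exits as soon as the stopper event is set
    while not stopper.is_set():
        time.sleep(poll_seconds)  # one backend task iteration would go here

stopper = threading.Event()
worker = threading.Thread(target=run_until_stopped, args=(stopper,))
worker.start()
stopper.set()   # signal the worker to finish
worker.join()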
Example #2
    def test__get_projects_from_url(self):
        """Test downloading projects from an URL """
        setup_http_server()

        projects_url = 'http://localhost/projects.json'
        config = Config(CONF_FILE)
        config.set_param('projects', 'projects_url', projects_url)
        task = TaskProjects(config)
        self.assertEqual(task.execute(), None)

        projects = task.get_projects()
        self.assertTrue(URL_PROJECTS_MAIN in projects)
    def test_run_eclipse(self):
        """Test whether the Task could be run getting projects from Eclipse"""
        setup_http_server()

        # Create an empty projects file for testing
        projects_file = 'test-projects-eclipse.json'

        config = Config(CONF_FILE)
        config.set_param('projects', 'load_eclipse', True)
        config.set_param('projects', 'projects_file', projects_file)
        task = TaskProjects(config)

        self.assertEqual(task.execute(), None)
        self.assertEqual(len(task.get_projects().keys()), 302)

        # Let's remove some projects to track changes
        with open(ECLIPSE_PROJECTS_FILE) as eproj:
            remove_project = 'birt'
            add_project = 'new_project'
            new_projects = task.convert_from_eclipse(
                json.load(eproj)['projects'])
            new_projects.pop(remove_project)
            new_projects.update({add_project: {}})
            task.set_projects(new_projects)
            self.assertEqual(sorted(task.get_projects_last_diff()),
                             sorted([add_project, remove_project]))

        remove(projects_file)
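
Note that the diff assertion above needs sorted() rather than list.sort(): sort() orders the list in place and returns None, so comparing the return values of two sort() calls compares None with None and always passes. A standalone illustration:

a = ['birt', 'new_project']
b = ['new_project', 'birt']

assert a.sort() is None          # list.sort() sorts in place and returns None
assert sorted(a) == sorted(b)    # sorted() returns a new list, safe to compare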
    def test_run(self):
        """Test whether the Task could be run"""
        config = Config(CONF_FILE)
        cfg = config.get_conf()
        # We need to load the projects
        TaskProjects(config).execute()
        backend_section = GIT_BACKEND_SECTION
        task = TaskEnrich(config, backend_section=backend_section)
        self.assertEqual(task.execute(), None)

        # Check that the enrichment went well
        es_collection = cfg['es_collection']['url']
        es_enrichment = cfg['es_enrichment']['url']
        raw_index = es_collection + "/" + cfg[GIT_BACKEND_SECTION]['raw_index']
        enrich_index = es_enrichment + "/" + cfg[GIT_BACKEND_SECTION]['enriched_index']

        r = requests.get(raw_index + "/_search?size=0")
        raw_items = r.json()['hits']['total']
        r = requests.get(enrich_index + "/_search?size=0")
        enriched_items = r.json()['hits']['total']

        # The number of raw items is bigger because the enriched items are
        # filtered with --filters-raw-prefix data.files.file:grimoirelib_alch
        # data.files.file:README.md for https://github.com/VizGrimoire/GrimoireLib;
        # see the [git] section in tests/test-projects.json
        self.assertGreater(raw_items, enriched_items)
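
The counts above come from Elasticsearch's _search?size=0 endpoint. A hedged helper for the same query, assuming a reachable Elasticsearch; note that hits.total is a plain integer up to ES 6.x but an object like {'value': N} from ES 7.x onwards:

import requests

def count_items(index_url):
    # Return the number of documents in an index via _search?size=0
    r = requests.get(index_url + "/_search?size=0", verify=False)
    r.raise_for_status()
    total = r.json()['hits']['total']
    return total['value'] if isinstance(total, dict) else total

# count_items("http://localhost:9200/git_raw")  # illustrative index URL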
Example #5
    def _get_repos_by_backend(self):
        """Return a dict mapping each backend section to its list of repositories."""
        output = {}
        projects = TaskProjects.get_projects()

        for pro in projects:
            # remove duplicates in backend_sections with list(set(..))
            backend_sections = list(
                set([
                    sect for sect in projects[pro].keys()
                    for backend_section in Config.get_backend_sections()
                    if sect and sect.startswith(backend_section)
                ]))

            # sort backend sections
            backend_sections.sort()
            for backend_section in backend_sections:
                if backend_section not in output:
                    output[backend_section] = projects[pro][backend_section]
                else:
                    output[backend_section] += projects[pro][backend_section]

        # A backend may appear in the projects/repos file but not be
        # enabled in the SirMordred config file
        enabled = {}
        for k in output:
            if k in self.conf:
                enabled[k] = output[k]

        # logger.debug('repos to be retrieved: %s ', enabled)
        return enabled
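
A toy run of the dedup-and-merge logic above, on a hypothetical projects dict (project names and URLs are made up):

projects = {
    'grimoire': {'git': ['https://repo1.git'], 'github:issue': ['https://gh1']},
    'perceval': {'git': ['https://repo2.git']},
}
backend_sections = ['git', 'github']  # stand-in for Config.get_backend_sections()

output = {}
for pro in projects:
    sections = sorted(set(
        sect for sect in projects[pro]
        for backend_section in backend_sections
        if sect and sect.startswith(backend_section)
    ))
    for sect in sections:
        output.setdefault(sect, []).extend(projects[pro][sect])

print(output)
# {'git': ['https://repo1.git', 'https://repo2.git'], 'github:issue': ['https://gh1']}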
    def test_execute_from_archive(self):
        """Test fetching data from archives"""

        # The projects file ('test-projects-archive.json') is referenced
        # from the config file
        conf_file = 'archives-test.cfg'
        config = Config(conf_file)

        backend_sections = [
            'askbot', 'bugzilla', 'bugzillarest', 'confluence', 'discourse',
            'dockerhub', 'gerrit', 'github:issue', 'github:pull',
            'gitlab:issue', 'gitlab:merge', 'google_hits', 'jenkins', 'jira',
            'mediawiki', 'meetup', 'mozillaclub', 'nntp', 'phabricator',
            'redmine', 'remo', 'rss', 'stackexchange', 'slack', 'telegram',
            'twitter'
        ]

        # We need to load the projects
        TaskProjects(config).execute()
        for backend_section in backend_sections:
            task = TaskRawDataCollection(config,
                                         backend_section=backend_section)
            task.execute()

        for backend_section in backend_sections:
            task = TaskEnrich(config, backend_section=backend_section)
            self.assertEqual(task.execute(), None)
Example #7
    def _get_repos_by_backend(self):
        """Return a dict mapping each backend section to its list of repositories."""
        output = {}
        projects = TaskProjects.get_projects()

        for backend_section in Config.get_backend_sections():
            for pro in projects:
                backend = Task.get_backend(backend_section)
                if backend in projects[pro]:
                    if backend_section not in output:
                        output[backend_section] = projects[pro][backend]
                    else:
                        output[backend_section] += projects[pro][backend]

        # A backend may appear in the projects/repos file but not be
        # enabled in the SirMordred config file
        enabled = {}
        for k in output:
            if k in self.conf:
                enabled[k] = output[k]

        # logger.debug('repos to be retrieved: %s ', enabled)
        return enabled
Example #8
    def test_initialization(self):
        """Test whether attributes are initializated"""

        config = Config(CONF_FILE)
        task = TaskProjects(config)

        self.assertEqual(task.config, config)
    def test_run_eclipse(self):
        """Test whether the Task could be run getting projects from Eclipse"""
        setup_http_server()

        # Create an empty projects file for testing
        projects_file = 'test-projects-eclipse.json'

        config = Config(CONF_FILE)
        config.set_param('projects', 'load_eclipse', True)
        config.set_param('projects', 'projects_file', projects_file)
        task = TaskProjects(config)

        self.assertEqual(task.execute(), None)
        self.assertEqual(len(task.get_projects().keys()), 302)

        remove(projects_file)
    def test_execute(self):
        """Test whether the Task could be run"""

        config = Config(CONF_FILE)
        backend_section = GIT_BACKEND_SECTION
        task = TaskRawDataCollection(config, backend_section=backend_section)
        # We need to load the projects
        TaskProjects(config).execute()
        self.assertEqual(task.execute(), None)
Example #11
def get_identities_load(config):
    """Execute the load identities phase

    :param config: a Mordred config object
    """
    TaskProjects(config).execute()
    task = TaskIdentitiesLoad(config)
    task.execute()
    logging.info("Loading identities finished!")
    def test_convert_from_eclipse(self):
        """Test the conversion from eclipse projects to grimoire projects"""
        setup_http_server()

        projects_file = 'test-projects-eclipse.json'
        config = Config(CONF_FILE)
        config.set_param('projects', 'load_eclipse', True)
        config.set_param('projects', 'projects_file', projects_file)
        task = TaskProjects(config)
        self.assertEqual(task.execute(), None)

        projects = task.get_projects()
        self.assertTrue(TaskProjects.GLOBAL_PROJECT in projects)

        self.assertEqual(projects['birt']['github'][0],
                         'https://github.com/eclipse/birt')

        remove(projects_file)
Example #13
def get_identities_merge(config):
    """Execute the merge identities phase

    :param config: a Mordred config object
    """
    TaskProjects(config).execute()
    task = TaskIdentitiesMerge(config)
    task.execute()
    logging.info("Merging identities finished!")
Example #14
def get_enrich(config, backend_section):
    """Execute the enrich phase for a given backend section

    :param config: a Mordred config object
    :param backend_section: the backend section where the enrich phase is executed
    """

    TaskProjects(config).execute()
    task = TaskEnrich(config, backend_section=backend_section)
    task.execute()
    logging.info("Loading enriched data finished!")
Example #15
    def test_get_repos_by_backend_sections_unknown(self):
        """Test whether the repos of each section are properly loaded when the unknown section is present"""

        config = Config(CONF_FILE_UNKNOWN)
        task = TaskProjects(config)
        self.assertEqual(task.execute(), None)

        # repos not in unknown
        expected_list = ["https://github.com/chaoss/grimoirelab-perceval"]

        repos = task.get_repos_by_backend_section("git")
        self.assertListEqual(repos, expected_list)

        repos = task.get_repos_by_backend_section("git", raw=False)
        self.assertListEqual(repos, expected_list)

        # repos only in unknown
        expected_list = ["https://bugzilla.mozilla.org"]

        repos = task.get_repos_by_backend_section("bugzillarest")
        self.assertListEqual(repos, expected_list)

        repos = task.get_repos_by_backend_section("bugzillarest", raw=False)
        self.assertListEqual(repos, expected_list)

        # repos in unknown and other section
        expected_list = ["gerrit.onosproject.org"]

        repos = task.get_repos_by_backend_section("gerrit:onos")
        self.assertListEqual(repos, expected_list)

        expected_list = [
            "gerrit.onosproject.org --filter-raw=data.project:OnosSystemTest",
            "gerrit.onosproject.org --filter-raw=data.project:OnosSystemTestJenkins",
            "gerrit.onosproject.org --filter-raw=data.project:cord-openwrt",
            "gerrit.onosproject.org --filter-raw=data.project:fabric-control",
            "gerrit.onosproject.org --filter-raw=data.project:manifest"
        ]

        repos = task.get_repos_by_backend_section("gerrit:onos", raw=False)
        repos.sort()
        expected_list.sort()
        self.assertListEqual(repos, expected_list)
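
The raw=False entries above carry --filter-raw= suffixes that the enrichment phase uses. A toy split (not the project's actual parser) of such an entry into the repo URL and its filter:

entry = "gerrit.onosproject.org --filter-raw=data.project:OnosSystemTest"

url, _, raw_filter = entry.partition(' --filter-raw=')
print(url)         # gerrit.onosproject.org
print(raw_filter)  # data.project:OnosSystemTest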
Example #16
def get_raw(config, backend_section):
    """Execute the raw phase for a given backend section

    :param config: a Mordred config object
    :param backend_section: the backend section where the raw phase is executed
    """

    task = TaskRawDataCollection(config, backend_section=backend_section)
    TaskProjects(config).execute()
    try:
        task.execute()
        logging.info("Loading raw data finished!")
    except Exception as e:
        logging.error(str(e))
        sys.exit(-1)
Example #17
def get_raw(config, backend_section, arthur):
    """Execute the raw phase for a given backend section, optionally using Arthur

    :param config: a Mordred config object
    :param backend_section: the backend section where the raw phase is executed
    :param arthur: if true, it enables Arthur to collect the raw data
    """

    if arthur:
        task = TaskRawDataArthurCollection(config,
                                           backend_section=backend_section)
    else:
        task = TaskRawDataCollection(config, backend_section=backend_section)

    TaskProjects(config).execute()
    task.execute()
    logging.info("Loading raw data finished!")
    def test_execute_no_collection(self):
        """Test whether the raw data is not downloaded when --filter-no-collection is true"""

        config = Config(CONF_FILE_NO_COLL)
        cfg = config.get_conf()
        backend_section = GIT_BACKEND_SECTION
        task = TaskRawDataCollection(config, backend_section=backend_section)
        # We need to load the projects
        TaskProjects(config).execute()
        self.assertIsNotNone(task.execute())

        # Check that the --filter-no-collection filter works
        es_collection = cfg['es_collection']['url']
        raw_index = es_collection + "/" + cfg[GIT_BACKEND_SECTION]['raw_index']

        r = requests.get(raw_index + "/_search?size=0", verify=False)
        raw_items = r.json()['hits']['total']
        self.assertEqual(raw_items, 40)
    def test_execute(self):
        """Test whether the Task could be run"""

        config = Config(CONF_FILE)
        cfg = config.get_conf()
        backend_section = GIT_BACKEND_SECTION
        task = TaskRawDataCollection(config, backend_section=backend_section)
        # We need to load the projects
        TaskProjects(config).execute()
        self.assertIsNotNone(task.execute())

        # Check that the collection went well
        es_collection = cfg['es_collection']['url']
        raw_index = es_collection + "/" + cfg[GIT_BACKEND_SECTION]['raw_index']

        r = requests.get(raw_index + "/_search?size=0", verify=False)
        raw_items = r.json()['hits']['total']
        self.assertEqual(raw_items, 3603)
Example #20
def get_raw(config, backend_section, repos_to_check=None):
    """Execute the raw phase for a given backend section

    Repos are only checked if they appear in both `repos_to_check` and `projects.json`

    :param config: a Mordred config object
    :param backend_section: the backend section where the raw phase is executed
    :param repos_to_check: A list of repo URLs to check, or None to check all repos
    """

    task = TaskRawDataCollection(config, backend_section=backend_section, allowed_repos=repos_to_check)
    TaskProjects(config).execute()
    try:
        task.execute()
        logging.info("Loading raw data finished!")
    except Exception as e:
        logging.error(str(e))
        sys.exit(-1)
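
A hedged usage sketch for the repos_to_check variant above; the module path, config file, and URL are illustrative, and repos absent from projects.json are simply not collected:

from sirmordred.config import Config  # module path is an assumption

config = Config('setup.cfg')  # illustrative config file
only_these = ['https://github.com/chaoss/grimoirelab-perceval']
get_raw(config, 'git', repos_to_check=only_these)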
    def setUp(self):
        config = Config(CONF_FILE)
        sh = config.get_conf()['sortinghat']

        self.sh_kwargs = {
            'user': sh['user'],
            'password': sh['password'],
            'database': sh['database'],
            'host': sh['host'],
            'port': None
        }

        # Clean the database to start from an empty state
        Database.drop(**self.sh_kwargs)

        # Create the database
        Database.create(**self.sh_kwargs)
        self.sh_db = Database(**self.sh_kwargs)

    def test_execute(self):
        """Test whether the Task could be run"""
        config = Config(CONF_FILE)
        cfg = config.get_conf()
        # We need to load the projects
        TaskProjects(config).execute()
        backend_section = GIT_BACKEND_SECTION
        task = TaskEnrich(config, backend_section=backend_section)
        self.assertEqual(task.execute(), None)

        # Check that the enrichment went well
        es_collection = cfg['es_collection']['url']
        es_enrichment = cfg['es_enrichment']['url']
        raw_index = es_collection + "/" + cfg[GIT_BACKEND_SECTION]['raw_index']
        enrich_index = es_enrichment + "/" + cfg[GIT_BACKEND_SECTION][
            'enriched_index']

        r = requests.get(raw_index + "/_search?size=0", verify=False)
        raw_items = r.json()['hits']['total']
        r = requests.get(enrich_index + "/_search?size=0", verify=False)
        enriched_items = r.json()['hits']['total']

        self.assertEqual(raw_items, enriched_items)
    def test_studies(self):
        """Test whether the studies configuration works """
        config = Config(CONF_FILE)
        cfg = config.get_conf()
        # We need to load the projects
        TaskProjects(config).execute()
        backend_section = GIT_BACKEND_SECTION
        task = TaskEnrich(config, backend_section=backend_section)

        # Configure no studies
        cfg.set_param('git', 'studies', None)
        self.assertEqual(task.execute(), None)

        # Configure an empty studies list
        cfg.set_param('git', 'studies', [])
        self.assertEqual(task.execute(), None)

        # Configure a wrong study
        cfg.set_param('git', 'studies', ['bad_study'])
        with self.assertRaises(DataEnrichmentError):
            self.assertEqual(task.execute(), None)

        # Configure a single study
        cfg.set_param('git', 'studies', ['enrich_onion'])
        self.assertEqual(task.execute(), None)

        # Configure several studies
        cfg.set_param('git', 'studies',
                      ['enrich_demography:1', 'enrich_areas_of_code'])
        self.assertEqual(task.execute(), None)

        # Configure the kafka_kip study
        cfg.set_param('mbox', 'studies', ['kafka_kip'])
        self.assertEqual(task.execute(), None)

        # Configure several studies, one wrong
        cfg.set_param('git', 'studies',
                      ['enrich_demography:1', "enrich_areas_of_code1"])
        with self.assertRaises(DataEnrichmentError):
            self.assertEqual(task.execute(), None)
    def test_execute_no_sh(self):
        """Test whether the Task could be run without SortingHat"""

        config = Config(CONF_FILE_NO_SH)
        cfg = config.get_conf()
        # We need to load the projects
        TaskProjects(config).execute()
        backend_section = GIT_BACKEND_SECTION
        task = TaskEnrich(config, backend_section=backend_section)
        self.assertEqual(task.execute(), None)

        # Check that the enrichment went well
        es_collection = cfg['es_collection']['url']
        es_enrichment = cfg['es_enrichment']['url']
        raw_index = es_collection + "/" + cfg[GIT_BACKEND_SECTION]['raw_index']
        enrich_index = es_enrichment + "/" + cfg[GIT_BACKEND_SECTION][
            'enriched_index']

        r = requests.get(raw_index + "/_search?size=0", verify=False)
        raw_items = r.json()['hits']['total']
        r = requests.get(enrich_index + "/_search?size=0", verify=False)
        enriched_items = r.json()['hits']['total']

        self.assertEqual(raw_items, enriched_items)
    def execute(self):
        cfg = self.config.get_conf()

        if 'scroll_size' in cfg['general']:
            ElasticItems.scroll_size = cfg['general']['scroll_size']

        if 'bulk_size' in cfg['general']:
            ElasticSearch.max_items_bulk = cfg['general']['bulk_size']

        if ('collect' in cfg[self.backend_section]
                and not cfg[self.backend_section]['collect']):
            logging.info('%s collect disabled', self.backend_section)
            return

        t2 = time.time()
        logger.info('[%s] raw data collection starts', self.backend_section)
        print("Collection for {}: starting...".format(self.backend_section))
        clean = False

        fetch_archive = False
        if ('fetch-archive' in cfg[self.backend_section]
                and cfg[self.backend_section]['fetch-archive']):
            fetch_archive = True

        # repos could change between executions because of changes in projects
        repos = TaskProjects.get_repos_by_backend_section(self.backend_section)

        if not repos:
            logger.warning("No collect repositories for %s",
                           self.backend_section)

        for repo in repos:
            p2o_args = self._compose_p2o_params(self.backend_section, repo)
            filter_raw = p2o_args.get('filter-raw')

            if filter_raw:
                # If filter-raw exists the goal is to enrich already collected
                # data, so don't collect anything
                logging.warning("Not collecting filter raw repository: %s",
                                repo)
                continue

            url = p2o_args['url']
            backend_args = self._compose_perceval_params(
                self.backend_section, repo)
            logger.debug(backend_args)
            logger.debug('[%s] collection starts for %s', self.backend_section,
                         repo)
            es_col_url = self._get_collection_url()
            ds = self.backend_section
            backend = self.get_backend(self.backend_section)
            project = None  # just used for github in cauldron
            try:
                feed_backend(es_col_url, clean, fetch_archive, backend,
                             backend_args, cfg[ds]['raw_index'],
                             cfg[ds]['enriched_index'], project)
            except Exception:
                logger.error(
                    "Something went wrong collecting data from this %s repo: %s. "
                    "Using the backend_args: %s",
                    ds, url, str(backend_args))
                traceback.print_exc()
                raise DataCollectionError('Failed to collect data from %s' %
                                          url)

        t3 = time.time()

        spent_time = time.strftime("%H:%M:%S", time.gmtime(t3 - t2))
        logger.info('[%s] Data collection finished in %s',
                    self.backend_section, spent_time)
        print("Collection for {}: finished after {} hours".format(
            self.backend_section, spent_time))
Example #25
    def test_run(self):
        """Test whether the Task could be run"""
        config = Config(CONF_FILE)
        task = TaskProjects(config)
        self.assertEqual(task.execute(), None)
        self.assertEqual(len(task.get_projects().keys()), 1)
Example #26
    def test_get_repos_by_backend_section(self):
        """Test whether the repos of each section are properly loaded"""

        config = Config(CONF_FILE)
        task = TaskProjects(config)
        self.assertEqual(task.execute(), None)

        backend_sections = list(set([sect for sect in config.conf.keys()
                                     for backend_section in Config.get_backend_sections()
                                     if sect and sect.startswith(backend_section)]))
        backend_sections.sort()
        backend = backend_sections[0]

        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'askbot')
        self.assertEqual(repos, ['https://ask.puppet.com'])

        backend = backend_sections[1]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'bugzilla')
        self.assertEqual(repos, ['https://bugs.eclipse.org/bugs/'])

        backend = backend_sections[2]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'bugzillarest')
        self.assertEqual(repos, ['https://bugzilla.mozilla.org'])

        backend = backend_sections[3]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'confluence')
        self.assertEqual(repos, ['https://wiki.open-o.org/'])

        backend = backend_sections[4]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'discourse')
        self.assertEqual(repos, ['https://foro.mozilla-hispano.org/'])

        backend = backend_sections[5]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'dockerhub')
        self.assertEqual(repos, ['bitergia kibiter'])

        backend = backend_sections[6]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'functest')
        self.assertEqual(repos, ['http://testresults.opnfv.org/test/'])

        backend = backend_sections[7]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'gerrit')
        self.assertEqual(repos, ['review.openstack.org'])

        backend = backend_sections[8]
        repos = task.get_repos_by_backend_section(backend)
        repos.sort()
        expected_list = [
            "https://github.com/VizGrimoire/GrimoireLib "
            "--filter-raw-prefix=data.files.file:grimoirelib_alch,data.files.file:README.md",
            "https://github.com/MetricsGrimoire/CMetrics"]
        expected_list.sort()
        self.assertEqual(backend, 'git')
        self.assertEqual(repos, expected_list)

        backend = backend_sections[9]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'github')
        self.assertEqual(repos, ['https://github.com/grimoirelab/perceval'])

        backend = backend_sections[10]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'github:pull')
        self.assertEqual(repos, ['https://github.com/grimoirelab/perceval'])

        backend = backend_sections[11]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'gitlab')
        self.assertEqual(repos, ['https://gitlab.com/inkscape/inkscape-web'])

        backend = backend_sections[12]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'google_hits')
        self.assertEqual(repos, ['bitergia grimoirelab'])

        backend = backend_sections[13]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'hyperkitty')
        self.assertEqual(repos,
                         ['https://lists.mailman3.org/archives/list/[email protected]'])

        backend = backend_sections[14]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'jenkins')
        self.assertEqual(repos, ['https://build.opnfv.org/ci'])

        backend = backend_sections[15]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'jira')
        self.assertEqual(repos, ['https://jira.opnfv.org'])

        backend = backend_sections[16]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'mattermost')
        self.assertEqual(repos, ['https://chat.openshift.io 8j366ft5affy3p36987pcugaoa'])

        backend = backend_sections[17]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'mattermost:group1')
        self.assertEqual(repos, ['https://chat.openshift.io 8j366ft5affy3p36987cip'])

        backend = backend_sections[18]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'mattermost:group2')
        self.assertEqual(repos, ['https://chat.openshift.io 8j366ft5affy3p36987ciop'])

        backend = backend_sections[19]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'mbox')
        self.assertEqual(repos, ['metrics-grimoire ~/.perceval/mbox'])

        backend = backend_sections[20]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'mediawiki')
        self.assertEqual(repos, ['https://wiki.mozilla.org'])

        backend = backend_sections[21]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'meetup')
        self.assertEqual(repos, ['South-East-Puppet-User-Group'])

        backend = backend_sections[22]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'mozillaclub')
        self.assertEqual(repos,
                         ['https://spreadsheets.google.com/feeds/cells/'
                          '1QHl2bjBhMslyFzR5XXPzMLdzzx7oeSKTbgR5PM8qp64/ohaibtm/public/values?alt=json'])

        backend = backend_sections[23]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'nntp')
        self.assertEqual(repos, ['news.mozilla.org mozilla.dev.project-link'])

        backend = backend_sections[24]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'phabricator')
        self.assertEqual(repos, ['https://phabricator.wikimedia.org'])

        backend = backend_sections[25]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'pipermail')
        self.assertEqual(repos, ['https://mail.gnome.org/archives/libart-hackers/'])

        backend = backend_sections[26]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'puppetforge')
        self.assertEqual(repos, [''])

        backend = backend_sections[27]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'redmine')
        self.assertEqual(repos, ['http://tracker.ceph.com/'])

        backend = backend_sections[28]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'remo')
        self.assertEqual(repos, ['https://reps.mozilla.org'])

        backend = backend_sections[29]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'remo:activities')
        self.assertEqual(repos, ['https://reps.mozilla.org'])

        backend = backend_sections[30]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'rss')
        self.assertEqual(repos, ['https://blog.bitergia.com/feed/'])

        backend = backend_sections[31]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'slack')
        self.assertEqual(repos, ['C7LSGB0AU'])

        backend = backend_sections[32]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'stackexchange')

        repos.sort()
        expected_list = [
            "https://stackoverflow.com/questions/tagged/ovirt",
            "https://stackoverflow.com/questions/tagged/rdo",
            "https://stackoverflow.com/questions/tagged/kibana"
        ]
        expected_list.sort()
        self.assertEqual(repos, expected_list)

        backend = backend_sections[33]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'supybot')
        self.assertEqual(repos,
                         ['openshift ~/.perceval/irc/percevalbot/logs/ChannelLogger/freenode/#openshift/'])

        backend = backend_sections[34]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'telegram')
        self.assertEqual(repos, ['Mozilla_analytics'])

        backend = backend_sections[35]
        repos = task.get_repos_by_backend_section(backend)
        self.assertEqual(backend, 'twitter')
        self.assertEqual(repos, ['bitergia'])
    def execute(self):

        errors = []
        cfg = self.config.get_conf()

        if 'scroll_size' in cfg['general']:
            ElasticItems.scroll_size = cfg['general']['scroll_size']

        if 'bulk_size' in cfg['general']:
            ElasticSearch.max_items_bulk = cfg['general']['bulk_size']

        if ('collect' in cfg[self.backend_section]
                and not cfg[self.backend_section]['collect']):
            logging.info('%s collect disabled', self.backend_section)
            return errors

        t2 = time.time()
        logger.info('[%s] collection phase starts', self.backend_section)
        print("Collection for {}: starting...".format(self.backend_section))
        clean = False

        fetch_archive = False
        if ('fetch-archive' in cfg[self.backend_section]
                and cfg[self.backend_section]['fetch-archive']):
            fetch_archive = True

        # repos could change between executions because of changes in projects
        repos = TaskProjects.get_repos_by_backend_section(self.backend_section)

        if not repos:
            logger.warning("No collect repositories for %s",
                           self.backend_section)

        for repo in repos:
            repo, repo_labels = self._extract_repo_labels(
                self.backend_section, repo)
            p2o_args = self._compose_p2o_params(self.backend_section, repo)
            filter_raw = p2o_args.get('filter-raw')

            if filter_raw:
                # If filter-raw exists it means that there is an equivalent URL
                # in the `unknown` section of the projects.json. Thus the URL with
                # filter-raw is ignored in the collection phase, while the URL
                # in `unknown` is considered in this phase.
                logging.warning("Not collecting filter raw repository: %s",
                                repo)
                continue

            url = p2o_args['url']
            backend_args = self._compose_perceval_params(
                self.backend_section, repo)
            logger.debug(backend_args)
            logger.info('[%s] collection starts for %s', self.backend_section,
                        repo)
            es_col_url = self._get_collection_url()
            ds = self.backend_section
            backend = self.get_backend(self.backend_section)
            project = None  # just used for github in cauldron

            es_aliases = self.select_aliases(cfg, self.backend_section)

            try:
                error_msg = feed_backend(es_col_url,
                                         clean,
                                         fetch_archive,
                                         backend,
                                         backend_args,
                                         cfg[ds]['raw_index'],
                                         cfg[ds]['enriched_index'],
                                         project,
                                         es_aliases=es_aliases,
                                         projects_json_repo=repo,
                                         repo_labels=repo_labels)
                error = {'backend': backend, 'repo': repo, 'error': error_msg}

                errors.append(error)
            except Exception:
                logger.error(
                    "Something went wrong collecting data from this %s repo: %s. "
                    "Using the backend_args: %s",
                    ds, url, str(backend_args))
                traceback.print_exc()
                raise DataCollectionError('Failed to collect data from %s' %
                                          url)
            logger.info('[%s] collection finished for %s',
                        self.backend_section, repo)

        t3 = time.time()
        spent_time = time.strftime("%H:%M:%S", time.gmtime(t3 - t2))
        logger.info('[%s] collection phase finished in %s',
                    self.backend_section, spent_time)
        print("Collection for {}: finished after {} hours".format(
            self.backend_section, spent_time))

        self.retain_data(cfg['general']['retention_time'],
                         self.conf['es_collection']['url'],
                         self.conf[self.backend_section]['raw_index'])

        return errors
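
This variant collects per-repo error dicts instead of stopping at the first soft failure. A toy consumer of the returned list (the shape matches the error dict built above; the sample data is made up):

errors = [
    {'backend': 'git', 'repo': 'https://repo1.git', 'error': None},
    {'backend': 'git', 'repo': 'https://repo2.git', 'error': 'clone failed'},
]

for e in (e for e in errors if e['error']):
    print("collection failed for {repo} ({backend}): {error}".format(**e))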
    def execute(self):
        def check_arthur_task(repo, backend_args):
            """ Check if a task exists in arthur and if not, create it """
            arthur_repo_json = self.__create_arthur_json(repo, backend_args)
            logger.debug('JSON config for arthur %s',
                         json.dumps(arthur_repo_json, indent=True))

            # First check if the task already exists
            try:
                r = requests.post(self.arthur_url + "/tasks")
            except requests.exceptions.ConnectionError as ex:
                logging.error("Cannot connect to %s", self.arthur_url)
                raise RuntimeError("Cannot connect to " + self.arthur_url) from ex

            task_ids = [task['task_id'] for task in r.json()['tasks']]
            new_task_ids = [
                task['task_id'] for task in arthur_repo_json['tasks']
            ]
            # TODO: if a task already exists, maybe we should delete and re-add it
            already_tasks = list(set(task_ids).intersection(set(new_task_ids)))
            if len(already_tasks) > 0:
                logger.warning(
                    "Tasks not added to arthur because there are already existing tasks %s",
                    already_tasks)
            else:
                r = requests.post(self.arthur_url + "/add",
                                  json=arthur_repo_json)
                r.raise_for_status()
                logger.info('[%s] collection configured in arthur for %s',
                            self.backend_section, repo)

        def collect_arthur_items(repo):
            aitems = self.__feed_backend_arthur(repo)
            if not aitems:
                return
            connector = get_connector_from_name(self.backend_section)
            klass = connector[1]  # Ocean backend for the connector
            ocean_backend = klass(None)
            es_col_url = self._get_collection_url()
            es_index = self.conf[self.backend_section]['raw_index']
            clean = False
            elastic_ocean = get_elastic(es_col_url, es_index, clean,
                                        ocean_backend)
            ocean_backend.set_elastic(elastic_ocean)
            ocean_backend.feed(arthur_items=aitems)

        cfg = self.config.get_conf()

        if ('collect' in cfg[self.backend_section]
                and not cfg[self.backend_section]['collect']):
            logging.info('%s collect disabled', self.backend_section)
            return

        if 'scroll_size' in cfg['general']:
            ElasticItems.scroll_size = cfg['general']['scroll_size']

        if 'bulk_size' in cfg['general']:
            ElasticSearch.max_items_bulk = cfg['general']['bulk_size']

        logger.info('Programming arthur for [%s] raw data collection',
                    self.backend_section)
        clean = False

        fetch_archive = False
        if ('fetch-archive' in self.conf[self.backend_section]
                and self.conf[self.backend_section]['fetch-archive']):
            fetch_archive = True

        # repos could change between executions because of changes in projects
        repos = TaskProjects.get_repos_by_backend_section(self.backend_section)

        if not repos:
            logger.warning("No collect repositories for %s",
                           self.backend_section)

        for repo in repos:
            # If the repo already exists, don't try to add it to arthur
            tag = self.backend_tag(repo)
            if tag not in self.arthur_items:
                self.arthur_items[tag] = []
                repo, repo_labels = self._extract_repo_labels(
                    self.backend_section, repo)
                p2o_args = self._compose_p2o_params(self.backend_section, repo)
                filter_raw = p2o_args.get('filter-raw')
                if filter_raw:
                    # If filter-raw exists it means that there is an equivalent URL
                    # in the `unknown` section of the projects.json. Thus the URL with
                    # filter-raw is ignored in the collection phase, while the URL
                    # in `unknown` is considered in this phase.
                    logging.warning("Not collecting filter raw repository: %s",
                                    repo)
                    continue
                backend_args = self._compose_perceval_params(
                    self.backend_section, repo)
                logger.debug(backend_args)

                check_arthur_task(repo, backend_args)

            collect_arthur_items(repo)
Example #29
    def __enrich_items(self):

        time_start = time.time()

        # logger.info('%s starts for %s ', 'enrichment', self.backend_section)
        logger.info('[%s] enrichment starts', self.backend_section)
        print("Enrichment for {}: starting...".format(self.backend_section))

        cfg = self.config.get_conf()

        if 'scroll_size' in cfg['general']:
            ElasticItems.scroll_size = cfg['general']['scroll_size']

        if 'bulk_size' in cfg['general']:
            ElasticSearch.max_items_bulk = cfg['general']['bulk_size']

        no_incremental = False
        github_token = None
        pair_programming = False
        if 'github' in cfg and 'backend_token' in cfg['github']:
            github_token = cfg['github']['backend_token']
        if 'git' in cfg and 'pair-programming' in cfg['git']:
            pair_programming = cfg['git']['pair-programming']
        only_studies = False
        only_identities = False

        # repos could change between executions because of changes in projects
        repos = TaskProjects.get_repos_by_backend_section(self.backend_section)

        if not repos:
            logger.warning("No enrich repositories for %s",
                           self.backend_section)

        for repo in repos:
            # First, process the p2o params from the repo
            p2o_args = self._compose_p2o_params(self.backend_section, repo)
            filter_raw = p2o_args.get('filter-raw')
            filters_raw_prefix = p2o_args.get('filters-raw-prefix')
            jenkins_rename_file = p2o_args.get('jenkins-rename-file')
            url = p2o_args['url']
            # Second, process the perceval params from the repo
            backend_args = self._compose_perceval_params(
                self.backend_section, url)
            studies_args = None

            if 'studies' in self.conf[self.backend_section] and \
                    self.conf[self.backend_section]['studies']:
                studies_args = self.__load_studies()

            try:
                es_col_url = self._get_collection_url()
                logger.debug('[%s] enrichment starts for %s',
                             self.backend_section, repo)
                backend = self.get_backend(self.backend_section)
                enrich_backend(
                    es_col_url,
                    self.clean,
                    backend,
                    backend_args,
                    cfg[self.backend_section]['raw_index'],
                    cfg[self.backend_section]['enriched_index'],
                    None,  # projects_db is deprecated
                    cfg['projects']['projects_file'],
                    cfg['sortinghat']['database'],
                    no_incremental,
                    only_identities,
                    github_token,
                    False,  # studies are executed in their own Task
                    only_studies,
                    cfg['es_enrichment']['url'],
                    None,  # args.events_enrich
                    cfg['sortinghat']['user'],
                    cfg['sortinghat']['password'],
                    cfg['sortinghat']['host'],
                    None,  # args.refresh_projects,
                    None,  # args.refresh_identities,
                    author_id=None,
                    author_uuid=None,
                    filter_raw=filter_raw,
                    filters_raw_prefix=filters_raw_prefix,
                    jenkins_rename_file=jenkins_rename_file,
                    unaffiliated_group=cfg['sortinghat']['unaffiliated_group'],
                    pair_programming=pair_programming,
                    studies_args=studies_args)
            except Exception as ex:
                logger.error(
                    "Something went wrong producing enriched data for %s . "
                    "Using the backend_args: %s ", self.backend_section,
                    str(backend_args))
                logger.error("Exception: %s", ex)
                raise DataEnrichmentError(
                    'Failed to produce enriched data for ' +
                    self.backend_section)

            # Let's try to create the aliases for the enriched index
            if not self.enrich_aliases:
                logger.debug("Creating aliases after enrich")
                task_aliases = TaskPanelsAliases(self.config)
                task_aliases.set_backend_section(self.backend_section)
                task_aliases.execute()
                logger.debug("Done creating aliases after enrich")
                self.enrich_aliases = True

        spent_time = time.strftime("%H:%M:%S",
                                   time.gmtime(time.time() - time_start))
        logger.info('[%s] enrichment finished in %s', self.backend_section,
                    spent_time)
        print("Enrichment for {}: finished after {} hours".format(
            self.backend_section, spent_time))
Example #30
    def __enrich_items(self):

        time_start = datetime.now()

        logger.info('[%s] enrichment phase starts', self.backend_section)

        cfg = self.config.get_conf()

        if 'scroll_size' in cfg['general']:
            ElasticItems.scroll_size = cfg['general']['scroll_size']

        if 'bulk_size' in cfg['general']:
            ElasticSearch.max_items_bulk = cfg['general']['bulk_size']

        no_incremental = False
        # not used due to https://github.com/chaoss/grimoirelab-elk/pull/773
        github_token = None
        pair_programming = False
        node_regex = None
        if 'git' in cfg and 'pair-programming' in cfg['git']:
            pair_programming = cfg['git']['pair-programming']
        if 'jenkins' in cfg and 'node_regex' in cfg['jenkins']:
            node_regex = cfg['jenkins']['node_regex']
        only_studies = False
        only_identities = False

        # repos could change between executions because of changes in projects
        repos = TaskProjects.get_repos_by_backend_section(self.backend_section,
                                                          raw=False)

        if not repos:
            logger.warning("No enrich repositories for %s",
                           self.backend_section)

        # Get the metadata__timestamp value of the last item inserted in the
        # enriched index before looping over the repos whose data is stored in
        # the same index. This is needed to make sure that incremental
        # enrichment works for data sources that are collected globally but
        # only partially enriched.
        elastic_enrich = get_elastic(
            cfg['es_enrichment']['url'],
            cfg[self.backend_section]['enriched_index'])
        last_enrich_date = elastic_enrich.get_last_item_field(
            "metadata__timestamp")
        if last_enrich_date:
            last_enrich_date = last_enrich_date.replace(tzinfo=None)

        for repo in repos:
            repo, repo_labels = self._extract_repo_labels(
                self.backend_section, repo)
            p2o_args = self._compose_p2o_params(self.backend_section, repo)
            filter_raw = p2o_args.get('filter-raw')
            jenkins_rename_file = p2o_args.get('jenkins-rename-file')
            url = p2o_args['url']
            # Then process the perceval params from the repo
            backend_args = self._compose_perceval_params(
                self.backend_section, url)
            studies_args = None

            backend = self.get_backend(self.backend_section)
            if 'studies' in self.conf[self.backend_section] and \
                    self.conf[self.backend_section]['studies']:
                studies_args = self.__load_studies()

            logger.info('[%s] enrichment starts for %s', self.backend_section,
                        self.anonymize_url(repo))
            es_enrich_aliases = self.select_aliases(cfg, self.backend_section)

            try:
                es_col_url = self._get_collection_url()
                enrich_backend(
                    es_col_url,
                    self.clean,
                    backend,
                    backend_args,
                    self.backend_section,
                    cfg[self.backend_section]['raw_index'],
                    cfg[self.backend_section]['enriched_index'],
                    None,  # projects_db is deprecated
                    cfg['projects']['projects_file'],
                    self.db_sh,
                    no_incremental,
                    only_identities,
                    github_token,
                    False,  # studies are executed in their own Task
                    only_studies,
                    cfg['es_enrichment']['url'],
                    None,  # args.events_enrich
                    self.db_user,
                    self.db_password,
                    self.db_host,
                    None,  # args.refresh_projects,
                    None,  # args.refresh_identities,
                    author_id=None,
                    author_uuid=None,
                    filter_raw=filter_raw,
                    jenkins_rename_file=jenkins_rename_file,
                    unaffiliated_group=self.db_unaffiliate_group,
                    pair_programming=pair_programming,
                    node_regex=node_regex,
                    studies_args=studies_args,
                    es_enrich_aliases=es_enrich_aliases,
                    last_enrich_date=last_enrich_date,
                    projects_json_repo=repo,
                    repo_labels=repo_labels)
            except Exception as ex:
                logger.error(
                    "Something went wrong producing enriched data for %s . "
                    "Using the backend_args: %s ", self.backend_section,
                    str(backend_args))
                logger.error("Exception: %s", ex)
                raise DataEnrichmentError(
                    'Failed to produce enriched data for ' +
                    self.backend_section)

            logger.info('[%s] enrichment finished for %s',
                        self.backend_section, self.anonymize_url(repo))

        spent_time = str(datetime.now() - time_start).split('.')[0]
        logger.info('[%s] enrichment phase finished in %s',
                    self.backend_section, spent_time)
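
A toy illustration (pure Python, no Elasticsearch involved) of the incremental guard described in the comment above: only items with a metadata__timestamp newer than the last enriched one would be processed. The timestamps are made up:

from datetime import datetime

last_enrich_date = datetime(2024, 1, 10)  # illustrative last enriched timestamp

items = [
    {'id': 'old', 'metadata__timestamp': datetime(2024, 1, 5)},
    {'id': 'new', 'metadata__timestamp': datetime(2024, 1, 15)},
]

to_enrich = [i for i in items if i['metadata__timestamp'] > last_enrich_date]
print([i['id'] for i in to_enrich])  # ['new']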