Example 1
    def _compose_arthur_params(self, backend_section, repo):
        """Build the backend params dictionary that arthur expects.

        :param backend_section: name of the backend section in the config
        :param repo: repo url as it appears in projects.json
        :returns: dict of params for the arthur backend
        """
        backend = self.get_backend(backend_section)
        ocean = get_connector_from_name(backend)[1]

        # Start with the backend-specific params derived from the URL
        params = dict(ocean.get_arthur_params_from_url(repo))

        # Merge in the backend params declared in the config file
        section = self.conf[backend_section]
        for option in section:
            if option in self.NO_BACKEND_FIELDS:
                # These params are not for the perceval backend
                continue
            value = section[option]
            if not value:
                continue
            # Command line - in param is converted to _ in python variable
            key = option.replace("-", "_")
            if option in self.PARAMS_WITH_SPACES:
                # '--blacklist-jobs', 'a', 'b', 'c'
                # 'a', 'b', 'c' must be added as items in the list
                params[key] = value.split()
            else:
                params[key] = value

        return params
Example 2
    def _compose_perceval_params(self, backend_section, repo):
        """Build the perceval command line params for a repository.

        :param backend_section: name of the backend section in the config
        :param repo: repo url as it appears in projects.json
        :returns: list of perceval command line arguments
        """
        backend = self.get_backend(backend_section)
        connector = get_connector_from_name(backend)
        ocean = connector[1]

        # First add the params from the URL, which is backend specific
        params = ocean.get_perceval_params_from_url(repo)

        # Now add the backend params included in the config file
        for p in self.conf[backend_section]:
            if p in self.NO_BACKEND_FIELDS:
                # These params are not for the perceval backend
                continue

            section_param = self.conf[backend_section][p]
            if not section_param:
                logger.warning("Empty section %s", p)
                continue

            # isinstance() instead of type() ==; bool must be tested before
            # list because only one branch may add the flag without a value
            if isinstance(section_param, bool):
                # Boolean params are bare flags with no value; section_param
                # is always truthy here because falsy values were skipped above
                params.append("--" + p)
            elif isinstance(section_param, list):
                # '--blacklist-jobs', 'a', 'b', 'c'
                # 'a', 'b', 'c' must be added as items in the list
                params.append("--" + p)
                params.extend(section_param)
            else:
                params.append("--" + p)
                params.append(str(section_param))

        return params
Example 3
    def _compose_perceval_params(self, backend_section, repo):
        """Build the perceval command line params for a repository.

        NOTE(review): the '--<param>' flag is appended even when the config
        value is empty — this matches the original behavior; confirm whether
        a bare flag with no value is intended for empty options.

        :param backend_section: name of the backend section in the config
        :param repo: repo url as it appears in projects.json
        :returns: list of perceval command line arguments
        """
        backend = self.get_backend(backend_section)
        connector = get_connector_from_name(backend)
        ocean = connector[1]

        # First add the params from the URL, which is backend specific
        params = ocean.get_perceval_params_from_url(repo)

        # Now add the backend params included in the config file
        section = self.conf[backend_section]
        for p in section:
            if p in self.ES_INDEX_FIELDS:
                # These params are not for the perceval backend
                continue
            params.append("--" + p)
            value = section[p]
            if not value:
                continue
            # If param is boolean, no value must be added after the flag
            if isinstance(value, bool):
                continue
            if isinstance(value, list):
                # '--blacklist-jobs', 'a', 'b', 'c'
                # 'a', 'b', 'c' must be added as items in the list
                params += value
            else:
                params.append(value)
        return params
Example 4
    def _get_enrich_backend(self):
        """Create the enrich backend for the current backend section,
        wire it to its ElasticSearch index and apply the optional
        GitHub token and unaffiliated group settings.
        """
        clean = False
        db_projects_map = None
        json_projects_map = None

        connector = get_connector_from_name(
            self.get_backend(self.backend_section))

        projects_conf = self.conf['projects']
        if 'projects_file' in projects_conf:
            json_projects_map = projects_conf['projects_file']

        enrich_klass = connector[2]
        enrich_backend = enrich_klass(self.db_sh, db_projects_map,
                                      json_projects_map, self.db_user,
                                      self.db_password, self.db_host)
        enrich_backend.set_elastic(get_elastic(
            self.conf['es_enrichment']['url'],
            self.conf[self.backend_section]['enriched_index'],
            clean, enrich_backend))

        # Only the git backend gets the GitHub token, when one is configured
        if 'github' in self.conf.keys() and \
            'backend_token' in self.conf['github'].keys() and \
            self.get_backend(self.backend_section) == "git":

            enrich_backend.set_github_token(
                self.conf['github']['backend_token'])

        if 'unaffiliated_group' in self.conf['sortinghat']:
            enrich_backend.unaffiliated_group = \
                self.conf['sortinghat']['unaffiliated_group']

        return enrich_backend
Example 5
    def compose_perceval_params(self, backend_name, repo):
        """Build the perceval command line params for a repository.

        :param backend_name: name of the backend section in the config
        :param repo: repo url
        :returns: list of perceval command line arguments
        """
        # Params that are lists separated by white space
        list_params_spaces = ['blacklist-jobs']

        # Resolve the connector from the backend_name argument, consistent
        # with the conf lookups below (previously this read
        # self.backend_name, silently ignoring the parameter)
        connector = get_connector_from_name(backend_name)
        ocean = connector[1]

        # First add the params from the URL, which is backend specific
        params = ocean.get_perceval_params_from_url(repo)

        # Now add the backend params included in the config file
        for p in self.conf[backend_name]:
            if p in self.ES_INDEX_FIELDS:
                # These params are not for the perceval backend
                continue
            params.append("--" + p)
            value = self.conf[backend_name][p]
            if value:
                # If param is boolean, no value must be added after the flag
                if not isinstance(value, bool):
                    if p in list_params_spaces:
                        # '--blacklist-jobs', 'a', 'b', 'c'
                        # 'a', 'b', 'c' must be added as items in the list
                        params += value.split()
                    else:
                        params.append(value)
        return params
Example 6
    def compose_p2o_params(self, backend_name, repo):
        """Get the p2o params for a repo included in the projects list.

        :param backend_name: name of the backend
        :param repo: repo url
        :returns: dict of p2o params derived from the URL (backend specific)
        """
        # Resolve the connector from the backend_name argument (previously
        # this read self.backend_name, silently ignoring the parameter).
        # The dead `params = {}` initialization was removed: the dict was
        # immediately overwritten by the URL-derived params.
        connector = get_connector_from_name(backend_name)
        ocean = connector[1]

        return ocean.get_p2o_params_from_url(repo)
Example 7
    def _extract_repo_labels(self, backend_section, repo):
        """Extract the labels declared in the repositories within the projects.json, and remove them to
        avoid breaking already existing functionalities.

        :param backend_section: name of the backend section
        :param repo: repo url in projects.json
        """
        # The ocean backend (connector[1]) knows how to parse the repo entry
        ocean = get_connector_from_name(self.get_backend(backend_section))[1]
        # Returns (processed_repo, labels_lst)
        return ocean.extract_repo_labels(repo)
Example 8
 # NOTE(review): `self` is not a parameter here — this function presumably
 # runs as a closure inside a method where `self` is in scope; confirm.
 def collect_arthur_items(repo):
     # Feed items already collected by arthur for this repo; nothing to do
     # when there are none
     aitems = self.__feed_backend_arthur(repo)
     if not aitems:
         return
     connector = get_connector_from_name(self.backend_section)
     klass = connector[1]  # Ocean backend for the connector
     ocean_backend = klass(None)
     es_col_url = self._get_collection_url()
     es_index = self.conf[self.backend_section]['raw_index']
     clean = False  # passed to get_elastic as its 'clean' flag
     elastic_ocean = get_elastic(es_col_url, es_index, clean, ocean_backend)
     ocean_backend.set_elastic(elastic_ocean)
     # Write the arthur items into the raw index through the ocean backend
     ocean_backend.feed(arthur_items=aitems)
Example 9
    def _get_enrich_backend(self):
        """Build the enrich backend for the current backend section and
        attach its ElasticSearch connection and unaffiliated group."""
        clean = False
        json_projects_map = None
        db_projects_map = None

        connector = get_connector_from_name(self.get_backend(self.backend_section))

        if 'projects_file' in self.conf['projects']:
            json_projects_map = self.conf['projects']['projects_file']

        enrich_klass = connector[2]
        enrich_backend = enrich_klass(self.db_sh, db_projects_map, json_projects_map,
                                      self.db_user, self.db_password, self.db_host)
        enrich_backend.set_elastic(
            get_elastic(self.conf['es_enrichment']['url'],
                        self.conf[self.backend_section]['enriched_index'],
                        clean, enrich_backend))

        if self.db_unaffiliate_group:
            enrich_backend.unaffiliated_group = self.db_unaffiliate_group

        return enrich_backend
Example 10
    def get_enrich_backend(self):
        """Create the enrich backend, wire its ElasticSearch connection and,
        for the git backend with a configured GitHub token, set that token."""
        clean = False
        db_projects_map = None
        json_projects_map = None

        connector = get_connector_from_name(self.backend_name)

        enrich_backend = connector[2](self.db_sh, db_projects_map, json_projects_map,
                                      self.db_user, self.db_password, self.db_host)
        elastic_enrich = get_elastic(self.conf['es_enrichment'],
                                     self.conf[self.backend_name]['enriched_index'],
                                     clean, enrich_backend)
        enrich_backend.set_elastic(elastic_enrich)

        # Short-circuit: only look up the token when the github section exists
        has_gh_token = ('github' in self.conf.keys()
                        and 'backend_token' in self.conf['github'].keys())
        if has_gh_token and self.backend_name == "git":
            enrich_backend.set_github_token(self.conf['github']['backend_token'])

        return enrich_backend
    def __create_arthur_json(self, repo, backend_args):
        """ Create the JSON for configuring arthur to collect data

        https://github.com/grimoirelab/arthur#adding-tasks
        Sample for git:

        {
        "tasks": [
            {
                "task_id": "arthur.git",
                "backend": "git",
                "backend_args": {
                    "gitpath": "/tmp/arthur_git/",
                    "uri": "https://github.com/grimoirelab/arthur.git"
                },
                "category": "commit",
                "archive_args": {
                    "archive_path": '/tmp/test_archives',
                    "fetch_from_archive": false,
                    "archive_after": None
                },
                "scheduler_args": {
                    "delay": 10
                }
            }
        ]
        }

        :param repo: repo url to collect
        :param backend_args: perceval backend args. NOTE(review): this
            argument is ignored — it is immediately recomputed below with
            _compose_arthur_params; kept for backward compatibility.
        :returns: dict with the arthur tasks configuration
        """

        backend_args = self._compose_arthur_params(self.backend_section, repo)
        if self.backend_section == 'git':
            backend_args['gitpath'] = os.path.join(self.REPOSITORY_DIR, repo)
        backend_args['tag'] = self.backend_tag(repo)

        ajson = {"tasks": [{}]}
        # This is the perceval tag
        ajson["tasks"][0]['task_id'] = self.backend_tag(repo)
        ajson["tasks"][0]['backend'] = self.backend_section.split(":")[0]
        ajson["tasks"][0]['backend_args'] = backend_args
        ajson["tasks"][0]['category'] = backend_args['category']
        ajson["tasks"][0]['archive'] = {}
        ajson["tasks"][0]['scheduler'] = {"delay": self.ARTHUR_TASK_DELAY}
        # from-date or offset param must be added
        es_col_url = self._get_collection_url()
        es_index = self.conf[self.backend_section]['raw_index']
        # Get the last activity for the data source
        es = ElasticSearch(es_col_url, es_index)
        connector = get_connector_from_name(self.backend_section)

        klass = connector[0]  # Backend for the connector
        # Inspect the perceval fetch signature to know whether this backend
        # is incremental by date (from_date) or by offset
        signature = inspect.signature(klass.fetch)

        last_activity = None
        filter_ = {"name": "tag", "value": backend_args['tag']}
        if 'from_date' in signature.parameters:
            last_activity = es.get_last_item_field('metadata__updated_on',
                                                   [filter_])
            if last_activity:
                ajson["tasks"][0]['backend_args'][
                    'from_date'] = last_activity.isoformat()
        elif 'offset' in signature.parameters:
            last_activity = es.get_last_item_field('offset', [filter_])
            if last_activity:
                ajson["tasks"][0]['backend_args']['offset'] = last_activity

        if last_activity:
            # Use the module logger for consistency with the rest of the file
            logger.info("Getting raw item with arthur since %s",
                        last_activity)

        return ajson
Example 12
if __name__ == '__main__':

    app_init = datetime.now()

    args = get_params()

    config_logging(args.debug)

    if args.index is None:
        # Extract identities from all indexes
        pass
    else:
        # Lazy %-args instead of eager string formatting in the logging call
        logging.info("Extracting identities from: %s", args.index)
        perceval_params = get_perceval_params(args.elastic_url, args.index)
        backend_name = perceval_params['backend']
        connector = get_connector_from_name(backend_name)
        perceval_backend_class = connector[0]
        ocean_backend_class = connector[1]

        # Removed the dead `perceval_backend = None` assignment: it was
        # immediately overwritten by the line below
        perceval_backend = perceval_backend_class(**perceval_params)

        obackend = ocean_backend_class(perceval_backend, incremental=False)
        obackend.set_elastic(get_elastic(args.elastic_url, args.index))

        # Add the identities to Sorting Hat
        identities = get_identities(obackend)
        SortingHat.add_identities(identities, backend_name)

        print("Total identities processed: %i" % len(identities))
Example 13
if __name__ == '__main__':

    app_init = datetime.now()

    args = get_params()

    config_logging(args.debug)

    if args.index is None:
        # Extract identities from all indexes
        pass
    else:
        # Lazy %-args instead of eager string formatting in the logging call
        logging.info("Extracting identities from: %s", args.index)
        perceval_params = get_perceval_params(args.elastic_url, args.index)
        backend_name = perceval_params['backend']
        connector = get_connector_from_name(backend_name)
        perceval_backend_class = connector[0]
        ocean_backend_class = connector[1]

        # Removed the dead `perceval_backend = None` assignment: it was
        # immediately overwritten by the line below
        perceval_backend = perceval_backend_class(**perceval_params)

        obackend = ocean_backend_class(perceval_backend, incremental=False)
        obackend.set_elastic(get_elastic(args.elastic_url, args.index))

        # Add the identities to Sorting Hat
        identities = get_identities(obackend)
        SortingHat.add_identities(identities, backend_name)

        print("Total identities processed: %i" % len(identities))
Example 14
    def execute(self):
        """Program arthur to collect raw data for the repositories of the
        current backend section, then feed any items already collected by
        arthur (via Redis) into the raw ElasticSearch index.

        Uses `logger` consistently (the original mixed `logging.*` and
        `logger.*` calls) and drops the unused locals `clean` (reassigned
        before use), `fetch_cache` and `url` (never read).
        """
        cfg = self.config.get_conf()

        if ('collect' in cfg[self.backend_section]
                and not cfg[self.backend_section]['collect']):
            logger.info('%s collect disabled', self.backend_section)
            return

        logger.info('Programming arthur for [%s] raw data collection',
                    self.backend_section)

        # repos could change between executions because changes in projects
        repos = TaskProjects.get_repos_by_backend_section(self.backend_section)

        if not repos:
            logger.warning("No collect repositories for %s",
                           self.backend_section)

        for repo in repos:
            p2o_args = self._compose_p2o_params(self.backend_section, repo)
            filter_raw = p2o_args.get('filter-raw')
            if filter_raw:
                # If filter-raw exists the goal is to enrich already collected
                # data, so don't collect anything
                logger.warning("Not collecting filter raw repository: %s",
                               repo)
                continue
            backend_args = self._compose_perceval_params(
                self.backend_section, repo)
            logger.debug(backend_args)
            arthur_repo_json = self.__create_arthur_json(repo, backend_args)
            logger.debug('JSON config for arthur %s',
                         json.dumps(arthur_repo_json, indent=True))

            # First check is the task already exists
            try:
                r = requests.post(self.arthur_url + "/tasks")
            except requests.exceptions.ConnectionError:
                logger.error("Can not connect to %s", self.arthur_url)
                return
                # raise RuntimeError("Can not connect to " + self.arthur_url)

            task_ids = [task['task_id'] for task in r.json()['tasks']]
            new_task_ids = [
                task['task_id'] for task in arthur_repo_json['tasks']
            ]
            # TODO: if a tasks already exists maybe we should delete and readd it
            already_tasks = list(set(task_ids).intersection(set(new_task_ids)))
            if already_tasks:
                logger.warning(
                    "Tasks not added to arthur because there are already existing tasks %s",
                    already_tasks)
            else:
                r = requests.post(self.arthur_url + "/add",
                                  json=arthur_repo_json)
                # Fail loudly if arthur rejects the task
                r.raise_for_status()
                logger.info('[%s] collection configured in arthur for %s',
                            self.backend_section, repo)

            # Try to collect existing items from REDIS
            aitems = self.__feed_backend_arthur(repo)
            connector = get_connector_from_name(self.backend_section)
            klass = connector[1]  # Ocean backend for the connector
            ocean_backend = klass(None)
            es_col_url = self._get_collection_url()
            es_index = self.conf[self.backend_section]['raw_index']
            clean = False
            elastic_ocean = get_elastic(es_col_url, es_index, clean,
                                        ocean_backend)
            ocean_backend.set_elastic(elastic_ocean)
            ocean_backend.feed(arthur_items=aitems)