Example no. 1
0
    def run(self):
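        # Collect raw data for every repository of this backend and feed it
        # into the raw index, unless collection is disabled in the config.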
        cfg = self.conf

        if 'collect' in cfg[self.backend_name] and \
                not cfg[self.backend_name]['collect']:
            logger.info('%s collect disabled', self.backend_name)
            return

        t2 = time.time()
        logger.info('[%s] raw data collection starts', self.backend_name)
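        # clean=False keeps the existing raw index (incremental collection)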
        clean = False

        fetch_cache = False
        if 'fetch-cache' in cfg[self.backend_name] and \
                cfg[self.backend_name]['fetch-cache']:
            fetch_cache = True

        for repo in self.repos:
            p2o_args = self.compose_p2o_params(self.backend_name, repo)
            filter_raw = p2o_args.get('filter-raw')
            if filter_raw:
                # If filter-raw exists the goal is to enrich already collected
                # data, so don't collect anything
                logging.warning("Not collecting filter raw repository: %s",
                                repo)
                continue
            url = p2o_args['url']
            backend_args = self.compose_perceval_params(
                self.backend_name, repo)
            logger.debug(backend_args)
            logger.debug('[%s] collection starts for %s', self.backend_name,
                         repo)
            es_col_url = self._get_collection_url()
            ds = self.backend_name
            feed_backend(es_col_url, clean, fetch_cache, ds, backend_args,
                         cfg[ds]['raw_index'], cfg[ds]['enriched_index'], url)
        t3 = time.time()
        spent_time = time.strftime("%H:%M:%S", time.gmtime(t3 - t2))
        logger.info('[%s] Data collection finished in %s', self.backend_name,
                    spent_time)
Example no. 2
0
    url = args.elastic_url

    clean = args.no_incremental
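    # Fetching from the cache implies starting from scratch (clean load)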
    if args.fetch_cache:
        clean = True

    try:
        if args.backend:
            # Configure elastic bulk size and scrolling
            if args.bulk_size:
                ElasticSearch.max_items_bulk = args.bulk_size
            if args.scroll_size:
                ElasticItems.scroll_size = args.scroll_size
            if not args.enrich_only:
                feed_backend(url, clean, args.fetch_cache, args.backend,
                             args.backend_args, args.index, args.index_enrich,
                             args.project, args.arthur)

                # Wait for one second, to ensure bulk write reflects in searches
                # https://www.elastic.co/guide/en/elasticsearch/reference/6.1/docs-refresh.html
                # (there are better ways of doing this, but for now...)
                time.sleep(1)
                logging.info("Backend feed completed")

            if args.enrich or args.enrich_only:
                unaffiliated_group = None
                enrich_backend(
                    url, clean, args.backend, args.backend_args, args.index,
                    args.index_enrich, args.db_projects_map,
                    args.json_projects_map, args.db_sortinghat,
                    args.no_incremental, args.only_identities,
Example no. 3
    def execute(self):
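        # Collect raw data for every repository of this backend section and
        # feed it into its raw index, honoring the 'collect' and
        # 'fetch-archive' settings from the configuration.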
        cfg = self.config.get_conf()

        if 'scroll_size' in cfg['general']:
            ElasticItems.scroll_size = cfg['general']['scroll_size']

        if 'bulk_size' in cfg['general']:
            ElasticSearch.max_items_bulk = cfg['general']['bulk_size']

        if ('collect' in cfg[self.backend_section]
                and not cfg[self.backend_section]['collect']):
            logger.info('%s collect disabled', self.backend_section)
            return

        t2 = time.time()
        logger.info('[%s] raw data collection starts', self.backend_section)
        print("Collection for {}: starting...".format(self.backend_section))
        clean = False

        fetch_archive = False
        if ('fetch-archive' in cfg[self.backend_section]
                and cfg[self.backend_section]['fetch-archive']):
            fetch_archive = True

        # repos could change between executions because changes in projects
        repos = TaskProjects.get_repos_by_backend_section(self.backend_section)

        if not repos:
            logger.warning("No collect repositories for %s",
                           self.backend_section)

        for repo in repos:
            p2o_args = self._compose_p2o_params(self.backend_section, repo)
            filter_raw = p2o_args.get('filter-raw')

            if filter_raw:
                # If filter-raw exists the goal is to enrich already collected
                # data, so don't collect anything
                logging.warning("Not collecting filter raw repository: %s",
                                repo)
                continue

            url = p2o_args['url']
            backend_args = self._compose_perceval_params(
                self.backend_section, repo)
            logger.debug(backend_args)
            logger.debug('[%s] collection starts for %s', self.backend_section,
                         repo)
            es_col_url = self._get_collection_url()
            ds = self.backend_section
            backend = self.get_backend(self.backend_section)
            project = None  # just used for github in cauldron
            try:
                feed_backend(es_col_url, clean, fetch_archive, backend,
                             backend_args, cfg[ds]['raw_index'],
                             cfg[ds]['enriched_index'], project)
            except Exception:
                logger.error(
                    "Something went wrong collecting data from this %s repo: %s. "
                    "Using the backend_args: %s",
                    ds, url, str(backend_args))
                traceback.print_exc()
                raise DataCollectionError('Failed to collect data from %s' %
                                          url)

        t3 = time.time()

        spent_time = time.strftime("%H:%M:%S", time.gmtime(t3 - t2))
        logger.info('[%s] Data collection finished in %s',
                    self.backend_section, spent_time)
        print("Collection for {}: finished after {} hours".format(
            self.backend_section, spent_time))
Example no. 4
0
    url = args.elastic_url

    clean = args.no_incremental
    if args.fetch_cache:
        clean = True

    try:
        if args.backend:
            # Configure elastic bulk size and scrolling
            if args.bulk_size:
                ElasticSearch.max_items_bulk = args.bulk_size
            if args.scroll_size:
                ElasticItems.scroll_size = args.scroll_size
            if not args.enrich_only:
                feed_backend(url, clean, args.fetch_cache,
                             args.backend, args.backend_args,
                             args.index, args.index_enrich, args.project,
                             args.arthur)
                logging.info("Backed feed completed")

            if args.enrich or args.enrich_only:
                enrich_backend(url, clean, args.backend, args.backend_args,
                               args.index, args.index_enrich,
                               args.db_projects_map, args.json_projects_map,
                               args.db_sortinghat,
                               args.no_incremental, args.only_identities,
                               args.github_token,
                               args.studies, args.only_studies,
                               args.elastic_url_enrich, args.events_enrich,
                               args.db_user, args.db_password, args.db_host,
                               args.refresh_projects, args.refresh_identities,
                               args.author_id, args.author_uuid,