Example #1
 def test_enrich(self, sortinghat=False, projects=False):
     """Test enrich all sources"""
     config = configparser.ConfigParser()
     config.read(CONFIG_FILE)
     es_con = dict(config.items('ElasticSearch'))['url']
     logging.info("Enriching data in: %s", es_con)
     connectors = get_connectors()
     for con in sorted(connectors.keys()):
         perceval_backend = None
         ocean_index = "test_"+con
         enrich_index = "test_"+con+"_enrich"
         clean = False
         ocean_backend = connectors[con][1](perceval_backend)
         elastic_ocean = get_elastic(es_con, ocean_index, clean, ocean_backend)
         ocean_backend.set_elastic(elastic_ocean)
         clean = True
         if not sortinghat and not projects:
             enrich_backend = connectors[con][2]()
         elif sortinghat and not projects:
             enrich_backend = connectors[con][2](db_sortinghat=DB_SORTINGHAT)
         elif not sortinghat and projects:
             enrich_backend = connectors[con][2](db_projects_map=DB_PROJECTS)
         elastic_enrich = get_elastic(es_con, enrich_index, clean, enrich_backend)
         enrich_backend.set_elastic(elastic_enrich)
         enrich_count = self.__enrich_items(ocean_backend, enrich_backend)
         logging.info("Total items enriched %i ", enrich_count)
Example #2
 def test_enrich(self, sortinghat=False, projects=False):
     """Test enrich all sources"""
     config = configparser.ConfigParser()
     config.read(CONFIG_FILE)
     es_con = dict(config.items('ElasticSearch'))['url']
     logging.info("Enriching data in: %s", es_con)
     connectors = get_connectors()
     for con in sorted(connectors.keys()):
         perceval_backend = None
         ocean_index = "test_" + con
         enrich_index = "test_" + con + "_enrich"
         clean = False
         ocean_backend = connectors[con][1](perceval_backend)
         elastic_ocean = get_elastic(es_con, ocean_index, clean,
                                     ocean_backend)
         ocean_backend.set_elastic(elastic_ocean)
         clean = True
         if not sortinghat and not projects:
             enrich_backend = connectors[con][2](perceval_backend)
         elif sortinghat and not projects:
             enrich_backend = connectors[con][2](
                 perceval_backend, db_sortinghat=DB_SORTINGHAT)
         elif not sortinghat and projects:
             enrich_backend = connectors[con][2](
                 perceval_backend, db_projects_map=DB_PROJECTS)
         elastic_enrich = get_elastic(es_con, enrich_index, clean,
                                      enrich_backend)
         enrich_backend.set_elastic(elastic_enrich)
         enrich_count = self.__enrich_items(ocean_backend, enrich_backend)
         logging.info("Total items enriched %i ", enrich_count)
Example #3
def get_perceval_params(url, index):
    logging.info("Get perceval params for index: %s" % (index))
    elastic = get_elastic(url, ConfOcean.get_index())
    ConfOcean.set_elastic(elastic)

    r = requests.get(elastic.index_url+"/repos/"+index)

    params = r.json()['_source']['params']

    return params
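get_perceval_params assumes the conf index stores one document per repository under the repos type, with the backend parameters kept in _source.params. A response shaped roughly like this (all values illustrative) would satisfy r.json()['_source']['params']:

# Illustrative response body (values made up) for GET <index_url>/repos/<index>.
response = {
    "_index": "conf",
    "_type": "repos",
    "_id": "git",
    "_source": {
        "params": {"backend": "git"}
    }
}
params = response["_source"]["params"]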
Example #4
def feed_backends(url, clean, debug=False, redis=None):
    ''' Update Ocean for all existing backends '''

    logging.info("Updating all Ocean")
    elastic = get_elastic(url, ConfOcean.get_index(), clean)
    ConfOcean.set_elastic(elastic)
    fetch_cache = False

    q = Queue('update', connection=Redis(redis), is_async=async_)  # rq renamed 'async' to 'is_async' ('async' is reserved since Python 3.7)

    for repo in ConfOcean.get_repos():
        task_feed = q.enqueue(feed_backend, url, clean, fetch_cache,
                              repo['backend_name'], repo['backend_params'],
                              repo['index'], repo['index_enrich'], repo['project'])
        logging.info("Queued job")
        logging.info(task_feed)
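feed_backends only enqueues work: each repository becomes a job on the 'update' RQ queue, and nothing executes until a worker consumes that queue. A minimal consumer sketch (the Redis location is an assumption):

from redis import Redis
from rq import Worker

# Consume the same 'update' queue the enqueue calls above fill.
worker = Worker(['update'], connection=Redis())
worker.work()

The command-line equivalent is "rq worker update".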
Example #6
 def test_refresh_project(self):
     """Test refresh project field for all sources"""
     # self.test_enrich_sh() # Load the identities in ES
     config = configparser.ConfigParser()
     config.read(CONFIG_FILE)
     es_con = dict(config.items('ElasticSearch'))['url']
     logging.info("Refreshing data in: %s", es_con)
     connectors = get_connectors()
     for con in sorted(connectors.keys()):
         enrich_index = "test_"+con+"_enrich"
         enrich_backend = connectors[con][2](db_projects_map=DB_PROJECTS)
         clean = False
         elastic_enrich = get_elastic(es_con, enrich_index, clean, enrich_backend)
         enrich_backend.set_elastic(elastic_enrich)
         logging.info("Refreshing projects fields in enriched index %s", elastic_enrich.index_url)
         self.__refresh_projects(enrich_backend)
Example #7
 def test_data_load(self):
     """Test load all sources JSON data into ES"""
     config = configparser.ConfigParser()
     config.read(CONFIG_FILE)
     es_con = dict(config.items('ElasticSearch'))['url']
     logging.info("Loading data in: %s", es_con)
     connectors = get_connectors()
     for con in sorted(connectors.keys()):
         with open(os.path.join("data", con + ".json")) as f:
             items = json.load(f)
             es_index = "test_"+con
             clean = True
             perceval_backend = None
             ocean_backend = connectors[con][1](perceval_backend)
             elastic_ocean = get_elastic(es_con, es_index, clean, ocean_backend)
             ocean_backend.set_elastic(elastic_ocean)
             self.__data2es(items, ocean_backend)
Example #9
def enrich_backends(url, clean, debug=False, redis=None,
                    db_projects_map=None, db_sortinghat=None):
    ''' Enrich all existing indexes '''

    logging.info("Enriching repositories")

    elastic = get_elastic(url, ConfOcean.get_index(), clean)
    ConfOcean.set_elastic(elastic)
    fetch_cache = False

    q = Queue('update', connection=Redis(redis), is_async=async_)  # rq renamed 'async' to 'is_async' ('async' is reserved since Python 3.7)

    for repo in ConfOcean.get_repos():
        enrich_task = q.enqueue(enrich_backend,
                                url, clean,
                                repo['backend_name'], repo['backend_params'],
                                repo['index'], repo['index_enrich'], db_projects_map, db_sortinghat)
        logging.info("Queued job")
        logging.info(enrich_task)
Example #11
def feed_backend(url, clean, fetch_cache, backend_name, backend_params,
                 es_index=None, es_index_enrich=None, project=None):
    """ Feed Ocean with backend data """

    backend = None
    repo = {}    # repository data to be stored in conf
    repo['backend_name'] = backend_name
    repo['backend_params'] = backend_params

    if es_index:
        clean = False  # don't remove index, it could be shared

    if not get_connector_from_name(backend_name):
        raise RuntimeError("Unknown backend %s" % backend_name)
    connector = get_connector_from_name(backend_name)
    klass = connector[3]  # BackendCmd for the connector

    try:
        backend_cmd = klass(*backend_params)

        backend = backend_cmd.backend
        ocean_backend = connector[1](backend, fetch_cache=fetch_cache, project=project)

        logging.info("Feeding Ocean from %s (%s)", backend_name, backend.origin)

        if not es_index:
            es_index = backend_name + "_" + backend.origin
        elastic_ocean = get_elastic(url, es_index, clean, ocean_backend)

        ocean_backend.set_elastic(elastic_ocean)

        ConfOcean.set_elastic(elastic_ocean)

        repo['repo_update_start'] = datetime.now().isoformat()

        # perceval backends fetch params
        offset = None
        from_date = None
        category = None

        signature = inspect.signature(backend.fetch)

        if 'from_date' in signature.parameters:
            from_date = backend_cmd.from_date

        if 'offset' in signature.parameters:
            offset = backend_cmd.offset

        if 'category' in signature.parameters:
            category = backend_cmd.category

        # from_date param support
        if offset and category:
            ocean_backend.feed(from_offset=offset, category=category)
        elif offset:
            ocean_backend.feed(from_offset=offset)
        elif from_date and from_date.replace(tzinfo=None) != parser.parse("1970-01-01"):
            if category:
                ocean_backend.feed(backend_cmd.from_date, category=category)
            else:
                ocean_backend.feed(backend_cmd.from_date)
        elif category:
            ocean_backend.feed(category=category)
        else:
            ocean_backend.feed()

    except Exception as ex:
        if backend:
            logging.error("Error feeding ocean from %s (%s): %s" %
                          (backend_name, backend.origin, ex))
            # this print makes blackbird fail
            traceback.print_exc()
        else:
            logging.error("Error feeding ocean %s" % ex)

        repo['success'] = False
        repo['error'] = str(ex)
    else:
        repo['success'] = True

    repo['repo_update'] = datetime.now().isoformat()
    repo['index'] = es_index
    repo['index_enrich'] = es_index_enrich
    repo['project'] = project

    if es_index:
        unique_id = es_index+"_"+backend.origin
        ConfOcean.add_repo(unique_id, repo)
    else:
        logging.debug("Repository not added to Ocean because errors.")
        logging.debug(backend_params)

    logging.info("Done %s " % (backend_name))
Example #12
def enrich_backend(url, clean, backend_name, backend_params, ocean_index=None,
                   ocean_index_enrich=None,
                   db_projects_map=None, json_projects_map=None,
                   db_sortinghat=None,
                   no_incremental=False, only_identities=False,
                   github_token=None, studies=False, only_studies=False,
                   url_enrich=None, events_enrich=False):
    """ Enrich Ocean index """

    def enrich_items(items, enrich_backend, events=False):
        total = 0

        if not events:
            total = enrich_backend.enrich_items(items)
        else:
            total = enrich_backend.enrich_events(items)
        return total

    def enrich_sortinghat(ocean_backend, enrich_backend):
        # First we add all new identities to SH
        item_count = 0
        new_identities = []

        for item in ocean_backend:
            item_count += 1
            # Get identities from new items to be added to SortingHat
            identities = enrich_backend.get_identities(item)
            for identity in identities:
                if identity not in new_identities:
                    new_identities.append(identity)
            if item_count % 1000 == 0:
                logging.debug("Processed %i items identities (%i identities)" \
                               % (item_count, len(new_identities)))
        logging.debug("TOTAL ITEMS: %i" % (item_count))

        logging.info("Total new identities to be checked %i" % len(new_identities))

        merged_identities = SortingHat.add_identities(enrich_backend.sh_db,
                                                      new_identities, enrich_backend.get_connector_name())

        # Redo enrich for items with new merged identities
        renrich_items = []
        # For testing
        # merged_identities = ['7e0bcf6ff46848403eaffa29ef46109f386fa24b']
        for mid in merged_identities:
            renrich_items += get_items_from_uuid(mid, enrich_backend, ocean_backend)

        # Enrich items with merged identities
        enrich_count_merged = enrich_items(renrich_items, enrich_backend)
        return enrich_count_merged

    def do_studies(enrich_backend, last_enrich):
        try:
            for study in enrich_backend.studies:
                logging.info("Starting study: %s", study)
                study(from_date=last_enrich)
        except Exception as e:
            logging.error("Problem executing studies for %s", backend_name)
            traceback.print_exc()

    try:
        from grimoire.elk.sortinghat import SortingHat
    except ImportError:
        logging.warning("SortingHat not available.")

    backend = None
    enrich_index = None

    if ocean_index or ocean_index_enrich:
        clean = False  # don't remove index, it could be shared

    if not get_connector_from_name(backend_name):
        raise RuntimeError("Unknown backend %s" % backend_name)
    connector = get_connector_from_name(backend_name)
    klass = connector[3]  # BackendCmd for the connector

    try:
        backend = None
        backend_cmd = None
        if klass:
            # Data is retrieved from Perceval
            backend_cmd = klass(*backend_params)

            backend = backend_cmd.backend

        if ocean_index_enrich:
            enrich_index = ocean_index_enrich
        else:
            if not ocean_index:
                ocean_index = backend_name + "_" + backend.origin
            enrich_index = ocean_index+"_enrich"
        if events_enrich:
            enrich_index += "_events"

        enrich_backend = connector[2](db_sortinghat, db_projects_map, json_projects_map)
        if url_enrich:
            elastic_enrich = get_elastic(url_enrich, enrich_index, clean, enrich_backend)
        else:
            elastic_enrich = get_elastic(url, enrich_index, clean, enrich_backend)
        enrich_backend.set_elastic(elastic_enrich)
        if github_token and backend_name == "git":
            enrich_backend.set_github_token(github_token)

        # We need to enrich from just updated items since last enrichment
        # Always filter by origin to support multi origin indexes
        last_enrich = None
        if backend:
            # Only supported in data retrieved from a perceval backend
            filter_ = {"name":"origin",
                       "value":backend.origin}
            # Check if backend supports from_date
            signature = inspect.signature(backend.fetch)

            if 'from_date' in signature.parameters:
                last_enrich = enrich_backend.get_last_update_from_es(filter_)
            elif 'offset' in signature.parameters:
                last_enrich = enrich_backend.get_last_offset_from_es(filter_)

            if no_incremental:
                last_enrich = None

            # If from_date or offset is set in the backend command, use it
            if backend_cmd:
                if 'from_date' in signature.parameters:
                    if backend_cmd.from_date.replace(tzinfo=None) != parser.parse("1970-01-01"):
                        last_enrich = backend_cmd.from_date
                elif 'offset' in signature.parameters:
                    if backend_cmd.offset and backend_cmd.offset != 0:
                        last_enrich = backend_cmd.offset

            logging.debug("Last enrichment: %s", last_enrich)

            if 'from_date' in signature.parameters:
                ocean_backend = connector[1](backend, from_date=last_enrich)
            elif 'offset' in signature.parameters:
                ocean_backend = connector[1](backend, offset=last_enrich)
            else:
                ocean_backend = connector[1](backend)
        else:
            if not no_incremental:
                last_enrich = enrich_backend.get_last_update_from_es()
            logging.debug("Last enrichment: %s", last_enrich)
            if last_enrich:
                logging.debug("Last enrichment: %s", last_enrich)
                ocean_backend = connector[1](backend, from_date=last_enrich)
            else:
                ocean_backend = connector[1](backend)


        clean = False  # Don't remove ocean index when enrich
        elastic_ocean = get_elastic(url, ocean_index, clean, ocean_backend)
        ocean_backend.set_elastic(elastic_ocean)

        logging.info("Adding enrichment data to %s", enrich_backend.elastic.index_url)

        if only_studies:
            logging.info("Running only studies (no SH and no enrichment)")
            do_studies(enrich_backend, last_enrich)
        else:
            if db_sortinghat:
                enrich_count_merged = 0

                enrich_count_merged = enrich_sortinghat(ocean_backend, enrich_backend)
                logging.info("Total items enriched for merged identities %i ", enrich_count_merged)

            if only_identities:
                logging.info("Only SH identities added. Enrich not done!")

            else:
                # Enrichment for the new items once SH update is finished
                if not events_enrich:
                    enrich_count = enrich_items(ocean_backend, enrich_backend)
                    if enrich_count:
                        logging.info("Total items enriched %i ", enrich_count)
                else:
                    enrich_count = enrich_items(ocean_backend, enrich_backend, events=True)
                    if enrich_count:
                        logging.info("Total events enriched %i ", enrich_count)
                if studies:
                    do_studies(enrich_backend, last_enrich)

    except Exception as ex:
        traceback.print_exc()
        if backend:
            logging.error("Error enriching ocean from %s (%s): %s",
                          backend_name, backend.origin, ex)
        else:
            logging.error("Error enriching ocean %s", ex)

    logging.info("Done %s ", backend_name)
Example #13
def feed_backend(url,
                 clean,
                 fetch_cache,
                 backend_name,
                 backend_params,
                 es_index=None,
                 es_index_enrich=None,
                 project=None):
    """ Feed Ocean with backend data """

    backend = None
    repo = {}  # repository data to be stored in conf
    repo['backend_name'] = backend_name
    repo['backend_params'] = backend_params

    if es_index:
        clean = False  # don't remove index, it could be shared

    if not get_connector_from_name(backend_name):
        raise RuntimeError("Unknown backend %s" % backend_name)
    connector = get_connector_from_name(backend_name)
    klass = connector[3]  # BackendCmd for the connector

    try:
        backend_cmd = klass(*backend_params)

        backend = backend_cmd.backend
        ocean_backend = connector[1](backend,
                                     fetch_cache=fetch_cache,
                                     project=project)

        logging.info("Feeding Ocean from %s (%s)", backend_name,
                     backend.origin)

        if not es_index:
            es_index = backend_name + "_" + backend.origin
        elastic_ocean = get_elastic(url, es_index, clean, ocean_backend)

        ocean_backend.set_elastic(elastic_ocean)

        ConfOcean.set_elastic(elastic_ocean)

        repo['repo_update_start'] = datetime.now().isoformat()

        # offset param support
        offset = None

        try:
            offset = backend_cmd.offset
        except AttributeError:
            # The backend does not support offset
            pass

        # from_date param support
        try:
            if offset:
                ocean_backend.feed(offset=offset)
            else:
                if backend_cmd.from_date.replace(tzinfo=None) == \
                    parser.parse("1970-01-01").replace(tzinfo=None):
                    # Don't use the default value
                    ocean_backend.feed()
                else:
                    ocean_backend.feed(backend_cmd.from_date)
        except AttributeError:
            # The backend does not support from_date
            ocean_backend.feed()

    except Exception as ex:
        if backend:
            logging.error("Error feeding ocean from %s (%s): %s" %
                          (backend_name, backend.origin, ex))
            # this print makes blackbird fail
            traceback.print_exc()
        else:
            logging.error("Error feeding ocean %s" % ex)

        repo['success'] = False
        repo['error'] = str(ex)
    else:
        repo['success'] = True

    repo['repo_update'] = datetime.now().isoformat()
    repo['index'] = es_index
    repo['index_enrich'] = es_index_enrich
    repo['project'] = project

    if es_index:
        unique_id = es_index + "_" + backend.origin
        ConfOcean.add_repo(unique_id, repo)
    else:
        logging.debug("Repository not added to Ocean because errors.")
        logging.debug(backend_params)

    logging.info("Done %s " % (backend_name))
Example #14
def enrich_backend(url,
                   clean,
                   backend_name,
                   backend_params,
                   ocean_index=None,
                   ocean_index_enrich=None,
                   db_projects_map=None,
                   db_sortinghat=None,
                   no_incremental=False,
                   only_identities=False,
                   github_token=None,
                   studies=False,
                   only_studies=False,
                   url_enrich=None):
    """ Enrich Ocean index """
    def enrich_items(items, enrich_backend):
        total = 0

        items_pack = []

        for item in items:
            # print("%s %s" % (item['url'], item['lastUpdated_date']))
            if len(items_pack) >= enrich_backend.elastic.max_items_bulk:
                logging.info("Adding %i (%i done) enriched items to %s" % \
                             (enrich_backend.elastic.max_items_bulk, total,
                              enrich_backend.elastic.index_url))
                enrich_backend.enrich_items(items_pack)
                items_pack = []
            items_pack.append(item)
            total += 1
        enrich_backend.enrich_items(items_pack)

        return total

    def enrich_sortinghat(ocean_backend, enrich_backend):
        # First we add all new identities to SH
        item_count = 0
        new_identities = []

        for item in ocean_backend:
            item_count += 1
            # Get identities from new items to be added to SortingHat
            identities = enrich_backend.get_identities(item)
            for identity in identities:
                if identity not in new_identities:
                    new_identities.append(identity)
            if item_count % 1000 == 0:
                logging.debug("Processed %i items identities (%i identities)" \
                               % (item_count, len(new_identities)))
        logging.debug("TOTAL ITEMS: %i" % (item_count))

        logging.info("Total new identities to be checked %i" %
                     len(new_identities))

        merged_identities = SortingHat.add_identities(
            enrich_backend.sh_db, new_identities,
            enrich_backend.get_connector_name())

        # Redo enrich for items with new merged identities
        renrich_items = []
        # For testing
        # merged_identities = ['7e0bcf6ff46848403eaffa29ef46109f386fa24b']
        for mid in merged_identities:
            renrich_items += get_items_from_uuid(mid, enrich_backend,
                                                 ocean_backend)

        # Enrich items with merged identities
        enrich_count_merged = enrich_items(renrich_items, enrich_backend)
        return enrich_count_merged

    def do_studies(enrich_backend, last_enrich):
        try:
            for study in enrich_backend.studies:
                logging.info("Starting study: %s", study)
                study(from_date=last_enrich)
        except Exception as e:
            logging.warning("Problem executing studies for %s", backend_name)
            print(e)

    backend = None
    enrich_index = None

    if ocean_index or ocean_index_enrich:
        clean = False  # don't remove index, it could be shared

    if not get_connector_from_name(backend_name):
        raise RuntimeError("Unknown backend %s" % backend_name)
    connector = get_connector_from_name(backend_name)
    klass = connector[3]  # BackendCmd for the connector

    try:
        backend = None
        if klass:
            # Data is retrieved from Perceval
            backend_cmd = klass(*backend_params)

            backend = backend_cmd.backend

        if ocean_index_enrich:
            enrich_index = ocean_index_enrich
        else:
            if not ocean_index:
                ocean_index = backend_name + "_" + backend.origin
            enrich_index = ocean_index + "_enrich"

        enrich_backend = connector[2](backend, db_sortinghat, db_projects_map)
        if url_enrich:
            elastic_enrich = get_elastic(url_enrich, enrich_index, clean,
                                         enrich_backend)
        else:
            elastic_enrich = get_elastic(url, enrich_index, clean,
                                         enrich_backend)
        enrich_backend.set_elastic(elastic_enrich)
        if github_token and backend_name == "git":
            enrich_backend.set_github_token(github_token)

        # We need to enrich from just updated items since last enrichment
        # Always filter by origin to support multi origin indexes
        last_enrich = None
        if backend:
            # Only supported in data retrieved from a perceval backend
            filter_ = {"name": "origin", "value": backend.origin}
            last_enrich = enrich_backend.get_last_update_from_es(filter_)
        if no_incremental:
            last_enrich = None

        logging.debug("Last enrichment: %s", last_enrich)

        # last_enrich=parser.parse('2016-06-01')

        ocean_backend = connector[1](backend, from_date=last_enrich)
        clean = False  # Don't remove ocean index when enrich
        elastic_ocean = get_elastic(url, ocean_index, clean, ocean_backend)
        ocean_backend.set_elastic(elastic_ocean)

        logging.info("Adding enrichment data to %s",
                     enrich_backend.elastic.index_url)

        if only_studies:
            logging.info("Running only studies (no SH and no enrichment)")
            do_studies(enrich_backend, last_enrich)

        else:
            if db_sortinghat:
                enrich_count_merged = 0

                enrich_count_merged = enrich_sortinghat(
                    ocean_backend, enrich_backend)
                logging.info("Total items enriched for merged identities %i ",
                             enrich_count_merged)

            if only_identities:
                logging.info("Only SH identities added. Enrich not done!")

            else:
                # Enrichment for the new items once SH update is finished
                enrich_count = enrich_items(ocean_backend, enrich_backend)
                logging.info("Total items enriched %i ", enrich_count)

                if studies:
                    do_studies(enrich_backend, last_enrich)

    except Exception as ex:
        traceback.print_exc()
        if backend:
            logging.error("Error enriching ocean from %s (%s): %s",
                          backend_name, backend.origin, ex)
        else:
            logging.error("Error enriching ocean %s", ex)

    logging.info("Done %s ", backend_name)
Example #15
def enrich_backend(url, clean, backend_name, backend_params, ocean_index=None,
                   ocean_index_enrich=None,
                   db_projects_map=None, json_projects_map=None,
                   db_sortinghat=None,
                   no_incremental=False, only_identities=False,
                   github_token=None, studies=False, only_studies=False,
                   url_enrich=None, events_enrich=False,
                   db_user=None, db_password=None, db_host=None,
                   do_refresh_projects=False, do_refresh_identities=False,
                   author_id=None, author_uuid=None):
    """ Enrich Ocean index """


    backend = None
    enrich_index = None

    if ocean_index or ocean_index_enrich:
        clean = False  # don't remove index, it could be shared

    if do_refresh_projects or do_refresh_identities:
        clean = False  # refresh works over the existing enriched items

    if not get_connector_from_name(backend_name):
        raise RuntimeError("Unknown backend %s" % backend_name)
    connector = get_connector_from_name(backend_name)
    klass = connector[3]  # BackendCmd for the connector

    try:
        backend = None
        backend_cmd = None
        if klass:
            # Data is retrieved from Perceval
            backend_cmd = klass(*backend_params)
            backend = backend_cmd.backend

        if ocean_index_enrich:
            enrich_index = ocean_index_enrich
        else:
            if not ocean_index:
                ocean_index = backend_name + "_" + backend.origin
            enrich_index = ocean_index+"_enrich"
        if events_enrich:
            enrich_index += "_events"

        enrich_backend = connector[2](db_sortinghat, db_projects_map, json_projects_map,
                                      db_user, db_password, db_host)
        if url_enrich:
            elastic_enrich = get_elastic(url_enrich, enrich_index, clean, enrich_backend)
        else:
            elastic_enrich = get_elastic(url, enrich_index, clean, enrich_backend)
        enrich_backend.set_elastic(elastic_enrich)
        if github_token and backend_name == "git":
            enrich_backend.set_github_token(github_token)

        ocean_backend = get_ocean_backend(backend_cmd, enrich_backend, no_incremental)

        if only_studies:
            logger.info("Running only studies (no SH and no enrichment)")
            do_studies(enrich_backend, no_incremental)
        elif do_refresh_projects:
            logger.info("Refreshing project field in enriched index")
            field_id = enrich_backend.get_field_unique_id()
            eitems = refresh_projects(enrich_backend)
            enrich_backend.elastic.bulk_upload_sync(eitems, field_id)
        elif do_refresh_identities:

            if author_id:
                query_string = {}
                query_string["fields"] = 'author_id'
                query_string["query"] = author_id
            elif author_uuid:
                query_string = {}
                query_string["fields"] = 'author_uuid'
                query_string["query"] = author_uuid

            logger.info("Refreshing identities fields in enriched index")
            field_id = enrich_backend.get_field_unique_id()
            logger.info(field_id)
            eitems = refresh_identities(enrich_backend, query_string)
            enrich_backend.elastic.bulk_upload_sync(eitems, field_id)
        else:
            clean = False  # Don't remove ocean index when enrich
            elastic_ocean = get_elastic(url, ocean_index, clean, ocean_backend)
            ocean_backend.set_elastic(elastic_ocean)

            logger.info("Adding enrichment data to %s", enrich_backend.elastic.index_url)

            if db_sortinghat:
                # FIXME: This step won't be done from enrich in the future
                total_ids = load_identities(ocean_backend, enrich_backend)
                logger.info("Total identities loaded %i ", total_ids)

            if only_identities:
                logger.info("Only SH identities added. Enrich not done!")

            else:
                # Enrichment for the new items once SH update is finished
                if not events_enrich:
                    enrich_count = enrich_items(ocean_backend, enrich_backend)
                    if enrich_count:
                        logger.info("Total items enriched %i ", enrich_count)
                else:
                    enrich_count = enrich_items(ocean_backend, enrich_backend, events=True)
                    if enrich_count:
                        logger.info("Total events enriched %i ", enrich_count)
                if studies:
                    do_studies(enrich_backend)

    except Exception as ex:
        traceback.print_exc()
        if backend:
            logger.error("Error enriching ocean from %s (%s): %s",
                          backend_name, backend.origin, ex)
        else:
            logger.error("Error enriching ocean %s", ex)

    logger.info("Done %s ", backend_name)
Example #16
def enrich_backend(url, clean, backend_name, backend_params, ocean_index=None,
                   ocean_index_enrich=None,
                   db_projects_map=None, db_sortinghat=None,
                   no_incremental=False):
    """ Enrich Ocean index """

    def enrich_items(items, enrich_backend):
        total = 0

        items_pack = []

        for item in items:
            # print("%s %s" % (item['url'], item['lastUpdated_date']))
            if len(items_pack) >= enrich_backend.elastic.max_items_bulk:
                logging.info("Adding %i (%i done) enriched items to %s" % \
                             (enrich_backend.elastic.max_items_bulk, total,
                              enrich_backend.elastic.index_url))
                enrich_backend.enrich_items(items_pack)
                items_pack = []
            items_pack.append(item)
            total += 1
        enrich_backend.enrich_items(items_pack)

        return total

    def enrich_sortinghat(backend_name, ocean_backend, enrich_backend):
        # First we add all new identities to SH
        item_count = 0
        new_identities = []

        for item in ocean_backend:
            item_count += 1
            # Get identities from new items to be added to SortingHat
            identities = enrich_backend.get_identities(item)
            for identity in identities:
                if identity not in new_identities:
                    new_identities.append(identity)
            if item_count % 1000 == 0:
                logging.debug("Processed %i items identities (%i identities)" \
                               % (item_count, len(new_identities)))
        logging.debug("TOTAL ITEMS: %i" % (item_count))

        logging.info("Total new identities to be checked %i" % len(new_identities))

        merged_identities = SortingHat.add_identities(enrich_backend.sh_db,
                                                      new_identities, backend_name)

        # Redo enrich for items with new merged identities
        renrich_items = []
        # For testing
        # merged_identities = ['7e0bcf6ff46848403eaffa29ef46109f386fa24b']
        for mid in merged_identities:
            renrich_items += get_items_from_uuid(mid, enrich_backend, ocean_backend)

        # Enrich items with merged identities
        enrich_count_merged = enrich_items(renrich_items, enrich_backend)
        return enrich_count_merged


    backend = None
    enrich_index = None

    if ocean_index or ocean_index_enrich:
        clean = False  # don't remove index, it could be shared

    if not get_connector_from_name(backend_name):
        raise RuntimeError("Unknown backend %s" % backend_name)
    connector = get_connector_from_name(backend_name)
    klass = connector[3]  # BackendCmd for the connector

    try:
        backend_cmd = klass(*backend_params)

        backend = backend_cmd.backend

        if ocean_index_enrich:
            enrich_index = ocean_index_enrich
        else:
            if not ocean_index:
                ocean_index = backend_name + "_" + backend.origin
            enrich_index = ocean_index+"_enrich"

        enrich_backend = connector[2](backend, db_projects_map, db_sortinghat)
        elastic_enrich = get_elastic(url, enrich_index, clean, enrich_backend)
        enrich_backend.set_elastic(elastic_enrich)

        # We need to enrich from just updated items since last enrichment
        # Always filter by origin to support multi origin indexes
        filter_ = {"name":"origin",
                   "value":backend.origin}
        last_enrich = enrich_backend.get_last_update_from_es(filter_)
        if no_incremental:
            last_enrich = None

        logging.debug("Last enrichment: %s" % (last_enrich))

        ocean_backend = connector[1](backend, from_date=last_enrich)
        clean = False  # Don't remove ocean index when enrich
        elastic_ocean = get_elastic(url, ocean_index, clean, ocean_backend)
        ocean_backend.set_elastic(elastic_ocean)

        logging.info("Adding enrichment data to %s" %
                     (enrich_backend.elastic.index_url))

        if db_sortinghat:
            enrich_count_merged = 0

            enrich_count_merged = enrich_sortinghat(backend_name,
                                                    ocean_backend, enrich_backend)
            logging.info("Total items enriched for merged identities %i " %  enrich_count_merged)
        # Enrichment for the new items once SH update is finished
        enrich_count = enrich_items(ocean_backend, enrich_backend)
        logging.info("Total items enriched %i " %  enrich_count)


    except Exception as ex:
        traceback.print_exc()
        if backend:
            logging.error("Error enriching ocean from %s (%s): %s" %
                          (backend_name, backend.origin, ex))
        else:
            logging.error("Error enriching ocean %s" % ex)

    logging.info("Done %s " % (backend_name))
Example #17
def feed_backend(url, clean, fetch_cache, backend_name, backend_params,
                 es_index=None, es_index_enrich=None, project=None):
    """ Feed Ocean with backend data """

    backend = None
    repo = {}    # repository data to be stored in conf
    repo['backend_name'] = backend_name
    repo['backend_params'] = backend_params
    if es_index:
        clean = False  # don't remove index, it could be shared


    if not get_connector_from_name(backend_name):
        raise RuntimeError("Unknown backend %s" % backend_name)
    connector = get_connector_from_name(backend_name)
    klass = connector[3]  # BackendCmd for the connector

    try:
        backend_cmd = klass(*backend_params)

        backend = backend_cmd.backend
        ocean_backend = connector[1](backend, fetch_cache=fetch_cache, project=project)

        logging.info("Feeding Ocean from %s (%s)" % (backend_name,
                                                     backend.origin))

        if not es_index:
            es_index = backend_name + "_" + backend.origin
        elastic_ocean = get_elastic(url, es_index, clean, ocean_backend)

        ocean_backend.set_elastic(elastic_ocean)

        ConfOcean.set_elastic(elastic_ocean)

        repo['repo_update_start'] = datetime.now().isoformat()

        try:
            if backend_cmd.from_date.replace(tzinfo=None) == \
                parser.parse("1970-01-01").replace(tzinfo=None):
                # Don't use the default value
                ocean_backend.feed()
            else:
                ocean_backend.feed(backend_cmd.from_date)
        except AttributeError:
            # The backend does not support from_date
            ocean_backend.feed()

    except Exception as ex:
        if backend:
            logging.error("Error feeding ocean from %s (%s): %s" %
                          (backend_name, backend.origin, ex))
            # don't propagate it ... it makes blackbird fail
            # TODO: manage it in p2o
            # traceback.print_exc()
        else:
            logging.error("Error feeding ocean %s" % ex)

        repo['success'] = False
        repo['error'] = str(ex)
    else:
        repo['success'] = True

    repo['repo_update'] = datetime.now().isoformat()
    repo['index'] = es_index
    repo['index_enrich'] = es_index_enrich
    repo['project'] = project

    if es_index:
        unique_id = es_index+"_"+backend.origin
        ConfOcean.add_repo(unique_id, repo)
    else:
        logging.debug("Repository not added to Ocean because errors.")
        logging.debug(backend_params)

    logging.info("Done %s " % (backend_name))
Example #18
if __name__ == '__main__':

    app_init = datetime.now()

    args = get_params()

    config_logging(args.debug)

    if args.index is None:
        # Extract identities from all indexes
        pass
    else:
        logging.info("Extracting identities from: %s" % (args.index))
        perceval_params = get_perceval_params(args.elastic_url, args.index)
        backend_name = perceval_params['backend']
        connector = get_connector_from_name(backend_name)
        perceval_backend_class = connector[0]
        ocean_backend_class = connector[1]
        perceval_backend = perceval_backend_class(**perceval_params)

        obackend = ocean_backend_class(perceval_backend, incremental=False)
        obackend.set_elastic(get_elastic(args.elastic_url, args.index))

        identities = get_identities(obackend)

        # Add the identities to Sorting Hat
        SortingHat.add_identities(identities, backend_name)

        print("Total identities processed: %i" % len(identities))