def test_enrich(self, sortinghat=False, projects=False): """Test enrich all sources""" config = configparser.ConfigParser() config.read(CONFIG_FILE) es_con = dict(config.items('ElasticSearch'))['url'] logging.info("Enriching data in: %s", es_con) connectors = get_connectors() for con in sorted(connectors.keys()): perceval_backend = None ocean_index = "test_"+con enrich_index = "test_"+con+"_enrich" clean = False ocean_backend = connectors[con][1](perceval_backend) elastic_ocean = get_elastic(es_con, ocean_index, clean, ocean_backend) ocean_backend.set_elastic(elastic_ocean) clean = True if not sortinghat and not projects: enrich_backend = connectors[con][2]() elif sortinghat and not projects: enrich_backend = connectors[con][2](db_sortinghat=DB_SORTINGHAT) elif not sortinghat and projects: enrich_backend = connectors[con][2](db_projects_map=DB_PROJECTS) elastic_enrich = get_elastic(es_con, enrich_index, clean, enrich_backend) enrich_backend.set_elastic(elastic_enrich) enrich_count = self.__enrich_items(ocean_backend, enrich_backend) logging.info("Total items enriched %i ", enrich_count)
def test_enrich(self, sortinghat=False, projects=False): """Test enrich all sources""" config = configparser.ConfigParser() config.read(CONFIG_FILE) es_con = dict(config.items('ElasticSearch'))['url'] logging.info("Enriching data in: %s", es_con) connectors = get_connectors() for con in sorted(connectors.keys()): perceval_backend = None ocean_index = "test_" + con enrich_index = "test_" + con + "_enrich" clean = False ocean_backend = connectors[con][1](perceval_backend) elastic_ocean = get_elastic(es_con, ocean_index, clean, ocean_backend) ocean_backend.set_elastic(elastic_ocean) clean = True if not sortinghat and not projects: enrich_backend = connectors[con][2](perceval_backend) elif sortinghat and not projects: enrich_backend = connectors[con][2]( perceval_backend, db_sortinghat=DB_SORTINGHAT) elif not sortinghat and projects: enrich_backend = connectors[con][2]( perceval_backend, db_projects_map=DB_PROJECTS) elastic_enrich = get_elastic(es_con, enrich_index, clean, enrich_backend) enrich_backend.set_elastic(elastic_enrich) enrich_count = self.__enrich_items(ocean_backend, enrich_backend) logging.info("Total items enriched %i ", enrich_count)
def get_perceval_params(url, index):
    logging.info("Get perceval params for index: %s", index)
    elastic = get_elastic(url, ConfOcean.get_index())
    ConfOcean.set_elastic(elastic)
    r = requests.get(elastic.index_url + "/repos/" + index)
    params = r.json()['_source']['params']
    return params

def feed_backends(url, clean, debug=False, redis=None):
    ''' Update Ocean for all existing backends '''
    logging.info("Updating all Ocean")
    elastic = get_elastic(url, ConfOcean.get_index(), clean)
    ConfOcean.set_elastic(elastic)
    fetch_cache = False
    # async_ is expected to be a module-level flag; note that newer RQ
    # releases renamed the 'async' keyword to 'is_async'
    q = Queue('update', connection=Redis(redis), async=async_)
    for repo in ConfOcean.get_repos():
        task_feed = q.enqueue(feed_backend, url, clean, fetch_cache,
                              repo['backend_name'], repo['backend_params'],
                              repo['index'], repo['index_enrich'], repo['project'])
        logging.info("Queued job")
        logging.info(task_feed)

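# --- Usage sketch (illustrative, not taken from the original sources) ---
# feed_backends() only enqueues one feed_backend job per repository registered
# in the ConfOcean index; separate RQ workers listening on the 'update' queue
# (e.g. `rq worker update`) execute them. The Elasticsearch URL, the Redis host
# and the module-level async_ flag used by Queue() above are assumptions here.

async_ = True  # with False, RQ would run the jobs synchronously, in-process

feed_backends("http://localhost:9200", clean=False, redis="localhost")
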
def test_refresh_project(self):
    """Test refresh project field for all sources"""
    # self.test_enrich_sh()  # Load the identities in ES
    config = configparser.ConfigParser()
    config.read(CONFIG_FILE)
    es_con = dict(config.items('ElasticSearch'))['url']
    logging.info("Refreshing data in: %s", es_con)
    connectors = get_connectors()
    for con in sorted(connectors.keys()):
        enrich_index = "test_" + con + "_enrich"
        enrich_backend = connectors[con][2](db_projects_map=DB_PROJECTS)
        clean = False
        elastic_enrich = get_elastic(es_con, enrich_index, clean, enrich_backend)
        enrich_backend.set_elastic(elastic_enrich)
        logging.info("Refreshing projects fields in enriched index %s",
                     elastic_enrich.index_url)
        self.__refresh_projects(enrich_backend)

def test_data_load(self):
    """Test load all sources JSON data into ES"""
    config = configparser.ConfigParser()
    config.read(CONFIG_FILE)
    es_con = dict(config.items('ElasticSearch'))['url']
    logging.info("Loading data in: %s", es_con)
    connectors = get_connectors()
    for con in sorted(connectors.keys()):
        with open(os.path.join("data", con + ".json")) as f:
            items = json.load(f)
            es_index = "test_" + con
            clean = True
            perceval_backend = None
            ocean_backend = connectors[con][1](perceval_backend)
            elastic_ocean = get_elastic(es_con, es_index, clean, ocean_backend)
            ocean_backend.set_elastic(elastic_ocean)
            self.__data2es(items, ocean_backend)

def enrich_backends(url, clean, debug=False, redis=None,
                    db_projects_map=None, db_sortinghat=None):
    ''' Enrich all existing indexes '''
    logging.info("Enriching repositories")
    elastic = get_elastic(url, ConfOcean.get_index(), clean)
    ConfOcean.set_elastic(elastic)
    fetch_cache = False
    q = Queue('update', connection=Redis(redis), async=async_)
    for repo in ConfOcean.get_repos():
        enrich_task = q.enqueue(enrich_backend, url, clean,
                                repo['backend_name'], repo['backend_params'],
                                repo['index'], repo['index_enrich'],
                                db_projects_map, db_sortinghat)
        logging.info("Queued job")
        logging.info(enrich_task)

def feed_backend(url, clean, fetch_cache, backend_name, backend_params,
                 es_index=None, es_index_enrich=None, project=None):
    """ Feed Ocean with backend data """
    backend = None
    repo = {}  # repository data to be stored in conf
    repo['backend_name'] = backend_name
    repo['backend_params'] = backend_params

    if es_index:
        clean = False  # don't remove index, it could be shared

    if not get_connector_from_name(backend_name):
        raise RuntimeError("Unknown backend %s" % backend_name)
    connector = get_connector_from_name(backend_name)
    klass = connector[3]  # BackendCmd for the connector

    try:
        backend_cmd = klass(*backend_params)
        backend = backend_cmd.backend
        ocean_backend = connector[1](backend, fetch_cache=fetch_cache, project=project)

        logging.info("Feeding Ocean from %s (%s)", backend_name, backend.origin)

        if not es_index:
            es_index = backend_name + "_" + backend.origin
        elastic_ocean = get_elastic(url, es_index, clean, ocean_backend)
        ocean_backend.set_elastic(elastic_ocean)

        ConfOcean.set_elastic(elastic_ocean)
        repo['repo_update_start'] = datetime.now().isoformat()

        # perceval backends fetch params
        offset = None
        from_date = None
        category = None

        signature = inspect.signature(backend.fetch)

        if 'from_date' in signature.parameters:
            from_date = backend_cmd.from_date

        if 'offset' in signature.parameters:
            offset = backend_cmd.offset

        if 'category' in signature.parameters:
            category = backend_cmd.category

        # from_date param support
        if offset and category:
            ocean_backend.feed(from_offset=offset, category=category)
        elif offset:
            ocean_backend.feed(from_offset=offset)
        elif from_date and from_date.replace(tzinfo=None) != parser.parse("1970-01-01"):
            if category:
                ocean_backend.feed(backend_cmd.from_date, category=category)
            else:
                ocean_backend.feed(backend_cmd.from_date)
        elif category:
            ocean_backend.feed(category=category)
        else:
            ocean_backend.feed()

    except Exception as ex:
        if backend:
            logging.error("Error feeding ocean from %s (%s): %s",
                          backend_name, backend.origin, ex)
            # this print makes blackbird fail
            traceback.print_exc()
        else:
            logging.error("Error feeding ocean %s", ex)
        repo['success'] = False
        repo['error'] = str(ex)
    else:
        repo['success'] = True

    repo['repo_update'] = datetime.now().isoformat()
    repo['index'] = es_index
    repo['index_enrich'] = es_index_enrich
    repo['project'] = project

    if es_index:
        unique_id = es_index + "_" + backend.origin
        ConfOcean.add_repo(unique_id, repo)
    else:
        logging.debug("Repository not added to Ocean because of errors.")
        logging.debug(backend_params)

    logging.info("Done %s ", backend_name)

def enrich_backend(url, clean, backend_name, backend_params, ocean_index=None,
                   ocean_index_enrich=None,
                   db_projects_map=None, json_projects_map=None,
                   db_sortinghat=None,
                   no_incremental=False, only_identities=False,
                   github_token=None, studies=False, only_studies=False,
                   url_enrich=None, events_enrich=False):
    """ Enrich Ocean index """

    def enrich_items(items, enrich_backend, events=False):
        total = 0

        if not events:
            total = enrich_backend.enrich_items(items)
        else:
            total = enrich_backend.enrich_events(items)
        return total

    def enrich_sortinghat(ocean_backend, enrich_backend):
        # First we add all new identities to SH
        item_count = 0
        new_identities = []

        for item in ocean_backend:
            item_count += 1
            # Get identities from new items to be added to SortingHat
            identities = enrich_backend.get_identities(item)
            for identity in identities:
                if identity not in new_identities:
                    new_identities.append(identity)
            if item_count % 1000 == 0:
                logging.debug("Processed %i items identities (%i identities)",
                              item_count, len(new_identities))
        logging.debug("TOTAL ITEMS: %i", item_count)

        logging.info("Total new identities to be checked %i", len(new_identities))

        merged_identities = SortingHat.add_identities(enrich_backend.sh_db,
                                                      new_identities,
                                                      enrich_backend.get_connector_name())

        # Redo enrich for items with new merged identities
        renrich_items = []
        # For testing
        # merged_identities = ['7e0bcf6ff46848403eaffa29ef46109f386fa24b']
        for mid in merged_identities:
            renrich_items += get_items_from_uuid(mid, enrich_backend, ocean_backend)

        # Enrich items with merged identities
        enrich_count_merged = enrich_items(renrich_items, enrich_backend)
        return enrich_count_merged

    def do_studies(enrich_backend, last_enrich):
        try:
            for study in enrich_backend.studies:
                logging.info("Starting study: %s", study)
                study(from_date=last_enrich)
        except Exception:
            logging.error("Problem executing studies for %s", backend_name)
            traceback.print_exc()

    try:
        from grimoire.elk.sortinghat import SortingHat
    except ImportError:
        logging.warning("SortingHat not available.")

    backend = None
    enrich_index = None

    if ocean_index or ocean_index_enrich:
        clean = False  # don't remove index, it could be shared

    if not get_connector_from_name(backend_name):
        raise RuntimeError("Unknown backend %s" % backend_name)
    connector = get_connector_from_name(backend_name)
    klass = connector[3]  # BackendCmd for the connector

    try:
        backend = None
        backend_cmd = None
        if klass:
            # Data is retrieved from Perceval
            backend_cmd = klass(*backend_params)
            backend = backend_cmd.backend

        if ocean_index_enrich:
            enrich_index = ocean_index_enrich
        else:
            if not ocean_index:
                ocean_index = backend_name + "_" + backend.origin
            enrich_index = ocean_index + "_enrich"
        if events_enrich:
            enrich_index += "_events"

        enrich_backend = connector[2](db_sortinghat, db_projects_map, json_projects_map)
        if url_enrich:
            elastic_enrich = get_elastic(url_enrich, enrich_index, clean, enrich_backend)
        else:
            elastic_enrich = get_elastic(url, enrich_index, clean, enrich_backend)
        enrich_backend.set_elastic(elastic_enrich)
        if github_token and backend_name == "git":
            enrich_backend.set_github_token(github_token)

        # We need to enrich from just updated items since last enrichment
        # Always filter by origin to support multi origin indexes
        last_enrich = None
        if backend:
            # Only supported in data retrieved from a perceval backend
            filter_ = {"name": "origin", "value": backend.origin}

            # Check if backend supports from_date
            signature = inspect.signature(backend.fetch)
            if 'from_date' in signature.parameters:
                last_enrich = enrich_backend.get_last_update_from_es(filter_)
            elif 'offset' in signature.parameters:
                last_enrich = enrich_backend.get_last_offset_from_es(filter_)

            if no_incremental:
                last_enrich = None

            # If from_date or offset is set in the backend, use it
            if backend_cmd:
                if 'from_date' in signature.parameters:
                    if backend_cmd.from_date.replace(tzinfo=None) != parser.parse("1970-01-01"):
                        last_enrich = backend_cmd.from_date
                elif 'offset' in signature.parameters:
                    if backend_cmd.offset and backend_cmd.offset != 0:
                        last_enrich = backend_cmd.offset

            logging.debug("Last enrichment: %s", last_enrich)

            if 'from_date' in signature.parameters:
                ocean_backend = connector[1](backend, from_date=last_enrich)
            elif 'offset' in signature.parameters:
                ocean_backend = connector[1](backend, offset=last_enrich)
            else:
                ocean_backend = connector[1](backend)
        else:
            if not no_incremental:
                last_enrich = enrich_backend.get_last_update_from_es()
                logging.debug("Last enrichment: %s", last_enrich)

            if last_enrich:
                logging.debug("Last enrichment: %s", last_enrich)
                ocean_backend = connector[1](backend, from_date=last_enrich)
            else:
                ocean_backend = connector[1](backend)

        clean = False  # Don't remove ocean index when enrich
        elastic_ocean = get_elastic(url, ocean_index, clean, ocean_backend)
        ocean_backend.set_elastic(elastic_ocean)

        logging.info("Adding enrichment data to %s", enrich_backend.elastic.index_url)

        if only_studies:
            logging.info("Running only studies (no SH and no enrichment)")
            do_studies(enrich_backend, last_enrich)
        else:
            if db_sortinghat:
                enrich_count_merged = 0
                enrich_count_merged = enrich_sortinghat(ocean_backend, enrich_backend)
                logging.info("Total items enriched for merged identities %i ",
                             enrich_count_merged)
            if only_identities:
                logging.info("Only SH identities added. Enrich not done!")
            else:
                # Enrichment for the new items once SH update is finished
                if not events_enrich:
                    enrich_count = enrich_items(ocean_backend, enrich_backend)
                    if enrich_count:
                        logging.info("Total items enriched %i ", enrich_count)
                else:
                    enrich_count = enrich_items(ocean_backend, enrich_backend, events=True)
                    if enrich_count:
                        logging.info("Total events enriched %i ", enrich_count)
                if studies:
                    do_studies(enrich_backend, last_enrich)

    except Exception as ex:
        traceback.print_exc()
        if backend:
            logging.error("Error enriching ocean from %s (%s): %s",
                          backend_name, backend.origin, ex)
        else:
            logging.error("Error enriching ocean %s", ex)

    logging.info("Done %s ", backend_name)

def feed_backend(url, clean, fetch_cache, backend_name, backend_params,
                 es_index=None, es_index_enrich=None, project=None):
    """ Feed Ocean with backend data """
    backend = None
    repo = {}  # repository data to be stored in conf
    repo['backend_name'] = backend_name
    repo['backend_params'] = backend_params

    if es_index:
        clean = False  # don't remove index, it could be shared

    if not get_connector_from_name(backend_name):
        raise RuntimeError("Unknown backend %s" % backend_name)
    connector = get_connector_from_name(backend_name)
    klass = connector[3]  # BackendCmd for the connector

    try:
        backend_cmd = klass(*backend_params)
        backend = backend_cmd.backend
        ocean_backend = connector[1](backend, fetch_cache=fetch_cache, project=project)

        logging.info("Feeding Ocean from %s (%s)", backend_name, backend.origin)

        if not es_index:
            es_index = backend_name + "_" + backend.origin
        elastic_ocean = get_elastic(url, es_index, clean, ocean_backend)
        ocean_backend.set_elastic(elastic_ocean)

        ConfOcean.set_elastic(elastic_ocean)
        repo['repo_update_start'] = datetime.now().isoformat()

        # offset param support
        offset = None
        try:
            offset = backend_cmd.offset
        except AttributeError:
            # The backend does not support offset
            pass

        # from_date param support
        try:
            if offset:
                ocean_backend.feed(offset=offset)
            else:
                if backend_cmd.from_date.replace(tzinfo=None) == \
                        parser.parse("1970-01-01").replace(tzinfo=None):
                    # Don't use the default value
                    ocean_backend.feed()
                else:
                    ocean_backend.feed(backend_cmd.from_date)
        except AttributeError:
            # The backend does not support from_date
            ocean_backend.feed()

    except Exception as ex:
        if backend:
            logging.error("Error feeding ocean from %s (%s): %s",
                          backend_name, backend.origin, ex)
            # this print makes blackbird fail
            traceback.print_exc()
        else:
            logging.error("Error feeding ocean %s", ex)
        repo['success'] = False
        repo['error'] = str(ex)
    else:
        repo['success'] = True

    repo['repo_update'] = datetime.now().isoformat()
    repo['index'] = es_index
    repo['index_enrich'] = es_index_enrich
    repo['project'] = project

    if es_index:
        unique_id = es_index + "_" + backend.origin
        ConfOcean.add_repo(unique_id, repo)
    else:
        logging.debug("Repository not added to Ocean because of errors.")
        logging.debug(backend_params)

    logging.info("Done %s ", backend_name)

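# --- Usage sketch (illustrative, not taken from the original sources) ---
# feed_backend() builds the Perceval backend from CLI-style backend_params,
# stores the raw items in the Ocean index and registers the repository in
# ConfOcean. The Elasticsearch URL, the git repository and the index names
# below are assumptions for the example.

feed_backend("http://localhost:9200", clean=False, fetch_cache=False,
             backend_name="git",
             backend_params=["https://github.com/grimoirelab/perceval.git"],
             es_index="git_raw", es_index_enrich="git_enriched",
             project="grimoirelab")
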
def enrich_backend(url, clean, backend_name, backend_params, ocean_index=None,
                   ocean_index_enrich=None,
                   db_projects_map=None, db_sortinghat=None,
                   no_incremental=False, only_identities=False,
                   github_token=None, studies=False, only_studies=False,
                   url_enrich=None):
    """ Enrich Ocean index """

    def enrich_items(items, enrich_backend):
        total = 0

        items_pack = []

        for item in items:
            # print("%s %s" % (item['url'], item['lastUpdated_date']))
            if len(items_pack) >= enrich_backend.elastic.max_items_bulk:
                logging.info("Adding %i (%i done) enriched items to %s",
                             enrich_backend.elastic.max_items_bulk, total,
                             enrich_backend.elastic.index_url)
                enrich_backend.enrich_items(items_pack)
                items_pack = []
            items_pack.append(item)
            total += 1
        enrich_backend.enrich_items(items_pack)

        return total

    def enrich_sortinghat(ocean_backend, enrich_backend):
        # First we add all new identities to SH
        item_count = 0
        new_identities = []

        for item in ocean_backend:
            item_count += 1
            # Get identities from new items to be added to SortingHat
            identities = enrich_backend.get_identities(item)
            for identity in identities:
                if identity not in new_identities:
                    new_identities.append(identity)
            if item_count % 1000 == 0:
                logging.debug("Processed %i items identities (%i identities)",
                              item_count, len(new_identities))
        logging.debug("TOTAL ITEMS: %i", item_count)

        logging.info("Total new identities to be checked %i", len(new_identities))

        merged_identities = SortingHat.add_identities(enrich_backend.sh_db,
                                                      new_identities,
                                                      enrich_backend.get_connector_name())

        # Redo enrich for items with new merged identities
        renrich_items = []
        # For testing
        # merged_identities = ['7e0bcf6ff46848403eaffa29ef46109f386fa24b']
        for mid in merged_identities:
            renrich_items += get_items_from_uuid(mid, enrich_backend, ocean_backend)

        # Enrich items with merged identities
        enrich_count_merged = enrich_items(renrich_items, enrich_backend)
        return enrich_count_merged

    def do_studies(enrich_backend, last_enrich):
        try:
            for study in enrich_backend.studies:
                logging.info("Starting study: %s", study)
                study(from_date=last_enrich)
        except Exception as e:
            logging.warning("Problem executing studies for %s", backend_name)
            print(e)

    backend = None
    enrich_index = None

    if ocean_index or ocean_index_enrich:
        clean = False  # don't remove index, it could be shared

    if not get_connector_from_name(backend_name):
        raise RuntimeError("Unknown backend %s" % backend_name)
    connector = get_connector_from_name(backend_name)
    klass = connector[3]  # BackendCmd for the connector

    try:
        backend = None
        if klass:
            # Data is retrieved from Perceval
            backend_cmd = klass(*backend_params)
            backend = backend_cmd.backend

        if ocean_index_enrich:
            enrich_index = ocean_index_enrich
        else:
            if not ocean_index:
                ocean_index = backend_name + "_" + backend.origin
            enrich_index = ocean_index + "_enrich"

        enrich_backend = connector[2](backend, db_sortinghat, db_projects_map)
        if url_enrich:
            elastic_enrich = get_elastic(url_enrich, enrich_index, clean, enrich_backend)
        else:
            elastic_enrich = get_elastic(url, enrich_index, clean, enrich_backend)
        enrich_backend.set_elastic(elastic_enrich)
        if github_token and backend_name == "git":
            enrich_backend.set_github_token(github_token)

        # We need to enrich from just updated items since last enrichment
        # Always filter by origin to support multi origin indexes
        last_enrich = None
        if backend:
            # Only supported in data retrieved from a perceval backend
            filter_ = {"name": "origin", "value": backend.origin}
            last_enrich = enrich_backend.get_last_update_from_es(filter_)

        if no_incremental:
            last_enrich = None

        logging.debug("Last enrichment: %s", last_enrich)

        # last_enrich = parser.parse('2016-06-01')

        ocean_backend = connector[1](backend, from_date=last_enrich)
        clean = False  # Don't remove ocean index when enrich
        elastic_ocean = get_elastic(url, ocean_index, clean, ocean_backend)
        ocean_backend.set_elastic(elastic_ocean)

        logging.info("Adding enrichment data to %s", enrich_backend.elastic.index_url)

        if only_studies:
            logging.info("Running only studies (no SH and no enrichment)")
            do_studies(enrich_backend, last_enrich)
        else:
            if db_sortinghat:
                enrich_count_merged = 0
                enrich_count_merged = enrich_sortinghat(ocean_backend, enrich_backend)
                logging.info("Total items enriched for merged identities %i ",
                             enrich_count_merged)
            if only_identities:
                logging.info("Only SH identities added. Enrich not done!")
            else:
                # Enrichment for the new items once SH update is finished
                enrich_count = enrich_items(ocean_backend, enrich_backend)
                logging.info("Total items enriched %i ", enrich_count)
                if studies:
                    do_studies(enrich_backend, last_enrich)

    except Exception as ex:
        traceback.print_exc()
        if backend:
            logging.error("Error enriching ocean from %s (%s): %s",
                          backend_name, backend.origin, ex)
        else:
            logging.error("Error enriching ocean %s", ex)

    logging.info("Done %s ", backend_name)

def enrich_backend(url, clean, backend_name, backend_params, ocean_index=None,
                   ocean_index_enrich=None,
                   db_projects_map=None, json_projects_map=None,
                   db_sortinghat=None,
                   no_incremental=False, only_identities=False,
                   github_token=None, studies=False, only_studies=False,
                   url_enrich=None, events_enrich=False,
                   db_user=None, db_password=None, db_host=None,
                   do_refresh_projects=False, do_refresh_identities=False,
                   author_id=None, author_uuid=None):
    """ Enrich Ocean index """
    backend = None
    enrich_index = None

    if ocean_index or ocean_index_enrich:
        clean = False  # don't remove index, it could be shared

    if do_refresh_projects or do_refresh_identities:
        clean = False  # refresh works over the existing enriched items

    if not get_connector_from_name(backend_name):
        raise RuntimeError("Unknown backend %s" % backend_name)
    connector = get_connector_from_name(backend_name)
    klass = connector[3]  # BackendCmd for the connector

    try:
        backend = None
        backend_cmd = None
        if klass:
            # Data is retrieved from Perceval
            backend_cmd = klass(*backend_params)
            backend = backend_cmd.backend

        if ocean_index_enrich:
            enrich_index = ocean_index_enrich
        else:
            if not ocean_index:
                ocean_index = backend_name + "_" + backend.origin
            enrich_index = ocean_index + "_enrich"
        if events_enrich:
            enrich_index += "_events"

        enrich_backend = connector[2](db_sortinghat, db_projects_map, json_projects_map,
                                      db_user, db_password, db_host)
        if url_enrich:
            elastic_enrich = get_elastic(url_enrich, enrich_index, clean, enrich_backend)
        else:
            elastic_enrich = get_elastic(url, enrich_index, clean, enrich_backend)
        enrich_backend.set_elastic(elastic_enrich)
        if github_token and backend_name == "git":
            enrich_backend.set_github_token(github_token)

        ocean_backend = get_ocean_backend(backend_cmd, enrich_backend, no_incremental)

        if only_studies:
            logger.info("Running only studies (no SH and no enrichment)")
            do_studies(enrich_backend, no_incremental)
        elif do_refresh_projects:
            logger.info("Refreshing project field in enriched index")
            field_id = enrich_backend.get_field_unique_id()
            eitems = refresh_projects(enrich_backend)
            enrich_backend.elastic.bulk_upload_sync(eitems, field_id)
        elif do_refresh_identities:
            if author_id:
                query_string = {}
                query_string["fields"] = 'author_id'
                query_string["query"] = author_id
            elif author_uuid:
                query_string = {}
                query_string["fields"] = 'author_uuid'
                query_string["query"] = author_uuid
            logger.info("Refreshing identities fields in enriched index")
            field_id = enrich_backend.get_field_unique_id()
            logger.info(field_id)
            eitems = refresh_identities(enrich_backend, query_string)
            enrich_backend.elastic.bulk_upload_sync(eitems, field_id)
        else:
            clean = False  # Don't remove ocean index when enrich
            elastic_ocean = get_elastic(url, ocean_index, clean, ocean_backend)
            ocean_backend.set_elastic(elastic_ocean)

            logger.info("Adding enrichment data to %s", enrich_backend.elastic.index_url)

            if db_sortinghat:
                # FIXME: This step won't be done from enrich in the future
                total_ids = load_identities(ocean_backend, enrich_backend)
                logger.info("Total identities loaded %i ", total_ids)

            if only_identities:
                logger.info("Only SH identities added. Enrich not done!")
            else:
                # Enrichment for the new items once SH update is finished
                if not events_enrich:
                    enrich_count = enrich_items(ocean_backend, enrich_backend)
                    if enrich_count:
                        logger.info("Total items enriched %i ", enrich_count)
                else:
                    enrich_count = enrich_items(ocean_backend, enrich_backend, events=True)
                    if enrich_count:
                        logger.info("Total events enriched %i ", enrich_count)
                if studies:
                    do_studies(enrich_backend)

    except Exception as ex:
        traceback.print_exc()
        if backend:
            logger.error("Error enriching ocean from %s (%s): %s",
                         backend_name, backend.origin, ex)
        else:
            logger.error("Error enriching ocean %s", ex)

    logger.info("Done %s ", backend_name)

def enrich_backend(url, clean, backend_name, backend_params, ocean_index=None,
                   ocean_index_enrich=None,
                   db_projects_map=None, db_sortinghat=None,
                   no_incremental=False):
    """ Enrich Ocean index """

    def enrich_items(items, enrich_backend):
        total = 0

        items_pack = []

        for item in items:
            # print("%s %s" % (item['url'], item['lastUpdated_date']))
            if len(items_pack) >= enrich_backend.elastic.max_items_bulk:
                logging.info("Adding %i (%i done) enriched items to %s",
                             enrich_backend.elastic.max_items_bulk, total,
                             enrich_backend.elastic.index_url)
                enrich_backend.enrich_items(items_pack)
                items_pack = []
            items_pack.append(item)
            total += 1
        enrich_backend.enrich_items(items_pack)

        return total

    def enrich_sortinghat(backend_name, ocean_backend, enrich_backend):
        # First we add all new identities to SH
        item_count = 0
        new_identities = []

        for item in ocean_backend:
            item_count += 1
            # Get identities from new items to be added to SortingHat
            identities = enrich_backend.get_identities(item)
            for identity in identities:
                if identity not in new_identities:
                    new_identities.append(identity)
            if item_count % 1000 == 0:
                logging.debug("Processed %i items identities (%i identities)",
                              item_count, len(new_identities))
        logging.debug("TOTAL ITEMS: %i", item_count)

        logging.info("Total new identities to be checked %i", len(new_identities))

        merged_identities = SortingHat.add_identities(enrich_backend.sh_db,
                                                      new_identities, backend_name)

        # Redo enrich for items with new merged identities
        renrich_items = []
        # For testing
        # merged_identities = ['7e0bcf6ff46848403eaffa29ef46109f386fa24b']
        for mid in merged_identities:
            renrich_items += get_items_from_uuid(mid, enrich_backend, ocean_backend)

        # Enrich items with merged identities
        enrich_count_merged = enrich_items(renrich_items, enrich_backend)
        return enrich_count_merged

    backend = None
    enrich_index = None

    if ocean_index or ocean_index_enrich:
        clean = False  # don't remove index, it could be shared

    if not get_connector_from_name(backend_name):
        raise RuntimeError("Unknown backend %s" % backend_name)
    connector = get_connector_from_name(backend_name)
    klass = connector[3]  # BackendCmd for the connector

    try:
        backend_cmd = klass(*backend_params)
        backend = backend_cmd.backend

        if ocean_index_enrich:
            enrich_index = ocean_index_enrich
        else:
            if not ocean_index:
                ocean_index = backend_name + "_" + backend.origin
            enrich_index = ocean_index + "_enrich"

        enrich_backend = connector[2](backend, db_projects_map, db_sortinghat)
        elastic_enrich = get_elastic(url, enrich_index, clean, enrich_backend)
        enrich_backend.set_elastic(elastic_enrich)

        # We need to enrich from just updated items since last enrichment
        # Always filter by origin to support multi origin indexes
        filter_ = {"name": "origin", "value": backend.origin}
        last_enrich = enrich_backend.get_last_update_from_es(filter_)

        if no_incremental:
            last_enrich = None

        logging.debug("Last enrichment: %s", last_enrich)

        ocean_backend = connector[1](backend, from_date=last_enrich)
        clean = False  # Don't remove ocean index when enrich
        elastic_ocean = get_elastic(url, ocean_index, clean, ocean_backend)
        ocean_backend.set_elastic(elastic_ocean)

        logging.info("Adding enrichment data to %s", enrich_backend.elastic.index_url)

        if db_sortinghat:
            enrich_count_merged = 0
            enrich_count_merged = enrich_sortinghat(backend_name,
                                                    ocean_backend, enrich_backend)
            logging.info("Total items enriched for merged identities %i ",
                         enrich_count_merged)

        # Enrichment for the new items once SH update is finished
        enrich_count = enrich_items(ocean_backend, enrich_backend)
        logging.info("Total items enriched %i ", enrich_count)

    except Exception as ex:
        traceback.print_exc()
        if backend:
            logging.error("Error enriching ocean from %s (%s): %s",
                          backend_name, backend.origin, ex)
        else:
            logging.error("Error enriching ocean %s", ex)

    logging.info("Done %s ", backend_name)

def feed_backend(url, clean, fetch_cache, backend_name, backend_params,
                 es_index=None, es_index_enrich=None, project=None):
    """ Feed Ocean with backend data """
    backend = None
    repo = {}  # repository data to be stored in conf
    repo['backend_name'] = backend_name
    repo['backend_params'] = backend_params

    if es_index:
        clean = False  # don't remove index, it could be shared

    if not get_connector_from_name(backend_name):
        raise RuntimeError("Unknown backend %s" % backend_name)
    connector = get_connector_from_name(backend_name)
    klass = connector[3]  # BackendCmd for the connector

    try:
        backend_cmd = klass(*backend_params)
        backend = backend_cmd.backend
        ocean_backend = connector[1](backend, fetch_cache=fetch_cache, project=project)

        logging.info("Feeding Ocean from %s (%s)", backend_name, backend.origin)

        if not es_index:
            es_index = backend_name + "_" + backend.origin
        elastic_ocean = get_elastic(url, es_index, clean, ocean_backend)
        ocean_backend.set_elastic(elastic_ocean)

        ConfOcean.set_elastic(elastic_ocean)
        repo['repo_update_start'] = datetime.now().isoformat()

        try:
            if backend_cmd.from_date.replace(tzinfo=None) == \
                    parser.parse("1970-01-01").replace(tzinfo=None):
                # Don't use the default value
                ocean_backend.feed()
            else:
                ocean_backend.feed(backend_cmd.from_date)
        except AttributeError:
            # The backend does not support from_date
            ocean_backend.feed()

    except Exception as ex:
        if backend:
            logging.error("Error feeding ocean from %s (%s): %s",
                          backend_name, backend.origin, ex)
            # don't propagate ... it makes blackbird fail
            # TODO: manage it in p2o
            # traceback.print_exc()
        else:
            logging.error("Error feeding ocean %s", ex)
        repo['success'] = False
        repo['error'] = str(ex)
    else:
        repo['success'] = True

    repo['repo_update'] = datetime.now().isoformat()
    repo['index'] = es_index
    repo['index_enrich'] = es_index_enrich
    repo['project'] = project

    if es_index:
        unique_id = es_index + "_" + backend.origin
        ConfOcean.add_repo(unique_id, repo)
    else:
        logging.debug("Repository not added to Ocean because of errors.")
        logging.debug(backend_params)

    logging.info("Done %s ", backend_name)

if __name__ == '__main__':
    app_init = datetime.now()

    args = get_params()

    config_logging(args.debug)

    if args.index is None:
        # Extract identities from all indexes
        pass
    else:
        logging.info("Extracting identities from: %s", args.index)
        perceval_params = get_perceval_params(args.elastic_url, args.index)
        backend_name = perceval_params['backend']
        connector = get_connector_from_name(backend_name)
        perceval_backend_class = connector[0]
        ocean_backend_class = connector[1]
        perceval_backend = None  # Don't use perceval
        perceval_backend = perceval_backend_class(**perceval_params)
        obackend = ocean_backend_class(perceval_backend, incremental=False)
        obackend.set_elastic(get_elastic(args.elastic_url, args.index))

        identities = get_identities(obackend)
        # Add the identities to Sorting Hat
        SortingHat.add_identities(identities, backend_name)

        print("Total identities processed: %i" % len(identities))