def get_backend_id(backend_name, backend_params):
    """Return the unique identifier of a backend instance.

    :param backend_name: name of the backend connector
    :param backend_params: CLI-style parameters used to build the backend
    :raises RuntimeError: if `backend_name` is not a known connector
    """
    connectors = get_connectors()
    if backend_name not in connectors:
        raise RuntimeError("Unknown backend %s" % backend_name)
    # Position 3 of a connector entry holds the BackendCmd class
    backend_cmd_class = connectors[backend_name][3]
    backend_cmd = backend_cmd_class(*backend_params)
    return backend_cmd.backend.unique_id
def get_backend_id(backend_name, backend_params):
    """Build the backend named `backend_name` and return its unique id.

    NOTE(review): this definition duplicates another `get_backend_id`
    elsewhere in the codebase; consider consolidating into one helper.

    :param backend_name: name of the backend connector
    :param backend_params: parameters handed to the connector's BackendCmd
    :raises RuntimeError: if the name is not a registered connector
    """
    if backend_name not in get_connectors():
        raise RuntimeError("Unknown backend %s" % backend_name)
    connector_entry = get_connectors()[backend_name]
    # The fourth element of a connector entry is its BackendCmd class
    command = connector_entry[3](*backend_params)
    return command.backend.unique_id
def test_enrich(self, sortinghat=False, projects=False):
    """Test enrich all sources.

    :param sortinghat: enable SortingHat identities support
    :param projects: enable projects-mapping support
    """
    config = configparser.ConfigParser()
    config.read(CONFIG_FILE)
    es_con = dict(config.items('ElasticSearch'))['url']
    logging.info("Enriching data in: %s", es_con)
    connectors = get_connectors()
    for con in sorted(connectors.keys()):
        perceval_backend = None
        ocean_index = "test_" + con
        enrich_index = "test_" + con + "_enrich"
        clean = False
        ocean_backend = connectors[con][1](perceval_backend)
        elastic_ocean = get_elastic(es_con, ocean_index, clean, ocean_backend)
        ocean_backend.set_elastic(elastic_ocean)
        clean = True
        # Build the enrich backend from whichever features are enabled.
        # The previous if/elif chain left `enrich_backend` unbound when
        # both sortinghat and projects were True (NameError later on);
        # a kwargs dict covers every combination.
        enrich_args = {}
        if sortinghat:
            enrich_args['db_sortinghat'] = DB_SORTINGHAT
        if projects:
            enrich_args['db_projects_map'] = DB_PROJECTS
        enrich_backend = connectors[con][2](**enrich_args)
        elastic_enrich = get_elastic(es_con, enrich_index, clean, enrich_backend)
        enrich_backend.set_elastic(elastic_enrich)
        if sortinghat:
            # Load SH identities
            load_identities(ocean_backend, enrich_backend)
        enrich_count = enrich_backend.enrich_items(ocean_backend)
        if enrich_count is not None:
            logging.info("Total items enriched %i ", enrich_count)
def test_add_alias(self):
    """Test whether add_alias properly works"""
    config = configparser.ConfigParser()
    config.read(CONFIG_FILE)
    es_con = dict(config.items('ElasticSearch'))['url']
    tmp_index = "test-add-aliases"
    tmp_index_url = es_con + "/" + tmp_index
    # Build a throwaway enriched index using the git connector
    enrich_backend = get_connectors()["git"][2]()
    elastic_enrich = get_elastic(es_con, tmp_index, True, enrich_backend)
    # NOTE(review): the elastic instance is attached to the shared
    # self._enrich fixture rather than to the local enrich_backend —
    # presumably intentional, but worth confirming.
    self._enrich.set_elastic(elastic_enrich)
    # add alias: first call must log the INFO "created" message
    with self.assertLogs(logger, level='INFO') as cm:
        self._enrich.elastic.add_alias(DEMOGRAPHICS_ALIAS)
    self.assertEqual(cm.output[0],
                     'INFO:grimoire_elk.elastic:Alias %s created on %s.'
                     % (DEMOGRAPHICS_ALIAS, anonymize_url(tmp_index_url)))
    # Verify the alias is visible through the ES _alias endpoint
    r = self._enrich.requests.get(self._enrich.elastic.index_url + "/_alias",
                                  headers=HEADER_JSON, verify=False)
    self.assertIn(DEMOGRAPHICS_ALIAS,
                  r.json()[self._enrich.elastic.index]['aliases'])
    # add alias again: second call must log the DEBUG "already exists" message
    with self.assertLogs(logger, level='DEBUG') as cm:
        self._enrich.elastic.add_alias(DEMOGRAPHICS_ALIAS)
    self.assertEqual(cm.output[0],
                     'DEBUG:grimoire_elk.elastic:Alias %s already exists on %s.'
                     % (DEMOGRAPHICS_ALIAS, anonymize_url(tmp_index_url)))
    # Clean up the temporary index
    requests.delete(tmp_index_url, verify=False)
def setUp(self):
    # The name of the connector is needed only to get access to the SortingHat DB
    github_connector = get_connectors()["github"]
    enrich_class = github_connector[2]
    self.enrich_backend = enrich_class(db_sortinghat=DB_SORTINGHAT,
                                       db_user=self.db_user,
                                       db_password=self.db_password)
def get_backend_sections(cls):
    """Return the known backend section names.

    A backend name may carry an extra ":<param>" suffix so that several
    entries for the same backend can coexist with different configs.
    """
    extra_backends = ["apache", "google_hits", "remo:activities"]
    return list(get_connectors()) + extra_backends
def find_ds_mapping(data_source, es_major_version):
    """
    Find the mapping given a perceval data source

    :param data_source: name of the perceval data source
    :param es_major_version: string with the major version for Elasticsearch
    :return: a dict with the mappings (raw and enriched)
    """
    mappings = {"raw": None, "enriched": None}

    # Backend connectors
    connectors = get_connectors()

    try:
        raw_klass = connectors[data_source][1]
        enrich_klass = connectors[data_source][2]
    except KeyError:
        print("Data source not found", data_source)
        sys.exit(1)

    # Raw and enriched mappings are extracted the same way; only the
    # backend class differs.
    for kind, klass in (("raw", raw_klass), ("enriched", enrich_klass)):
        backend = klass(None)
        if backend:
            items_mapping = json.loads(
                backend.mapping.get_elastic_mappings(es_major_version)['items'])
            mappings[kind] = [items_mapping,
                              find_general_mappings(es_major_version)]

    return mappings
def get_backend_sections(cls):
    """List every section name a backend config may use.

    Entries may include an extra ":<param>" suffix to allow several
    sections for one backend with different configurations.
    """
    sections = [name for name in get_connectors().keys()]
    for extra in ("apache", "google_hits", "remo:activities"):
        sections.append(extra)
    return sections
def test_refresh_project(self):
    """Test refresh project field for all sources"""
    # self.test_enrich_sh() # Load the identities in ES
    config = configparser.ConfigParser()
    config.read(CONFIG_FILE)
    es_con = dict(config.items('ElasticSearch'))['url']
    # Database credentials are optional in the config file
    db_section = config['Database'] if 'Database' in config else {}
    db_user = db_section.get('user', '')
    db_password = db_section.get('password', '')
    logging.info("Refreshing data in: %s", es_con)
    connectors = get_connectors()
    for con in sorted(connectors):
        enrich_index = "test_" + con + "_enrich"
        enrich_backend = connectors[con][2](db_projects_map=DB_PROJECTS,
                                            db_user=db_user,
                                            db_password=db_password)
        elastic_enrich = get_elastic(es_con, enrich_index, False,
                                     enrich_backend)
        enrich_backend.set_elastic(elastic_enrich)
        logging.info("Refreshing projects fields in enriched index %s",
                     elastic_enrich.index_url)
        self.__refresh_projects(enrich_backend)
def test_read_data(self):
    """Test load all sources JSON"""
    config = configparser.ConfigParser()
    config.read(CONFIG_FILE)
    # Check we have data for all the data sources
    for connector_name in sorted(get_connectors()):
        data_file = os.path.join("data", connector_name + ".json")
        with open(data_file) as handle:
            json.load(handle)
def get_study_sections(cls):
    """Return the names of every study offered by the enrich backends.

    A study name may include an extra ":<param>" suffix so that several
    entries with different configs can coexist.
    """
    study_names = {
        study.__name__
        for backends in get_connectors().values()
        for study in backends[2]().studies
    }
    return tuple(study_names)
def setUpClass(cls):
    cls.config = configparser.ConfigParser()
    cls.config.read(CONFIG_FILE)
    cls.es_con = dict(cls.config.items('ElasticSearch'))['url']
    cls.connectors = get_connectors()
    # Sorting hat settings; credentials are optional in the config file
    db_section = cls.config['Database'] if 'Database' in cls.config else {}
    cls.db_user = db_section.get('user', '')
    cls.db_password = db_section.get('password', '')
def test_enrich(self, sortinghat=False, projects=False):
    """Test enrich all sources.

    :param sortinghat: enable SortingHat identities support
    :param projects: enable projects-mapping support
    """
    config = configparser.ConfigParser()
    config.read(CONFIG_FILE)
    es_con = dict(config.items('ElasticSearch'))['url']
    # Database credentials are optional in the config file
    db_user = ''
    db_password = ''
    if 'Database' in config:
        if 'user' in config['Database']:
            db_user = config['Database']['user']
        if 'password' in config['Database']:
            db_password = config['Database']['password']
    logging.info("Enriching data in: %s", es_con)
    connectors = get_connectors()
    for con in sorted(connectors.keys()):
        perceval_backend = None
        ocean_index = "test_" + con
        enrich_index = "test_" + con + "_enrich"
        clean = False
        ocean_backend = connectors[con][1](perceval_backend)
        elastic_ocean = get_elastic(es_con, ocean_index, clean, ocean_backend)
        ocean_backend.set_elastic(elastic_ocean)
        clean = True
        # The previous if/elif chain left `enrich_backend` unbound when
        # both sortinghat and projects were True (NameError later on);
        # a kwargs dict covers every combination.
        enrich_args = {}
        if sortinghat:
            enrich_args.update(db_sortinghat=DB_SORTINGHAT,
                               db_user=db_user, db_password=db_password)
        if projects:
            enrich_args.update(db_projects_map=DB_PROJECTS,
                               db_user=db_user, db_password=db_password)
        enrich_backend = connectors[con][2](**enrich_args)
        elastic_enrich = get_elastic(es_con, enrich_index, clean, enrich_backend)
        enrich_backend.set_elastic(elastic_enrich)
        if sortinghat:
            # Load SH identities
            load_identities(ocean_backend, enrich_backend)
        # Every raw item must produce exactly one enriched item
        raw_count = len([item for item in ocean_backend.fetch()])
        enrich_count = enrich_backend.enrich_items(ocean_backend)
        self.assertEqual(raw_count, enrich_count)
def test_data_load(self):
    """Test load all sources JSON data into ES"""
    config = configparser.ConfigParser()
    config.read(CONFIG_FILE)
    es_con = dict(config.items('ElasticSearch'))['url']
    logging.info("Loading data in: %s", es_con)
    connectors = get_connectors()
    for connector_name in sorted(connectors):
        with open(os.path.join("data", connector_name + ".json")) as handle:
            items = json.load(handle)
        # One raw ("ocean") index per connector, recreated from scratch
        perceval_backend = None
        ocean_backend = connectors[connector_name][1](perceval_backend)
        elastic_ocean = get_elastic(es_con, "test_" + connector_name,
                                    True, ocean_backend)
        ocean_backend.set_elastic(elastic_ocean)
        self.__data2es(items, ocean_backend)
def test_refresh_identities(self):
    """Test refresh identities for all sources"""
    # self.test_enrich_sh() # Load the identities in ES
    config = configparser.ConfigParser()
    config.read(CONFIG_FILE)
    es_con = dict(config.items('ElasticSearch'))['url']
    logging.info("Refreshing data in: %s", es_con)
    connectors = get_connectors()
    for connector_name in sorted(connectors):
        enrich_backend = connectors[connector_name][2](
            db_sortinghat=DB_SORTINGHAT)
        elastic_enrich = get_elastic(es_con,
                                     "test_" + connector_name + "_enrich",
                                     False, enrich_backend)
        enrich_backend.set_elastic(elastic_enrich)
        logging.info("Refreshing identities fields in enriched index %s",
                     elastic_enrich.index_url)
        self.__refresh_identities(enrich_backend)
def test_data_load_error(self):
    """Test that loading malformed items inserts fewer docs than were given"""
    config = configparser.ConfigParser()
    config.read(CONFIG_FILE)
    es_con = dict(config.items('ElasticSearch'))['url']
    logging.info("Loading data in: %s", es_con)
    connector = get_connectors()['functest']
    with open(os.path.join("data", "functest_wrong.json")) as handle:
        items = json.load(handle)
    # Fresh raw index for the functest connector
    perceval_backend = None
    ocean_backend = connector[1](perceval_backend)
    elastic_ocean = get_elastic(es_con, "test_functest", True, ocean_backend)
    ocean_backend.set_elastic(elastic_ocean)
    # Malformed items must be rejected, so fewer docs end up inserted
    inserted = self.__data2es(items, ocean_backend)
    self.assertGreater(len(items), inserted)
def find_perceval_backend(es_url, index):
    """Detect which perceval backend produced an index and return it.

    Fetches one document from `index` and inspects its fields to decide
    whether the index is raw or enriched and which data source produced it.

    :param es_url: Elasticsearch base URL
    :param index: name of the index to inspect
    :return: an ocean/enrich backend instance, or None for the special
             raw cases (twitter, googleSearchHits, apache) that have no
             backend class
    """
    backend = None

    # Backend connectors
    connectors = get_connectors()

    # Get the first item to detect the data source and raw/enriched type
    res = requests.get('%s/%s/_search?size=1' % (es_url, index))
    first_item = res.json()['hits']['hits'][0]['_source']
    fields = first_item.keys()
    if 'metadata__enriched_on' in fields:
        enrich_class = first_item['metadata__gelk_backend_name']
        logging.debug("Detected enriched index for %s", enrich_class)
        # Time to get the mapping
        con_name = get_connector_name_from_cls_name(enrich_class)
        logging.debug("Getting the mapping for %s", con_name)
        klass = connectors[con_name][2]
        backend = klass()
    elif 'perceval_version' in fields:
        logging.debug("Detected raw index for %s", first_item['backend_name'])
        con_name = get_connector_name_from_cls_name(first_item['backend_name'])
        klass = connectors[con_name][1]
        backend = klass(None)
    elif 'retweet_count' in fields:
        con_name = 'twitter'
        logging.debug("Detected raw index for %s", con_name)
    elif 'type' in fields and first_item['type'] == 'googleSearchHits':
        logging.debug("Detected raw index for googleSearchHits")
    elif 'httpversion' in fields:
        logging.debug("Detected raw index for apache")
    else:
        # Message reworded: original was garbled ("Can not find is the
        # index if raw or enriched")
        logging.error("Can not find if the index is raw or enriched: %s", index)
        sys.exit(1)

    return backend
def fetch(self):
    """Yield the DataSource objects selected by the current state.

    Filter priority (only the first matching branch runs): no/empty
    state -> every connector supported by grimoire_elk; explicit data
    source names; repository views; projects; ecosystem name.
    """
    if not self.state or self.state.is_empty():
        # No filter: one (unsaved) DataSource per supported connector
        supported_data_sources = list(gelk_utils.get_connectors())
        for data_source_name in supported_data_sources:
            data_source = DataSource(name=data_source_name)
            yield data_source
    elif self.state.data_sources:
        data_sources = DataSource.objects.filter(name__in=self.state.data_sources)
        for data_source in data_sources:
            yield data_source
    elif self.state.repository_views:
        views = RepositoryView.objects.filter(id__in=self.state.repository_views)
        for data_source in self.__fetch_from_repository_views(views):
            yield data_source
    elif self.state.projects:
        projects = Project.objects.filter(name__in=self.state.projects)
        for data_source in self.__fetch_from_projects(projects):
            yield data_source
    elif self.state.eco_name:
        # NOTE(review): Ecosystem.objects.get raises DoesNotExist for an
        # unknown name — presumably validated upstream; confirm.
        ecosystem = Ecosystem.objects.get(name=self.state.eco_name)
        projects = ecosystem.projects.all()
        for data_source in self.__fetch_from_projects(projects):
            yield data_source
def setUp(self):
    # Resolve test-data locations relative to this file
    tests_dir = os.path.dirname(os.path.realpath(__file__))
    self.__tests_dir = tests_dir
    self.__events_dir = os.path.join(tests_dir, "data/events/")
    self.connectors = get_connectors()
def __get_backends(self):
    """Return gelk backend names plus the extra non-gelk ones."""
    backends = list(get_connectors())
    backends.append("google_hits")
    return backends
def test_init(self):
    """Test whether the backends can be loaded """
    loaded_backends = len(get_connectors())
    self.assertEqual(loaded_backends, NUMBER_BACKENDS)
from grimoire_elk.utils import get_connectors

# Dump the registered connector table for quick inspection
connectors = get_connectors()
print(connectors)