def execute(self):
    """Initialize SortingHat and gather identities from this backend's raw data.

    FIXME: the SortingHat Init should be called just once, not per task run.
    """
    # code = 0 when command success
    code = Init(**self.sh_kwargs).run(self.db_sh)
    if code != 0:
        # Surface a failed database init instead of silently dropping the
        # return code (consistent with the SortingHat init task).
        logger.warning("Can not create the SortingHat database")

    if not self.backend_section:
        logger.error("Backend not configured in TaskIdentitiesCollection %s",
                     self.backend_section)
        return

    backend_conf = self.config.get_conf()[self.backend_section]

    # Backends with collection disabled have no raw data to read ids from
    if 'collect' in backend_conf and not backend_conf['collect']:
        logger.info("Don't load ids from a backend without collection %s",
                    self.backend_section)
        return

    if self.load_ids:
        logger.info("[%s] Gathering identities from raw data",
                    self.backend_section)
        enrich_backend = self._get_enrich_backend()
        ocean_backend = self._get_ocean_backend(enrich_backend)
        load_identities(ocean_backend, enrich_backend)
def execute(self):
    """Create (initialize) the SortingHat database.

    ``Init.run`` returns 0 on success.
    """
    code = Init(**self.sh_kwargs).run(self.db_sh)
    if code != 0:
        logger.warning("Can not create the SortingHat database")
    else:
        # Only claim the database is initialized when Init actually succeeded;
        # previously this was logged unconditionally, which was misleading.
        logger.debug("Sortinghat initialized")
def setUp(self):
    """Prepare two Init commands against freshly named test databases."""
    if not (hasattr(sys.stdout, 'getvalue') or hasattr(sys.stderr, 'getvalue')):
        self.fail('This test needs to be run in buffered mode')

    # Unique, disposable database names for this test run
    self.name = 'tmp' + uuid.uuid4().hex
    self.name_reuse = 'tmp' + uuid.uuid4().hex

    parser = configparser.ConfigParser()
    parser.read(CONFIG_FILE)
    db_section = parser['Database']

    # Connection parameters shared by both commands, read from the config file
    self.kwargs = {key: db_section[key]
                   for key in ('user', 'password', 'host', 'port')}
    self.cmd = Init(database=self.name, **self.kwargs)
    self.cmd_reuse = Init(database=self.name_reuse, **self.kwargs)
def run(self):
    """Initialize SortingHat and gather identities from raw data.

    FIXME: the SortingHat Init should be called just once, not per task run.
    """
    # code = 0 when command success
    code = Init(**self.sh_kwargs).run(self.db_sh)
    if code != 0:
        # Surface a failed database init instead of ignoring the return code
        logger.warning("Can not create the SortingHat database")

    if not self.backend_name:
        logger.error("Backend not configured in TaskIdentitiesCollection.")
        return

    if self.load_ids:
        # Lazy %-args instead of eager string formatting in the logger call
        logger.info("[%s] Gathering identities from raw data", self.backend_name)
        enrich_backend = self.get_enrich_backend()
        ocean_backend = self._get_ocean_backend(enrich_backend)
        load_identities(ocean_backend, enrich_backend)
def test_connection_error(self):
    """Check connection errors"""

    kwargs = {'user' : 'nouser',
              'password' : 'nopassword',
              'database' : None,
              'host' : 'localhost',
              'port' : '3306'}

    cmd = Init(**kwargs)
    code = cmd.run(self.name)
    self.assertEqual(code, CODE_DATABASE_ERROR)

    # Context added to catch deprecation warnings raised on Python 3
    with warnings.catch_warnings(record=True):
        output = sys.stderr.getvalue().strip()
        # assertRegex replaces assertRegexpMatches, which was deprecated
        # since Python 3.2 and removed in Python 3.12
        self.assertRegex(output, DB_ACCESS_ERROR % {'user' : 'nouser'})
def setUp(self):
    """Set up an Init command pointing at a throwaway database."""
    if not (hasattr(sys.stdout, 'getvalue') or hasattr(sys.stderr, 'getvalue')):
        self.fail('This test needs to be run in buffered mode')

    # Fresh, collision-free database name for this run
    self.name = 'tmp' + uuid.uuid4().hex

    # Command under test, built from module-level connection settings
    self.kwargs = dict(user=DB_USER,
                       password=DB_PASSWORD,
                       database=self.name,
                       host=DB_HOST,
                       port=DB_PORT)
    self.cmd = Init(**self.kwargs)
def test_connection_error(self):
    """Check connection errors"""

    kwargs = {'user' : 'nouser',
              'password' : 'nopassword',
              'database' : None,
              'host' : 'localhost',
              'port' : '3306',
              'reuse': False}

    cmd = Init(**kwargs)
    code = cmd.initialize(self.name)
    self.assertEqual(code, CODE_DATABASE_ERROR)

    # Context added to catch deprecation warnings raised on Python 3
    with warnings.catch_warnings(record=True):
        output = sys.stderr.getvalue().strip()
        # assertRegex replaces assertRegexpMatches, which was deprecated
        # since Python 3.2 and removed in Python 3.12
        self.assertRegex(output, DB_ACCESS_ERROR % {'user' : 'nouser'})
def run(self):
    """Initialize SortingHat, then load organizations and identities files.

    Orgs come from ``sh_orgs_file``; identities from the comma-separated
    ``sh_ids_file`` list.
    """
    # code = 0 when command success
    code = Init(**self.sh_kwargs).run(self.db_sh)

    if self.load_orgs:
        logger.info("[sortinghat] Loading orgs from file %s",
                    self.conf['sh_orgs_file'])
        code = Load(**self.sh_kwargs).run("--orgs", self.conf['sh_orgs_file'])
        if code != CMD_SUCCESS:
            logger.error("[sortinghat] Error loading %s",
                         self.conf['sh_orgs_file'])
        # FIXME get the number of loaded orgs

    if 'sh_ids_file' in self.conf:
        filenames = self.conf['sh_ids_file'].split(',')
        for f in filenames:
            # Strip spaces (used as separators in the config list) BEFORE
            # logging, so the log shows the exact filename being loaded
            f = f.replace(' ', '')
            logger.info("[sortinghat] Loading identities from file %s", f)
            code = Load(**self.sh_kwargs).run("--identities", f)
            if code != CMD_SUCCESS:
                logger.error("[sortinghat] Error loading %s", f)
def execute(self):
    """Initialize SortingHat and load organizations and identities.

    Waits until no enrichment task is active (identities loading and
    enrichment must not run concurrently), flags the load as in progress
    via TasksManager.IDENTITIES_TASKS_ON, loads orgs from a SortingHat
    JSON file and identities from files in SortingHat or GrimoireLab
    format, and always clears the flag afterwards.
    """

    def is_remote(filename):
        """ Naive implementation. To be evolved """
        remote = False
        if 'http' in filename:
            return True
        return remote

    def load_identities_file(filename, reset=False):
        """
        Load an identities file in Sortinghat with reset option

        The reset option cleans all merges and affiliations in identities that
        will be loaded to honor the identities grouping from the file.
        """
        if reset:
            logger.info("[sortinghat] Loading identities with reset from file %s",
                        filename)
            code = Load(**self.sh_kwargs).run("--reset", "--identities", filename)
        else:
            logger.info("[sortinghat] Loading identities from file %s", filename)
            code = Load(**self.sh_kwargs).run("--identities", filename)
        if code != CMD_SUCCESS:
            logger.error("[sortinghat] Error loading %s", filename)
        logger.info("[sortinghat] End of loading identities from file %s", filename)

    def load_sortinghat_identities(config):
        """ Load identities from a file in SortingHat JSON format """
        cfg = config.get_conf()
        filenames = cfg['sortinghat']['identities_file']
        api_token = cfg['sortinghat']['identities_api_token']
        for filename in filenames:
            filename = filename.replace(' ', '')  # spaces used in config file list
            if filename == '':
                continue
            if is_remote(filename):
                # Use the GitHub Data API to get the file
                # First we need the SHA for this file
                try:
                    # https://github.com/<owner>/<repo>/blob/<branch>/<sh_identities>
                    repo_file = filename.rsplit("/", 1)[1]
                    repository_raw = filename.rsplit("/", 1)[0]
                    repository = repository_raw.rsplit("/", 2)[0]
                    repository_api = repository.replace('github.com',
                                                        'api.github.com/repos')
                    # repository_type = repository_raw.rsplit("/", 2)[1]
                    repository_branch = repository_raw.rsplit("/", 2)[2]
                    repo_file_sha = \
                        TaskIdentitiesExport.sha_github_file(
                            config, repo_file, repository_api, repository_branch)
                    if not repo_file_sha:
                        logger.error("Can't find identities file %s. Not loading identities",
                                     filename)
                        return
                    file_url = repository_api + "/git/blobs/" + repo_file_sha
                    headers = {"Authorization": "token " + api_token}
                    res = requests.get(file_url, headers=headers)
                    res.raise_for_status()
                    with tempfile.NamedTemporaryFile() as temp:
                        temp.write(base64.b64decode(res.json()['content']))
                        temp.flush()
                        load_identities_file(temp.name,
                                             cfg['sortinghat']['reset_on_load'])
                except IndexError as ex:
                    logger.error("Can not load identities from: %s", filename)
                    logger.debug("Expected format: https://github.com/owner/repo/blob/master/file")
                    logger.debug(ex)
            else:
                load_identities_file(filename, cfg['sortinghat']['reset_on_load'])

    def load_grimoirelab_identities(config):
        """ Load identities from files in GrimoireLab YAML format """
        logger.info("Loading GrimoireLab identities in SortingHat")
        cfg = config.get_conf()
        # Get the identities
        identities_url = cfg['sortinghat']['identities_file'][0]
        if not is_remote(identities_url):
            identities_filename = identities_url
        else:
            # The file should be in gitlab in other case
            if 'identities_api_token' not in cfg['sortinghat']:
                logger.error("API Token not provided. Identities won't be loaded")
                return
            token = cfg['sortinghat']['identities_api_token']
            res = requests.get(identities_url, headers={"PRIVATE-TOKEN": token})
            res.raise_for_status()
            identities = tempfile.NamedTemporaryFile()
            identities.write(res.content)
            # Flush so grimoirelab2sh sees the full file contents (without
            # this the OS buffer may still hold part of the download)
            identities.flush()
            identities_filename = identities.name
        # Convert to a JSON file in SH format
        # grimoirelab2sh -i identities.yaml -s ssf:manual -o ssf.json
        json_identities = tempfile.mktemp()
        cmd = ['grimoirelab2sh', '-i', identities_filename,
               '-s', cfg['general']['short_name'] + ':manual',
               '-o', json_identities]
        if self.__execute_command(cmd) != 0:
            logger.error('Can not generate the SH JSON file from '
                         'GrimoireLab yaml file. Do the files exists? '
                         'Is the API token right?')
        else:
            # Load the JSON file in SH format
            load_identities_file(json_identities, cfg['sortinghat']['reset_on_load'])
        # Closing tmp files so they are removed for the remote case
        if is_remote(identities_url):
            identities.close()
        os.remove(json_identities)

    # ** START SYNC LOGIC **
    # Check that enrichment tasks are not active before loading identities
    while True:
        time.sleep(1)  # check each second if the identities load could start
        with TasksManager.IDENTITIES_TASKS_ON_LOCK:
            with TasksManager.NUMBER_ENRICH_TASKS_ON_LOCK:
                enrich_tasks = TasksManager.NUMBER_ENRICH_TASKS_ON
                logger.debug("Enrich tasks active: %i", enrich_tasks)
                if enrich_tasks == 0:
                    # The load of identities can be started
                    TasksManager.IDENTITIES_TASKS_ON = True
                    break
    # ** END SYNC LOGIC **

    cfg = self.config.get_conf()

    try:
        # code = 0 when command success
        code = Init(**self.sh_kwargs).run(self.db_sh)

        # Basic loading of organizations from a SH JSON file. Legacy stuff.
        if 'load_orgs' in cfg['sortinghat'] and cfg['sortinghat']['load_orgs']:
            if 'orgs_file' not in cfg['sortinghat'] or not cfg['sortinghat']['orgs_file']:
                logger.error("Load orgs active but no orgs_file configured")
            else:
                logger.info("[sortinghat] Loading orgs from file %s",
                            cfg['sortinghat']['orgs_file'])
                code = Load(**self.sh_kwargs).run("--orgs",
                                                  cfg['sortinghat']['orgs_file'])
                if code != CMD_SUCCESS:
                    logger.error("[sortinghat] Error loading %s",
                                 cfg['sortinghat']['orgs_file'])
                # FIXME get the number of loaded orgs

        # Identities loading from files. It could be in several formats.
        # Right now GrimoireLab and SortingHat formats are supported
        if 'identities_file' in cfg['sortinghat']:
            if cfg['sortinghat']['identities_format'] == 'sortinghat':
                load_sortinghat_identities(self.config)
            elif cfg['sortinghat']['identities_format'] == 'grimoirelab':
                load_grimoirelab_identities(self.config)
    finally:
        # Always release the in-progress flag, even when loading raises;
        # otherwise enrichment tasks would be blocked forever (this was
        # a FIXME in the previous version of this task).
        with TasksManager.IDENTITIES_TASKS_ON_LOCK:
            TasksManager.IDENTITIES_TASKS_ON = False
def execute(self):
    # Initialize SortingHat and load organizations and identities, skipping
    # files whose content hash has not changed since the last run. Waits for
    # enrichment tasks to finish before starting (the two must not overlap),
    # and runs `sortinghat unify` afterwards when any identities file changed.

    def is_remote(filename):
        """ Naive implementation. To be evolved """
        remote = False
        if 'http' in filename:
            return True
        return remote

    def load_identities_file(filename, reset=False):
        """
        Load an identities file in Sortinghat with reset option

        The reset option cleans all merges and affiliations in identities that
        will be loaded to honor the identities grouping from the file.
        """
        if reset:
            logger.info("[sortinghat] Loading identities with reset from file %s",
                        filename)
            code = Load(**self.sh_kwargs).run("--reset", "--identities", filename)
        else:
            # Skip the load when this exact content was already loaded;
            # 'has_changed' drives the unify step at the end of execute()
            content_hash = get_file_hash(filename)
            if content_hash not in self.current_identities_files_hash:
                logger.info("[sortinghat] Loading identities from file %s",
                            filename)
                code = Load(**self.sh_kwargs).run("--identities", filename)
                self.current_identities_files_hash.update(
                    {content_hash: {'filename': filename, 'has_changed': 1}})
            else:
                self.current_identities_files_hash.update(
                    {content_hash: {'filename': filename, 'has_changed': 0}})
                logger.info("[sortinghat] No changes in file %s, identities won't be loaded",
                            filename)
                code = CMD_SUCCESS
        if code != CMD_SUCCESS:
            logger.error("[sortinghat] Error loading %s", filename)
        logger.info("[sortinghat] End of loading identities from file %s", filename)

    def load_sortinghat_identities(config):
        """ Load identities from a file in SortingHat JSON format """
        cfg = config.get_conf()
        filenames = cfg['sortinghat']['identities_file']
        api_token = cfg['sortinghat']['identities_api_token']
        for filename in filenames:
            filename = filename.replace(' ', '')  # spaces used in config file list
            if filename == '':
                continue
            if is_remote(filename):
                # Use the GitHub Data API to get the file
                # First we need the SHA for this file
                try:
                    # https://github.com/<owner>/<repo>/blob/<branch>/<sh_identities>
                    repo_file = filename.rsplit("/", 1)[1]
                    repository_raw = filename.rsplit("/", 1)[0]
                    repository = repository_raw.rsplit("/", 2)[0]
                    repository_api = repository.replace('github.com',
                                                        'api.github.com/repos')
                    # repository_type = repository_raw.rsplit("/", 2)[1]
                    repository_branch = repository_raw.rsplit("/", 2)[2]
                    repo_file_sha = \
                        TaskIdentitiesExport.sha_github_file(
                            config, repo_file, repository_api, repository_branch)
                    if not repo_file_sha:
                        logger.error("Can't find identities file %s. Not loading identities",
                                     filename)
                        return
                    # Fetch the blob by SHA and decode its base64 payload
                    file_url = repository_api + "/git/blobs/" + repo_file_sha
                    headers = {"Authorization": "token " + api_token}
                    res = requests.get(file_url, headers=headers)
                    res.raise_for_status()
                    with tempfile.NamedTemporaryFile() as temp:
                        temp.write(base64.b64decode(res.json()['content']))
                        temp.flush()
                        load_identities_file(temp.name,
                                             cfg['sortinghat']['reset_on_load'])
                except IndexError as ex:
                    # rsplit indexing failed: the URL did not match the
                    # expected GitHub blob layout
                    logger.error("Can not load identities from: %s", filename)
                    logger.debug("Expected format: https://github.com/owner/repo/blob/master/file")
                    logger.debug(ex)
            else:
                load_identities_file(filename, cfg['sortinghat']['reset_on_load'])

    def load_grimoirelab_identities(config):
        """ Load identities from files in GrimoireLab YAML format """
        logger.info("Loading GrimoireLab identities in SortingHat")
        cfg = config.get_conf()
        # Get the identities
        # NOTE(review): only the first entry of identities_file is used here,
        # unlike the SortingHat-format loader which iterates all of them
        identities_url = cfg['sortinghat']['identities_file'][0]
        if not is_remote(identities_url):
            identities_filename = identities_url
        else:
            # The file should be in gitlab in other case
            if 'identities_api_token' not in cfg['sortinghat']:
                logger.error("API Token not provided. Identities won't be loaded")
                return
            token = cfg['sortinghat']['identities_api_token']
            res = requests.get(identities_url, headers={"PRIVATE-TOKEN": token})
            res.raise_for_status()
            identities = tempfile.NamedTemporaryFile()
            identities.write(res.content)
            # Flush so the converter subprocess sees the full contents
            identities.flush()
            identities_filename = identities.name
        # Convert to a JSON file in SH format
        # grimoirelab2sh -i identities.yaml -s ssf:manual -o ssf.json
        json_identities = tempfile.mktemp()
        cmd = ['grimoirelab2sh', '-i', identities_filename,
               '-s', cfg['general']['short_name'] + ':manual',
               '-o', json_identities]
        if not cfg['sortinghat']['strict_mapping']:
            cmd += ['--no-email-validation']
        if self.__execute_command(cmd) != 0:
            logger.error('Can not generate the SH JSON file from '
                         'GrimoireLab yaml file. Do the files exists? '
                         'Is the API token right?')
        else:
            # Load the JSON file in SH format
            load_identities_file(json_identities, cfg['sortinghat']['reset_on_load'])
        # Closing tmp files so they are removed for the remote case
        if is_remote(identities_url):
            identities.close()
        os.remove(json_identities)

    # ** START SYNC LOGIC **
    # Check that enrichment tasks are not active before loading identities
    while True:
        time.sleep(1)  # check each second if the identities load could start
        with TasksManager.IDENTITIES_TASKS_ON_LOCK:
            with TasksManager.NUMBER_ENRICH_TASKS_ON_LOCK:
                enrich_tasks = TasksManager.NUMBER_ENRICH_TASKS_ON
                logger.debug("[load identities] Enrich tasks active: %i",
                             enrich_tasks)
                if enrich_tasks == 0:
                    # The load of identities can be started
                    TasksManager.IDENTITIES_TASKS_ON = True
                    break
    # ** END SYNC LOGIC **

    cfg = self.config.get_conf()

    # code = 0 when command success
    # '--reuse' keeps an already existing SortingHat database
    code = Init(**self.sh_kwargs).run(self.db_sh, '--reuse')

    # Basic loading of organizations from a SH JSON file. Legacy stuff.
    if 'load_orgs' in cfg['sortinghat'] and cfg['sortinghat']['load_orgs']:
        if 'orgs_file' not in cfg['sortinghat'] or not cfg['sortinghat']['orgs_file']:
            logger.error("Load orgs active but no orgs_file configured")
        elif not os.path.exists(cfg['sortinghat']['orgs_file']):
            logger.error("Orgs file not found on disk")
        else:
            orgs_file = cfg['sortinghat']['orgs_file']
            orgs_file_hash = get_file_hash(orgs_file)
            # Only reload orgs when the file content changed since last run
            if not self.current_orgs_file_hash or self.current_orgs_file_hash != orgs_file_hash:
                logger.info("[sortinghat] Loading orgs from file %s", orgs_file)
                code = Load(**self.sh_kwargs).run("--orgs", orgs_file)
                if code != CMD_SUCCESS:
                    logger.error("[sortinghat] Error loading %s", orgs_file)
                self.current_orgs_file_hash = orgs_file_hash
                with open(orgs_file, 'r') as f:
                    json_content = json.loads(f.read())
                logger.info("[sortinghat] %s organizations loaded",
                            len(json_content['organizations']))
            else:
                logger.info("[sortinghat] No changes in file %s, organizations won't be loaded",
                            orgs_file)

    # Identities loading from files. It could be in several formats.
    # Right now GrimoireLab and SortingHat formats are supported
    if 'identities_file' in cfg['sortinghat']:
        try:
            if cfg['sortinghat']['identities_format'] == 'sortinghat':
                load_sortinghat_identities(self.config)
            elif cfg['sortinghat']['identities_format'] == 'grimoirelab':
                load_grimoirelab_identities(self.config)
        except Exception:
            # Release the in-progress flag before propagating, so enrichment
            # tasks are not blocked forever by a failed load
            with TasksManager.IDENTITIES_TASKS_ON_LOCK:
                TasksManager.IDENTITIES_TASKS_ON = False
            raise

    # If one of the identities file has changed, after loading the identities
    # we need to unify in order to mix the identities loaded with then ones
    # from data sources.
    unify = any([v['has_changed']
                 for v in self.current_identities_files_hash.values()])
    if unify:
        cmd = ['sortinghat', '-u', self.db_user, '-p', self.db_password,
               '--host', self.db_host, '-d', self.db_sh]
        cmd += ['unify', '--fast-matching']
        for algo in cfg['sortinghat']['matching']:
            ucmd = cmd + ['-m', algo]
            if not cfg['sortinghat']['strict_mapping']:
                ucmd += ['--no-strict-matching']
            logger.debug("Doing unify after identities load")
            self.__execute_command(ucmd)

    # Normal-path release of the in-progress flag
    with TasksManager.IDENTITIES_TASKS_ON_LOCK:
        TasksManager.IDENTITIES_TASKS_ON = False