def test_physicochemical_distances_moments(self, structure_klifs_ids, normalized):
    """
    Test the per-structure feature group getters (physicochemical, distances, moments).

    For each feature group, check that the returned table is indexed by the input structure
    KLIFS IDs, has the expected column names, and holds lists as cell values.

    Notes
    -----
    Input structure KLIFS IDs must be able to generate a valid fingerprint, otherwise test
    will fail.
    """

    fingerprints = FingerprintGenerator.from_structure_klifs_ids(structure_klifs_ids)

    # Physicochemical features: expected column names depend on the normalization flag
    expected_physicochemical_columns = (
        FEATURE_NAMES_PHYSICOCHEMICAL if normalized else FEATURE_NAMES_PHYSICOCHEMICAL_DICT
    )
    physicochemical = fingerprints.physicochemical(normalized)
    assert physicochemical.index.to_list() == structure_klifs_ids
    assert physicochemical.columns.to_list() == expected_physicochemical_columns
    assert isinstance(physicochemical.iloc[0, 0], list)

    # Distances features
    distances = fingerprints.distances(normalized)
    assert distances.index.to_list() == structure_klifs_ids
    assert distances.columns.to_list() == FEATURE_NAMES_DISTANCES_AND_MOMENTS
    assert isinstance(distances.iloc[0, 0], list)

    # Moments features
    moments = fingerprints.moments(normalized)
    assert moments.index.to_list() == structure_klifs_ids
    assert moments.columns.to_list() == FEATURE_NAMES_DISTANCES_AND_MOMENTS
    assert isinstance(moments.iloc[0, 0], list)
def from_structure_klifs_ids(cls, structure_klifs_ids, klifs_session=None, n_cores=1):
    """
    Calculate feature distances for all possible structure pairs.

    Fingerprints are generated first for the input structures; the resulting fingerprint
    generator is then handed to `cls.from_fingerprint_generator`.

    Parameters
    ----------
    structure_klifs_ids : list of int
        Input structure KLIFS IDs (output fingerprints may contain less IDs because some
        structures could not be encoded).
    klifs_session : opencadd.databases.klifs.session.Session
        Local or remote KLIFS session.
    n_cores : int or None
        Number of cores as defined by the user; forwarded to both the fingerprint generation
        and the feature distances generation.

    Returns
    -------
    kissim.comparison.FeatureDistancesGenerator
        Feature distances generator.
    """

    # Step 1: Encode structures as fingerprints
    fingerprints = FingerprintGenerator.from_structure_klifs_ids(
        structure_klifs_ids, klifs_session, n_cores
    )

    # Step 2: Compare fingerprints
    return cls.from_fingerprint_generator(fingerprints, n_cores)
def test_physicochemical_distances_moments_exploded(self, structure_klifs_ids, normalized):
    """
    Test the exploded feature group getters (physicochemical, distances, moments).

    Exploded tables carry a two-level index: the structure KLIFS ID plus either the residue
    index (1-85) or the moment (1-3). Index levels, column names and dtypes are checked.

    Notes
    -----
    Input structure KLIFS IDs must be able to generate a valid fingerprint, otherwise test
    will fail.
    """

    fingerprints = FingerprintGenerator.from_structure_klifs_ids(structure_klifs_ids)
    n_structures = len(structure_klifs_ids)

    def _index_structure_klifs_id(multiplier):
        """Repeat each structure KLIFS ID `multiplier` times, keeping the input order."""
        return [
            structure_klifs_id
            for structure_klifs_id in structure_klifs_ids
            for _ in range(multiplier)
        ]

    # Expected second index level: one full run per structure
    index_residue_ix = list(range(1, 86)) * n_structures
    index_moment = list(range(1, 4)) * n_structures

    # Physicochemical features: expected column names depend on the normalization flag
    physicochemical = fingerprints.physicochemical_exploded(normalized)
    physicochemical_index = physicochemical.index
    assert physicochemical_index.get_level_values(
        "structure_klifs_id"
    ).to_list() == _index_structure_klifs_id(85)
    assert physicochemical_index.get_level_values("residue_ix").to_list() == index_residue_ix
    expected_physicochemical_columns = (
        FEATURE_NAMES_PHYSICOCHEMICAL if normalized else FEATURE_NAMES_PHYSICOCHEMICAL_DICT
    )
    assert physicochemical.columns.to_list() == expected_physicochemical_columns
    assert physicochemical.dtypes.unique() == "float64"

    # Distances features
    distances = fingerprints.distances_exploded(normalized)
    distances_index = distances.index
    assert distances_index.get_level_values(
        "structure_klifs_id"
    ).to_list() == _index_structure_klifs_id(85)
    assert distances_index.get_level_values("residue_ix").to_list() == index_residue_ix
    assert distances.columns.to_list() == FEATURE_NAMES_DISTANCES_AND_MOMENTS
    assert distances.dtypes.unique() == "float64"

    # Moments features
    moments = fingerprints.moments_exploded(normalized)
    moments_index = moments.index
    assert moments_index.get_level_values(
        "structure_klifs_id"
    ).to_list() == _index_structure_klifs_id(3)
    assert moments_index.get_level_values("moment").to_list() == index_moment
    assert moments.columns.to_list() == FEATURE_NAMES_DISTANCES_AND_MOMENTS
    assert moments.dtypes.unique() == "float64"
def compare_from_cli(args):
    """
    Compare fingerprints (CLI entry point).

    Sets up logging to "distances.log" inside the output directory, loads the fingerprints
    from the input JSON file and runs the comparison.

    Parameters
    ----------
    args : argparse.Namespace
        CLI arguments; the attributes `input`, `output`, `weights` and `ncores` are used.
    """

    log_filepath = Path(args.output) / "distances.log"
    configure_logger(log_filepath)

    fingerprints = FingerprintGenerator.from_json(args.input)
    compare(fingerprints, args.output, args.weights, args.ncores)
def outliers(fingerprints_path, distance_cutoff, fingerprints_wo_outliers_path=None):
    """
    Remove outlier fingerprints (defined by spatial distances maximum).

    A fingerprint counts as outlier if any of its distances values exceeds the cutoff.

    Parameters
    ----------
    fingerprints_path : str or pathlib.Path
        Path to fingerprints JSON file.
    distance_cutoff : float
        Tolerated distance maximum; fingerprints with distances greater than this cutoff will
        be removed.
    fingerprints_wo_outliers_path : None or str or pathlib.Path
        Path to fingerprints JSON file with outliers removed. If None (default), nothing is
        written to disk.

    Returns
    -------
    kissim.encoding.FingerprintGenerator
        Fingerprints without outliers.
    """

    # Load fingerprints
    logger.info("Read fingerprints...")
    fingerprint_generator = FingerprintGenerator.from_json(Path(fingerprints_path))
    logger.info(f"Number of fingerprints: {len(fingerprint_generator.data)}")

    # Find structures/fingerprints IDs to be removed
    logger.info(
        "Use the following distance minimum/maximum cutoffs"
        f" to identify outlier structures: {distance_cutoff}"
    )
    outlier_structure_ids = [
        structure_id
        for structure_id, fingerprint in fingerprint_generator.data.items()
        # Two-step reduction: collapse the comparison result down to a single flag
        if (fingerprint.distances > distance_cutoff).any().any()
    ]
    logger.info(f"Structure IDs to be removed: {outlier_structure_ids}")

    # Remove fingerprints (IDs were collected beforehand, so the dict is not modified while
    # it is being iterated)
    logger.info("Remove fingerprints with distance outliers...")
    for outlier_structure_id in outlier_structure_ids:
        del fingerprint_generator.data[outlier_structure_id]
    logger.info(f"Number of fingerprints: {len(fingerprint_generator.data)}")

    # Optionally: Save to file
    if fingerprints_wo_outliers_path is not None:
        logger.info(f"Save cleaned fingerprints to {fingerprints_wo_outliers_path}...")
        fingerprint_generator.to_json(Path(fingerprints_wo_outliers_path))

    return fingerprint_generator
def test_to_from_json(self, structure_klifs_ids, normalize, values_array_sum):
    """
    Test saving fingerprints to a json file and loading them back from it.

    The reloaded generator must hold the same structure KLIFS IDs as the original one, carry
    normalized data only if requested, and reproduce the expected sum over all fingerprint
    values.
    """

    fingerprints = FingerprintGenerator.from_structure_klifs_ids(structure_klifs_ids, LOCAL, 1)
    json_filepath = Path("fingerprints.json")

    with enter_temp_directory():

        # Round trip: write json file, then read it again
        fingerprints.to_json(json_filepath)
        assert json_filepath.exists()
        fingerprints_reloaded = FingerprintGenerator.from_json(json_filepath, normalize)
        assert isinstance(fingerprints_reloaded, FingerprintGenerator)

        # Attribute data: same structures in the same order
        original_ids = list(fingerprints.data.keys())
        reloaded_ids = list(fingerprints_reloaded.data.keys())
        assert original_ids == reloaded_ids

        # Attribute data_normalized: only available if normalization was requested on load
        if not normalize:
            assert fingerprints_reloaded.data_normalized is None
        else:
            assert list(fingerprints.data_normalized.keys()) == list(
                fingerprints_reloaded.data_normalized.keys()
            )

        # Values: compare the NaN-ignoring sum over all reloaded fingerprints
        values_array_sum_calculated = sum(
            np.nansum(fingerprint.values_array(True, True, True))
            for fingerprint in fingerprints_reloaded.data.values()
        )
        assert pytest.approx(values_array_sum_calculated, abs=1e-4) == values_array_sum
def test_from_structure_klifs_id(
    self, structure_klifs_ids, klifs_session, n_cores, fingerprints_values_array_sum
):
    """
    Test if fingerprints can be generated locally and remotely in sequence and in parallel.

    Checks the generator's attributes (structure KLIFS IDs, KLIFS session, raw and normalized
    fingerprint data) and its subpocket centers table.
    """

    fingerprints = FingerprintGenerator.from_structure_klifs_ids(
        structure_klifs_ids, klifs_session, n_cores
    )

    # Attribute: structure_klifs_id
    assert fingerprints.structure_klifs_ids == structure_klifs_ids

    # Attribute: klifs_session
    if klifs_session is None:
        # If no session was provided, use set up remote session
        assert fingerprints.klifs_session._client is not None
    else:
        assert fingerprints.klifs_session == klifs_session

    # Attribute: data
    data = fingerprints.data
    assert isinstance(data, dict)
    assert all(isinstance(structure_klifs_id, int) for structure_klifs_id in data.keys())
    assert all(isinstance(fingerprint, Fingerprint) for fingerprint in data.values())
    fingerprints_values_array_sum_calculated = sum(
        np.nansum(fingerprint.values_array(True, True, True)) for fingerprint in data.values()
    )
    assert (
        pytest.approx(fingerprints_values_array_sum_calculated, abs=1e-4)
        == fingerprints_values_array_sum
    )

    # Attribute: data_normalized
    data_normalized = fingerprints.data_normalized
    assert isinstance(data_normalized, dict)
    assert all(
        isinstance(structure_klifs_id, int) for structure_klifs_id in data_normalized.keys()
    )
    assert all(
        isinstance(fingerprint, FingerprintNormalized)
        for fingerprint in data_normalized.values()
    )

    # Property: subpocket_centers
    assert isinstance(fingerprints.subpocket_centers, pd.DataFrame)
    assert fingerprints.subpocket_centers.index.to_list() == structure_klifs_ids
    # Columns are two-level: each feature name is expected three times, once per coordinate
    columns = fingerprints.subpocket_centers.columns
    expected_names = np.repeat(FEATURE_NAMES_DISTANCES_AND_MOMENTS, 3).tolist()
    expected_coordinates = ["x", "y", "z"] * len(FEATURE_NAMES_DISTANCES_AND_MOMENTS)
    assert columns.get_level_values(0).to_list() == expected_names
    assert columns.get_level_values(1).to_list() == expected_coordinates
def test_main_encode(args):
    """
    Test CLI for encoding using subprocesses.

    The command line given as whitespace-separated string is run in a temporary directory;
    afterwards the fingerprints json file and the log file must exist, and the json file
    must be loadable as fingerprint generator.
    """

    json_filepath = Path("fingerprints.json")
    log_filepath = json_filepath.with_suffix(".log")
    command = args.split()

    with enter_temp_directory():
        # check=True: a non-zero exit code of the CLI call fails the test
        subprocess.run(command, check=True)

        # Json file there?
        assert json_filepath.exists()
        # Log file there?
        assert log_filepath.exists()

        # Json file can be loaded as FingerprintGenerator object?
        fingerprint_generator = FingerprintGenerator.from_json(json_filepath)
        assert isinstance(fingerprint_generator, FingerprintGenerator)
        first_fingerprint = list(fingerprint_generator.data.values())[0]
        assert isinstance(first_fingerprint, Fingerprint)
def encode(
    structure_klifs_ids, fingerprints_filepath=None, local_klifs_download_path=None, n_cores=1
):
    """
    Encode structures as fingerprints and optionally save them to a json file.

    Parameters
    ----------
    structure_klifs_ids : list of int
        Structure KLIFS IDs.
    fingerprints_filepath : str or pathlib.Path
        Path to output json file. Default None (fingerprints are not written to disk).
    local_klifs_download_path : str or None
        If path to local KLIFS download is given, set up local KLIFS session.
        If None is given, set up remote KLIFS session.
    n_cores : int
        Number of cores used to generate fingerprints.

    Returns
    -------
    kissim.encoding.FingerprintGenerator
        Fingerprints.
    """

    # Generate fingerprints using a KLIFS session set up from the given download path
    session = _setup_klifs_session(local_klifs_download_path)
    fingerprint_generator = FingerprintGenerator.from_structure_klifs_ids(
        structure_klifs_ids, session, n_cores
    )

    # Optionally: Save fingerprints to json file
    if fingerprints_filepath:
        logger.info(f"Write fingerprints to file: {fingerprints_filepath}")
        fingerprint_generator.to_json(fingerprints_filepath)

    return fingerprint_generator