def get_hits_similar_to(self, molecule_set=None):
    """Get the molecules in the set that are most similar to the target.

    Molecules are ranked by decreasing (target_molecule, set_molecule)
    similarity and the first ``self.n_hits`` of them are returned. If
    ``self.n_hits`` exceeds the number of molecules, all molecules are
    returned; if it is zero or negative, empty arrays are returned.

    The similarities and their sort order are cached on the instance as
    ``self.similarities_`` and ``self.sorted_similarities_``.

    Args:
        molecule_set (AIMSim.chemical_datastructures MoleculeSet):
            MoleculeSet object used to calculate the similarities. Only
            used if neither self.sorted_similarities_ nor
            self.similarities_ is already set.

    Returns:
        np.ndarray(int): Ids of most similar molecules in decreasing
            order of similarity.
        np.ndarray(float): Corresponding similarity values.

    Raises:
        InvalidConfigurationError: If the similarities still have to be
            computed and no molecule_set was passed.
    """
    if not hasattr(self, 'sorted_similarities_'):
        if not hasattr(self, 'similarities_'):
            if molecule_set is None:
                raise InvalidConfigurationError('MoleculeSet object not '
                                                'passed for task')
            self.similarities_ = molecule_set.compare_against_molecule(
                self.target_molecule)
        self.sorted_similarities_ = np.argsort(self.similarities_)
    # argsort is ascending, so walk it backwards. Slicing (rather than
    # indexing one hit at a time) keeps the integer dtype when there are
    # no hits and clamps when n_hits exceeds the number of molecules.
    n_hits = max(self.n_hits, 0)
    descending_ids = np.asarray(self.sorted_similarities_)[::-1]
    # Copy so callers cannot mutate the cached sort order through the view.
    ids = descending_ids[:n_hits].copy()
    return ids, self.similarities_[ids]
def _set_tasks(self, tasks):
    """Instantiate every recognized task and queue it in ``self.to_do``.

    Unrecognized task names are reported on stdout and skipped.

    Args:
        tasks (dict): The tasks field of the config yaml containing
            various tasks and their parameters.

    Raises:
        InvalidConfigurationError: Re-raised when a task rejects its
            configuration, or raised when ``self.to_do`` is empty after
            all entries have been processed.
    """

    def build_task(name, params):
        # Returns None when the name matches no known task.
        if name == "compare_target_molecule":
            return CompareTargetMolecule(params)
        if name == "visualize_dataset":
            return VisualizeDataset(params)
        if name == "see_property_variation_w_similarity":
            return SeePropertyVariationWithSimilarity(params)
        if name == "identify_outliers":
            return IdentifyOutliers(params)
        if name == "cluster":
            return ClusterData(params)
        return None

    for task_name, task_params in tasks.items():
        try:
            task_obj = build_task(task_name, task_params)
            if task_obj is None:
                print(f"{task_name} not recognized")
                continue
            self.to_do.append(task_obj)
        except InvalidConfigurationError as e:
            print(f"Error in the config file for task: ", task_name)
            print("\n", e)
            raise e
    if not len(self.to_do):
        raise InvalidConfigurationError("No tasks were read, exiting.")
def _extract_configs(self):
    """Read plot, logging and correlation settings from ``self.configs``.

    Sets ``self.plot_settings`` (the "property_plot_settings" entry layered
    over ``{"response": "response"}``), ``self.log_fpath``,
    ``self.correlation_type`` (defaults to 'pearson') and
    ``self.correlation_fn``. If a log file path is configured, its parent
    directory is created when missing.

    Raises:
        InvalidConfigurationError: If correlation_type does not match
            implemented types.
    """
    self.plot_settings = {"response": "response"}
    self.plot_settings.update(
        self.configs.get("property_plot_settings", {}))
    self.log_fpath = self.configs.get("log_file_path", None)
    self.correlation_type = self.configs.get('correlation_type')
    if self.correlation_type is None:
        self.correlation_type = 'pearson'
    if self.correlation_type.lower() in ['pearson', 'linear']:
        self.correlation_fn = pearsonr
    else:
        raise InvalidConfigurationError(
            f'{self.correlation_type} correlation '
            f'not supported')
    if self.log_fpath is not None:
        log_dir = dirname(self.log_fpath)
        # dirname() is '' for a bare file name (log goes to the current
        # directory); makedirs('') would raise FileNotFoundError.
        if log_dir:
            makedirs(log_dir, exist_ok=True)
def __call__(
    self,
    molecule_set_configs,
    fingerprint_type=None,
    fingerprint_params=None,
    similarity_measure=None,
    subsample_subset_size=0.01,
    optim_algo='max_min',
    show_top=0,
    only_metric=True,
):
    """Search for the best fingerprint / similarity measure pair.

    Calculate the correlation in the properties of molecules in set
    and their nearest and furthest neighbors using different
    fingerprints / similarity measure choices. Choose the best fingerprint
    and similarity measure pair (called measure choice for brevity) based
    on an optimization strategy.

    Args:
        molecule_set_configs (dict): All configurations (except
            fingerprint_type, fingerprint_params and similarity_measure)
            needed to form the moleculeSet.
        fingerprint_type (str): Label to indicate which fingerprint to
            use. If supplied, fingerprint is fixed and optimization
            carried out over similarity measures. Use None to indicate
            that optimization needs to be carried out over fingerprints.
            Default is None.
        fingerprint_params (dict): Hyper-parameters for fingerprints.
            Passed to the MoleculeSet constructor. If None is passed,
            set to empty dictionary before passing to MoleculeSet.
            Ignored (reset to empty) when fingerprint_type is None.
        similarity_measure (str): Label to indicate which similarity
            measure to use. If supplied, similarity measure is fixed
            and optimization carried out over fingerprints. Use None to
            indicate that optimization needs to be carried out over
            similarity measures. Default is None.
        subsample_subset_size (float): Fraction of molecule_set to
            subsample. This is separate from the sample_ratio parameter
            used when creating a moleculeSet since it is recommended
            to have a more aggressive subsampling strategy for this task
            due to the combinatorial explosion of looking at multiple
            fingerprints and similarity measures. Default is 0.01.
        optim_algo (str): Label to indicate the optimization algorithm
            chosen. Options are:
            'max': The measure choice which maximizes correlation
                of properties between nearest neighbors (most similar).
            'min': The measure choice which minimizes the absolute value
                of property correlation between furthest neighbors
                (most dissimilar).
            'max_min': The measure choice which maximizes correlation
                of properties between nearest neighbors (most similar)
                and minimizes the absolute value of property correlation
                between furthest neighbors (most dissimilar).
                This is the default.
        show_top (int): Number of top performing measures to show in plot.
            If 0, no plots are generated and the top performer is returned.
        only_metric (bool): If True only similarity measures satisfying
            the metricity property (i.e. can be converted to distance
            metrics) are selected.

    Returns:
        (NamedTuple): Top performer with fields:
            fingerprint_type (str): Label for fingerprint type
            similarity_measure (str): Label for similarity measure
            nearest_neighbor_correlation (float): Correlation of property
                of molecule and its nearest neighbor.
            furthest_neighbor_correlation (float): Correlation of property
                of molecule and its furthest neighbor.
            score_ (float): Overall score based on optimization strategy.
                More is better.

    Raises:
        InvalidConfigurationError: If optim_algo is not one of the
            implemented options.
        KeyError: If show_top > 0 and self.plot_settings has no 'colors'.
        IndexError: If no (fingerprint, similarity measure) pair could be
            evaluated, so there is no top performer to return.
    """
    print(f'Using subsample size {subsample_subset_size} for '
          f'measure search')
    trial_ = namedtuple('trial_', [
        'fingerprint_type',
        'similarity_measure',
        'nearest_neighbor_correlation',
        'furthest_neighbor_correlation',
        'score_',
    ])
    if fingerprint_type is None:
        all_fingerprint_types = Descriptor.get_supported_fprints()
        # Hyper-parameters are fingerprint specific, so they cannot be
        # reused while searching over every fingerprint.
        fingerprint_params = None
    else:
        all_fingerprint_types = [fingerprint_type]
    if similarity_measure is None:
        if only_metric:
            print('Only trying measures with valid distance metrics')
        all_similarity_measures = SimilarityMeasure.get_uniq_metrics()
    else:
        all_similarity_measures = [similarity_measure]
    is_verbose = molecule_set_configs.get("is_verbose", False)
    all_scores = []
    if fingerprint_params is None:
        fingerprint_params = {}
    for measure in all_similarity_measures:
        if only_metric and not SimilarityMeasure(
                metric=measure).is_distance_metric():
            continue
        if is_verbose:
            print(f'Trying {measure} similarity')
        for fprint in all_fingerprint_types:
            if is_verbose:
                print(f'Trying {fprint} fingerprint')
            try:
                molecule_set = MoleculeSet(
                    molecule_database_src=molecule_set_configs[
                        'molecule_database_src'],
                    molecule_database_src_type=molecule_set_configs[
                        'molecule_database_src_type'],
                    similarity_measure=measure,
                    fingerprint_type=fprint,
                    fingerprint_params=fingerprint_params,
                    is_verbose=is_verbose,
                    n_threads=molecule_set_configs.get('n_threads', 1),
                    sampling_ratio=subsample_subset_size)
            except (InvalidConfigurationError, ValueError) as e:
                # Incompatible pairs are skipped rather than aborting
                # the whole search.
                if is_verbose:
                    print(
                        f'Could not try {fprint} with '
                        f'similarity measure {measure} due to '
                        f'{e}')
                continue
            # Only the correlation coefficients are scored; the p-values
            # are discarded.
            nearest_corr, _ = self.prop_var_w_similarity. \
                get_property_correlations_in_most_similar(molecule_set)
            furthest_corr, _ = self.prop_var_w_similarity. \
                get_property_correlations_in_most_dissimilar(molecule_set)
            if optim_algo == 'max_min':
                score_ = nearest_corr - abs(furthest_corr)
            elif optim_algo == 'max':
                score_ = nearest_corr
            elif optim_algo == 'min':
                score_ = -abs(furthest_corr)
            else:
                raise InvalidConfigurationError(f'{optim_algo} '
                                                f'not implemented')
            all_scores.append(
                trial_(fingerprint_type=fprint,
                       similarity_measure=measure,
                       nearest_neighbor_correlation=nearest_corr,
                       furthest_neighbor_correlation=furthest_corr,
                       score_=score_))
    # Best score first.
    all_scores.sort(key=lambda x: x[-1], reverse=True)
    if self.log_fpath is not None:
        print('Writing to ', self.log_fpath)
        log_data = [trial._asdict() for trial in all_scores]
        with open(self.log_fpath, "w") as fp:
            json.dump(log_data, fp)
    if show_top > 0:
        top_performers = all_scores[:show_top]
        all_nearest_neighbor_correlations = [
            trial.nearest_neighbor_correlation for trial in top_performers
        ]
        all_furthest_neighbor_correlations = [
            trial.furthest_neighbor_correlation for trial in top_performers
        ]
        top_scores = [trial.score_ for trial in top_performers]
        all_measures = [
            Descriptor.shorten_label(trial.fingerprint_type)
            + '\n'
            + trial.similarity_measure
            for trial in top_performers
        ]
        bar_heights = np.array([
            top_scores,
            all_nearest_neighbor_correlations,
            all_furthest_neighbor_correlations,
        ])
        # Pop from a copy: popping from self.plot_settings directly would
        # strip 'colors' from the instance and make the next call with
        # show_top > 0 fail with KeyError.
        plot_kwargs = dict(self.plot_settings)
        colors = plot_kwargs.pop('colors')
        plot_multiple_barchart(
            x=list(range(len(top_performers))),
            heights=bar_heights,
            legend_labels=[
                'Overall scores',
                'Nearest neighbor property '
                'correlation',
                'Furthest neighbor property '
                'correlations',
            ],
            colors=colors,
            xtick_labels=all_measures,
            ylabel='Value',
            xlabel='Measure',
            **plot_kwargs)
    return all_scores[0]
def plot_multiple_barchart(x,
                           heights,
                           colors,
                           legend_labels=None,
                           xtick_labels=None,
                           **kwargs):
    """Plot a bar chart with multiple bars per category.

    Args:
        x (list or numpy array): X axis grid.
        heights (list or numpy array): Heights of the sets of bars.
            Size of the array is (n_bars_per_xtick, n_xticks).
        colors (list or str): Plot colors. If list supplied, list[0] is
            used for first series, list[1] is used for second series and
            list[2] is used for third series etc. A single str is used
            for every series.
        legend_labels (list or numpy array): Array of legend names for
            each bar type. Size is (n_bars_per_xtick). Default is None,
            in which case no legend is drawn.
        xtick_labels (list, optional): Labels to use for each bar. Default
            is None in which case just the values of x are used.
        kwargs: Plot settings. The keys title, title_fontsize, xlabel,
            xlabel_fontsize, ylabel, ylabel_fontsize, xticksize, yticksize
            and bar_width are consumed here; anything left over is
            forwarded to plt.bar.

    Raises:
        InvalidConfigurationError: If fewer colors than n_bars_per_xtick
            are supplied, or if the number of legend labels is not equal
            to n_bars_per_xtick.
    """
    plot_params = {
        "title": kwargs.pop("title", ""),
        "title_fontsize": kwargs.pop("title_fontsize", 24),
        "xlabel": kwargs.pop("xlabel", ""),
        "xlabel_fontsize": kwargs.pop("xlabel_fontsize", 20),
        "ylabel": kwargs.pop("ylabel", ""),
        "ylabel_fontsize": kwargs.pop("ylabel_fontsize", 20),
        "xticksize": kwargs.pop("xticksize", 24),
        "yticksize": kwargs.pop("yticksize", 24),
    }
    x = np.array(x)
    heights = np.array(heights)
    bar_width = kwargs.pop('bar_width', 0.2)
    n_bars_per_xtick = heights.shape[0]
    if isinstance(colors, str):
        colors = [colors] * n_bars_per_xtick
    if len(colors) < n_bars_per_xtick:
        raise InvalidConfigurationError(f'{len(colors)} colors supplied '
                                        f'insufficient for '
                                        f'{n_bars_per_xtick} bars')
    plt.figure()
    plt.tight_layout()
    plt.rcParams["svg.fonttype"] = "none"
    if xtick_labels is None:
        xtick_labels = x
    # Each series is shifted right by one bar width so the bars of a
    # category sit side by side.
    bars = [
        plt.bar(x + bar_id * bar_width,
                heights[bar_id],
                bar_width,
                color=colors[bar_id],
                **kwargs)
        for bar_id in range(n_bars_per_xtick)
    ]
    plt.title(plot_params["title"], fontsize=plot_params["title_fontsize"])
    plt.xlabel(plot_params["xlabel"],
               fontsize=plot_params["xlabel_fontsize"])
    plt.ylabel(plot_params["ylabel"],
               fontsize=plot_params["ylabel_fontsize"])
    # Center each tick under its group of bars.
    plt.xticks(x + bar_width * ((n_bars_per_xtick - 1) / 2),
               xtick_labels,
               fontsize=plot_params["xticksize"])
    plt.yticks(fontsize=plot_params["yticksize"])
    if legend_labels is not None:
        if len(legend_labels) != n_bars_per_xtick:
            raise InvalidConfigurationError(f'{len(legend_labels)} legend '
                                            f'labels not sufficient for '
                                            f'{n_bars_per_xtick} bars')
        plt.legend(bars, legend_labels)
def plot_density(densities, n_densities=1, legends=None, **kwargs):
    """Plot the similarity density.

    Args:
        densities (list or numpy ndarray): Vector(s) of densities to plot.
            Shape (n_densities, n_points_per_density). When n_densities
            is 1, a flat vector of numbers is expected.
        n_densities (int): Number of densities. Pass this if passing more
            than one densities.
        legends (list): Optional list of legends for annotating
            different densities.
        kwargs: dict
            Keyword arguments to modify plot. Some common ones:
            xlabel: str
                Label of the x-axis. Default is "Samples"
            ylabel: str
                Label of the y-axis. Default is "Similarity Density"
            xlabel_fontsize: int
                Fontsize of the x-axis label. Default is 20.
            ylabel_fontsize: int
                Fontsize of the y-axis label. Default is 20.
            legend_fontsize: int
                Fontsize of the legend. Default is 20.
            plot_title: str
                Plot title. Default is None.
            plot_title_fontsize: int
                Fontsize of the title. Default is 24.
            plot_color: str or list
                Color of the plot. Multiple colors can be passed as list
                if multiple densities are plotted.
            shade: bool
                To shade the plot or not.
            Any remaining keyword arguments are forwarded to kdeplot.

    Raises:
        InvalidConfigurationError: If n_densities is 1 and densities
            contains a non-numeric element, or if fewer colors or legends
            than n_densities are supplied.
    """
    plot_title = kwargs.pop("plot_title", None)
    xlabel = kwargs.pop("xlabel", "Samples")
    ylabel = kwargs.pop("ylabel", "Similarity Density")
    plot_title_fontsize = kwargs.pop("plot_title_fontsize", 24)
    xlabel_fontsize = int(kwargs.pop("xlabel_fontsize", 20))
    ylabel_fontsize = int(kwargs.pop("ylabel_fontsize", 20))
    legend_fontsize = int(kwargs.pop("legend_fontsize", 20))
    color = kwargs.pop("plot_color", None)
    shade = kwargs.pop("shade", False)
    if n_densities == 1:
        # The np.float alias no longer exists in current NumPy; the
        # abstract scalar types cover every NumPy float / integer width.
        valid_number_types = (np.floating, np.integer, int, float)
        for density in densities:
            if not isinstance(density, valid_number_types):
                raise InvalidConfigurationError(f'Element of type '
                                                f'{type(density)} passed when '
                                                f'expecting types '
                                                f'{valid_number_types}')
        # converting to 2D array for uniform processing
        densities = [densities]
    if color is None or isinstance(color, str):
        color = [color] * n_densities
    if legends is None:
        legends = [None] * n_densities
    if len(color) < n_densities:
        raise InvalidConfigurationError(f'{len(color)} colors supplied '
                                        f'for {n_densities} '
                                        f'densities')
    if len(legends) < n_densities:
        raise InvalidConfigurationError(f'{len(legends)} legends supplied '
                                        f'for {n_densities} '
                                        f'densities')
    plt.figure()
    plt.rcParams["svg.fonttype"] = "none"
    for density_id, density in enumerate(densities):
        kdeplot(density,
                color=color[density_id],
                label=legends[density_id],
                shade=shade,
                **kwargs)
    plt.xlabel(xlabel, fontsize=xlabel_fontsize)
    plt.ylabel(ylabel, fontsize=ylabel_fontsize)
    # Only draw a legend when at least one density was labelled.
    if legends != [None] * n_densities:
        plt.legend(fontsize=legend_fontsize)
    if plot_title is not None:
        plt.title(plot_title, fontsize=plot_title_fontsize)