예제 #1
0
    def get_hits_similar_to(self, molecule_set=None):
        """Get the self.n_hits molecules in the set most similar to the target.

        The similarities between self.target_molecule and the molecules of
        the set are computed once and cached on the instance as
        self.similarities_, together with their argsort
        (self.sorted_similarities_). The ids with the largest similarities
        are returned in decreasing order of similarity.

        Args:
            molecule_set (AIMSim.chemical_datastructures MoleculeSet):
                MoleculeSet object used to calculate sorted similarities.
                Only used if neither self.sorted_similarities_ nor
                self.similarities_ is set.

        Returns:
            np.ndarray(int): Ids of most similar
                molecules in decreasing order of similarity. Holds fewer
                than self.n_hits entries when fewer similarities exist.
            np.ndarray(float): Corresponding similarity values.

        Raises:
            InvalidConfigurationError: If the similarities are not cached
                and molecule_set is None.

        """
        if not hasattr(self, 'sorted_similarities_'):
            if not hasattr(self, 'similarities_'):
                if molecule_set is None:
                    raise InvalidConfigurationError('MoleculeSet object not '
                                                    'passed for task')
                self.similarities_ = molecule_set.compare_against_molecule(
                    self.target_molecule)
            self.sorted_similarities_ = np.argsort(self.similarities_)
        # argsort is ascending, so walk it backwards. Slicing (instead of
        # indexing hit by hit) keeps the ids integer-typed even when no hits
        # are requested and cannot run past the start of the array.
        n_hits = max(self.n_hits, 0)
        descending_ids = np.asarray(self.sorted_similarities_)[::-1]
        # Copy so callers cannot modify the cached ordering through the view.
        ids = descending_ids[:n_hits].copy()

        return ids, self.similarities_[ids]
예제 #2
0
    def _set_tasks(self, tasks):
        """
        Args:
            tasks (dict): The tasks field of the config yaml containing various
            tasks and their parameters.
        """
        for task, task_configs in tasks.items():
            try:
                if task == "compare_target_molecule":
                    loaded_task = CompareTargetMolecule(task_configs)
                elif task == "visualize_dataset":
                    loaded_task = VisualizeDataset(task_configs)
                elif task == "see_property_variation_w_similarity":
                    loaded_task = SeePropertyVariationWithSimilarity(
                        task_configs)
                elif task == "identify_outliers":
                    loaded_task = IdentifyOutliers(task_configs)
                elif task == "cluster":
                    loaded_task = ClusterData(task_configs)
                else:
                    print(f"{task} not recognized")
                    continue
                self.to_do.append(loaded_task)
            except InvalidConfigurationError as e:
                print(f"Error in the config file for task: ", task)
                print("\n", e)
                raise e

        if len(self.to_do) == 0:
            raise InvalidConfigurationError("No tasks were read, exiting.")
예제 #3
0
 def _extract_configs(self):
     """
     Raises:
     InvalidConfigurationError: If correlation_type does not match
                                implemented types.
     """
     self.plot_settings = {"response": "response"}
     self.plot_settings.update(
         self.configs.get("property_plot_settings", {}))
     self.log_fpath = self.configs.get("log_file_path", None)
     self.correlation_type = self.configs.get('correlation_type')
     if self.correlation_type is None:
         self.correlation_type = 'pearson'
     if self.correlation_type.lower() in ['pearson', 'linear']:
         self.correlation_fn = pearsonr
     else:
         raise InvalidConfigurationError(
             f'{self.correlation_type} correlation '
             f'not supported')
     if self.log_fpath is not None:
         log_dir = dirname(self.log_fpath)
         makedirs(log_dir, exist_ok=True)
예제 #4
0
    def __call__(
        self,
        molecule_set_configs,
        fingerprint_type=None,
        fingerprint_params=None,
        similarity_measure=None,
        subsample_subset_size=0.01,
        optim_algo='max_min',
        show_top=0,
        only_metric=True,
    ):
        """
        Calculate the correlation in the properties of molecules in set
        and their nearest and furthest neighbors using different
        fingerprints / similarity measure choices. Choose the best fingerprint
        and similarity measure pair (called measure choice for brevity)
        based on an optimization strategy.

        All evaluated trials are sorted by score (best first). If
        self.log_fpath is set, they are written to that file as JSON.

        Args:
            molecule_set_configs (dict): All configurations (except
                fingerprint_type, fingerprint_params and similarity_measure)
                needed to form the moleculeSet. The keys
                'molecule_database_src' and 'molecule_database_src_type'
                are required; 'n_threads' (default 1) and 'is_verbose'
                (default False) are optional.
            fingerprint_type (str): Label to indicate which fingerprint to
                use. If supplied, fingerprint is fixed and optimization
                carried out over similarity measures. Use None to indicate
                that optimization needs to be carried out over fingerprints.
                Default is None.
            fingerprint_params (dict): Hyper-parameters for fingerprints.
                Passed to the MoleculeSet constructor. If None is passed,
                set to empty dictionary before passing to MoleculeSet.
                Ignored (replaced by an empty dictionary) when
                fingerprint_type is None.
            similarity_measure (str): Label to indicate which similarity
                measure to use. If supplied, similarity measure is fixed
                and optimization carried out over fingerprints.
                Use None to indicate that optimization needs to be carried
                out over similarity measures. Default is None.
            subsample_subset_size (float): Fraction of molecule_set to
                subsample. This is separate from the sample_ratio parameter
                used when creating a moleculeSet since it is recommended
                to have a more aggressive subsampling strategy for this task
                due to the combinatorial explosion of looking at multiple
                fingerprints and similarity measures. Default is 0.01.
            optim_algo (str): Label to indicate the optimization algorithm
                chosen. Options are:
                'max': The measure choice which maximizes correlation
                    of properties between nearest neighbors (most similar).
                'min': The measure choice which minimizes the absolute value
                    of property correlation
                    between furthest neighbors (most dissimilar).
                'max_min': The measure choice which maximizes correlation
                    of properties between nearest neighbors (most similar)
                    and minimizes the absolute value of property correlation
                    between furthest neighbors (most dissimilar).
                    This is the default.
            show_top (int): Number of top performing measures to show in plot.
                If 0, no plot is generated. The top performer is returned
                in either case.
            only_metric (bool): If True only similarity measures satisfying
                the metricity property
                (i.e. can be converted to distance metrics) are selected.

        Returns:
            (NamedTuple): Top performer with fields:
                fingerprint_type (str): Label for fingerprint type
                similarity_measure (str): Label for similarity measure
                nearest_neighbor_correlation (float): Correlation of property
                    of molecule and its nearest neighbor.
                furthest_neighbor_correlation (float): Correlation of property
                    of molecule and its furthest neighbor.
                score_ (float): Overall score based on optimization strategy.
                    More is better.

        Raises:
            InvalidConfigurationError: If optim_algo is not one of
                'max_min', 'max' or 'min'.
            KeyError: If show_top > 0 and self.plot_settings has no
                'colors' entry.
            IndexError: If no (fingerprint, similarity measure) pair could
                be evaluated, so there is no top performer to return.

        """
        # Reject an unknown strategy before any expensive MoleculeSet is
        # built instead of after the first trial has been computed.
        if optim_algo not in ('max_min', 'max', 'min'):
            raise InvalidConfigurationError(f'{optim_algo} '
                                            f'not implemented')
        print(f'Using subsample size {subsample_subset_size} for '
              f'measure search')
        trial_ = namedtuple('trial_', [
            'fingerprint_type', 'similarity_measure',
            'nearest_neighbor_correlation', 'furthest_neighbor_correlation',
            'score_'
        ])
        if fingerprint_type is None:
            all_fingerprint_types = Descriptor.get_supported_fprints()
            # Caller-supplied parameters are dropped when searching over
            # all fingerprint types.
            fingerprint_params = None
        else:
            all_fingerprint_types = [fingerprint_type]
        if similarity_measure is None:
            if only_metric:
                print('Only trying measures with valid distance metrics')
            all_similarity_measures = SimilarityMeasure.get_uniq_metrics()
        else:
            all_similarity_measures = [similarity_measure]
        is_verbose = molecule_set_configs.get("is_verbose", False)
        all_scores = []
        if fingerprint_params is None:
            fingerprint_params = {}
        for trial_measure in all_similarity_measures:
            if only_metric and not SimilarityMeasure(
                    metric=trial_measure).is_distance_metric():
                continue
            if is_verbose:
                print(f'Trying {trial_measure} similarity')
            for trial_fingerprint in all_fingerprint_types:
                if is_verbose:
                    print(f'Trying {trial_fingerprint} fingerprint')
                try:
                    molecule_set = MoleculeSet(
                        molecule_database_src=molecule_set_configs[
                            'molecule_database_src'],
                        molecule_database_src_type=molecule_set_configs[
                            'molecule_database_src_type'],
                        similarity_measure=trial_measure,
                        fingerprint_type=trial_fingerprint,
                        fingerprint_params=fingerprint_params,
                        is_verbose=is_verbose,
                        n_threads=molecule_set_configs.get('n_threads', 1),
                        sampling_ratio=subsample_subset_size)
                except (InvalidConfigurationError, ValueError) as e:
                    # Combinations that cannot be built are skipped rather
                    # than aborting the whole search.
                    if is_verbose:
                        print(
                            f'Could not try {trial_fingerprint} with '
                            f'similarity measure {trial_measure} due to '
                            f'{e}')
                    continue
                # The second element of each returned pair is not used.
                nearest_corr, _ = self.prop_var_w_similarity. \
                    get_property_correlations_in_most_similar(
                        molecule_set)
                furthest_corr, _ = self.prop_var_w_similarity. \
                    get_property_correlations_in_most_dissimilar(
                        molecule_set)
                if optim_algo == 'max_min':
                    score_ = nearest_corr - abs(furthest_corr)
                elif optim_algo == 'max':
                    score_ = nearest_corr
                else:
                    # 'min': every other value was rejected up front.
                    score_ = -abs(furthest_corr)
                all_scores.append(
                    trial_(fingerprint_type=trial_fingerprint,
                           similarity_measure=trial_measure,
                           nearest_neighbor_correlation=nearest_corr,
                           furthest_neighbor_correlation=furthest_corr,
                           score_=score_))
        # Best score first.
        all_scores.sort(key=lambda trial: trial.score_, reverse=True)
        if self.log_fpath is not None:
            print('Writing to ', self.log_fpath)
            log_data = [trial._asdict() for trial in all_scores]
            with open(self.log_fpath, "w") as fp:
                json.dump(log_data, fp)

        if show_top > 0:
            top_performers = all_scores[:show_top]
            all_nearest_neighbor_correlations = []
            all_furthest_neighbor_correlations = []
            top_scores = []
            all_measures = []
            for trial in top_performers:
                all_nearest_neighbor_correlations.append(
                    trial.nearest_neighbor_correlation)
                all_furthest_neighbor_correlations.append(
                    trial.furthest_neighbor_correlation)
                top_scores.append(trial.score_)
                all_measures.append(
                    Descriptor.shorten_label(trial.fingerprint_type) + '\n' +
                    trial.similarity_measure)
            bar_heights = np.array([
                top_scores, all_nearest_neighbor_correlations,
                all_furthest_neighbor_correlations
            ])
            # Pop from a copy: removing 'colors' from self.plot_settings
            # itself would make the next plotting call fail with KeyError.
            plot_settings = dict(self.plot_settings)
            colors = plot_settings.pop('colors')
            plot_multiple_barchart(x=list(range(len(top_performers))),
                                   heights=bar_heights,
                                   legend_labels=[
                                       'Overall scores',
                                       'Nearest neighbor property '
                                       'correlation',
                                       'Furthest neighbor property '
                                       'correlations'
                                   ],
                                   colors=colors,
                                   xtick_labels=all_measures,
                                   ylabel='Value',
                                   xlabel='Measure',
                                   **plot_settings)

        return all_scores[0]
예제 #5
0
def plot_multiple_barchart(x,
                           heights,
                           colors,
                           legend_labels=None,
                           xtick_labels=None,
                           **kwargs):
    """Plot a bar chart with multiple bars per category.

    Args:
        x (list or numpy array): X axis grid.
        heights (list or numpy array): Heights of the sets of bars.
            Size of the array is (n_bars_per_xtick, n_xticks),
        colors (list or str): Plot colors. If list supplied,
            list[0] is used for first series, list[1] is used for
            second series and list[2] is used for third series etc.
            A single string is used for every series.
        legend_labels (list or numpy array): Array of legend names for
            each bar type. Size is (n_bars_per_xtick). Default is None,
            in which case no legend is drawn.
        xtick_labels (list, optional): Labels to use for each bar. Default is
            None in which case the values of x are used.
        **kwargs: Plot settings. The following keys are consumed here:
            title (default ""), title_fontsize (24), xlabel (""),
            xlabel_fontsize (20), ylabel (""), ylabel_fontsize (20),
            xticksize (24), yticksize (24) and bar_width (0.2).
            Everything else is forwarded to plt.bar.

    Raises:
        InvalidConfigurationError: If fewer colors than n_bars (per xtick)
            are supplied, or if the number of legend labels differs from
            n_bars (per xtick).
    """
    plot_params = {
        "title": kwargs.pop("title", ""),
        "title_fontsize": kwargs.pop("title_fontsize", 24),
        "xlabel": kwargs.pop("xlabel", ""),
        "xlabel_fontsize": kwargs.pop("xlabel_fontsize", 20),
        "ylabel": kwargs.pop("ylabel", ""),
        "ylabel_fontsize": kwargs.pop("ylabel_fontsize", 20),
        "xticksize": kwargs.pop("xticksize", 24),
        "yticksize": kwargs.pop("yticksize", 24),
    }
    x = np.array(x)
    heights = np.array(heights)
    bar_width = kwargs.pop('bar_width', 0.2)
    n_bars_per_xtick = heights.shape[0]
    if isinstance(colors, str):
        colors = [colors] * n_bars_per_xtick
    # Validate every input before a figure is created, so that a bad call
    # does not leave a half-drawn figure behind.
    if len(colors) < n_bars_per_xtick:
        raise InvalidConfigurationError(f'{len(colors)} colors supplied '
                                        f'insufficient for '
                                        f'{n_bars_per_xtick} bars')
    if (legend_labels is not None
            and len(legend_labels) != n_bars_per_xtick):
        raise InvalidConfigurationError(f'{len(legend_labels)} legend '
                                        f'labels not sufficient for '
                                        f'{n_bars_per_xtick} bars')
    plt.figure()
    # NOTE(review): tight_layout() runs before anything is drawn, so it
    # probably has no effect here - confirm whether it belongs after the
    # plotting calls.
    plt.tight_layout()
    plt.rcParams["svg.fonttype"] = "none"
    if xtick_labels is None:
        xtick_labels = x
    # Each series is shifted by one bar width so the bars of a category
    # sit side by side.
    bars = [
        plt.bar(x + bar_id * bar_width,
                heights[bar_id],
                bar_width,
                color=colors[bar_id],
                **kwargs)
        for bar_id in range(n_bars_per_xtick)
    ]

    plt.title(plot_params["title"], fontsize=plot_params["title_fontsize"])
    plt.xlabel(plot_params["xlabel"], fontsize=plot_params["xlabel_fontsize"])
    plt.ylabel(plot_params["ylabel"], fontsize=plot_params["ylabel_fontsize"])
    # Put each tick at the centre of its group of bars.
    plt.xticks(x + bar_width * ((n_bars_per_xtick - 1) / 2),
               xtick_labels,
               fontsize=plot_params["xticksize"])
    plt.yticks(fontsize=plot_params["yticksize"])
    if legend_labels is not None:
        plt.legend(bars, legend_labels)
예제 #6
0
def plot_density(densities, n_densities=1, legends=None, **kwargs):
    """Plot the similarity density.

    Args:
        densities (list or numpy ndarray): Vector(s) of densities to plot.
            Shape (n_densities, n_points_per_density). n_densities can be 1,
            in which case a flat vector of numbers is expected.
        n_densities (int): Number of densities.
            Pass this if passing more than one density.
        legends (list): Optional list of legends for annotating
            different densities.

    kwargs: dict
        Keyword arguments to modify plot. Some common ones:
        xlabel: str
            Label of the x-axis. Default is "Samples"
        ylabel: str
            Label of the y-axis. Default is "Similarity Density"
        xlabel_fontsize: int
            Fontsize of the x-axis label. Default is 20.
        ylabel_fontsize: int
            Fontsize of the y-axis label. Default is 20.
        legend_fontsize: int
            Fontsize of the legend. Default is 20.
        plot_title: str
            Plot title. Default is None.
        plot_title_fontsize: int
            Fontsize of the title. Default is 24.
        plot_color: str or list
            Color of the plot. Multiple colors can be passed as list
            if multiple densities are plotted. "color" is accepted as an
            alias; "plot_color" takes precedence if both are given.
        shade: bool
            To shade the plot or not.
        Remaining keyword arguments are forwarded to kdeplot.

    Raises:
        InvalidConfigurationError: If n_densities is 1 and densities holds
            a non-numeric element, or if fewer colors or legends than
            n_densities are supplied.
    """
    plot_title = kwargs.pop("plot_title", None)
    xlabel = kwargs.pop("xlabel", "Samples")
    ylabel = kwargs.pop("ylabel", "Similarity Density")
    plot_title_fontsize = kwargs.pop("plot_title_fontsize", 24)
    xlabel_fontsize = int(kwargs.pop("xlabel_fontsize", 20))
    ylabel_fontsize = int(kwargs.pop("ylabel_fontsize", 20))
    legend_fontsize = int(kwargs.pop("legend_fontsize", 20))
    # "color" has to be removed from kwargs as well: left in place it would
    # collide with the explicit color= argument passed to kdeplot below.
    color = kwargs.pop("color", None)
    color = kwargs.pop("plot_color", color)
    shade = kwargs.pop("shade", False)

    if n_densities == 1:
        # The np.float alias was removed in NumPy 1.24; the abstract NumPy
        # scalar types cover every float / integer width instead.
        valid_number_types = (np.floating, np.integer, int, float)
        for density in densities:
            if not isinstance(density, valid_number_types):
                raise InvalidConfigurationError(f'Element of type '
                                                f'{type(density)} passed when '
                                                f'expecting types '
                                                f'{valid_number_types}')
        # converting to 2D array for uniform processing
        densities = [densities]
    if color is None or isinstance(color, str):
        color = [color] * n_densities
    if legends is None:
        legends = [None] * n_densities
    if len(color) < n_densities:
        raise InvalidConfigurationError(f'{len(color)} colors supplied '
                                        f'for {n_densities} '
                                        f'densities')
    if len(legends) < n_densities:
        raise InvalidConfigurationError(f'{len(legends)} legends supplied '
                                        f'for {n_densities} '
                                        f'densities')

    plt.figure()
    plt.rcParams["svg.fonttype"] = "none"
    # NOTE(review): newer seaborn releases deprecate kdeplot's `shade` in
    # favour of `fill` - confirm against the seaborn version in use.
    for density_id, density in enumerate(densities):
        kdeplot(density,
                color=color[density_id],
                label=legends[density_id],
                shade=shade,
                **kwargs)
    plt.xlabel(xlabel, fontsize=xlabel_fontsize)
    plt.ylabel(ylabel, fontsize=ylabel_fontsize)
    # Only draw a legend when at least one density was labelled.
    if not legends == [None] * n_densities:
        plt.legend(fontsize=legend_fontsize)
    if plot_title is not None:
        plt.title(plot_title, fontsize=plot_title_fontsize)