Example #1
def create_RF_feature_extractors(extractor_kwargs,
                                 n_estimators=[10, 100, 1000],
                                 min_samples_leaves=[0.25, 0.1, 1],
                                 max_depths=[None, 1, 10, 100]):
    extractors = []
    for one_vs_rest in [False, True]:
        # We only vary min_samples_leaf and max_depth when doing real multiclass classification
        for md in [None] if one_vs_rest else max_depths:
            for msl in [1] if one_vs_rest else min_samples_leaves:
                suffix = "" if one_vs_rest else "_multiclass"
                if md is not None:  # None is the default value
                    suffix += "_max_depth{}".format(md)
                if msl != 1:  # 1 is the default value
                    suffix += "_min_samples_leaf{}".format(msl)
                for nest in n_estimators:
                    extractors.append(
                        fe.RandomForestFeatureExtractor(
                            name="{}-estimators{}".format(nest, suffix),
                            classifier_kwargs={
                                'n_estimators': nest,
                                'min_samples_leaf': msl,
                                'max_depth': md
                            },
                            one_vs_rest=one_vs_rest,
                            **extractor_kwargs))
    return _shuffle_and_shorten(extractors)
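A minimal usage sketch for the factory above, assuming that `samples` and `labels` are already in memory and that `fe` and `_shuffle_and_shorten` are defined in the surrounding module; the keyword names mirror the other examples in this listing:

# Hypothetical usage; the extractor_kwargs keys follow the conventions of the later examples.
extractor_kwargs = {
    'samples': samples,      # (n_frames, n_features) data matrix
    'labels': labels,        # one state/cluster label per frame
    'n_splits': 1,
    'n_iterations': 3,
}
for extractor in create_RF_feature_extractors(extractor_kwargs, n_estimators=[100]):
    extractor.extract_features()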
Example #2
import logging

# `fe`, `visualization` and `DataGenerator` are provided by the surrounding
# feature-extraction package; import them according to your installation.
logging.basicConfig(level=logging.INFO)  # ensure the INFO messages below are visible
logger = logging.getLogger("demo")
logger.setLevel('INFO')

# Create data for which we know the ground truth
dg = DataGenerator(natoms=20,
                   nclusters=2,
                   natoms_per_cluster=2,
                   nframes_per_cluster=500)
samples, labels = dg.generate_data()
feature_to_resids = dg.feature_to_resids
logger.info("Generated samples and labels of shapes %s and %s", samples.shape,
            labels.shape)

# Identify important residues using a random forest
extractor = fe.RandomForestFeatureExtractor(samples=samples, labels=labels)
# extractor = fe.PCAFeatureExtractor(samples=samples)  # Uncomment for unsupervised learning
extractor.extract_features()

# Postprocess the results to convert importance per feature into importance per residue
postprocessor = extractor.postprocessing()
postprocessor.average()
postprocessor.persist()

# Visualize the importance per residue
# Dashed lines show the residues we know are important (i.e. the atoms moved by the toy model)
visualization.visualize([[postprocessor]], highlighted_residues=dg.moved_atoms)

logger.info(
    "Below we list all features and their importance. Those with highest importance are good candidates for CVs")
Example #3
def run_toy_model(dg, data, labels, supervised=True, filetype="svg", n_iterations=10, variance_cutoff="1_components"):
    cluster_indices = labels.argmax(axis=1)
    feature_to_resids = dg.feature_to_resids
    suffix = dg.test_model + "_" + dg.feature_type \
             + ("_supervised" if supervised else "_unsupervised") \
             + ("_var-cutoff=" + str(variance_cutoff) if not supervised else "")
    kwargs = {
        'samples': data,
        'labels': cluster_indices,
        'filter_by_distance_cutoff': False,
        'use_inverse_distances': True,
        'n_splits': 1,
        'n_iterations': n_iterations,
        # 'upper_bound_distance_cutoff': 1.,
        # 'lower_bound_distance_cutoff': 1.
    }

    supervised_feature_extractors = [
        fe.MlpFeatureExtractor(
            activation="relu",
            classifier_kwargs={
                # 'hidden_layer_sizes': (dg.natoms, dg.nclusters * 2),
                'hidden_layer_sizes': (int(dg.natoms / 2),),
                # 'hidden_layer_sizes': [int(min(dg.nfeatures, 100) / (i + 1)) for i in range(10)],
                'max_iter': 10000,
                'alpha': 0.001,
            },
            per_frame_importance_outfile="output/toy_model_perframe.txt",
            one_vs_rest=True,
            **kwargs),
        # fe.ElmFeatureExtractor(
        #     activation="relu",
        #     classifier_kwargs={
        #         'hidden_layer_sizes': (dg.nfeatures,),
        #         'alpha': 50,
        #     },
        #     **kwargs),
        fe.KLFeatureExtractor(**kwargs),
        fe.RandomForestFeatureExtractor(
            one_vs_rest=True,
            classifier_kwargs={'n_estimators': 100},
            **kwargs),
    ]
    unsupervised_feature_extractors = [
        fe.MlpAeFeatureExtractor(
            classifier_kwargs={
                # hidden_layer_sizes=(int(data.shape[1]/2),),
                'hidden_layer_sizes': (dg.nclusters,),
                # 'hidden_layer_sizes': (10, 5, 1, 5, 10,),
                # hidden_layer_sizes=(100, 1, 100,),
                # hidden_layer_sizes=(200, 50, 10, 1, 10, 50, 200, ),
                'max_iter': 100000,
                # hidden_layer_sizes=(300, 200, 50, 10, 1, 10, 50, 200, 300,),
                # max_iter=10000,
                # 'alpha': 0.0001,
                'alpha': 1,
                'solver': "adam",
            },
            use_reconstruction_for_lrp=True,
            activation="logistic",
            **kwargs),
        fe.PCAFeatureExtractor(classifier_kwargs={'n_components': None},
                               variance_cutoff=variance_cutoff,
                               name='PCA',
                               **kwargs),
        # fe.RbmFeatureExtractor(classifier_kwargs={'n_components': dg.nclusters},
        #                        relevance_method='from_lrp',
        #                        name='RBM',
        #                        **kwargs),
    ]
    feature_extractors = supervised_feature_extractors if supervised else unsupervised_feature_extractors
    logger.info("Done. using %s feature extractors", len(feature_extractors))
    postprocessors = []
    filter_results = False

    for extractor in feature_extractors:
        extractor.error_limit = 50
        logger.info("Computing relevance for extractors %s", extractor.name)
        extractor.extract_features()
        p = extractor.postprocessing(working_dir="./{}".format(extractor.name),
                                     pdb_file=None,
                                     feature_to_resids=feature_to_resids,
                                     filter_results=filter_results)
        p.average()
        p.evaluate_performance()
        p.persist()
        postprocessors.append([p])
    logger.info("Done")

    logger.info(
        "Actual atoms moved: %s.\n(Cluster generation method %s. Noise level=%s, displacement=%s. frames/cluster=%s)",
        sorted(dg.moved_atoms),
        dg.test_model, dg.noise_level, dg.displacement, dg.nframes_per_cluster)

    visualization.visualize(postprocessors,
                            show_importance=True,
                            show_performance=False,
                            show_projected_data=False,
                            highlighted_residues=dg.moved_atoms,
                            outfile="output/test_importance_per_residue_{suffix}.{filetype}".format(suffix=suffix,
                                                                                                    filetype=filetype))
    # visualization.visualize(postprocessors,
    #                         show_importance=False,
    #                         show_performance=True,
    #                         show_projected_data=False,
    #                         outfile="output/test_performance_{suffix}.{filetype}".format(suffix=suffix,
    #                                                                                      filetype=filetype))
    # visualization.visualize(postprocessors,
    #                         show_importance=False,
    #                         show_performance=False,
    #                         show_projected_data=True,
    #                         outfile="output/test_projection_{suffix}.{filetype}".format(suffix=suffix,
    #                                                                                     filetype=filetype))
    logger.info("Done. The settings were n_iterations = {n_iterations}, n_splits = {n_splits}."
                "\nFiltering (filter_by_distance_cutoff={filter_by_distance_cutoff})".format(**kwargs))
Example #4
def run_VSD(working_dir="bio_input/VSD/",
            cluster_for_prediction=None,
            dt_for_prediction=10,
            multiclass=False):
    data = np.load(working_dir + 'frame_i_j_contacts_dt1.npy')
    cluster_indices = np.loadtxt(working_dir + 'clusters_indices.dat')

    kwargs = {
        'samples': data,
        'labels': cluster_indices,
        'filter_by_distance_cutoff': True,
        'use_inverse_distances': True,
        'n_splits': 3,
        'n_iterations': 5,
        'scaling': True,
        'shuffle_datasets': True
    }

    if cluster_for_prediction is not None:
        cluster_traj = md.load("{}/{}_dt{}.xtc".format(working_dir,
                                                       cluster_for_prediction,
                                                       dt_for_prediction),
                               top=working_dir + "alpha.pdb")
        other_samples, _, _ = tp.to_distances(traj=cluster_traj,
                                              scheme="closest-heavy",
                                              pairs="all-residues",
                                              use_inverse_distances=True,
                                              ignore_nonprotein=True,
                                              periodic=True)
        logger.debug(
            "Loaded cluster samples for prediction of shape %s for state %s",
            other_samples.shape, cluster_for_prediction)
        cluster_traj = None  # free memory
    else:
        other_samples = False
    feature_extractors = [
        fe.RandomForestFeatureExtractor(
            classifier_kwargs={'n_estimators': 100},
            one_vs_rest=not multiclass,
            **kwargs),
        fe.KLFeatureExtractor(bin_width=0.1, **kwargs),
        fe.MlpFeatureExtractor(
            classifier_kwargs={
                'hidden_layer_sizes': [
                    100,
                ],
                'max_iter': 100000,
                'alpha': 0.0001
            },
            activation="relu",
            one_vs_rest=not multiclass,
            per_frame_importance_samples=other_samples,
            per_frame_importance_labels=None,  # if None, the method uses predicted labels for LRP
            per_frame_importance_outfile="{}/mlp_perframe_importance_{}/"
            "VSD_mlp_perframeimportance_{}_dt{}.txt".format(
                working_dir, "multiclass" if multiclass else "binaryclass",
                cluster_for_prediction, dt_for_prediction),
            **kwargs)
    ]

    common_peaks = {
        "R1-R4": [294, 297, 300, 303],
        "K5": [306],
        "R6": [309],
    }
    do_computations = True
    filetype = "svg"
    for extractor in feature_extractors:
        logger.info("Computing relevance for extractors %s", extractor.name)
        extractor.extract_features()
        p = extractor.postprocessing(working_dir=working_dir,
                                     pdb_file=working_dir + "alpha.pdb",
                                     filter_results=False)
        if do_computations:
            p.average()
            p.evaluate_performance()
            p.persist()
        else:
            p.load()

        visualization.visualize(
            [[p]],
            show_importance=True,
            show_performance=False,
            show_projected_data=False,
            highlighted_residues=common_peaks,
            outfile=working_dir +
            "{extractor}/importance_per_residue_{suffix}.{filetype}".format(
                suffix="", extractor=extractor.name, filetype=filetype))
        if do_computations:
            visualization.visualize(
                [[p]],
                show_importance=False,
                show_performance=True,
                show_projected_data=False,
                outfile=working_dir +
                "{extractor}/performance_{suffix}.{filetype}".format(
                    extractor=extractor.name, suffix="", filetype=filetype))

            visualization.visualize(
                [[p]],
                show_importance=False,
                show_performance=False,
                show_projected_data=True,
                outfile=working_dir +
                "{extractor}/projected_data_{suffix}.{filetype}".format(
                    extractor=extractor.name, suffix="", filetype=filetype))

    logger.info("Done")
Example #5
def run_CaM(parser):
    # Known important residues
    common_peaks = [109, 144, 124, 145, 128, 105, 112, 136, 108, 141, 92]

    shuffle_data = True

    args = parser.parse_args()
    working_dir = args.out_directory
    n_runs = args.number_of_runs
    samples = np.load(args.feature_list)

    cluster_indices = np.loadtxt(args.cluster_indices)

    # Shift cluster indices to start at 0
    cluster_indices -= cluster_indices.min()

    if shuffle_data:
        # Permute blocks of 100 frames
        n_samples = samples.shape[0]
        n_samples = int(n_samples / 100) * 100
        inds = np.arange(n_samples)
        inds = inds.reshape((int(n_samples / 100), 100))
        perm_inds = np.random.permutation(inds)
        perm_inds = np.ravel(perm_inds)

        samples = samples[perm_inds]
        cluster_indices = cluster_indices[perm_inds]

    pdb_file = args.pdb_file

    labels = cluster_indices

    lower_distance_cutoff = 1.0
    upper_distance_cutoff = 1.0
    n_components = 20

    # Check if samples format is correct
    if len(samples.shape) != 2:
        sys.exit("Matrix with features should have 2 dimensions")

    kwargs = {
        'samples': samples,
        'labels': labels,
        'filter_by_distance_cutoff': True,
        'lower_bound_distance_cutoff': lower_distance_cutoff,
        'upper_bound_distance_cutoff': upper_distance_cutoff,
        'use_inverse_distances': True,
        'n_splits': args.number_of_k_splits,
        'n_iterations': args.number_of_iterations,
        'scaling': True
    }

    feature_extractors = [
        fe.PCAFeatureExtractor(variance_cutoff=0.75, **kwargs),
        fe.RbmFeatureExtractor(relevance_method="from_components", **kwargs),
        fe.MlpAeFeatureExtractor(activation=relprop.relu,
                                 classifier_kwargs={
                                     'solver': 'adam',
                                     'hidden_layer_sizes': (100, )
                                 },
                                 **kwargs),
        fe.RandomForestFeatureExtractor(
            one_vs_rest=True,
            classifier_kwargs={'n_estimators': 500},
            **kwargs),
        fe.KLFeatureExtractor(**kwargs),
        fe.MlpFeatureExtractor(classifier_kwargs={
            'hidden_layer_sizes': (120, ),
            'solver': 'adam',
            'max_iter': 1000000
        },
                               activation=relprop.relu,
                               **kwargs),
    ]

    postprocessors = []
    for extractor in feature_extractors:

        tmp_pp = []
        for i_run in range(n_runs):
            extractor.extract_features()
            # Post-process data (rescale and filter feature importances)
            p = extractor.postprocessing(working_dir=working_dir,
                                         rescale_results=True,
                                         filter_results=False,
                                         feature_to_resids=None,
                                         pdb_file=pdb_file)
            p.average().evaluate_performance()
            p.persist()

            # Collect the postprocessor for this run
            tmp_pp.append(p)

        postprocessors.append(tmp_pp)

    visualization.visualize(
        postprocessors,
        show_importance=True,
        show_projected_data=False,
        show_performance=False,
        highlighted_residues=common_peaks,
        outfile="{}/importance-per-residue.png".format(working_dir))
    logger.info("Done")
Example #6
def run_beta2(
        working_dir="bio_input/beta2/",
        n_iterations=1,
        n_splits=1,
        shuffle_datasets=True,
        overwrite=False,
        dt=1,
        feature_type="ca_inv",  # "closest-heavy_inv", "CA_inv", "cartesian_ca", "cartesian_noh" or "compact_ca_inv"
        filetype="svg",
        classtype="multiclass",
        supervised=True,
        load_trajectory_for_predictions=False,
        filter_by_distance_cutoff=False,
        ligand_type='holo'):
    results_dir = "{}/results/{}/{}/{}/".format(
        working_dir, classtype, feature_type,
        "cutoff" if filter_by_distance_cutoff else "nocutoff")
    samples_dir = "{}/samples/{}/{}".format(working_dir, classtype,
                                            feature_type)
    data = np.load("{}/samples_dt{}.npz".format(samples_dir, dt))['array']
    feature_to_resids = np.load("{}/feature_to_resids.npy".format(samples_dir))
    labels = np.loadtxt(
        "{wd}/cluster_indices/{ct}/cluster_indices_dt{dt}.txt".format(
            wd=working_dir, ct=classtype, dt=dt))
    if classtype == "multiclass":
        label_names = ["agonist-bound", "protonated-asp79"]
        mixed_classes = True
    else:
        label_names = ["apo", "holo"]
        mixed_classes = False
    suffix = str(-1) + "clusters_" + str(n_iterations) + "iterations_" \
             + ("distance-cutoff_" if filter_by_distance_cutoff else "") + feature_type
    labels -= labels.min()
    if len(data) != len(labels) or data.shape[1] != len(feature_to_resids):
        raise Exception(
            "Inconsistent input data: the number of frames or the number of features does not match"
        )
    logger.info("Loaded data of shape %s for feature type %s", data.shape,
                feature_type)

    # ## Define the different methods to use
    # Every method is encapsulated in a so called FeatureExtractor class which all follow the same interface
    cutoff_offset = 0.2 if "closest-heavy" in feature_type else 0
    kwargs = {
        'samples': data,
        'labels': labels,
        'label_names': label_names,
        'filter_by_distance_cutoff': filter_by_distance_cutoff,
        'lower_bound_distance_cutoff': filtering.lower_bound_distance_cutoff_default - cutoff_offset,
        'upper_bound_distance_cutoff': filtering.upper_bound_distance_cutoff_default - cutoff_offset,
        'use_inverse_distances': True,
        'n_splits': n_splits,
        'n_iterations': n_iterations,
        'shuffle_datasets': shuffle_datasets
        # 'upper_bound_distance_cutoff': 1.,
        # 'lower_bound_distance_cutoff': 1.
    }
    unsupervised_feature_extractors = [
        fe.PCAFeatureExtractor(
            classifier_kwargs={'n_components': None},
            variance_cutoff='auto',
            # variance_cutoff='1_components',
            name='PCA',
            **kwargs),
        fe.RbmFeatureExtractor(classifier_kwargs={'n_components': 1},
                               relevance_method='from_lrp',
                               name='RBM',
                               **kwargs),
        # fe.MlpAeFeatureExtractor(
        #     classifier_kwargs={
        #         'hidden_layer_sizes': (100, 30, 2, 30, 100,),  # int(data.shape[1]/2),),
        #         # max_iter=10000,
        #         'alpha': 0.01,
        #         'activation': "logistic"
        #     },
        #     use_reconstruction_for_lrp=True,
        #     **kwargs),
    ]
    if load_trajectory_for_predictions:
        other_samples, other_labels = _load_trajectory_for_predictions(
            ligand_type)
    else:
        other_samples, other_labels = None, None
    supervised_feature_extractors = [
        # fe.ElmFeatureExtractor(
        #     activation="relu",
        #     n_nodes=data.shape[1] * 2,
        #     alpha=0.1,
        #     **kwargs),
        fe.KLFeatureExtractor(**kwargs),
        fe.RandomForestFeatureExtractor(
            one_vs_rest=True,
            classifier_kwargs={'n_estimators': 500},
            **kwargs),
        fe.MlpFeatureExtractor(
            name="MLP" if other_samples is None else
            "MLP_predictor_{}".format(ligand_type),
            classifier_kwargs={
                # 'hidden_layer_sizes': [int(min(100, data.shape[1]) / (i + 1)) + 1 for i in range(3)],
                'hidden_layer_sizes': (30, ),
                # 'max_iter': 10000,
                'alpha': 0.1,
                'activation': "relu"
            },
            # per_frame_importance_samples=other_samples,
            # per_frame_importance_labels=other_labels,
            # per_frame_importance_outfile="/home/oliverfl/projects/gpcr/mega/Result_Data/beta2-dror/apo-holo/trajectories"
            #                              "/mlp_perframe_importance_{}/"
            #                              "{}_mlp_perframeimportance_{}clusters_{}cutoff.txt"
            #     .format(ligand_type, feature_type, nclusters, "" if filter_by_distance_cutoff else "no"),
            **kwargs),
    ]

    if supervised is None:
        feature_extractors = unsupervised_feature_extractors + supervised_feature_extractors
    else:
        feature_extractors = supervised_feature_extractors if supervised else unsupervised_feature_extractors
    logger.info("Done. using %s feature extractors", len(feature_extractors))
    highlighted_residues = _get_important_residues(supervised, feature_type)
    # # Run the relevance analysis
    postprocessors = []
    for extractor in feature_extractors:
        do_computations = True
        if os.path.exists(results_dir):
            existing_files = glob.glob(
                "{}/{}/importance_per_residue.npy".format(
                    results_dir, extractor.name))
            if len(existing_files) > 0 and not overwrite:
                logger.debug("File %s already exists. skipping computations",
                             existing_files[0])
                do_computations = False
        if do_computations:
            logger.info("Computing relevance for extractors %s",
                        extractor.name)
            extractor.extract_features()
        p = extractor.postprocessing(
            working_dir=results_dir,
            pdb_file=working_dir + "/trajectories/all.pdb",
            # pdb_file=working_dir + "/trajectories/protein_noh.pdb",
            feature_to_resids=feature_to_resids,
            filter_results=False)
        if do_computations:
            p.average()
            p.evaluate_performance()
            p.persist()
        else:
            p.load()

        postprocessors.append([p])
        # # Visualize results
        visualization.visualize(
            [[p]],
            show_importance=True,
            show_performance=False,
            show_projected_data=False,
            mixed_classes=mixed_classes,
            highlighted_residues=highlighted_residues,
            outfile=results_dir +
            "/{extractor}/importance_per_residue_{suffix}_{extractor}.{filetype}"
            .format(suffix=suffix, extractor=extractor.name,
                    filetype=filetype))

        if do_computations:
            visualization.visualize(
                [[p]],
                show_importance=False,
                show_performance=True,
                show_projected_data=False,
                mixed_classes=mixed_classes,
                outfile=results_dir +
                "/{extractor}/performance_{suffix}_{extractor}.{filetype}".
                format(suffix=suffix,
                       extractor=extractor.name,
                       filetype=filetype))
            visualization.visualize(
                [[p]],
                show_importance=False,
                show_performance=False,
                show_projected_data=True,
                mixed_classes=mixed_classes,
                outfile=results_dir +
                "/{extractor}/projected_data_{suffix}_{extractor}.{filetype}".
                format(suffix=suffix,
                       extractor=extractor.name,
                       filetype=filetype))
    logger.info(
        "Done. The settings were n_iterations = {n_iterations}, n_splits = {n_splits}."
        "\nFiltering (filter_by_distance_cutoff={filter_by_distance_cutoff})".
        format(**kwargs))
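run_beta2 likewise assumes a specific directory layout; a sketch that assembles the expected input paths (taken from the format strings inside the function) and verifies them before the call:

import os

working_dir, classtype, feature_type, dt = "bio_input/beta2/", "multiclass", "ca_inv", 1
samples_dir = "{}/samples/{}/{}".format(working_dir, classtype, feature_type)
expected = [
    "{}/samples_dt{}.npz".format(samples_dir, dt),          # must contain the key 'array'
    "{}/feature_to_resids.npy".format(samples_dir),
    "{}/cluster_indices/{}/cluster_indices_dt{}.txt".format(working_dir, classtype, dt),
    "{}/trajectories/all.pdb".format(working_dir),
]
for path in expected:
    if not os.path.exists(path):
        raise FileNotFoundError("Expected input file not found: " + path)
run_beta2(working_dir=working_dir, classtype=classtype, feature_type=feature_type, dt=dt)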