Example 1
def create_PCA_feature_extractors(
        extractor_kwargs,
        variance_cutoffs=["auto", "1_components", "2_components", 50, 100]):
    return [
        fe.PCAFeatureExtractor(name="{}-cutoff".format(cutoff),
                               variance_cutoff=cutoff,
                               **extractor_kwargs)
        for cutoff in variance_cutoffs
    ]
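
A minimal usage sketch for this helper, assuming extractor_kwargs carries the same keys that the other examples pass to the extractors (samples, labels, n_splits, n_iterations); the arrays below are placeholders:

import numpy as np

# Placeholder data; in practice these come from the simulation trajectories.
data = np.random.rand(500, 100)          # (n_frames, n_features)
labels = np.random.randint(0, 2, 500)    # one cluster label per frame

# Assumed to mirror the kwargs dicts used in the examples below.
extractor_kwargs = {
    'samples': data,
    'labels': labels,
    'n_splits': 1,
    'n_iterations': 10,
}
for extractor in create_PCA_feature_extractors(extractor_kwargs):
    extractor.extract_features()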
Example 2
def run_beta2(
        working_dir="bio_input/beta2/",
        n_iterations=1,
        n_splits=1,
        shuffle_datasets=True,
        overwrite=False,
        dt=1,
        feature_type="ca_inv",  # "closest-heavy_inv", "CA_inv", "cartesian_ca", "cartesian_noh" or "compact_ca_inv"
        filetype="svg",
        classtype="multiclass",
        supervised=True,
        load_trajectory_for_predictions=False,
        filter_by_distance_cutoff=False,
        ligand_type='holo'):
    results_dir = "{}/results/{}/{}/{}/".format(working_dir, classtype, feature_type,
                                                "cutoff" if filter_by_distance_cutoff else "nocutoff")
    samples_dir = "{}/samples/{}/{}".format(working_dir, classtype, feature_type)
    data = np.load("{}/samples_dt{}.npz".format(samples_dir, dt))['array']
    feature_to_resids = np.load("{}/feature_to_resids.npy".format(samples_dir))
    labels = np.loadtxt("{wd}/cluster_indices/{ct}/cluster_indices_dt{dt}.txt".format(wd=working_dir,
                                                                                      ct=classtype,
                                                                                      dt=dt))
    if classtype == "multiclass":
        label_names = ["agonist-bound", "protonated-asp79"]
        mixed_classes = True
    else:
        label_names = ["apo", "holo"]
        mixed_classes = False
    suffix = str(-1) + "clusters_" + str(n_iterations) + "iterations_" \
             + ("distance-cutoff_" if filter_by_distance_cutoff else "") + feature_type
    labels -= labels.min()
    if len(data) != len(labels) or data.shape[1] != len(feature_to_resids):
        raise Exception("Inconsistent input data: the number of frames or the number of features does not match")
    logger.info("Loaded data of shape %s for feature type %s", data.shape, feature_type)

    # ## Define the different methods to use
    # Every method is encapsulated in a so-called FeatureExtractor class; they all follow the same interface
    cutoff_offset = 0.2 if "closest-heavy" in feature_type else 0
    kwargs = {
        'samples': data,
        'labels': labels,
        'label_names': label_names,
        'filter_by_distance_cutoff': filter_by_distance_cutoff,
        'lower_bound_distance_cutoff': filtering.lower_bound_distance_cutoff_default - cutoff_offset,
        'upper_bound_distance_cutoff': filtering.upper_bound_distance_cutoff_default - cutoff_offset,
        'use_inverse_distances': True,
        'n_splits': n_splits,
        'n_iterations': n_iterations,
        'shuffle_datasets': shuffle_datasets
        # 'upper_bound_distance_cutoff': 1.,
        # 'lower_bound_distance_cutoff': 1.
    }
    unsupervised_feature_extractors = [
        fe.PCAFeatureExtractor(classifier_kwargs={'n_components': None},
                               variance_cutoff='auto',
                               # variance_cutoff='1_components',
                               name='PCA',
                               **kwargs),
        fe.RbmFeatureExtractor(classifier_kwargs={'n_components': 1},
                               relevance_method='from_lrp',
                               name='RBM',
                               **kwargs),
        # fe.MlpAeFeatureExtractor(
        #     classifier_kwargs={
        #         'hidden_layer_sizes': (100, 30, 2, 30, 100,),  # int(data.shape[1]/2),),
        #         # max_iter=10000,
        #         'alpha': 0.01,
        #         'activation': "logistic"
        #     },
        #     use_reconstruction_for_lrp=True,
        #     **kwargs),
    ]
    if load_trajectory_for_predictions:
        other_samples, other_labels = _load_trajectory_for_predictions(ligand_type)
    else:
        other_samples, other_labels = None, None
    supervised_feature_extractors = [
        # fe.ElmFeatureExtractor(
        #     activation="relu",
        #     n_nodes=data.shape[1] * 2,
        #     alpha=0.1,
        #     **kwargs),
        fe.KLFeatureExtractor(**kwargs),
        fe.RandomForestFeatureExtractor(
            one_vs_rest=True,
            classifier_kwargs={'n_estimators': 500},
            **kwargs),
        fe.MlpFeatureExtractor(
            name="MLP" if other_samples is None else "MLP_predictor_{}".format(ligand_type),
            classifier_kwargs={
                # 'hidden_layer_sizes': [int(min(100, data.shape[1]) / (i + 1)) + 1 for i in range(3)],
                'hidden_layer_sizes': (30,),
                # 'max_iter': 10000,
                'alpha': 0.1,
                'activation': "relu"
            },
            # per_frame_importance_samples=other_samples,
            # per_frame_importance_labels=other_labels,
            # per_frame_importance_outfile="/home/oliverfl/projects/gpcr/mega/Result_Data/beta2-dror/apo-holo/trajectories"
            #                              "/mlp_perframe_importance_{}/"
            #                              "{}_mlp_perframeimportance_{}clusters_{}cutoff.txt"
            #     .format(ligand_type, feature_type, nclusters, "" if filter_by_distance_cutoff else "no"),
            **kwargs),
    ]

    if supervised is None:
        feature_extractors = unsupervised_feature_extractors + supervised_feature_extractors
    else:
        feature_extractors = supervised_feature_extractors if supervised else unsupervised_feature_extractors
    logger.info("Done. using %s feature extractors", len(feature_extractors))
    highlighted_residues = _get_important_residues(supervised, feature_type)
    # # Run the relevance analysis
    postprocessors = []
    for extractor in feature_extractors:
        do_computations = True
        if os.path.exists(results_dir):
            existing_files = glob.glob("{}/{}/importance_per_residue.npy".format(results_dir, extractor.name))
            if len(existing_files) > 0 and not overwrite:
                logger.debug("File %s already exists. skipping computations", existing_files[0])
                do_computations = False
        if do_computations:
            logger.info("Computing relevance for extractors %s", extractor.name)
            extractor.extract_features()
        p = extractor.postprocessing(working_dir=results_dir,
                                     pdb_file=working_dir + "/trajectories/all.pdb",
                                     # pdb_file=working_dir + "/trajectories/protein_noh.pdb",
                                     feature_to_resids=feature_to_resids,
                                     filter_results=False)
        if do_computations:
            p.average()
            p.evaluate_performance()
            p.persist()
        else:
            p.load()

        postprocessors.append([p])
        # # Visualize results
        visualization.visualize([[p]],
                                show_importance=True,
                                show_performance=False,
                                show_projected_data=False,
                                mixed_classes=mixed_classes,
                                highlighted_residues=highlighted_residues,
                                outfile=results_dir + "/{extractor}/importance_per_residue_{suffix}_{extractor}.{filetype}".format(
                                    suffix=suffix,
                                    extractor=extractor.name,
                                    filetype=filetype))

        if do_computations:
            visualization.visualize([[p]],
                                    show_importance=False,
                                    show_performance=True,
                                    show_projected_data=False,
                                    mixed_classes=mixed_classes,
                                    outfile=results_dir + "/{extractor}/performance_{suffix}_{extractor}.{filetype}".format(
                                        suffix=suffix,
                                        extractor=extractor.name,
                                        filetype=filetype))
            visualization.visualize([[p]],
                                    show_importance=False,
                                    show_performance=False,
                                    show_projected_data=True,
                                    mixed_classes=mixed_classes,
                                    outfile=results_dir + "/{extractor}/projected_data_{suffix}_{extractor}.{filetype}".format(
                                        suffix=suffix,
                                        extractor=extractor.name,
                                        filetype=filetype))
    logger.info("Done. The settings were n_iterations = {n_iterations}, n_splits = {n_splits}."
                "\nFiltering (filter_by_distance_cutoff={filter_by_distance_cutoff})".format(**kwargs))
Example 3
def run_CaM(parser):
    # Known important residues
    common_peaks = [109, 144, 124, 145, 128, 105, 112, 136, 108, 141, 92]

    shuffle_data = True

    args = parser.parse_args()
    working_dir = args.out_directory
    n_runs = args.number_of_runs
    samples = np.load(args.feature_list)

    cluster_indices = np.loadtxt(args.cluster_indices)

    # Shift cluster indices to start at 0
    cluster_indices -= cluster_indices.min()

    if shuffle_data:
        # Permute blocks of 100 frames
        n_samples = samples.shape[0]
        n_samples = int(n_samples / 100) * 100
        inds = np.arange(n_samples)
        inds = inds.reshape((int(n_samples / 100), 100))
        perm_inds = np.random.permutation(inds)
        perm_inds = np.ravel(perm_inds)

        samples = samples[perm_inds]
        cluster_indices = cluster_indices[perm_inds]

    pdb_file = args.pdb_file

    labels = cluster_indices

    lower_distance_cutoff = 1.0
    upper_distance_cutoff = 1.0
    n_components = 20

    # Check if samples format is correct
    if len(samples.shape) != 2:
        sys.exit("Matrix with features should have 2 dimensions")

    kwargs = {
        'samples': samples,
        'labels': labels,
        'filter_by_distance_cutoff': True,
        'lower_bound_distance_cutoff': lower_distance_cutoff,
        'upper_bound_distance_cutoff': upper_distance_cutoff,
        'use_inverse_distances': True,
        'n_splits': args.number_of_k_splits,
        'n_iterations': args.number_of_iterations,
        'scaling': True
    }

    feature_extractors = [
        fe.PCAFeatureExtractor(variance_cutoff=0.75, **kwargs),
        fe.RbmFeatureExtractor(relevance_method="from_components", **kwargs),
        fe.MlpAeFeatureExtractor(activation=relprop.relu,
                                 classifier_kwargs={
                                     'solver': 'adam',
                                     'hidden_layer_sizes': (100, )
                                 },
                                 **kwargs),
        fe.RandomForestFeatureExtractor(
            one_vs_rest=True,
            classifier_kwargs={'n_estimators': 500},
            **kwargs),
        fe.KLFeatureExtractor(**kwargs),
        fe.MlpFeatureExtractor(classifier_kwargs={
            'hidden_layer_sizes': (120, ),
            'solver': 'adam',
            'max_iter': 1000000
        },
                               activation=relprop.relu,
                               **kwargs),
    ]

    postprocessors = []
    for extractor in feature_extractors:

        tmp_pp = []
        for i_run in range(n_runs):
            extractor.extract_features()
            # Post-process data (rescale and filter feature importances)
            p = extractor.postprocessing(working_dir=working_dir,
                                         rescale_results=True,
                                         filter_results=False,
                                         feature_to_resids=None,
                                         pdb_file=pdb_file)
            p.average().evaluate_performance()
            p.persist()

            # Add common peaks
            tmp_pp.append(p)

        postprocessors.append(tmp_pp)

    visualization.visualize(
        postprocessors,
        show_importance=True,
        show_projected_data=False,
        show_performance=False,
        highlighted_residues=common_peaks,
        outfile="{}/importance-per-residue.png".format(working_dir))
    logger.info("Done")
def run_toy_model(dg,
                  data,
                  labels,
                  supervised=True,
                  filetype="svg",
                  n_iterations=10,
                  variance_cutoff="1_components"):
    cluster_indices = labels.argmax(axis=1)
    feature_to_resids = dg.feature_to_resids()
    suffix = dg.test_model + "_" + dg.feature_type \
             + ("_supervised" if supervised else "_unsupervised") \
             + ("_var-cutoff=" + str(variance_cutoff) if not supervised else "")
    kwargs = {
        'samples': data,
        'labels': cluster_indices,
        'filter_by_distance_cutoff': False,
        'use_inverse_distances': True,
        'n_splits': 1,
        'n_iterations': n_iterations,
        # 'upper_bound_distance_cutoff': 1.,
        # 'lower_bound_distance_cutoff': 1.
    }

    supervised_feature_extractors = [
        fe.MlpFeatureExtractor(
            activation="relu",
            classifier_kwargs={
                # 'hidden_layer_sizes': (dg.natoms, dg.nclusters * 2),
                'hidden_layer_sizes': (int(dg.natoms / 2), ),
                # 'hidden_layer_sizes': [int(min(dg.nfeatures, 100) / (i + 1)) for i in range(10)],
                'max_iter': 10000,
                'alpha': 0.001,
            },
            per_frame_importance_outfile="output/toy_model_perframe.txt",
            one_vs_rest=True,
            **kwargs),
        # fe.ElmFeatureExtractor(
        #     activation="relu",
        #     classifier_kwargs={
        #         'hidden_layer_sizes': (dg.nfeatures,),
        #         'alpha': 50,
        #     },
        #     **kwargs),
        fe.KLFeatureExtractor(**kwargs),
        fe.RandomForestFeatureExtractor(
            one_vs_rest=True,
            classifier_kwargs={'n_estimators': 100},
            **kwargs),
    ]
    unsupervised_feature_extractors = [
        fe.MlpAeFeatureExtractor(
            classifier_kwargs={
                # hidden_layer_sizes=(int(data.shape[1]/2),),
                'hidden_layer_sizes': (dg.nclusters, ),
                # 'hidden_layer_sizes': (10, 5, 1, 5, 10,),
                # hidden_layer_sizes=(100, 1, 100,),
                # hidden_layer_sizes=(200, 50, 10, 1, 10, 50, 200, ),
                'max_iter': 100000,
                # hidden_layer_sizes=(300, 200, 50, 10, 1, 10, 50, 200, 300,),
                # max_iter=10000,
                # 'alpha': 0.0001,
                'alpha': 1,
                'solver': "adam",
            },
            use_reconstruction_for_lrp=True,
            activation="logistic",
            **kwargs),
        fe.PCAFeatureExtractor(classifier_kwargs={'n_components': None},
                               variance_cutoff=variance_cutoff,
                               name='PCA',
                               **kwargs),
        # fe.RbmFeatureExtractor(classifier_kwargs={'n_components': dg.nclusters},
        #                        relevance_method='from_lrp',
        #                        name='RBM',
        #                        **kwargs),
    ]
    feature_extractors = supervised_feature_extractors if supervised else unsupervised_feature_extractors
    logger.info("Done. using %s feature extractors", len(feature_extractors))
    postprocessors = []
    filter_results = False

    for extractor in feature_extractors:
        extractor.error_limit = 50
        logger.info("Computing relevance for extractors %s", extractor.name)
        extractor.extract_features()
        p = extractor.postprocessing(working_dir="./{}".format(extractor.name),
                                     pdb_file=None,
                                     feature_to_resids=feature_to_resids,
                                     filter_results=filter_results)
        p.average()
        p.evaluate_performance()
        p.persist()
        postprocessors.append([p])
    logger.info("Done")

    logger.info(
        "Actual atoms moved: %s.\n(Cluster generation method %s. Noise level=%s, displacement=%s. frames/cluster=%s)",
        sorted(dg.moved_atoms), dg.test_model, dg.noise_level, dg.displacement,
        dg.nframes_per_cluster)

    visualization.visualize(
        postprocessors,
        show_importance=True,
        show_performance=False,
        show_projected_data=False,
        highlighted_residues=dg.moved_atoms,
        outfile="output/test_importance_per_residue_{suffix}.{filetype}".
        format(suffix=suffix, filetype=filetype))
    # visualization.visualize(postprocessors,
    #                         show_importance=False,
    #                         show_performance=True,
    #                         show_projected_data=False,
    #                         outfile="output/test_performance_{suffix}.{filetype}".format(suffix=suffix,
    #                                                                                      filetype=filetype))
    # visualization.visualize(postprocessors,
    #                         show_importance=False,
    #                         show_performance=False,
    #                         show_projected_data=True,
    #                         outfile="output/test_projection_{suffix}.{filetype}".format(suffix=suffix,
    #                                                                                     filetype=filetype))
    logger.info(
        "Done. The settings were n_iterations = {n_iterations}, n_splits = {n_splits}."
        "\nFiltering (filter_by_distance_cutoff={filter_by_distance_cutoff})".
        format(**kwargs))
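
run_toy_model expects one-hot encoded labels (it calls labels.argmax(axis=1)) and a data-generator object dg exposing the attributes used above. A purely illustrative stub with every value assumed:

import numpy as np
from types import SimpleNamespace

n_frames, n_features, n_clusters = 200, 30, 2
data = np.random.rand(n_frames, n_features)
labels = np.eye(n_clusters)[np.random.randint(0, n_clusters, n_frames)]  # one-hot labels

# Stand-in for the toy-model data generator; only the attribute names are
# taken from run_toy_model above, all values here are made up.
dg = SimpleNamespace(
    test_model="linear",
    feature_type="cartesian",
    natoms=n_features,
    nclusters=n_clusters,
    nfeatures=n_features,
    moved_atoms=[1, 2, 3],
    noise_level=0.1,
    displacement=1.0,
    nframes_per_cluster=n_frames // n_clusters,
    feature_to_resids=lambda: np.arange(n_features))

# run_toy_model(dg, data, labels, supervised=True)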