Example #1
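These snippets are excerpted functions and rely on module-level imports that are not shown on this page. Below is a minimal sketch of what those imports might look like; the names are inferred purely from how they are used in the code (np, sys, os, glob, md, fe, relprop, visualization, configuration, filtering, tp, DataGenerator, logger), and the package paths in the commented lines are assumptions that may differ in your checkout.

import os
import sys
import glob
import logging

import numpy as np
import mdtraj as md

# Assumed project-specific imports (paths are guesses based on usage below):
# fe exposes the *FeatureExtractor classes, relprop the activation constants,
# visualization the visualize() helper, configuration/filtering/tp the project
# utilities, and DataGenerator the toy-model generator.
# from demystifying import feature_extraction as fe, visualization, relevance_propagation as relprop
# from demystifying import filtering, configuration, traj_preprocessing as tp
# from demystifying.data_generation import DataGenerator

# One way to provide the logger these snippets reference
logger = logging.getLogger(__name__)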
def run_CaM(parser):
    # Known important residues
    common_peaks = [109, 144, 124, 145, 128, 105, 112, 136, 108, 141, 92]

    shuffle_data = True

    args = parser.parse_args()
    working_dir = args.out_directory
    n_runs = args.number_of_runs
    samples = np.load(args.feature_list)

    cluster_indices = np.loadtxt(args.cluster_indices)

    # Shift cluster indices to start at 0
    cluster_indices -= cluster_indices.min()

    if shuffle_data:
        # Permute blocks of 100 frames
        n_samples = samples.shape[0]
        n_samples = int(n_samples / 100) * 100
        inds = np.arange(n_samples)
        inds = inds.reshape((int(n_samples / 100), 100))
        perm_inds = np.random.permutation(inds)
        perm_inds = np.ravel(perm_inds)

        samples = samples[perm_inds]
        cluster_indices = cluster_indices[perm_inds]

    pdb_file = args.pdb_file

    labels = cluster_indices

    lower_distance_cutoff = 1.0
    upper_distance_cutoff = 1.0
    n_components = 20

    # Check that the feature matrix has the expected 2D shape
    if len(samples.shape) != 2:
        sys.exit("The feature matrix must have exactly 2 dimensions")

    kwargs = {
        'samples': samples,
        'labels': labels,
        'filter_by_distance_cutoff': True,
        'lower_bound_distance_cutoff': lower_distance_cutoff,
        'upper_bound_distance_cutoff': upper_distance_cutoff,
        'use_inverse_distances': True,
        'n_splits': args.number_of_k_splits,
        'n_iterations': args.number_of_iterations,
        'scaling': True
    }

    feature_extractors = [
        fe.PCAFeatureExtractor(variance_cutoff=0.75, **kwargs),
        fe.RbmFeatureExtractor(relevance_method="from_components", **kwargs),
        fe.MlpAeFeatureExtractor(activation=relprop.relu,
                                 classifier_kwargs={
                                     'solver': 'adam',
                                     'hidden_layer_sizes': (100, )
                                 },
                                 **kwargs),
        fe.RandomForestFeatureExtractor(
            one_vs_rest=True,
            classifier_kwargs={'n_estimators': 500},
            **kwargs),
        fe.KLFeatureExtractor(**kwargs),
        fe.MlpFeatureExtractor(classifier_kwargs={
            'hidden_layer_sizes': (120, ),
            'solver': 'adam',
            'max_iter': 1000000
        },
                               activation=relprop.relu,
                               **kwargs),
    ]

    postprocessors = []
    for extractor in feature_extractors:

        tmp_pp = []
        for i_run in range(n_runs):
            extractor.extract_features()
            # Post-process data (rescale and filter feature importances)
            p = extractor.postprocessing(working_dir=working_dir,
                                         rescale_results=True,
                                         filter_results=False,
                                         feature_to_resids=None,
                                         pdb_file=pdb_file)
            p.average().evaluate_performance()
            p.persist()

            # Collect this run's postprocessor
            tmp_pp.append(p)

        postprocessors.append(tmp_pp)

    visualization.visualize(
        postprocessors,
        show_importance=True,
        show_projected_data=False,
        show_performance=False,
        highlighted_residues=common_peaks,
        outfile="{}/importance-per-residue.png".format(working_dir))
    logger.info("Done")
Example #2
def compute(
        extractor_type,
        n_splits=1,
        n_iterations=10,
        feature_type='cartesian_rot',
        iterations_per_model=10,
        test_model='linear',
        overwrite=False,
        accuracy_method='mse',
        displacement=1e-1,
        visualize=True,
        noise_level=1e-2,  # [1e-2, 1e-2, 2e-1, 2e-1],
        output_dir="output/benchmarking/"):
    """

    :param extractor_type:
    :param n_splits:
    :param n_iterations:
    :param feature_type:
    :param iterations_per_model:
    :param test_model:
    :param overwrite:
    :param displacement: for toy model important atoms
    :param noise_level: for toy model frame generation
    :param output_dir:
    :return: postprocessors (np.array of dim iterations_per_model, nfeature_extractors)
    """
    all_postprocessors = []
    extractor_names = configuration.get_feature_extractors_names(
        extractor_type, n_splits=n_splits, n_iterations=n_iterations)
    n_extractors = len(extractor_names)
    for i_iter in range(iterations_per_model):
        modeldir = "{output_dir}/{extractor_type}/{feature_type}/{test_model}/noise-{noise_level}/iter-{iter}/".format(
            output_dir=output_dir,
            extractor_type=extractor_type,
            feature_type=feature_type,
            test_model=test_model,
            noise_level=noise_level,
            iter=i_iter)

        finished_extractors = []
        for name in extractor_names:
            if not overwrite and os.path.exists(modeldir):
                filepath = "{}/{}/importance_per_residue.npy".format(
                    modeldir, name)
                existing_files = glob.glob(filepath)
                if len(existing_files) > 0:
                    logger.debug(
                        "File %s already exists, skipping computations",
                        existing_files[0])
                    finished_extractors.append(name)
                else:
                    logger.debug(
                        "File %s does not exist, performing computations",
                        filepath)
            else:
                # exist_ok avoids a FileExistsError when the directory already exists
                os.makedirs(modeldir, exist_ok=True)
        needs_computations = len(finished_extractors) < n_extractors
        dg = DataGenerator(
            natoms=100,
            nclusters=3,
            natoms_per_cluster=[10, 10, 10],
            nframes_per_cluster=1200 if needs_computations else 2,
            # Faster generation for postprocessing purposes when we don't need the frames
            test_model=test_model,
            noise_natoms=None,
            noise_level=noise_level,
            displacement=displacement,
            feature_type=feature_type)
        samples, labels = dg.generate_data()
        cluster_indices = labels.argmax(axis=1)

        feature_extractors = configuration.create_feature_extractors(
            extractor_type,
            samples=samples,
            labels=cluster_indices,
            n_splits=n_splits,
            n_iterations=n_iterations)
        # First we run the computations if necessary
        for i_extractor, extractor in enumerate(feature_extractors):
            if extractor.name in finished_extractors:
                continue

            extractor.extract_features()
            pp = extractor.postprocessing(
                predefined_relevant_residues=dg.moved_atoms,
                rescale_results=True,
                filter_results=False,
                working_dir=modeldir,
                accuracy_method=accuracy_method,
                feature_to_resids=dg.feature_to_resids())
            pp.average()
            pp.evaluate_performance()
            logger.debug("Saving feature importance")
            pp.persist()
            logger.info("Accuracy for %s: %s (%s)", extractor.name,
                        pp.accuracy, pp.accuracy_method)
            if visualize:
                visualization.visualize(
                    [[pp]],
                    show_importance=False,
                    show_performance=False,
                    show_projected_data=True,
                    outfile="{}/{}/projected_data.svg".format(
                        modeldir, extractor.name),
                    highlighted_residues=np.array(
                        pp.predefined_relevant_residues).flatten(),
                    show_average=False)
                visualization.visualize(
                    [[pp]],
                    show_importance=False,
                    show_performance=True,
                    show_projected_data=False,
                    outfile="{}/{}/performance.svg".format(
                        modeldir, extractor.name),
                    highlighted_residues=np.array(
                        pp.predefined_relevant_residues).flatten(),
                    show_average=False)
            # Delete the extractor to free memory (regardless of whether figures were written)
            feature_extractors[i_extractor] = None
            del extractor

        # Then we run through them a second time, generating figures and loading data.
        # This gives a quick check that the data has been persisted correctly and
        # saves memory since we don't risk keeping any references to the data and classifiers
        feature_extractors = configuration.create_feature_extractors(
            extractor_type,
            samples=samples,
            labels=cluster_indices,
            n_splits=n_splits,
            n_iterations=n_iterations)
        all_postprocessors.append([])
        for i_extractor, extractor in enumerate(feature_extractors):
            pp = extractor.postprocessing(
                predefined_relevant_residues=dg.moved_atoms,
                rescale_results=True,
                filter_results=False,
                working_dir=modeldir,
                accuracy_method=accuracy_method,
                feature_to_resids=dg.feature_to_resids())
            pp.load()
            # Recompute performance to handle changes in the accuracy measure
            pp.compute_accuracy()
            logger.info("Accuracy for %s: %s (%s)", extractor.name,
                        pp.accuracy, pp.accuracy_method)
            if visualize:
                visualization.visualize(
                    [[pp]],
                    show_importance=True,
                    show_performance=False,
                    show_projected_data=False,
                    outfile="{}/{}/importance_per_residue.svg".format(
                        modeldir, extractor.name),
                    highlighted_residues=np.array(
                        pp.predefined_relevant_residues).flatten(),
                    show_average=False)
            all_postprocessors[-1].append(pp)

    return np.array(all_postprocessors)
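A minimal usage sketch for compute(); the extractor_type value below is a placeholder, since the accepted names are defined by configuration.get_feature_extractors_names() and are not shown in this snippet.

# Hypothetical invocation: benchmark one extractor configuration on the toy model
# postprocessors = compute(
#     extractor_type="KL",          # placeholder; must be a name known to configuration
#     n_splits=1,
#     n_iterations=10,
#     feature_type="cartesian_rot",
#     iterations_per_model=10,
#     test_model="linear",
#     noise_level=1e-2,
#     output_dir="output/benchmarking/")
# print(postprocessors.shape)       # (iterations_per_model, n_feature_extractors)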
def run_toy_model(dg,
                  data,
                  labels,
                  supervised=True,
                  filetype="svg",
                  n_iterations=10,
                  variance_cutoff="1_components"):
    cluster_indices = labels.argmax(axis=1)
    feature_to_resids = dg.feature_to_resids()
    suffix = dg.test_model + "_" + dg.feature_type \
             + ("_supervised" if supervised else "_unsupervised") \
             + ("_var-cutoff=" + str(variance_cutoff) if not supervised else "")
    kwargs = {
        'samples': data,
        'labels': cluster_indices,
        'filter_by_distance_cutoff': False,
        'use_inverse_distances': True,
        'n_splits': 1,
        'n_iterations': n_iterations,
        # 'upper_bound_distance_cutoff': 1.,
        # 'lower_bound_distance_cutoff': 1.
    }

    supervised_feature_extractors = [
        fe.MlpFeatureExtractor(
            activation="relu",
            classifier_kwargs={
                # 'hidden_layer_sizes': (dg.natoms, dg.nclusters * 2),
                'hidden_layer_sizes': (int(dg.natoms / 2), ),
                # 'hidden_layer_sizes': [int(min(dg.nfeatures, 100) / (i + 1)) for i in range(10)],
                'max_iter': 10000,
                'alpha': 0.001,
            },
            per_frame_importance_outfile="output/toy_model_perframe.txt",
            one_vs_rest=True,
            **kwargs),
        # fe.ElmFeatureExtractor(
        #     activation="relu",
        #     classifier_kwargs={
        #         'hidden_layer_sizes': (dg.nfeatures,),
        #         'alpha': 50,
        #     },
        #     **kwargs),
        fe.KLFeatureExtractor(**kwargs),
        fe.RandomForestFeatureExtractor(
            one_vs_rest=True,
            classifier_kwargs={'n_estimators': 100},
            **kwargs),
    ]
    unsupervised_feature_extractors = [
        fe.MlpAeFeatureExtractor(
            classifier_kwargs={
                # hidden_layer_sizes=(int(data.shape[1]/2),),
                'hidden_layer_sizes': (dg.nclusters, ),
                # 'hidden_layer_sizes': (10, 5, 1, 5, 10,),
                # hidden_layer_sizes=(100, 1, 100,),
                # hidden_layer_sizes=(200, 50, 10, 1, 10, 50, 200, ),
                'max_iter': 100000,
                # hidden_layer_sizes=(300, 200, 50, 10, 1, 10, 50, 200, 300,),
                # max_iter=10000,
                # 'alpha': 0.0001,
                'alpha': 1,
                'solver': "adam",
            },
            use_reconstruction_for_lrp=True,
            activation="logistic",
            **kwargs),
        fe.PCAFeatureExtractor(classifier_kwargs={'n_components': None},
                               variance_cutoff=variance_cutoff,
                               name='PCA',
                               **kwargs),
        # fe.RbmFeatureExtractor(classifier_kwargs={'n_components': dg.nclusters},
        #                        relevance_method='from_lrp',
        #                        name='RBM',
        #                        **kwargs),
    ]
    feature_extractors = supervised_feature_extractors if supervised else unsupervised_feature_extractors
    logger.info("Done. using %s feature extractors", len(feature_extractors))
    postprocessors = []
    filter_results = False

    for extractor in feature_extractors:
        extractor.error_limit = 50
        logger.info("Computing relevance for extractors %s", extractor.name)
        extractor.extract_features()
        p = extractor.postprocessing(working_dir="./{}".format(extractor.name),
                                     pdb_file=None,
                                     feature_to_resids=feature_to_resids,
                                     filter_results=filter_results)
        p.average()
        p.evaluate_performance()
        p.persist()
        postprocessors.append([p])
    logger.info("Done")

    logger.info(
        "Actual atoms moved: %s.\n(Cluster generation method %s. Noise level=%s, displacement=%s. frames/cluster=%s)",
        sorted(dg.moved_atoms), dg.test_model, dg.noise_level, dg.displacement,
        dg.nframes_per_cluster)

    visualization.visualize(
        postprocessors,
        show_importance=True,
        show_performance=False,
        show_projected_data=False,
        highlighted_residues=dg.moved_atoms,
        outfile="output/test_importance_per_residue_{suffix}.{filetype}".
        format(suffix=suffix, filetype=filetype))
    # visualization.visualize(postprocessors,
    #                         show_importance=False,
    #                         show_performance=True,
    #                         show_projected_data=False,
    #                         outfile="output/test_performance_{suffix}.{filetype}".format(suffix=suffix,
    #                                                                                      filetype=filetype))
    # visualization.visualize(postprocessors,
    #                         show_importance=False,
    #                         show_performance=False,
    #                         show_projected_data=True,
    #                         outfile="output/test_projection_{suffix}.{filetype}".format(suffix=suffix,
    #                                                                                     filetype=filetype))
    logger.info(
        "Done. The settings were n_iterations = {n_iterations}, n_splits = {n_splits}."
        "\nFiltering (filter_by_distance_cutoff={filter_by_distance_cutoff})".
        format(**kwargs))
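A hedged sketch of how run_toy_model() could be driven. The DataGenerator parameters are copied from the call inside compute() above; the exact constructor signature is assumed to match that call.

# dg = DataGenerator(natoms=100,
#                    nclusters=3,
#                    natoms_per_cluster=[10, 10, 10],
#                    nframes_per_cluster=1200,
#                    test_model="linear",
#                    noise_natoms=None,
#                    noise_level=1e-2,
#                    displacement=1e-1,
#                    feature_type="cartesian_rot")
# data, labels = dg.generate_data()
# run_toy_model(dg, data, labels, supervised=True, n_iterations=10)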
Example #4
def run_beta2(
        working_dir="bio_input/beta2/",
        n_iterations=1,
        n_splits=1,
        shuffle_datasets=True,
        overwrite=False,
        dt=1,
        feature_type="ca_inv",  # "closest-heavy_inv", "CA_inv", "cartesian_ca", "cartesian_noh" or "compact_ca_inv"
        filetype="svg",
        classtype="multiclass",
        supervised=True,
        load_trajectory_for_predictions=False,
        filter_by_distance_cutoff=False,
        ligand_type='holo'):
    results_dir = "{}/results/{}/{}/{}/".format(working_dir, classtype, feature_type,
                                                "cutoff" if filter_by_distance_cutoff else "nocutoff")
    samples_dir = "{}/samples/{}/{}".format(working_dir, classtype, feature_type)
    data = np.load("{}/samples_dt{}.npz".format(samples_dir, dt))['array']
    feature_to_resids = np.load("{}/feature_to_resids.npy".format(samples_dir))
    labels = np.loadtxt("{wd}/cluster_indices/{ct}/cluster_indices_dt{dt}.txt".format(wd=working_dir,
                                                                                      ct=classtype,
                                                                                      dt=dt))
    if classtype == "multiclass":
        label_names = ["agonist-bound", "protonated-asp79"]
        mixed_classes = True
    else:
        label_names = ["apo", "holo"]
        mixed_classes = False
    suffix = str(-1) + "clusters_" + str(n_iterations) + "iterations_" \
             + ("distance-cutoff_" if filter_by_distance_cutoff else "") + feature_type
    labels -= labels.min()
    if len(data) != len(labels) or data.shape[1] != len(feature_to_resids):
        raise Exception("Inconsistent input data: the number of features or the number of frames does not match")
    logger.info("Loaded data of shape %s for feature type %s", data.shape, feature_type)

    # ## Define the different methods to use
    # Every method is encapsulated in a so-called FeatureExtractor class; they all follow the same interface
    cutoff_offset = 0.2 if "closest-heavy" in feature_type else 0
    kwargs = {
        'samples': data,
        'labels': labels,
        'label_names': label_names,
        'filter_by_distance_cutoff': filter_by_distance_cutoff,
        'lower_bound_distance_cutoff': filtering.lower_bound_distance_cutoff_default - cutoff_offset,
        'upper_bound_distance_cutoff': filtering.upper_bound_distance_cutoff_default - cutoff_offset,
        'use_inverse_distances': True,
        'n_splits': n_splits,
        'n_iterations': n_iterations,
        'shuffle_datasets': shuffle_datasets
        # 'upper_bound_distance_cutoff': 1.,
        # 'lower_bound_distance_cutoff': 1.
    }
    unsupervised_feature_extractors = [
        fe.PCAFeatureExtractor(classifier_kwargs={'n_components': None},
                               variance_cutoff='auto',
                               # variance_cutoff='1_components',
                               name='PCA',
                               **kwargs),
        fe.RbmFeatureExtractor(classifier_kwargs={'n_components': 1},
                               relevance_method='from_lrp',
                               name='RBM',
                               **kwargs),
        # fe.MlpAeFeatureExtractor(
        #     classifier_kwargs={
        #         'hidden_layer_sizes': (100, 30, 2, 30, 100,),  # int(data.shape[1]/2),),
        #         # max_iter=10000,
        #         'alpha': 0.01,
        #         'activation': "logistic"
        #     },
        #     use_reconstruction_for_lrp=True,
        #     **kwargs),
    ]
    if load_trajectory_for_predictions:
        other_samples, other_labels = _load_trajectory_for_predictions(ligand_type)
    else:
        other_samples, other_labels = None, None
    supervised_feature_extractors = [
        # fe.ElmFeatureExtractor(
        #     activation="relu",
        #     n_nodes=data.shape[1] * 2,
        #     alpha=0.1,
        #     **kwargs),
        fe.KLFeatureExtractor(**kwargs),
        fe.RandomForestFeatureExtractor(
            one_vs_rest=True,
            classifier_kwargs={'n_estimators': 500},
            **kwargs),
        fe.MlpFeatureExtractor(
            name="MLP" if other_samples is None else "MLP_predictor_{}".format(ligand_type),
            classifier_kwargs={
                # 'hidden_layer_sizes': [int(min(100, data.shape[1]) / (i + 1)) + 1 for i in range(3)],
                'hidden_layer_sizes': (30,),
                # 'max_iter': 10000,
                'alpha': 0.1,
                'activation': "relu"
            },
            # per_frame_importance_samples=other_samples,
            # per_frame_importance_labels=other_labels,
            # per_frame_importance_outfile="/home/oliverfl/projects/gpcr/mega/Result_Data/beta2-dror/apo-holo/trajectories"
            #                              "/mlp_perframe_importance_{}/"
            #                              "{}_mlp_perframeimportance_{}clusters_{}cutoff.txt"
            #     .format(ligand_type, feature_type, nclusters, "" if filter_by_distance_cutoff else "no"),
            **kwargs),
    ]

    if supervised is None:
        feature_extractors = unsupervised_feature_extractors + supervised_feature_extractors
    else:
        feature_extractors = supervised_feature_extractors if supervised else unsupervised_feature_extractors
    logger.info("Done. using %s feature extractors", len(feature_extractors))
    highlighted_residues = _get_important_residues(supervised, feature_type)
    # # Run the relevance analysis
    postprocessors = []
    for extractor in feature_extractors:
        do_computations = True
        if os.path.exists(results_dir):
            existing_files = glob.glob("{}/{}/importance_per_residue.npy".format(results_dir, extractor.name))
            if len(existing_files) > 0 and not overwrite:
                logger.debug("File %s already exists. skipping computations", existing_files[0])
                do_computations = False
        if do_computations:
            logger.info("Computing relevance for extractors %s", extractor.name)
            extractor.extract_features()
        p = extractor.postprocessing(working_dir=results_dir,
                                     pdb_file=working_dir + "/trajectories/all.pdb",
                                     # pdb_file=working_dir + "/trajectories/protein_noh.pdb",
                                     feature_to_resids=feature_to_resids,
                                     filter_results=False)
        if do_computations:
            p.average()
            p.evaluate_performance()
            p.persist()
        else:
            p.load()

        postprocessors.append([p])
        # # Visualize results
        visualization.visualize([[p]],
                                show_importance=True,
                                show_performance=False,
                                show_projected_data=False,
                                mixed_classes=mixed_classes,
                                highlighted_residues=highlighted_residues,
                                outfile=results_dir + "/{extractor}/importance_per_residue_{suffix}_{extractor}.{filetype}".format(
                                    suffix=suffix,
                                    extractor=extractor.name,
                                    filetype=filetype))

        if do_computations:
            visualization.visualize([[p]],
                                    show_importance=False,
                                    show_performance=True,
                                    show_projected_data=False,
                                    mixed_classes=mixed_classes,
                                    outfile=results_dir + "/{extractor}/performance_{suffix}_{extractor}.{filetype}".format(
                                        suffix=suffix,
                                        extractor=extractor.name,
                                        filetype=filetype))
            visualization.visualize([[p]],
                                    show_importance=False,
                                    show_performance=False,
                                    show_projected_data=True,
                                    mixed_classes=mixed_classes,
                                    outfile=results_dir + "/{extractor}/projected_data_{suffix}_{extractor}.{filetype}".format(
                                        suffix=suffix,
                                        extractor=extractor.name,
                                        filetype=filetype))
    logger.info("Done. The settings were n_iterations = {n_iterations}, n_splits = {n_splits}."
                "\nFiltering (filter_by_distance_cutoff={filter_by_distance_cutoff})".format(**kwargs))
def run_VSD(working_dir="bio_input/VSD/",
            cluster_for_prediction=None,
            dt_for_prediction=10,
            multiclass=False):
    data = np.load(working_dir + 'frame_i_j_contacts_dt1.npy')
    cluster_indices = np.loadtxt(working_dir + 'clusters_indices.dat')

    kwargs = {
        'samples': data,
        'labels': cluster_indices,
        'filter_by_distance_cutoff': True,
        'use_inverse_distances': True,
        'n_splits': 3,
        'n_iterations': 5,
        'scaling': True,
        'shuffle_datasets': True
    }

    if cluster_for_prediction is not None:
        cluster_traj = md.load("{}/{}_dt{}.xtc".format(working_dir,
                                                       cluster_for_prediction,
                                                       dt_for_prediction),
                               top=working_dir + "alpha.pdb")
        other_samples, _, _ = tp.to_distances(traj=cluster_traj,
                                              scheme="closest-heavy",
                                              pairs="all-residues",
                                              use_inverse_distances=True,
                                              ignore_nonprotein=True,
                                              periodic=True)
        logger.debug(
            "Loaded cluster samples for prediction of shape %s for state %s",
            other_samples.shape, cluster_for_prediction)
        cluster_traj = None  # free memory
    else:
        other_samples = None  # no extra trajectory to compute per-frame importances for
    feature_extractors = [
        fe.RandomForestFeatureExtractor(
            classifier_kwargs={'n_estimators': 100},
            one_vs_rest=not multiclass,
            **kwargs),
        fe.KLFeatureExtractor(bin_width=0.1, **kwargs),
        fe.MlpFeatureExtractor(
            classifier_kwargs={
                'hidden_layer_sizes': (100, ),
                'max_iter': 100000,
                'alpha': 0.0001
            },
            activation="relu",
            one_vs_rest=not multiclass,
            per_frame_importance_samples=other_samples,
            # If None, the method uses predicted labels for LRP
            per_frame_importance_labels=None,
            per_frame_importance_outfile="{}/mlp_perframe_importance_{}/"
            "VSD_mlp_perframeimportance_{}_dt{}.txt".format(
                working_dir, "multiclass" if multiclass else "binaryclass",
                cluster_for_prediction, dt_for_prediction),
            **kwargs)
    ]

    common_peaks = {
        "R1-R4": [294, 297, 300, 303],
        "K5": [306],
        "R6": [309],
    }
    do_computations = True
    filetype = "svg"
    for extractor in feature_extractors:
        logger.info("Computing relevance for extractors %s", extractor.name)
        extractor.extract_features()
        p = extractor.postprocessing(working_dir=working_dir,
                                     pdb_file=working_dir + "alpha.pdb",
                                     filter_results=False)
        if do_computations:
            p.average()
            p.evaluate_performance()
            p.persist()
        else:
            p.load()

        visualization.visualize(
            [[p]],
            show_importance=True,
            show_performance=False,
            show_projected_data=False,
            highlighted_residues=common_peaks,
            outfile=working_dir +
            "{extractor}/importance_per_residue_{suffix}.{filetype}".format(
                suffix="", extractor=extractor.name, filetype=filetype))
        if do_computations:
            visualization.visualize(
                [[p]],
                show_importance=False,
                show_performance=True,
                show_projected_data=False,
                outfile=working_dir +
                "{extractor}/performance_{suffix}.{filetype}".format(
                    extractor=extractor.name, suffix="", filetype=filetype))

            visualization.visualize(
                [[p]],
                show_importance=False,
                show_performance=False,
                show_projected_data=True,
                outfile=working_dir +
                "{extractor}/projected_data_{suffix}.{filetype}".format(
                    extractor=extractor.name, suffix="", filetype=filetype))

    logger.info("Done")