Example 1
def test_change_protein_data_dir():
    with enter_temp_directory():
        create_fake_series()
        yaml_file = {}
        yaml_file["base_dir"] = "./fake_series"
        protein = "fake_kinase1"
        with enter_protein_data_dir(yaml_file, protein):
            current_folder_path, current_folder_name = os.path.split(os.getcwd())
            assert current_folder_name == "fake_kinase1"
    return
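Neither enter_temp_directory nor enter_protein_data_dir is shown on this page; from the way the test uses them, they behave like chdir-style context managers. A minimal sketch of that pattern, assuming the data for each protein sits directly under base_dir/<protein> (the real library helpers may differ), could look like this:

import os
import shutil
import tempfile
from contextlib import contextmanager

@contextmanager
def enter_protein_data_dir(yaml_file, protein):
    # Assumed layout: per-protein data lives directly under base_dir/<protein>;
    # the real helper in the library may differ.
    original = os.getcwd()
    os.chdir(os.path.join(yaml_file["base_dir"], protein))
    try:
        yield
    finally:
        os.chdir(original)

@contextmanager
def enter_temp_directory():
    # Run the body of the with-block inside a throwaway directory, then clean up.
    original = os.getcwd()
    tmp = tempfile.mkdtemp()
    os.chdir(tmp)
    try:
        yield
    finally:
        os.chdir(original)
        shutil.rmtree(tmp)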
Example 4
def test_series_slicer(yaml_file, folder_name="sliced_feature_dir"):
    yaml_file = load_yaml_file(yaml_file)

    df_dict = {}
    for protein in yaml_file["protein_list"]:
        with enter_protein_data_dir(yaml_file, protein):
            df_dict[protein] = verboseload(os.path.join(os.getcwd(),
                folder_name, "feature_descriptor.h5"))
    # Every protein's sliced feature descriptor should list the same residues.
    for protein in yaml_file["protein_list"]:
        for protein2 in yaml_file["protein_list"]:
            assert (df_dict[protein].resnames ==
                    df_dict[protein2].resnames).all()

    return
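Every example here starts from a project yaml file, and load_yaml_file simply turns it into a dictionary. Only a handful of keys appear in the snippets on this page; a minimal, hypothetical configuration covering just those keys might be written like this (all paths and values are placeholders, and the real schema likely contains more fields):

import yaml

# Hypothetical project configuration; only keys referenced on this page are shown.
project = {
    "base_dir": "/path/to/fake_series",         # root of the series
    "mdl_dir": "/path/to/fake_series/mdl_dir",  # used by enter_protein_mdl_dir
    "feature_dir": "feature_dir",               # per-protein folder of *.jl feature files
    "protein_dir": "protein_traj",              # per-protein folder of *.hdf5 trajectories (assumed name)
    "protein_list": ["kinase_1", "kinase_2"],
}

with open("project.yaml", "w") as fh:
    yaml.safe_dump(project, fh, default_flow_style=False)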
def pull_features(yaml_file, prt, skip=1, feature_indices=None):
    """
    Simple utility to pull certain features from the feature_dir of a protein.
    :param yaml_file: The project yaml file
    :param prt: Protein model to use
    :param skip: stride to apply within each file (defaults to 1)
    :param feature_indices: which feature indices to pull
    :return: dictionary keyed on file name with feature values as arrays
    """
    yaml_file = load_yaml_file(yaml_file)
    all_f = {}
    with enter_protein_data_dir(yaml_file, prt.name):
        feature_file_list = glob.glob("./%s/*.jl" % yaml_file["feature_dir"])
        for i in feature_file_list:
            # Stride each file by `skip` and keep only the requested columns.
            all_f[os.path.basename(i)] = load(i)[::skip, feature_indices]

    return all_f
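A hypothetical call could look like the following; ProteinStub stands in for whatever protein-model object the series code normally passes around, since pull_features only needs a .name attribute that matches a folder under base_dir, and project.yaml is a placeholder path:

from collections import namedtuple

# Stand-in for the real protein model object; only .name is used here.
ProteinStub = namedtuple("ProteinStub", "name")

feats = pull_features("project.yaml", ProteinStub(name="kinase_1"),
                      skip=5, feature_indices=[0, 2])
for fname, arr in feats.items():
    print(fname, arr.shape)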
Example 6
def subsample_protein(yaml_file, protein, stride=5, out_dir="sub_protein_traj"):
    yaml_file = load_yaml_file(yaml_file)

    # Use half of the available cores for the subsampling pool.
    p = Pool(int(cpu_count() / 2))

    with enter_protein_data_dir(yaml_file, protein):
        flist = [os.path.abspath(i) for i in
                 glob.glob("%s/*.hdf5" % yaml_file["protein_dir"])]

    base_dir = yaml_file["base_dir"]
    new_output_dir = os.path.join(base_dir, protein, out_dir)
    if not os.path.isdir(new_output_dir):
        os.mkdir(new_output_dir)
    fout = [os.path.join(new_output_dir, os.path.basename(i)) for i in flist]

    # One (input, output, stride) job per trajectory file.
    jobs = list(zip(flist, fout, itertools.repeat(stride)))
    p.map(subsample_traj, jobs)
    return
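The subsample_traj worker that the pool maps over is not shown on this page. Assuming the *.hdf5 files are MDTraj trajectories, a plausible worker matching the (input, output, stride) job tuples could be:

import mdtraj as md

def subsample_traj(job):
    # job is an (input path, output path, stride) tuple built in subsample_protein.
    in_file, out_file, stride = job
    trj = md.load(in_file)
    trj[::stride].save_hdf5(out_file)
    return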
def test_slicer():
    with enter_temp_directory():
        base_dir = os.path.abspath(os.path.curdir)
        mdl_dir = os.path.join(base_dir,"mdl_dir")
        feature_dir = "feature_dir"
        series_name = "fake_series"
        protein_list = ["kinase_1", "kinase_2"]
        project_dict = {"kinase_1": ["fake_proj1",],
                        "kinase_2": ["fake_proj2"]}
        mdl_params = {'tica__n_components': 1, 'tica__lag_time': 1,
                      'tica__kinetic_mapping': True, 'tica__shrinkage': 0.01,
                      'cluster__n_clusters': 2,
                      'msm__lag_time': 1, 'bootstrap__n_samples': 1}

        create_fake_data(base_dir, protein_list, project_dict)

        yaml_file = setup_series_analysis(base_dir, mdl_dir, feature_dir,
                                          series_name, protein_list,
                                          project_dict, mdl_params)

        dict_feat_ind = {}
        dict_feat_ind["kinase_1"] = [0, 2]
        dict_feat_ind["kinase_2"] = [1, 1, 0, 2]

        series_feature_slicer(yaml_file, dict_feat_ind)

        for protein in protein_list:
            with enter_protein_data_dir(yaml_file, protein):
                assert os.path.isdir("sliced_feature_dir")
                flist = glob.glob("./%s/*.jl" % feature_dir)
                for fname in flist:
                    # The sliced file should be the original features restricted
                    # to the requested column indices.
                    original_file = verboseload(fname)
                    expected_file = original_file[:, dict_feat_ind[protein]]
                    written_file = verboseload("./%s/%s" % ("sliced_feature_dir",
                                                            os.path.basename(fname)))
                    assert (expected_file == written_file).all()
    return
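The check above relies on NumPy fancy indexing along the column axis; an index list such as [1, 1, 0, 2] may repeat and reorder feature columns, which is exactly what the slicer is expected to reproduce. A quick illustration:

import numpy as np

X = np.arange(12).reshape(4, 3)      # 4 frames, 3 features
print(X[:, [1, 1, 0, 2]].shape)      # (4, 4): columns can be repeated and reordered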
Example 9
def _get_common_features(yaml_file, featurizer, aligned_dict, save_df=True):
    """
    Get the features that are common across proteins, using the common residues.
    Can optionally save the per-protein feature descriptor dataframe to the
    mdl_dir and the sliced feature_dir.
    :param yaml_file: The protein project yaml file (already loaded)
    :param featurizer: featurizer object used
    :param aligned_dict: Dictionary of alignments for each protein
    :param save_df: whether to dump the per-protein feature descriptor dataframe
    :return: dictionary of common feature indices and dictionary of dataframes,
        both keyed on protein name
    """
    result_dict = {}
    df_dict = {}
    for protein in yaml_file["protein_list"]:
        print(protein)
        # reset the featurizer
        featurizer = clone(featurizer)
        trj = load_random_traj(yaml_file, protein)
        df = pd.DataFrame(featurizer.describe_features(trj))
        prt_mapping, prt_seq = _map_residue_ind_seq_ind(yaml_file, protein,
                                                        aligned_dict[protein])
        feature_vec = []
        # for every feature
        for i in df.iterrows():
            # get the index and the feature itself
            feature_ind, feature_dict = i
            all_res_in_algn = []
            mapped_index_list = []
            for aa_ind in feature_dict["resids"]:
                aa_code = prt_seq[aa_ind]
                # make sure we have the same residue
                assert trj.top.residue(aa_ind).code == aa_code
                # get the mapping of that residue into the main alignment
                mapped_index = prt_mapping[aa_ind]
                # for every protein in the alignment, check that the same
                # residue appears at the same aligned position
                all_res_in_algn.append(np.alltrue([aligned_dict[prt][mapped_index] == aa_code
                                                   for prt in yaml_file["protein_list"]]))
                mapped_index_list.append(mapped_index)

            # To account for insertions and deletions, check that the spacing
            # between consecutive mapped alignment indices matches the spacing
            # between the original residue indices.
            mapped_index_difference = [x - mapped_index_list[i - 1]
                                       for i, x in enumerate(mapped_index_list) if i > 0]
            resid_index_difference = [x - feature_dict["resids"][i - 1]
                                      for i, x in enumerate(feature_dict["resids"]) if i > 0]
            if not np.all(mapped_index_difference == resid_index_difference):
                all_res_in_algn.append(False)

            # Keep the feature only if every residue it touches is conserved.
            if np.alltrue(all_res_in_algn):
                feature_vec.append(feature_ind)

        df_dict[protein] = df.iloc[feature_vec]
        result_dict[protein] = feature_vec

        if save_df:
            new_df = df.iloc[feature_vec]
            with enter_protein_mdl_dir(yaml_file, protein):
                verbosedump(new_df, "feature_descriptor.h5")
            with enter_protein_data_dir(yaml_file, protein):
                verbosedump(new_df, os.path.join("sliced_feature_dir",
                                                 "feature_descriptor.h5"))
    return result_dict, df_dict
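_get_common_features expects an already-loaded yaml dictionary, a featurizer that implements describe_features, and a per-protein alignment dictionary mapping each protein name to its aligned sequence. A hypothetical driver is sketched below; the alignment file names and their format are assumptions, and any MSMBuilder-style featurizer exposing describe_features should work:

from msmbuilder.featurizer import DihedralFeaturizer

yaml_file = load_yaml_file("project.yaml")             # placeholder path
featurizer = DihedralFeaturizer(types=["phi", "psi"])

# aligned_dict maps protein name -> aligned sequence string. How the alignment
# is produced (e.g. from a FASTA/Clustal file) is assumed, not shown on this page.
aligned_dict = {prt: open("%s.aln" % prt).read().strip()
                for prt in yaml_file["protein_list"]}

common_inds, descriptors = _get_common_features(yaml_file, featurizer,
                                                aligned_dict, save_df=False)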