def test_change_protein_data_dir():
    """Check that enter_protein_data_dir() lands in the protein's own folder."""
    # NOTE(review): this definition is shadowed by an identical
    # test_change_protein_data_dir defined immediately after it in this file;
    # only the later definition is collected by pytest. Consider deleting one.
    with enter_temp_directory():
        create_fake_series()
        yaml_file = {}
        yaml_file["base_dir"] = "./fake_series"
        protein = "fake_kinase1"
        with enter_protein_data_dir(yaml_file, protein):
            # split cwd so we can assert on the leaf directory name only
            current_folder_path, current_folder_name = os.path.split(os.getcwd())
            assert current_folder_name == "fake_kinase1"
    return
def test_change_protein_data_dir():
    """Verify enter_protein_data_dir() switches cwd into the protein folder."""
    with enter_temp_directory():
        create_fake_series()
        yaml_file = {"base_dir": "./fake_series"}
        protein = "fake_kinase1"
        with enter_protein_data_dir(yaml_file, protein):
            # only the leaf directory name matters for the assertion
            current_folder_path, current_folder_name = os.path.split(
                os.getcwd())
            assert current_folder_name == "fake_kinase1"
    return
def test_series_slicer(yaml_file, folder_name="sliced_feature_dir"):
    """
    Check that the sliced feature descriptors agree across every protein
    in the series.

    :param yaml_file: Path to (or dict form of) the series yaml file.
    :param folder_name: Sub-folder holding the sliced feature descriptor.
    """
    yaml_file = load_yaml_file(yaml_file)
    df_dict = {}
    for protein in yaml_file["protein_list"]:
        with enter_protein_data_dir(yaml_file, protein):
            df_dict[protein] = verboseload(
                os.path.join(os.getcwd(), folder_name,
                             "feature_descriptor.h5"))
    # Comparing each unordered pair once is equivalent to the original
    # all-pairs (including self and both orderings) quadratic comparison,
    # since equality of resnames is reflexive and symmetric.
    for protein, protein2 in itertools.combinations(
            yaml_file["protein_list"], 2):
        assert (df_dict[protein].resnames == df_dict[protein2].resnames).all()
    return
def pull_features(yaml_file, prt, skip=1, feature_indices=None):
    """
    Simple utility to pull certain features from the feature_folder object

    :param yaml_file: Series yaml file (path or dict; passed to load_yaml_file)
    :param prt: Protein model to use
    :param skip: skip for each file (defaults to 1)
    :param feature_indices: which indices to pull; None (default) keeps
        all feature columns
    :return: dictionary keyed on file name with feature values as arrays
    """
    # NOTE(review): `skip` is accepted but never applied to the loaded
    # arrays; behavior kept as-is — confirm intent with callers.
    yaml_file = load_yaml_file(yaml_file)
    all_f = {}
    with enter_protein_data_dir(yaml_file, prt.name):
        feature_file_list = glob.glob("./%s/*.jl" % yaml_file["feature_dir"])
        for fname in feature_file_list:
            data = load(fname)
            # BUG FIX: the old code always did data[:, feature_indices];
            # with the default feature_indices=None that is data[:, None],
            # which silently inserts a new axis instead of selecting columns.
            if feature_indices is None:
                all_f[os.path.basename(fname)] = data
            else:
                all_f[os.path.basename(fname)] = data[:, feature_indices]
    return all_f
def subsample_protein(yaml_file, protein, stride=5, out_dir="sub_protein_traj"):
    """
    Subsample every .hdf5 trajectory for one protein in parallel.

    :param yaml_file: Series yaml file (path or dict; passed to load_yaml_file)
    :param protein: Protein name within the series
    :param stride: Keep every `stride`-th frame (defaults to 5)
    :param out_dir: Name of the output folder created under
        base_dir/protein (defaults to "sub_protein_traj")
    """
    yaml_file = load_yaml_file(yaml_file)
    with enter_protein_data_dir(yaml_file, protein):
        flist = [os.path.abspath(i)
                 for i in glob.glob("%s/*.hdf5" % yaml_file["protein_dir"])]
    base_dir = yaml_file["base_dir"]
    new_output_dir = os.path.join(base_dir, protein, out_dir)
    if not os.path.isdir(new_output_dir):
        os.mkdir(new_output_dir)
    fout = [os.path.join(new_output_dir, os.path.basename(i)) for i in flist]
    jobs = list(zip(flist, fout, itertools.repeat(stride)))
    # BUG FIX: use at least one worker — int(cpu_count()/2) is 0 on a
    # single-core machine and Pool(0) raises ValueError. The context
    # manager also terminates the pool instead of leaking its processes.
    with Pool(max(1, cpu_count() // 2)) as p:
        p.map(subsample_traj, jobs)
    return
def test_slicer():
    """Integration check for series_feature_slicer on a fake two-kinase series."""
    with enter_temp_directory():
        base_dir = os.path.abspath(os.path.curdir)
        mdl_dir = os.path.join(base_dir, "mdl_dir")
        feature_dir = "feature_dir"
        series_name = "fake_series"
        protein_list = ["kinase_1", "kinase_2"]
        project_dict = {"kinase_1": ["fake_proj1"],
                        "kinase_2": ["fake_proj2"]}
        mdl_params = {'tica__n_components': 1,
                      'tica__lag_time': 1,
                      'tica__kinetic_mapping': True,
                      'tica__shrinkage': 0.01,
                      'cluster__n_clusters': 2,
                      'msm__lag_time': 1,
                      'bootstrap__n_samples': 1}
        create_fake_data(base_dir, protein_list, project_dict)
        yaml_file = setup_series_analysis(base_dir, mdl_dir, feature_dir,
                                          series_name, protein_list,
                                          project_dict, mdl_params)
        # request a different column subset per protein
        dict_feat_ind = {"kinase_1": [0, 2],
                         "kinase_2": [1, 1, 0, 2]}
        series_feature_slicer(yaml_file, dict_feat_ind)
        # every protein should now have a sliced copy of each feature file
        for protein in protein_list:
            with enter_protein_data_dir(yaml_file, protein):
                assert os.path.isdir("sliced_feature_dir")
                for fname in glob.glob("./%s/*.jl" % feature_dir):
                    original_file = verboseload(fname)
                    written_file = verboseload(
                        "./%s/%s" % ("sliced_feature_dir",
                                     os.path.basename(fname)))
                    expected_file = original_file[:, dict_feat_ind[protein]]
                    assert (expected_file == written_file).all()
    return
def _get_common_features(yaml_file, featurizer, aligned_dict, save_df=True):
    """
    Get the features common across proteins, using the residues that match
    in the sequence alignment. Can optionally save the per-protein pandas
    feature descriptor to disk.

    :param yaml_file: The protein series yaml file (already-loaded dict)
    :param featurizer: featurizer object used; cloned fresh for each protein
    :param aligned_dict: Dictionary of alignments for each protein
    :param save_df: If True, dump the sliced feature descriptor to the
        protein's mdl dir and to sliced_feature_dir in its data dir
    :return: (result_dict, df_dict) — the kept feature indices per protein,
        and the corresponding sliced feature-descriptor DataFrame per protein
    """
    result_dict = {}
    df_dict = {}
    for protein in yaml_file["protein_list"]:
        print(protein)
        # reset the featurizer
        featurizer = clone(featurizer)
        # featurize one randomly chosen trajectory just to enumerate features
        trj = load_random_traj(yaml_file, protein)
        df = pd.DataFrame(featurizer.describe_features(trj))
        # map this protein's residue indices onto the master alignment
        prt_mapping, prt_seq = _map_residue_ind_seq_ind(yaml_file, protein,
                                                        aligned_dict[protein])
        feature_vec = []
        # for every feature
        for i in df.iterrows():
            # get the index and the feature itself
            feature_ind, feature_dict = i
            all_res_in_algn = []
            mapped_index_list = []
            for aa_ind in feature_dict["resids"]:
                aa_code = prt_seq[aa_ind]
                # make sure we have the same residue
                assert(trj.top.residue(aa_ind).code == aa_code)
                # get the mapping for that aa to the main alignment
                mapped_index = prt_mapping[aa_ind]
                # for every protein in the alignment, check if we have the
                # same residue at the same position
                all_res_in_algn.append(
                    np.alltrue([aligned_dict[prt][mapped_index] == aa_code
                                for prt in yaml_file["protein_list"]]))
                mapped_index_list.append(mapped_index)
            # to account for additions and deletions, we check if the
            # difference between the mapping and the actual residue indices
            # is the same (i.e. consecutive gaps would break the feature).
            mapped_index_difference = [x - mapped_index_list[i - 1]
                                       for i, x in enumerate(mapped_index_list)
                                       if i > 0]
            resid_index_difference = [x - feature_dict["resids"][i - 1]
                                      for i, x in enumerate(feature_dict["resids"])
                                      if i > 0]
            if not np.all(mapped_index_difference == resid_index_difference):
                all_res_in_algn.append(False)
            # keep the feature only if every residue check passed
            if np.alltrue(all_res_in_algn):
                feature_vec.append(feature_ind)
        df_dict[protein] = df.iloc[feature_vec]
        result_dict[protein] = feature_vec
        if save_df:
            new_df = df.iloc[feature_vec]
            with enter_protein_mdl_dir(yaml_file, protein):
                verbosedump(new_df, os.path.join("feature_descriptor.h5"))
            with enter_protein_data_dir(yaml_file, protein):
                verbosedump(new_df, os.path.join("sliced_feature_dir",
                                                 "feature_descriptor.h5"))
    return result_dict, df_dict