def featurize_project_wrapper(yaml_file, protein, feat=None, stride=1, view=None, protein_only=True):
    """
    Wrapper function for featurizing every trajectory file of one protein.

    :param yaml_file: The yaml file to work with
    :param protein: Protein Name
    :param feat: Featurization obj. If none, featurize_file falls back to
        phi, psi and chi1 dihedrals. Should support a describe_features
        attribute
    :param stride: forwarded to featurize_file inside each job tuple
    :param view: ipython view or pool view to parallelize over. If None,
        the files are featurized serially in the calling process.
    :param protein_only: if True the files are read from
        yaml_file["protein_dir"], otherwise from the "trajectories" folder
    :return: whatever view.map returns (a plain list when run serially)
    """
    yaml_file = load_yaml_file(yaml_file)
    base_dir = yaml_file["base_dir"]
    _check_output_folder_exists(yaml_file, protein)

    # get the paths
    if protein_only:
        traj_folder = os.path.join(base_dir, protein, yaml_file["protein_dir"])
    else:
        traj_folder = os.path.join(base_dir, protein, "trajectories")
    traj_files = sorted(glob.glob(os.path.join(traj_folder, "*.hdf5")), key=keynat)
    print("Found %d files for featurization in %s" % (len(traj_files), traj_folder))

    jobs = [(yaml_file, protein, feat, traj_file, stride) for traj_file in traj_files]
    if view is None:
        # The signature advertises view=None, but calling None.map() used to
        # raise AttributeError; run the jobs one after the other instead.
        return list(map(featurize_file, jobs))
    return view.map(featurize_file, jobs)
def sample_tic_region(yaml_file, protein_name, tic_region, n_frames=50, fname=None,save_trj=True):
    """
    Helper that pulls frames lying around a chosen point in tic space.

    :param yaml_file: The projects yaml file
    :param protein_name: The name of the protein
    :param tic_region(dict): The tic_region. Can be multidimensional with
        1 number per tic coordinate(defaults to 0 for all non-mentioned regions)
    :param n_frames: The number of frames around the coordinate
    :param fname: output file name handed to _frame_loader; defaults to
        "sampled_tic_region.xtc"
    :param save_trj: forwarded unchanged to _frame_loader
    :return: whatever _frame_loader returns for the sampled frames
    """
    yaml_file = load_yaml_file(yaml_file)
    series = ProteinSeries(yaml_file)
    protein = Protein(series, protein_name)

    # Freeze the key order so the data list and the keys stay aligned.
    traj_keys = list(protein.tica_data.keys())
    tica_values = [protein.tica_data[key] for key in traj_keys]
    frame_indices = sample_region(tica_values, tic_region, n_frames)

    out_name = "sampled_tic_region.xtc" if fname is None else fname
    return _frame_loader(yaml_file, protein, traj_keys, frame_indices, save_trj, out_name)
def sample_for_all_proteins(yaml_file, protein=None, tics=None, n_frames=100, scheme="linear"):
    """
    Sample frames along tics for one, several or all proteins.

    :param yaml_file: The project yaml file.
    :param protein: The name of the protein. If none, then it is done for
        all the protein names in the yaml_file. If it is a list, it is
        iteratively done for each of the protein else its only called once.
    :param tics: list of tics to sample from. If None, then it is done for
        all the tics specified in the yaml file
    :param n_frames number of frames wanted for each tic
    :param scheme:One of 3 sampling schemes
        linear:Samples the tic linearly
        random:Samples the tic randomly
        edge: Samples the tic edges only
    :return: None
    """
    yaml_file = load_yaml_file(yaml_file)

    if protein is None:
        # The key is the string "protein_list"; the bare name used before
        # raised NameError as soon as this branch was taken.
        protein = yaml_file["protein_list"]
    elif isinstance(protein, str):
        # A single name must be sampled once, not once per character.
        protein = [protein]

    if tics is None:
        tics = range(yaml_file["params"]["tica__n_components"])

    for protein_name in protein:
        for tic_index in tics:
            sample_one_tic(yaml_file, protein_name, tic_index, n_frames, scheme)
    return
def sample_tic_region(yaml_file, protein_name, tic_region, n_frames=50, fname=None, save_trj=True):
    """
    Helper that pulls frames lying around a chosen point in tic space.

    :param yaml_file: The projects yaml file
    :param protein_name: The name of the protein
    :param tic_region(dict): The tic_region. Can be multidimensional with
        1 number per tic coordinate(defaults to 0 for all non-mentioned regions)
    :param n_frames: The number of frames around the coordinate
    :param fname: output file name handed to _frame_loader; defaults to
        "sampled_tic_region.xtc"
    :param save_trj: forwarded unchanged to _frame_loader
    :return: whatever _frame_loader returns for the sampled frames
    """
    yaml_file = load_yaml_file(yaml_file)
    series = ProteinSeries(yaml_file)
    protein = Protein(series, protein_name)

    # Freeze the key order so the data list and the keys stay aligned.
    traj_keys = list(protein.tica_data.keys())
    tica_values = [protein.tica_data[key] for key in traj_keys]
    frame_indices = sample_region(tica_values, tic_region, n_frames)

    out_name = "sampled_tic_region.xtc" if fname is None else fname
    return _frame_loader(yaml_file, protein, traj_keys, frame_indices, save_trj, out_name)
def sample_for_all_proteins(yaml_file, protein=None, tics=None, n_frames=100, scheme="linear"):
    """
    Sample frames along tics for one, several or all proteins.

    :param yaml_file: The project yaml file.
    :param protein: The name of the protein. If none, then it is done for
        all the protein names in the yaml_file. If it is a list, it is
        iteratively done for each of the protein else its only called once.
    :param tics: list of tics to sample from. If None, then it is done for
        all the tics specified in the yaml file
    :param n_frames number of frames wanted for each tic
    :param scheme:One of 3 sampling schemes
        linear:Samples the tic linearly
        random:Samples the tic randomly
        edge: Samples the tic edges only
    :return: None
    """
    yaml_file = load_yaml_file(yaml_file)

    if protein is None:
        # The key is the string "protein_list"; the bare name used before
        # raised NameError as soon as this branch was taken.
        protein = yaml_file["protein_list"]
    elif isinstance(protein, str):
        # A single name must be sampled once, not once per character.
        protein = [protein]

    if tics is None:
        tics = range(yaml_file["params"]["tica__n_components"])

    for protein_name in protein:
        for tic_index in tics:
            sample_one_tic(yaml_file, protein_name, tic_index, n_frames, scheme)
    return
def featurize_file(job_tuple):
    """
    Featurize a single trajectory file and dump the result next to the
    other features of its protein.

    :param job_tuple: (yaml_file, protein, feat, traj_file, stride).
        If feat is None a DihedralFeaturizer over phi, psi and chi1 is used.
        stride is handed to the trajectory loader.
    :return: None. Writes <traj_name>.jl into the protein's feature_dir and,
        if it does not exist yet and the featurizer supports
        describe_features, a feature_descriptor.h5 file.
    """
    yaml_file, protein, feat, traj_file, stride = job_tuple
    yaml_file = load_yaml_file(yaml_file)

    if feat is None:
        feat = DihedralFeaturizer(types=['phi', 'psi', 'chi1'])

    _check_output_folder_exists(yaml_file, protein)

    output_folder = os.path.join(yaml_file["base_dir"], protein,
                                 yaml_file["feature_dir"])
    traj_name = os.path.splitext(os.path.basename(traj_file))[0]
    output_fname = os.path.join(output_folder, traj_name + ".jl")
    feat_descriptor = os.path.join(output_folder, "feature_descriptor.h5")

    try:
        # stride travels in the job tuple but was never used before.
        trj = mdt.load(traj_file, stride=stride)
    except Exception:
        # Deliberately destructive: unreadable trajectories are removed.
        # Catching Exception instead of a bare except keeps Ctrl-C or
        # SystemExit from deleting a perfectly good file.
        warnings.warn("Removing %s because of misformed trajectory" % traj_file)
        os.remove(traj_file)
        return

    features = feat.partial_transform(trj)
    verbosedump(features, output_fname)

    if not os.path.isfile(feat_descriptor) and hasattr(feat, "describe_features"):
        dih_df = pd.DataFrame(feat.describe_features(trj[0]))
        verbosedump(dih_df, feat_descriptor)
    return
def test_dihedral_feat():
    """
    Featurize both test kinases through the project wrapper and check that
    three randomly chosen dumped feature files match a direct featurization.
    """
    print(base_dir)
    workers = Pool(6)
    project_yaml = os.path.join(base_dir, "mdl_dir", "project.yaml")
    yaml_file = load_yaml_file(project_yaml)

    for prt in ["kinase_1", "kinase_2"]:
        print(prt)
        # Value is unused, but the lookup is kept so a missing entry
        # still fails here.
        prj = yaml_file["project_dict"][prt][0]
        featurize_project_wrapper(yaml_file, prt, feat=None, stride=1, view=workers)

        reference_feat = DihedralFeaturizer(types=['phi', 'psi', 'chi1'])
        traj_glob = os.path.join(base_dir, prt, yaml_file["protein_dir"], "*.hdf5")
        candidates = glob.glob(traj_glob)
        for traj_path in np.random.choice(candidates, 3):
            trj = mdt.load(traj_path)
            direct = reference_feat.partial_transform(trj)
            stem = os.path.splitext(os.path.basename(traj_path))[0]
            dumped_path = os.path.join(base_dir, prt, yaml_file["feature_dir"],
                                       stem + ".jl")
            dumped = verboseload(dumped_path)
            assert np.allclose(direct, dumped)
    return True
def test_subsampler():
    """
    Subsample the test series (default stride of 5) and check that each
    source trajectory has 5x the frames of its subsampled counterpart.
    """
    print(base_dir)
    project_yaml = os.path.join(base_dir, "mdl_dir", "project.yaml")
    yaml_file = load_yaml_file(project_yaml)
    out_dir = "sub_protein_traj"
    subsample_series(yaml_file, out_dir=out_dir, overwrite=False)
    assert os.path.isdir(os.path.join(base_dir, "kinase_1", out_dir))

    for kinase in ["kinase_1", "kinase_2"]:
        source_glob = os.path.join(base_dir, kinase, "protein_traj", "*.hdf5")
        for source_path in glob.glob(source_glob):
            full_trj = mdt.load(source_path)
            sub_path = os.path.join(base_dir, kinase, out_dir,
                                    os.path.basename(source_path))
            sub_trj = mdt.load(sub_path)
            assert full_trj.n_frames == sub_trj.n_frames * 5
def _check_output_folder_exists(yaml_file, protein, folder_name=None):
    """
    Make sure <base_dir>/<protein>/<folder_name> exists, creating it if
    necessary.

    :param yaml_file: The yaml file to work with
    :param protein: Protein name (sub folder of base_dir)
    :param folder_name: folder to check; defaults to yaml_file["feature_dir"]
    :return: None
    """
    yaml_file = load_yaml_file(yaml_file)
    if folder_name is None:
        folder_name = yaml_file["feature_dir"]
    output_folder = os.path.join(yaml_file["base_dir"], protein, folder_name)

    if not os.path.isdir(output_folder):
        try:
            os.mkdir(output_folder)
        except OSError:
            # Another worker may have created the folder between the check
            # and the mkdir; only re-raise if it is really not there.
            if not os.path.isdir(output_folder):
                raise
    return
def test_get_common_features_2():
    """
    With phi/psi/chi1 dihedrals, every protein must end up with the same
    number of common features.
    """
    project_yaml = os.path.join(base_dir, "mdl_dir", "project.yaml")
    yaml_file = load_yaml_file(project_yaml)

    aligned_dict = {}
    for protein in yaml_file["protein_list"]:
        trj = load_random_traj(yaml_file, protein)
        aligned_dict[protein] = trj.top.to_fasta(chain=0)

    featurizer = DihedralFeaturizer(types=['phi', 'psi', 'chi1'])
    common_feature_dic, _ = _get_common_features(yaml_file, featurizer,
                                                 aligned_dict, False)

    feature_counts = set()
    for protein in yaml_file["protein_list"]:
        feature_counts.add(len(common_feature_dic[protein]))
    assert len(feature_counts) == 1
    return
def test_present_for_all_same_seq():
    """
    Using each protein's own fasta sequence as its "aligned" sequence,
    _present_for_all must return as many entries as the protein sequence has.
    """
    project_yaml = os.path.join(base_dir, "mdl_dir", "project.yaml")
    yaml_file = load_yaml_file(project_yaml)

    aligned_dict = {}
    for protein in yaml_file["protein_list"]:
        trj = load_random_traj(yaml_file, protein)
        aligned_dict[protein] = trj.top.to_fasta(chain=0)

    for protein in yaml_file["protein_list"]:
        prt_mapping, prt_seq = _map_residue_ind_seq_ind(
            yaml_file, protein, aligned_dict[protein])
        present = _present_for_all(protein, prt_mapping, prt_seq, aligned_dict)
        assert len(present) == len(prt_seq)
    return
def subsample_series(yaml_file,stride=5,out_dir="sub_protein_traj",overwrite=True):
    """
    Subsample the trajectories of every protein in the series.

    :param yaml_file: The yaml file to work with
    :param stride: stride handed to subsample_protein
    :param out_dir: name of the per-protein output folder
    :param overwrite: if True, project.yaml in mdl_dir is rewritten with
        protein_dir pointing at out_dir
    :return: None
    """
    yaml_file = load_yaml_file(yaml_file)

    for protein_name in yaml_file["protein_list"]:
        subsample_protein(yaml_file, protein_name, stride, out_dir)

    # From here on the subsampled folder is the protein folder.
    yaml_file["protein_dir"] = out_dir

    if not overwrite:
        return

    # write the new yaml file
    yaml_path = os.path.join(yaml_file["mdl_dir"], 'project.yaml')
    with open(yaml_path, 'w') as yaml_out:
        yaml_out.write(yaml.dump(yaml_file))
    return
def validate_series(yaml_file, sequence_dictionary):
    """
    Run _validate_protein for every protein of the series in parallel.

    :param yaml_file: The mdl yaml file.
    :param sequence_dictionary: Dictionary of sequences
    :return: None. Runs a large number of sequence tests on the series to
        make sure the sequences for each protein match the given sequence
        and the series itself
    """
    yaml_file = load_yaml_file(yaml_file)
    jobs = [(yaml_file, protein, sequence_dictionary)
            for protein in yaml_file["protein_list"]]

    p = Pool(cpu_count())
    try:
        p.map(_validate_protein, jobs)
    finally:
        # Release the worker processes even if validation fails.
        p.close()
        p.join()
    return
def test_get_common_residues():
    """
    With every protein aligned to its own fasta sequence, the common
    residue list of a protein must be as long as that protein's sequence.
    """
    yaml_file = load_yaml_file(os.path.join(base_dir, "mdl_dir", "project.yaml"))

    aligned_dict = {}
    n_residues = {}
    for protein in yaml_file["protein_list"]:
        t = load_random_traj(yaml_file, protein)
        aligned_dict[protein] = t.top.to_fasta(chain=0)
        n_residues[protein] = t.n_residues

    res_dic, prt_seq = _get_common_residues(yaml_file, aligned_dict)

    for protein in yaml_file["protein_list"]:
        # Compare against this protein's own data. Previously the trajectory
        # left over from the last iteration of the loop above was used for
        # every protein.
        print(len(res_dic[protein]), n_residues[protein])
        assert len(res_dic[protein]) == len(aligned_dict[protein])
    return
def test_get_common_features():
    """
    With every protein aligned to its own fasta sequence, the number of
    common features must equal the width of the featurizer output.
    """
    project_yaml = os.path.join(base_dir, "mdl_dir", "project.yaml")
    yaml_file = load_yaml_file(project_yaml)

    aligned_dict = {}
    for protein in yaml_file["protein_list"]:
        trj = load_random_traj(yaml_file, protein)
        aligned_dict[protein] = trj.top.to_fasta(chain=0)

    featurizer = DihedralFeaturizer()
    common_feature_dic, _ = _get_common_features(yaml_file, featurizer,
                                                 aligned_dict, False)

    for protein in yaml_file["protein_list"]:
        trj = load_random_traj(yaml_file, protein)
        n_common = len(common_feature_dic[protein])
        n_features = featurizer.transform(trj)[0].shape[1]
        assert n_common == n_features
    return
def sample_one_tic(yaml_file,protein_name,tic_index,n_frames, scheme="linear"):
    """
    Sample frames along one tic of one protein.

    :param yaml_file: The project's yaml file
    :param protein_name: The name of protein
    :param tic_index: Tic index to sample along
    :param n_frames: The number of frames wanted
    :param scheme: sampling scheme, forwarded to pull_frames
    :return: the result of pull_frames, which dumps a tic%d.xtc and
        tic%d.log for a given protein inside its model.
    """
    yaml_file = load_yaml_file(yaml_file)
    protein = Protein(ProteinSeries(yaml_file), protein_name)
    return pull_frames(yaml_file, protein, tic_index, n_frames, scheme)
def test_subsampler():
    """
    Subsample the test series (default stride of 5) and check that each
    source trajectory has 5x the frames of its subsampled counterpart.
    """
    print(base_dir)
    project_yaml = os.path.join(base_dir, "mdl_dir", "project.yaml")
    yaml_file = load_yaml_file(project_yaml)
    out_dir = "sub_protein_traj"
    subsample_series(yaml_file, out_dir=out_dir, overwrite=False)
    assert os.path.isdir(os.path.join(base_dir, "kinase_1", out_dir))

    for kinase in ["kinase_1", "kinase_2"]:
        source_glob = os.path.join(base_dir, kinase, "protein_traj", "*.hdf5")
        for source_path in glob.glob(source_glob):
            full_trj = mdt.load(source_path)
            sub_path = os.path.join(base_dir, kinase, out_dir,
                                    os.path.basename(source_path))
            sub_trj = mdt.load(sub_path)
            assert full_trj.n_frames == sub_trj.n_frames * 5
def test_series_slicer(yaml_file, folder_name="sliced_feature_dir"):
    """
    Check that the sliced feature descriptors of all proteins list the same
    residue names, comparing every protein against every other one.

    :param yaml_file: The project yaml file
    :param folder_name: folder holding feature_descriptor.h5 inside each
        protein's data dir
    """
    yaml_file = load_yaml_file(yaml_file)
    proteins = yaml_file["protein_list"]

    descriptors = {}
    for protein in proteins:
        with enter_protein_data_dir(yaml_file, protein):
            descriptor_path = os.path.join(os.getcwd(), folder_name,
                                           "feature_descriptor.h5")
            descriptors[protein] = verboseload(descriptor_path)

    for first in proteins:
        for second in proteins:
            same = descriptors[first].resnames == descriptors[second].resnames
            assert same.all()
    return
def test_map_residue_seq_with_insert():
    """
    Prefix each sequence with a three character gap ("---") and check that
    every protein residue maps to its index shifted by 3.
    """
    project_yaml = os.path.join(base_dir, "mdl_dir", "project.yaml")
    yaml_file = load_yaml_file(project_yaml)

    aligned_dict = {}
    for protein in yaml_file["protein_list"]:
        trj = load_random_traj(yaml_file, protein)
        shifted = [res.index + 3 for res in trj.top.residues if res.is_protein]
        aligned_dict[protein] = "---" + trj.top.to_fasta(chain=0)

        mapping, _ = _map_residue_ind_seq_ind(yaml_file, protein,
                                              aligned_dict[protein])
        assert shifted == list(mapping.values())
    return
def test_normalize_features():
    """
    Normalize the test series and check that the dumped normalized features
    have mean close to 0 and standard deviation close to 1.
    """
    print(base_dir)
    yaml_file = load_yaml_file(os.path.join(base_dir, "mdl_dir", "project.yaml"))
    normalize_project_series(yaml_file, stride=1)

    all_data = []
    for kinase in ["kinase_1", "kinase_2"]:
        flist = glob.glob(os.path.join(base_dir, "%s" % kinase,
                                       "normalized_features/*.jl"))
        for i in flist:
            all_data.extend(verboseload(i))

    # np.alltrue was removed in NumPy 2.0; np.all is the supported spelling.
    # NOTE(review): axis=1 reduces over the second axis of the stacked rows
    # (presumably the features of one frame) while scalers normalize each
    # feature column - confirm that axis=0 was not intended.
    assert np.all(np.isclose(np.mean(all_data, axis=1), 0, atol=0.2))
    assert np.all(np.isclose(np.std(all_data, axis=1), 1, atol=0.3))
def pull_features(yaml_file, prt, skip=1, feature_indices=None):
    """
    Simple utility to pull certain features from the feature_folder object

    :param yaml_file: The project yaml file
    :param prt: Protein model to use (only prt.name is read here)
    :param skip: keep every skip-th row of each file(defaults to 1)
    :param feature_indices: which indices to pull. None pulls every feature.
    :return: dictionary keyed on file name with feature values as arrays
    """
    yaml_file = load_yaml_file(yaml_file)

    if feature_indices is None:
        # Indexing with None used to insert a new axis ([:, None]) instead
        # of selecting all columns.
        feature_indices = slice(None)

    all_f = {}
    with enter_protein_data_dir(yaml_file, prt.name):
        feature_file_list = glob.glob("./%s/*.jl" % yaml_file["feature_dir"])
        for i in feature_file_list:
            # skip was documented but never applied before.
            all_f[os.path.basename(i)] = load(i)[::skip, feature_indices]
    return all_f
def normalize_project_series(yaml_file, output_folder="normalized_features", stride=40,nrm=None):
    """
    routine to take a set of proteins features stored in the feature_dir and
    normalize them by removing the mean and setting variance to 1 using the
    standard scaler. The normalizer is dumped into the mdl dir.

    :param yaml_file: The yaml file to work with.
    :param output_folder: The name of the output folder to dump normalized
        features in
    :param stride: The initial stride in files to fit the normalizer with.
        This is necessary to prevent memory errors. defaults to every 40th file
    :param nrm: normalizer object to use, else it uses the standard scaler
        from scikitlearn. Note that it is always (re)fit on the strided files.
    :return: None
    """
    yaml_file = load_yaml_file(yaml_file)

    # setup normalizer
    if nrm is None:
        nrm = preprocessing.StandardScaler()

    # Collect the rows to fit on. They go straight into one list: the old
    # dict keyed on the relative path ("./<feature_dir>/<file>") let files
    # with the same name in different proteins overwrite each other.
    seq = []
    for prt in yaml_file["protein_list"]:
        with enter_protein_data_dir(yaml_file, prt):
            print(prt)
            flist = glob.glob("./%s/*.jl" % (yaml_file["feature_dir"]))[::stride]
            for f in flist:
                seq.extend(verboseload(f))

    # fit it
    nrm.fit(seq)
    # dump it into the mdl dir.
    verbosedump(nrm, "%s/nrm.h5" % yaml_file["mdl_dir"])

    # Transform every feature file (not only the strided subset).
    for prt in yaml_file["protein_list"]:
        _check_output_folder_exists(yaml_file, prt, output_folder)
        with enter_protein_data_dir(yaml_file, prt):
            output_folder_path = os.path.abspath(output_folder)
            flist = glob.glob("./%s/*.jl" % (yaml_file["feature_dir"]))
            for f in flist:
                res = verboseload(f)
                res = nrm.transform(res)
                verbosedump(res, "%s/%s" % (output_folder_path, os.path.basename(f)))
    return
def normalize_project_series(yaml_file, output_folder="normalized_features", stride=1,nrm=None):
    """
    routine to take a set of proteins features stored in the feature_dir and
    normalize them with a scaler fit across all proteins (the robust scaler
    by default). The normalizer is dumped into the mdl dir.

    :param yaml_file: The yaml file to work with.
    :param output_folder: The name of the output folder to dump normalized
        features in
    :param stride: The initial stride in files to fit the normalizer with.
        Larger values help to prevent memory errors. defaults to 1, i.e.
        every file
    :param nrm: normalizer object to use, else it uses the robust scaler
        from scikitlearn. Note that it is always (re)fit on the strided files.
    :return: None
    """
    yaml_file = load_yaml_file(yaml_file)

    # setup normalizer
    if nrm is None:
        nrm = preprocessing.RobustScaler()

    # Collect the rows to fit on. They go straight into one list: the old
    # dict keyed on the relative path ("./<feature_dir>/<file>") let files
    # with the same name in different proteins overwrite each other.
    seq = []
    for prt in yaml_file["protein_list"]:
        with enter_protein_data_dir(yaml_file, prt):
            print(prt)
            flist = glob.glob("./%s/*.jl" % (yaml_file["feature_dir"]))[::stride]
            for f in flist:
                seq.extend(verboseload(f))

    # fit it
    nrm.fit(seq)
    # dump it into the mdl dir.
    verbosedump(nrm, "%s/nrm.h5" % yaml_file["mdl_dir"])

    # Transform every feature file (not only the strided subset).
    for prt in yaml_file["protein_list"]:
        _check_output_folder_exists(yaml_file, prt, output_folder)
        with enter_protein_data_dir(yaml_file, prt):
            output_folder_path = os.path.abspath(output_folder)
            flist = glob.glob("./%s/*.jl" % (yaml_file["feature_dir"]))
            for f in flist:
                res = verboseload(f)
                res = nrm.transform(res)
                verbosedump(res, "%s/%s" % (output_folder_path, os.path.basename(f)))
    return
def test_map_residue_seq_with_insert_at_end():
    """
    Append a three character gap ("---") to each sequence and check that
    the residue -> sequence index mapping is unaffected by it.
    """
    project_yaml = os.path.join(base_dir, "mdl_dir", "project.yaml")
    yaml_file = load_yaml_file(project_yaml)

    aligned_dict = {}
    for protein in yaml_file["protein_list"]:
        trj = load_random_traj(yaml_file, protein)
        # The gap sits after the last residue, so residues that have a
        # one letter code keep their own index.
        unshifted = []
        for res_ind in range(trj.n_residues):
            if trj.top.residue(res_ind).code is not None:
                unshifted.append(res_ind)
        aligned_dict[protein] = trj.top.to_fasta(chain=0) + "---"

        mapping, _ = _map_residue_ind_seq_ind(yaml_file, protein,
                                              aligned_dict[protein])
        assert unshifted == list(mapping.values())
    return
def sample_one_tic(yaml_file, protein_name, tic_index, n_frames, scheme="linear"):
    """
    Sample frames along one tic of one protein.

    :param yaml_file: The project's yaml file
    :param protein_name: The name of protein
    :param tic_index: Tic index to sample along
    :param n_frames: The number of frames wanted
    :param scheme: sampling scheme, forwarded to pull_frames
    :return: the result of pull_frames, which dumps a tic%d.xtc and
        tic%d.log for a given protein inside its model.
    """
    yaml_file = load_yaml_file(yaml_file)
    protein = Protein(ProteinSeries(yaml_file), protein_name)
    return pull_frames(yaml_file, protein, tic_index, n_frames, scheme)
def test_normalize_features():
    """
    Normalize the test series and check that the dumped normalized features
    have mean close to 0 and standard deviation close to 1.
    """
    print(base_dir)
    yaml_file = load_yaml_file(
        os.path.join(base_dir, "mdl_dir", "project.yaml"))
    normalize_project_series(yaml_file, stride=1)

    all_data = []
    for kinase in ["kinase_1", "kinase_2"]:
        flist = glob.glob(
            os.path.join(base_dir, "%s" % kinase, "normalized_features/*.jl"))
        for i in flist:
            all_data.extend(verboseload(i))

    # np.alltrue was removed in NumPy 2.0; np.all is the supported spelling.
    # NOTE(review): axis=1 reduces over the second axis of the stacked rows
    # (presumably the features of one frame) while scalers normalize each
    # feature column - confirm that axis=0 was not intended.
    assert np.all(np.isclose(np.mean(all_data, axis=1), 0, atol=0.2))
    assert np.all(np.isclose(np.std(all_data, axis=1), 1, atol=0.3))
def featurize_series(yaml_file, ip_view, protein_list = None):
    """
    Featurize several proteins of the series, one after the other, with the
    default featurizer and a stride of 1.

    :param yaml_file: The yaml file to work with
    :param ip_view: ipython view(required)
    :param protein_list: list of proteins, if None then all the proteins in
        yaml_file["protein_list"] are processed
    :return: None; featurize_project_wrapper is called once per protein
    """
    yaml_file = load_yaml_file(yaml_file)
    wanted = yaml_file["protein_list"] if protein_list is None else protein_list

    for protein_name in wanted:
        featurize_project_wrapper(yaml_file, protein_name, feat=None,
                                  stride=1, view=ip_view)
    return
def subsample_protein(yaml_file, protein, stride=5,out_dir="sub_protein_traj"):
    """
    Subsample every hdf5 trajectory of one protein in parallel.

    :param yaml_file: The yaml file to work with
    :param protein: Protein name
    :param stride: stride handed to subsample_traj for every file
    :param out_dir: output folder name, created under <base_dir>/<protein>
    :return: None. Output files keep the base name of their input file.
    """
    yaml_file = load_yaml_file(yaml_file)

    with enter_protein_data_dir(yaml_file, protein):
        # Resolve to absolute paths while inside the protein dir.
        flist = [os.path.abspath(i) for i in
                 glob.glob("%s/*.hdf5" % yaml_file["protein_dir"])]

    base_dir = yaml_file["base_dir"]
    new_output_dir = os.path.join(base_dir, protein, out_dir)
    if not os.path.isdir(new_output_dir):
        os.mkdir(new_output_dir)

    fout = [os.path.join(new_output_dir, os.path.basename(i)) for i in flist]
    jobs = list(zip(flist, fout, itertools.repeat(stride)))

    # Half the cores, but never zero: Pool(0) raises ValueError on a
    # single core machine.
    p = Pool(max(1, cpu_count() // 2))
    try:
        p.map(subsample_traj, jobs)
    finally:
        # Do not leave worker processes behind.
        p.close()
        p.join()
    return
def featurize_series(yaml_file, ip_view, protein_list=None):
    """
    Featurize several proteins of the series, one after the other, with the
    default featurizer and a stride of 1.

    :param yaml_file: The yaml file to work with
    :param ip_view: ipython view(required)
    :param protein_list: list of proteins, if None then all the proteins in
        yaml_file["protein_list"] are processed
    :return: None; featurize_project_wrapper is called once per protein
    """
    yaml_file = load_yaml_file(yaml_file)
    wanted = yaml_file["protein_list"] if protein_list is None else protein_list

    for protein_name in wanted:
        featurize_project_wrapper(yaml_file, protein_name, feat=None,
                                  stride=1, view=ip_view)
    return
def series_feature_slicer(yaml_file, dict_feat_ind=None, featurizer=None, folder_name="sliced_feature_dir", view=None):
    """
    Slice the features of every protein down to a wanted set of indices.

    :param yaml_file: The project yaml file with
    :param dict_feat_ind: Dict of wanted feature indices for each protein.
        Defaults to none when you want the code to figure out what features
        to keep.
    :param featurizer: The featurizer object that was used to generate the
        features. Needed (with describe_features support) when dict_feat_ind
        is None.
    :param folder_name: Name of the output folder. Defaults to
        sliced_feature_dir
    :param view: pool of workers. Defaults to a multiprocessing Pool that is
        created here and shut down again before returning.
    :return: None
    :raises ValueError: if dict_feat_ind is None and either the yaml file
        has no "alignment_file" entry or the featurizer is missing / lacks
        describe_features
    """
    yaml_file = load_yaml_file(yaml_file)

    # if we want to do this and we cant find the sequence
    # (checked before any pool is created so nothing leaks on failure)
    if dict_feat_ind is None and ("alignment_file" not in yaml_file
                                  or featurizer is None
                                  or (not hasattr(featurizer, "describe_features"))):
        raise ValueError("To find common features, we need both "
                         "the alignment file in the yaml file "
                         "AND a featurizer obj that supports describe_features")

    own_pool = view is None
    if own_pool:
        view = Pool()

    try:
        if dict_feat_ind is None:
            # load alignment file
            aligned_dict = _parse_alignment_file(yaml_file["alignment_file"])
            # get list of feature indices
            dict_feat_ind, df_dict = _get_common_features(yaml_file, featurizer,
                                                          aligned_dict)
        _feature_slicer(yaml_file, dict_feat_ind, folder_name, view)
    finally:
        # Only shut down a pool this function created; a caller supplied
        # view stays usable.
        if own_pool:
            view.close()
            view.join()
    return
def test_map_residue_seq_with_two_inserts():
    """
    Insert a 3 character gap after 10 residues and a 2 character gap after
    20 residues, then check that the mapping is shifted by 0, 3 and 5 in
    the three resulting segments.
    """
    project_yaml = os.path.join(base_dir, "mdl_dir", "project.yaml")
    yaml_file = load_yaml_file(project_yaml)

    aligned_dict = {}
    for protein in yaml_file["protein_list"]:
        trj = load_random_traj(yaml_file, protein)

        # (first residue, one past last residue, shift) for each segment
        segments = [(0, 10, 0), (10, 20, 3), (20, trj.n_residues, 5)]
        shifted = []
        for start, stop, shift in segments:
            for res_ind in range(start, stop):
                if trj.top.residue(res_ind).code is not None:
                    shifted.append(res_ind + shift)

        prt_code = trj.top.to_fasta(chain=0)
        aligned_dict[protein] = "".join(
            [prt_code[:10], "---", prt_code[10:20], "--", prt_code[20:]])

        mapping, _ = _map_residue_ind_seq_ind(yaml_file, protein,
                                              aligned_dict[protein])
        assert shifted == list(mapping.values())
    return
def create_equivalent_contact_featurizer(yaml_file, alignment_file, protein_list=None, pairs=None, same_residue=True, transform=None, **kwargs):
    """
    Create a equivalent contacts featurizer for a set of proteins

    :param yaml_file: yaml file location
    :param alignment_file: alignment file location
    :param protein_list: proteins to build featurizers for. Defaults to
        yaml_file["protein_list"]
    :param pairs: wanted sequence index positions in the alignment
        You need to just figure out the wanted location for one residue.
        _map_residue_ind_seq_ind function can help with this. Defaults to
        every pair of positions up to the longest aligned sequence.
    :param same_residue: True is you would restrict to having the same
        residue at the same sequence position.
    :param transform: 'logistic', 'binary', or None / "none" for the plain
        contact featurizer
    :param kwargs: kwargs for the contact featurizer
    :return: dictionary of contact featurizers. one for each protein
    :raises ValueError: if transform is not one of the supported values
    """
    featurizer_dict = {}

    # load alignment file
    yaml_file = load_yaml_file(yaml_file)
    alignment = _parse_alignment_file(alignment_file)
    if protein_list is None:
        protein_list = yaml_file["protein_list"]

    # Pick the featurizer class once, before the expensive per-protein work,
    # so a bad transform fails immediately.
    if transform == 'logistic':
        featurizer_cls = LogisticContactFeaturizer
    elif transform == 'binary':
        featurizer_cls = BinaryContactFeaturizer
    elif transform is None or transform == "none":
        featurizer_cls = ContactFeaturizer
    else:
        raise ValueError("type needs to be one of logistic, binary, none")

    if pairs is None:
        # use the max length(probably a horrible idea)
        max_seq_len = max(len(alignment[p]) for p in alignment)
        pairs = itertools.combinations(range(max_seq_len), 2)

    # Set of (low, high) alignment position pairs: O(1) membership instead
    # of scanning a list for every candidate, and it also behaves for
    # arrays or lists of lists.
    wanted_pairs = set(tuple(sorted((int(a), int(b)))) for a, b in pairs)
    wanted_positions = sorted(set(pos for pair in wanted_pairs for pos in pair))

    # Which alignment positions can be kept does not depend on the protein,
    # so work it out once.
    keep_positions = []
    for position in wanted_positions:
        # get the possible codes at every position
        possible_codes = set(alignment[p][position] for p in alignment)
        # skip positions where some sequence has a missing residue
        if "-" in possible_codes:
            continue
        if same_residue and len(possible_codes) != 1:
            continue
        keep_positions.append(position)

    # Filter in alignment position space, where `pairs` is defined.
    # Previously residue index pairs were compared against these position
    # pairs, which only agrees when nothing precedes the residues in the
    # alignment.
    position_pairs = [pp for pp in itertools.combinations(keep_positions, 2)
                      if pp in wanted_pairs]

    for protein in protein_list:
        print(protein)
        # get mapping and seq
        prt_mapping, prt_seq = _map_residue_ind_seq_ind(yaml_file, protein,
                                                        alignment[protein])
        # alignment position -> residue index
        inv_map = {v: k for k, v in prt_mapping.items()}
        # Positions are sorted, so the pairs come out in a stable order.
        # NOTE(review): assumes the residue -> position mapping is
        # monotonic, so this matches the old residue-index sort - confirm.
        actual_pairs = np.array([(inv_map[a], inv_map[b])
                                 for a, b in position_pairs])
        featurizer_dict[protein] = featurizer_cls(contacts=actual_pairs, **kwargs)

    return featurizer_dict
def test_convert_project():
    """
    Extract the fake test projects (repeatedly, to make sure re-extraction
    is harmless) and compare the written hdf5 files against the clones
    loaded directly.
    """
    print(base_dir)
    pool = Pool(6)
    yaml_file = load_yaml_file(
        os.path.join(base_dir, "mdl_dir", "project.yaml"))

    compared_attributes = ["top", "n_atoms", "n_chains", "n_frames",
                           "n_residues"]

    def test_hdf5(protein, p, r, clone):
        """Full and stripped hdf5 output must match the loaded clone."""
        trj, stripped_trj = _load_project_clone(protein, p, r, clone)
        trj2 = mdt.load(
            os.path.join(base_dir, protein,
                         "trajectories/%s_%d_0.hdf5" % (p, r)))
        trj3 = mdt.load(
            os.path.join(base_dir, protein,
                         "protein_traj/%s_%d_0.hdf5" % (p, r)))
        for i in compared_attributes:
            assert getattr(trj, i) == getattr(trj2, i)
        for i in compared_attributes:
            assert getattr(stripped_trj, i) == getattr(trj3, i)
        assert (trj.xyz == trj2.xyz).all()
        assert (stripped_trj.xyz == trj3.xyz).all()
        return True

    def test_stripped_hdf5(protein, p, r, clone):
        """Stripped hdf5 output must match the stripped loaded clone."""
        trj, stripped_trj = _load_project_clone(protein, p, r, clone)
        trj3 = mdt.load(
            os.path.join(base_dir, protein,
                         "protein_traj/%s_%d_0.hdf5" % (p, r)))
        for i in compared_attributes:
            assert getattr(stripped_trj, i) == getattr(trj3, i)
        assert (stripped_trj.xyz == trj3.xyz).all()
        return True

    def test_hdf5_file_validation():
        """
        Kinase1/RUN1/CLONE0 has a missing file results-001.tar.bz2. We make
        sure that that hdf5 has the first results-000.tar.bz2 but not 002.
        This is a hardcoded test that is not really desirable
        """
        trj = HDF5TrajectoryFile(
            os.path.join(base_dir, "kinase_1", "trajectories",
                         "fake_proj1_1_0.hdf5"))
        try:
            # The node is only readable while the file is open, so decide
            # before closing.
            flist = trj._handle.root.processed_filenames
            fpath, fname = os.path.split(flist[0])
            return os.path.join(fpath, six.b("results-000.tar.bz2")) in flist and \
                os.path.join(fpath, six.b("results-002.tar.bz2")) not in flist
        finally:
            # The handle was left open before.
            trj.close()

    def test_non_contingous():
        """
        Kinase2/fake_proj3/RUN1/ has two clones Clone 0 and Clone 2
        we make sure that the naming convention is correct
        """
        assert os.path.isfile(
            os.path.join(base_dir, "kinase_2", "protein_traj",
                         "fake_proj3_1_0.hdf5"))
        assert not os.path.isfile(
            os.path.join(base_dir, "kinase_2", "protein_traj",
                         "fake_proj3_1_1.hdf5"))
        assert os.path.isfile(
            os.path.join(base_dir, "kinase_2", "protein_traj",
                         "fake_proj3_1_2.hdf5"))
        return True

    try:
        for i in range(3):
            # extract the project multiple times to see what happens
            extract_project_wrapper(yaml_file, "kinase_1", "fake_proj1", pool)
            extract_project_wrapper(yaml_file, "kinase_1", "fake_proj2", pool)
            assert test_hdf5("kinase_1", "fake_proj1", 0, 0)
            assert test_hdf5_file_validation()
            assert test_hdf5("kinase_1", "fake_proj2", 0, 0)
            # do it for the second project too.
            extract_project_wrapper(yaml_file, "kinase_2", "fake_proj3", pool,
                                    protein_only=True)
            assert test_stripped_hdf5("kinase_2", "fake_proj3", 0, 0)
            assert test_non_contingous()
    finally:
        # Shut the workers down whether or not an assertion failed.
        pool.close()
        pool.join()
    return True
def test_convert_project():
    """
    Extract the fake test projects (repeatedly, to make sure re-extraction
    is harmless) and compare the written hdf5 files against the clones
    loaded directly.
    """
    print(base_dir)
    pool = Pool(6)
    yaml_file = load_yaml_file(os.path.join(base_dir,"mdl_dir","project.yaml"))

    compared_attributes = ["top", "n_atoms", "n_chains", "n_frames",
                           "n_residues"]

    def test_hdf5(protein, p, r, clone):
        """Full and stripped hdf5 output must match the loaded clone."""
        trj, stripped_trj = _load_project_clone(protein, p, r, clone)
        trj2 = mdt.load(os.path.join(base_dir, protein,
                                     "trajectories/%s_%d_0.hdf5"%(p,r)))
        trj3 = mdt.load(os.path.join(base_dir, protein,
                                     "protein_traj/%s_%d_0.hdf5"%(p,r)))
        for i in compared_attributes:
            assert getattr(trj, i) == getattr(trj2, i)
        for i in compared_attributes:
            assert getattr(stripped_trj, i) == getattr(trj3, i)
        assert (trj.xyz == trj2.xyz).all()
        assert (stripped_trj.xyz == trj3.xyz).all()
        return True

    def test_stripped_hdf5(protein, p, r, clone):
        """Stripped hdf5 output must match the stripped loaded clone."""
        trj, stripped_trj = _load_project_clone(protein, p, r, clone)
        trj3 = mdt.load(os.path.join(base_dir, protein,
                                     "protein_traj/%s_%d_0.hdf5"%(p,r)))
        for i in compared_attributes:
            assert getattr(stripped_trj, i) == getattr(trj3, i)
        assert (stripped_trj.xyz == trj3.xyz).all()
        return True

    def test_hdf5_file_validation():
        """
        Kinase1/RUN1/CLONE0 has a missing file results-001.tar.bz2. We make
        sure that that hdf5 has the first results-000.tar.bz2 but not 002.
        This is a hardcoded test that is not really desirable
        """
        trj = HDF5TrajectoryFile(os.path.join(base_dir,"kinase_1",
                                              "trajectories","fake_proj1_1_0.hdf5"))
        try:
            # The node is only readable while the file is open, so decide
            # before closing.
            flist = trj._handle.root.processed_filenames
            fpath, fname = os.path.split(flist[0])
            return os.path.join(fpath,six.b("results-000.tar.bz2")) in flist and \
                os.path.join(fpath,six.b("results-002.tar.bz2")) not in flist
        finally:
            # The handle was left open before.
            trj.close()

    def test_non_contingous():
        """
        Kinase2/fake_proj3/RUN1/ has two clones Clone 0 and Clone 2
        we make sure that the naming convention is correct
        """
        assert os.path.isfile(os.path.join(base_dir,"kinase_2",
                                           "protein_traj", "fake_proj3_1_0.hdf5"))
        assert not os.path.isfile(os.path.join(base_dir,"kinase_2",
                                               "protein_traj", "fake_proj3_1_1.hdf5"))
        assert os.path.isfile(os.path.join(base_dir,"kinase_2",
                                           "protein_traj", "fake_proj3_1_2.hdf5"))
        return True

    try:
        for i in range(3):
            # extract the project multiple times to see what happens
            extract_project_wrapper(yaml_file, "kinase_1", "fake_proj1", pool)
            extract_project_wrapper(yaml_file, "kinase_1", "fake_proj2", pool)
            assert test_hdf5("kinase_1", "fake_proj1", 0, 0)
            assert test_hdf5_file_validation()
            assert test_hdf5("kinase_1", "fake_proj2", 0, 0)
            # do it for the second project too.
            extract_project_wrapper(yaml_file, "kinase_2", "fake_proj3", pool,
                                    protein_only=True)
            assert test_stripped_hdf5("kinase_2", "fake_proj3", 0, 0)
            assert test_non_contingous()
    finally:
        # Shut the workers down whether or not an assertion failed.
        pool.close()
        pool.join()
    return True