def _test_tic_dict(prj):
    """Cross-check ``Protein.tic_data`` for "kinase_1" against a manual scan.

    A tIC index and a state index are drawn at random. Every frame whose
    fixed assignment equals the drawn state is looked up by hand in
    ``tica_data`` and the collected values are compared with what
    ``tic_data(tic)[state]`` reports.

    :param prj: The series object passed straight to ``Protein``.
    :return: True when both assertions hold.
    """
    protein = Protein(prj, "kinase_1")
    collected = []
    # Keep the draw order (tic first, then state) so a seeded RNG
    # reproduces the same pair.
    tic = np.random.randint(protein.n_tics_)
    state = np.random.randint(protein.n_states_)

    for traj_name in protein.fixed_assignments.keys():
        assignments = protein.fixed_assignments[traj_name]
        for frame, assigned_state in enumerate(assignments):
            if assigned_state != state:
                continue
            collected.append(protein.tica_data[traj_name][frame][tic])

    assert len(collected) == len(protein.tic_data(tic)[state])
    assert collected == protein.tic_data(tic)[state]
    return True
def _test_tic_sampling(yaml_file, protein_name, tic_list, n_frames, scheme):
    """Sample along tICs for one protein and sanity-check the written frames.

    ``sample_for_all_proteins`` is run for ``protein_name``; afterwards the
    ``tic0.xtc`` and ``tic1.xtc`` trajectories are loaded from the protein's
    model directory, featurized with phi/psi/chi1 dihedrals and projected
    with the protein's tICA model. For each of the two tICs the rounded
    projection must not decrease from the first to the last frame and must
    lie within the rounded min/max of the protein's own tICA data.

    NOTE(review): tICs 0 and 1 are always checked, regardless of
    ``tic_list`` -- confirm this is intended.

    :param yaml_file: Loaded project yaml (indexed with ``"mdl_dir"``).
    :param protein_name: Name of the protein to sample.
    :param tic_list: tIC indices forwarded to ``sample_for_all_proteins``.
    :param n_frames: Number of frames forwarded to the sampler.
    :param scheme: Sampling scheme forwarded to the sampler.
    :return: True when all assertions hold.
    """
    sample_for_all_proteins(yaml_file, [protein_name], tic_list, n_frames,
                            scheme=scheme)
    series = ProteinSeries(yaml_file)
    protein = Protein(series, protein_name)

    for tic_index in (0, 1):
        protein_mdl_dir = os.path.join(base_dir, yaml_file["mdl_dir"],
                                       protein_name)
        xtc_path = os.path.join(protein_mdl_dir, "tic%d.xtc" % tic_index)
        top_path = os.path.join(protein_mdl_dir, "prot.pdb")
        sampled_traj = mdt.load(xtc_path, top=top_path)
        print(sampled_traj.n_frames)

        featurizer = DihedralFeaturizer(types=['phi', 'psi', 'chi1'])
        features = featurizer.partial_transform(sampled_traj)
        projected = np.round(protein.tica_mdl.transform([features]))[0]

        # The sampled frames should walk the tIC from low to high.
        print("Look here", projected)
        assert projected[0][tic_index] <= projected[-1][tic_index]

        # Pool this tIC's values over every trajectory; sorted so the
        # first/last entries are the global min/max.
        pooled = [
            value
            for traj_tica in protein.tica_data.values()
            for value in traj_tica[:, tic_index]
        ]
        pooled = np.round(np.sort(pooled))

        tic_column = projected[:, tic_index]
        above_min = tic_column >= pooled[0]
        below_max = tic_column <= pooled[-1]
        print(tic_index)
        print(above_min)
        print(below_max)

        # Every sampled frame must fall inside the observed tIC range.
        assert above_min.all()
        assert below_max.all()
    return True
def test_map_tic_component():
    """Check ``_map_tic_component`` output sizes and spot-check residue 0.

    The pipeline is fitted, the feature descriptor and one fake trajectory
    of "kinase_1" are loaded, and the first tICA component is mapped onto
    atoms and residues. The first rows of both results must have one entry
    per atom / per residue, and the residue-0 value must equal the summed
    absolute component weights of every feature whose ``resids`` contain 0.
    """
    project = load_yaml_file(os.path.join(base_dir, "mdl_dir", "project.yaml"))
    fit_pipeline(project["base_dir"])

    with enter_protein_data_dir(project, "kinase_1"):
        descriptor_path = os.path.join(project["feature_dir"],
                                       "feature_descriptor.h5")
        feature_df = pd.DataFrame(verboseload(descriptor_path))
        traj = mdt.load(os.path.join(project["protein_dir"],
                                     "fake_proj1_0_0.hdf5"))

    series = ProteinSeries(project, base_dir)
    protein = Protein(series, "kinase_1")
    tic_index = 0
    component = protein.tica_mdl.components_[tic_index, :]

    atom_importance, residue_importance = _map_tic_component(
        component, feature_df, traj)
    assert len(atom_importance[0]) == traj.n_atoms
    assert len(residue_importance[0]) == traj.n_residues

    # Spot check residue 0: gather the features that involve it and sum
    # the magnitude of their weights in the component.
    residue0_rows = pd.DataFrame(
        [row for _, row in feature_df.iterrows() if 0 in row["resids"]])
    residue0_importance = np.sum(abs(component[residue0_rows.index]))
    assert residue0_importance == residue_importance[0, 0]
def sample_tic_region(yaml_file, protein_name, tic_region, n_frames=50,
                      fname=None, save_trj=True):
    """
    Helper function for sampling frames around a particular tic_region.

    :param yaml_file: The project's yaml file
    :param protein_name: The name of the protein
    :param tic_region(dict): The tic_region. Can be multidimensional with
        1 number per tic coordinate (defaults to 0 for all non-mentioned
        regions)
    :param n_frames: The number of frames around the coordinate
    :param fname: Output file name handed to ``_frame_loader``; falls back
        to "sampled_tic_region.xtc" when None
    :param save_trj: Flag forwarded unchanged to ``_frame_loader``
        (presumably controls whether the trajectory is written -- confirm)
    :return: Whatever ``_frame_loader`` returns for the sampled frames
    """
    yaml_file = load_yaml_file(yaml_file)
    protein = Protein(ProteinSeries(yaml_file), protein_name)

    # Freeze the trajectory order once so the data list and the names
    # passed to the loader line up index-for-index.
    traj_names = list(protein.tica_data.keys())
    tica_per_traj = [protein.tica_data[name] for name in traj_names]
    frame_indices = sample_region(tica_per_traj, tic_region, n_frames)

    out_name = "sampled_tic_region.xtc" if fname is None else fname
    return _frame_loader(yaml_file, protein, traj_names, frame_indices,
                         save_trj, out_name)
def test_msm_pull_centroid():
    """Check that ``sample_state_centroid`` returns and writes centroid frames.

    Two frames are requested for all states of "kinase_1"; the returned
    trajectory must hold ``n_states_ * 2`` frames and "centroids.xtc" must
    exist in the protein's model directory.
    """
    project_yaml = os.path.join(base_dir, "mdl_dir", "project.yaml")
    series = ProteinSeries(project_yaml, base_dir)
    protein = Protein(series, "kinase_1")

    centroid_traj = sample_state_centroid(project_yaml, protein.name,
                                          states='all', n_frames=2,
                                          output_name="centroids.xtc")

    expected_output = os.path.join(base_dir, "mdl_dir", "kinase_1",
                                   "centroids.xtc")
    assert centroid_traj.n_frames == protein.n_states_ * 2
    assert os.path.isfile(expected_output)
def test_msm_traj():
    """Check that ``sample_msm_traj`` writes a consistent synthetic trajectory.

    A 2-step trajectory is sampled for "kinase_2" starting from the first
    MSM state label. The pickled state sequence and the written xtc must
    both have 2 entries, and re-assigning the xtc frames with
    ``_fit_transform`` must reproduce the pickled states.
    """
    project = load_yaml_file(os.path.join(base_dir, "mdl_dir", "project.yaml"))
    n_steps = 2
    series = ProteinSeries(project, base_dir)
    protein = Protein(series, "kinase_2")
    first_state = protein.msm.state_labels_[0]

    sample_msm_traj(project, "kinase_2", n_steps=n_steps,
                    starting_state=first_state)

    # The outputs are read with relative names, so load them from inside
    # the protein's model directory.
    with enter_protein_mdl_dir(project, "kinase_2"):
        sampled_states = verboseload("msm_traj.pkl")
        sampled_traj = mdt.load("msm_traj.xtc", top="prot.pdb")

    assert sampled_traj.n_frames == n_steps
    assert len(sampled_states) == n_steps

    reassigned = _fit_transform(protein, sampled_traj)
    assert (reassigned == sampled_states).all()
def sample_one_tic(yaml_file, protein_name, tic_index, n_frames,
                   scheme="linear"):
    """
    Sample frames along a single tic for one protein.

    :param yaml_file: The project's yaml file
    :param protein_name: The name of the protein
    :param tic_index: Tic index to sample along
    :param n_frames: The number of frames wanted
    :param scheme: Sampling scheme forwarded to ``pull_frames``
        (defaults to "linear")
    :return: The result of ``pull_frames``. Dumps a tic%d.xtc and
        tic%d.log for the given protein inside its model.
    """
    yaml_file = load_yaml_file(yaml_file)
    series = ProteinSeries(yaml_file)
    protein = Protein(series, protein_name)
    sampled = pull_frames(yaml_file, protein, tic_index, n_frames, scheme)
    return sampled
def _load_protein_matrices(yaml_file, protein_name):
    """
    Helper routine to load the assignment and tic matrices for a protein.

    :param yaml_file: yaml file to work with
    :param protein_name: name of the protein
    :return:
        prj: The protein series
        prt: The protein project
        key_mapping: mapping of the assignment matrix 0-axis to traj names
        assignment_matrix: matrix built by ``create_assignment_matrix``
            from the protein's fixed assignments
        tics_mapping: mapping of the tics_array 0-axis to traj names
        tics_array: Multi dimensional array where the 0th axis is equal to
            the number of trajectories, the 1st axis is equal to the
            largest traj and the last axis is equal to the number of tics
            in the mdl.
    """
    series = ProteinSeries(yaml_file)
    protein = Protein(series, protein_name)

    traj_keys, assignments = create_assignment_matrix(
        protein.fixed_assignments)
    tic_keys, tics = create_tics_array(
        protein.fixed_assignments,
        protein.kmeans_mdl,
        protein.tica_data,
    )
    return series, protein, traj_keys, assignments, tic_keys, tics
def test_plotting_utils():
    """End-to-end check of the tIC plotting helpers on a fake two-kinase series.

    Inside a temporary directory the fake data is created, the series
    analysis is set up and fitted, and then three nested checks run:

    * ``test_bounds``: the first/last entries of each array returned by
      ``global_tic_boundaries`` equal the min/max of that tIC over both
      proteins, and each array has ``n_bins`` entries.
    * ``test_histogram_data``: ``tica_histogram`` returns one histogram
      per state and matches ``np.histogram`` for a random state.
    * ``test_one_dim_free_energy``: the data frame returned by
      ``one_dim_tic_free_energy`` is labelled with the protein name and
      the "mle" model index.
    """
    with enter_temp_directory():
        base_dir = os.path.abspath(os.path.curdir)
        mdl_dir = os.path.join(base_dir, "mdl_dir")
        feature_dir = "feature_dir"
        series_name = "fake_series"
        protein_list = ["kinase_1", "kinase_2"]
        project_dict = {
            "kinase_1": ["fake_proj1"],
            "kinase_2": ["fake_proj2"],
        }
        # NOTE(review): 'bootrap__n_samples' looks like a misspelling of
        # "bootstrap"; left untouched until the key expected by
        # setup_series_analysis is confirmed.
        mdl_params = {
            'tica__n_components': 1,
            'tica__lag_time': 1,
            'tica__kinetic_mapping': True,
            'tica__shrinkage': 0.01,
            'cluster__n_clusters': 2,
            'msm__lag_time': 1,
            'bootrap__n_samples': 1,
        }

        create_fake_data(base_dir, protein_list, project_dict)
        setup_series_analysis(base_dir, mdl_dir, feature_dir, series_name,
                              protein_list, project_dict, mdl_params)
        fit_pipeline(base_dir)

        prj = ProteinSeries(os.path.join(mdl_dir, "project.yaml"))
        prt1 = Protein(prj, "kinase_1")
        prt2 = Protein(prj, "kinase_2")
        # Presumably a padding factor applied to the tIC bounds; zeroed so
        # the exact-equality checks in test_bounds can hold -- confirm.
        prt1._mlpt_fct = 0.0
        prt2._mlpt_fct = 0.0

        n_bins = 100
        lin_spaced_tic_dict = global_tic_boundaries(
            [prt1, prt2], range(prt1.n_tics_), n_bins)

        def test_bounds():
            # Recompute the extremes per tIC column. The previous version
            # took min/max over whole 2-D arrays (which only works for a
            # single tIC) and reused the loop name inside comprehensions.
            locally_calc = {}
            for tic in range(prt1.n_tics_):
                columns = [
                    traj[:, tic]
                    for prt in (prt1, prt2)
                    for traj in prt.tica_data.values()
                ]
                locally_calc[tic] = [
                    min(np.min(column) for column in columns),
                    max(np.max(column) for column in columns),
                ]

            for tic in range(prt1.n_tics_):
                assert lin_spaced_tic_dict[tic][0] == locally_calc[tic][0]
                assert lin_spaced_tic_dict[tic][-1] == locally_calc[tic][-1]
                assert len(lin_spaced_tic_dict[tic]) == n_bins
            return True

        def test_histogram_data():
            H_dict, H_calc, _ = tica_histogram(
                prj, prt1, [0], x_array=lin_spaced_tic_dict[0], n_bins=None)
            assert len(H_dict.keys()) == prt1.n_states_
            # n bin edges give n - 1 bins.
            assert len(H_calc) == len(lin_spaced_tic_dict[0]) - 1

            rnd_state = np.random.randint(0, prt1.n_states_)
            # `normed` was removed from np.histogram in NumPy 1.24;
            # `density=True` gives the same normalisation for equal-width
            # bins (the edges here are expected to be linearly spaced).
            reference = np.histogram(prt1.tic_dict[0][rnd_state],
                                     bins=lin_spaced_tic_dict[0],
                                     density=True)[0]
            assert np.allclose(H_dict[rnd_state], reference)
            return True

        def test_one_dim_free_energy():
            df = one_dim_tic_free_energy(
                prj, prt1, 0, n_bins=None,
                lin_spaced_tic=lin_spaced_tic_dict[0], errorbars=False)
            assert (df.protein_name == prt1.name).all()
            assert (df.mdl_index == "mle").all()
            return True

        assert test_bounds()
        assert test_histogram_data()
        assert test_one_dim_free_energy()
def test_plotting_utils():
    """Fit a fake two-kinase series and verify the tIC plotting utilities.

    Everything runs inside a temporary directory. After the fake data has
    been generated and the pipeline fitted, three nested checks verify:

    * the arrays from ``global_tic_boundaries`` start and end at the
      per-tIC min and max over both proteins and have ``n_bins`` entries;
    * ``tica_histogram`` yields one histogram per state and agrees with
      ``np.histogram`` for a randomly chosen state;
    * ``one_dim_tic_free_energy`` labels every row with the protein name
      and the "mle" model index.
    """
    with enter_temp_directory():
        base_dir = os.path.abspath(os.path.curdir)
        mdl_dir = os.path.join(base_dir, "mdl_dir")
        feature_dir = "feature_dir"
        series_name = "fake_series"
        protein_list = ["kinase_1", "kinase_2"]
        project_dict = {"kinase_1": ["fake_proj1"],
                        "kinase_2": ["fake_proj2"]}
        # NOTE(review): the 'bootrap__n_samples' key may be a typo for
        # "bootstrap"; kept as-is pending confirmation of the expected key.
        mdl_params = {'tica__n_components': 1, 'tica__lag_time': 1,
                      'tica__kinetic_mapping': True,
                      'tica__shrinkage': 0.01,
                      'cluster__n_clusters': 2, 'msm__lag_time': 1,
                      'bootrap__n_samples': 1}

        create_fake_data(base_dir, protein_list, project_dict)
        setup_series_analysis(base_dir, mdl_dir, feature_dir, series_name,
                              protein_list, project_dict, mdl_params)
        fit_pipeline(base_dir)

        prj = ProteinSeries(os.path.join(mdl_dir, "project.yaml"))
        prt1 = Protein(prj, "kinase_1")
        prt2 = Protein(prj, "kinase_2")
        # Zeroed on both proteins before the bounds are computed;
        # presumably disables padding of the tIC range -- confirm.
        prt1._mlpt_fct = 0.0
        prt2._mlpt_fct = 0.0

        n_bins = 100
        lin_spaced_tic_dict = global_tic_boundaries(
            [prt1, prt2], range(prt1.n_tics_), n_bins)

        def test_bounds():
            # Extremes are taken from the column of the tIC being checked.
            # The earlier code compared whole 2-D arrays (valid only with
            # one tIC) and shadowed the loop variable in comprehensions.
            locally_calc = {}
            proteins = (prt1, prt2)
            for tic in range(prt1.n_tics_):
                lows = [np.min(traj[:, tic])
                        for prt in proteins
                        for traj in prt.tica_data.values()]
                highs = [np.max(traj[:, tic])
                         for prt in proteins
                         for traj in prt.tica_data.values()]
                locally_calc[tic] = [min(lows), max(highs)]

            for tic in range(prt1.n_tics_):
                edges = lin_spaced_tic_dict[tic]
                assert edges[0] == locally_calc[tic][0]
                assert edges[-1] == locally_calc[tic][-1]
                assert len(edges) == n_bins
            return True

        def test_histogram_data():
            H_dict, H_calc, _ = tica_histogram(
                prj, prt1, [0], x_array=lin_spaced_tic_dict[0], n_bins=None)
            assert len(H_dict.keys()) == prt1.n_states_
            # One fewer bin than bin edges.
            assert len(H_calc) == len(lin_spaced_tic_dict[0]) - 1

            rnd_state = np.random.randint(0, prt1.n_states_)
            # np.histogram no longer accepts `normed` (removed in NumPy
            # 1.24). `density=True` normalises identically when the bins
            # are equal-width, as the linearly spaced edges should be.
            expected, _ = np.histogram(prt1.tic_dict[0][rnd_state],
                                       bins=lin_spaced_tic_dict[0],
                                       density=True)
            assert np.allclose(H_dict[rnd_state], expected)
            return True

        def test_one_dim_free_energy():
            df = one_dim_tic_free_energy(
                prj, prt1, 0, n_bins=None,
                lin_spaced_tic=lin_spaced_tic_dict[0], errorbars=False)
            assert (df.protein_name == prt1.name).all()
            assert (df.mdl_index == "mle").all()
            return True

        assert test_bounds()
        assert test_histogram_data()
        assert test_one_dim_free_energy()