def fit_protein_tica(yaml_file, sparse=False):
    """Fit a single tICA model over the featurized trajectories of all proteins.

    Keyword arguments for the model come from the entries of
    ``yaml_file["mdl_params"]`` whose key starts with ``"tica__"`` (the prefix
    is stripped).  For every protein in ``yaml_file["protein_list"]`` the
    ``*.jl`` files under ``yaml_file["feature_dir"]`` are loaded one at a time
    and passed to ``partial_fit``.  The model is finally dumped to
    ``<mdl_dir>/tica_mdl.pkl``.

    Args:
        yaml_file: mapping read for the keys ``"mdl_dir"``, ``"mdl_params"``,
            ``"protein_list"`` and ``"feature_dir"``.
        sparse: when truthy a ``SparseTICA`` model is built, otherwise ``tICA``.

    Returns:
        None. The fitted model is written to disk.
    """
    prefix = "tica__"
    mdl_dir = yaml_file["mdl_dir"]
    mdl_params = yaml_file["mdl_params"]

    # Strip only the leading prefix so the remainder of the key is kept
    # verbatim even if it happens to contain the prefix text again.
    current_mdl_params = {
        key[len(prefix):]: value
        for key, value in mdl_params.items()
        if key.startswith(prefix)
    }

    model_cls = SparseTICA if sparse else tICA
    protein_tica_mdl = model_cls(**current_mdl_params)

    for protein in yaml_file["protein_list"]:
        print("Fitting to protein %s" % protein)
        with enter_protein_data_dir(yaml_file, protein):
            featurized_traj = sorted(
                glob.glob("./%s/*.jl" % yaml_file["feature_dir"]),
                key=keynat)
            for f in featurized_traj:
                featurized_path = verboseload(f)
                try:
                    protein_tica_mdl.partial_fit(featurized_path)
                except Exception as err:
                    # Fitting stays best-effort (one bad trajectory must not
                    # abort the run), but the skipped file is now reported
                    # instead of vanishing silently.
                    print("Skipping %s during partial fit: %s" % (f, err))
            print("Done partial fitting to protein %s" % protein)

    # dumping the tica_mdl
    tica_mdl_path = os.path.join(mdl_dir, "tica_mdl.pkl")
    verbosedump(protein_tica_mdl, tica_mdl_path)
    return
def landmark_ktica_ticaTraj(tica_dir, clusterer_dir, ktica_dir,
                            clusters_map_file="", landmarks_dir="",
                            nystroem_components=1000, n_components=10,
                            lag_time=5, nystroem_data_filename="",
                            fit_model_filename="", projected_data_filename="",
                            landmark_subsample=1, sparse=False, wolf=True,
                            rho=0.01, shrinkage=None):
    """Run landmark kernel tICA using cluster centers as Nystroem landmarks.

    The data loaded from ``tica_dir`` is passed through a ``Nystroem``
    transform whose basis is the ``cluster_centers_`` of the clusterer loaded
    from ``clusterer_dir``.  A tICA (or SparseTICA) model is then fitted to
    that output and used to project it.  Results are cached on disk: an
    existing ``nystroem_data_filename`` is loaded instead of recomputed, and an
    existing ``projected_data_filename`` skips the fit entirely.

    Args:
        tica_dir: path handed to ``verboseload`` for the input features.
        clusterer_dir: path handed to ``verboseload`` for the clusterer.
        ktica_dir: output directory, created when missing.
        clusters_map_file: accepted but not used by this function.
        landmarks_dir: accepted but not used by this function.
        nystroem_components: accepted but not used; the number of Nystroem
            components is the number of landmarks.
        n_components: forwarded to the tICA model.
        lag_time: forwarded to the tICA model.
        nystroem_data_filename: cache path for the Nystroem output.
        fit_model_filename: path the fitted model is dumped to.
        projected_data_filename: path the projected data is saved to.
        landmark_subsample: accepted but not used by this function.
        sparse: build ``SparseTICA`` (with ``rho``) instead of ``tICA``.
        wolf: accepted but not used by this function.
        rho: forwarded to ``SparseTICA`` only.
        shrinkage: forwarded to the model only when not ``None``.
    """
    # Imported locally so the function does not depend on a module-level
    # import being present.
    import subprocess

    if not os.path.exists(ktica_dir):
        os.makedirs(ktica_dir)

    model_kwargs = {"n_components": n_components, "lag_time": lag_time}
    if shrinkage is not None:
        model_kwargs["shrinkage"] = shrinkage
    if sparse:
        model_kwargs["rho"] = rho
        tica_model = SparseTICA(**model_kwargs)
    else:
        tica_model = tICA(**model_kwargs)

    if not os.path.exists(nystroem_data_filename):
        clusterer = verboseload(clusterer_dir)
        features = verboseload(tica_dir)
        landmarks = clusterer.cluster_centers_

        print("here's what goes into the combined class:")
        print(np.shape(landmarks))
        print(type(landmarks))

        nys = Nystroem(n_components=np.shape(landmarks)[0], basis=landmarks)
        nyx = nys.fit_transform(features)

        # Drop the references early; only the Nystroem output is needed now.
        del features
        del landmarks

        try:
            save_dataset(nyx, nystroem_data_filename)
        except Exception:
            # Retry once after clearing whatever sits at the target path.
            # An argument list (no shell) keeps paths containing spaces or
            # shell metacharacters from being misinterpreted.
            subprocess.call(["rm", "-rf", nystroem_data_filename])
            save_dataset(nyx, nystroem_data_filename)
    else:
        nyx = load_dataset(nystroem_data_filename)

    print(np.shape(nyx))
    print(dir(nyx))

    if not os.path.exists(projected_data_filename):
        fit_model = tica_model.fit(nyx)
        verbosedump(fit_model, fit_model_filename)
        transformed_data = fit_model.transform(nyx)
        del nyx
        try:
            save_dataset(transformed_data, projected_data_filename)
        except Exception:
            subprocess.call(["rm", "-rf", projected_data_filename])
            save_dataset(transformed_data, projected_data_filename)
    else:
        print("Already performed landmark kernel tICA.")
def ktica(features, landmarks, projected_data_filename,
          nystroem_data_filename, fit_model_filename, sparse=False,
          shrinkage=0.05, wolf=True, rho=0.01, n_components=25, lag_time=5):
    """Fit kernel tICA on ``features`` via a Nystroem map built on ``landmarks``.

    ``features`` is transformed by a ``Nystroem`` object whose basis is
    ``landmarks``; a tICA (or SparseTICA) model is fitted to that output and
    used to project it.  An existing ``nystroem_data_filename`` is loaded
    instead of recomputed, and an existing ``projected_data_filename`` skips
    the fit.

    Args:
        features: input data handed to ``Nystroem.fit_transform``.
        landmarks: Nystroem basis; its first dimension sets the number of
            Nystroem components.
        projected_data_filename: path the projected data is saved to.
        nystroem_data_filename: cache path for the Nystroem output.
        fit_model_filename: path the fitted model is dumped to.
        sparse: build ``SparseTICA`` (with ``rho``) instead of ``tICA``.
        shrinkage: regularisation value; ``None`` omits it from the model.
        wolf: for the non-sparse model, pass ``shrinkage`` under the keyword
            ``shrinkage`` when true and under ``gamma`` when false.  Not
            consulted for the sparse model.
        rho: forwarded to ``SparseTICA`` only.
        n_components: forwarded to the tICA model (default 25).
        lag_time: forwarded to the tICA model (default 5).
    """
    # n_components / lag_time used to be free variables in this function;
    # they are now explicit keyword parameters so the call cannot fail with a
    # NameError.
    # Imported locally so the function does not depend on a module-level
    # import being present.
    import subprocess

    model_kwargs = {"n_components": n_components, "lag_time": lag_time}
    if sparse:
        model_kwargs["rho"] = rho
        if shrinkage is not None:
            model_kwargs["shrinkage"] = shrinkage
        tica_model = SparseTICA(**model_kwargs)
    else:
        if shrinkage is not None:
            model_kwargs["shrinkage" if wolf else "gamma"] = shrinkage
        tica_model = tICA(**model_kwargs)

    if not os.path.exists(nystroem_data_filename):
        nys = Nystroem(n_components=np.shape(landmarks)[0], basis=landmarks)
        nyx = nys.fit_transform(features)
        print("Computed Nystroem.")

        # Drop the local references; only the Nystroem output is needed now.
        del features
        del landmarks

        try:
            save_dataset(nyx, nystroem_data_filename)
        except Exception:
            # Retry once after clearing whatever sits at the target path.
            # An argument list (no shell) keeps unusual path characters safe.
            subprocess.call(["rm", "-rf", nystroem_data_filename])
            save_dataset(nyx, nystroem_data_filename)
    else:
        nyx = load_dataset(nystroem_data_filename)
        print("Loaded Nystroem")

    if not os.path.exists(projected_data_filename):
        fit_model = tica_model.fit(nyx)
        verbosedump(fit_model, fit_model_filename)
        transformed_data = fit_model.transform(nyx)
        del nyx
        try:
            save_dataset(transformed_data, projected_data_filename)
        except Exception:
            subprocess.call(["rm", "-rf", projected_data_filename])
            save_dataset(transformed_data, projected_data_filename)
    else:
        print("Already performed landmark kernel tICA.")
def test_doublewell():
    """Check the leading SparseTICA component on the ``build_dataset`` data.

    The first entry of the component must be about 0.58 (one decimal place)
    and the following nine entries must be approximately zero.  A plain tICA
    model is fitted on the same data first; its component is read but not
    asserted on.
    """
    dataset = build_dataset()

    dense_model = tICA(n_components=1).fit(dataset)
    dense_component = dense_model.components_[0]  # read only, never asserted

    sparse_model = SparseTICA(n_components=1, verbose=False).fit(dataset)
    sparse_component = sparse_model.components_[0]

    leading_weight = sparse_component[0]
    remaining_weights = sparse_component[1:]

    np.testing.assert_array_almost_equal(remaining_weights, np.zeros(9))
    np.testing.assert_almost_equal(leading_weight, 0.58, decimal=1)
def test_1():
    """Check that SparseTICA puts all weight on the first feature.

    Fits tICA and SparseTICA (one component each) to the ``build_dataset``
    data, prints both leading components, and asserts that the sparse one is
    close to a ten-entry vector with 1 first and 0 elsewhere.
    """
    dataset = build_dataset()

    dense_model = tICA(n_components=1).fit(dataset)
    dense_component = dense_model.components_[0]
    print('tICA\n', dense_component)

    sparse_model = SparseTICA(n_components=1, verbose=True).fit(dataset)
    sparse_component = sparse_model.components_[0]
    print('Sparse tICA\n', sparse_component)

    expected = [1] + [0] * 9
    assert np.allclose(sparse_component, expected)
def fit_protein_tica(yaml_file, sparse=False, ksparse=None):
    """Fit a single tICA-family model over the features of all proteins.

    Keyword arguments for the model come from the entries of
    ``yaml_file["mdl_params"]`` whose key starts with ``"tica__"`` (the prefix
    is stripped).  For each protein in ``yaml_file["protein_list"]`` the
    ``*.jl`` files in ``./normalized_features`` are used when that directory
    exists; otherwise a warning is printed and the files under
    ``yaml_file["feature_dir"]`` are used.  Every file is loaded and passed to
    ``partial_fit``.  The model is finally dumped to
    ``<mdl_dir>/tica_mdl.pkl``.

    Args:
        yaml_file: mapping read for the keys ``"mdl_dir"``, ``"mdl_params"``,
            ``"protein_list"`` and ``"feature_dir"``.
        sparse: when truthy a ``SparseTICA`` model is built; takes precedence
            over ``ksparse``.
        ksparse: when an integer (and ``sparse`` is falsy) a ``KSparseTICA``
            model is built with ``k=ksparse``; any other value selects plain
            ``tICA``.

    Returns:
        None. The fitted model is written to disk.
    """
    prefix = "tica__"
    mdl_dir = yaml_file["mdl_dir"]
    mdl_params = yaml_file["mdl_params"]

    # Strip only the leading prefix so the remainder of the key is kept
    # verbatim even if it happens to contain the prefix text again.
    current_mdl_params = {
        key[len(prefix):]: value
        for key, value in mdl_params.items()
        if key.startswith(prefix)
    }

    if sparse:
        protein_tica_mdl = SparseTICA(**current_mdl_params)
    elif isinstance(ksparse, int) and not isinstance(ksparse, bool):
        # bool is a subclass of int; exclude it so ksparse=True is not
        # mistaken for k=1.
        current_mdl_params["k"] = ksparse
        protein_tica_mdl = KSparseTICA(**current_mdl_params)
    else:
        protein_tica_mdl = tICA(**current_mdl_params)

    for protein in yaml_file["protein_list"]:
        print("Fitting to protein %s" % protein)
        with enter_protein_data_dir(yaml_file, protein):
            if os.path.exists("./normalized_features"):
                pattern = "./normalized_features/*.jl"
            else:
                print('Warning: features have not been scaled')
                pattern = "./%s/*.jl" % yaml_file["feature_dir"]
            featurized_traj = sorted(glob.glob(pattern), key=keynat)

            for f in featurized_traj:
                featurized_path = verboseload(f)
                try:
                    protein_tica_mdl.partial_fit(featurized_path)
                except Exception as err:
                    # Still best-effort, but say which file failed and why
                    # rather than printing a bare 'Error'.
                    print("Error partial fitting %s: %s" % (f, err))
            print("Done partial fitting to protein %s" % protein)

    # dumping the tica_mdl
    tica_mdl_path = os.path.join(mdl_dir, "tica_mdl.pkl")
    verbosedump(protein_tica_mdl, tica_mdl_path)
    return
def fit_and_transform(features_directory, model_dir, stride=5, lag_time=10,
                      n_components=5, wolf=True, shrinkage=None, rho=0.05,
                      parallel=True, sparse=True, traj_ext=".h5",
                      normalize=True, partial_fit=True, subsample=1,
                      recompute_tica=False, features=None):
    """Fit a tICA model to featurized trajectories and project them onto it.

    Feature files are the ``.dataset`` entries returned by
    ``get_trajectory_files(features_directory, ...)``.  The fitted model and
    the projected data are written with ``verbosedump`` into ``model_dir``.
    Nothing is recomputed when the projected-data file already exists, unless
    ``recompute_tica`` is set.

    Args:
        features_directory: directory holding the feature files and
            ``normalizer.h5`` (always loaded via ``compat_verboseload``).
        model_dir: output directory, created when missing.
        stride: accepted but not used by this function.
        lag_time: forwarded to the tICA model.
        n_components: forwarded to the tICA model.
        wolf: accepted but not used by this function.
        shrinkage: forwarded to the model only when not ``None``.
        rho: forwarded to ``SparseTICA`` only.
        parallel: in the full-fit path, load files with a process pool
            (``subsample`` is not applied on this path) instead of serially.
        sparse: build ``SparseTICA`` instead of ``tICA``.
        traj_ext: accepted but not used; the extension is fixed to
            ``".dataset"``.
        normalize: in the full-fit path, apply the normalizer before fitting.
            The partial-fit path always normalizes.
        partial_fit: stream trajectories through ``partial_fit`` /
            ``partial_transform`` one at a time rather than fitting on all of
            them at once.
        subsample: row stride applied when loading serially in the full-fit
            path; when it is not 1 the projection reloads the full files.
        recompute_tica: recompute even if the projected data already exists.
        features: optional pre-loaded trajectories, indexed in the same order
            as the feature files.
    """
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    projected_data_filename = "%s/phi_psi_chi2_allprot_projected.h5" % model_dir
    fit_model_filename = "%s/phi_psi_chi2_allprot_tica_coords.h5" % model_dir
    normalizer = "%s/normalizer.h5" % features_directory
    n = compat_verboseload(normalizer)

    model_kwargs = {"n_components": n_components, "lag_time": lag_time}
    if shrinkage is not None:
        model_kwargs["shrinkage"] = shrinkage
    if sparse:
        model_kwargs["rho"] = rho
        tica_model = SparseTICA(**model_kwargs)
    else:
        tica_model = tICA(**model_kwargs)

    if os.path.exists(projected_data_filename) and not recompute_tica:
        print("already computed tICA model")
        return

    print("loading feature files")
    feature_files = get_trajectory_files(features_directory, ext=".dataset")

    if partial_fit:
        for i, feature_file in enumerate(feature_files):
            print("fitting tICA model to %s" % feature_file)
            if features is None:
                featurized_traj = load_file(feature_file)
            else:
                featurized_traj = features[i]
            tica_model.partial_fit(n.transform(featurized_traj))

        print("Finished computing tICA model. Now transforming.")
        transformed_data = []
        for i, feature_file in enumerate(feature_files):
            print("Transforming %s" % feature_file)
            if features is None:
                featurized_traj = load_file(feature_file)
            else:
                featurized_traj = features[i]
            # Normalize once and reuse the result; previously each trajectory
            # was normalized twice here and one copy discarded.
            normalized_featurized_traj = n.transform(featurized_traj)
            transformed_data.append(
                tica_model.partial_transform(normalized_featurized_traj))
        fit_model = tica_model
    else:
        if features is None:
            if not parallel:
                features = []
                for feature_file in feature_files:
                    print("loading %s" % feature_file)
                    features.append(load_file(feature_file)[::subsample, :])
            else:
                pool = mp.Pool(mp.cpu_count())
                try:
                    features = pool.map(load_file, feature_files)
                finally:
                    # Always reap the workers, even when loading fails.
                    pool.terminate()

            # If any trajectory disagrees with the first one on its second
            # dimension, every trajectory is transposed.
            # NOTE(review): this check is applied only to features loaded
            # here, not to caller-supplied ones -- confirm that is intended.
            transpose = any(
                np.shape(f)[1] != np.shape(features[0])[1] for f in features)
            if transpose:
                features = [np.transpose(f) for f in features]

            print(np.shape(features[0]))
            print((features[0][0][0:10]))
            print((np.shape(features)))

        if normalize:
            features = [n.transform(f) for f in features]

        print("fitting data to tICA model")
        fit_model = tica_model.fit(features)
        if subsample == 1:
            transformed_data = fit_model.transform(features)
        else:
            # The model was fitted on subsampled rows; project the full
            # trajectories by reloading them.
            transformed_data = [
                fit_model.transform(n.transform(load_file(f)))
                for f in feature_files
            ]
        print("transformed data with tICA model")

    print((fit_model.summarize()))
    verbosedump(fit_model, fit_model_filename)
    print("saved tICA model")
    verbosedump(transformed_data, projected_data_filename)
    print("saved data projected onto tICA coords")
def landmark_ktica_ticaTraj(tica_dir,
                            clusterer_dir,
                            ktica_dir,
                            clusters_map_file="",
                            landmarks_dir="",
                            nystroem_components=1000,
                            n_components=10,
                            lag_time=5,
                            nystroem_data_filename="",
                            fit_model_filename="",
                            projected_data_filename="",
                            landmark_subsample=1,
                            sparse=False,
                            wolf=True,
                            rho=0.01,
                            shrinkage=None):
    """Run landmark kernel tICA using cluster centers as Nystroem landmarks.

    Loads the input data from ``tica_dir`` and a clusterer from
    ``clusterer_dir``, maps the data through a ``Nystroem`` transform whose
    basis is the clusterer's ``cluster_centers_``, then fits a tICA (or
    SparseTICA) model to the mapped data and projects it.  An existing
    ``nystroem_data_filename`` is loaded instead of recomputed, and an existing
    ``projected_data_filename`` skips the fit.

    Args:
        tica_dir: path handed to ``verboseload`` for the input features.
        clusterer_dir: path handed to ``verboseload`` for the clusterer.
        ktica_dir: output directory, created when missing.
        clusters_map_file: accepted but not used by this function.
        landmarks_dir: accepted but not used by this function.
        nystroem_components: accepted but not used; the number of Nystroem
            components is the number of landmarks.
        n_components: forwarded to the tICA model.
        lag_time: forwarded to the tICA model.
        nystroem_data_filename: cache path for the Nystroem output.
        fit_model_filename: path the fitted model is dumped to.
        projected_data_filename: path the projected data is saved to.
        landmark_subsample: accepted but not used by this function.
        sparse: build ``SparseTICA`` (with ``rho``) instead of ``tICA``.
        wolf: accepted but not used by this function.
        rho: forwarded to ``SparseTICA`` only.
        shrinkage: forwarded to the model only when not ``None``.
    """
    # Imported locally so the function does not depend on a module-level
    # import being present.
    import subprocess

    if not os.path.exists(ktica_dir):
        os.makedirs(ktica_dir)

    model_kwargs = {"n_components": n_components, "lag_time": lag_time}
    if shrinkage is not None:
        model_kwargs["shrinkage"] = shrinkage
    if sparse:
        model_kwargs["rho"] = rho
        tica_model = SparseTICA(**model_kwargs)
    else:
        tica_model = tICA(**model_kwargs)

    if os.path.exists(nystroem_data_filename):
        nyx = load_dataset(nystroem_data_filename)
    else:
        clusterer = verboseload(clusterer_dir)
        features = verboseload(tica_dir)
        landmarks = clusterer.cluster_centers_

        print("here's what goes into the combined class:")
        print((np.shape(landmarks)))
        print((type(landmarks)))

        nys = Nystroem(n_components=np.shape(landmarks)[0], basis=landmarks)
        nyx = nys.fit_transform(features)

        # Drop the references early; only the Nystroem output is needed now.
        del features
        del landmarks

        try:
            save_dataset(nyx, nystroem_data_filename)
        except Exception:
            # Retry once after clearing whatever sits at the target path.
            # Passing an argument list avoids the shell, so paths containing
            # spaces or metacharacters are handled literally.
            subprocess.call(["rm", "-rf", nystroem_data_filename])
            save_dataset(nyx, nystroem_data_filename)

    print((np.shape(nyx)))
    print((dir(nyx)))

    if os.path.exists(projected_data_filename):
        print("Already performed landmark kernel tICA.")
        return

    fit_model = tica_model.fit(nyx)
    verbosedump(fit_model, fit_model_filename)
    transformed_data = fit_model.transform(nyx)
    del nyx
    try:
        save_dataset(transformed_data, projected_data_filename)
    except Exception:
        subprocess.call(["rm", "-rf", projected_data_filename])
        save_dataset(transformed_data, projected_data_filename)
def ktica(features,
          landmarks,
          projected_data_filename,
          nystroem_data_filename,
          fit_model_filename,
          sparse=False,
          shrinkage=0.05,
          wolf=True,
          rho=0.01,
          n_components=25,
          lag_time=5,
          refcoords_csv=None):
    """Fit or reuse a kernel tICA model and save the projected data.

    ``features`` is mapped through a ``Nystroem`` transform whose basis is
    ``landmarks`` (or the cached result at ``nystroem_data_filename`` is
    loaded).  If ``fit_model_filename`` does not exist a new model is fitted
    and dumped there; otherwise the stored model is loaded.  In both cases the
    Nystroem output is projected and written to ``projected_data_filename``.

    Args:
        features: input data handed to ``Nystroem.fit_transform``.
        landmarks: Nystroem basis; its first dimension sets the number of
            Nystroem components.
        projected_data_filename: path the projected data is saved to.
        nystroem_data_filename: cache path for the Nystroem output.
        fit_model_filename: path of the fitted model (written or read).
        sparse: build ``SparseTICA`` (with ``rho``) instead of ``tICA``.
        shrinkage: regularisation value; ``None`` omits it from the model.
        wolf: for the non-sparse model, pass ``shrinkage`` under the keyword
            ``shrinkage`` when true and under ``gamma`` when false.  Not
            consulted for the sparse model.
        rho: forwarded to ``SparseTICA`` only.
        n_components: forwarded to the tICA model.
        lag_time: forwarded to the tICA model.
        refcoords_csv: when not ``None``, the projected data is additionally
            written there with ``np.savetxt`` (comma-delimited).
    """
    # Imported locally so the function does not depend on a module-level
    # import being present.
    import subprocess

    model_kwargs = {"n_components": n_components, "lag_time": lag_time}
    if sparse:
        model_kwargs["rho"] = rho
        if shrinkage is not None:
            model_kwargs["shrinkage"] = shrinkage
        tica_model = SparseTICA(**model_kwargs)
    else:
        if shrinkage is not None:
            model_kwargs["shrinkage" if wolf else "gamma"] = shrinkage
        tica_model = tICA(**model_kwargs)

    if not os.path.exists(nystroem_data_filename):
        nys = Nystroem(n_components=np.shape(landmarks)[0], basis=landmarks)
        nyx = nys.fit_transform(features)
        print("Computed Nystroem.")

        # Drop the local references; only the Nystroem output is needed now.
        del features
        del landmarks

        try:
            save_dataset(nyx, nystroem_data_filename)
        except Exception:
            # Retry once after clearing whatever sits at the target path.
            # An argument list (no shell) keeps unusual path characters safe.
            subprocess.call(["rm", "-rf", nystroem_data_filename])
            save_dataset(nyx, nystroem_data_filename)
    else:
        nyx = load_dataset(nystroem_data_filename)
        print("Loaded Nystroem")

    if not os.path.exists(fit_model_filename):
        print("Fitting Kernel tICA model")
        fit_model = tica_model.fit(nyx)
        verbosedump(fit_model, fit_model_filename)
        transformed_data = fit_model.transform(nyx)
        del nyx
        try:
            save_dataset(transformed_data, projected_data_filename)
        except Exception:
            subprocess.call(["rm", "-rf", projected_data_filename])
            save_dataset(transformed_data, projected_data_filename)
    else:
        # A model already exists: reuse it and overwrite any stale projection.
        fit_model = verboseload(fit_model_filename)
        transformed_data = fit_model.transform(nyx)
        subprocess.call(["rm", "-rf", projected_data_filename])
        save_dataset(transformed_data, projected_data_filename)

    if refcoords_csv is not None:
        np.savetxt(refcoords_csv, transformed_data, delimiter=",")
    return
def fit_and_transform(features_directory, model_dir, stride=5, lag_time=10,
                      n_components=5, wolf=True, shrinkage=None, rho=0.05,
                      parallel=True, sparse=True, traj_ext=".h5"):
    """Fit a tICA model to all feature files and project them onto it.

    Feature files are looked up in ``features_directory`` with extension
    ``traj_ext``, falling back to ``".dataset"`` when none are found.  The
    fitted model and the projected data are written with ``verbosedump`` into
    ``model_dir``.  Nothing is computed when the projected-data file already
    exists.

    Args:
        features_directory: directory searched for feature files.
        model_dir: output directory, created when missing.
        stride: accepted but not used by this function.
        lag_time: forwarded to the tICA model.
        n_components: forwarded to the tICA model.
        wolf: accepted but not used by this function.
        shrinkage: forwarded to the model only when not ``None``.
        rho: forwarded to ``SparseTICA`` only.
        parallel: load the files with a process pool instead of serially.
        sparse: build ``SparseTICA`` instead of ``tICA``.
        traj_ext: preferred feature-file extension.
    """
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    projected_data_filename = "%s/phi_psi_chi2_allprot_projected.h5" % model_dir
    fit_model_filename = "%s/phi_psi_chi2_allprot_tica_coords.h5" % model_dir

    model_kwargs = {"n_components": n_components, "lag_time": lag_time}
    if shrinkage is not None:
        model_kwargs["shrinkage"] = shrinkage
    if sparse:
        model_kwargs["rho"] = rho
        tica_model = SparseTICA(**model_kwargs)
    else:
        tica_model = tICA(**model_kwargs)

    if os.path.exists(projected_data_filename):
        print("already computed tICA model")
        return

    print("loading feature files")
    feature_files = get_trajectory_files(features_directory, ext=traj_ext)
    if len(feature_files) == 0:
        feature_files = get_trajectory_files(features_directory, ext=".dataset")

    if not parallel:
        features = []
        for feature_file in feature_files:
            # print() with a single pre-formatted argument behaves the same on
            # Python 2 and 3; the bare print statement was a SyntaxError on 3.
            print("loading %s" % feature_file)
            features.append(load_features(feature_file))
    else:
        pool = mp.Pool(mp.cpu_count())
        try:
            features = pool.map(load_features, feature_files)
        finally:
            # Always reap the workers, even when loading fails.
            pool.terminate()

    # If any trajectory disagrees with the first one on its second dimension,
    # every trajectory is transposed.
    transpose = any(
        np.shape(f)[1] != np.shape(features[0])[1] for f in features)
    if transpose:
        features = [np.transpose(f) for f in features]

    print(np.shape(features[0]))
    print(features[0][0][0:10])
    print(np.shape(features))

    print("fitting data to tICA model")
    fit_model = tica_model.fit(features)
    print(fit_model.summarize())

    transformed_data = fit_model.transform(features)
    print("transformed data with tICA model")

    verbosedump(fit_model, fit_model_filename)
    print("saved tICA model")
    verbosedump(transformed_data, projected_data_filename)
    print("saved data projected onto tICA coords")