def fit_protein_tica(yaml_file, sparse=False):
    """Fit one tICA model over the featurized trajectories of every protein.

    Every ``mdl_params`` entry whose key starts with ``tica__`` is forwarded
    (prefix stripped) to the model constructor.  The model is then updated
    with ``partial_fit`` on each ``*.jl`` file found under
    ``yaml_file["feature_dir"]`` for each protein, and finally dumped to
    ``<mdl_dir>/tica_mdl.pkl``.

    Parameters
    ----------
    yaml_file : mapping
        Project configuration.  The keys ``mdl_dir``, ``mdl_params``,
        ``protein_list`` and ``feature_dir`` are read here.
    sparse : bool, optional
        If true, fit a ``SparseTICA`` model instead of a plain ``tICA``.

    Returns
    -------
    None
        The fitted model is written to disk rather than returned.
    """
    prefix = "tica__"
    mdl_dir = yaml_file["mdl_dir"]
    mdl_params = yaml_file["mdl_params"]

    # Slice the prefix off instead of split(): a key that happens to contain
    # the prefix a second time must keep everything after the first one.
    current_mdl_params = {
        key[len(prefix):]: value
        for key, value in mdl_params.items()
        if key.startswith(prefix)
    }

    if sparse:
        protein_tica_mdl = SparseTICA(**current_mdl_params)
    else:
        protein_tica_mdl = tICA(**current_mdl_params)

    for protein in yaml_file["protein_list"]:
        print("Fitting to protein %s" % protein)
        with enter_protein_data_dir(yaml_file, protein):
            # Paths are relative: the context manager is entered first.
            featurized_traj = sorted(
                glob.glob("./%s/*.jl" % yaml_file["feature_dir"]),
                key=keynat)
            for f in featurized_traj:
                featurized_path = verboseload(f)
                try:
                    protein_tica_mdl.partial_fit(featurized_path)
                except Exception as err:
                    # Best effort: one bad trajectory must not abort the
                    # whole fit, but it must not disappear silently either.
                    print("Warning: partial fit failed for %s (%r); skipping"
                          % (f, err))
            print("Done partial fitting to protein %s" % protein)

    # dumping the tica_mdl
    tica_mdl_path = os.path.join(mdl_dir, "tica_mdl.pkl")
    verbosedump(protein_tica_mdl, tica_mdl_path)
    return
def landmark_ktica_ticaTraj(tica_dir, clusterer_dir, ktica_dir,
                            clusters_map_file="", landmarks_dir="",
                            nystroem_components=1000, n_components=10,
                            lag_time=5, nystroem_data_filename="",
                            fit_model_filename="",
                            projected_data_filename="",
                            landmark_subsample=1, sparse=False, wolf=True,
                            rho=0.01, shrinkage=None):
    """Run landmark kernel tICA on previously computed tICA coordinates.

    The cluster centers of a saved clusterer are used as the Nystroem basis
    for the saved tICA data; a second (Sparse)tICA model is then fit to the
    Nystroem features and the projection is saved.  Each stage is skipped
    when its output file already exists.

    Parameters
    ----------
    tica_dir : str
        Path of the pickled tICA data passed to ``verboseload``.
    clusterer_dir : str
        Path of the pickled clusterer; its ``cluster_centers_`` are the
        landmarks.
    ktica_dir : str
        Output directory, created when missing.
    n_components, lag_time : int
        Forwarded to the tICA model constructor.
    nystroem_data_filename : str
        Cache file for the Nystroem-transformed data.
    fit_model_filename : str
        Where the fitted model is dumped.
    projected_data_filename : str
        Where the projected data is saved.
    sparse : bool
        Use ``SparseTICA`` (with ``rho``) instead of ``tICA``.
    rho : float
        Forwarded to ``SparseTICA`` only.
    shrinkage : float or None
        Forwarded to the model only when not ``None``.
    clusters_map_file, landmarks_dir, nystroem_components, \
landmark_subsample, wolf
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    None
    """
    import shutil

    def _save_overwriting(data, filename):
        """Save ``data`` to ``filename``, clearing a stale target on failure."""
        try:
            save_dataset(data, filename)
        except Exception:
            # Same recovery as the former ``rm -rf`` shell call, but without
            # a shell, so paths with spaces or metacharacters are handled.
            if os.path.isdir(filename) and not os.path.islink(filename):
                shutil.rmtree(filename, ignore_errors=True)
            elif os.path.lexists(filename):
                os.remove(filename)
            save_dataset(data, filename)

    if not os.path.exists(ktica_dir):
        os.makedirs(ktica_dir)

    model_kwargs = {"n_components": n_components, "lag_time": lag_time}
    if shrinkage is not None:
        model_kwargs["shrinkage"] = shrinkage
    if sparse:
        tica_model = SparseTICA(rho=rho, **model_kwargs)
    else:
        tica_model = tICA(**model_kwargs)

    if not os.path.exists(nystroem_data_filename):
        landmarks = verboseload(clusterer_dir).cluster_centers_
        features = verboseload(tica_dir)

        print("here's what goes into the combined class:")
        print(np.shape(landmarks))
        print(type(landmarks))
        nys = Nystroem(n_components=np.shape(landmarks)[0], basis=landmarks)
        nyx = nys.fit_transform(features)
        # Drop the inputs before writing the (large) result to disk.
        del features, landmarks
        _save_overwriting(nyx, nystroem_data_filename)
    else:
        nyx = load_dataset(nystroem_data_filename)

    print(np.shape(nyx))
    print(dir(nyx))

    if not os.path.exists(projected_data_filename):
        fit_model = tica_model.fit(nyx)
        verbosedump(fit_model, fit_model_filename)
        transformed_data = fit_model.transform(nyx)
        del nyx
        _save_overwriting(transformed_data, projected_data_filename)
    else:
        print("Already performed landmark kernel tICA.")
def ktica(features, landmarks, projected_data_filename,
          nystroem_data_filename, fit_model_filename, sparse=False,
          shrinkage=0.05, wolf=True, rho=0.01, n_components=25, lag_time=5):
    """Fit landmark kernel tICA: Nystroem features followed by (Sparse)tICA.

    Parameters
    ----------
    features
        Data passed to ``Nystroem.fit_transform``.
    landmarks
        Nystroem basis; its first dimension sets the number of components.
    projected_data_filename : str
        Where the projected data is saved.  If it already exists the tICA
        stage is skipped.
    nystroem_data_filename : str
        Cache file for the Nystroem-transformed data; loaded when present.
    fit_model_filename : str
        Where the fitted model is dumped.
    sparse : bool
        Use ``SparseTICA`` (with ``rho``) instead of ``tICA``.
    shrinkage : float or None
        Regularisation strength; omitted from the constructor when ``None``.
    wolf : bool
        For the dense model only: pass ``shrinkage`` under the keyword
        ``shrinkage`` when true, under ``gamma`` otherwise.
    rho : float
        Forwarded to ``SparseTICA`` only.
    n_components, lag_time : int
        Forwarded to the model constructor.  The body always used these two
        names but they were never defined in the function; they are now
        keyword parameters with defaults.

    Returns
    -------
    None
    """
    import shutil

    def _save_overwriting(data, filename):
        """Save ``data`` to ``filename``, clearing a stale target on failure."""
        try:
            save_dataset(data, filename)
        except Exception:
            # Same recovery as the former ``rm -rf`` shell call, but without
            # a shell, so paths with spaces or metacharacters are handled.
            if os.path.isdir(filename) and not os.path.islink(filename):
                shutil.rmtree(filename, ignore_errors=True)
            elif os.path.lexists(filename):
                os.remove(filename)
            save_dataset(data, filename)

    model_kwargs = {"n_components": n_components, "lag_time": lag_time}
    if sparse:
        model_kwargs["rho"] = rho
        if shrinkage is not None:
            model_kwargs["shrinkage"] = shrinkage
        tica_model = SparseTICA(**model_kwargs)
    else:
        if shrinkage is not None:
            model_kwargs["shrinkage" if wolf else "gamma"] = shrinkage
        tica_model = tICA(**model_kwargs)

    if not os.path.exists(nystroem_data_filename):
        nys = Nystroem(n_components=np.shape(landmarks)[0], basis=landmarks)
        nyx = nys.fit_transform(features)
        print("Computed Nystroem.")
        # Drop the local references before writing the result to disk.
        del features, landmarks
        _save_overwriting(nyx, nystroem_data_filename)
    else:
        nyx = load_dataset(nystroem_data_filename)
        print("Loaded Nystroem")

    if not os.path.exists(projected_data_filename):
        fit_model = tica_model.fit(nyx)
        verbosedump(fit_model, fit_model_filename)
        transformed_data = fit_model.transform(nyx)
        del nyx
        _save_overwriting(transformed_data, projected_data_filename)
    else:
        print("Already performed landmark kernel tICA.")
# Example no. 4
# 0
def test_doublewell():
    """Check the leading SparseTICA component on the ``build_dataset`` data.

    A one-component ``tICA`` and a one-component ``SparseTICA`` are both fit
    to the same data.  The sparse component must be (almost) zero in its
    last nine entries and about 0.58 (one decimal) in its first entry.
    """
    dataset = build_dataset()

    # The dense fit is kept so that a failure there still fails the test.
    dense_model = tICA(n_components=1).fit(dataset)
    dense_component = dense_model.components_[0]

    sparse_model = SparseTICA(n_components=1, verbose=False).fit(dataset)
    sparse_component = sparse_model.components_[0]
    leading, trailing = sparse_component[0], sparse_component[1:]

    np.testing.assert_array_almost_equal(trailing, np.zeros(9))
    np.testing.assert_almost_equal(leading, 0.58, decimal=1)
# Example no. 5
# 0
def test_1():
    """Check that SparseTICA yields a one-hot leading component.

    Fits a one-component ``tICA`` and a verbose one-component ``SparseTICA``
    to the data returned by ``build_dataset()``, prints both leading
    components, and asserts that the sparse one is close to the first unit
    vector of length 10.
    """
    dataset = build_dataset()

    dense_component = tICA(n_components=1).fit(dataset).components_[0]
    print('tICA\n', dense_component)

    sparse_model = SparseTICA(n_components=1, verbose=True)
    sparse_component = sparse_model.fit(dataset).components_[0]
    print('Sparse tICA\n', sparse_component)

    expected = [1] + [0] * 9
    assert np.allclose(sparse_component, expected)
def fit_protein_tica(yaml_file, sparse=False, ksparse=None):
    """Fit one tICA-family model over the trajectories of every protein.

    Every ``mdl_params`` entry whose key starts with ``tica__`` is forwarded
    (prefix stripped) to the model constructor.  For each protein the model
    is updated with ``partial_fit`` on every ``*.jl`` file in
    ``./normalized_features`` when that directory exists, otherwise on the
    files in ``yaml_file["feature_dir"]`` (a warning is printed).  The model
    is finally dumped to ``<mdl_dir>/tica_mdl.pkl``.

    Parameters
    ----------
    yaml_file : mapping
        Project configuration.  The keys ``mdl_dir``, ``mdl_params``,
        ``protein_list`` and ``feature_dir`` are read here.
    sparse : bool, optional
        If true, fit a ``SparseTICA`` model; takes precedence over
        ``ksparse``.
    ksparse : int or None, optional
        If an integer (and ``sparse`` is false), fit a ``KSparseTICA`` model
        with ``k=ksparse``.  Any other value selects plain ``tICA``.

    Returns
    -------
    None
        The fitted model is written to disk rather than returned.
    """
    import numbers

    prefix = "tica__"
    mdl_dir = yaml_file["mdl_dir"]
    mdl_params = yaml_file["mdl_params"]

    # Slice the prefix off instead of split(): a key that happens to contain
    # the prefix a second time must keep everything after the first one.
    current_mdl_params = {
        key[len(prefix):]: value
        for key, value in mdl_params.items()
        if key.startswith(prefix)
    }

    # numbers.Integral also accepts NumPy integers, which an exact
    # ``type(...) == int`` test silently routed to plain tICA.  bool is
    # excluded so True/False keep selecting plain tICA as before.
    ksparse_is_int = (isinstance(ksparse, numbers.Integral)
                      and not isinstance(ksparse, bool))

    if sparse:
        protein_tica_mdl = SparseTICA(**current_mdl_params)
    elif ksparse_is_int:
        current_mdl_params["k"] = int(ksparse)
        protein_tica_mdl = KSparseTICA(**current_mdl_params)
    else:
        protein_tica_mdl = tICA(**current_mdl_params)

    for protein in yaml_file["protein_list"]:
        print("Fitting to protein %s" % protein)
        with enter_protein_data_dir(yaml_file, protein):
            # Paths are relative: the context manager is entered first.
            if os.path.exists("./normalized_features"):
                pattern = "./normalized_features/*.jl"
            else:
                print('Warning: features have not been scaled')
                pattern = "./%s/*.jl" % yaml_file["feature_dir"]
            featurized_traj = sorted(glob.glob(pattern), key=keynat)

            for f in featurized_traj:
                featurized_path = verboseload(f)
                try:
                    protein_tica_mdl.partial_fit(featurized_path)
                except Exception as err:
                    # Best effort: keep going, but say which file failed
                    # and why instead of a bare 'Error'.
                    print('Error partial fitting %s: %r' % (f, err))
            print("Done partial fitting to protein %s" % protein)

    # dumping the tica_mdl
    tica_mdl_path = os.path.join(mdl_dir, "tica_mdl.pkl")
    verbosedump(protein_tica_mdl, tica_mdl_path)
    return
# Example no. 7
# 0
def fit_and_transform(features_directory,
                      model_dir,
                      stride=5,
                      lag_time=10,
                      n_components=5,
                      wolf=True,
                      shrinkage=None,
                      rho=0.05,
                      parallel=True,
                      sparse=True,
                      traj_ext=".h5",
                      normalize=True,
                      partial_fit=True,
                      subsample=1,
                      recompute_tica=False,
                      features=None):
    """Fit a (Sparse)tICA model to featurized trajectories and project them.

    The fitted model and the projected data are dumped to
    ``<model_dir>/phi_psi_chi2_allprot_tica_coords.h5`` and
    ``<model_dir>/phi_psi_chi2_allprot_projected.h5``.  Nothing is computed
    when the projected file already exists, unless ``recompute_tica`` is set.

    Parameters
    ----------
    features_directory : str
        Directory searched for ``.dataset`` feature files; it must also hold
        ``normalizer.h5``, which is always loaded.
    model_dir : str
        Output directory, created when missing.
    lag_time, n_components : int
        Forwarded to the model constructor.
    shrinkage : float or None
        Forwarded to the model only when not ``None``.
    rho : float
        Forwarded to ``SparseTICA`` only.
    parallel : bool
        Batch path only: load feature files with a process pool.
    sparse : bool
        Use ``SparseTICA`` instead of ``tICA``.
    normalize : bool
        Batch path only: apply the normalizer before fitting and
        transforming.  The partial-fit path always normalizes.
    partial_fit : bool
        Stream trajectories through ``partial_fit`` / ``partial_transform``
        instead of fitting on the whole list at once.
    subsample : int
        Batch path, serial loading only: stride applied to the frames of
        each loaded trajectory.  When it is not 1 the files are re-loaded
        in full for the final transform.
    recompute_tica : bool
        Recompute even if the projected file exists.
    features : list or None
        Pre-loaded trajectories, index-aligned with the feature files; when
        ``None`` they are loaded from disk.
    stride, wolf, traj_ext
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    None
    """
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    projected_data_filename = "%s/phi_psi_chi2_allprot_projected.h5" % model_dir
    fit_model_filename = "%s/phi_psi_chi2_allprot_tica_coords.h5" % model_dir
    normalizer = "%s/normalizer.h5" % features_directory
    n = compat_verboseload(normalizer)

    model_kwargs = {"n_components": n_components, "lag_time": lag_time}
    if shrinkage is not None:
        model_kwargs["shrinkage"] = shrinkage
    if sparse:
        tica_model = SparseTICA(rho=rho, **model_kwargs)
    else:
        tica_model = tICA(**model_kwargs)

    if os.path.exists(projected_data_filename) and not recompute_tica:
        print("already computed tICA model")
        return

    print("loading feature files")
    feature_files = get_trajectory_files(features_directory, ext=".dataset")

    if partial_fit:
        def _trajectory(index, feature_file):
            """Return the trajectory for this file: supplied or loaded."""
            if features is None:
                return load_file(feature_file)
            return features[index]

        for i, feature_file in enumerate(feature_files):
            print("fitting tICA model to %s" % feature_file)
            tica_model.partial_fit(
                n.transform(_trajectory(i, feature_file)))

        print("Finished computing tICA model. Now transforming.")

        transformed_data = []
        for i, feature_file in enumerate(feature_files):
            print("Transforming %s" % feature_file)
            # Normalize once; the result used to be computed twice here.
            normalized = n.transform(_trajectory(i, feature_file))
            transformed_data.append(tica_model.partial_transform(normalized))

        fit_model = tica_model

    else:
        if features is None:
            if not parallel:
                features = []
                for feature_file in feature_files:
                    print("loading %s" % feature_file)
                    features.append(
                        load_file(feature_file)[::subsample, :])
            else:
                pool = mp.Pool(mp.cpu_count())
                try:
                    features = pool.map(load_file, feature_files)
                finally:
                    # Release the workers even if loading fails.
                    pool.terminate()

        # If the second dimension is not the same for every trajectory,
        # transpose all of them.
        reference_width = np.shape(features[0])[1] if len(features) else None
        transpose = any(
            np.shape(f)[1] != reference_width for f in features)
        if transpose:
            # Build a new list so a caller-supplied one is not mutated.
            features = [np.transpose(f) for f in features]
        print(np.shape(features[0]))
        print(features[0][0][0:10])
        # np.shape() of a list of unequal-length trajectories raises on
        # recent NumPy; report the number of trajectories instead.
        print((len(features),))

        if normalize:
            features = [n.transform(f) for f in features]

        print("fitting data to tICA model")
        fit_model = tica_model.fit(features)

        if subsample == 1:
            transformed_data = fit_model.transform(features)
        else:
            # The model saw subsampled frames; project every frame.
            # NOTE(review): transform() receives a single trajectory here
            # while the partial-fit path uses partial_transform() -
            # confirm which call is intended.
            transformed_data = []
            for f in feature_files:
                full_traj = load_file(f)
                # Honour ``normalize`` so the projected data is prepared
                # the same way as the data the model was fit on.
                if normalize:
                    full_traj = n.transform(full_traj)
                transformed_data.append(fit_model.transform(full_traj))
        print("transformed data with tICA model")

    print(fit_model.summarize())

    verbosedump(fit_model, fit_model_filename)
    print("saved tICA model")
    verbosedump(transformed_data, projected_data_filename)
    print("saved data projected onto tICA coords")
# Example no. 8
# 0
def landmark_ktica_ticaTraj(tica_dir,
                            clusterer_dir,
                            ktica_dir,
                            clusters_map_file="",
                            landmarks_dir="",
                            nystroem_components=1000,
                            n_components=10,
                            lag_time=5,
                            nystroem_data_filename="",
                            fit_model_filename="",
                            projected_data_filename="",
                            landmark_subsample=1,
                            sparse=False,
                            wolf=True,
                            rho=0.01,
                            shrinkage=None):
    """Run landmark kernel tICA on previously computed tICA coordinates.

    The cluster centers of a saved clusterer are used as the Nystroem basis
    for the saved tICA data; a second (Sparse)tICA model is then fit to the
    Nystroem features and the projection is saved.  Each stage is skipped
    when its output file already exists.

    Parameters
    ----------
    tica_dir : str
        Path of the pickled tICA data passed to ``verboseload``.
    clusterer_dir : str
        Path of the pickled clusterer; its ``cluster_centers_`` are the
        landmarks.
    ktica_dir : str
        Output directory, created when missing.
    n_components, lag_time : int
        Forwarded to the tICA model constructor.
    nystroem_data_filename : str
        Cache file for the Nystroem-transformed data.
    fit_model_filename : str
        Where the fitted model is dumped.
    projected_data_filename : str
        Where the projected data is saved.
    sparse : bool
        Use ``SparseTICA`` (with ``rho``) instead of ``tICA``.
    rho : float
        Forwarded to ``SparseTICA`` only.
    shrinkage : float or None
        Forwarded to the model only when not ``None``.
    clusters_map_file, landmarks_dir, nystroem_components, \
landmark_subsample, wolf
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    None
    """
    import shutil

    def _save_overwriting(data, filename):
        """Save ``data`` to ``filename``, clearing a stale target on failure."""
        try:
            save_dataset(data, filename)
        except Exception:
            # Same recovery as the former ``rm -rf`` shell call, but without
            # a shell, so paths with spaces or metacharacters are handled.
            if os.path.isdir(filename) and not os.path.islink(filename):
                shutil.rmtree(filename, ignore_errors=True)
            elif os.path.lexists(filename):
                os.remove(filename)
            save_dataset(data, filename)

    if not os.path.exists(ktica_dir):
        os.makedirs(ktica_dir)

    model_kwargs = {"n_components": n_components, "lag_time": lag_time}
    if shrinkage is not None:
        model_kwargs["shrinkage"] = shrinkage
    if sparse:
        tica_model = SparseTICA(rho=rho, **model_kwargs)
    else:
        tica_model = tICA(**model_kwargs)

    if not os.path.exists(nystroem_data_filename):
        landmarks = verboseload(clusterer_dir).cluster_centers_
        features = verboseload(tica_dir)

        print("here's what goes into the combined class:")
        print(np.shape(landmarks))
        print(type(landmarks))
        nys = Nystroem(n_components=np.shape(landmarks)[0], basis=landmarks)
        nyx = nys.fit_transform(features)
        # Drop the inputs before writing the (large) result to disk.
        del features, landmarks
        _save_overwriting(nyx, nystroem_data_filename)
    else:
        nyx = load_dataset(nystroem_data_filename)

    print(np.shape(nyx))
    print(dir(nyx))

    if not os.path.exists(projected_data_filename):
        fit_model = tica_model.fit(nyx)
        verbosedump(fit_model, fit_model_filename)
        transformed_data = fit_model.transform(nyx)
        del nyx
        _save_overwriting(transformed_data, projected_data_filename)
    else:
        print("Already performed landmark kernel tICA.")
# Example no. 9
# 0
def ktica(features,
          landmarks,
          projected_data_filename,
          nystroem_data_filename,
          fit_model_filename,
          sparse=False,
          shrinkage=0.05,
          wolf=True,
          rho=0.01,
          n_components=25,
          lag_time=5,
          refcoords_csv=None):
    """Fit or re-apply landmark kernel tICA (Nystroem + (Sparse)tICA).

    If ``fit_model_filename`` does not exist, a model is fit to the Nystroem
    features, dumped, and the projection is saved.  If it exists, the saved
    model is loaded and only used to transform the Nystroem features; the
    projection file is replaced and, when requested, also written as CSV.

    Parameters
    ----------
    features
        Data passed to ``Nystroem.fit_transform``.
    landmarks
        Nystroem basis; its first dimension sets the number of components.
    projected_data_filename : str
        Where the projected data is saved.
    nystroem_data_filename : str
        Cache file for the Nystroem-transformed data; loaded when present.
    fit_model_filename : str
        Where the fitted model is dumped or loaded from.
    sparse : bool
        Use ``SparseTICA`` (with ``rho``) instead of ``tICA``.
    shrinkage : float or None
        Regularisation strength; omitted from the constructor when ``None``.
    wolf : bool
        For the dense model only: pass ``shrinkage`` under the keyword
        ``shrinkage`` when true, under ``gamma`` otherwise.
    rho : float
        Forwarded to ``SparseTICA`` only.
    n_components, lag_time : int
        Forwarded to the model constructor.
    refcoords_csv : str or None
        CSV path for the projection; only written when an existing model
        is re-applied.

    Returns
    -------
    None
    """
    import shutil

    def _remove_path(path):
        """Delete a file or directory tree without going through a shell."""
        if os.path.isdir(path) and not os.path.islink(path):
            shutil.rmtree(path, ignore_errors=True)
        elif os.path.lexists(path):
            os.remove(path)

    def _save_overwriting(data, filename):
        """Save ``data`` to ``filename``, clearing a stale target on failure."""
        try:
            save_dataset(data, filename)
        except Exception:
            _remove_path(filename)
            save_dataset(data, filename)

    model_kwargs = {"n_components": n_components, "lag_time": lag_time}
    if sparse:
        model_kwargs["rho"] = rho
        if shrinkage is not None:
            model_kwargs["shrinkage"] = shrinkage
        tica_model = SparseTICA(**model_kwargs)
    else:
        if shrinkage is not None:
            model_kwargs["shrinkage" if wolf else "gamma"] = shrinkage
        tica_model = tICA(**model_kwargs)

    if not os.path.exists(nystroem_data_filename):
        nys = Nystroem(n_components=np.shape(landmarks)[0], basis=landmarks)
        nyx = nys.fit_transform(features)
        print("Computed Nystroem.")
        # Drop the local references before writing the result to disk.
        del features, landmarks
        _save_overwriting(nyx, nystroem_data_filename)
    else:
        nyx = load_dataset(nystroem_data_filename)
        print("Loaded Nystroem")

    if not os.path.exists(fit_model_filename):
        print("Fitting Kernel tICA model")
        fit_model = tica_model.fit(nyx)
        verbosedump(fit_model, fit_model_filename)
        transformed_data = fit_model.transform(nyx)
        del nyx
        _save_overwriting(transformed_data, projected_data_filename)
    else:
        fit_model = verboseload(fit_model_filename)
        transformed_data = fit_model.transform(nyx)
        # Always replace any previous projection in this branch.
        _remove_path(projected_data_filename)
        save_dataset(transformed_data, projected_data_filename)
        if refcoords_csv is not None:
            np.savetxt(refcoords_csv, transformed_data, delimiter=",")
    return
# Example no. 10
# 0
def fit_and_transform(features_directory, model_dir, stride=5, lag_time=10,
                      n_components=5, wolf=True, shrinkage=None, rho=0.05,
                      parallel=True, sparse=True, traj_ext=".h5"):
    """Fit a (Sparse)tICA model to featurized trajectories and project them.

    The fitted model and the projected data are dumped to
    ``<model_dir>/phi_psi_chi2_allprot_tica_coords.h5`` and
    ``<model_dir>/phi_psi_chi2_allprot_projected.h5``.  Nothing is computed
    when the projected file already exists.

    Parameters
    ----------
    features_directory : str
        Directory searched for feature files.
    model_dir : str
        Output directory, created when missing.
    lag_time, n_components : int
        Forwarded to the model constructor.
    shrinkage : float or None
        Forwarded to the model only when not ``None``.
    rho : float
        Forwarded to ``SparseTICA`` only.
    parallel : bool
        Load feature files with a process pool instead of one at a time.
    sparse : bool
        Use ``SparseTICA`` instead of ``tICA``.
    traj_ext : str
        Extension of the feature files; ``.dataset`` is tried when no file
        with this extension is found.
    stride, wolf
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    None
    """
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    projected_data_filename = "%s/phi_psi_chi2_allprot_projected.h5" % model_dir
    fit_model_filename = "%s/phi_psi_chi2_allprot_tica_coords.h5" % model_dir

    model_kwargs = {"n_components": n_components, "lag_time": lag_time}
    if shrinkage is not None:
        model_kwargs["shrinkage"] = shrinkage
    if sparse:
        tica_model = SparseTICA(rho=rho, **model_kwargs)
    else:
        tica_model = tICA(**model_kwargs)

    if os.path.exists(projected_data_filename):
        print("already computed tICA model")
        return

    print("loading feature files")
    feature_files = get_trajectory_files(features_directory, ext=traj_ext)
    if len(feature_files) == 0:
        feature_files = get_trajectory_files(features_directory,
                                             ext=".dataset")

    if not parallel:
        features = []
        for feature_file in feature_files:
            # print() call form: the statement form is a syntax error on
            # Python 3 and the rest of the file already uses the function.
            print("loading %s" % feature_file)
            features.append(load_features(feature_file))
    else:
        pool = mp.Pool(mp.cpu_count())
        try:
            features = pool.map(load_features, feature_files)
        finally:
            # Release the workers even if loading fails.
            pool.terminate()

    # If the second dimension is not the same for every trajectory,
    # transpose all of them.
    reference_width = np.shape(features[0])[1] if len(features) else None
    transpose = any(np.shape(f)[1] != reference_width for f in features)
    if transpose:
        features = [np.transpose(f) for f in features]
    print(np.shape(features[0]))
    print(features[0][0][0:10])
    # np.shape() of a list of unequal-length trajectories raises on recent
    # NumPy; report the number of trajectories instead.
    print((len(features),))

    print("fitting data to tICA model")
    fit_model = tica_model.fit(features)
    print(fit_model.summarize())
    transformed_data = fit_model.transform(features)
    print("transformed data with tICA model")
    verbosedump(fit_model, fit_model_filename)
    print("saved tICA model")
    verbosedump(transformed_data, projected_data_filename)
    print("saved data projected onto tICA coords")