import sys

import sklearn.cluster
import sklearn.decomposition
import sklearn.feature_extraction.text
import sklearn.pipeline
import sklearn.preprocessing


def train(wordlists, lsa=None):
    anchor_tags = ['pizza', 'auto', 'taxi', 'cinema', 'coffee', 'dinner']
    anchors = [['pizza'], ['auto', 'car', 'repair'], ['ride'], ['movie'],
               ['coffee'], ['dinner', 'restaurant']]
    nanchors = len(anchors)

    # Build the vectorizer pipeline: TF-IDF over pre-tokenized word lists,
    # optionally followed by LSA (truncated SVD) and length normalization.
    # `identity` and `make_dense` are helpers defined elsewhere; see the
    # sketch below for plausible versions.
    steps = [sklearn.feature_extraction.text.TfidfVectorizer(analyzer=identity)]
    if lsa:
        steps.append(sklearn.decomposition.TruncatedSVD(lsa))
        steps.append(sklearn.preprocessing.Normalizer(copy=False))
    pipeline = sklearn.pipeline.make_pipeline(*steps)

    # Vectorize the training documents together with the anchor documents,
    # then run k-means seeded at the anchor vectors.
    x = pipeline.fit_transform(wordlists + anchors)
    kmeans = sklearn.cluster.KMeans(n_clusters=nanchors,
                                    init=make_dense(x[-nanchors:]),
                                    n_init=1).fit(x)

    # Each anchor must land in its own cluster, otherwise the
    # cluster-to-tag mapping is ambiguous.
    anchor_labels = kmeans.labels_[-nanchors:]
    if len(set(anchor_labels)) != nanchors:
        print("Anchors don't map to separate classes:",
              list(zip(anchors, anchor_labels)), file=sys.stderr)
        sys.exit(1)

    # Map cluster indices to tags via the anchors' cluster labels.
    tagmap = ['<' + t + '>' for l, t in sorted(zip(anchor_labels, anchor_tags))]
    preds = [tagmap[label] for label in kmeans.labels_[:-nanchors]]
    model = {'tagmap': tagmap, 'pipeline': pipeline, 'kmeans': kmeans}
    return model, preds
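# A minimal sketch of the two helpers the function assumes, plus a usage
# example. These definitions are assumptions for illustration, not the
# original module's code: `identity` lets TfidfVectorizer consume
# pre-tokenized lists unchanged, and `make_dense` turns the (possibly
# sparse) anchor rows into the dense array KMeans expects for `init`.
import scipy.sparse


def identity(tokens):
    # Documents are already tokenized; pass them through unchanged.
    return tokens


def make_dense(x):
    # KMeans `init` must be a dense ndarray; TF-IDF output may be sparse.
    return x.toarray() if scipy.sparse.issparse(x) else x


# Hypothetical usage: tag two tokenized messages.
model, preds = train([['cheap', 'pizza', 'delivery'],
                      ['car', 'repair', 'shop']])
print(preds)  # e.g. ['<pizza>', '<auto>']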
import numpy as np
import sklearn.pipeline
from mdtraj.testing import eq
from mixtape import lumping, markovstatemodel


def test_pcca_1():
    # Build a simple four-state dataset with two obvious macrostate basins:
    # states 0 and 1 interconvert quickly, and so do states 2 and 3.
    n_frames = 10000
    chunk = np.zeros(n_frames, 'int')
    # Random noise states within each basin.
    rnd = lambda: np.random.randint(0, 2, n_frames)
    assignments = [np.hstack((chunk + rnd(), chunk + 2 + rnd())),
                   np.hstack((chunk + 2 + rnd(), chunk + rnd()))]

    pcca = lumping.PCCA(2)
    macro_msm = markovstatemodel.MarkovStateModel()
    pipeline = sklearn.pipeline.Pipeline([("pcca", pcca),
                                          ("macro_msm", macro_msm)])
    macro_assignments = pipeline.fit_transform(assignments)

    # PCCA's macrostate numbering is arbitrary, so relabel the output
    # assignments to start with zero at the first position.
    i0 = macro_assignments[0][0]
    if i0 == 1:
        for m in macro_assignments:
            m *= -1
            m += 1

    eq(macro_assignments[0], np.hstack((chunk, chunk + 1)))
    eq(macro_assignments[1], np.hstack((chunk + 1, chunk)))
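# A quick aside on the relabeling trick above: for binary labels, the
# in-place pair (m *= -1; m += 1) computes 1 - m, swapping 0 <-> 1 without
# allocating a new array. A standalone check:
import numpy as np

m = np.array([0, 1, 1, 0])
m *= -1
m += 1
assert (m == np.array([1, 0, 0, 1])).all()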
import mdtraj as md
import numpy as np
import sklearn.pipeline, sklearn.externals.joblib
import mixtape.featurizer, mixtape.tica, mixtape.cluster, mixtape.markovstatemodel, mixtape.datasets, mixtape.subset_featurizer, mixtape.feature_selection
import mixtape.utils

# Alanine dipeptide trajectories, split into interleaved train/test halves.
trajectories = mixtape.datasets.alanine_dipeptide.fetch_alanine_dipeptide()["trajectories"]
train = trajectories[0::2]
test = trajectories[1::2]

n_timescales = 4
n_clusters = 100

# Cluster with k-centers under the RMSD metric, then fit an MSM on the
# resulting state assignments.
clusterer = mixtape.cluster.KCenters(n_clusters=n_clusters, metric=md.rmsd)
msm = mixtape.markovstatemodel.MarkovStateModel(n_timescales=n_timescales)
pipeline = sklearn.pipeline.Pipeline([("clusterer", clusterer), ("msm", msm)])
assignments = pipeline.fit_transform(train)

print(msm.timescales_)
print(pipeline.score(train), pipeline.score(test))
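# Optional follow-up: persist the fitted pipeline so it can be reloaded
# later, using the same joblib interface the next snippet uses to load a
# featurizer. The filename here is an arbitrary choice for illustration.
sklearn.externals.joblib.dump(pipeline, "./msm-pipeline.job")
reloaded = sklearn.externals.joblib.load("./msm-pipeline.job")
print(reloaded.score(test))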
import numpy as np
import sklearn.pipeline, sklearn.externals.joblib
import mixtape.tica, mixtape.utils, mixtape.ghmm
from matplotlib.pyplot import figure, hexbin, errorbar

stride = 1
lag_time = 1

# `load_trajectories` and `n_choose` are defined elsewhere in this project
# (see the sketch below for one plausible version of the loader).
trj0, trajectories, filenames = load_trajectories(stride=stride)
featurizer = sklearn.externals.joblib.load("./featurizer-%d.job" % n_choose)

train = trajectories[0::2]
test = trajectories[1::2]

# Featurize, project onto the leading tICA components, and subsample at the
# lag time.
n_components = 3
tica = mixtape.tica.tICA(n_components=n_components, lag_time=lag_time)
subsampler = mixtape.utils.Subsampler(lag_time=lag_time)
pipeline = sklearn.pipeline.Pipeline([("features", featurizer),
                                      ("tica", tica),
                                      ("subsampler", subsampler)])
X_all = pipeline.fit_transform(trajectories)
q = np.concatenate(X_all)

# Fit a 3-state Gaussian HMM in the tICA space and score it.
n_states = 3
model = mixtape.ghmm.GaussianFusionHMM(n_states, n_components,
                                       fusion_prior=0., init_algo='GMM')
model.fit(X_all)
print(model.score(X_all))

# Plot the first two tICA coordinates with the HMM means and one-sigma bars.
for i, j in [(0, 1)]:
    figure()
    hexbin(q[:, i], q[:, j], bins='log')
    errorbar(model.means_[:, i], model.means_[:, j],
             xerr=model.vars_[:, i] ** 0.5, yerr=model.vars_[:, j] ** 0.5,
             fmt='kx', linewidth=4)
figure(1)
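# A minimal sketch of what `load_trajectories` might look like; this is an
# assumption for illustration, not the original helper. It loads every
# trajectory file matching a hypothetical glob pattern with mdtraj at the
# requested stride.
import glob
import mdtraj as md


def load_trajectories(stride=1, pattern="./trajectories/*.h5"):
    filenames = sorted(glob.glob(pattern))
    trajectories = [md.load(f, stride=stride) for f in filenames]
    # The first trajectory doubles as a reference frame.
    return trajectories[0], trajectories, filenames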
import lzma
import pickle

import numpy as np
import sklearn.compose
import sklearn.pipeline
import sklearn.preprocessing

# `Dataset` and the `args` namespace come from the surrounding script
# template; a hypothetical command-line harness is sketched below.


def main(args):
    if args.predict is None:
        # We are training a model.
        np.random.seed(args.seed)
        # Create a random generator with the given seed.
        generator = np.random.RandomState(args.seed)

        train = Dataset()
        data = train.data
        target = train.target

        # Feature preprocessing: columns whose values are all integers are
        # treated as categorical (one-hot encoded), the rest as real-valued
        # (standardized).
        bool_train = np.all(data.astype(int) == data, axis=0)
        int_train = [i for i, b in enumerate(bool_train) if b]
        real_train = [i for i, b in enumerate(bool_train) if not b]

        # Give each transformer a distinct name ("tr", "tra", ...).
        name = "tr"
        transformers = []
        for arr, encoder in zip(
                [int_train, real_train],
                [sklearn.preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore"),
                 sklearn.preprocessing.StandardScaler()]):
            if len(arr) > 0:
                transformers.append((name, encoder, arr))
            name += 'a'
        ct = sklearn.compose.ColumnTransformer(transformers=transformers)
        poly = sklearn.preprocessing.PolynomialFeatures(2, include_bias=False)
        pipeline = sklearn.pipeline.Pipeline([('ct', ct), ('pl', poly)])
        data = pipeline.fit_transform(data)

        # Append a column of ones as the bias feature.
        data = np.c_[data, np.ones(data.shape[0])]

        # Train linear regression with minibatch SGD from random initial weights.
        weights = generator.uniform(size=data.shape[1])
        for epoch in range(args.epochs):
            permutation = generator.permutation(data.shape[0])
            i = 0
            n_batches = data.shape[0] // args.batch_size
            for batch in range(n_batches):
                gradient_sum = np.zeros(shape=weights.shape)
                for sample in range(args.batch_size):
                    index = permutation[i + sample]
                    # data[index] is 1-D, so this is just a dot product.
                    prediction = data[index] @ weights
                    gradient_sum += (prediction - target[index]) * data[index]
                average_gradient = gradient_sum / args.batch_size
                # SGD update.
                weights = weights - args.learning_rate * average_gradient
                i = i + args.batch_size
        model = weights

        # Serialize the feature pipeline.
        with lzma.open(args.feature_path, "wb") as feature_file:
            pickle.dump(pipeline, feature_file)

        # Serialize the model.
        with lzma.open(args.model_path, "wb") as model_file:
            pickle.dump(model, model_file)

    else:
        # Use the model and return test set predictions, as either a Python
        # list or a NumPy array.
        test = Dataset(args.predict)

        with lzma.open(args.model_path, "rb") as model_file:
            model = pickle.load(model_file)
        with lzma.open(args.feature_path, "rb") as feature_file:
            pipeline = pickle.load(feature_file)

        # Apply the same feature transformation and bias column as in training.
        test_d = pipeline.transform(test.data)
        test_d = np.c_[test_d, np.ones(test_d.shape[0])]
        predictions = test_d @ model
        return predictions
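# A hypothetical command-line harness for `main`, showing the fields the
# function reads from `args`. The flag names and defaults here are
# assumptions for illustration; the original template may differ.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--predict", default=None, type=str, help="Dataset to predict on (omit to train)")
parser.add_argument("--seed", default=42, type=int, help="Random seed")
parser.add_argument("--epochs", default=10, type=int, help="Number of SGD epochs")
parser.add_argument("--batch_size", default=10, type=int, help="Minibatch size")
parser.add_argument("--learning_rate", default=0.01, type=float, help="SGD learning rate")
parser.add_argument("--model_path", default="model.pkl", type=str, help="Where to (de)serialize the model")
parser.add_argument("--feature_path", default="features.pkl", type=str, help="Where to (de)serialize the feature pipeline")

if __name__ == "__main__":
    main(parser.parse_args())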