import sys

import sklearn.cluster
import sklearn.decomposition
import sklearn.feature_extraction.text
import sklearn.pipeline
import sklearn.preprocessing


def train(wordlists, lsa=None):
    anchor_tags = ['pizza', 'auto', 'taxi', 'cinema', 'coffee', 'dinner']
    anchors = [['pizza'], ['auto', 'car', 'repair'], ['ride'], ['movie'],
               ['coffee'], ['dinner', 'restaurant']]
    nanchors = len(anchors)

    steps = [
        sklearn.feature_extraction.text.TfidfVectorizer(analyzer=identity)
    ]

    if lsa:
        steps.append(sklearn.decomposition.TruncatedSVD(lsa))
        steps.append(sklearn.preprocessing.Normalizer(copy=False))

    pipeline = sklearn.pipeline.make_pipeline(*steps)
    # Vectorize the real word lists together with the anchor documents, then
    # seed one k-means cluster on each anchor row.
    x = pipeline.fit_transform(wordlists + anchors)
    kmeans = sklearn.cluster.KMeans(n_clusters=nanchors,
                                    init=make_dense(x[-nanchors:]),
                                    n_init=1).fit(x)

    anchor_labels = kmeans.labels_[-nanchors:]
    if len(set(anchor_labels)) != nanchors:
        print("Anchors don't map to separate classes:",
              list(zip(anchors, anchor_labels)),
              file=sys.stderr)
        sys.exit(1)
    # tagmap[cluster_label] -> '<tag>' of the anchor seeded into that cluster.
    tagmap = [
        '<' + t + '>' for l, t in sorted(zip(anchor_labels, anchor_tags))
    ]

    # Map each input word list to the tag of its assigned cluster.
    preds = [tagmap[label] for label in kmeans.labels_[:-nanchors]]
    model = {'tagmap': tagmap, 'pipeline': pipeline, 'kmeans': kmeans}

    return model, preds
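The example relies on two small helpers, `identity` and `make_dense`, that are defined elsewhere in the original module. A minimal sketch of plausible definitions, inferred from how they are used above (these are assumptions, not the original code):

import numpy as np
import scipy.sparse

def identity(wordlist):
    # TfidfVectorizer(analyzer=identity) receives already-tokenized input,
    # so the analyzer simply returns the word list unchanged.
    return wordlist

def make_dense(rows):
    # KMeans(init=...) needs a dense (n_clusters, n_features) array; the
    # TF-IDF rows are sparse unless TruncatedSVD has already densified them.
    return rows.toarray() if scipy.sparse.issparse(rows) else np.asarray(rows)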
Example #2
def test_pcca_1():
    # Assumes the surrounding test module imports: numpy as np, sklearn.pipeline,
    # and the `lumping`, `markovstatemodel`, and `eq` helpers used below.
    # Make a simple dataset with four microstates forming two obvious macrostate
    # basins; the microstates within each basin interconvert quickly.
    n_frames = 10000
    chunk = np.zeros(n_frames, 'int')
    rnd = lambda: np.random.randint(0, 2, n_frames)  # random 0/1 noise within a basin
    # Microstates 0 and 1 interconvert, microstates 2 and 3 interconvert.
    assignments = [np.hstack((chunk + rnd(), chunk + 2 + rnd())),
                   np.hstack((chunk + 2 + rnd(), chunk + rnd()))]

    pcca = lumping.PCCA(2)
    macro_msm = markovstatemodel.MarkovStateModel()

    pipeline = sklearn.pipeline.Pipeline([("pcca", pcca), ("macro_msm", macro_msm)])
    macro_assignments = pipeline.fit_transform(assignments)

    # Relabel the output assignments so that the first frame is macrostate 0.
    i0 = macro_assignments[0][0]
    if i0 == 1:
        for m in macro_assignments:
            m *= -1
            m += 1

    eq(macro_assignments[0], np.hstack((chunk, chunk + 1)))
    eq(macro_assignments[1], np.hstack((chunk + 1, chunk)))
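The in-place flip above works because the macrostate labels are exactly 0 and 1; a standalone illustration of the same trick (NumPy only, values made up):

import numpy as np

m = np.array([1, 0, 1, 1])
m *= -1   # [-1, 0, -1, -1]
m += 1    # [ 0, 1,  0,  0]  -- labels 0 and 1 are swapped in place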
Example #3
import mdtraj as md
import mixtape.featurizer
import mixtape.tica
import mixtape.cluster
import mixtape.markovstatemodel
import mixtape.datasets
import mixtape.subset_featurizer
import mixtape.feature_selection
import numpy as np
import sklearn.pipeline, sklearn.externals.joblib
import mixtape.utils


trajectories = mixtape.datasets.alanine_dipeptide.fetch_alanine_dipeptide()["trajectories"]
train = trajectories[0::2]
test = trajectories[1::2]

n_timescales = 4
n_clusters = 100

clusterer = mixtape.cluster.KCenters(n_clusters=n_clusters, metric=md.rmsd)
msm = mixtape.markovstatemodel.MarkovStateModel(n_timescales=n_timescales)

pipeline = sklearn.pipeline.Pipeline([("clusterer", clusterer), ("msm", msm)])
assignments = pipeline.fit_transform(train)

print(msm.timescales_)  # implied timescales of the fitted MSM
print(pipeline.score(train), pipeline.score(test))
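sklearn.externals.joblib is imported above but never used in this excerpt; one plausible use, sketched with a placeholder file name, is persisting the fitted pipeline:

# Save the fitted clusterer + MSM pipeline so it can be reloaded without
# refitting (the file name is a placeholder).
sklearn.externals.joblib.dump(pipeline, "./cluster_msm_pipeline.job")
pipeline = sklearn.externals.joblib.load("./cluster_msm_pipeline.job")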
Example #4
# This excerpt appears to continue the script from Example #3 and relies on its
# imports; load_trajectories() and n_choose are defined elsewhere in the original project.
import mixtape.ghmm  # provides GaussianFusionHMM, used below
from matplotlib.pyplot import figure, hexbin, errorbar  # pylab-style plotting calls used at the end

stride = 1
lag_time = 1

trj0, trajectories, filenames = load_trajectories(stride=stride)
featurizer = sklearn.externals.joblib.load("./featurizer-%d.job" % n_choose)

train = trajectories[0::2]
test = trajectories[1::2]


n_components = 3
tica = mixtape.tica.tICA(n_components=n_components, lag_time=lag_time)
subsampler = mixtape.utils.Subsampler(lag_time=lag_time)
pipeline = sklearn.pipeline.Pipeline([("features", featurizer), ('tica', tica), ("subsampler", subsampler)])

X_all = pipeline.fit_transform(trajectories)
q = np.concatenate(X_all)

n_states = 3
model = mixtape.ghmm.GaussianFusionHMM(n_states, n_components, fusion_prior=0., init_algo='GMM')
model.fit(X_all)
model.score(X_all)


# Plot the density of the tICA projection with the HMM state means
# (plus/minus one standard deviation) overlaid.
for i, j in [(0, 1)]:
    figure()
    hexbin(q[:, i], q[:, j], bins='log')
    errorbar(model.means_[:, i], model.means_[:, j],
             xerr=model.vars_[:, i] ** 0.5, yerr=model.vars_[:, j] ** 0.5,
             fmt='kx', linewidth=4)


figure(1)
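A minimal way to label and save the hexbin figure produced above; the axis labels and file name are assumptions, not part of the original script:

from matplotlib.pyplot import savefig, xlabel, ylabel

xlabel("tIC %d" % (i + 1))   # i, j come from the plotting loop above
ylabel("tIC %d" % (j + 1))
savefig("tica_hexbin_%d_%d.png" % (i, j))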
Example #5
import lzma
import pickle

import numpy as np
import sklearn.compose
import sklearn.pipeline
import sklearn.preprocessing


# `Dataset` and the argument parser (`args`) come from the assignment template
# this excerpt belongs to.
def main(args):
    if args.predict is None:
        # We are training a model.
        np.random.seed(args.seed)

        # Create a random generator with a given seed
        generator = np.random.RandomState(args.seed)

        train = Dataset()
        data = train.data
        target = train.target

        # Split columns into integer-valued (treated as categorical) and real-valued features.
        bool_train = np.all(data.astype(int) == data, axis=0)
        int_train = [i for i, b in enumerate(bool_train) if b]
        real_train = [i for i, b in enumerate(bool_train) if not b]

        i = "tr"
        transformers = []
        for arr, encoder in zip([int_train, real_train], [
                sklearn.preprocessing.OneHotEncoder(sparse=False,
                                                    handle_unknown="ignore"),
                sklearn.preprocessing.StandardScaler()
        ]):

            if len(arr) > 0:
                transformers.append((i, encoder, arr))
                i += 'a'

        ct = sklearn.compose.ColumnTransformer(transformers=transformers)
        poly = sklearn.preprocessing.PolynomialFeatures(2, include_bias=False)

        pipeline = sklearn.pipeline.Pipeline([('ct', ct), ('pl', poly)])
        data = pipeline.fit_transform(data)

        # Train a linear regression model with mini-batch SGD and store the
        # resulting weights in `model`.
        # Append a column of ones as the bias feature.
        data = np.c_[data, np.ones(data.shape[0])]

        # Generate initial linear regression weights
        weights = generator.uniform(size=data.shape[1])

        for epoch in range(args.epochs):
            permutation = generator.permutation(data.shape[0])

            i = 0
            n_batches = data.shape[0] // args.batch_size
            for batch in range(n_batches):
                gradient_sum = np.zeros(shape=weights.shape)

                for sample in range(args.batch_size):
                    index = permutation[i + sample]
                    prediction = data[index] @ weights
                    gradient_sum += (prediction - target[index]) * data[index]

                average_gradient = gradient_sum / args.batch_size

                # SGD update
                weights = weights - args.learning_rate * average_gradient

                i = i + args.batch_size

        model = weights

        # Serialize features
        with lzma.open(args.feature_path, "wb") as feature_file:
            pickle.dump(pipeline, feature_file)

        # Serialize the model.
        with lzma.open(args.model_path, "wb") as model_file:
            pickle.dump(model, model_file)

    else:
        # Use the model and return test set predictions, as either a Python list or a NumPy array.
        test = Dataset(args.predict)

        with lzma.open(args.model_path, "rb") as model_file:
            model = pickle.load(model_file)

        with lzma.open(args.feature_path, "rb") as feature_file:
            pipeline = pickle.load(feature_file)

        test_d = pipeline.transform(test.data)
        test_d = np.c_[test_d, np.ones(test_d.shape[0])]

        # Predictions are the linear model outputs on the transformed test data.
        predictions = test_d @ model

        return predictions
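For reference, a self-contained sketch of the same mini-batch SGD update on synthetic data; every name below is illustrative and not part of the assignment:

import numpy as np

rng = np.random.RandomState(42)
X = np.c_[rng.uniform(size=(200, 3)), np.ones(200)]   # features plus a bias column
true_w = np.array([2.0, -1.0, 0.5, 3.0])
y = X @ true_w + rng.normal(scale=0.01, size=200)

w = rng.uniform(size=X.shape[1])
batch_size, lr = 20, 0.1
for epoch in range(100):
    perm = rng.permutation(len(X))
    for start in range(0, len(X), batch_size):
        idx = perm[start:start + batch_size]
        grad = (X[idx] @ w - y[idx]) @ X[idx] / batch_size   # mean gradient of the squared error
        w -= lr * grad

print(np.round(w, 2))   # the learned weights should approach true_w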