Example #1
# Imports assumed for this excerpt (it appears to come from the msmbuilder/mixtape
# test suite); exact module paths may differ by version.
import numpy as np
import pandas as pd
import mdtraj as md
import sklearn.pipeline
from mdtraj.testing import eq
from msmbuilder import cluster
from msmbuilder.msm import MarkovStateModel
from msmbuilder.utils import map_drawn_samples


def test_sample_1():
    # Test that the code actually runs and gives something non-crazy
    # Make an ergodic dataset with two gaussian centers offset by 25 units.
    chunk = np.random.normal(size=(20000, 3))
    data = [np.vstack((chunk, chunk + 25)), np.vstack((chunk + 25, chunk))]

    clusterer = cluster.KMeans(n_clusters=2)
    msm = MarkovStateModel()
    pipeline = sklearn.pipeline.Pipeline([("clusterer", clusterer),
                                          ("msm", msm)])
    pipeline.fit(data)
    trimmed_assignments = pipeline.transform(data)

    # Now relabel so that the output assignments start with state
    # zero at the first position.
    i0 = trimmed_assignments[0][0]
    if i0 == 1:
        for m in trimmed_assignments:
            # Swap the two labels in place: 0 -> 1 and 1 -> 0 (i.e. m = 1 - m).
            m *= -1
            m += 1

    pairs = msm.draw_samples(trimmed_assignments, 2000)

    samples = map_drawn_samples(pairs, data)
    mu = np.mean(samples, axis=1)
    eq(mu, np.array([[0., 0., 0.0], [25., 25., 25.]]), decimal=1)

    # We should make sure we can sample from Trajectory objects too...
    # Create a fake topology with 1 atom to match our input dataset
    top = md.Topology.from_dataframe(pd.DataFrame({
        "serial": [0],
        "name": ["HN"],
        "element": ["H"],
        "resSeq": [1],
        "resName": "RES",
        "chainID": [0]
    }),
                                     bonds=np.zeros(shape=(0, 2), dtype='int'))
    # np.newaxis reshapes the data to (40000 frames, 1 atom, 3 xyz coordinates)
    trajectories = [md.Trajectory(x[:, np.newaxis], top) for x in data]

    trj_samples = map_drawn_samples(pairs, trajectories)
    mu = np.array([t.xyz.mean(0)[0] for t in trj_samples])
    eq(mu, np.array([[0., 0., 0.0], [25., 25., 25.]]), decimal=1)
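Two of the array tricks used above are easy to miss; here is a minimal, standalone numpy sketch of them (the shapes are the ones from this test, nothing else is assumed):

import numpy as np

# mdtraj expects coordinates shaped (n_frames, n_atoms, 3); adding an axis
# turns a (40000, 3) array of single-atom positions into (40000, 1, 3).
x = np.zeros((40000, 3))
assert x[:, np.newaxis].shape == (40000, 1, 3)

# The in-place "m *= -1; m += 1" relabeling is just m -> 1 - m,
# i.e. it swaps the two cluster labels 0 and 1.
m = np.array([1, 0, 1, 1])
m *= -1
m += 1
assert (m == np.array([0, 1, 0, 0])).all()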
Example #2
# Imports assumed for this snippet (mixtape is the project later renamed msmbuilder).
import glob

import joblib  # the original used sklearn.externals.joblib, removed in scikit-learn >= 0.23
import matplotlib.pyplot as plt
import mdtraj as md
import numpy as np
import sklearn.pipeline

import mixtape.tica
import mixtape.utils

n_iter = 1000
n_choose = 50
stride = 1
lag_time = 1
n_components = 2

filenames = glob.glob("./Trajectories/*.h5")
trajectories = [md.load(filename) for filename in filenames]

if len(trajectories) > 1:
    train = trajectories[0::2]
    test = trajectories[1::2]
else:
    # Use integer division: n_frames / 2 is a float in Python 3 and cannot be a slice index.
    half = trajectories[0].n_frames // 2
    train = [trajectories[0][:half]]
    test = [trajectories[0][half:]]


featurizer = joblib.load("./featurizer-%d-%d.job" % (n_components, n_choose))
tica = mixtape.tica.tICA(lag_time=lag_time, n_components=n_components)
pipeline = sklearn.pipeline.Pipeline([("features", featurizer), ('tica', tica)])
pipeline.fit(train)
print(pipeline.score(train), pipeline.score(test))


# Refit on all trajectories and project them onto the first two tICA components.
pipeline.fit(trajectories)
X = pipeline.transform(trajectories)
q = np.concatenate(X)
plt.hexbin(q[:, 0], q[:, 1], bins='log')
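The featurizer above is loaded from a precomputed file; a minimal sketch of persisting the fitted tICA pipeline the same way with joblib (the filename below is hypothetical and simply mirrors the featurizer naming scheme):

# Hypothetical filename; adjust to your own layout.
joblib.dump(pipeline, "./tica-pipeline-%d-%d.job" % (n_components, n_choose))
reloaded = joblib.load("./tica-pipeline-%d-%d.job" % (n_components, n_choose))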
Example #3
import lzma
import pickle

import numpy as np
import sklearn.compose
import sklearn.pipeline
import sklearn.preprocessing

# `Dataset` is assumed to be defined elsewhere in the original script
# (a small wrapper exposing `.data`, `.target`, and an optional path argument).


def main(args):
    if args.predict is None:
        # We are training a model.
        np.random.seed(args.seed)

        # Create a random generator with a given seed
        generator = np.random.RandomState(args.seed)

        train = Dataset()
        data = train.data
        target = train.target

        # Feature engineering: split columns into integer-valued ones
        # (treated as categorical) and real-valued ones.
        bool_train = np.all(data.astype(int) == data, axis=0)
        int_train = [i for i, b in enumerate(bool_train) if b]
        real_train = [i for i, b in enumerate(bool_train) if not b]

        i = "tr"
        transformers = []
        for arr, encoder in zip([int_train, real_train], [
                sklearn.preprocessing.OneHotEncoder(sparse=False,
                                                    handle_unknown="ignore"),
                sklearn.preprocessing.StandardScaler()
        ]):

            if len(arr) > 0:
                transformers.append((i, encoder, arr))
                i += 'a'

        ct = sklearn.compose.ColumnTransformer(transformers=transformers)
        poly = sklearn.preprocessing.PolynomialFeatures(2, include_bias=False)

        pipeline = sklearn.pipeline.Pipeline([('ct', ct), ('pl', poly)])
        data = pipeline.fit_transform(data)

        # Train a linear regression model on the dataset and store it in `model`.
        # Append a constant column of ones so the last weight acts as a bias term.
        data = np.c_[data, np.ones(data.shape[0])]

        # Generate initial linear regression weights
        weights = generator.uniform(size=data.shape[1])

        for epoch in range(args.epochs):
            permutation = generator.permutation(data.shape[0])

            i = 0
            n_batches = data.shape[0] // args.batch_size
            for batch in range(n_batches):
                gradient_sum = np.zeros(shape=weights.shape)

                for sample in range(args.batch_size):
                    index = permutation[i + sample]
                    # Prediction for one sample; data[index] is 1-D, so no transpose is needed.
                    # (See the vectorized sketch after this function.)
                    prediction = data[index] @ weights
                    gradient_sum += (prediction - target[index]) * data[index]

                average_gradient = gradient_sum / args.batch_size

                # SGD update
                weights = weights - args.learning_rate * average_gradient

                i = i + args.batch_size

        model = weights

        # Serialize features
        with lzma.open(args.feature_path, "wb") as feature_file:
            pickle.dump(pipeline, feature_file)

        # Serialize the model.
        with lzma.open(args.model_path, "wb") as model_file:
            pickle.dump(model, model_file)
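        # Note: both the fitted feature pipeline and the learned weights are
        # serialized, so the prediction branch below can apply exactly the same
        # column transforms and polynomial expansion to unseen data.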

    else:
        # Use the model and return test set predictions, as either a Python list or a NumPy array.
        test = Dataset(args.predict)

        with lzma.open(args.model_path, "rb") as model_file:
            model = pickle.load(model_file)

        with lzma.open(args.feature_path, "rb") as feature_file:
            pipeline = pickle.load(feature_file)

        test_d = pipeline.transform(test.data)
        test_d = np.c_[test_d, np.ones(test_d.shape[0])]

        # Generate `predictions` for the test set using the deserialized weights.
        predictions = test_d @ model

        return predictions
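For reference, the per-sample gradient accumulation in the training loop above can be written as a single vectorized step; a minimal sketch, where `X_batch` and `y_batch` are hypothetical names standing for one minibatch of rows from `data` and `target`:

import numpy as np


def sgd_step(weights, X_batch, y_batch, learning_rate):
    # Mean squared-error gradient over the minibatch: X^T (X w - y) / batch_size.
    residuals = X_batch @ weights - y_batch
    gradient = X_batch.T @ residuals / X_batch.shape[0]
    return weights - learning_rate * gradient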