def test_sample_1():
    """Smoke-test sample drawing from a two-state model on synthetic data.

    Two trajectories are built from the same Gaussian cloud, once placed at
    the origin and once shifted by 25 units, concatenated in opposite orders.
    The data is clustered into two states, an MSM is fitted, samples are
    drawn per state, and the mean of the drawn frames is compared with the
    two cloud centres (to one decimal). The same check is then repeated with
    the data wrapped in single-atom ``md.Trajectory`` objects.
    """
    # Make an ergodic dataset with two gaussian centers offset by 25 units.
    cloud = np.random.normal(size=(20000, 3))
    data = [
        np.vstack((cloud, cloud + 25)),
        np.vstack((cloud + 25, cloud)),
    ]

    msm = MarkovStateModel()
    pipeline = sklearn.pipeline.Pipeline([
        ("clusterer", cluster.KMeans(n_clusters=2)),
        ("msm", msm),
    ])
    pipeline.fit(data)
    trimmed_assignments = pipeline.transform(data)

    # Cluster labels are arbitrary: relabel in place (0 <-> 1) so that the
    # first frame of the first trajectory carries label zero.
    if trimmed_assignments[0][0] == 1:
        for labels in trimmed_assignments:
            labels *= -1
            labels += 1

    # Expected per-state means: the cloud at the origin and the shifted one.
    expected = np.array([[0., 0., 0.0], [25., 25., 25.]])

    pairs = msm.draw_samples(trimmed_assignments, 2000)
    samples = map_drawn_samples(pairs, data)
    eq(np.mean(samples, axis=1), expected, decimal=1)

    # We should make sure we can sample from Trajectory objects too...
    # Create a fake topology with 1 atom to match our input dataset.
    atoms = pd.DataFrame({
        "serial": [0],
        "name": ["HN"],
        "element": ["H"],
        "resSeq": [1],
        "resName": "RES",
        "chainID": [0]
    })
    no_bonds = np.zeros(shape=(0, 2), dtype='int')
    top = md.Topology.from_dataframe(atoms, bonds=no_bonds)

    # np.newaxis reshapes each array to (40000 frames, 1 atom, 3 xyz).
    trajectories = [md.Trajectory(xyz[:, np.newaxis], top) for xyz in data]
    trj_samples = map_drawn_samples(pairs, trajectories)
    trj_means = np.array([t.xyz.mean(0)[0] for t in trj_samples])
    eq(trj_means, expected, decimal=1)
def test_sample_1():
    """Check that drawn samples land on the right Gaussian centre.

    Builds two synthetic trajectories (a Gaussian cloud and the same cloud
    shifted by 25, stacked in both orders), runs them through a
    KMeans -> MarkovStateModel pipeline, draws samples for each state and
    asserts that the per-state mean of the mapped samples matches the two
    centres to one decimal place. The check runs on raw arrays first and
    then on ``md.Trajectory`` wrappers around the same arrays.
    """
    # Make an ergodic dataset with two gaussian centers offset by 25 units.
    base = np.random.normal(size=(20000, 3))
    first = np.vstack((base, base + 25))
    second = np.vstack((base + 25, base))
    data = [first, second]

    clusterer = cluster.KMeans(n_clusters=2)
    msm = MarkovStateModel()
    steps = [("clusterer", clusterer), ("msm", msm)]
    pipeline = sklearn.pipeline.Pipeline(steps)
    pipeline.fit(data)
    trimmed_assignments = pipeline.transform(data)

    # Make the output assignments start with zero at the first position by
    # swapping the two labels in place when they came out the other way.
    starts_with_one = trimmed_assignments[0][0] == 1
    if starts_with_one:
        for assignment in trimmed_assignments:
            assignment *= -1
            assignment += 1

    pairs = msm.draw_samples(trimmed_assignments, 2000)

    array_samples = map_drawn_samples(pairs, data)
    array_means = np.mean(array_samples, axis=1)
    eq(array_means, np.array([[0., 0., 0.0], [25., 25., 25.]]), decimal=1)

    # Trajectory objects must be sampleable too: build a fake one-atom
    # topology so each (frames, 3) array can be wrapped as a trajectory.
    atom_table = pd.DataFrame({
        "serial": [0],
        "name": ["HN"],
        "element": ["H"],
        "resSeq": [1],
        "resName": "RES",
        "chainID": [0]
    })
    top = md.Topology.from_dataframe(
        atom_table,
        bonds=np.zeros(shape=(0, 2), dtype='int'),
    )

    # np.newaxis inserts the atom axis: 40000 frames, 1 atom, 3 xyz.
    trajectories = []
    for coords in data:
        trajectories.append(md.Trajectory(coords[:, np.newaxis], top))

    trj_samples = map_drawn_samples(pairs, trajectories)
    mu = np.array([t.xyz.mean(0)[0] for t in trj_samples])
    eq(mu, np.array([[0., 0., 0.0], [25., 25., 25.]]), decimal=1)
import mixtape.utils

# Script: fit a featurizer -> tICA pipeline on a train split, report
# train/test scores, then refit on everything and plot the first two
# components of the projection.
# NOTE(review): glob, md, sklearn, np and hexbin are used below but not
# imported in this span; presumably they come from earlier in the file or
# from an interactive pylab session -- confirm.

# Run parameters. n_iter and stride are not referenced in this span.
n_iter = 1000
n_choose = 50
stride = 1
lag_time = 1
n_components = 2

filenames = glob.glob("./Trajectories/*.h5")
trajectories = [md.load(filename) for filename in filenames]

# Train/test split: alternate whole trajectories when there are several,
# otherwise cut the single trajectory in half.
if len(trajectories) > 1:
    train = trajectories[0::2]
    test = trajectories[1::2]
else:
    # Floor division keeps the slice bound an int; with `/` this is a float
    # on Python 3 and the slice raises TypeError.
    half = trajectories[0].n_frames // 2
    train = [trajectories[0][0:half]]
    test = [trajectories[0][half:]]

# The featurizer is loaded from a file whose name encodes n_components and
# n_choose.
featurizer = sklearn.externals.joblib.load(
    "./featurizer-%d-%d.job" % (n_components, n_choose)
)
tica = mixtape.tica.tICA(lag_time=lag_time, n_components=n_components)
pipeline = sklearn.pipeline.Pipeline(
    [("features", featurizer), ('tica', tica)]
)

# Score on the held-out split before refitting on all of the data.
pipeline.fit(train)
print(pipeline.score(train), pipeline.score(test))

pipeline.fit(trajectories)
X = pipeline.transform(trajectories)

# Pool the per-trajectory projections and plot the first two components.
q = np.concatenate(X)
hexbin(q[:, 0], q[:, 1], bins='log')
def main(args):
    """Train a linear model with minibatch SGD, or predict with a stored one.

    Training mode (``args.predict is None``): loads ``Dataset()``, one-hot
    encodes the all-integer columns and standardises the remaining ones,
    expands the result with degree-2 polynomial features, appends a constant
    bias column and fits linear-regression weights by minibatch SGD. The
    fitted feature pipeline is pickled to ``args.feature_path`` and the
    weight vector to ``args.model_path`` (both lzma-compressed).

    Prediction mode: loads ``Dataset(args.predict)``, restores the pipeline
    and weights from those two files and returns the predictions.

    Args:
        args: Namespace-like object. Attributes read here: ``predict``,
            ``seed``, ``epochs``, ``batch_size``, ``learning_rate``,
            ``feature_path`` and ``model_path``.

    Returns:
        The predictions array in prediction mode; ``None`` in training mode.
    """
    if args.predict is None:
        # We are training a model.
        np.random.seed(args.seed)
        # Dedicated generator so weight init and shuffling are reproducible.
        generator = np.random.RandomState(args.seed)

        train = Dataset()
        data = train.data
        # Flatten once so a column-vector target behaves like a 1-D one in
        # the vectorised gradient below.
        target = np.asarray(train.target).reshape(-1)

        # A column is treated as categorical when every value in it is
        # integral; all other columns are treated as real-valued.
        is_integer_column = np.all(data.astype(int) == data, axis=0)
        int_columns = [i for i, flag in enumerate(is_integer_column) if flag]
        real_columns = [
            i for i, flag in enumerate(is_integer_column) if not flag
        ]

        # NOTE(review): `sparse` was renamed `sparse_output` in newer
        # scikit-learn releases -- confirm against the pinned version.
        candidates = (
            (
                int_columns,
                sklearn.preprocessing.OneHotEncoder(
                    sparse=False, handle_unknown="ignore"
                ),
            ),
            (real_columns, sklearn.preprocessing.StandardScaler()),
        )
        # Register only non-empty column groups, each under a unique name
        # ("tr", then "tra").
        name = "tr"
        transformers = []
        for columns, encoder in candidates:
            if columns:
                transformers.append((name, encoder, columns))
                name += 'a'

        ct = sklearn.compose.ColumnTransformer(transformers=transformers)
        poly = sklearn.preprocessing.PolynomialFeatures(2, include_bias=False)
        pipeline = sklearn.pipeline.Pipeline([('ct', ct), ('pl', poly)])
        data = pipeline.fit_transform(data)

        # Append a column of ones so the last weight acts as the bias.
        data = np.c_[data, np.ones(data.shape[0])]

        # Generate initial linear regression weights.
        weights = generator.uniform(size=data.shape[1])

        # Samples left over after the last full batch are skipped each epoch.
        n_batches = data.shape[0] // args.batch_size
        for _ in range(args.epochs):
            permutation = generator.permutation(data.shape[0])
            for batch in range(n_batches):
                start = batch * args.batch_size
                batch_indices = permutation[start:start + args.batch_size]
                batch_data = data[batch_indices]

                # Mean squared-error gradient over the batch, computed as a
                # single matrix product instead of a per-sample Python loop.
                residuals = batch_data @ weights - target[batch_indices]
                average_gradient = batch_data.T @ residuals / args.batch_size

                # SGD update
                weights = weights - args.learning_rate * average_gradient

        model = weights

        # Serialize the fitted feature pipeline.
        with lzma.open(args.feature_path, "wb") as feature_file:
            pickle.dump(pipeline, feature_file)

        # Serialize the model.
        with lzma.open(args.model_path, "wb") as model_file:
            pickle.dump(model, model_file)
    else:
        # Use the model and return test set predictions.
        test = Dataset(args.predict)

        # NOTE: pickle.load can execute arbitrary code; only load model and
        # feature files that come from a trusted source.
        with lzma.open(args.model_path, "rb") as model_file:
            model = pickle.load(model_file)
        with lzma.open(args.feature_path, "rb") as feature_file:
            pipeline = pickle.load(feature_file)

        # Apply the same feature map as in training, including the bias
        # column.
        test_d = pipeline.transform(test.data)
        test_d = np.c_[test_d, np.ones(test_d.shape[0])]

        predictions = test_d @ model
        return predictions