def test_preprocess_custom_inds():
    """Run the ``process`` CLI command with a custom atom selection.

    Writes the three ``.npy`` inputs the command reads (trajectory
    directories, PDB filenames and per-variant atom indices), invokes
    ``main.py process`` in a subprocess, then checks that the expected
    outputs exist and that there is one indicator file per aligned
    trajectory.
    """
    curr_dir = os.getcwd()
    # Output directory handed to the CLI; created under the working
    # directory, as before.
    td = tempfile.mkdtemp(dir=curr_dir)
    # All throw-away input arrays live in one scratch directory, so cleanup
    # is a single rmtree that cannot hit an unbound name and leaves no
    # stray temp files or open handles behind.
    scratch = tempfile.mkdtemp()
    try:
        traj_dirs_tmp = os.path.join(scratch, "traj_dirs.npy")
        traj_dirs = np.array([os.path.join(curr_dir, "data/traj1"),
                              os.path.join(curr_dir, "data/traj2")])
        np.save(traj_dirs_tmp, traj_dirs, allow_pickle=False)

        pdb_fns_tmp = os.path.join(scratch, "pdb_fns.npy")
        pdb_fns = np.array([os.path.join(curr_dir, "data/beta-peptide1.pdb"),
                            os.path.join(curr_dir, "data/beta-peptide2.pdb")])
        np.save(pdb_fns_tmp, pdb_fns, allow_pickle=False)

        # The same selection (computed on the first PDB) is used for both
        # variants.
        inds_fn_tmp = os.path.join(scratch, "atom_inds.npy")
        pdb = md.load(pdb_fns[0])
        inds = pdb.top.select("name CA or name N or name CB or name C")
        np.save(inds_fn_tmp, np.array([inds, inds]), allow_pickle=False)

        subprocess.call(['python', CLI_DIR + "/main.py", "process",
                         traj_dirs_tmp, pdb_fns_tmp, td,
                         "-a" + inds_fn_tmp])

        assert os.path.exists(os.path.join(td, "wm.npy"))
        assert os.path.exists(os.path.join(td, "uwm.npy"))
        assert os.path.exists(os.path.join(td, "master.pdb"))
        assert os.path.exists(os.path.join(td, "data"))

        xtc_fns = os.path.join(td, "aligned_xtcs")
        data_fns = get_fns(xtc_fns, "*.xtc")
        ind_fns = os.path.join(td, "indicators")
        indicator_fns = get_fns(ind_fns, "*.npy")
        print(len(data_fns))
        assert len(data_fns) == len(indicator_fns)
    finally:
        # Best-effort cleanup: a failure here must not hide the real
        # test failure.
        shutil.rmtree(scratch, ignore_errors=True)
        shutil.rmtree(td, ignore_errors=True)
def test_whitening_correctness():
    """Check that whitened coordinates have (approximately) unit variance.

    Applies the stored whitening matrix to the aligned trajectories under
    ``./data/whitened/``, accumulates ``X^T X`` over the two resulting
    trajectories, divides by the total frame count and asserts that the
    trace is within 1 of the number of features (117).
    """
    whitener = WhitenTraj("./data/whitened/")
    master = md.load("./data/whitened/master.pdb")
    # Loaded but not used below; kept so that a missing or unreadable
    # aligned trajectory still fails the test at this point.
    traj1 = md.load("./data/whitened/aligned_xtcs/000000.xtc", top=master)
    traj2 = md.load("./data/whitened/aligned_xtcs/000001.xtc", top=master)
    wm = np.load("./data/whitened/wm.npy")

    whitener.apply_whitening_xtc_dir(whitener.xtc_dir, master.top, wm,
                                     whitener.cm, 1,
                                     "./data/whitened/whitened_xtcs")
    whitened_fns = get_fns("./data/whitened/whitened_xtcs/", "*.xtc")

    def second_moment(fn, n_frames):
        # Flatten each frame to a row of 3 * 39 coordinates, then X^T X.
        flat = md.load(fn, top=master).xyz.reshape((n_frames, 3 * 39))
        return np.matmul(flat.transpose(), flat)

    first = second_moment(whitened_fns[0], 2501)
    second = second_moment(whitened_fns[1], 2500)
    c00 = (first + second) / 5001

    trace = np.sum(np.diagonal(c00))
    assert (np.abs(117 - trace) < 1)
def preprocess_data(sim_dirs, pdb_fns, outdir, atom_sel=None, stride=1):
    """
    sim_dirs: Path to an np.array containing directory names. The
              array needs one directory name for each variant where each
              directory contains all trajectories for that variant.

    pdb_fns: Path to an np.array containing pdb filenames. The
             array needs one pdb filename for each variant. The order of
             variants should match the order of sim_dirs.

    atom_sel: (optional) Path to an np.array containing a list of indices
              for each variant, which operates on the pdbs supplied. The
              indices need to select equivalent atoms across variants.

    stride: (optional) Path to an np.array containing an integer for
            each variant. A plain integer (such as the default of 1) is
            passed through unchanged.

    outdir: Path you would like processed data to live.
    """
    # Each input is loaded separately so the user is told which argument
    # was wrong; the original exception is always re-raised.
    try:
        var_dir_names = np.load(sim_dirs)
    except Exception:
        click.echo('Incorrect input for sim_dirs. Use --help flag for '
                   'information on the correct input for sim_dirs.')
        raise

    try:
        var_pdb_fns = np.load(pdb_fns)
    except Exception:
        click.echo('Incorrect input for pdb_fns. Use --help flag for '
                   'information on the correct input for pdb_fns.')
        raise

    # The signature default is the integer 1, which is not a path; only
    # non-integer values are treated as a file to load.
    # NOTE(review): assumes ProcessTraj accepts a scalar stride -- confirm.
    if stride and not isinstance(stride, (int, np.integer)):
        try:
            stride = np.load(stride)
        except Exception:
            click.echo(
                'Incorrect input for stride. User must supply a '
                'path to a np.array that has a stride value for each variant.')
            raise

    if atom_sel:
        try:
            atom_sel = np.load(atom_sel)
            # After slicing, every variant must be left with the same
            # number of atoms.
            n_atoms = [
                md.load(fn).atom_slice(atom_sel[i]).n_atoms
                for i, fn in enumerate(var_pdb_fns)
            ]
            if len(np.unique(n_atoms)) != 1:
                raise ImproperlyConfigured(
                    'atom_sel needs to choose equivalent atoms across variants. '
                    'After performing atom_sel, pdbs have different numbers of '
                    'atoms.')
        except Exception:
            click.echo('Incorrect input for atom_sel. Use --help flag for '
                       'information on the correct input for atom_sel.')
            raise
    else:
        # Without an explicit selection, the PDBs must all have the same
        # residue count.
        n_resis = [md.load(fn).top.n_residues for fn in var_pdb_fns]
        if len(np.unique(n_resis)) != 1:
            raise ImproperlyConfigured(
                'The PDBs supplied have different numbers of residues. The '
                'default atom selection does not work in this case. Please '
                'use the --atom-sel option to choose equivalent atoms across '
                'different variant pdbs.')

    if len(var_dir_names) != len(var_pdb_fns):
        raise ImproperlyConfigured(
            'pdb_fns and sim_dirs must point to np.arrays that have '
            'the same length')

    for vd, fn in zip(var_dir_names, var_pdb_fns):
        traj_fns = get_fns(vd, "*.xtc")
        n_traj = len(traj_fns)
        click.echo("Found %s trajectories in %s" % (n_traj, vd))
        if n_traj == 0:
            raise ImproperlyConfigured("Found no trajectories in %s" % vd)
        try:
            # Loaded only to confirm the trajectory is readable with this
            # variant's topology; the result is discarded.
            md.load(traj_fns[0], top=fn)
        except Exception:
            click.echo('Order of pdb_fns and sim_dirs need to '
                       'correspond to each other.')
            raise

    proc_traj = ProcessTraj(var_dir_names,
                            var_pdb_fns,
                            outdir,
                            stride=stride,
                            atom_sel=atom_sel)
    proc_traj.run()
    print("Aligned trajectories")

    whiten_traj = WhitenTraj(outdir)
    print("starting trajectory whitening")
    whiten_traj.run()
def train(config):
    """
    config: YML config file. See train_sample.yml for an example and
            train_sample.txt for parameter descriptions.
    """
    # safe_load: the config only needs plain YAML types, and yaml.load()
    # without an explicit Loader is unsafe on untrusted files and an error
    # on recent PyYAML releases.
    with open(config) as f:
        job = yaml.safe_load(f)
    if not isinstance(job, dict):
        raise ImproperlyConfigured(
            f'{config} must contain a mapping of parameter names to values.')

    required_keys = [
        'data_dir', 'n_epochs', 'act_map', 'lr', 'n_latent',
        'hidden_layer_sizes', 'em_bounds', 'do_em', 'em_batch_size', 'nntype',
        'batch_size', 'batch_output_freq', 'epoch_output_freq',
        'test_batch_size', 'frac_test', 'subsample', 'outdir', 'data_in_mem'
    ]
    optional_keys = ["close_inds_fn", "label_spreading"]

    # job['nntype'] is still the raw YAML value here, so the split_inds
    # check has to be made on the entry it selects in nn_d. A missing or
    # unknown nntype is reported further down.
    try:
        nn_cls = nn_d[job['nntype']]
    except (KeyError, TypeError):
        nn_cls = None
    if hasattr(nn_cls, 'split_inds'):
        required_keys.append("close_inds_fn")

    if "label_spreading" in job:
        if job["label_spreading"] not in ("gaussian", "uniform"):
            raise ImproperlyConfigured(
                'label_spreading must be set to gaussian or uniform')

    # Unknown keys are rejected first, then missing required keys.
    allowed_keys = set(required_keys) | set(optional_keys)
    for key in job:
        if key not in allowed_keys:
            raise ImproperlyConfigured(
                f'{key} is not a valid parameter. Check yaml file.')
    missing_keys = [key for key in required_keys if key not in job]
    if missing_keys:
        raise ImproperlyConfigured(
            f'Missing the following parameters in {config} '
            f'{missing_keys} ')
    if nn_cls is None:
        raise ImproperlyConfigured(
            f"{job['nntype']} is not a valid nntype. Check yaml file.")

    # The preprocessed data directory must hold the whitening matrix and
    # one indicator file per aligned trajectory.
    data_dir = job['data_dir']
    npy_fns = get_fns(data_dir, "*.npy")
    wm_fn = os.path.join(data_dir, "wm.npy")
    if wm_fn not in npy_fns:
        raise ImproperlyConfigured(
            'Cannot find wm.npy in preprocessed data directory. Likely '
            'need to re-run data preprocessing step.')
    xtc_dir = os.path.join(data_dir, "aligned_xtcs")
    xtc_fns = get_fns(xtc_dir, "*.xtc")
    indicator_dir = os.path.join(data_dir, "indicators")
    indicator_fns = get_fns(indicator_dir, "*.npy")
    if (len(indicator_fns) != len(xtc_fns)) or len(indicator_fns) == 0:
        raise ImproperlyConfigured(
            'Number of files in aligned_xtcs and indicators should be '
            'equal. Likely need to re-run data preprocessing step.')
    # NOTE(review): presumably the last indicator file carries the highest
    # variant index -- this relies on the order returned by get_fns.
    last_indi = np.load(indicator_fns[-1])

    n_cores = mp.cpu_count()
    master_fn = os.path.join(job['data_dir'], "master.pdb")
    master = md.load(master_fn)
    n_atoms = master.top.n_atoms
    n_features = 3 * n_atoms

    # Layer sizes: two input-sized layers, then the hidden layers (or a
    # single n_features / 4 layer when none are given), then the latent
    # layer.
    hidden_sizes = job['hidden_layer_sizes']
    layer_sizes = [n_features, n_features]
    if not hidden_sizes:
        layer_sizes.append(int(n_features / 4))
    else:
        layer_sizes.extend(hidden_sizes)
    layer_sizes.append(job['n_latent'])
    job['layer_sizes'] = layer_sizes

    job['act_map'] = np.array(job['act_map'], dtype=float)
    job['em_bounds'] = np.array(job['em_bounds'])
    job['em_n_cores'] = n_cores
    job['nntype'] = nn_cls

    if len(job['act_map']) != last_indi[0] + 1:
        raise ImproperlyConfigured(
            'act_map needs to contain a value for each variant.')

    if len(job['act_map']) != len(job['em_bounds']):
        raise ImproperlyConfigured(
            'act_map and em_bounds should be the same length since '
            'each variant needs an initial classification label and '
            'a range for the EM update')

    # Sanity checks on the layer_sizes list built above.
    if n_features != job['layer_sizes'][0]:
        raise ImproperlyConfigured(
            '1st layer size does not match the number of xyz coordinates')

    if job['layer_sizes'][0] != job['layer_sizes'][1]:
        raise ImproperlyConfigured('1st and 2nd layer size need to be equal.')

    if job['layer_sizes'][-1] != job['n_latent']:
        raise ImproperlyConfigured(
            'Last layer size needs to equal number of latent variables')

    if 'close_inds_fn' in job:
        if not hasattr(job['nntype'], 'split_inds'):
            raise ImproperlyConfigured(
                'Indices chosen for a split autoencoder architecture '
                '(close_inds_fn), but a split autoencoder architecture '
                'was not chosen (nntype)')
        close_atom_inds = np.load(job['close_inds_fn'])
        # Expand each atom index i into its coordinate indices
        # 3i, 3i + 1, 3i + 2.
        close_xyz_inds = [
            (i * 3) + offset for i in close_atom_inds for offset in range(3)
        ]
        all_inds = np.arange(master.n_atoms * 3)
        job['inds1'] = np.array(close_xyz_inds)
        job['inds2'] = np.setdiff1d(all_inds, close_xyz_inds)

    # An existing outdir is reused rather than rejected.
    os.makedirs(job['outdir'], exist_ok=True)
    # Copy under the config's base name so that a config given with a
    # directory component (or as an absolute path) still lands in outdir.
    config_copy = os.path.join(job['outdir'], os.path.basename(config))
    try:
        shutil.copyfile(config, config_copy)
    except shutil.SameFileError:
        # The config already lives in outdir; nothing to copy.
        pass

    trainer = Trainer(job)
    trainer.run(data_in_mem=job['data_in_mem'])