Example #1
0
def test_preprocess_custom_inds():
    """Run the ``process`` CLI command with a custom atom selection.

    Writes the three ``.npy`` inputs handed to the command (trajectory
    directories, pdb filenames and per-variant atom indices), invokes
    ``main.py process`` in a subprocess, then checks that the expected
    output files exist and that there is one indicator file per aligned
    trajectory.
    """
    curr_dir = os.getcwd()
    # Scratch directories are created *before* their ``try`` blocks so the
    # cleanup code never touches a name that was not assigned, and every
    # input file lives inside a directory that is removed as a whole (the
    # previous NamedTemporaryFile(delete=False) handles were never closed
    # or deleted).
    td = tempfile.mkdtemp(dir=curr_dir)
    try:
        input_dir = tempfile.mkdtemp()
        try:
            traj_dirs_tmp = os.path.join(input_dir, "traj_dirs.npy")
            traj_dirs = np.array([os.path.join(curr_dir, "data/traj1"),
                                  os.path.join(curr_dir, "data/traj2")])
            np.save(traj_dirs_tmp, traj_dirs, allow_pickle=False)

            pdb_fns_tmp = os.path.join(input_dir, "pdb_fns.npy")
            pdb_fns = np.array(
                [os.path.join(curr_dir, "data/beta-peptide1.pdb"),
                 os.path.join(curr_dir, "data/beta-peptide2.pdb")])
            np.save(pdb_fns_tmp, pdb_fns, allow_pickle=False)

            # The same selection, computed on the first pdb, is supplied
            # for both variants.
            inds_fn_tmp = os.path.join(input_dir, "atom_inds.npy")
            pdb = md.load(pdb_fns[0])
            sel = pdb.top.select("name CA or name N or name CB or name C")
            both_inds = np.array([sel, sel])
            np.save(inds_fn_tmp, both_inds, allow_pickle=False)

            ret = subprocess.call(['python', CLI_DIR + "/main.py", "process",
                                   traj_dirs_tmp, pdb_fns_tmp, td,
                                   "-a" + inds_fn_tmp])
            # Fail on the command itself rather than on a missing output.
            assert ret == 0

            assert os.path.exists(os.path.join(td, "wm.npy"))
            assert os.path.exists(os.path.join(td, "uwm.npy"))
            assert os.path.exists(os.path.join(td, "master.pdb"))
            assert os.path.exists(os.path.join(td, "data"))

            xtc_dir = os.path.join(td, "aligned_xtcs")
            data_fns = get_fns(xtc_dir, "*.xtc")
            indicator_dir = os.path.join(td, "indicators")
            indicator_fns = get_fns(indicator_dir, "*.npy")

            print(len(data_fns))
            assert len(data_fns) == len(indicator_fns)
        finally:
            shutil.rmtree(input_dir)
    finally:
        shutil.rmtree(td)
Example #2
0
def test_whitening_correctness():
    """Check that whitened coordinates have a near-identity covariance.

    Applies the stored whitening matrix to the aligned trajectories in
    ``./data/whitened/``, accumulates ``X^T X`` over the first two whitened
    trajectories and asserts that the trace of the normalised matrix is
    within 1 of the number of features.
    """
    # Fixture dimensions: 39 atoms, and the frame count of each of the two
    # trajectories. The reshape below fails if the data on disk disagrees.
    n_atoms = 39
    n_features = 3 * n_atoms
    expected_frames = (2501, 2500)

    w = WhitenTraj("./data/whitened/")
    master = md.load("./data/whitened/master.pdb")
    wm = np.load("./data/whitened/wm.npy")
    w.apply_whitening_xtc_dir(w.xtc_dir, master.top, wm, w.cm, 1,
                              "./data/whitened/whitened_xtcs")
    traj_fns = get_fns("./data/whitened/whitened_xtcs/", "*.xtc")
    assert len(traj_fns) >= len(expected_frames)

    c00 = np.zeros((n_features, n_features))
    for fn, n_frames in zip(traj_fns, expected_frames):
        coords = md.load(fn, top=master).xyz.reshape((n_frames, n_features))
        c00 += np.matmul(coords.transpose(), coords)
    c00 /= sum(expected_frames)

    assert np.abs(n_features - np.trace(c00)) < 1
Example #3
0
def preprocess_data(sim_dirs, pdb_fns, outdir, atom_sel=None, stride=1):
    """ Validate the inputs, then align and whiten all trajectories.

        sim_dirs: Path to an np.array containing directory names. The
               array needs one directory name for each variant where each
               directory contains all trajectories for that variant.

        pdb_fns: Path to an np.array containing pdb filenames. The
               array needs one pdb filename for each variant. The order of
               variants should match the order of sim_dirs.

        atom_sel: (optional) Path to an np.array containing a list of
               indices for each variant, which operates on the pdbs
               supplied. The indices need to select equivalent atoms
               across variants.

        stride: (optional) Path to an np.array containing an integer for
               each variant. A plain integer (such as the default) is not
               loaded from disk and is passed on unchanged.

        outdir: Path you would like processed data to live.

        Raises ImproperlyConfigured when the inputs are inconsistent with
        each other; errors from loading any input are re-raised after a
        hint has been echoed.
    """
    try:
        var_dir_names = np.load(sim_dirs)
    except Exception:
        click.echo('Incorrect input for sim_dirs. Use --help flag for '
                   'information on the correct input for sim_dirs.')
        raise

    try:
        var_pdb_fns = np.load(pdb_fns)
    except Exception:
        click.echo('Incorrect input for pdb_fns. Use --help flag for '
                   'information on the correct input for pdb_fns.')
        raise

    # Checked before anything pairs the two arrays up by position.
    if len(var_dir_names) != len(var_pdb_fns):
        raise ImproperlyConfigured(
            'pdb_fns and sim_dirs must point to np.arrays that have '
            'the same length')

    # An integer stride is not a path; handing it to np.load made every
    # call relying on the default fail.
    # NOTE(review): assumes ProcessTraj accepts a scalar stride — confirm.
    stride_is_int = isinstance(stride, (int, np.integer))
    if not stride_is_int and stride:
        try:
            stride = np.load(stride)
        except Exception:
            click.echo(
                'Incorrect input for stride. User must supply a '
                'path to a np.array that has a stride value for each variant.')
            raise

    if atom_sel:
        try:
            atom_sel = np.load(atom_sel)
            # Every variant must keep the same number of atoms once its
            # selection has been applied.
            n_atoms = [
                md.load(fn).atom_slice(atom_sel[i]).n_atoms
                for i, fn in enumerate(var_pdb_fns)
            ]
            if len(np.unique(n_atoms)) != 1:
                raise ImproperlyConfigured(
                    'atom_sel needs to choose equivalent atoms across variants. '
                    'After performing atom_sel, pdbs have different numbers of '
                    'atoms.')
        except Exception:
            click.echo('Incorrect input for atom_sel. Use --help flag for '
                       'information on the correct input for atom_sel.')
            raise
    else:
        # Without an explicit selection the pdbs must at least agree on
        # their residue count.
        n_resis = [md.load(fn).top.n_residues for fn in var_pdb_fns]
        if len(np.unique(n_resis)) != 1:
            raise ImproperlyConfigured(
                'The PDBs supplied have different numbers of residues. The '
                'default atom selection does not work in this case. Please '
                'use the --atom-sel option to choose equivalent atoms across  '
                'different variant pdbs.')

    for vd, fn in zip(var_dir_names, var_pdb_fns):
        traj_fns = get_fns(vd, "*.xtc")
        n_traj = len(traj_fns)
        click.echo("Found %s trajectories in %s" % (n_traj, vd))
        if n_traj == 0:
            raise ImproperlyConfigured("Found no trajectories in %s" % vd)
        try:
            # Loaded only to confirm the topology fits the trajectory.
            md.load(traj_fns[0], top=fn)
        except Exception:
            click.echo('Order of pdb_fns and sim_dirs need to '
                       'correspond to each other.')
            raise

    proc_traj = ProcessTraj(var_dir_names,
                            var_pdb_fns,
                            outdir,
                            stride=stride,
                            atom_sel=atom_sel)
    proc_traj.run()
    print("Aligned trajectories")
    whiten_traj = WhitenTraj(outdir)
    print("starting trajectory whitening")
    whiten_traj.run()
Example #4
0
def train(config):
    """ Validate a training configuration and run the trainer.

        config: YML config file. See train_sample.yml for an example and
                train_sample.txt for parameter descriptions.

        Raises ImproperlyConfigured when the config has unknown or missing
        parameters, or when it is inconsistent with the preprocessed data
        found in its data_dir. An nntype that is not a key of nn_d raises
        KeyError.
    """
    with open(config) as f:
        # safe_load: yaml.load() without an explicit Loader is rejected by
        # PyYAML >= 6 and can construct arbitrary Python objects.
        job = yaml.safe_load(f)

    if not isinstance(job, dict):
        raise ImproperlyConfigured(
            f'{config} does not contain any parameters. Check yaml file.')

    required_keys = [
        'data_dir', 'n_epochs', 'act_map', 'lr', 'n_latent',
        'hidden_layer_sizes', 'em_bounds', 'do_em', 'em_batch_size', 'nntype',
        'batch_size', 'batch_output_freq', 'epoch_output_freq',
        'test_batch_size', 'frac_test', 'subsample', 'outdir', 'data_in_mem'
    ]
    optional_keys = ["close_inds_fn", "label_spreading"]

    if "label_spreading" in job:
        if job["label_spreading"] not in ("gaussian", "uniform"):
            raise ImproperlyConfigured(
                'label_spreading must be set to gaussian or uniform')

    allowed_keys = set(required_keys) | set(optional_keys)
    for key in job:
        if key not in allowed_keys:
            raise ImproperlyConfigured(
                f'{key} is not a valid parameter. Check yaml file.')

    missing = [key for key in required_keys if key not in job]
    if missing:
        raise ImproperlyConfigured(
            f'Missing the following parameters in {config} {missing} ')

    # Resolve the network class now: whether it is a split architecture
    # is a property of the class, not of the name given in the config.
    nn_cls = nn_d[job['nntype']]
    uses_split = hasattr(nn_cls, 'split_inds')
    if uses_split and 'close_inds_fn' not in job:
        raise ImproperlyConfigured(
            f"Missing the following parameters in {config} "
            f"{['close_inds_fn']} ")

    data_dir = job['data_dir']
    wm_fn = os.path.join(data_dir, "wm.npy")
    if not os.path.isfile(wm_fn):
        raise ImproperlyConfigured(
            'Cannot find wm.npy in preprocessed data directory. Likely '
            'need to re-run data preprocessing step.')

    data_fns = get_fns(os.path.join(data_dir, "aligned_xtcs"), "*.xtc")
    indicator_fns = get_fns(os.path.join(data_dir, "indicators"), "*.npy")
    if (len(indicator_fns) != len(data_fns)) or len(indicator_fns) == 0:
        raise ImproperlyConfigured(
            'Number of files in aligned_xtcs and indicators should be '
            'equal. Likely need to re-run data preprocessing step.')
    # NOTE(review): presumably the first entry of the last indicator file
    # is the highest variant index — confirm against the preprocessing step.
    last_indi = np.load(indicator_fns[-1])

    master = md.load(os.path.join(data_dir, "master.pdb"))
    n_features = 3 * master.top.n_atoms

    hidden_sizes = job['hidden_layer_sizes']
    if len(hidden_sizes) == 0:
        hidden_sizes = [int(n_features / 4)]
    # By construction the first two layers both have n_features units and
    # the last one has n_latent units, so no further size checks are needed.
    job['layer_sizes'] = [n_features, n_features, *hidden_sizes,
                          job['n_latent']]
    job['act_map'] = np.array(job['act_map'], dtype=float)
    job['em_bounds'] = np.array(job['em_bounds'])
    job['em_n_cores'] = mp.cpu_count()
    job['nntype'] = nn_cls

    if len(job['act_map']) != last_indi[0] + 1:
        raise ImproperlyConfigured(
            'act_map needs to contain a value for each variant.')

    if len(job['act_map']) != len(job['em_bounds']):
        raise ImproperlyConfigured(
            'act_map and em_bounds should be the same length since '
            'each variant needs an initial classification label and '
            'a range for the EM update')

    if 'close_inds_fn' in job:
        if not uses_split:
            raise ImproperlyConfigured(
                'Indices chosen for a split autoencoder architecture '
                '(close_inds_fn), but  a split autoencoder architecture '
                'was not chosen (nntype)')
        # Expand each atom index into its x, y and z coordinate indices.
        close_atom_inds = np.load(job['close_inds_fn'])
        close_xyz_inds = [
            (i * 3) + offset for i in close_atom_inds for offset in range(3)
        ]
        all_inds = np.arange(n_features)
        job['inds1'] = np.array(close_xyz_inds)
        job['inds2'] = np.setdiff1d(all_inds, close_xyz_inds)

    # An existing outdir is reused as-is; the config is only copied into a
    # freshly created one.
    if not os.path.exists(job['outdir']):
        os.makedirs(job['outdir'])
        # basename: joining the full config path would point outside
        # outdir (absolute path) or into a missing subdirectory.
        shutil.copyfile(
            config, os.path.join(job['outdir'], os.path.basename(config)))

    trainer = Trainer(job)
    trainer.run(data_in_mem=job['data_in_mem'])