コード例 #1
0
ファイル: normal_mode.py プロジェクト: lahplover/nnef
def nma_funnel():
    """Compute Cartesian normal modes for the funnel set and store them in HDF5.

    PDB ids are read from the ``pdb`` column of
    ``data/hhsuite_CB_cullpdb_cath_funnel.csv`` and their coordinate arrays
    from ``data/hhsuite_CB_cullpdb_cath_funnel.h5``.  For each id,
    ``calc_modes_cart`` is called and columns ``6 .. 6 + n_modes`` of the
    returned ``v`` are written, reshaped to ``(n_modes, -1, 3)`` float32, as a
    dataset named after the id in
    ``data/hhsuite_CB_cullpdb_cath_funnel_normal_modes.h5``.
    """
    data_path = '../hhsuite/hhsuite_beads/hhsuite/'
    protein_sample = pd.read_csv('data/hhsuite_CB_cullpdb_cath_funnel.csv')
    pdb_selected = protein_sample['pdb'].values

    # Load all coordinates up front; the context manager guarantees the input
    # file is closed again (the handle used to be left open).
    with h5py.File('data/hhsuite_CB_cullpdb_cath_funnel.h5',
                   'r',
                   libver='latest',
                   swmr=True) as hh_data_coords:
        coords_dict = {
            pdb: hh_data_coords[pdb][()] for pdb in tqdm(pdb_selected)
        }

    n_modes = 10
    with h5py.File('data/hhsuite_CB_cullpdb_cath_funnel_normal_modes.h5',
                   'w') as f:
        for pdb_id in tqdm(pdb_selected):
            coords_native = torch.tensor(coords_dict[pdb_id],
                                         dtype=torch.float,
                                         device=device)
            # Only coordinates are available here, so seq and profile are None.
            protein = Protein(None, coords_native, None)
            _, v = calc_modes_cart(pdb_id, protein, data_path)
            # Skip the first 6 columns of v (presumably the rigid-body modes
            # -- TODO confirm) and keep the next n_modes, one mode per row.
            modes_vec = v[:, 6:6 + n_modes].transpose(0, 1).cpu().numpy()
            modes_vec = modes_vec.reshape(n_modes, -1, 3)
            f.create_dataset(f'{pdb_id}',
                             shape=modes_vec.shape,
                             data=modes_vec,
                             dtype='f4')
コード例 #2
0
def convert_rosetta_frags(query_pdb='2jsvX'):
    """Convert Rosetta 5-mer fragment picks for ``query_pdb`` to internal coords.

    Steps:
      1. Rewrite ``data/fragment/{query_pdb}/frags.fsc.200.5mers`` as a
         whitespace-separated ``.csv`` (first character of the header line
         dropped, other ``#`` lines removed) unless that file already exists.
      2. For every fragment row, locate the fragment in the matching
         ``{pdbid}_{chain}_bead.csv`` file and take the CB coordinates of the
         ``[-2, -1, frag, +1]`` residues.
      3. Convert those coordinates with ``Protein.cartesian_to_internal``.
      4. Write the kept rows to ``..._int.csv`` and the query positions and
         internal coordinates to ``{query_pdb}_int.h5``.

    Fragments are skipped when the bead file is missing, the position cannot
    be matched uniquely, flanking residues are missing, or the sequences do
    not match.  Matching can fail because the position ids used by Rosetta
    sometimes differ from the ids in the local PDB-derived files.

    Args:
        query_pdb: Identifier of the query; selects the
            ``data/fragment/{query_pdb}`` directory.

    Returns:
        None.  When no fragment survives the filters only the (empty)
        ``_int.csv`` is written and the HDF5 file is not created.
    """
    frag_len = 5
    amino_acids = pd.read_csv('data/amino_acids.csv')
    vocab = {x.upper(): y for x, y in zip(amino_acids.AA3C, amino_acids.AA)}

    data_path = f'data/fragment/{query_pdb}/frags.fsc.200.5mers'
    csv_path = data_path + '.csv'
    if not os.path.exists(csv_path):
        with open(data_path, 'rt') as f, open(csv_path, 'wt') as f2:
            header = f.readline()
            f2.write(header[1:])  # drop the leading marker of the header line
            f2.writelines(line for line in f if not line.startswith('#'))

    # Raw string: '\s' is an invalid escape sequence in a normal literal.
    df = pd.read_csv(csv_path, sep=r'\s+')
    pdb_id = df['pdbid'].apply(lambda x: x.upper()).values
    chain = df['c'].values
    pdb_pos = df['vall_pos'].values
    query_pos = df['query_pos'].values
    # NOTE(review): this is compared against the number of *mismatching*
    # residues below -- confirm that matches the meaning of SequenceIdentity.
    score_diff = np.round(df['SequenceIdentity'].values * frag_len)

    query_fasta = f'data/fragment/{query_pdb}/{query_pdb}.fasta'
    with open(query_fasta, 'rt') as f:
        f.readline()  # skip the FASTA header line
        # rstrip instead of [:-1]: do not lose the last residue when the file
        # has no trailing newline.
        query_seq = np.array(list(f.readline().rstrip('\n')))

    # Boolean mask of the rows that are kept (np.int no longer exists in
    # NumPy >= 1.24).
    keep = np.zeros(df.shape[0], dtype=bool)
    q_pos_all = []
    coords_int_all = []

    for i in tqdm(range(pdb_id.shape[0])):
        if query_pos[i] <= 2:
            # the two residues preceding the fragment are needed on the query
            continue
        beads_path = f'data/fragment/prep_frag/rosetta/{pdb_id[i]}_{chain[i]}_bead.csv'
        if not os.path.exists(beads_path):
            continue
        df_beads = pd.read_csv(beads_path)
        idx = np.arange(df_beads.shape[0])
        gnum = df_beads['group_num'].values
        k = idx[gnum + 1 == pdb_pos[i]]
        if len(k) != 1:
            # position not found, or ambiguous
            continue
        k = k[0]

        if k < 2 or k >= df_beads.shape[0] - frag_len:
            # No -2/ or +1 residues of the fragment
            continue
        if np.sum(gnum[k-1:k+frag_len+1] - gnum[k-2:k+frag_len]) != frag_len+2:
            # missing residues
            continue

        seq = df_beads['group_name'].apply(lambda x: vocab[x]).values
        seq_frag = seq[k:k+frag_len]
        seq_q = query_seq[query_pos[i]-1:query_pos[i]+frag_len-1]
        if seq_q.shape[0] != frag_len:
            # query window runs past the end of the query sequence; an
            # element-wise comparison of different lengths is not meaningful
            continue

        if np.sum(seq_frag != seq_q) != score_diff[i]:
            print('seq not match', pdb_id[i], seq_frag, seq_q)
            continue

        # coords of [-2, -1, frag, +1] residues
        coords = df_beads[['xcb', 'ycb', 'zcb']].values[k-2:k+frag_len+1]
        coords = torch.tensor(coords)
        protein = Protein(None, coords, None)
        coords_int = protein.cartesian_to_internal(coords)

        q_pos_all.append(query_pos[i] - 3)  # rosetta query pos starts from 1, -3 converts it to internal index
        coords_int_all.append(coords_int)
        keep[i] = True

    df[keep].to_csv(data_path+'_int.csv', index=False)
    q_pos_all = np.array(q_pos_all)  # (num_frag,)
    print(f'{query_pdb} total number of frags: {q_pos_all.shape[0]}')
    if not coords_int_all:
        # torch.stack raises on an empty list; there is nothing to store.
        return
    coords_int_all = torch.stack(coords_int_all, dim=0).numpy()  # (num_frag, frag_len, 3)
    with h5py.File(f'data/fragment/{query_pdb}/{query_pdb}_int.h5', 'w') as f:
        f.create_dataset("query_pos", shape=q_pos_all.shape, data=q_pos_all, dtype='i')
        f.create_dataset("coords_int", shape=coords_int_all.shape, data=coords_int_all, dtype='f4')
コード例 #3
0
ファイル: decoy_score.py プロジェクト: lahplover/nnef
                loss_terms = trainer.step(data)
                loss = 0
                for loss_i in loss_terms:
                    loss += loss_i.item()

                loss_all.append(loss)
    else:
        for decoy in decoy_list:
            if (decoy_set == 'casp13') | (decoy_set == 'casp14'):
                decoy_id = decoy
            else:
                decoy_id = decoy[:-4]
            seq, coords_native, profile = load_protein_decoy(
                pdb_id, decoy_id, args.mode, device, args)

            protein = Protein(seq, coords_native, profile)
            energy = protein.get_energy(energy_fn).item()
            # print('energy:', energy)
            # residue_energy = protein.get_residue_energy(energy_fn)
            # print(residue_energy)

            if args.relax:
                minimizer = GradMinimizerCartesian(energy_fn,
                                                   protein,
                                                   num_steps=args.relax_steps)
                minimizer.run()
                energy = minimizer.energy_best
                print('energy relaxed:', energy)
            loss_all.append(energy)

    print(pdb_id, loss_all[0])
コード例 #4
0
# Folding script excerpt: for every selected protein, evaluate the native
# structure, then start `num_iter` runs from a fresh copy of it.
# (The excerpt ends inside the inner loop; the rest is not visible here.)
# sample_ic = SampleICNext(mode)
exp_id = args.load_exp[-5:]  # last five characters of the experiment path
save_dir = args.save_dir
# if not os.path.exists(f'data/fold/{exp_id}'):
#     os.mkdir(f'data/fold/{exp_id}')
# Create the output directory below data/fold on first use.
if not os.path.exists(f'data/fold/{save_dir}'):
    os.mkdir(f'data/fold/{save_dir}')

# Number of runs started per protein.
num_iter = 20

for pdb_id in pdb_selected:

    seq, coords_native, profile = load_protein(data_path, pdb_id, mode, device,
                                               args)

    # Reference values of the native structure.
    protein_native = Protein(seq, coords_native, profile)
    energy_native = protein_native.get_energy(energy_fn).item()
    print('energy_native:', energy_native)
    rg2, collision = protein_native.get_rad_gyration(coords_native)
    print('native radius of gyration square:', rg2.item())
    # residue_energy = protein_native.get_residue_energy(energy_fn)
    # print(residue_energy)
    # write_pdb(seq, coords_native, pdb_id, 'native', exp_id)

    # Entry 0 of both lists is the native structure / native energy.
    sample_best_all = [coords_native.cpu()]
    energy_best_all = [energy_native]
    for i in tqdm(range(num_iter)):
        # Clone so that each run starts from unmodified native inputs.
        protein = Protein(seq, coords_native.clone(), profile.clone())
        if args.random_init:
            # random_coords_int = sample_ic.random_coords_int(len(seq)-3).to(device)
            # protein.update_coords_internal(random_coords_int)
コード例 #5
0
    beads_idx = np.append(ca_gly, cb)
    beads_idx = np.sort(beads_idx)
    print(beads_idx)
    return beads_idx


# Selects which MD data sets are processed; with the current value the
# 'Fip35' branch below is not executed.
# md_data_list = ['BPTI', 'Fip35', 'val_deep']
md_data_list = ['val_deep']

if 'Fip35' in md_data_list:
    # Two trajectory directories (ww_1 and ww_2) under the same root.
    root_dir = '/home/hyang/bio/erf/data/decoys/msm'
    trj_dir1 = f'{root_dir}/deshaw/DESRES-Trajectory-ww_1-protein/ww_1-protein/'
    trj_dir2 = f'{root_dir}/deshaw/DESRES-Trajectory-ww_2-protein/ww_2-protein/'

    # Energy of the native structure, printed for reference.
    seq_native, coords_native, profile_native = load_protein_bead(f'{root_dir}/fip35_bead.csv', 'CB', device)
    protein_native = Protein(seq_native, coords_native, profile_native)
    energy_native = protein_native.get_energy(energy_fn).item()
    print('native', energy_native)

    for trj_dir in [trj_dir1, trj_dir2]:
        structure = md.load(f'{trj_dir}/ww-protein.pdb')
        top = structure.topology
        # Per-bead table: 'beads_cb_index' and 'group_name' columns are read.
        df = pd.read_csv(f'{trj_dir}/ww-protein-beads.csv')
        cb_idx = df['beads_cb_index'].values
        seq = df['group_name'].values
        # Map each group name through `vocab` and use the ids as the profile.
        seq_id = df['group_name'].apply(lambda x: vocab[x]).values
        profile = torch.tensor(seq_id, dtype=torch.long, device=device)

        score_list = []
        # File names to process are listed in the 'fname' column of flist.txt.
        flist = pd.read_csv(f'{trj_dir}/flist.txt')['fname']
        for k, fname in enumerate(flist):
コード例 #6
0
ファイル: design_sample.py プロジェクト: lahplover/nnef
# Sequence-design script excerpt: for every selected protein, report the
# native energy and set up the starting profile for the design run.
# (The excerpt ends at the start of the design-engine branch.)
exp_id = args.load_exp[-5:]  # last five characters of the experiment path
save_dir = args.save_dir
# Create the output directory on first use.
if not os.path.exists(f'{root_dir}/{save_dir}'):
    os.mkdir(f'{root_dir}/{save_dir}')

for pdb_id in tqdm(pdb_selected):
    # Skip proteins whose profile output file already exists.
    if os.path.exists(f'{root_dir}/{save_dir}/{pdb_id}_profile.h5'):
        continue

    seq, coords_native, profile = load_protein(root_dir, pdb_id, mode, device, args)

    # skip long sequences
    # if len(seq) > 400:
    #     continue

    # Reference energies of the native structure.
    protein_native = Protein(seq, coords_native, profile)
    energy_native = protein_native.get_energy(energy_fn).item()
    print('energy_native:', energy_native)
    residue_energy = protein_native.get_residue_energy(energy_fn)
    print(profile)
    print(residue_energy)

    # Working copy; clones keep the native tensors untouched.
    protein = Protein(seq, coords_native.clone(), profile.clone())
    if args.random_init:
        # Replace the profile with random integers in [0, 20) of the same size.
        protein.profile = torch.randint(0, 20, profile.size(), device=profile.device)

    energy_init = protein.get_energy(energy_fn).item()
    print('energy_init:', energy_init)

    if design_engine != 'mutation':
        # simulated annealing
コード例 #7
0
# Decoy-scoring setup: read the run options, make sure the output directory
# exists and load the list of PDB ids to process.
decoy_set = args.decoy_set
decoy_loss_dir = args.decoy_loss_dir
root_dir = './data/fold/cullpdb_val_deep/'

# os.makedirs instead of shelling out to `mkdir -p`: no shell interpretation
# of the directory name, and exist_ok makes the call safe to repeat.
os.makedirs(f'{root_dir}/{decoy_loss_dir}', exist_ok=True)

pdb_selected = pd.read_csv(f'{root_dir}/sample.csv')['pdb'].values

# Score the native structure and every decoy of each selected protein.
# (The excerpt ends at the start of the optional relaxation branch.)
for pdb_id in tqdm(pdb_selected):
    # Skip proteins whose loss file for this decoy set already exists.
    if os.path.exists(
            f'{root_dir}/{decoy_loss_dir}/{pdb_id}_{decoy_set}_loss.csv'):
        continue
    seq, coords_native, profile = load_protein(root_dir, pdb_id, device)
    protein = Protein(seq, coords_native, profile)
    energy_native = protein.get_energy(energy_fn).item()

    # load decoy coordinates
    # NOTE(review): the file is opened without a context manager and no
    # close() is visible in this excerpt -- confirm it is closed later.
    decoy_file = h5py.File(f'{root_dir}/{pdb_id}_decoys_{decoy_set}.h5', 'r')
    coords_decoy = decoy_file['coords'][()]
    coords_decoy = torch.tensor(coords_decoy, dtype=torch.float, device=device)

    # Entry 0 of both lists is the native energy.
    loss_all = [energy_native]
    loss_relax = [energy_native]
    for i in tqdm(range(coords_decoy.shape[0])):
        # Reuse the same Protein object; only its coordinates are swapped.
        protein.coords = coords_decoy[i]
        energy = protein.get_energy(energy_fn).item()
        loss_all.append(energy)

        if args.relax:
コード例 #8
0
ファイル: normal_mode.py プロジェクト: lahplover/nnef
def nma_test():
    """Run the normal-mode analysis demo on every protein in the sample list.

    For each PDB id in ``data/normal_modes/sample.csv`` the function
    computes Cartesian and internal-coordinate modes, writes PDB "movies"
    (raw and aligned) for six modes of each kind, and stores decoys built
    from mixtures of four modes in ``{pdb_id}_decoys_cart.h5`` and
    ``{pdb_id}_decoys_int.h5`` under the same directory.
    """
    data_path = 'data/normal_modes'
    pdb_selected = pd.read_csv(f'{data_path}/sample.csv')['pdb'].values

    n_frames = 7  # frames per movie / samples passed as `num`
    n_movie_modes = 6
    n_mix_modes = 4
    first_mode = 6  # modes are taken starting at column 6 of the mode matrix
    ref_frame = n_frames // 2  # frame index passed to align_rmsd

    for pdb_id in pdb_selected:
        seq, coords_native, profile = load_protein(data_path, pdb_id, device)

        # Each mode calculation gets its own Protein on cloned coordinates.
        protein = Protein(seq, coords_native.clone(), profile)
        e, v = calc_modes_cart(pdb_id, protein, data_path)

        protein = Protein(seq, coords_native.clone(), profile)
        e_int, v_int = calc_modes_int(pdb_id, protein, data_path)

        # Movies of single modes in Cartesian space.
        for i in range(n_movie_modes):
            dxyz = v[:, first_mode + i].reshape((-1, 3))
            # make the norm of each mode to 0.5 A
            dxyz = 0.5 * dxyz * np.sqrt(dxyz.shape[0])
            frames = excite_mode_cart(coords_native,
                                      dxyz,
                                      num=n_frames,
                                      scale=1)
            write_pdb_sample(frames, seq, pdb_id, data_path, nm=f'mode{i}')
            print((torch.sum(dxyz**2) / dxyz.shape[0])**0.5)

            sample_rmsd, frames_aligned = align_rmsd(frames, ref_frame)
            print(sample_rmsd)
            write_pdb_sample(frames_aligned,
                             seq,
                             pdb_id,
                             data_path,
                             nm=f'mode{i}_align')

        # Movies of single modes in torsional space.
        for i in range(n_movie_modes):
            dphi = v_int[:, first_mode + i]
            frames = excite_mode_int(protein, dphi, num=n_frames, scale=0.5)
            write_pdb_sample(frames, seq, pdb_id, data_path, nm=f'mode_int{i}')

            sample_rmsd, frames_aligned = align_rmsd(frames, ref_frame)
            print(sample_rmsd)
            write_pdb_sample(frames_aligned,
                             seq,
                             pdb_id,
                             data_path,
                             nm=f'mode_int{i}_align')

        # Decoys from linear combinations of low-frequency Cartesian modes.
        cart_modes = v[:, first_mode:first_mode + n_mix_modes].transpose(0, 1)
        decoys = mix_modes_cart(coords_native,
                                cart_modes,
                                num=n_frames,
                                scale=4.0)
        with h5py.File(f'{data_path}/{pdb_id}_decoys_cart.h5', 'w') as f:
            f.create_dataset("coords",
                             shape=decoys.shape,
                             data=decoys.detach().cpu().numpy(),
                             dtype='f4')

        # Decoys from linear combinations of low-frequency torsional modes.
        int_modes = v_int[:, first_mode:first_mode + n_mix_modes].transpose(0, 1)
        decoys = mix_modes_int(protein, int_modes, num=n_frames, scale=0.5)
        with h5py.File(f'{data_path}/{pdb_id}_decoys_int.h5', 'w') as f:
            f.create_dataset("coords",
                             shape=decoys.shape,
                             data=decoys.detach().cpu().numpy(),
                             dtype='f4')