def nma_funnel():
    """Compute Cartesian normal modes for every protein in the funnel set.

    Reads the PDB ids from ``data/hhsuite_CB_cullpdb_cath_funnel.csv`` and the
    matching coordinates from ``data/hhsuite_CB_cullpdb_cath_funnel.h5``, runs
    ``calc_modes_cart`` on each structure, and stores ``n_modes`` mode vectors
    per protein (columns 6 .. 6 + n_modes - 1 of the returned matrix) in
    ``data/hhsuite_CB_cullpdb_cath_funnel_normal_modes.h5``, one ``float32``
    dataset of shape ``(n_modes, -1, 3)`` per PDB id.

    Relies on the module-level ``device`` and on ``Protein`` /
    ``calc_modes_cart`` defined elsewhere in the project.
    """
    data_path = '../hhsuite/hhsuite_beads/hhsuite/'
    protein_sample = pd.read_csv('data/hhsuite_CB_cullpdb_cath_funnel.csv')
    # data_path = f'data/fold/cullpdb_val_deep'
    # protein_sample = pd.read_csv(f'{data_path}/sample.csv')
    pdb_selected = protein_sample['pdb'].values

    # Load every coordinate array up front so a missing PDB id fails before the
    # output file is truncated.  The context manager guarantees the input file
    # is closed again (the handle used to be left open for the process lifetime).
    coords_dict = {}
    with h5py.File('data/hhsuite_CB_cullpdb_cath_funnel.h5', 'r',
                   libver='latest', swmr=True) as hh_data_coords:
        for pdb in tqdm(pdb_selected):
            coords_dict[pdb] = hh_data_coords[pdb][()]

    n_modes = 10
    with h5py.File('data/hhsuite_CB_cullpdb_cath_funnel_normal_modes.h5', 'w') as f:
        for pdb_id in tqdm(pdb_selected):
            coords_native = torch.tensor(coords_dict[pdb_id], dtype=torch.float,
                                         device=device)
            # seq, coords_native, profile = load_protein(data_path, pdb_id, device)
            protein = Protein(None, coords_native, None)
            e, v = calc_modes_cart(pdb_id, protein, data_path)

            # NOTE(review): the first six columns are skipped - presumably the
            # rigid-body translation/rotation modes; confirm in calc_modes_cart.
            modes_vec = v[:, 6:6 + n_modes].transpose(0, 1).cpu().numpy().reshape(
                n_modes, -1, 3)
            f.create_dataset(f'{pdb_id}', shape=modes_vec.shape, data=modes_vec,
                             dtype='f4')
def convert_rosetta_frags(query_pdb='2jsvX'):
    """Convert Rosetta 5-mer fragment picks into internal-coordinate fragments.

    For every row of ``data/fragment/{query_pdb}/frags.fsc.200.5mers`` the
    source chain's bead file is looked up, the CB coordinates of the fragment
    plus two residues before and one residue after it are extracted and
    converted to internal coordinates with ``Protein.cartesian_to_internal``.

    Sometimes the lookup does not work, because the position id used by
    Rosetta differs from the ids used in the local PDB bead files; such rows
    are skipped.

    Files written:
        * ``<frag file>.csv``      - whitespace table without comment lines
          (only created when it does not exist yet).
        * ``<frag file>_int.csv``  - the rows of that table that were converted.
        * ``data/fragment/{query_pdb}/{query_pdb}_int.h5`` - datasets
          ``query_pos`` (int) and ``coords_int`` (float32).

    Args:
        query_pdb: Name of the query directory / FASTA file under
            ``data/fragment``.

    Raises:
        RuntimeError: If no fragment could be converted.
    """
    frag_len = 5
    amino_acids = pd.read_csv('data/amino_acids.csv')
    vocab = {x.upper(): y for x, y in zip(amino_acids.AA3C, amino_acids.AA)}

    data_path = f'data/fragment/{query_pdb}/frags.fsc.200.5mers'
    csv_path = data_path + '.csv'
    if not os.path.exists(csv_path):
        with open(data_path, 'rt') as f, open(csv_path, 'wt') as f2:
            # The first line carries the column names; its first character is
            # dropped (presumably the leading '#') so pandas can parse it.
            f2.write(f.readline()[1:])
            # Stream the rest instead of materialising it with readlines().
            f2.writelines(line for line in f if not line.startswith('#'))

    # Raw string: '\s' is an invalid escape sequence in a normal literal.
    df = pd.read_csv(csv_path, sep=r'\s+')
    pdb_id = df['pdbid'].apply(str.upper).values
    chain = df['c'].values
    pdb_pos = df['vall_pos'].values
    query_pos = df['query_pos'].values
    # NOTE(review): compared below against the number of mismatching residues,
    # so SequenceIdentity is assumed to be a per-residue mismatch fraction.
    score_diff = np.round(df['SequenceIdentity'].values * frag_len)

    query_fasta = f'data/fragment/{query_pdb}/{query_pdb}.fasta'
    with open(query_fasta, 'rt') as f:
        f.readline()  # skip the FASTA header line
        # strip() instead of [:-1]: keeps the last residue when the file has no
        # trailing newline and drops a stray '\r' from CRLF files.
        query_seq = np.array(list(f.readline().strip()))

    # Boolean mask of converted rows (np.int was removed in NumPy 1.24).
    keep = np.zeros(df.shape[0], dtype=bool)
    q_pos_all = []
    coords_int_all = []
    for i in tqdm(range(pdb_id.shape[0])):
        # The stored index is query_pos - 3, so positions 1 and 2 would go negative.
        if query_pos[i] <= 2:
            continue
        beads_path = f'data/fragment/prep_frag/rosetta/{pdb_id[i]}_{chain[i]}_bead.csv'
        if not os.path.exists(beads_path):
            continue
        df_beads = pd.read_csv(beads_path)

        # gnum = df_beads['group_num_pdb'].values
        gnum = df_beads['group_num'].values
        # Rows whose (group_num + 1) equals the Rosetta position; exactly one
        # hit is required.
        matches = np.flatnonzero(gnum + 1 == pdb_pos[i])
        if len(matches) != 1:
            continue
        k = matches[0]

        if k < 2 or k >= df_beads.shape[0] - frag_len:
            # No -2/ or +1 residues of the fragment
            continue
        if np.sum(gnum[k-1:k+frag_len+1] - gnum[k-2:k+frag_len]) != frag_len + 2:
            # missing residues
            continue

        seq = df_beads['group_name'].apply(lambda x: vocab[x]).values
        seq_frag = seq[k:k+frag_len]
        seq_q = query_seq[query_pos[i]-1:query_pos[i]+frag_len-1]
        if np.sum(seq_frag != seq_q) != score_diff[i]:
            print('seq not match', pdb_id[i], seq_frag, seq_q)
            continue

        # coords of [-2, -1, frag, +1] residues
        coords = df_beads[['xcb', 'ycb', 'zcb']].values[k-2:k+frag_len+1]
        coords = torch.tensor(coords)
        protein = Protein(None, coords, None)
        coords_int = protein.cartesian_to_internal(coords)

        # rosetta query pos starts from 1, -3 converts it to internal index
        q_pos_all.append(query_pos[i] - 3)
        coords_int_all.append(coords_int)
        keep[i] = True

    df[keep].to_csv(data_path + '_int.csv', index=False)

    if not coords_int_all:
        # torch.stack would fail on an empty list with an opaque message.
        raise RuntimeError(
            f'{query_pdb}: no fragment could be converted to internal coordinates')

    q_pos_all = np.array(q_pos_all)  # (num_frag,)
    coords_int_all = torch.stack(coords_int_all, dim=0).numpy()  # (num_frag, frag_len, 3)
    print(f'{query_pdb} total number of frags: {q_pos_all.shape[0]}')

    with h5py.File(f'data/fragment/{query_pdb}/{query_pdb}_int.h5', 'w') as f:
        f.create_dataset("query_pos", shape=q_pos_all.shape, data=q_pos_all, dtype='i')
        f.create_dataset("coords_int", shape=coords_int_all.shape,
                         data=coords_int_all, dtype='f4')
loss_terms = trainer.step(data) loss = 0 for loss_i in loss_terms: loss += loss_i.item() loss_all.append(loss) else: for decoy in decoy_list: if (decoy_set == 'casp13') | (decoy_set == 'casp14'): decoy_id = decoy else: decoy_id = decoy[:-4] seq, coords_native, profile = load_protein_decoy( pdb_id, decoy_id, args.mode, device, args) protein = Protein(seq, coords_native, profile) energy = protein.get_energy(energy_fn).item() # print('energy:', energy) # residue_energy = protein.get_residue_energy(energy_fn) # print(residue_energy) if args.relax: minimizer = GradMinimizerCartesian(energy_fn, protein, num_steps=args.relax_steps) minimizer.run() energy = minimizer.energy_best print('energy relaxed:', energy) loss_all.append(energy) print(pdb_id, loss_all[0])
# sample_ic = SampleICNext(mode) exp_id = args.load_exp[-5:] save_dir = args.save_dir # if not os.path.exists(f'data/fold/{exp_id}'): # os.mkdir(f'data/fold/{exp_id}') if not os.path.exists(f'data/fold/{save_dir}'): os.mkdir(f'data/fold/{save_dir}') num_iter = 20 for pdb_id in pdb_selected: seq, coords_native, profile = load_protein(data_path, pdb_id, mode, device, args) protein_native = Protein(seq, coords_native, profile) energy_native = protein_native.get_energy(energy_fn).item() print('energy_native:', energy_native) rg2, collision = protein_native.get_rad_gyration(coords_native) print('native radius of gyration square:', rg2.item()) # residue_energy = protein_native.get_residue_energy(energy_fn) # print(residue_energy) # write_pdb(seq, coords_native, pdb_id, 'native', exp_id) sample_best_all = [coords_native.cpu()] energy_best_all = [energy_native] for i in tqdm(range(num_iter)): protein = Protein(seq, coords_native.clone(), profile.clone()) if args.random_init: # random_coords_int = sample_ic.random_coords_int(len(seq)-3).to(device) # protein.update_coords_internal(random_coords_int)
beads_idx = np.append(ca_gly, cb) beads_idx = np.sort(beads_idx) print(beads_idx) return beads_idx # md_data_list = ['BPTI', 'Fip35', 'val_deep'] md_data_list = ['val_deep'] if 'Fip35' in md_data_list: root_dir = '/home/hyang/bio/erf/data/decoys/msm' trj_dir1 = f'{root_dir}/deshaw/DESRES-Trajectory-ww_1-protein/ww_1-protein/' trj_dir2 = f'{root_dir}/deshaw/DESRES-Trajectory-ww_2-protein/ww_2-protein/' seq_native, coords_native, profile_native = load_protein_bead(f'{root_dir}/fip35_bead.csv', 'CB', device) protein_native = Protein(seq_native, coords_native, profile_native) energy_native = protein_native.get_energy(energy_fn).item() print('native', energy_native) for trj_dir in [trj_dir1, trj_dir2]: structure = md.load(f'{trj_dir}/ww-protein.pdb') top = structure.topology df = pd.read_csv(f'{trj_dir}/ww-protein-beads.csv') cb_idx = df['beads_cb_index'].values seq = df['group_name'].values seq_id = df['group_name'].apply(lambda x: vocab[x]).values profile = torch.tensor(seq_id, dtype=torch.long, device=device) score_list = [] flist = pd.read_csv(f'{trj_dir}/flist.txt')['fname'] for k, fname in enumerate(flist):
exp_id = args.load_exp[-5:] save_dir = args.save_dir if not os.path.exists(f'{root_dir}/{save_dir}'): os.mkdir(f'{root_dir}/{save_dir}') for pdb_id in tqdm(pdb_selected): if os.path.exists(f'{root_dir}/{save_dir}/{pdb_id}_profile.h5'): continue seq, coords_native, profile = load_protein(root_dir, pdb_id, mode, device, args) # skip long sequences # if len(seq) > 400: # continue protein_native = Protein(seq, coords_native, profile) energy_native = protein_native.get_energy(energy_fn).item() print('energy_native:', energy_native) residue_energy = protein_native.get_residue_energy(energy_fn) print(profile) print(residue_energy) protein = Protein(seq, coords_native.clone(), profile.clone()) if args.random_init: protein.profile = torch.randint(0, 20, profile.size(), device=profile.device) energy_init = protein.get_energy(energy_fn).item() print('energy_init:', energy_init) if design_engine != 'mutation': # simulated annealing
decoy_set = args.decoy_set decoy_loss_dir = args.decoy_loss_dir root_dir = f'./data/fold/cullpdb_val_deep/' if not os.path.exists(f'{root_dir}/{decoy_loss_dir}'): os.system(f'mkdir -p {root_dir}/{decoy_loss_dir}') pdb_selected = pd.read_csv(f'{root_dir}/sample.csv')['pdb'].values for pdb_id in tqdm(pdb_selected): if os.path.exists( f'{root_dir}/{decoy_loss_dir}/{pdb_id}_{decoy_set}_loss.csv'): continue seq, coords_native, profile = load_protein(root_dir, pdb_id, device) protein = Protein(seq, coords_native, profile) energy_native = protein.get_energy(energy_fn).item() # load decoy coordinates decoy_file = h5py.File(f'{root_dir}/{pdb_id}_decoys_{decoy_set}.h5', 'r') coords_decoy = decoy_file['coords'][()] coords_decoy = torch.tensor(coords_decoy, dtype=torch.float, device=device) loss_all = [energy_native] loss_relax = [energy_native] for i in tqdm(range(coords_decoy.shape[0])): protein.coords = coords_decoy[i] energy = protein.get_energy(energy_fn).item() loss_all.append(energy) if args.relax:
def nma_test():
    """Write normal-mode movies and mode-mixing decoys for the test proteins.

    For every PDB id listed in ``data/normal_modes/sample.csv`` this:

    1. computes Cartesian modes (``calc_modes_cart``) and internal-coordinate
       modes (``calc_modes_int``);
    2. writes a raw and an aligned PDB movie for six modes of each kind,
       starting at column 6 of the mode matrix;
    3. writes decoys built from four mixed modes of each kind to
       ``{pdb_id}_decoys_cart.h5`` and ``{pdb_id}_decoys_int.h5``.

    Uses the module-level ``device`` and project helpers defined elsewhere.
    """
    # data_path = f'data/fold/cullpdb_val_deep'
    data_path = 'data/normal_modes'
    sample_table = pd.read_csv(f'{data_path}/sample.csv')

    n_frames = 7        # passed as ``num`` to the excite_* / mix_* helpers
    n_movie_modes = 6   # how many modes get a movie
    n_mix_modes = 4     # how many modes are combined into decoys
    mode_offset = 6     # index of the first mode column that is used
    centre_frame = n_frames // 2

    def _dump_coords(h5_path, coords):
        """Store ``coords`` as a float32 dataset named "coords" in ``h5_path``."""
        with h5py.File(h5_path, 'w') as h5_out:
            h5_out.create_dataset("coords", shape=coords.shape,
                                  data=coords.detach().cpu().numpy(), dtype='f4')

    for pdb_id in sample_table['pdb'].values:
        seq, coords_native, profile = load_protein(data_path, pdb_id, device)

        cart_protein = Protein(seq, coords_native.clone(), profile)
        _, v = calc_modes_cart(pdb_id, cart_protein, data_path)

        # A fresh Protein is used for the internal-coordinate analysis; the same
        # object is reused by every excite/mix call below.
        protein = Protein(seq, coords_native.clone(), profile)
        _, v_int = calc_modes_int(pdb_id, protein, data_path)

        # make movies of the normal modes in cartesian space
        for mode_idx in range(n_movie_modes):
            column = v[:, mode_offset + mode_idx]
            # column = v[:, -mode_idx]
            dxyz = column.reshape((-1, 3))
            n_beads = dxyz.shape[0]
            # make the norm of each mode to 0.5 A
            dxyz = 0.5 * dxyz * np.sqrt(n_beads)

            frames = excite_mode_cart(coords_native, dxyz, num=n_frames, scale=1)
            write_pdb_sample(frames, seq, pdb_id, data_path, nm=f'mode{mode_idx}')
            print((torch.sum(dxyz**2) / dxyz.shape[0])**0.5)

            sample_rmsd, frames_aligned = align_rmsd(frames, centre_frame)
            print(sample_rmsd)
            write_pdb_sample(frames_aligned, seq, pdb_id, data_path,
                             nm=f'mode{mode_idx}_align')

        # make movies of the normal modes in torsional space
        for mode_idx in range(n_movie_modes):
            dphi = v_int[:, mode_offset + mode_idx]
            # dphi = v_int[:, -mode_idx]
            frames = excite_mode_int(protein, dphi, num=n_frames, scale=0.5)
            write_pdb_sample(frames, seq, pdb_id, data_path, nm=f'mode_int{mode_idx}')

            sample_rmsd, frames_aligned = align_rmsd(frames, centre_frame)
            print(sample_rmsd)
            write_pdb_sample(frames_aligned, seq, pdb_id, data_path,
                             nm=f'mode_int{mode_idx}_align')

        # make decoys by linear combinations of low freq modes in cartesian space
        cart_modes = v[:, mode_offset:mode_offset + n_mix_modes].transpose(0, 1)
        cart_decoys = mix_modes_cart(coords_native, cart_modes, num=n_frames, scale=4.0)
        _dump_coords(f'{data_path}/{pdb_id}_decoys_cart.h5', cart_decoys)
        # write_pdb_sample(cart_decoys, seq, pdb_id, data_path, nm=f'mode_mix')

        # make decoys by linear combinations of low freq modes in torsional space
        int_modes = v_int[:, mode_offset:mode_offset + n_mix_modes].transpose(0, 1)
        int_decoys = mix_modes_int(protein, int_modes, num=n_frames, scale=0.5)
        _dump_coords(f'{data_path}/{pdb_id}_decoys_int.h5', int_decoys)