def pipeline_mapping(msa_name, df_dca, read=False, a2m=True):
    """Map DCA indices to PDB-distance-matrix indices.

    :param msa_name: name of the MSA system (e.g. "1ABC_A_1ABC_B.fas")
    :param df_dca: DataFrame of DCA pairs; columns map to i, j, fn_apc, fn, ui, uj
    :param read: if True, load a precomputed reference map from disk instead
        of realigning
    :param a2m: if True, derive DCA chain lengths from a2m region indices;
        otherwise use the uniprot lengths directly
    :return: DataFrame of the mapped DCA pairs with integer index columns
    """
    from msa_functions import read_first_sequence_in_msa
    from read_db import get_lengths
    from get_residues import get_residues
    from get_region import get_dca_indices
    from mapping_functions import align_dca2pdb, apply_map

    print("(pipeline mapping)")
    if read:
        infile = "reference_maps\\ref_map_{}.txt".format(msa_name.strip(".fas"))
        ref = pd.read_csv(infile, delimiter="\t", header=0, dtype=str)
        # some pdbs have unknown seq res UNK
        ref = ref.replace("X", np.nan).dropna()
        map_to_pdb = dict(zip(ref["dca_i"], ref["pdb_i"]))
    else:
        uniprot_lengths = get_lengths(msa_name)
        if a2m:
            _, dca_lengths, _ = get_dca_indices(msa_name, uniprot_lengths[0])
        else:
            dca_lengths = uniprot_lengths
        # -- GET MAP FROM MSA TO PDB --
        pdbseq = list(get_residues(msa_name, seq=True))
        # splits msa sequence based on modified uniprot lengths (removed lowercase)
        msaseq = read_first_sequence_in_msa(msa_name, split=True,
                                            len_a=dca_lengths[0])
        map_to_pdb = align_dca2pdb(msa_name, pdbseq, msaseq)

    mapped_dca_array = apply_map(df_dca.to_numpy(), map_to_pdb)
    df_dca_mapped = pd.DataFrame(
        mapped_dca_array, columns=['i', 'j', 'fn_apc', 'fn', 'ui', 'uj'])
    for column in ('i', 'j', 'ui', 'uj'):
        df_dca_mapped[column] = df_dca_mapped[column].astype(int)
    return df_dca_mapped
def scramble_sequence(msa_name, n_replicates):
    """Create scrambled (randomly re-paired) replicates of a paired MSA.

    For each replicate, chain-A and chain-B sequences are paired using two
    independent random index permutations and written out as a new MSA file.

    :param msa_name: name of the paired MSA system
    :param n_replicates: number of scrambled replicate MSAs to write
    :return: list of output file paths, one per replicate
    """
    from get_region import get_dca_indices
    from read_db import get_lengths

    # The first character of the PDB id selects the output subdirectory.
    # (Was a 1-5 if/elif chain that raised NameError for any other leading
    # character; using the character directly is equivalent for '1'-'5' and
    # works for every PDB id.)
    pdbid_start_number = msa_name[0]
    results_dir = "scrambled_sequences_nots\\pdbid_{}\\{}\\".format(
        pdbid_start_number, msa_name)
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)

    print("\tScramble {}".format(msa_name))
    uniprot_lengths = get_lengths(msa_name)
    _, chain_length, _ = get_dca_indices(msa_name, uniprot_lengths[0])
    print(chain_length[0], uniprot_lengths[0])
    header_a, header_b, seq_a, seq_b = split_header_seq(
        msa_name, chain_length[0])

    n_seqs = len(seq_b)
    # Two lists of random indices per replicate: one for seq A, one for seq B.
    random_index = list(permute_index(n_seqs, n_replicates))
    outfile = []
    for rep in range(n_replicates):
        scramble_msa_dict = {}
        for i in range(n_seqs):
            idx_a = random_index[rep][0][i]
            idx_b = random_index[rep][1][i]
            header = header_a[idx_a] + '_' + header_b[idx_b]
            scramble_msa_dict[header] = seq_a[idx_a] + seq_b[idx_b]
        # Write MSA replicate to file
        outfile.append('{}{}_rep{}_scrambled.fas'.format(
            results_dir, msa_name, rep))
        with open(outfile[rep], 'w', encoding='utf-8') as f:
            for key, seq in scramble_msa_dict.items():
                f.write(">{}\n{}\n".format(key, seq))
    return outfile
def make_monomer_msa_from_dimer(msa):
    """Split a paired dimer MSA into two monomer fasta files.

    :param msa: dimer MSA name, e.g. "2OXG_Z_2OXG_Y" — chain ids are taken
        from the 2nd and 4th underscore-separated fields
    """
    from scramble_sequence import split_header_seq
    from read_db import get_lengths
    from get_region import get_dca_indices

    out_dir = "monomer_alignments\\"
    parts = msa.split("_")
    chain_ids = [parts[1], parts[3]]
    lengths = get_lengths(msa)
    _, dca_chains, _ = get_dca_indices(msa, length_a=lengths[0])
    # split_header_seq returns (header_a, header_b, seq_a, seq_b)
    split = split_header_seq(msa, dca_chains[0])
    for idx, cid in enumerate(chain_ids):
        # pair each header with its sequence, one (header, seq) row per entry
        records = np.array([split[idx], split[idx + 2]]).transpose()
        np.savetxt("{}{}_{}.fas".format(out_dir, msa[:4], cid),
                   records, fmt=">%s\n%s")
def monomer_restraint(sysName, df, cutoff):
    """Build the list of intra-chain DCA pairs that are NOT contacts in the PDB.

    Enumerates every intra-chain (i, j) pair in DCA numbering, maps the PDB
    contact pairs (distance <= cutoff) into DCA numbering via the reference
    map, and returns the set difference as an exclusion list.

    :param sysName: system/MSA name used to locate lengths and the reference map
    :param df: DataFrame of PDB pair distances; must contain columns
        "chain_1", "chain_2", "d", with the pair indices in the first two columns
    :param cutoff: distance threshold; pairs with d <= cutoff count as contacts
    :return: ndarray of shape (n, 2) of DCA pairs to exclude (non-contacts)
    """
    from read_db import get_lengths
    from get_region import get_dca_indices
    from mapping_functions import apply_map
    ch = get_lengths(sysName)
    _, dca_ch, _ = get_dca_indices(sysName, ch[0])
    print(dca_ch)
    # Enumerate all i < j pairs within each chain; chain-2 pairs are offset
    # by dca_ch[0] so both chains share one continuous DCA numbering.
    msa_pairs = []
    protein1 = []
    protein2 = []
    for i in range(1, dca_ch[0]):
        for j in range(i + 1, dca_ch[0] + 1):
            protein1.append([i, j])
    for i in range(1, dca_ch[1]):
        for j in range(i + 1, dca_ch[1] + 1):
            protein2.append([i + dca_ch[0], j + dca_ch[0]])
    msa_pairs = np.array(protein1 + protein2)
    referenceMap = "results\\reference_maps\\ref_map_{}.txt".format(sysName)
    # Keep only intra-chain pairs within the distance cutoff (the contacts).
    dfMonomer = df[df["chain_1"] == df["chain_2"]]
    dfMonomer = dfMonomer[dfMonomer["d"] <= cutoff].reset_index(drop=True)
    monomer_array = dfMonomer.iloc[:, :3].to_numpy()
    map_pdb_dca = pd.read_csv(referenceMap, delimiter="\t", header=0, dtype=str)
    map_pdb_dca = map_pdb_dca.replace(
        "?", np.nan).dropna()  # some pdbs have unknown seq res UNK
    # pdb index -> dca index (string keys/values, as read from the map file)
    map_to_dca = dict(zip(map_pdb_dca["pdb_i"], map_pdb_dca["dca_i"]))
    mappedArray = apply_map(monomer_array, map_to_dca)
    r = mappedArray[:, :2]
    r = r.astype(dtype='int')
    # Structured-array views let np.setdiff1d treat each (i, j) row as a single
    # element, so the difference is computed row-wise rather than element-wise.
    msa_row = msa_pairs.view([('', msa_pairs.dtype)] * msa_pairs.shape[1])
    pdb_row = r.view([('', r.dtype)] * r.shape[1])
    exclusions_list = np.setdiff1d(msa_row, pdb_row).view(msa_pairs.dtype).reshape(
        -1, msa_pairs.shape[-1])
    # Sanity check: every enumerated pair is either a contact or an exclusion
    # (assumes every mapped contact pair appears in msa_pairs exactly once).
    assert len(msa_pairs) - len(r) == len(exclusions_list)
    return exclusions_list
def map_dict(msa_name):
    """Build index-translation dictionaries between uniprot, pdb and dca numbering.

    Chain-2 start/end indices are shifted past the last chain-1 end index (+1)
    so both chains live in one continuous numbering before the mappings are built.

    :param msa_name: system name "PDBID_C1_PDBID_C2"
    :return: tuple (uni2pdb, dca2uni, dca2pdb, pdb2uni) of dictionaries
    """
    import pandas as pd
    import numpy as np
    from get_region import get_dca_indices

    sifts_table_file = "databases/sifts/pdb_chain_uniprot_plus.csv"
    s = pd.read_csv(sifts_table_file, comment="#")
    pdbid = msa_name[:4].lower()
    fields = msa_name.split("_")
    chain_1, chain_2 = fields[1], fields[3]

    # Rows of the SIFTS table for each chain of this pdb entry.
    rows_1 = s[(s.pdb_id == pdbid) & (s.pdb_chain == chain_1)]
    rows_2 = s[(s.pdb_id == pdbid) & (s.pdb_chain == chain_2)]

    def _ints(series):
        # Column values as plain Python ints.
        return [int(v) for v in series.values]

    # pdb numbering
    pdb_start_1 = _ints(rows_1.coord_start)
    pdb_end_1 = _ints(rows_1.coord_end)
    # shift chain-2 indices past the last chain-1 end index (+1)
    pdb_offset = pdb_end_1[-1] + 1
    pdb_start_2 = [int(v) + pdb_offset for v in rows_2.coord_start.values]
    pdb_end_2 = [int(v) + pdb_offset for v in rows_2.coord_end.values]

    # uniprot numbering
    uni_start_1 = _ints(rows_1.uniprot_start)
    uni_end_1 = _ints(rows_1.uniprot_end)
    uni_offset = uni_end_1[-1] + 1
    uni_start_2 = [int(v) + uni_offset for v in rows_2.uniprot_start.values]
    uni_end_2 = [int(v) + uni_offset for v in rows_2.uniprot_end.values]

    pdb_indices = make_indices(pdb_start_1 + pdb_start_2,
                               pdb_end_1 + pdb_end_2)
    uniprot_indices = make_indices(uni_start_1 + uni_start_2,
                                   uni_end_1 + uni_end_2)
    dca_indices = get_dca_indices(msa_name)

    uni2pdb = dict(zip(uniprot_indices, pdb_indices))
    dca2uni = dict(zip(dca_indices, uniprot_indices))
    dca2pdb = dict(zip(dca_indices, pdb_indices))
    pdb2uni = dict(zip(pdb_indices, uniprot_indices))
    return uni2pdb, dca2uni, dca2pdb, pdb2uni