def pipeline_mapping(msa_name, df_dca, read=False, a2m=True):
    """
    Map DCA indices to PDB-distance-matrix indices.

    :param msa_name: MSA file name (e.g. "XXXX_A_XXXX_B.fas"); used to
                     locate the reference map and the PDB/MSA sequences
    :param df_dca: DataFrame of DCA pairs; converted to a numpy array of
                   columns ['i', 'j', 'fn_apc', 'fn', 'ui', 'uj']
    :param read: if True, load a precomputed dca->pdb map from
                 "reference_maps\\ref_map_<name>.txt" instead of aligning
                 the MSA sequence to the PDB sequences
    :param a2m: if True, chain lengths come from get_dca_indices
                (a2m-style, lowercase columns removed); otherwise the raw
                uniprot lengths are used as-is
    :return: DataFrame with columns ['i', 'j', 'fn_apc', 'fn', 'ui', 'uj'],
             index columns cast to int, remapped to PDB numbering
    """
    from msa_functions import read_first_sequence_in_msa
    from read_db import get_lengths
    from get_residues import get_residues
    from get_region import get_dca_indices
    from mapping_functions import align_dca2pdb, apply_map
    print("(pipeline mapping)")
    if read:
        # BUGFIX: the original used msa_name.strip(".fas"), which strips any
        # of the characters {'.', 'f', 'a', 's'} from BOTH ends of the name
        # (e.g. a name ending in "...as.fas" loses too much). Remove only a
        # trailing ".fas" extension.
        base = msa_name[:-len(".fas")] if msa_name.endswith(".fas") else msa_name
        infile = "reference_maps\\ref_map_{}.txt".format(base)
        map_pdb_dca = pd.read_csv(infile, delimiter="\t", header=0, dtype=str)
        # Drop rows where the pdb sequence has unknown residues (UNK -> "X")
        map_pdb_dca = map_pdb_dca.replace("X", np.nan).dropna()
        map_to_pdb = dict(zip(map_pdb_dca["dca_i"], map_pdb_dca["pdb_i"]))

    else:
        uniprot_lengths = get_lengths(msa_name)
        if a2m:
            _, dca_lengths, _ = get_dca_indices(msa_name, uniprot_lengths[0])
        else:
            dca_lengths = uniprot_lengths

        # -- GET MAP FROM MSA TO PDB --
        pdbseq_1, pdbseq_2 = get_residues(msa_name, seq=True)
        pdbseq = [pdbseq_1, pdbseq_2]
        # splits msa sequence based on modified uniprot lengths (removed lowercase)
        msaseq = read_first_sequence_in_msa(msa_name,
                                            split=True,
                                            len_a=dca_lengths[0])

        map_to_pdb = align_dca2pdb(msa_name, pdbseq, msaseq)

    # Remap every pair index through the dca->pdb dictionary
    mapped_dca_array = apply_map(df_dca.to_numpy(), map_to_pdb)
    df_dca_mapped = pd.DataFrame(
        mapped_dca_array, columns=['i', 'j', 'fn_apc', 'fn', 'ui', 'uj'])
    # Index columns come back as strings/objects from the map; cast to int
    for col in ('i', 'j', 'ui', 'uj'):
        df_dca_mapped[col] = df_dca_mapped[col].astype(int)

    return df_dca_mapped
# Example #2
def scramble_sequence(msa_name, n_replicates):
    """
    Write n_replicates scrambled MSA replicates, pairing chain-A and
    chain-B sequences from random (independently permuted) rows.

    :param msa_name: dimer MSA name; its first character buckets the
                     output directory by PDB-id leading digit
    :param n_replicates: number of scrambled FASTA replicates to write
    :return: list of output file paths, one per replicate
    """
    from get_region import get_dca_indices
    from read_db import get_lengths

    # BUGFIX/generalization: the original if/elif chain only handled a
    # leading '1'-'5' and left pdbid_start_number unbound (NameError)
    # otherwise. int() reproduces the same values for '1'-'5', supports
    # any digit, and raises a clear ValueError for non-digit prefixes.
    pdbid_start_number = int(msa_name[0])

    results_dir = "scrambled_sequences_nots\\pdbid_{}\\{}\\".format(
        pdbid_start_number, msa_name)
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)
    print("\tScramble {}".format(msa_name))
    uniprot_lengths = get_lengths(msa_name)
    _, chain_length, _ = get_dca_indices(msa_name, uniprot_lengths[0])
    print(chain_length[0], uniprot_lengths[0])
    header_a, header_b, seq_a, seq_b = split_header_seq(
        msa_name, chain_length[0])
    n_seqs = len(seq_b)
    # Two independent index permutations per replicate: one for chain A,
    # one for chain B
    random_index = list(permute_index(n_seqs, n_replicates))
    outfile = []
    for rep in range(n_replicates):
        idx_a = random_index[rep][0]
        idx_b = random_index[rep][1]
        # Concatenated header -> concatenated sequence, for each random pair
        scramble_msa_dict = {
            header_a[idx_a[i]] + '_' + header_b[idx_b[i]]:
                seq_a[idx_a[i]] + seq_b[idx_b[i]]
            for i in range(n_seqs)
        }
        # Write this MSA replicate to file
        outfile.append('{}{}_rep{}_scrambled.fas'.format(
            results_dir, msa_name, rep))
        with open(outfile[rep], 'w', encoding='utf-8') as f:
            for key in scramble_msa_dict.keys():
                f.write(">{}\n{}\n".format(key, scramble_msa_dict[key]))
    return outfile
def make_monomer_msa_from_dimer(msa):
    """
    Split a dimer MSA into two per-chain FASTA files written to
    "monomer_alignments\\<pdbid>_<chain>.fas".

    :param msa: dimer MSA name of the form "XXXX_A_XXXX_B"
    """
    from scramble_sequence import split_header_seq
    from read_db import get_lengths
    from get_region import get_dca_indices

    out_dir = "monomer_alignments\\"
    # Chain identifiers sit at positions 1 and 3 of the underscore split
    name_parts = msa.split("_")
    chain_ids = [name_parts[1], name_parts[3]]
    lengths = get_lengths(msa)
    _, dca_chains, _ = get_dca_indices(msa, length_a=lengths[0])
    # Returns (headers_a, headers_b, seqs_a, seqs_b)
    records = split_header_seq(msa, dca_chains[0])
    for idx, chain in enumerate(chain_ids):
        # Pair each chain's headers (records[idx]) with its sequences
        # (records[idx + 2]) as rows of (header, sequence)
        pairs = np.array([records[idx], records[idx + 2]]).transpose()
        np.savetxt("{}{}_{}.fas".format(out_dir, msa[:4], chain_ids[idx]),
                   pairs,
                   fmt=">%s\n%s")
def monomer_restraint(sysName, df, cutoff):
    """
    Return intra-chain DCA index pairs that are NOT observed as monomer
    contacts (distance <= cutoff) in the PDB structure.

    :param sysName: system name used to look up chain lengths and the
                    reference dca<->pdb index map file
    :param df: DataFrame of PDB pair distances with columns 'chain_1',
               'chain_2', 'd', and pair indices in the first two columns
    :param cutoff: maximum distance for a pair to count as a contact
    :return: ndarray of shape (n, 2) — all intra-chain pairs minus the
             mapped monomer contacts
    """
    from itertools import combinations
    from read_db import get_lengths
    from get_region import get_dca_indices
    from mapping_functions import apply_map

    ch = get_lengths(sysName)
    _, dca_ch, _ = get_dca_indices(sysName, ch[0])
    print(dca_ch)

    # All intra-chain pairs with i < j; chain-2 indices are offset by the
    # chain-1 length so both chains share one numbering.
    # (The original built these with nested loops and a dead
    # "msa_pairs = []" assignment.)
    protein1 = [[i, j] for i, j in combinations(range(1, dca_ch[0] + 1), 2)]
    protein2 = [[i + dca_ch[0], j + dca_ch[0]]
                for i, j in combinations(range(1, dca_ch[1] + 1), 2)]
    msa_pairs = np.array(protein1 + protein2)

    referenceMap = "results\\reference_maps\\ref_map_{}.txt".format(sysName)
    # Keep only intra-chain rows within the distance cutoff
    dfMonomer = df[df["chain_1"] == df["chain_2"]]
    dfMonomer = dfMonomer[dfMonomer["d"] <= cutoff].reset_index(drop=True)
    monomer_array = dfMonomer.iloc[:, :3].to_numpy()

    map_pdb_dca = pd.read_csv(referenceMap,
                              delimiter="\t",
                              header=0,
                              dtype=str)
    map_pdb_dca = map_pdb_dca.replace(
        "?", np.nan).dropna()  # some pdbs have unknown seq res UNK
    map_to_dca = dict(zip(map_pdb_dca["pdb_i"], map_pdb_dca["dca_i"]))

    # Remap the pdb contact indices into dca numbering
    mappedArray = apply_map(monomer_array, map_to_dca)
    r = mappedArray[:, :2].astype(dtype='int')

    # Row-wise set difference via structured-array views: pairs present in
    # msa_pairs but absent from the mapped contacts r
    msa_row = msa_pairs.view([('', msa_pairs.dtype)] * msa_pairs.shape[1])
    pdb_row = r.view([('', r.dtype)] * r.shape[1])
    exclusions_list = np.setdiff1d(msa_row,
                                   pdb_row).view(msa_pairs.dtype).reshape(
                                       -1, msa_pairs.shape[-1])
    # Sanity check: every contact row should have been inside msa_pairs
    assert len(msa_pairs) - len(r) == len(exclusions_list)
    return exclusions_list
# Example #5
def map_dict(msa_name):
    """
    Build index-mapping dictionaries between uniprot, PDB, and DCA
    numbering for a two-chain system, from the SIFTS lookup table.

    :param msa_name: name of the form "XXXX_A_XXXX_B"; first four chars
                     are the (lowercased) pdb id, positions 1 and 3 of the
                     underscore split are the chain ids
    :return: tuple of dicts (uni2pdb, dca2uni, dca2pdb, pdb2uni)
    """
    import pandas as pd
    from get_region import get_dca_indices
    # (removed an unused "import numpy as np" from the original)

    sifts_table_file = "databases/sifts/pdb_chain_uniprot_plus.csv"
    s = pd.read_csv(sifts_table_file, comment="#")
    pdbid = msa_name[:4].lower()
    chain_1 = msa_name.split("_")[1]
    chain_2 = msa_name.split("_")[3]

    # Query each chain's SIFTS rows once (the original re-ran the same
    # two queries eight times)
    rows_1 = s.query("pdb_id == @pdbid and pdb_chain == @chain_1")
    rows_2 = s.query("pdb_id == @pdbid and pdb_chain == @chain_2")

    # pdb numbering
    pdb_start_chain_1 = [int(i) for i in rows_1.coord_start.values]
    pdb_end_chain_1 = [int(i) for i in rows_1.coord_end.values]
    # Offset chain-2 indices to follow chain 1 (last chain-1 end + 1)
    pdb_offset = pdb_end_chain_1[-1] + 1
    pdb_start_chain_2 = [int(i) + pdb_offset for i in rows_2.coord_start.values]
    pdb_end_chain_2 = [int(i) + pdb_offset for i in rows_2.coord_end.values]

    # uniprot numbering, same offset scheme
    uniprot_start_chain_1 = [int(i) for i in rows_1.uniprot_start.values]
    uniprot_end_chain_1 = [int(i) for i in rows_1.uniprot_end.values]
    uniprot_offset = uniprot_end_chain_1[-1] + 1
    uniprot_start_chain_2 = [int(i) + uniprot_offset
                             for i in rows_2.uniprot_start.values]
    uniprot_end_chain_2 = [int(i) + uniprot_offset
                           for i in rows_2.uniprot_end.values]

    pdb_start_indices = pdb_start_chain_1 + pdb_start_chain_2
    pdb_end_indices = pdb_end_chain_1 + pdb_end_chain_2
    uniprot_start_indices = uniprot_start_chain_1 + uniprot_start_chain_2
    uniprot_end_indices = uniprot_end_chain_1 + uniprot_end_chain_2

    # Expand (start, end) segment lists into flat index sequences
    pdb_indices = make_indices(pdb_start_indices, pdb_end_indices)
    uniprot_indices = make_indices(uniprot_start_indices, uniprot_end_indices)

    dca_indices = get_dca_indices(msa_name)
    uni2pdb = dict(zip(uniprot_indices, pdb_indices))
    dca2uni = dict(zip(dca_indices, uniprot_indices))
    dca2pdb = dict(zip(dca_indices, pdb_indices))
    pdb2uni = dict(zip(pdb_indices, uniprot_indices))
    return uni2pdb, dca2uni, dca2pdb, pdb2uni