Beispiel #1
0
def score_network(kinase_file, out_file):
    """Score kinase pairs by PSSM and signed regulation scores; write a TSV.

    Reads the fixed reference tables under ``data/`` and ``out/``, builds the
    kinase PSSMs, scores kinase pairs with ``score_kin_pairs`` and writes one
    tab-separated row per scored pair to ``out_file``.

    Args:
        kinase_file: Path handed to ``read_kinase_file`` to get the kinases.
        out_file: Path of the TSV file to write; opened in ``"w"`` mode.

    Output columns: node1, node2, kinase.type, sub.kinase.type,
    max.pssm.score, signed.dcg, site.sign.score, signed.func.score.
    Pairs whose score list is empty get "NA" in the four score columns.
    """
    ktable = pssms.read_kin_sub_data("data/psiteplus-kinase-substrates.tsv")
    aa_freqs = pssms.read_aa_freqs("data/aa-freqs.tsv")
    psites = read_phosphosites("data/psiteplus-phosphosites.tsv")
    kinases = read_kinase_file(kinase_file)
    phosfun_st, med_phosfun_st = read_phosfun_file("data/phosfun-ST.tsv")
    phosfun_y, med_phosfun_y = read_phosfun_file("data/phosfun-Y.tsv")
    signs = read_sign_file("out/reg-site-bart-sign-preds.tsv")
    reg_sites = read_reg_sites_file("data/psiteplus-reg-sites.tsv")
    kin_types = read_pfam_file("data/human-pfam.tsv", kinases)
    kinase_fams, family_kins = read_kinase_families("data/kinase-families.tsv")
    sdr_assigns = read_sdr_assignments("data/sdr-pssm-assignments.tsv")
    family_pssms = pssms.build_family_pssms(family_kins, ktable, aa_freqs)
    kinase_pssms = pssms.build_pssms(kinases, kinase_fams, family_pssms, ktable,
                                     aa_freqs)
    kinase_pssms = assign_pssms_by_sdr(kinases, kinase_pssms, sdr_assigns)
    scores = score_kin_pairs(kinases, kinase_pssms, psites, family_pssms)

    def kinase_type(name):
        # Kinases missing from the Pfam table are reported as "NA", except
        # MTOR, which is hard-coded as "ST".
        if name in kin_types:
            return kin_types[name]
        return "ST" if name == "MTOR" else "NA"

    header = ["node1", "node2", "kinase.type", "sub.kinase.type",
              "max.pssm.score", "signed.dcg", "site.sign.score",
              "signed.func.score"]
    with open(out_file, "w") as v:
        v.write("\t".join(header) + "\n")
        for pair in scores:
            row = [pair[0], pair[1], kinase_type(pair[0]),
                   kinase_type(pair[1])]
            site_scores = scores[pair]
            if not site_scores:
                # Nothing was scored for this pair: pad the score columns.
                v.write("\t".join(row + ["NA"] * 4) + "\n")
                continue
            residues = {res for _pos, res, _score in site_scores}
            max_score = max(score for _pos, _res, score in site_scores)
            # The Y table is used only when every scored site is a tyrosine;
            # any other residue mix falls back to the S/T table.
            if residues == {"Y"}:
                phosfun, med_phosfun = phosfun_y, med_phosfun_y
            else:
                phosfun, med_phosfun = phosfun_st, med_phosfun_st
            dcg, site_score, signed_func_score = signed_regulation_score(
                pair, site_scores, phosfun, med_phosfun, reg_sites, signs)
            row.extend([str(max_score), str(dcg), str(site_score),
                        str(signed_func_score)])
            v.write("\t".join(row) + "\n")
def calc_distr(kin_act_file):
    """Write one PSSM score-distribution row per kinase to a TSV file.

    For every kinase in the reduced kinase table, a PSSM is computed from its
    motif sequences, a set of motifs is drawn with ``get_random_sty_motifs``
    (called with the proteome, the kinase's table entries and 1000), and
    ``calc_dist`` is evaluated on those motifs against the PSSM. Kinases with
    a falsy distribution are left out of the output.

    Args:
        kin_act_file: Path whose base name (extension removed) is used to
            name the output file ``out/<base>-pssm-dists.tsv``.
    """
    kin_sub_table = pssms.read_kin_sub_data("data/reduced_kinase_table.tsv")
    background = pssms.read_aa_freqs("data/aa-freqs.tsv")
    proteome = pssms.read_proteome()

    base_name, _ext = os.path.splitext(os.path.basename(kin_act_file))
    dist_out_file = "out/{0}-pssm-dists.tsv".format(base_name)

    with open(dist_out_file, "w") as out:
        for kinase in kin_sub_table:
            known_motifs = kin_sub_table[kinase]
            pssm = pssms.calc_pssm(
                [seq for _substrate, seq in known_motifs], background)
            sampled_motifs = get_random_sty_motifs(proteome, known_motifs,
                                                   1000)
            dist = calc_dist(sampled_motifs, pssm)
            if dist:
                # Row layout: kinase name followed by the distribution values.
                fields = [kinase] + [str(value) for value in dist]
                out.write("\t".join(fields) + "\n")
Beispiel #3
0
def pssm_prop(seqs):
    """Return a simple proportional PSSM for a collection of sequences.

    The result has 20 rows and one column per position of the first
    sequence. Each residue found in ``AMINO_ACIDS`` adds ``1 / len(seqs)`` to
    the cell at (its ``AMINO_ACIDS`` index, its position); residues not in
    ``AMINO_ACIDS`` are ignored. An empty ``seqs`` yields a 20 x 2 matrix of
    zeros.
    """
    n_seqs = len(seqs)
    if n_seqs == 0:
        return np.zeros((20, 2))

    matrix = np.zeros((20, len(seqs[0])))
    # Every occurrence contributes the same share, so compute it once.
    weight = 1 / n_seqs
    for sequence in seqs:
        for column, residue in enumerate(sequence):
            if residue in AMINO_ACIDS:
                matrix[AMINO_ACIDS[residue], column] += weight
    return matrix


if __name__ == "__main__":
    ktable = pssms.read_kin_sub_data("data/psiteplus-kinase-substrates.tsv")
    aa_freqs = pssms.read_aa_freqs("data/aa-freqs.tsv")
    psites = read_phosphosites("data/pride-phosphosites.tsv")
    kinases = read_kinase_file("data/human-kinome.txt")
    # phosfun, med_phosfun = read_phosfun_file("data/phosfun-alt.tsv")
    phosfun_st, med_phosfun_st = read_phosfun_file("data/phosfun-ST.tsv")
    phosfun_y, med_phosfun_y = read_phosfun_file("data/phosfun-Y.tsv")
    reg_sites = read_reg_sites_file("data/psiteplus-reg-sites.tsv")
    entropies = {}
    pssm_div = {}
    for kinase in kinases:
        if kinase not in ktable:
            continue
        kin_motif_seqs = []
        for substrate, pos, res, seq in ktable[kinase]:
def score_network(kinase_file, out_file):
    """Score all ordered kinase pairs by PSSM and functional score; write TSV.

    Reads the fixed reference tables under ``data/``, builds the kinase
    PSSMs, scores kinase pairs with ``score_kin_pairs`` and writes one
    tab-separated row for every ordered pair of distinct kinases (kinases
    taken in sorted order) to ``out_file``.

    Args:
        kinase_file: Path handed to ``read_kinase_file`` to get the kinases.
        out_file: Path of the TSV file to write; opened in ``"w"`` mode.

    Output columns: node1, node2, kinase.type, sub.kinase.type, pssm.source,
    max.pssm.score, dcg, max.func.score, max.pssm.score.res,
    max.func.score.res, all.pssm.scores, all.func.scores, all.sites.
    Pairs whose score list is empty get "NA" in the eight score columns.
    """
    ktable = pssms.read_kin_sub_data("data/psiteplus-kinase-substrates.tsv")
    aa_freqs = pssms.read_aa_freqs("data/aa-freqs.tsv")
    psites = read_phosphosites("data/psiteplus-phosphosites.tsv")
    kinases = read_kinase_file(kinase_file)
    phosfun_st, med_phosfun_st = read_phosfun_file("data/phosfun-ST.tsv")
    phosfun_y, med_phosfun_y = read_phosfun_file("data/phosfun-Y.tsv")
    reg_sites = read_reg_sites_file("data/psiteplus-reg-sites.tsv")
    kin_types = read_pfam_file("data/human-pfam.tsv", kinases)
    kinase_fams, family_kins = read_kinase_families("data/kinase-families.tsv")
    sdr_assigns = read_sdr_assignments("data/sdr-pssm-assignments.tsv")
    family_pssms = pssms.build_family_pssms(family_kins, ktable, aa_freqs)
    kinase_pssms = pssms.build_pssms(kinases, kinase_fams, family_pssms,
                                     ktable, aa_freqs)
    kinase_pssms = assign_pssms_by_sdr(kinases, kinase_pssms, sdr_assigns)
    scores = score_kin_pairs(kinases, kinase_pssms, psites, family_pssms)
    # NOTE(review): the normalized scores are never used below. The call is
    # kept because it is not visible here whether normalize_scores mutates
    # `scores` -- confirm, then drop it if it is side-effect free.
    normalize_scores(scores)

    def kinase_type(name):
        # Kinases missing from the Pfam table are reported as "NA", except
        # MTOR, which is hard-coded as "ST".
        if name in kin_types:
            return kin_types[name]
        return "ST" if name == "MTOR" else "NA"

    header = [
        "node1", "node2", "kinase.type", "sub.kinase.type", "pssm.source",
        "max.pssm.score", "dcg", "max.func.score", "max.pssm.score.res",
        "max.func.score.res", "all.pssm.scores", "all.func.scores",
        "all.sites"
    ]
    pairs = itertools.product(sorted(kinases), repeat=2)
    with open(out_file, "w") as v:
        v.write("\t".join(header) + "\n")
        for pair in pairs:
            if pair[0] == pair[1]:
                # Self-pairs are not reported.
                continue
            if pair[0] in kinase_pssms:
                source = kinase_pssms[pair[0]][1]
            else:
                source = "NA"
            row = [pair[0], pair[1], kinase_type(pair[0]),
                   kinase_type(pair[1]), source]
            site_scores = scores[pair]
            if not site_scores:
                # Nothing was scored for this pair: pad the score columns.
                v.write("\t".join(row + ["NA"] * 8) + "\n")
                continue
            residues = {res for _pos, res, _score in site_scores}
            pssm_scores = [score for _pos, _res, score in site_scores]
            sites = ["{0}{1}".format(res, pos)
                     for pos, res, _score in site_scores]
            max_pssm_score = max(pssm_scores)
            # On ties, the first site with the maximum score is reported.
            max_pssm_score_site = sites[pssm_scores.index(max_pssm_score)]
            # The Y table is used only when every scored site is a tyrosine;
            # any other residue mix falls back to the S/T table.
            if residues == {"Y"}:
                phosfun, med_phosfun = phosfun_y, med_phosfun_y
            else:
                phosfun, med_phosfun = phosfun_st, med_phosfun_st
            (dcg, max_func_score, max_func_score_site,
             all_func_scores) = regulation_score(pair, site_scores, phosfun,
                                                 med_phosfun, reg_sites)
            row.extend([
                str(max_pssm_score),
                str(dcg),
                str(max_func_score),
                max_pssm_score_site,
                max_func_score_site,
                ",".join([str(score) for score in pssm_scores]),
                all_func_scores,
                ",".join(sites),
            ])
            v.write("\t".join(row) + "\n")
    "C": 11,
    "M": 12,
    "N": 13,
    "Q": 14,
    "K": 15,
    "R": 16,
    "H": 17,
    "D": 18,
    "E": 19
}

if __name__ == "__main__":
    disc = "out/discarded-kinases.tsv"
    kept = "out/kept-kinases.tsv"

    d = pssms.read_kin_sub_data(disc)
    k = pssms.read_kin_sub_data(kept)

    daa = {}
    kaa = {}

    for kinase in d:
        noas = [0] * 15
        for i in range(len(d[kinase][0][3])):
            colaa = set()
            if i == 7:
                continue
            for j in range(len(d[kinase])):

                if d[kinase][j][3][i] not in colaa and d[kinase][j][3][
                        i] in AMINO_ACIDS: