def score_network(kinase_file, out_file):
    """Score kinase-kinase edges by PSSM and write them as a TSV table.

    Reads the fixed reference tables under ``data/`` and ``out/``, builds
    family-level and per-kinase PSSMs, scores kinase pairs with
    ``score_kin_pairs`` and writes one row per pair returned by it.

    Args:
        kinase_file: Path handed to ``read_kinase_file`` to obtain the
            kinases to score.
        out_file: Path of the tab-separated file to write. Columns are
            node1, node2, kinase.type, sub.kinase.type, max.pssm.score,
            signed.dcg, site.sign.score and signed.func.score.

    Pairs with no scored sites get "NA" in all four score columns.
    """
    ktable = pssms.read_kin_sub_data("data/psiteplus-kinase-substrates.tsv")
    aa_freqs = pssms.read_aa_freqs("data/aa-freqs.tsv")
    psites = read_phosphosites("data/psiteplus-phosphosites.tsv")
    kinases = read_kinase_file(kinase_file)
    phosfun_st, med_phosfun_st = read_phosfun_file("data/phosfun-ST.tsv")
    phosfun_y, med_phosfun_y = read_phosfun_file("data/phosfun-Y.tsv")
    signs = read_sign_file("out/reg-site-bart-sign-preds.tsv")
    reg_sites = read_reg_sites_file("data/psiteplus-reg-sites.tsv")
    kin_types = read_pfam_file("data/human-pfam.tsv", kinases)
    kinase_fams, family_kins = read_kinase_families("data/kinase-families.tsv")
    sdr_assigns = read_sdr_assignments("data/sdr-pssm-assignments.tsv")

    family_pssms = pssms.build_family_pssms(family_kins, ktable, aa_freqs)
    kinase_pssms = pssms.build_pssms(kinases, kinase_fams, family_pssms,
                                     ktable, aa_freqs)
    kinase_pssms = assign_pssms_by_sdr(kinases, kinase_pssms, sdr_assigns)
    scores = score_kin_pairs(kinases, kinase_pssms, psites, family_pssms)

    def kinase_type(name):
        # Kinases missing from kin_types are reported as "NA", except MTOR,
        # which is hard-coded to "ST".
        if name in kin_types:
            return kin_types[name]
        return "ST" if name == "MTOR" else "NA"

    header = ["node1", "node2", "kinase.type", "sub.kinase.type",
              "max.pssm.score", "signed.dcg", "site.sign.score",
              "signed.func.score"]

    with open(out_file, "w") as v:
        v.write("\t".join(header) + "\n")
        for pair in scores:
            row = [pair[0], pair[1],
                   kinase_type(pair[0]), kinase_type(pair[1])]
            site_scores = scores[pair]
            if not site_scores:
                v.write("\t".join(row + ["NA"] * 4) + "\n")
                continue

            # Each entry of site_scores is a (position, residue, score) tuple.
            residues = {res for pos, res, score in site_scores}
            max_score = max(score for pos, res, score in site_scores)

            # Pairs whose scored sites are all 'Y' use the Y functional-score
            # table; every other pair uses the S/T table.
            if residues == {"Y"}:
                phosfun, med_phosfun = phosfun_y, med_phosfun_y
            else:
                phosfun, med_phosfun = phosfun_st, med_phosfun_st
            dcg, site_score, signed_func_score = signed_regulation_score(
                pair, site_scores, phosfun, med_phosfun, reg_sites, signs)

            row.extend([str(max_score), str(dcg), str(site_score),
                        str(signed_func_score)])
            v.write("\t".join(row) + "\n")
def calc_distr(kin_act_file):
    """Write, per kinase, the PSSM score distribution over random motifs.

    For every kinase in the reduced kinase table a PSSM is computed from its
    motif sequences, 1000 motifs are drawn with ``get_random_sty_motifs`` and
    scored with ``calc_dist``. Kinases with a falsy distribution are left
    out of the output.

    Args:
        kin_act_file: Path whose base name (without extension) names the
            output file ``out/<base>-pssm-dists.tsv``. The file itself is
            not read here.
    """
    ktable = pssms.read_kin_sub_data("data/reduced_kinase_table.tsv")
    aa_freqs = pssms.read_aa_freqs("data/aa-freqs.tsv")
    proteome = pssms.read_proteome()

    base_name = os.path.basename(kin_act_file)
    stem = os.path.splitext(base_name)[0]
    dist_out_file = "out/" + stem + "-pssm-dists.tsv"

    with open(dist_out_file, "w") as out:
        for kinase in ktable:
            substrate_motifs = ktable[kinase]
            # Entries are (substrate, sequence) pairs; only the sequences
            # feed the PSSM.
            sequences = [seq for _substrate, seq in substrate_motifs]
            pssm = pssms.calc_pssm(sequences, aa_freqs)
            random_motifs = get_random_sty_motifs(
                proteome, substrate_motifs, 1000)
            dist = calc_dist(random_motifs, pssm)
            if dist:
                fields = [kinase] + [str(value) for value in dist]
                out.write("\t".join(fields) + "\n")
def pssm_prop(seqs): if len(seqs) == 0: return (np.zeros((20, 2))) #this function returns simple proportinal pssm pssm = np.zeros((20, len(seqs[0]))) for seq in seqs: for i, aa in enumerate(seq): if aa not in AMINO_ACIDS: continue pssm[AMINO_ACIDS[aa], i] += 1 / len(seqs) return (pssm) if __name__ == "__main__": ktable = pssms.read_kin_sub_data("data/psiteplus-kinase-substrates.tsv") aa_freqs = pssms.read_aa_freqs("data/aa-freqs.tsv") psites = read_phosphosites("data/pride-phosphosites.tsv") kinases = read_kinase_file("data/human-kinome.txt") # phosfun, med_phosfun = read_phosfun_file("data/phosfun-alt.tsv") phosfun_st, med_phosfun_st = read_phosfun_file("data/phosfun-ST.tsv") phosfun_y, med_phosfun_y = read_phosfun_file("data/phosfun-Y.tsv") reg_sites = read_reg_sites_file("data/psiteplus-reg-sites.tsv") entropies = {} pssm_div = {} for kinase in kinases: if kinase not in ktable: continue kin_motif_seqs = [] for substrate, pos, res, seq in ktable[kinase]:
def score_network(kinase_file, out_file):
    """Score every ordered pair of distinct kinases and write a TSV table.

    Reads the fixed reference tables under ``data/``, builds family-level
    and per-kinase PSSMs, scores kinase pairs with ``score_kin_pairs`` and
    writes one row for each ordered pair of different kinases, iterating the
    kinases in sorted order.

    Args:
        kinase_file: Path handed to ``read_kinase_file`` to obtain the
            kinases to score.
        out_file: Path of the tab-separated file to write. Columns are
            node1, node2, kinase.type, sub.kinase.type, pssm.source,
            max.pssm.score, dcg, max.func.score, max.pssm.score.res,
            max.func.score.res, all.pssm.scores, all.func.scores and
            all.sites.

    Pairs with no scored sites get "NA" in every score and site column.
    """
    ktable = pssms.read_kin_sub_data("data/psiteplus-kinase-substrates.tsv")
    aa_freqs = pssms.read_aa_freqs("data/aa-freqs.tsv")
    psites = read_phosphosites("data/psiteplus-phosphosites.tsv")
    kinases = read_kinase_file(kinase_file)
    # phosfun, med_phosfun = read_phosfun_file("data/phosfun-alt.tsv")
    phosfun_st, med_phosfun_st = read_phosfun_file("data/phosfun-ST.tsv")
    phosfun_y, med_phosfun_y = read_phosfun_file("data/phosfun-Y.tsv")
    reg_sites = read_reg_sites_file("data/psiteplus-reg-sites.tsv")
    kin_types = read_pfam_file("data/human-pfam.tsv", kinases)
    kinase_fams, family_kins = read_kinase_families("data/kinase-families.tsv")
    sdr_assigns = read_sdr_assignments("data/sdr-pssm-assignments.tsv")

    family_pssms = pssms.build_family_pssms(family_kins, ktable, aa_freqs)
    kinase_pssms = pssms.build_pssms(kinases, kinase_fams, family_pssms,
                                     ktable, aa_freqs)
    kinase_pssms = assign_pssms_by_sdr(kinases, kinase_pssms, sdr_assigns)
    scores = score_kin_pairs(kinases, kinase_pssms, psites, family_pssms)
    # NOTE(review): the normalised scores are never used below. The call is
    # kept in case normalize_scores has side effects on `scores` -- confirm
    # and drop it if it is pure.
    norm_scores = normalize_scores(scores)
    # scores = norm_scores

    def kinase_type(name):
        # Kinases missing from kin_types are reported as "NA", except MTOR,
        # which is hard-coded to "ST".
        if name in kin_types:
            return kin_types[name]
        return "ST" if name == "MTOR" else "NA"

    header = [
        "node1", "node2", "kinase.type", "sub.kinase.type", "pssm.source",
        "max.pssm.score", "dcg", "max.func.score", "max.pssm.score.res",
        "max.func.score.res", "all.pssm.scores", "all.func.scores",
        "all.sites"
    ]
    pairs = itertools.product(sorted(kinases), repeat=2)

    with open(out_file, "w") as v:
        v.write("\t".join(header) + "\n")
        for pair in pairs:
            if pair[0] == pair[1]:
                continue

            if pair[0] in kinase_pssms:
                source = kinase_pssms[pair[0]][1]
            else:
                source = "NA"
            row = [pair[0], pair[1],
                   kinase_type(pair[0]), kinase_type(pair[1]), source]

            # NOTE(review): assumes `scores` has an entry for every ordered
            # pair of kinases -- verify against score_kin_pairs.
            site_scores = scores[pair]
            if not site_scores:
                v.write("\t".join(row + ["NA"] * 8) + "\n")
                continue

            # Each entry of site_scores is a (position, residue, score) tuple.
            residues = {res for pos, res, score in site_scores}
            pssm_scores = [score for pos, res, score in site_scores]
            sites = ["{0}{1}".format(res, pos)
                     for pos, res, score in site_scores]

            max_pssm_score = max(pssm_scores)
            # index() returns the first site that reaches the maximum score.
            max_pssm_score_site = sites[pssm_scores.index(max_pssm_score)]
            all_pssm_scores = ",".join([str(score) for score in pssm_scores])
            all_sites = ",".join(sites)

            # Pairs whose scored sites are all 'Y' use the Y functional-score
            # table; every other pair uses the S/T table.
            if residues == {"Y"}:
                phosfun, med_phosfun = phosfun_y, med_phosfun_y
            else:
                phosfun, med_phosfun = phosfun_st, med_phosfun_st
            (dcg, max_func_score, max_func_score_site,
             all_func_scores) = regulation_score(
                 pair, site_scores, phosfun, med_phosfun, reg_sites)

            row.extend([
                str(max_pssm_score), str(dcg), str(max_func_score),
                max_pssm_score_site, max_func_score_site, all_pssm_scores,
                all_func_scores, all_sites
            ])
            v.write("\t".join(row) + "\n")
"C": 11, "M": 12, "N": 13, "Q": 14, "K": 15, "R": 16, "H": 17, "D": 18, "E": 19 } if __name__ == "__main__": disc = "out/discarded-kinases.tsv" kept = "out/kept-kinases.tsv" d = pssms.read_kin_sub_data(disc) k = pssms.read_kin_sub_data(kept) daa = {} kaa = {} for kinase in d: noas = [0] * 15 for i in range(len(d[kinase][0][3])): colaa = set() if i == 7: continue for j in range(len(d[kinase])): if d[kinase][j][3][i] not in colaa and d[kinase][j][3][ i] in AMINO_ACIDS: