Esempio n. 1
0
 def overlap(self):
     """Print the proteins shared by the PDB and BC datasets.

     Identifiers read from ``self.bc90`` are translated through the mapping
     returned by ``self.read_pdb_uni()``; identifiers absent from that
     mapping are ignored. Every identifier in ``self.pdb90`` that matches a
     translated BC entry is printed, followed on the next line by the BC
     identifier it was translated from.
     """
     pdb_uni = self.read_pdb_uni()
     bc_ids, _ = tools_fasta.fasta_to_id_seq(self.bc90)
     # Translated identifier -> originating BC identifier. If several BC
     # identifiers translate to the same value, the last one read wins.
     bc_by_pdb = {}
     for bc_id in bc_ids:
         if bc_id in pdb_uni:
             bc_by_pdb[pdb_uni[bc_id]] = bc_id
     pdb_ids, _ = tools_fasta.fasta_to_id_seq(self.pdb90)
     print("Proteins overlapping between the PDB and BC datasets")
     for pdb_id in pdb_ids:
         # A dict lookup is enough here: the keys are exactly the translated
         # identifiers, so no parallel list has to be scanned per entry.
         if pdb_id in bc_by_pdb:
             print(pdb_id)
             print(bc_by_pdb[pdb_id])
 def get_human(self):
     """Load the human FASTA file and score every sequence in it.

     Returns:
         A 4-tuple ``(pids, proteome, org, scores)``: the identifiers read
         from ``self.human_fp``, two separate lists holding the label
         ``'Human'`` once per identifier, and the result of
         ``NormScore().lc_norm_score`` applied to the sequences.
     """
     protein_ids, sequences = tools_fasta.fasta_to_id_seq(self.human_fp)
     lc_scores = NormScore().lc_norm_score(sequences)
     count = len(protein_ids)
     proteome_labels = ['Human' for _ in range(count)]
     organism_labels = ['Human' for _ in range(count)]
     return protein_ids, proteome_labels, organism_labels, lc_scores
 def get_yeast(self):
     """Load the yeast FASTA file and score every sequence in it.

     Returns:
         A 4-tuple ``(pids, proteome, org, scores)``: the identifiers read
         from ``self.yeast_fp``, two separate lists holding the label
         ``'Yeast'`` once per identifier, and the result of
         ``NormScore().lc_norm_score`` applied to the sequences.
     """
     protein_ids, sequences = tools_fasta.fasta_to_id_seq(self.yeast_fp)
     scorer = NormScore()
     lc_scores = scorer.lc_norm_score(sequences)
     label = 'Yeast'
     proteome_labels = [label for _ in protein_ids]
     organism_labels = [label for _ in protein_ids]
     return protein_ids, proteome_labels, organism_labels, lc_scores
Esempio n. 4
0
 def train_df(self):
     """Write the combined BC + PDB training table as a tab-separated file.

     Sequences are read from the FASTA files ``self.bc_fpi`` and
     ``self.pdb_fpi``. BC rows come first and are labelled ``y = 0``; PDB
     rows follow and are labelled ``y = 1``. The table is written to
     ``self.train_fpo``.
     """
     # The PDB file is read before the BC file, as in the original order of
     # operations; the rows are nevertheless emitted BC-first.
     pdb_ids, pdb_sequences = tools_fasta.fasta_to_id_seq(self.pdb_fpi)
     pdb_lengths = tools_fasta.get_lengths(pdb_sequences)
     bc_ids, bc_sequences = tools_fasta.fasta_to_id_seq(self.bc_fpi)
     bc_lengths = tools_fasta.get_lengths(bc_sequences)

     labels = [0] * len(bc_ids) + [1] * len(pdb_ids)
     table = pd.DataFrame(
         {
             'Protein ID': bc_ids + pdb_ids,
             'Sequence': bc_sequences + pdb_sequences,
             'Length': bc_lengths + pdb_lengths,
             'y': labels,
         },
         columns=['Protein ID', 'y', 'Sequence', 'Length'],
     )
     table.to_csv(self.train_fpo, sep='\t')
 def write_scores(self):
     """Score every sequence in ``self.all_fpi`` and save a ranked table.

     The table has the columns ``Protein ID`` and ``LC Score``, is sorted by
     descending score, printed to stdout, and written tab-separated to
     ``self.all_fpo``.
     """
     protein_ids, sequences = tools_fasta.fasta_to_id_seq(self.all_fpi)
     lc_scores = NormScore().lc_norm_score(sequences)
     column_order = ['Protein ID', 'LC Score']
     unsorted = pd.DataFrame(
         {'Protein ID': protein_ids, 'LC Score': lc_scores},
         columns=column_order,
     )
     ranked = unsorted.sort_values(by='LC Score', ascending=False)
     print(ranked)
     ranked.to_csv(self.all_fpo, sep='\t')
Esempio n. 6
0
 def get_scores(self):
     """Score the yeast sequences whose identifier is in ``get_pbody()``.

     Sequences are read from ``self.yeast_fasta``; only entries whose
     identifier is contained in the collection returned by
     ``self.get_pbody()`` are kept. The resulting ``Protein ID`` /
     ``LC Score`` table is written tab-separated to ``self.yeast_scores``.
     """
     pbody_ids = self.get_pbody()
     all_ids, all_seqs = tools_fasta.fasta_to_id_seq(self.yeast_fasta)
     # Keep identifier and sequence paired so the two columns stay aligned.
     selected = [
         (pid, seq) for pid, seq in zip(all_ids, all_seqs) if pid in pbody_ids
     ]
     kept_ids = [pid for pid, _ in selected]
     kept_seqs = [seq for _, seq in selected]
     lc_scores = NormScore().lc_norm_score(kept_seqs)
     table = pd.DataFrame({'Protein ID': kept_ids, 'LC Score': lc_scores})
     table.to_csv(self.yeast_scores, sep='\t')
 def read_fasta(self):
     """Probe how removing a fixed region changes a sequence's LC score.

     Reads the sequences from ``self.fasta_fpi`` and addresses the first
     seven by position, so the file must contain at least seven entries in
     the order ent1, ent2, yap1801, yap1802, sla1, sla2, sup35 (names taken
     from the local variables; presumably these match the FASTA records --
     TODO confirm). For each entry a ``*wo`` ("without") variant is built by
     cutting out a hard-coded slice.

     Only the sla1 entry currently produces output: its length, the excised
     region, the result of ``tools_lc.display_lc``, and the normalised
     scores with and without the region are printed. The probes for the
     other entries are commented out.

     NOTE(review): ``pids``, the non-sla1 ``*wo`` variants and ``sup35`` are
     assigned but never used in the visible body; the routine may continue
     beyond what is shown here.
     """
     pids, seqs = tools_fasta.fasta_to_id_seq(self.fasta_fpi)
     norm = NormScore()
     # ent1: region of interest is ent1[211:457]
     ent1 = seqs[0]
     #print(ent1[211:457])
     # Sequence with the region removed.
     ent1wo = ent1[:211] + ent1[457:]
     #print(norm.lc_norm_score([ent1wo]))
     #print(norm.lc_norm_score([ent1]))
     # ent2: region of interest is ent2[224:616]
     ent2 = seqs[1]
     #print(ent2[224:616])
     ent2wo = ent2[:224] + ent2[616:]
     #print(norm.lc_norm_score([ent2wo]))
     # yap1801: region of interest is yap1801[351:638]
     yap1801 = seqs[2]
     #print(yap1801[351:638])
     yap1801wo = yap1801[:351] + yap1801[638:]
     #print()
     #print(norm.lc_norm_score([yap1801]))
     #print(norm.lc_norm_score([yap1801wo]))
     # yap1802: region of interest is yap1802[319:569]
     yap1802 = seqs[3]
     #print(yap1802[319:569])
     yap1802wo = yap1802[:319] + yap1802[569:]
     #print(norm.lc_norm_score([yap1802wo]))
     # sla1: region of interest is sla1[954:1244] -- the only active probe
     sla1 = seqs[4]
     print(len(sla1))
     print(sla1[954:1244])
     print()
     # Note: ``ns`` here is the display_lc result, not a NormScore instance.
     ns = tools_lc.display_lc(sla1, self.k, self.lca, self.lce)
     print(sla1)
     print(ns)
     sla1wo = sla1[:954] + sla1[1244:]
     # Score without the region first, then the full-length score.
     print(norm.lc_norm_score([sla1wo]))
     print(norm.lc_norm_score([sla1]))
     # sla2: region of interest is sla2[348:442]
     sla2 = seqs[5]
     #print(sla2[348:442])
     sla2wo = sla2[:348] + sla2[442:]
     #print(norm.lc_norm_score([sla2wo]))
     #print(norm.lc_norm_score([sla2]))
     # sup35: region of interest is sup35[0:123]
     sup35 = seqs[6]
Esempio n. 8
0
 def run(self):
     """Compare full-length scores with scores of the segmented sequence.

     For each sequence in ``self.puncta`` the matching rows of the table in
     ``self.pfam_puncta`` (column ``uniprot_acc``) are sorted by
     ``seq_start`` and handed to ``self.segment_seq``. Sequences whose
     segments total fewer than 100 residues are only counted; the others
     are scored both full-length and segmented. Counts, means and medians
     are printed, and both score distributions are shown as overlaid
     histograms.
     """
     protein_ids, sequences = tools_fasta.fasta_to_id_seq(self.puncta)
     pfam_table = pd.read_csv(self.pfam_puncta, sep='\t', index_col=0)
     n_above = 0
     n_below = 0
     segment_norms = []
     full_length_norms = []
     for protein_id, sequence in zip(protein_ids, sequences):
         matches = pfam_table[pfam_table['uniprot_acc'] == protein_id]
         matches = matches.sort_values(by='seq_start')
         segments = self.segment_seq(sequence, matches)
         segment_total = sum(len(segment) for segment in segments)
         if segment_total < 100:
             n_below += 1
             continue
         n_above += 1
         full_raw, full_len = self.get_segment_scores([sequence])
         full_norm = self.norm_function([full_raw], [full_len])
         seg_raw, seg_len = self.get_segment_scores(segments)
         seg_norm = self.norm_function([seg_raw], [seg_len])
         segment_norms.append(seg_norm[0])
         full_length_norms.append(full_norm[0])

     print(n_above)
     print(n_below)
     # Means first, then medians; segmented before full-length in each pair.
     for summarise in (np.mean, np.median):
         print(summarise(segment_norms))
         print(summarise(full_length_norms))

     # Both histograms share the same binning so they can be overlaid.
     shared_opts = {'alpha': 0.5, 'bins': 20, 'range': (-100, 200)}
     plt.hist(full_length_norms, label='Full length scores', **shared_opts)
     plt.hist(segment_norms, label='Outside Pfam scores', **shared_opts)
     plt.legend()
     plt.show()
 def concat_train(self):
     """Write the BC FASTA entries plus a PDB table as one training file.

     BC entries are read from the FASTA file ``self.bc_fpi`` and labelled
     ``y = 0``. PDB entries are read from the tab-separated table
     ``self.pdb_fpi`` (columns ``Protein ID``, ``Sequence``, ``Length``)
     and labelled ``y = 1``. The combined table is written tab-separated to
     ``self.fpo``.
     """
     bc_ids, bc_sequences = tools_fasta.fasta_to_id_seq(self.bc_fpi)
     bc_lengths = tools_fasta.get_lengths(bc_sequences)

     pdb_table = pd.read_csv(self.pdb_fpi, sep='\t', index_col=0)
     pdb_ids = list(pdb_table['Protein ID'])
     pdb_sequences = list(pdb_table['Sequence'])
     pdb_lengths = list(pdb_table['Length'])

     labels = [0] * len(bc_ids) + [1] * len(pdb_ids)
     combined = pd.DataFrame(
         {
             'Protein ID': bc_ids + pdb_ids,
             'Sequence': bc_sequences + pdb_sequences,
             'Length': bc_lengths + pdb_lengths,
             'y': labels,
         },
         columns=['Protein ID', 'y', 'Length', 'Sequence'],
     )
     combined.to_csv(self.fpo, sep='\t')
Esempio n. 10
0
 def with_pfam(self, fasta_fp, pfam_fp, fpo):
     """Score the proteins that have no Pfam annotation and save the result.

     Prints the total number of FASTA entries, then writes a table with the
     columns ``UniProt ID`` and ``LC Score`` (sorted by descending score)
     for every entry whose identifier does not occur in the
     ``uniprot_acc`` column of the Pfam table.

     Args:
         fasta_fp: Path of the FASTA file with the protein sequences.
         pfam_fp: Path of the tab-separated Pfam table; it must contain a
             ``uniprot_acc`` column.
         fpo: Path of the tab-separated output file.
     """
     df = pd.read_csv(pfam_fp, sep='\t')
     pfam_ids = set(df['uniprot_acc'])
     pids, seqs = tools_fasta.fasta_to_id_seq(fasta_fp)
     print(len(pids))
     # Collect identifier and sequence together, in FASTA order. Deriving
     # the identifiers from a set difference would put them in arbitrary
     # order (and drop duplicates), so they would no longer line up with
     # the scores computed from the sequences.
     nopfam_ids = []
     nopfam_seqs = []
     for pid, seq in zip(pids, seqs):
         if pid not in pfam_ids:
             nopfam_ids.append(pid)
             nopfam_seqs.append(seq)
     ns = NormScore()
     # Assumes lc_norm_score returns one score per input sequence, in input
     # order -- TODO confirm against NormScore.
     scores = ns.lc_norm_score(nopfam_seqs)
     df_out = pd.DataFrame(
         {
             'UniProt ID': nopfam_ids,
             'LC Score': scores,
         },
         columns=['UniProt ID', 'LC Score'])
     df_out = df_out.sort_values(by='LC Score', ascending=False)
     df_out.to_csv(fpo, sep='\t')
Esempio n. 11
0
 def percent_pfam(self, fasta_fp, pfam_fp, fpo):
     """Tabulate the LC score and the Pfam fraction of every protein.

     For each FASTA entry, the rows of the Pfam table with a matching
     ``uniprot_acc`` are sorted by ``seq_start`` and passed to
     ``self.segment_seq``. The Pfam fraction is the share of the sequence
     length not covered by the returned segments (presumably the segments
     are the regions outside Pfam domains -- verify against segment_seq).
     The table is sorted by descending score and written tab-separated to
     ``fpo``; the mean Pfam fraction is printed afterwards.

     Args:
         fasta_fp: Path of the FASTA file with the protein sequences.
         pfam_fp: Path of the tab-separated Pfam table with the columns
             ``uniprot_acc`` and ``seq_start``.
         fpo: Path of the tab-separated output file.
     """
     pfam_table = pd.read_csv(pfam_fp, sep='\t')
     protein_ids, sequences = tools_fasta.fasta_to_id_seq(fasta_fp)
     pfam_fractions = []
     for protein_id, sequence in zip(protein_ids, sequences):
         rows = pfam_table[pfam_table['uniprot_acc'] == protein_id]
         rows = rows.sort_values(by='seq_start')
         segments = self.segment_seq(sequence, rows)
         segment_len = sum(len(segment) for segment in segments)
         full_len = len(sequence)
         fraction = float(full_len - segment_len) / float(full_len)
         pfam_fractions.append(fraction)
     lc_scores = NormScore().lc_norm_score(sequences)
     column_order = ['Uniprot ID', 'LC Score', 'Pfam Fraction']
     table = pd.DataFrame(
         {
             'Uniprot ID': protein_ids,
             'LC Score': lc_scores,
             'Pfam Fraction': pfam_fractions,
         },
         columns=column_order,
     )
     table = table.sort_values(by='LC Score', ascending=False)
     table.to_csv(fpo, sep='\t')
     print(np.mean(pfam_fractions))
 def get_pids(self, fasta):
     """Return the identifiers parsed from the FASTA file ``fasta``.

     The sequences parsed alongside them are discarded.
     """
     protein_ids, _ = tools_fasta.fasta_to_id_seq(fasta)
     return protein_ids