# NOTE(review): this statement is the tail of a cell that begins above this
# chunk; `handle`, `fasta_reader` and `ltr_seqs` are set up there -- confirm
# the enclosing block/indentation when applying.
ltr_seqs += list(fasta_reader(handle))
print(len(ltr_seqs))

# <codecell>

# Reference sequence that every entry of `ltr_seqs` is aligned against.
# The literal is wrapped over several lines; the line breaks are stripped
# right after the closing quotes.
conb_ltr = """TGGAAGGGCTAATTCACTCCCAACGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAA
GGCTACTTCCCTGATTAGCAGAACTACACACCAGGGCCAGGGATCAGATATCCACTGACCTTTGGATGGTGCTACAAGC
TAGTACCAGTTGAGCCAGAGAAGTTAGAAGAAGCCAACAAAGGAGAGAACACCAGCTTGTTACACCCTGTGAGCCTGCA
TGGAATGGATGACCCGGAGAGAGAAGTGTTAGAGTGGAGGTTTGACAGCCGCCTAGCATTTCATCACATGGCCCGAGAG
CTGCATCCGGAGTACTTCAAGAACTGCTGACATCGAGCTTGCTACAAGGGACTTTCCGCTGGGGACTTTCCAGGGAGGC
GTGGCCTGGGCGGGACTGGGGAGTGGCGAGCCCTCAGATCCTGCATATAAGCAGCTGCTTTTTGCCTGTACTGGGTCTC
TCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCC
TTGAGTGCTTCAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGTAACTAGAGATCCCTCAGACCCTTTTAGTCAGTG
TGGAAAATCTCT""".replace('\n', '')

# Materialise the alignment results; each item is unpacked below as a
# (key, aligned_sequence) pair.
ltr_align = list(seq_align_to_ref(ltr_seqs, conb_ltr, max_workers=20))

# <codecell>

ltr_bin_align = []
align_inds = []
for key, seq in ltr_align:
    # Keys are '-'-separated; keep the pieces as a tuple.
    align_inds.append(tuple(key.split('-')))

    # Per-position comparison against the reference:
    #   NaN -> gap before the first aligned character (no coverage yet)
    #   1.0 -> character matches the reference
    #   0.0 -> mismatch, including gaps after the alignment has started
    hit_first = False
    bins = []
    for g, cb in zip(seq, conb_ltr):
        # `not`, not `~`: bitwise inversion of a Python bool gives -1 / -2,
        # both truthy, which would treat every gap as a leading gap.
        if not hit_first and g == '-':
            bins.append(np.nan)
            continue
        hit_first = True
        bins.append(1.0 if g == cb else 0.0)
# Build `oseqs`: one record per aligned sequence, with the keys
# 'Accession', 'Seq' and 'Prot'.  A previously pickled result is reused
# when 'aligned_seq.pkl' exists; otherwise every input file is processed.
oseqs = []
if os.path.exists('aligned_seq.pkl'):
    # Pickles must be read in binary mode: text mode can corrupt
    # binary-protocol pickles on platforms that translate newlines.
    # NOTE: pickle.load executes arbitrary code from the file -- only load
    # caches produced locally.
    with open('aligned_seq.pkl', 'rb') as handle:
        oseqs = pickle.load(handle)
else:
    # The index set does not change between files, so build it once.
    trop_index = set(trop_data.index)
    for f in files:
        # File names look like '<prefix>-<prot>[...].<ext>'; the second
        # '-'-separated field of the stem is the protein name, with '_'
        # standing in for '-'.
        fname = f.rsplit(os.sep, 1)[-1].split('.')[0]
        parts = fname.split('-')
        prot_name = parts[1].replace('_', '-')
        if prot_name == 'V3':
            continue

        # 'LTR' is the only input passed with trans=False.
        with open(f) as handle:
            prot_seqs = list(trans_seq(handle, trop_index,
                                       trans=prot_name != 'LTR'))
        # Same output as the Python 2 statement `print prot_name, len(...)`.
        print('%s %s' % (prot_name, len(prot_seqs)))

        aligned_seqs = seq_align_to_ref(prot_seqs, ref_seqs[prot_name],
                                        max_workers=20)
        for name, seq in aligned_seqs:
            oseqs.append({
                'Accession': name,
                'Seq': seq,
                'Prot': prot_name,
            })

# <codecell>

# One row per accession, one column per protein, cells hold the aligned
# sequence ('first' resolves duplicate Accession/Prot pairs).
# NOTE(review): `rows=` / `cols=` are the legacy pandas spellings; newer
# pandas releases expect `index=` / `columns=` -- confirm the target version.
aligned_seqs = pd.pivot_table(pd.DataFrame(oseqs),
                              rows='Accession',
                              cols='Prot',
                              values='Seq',
                              aggfunc='first')
------------AGCTGCATCCGGAGTACTTC---------AAGAACTGCT----------------------------- -------GACATCGA------------------------GCTTG---CT----------------------ACAA---GG GACTTTCCGCTGGGGACTTTCCAG-------------GGAGGCGTGGCCTGGGCGGGACT---GGGGAGTGGCGA---GC CCTCAGATCCTGCATATAAGCAGC---TGCTTTTTGCCTGTACTGGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAG CTCTCTGGCT---AACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCT---TGAGTGCTTC---AAGTAGTGTG TGC---CCGTCTG---TTGTGTGACTCTGGT---AACTAGAGATCCC---TCAGAC---CCT---TTTAGTCAGTGTGG- --AAAATCTCT""".replace( "\n", "" ).replace( "-", "" ) # <codecell> aln_seqs = [] for gi, aln_seq in seq_align_to_ref(ltr_seqs, hxb2, max_workers=50): try: aln_seqs.append((gi2date[gi], aln_seq)) except KeyError: pass # <codecell> sorted_alns = sorted(aln_seqs, key=lambda x: x[0]) # <codecell> from itertools import groupby from collections import defaultdict found_seqs = set()