def get_offset(self, ref_file='~/References/HIV-HXB2.fasta', gene='protease'):
    '''Compute the offset of the consensus with respect to a reference gene.

    The region ``gene_coord[gene]`` of ``ref_file`` is aligned against
    ``self.cons`` with ``Alignment.needle_align`` (gap open 10.0, gap
    extend 0.5).  The ``start`` attribute of the resulting alignment is
    stored in ``self.offset`` and reported on stderr.

    ref_file -- reference sequence file handed to the aligner
    gene -- key of the module-level ``gene_coord`` mapping, whose value
            is unpacked as a (start, stop) pair

    Returns None; the result is left in ``self.offset``.
    '''
    import os
    import sys

    from pythonlib import Alignment

    outfile = 'ppp.tmp'
    start, stop = gene_coord[gene]
    # file[start:stop] -- presumably the EMBOSS sub-sequence notation
    usa_seq = ref_file + '[%d:%d]' % (start, stop)
    try:
        Alignment.needle_align(usa_seq, 'asis:%s' % self.cons, outfile,
                               go=10.0, ge=0.5)
        tal = Alignment.alignfile2dict([outfile], 'get_offset', 10.0, 0.5)
    finally:
        # never leave the scratch file behind, even if the alignment failed
        if os.path.exists(outfile):
            os.remove(outfile)

    # list() keeps this working where keys() returns a view
    ka = list(tal.keys())[0]
    this = tal[ka]['asis']
    # summary() is called before reading start, as in the original flow
    this.summary()
    self.offset = this.start
    print('Offset consensus w.r.t', ref_file, 'is', self.offset,
          file=sys.stderr)
def get_cons(self, plurality=0.1):
    '''Build a consensus with EMBOSS cons and project it onto the reference.

    ``cons`` is run on ``self.sup_file``; the first sequence it prints is
    upper-cased, stripped of ``N`` and aligned against ``self.ref_file``
    with ``Alignment.needle_align`` (gap open 10.0, gap extend 0.5).  The
    two aligned rows are then merged column by column:

    * where the second row has a gap, the character of the first row is
      used;
    * where the first row has a gap, the column is dropped;
    * otherwise the character of the second row is used.

    plurality -- value passed to ``cons -plurality``

    Returns the merged sequence as a string.
    Raises IndexError if ``cons`` printed no sequence.
    '''
    import os
    import subprocess

    from pythonlib import Alignment

    cline = 'cons -sequence %s -stdout -auto' % self.sup_file
    cline += ' -plurality %f' % plurality
    # universal_newlines gives a text pipe, which the FASTA parser needs
    proc = subprocess.Popen(cline, shell=True, bufsize=1024,
                            stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                            close_fds=True, universal_newlines=True)
    proc.stdin.close()  # nothing is ever sent to cons
    try:
        records = list(SeqIO.parse(proc.stdout, 'fasta'))
    finally:
        # close the pipe and reap the child so it does not linger
        proc.stdout.close()
        retcode = proc.wait()
    if not records:
        raise IndexError('cons produced no sequence (exit status %s)'
                         % retcode)

    strcons = str(records[0].seq).upper().replace('N', '')

    outfile = 'tmp.tmp'
    try:
        Alignment.needle_align(self.ref_file, 'asis:%s' % strcons,
                               outfile, go=10.0, ge=0.5)
        tal = Alignment.alignfile2dict([outfile], 'ref_cons_alignment',
                                       10.0, 0.5)
    finally:
        if os.path.exists(outfile):
            os.remove(outfile)

    ka = list(tal.keys())[0]
    this = tal[ka]['asis']

    # seq_a is presumably the reference row and seq_b the consensus row
    # (the order in which they were handed to needle_align) -- TODO confirm
    merged = []
    for char_a, char_b in zip(this.seq_a, this.seq_b):
        if char_b == '-':
            assert char_a != '-', 'gap-gap?'
            merged.append(char_a)
        elif char_a != '-':
            merged.append(char_b)
    return ''.join(merged)
def find_best_split(seq):
    '''Find the split point at which the two parts of a read align best.

    Candidate split points run from 25% to (below) 76% of the read length
    in steps of 25% of the length.  For each candidate the two parts are
    written to ``tmp1.fas`` / ``tmp2.fas``, aligned against
    ``all_clones.fas`` with ``Alignment.needle_align`` and, for each part,
    the highest-scoring alignment is picked.  The candidate with the
    highest summed score wins; on ties the later candidate is kept.

    seq -- record with ``id`` and ``seq`` attributes; the part of ``id``
           before the first ``#`` is used as FASTA header

    Returns the tuple
    ``(best_score, best_split, best_gaps, clones,
       al_start_1, al_stop_1, al_start_2, al_stop_2)``
    where ``clones`` is the pair of best-matching keys for the two parts
    and ``best_gaps`` the sum of their ``int_gaps``.

    Raises ValueError if the read is too short to be split or if no
    candidate reaches a non-negative summed score.
    '''
    import operator
    import os

    seq_len = len(seq)
    low_lim = int(0.25 * seq_len)
    up_lim = int(0.76 * seq_len)
    step = int(0.25 * seq_len)
    if step < 1:
        raise ValueError('sequence of length %d is too short to split'
                         % seq_len)
    ref_genome = 'all_clones.fas'
    header = '>%s_1\n' % seq.id.split('#')[0]
    by_score = operator.itemgetter(1)

    best_score = 0
    found = False
    for split in range(low_lim, up_lim, step):
        # slice at the current candidate (the loop variable), not at the
        # fixed lower limit
        s1 = seq.seq[:split]
        s2 = seq.seq[split:]

        with open('tmp1.fas', 'w') as h1:
            h1.write(header)
            h1.write(str(s1) + '\n')
        # NOTE(review): the second part is also labelled "_1"; kept as is
        # because the label is runtime text -- confirm whether "_2" was meant
        with open('tmp2.fas', 'w') as h2:
            h2.write(header)
            h2.write(str(s2) + '\n')

        Alignment.needle_align('tmp1.fas', ref_genome, 'tmp1.needle')
        alset_1 = Alignment.alignfile2set(['tmp1.needle'], 'split_1', 6.0, 3.0)
        os.unlink('tmp1.needle')

        Alignment.needle_align('tmp2.fas', ref_genome, 'tmp2.needle')
        alset_2 = Alignment.alignfile2set(['tmp2.needle'], 'split_2', 6.0, 3.0)
        os.unlink('tmp2.needle')

        hits_1 = alset_1[list(alset_1.keys())[0]]
        hits_2 = alset_2[list(alset_2.keys())[0]]
        # max() returns the first of equally good hits, like nlargest did
        clone_1, score_1 = max(
            [(name, al.score) for name, al in hits_1.items()], key=by_score)
        clone_2, score_2 = max(
            [(name, al.score) for name, al in hits_2.items()], key=by_score)

        if score_1 + score_2 >= best_score:
            found = True
            best_score = score_1 + score_2
            best_split = split
            clones = clone_1, clone_2
            al_1 = hits_1[clone_1]
            al_2 = hits_2[clone_2]
            # summary() is called before reading the statistics below
            al_1.summary()
            al_2.summary()
            best_gaps = al_1.int_gaps + al_2.int_gaps
            al_start_1, al_stop_1 = al_1.start, al_1.stop
            al_start_2, al_stop_2 = al_2.start, al_2.stop

        del alset_1
        del alset_2

    if not found:
        raise ValueError('no split point produced a non-negative score')

    return (best_score, best_split, best_gaps, clones,
            al_start_1, al_stop_1, al_start_2, al_stop_2)
def find_closest_here(reads_file):
    '''Align ``ref.fasta`` against ``reads_file`` and collect statistics.

    EMBOSS ``needle`` is run (gap open 6.0, gap extend 3.0, markx10
    output) into a temporary file, the output is parsed with
    ``Alignment.alignfile2dict`` and the alignments stored under its first
    key are summarised.

    reads_file -- value given to ``needle -bsequence``

    Returns four parallel lists ``(ig, this, mm, ident_2)`` holding, for
    each alignment ``v``:

    * ``v.int_gaps``
    * ``v.ident / (v.stop - v.start + 1)``
    * ``v.mismatch``
    * ``v.ident / (v.stop - v.start + 1 - v.int_gaps)``
    '''
    import subprocess
    import tempfile

    from pythonlib import Alignment

    ref_file = 'ref.fasta'
    out = tempfile.NamedTemporaryFile()
    try:
        cmline = ('needle -asequence %s -bsequence %s '
                  '-gapopen 6.0 -gapextend 3.0 -auto -adesshow3 -out %s '
                  '-aformat3 markx10' % (ref_file, reads_file, out.name))
        subprocess.call(cmline, shell=True)
        # parse once; the original parsed the same file a second time
        parsed = Alignment.alignfile2dict([out.name], 'n', 6.0, 3.0,
                                          Verbose=False)
    finally:
        # closing the handle is what deletes the temporary file
        out.close()

    alignments = parsed[list(parsed.keys())[0]]

    ig = []
    this = []
    mm = []
    ident_2 = []
    for v in alignments.values():
        v.summary()
        aligned_len = v.stop - v.start + 1
        ig.append(v.int_gaps)
        this.append(float(v.ident) / aligned_len)
        mm.append(v.mismatch)
        ident_2.append(float(v.ident) / (aligned_len - v.int_gaps))

    return ig, this, mm, ident_2
def find_closest(hr):
    '''Return the reference closest to a haplotype, if it is unambiguous.

    The haplotype is aligned with EMBOSS ``needle`` (gap open 10.0, gap
    extend 1.0) against every sequence of the reference file and the
    mismatch fraction ``mismatch / (stop - start + 1)`` is computed for
    each alignment.  The best reference is accepted when its mismatch
    fraction is at most ``abs_thresh`` (0.1) and the runner-up is worse by
    at least ``diff_thresh`` (0.0125, an absolute difference).  When only
    one reference was aligned there is no runner-up and only
    ``abs_thresh`` applies.

    hr -- pair ``(hap, ref_file)``: the haplotype sequence as a string and
          the value given to ``needle -bsequence``

    Returns
        ``(id, mismatch_fraction, mismatches, int_gaps)`` of the best
        reference when it is accepted, otherwise the pair
        ``(None, int_gaps)`` of the best reference.  Callers must cope
        with the two different lengths.

    Raises IndexError if no alignment was produced.
    '''
    import heapq
    import operator
    import subprocess
    import tempfile

    from pythonlib import Alignment

    diff_thresh = 0.0125
    abs_thresh = 0.1

    hap, ref_file = hr
    out = tempfile.NamedTemporaryFile()
    try:
        cmline = ('needle -asequence asis:\'%s\' -bsequence %s '
                  '-gapopen 10.0 -gapextend 1.0 -auto -adesshow3 -out %s '
                  '-aformat3 markx10' % (hap, ref_file, out.name))
        subprocess.call(cmline, shell=True)
        # NOTE(review): 6.0/3.0 here differ from the 10.0/1.0 given to
        # needle above; kept as in the original -- confirm what
        # alignfile2dict uses them for
        d = Alignment.alignfile2dict([out.name], 'n', 6.0, 3.0,
                                     Verbose=False)['asis']
    finally:
        # closing the handle is what deletes the temporary file
        out.close()

    this = {}
    mm = {}
    gaps = {}
    for v in d.values():
        v.summary()
        this[v.id_b] = float(v.mismatch) / (v.stop - v.start + 1)
        mm[v.id_b] = v.mismatch
        gaps[v.id_b] = v.int_gaps

    best2 = heapq.nsmallest(2, this.items(), operator.itemgetter(1))
    best_id, best_dist = best2[0]
    if len(best2) > 1:
        delta = best2[1][1] - best_dist
    else:
        # a single candidate cannot be confused with a runner-up
        delta = float('inf')

    if delta >= diff_thresh and best_dist <= abs_thresh:
        return best_id, best_dist, mm[best_id], gaps[best_id]
    return None, gaps[best_id]
def alignedvariants(self, threshold=0.9):
    '''Collect the well-supported haplotypes, projected onto the reference.

    Every record of ``self.seq_obj`` must carry
    ``posterior=<float> ave_reads=<float>`` in its description.  Records
    with posterior below ``threshold`` or fewer than one average read are
    skipped.  Each remaining sequence (leading and trailing ``-`` removed)
    is aligned against ``self.ref`` with EMBOSS ``needle`` and merged with
    it column by column:

    * where the haplotype row has a gap, the reference character is used;
    * where the reference row has a gap, the column is dropped;
    * otherwise the haplotype character is used.

    Identical merged sequences have their ``ave_reads`` summed; one
    ``SeqRecord`` per distinct sequence (id = SHA-224 of the sequence,
    description ``ave_reads=<sum>``) is appended to ``self.dna_seqs``.

    threshold -- minimal posterior for a haplotype to be kept

    Returns ``self.dna_seqs``.
    Raises ValueError if a description lacks the posterior/ave_reads
    fields; exits the program if ``needle`` cannot be run or is killed by
    a signal.
    '''
    import hashlib
    import os
    import re
    import subprocess
    import sys

    from Bio.Emboss.Applications import NeedleCommandline
    from pythonlib import Alignment

    desc_re = re.compile(r'posterior=(.*)\s*ave_reads=(.*)')
    files = []
    var_dict = {}

    for i, s in enumerate(self.seq_obj):
        m_obj = desc_re.search(s.description)
        if m_obj is None:
            raise ValueError('no posterior/ave_reads in description: %r'
                             % s.description)
        post, ave_reads = map(float, (m_obj.group(1), m_obj.group(2)))
        if post < threshold or ave_reads < 1.:
            continue
        if post > 1.0:
            print('WARNING: posterior=', post, file=sys.stderr)

        outfile = 'tmp%d.needle' % i
        files.append(outfile)
        needle_cline = NeedleCommandline(
            asequence='asis:%s' % self.ref,
            bsequence='asis:%s' % str(s.seq).strip('-'),
            outfile=outfile, gapopen=10.0, gapextend=0.5, aformat='markx10')
        needle_cline.auto = True

        try:
            try:
                retcode = subprocess.call(str(needle_cline), shell=True)
            except OSError as ee:
                sys.exit('Execution of needle failed: %s' % ee)
            # a negative return code means the child was killed by a signal
            if retcode < 0:
                sys.exit('Child needle was terminated by signal %d'
                         % -retcode)
            tal = Alignment.alignfile2dict([outfile], 'support_seqs%d' % i,
                                           10.0, 0.5, Verbose=False)
        finally:
            if os.path.exists(outfile):
                os.remove(outfile)

        ka = list(tal.keys())[0]
        this = tal[ka]['asis']

        # seq_a is presumably the reference row and seq_b the haplotype
        # row (the order given to needle) -- TODO confirm
        merged = []
        for char_a, char_b in zip(this.seq_a, this.seq_b):
            if char_b == '-':
                assert char_a != '-', 'gap-gap?'
                merged.append(char_a)
            elif char_a != '-':
                merged.append(char_b)
        ws = ''.join(merged)
        var_dict[ws] = var_dict.get(ws, 0) + ave_reads

    for k, v in var_dict.items():
        ts = Seq(k, IUPAC.unambiguous_dna)
        # hashlib needs bytes where str is unicode; the digest is unchanged
        tsr = SeqRecord(ts, id=hashlib.sha224(k.encode('ascii')).hexdigest(),
                        name='Reconstructed local hap')
        tsr.description = 'ave_reads=%f' % v
        self.dna_seqs.append(tsr)

    print('%d haplotypes have support >=%f' % (len(files), threshold),
          file=sys.stderr)
    return self.dna_seqs
def count_codons(haps):
    '''Translate haplotypes to protease sequences and tally the mutations.

    Each element of ``haps`` is passed to ``align_codons``, which yields
    ``(start, residues, freq)`` with a 1-based nucleotide ``start``;
    ``(None, None, ...)`` results are skipped.  For every other result:

    * the nucleotides, embedded in the HXB2 protease nucleotide sequence,
      are written to ``all.dat`` together with the rounded frequency;
    * the in-frame part is translated and embedded in the HXB2 protease
      amino-acid sequence; the result is printed with its frequency;
    * per-position amino-acid counts are accumulated;
    * the embedded sequence is aligned to the wild type and the list of
      substitutions (``<wt><column><new>``) becomes its signature.

    Side effects: writes ``all.dat`` and ``degeneracy.pck`` (pickled
    number of haplotypes per signature), uses the scratch file ``tmp``,
    prints a summary and calls ``plot_variation`` with the normalised
    per-position counts.

    haps -- sequence of items accepted by ``align_codons``; item ``[1]``
            of each is summed as the read count

    Returns the list of ``(signature, frequency)`` pairs sorted by
    decreasing frequency.
    '''
    import os
    import pickle
    from operator import itemgetter

    from Bio.Seq import translate
    from pythonlib import Alignment
    from pythonlib import mystats

    latex = False  # print latex table
    count = [{} for _ in range(102)]
    hap_freq = {}
    degeneracy = {}
    mask_mupos = []  # [10, 11, 22, 25, 32, 46, 58, 62, 67, 74, 89]
    mupos = []

    # These sequences are HXB2 proteases
    wt_protease = ('PQVTLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKMIGGIGG'
                   'FIKVRQYDQILIEICGHKAIGTVLVGPTPVNIIGRNLLTQIGCTLNF')
    wt_protease_nt = (
        'CCTCAGGTCACTCTTTGGCAACGACCCCTCGTCACAATAAAGATAGGGGGGCAACTAAAGGAAGCT'
        'CTATTAGATACAGGAGCAGATGATACAGTA'
        'TTAGAAGAAATGAGTTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATC'
        'AAAGTAAGACAGTATGATCAGATACTCATAGAAATCTGTGGACATAAAGCTA'
        'TAGGTACAGTATTAGTAGGACCTACACCTGTCAACATAATTGGAAGAAATCTGTTGACTCAGATTG'
        'GTTGCACTTTAAATTTT')
    protease = wt_protease

    ac_res = [align_codons(h) for h in haps]

    oh = open('all.dat', 'w')
    try:
        for start, residues, freq in ac_res:
            # test the sentinel BEFORE doing arithmetic on start
            if start is None and residues is None:
                continue
            start -= 1  # from human (1-based) to pythonic (0-based)

            oh.write('%d %s\n' % (
                round(freq),
                wt_protease_nt[:start] + residues +
                wt_protease_nt[len(residues) + start:]))

            # drop the leading bases of an incomplete codon
            frame = start % 3
            if frame == 0:
                read = residues
            elif frame == 1:
                read = residues[2:]
            else:
                read = residues[1:]

            try:
                aa = translate(read)  # Biopython
            except Exception:
                print('error: read %s' % read)
                continue

            # 1-based index of the first translated amino acid
            if frame == 0:
                start_a = start // 3 + 1
            else:
                start_a = start // 3 + 2
            stop_a = len(aa) + start_a + 1
            this_hap = str(protease[:start_a - 1] + aa +
                           protease[stop_a - 2:])
            # this is used for resistance prediction, whole haplotype and
            # reads
            print('%s %s' % (this_hap.ljust(100), str(freq).ljust(8)))

            for i, c in enumerate(this_hap):
                count[i + 1][c] = count[i + 1].get(c, 0) + freq

            try:
                Alignment.needle_align('asis:%s' % wt_protease,
                                       'asis:%s ' % this_hap,
                                       'tmp', 10.0, 0.5)
                d = Alignment.alignfile2dict(['tmp'], 'n', 10.0, 0.5,
                                             Verbose=False)['asis']['asis']
            finally:
                if os.path.exists('tmp'):
                    os.remove('tmp')

            mutations = []
            # pos counts alignment columns, starting from 1
            for pos, pair in enumerate(zip(d.seq_a, d.seq_b), 1):
                if '-' in pair:
                    continue
                if pair[0] != pair[1]:
                    mutations.append(pair[0] + str(pos) + pair[1])
                    if pos not in mask_mupos:
                        mupos.append(pos)

            signature = ', '.join(mutations)
            hap_freq[signature] = hap_freq.get(signature, 0.0) + freq
            degeneracy[signature] = degeneracy.get(signature, 0) + 1
    finally:
        oh.close()

    print('')
    for k, v in hap_freq.items():
        print('%s   %s' % (str(v).ljust(15), k))

    # each mutated column is listed once
    mupos = sorted(set(mupos))

    hf_sorted = sorted(hap_freq.items(), key=itemgetter(1), reverse=True)
    tot_reads = sum(h[1] for h in haps)
    tot_hap = sum(hap_freq.values())
    print('Tot reads after %s' % (tot_reads,))
    print('Tot %s' % (tot_hap,))
    print('Simpson\'s index on amino acid sequences = %f +/- %f'
          % mystats.Simpson(list(hap_freq.values())))

    # pickle streams are binary data
    pck = open('degeneracy.pck', 'wb')
    try:
        pickle.dump(degeneracy, pck)
    finally:
        pck.close()

    # turn the per-position counts into frequencies (true division)
    for c in count:
        ts = float(sum(c.values()))
        for k in c:
            c[k] = c[k] / ts

    plot_variation(count)

    if not latex:
        return hf_sorted

    print('')
    print('|c' * (1 + len(mupos)))
    cells = ['%s%d & ' % (wt_protease[i - 1], i) for i in mupos]
    print(' '.join(cells + ['']))

    return hf_sorted
def main():
    '''Look for reads whose halves match the clones better than the whole.

    Usage: ``check_recombination.py reads_file clones_file``.

    Reads longer than 200 are aligned against every clone three times: as
    a whole, first half only and second half only.  The three alignment
    sets are cached in ``<reads_file>-check-reads_{total,1,2}.pck`` (with
    ``.`` replaced by ``U`` and ``/`` by ``-``) and reloaded from there on
    later runs.  For each read the best clone and its score per aligned
    position are taken for the whole read (``bt``) and the halves (``b1``,
    ``b2``), and ``relative_gain = (b1 + b2 - 2 * bt) / (b1 + b2)`` is
    computed.  A read is "ambiguous" when the three best clones are not
    all the same, and an "outlier" when it is ambiguous with
    ``relative_gain > 0.05`` and ``b1``, ``b2`` both above 4.5.

    Side effects: writes ``tmp*.fas`` / ``tmp*.needle`` scratch files and
    the pickle caches, writes the outliers to ``outliers.fas``, reports on
    stderr and calls ``plot_amb_hist`` with the relative gains of the
    unambiguous and of the ambiguous reads.
    '''
    import heapq
    import operator
    import os
    import sys
    import time
    from multiprocessing import cpu_count

    try:
        import cPickle
    except ImportError:  # Python 3 has no cPickle module
        import pickle as cPickle

    from Bio import SeqIO
    from pythonlib import Alignment
    from pythonlib import pprocess

    def log(message):
        # One line on stderr.
        sys.stderr.write(message + '\n')

    try:
        n_proc = cpu_count()
    except NotImplementedError:
        n_proc = 4
    HPP = cPickle.HIGHEST_PROTOCOL
    min_len = 200

    try:
        reads_file, clones_file = sys.argv[1].rstrip('/'), sys.argv[2]
    except IndexError:
        sys.exit('usage: check_recombination.py reads_file clones_file')

    cache_prefix = reads_file.replace('.', 'U').replace('/', '-')

    def cache_name(tag):
        # Name of the pickle caching the alignment set `tag`.
        return '%s-check-reads_%s.pck' % (cache_prefix, tag)

    def load_cached(tag):
        # Load one cached alignment set.
        log(' loading file '.center(60, '-'))
        # NOTE: unpickling executes code; only use caches you wrote yourself
        handle = open(cache_name(tag), 'rb')
        try:
            return cPickle.load(handle)
        finally:
            handle.close()

    def write_half(out_file, first_half):
        # Write one half of every read of tmp_reads.fas to out_file.
        handle = open('tmp_reads.fas')
        try:
            records = list(SeqIO.parse(handle, 'fasta'))
        finally:
            handle.close()
        for rec in records:
            middle = len(rec) // 2
            if first_half:
                rec.seq = rec.seq[:middle]
            else:
                rec.seq = rec.seq[middle:]
        handle = open(out_file, 'w')
        try:
            SeqIO.write(records, handle, 'fasta')
        finally:
            handle.close()

    def parse_and_cache(suffix, tag):
        # Parse the needle outputs ending in `suffix` and pickle the result.
        # NOTE(review): this picks up every matching file in the working
        # directory, including leftovers of earlier runs
        files = [f for f in os.listdir('./')
                 if f.startswith('tmp') and f.endswith(suffix + '.needle')]
        al_set = Alignment.alignfile2set(files, 'total_read', 6.0, 3.0)
        handle = open(cache_name(tag), 'wb')  # pickles are binary data
        try:
            cPickle.dump(al_set, handle, HPP)
        finally:
            handle.close()
        return al_set

    def align_and_cache():
        # Run all alignments and return the (total, first, second) sets.
        # reads are considered already aligned
        handle = open('tmp_reads.fas', 'w')
        try:
            SeqIO.write(seqlist, handle, 'fasta')
        finally:
            handle.close()
        write_half('tmp_reads_1.fas', True)
        write_half('tmp_reads_2.fas', False)

        # one FASTA file per clone
        handle = open(clones_file)
        try:
            clones_list = list(SeqIO.parse(handle, 'fasta'))
        finally:
            handle.close()
        tmp_files = []
        for i, c in enumerate(clones_list):
            log(' clone %s '.center(60, '-') % c.id)
            tfn = 'tmp%d.fas' % i
            tmp_files.append(tfn)
            th = open(tfn, 'w')
            try:
                th.write('>%s\n' % c.id)
                th.write('%s' % str(c.seq))
            finally:
                th.close()

        # parallelism
        queue = pprocess.Queue(limit=n_proc)
        ral_par = queue.manage(pprocess.MakeParallel(Alignment.needle_align))
        jobs = (('tmp_reads.fas', '-total'),
                ('tmp_reads_1.fas', '-1'),
                ('tmp_reads_2.fas', '-2'))
        for reads_fasta, suffix in jobs:
            for tf in tmp_files:
                ral_par(tf, reads_fasta, tf.split('.')[0] + suffix + '.needle')
        for res in queue:
            # Verbose is expected to exist at module level
            if Verbose:
                log('%s %s' % (res[0], res[1]))

        return (parse_and_cache('-total', 'total'),
                parse_and_cache('-1', '1'),
                parse_and_cache('-2', '2'))

    f_fasta = open(reads_file)
    try:
        tmp_seqlist = list(SeqIO.parse(f_fasta, 'fasta'))
    finally:
        f_fasta.close()
    log(' %d reads in the original file '.center(60, '-') % len(tmp_seqlist))
    seqlist = [s for s in tmp_seqlist if len(s) > min_len]
    log(' %d reads are longer than %d '.center(60, '-')
        % (len(seqlist), min_len))

    try:
        t = time.time()
        al_set_total = load_cached('total')
        al_set_1 = load_cached('1')
        al_set_2 = load_cached('2')
        log(' pickle objects loaded in %d seconds '.center(60, '-')
            % (time.time() - t))
    except Exception:
        # any failure to load counts as a cache miss, but Ctrl-C and
        # sys.exit() are no longer swallowed
        log(' pickle objects not found, aligning '.center(60, '-'))
        al_set_total, al_set_1, al_set_2 = align_and_cache()

    ial_set_total = invert_keys(al_set_total)
    ial_set_1 = invert_keys(al_set_1)
    ial_set_2 = invert_keys(al_set_2)
    del al_set_total
    del al_set_1
    del al_set_2

    r_keys = ial_set_total.keys()
    lost = len(seqlist) - len(r_keys)
    assert lost == 0, 'lost' + str(lost) + 'reads'

    ambiguous = 0
    total_delta = []
    amb_delta = []
    outliers = []
    by_score = operator.itemgetter(1)

    log('Total reads %s' % len(r_keys))

    for k in ial_set_total:
        total = ial_set_total[k]
        s1 = ial_set_1[k]
        s2 = ial_set_2[k]

        best2_total = heapq.nlargest(
            2, [(name, al.score) for name, al in total.items()], by_score)
        best2_s1 = heapq.nlargest(
            2, [(name, al.score) for name, al in s1.items()], by_score)
        best2_s2 = heapq.nlargest(
            2, [(name, al.score) for name, al in s2.items()], by_score)

        clone_t = best2_total[0][0]
        clone_1 = best2_s1[0][0]
        clone_2 = best2_s2[0][0]

        al_t = total[clone_t]
        al_1 = s1[clone_1]
        al_2 = s2[clone_2]
        # summary() is called before start/stop are read
        al_t.summary()
        al_1.summary()
        al_2.summary()
        len_t = al_t.stop - al_t.start + 1
        len_1 = al_1.stop - al_1.start + 1
        len_2 = al_2.stop - al_2.start + 1

        # score per aligned position; float() guards against int division
        bt = float(best2_total[0][1]) / len_t
        b1 = float(best2_s1[0][1]) / len_1
        b2 = float(best2_s2[0][1]) / len_2
        relative_gain = (b1 + b2 - 2 * bt) / (b1 + b2)

        # sanity check: dump the alignments and stop on absurd lengths
        if abs(len_t - len_1 - len_2) > 5000:
            print(best2_total)
            print(best2_s1)
            print(best2_s2)
            print(len_t)
            print(len_1)
            print(len_2)
            print(al_t.seq_a)
            print(al_t.seq_b)
            print(al_1.seq_a)
            print(al_1.seq_b)
            print(al_2.seq_a)
            print(al_2.seq_b)
            sys.exit()

        # NOTE(review): the else branch is attached to this test (it fills
        # the non-ambiguous counterpart of amb_delta) -- confirm against
        # the original layout
        if clone_t != clone_1 or clone_t != clone_2:
            amb_delta.append(relative_gain)
            ambiguous += 1
            if relative_gain > 0.05 and b1 > 4.5 and b2 > 4.5:
                outliers.append(k)
        else:
            total_delta.append(relative_gain)

    log('%s potentially ambiguous' % ambiguous)
    log('%s outliers' % len(outliers))

    # write the outliers
    handle = open(reads_file)
    try:
        tmp_dict = SeqIO.to_dict(SeqIO.parse(handle, 'fasta'))
    finally:
        handle.close()
    reads_dict = {}
    for read_id in tmp_dict:
        reads_dict[read_id.split('#')[0]] = tmp_dict[read_id]
    # reads_dict is keyed by the id up to '#', so truncate the same way
    out_list = [reads_dict[r.split('#')[0]] for r in outliers]
    out_handle = open('outliers.fas', 'w')
    try:
        SeqIO.write(out_list, out_handle, 'fasta')
    finally:
        out_handle.close()

    plot_amb_hist(total_delta, amb_delta)