# Step 4: minimal matching, done by mm.py on the corrected .rest file
sholog.debug('running mm.py')
mm.main('%s_cor.rest' % in_stem, maxhaplo=200)

# Step 5: EM frequency estimation with the external freqEst binary
# (output: sample.$nr.popl)
sholog.debug('running freqEst')
my_prog = os.path.join(dn, 'freqEst')
my_arg = " -f %s_cor" % in_stem
assert os.path.isfile('%s_cor.rest' % in_stem), \
    'File %s_cor.rest not found' % in_stem
retcode_em = run_child(my_prog, my_arg)
if retcode_em:
    # any non-zero return code from the child aborts the whole run
    sholog.error('freqEst did not return 0')
    sys.exit('Something went wrong in the EM step')
sholog.debug('freqEst exited successfully')

# Step 6: single nucleotide variants are parsed by snv.py
sholog.debug('running snv.py')
snv.main(
    reference=options.f,
    bam_file=options.b,
    sigma=options.i,
    increment=options.w / options.s,
)

# Collect every SNV* output under snv/; an already existing directory
# is silently reused.
try:
    os.mkdir('snv')
except OSError:
    pass
for snv_file in glob.glob('./SNV*'):
    shutil.move(snv_file, 'snv/')
def main(in_bam='', in_fasta='', win_length=201, win_shifts=3, region='',
         max_coverage=10000, alpha=0.1, keep_files=True):
    '''
    Performs the error correction analysis, running diri_sampler
    and analyzing the result

    Parameters:
        in_bam: path of the alignment file; must exist, its base name
            (last extension dropped) is the stem of the '.cor.fas' output
        in_fasta: path of the reference file; must exist
        win_length: window size, must be divisible by win_shifts
        win_shifts: number of shifts; the window increment is
            win_length // win_shifts
        region: forwarded unchanged to windows()
        max_coverage: must be at least win_length; the per-window limit
            handed to windows() is max_coverage // win_length
        alpha: forwarded to win_to_run()
        keep_files: when true, intermediate window files are gzipped
            into per-type subdirectories, otherwise they are removed

    Side effects visible in this function: logs to ./dec.log, writes
    '<stem>.cor.fas' and 'proposed.dat', prints the original reads and the
    histogram of internal 'X' counts to stdout, and moves the SNV* files
    into a fresh 'snv' directory.

    Exits through sys.exit() on invalid options, missing input files,
    a non-zero return of windows(), or when no aligned reads are found.
    '''
    from multiprocessing import Pool, cpu_count
    import shutil
    import glob
    import time
    import snv

    def _archive(pattern, subdir):
        '''Gzip non-empty files matching pattern into subdir, drop empty
        ones.'''
        for path in glob.glob(pattern):
            if os.stat(path).st_size > 0:
                gzf = gzip_file(path)
                # replace the archive left behind by a previous run
                try:
                    os.remove('%s/%s' % (subdir, gzf))
                except OSError:
                    pass
                shutil.move(gzf, '%s/' % subdir)
            else:
                os.remove(path)

    # set logging level
    declog.setLevel(logging.DEBUG)
    # This handler writes everything to a file.
    LOG_FILENAME = './dec.log'
    hl = logging.handlers.RotatingFileHandler(LOG_FILENAME, 'w',
                                              maxBytes=100000, backupCount=5)
    # adjacent literals instead of a backslash inside the string, which
    # leaked stray characters into every log record
    f = logging.Formatter("%(levelname)s %(asctime)s "
                          "%(funcName)s %(lineno)d %(message)s")
    hl.setFormatter(f)
    declog.addHandler(hl)
    declog.info(' '.join(sys.argv))

    # check options
    if win_length % win_shifts != 0:
        sys.exit('Window size must be divisible by win_shifts')
    if win_min_ext < 1 / float(win_shifts):
        declog.warning('Some bases might not be covered by any window')
    if max_coverage // win_length < 1:
        sys.exit('Please increase max_coverage')
    if not os.path.isfile(in_bam):
        sys.exit("File '%s' not found" % in_bam)
    if not os.path.isfile(in_fasta):
        sys.exit("File '%s' not found" % in_fasta)

    # explicit floor division: same result as '/' on Python 2 integers,
    # and still an integer on Python 3
    incr = win_length // win_shifts
    max_c = max_coverage // win_length

    # run b2w
    retcode = windows((in_bam, in_fasta, win_length, incr,
                       win_min_ext * win_length, max_c, region))
    # compare by value: 'is not 0' tests object identity
    if retcode != 0:
        sys.exit('b2w run not successful')

    aligned_reads = parse_aligned_reads('reads.fas')
    if not aligned_reads:
        sys.exit('No aligned reads found in reads.fas')
    # the first read is used to derive the length of the region
    r = next(iter(aligned_reads))
    gen_length = aligned_reads[r][1] - aligned_reads[r][0]

    if win_length > gen_length:
        sys.exit('The window size must be smaller than the genome region')

    declog.info('%s reads are being considered' % len(aligned_reads))

    for k in aligned_reads:
        rec = aligned_reads[k]
        # the fifth slot starts empty and collects the corrected bases
        to_correct[k] = [rec[0], rec[1], rec[2], rec[3], []]

    ############################################
    # Now the windows and the error correction #
    ############################################

    runlist = win_to_run(alpha)
    declog.info('will run on %d windows' % len(runlist))
    # run diri_sampler on all available processors but one
    max_proc = max(cpu_count() - 1, 1)
    pool = Pool(processes=max_proc)
    try:
        pool.map(run_dpm, runlist)
    finally:
        # release the worker processes, which were never reclaimed before
        pool.close()
        pool.join()

    # prepare directories
    if keep_files:
        for sd_name in ['debug', 'sampling', 'freq', 'support',
                        'corrected', 'raw_reads']:
            try:
                os.mkdir(sd_name)
            except OSError:
                pass

    # parse corrected reads
    proposed = {}
    # the third item is alpha; in future it might differ on each window
    for winFile, j, _alpha in runlist:
        parts = winFile.split('.')[0].split('-')
        chrom = '-'.join(parts[1:-2])
        beg = parts[-2]
        end = parts[-1]
        declog.info('reading windows for start position %s' % beg)
        # correct reads populates correction and quality, globally defined
        correct_reads(chrom, beg, end)
        stem = 'w-%s-%s-%s' % (chrom, beg, end)
        declog.info('this is window %s' % stem)
        dbg_file = stem + '.dbg'
        proposed[beg] = (get_prop(dbg_file), j)
        declog.info('there were %s proposed' % str(proposed[beg][0]))

    # (re)move intermediate files
    if not keep_files:
        declog.info('removing intermediate files')
        tr_files = glob.glob('./w*reads.fas')
        tr_files.extend(glob.glob('./*.smp'))
        tr_files.extend(glob.glob('./w*.dbg'))
        for trf in tr_files:
            os.remove(trf)

        # result files are kept unless they are empty
        tr_files = glob.glob('./w*reads-cor.fas')
        tr_files.extend(glob.glob('./w*reads-freq.csv'))
        tr_files.extend(glob.glob('./w*reads-support.fas'))
        for trf in tr_files:
            if os.stat(trf).st_size == 0:
                os.remove(trf)
    else:
        _archive('./w*dbg', 'debug')
        _archive('./w*smp', 'sampling')
        _archive('./w*reads-cor.fas', 'corrected')
        _archive('./w*reads-support.fas', 'support')
        _archive('./w*reads-freq.csv', 'freq')
        _archive('./w*reads.fas', 'raw_reads')

    ############################################
    ##      Print the corrected reads         ##
    ##                                        ##
    ## correction[read_id][wstart] = sequence ##
    ## quality[read_id][wstart] = posterior   ##
    ############################################
    declog.info('now correct, %d reads will be analysed' % len(to_correct))
    creads = 0
    for r in to_correct:
        # direct membership test; '.keys()' builds a list on Python 2
        if r not in correction:
            continue
        creads += 1
        if creads % 500 == 0:
            declog.info('considered %d corrected reads' % creads)
        # NOTE(review): looks like a leftover debugging print - kept
        # because it is part of the current stdout output
        print(aligned_reads[r][4])
        rlen = len(aligned_reads[r][4])  # length of original read
        rst = aligned_reads[r][2]  # read start in the reference

        for rpos in range(rlen):
            this = []
            for cst in correction[r]:
                # position of this base inside the window starting at cst
                tp = rpos + rst - int(cst)
                if (0 <= tp < len(correction[r][cst]) and
                        quality[r][cst] > min_quality):
                    this.append(correction[r][cst][tp])
            # 'X' marks a base that no window supports
            cb = base_break(this) if this else 'X'
            to_correct[r][4].append(cb)

    declog.info('considered all corrected reads')
    ccx = {}
    cin_stem = '.'.join(os.path.split(in_bam)[1].split('.')[:-1])
    with open('%s.cor.fas' % cin_stem, 'w') as fch:
        declog.debug('writing to file %s.cor.fas' % cin_stem)
        for r in to_correct:
            cor_read = ''.join(to_correct[r][4])
            stripped = cor_read.lstrip('-X')
            init_x = len(cor_read.lstrip('-')) - len(stripped)
            fin_x = len(cor_read.rstrip('-')) - len(cor_read.rstrip('-X'))
            # number of 'X' that are neither leading nor trailing
            cx = to_correct[r][4].count('X') - init_x - fin_x
            ccx[cx] = ccx.get(cx, 0) + 1
            if cx <= min_x_thresh and stripped != '':
                fch.write('>%s %d\n' % (r, to_correct[r][2] + init_x
                                        - to_correct[r][0]))
                # drop the 'X' and wrap at fasta_length columns, written
                # line by line instead of flushing after every character
                seq = stripped.replace('X', '')
                for start in range(0, len(seq), fasta_length):
                    fch.write(seq[start:start + fasta_length] + '\n')
    print(ccx)

    # write proposed_per_step to file
    with open('proposed.dat', 'w') as ph:
        ph.write('#base\tproposed_per_step\n')
        for kp in sorted(proposed):
            n_proposed, n_steps = proposed[kp]
            # the entry is a (value, steps) tuple, so it is the value that
            # has to be tested; the tuple never equals 'not found'
            if n_proposed != 'not found':
                ph.write('%s\t%f\n' % (kp, float(n_proposed) / n_steps))

    declog.info('running snv.py')
    snv.main(reference=in_fasta, bam_file=in_bam, increment=incr)

    # tidy snvs: results of a previous run are moved aside
    try:
        os.mkdir('snv')
    except OSError:
        os.rename('snv', 'snv_before_%d' % int(time.time()))
        os.mkdir('snv')
    for snv_file in glob.glob('./SNV*'):
        shutil.move(snv_file, 'snv/')
    declog.info('dec.py ends')
def main(in_bam='', in_fasta='', min_overlap=0.95, max_coverage=50000,
         alpha=0.5, s=0.01, region='', diversity=False):
    '''
    Performs the amplicon analysis, running diri_sampler
    and analyzing the result

    Parameters:
        in_bam: alignment file, passed to b2w and snv.main
        in_fasta: reference file; its first record is the reference
        min_overlap: fraction of the region length passed to b2w as -m
        max_coverage: passed to b2w as -x
        alpha: passed to diri_sampler as -a
        s: passed to snv.main as sigma
        region: 'name:start-stop'; when empty, the region with the
            highest entropy is used if diversity is true, otherwise the
            whole reference
        diversity: see region

    Logs to ./amplian.log and reads the number of reads from the first
    line of 'coverage.txt'.
    '''
    import snv

    # set logging level
    amplog.setLevel(logging.DEBUG)
    # This handler writes everything to a file.
    LOG_FILENAME = './amplian.log'
    hl = logging.handlers.RotatingFileHandler(LOG_FILENAME, 'w',
                                              maxBytes=100000, backupCount=5)
    # adjacent literals instead of a backslash inside the string, which
    # leaked stray characters into every log record
    f = logging.Formatter("%(levelname)s %(asctime)s %(funcName)s "
                          "%(lineno)d %(message)s")
    hl.setFormatter(f)
    amplog.addHandler(hl)
    amplog.info(' '.join(sys.argv))

    # info on reference and region if given, or discover high entropy one
    ref_seq = list(SeqIO.parse(in_fasta, 'fasta'))[0]
    ref_name = ref_seq.id
    if region:
        reg_bound = region.split(':')[1].split('-')
        reg_start, reg_stop = int(reg_bound[0]), int(reg_bound[1])
        ref_length = reg_stop - reg_start + 1
    elif region == '' and diversity:
        reg_start, reg_stop = highest_entropy(in_bam, in_fasta)
        ref_length = reg_stop - reg_start + 1
        region = '%s:%d-%d' % (ref_seq.id, reg_start, reg_stop)
    elif region == '' and not diversity:
        reg_start = 1
        ref_length = len(ref_seq)
        reg_stop = ref_length

    amplog.info('analysing region from %d to %d' % (reg_start, reg_stop))

    # output the reads, aligned to the amplicon
    b2w_exe = os.path.join(dn, 'b2w')
    b2w_args = ' -i 0 -w %d -m %d -x %d %s %s %s' % \
        (ref_length, int(min_overlap * ref_length), max_coverage,
         in_bam, in_fasta, region)
    ret_b2w = run_child(b2w_exe, b2w_args)
    amplog.debug('b2w returned %d' % ret_b2w)

    # run diri_sampler on the aligned reads
    win_file = 'w-%s-%d-%d.reads.fas' % (ref_name, reg_start, reg_stop)
    # the last field of the first line of coverage.txt is the read count;
    # the handle is closed as soon as that line has been read
    with open('coverage.txt') as cov_handle:
        h = cov_handle.readline()
    n_reads = int(h.split()[-1])
    assert os.path.exists(win_file), 'window file %s not found' % win_file
    diri_exe = os.path.join(dn, 'diri_sampler')
    iterations = min(200000, n_reads * 20)
    diri_args = '-i %s -j %d -a %f -t 2000' % (win_file, iterations, alpha)
    ret_diri = run_child(diri_exe, diri_args)
    amplog.debug('diri_sampler returned %d' % ret_diri)

    # diagnostics on the convergence
    run_diagnostics(win_file, n_reads)

    # run snv.py to parse single nucleotide variants; the function's own
    # arguments are used, so that main() also works when it is imported
    # and no module-level 'options' object exists
    snv.main(reference=in_fasta, bam_file=in_bam, sigma=s, increment=1)
# Minimal matching, done by mm.py on the corrected .rest file
sholog.debug('running mm.py')
mm.main('%s_cor.rest' % in_stem, maxhaplo=200)

# Step 5: EM frequency estimation with the external freqEst binary
# (output: sample.$nr.popl)
sholog.debug('running freqEst')
my_prog = os.path.join(dn, 'freqEst')
my_arg = " -f %s_cor" % in_stem
assert os.path.isfile('%s_cor.rest' % in_stem), \
    'File %s_cor.rest not found' % in_stem
retcode_em = run_child(my_prog, my_arg)
if retcode_em:
    # any non-zero return code from the child aborts the whole run
    sholog.error('freqEst did not return 0')
    sys.exit('Something went wrong in the EM step')
sholog.debug('freqEst exited successfully')

# Step 6: single nucleotide variants are parsed by snv.py
sholog.debug('running snv.py')
snv.main(
    reference=options.f,
    bam_file=options.b,
    sigma=options.i,
    increment=options.w / options.s,
)

# Collect every SNV* output under snv/; an already existing directory
# is silently reused.
try:
    os.mkdir('snv')
except OSError:
    pass
for snv_file in glob.glob('./SNV*'):
    shutil.move(snv_file, 'snv/')
def runPipeline(args, sampleName, sampleDir):
    """
    Drives one sample through the ProDuSe stages, in order: trim, bwa,
    collapse, bwa, stitcher, name sort, SplitMerge, sort, snv, filter.

    Before each configurable stage, args.config is replaced with the value
    getConfig() returns for that stage. After each stage a timestamped
    status line is written to stderr.

    Args:
        args: A namespace object listing command line parameters to be
              passed to subscripts; its config attribute is overwritten
        sampleName: Name of the sample currently being processed
        sampleDir: Output directory
    """
    printPrefix = "PRODUSE-MAIN\t"

    def report(message):
        # one tab-separated line: prefix, current time, "<sample>: <message>"
        fields = [
            printPrefix,
            time.strftime('%X'),
            sampleName + ": " + message + "\n",
        ]
        sys.stderr.write("\t".join(fields))

    # Trimming
    args.config = getConfig(sampleDir, "trim")
    trim.main(args)
    report("Trimming Complete")

    # Alignment of the trimmed fastqs
    args.config = getConfig(sampleDir, "trim_bwa")
    bwa.main(args)
    report("Alignment Complete")

    # Collapse of the trimmed BAM file
    args.config = getConfig(sampleDir, "collapse")
    collapse.main(args)
    report("Collapse Complete")

    # Alignment of the collapsed output
    args.config = getConfig(sampleDir, "collapse_bwa")
    bwa.main(args)
    report("Alignment Complete")

    # Stitching of the collapsed BAM
    collapsedBamFile = os.path.abspath(
        os.path.join(sampleDir, "tmp", sampleName + ".collapse.bam"))
    stitchedBam = runStitcher(collapsedBamFile, args.stitcherpath)
    report("Stitching Complete")

    # Both inputs are sorted by name ahead of splitmerge
    runSort(stitchedBam, byName=True)
    runSort(collapsedBamFile, byName=True)

    # SplitMerge, followed by a sort of its result
    args.config = getConfig(sampleDir, "splitmerge")
    splitMergeBam = os.path.join(
        sampleDir, "results", sampleName + ".SplitMerge.bam")
    SplitMerge.main(args)
    runSort(splitMergeBam)
    report("SplitMerge Complete")

    # SNV calling
    args.config = getConfig(sampleDir, "snv")
    snv.main(args)
    report("SNV Calling Complete")

    # Variant filtering
    args.config = getConfig(sampleDir, "filter")
    filter_produse.main(args)
    report("Variant Filtering Complete")

    report("ProDuSe analysis Complete")