def find_bowtie2():
    """ Create the wrappers for the bowtie2 and bowtie2-build executables.

    The plain executable paths are tried first. If that raises a
    RuntimeError, both wrappers are created again with a dash and the
    version number appended to each path.

    @return: a tuple of (bowtie2, bowtie2_build)
    """
    try:
        aligner = Bowtie2(BOWTIE_VERSION, BOWTIE_PATH)
        index_builder = Bowtie2Build(BOWTIE_VERSION,
                                     BOWTIE_BUILD_PATH,
                                     logger)
    except RuntimeError:
        # Fall back to executable paths that carry a version suffix.
        versioned_path = BOWTIE_PATH + '-' + BOWTIE_VERSION
        versioned_build_path = BOWTIE_BUILD_PATH + '-' + BOWTIE_VERSION
        aligner = Bowtie2(BOWTIE_VERSION, versioned_path)
        index_builder = Bowtie2Build(BOWTIE_VERSION,
                                     versioned_build_path,
                                     logger)
    return aligner, index_builder
def prelim_map(fastq1,
               fastq2,
               prelim_csv,
               nthreads=BOWTIE_THREADS,
               rdgopen=READ_GAP_OPEN,
               rfgopen=REF_GAP_OPEN,
               stderr=sys.stderr,
               gzip=False,
               work_path='',
               excluded_seeds=None):
    """ Run the preliminary mapping step.

    Writes the seed references to a FASTA file, builds a bowtie2 index from
    it, maps the paired reads against that index, and writes the first
    eleven SAM columns of every output line to prelim_csv.

    @param fastq1: the file name for the forward reads in FASTQ format
    @param fastq2: the file name for the reverse reads in FASTQ format
    @param prelim_csv: an open file object for the output file - all the reads
        mapped to references in CSV version of the SAM format
    @param nthreads: the number of threads to use.
    @param rdgopen: a penalty for opening a gap in the read sequence.
    @param rfgopen: a penalty for opening a gap in the reference sequence.
    @param stderr: where to write the standard error output from bowtie2 calls.
    @param gzip: True if FASTQ files are in gzip format
    @param work_path: optional path to store working files
    @param excluded_seeds: a list of seed names to exclude from mapping
    """
    # Shared lookup of the bowtie2 executables, including the fallback to
    # version-suffixed paths.
    bowtie2, bowtie2_build = find_bowtie2()

    # check that the inputs exist
    fastq1 = check_fastq(fastq1, gzip)
    fastq2 = check_fastq(fastq2, gzip)

    # generate initial reference files
    projects = project_config.ProjectConfig.loadDefault()
    ref_path = os.path.join(work_path, 'micall.fasta')
    # The G2P seed is always excluded, in addition to any requested seeds.
    all_excluded_seeds = {project_config.G2P_SEED_NAME}
    if excluded_seeds:
        all_excluded_seeds.update(excluded_seeds)
    with open(ref_path, 'w') as ref:
        projects.writeSeedFasta(ref, all_excluded_seeds)
    reffile_template = os.path.join(work_path, 'reference')
    bowtie2_build.build(ref_path, reffile_template)

    fieldnames = ['qname',
                  'flag',
                  'rname',
                  'pos',
                  'mapq',
                  'cigar',
                  'rnext',
                  'pnext',
                  'tlen',
                  'seq',
                  'qual']
    writer = csv.writer(prelim_csv, lineterminator=os.linesep)
    writer.writerow(fieldnames)

    # do preliminary mapping, streaming output from bowtie2
    bowtie_args = ['--wrapper', 'micall-0',
                   '--quiet',
                   '-x', reffile_template,
                   '-1', fastq1,
                   '-2', fastq2,
                   '--rdg', "{},{}".format(rdgopen, READ_GAP_EXTEND),
                   '--rfg', "{},{}".format(rfgopen, REF_GAP_EXTEND),
                   '--no-hd',  # no header lines (start with @)
                   '-X', '1200',
                   '-p', str(nthreads)]
    column_count = len(fieldnames)
    for line in bowtie2.yield_output(bowtie_args, stderr=stderr):
        # keep only the named columns, discard optional items
        writer.writerow(line.split('\t')[:column_count])
def remap(fastq1,
          fastq2,
          prelim_csv,
          remap_csv,
          remap_counts_csv,
          remap_conseq_csv,
          unmapped1,
          unmapped2,
          work_path='',
          nthreads=BOWTIE_THREADS,
          callback=None,
          count_threshold=10,
          rdgopen=READ_GAP_OPEN,
          rfgopen=REF_GAP_OPEN,
          stderr=sys.stderr,
          gzip=False):
    """ Iterative re-map reads from raw paired FASTQ files to a reference
    sequence set that is being updated as the consensus of the reads that
    were mapped to the last set.

    @param fastq1: input R1 FASTQ
    @param fastq2: input R2 FASTQ
    @param prelim_csv: input CSV output from prelim_csv()
    @param remap_csv: output CSV, contents of bowtie2 SAM output
    @param remap_counts_csv: output CSV, counts of reads mapped to regions
    @param remap_conseq_csv: output CSV, sample- and region-specific consensus
        sequences generated while remapping reads
    @param unmapped1: output FASTQ containing R1 reads that did not map to any
        region
    @param unmapped2: output FASTQ containing R2 reads that did not map to any
        region
    @param work_path: optional path to store working files
    @param nthreads: optional setting to modify the number of threads used by
        bowtie2
    @param callback: a function to report progress with three optional
        parameters - callback(message, progress, max_progress)
    @param count_threshold: minimum number of reads that map to a region for
        it to be remapped
    @param rdgopen: read gap open penalty
    @param rfgopen: reference gap open penalty
    @param stderr: passed along to map_to_reference() for the bowtie2 calls
    @param gzip: if True, FASTQ names that do not end in '.gz' are given a
        '.gz' symbolic link, and the read count is made in gzip mode
    """
    reffile = os.path.join(work_path, 'temp.fasta')
    samfile = os.path.join(work_path, 'temp.sam')

    # Shared lookup of the bowtie2 executables. It only falls back to the
    # version-suffixed paths on RuntimeError, instead of on any exception.
    bowtie2, bowtie2_build = find_bowtie2()

    # check that the inputs exist
    if not os.path.exists(fastq1):
        logger.error('No FASTQ found at %s', fastq1)
        sys.exit(1)

    if not os.path.exists(fastq2):
        logger.error('No FASTQ found at %s', fastq2)
        sys.exit(1)

    # append .gz extension if necessary
    if gzip:
        if not fastq1.endswith('.gz'):
            try:
                os.symlink(fastq1, fastq1 + '.gz')
            except OSError:
                # symbolic link already exists
                pass
            fastq1 += '.gz'

        if not fastq2.endswith('.gz'):
            try:
                os.symlink(fastq2, fastq2 + '.gz')
            except OSError:
                # symbolic link already exists
                pass
            fastq2 += '.gz'

    worker_pool = multiprocessing.Pool(
        processes=nthreads) if nthreads > 1 else None

    # retrieve reference sequences used for preliminary mapping
    projects = project_config.ProjectConfig.loadDefault()
    seeds = {}
    for seed, vals in projects.config['regions'].iteritems():
        seqs = vals['reference']
        seeds[seed] = ''.join(seqs)
    conseqs = dict(seeds)  # copy

    # record the raw read count
    # 4 lines per record in FASTQ, paired
    raw_count = line_counter.count(fastq1, gzip=gzip) // 2

    remap_counts_writer = csv.DictWriter(
        remap_counts_csv,
        'type count filtered_count seed_dist other_dist other_seed'.split(),
        lineterminator=os.linesep)
    remap_counts_writer.writeheader()
    remap_counts_writer.writerow(dict(type='raw', count=raw_count))

    # convert preliminary CSV to SAM, count reads
    if callback:
        callback(message='... processing preliminary map',
                 progress=0,
                 max_progress=raw_count)

    with open(samfile, 'w') as f:
        # write SAM header
        f.write('@HD\tVN:1.0\tSO:unsorted\n')
        for rname, refseq in conseqs.iteritems():
            f.write('@SQ\tSN:%s\tLN:%d\n' % (rname, len(refseq)))
        f.write('@PG\tID:bowtie2\tPN:bowtie2\tVN:2.2.3\tCL:""\n')

        # iterate through prelim CSV and record counts, transfer rows to SAM
        refgroups = {}  # { group_name: (refname, count) }
        reader = csv.DictReader(prelim_csv)
        row_count = 0
        for refname, group in itertools.groupby(reader, itemgetter('rname')):
            count = 0
            filtered_count = 0
            for row in group:
                if callback and row_count % 1000 == 0:
                    callback(progress=row_count)
                count += 1
                row_count += 1

                # write SAM row
                f.write('\t'.join([row[field] for field in fieldnames]) + '\n')
                if is_unmapped_read(row['flag']):
                    continue
                if is_short_read(row, max_primer_length=50):
                    # exclude short reads
                    continue
                filtered_count += 1
            if callback:
                callback(progress=raw_count)

            # report preliminary counts to file
            remap_counts_writer.writerow(
                dict(type='prelim %s' % refname,
                     count=count,
                     filtered_count=filtered_count))
            if refname == '*':
                continue
            # Keep only the best seed within each seed group. The starting
            # best_count of (threshold - 1) means a seed must reach the
            # threshold before it can be selected.
            refgroup = projects.getSeedGroup(refname)
            seed_count_threshold = (1 if refname == 'HIV1B-env-seed'
                                    else count_threshold)
            _best_ref, best_count = refgroups.get(
                refgroup,
                (None, seed_count_threshold - 1))
            if filtered_count > best_count:
                refgroups[refgroup] = (refname, filtered_count)

    seed_counts = {best_ref: best_count
                   for best_ref, best_count in refgroups.itervalues()}

    # regenerate consensus sequences based on preliminary map
    conseqs = build_conseqs(samfile, seeds=seeds, worker_pool=worker_pool)

    # exclude references with low counts (post filtering)
    new_conseqs = {}
    map_counts = {}
    for rname, conseq in conseqs.iteritems():
        count = seed_counts.get(rname, None)
        if count is not None:
            # transfer filtered counts to map counts for remap loop
            map_counts[rname] = count
            new_conseqs[rname] = conseq
    conseqs = new_conseqs

    # start remapping loop
    n_remaps = 0
    new_counts = Counter()
    unmapped_count = raw_count
    while conseqs:
        if callback:
            callback(message='... remap iteration %d' % n_remaps, progress=0)

        # reset unmapped files with each iteration
        unmapped1.seek(0)
        unmapped1.truncate()
        unmapped2.seek(0)
        unmapped2.truncate()

        unmapped_count = map_to_reference(fastq1,
                                          fastq2,
                                          conseqs,
                                          reffile,
                                          samfile,
                                          unmapped1,
                                          unmapped2,
                                          bowtie2,
                                          bowtie2_build,
                                          raw_count,
                                          rdgopen,
                                          rfgopen,
                                          nthreads,
                                          new_counts,
                                          stderr,
                                          callback)

        old_seed_names = set(conseqs.iterkeys())
        # regenerate consensus sequences
        distance_report = {}
        conseqs = build_conseqs(
            samfile,
            seeds=seeds,
            is_filtered=True,
            worker_pool=worker_pool,
            filter_coverage=count_threshold // 2,  # pairs
            distance_report=distance_report)
        new_seed_names = set(conseqs.iterkeys())
        n_remaps += 1

        write_remap_counts(remap_counts_writer,
                           new_counts,
                           title='remap-{}'.format(n_remaps),
                           distance_report=distance_report)

        # The stopping criteria are only checked once the set of seed names
        # has stopped changing.
        if new_seed_names == old_seed_names:
            # stopping criterion 1 - none of the regions gained reads
            if all((count <= map_counts[refname])
                   for refname, count in new_counts.iteritems()):
                break

            # stopping criterion 2 - a sufficient fraction of raw data has
            # been mapped
            mapping_efficiency = sum(new_counts.values()) / float(raw_count)
            if mapping_efficiency > MIN_MAPPING_EFFICIENCY:
                break

            if n_remaps >= MAX_REMAPS:
                break

        # deep copy of mapping counts
        map_counts = dict(new_counts)

    # finished iterative phase
    if worker_pool is not None:
        worker_pool.close()

    # generate SAM CSV output
    remap_writer = csv.DictWriter(remap_csv,
                                  fieldnames,
                                  lineterminator=os.linesep)
    remap_writer.writeheader()
    if new_counts:
        splitter = MixedReferenceSplitter()
        split_counts = Counter()
        # At least one read was mapped, so samfile has relevant data
        with open(samfile, 'rU') as f:
            for fields in splitter.split(f):
                remap_writer.writerow(dict(zip(fieldnames, fields)))
        for rname, (split_file1, split_file2) in splitter.splits.iteritems():
            refseqs = {rname: conseqs[rname]}
            unmapped_count += map_to_reference(split_file1.name,
                                               split_file2.name,
                                               refseqs,
                                               reffile,
                                               samfile,
                                               unmapped1,
                                               unmapped2,
                                               bowtie2,
                                               bowtie2_build,
                                               raw_count,
                                               rdgopen,
                                               rfgopen,
                                               nthreads,
                                               split_counts,
                                               stderr,
                                               callback)
            new_counts.update(split_counts)
            # samfile is reused by each call above, so its rows are copied
            # out before the next split is mapped.
            with open(samfile, 'rU') as f:
                for fields in splitter.walk(f):
                    remap_writer.writerow(dict(zip(fieldnames, fields)))

    # write consensus sequences and counts
    # record consensus sequences for later use
    remap_conseq_csv.write('region,sequence\n')
    for refname in new_counts.iterkeys():
        # NOTE this is the consensus sequence to which the reads were mapped,
        # NOT the current consensus!
        conseq = conseqs.get(refname) or projects.getReference(refname)
        remap_conseq_csv.write('%s,%s\n' % (refname, conseq))
    write_remap_counts(remap_counts_writer, new_counts, title='remap-final')

    # report number of unmapped reads
    remap_counts_writer.writerow(dict(type='unmapped', count=unmapped_count))
def remap(fastq1,
          fastq2,
          prelim_csv,
          remap_csv,
          remap_counts_csv,
          remap_conseq_csv,
          unmapped1,
          unmapped2,
          work_path='',
          callback=None,
          count_threshold=10,
          rdgopen=READ_GAP_OPEN,
          rfgopen=REF_GAP_OPEN,
          stderr=sys.stderr,
          gzip=False,
          debug_file_prefix=None):
    """ Iterative re-map reads from raw paired FASTQ files to a reference
    sequence set that is being updated as the consensus of the reads that
    were mapped to the last set.

    @param fastq1: input R1 FASTQ
    @param fastq2: input R2 FASTQ
    @param prelim_csv: input CSV output from prelim_csv()
    @param remap_csv: output CSV, contents of bowtie2 SAM output
    @param remap_counts_csv: output CSV, counts of reads mapped to regions
    @param remap_conseq_csv: output CSV, sample- and region-specific consensus
        sequences generated while remapping reads
    @param unmapped1: output FASTQ containing R1 reads that did not map to any
        region
    @param unmapped2: output FASTQ containing R2 reads that did not map to any
        region
    @param work_path: optional path to store working files
    @param callback: a function to report progress with three optional
        parameters - callback(message, progress, max_progress)
    @param count_threshold: minimum number of reads that map to a region for
        it to be remapped
    @param rdgopen: read gap open penalty
    @param rfgopen: reference gap open penalty
    @param stderr: an open file object to receive stderr from the bowtie2 calls
    @param gzip: True if the FASTQ files are gzipped
    @param debug_file_prefix: the prefix for the file path to write debug
        files. If not None, this will be used to write a copy of the reference
        FASTA files and the output SAM files.
    """
    reffile = os.path.join(work_path, 'temp.fasta')
    samfile = os.path.join(work_path, 'temp.sam')

    # Shared lookup of the bowtie2 executables, including the fallback to
    # version-suffixed paths.
    bowtie2, bowtie2_build = find_bowtie2()

    # check that the inputs exist
    fastq1 = check_fastq(fastq1, gzip)
    fastq2 = check_fastq(fastq2, gzip)

    # retrieve reference sequences used for preliminary mapping
    projects = project_config.ProjectConfig.loadDefault()
    seeds = projects.getAllReferences()

    # record the raw read count
    # 4 lines per record in FASTQ, paired
    raw_count = line_counter.count(fastq1, gzip=gzip) // 2

    remap_counts_writer = csv.DictWriter(
        remap_counts_csv,
        'type count filtered_count seed_dist other_dist other_seed'.split(),
        lineterminator=os.linesep)
    remap_counts_writer.writeheader()
    remap_counts_writer.writerow(dict(type='raw', count=raw_count))

    # convert preliminary CSV to SAM, count reads
    with open(samfile, 'w') as f:
        # transfer filtered counts to map counts for remap loop
        map_counts = convert_prelim(prelim_csv,
                                    f,
                                    remap_counts_writer,
                                    count_threshold,
                                    projects)

    # regenerate consensus sequences based on preliminary map
    prelim_conseqs = build_conseqs(samfile, seeds=seeds)

    # exclude references with low counts (post filtering)
    conseqs = {rname: prelim_conseqs[rname]
               for rname in map_counts
               if rname in prelim_conseqs}

    # start remapping loop
    n_remaps = 0
    new_counts = Counter()
    unmapped_count = raw_count
    while conseqs:
        # reset unmapped files with each iteration
        unmapped1.seek(0)
        unmapped1.truncate()
        unmapped2.seek(0)
        unmapped2.truncate()

        if debug_file_prefix is None:
            next_debug_prefix = None
        else:
            next_debug_prefix = '{}_remap{}'.format(debug_file_prefix,
                                                    n_remaps + 1)
        unmapped_count = map_to_reference(fastq1,
                                          fastq2,
                                          conseqs,
                                          reffile,
                                          samfile,
                                          unmapped1,
                                          unmapped2,
                                          bowtie2,
                                          bowtie2_build,
                                          raw_count,
                                          rdgopen,
                                          rfgopen,
                                          new_counts,
                                          stderr,
                                          callback,
                                          debug_file_prefix=next_debug_prefix)

        old_seed_names = set(conseqs)
        # regenerate consensus sequences
        distance_report = {}
        conseqs = build_conseqs(samfile,
                                seeds=conseqs,
                                is_filtered=True,
                                filter_coverage=count_threshold // 2,  # pairs
                                distance_report=distance_report,
                                original_seeds=seeds)
        new_seed_names = set(conseqs)
        n_remaps += 1

        write_remap_counts(remap_counts_writer,
                           new_counts,
                           title='remap-{}'.format(n_remaps),
                           distance_report=distance_report)

        # The stopping criteria are only checked once the set of seed names
        # has stopped changing.
        if new_seed_names == old_seed_names:
            # stopping criterion 1 - none of the regions gained reads
            if all((count <= map_counts[refname])
                   for refname, count in new_counts.items()):
                break

            # stopping criterion 2 - a sufficient fraction of raw data has
            # been mapped
            mapping_efficiency = sum(new_counts.values()) / float(raw_count)
            if mapping_efficiency > MIN_MAPPING_EFFICIENCY:
                break

            if n_remaps >= MAX_REMAPS:
                break

        # deep copy of mapping counts
        map_counts = dict(new_counts)

    # finished iterative phase

    # generate SAM CSV output
    remap_writer = csv.DictWriter(remap_csv,
                                  SAM_FIELDS,
                                  lineterminator=os.linesep)
    remap_writer.writeheader()
    if new_counts:
        splitter = MixedReferenceSplitter(work_path)
        split_counts = Counter()
        # At least one read was mapped, so samfile has relevant data
        with open(samfile) as f:
            for fields in splitter.split(f):
                remap_writer.writerow(dict(zip(SAM_FIELDS, fields)))
        for rname, (split_file1, split_file2) in splitter.splits.items():
            refseqs = {rname: conseqs[rname]}
            unmapped_count += map_to_reference(split_file1.name,
                                               split_file2.name,
                                               refseqs,
                                               reffile,
                                               samfile,
                                               unmapped1,
                                               unmapped2,
                                               bowtie2,
                                               bowtie2_build,
                                               raw_count,
                                               rdgopen,
                                               rfgopen,
                                               split_counts,
                                               stderr,
                                               callback)
            new_counts.update(split_counts)
            # samfile is reused by each call above, so its rows are copied
            # out before the next split is mapped. Default text mode is used
            # because the 'U' flag is rejected by Python 3.11 and later.
            with open(samfile) as f:
                for fields in splitter.walk(f):
                    remap_writer.writerow(dict(zip(SAM_FIELDS, fields)))

    # write consensus sequences and counts
    # record consensus sequences for later use
    remap_conseq_csv.write('region,sequence\n')
    for refname in new_counts:
        # NOTE this is the consensus sequence to which the reads were mapped,
        # NOT the current consensus!
        conseq = conseqs.get(refname) or projects.getReference(refname)
        remap_conseq_csv.write('%s,%s\n' % (refname, conseq))
    write_remap_counts(remap_counts_writer, new_counts, title='remap-final')

    # report number of unmapped reads
    remap_counts_writer.writerow(dict(type='unmapped', count=unmapped_count))
def prelim_map(fastq1,
               fastq2,
               prelim_csv,
               nthreads=BOWTIE_THREADS,
               callback=None,
               rdgopen=READ_GAP_OPEN,
               rfgopen=REF_GAP_OPEN,
               stderr=sys.stderr,
               gzip=False,
               work_path=''):
    """ Run the preliminary mapping step.

    Writes the seed references to a FASTA file, builds a bowtie2 index from
    it, maps the paired reads against that index, and writes the first
    eleven SAM columns of every output line to prelim_csv, grouped by
    reference name.

    @param fastq1: the file name for the forward reads in FASTQ format
    @param fastq2: the file name for the reverse reads in FASTQ format
    @param prelim_csv: an open file object for the output file - all the reads
        mapped to references in CSV version of the SAM format
    @param nthreads: the number of threads to use.
    @param callback: a function to report progress with three optional
        parameters - callback(message, progress, max_progress)
    @param rdgopen: a penalty for opening a gap in the read sequence.
    @param rfgopen: a penalty for opening a gap in the reference sequence.
    @param stderr: where to write the standard error output from bowtie2 calls.
    @param gzip: if True, FASTQ names that do not end in '.gz' are given a
        '.gz' symbolic link, and the read count is made in gzip mode
    @param work_path: optional path to store working files
    """
    # Shared lookup of the bowtie2 executables. It only falls back to the
    # version-suffixed paths on RuntimeError, instead of on any exception.
    bowtie2, bowtie2_build = find_bowtie2()

    # check that the inputs exist
    if not os.path.exists(fastq1):
        logger.error('No FASTQ found at %s', fastq1)
        sys.exit(1)

    if not os.path.exists(fastq2):
        logger.error('No FASTQ found at %s', fastq2)
        sys.exit(1)

    # append .gz extension if necessary
    if gzip:
        if not fastq1.endswith('.gz'):
            try:
                os.symlink(fastq1, fastq1 + '.gz')
            except OSError:
                # symbolic link already exists
                pass
            fastq1 += '.gz'

        if not fastq2.endswith('.gz'):
            try:
                os.symlink(fastq2, fastq2 + '.gz')
            except OSError:
                # symbolic link already exists
                pass
            fastq2 += '.gz'

    if callback:
        # four lines per read, two files
        total_reads = line_counter.count(fastq1, gzip=gzip) // 2
        callback(message='... preliminary mapping',
                 progress=0,
                 max_progress=total_reads)

    # generate initial reference files
    projects = project_config.ProjectConfig.loadDefault()
    ref_path = os.path.join(work_path, 'micall.fasta')
    with open(ref_path, 'w') as ref:
        projects.writeSeedFasta(ref)
    reffile_template = os.path.join(work_path, 'reference')
    bowtie2_build.build(ref_path, reffile_template)

    # do preliminary mapping
    output = {}  # {refname: [fields]}

    # stream output from bowtie2
    bowtie_args = ['--wrapper', 'micall-0',
                   '--quiet',
                   '-x', reffile_template,
                   '-1', fastq1,
                   '-2', fastq2,
                   '--rdg', "{},{}".format(rdgopen, READ_GAP_EXTEND),
                   '--rfg', "{},{}".format(rfgopen, REF_GAP_EXTEND),
                   '--no-hd',  # no header lines (start with @)
                   '-X', '1200',
                   '-p', str(nthreads)]

    for i, line in enumerate(bowtie2.yield_output(bowtie_args, stderr=stderr)):
        if callback and i % 1000 == 0:
            callback(progress=i)
        fields = line.split('\t')
        refname = fields[2]  # read was mapped to this reference
        # discard optional items
        output.setdefault(refname, []).append(fields[:11])

    fieldnames = ['qname',
                  'flag',
                  'rname',
                  'pos',
                  'mapq',
                  'cigar',
                  'rnext',
                  'pnext',
                  'tlen',
                  'seq',
                  'qual']
    writer = csv.DictWriter(prelim_csv, fieldnames, lineterminator=os.linesep)
    writer.writeheader()

    # lines grouped by refname
    for refname, lines in output.iteritems():
        for line in lines:
            writer.writerow(dict(zip(fieldnames, line)))

    if callback:
        # Track progress for second half
        callback(progress=total_reads)