def parseCas(infile, order_of_blobs):
    """Parse per-contig coverage from a CLC CAS mapping file.

    Runs ``clc_mapping_info -n`` on *infile* and, for every output line that
    matches the contig-table pattern, records the read count (third captured
    column) and the coverage (sixth captured column).

    Args:
        infile: Path of the CAS file. ``BtLog.error('0', infile)`` is called
            when it is not an existing file.
        order_of_blobs: Sequence of contig names; the 1-based contig index
            printed by the tool is mapped to ``order_of_blobs[index - 1]``.

    Returns:
        Tuple ``(cov_dict, reads_total, reads_mapped, read_cov_dict)`` where
        ``cov_dict`` maps contig name to coverage (float) and
        ``read_cov_dict`` maps contig name to read count (int).
        ``reads_total`` and ``reads_mapped`` are passed through from
        ``checkCas``.
    """
    if not isfile(infile):
        BtLog.error('0', infile)
    seqs_total, reads_total, reads_mapped = checkCas(infile)
    progress_unit = int(len(order_of_blobs) / 100)
    cas_line_re = re.compile(
        r"\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+.\d{2})\s+(\d+)\s+(\d+.\d{2})")
    command = "clc_mapping_info -n " + infile
    cov_dict = {}
    read_cov_dict = {}
    seqs_parsed = 0
    # Launch the external tool exactly once and reuse its output; calling
    # runCmd for the truthiness check and again for the loop ran it twice.
    output = runCmd(command=command)
    if output:
        for line in output:
            cas_line_match = cas_line_re.search(line)
            if not cas_line_match:
                continue
            # -1 because index of contig list starts with zero
            idx = int(cas_line_match.group(1)) - 1
            try:
                name = order_of_blobs[idx]
                reads = int(cas_line_match.group(3))
                cov = float(cas_line_match.group(6))
            except (IndexError, ValueError):
                # Best effort: skip rows that point past the known contigs
                # or whose numbers cannot be converted.
                pass
            else:
                cov_dict[name] = cov
                read_cov_dict[name] = reads
                seqs_parsed += 1
            BtLog.progress(seqs_parsed, progress_unit, seqs_total)
    return cov_dict, reads_total, reads_mapped, read_cov_dict
def parseCas(infile, order_of_blobs):
    """Parse per-contig coverage from a CLC CAS mapping file.

    Runs ``clc_mapping_info -n`` on *infile* and, for every output line that
    matches the contig-table pattern, records the read count (third captured
    column) and the coverage (sixth captured column).

    Args:
        infile: Path of the CAS file. ``BtLog.error('0', infile)`` is called
            when it is not an existing file.
        order_of_blobs: Sequence of contig names; the 1-based contig index
            printed by the tool is mapped to ``order_of_blobs[index - 1]``.

    Returns:
        Tuple ``(cov_dict, reads_total, reads_mapped, read_cov_dict)`` where
        ``cov_dict`` maps contig name to coverage (float) and
        ``read_cov_dict`` maps contig name to read count (int).
        ``reads_total`` and ``reads_mapped`` are passed through from
        ``checkCas``.
    """
    if not isfile(infile):
        BtLog.error('0', infile)
    seqs_total, reads_total, reads_mapped = checkCas(infile)
    progress_unit = int(len(order_of_blobs) / 100)
    cas_line_re = re.compile(r"\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+.\d{2})\s+(\d+)\s+(\d+.\d{2})")
    command = "clc_mapping_info -n " + infile
    cov_dict = {}
    read_cov_dict = {}
    seqs_parsed = 0
    # Launch the external tool exactly once and reuse its output; calling
    # runCmd for the truthiness check and again for the loop ran it twice.
    tool_output = runCmd(command=command)
    if tool_output:
        for line in tool_output:
            cas_line_match = cas_line_re.search(line)
            if not cas_line_match:
                continue
            # -1 because index of contig list starts with zero
            idx = int(cas_line_match.group(1)) - 1
            try:
                name = order_of_blobs[idx]
                reads = int(cas_line_match.group(3))
                cov = float(cas_line_match.group(6))
            except (IndexError, ValueError):
                # Best effort: skip rows that point past the known contigs
                # or whose numbers cannot be converted.
                pass
            else:
                cov_dict[name] = cov
                read_cov_dict[name] = reads
                seqs_parsed += 1
            BtLog.progress(seqs_parsed, progress_unit, seqs_total)
    return cov_dict, reads_total, reads_mapped, read_cov_dict
def writeNodesDB(nodesDB, nodesDB_f):
    """Dump the nodes dictionary to a tab-separated text file.

    The first line written is ``# nodes_count = <N>``, taken from
    ``nodesDB['nodes_count']``. Every other key of *nodesDB* becomes one
    line holding the key and its ``rank``, ``name`` and ``parent`` values.

    Args:
        nodesDB: Mapping of node id to a dict with ``rank``, ``name`` and
            ``parent`` entries, plus the special ``nodes_count`` key.
        nodesDB_f: Path of the file to (over)write.
    """
    print(BtLog.status_d['5'] % nodesDB_f)
    total = nodesDB['nodes_count']
    written = 0
    with open(nodesDB_f, 'w') as out_fh:
        out_fh.write("# nodes_count = %s\n" % total)
        for node_id in nodesDB:
            # The counter entry is bookkeeping, not a taxonomy node.
            if node_id == "nodes_count":
                continue
            written += 1
            BtLog.progress(written, 1000, total)
            record = nodesDB[node_id]
            out_fh.write("%s\t%s\t%s\t%s\n" % (
                node_id, record['rank'], record['name'], record['parent']))
def parseBam(infile, set_of_blobs, no_base_cov_flag):
    """Count mapped reads (and optionally matched bases) per contig in a BAM.

    ``checkBam`` supplies ``reads_total`` and ``reads_mapped``. Alignments
    are streamed from ``samtools view`` with flags 1024, 4 and 256 filtered
    out (mapped reads only, no optical duplicates, no secondary alignments).

    Args:
        infile: Path of the BAM file. ``BtLog.error('0', infile)`` is called
            when it is not an existing file.
        set_of_blobs: Contig names to collect coverage for.
        no_base_cov_flag: When true only reads are counted and every base
            coverage stays 0.

    Returns:
        Tuple ``(base_cov_dict, reads_total, reads_mapped, read_cov_dict)``.
        ``base_cov_dict`` maps each contig to the sum of the lengths of its
        M/X/= CIGAR operations; ``read_cov_dict`` maps each contig to its
        number of reads.
    """
    if not isfile(infile):
        BtLog.error('0', infile)
    reads_total, reads_mapped = checkBam(infile)
    progress_unit = int(reads_mapped / 1000)
    # Per-contig lists of matched lengths, summed once at the end.
    base_cov_dict = {blob: [] for blob in set_of_blobs}
    read_cov_dict = {blob: 0 for blob in set_of_blobs}
    # Digits before M, X or =. The previous pattern "(\d+)M|X|=" matched a
    # bare X or = with an empty capture, which made int('') fail.
    cigar_match_re = re.compile(r"(\d+)[MX=]")
    # execute samtools to get only mapped reads
    # (no optical duplicates, no secondary alignments)
    command = "samtools view -F 1024 -F 4 -F 256 " + infile
    with_base_cov = not no_base_cov_flag
    seen_reads = 0
    for line in runCmd(command=command):
        seen_reads += 1
        fields = line.split()
        try:
            contig = fields[2]
            if with_base_cov:
                matched = sum(int(length) for length in cigar_match_re.findall(fields[5]))
                base_cov_dict[contig].append(matched)
            read_cov_dict[contig] += 1
        except (KeyError, IndexError):
            # KeyError: contig is not one of the requested blobs.
            # IndexError: the line has too few columns to be an alignment.
            culprit = fields[2] if len(fields) > 2 else line.strip()
            print(BtLog.warn_d['2'] % (culprit))
        BtLog.progress(seen_reads, progress_unit, reads_mapped)
    if not int(reads_mapped) == int(seen_reads):
        print(BtLog.warn_d['3'] % (reads_mapped, seen_reads))
    base_cov_dict = {
        seq_name: sum(base_covs)
        for seq_name, base_covs in base_cov_dict.items()
    }
    return base_cov_dict, reads_total, reads_mapped, read_cov_dict
def parseCov(infile, set_of_blobs):
    """Read per-contig coverage from a COV file.

    Two layouts are handled. Until a line starting with ``#`` is seen, lines
    are parsed as ``name<TAB>base_cov``. From the first ``#`` line onwards,
    ``## Total Reads``, ``## Mapped Reads`` and ``## Unmapped Reads`` header
    lines are read, and data lines are parsed as
    ``name<TAB>read_cov<TAB>base_cov``.

    Args:
        infile: Path of the COV file. ``BtLog.error('0', infile)`` is called
            when it is not an existing file.
        set_of_blobs: Known contig names; other names only trigger a warning.

    Returns:
        Tuple ``(base_cov_dict, reads_total, reads_mapped, reads_unmapped,
        read_cov_dict)``.
    """
    if not isfile(infile):
        BtLog.error('0', infile)
    three_col_re = re.compile(r"^(\S+)\t(\d+\.*\d*)\t(\d+\.*\d*)")
    two_col_re = re.compile(r"^(\S+)\t(\d+\.*\d*)")
    base_cov_dict, read_cov_dict = {}, {}
    reads_total = reads_mapped = reads_unmapped = 0
    seqs_parsed = 0
    progress_unit = 1
    header_seen = False
    with open(infile) as fh:
        for line in fh:
            if line.startswith("#"):
                # Any comment line switches parsing to the three-column layout.
                header_seen = True
                if line.startswith("## Total Reads"):
                    reads_total = int(line.split(" = ")[1])
                elif line.startswith("## Mapped Reads"):
                    reads_mapped = int(line.split(" = ")[1])
                elif line.startswith("## Unmapped Reads"):
                    reads_unmapped = int(line.split(" = ")[1])
            elif header_seen:
                hit = three_col_re.search(line)
                if hit:
                    seqs_parsed += 1
                    name = hit.group(1)
                    read_cov = int(hit.group(2))
                    base_cov = float(hit.group(3))
                    if name in set_of_blobs:
                        read_cov_dict[name] = read_cov
                        base_cov_dict[name] = base_cov
                    else:
                        # NOTE(review): two arguments here, one in the
                        # two-column branch below - confirm against the
                        # warn_d['2'] format string.
                        print(BtLog.warn_d['2'] % (name, infile))
            else:
                hit = two_col_re.search(line)
                if hit:
                    seqs_parsed += 1
                    name = hit.group(1)
                    base_cov = float(hit.group(2))
                    if name in set_of_blobs:
                        base_cov_dict[name] = base_cov
                    else:
                        print(BtLog.warn_d['2'] % (name))
            BtLog.progress(seqs_parsed, progress_unit, len(set_of_blobs))
    return base_cov_dict, reads_total, reads_mapped, reads_unmapped, read_cov_dict
def parseCov(infile, set_of_blobs):
    """Read per-contig coverage from a COV file.

    Two layouts are handled. Until a line starting with ``#`` is seen, lines
    are parsed as ``name<TAB>base_cov``. From the first ``#`` line onwards,
    ``## Total Reads``, ``## Mapped Reads`` and ``## Unmapped Reads`` header
    lines are read, and data lines are parsed as
    ``name<TAB>read_cov<TAB>base_cov``.

    Args:
        infile: Path of the COV file. ``BtLog.error('0', infile)`` is called
            when it is not an existing file.
        set_of_blobs: Known contig names; other names only trigger a warning.

    Returns:
        Tuple ``(base_cov_dict, reads_total, reads_mapped, reads_unmapped,
        read_cov_dict)``.
    """
    if not isfile(infile):
        BtLog.error('0', infile)
    three_col_re = re.compile(r"^(\S+)\t(\d+\.*\d*)\t(\d+\.*\d*)")
    two_col_re = re.compile(r"^(\S+)\t(\d+\.*\d*)")
    base_cov_dict, read_cov_dict = {}, {}
    reads_total = reads_mapped = reads_unmapped = 0
    seqs_parsed = 0
    progress_unit = 1
    header_seen = False
    with open(infile) as fh:
        for line in fh:
            if line.startswith("#"):
                # Any comment line switches parsing to the three-column layout.
                header_seen = True
                if line.startswith("## Total Reads"):
                    reads_total = int(line.split(" = ")[1])
                elif line.startswith("## Mapped Reads"):
                    reads_mapped = int(line.split(" = ")[1])
                elif line.startswith("## Unmapped Reads"):
                    reads_unmapped = int(line.split(" = ")[1])
            elif header_seen:
                hit = three_col_re.search(line)
                if hit:
                    seqs_parsed += 1
                    name = hit.group(1)
                    read_cov = int(hit.group(2))
                    base_cov = float(hit.group(3))
                    if name in set_of_blobs:
                        read_cov_dict[name] = read_cov
                        base_cov_dict[name] = base_cov
                    else:
                        # NOTE(review): two arguments here, one in the
                        # two-column branch below - confirm against the
                        # warn_d['2'] format string.
                        print(BtLog.warn_d['2'] % (name, infile))
            else:
                hit = two_col_re.search(line)
                if hit:
                    seqs_parsed += 1
                    name = hit.group(1)
                    base_cov = float(hit.group(2))
                    if name in set_of_blobs:
                        base_cov_dict[name] = base_cov
                    else:
                        print(BtLog.warn_d['2'] % (name))
            BtLog.progress(seqs_parsed, progress_unit, len(set_of_blobs))
    return base_cov_dict, reads_total, reads_mapped, reads_unmapped, read_cov_dict
def computeTaxonomy(self, taxrules, nodesDB, min_bitscore_diff, tax_collision_random):
    """Assign a taxonomy to every blob for each requested taxrule.

    Lineages are built from ``self.set_of_taxIds`` and *nodesDB* and stored
    on ``self.lineages``. Blobs with hits get ``BtTax.taxRule(...)``, blobs
    without hits get ``BtTax.noHit()``. ``self.set_of_taxIds`` is reset to
    an empty set afterwards.

    Args:
        taxrules: Iterable of taxrule names; also stored on ``self.taxrules``.
        nodesDB: Nodes database handed to the ``BtTax`` lineage helpers.
        min_bitscore_diff: Forwarded unchanged to ``BtTax.taxRule``.
        tax_collision_random: Forwarded unchanged to ``BtTax.taxRule``.
    """
    print(BtLog.status_d['6'] % ",".join(taxrules))
    tree_lists = BtTax.getTreeList(self.set_of_taxIds, nodesDB)
    self.lineages = BtTax.getLineages(tree_lists, nodesDB)
    self.taxrules = taxrules
    for processed, blob in enumerate(self.dict_of_blobs.values(), 1):
        BtLog.progress(processed, 100, self.seqs)
        for rule in taxrules:
            if blob.hits:
                assignment = BtTax.taxRule(rule, blob.hits, self.lineages,
                                           min_bitscore_diff, tax_collision_random)
            else:
                assignment = BtTax.noHit()
            blob.taxonomy[rule] = assignment
    self.set_of_taxIds = set()
def readNodesDB(nodesDB_f):
    """Load a nodes database from a tab-separated text file.

    Each data line must hold four tab-separated fields: node id, rank, name
    and parent. An optional ``# nodes_count = <N>`` header gives the expected
    number of nodes, which is only used for progress reporting. Other
    comment lines and empty lines are ignored.

    Args:
        nodesDB_f: Path of the nodes file.

    Returns:
        Dict mapping node id to ``{'rank', 'name', 'parent'}``, plus a
        ``'nodes_count'`` entry holding the number of nodes actually read.

    Raises:
        ValueError: If a data line does not have exactly four fields or the
            header count is not an integer.
    """
    nodesDB = {}
    nodesDB_count = 0
    nodes_count = 0
    with open(nodesDB_f) as fh:
        for raw_line in fh:
            line = raw_line.rstrip("\n")
            if line.startswith("#"):
                # str.lstrip() removes a character set, not a prefix, so the
                # header is split on "=" instead of being "left-stripped".
                key, sep, value = line.lstrip("#").partition("=")
                if sep and key.strip() == "nodes_count":
                    nodesDB_count = int(value.strip())
                continue
            if not line:
                continue
            nodes_count += 1
            node, rank, name, parent = line.split("\t")
            nodesDB[node] = {'rank': rank, 'name': name, 'parent': parent}
            if nodesDB_count:
                BtLog.progress(nodes_count, 1000, nodesDB_count)
    nodesDB['nodes_count'] = nodes_count
    return nodesDB
def computeTaxonomy(self, taxrules, nodesDB, min_bitscore_diff, tax_collision_random):
    """Assign a taxonomy to every blob for each requested taxrule.

    Lineages are built from ``self.set_of_taxIds`` and *nodesDB* and stored
    on ``self.lineages``. Blobs with hits get ``BtTax.taxRule(...)``, blobs
    without hits get ``BtTax.noHit()``. ``self.set_of_taxIds`` is reset to
    an empty set afterwards.

    Args:
        taxrules: Iterable of taxrule names; also stored on ``self.taxrules``.
        nodesDB: Nodes database handed to the ``BtTax`` lineage helpers.
        min_bitscore_diff: Forwarded unchanged to ``BtTax.taxRule``.
        tax_collision_random: Forwarded unchanged to ``BtTax.taxRule``.
    """
    print(BtLog.status_d["6"] % ",".join(taxrules))
    tree_lists = BtTax.getTreeList(self.set_of_taxIds, nodesDB)
    self.lineages = BtTax.getLineages(tree_lists, nodesDB)
    self.taxrules = taxrules
    for processed, blob in enumerate(self.dict_of_blobs.values(), 1):
        BtLog.progress(processed, 100, self.seqs)
        for rule in taxrules:
            if blob.hits:
                assignment = BtTax.taxRule(rule, blob.hits, self.lineages,
                                           min_bitscore_diff, tax_collision_random)
            else:
                assignment = BtTax.noHit()
            blob.taxonomy[rule] = assignment
    self.set_of_taxIds = set()
def parseBam(infile, set_of_blobs, no_base_cov_flag):
    """Count mapped reads (and optionally matched bases) per contig in a BAM.

    ``checkBam`` supplies ``reads_total`` and ``reads_mapped``. Alignments
    are streamed from ``samtools view`` with flags 1024, 4 and 256 filtered
    out (mapped reads only, no optical duplicates, no secondary alignments).

    Args:
        infile: Path of the BAM file. ``BtLog.error('0', infile)`` is called
            when it is not an existing file.
        set_of_blobs: Contig names to collect coverage for.
        no_base_cov_flag: When true only reads are counted and every base
            coverage stays 0.

    Returns:
        Tuple ``(base_cov_dict, reads_total, reads_mapped, read_cov_dict)``.
        ``base_cov_dict`` maps each contig to the sum of the lengths of its
        M/X/= CIGAR operations; ``read_cov_dict`` maps each contig to its
        number of reads.
    """
    if not isfile(infile):
        BtLog.error('0', infile)
    reads_total, reads_mapped = checkBam(infile)
    progress_unit = int(reads_mapped / 1000)
    # Per-contig lists of matched lengths, summed once at the end.
    base_cov_dict = {blob: [] for blob in set_of_blobs}
    read_cov_dict = {blob: 0 for blob in set_of_blobs}
    # Digits before M, X or =. The previous pattern "(\d+)M|X|=" matched a
    # bare X or = with an empty capture, which made int('') fail.
    cigar_match_re = re.compile(r"(\d+)[MX=]")
    # execute samtools to get only mapped reads
    # (no optical duplicates, no secondary alignments)
    command = "samtools view -F 1024 -F 4 -F 256 " + infile
    collect_bases = not no_base_cov_flag
    seen_reads = 0
    for line in runCmd(command=command):
        seen_reads += 1
        fields = line.split()
        try:
            contig = fields[2]
            if collect_bases:
                matched = sum(int(length) for length in cigar_match_re.findall(fields[5]))
                base_cov_dict[contig].append(matched)
            read_cov_dict[contig] += 1
        except (KeyError, IndexError):
            # KeyError: contig is not one of the requested blobs.
            # IndexError: the line has too few columns to be an alignment.
            culprit = fields[2] if len(fields) > 2 else line.strip()
            print(BtLog.warn_d['2'] % (culprit))
        BtLog.progress(seen_reads, progress_unit, reads_mapped)
    if not int(reads_mapped) == int(seen_reads):
        print(BtLog.warn_d['3'] % (reads_mapped, seen_reads))
    base_cov_dict = {seq_name: sum(base_covs) for seq_name, base_covs in base_cov_dict.items()}
    return base_cov_dict, reads_total, reads_mapped, read_cov_dict
def main():
    """Filter a FASTA file by a list of sequence headers.

    Command-line options are read with ``docopt``: ``--infile`` (FASTA),
    ``--list`` (file of headers), ``--invert`` (keep sequences NOT in the
    list) and ``--out`` (output prefix). Selected records are written to the
    file returned by ``BtIO.getOutFile(..., "filtered.fna")`` and a summary
    line is printed. Headers selected more than once are reported with
    ``BtLog.warn_d['8']``.
    """
    args = docopt(__doc__)
    fasta_f = args['--infile']
    list_f = args['--list']
    invert = args['--invert']
    prefix = args['--out']
    output = []
    out_f = BtIO.getOutFile(fasta_f, prefix, "filtered.fna")
    print(BtLog.status_d['1'] % ("list", list_f))
    items = BtIO.parseSet(list_f)
    items_count = len(items)
    print(BtLog.status_d['22'] % fasta_f)
    items_parsed = []
    sequences = 0
    for header, sequence in BtIO.readFasta(fasta_f):
        sequences += 1
        # Keep listed headers normally, unlisted headers when inverting.
        if (header in items) != bool(invert):
            items_parsed.append(header)
            output.append(">%s\n%s\n" % (header, sequence))
        BtLog.progress(len(output), 10, items_count, no_limit=True)
    BtLog.progress(items_count, 10, items_count)
    items_parsed_count = len(items_parsed)
    # Float division so the ratio is not truncated under Python 2, and a
    # guard so an empty FASTA file does not raise ZeroDivisionError.
    fraction = float(items_parsed_count) / sequences if sequences else 0.0
    print(BtLog.status_d['23'] % ('{:.2%}'.format(fraction),
                                  "{:,}".format(items_count),
                                  "{:,}".format(items_parsed_count),
                                  "{:,}".format(sequences)))
    # Single pass to find repeated headers (list.count per item is quadratic).
    seen = set()
    duplicates = set()
    for header in items_parsed:
        if header in seen:
            duplicates.add(header)
        else:
            seen.add(header)
    if duplicates:
        print(BtLog.warn_d['8'] % "\n\t\t\t".join(sorted(duplicates)))
    with open(out_f, "w") as fh:
        print(BtLog.status_d['24'] % out_f)
        fh.write("".join(output))
def view(self, **kwargs):
    """Fill the given view objects from the blobs and emit them.

    Keyword Args:
        viewObjs: View objects; each is dispatched on its ``name``
            (``table``, ``concoct_cov``, ``covlib``, ``experimental``,
            ``concoct_tax``) and finally has ``output()`` called.
        ranks: Taxonomic ranks to report.
        taxrule: Taxrule used for the table and concoct_tax views.
        hits_flag: Forwarded to the table header/line builders.
        seqs: Sequence names to include; falsy means ``self.order_of_blobs``.
        cov_libs: Coverage library names; falsy means all of
            ``self.covLibs``.
        progressbar: When true, progress is reported through ``BtLog``.
    """
    # arguments
    viewObjs = kwargs['viewObjs']
    ranks = kwargs['ranks']
    taxrule = kwargs['taxrule']
    hits_flag = kwargs['hits_flag']
    seqs = kwargs['seqs']
    cov_libs = kwargs['cov_libs']
    progress_bar = kwargs['progressbar']
    # Default sequences if no subset
    if not seqs:
        seqs = self.order_of_blobs
    # Default cov_libs if no subset
    cov_lib_names = cov_libs if cov_libs else list(self.covLibs)
    tax_lib_names = sorted(self.hitLibs)
    lineages = self.lineages
    # setup
    for viewObj in viewObjs:
        if viewObj.name == 'table':
            viewObj.header = self.getTableHeader(taxrule, ranks, hits_flag, cov_lib_names)
        if viewObj.name == 'concoct_cov':
            viewObj.header = self.getConcoctCovHeader(cov_lib_names)
        if viewObj.name == 'covlib':
            viewObj.header = self.getCovHeader(cov_lib_names)
        if viewObj.name == 'experimental':
            viewObj.covs = {cov_lib: [] for cov_lib in cov_lib_names}
            viewObj.covs["covsum"] = []
            # Distinct loop name: reusing "taxrule" here overwrote the
            # taxrule requested by the caller for every later view.
            for known_taxrule in self.taxrules:
                viewObj.tax[known_taxrule] = {rank: [] for rank in BtTax.RANKS}
    # bodies
    seq_count = len(seqs)
    for i, seq in enumerate(seqs):
        if progress_bar:
            BtLog.progress(i, 1000, seq_count)
        blob = self.dict_of_blobs[seq]
        for viewObj in viewObjs:
            if viewObj.name == 'table':
                viewObj.body.append(self.getTableLine(
                    blob, taxrule, ranks, hits_flag, cov_lib_names, tax_lib_names, lineages))
            if viewObj.name == 'concoct_cov':
                viewObj.body.append(self.getConcoctCovLine(blob, cov_lib_names))
            if viewObj.name == 'experimental':
                viewObj.names.append(blob['name'])
                viewObj.gc.append(blob['gc'])
                viewObj.length.append(blob['length'])
                cov_sum = 0.0
                for cov_lib in blob['covs']:
                    viewObj.covs[cov_lib].append(blob['covs'][cov_lib])
                    cov_sum += blob['covs'][cov_lib]
                viewObj.covs['covsum'].append(cov_sum)
                # Private loop names keep the requested taxrule intact.
                for blob_taxrule in blob['taxonomy']:
                    for blob_rank in blob['taxonomy'][blob_taxrule]:
                        viewObj.tax[blob_taxrule][blob_rank].append(
                            blob['taxonomy'][blob_taxrule][blob_rank]['tax'])
            if viewObj.name == 'concoct_tax':
                for rank in ranks:
                    if rank not in viewObj.body:
                        viewObj.body[rank] = []
                    viewObj.body[rank].append(self.getConcoctTaxLine(blob, rank, taxrule))
            if viewObj.name == 'covlib':
                viewObj.body.append(self.getCovLine(blob, cov_lib_names))
    if progress_bar:
        BtLog.progress(seq_count, 1000, seq_count)
    for viewObj in viewObjs:
        viewObj.output()
def parseBamForFilter(infile, outfile, include, exclude, gzip, do_sort, keep_sorted, sort_threads):
    """Split the read pairs of a BAM file by the contigs they map to.

    checkBam returns reads_total and reads_mapped. The BAM is streamed with
    ``samtools view -f 1 -F 1024 -F 256 -F 2048`` and consecutive lines are
    treated as the two mates of a pair. Each mate is classified as ``In``,
    ``Ex`` or ``Un`` (reference name ``*``) and the pair type is the sorted
    concatenation of both classes.

    Args:
        infile: Path of the BAM file. ``BtLog.error('0', infile)`` is called
            when it is not an existing file.
        outfile: Output prefix handed to ``init_read_pairs`` / ``getOutFile``.
        include: Contig names classified as ``In`` (everything else ``Ex``).
        exclude: Contig names classified as ``Ex`` (everything else ``In``);
            only used when *include* is falsy.
        gzip: When true, every written file is compressed with ``gzip -f``.
        do_sort: When true, the BAM is first sorted by read name into
            ``<infile>.readsorted.bam``.
        keep_sorted: When false, the read-sorted BAM is deleted at the end.
        sort_threads: Thread count passed to ``samtools sort -@``.

    Returns:
        Always ``1``.
    """
    if not isfile(infile):
        BtLog.error('0', infile)
    if do_sort:
        # Interpolate the thread count; the command previously contained the
        # literal text "sort_threads".
        command = 'samtools sort -@ %s -n -O bam -T temp -o %s.readsorted.bam %s' % (
            sort_threads, infile, infile)
        runCmd(command=command, wait=True)
        infile = "%s.readsorted.bam" % infile
    reads_total, reads_mapped = checkBam(infile)
    progress_unit = int(reads_mapped / 1000)
    command = "samtools view -f 1 -F 1024 -F 256 -F 2048 %s" % infile
    seen_reads = 0
    read_pair_count, read_pair_seqs, read_pair_out_fs = init_read_pairs(
        outfile, include, exclude)
    used_fhs = {}
    iterator = runCmd(command=command)
    read_pair_type = None
    if include:
        sequence_to_type_dict = defaultdict(lambda: 'Ex')
        for incl in include:
            sequence_to_type_dict[incl] = 'In'
    elif exclude:
        sequence_to_type_dict = defaultdict(lambda: 'In')
        for excl in exclude:
            sequence_to_type_dict[excl] = 'Ex'
    else:
        sequence_to_type_dict = defaultdict(lambda: 'In')
    sequence_to_type_dict['*'] = 'Un'
    for l in iterator:
        read1 = l.split()
        try:
            seen_reads += 2
            # The mate is expected on the very next line.
            read2 = next(iterator).split()
            read_pair_type = "".join(sorted([
                sequence_to_type_dict[read1[2]],
                sequence_to_type_dict[read2[2]],
            ]))
            print_bam(read_pair_out_fs, read_pair_type, read1, read2)
            read_pair_seqs[read_pair_type] += get_read_pair_seqs(read1, read2)
            read_pair_count[read_pair_type] += 1
            BtLog.progress(seen_reads, progress_unit, reads_total)
            # progress_unit is 0 for fewer than 1000 mapped reads; skip the
            # periodic flush then (the final flush below writes everything)
            # instead of dividing by zero.
            if progress_unit and seen_reads % progress_unit == 0:
                used_fhs = write_read_pair_seqs(used_fhs, read_pair_out_fs, read_pair_seqs)
                read_pair_seqs = {pair_type: tuple() for pair_type in read_pair_count}
        except StopIteration:
            print(BtLog.warn_d['11'])
    used_fhs = write_read_pair_seqs(used_fhs, read_pair_out_fs, read_pair_seqs)
    close_fhs(used_fhs)
    # info log
    total_pairs = seen_reads // 2
    info_string = []
    info_string.append(('Total pairs', "{:,}".format(total_pairs), '{0:.1%}'.format(1.00)))
    for read_pair_type, count in read_pair_count.items():
        # Float division (not truncated under Python 2), guarded for the
        # case where no pair was read at all.
        fraction = float(count) / total_pairs if total_pairs else 0.0
        info_string.append((read_pair_type + ' pairs', "{:,}".format(count),
                            '{0:.1%}'.format(fraction)))
    info_out_f = getOutFile(outfile, None, "info.txt")
    with open(info_out_f, 'w') as info_fh:
        print(BtLog.status_d['24'] % info_out_f)
        info_fh.write(get_table(info_string))
    # gzip
    if gzip:
        if not which('gzip'):
            BtLog.error('43')
        for out_f in used_fhs:
            print(BtLog.status_d['25'] % out_f)
            runCmd(command="gzip -f " + out_f, wait=True)
    if not int(reads_total) == int(seen_reads):
        print(BtLog.warn_d['3'] % (reads_total, seen_reads))
    if do_sort and not keep_sorted:
        os.remove(infile)
    return 1
def view(self, **kwargs):
    """Fill the given view objects from the blobs and emit them.

    Keyword Args:
        viewObjs: View objects; each is dispatched on its ``name``
            (``table``, ``concoct_cov``, ``covlib``, ``experimental``,
            ``concoct_tax``) and finally has ``output()`` called.
        ranks: Taxonomic ranks to report.
        taxrule: Taxrule used for the table and concoct_tax views.
        hits_flag: Forwarded to the table header/line builders.
        seqs: Sequence names to include; falsy means ``self.order_of_blobs``.
        cov_libs: Coverage library names; falsy means all of
            ``self.covLibs``.
        progressbar: When true, progress is reported through ``BtLog``.
    """
    # arguments
    viewObjs = kwargs["viewObjs"]
    ranks = kwargs["ranks"]
    taxrule = kwargs["taxrule"]
    hits_flag = kwargs["hits_flag"]
    seqs = kwargs["seqs"]
    cov_libs = kwargs["cov_libs"]
    progress_bar = kwargs["progressbar"]
    # Default sequences if no subset
    if not seqs:
        seqs = self.order_of_blobs
    # Default cov_libs if no subset
    cov_lib_names = cov_libs if cov_libs else list(self.covLibs)
    tax_lib_names = sorted(self.hitLibs)
    lineages = self.lineages
    # setup
    for viewObj in viewObjs:
        if viewObj.name == "table":
            viewObj.header = self.getTableHeader(taxrule, ranks, hits_flag, cov_lib_names)
        if viewObj.name == "concoct_cov":
            viewObj.header = self.getConcoctCovHeader(cov_lib_names)
        if viewObj.name == "covlib":
            viewObj.header = self.getCovHeader(cov_lib_names)
        if viewObj.name == "experimental":
            viewObj.covs = {cov_lib: [] for cov_lib in cov_lib_names}
            viewObj.covs["covsum"] = []
            # Distinct loop name: reusing "taxrule" here overwrote the
            # taxrule requested by the caller for every later view.
            for known_taxrule in self.taxrules:
                viewObj.tax[known_taxrule] = {rank: [] for rank in BtTax.RANKS}
    # bodies
    seq_count = len(seqs)
    for i, seq in enumerate(seqs):
        if progress_bar:
            BtLog.progress(i, 1000, seq_count)
        blob = self.dict_of_blobs[seq]
        for viewObj in viewObjs:
            if viewObj.name == "table":
                viewObj.body.append(
                    self.getTableLine(blob, taxrule, ranks, hits_flag, cov_lib_names, tax_lib_names, lineages)
                )
            if viewObj.name == "concoct_cov":
                viewObj.body.append(self.getConcoctCovLine(blob, cov_lib_names))
            if viewObj.name == "experimental":
                viewObj.names.append(blob["name"])
                viewObj.gc.append(blob["gc"])
                viewObj.length.append(blob["length"])
                cov_sum = 0.0
                for cov_lib in blob["covs"]:
                    viewObj.covs[cov_lib].append(blob["covs"][cov_lib])
                    cov_sum += blob["covs"][cov_lib]
                viewObj.covs["covsum"].append(cov_sum)
                # Private loop names keep the requested taxrule intact.
                for blob_taxrule in blob["taxonomy"]:
                    for blob_rank in blob["taxonomy"][blob_taxrule]:
                        viewObj.tax[blob_taxrule][blob_rank].append(
                            blob["taxonomy"][blob_taxrule][blob_rank]["tax"]
                        )
            if viewObj.name == "concoct_tax":
                for rank in ranks:
                    if rank not in viewObj.body:
                        viewObj.body[rank] = []
                    viewObj.body[rank].append(self.getConcoctTaxLine(blob, rank, taxrule))
            if viewObj.name == "covlib":
                viewObj.body.append(self.getCovLine(blob, cov_lib_names))
    if progress_bar:
        BtLog.progress(seq_count, 1000, seq_count)
    for viewObj in viewObjs:
        viewObj.output()
def parseBamForFilter(infile, outfile, include, exclude, gzip, do_sort, keep_sorted, sort_threads):
    """Split the read pairs of a BAM file by the contigs they map to.

    checkBam returns reads_total and reads_mapped. The BAM is streamed with
    ``samtools view -f 1 -F 1024 -F 256 -F 2048`` and consecutive lines are
    treated as the two mates of a pair. Each mate is classified as ``In``,
    ``Ex`` or ``Un`` (reference name ``*``) and the pair type is the sorted
    concatenation of both classes.

    Args:
        infile: Path of the BAM file. ``BtLog.error('0', infile)`` is called
            when it is not an existing file.
        outfile: Output prefix handed to ``init_read_pairs`` / ``getOutFile``.
        include: Contig names classified as ``In`` (everything else ``Ex``).
        exclude: Contig names classified as ``Ex`` (everything else ``In``);
            only used when *include* is falsy.
        gzip: When true, every written file is compressed with ``gzip -f``.
        do_sort: When true, the BAM is first sorted by read name into
            ``<infile>.readsorted.bam``.
        keep_sorted: When false, the read-sorted BAM is deleted at the end.
        sort_threads: Thread count passed to ``samtools sort -@``.

    Returns:
        Always ``1``.
    """
    if not isfile(infile):
        BtLog.error('0', infile)
    if do_sort:
        # Interpolate the thread count; the command previously contained the
        # literal text "sort_threads".
        command = 'samtools sort -@ %s -n -O bam -T temp -o %s.readsorted.bam %s' % (sort_threads, infile, infile)
        runCmd(command=command, wait=True)
        infile = "%s.readsorted.bam" % infile
    reads_total, reads_mapped = checkBam(infile)
    progress_unit = int(reads_mapped / 1000)
    command = "samtools view -f 1 -F 1024 -F 256 -F 2048 %s" % infile
    seen_reads = 0
    read_pair_count, read_pair_seqs, read_pair_out_fs = init_read_pairs(outfile, include, exclude)
    used_fhs = {}
    iterator = runCmd(command=command)
    read_pair_type = None
    if include:
        sequence_to_type_dict = defaultdict(lambda: 'Ex')
        for incl in include:
            sequence_to_type_dict[incl] = 'In'
    elif exclude:
        sequence_to_type_dict = defaultdict(lambda: 'In')
        for excl in exclude:
            sequence_to_type_dict[excl] = 'Ex'
    else:
        sequence_to_type_dict = defaultdict(lambda: 'In')
    sequence_to_type_dict['*'] = 'Un'
    for l in iterator:
        read1 = l.split()
        try:
            seen_reads += 2
            # The mate is expected on the very next line.
            read2 = next(iterator).split()
            read_pair_type = "".join(sorted([sequence_to_type_dict[read1[2]], sequence_to_type_dict[read2[2]]]))
            print_bam(read_pair_out_fs, read_pair_type, read1, read2)
            read_pair_seqs[read_pair_type] += get_read_pair_seqs(read1, read2)
            read_pair_count[read_pair_type] += 1
            BtLog.progress(seen_reads, progress_unit, reads_total)
            # progress_unit is 0 for fewer than 1000 mapped reads; skip the
            # periodic flush then (the final flush below writes everything)
            # instead of dividing by zero.
            if progress_unit and seen_reads % progress_unit == 0:
                used_fhs = write_read_pair_seqs(used_fhs, read_pair_out_fs, read_pair_seqs)
                read_pair_seqs = {pair_type: tuple() for pair_type in read_pair_count}
        except StopIteration:
            print(BtLog.warn_d['11'])
    used_fhs = write_read_pair_seqs(used_fhs, read_pair_out_fs, read_pair_seqs)
    close_fhs(used_fhs)
    # info log
    total_pairs = seen_reads // 2
    info_string = []
    info_string.append(('Total pairs', "{:,}".format(total_pairs), '{0:.1%}'.format(1.00)))
    for read_pair_type, count in read_pair_count.items():
        # Float division (not truncated under Python 2), guarded for the
        # case where no pair was read at all.
        fraction = float(count) / total_pairs if total_pairs else 0.0
        info_string.append((read_pair_type + ' pairs', "{:,}".format(count), '{0:.1%}'.format(fraction)))
    info_out_f = getOutFile(outfile, None, "info.txt")
    with open(info_out_f, 'w') as info_fh:
        print(BtLog.status_d['24'] % info_out_f)
        info_fh.write(get_table(info_string))
    # gzip
    if gzip:
        if not which('gzip'):
            BtLog.error('43')
        for out_f in used_fhs:
            print(BtLog.status_d['25'] % out_f)
            runCmd(command="gzip -f " + out_f, wait=True)
    if not int(reads_total) == int(seen_reads):
        print(BtLog.warn_d['3'] % (reads_total, seen_reads))
    if do_sort and not keep_sorted:
        os.remove(infile)
    return 1