def readCas(infile, order_of_blobs):
    """Parse a CLC CAS mapping file via 'clc_mapping_info'.

    Returns (cov_dict, reads_total, reads_mapped, read_cov_dict) where
    cov_dict maps contig name -> coverage and read_cov_dict maps contig
    name -> number of mapped reads.
    """
    seqs_total, reads_total, reads_mapped = checkCas(infile)
    progress_unit = int(len(order_of_blobs) / 100)
    # Fields: index, length, reads, ?, ?, coverage.
    # BUGFIX: the decimal-point dots were unescaped ("\d+.\d{2}"), matching
    # any character between the digit groups; escape them.
    cas_line_re = re.compile(r"\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+\.\d{2})\s+(\d+)\s+(\d+\.\d{2})")
    command = "clc_mapping_info -n " + infile
    cov_dict = {}
    read_cov_dict = {}
    seqs_parsed = 0
    # BUGFIX: the original called runCmd(command) twice (once for the
    # truthiness check, once to iterate), running the external mapper twice.
    output = runCmd(command)
    if output:
        for line in output:
            cas_line_match = cas_line_re.search(line)
            if cas_line_match:
                # -1 because index of contig list starts with zero
                idx = int(cas_line_match.group(1)) - 1
                try:
                    name = order_of_blobs[idx]
                except IndexError:
                    # index beyond the supplied contig list: skip this line
                    # (was a bare "except: pass" that hid all errors)
                    continue
                reads = int(cas_line_match.group(3))
                cov = float(cas_line_match.group(6))
                cov_dict[name] = cov
                read_cov_dict[name] = reads
                seqs_parsed += 1
            BtLog.progress(seqs_parsed, progress_unit, seqs_total)
    BtLog.progress(seqs_total, progress_unit, seqs_total)
    return cov_dict, reads_total, reads_mapped, read_cov_dict
def readBam(infile, set_of_blobs): reads_total, reads_mapped = checkBam(infile) progress_unit = int(int(reads_mapped)/1000) + 1 # lazy fix base_cov_dict = {} read_cov_dict = {} cigar_match_re = re.compile(r"(\d+)M") # only gets digits before M's # execute samtools to get only mapped reads command = "samtools view -F 4 " + infile # ADD flag picard -F 1028 to not consider optical duplicates #command = "samtools view -F 1028 " + infile # only one counter since only yields mapped reads parsed_reads = 0 for line in runCmd(command): match = line.split("\t") if match >= 11: seq_name = match[2] base_cov = sum([int(matching) for matching in cigar_match_re.findall(match[5])]) if (base_cov): parsed_reads += 1 if seq_name not in set_of_blobs: print BtLog.warn_d['2'] % (seq_name, infile) else: base_cov_dict[seq_name] = base_cov_dict.get(seq_name, 0) + base_cov read_cov_dict[seq_name] = read_cov_dict.get(seq_name, 0) + 1 BtLog.progress(parsed_reads, progress_unit, reads_total) BtLog.progress(reads_total, progress_unit, reads_total) if not int(reads_mapped) == int(parsed_reads): print warn_d['3'] % (reads_mapped, parsed_reads) return base_cov_dict, reads_total, parsed_reads, read_cov_dict
def readBam(infile, set_of_blobs): reads_total, reads_mapped = checkBam(infile) progress_unit = int(int(reads_mapped) / 1000) + 1 # lazy fix base_cov_dict = {} read_cov_dict = {} cigar_match_re = re.compile(r"(\d+)M") # only gets digits before M's # execute samtools to get only mapped reads command = "samtools view -F 4 " + infile # ADD flag picard -F 1028 to not consider optical duplicates #command = "samtools view -F 1028 " + infile # only one counter since only yields mapped reads parsed_reads = 0 for line in runCmd(command): match = line.split("\t") if match >= 11: seq_name = match[2] base_cov = sum([ int(matching) for matching in cigar_match_re.findall(match[5]) ]) if (base_cov): parsed_reads += 1 if seq_name not in set_of_blobs: print BtLog.warn_d['2'] % (seq_name, infile) else: base_cov_dict[seq_name] = base_cov_dict.get(seq_name, 0) + base_cov read_cov_dict[seq_name] = read_cov_dict.get(seq_name, 0) + 1 BtLog.progress(parsed_reads, progress_unit, reads_total) BtLog.progress(reads_total, progress_unit, reads_total) if not int(reads_mapped) == int(parsed_reads): print warn_d['3'] % (reads_mapped, parsed_reads) return base_cov_dict, reads_total, parsed_reads, read_cov_dict
def readBam(infile, fasta_headers):
    """Count reads (and optionally aligned bases) per contig from a BAM file.

    Returns (base_cov_dict, read_cov_dict, reads_total, parsed_reads).

    NOTE(review): relies on names 'mq' (mapping-quality cutoff) and
    'no_base_cov_flag' that are not parameters of this function --
    presumably module-level globals; confirm they are set before calling.
    """
    reads_total, reads_mapped = checkBam(infile)
    progress_unit = int(int(reads_total) / 1000)
    base_cov_dict = {}
    cigar_match_re = re.compile(r"(\d+)M")  # only gets digits before M's
    read_cov_dict = {}
    # execute samtools to get only mapped reads from primary alignment
    # (-F 256 drops secondary alignments, -F 4 drops unmapped reads)
    command = "samtools view -q " + str(mq) + " -F 256 -F 4 " + infile
    # only one counter since only yields mapped reads
    parsed_reads = 0
    for line in runCmd(command):
        match = line.split("\t")
        seq_name = match[2]  # reference (contig) name, SAM column 3
        if seq_name not in fasta_headers:
            print BtLog.warn_d['2'] % (seq_name, infile)
        else:
            read_cov_dict[seq_name] = read_cov_dict.get(seq_name, 0) + 1
            if not (no_base_cov_flag):
                # aligned bases = sum of M-operation lengths in the CIGAR
                # string (SAM column 6)
                base_cov = sum([int(matching) for matching in cigar_match_re.findall(match[5])])
                if (base_cov):
                    base_cov_dict[seq_name] = base_cov_dict.get(seq_name, 0) + base_cov
        parsed_reads += 1
        BtLog.progress(parsed_reads, progress_unit, reads_total)
    BtLog.progress(reads_total, progress_unit, reads_total)
    return base_cov_dict, read_cov_dict, reads_total, parsed_reads
def readBam(infile, fasta_headers): reads_total, reads_mapped = checkBam(infile) progress_unit = int(int(reads_total)/1000) base_cov_dict = {} cigar_match_re = re.compile(r"(\d+)M") # only gets digits before M's read_cov_dict = {} # execute samtools to get only mapped reads from primary alignment command = "samtools view -q " + str(mq) + " -F 256 -F 4 " + infile # only one counter since only yields mapped reads parsed_reads = 0 for line in runCmd(command): match = line.split("\t") seq_name = match[2] if seq_name not in fasta_headers: print BtLog.warn_d['2'] % (seq_name, infile) else: read_cov_dict[seq_name] = read_cov_dict.get(seq_name, 0) + 1 if not (no_base_cov_flag): base_cov = sum([int(matching) for matching in cigar_match_re.findall(match[5])]) if (base_cov): base_cov_dict[seq_name] = base_cov_dict.get(seq_name, 0) + base_cov parsed_reads += 1 BtLog.progress(parsed_reads, progress_unit, reads_total) BtLog.progress(reads_total, progress_unit, reads_total) return base_cov_dict, read_cov_dict, reads_total, parsed_reads
def parseCas(infile, order_of_blobs):
    """Parse a CLC CAS mapping file via 'clc_mapping_info'.

    Returns (cov_dict, reads_total, reads_mapped, read_cov_dict).
    Raises via BtLog.error if infile does not exist.
    """
    if not isfile(infile):
        BtLog.error('0', infile)
    seqs_total, reads_total, reads_mapped = checkCas(infile)
    progress_unit = int(len(order_of_blobs) / 100)
    # Fields: index, length, reads, ?, ?, coverage.
    # BUGFIX: the decimal-point dots were unescaped ("\d+.\d{2}"); escape them.
    cas_line_re = re.compile(
        r"\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+\.\d{2})\s+(\d+)\s+(\d+\.\d{2})")
    command = "clc_mapping_info -n " + infile
    cov_dict = {}
    read_cov_dict = {}
    seqs_parsed = 0
    # BUGFIX: the original called runCmd twice (once for the truthiness
    # check, once to iterate), running the external mapper twice.
    output = runCmd(command=command)
    if output:
        for line in output:
            cas_line_match = cas_line_re.search(line)
            if cas_line_match:
                # -1 because index of contig list starts with zero
                idx = int(cas_line_match.group(1)) - 1
                try:
                    name = order_of_blobs[idx]
                except IndexError:
                    # index beyond the supplied contig list: skip this line
                    # (was a bare "except: pass" that hid all errors)
                    continue
                reads = int(cas_line_match.group(3))
                cov = float(cas_line_match.group(6))
                cov_dict[name] = cov
                read_cov_dict[name] = reads
                seqs_parsed += 1
            BtLog.progress(seqs_parsed, progress_unit, seqs_total)
    return cov_dict, reads_total, reads_mapped, read_cov_dict
def writeNodesDB(nodesDB, nodesDB_f):
    """Serialise a nodes database to a tab-separated file.

    Writes a '# nodes_count = N' header line, then one
    'node<TAB>rank<TAB>name<TAB>parent' line per taxonomy node.
    """
    nodes_count = nodesDB['nodes_count']
    written = 0
    with open(nodesDB_f, 'w') as fh:
        fh.write("# nodes_count = %s\n" % nodes_count)
        for node in nodesDB:
            if node == "nodes_count":
                continue  # skip the metadata entry itself
            written += 1
            BtLog.progress(written, 1000, nodes_count)
            entry = nodesDB[node]
            fh.write("%s\t%s\t%s\t%s\n" % (node, entry['rank'], entry['name'], entry['parent']))
def parseBam(infile, set_of_blobs, no_base_cov_flag): ''' checkBam returns reads_total and reads_mapped base_cov_dict is list of coverages for each contigs, since list appending should be faster ''' if not isfile(infile): BtLog.error('0', infile) reads_total, reads_mapped = checkBam(infile) progress_unit = int(reads_mapped / 1000) base_cov_dict = {blob: [] for blob in set_of_blobs} #base_cov_dict = {blob : 0 for blob in set_of_blobs} read_cov_dict = {blob: 0 for blob in set_of_blobs} cigar_match_re = re.compile( r"(\d+)M|X|=") # only gets digits before M,X,='s # execute samtools to get only mapped reads (no optial duplicates, no 2nd-ary alignment) command = blobtools.SAMTOOLS + " view -F 1024 -F 4 -F 256 " + infile seen_reads = 0 #import time #start = time.time() if not (no_base_cov_flag): for line in runCmd(command=command): seen_reads += 1 match = line.split() try: base_cov_dict[match[2]].append( sum([ int(matching) for matching in cigar_match_re.findall(match[5]) ])) #base_cov_dict[match[2]] += sum([int(matching) for matching in cigar_match_re.findall(match[5])]) read_cov_dict[match[2]] += 1 except: print BtLog.warn_d['2'] % (match[2]) BtLog.progress(seen_reads, progress_unit, reads_mapped) else: for line in runCmd(command=command): seen_reads += 1 match = line.split() try: read_cov_dict[match[2]] += 1 except: print BtLog.warn_d['2'] % (match[2]) BtLog.progress(seen_reads, progress_unit, reads_mapped) if not int(reads_mapped) == int(seen_reads): print BtLog.warn_d['3'] % (reads_mapped, seen_reads) reads_mapped = seen_reads base_cov_dict = { seq_name: sum(base_covs) for seq_name, base_covs in base_cov_dict.items() } #end = time.time() #print (end-start) return base_cov_dict, reads_total, reads_mapped, read_cov_dict
def parseCov(infile, set_of_blobs):
    """Parse a blobtools COV file, supporting old and new formats.

    New format files carry '##'-prefixed summary lines (total/mapped/
    unmapped read counts) and three data columns (name, read_cov,
    base_cov); old format files have two columns (name, base_cov).
    A file is treated as new format as soon as any '#' line is seen.

    Returns (base_cov_dict, reads_total, reads_mapped, reads_unmapped,
    read_cov_dict).
    """
    if not isfile(infile):
        BtLog.error('0', infile)
    old_cov_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)")
    base_cov_dict = {}
    cov_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)\t(\d+\.*\d*)")
    reads_total = 0
    reads_mapped = 0
    reads_unmapped = 0
    read_cov_dict = {}
    seqs_parsed = 0
    progress_unit = 1
    old_format = 1  # assume old two-column format until a '#' line appears
    with open(infile) as fh:
        for line in fh:
            if line.startswith("#"):
                old_format = 0
            if old_format == 0:
                if line.startswith('#'):
                    # header/summary lines of the new format
                    if line.startswith("## Total Reads"):
                        reads_total = int(line.split(" = ")[1])
                    elif line.startswith("## Mapped Reads"):
                        reads_mapped = int(line.split(" = ")[1])
                    elif line.startswith("## Unmapped Reads"):
                        reads_unmapped = int(line.split(" = ")[1])
                    else:
                        pass
                else:
                    match = cov_line_re.search(line)
                    if match:
                        seqs_parsed += 1
                        name, read_cov, base_cov = match.group(1), int(
                            match.group(2)), float(match.group(3))
                        if name not in set_of_blobs:
                            print BtLog.warn_d['2'] % (name)
                        else:
                            read_cov_dict[name] = read_cov
                            base_cov_dict[name] = base_cov
            else:
                # old format: name and base coverage only
                match = old_cov_line_re.search(line)
                if match:
                    seqs_parsed += 1
                    name, base_cov = match.group(1), float(match.group(2))
                    if name not in set_of_blobs:
                        print BtLog.warn_d['2'] % (name)
                    else:
                        base_cov_dict[name] = base_cov
            BtLog.progress(seqs_parsed, progress_unit, len(set_of_blobs))
    #BtLog.progress(len(set_of_blobs), progress_unit, len(set_of_blobs))
    return base_cov_dict, reads_total, reads_mapped, reads_unmapped, read_cov_dict
def computeTaxonomy(self, taxrules, nodesDB):
    """Assign a taxonomy to every blob under each of the given taxrules."""
    tree_lists = BtTax.getTreeList(self.set_of_taxIds, nodesDB)
    self.lineages = BtTax.getLineages(tree_lists, nodesDB)
    self.taxrules = taxrules
    done = 0
    for blob in self.dict_of_blobs.values():
        done += 1
        BtLog.progress(done, 100, self.seqs)
        for rule in taxrules:
            # blobs without hits get the explicit no-hit taxonomy
            blob.taxonomy[rule] = (BtTax.taxRule(rule, blob.hits, self.lineages)
                                   if blob.hits else BtTax.noHit())
def readNodesDB(nodesDB_f):
    """Load a nodes database file written by writeNodesDB.

    The '# nodes_count = N' header supplies the total for the progress
    display; every other line is 'node<TAB>rank<TAB>name<TAB>parent'.
    Returns the nodesDB dict mapping node -> {'rank','name','parent'}.
    """
    nodesDB = {}
    nodes_count = 0
    i = 0
    with open(nodesDB_f) as fh:
        for line in fh:
            if line.startswith("#"):
                # BUGFIX: str.lstrip() strips a *character set*, not a prefix
                # string; the original only worked by accident because digits
                # are not in that set. Parse the value after '=' explicitly.
                nodes_count = int(line.split("=")[1].strip())
            else:
                i += 1
                node, rank, name, parent = line.rstrip("\n").split("\t")
                nodesDB[node] = {'rank': rank, 'name': name, 'parent': parent}
                BtLog.progress(i, 1000, nodes_count)
    return nodesDB
def readNodesDB(nodesDB_f):
    """Load a nodes database file written by writeNodesDB.

    The '# nodes_count = N' header supplies the total for the progress
    display; every other line is 'node<TAB>rank<TAB>name<TAB>parent'.
    Returns the nodesDB dict mapping node -> {'rank','name','parent'}.
    """
    nodesDB = {}
    nodes_count = 0
    i = 0
    with open(nodesDB_f) as fh:
        for line in fh:
            if line.startswith("#"):
                # BUGFIX: str.lstrip() strips a *character set*, not a prefix
                # string; the original only worked by accident because digits
                # are not in that set. Parse the value after '=' explicitly.
                nodes_count = int(line.split("=")[1].strip())
            else:
                i += 1
                node, rank, name, parent = line.rstrip("\n").split("\t")
                nodesDB[node] = {'rank': rank, 'name': name, 'parent': parent}
                BtLog.progress(i, 1000, nodes_count)
    return nodesDB
def readCov(infile, set_of_blobs):
    """Read a blobtools COV file, supporting old and new formats.

    New format files carry '#'-prefixed summary lines and three data
    columns (name, read_cov, base_cov); old format files have two
    columns (name, base_cov). A file is treated as new format as soon
    as any '#' line is seen.

    Returns (base_cov_dict, reads_total, reads_mapped, read_cov_dict).

    NOTE(review): unlike parseCov, a name missing from set_of_blobs is
    warned about but still added to the dicts -- confirm intentional.
    """
    old_cov_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)")
    base_cov_dict = {}
    cov_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)\t(\d+\.*\d*)")
    reads_total = 0
    reads_mapped = 0
    read_cov_dict = {}
    seqs_parsed = 0
    progress_unit = 1
    old_format = 1  # assume old two-column format until a '#' line appears
    with open(infile) as fh:
        for line in fh:
            if line.startswith("#"):
                old_format = 0
            if old_format == 0:
                # header/summary lines of the new format
                if line.startswith("# Total Reads"):
                    reads_total = int(line.split(" = ")[1])
                elif line.startswith("# Mapped Reads"):
                    reads_mapped = int(line.split(" = ")[1])
                elif line.startswith("# Unmapped Reads"):
                    pass
                elif line.startswith("# Parameters"):
                    pass
                elif line.startswith("# contig_id"):
                    pass
                else:
                    match = cov_line_re.search(line)
                    if match:
                        seqs_parsed += 1
                        name, read_cov, base_cov = match.group(1), int(match.group(2)), float(match.group(3))
                        if name not in set_of_blobs:
                            print BtLog.warn_d['2'] % (name, infile)
                        read_cov_dict[name] = read_cov
                        base_cov_dict[name] = base_cov
            else:
                # old format: name and base coverage only
                match = old_cov_line_re.search(line)
                if match:
                    seqs_parsed += 1
                    name, base_cov = match.group(1), float(match.group(2))
                    if name not in set_of_blobs:
                        print BtLog.warn_d['2'] % (name, infile)
                    base_cov_dict[name] = base_cov
            BtLog.progress(seqs_parsed, progress_unit, len(set_of_blobs))
    #BtLog.progress(len(set_of_blobs), progress_unit, len(set_of_blobs))
    return base_cov_dict, reads_total, reads_mapped, read_cov_dict
def computeTaxonomy(self, taxrules, nodesDB, min_score, min_bitscore_diff, tax_collision_random):
    """Assign a taxonomy to every blob under each taxrule.

    min_score / min_bitscore_diff filter hits; tax_collision_random
    controls tie-breaking (passed through to BtTax.taxRule). Clears
    self.set_of_taxIds afterwards, since the lineages are now cached.
    """
    print BtLog.status_d['6'] % ",".join(taxrules)
    tree_lists = BtTax.getTreeList(self.set_of_taxIds, nodesDB)
    self.lineages = BtTax.getLineages(tree_lists, nodesDB)
    self.taxrules = taxrules
    self.min_score = min_score
    self.min_diff = min_bitscore_diff
    self.tax_collision_random = tax_collision_random
    i = 0
    for blObj in self.dict_of_blobs.values():
        i += 1
        BtLog.progress(i, 100, self.seqs)
        for taxrule in taxrules:
            if (blObj.hits):
                blObj.taxonomy[taxrule] = BtTax.taxRule(
                    taxrule, blObj.hits, self.lineages, min_score,
                    min_bitscore_diff, tax_collision_random)
            else:
                # blobs without hits get the explicit no-hit taxonomy
                blObj.taxonomy[taxrule] = BtTax.noHit()
    # taxIds no longer needed once lineages are computed
    self.set_of_taxIds = set()
def main(): args = docopt(__doc__) fasta_f = args['--infile'] list_f = args['--list'] invert = args['--invert'] prefix = args['--out'] output = [] out_f = BtIO.getOutFile(fasta_f, prefix, "filtered.fna") print BtLog.status_d['1'] % ("list", list_f) items = BtIO.parseSet(list_f) items_count = len(items) print BtLog.status_d['22'] % fasta_f items_parsed = [] sequences = 0 for header, sequence in BtIO.readFasta(fasta_f): sequences += 1 if header in items: if not (invert): items_parsed.append(header) output.append(">%s\n%s\n" % (header, sequence)) else: if (invert): items_parsed.append(header) output.append(">%s\n%s\n" % (header, sequence)) BtLog.progress(len(output), 10, items_count, no_limit=True) BtLog.progress(items_count, 10, items_count) items_parsed_count = len(items_parsed) print BtLog.status_d['23'] % ('{:.2%}'.format(items_parsed_count/sequences), "{:,}".format(items_count), "{:,}".format(items_parsed_count), "{:,}".format(sequences)) items_parsed_count_unique = len(set(items_parsed)) if not items_parsed_count == items_parsed_count_unique: print BtLog.warn_d['8'] % "\n\t\t\t".join(list(set([x for x in items_parsed if items_parsed.count(x) > 1]))) with open(out_f, "w") as fh: print BtLog.status_d['24'] % out_f fh.write("".join(output))
def readCov(infile, set_of_blobs): cov_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)") cov_dict = {} seqs_parsed = 0 progress_unit = int(len(set_of_blobs) / 100) with open(infile) as fh: for line in fh: BtLog.progress(seqs_parsed, 10, len(set_of_blobs)) match = cov_line_re.search(line) if match: seqs_parsed += 1 name, cov = match.group(1), float(match.group(2)) if name not in set_of_blobs: print BtLog.warn_d['2'] % (name, infile) cov_dict[name] = cov BtLog.progress(seqs_parsed, progress_unit, len(set_of_blobs)) BtLog.progress(len(set_of_blobs), progress_unit, len(set_of_blobs)) return cov_dict
def readCov(infile, set_of_blobs):
    """Read a two-column coverage file (name<TAB>coverage) into a dict.

    NOTE(review): a name missing from set_of_blobs is warned about but
    still added to cov_dict -- confirm intentional. The in-loop progress
    call uses a hard-coded unit of 10; presumably a leftover.
    """
    cov_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)")
    cov_dict = {}
    seqs_parsed = 0
    progress_unit = int(len(set_of_blobs)/100)
    with open(infile) as fh:
        for line in fh:
            BtLog.progress(seqs_parsed, 10, len(set_of_blobs))
            match = cov_line_re.search(line)
            if match:
                seqs_parsed += 1
                name, cov = match.group(1), float(match.group(2))
                if name not in set_of_blobs:
                    print BtLog.warn_d['2'] % (name, infile)
                cov_dict[name] = cov
    BtLog.progress(seqs_parsed, progress_unit, len(set_of_blobs))
    BtLog.progress(len(set_of_blobs), progress_unit, len(set_of_blobs))
    return cov_dict
def parseBamForFilter(infile, include_unmapped, outfile, include, exclude, gzip, do_sort, keep_sorted, sort_threads): ''' parse BAM to extract readpairs ''' if not isfile(infile): BtLog.error('0', infile) if do_sort: command = blobtools.SAMTOOLS + ' sort -@ sort_threads -n -O bam -T temp -o %s.readsorted.bam %s' % ( infile, infile) runCmd(command=command, wait=True) infile = "%s.readsorted.bam" % infile progress_unit = int(100000) #if progress_flag: # reads_total, reads_mapped = checkBam(infile) command = blobtools.SAMTOOLS + " view -f 1 -F 256 -F 2048 %s" % infile pair_count_by_type, pair_seqs_by_type, out_fs_by_type = init_read_pairs( outfile, include_unmapped, include, exclude) if include: sequence_to_type_dict = defaultdict(lambda: 'Ex') for incl in include: sequence_to_type_dict[incl] = 'In' sequence_to_type_dict['*'] = 'Un' elif exclude: sequence_to_type_dict = defaultdict(lambda: 'In') for excl in exclude: sequence_to_type_dict[excl] = 'Ex' sequence_to_type_dict['*'] = 'Un' else: sequence_to_type_dict = defaultdict(lambda: 'In') sequence_to_type_dict['*'] = 'Un' iterator = '' read_pair_type = None iterator = runCmd(command=command) seen_reads = 0 sam_lines = [] print BtLog.status_d['26'] % infile for sam_line in iterator: sam_lines.append(sam_line) print BtLog.status_d['22'] % infile reads_total = len(sam_lines) for i in xrange(0, len(sam_lines), 2): read1 = sam_lines[i].split() try: seen_reads += 2 read2 = sam_lines[i + 1].split() read_pair_type = "".join( sorted([ sequence_to_type_dict[read1[2]], sequence_to_type_dict[read2[2]] ])) BtLog.progress(seen_reads, progress_unit, reads_total) if read_pair_type in pair_seqs_by_type: #pair_seqs_by_type[read_pair_type] += get_read_pair_seqs(read1, read2) pair_seqs_by_type[read_pair_type].append( get_read_pair_seqs(read1, read2)) pair_count_by_type[read_pair_type] += 1 except IndexError: print BtLog.warn_d['11'] #print_bam(read_pair_out_fs, read_pair_type, read1, read2) # this prints SAM files for debugging if not 
seen_reads == reads_total: BtLog.progress(reads_total, progress_unit, reads_total) write_read_pair_seqs(pair_count_by_type, pair_seqs_by_type, out_fs_by_type) # info log info_string = [] info_string.append(('Total pairs', "{:,}".format(int(seen_reads / 2)), '{0:.1%}'.format(1.00))) for read_pair_type, count in pair_count_by_type.items(): info_string.append((read_pair_type + ' pairs', "{:,}".format(count), '{0:.1%}'.format(count / int(seen_reads / 2)))) info_out_f = getOutFile(outfile, None, "info.txt") with open(info_out_f, 'w') as info_fh: print BtLog.status_d['24'] % info_out_f info_fh.write(get_table(info_string)) if do_sort and not keep_sorted: os.remove(infile) return 1
def view(self, **kwargs):
    """Render this BlobDb through one or more view objects.

    Each viewObj is dispatched on its .name ('table', 'concoct_cov',
    'concoct_tax', 'covlib', 'experimental'): headers are prepared first,
    then one body entry is appended per selected sequence, and finally
    each viewObj writes itself via .output().

    NOTE(review): the 'experimental' setup loop reuses the name 'taxrule',
    shadowing the kwargs['taxrule'] value for all later body rendering
    when such a viewObj is present -- presumably unintended; confirm.
    """
    # arguments
    viewObjs = kwargs['viewObjs']
    ranks = kwargs['ranks']
    taxrule = kwargs['taxrule']
    hits_flag = kwargs['hits_flag']
    seqs = kwargs['seqs']
    cov_libs = kwargs['cov_libs']
    progress_bar = kwargs['progressbar']
    # Default sequences if no subset
    if not (seqs):
        seqs = self.order_of_blobs
    # Default cov_libs if no subset
    cov_lib_names = cov_libs
    if not (cov_libs):
        cov_lib_names = [covLib for covLib in self.covLibs]
    tax_lib_names = [taxLib for taxLib in sorted(self.hitLibs)]
    lineages = self.lineages
    # setup: give each view its header / accumulator structures
    for viewObj in viewObjs:
        if viewObj.name == 'table':
            viewObj.header = self.getTableHeader(taxrule, ranks, hits_flag, cov_lib_names)
        if viewObj.name == 'concoct_cov':
            viewObj.header = self.getConcoctCovHeader(cov_lib_names)
        if viewObj.name == 'covlib':
            viewObj.header = self.getCovHeader(cov_lib_names)
        if viewObj.name == 'experimental':
            viewObj.covs = {cov_lib: [] for cov_lib in cov_lib_names}
            viewObj.covs["covsum"] = []
            for taxrule in self.taxrules:
                viewObj.tax[taxrule] = {rank: [] for rank in BtTax.RANKS}
    # bodies: one entry per selected sequence, per view
    for i, seq in enumerate(seqs):
        if (progress_bar):
            BtLog.progress(i, 1000, len(seqs))
        blob = self.dict_of_blobs[seq]
        for viewObj in viewObjs:
            if viewObj.name == 'table':
                viewObj.body.append(
                    self.getTableLine(blob, taxrule, ranks, hits_flag,
                                      cov_lib_names, tax_lib_names, lineages))
            if viewObj.name == 'concoct_cov':
                viewObj.body.append(
                    self.getConcoctCovLine(blob, cov_lib_names))
            if viewObj.name == 'experimental':
                viewObj.names.append(blob['name'])
                viewObj.gc.append(blob['gc'])
                viewObj.length.append(blob['length'])
                # accumulate per-library coverages and their sum
                cov_sum = 0.0
                for cov_lib in blob['covs']:
                    viewObj.covs[cov_lib].append(blob['covs'][cov_lib])
                    cov_sum += blob['covs'][cov_lib]
                viewObj.covs['covsum'].append(cov_sum)
                for taxrule in blob['taxonomy']:
                    for rank in blob['taxonomy'][taxrule]:
                        viewObj.tax[taxrule][rank].append(
                            blob['taxonomy'][taxrule][rank]['tax'])
            if viewObj.name == 'concoct_tax':
                for rank in ranks:
                    # lazily create the per-rank body list
                    if not rank in viewObj.body:
                        viewObj.body[rank] = []
                    viewObj.body[rank].append(
                        self.getConcoctTaxLine(blob, rank, taxrule))
            if viewObj.name == 'covlib':
                viewObj.body.append(self.getCovLine(blob, cov_lib_names))
    if (progress_bar):
        BtLog.progress(len(seqs), 1000, len(seqs))
    for viewObj in viewObjs:
        viewObj.output()