def parseJson(infile):
    '''http://artem.krylysov.com/blog/2015/09/29/benchmark-python-json-libraries/'''
    if not isfile(infile):
        BtLog.error('0', infile)
    import time
    start = time.time()
    json_parser = ''
    with open(infile, 'r') as fh:
        print BtLog.status_d['15']
        json_string = fh.read()
    try:
        import ujson as json  # fastest
        json_parser = 'ujson'
        print BtLog.status_d['16'] % json_parser
    except ImportError:
        try:
            import simplejson as json  # fast
            json_parser = 'simplejson'
        except ImportError:
            import json  # default
            json_parser = 'json'
        print BtLog.status_d['17'] % json_parser
    try:
        obj = json.loads(json_string.decode("ascii"))
    except ValueError:
        BtLog.error('37', infile, "BlobDB")
    data = byteify(obj)
    print BtLog.status_d['20'] % (time.time() - start)
    return data
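# Usage sketch (file name hypothetical): loads a BlobDB JSON with whichever
# backend is importable (ujson > simplejson > json) and returns plain
# byte-string dicts/lists via byteify().
# >>> data = parseJson('example.blobDB.json')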
def parseFasta(self, fasta_f, fasta_type):
    print BtLog.status_d['1'] % ('FASTA', fasta_f)
    self.assembly_f = abspath(fasta_f)
    if (fasta_type):
        # Set up CovLibObj for coverage in assembly header
        self.covLibs[fasta_type] = CovLibObj(fasta_type, fasta_type, fasta_f)
    for name, seq in BtIO.readFasta(fasta_f):
        blObj = BlObj(name, seq)
        if not blObj.name in self.dict_of_blobs:
            self.seqs += 1
            self.length += blObj.length
            self.n_count += blObj.n_count
            if (fasta_type):
                cov = BtIO.parseCovFromHeader(fasta_type, blObj.name)
                self.covLibs[fasta_type].cov_sum += cov
                blObj.addCov(fasta_type, cov)
            self.order_of_blobs.append(blObj.name)
            self.dict_of_blobs[blObj.name] = blObj
        else:
            BtLog.error('5', blObj.name)
    if self.seqs == 0 or self.length == 0:
        BtLog.error('1')
def set_format_scatterplot(axScatter, **kwargs):
    min_x, max_x = None, None
    min_y, max_y = None, None
    if kwargs['plot'] == 'blobplot':
        min_x, max_x = 0, 1
        major_xticks = MultipleLocator(0.2)
        minor_xticks = AutoMinorLocator(20)
        min_y, max_y = kwargs['min_cov'] * 0.1, kwargs['max_cov'] + 1000
        axScatter.set_yscale('log')
        axScatter.set_xscale('linear')
        axScatter.xaxis.set_major_locator(major_xticks)
        axScatter.xaxis.set_minor_locator(minor_xticks)
    elif kwargs['plot'] == 'covplot':
        min_x, max_x = kwargs['min_cov'] * 0.1, kwargs['max_cov'] + 1000
        min_y, max_y = kwargs['min_cov'] * 0.1, kwargs['max_cov'] + 1000
        axScatter.set_yscale('log')
        axScatter.set_xscale('log')
    else:
        BtLog.error('34', kwargs['plot'])
    axScatter.set_xlim((min_x, max_x))
    axScatter.set_ylim((min_y, max_y))  # this sets the max-coverage so that all libraries + sum are at the same scale
    axScatter.grid(True, which="major", lw=2., color=WHITE, linestyle='-')
    axScatter.set_axisbelow(True)
    axScatter.xaxis.labelpad = 20
    axScatter.yaxis.labelpad = 20
    axScatter.yaxis.get_major_ticks()[0].label1.set_visible(False)
    axScatter.tick_params(axis='both', which='both', direction='out')
    return axScatter
def parseCas(infile, order_of_blobs):
    if not isfile(infile):
        BtLog.error('0', infile)
    seqs_total, reads_total, reads_mapped = checkCas(infile)
    progress_unit = int(len(order_of_blobs) / 100)
    cas_line_re = re.compile(r"\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+\.\d{2})\s+(\d+)\s+(\d+\.\d{2})")
    command = "clc_mapping_info -n " + infile
    cov_dict = {}
    read_cov_dict = {}
    seqs_parsed = 0
    if (runCmd(command=command)):  # note: the guard and the loop each invoke clc_mapping_info
        for line in runCmd(command=command):
            cas_line_match = cas_line_re.search(line)
            if cas_line_match:
                idx = int(cas_line_match.group(1)) - 1  # -1 because index of contig list starts with zero
                try:
                    name = order_of_blobs[idx]
                    reads = int(cas_line_match.group(3))
                    cov = float(cas_line_match.group(6))
                    cov_dict[name] = cov
                    read_cov_dict[name] = reads
                    seqs_parsed += 1
                except:
                    pass
                BtLog.progress(seqs_parsed, progress_unit, seqs_total)
    return cov_dict, reads_total, reads_mapped, read_cov_dict
def subselect_cov_libs(self, cov_lib_dict, cov_lib_selection):
    selected_cov_libs = []
    cov_lib_selection_error = 0
    if (cov_lib_selection):
        if cov_lib_selection == 'covsum':
            selected_cov_libs.append('covsum')
        elif "," in cov_lib_selection:
            selected_cov_libs = cov_lib_selection.split(",")
            if not set(selected_cov_libs).issubset(set(cov_lib_dict.keys())):
                cov_lib_selection_error = 1
        else:
            selected_cov_libs.append(cov_lib_selection)
            if not cov_lib_selection in cov_lib_dict:
                cov_lib_selection_error = 1
    else:
        selected_cov_libs = cov_lib_dict.keys()
    if cov_lib_selection_error:
        covlib_string = []
        for covlib in cov_lib_dict:
            cov_lib_f = cov_lib_dict[covlib]['f']
            if not cov_lib_f:
                cov_lib_f = "sum of coverages from all covlibs"
            covlib_string.append("\t\t%s : %s" % (covlib, cov_lib_f))
        BtLog.error('33', "\n".join(covlib_string))
    return selected_cov_libs
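# Accepted values for cov_lib_selection (inferred from the branches above;
# library names such as 'bam0'/'bam1' are illustrative):
# >>> blobDb.subselect_cov_libs(cov_lib_dict, 'covsum')     # the summed coverage
# >>> blobDb.subselect_cov_libs(cov_lib_dict, 'bam0,bam1')  # comma-separated subset
# >>> blobDb.subselect_cov_libs(cov_lib_dict, 'bam0')       # a single covlib
# >>> blobDb.subselect_cov_libs(cov_lib_dict, None)         # falsy -> all covlibs
# Any unknown name triggers BtLog.error('33') with a listing of the available covlibs.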
def main():
    args = docopt(__doc__)
    bam_f = args['--bam']
    include_f = args['--include']
    exclude_f = args['--exclude']
    out_prefix = args['--out']
    gzip = args['--gzip']
    do_sort = args['--sort']
    keep_sorted = args['--keep']
    sort_threads = int(args['--threads'])
    print BtLog.status_d['22'] % bam_f
    out_f = BtIO.getOutFile(bam_f, out_prefix, None)
    if include_f and exclude_f:
        BtLog.error('43')  # --include and --exclude are mutually exclusive
    elif include_f:
        sequence_list = BtIO.parseList(include_f)
        BtIO.parseBamForFilter(bam_f, out_f, sequence_list, None, gzip, do_sort, keep_sorted, sort_threads)
    elif exclude_f:
        sequence_list = BtIO.parseList(exclude_f)
        BtIO.parseBamForFilter(bam_f, out_f, None, sequence_list, gzip, do_sort, keep_sorted, sort_threads)
    else:
        BtIO.parseBamForFilter(bam_f, out_f, None, None, gzip, do_sort, keep_sorted, sort_threads)
def main():
    #main_dir = dirname(__file__)
    args = docopt(__doc__)
    fasta_f = args['--infile']
    fasta_type = args['--type']
    sam_fs = args['--sam']
    bam_fs = args['--bam']
    cov_fs = args['--cov']
    cas_fs = args['--cas']
    hit_fs = args['--hitsfile']
    prefix = args['--out']
    nodesDB_f = args['--db']
    names_f = args['--names']
    nodes_f = args['--nodes']
    taxrules = args['--taxrule']
    min_bitscore_diff = float(args['--min_diff'])
    tax_collision_random = args['--tax_collision_random']
    title = args['--title']

    # Outfile
    out_f = BtIO.getOutFile("blobDB", prefix, "json")
    if not (title):
        title = out_f

    # Coverage
    if not (fasta_type) and not bam_fs and not sam_fs and not cov_fs and not cas_fs:
        BtLog.error('1')
    cov_libs = [BtCore.CovLibObj('bam' + str(idx), 'bam', lib_f) for idx, lib_f in enumerate(bam_fs)] + \
               [BtCore.CovLibObj('sam' + str(idx), 'sam', lib_f) for idx, lib_f in enumerate(sam_fs)] + \
               [BtCore.CovLibObj('cas' + str(idx), 'cas', lib_f) for idx, lib_f in enumerate(cas_fs)] + \
               [BtCore.CovLibObj('cov' + str(idx), 'cov', lib_f) for idx, lib_f in enumerate(cov_fs)]

    # Taxonomy
    hit_libs = [BtCore.HitLibObj('tax' + str(idx), 'tax', lib_f) for idx, lib_f in enumerate(hit_fs)]

    # Create BlobDB object
    blobDb = BtCore.BlobDb(title)
    blobDb.version = blobtools.__version__

    # Parse FASTA
    blobDb.parseFasta(fasta_f, fasta_type)

    # Parse nodesDB OR names.dmp, nodes.dmp
    nodesDB_default = join(blobtools.DATADIR, "nodesDB.txt")
    nodesDB, nodesDB_f = BtIO.parseNodesDB(nodes=nodes_f, names=names_f, nodesDB=nodesDB_f, nodesDBdefault=nodesDB_default)
    blobDb.nodesDB_f = nodesDB_f

    # Parse similarity hits
    if (hit_libs):
        blobDb.parseHits(hit_libs)
        blobDb.computeTaxonomy(taxrules, nodesDB, min_bitscore_diff, tax_collision_random)
    else:
        print BtLog.warn_d['0']

    # Parse coverage
    blobDb.parseCoverage(covLibObjs=cov_libs, no_base_cov=None)

    # Generating BlobDB and writing to file
    print BtLog.status_d['7'] % out_f
    BtIO.writeJson(blobDb.dump(), out_f)
def parseCovFromHeader(fasta_type, header):
    '''
    Returns the coverage from the header of a FASTA
    sequence depending on the assembly type
    '''
    ASSEMBLY_TYPES = [None, 'spades', 'velvet', 'platanus']
    if not fasta_type in ASSEMBLY_TYPES:
        BtLog.error('2', ",".join(ASSEMBLY_TYPES[1:]))
    if fasta_type == 'spades':
        spades_match_re = re.compile(r"_cov_(\d+\.*\d*)")
        return float(spades_match_re.findall(header)[0])
    elif fasta_type == 'velvet':
        return float(header.split("_")[-1])
    #elif fasta_type == 'abyss' or fasta_type == 'soap':
    #    temp = header.split(" ")
    #    return float(temp[2]/(temp[1]+1-75))
    elif fasta_type == 'platanus':
        temp = header.rstrip("\n").split("_")
        if len(temp) >= 3:
            return float(temp[2].replace("cov", ""))  # scaffold/scaffoldBubble/contig
        return float(temp[1].replace("cov", ""))  # gapClosed
    else:
        pass
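# Illustrative headers for each supported fasta_type (names invented, but the
# fields the function reads match the parsing logic above):
# >>> parseCovFromHeader('spades', 'NODE_1_length_54321_cov_12.5')
# 12.5
# >>> parseCovFromHeader('velvet', 'NODE_1_length_54321_cov_24.6')  # last "_" field
# 24.6
# >>> parseCovFromHeader('platanus', 'scaffold1_len54321_cov22')    # third "_" field
# 22.0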
def parseSet(infile):
    if not isfile(infile):
        BtLog.error('0', infile)
    with open(infile) as fh:
        items = set()
        for l in fh:
            items.add(l.rstrip("\n").lstrip(">"))
    return items
def parseList(infile):
    if not isfile(infile):
        BtLog.error('0', infile)
    with open(infile) as fh:
        items = []
        for l in fh:
            items.append(l.rstrip("\n"))
    return items
def parseColours(infile):
    items = {}
    if infile:
        if not isfile(infile):
            BtLog.error('0', infile)
        with open(infile) as fh:
            for l in fh:
                temp = l.rstrip("\n").split(",")
                items[temp[0]] = temp[1]
    return items
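# Input sketches for the three list parsers above (contents illustrative):
# - parseSet()/parseList(): one item per line; parseSet() also strips a
#   leading ">", so plain FASTA header lines are accepted.
#       contig_1
#       contig_2
# - parseColours(): comma-separated "group,colour" pairs, one per line.
#       Arthropoda,#1f77b4
#       no-hit,#d3d3d3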
def writeNodesDB(nodesDB, nodesDB_f):
    print BtLog.status_d['5'] % nodesDB_f
    nodes_count = nodesDB['nodes_count']
    i = 0
    with open(nodesDB_f, 'w') as fh:
        fh.write("# nodes_count = %s\n" % nodes_count)
        for node in nodesDB:
            if not node == "nodes_count":
                i += 1
                BtLog.progress(i, 1000, nodes_count)
                fh.write("%s\t%s\t%s\t%s\n" % (node, nodesDB[node]['rank'], nodesDB[node]['name'], nodesDB[node]['parent']))
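# Resulting nodesDB.txt layout (rows illustrative): a comment header with the
# node count, then one tab-separated line per taxid with rank, name and parent
# taxid -- the exact shape readNodesDB() expects below.
#   # nodes_count = 2
#   2       superkingdom    Bacteria        131567
#   6656    phylum          Arthropoda      88770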
def parseCov(infile, set_of_blobs):
    if not isfile(infile):
        BtLog.error('0', infile)
    old_cov_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)")
    cov_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)\t(\d+\.*\d*)")
    base_cov_dict = {}
    reads_total = 0
    reads_mapped = 0
    reads_unmapped = 0
    read_cov_dict = {}
    seqs_parsed = 0
    progress_unit = 1
    old_format = 1
    with open(infile) as fh:
        for line in fh:
            if line.startswith("#"):
                old_format = 0
            if old_format == 0:
                if line.startswith('#'):
                    if line.startswith("## Total Reads"):
                        reads_total = int(line.split(" = ")[1])
                    elif line.startswith("## Mapped Reads"):
                        reads_mapped = int(line.split(" = ")[1])
                    elif line.startswith("## Unmapped Reads"):
                        reads_unmapped = int(line.split(" = ")[1])
                    else:
                        pass
                else:
                    match = cov_line_re.search(line)
                    if match:
                        seqs_parsed += 1
                        name, read_cov, base_cov = match.group(1), int(match.group(2)), float(match.group(3))
                        if name not in set_of_blobs:
                            print BtLog.warn_d['2'] % (name, infile)
                        else:
                            read_cov_dict[name] = read_cov
                            base_cov_dict[name] = base_cov
            else:
                match = old_cov_line_re.search(line)
                if match:
                    seqs_parsed += 1
                    name, base_cov = match.group(1), float(match.group(2))
                    if name not in set_of_blobs:
                        print BtLog.warn_d['2'] % (name, infile)
                    else:
                        base_cov_dict[name] = base_cov
            BtLog.progress(seqs_parsed, progress_unit, len(set_of_blobs))
        #BtLog.progress(len(set_of_blobs), progress_unit, len(set_of_blobs))
    return base_cov_dict, reads_total, reads_mapped, reads_unmapped, read_cov_dict
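# Sketch of the two .cov layouts distinguished above (numbers illustrative).
# Old format, two tab-separated columns (name, base coverage):
#   contig_1    24.5
# New format, preceded by "##" header lines and adding a read-count column
# (name, read coverage, base coverage):
#   ## Total Reads = 1000
#   ## Mapped Reads = 900
#   ## Unmapped Reads = 100
#   contig_1    42    24.5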
def readFasta(infile):
    if not isfile(infile):
        BtLog.error('0', infile)
    with open(infile) as fh:
        header, seqs = '', []
        for l in fh:
            if l[0] == '>':
                if header:
                    yield header, ''.join(seqs)
                header, seqs = l[1:-1].split()[0], []  # header is split at first whitespace
            else:
                seqs.append(l[:-1])
        yield header, ''.join(seqs)
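# Generator usage sketch: given a FASTA file (hypothetical contents)
#   >contig_1 length=8
#   ACGT
#   ACGT
# readFasta() yields one (name, sequence) tuple per record, with the name cut
# at the first whitespace:
# >>> list(readFasta('example.fna'))
# [('contig_1', 'ACGTACGT')]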
def parseDict(infile, key, value):
    items = {}
    if infile:
        if not isfile(infile):
            BtLog.error('0', infile)
        with open(infile) as fh:
            k_idx = int(key)
            v_idx = int(value)
            for l in fh:
                temp = l.rstrip("\n").split()
                items[temp[k_idx]] = temp[v_idx]
    return items
def parseCatColour(infile):
    catcolour_dict = {}
    if infile:
        if not isfile(infile):
            BtLog.error('0', infile)
        with open(infile) as fh:
            for l in fh:
                try:
                    seq_name, category = l.rstrip("\n").split(",")
                    catcolour_dict[seq_name] = category
                except:
                    BtLog.error('23', infile)
    return catcolour_dict
def parseReferenceCov(infile):
    refcov_dict = {}
    if infile:
        if not isfile(infile):
            BtLog.error('0', infile)
        with open(infile) as fh:
            for l in fh:
                try:
                    cov_lib, reads_total_ref, reads_mapped_ref = l.split(",")
                    refcov_dict[cov_lib] = {
                        'reads_total': int(reads_total_ref),
                        'reads_mapped': int(reads_mapped_ref)
                    }
                except:
                    BtLog.error('21', infile)
    return refcov_dict
def plotBar(self, cov_lib, out_f):
    fig, ax_main, ax_group, x_pos_main, x_pos_group = self.setupPlot('readcov')
    ax_main_data = {'labels': [], 'values': [], 'colours': []}
    ax_group_data = {'labels': [], 'values': [], 'colours': []}
    reads_total = self.cov_libs_total_reads_dict[cov_lib]
    reads_mapped = self.stats['all']['reads_mapped'][cov_lib]
    reads_unmapped = reads_total - self.stats['all']['reads_mapped'][cov_lib]
    # ratios below assume true division (e.g. via "from __future__ import division");
    # under Python 2 floor division they would all be 0
    ax_main_data['labels'].append('Unmapped (assembly)')
    ax_main_data['values'].append(reads_unmapped / reads_total)
    ax_main_data['colours'].append(DGREY)
    ax_main_data['labels'].append('Mapped (assembly)')
    ax_main_data['values'].append(reads_mapped / reads_total)
    ax_main_data['colours'].append(DGREY)
    if (self.refcov_dict):
        if cov_lib in self.refcov_dict:
            reads_total_ref = self.refcov_dict[cov_lib]['reads_total']
            reads_mapped_ref = self.refcov_dict[cov_lib]['reads_mapped']
            reads_unmapped_ref = reads_total_ref - reads_mapped_ref
            ax_main_data['labels'].append('Unmapped (ref)')
            ax_main_data['values'].append(reads_unmapped_ref / reads_total_ref)
            ax_main_data['colours'].append(DGREY)
            ax_main_data['labels'].append('Mapped (ref)')
            ax_main_data['values'].append(reads_mapped_ref / reads_total_ref)
            ax_main_data['colours'].append(DGREY)
        else:
            BtLog.error('40', cov_lib)
    # mapped plotted groups
    for group in self.plot_order:
        ax_group_data['labels'].append(group)
        ax_group_data['values'].append(self.stats[group]['reads_mapped_perc'][cov_lib])
        ax_group_data['colours'].append(self.colours[group])
    rect_group = ax_group.bar(x_pos_group, ax_group_data['values'], width=0.5,
                              tick_label=ax_group_data['labels'], align='center',
                              color=ax_group_data['colours'])
    for rect_g in rect_group:
        height_g = float(rect_g.get_height())
        ax_group.text(rect_g.get_x() + rect_g.get_width() / 2., 0.005 + height_g,
                      '{:.2f}%'.format(height_g * 100), ha='center', va='bottom',
                      fontsize=LEGEND_FONTSIZE)
    rect_main = ax_main.bar(x_pos_main, ax_main_data['values'], width=0.5,
                            tick_label=ax_main_data['labels'], align='center',
                            color=ax_main_data['colours'])
    for rect_m in rect_main:
        height_m = float(rect_m.get_height())
        ax_main.text(rect_m.get_x() + rect_m.get_width() / 2., 0.005 + height_m,
                     '{:.2f}%'.format(height_m * 100), ha='center', va='bottom',
                     fontsize=LEGEND_FONTSIZE)
    ax_main.set_xticklabels(ax_main_data['labels'], rotation=45, ha='center', fontsize=LEGEND_FONTSIZE)
    ax_group.set_xticklabels(ax_group_data['labels'], rotation=45, ha='center', fontsize=LEGEND_FONTSIZE)
    #figsuptitle = fig.suptitle(out_f, verticalalignment='top')
    out_f = "%s.read_cov.%s" % (out_f, cov_lib)
    print BtLog.status_d['8'] % "%s.%s" % (out_f, self.format)
    fig.tight_layout()
    #fig.savefig("%s.%s" % (out_f, self.format), format=self.format, bbox_extra_artists=(figsuptitle,))
    fig.savefig("%s.%s" % (out_f, self.format), format=self.format)
    plt.close(fig)
def parseCmdLabels(labels):
    label_d = {}
    name, groups = '', ''
    if labels:
        try:
            for label in labels:
                name, groups = str(label).split("=")
                if "," in groups:
                    for group in groups.split(","):
                        label_d[group] = name
                else:
                    label_d[groups] = name
        except:
            BtLog.error('17', labels)
    return label_d
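# Label syntax sketch (names illustrative): each label is "<name>=<group>" or
# "<name>=<group1>,<group2>", mapping every listed group to that name:
# >>> parseCmdLabels(['bacteria=Proteobacteria,Firmicutes', 'host=Arthropoda'])
# {'Proteobacteria': 'bacteria', 'Firmicutes': 'bacteria', 'Arthropoda': 'host'}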
def computeTaxonomy(self, taxrules, nodesDB, min_bitscore_diff, tax_collision_random):
    print BtLog.status_d['6'] % ",".join(taxrules)
    tree_lists = BtTax.getTreeList(self.set_of_taxIds, nodesDB)
    self.lineages = BtTax.getLineages(tree_lists, nodesDB)
    self.taxrules = taxrules
    i = 0
    for blObj in self.dict_of_blobs.values():
        i += 1
        BtLog.progress(i, 100, self.seqs)
        for taxrule in taxrules:
            if (blObj.hits):
                blObj.taxonomy[taxrule] = BtTax.taxRule(taxrule, blObj.hits, self.lineages, min_bitscore_diff, tax_collision_random)
            else:
                blObj.taxonomy[taxrule] = BtTax.noHit()
    self.set_of_taxIds = set()
def readNodesDB(nodesDB_f):
    nodesDB = {}
    nodesDB_count = 0
    nodes_count = 0
    with open(nodesDB_f) as fh:
        for line in fh:
            if line.startswith("#"):
                nodesDB_count = int(line.lstrip("# nodes_count = ").rstrip("\n"))
            else:
                nodes_count += 1
                node, rank, name, parent = line.rstrip("\n").split("\t")
                nodesDB[node] = {'rank': rank, 'name': name, 'parent': parent}
                if (nodesDB_count):
                    BtLog.progress(nodes_count, 1000, nodesDB_count)
    nodesDB['nodes_count'] = nodes_count
    return nodesDB
def parseSam(infile, set_of_blobs, no_base_cov_flag):
    if not isfile(infile):
        BtLog.error('0', infile)
    base_cov_dict = {blob: [] for blob in set_of_blobs}
    read_cov_dict = {blob: 0 for blob in set_of_blobs}
    cigar_match_re = re.compile(r"(\d+)[MX=]")  # only gets digits before M, X, ='s
    reads_total = 0
    reads_mapped = 0
    if not (no_base_cov_flag):
        with open(infile) as fh:
            for line in fh:
                if line.startswith("@"):  # skip SAM header lines
                    continue
                reads_total += 1
                match = line.split()
                if not match[2] == '*':  # RNAME '*' means unmapped
                    reads_mapped += 1
                    try:
                        base_cov_dict[match[2]].append(sum([int(matching) for matching in cigar_match_re.findall(match[5])]))
                        read_cov_dict[match[2]] += 1
                    except:
                        print BtLog.warn_d['2'] % (match[2], infile)
    else:
        with open(infile) as fh:
            for line in fh:
                if line.startswith("@"):
                    continue
                reads_total += 1
                match = line.split()
                if not match[2] == '*':
                    reads_mapped += 1
                    try:
                        read_cov_dict[match[2]] += 1
                    except:
                        print BtLog.warn_d['2'] % (match[2], infile)
    base_cov_dict = {seq_name: sum(base_covs) for seq_name, base_covs in base_cov_dict.items()}
    return base_cov_dict, reads_total, reads_mapped, read_cov_dict
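# How the CIGAR tally above works: sum the lengths of the aligned-base
# operations (M, X, =) and ignore everything else (here the 2I insertion):
# >>> import re
# >>> sum(int(n) for n in re.compile(r"(\d+)[MX=]").findall("60M2I38M"))
# 98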
def main():
    args = docopt(__doc__)
    fasta_f = args['--infile']
    bam_fs = args['--bam']
    cas_fs = args['--cas']
    sam_fs = args['--sam']
    prefix = args['--output']
    no_base_cov_flag = args['--no_base_cov']

    # Make covLibs
    cov_libs = [BtCore.CovLibObj('bam' + str(idx), 'bam', lib_f) for idx, lib_f in enumerate(bam_fs)] + \
               [BtCore.CovLibObj('sam' + str(idx), 'sam', lib_f) for idx, lib_f in enumerate(sam_fs)] + \
               [BtCore.CovLibObj('cas' + str(idx), 'cas', lib_f) for idx, lib_f in enumerate(cas_fs)]
    if not (cov_libs):
        BtLog.error('31')
    blobDb = BtCore.BlobDb('cov')
    blobDb.version = blobtools.__version__
    blobDb.parseFasta(fasta_f, None)
    blobDb.parseCoverage(covLibObjs=cov_libs, no_base_cov=no_base_cov_flag)
def checkCas(infile):
    print BtLog.status_d['12']
    if not isfile(infile):
        BtLog.error('0', infile)
    if not (which('clc_mapping_info')):
        BtLog.error('20')
    seqs_total_re = re.compile(r"\s+Contigs\s+(\d+)")
    reads_total_re = re.compile(r"\s+Reads\s+(\d+)")
    reads_mapping_re = re.compile(r"\s+Mapped reads\s+(\d+)\s+(\d+\.\d+)\s+\%")
    seqs_total, reads_total, reads_mapped = 0, 0, 0
    output = ''
    command = "clc_mapping_info -s " + infile
    for line in runCmd(command=command):
        output += line
    seqs_total = int(seqs_total_re.search(output).group(1))
    reads_mapped = int(reads_mapping_re.search(output).group(1))
    reads_total = int(reads_total_re.search(output).group(1))
    print BtLog.status_d['11'] % ('{:,}'.format(reads_mapped),
                                  '{:,}'.format(reads_total),
                                  '{0:.1%}'.format(reads_mapped / reads_total))  # assumes true division
    return seqs_total, reads_total, reads_mapped
def readTax(infile, set_of_blobs):
    '''
    If more fields need to be parsed:
    - change hit_line_re
    - catch matches in variables
    - add as key-value pairs to hitDict
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    hit_line_re = re.compile(r"^(\S+)\s+(\d+)[\;?\d+]*\s+(\d+\.*\d*)")  # tolerates ";"-separated extra taxids after the first
    with open(infile) as fh:
        for line in fh:
            match = hit_line_re.search(line)
            if match:
                hitDict = {
                    'name': match.group(1),
                    'taxId': match.group(2),  # kept as string, because if int, conversion is a nightmare ...
                    'score': float(match.group(3))
                }
                if hitDict['name'] not in set_of_blobs:
                    BtLog.error('19', hitDict['name'], infile)
                if hitDict['taxId'] == 'N/A':
                    BtLog.error('22', infile)
                yield hitDict
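# Hits-file line sketch (values illustrative): whitespace-separated
# <sequence name> <taxid> <bitscore>, e.g. as produced by BLAST with
# -outfmt '6 qseqid staxids bitscore':
#   contig_1    1234         500.0
#   contig_2    1234;5678    327.0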
def checkBam(infile):
    print BtLog.status_d['10']
    if not isfile(infile):
        BtLog.error('0', infile)
    if not which('samtools'):
        BtLog.error('7')
    reads_mapped_re = re.compile(r"(\d+)\s\+\s\d+\smapped")
    reads_secondary_re = re.compile(r"(\d+)\s\+\s\d+\ssecondary")
    reads_supplementary_re = re.compile(r"(\d+)\s\+\s\d+\ssupplementary")
    reads_total_re = re.compile(r"(\d+)\s\+\s\d+\sin total")
    reads_total, reads_mapped = 0, 0
    output = ''
    command = "samtools flagstat " + infile
    for line in runCmd(command=command):
        output += line
    reads_mapped = int(reads_mapped_re.search(output).group(1))
    reads_secondary = int(reads_secondary_re.search(output).group(1))
    reads_supplementary = int(reads_supplementary_re.search(output).group(1))
    # flagstat's "mapped" and "in total" counts include secondary and
    # supplementary alignments; subtract each once to count primary reads
    reads_mapped = reads_mapped - reads_secondary - reads_supplementary
    reads_total = int(reads_total_re.search(output).group(1)) - reads_secondary - reads_supplementary
    # check whether there are reads in BAM
    if not reads_total or not reads_mapped:
        BtLog.error('29', infile)
    print BtLog.status_d['11'] % ('{:,}'.format(reads_mapped),
                                  '{:,}'.format(reads_total),
                                  '{0:.1%}'.format(reads_mapped / reads_total))
    return reads_total, reads_mapped
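# Sample flagstat lines the regexes above match (counts illustrative):
#   4000 + 0 in total (QC-passed reads + QC-failed reads)
#   100 + 0 secondary
#   20 + 0 supplementary
#   3900 + 0 mapped (97.50% : N/A)
# >>> import re
# >>> re.search(r"(\d+)\s\+\s\d+\smapped", "3900 + 0 mapped (97.50% : N/A)").group(1)
# '3900'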
def main():
    args = docopt(__doc__)
    fasta_f = args['--infile']
    list_f = args['--list']
    invert = args['--invert']
    prefix = args['--out']

    output = []
    out_f = BtIO.getOutFile(fasta_f, prefix, "filtered.fna")

    print BtLog.status_d['1'] % ("list", list_f)
    items = BtIO.parseSet(list_f)
    items_count = len(items)
    print BtLog.status_d['22'] % fasta_f
    items_parsed = []
    sequences = 0
    for header, sequence in BtIO.readFasta(fasta_f):
        sequences += 1
        if header in items:
            if not (invert):
                items_parsed.append(header)
                output.append(">%s\n%s\n" % (header, sequence))
        else:
            if (invert):
                items_parsed.append(header)
                output.append(">%s\n%s\n" % (header, sequence))
        BtLog.progress(len(output), 10, items_count, no_limit=True)
    BtLog.progress(items_count, 10, items_count)

    items_parsed_count = len(items_parsed)
    print BtLog.status_d['23'] % ('{:.2%}'.format(items_parsed_count / sequences),
                                  "{:,}".format(items_count),
                                  "{:,}".format(items_parsed_count),
                                  "{:,}".format(sequences))
    items_parsed_count_unique = len(set(items_parsed))
    if not items_parsed_count == items_parsed_count_unique:
        print BtLog.warn_d['8'] % "\n\t\t\t".join(list(set([x for x in items_parsed if items_parsed.count(x) > 1])))
    with open(out_f, "w") as fh:
        print BtLog.status_d['24'] % out_f
        fh.write("".join(output))
def parseBam(infile, set_of_blobs, no_base_cov_flag):
    '''
    checkBam returns reads_total and reads_mapped
    base_cov_dict holds a list of per-read aligned-base counts for each
    contig, since list appending should be faster than repeated addition
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    reads_total, reads_mapped = checkBam(infile)
    progress_unit = int(reads_mapped / 1000)
    base_cov_dict = {blob: [] for blob in set_of_blobs}
    read_cov_dict = {blob: 0 for blob in set_of_blobs}
    cigar_match_re = re.compile(r"(\d+)[MX=]")  # only gets digits before M, X, ='s
    # execute samtools to get only mapped reads:
    # -F 4 excludes unmapped, -F 256 secondary alignments, -F 1024 duplicates
    command = "samtools view -F 1024 -F 4 -F 256 " + infile
    seen_reads = 0
    if not (no_base_cov_flag):
        for line in runCmd(command=command):
            seen_reads += 1
            match = line.split()
            try:
                base_cov_dict[match[2]].append(sum([int(matching) for matching in cigar_match_re.findall(match[5])]))
                read_cov_dict[match[2]] += 1
            except:
                print BtLog.warn_d['2'] % (match[2], infile)
            BtLog.progress(seen_reads, progress_unit, reads_mapped)
    else:
        for line in runCmd(command=command):
            seen_reads += 1
            match = line.split()
            try:
                read_cov_dict[match[2]] += 1
            except:
                print BtLog.warn_d['2'] % (match[2], infile)
            BtLog.progress(seen_reads, progress_unit, reads_mapped)
    if not int(reads_mapped) == int(seen_reads):
        print BtLog.warn_d['3'] % (reads_mapped, seen_reads)
    base_cov_dict = {seq_name: sum(base_covs) for seq_name, base_covs in base_cov_dict.items()}
    return base_cov_dict, reads_total, reads_mapped, read_cov_dict
def parseNodesDB(**kwargs):
    '''
    Parses names.dmp and nodes.dmp into the nodesDB dict of dicts, which is
    written to the default nodesDB file (blobtools/data/nodesDB.txt) if that
    file does not exist yet. The default nodesDB is used if neither
    "--names"/"--nodes" nor "--db" is specified.
    '''
    nodesDB = {}
    names_f = kwargs['names']
    nodes_f = kwargs['nodes']
    nodesDB_f = kwargs['nodesDB']
    nodesDB_default = kwargs['nodesDBdefault']
    if (nodes_f and names_f):
        if not isfile(names_f):
            BtLog.error('0', names_f)
        if not isfile(nodes_f):
            BtLog.error('0', nodes_f)
        print BtLog.status_d['3'] % (nodes_f, names_f)
        try:
            nodesDB = readNamesNodes(names_f, nodes_f)
        except:
            BtLog.error('3', nodes_f, names_f)
    elif (nodesDB_f):
        if not isfile(nodesDB_f):
            BtLog.error('0', nodesDB_f)
        print BtLog.status_d['4'] % (nodesDB_f)
        try:
            nodesDB = readNodesDB(nodesDB_f)
        except:
            BtLog.error('27', nodesDB_f)
    elif (nodesDB_default):
        if not isfile(nodesDB_default):
            BtLog.error('28')
        print BtLog.status_d['4'] % (nodesDB_default)
        try:
            nodesDB = readNodesDB(nodesDB_default)
        except:
            BtLog.error('27', nodesDB_default)
        nodesDB_f = nodesDB_default
    # Write nodesDB if not available
    if not isfile(nodesDB_default):
        writeNodesDB(nodesDB, nodesDB_default)
    return nodesDB, nodesDB_f
def parseBamForFilter(infile, outfile, include, exclude, gzip, do_sort, keep_sorted, sort_threads):
    '''
    checkBam returns reads_total and reads_mapped
    parse BAM to extract readpairs
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    if do_sort:
        # sort by read name (-n) so that mates are adjacent
        command = 'samtools sort -@ %s -n -O bam -T temp -o %s.readsorted.bam %s' % (sort_threads, infile, infile)
        runCmd(command=command, wait=True)
        infile = "%s.readsorted.bam" % infile

    reads_total, reads_mapped = checkBam(infile)
    progress_unit = max(int(reads_mapped / 1000), 1)  # avoid modulo-by-zero for small BAMs
    # -f 1 keeps paired reads; -F 1024/-F 256/-F 2048 exclude duplicates,
    # secondary and supplementary alignments
    command = "samtools view -f 1 -F 1024 -F 256 -F 2048 %s" % infile
    seen_reads = 0
    read_pair_count, read_pair_seqs, read_pair_out_fs = init_read_pairs(outfile, include, exclude)
    read_pair_out_fhs = []
    used_fhs = {}
    iterator = runCmd(command=command)
    read_pair_type = None
    if include:
        sequence_to_type_dict = defaultdict(lambda: 'Ex')
        for incl in include:
            sequence_to_type_dict[incl] = 'In'
        sequence_to_type_dict['*'] = 'Un'
    elif exclude:
        sequence_to_type_dict = defaultdict(lambda: 'In')
        for excl in exclude:
            sequence_to_type_dict[excl] = 'Ex'
        sequence_to_type_dict['*'] = 'Un'
    else:
        sequence_to_type_dict = defaultdict(lambda: 'In')
        sequence_to_type_dict['*'] = 'Un'
    for l in iterator:
        read1 = l.split()
        try:
            seen_reads += 2
            read2 = next(iterator).split()
            read_pair_type = "".join(sorted([sequence_to_type_dict[read1[2]], sequence_to_type_dict[read2[2]]]))
            print_bam(read_pair_out_fs, read_pair_type, read1, read2)
            read_pair_seqs[read_pair_type] += get_read_pair_seqs(read1, read2)
            read_pair_count[read_pair_type] += 1
            BtLog.progress(seen_reads, progress_unit, reads_total)
            if seen_reads % progress_unit == 0:
                used_fhs = write_read_pair_seqs(used_fhs, read_pair_out_fs, read_pair_seqs)
                read_pair_seqs = {read_pair_type: tuple() for read_pair_type in read_pair_count}
        except StopIteration:
            print BtLog.warn_d['11']
    used_fhs = write_read_pair_seqs(used_fhs, read_pair_out_fs, read_pair_seqs)
    close_fhs(used_fhs)

    # info log
    info_string = []
    info_string.append(('Total pairs', "{:,}".format(int(seen_reads / 2)), '{0:.1%}'.format(1.00)))
    for read_pair_type, count in read_pair_count.items():
        info_string.append((read_pair_type + ' pairs', "{:,}".format(count), '{0:.1%}'.format(count / int(seen_reads / 2))))
    info_out_f = getOutFile(outfile, None, "info.txt")
    with open(info_out_f, 'w') as info_fh:
        print BtLog.status_d['24'] % info_out_f
        info_fh.write(get_table(info_string))

    # gzip
    if gzip:
        if not which('gzip'):
            BtLog.error('43')
        for out_f in used_fhs:
            print BtLog.status_d['25'] % out_f
            runCmd(command="gzip -f " + out_f, wait=True)
    if not int(reads_total) == int(seen_reads):
        print BtLog.warn_d['3'] % (reads_total, seen_reads)
    if do_sort and not keep_sorted:
        os.remove(infile)
    return 1
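# Pair-type classification sketch: each mate is labelled 'In', 'Ex' or 'Un'
# (reference name '*' = unmapped) and the sorted labels are joined, so every
# pair maps to one canonical key regardless of mate order:
# >>> "".join(sorted(['In', 'Ex']))
# 'ExIn'
# >>> "".join(sorted(['Un', 'In']))
# 'InUn'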