def main():
    """Entry point: assemble a BlobDb from the command-line inputs and dump it as JSON.

    Reads the docopt arguments, builds coverage- and hit-library objects,
    parses the FASTA, the nodes database, the hits and the coverage, and
    finally writes the serialised BlobDb to the output file.
    """
    opts = docopt(__doc__)
    fasta_f = opts['--infile']
    fasta_type = opts['--type']
    sam_fs = opts['--sam']
    bam_fs = opts['--bam']
    cov_fs = opts['--cov']
    cas_fs = opts['--cas']
    hit_fs = opts['--hitsfile']
    prefix = opts['--out']
    nodesDB_f = opts['--db']
    names_f = opts['--names']
    nodes_f = opts['--nodes']
    taxrules = opts['--taxrule']
    min_bitscore_diff = float(opts['--min_diff'])
    tax_collision_random = opts['--tax_collision_random']
    title = opts['--title']

    # Output file; it doubles as the title when none was supplied.
    out_f = BtIO.getOutFile("blobDB", prefix, "json")
    if not title:
        title = out_f

    # At least one source of coverage is required.
    if not (fasta_type or bam_fs or sam_fs or cov_fs or cas_fs):
        BtLog.error('1')

    # Coverage libraries, named <format><index>, in the order bam, sam, cas, cov.
    cov_libs = []
    for fmt, lib_fs in (('bam', bam_fs), ('sam', sam_fs), ('cas', cas_fs), ('cov', cov_fs)):
        for idx, lib_f in enumerate(lib_fs):
            cov_libs.append(BtCore.CovLibObj(fmt + str(idx), fmt, lib_f))

    # Taxonomy (hit) libraries.
    hit_libs = []
    for idx, lib_f in enumerate(hit_fs):
        hit_libs.append(BtCore.HitLibObj('tax' + str(idx), 'tax', lib_f))

    # Create the BlobDb object and parse the assembly.
    blobDb = BtCore.BlobDb(title)
    blobDb.version = blobtools.__version__
    blobDb.parseFasta(fasta_f, fasta_type)

    # Parse nodesDB OR names.dmp/nodes.dmp.
    nodesDB_default = join(blobtools.DATADIR, "nodesDB.txt")
    nodesDB, nodesDB_f = BtIO.parseNodesDB(
        nodes=nodes_f,
        names=names_f,
        nodesDB=nodesDB_f,
        nodesDBdefault=nodesDB_default)
    blobDb.nodesDB_f = nodesDB_f

    # Similarity hits are optional; without them only a warning is printed.
    if not hit_libs:
        print(BtLog.warn_d['0'])
    else:
        blobDb.parseHits(hit_libs)
        blobDb.computeTaxonomy(taxrules, nodesDB, min_bitscore_diff, tax_collision_random)

    # Coverage.
    blobDb.parseCoverage(covLibObjs=cov_libs, no_base_cov=None)

    # Serialise the BlobDb.
    print(BtLog.status_d['7'] % out_f)
    BtIO.writeJson(blobDb.dump(), out_f)
def parseFasta(self, fasta_f, fasta_type):
    """Read the assembly FASTA and register one BlObj per sequence.

    Updates the sequence/length/N counters, keeps insertion order in
    ``order_of_blobs`` and, when ``fasta_type`` is set, takes the coverage
    from each sequence header. A repeated sequence name is reported through
    ``BtLog.error('5', ...)``; an empty assembly through ``BtLog.error('1')``.
    """
    print(BtLog.status_d['1'] % ('FASTA', fasta_f))
    self.assembly_f = abspath(fasta_f)
    if fasta_type:
        # Coverage library representing the coverage stored in the headers.
        self.covLibs[fasta_type] = CovLibObj(fasta_type, fasta_type, fasta_f)

    for header, sequence in BtIO.readFasta(fasta_f):
        blob = BlObj(header, sequence)
        if blob.name in self.dict_of_blobs:
            BtLog.error('5', blob.name)
            continue
        self.seqs += 1
        self.length += blob.length
        self.n_count += blob.n_count
        if fasta_type:
            header_cov = BtIO.parseCovFromHeader(fasta_type, blob.name)
            self.covLibs[fasta_type].cov_sum += header_cov
            blob.addCov(fasta_type, header_cov)
        self.order_of_blobs.append(blob.name)
        self.dict_of_blobs[blob.name] = blob

    if self.seqs == 0 or self.length == 0:
        BtLog.error('1')
def parseFasta(self, fasta_f, fasta_type):
    """Load every sequence of ``fasta_f`` into this BlobDb.

    Each record becomes a BlObj stored in ``dict_of_blobs`` (order kept in
    ``order_of_blobs``); totals for sequences, length and N count are
    accumulated. With a truthy ``fasta_type`` the per-sequence coverage is
    parsed from the header and summed on the matching coverage library.
    """
    print(BtLog.status_d["1"] % ("FASTA", fasta_f))
    self.assembly_f = abspath(fasta_f)
    coverage_in_header = bool(fasta_type)
    if coverage_in_header:
        # Set up the CovLibObj for coverage carried in the assembly headers.
        self.covLibs[fasta_type] = CovLibObj(fasta_type, fasta_type, fasta_f)

    for seq_name, seq in BtIO.readFasta(fasta_f):
        record = BlObj(seq_name, seq)
        if record.name not in self.dict_of_blobs:
            self.seqs += 1
            self.length += record.length
            self.n_count += record.n_count
            if coverage_in_header:
                cov_value = BtIO.parseCovFromHeader(fasta_type, record.name)
                self.covLibs[fasta_type].cov_sum += cov_value
                record.addCov(fasta_type, cov_value)
            self.order_of_blobs.append(record.name)
            self.dict_of_blobs[record.name] = record
        else:
            # Duplicate sequence name.
            BtLog.error("5", record.name)

    if self.length == 0 or self.seqs == 0:
        BtLog.error("1")
def parseCoverage(self, **kwargs):
    """Parse every coverage library and attach its coverage to the blobs.

    Keyword arguments:
        covLibObjs: iterable of coverage-library objects (``name``, ``f``, ``fmt``).
        no_base_cov: forwarded unchanged to the BAM/SAM parsers.
        prefix: optional; forwarded to ``BtIO.getOutFile`` for the COV files.

    For 'bam', 'sam' and 'cas' libraries a COV file is written afterwards;
    'cov' libraries are only read. Any other format is ignored. Each library's
    ``mean_cov`` is its coverage sum divided by the number of sequences.
    """
    cov_lib_objs = kwargs['covLibObjs']
    no_base_cov = kwargs['no_base_cov']

    def write_cov_file(lib):
        # Create COV file for future use.
        out_f = BtIO.getOutFile(lib.f, kwargs.get('prefix', None), None)
        cov_view = ViewObj(name="covlib", out_f=out_f, suffix="cov", header="", body=[])
        self.view(
            viewObjs=[cov_view],
            ranks=None,
            taxrule=None,
            hits_flag=None,
            seqs=None,
            cov_libs=[lib.name],
            progressbar=False)

    for cov_lib in cov_lib_objs:
        self.addCovLib(cov_lib)
        print(BtLog.status_d['1'] % (cov_lib.name, cov_lib.f))
        fmt = cov_lib.fmt

        if fmt in ('bam', 'sam'):
            read_alignments = BtIO.parseBam if fmt == 'bam' else BtIO.parseSam
            base_cov_dict, cov_lib.reads_total, cov_lib.reads_mapped, read_cov_dict = \
                read_alignments(cov_lib.f, set(self.dict_of_blobs), no_base_cov)
            if cov_lib.reads_total == 0:
                print(BtLog.warn_d['4'] % cov_lib.f)
            for seq_name, base_cov in base_cov_dict.items():
                blob = self.dict_of_blobs[seq_name]
                # NOTE(review): no guard for agct_count == 0 here - confirm
                # whether all-N sequences can reach this point.
                seq_cov = base_cov / blob.agct_count
                cov_lib.cov_sum += seq_cov
                blob.addCov(cov_lib.name, seq_cov)
                blob.addReadCov(cov_lib.name, read_cov_dict[seq_name])
            write_cov_file(cov_lib)

        elif fmt == 'cas':
            cov_dict, cov_lib.reads_total, cov_lib.reads_mapped, read_cov_dict = \
                BtIO.parseCas(cov_lib.f, self.order_of_blobs)
            if cov_lib.reads_total == 0:
                print(BtLog.warn_d['4'] % cov_lib.f)
            for seq_name, seq_cov in cov_dict.items():
                blob = self.dict_of_blobs[seq_name]
                cov_lib.cov_sum += seq_cov
                blob.addCov(cov_lib.name, seq_cov)
                blob.addReadCov(cov_lib.name, read_cov_dict[seq_name])
            write_cov_file(cov_lib)

        elif fmt == 'cov':
            (base_cov_dict,
             cov_lib.reads_total,
             cov_lib.reads_mapped,
             cov_lib.reads_unmapped,
             read_cov_dict) = BtIO.parseCov(cov_lib.f, set(self.dict_of_blobs))
            # Warn when the file does not cover every sequence of the assembly.
            if len(base_cov_dict) != self.seqs:
                print(BtLog.warn_d['4'] % cov_lib.f)
            for seq_name, seq_cov in base_cov_dict.items():
                blob = self.dict_of_blobs[seq_name]
                cov_lib.cov_sum += seq_cov
                blob.addCov(cov_lib.name, seq_cov)
                if seq_name in read_cov_dict:
                    blob.addReadCov(cov_lib.name, read_cov_dict[seq_name])

        cov_lib.mean_cov = cov_lib.cov_sum / self.seqs
        if cov_lib.cov_sum == 0.0:
            print(BtLog.warn_d['6'] % (cov_lib.name))
        self.covLibs[cov_lib.name] = cov_lib
def __init__(self, name='experimental', view_dir=''):
    """Set up empty containers for the view and create its output directory.

    ``view_dir`` is handed to ``BtIO.create_dir`` once all attributes are set.
    """
    self.name, self.view_dir = name, view_dir
    # List-valued fields.
    self.length, self.gc, self.names = [], [], []
    # Dict-valued fields.
    self.tax, self.covs, self.meta = {}, {}, {}
    BtIO.create_dir(self.view_dir)
def __init__(self, name="experimental", view_dir=""):
    """Initialise an empty view and make sure its directory exists.

    The list attributes (``length``, ``gc``, ``names``) and dict attributes
    (``tax``, ``covs``, ``meta``) start empty; ``BtIO.create_dir`` is then
    called with ``view_dir``.
    """
    self.name = name
    self.view_dir = view_dir
    for list_attr in ("length", "gc", "names"):
        setattr(self, list_attr, [])
    for dict_attr in ("tax", "covs", "meta"):
        setattr(self, dict_attr, {})
    BtIO.create_dir(self.view_dir)
def main():
    """Entry point: filter a BAM file by an include list or an exclude list.

    ``--include`` and ``--exclude`` are mutually exclusive; supplying both is
    reported through ``BtLog.error('43')`` and nothing is filtered. With
    neither option the BAM is processed without a sequence list.
    """
    args = docopt(__doc__)
    bam_f = args['--bam']
    include_f = args['--include']
    exclude_f = args['--exclude']
    out_prefix = args['--out']
    gzip = args['--gzip']
    do_sort = args['--sort']
    keep_sorted = args['--keep']
    sort_threads = int(args['--threads'])

    print(BtLog.status_d['22'] % bam_f)
    out_f = BtIO.getOutFile(bam_f, out_prefix, None)

    if include_f and exclude_f:
        # Call the error helper directly (as every other caller does) instead
        # of printing its return value.
        BtLog.error('43')
        return

    # At most one of the two lists is set here; the other stays None.
    include_list = BtIO.parseList(include_f) if include_f else None
    exclude_list = BtIO.parseList(exclude_f) if exclude_f else None
    BtIO.parseBamForFilter(
        bam_f, out_f, include_list, exclude_list,
        gzip, do_sort, keep_sorted, sort_threads)
def parse_data(self, key, exp_count):
    """Load the JSON list registered under ``key`` and store it as an attribute.

    The list read from ``self.files[key]`` is assigned to ``self.<key>`` only
    when it holds exactly ``exp_count`` entries.
    """
    records = BtIO.read_json_list(self.files[key])
    if len(records) == exp_count:
        setattr(self, key, records)
    # NOTE(review): a length mismatch is silently ignored (the original code
    # only carried an "# error" placeholder) - decide how it should be reported.
def _write_output(self):
    """Write every registered data set to its JSON file and bundle them in a tarball.

    ``self.files`` maps a key either to a file path (primary keys) or to a
    dict of ``sub_key -> file path`` (secondary keys); the matching data is
    read from the attribute of the same name. Keys in neither group are
    skipped. Nothing is written when ``BtIO.create_dir`` returns a falsy value.

    Fixes over the previous version, which could not run:
      * paths/data were appended to the undefined names ``out_f``/``data``
        instead of the ``out_fs``/``strings`` lists that are zipped later;
      * the archive was opened on an undefined path with mode ``"a:gz"``,
        which tarfile rejects (append is not available with compression);
      * each JSON file is now closed before it is added to the archive.

    NOTE(review): the archive location was never defined in the original;
    ``<self.path>.tar.gz`` is an assumption - confirm the intended name.
    """
    primary = [
        'meta', 'blob_id', 'length', 'gc', 'n_count', 'agct_count', 'tax_id'
    ]
    secondary = ['cov_base', 'cov_read', 'tax', 'tax_hit']

    directory = BtIO.create_dir(self.path)
    if not directory:
        return

    out_fs = []
    strings = []
    for key in self.files:
        if key in primary:
            out_fs.append(self.files[key])
            strings.append(getattr(self, key))
        elif key in secondary:
            for key2 in self.files[key]:
                out_fs.append(self.files[key][key2])
                strings.append(getattr(self, key)[key2])

    archive_f = "%s.tar.gz" % str(self.path).rstrip("/")
    with tarfile.open(archive_f, "w:gz") as tar:
        for out_f, string in zip(out_fs, strings):
            with open(out_f, 'w') as fh:
                json.dump(string, fh, indent=1, separators=(',', ' : '))
            # Add only after the file is closed so the full content is archived.
            tar.add(out_f)
def parse_meta(self, meta_f):
    """Read the meta file and copy its entries onto this object.

    Every key of the meta dict becomes an attribute of the same name; the
    well-known entries are then assigned explicitly (``count`` is stored as
    ``blobs_count``). A missing well-known key raises ``KeyError``.

    Fix: the generic copy used ``setattr(key, value)``, which lacks the target
    object and raises ``TypeError``; it now sets the attribute on ``self``.
    """
    meta = BtIO.read_meta(meta_f)
    for key, value in meta.items():
        setattr(self, key, value)
    self.name = meta["name"]
    self.covlib = meta["covlib"]
    self.taxrule = meta["taxrule"]
    self.taxlib = meta["taxlib"]
    self.files = meta["files"]
    self.blobs_count = meta["count"]
    self.ranks = meta["ranks"]
def parse_meta(self, meta_f):
    """Load the meta file and expose its content as attributes.

    All entries are copied generically first, then the expected entries are
    assigned by name; ``meta['count']`` ends up in ``self.blobs_count``.
    Raises ``KeyError`` if one of the expected entries is absent.

    Fix: ``setattr(key, value)`` was called without the object to modify
    (a ``TypeError`` at runtime); the attributes are now set on ``self``.
    """
    meta = BtIO.read_meta(meta_f)
    for key, value in meta.items():
        setattr(self, key, value)
    self.name = meta['name']
    self.covlib = meta['covlib']
    self.taxrule = meta['taxrule']
    self.taxlib = meta['taxlib']
    self.files = meta['files']
    self.blobs_count = meta['count']
    self.ranks = meta['ranks']
def parseHits(self, hitLibs):
    """Register each hit library and attach its hits to the matching blobs.

    Only the 'seqID\\ttaxID\\tscore' format is accepted. When a hit lists
    several taxIds separated by ';', the first one is kept and a warning is
    printed. Every taxId seen is collected in ``set_of_taxIds``.
    """
    for hit_lib in hitLibs:
        self.hitLibs[hit_lib.name] = hit_lib
        print(BtLog.status_d['1'] % (hit_lib.name, hit_lib.f))
        for hit in BtIO.readTax(hit_lib.f, set(self.dict_of_blobs)):
            if ";" in hit['taxId']:
                first_tax_id = hit['taxId'].split(";")[0]
                hit['taxId'] = first_tax_id
                # NOTE(review): the library object itself (not its name or
                # file) is formatted into the warning - confirm intent.
                print(BtLog.warn_d['5'] % (hit['name'], hit_lib))
            self.set_of_taxIds.add(hit['taxId'])
            target_blob = self.dict_of_blobs[hit['name']]
            target_blob.addHits(hit_lib.name, hit)
def parseHits(self, hitLibs):
    """Read all hit libraries and store their hits on the blobs.

    Each library is recorded in ``self.hitLibs``. Hits come from
    ``BtIO.readTax`` (format 'seqID\\ttaxID\\tscore'); a taxId field holding
    several ';'-separated ids is reduced to the first, with a warning.
    """
    for library in hitLibs:
        self.hitLibs[library.name] = library
        print(BtLog.status_d["1"] % (library.name, library.f))
        known_names = set(self.dict_of_blobs)
        for record in BtIO.readTax(library.f, known_names):
            if ";" in record["taxId"]:
                # Keep only the first of several taxIds.
                record["taxId"] = record["taxId"].split(";")[0]
                print(BtLog.warn_d["5"] % (record["name"], library))
            self.set_of_taxIds.add(record["taxId"])
            self.dict_of_blobs[record["name"]].addHits(library.name, record)
def main():
    """Entry point: write the FASTA records selected by a list of headers.

    A record is kept when its header is in the list, or - with ``--invert`` -
    when it is not. A summary line and, if any kept header occurs more than
    once, a duplicate warning are printed before the output file is written.

    Changes over the previous version:
      * an input without sequences no longer raises ``ZeroDivisionError``
        (the reported fraction is 0 in that case);
      * the fraction is computed with explicit float division, so it does not
        depend on ``from __future__ import division`` being active;
      * duplicates are found with a single ``Counter`` pass instead of calling
        ``list.count`` once per kept header.
    """
    from collections import Counter

    args = docopt(__doc__)
    fasta_f = args['--infile']
    list_f = args['--list']
    invert = args['--invert']
    prefix = args['--out']

    output = []
    out_f = BtIO.getOutFile(fasta_f, prefix, "filtered.fna")

    print(BtLog.status_d['1'] % ("list", list_f))
    items = BtIO.parseSet(list_f)
    items_count = len(items)

    print(BtLog.status_d['22'] % fasta_f)
    items_parsed = []
    sequences = 0
    for header, sequence in BtIO.readFasta(fasta_f):
        sequences += 1
        # Keep listed headers normally, unlisted headers when inverting.
        if (header in items) != bool(invert):
            items_parsed.append(header)
            output.append(">%s\n%s\n" % (header, sequence))
        BtLog.progress(len(output), 10, items_count, no_limit=True)
    BtLog.progress(items_count, 10, items_count)

    items_parsed_count = len(items_parsed)
    fraction_kept = items_parsed_count / float(sequences) if sequences else 0.0
    print(BtLog.status_d['23'] % (
        '{:.2%}'.format(fraction_kept),
        "{:,}".format(items_count),
        "{:,}".format(items_parsed_count),
        "{:,}".format(sequences)))

    # Headers that were written more than once.
    duplicated = [name for name, seen in Counter(items_parsed).items() if seen > 1]
    if duplicated:
        print(BtLog.warn_d['8'] % "\n\t\t\t".join(duplicated))

    with open(out_f, "w") as fh:
        print(BtLog.status_d['24'] % out_f)
        fh.write("".join(output))
def _write_output(self):
    """Dump all registered data sets as JSON files and archive them.

    Primary keys of ``self.files`` map directly to a file path; secondary keys
    map to a dict of ``sub_key -> file path``. The data comes from the
    attribute named like the key. Other keys are ignored, and nothing is
    written if ``BtIO.create_dir`` returns a falsy value.

    Fixes over the previous version, which raised ``NameError``:
      * collected paths/data now go to ``out_fs``/``strings`` (the lists that
        are zipped below) rather than the undefined ``out_f``/``data``;
      * the archive is opened on a defined path with mode ``"w:gz"``
        (``"a:gz"`` is not a valid tarfile mode);
      * a JSON file is added to the archive only after it has been closed.

    NOTE(review): the archive name ``<self.path>.tar.gz`` is an assumption,
    the original never defined it - confirm.
    """
    primary = ["meta", "blob_id", "length", "gc", "n_count", "agct_count", "tax_id"]
    secondary = ["cov_base", "cov_read", "tax", "tax_hit"]

    directory = BtIO.create_dir(self.path)
    if not directory:
        return

    out_fs = []
    strings = []
    for key in self.files:
        if key in primary:
            out_fs.append(self.files[key])
            strings.append(getattr(self, key))
        elif key in secondary:
            for key2 in self.files[key]:
                out_fs.append(self.files[key][key2])
                strings.append(getattr(self, key)[key2])

    archive_f = "%s.tar.gz" % str(self.path).rstrip("/")
    with tarfile.open(archive_f, "w:gz") as tar:
        for out_f, string in zip(out_fs, strings):
            with open(out_f, "w") as fh:
                json.dump(string, fh, indent=1, separators=(",", " : "))
            # The file is closed (flushed) at this point.
            tar.add(out_f)
def main():
    """Entry point: load a BlobDb and write the requested views.

    Validates the input file, the ranks and the optional sequence list, loads
    the BlobDb, then builds the table, experimental and concoct views as
    requested. With ``--cov`` one COV view per coverage library is written
    immediately; the remaining views are written in a single pass at the end.
    """
    opts = docopt(__doc__)
    blobdb_f = opts['--input']
    prefix = opts['--out']
    ranks = opts['--rank']
    taxrule = opts['--taxrule']
    hits_flag = opts['--hits']
    seq_list_f = opts['--list']
    concoct = opts['--concoct']
    cov = opts['--cov']
    notable = opts['--notable']
    experimental = opts['--experimental']

    # Does blobdb_f exist?
    if not isfile(blobdb_f):
        BtLog.error('0', blobdb_f)
    out_f = BtIO.getOutFile(blobdb_f, prefix, None)

    # Are ranks sane? 'all' expands to every rank but the last, reversed.
    if 'all' in ranks:
        ranks = RANKS[0:-1][::-1]
    else:
        for rank in ranks:
            if rank not in RANKS:
                BtLog.error('9', rank)

    # Does the sequence-list file exist?
    seqs = []
    if seq_list_f:
        if not isfile(seq_list_f):
            BtLog.error('0', seq_list_f)
        else:
            seqs = BtIO.parseList(seq_list_f)

    # Load BlobDb.
    blobDb = BtCore.BlobDb('new')
    print(BtLog.status_d['9'] % (blobdb_f))
    blobDb.load(blobdb_f)
    blobDb.version = blobtools.__version__

    # Is taxrule sane and was it computed?
    if blobDb.hitLibs and taxrule not in blobDb.taxrules:
        BtLog.error('11', taxrule, blobDb.taxrules)

    # Views.
    viewObjs = []
    print(BtLog.status_d['14'])
    if not notable:
        viewObjs.append(
            BtCore.ViewObj(name="table", out_f=out_f, suffix="table.txt", body=[]))
    if experimental:
        viewObjs.append(
            BtCore.ExperimentalViewObj(name="experimental", view_dir=out_f))
    if concoct:
        viewObjs.append(
            BtCore.ViewObj(name="concoct_tax", out_f=out_f,
                           suffix="concoct_taxonomy_info.csv", body=dict()))
        viewObjs.append(
            BtCore.ViewObj(name="concoct_cov", out_f=out_f,
                           suffix="concoct_coverage_info.tsv", body=[]))
    if cov:
        # One COV file per coverage library, written right away.
        for cov_lib_name, cov_lib_info in blobDb.covLibs.items():
            out_f = BtIO.getOutFile(cov_lib_info['f'], prefix, None)
            cov_view = BtCore.ViewObj(name="covlib", out_f=out_f, suffix="cov", body=[])
            blobDb.view(viewObjs=[cov_view], ranks=None, taxrule=None, hits_flag=None,
                        seqs=None, cov_libs=[cov_lib_name], progressbar=True)
    if viewObjs:
        blobDb.view(viewObjs=viewObjs, ranks=ranks, taxrule=taxrule, hits_flag=hits_flag,
                    seqs=seqs, cov_libs=[], progressbar=True)
    print(BtLog.status_d['19'])
def main():
    """Entry point: add or replace taxIDs in a BLAST or Diamond hits file.

    Exactly one of ``--blast``/``--diamond`` must be given. The taxID for a
    hit comes either from a fixed ``--taxid`` or from exactly one mapping file
    (``--uniref``, ``--rnacentral``, ``--swissprot``). BLAST hits that already
    carry a taxID are left untouched unless ``--force`` is set. The result is
    written as ``query<TAB>taxID<TAB>bitscore<TAB>rest``.

    Fixes over the previous version:
      * a non-numeric ``--taxid`` raises ``ValueError`` in ``int()``, which
        was not caught (only ``TypeError`` was); both are now reported
        through ``BtLog.error('26')``;
      * the warning for a subject missing from the mapping was formatted and
        then discarded; it is now printed.
    """
    args = docopt(__doc__)
    blast_f = args['--blast']
    diamond_f = args['--diamond']
    uniref_f = args['--uniref']
    rnacentral_f = args['--rnacentral']
    swissprot_f = args['--swissprot']
    taxid = args['--taxid']
    force = args['--force']
    prefix = args['--out']

    out_f, hit_f, map_f, taxid_d = None, None, None, {}

    # Check that blast_f OR diamond_f is specified (not both, not neither).
    if not (bool(blast_f) + bool(diamond_f) == 1):
        BtLog.error('26')
    elif blast_f:
        hit_f = blast_f
    elif diamond_f:
        hit_f = diamond_f

    # Check whether a taxID or a mapping file is supplied.
    if taxid:
        try:
            taxid = int(taxid)
        except (TypeError, ValueError):
            BtLog.error('26')
        out_f = BtIO.getOutFile(hit_f, prefix, "tax_%s.out" % taxid)
        # Every subject maps to the same, fixed taxID.
        taxid_d = defaultdict(lambda: taxid)
    elif bool(uniref_f) + bool(rnacentral_f) + bool(swissprot_f) == 1:
        # Exactly one mapping file is set; pick its taxID column and suffix.
        if uniref_f:
            map_f, taxid_column, suffix = uniref_f, 1, "uniref.out"
        elif rnacentral_f:
            map_f, taxid_column, suffix = rnacentral_f, 3, "rnacentral.out"
        else:
            map_f, taxid_column, suffix = swissprot_f, 1, "swissprot.out"
        print(BtLog.status_d['1'] % ("ID-to-taxID Mapping file", map_f))
        taxid_d = BtIO.parseDict(map_f, 0, taxid_column)
        out_f = BtIO.getOutFile(hit_f, prefix, suffix)
    else:
        BtLog.error('41')

    output = []
    print(BtLog.status_d['1'] % ("hits file", hit_f))
    with open(hit_f) as fh:
        for idx, l in enumerate(fh):
            bitscore, tax_id, subject_id, rest = None, None, None, None
            line = l.rstrip("\n").split()
            query_id = line[0]
            if blast_f:
                bitscore = line[2]
                tax_id = line[1]
                subject_id = line[4]
                rest = "\t".join(line[2:])
            elif diamond_f:
                bitscore = line[11]
                subject_id = line[1]
                rest = "\t".join(line[1:])
            if swissprot_f:
                # Subject ids are '|'-separated; the second field is the key.
                subject_id = subject_id.split("|")[1]
            if blast_f and not tax_id == "N/A" and not force:
                # Do not overwrite an existing taxID.
                print(BtLog.warn_d['10'] % (idx + 1, line[0], line[1]))
            else:
                try:
                    tax_id = taxid_d[subject_id]
                except KeyError:
                    print(BtLog.warn_d['12'] % (subject_id, map_f))
                    tax_id = "N/A"
            output.append("%s\t%s\t%s\t%s" % (query_id, tax_id, bitscore, rest))

    if output:
        with open(out_f, "w") as fh:
            print(BtLog.status_d['24'] % out_f)
            fh.write("\n".join(output))
def main():
    """Entry point: generate table/experimental/concoct/COV views of a BlobDb.

    After checking the input file, the ranks and the optional sequence list,
    the BlobDb is loaded and the selected view objects are created. COV views
    (``--cov``) are written per coverage library as they are created; all
    other views are written together in one ``view`` call.
    """
    cli = docopt(__doc__)
    blobdb_f = cli['--input']
    prefix = cli['--out']
    ranks = cli['--rank']
    taxrule = cli['--taxrule']
    hits_flag = cli['--hits']
    seq_list_f = cli['--list']
    concoct = cli['--concoct']
    cov = cli['--cov']
    notable = cli['--notable']
    experimental = cli['--experimental']

    # The BlobDb file must exist.
    if not isfile(blobdb_f):
        BtLog.error('0', blobdb_f)
    out_f = BtIO.getOutFile(blobdb_f, prefix, None)

    # Rank validation; 'all' selects RANKS without its last entry, reversed.
    if 'all' not in ranks:
        for rank in ranks:
            if rank not in RANKS:
                BtLog.error('9', rank)
    else:
        all_but_last = RANKS[0:-1]
        ranks = all_but_last[::-1]

    # Optional list of sequences to restrict the views to.
    seqs = []
    if seq_list_f:
        if isfile(seq_list_f):
            seqs = BtIO.parseList(seq_list_f)
        else:
            BtLog.error('0', seq_list_f)

    # Load the BlobDb.
    blobDb = BtCore.BlobDb('new')
    print(BtLog.status_d['9'] % (blobdb_f))
    blobDb.load(blobdb_f)
    blobDb.version = blobtools.__version__

    # The taxrule must have been computed when hits are present.
    if blobDb.hitLibs and taxrule not in blobDb.taxrules:
        BtLog.error('11', taxrule, blobDb.taxrules)

    # Collect the views to write.
    print(BtLog.status_d['14'])
    viewObjs = []
    if not notable:
        table_view = BtCore.ViewObj(
            name="table", out_f=out_f, suffix="table.txt", body=[])
        viewObjs.append(table_view)
    if experimental:
        experimental_view = BtCore.ExperimentalViewObj(
            name="experimental", view_dir=out_f)
        viewObjs.append(experimental_view)
    if concoct:
        concoct_tax_view = BtCore.ViewObj(
            name="concoct_tax", out_f=out_f,
            suffix="concoct_taxonomy_info.csv", body=dict())
        viewObjs.append(concoct_tax_view)
        concoct_cov_view = BtCore.ViewObj(
            name="concoct_cov", out_f=out_f,
            suffix="concoct_coverage_info.tsv", body=[])
        viewObjs.append(concoct_cov_view)
    if cov:
        for lib_name, lib_info in blobDb.covLibs.items():
            out_f = BtIO.getOutFile(lib_info['f'], prefix, None)
            cov_view = BtCore.ViewObj(name="covlib", out_f=out_f, suffix="cov", body=[])
            blobDb.view(
                viewObjs=[cov_view], ranks=None, taxrule=None, hits_flag=None,
                seqs=None, cov_libs=[lib_name], progressbar=True)
    if viewObjs:
        blobDb.view(
            viewObjs=viewObjs, ranks=ranks, taxrule=taxrule, hits_flag=hits_flag,
            seqs=seqs, cov_libs=[], progressbar=True)
    print(BtLog.status_d['19'])
def main():
    """Entry point: draw coverage-vs-coverage plots (covplot) for a BlobDb.

    The y-axis coverage is read from the ``--cov`` file; one scatter plot is
    produced per coverage library of the plot object unless ``--noblobs`` is
    set. The output name encodes the plot settings, and the statistics are
    written once after the loop using the last output name.
    """
    args = docopt(__doc__)
    args = BtPlot.check_input(args)
    blobdb_f = args['--infile']
    cov_f = args['--cov']
    rank = args['--rank']
    min_length = int(args['--length'])
    max_group_plot = int(args['--plotgroups'])
    hide_nohits = args['--nohit']
    taxrule = args['--taxrule']
    c_index = args['--cindex']
    exclude_groups = args['--exclude']
    labels = args['--label']
    colour_f = args['--colours']
    refcov_f = args['--refcov']
    catcolour_f = args['--catcolour']
    multiplot = args['--multiplot']
    out_prefix = args['--out']
    sort_order = args['--sort']
    hist_type = args['--hist']
    no_title = args['--notitle']
    ignore_contig_length = args['--noscale']
    format = args['--format']
    no_plot_blobs = args['--noblobs']
    no_plot_reads = args['--noreads']
    legend_flag = args['--legend']
    cumulative_flag = args['--cumulative']
    cov_lib_selection = args['--lib']
    xlabel = args['--xlabel']
    ylabel = args['--ylabel']
    axis_max = float(args['--max'])

    # Parse the auxiliary command-line inputs.
    exclude_groups = BtIO.parseCmdlist(exclude_groups)
    refcov_dict = BtIO.parseReferenceCov(refcov_f)
    user_labels = BtIO.parseCmdLabels(labels)
    catcolour_dict = BtIO.parseCatColour(catcolour_f)
    colour_dict = BtIO.parseColours(colour_f)

    # Load BlobDb.
    print(BtLog.status_d['9'] % blobdb_f)
    blobDb = Bt.BlobDb('blobplot')
    blobDb.version = blobtools.__version__
    blobDb.load(blobdb_f)

    # Generate plot data.
    print(BtLog.status_d['18'])
    data_dict, min_cov, max_cov, cov_lib_dict = blobDb.getPlotData(
        rank, min_length, hide_nohits, taxrule, c_index, catcolour_dict)
    plotObj = BtPlot.PlotObj(data_dict, cov_lib_dict, cov_lib_selection, 'covplot')
    (plotObj.cov_y_dict,
     reads_total,
     reads_mapped,
     reads_unmapped,
     read_cov_dict) = BtIO.parseCov(cov_f, set(blobDb.dict_of_blobs))

    # Plot settings.
    plotObj.exclude_groups = exclude_groups
    plotObj.version = blobDb.version
    plotObj.format = format
    plotObj.max_cov = axis_max
    plotObj.no_title = no_title
    plotObj.multiplot = multiplot
    plotObj.hist_type = hist_type
    plotObj.ignore_contig_length = ignore_contig_length
    plotObj.max_group_plot = max_group_plot
    plotObj.legend_flag = legend_flag
    plotObj.cumulative_flag = cumulative_flag

    # Order by which to plot (should know about user label).
    plotObj.group_order = BtPlot.getSortedGroups(data_dict, sort_order)

    # Labels for each level of stats.
    plotObj.labels.update(plotObj.group_order)
    # plotObj.group_labels holds the labels of each group: all/other/user_label.
    if user_labels:
        for user_label in user_labels.values():
            plotObj.labels.add(user_label)
    plotObj.group_labels = dict((group, set()) for group in plotObj.group_order)
    plotObj.relabel_and_colour(colour_dict, user_labels)
    plotObj.compute_stats()
    plotObj.refcov_dict = refcov_dict

    # Plotting.
    info_flag = 1
    out_f = ''
    for cov_lib in plotObj.cov_libs:
        # Axis labels default to the library file name (x) and the COV file (y).
        plotObj.xlabel = basename(cov_lib_dict[cov_lib]['f'])
        plotObj.ylabel = cov_f
        if ylabel:
            plotObj.ylabel = ylabel
        if xlabel:
            plotObj.xlabel = xlabel

        # Build the dot-separated output name from the active settings.
        name_parts = ["%s.%s.%s.p%s.%s.%s" % (
            blobDb.title, taxrule, rank, max_group_plot, hist_type, min_length)]
        if catcolour_dict:
            name_parts.append("catcolour")
        if ignore_contig_length:
            name_parts.append("noscale")
        if c_index:
            name_parts.append("c_index")
        if exclude_groups:
            name_parts.append("exclude_" + "_".join(exclude_groups))
        if labels:
            name_parts.append(
                "userlabel_" + "_".join(set([name for name in user_labels.values()])))
        name_parts.append("covplot")
        if plotObj.cumulative_flag:
            name_parts.append("cumulative")
        if plotObj.multiplot:
            name_parts.append("multiplot")
        out_f = BtIO.getOutFile(".".join(name_parts), out_prefix, None)

        if not no_plot_blobs:
            plotObj.plotScatter(cov_lib, info_flag, out_f)
            info_flag = 0
    plotObj.write_stats(out_f)
def main():
    """Entry point: create a BlobDb JSON file from assembly, coverage and hits.

    Steps: read the options, set up coverage and hit libraries, parse the
    FASTA and the nodes database, optionally parse hits and compute the
    taxonomy, parse coverage, and write the dumped BlobDb to disk.
    """
    cli = docopt(__doc__)
    fasta_f = cli['--infile']
    fasta_type = cli['--type']
    sam_fs = cli['--sam']
    bam_fs = cli['--bam']
    cov_fs = cli['--cov']
    cas_fs = cli['--cas']
    hit_fs = cli['--hitsfile']
    prefix = cli['--out']
    nodesDB_f = cli['--db']
    names_f = cli['--names']
    nodes_f = cli['--nodes']
    taxrules = cli['--taxrule']
    min_bitscore_diff = float(cli['--min_diff'])
    tax_collision_random = cli['--tax_collision_random']
    title = cli['--title']

    # The output file name is the fallback title.
    out_f = BtIO.getOutFile("blobDB", prefix, "json")
    title = title if title else out_f

    # Some form of coverage input must be present.
    coverage_given = fasta_type or bam_fs or sam_fs or cov_fs or cas_fs
    if not coverage_given:
        BtLog.error('1')

    def make_cov_libs(fmt, lib_fs):
        # One coverage library per file, named '<fmt><index>'.
        return [BtCore.CovLibObj(fmt + str(idx), fmt, lib_f)
                for idx, lib_f in enumerate(lib_fs)]

    cov_libs = (make_cov_libs('bam', bam_fs)
                + make_cov_libs('sam', sam_fs)
                + make_cov_libs('cas', cas_fs)
                + make_cov_libs('cov', cov_fs))

    # Taxonomy libraries, named 'tax<index>'.
    hit_libs = [BtCore.HitLibObj('tax' + str(idx), 'tax', lib_f)
                for idx, lib_f in enumerate(hit_fs)]

    # Create the BlobDb object.
    blobDb = BtCore.BlobDb(title)
    blobDb.version = blobtools.__version__

    # Parse FASTA.
    blobDb.parseFasta(fasta_f, fasta_type)

    # Parse nodesDB OR names.dmp, nodes.dmp.
    nodesDB_default = join(blobtools.DATADIR, "nodesDB.txt")
    nodesDB, nodesDB_f = BtIO.parseNodesDB(
        nodes=nodes_f, names=names_f, nodesDB=nodesDB_f, nodesDBdefault=nodesDB_default)
    blobDb.nodesDB_f = nodesDB_f

    # Parse similarity hits (a warning is printed when there are none).
    if hit_libs:
        blobDb.parseHits(hit_libs)
        blobDb.computeTaxonomy(taxrules, nodesDB, min_bitscore_diff, tax_collision_random)
    else:
        print(BtLog.warn_d['0'])

    # Parse coverage.
    blobDb.parseCoverage(covLibObjs=cov_libs, no_base_cov=None)

    # Generate the BlobDb dump and write it to file.
    print(BtLog.status_d['7'] % out_f)
    BtIO.writeJson(blobDb.dump(), out_f)
def main():
    """Entry point: draw blob plots (and read-proportion bar plots) for a BlobDb.

    For every coverage library of the plot object a scatter plot is drawn
    unless ``--noblobs`` is set, and a bar plot unless ``--noreads`` is set or
    the library has no reads. The output name encodes the plot settings; the
    statistics are written once after the loop using the last output name.
    """
    args = docopt(__doc__)
    args = BtPlot.check_input(args)
    blobdb_f = args['--infile']
    rank = args['--rank']
    min_length = int(args['--length'])
    max_group_plot = int(args['--plotgroups'])
    hide_nohits = args['--nohit']
    taxrule = args['--taxrule']
    c_index = args['--cindex']
    exclude_groups = args['--exclude']
    labels = args['--label']
    colour_f = args['--colours']
    refcov_f = args['--refcov']
    catcolour_f = args['--catcolour']
    multiplot = args['--multiplot']
    out_prefix = args['--out']
    sort_order = args['--sort']
    hist_type = args['--hist']
    no_title = args['--notitle']
    ignore_contig_length = args['--noscale']
    format = args['--format']
    no_plot_blobs = args['--noblobs']
    no_plot_reads = args['--noreads']
    legend_flag = args['--legend']
    cumulative_flag = args['--cumulative']
    cov_lib_selection = args['--lib']
    filelabel = args['--filelabel']

    # Parse the auxiliary command-line inputs.
    exclude_groups = BtIO.parseCmdlist(exclude_groups)
    refcov_dict = BtIO.parseReferenceCov(refcov_f)
    user_labels = BtIO.parseCmdLabels(labels)
    catcolour_dict = BtIO.parseCatColour(catcolour_f)
    colour_dict = BtIO.parseColours(colour_f)

    # Load BlobDb.
    print(BtLog.status_d['9'] % blobdb_f)
    blobDb = BtCore.BlobDb('blobplot')
    blobDb.version = blobtools.__version__
    blobDb.load(blobdb_f)

    # Generate plot data.
    print(BtLog.status_d['18'])
    data_dict, min_cov, max_cov, cov_lib_dict = blobDb.getPlotData(
        rank, min_length, hide_nohits, taxrule, c_index, catcolour_dict)
    plotObj = BtPlot.PlotObj(data_dict, cov_lib_dict, cov_lib_selection, 'blobplot')

    # Plot settings.
    plotObj.exclude_groups = exclude_groups
    plotObj.version = blobDb.version
    plotObj.format = format
    plotObj.max_cov = max_cov
    plotObj.min_cov = min_cov
    plotObj.no_title = no_title
    plotObj.multiplot = multiplot
    plotObj.hist_type = hist_type
    plotObj.ignore_contig_length = ignore_contig_length
    plotObj.max_group_plot = max_group_plot
    plotObj.legend_flag = legend_flag
    plotObj.cumulative_flag = cumulative_flag

    # Order by which to plot (should know about user label).
    plotObj.group_order = BtPlot.getSortedGroups(data_dict, sort_order)

    # Labels for each level of stats.
    plotObj.labels.update(plotObj.group_order)
    # plotObj.group_labels holds the labels of each group: all/other/user_label.
    if user_labels:
        for user_label in user_labels.values():
            plotObj.labels.add(user_label)
    plotObj.group_labels = dict((group, set()) for group in plotObj.group_order)
    plotObj.relabel_and_colour(colour_dict, user_labels)
    plotObj.compute_stats()
    plotObj.refcov_dict = refcov_dict

    # Plotting.
    info_flag = 1
    out_f = ''
    for cov_lib in plotObj.cov_libs:
        plotObj.ylabel = "Coverage"
        plotObj.xlabel = "GC proportion"
        if filelabel:
            plotObj.ylabel = basename(cov_lib_dict[cov_lib]['f'])

        # Build the dot-separated output name from the active settings.
        name_parts = ["%s.%s.%s.p%s.%s.%s" % (
            blobDb.title, taxrule, rank, max_group_plot, hist_type, min_length)]
        if catcolour_dict:
            name_parts.append("catcolour")
        if ignore_contig_length:
            name_parts.append("noscale")
        if c_index:
            name_parts.append("c_index")
        if exclude_groups:
            name_parts.append("exclude_" + "_".join(exclude_groups))
        if labels:
            name_parts.append(
                "userlabel_" + "_".join(set([name for name in user_labels.values()])))
        name_parts.append("blobplot")
        if plotObj.cumulative_flag:
            name_parts.append("cumulative")
        if plotObj.multiplot:
            name_parts.append("multiplot")
        out_f = BtIO.getOutFile(".".join(name_parts), out_prefix, None)

        if not no_plot_blobs:
            plotObj.plotScatter(cov_lib, info_flag, out_f)
            info_flag = 0
        # Prevent plotting if --noreads or total_reads == 0.
        if not no_plot_reads and plotObj.cov_libs_total_reads_dict[cov_lib]:
            plotObj.plotBar(cov_lib, out_f)
    plotObj.write_stats(out_f)
def output(self):
    """Write the view as a set of JSON files below ``self.view_dir``.

    Layout: ``meta.json``, ``gc.json``, ``length.json``, ``names.json``,
    ``covs/<cov_lib>.json`` and ``taxrule/<taxrule>/<rank>.json``.
    """
    # meta
    meta_f = join(self.view_dir, "meta.json")
    BtIO.writeJson(self.get_meta(), meta_f, indent=2)

    # gc, length, names: one flat JSON file each, announced before writing
    for field in ("gc", "length", "names"):
        field_f = join(self.view_dir, "%s.json" % field)
        print(BtLog.status_d["13"] % (field_f))
        BtIO.writeJson(getattr(self, field), field_f, indent=1)

    # cov: one file per coverage library
    cov_d = join(self.view_dir, "covs")
    BtIO.create_dir(directory=cov_d)
    for cov_lib, cov in self.covs.items():
        cov_f = join(cov_d, "%s.json" % cov_lib)
        print(BtLog.status_d["13"] % (cov_f))
        BtIO.writeJson(cov, cov_f, indent=1)

    # tax: one directory per taxrule, one file per rank
    taxrule_d = join(self.view_dir, "taxrule")
    BtIO.create_dir(directory=taxrule_d)
    for taxrule in self.tax:
        tax_d = join(taxrule_d, taxrule)
        BtIO.create_dir(directory=tax_d)
        for rank in self.tax[taxrule]:
            rank_f = join(tax_d, "%s.json" % rank)
            BtIO.writeJson(self.tax[taxrule][rank], rank_f, indent=1)
def load(self, BlobDb_f):
    """Restore this object from a BlobDb JSON file.

    Every top-level entry of the parsed JSON becomes an attribute of the same
    name. ``set_of_taxIds`` is rebuilt from the keys of the ``lineages``
    entry; a file without ``lineages`` raises ``KeyError``.

    Fix: ``set_of_taxIds`` is now a real ``set``. It used to be assigned
    ``dict.keys()``, which has no ``add`` method, although ``parseHits``
    calls ``self.set_of_taxIds.add(...)``.
    """
    blobDict = BtIO.parseJson(BlobDb_f)
    for k, v in blobDict.items():
        setattr(self, k, v)
    self.set_of_taxIds = set(blobDict['lineages'])
def output(self):
    """Serialise the view into JSON files inside ``self.view_dir``.

    Written files: ``meta.json`` (indent 2), then ``gc.json``, ``length.json``
    and ``names.json``, one file per coverage library under ``covs/`` and one
    file per taxrule/rank pair under ``taxrule/<taxrule>/``.
    """
    view_dir = self.view_dir

    # meta
    meta = self.get_meta()
    BtIO.writeJson(meta, join(view_dir, "meta.json"), indent=2)

    def announce_and_write(data, json_f):
        # Report the target file, then write it.
        print(BtLog.status_d['13'] % (json_f))
        BtIO.writeJson(data, json_f, indent=1)

    # gc
    announce_and_write(self.gc, join(view_dir, "gc.json"))
    # length
    announce_and_write(self.length, join(view_dir, "length.json"))
    # names
    announce_and_write(self.names, join(view_dir, "names.json"))

    # cov
    cov_d = join(view_dir, "covs")
    BtIO.create_dir(directory=cov_d)
    for cov_lib, cov in self.covs.items():
        announce_and_write(cov, join(cov_d, "%s.json" % cov_lib))

    # tax (written without a status message)
    taxrule_d = join(view_dir, "taxrule")
    BtIO.create_dir(directory=taxrule_d)
    for taxrule in self.tax:
        tax_d = join(taxrule_d, taxrule)
        BtIO.create_dir(directory=tax_d)
        for rank in self.tax[taxrule]:
            tax = self.tax[taxrule][rank]
            BtIO.writeJson(tax, join(tax_d, "%s.json" % rank), indent=1)
def load(self, BlobDb_f):
    """Populate this object from a BlobDb JSON file.

    All top-level entries of the parsed file are copied onto the object as
    attributes; ``set_of_taxIds`` is then derived from the keys of
    ``lineages`` (``KeyError`` if that entry is missing).

    Fix: ``set_of_taxIds`` used to receive ``dict.keys()``, which does not
    support the ``add`` call that ``parseHits`` performs on this attribute;
    it is now built as a ``set``.
    """
    blobDict = BtIO.parseJson(BlobDb_f)
    for k, v in blobDict.items():
        setattr(self, k, v)
    self.set_of_taxIds = set(blobDict["lineages"])
def main():
    """Entry point: add or replace taxIDs in a BLAST or Diamond hits file.

    Exactly one of ``--blast``/``--diamond`` must be given. TaxIDs come from a
    fixed ``--taxid`` or from exactly one mapping file (``--uniref``,
    ``--rnacentral``, ``--swissprot``). BLAST hits that already carry a taxID
    are kept unless ``--force`` is set. A subject missing from the mapping is
    reported through ``BtLog.error('42', ...)``. Output lines have the form
    ``query<TAB>taxID<TAB>bitscore<TAB>rest``.

    Fix: a non-numeric ``--taxid`` makes ``int()`` raise ``ValueError``, which
    was not caught (only ``TypeError`` was); both now lead to
    ``BtLog.error('26')``.
    """
    args = docopt(__doc__)
    blast_f = args['--blast']
    diamond_f = args['--diamond']
    uniref_f = args['--uniref']
    rnacentral_f = args['--rnacentral']
    swissprot_f = args['--swissprot']
    taxid = args['--taxid']
    force = args['--force']
    prefix = args['--out']

    out_f, hit_f, map_f, taxid_d = None, None, None, {}

    # Check that blast_f OR diamond_f is specified (not both, not neither).
    if not (bool(blast_f) + bool(diamond_f) == 1):
        BtLog.error('26')
    elif blast_f:
        hit_f = blast_f
    elif diamond_f:
        hit_f = diamond_f

    # Check whether a taxID or a mapping file is supplied.
    if taxid:
        try:
            taxid = int(taxid)
        except (TypeError, ValueError):
            BtLog.error('26')
        out_f = BtIO.getOutFile(hit_f, prefix, "tax_%s.out" % taxid)
        # Every subject maps to the same, fixed taxID.
        taxid_d = defaultdict(lambda: taxid)
    elif bool(uniref_f) + bool(rnacentral_f) + bool(swissprot_f) == 1:
        # Exactly one mapping file is set; pick its taxID column and suffix.
        if uniref_f:
            map_f, taxid_column, suffix = uniref_f, 1, "uniref.out"
        elif rnacentral_f:
            map_f, taxid_column, suffix = rnacentral_f, 3, "rnacentral.out"
        else:
            map_f, taxid_column, suffix = swissprot_f, 1, "swissprot.out"
        print(BtLog.status_d['1'] % ("ID-to-taxID Mapping file", map_f))
        taxid_d = BtIO.parseDict(map_f, 0, taxid_column)
        out_f = BtIO.getOutFile(hit_f, prefix, suffix)
    else:
        BtLog.error('41')

    output = []
    print(BtLog.status_d['1'] % ("hits file", hit_f))
    with open(hit_f) as fh:
        for idx, l in enumerate(fh):
            bitscore, tax_id, subject_id, rest = None, None, None, None
            line = l.rstrip("\n").split()
            query_id = line[0]
            if blast_f:
                bitscore = line[2]
                tax_id = line[1]
                # NOTE(review): a sibling variant of this command reads the
                # subject id from column 4 - confirm which column is correct
                # for the expected BLAST output format.
                subject_id = line[3]
                rest = "\t".join(line[2:])
            elif diamond_f:
                bitscore = line[11]
                subject_id = line[1]
                rest = "\t".join(line[1:])
            if swissprot_f:
                # Subject ids are '|'-separated; the second field is the key.
                subject_id = subject_id.split("|")[1]
            if blast_f and not tax_id == "N/A" and not force:
                # Do not overwrite an existing taxID.
                print(BtLog.warn_d['10'] % (idx + 1, line[0], line[1]))
            else:
                try:
                    tax_id = taxid_d[subject_id]
                except KeyError:
                    BtLog.error('42', subject_id, map_f)
                    tax_id = "N/A"
            output.append("%s\t%s\t%s\t%s" % (query_id, tax_id, bitscore, rest))

    if output:
        with open(out_f, "w") as fh:
            print(BtLog.status_d['24'] % out_f)
            fh.write("\n".join(output))
def parseCoverage(self, **kwargs):
    """Read all coverage libraries and store their coverage on the blobs.

    Keyword arguments:
        covLibObjs: iterable of coverage-library objects (``name``, ``f``, ``fmt``).
        no_base_cov: passed through to the BAM/SAM parsers.

    'bam', 'sam' and 'cas' libraries additionally get a COV file written via
    ``self.view``; 'cov' libraries are only read; other formats are skipped.
    ``mean_cov`` of each library is its coverage sum over the sequence count.
    """
    libraries = kwargs["covLibObjs"]
    no_base_cov = kwargs["no_base_cov"]

    def save_cov_view(library):
        # Create COV file for future use.
        out_f = BtIO.getOutFile(library.f, None, None)
        cov_view = ViewObj(name="covlib", out_f=out_f, suffix="cov", header="", body=[])
        self.view(
            viewObjs=[cov_view],
            ranks=None,
            taxrule=None,
            hits_flag=None,
            seqs=None,
            cov_libs=[library.name],
            progressbar=False,
        )

    for library in libraries:
        self.addCovLib(library)
        print(BtLog.status_d["1"] % (library.name, library.f))

        if library.fmt == "bam" or library.fmt == "sam":
            if library.fmt == "bam":
                parsed = BtIO.parseBam(library.f, set(self.dict_of_blobs), no_base_cov)
            else:
                parsed = BtIO.parseSam(library.f, set(self.dict_of_blobs), no_base_cov)
            base_cov_dict, library.reads_total, library.reads_mapped, read_cov_dict = parsed
            if library.reads_total == 0:
                print(BtLog.warn_d["4"] % library.f)
            for name, base_cov in base_cov_dict.items():
                # Base coverage is normalised by the sequence's agct_count.
                cov = base_cov / self.dict_of_blobs[name].agct_count
                library.cov_sum += cov
                self.dict_of_blobs[name].addCov(library.name, cov)
                self.dict_of_blobs[name].addReadCov(library.name, read_cov_dict[name])
            save_cov_view(library)

        elif library.fmt == "cas":
            parsed = BtIO.parseCas(library.f, self.order_of_blobs)
            cov_dict, library.reads_total, library.reads_mapped, read_cov_dict = parsed
            if library.reads_total == 0:
                print(BtLog.warn_d["4"] % library.f)
            for name, cov in cov_dict.items():
                library.cov_sum += cov
                self.dict_of_blobs[name].addCov(library.name, cov)
                self.dict_of_blobs[name].addReadCov(library.name, read_cov_dict[name])
            save_cov_view(library)

        elif library.fmt == "cov":
            parsed = BtIO.parseCov(library.f, set(self.dict_of_blobs))
            (base_cov_dict,
             library.reads_total,
             library.reads_mapped,
             library.reads_unmapped,
             read_cov_dict) = parsed
            # Warn when the number of entries differs from the sequence count.
            if len(base_cov_dict) != self.seqs:
                print(BtLog.warn_d["4"] % library.f)
            for name, cov in base_cov_dict.items():
                library.cov_sum += cov
                self.dict_of_blobs[name].addCov(library.name, cov)
                if name in read_cov_dict:
                    self.dict_of_blobs[name].addReadCov(library.name, read_cov_dict[name])

        library.mean_cov = library.cov_sum / self.seqs
        if library.cov_sum == 0.0:
            print(BtLog.warn_d["6"] % (library.name))
        self.covLibs[library.name] = library