def test_factory_function(self):
    bam = data.getBam()
    aln = data.getXml(8)
    ref = data.getXml(9)
    sub = data.getXml(10)
    inTypes = [bam, aln, ref, sub]
    expTypes = [DataSet, AlignmentSet, ReferenceSet, SubreadSet]
    for infn, exp in zip(inTypes, expTypes):
        # TODO enable this for all when simulated subread files can be
        # pbi'd
        if exp in [DataSet, ReferenceSet, AlignmentSet]:
            ds = openDataSet(infn, strict=True)
        else:
            ds = openDataSet(infn)
        self.assertEqual(type(ds), exp)
def test_dataset_name(self):
    ssfn = data.getXml(7)
    ofn = tempfile.NamedTemporaryFile(suffix=".xml").name
    run_filter_dataset(ssfn, ofn, "0", "None")
    ds = openDataSet(ofn)
    assert ds.name.endswith("(filtered)")
    assert "filtered" in ds.tags
def __init__(self, alignment_file):
    self.alignment_file = alignment_file
    self.dataset_uuids = []
    if alignment_file.endswith('.xml'):
        log.debug('Importing alignments from dataset XML')
        alignment_set = openDataSet(alignment_file)
        if not isinstance(alignment_set,
                          (AlignmentSet, ConsensusAlignmentSet)):
            raise TypeError("Dataset type %s not allowed here" %
                            type(alignment_set).__name__)
        self.alignment_file_list = alignment_set.toExternalFiles()
        self.dataset_uuids.append(alignment_set.uuid)
        movies = []
        for x in self.alignment_file_list:
            if not os.path.exists(x):
                raise IOError(
                    "Unable to find DataSet external resource {x}".format(x=x))
            movies.extend(_movienames_from_bam(x))
        self.movies = sorted(list(set(movies)))
    elif _is_sam_or_bam_file(alignment_file):
        self.alignment_file_list = [alignment_file]
        self.movies = _movienames_from_bam(alignment_file)
    else:
        raise ValueError("Unsupported alignment file type '{x}'".format(
            x=alignment_file))
def import_local_dataset(sal, path):
    """:type sal: ServiceAccessLayer"""
    # XXX basic validation of external resources
    try:
        from pbcore.io import openDataSet, ReadSet, HdfSubreadSet
    except ImportError:
        log.warn("Can't import pbcore, skipping dataset sanity check")
    else:
        ds = openDataSet(path, strict=True)
        if isinstance(ds, ReadSet) and not isinstance(ds, HdfSubreadSet):
            if len(ds) > 0:
                log.info("checking BAM file integrity")
                for rr in ds.resourceReaders():
                    try:
                        _ = rr[-1]
                    except Exception as e:
                        log.exception("Import failed because the underlying " +
                                      "data appear to be corrupted. Run " +
                                      "'pbvalidate' on the dataset for more " +
                                      "thorough checking.")
                        return 1
            else:
                log.warn("Empty dataset - will import anyway")
    # this will raise if the import wasn't successful
    _ = sal.run_import_local_dataset(path)
    log.info("Successfully imported dataset from {f}".format(f=path))
    return 0
def test_dataset_io(self):
    ofn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    rc = bamSieve.filter_reads(input_bam=DS2,
                               output_bam=ofn,
                               whitelist="8")
    self.assertEqual(rc, 0)
    with openDataSet(ofn, strict=False) as bam_out:
        have_zmws = set([rec.HoleNumber for rec in bam_out])
        self.assertEqual(have_zmws, set([8]))
    # make sure paths are absolute
    tmpdir = tempfile.mkdtemp()
    ofn2 = op.join(tmpdir, op.basename(ofn))
    shutil.copyfile(ofn, ofn2)
    with openDataSet(ofn2, strict=False) as bam_out:
        have_zmws = set([rec.HoleNumber for rec in bam_out])
        self.assertEqual(have_zmws, set([8]))
def consolidateXml(args):
    """Combine BAMs and apply the filters described in the XML file,
    producing one consolidated XML"""
    dset = openDataSet(args.infile)
    dset.consolidate(args.datafile,
                     numFiles=args.numFiles,
                     useTmp=(not args.noTmp))
    dset.write(args.xmlfile)
def loadStatsXml(args): dset = openDataSet(args.infile, strict=args.strict) dset.loadStats(args.statsfile) if args.outfile: dset.write(args.outfile, validate=False) else: dset.write(args.infile, validate=False)
def run_consolidate(dataset_file, output_file, datastore_file,
                    consolidate, n_files, task_id=Constants.TOOL_ID):
    datastore_files = []
    with openDataSet(dataset_file) as ds_in:
        if consolidate:
            if len(ds_in.toExternalFiles()) != 1:
                new_resource_file = op.splitext(output_file)[0] + ".bam"
                ds_in.consolidate(new_resource_file, numFiles=n_files)
            # always display the BAM/BAI if consolidation is enabled
            # XXX there is no uniqueness constraint on the sourceId, but this
            # seems sloppy nonetheless - unfortunately I don't know how else to
            # make view rule whitelisting work
            for ext_res in ds_in.externalResources:
                if ext_res.resourceId.endswith(".bam"):
                    ds_file = DataStoreFile(ext_res.uniqueId,
                                            task_id + "-out-2",
                                            ext_res.metaType,
                                            ext_res.bam)
                    datastore_files.append(ds_file)
                    for index in ext_res.indices:
                        if index.metaType in Constants.BAI_FILE_TYPES:
                            ds_file = DataStoreFile(index.uniqueId,
                                                    task_id + "-out-3",
                                                    index.metaType,
                                                    index.resourceId)
                            datastore_files.append(ds_file)
        ds_in.newUuid()
        ds_in.write(output_file)
    datastore = DataStore(datastore_files)
    datastore.write_json(datastore_file)
    return 0
def run_after(self, rtc, output_dir):
    n_actual = n_expected = 0
    with openDataSet(self.INPUT_FILES[0]) as ds:
        n_expected = len([rec for rec in ds])
    with self.READER_CLASS(rtc.task.output_files[0]) as f:
        n_actual = len([rec for rec in f])
    self.assertEqual(n_actual, n_expected)
def summarizeXml(args):
    dset = openDataSet(args.infile, strict=args.strict)
    # check to see if there was an error updating the dataset length:
    numFlag = ""
    if dset.numRecords == 0:
        dset.updateCounts()
        if not dset._countsUpdated:
            numFlag = " Unable to update counts!"
    print("DataSet Type : {f}".format(f=dset.datasetType))
    print("Name : {f}".format(f=dset.name))
    print("Id : {f}".format(f=dset.uuid))
    print("Number of records : {r}{f}".format(r=dset.numRecords, f=numFlag))
    print("Total number of bases : {r}{f}".format(r=dset.totalLength,
                                                  f=numFlag))
    print("# of Resources : {r}".format(r=len(dset.toExternalFiles())))
    print("Filters : {r}".format(
        r=str(dset.filters) if dset.filters else "None"))
    show_sample_names_if_defined(dset)
    if args.show_chemistry:
        print("Sequencing Chemistry : {c}".format(
            c=", ".join(dset.sequencingChemistry)))
    for fname in dset.toExternalFiles():
        print(fname)
    return 0
def _labels_reads_iterator(reads, barcodes, subreads=True):
    with openDataSet(reads) as ds:
        for er in ds.externalResources:
            if er.barcodes != barcodes:
                raise ValueError("Mismatch between external resource " +
                                 "barcodes and input BarcodeSet: " +
                                 "{a} != {b}".format(a=er.barcodes,
                                                     b=barcodes))
        assert ds.isIndexed
        zmws_by_barcode = defaultdict(set)
        reads_by_zmw = defaultdict(list)
        for rr in ds.resourceReaders():
            for i, (b, z, q) in enumerate(zip(rr.pbi.bcForward,
                                              rr.pbi.holeNumber,
                                              rr.pbi.qId)):
                movie = rr.readGroupInfo(q).MovieName
                zmws_by_barcode[b].add((movie, z))
                reads_by_zmw[(movie, z)].append((rr, i))
        with BarcodeSet(barcodes) as bc:
            for i_bc, barcode in enumerate(bc):
                zmws = sorted(list(zmws_by_barcode[i_bc]))
                for (movie, zmw) in zmws:
                    for rr, i_read in reads_by_zmw[(movie, zmw)]:
                        # FIXME(nechols)(2016-03-15) this will not work on CCS
                        qlen = rr.pbi.qEnd[i_read] - rr.pbi.qStart[i_read]
                        barcode_id = "{f}--{r}".format(
                            f=rr.pbi.bcForward[i_read],
                            r=rr.pbi.bcReverse[i_read])
                        yield barcode_id, barcode, ["n"] * qlen
def _example_main(input_file, output_file, **kwargs):
    """
    This func should be imported from your python package.

    This should have *no* dependency on the pbcommand IO, such as the RTC/TC
    models.
    """
    # This is just for test purposes
    log.info("Running example main with {i} {o} kw:{k}".format(i=input_file,
                                                               o=output_file,
                                                               k=kwargs))
    # Try to open SubreadSet with pbcore
    log.info("Attempting to open SubreadSet input with pbcore.io.BamIO")
    dset = openDataSet(input_file)
    nreads = len(dset)

    # write mock output files, otherwise the End-to-End test will fail when
    # run within testkit
    log.info("Attempting to write simple information from the SubreadSet "
             "to CSV output")
    with open(output_file, 'wb') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='|',
                            quoting=csv.QUOTE_MINIMAL)
        writer.writerow(['foo', 'bar'])
        writer.writerow(['baz', 'waz'])
        writer.writerow(['nreads', str(nreads)])
        #f.write( "MOCK TEST DATA" )
    return 0
def _example_main(input_file, output_file, **kwargs): """ This func should be imported from your python package. This should have *no* dependency on the pbcommand IO, such as the RTC/TC models. """ # This is just for test purposes log.info("Running example main with {i} {o} kw:{k}".format(i=input_file, o=output_file, k=kwargs)) # Open input CSV. Store absolute path of each alignment set. dset_paths = _get_dset_paths(input_file) dsets_kpis = {} for f in dset_paths: dset = openDataSet(f) subsampled_dset = _subsample_alignments(dset) dsets_kpis[f] = _getKPIs(dset, subsampled_dset) pickle.dump(dsets_kpis, open(output_file, 'wb')) # save a simple plot traces = [] titles = [] max_rl = 0 for key in dsets_kpis.keys(): rl = dsets_kpis[key]['readlength'] acc = dsets_kpis[key]['accuracy'] if max(rl) > max_rl: max_rl = max(rl) trace = Scatter(x=rl, y=acc, mode='markers') traces.append(trace) titles.append(str(key)) rows = len(traces) fig = plotly.tools.make_subplots(rows=rows, cols=1, subplot_titles=tuple(titles)) fig['layout']['font']['size'] = 8 fig['layout'].update(showlegend=False) for row, trace in enumerate(traces): fig.append_trace(trace, row + 1, 1) # convert from zero-based to one-based indexing fig['layout']['xaxis' + str(row + 1)]['tickfont'].update(size=20) fig['layout']['yaxis' + str(row + 1)]['tickfont'].update(size=20) fig['layout']['xaxis' + str(row + 1)].update(range=[0, max_rl]) fig['layout']['yaxis' + str(rows / 2 + 1)].update(title='accuracy') fig['layout']['yaxis' + str(rows / 2 + 1)]['titlefont'].update(size=20) fig['layout']['xaxis' + str(rows)].update(title='readlength (bases)') fig['layout']['xaxis' + str(rows)]['titlefont'].update(size=20) plot(fig, filename='test-plot.html', show_link=False, auto_open=False) phantomjs_driver.set_window_size(1200, 800) phantomjs_driver.get('test-plot.html') phantomjs_driver.save_screenshot('test-plot.png') return 0
def _example_main(input_file, output_file, **kwargs): """ This func should be imported from your python package. This should have *no* dependency on the pbcommand IO, such as the RTC/TC models. """ # This is just for test purposes log.info("Running example main with {i} {o} kw:{k}".format(i=input_file, o=output_file, k=kwargs)) # Open dset CSV. Store absolute path of each alignment set. dset_paths = _get_dset_paths(input_file[0]) # Open plots CSV. Store names of plots to produce. plots_to_generate = _get_plots_to_generate(input_file[1]) dsets_kpis = {} for f in dset_paths: dset = openDataSet(dset_paths[f]['aset']) subsampled_dset = _subsample_alignments(dset) dsets_kpis[f] = _getKPIs(dset, subsampled_dset) figures = [] # figure tuple has form (plot_group_id, plot_id, figure) if 'accuracy_vs_readlength' in plots_to_generate: figures.append(('accuracy', 'accuracy_vs_readlength', accuracy_plots._plot_accuracy_vs_readlength(dsets_kpis))) if 'accuracy' in plots_to_generate: figures.append(('accuracy', 'accuracy', accuracy_plots._plot_accuracy_distribution(dsets_kpis))) if 'accuracy_boxplot' in plots_to_generate: figures.append(('accuracy', 'accuracy_boxplot', accuracy_plots._plot_accuracy_boxplots(dsets_kpis))) all_plots = {} # dictionary of plots. keys are groups for plot_group, plot_id, fig in figures: if plot_group not in all_plots.keys(): all_plots[plot_group] = [] plot(fig, filename='{i}.html'.format(i=plot_id), show_link=False, auto_open=False) phantomjs_driver.set_window_size(1920, 1080) phantomjs_driver.get('{i}.html'.format(i=plot_id)) phantomjs_driver.save_screenshot('{i}.png'.format(i=plot_id)) phantomjs_driver.get('{i}.html'.format(i=plot_id)) phantomjs_driver.save_screenshot('{i}_thumb.png'.format(i=plot_id)) os.remove('{i}.html'.format(i=plot_id)) plot_path = '{i}.png'.format(i=plot_id) thumb_path = '{i}_thumb.png'.format(i=plot_id) all_plots[plot_group].append(Plot(plot_id, plot_path, thumbnail=thumb_path)) plot_groups = [] for plot_group_title in all_plots.keys(): plot_group = PlotGroup( plot_group_title, plots=all_plots[plot_group_title]) plot_groups.append(plot_group) report = Report('mh_toy', tables=(), plotgroups=plot_groups, attributes=()) report.write_json( output_file ) phantomjs_driver.quit() return 0
def copyToXml(args):
    dss = openDataSet(args.infile, strict=args.strict)
    outfn = args.outdir
    if os.path.isdir(args.outdir):
        outfn = _swapPath(args.outdir, args.infile)
    dss.copyTo(os.path.split(outfn)[0])
    dss.write(outfn, relPaths=args.relative)
    return 0
def _run_reheader_dataset_bams(self, ds_file):
    with openDataSet(ds_file) as ds:
        ds_out = reheader_dataset_bams(ds, os.getcwd(),
                                       self.BIOSAMPLE_NAME,
                                       self.LIBRARY_NAME)
        self._validate_dataset(ds_out)
        self._validate_records(ds, ds_out)
        return ds_out
def loadMetadataXml(args):
    dset = openDataSet(args.infile, strict=args.strict)
    dset.loadMetadata(args.metadata)
    if args.outfile:
        dset.write(args.outfile, validate=False)
    else:
        dset.write(args.infile, validate=False)
    return 0
def _verify():
    with openDataSet(ofn, strict=False) as ds_out:
        ext_res = ds_out.externalResources[0]
        for bam_file in [ext_res.bam, ext_res.scraps]:
            with BamReader(bam_file) as bam:
                zmws = set([rec.HoleNumber for rec in bam])
                self.assertEqual(len(zmws), 1)
                self.assertTrue(74056024 in zmws)
def WriteWhitelistedDataSets(dsFile, mappings):
    prefix = GetPrefix(dsFile)
    for seqId, subreads in mappings.iteritems():
        sset = openDataSet(dsFile)
        sset.filters.addRequirement(qname=[('=', subreadId)
                                           for subreadId in subreads])
        #sset.filters.addRequirement(qname=[('=', sorted(subreads))])
        sset.write(prefix + "." + seqId + ".subreadset.xml")
def summarize_coverage(aln_set, aln_summ_gff, ref_set=None,
                       num_regions=Constants.NUM_REGIONS,
                       region_size=Constants.REGION_SIZE,
                       force_num_regions=Constants.FORCE_NUM_REGIONS,
                       max_region_size=Constants.MAX_REGION_SIZE):
    """
    Main point of entry
    """
    if ref_set:
        untruncator = get_name_untruncator(ref_set)
    else:
        # this dict is always used with get(x, x), so when it's empty it will
        # just preserve the original name
        untruncator = {}

    #readers = enumerate_readers(args.alignment_file)
    readers = openDataSet(aln_set).resourceReaders()
    gff_writer = GffIO.GffWriter(aln_summ_gff)

    # First write the metadata. Names of references, command line used, things
    # like that
    metadata_lines = get_metadata_lines(readers, untruncator)
    for metadata_line in metadata_lines:
        gff_writer.writeHeader(metadata_line)
    log.debug("Wrote {n} header lines to {f}"
              .format(n=len(metadata_lines), f=aln_summ_gff))

    # Build lists of intervals for each reference
    interval_lists = build_interval_lists(readers)
    log.debug("Finished creating interval lists for {n} references"
              .format(n=len(interval_lists)))

    # Create a function that gets region size from the reference length by
    # freezing the constant parameters
    get_region_size_frozen = functools.partial(
        get_region_size, num_refs=len(interval_lists),
        region_size=region_size, num_regions=num_regions,
        force_num_regions=force_num_regions,
        max_region_size=max_region_size)

    # Create Gff records and write them
    for ref_group_id in sorted(interval_lists):
        log.debug("Generating coverage GFF records for refGroupID {r}"
                  .format(r=ref_group_id))
        gff_generator = generate_gff_records(interval_lists[ref_group_id],
                                             readers, ref_group_id,
                                             get_region_size_frozen,
                                             untruncator)
        try:
            for gff_record in gff_generator:
                gff_writer.writeRecord(gff_record)
        except ValueError as e:
            log.warn(e)
def run_and_validate(args, ds_sizes):
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    final_args = base_args + args + ["--outdir", outdir, tmp_ds]
    self._check_cmd(" ".join(final_args))
    dss = [openDataSet(op.join(outdir, fn))
           for fn in sorted(os.listdir(outdir))]
    assert [len(ds) for ds in dss] == ds_sizes
def resolved_tool_contract_runner(rtc):
    log.info("Starting {f} version {v} report generation".format(
        f=__file__, v=__version__))
    dataset_uuids = [openDataSet(rtc.task.input_files[0]).uuid,
                     BarcodeSet(rtc.task.input_files[1]).uuid]
    report = run_to_report(
        reads=rtc.task.input_files[0],
        barcodes=rtc.task.input_files[1],
        subreads=True,
        dataset_uuids=dataset_uuids)
    log.info(pformat(report.to_dict()))
    report.write_json(rtc.task.output_files[0])
    return 0
def test_filter_more(self):
    ssfn = data.getXml(7)
    ofn = tempfile.NamedTemporaryFile(suffix=".xml").name
    # zm=[3,4,5] condition combined with a length condition
    run_filter_dataset(ssfn, ofn, 0, "zm=[3,4,5] AND length >= 1000")
    ds = openDataSet(ofn)
    assert str(ds.filters) == "( zm = [3,4,5] AND length >= 1000 )"
    # semicolon-separated conditions are treated as AND
    run_filter_dataset(ssfn, ofn, 0, "zm=[3,4,5]; length >= 1000")
    ds = openDataSet(ofn)
    assert str(ds.filters) == "( zm = [3,4,5] AND length >= 1000 )"
    # zm=[3,4,5] condition by itself
    run_filter_dataset(ssfn, ofn, 0, "zm=[3,4,5]")
    ds = openDataSet(ofn)
    assert str(ds.filters) == '( zm = [3,4,5] )'
def run(in_file, out_file, filterstr):
    dataSet = openDataSet(in_file)
    filters = dict(parse_filter_list(filterstr.split(',')))
    log.info("Adding {} filters to {} -> {}: {}".format(
        len(filters), in_file, out_file, repr(filters)))
    dataSet.filters.addFilter(**filters)
    log.info("Added. Writing new dataset {}".format(repr(out_file)))
    #dataSet.updateCounts() # just in case # no, not needed
    dataSet.write(out_file, validate=False)  # to avoid warnings
def run_consolidate(dataset_file, output_file, consolidate, n_files):
    with openDataSet(dataset_file) as ds_in:
        # XXX shouldn't the file count check be done elsewhere?
        if consolidate and len(ds_in.toExternalFiles()) != 1:
            new_resource_file = op.splitext(output_file)[0] + ".bam"  # .fasta?
            ds_in.consolidate(new_resource_file, numFiles=n_files)
        ds_in.newUuid()
        ds_in.write(output_file)
    return 0
def run_after(self, output_file, n_expected, expected_filter_str):
    n_actual = self._get_counts(output_file)
    assert self._get_filters(output_file) == expected_filter_str
    assert n_actual == n_expected
    ds = openDataSet(output_file)
    assert len(ds.metadata.provenance) == 0
    assert ds.name.endswith("(filtered)")
    assert "filtered" in ds.tags
    return ds
def test_dataset_io(self):
    ofn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    rc = bamSieve.filter_reads(input_bam=DS2,
                               output_bam=ofn,
                               whitelist="8")
    self.assertEqual(rc, 0)
    with openDataSet(ofn, strict=False) as bam_out:
        have_zmws = set([rec.HoleNumber for rec in bam_out])
        self.assertEqual(have_zmws, set([8]))
def mergeXml(args):
    dss = [openDataSet(infn, strict=args.strict) for infn in args.infiles]
    allds = reduce(lambda ds1, ds2: ds1 + ds2, dss)
    if allds is not None:
        allds.updateCounts()
        allds.write(args.outfile)
    else:
        raise InvalidDataSetIOError("Merge failed, likely due to "
                                    "conflicting Filters")
    return 0
def filterXml(args):
    if args.infile.endswith('xml'):
        dataSet = openDataSet(args.infile, strict=args.strict)
        filters = parse_filter_list(args.filters)
        dataSet.filters.addRequirement(**filters)
        dataSet.updateCounts()
        log.info("{i} filters added".format(i=len(filters)))
        dataSet.write(args.outfile)
    else:
        raise IOError("No files found/found to be compatible")
def loadStatsXml(args):
    dset = openDataSet(args.infile, strict=args.strict)
    if len(dset.externalResources) > 1:
        log.info("More than one ExternalResource found, adding the "
                 "sts.xml nested external resource to the first one")
    dset.externalResources[0].sts = args.statsfile
    if args.outfile:
        dset.write(args.outfile, validate=False)
    else:
        dset.write(args.infile, validate=False)
def absolutizeXml(args):
    dss = openDataSet(args.infile, strict=args.strict)
    outfn = args.infile
    if args.outdir:
        if os.path.isdir(args.outdir):
            outfn = _swapPath(args.outdir, args.infile)
        else:
            outfn = args.outdir
    dss.write(outfn, relPaths=False)
    return 0
def ReadAdaptersFromScraps(bam, windows):
    handles = []
    if bam.lower().endswith(".scraps.bam"):
        handles.append(IndexedBamReader(bam))
    else:
        # Iterate through each external resource, looking for scraps files to read
        ds = openDataSet(bam)
        for er in ds.externalResources:
            try:
                handle = IndexedBamReader(er.scraps)
            except:
                continue
            handles.append(handle)

    adps = defaultdict(list)
    for handle in handles:
        for record in handle:
            if record.scrapType != "A":
                continue
            hn = record.holeNumber
            # Skip records without alignments that passed QC
            try:
                qS, qE, _, _, _, _, _ = windows[hn]
            except:
                continue
            # Skip records for ZMWs other than the one selected for its alignment
            if record.qStart not in [qS, qE] and record.qEnd not in [qS, qE]:
                continue
            # If we made it this far, record the position and type of adapter
            seq = record.peer.seq
            tFrac = sum(1 for b in seq if b == "T") / float(len(seq))
            if tFrac < MIN_T:
                adps[hn].append((record.qStart, "TC6"))
            else:
                adps[hn].append((record.qStart, "POLYA"))

    # Convert our counts into a T/F depending on whether there are polyAs
    results = {}
    for hn, adpData in adps.iteritems():
        if len(adpData) != 2:
            print "ERROR! ERROR! {0} adps for hn #{1}".format(len(adpData), hn)
        # Using the strand, sort the adps left-to-right (by alignment)
        _, _, _, _, _, strand, _ = windows[hn]
        if strand == 0:
            adpData = sorted(adpData)
        else:
            adpData = sorted(adpData, reverse=True)
        # Now ordered we can record both ADP types and locations
        leftTc6 = "T" if adpData[0][1] == "TC6" else "F"
        rightTc6 = "T" if adpData[1][1] == "TC6" else "F"
        leftPolyA = "T" if adpData[0][1] == "POLYA" else "F"
        rightPolyA = "T" if adpData[1][1] == "POLYA" else "F"
        results[hn] = (leftTc6, rightTc6, leftPolyA, rightPolyA)
    return results
def summarizeXml(args):
    dset = openDataSet(args.infile, strict=args.strict)
    print "DataSet Type : {f}".format(f=dset.datasetType)
    print "Name : {f}".format(f=dset.name)
    print "Id : {f}".format(f=dset.uuid)
    print "Number of records : {r}".format(r=dset.numRecords)
    print "Total number of bases : {r}".format(r=dset.totalLength)
    print "# of Resources : {r}".format(r=len(dset.toExternalFiles()))
    for fname in dset.toExternalFiles():
        print fname
    return 0
def _run_bam_to_fastx(program_name, fastx_reader, fastx_writer,
                      input_file_name, output_file_name, tmp_dir=None):
    assert isinstance(program_name, basestring)
    barcode_mode = False
    if output_file_name.endswith(".gz"):
        with openDataSet(input_file_name) as ds_in:
            barcode_mode = ds_in.isBarcoded
    tmp_out_prefix = tempfile.NamedTemporaryFile(dir=tmp_dir).name
    args = [
        program_name,
        "-o", tmp_out_prefix,
        input_file_name,
    ]
    if barcode_mode:
        args.insert(1, "--split-barcodes")
    log.info(" ".join(args))
    result = run_cmd(" ".join(args),
                     stdout_fh=sys.stdout,
                     stderr_fh=sys.stderr)
    if result.exit_code != 0:
        return result.exit_code
    else:
        base_ext = re.sub("bam2", "", program_name)
        if not barcode_mode:
            tmp_out = "{p}.{b}.gz".format(p=tmp_out_prefix, b=base_ext)
            assert os.path.isfile(tmp_out), tmp_out
            if output_file_name.endswith(".gz"):
                log.info("cp {t} {f}".format(t=tmp_out, f=output_file_name))
                shutil.copyfile(tmp_out, output_file_name)
            else:
                _unzip_fastx(tmp_out, output_file_name)
            os.remove(tmp_out)
        else:
            suffix = "{f}.gz".format(f=base_ext)
            tmp_out_dir = op.dirname(tmp_out_prefix)
            tc_out_dir = op.dirname(output_file_name)
            barcoded_file_names = []
            # find the barcoded FASTX files and unzip them to the same
            # output directory and file prefix as the ultimate output
            for fn in os.listdir(tmp_out_dir):
                fn = op.join(tmp_out_dir, fn)
                if fn.startswith(tmp_out_prefix) and fn.endswith(suffix):
                    bc_fwd_rev = fn.split(".")[-3].split("_")
                    suffix2 = ".{f}_{r}.{t}".format(f=bc_fwd_rev[0],
                                                    r=bc_fwd_rev[1],
                                                    t=base_ext)
                    assert fn == tmp_out_prefix + suffix2 + ".gz"
                    fn_out = re.sub(".gz$", suffix2, output_file_name)
                    fastx_out = op.join(tc_out_dir, fn_out)
                    _unzip_fastx(fn, fastx_out)
                    barcoded_file_names.append(fn_out)
                    os.remove(fn)
            assert len(barcoded_file_names) > 0
            return archive_files(barcoded_file_names, output_file_name)
    return 0
def run_after(self, rtc, output_dir):
    with openDataSet(rtc.task.output_files[0]) as f:
        f.assertIndexed()
        self.assertEqual(len(f.toExternalFiles()), 1)
        # test for bug 33778
        qnames = set()
        for rec in f:
            qnames.add(rec.qName)
        self.assertEqual(len(qnames), len(f))
    ds = DataStore.load_from_json(rtc.task.output_files[1])
    self.assertEqual(len(ds.files), 2)
def run_consolidate(dataset_file, output_file, datastore_file,
                    consolidate, n_files,
                    consolidate_f=lambda ds: ds.consolidate):
    # XXX https://github.com/pysam-developers/pysam/issues/939
    pysam.set_verbosity(0)  # pylint: disable=no-member
    datastore_files = []
    with openDataSet(dataset_file) as ds_in:
        if consolidate:
            if len(ds_in.toExternalFiles()) <= 0:
                raise ValueError(
                    "DataSet {} must contain one or more files!".format(
                        dataset_file))
            new_resource_file = bam_of_dataset(output_file)
            consolidate_f(ds_in)(new_resource_file,
                                 numFiles=n_files,
                                 useTmp=False)
            # always display the BAM/BAI if consolidation is enabled
            # XXX there is no uniqueness constraint on the sourceId, but this
            # seems sloppy nonetheless - unfortunately I don't know how else to
            # make view rule whitelisting work
            reads_name = get_reads_name(ds_in)
            for ext_res in ds_in.externalResources:
                if ext_res.resourceId.endswith(".bam"):
                    ds_file = DataStoreFile(ext_res.uniqueId,
                                            Constants.TOOL_ID + "-out-2",
                                            ext_res.metaType,
                                            ext_res.bam,
                                            name=reads_name,
                                            description=reads_name)
                    datastore_files.append(ds_file)
                    # Prevent duplicated index files being added to datastore,
                    # since consolidated dataset may contain multiple indices
                    # pointing to the same physical file
                    added_resources = set()
                    for index in ext_res.indices:
                        if (index.metaType in Constants.BAI_FILE_TYPES and
                                index.resourceId not in added_resources):
                            added_resources.add(index.resourceId)
                            ds_file = DataStoreFile(
                                index.uniqueId,
                                Constants.TOOL_ID + "-out-3",
                                index.metaType,
                                index.resourceId,
                                name="Index of {}".format(reads_name.lower()),
                                description="Index of {}".format(
                                    reads_name.lower()))
                            datastore_files.append(ds_file)
        ds_in.newUuid()
        ds_in.write(output_file)
    datastore = DataStore(datastore_files)
    datastore.write_json(datastore_file)
    return 0
def loadStatsXml(args):
    dset = openDataSet(args.infile, strict=args.strict)
    if len(dset.externalResources) > 1:
        log.info("More than one ExternalResource found, adding the "
                 "sts.xml nested external resource to the first one")
    dset.externalResources[0].sts = args.statsfile
    if args.outfile:
        dset.write(args.outfile, validate=False)
    else:
        dset.write(args.infile, validate=False)
    return 0
def filterXml(args):
    if args.infile.endswith('xml'):
        dataSet = openDataSet(args.infile, strict=args.strict)
        filters = parse_filter_list(args.filters)
        dataSet.filters.addRequirement(**filters)
        dataSet.updateCounts()
        log.info("{i} filters added".format(i=len(filters)))
        dataSet.write(args.outfile)
    else:
        raise IOError("No files found/found to be compatible")
    return 0
def run_filter_dataset(in_file, out_file, read_length, other_filters,
                       downsample_factor=0, min_rq=None):
    rlen = sanitize_read_length(read_length)
    filters = {}
    if other_filters and other_filters != "None":
        filters = parse_filter_list([str(other_filters)])
        log.info("{i} other filters will be added".format(i=len(filters)))
    tags = set()
    if (rlen or min_rq is not None or len(filters) > 0 or
            downsample_factor not in [0, 1]):
        dataSet = openDataSet(in_file)
        orig_uuid = dataSet.uuid
        dataSet.updateCounts()  # just in case
        combine_filters(dataSet, filters)
        tags.update({t.strip() for t in dataSet.tags.strip().split(",")})
        if rlen:
            combine_filters(dataSet, {'length': [('>=', rlen)]})
        if min_rq is not None and min_rq > 0:
            combine_filters(dataSet, {'rq': [('>=', min_rq)]})
        if downsample_factor not in [0, 1]:
            combine_filters(dataSet, {'zm': [("==", "0", downsample_factor)]})
            tags.add("downsampled")
        dataSet.updateCounts()
        # XXX note we do *not* set a new UUID in case we want to keep a parent-
        # child relationship to the input dataset. since the filtered dataset
        # will not be imported back into SMRT Link it is ok to keep the
        # original UUID
        dataSet.uuid = orig_uuid
    else:
        # if we're not actually changing anything, don't load indices
        dataSet = openDataSet(in_file, skipCounts=True)
    tags.add("filtered")
    dataSet.tags = ",".join(list(tags))
    if "(filtered)" not in dataSet.name:
        dataSet.name = dataSet.name + " (filtered)"
    if len(dataSet.metadata.provenance) > 0:
        log.warning("Removing existing provenance record: %s",
                    dataSet.metadata.provenance)
        dataSet.metadata.provenance = None
    dataSet.write(out_file)
    return 0
def _get_rtc_dataset_uuids(report_file, use_outputs=False):
    rtc_path = op.join(op.dirname(report_file), "resolved-tool-contract.json")
    rtc = load_resolved_tool_contract_from(rtc_path)
    all_files = rtc.task.input_files
    if use_outputs:
        all_files = rtc.task.output_files
    ds_uuids = set()
    for file_name in all_files:
        if file_name.endswith(".xml"):
            with openDataSet(file_name) as ds:
                ds_uuids.add(ds.uuid)
    return ds_uuids
def _check_outputs(self, dataset_file):
    assert op.isfile(self.output_bam)
    assert op.isfile(self.output_bam + ".bai")
    assert op.isfile(self.output_bam + ".pbi")
    with openDataSet(dataset_file) as f:
        f.assertIndexed()
        assert len(f.toExternalFiles()) == 1
        # test for bug 33778
        qnames = set()
        for rec in f:
            qnames.add(rec.qName)
        assert len(qnames) == len(f)
def test_split(self):
    ds1 = openDataSet(data.getXml(12))
    self.assertTrue(ds1.numExternalResources > 1)
    dss = ds1.split()
    self.assertTrue(len(dss) == ds1.numExternalResources)
    self.assertEqual(sum(ds.numRecords for ds in dss), ds1.numRecords)
    self.assertEqual(sum(ds.totalLength for ds in dss), ds1.totalLength)
    self.assertEqual(sum(len(ds) for ds in dss), len(ds1))
    dss = ds1.split(chunks=1)
    self.assertTrue(len(dss) == 1)
    self.assertEqual(sum(ds.numRecords for ds in dss), ds1.numRecords)
    self.assertEqual(sum(ds.totalLength for ds in dss), ds1.totalLength)
    self.assertEqual(sum(len(ds) for ds in dss), len(ds1))
    dss = ds1.split(chunks=2)
    self.assertTrue(len(dss) == 2)
    self.assertEqual(sum(ds.numRecords for ds in dss), ds1.numRecords)
    self.assertEqual(sum(ds.totalLength for ds in dss), ds1.totalLength)
    self.assertEqual(sum(len(ds) for ds in dss), len(ds1))
    dss = ds1.split(chunks=2, ignoreSubDatasets=True)
    self.assertTrue(len(dss) == 2)
    self.assertEqual(sum(ds.numRecords for ds in dss), ds1.numRecords)
    self.assertEqual(sum(ds.totalLength for ds in dss), ds1.totalLength)
    self.assertEqual(sum(len(ds) for ds in dss), len(ds1))
    self.assertFalse(dss[0].uuid == dss[1].uuid)
    self.assertTrue(dss[0].name == dss[1].name)
    # Let's try merging and splitting on subdatasets
    ds1 = openDataSet(data.getXml(8))
    self.assertEquals(ds1.totalLength, 123588)
    ds1tl = ds1.totalLength
    ds2 = openDataSet(data.getXml(11))
    self.assertEquals(ds2.totalLength, 117086)
    ds2tl = ds2.totalLength
    dss = ds1 + ds2
    self.assertTrue(dss.totalLength == (ds1tl + ds2tl))
    ds1, ds2 = sorted(dss.split(2, ignoreSubDatasets=False),
                      key=lambda x: x.totalLength, reverse=True)
    self.assertTrue(ds1.totalLength == ds1tl)
    self.assertTrue(ds2.totalLength == ds2tl)
def __init__(self, file_name):
    self.file_name = file_name
    self._is_fasta = False
    self.ext = op.splitext(file_name)[1].upper()
    if self.ext in [".FA", ".FASTA"]:
        self._dataset = FastaReader(file_name)
        self._is_fasta = True
    elif self.ext == ".BAM":
        self._dataset = openDataFile(file_name)
    else:
        # either contigset.xml or consensusreadset.xml
        assert self.ext == ".XML"
        self._dataset = openDataSet(file_name)
        if isinstance(self._dataset, ContigSet):
            self._is_fasta = True
def run_filter_dataset(in_file, out_file, read_length, other_filters):
    dataSet = openDataSet(in_file)
    if other_filters and other_filters != "None":
        filters = parse_filter_list(str(other_filters).split(','))
        dataSet.filters.addFilter(**filters)
        log.info("{i} other filters added".format(i=len(filters)))
    rlen = sanitize_read_length(read_length)
    if rlen:
        dataSet.filters.addRequirement(
            length=[('>=', rlen)])
    if rlen or other_filters:
        dataSet.updateCounts()
    dataSet.write(out_file)
    return 0
def splitXml(args):
    log.debug("Starting split")
    dataSet = openDataSet(args.infile, strict=args.strict)
    chunks = len(args.outfiles)
    if args.chunks:
        chunks = args.chunks
    if isinstance(dataSet, ContigSet):
        dss = dataSet.split(chunks)
    else:
        dss = dataSet.split(chunks=chunks,
                            ignoreSubDatasets=(not args.subdatasets),
                            contigs=args.contigs,
                            maxChunks=args.maxChunks,
                            breakContigs=args.breakContigs,
                            targetSize=args.targetSize,
                            zmws=args.zmws,
                            barcodes=args.barcodes,
                            byRecords=(not args.byRefLength),
                            updateCounts=(not args.noCounts))
    log.debug("Splitting into {i} chunks".format(i=len(dss)))
    infix = 'chunk{i}'
    nSuf = -2 if re.search(r".+\.\w+set\.xml", args.infile) else -1
    if not args.outfiles:
        if not args.outdir:
            args.outfiles = ['.'.join(args.infile.split('.')[:nSuf] +
                                      [infix.format(i=chNum)] +
                                      args.infile.split('.')[nSuf:])
                             for chNum in range(len(dss))]
        else:
            args.outfiles = ['.'.join(args.infile.split('.')[:nSuf] +
                                      [infix.format(i=chNum)] +
                                      args.infile.split('.')[nSuf:])
                             for chNum in range(len(dss))]
            args.outfiles = [os.path.join(args.outdir,
                                          os.path.basename(outfn))
                             for outfn in args.outfiles]
    num = len(dss)
    end = ''
    if num > 5:
        num = 5
        end = '...'
    log.debug("Emitting {f} {e}".format(
        f=', '.join(args.outfiles[:num]),
        e=end))
    log.debug("Finished splitting, now writing")
    for out_fn, dset in zip(args.outfiles, dss):
        dset.write(out_fn)
    log.debug("Done writing files")
    return 0
def filterXml(args): if args.infile.endswith("xml"): dataSet = openDataSet(args.infile, strict=args.strict) filters = defaultdict(list) separators = ["<=", ">=", "!=", "==", ">", "<", "="] for filt in args.filters: for sep in separators: if sep in filt: param, condition = filt.split(sep) condition = (sep, condition) filters[param].append(condition) break dataSet.filters.addRequirement(**filters) dataSet.updateCounts() log.info("{i} filters added".format(i=len(filters))) dataSet.write(args.outfile) else: raise IOError("No files found/found to be compatible")