Example 1
 def test_discard_bio_samples(self):
     ds = SubreadSet(self.SUBREADS)
     discard_bio_samples(ds, "lbc1--lbc1")
     coll = ds.metadata.collections[0]
     bioSamples = ds.metadata.collections[0].wellSample.bioSamples
     assert len(bioSamples) == 1
     assert bioSamples[0].name == "Alice"
     # No matching BioSample records
     ds = SubreadSet(self.SUBREADS)
     ds.metadata.collections[0].wellSample.bioSamples.pop(1)
     ds.metadata.collections[0].wellSample.bioSamples.pop(1)
     bioSample = ds.metadata.collections[0].wellSample.bioSamples[0]
     while len(bioSample.DNABarcodes) > 0:
         bioSample.DNABarcodes.pop(0)
     assert len(ds.metadata.collections[0].wellSample.bioSamples) == 1
     discard_bio_samples(ds, "lbc1--lbc1")
     assert len(ds.metadata.collections[0].wellSample.bioSamples) == 1
     assert ds.metadata.collections[0].wellSample.bioSamples[
         0].name == "lbc1--lbc1"
     assert ds.metadata.collections[0].wellSample.bioSamples[0].DNABarcodes[
         0].name == "lbc1--lbc1"
     # no BioSample records
     ds = SubreadSet(pbtestdata.get_file("subreads-sequel"))
     assert len(ds.metadata.collections[0].wellSample.bioSamples) == 0
     discard_bio_samples(ds, "lbc1--lbc1")
     assert len(ds.metadata.collections[0].wellSample.bioSamples) == 1
     assert ds.metadata.collections[0].wellSample.bioSamples[
         0].name == "lbc1--lbc1"
     assert ds.metadata.collections[0].wellSample.bioSamples[0].DNABarcodes[
         0].name == "lbc1--lbc1"
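
The assertions above pin down the contract of discard_bio_samples: BioSamples whose DNABarcodes do not include the given barcode label are dropped, and when nothing matches (or no BioSample records exist at all) a placeholder sample named after the barcode is created. Below is a rough behavioral sketch, not the actual pbcoretools implementation; the addSample/addBarcode calls are assumptions about the pbcore metadata API.

def discard_bio_samples_sketch(ds, barcode_label):
    # Sketch only: keep just the BioSamples carrying the requested barcode.
    for collection in ds.metadata.collections:
        samples = collection.wellSample.bioSamples
        for i, sample in reversed(list(enumerate(samples))):
            if barcode_label not in {bc.name for bc in sample.DNABarcodes}:
                samples.pop(i)
        if len(samples) == 0:
            # No match at all: fall back to a sample named after the barcode
            # itself (addSample/addBarcode are assumed API names).
            samples.addSample(barcode_label)
            samples[0].DNABarcodes.addBarcode(barcode_label)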
Example 2
    def test_barcode_split_cornercases(self):
        fn = ('/pbi/dept/secondary/siv/testdata/'
              'pblaa-unittest/Sequel/Phi29/m54008_160219_003234'
              '.tiny.subreadset.xml')
        sset = SubreadSet(fn, skipMissing=True)
        ssets = list(sset.split(chunks=3, barcodes=True))
        assert [str(ss.filters) for ss in ssets
                ] == ["( bc = [0, 0] )", "( bc = [1, 1] )", "( bc = [2, 2] )"]
        sset = SubreadSet(fn, skipMissing=True)
        assert len(sset) == 15133
        sset.filters = None
        assert str(sset.filters) == ""
        sset.updateCounts()
        assert len(sset) == 2667562

        sset.filters.addRequirement(bc=[('=', '[2, 2]')])
        assert str(sset.filters) == "( bc = [2, 2] )"
        sset.updateCounts()
        assert len(sset) == 4710

        sset.filters = None
        assert str(sset.filters) == ""
        sset.updateCounts()
        assert len(sset) == 2667562

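        # the same barcode filter written without whitespace in the value
        # string: the string form is preserved verbatim, and it still
        # selects the same 4710 records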
        sset.filters.addRequirement(bc=[('=', '[2,2]')])
        assert str(sset.filters) == "( bc = [2,2] )"
        sset.updateCounts()
        assert len(sset) == 4710
Example 3
import copy
import logging
import os

from pbcore.io import SubreadSet

log = logging.getLogger(__name__)


def split_dataset(subreadset, out_prefix):
    """
    Takes an input dataset, and for each entry generates one separate dataset
    file, while maintaining all the filters.
    Returns a list of the generated datasets.

    To create an example filtered dataset for testing:
    dataset create --type SubreadSet test.subreadset.xml subreads1.bam subreads2.bam
    dataset filter test.subreadset.xml test.filtered.subreadset.xml 'length>1000'
    """
    out_prefix_abs = os.path.abspath(out_prefix)

    dset = SubreadSet(subreadset, strict=True, skipCounts=True)
    fns = dset.toFofn()

    log.info('resources in {!r}:\n{}'.format(subreadset, '\n'.join(fns)))

    split_fns = []
    for i, bam_fn in enumerate(fns):
        out_fn = '{}.{:05}.subreadset.xml'.format(out_prefix_abs, i)
        new_dataset = SubreadSet(bam_fn, skipCounts=True)
        new_dataset.newUuid()
        new_dataset._filters = copy.deepcopy(dset._filters)
        new_dataset.write(out_fn)
        split_fns.append(out_fn)

    return split_fns
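
A minimal usage sketch for the function above; the input file follows the docstring's recipe and the printed paths are illustrative:

# Hypothetical usage: one output dataset per input BAM, filters preserved.
chunk_fns = split_dataset("test.filtered.subreadset.xml", "chunked")
for fn in chunk_fns:
    print(fn)  # e.g. /abs/path/chunked.00000.subreadset.xml, ...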
Example 4
    def test_loadMetadata(self):
        aln = AlignmentSet(data.getXml(no=8))
        self.assertFalse(aln.metadata.collections)
        aln.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                         'SA3-Sequel/lambda/roche_SAT/'
                         'm54013_151205_032353.run.metadata.xml')
        self.assertTrue(aln.metadata.collections)
        sset_fn = ('/pbi/dept/secondary/siv/testdata/'
                   'SA3-Sequel/lambda/roche_SAT/'
                   'm54013_151205_032353.subreadset.xml')
        sset = SubreadSet(sset_fn)
        orig_metadata = copy.deepcopy(sset.metadata)
        sset.metadata.collections = None
        self.assertFalse(sset.metadata.collections)
        sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                          'SA3-Sequel/lambda/roche_SAT/'
                          'm54013_151205_032353.run.metadata.xml')
        fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
        sset.write(fn)
        validateFile(fn)
        validateFile(sset_fn)
        self.assertEqual(sset.metadata, orig_metadata)

        # load the wrong thing...
        sset_fn = ('/pbi/dept/secondary/siv/testdata/'
                   'SA3-Sequel/lambda/roche_SAT/'
                   'm54013_151205_032353.subreadset.xml')
        sset = SubreadSet(sset_fn)
        orig_metadata = copy.deepcopy(sset.metadata)
        sset.metadata.collections = None
        self.assertFalse(sset.metadata.collections)
        with self.assertRaises(InvalidDataSetIOError):
            sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                              'SA3-Sequel/lambda/roche_SAT/'
                              'm54013_151205_032353.sts.xml')
Example 5
 def test_dataset_create_set_sample_names(self):
     sample_args = ("--well-sample-name WELLSAMPLE "
                    "--bio-sample-name BIOSAMPLE").split()
     outfile = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     cmd = " ".join([
         "dataset", "create", "--force", outfile,
         pbtestdata.get_file("subreads-bam")
     ] + sample_args)
     self._run_cmd_with_output(cmd, outfile)
     with SubreadSet(outfile) as ds:
         assert len(ds.metadata.collections) == 1
         assert ds.metadata.collections[0].wellSample.name == "WELLSAMPLE"
         assert ds.metadata.collections[0].wellSample.bioSamples[
             0].name == "BIOSAMPLE"
         assert len(ds.metadata.collections[0].wellSample.bioSamples) == 1
     # now with existing samples
     outfile = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     cmd = " ".join([
         "dataset", "create", "--force", outfile,
         pbtestdata.get_file("barcoded-subreadset")
     ] + sample_args)
     self._run_cmd_with_output(cmd, outfile)
     with SubreadSet(outfile) as ds:
         assert len(ds.metadata.collections) == 1
         assert ds.metadata.collections[0].wellSample.name == "WELLSAMPLE"
         biosamples = {
             s.name
             for s in ds.metadata.collections[0].wellSample.bioSamples
         }
         assert biosamples == {"BIOSAMPLE"}
Example 6
def run_bam_to_bam(subread_set_file, barcode_set_file, output_file_name,
                   nproc=1, score_mode="symmetric"):
    if score_mode not in ["asymmetric", "symmetric"]:
        raise ValueError("Unrecognized score mode '{m}'".format(m=score_mode))
    bc = BarcodeSet(barcode_set_file)
    if len(bc.resourceReaders()) > 1:
        raise NotImplementedError("Multi-FASTA BarcodeSet input is not supported.")
    new_prefix = re.sub(r"\.subreadset\.xml$", "", output_file_name)
    args = [
        "bam2bam",
        "-j", str(nproc),
        "-b", str(nproc),
        "-o", new_prefix,
        "--barcodes", barcode_set_file,
        "--scoreMode", score_mode,
        subread_set_file
    ]
    log.info(" ".join(args))
    result = run_cmd(" ".join(args),
                     stdout_fh=sys.stdout,
                     stderr_fh=sys.stderr)
    if result.exit_code != 0:
        return result.exit_code
    assert op.isfile(output_file_name)
    tmp_out = op.join(op.dirname(output_file_name),
                      "tmp_" + op.basename(output_file_name))
    shutil.move(output_file_name, tmp_out)
    with SubreadSet(tmp_out, strict=True) as ds:
        with SubreadSet(subread_set_file) as ds_in:
            ds.metadata = ds_in.metadata
            ds.name = ds_in.name + " (barcoded)"
        ds.updateCounts()
        ds.newUuid()
        ds.write(output_file_name)
    return 0
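
A hypothetical invocation of the wrapper above, assuming bam2bam is on the PATH and that log and run_cmd are module-level helpers in the excerpt's home module; all file names here are illustrative:

exit_code = run_bam_to_bam("movie.subreadset.xml",
                           "barcodes.barcodeset.xml",
                           "movie.barcoded.subreadset.xml",
                           nproc=8,
                           score_mode="symmetric")
assert exit_code == 0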
Example 7
 def setup_class(cls):
     bam_files = []
     with SubreadSet(pbtestdata.get_file("barcoded-subreadset")) as ds_in:
         for er in ds_in.externalResources:
             bam_files.append(er.bam)
     with SubreadSet(*bam_files, strict=True) as ds_out:
         ds_out.write(cls.INPUT_FILE)
Example 8
    def test_loadMetadata(self):
        aln = AlignmentSet(data.getXml(7))
        assert not aln.metadata.collections
        aln.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                         'SA3-Sequel/lambda/roche_SAT/'
                         'm54013_151205_032353.run.metadata.xml')
        assert aln.metadata.collections
        sset_fn = ('/pbi/dept/secondary/siv/testdata/'
                   'SA3-Sequel/lambda/roche_SAT/'
                   'm54013_151205_032353.subreadset.xml')
        sset = SubreadSet(sset_fn)
        orig_metadata = copy.deepcopy(sset.metadata)
        sset.metadata.collections = None
        assert not sset.metadata.collections
        sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                          'SA3-Sequel/lambda/roche_SAT/'
                          'm54013_151205_032353.run.metadata.xml')
        fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
        sset.write(fn)
        validateFile(fn)
        validateFile(sset_fn)
        assert sset.metadata == orig_metadata

        # load the wrong thing...
        sset_fn = ('/pbi/dept/secondary/siv/testdata/'
                   'SA3-Sequel/lambda/roche_SAT/'
                   'm54013_151205_032353.subreadset.xml')
        sset = SubreadSet(sset_fn)
        orig_metadata = copy.deepcopy(sset.metadata)
        sset.metadata.collections = None
        assert not sset.metadata.collections
        with pytest.raises(InvalidDataSetIOError):
            sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                              'SA3-Sequel/lambda/roche_SAT/'
                              'm54013_151205_032353.sts.xml')
Example 9
 def test_split_zmws_around_read_groups(self):
     ds1 = pbtestdata.get_file("subreads-xml")
     ds2 = pbtestdata.get_file("subreads-sequel")
     ds = SubreadSet(ds1, ds2)
     assert len(ds) == 137
     # this is still the default behavior
     chunks = list(ds.split(chunks=2, zmws=True, breakReadGroups=True))
     assert len(chunks[0]) == 72
     assert len(chunks[1]) == 65
     # don't break up movies
     chunks = list(ds.split(chunks=2, zmws=True, breakReadGroups=False))
     assert len(chunks[0]) == 20
     assert len(chunks[1]) == 117
     assert np.all(chunks[0].index.qId == -2081539485)
     assert np.all(chunks[1].index.qId == -1197849594)
     chunks = list(
         ds.split(chunks=4, targetSize=1, zmws=True, breakReadGroups=False))
     assert [len(c) for c in chunks] == [8, 12, 54, 63]
     assert np.all(chunks[0].index.qId == -2081539485)
     assert np.all(chunks[1].index.qId == -2081539485)
     assert np.all(chunks[2].index.qId == -1197849594)
     assert np.all(chunks[3].index.qId == -1197849594)
     # control: single-movie dataset
     ds = SubreadSet(ds1)
     chunks1 = list(ds.split(chunks=4, zmws=True, breakReadGroups=False))
     chunks2 = list(ds.split(chunks=4, zmws=True, breakReadGroups=True))
     assert [len(x) for x in chunks1] == [len(y) for y in chunks2]
Example 10
    def test_reports_with_fixed_bins(self):
        # TODO readQualDists are currently unpopulated, turn back on when
        # they're repopulated
        # for dist_name, nbins in zip(['medianInsertDists', 'readLenDists',
        #                             'readQualDists'], [200, 200, 50]):
        for dist_name, nbins in zip(['medianInsertDists', 'readLenDists'],
                                    [200, 200]):
            ss = SubreadSet()
            ss.loadStats(get_fixed_bin_sts())

            ss2 = SubreadSet()
            ss2.loadStats(get_fixed_bin_sts())

            # shift ss2
            mdist = getattr(ss2.metadata.summaryStats, dist_name)[0].bins
            mdist = [0, 0, 0] + mdist[:-3]
            getattr(ss2.metadata.summaryStats, dist_name)[0].bins = mdist

            ss3 = ss + ss2

            ss4 = SubreadSet()
            ss4.loadStats(get_fixed_bin_sts())

            # shift ss4
            mdist = getattr(ss4.metadata.summaryStats, dist_name)[0].bins
            mdist = [0 for _ in mdist]
            getattr(ss4.metadata.summaryStats, dist_name)[0].bins = mdist

            dists = getattr(ss4.metadata.summaryStats, dist_name)
            self.assertEqual(len(dists), 1)
            for n in [0, 1, 2, 10, 40, 41, 49, 50, 51, 200, 500]:
                ds = continuous_dist_shaper(dists, nbins=n)
                fixed_dists = [ds(dist) for dist in dists]
                self.assertEqual(len(dists[0].bins), nbins)
                self.assertEqual(len(fixed_dists[0].bins), nbins)
                self.assertEqual(sum(dists[0].bins), sum(fixed_dists[0].bins))

            sss = [ss, ss2, ss3]

            for sset in sss:
                dists = getattr(sset.metadata.summaryStats, dist_name)
                self.assertEqual(len(dists), 1)
                # 0 (or a requested nbins > numBins) falls back to a no-op
                ops = [1, 2, 3, 4, 7, 10, 40, 41, 49, 50, 51, 200, 500]
                no_ops = [0]
                for n in no_ops:
                    ds = continuous_dist_shaper(dists, nbins=n)
                    fixed_dists = [ds(dist) for dist in dists]
                    self.assertEqual(len(dists[0].bins), nbins)
                    self.assertEqual(len(fixed_dists[0].bins), nbins)
                    self.assertEqual(sum(dists[0].bins),
                                     sum(fixed_dists[0].bins))

                for n in ops:
                    ds = continuous_dist_shaper(dists, nbins=n)
                    fixed_dists = [ds(dist) for dist in dists]
                    self.assertEqual(len(dists[0].bins), nbins)
                    self.assertEqual(len(fixed_dists[0].bins), n)
                    self.assertEqual(sum(dists[0].bins),
                                     sum(fixed_dists[0].bins))
Example 11
    def test_isBarcoded(self):
        empty = upstreamdata.getEmptyBam()
        nonempty = ('/pbi/dept/secondary/siv/testdata/'
                    'pblaa-unittest/Sequel/Phi29/m54008_160219_003234'
                    '.tiny.subreadset.xml')

        # One empty one non empty
        sset = SubreadSet(nonempty, empty, skipMissing=True)
        self.assertTrue(sset.isBarcoded)

        # Just nonempty
        sset = SubreadSet(nonempty, skipMissing=True)
        self.assertEqual(len(sset), 15133)
        self.assertTrue(sset.isBarcoded)

        # Just empty
        #   This is crazy, the pbi must be out of date:
        sset = SubreadSet(empty)
        self.assertEqual(len(sset), 0)
        self.assertTrue(sset.isBarcoded)
        #   To confirm current behavior, I will regenerate the pbi with a
        #   current pbindex:
        efn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
        log.info("Copying to {}".format(efn))
        sset.copyTo(efn)
        sset.induceIndices(force=True)
        self.assertFalse(sset.isBarcoded)
Example 12
    def test_barcode_split_maxChunks(self):
        fn = ('/pbi/dept/secondary/siv/testdata/'
              'pblaa-unittest/Sequel/Phi29/m54008_160219_003234'
              '.tiny.subreadset.xml')
        sset = SubreadSet(fn, skipMissing=True)
        ssets = sset.split(maxChunks=2, barcodes=True)
        self.assertEqual(
            [str(ss.filters) for ss in ssets],
            ["( bc = [0, 0] )", "( bc = [1, 1] ) OR ( bc = [2, 2] )"])
        sset = SubreadSet(fn, skipMissing=True)
        self.assertEqual(len(sset), 15133)
        sset.filters = None
        self.assertEqual(str(sset.filters), "")
        sset.updateCounts()
        self.assertEqual(len(sset), 2667562)

        sset.filters = ssets[0].filters
        self.assertEqual(str(sset.filters), "( bc = [0, 0] )")
        sset.updateCounts()
        self.assertEqual(len(sset), 5370)

        sset.filters = None
        self.assertEqual(str(sset.filters), "")
        sset.updateCounts()
        self.assertEqual(len(sset), 2667562)

        sset.filters = ssets[1].filters
        self.assertEqual(str(sset.filters),
                         "( bc = [1, 1] ) OR ( bc = [2, 2] )")
        sset.updateCounts()
        self.assertEqual(len(sset), 9763)
Example 13
    def test_barcode_split_cornercases(self):
        fn = ('/pbi/dept/secondary/siv/testdata/'
              'pblaa-unittest/Sequel/Phi29/m54008_160219_003234'
              '.tiny.subreadset.xml')
        sset = SubreadSet(fn, skipMissing=True)
        ssets = sset.split(chunks=3, barcodes=True)
        self.assertEqual(
            [str(ss.filters) for ss in ssets],
            ["( bc = [0, 0] )", "( bc = [1, 1] )", "( bc = [2, 2] )"])
        sset = SubreadSet(fn, skipMissing=True)
        self.assertEqual(len(sset), 15133)
        sset.filters = None
        self.assertEqual(str(sset.filters), "")
        sset.updateCounts()
        self.assertEqual(len(sset), 2667562)

        sset.filters.addRequirement(bc=[('=', '[2, 2]')])
        self.assertEqual(str(sset.filters), "( bc = [2, 2] )")
        sset.updateCounts()
        self.assertEqual(len(sset), 4710)

        sset.filters = None
        self.assertEqual(str(sset.filters), "")
        sset.updateCounts()
        self.assertEqual(len(sset), 2667562)

        sset.filters.addRequirement(bc=[('=', '[2,2]')])
        self.assertEqual(str(sset.filters), "( bc = [2,2] )")
        sset.updateCounts()
        self.assertEqual(len(sset), 4710)
Example 14
 def test_copy(self):
     ds1 = DataSet(data.getXml())
     ds2 = ds1.copy()
     self.assertFalse(ds1 == ds2)
     self.assertFalse(ds1.uuid == ds2.uuid)
     self.assertFalse(ds1 is ds2)
     self.assertTrue(ds1.name == ds2.name)
     self.assertTrue(ds1.externalResources == ds2.externalResources)
     # The name and UniqueId are different:
     self.assertFalse(ds1.objMetadata == ds2.objMetadata)
     self.assertTrue(ds1.filters == ds2.filters)
     self.assertTrue(ds1.subdatasets == ds2.subdatasets)
     self.assertTrue(len(ds1.subdatasets) == 2)
     self.assertTrue(len(ds2.subdatasets) == 2)
     assert not reduce(lambda x, y: x or y, [
         ds1d is ds2d for ds1d in ds1.subdatasets
         for ds2d in ds2.subdatasets
     ])
     # TODO: once simulated files are indexable, turn on strict:
     ds1 = SubreadSet(data.getXml(no=10), strict=False)
     self.assertEqual(type(ds1.metadata).__name__, 'SubreadSetMetadata')
     ds2 = ds1.copy()
     self.assertEqual(type(ds2.metadata).__name__, 'SubreadSetMetadata')
     # Let's try casting
     ds1 = DataSet(data.getBam())
     self.assertEqual(type(ds1).__name__, 'DataSet')
     ds1 = ds1.copy(asType='SubreadSet')
     self.assertEqual(type(ds1).__name__, 'SubreadSet')
     # Let's do some illicit casting
     with self.assertRaises(TypeError):
         ds1 = ds1.copy(asType='ReferenceSet')
     # Let's try not having to cast
     ds1 = SubreadSet(data.getBam())
     self.assertEqual(type(ds1).__name__, 'SubreadSet')
Example 15
 def test_get_barcode_sample_mappings(self):
     with SubreadSet(self._subreads) as ds:
         # just double-checking that the XML defines more samples than are
         # actually present in the BAM
         assert len(ds.metadata.collections[0].wellSample.bioSamples) == 3
     samples = get_barcode_sample_mappings(SubreadSet(self._subreads))
     assert samples == {'lbc3--lbc3': 'Charles', 'lbc1--lbc1': 'Alice'}
Example 16
    def test_barcode_split_maxChunks(self):
        fn = ('/pbi/dept/secondary/siv/testdata/'
              'pblaa-unittest/Sequel/Phi29/m54008_160219_003234'
              '.tiny.subreadset.xml')
        sset = SubreadSet(fn, skipMissing=True)
        ssets = list(sset.split(maxChunks=2, barcodes=True))
        assert [str(ss.filters) for ss in ssets
                ] == ["( bc = [0, 0] )", "( bc = [1, 1] ) OR ( bc = [2, 2] )"]
        sset = SubreadSet(fn, skipMissing=True)
        assert len(sset) == 15133
        sset.filters = None
        assert str(sset.filters) == ""
        sset.updateCounts()
        assert len(sset) == 2667562

        sset.filters = ssets[0].filters
        assert str(sset.filters) == "( bc = [0, 0] )"
        sset.updateCounts()
        assert len(sset) == 5370

        sset.filters = None
        assert str(sset.filters) == ""
        sset.updateCounts()
        assert len(sset) == 2667562

        sset.filters = ssets[1].filters
        assert str(sset.filters) == "( bc = [1, 1] ) OR ( bc = [2, 2] )"
        sset.updateCounts()
        assert len(sset) == 9763
Example 17
    def test_subreadset_metadata_element_name(self):
        # without touching the element:
        sset = SubreadSet(data.getXml(9))
        log.debug(data.getXml(9))
        fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml")
        log.debug(fn.name)
        sset.write(fn.name)
        f = ET.parse(fn.name)
        assert len(f.getroot().findall(
            '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
            'SubreadSetMetadata')) == 0
        assert len(f.getroot().findall(
            '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
            'DataSetMetadata')) == 1
        fn.close()

        # with touching the element:
        sset = SubreadSet(data.getXml(9))
        sset.metadata.description = 'foo'
        fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml")
        sset.write(fn.name, validate=False)
        f = ET.parse(fn.name)
        assert len(f.getroot().findall(
            '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
            'SubreadSetMetadata')) == 0
        assert len(f.getroot().findall(
            '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
            'DataSetMetadata')) == 1
        fn.close()
Example 18
 def test_output_subreadset_name(self):
     """
     Verify that the output SubreadSet name is identical to the input name
     plus ' (barcoded)'.
     """
     with SubreadSet(self.entrypoints.data['eid_subread']) as ds_in:
         with SubreadSet(self._get_subreadset_out()) as ds_out:
             self.assertEqual(ds_out.name, ds_in.name + " (barcoded)")
Example 19
def run_bam_to_bam(subread_set_file,
                   barcode_set_file,
                   output_file_name,
                   nproc=1,
                   score_mode="symmetric"):
    if score_mode not in ["asymmetric", "symmetric"]:
        raise ValueError("Unrecognized score mode '{m}'".format(m=score_mode))
    bc = BarcodeSet(barcode_set_file)
    if len(bc.resourceReaders()) > 1:
        raise NotImplementedError(
            "Multi-FASTA BarcodeSet input is not supported.")
    barcode_fasta = bc.toExternalFiles()[0]
    with SubreadSet(subread_set_file) as ds:
        ds_new = SubreadSet(strict=True)
        for ext_res in ds.externalResources:
            subreads_bam = ext_res.bam
            scraps_bam = ext_res.scraps
            assert subreads_bam is not None
            if scraps_bam is None:
                raise TypeError("The input SubreadSet must include scraps.")
            new_prefix = op.join(
                op.dirname(output_file_name),
                re.sub(r"\.subreads\.bam$", "_barcoded",
                       op.basename(subreads_bam)))
            if not op.isabs(subreads_bam):
                subreads_bam = op.join(op.dirname(subread_set_file),
                                       subreads_bam)
            if not op.isabs(scraps_bam):
                scraps_bam = op.join(op.dirname(subread_set_file), scraps_bam)
            args = [
                "bam2bam", "-j",
                str(nproc), "-b",
                str(nproc), "-o", new_prefix, "--barcodes", barcode_fasta,
                "--scoreMode", score_mode, subreads_bam, scraps_bam
            ]
            log.info(" ".join(args))
            result = run_cmd(" ".join(args),
                             stdout_fh=sys.stdout,
                             stderr_fh=sys.stderr)
            if result.exit_code != 0:
                return result.exit_code
            subreads_bam = new_prefix + ".subreads.bam"
            scraps_bam = new_prefix + ".scraps.bam"
            assert op.isfile(subreads_bam), "Missing {f}".format(
                f=subreads_bam)
            add_subread_resources(ds_new,
                                  subreads=subreads_bam,
                                  scraps=scraps_bam,
                                  barcodes=barcode_set_file)
        ds._filters.clearCallbacks()
        ds_new._filters = ds._filters
        ds_new._populateMetaTypes()
        ds_new.metadata = ds.metadata
        ds_new.name = ds.name + " (barcoded)"
        ds_new.updateCounts()
        ds_new.newUuid()
        ds_new.write(output_file_name)
    return 0
Example 20
    def test_file_arg(self):
        fn = tempfile.NamedTemporaryFile(suffix="filterVals.txt").name
        log.debug(fn)
        sset = SubreadSet(data.getXml(9))
        assert len(sset) == 92
        size = 10
        qn = [r.qName for r in sset[:size]]
        with open(fn, 'w') as ofh:
            for q in qn:
                ofh.write(q)
                ofh.write('\n')
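        # as exercised below, an '=' requirement whose value is a file path
        # acts as a whitelist of qnames, one per line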
        good_qn = [('=', fn)]
        sset.filters.addRequirement(qname=good_qn)
        assert size == sum(1 for _ in sset)
        assert size == len(sset)
        og = set(qn)
        for r in sset:
            og.discard(r.qName)
        assert len(og) == 0

        fn = tempfile.NamedTemporaryFile(suffix="filterVals.txt").name
        log.debug(fn)
        sset = SubreadSet(data.getXml(9))
        assert len(sset) == 92
        size = 10
        qn = [r.qName for r in sset[:size]]
        with open(fn, 'w') as ofh:
            for q in qn:
                ofh.write(q)
                ofh.write('\n')
        good_qn = [('=', fn)]
        sset.filters.addRequirement(qname_file=good_qn)
        assert size == sum(1 for _ in sset)
        assert size == len(sset)
        og = set(qn)
        for r in sset:
            og.discard(r.qName)
        assert len(og) == 0

        fn = tempfile.NamedTemporaryFile(suffix="filterVals.txt").name
        log.debug(fn)
        sset = SubreadSet(data.getXml(9))
        assert len(sset) == 92
        size = 4
        hn = [r for r in sorted(list(set(sset.index.holeNumber)))[:size]]
        with open(fn, 'w') as ofh:
            for h in hn:
                ofh.write(str(h))
                ofh.write('\n')
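        # hole-number (zm) filters accept a file of values in the same way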
        good_hn = [('=', fn)]
        sset.filters.addRequirement(zm=good_hn)
        assert size == len(set(sset.index.holeNumber))
        og = set(hn)
        for r in sset:
            og.discard(r.holeNumber)
        assert len(og) == 0
Example 21
 def test_subreads_parent_dataset(self):
     ds1 = SubreadSet(data.getXml(no=5), skipMissing=True)
     assert ds1.metadata.provenance.parentDataSet.uniqueId == "f81cf391-b3da-41f8-84cb-a0de71f460f4"
     ds2 = SubreadSet(ds1.externalResources[0].bam, skipMissing=True)
     assert ds2.metadata.provenance.parentDataSet.uniqueId is None
     ds2.metadata.addParentDataSet("f81cf391-b3da-41f8-84cb-a0de71f460f4",
                                   "PacBio.DataSet.SubreadSet",
                                   "timestamped_name")
     assert ds2.metadata.provenance.parentDataSet.uniqueId == "f81cf391-b3da-41f8-84cb-a0de71f460f4"
     ds_out = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     ds2.write(ds_out, validate=False)
Example 22
 def test_adapters_resource(self):
     ifn = ("/pbi/dept/secondary/siv/testdata/BlasrTestData/ctest/data/"
            "m54075_161031_164015.subreadset.xml")
     s = SubreadSet(ifn)
     assert s.externalResources[0].adapters.endswith(
         'm54075_161031_164015_adapter.fasta')
     ifn = ("/pbi/dept/secondary/siv/testdata/SA3-Sequel/ecoli/315/"
            "3150319/r54011_20160727_213451/1_A01/"
            "m54011_160727_213918.subreads.bam")
     s = SubreadSet(ifn)
     assert s.externalResources[0].adapters.endswith(
         'm54011_160727_213918.adapters.fasta')
Example 23
 def test_output_subreadset_has_metadata(self):
     """
     Verify that metadata from the instrument are propagated to the barcoded
     SubreadSet.
     """
     with SubreadSet(self.entrypoints.data['eid_subread']) as ds_in:
         with SubreadSet(self._get_subreadset_out()) as ds_out:
             md_in = ds_in.metadata
             md_out = ds_out.metadata
             self.assertTrue(len(md_out.collections.submetadata) > 0)
             self.assertEqual(
                 md_in.collections.submetadata[0].attrib['InstrumentName'],
                 md_out.collections.submetadata[0].attrib['InstrumentName'])
Example 24
 def test_merge(self):
     sset_fn = ('/pbi/dept/secondary/siv/testdata/'
                'SA3-Sequel/lambda/roche_SAT/'
                'm54013_151205_032353.subreadset.xml')
     sset = SubreadSet(sset_fn)
     orig_metadata = copy.deepcopy(sset.metadata)
     assert len(sset.metadata.collections) == 1
     sset.metadata.collections.merge(orig_metadata.collections)
     assert len(sset.metadata.collections) == 2
     sset = SubreadSet(sset_fn)
     sset.metadata.collections.merge(orig_metadata.collections,
                                     forceUnique=True)
     assert len(sset.metadata.collections) == 1
Example 25
 def test_build(self):
     # Progs like pbalign provide a .bam file:
     # e.g. d = DataSet("aligned.bam")
     # Something like the test files we have:
     inBam = data.getBam()
     self.assertTrue(inBam.endswith('.bam'))
     d = DataSet(inBam)
     # A UniqueId is generated, despite being a BAM input
     self.assertTrue(d.uuid != '')
     dOldUuid = d.uuid
     # They can write this BAM to an XML:
     # e.g. d.write("alignmentset.xml")
     outdir = tempfile.mkdtemp(suffix="dataset-unittest")
     outXml = os.path.join(outdir, 'tempfile.xml')
     d.write(outXml)
     # And then recover the same XML (or a different one):
     # e.g. d = DataSet("alignmentset.xml")
     d = DataSet(outXml)
     # The UniqueId will be the same
     self.assertTrue(d.uuid == dOldUuid)
     # Inputs can be many and varied
     ds1 = DataSet(data.getXml(11), data.getBam())
     self.assertEqual(ds1.numExternalResources, 2)
     ds1 = DataSet(data.getFofn())
     self.assertEqual(ds1.numExternalResources, 2)
     # New! Use the correct constructor:
     self.assertEqual(
         type(SubreadSet(data.getSubreadSet())).__name__, 'SubreadSet')
     # Even with untyped inputs
     self.assertTrue(
         str(SubreadSet(data.getBam())).startswith('<SubreadSet'))
     self.assertEqual(
         type(SubreadSet(data.getBam())).__name__, 'SubreadSet')
     self.assertEqual(type(DataSet(data.getBam())).__name__, 'DataSet')
     # You can also cast up and down, but casting between siblings
     # is limited (abuse at your own risk)
     self.assertEqual(
         type(DataSet(data.getBam()).copy(asType='SubreadSet')).__name__,
         'SubreadSet')
     self.assertEqual(
         type(SubreadSet(data.getBam()).copy(asType='DataSet')).__name__,
         'DataSet')
     # Add external Resources:
     ds = DataSet()
     ds.externalResources.addResources(["IdontExist.bam"])
     self.assertTrue(
         ds.externalResources[-1].resourceId == "IdontExist.bam")
     # Add an index file
     ds.externalResources[-1].addIndices(["IdontExist.bam.pbi"])
     self.assertTrue(ds.externalResources[-1].indices[0].resourceId ==
                     "IdontExist.bam.pbi")
Example 26
    def test_qname_filter_scaling(self):
        # unaligned bam
        bam0 = ("/pbi/dept/secondary/siv/testdata/"
                "SA3-DS/ecoli/2590956/0003/"
                "Analysis_Results/m140913_222218_42240_c10069"
                "9952400000001823139203261564_s1_p0.all.subreadset.xml")
        bam1 = ("/pbi/dept/secondary/siv/testdata/"
                "SA3-DS/ecoli/2590953/0001/"
                "Analysis_Results/m140913_005018_42139_c10071"
                "3652400000001823152404301534_s1_p0.all.subreadset.xml")

        # separate '==' takes 120 seconds to addReq for 10k qnames:
        """
        sset = SubreadSet(bam0, bam1)
        self.assertEqual(len(sset), 178570)
        size = 100
        qn = [r.qName for r in sset[:size]]
        good_qn = [('=', name) for name in qn]
        sset.filters.addRequirement(qname=good_qn)
        #self.assertEqual(size, sum(1 for _ in sset))
        self.assertEqual(size, len(sset))

        sset = SubreadSet(data.getXml(10))
        self.assertEqual(len(sset), 92)
        size = 10
        qn = [r.qName for r in sset[:size]]
        good_qn = [('=', name) for name in qn]
        sset.filters.addRequirement(qname=good_qn)
        self.assertEqual(size, sum(1 for _ in sset))
        self.assertEqual(size, len(sset))
        """

        # "in" takes 1.2 seconds to addReq for 10k qnames:

        sset = SubreadSet(bam0, bam1)
        self.assertEqual(len(sset), 178570)
        size = 100
        qn = [r.qName for r in sset[:size]]
        good_qn = [('=', qn)]
        sset.filters.addRequirement(qname=good_qn)
        #self.assertEqual(size, sum(1 for _ in sset))
        self.assertEqual(size, len(sset))

        sset = SubreadSet(data.getXml(10))
        self.assertEqual(len(sset), 92)
        size = 10
        qn = [r.qName for r in sset[:size]]
        good_qn = [('=', qn)]
        sset.filters.addRequirement(qname=good_qn)
        self.assertEqual(size, sum(1 for _ in sset))
        self.assertEqual(size, len(sset))
Example 27
 def test_provenance_record_ordering(self):
     ds = SubreadSet(pbtestdata.get_file("subreads-sequel"), strict=True)
     ds.metadata.addParentDataSet(uuid.uuid4(),
                                  ds.datasetType,
                                  createdBy="AnalysisJob",
                                  timeStampedName="")
     tmp_out = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     ds.write(tmp_out)
     ds = SubreadSet(tmp_out, strict=True)
     tags = [r['tag'] for r in ds.metadata.record['children']]
     self.assertEqual(tags, [
         'TotalLength', 'NumRecords', 'Provenance', 'Collections',
         'SummaryStats'
     ])
Example 28
    def test_nested_external_resources(self):
        log.debug("Testing nested externalResources in AlignmentSets")
        aln = AlignmentSet(data.getXml(0), skipMissing=True)
        self.assertTrue(aln.externalResources[0].pbi)
        self.assertTrue(aln.externalResources[0].reference)
        self.assertEqual(
            aln.externalResources[0].externalResources[0].metaType,
            'PacBio.ReferenceFile.ReferenceFastaFile')
        self.assertEqual(aln.externalResources[0].scraps, None)

        log.debug("Testing nested externalResources in SubreadSets")
        subs = SubreadSet(data.getXml(5), skipMissing=True)
        self.assertTrue(subs.externalResources[0].scraps)
        self.assertEqual(
            subs.externalResources[0].externalResources[0].metaType,
            'PacBio.SubreadFile.ScrapsBamFile')
        self.assertEqual(subs.externalResources[0].reference, None)

        log.debug("Testing added nested externalResoruces to SubreadSet")
        subs = SubreadSet(data.getXml(10))
        self.assertFalse(subs.externalResources[0].scraps)
        subs.externalResources[0].scraps = 'fake.fasta'
        self.assertTrue(subs.externalResources[0].scraps)
        self.assertEqual(
            subs.externalResources[0].externalResources[0].metaType,
            'PacBio.SubreadFile.ScrapsBamFile')
        subs.externalResources[0].barcodes = 'bc.fasta'
        self.assertTrue(subs.externalResources[0].barcodes)
        self.assertEqual(
            subs.externalResources[0].externalResources[1].metaType,
            "PacBio.DataSet.BarcodeSet")

        subs.externalResources[0].adapters = 'foo.adapters.fasta'
        self.assertEqual(subs.externalResources[0].adapters,
                         'foo.adapters.fasta')
        self.assertEqual(
            subs.externalResources[0].externalResources[2].metaType,
            "PacBio.SubreadFile.AdapterFastaFile")

        log.debug("Testing adding nested externalResources to AlignmetnSet "
                  "manually")
        aln = AlignmentSet(data.getXml(8))
        self.assertTrue(aln.externalResources[0].bai)
        self.assertTrue(aln.externalResources[0].pbi)
        self.assertFalse(aln.externalResources[0].reference)
        aln.externalResources[0].reference = 'fake.fasta'
        self.assertTrue(aln.externalResources[0].reference)
        self.assertEqual(
            aln.externalResources[0].externalResources[0].metaType,
            'PacBio.ReferenceFile.ReferenceFastaFile')
Example 29
 def test_subreadset_consolidate(self):
     log.debug("Test through API")
     aln = SubreadSet(data.getXml(10), data.getXml(13))
     self.assertEqual(len(aln.toExternalFiles()), 2)
     outdir = tempfile.mkdtemp(suffix="dataset-unittest")
     outfn = os.path.join(outdir, 'merged.bam')
     aln.consolidate(outfn)
     self.assertTrue(os.path.exists(outfn))
     self.assertEqual(len(aln.toExternalFiles()), 1)
     nonCons = SubreadSet(data.getXml(10), data.getXml(13))
     self.assertEqual(len(nonCons.toExternalFiles()), 2)
     for read1, read2 in zip(sorted(list(aln)), sorted(list(nonCons))):
         self.assertEqual(read1, read2)
     self.assertEqual(len(aln), len(nonCons))
Example 30
 def test_subread_build(self):
     ds1 = SubreadSet(data.getXml(no=5), skipMissing=True)
     ds2 = SubreadSet(data.getXml(no=5), skipMissing=True)
     assert type(ds1).__name__ == 'SubreadSet'
     assert ds1._metadata.__class__.__name__ == 'SubreadSetMetadata'
     assert type(ds1._metadata).__name__ == 'SubreadSetMetadata'
     assert type(ds1.metadata).__name__ == 'SubreadSetMetadata'
     assert len(ds1.metadata.collections) == 1
     assert len(ds2.metadata.collections) == 1
     ds3 = ds1 + ds2
     assert len(ds3.metadata.collections) == 2
     ds4 = SubreadSet(data.getSubreadSet(), skipMissing=True)
     assert type(ds4).__name__ == 'SubreadSet'
     assert type(ds4._metadata).__name__ == 'SubreadSetMetadata'
     assert len(ds4.metadata.collections) == 1