def test_update_barcoded_sample_metadata(self):
     datastore_tmp = tempfile.NamedTemporaryFile(
         suffix=".datastore.json").name
     barcodes = pbtestdata.get_file("barcodeset")
     ds = split_barcoded_dataset(self.SUBREADS)
     ds.write_json(datastore_tmp)
     base_dir = tempfile.mkdtemp()
     datastore = update_barcoded_sample_metadata(base_dir, datastore_tmp,
                                                 self.SUBREADS, barcodes)
     validate_barcoded_datastore_files(self, self.SUBREADS, datastore)
     # now with use_barcode_uuids=False
     datastore = update_barcoded_sample_metadata(base_dir,
                                                 datastore_tmp,
                                                 self.SUBREADS,
                                                 barcodes,
                                                 use_barcode_uuids=False)
     validate_barcoded_datastore_files(self,
                                       self.SUBREADS,
                                       datastore,
                                       use_barcode_uuids=False)
     # test that it doesn't break with no collection metadata
     ss = SubreadSet(self.SUBREADS)
     ss.metadata.collections = None
     ss_tmp = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     ss.write(ss_tmp)
     ds = split_barcoded_dataset(ss_tmp)
     ds.write_json(datastore_tmp)
     base_dir = tempfile.mkdtemp()
     datastore = update_barcoded_sample_metadata(base_dir, datastore_tmp,
                                                 self.SUBREADS, barcodes)
     validate_barcoded_datastore_files(self,
                                       self.SUBREADS,
                                       datastore,
                                       have_collection_metadata=False,
                                       number_of_expected_collections=0)
def main(parser):
    args = parser.parse_args()

    filt = Filters()
    dset = SubreadSet(args.inXml)
    names = nameGen(args.inFile, fileType='list' if args.list else 'fasta')
    if args.subreads:
        if args.inverted:
            for name in names:
                filt.addRequirement(QNAME=[('!=', name)])
        else:
            filt.addRequirement(QNAME=[('=', name) for name in names])
    else:
        assert len(dset.movieIds) == 1, \
            'This method only works for single-movie subreadsets. Use the --subreads option for multi-movie subreadsets.'
        uniqHn = set(map(getZmw, names))
        if args.inverted:
            for hn in uniqHn:
                filt.addRequirement(zm=[('!=', hn)])
        else:
            filt.addRequirement(zm=[('=', hn) for hn in uniqHn])
    dset.addFilters(filt)
    if args.newUuid:
        dset.newUuid()
    if args.name:
        dset.name = args.name
    dset.write(args.outXml)
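
The filtering above leans on pbcore's Filters requirements. A minimal standalone sketch of the same whitelisting idea (the Filters import path, file names, and read names below are assumptions for illustration, not part of the example):

# Minimal sketch: whitelist two subreads by QNAME in a SubreadSet.
# The import path and all file/read names are placeholders.
from pbcore.io import SubreadSet
from pbcore.io.dataset.DataSetMembers import Filters

dset = SubreadSet("movie.subreadset.xml")
filt = Filters()
filt.addRequirement(QNAME=[('=', name) for name in [
    "m54006_160504_020705/4194370/0_100",
    "m54006_160504_020705/4194371/0_200",
]])
dset.addFilters(filt)
dset.newUuid()
dset.write("whitelisted.subreadset.xml")
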
Example #3
def _generateSubreadSet(output_bam_file):
    sset = SubreadSet(output_bam_file, generateIndices=True)

    sset_output_name = output_bam_file[:-12] + 'subreadset.xml'
    sset.name = sset_output_name.split('.')[0]
    sset.write(sset_output_name)
    return sset_output_name
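
A usage sketch for the helper above (the BAM path is a placeholder; note that the [:-12] slice only works if the file name ends in "subreads.bam"):

# Sketch: wrap a subreads BAM in a SubreadSet XML.  The path is hypothetical
# and must end in "subreads.bam" for the prefix slicing above to hold.
sset_xml = _generateSubreadSet("m54006_160504_020705.subreads.bam")
print(sset_xml)  # m54006_160504_020705.subreadset.xml
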
    def test_subreadset_metadata_element_name(self):
        # without touching the element:
        sset = SubreadSet(data.getXml(10))
        log.debug(data.getXml(10))
        fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
        log.debug(fn)
        sset.write(fn)
        f = ET.parse(fn)
        self.assertEqual(len(f.getroot().findall(
            '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
            'SubreadSetMetadata')),
            0)
        self.assertEqual(len(f.getroot().findall(
            '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
            'DataSetMetadata')),
            1)

        # with touching the element:
        sset = SubreadSet(data.getXml(10))
        sset.metadata.description = 'foo'
        fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
        sset.write(fn, validate=False)
        f = ET.parse(fn)
        self.assertEqual(len(f.getroot().findall(
            '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
            'SubreadSetMetadata')),
            0)
        self.assertEqual(len(f.getroot().findall(
            '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
            'DataSetMetadata')),
            1)
Example #5
 def test_subreadset_from_bam(self):
     # DONE control experiment for bug 28698
     bam = upstreamData.getUnalignedBam()
     ds1 = SubreadSet(bam, strict=False)
     fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     log.debug(fn)
     ds1.write(fn)
Example #6
def split_dataset(subreadset, out_prefix):
    """
    Takes an input dataset, and for each entry generates one separate dataset
    file, while maintaining all the filters.
    Returns a FOFN of the generated datasets.

    To create an example filtered dataset for testing:
    dataset create --type SubreadSet test.subreadset.xml subreads1.bam subreads2.bam
    dataset filter test.subreadset.xml test.filtered.subreadset.xml 'length>1000'
    """
    out_prefix_abs = os.path.abspath(out_prefix)

    dset = SubreadSet(subreadset, strict=True)
    fns = dset.toFofn()

    log.info('resources in {!r}:\n{}'.format(subreadset, '\n'.join(fns)))

    fofn = []
    for i, bam_fn in enumerate(fns):
        out_fn = '{}.{:05}.subreadset.xml'.format(out_prefix_abs, i)
        new_dataset = SubreadSet(bam_fn)
        new_dataset.newUuid()
        new_dataset._filters = copy.deepcopy(dset._filters)
        new_dataset.write(out_fn)
        fofn.append(out_fn)

    return fofn
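
A hedged usage sketch for split_dataset (all file names are placeholders):

# Sketch: split a filtered SubreadSet into one dataset XML per BAM resource
# and record the outputs in a FOFN.  Paths are placeholders.
split_files = split_dataset("test.filtered.subreadset.xml", "out/split")
with open("splits.fofn", "w") as fofn_out:
    fofn_out.write("\n".join(split_files) + "\n")
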
Example #7
    def test_subreadset_metadata_element_name(self):
        # without touching the element:
        sset = SubreadSet(data.getXml(9))
        log.debug(data.getXml(9))
        fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml")
        log.debug(fn.name)
        sset.write(fn.name)
        f = ET.parse(fn.name)
        assert len(f.getroot().findall(
            '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
            'SubreadSetMetadata')) == 0
        assert len(f.getroot().findall(
            '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
            'DataSetMetadata')) == 1
        fn.close()

        # with touching the element:
        sset = SubreadSet(data.getXml(9))
        sset.metadata.description = 'foo'
        fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml")
        sset.write(fn.name, validate=False)
        f = ET.parse(fn.name)
        assert len(f.getroot().findall(
            '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
            'SubreadSetMetadata')) == 0
        assert len(f.getroot().findall(
            '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
            'DataSetMetadata')) == 1
        fn.close()
Example #8
def run_bax_to_bam(input_file_name, output_file_name):
    with HdfSubreadSet(input_file_name) as ds_in:
        movies = set()
        for rr in ds_in.resourceReaders():
            movies.add(rr.movieName)
        if len(movies) > 1:
            out_dir = os.path.dirname(output_file_name)
            ds_out_files = []
            for bax_file in ds_in.toExternalFiles():
                output_file_name_tmp = os.path.join(
                    out_dir,
                    ".".join(os.path.basename(bax_file).split(".")[:-2]) +
                    ".hdfsubreadset.xml")
                rc = _run_bax_to_bam(bax_file, output_file_name_tmp)
                if rc != 0:
                    log.error("bax2bam failed")
                    return rc
                ds_out_files.append(output_file_name_tmp)
            ds = SubreadSet(*ds_out_files)
            ds.name = ds_in.name
            if 'Description' in ds_in.objMetadata:
                ds.objMetadata['Description'] = ds_in.objMetadata[
                    'Description']
                ds.metadata.merge(ds_in.metadata)
            ds.write(output_file_name)
        else:
            return _run_bax_to_bam(input_file_name, output_file_name)
    return 0
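
A usage sketch for the converter above (paths are placeholders; bax2bam and the _run_bax_to_bam helper defined elsewhere in this module are assumed to be available):

# Sketch: convert an RS-II HdfSubreadSet into a BAM-backed SubreadSet.
# Paths are hypothetical; a non-zero return code means bax2bam failed.
rc = run_bax_to_bam("movie.hdfsubreadset.xml", "movie.subreadset.xml")
assert rc == 0
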
 def setUpClass(cls):
     super(TestToolContract, cls).setUpClass()
     ds = SubreadSet(BAM_FILE, strict=True)
     ds.write(cls.INPUT_FILES[0])
     with FastaWriter(cls.INPUT_FILES[1]) as fa_out:
         for i in range(1010):
             fa_out.writeRecord("%04d_Forward" % i, "A" * 16)
Example #10
def split_dataset(subreadset, out_prefix):
    """
    Takes an input dataset, and for each entry generates one separate dataset
    file, while maintaining all the filters.
    Returns a list of the generated datasets.

    To create an example filtered dataset for testing:
    dataset create --type SubreadSet test.subreadset.xml subreads1.bam subreads2.bam
    dataset filter test.subreadset.xml test.filtered.subreadset.xml 'length>1000'
    """
    out_prefix_abs = os.path.abspath(out_prefix)

    dset = SubreadSet(subreadset, strict=True, skipCounts=True)
    fns = dset.toFofn()

    log.info('resources in {!r}:\n{}'.format(subreadset, '\n'.join(fns)))

    split_fns = []
    for i, bam_fn in enumerate(fns):
        out_fn = '{}.{:05}.subreadset.xml'.format(out_prefix_abs, i)
        new_dataset = SubreadSet(bam_fn, skipCounts=True)
        new_dataset.newUuid()
        new_dataset._filters = copy.deepcopy(dset._filters)
        new_dataset.write(out_fn)
        split_fns.append(out_fn)

    return split_fns
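
Compared with the earlier split_dataset variant, this one opens the datasets with skipCounts=True, presumably to avoid recomputing NumRecords/TotalLength from the BAM indices; that keeps the per-resource split cheap when only the XML structure and filters are needed.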
def run(subreadset, fofn):
    dir_name = os.getcwd()
    maxChunks = 0
    dset = SubreadSet(subreadset, strict=True)
    fns = dset.toFofn()
    import pprint
    log.info('resources in {!r}:\n{}'.format(subreadset, pprint.pformat(fns)))
    nrecs = len(dset)
    # HG with 70x coverage => 200G bases total
    ts = 50000 # @ 20k/read => 1G bases, ~300MB .gz => ~200 chunks for Human
    ts = 500000 # @ 20k/read => 10G bases, ~3GB .gz => ~20 chunks for Human
    # and we expect about 7-10min per chunk.
    chunks = nrecs // ts
    log.info('num_chunks={:g} ({:g} / {:g})'.format(chunks, nrecs, ts))
    log.info('Splitting with dset.split(zmws=False, chunks={}, ignoreSubDatasets=True, maxChunks={},)'.format(
        chunks, maxChunks))
    dset_chunks = dset.split(zmws=False, chunks=chunks, ignoreSubDatasets=True, maxChunks=maxChunks,
            updateCounts=False,
            #targetSize=1, breakContigs=True
    )

    chunk_fns = []
    for i, dset in enumerate(dset_chunks):
        chunk_name = 'chunk_{:03d}.subreadset.xml'.format(i) # TODO: 02
        chunk_fn = os.path.join(dir_name, chunk_name)
        dset.updateCounts()
        dset.write(chunk_fn, validate=False) # , relPaths=True
        chunk_fns.append(chunk_fn)
    with open(fofn, 'w') as ofs:
        for fn in chunk_fns:
            ofs.write('{}\n'.format(fn))
    log.info('Wrote {} chunks into "{}"'.format(len(dset_chunks), fofn))
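
To make the chunk-size arithmetic in the comments concrete (the numbers simply restate the comments above and are illustrative, not measured):

# Illustrative arithmetic: ~70x human coverage is roughly 200 Gb of raw
# bases; at ~20 kb per subread that is ~10 M records.
nrecs = 10000000             # ~200 Gb / ~20 kb per subread
ts = 500000                  # records per chunk
chunks = nrecs // ts         # -> 20 chunks of ~10 Gb (~3 GB .gz) each
print(chunks)
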
Example #12
def run_bax_to_bam(input_file_name, output_file_name):
    with HdfSubreadSet(input_file_name) as ds_in:
        movies = set()
        for rr in ds_in.resourceReaders():
            movies.add(rr.movieName)
        if len(movies) > 1:
            out_dir = os.path.dirname(output_file_name)
            ds_out_files = []
            for bax_file in ds_in.toExternalFiles():
                output_file_name_tmp = os.path.join(out_dir, ".".join(
                    os.path.basename(bax_file).split(".")[:-2]) +
                    ".hdfsubreadset.xml")
                rc = _run_bax_to_bam(bax_file, output_file_name_tmp)
                if rc != 0:
                    log.error("bax2bam failed")
                    return rc
                ds_out_files.append(output_file_name_tmp)
            ds = SubreadSet(*ds_out_files)
            ds.name = ds_in.name
            if 'Description' in ds_in.objMetadata:
                ds.objMetadata['Description'] = ds_in.objMetadata['Description']
                ds.metadata.merge(ds_in.metadata)
            ds.write(output_file_name)
        else:
            return _run_bax_to_bam(input_file_name, output_file_name)
    return 0
    def test_loadMetadata(self):
        aln = AlignmentSet(data.getXml(no=8))
        self.assertFalse(aln.metadata.collections)
        aln.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                         'SA3-Sequel/lambda/roche_SAT/'
                         'm54013_151205_032353.run.metadata.xml')
        self.assertTrue(aln.metadata.collections)
        sset_fn = ('/pbi/dept/secondary/siv/testdata/'
                   'SA3-Sequel/lambda/roche_SAT/'
                   'm54013_151205_032353.subreadset.xml')
        sset = SubreadSet(sset_fn)
        orig_metadata = copy.deepcopy(sset.metadata)
        sset.metadata.collections = None
        self.assertFalse(sset.metadata.collections)
        sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                          'SA3-Sequel/lambda/roche_SAT/'
                          'm54013_151205_032353.run.metadata.xml')
        fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
        sset.write(fn)
        validateFile(fn)
        validateFile(sset_fn)
        self.assertEqual(sset.metadata, orig_metadata)

        # load the wrong thing...
        sset_fn = ('/pbi/dept/secondary/siv/testdata/'
                   'SA3-Sequel/lambda/roche_SAT/'
                   'm54013_151205_032353.subreadset.xml')
        sset = SubreadSet(sset_fn)
        orig_metadata = copy.deepcopy(sset.metadata)
        sset.metadata.collections = None
        self.assertFalse(sset.metadata.collections)
        with self.assertRaises(InvalidDataSetIOError):
            sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                              'SA3-Sequel/lambda/roche_SAT/'
                              'm54013_151205_032353.sts.xml')
    def test_loadMetadata(self):
        aln = AlignmentSet(data.getXml(7))
        assert not aln.metadata.collections
        aln.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                         'SA3-Sequel/lambda/roche_SAT/'
                         'm54013_151205_032353.run.metadata.xml')
        assert aln.metadata.collections
        sset_fn = ('/pbi/dept/secondary/siv/testdata/'
                   'SA3-Sequel/lambda/roche_SAT/'
                   'm54013_151205_032353.subreadset.xml')
        sset = SubreadSet(sset_fn)
        orig_metadata = copy.deepcopy(sset.metadata)
        sset.metadata.collections = None
        assert not sset.metadata.collections
        sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                          'SA3-Sequel/lambda/roche_SAT/'
                          'm54013_151205_032353.run.metadata.xml')
        fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
        sset.write(fn)
        validateFile(fn)
        validateFile(sset_fn)
        assert sset.metadata == orig_metadata

        # load the wrong thing...
        sset_fn = ('/pbi/dept/secondary/siv/testdata/'
                   'SA3-Sequel/lambda/roche_SAT/'
                   'm54013_151205_032353.subreadset.xml')
        sset = SubreadSet(sset_fn)
        orig_metadata = copy.deepcopy(sset.metadata)
        sset.metadata.collections = None
        assert not sset.metadata.collections
        with pytest.raises(InvalidDataSetIOError):
            sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                              'SA3-Sequel/lambda/roche_SAT/'
                              'm54013_151205_032353.sts.xml')
    def test_de_novo(self):
        ofn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
        log.info(ofn)
        ss = SubreadSet(data.getXml(10))
        col = CollectionMetadata()
        self.assertFalse(ss.metadata.collections)

        ss.metadata.collections.append(col)
        self.assertTrue(ss.metadata.collections)

        col.cellIndex = 1
        self.assertEqual(ss.metadata.collections[0].cellIndex, '1')

        col.instrumentName = "foo"
        self.assertEqual(ss.metadata.collections[0].instrumentName, "foo")

        col.context = 'bar'
        self.assertEqual(ss.metadata.collections[0].context, "bar")

        ss.metadata.collections[0].runDetails.name = 'foo'
        self.assertEqual('foo', ss.metadata.collections[0].runDetails.name)

        ss.metadata.collections[0].wellSample.name = 'bar'
        self.assertEqual('bar', ss.metadata.collections[0].wellSample.name)

        ss.metadata.collections[0].wellSample.wellName = 'baz'
        self.assertEqual('baz', ss.metadata.collections[0].wellSample.wellName)

        ss.metadata.collections[0].wellSample.concentration = 'baz'
        self.assertEqual('baz',
                         ss.metadata.collections[0].wellSample.concentration)
        ss.write(ofn, validate=False)
Example #17
def run_bam_to_bam(subread_set_file, barcode_set_file, output_file_name,
                   nproc=1):
    bc = BarcodeSet(barcode_set_file)
    if len(bc.resourceReaders()) > 1:
        raise NotImplementedError("Multi-FASTA BarcodeSet input is not supported.")
    barcode_fasta = bc.toExternalFiles()[0]
    with SubreadSet(subread_set_file) as ds:
        # TODO(nechols)(2016-03-15): replace with BarcodedSubreadSet
        ds_new = SubreadSet(strict=True)
        for ext_res in ds.externalResources:
            subreads_bam = ext_res.bam
            scraps_bam = ext_res.scraps
            assert subreads_bam is not None
            if scraps_bam is None:
                raise TypeError("The input SubreadSet must include scraps.")
            new_prefix = op.join(op.dirname(output_file_name),
                re.sub(".subreads.bam", "_barcoded", op.basename(subreads_bam)))
            if not op.isabs(subreads_bam):
                subreads_bam = op.join(op.dirname(subread_set_file),
                    subreads_bam)
            if not op.isabs(scraps_bam):
                scraps_bam = op.join(op.dirname(subread_set_file), scraps_bam)
            args = [
                "bam2bam",
                "-j", str(nproc),
                "-b", str(nproc),
                "-o", new_prefix,
                "--barcodes", barcode_fasta,
                subreads_bam, scraps_bam
            ]
            print(args)
            log.info(" ".join(args))
            result = run_cmd(" ".join(args),
                             stdout_fh=sys.stdout,
                             stderr_fh=sys.stderr)
            if result.exit_code != 0:
                return result.exit_code
            subreads_bam = new_prefix + ".subreads.bam"
            scraps_bam = new_prefix + ".scraps.bam"
            assert op.isfile(subreads_bam), "Missing {f}".format(f=subreads_bam)
            # FIXME we need a more general method for this
            ext_res_new = ExternalResource()
            ext_res_new.resourceId = subreads_bam
            ext_res_new.metaType = 'PacBio.SubreadFile.SubreadBamFile'
            ext_res_new.addIndices([subreads_bam + ".pbi"])
            ext_res_inner = ExternalResources()
            ext_res_scraps = ExternalResource()
            ext_res_scraps.resourceId = scraps_bam
            ext_res_scraps.metaType = 'PacBio.SubreadFile.ScrapsBamFile'
            ext_res_scraps.addIndices([scraps_bam + ".pbi"])
            ext_res_inner.append(ext_res_scraps)
            ext_res_new.append(ext_res_inner)
            ds_new.externalResources.append(ext_res_new)
        ds._filters.clearCallbacks()
        ds_new._filters = ds._filters
        ds_new._populateMetaTypes()
        ds_new.updateCounts()
        ds_new.write(output_file_name)
    return 0
Example #18
 def test_bam2fastx_filtered(self):
     input_file = pbtestdata.get_file("subreads-xml")
     ds = SubreadSet(input_file, strict=True)
     ds.filters.addRequirement(length=[('>=', 1000)])
     input_tmp = get_temp_file(suffix=".subreadset.xml")
     ds.write(input_tmp)
     nrecords_expected = 13
     self.run_and_check_fastx(input_tmp, nrecords_expected)
Example #19
def run_bam_to_bam(subread_set_file,
                   barcode_set_file,
                   output_file_name,
                   nproc=1,
                   score_mode="symmetric"):
    if not score_mode in ["asymmetric", "symmetric"]:
        raise ValueError("Unrecognized score mode '{m}'".format(m=score_mode))
    bc = BarcodeSet(barcode_set_file)
    if len(bc.resourceReaders()) > 1:
        raise NotImplementedError(
            "Multi-FASTA BarcodeSet input is not supported.")
    barcode_fasta = bc.toExternalFiles()[0]
    with SubreadSet(subread_set_file) as ds:
        ds_new = SubreadSet(strict=True)
        for ext_res in ds.externalResources:
            subreads_bam = ext_res.bam
            scraps_bam = ext_res.scraps
            assert subreads_bam is not None
            if scraps_bam is None:
                raise TypeError("The input SubreadSet must include scraps.")
            new_prefix = op.join(
                op.dirname(output_file_name),
                re.sub(".subreads.bam", "_barcoded",
                       op.basename(subreads_bam)))
            if not op.isabs(subreads_bam):
                subreads_bam = op.join(op.dirname(subread_set_file),
                                       subreads_bam)
            if not op.isabs(scraps_bam):
                scraps_bam = op.join(op.dirname(subread_set_file), scraps_bam)
            args = [
                "bam2bam", "-j",
                str(nproc), "-b",
                str(nproc), "-o", new_prefix, "--barcodes", barcode_fasta,
                "--scoreMode", score_mode, subreads_bam, scraps_bam
            ]
            log.info(" ".join(args))
            result = run_cmd(" ".join(args),
                             stdout_fh=sys.stdout,
                             stderr_fh=sys.stderr)
            if result.exit_code != 0:
                return result.exit_code
            subreads_bam = new_prefix + ".subreads.bam"
            scraps_bam = new_prefix + ".scraps.bam"
            assert op.isfile(subreads_bam), "Missing {f}".format(
                f=subreads_bam)
            add_subread_resources(ds_new,
                                  subreads=subreads_bam,
                                  scraps=scraps_bam,
                                  barcodes=barcode_set_file)
        ds._filters.clearCallbacks()
        ds_new._filters = ds._filters
        ds_new._populateMetaTypes()
        ds_new.metadata = ds.metadata
        ds_new.name = ds.name + " (barcoded)"
        ds_new.updateCounts()
        ds_new.newUuid()
        ds_new.write(output_file_name)
    return 0
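
A usage sketch for the barcoding wrapper above (file names and nproc are placeholders; the bam2bam executable must be on PATH for the call to succeed):

# Sketch: barcode a SubreadSet with bam2bam and emit a new dataset XML.
# All paths are hypothetical.
exit_code = run_bam_to_bam("movie.subreadset.xml",
                           "barcodes.barcodeset.xml",
                           "movie_barcoded.subreadset.xml",
                           nproc=8,
                           score_mode="symmetric")
assert exit_code == 0
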
 def setUpClass(cls):
     tmp_bam = tempfile.NamedTemporaryFile(suffix=".subreads.bam").name
     shutil.copyfile(pbcore.data.getUnalignedBam(), tmp_bam)
     shutil.copyfile(pbcore.data.getUnalignedBam()+".pbi", tmp_bam+".pbi")
     ds = SubreadSet(tmp_bam, pbcore.data.getUnalignedBam(), strict=True)
     ds.write(cls.INPUT_FILES[0])
     _write_fasta_or_contigset(cls.INPUT_FILES[1], make_faidx=True,
                               ds_class=BarcodeSet)
     super(TestScatterSubreadBAMs, cls).setUpClass()
def _make_dataset(file_name=None, barcodes=None):
    if file_name is None:
        file_name = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    ds = SubreadSet(BAM_FILE, strict=True)
    if barcodes is not None:
        for er in ds.externalResources:
            er.barcodes = barcodes
    ds.write(file_name)
    return file_name
Example #22
 def _set_up_basic(self):
     input_file = get_temp_file(suffix=".subreadset.xml")
     ds = SubreadSet(data.getXml(9), strict=True)
     ds.metadata.addParentDataSet(uuid.uuid4(),
                                  ds.datasetType,
                                  createdBy="AnalysisJob",
                                  timeStampedName="")
     ds.write(input_file)
     return input_file, len(ds)
 def test_provenance_record_ordering(self):
     import pbtestdata
     ds = SubreadSet(pbtestdata.get_file("subreads-sequel"), strict=True)
     ds.metadata.addParentDataSet(uuid.uuid4(), ds.datasetType, createdBy="AnalysisJob", timeStampedName="")
     tmp_out = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     ds.write(tmp_out)
     ds = SubreadSet(tmp_out, strict=True)
     tags = [r['tag'] for r in ds.metadata.record['children']]
     self.assertEqual(tags, ['TotalLength', 'NumRecords', 'Provenance', 'Collections', 'SummaryStats'])
 def test_get_dataset_uuid(self):
     ds = SubreadSet(upstreamdata.getUnalignedBam(), strict=True)
     ds_file = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     ds.write(ds_file)
     uuid = getDataSetUuid(ds_file)
     self.assertEqual(uuid, ds.uuid)
     with open(ds_file, "w") as out:
         out.write("hello world!")
     uuid = getDataSetUuid(ds_file)
     self.assertEqual(uuid, None)
Example #25
 def test_get_dataset_uuid(self):
     ds = SubreadSet(upstreamdata.getUnalignedBam(), strict=True)
     ds_file = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     ds.write(ds_file)
     uuid = getDataSetUuid(ds_file)
     assert uuid == ds.uuid
     with open(ds_file, "w") as out:
         out.write("hello world!")
     uuid = getDataSetUuid(ds_file)
     assert uuid is None
    def test_de_novo(self):
        ofn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
        log.info(ofn)
        ss = SubreadSet(data.getXml(10))
        col = CollectionMetadata()
        self.assertFalse(ss.metadata.collections)

        ss.metadata.collections.append(col)
        self.assertTrue(ss.metadata.collections)

        col.cellIndex = 1
        self.assertEqual(ss.metadata.collections[0].cellIndex, '1')

        col.instrumentName = "foo"
        self.assertEqual(ss.metadata.collections[0].instrumentName, "foo")

        col.context = 'bar'
        self.assertEqual(ss.metadata.collections[0].context, "bar")

        ss.metadata.collections[0].runDetails.name = 'foo'
        self.assertEqual('foo', ss.metadata.collections[0].runDetails.name)

        ss.metadata.collections[0].wellSample.name = 'bar'
        self.assertEqual('bar', ss.metadata.collections[0].wellSample.name)

        ss.metadata.collections[0].wellSample.wellName = 'baz'
        self.assertEqual('baz', ss.metadata.collections[0].wellSample.wellName)

        ss.metadata.collections[0].wellSample.concentration = 'baz'
        self.assertEqual('baz',
                         ss.metadata.collections[0].wellSample.concentration)

        # There are no existing biosamples:
        self.assertFalse(
            'BioSamples' in ss.metadata.collections[0].wellSample.tags)
        # Therefore the metadata is falsy
        self.assertFalse(ss.metadata.collections[0].wellSample.bioSamples)

        ss.metadata.collections[0].wellSample.bioSamples.addSample('Clown')
        self.assertEqual(
            'Clown', ss.metadata.collections[0].wellSample.bioSamples[0].name)

        ss.metadata.collections[0].wellSample.bioSamples[
            0].DNABarcodes.addBarcode('Dentist')
        self.assertEqual(
            'Dentist', ss.metadata.collections[0].wellSample.bioSamples[0].
            DNABarcodes[0].name)

        # check that we are adding one additional biosamples element:
        self.assertEqual(
            Counter(ss.metadata.collections[0].wellSample.tags)['BioSamples'],
            1)
        # Therefore the metadata is truthy
        self.assertTrue(ss.metadata.collections[0].wellSample.bioSamples)
        ss.write(ofn, validate=False)
Example #27
def run_bam_to_bam(subread_set_file, barcode_set_file, output_file_name,
                   nproc=1, score_mode="symmetric"):
    if not score_mode in ["asymmetric", "symmetric"]:
        raise ValueError("Unrecognized score mode '{m}'".format(m=score_mode))
    bc = BarcodeSet(barcode_set_file)
    if len(bc.resourceReaders()) > 1:
        raise NotImplementedError("Multi-FASTA BarcodeSet input is not supported.")
    barcode_fasta = bc.toExternalFiles()[0]
    with SubreadSet(subread_set_file) as ds:
        ds_new = SubreadSet(strict=True)
        for ext_res in ds.externalResources:
            subreads_bam = ext_res.bam
            scraps_bam = ext_res.scraps
            assert subreads_bam is not None
            if scraps_bam is None:
                raise TypeError("The input SubreadSet must include scraps.")
            new_prefix = op.join(op.dirname(output_file_name),
                re.sub(".subreads.bam", "_barcoded", op.basename(subreads_bam)))
            if not op.isabs(subreads_bam):
                subreads_bam = op.join(op.dirname(subread_set_file),
                    subreads_bam)
            if not op.isabs(scraps_bam):
                scraps_bam = op.join(op.dirname(subread_set_file), scraps_bam)
            args = [
                "bam2bam",
                "-j", str(nproc),
                "-b", str(nproc),
                "-o", new_prefix,
                "--barcodes", barcode_fasta,
                "--scoreMode", score_mode,
                subreads_bam, scraps_bam
            ]
            log.info(" ".join(args))
            result = run_cmd(" ".join(args),
                             stdout_fh=sys.stdout,
                             stderr_fh=sys.stderr)
            if result.exit_code != 0:
                return result.exit_code
            subreads_bam = new_prefix + ".subreads.bam"
            scraps_bam = new_prefix + ".scraps.bam"
            assert op.isfile(subreads_bam), "Missing {f}".format(f=subreads_bam)
            add_subread_resources(ds_new,
                subreads=subreads_bam,
                scraps=scraps_bam,
                barcodes=barcode_set_file)
        ds._filters.clearCallbacks()
        ds_new._filters = ds._filters
        ds_new._populateMetaTypes()
        ds_new.metadata = ds.metadata
        ds_new.name = ds.name + " (barcoded)"
        ds_new.updateCounts()
        ds_new.newUuid()
        ds_new.write(output_file_name)
    return 0
Example #28
 def test_subreads_parent_dataset(self):
     ds1 = SubreadSet(data.getXml(no=5), skipMissing=True)
     assert ds1.metadata.provenance.parentDataSet.uniqueId == "f81cf391-b3da-41f8-84cb-a0de71f460f4"
     ds2 = SubreadSet(ds1.externalResources[0].bam, skipMissing=True)
     assert ds2.metadata.provenance.parentDataSet.uniqueId is None
     ds2.metadata.addParentDataSet("f81cf391-b3da-41f8-84cb-a0de71f460f4",
                                   "PacBio.DataSet.SubreadSet",
                                   "timestamped_name")
     assert ds2.metadata.provenance.parentDataSet.uniqueId == "f81cf391-b3da-41f8-84cb-a0de71f460f4"
     ds_out = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     ds2.write(ds_out, validate=False)
Example #29
 def setUpClass(cls):
     tmp_bam = tempfile.NamedTemporaryFile(suffix=".subreads.bam").name
     shutil.copyfile(pbcore.data.getUnalignedBam(), tmp_bam)
     shutil.copyfile(pbcore.data.getUnalignedBam() + ".pbi",
                     tmp_bam + ".pbi")
     ds = SubreadSet(tmp_bam, pbcore.data.getUnalignedBam(), strict=True)
     ds.write(cls.INPUT_FILES[0])
     _write_fasta_or_contigset(cls.INPUT_FILES[1],
                               make_faidx=True,
                               ds_class=BarcodeSet)
     super(TestScatterSubreadBAMs, cls).setUpClass()
Example #30
 def test_filter_dataset_bq(self):
     ds_in = get_temp_file(suffix=".subreadset.xml")
     ds = SubreadSet(pbtestdata.get_file("barcoded-subreadset"),
                     strict=True)
     ds.filters.addRequirement(bq=[('>=', 31)])
     assert len(ds) == 1
     ds.write(ds_in)
     ds_out = get_temp_file(suffix=".subreadset.xml")
     args = self.BASE_ARGS + [ds_in, ds_out, "length >= 10 AND bq >= 10"]
     self._check_call(args)
     n_expected = 2
     expected_filter_str = "( bq >= 10 AND length >= 10 )"
     self.run_after(ds_out, n_expected, expected_filter_str)
 def test_subreads_parent_dataset(self):
     ds1 = SubreadSet(data.getXml(no=5), skipMissing=True)
     self.assertEqual(ds1.metadata.provenance.parentDataSet.uniqueId,
                      "f81cf391-b3da-41f8-84cb-a0de71f460f4")
     ds2 = SubreadSet(ds1.externalResources[0].bam, skipMissing=True)
     self.assertEqual(ds2.metadata.provenance.parentDataSet.uniqueId, None)
     ds2.metadata.addParentDataSet("f81cf391-b3da-41f8-84cb-a0de71f460f4",
                                   "PacBio.DataSet.SubreadSet",
                                   "timestamped_name")
     self.assertEqual(ds2.metadata.provenance.parentDataSet.uniqueId,
                      "f81cf391-b3da-41f8-84cb-a0de71f460f4")
     ds_out = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     ds2.write(ds_out, validate=False)
Example #32
 def test_provenance_record_ordering(self):
     ds = SubreadSet(pbtestdata.get_file("subreads-sequel"), strict=True)
     ds.metadata.addParentDataSet(uuid.uuid4(),
                                  ds.datasetType,
                                  createdBy="AnalysisJob",
                                  timeStampedName="")
     tmp_out = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     ds.write(tmp_out)
     ds = SubreadSet(tmp_out, strict=True)
     tags = [r['tag'] for r in ds.metadata.record['children']]
     self.assertEqual(tags, [
         'TotalLength', 'NumRecords', 'Provenance', 'Collections',
         'SummaryStats'
     ])
    def test_de_novo(self):
        ofn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
        log.info(ofn)
        ss = SubreadSet(data.getXml(10))
        col = CollectionMetadata()
        self.assertFalse(ss.metadata.collections)

        ss.metadata.collections.append(col)
        self.assertTrue(ss.metadata.collections)

        col.cellIndex = 1
        self.assertEqual(ss.metadata.collections[0].cellIndex, '1')

        col.instrumentName = "foo"
        self.assertEqual(ss.metadata.collections[0].instrumentName, "foo")

        col.context = 'bar'
        self.assertEqual(ss.metadata.collections[0].context, "bar")

        ss.metadata.collections[0].runDetails.name = 'foo'
        self.assertEqual('foo', ss.metadata.collections[0].runDetails.name)

        ss.metadata.collections[0].wellSample.name = 'bar'
        self.assertEqual('bar', ss.metadata.collections[0].wellSample.name)

        ss.metadata.collections[0].wellSample.wellName = 'baz'
        self.assertEqual('baz', ss.metadata.collections[0].wellSample.wellName)

        ss.metadata.collections[0].wellSample.concentration = 'baz'
        self.assertEqual('baz',
                         ss.metadata.collections[0].wellSample.concentration)

        # There are no existing biosamples:
        self.assertFalse(
            'BioSamples' in ss.metadata.tags)
        # Therefore the metadata is falsy
        self.assertFalse(ss.metadata.bioSamples)

        ss.metadata.bioSamples.addSample('Clown')
        self.assertEqual('Clown', ss.metadata.bioSamples[0].name)

        ss.metadata.bioSamples[0].DNABarcodes.addBarcode('Dentist')
        self.assertEqual('Dentist',
                         ss.metadata.bioSamples[0].DNABarcodes[0].name)

        # check that we are adding one additional biosamples element:
        self.assertEqual(Counter(ss.metadata.tags)['BioSamples'], 1)
        # Therefore the metadata is truthy
        self.assertTrue(ss.metadata.bioSamples)
        ss.write(ofn, validate=False)
Example #34
def to_zmw_chunked_subreadset_files(subreadset_path, max_total_nchunks,
                                    chunk_key, dir_name, base_name, ext):
    """Identical to to_chunked_subreadset_files, but chunks subreads by
    ZMW ranges for input to pbccs."""
    dset = SubreadSet(subreadset_path, strict=True)
    dset_chunks = dset.split(chunks=max_total_nchunks, zmws=True)
    d = {}
    for i, dset in enumerate(dset_chunks):
        chunk_id = '_'.join([base_name, str(i)])
        chunk_name = '.'.join([chunk_id, ext])
        chunk_path = os.path.join(dir_name, chunk_name)
        dset.write(chunk_path)
        d[chunk_key] = os.path.abspath(chunk_path)
        c = PipelineChunk(chunk_id, **d)
        yield c
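
A sketch of how the generator above might be consumed (the chunk key, chunk count, and paths are assumptions):

# Sketch: materialize ZMW-based chunks for pbccs.  The chunk key string,
# chunk count, and all paths are placeholders.
for chunk in to_zmw_chunked_subreadset_files("movie.subreadset.xml",
                                             max_total_nchunks=24,
                                             chunk_key="subreadset_id",
                                             dir_name="chunks",
                                             base_name="ccs_chunk",
                                             ext="subreadset.xml"):
    print(chunk)
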
    def test_de_novo(self):
        ofn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
        log.info(ofn)
        ss = SubreadSet(data.getXml(9))
        col = CollectionMetadata()
        assert not ss.metadata.collections

        ss.metadata.collections.append(col)
        assert ss.metadata.collections

        col.cellIndex = 1
        assert ss.metadata.collections[0].cellIndex == '1'

        col.instrumentName = "foo"
        assert ss.metadata.collections[0].instrumentName == "foo"

        col.context = 'bar'
        assert ss.metadata.collections[0].context == "bar"

        ss.metadata.collections[0].runDetails.name = 'foo'
        assert 'foo' == ss.metadata.collections[0].runDetails.name

        ss.metadata.collections[0].wellSample.name = 'bar'
        assert 'bar' == ss.metadata.collections[0].wellSample.name

        ss.metadata.collections[0].wellSample.wellName = 'baz'
        assert 'baz' == ss.metadata.collections[0].wellSample.wellName

        ss.metadata.collections[0].wellSample.concentration = 'baz'
        assert 'baz' == ss.metadata.collections[0].wellSample.concentration

        # There are no existing biosamples:
        assert not 'BioSamples' in ss.metadata.tags
        # Therefore the metadata is falsy
        assert not ss.metadata.bioSamples

        ss.metadata.bioSamples.addSample('Clown')
        assert 'Clown' == ss.metadata.bioSamples[0].name

        ss.metadata.bioSamples[0].DNABarcodes.addBarcode('Dentist')
        assert 'Dentist' == ss.metadata.bioSamples[0].DNABarcodes[0].name

        # check that we are adding one additional biosamples element:
        assert Counter(ss.metadata.tags)['BioSamples'] == 1
        # Therefore the metadata is truthy
        assert ss.metadata.bioSamples
        ss.write(ofn, validate=False)
    def test_mock_update_barcoded_sample_metadata(self):
        tmp_dir = tempfile.mkdtemp()
        datastore_tmp = op.join(tmp_dir, "lima.datastore.json")
        barcodeset = pbtestdata.get_file("barcodeset")
        barcodes = ["lbc1--lbc1", "lbc3--lbc3"]
        files = [
            op.join(tmp_dir, "lima.lbc1--lbc1.subreadset.xml"),
            op.join(tmp_dir, "lima.lbc3--lbc3.subreadset.xml")
        ]
        uuids = [uuid.uuid4() for fn in files]
        # XXX these are hardcoded to match the actual barcoded test input
        bc_uuids = [
            "dffb30e8-9243-4743-9980-468a20952167",
            "eef1a8ea-c6a7-4233-982a-d426e1e7d8c9"
        ]
        ds = SubreadSet(pbtestdata.get_file("subreads-sequel"))

        def _add_barcoded_sample(sn, bn, id_):
            ds.metadata.collections[0].wellSample.bioSamples.addSample(sn)
            ds.metadata.collections[0].wellSample.bioSamples[
                -1].DNABarcodes.addBarcode(bn)
            ds.metadata.collections[0].wellSample.bioSamples[-1].DNABarcodes[
                -1].uniqueId = id_

        _add_barcoded_sample("Alice", "lbc1--lbc1", bc_uuids[0])
        _add_barcoded_sample("Charles", "lbc3--lbc3", bc_uuids[1])
        tmp_ds = op.join(tmp_dir, "input.subreadset.xml")
        ds.write(tmp_ds)
        for fn, bc, dsid in zip(files, barcodes, uuids):
            ds = SubreadSet(tmp_ds)
            ds.uuid = str(dsid)
            ds.name = ds.name + " ({b})".format(b=bc)
            ds.write(fn)
        ds_files = [
            DataStoreFile(dsid, "barcoding.tasks.lima-0",
                          FileTypes.DS_SUBREADS.file_type_id, fn)
            for (dsid, fn) in zip(uuids, files)
        ]
        ds = DataStore(ds_files)
        ds.write_json(datastore_tmp)
        base_dir = tempfile.mkdtemp()
        datastore = mock_update_barcoded_sample_metadata(
            base_dir, datastore_tmp, tmp_ds, barcodeset)
        validate_barcoded_datastore_files(self,
                                          tmp_ds,
                                          datastore,
                                          number_of_expected_filters=0)
 def test_merge_biosamples(self):
     import pbtestdata
     ds1 = pbtestdata.get_file("subreads-biosample-1")
     ds2 = pbtestdata.get_file("subreads-biosample-2")
     # Case 1: two biosamples
     ds = SubreadSet(ds1, ds2)
     samples = [bs.name for bs in ds.metadata.bioSamples]
     assert samples == ["Alice", "Bob"]
     # Case 2: same biosample in both files
     ds = SubreadSet(ds1, ds1)
     samples = [bs.name for bs in ds.metadata.bioSamples]
     assert samples == ["Alice"]
     assert len(ds.metadata.bioSamples[0].DNABarcodes) == 1
     # Case 3: same biosample, different barcodes
     dsTmp = SubreadSet(ds1)
     dsTmp.metadata.bioSamples[0].DNABarcodes[0].name = "F7--R7"
     tmpFile = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     dsTmp.write(tmpFile)
     ds = SubreadSet(ds1, tmpFile)
     samples = [bs.name for bs in ds.metadata.bioSamples]
     assert samples == ["Alice"]
     bcs = [bc.name for bc in ds.metadata.bioSamples[0].DNABarcodes]
     assert bcs == ["F1--R1", "F7--R7"]
 def test_merge_biosamples(self):
     import pbtestdata
     ds1 = pbtestdata.get_file("subreads-biosample-1")
     ds2 = pbtestdata.get_file("subreads-biosample-2")
     # Case 1: two biosamples
     ds = SubreadSet(ds1, ds2)
     samples = [bs.name for bs in ds.metadata.bioSamples]
     self.assertEqual(samples, ["Alice", "Bob"])
     # Case 2: same biosample in both files
     ds = SubreadSet(ds1, ds1)
     samples = [bs.name for bs in ds.metadata.bioSamples]
     self.assertEqual(samples, ["Alice"])
     self.assertEqual(len(ds.metadata.bioSamples[0].DNABarcodes), 1)
     # Case 3: same biosample, different barcodes
     dsTmp = SubreadSet(ds1)
     dsTmp.metadata.bioSamples[0].DNABarcodes[0].name = "F7--R7"
     tmpFile = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     dsTmp.write(tmpFile)
     ds = SubreadSet(ds1, tmpFile)
     samples = [bs.name for bs in ds.metadata.bioSamples]
     self.assertEqual(samples, ["Alice"])
     bcs = [bc.name for bc in ds.metadata.bioSamples[0].DNABarcodes]
     self.assertEqual(bcs, ["F1--R1", "F7--R7"])
Example #39
def run(subreadset, fofn):
    dir_name = os.getcwd()
    maxChunks = 0
    dset = SubreadSet(subreadset, strict=True)
    fns = dset.toFofn()
    import pprint
    log.info('resources in {!r}:\n{}'.format(subreadset, pprint.pformat(fns)))
    nrecs = len(dset)
    # HG with 70x coverage => 200G bases total
    ts = 50000  # @ 20k/read => 1G bases, ~300MB .gz => ~200 chunks for Human
    ts = 500000  # @ 20k/read => 10G bases, ~3GB .gz => ~20 chunks for Human
    # and we expect about 7-10min per chunk.
    chunks = nrecs // ts
    log.info('num_chunks={:g} ({:g} / {:g})'.format(chunks, nrecs, ts))
    log.info(
        'Splitting with dset.split(zmws=False, chunks={}, ignoreSubDatasets=True, maxChunks={},)'
        .format(chunks, maxChunks))
    dset_chunks = dset.split(
        zmws=False,
        chunks=chunks,
        ignoreSubDatasets=True,
        maxChunks=maxChunks,
        updateCounts=False,
        #targetSize=1, breakContigs=True
    )

    chunk_fns = []
    for i, dset in enumerate(dset_chunks):
        chunk_name = 'chunk_{:03d}.subreadset.xml'.format(i)  # TODO: 02
        chunk_fn = os.path.join(dir_name, chunk_name)
        dset.updateCounts()
        dset.write(chunk_fn, validate=False)  # , relPaths=True
        chunk_fns.append(chunk_fn)
    with open(fofn, 'w') as ofs:
        for fn in chunk_fns:
            ofs.write('{}\n'.format(fn))
    log.info('Wrote {} chunks into "{}"'.format(len(dset_chunks), fofn))
    def test_loadMetadata(self):
        aln = AlignmentSet(data.getXml(no=8))
        self.assertFalse(aln.metadata.collections)
        aln.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                         'SA3-Sequel/lambda/roche_SAT/'
                         'm54013_151205_032353.run.metadata.xml')
        self.assertTrue(aln.metadata.collections)
        sset_fn = ('/pbi/dept/secondary/siv/testdata/'
                'SA3-Sequel/lambda/roche_SAT/'
                'm54013_151205_032353.subreadset.xml')
        sset = SubreadSet(sset_fn)
        orig_metadata = copy.deepcopy(sset.metadata)
        sset.metadata.collections = None
        self.assertFalse(sset.metadata.collections)
        sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                          'SA3-Sequel/lambda/roche_SAT/'
                          'm54013_151205_032353.run.metadata.xml')
        stack = zip(sset.metadata, orig_metadata)
        fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
        sset.write(fn)
        validateFile(fn)
        validateFile(sset_fn)
        self.assertEqual(sset.metadata, orig_metadata)


        # load the wrong thing...
        sset_fn = ('/pbi/dept/secondary/siv/testdata/'
                'SA3-Sequel/lambda/roche_SAT/'
                'm54013_151205_032353.subreadset.xml')
        sset = SubreadSet(sset_fn)
        orig_metadata = copy.deepcopy(sset.metadata)
        sset.metadata.collections = None
        self.assertFalse(sset.metadata.collections)
        with self.assertRaises(InvalidDataSetIOError):
            sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                              'SA3-Sequel/lambda/roche_SAT/'
                              'm54013_151205_032353.sts.xml')
 def setUpClass(cls):
     ds = SubreadSet(pbcore.data.getUnalignedBam(), strict=True)
     ds.write(cls.INPUT_FILES[0])
     super(TestBam2Fasta, cls).setUpClass()
 def test_get_dataset_metatype(self):
     ds = SubreadSet(upstreamdata.getUnalignedBam(), strict=True)
     ds_file = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     ds.write(ds_file)
     meta_type = getDataSetMetaType(ds_file)
     self.assertEqual(meta_type, "PacBio.DataSet.SubreadSet")
 def setUpClass(cls):
     ds = SubreadSet(pbcore.data.getUnalignedBam(), strict=True)
     ds.filters.addRequirement(length=[('>=', 1000)])
     ds.write(cls.INPUT_FILES[0])
     super(TestBam2FastaFiltered, cls).setUpClass()
class SmrtCell(object):
    
    '''
    Initializes a new SmrtCell object from a SMRT Cell XML file.
    @param xml_file: the path to the subreadset.xml file of a SMRT Cell
    '''
    def __init__(self,xml_file):
        self.__logger = logging.getLogger('support.smrtcell')
        self.__is_valid = False
        
        self.__xml_file = check_file(xml_file)
        if not self.__xml_file:
            self.show_log('error', 'XML file '+xml_file+' does not exist or is not a file!')
            return
        
        # TODO: read xml content from encrypted file
        self.__subreadset = None
        try:
            self.__subreadset = SubreadSet(self.__xml_file)
        except IOError as err:
            self.show_log('error', 'Parsing of XML file '+self.__xml_file+' was not successful: '+str(err)+'!')
            return
        
        self.__is_valid = True
        

    '''
    Tests if the SmrtCell object is valid.
    @return: true if the SmrtCell object is valid, otherwise false
    @rtype: bool
    '''    
    def is_valid(self):
        return self.__is_valid

    '''
    Returns the name of the SmrtCell object.
    @return: the name
    @rtype: str
    '''    
    def get_name(self):
        return self.__subreadset.name if self.__is_valid else None

    '''
    Returns the total number of reads in the SmrtCell object.
    @return: the number of reads
    @rtype: integer
    '''    
    def get_total_number_of_reads(self):
        return int(self.__subreadset.metadata.numRecords) if self.__is_valid else None

    '''
    Returns the total number of bp in the SmrtCell object.
    @return: the number of bp
    @rtype: integer
    '''    
    def get_total_sum_of_bp(self):
        return int(self.__subreadset.metadata.totalLength) if self.__is_valid else None

    '''
    Returns the number of collections ('sequencing runs') in the SmrtCell object.
    Should be 1 in almost all cases. If not, all other functions have an optional argument 
    to specify the collection.
    @return: the number of sequencing runs
    @rtype: integer
    '''    
    def get_number_of_collections(self):
        return len(self.__subreadset.metadata.collections) if self.__is_valid else None
 
    '''
    Checks if a provided collection index is valid, i.e. can be used to access a collection.
    @param collection_index: the index of the collection
    @return: true if collection index is valid otherwise false
    @rtype: bool
    '''    
    def check_collection_index(self,collection_index):
        return self.__is_valid and collection_index >= 0 and collection_index < len(self.__subreadset.metadata.collections)

    '''
    Returns the names of the samples that were loaded onto this SmrtCell.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: a list with sample names
    @rtype: list of str
    '''    
    def get_biosample_names(self,collection_index=0):
        biosample_names = []
        if self.__is_valid:
            assert self.check_collection_index(collection_index),'Specified collection index is invalid!'
            num_biosamples = len(self.__subreadset.metadata.collections[collection_index].wellSample.bioSamples)
            for i in range(0,num_biosamples):
                biosample_names.append(self.__subreadset.metadata.collections[collection_index].wellSample.bioSamples[i].name)
        return biosample_names
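
    # Usage sketch (not part of the class): construct a SmrtCell from a
    # subreadset.xml path and read a few fields; the path is a placeholder.
    #
    #   cell = SmrtCell("/path/to/movie.subreadset.xml")
    #   if cell.is_valid():
    #       print(cell.get_name(), cell.get_total_number_of_reads())
    #       print(cell.get_biosample_names(collection_index=0))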

    '''
    Returns the cell index.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the cell index
    @rtype: integer
    '''    
    def get_cell_index(self,collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index),'Specified collection index is invalid!'
            return int(self.__subreadset.metadata.collections[collection_index].cellIndex)
        else:
            return None

    '''
    Returns the collection number.
    Do not confuse with collection index.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the collection number
    @rtype: integer
    '''    
    def get_collection_number(self,collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index),'Specified collection index is invalid!'
            return int(self.__subreadset.metadata.collections[collection_index].collectionNumber)
        else:
            return None
        
    '''
    Returns the raw data path.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the raw data path
    @rtype: str
    '''    
    def get_raw_data_path(self,collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index),'Specified collection index is invalid!'
            return get_absolute_path(self.__subreadset.metadata.collections[collection_index].primary.outputOptions.collectionPathUri)
        else:
            return None
    
    '''
    Returns the run id.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the run id
    @rtype: str
    '''    
    def get_run_id(self,collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index),'Specified collection index is invalid!'
            return self.__subreadset.metadata.collections[collection_index].runDetails.timeStampedName
        else:
            return None

    '''
    Returns the run name.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the run name
    @rtype: str
    '''    
    def get_run_name(self,collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index),'Specified collection index is invalid!'
            return self.__subreadset.metadata.collections[collection_index].runDetails.name
        else:
            return None

    '''
    Returns the cellpac barcode.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the cellpac barcode
    @rtype: str
    '''    
    def get_cellpac_barcode(self,collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index),'Specified collection index is invalid!'
            return self.__subreadset.metadata.collections[collection_index].cellPac.barcode
        else:
            return None

    '''
    Returns the cellpac lot number.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the cellpac lot number
    @rtype: str
    '''    
    def get_cellpac_lot_number(self,collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index),'Specified collection index is invalid!'
            return self.__subreadset.metadata.collections[collection_index].cellPac.lotNumber
        else:
            return None
 
    '''
    Returns the instrument code.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the instrument code
    @rtype: str
    '''    
    def get_instrument_code(self,collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index),'Specified collection index is invalid!'
            return self.__subreadset.metadata.collections[collection_index].instrumentName
        else:
            return None
 
    '''
    Returns the sequencing date.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the sequencing date as string
    @rtype: str
    '''    
    def get_sequencing_date(self,collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index),'Specified collection index is invalid!'
            val = self.__subreadset.metadata.collections[collection_index].createdAt
            reduced_date_string = val.split(".")[0]
            return datetime.strptime(reduced_date_string, "%Y-%m-%dT%H:%M:%S").strftime('%Y-%m-%d')
        else:
            return None
 
    '''
    Returns the well name.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the well name
    @rtype: str
    '''    
    def get_well_name(self,collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index),'Specified collection index is invalid!'
            return self.__subreadset.metadata.collections[collection_index].wellSample.name
        else:
            return None

    '''
    Returns the concentration.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the concentration
    @rtype: float
    '''    
    def get_concentration(self,collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index),'Specified collection index is invalid!'
            return float(self.__subreadset.metadata.collections[collection_index].wellSample.concentration)
        else:
            return None
    
    '''
    Returns the UseCount property.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the UseCount property
    @rtype: str
    '''    
    def get_usecount(self,collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index),'Specified collection index is invalid!'
            return self.__subreadset.metadata.collections[collection_index].wellSample.useCount
        else:
            return None
    
    '''
    Returns the instrument control software version.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the instrument control software version
    @rtype: str
    '''    
    def get_instrument_control_software_version(self,collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index),'Specified collection index is invalid!'
            return self.__subreadset.metadata.collections[collection_index].instCtrlVer
        else:
            return None

    '''
    Returns the signal processing software version.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the signal processing software version
    @rtype: str
    '''    
    def get_signal_processing_software_version(self,collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index),'Specified collection index is invalid!'
            return self.__subreadset.metadata.collections[collection_index].sigProcVer
        else:
            return None

    '''
    Returns the notes (i.e. additional free text description).
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the notes
    @rtype: str
    '''    
    def get_notes(self,collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index),'Specified collection index is invalid!'
            return self.__subreadset.metadata.collections[collection_index].wellSample.description
        else:
            return None
        
    '''
    Returns the automation parameters.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: a hash with key - value pairs
    @rtype: dict of str
    '''     
    def get_automation_parameters(self,collection_index=0):
        automation_parameters = {}
        if self.__is_valid:
            assert self.check_collection_index(collection_index),'Specified collection index is invalid!'
            collection = self.__subreadset.metadata.collections[collection_index]
            for i in range(len(collection.automation.automationParameters)):
                hashed_data = collection.automation.automationParameters[i].metadata
                automation_parameters[hashed_data['Name']] = hashed_data['SimpleValue']
        return automation_parameters
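
    # Usage sketch (hypothetical: assumes the enclosing class is the SmrtCell
    # wrapper referenced in the docstrings and is constructed from a
    # subreadset XML path):
    #   cell = SmrtCell("movie.subreadset.xml")
    #   params = cell.get_automation_parameters()
    #   movie_minutes = int(params.get("MovieLength", 0))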
    
    '''
    Returns the movie length in minutes.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the movie length in minutes
    @rtype: integer
    '''    
    def get_movie_length(self,collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index),'Specified collection index is invalid!'
            automation_parameters = self.get_automation_parameters(collection_index)
            return int(automation_parameters['MovieLength'])
        else:
            return None
    
    '''
    Returns the immobilization time in minutes.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the immobilization time in minutes
    @rtype: integer
    '''    
    def get_immobilisation_time(self,collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index),'Specified collection index is invalid!'
            automation_parameters = self.get_automation_parameters(collection_index)
            return int(automation_parameters['ImmobilizationTime'])
        else:
            return None

    '''
    Returns the insert size.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the insert size
    @rtype: integer
    '''    
    def get_insert_size(self,collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index),'Specified collection index is invalid!'
            automation_parameters = self.get_automation_parameters(collection_index)
            return int(automation_parameters['InsertSize'])
        else:
            return None

    '''
    Returns True if hot start was enabled, otherwise False.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: True if hot start was enabled, False otherwise
    @rtype: bool
    '''    
    def get_stage_hotstart_enabled(self,collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index),'Specified collection index is invalid!'
            value = self.__subreadset.metadata.collections[collection_index].wellSample.stageHotstartEnabled
            # The value may come back from the XML as the string "true"/"false",
            # in which case bool() would always be truthy; compare the text instead.
            return str(value).strip().lower() == 'true'
        else:
            return None
    
    '''
    Returns the primary protocol name
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the primary protocol name
    @rtype: str
    '''    
    def get_primary_protocol_name(self,collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index),'Specified collection index is invalid!'
            return self.__subreadset.metadata.collections[collection_index].primary.automationName
        else:
            return None

    '''
    Returns the primary protocol config
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the primary protocol config
    @rtype: str
    '''    
    def get_primary_protocol_config(self,collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index),'Specified collection index is invalid!'
            return self.__subreadset.metadata.collections[collection_index].primary.configFileName
        else:
            return None

    '''
    Returns the adapter sequences used in template prep
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: a dictionary with left adapter sequence and right adapter sequence
    @rtype: dict of str
    '''    
    def get_adapter_sequences(self,collection_index=0):
        adapter_sequences = {}
        if self.__is_valid:
            assert self.check_collection_index(collection_index),'Specified collection index is invalid!'
            adapter_sequences['LeftAdapterSequence'] = self.__subreadset.metadata.collections[collection_index].templatePrepKit.leftAdaptorSequence
            adapter_sequences['RightAdapterSequence'] = self.__subreadset.metadata.collections[collection_index].templatePrepKit.rightAdaptorSequence
        return adapter_sequences
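
    # Usage sketch (hypothetical SmrtCell instance):
    #   adapters = cell.get_adapter_sequences()
    #   left = adapters["LeftAdapterSequence"]
    #   right = adapters["RightAdapterSequence"]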

    '''
    Returns the sample name. Deprecated, use get_biosample_names instead.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the sample name
    @rtype: str
    '''    
    def get_sample_name(self,collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index),'Specified collection index is invalid!'
            return self.__subreadset.metadata.collections[collection_index].wellSample.name
        else:
            return None   
        
    '''
    Returns the name of the Sequel binding kit.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the binding kit name
    @rtype: str
    '''    
    def get_binding_kit_name(self,collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index),'Specified collection index is invalid!'
            name = self.__subreadset.metadata.collections[collection_index].bindingKit.name
            return normalize('NFKD', name).encode('ascii', 'ignore')
        else:
            return None
                
    '''
    Returns the name of the Sequel template prep kit.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the template prep kit name
    @rtype: str
    '''    
    def get_template_prep_kit_name(self,collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index),'Specified collection index is invalid!'
            name = self.__subreadset.metadata.collections[collection_index].templatePrepKit.name
            return normalize('NFKD', name).encode('ascii', 'ignore')
        else:
            return None

    '''
    Returns the name of the Sequel sequencing plate kit.
    @param collection_index: the index of the collection (optional, zero-based, default: 0)
    @return: the sequencing plate kit name
    @rtype: str
    '''    
    def get_sequencing_plate_kit_name(self,collection_index=0):
        if self.__is_valid:
            assert self.check_collection_index(collection_index),'Specified collection index is invalid!'
            name = self.__subreadset.metadata.collections[collection_index].sequencingKitPlate.name
            return normalize('NFKD', name).encode('ascii', 'ignore')
        else:
            return None
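
    # Note on the three kit-name getters above: normalize('NFKD', ...) followed
    # by .encode('ascii', 'ignore') strips non-ASCII characters, but under
    # Python 3 it returns a bytes object; callers expecting text may need to
    # append .decode('ascii').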
                    
    '''
    Make all paths encoded in the SmrtCell object relative.
    @param outdir: a directory from which the paths should originate (optional, default: ".")
    '''    
    def make_paths_relative(self,outdir="."):
        if self.__is_valid:
            self.__subreadset.makePathsRelative(outdir)

    '''
    Make all paths encoded in the SmrtCell object absolute.
    '''    
    def make_paths_absolute(self):
        if self.__is_valid:
            self.__subreadset.makePathsAbsolute()
    
    '''
    Write new (subreadset) xml file for SmrtCell object.
    @param filename: a file name
    '''
    def write(self,filename):
        if self.__is_valid:
            self.__subreadset.write(filename)
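
    # Usage sketch (hypothetical SmrtCell instance and output path):
    #   cell.make_paths_absolute()
    #   cell.write("relocated.subreadset.xml")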
        
    
    '''
    Helper function for log messages
    @param level: the log level (debug, info, warning, error, critical)
    @param message: the log message
    '''    
    def show_log(self, level, message):
        if level == 'debug':
            self.__logger.debug(message)
        elif level == 'info':
            self.__logger.info(message)
        elif level == 'warning':
            self.__logger.warning(message)
        elif level == 'error':
            self.__logger.error(message)
        elif level == 'critical':
            self.__logger.critical(message)
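
    # Alternative sketch (not part of the original class): assuming only the five
    # level names above are ever passed in, the same dispatch can be written as
    #   self.__logger.log(getattr(logging, level.upper()), message)
    # which requires the standard 'logging' module to be imported.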
Exemple #46
0
 def setUpClass(cls):
     ds = SubreadSet(data.getXml(10), strict=True)
     ds.write(cls.INPUT_FILES[0])
 def setUpClass(cls):
     ds = SubreadSet(cls.SRC_FILE, strict=True)
     ds.filters.addRequirement(length=[('>=', 1000)])
     ds.write(cls.INPUT_FILES[0])
     super(TestBam2FastqFiltered, cls).setUpClass()
 def setUpClass(cls):
     ds = SubreadSet(cls.SRC_FILE, strict=True)
     ds.write(cls.INPUT_FILES[0])
     super(TestBam2FastaArchive, cls).setUpClass()
 def setUpClass(cls):
     ds = SubreadSet(pbcore.data.getUnalignedBam(), strict=True)
     ds.write(cls.INPUT_FILES[0])
Exemple #50
0
 def setUpClass(cls):
     ds = SubreadSet(pbcore.data.getUnalignedBam(), strict=True)
     ds.filters.addRequirement(length=[('>=', 1000)])
     ds.write(cls.INPUT_FILES[0])
     super(TestBam2FastaFiltered, cls).setUpClass()
Exemple #51
0
 def test_get_dataset_metatype(self):
     ds = SubreadSet(upstreamdata.getUnalignedBam(), strict=True)
     ds_file = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     ds.write(ds_file)
     meta_type = getDataSetMetaType(ds_file)
     assert meta_type == "PacBio.DataSet.SubreadSet"
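     # Usage sketch (hypothetical paths; mirrors the assertion above):
     #   ds = SubreadSet("input.subreads.bam", strict=True)
     #   ds.write("input.subreadset.xml")
     #   assert getDataSetMetaType("input.subreadset.xml") == "PacBio.DataSet.SubreadSet"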
 def setUpClass(cls):
     ds = SubreadSet(data.getXml(10), strict=True)
     ds.write(cls.INPUT_FILES[0])