Esempio n. 1
0
    def _loadReadGroupInfo(self):
        """
        Build the read group lookup structures from ``self.peer.header["RG"]``.

        Populates three attributes:

          - ``_readGroupTable``: record array with one row per read group
            and the fields ID, MovieName, ReadType, SequencingChemistry.
          - ``_readGroupDict``: maps each row's ID to the row itself.
          - ``_pulseFeaturesAvailable``: the PULSE_FEATURE_TAGS keys that
            appear as DS keys in every read group.

        Raises AssertionError if two read groups share the same
        (truncated) ID.
        """
        rgs = self.peer.header["RG"]
        readGroupTable_ = []
        pulseFeaturesInAll_ = frozenset(PULSE_FEATURE_TAGS.keys())
        for rg in rgs:
            # Regarding RG ID: BLASR currently outputs a hex digest of
            # 10 nibbles, instead of the 8 which would fit into a
            # 32-bit word.  So we truncate here for the purposes of
            # cross-referencing within this API and the PacBioBamIndex
            # API.  We do check for a collision below.
            rgID = int(rg["ID"][:8], 16)
            rgName = rg["PU"]
            # DS is a ";"-separated list of KEY=VALUE pairs.  Split on the
            # first "=" only, so a value that itself contains "=" does not
            # break the dict construction.
            ds = dict(pair.split("=", 1)
                      for pair in rg["DS"].split(";") if pair != "")
            rgChem = decodeTriple(ds["BINDINGKIT"],
                                  ds["SEQUENCINGKIT"],
                                  ds["SOFTWAREVERSION"])
            rgReadType = ds["READTYPE"]
            readGroupTable_.append((rgID, rgName, rgReadType, rgChem))
            # Keep only the pulse features that this read group also lists.
            pulseFeaturesInAll_ = pulseFeaturesInAll_.intersection(ds)

        self._readGroupTable = np.rec.fromrecords(
            readGroupTable_,
            dtype=[("ID"                 , np.uint32),
                   ("MovieName"          , "O"),
                   ("ReadType"           , "O"),
                   ("SequencingChemistry", "O")])
        assert len(set(self._readGroupTable.ID)) == len(self._readGroupTable), \
            "First 8 chars of read group IDs must be unique!"

        self._readGroupDict = { rg.ID : rg
                                for rg in self._readGroupTable }

        self._pulseFeaturesAvailable = pulseFeaturesInAll_
Esempio n. 2
0
    def _loadReadGroupInfo(self):
        """
        Build the read group lookup structures from ``self.peer.header["RG"]``.

        Populates:

          - ``_baseFeatureNameMappings`` / ``_pulseFeatureNameMappings``:
            read group ID -> (abstract feature name -> actual DS key).
          - ``_readGroupTable``: record array with one row per read group
            and the fields ID, MovieName, ReadType, SequencingChemistry,
            FrameRate, BaseFeatures.
          - ``_readGroupDict``: maps each row's ID to the row itself.
          - ``_baseFeaturesAvailable`` / ``_pulseFeaturesAvailable``: the
            abstract feature names present in every read group.

        Read groups whose READTYPE is "TRANSCRIPT" get chemistry "unknown"
        and frame rate 0.0; their kit/basecaller keys are not consulted.

        Raises AssertionError if two read groups map to the same ID.
        """
        rgs = self.peer.header["RG"]
        readGroupTable_ = []

        # RGID -> ("abstract feature name" -> actual feature name)
        self._baseFeatureNameMappings = {}
        self._pulseFeatureNameMappings = {}

        for rg in rgs:
            rgID = rgAsInt(rg["ID"])
            rgName = rg["PU"]
            # DS is a ";"-separated list of KEY=VALUE pairs.  Split on the
            # first "=" only, so a value that itself contains "=" does not
            # break the dict construction.
            ds = dict(pair.split("=", 1)
                      for pair in rg["DS"].split(";") if pair != "")
            rgReadType = ds["READTYPE"]
            rgChem = "unknown"
            rgFrameRate = 0.0
            if rgReadType != "TRANSCRIPT":
                # DS values are strings; convert explicitly so the value has
                # the same type as the 0.0 default above.
                rgFrameRate = float(ds["FRAMERATEHZ"])
                # spec: we only consider first two components of basecaller
                # version in "chem" lookup
                basecallerVersion = ".".join(ds["BASECALLERVERSION"].split(".")[0:2])
                rgChem = decodeTriple(ds["BINDINGKIT"],
                                      ds["SEQUENCINGKIT"],
                                      basecallerVersion)

            # Look for the features manifest entries within the DS tag,
            # and build an "indirection layer", i.e. to get from
            # "Ipd"  to "Ipd:Frames"
            # (This is a bit messy.  Can we separate the manifest from
            # the rest of the DS content?)
            baseFeatureNameMapping  = { key.split(":")[0] : key
                                        for key in ds
                                        if key in BASE_FEATURE_TAGS }
            pulseFeatureNameMapping = { key.split(":")[0] : key
                                        for key in ds
                                        if key in PULSE_FEATURE_TAGS }
            self._baseFeatureNameMappings[rgID]  = baseFeatureNameMapping
            self._pulseFeatureNameMappings[rgID] = pulseFeatureNameMapping

            # Iterating a dict yields its keys on both Python 2 and 3
            # (dict.iterkeys() exists only on Python 2).
            readGroupTable_.append((rgID, rgName, rgReadType, rgChem, rgFrameRate,
                                    frozenset(baseFeatureNameMapping)))

        self._readGroupTable = np.rec.fromrecords(
            readGroupTable_,
            dtype=[("ID"                 , np.int32),
                   ("MovieName"          , "O"),
                   ("ReadType"           , "O"),
                   ("SequencingChemistry", "O"),
                   ("FrameRate",           float),
                   ("BaseFeatures",        "O")])
        assert len(set(self._readGroupTable.ID)) == len(self._readGroupTable), \
            "First 8 chars of read group IDs must be unique!"

        self._readGroupDict = { rg.ID : rg
                                for rg in self._readGroupTable }

        # The base/pulse features "available" to clients of this file are the intersection
        # of features available from each read group.
        self._baseFeaturesAvailable = set.intersection(
            *[set(mapping.keys()) for mapping in self._baseFeatureNameMappings.values()])
        self._pulseFeaturesAvailable = set.intersection(
            *[set(mapping.keys()) for mapping in self._pulseFeatureNameMappings.values()])
Esempio n. 3
0
    def _loadReadGroupInfo(self):
        """
        Build the read group lookup structures from ``self.peer.header["RG"]``.

        Populates three attributes:

          - ``_readGroupTable``: record array with one row per read group
            and the fields ID, MovieName, ReadType, SequencingChemistry,
            FrameRate.
          - ``_readGroupDict``: maps each row's ID to the row itself.
          - ``_pulseFeaturesAvailable``: the PULSE_FEATURE_TAGS keys that
            appear as DS keys in every read group.

        The frame rate is currently a fixed 75.0 for every read group.

        Raises AssertionError if two read groups map to the same ID.
        """
        rgs = self.peer.header["RG"]
        readGroupTable_ = []
        pulseFeaturesInAll_ = frozenset(PULSE_FEATURE_TAGS.keys())
        for rg in rgs:
            rgID = rgAsInt(rg["ID"])
            rgName = rg["PU"]
            # DS is a ";"-separated list of KEY=VALUE pairs.  Split on the
            # first "=" only, so a value that itself contains "=" does not
            # break the dict construction.
            ds = dict(pair.split("=", 1)
                      for pair in rg["DS"].split(";") if pair != "")
            # spec: we only consider first two components of basecaller version
            # in "chem" lookup
            basecallerVersion = ".".join(ds["BASECALLERVERSION"].split(".")[0:2])
            rgChem = decodeTriple(ds["BINDINGKIT"],
                                  ds["SEQUENCINGKIT"],
                                  basecallerVersion)
            rgReadType = ds["READTYPE"]
            # TODO(dalexander): need FRAMERATEHZ in RG::DS!
            #rgFrameRate = ds["FRAMERATEHZ"]
            rgFrameRate = 75.0
            readGroupTable_.append((rgID, rgName, rgReadType, rgChem, rgFrameRate))
            # Keep only the pulse features that this read group also lists.
            pulseFeaturesInAll_ = pulseFeaturesInAll_.intersection(ds)

        self._readGroupTable = np.rec.fromrecords(
            readGroupTable_,
            dtype=[("ID"                 , np.int32),
                   ("MovieName"          , "O"),
                   ("ReadType"           , "O"),
                   ("SequencingChemistry", "O"),
                   ("FrameRate",           float)])
        assert len(set(self._readGroupTable.ID)) == len(self._readGroupTable), \
            "First 8 chars of read group IDs must be unique!"

        self._readGroupDict = { rg.ID : rg
                                for rg in self._readGroupTable }

        self._pulseFeaturesAvailable = pulseFeaturesInAll_
Esempio n. 4
0
    def _loadReadGroupInfo(self):
        """
        Build the read group lookup structures from ``self.peer.header["RG"]``.

        Populates:

          - ``_baseFeatureNameMappings`` / ``_pulseFeatureNameMappings``:
            read group ID -> (abstract feature name -> actual DS key).
          - ``_readGroupTable``: record array with one row per read group
            and the fields ID, MovieName, ReadType, SequencingChemistry,
            FrameRate, BaseFeatures.
          - ``_readGroupDict``: maps each row's ID to the row itself.
          - ``_baseFeaturesAvailable`` / ``_pulseFeaturesAvailable``: the
            abstract feature names present in every read group.

        Raises AssertionError if two read groups map to the same ID.
        """
        rgs = self.peer.header["RG"]
        readGroupTable_ = []

        # RGID -> ("abstract feature name" -> actual feature name)
        self._baseFeatureNameMappings = {}
        self._pulseFeatureNameMappings = {}

        for rg in rgs:
            rgID = rgAsInt(rg["ID"])
            rgName = rg["PU"]
            # DS is a ";"-separated list of KEY=VALUE pairs.  Split on the
            # first "=" only, so a value that itself contains "=" does not
            # break the dict construction.
            ds = dict(pair.split("=", 1)
                      for pair in rg["DS"].split(";") if pair != "")
            # spec: we only consider first two components of basecaller version
            # in "chem" lookup
            basecallerVersion = ".".join(ds["BASECALLERVERSION"].split(".")[0:2])
            rgChem = decodeTriple(ds["BINDINGKIT"],
                                  ds["SEQUENCINGKIT"],
                                  basecallerVersion)
            rgReadType = ds["READTYPE"]
            # DS values are strings; convert explicitly rather than relying
            # on the record array's float field to coerce the text.
            rgFrameRate = float(ds["FRAMERATEHZ"])

            # Look for the features manifest entries within the DS tag,
            # and build an "indirection layer", i.e. to get from
            # "Ipd"  to "Ipd:Frames"
            # (This is a bit messy.  Can we separate the manifest from
            # the rest of the DS content?)
            baseFeatureNameMapping  = { key.split(":")[0] : key
                                        for key in ds
                                        if key in BASE_FEATURE_TAGS }
            pulseFeatureNameMapping = { key.split(":")[0] : key
                                        for key in ds
                                        if key in PULSE_FEATURE_TAGS }
            self._baseFeatureNameMappings[rgID]  = baseFeatureNameMapping
            self._pulseFeatureNameMappings[rgID] = pulseFeatureNameMapping

            # Iterating a dict yields its keys on both Python 2 and 3
            # (dict.iterkeys() exists only on Python 2).
            readGroupTable_.append((rgID, rgName, rgReadType, rgChem, rgFrameRate,
                                    frozenset(baseFeatureNameMapping)))

        self._readGroupTable = np.rec.fromrecords(
            readGroupTable_,
            dtype=[("ID"                 , np.int32),
                   ("MovieName"          , "O"),
                   ("ReadType"           , "O"),
                   ("SequencingChemistry", "O"),
                   ("FrameRate",           float),
                   ("BaseFeatures",        "O")])
        assert len(set(self._readGroupTable.ID)) == len(self._readGroupTable), \
            "First 8 chars of read group IDs must be unique!"

        self._readGroupDict = { rg.ID : rg
                                for rg in self._readGroupTable }

        # The base/pulse features "available" to clients of this file are the intersection
        # of features available from each read group.
        self._baseFeaturesAvailable = set.intersection(
            *[set(mapping.keys()) for mapping in self._baseFeatureNameMappings.values()])
        self._pulseFeaturesAvailable = set.intersection(
            *[set(mapping.keys()) for mapping in self._pulseFeatureNameMappings.values()])
Esempio n. 5
0
 def sequencingChemistry(self):
     """
     Return the chemistry name, resolving and caching it on first use.

     Sources are tried in this order of preference:
       1) the barcode triple stored in the file
       2) the "SequencingChemistry" attribute in the file (chemistry
          override)
       3) the barcode triple from the metadata.xml companion file

     Raises ChemistryLookupError when none of the three is available.
     """
     # Fast path: a previous call already resolved the chemistry.
     if self._sequencingChemistry is not None:
         return self._sequencingChemistry

     inFileTriple = self._chemistryBarcodeTripleInFile
     if inFileTriple is not None:
         chemistry = decodeTriple(*inFileTriple)
     else:
         runInfoAttrs = self.file["/ScanData/RunInfo"].attrs
         if "SequencingChemistry" in runInfoAttrs:
             chemistry = runInfoAttrs["SequencingChemistry"]
         else:
             xmlTriple = self._chemistryBarcodeTripleFromMetadataXML
             if xmlTriple is None:
                 raise ChemistryLookupError("Chemistry information could not be found for this file")
             chemistry = decodeTriple(*xmlTriple)

     self._sequencingChemistry = chemistry
     return self._sequencingChemistry
Esempio n. 6
0
 def sequencingChemistry(self):
     """
     Find the name of the chemistry by consulting, in order of preference:
       1) Barcode triple in file
       2) "SequencingChemistry" attr in file (chemistry override)
       3) metadata.xml companion file

     The result is cached in self._sequencingChemistry, so the lookup
     runs only while that attribute is still None.

     Raises ChemistryLookupError when none of the sources is available.
     """
     if self._sequencingChemistry is None:
         triple = self._chemistryBarcodeTripleInFile
         if triple is not None:
             self._sequencingChemistry = decodeTriple(*triple)
         elif "SequencingChemistry" in self.file["/ScanData/RunInfo"].attrs:
             self._sequencingChemistry = self.file["/ScanData/RunInfo"].attrs["SequencingChemistry"]
         else:
             tripleFromXML = self._chemistryBarcodeTripleFromMetadataXML
             if tripleFromXML is not None:
                 self._sequencingChemistry = decodeTriple(*tripleFromXML)
             else:
                 # Call-form raise: valid on Python 2 and 3, unlike the
                 # Python-2-only "raise Exc, msg" statement.
                 raise ChemistryLookupError("Chemistry information could not be found for this file")
     return self._sequencingChemistry
Esempio n. 7
0
 def sequencingChemistry(self):
     """
     Return the sequencing chemistry recorded under "/MovieInfo".

     When the BindingKit, SequencingKit and SoftwareVersion entries are
     all present, the result is a list holding one decodeTriple() result
     per zipped (binding kit, sequencing kit, software version) entry.
     Otherwise the contents of the "SequencingChemistry" entry are
     returned.  The result is cached in self._sequencingChemistry.

     Raises ChemistryLookupError when neither source is present.
     """
     if self._sequencingChemistry is None:
         mi = self.file["/MovieInfo"]
         if (("BindingKit" in mi) and ("SequencingKit" in mi)
                 and ("SoftwareVersion" in mi)):
             # New way
             self._sequencingChemistry = \
                 [ decodeTriple(bk, sk, sv)
                   for (bk, sk, sv) in zip(
                           mi["BindingKit"],
                           mi["SequencingKit"],
                           mi["SoftwareVersion"]) ]
         elif "SequencingChemistry" in mi:
             # Old way.  Read the whole entry with [()]; the ".value"
             # accessor was removed in h5py 3.0.
             # NOTE(review): assumes this entry is an h5py Dataset -- confirm.
             self._sequencingChemistry = mi["SequencingChemistry"][()]
         else:
             # Call-form raise: valid on Python 2 and 3, unlike the
             # Python-2-only "raise Exc, msg" statement.
             raise ChemistryLookupError("Chemistry information could not be found in cmp.h5!")
     return self._sequencingChemistry
Esempio n. 8
0
 def sequencingChemistry(self):
     """
     Return the sequencing chemistry recorded under "/MovieInfo".

     The "/MovieInfo" entries are first copied into a plain dict.  When
     BindingKit, SequencingKit and SoftwareVersion are all present, the
     result is a list holding one decodeTriple() result per zipped
     (binding kit, sequencing kit, software version) entry.  Otherwise
     the contents of the "SequencingChemistry" entry are returned.  The
     result is cached in self._sequencingChemistry.

     Raises ChemistryLookupError when neither source is present.
     """
     if self._sequencingChemistry is None:
         mi = dict(self.file["/MovieInfo"])
         if (("BindingKit" in mi) and
             ("SequencingKit" in mi) and
             ("SoftwareVersion" in mi)):
             # New way
             self._sequencingChemistry = \
                 [ decodeTriple(bk, sk, sv)
                   for (bk, sk, sv) in zip(
                           mi["BindingKit"],
                           mi["SequencingKit"],
                           mi["SoftwareVersion"]) ]
         elif "SequencingChemistry" in mi:
             # Old way.  Read the whole entry with [()]; the ".value"
             # accessor was removed in h5py 3.0.
             # NOTE(review): assumes this entry is an h5py Dataset -- confirm.
             self._sequencingChemistry = mi["SequencingChemistry"][()]
         else:
             # Call-form raise: valid on Python 2 and 3, unlike the
             # Python-2-only "raise Exc, msg" statement.
             raise ChemistryLookupError("Chemistry information could not be found in cmp.h5!")
     return self._sequencingChemistry
Esempio n. 9
0
    def _loadReadGroupInfo(self):
        """
        Build the read group lookup structures from ``self.peer.header["RG"]``.

        Populates three attributes:

          - ``_readGroupTable``: record array with one row per read group
            and the fields ID, MovieName, ReadType, SequencingChemistry,
            FrameRate.
          - ``_readGroupDict``: maps each row's ID to the row itself.
          - ``_pulseFeaturesAvailable``: the PULSE_FEATURE_TAGS keys that
            appear as DS keys in every read group.

        The frame rate is currently a fixed 75.0 for every read group.

        Raises AssertionError if two read groups map to the same ID.
        """
        rgs = self.peer.header["RG"]
        readGroupTable_ = []
        pulseFeaturesInAll_ = frozenset(PULSE_FEATURE_TAGS.keys())
        for rg in rgs:
            rgID = rgAsInt(rg["ID"])
            rgName = rg["PU"]
            # DS is a ";"-separated list of KEY=VALUE pairs.  Split on the
            # first "=" only, so a value that itself contains "=" does not
            # break the dict construction.
            ds = dict(
                pair.split("=", 1) for pair in rg["DS"].split(";")
                if pair != "")
            # spec: we only consider first two components of basecaller version
            # in "chem" lookup
            basecallerVersion = ".".join(
                ds["BASECALLERVERSION"].split(".")[0:2])
            rgChem = decodeTriple(ds["BINDINGKIT"], ds["SEQUENCINGKIT"],
                                  basecallerVersion)
            rgReadType = ds["READTYPE"]
            # TODO(dalexander): need FRAMERATEHZ in RG::DS!
            #rgFrameRate = ds["FRAMERATEHZ"]
            rgFrameRate = 75.0
            readGroupTable_.append(
                (rgID, rgName, rgReadType, rgChem, rgFrameRate))
            # Keep only the pulse features that this read group also lists.
            pulseFeaturesInAll_ = pulseFeaturesInAll_.intersection(ds)

        self._readGroupTable = np.rec.fromrecords(readGroupTable_,
                                                  dtype=[
                                                      ("ID", np.int32),
                                                      ("MovieName", "O"),
                                                      ("ReadType", "O"),
                                                      ("SequencingChemistry",
                                                       "O"),
                                                      ("FrameRate", float)
                                                  ])
        assert len(set(self._readGroupTable.ID)) == len(self._readGroupTable), \
            "First 8 chars of read group IDs must be unique!"

        self._readGroupDict = {rg.ID: rg for rg in self._readGroupTable}

        self._pulseFeaturesAvailable = pulseFeaturesInAll_
Esempio n. 10
0
    def _loadReadGroupInfo(self):
        """
        Build the read group lookup structures from ``self.peer.header["RG"]``.

        Populates three attributes:

          - ``_readGroupTable``: record array with one row per read group
            and the fields ID, MovieName, ReadType, SequencingChemistry.
          - ``_readGroupDict``: maps each row's ID to the row itself.
          - ``_pulseFeaturesAvailable``: the PULSE_FEATURE_TAGS keys that
            appear as DS keys in every read group.

        Raises AssertionError if two read groups share the same
        (truncated) ID.
        """
        rgs = self.peer.header["RG"]
        readGroupTable_ = []
        pulseFeaturesInAll_ = frozenset(PULSE_FEATURE_TAGS.keys())
        for rg in rgs:
            # Regarding RG ID: BLASR currently outputs a hex digest of
            # 10 nibbles, instead of the 8 which would fit into a
            # 32-bit word.  So we truncate here for the purposes of
            # cross-referencing within this API and the PacBioBamIndex
            # API.  We do check for a collision below.
            rgID = int(rg["ID"][:8], 16)
            rgName = rg["PU"]
            # DS is a ";"-separated list of KEY=VALUE pairs.  Split on the
            # first "=" only, so a value that itself contains "=" does not
            # break the dict construction.
            ds = dict(
                pair.split("=", 1) for pair in rg["DS"].split(";")
                if pair != "")
            rgChem = decodeTriple(ds["BINDINGKIT"], ds["SEQUENCINGKIT"],
                                  ds["SOFTWAREVERSION"])
            rgReadType = ds["READTYPE"]
            readGroupTable_.append((rgID, rgName, rgReadType, rgChem))
            # Keep only the pulse features that this read group also lists.
            pulseFeaturesInAll_ = pulseFeaturesInAll_.intersection(ds)

        self._readGroupTable = np.rec.fromrecords(readGroupTable_,
                                                  dtype=[
                                                      ("ID", np.uint32),
                                                      ("MovieName", "O"),
                                                      ("ReadType", "O"),
                                                      ("SequencingChemistry",
                                                       "O")
                                                  ])
        assert len(set(self._readGroupTable.ID)) == len(self._readGroupTable), \
            "First 8 chars of read group IDs must be unique!"

        self._readGroupDict = {rg.ID: rg for rg in self._readGroupTable}

        self._pulseFeaturesAvailable = pulseFeaturesInAll_
Esempio n. 11
0
 def sequencingChemistry(self):
     """
     Return the result of passing the unpacked elements of
     self.chemistryBarcodeTriple to decodeTriple().
     """
     barcodeTriple = self.chemistryBarcodeTriple
     return decodeTriple(*barcodeTriple)
Esempio n. 12
0
 def sequencingChemistry(self):
     """
     Decode self.chemistryBarcodeTriple with decodeTriple() and return
     whatever that call produces.
     """
     tripleArgs = self.chemistryBarcodeTriple
     chemistry = decodeTriple(*tripleArgs)
     return chemistry