def _loadReadGroupInfo(self):
    """
    Read the "RG" entries of the peer's header and derive from them
    the read group table (`_readGroupTable`), an ID -> row lookup
    (`_readGroupDict`) and the pulse features common to all read
    groups (`_pulseFeaturesAvailable`).
    """
    headerReadGroups = self.peer.header["RG"]
    rows = []
    # Start with every known pulse feature tag; each read group then
    # drops the tags that are absent from its DS field.
    sharedPulseFeatures = frozenset(PULSE_FEATURE_TAGS.keys())

    for entry in headerReadGroups:
        # Regarding RG ID: BLASR currently outputs a hex digest of
        # 10 nibbles, instead of the 8 which would fit into a
        # 32-bit word.  So we truncate here for the purposes of
        # cross-referencing within this API and the PacBioBamIndex
        # API.  A collision is checked for once the table is built.
        numericID = int(entry["ID"][:8], 16)
        movieName = entry["PU"]
        # DS is a ";"-separated list of KEY=VALUE items; empty items
        # are skipped.
        description = dict([item.split("=")
                            for item in entry["DS"].split(";")
                            if item != ""])
        chemistry = decodeTriple(description["BINDINGKIT"],
                                 description["SEQUENCINGKIT"],
                                 description["SOFTWAREVERSION"])
        readType = description["READTYPE"]
        rows.append((numericID, movieName, readType, chemistry))
        sharedPulseFeatures = sharedPulseFeatures.intersection(
            description.keys())

    tableDtype = [("ID",                  np.uint32),
                  ("MovieName",           "O"),
                  ("ReadType",            "O"),
                  ("SequencingChemistry", "O")]
    self._readGroupTable = np.rec.fromrecords(rows, dtype=tableDtype)

    distinctIDs = set(self._readGroupTable.ID)
    assert len(distinctIDs) == len(self._readGroupTable), \
        "First 8 chars of read group IDs must be unique!"

    self._readGroupDict = dict((row.ID, row)
                               for row in self._readGroupTable)
    self._pulseFeaturesAvailable = sharedPulseFeatures
def _loadReadGroupInfo(self):
    """
    Populate the read group bookkeeping from the "RG" entries of the
    peer's header.

    Attributes set on `self`:
      - `_baseFeatureNameMappings`, `_pulseFeatureNameMappings`:
        RGID -> ("abstract feature name" -> actual feature name)
      - `_readGroupTable`: record array with one row per read group
      - `_readGroupDict`: read group ID -> row of `_readGroupTable`
      - `_baseFeaturesAvailable`, `_pulseFeaturesAvailable`:
        feature names present in every read group

    Raises KeyError if a read group lacks a required header or DS
    entry, and AssertionError if two read groups map to the same ID.
    """
    rgs = self.peer.header["RG"]
    readGroupTable_ = []

    # RGID -> ("abstract feature name" -> actual feature name)
    self._baseFeatureNameMappings = {}
    self._pulseFeatureNameMappings = {}

    for rg in rgs:
        rgID = rgAsInt(rg["ID"])
        rgName = rg["PU"]
        # DS is a ";"-separated list of KEY=VALUE items; empty items
        # are skipped.
        ds = dict([pair.split("=")
                   for pair in rg["DS"].split(";")
                   if pair != ""])
        rgReadType = ds["READTYPE"]
        # Defaults used for TRANSCRIPT read groups, for which the
        # frame rate and chemistry entries are not consulted.
        rgChem = "unknown"
        rgFrameRate = 0.0
        if rgReadType != "TRANSCRIPT":
            rgFrameRate = ds["FRAMERATEHZ"]
            # spec: we only consider first two components of
            # basecaller version in "chem" lookup
            basecallerVersion = ".".join(
                ds["BASECALLERVERSION"].split(".")[0:2])
            triple = ds["BINDINGKIT"], ds["SEQUENCINGKIT"], basecallerVersion
            rgChem = decodeTriple(*triple)

        # Look for the features manifest entries within the DS tag,
        # and build an "indirection layer", i.e. to get from
        # "Ipd" to "Ipd:Frames"
        # (This is a bit messy.  Can we separate the manifest from
        # the rest of the DS content?)
        baseFeatureNameMapping = {key.split(":")[0]: key
                                  for key in ds
                                  if key in BASE_FEATURE_TAGS}
        pulseFeatureNameMapping = {key.split(":")[0]: key
                                   for key in ds
                                   if key in PULSE_FEATURE_TAGS}
        self._baseFeatureNameMappings[rgID] = baseFeatureNameMapping
        self._pulseFeatureNameMappings[rgID] = pulseFeatureNameMapping

        # Iterating a dict yields its keys on both Python 2 and 3;
        # dict.iterkeys() no longer exists on Python 3.
        readGroupTable_.append((rgID, rgName, rgReadType, rgChem,
                                rgFrameRate,
                                frozenset(baseFeatureNameMapping)))

    self._readGroupTable = np.rec.fromrecords(
        readGroupTable_,
        dtype=[("ID",                  np.int32),
               ("MovieName",           "O"),
               ("ReadType",            "O"),
               ("SequencingChemistry", "O"),
               ("FrameRate",           float),
               ("BaseFeatures",        "O")])
    assert len(set(self._readGroupTable.ID)) == len(self._readGroupTable), \
        "First 8 chars of read group IDs must be unique!"

    self._readGroupDict = {rg.ID: rg for rg in self._readGroupTable}

    # The base/pulse features "available" to clients of this file are
    # the intersection of features available from each read group.
    # NOTE: set.intersection needs at least one argument, so this
    # raises TypeError when the header lists no read groups.
    self._baseFeaturesAvailable = set.intersection(
        *[set(mapping.keys())
          for mapping in self._baseFeatureNameMappings.values()])
    self._pulseFeaturesAvailable = set.intersection(
        *[set(mapping.keys())
          for mapping in self._pulseFeatureNameMappings.values()])
def _loadReadGroupInfo(self):
    """
    Read the "RG" entries of the peer's header and derive from them
    the read group table (`_readGroupTable`), an ID -> row lookup
    (`_readGroupDict`) and the pulse features common to all read
    groups (`_pulseFeaturesAvailable`).
    """
    headerReadGroups = self.peer.header["RG"]
    rows = []
    # Start with every known pulse feature tag; each read group then
    # drops the tags that are absent from its DS field.
    sharedPulseFeatures = frozenset(PULSE_FEATURE_TAGS.keys())

    for entry in headerReadGroups:
        numericID = rgAsInt(entry["ID"])
        movieName = entry["PU"]
        # DS is a ";"-separated list of KEY=VALUE items; empty items
        # are skipped.
        description = dict([item.split("=")
                            for item in entry["DS"].split(";")
                            if item != ""])
        # spec: we only consider first two components of basecaller
        # version in "chem" lookup
        versionParts = description["BASECALLERVERSION"].split(".")
        shortVersion = ".".join(versionParts[0:2])
        chemistry = decodeTriple(description["BINDINGKIT"],
                                 description["SEQUENCINGKIT"],
                                 shortVersion)
        readType = description["READTYPE"]
        # TODO(dalexander): need FRAMERATEHZ in RG::DS!  Until it is
        # there, the frame rate is hard-coded instead of being read
        # from description["FRAMERATEHZ"].
        frameRate = 75.0
        rows.append((numericID, movieName, readType, chemistry, frameRate))
        sharedPulseFeatures = sharedPulseFeatures.intersection(
            description.keys())

    tableDtype = [("ID",                  np.int32),
                  ("MovieName",           "O"),
                  ("ReadType",            "O"),
                  ("SequencingChemistry", "O"),
                  ("FrameRate",           float)]
    self._readGroupTable = np.rec.fromrecords(rows, dtype=tableDtype)

    distinctIDs = set(self._readGroupTable.ID)
    assert len(distinctIDs) == len(self._readGroupTable), \
        "First 8 chars of read group IDs must be unique!"

    self._readGroupDict = dict((row.ID, row)
                               for row in self._readGroupTable)
    self._pulseFeaturesAvailable = sharedPulseFeatures
def _loadReadGroupInfo(self):
    """
    Populate the read group bookkeeping from the "RG" entries of the
    peer's header.

    Attributes set on `self`:
      - `_baseFeatureNameMappings`, `_pulseFeatureNameMappings`:
        RGID -> ("abstract feature name" -> actual feature name)
      - `_readGroupTable`: record array with one row per read group
      - `_readGroupDict`: read group ID -> row of `_readGroupTable`
      - `_baseFeaturesAvailable`, `_pulseFeaturesAvailable`:
        feature names present in every read group

    Raises KeyError if a read group lacks a required header or DS
    entry, and AssertionError if two read groups map to the same ID.
    """
    rgs = self.peer.header["RG"]
    readGroupTable_ = []

    # RGID -> ("abstract feature name" -> actual feature name)
    self._baseFeatureNameMappings = {}
    self._pulseFeatureNameMappings = {}

    for rg in rgs:
        rgID = rgAsInt(rg["ID"])
        rgName = rg["PU"]
        # DS is a ";"-separated list of KEY=VALUE items; empty items
        # are skipped.
        ds = dict([pair.split("=")
                   for pair in rg["DS"].split(";")
                   if pair != ""])
        # spec: we only consider first two components of basecaller
        # version in "chem" lookup
        basecallerVersion = ".".join(
            ds["BASECALLERVERSION"].split(".")[0:2])
        triple = ds["BINDINGKIT"], ds["SEQUENCINGKIT"], basecallerVersion
        rgChem = decodeTriple(*triple)
        rgReadType = ds["READTYPE"]
        rgFrameRate = ds["FRAMERATEHZ"]

        # Look for the features manifest entries within the DS tag,
        # and build an "indirection layer", i.e. to get from
        # "Ipd" to "Ipd:Frames"
        # (This is a bit messy.  Can we separate the manifest from
        # the rest of the DS content?)
        baseFeatureNameMapping = {key.split(":")[0]: key
                                  for key in ds
                                  if key in BASE_FEATURE_TAGS}
        pulseFeatureNameMapping = {key.split(":")[0]: key
                                   for key in ds
                                   if key in PULSE_FEATURE_TAGS}
        self._baseFeatureNameMappings[rgID] = baseFeatureNameMapping
        self._pulseFeatureNameMappings[rgID] = pulseFeatureNameMapping

        # Iterating a dict yields its keys on both Python 2 and 3;
        # dict.iterkeys() no longer exists on Python 3.
        readGroupTable_.append((rgID, rgName, rgReadType, rgChem,
                                rgFrameRate,
                                frozenset(baseFeatureNameMapping)))

    self._readGroupTable = np.rec.fromrecords(
        readGroupTable_,
        dtype=[("ID",                  np.int32),
               ("MovieName",           "O"),
               ("ReadType",            "O"),
               ("SequencingChemistry", "O"),
               ("FrameRate",           float),
               ("BaseFeatures",        "O")])
    assert len(set(self._readGroupTable.ID)) == len(self._readGroupTable), \
        "First 8 chars of read group IDs must be unique!"

    self._readGroupDict = {rg.ID: rg for rg in self._readGroupTable}

    # The base/pulse features "available" to clients of this file are
    # the intersection of features available from each read group.
    # NOTE: set.intersection needs at least one argument, so this
    # raises TypeError when the header lists no read groups.
    self._baseFeaturesAvailable = set.intersection(
        *[set(mapping.keys())
          for mapping in self._baseFeatureNameMappings.values()])
    self._pulseFeaturesAvailable = set.intersection(
        *[set(mapping.keys())
          for mapping in self._pulseFeatureNameMappings.values()])
def sequencingChemistry(self):
    """
    Return the name of the sequencing chemistry, resolving and caching
    it on first use.  Sources are tried in this order of preference:

      1) Barcode triple in file
      2) "SequencingChemistry" attr in file (chemistry override)
      3) metadata.xml companion file

    Raises ChemistryLookupError when none of the sources yields a
    chemistry.
    """
    # Already resolved by an earlier call.
    if self._sequencingChemistry is not None:
        return self._sequencingChemistry

    # 1) Barcode triple in file
    embeddedTriple = self._chemistryBarcodeTripleInFile
    if embeddedTriple is not None:
        self._sequencingChemistry = decodeTriple(*embeddedTriple)
        return self._sequencingChemistry

    # 2) Chemistry override attribute
    runInfoAttrs = self.file["/ScanData/RunInfo"].attrs
    if "SequencingChemistry" in runInfoAttrs:
        self._sequencingChemistry = runInfoAttrs["SequencingChemistry"]
        return self._sequencingChemistry

    # 3) metadata.xml companion file
    xmlTriple = self._chemistryBarcodeTripleFromMetadataXML
    if xmlTriple is None:
        raise ChemistryLookupError("Chemistry information could not be found for this file")
    self._sequencingChemistry = decodeTriple(*xmlTriple)
    return self._sequencingChemistry
def sequencingChemistry(self):
    """
    Find the name of the chemistry by consulting, in order of preference:

      1) Barcode triple in file
      2) "SequencingChemistry" attr in file (chemistry override)
      3) metadata.xml companion file

    The result is cached in `self._sequencingChemistry`, so the lookup
    runs only until a value has been found.

    Raises ChemistryLookupError when none of the sources yields a
    chemistry.
    """
    if self._sequencingChemistry is None:
        triple = self._chemistryBarcodeTripleInFile
        if triple is not None:
            self._sequencingChemistry = decodeTriple(*triple)
        elif "SequencingChemistry" in self.file["/ScanData/RunInfo"].attrs:
            self._sequencingChemistry = self.file["/ScanData/RunInfo"].attrs["SequencingChemistry"]
        else:
            tripleFromXML = self._chemistryBarcodeTripleFromMetadataXML
            if tripleFromXML is not None:
                self._sequencingChemistry = decodeTriple(*tripleFromXML)
            else:
                # Call form of raise: valid on Python 2 and 3, unlike
                # the Python 2-only `raise Exc, "message"` statement.
                raise ChemistryLookupError("Chemistry information could not be found for this file")
    return self._sequencingChemistry
def sequencingChemistry(self):
    """
    Return the sequencing chemistry information recorded under
    "/MovieInfo", resolving and caching it on first use.

    When "BindingKit", "SequencingKit" and "SoftwareVersion" are all
    present, one chemistry name is decoded per (kit, kit, version)
    entry and a list is returned.  Otherwise the stored
    "SequencingChemistry" entry is returned as-is.

    Raises ChemistryLookupError when neither form is present.
    """
    if self._sequencingChemistry is None:
        mi = self.file["/MovieInfo"]
        if (("BindingKit" in mi) and
            ("SequencingKit" in mi) and
            ("SoftwareVersion" in mi)):
            # New way
            self._sequencingChemistry = \
                [ decodeTriple(bk, sk, sv)
                  for (bk, sk, sv) in zip(
                          mi["BindingKit"],
                          mi["SequencingKit"],
                          mi["SoftwareVersion"]) ]
        elif "SequencingChemistry" in mi:
            # Old way
            # NOTE(review): if this is an h5py dataset, `.value` is
            # gone in h5py 3.x (`[()]` replaces it) -- confirm the
            # h5py version this file targets.
            self._sequencingChemistry = mi["SequencingChemistry"].value
        else:
            # Call form of raise: valid on Python 2 and 3, unlike
            # the Python 2-only `raise Exc, "message"` statement.
            raise ChemistryLookupError("Chemistry information could not be found in cmp.h5!")
    return self._sequencingChemistry
def sequencingChemistry(self):
    """
    Return the sequencing chemistry information recorded under
    "/MovieInfo", resolving and caching it on first use.

    When "BindingKit", "SequencingKit" and "SoftwareVersion" are all
    present, one chemistry name is decoded per (kit, kit, version)
    entry and a list is returned.  Otherwise the stored
    "SequencingChemistry" entry is returned as-is.

    Raises ChemistryLookupError when neither form is present.
    """
    if self._sequencingChemistry is None:
        # Copy the group into a plain dict (name -> member) before
        # probing it for the entries below.
        mi = dict(self.file["/MovieInfo"])
        if (("BindingKit" in mi) and
            ("SequencingKit" in mi) and
            ("SoftwareVersion" in mi)):
            # New way
            self._sequencingChemistry = \
                [ decodeTriple(bk, sk, sv)
                  for (bk, sk, sv) in zip(
                          mi["BindingKit"],
                          mi["SequencingKit"],
                          mi["SoftwareVersion"]) ]
        elif "SequencingChemistry" in mi:
            # Old way
            # NOTE(review): if this is an h5py dataset, `.value` is
            # gone in h5py 3.x (`[()]` replaces it) -- confirm the
            # h5py version this file targets.
            self._sequencingChemistry = mi["SequencingChemistry"].value
        else:
            # Call form of raise: valid on Python 2 and 3, unlike
            # the Python 2-only `raise Exc, "message"` statement.
            raise ChemistryLookupError("Chemistry information could not be found in cmp.h5!")
    return self._sequencingChemistry
def _loadReadGroupInfo(self):
    """
    Build `_readGroupTable` (one record per "RG" entry of the peer's
    header), `_readGroupDict` (read group ID -> record) and
    `_pulseFeaturesAvailable` (pulse feature tags found in the DS
    field of every read group).
    """
    rgEntries = self.peer.header["RG"]
    records = []
    # Narrowed down, one read group at a time, from the full set of
    # known pulse feature tags.
    commonFeatures = frozenset(PULSE_FEATURE_TAGS.keys())

    for rgEntry in rgEntries:
        idAsInt = rgAsInt(rgEntry["ID"])
        platformUnit = rgEntry["PU"]
        # Parse the ";"-separated KEY=VALUE items of DS, ignoring
        # empty items.
        dsFields = dict([kv.split("=")
                         for kv in rgEntry["DS"].split(";")
                         if kv != ""])
        # spec: we only consider first two components of basecaller
        # version in "chem" lookup
        twoPartVersion = ".".join(
            dsFields["BASECALLERVERSION"].split(".")[0:2])
        chemTriple = (dsFields["BINDINGKIT"],
                      dsFields["SEQUENCINGKIT"],
                      twoPartVersion)
        chemName = decodeTriple(*chemTriple)
        readTypeName = dsFields["READTYPE"]
        # TODO(dalexander): need FRAMERATEHZ in RG::DS!  Until it is
        # there, the frame rate is hard-coded instead of being read
        # from dsFields["FRAMERATEHZ"].
        frameRateHz = 75.0
        records.append(
            (idAsInt, platformUnit, readTypeName, chemName, frameRateHz))
        commonFeatures = commonFeatures.intersection(dsFields.keys())

    self._readGroupTable = np.rec.fromrecords(
        records,
        dtype=[("ID", np.int32),
               ("MovieName", "O"),
               ("ReadType", "O"),
               ("SequencingChemistry", "O"),
               ("FrameRate", float)])

    nUniqueIDs = len(set(self._readGroupTable.ID))
    assert nUniqueIDs == len(self._readGroupTable), \
        "First 8 chars of read group IDs must be unique!"

    byID = {}
    for record in self._readGroupTable:
        byID[record.ID] = record
    self._readGroupDict = byID
    self._pulseFeaturesAvailable = commonFeatures
def _loadReadGroupInfo(self):
    """
    Build `_readGroupTable` (one record per "RG" entry of the peer's
    header), `_readGroupDict` (read group ID -> record) and
    `_pulseFeaturesAvailable` (pulse feature tags found in the DS
    field of every read group).
    """
    rgEntries = self.peer.header["RG"]
    records = []
    # Narrowed down, one read group at a time, from the full set of
    # known pulse feature tags.
    commonFeatures = frozenset(PULSE_FEATURE_TAGS.keys())

    for rgEntry in rgEntries:
        # Regarding RG ID: BLASR currently outputs a hex digest of
        # 10 nibbles, instead of the 8 which would fit into a
        # 32-bit word.  So we truncate here for the purposes of
        # cross-referencing within this API and the PacBioBamIndex
        # API.  A collision is checked for once the table is built.
        hexPrefix = rgEntry["ID"][:8]
        idAsInt = int(hexPrefix, 16)
        platformUnit = rgEntry["PU"]
        # Parse the ";"-separated KEY=VALUE items of DS, ignoring
        # empty items.
        dsFields = dict([kv.split("=")
                         for kv in rgEntry["DS"].split(";")
                         if kv != ""])
        chemTriple = (dsFields["BINDINGKIT"],
                      dsFields["SEQUENCINGKIT"],
                      dsFields["SOFTWAREVERSION"])
        chemName = decodeTriple(*chemTriple)
        readTypeName = dsFields["READTYPE"]
        records.append((idAsInt, platformUnit, readTypeName, chemName))
        commonFeatures = commonFeatures.intersection(dsFields.keys())

    self._readGroupTable = np.rec.fromrecords(
        records,
        dtype=[("ID", np.uint32),
               ("MovieName", "O"),
               ("ReadType", "O"),
               ("SequencingChemistry", "O")])

    nUniqueIDs = len(set(self._readGroupTable.ID))
    assert nUniqueIDs == len(self._readGroupTable), \
        "First 8 chars of read group IDs must be unique!"

    byID = {}
    for record in self._readGroupTable:
        byID[record.ID] = record
    self._readGroupDict = byID
    self._pulseFeaturesAvailable = commonFeatures
def sequencingChemistry(self):
    """
    Return the result of decoding this object's chemistry barcode
    triple with `decodeTriple`.
    """
    # The elements of `chemistryBarcodeTriple` are passed to
    # decodeTriple as positional arguments.
    barcodeTriple = self.chemistryBarcodeTriple
    chemistry = decodeTriple(*barcodeTriple)
    return chemistry