Example #1
0
    def _loadIpdTable(self, nullModelGroup):
        """
        Copy the null kinetic model out of *nullModelGroup* into shared memory.

        Populates self.sharedArray with the uint8 "KineticValues" dataset and
        self.floatLut with the contents of the "Lut" dataset.
        """
        kineticValues = nullModelGroup["KineticValues"]

        # The byte-typed shared buffer below is only valid for a uint8 dataset.
        assert kineticValues.dtype == uint8

        # Allocate a shared-memory byte array sized to the dataset, then read
        # the LUT straight into its numpy view (no intermediate copy).
        self.sharedArray = SharedArray('B', kineticValues.shape[0])
        target = self.sharedArray.getNumpyWrapper()
        kineticValues.read_direct(target)

        # Second-level LUT: read the whole dataset with a full slice.
        self.floatLut = nullModelGroup["Lut"][:]
Example #2
0
    def __init__(self, fastaRecords, modelFile=None, modelIterations=-1):
        """
        Load the reference sequences and the ipd lut into shared arrays that can be
        used as numpy arrays in worker processes.

        fastaRecords is a list of FastaRecords, in the cmp.h5 file order.
        modelFile, if given, is the path to the kinetic model HDF5 file;
        otherwise "kineticLut.h5" next to this module is used.
        modelIterations is passed through to GbmContextModel.
        """

        # Context-window geometry used when hashing sequence contexts.
        self.pre = 10
        self.post = 4

        # References are padded on both ends so windows near contig boundaries
        # can be sliced without bounds checks.
        self.pad = 30
        self.base4 = 4 ** np.array(range(self.pre + self.post + 1))

        self.refDict = {}
        self.refLengthDict = {}

        for contig in fastaRecords:
            if contig.id is None:
                # This contig has no mapped reads -- skip it
                continue

            rawSeq = contig.sequence
            # np.fromstring is deprecated (removed in modern NumPy);
            # frombuffer over the encoded bytes is the supported equivalent.
            refSeq = np.frombuffer(rawSeq.encode("utf-8"), dtype=byte)

            # Store the reference length
            self.refLengthDict[contig.id] = len(rawSeq)

            # Make a shared array holding the padded, encoded sequence
            sa = SharedArray(dtype='B', shape=len(rawSeq) + self.pad * 2)
            saWrap = sa.getNumpyWrapper()

            # Lut Codes convert Ns to As so that we don't put Ns into the Gbm Model
            # Seq Codes leaves Ns as Ns for getting reference snippets out
            innerLutCodes = lutCodeMap[refSeq]
            innerSeqCodes = seqCodeMap[refSeq]
            # Pack both 4-bit codes into one byte: low nibble = LUT code,
            # high nibble = sequence code.
            innerCodes = np.bitwise_or(innerLutCodes, np.left_shift(innerSeqCodes, 4))

            saWrap[self.pad:(len(rawSeq) + self.pad)] = innerCodes

            # Padding codes -- the lut array is padded with 0s the sequence array is padded with N's (4)
            outerCodes = np.left_shift(np.ones(self.pad, dtype=uint8) * 4, 4)
            saWrap[0:self.pad] = outerCodes
            saWrap[(len(rawSeq) + self.pad):(len(rawSeq) + 2 * self.pad)] = outerCodes

            self.refDict[contig.id] = sa

        # No correction factor for IPDs everything is normalized to 1
        self.meanIpd = 1

        # Find and open the ipd model file
        if modelFile:
            self.lutPath = modelFile
        else:
            self.lutPath = os.path.join(
                os.path.dirname(os.path.abspath(__file__)), "kineticLut.h5")

        if os.path.exists(self.lutPath):
            # NOTE(review): the h5 file is deliberately left open -- confirm
            # whether GbmContextModel reads the group lazily before closing it.
            h5File = h5py.File(self.lutPath, mode='r')

            gbmModelGroup = h5File["/AllMods_GbmModel"]
            self.gbmModel = GbmContextModel(gbmModelGroup, modelIterations)

            # We always use the model -- no more LUTS
            self.predictIpdFunc = self.predictIpdFuncModel
            self.predictManyIpdFunc = self.predictManyIpdFuncModel
        else:
            logging.info("Couldn't find model file: %s" % self.lutPath)
Example #3
0
    def __init__(self, fastaRecords, modelFile, modelIterations=-1):
        """
        Load the reference sequences and the ipd lut into shared arrays that can be
        used as numpy arrays in worker processes.
        fastaRecords is a list of FastaRecords, in the alignments file order
        """

        # Context-window geometry around the cognate base.
        self.pre = 10
        self.post = 4

        self.pad = 30
        self.base4 = 4 ** np.array(range(self.pre + self.post + 1))

        self.refDict = {}
        self.refLengthDict = {}

        for contig in fastaRecords:
            # A contig without an alignment id has no mapped reads -- skip it.
            if contig.alignmentID is None:
                continue

            rawSeq = contig.sequence[:]
            seqLen = len(rawSeq)
            refSeq = np.frombuffer(rawSeq.encode("utf-8"), dtype=byte)

            # Remember the (unpadded) reference length.
            self.refLengthDict[contig.alignmentID] = seqLen

            # Shared byte array: the encoded sequence plus pad on both ends.
            sa = SharedArray(dtype='B', shape=seqLen + 2 * self.pad)
            view = sa.getNumpyWrapper()

            # Lut Codes convert Ns to As so that we don't put Ns into the Gbm Model
            # Seq Codes leaves Ns as Ns for getting reference snippets out
            lutCodes = lutCodeMap[refSeq]
            seqCodes = seqCodeMap[refSeq]
            view[self.pad:self.pad + seqLen] = np.bitwise_or(
                lutCodes, np.left_shift(seqCodes, 4))

            # Padding codes -- the lut array is padded with 0s the sequence
            # array is padded with N's (4)
            padCodes = np.left_shift(np.ones(self.pad, dtype=uint8) * 4, 4)
            view[:self.pad] = padCodes
            view[self.pad + seqLen:2 * self.pad + seqLen] = padCodes

            self.refDict[contig.alignmentID] = sa

        # No correction factor for IPDs everything is normalized to 1
        self.meanIpd = 1

        # Find and open the ipd model file
        self.lutPath = modelFile
        if not os.path.exists(self.lutPath):
            logging.info("Couldn't find model file: %s" % self.lutPath)
        else:
            with gzip.open(self.lutPath, "rb") as npz_in:
                gbmModelData = np.load(npz_in, allow_pickle=True)
                self.gbmModel = GbmContextModel(gbmModelData, modelIterations)

            # We always use the model -- no more LUTS
            self.predictIpdFunc = self.predictIpdFuncModel
            self.predictManyIpdFunc = self.predictManyIpdFuncModel