Beispiel #1
0
    def merge(genome, trackName, allowOverlaps):
        """
        Merge the per-chromosome arrays of a track into one memmap file per
        array name.

        The chromosomes reported by PreProcMetaDataCollector are processed in
        sorted order, unless the track format has a dense representation, in
        which case the order returned by the collector is kept. For every
        array name present in the track data of the first existing
        chromosome, the arrays of all existing chromosomes are combined with
        ChrMemmapFolderMerger.mergeArrays and written to a new memmap file in
        the directory returned by createDirPath. The element dimension and
        data type dimension used in the merged file name are the maxima of
        the values parsed from the per-chromosome file names.

        NOTE(review): the signature has no self/cls parameter, so this is
        presumably declared as a @staticmethod outside of this view - confirm.

        Arguments:
            genome, trackName: identify the track; passed on to createDirPath,
                PreProcMetaDataCollector and TrackSource.getTrackData.
            allowOverlaps: passed on to the same helpers to select which
                variant of the preprocessed data is merged.

        Raises:
            EmptyGESourceError: if ChrMemmapFolderMerger._existingChrIter
                yields no chromosomes.
        """
        path = createDirPath(trackName, genome, allowOverlaps=allowOverlaps)

        collector = PreProcMetaDataCollector(genome, trackName)
        chrList = collector.getPreProcessedChrs(allowOverlaps)
        if not collector.getTrackFormat().reprIsDense():
            chrList = sorted(chrList)

        existingChrList = list(
            ChrMemmapFolderMerger._existingChrIter(path, chrList))
        if not existingChrList:
            raise EmptyGESourceError(
                'No data lines has been read from source file (probably because it is empty).'
            )

        firstChrTrackData = TrackSource().getTrackData(trackName,
                                                       genome,
                                                       existingChrList[0],
                                                       allowOverlaps,
                                                       forceChrFolders=True)

        # Snapshot the array names up front: entries are deleted from
        # firstChrTrackData inside the loop, which is not allowed while
        # iterating over a live keys view.
        arrayNames = list(firstChrTrackData.keys())
        for arrayName in arrayNames:
            mergedArray = firstChrTrackData[arrayName][:]
            elementDim, dtypeDim = parseMemmapFileFn(
                firstChrTrackData[arrayName].filename)[1:3]
            # Drop the reference as soon as the data has been picked up.
            del firstChrTrackData[arrayName]

            for chrName in existingChrList[1:]:
                chrTrackData = TrackSource().getTrackData(trackName,
                                                          genome,
                                                          chrName,
                                                          allowOverlaps,
                                                          forceChrFolders=True)

                mergedArray = ChrMemmapFolderMerger.mergeArrays(
                    mergedArray, np.array(chrTrackData[arrayName][:]))
                elementDimNew, dtypeDimNew = parseMemmapFileFn(
                    chrTrackData[arrayName].filename)[1:3]
                # The merged file name must reflect the largest dimensions
                # seen in any of the chromosome files.
                elementDim = max(elementDim, elementDimNew)
                dtypeDim = max(dtypeDim, dtypeDimNew)

                del chrTrackData[arrayName]

            mergedFn = createMemmapFileFn(path, arrayName, elementDim,
                                          dtypeDim, str(mergedArray.dtype))

            # Write the merged array to disk through a fresh memmap, then
            # release both the mapping and the in-memory copy before moving
            # on to the next array.
            f = np.memmap(mergedFn,
                          dtype=mergedArray.dtype,
                          mode='w+',
                          shape=mergedArray.shape)
            f[:] = mergedArray
            f.flush()
            del f
            del mergedArray
    def merge(genome, trackName, allowOverlaps):
        """
        Merge the per-chromosome arrays of a track into one memmap file per
        array name.

        The chromosomes reported by PreProcMetaDataCollector are processed in
        sorted order, unless the track format has a dense representation, in
        which case the order returned by the collector is kept. For every
        array name present in the track data of the first existing
        chromosome, the arrays of all existing chromosomes are combined with
        ChrMemmapFolderMerger.mergeArrays and written to a new memmap file in
        the directory returned by createDirPath. The element dimension and
        data type dimension used in the merged file name are the maxima of
        the values parsed from the per-chromosome file names.

        NOTE(review): the signature has no self/cls parameter, so this is
        presumably declared as a @staticmethod outside of this view - confirm.

        Arguments:
            genome, trackName: identify the track; passed on to createDirPath,
                PreProcMetaDataCollector and TrackSource.getTrackData.
            allowOverlaps: passed on to the same helpers to select which
                variant of the preprocessed data is merged.

        Raises:
            EmptyGESourceError: if ChrMemmapFolderMerger._existingChrIter
                yields no chromosomes.
        """
        path = createDirPath(trackName, genome, allowOverlaps=allowOverlaps)

        collector = PreProcMetaDataCollector(genome, trackName)
        chrList = collector.getPreProcessedChrs(allowOverlaps)
        if not collector.getTrackFormat().reprIsDense():
            chrList = sorted(chrList)

        existingChrList = list(ChrMemmapFolderMerger._existingChrIter(path, chrList))
        if not existingChrList:
            raise EmptyGESourceError('No data lines has been read from source file (probably because it is empty).')

        firstChrTrackData = TrackSource().getTrackData(trackName, genome, existingChrList[0], allowOverlaps, forceChrFolders=True)

        # Snapshot the array names up front: entries are deleted from
        # firstChrTrackData inside the loop, which is not allowed while
        # iterating over a live keys view.
        arrayNames = list(firstChrTrackData.keys())
        for arrayName in arrayNames:
            mergedArray = firstChrTrackData[arrayName][:]
            elementDim, dtypeDim = parseMemmapFileFn(firstChrTrackData[arrayName].filename)[1:3]
            # Drop the reference as soon as the data has been picked up.
            del firstChrTrackData[arrayName]

            for chrName in existingChrList[1:]:
                chrTrackData = TrackSource().getTrackData(trackName, genome, chrName, allowOverlaps, forceChrFolders=True)

                mergedArray = ChrMemmapFolderMerger.mergeArrays(mergedArray, np.array(chrTrackData[arrayName][:]))
                elementDimNew, dtypeDimNew = parseMemmapFileFn(chrTrackData[arrayName].filename)[1:3]
                # The merged file name must reflect the largest dimensions
                # seen in any of the chromosome files.
                elementDim = max(elementDim, elementDimNew)
                dtypeDim = max(dtypeDim, dtypeDimNew)

                del chrTrackData[arrayName]

            mergedFn = createMemmapFileFn(path, arrayName, elementDim, dtypeDim, str(mergedArray.dtype))

            # Write the merged array to disk through a fresh memmap, then
            # release both the mapping and the in-memory copy before moving
            # on to the next array.
            f = np.memmap(mergedFn, dtype=mergedArray.dtype, mode='w+', shape=mergedArray.shape)
            f[:] = mergedArray
            f.flush()
            del f
            del mergedArray
Beispiel #3
0
    def __init__(self, path, prefix, size, valDataType='float64', valDim=1, weightDataType='float64', weightDim=1, maxNumEdges=0, maxStrLens={}, allowAppend=True):
        """
        Prepare the in-memory buffer backing one memmap output file.

        The file name is built by createMemmapFileFn from path, prefix and
        the element dimension, data type dimension and data type stored on
        the instance by the _setup calls. If the file does not exist, a
        zeroed buffer of `size` elements is allocated (and filled with the
        empty value of the data type when self._setEmptyVal is set). If the
        file exists, its contents are loaded and extended by `size` zeroed
        elements, and self._index is set to the number of elements already
        present.

        NOTE(review): _setup is called once for every known prefix;
        presumably it only stores the configuration whose name matches
        `prefix` - confirm against _setup.

        Arguments:
            path, prefix: used to build the memmap file name.
            size: number of new elements to allocate room for.
            valDataType, weightDataType: data types of the 'val' and
                'weights' arrays. The bare value 'S' is expanded to a
                fixed-width string type taken from maxStrLens (minimum
                width 2).
            valDim, weightDim: data type dimensions; both must be >= 1.
            maxNumEdges: passed to _setup for the 'edges' and 'weights'
                arrays.
            maxStrLens: mapping from prefix to maximal string length. It is
                only read here, never modified.
            allowAppend: if False, an already existing file is an error.

        Raises:
            InvalidFormatError: if the file exists and allowAppend is False.
        """
        assert valDim >= 1 and weightDim >= 1

        if valDataType == 'S':
            valDataType = 'S' + str(max(2, maxStrLens['val']))
        if weightDataType == 'S':
            weightDataType = 'S' + str(max(2, maxStrLens['weights']))

        self._setup(prefix, 'start', getStart, writeNoSlice, None, 'int32', 1, False)
        self._setup(prefix, 'end', getEnd, writeNoSlice, None, 'int32', 1, False)
        self._setup(prefix, 'strand', getStrand, writeNoSlice, None, 'int8', 1, False)
        self._setup(prefix, 'val', getVal, writeNoSlice, None, valDataType, valDim, True)
        self._setup(prefix, 'id', getId, writeNoSlice, None, 'S' + str(maxStrLens.get('id')), 1, False)
        self._setup(prefix, 'edges', getEdges, writeSliceFromFront, maxNumEdges, 'S' + str(maxStrLens.get('edges')), 1, False)
        self._setup(prefix, 'weights', getWeights, writeSliceFromFront, maxNumEdges, weightDataType, weightDim, True)
        self._setup(prefix, 'leftIndex', getNone, writeNoSlice, None, 'int32', 1, False)
        self._setup(prefix, 'rightIndex', getNone, writeNoSlice, None, 'int32', 1, False)

        # None of the calls above set self._parseFunc: fall back to a
        # GetExtra parser set up under the prefix itself.
        if not hasattr(self, '_parseFunc'):
            self._geParseClass = GetExtra(prefix)
            self._setup(prefix, prefix, self._geParseClass.parse, writeNoSlice, None, 'S' + str(maxStrLens.get(prefix)), 1, False)

        # If there is one number in the path, it is the data type dimension.
        # Only one value is allowed per element, no extra dimensions are added
        # to the array and the element dimension is None.
        #
        # Example: val.4.float64 contains, per element, a vector of 4 numbers.
        #          The shape is (n,4) for n elements.
        #
        # If there are two numbers in the path, the first is the maximal element
        # dimension and the second is the data type dimension.
        #
        # Example: weights.3.4.float64 contains, per element, at most 3 vectors
        #          of 4 numbers each. The shape is (n,3,4) for n elements.

        self._fn = createMemmapFileFn(path, prefix, self._elementDim, self._dataTypeDim, self._dataType)
        self._index = 0

        shape = [size] + \
                 ([max(1, self._elementDim)] if self._elementDim is not None else []) + \
                 ([self._dataTypeDim] if self._dataTypeDim > 1 else [])

        append = os.path.exists(self._fn)
        if append:
            if not allowAppend:
                raise InvalidFormatError('Error: different genome element sources (e.g. different input files) tries to write to index file for the same chromosome (%s). This is probably caused by different files in the same folder containing elements from the same chromosome.' % self._fn)

            try:
                # Opened without a shape, the memmap is flat, so len(f) is
                # the total number of stored values.
                f = np.memmap( self._fn, dtype=self._dataType, mode='r+' )
                # Floor division keeps the index an integer also when true
                # division is in effect.
                self._index = len(f) // product(shape[1:])
                del f

                # Load the existing contents into memory and append room for
                # the new elements.
                existingShape = calcShapeFromMemmapFileFn(self._fn)
                self._contents = np.array( np.memmap(self._fn, dtype=self._dataType, mode='r+', shape=tuple(existingShape)) )
                self._contents = np.r_[self._contents, np.zeros( dtype=self._dataType, shape=tuple(shape) )]
            except Exception:
                # Report which file failed, then let the error propagate.
                # A single pre-formatted argument prints the same text under
                # both the print statement and the print function.
                print('Error when opening file:  %s' % self._fn)
                raise
        else:
            self._contents = np.zeros( dtype=self._dataType, shape=tuple(shape) )

        # The empty value is only filled in for newly created buffers; an
        # appended buffer keeps its loaded contents and zeroed extension.
        if not append and self._setEmptyVal:
            self._contents[:] = findEmptyVal(self._dataType)
Beispiel #4
0
    def __init__(self, path, prefix, size, valDataType='float64', valDim=1, weightDataType='float64', weightDim=1, maxNumEdges=0, maxStrLens={}, allowAppend=True):
        """
        Prepare the in-memory buffer backing one memmap output file.

        The file name is built by createMemmapFileFn from path, prefix and
        the element dimension, data type dimension and data type stored on
        the instance by the _setup calls. If the file does not exist, a
        zeroed buffer of `size` elements is allocated (and filled with the
        empty value of the data type when self._setEmptyVal is set). If the
        file exists, its contents are loaded and extended by `size` zeroed
        elements, and self._index is set to the number of elements already
        present.

        NOTE(review): _setup is called once for every known prefix;
        presumably it only stores the configuration whose name matches
        `prefix` - confirm against _setup.

        Arguments:
            path, prefix: used to build the memmap file name.
            size: number of new elements to allocate room for.
            valDataType, weightDataType: data types of the 'val' and
                'weights' arrays. The bare value 'S' is expanded to a
                fixed-width string type taken from maxStrLens (minimum
                width 2).
            valDim, weightDim: data type dimensions; both must be >= 1.
            maxNumEdges: passed to _setup for the 'edges' and 'weights'
                arrays.
            maxStrLens: mapping from prefix to maximal string length. It is
                only read here, never modified.
            allowAppend: if False, an already existing file is an error.

        Raises:
            InvalidFormatError: if the file exists and allowAppend is False.
        """
        assert valDim >= 1 and weightDim >= 1

        if valDataType == 'S':
            valDataType = 'S' + str(max(2, maxStrLens['val']))
        if weightDataType == 'S':
            weightDataType = 'S' + str(max(2, maxStrLens['weights']))

        self._setup(prefix, 'start', getStart, writeNoSlice, None, 'int32', 1, False)
        self._setup(prefix, 'end', getEnd, writeNoSlice, None, 'int32', 1, False)
        self._setup(prefix, 'strand', getStrand, writeNoSlice, None, 'int8', 1, False)
        self._setup(prefix, 'val', getVal, writeNoSlice, None, valDataType, valDim, True)
        self._setup(prefix, 'id', getId, writeNoSlice, None, 'S' + str(maxStrLens.get('id')), 1, False)
        self._setup(prefix, 'edges', getEdges, writeSliceFromFront, maxNumEdges, 'S' + str(maxStrLens.get('edges')), 1, False)
        self._setup(prefix, 'weights', getWeights, writeSliceFromFront, maxNumEdges, weightDataType, weightDim, True)
        self._setup(prefix, 'leftIndex', getNone, writeNoSlice, None, 'int32', 1, False)
        self._setup(prefix, 'rightIndex', getNone, writeNoSlice, None, 'int32', 1, False)

        # None of the calls above set self._parseFunc: fall back to a
        # GetExtra parser set up under the prefix itself.
        if not hasattr(self, '_parseFunc'):
            self._geParseClass = GetExtra(prefix)
            self._setup(prefix, prefix, self._geParseClass.parse, writeNoSlice, None, 'S' + str(maxStrLens.get(prefix)), 1, False)

        # If there is one number in the path, it is the data type dimension.
        # Only one value is allowed per element, no extra dimensions are added
        # to the array and the element dimension is None.
        #
        # Example: val.4.float64 contains, per element, a vector of 4 numbers.
        #          The shape is (n,4) for n elements.
        #
        # If there are two numbers in the path, the first is the maximal element
        # dimension and the second is the data type dimension.
        #
        # Example: weights.3.4.float64 contains, per element, at most 3 vectors
        #          of 4 numbers each. The shape is (n,3,4) for n elements.

        self._fn = createMemmapFileFn(path, prefix, self._elementDim, self._dataTypeDim, self._dataType)
        self._index = 0

        shape = [size] + \
                 ([max(1, self._elementDim)] if self._elementDim is not None else []) + \
                 ([self._dataTypeDim] if self._dataTypeDim > 1 else [])

        append = os.path.exists(self._fn)
        if append:
            if not allowAppend:
                raise InvalidFormatError('Error: different genome element sources (e.g. different input files) tries to write to index file for the same chromosome (%s). This is probably caused by different files in the same folder containing elements from the same chromosome.' % self._fn)

            try:
                # Opened without a shape, the memmap is flat, so len(f) is
                # the total number of stored values.
                f = np.memmap( self._fn, dtype=self._dataType, mode='r+' )
                # Floor division keeps the index an integer also when true
                # division is in effect.
                self._index = len(f) // product(shape[1:])
                del f

                # Load the existing contents into memory and append room for
                # the new elements.
                existingShape = calcShapeFromMemmapFileFn(self._fn)
                self._contents = np.array( np.memmap(self._fn, dtype=self._dataType, mode='r+', shape=tuple(existingShape)) )
                self._contents = np.r_[self._contents, np.zeros( dtype=self._dataType, shape=tuple(shape) )]
            except Exception:
                # Report which file failed, then let the error propagate.
                # A single pre-formatted argument prints the same text under
                # both the print statement and the print function.
                print('Error when opening file:  %s' % self._fn)
                raise
        else:
            self._contents = np.zeros( dtype=self._dataType, shape=tuple(shape) )

        # The empty value is only filled in for newly created buffers; an
        # appended buffer keeps its loaded contents and zeroed extension.
        if not append and self._setEmptyVal:
            self._contents[:] = findEmptyVal(self._dataType)