def __init__(self, fn, *args, **kwArgs):
        """Initialise the source and validate the microarray track definition line.

        Reads the first line of ``fn`` (single quotes are normalised to double
        quotes). The line must start with ``track type="array"`` and its parsed
        header must define ``expScale``, ``expStep`` and ``expNames``. The
        number of non-empty, comma-separated entries of ``expNames`` is stored
        in ``self._globExpCount``.

        Raises:
            InvalidFormatError: if the track definition line is malformed, if
                ``expNames`` is not enclosed in quote marks, or if fewer than
                3 experiments are named.
        """
        GenomeElementSource.__init__(self, fn, *args, **kwArgs)

        # Only the first line is needed; the context manager guarantees that
        # the file handle is closed again.
        with open(fn) as f:
            trackDef = f.readline().replace('\'', '"')

        if not trackDef.startswith('track type="array"'):
            raise InvalidFormatError(
                'Track definition line must start with: track type="array". Line: '
                + trackDef)

        header = self._parseHeader(trackDef)
        if not all(key in header
                   for key in ['expScale', 'expStep', 'expNames']):
            raise InvalidFormatError(
                'Track definition line must define values for expScale, expStep and expNames: '
                + trackDef)

        expNames = header['expNames']
        # The length guard avoids an IndexError on an empty value and rejects
        # a lone quote character.
        if len(expNames) < 2 or expNames[0] != '"' or expNames[-1] != '"':
            raise InvalidFormatError(
                'expNames does not start and end in quote marks: ' + trackDef)

        # Strip exactly the two surrounding quotes. Empty entries (e.g. caused
        # by a trailing comma) are filtered out, so no extra character needs
        # to be cut off; cutting one more would lose a one-letter last name.
        names = [x for x in expNames[1:-1].split(',') if x != '']
        self._globExpCount = len(names)
        if self._globExpCount < 3:
            raise InvalidFormatError(
                'Microarray data must have at least 3 experiments. Length of expNames: '
                + str(self._globExpCount))
    def _next(self, line):
        if line.startswith('#'):
            return

        ge = GenomeElement(self._genome)
        cols = line.split('\t')

        if self._numCols is not None:
            if len(cols) != self._numCols:
                raise InvalidFormatError('Error: BED files must have the same number of columns in each data line.')
        else:
            self._numCols = len(cols)

        if self._numCols < self.MIN_NUM_COLS or self._numCols > self.MAX_NUM_COLS:
            raise InvalidFormatError('Error: BED file must contain between %s and %s columns.' % (self.MIN_NUM_COLS, self.MAX_NUM_COLS))

        ge.chr = self._checkValidChr(cols[0])
        ge.start = self._checkValidStart(ge.chr, int(cols[1]))

        self._parseEnd( ge, self._checkValidEnd(ge.chr, int(cols[2]), start=ge.start))
        self._parseName( ge, cols )
        self._parseVal( ge, cols )

        if self._numCols >= 6:
            ge.strand = self._getStrandFromString(cols[5])

        for i,extraCol in enumerate(self.BED_EXTRA_COLUMNS):
            if self._numCols >= i+7:
                setattr(ge, extraCol, cols[i+6])

        return ge
Beispiel #3
0
    def _adjustColumnsAccordingToHeaderLines(self, hbColumns, columns):
        if self._headerDict['fixed length'] != 1:
            if not 'end' in columns:
                raise InvalidFormatError('Error: header "fixed length" does not have the default value ' \
                                         '(%s != 1), but the end prefix is not defined' % self._headerDict['fixed length'])

        if self._headerDict['fixed gap size'] != 0:
            if not 'start' in columns:
                raise InvalidFormatError('Error: header "fixed gap size" does not have the default value ' \
                                         '(%s != 0), but the start prefix is not defined' % self._headerDict['fixed gap size'])
            if not self._hasAttrInBoundingRegion('start'):
                raise InvalidFormatError('Error: header "fixed gap size" does not have the default value ' \
                                         '(%s != 0), but bounding regions of type B are not defined' % self._headerDict['fixed gap size'])

        toDelete = []
        if self._headerDict['fixed length'] != 1:
            toDelete.append('end')

        if self._headerDict['fixed gap size'] != 0:
            toDelete.append('start')

        if len(columns) > len(toDelete):
            for col in toDelete:
                del columns[columns.index(col)]
                del hbColumns[hbColumns.index(col)]
        else:
            self._headerDict['fixed length'] = 1
            self._headerDict['fixed gap size'] = 0

        return hbColumns, columns
Beispiel #4
0
 def _handleStep(self, step):
     step = int(step) if step is not None else 1
     if step < 1:
         raise InvalidFormatError(
             'The step value must be positive: %s < 1.' % step)
     if self._step is not None and step != self._step:
         raise InvalidFormatError(
             'The step value is not allowed to change within the same WIG file: %s != %s.'
             % (self._step, step))
     return step
Beispiel #5
0
 def _handleSpan(self, span):
     span = int(span) if span is not None else 1
     if span < 1:
         raise InvalidFormatError(
             'The span value must be positive: %s < 1.' % span)
     if self._fixedStep and self._span is not None and span != self._span:
         raise InvalidFormatError(
             'The span value is not allowed to change within the same WIG fixedStep file: %s != %s.'
             % (self._span, span))
     return span
Beispiel #6
0
    def _checkValidStart(self, chr, start):
        if start < 0:
            raise InvalidFormatError('Error: start position is negative: %s' %
                                     start)

        if self.genome and \
            GenomeInfo.isValidChr(self.genome, chr) and \
                start > GenomeInfo.getChrLen(self.genome, chr):
            raise InvalidFormatError('Error: start position is larger than the size of chromosome "%s" (%s > %s)' % \
                                     (chr, start, GenomeInfo.getChrLen(self.genome, chr)))
        return start
Beispiel #7
0
 def _checkDataLineCols(self, cols):
     if self._fixedStep is None:
         raise InvalidFormatError(
             'All WIG data lines must be preceded by a declaration line.')
     elif self._fixedStep:
         if len(cols) != 1:
             raise InvalidFormatError(
                 'WIG fixedStep requires data lines with one column.')
     else:
         if len(cols) != 2:
             raise InvalidFormatError(
                 'WIG variableStep requires data lines with two columns.')
    def next(self):
        """Return the next element of ``self._geIter``.

        When the wrapped iterator is exhausted, the dependent attributes are
        stored, the value and edge weight dimensions are checked, the
        bounding region tuples are collected from the iterator, and the
        StopIteration is re-raised.

        Raises:
            InvalidFormatError: at exhaustion, if ``self._valDim`` or
                ``self._edgeWeightDim`` is still None.
            StopIteration: when the wrapped iterator is exhausted.
        """
        try:
            nextEl = self._geIter.next()
        except StopIteration:
            self._storeOtherDependentAttrs()

            if self._valDim is None:
                raise InvalidFormatError('Error: unable to determine value dimension.')
            if self._edgeWeightDim is None:
                raise InvalidFormatError('Error: unable to determine edge weight dimension.')

            self._boundingRegionTuples = self._geIter.getBoundingRegionTuples()
            # Propagate the exhaustion signal to the caller.
            raise
        return nextEl
Beispiel #9
0
 def _commonGetGtrackValType(self, valDataType, valOrEdgeWeights):
     """Map a numpy data type string to its GTrack value type name.

     Any '|' characters are removed from ``valDataType`` first. The result is
     the first key of ``Gtrack.VAL_TYPE_DICT`` whose ``fromNumpyTypeFunc``
     accepts the type.

     Raises:
         InvalidFormatError: if no entry accepts the type;
             ``valOrEdgeWeights`` is only used in the error message.
     """
     valDataType = valDataType.replace('|', '')
     for typeName, typeInfo in Gtrack.VAL_TYPE_DICT.iteritems():
         if not typeInfo.fromNumpyTypeFunc(valDataType):
             continue
         return typeName
     raise InvalidFormatError('Error: did not understand %s type: %s' %
                              (valOrEdgeWeights, valDataType))
Beispiel #10
0
    def _checkValidEnd(self, chr, end, start=None):
        if end < 0:
            raise InvalidFormatError('Error: end position is negative: %s' %
                                     end)

        if self.genome and \
            GenomeInfo.isValidChr(self.genome, chr) and \
                end-1 > GenomeInfo.getChrLen(self.genome, chr):
            raise InvalidFormatError('Error: end position is larger than the size of chromosome "%s" (%s > %s)' % \
                                     (chr, end-1, GenomeInfo.getChrLen(self.genome, chr)))
        if start is not None and end <= start:
            if not start == end == 1:
                raise InvalidFormatError(
                    'Error: end position (end-exclusive) is smaller than or equal to start position: %d <= %d'
                    % (end, start))

        return end
Beispiel #11
0
 def _checkBoundingRegionSortedPair(self, lastBoundingRegion, br):
     """Run the base class check, then reject adjoining bounding regions.

     Raises:
         InvalidFormatError: if ``br`` has both start and end set and starts
             exactly where ``lastBoundingRegion`` ends (errors may also come
             from the base class check).
     """
     GenomeElementSource._checkBoundingRegionSortedPair(
         self, lastBoundingRegion, br)

     hasCoordinates = br.start is not None and br.end is not None
     if hasCoordinates and lastBoundingRegion.end == br.start:
         raise InvalidFormatError(
             "Error: bounding regions '%s' and '%s' are adjoining (there is no gap between them)."
             % (lastBoundingRegion, br))
Beispiel #12
0
 def _parseVal(self, ge, valStr):
     if self._handleNan(valStr) == 'nan':
         ge.val = BINARY_MISSING_VAL
     elif valStr == '0':
         ge.val = False
     elif valStr == '1':
         ge.val = True
     else:
         raise InvalidFormatError('Could not parse value: ' + valStr +
                                  ' as target/control.')
Beispiel #13
0
 def _handleTrackDefinitionLineIfPresent(self, firstLine):
     if firstLine.startswith('track'):
         if firstLine.startswith('track type=wiggle_0'):
             self._numHeaderLines = 1
         else:
             raise InvalidFormatError(
                 'The wiggle track definition line must (if present) start with: track type=wiggle_0'
             )
     else:
         self._numHeaderLines = 0
    def _parseVal(self, ge, cols):
        if self._numCols >= 5:
            if cols[4] in ['-', '.']:
                val = 0
            else:
                val = int(cols[4])

            if val < 0 or val > 1000:
                raise InvalidFormatError("Error: BED score column must be an integer between 0 and 1000: %s. Perhaps you instead " + \
                                         "should use the file formats 'valued.bed' or 'gtrack'?")
            ge.val = val
Beispiel #15
0
    def _checkFixedStep(self, line, start, step):
        fixedStep = self._isFixedStepLine(line)

        if self._fixedStep is not None and self._fixedStep != fixedStep:
            raise InvalidFormatError(
                'WIG fixedStep and variableStep declaration lines are not allowed mix within the same file.'
            )

        if fixedStep:
            if start is None:
                raise InvalidFormatError(
                    'WIG fixedStep requires start values in the declaration line.'
                )
        else:
            if start is not None or step is not None:
                raise InvalidFormatError(
                    'WIG variableStep may not have start and step values in the declaration line.'
                )

        return fixedStep
Beispiel #16
0
 def _getStrandFromString(cls, val):
     if val == '+':
         return True
     elif val == '-':
         return False
     elif val == '.':
         return BINARY_MISSING_VAL
     #val == ''?
     else:
         raise InvalidFormatError(
             "Error: strand must be either '+', '-' or '.'. Value: %s" %
             val)
    def _next(self, line):
        if line.startswith('>'):
            self._appendBoundingRegionTuple()
            self._elCount = 0
            self._chr = self._checkValidChr(line[1:].split()[0])
        else:
            if self._chr is None:
                raise InvalidFormatError(
                    'FASTA file does not start with the ">" character.')

            self._elCount += len(line)
            ge = GenomeElement(self._genome, self._chr)
            ge.val = np.fromstring(line, dtype='S1')
            return ge
    def _next(self, line):
        cols = line.split()
        if len(cols) != 15:
            raise InvalidFormatError(
                'File must contain exactly 15 columns, contains ' +
                str(len(cols)))

        self._genomeElement.chr = self._checkValidChr(cols[0])
        self._genomeElement.start = self._checkValidStart(
            self._genomeElement.chr, int(cols[1]))
        self._genomeElement.end = self._checkValidEnd(
            self._genomeElement.chr,
            int(cols[2]),
            start=self._genomeElement.start)
        self._genomeElement.strand = self._getStrandFromString(cols[5])

        self._genomeElement.val = [numpy.nan] * self._globExpCount
        expCount = int(cols[12])
        expIds = [int(x) for x in cols[13].split(',') if x != '']
        expScores = [numpy.float(x) for x in cols[14].split(',') if x != '']

        if len(expIds) != expCount:
            raise InvalidFormatError('expId length (' + str(len(expIds)) +
                                     ') is not equal to expCount (' +
                                     str(expCount) + ')')
        if len(expScores) != expCount:
            raise InvalidFormatError('expScores length (' + str(len(expIds)) +
                                     ') is not equal to expCount (' +
                                     str(expScores) + ')')

        for i in range(expCount):
            if expIds[i] >= self._globExpCount:
                raise InvalidFormatError('expId ' + str(expIds[i]) + ' too large. expNames in header line defines ' + str(self._globExpCount) + ' experiments. '+\
                                         'Thsi could be because of counting from 1 instead of from 0.')
            self._genomeElement.val[expIds[i]] = expScores[i]

        return self._genomeElement
Beispiel #19
0
    def __iter__(self):
        """Yield elements until the chromosome changes or the data runs out.

        Each pass yields ``self._curEl`` and then fetches the next element
        from ``self._geIter``. When the fetched element belongs to a
        chromosome other than the last one in ``self._chrList``, the new
        chromosome is recorded and the loop ends; the fetched element stays
        in ``self._curEl`` (presumably so that a later iteration starts with
        it - confirm against the callers).

        Raises:
            InvalidFormatError: if the new chromosome was already seen before.
        """
        try:
            while not self._finished:
                yield self._curEl
                self._curEl = self._geIter.next()
                # Compare against the most recently recorded chromosome.
                if self._curEl.chr != self._chrList[-1]:
                    if self._curEl.chr in self._chrList:
                        raise InvalidFormatError(
                            'Error: chromosome %s has been previously encountered. Dense datasets must not skip back and forth between chromosomes.'
                            % self._curEl.chr)
                    self._chrList.append(self._curEl.chr)
                    break

        except StopIteration:
            # The underlying iterator is exhausted: remember that, so that
            # later iterations yield nothing.
            self._finished = True
            # NOTE(review): re-raising StopIteration inside a generator is
            # turned into a RuntimeError under PEP 479 (Python 3.7+); this
            # code relies on the Python 2 behaviour.
            raise
    def _next(self, line):
        if line.startswith('##FASTA'):
            raise StopIteration

        if len(line) > 0 and line[0] == '#':
            return None

        origCols = line.split('\t')
        cols = [unquote(x) for x in origCols]

        if len(cols) != 9:
            raise InvalidFormatError(
                "Error: GFF files must contain 9 tab-separated columns")

        ge = GenomeElement(self._genome)
        ge.chr = self._checkValidChr(cols[0])
        ge.source = cols[1]

        self._parseThirdCol(ge, cols[2])

        ge.start = self._checkValidStart(ge.chr, int(cols[3]) - 1)
        ge.end = self._checkValidEnd(ge.chr, int(cols[4]), start=ge.start)

        self._parseSixthCol(ge, cols[5])

        ge.strand = self._getStrandFromString(cols[6])
        ge.phase = cols[7]
        ge.attributes = cols[8]

        for attr in origCols[8].split(';'):
            attrSplitted = attr.split('=')
            if len(attrSplitted) == 2:
                key, val = attrSplitted
                if key.lower() == 'id':
                    ge.id = unquote(val)
                elif key.lower() == 'name':
                    ge.name = unquote(val)

        return ge
Beispiel #21
0
    def _next(self, brt, ge, i):
        if ge.genome is not None:
            if self._genome is None:
                self._genome = ge.genome
            elif self._genome != ge.genome:
                raise InvalidFormatError(
                    'GtrackStandardizer does not support GTrack files with more than one genome'
                )
            ge.genome = None

        if ge.start is None:
            if i == 0:
                if brt is not None:
                    ge.start = brt.region.start
                else:
                    raise ShouldNotOccurError
            else:
                ge.start = self._prevElement.end

        if ge.end is None:
            ge.end = ge.start + 1

        if ge.val is None:
            ge.val = numpy.nan

        if ge.strand is None:
            ge.strand = BINARY_MISSING_VAL

        if ge.id is None:
            ge.id = str(self._id)
            self._id += 1

        if ge.edges is None:
            ge.edges = []

        self._prevElement = ge
        return ge
Beispiel #22
0
 def _checkBoundingRegionSortedPair(self, lastBoundingRegion, br):
     if br.start is not None and br.end is not None:
         if lastBoundingRegion.overlaps(br):
             raise InvalidFormatError(
                 "Error: bounding regions '%s' and '%s' overlap." %
                 (lastBoundingRegion, br))
Beispiel #23
0
    def __init__(self, path, prefix, size, valDataType='float64', valDim=1, weightDataType='float64', weightDim=1, maxNumEdges=0, maxStrLens={}, allowAppend=True):
        """Prepare the in-memory array that backs one memmap output file.

        ``size`` is the length of the first axis of the newly allocated
        array. If the target file already exists, its contents are loaded and
        extended by ``size`` zero-filled rows (only when ``allowAppend`` is
        true).

        ``maxStrLens`` is only read here (never modified), so the shared
        mutable default is not altered by this method.

        Raises:
            InvalidFormatError: if the file exists and ``allowAppend`` is
                false.
        """
        assert valDim >= 1 and weightDim >= 1

        # A bare 'S' (string) type is completed with a length of at least 2,
        # taken from maxStrLens.
        if valDataType == 'S':
            valDataType = 'S' + str(max(2, maxStrLens['val']))
        if weightDataType == 'S':
            weightDataType = 'S' + str(max(2, maxStrLens['weights']))
            
        # One _setup call per known prefix. NOTE(review): presumably only the
        # call whose name equals ``prefix`` takes effect - confirm in _setup.
        self._setup(prefix, 'start', getStart, writeNoSlice, None, 'int32', 1, False)
        self._setup(prefix, 'end', getEnd, writeNoSlice, None, 'int32', 1, False)
        self._setup(prefix, 'strand', getStrand, writeNoSlice, None, 'int8', 1, False)
        self._setup(prefix, 'val', getVal, writeNoSlice, None, valDataType, valDim, True)
        self._setup(prefix, 'id', getId, writeNoSlice, None, 'S' + str(maxStrLens.get('id')), 1, False)
        self._setup(prefix, 'edges', getEdges, writeSliceFromFront, maxNumEdges, 'S' + str(maxStrLens.get('edges')), 1, False)
        self._setup(prefix, 'weights', getWeights, writeSliceFromFront, maxNumEdges, weightDataType, weightDim, True)
        self._setup(prefix, 'leftIndex', getNone, writeNoSlice, None, 'int32', 1, False)
        self._setup(prefix, 'rightIndex', getNone, writeNoSlice, None, 'int32', 1, False)
        
        # No _parseFunc attribute after the calls above: the prefix is handled
        # as an extra column, parsed by a GetExtra instance and stored as a
        # string. NOTE(review): assumes _setup sets _parseFunc on a match -
        # TODO confirm.
        if not hasattr(self, '_parseFunc'):
            self._geParseClass = GetExtra(prefix)
            self._setup(prefix, prefix, self._geParseClass.parse, writeNoSlice, None, 'S' + str(maxStrLens.get(prefix)), 1, False)
        
        # If there is one number in the path, it is the data type dimension.
        # Only one value is allowed per element, no extra dimensions are added
        # to the array and the element dimension is None.
        #
        # Example: val.4.float64 contains, per element, a vector of 4 numbers.
        #          The shape is (n,4) for n elements.
        #
        # If there are two numbers in the path, the first is the maximal element
        # dimension and the second is the data type dimension.
        #
        # Example: weights.3.4.float64 contains, per element, at most 3 vectors
        #          of 4 numbers each. The shape is (n,3,4) for n elements.
        
        self._fn = createMemmapFileFn(path, prefix, self._elementDim, self._dataTypeDim, self._dataType)
        # Position of the first new element; moved forward below when
        # appending to an existing file.
        self._index = 0
        
        # Shape of the newly allocated part: (size[, elementDim][, dataTypeDim]).
        # The element axis is at least 1 long; the data type axis is only
        # present for dimensions above 1.
        shape = [size] + \
                 ([max(1, self._elementDim)] if self._elementDim is not None else []) + \
                 ([self._dataTypeDim] if self._dataTypeDim > 1 else [])
        
        # An existing file means that data is appended to it.
        append = os.path.exists(self._fn)
        if append:
            if not allowAppend:
                raise InvalidFormatError('Error: different genome element sources (e.g. different input files) tries to write to index file for the same chromosome (%s). This is probably caused by different files in the same folder containing elements from the same chromosome.' % self._fn)
            
            try:
                # Opened without a shape the memmap is flat, so the number of
                # existing elements is its length divided by the size of one
                # element. The division relies on Python 2 integer division.
                f = np.memmap( self._fn, dtype=self._dataType, mode='r+' )
                self._index = len(f) / product(shape[1:])
                del f

                # Load the existing contents into memory (np.array copies)
                # and extend them with the zero-filled new rows.
                existingShape = calcShapeFromMemmapFileFn(self._fn)
                self._contents = np.array( np.memmap(self._fn, dtype=self._dataType, mode='r+', shape=tuple(existingShape)) )
                self._contents = np.r_[self._contents, np.zeros( dtype=self._dataType, shape=tuple(shape) )]
            except Exception:
                # Report which file caused the problem, then propagate.
                print 'Error when opening file: ', self._fn
                raise
        else:
            self._contents = np.zeros( dtype=self._dataType, shape=tuple(shape) )
            
        # Only a freshly created array is filled with the empty value; rows
        # appended to an existing file stay zero-filled.
        if not append and self._setEmptyVal:
            self._contents[:] = findEmptyVal(self._dataType)
 def _parseEnd(self, ge, end):
     if end != ge.start + 1:
         raise InvalidFormatError('Error: point BED files can only have segments of length 1')
Beispiel #25
0
 def _handleChr(self, chr):
     if chr == None:
         raise InvalidFormatError(
             'WIG declaration line requires the specification of a chromosome.'
         )
     return chr
Beispiel #26
0
 def _handleGetItem(self, key, item):
     if item is not None and len(item) > 1:
         raise InvalidFormatError(
             'Error: duplicate match on the same key, "%s"' % str(key))
     return item[0]
    def storeBoundingRegions(self, boundingRegionTuples, genomeElementChrList,
                             sparse):
        """Validate the bounding regions and store them in the shelve file.

        Args:
            boundingRegionTuples: iterable of objects with a ``region`` and an
                ``elCount`` attribute, expected to be grouped by chromosome
                and sorted within each chromosome.
            genomeElementChrList: chromosomes for which elements were found.
            sparse: True for sparse track types, False for dense ones.

        Raises:
            InvalidFormatError: if bounding regions are not grouped by
                chromosome, are unsorted, overlap, adjoin or have
                non-positive length; if (dense) the region length differs
                from its element count; if (sparse) a chromosome without
                elements has a non-zero count; or if a chromosome with
                elements has no bounding region.
        """
        assert sparse in [False, True]

        # chr -> OrderedDict(region start -> BoundingRegionInfo)
        tempContents = OrderedDict()

        genomeElementChrs = set(genomeElementChrList)
        lastRegion = None
        # Sparse only: element index range of each chromosome as a whole.
        chrStartIdxs = OrderedDict()
        chrEndIdxs = OrderedDict()
        totElCount = 0
        totBinCount = 0

        for br in boundingRegionTuples:
            if lastRegion is None or br.region.chr != lastRegion.chr:
                # First bounding region of a chromosome; the chromosome must
                # not have been closed earlier.
                if br.region.chr in tempContents:
                    raise InvalidFormatError(
                        "Error: bounding region (%s) is not grouped with previous bounding regions of the same chromosome (sequence)."
                        % br.region)

                lastRegion = None
                tempContents[br.region.chr] = OrderedDict()
                if sparse:
                    chrStartIdxs[br.region.chr] = totElCount
            else:
                # Same chromosome as the previous region: check order,
                # overlap and adjacency.
                if br.region < lastRegion:
                    raise InvalidFormatError(
                        "Error: bounding regions in the same chromosome (sequence) are unsorted: %s > %s."
                        % (lastRegion, br.region))
                if lastRegion.overlaps(br.region):
                    raise InvalidFormatError(
                        "Error: bounding regions '%s' and '%s' overlap." %
                        (lastRegion, br.region))
                if lastRegion.end == br.region.start:
                    raise InvalidFormatError(
                        "Error: bounding regions '%s' and '%s' are adjoining (there is no gap between them)."
                        % (lastRegion, br.region))

            if len(br.region) < 1:
                raise InvalidFormatError(
                    "Error: bounding region '%s' does not have positive length."
                    % br.region)

            # Dense representation: one element per position of the region.
            if not sparse and len(br.region) != br.elCount:
                raise InvalidFormatError(
                    "Error: track type representation is dense, but the length of bounding region '%s' is not equal to the element count: %s != %s"
                    % (br.region, len(br.region), br.elCount))

            # Dense regions get their own element index range; sparse regions
            # get the range of their whole chromosome further below.
            startIdx, endIdx = (totElCount, totElCount +
                                br.elCount) if not sparse else (None, None)
            totElCount += br.elCount
            if sparse:
                # Updated for every region, so that the final value is the
                # end index of the last region of the chromosome.
                chrEndIdxs[br.region.chr] = totElCount

            # The two trailing values are the bin indexes; for sparse tracks
            # they are filled in by the pass below.
            tempContents[br.region.chr][br.region.start] = BoundingRegionInfo(
                br.region.start, br.region.end, startIdx, endIdx, 0, 0)

            lastRegion = br.region

        if sparse:
            totBinCount = 0
            for chr in tempContents:
                # Bins are counted over the whole chromosome.
                chrLen = GenomeInfo.getChrLen(self._genome, chr)
                numBinsInChr = CompBinManager.getNumOfBins(
                    GenomeRegion(start=0, end=chrLen))
                for key in tempContents[chr].keys():
                    startBinIdx = totBinCount
                    endBinIdx = totBinCount + numBinsInChr
                    brInfo = tempContents[chr][key]

                    if chr in genomeElementChrs:
                        # All bounding regions of a chromosome share the
                        # chromosome-wide element and bin index ranges.
                        tempContents[chr][key] = BoundingRegionInfo(brInfo.start, brInfo.end, \
                                                                    chrStartIdxs[chr], chrEndIdxs[chr], \
                                                                    startBinIdx, endBinIdx)
                    else:
                        # No elements were found for this chromosome, so the
                        # declared element count must be zero.
                        if chrEndIdxs[chr] - chrStartIdxs[chr] > 0:
                            raise InvalidFormatError(
                                "Error: bounding region '%s' has incorrect element count: %s > 0"
                                % (GenomeRegion(chr=chr,
                                                start=brInfo.start,
                                                end=brInfo.end),
                                   chrEndIdxs[chr] - chrStartIdxs[chr]))
                        tempContents[chr][key] = BoundingRegionInfo(
                            brInfo.start, brInfo.end, 0, 0, 0, 0)

                # Only chromosomes with elements take up bin indexes.
                if chr in genomeElementChrs:
                    totBinCount += numBinsInChr

        # Every chromosome with elements needs at least one bounding region.
        if len(genomeElementChrs - set(tempContents.keys())) > 0:
            raise InvalidFormatError(
                'Error: some chromosomes (sequences) contains data, but has no bounding regions: %s'
                % ', '.join(genomeElementChrs - set(tempContents.keys())))

        ensurePathExists(self._fn)

        # Replace each per-chromosome dict by a BrInfoHolder built from its
        # keys (region starts) and values (BoundingRegionInfo objects).
        for chr in tempContents:
            brInfoDict = tempContents[chr]
            tempContents[chr] = BrInfoHolder(tuple(brInfoDict.keys()),
                                             tuple(brInfoDict.values()))

        brShelve = safeshelve.open(self._fn)
        brShelve.update(tempContents)
        brShelve.close()

        # Poll until self.fileExists() reports the shelve file, logging a
        # message and sleeping 0.2 seconds between checks.
        while not self.fileExists():
            from gtrackcore.application.LogSetup import logMessage
            logMessage(
                "Bounding region shelve file '%s' has yet to be created" %
                self._fn)
            import time
            time.sleep(0.2)