def _setLevelAndCheckOrder(oldLevel, newLevel):
    """Validate the transition from one line level to the next.

    A line may not have a lower level than the line before it, and
    levels 3 and 4 may not occur twice in a row. The error messages
    describe level 5 as a data line, level 4 as a genome line and level 3
    as a column specification line.

    Returns:
        ``newLevel``, unchanged, when the transition is allowed.

    Raises:
        InvalidFormatError: if the transition is not allowed.
    """
    if newLevel < oldLevel:
        if oldLevel == 5:
            message = 'Header line after data line is not allowed.'
        else:
            message = ('Header type "%s" after type "%s" is not allowed.' %
                       ('#' * newLevel, '#' * oldLevel))
        raise InvalidFormatError(message)

    if newLevel == oldLevel:
        # Only levels 4 and 3 are forbidden from repeating.
        if newLevel == 4:
            raise InvalidFormatError('Double genome lines are not allowed.')
        if newLevel == 3:
            raise InvalidFormatError(
                'Double column specification lines are not allowed.')

    return newLevel
Example #2
0
    def genericVisit(self, gSuiteTrack, galaxyFn, colHierarchyList):
        """Check a track against the REMOTE location requirement and
        verify that every requested column is available.

        The available columns are the optional standard column names
        followed by the keys of ``gSuiteTrack.attributes``. ``galaxyFn``
        is not used by this method.

        Raises:
            InvalidFormatError: if a name in ``colHierarchyList`` is not
                among the available columns.
        """
        GSuiteRequirements(allowedLocations=[REMOTE]).check(gSuiteTrack)

        knownCols = OPTIONAL_STD_COL_NAMES + gSuiteTrack.attributes.keys()
        for requestedCol in colHierarchyList:
            if requestedCol in knownCols:
                continue
            raise InvalidFormatError('Column "%s" not found: %s' %
                                     (requestedCol, knownCols))
    def _init(self, **kwArgs):
        """Require a host part in the URI, then delegate to the parent
        ``_init``.

        Raises:
            InvalidFormatError: if ``self.netloc`` is None.
        """
        if self.netloc is None:
            requirement = ('Track protocol "%s" requires the specification '
                           % self.SCHEME)
            example = (
                'of a host server, e.g. "%s://server.org/path/to/file".'
                % self.SCHEME)
            raise InvalidFormatError(requirement + example)

        super(RemoteGSuiteTrack, self)._init(**kwArgs)
    def _init(self, **kwArgs):
        """Reject URI queries for tracks whose file format is PRIMARY or
        UNKNOWN, then delegate to the parent ``_init``.

        Raises:
            InvalidFormatError: if such a track has a non-empty query part.
        """
        isNonBinary = self.fileFormat in (PRIMARY, UNKNOWN)
        if isNonBinary and self._parsedUri.query != '':
            offendingQuery = 'Queries in URI ("?%s") ' % self._parsedUri.query
            explanation = (
                'is not allowed for non-binary tracks with "%s" as protocol.'
                % self.SCHEME)
            raise InvalidFormatError(offendingQuery + explanation)

        super(NoQueryForTextGSuiteTrack, self)._init(**kwArgs)
Example #5
0
 def _parseVal(self, ge, valStr):
     """Store a target/control flag parsed from ``valStr`` in ``ge.val``.

     ``'1'`` becomes True and ``'0'`` becomes False. If
     ``self._handleNan(valStr)`` returns ``'nan'`` the value is set to
     BINARY_MISSING_VAL instead.

     Raises:
         InvalidFormatError: for any other string.
     """
     if self._handleNan(valStr) == 'nan':
         ge.val = BINARY_MISSING_VAL
         return

     if valStr == '0' or valStr == '1':
         ge.val = (valStr == '1')
         return

     raise InvalidFormatError('Could not parse value: ' + valStr +
                              ' as target/control.')
Example #6
0
 def _handleTrackDefinitionLineIfPresent(self, firstLine):
     """Set ``self._numHeaderLines`` according to the first line.

     The count is 0 when the line does not start with ``track`` and 1
     when it starts with ``track type=wiggle_0``.

     Raises:
         InvalidFormatError: if the line starts with ``track`` but not
             with ``track type=wiggle_0``.
     """
     if not firstLine.startswith('track'):
         self._numHeaderLines = 0
         return

     if not firstLine.startswith('track type=wiggle_0'):
         raise InvalidFormatError(
             'The wiggle track definition line must (if present) start with: track type=wiggle_0'
         )

     self._numHeaderLines = 1
Example #7
0
    def _parseVal(self, ge, cols):
        """Parse the BED score column (index 4) into ``ge.val``.

        Nothing happens when the file has fewer than five columns
        (``self._numCols < 5``). A score of ``'-'`` or ``'.'`` is stored
        as 0; any other score is converted with ``int``.

        Raises:
            InvalidFormatError: if the score is outside 0..1000. The
                message now includes the offending score; the original
                left its ``%s`` placeholder unformatted.
            ValueError: if the score is not an integer literal (raised by
                ``int``).
        """
        if self._numCols >= 5:
            scoreStr = cols[4]
            val = 0 if scoreStr in ('-', '.') else int(scoreStr)

            if val < 0 or val > 1000:
                raise InvalidFormatError(
                    "Error: BED score column must be an integer between 0 and 1000: %s. Perhaps you instead "
                    "should use the file formats 'valued.bed' or 'gtrack'?"
                    % scoreStr)
            ge.val = val
Example #8
0
    def _init(self, fileFormat=None, **kwArgs):
        """Force the file format to PREPROCESSED, then delegate to the
        parent ``_init``.

        The incoming ``fileFormat`` is first assigned to
        ``self.fileFormat`` and read back before it is checked. The
        parent ``_init`` receives the original ``fileFormat`` argument.

        Raises:
            InvalidFormatError: if the read-back format is neither None
                nor PREPROCESSED.
        """
        # To handle deprecated 'binary' value
        self.fileFormat = fileFormat

        storedFormat = self.fileFormat
        if storedFormat is not None and storedFormat != PREPROCESSED:
            requirement = ('Track protocol "%s" requires the file format '
                           % self.SCHEME)
            details = 'to be "%s", not "%s".' % (PREPROCESSED, fileFormat)
            raise InvalidFormatError(requirement + details)

        self.fileFormat = PREPROCESSED

        kwArgs['fileFormat'] = fileFormat
        super(PreprocessedGSuiteTrack, self)._init(**kwArgs)
Example #9
0
    def _checkFixedStep(self, line, start, step):
        """Validate a WIG declaration line against the current state.

        Returns:
            The result of ``self._isFixedStepLine(line)``.

        Raises:
            InvalidFormatError: if the declaration type differs from an
                already recorded ``self._fixedStep``, if a fixedStep line
                has no start value, or if a variableStep line has a start
                or step value.
        """
        isFixed = self._isFixedStepLine(line)

        if self._fixedStep is not None and self._fixedStep != isFixed:
            raise InvalidFormatError(
                'WIG fixedStep and variableStep declaration lines are not allowed mix within the same file.'
            )

        if isFixed and start is None:
            raise InvalidFormatError(
                'WIG fixedStep requires start values in the declaration line.'
            )

        if not isFixed and (start is not None or step is not None):
            raise InvalidFormatError(
                'WIG variableStep may not have start and step values in the declaration line.'
            )

        return isFixed
 def _getStrandFromString(cls, val):
     """Convert a strand symbol to its internal value.

     ``'+'`` gives True, ``'-'`` gives False and ``'.'`` gives
     BINARY_MISSING_VAL.

     Raises:
         InvalidFormatError: for any other value.
     """
     if val == '+':
         return True
     if val == '-':
         return False
     if val == '.':
         return BINARY_MISSING_VAL

     # Open question carried over from the original code: val == ''?
     raise InvalidFormatError(
         "Error: strand must be either '+', '-' or '.'. Value: %s" % val)
Example #11
0
    def getUserBinSource(self, regSpec, binSpec):
        """Build a user bin source for the given specifications.

        The source info object is looked up with
        ``self._getUserBinSourceInfo(regSpec)``. It generates the bin
        source and supplies the text stored as its ``description``.

        Returns:
            The generated user bin source.

        Raises:
            InvalidFormatError: if generating or describing the source
                raises any exception; that exception's text is embedded
                in the message. Errors from the info lookup itself
                propagate unchanged.
        """
        ubSourceInfo = self._getUserBinSourceInfo(regSpec)

        try:
            ubSource = ubSourceInfo.generateUserBinSource(regSpec, binSpec)
            ubSource.description = ubSourceInfo.describeUserBinSource(
                regSpec, binSpec)
            return ubSource
        # "except ... as" is accepted by Python 2.6+ and Python 3, unlike
        # the comma form.
        except Exception as e:
            raise InvalidFormatError(
                'Unable to parse region specification. Error message: "%s"' %
                e)
Example #12
0
    def _getUserBinSourceInfo(self, regSpec):
        """Return the user bin source info object for ``regSpec``.

        If ``regSpec`` has no entry in ``self._ubSourceInfoDict`` and is
        not a key of ``self._ALL_UB_SOURCE_INFO_CLS_DICT``, the entry
        stored under ``self._DEFAULT_KEY_WHEN_NO_MATCH`` is returned.

        Raises:
            InvalidFormatError: if ``regSpec`` is a key of
                ``self._ALL_UB_SOURCE_INFO_CLS_DICT`` but has no entry in
                ``self._ubSourceInfoDict``.
        """
        info = self._ubSourceInfoDict.get(regSpec)
        if info is not None:
            return info

        if regSpec in self._ALL_UB_SOURCE_INFO_CLS_DICT:
            name = self._ALL_UB_SOURCE_INFO_CLS_DICT[regSpec].NAME
            raise InvalidFormatError(
                'Cannot create user bins of type: "%s", as it is not ' % name +
                'available for the selected genome and tracks (if any).')

        return self._ubSourceInfoDict[self._DEFAULT_KEY_WHEN_NO_MATCH]
def _parseHeaderLine(line):
    """Split a header line into a ``(key, value)`` pair.

    The first two characters of ``line`` are skipped and the rest must
    contain exactly one ``':'``. The key is lowercased and the value is
    stripped. How the value is treated depends on the key:

    * GENOME_HEADER: the value is passed through ``urlDecodePhrase``.
    * keys in HEADER_VAR_DICT: the value is lowercased and must be among
      the allowed values; for FILE_TYPE_HEADER, TEXT is mapped to PRIMARY
      and BINARY to PREPROCESSED.
    * any other key is accepted as long as it does not end with a space
      and is unchanged by ``urlDecodePhrase``.

    Raises:
        InvalidFormatError: if any of these checks fails.
    """
    headerLine = line[2:]
    parts = headerLine.split(':')

    if len(parts) != 2:
        raise InvalidFormatError('Header line not understood: ' +
                                 repr(headerLine))

    key = parts[0].lower()
    val = parts[1].strip()

    if key == GENOME_HEADER:
        return key, urlDecodePhrase(val)

    if key not in HEADER_VAR_DICT:
        # Unknown header variables are not rejected; only their names are
        # validated.
        if key.endswith(' '):
            raise InvalidFormatError(
                'Header variable "%s" must not end with space.' % key)

        if urlDecodePhrase(key) != key:
            raise InvalidFormatError(
                'Custom header variable names in GSuite do not support URL '
                'escaping. Offending header variable: "{}"'.format(key))
        return key, val

    val = val.lower()

    if val not in HEADER_VAR_DICT[key].allowed:
        raise InvalidFormatError(
            'Value "%s" is not allowed for header "%s". Allowed values: %s'
            % (val, key, ', '.join(HEADER_VAR_DICT[key].allowed)))

    if key == FILE_TYPE_HEADER:
        if val == TEXT:
            val = PRIMARY
        elif val == BINARY:
            val = PREPROCESSED

    return key, val
def _parseColumnSpecLine(line):
    """Parse and validate a column specification line.

    The first three characters of ``line`` are skipped. The rest is
    lowercased and split on tabs into column names.

    Checks, in order:

    * no column name appears more than once;
    * the first column is present and equals URI_COL;
    * no column name is empty or whitespace only;
    * optional standard columns follow the order of
      OPTIONAL_STD_COL_NAMES and come before any non-standard column;
    * non-standard column names are unchanged by ``urlDecodePhrase``.

    Returns:
        The list of lowercased column names.

    Raises:
        InvalidFormatError: if any check fails.
    """
    from collections import Counter

    colNames = line[3:].lower().split('\t')

    # Count once instead of calling list.count() per column. Iterating
    # colNames in order still reports the first column that is duplicated.
    occurrences = Counter(colNames)
    for colName in colNames:
        if occurrences[colName] > 1:
            raise InvalidFormatError(
                'Column "%s" appears multiple times in the ' % colName +
                'column specification line.')

    if colNames[0] == '':
        raise InvalidFormatError(
            'Column specification line requires at least one '
            'column (the "uri" column), but none is specified.')

    if colNames[0] != URI_COL:
        raise InvalidFormatError('The first column must be "%s", not "%s".' %
                                 (URI_COL, colNames[0]))

    if any(colName.strip() == '' for colName in colNames):
        raise InvalidFormatError('Empty column names are not allowed.')

    curOptStdColIdx = -1
    nonStdColsFound = []
    for colName in colNames[1:]:
        if colName in OPTIONAL_STD_COL_NAMES:
            nextOptStdColIdx = OPTIONAL_STD_COL_NAMES.index(colName)

            if nonStdColsFound:
                raise InvalidFormatError(
                    'Non-standard columns "%s" ' % ', '.join(nonStdColsFound) +
                    'encountered before standard column "%s".' % colName)
            elif nextOptStdColIdx <= curOptStdColIdx:
                raise InvalidFormatError(
                    'Standard columns are not in the correct order: '
                    '%s.' % ', '.join('"%s"' % col
                                      for col in OPTIONAL_STD_COL_NAMES))

            curOptStdColIdx = nextOptStdColIdx
        else:
            if urlDecodePhrase(colName) != colName:
                raise InvalidFormatError(
                    'Column names in GSuite do not support URL escaping. '
                    'Offending column name: "{}"'.format(colName))
            nonStdColsFound.append(colName)

    return colNames
Example #15
0
    def attributes(self, attributes):
        """Replace the stored attributes with the given ones.

        Entries whose value is None are skipped. When ``self._doUnquote``
        is true, each remaining value is passed through ``unquote``
        before it is stored.

        Raises:
            InvalidFormatError: if a value is the empty string.
        """
        self._attributes = OrderedDict()

        for name, content in attributes.iteritems():
            if content is None:
                continue

            if content == '':
                raise InvalidFormatError(
                    'Empty attribute contents not allowed. '
                    'Please use ".", the period character, to '
                    'indicate missing values')

            self._attributes[name] = (unquote(content)
                                      if self._doUnquote else content)
Example #16
0
    def _next(self, line):
        """Process one line of a FASTA file.

        A line starting with ``'>'`` calls
        ``self._appendBoundingRegionTuple()``, resets the element count
        and records the sequence name (the first whitespace-separated
        token after ``'>'``); nothing is returned for such a line. Any
        other line is returned as a GenomeElement whose ``val`` holds the
        line as an array of single characters.

        Raises:
            InvalidFormatError: if a sequence line is seen while
                ``self._chr`` is still None.
        """
        if not line.startswith('>'):
            if self._chr is None:
                raise InvalidFormatError(
                    'FASTA file does not start with the ">" character.')

            self._elCount += len(line)
            element = GenomeElement(self._genome, self._chr)
            element.val = np.fromstring(line, dtype='S1')
            return element

        self._appendBoundingRegionTuple()
        self._elCount = 0
        seqName = line[1:].split()[0]
        self._chr = self._checkValidChr(seqName)
        return None
Example #17
0
    def _next(self, line):
        """Parse one 15-column line into ``self._genomeElement``.

        Columns used (0-based): 0 chromosome, 1 start, 2 end, 5 strand,
        12 experiment count, 13 comma-separated experiment ids, 14
        comma-separated experiment scores. ``val`` becomes a list of
        ``self._globExpCount`` entries, NaN by default, with each score
        stored at the index given by its experiment id.

        Returns:
            ``self._genomeElement``; the same object is updated and
            returned on every call.

        Raises:
            InvalidFormatError: if the line does not have 15 columns, if
                the number of ids or scores differs from the experiment
                count, or if an id is not below ``self._globExpCount``.
        """
        cols = line.split()
        if len(cols) != 15:
            raise InvalidFormatError(
                'File must contain exactly 15 columns, contains ' +
                str(len(cols)))

        ge = self._genomeElement
        ge.chr = self._checkValidChr(cols[0])
        ge.start = self._checkValidStart(ge.chr, int(cols[1]))
        ge.end = self._checkValidEnd(ge.chr, int(cols[2]), start=ge.start)
        ge.strand = self._getStrandFromString(cols[5])

        ge.val = [numpy.nan] * self._globExpCount
        expCount = int(cols[12])
        expIds = [int(x) for x in cols[13].split(',') if x != '']
        # The builtin float replaces numpy.float, which was only an alias
        # for it and has been removed from NumPy.
        expScores = [float(x) for x in cols[14].split(',') if x != '']

        if len(expIds) != expCount:
            raise InvalidFormatError('expId length (' + str(len(expIds)) +
                                     ') is not equal to expCount (' +
                                     str(expCount) + ')')
        if len(expScores) != expCount:
            # Report the length of expScores and the expected count; the
            # original printed len(expIds) and the score list instead.
            raise InvalidFormatError('expScores length (' +
                                     str(len(expScores)) +
                                     ') is not equal to expCount (' +
                                     str(expCount) + ')')

        # Both lists hold exactly expCount items at this point.
        for expId, expScore in zip(expIds, expScores):
            if expId >= self._globExpCount:
                raise InvalidFormatError(
                    'expId ' + str(expId) +
                    ' too large. expNames in header line defines ' +
                    str(self._globExpCount) + ' experiments. ' +
                    'This could be because of counting from 1 instead of from 0.')
            ge.val[expId] = expScore

        return ge
def _parseTrackLine(trackLine, colNames, headerVars):
    """Build a GSuiteTrack from one tab-separated track line.

    For every entry in ALL_STD_COL_SPECS the value is taken from the
    track line if ``_popValueFromColValsAndNamesIfPresent`` returns one.
    Otherwise it falls back to the matching header variable, unless that
    variable equals MULTIPLE. The columns left over after that become the
    track's attributes, and attributes whose value is ``'.'`` are dropped.

    Args:
        trackLine: the raw track line.
        colNames: column names from the column specification line; the
            first one must be URI_COL.
        headerVars: mapping of header names to values.

    Returns:
        The constructed GSuiteTrack.

    Raises:
        InvalidFormatError: if the number of values differs from the
            number of column names, or if GSuiteTrack rejects the values
            (the track line is then prepended to the message).
    """
    colVals = trackLine.split('\t')

    if len(colVals) != len(colNames):
        raise InvalidFormatError(
            'The number of columns in track line: %s ' % (repr(trackLine)) +
            'is not equal to the number of columns in the '
            'column specification line (%s != %s)' %
            (len(colVals), len(colNames)))

    # Work on a copy so the caller's column list is left untouched.
    remainingColNames = list(colNames)

    assert colNames[0] == URI_COL
    kwArgs = {}
    for colSpec in ALL_STD_COL_SPECS:
        val = _popValueFromColValsAndNamesIfPresent(colVals, remainingColNames,
                                                    colSpec.colName)
        if val is not None:
            kwArgs[colSpec.memberName] = val
        elif colSpec.headerName in headerVars:
            if headerVars[colSpec.headerName] != MULTIPLE:
                kwArgs[colSpec.memberName] = headerVars[colSpec.headerName]

    # Build the filtered mapping in a second pass rather than deleting
    # entries from the dict while iterating over it.
    allAttributes = OrderedDict(zip(remainingColNames, colVals))
    kwArgs['attributes'] = OrderedDict(
        (key, val) for key, val in allAttributes.items() if val != '.')

    try:
        track = GSuiteTrack(**kwArgs)
    except InvalidFormatError as e:
        errorMsg = 'Error in track line %s:\n' % repr(trackLine) + e.message
        raise InvalidFormatError(errorMsg)

    return track
Example #19
0
def _parseHeaderLine(line):
    """Split a header line into a validated ``(key, value)`` pair.

    The first two characters of ``line`` are skipped and the rest must
    contain exactly one ``':'``. The key is lowercased. The stripped
    value is passed through ``unquote`` for GENOME_HEADER and lowercased
    for every other key. Only keys in HEADER_VAR_DICT are accepted, and
    the value must be among that header's allowed values. For
    FILE_TYPE_HEADER, TEXT is mapped to PRIMARY and BINARY to
    PREPROCESSED.

    Raises:
        InvalidFormatError: if any of these checks fails.
    """
    headerLine = line[2:]
    fields = headerLine.split(':')

    if len(fields) != 2:
        raise InvalidFormatError('Header line not understood: ' +
                                 repr(headerLine))

    key = fields[0].lower()
    strippedVal = fields[1].strip()
    val = unquote(strippedVal) if key == GENOME_HEADER else strippedVal.lower()

    if key not in HEADER_VAR_DICT:
        if key.endswith(' '):
            template = 'Header variable "%s" must not end with space.'
        else:
            template = 'Header variable "%s" is not part of the GSuite format.'
        raise InvalidFormatError(template % key)

    allowedVals = HEADER_VAR_DICT[key].allowed
    if val not in allowedVals:
        raise InvalidFormatError(
            'Value "%s" is not allowed for header "%s". Allowed values: %s'
            % (val, key, ', '.join(allowedVals)))

    if key == FILE_TYPE_HEADER:
        if val == TEXT:
            val = PRIMARY
        elif val == BINARY:
            val = PREPROCESSED

    return key, val
    def __init__(self,
                 uri,
                 title=None,
                 fileFormat=None,
                 trackType=None,
                 genome=None,
                 attributes=None,
                 comment=None,
                 doUnquote=True):
        """Parse the URI, store the track metadata and call ``_init``.

        Args:
            uri: track URI; its scheme must equal ``self.SCHEME`` and it
                must not contain a fragment.
            title: stored as ``self.title``.
            fileFormat: file format; the FILE_FORMAT_HEADER default from
                HEADER_VAR_DICT is stored when None.
            trackType: track type; the TRACK_TYPE_HEADER default is
                stored when None.
            genome: genome name, passed through ``urlDecodePhrase``; the
                GENOME_HEADER default is stored when None.
            attributes: mapping of attribute names to values. None (the
                default) means no attributes; a fresh OrderedDict is
                created per call so instances never share a default
                object.
            comment: stored as ``self.comment``.
            doUnquote: when true, the parsed URI (and the query
                dictionary, if any) are passed through the unquote
                helpers.

        Raises:
            InvalidFormatError: if the URI has a fragment part.
            AssertionError: if the URI scheme differs from
                ``self.SCHEME``.
        """
        if attributes is None:
            attributes = OrderedDict()

        self._doUnquote = doUnquote

        self._parsedUri = urlparse.urlparse(uri)
        if self._parsedUri.query:
            self._queryDict = urlparse.parse_qs(self._parsedUri.query,
                                                keep_blank_values=False,
                                                strict_parsing=True)

        if doUnquote:
            self._parsedUri = unquoteParseResults(self._parsedUri)
            if self._parsedUri.query:
                self._queryDict = unquoteQueryDict(self._queryDict)

        assert self._parsedUri.scheme == self.SCHEME, [
            self._parsedUri.scheme, self.SCHEME
        ]
        if self._parsedUri.fragment != '':
            raise InvalidFormatError(
                'Fragment part of URI is not allowed: "#%s"' %
                self._parsedUri.fragment)

        self.title = title
        self.fileFormat = fileFormat if fileFormat is not None else \
            HEADER_VAR_DICT[FILE_FORMAT_HEADER].default
        self.trackType = trackType if trackType is not None else \
            HEADER_VAR_DICT[TRACK_TYPE_HEADER].default
        self.genome = urlDecodePhrase(genome) if genome is not None else \
            HEADER_VAR_DICT[GENOME_HEADER].default
        self.attributes = attributes
        self.comment = comment

        # _init receives the arguments as passed in, not the stored
        # values with defaults applied.
        self._init(uri=uri,
                   title=title,
                   fileFormat=fileFormat,
                   trackType=trackType,
                   genome=genome,
                   attributes=attributes,
                   comment=comment,
                   doUnquote=doUnquote)
Example #21
0
    def getStdTrackNameFromGalaxyTN(cls,
                                    galaxyTN,
                                    allowUnsupportedSuffixes=False):
        """Convert a Galaxy track name into a standard track name.

        ``galaxyTN`` may be a colon-separated string or an already split
        sequence. Its first element must be ``'galaxy'`` (case
        insensitive), its second element is the file suffix and its last
        element is used as the name.

        Raises:
            InvalidFormatError: if the suffix is not supported and
                ``allowUnsupportedSuffixes`` is false.
        """
        if isinstance(galaxyTN, basestring):
            galaxyTN = galaxyTN.split(':')

        assert galaxyTN[0].lower() == 'galaxy', str(galaxyTN)

        if not allowUnsupportedSuffixes:
            if galaxyTN[1].lower() not in getSupportedFileSuffixes():
                raise InvalidFormatError('File type "%s" is not supported.' %
                                         galaxyTN[1].lower())

        galaxyFn = cls.extractFnFromGalaxyTN(galaxyTN)
        galaxyId = cls.extractIdFromGalaxyFn(galaxyFn)
        trackLabel = galaxyTN[-1]
        return ExternalTrackManager.createStdTrackName(galaxyId, trackLabel)
    def __iter__(self):
        """Yield elements until the chromosome changes or input ends.

        The current element is yielded first, then the next one is
        fetched from ``self._geIter``. When the fetched element belongs
        to a chromosome other than the last one in ``self._chrList``,
        that chromosome is appended to the list and iteration stops; the
        fetched element stays in ``self._curEl``. When ``self._geIter``
        is exhausted, ``self._finished`` is set and iteration ends.

        Raises:
            InvalidFormatError: if the fetched element belongs to a
                chromosome that was already seen earlier.
        """
        try:
            while not self._finished:
                yield self._curEl
                # The next() builtin works on Python 2.6+ and Python 3.
                self._curEl = next(self._geIter)
                curChr = self._curEl.chr
                if curChr != self._chrList[-1]:
                    if curChr in self._chrList:
                        raise InvalidFormatError(
                            'Error: chromosome %s has been previously encountered. Dense datasets must not skip back and forth between chromosomes.'
                            % curChr)
                    self._chrList.append(curChr)
                    break

        except StopIteration:
            self._finished = True
            # End the generator with return. Re-raising StopIteration
            # inside a generator becomes RuntimeError under PEP 479.
            return
    def addTrack(self, track, allowDuplicateTitles=True):
        """Register a track, renaming it if its title is already taken.

        When the title exists and duplicates are allowed, the track's
        title is changed to the first free candidate of the form
        ``'<title> (N)'`` with N starting at 2. The track is then stored
        in the title dictionary and the track list, and
        ``self._updatedHeaders`` is reset to False.

        Raises:
            InvalidFormatError: if the title exists and
                ``allowDuplicateTitles`` is false.
        """
        if track.title in self._titleToTrackDict:
            if not allowDuplicateTitles:
                raise InvalidFormatError(
                    'Multiple tracks with the same title is not allowed: ' +
                    track.title)

            for suffixNum in range(2, self.numTracks() + 2):
                candidate = track.title + ' (%s)' % suffixNum
                if candidate not in self._titleToTrackDict:
                    track.title = candidate
                    break

        self._updatedHeaders = False
        self._titleToTrackDict[track.title] = track
        self._trackList.append(track)
    def _combineTrackTypeVals(self, curVal, nextVal):
        """Combine two track type values into one.

        ``self._combineEqualVals`` is tried first. If it raises
        InvalidFormatError, the lowercased format name of the maximal
        common core format of the two types is returned instead.

        Raises:
            InvalidFormatError: if there is no common core format.
        """
        try:
            return self._combineEqualVals(curVal, nextVal)
        except InvalidFormatError:
            from gold.track.TrackFormat import TrackFormatReq

            commonCore = TrackFormatReq.maxCommonCoreFormat(
                TrackFormatReq(name=curVal), TrackFormatReq(name=nextVal))
            if commonCore is None:
                raise InvalidFormatError(
                    'Track types "%s" and "%s" are not possible to combine. ' %
                    (curVal, nextVal))

            return commonCore.getFormatName().lower()
    def customHeaders(self, customHeaders):
        """Replace the stored custom headers with the given ones.

        Entries whose value is None are skipped. Every other entry is
        stored through ``self.setCustomHeader``.

        Raises:
            InvalidFormatError: if a value is the empty string.
            ArgumentValueError: if the lowercased name of a header is
                already present among the stored custom headers.
        """
        self._customHeaders = OrderedDict()

        for headerName, headerVal in customHeaders.iteritems():
            if headerVal is None:
                continue

            if headerVal == '':
                raise InvalidFormatError(
                    'Empty header values not allowed. '
                    'Please use ".", the period character, to '
                    'indicate missing values')

            if headerName.lower() in self._customHeaders:
                raise ArgumentValueError(
                    'Custom header "{}" appears multiple times in the '
                    'header list. Note that custom headers are case '
                    'insensitive (e.g., "ABC" and "abc" is the same '
                    'header).'.format(headerName))

            self.setCustomHeader(headerName, headerVal)
Example #26
0
    def __init__(self, genome, trackNameList, **kwArgs):
        """Instantiate every user bin source info class and keep the
        available ones.

        Each class in ``self._ALL_UB_SOURCE_INFO_CLS_DICT`` is
        constructed with ``genome``, ``trackNameList`` and ``kwArgs``.
        Instances reporting ``isAvailable()`` are stored twice: under
        their ``NAME`` and under their key in the class dictionary.

        Raises:
            AssertionError: if ``genome`` is None.
            InvalidFormatError: if the genome is not installed.
        """
        self._ubSourceInfoDict = OrderedDict()
        self._ubSourceInfoDictFromName = OrderedDict()

        assert genome is not None

        from quick.util.GenomeInfo import GenomeInfo
        if not GenomeInfo(genome).isInstalled():
            raise InvalidFormatError(
                'The specified genome "%s" is not installed.' % genome)

        for key in self._ALL_UB_SOURCE_INFO_CLS_DICT:
            infoCls = self._ALL_UB_SOURCE_INFO_CLS_DICT[key]
            info = infoCls(genome, trackNameList, **kwArgs)
            if not info.isAvailable():
                continue

            self._ubSourceInfoDictFromName[info.NAME] = info
            self._ubSourceInfoDict[key] = info
    def _compute(self):
        """Return the segments between the track's points as text lines.

        The first child result is the bin size and the second is the
        track view. The view's start positions are used as boundaries for
        segments covering 0 to ``binSize - 1``, and the segments are then
        shifted by ``tv.genomeAnchor.start``. Segments are numbered from
        0 upwards, or in reverse when the strand value equals 0.

        Returns:
            A newline-joined string with one line per segment, in the
            form ``chr<TAB>start<TAB>end<TAB>number<TAB>strand``. The
            string is empty when the track view has no elements.

        Raises:
            InvalidFormatError: if the bin contains more than one
                distinct strand value.
        """
        binSize = self._children[0].getResult()
        tv = self._children[1].getResult()
        starts = list(tv.startsAsNumpyArray())
        ends = starts[:]
        vals = strandType = None
        if len(starts) > 0:
            if starts[0] > 0:
                starts.insert(0, 0)
            else:
                del ends[0]

            if len(ends) == 0 or ends[-1] < binSize - 1:
                ends.append(binSize - 1)
            else:
                del starts[-1]

            strands = tv.strandsAsNumpyArray()

            # Identity test: "!= None" on a numpy array is evaluated
            # element-wise and does not give a single truth value.
            if strands is not None:
                strands = set(strands)
                if len(strands) > 1:
                    raise InvalidFormatError(
                        'All strands within a bin must be of same sort: error at %s'
                        % (tv.genomeAnchor))
                strandType = strands.pop()

            vals = range(len(starts) -
                         1, -1, -1) if strandType == 0 else range(len(starts))

            starts = np.array(starts) + tv.genomeAnchor.start
            ends = np.array(ends) + tv.genomeAnchor.start

        strTemplate = self._region.chr + '\t%s\t%s\t%s\t' + getStringFromStrand(
            strandType)

        # vals stays None only when there are no elements, in which case
        # no lines are produced.
        rows = zip(starts, ends, vals) if vals is not None else []
        return '\n'.join([
            strTemplate % (str(start), str(end), str(val))
            for start, end, val in rows
        ])
Example #28
0
 def checkIfEdgeIdsExist(genome, trackName, allowOverlaps):
     """Verify that every id referenced in 'edges' exists as an 'id'.

     Nothing is checked when the collector's track format is not linked.
     Otherwise the unique ids and unique edge ids are gathered across all
     preprocessed chromosomes, ignoring empty strings.

     Raises:
         InvalidFormatError: if any edge id has no matching id; the
             message lists the missing ids in sorted order.
     """
     collector = PreProcMetaDataCollector(genome, trackName)
     if not collector.getTrackFormat().isLinked():
         return

     knownIds = numpy.array([], dtype='S')
     referencedIds = numpy.array([], dtype='S')

     for chrom in collector.getPreProcessedChrs(allowOverlaps):
         trackData = TrackSource().getTrackData(trackName, genome, chrom,
                                                allowOverlaps)
         chromIds = trackData['id'][:]
         knownIds = numpy.unique(numpy.concatenate((knownIds, chromIds)))
         chromEdgeIds = trackData['edges'][:].flatten()
         referencedIds = numpy.unique(
             numpy.concatenate((referencedIds, chromEdgeIds)))

     knownIds = knownIds[knownIds != '']
     referencedIds = referencedIds[referencedIds != '']

     missingIds = set(referencedIds) - set(knownIds)
     if len(missingIds) > 0:
         raise InvalidFormatError(
             "Error: the following ids specified in the 'edges' column do not exist in the dataset: "
             + ', '.join(sorted(missingIds)))
    def attributes(self, attributes):
        """Replace the stored attributes with the given ones.

        Entries whose value is None are skipped. When ``self._doUnquote``
        is true, each remaining value is passed through
        ``urlDecodePhrase``. Entries are stored through
        ``self.setAttribute``.

        Raises:
            InvalidFormatError: if a value is the empty string.
            ArgumentValueError: if the lowercased name of an attribute is
                already present among the stored attributes.
        """
        self._attributes = OrderedDict()

        for attrName, attrVal in attributes.iteritems():
            if attrVal is None:
                continue

            if attrVal == '':
                raise InvalidFormatError(
                    'Empty attribute contents not allowed. '
                    'Please use ".", the period character, to '
                    'indicate missing values')

            if self._doUnquote:
                attrVal = urlDecodePhrase(attrVal)

            if attrName.lower() in self._attributes:
                raise ArgumentValueError(
                    'Attribute "{}" appears multiple times in the '
                    'attribute list. Note that attributes are case '
                    'insensitive (e.g., "ABC" and "abc" is the same '
                    'attribute).'.format(attrName))

            self.setAttribute(attrName, attrVal)
Example #30
0
    def _next(self, line):
        """Parse one GFF line into a GenomeElement.

        A line starting with ``##FASTA`` ends the iteration. Any other
        line starting with ``'#'`` gives None. A data line must have nine
        tab-separated columns, each of which is passed through
        ``unquote``. The start column is reduced by one before it is
        validated. ``id`` and ``name`` are picked out of the
        ``key=value`` pairs in the raw ninth column.

        Raises:
            StopIteration: on a ``##FASTA`` line.
            InvalidFormatError: if a data line does not have nine
                columns.
        """
        if line.startswith('##FASTA'):
            raise StopIteration

        if line.startswith('#'):
            return None

        rawCols = line.split('\t')
        cols = [unquote(rawCol) for rawCol in rawCols]

        if len(cols) != 9:
            raise InvalidFormatError(
                "Error: GFF files must contain 9 tab-separated columns")

        ge = GenomeElement(self._genome)
        ge.chr = self._checkValidChr(cols[0])
        ge.source = cols[1]

        self._parseThirdCol(ge, cols[2])

        ge.start = self._checkValidStart(ge.chr, int(cols[3]) - 1)
        ge.end = self._checkValidEnd(ge.chr, int(cols[4]), start=ge.start)

        self._parseSixthCol(ge, cols[5])

        ge.strand = self._getStrandFromString(cols[6])
        ge.phase = cols[7]
        ge.attributes = cols[8]

        # The raw column is split before unquoting, so only the values
        # are unquoted here.
        for pair in rawCols[8].split(';'):
            pieces = pair.split('=')
            if len(pieces) != 2:
                continue

            attrKey = pieces[0].lower()
            if attrKey == 'id':
                ge.id = unquote(pieces[1])
            elif attrKey == 'name':
                ge.name = unquote(pieces[1])

        return ge