def _determineHeaderLines(self, hbColumns, columns):
    '''
    Populate the header dict for the file to be composed.

    Header values are derived from the column lists and from the wrapped
    genome element source. When extended GTrack is in use, the columns may be
    adjusted and rearranged to fit a subtype.

    Returns the (possibly rearranged) tuple (hbColumns, columns).
    '''
    setHeader = self._setHeaderDict
    geSource = self._geSource

    setHeader('track type', Gtrack.getTrackTypeFromColumnSpec(columns))
    setHeader('value type', self._getGtrackValueType())
    setHeader('value dimension',
              Gtrack.getGtrackValueDimension(geSource.getValDim()))
    setHeader('undirected edges', geSource.hasUndirectedEdges())
    setHeader('edge weights', ('weights' in hbColumns))
    setHeader('edge weight type', self._getGtrackEdgeWeightType())
    setHeader('edge weight dimension',
              Gtrack.getGtrackValueDimension(geSource.getEdgeWeightDim()))
    setHeader('uninterrupted data lines',
              not self._hasMoreThanOneBoundingRegion())
    setHeader('sorted elements', geSource.isSorted())
    setHeader('no overlapping elements', geSource.hasNoOverlappingElements())
    setHeader('circular elements', geSource.hasCircularElements())

    # Subtype matching is only attempted for extended GTrack; otherwise the
    # file is treated as not complying with any subtype.
    compliesToSubtype = False
    if self._USE_EXTENDED_GTRACK:
        setHeader('fixed length', geSource.getFixedLength())
        setHeader('fixed gap size', geSource.getFixedGapSize())
        setHeader('fixed-size data lines',
                  self._determineIfFixedSizeDataLines(columns))
        if self._headerDict['fixed-size data lines']:
            setHeader('data line size', geSource.getValDim())

        hbColumns, columns = \
            self._adjustColumnsAccordingToHeaderLines(hbColumns, columns)
        hbColumns, columns, compliesToSubtype = \
            self._determineIfFileCompliesToSubtypes(hbColumns, columns)

    if not compliesToSubtype:
        # No subtype dictates the coordinate convention, so use the input's.
        self._setHeaderDict('1-indexed', self._geSource.inputIsOneIndexed())
        self._setHeaderDict('end inclusive',
                            self._geSource.inputIsEndInclusive())

    # Forced headers only fill in keys that are not already present.
    for forcedKey, forcedVal in self._forcedHeaderDict.iteritems():
        if forcedKey in self._headerDict:
            continue
        self._headerDict[forcedKey] = forcedVal

    return hbColumns, columns
def getOptionsBox5(prevChoices):
    '''
    Return the list of keys on which the source and database GTrack files can
    be matched.

    prevChoices is the list of selections made in the previous input boxes;
    index 2 holds the source file and index 3 the database file. Returns None
    unless both are selected. Otherwise the list holds 'Element id' when both
    files have an 'id' column and 'Positional information' when they share a
    'start' or an 'end' column.
    '''
    if not (prevChoices[2] and prevChoices[3]):
        return None

    fnSource = ExternalTrackManager.extractFnFromGalaxyTN(
        prevChoices[2].split(':'))
    fnDB = ExternalTrackManager.extractFnFromGalaxyTN(
        prevChoices[3].split(':'))

    dbColumnNames = GtrackGenomeElementSource(fnDB).getColumnSpec().keys()
    sourceColumnNames = \
        GtrackGenomeElementSource(fnSource).getColumnSpec().keys()
    sharedColumns = set(dbColumnNames) & set(sourceColumnNames)

    resultlist = []
    if 'id' in sharedColumns:
        resultlist.append('Element id')
    if 'start' in sharedColumns or 'end' in sharedColumns:
        resultlist.append('Positional information')
    return resultlist
def _getHeaders(prevChoices):
    '''
    Return the list of column headers chosen for the tabular input.

    With 'Select individual columns', one entry per input column is built
    from the 'column<i>' / 'customColumn<i>' choices; ignored or missing
    columns give ''. Otherwise the headers are read from the GTrack file in
    prevChoices.colSpecFile, truncated to the number of input columns.

    Returns [] if the GTrack file is not selected or cannot be read.
    '''
    numCols = TabularToGtrackTool._getFileContentsInfo(prevChoices).numCols

    if prevChoices.columnSelection == 'Select individual columns':
        header = []
        for i in xrange(numCols):
            if hasattr(prevChoices, 'column%s' % i):
                colHeader = getattr(prevChoices, 'column%s' % i)
                if colHeader is None or colHeader == '-- ignore --':
                    header.append('')
                elif colHeader == '-- custom --':
                    header.append(
                        getattr(prevChoices, 'customColumn%s' % i).strip())
                else:
                    header.append(colHeader)
            else:
                header.append('')
        return header
    else:
        genome = prevChoices.genome if prevChoices.selectGenome == 'Yes' else None
        try:
            # Resolving the file name is inside the try so that an unset
            # colSpecFile gives an empty header list rather than an
            # AttributeError.
            inFn = ExternalTrackManager.extractFnFromGalaxyTN(
                prevChoices.colSpecFile.split(':'))
            geSource = GtrackGenomeElementSource(inFn, genome=genome)
            geSource.parseFirstDataLine()
            return geSource.getColumns()[:numCols]
        except Exception:
            return []
def _createColumnSpec(self, cols, addAnyExtraFixedCols=True):
    '''
    Create the column spec via the base class, then store the track type
    derived from that spec under the 'track type' header.
    '''
    baseCls = GtrackGenomeElementSource
    baseCls._createColumnSpec(self, cols, addAnyExtraFixedCols)

    trackType = baseCls.getTrackTypeFromColumnSpec(self._columnSpec)
    self._headerDict['track type'] = trackType
def _getHeaders(prevChoices):
    '''
    Return the list of column headers chosen for the tabular input.

    When the columns are based on an existing GTrack file, the headers are
    read from that file and truncated to the number of input columns; any
    failure gives []. Otherwise one entry per input column is built from the
    'column<i>' / 'customColumn<i>' choices, with '' for ignored or missing
    columns.
    '''
    numCols = TabularToGtrackTool._getFileContentsInfo(prevChoices).numCols

    if prevChoices.columnSelection == 'Base columns on existing GTrack file':
        if prevChoices.selectGenome == 'Yes':
            genome = prevChoices.genome
        else:
            genome = None

        try:
            colSpecFn = ExternalTrackManager.extractFnFromGalaxyTN(
                prevChoices.colSpecFile.split(':'))
            colSpecGESource = GtrackGenomeElementSource(colSpecFn,
                                                        genome=genome)
            colSpecGESource.parseFirstDataLine()
            return colSpecGESource.getColumns()[:numCols]
        except Exception:
            return []

    header = []
    for colIdx in xrange(numCols):
        choiceName = 'column%s' % colIdx
        if not hasattr(prevChoices, choiceName):
            header.append('')
            continue

        colHeader = getattr(prevChoices, choiceName)
        if colHeader is None or colHeader == '-- ignore --':
            header.append('')
        elif colHeader == '-- custom --':
            customName = getattr(prevChoices, 'customColumn%s' % colIdx)
            header.append(customName.strip())
        else:
            header.append(colHeader)
    return header
def _commonComplementGtrackFile(origFn, dbFn, intersectingFactor, gtrackColsToAdd, genome):
    '''
    Complement the elements of the GTrack file origFn with columns taken from
    the GTrack file dbFn, and return the result of
    expandHeadersOfGtrackFileAndReturnComposer on the composed contents.

    intersectingFactor selects how elements are matched: 'id' or 'position'.
    gtrackColsToAdd lists the GTrack columns to take from the database file;
    for 'value' and 'edges' the corresponding type and dimension headers of
    the database file are forced onto the output.

    Raises ShouldNotOccurError for any other intersectingFactor.
    '''
    origGESource = GtrackGenomeElementSource(origFn, genome)
    dbGESource = GtrackGenomeElementSource(dbFn, genome)

    dbPrefixes = dbGESource.getPrefixList()

    if intersectingFactor == 'id':
        fullDbDict = IdFullInfoDict(dbGESource, dbPrefixes)
    elif intersectingFactor == 'position':
        fullDbDict = TupleFullInfoDict(dbGESource, dbPrefixes)
    else:
        # Previously a bare name with no 'raise', so execution continued with
        # fullDbDict unbound.
        raise ShouldNotOccurError()

    forcedHeaderDict = {}
    dbHeaderDict = dbGESource.getHeaderDict()

    if 'value' in gtrackColsToAdd:
        forcedHeaderDict['value type'] = dbHeaderDict['value type']
        forcedHeaderDict['value dimension'] = dbHeaderDict['value dimension']
    if 'edges' in gtrackColsToAdd:
        forcedHeaderDict['edge weight type'] = dbHeaderDict['edge weight type']
        forcedHeaderDict['edge weight dimension'] = dbHeaderDict['edge weight dimension']

    composerCls = ExtendedGtrackComposer if origGESource.isExtendedGtrackFile() else StdGtrackComposer
    composedFile = composerCls(
        ElementComplementer(origGESource, fullDbDict, gtrackColsToAdd),
        forcedHeaderDict=forcedHeaderDict).returnComposed()

    return expandHeadersOfGtrackFileAndReturnComposer('', genome, strToUseInsteadOfFn=composedFile)
def getOptionsBox6(prevChoices):
    '''
    Return an OrderedDict mapping each column found only in the database
    GTrack file (prevChoices[3]) to False, in the database file's column
    order.

    Returns None unless both the source file (prevChoices[2]) and the
    database file are selected, matching the guard in getOptionsBox5.
    '''
    if not (prevChoices[2] and prevChoices[3]):
        return None

    fnSource = ExternalTrackManager.extractFnFromGalaxyTN(prevChoices[2].split(':'))
    fnDB = ExternalTrackManager.extractFnFromGalaxyTN(prevChoices[3].split(':'))

    gtrackDB = GtrackGenomeElementSource(fnDB)
    gtrackSource = GtrackGenomeElementSource(fnSource)

    # Fetch the source columns once instead of once per database column.
    sourceColumns = gtrackSource.getColumns()

    extraDbColumnsDict = OrderedDict()
    for column in gtrackDB.getColumns():
        if column not in sourceColumns:
            extraDbColumnsDict[column] = False
    return extraDbColumnsDict
def _determineHeaderLines(self, hbColumns, columns):
    '''
    Set the header values describing the contents to be composed.

    Values come from the column lists and from self._geSource. For extended
    GTrack the fixed-size headers are set as well and the columns are
    adjusted and matched against the subtypes.

    Returns the resulting (hbColumns, columns) pair.
    '''
    source = self._geSource
    put = self._setHeaderDict

    put('track type', Gtrack.getTrackTypeFromColumnSpec(columns))
    put('value type', self._getGtrackValueType())

    valueDimension = Gtrack.getGtrackValueDimension(source.getValDim())
    put('value dimension', valueDimension)

    put('undirected edges', source.hasUndirectedEdges())
    put('edge weights', ('weights' in hbColumns))
    put('edge weight type', self._getGtrackEdgeWeightType())

    edgeWeightDimension = \
        Gtrack.getGtrackValueDimension(source.getEdgeWeightDim())
    put('edge weight dimension', edgeWeightDimension)

    put('uninterrupted data lines', not self._hasMoreThanOneBoundingRegion())
    put('sorted elements', source.isSorted())
    put('no overlapping elements', source.hasNoOverlappingElements())
    put('circular elements', source.hasCircularElements())

    compliesToSubtype = False
    if self._USE_EXTENDED_GTRACK:
        put('fixed length', source.getFixedLength())
        put('fixed gap size', source.getFixedGapSize())
        put('fixed-size data lines',
            self._determineIfFixedSizeDataLines(columns))
        if self._headerDict['fixed-size data lines']:
            put('data line size', source.getValDim())

        adjusted = self._adjustColumnsAccordingToHeaderLines(hbColumns,
                                                             columns)
        hbColumns, columns = adjusted
        matched = self._determineIfFileCompliesToSubtypes(hbColumns, columns)
        hbColumns, columns, compliesToSubtype = matched

    if not compliesToSubtype:
        # Without a matching subtype the coordinate convention is taken from
        # the input itself.
        self._setHeaderDict('1-indexed', self._geSource.inputIsOneIndexed())
        self._setHeaderDict('end inclusive',
                            self._geSource.inputIsEndInclusive())

    # Forced headers never overwrite a header that is already present.
    for header, val in self._forcedHeaderDict.iteritems():
        if header not in self._headerDict:
            self._headerDict[header] = val

    return hbColumns, columns
def _checkValidEnd(self, chr, end, start=None):
    '''
    Validate an end position, treating end <= start as a circular element.

    When start is given and end does not exceed it, the 'circular elements'
    header is switched on (if it is not already truthy) and the base class
    check is called without a start position.
    '''
    wrapsAround = start is not None and end <= start
    if wrapsAround:
        headers = self._headerDict
        if not headers['circular elements']:
            headers['circular elements'] = True

    startToCheck = None if wrapsAround else start
    return GtrackGenomeElementSource._checkValidEnd(self, chr, end,
                                                    startToCheck)
def getGeSourceList(cls, genome, tracks):
    '''
    Build one genome element source per track.

    Each track is first treated as a Galaxy history element and wrapped
    according to its file suffix ('category.bed', 'gtrack', or BED for
    anything else). If that fails, the track is assumed to live in the
    HyperBrowser track repository and is loaded from there with
    allowOverlaps=True.

    Returns (geSourceList, trackNamesWithoutPath), two lists of equal length
    in the order of tracks.
    '''
    from quick.application.ExternalTrackManager import ExternalTrackManager
    from gold.origdata.BedGenomeElementSource import BedGenomeElementSource, BedCategoryGenomeElementSource
    from gold.origdata.GtrackGenomeElementSource import GtrackGenomeElementSource
    from gold.origdata.TrackGenomeElementSource import FullTrackGenomeElementSource

    geSourceList = []
    trackNamesWithoutPath = []

    for track in tracks:
        # Both values are computed before either list is touched, so a
        # failure halfway through cannot leave the lists out of step.
        try:
            fileType = ExternalTrackManager.extractFileSuffixFromGalaxyTN(track)
            fn = ExternalTrackManager.extractFnFromGalaxyTN(track)
            if fileType == 'category.bed':
                geSource = BedCategoryGenomeElementSource(fn)
            elif fileType == 'gtrack':
                geSource = GtrackGenomeElementSource(fn)
            else:
                geSource = BedGenomeElementSource(fn)
            trackName = ExternalTrackManager.extractNameFromHistoryTN(track)
        except Exception:
            # Not a history element, so it must be in the HB track repository.
            geSource = FullTrackGenomeElementSource(genome, track, allowOverlaps=True)
            trackName = ':'.join(track)

        geSourceList.append(geSource)
        trackNamesWithoutPath.append(trackName)

    return geSourceList, trackNamesWithoutPath
def _getValInCorrectType(self, val, valueOrEdgeWeight='value', isEmptyElement=False):
    '''
    Find the first value type that val fits and record it, then let the base
    class convert the value.

    Types are tried in the order binary, number, category, character. Types
    ordered before the one recorded earlier for valueOrEdgeWeight are
    skipped. The '<valueOrEdgeWeight> type' and '<valueOrEdgeWeight>
    dimension' headers are updated only if the file itself does not define
    them.

    Raises ShouldNotOccurError if no type matches.
    '''
    headerDictInFile = self.getHeaderDictInFile()
    valTypeList = ['binary', 'number', 'category', 'character']
    typeHeader = '%s type' % valueOrEdgeWeight
    dimHeader = '%s dimension' % valueOrEdgeWeight

    for typeIdx, valueType in enumerate(valTypeList):
        if valueOrEdgeWeight in self._valTypeIndexDict:
            if self._valTypeIndexDict[valueOrEdgeWeight] > typeIdx:
                continue

        valTypeInfo = GtrackGenomeElementSource.VAL_TYPE_DICT[valueType]
        if not self._isValOfParticularType(val, valTypeInfo):
            continue

        self._noteIfAllValuesAreMissing(valueOrEdgeWeight, val, valTypeInfo)
        self._valTypeIndexDict[valueOrEdgeWeight] = typeIdx
        valueDim = self._getGtrackValueDim(val, valTypeInfo,
                                           valueOrEdgeWeight)

        if typeHeader not in headerDictInFile:
            self._headerDict[typeHeader] = valueType
        if dimHeader not in headerDictInFile:
            self._headerDict[dimHeader] = valueDim

        return GtrackGenomeElementSource._getValInCorrectType(
            self, val, valueOrEdgeWeight, isEmptyElement)

    raise ShouldNotOccurError()
def _checkValidEnd(self, chr, end, start=None):
    '''
    Check the end position via the base class, allowing circular elements.

    An element whose end is not after its start marks the file as having
    circular elements; the start is then withheld from the base class check.
    '''
    if start is None or end > start:
        return GtrackGenomeElementSource._checkValidEnd(self, chr, end, start)

    # end <= start: make sure the header reflects the circular element.
    if not self._headerDict['circular elements']:
        self._headerDict['circular elements'] = True
    return GtrackGenomeElementSource._checkValidEnd(self, chr, end, None)
def execute(cls, choices, galaxyFn=None, username=''):
    '''
    Run the tool: sort the elements of the binning track (choices[3]) and
    pass them, with the first track (choices[2]), to PrintResultToHistItem.

    choices[0] is the genome and choices[1] says whether the first track
    comes from history. The binning track must be a gtrack or BED-type
    history element; anything else raises InvalidFormatError.
    '''
    from quick.application.ExternalTrackManager import ExternalTrackManager

    genome = choices[0]

    if choices[1] == 'history':
        preProcTN1 = ExternalTrackManager.getPreProcessedTrackFromGalaxyTN(
            genome, choices[2].split(':'))
    else:
        preProcTN1 = choices[2].split(':')

    # NOTE(review): chrSizeDict is built but not used further in this method.
    chrSizeDict = {}
    for chrom in GenomeInfo.getChrList(genome):
        chrSizeDict[chrom] = GenomeInfo.getChrLen(genome, chrom)

    binningTN = choices[3].split(':')
    trackType = binningTN[1]
    fnSource = ExternalTrackManager.extractFnFromGalaxyTN(binningTN)

    if trackType in ['valued.bed', 'category.bed', 'bed']:
        geSourceCls = BedGenomeElementSource
    elif trackType == 'gtrack':
        geSourceCls = GtrackGenomeElementSource
    else:
        raise InvalidFormatError(
            'The Binning must be of the following formats: gtrack, valued.bed, category.bed ,bed ...'
        )

    sortedSource = GenomeElementSorter(geSourceCls(fnSource, genome=genome))
    geSource = sortedSource.__iter__()

    cls.PrintResultToHistItem(galaxyFn, geSource, preProcTN1, genome, username)
def _commonComplementGtrackFile(origFn, dbFn, intersectingFactor, gtrackColsToAdd, genome):
    '''
    Complement the elements of the GTrack file origFn with columns from the
    GTrack file dbFn.

    Elements are matched on 'id' or on 'position', as given by
    intersectingFactor. When 'value' or 'edges' is among gtrackColsToAdd, the
    matching type/dimension headers of the database file are forced onto the
    composed output. The composed contents are handed to
    expandHeadersOfGtrackFileAndReturnComposer, whose result is returned.

    Raises ShouldNotOccurError for an unknown intersectingFactor.
    '''
    origGESource = GtrackGenomeElementSource(origFn, genome)
    dbGESource = GtrackGenomeElementSource(dbFn, genome)

    dbPrefixes = dbGESource.getPrefixList()

    if intersectingFactor == 'id':
        fullDbDict = IdFullInfoDict(dbGESource, dbPrefixes)
    elif intersectingFactor == 'position':
        fullDbDict = TupleFullInfoDict(dbGESource, dbPrefixes)
    else:
        # The exception was previously only named, never raised, so the
        # function continued and failed later on the unbound fullDbDict.
        raise ShouldNotOccurError()

    forcedHeaderDict = {}
    dbHeaderDict = dbGESource.getHeaderDict()

    if 'value' in gtrackColsToAdd:
        for header in ('value type', 'value dimension'):
            forcedHeaderDict[header] = dbHeaderDict[header]
    if 'edges' in gtrackColsToAdd:
        for header in ('edge weight type', 'edge weight dimension'):
            forcedHeaderDict[header] = dbHeaderDict[header]

    composerCls = ExtendedGtrackComposer if origGESource.isExtendedGtrackFile() else StdGtrackComposer
    composedFile = composerCls(
        ElementComplementer(origGESource, fullDbDict, gtrackColsToAdd),
        forcedHeaderDict=forcedHeaderDict).returnComposed()

    return expandHeadersOfGtrackFileAndReturnComposer('', genome, strToUseInsteadOfFn=composedFile)
def __init__(self, geSource, fullDbDict, gtrackColsToAdd):
    '''
    Wrap geSource so that its elements can be complemented from fullDbDict.

    gtrackColsToAdd holds GTrack column names; they are converted with
    GtrackGenomeElementSource.convertNameFromGtrack, and 'weights' is added
    whenever 'edges' is among the converted names. The resulting prefixes
    are appended to the wrapped source's prefix list.
    '''
    prefixes = []
    for gtrackCol in gtrackColsToAdd:
        prefixes.append(
            GtrackGenomeElementSource.convertNameFromGtrack(gtrackCol))
    if 'edges' in prefixes:
        prefixes.append('weights')
    self._prefixesToAdd = prefixes

    ElementModifierGESourceWrapper.__init__(self, geSource)

    self._fullDbDict = fullDbDict
    self._prefixList = geSource.getPrefixList() + self._prefixesToAdd
def _parseEdges(self, edgeStr):
    '''
    Inspect the edge weights in edgeStr before handing it to the base class.

    edgeStr is either '.' or a ';'-separated list of edge specifications,
    each optionally of the form '<id>=<weight>'. For every weighted edge the
    'edge weights' header is switched on and the weight is passed through
    _getValInCorrectType as an 'edge weight'.
    '''
    if edgeStr != '.':
        for edgeSpec in edgeStr.split(';'):
            # Test the individual spec, not the whole string: with mixed
            # weighted and unweighted edges, split('=')[1] would otherwise
            # raise IndexError on an unweighted spec.
            if '=' in edgeSpec:
                if not self._headerDict['edge weights']:
                    self._headerDict['edge weights'] = True
                self._getValInCorrectType(edgeSpec.split('=')[1], 'edge weight')

    return GtrackGenomeElementSource._parseEdges(self, edgeStr)
def __init__(self, geSource, fullDbDict, gtrackColsToAdd):
    '''
    Set up the wrapper around geSource.

    The GTrack column names in gtrackColsToAdd are converted with
    GtrackGenomeElementSource.convertNameFromGtrack ('weights' is added
    alongside 'edges') and stored as the prefixes to add; fullDbDict is kept
    for later lookup.
    '''
    toPrefix = GtrackGenomeElementSource.convertNameFromGtrack
    self._prefixesToAdd = [toPrefix(colName) for colName in gtrackColsToAdd]

    hasEdges = 'edges' in self._prefixesToAdd
    if hasEdges:
        self._prefixesToAdd.append('weights')

    ElementModifierGESourceWrapper.__init__(self, geSource)

    self._fullDbDict = fullDbDict
    sourcePrefixes = geSource.getPrefixList()
    self._prefixList = sourcePrefixes + self._prefixesToAdd
def _parseEdges(self, edgeStr):
    '''
    Register the edge weights found in edgeStr, then parse it with the base
    class.

    Every ';'-separated edge specification containing '=' switches on the
    'edge weights' header and has the text after the first '=' passed to
    _getValInCorrectType as an 'edge weight'. The string '.' is passed on
    untouched.
    '''
    edgeSpecs = edgeStr.split(';') if edgeStr != '.' else []

    for spec in edgeSpecs:
        if '=' not in spec:
            continue
        if not self._headerDict['edge weights']:
            self._headerDict['edge weights'] = True
        weightStr = spec.split('=')[1]
        self._getValInCorrectType(weightStr, 'edge weight')

    return GtrackGenomeElementSource._parseEdges(self, edgeStr)
def _iter(self):
    '''
    Reset the per-iteration bookkeeping and header assumptions, then start
    the base class iteration.

    'sorted elements' starts out True, and 'undirected edges' starts out True
    for track types beginning with 'linked'.
    '''
    for stateAttr in ('_valTypeIndexDict', '_valLenDict', '_allMissingDict'):
        setattr(self, stateAttr, {})

    headers = self._headerDict
    headers['sorted elements'] = True
    if headers['track type'].startswith('linked'):
        headers['undirected edges'] = True

    return GtrackGenomeElementSource._iter(self)
def _iter(self):
    '''
    Begin a new iteration: clear the value type, value length and
    all-missing dicts, set 'sorted elements' to True, set 'undirected edges'
    to True for linked track types, and delegate to the base class.
    '''
    self._valTypeIndexDict, self._valLenDict, self._allMissingDict = {}, {}, {}

    self._headerDict['sorted elements'] = True

    isLinked = self._headerDict['track type'].startswith('linked')
    if isLinked:
        self._headerDict['undirected edges'] = True

    return GtrackGenomeElementSource._iter(self)
def getOptionsBoxTrackType(prevChoices):
    '''
    Return the contents of the track type box: a tuple (text, 1, True) where
    text is the capitalised track type followed by its abbreviation in
    parentheses.

    Returns None when the columns are based on a GTrack file that is not yet
    selected, when no input is given, or when no track type can be derived
    from the headers.
    '''
    basedOnGtrack = \
        prevChoices.columnSelection == 'Base columns on existing GTrack file'
    if basedOnGtrack and not prevChoices.colSpecFile:
        return

    if not (prevChoices.history or prevChoices.input):
        return

    headers = TabularToGtrackTool._getHeaders(prevChoices)
    trackType = GtrackGenomeElementSource.getTrackTypeFromColumnSpec(headers)
    if trackType is None:
        return

    words = []
    initials = []
    for word in trackType.split():
        capitalized = word.capitalize()
        words.append(capitalized)
        initials.append(capitalized[0])

    fullTrackType = ' '.join(words) + ' (%s)' % ''.join(initials)
    return (fullTrackType, 1, True)
def _getGtrackValueDim(self, val, valTypeInfo, valueOrEdgeWeight):
    '''
    Track the length of the values seen so far and return the corresponding
    GTrack value dimension.

    The length is the number of delimiter-separated parts of val, or the
    number of characters when the value type has an empty delimiter. The
    first length seen is stored per valueOrEdgeWeight; once a differing
    length turns up, 0 is stored instead.
    '''
    delim = valTypeInfo.delim
    if delim != '':
        valLen = len(val.split(delim))
    else:
        valLen = len(val)

    lenDict = self._valLenDict
    if valueOrEdgeWeight not in lenDict:
        lenDict[valueOrEdgeWeight] = valLen
    elif lenDict[valueOrEdgeWeight] != valLen:
        lenDict[valueOrEdgeWeight] = 0

    return GtrackGenomeElementSource.getGtrackValueDimension(
        lenDict[valueOrEdgeWeight])
def _composeBoundingRegionLine(self, boundingRegionTuple):
    '''
    Return the '####' bounding region line for boundingRegionTuple.

    Works on a copy of the region. Start and end are shifted by one when the
    '1-indexed' header is set, and the end is moved back by one when 'end
    inclusive' is set. Attributes that are None are left out of the line.
    '''
    def shifted(coord, delta):
        # None coordinates stay None.
        return None if coord is None else coord + delta

    region = copy(boundingRegionTuple.region)

    if self._headerDict['1-indexed']:
        region.start = shifted(region.start, 1)
        region.end = shifted(region.end, 1)
    if self._headerDict['end inclusive']:
        region.end = shifted(region.end, -1)

    brLinePartList = []
    for attr in ['genome', 'chr', 'start', 'end']:
        brLinePartList.append((Gtrack.convertNameToGtrack(attr),
                               getattr(region, attr)))

    assignments = []
    for key, value in brLinePartList:
        if value is None:
            continue
        formatted = self._formatPhraseWithCorrectChrUsage(
            str(value), useUrlEncoding=True, notAllowedChars='=;#\t')
        assignments.append(key + '=' + formatted)

    return '####' + '; '.join(assignments) + os.linesep
def _handleEndOfFile(self): GtrackGenomeElementSource._handleEndOfFile(self) #To fix an issue where value dimension is "list" if the value type was wrongly #guessed for early elements. newIter = self.__iter__() newIter._valTypeIndexDict = self._valTypeIndexDict newIter._handleEndOfFile = newIter._basicHandleEndOfFile try: while True: newIter.next() except StopIteration: pass self._valLenDict = newIter._valLenDict if len(self._uniqueEdgeIds) == 0: self._headerDict['undirected edges'] = False for valueOrEdgeWeight in ['value', 'edge weight']: if valueOrEdgeWeight in newIter._allMissingDict and newIter._allMissingDict[valueOrEdgeWeight] == True: self._headerDict['%s type' % valueOrEdgeWeight] = 'number'
def _getGtrackValueDim(self, val, valTypeInfo, valueOrEdgeWeight):
    '''
    Return the GTrack value dimension implied by the value lengths seen so
    far for valueOrEdgeWeight.

    A value's length is its number of delimiter-separated parts, or its
    character count when the delimiter is empty. The stored length becomes 0
    as soon as two values of different lengths have been seen.
    '''
    if valTypeInfo.delim != '':
        parts = val.split(valTypeInfo.delim)
    else:
        parts = val
    curLen = len(parts)

    if valueOrEdgeWeight in self._valLenDict:
        sameLength = self._valLenDict[valueOrEdgeWeight] == curLen
        if not sameLength:
            self._valLenDict[valueOrEdgeWeight] = 0
    else:
        self._valLenDict[valueOrEdgeWeight] = curLen

    storedLen = self._valLenDict[valueOrEdgeWeight]
    return GtrackGenomeElementSource.getGtrackValueDimension(storedLen)
def _composeBoundingRegionLine(self, boundingRegionTuple):
    '''
    Compose the '####' bounding region line for boundingRegionTuple.

    A copy of the region is adjusted to the coordinate convention given by
    the '1-indexed' and 'end inclusive' headers. The line lists the genome,
    chr, start and end attributes that are not None as 'name=value' pairs
    separated by '; '.
    '''
    region = boundingRegionTuple.region.getCopy()

    oneIndexed = self._headerDict['1-indexed']
    if oneIndexed:
        if region.start is not None:
            region.start = region.start + 1
        else:
            region.start = None
        if region.end is not None:
            region.end = region.end + 1
        else:
            region.end = None

    endInclusive = self._headerDict['end inclusive']
    if endInclusive:
        if region.end is not None:
            region.end = region.end - 1
        else:
            region.end = None

    namedValues = [(Gtrack.convertNameToGtrack(attrName),
                    getattr(region, attrName))
                   for attrName in ['genome', 'chr', 'start', 'end']]

    pairs = []
    for name, value in namedValues:
        if value is not None:
            pairs.append(name + '=' + self._formatPhraseWithCorrectChrUsage(
                str(value), useUrlEncoding=True, notAllowedChars='=;#\t'))

    return '####' + '; '.join(pairs) + os.linesep
def _handleEndOfFile(self): GtrackGenomeElementSource._handleEndOfFile(self) #To fix an issue where value dimension is "list" if the value type was wrongly #guessed for early elements. newIter = self.__iter__() newIter._valTypeIndexDict = self._valTypeIndexDict newIter._handleEndOfFile = newIter._basicHandleEndOfFile try: while True: newIter.next() except StopIteration: pass self._valLenDict = newIter._valLenDict if len(self._uniqueEdgeIds) == 0: self._headerDict['undirected edges'] = False for valueOrEdgeWeight in ['value', 'edge weight']: if valueOrEdgeWeight in newIter._allMissingDict and newIter._allMissingDict[ valueOrEdgeWeight] == True: self._headerDict['%s type' % valueOrEdgeWeight] = 'number'
def getOptionsBoxTrackType(prevChoices):
    '''
    Return the contents of the track type box as (text, 1, True), where text
    is the capitalised track type followed by its abbreviation in
    parentheses.

    The track type is derived from the chosen headers, with 'start' dropped
    when a dense track is to be created and 'edges' added when 3D data is to
    be created. Returns None when the GTrack file to base the columns on is
    not selected, when there is no input, or when no track type results.
    '''
    needsColSpecFile = \
        prevChoices.columnSelection == 'Base columns on existing GTrack file'
    if needsColSpecFile and not prevChoices.colSpecFile:
        return

    if not (prevChoices.history or prevChoices.input):
        return

    headers = set(TabularToGtrackTool._getHeaders(prevChoices))

    if prevChoices.createDense == 'Yes' and 'start' in headers:
        headers.remove('start')

    if 'edges' not in headers:
        if TabularToGtrackTool._create3dData(prevChoices):
            headers.add('edges')

    trackType = GtrackGenomeElementSource.getTrackTypeFromColumnSpec(headers)
    if trackType is None:
        return

    words = [word.capitalize() for word in trackType.split()]
    abbrv = ''.join([word[0] for word in words])
    return (' '.join(words) + ' (%s)' % abbrv, 1, True)
def getGeSource(track, genome=None):
    '''
    Return a genome element source for track.

    track is a track name, either as a list or as a ':'-separated string. It
    is first treated as a Galaxy history element and wrapped according to
    its file suffix ('category.bed', 'gtrack', or BED for anything else). If
    that fails, the track is loaded from the track repository for the given
    genome with allowOverlaps=False.
    '''
    from quick.application.ExternalTrackManager import ExternalTrackManager
    from gold.origdata.BedGenomeElementSource import BedGenomeElementSource, BedCategoryGenomeElementSource
    from gold.origdata.GtrackGenomeElementSource import GtrackGenomeElementSource
    from gold.origdata.TrackGenomeElementSource import FullTrackGenomeElementSource

    if isinstance(track, basestring):
        track = track.split(':')

    try:
        fileType = ExternalTrackManager.extractFileSuffixFromGalaxyTN(track)
        fn = ExternalTrackManager.extractFnFromGalaxyTN(track)
        if fileType == 'category.bed':
            return BedCategoryGenomeElementSource(fn)
        elif fileType == 'gtrack':
            return GtrackGenomeElementSource(fn)
        else:
            return BedGenomeElementSource(fn)
    except Exception:
        # Narrowed from a bare except so that KeyboardInterrupt and
        # SystemExit are no longer turned into a repository lookup.
        return FullTrackGenomeElementSource(genome, track, allowOverlaps=False)
def _getValInCorrectType(self, val, valueOrEdgeWeight='value', isEmptyElement=False):
    '''
    Detect the value type of val, record it, and return the value as
    converted by the base class.

    The types binary, number, category and character are tried in that
    order, skipping those ordered before the type recorded earlier for
    valueOrEdgeWeight. The type and dimension headers are only updated when
    they are not defined in the file itself.

    Raises ShouldNotOccurError if val fits none of the types.
    '''
    valTypeList = ['binary', 'number', 'category', 'character']

    for typeIdx, valueType in enumerate(valTypeList):
        earlierIdx = self._valTypeIndexDict[valueOrEdgeWeight] \
            if valueOrEdgeWeight in self._valTypeIndexDict else None
        if earlierIdx is not None and earlierIdx > typeIdx:
            continue

        valTypeInfo = GtrackGenomeElementSource.VAL_TYPE_DICT[valueType]
        if not self._isValOfParticularType(val, valTypeInfo):
            continue

        self._noteIfAllValuesAreMissing(valueOrEdgeWeight, val, valTypeInfo)
        self._valTypeIndexDict[valueOrEdgeWeight] = typeIdx
        valueDim = self._getGtrackValueDim(val, valTypeInfo,
                                           valueOrEdgeWeight)

        detected = (('type', valueType), ('dimension', valueDim))
        for suffix, headerVal in detected:
            headerKey = '%s %s' % (valueOrEdgeWeight, suffix)
            if headerKey not in self.getHeaderDictInFile():
                self._headerDict[headerKey] = headerVal

        return GtrackGenomeElementSource._getValInCorrectType(
            self, val, valueOrEdgeWeight, isEmptyElement)

    raise ShouldNotOccurError()
def execute(cls, choices, galaxyFn=None, username=''): '''Is called when execute-button is pushed by web-user. Should print output as HTML to standard out, which will be directed to a results page in Galaxy history. If getOutputFormat is anything else than HTML, the output should be written to the file with path galaxyFn.gtr If needed, StaticFile can be used to get a path where additional files can be put (e.g. generated image files). choices is a list of selections made by web-user in each options box. ''' fnSource = ExternalTrackManager.extractFnFromGalaxyTN( choices[2].split(':')) core = HtmlCore() core.begin() valid = False try: core.header('Validating GTrack headers') core.styleInfoBegin(styleClass='debug') print str(core) core = HtmlCore() gtrackSource = GtrackGenomeElementSource( fnSource, choices[1] if choices[0] == 'Yes' else None, printWarnings=True) core.append('Done') core.styleInfoEnd() core.header('Validating complete GTrack file') core.styleInfoBegin(styleClass='debug') print str(core) core = HtmlCore() try: for ge in gtrackSource: pass except Exception, e: raise else:
def execute(cls, choices, galaxyFn=None, username=''):
    '''
    Write the expanded elements of the selected track to galaxyFn.

    choices[0] is the genome and choices[1] tells whether the track comes
    from history. A history track (choices[2]) is copied to a run-specific
    temporary file, sorted and written in one go; for gtrack input the
    header lines are passed along. A repository track (choices[3]) is
    written chromosome by chromosome, with the header requested only for the
    first chromosome.

    The output file is closed and the temporary file removed even if an
    error occurs on the way.
    '''
    outputFile = open(galaxyFn, 'w')
    try:
        genome = choices[0]
        trackItem = choices[3]
        chrSizeDict = dict([(chrom, GenomeInfo.getChrLen(genome, chrom))
                            for chrom in GenomeInfo.getChrList(genome)])
        geSource = headLinesStr = None

        if choices[1] == 'history':
            trackType = choices[2].split(':')[1]
            from proto.hyperbrowser.StaticFile import GalaxyRunSpecificFile
            tempFn = GalaxyRunSpecificFile(['fromHistory.' + trackType], galaxyFn).getDiskPath(True)
            fnSource = ExternalTrackManager.extractFnFromGalaxyTN(choices[2].split(':'))

            sourceFile = open(fnSource, 'r')
            try:
                contents = sourceFile.read()
            finally:
                sourceFile.close()

            tempFile = open(tempFn, 'w')
            try:
                try:
                    tempFile.write(contents)
                finally:
                    tempFile.close()

                if trackType in ['valued.bed', 'category.bed', 'bed']:
                    geSource = GenomeElementSorter(BedGenomeElementSource(tempFn, genome=genome)).__iter__()
                elif trackType == 'gtrack':
                    geSource = GenomeElementSorter(GtrackGenomeElementSource(tempFn, genome=genome)).__iter__()
                    headLinesStr = geSource.getHeaderLines().replace('##', '\n##')

                cls.WriteExpandedElementsToFile(geSource, chrSizeDict, outputFile, headLinesStr, writeHeaderFlag=True)
            finally:
                # The temporary copy is only needed while writing.
                os.remove(tempFn)
        else:
            writeHeaderFlag = True
            for chrom in GenomeInfo.getChrList(genome):
                gRegion = GenomeRegion(genome, chrom, 0, chrSizeDict[chrom])
                plTrack = PlainTrack(trackItem.split(':'))
                geSource = GenomeElementTvWrapper(plTrack.getTrackView(gRegion)).__iter__()
                cls.WriteExpandedElementsToFile(geSource, chrSizeDict, outputFile, headLinesStr, writeHeaderFlag)
                writeHeaderFlag = False
    finally:
        outputFile.close()
def getOptionsBox6(prevChoices):
    '''
    Return an OrderedDict with one entry, set to False, for every column of
    the database GTrack file (prevChoices[3]) that the source GTrack file
    (prevChoices[2]) lacks. Entries follow the database file's column order.

    Returns None unless both files are selected, in line with the guard used
    by getOptionsBox5.
    '''
    if not (prevChoices[2] and prevChoices[3]):
        return None

    fnSource = ExternalTrackManager.extractFnFromGalaxyTN(
        prevChoices[2].split(':'))
    fnDB = ExternalTrackManager.extractFnFromGalaxyTN(
        prevChoices[3].split(':'))

    gtrackDB = GtrackGenomeElementSource(fnDB)
    gtrackSource = GtrackGenomeElementSource(fnSource)

    # Read the source columns once rather than for every database column.
    sourceColumns = gtrackSource.getColumns()
    extraDbColumns = [
        v for v in gtrackDB.getColumns() if v not in sourceColumns
    ]

    extraDbColumnsDict = OrderedDict()
    for column in extraDbColumns:
        extraDbColumnsDict[column] = False
    return extraDbColumnsDict
def _determineIfFileCompliesToSubtypes(self, hbColumns, columns):
    '''
    Try to fit the contents to one of the GTrack subtypes.

    The candidates are the forced 'subtype url' (none if it is the empty
    string) or else GTRACK_PRIORITIZED_SUBTYPE_LIST, tried in order. For each
    candidate the columns are rearranged as its 'subtype adherence' allows,
    a single data line is composed into a temporary GTrack file, and that
    file is checked against the subtype. 'redefinable' subtypes get two
    attempts, one per order of the redefined 'value' and 'edges' columns.

    Returns (hbColumns, columns, complies). On a match the rearranged column
    lists are returned with True and the non-default headers of the test
    file are copied to this composer. Otherwise the column lists are
    returned unchanged with False.
    '''
    if 'subtype url' in self._forcedHeaderDict:
        subtypeUrlList = [self._forcedHeaderDict['subtype url']] \
            if self._forcedHeaderDict['subtype url'] != '' else []
    else:
        subtypeUrlList = self.GTRACK_PRIORITIZED_SUBTYPE_LIST

    for subtypeUrl in subtypeUrlList:
        subtypeGESource = Gtrack.getSubtypeGESource(subtypeUrl)
        subtypeColumns = subtypeGESource.getColumns(orig=False)
        subtypeHeaders = subtypeGESource.getHeaderDict()

        numRepeats = 2 if subtypeHeaders['subtype adherence'] == 'redefinable' else 1

        for repeat in range(numRepeats):
            # The test file is composed with the subtype's coordinate
            # convention.
            self._setHeaderDict('1-indexed', subtypeHeaders['1-indexed'])
            self._setHeaderDict('end inclusive', subtypeHeaders['end inclusive'])

            if subtypeHeaders['subtype adherence'] in ['reorderable', 'free']:
                rearrangedColumns = columns
                rearrangedHbColumns = hbColumns
            else:
                colSet = set(columns)
                subtypeColSet = set(subtypeColumns)

                if subtypeHeaders['subtype adherence'] == 'redefinable':
                    colsRemoved = list(subtypeColSet - colSet)
                    colsAdded = list(colSet - subtypeColSet)
                    if len(colsRemoved) != len(colsAdded) or len(colsRemoved) > 2:
                        continue

                    colsRedefinedTo = ['value', 'edges'] if repeat == 1 else ['edges', 'value']

                    rearrangedColumns = []
                    i, j = (0, 0)
                    for col in subtypeColumns:
                        if col in colsRemoved:
                            rearrangedColumns.append(colsRedefinedTo[i])
                            i += 1
                        elif col in colsRedefinedTo:
                            rearrangedColumns.append(colsAdded[j])
                            j += 1
                        else:
                            rearrangedColumns.append(col)
                    # Append the added columns that were not placed above.
                    for col in columns:
                        if col in colsAdded[j:]:
                            rearrangedColumns.append(col)
                else:
                    # Subtype columns first, in subtype order, followed by
                    # the remaining columns.
                    rearrangedColumns = [x for x in subtypeColumns if x in colSet] + \
                                        [x for x in columns if x not in subtypeColSet]
                rearrangedHbColumns = self._getHbColumnsFromGtrackColumns(rearrangedColumns)

            try:
                tempFile = StringIO()
                self._composeContents(tempFile, rearrangedHbColumns, rearrangedColumns,
                                      deepcopy(self._geSource), onlyNonDefault=True, singleDataLine=True)
                gtrackGESource = Gtrack('subtypeTest.' + self.getDefaultFileNameSuffix(), printWarnings=False,
                                        strToUseInsteadOfFn=tempFile.getvalue())
                tempFile.close()

                if gtrackGESource.compliesWithSubtype(subtypeUrl):
                    gtrackGESource._headerDict['subtype url'] = subtypeUrl
                    gtrackGESource._updateHeadersAccordingToSubtype()
                    updatedHeaders = OrderedDict([(key, val) for key, val in gtrackGESource.getHeaderDict().iteritems()
                                                  if val != Gtrack.DEFAULT_HEADER_DICT.get(key)])
                    for header in updatedHeaders:
                        self._setHeaderDict(header, updatedHeaders[header])
                    return rearrangedHbColumns, rearrangedColumns, True
            except Exception:
                # Any failure while composing or parsing the test file means
                # this attempt does not comply; try the next one.
                continue

    # No subtype matched. The caller unpacks three values, so falling off the
    # end (returning None) would raise a TypeError there.
    return hbColumns, columns, False
def _getHbColumnsFromGtrackColumns(self, columns):
    '''
    Return the list obtained by passing each entry of columns through
    Gtrack.convertNameFromGtrack, in the same order.
    '''
    hbColumns = []
    for gtrackCol in columns:
        hbColumns.append(Gtrack.convertNameFromGtrack(gtrackCol))
    return hbColumns
def _determineIfFileCompliesToSubtypes(self, hbColumns, columns):
    '''
    Check whether the contents can be written as one of the GTrack subtypes.

    Candidate subtypes are the forced 'subtype url' (none if it is '') or
    else GTRACK_PRIORITIZED_SUBTYPE_LIST, in order. For each one the columns
    are rearranged according to its 'subtype adherence', a one-line test
    file is composed, and the test file is checked for compliance.
    'redefinable' subtypes are tried twice, once per order of the redefined
    'value' and 'edges' columns.

    Returns (hbColumns, columns, complies): the rearranged column lists and
    True for the first complying subtype (whose non-default headers are
    copied to this composer), or the unchanged column lists and False when
    none complies.
    '''
    if 'subtype url' in self._forcedHeaderDict:
        subtypeUrlList = [self._forcedHeaderDict['subtype url']] \
            if self._forcedHeaderDict['subtype url'] != '' else []
    else:
        subtypeUrlList = self.GTRACK_PRIORITIZED_SUBTYPE_LIST

    for subtypeUrl in subtypeUrlList:
        subtypeGESource = Gtrack.getSubtypeGESource(subtypeUrl)
        subtypeColumns = subtypeGESource.getColumns(orig=False)
        subtypeHeaders = subtypeGESource.getHeaderDict()

        numRepeats = 2 if subtypeHeaders['subtype adherence'] == 'redefinable' else 1

        for repeat in range(numRepeats):
            # Compose the test file using the subtype's coordinate
            # convention.
            self._setHeaderDict('1-indexed', subtypeHeaders['1-indexed'])
            self._setHeaderDict('end inclusive', subtypeHeaders['end inclusive'])

            if subtypeHeaders['subtype adherence'] in ['reorderable', 'free']:
                rearrangedColumns = columns
                rearrangedHbColumns = hbColumns
            else:
                colSet = set(columns)
                subtypeColSet = set(subtypeColumns)

                if subtypeHeaders['subtype adherence'] == 'redefinable':
                    colsRemoved = list(subtypeColSet - colSet)
                    colsAdded = list(colSet - subtypeColSet)
                    if len(colsRemoved) != len(colsAdded) or len(colsRemoved) > 2:
                        continue

                    colsRedefinedTo = ['value', 'edges'] if repeat == 1 else ['edges', 'value']

                    rearrangedColumns = []
                    i, j = (0, 0)
                    for col in subtypeColumns:
                        if col in colsRemoved:
                            rearrangedColumns.append(colsRedefinedTo[i])
                            i += 1
                        elif col in colsRedefinedTo:
                            rearrangedColumns.append(colsAdded[j])
                            j += 1
                        else:
                            rearrangedColumns.append(col)
                    # Added columns not consumed above go last.
                    for col in columns:
                        if col in colsAdded[j:]:
                            rearrangedColumns.append(col)
                else:
                    # Columns shared with the subtype come first, in subtype
                    # order; the rest follow.
                    rearrangedColumns = [x for x in subtypeColumns if x in colSet] + \
                                        [x for x in columns if x not in subtypeColSet]
                rearrangedHbColumns = self._getHbColumnsFromGtrackColumns(rearrangedColumns)

            try:
                tempFile = StringIO()
                self._composeContents(tempFile, rearrangedHbColumns, rearrangedColumns,
                                      deepcopy(self._geSource), onlyNonDefault=True, singleDataLine=True)
                gtrackGESource = Gtrack('subtypeTest.' + self.getDefaultFileNameSuffix(), printWarnings=False,
                                        strToUseInsteadOfFn=tempFile.getvalue())
                tempFile.close()

                if gtrackGESource.compliesWithSubtype(subtypeUrl):
                    gtrackGESource._headerDict['subtype url'] = subtypeUrl
                    gtrackGESource._updateHeadersAccordingToSubtype()
                    updatedHeaders = OrderedDict([(key, val) for key, val in gtrackGESource.getHeaderDict().iteritems()
                                                  if val != Gtrack.DEFAULT_HEADER_DICT.get(key)])
                    for header in updatedHeaders:
                        self._setHeaderDict(header, updatedHeaders[header])
                    return rearrangedHbColumns, rearrangedColumns, True
            except Exception:
                # A failure to compose or parse the test file counts as
                # non-compliance for this attempt.
                continue

    # Explicit "no match" result. Without it the function returned None,
    # which the caller cannot unpack into three values.
    return hbColumns, columns, False
def execute(cls, choices, galaxyFn=None, username=''):
    '''Is called when execute-button is pushed by web-user.

    Converts the tabular input (a history element or pasted text) into a
    GTrack file written to galaxyFn. Error messages are printed to standard
    error.

    Processing outline:
      * Column names come from cls._getHeaders(choices); columns whose name
        is blanked ('') are not written to the output.
      * With dense output and/or 3D (linked) data the input is read twice:
        the 'pre' pass copies the lines to a temporary file which is sorted
        with the external `sort` command, and the 'final' pass reads the
        sorted file and writes the data lines.
      * For 3D data the columns 'id' and 'edges' are generated from the
        'linked_*' input columns.
      * The assembled contents are passed through
        expandHeadersOfGtrackFileAndReturnComposer and the result is iterated
        once with GtrackGenomeElementSource.

    choices is the selections made by the web-user in each options box.
    Any exception is printed to standard error and re-raised.
    '''
    try:
        if choices.history:
            inputFile = open(ExternalTrackManager.extractFnFromGalaxyTN(choices.history.split(':')), 'r')
        else:
            inputFile = StringIO(choices.input)

        headers = cls._getHeaders(choices)

        # Column name -> column index in the input file.
        headerIdxs = {}
        for i, header in enumerate(headers):
            headerIdxs[header] = i

        createDense = choices.createDense == 'Yes'
        if createDense:
            firstRegInBlock = None
            curReg = None
            assert headerIdxs['seqid'] is not None
            assert headerIdxs['start'] is not None
            # Blanked headers are excluded from colIndexes below.
            headers[headerIdxs['start']] = ''

        create3dData = cls._create3dData(choices)
        if create3dData:
            if any(x in headers for x in ['id', 'edges']):
                print >> sys.stderr, "Error: when using the special 3D input columns 'linked_seqid' and " + \
                                     "'linked_start', the columns 'id' and 'edges' must not " + \
                                     "be specified in addition."
                return

            for header in ['linked_seqid', 'linked_start', 'linked_end', 'link_weight']:
                if header in headerIdxs:
                    headers[headerIdxs[header]] = ''

            # The generated columns are appended after the input columns.
            for header in ['id', 'edges']:
                headerIdxs[header] = len(headers)
                headers += [header]

            regs = []
            regIdx = 0
            prevRegIdx = 0
            idDict = {}
            curCols = None
            prevLine = ''
            firstRegInBlock = None
            curReg = None
            prev3dReg = None

        if createDense or create3dData:
            newInputFile = NamedTemporaryFile()
            sortedInputFile = NamedTemporaryFile()

        colIndexes = [i for i, header in enumerate(headers) if header != '']
        numSkipLines = cls._getNumSkipLines(choices)

        tempContents = NamedTemporaryFile()
        tempDataLines = NamedTemporaryFile()

        if choices.indexing == '1-indexed, end inclusive':
            tempContents.write('##1-indexed: true' + os.linesep)
            tempContents.write('##end inclusive: true' + os.linesep)
        tempContents.write('###' + '\t'.join([headers[i] for i in colIndexes]) + os.linesep)

        for passType in ['pre','final'] if createDense or create3dData else ['final']:
            for i in xrange(numSkipLines):
                inputFile.readline()

            splitChar = cls._getSplitChar(choices)
            numCols = cls._getFileContentsInfo(choices).numCols
            autoCorrectSeqId = choices.handleSeqId == 'Yes, auto-correct to the best match in the genome build'
            cropCrossingSegments = choices.cropCrossingSegments == 'Yes'
            genome = choices.genome

            for i, line in enumerate(inputFile):
                if line == '' or len(line) > 0 and line[0] == '#':
                    # NOTE(review): this branch is a no-op, so empty and
                    # '#'-prefixed lines are still processed below. Confirm
                    # whether `continue` was intended.
                    pass

                cols = [x.strip() for x in line.strip().split(splitChar)]
                if create3dData:
                    # Room for the generated 'id' and 'edges' columns.
                    cols += ['', '']

                for j in colIndexes:
                    if len(cols) <= j:
                        print >> sys.stderr, "Error in line #%s: %s" % (i+1, line)
                        print >> sys.stderr, "The line does not include the column #%s, which is defined with " \
                                             "the name '%s' (the number of columns is %s). Please fix the input " \
                                             "file or redefine the column names of this column." \
                                             % (j+1, headers[j], len(cols))
                        return

                if autoCorrectSeqId:
                    from quick.util.GenomeInfo import GenomeInfo
                    cols[headerIdxs['seqid']] = GenomeInfo.findBestMatchingChr(genome, cols[headerIdxs['seqid']])

                # Empty cells become '.', all others are passed through
                # formatPhraseWithCorrectChrUsage.
                for j, col in enumerate(cols):
                    if col == '':
                        cols[j] = '.'
                    else:
                        cols[j] = formatPhraseWithCorrectChrUsage(col, notAllowedChars='#\t')

                if cropCrossingSegments:
                    from quick.util.GenomeInfo import GenomeInfo
                    for seqidHdr, startHdr, endHdr in [('seqid','start','end')] \
                            + ([('linked_seqid','linked_start','linked_end')] if create3dData else []):
                        if endHdr in headerIdxs:
                            seqid = cols[headerIdxs[seqidHdr]]
                            start = cols[headerIdxs[startHdr]]
                            end = cols[headerIdxs[endHdr]]
                            if not any(x == '.' for x in [seqid, start, end]):
                                start, end = int(start), int(end)
                                if choices.indexing == '1-indexed, end inclusive':
                                    start -= 1
                                chrLen = GenomeInfo().getChrLen(genome, seqid)
                                # Only segments that start inside the sequence
                                # and end beyond it are cropped.
                                if start < chrLen and end > chrLen:
                                    cols[headerIdxs[endHdr]] = str(chrLen)

                if createDense or create3dData:
                    prevReg = curReg
                    # Membership test instead of headerIdxs.get('end'): the
                    # latter is falsy when 'end' is the first column (index
                    # 0), which dropped the end coordinate.
                    curReg = cls._getGenomeRegion(cols[headerIdxs['seqid']], cols[headerIdxs['start']], \
                                                  cols[headerIdxs['end']] if 'end' in headerIdxs else None)

                    if passType == 'pre':
                        newInputFile.write(line.strip() + os.linesep)

                        if create3dData:
                            id = curReg.strShort()
                            if id not in idDict:
                                regs.append(curReg)
                                idDict[id] = ''

                            linkedReg = cls._getGenomeRegion(cols[headerIdxs['linked_seqid']], cols[headerIdxs['linked_start']], \
                                                             cols[headerIdxs['linked_end']] if 'end' in headerIdxs else None)

                            if choices.undirected == 'Yes' and linkedReg and linkedReg != curReg:
                                id = linkedReg.strShort()
                                if id not in idDict:
                                    regs.append(linkedReg)
                                    idDict[id] = ''

                                # Swap the region and its link, then write the
                                # reversed line as an extra input line.
                                cols[headerIdxs['seqid']], cols[headerIdxs['linked_seqid']] = \
                                    cols[headerIdxs['linked_seqid']], cols[headerIdxs['seqid']]
                                cols[headerIdxs['start']], cols[headerIdxs['linked_start']] = \
                                    cols[headerIdxs['linked_start']], cols[headerIdxs['start']]
                                if 'end' in headerIdxs:
                                    cols[headerIdxs['end']], cols[headerIdxs['linked_end']] = \
                                        cols[headerIdxs['linked_end']], cols[headerIdxs['end']]
                                # cols[:-2] leaves out the two generated columns.
                                newInputFile.write(splitChar.join(cols[:-2]) + os.linesep)

                    else: #passType == 'final':
                        if firstRegInBlock is None:
                            firstRegInBlock = curReg

                        if create3dData:
                            if curReg != prevReg:
                                # A new region starts: keep the finished
                                # region's columns and start a fresh line.
                                prevCols = curCols
                                prevRegIdx = regIdx
                                regIdx = 0
                                id = curReg.strShort()
                                curCols = copy(cols)
                                curCols[headerIdxs['id']] = idDict[id] if choices.idGeneration == 'Counting' else id
                                curCols[headerIdxs['edges']] = ''

                            linkedReg = cls._getGenomeRegion(cols[headerIdxs['linked_seqid']], cols[headerIdxs['linked_start']], \
                                                             cols[headerIdxs['linked_end']] if 'end' in headerIdxs else None)
                            if linkedReg:
                                edges = curCols[headerIdxs['edges']]
                                if edges != '':
                                    edges += ';'
                                id = linkedReg.strShort()
                                if id not in idDict:
                                    raise InvalidFormatError("Error: linked region '%s' is not present in tabular file. Line: %s" % (linkedReg, line))

                                if choices.complete == 'Yes':
                                    # Emit 'id=.' for every region that comes
                                    # before the linked one in regs.
                                    while regIdx < len(regs) and regs[regIdx] != linkedReg:
                                        missingId = regs[regIdx].strShort()
                                        edges += '%s=.;' % (idDict[missingId] if choices.idGeneration == 'Counting' else missingId)
                                        regIdx += 1

                                edges += idDict[id] if choices.idGeneration == 'Counting' else id
                                if 'link_weight' in headerIdxs:
                                    edges += '=' + formatPhraseWithCorrectChrUsage(
                                        cols[headerIdxs['link_weight']], notAllowedChars='#\t')
                                regIdx += 1
                                curCols[headerIdxs['edges']] = edges

                            if curReg != prevReg and prevCols:
                                if choices.complete == 'Yes':
                                    for i in xrange(prevRegIdx, len(regs)):
                                        missingId = regs[i].strShort()
                                        if i != 0:
                                            prevCols[headerIdxs['edges']] += ';'
                                        prevCols[headerIdxs['edges']] += '%s=.' % (idDict[missingId] if choices.idGeneration == 'Counting' else missingId)

                                if prevCols[headerIdxs['edges']] == '':
                                    prevCols[headerIdxs['edges']] = '.'

                                cls._checkOverlap(prev3dReg, prevReg, prevLine)
                                if createDense:
                                    firstRegInBlock, tempDataLines = cls._writeBlockLines \
                                        (firstRegInBlock, prev3dReg, prevReg, tempContents, tempDataLines)
                                cls._writeDataLines(prevCols, colIndexes, tempDataLines)

                                prev3dReg = prevReg
                                prevLine = line
                        else: #createDense
                            cls._checkOverlap(prevReg, curReg, line)
                            firstRegInBlock, tempDataLines = cls._writeBlockLines \
                                (firstRegInBlock, prevReg, curReg, tempContents, tempDataLines)
                            cls._writeDataLines(cols, colIndexes, tempDataLines)
                else:
                    cls._writeDataLines(cols, colIndexes, tempDataLines)

            if passType == 'pre':
                newInputFile.flush()
                inputFile.close()

                # Stable sort on seqid, start, end and the linked_* columns
                # (those that are defined), numeric for the coordinates.
                # NOTE(review): the command is joined into one string and run
                # through the shell; splitChar and the temp file names are
                # interpolated unquoted.
                sortCmd = ["sort", newInputFile.name, "-t$'%s'" % splitChar, "-s"] +\
                          ["-k%s,%s%s" % (headerIdxs[x]+1, headerIdxs[x]+1, s) if x in headerIdxs else "" \
                           for x,s in [('seqid',''), ('start','n'), ('end','n'), \
                                       ('linked_seqid',''), ('linked_start','n'), ('linked_end','n')]] +\
                          ["-o", sortedInputFile.name]
                subprocess.call(' '.join(sortCmd), stderr=sys.stderr, stdout = sys.stdout, shell=True)

                newInputFile.close()

                if create3dData:
                    # Counting ids follow the sorted order of the regions.
                    regs = sorted(regs)
                    for i,reg in enumerate(regs):
                        idDict[reg.strShort()] = str(i)

                inputFile = sortedInputFile
                inputFile.seek(0)
                numSkipLines = 0
                curReg = None
            else: #passType == 'final':
                if create3dData:
                    # Flush the last region, which the loop above only writes
                    # when the next region starts.
                    if choices.complete == 'Yes':
                        for i in xrange(regIdx, len(regs)):
                            missingId = regs[i].strShort()
                            if i != 0:
                                curCols[headerIdxs['edges']] += ';'
                            curCols[headerIdxs['edges']] += '%s=.' % (idDict[missingId] if choices.idGeneration == 'Counting' else missingId)

                    if curCols[headerIdxs['edges']] == '':
                        curCols[headerIdxs['edges']] = '.'

                    cls._checkOverlap(prev3dReg, curReg, prevLine)
                    if createDense:
                        firstRegInBlock, tempDataLines = cls._writeBlockLines \
                            (firstRegInBlock, prev3dReg, curReg, tempContents, tempDataLines)
                    cls._writeDataLines(curCols, colIndexes, tempDataLines)

                if createDense:
                    firstRegInBlock, tempDataLines = cls._writeBlockLines \
                        (firstRegInBlock, curReg, None, tempContents, tempDataLines)

        # Append the data lines after the header lines.
        tempDataLines.flush()
        tempDataLines.seek(0)
        tempContents.write(tempDataLines.read())
        tempContents.flush()
        tempContents.seek(0)

        expandHeadersOfGtrackFileAndReturnComposer(tempContents.name).composeToFile(galaxyFn)

        # Iterate the result once; presumably so that parsing problems in the
        # generated file surface here as exceptions.
        geSource = GtrackGenomeElementSource(galaxyFn, genome=genome, printWarnings=False)
        for ge in geSource:
            pass

    except Exception as e:
        print >> sys.stderr, e
        raise
def _checkUndirectedEdges(self): if self._headerDict['track type'].startswith('linked'): try: GtrackGenomeElementSource._checkUndirectedEdges(self) except InvalidFormatError: self._headerDict['undirected edges'] = False
def _basicHandleEndOfFile(self):
    """Invoke GtrackGenomeElementSource._handleEndOfFile on this instance.

    The return value of the base handler is discarded.
    """
    baseHandler = GtrackGenomeElementSource._handleEndOfFile
    baseHandler(self)
def __init__(self, *args, **kwArgs):
    """Initialise via GtrackGenomeElementSource and reset the overlap flag.

    All positional and keyword arguments are forwarded unchanged to
    GtrackGenomeElementSource.__init__.
    """
    baseInit = GtrackGenomeElementSource.__init__
    baseInit(self, *args, **kwArgs)

    # Assigned after the base constructor has run.
    self._noOverlappingElements = None
def printGSuite(cls, choices, cols, rows, colListString, outFile):
    """Write one GTrack file per disease/trait and a GSuite listing them.

    Args:
        choices: not used by this method.
        cols: list of column names; must contain cls.DISEASE_COLUMN_NAME,
            cls.CHR_COLUMN_NAME, cls.START_COLUMN_NAME and
            cls.VAL_COLUMN_NAME.
        rows: iterable of rows, each indexable by the positions in cols.
        colListString: not used by this method.
        outFile: destination passed to GSuiteComposer.composeToFile.

    Rows are grouped by disease. Every column other than the four named
    above is stored as an extra GTrack column (empty cells become '.').
    When cls.DATABASE_GENOME differs from cls.OUTPUT_GENOME, positions are
    lifted over with pyliftover; a message is printed for each SNP that does
    not map to exactly one position.
    """
    from quick.extra.ProgressViewer import ProgressViewer
    from gold.gsuite.GSuite import GSuite
    from gold.gsuite.GSuiteTrack import GSuiteTrack, GalaxyGSuiteTrack
    import gold.gsuite.GSuiteComposer as GSuiteComposer
    from gold.origdata.GtrackGenomeElementSource import GtrackGenomeElementSource
    from gold.origdata.GtrackComposer import ExtendedGtrackComposer
    from gold.origdata.GESourceWrapper import ListGESourceWrapper
    from gold.origdata.GenomeElement import GenomeElement
    from collections import defaultdict
    from copy import copy
    from unidecode import unidecode
    from pyliftover import LiftOver

    gSuite = GSuite()

    diseaseColIndex = cols.index(cls.DISEASE_COLUMN_NAME)
    chrColIndex = cols.index(cls.CHR_COLUMN_NAME)
    startColIndex = cols.index(cls.START_COLUMN_NAME)
    valColIndex = cols.index(cls.VAL_COLUMN_NAME)

    # Everything except the four core columns ends up as an extra column.
    orderedExtraKeys = copy(cols)
    extraIndexes = list(range(len(cols)))
    for colName in [cls.DISEASE_COLUMN_NAME, cls.CHR_COLUMN_NAME,
                    cls.START_COLUMN_NAME, cls.VAL_COLUMN_NAME]:
        extraIndexes.remove(cols.index(colName))
        orderedExtraKeys.remove(colName)
    orderedExtraKeys = [cls._fixColNameForGTrack(key) for key in orderedExtraKeys]

    diseaseToRowsDict = defaultdict(list)
    for row in rows:
        disease = row[diseaseColIndex]
        if isinstance(disease, unicode):
            disease = unidecode(disease).replace('\x00', '')
        diseaseToRowsDict[disease].append(row)

    progressViewer = ProgressViewer([('Create GWAS tracks for diseases/traits', len(diseaseToRowsDict))],
                                    cls.extraGalaxyFn[cls.HISTORY_PROGRESS_TITLE])

    # The genome pair is the same for every disease, so the LiftOver object
    # is built once here instead of once per disease.
    shouldLiftOver = cls.DATABASE_GENOME != cls.OUTPUT_GENOME
    liftOver = None
    if shouldLiftOver and diseaseToRowsDict:
        liftOver = LiftOver(cls.DATABASE_GENOME, cls.OUTPUT_GENOME)

    for disease in sorted(diseaseToRowsDict.keys()):
        uri = GalaxyGSuiteTrack.generateURI(galaxyFn=cls.extraGalaxyFn[cls.HISTORY_HIDDEN_TRACK_STORAGE],
                                            extraFileName=disease.replace('/', '_') + '.gtrack')
        gSuiteTrack = GSuiteTrack(uri, title=disease, genome=cls.OUTPUT_GENOME)
        gSuite.addTrack(gSuiteTrack)

        geList = []
        for row in diseaseToRowsDict[disease]:
            extra = {}
            for col, index in zip(orderedExtraKeys, extraIndexes):
                cell = row[index].strip()
                if isinstance(cell, unicode):
                    cell = unidecode(cell)
                extra[col] = cell if cell != '' else '.'

            chrom = 'chr' + row[chrColIndex]
            if chrom == 'chr23':
                chrom = 'chrX'
            if chrom == 'chr24':
                chrom = 'chrY'
            if chrom == 'chrMT':
                chrom = 'chrM'
            start = int(row[startColIndex])

            if shouldLiftOver:
                newPosList = liftOver.convert_coordinate(chrom, start)
                if newPosList is None or len(newPosList) != 1:
                    # The arguments used to be swapped, printing the
                    # chromosome as the position and vice versa.
                    print('SNP with position %s on chromosome %s ' % (start, chrom) +
                          'could not be lifted over from reference genome ' +
                          '%s to %s (for disease/trait "%s")' %
                          (cls.DATABASE_GENOME, cls.OUTPUT_GENOME, disease))
                    # NOTE(review): the element is still added below with its
                    # unlifted coordinates - confirm whether it should be
                    # skipped instead.
                else:
                    chrom, start = newPosList[0][0:2]

            geList.append(GenomeElement(chr=chrom, start=start, val=row[valColIndex],
                                        orderedExtraKeys=orderedExtraKeys, extra=extra))

        geSource = GtrackGenomeElementSource(cls.GTRACK_BLUEPRINT_PATH)
        wrappedGeSource = ListGESourceWrapper(geSource, geList)
        composer = ExtendedGtrackComposer(wrappedGeSource)
        composer.composeToFile(gSuiteTrack.path)

        progressViewer.update()

    GSuiteComposer.composeToFile(gSuite, outFile)
def _isExpandableHeader(self, line, onlyGuaranteed): return self._isHeaderLine(line) and \ ( (Gtrack.getHeaderKeyValue(line)[0] in EXPANDABLE_HEADERS) or \ (not onlyGuaranteed and Gtrack.getHeaderKeyValue(line)[0] in NOT_GUARANTEED_EXPANDABLE_HEADERS) )
def _getHbColumnsFromGtrackColumns(self, columns): return [Gtrack.convertNameFromGtrack(col) for col in columns]
def _getGtrackColumnsFromHbColumns(self, hbColumns): return [ Gtrack.convertNameToGtrack(col) for col in hbColumns if col != 'weights' ]
def _getGtrackColumnsFromHbColumns(self, hbColumns): return [Gtrack.convertNameToGtrack(col) for col in hbColumns if col != 'weights']
def testHeaderExpansion(self):
    """Strip expandable headers from each gtrack case and expand them again.

    Cases whose name does not start with 'gtrack' or contains 'no_expand'
    are skipped. For the others the expandable headers are removed, the file
    is expanded twice (all headers / only non-default headers), and, unless
    the case name contains 'no_check_expand', the expanded header values,
    the genome elements and the bounding regions are compared with the
    expectations stored on the case.
    """
    suite = self._commonSetup()
    divider = '-----'

    for caseName in suite.cases:
        if not caseName.startswith('gtrack'):
            continue

        if 'no_expand' in caseName:
            print('Test case skipped: ' + caseName)
            continue

        onlyGuaranteed = 'no_types_expanded' in caseName

        print(caseName)
        print('===========')
        case = suite.cases[caseName]

        # Header lines are rewritten as lower-cased '##key: value'; all other
        # lines are kept as they are.
        headerLines = []
        for line in case.headerLines:
            if self._isHeaderLine(line):
                keyAndValue = Gtrack.getHeaderKeyValue(line.strip())
                line = '##' + ': '.join([str(x).lower() for x in keyAndValue])
            headerLines.append(line)

        print('Original:\n\n' + os.linesep.join(headerLines + case.lines))

        keptHeaderLines = []
        for line in headerLines:
            if not self._isExpandableHeader(line, onlyGuaranteed):
                keptHeaderLines.append(line)
        case.headerLines = keptHeaderLines
        print(divider)
        print('With headers removed:\n\n' + os.linesep.join(case.headerLines + case.lines))

        testFn = self._writeTestFile(case)

        expandedContents = expandHeadersOfGtrackFileAndReturnContents(
            testFn, case.genome, onlyNonDefault=False)
        print(divider)
        print('With expanded headers:\n\n' + expandedContents)

        expandedContentsOnlyNonDefaults = expandHeadersOfGtrackFileAndReturnContents(
            testFn, case.genome, onlyNonDefault=True)
        print(divider)
        print('With expanded headers (only non-default headers):\n\n' + expandedContentsOnlyNonDefaults)

        origExpandableHeaders = dict(
            Gtrack.getHeaderKeyValue(line) for line in headerLines
            if self._isExpandableHeader(line, onlyGuaranteed=False))
        notExpandableHeaders = dict(
            Gtrack.getHeaderKeyValue(line) for line in case.headerLines
            if self._isHeaderLine(line) and not self._isValueNotKeptHeader(line))
        expandedHeaders = dict(
            Gtrack.getHeaderKeyValue(line) for line in expandedContents.split(os.linesep)
            if self._isHeaderLine(line))

        if 'no_check_expand' in caseName:
            print('No checks for case: ' + caseName)
            continue

        for header in origExpandableHeaders:
            self.assertEquals(origExpandableHeaders[header], expandedHeaders[header])

        for header in notExpandableHeaders:
            self.assertEquals(notExpandableHeaders[header], expandedHeaders[header])

        for contents in [expandedContents, expandedContentsOnlyNonDefaults]:
            usesDefaultSource = case.sourceClass is None
            sourceClass = GenomeElementSource if usesDefaultSource else case.sourceClass
            forPreProcessor = True if usesDefaultSource else False

            parsedSource = sourceClass('expanded.gtrack', case.genome,
                                       forPreProcessor=forPreProcessor,
                                       printWarnings=False,
                                       strToUseInsteadOfFn=contents)
            stdGeSource = GEDependentAttributesHolder(parsedSource)

            self.assertEquals(case.assertElementList, [ge for ge in stdGeSource])
            self.assertEquals(case.boundingRegionsAssertList,
                              [br for br in stdGeSource.getBoundingRegionTuples()])
def _isValueNotKeptHeader(self, line): return self._isHeaderLine(line) and \ Gtrack.getHeaderKeyValue(line)[0] in VALUE_NOT_KEPT_HEADERS
def execute(cls, choices, galaxyFn=None, username=''):
    """Write the regions covered by exactly the selected set of tracks.

    choices[0] is the genome, choices[1] a dict whose non-empty values are
    URL-quoted history track names, choices[2] a comma-separated list of
    integer flags (one per track, non-zero meaning the track is selected),
    and choices[3:] further track names.

    Each track is assigned a prime number. While sweeping the sorted start
    and end positions of a chromosome, the running state is the product of
    the primes of the tracks covering the current position. Every interval
    whose state equals the product of the selected tracks' primes is written
    to galaxyFn as a tab-separated 'chrom, start, end' line.
    """
    from quick.application.ExternalTrackManager import ExternalTrackManager
    from collections import defaultdict
    from gold.origdata.BedGenomeElementSource import BedGenomeElementSource, BedCategoryGenomeElementSource
    from gold.origdata.GtrackGenomeElementSource import GtrackGenomeElementSource
    from gold.origdata.TrackGenomeElementSource import FullTrackGenomeElementSource
    from urllib import unquote

    print(choices)
    genome = choices[0]

    selectedHists = [unquote(val).split(':') for _key, val in choices[1].iteritems() if val]
    inorout = [int(x) for x in choices[2].split(',')]
    selectedHists += [v.split(':') for v in choices[3:]
                      if v not in ['----- Select -----', 'no', 'yes', None, '']]

    # labelNames is collected but not consumed in this method. The name
    # lookup is kept because a failure there also triggers the fallback.
    geSourceList, labelNames = [], []
    for track in selectedHists:
        try:
            fileType = ExternalTrackManager.extractFileSuffixFromGalaxyTN(track)
            fn = ExternalTrackManager.extractFnFromGalaxyTN(track)
            if fileType == 'category.bed':
                geSource = BedCategoryGenomeElementSource(fn)
            elif fileType == 'gtrack':
                geSource = GtrackGenomeElementSource(fn)
            else:
                geSource = BedGenomeElementSource(fn)
            label = ExternalTrackManager.extractNameFromHistoryTN(track)
        except Exception:
            # Fall back to reading the track as a full track of the genome.
            geSource = FullTrackGenomeElementSource(genome, track, allowOverlaps=False)
            label = ':'.join(track)
        # Appending after the try block guarantees exactly one source per
        # track even when the name lookup fails.
        geSourceList.append(geSource)
        labelNames.append(label)

    primeList = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59]

    # Per chromosome: flat lists of positions and, in parallel, +prime for
    # an element start and -prime for an element end.
    posDict = defaultdict(list)
    catDict = defaultdict(list)
    for index, geSource in enumerate(geSourceList):
        primeNum = primeList[index]
        for ge in geSource:
            posDict[ge.chr] += [ge.start, ge.end]
            catDict[ge.chr] += [primeNum, -primeNum]

    selectedState = 1
    for n, _geSource in enumerate(geSourceList):
        if inorout[n]:
            selectedState = selectedState * primeList[n]

    with open(galaxyFn, 'w') as utfil:
        for chrom in posDict:
            posList = posDict[chrom]
            catList = catDict[chrom]
            indxSortedList = sorted(range(len(posList)), key=posList.__getitem__)

            catCoverageDepth = defaultdict(int)
            currentState = 1
            currentPos = 0
            for indx in indxSortedList:
                pos = posList[indx]
                primeVal = catList[indx]

                if currentPos != pos:
                    if abs(currentState) == selectedState:
                        utfil.write('%s\t%i\t%i' % (chrom, currentPos, pos) + '\n')
                    currentPos = pos

                if primeVal < 0:
                    catCoverageDepth[abs(primeVal)] -= 1
                    if catCoverageDepth[abs(primeVal)] == 0:
                        # Exact integer division; dividing by the negative
                        # prime flips the sign, hence abs() above.
                        currentState //= primeVal
                else:
                    catCoverageDepth[primeVal] += 1
                    if catCoverageDepth[primeVal] == 1:
                        currentState *= primeVal
def execute(cls, choices, galaxyFn=None, username=''):
    '''Is called when execute-button is pushed by web-user.

    Writes, for every chromosome and category value of the selected track,
    one tab-separated line 'chrom, lower, upper + 1, category' to galaxyFn,
    where lower is the smallest start and upper the largest end (or start,
    for elements without end) seen for that category on that chromosome.

    choices[0] is the genome, choices[1] selects between the HyperBrowser
    repository and a history element, choices[2] is the ':'-separated track
    name and choices[3] == 'Yes' allows overlapping result regions. When
    overlaps are not allowed and conflicting regions are found, the contents
    of galaxyFn are replaced by an error message listing the conflicts.
    '''
    from quick.application.ExternalTrackManager import ExternalTrackManager
    from gold.origdata.BedGenomeElementSource import BedCategoryGenomeElementSource
    from gold.origdata.GtrackGenomeElementSource import GtrackGenomeElementSource
    from gold.origdata.TrackGenomeElementSource import TrackGenomeElementSource
    from gold.track.GenomeRegion import GenomeRegion
    from quick.util.GenomeInfo import GenomeInfo
    from collections import defaultdict

    genome = choices[0]
    track = choices[2].split(':')
    allowOverlaps = choices[3] == 'Yes'

    # One region per chromosome, spanning the whole chromosome.
    regionList = []
    for chrom in GenomeInfo.getChrList(genome):
        chromSize = GenomeInfo.getChrLen(genome, chrom)
        regionList.append(GenomeRegion(genome, chrom, 0, chromSize))

    if choices[1] == 'From Hyperbrowser repository':
        geSource = TrackGenomeElementSource(genome, track, regionList)
    else:
        fileType = ExternalTrackManager.extractFileSuffixFromGalaxyTN(track)
        fn = ExternalTrackManager.extractFnFromGalaxyTN(track)
        if fileType == 'category.bed':
            geSource = BedCategoryGenomeElementSource(fn)
        else:
            geSource = GtrackGenomeElementSource(fn)

    # chrom -> category -> smallest start / largest end seen so far.
    resultMinDict = defaultdict(dict)
    resultMaxDict = defaultdict(dict)
    for ge in geSource:
        minByCat = resultMinDict[ge.chr]
        maxByCat = resultMaxDict[ge.chr]
        # Elements without an end contribute their start as upper bound.
        upperCandidate = ge.end if ge.end else ge.start

        if ge.val in maxByCat:
            if maxByCat[ge.val] < upperCandidate:
                maxByCat[ge.val] = upperCandidate
            if minByCat[ge.val] > ge.start:
                minByCat[ge.val] = ge.start
        else:
            maxByCat[ge.val] = upperCandidate
            minByCat[ge.val] = ge.start

    quitFlag = False
    catsConflicting = []
    with open(galaxyFn, 'w') as utfil:
        for chrom in sorted(resultMinDict.keys()):
            for category in resultMinDict[chrom].keys():
                lower = resultMinDict[chrom][category]
                upper = resultMaxDict[chrom][category]

                if not allowOverlaps:
                    for cat in resultMinDict[chrom]:
                        if cat == category:
                            continue
                        l, u = resultMinDict[chrom][cat], resultMaxDict[chrom][cat]
                        if l >= upper or u <= lower:
                            continue
                        # NOTE(review): two categories with identical extents
                        # pass this test and are not reported - confirm
                        # whether that is intended.
                        if l > lower or u < upper:
                            quitFlag = True
                            catsConflicting.append(
                                '(Category: %s, Region: %i - %i) vs. (Category: %s, Region: %i - %i)'
                                % (category, lower, upper, cat, l, u))

                # NOTE(review): upper + 1 is written as the end coordinate;
                # confirm this matches the end convention of the input.
                utfil.write('\t'.join([chrom, str(lower), str(upper + 1), category]) + '\n')

    if quitFlag:
        # Replace the regions written above with the error report.
        with open(galaxyFn, 'w') as errorFile:
            errorFile.write(
                'Error: overlapping resulting regions are not allowed with selected preferences:\n'
                + '\n'.join(catsConflicting))
def _isExpandableHeader(self, line, onlyGuaranteed): return self._isHeaderLine(line) and \ ( (Gtrack.getHeaderKeyValue(line)[0] in EXPANDABLE_HEADERS) or \ (not onlyGuaranteed and Gtrack.getHeaderKeyValue(line)[0] in NOT_GUARANTEED_EXPANDABLE_HEADERS) )
def _isValueNotKeptHeader(self, line): return self._isHeaderLine(line) and \ Gtrack.getHeaderKeyValue(line)[0] in VALUE_NOT_KEPT_HEADERS
def testHeaderExpansion(self):
    """Strip expandable headers from each gtrack case and expand them again.

    Cases whose name does not start with 'gtrack' or contains 'no_expand'
    are skipped. For the others the expandable headers are removed, the file
    is expanded twice (all headers / only non-default headers), and, unless
    the case name contains 'no_check_expand', the expanded header values are
    compared with the original ones and both expanded files are loaded into
    a genome element source.
    """
    suite = self._commonSetup()
    divider = '-----'

    for caseName in suite.cases:
        if not caseName.startswith('gtrack'):
            continue

        if 'no_expand' in caseName:
            print('Test case skipped: ' + caseName)
            continue

        onlyGuaranteed = 'no_types_expanded' in caseName

        print(caseName)
        print('===========')
        case = suite.cases[caseName]

        # Header lines are rewritten as lower-cased '##key: value'; all other
        # lines are kept as they are.
        headerLines = []
        for line in case.headerLines:
            if self._isHeaderLine(line):
                keyAndValue = Gtrack.getHeaderKeyValue(line.strip())
                line = '##' + ': '.join([str(x).lower() for x in keyAndValue])
            headerLines.append(line)

        print('Original:\n\n' + os.linesep.join(headerLines + case.lines))

        keptHeaderLines = []
        for line in headerLines:
            if not self._isExpandableHeader(line, onlyGuaranteed):
                keptHeaderLines.append(line)
        case.headerLines = keptHeaderLines
        print(divider)
        print('With headers removed:\n\n' + os.linesep.join(case.headerLines + case.lines))

        testFn = self._writeTestFile(case)

        expandedContents = expandHeadersOfGtrackFileAndReturnContents(
            testFn, case.genome, onlyNonDefault=False)
        print(divider)
        print('With expanded headers:\n\n' + expandedContents)

        expandedContentsOnlyNonDefaults = expandHeadersOfGtrackFileAndReturnContents(
            testFn, case.genome, onlyNonDefault=True)
        print(divider)
        print('With expanded headers (only non-default headers):\n\n' + expandedContentsOnlyNonDefaults)

        origExpandableHeaders = dict(
            Gtrack.getHeaderKeyValue(line) for line in headerLines
            if self._isExpandableHeader(line, onlyGuaranteed=False))
        notExpandableHeaders = dict(
            Gtrack.getHeaderKeyValue(line) for line in case.headerLines
            if self._isHeaderLine(line) and not self._isValueNotKeptHeader(line))
        expandedHeaders = dict(
            Gtrack.getHeaderKeyValue(line) for line in expandedContents.split(os.linesep)
            if self._isHeaderLine(line))

        if 'no_check_expand' in caseName:
            print('No checks for case: ' + caseName)
            continue

        for header in origExpandableHeaders:
            self.assertEquals(origExpandableHeaders[header], expandedHeaders[header])

        for header in notExpandableHeaders:
            self.assertEquals(notExpandableHeaders[header], expandedHeaders[header])

        for contents in [expandedContents, expandedContentsOnlyNonDefaults]:
            usesDefaultSource = case.sourceClass is None
            sourceClass = GenomeElementSource if usesDefaultSource else case.sourceClass
            forPreProcessor = True if usesDefaultSource else False

            parsedSource = sourceClass('expanded.gtrack', case.genome,
                                       forPreProcessor=forPreProcessor,
                                       printWarnings=False,
                                       strToUseInsteadOfFn=contents)
            # NOTE(review): the source is only constructed here; nothing is
            # asserted about its elements - confirm whether that is intended.
            stdGeSource = GEDependentAttributesHolder(parsedSource)
def _createColumnSpec(self, cols, addAnyExtraFixedCols=True):
    """Build the column spec via the base class and refresh the track type.

    After GtrackGenomeElementSource._createColumnSpec has run, the
    'track type' header is set to the track type that
    getTrackTypeFromColumnSpec derives from self._columnSpec.
    """
    baseClass = GtrackGenomeElementSource
    baseClass._createColumnSpec(self, cols, addAnyExtraFixedCols)

    trackType = baseClass.getTrackTypeFromColumnSpec(self._columnSpec)
    self._headerDict['track type'] = trackType