def evaluatePvalueAndNullDistributionList(observedAndMcSamplesTuple, tail, rawStatisticMainClassName):
    """
    Evaluate p-value and null distribution once per reference track title.

    observedAndMcSamplesTuple -- 2-sequence. Element 0 is the observed result,
        a mapping whose values answer isPairedTs(), can be indexed with
        'reference' (giving an object with a metadata['title'] entry) and
        carry a 'result' attribute. Element 1 is an iterable of Monte Carlo
        sample mappings whose values have the same shape.
    tail, rawStatisticMainClassName -- passed on unchanged to
        evaluatePvalueAndNullDistribution.

    Returns an OrderedDict from track title to whatever
    evaluatePvalueAndNullDistribution returns for that track, in the order
    the titles appear in the observed result.

    Raises NotImplementedError if any observed value is not a paired track
    structure, and AssertionError if two observed values share a title.
    """
    #TODO: What is received is not a list of tuples, it is a tuple of the real result which is a
    # TrackStructure whose result is a list of raw values and list of such track structures.
    # Need to find a way to handle it.
    observedResult = observedAndMcSamplesTuple[0]
    mcSamplesTsList = observedAndMcSamplesTuple[1]

    #TODO: What about categorial ts results?
    if not all(val.isPairedTs() for val in observedResult.values()):
        #isFlat?
        # NotImplementedError is still an Exception, so existing handlers
        # that catch Exception keep working.
        raise NotImplementedError('not implemented yet!')

    observedResultDict = OrderedDict()
    for pairedTs in observedResult.values():
        trackTitle = pairedTs['reference'].metadata['title']
        assert trackTitle not in observedResultDict, "%s already in observed results dict" % trackTitle
        observedResultDict[trackTitle] = pairedTs.result

    # Group the Monte Carlo results by the same title key as the observations.
    mcSamplesResultDict = OrderedDefaultDict(list)
    for mcSampleTs in mcSamplesTsList:
        for pairedTs in mcSampleTs.values():
            trackTitle = pairedTs['reference'].metadata['title']
            mcSamplesResultDict[trackTitle].append(pairedTs.result)

    resultsDict = OrderedDict()
    for trackTitle, observation in observedResultDict.items():
        resultsDict[trackTitle] = evaluatePvalueAndNullDistribution(
            (observation, mcSamplesResultDict[trackTitle]), tail, rawStatisticMainClassName)
    return resultsDict
def __init__(self, geSource):
    """
    Set up the manager around geSource with empty statistics containers.

    The source is stored in decorated form (see _decorateGESource), while
    the categorical flags are derived from the undecorated geSource. No
    statistics are computed here; _hasCalculatedStats starts out False.
    """
    self._geSource = self._decorateGESource(geSource)
    self._boundingRegionsAndGEsCorrespond = None

    # Values / edge weights count as categorical when the track format
    # reports the type name 'Category'.
    valTypeName = TrackFormat.createInstanceFromGeSource(geSource).getValTypeName()
    self._areValsCategorical = (valTypeName == 'Category')
    weightTypeName = TrackFormat.createInstanceFromGeSource(geSource).getWeightTypeName()
    self._areEdgeWeightsCategorical = (weightTypeName == 'Category')

    self._valCategories = set()
    self._edgeWeightCategories = set()

    # Containers keyed by chromosome; each new key gets a zeroed entry.
    self._numElements = OrderedDefaultDict(int)
    strLenKeys = self._getMaxStrLensKeys()
    newStrLensEntry = partial(self._initMaxStrLens, strLenKeys)
    self._maxStrLens = OrderedDefaultDict(newStrLensEntry)
    self._maxNumEdges = OrderedDefaultDict(int)

    self._hasCalculatedStats = False
class GESourceManager(object):
    """
    Wraps a genome element source and exposes statistics about its elements.

    The statistics (element counts, value / edge weight categories, maximum
    string lengths and maximum edge counts, all collected per chromosome) are
    gathered in a single extra pass over the source. That pass is triggered
    by the first getter that needs it and is not repeated once it has
    completed.
    """

    def __init__(self, geSource):
        """Store the (decorated) source and create empty statistics containers."""
        self._geSource = self._decorateGESource(geSource)
        self._boundingRegionsAndGEsCorrespond = None
        self._areValsCategorical = TrackFormat.createInstanceFromGeSource(geSource).getValTypeName() == 'Category'
        self._areEdgeWeightsCategorical = TrackFormat.createInstanceFromGeSource(geSource).getWeightTypeName() == 'Category'
        self._valCategories = set()
        self._edgeWeightCategories = set()
        self._numElements = OrderedDefaultDict(int)
        self._maxStrLens = OrderedDefaultDict(partial(self._initMaxStrLens, self._getMaxStrLensKeys()))
        self._maxNumEdges = OrderedDefaultDict(int)
        self._hasCalculatedStats = False
        # The statistics pass is deliberately not run here; the getters
        # trigger it on demand:
        # self._calcStatisticsInExtraPass()

    def _decorateGESource(self, geSource):
        """Return geSource wrapped in a GEDependentAttributesHolder."""
        return GEDependentAttributesHolder(geSource)

    def _getMaxStrLensKeys(self):
        """
        Return the prefixes for which maximum string lengths are tracked.

        'val' and 'weights' are included only when the source reports data
        type 'S' for them; 'id' and 'edges' are included whenever present;
        every prefix not in RESERVED_PREFIXES is always included.
        """
        prefixSet = set(self._geSource.getPrefixList())
        keys = []
        if 'val' in prefixSet and self._geSource.getValDataType() == 'S':
            keys.append('val')
        if 'id' in prefixSet:
            keys.append('id')
        if 'edges' in prefixSet:
            keys.append('edges')
        if 'weights' in prefixSet and self._geSource.getEdgeWeightDataType() == 'S':
            keys.append('weights')
        keys.extend(x for x in prefixSet if x not in RESERVED_PREFIXES)
        return keys

    @staticmethod
    def _initMaxStrLens(keys):
        """Return a new dict mapping every key in keys to 0."""
        return dict.fromkeys(keys, 0)

    def _calcStatisticsInExtraPass(self):
        """
        Iterate once over the source and fill the statistics containers.

        Does nothing if a previous pass has completed. Warnings of the source
        are switched off for the duration of the pass and are restored
        afterwards, also when the pass raises.

        Raises NotImplementedError for slice-based sources that would need
        string length statistics.
        """
        if self._hasCalculatedStats:
            return

        prevPrintWarnings = self._geSource.getPrintWarnings()
        self._geSource.setPrintWarnings(False)
        try:
            if self._geSource.isSliceSource():
                if len(self._getMaxStrLensKeys()):
                    raise NotImplementedError('Dimension calculation not yet implemented for slice-based GenomeElementSources.')

                # Each yielded item holds a whole slice; its element count is
                # the length of the array stored under the first prefix.
                prefixList = self._geSource.getPrefixList()
                for el in self._geSource:
                    chr = el.chr
                    self._numElements[chr] += len(getattr(el, prefixList[0]))
            else:
                for el in self._geSource:
                    chr = el.chr
                    self._numElements[chr] += 1

                    # Blank elements are counted, but contribute nothing else.
                    if el.isBlankElement:
                        continue

                    if self._areValsCategorical:
                        self._valCategories.add(el.val)

                    if self._areEdgeWeightsCategorical:
                        self._edgeWeightCategories |= set(el.weights)

                    maxStrLensForChr = self._maxStrLens[chr]
                    for prefix in maxStrLensForChr:
                        content = getattr(el, prefix, None)
                        if content is None:
                            continue

                        # Recorded lengths never drop below 1, also for
                        # empty strings and empty sequences.
                        if isinstance(content, basestring):
                            contentLen = max(1, len(content))
                        else:
                            contentLen = max([1] + [len(x) for x in flatten(content)])
                        maxStrLensForChr[prefix] = max(maxStrLensForChr[prefix], contentLen)

                        if prefix == 'edges':
                            self._maxNumEdges[chr] = max(self._maxNumEdges[chr], len(el.edges))
        finally:
            # Always hand the source back with its original warning setting.
            # NOTE(review): counters updated before a failure are not rolled
            # back, so a retried pass would add to them -- confirm whether
            # callers ever retry.
            self._geSource.setPrintWarnings(prevPrintWarnings)

        self._hasCalculatedStats = True

    def getGESource(self):
        """Return the decorated genome element source."""
        return self._geSource

    def getBoundingRegionTuples(self):
        """
        Return the bounding region tuples of the source.

        Tuples whose region has no chromosome are dropped. If none remain,
        one tuple per chromosome found in the elements is created, spanning
        from 0 to the chromosome length and carrying that chromosome's
        element count. Whether the source's own regions were used is
        recorded for boundingRegionsAndGEsCorrespond().
        """
        boundingRegionTuples = [x for x in self._getBoundingRegionTuples()
                                if x.region.chr is not None]

        if len(boundingRegionTuples) == 0:
            from gold.origdata.GenomeElementSource import BoundingRegionTuple
            from gold.track.GenomeRegion import GenomeRegion
            from quick.util.GenomeInfo import GenomeInfo

            geChrList = self.getAllChrs()
            boundingRegionTuples = [
                BoundingRegionTuple(
                    GenomeRegion(chr=chr, start=0, end=GenomeInfo.getChrLen(self._geSource.genome, chr)),
                    self.getNumElementsForChr(chr))
                for chr in geChrList]
            self._boundingRegionsAndGEsCorrespond = False
        else:
            self._boundingRegionsAndGEsCorrespond = True

        return boundingRegionTuples

    def _getBoundingRegionTuples(self):
        """Return the bounding region tuples exactly as the source reports them."""
        return self._geSource.getBoundingRegionTuples()

    def boundingRegionsAndGEsCorrespond(self):
        """
        Return True if the last getBoundingRegionTuples() call used the
        source's own regions, False if it had to create them.

        getBoundingRegionTuples() must have been called first.
        """
        assert self._boundingRegionsAndGEsCorrespond is not None
        return self._boundingRegionsAndGEsCorrespond

    # Plain delegation to the wrapped source.

    def getPrefixList(self):
        return self._geSource.getPrefixList()

    def getValDataType(self):
        return self._geSource.getValDataType()

    def getValDim(self):
        return self._geSource.getValDim()

    def getEdgeWeightDataType(self):
        return self._geSource.getEdgeWeightDataType()

    def getEdgeWeightDim(self):
        return self._geSource.getEdgeWeightDim()

    def isSorted(self):
        return self._geSource.isSorted()

    # Getters below trigger the statistics pass if it has not completed yet.

    def getAllChrs(self):
        """Return the chromosomes for which elements were counted."""
        self._calcStatisticsInExtraPass()
        return self._numElements.keys()

    def getNumElements(self):
        """Return the total element count over all chromosomes."""
        self._calcStatisticsInExtraPass()
        return sum(self._numElements.values())

    def getNumElementsForChr(self, chr):
        """Return the element count for chr."""
        self._calcStatisticsInExtraPass()
        return self._numElements[chr]

    def getValCategories(self):
        """Return the set of values collected when values are categorical."""
        self._calcStatisticsInExtraPass()
        return self._valCategories

    def getEdgeWeightCategories(self):
        """Return the set of edge weights collected when weights are categorical."""
        self._calcStatisticsInExtraPass()
        return self._edgeWeightCategories

    def getMaxNumEdges(self):
        """Return the largest per-chromosome maximum edge count."""
        self._calcStatisticsInExtraPass()
        return max(self._maxNumEdges.values())

    def getMaxNumEdgesForChr(self, chr):
        """Return the maximum edge count for chr."""
        self._calcStatisticsInExtraPass()
        return self._maxNumEdges[chr]

    def getMaxStrLens(self):
        """Return, per tracked prefix, the maximum string length over all chromosomes."""
        self._calcStatisticsInExtraPass()
        return reduce(lambda x, y: dict((key, max(x[key], y[key])) for key in x.keys()),
                      self._maxStrLens.values())

    def getMaxStrLensForChr(self, chr):
        """Return the prefix -> maximum string length dict for chr."""
        self._calcStatisticsInExtraPass()
        return self._maxStrLens[chr]

    def getMaxChrStrLen(self):
        """Return the length of the longest chromosome name among the keys of the string length statistics."""
        self._calcStatisticsInExtraPass()
        return max(len(chr) for chr in self._maxStrLens.keys())
def __init__(self, statistic=None):
    """
    Create a spec holding at most one statistic class and no options yet.

    statistic -- stored as the only entry of the statistic list when truthy;
        otherwise the list starts out empty.
    """
    if statistic:
        statClasses = [statistic]
    else:
        statClasses = []

    self._statClassList = statClasses
    # Parts in the order they were added ...
    self._analysisParts = []
    # ... and the option parts again, grouped in lists per label key.
    self._analysisOptionsDict = OrderedDefaultDict(list)
class AnalysisSpec(object):
    """
    Holds a statistic class together with an ordered list of analysis parts.

    Option parts are additionally indexed by their label key, so that all
    options sharing a key can be looked up or removed together.
    """
    #Only supports a single stat, at least for now

    #Takes a MagicStatFactory, as this will resolve into either an unsplittable or a splittable statistic according to what's suited
    #Note: maybe MagicStatFactory should have a synonymous class name that would appear less intrusive in a setting like this?
    #@takes(AnalysisSpec, MagicStatFactory)
    def __init__(self, statistic=None):
        """Start with statistic (if truthy) as the only statistic class and no parts."""
        self._statClassList = [statistic] if statistic else []
        self._analysisParts = []
        self._analysisOptionsDict = OrderedDefaultDict(list)

    def integrateParsedAnalysis(self, other):
        """Replace the contents of this spec with shallow copies of those of other."""
        assert isinstance(other, AnalysisSpec)
        self._statClassList = copy(other._statClassList)
        self._analysisParts = copy(other._analysisParts)
        self._analysisOptionsDict = copy(other._analysisOptionsDict)

    #@takes(str, str)
    def addParameter(self, paramName, paramValue):
        """Add the option '[paramName=paramValue]' to the spec."""
        self._appendAnalysisOption('[%s=%s]' % (paramName, paramValue))

    def _appendAnalysisOption(self, optionLine):
        """Parse optionLine into an AnalysisOption and register it in both containers."""
        analysisOption = AnalysisOption(optionLine)
        self._analysisParts.append(analysisOption)
        self._analysisOptionsDict[analysisOption.getLabelKey()].append(analysisOption)

    def _removeAnalysisOption(self, optionLabelKey):
        """
        Remove every option registered under optionLabelKey.

        Raises ShouldNotOccurError if no option is registered under the key.
        """
        for option in self._allAnalysisOptions([optionLabelKey], raiseIfEmpty=True):
            self._analysisParts.remove(option)
        # Dictionaries have no remove(); drop the entry by key. The loop above
        # has already raised if the key held no options.
        del self._analysisOptionsDict[optionLabelKey]

    def _allAnalysisOptions(self, labelKeys=None, onlyWithLabelText=False, onlyActivated=False, raiseIfEmpty=False):
        """
        Yield the options registered under labelKeys, key by key.

        labelKeys -- keys to look up; all registered keys when None. Unknown
            keys are skipped.
        onlyWithLabelText -- skip options whose label text is ''.
        onlyActivated -- skip options that report not being activated for
            this spec.
        raiseIfEmpty -- raise ShouldNotOccurError once iteration is exhausted
            if nothing was yielded.
        """
        empty = True
        if labelKeys is None:
            labelKeys = self._analysisOptionsDict.keys()

        for key in labelKeys:
            # get() rather than indexing, so that looking up an unknown key
            # cannot create an entry in the default dict.
            options = self._analysisOptionsDict.get(key)
            if options is None:
                continue

            if onlyWithLabelText:
                options = [opt for opt in options if opt.getLabelText() != '']
            if onlyActivated:
                options = [opt for opt in options if opt.isActivated(self)]

            if options:
                empty = False
                for opt in options:
                    yield opt

        if empty and raiseIfEmpty:
            raise ShouldNotOccurError

    def getDefAfterChoices(self, filterByActivation=False):
        """
        Return the analysis definition string with the option choices applied.

        The parts are concatenated in order (options through
        getDefAfterChoice(), anything else through str()), followed by
        ' -> ' and the comma-separated names of the statistic classes.

        filterByActivation -- leave out options that report not being
            activated for this spec.
        """
        pieces = []
        for part in self._analysisParts:
            if isinstance(part, AnalysisOption):
                if filterByActivation and not part.isActivated(self):
                    continue
                pieces.append(part.getDefAfterChoice())
            else:
                pieces.append(str(part))
        pieces.append(' -> ' + ','.join([x.__name__ for x in self._statClassList]))
        return ''.join(pieces)