def localGenerateInput(self, model, myInput):
  """
    Function to select the next most informative point for refining the limit
    surface search.  After this method is called, the self.inputInfo should be
    ready to be sent to the model
    @ In, model, model instance, an instance of a model
    @ In, myInput, list, a list of the original needed inputs for the model (e.g. list of files, etc.)
    @ Out, None
  """
  try:
    # sparse grid points are consumed in order; the counter was advanced by the caller
    pt, weight = self.sparseGrid[self.counter - 1]
  except IndexError:
    # no points left in the sparse grid -> signal the sampler is exhausted
    raise utils.NoMoreSamplesNeeded
  for v, varName in enumerate(self.sparseGrid.varNames):
    # compute the SampledVarsPb for 1-D distribution
    if self.variables2distributionsMapping[varName]['totDim'] == 1:
      # a comma-separated varName denotes several sampled keys sharing one value
      for key in varName.strip().split(','):
        self.values[key] = pt[v]
      self.inputInfo['SampledVarsPb'][varName] = self.distDict[varName].pdf(pt[v])
      self.inputInfo['ProbabilityWeight-' + varName.replace(",", "-")] = self.inputInfo['SampledVarsPb'][varName]
    # compute the SampledVarsPb for N-D distribution
    # Assume only one N-D distribution is associated with sparse grid collocation method
    elif self.variables2distributionsMapping[varName]['totDim'] > 1 and self.variables2distributionsMapping[varName]['reducedDim'] == 1:
      dist = self.variables2distributionsMapping[varName]['name']
      ndCoordinates = np.zeros(len(self.distributions2variablesMapping[dist]))
      positionList = self.distributions2variablesIndexList[dist]
      for varDict in self.distributions2variablesMapping[dist]:
        var = utils.first(varDict.keys())
        position = utils.first(varDict.values())
        location = -1
        # locate this variable within the sparse grid variable ordering
        for key in var.strip().split(','):
          if key in self.sparseGrid.varNames:
            location = self.sparseGrid.varNames.index(key)
            break
        if location > -1:
          ndCoordinates[positionList.index(position)] = pt[location]
        else:
          self.raiseAnError(IOError, 'The variables ' + var + ' listed in sparse grid collocation sampler, but not used in the ROM!')
        for key in var.strip().split(','):
          self.values[key] = pt[location]
      self.inputInfo['SampledVarsPb'][varName] = self.distDict[varName].pdf(ndCoordinates)
      # BUGFIX: replace commas with '-' (was '!'), so the weight key follows the
      # same 'ProbabilityWeight-<var-with-dashes>' convention as the 1-D branch above
      self.inputInfo['ProbabilityWeight-' + varName.replace(",", "-")] = self.inputInfo['SampledVarsPb'][varName]
  self.inputInfo['ProbabilityWeight'] = weight
  # point probability is the product of all sampled-variable pdfs
  self.inputInfo['PointProbability'] = reduce(mul, self.inputInfo['SampledVarsPb'].values())
  self.inputInfo['SamplerType'] = 'Sparse Grid Collocation'
def pcaTransform(self, varsDict, dist):
  """
    Maps latent variables onto the manifest (model input) variables.
    Both the latent and the manifest variable values end up stored in self.values.
    @ In, varsDict, dict, dictionary contains latent and manifest variables
      {'latentVariables':[latentVar1,latentVar2,...], 'manifestVariables':[var1,var2,...]}
    @ In, dist, string, the distribution name associated with given variable set
    @ Out, None
  """
  # gather the sampled values (and their indices) for every latent variable
  latentValues, latentIndices = [], []
  for idx, latentName in enumerate(varsDict['latentVariables']):
    for sampledName, sampledValue in self.values.items():
      if sampledName == latentName:
        latentValues.append(sampledValue)
        latentIndices.append(varsDict['latentVariablesIndex'][idx])
  # any variable of this distribution gives access to the shared PCA transform
  anchorVar = utils.first(utils.first(self.distributions2variablesMapping[dist]).keys())
  transformed = self.distDict[anchorVar].pcaInverseTransform(latentValues, latentIndices)
  # scatter the transformed values back into manifest-variable order
  manifestValues = [None] * len(varsDict['manifestVariables'])
  for srcIdx, destIdx in enumerate(varsDict['manifestVariablesIndex']):
    manifestValues[destIdx] = transformed[srcIdx]
  self.values.update(dict(zip(varsDict['manifestVariables'], manifestValues)))
def _computeWeightedPercentile(self, arrayIn, pbWeight, percent=0.5):
  """
    Method to compute the weighted percentile in a array of data
    @ In, arrayIn, list/numpy.array, the array of values from which the percentile needs to be estimated
    @ In, pbWeight, list/numpy.array, the reliability weights that correspond to the values in 'array'
    @ In, percent, float, the percentile that needs to be computed (between 0.01 and 1.0)
    @ Out, result, float, the percentile
  """
  order = np.argsort(np.asarray(list(zip(pbWeight, arrayIn)))[:, 1])
  # Prepend [0.0, smallest point]: needed when few samples are generated and a
  # percentile below the first probability weight is requested; without it the
  # median would be returned instead.
  sortedWeightsAndPoints = np.insert(
      np.asarray(list(zip(pbWeight[order], arrayIn[order]))),
      0, [0.0, arrayIn[order[0]]], axis=0)
  weightsCDF = np.cumsum(sortedWeightsAndPoints[:, 0])
  # index of the first CDF entry at/above the percentile (shifted by the insertion above)
  lowIdx = utils.first(np.asarray(weightsCDF >= percent).nonzero())[0]
  # indices of CDF entries strictly above the percentile
  highIdxs = utils.first(np.asarray(weightsCDF > percent).nonzero())
  try:
    # percentile falls between two data points: return their midpoint
    result = 0.5 * (sortedWeightsAndPoints[lowIdx, 1] + sortedWeightsAndPoints[highIdxs[0], 1])
  except IndexError:
    # no point strictly above the percentile: the lower point is the answer
    result = sortedWeightsAndPoints[lowIdx, 1]
  return result
def localGenerateInput(self, model, myInput):
  """
    Function to select the next most informative point
    After this method is called, the self.inputInfo should be ready to be sent to the model
    @ In, model, model instance, an instance of a model
    @ In, myInput, list, a list of the original needed inputs for the model (e.g. list of files, etc.)
    @ Out, None
  """
  # overall weight starts at 1 and accumulates per-distribution weights below
  self.inputInfo['ProbabilityWeight'] = 1.0
  # take the next queued point and record it as submitted-but-not-yet-collected
  pt = self.neededPoints.pop()
  self.submittedNotCollected.append(pt)
  for v, varName in enumerate(self.sparseGrid.varNames):
    # compute the SampledVarsPb for 1-D distribution
    if self.variables2distributionsMapping[varName]['totDim'] == 1:
      # a comma-separated varName denotes several sampled keys sharing one value
      for key in varName.strip().split(','):
        self.values[key] = pt[v]
      self.inputInfo['SampledVarsPb'][varName] = self.distDict[varName].pdf(pt[v])
      self.inputInfo['ProbabilityWeight-' + varName] = self.inputInfo['SampledVarsPb'][varName]
    # compute the SampledVarsPb for N-D distribution
    # only the first ("reduced") dimension triggers this branch, so each
    # N-D distribution is processed exactly once
    elif self.variables2distributionsMapping[varName]['totDim'] > 1 and self.variables2distributionsMapping[varName]['reducedDim'] == 1:
      dist = self.variables2distributionsMapping[varName]['name']
      ndCoordinates = np.zeros(len(self.distributions2variablesMapping[dist]))
      positionList = self.distributions2variablesIndexList[dist]
      for varDict in self.distributions2variablesMapping[dist]:
        var = utils.first(varDict.keys())
        position = utils.first(varDict.values())
        location = -1
        # locate this variable in the sparse grid variable ordering
        for key in var.strip().split(','):
          if key in self.sparseGrid.varNames:
            location = self.sparseGrid.varNames.index(key)
            break
        if location > -1:
          ndCoordinates[positionList.index(position)] = pt[location]
        else:
          self.raiseAnError(IOError, 'The variables ' + var + ' listed in sparse grid collocation sampler, but not used in the ROM!')
        for key in var.strip().split(','):
          self.values[key] = pt[location]
      self.inputInfo['SampledVarsPb'][varName] = self.distDict[varName].pdf(ndCoordinates)
      self.inputInfo['ProbabilityWeight-' + dist] = self.inputInfo['SampledVarsPb'][varName]
      # fold this distribution's weight into the overall sample weight
      self.inputInfo['ProbabilityWeight'] *= self.inputInfo['ProbabilityWeight-' + dist]
  # point probability is the product of all sampled-variable pdfs
  self.inputInfo['PointProbability'] = reduce(mul, self.inputInfo['SampledVarsPb'].values())
  self.inputInfo['SamplerType'] = self.type
def localGenerateInput(self, model, myInput):
  """
    Function to select the next most informative point
    @ In, model, model instance, an instance of a model
    @ In, myInput, list, a list of the original needed inputs for the model (e.g. list of files, etc.)
    @ Out, None
  """
  try:
    # points are consumed in order; counter was already advanced by the caller
    pt = self.pointsToRun[self.counter - 1]
  except IndexError:
    self.raiseADebug('All sparse grids are complete! Moving on...')
    raise utils.NoMoreSamplesNeeded
  for v, varName in enumerate(self.features):
    # compute the SampledVarsPb for 1-D distribution
    if self.variables2distributionsMapping[varName]['totDim'] == 1:
      # a comma-separated varName denotes several sampled keys sharing one value
      for key in varName.strip().split(','):
        self.values[key] = pt[v]
      self.inputInfo['SampledVarsPb'][varName] = self.distDict[varName].pdf(pt[v])
      self.inputInfo['ProbabilityWeight-' + varName] = self.inputInfo['SampledVarsPb'][varName]
    # compute the SampledVarsPb for N-D distribution
    # only the first ("reduced") dimension triggers this branch, so each
    # N-D distribution is processed exactly once
    elif self.variables2distributionsMapping[varName]['totDim'] > 1 and self.variables2distributionsMapping[varName]['reducedDim'] == 1:
      dist = self.variables2distributionsMapping[varName]['name']
      ndCoordinates = np.zeros(len(self.distributions2variablesMapping[dist]))
      positionList = self.distributions2variablesIndexList[dist]
      for varDict in self.distributions2variablesMapping[dist]:
        var = utils.first(varDict.keys())
        position = utils.first(varDict.values())
        location = -1
        # locate this variable in the feature ordering
        for key in var.strip().split(','):
          if key in self.features:
            location = self.features.index(key)
            break
        if location > -1:
          ndCoordinates[positionList.index(position)] = pt[location]
        else:
          self.raiseAnError(IOError, 'The variables ' + var + ' listed in sobol sampler, but not used in the ROM!')
        for key in var.strip().split(','):
          self.values[key] = pt[location]
      self.inputInfo['SampledVarsPb'][varName] = self.distDict[varName].pdf(ndCoordinates)
      self.inputInfo['ProbabilityWeight-' + dist] = self.inputInfo['SampledVarsPb'][varName]
  # point probability is the product of all sampled-variable pdfs
  self.inputInfo['PointProbability'] = reduce(mul, self.inputInfo['SampledVarsPb'].values())
  # weight has no meaning for sobol
  self.inputInfo['ProbabilityWeight'] = np.atleast_1d(1.0)
  self.inputInfo['SamplerType'] = 'Sparse Grids for Sobol'
def localInitialize(self, solutionExport=None):
  """
    Will perform all initialization specific to this Sampler. For instance,
    creating an empty container to hold the identified surface points, error
    checking the optionally provided solution export and other preset values,
    and initializing the limit surface Post-Processor used by this sampler.
    @ In, solutionExport, DataObjects, optional, a PointSet to hold the solution (a list of limit surface points)
    @ Out, None
  """
  if self.detAdaptMode == 2:
    self.startAdaptive = True
  # we first initialize the LimitSurfaceSearch sampler
  LimitSurfaceSearch.localInitialize(self, solutionExport=solutionExport)
  if self.hybridDETstrategy is not None:
    # we are running an adaptive hybrid DET and not only an adaptive DET
    if self.hybridDETstrategy == 1:
      gridVector = self.limitSurfacePP.gridEntity.returnParameter("gridVectors")
      # construct an hybrid DET through an XML node
      distDict, xmlNode = {}, ET.fromstring('<InitNode> <HybridSampler type="Grid" name="none"/> </InitNode>')
      for varName, dist in self.distDict.items():
        if varName.replace('<distribution>', '') in self.epistemicVariables.keys():
          # found an epistemic
          # node tag depends on whether the variable is distribution-aliased
          varNode = ET.Element('Distribution' if varName.startswith('<distribution>') else 'variable', {'name': varName.replace('<distribution>', '')})
          varNode.append(ET.fromstring("<distribution>" + dist.name.strip() + "</distribution>"))
          distDict[dist.name.strip()] = self.distDict[varName]
          # custom grid built from the limit surface PP grid coordinates for this variable
          varNode.append(ET.fromstring('<grid construction="custom" type="value">' + ' '.join([str(elm) for elm in utils.first(gridVector.values())[varName.replace('<distribution>', '')]]) + '</grid>'))
          xmlNode.find("HybridSampler").append(varNode)
      #TODO, need to pass real paramInput
      self._localInputAndChecksHybrid(xmlNode, paramInput=None)
      for hybridsampler in self.hybridStrategyToApply.values():
        hybridsampler._generateDistributions(distDict, {})
  DynamicEventTree.localInitialize(self)
  if self.hybridDETstrategy == 2:
    self.actualHybridTree = utils.first(self.TreeInfo.keys())
  # allow an effectively unlimited number of runnable jobs
  self._endJobRunnable = sys.maxsize
def _printState(self, which, toDoSub, poly):
  """
    Debugging tool.  Prints status of adaptive steps.  Togglable in input by specifying logFile.
    @ In, which, string, the type of the next addition to make by the adaptive sampler: poly, or subset
    @ In, toDoSub, tuple(str), the next subset that will be resolved as part of the adaptive sampling
    @ In, poly, tuple(int), the polynomial within the next subset that will be added to resolve it
    @ Out, None
  """
  #print status, including error; next step to make; and existing, training, and expected values
  self.stateCounter += 1
  self.statesFile.writelines('==================== STEP %s ====================\n' % self.stateCounter)
  #write error, next adaptive move to make in this step
  self.statesFile.writelines('\n\nError: %1.9e\n' % self.error)
  self.statesFile.writelines('Next: %6s %8s %12s\n' % (which, ','.join(toDoSub), str(poly)))
  #write a summary of the state of each subset sampler: existing points, training points, yet-to-try points, and their impacts on each target
  for sub in self.useSet.keys():
    self.statesFile.writelines('-' * 50)
    self.statesFile.writelines('\nsubset %8s with impacts' % ','.join(sub))
    for t in self.targets:
      self.statesFile.writelines(' [ %4s:%1.6e ] ' % (t, self.subsetImpact[t][sub]))
    self.statesFile.writelines('\n')
    #existing polynomials
    self.statesFile.writelines('ESTABLISHED:\n')
    self.statesFile.writelines(' %12s' % 'polynomial')
    for t in self.targets:
      self.statesFile.writelines(' %12s' % t)
    self.statesFile.writelines('\n')
    # one row per polynomial coefficient index, one column per target
    for coeff in utils.first(self.romShell[sub].supervisedEngine.supervisedContainer[0].polyCoeffDict.values()).keys():
      self.statesFile.writelines(' %12s' % ','.join(str(c) for c in coeff))
      for t in self.targets:
        self.statesFile.writelines(' %1.6e' % self.romShell[sub].supervisedEngine.supervisedContainer[0].polyCoeffDict[t][coeff])
      self.statesFile.writelines('\n')
    #polynomials in training
    if any(sub == item[1] for item in self.inTraining):
      self.statesFile.writelines('TRAINING:\n')
      for item in self.inTraining:
        if sub == item[1]:
          self.statesFile.writelines(' %12s %12s\n' % (sub, item[2]))
    #polynomials on the fringe that aren't being trained
    self.statesFile.writelines('EXPECTED:\n')
    # NOTE(review): this loop rebinds the 'poly' parameter, and the impact value written
    # below uses 't' left over from the ESTABLISHED loop above (i.e. only the last target)
    # — looks unintended; confirm whether every target's expected impact should be printed.
    for poly in utils.first(self.samplers[sub].expImpact.values()).keys():
      self.statesFile.writelines(' %12s' % ','.join(str(c) for c in poly))
      self.statesFile.writelines(' %1.6e' % self.samplers[sub].expImpact[t][poly])
      self.statesFile.writelines('\n')
    self.statesFile.writelines('-' * 50 + '\n')
  #other subsets that haven't been started yet
  self.statesFile.writelines('EXPECTED SUBSETS\n')
  for sub, val in self.subsetExpImpact.items():
    self.statesFile.writelines(' %8s: %1.6e\n' % (','.join(sub), val))
  self.statesFile.writelines('\n==================== END STEP ====================\n')
def isRomConverged(self, outputDict):
  """
    This function will check the convergence of rom
    @ In, outputDict, dict, dictionary contains the metric information
      e.g. {targetName:{metricName:List of metric values}}, this dict is coming from results of cross validation
    @ Out, converged, bool, True if the rom is converged
  """
  converged = True
  # very temporary solution
  exploredTargets = []
  for cvKey, metricValues in outputDict.items():
    # decode target/metric names encoded in the cross-validation output key
    info = self.cvInstance.interface._returnCharacteristicsOfCvGivenOutputName(cvKey)
    if info['targetName'] in exploredTargets:
      self.raiseAnError(IOError, "Multiple metrics are used in cross validation '", self.cvInstance.name, "'. Currently, this can not be processed by the HybridModel '", self.name, "'!")
    exploredTargets.append(info['targetName'])
    # (removed an unused local that fetched the first metric name from
    # self.cvInstance.interface.metricsDict — it was never read)
    # NOTE(review): each iteration overwrites 'converged', so only the last target's
    # check decides the result — confirm whether all targets should be ANDed together.
    converged = self.checkErrors(info['metricType'], metricValues)
  return converged
def localInitialize(self):
  """
    Will perform all initialization specific to this Sampler.
    Reads the sample source (a CSV file or a DataObject), validates that every
    sampled variable exists in the source, and sets the sampling limit.
    @ In, None
    @ Out, None
  """
  # check the source
  if self.assemblerDict['Source'][0][0] == 'Files':
    self.readingFrom = 'File'
    csvFile = self.assemblerDict['Source'][0][3]
    csvFile.open(mode='r')
    headers = [x.replace("\n", "").strip() for x in csvFile.readline().split(",")]
    # BUGFIX: np.float was deprecated in NumPy 1.20 and removed in 1.24; it was
    # always an alias for the builtin float, which is used here instead.
    data = np.loadtxt(self.assemblerDict['Source'][0][3], dtype=float, delimiter=',', skiprows=1, ndmin=2)
    lenRlz = len(data)
    csvFile.close()
    for var in self.toBeSampled.keys():
      for subVar in var.split(','):
        subVar = subVar.strip()
        sourceName = self.nameInSource[subVar]
        if sourceName not in headers:
          self.raiseAnError(IOError, "variable " + sourceName + " not found in the file " + csvFile.getFilename())
        self.pointsToSample[subVar] = data[:, headers.index(sourceName)]
        subVarPb = 'ProbabilityWeight-'
        # per-variable weights default to 1 when the column is absent
        if subVarPb + sourceName in headers:
          self.infoFromCustom[subVarPb + subVar] = data[:, headers.index(subVarPb + sourceName)]
        else:
          self.infoFromCustom[subVarPb + subVar] = np.ones(lenRlz)
    # global probability columns also default to 1 when absent
    if 'PointProbability' in headers:
      self.infoFromCustom['PointProbability'] = data[:, headers.index('PointProbability')]
    else:
      self.infoFromCustom['PointProbability'] = np.ones(lenRlz)
    if 'ProbabilityWeight' in headers:
      self.infoFromCustom['ProbabilityWeight'] = data[:, headers.index('ProbabilityWeight')]
    else:
      self.infoFromCustom['ProbabilityWeight'] = np.ones(lenRlz)
    self.limit = len(utils.first(self.pointsToSample.values()))
  else:
    self.readingFrom = 'DataObject'
    dataObj = self.assemblerDict['Source'][0][3]
    lenRlz = len(dataObj)
    # NOTE(review): return value was bound to an unused local; the call is kept
    # in case asDataset() has required side effects on the data object — confirm.
    dataObj.asDataset()
    self.pointsToSample = dataObj.sliceByIndex(dataObj.sampleTag)
    for var in self.toBeSampled.keys():
      for subVar in var.split(','):
        subVar = subVar.strip()
        sourceName = self.nameInSource[subVar]
        if sourceName not in dataObj.getVars() + dataObj.getVars('indexes'):
          self.raiseAnError(IOError, "the variable " + sourceName + " not found in " + dataObj.type + " " + dataObj.name)
    self.limit = len(self.pointsToSample)
  # if "index" provided, limit sampling to those points
  if self.indexes is not None:
    self.limit = len(self.indexes)
    maxIndex = max(self.indexes)
    if maxIndex > len(self.pointsToSample) - 1:
      self.raiseAnError(IndexError, 'Requested index "{}" from custom sampler, but highest index sample is "{}"!'.format(maxIndex, len(self.pointsToSample) - 1))
  #TODO: add restart capability here!
  if self.restartData:
    self.raiseAnError(IOError, "restart capability not implemented for CustomSampler yet!")
def readFromROM(self):
  """
    Reads in required information from ROM and returns a sample supervisedLearning object.
    @ In, None
    @ Out, SVL, supervisedLearning object, SVL object
  """
  # the ROM instance is delivered through the assembler
  self.ROM = self.assemblerDict['ROM'][0][3]
  # any container entry carries the info we need; take the first
  SVL = utils.first(self.ROM.supervisedEngine.supervisedContainer)
  self.features = SVL.features
  self.sparseGridType = SVL.sparseGridType.lower()
  return SVL
def localStillReady(self, ready):  #, lastOutput= None
  """
    first perform some check to understand what it needs to be done possibly perform an early return
    ready is returned
    @ In, ready, bool, a boolean representing whether the caller is prepared for another input.
    @ Out, ready, bool, a boolean representing whether the caller is prepared for another input.
  """
  if self.counter == 0:
    return True
  # DET part is ready as long as the run queue has entries
  if len(self.RunQueue['queue']) != 0:
    detReady = True
  else:
    detReady = False
  # since the RunQueue is empty, let's check if there are still branches running => if not => start the adaptive search
  self._checkIfStartAdaptive()
  if self.startAdaptive:
    #if self._endJobRunnable != 1: self._endJobRunnable = 1
    # retrieve the endHistory branches
    completedHistNames, finishedHistNames = [], []
    hybridTrees = self.TreeInfo.values() if self.hybridDETstrategy in [1, None] else [self.TreeInfo[self.actualHybridTree]]
    for treer in hybridTrees:
      # this needs to be solved
      for ending in treer.iterProvidedFunction(self._checkCompleteHistory):
        completedHistNames.append(self.lastOutput.getParam(typeVar='inout', keyword='none', nodeId=ending.get('name'), serialize=False))
        finishedHistNames.append(utils.first(completedHistNames[-1].keys()))
    # assemble a dictionary
    if len(completedHistNames) > self.completedHistCnt:
      # sort the list of histories
      self.sortedListOfHists.extend(list(set(finishedHistNames) - set(self.sortedListOfHists)))
      completedHistNames = [completedHistNames[finishedHistNames.index(elem)] for elem in self.sortedListOfHists]
      if len(completedHistNames[-1].values()) > 0:
        lastOutDict = {'inputs': {}, 'outputs': {}}
        for histd in completedHistNames:
          # BUGFIX: dict views are not subscriptable in Python 3; the old
          # histd.values()[-1] raised TypeError — materialize the view first.
          histdict = list(histd.values())[-1]
          for key in histdict['inputs'].keys():
            if key not in lastOutDict['inputs'].keys():
              lastOutDict['inputs'][key] = np.atleast_1d(histdict['inputs'][key])
            else:
              lastOutDict['inputs'][key] = np.concatenate((np.atleast_1d(lastOutDict['inputs'][key]), np.atleast_1d(histdict['inputs'][key])))
          for key in histdict['outputs'].keys():
            if key not in lastOutDict['outputs'].keys():
              lastOutDict['outputs'][key] = np.atleast_1d(histdict['outputs'][key])
            else:
              lastOutDict['outputs'][key] = np.concatenate((np.atleast_1d(lastOutDict['outputs'][key]), np.atleast_1d(histdict['outputs'][key])))
      else:
        self.raiseAWarning('No Completed HistorySet! Not possible to start an adaptive search! Something went wrong!')
    if len(completedHistNames) > self.completedHistCnt:
      # temporarily swap in the assembled history dict so the limit surface
      # search sees the completed histories, then restore the real output
      actualLastOutput = self.lastOutput
      self.lastOutput = copy.deepcopy(lastOutDict)
      ready = LimitSurfaceSearch.localStillReady(self, ready)
      self.lastOutput = actualLastOutput
      self.completedHistCnt = len(completedHistNames)
      self.raiseAMessage("Completed full histories are " + str(self.completedHistCnt))
    else:
      ready = False
    self.adaptiveReady = ready
    if ready or detReady:
      return True
    else:
      return False
  return detReady
def _readdressEvaluateConstResponse(self, edict):
  """
    Method to re-address the evaluate base class method in order to avoid wasting time
    in case the training set has an unique response (e.g. if 10 points in the training set,
    and the 10 outcomes are all == to 1, this method returns one without the need of an evaluation)
    @ In, edict, dict, prediction request. Not used in this method (kept the consistency with evaluate method)
    @ Out, returnDict, dict, dictionary with the evaluation (in this case, the constant number)
  """
  # how many evaluations were requested (length of any input array)
  requestSize = len(utils.first(edict.values()))
  # each target simply returns its stored constant, repeated per request
  returnDict = {
      target: np.ones(requestSize) * self.myNumber[idx]
      for idx, target in enumerate(self.target)
  }
  return returnDict
def _computeSortedWeightsAndPoints(self, arrayIn, pbWeight, percent):
  """
    Method to compute the sorted weights and points
    @ In, arrayIn, list/numpy.array, the array of values from which the percentile needs to be estimated
    @ In, pbWeight, list/numpy.array, the reliability weights that correspond to the values in 'array'
    @ In, percent, float, the percentile that needs to be computed (between 0.01 and 1.0)
    @ Out, sortedWeightsAndPoints, list/numpy.array, with [:,0] as the value of the probability density
      function at the bin, normalized, and [:,1] is the coresonding edge of the probability density function.
    @ Out, indexL, index of the lower quantile
  """
  # sort the (weight, point) pairs by the point values
  order = np.argsort(np.asarray(list(zip(pbWeight, arrayIn)))[:, 1])
  sortedWeightsAndPoints = np.asarray(list(zip(pbWeight[order], arrayIn[order])))
  # cumulative weights form the empirical CDF
  weightsCDF = np.cumsum(sortedWeightsAndPoints[:, 0])
  # first entry whose CDF value reaches the requested percentile
  indexL = utils.first(np.asarray(weightsCDF >= percent).nonzero())[0]
  return sortedWeightsAndPoints, indexL
def collectOutput(self, finishedJob, output):
  """
    Function to place all of the computed data into the output object, i.e. Files
    @ In, finishedJob, object, JobHandler object that is in charge of running this postprocessor
    @ In, output, object, the object where we want to place our computed results
    @ Out, None
  """
  evaluation = finishedJob.getEvaluation()
  if isinstance(evaluation, Runners.Error):
    self.raiseAnError(RuntimeError, ' No available output to collect')
  outputDict = evaluation[1]
  if self.cvScore is not None:
    # scored results go in as a single realization
    output.addRealization(outputDict)
  else:
    # tag every entry with a cross-validation id before bulk loading
    numEntries = len(utils.first(outputDict.values()))
    outputDict[self.cvID] = np.atleast_1d(range(numEntries))
    output.load(outputDict, style='dict')
def run(self, inputIn):
  """
    This method executes the postprocessor action.
    Classifies each realization of the target data by evaluating the classifier
    functions and intersecting their matching rows; exactly one class must match.
    @ In, inputIn, list, list of DataObjects
    @ Out, outputDict, dict, dictionary of outputs
  """
  inputDict = self.inputToInternal(inputIn)
  targetDict = inputDict['target']
  classifierDict = inputDict['classifier']
  outputDict = {}
  outputDict.update(inputDict['target']['data'])
  outputType = targetDict['type']
  # number of realizations, taken from the size of any target input variable
  numRlz = utils.first(targetDict['input'].values()).size
  outputDict[self.label] = []
  for i in range(numRlz):
    # collect the i-th realization's input and output values in one flat dict
    tempTargDict = {}
    for param, vals in targetDict['input'].items():
      tempTargDict[param] = vals[i]
    for param, vals in targetDict['output'].items():
      tempTargDict[param] = vals[i]
    # (removed an unused local list 'tempClfList' here)
    labelIndex = None
    # intersect the classifier rows matched by every classifier function
    for key, values in classifierDict['input'].items():
      calcVal = self.funcDict[key].evaluate("evaluate", tempTargDict)
      inds, = np.where(np.asarray(values) == calcVal)
      if labelIndex is None:
        labelIndex = set(inds)
      else:
        labelIndex = labelIndex & set(inds)
    # exactly one class must remain, otherwise the realization is unclassifiable
    if len(labelIndex) != 1:
      self.raiseAnError(IOError, "The parameters", ",".join(tempTargDict.keys()), "with values", ",".join([str(el) for el in tempTargDict.values()]), "could not be put in any class!")
    label = classifierDict['output'][self.label][list(labelIndex)[0]]
    if outputType == 'PointSet':
      outputDict[self.label].append(label)
    else:
      # for history sets, repeat the label for every time step of this realization
      outputDict[self.label].append(np.asarray([label] * targetDict['historySizes'][i]))
  outputDict[self.label] = np.asarray(outputDict[self.label])
  outputDict = {'data': outputDict, 'dims': inputDict['target']['dims']}
  return outputDict
def _findHighestImpactIndex(self, returnValue=False):
  """
    Finds and returns the index with the highest average expected impact factor across all targets
    Can optionally return the value of the highest impact, as well.
    @ In, returnValue, bool, optional, returns the value of the index if True
    @ Out, point, tuple(int), polynomial index with greatest expected effect
  """
  best = None
  bestAvg = 0
  numTargets = len(self.targets)
  # candidate indices are the keys of any target's expected-impact dict
  for candidate in utils.first(self.expImpact.values()).keys():
    candidateAvg = sum(self.expImpact[t][candidate] for t in self.targets) / numTargets
    if candidateAvg > bestAvg:
      bestAvg = candidateAvg
      best = candidate
  self.raiseADebug('Highest impact point is', best, 'with expected average impact', bestAvg)
  return (best, bestAvg) if returnValue else best
def _printToLog(self):
  """
    Prints adaptive state of this sampler to the log file.
    @ In, None
    @ Out, None
  """
  self.logCounter += 1
  # column width for the polynomial-index field
  pl = 4 * len(self.features) + 1
  # FIX: use a context manager so the log handle is closed even if a write fails
  with open(self.logFile, 'a') as f:
    f.writelines('===================== STEP %i =====================\n' % self.logCounter)
    f.writelines('\nNumber of Runs: %i\n' % len(self.pointsNeededToMakeROM))
    f.writelines('Error: %1.9e\n' % self.error)
    f.writelines('Features: %s\n' % ','.join(self.features))
    # table of indices already in the index set, with actual impacts per target
    f.writelines('\nExisting indices:\n')
    f.writelines(' {:^{}}:'.format('poly', pl))
    for t in self.targets:
      f.writelines(' {:<16}'.format(t))
    f.writelines('\n')
    for idx in self.indexSet.points:
      f.writelines(' {:^{}}:'.format(idx, pl))
      for t in self.targets:
        f.writelines(' {:<9}'.format(self.actImpact[t][idx]))
      f.writelines('\n')
    # table of candidate indices, with expected impacts per target
    f.writelines('\nPredicted indices:\n')
    f.writelines(' {:^{}}:'.format('poly', pl))
    for t in self.targets:
      f.writelines(' {:<16}'.format(t))
    f.writelines('\n')
    for idx in utils.first(self.expImpact.values()).keys():
      f.writelines(' {:^{}}:'.format(idx, pl))
      for t in self.targets:
        f.writelines(' {:<9}'.format(self.expImpact[t][idx]))
      f.writelines('\n')
    f.writelines('===================== END STEP =====================\n')
def localGenerateInput(self, model, myInput):
  """
  Function to select the next most informative point for refining the limit surface search.
  After this method is called, the self.inputInfo should be ready to be sent to the model
  @ In, model, model instance, an instance of a model
  @ In, myInput, list, a list of the original needed inputs for the model (e.g. list of files, etc.)
  @ Out, None
  """
  self.inputInfo['distributionName'] = {}  #Used to determine which distribution to change if needed.
  self.inputInfo['distributionType'] = {}  #Used to determine which distribution type is used
  weight = 1.0
  # recastDict maps each CDF-gridded variable to the transform that converts its
  # grid coordinate (a CDF value) back into variable space
  recastDict = {}
  for i in range(len(self.axisName)):
    varName = self.axisName[i]
    if self.gridInfo[varName] == 'CDF':
      if self.distDict[varName].getDimensionality() == 1:
        # 1-D distribution: invert via the percent-point function
        recastDict[varName] = [self.distDict[varName].ppf]
      else:
        # N-D distribution: invert the marginal for this variable's dimension
        recastDict[varName] = [
            self.distDict[varName].inverseMarginalDistribution,
            [self.variables2distributionsMapping[varName]['dim'] - 1]
        ]
    elif self.gridInfo[varName] != 'value':
      self.raiseAnError(
          IOError, self.gridInfo[varName] +
          ' is not know as value keyword for type. Sampler: ' + self.name)
  # obtain the current grid point (and advance the iterator unless the grid
  # coordinate is driven externally)
  if self.externalgGridCoord:
    currentIndexes = self.gridEntity.returnIteratorIndexesFromIndex(self.gridCoordinate)
    coordinates = self.gridEntity.returnCoordinateFromIndex(self.gridCoordinate, True, recastDict)
  else:
    currentIndexes = self.gridEntity.returnIteratorIndexes()
    coordinates = self.gridEntity.returnPointAndAdvanceIterator(True, recastDict)
  # NOTE(review): '== None' should ideally be 'is None'
  if coordinates == None:
    self.raiseADebug('Grid finished with restart points! Moving on...')
    raise utils.NoMoreSamplesNeeded
  # neighboring grid coordinates are needed to build the probability-weight cells
  coordinatesPlusOne = self.gridEntity.returnShiftedCoordinate(currentIndexes, dict.fromkeys(self.axisName, 1))
  coordinatesMinusOne = self.gridEntity.returnShiftedCoordinate(currentIndexes, dict.fromkeys(self.axisName, -1))
  for i in range(len(self.axisName)):
    varName = self.axisName[i]
    # compute the SampledVarsPb for 1-D distribution
    if ("<distribution>" in varName) or (self.variables2distributionsMapping[varName]['totDim'] == 1):
      for key in varName.strip().split(','):
        self.inputInfo['distributionName'][key] = self.toBeSampled[varName]
        self.inputInfo['distributionType'][key] = self.distDict[varName].type
        self.values[key] = coordinates[varName]
        self.inputInfo['SampledVarsPb'][key] = self.distDict[varName].pdf(self.values[key])
    # compute the SampledVarsPb for N-D distribution
    else:
      if self.variables2distributionsMapping[varName]['reducedDim'] == 1:
        # to avoid double count;
        distName = self.variables2distributionsMapping[varName]['name']
        ndCoordinate = [0] * len(self.distributions2variablesMapping[distName])
        positionList = self.distributions2variablesIndexList[distName]
        for var in self.distributions2variablesMapping[distName]:
          variable = utils.first(var.keys())
          position = utils.first(var.values())
          ndCoordinate[positionList.index(position)] = float(coordinates[variable.strip()])
          for key in variable.strip().split(','):
            self.inputInfo['distributionName'][key] = self.toBeSampled[variable]
            self.inputInfo['distributionType'][key] = self.distDict[variable].type
            self.values[key] = coordinates[variable]
        # Based on the discussion with Diego, we will use the following to compute SampledVarsPb.
        self.inputInfo['SampledVarsPb'][varName] = self.distDict[varName].pdf(ndCoordinate)
    # Compute the ProbabilityWeight
    # NOTE(review): the 'key' used inside this branch is left over from the
    # SampledVarsPb loops above; for comma-separated variable groups this means
    # the last key of the last processed group — verify this is intentional.
    if ("<distribution>" in varName) or (self.variables2distributionsMapping[varName]['totDim'] == 1):
      if self.distDict[varName].getDisttype() == 'Discrete':
        # discrete distribution: the weight is directly the pmf at the point
        weight *= self.distDict[varName].pdf(coordinates[varName])
      else:
        if self.gridInfo[varName] == 'CDF':
          # cell weight = CDF midpoint above - CDF midpoint below;
          # sys.maxsize marks a missing neighbor (edge of the grid)
          if coordinatesPlusOne[varName] != sys.maxsize and coordinatesMinusOne[varName] != -sys.maxsize:
            midPlusCDF = (coordinatesPlusOne[varName] + self.distDict[varName].cdf(self.values[key])) / 2.0
            midMinusCDF = (coordinatesMinusOne[varName] + self.distDict[varName].cdf(self.values[key])) / 2.0
            self.inputInfo['ProbabilityWeight-' + varName.replace(",", "-")] = midPlusCDF - midMinusCDF
            weight *= midPlusCDF - midMinusCDF
          if coordinatesMinusOne[varName] == -sys.maxsize:
            # lower edge: the cell extends down to CDF = 0
            midPlusCDF = (coordinatesPlusOne[varName] + self.distDict[varName].cdf(self.values[key])) / 2.0
            midMinusCDF = 0.0
            self.inputInfo['ProbabilityWeight-' + varName.replace(",", "-")] = midPlusCDF - midMinusCDF
            weight *= midPlusCDF - midMinusCDF
          if coordinatesPlusOne[varName] == sys.maxsize:
            # upper edge: the cell extends up to CDF = 1
            midPlusCDF = 1.0
            midMinusCDF = (coordinatesMinusOne[varName] + self.distDict[varName].cdf(self.values[key])) / 2.0
            self.inputInfo['ProbabilityWeight-' + varName.replace(",", "-")] = midPlusCDF - midMinusCDF
            weight *= midPlusCDF - midMinusCDF
        else:  # Value
          # value-space grid: weight = CDF difference between value midpoints
          if coordinatesPlusOne[varName] != sys.maxsize and coordinatesMinusOne[varName] != -sys.maxsize:
            midPlusValue = (self.values[key] + coordinatesPlusOne[varName]) / 2.0
            midMinusValue = (self.values[key] + coordinatesMinusOne[varName]) / 2.0
            weight *= self.distDict[varName].cdf(midPlusValue) - self.distDict[varName].cdf(midMinusValue)
            self.inputInfo['ProbabilityWeight-' + varName.replace(",", "-")] = self.distDict[varName].cdf(midPlusValue) - self.distDict[varName].cdf(midMinusValue)
          if coordinatesMinusOne[varName] == -sys.maxsize:
            midPlusValue = (self.values[key] + coordinatesPlusOne[varName]) / 2.0
            self.inputInfo['ProbabilityWeight-' + varName.replace(",", "-")] = self.distDict[varName].cdf(midPlusValue) - 0.0
            weight *= self.distDict[varName].cdf(midPlusValue) - 0.0
          if coordinatesPlusOne[varName] == sys.maxsize:
            midMinusValue = (self.values[key] + coordinatesMinusOne[varName]) / 2.0
            self.inputInfo['ProbabilityWeight-' + varName.replace(",", "-")] = 1.0 - self.distDict[varName].cdf(midMinusValue)
            weight *= 1.0 - self.distDict[varName].cdf(midMinusValue)
    # ND variable
    else:
      if self.variables2distributionsMapping[varName]['reducedDim'] == 1:
        # to avoid double count of weight for ND distribution; I need to count only one variable instaed of N
        distName = self.variables2distributionsMapping[varName]['name']
        # cell center (ndCoordinate) and cell widths (dxs) per dimension,
        # later integrated with cellIntegral to get the cell's probability mass
        ndCoordinate = np.zeros(len(self.distributions2variablesMapping[distName]))
        dxs = np.zeros(len(self.distributions2variablesMapping[distName]))
        positionList = self.distributions2variablesIndexList[distName]
        for var in self.distributions2variablesMapping[distName]:
          variable = utils.first(var.keys()).strip()
          position = utils.first(var.values())
          if self.gridInfo[variable] == 'CDF':
            # CDF grid: map neighbor CDF coordinates back to variable space
            # through the inverse marginal before sizing the cell
            if coordinatesPlusOne[variable] != sys.maxsize and coordinatesMinusOne[variable] != -sys.maxsize:
              up = self.distDict[variable].inverseMarginalDistribution(coordinatesPlusOne[variable], self.variables2distributionsMapping[variable]['dim'] - 1)
              down = self.distDict[variable].inverseMarginalDistribution(coordinatesMinusOne[variable], self.variables2distributionsMapping[variable]['dim'] - 1)
              dxs[positionList.index(position)] = (up - down) / 2.0
              ndCoordinate[positionList.index(position)] = coordinates[variable] - (coordinates[variable] - down) / 2.0 + dxs[positionList.index(position)] / 2.0
            if coordinatesMinusOne[variable] == -sys.maxsize:
              # lower edge: cell bounded below by the distribution's lower bound
              up = self.distDict[variable].inverseMarginalDistribution(coordinatesPlusOne[variable], self.variables2distributionsMapping[variable]['dim'] - 1)
              dxs[positionList.index(position)] = (coordinates[variable.strip()] + up) / 2.0 - self.distDict[varName].returnLowerBound(positionList.index(position))
              ndCoordinate[positionList.index(position)] = ((coordinates[variable.strip()] + up) / 2.0 + self.distDict[varName].returnLowerBound(positionList.index(position))) / 2.0
            if coordinatesPlusOne[variable] == sys.maxsize:
              # upper edge: cell bounded above by the distribution's upper bound
              down = self.distDict[variable].inverseMarginalDistribution(coordinatesMinusOne[variable], self.variables2distributionsMapping[variable]['dim'] - 1)
              dxs[positionList.index(position)] = self.distDict[varName].returnUpperBound(positionList.index(position)) - (coordinates[variable.strip()] + down) / 2.0
              ndCoordinate[positionList.index(position)] = (self.distDict[varName].returnUpperBound(positionList.index(position)) + (coordinates[variable.strip()] + down) / 2.0) / 2.0
          else:
            # value grid: neighbors are already in variable space
            if coordinatesPlusOne[variable] != sys.maxsize and coordinatesMinusOne[variable] != -sys.maxsize:
              dxs[positionList.index(position)] = (coordinatesPlusOne[variable] - coordinatesMinusOne[variable]) / 2.0
              ndCoordinate[positionList.index(position)] = coordinates[variable.strip()] - (coordinates[variable.strip()] - coordinatesMinusOne[variable]) / 2.0 + dxs[positionList.index(position)] / 2.0
            if coordinatesMinusOne[variable] == -sys.maxsize:
              dxs[positionList.index(position)] = (coordinates[variable.strip()] + coordinatesPlusOne[variable]) / 2.0 - self.distDict[varName].returnLowerBound(positionList.index(position))
              ndCoordinate[positionList.index(position)] = ((coordinates[variable.strip()] + coordinatesPlusOne[variable]) / 2.0 + self.distDict[varName].returnLowerBound(positionList.index(position))) / 2.0
            if coordinatesPlusOne[variable] == sys.maxsize:
              dxs[positionList.index(position)] = self.distDict[varName].returnUpperBound(positionList.index(position)) - (coordinates[variable.strip()] + coordinatesMinusOne[variable]) / 2.0
              ndCoordinate[positionList.index(position)] = (self.distDict[varName].returnUpperBound(positionList.index(position)) + (coordinates[variable.strip()] + coordinatesMinusOne[variable]) / 2.0) / 2.0
        # integrate the pdf over the cell to get this distribution's weight contribution
        self.inputInfo['ProbabilityWeight-' + varName.replace(",", "!")] = self.distDict[varName].cellIntegral(ndCoordinate, dxs)
        weight *= self.distDict[varName].cellIntegral(ndCoordinate, dxs)
  # point probability = product of all per-variable pdf values
  self.inputInfo['PointProbability'] = reduce(mul, self.inputInfo['SampledVarsPb'].values())
  self.inputInfo['ProbabilityWeight'] = copy.deepcopy(weight)
  self.inputInfo['SamplerType'] = 'Grid'
def run(self, inputDic):
  """
  Builds a "typical" history from a HistorySet by reshaping the provided
  histories, splitting them into subsequences, and picking, per subsequence,
  the history whose empirical CDF best matches the overall CDF.
  @ In, inputDic, list, list of dictionaries which contains the data inside the input DataObjects
  @ Out, outputDic, dict, dictionary which contains the data to be collected by output DataObject
  """
  if len(inputDic) > 1:
    self.raiseAnError(IOError, self.__class__.__name__ + ' Interfaced Post-Processor ' + str(self.name) + ' accepts only one dataObject')
  #get actual data
  inputDict = inputDic[0]['data']
  #identify features
  self.features = inputDic[0]['outVars']
  #don't keep the pivot parameter in the feature space
  if self.pivotParameter in self.features:
    self.features.remove(self.pivotParameter)
  #if output length (size of desired output history) not set, set it now
  if self.outputLen is None:
    self.outputLen = np.asarray(inputDict['output'][utils.first(inputDict['output'].keys())][self.pivotParameter])[-1]
  ## Check if data is synchronized
  referenceHistory = 0
  referenceTimeAxis = inputDict[self.pivotParameter][referenceHistory]
  for hist in range(inputDic[0]['numberRealizations']):
    if str(inputDict[self.pivotParameter][hist]) != str(referenceTimeAxis):
      errorMessage = '{} Interfaced Post-Processor "{}": one or more histories in the historySet have different time scales (e.g., reference points: {} and {})'.format(self.__class__.__name__, self.name, referenceHistory, hist)
      self.raiseAnError(IOError, errorMessage)
  # task: reshape the data into histories with the size of the output I'm looking for
  #data dictionaries have form {historyNumber:{VarName:[data], VarName:[data]}}
  reshapedData = {}
  newHistoryCounter = 0  #new history tracking labels
  for historyNumber in range(inputDic[0]['numberRealizations']):
    #array of the pivot values provided in the history
    pivotValues = np.asarray(inputDict[self.pivotParameter][historyNumber])
    #if the desired output pivot value length is (equal to or) longer than the provided history ...
    # -> (i.e. I have a year and I want output of a year)
    if self.outputLen >= pivotValues[-1]:
      #don't change the shape of this history; it's fine as is
      reshapedData[newHistoryCounter] = self.retrieveHistory(inputDict, historyNumber)
      newHistoryCounter += 1
    #if the provided history is longer than the requested output period
    # -> (i.e., I have a year of data and I only want output of 1 year)
    else:
      #reshape the history into multiple histories to use
      startPivot = 0
      endPivot = self.outputLen
      # until you find the last observed pivot point...
      while endPivot <= pivotValues[-1]:
        #create a storage place for each new usable history
        reshapedData[newHistoryCounter] = {}
        # acceptable is if the pivot value is greater than start and less than end
        extractCondition = np.logical_and(pivotValues >= startPivot, pivotValues <= endPivot)
        # extract out the acceptable parts from the pivotValues, and reset the base pivot point to 0
        reshapedData[newHistoryCounter][self.pivotParameter] = np.extract(extractCondition, pivotValues) - startPivot
        # for each feature...
        for feature in self.features:
          # extract applicable information from the feature set
          reshapedData[newHistoryCounter][feature] = np.extract(extractCondition, inputDict[feature][historyNumber])
        #increment history counter
        newHistoryCounter += 1
        #update new start/end points for grabbing the next history
        startPivot = endPivot
        endPivot += self.outputLen
  inputDict['output'] = reshapedData
  self.numHistory = len(inputDict['output'].keys())  #should be same as newHistoryCounter - 1, if that's faster
  #update the set of pivot parameter values to match the first of the reshaped histories
  self.pivotValues = np.asarray(inputDict['output'][utils.first(inputDict['output'].keys())][self.pivotParameter])
  # task: split the history into multiple subsequences so that the typical history can be constructed
  # -> i.e., split the year history into multiple months, so we get a typical January, February, ..., hence a typical year
  # start by identifying the subsequences within the histories
  self.subsequence = []  #list of start/stop pivot values for the subsequences
  startLocation = 0  #tracks the point in the history being evaluated
  n = 0  #counts the number of the subsequence
  # in this loop we collect the similar (in time) subsequences in each history
  while True:
    subsequenceLength = self.subseqLen[n % len(self.subseqLen)]
    # if the history is longer than the subsequence we need, take the whole subsequence
    if startLocation + subsequenceLength < self.pivotValues[-1]:
      self.subsequence.append([startLocation, startLocation + subsequenceLength])
    # otherwise, take only as much as the history has, and exit
    else:
      self.subsequence.append([startLocation, self.pivotValues[-1]])
      break  # TODO this could be made "while startLocation + subsequenceLength < self.pivotValues[-1]
    # iterate forward
    startLocation += subsequenceLength
    n += 1
  numParallelSubsequences = len(self.subsequence)
  #now that the subsequences are identified, collect the data
  # for the record, defaultdict is a dict that auto-populates using the constructer given if an element isn't present
  subseqData = defaultdict(dict)
  # eventually {'all':{feature:[[parallel output data]], feature:[[parallel output data]]},
  #             subseqIndex:{pivotParam:pivotValues[-1]},
  #                          feature:[[parallel data]]}
  # 'all' means all the feature data is included,
  # while the subseqIndex dictionaries only contain the relevant subsequence data (i.e., the monthly data)
  # stack the similar histories in numpy arrays for full period (for example, by year)
  for feature in self.features:
    subseqData['all'][feature] = np.concatenate(list(inputDict['output'][h][feature] for h in inputDict['output'].keys()))
  # gather feature data by subsequence (for example, by month)
  for index in range(numParallelSubsequences):
    extractCondition = np.logical_and(self.pivotValues >= self.subsequence[index][0], self.pivotValues < self.subsequence[index][1])
    subseqData[index][self.pivotParameter] = np.extract(extractCondition, self.pivotValues)
    #get the pivot parameter entries as well, but only do it once, at the end
    if self.pivotValues[-1] == self.subsequence[index][1]:
      subseqData[index][self.pivotParameter] = np.concatenate((subseqData[index][self.pivotParameter], np.asarray([self.pivotValues[-1]])))
    #get the subsequence data for each feature, for each history
    for feature in self.features:
      subseqData[index][feature] = np.zeros(shape=(self.numHistory, len(subseqData[index][self.pivotParameter])))
      for h, historyNumber in enumerate(inputDict['output'].keys()):
        if self.pivotValues[-1] == self.subsequence[index][1]:
          #TODO this is doing the right action, but it's strange that we need to add one extra element.
          #     Maybe this should be fixed where we set the self.subsequence[index][1] for the last index, instead of patched here
          subseqData[index][feature][h, 0:-1] = np.extract(extractCondition, inputDict['output'][historyNumber][feature])
          subseqData[index][feature][h, -1] = inputDict['output'][historyNumber][feature][-1]
        else:
          subseqData[index][feature][h, :] = np.extract(extractCondition, inputDict['output'][historyNumber][feature])
  # task: compare CDFs to find the nearest match to the collective time's standard CDF (see the paper ref'd in the manual)
  # start by building the CDFs in the same structure as subseqData
  # for the record, defaultdict is a dict that auto-populates using the constructer given if an element isn't present
  cdfData = defaultdict(dict)
  # eventually {'all':{feature:[monotonically increasing floats], feature:[monotonically increasing floats]},
  #             subseqIndex:{pivotParam:pivotValues[-1]},
  #                          feature:[monotonically increasing floats]}
  # TODO there surely is a faster way to do this than triple-for-loops
  for feature in self.features:
    #construct reasonable bins for feature
    numBins, binEdges = mathUtils.numBinsDraconis(subseqData['all'][feature])
    #get the empirical CDF by bin for entire history (e.g., full year or even multiple years)
    cdfData['all'][feature] = self.__computeECDF(subseqData['all'][feature], binEdges)
    #get the empirical CDF by bin for subsequence (e.g., for a month)
    for index in range(numParallelSubsequences):
      cdfData[index][feature] = np.zeros(shape=(self.numHistory, numBins))
      for h in range(self.numHistory):
        cdfData[index][feature][h, :] = self.__computeECDF(subseqData[index][feature][h, :], binEdges)
  # now determine which subsequences are the most typical, using the CDF
  # find the smallestDeltaCDF and its index so the typical data can be set
  # first, find and store them by history
  typicalDataHistories = {}
  for index in range(numParallelSubsequences):
    typicalDataHistories[index] = {}
    typicalDataHistories[index][self.pivotParameter] = subseqData[index][self.pivotParameter]
    smallestDeltaCDF = np.inf
    smallestDeltaIndex = numParallelSubsequences + 1  #initialized as bogus index to preserve errors
    for h in range(self.numHistory):  # for h, historyNumber in enumerate(inputDict['output'].keys()):
      # total CDF distance of this history's subsequence from the overall CDF, summed over features
      delta = sum(self.__computeDist(cdfData['all'][feature], cdfData[index][feature][h, :]) for feature in self.features)
      if delta < smallestDeltaCDF:
        smallestDeltaCDF = delta
        smallestDeltaIndex = h
    for feature in self.features:
      typicalDataHistories[index][feature] = subseqData[index][feature][smallestDeltaIndex, :]
  # now collapse the data into the typical history
  typicalData = {}
  typicalData[self.pivotParameter] = np.concatenate(list(typicalDataHistories[index][self.pivotParameter] for index in range(numParallelSubsequences)))
  for feature in self.features:
    typicalData[feature] = np.concatenate(list(typicalDataHistories[index][feature] for index in range(numParallelSubsequences)))
  # sanity check, should probably be skipped for efficiency, as it looks like a debugging tool
  # preserved for now in case it was important for an undiscovered reason
  # for t in range(1,len(typicalData[self.pivotParameter])):
  #   if typicalData[self.pivotParameter][t] < typicalData[self.pivotParameter][t-1]:
  #     self.raiseAnError(RuntimeError,'Something went wrong with the TypicalHistorySet!  Expected calculated data is missing.')
  # task: collect data as expected by RAVEN
  outputDict = {'data': {}}
  # typical history
  for var in typicalData.keys():
    outputDict['data'][var] = np.zeros(1, dtype=object)
    outputDict['data'][var][0] = typicalData[var]
  # preserve input data
  for var in inputDic[0]['inpVars']:
    outputDict['data'][var] = np.zeros(1, dtype=object)
    outputDict['data'][var][0] = inputDict[var][0]
  outputDict['dims'] = {}
  for var in self.features:
    outputDict['dims'][var] = [self.pivotParameter]
  return outputDict
def localGenerateInput(self, model, myInput):
  """
  Function to select the next most informative point for refining the limit surface search.
  After this method is called, the self.inputInfo should be ready to be sent to the model
  @ In, model, model instance, an instance of a model
  @ In, myInput, list, a list of the original needed inputs for the model (e.g. list of files, etc.)
  @ Out, None
  """
  # create values dictionary
  weight = 1.0
  for key in self.distDict:
    # check if the key is a comma separated list of strings
    # in this case, the user wants to sample the comma separated variables with the same sampled value => link the value to all comma separated variables
    dim = self.variables2distributionsMapping[key]['dim']
    totDim = self.variables2distributionsMapping[key]['totDim']
    dist = self.variables2distributionsMapping[key]['name']
    reducedDim = self.variables2distributionsMapping[key]['reducedDim']
    # NOTE(review): weight is reset for every key, so the final 'weight' only
    # reflects the last distribution processed — verify this is intentional
    weight = 1.0
    if totDim == 1:
      # 1-D distribution: sample once and assign to all linked variables
      for var in self.distributions2variablesMapping[dist]:
        varID = utils.first(var.keys())
        if self.samplingType == 'uniform':
          distData = self.distDict[key].getCrowDistDict()
          if ('xMin' not in distData.keys()) or ('xMax' not in distData.keys()):
            self.raiseAnError(
                IOError,
                "In the Monte-Carlo sampler a uniform sampling type has been chosen; however, one or more distributions have not specified either the lowerBound or the upperBound"
            )
          lower = distData['xMin']
          upper = distData['xMax']
          # draw uniformly on [lower, upper] and weight by the local density
          rvsnum = lower + (upper - lower) * randomUtils.random()
          epsilon = (upper - lower) / self.limit
          midPlusCDF = self.distDict[key].cdf(rvsnum + epsilon)
          midMinusCDF = self.distDict[key].cdf(rvsnum - epsilon)
          weight *= midPlusCDF - midMinusCDF
        else:
          # plain Monte Carlo: draw from the distribution itself
          rvsnum = self.distDict[key].rvs()
        self.inputInfo['SampledVarsPb'][key] = self.distDict[key].pdf(rvsnum)
        for kkey in varID.strip().split(','):
          self.values[kkey] = np.atleast_1d(rvsnum)[0]
        self.inputInfo['ProbabilityWeight-' + varID] = 1.
    elif totDim > 1:
      if reducedDim == 1:
        # N-D distribution: sample the full coordinate once (reducedDim == 1 avoids double-sampling)
        if self.samplingType is None:
          rvsnum = self.distDict[key].rvs()
          coordinate = np.atleast_1d(rvsnum).tolist()
        else:
          coordinate = np.zeros(totDim)
          for i in range(totDim):
            lower = self.distDict[key].returnLowerBound(i)
            upper = self.distDict[key].returnUpperBound(i)
            coordinate[i] = lower + (upper - lower) * randomUtils.random()
        if reducedDim > len(coordinate):
          self.raiseAnError(
              IOError,
              "The dimension defined for variables drew from the multivariate normal distribution is exceeded by the dimension used in Distribution (MultivariateNormal) "
          )
        probabilityValue = self.distDict[key].pdf(coordinate)
        self.inputInfo['SampledVarsPb'][key] = probabilityValue
        for var in self.distributions2variablesMapping[dist]:
          varID = utils.first(var.keys())
          varDim = var[varID]
          for kkey in varID.strip().split(','):
            # BUGFIX: read from 'coordinate' rather than 'rvsnum'; when
            # samplingType is set, 'rvsnum' is not assigned in this branch
            # (stale or undefined), while 'coordinate' always holds the
            # sampled N-D point. For samplingType None the two are identical.
            self.values[kkey] = np.atleast_1d(coordinate)[varDim - 1]
        self.inputInfo['ProbabilityWeight-' + dist] = 1.
    else:
      self.raiseAnError(IOError, "Total dimension for given distribution should be >= 1")
  # point probability = product of all per-variable pdf values (1.0 when nothing sampled)
  if len(self.inputInfo['SampledVarsPb'].keys()) > 0:
    self.inputInfo['PointProbability'] = reduce(mul, self.inputInfo['SampledVarsPb'].values())
  else:
    self.inputInfo['PointProbability'] = 1.0
  if self.samplingType == 'uniform':
    self.inputInfo['ProbabilityWeight'] = weight
  else:
    self.inputInfo['ProbabilityWeight'] = 1.0  #MC weight is 1/N => weight is one
  self.inputInfo['SamplerType'] = 'MonteCarlo'
def localGenerateInput(self, model, oldInput):
  """
  Function to select the next most informative point for refining the limit surface search.
  After this method is called, the self.inputInfo should be ready to be sent to the model
  @ In, model, model instance, an instance of a model
  @ In, oldInput, list, a list of the original needed inputs for the model (e.g. list of files, etc.)
  @ Out, None
  """
  #note: pointsNeeded is the collection of points needed by sampler,
  #      while neededPoints is just the reference point that needs running
  #if there's a point that THIS sampler needs, prioritize it
  if len(self.neededPoints) > 0:
    pt = self.neededPoints.pop()
  #otherwise, take from the highest-impact sampler's needed points
  else:
    #pointsNeeded is in order from least to most impactful, so list reverse of keys.
    # BUGFIX: dict.keys() returns a view in Python 3 and has no reverse();
    # materialize it as a list first.
    subsets = list(self.pointsNeeded.keys())
    subsets.reverse()
    #now they're in order of impact.  Look for the next point to run.
    found = False
    for sub in subsets:
      for p in self.pointsNeeded[sub]:
        pt = self._expandCutPoint(sub, p)
        if pt not in self.submittedNotCollected:
          self.submittedNotCollected.append(pt)
          found = True
          break
      if found:
        break
    if not found:
      #this should not occur, but is a good sign something went wrong in developing.
      self.raiseAnError(RuntimeError, 'No point was found to generate!  This should not be possible...')
  #add the number of necessary distinct points to a set (so no duplicates).
  self.distinctPoints.add(pt)
  for v, varName in enumerate(self.features):
    # compute the SampledVarsPb for 1-D distribution
    if self.variables2distributionsMapping[varName]['totDim'] == 1:
      for key in varName.strip().split(','):
        self.values[key] = pt[v]
      self.inputInfo['SampledVarsPb'][varName] = self.distDict[varName].pdf(pt[v])
      self.inputInfo['ProbabilityWeight-' + varName.replace(",", "-")] = self.inputInfo['SampledVarsPb'][varName]
    # compute the SampledVarsPb for N-D distribution
    elif self.variables2distributionsMapping[varName]['totDim'] > 1 and self.variables2distributionsMapping[varName]['reducedDim'] == 1:
      dist = self.variables2distributionsMapping[varName]['name']
      ndCoordinates = np.zeros(len(self.distributions2variablesMapping[dist]))
      positionList = self.distributions2variablesIndexList[dist]
      for varDict in self.distributions2variablesMapping[dist]:
        var = utils.first(varDict.keys())
        position = utils.first(varDict.values())
        # locate this variable's coordinate in the sampled point
        location = -1
        for key in var.strip().split(','):
          if key in self.features:
            location = self.features.index(key)
            break
        if location > -1:
          ndCoordinates[positionList.index(position)] = pt[location]
        else:
          self.raiseAnError(IOError, 'The variables ' + var + ' listed in adaptive sobol sampler, but not used in the ROM!')
        for key in var.strip().split(','):
          self.values[key] = pt[location]
      self.inputInfo['SampledVarsPb'][varName] = self.distDict[varName].pdf(ndCoordinates)
      self.inputInfo['ProbabilityWeight-' + varName.replace(",", "!")] = self.inputInfo['SampledVarsPb'][varName]
  self.inputInfo['PointProbability'] = reduce(mul, self.inputInfo['SampledVarsPb'].values())
  self.inputInfo['SamplerType'] = 'Adaptive Sparse Grids for Sobol'
def run(self, inputIn):
  """
  This method executes the postprocessor action. In this case it performs
  the action defined in the external pp
  @ In, inputIn, dict, dictionary of data to process
  @ Out, outputDict, dict, Dictionary containing the post-processed results
  """
  inputDict = self.inputToInternal(inputIn)
  outputDict = {}
  ## This will map the name to its appropriate interface and method
  ## in the case of a function being defined in two separate files, we
  ## qualify the output by appending the name of the interface from which it
  ## originates
  methodMap = {}
  ## First check all the requested methods are available and if there are
  ## duplicates then qualify their names for the user
  for method in self.methodsToRun:
    matchingInterfaces = []
    for interface in self.externalInterfaces:
      if method in interface.availableMethods():
        matchingInterfaces.append(interface)
    if len(matchingInterfaces) == 0:
      self.raiseAWarning(method + ' not found. I will skip it.')
    #elif len(matchingInterfaces) == 1:
    #  methodMap[method] = (matchingInterfaces[0], method)
    else:
      # qualify each method name with its interface to disambiguate duplicates
      for interface in matchingInterfaces:
        methodName = interface.name + '_' + method
        methodMap[methodName] = (interface, method)
  ## Evaluate the method and add it to the outputDict, also if the method
  ## adjusts the input data, then you should update it as well.
  warningMessages = []
  for methodName, (interface, method) in methodMap.items():
    # The deep copy is needed since the interface postprocesor will change the values of inputDict
    tempInputDict = copy.deepcopy(inputDict)
    outputDict[methodName] = np.atleast_1d(copy.copy(interface.evaluate(method, tempInputDict)))
    if outputDict[methodName] is None:
      self.raiseAnError(Exception, "the method " + methodName + " has not produced any result. It needs to return a result!")
    # if the external function mutated variables on the interface, harvest them
    for target in tempInputDict.keys():
      if hasattr(interface, target):
        #if target not in outputDict.keys():
        if target not in methodMap.keys():
          attributeInSelf = getattr(interface, target)
          # NOTE(review): the "modified" test below treats a shape change, or an
          # element-wise difference that is nonzero EVERYWHERE (.all()), as a
          # modification — a partial modification would be missed; verify intent.
          if (np.atleast_1d(attributeInSelf)).shape != (np.atleast_1d(inputDict[target])).shape or (np.atleast_1d(attributeInSelf) - np.atleast_1d(inputDict[target])).all():
            if target in outputDict.keys():
              self.raiseAWarning("In Post-Processor " + self.name + " the modified variable " + target +
                                 " has the same name of a one already modified through another Function method." +
                                 " This method overwrites the input DataObject variable value")
            outputDict[target] = np.atleast_1d(attributeInSelf)
        else:
          warningMessages.append("In Post-Processor " + self.name + " the method " + method +
                                 " has the same name of a variable contained in the input DataObject." +
                                 " This method overwrites the input DataObject variable value")
  # deduplicate warnings before emitting them
  for msg in list(set(warningMessages)):
    self.raiseAWarning(msg)
  # TODO: We assume the structure of input to the external pp is the same as the struture of output to this external pp
  # An interface pp should be used if the user wants to merge two data objects, or change the structures of input data
  # objects.
  numRlz = len(utils.first(outputDict.values()))
  # all returned arrays must agree on the number of realizations
  for val in outputDict.values():
    if len(val) != numRlz:
      self.raiseAnError(IOError, "The return results from the external functions have different number of realizations!" +
                        " This postpocessor ", self.name, " requests all the returned values should have the same number of realizations.")
  # pass through any untouched input variables whose length matches
  for target in inputDict.keys():
    if target not in outputDict.keys():
      if len(inputDict[target]) != numRlz:
        self.raiseAWarning("Parameter ", target, " is available in the provided input DataObjects," +
                           " but it has different length from the returned values from the external functions." +
                           " Thus this parameter will not be accessible by the output DataObjects!")
      else:
        outputDict[target] = np.atleast_1d(inputDict[target])
  return outputDict
def _checkClosestBranch(self):
    """
    Function that checks the closest branch already evaluated
    @ In, None
    @ Out, returnTuple, tuple, closest branch info:
      - if self.hybridDETstrategy and branch found -> returnTuple = (valBranch,cdfValues,treer)
      - if self.hybridDETstrategy and branch not found -> returnTuple = (None,cdfValues,treer)
      - if not self.hybridDETstrategy and branch found -> returnTuple = (valBranch,cdfValues)
      - if not self.hybridDETstrategy and branch not found -> returnTuple = (None,cdfValues)
    """
    # local import: sklearn is only needed by this nearest-neighbor search
    from sklearn import neighbors
    # compute cdf of sampled vars
    lowerCdfValues = {}
    cdfValues = {}
    self.raiseADebug("Check for closest branch:")
    self.raiseADebug("_" * 50)
    for key, value in self.values.items():
        self.raiseADebug("Variable name : " + str(key))
        self.raiseADebug("Distrbution name: " + str(self.toBeSampled[key]))
        # epistemic variables do not participate in the branch CDF comparison
        if key not in self.epistemicVariables.keys():
            cdfValues[key] = self.distDict[key].cdf(value)
            # find the largest already-explored branch probability <= this cdf
            try:
                index = utils.first(
                    np.atleast_1d(
                        np.asarray(self.branchProbabilities[key]) <= cdfValues[key]).nonzero())[-1]
                val = self.branchProbabilities[key][index]
            except (ValueError, IndexError):
                # no explored threshold below this cdf value
                val = None
            lowerCdfValues[key] = val
            self.raiseADebug("CDF value : " + str(cdfValues[key]))
            self.raiseADebug("Lower CDF found : " + str(lowerCdfValues[key]))
        self.raiseADebug("_" * 50)
    #if hybrid DET, we need to find the correct tree that matches the values of the epistemic
    if self.hybridDETstrategy is not None:
        self.foundEpistemicTree, treer, compareDict = False, None, dict.fromkeys(
            self.epistemicVariables.keys(), False)
        for tree in self.TreeInfo.values():
            epistemicVars = tree.getrootnode().get(
                "hybridsamplerCoordinate")[0]['SampledVars']
            for key in self.epistemicVariables.keys():
                compareDict[key] = utils.compare(epistemicVars[key], self.values[key])
            if all(compareDict.values()):
                # we found the right epistemic tree
                self.foundEpistemicTree, treer = True, tree
                break
    else:
        treer = utils.first(self.TreeInfo.values())

    # check if in the adaptive points already explored (if not push into the grid)
    if not self.insertAdaptBPb:
        candidatesBranch = []
        # check if adaptive point is better choice -> TODO: improve efficiency
        for invPoint in self.investigatedPoints:
            pbth = [invPoint[self.toBeSampled[key]] for key in cdfValues.keys()]
            if all(i <= pbth[cnt] for cnt, i in enumerate(cdfValues.values())):
                candidatesBranch.append(invPoint)
        if len(candidatesBranch) > 0:
            if None in lowerCdfValues.values():
                lowerCdfValues = candidatesBranch[0]
            # keep the candidate closest (from below) to the requested point
            for invPoint in candidatesBranch:
                pbth = [invPoint[self.toBeSampled[key]] for key in cdfValues.keys()]
                if all(i >= pbth[cnt] for cnt, i in enumerate(lowerCdfValues.values())):
                    lowerCdfValues = invPoint
    # Check if The adaptive point requested is outside the so far run grid; in case return None
    # In addition, if Adaptive Hybrid DET, if treer is None, we did not find any tree
    # in the epistemic space => we need to create another one
    if None in lowerCdfValues.values() or treer is None:
        if self.hybridDETstrategy is not None:
            returnTuple = None, cdfValues, treer
        else:
            returnTuple = None, cdfValues
        return returnTuple
    nntrain, mapping = None, {}
    for ending in treer.iterProvidedFunction(self._checkEnded):
        #already ended branches, create training set for nearest algorithm (take coordinates <= of cdfValues) -> TODO: improve efficiency
        pbth = [ending.get('SampledVarsPb')[key] for key in lowerCdfValues.keys()]
        if all(pbth[cnt] <= i for cnt, i in enumerate(lowerCdfValues.values())):
            if nntrain is None:
                nntrain = np.zeros((1, len(cdfValues.keys())))
                nntrain[0, :] = np.array(copy.copy(pbth))
            else:
                nntrain = np.concatenate(
                    (nntrain, np.atleast_2d(np.array(copy.copy(pbth)))), axis=0)
            # rows are keyed by their (1-based) insertion order into nntrain
            mapping[nntrain.shape[0]] = ending
    if nntrain is not None:
        neigh = neighbors.NearestNeighbors(n_neighbors=len(mapping.keys()))
        neigh.fit(nntrain)
        valBranch = self._checkValidityOfBranch(
            neigh.kneighbors([list(lowerCdfValues.values())]), mapping)
        if self.hybridDETstrategy is not None:
            returnTuple = valBranch, cdfValues, treer
        else:
            returnTuple = valBranch, cdfValues
        return returnTuple
    else:
        # no ended branch qualified as training data -> no closest branch
        returnTuple = (None, cdfValues, treer) if self.hybridDETstrategy is not None else (
            None, cdfValues)
        return returnTuple
def initialize(self, externalSeeding=None, solutionExport=None):
    """
    This function should be called every time a clean sampler is needed. Called before takeAstep in <Step>
    @ In, externalSeeding, int, optional, external seed
    @ In, solutionExport, DataObject, optional, in goal oriented sampling (a.k.a. adaptive sampling this is where the space/point satisfying the constrains)
    @ Out, None
    """
    if self.initSeed == None:
        self.initSeed = Distributions.randomIntegers(0, 2**31, self)
    self.counter = 0
    if not externalSeeding:
        Distributions.randomSeed(self.initSeed)  #use the sampler initialization seed
        self.auxcnt = self.initSeed
    elif externalSeeding == 'continue':
        pass  #in this case the random sequence needs to be preserved
    else:
        Distributions.randomSeed(externalSeeding)  #the external seeding is used
        self.auxcnt = externalSeeding
    #grab restart dataobject if it's available, then in localInitialize the sampler can deal with it.
    if 'Restart' in self.assemblerDict.keys():
        self.raiseADebug('Restart object: ' + str(self.assemblerDict['Restart']))
        self.restartData = self.assemblerDict['Restart'][0][3]
        self.raiseAMessage('Restarting from ' + self.restartData.name)
        #check consistency of data
        try:
            rdata = self.restartData.getAllMetadata()['crowDist']
            sdata = self.inputInfo['crowDist']
            self.raiseAMessage('sampler inputs:')
            for sk, sv in sdata.items():
                self.raiseAMessage('| ' + str(sk) + ': ' + str(sv))
            # every dict entry of the restart metadata must match the sampler's
            # own distribution settings, otherwise the restart is inconsistent
            for i, r in enumerate(rdata):
                if type(r) != dict:
                    continue
                if not r == sdata:
                    self.raiseAMessage('restart inputs %i:' % i)
                    for rk, rv in r.items():
                        self.raiseAMessage('| ' + str(rk) + ': ' + str(rv))
                    self.raiseAnError(
                        IOError,
                        'Restart "%s" data[%i] does not have same inputs as sampler!'
                        % (self.restartData.name, i))
        except KeyError as e:
            self.raiseAWarning("No CROW distribution available in restart -", e)
    else:
        self.raiseAMessage('No restart for ' + self.printTag)
    #load restart data into existing points
    if self.restartData is not None:
        if not self.restartData.isItEmpty():
            inps = self.restartData.getInpParametersValues()
            outs = self.restartData.getOutParametersValues()
            #FIXME there is no guarantee ordering is accurate between restart data and sampler
            inputs = list(v for v in inps.values())
            existingInps = zip(*inputs)
            outVals = zip(*list(v for v in outs.values()))
            # maps a tuple of input values -> tuple of output values
            self.existing = dict(zip(existingInps, outVals))
    #specializing the self.localInitialize() to account for adaptive sampling
    if solutionExport != None:
        self.localInitialize(solutionExport=solutionExport)
    else:
        self.localInitialize()
    # push user-requested ND sampling parameters down into the distributions
    for distrib in self.NDSamplingParams:
        if distrib in self.distributions2variablesMapping:
            params = self.NDSamplingParams[distrib]
            temp = utils.first(
                self.distributions2variablesMapping[distrib][0].keys())
            self.distDict[temp].updateRNGParam(params)
        else:
            self.raiseAnError(
                IOError,
                'Distribution "%s" specified in distInit block of sampler "%s" does not exist!'
                % (distrib, self.name))
    # Store the transformation matrix in the metadata
    if self.variablesTransformationDict:
        self.entitiesToRemove = []
        for variable in self.variables2distributionsMapping.keys():
            distName = self.variables2distributionsMapping[variable]['name']
            dim = self.variables2distributionsMapping[variable]['dim']
            totDim = self.variables2distributionsMapping[variable]['totDim']
            # only the first dimension of each ND distribution carries the
            # transformation metadata (avoids storing it once per variable)
            if totDim > 1 and dim == 1:
                transformDict = {}
                transformDict['type'] = self.distDict[variable.strip()].type
                transformDict['transformationMatrix'] = self.distDict[
                    variable.strip()].transformationMatrix()
                self.inputInfo['transformation-' + distName] = transformDict
                self.entitiesToRemove.append('transformation-' + distName)
def _readMoreXMLbase(self, xmlNode):
    """
    Function to read the portion of the xml input that belongs to the base sampler only
    and initialize some stuff based on the inputs got
    The text is supposed to contain the info where and which variable to change.
    In case of a code the syntax is specified by the code interface itself
    @ In, xmlNode, xml.etree.ElementTree.Element, Xml element node
    @ Out, None
    """
    for child in xmlNode:
        prefix = ""
        if child.tag == 'Distribution':
            # a distribution sampled directly (no <variable> wrapper)
            for childChild in child:
                if childChild.tag == 'distribution':
                    prefix = "<distribution>"
                    tobesampled = childChild.text
            self.toBeSampled[prefix + child.attrib['name']] = tobesampled
            #if child.attrib['name'] != tobesampled:self.raiseAnError(IOError,"name of the <Distribution> node and <distribution> mismatches for node named "+ child.attrib['name'])
        elif child.tag == 'variable':
            # a variable must carry exactly one of <distribution> / <function>
            foundDistOrFunc = False
            for childChild in child:
                if childChild.tag == 'distribution':
                    if not foundDistOrFunc:
                        foundDistOrFunc = True
                    else:
                        self.raiseAnError(
                            IOError,
                            'A sampled variable cannot have both a distribution and a function!'
                        )
                    tobesampled = childChild.text
                    varData = {}
                    varData['name'] = childChild.text
                    # 'dim' defaults to 1 (scalar / 1-D distribution)
                    if childChild.get('dim') is None:
                        dim = 1
                    else:
                        dim = childChild.attrib['dim']
                    varData['dim'] = int(dim)
                    self.variables2distributionsMapping[child.attrib['name']] = varData
                    self.toBeSampled[prefix + child.attrib['name']] = tobesampled
                elif childChild.tag == 'function':
                    if not foundDistOrFunc:
                        foundDistOrFunc = True
                    else:
                        self.raiseAnError(
                            IOError,
                            'A sampled variable cannot have both a distribution and a function!'
                        )
                    tobesampled = childChild.text
                    self.dependentSample[prefix + child.attrib['name']] = tobesampled
            if not foundDistOrFunc:
                self.raiseAnError(
                    IOError, 'Sampled variable', child.attrib['name'],
                    'has neither a <distribution> nor <function> node specified!'
                )
        elif child.tag == "variablesTransformation":
            transformationDict = {}
            listIndex = None
            for childChild in child:
                if childChild.tag == "latentVariables":
                    transformationDict[childChild.tag] = list(
                        inp.strip() for inp in childChild.text.strip().split(','))
                elif childChild.tag == "manifestVariables":
                    transformationDict[childChild.tag] = list(
                        inp.strip() for inp in childChild.text.strip().split(','))
                elif childChild.tag == "manifestVariablesIndex":
                    # the index provided by the input file starts from 1, but the index used by the code starts from 0.
                    listIndex = list(
                        int(inp.strip()) - 1
                        for inp in childChild.text.strip().split(','))
                elif childChild.tag == "method":
                    self.transformationMethod[child.attrib['distribution']] = childChild.text
            if listIndex is None:
                self.raiseAWarning(
                    'Index is not provided for manifestVariables, default index will be used instead!'
                )
                listIndex = range(len(transformationDict["manifestVariables"]))
            transformationDict["manifestVariablesIndex"] = listIndex
            self.variablesTransformationDict[child.attrib['distribution']] = transformationDict
        elif child.tag == "constant":
            value = utils.partialEval(child.text)
            if value is None:
                self.raiseAnError(
                    IOError,
                    'The body of "constant" XML block should be a number. Got: ' + child.text)
            try:
                self.constants[child.attrib['name']] = value
            except KeyError:
                self.raiseAnError(KeyError, child.tag + ' must have the attribute "name"!!!')
        elif child.tag == "restartTolerance":
            self.restartTolerance = float(child.text)

    if len(self.constants) > 0:
        # check if constant variables are also part of the sampled space. In case, error out
        if not set(self.toBeSampled.keys()).isdisjoint(self.constants.keys()):
            self.raiseAnError(
                IOError,
                "Some constant variables are also in the sampling space:" + ' '.join([
                    i if i in self.toBeSampled.keys() else ""
                    for i in self.constants.keys()
                ]))

    if self.initSeed is None:
        self.initSeed = Distributions.randomIntegers(0, 2**31, self)
    # Creation of the self.distributions2variablesMapping dictionary: {'distName': ({'variable_name1': dim1}, {'variable_name2': dim2})}
    for variable in self.variables2distributionsMapping.keys():
        distName = self.variables2distributionsMapping[variable]['name']
        dim = self.variables2distributionsMapping[variable]['dim']
        listElement = {}
        listElement[variable] = dim
        if (distName in self.distributions2variablesMapping.keys()):
            self.distributions2variablesMapping[distName].append(listElement)
        else:
            self.distributions2variablesMapping[distName] = [listElement]

    # creation of the self.distributions2variablesIndexList dictionary:{'distName':[dim1,dim2,...,dimN]}
    self.distributions2variablesIndexList = {}
    for distName in self.distributions2variablesMapping.keys():
        positionList = []
        for var in self.distributions2variablesMapping[distName]:
            position = utils.first(var.values())
            positionList.append(position)
        # de-duplicate and sort the dimension indices
        positionList = list(set(positionList))
        positionList.sort()
        self.distributions2variablesIndexList[distName] = positionList

    for key in self.variables2distributionsMapping.keys():
        distName = self.variables2distributionsMapping[key]['name']
        dim = self.variables2distributionsMapping[key]['dim']
        reducedDim = self.distributions2variablesIndexList[distName].index(dim) + 1
        self.variables2distributionsMapping[key]['reducedDim'] = reducedDim  # the dimension of variable in the transformed space
        self.variables2distributionsMapping[key]['totDim'] = max(
            self.distributions2variablesIndexList[distName]
        )  # We will reset the value if the node <variablesTransformation> exist in the raven input file
        if not self.variablesTransformationDict and self.variables2distributionsMapping[key]['totDim'] > 1:
            if self.variables2distributionsMapping[key]['totDim'] != len(
                    self.distributions2variablesIndexList[distName]):
                self.raiseAnError(
                    IOError,
                    'The "dim" assigned to the variables insider Sampler are not correct! the "dim" should start from 1, and end with the full dimension of given distribution'
                )

    #Checking the variables transformation
    if self.variablesTransformationDict:
        for dist, varsDict in self.variablesTransformationDict.items():
            maxDim = len(varsDict['manifestVariables'])
            listLatentElement = varsDict['latentVariables']
            if len(set(listLatentElement)) != len(listLatentElement):
                dups = set(var for var in listLatentElement
                           if listLatentElement.count(var) > 1)
                self.raiseAnError(
                    IOError,
                    'The following are duplicated variables listed in the latentVariables: '
                    + str(dups))
            if len(set(varsDict['manifestVariables'])) != len(varsDict['manifestVariables']):
                dups = set(var for var in varsDict['manifestVariables']
                           if varsDict['manifestVariables'].count(var) > 1)
                self.raiseAnError(
                    IOError,
                    'The following are duplicated variables listed in the manifestVariables: '
                    + str(dups))
            if len(set(varsDict['manifestVariablesIndex'])) != len(varsDict['manifestVariablesIndex']):
                dups = set(var + 1 for var in varsDict['manifestVariablesIndex']
                           if varsDict['manifestVariablesIndex'].count(var) > 1)
                self.raiseAnError(
                    IOError,
                    'The following are duplicated variables indices listed in the manifestVariablesIndex: '
                    + str(dups))
            listElement = self.distributions2variablesMapping[dist]
            for var in listElement:
                # BUGFIX: var.keys()[0] is Python-2-only (dict views are not
                # subscriptable); use utils.first as everywhere else in this module
                self.variables2distributionsMapping[utils.first(var.keys())][
                    'totDim'] = maxDim  #reset the totDim to reflect the totDim of original input space
            tempListElement = {
                k.strip(): v
                for x in listElement for ks, v in x.items()
                for k in list(ks.strip().split(','))
            }
            listIndex = []
            for var in listLatentElement:
                if var not in set(tempListElement.keys()):
                    self.raiseAnError(
                        IOError, 'The variable listed in latentVariables ' + var +
                        ' is not listed in the given distribution: ' + dist)
                listIndex.append(tempListElement[var] - 1)
            # NOTE(review): listIndex holds 0-based dims, so the valid maximum is
            # maxDim-1; this bound check looks off by one — confirm intent
            if max(listIndex) > maxDim:
                self.raiseAnError(
                    IOError, 'The maximum dim = ' + str(max(listIndex)) +
                    ' defined for latent variables is exceeded the dimension of the problem '
                    + str(maxDim))
            if len(set(listIndex)) != len(listIndex):
                dups = set(var + 1 for var in listIndex if listIndex.count(var) > 1)
                self.raiseAnError(
                    IOError,
                    'Each of the following dimensions are assigned to multiple latent variables in Samplers: '
                    + str(dups))
            # update the index for latentVariables according to the 'dim' assigned for given var defined in Sampler
            self.variablesTransformationDict[dist]['latentVariablesIndex'] = listIndex
def localGenerateInput(self, model, myInput):
    """
    Function to select the next most informative point for refining the limit
    surface search.
    After this method is called, the self.inputInfo should be ready to be sent
    to the model
    @ In, model, model instance, an instance of a model
    @ In, myInput, list, a list of the original needed inputs for the model (e.g. list of files, etc.)
    @ Out, None
    """
    varCount = 0
    self.inputInfo['distributionName'] = {}  #Used to determine which distribution to change if needed.
    self.inputInfo['distributionType'] = {}  #Used to determine which distribution type is used
    weight = 1.0
    for varName in self.axisName:
        # new implementation for ND LHS
        if not "<distribution>" in varName:
            if self.variables2distributionsMapping[varName]['totDim'] > 1 and self.variables2distributionsMapping[varName]['reducedDim'] == 1:
                # to avoid double count of weight for ND distribution; I need to count only one variable instaed of N
                if self.variablesTransformationDict:
                    distName = self.variables2distributionsMapping[varName]['name']
                    for distVarName in self.distributions2variablesMapping[distName]:
                        for kkey in utils.first(distVarName.keys()).strip().split(','):
                            self.inputInfo['distributionName'][kkey] = self.toBeSampled[varName]
                            self.inputInfo['distributionType'][kkey] = self.distDict[varName].type
                    # one slot per dimension of the ND distribution
                    ndCoordinate = np.zeros(len(self.distributions2variablesMapping[distName]))
                    dxs = np.zeros(len(self.distributions2variablesMapping[distName]))
                    centerCoordinate = np.zeros(len(self.distributions2variablesMapping[distName]))
                    positionList = self.distributions2variablesIndexList[distName]
                    for var in self.distributions2variablesMapping[distName]:
                        # if the varName is a comma separated list of strings the user wants to sample the comma separated variables with the same sampled value => link the value to all comma separated variables
                        variable = utils.first(var.keys()).strip()
                        position = utils.first(var.values())
                        # cell bounds for this dimension from the stratified grid
                        upper = self.gridEntity.returnShiftedCoordinate(
                            self.gridEntity.returnIteratorIndexes(), {
                                variable: self.sampledCoordinate[self.counter - 1][varCount] + 1
                            })[variable]
                        lower = self.gridEntity.returnShiftedCoordinate(
                            self.gridEntity.returnIteratorIndexes(), {
                                variable: self.sampledCoordinate[self.counter - 1][varCount]
                            })[variable]
                        varCount += 1
                        if self.gridInfo[variable] == 'CDF':
                            # sample uniformly in CDF space, then map back to values
                            coordinate = lower + (upper - lower) * Distributions.random()
                            ndCoordinate[positionList.index(position)] = self.distDict[
                                variable].inverseMarginalDistribution(coordinate, variable)
                            dxs[positionList.index(position)] = self.distDict[
                                variable].inverseMarginalDistribution(
                                    max(upper, lower), variable) - self.distDict[
                                        variable].inverseMarginalDistribution(
                                            min(upper, lower), variable)
                            centerCoordinate[positionList.index(position)] = (
                                self.distDict[variable].inverseMarginalDistribution(upper, variable) +
                                self.distDict[variable].inverseMarginalDistribution(lower, variable)) / 2.0
                            for kkey in variable.strip().split(','):
                                self.values[kkey] = ndCoordinate[positionList.index(position)]
                                self.inputInfo['upper'][kkey] = self.distDict[
                                    variable].inverseMarginalDistribution(max(upper, lower), variable)
                                self.inputInfo['lower'][kkey] = self.distDict[
                                    variable].inverseMarginalDistribution(min(upper, lower), variable)
                        elif self.gridInfo[variable] == 'value':
                            # grid already expressed in variable values: sample in
                            # the CDF interval spanned by the cell bounds
                            dxs[positionList.index(position)] = max(upper, lower) - min(upper, lower)
                            centerCoordinate[positionList.index(position)] = (upper + lower) / 2.0
                            coordinateCdf = self.distDict[variable].marginalCdf(lower) + (
                                self.distDict[variable].marginalCdf(upper) -
                                self.distDict[variable].marginalCdf(lower)) * Distributions.random()
                            coordinate = self.distDict[variable].inverseMarginalDistribution(
                                coordinateCdf, variable)
                            ndCoordinate[positionList.index(position)] = coordinate
                            for kkey in variable.strip().split(','):
                                self.values[kkey] = coordinate
                                self.inputInfo['upper'][kkey] = max(upper, lower)
                                self.inputInfo['lower'][kkey] = min(upper, lower)
                    # single weight for the whole ND cell (integral over the cell)
                    self.inputInfo['ProbabilityWeight-' + varName.replace(
                        ",", "!")] = self.distDict[varName].cellIntegral(centerCoordinate, dxs)
                    weight *= self.inputInfo['ProbabilityWeight-' + varName.replace(",", "!")]
                    self.inputInfo['SampledVarsPb'][varName] = self.distDict[varName].pdf(ndCoordinate)
                else:
                    # no variables transformation: the ND dist is sampled through
                    # a global grid defined on the CDF
                    if self.gridInfo[varName] == 'CDF':
                        upper = self.gridEntity.returnShiftedCoordinate(
                            self.gridEntity.returnIteratorIndexes(), {
                                varName: self.sampledCoordinate[self.counter - 1][varCount] + 1
                            })[varName]
                        lower = self.gridEntity.returnShiftedCoordinate(
                            self.gridEntity.returnIteratorIndexes(), {
                                varName: self.sampledCoordinate[self.counter - 1][varCount]
                            })[varName]
                        varCount += 1
                        coordinate = lower + (upper - lower) * Distributions.random()
                        gridCoordinate, distName = self.distDict[varName].ppf(
                            coordinate), self.variables2distributionsMapping[varName]['name']
                        for distVarName in self.distributions2variablesMapping[distName]:
                            # NOTE(review): distVarName.values()[0] is Python-2-only
                            # subscripting of dict views — confirm target interpreter
                            for kkey in utils.first(distVarName.keys()).strip().split(','):
                                self.inputInfo['distributionName'][kkey], self.inputInfo[
                                    'distributionType'][kkey], self.values[kkey] = self.toBeSampled[
                                        varName], self.distDict[varName].type, np.atleast_1d(
                                            gridCoordinate)[distVarName.values()[0] - 1]
                        # coordinate stores the cdf values, we need to compute the pdf for SampledVarsPb
                        self.inputInfo['SampledVarsPb'][varName] = self.distDict[varName].pdf(
                            np.atleast_1d(gridCoordinate).tolist())
                        weight *= max(upper, lower) - min(upper, lower)
                        self.inputInfo['ProbabilityWeight-' + varName.replace(",", "!")] = max(
                            upper, lower) - min(upper, lower)
                    else:
                        self.raiseAnError(
                            IOError,
                            "Since the globalGrid is defined, the Stratified Sampler is only working when the sampling is performed on a grid on a CDF. However, the user specifies the grid on "
                            + self.gridInfo[varName])
        if ("<distribution>" in varName) or self.variables2distributionsMapping[varName]['totDim'] == 1:
            # 1D variable
            # if the varName is a comma separated list of strings the user wants to sample the comma separated variables with the same sampled value => link the value to all comma separated variables
            upper = self.gridEntity.returnShiftedCoordinate(
                self.gridEntity.returnIteratorIndexes(), {
                    varName: self.sampledCoordinate[self.counter - 1][varCount] + 1
                })[varName]
            lower = self.gridEntity.returnShiftedCoordinate(
                self.gridEntity.returnIteratorIndexes(), {
                    varName: self.sampledCoordinate[self.counter - 1][varCount]
                })[varName]
            varCount += 1
            if self.gridInfo[varName] == 'CDF':
                # sample uniformly within the CDF cell, then invert
                coordinate = lower + (upper - lower) * Distributions.random()
                ppfValue = self.distDict[varName].ppf(coordinate)
                ppfLower = self.distDict[varName].ppf(min(upper, lower))
                ppfUpper = self.distDict[varName].ppf(max(upper, lower))
                weight *= self.distDict[varName].cdf(ppfUpper) - self.distDict[varName].cdf(ppfLower)
                self.inputInfo['ProbabilityWeight-' + varName.replace(
                    ",", "-")] = self.distDict[varName].cdf(ppfUpper) - self.distDict[varName].cdf(ppfLower)
                self.inputInfo['SampledVarsPb'][varName] = self.distDict[varName].pdf(ppfValue)
            elif self.gridInfo[varName] == 'value':
                coordinateCdf = self.distDict[varName].cdf(min(upper, lower)) + (
                    self.distDict[varName].cdf(max(upper, lower)) -
                    self.distDict[varName].cdf(min(upper, lower))) * Distributions.random()
                if coordinateCdf == 0.0:
                    self.raiseAWarning(
                        IOError,
                        "The grid lower bound and upper bound in value will generate ZERO cdf value!!!"
                    )
                coordinate = self.distDict[varName].ppf(coordinateCdf)
                weight *= self.distDict[varName].cdf(max(upper, lower)) - self.distDict[varName].cdf(min(upper, lower))
                self.inputInfo['ProbabilityWeight-' + varName.replace(
                    ",", "-")] = self.distDict[varName].cdf(max(upper, lower)) - self.distDict[varName].cdf(min(upper, lower))
                self.inputInfo['SampledVarsPb'][varName] = self.distDict[varName].pdf(coordinate)
            # broadcast the single sampled value to every comma-separated alias
            for kkey in varName.strip().split(','):
                self.inputInfo['distributionName'][kkey] = self.toBeSampled[varName]
                self.inputInfo['distributionType'][kkey] = self.distDict[varName].type
                if self.gridInfo[varName] == 'CDF':
                    self.values[kkey] = ppfValue
                    self.inputInfo['upper'][kkey] = ppfUpper
                    self.inputInfo['lower'][kkey] = ppfLower
                elif self.gridInfo[varName] == 'value':
                    self.values[kkey] = coordinate
                    self.inputInfo['upper'][kkey] = max(upper, lower)
                    self.inputInfo['lower'][kkey] = min(upper, lower)
    self.inputInfo['PointProbability'] = reduce(mul, self.inputInfo['SampledVarsPb'].values())
    self.inputInfo['ProbabilityWeight'] = weight
    self.inputInfo['SamplerType'] = 'Stratified'
def collectOutputFromDataObject(self, exportDict, output):
    """
    Method to collect the output from a DataObject (if it is not a dataObject, it just returns a list with one single exportDict)
    @ In, exportDict, dict, the export dictionary
        ({'inputSpaceParams':{var1:value1,var2:value2},
          'outputSpaceParams':{outstreamName1:DataObject1,outstreamName2:DataObject2},
          'metadata':{'metadataName1':value1,'metadataName2':value2}})
    @ Out, returnList, list, list of export dictionaries
    """
    returnList = []
    # if the output space does not hold DataObjects, nothing to expand
    if utils.first(exportDict['outputSpaceParams'].values()).__class__.__base__.__name__ != 'Data':
        returnList.append(exportDict)
    else:
        # get the DataObject that is compatible with this output
        compatibleDataObject = None
        for dataObj in exportDict['outputSpaceParams'].values():
            if output.type == dataObj.type:
                compatibleDataObject = dataObj
                break
            # an HDF5 output can host a HistorySet directly
            if output.type == 'HDF5' and dataObj.type == 'HistorySet':
                compatibleDataObject = dataObj
                break
        if compatibleDataObject is None:
            # if none found (e.g. => we are filling an HistorySet with a PointSet), we take the first one
            compatibleDataObject = utils.first(exportDict['outputSpaceParams'].values())
        # get the values
        inputs = compatibleDataObject.getParametersValues('inputs',nodeId = 'RecontructEnding')
        unstructuredInputs = compatibleDataObject.getParametersValues('unstructuredinputs',nodeId = 'RecontructEnding')
        outputs = compatibleDataObject.getParametersValues('outputs',nodeId = 'RecontructEnding')
        metadata = compatibleDataObject.getAllMetadata(nodeId = 'RecontructEnding')
        inputKeys = inputs.keys() if compatibleDataObject.type == 'PointSet' else utils.first(inputs.values()).keys()
        # expand inputspace of current RAVEN
        for i in range(len(compatibleDataObject)):
            # one export dictionary per realization of the inner DataObject
            appendDict = {'inputSpaceParams':{},'outputSpaceParams':{},'metadata':{}}
            appendDict['inputSpaceParams'].update(exportDict['inputSpaceParams'])
            appendDict['metadata'].update(exportDict['metadata'])
            if compatibleDataObject.type == 'PointSet':
                for inKey, value in inputs.items():
                    appendDict['inputSpaceParams'][inKey] = value[i]
                for inKey, value in unstructuredInputs.items():
                    appendDict['inputSpaceParams'][inKey] = value[i]
                for outKey, value in outputs.items():
                    appendDict['outputSpaceParams'][outKey] = value[i]
            else:
                # HistorySet: values are keyed per-history
                # NOTE(review): dict.values()[i] is Python-2-only; under Python 3
                # this requires list(...) — confirm target interpreter
                for inKey, value in inputs.values()[i].items():
                    appendDict['inputSpaceParams'][inKey] = value
                if len(unstructuredInputs) > 0:
                    for inKey, value in unstructuredInputs.values()[i].items():
                        appendDict['inputSpaceParams'][inKey] = value
                for outKey, value in outputs.values()[i].items():
                    appendDict['outputSpaceParams'][outKey] = value
            # add metadata for both dataobject types
            for metadataToExport in ['SampledVars','SampledVarsPb']:
                if metadataToExport in metadata:
                    appendDict['metadata'][metadataToExport].update(metadata[metadataToExport][i])
            # probability-like metadata multiplies with the outer realization's
            weightForVars = ['ProbabilityWeight-'+var.strip() for var in inputKeys]
            for metadataToMerge in ['ProbabilityWeight', 'PointProbability']+weightForVars:
                if metadataToMerge in appendDict['metadata']:
                    if metadataToMerge in metadata:
                        appendDict['metadata'][metadataToMerge]*= metadata[metadataToMerge][i]
                else:
                    if metadataToMerge in metadata:
                        appendDict['metadata'][metadataToMerge] = metadata[metadataToMerge][i]
            returnList.append(appendDict)
    return returnList
def evaluateSample(self, myInput, samplerType, kwargs):
  """
    This will evaluate an individual sample on this model. Note, parameters
    are needed by createNewInput and thus descriptions are copied from there.
    @ In, myInput, list, the inputs (list) to start from to generate the new one
    @ In, samplerType, string, is the type of sampler that is calling to generate a new input
    @ In, kwargs, dict, is a dictionary that contains the information coming from the sampler,
         a mandatory key is the sampledVars that contains a dictionary {'name variable':value}
    @ Out, returnValue, tuple, This will hold two pieces of information,
      the first item will be the input data used to generate this sample,
      the second item will be the output of this model given the specified inputs
  """
  inputFiles = self.createNewInput(myInput, samplerType, **kwargs)
  # createNewInput may return (files, metaData) or just files
  self.currentInputFiles, metaData = (copy.deepcopy(inputFiles[0]),inputFiles[1]) if type(inputFiles).__name__ == 'tuple' else (inputFiles, None)
  returnedCommand = self.code.genCommand(self.currentInputFiles,self.executable, flags=self.clargs, fileArgs=self.fargs, preExec=self.preExec)
  ## Given that createNewInput can only return a tuple, I don't think these
  ## checks are necessary (keeping commented out until someone else can verify):
  # if type(returnedCommand).__name__ != 'tuple':
  #   self.raiseAnError(IOError, "the generateCommand method in code interface must return a tuple")
  # if type(returnedCommand[0]).__name__ != 'list':
  #   self.raiseAnError(IOError, "the first entry in tuple returned by generateCommand method needs to be a list of tuples!")
  executeCommand, self.outFileRoot = returnedCommand
  precommand = kwargs['precommand']
  postcommand = kwargs['postcommand']
  bufferSize = kwargs['bufferSize']
  fileExtensionsToDelete = kwargs['deleteOutExtension']
  deleteSuccessfulLogFiles = kwargs['delSucLogFiles']
  codeLogFile = self.outFileRoot
  if codeLogFile is None:
    codeLogFile = os.path.join(metaData['subDirectory'],'generalOut')
  ## Before we were temporarily changing directories in order to copy the
  ## correct directory to the subprocess. Instead, we can just set the
  ## directory after we copy it over. -- DPM 5/5/2017
  sampleDirectory = os.path.join(os.getcwd(),metaData['subDirectory'])
  localenv = dict(os.environ)
  localenv['PWD'] = str(sampleDirectory)
  outFileObject = open(os.path.join(sampleDirectory,codeLogFile), 'w', bufferSize)
  # at least one of the input files must carry an extension the code interface accepts
  found = False
  for index, inputFile in enumerate(self.currentInputFiles):
    if inputFile.getExt() in self.code.getInputExtension():
      found = True
      break
  if not found:
    self.raiseAnError(IOError,'None of the input files has one of the extensions requested by code ' + self.subType +': ' + ' '.join(self.getInputExtension()))
  # assemble the full command string from the (runtype, cmd) pairs
  commands=[]
  for runtype,cmd in executeCommand:
    newCommand=''
    if runtype.lower() == 'parallel':
      newCommand += precommand
      newCommand += cmd+' '
      newCommand += postcommand
      commands.append(newCommand)
    elif runtype.lower() == 'serial':
      commands.append(cmd)
    else:
      self.raiseAnError(IOError,'For execution command <'+cmd+'> the run type was neither "serial" nor "parallel"! Instead received: ',runtype,'\nPlease check the code interface.')
  command = ' && '.join(commands)+' '
  # substitute the placeholder tokens supplied by the sampler/job handler
  command = command.replace("%INDEX%",kwargs['INDEX'])
  command = command.replace("%INDEX1%",kwargs['INDEX1'])
  command = command.replace("%CURRENT_ID%",kwargs['CURRENT_ID'])
  command = command.replace("%CURRENT_ID1%",kwargs['CURRENT_ID1'])
  command = command.replace("%SCRIPT_DIR%",kwargs['SCRIPT_DIR'])
  command = command.replace("%FRAMEWORK_DIR%",kwargs['FRAMEWORK_DIR'])
  ## Note this is the working directory that the subprocess will use, it is
  ## not the directory I am currently working. This bit me as I moved the code
  ## from the old ExternalRunner because in that case this was filled in after
  ## the process was submitted by the process itself. -- DPM 5/4/17
  command = command.replace("%WORKING_DIR%",sampleDirectory)
  command = command.replace("%BASE_WORKING_DIR%",kwargs['BASE_WORKING_DIR'])
  command = command.replace("%METHOD%",kwargs['METHOD'])
  command = command.replace("%NUM_CPUS%",kwargs['NUM_CPUS'])
  self.raiseAMessage('Execution command submitted:',command)
  if platform.system() == 'Windows':
    command = self._expandForWindows(command)
    self.raiseAMessage("modified command to", repr(command))
    for key, value in localenv.items():
      localenv[key]=str(value)
  elif not self.code.getRunOnShell():
    command = self._expandCommand(command)
  # NOTE: a leftover debug print of the command was removed here; the command is
  # already reported via raiseAMessage above
  ## reset python path
  localenv.pop('PYTHONPATH',None)
  ## This code should be evaluated by the job handler, so it is fine to wait
  ## until the execution of the external subprocess completes.
  process = utils.pickleSafeSubprocessPopen(command, shell=self.code.getRunOnShell(), stdout=outFileObject, stderr=outFileObject, cwd=localenv['PWD'], env=localenv)
  if self.maxWallTime is not None:
    # poll the subprocess and kill it once the wall-time budget is exhausted
    timeout = time.time() + self.maxWallTime
    while True:
      time.sleep(0.5)
      process.poll()
      if time.time() > timeout and process.returncode is None:
        self.raiseAWarning('walltime exceeded in run in working dir: '+str(metaData['subDirectory'])+'. Killing the run...')
        process.kill()
        process.returncode = -1
      if process.returncode is not None or time.time() > timeout:
        break
  else:
    process.wait()
  returnCode = process.returncode
  # procOutput = process.communicate()[0]
  ## If the returnCode is already non-zero, we should maintain our current
  ## value as it may have some meaning that can be parsed at some point, so
  ## only set the returnCode to -1 in here if we did not already catch the
  ## failure.
  if returnCode == 0 and 'checkForOutputFailure' in dir(self.code):
    codeFailed = self.code.checkForOutputFailure(codeLogFile, metaData['subDirectory'])
    if codeFailed:
      returnCode = -1
  # close the log file
  outFileObject.close()
  ## We should try and use the output the code interface gives us first, but
  ## in lieu of that we should fall back on the standard output of the code
  ## (Which was deleted above in some cases, so I am not sure if this was
  ## an intentional design by the original developer or accidental and should
  ## be revised).
  ## My guess is that every code interface implements this given that the code
  ## below always adds .csv to the filename and the standard output file does
  ## not have an extension. - (DPM 4/6/2017)
  outputFile = codeLogFile
  # initialize before the conditional branch so the flag is always bound,
  # even when the code interface does not implement finalizeCodeOutput (bug fix)
  ravenCase = False
  if 'finalizeCodeOutput' in dir(self.code) and returnCode == 0:
    finalCodeOutputFile = self.code.finalizeCodeOutput(command, codeLogFile, metaData['subDirectory'])
    ## Special case for RAVEN interface --ALFOA 09/17/17
    if type(finalCodeOutputFile).__name__ == 'dict':
      ravenCase = True
    if ravenCase and self.code.__class__.__name__ != 'RAVEN':
      self.raiseAnError(RuntimeError, 'The return argument from "finalizeCodeOutput" must be a str containing the new output file root!')
    if finalCodeOutputFile and not ravenCase:
      outputFile = finalCodeOutputFile
  ## If the run was successful
  if returnCode == 0:
    ## This may be a tautology at this point --DPM 4/12/17
    ## Special case for RAVEN interface. Added ravenCase flag --ALFOA 09/17/17
    if outputFile is not None and not ravenCase:
      outFile = Files.CSV()
      ## Should we be adding the file extension here?
      outFile.initialize(outputFile+'.csv',self.messageHandler,path=metaData['subDirectory'])
      csvLoader = CsvLoader.CsvLoader(self.messageHandler)
      # does this CodeInterface have sufficiently intense (or limited) CSV files that
      # it needs to assume floats and use numpy, or can we use pandas?
      loadUtility = self.code.getCsvLoadUtil()
      csvData = csvLoader.loadCsvFile(outFile.getAbsFile(), nullOK=False, utility=loadUtility)
      returnDict = csvLoader.toRealization(csvData)
    if not ravenCase:
      self._replaceVariablesNamesWithAliasSystem(returnDict, 'inout', True)
      returnDict.update(kwargs)
      returnValue = (kwargs['SampledVars'],returnDict)
      exportDict = self.createExportDictionary(returnValue)
    else:
      # we have the DataObjects -> raven-runs-raven case only so far
      # we have two tasks to do: collect the input/output/meta/indexes from the INNER raven run, and ALSO the input from the OUTER raven run.
      # -> in addition, we have to fix the probability weights.
      ## get the number of realizations
      ### we already checked consistency in the CodeInterface, so just get the length of the first data object
      numRlz = len(utils.first(finalCodeOutputFile.values()))
      ## set up the return container
      exportDict = {'RAVEN_isBatch':True,'realizations':[]}
      ## set up each realization
      for n in range(numRlz):
        rlz = {}
        ## collect the results from INNER, both point set and history set
        for dataObj in finalCodeOutputFile.values():
          # TODO FIXME check for overwriting data. For now just replace data if it's duplicate!
          new = dict((var,np.atleast_1d(val)) for var,val in dataObj.realization(index=n,unpackXArray=True).items())
          rlz.update( new )
        ## add OUTER input space
        # TODO FIXME check for overwriting data. For now just replace data if it's duplicate!
        new = dict((var,np.atleast_1d(val)) for var,val in kwargs['SampledVars'].items())
        rlz.update( new )
        ## combine ProbabilityWeights
        # TODO FIXME these are a rough attempt at getting it right!
        rlz['ProbabilityWeight'] = np.atleast_1d(rlz.get('ProbabilityWeight',1.0) * kwargs.get('ProbabilityWeight',1.0))
        rlz['PointProbability'] = np.atleast_1d(rlz.get('PointProbability',1.0) * kwargs.get('PointProbability',1.0))
        # FIXME: adding "_n" to Optimizer samples scrambles its ability to find evaluations!
        ## temporary fix: only append if there's multiple realizations, and error out if sampler is an optimizer.
        if numRlz > 1:
          if '_' in kwargs['prefix']:
            self.raiseAnError(RuntimeError,'OUTER RAVEN is using an OPTIMIZER, but INNER RAVEN is returning multiple realizations!')
          addon = '_{}'.format(n)
        else:
          addon = ''
        rlz['prefix'] = np.atleast_1d(kwargs['prefix']+addon)
        ## add the rest of the metadata
        # TODO slow
        for var,val in kwargs.items():
          if var not in rlz.keys():
            rlz[var] = np.atleast_1d(val)
        self._replaceVariablesNamesWithAliasSystem(rlz,'inout',True)
        exportDict['realizations'].append(rlz)
    ## The last thing before returning should be to delete the temporary log
    ## file and any other file the user requests to be cleared
    if deleteSuccessfulLogFiles:
      self.raiseAMessage(' Run "' +kwargs['prefix']+'" ended smoothly, removing log file!')
      codeLofFileFullPath = os.path.join(metaData['subDirectory'],codeLogFile)
      if os.path.exists(codeLofFileFullPath):
        os.remove(codeLofFileFullPath)
    ## Check if the user specified any file extensions for clean up
    for fileExt in fileExtensionsToDelete:
      fileList = [ os.path.join(metaData['subDirectory'],f) for f in os.listdir(metaData['subDirectory']) if f.endswith(fileExt) ]
      for f in fileList:
        os.remove(f)
    return exportDict
  else:
    self.raiseAMessage(" Process Failed "+str(command)+" returnCode "+str(returnCode))
    absOutputFile = os.path.join(sampleDirectory,outputFile)
    if os.path.exists(absOutputFile):
      # use a context manager so the log file handle is always released (leak fix)
      with open(absOutputFile,"r") as failedOutFile:
        self.raiseAMessage(repr(failedOutFile.read()).replace("\\n","\n"))
    else:
      self.raiseAMessage(" No output " + absOutputFile)
    ## If you made it here, then the run must have failed
    return None
def localGenerateInput(self, model, myInput):
  """
    Function to select the next most informative point for refining the limit
    surface search.
    After this method is called, the self.inputInfo should be ready to be sent
    to the model
    @ In, model, model instance, an instance of a model
    @ In, myInput, list, a list of the original needed inputs for the model (e.g. list of files, etc.)
    @ Out, None
  """
  if self.startAdaptive == True and self.adaptiveReady == True:
    # let the limit-surface search propose the next candidate point (fills self.values)
    LimitSurfaceSearch.localGenerateInput(self, model, myInput)
    #the adaptive sampler created the next point sampled vars
    #find the closest branch
    if self.hybridDETstrategy is not None:
      closestBranch, cdfValues, treer = self._checkClosestBranch()
    else:
      closestBranch, cdfValues = self._checkClosestBranch()
    if closestBranch is None:
      self.raiseADebug('An usable branch for next candidate has not been found => create a parallel branch!')
    # add pbthresholds in the grid
    investigatedPoint = {}
    for key, value in cdfValues.items():
      # locate the insertion index for this CDF value among the existing thresholds;
      # an empty match falls back to position 0
      try:
        ind = utils.first(np.atleast_1d(np.asarray(self.branchProbabilities[key]) <= value).nonzero())[-1]
      except (IndexError, ValueError):
        ind = 0
      if value not in self.branchProbabilities[key]:
        # keep probabilities and the corresponding physical values aligned
        self.branchProbabilities[key].insert(ind, value)
        self.branchValues[key].insert(ind, self.distDict[key].ppf(value))
      investigatedPoint[key] = value
    # collect investigated point
    self.investigatedPoints.append(investigatedPoint)
    if closestBranch:
      # reuse the closest existing branch as the starting point for the new sample
      info = self._retrieveBranchInfo(closestBranch)
      self._constructEndInfoFromBranch(model, myInput, info, cdfValues)
    else:
      # create a new tree, since there are no branches that are close enough to the adaptive request
      elm = ETS.HierarchicalNode(self.messageHandler, self.name + '_' + str(len(self.TreeInfo.keys()) + 1))
      elm.add('name', self.name + '_' + str(len(self.TreeInfo.keys()) + 1))
      elm.add('startTime', 0.0)
      # Initialize the endTime to be equal to the start one...
      # It will modified at the end of each branch
      elm.add('endTime', 0.0)
      elm.add('runEnded', False)
      elm.add('running', True)
      elm.add('queue', False)
      elm.add('completedHistory', False)
      branchedLevel = {}
      for key, value in cdfValues.items():
        # index of this CDF value inside the (now updated) threshold grid
        branchedLevel[key] = utils.first(np.atleast_1d(np.asarray(self.branchProbabilities[key]) == value).nonzero())[-1]
      # The dictionary branchedLevel is stored in the xml tree too. That's because
      # the advancement of the thresholds must follow the tree structure
      elm.add('branchedLevel', branchedLevel)
      if self.hybridDETstrategy is not None and not self.foundEpistemicTree:
        # adaptive hybrid DET and not found a tree in the epistemic space
        # take the first tree and modify the hybridsamplerCoordinate
        hybridSampled = copy.deepcopy(utils.first(self.TreeInfo.values()).getrootnode().get('hybridsamplerCoordinate'))
        for hybridStrategy in hybridSampled:
          for key in self.epistemicVariables.keys():
            if key in hybridStrategy['SampledVars'].keys():
              self.raiseADebug("epistemic var " + str(key) + " value = " + str(self.values[key]))
              hybridStrategy['SampledVars'][key] = copy.copy(self.values[key])
              hybridStrategy['SampledVarsPb'][key] = self.distDict[key].pdf(self.values[key])
          hybridStrategy['prefix'] = len(self.TreeInfo.values()) + 1
          # TODO: find a strategy to recompute the probability weight here (for now == PointProbability)
          hybridStrategy['PointProbability'] = reduce(mul, self.inputInfo['SampledVarsPb'].values())
          hybridStrategy['ProbabilityWeight'] = reduce(mul, self.inputInfo['SampledVarsPb'].values())
        elm.add('hybridsamplerCoordinate', hybridSampled)
      # expose one ProbabilityWeight-<var> entry per sampled variable
      self.inputInfo.update({'ProbabilityWeight-' + key.strip(): value for key, value in self.inputInfo['SampledVarsPb'].items()})
      # Here it is stored all the info regarding the DET => we create the info for all the branchings and we store them
      self.TreeInfo[self.name + '_' + str(len(self.TreeInfo.keys()) + 1)] = ETS.HierarchicalTree(self.messageHandler, elm)
      # NOTE: after the insertion above, len(self.TreeInfo.keys()) equals the index of
      # the tree just added, so this retrieves the newly created tree
      self._createRunningQueueBeginOne(self.TreeInfo[self.name + '_' + str(len(self.TreeInfo.keys()))], branchedLevel, model, myInput)
  # delegate the actual input construction to the base DynamicEventTree sampler
  return DynamicEventTree.localGenerateInput(self, model, myInput)
def __runTemporalSciKitLearn(self, Input):
  """
    This method executes the postprocessor action. In this case it loads the
    results to specified dataObject. This is for temporalSciKitLearn
    @ In, Input, dict, dictionary of data to process
    @ Out, outputDict, dict, dictionary containing the post-processed results
  """
  self.unSupervisedEngine.features = Input['Features']
  self.unSupervisedEngine.pivotVariable = self.pivotVariable
  # train lazily: only if the engine has not been trained yet
  if not self.unSupervisedEngine.amITrained:
    self.unSupervisedEngine.train(Input['Features'])
  self.unSupervisedEngine.confidence()
  self.userInteraction()
  outputDict = self.unSupervisedEngine.outputDict
  numberOfHistoryStep = self.unSupervisedEngine.numberOfHistoryStep
  numberOfSample = self.unSupervisedEngine.numberOfSample
  if 'bicluster' == self.unSupervisedEngine.getDataMiningType():
    self.raiseAnError(RuntimeError, 'Bicluster has not yet been implemented.')
  ## Rename the algorithm output to point to the user-defined label feature
  if 'labels' in self.unSupervisedEngine.outputDict['outputs'].keys():
    # assemble a (sample, timeStep) matrix of cluster labels from the per-step lists
    labels = np.zeros(shape=(numberOfSample,numberOfHistoryStep))
    for t in range(numberOfHistoryStep):
      labels[:,t] = self.unSupervisedEngine.outputDict['outputs']['labels'][t]
    outputDict['outputs'][self.labelFeature] = labels
  elif 'embeddingVectors' in outputDict['outputs']:
    # dimensionality reduction case: expand each reduced dimension into its own column
    transformedData = outputDict['outputs'].pop('embeddingVectors')
    reducedDimensionality = utils.first(transformedData.values()).shape[1]
    for i in range(reducedDimensionality):
      dimensionI = np.zeros(shape=(numberOfSample,numberOfHistoryStep))
      newColumnName = self.labelFeature + str(i + 1)
      for t in range(numberOfHistoryStep):
        dimensionI[:, t] = transformedData[t][:, i]
      outputDict['outputs'][newColumnName] = dimensionI
  if 'cluster' == self.unSupervisedEngine.getDataMiningType():
    ## SKL will always enumerate cluster centers starting from zero, if this
    ## is violated, then the indexing below will break.
    # NOTE(review): 'clusterCentersIndices' is only bound when the key exists; the
    # loop below dereferences it unconditionally — confirm the engine always sets it
    if 'clusterCentersIndices' in self.unSupervisedEngine.metaDict.keys():
      clusterCentersIndices = self.unSupervisedEngine.metaDict['clusterCentersIndices']
    # NOTE(review): 'clusterCenters' is fetched but never used below (the metaDict is
    # re-read directly inside the loop)
    if 'clusterCenters' in self.unSupervisedEngine.metaDict.keys():
      clusterCenters = self.unSupervisedEngine.metaDict['clusterCenters']
    # Output cluster centroid to solutionExport
    if self.solutionExport is not None:
      rlzDims = {}
      rlzs = {}
      # NOTE(review): 'labels' comes from the relabeling block above and is only
      # defined when the engine produced a 'labels' output — TODO confirm clustering
      # engines always do
      clusterLabels = range(int(np.max(labels)) + 1)
      rlzs[self.labelFeature] = np.atleast_1d(clusterLabels)
      rlzs[self.pivotParameter] = self.pivotVariable
      ## We will process each cluster in turn
      for rlzIndex in clusterLabels:
        ## Now we will process each feature available
        ## TODO: Ensure user requests each of these
        for featureIdx, feat in enumerate(self.unSupervisedEngine.features):
          ## We will go through the time series and find every instance
          ## where this cluster exists, if it does not, then we put a NaN
          ## to signal that the information is missing for this timestep
          timeSeries = np.zeros(numberOfHistoryStep)
          for timeIdx in range(numberOfHistoryStep):
            ## Here we use the assumption that SKL provides clusters that
            ## are integer values beginning at zero, which make for nice
            ## indexes with no need to add another layer of obfuscation
            if rlzIndex in clusterCentersIndices[timeIdx]:
              loc = clusterCentersIndices[timeIdx].index(rlzIndex)
              timeSeries[timeIdx] = self.unSupervisedEngine.metaDict['clusterCenters'][timeIdx][loc,featureIdx]
            else:
              timeSeries[timeIdx] = np.atleast_1d(np.nan)
          ## In summary, for each feature, we fill a temporary array and
          ## stuff it into the solutionExport, one question is how do we
          ## tell it which item we are exporting? I am assuming that if
          ## I add an input, then I need to do the corresponding
          ## updateOutputValue to associate everything with it? Once I
          ## call updateInputValue again, it will move the pointer? This
          ## needs verified
          if feat not in rlzs.keys():
            # first cluster seen for this feature: allocate the full matrix
            rlzs[feat] = np.zeros((len(clusterLabels), numberOfHistoryStep))
            rlzs[feat][rlzIndex] = copy.copy(timeSeries)
            rlzDims[feat] = [self.pivotParameter]
          else:
            rlzs[feat][rlzIndex] = copy.copy(timeSeries)
      self.solutionExport.load(rlzs, style='dict',dims=rlzDims)
    # NOTE(review): 'inertia' is retrieved but never used afterwards
    if 'inertia' in self.unSupervisedEngine.outputDict.keys():
      inertia = self.unSupervisedEngine.outputDict['inertia']
  elif 'mixture' == self.unSupervisedEngine.getDataMiningType():
    # pull the optional mixture-model metadata, defaulting each to None when absent
    if 'covars' in self.unSupervisedEngine.metaDict.keys():
      mixtureCovars = self.unSupervisedEngine.metaDict['covars']
    else:
      mixtureCovars = None
    # NOTE(review): 'mixturePrecs' is retrieved but never used below
    if 'precs' in self.unSupervisedEngine.metaDict.keys():
      mixturePrecs = self.unSupervisedEngine.metaDict['precs']
    else:
      mixturePrecs = None
    if 'componentMeanIndices' in self.unSupervisedEngine.metaDict.keys():
      componentMeanIndices = self.unSupervisedEngine.metaDict['componentMeanIndices']
    else:
      componentMeanIndices = None
    if 'means' in self.unSupervisedEngine.metaDict.keys():
      mixtureMeans = self.unSupervisedEngine.metaDict['means']
    else:
      mixtureMeans = None
    # Output cluster centroid to solutionExport
    if self.solutionExport is not None:
      ## We will process each cluster in turn
      rlzDims = {}
      rlzs = {}
      ## First store the label as the input for this cluster
      # NOTE(review): assumes componentMeanIndices is not None here — confirm the
      # engine always provides it for mixture models
      mixLabels = range(int(np.max(list(componentMeanIndices.values())))+1)
      rlzs[self.labelFeature] = np.atleast_1d(mixLabels)
      rlzs[self.pivotParameter] = self.pivotVariable
      for rlzIndex in mixLabels:
        ## Now we will process each feature available
        ## TODO: Ensure user requests each of these
        if mixtureMeans is not None:
          for featureIdx, feat in enumerate(self.unSupervisedEngine.features):
            ## We will go through the time series and find every instance
            ## where this cluster exists, if it does not, then we put a NaN
            ## to signal that the information is missing for this timestep
            timeSeries = np.zeros(numberOfHistoryStep)
            for timeIdx in range(numberOfHistoryStep):
              loc = componentMeanIndices[timeIdx].index(rlzIndex)
              timeSeries[timeIdx] = mixtureMeans[timeIdx][loc,featureIdx]
            ## In summary, for each feature, we fill a temporary array and
            ## stuff it into the solutionExport, one question is how do we
            ## tell it which item we are exporting? I am assuming that if
            ## I add an input, then I need to do the corresponding
            ## updateOutputValue to associate everything with it? Once I
            ## call updateInputValue again, it will move the pointer? This
            ## needs verified
            if feat not in rlzs.keys():
              rlzs[feat] = copy.copy(timeSeries)
              rlzDims[feat] = [self.pivotParameter]
            else:
              # stack one row per mixture component
              rlzs[feat] = np.vstack((rlzs[feat], copy.copy(timeSeries)))
        ## You may also want to output the covariances of each pair of
        ## dimensions as well
        if mixtureCovars is not None:
          for i,row in enumerate(self.unSupervisedEngine.features.keys()):
            # only the upper triangle (including the diagonal) of the covariance matrix
            for joffset,col in enumerate(list(self.unSupervisedEngine.features.keys())[i:]):
              j = i+joffset
              timeSeries = np.zeros(numberOfHistoryStep)
              for timeIdx in range(numberOfHistoryStep):
                loc = componentMeanIndices[timeIdx].index(rlzIndex)
                timeSeries[timeIdx] = mixtureCovars[timeIdx][loc][i,j]
              covPairName = 'cov_' + str(row) + '_' + str(col)
              if covPairName not in rlzs.keys():
                rlzs[covPairName] = timeSeries
                rlzDims[covPairName] = [self.pivotParameter]
              else:
                rlzs[covPairName] = np.vstack((rlzs[covPairName], timeSeries))
      self.solutionExport.load(rlzs, style='dict',dims=rlzDims)
  elif 'decomposition' == self.unSupervisedEngine.getDataMiningType():
    if self.solutionExport is not None:
      solutionExportDict = self.unSupervisedEngine.metaDict
      ## Get the transformation matrix and push it to a SolutionExport
      ## data object.
      ## Can I be sure of the order of dimensions in the features dict, is
      ## the same order as the data held in the UnSupervisedLearning object?
      if 'components' in solutionExportDict:
        components = solutionExportDict['components']
        ## Note, this implies some data exists (Really this information should
        ## be stored in a dictionary to begin with)
        numComponents,numDimensions = components[0].shape
        # reshape the per-step component lists into dense (component, time, dim) arrays
        componentsArray = np.zeros((numComponents, numberOfHistoryStep, numDimensions))
        evrArray = np.zeros((numComponents, numberOfHistoryStep))
        for timeIdx in range(numberOfHistoryStep):
          for componentIdx,values in enumerate(components[timeIdx]):
            componentsArray[componentIdx,timeIdx,:] = values
            evrArray[componentIdx, timeIdx] = solutionExportDict['explainedVarianceRatio'][timeIdx][componentIdx]
        rlzs = {}
        rlzDims = {}
        ## First store the dimension name as the input for this component
        rlzs[self.labelFeature] = np.atleast_1d(range(1,numComponents+1))
        rlzs[self.pivotParameter] = self.pivotVariable
        for dimIdx,dimName in enumerate(self.unSupervisedEngine.features.keys()):
          values = componentsArray[:,:,dimIdx]
          rlzs[dimName] = values
          rlzDims[dimName] = [self.pivotParameter]
        if 'explainedVarianceRatio' in solutionExportDict:
          rlzs['ExplainedVarianceRatio'] = evrArray
          rlzDims['ExplainedVarianceRatio'] = [self.pivotParameter]
        self.solutionExport.load(rlzs, style='dict',dims=rlzDims)
  return outputDict