Example 1
def _getPDFandCDFfromData(dataName, data, csv, methodInfo, interpolation,
                          generateCSV):
  """
    This method is used to convert some data into a PDF and CDF function.
    Note, it might be better done by scipy.stats.gaussian_kde
    @ In, dataName, str, The name of the data.
    @ In, data, np.array, one-dimensional array of the data to process
    @ In, csv, File, file to write out information on data.
    @ In, methodInfo, dict, the info about which processing method needs to be used
    @ In, interpolation, str, "linear" or "quadratic", depending on which interpolation is used
    @ In, generateCSV, bool, True if the csv should be written
    @ Out, (dataStats, cdfFunc, pdfFunc), tuple, dataStats is a dictionary with entries like "mean" and "stdev", cdfFunc is a function that returns the CDF value and pdfFunc is a function that returns the PDF value.
  """
  #Convert data to pdf and cdf.
  dataStats = __processData(data, methodInfo)
  dataKeys = set(dataStats.keys())
  counts = dataStats['counts']
  bins = dataStats['bins']
  countSum = sum(counts)
  binBoundaries = [dataStats['low']] + bins + [dataStats['high']]
  if generateCSV:
    utils.printCsv(csv, '"' + dataName + '"')
    utils.printCsv(csv, '"numBins"', dataStats['numBins'])
    utils.printCsv(csv, '"binBoundary"', '"binMidpoint"', '"binCount"', '"normalizedBinCount"', '"f_prime"', '"cdf"')
  cdf = [0.0] * len(counts)
  midpoints = [0.0] * len(counts)
  cdfSum = 0.0
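  # Build the empirical CDF: cumulative normalized bin counts, sampled at the bin midpoints.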
  for i in range(len(counts)):
    f0 = counts[i] / countSum
    cdfSum += f0
    cdf[i] = cdfSum
    midpoints[i] = (binBoundaries[i] + binBoundaries[i + 1]) / 2.0
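  # Interpolate the CDF over the midpoints; the 0.0/1.0 arguments are the values used outside the sampled range.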
  cdfFunc = mathUtils.createInterp(midpoints, cdf, 0.0, 1.0, interpolation)
  fPrimeData = [0.0] * len(counts)
  for i in range(len(counts)):
    h = binBoundaries[i + 1] - binBoundaries[i]
    nCount = counts[i] / countSum  # normalized count
    f0 = cdf[i]
    if i + 1 < len(counts):
      f1 = cdf[i + 1]
    else:
      f1 = 1.0
    if i + 2 < len(counts):
      f2 = cdf[i + 2]
    else:
      f2 = 1.0
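    # Derivative of the CDF at this bin: a forward difference for 'linear', or the
    # one-sided second-order stencil f'(x) ~ (-1.5*f0 + 2*f1 - 0.5*f2)/h otherwise.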
    if interpolation == 'linear':
      fPrime = (f1 - f0) / h
    else:
      fPrime = (-1.5 * f0 + 2.0 * f1 - 0.5 * f2) / h
    fPrimeData[i] = fPrime
    if generateCSV:
      utils.printCsv(csv, binBoundaries[i + 1], midpoints[i], counts[i], nCount, fPrime, cdf[i])
  pdfFunc = mathUtils.createInterp(midpoints, fPrimeData, 0.0, 0.0, interpolation)
  dataKeys -= {'numBins', 'counts', 'bins'}
  if generateCSV:
    for key in dataKeys:
      utils.printCsv(csv, '"' + key + '"', dataStats[key])
  return dataStats, cdfFunc, pdfFunc
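
As a point of comparison, the same histogram-to-CDF/PDF construction can be sketched with plain numpy/scipy, with scipy's interp1d standing in for mathUtils.createInterp. This is an illustrative sketch on synthetic data, not RAVEN code:

import numpy as np
from scipy import interpolate

# Synthetic data; interp1d stands in for mathUtils.createInterp (assumption).
data = np.random.normal(size=1000)
counts, edges = np.histogram(data, bins=10)
midpoints = (edges[:-1] + edges[1:]) / 2.0
# Empirical CDF: cumulative normalized bin counts at the bin midpoints.
cdfValues = np.cumsum(counts) / counts.sum()
cdfFunc = interpolate.interp1d(midpoints, cdfValues, kind='linear',
                               bounds_error=False, fill_value=(0.0, 1.0))
# PDF via forward differences of the CDF, as in the 'linear' branch above.
h = np.diff(edges)
cdfNext = np.append(cdfValues[1:], 1.0)
pdfValues = (cdfNext - cdfValues) / h
pdfFunc = interpolate.interp1d(midpoints, pdfValues, kind='linear',
                               bounds_error=False, fill_value=(0.0, 0.0))
print(cdfFunc(0.0), pdfFunc(0.0))  # roughly 0.5 and 0.4 for a standard normal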
Example 2
  def collectOutput(self, finishedJob, output):
    """
      Function to place all of the computed data into the output object
      @ In, finishedJob, JobHandler External or Internal instance, A JobHandler object that is in charge of running this post-processor
      @ In, output, dataObjects, The object where we want to place our computed results
      @ Out, None
    """
    self.raiseADebug("finishedJob: " + str(finishedJob) + ", output " + str(output))
    evaluation = finishedJob.getEvaluation()
    if isinstance(evaluation, Runners.Error):
      self.raiseAnError(RuntimeError, "No available output to collect (run possibly not finished yet)")

    outputDictionary = evaluation[1]
    self.dataDict.update(outputDictionary)

    dataToProcess = []
    for compareGroup in self.compareGroups:
      dataPulls = compareGroup.dataPulls
      reference = compareGroup.referenceData
      foundDataObjects = []
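      # Each data pull is a (dataObjectName, kind, remainingSpecifiers) tuple;
      # only single-field pulls are collected here.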
      for name, kind, rest in dataPulls:
        data = self.dataDict[name].getParametersValues(kind)
        if len(rest) == 1:
          foundDataObjects.append(data[rest[0]])
      dataToProcess.append((dataPulls, foundDataObjects, reference))
    generateCSV = False
    generatePointSet = False
    if isinstance(output,Files.File):
      generateCSV = True
    elif output.type == 'PointSet':
      generatePointSet = True
    else:
      self.raiseAnError(IOError, 'unsupported type ' + str(type(output)))
    if generateCSV:
      csv = output
    for dataPulls, datas, reference in dataToProcess:
      graphData = []
      if "name" in reference:
        distributionName = reference["name"]
        if distributionName not in self.distributions:
          self.raiseAnError(IOError, 'Did not find ' + distributionName +
                             ' in ' + str(self.distributions.keys()))
        else:
          distribution = self.distributions[distributionName]
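        # The reference curves come straight from the analytic distribution; the
        # bin-size floor is set to half a standard deviation.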
        refDataStats = {"mean":distribution.untruncatedMean(),
                        "stdev":distribution.untruncatedStdDev()}
        refDataStats["minBinSize"] = refDataStats["stdev"] / 2.0
        refPdf = lambda x: distribution.pdf(x)
        refCdf = lambda x: distribution.cdf(x)
        graphData.append((refDataStats, refCdf, refPdf, "ref_" + distributionName))
      for dataPull, data in zip(dataPulls, datas):
        dataStats = self.__processData(data, self.methodInfo)
        dataKeys = set(dataStats.keys())
        counts = dataStats['counts']
        bins = dataStats['bins']
        countSum = sum(counts)
        binBoundaries = [dataStats['low']] + bins + [dataStats['high']]
        if generateCSV:
          utils.printCsv(csv, '"' + str(dataPull) + '"')
          utils.printCsv(csv, '"numBins"', dataStats['numBins'])
          utils.printCsv(csv, '"binBoundary"', '"binMidpoint"', '"binCount"', '"normalizedBinCount"', '"f_prime"', '"cdf"')
        cdf = [0.0] * len(counts)
        midpoints = [0.0] * len(counts)
        cdfSum = 0.0
        for i in range(len(counts)):
          f0 = counts[i] / countSum
          cdfSum += f0
          cdf[i] = cdfSum
          midpoints[i] = (binBoundaries[i] + binBoundaries[i + 1]) / 2.0
        cdfFunc = mathUtils.createInterp(midpoints, cdf, 0.0, 1.0, self.interpolation)
        fPrimeData = [0.0] * len(counts)
        for i in range(len(counts)):
          h = binBoundaries[i + 1] - binBoundaries[i]
          nCount = counts[i] / countSum  # normalized count
          f0 = cdf[i]
          if i + 1 < len(counts):
            f1 = cdf[i + 1]
          else:
            f1 = 1.0
          if i + 2 < len(counts):
            f2 = cdf[i + 2]
          else:
            f2 = 1.0
          if self.interpolation == 'linear':
            fPrime = (f1 - f0) / h
          else:
            fPrime = (-1.5 * f0 + 2.0 * f1 - 0.5 * f2) / h
          fPrimeData[i] = fPrime
          if generateCSV:
            utils.printCsv(csv, binBoundaries[i + 1], midpoints[i], counts[i], nCount, fPrime, cdf[i])
        pdfFunc = mathUtils.createInterp(midpoints, fPrimeData, 0.0, 0.0, self.interpolation)
        dataKeys -= {'numBins', 'counts', 'bins'}
        if generateCSV:
          for key in dataKeys:
            utils.printCsv(csv, '"' + key + '"', dataStats[key])
        self.raiseADebug("dataStats: " + str(dataStats))
        graphData.append((dataStats, cdfFunc, pdfFunc, str(dataPull)))
      graphDataDict = mathUtils.getGraphs(graphData, self.fZStats)
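      # getGraphs returns a dict whose values are either scalars or lists of
      # columns shaped [header, row1, row2, ...]: print the headers first, then
      # one CSV row per index.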
      if generateCSV:
        for key in graphDataDict:
          value = graphDataDict[key]
          if isinstance(value, list):
            utils.printCsv(csv, *(['"' + l[0] + '"' for l in value]))
            for i in range(1, len(value[0])):
              utils.printCsv(csv, *([l[i] for l in value]))
          else:
            utils.printCsv(csv, '"' + key + '"', value)
      if generatePointSet:
        for key in graphDataDict:
          value = graphDataDict[key]
          if isinstance(value, list):
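            # The first column becomes the point set's input variable; the
            # remaining columns become outputs.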
            for i in range(len(value)):
              subvalue = value[i]
              name = subvalue[0]
              subdata = subvalue[1:]
              if i == 0:
                output.updateInputValue(name, subdata)
              else:
                output.updateOutputValue(name, subdata)
            break  # XXX Need to figure out way to specify which data to return
      if generateCSV:
        for i in range(len(graphData)):
          dataStat = graphData[i][0]
          def delist(l):
            """
              Method to create a string out of a list l
              @ In, l, list, the list to be 'stringed' out
              @ Out, delist, string, the string representing the list
            """
            if isinstance(l, list):
              return '_'.join([delist(x) for x in l])
            else:
              return str(l)
          newFileName = output.getBase() + "_" + delist(dataPulls) + "_" + str(i) + ".csv"
          if not isinstance(dataStat, dict):
            assert False  # every graphData entry should carry a stats dict
            continue
          dataPairs = []
          for key in sorted(dataStat.keys()):
            value = dataStat[key]
            if np.isscalar(value):
              dataPairs.append((key, value))
          extraCsv = Files.returnInstance('CSV',self)
          extraCsv.initialize(newFileName,self.messageHandler)
          extraCsv.open("w")
          extraCsv.write(",".join(['"' + str(x[0]) + '"' for x in dataPairs]))
          extraCsv.write("\n")
          extraCsv.write(",".join([str(x[1]) for x in dataPairs]))
          extraCsv.write("\n")
          extraCsv.close()
        utils.printCsv(csv)
Example 3

    def collectOutput(self, finishedJob, output):
        """
          Function to place all of the computed data into the output object
          @ In, finishedJob, JobHandler External or Internal instance, A JobHandler object that is in charge of running this post-processor
          @ In, output, dataObjects, The object where we want to place our computed results
          @ Out, None
        """
        self.raiseADebug("finishedJob: " + str(finishedJob) + ", output " +
                         str(output))
        evaluation = finishedJob.getEvaluation()
        if isinstance(evaluation, Runners.Error):
            self.raiseAnError(
                RuntimeError,
                "No available output to collect (run possibly not finished yet)"
            )

        outputDictionary = evaluation[1]
        self.dataDict.update(outputDictionary)

        dataToProcess = []
        for compareGroup in self.compareGroups:
            dataPulls = compareGroup.dataPulls
            reference = compareGroup.referenceData
            foundDataObjects = []
            for name, kind, rest in dataPulls:
                dataSet = self.dataDict[name].asDataset()
                if len(rest) == 1:
                    foundDataObjects.append(copy.copy(dataSet[rest[0]].values))
            dataToProcess.append((dataPulls, foundDataObjects, reference))
        if not isinstance(output, Files.File):
            self.raiseAnError(IOError, 'unsupported type ' + str(type(output)))
        for dataPulls, datas, reference in dataToProcess:
            graphData = []
            if "name" in reference:
                distributionName = reference["name"]
                if distributionName not in self.distributions:
                    self.raiseAnError(
                        IOError, 'Did not find ' + distributionName + ' in ' +
                        str(self.distributions.keys()))
                else:
                    distribution = self.distributions[distributionName]
                refDataStats = {
                    "mean": distribution.untruncatedMean(),
                    "stdev": distribution.untruncatedStdDev()
                }
                refDataStats["minBinSize"] = refDataStats["stdev"] / 2.0
                refPdf = lambda x: distribution.pdf(x)
                refCdf = lambda x: distribution.cdf(x)
                graphData.append(
                    (refDataStats, refCdf, refPdf, "ref_" + distributionName))
            for dataPull, data in zip(dataPulls, datas):
                dataStats, cdfFunc, pdfFunc = _getPDFandCDFfromData(
                    str(dataPull), data, output, self.methodInfo,
                    self.interpolation, True)
                self.raiseADebug("dataStats: " + str(dataStats))
                graphData.append((dataStats, cdfFunc, pdfFunc, str(dataPull)))
            graphDataDict = _getGraphs(graphData, self.fZStats)
            for key in graphDataDict:
                value = graphDataDict[key]
                if isinstance(value, list):
                    utils.printCsv(output,
                                   *(['"' + l[0] + '"' for l in value]))
                    for i in range(1, len(value[0])):
                        utils.printCsv(output, *([l[i] for l in value]))
                else:
                    utils.printCsv(output, '"' + key + '"', value)
            for i in range(len(graphData)):
                dataStat = graphData[i][0]

                def delist(l):
                    """
            Method to create a string out of a list l
            @ In, l, list, the list to be 'stringed' out
            @ Out, delist, string, the string representing the list
          """
                    if isinstance(l, list):
                        return '_'.join([delist(x) for x in l])
                    else:
                        return str(l)

                newFileName = output.getBase() + "_" + delist(
                    dataPulls) + "_" + str(i) + ".csv"
                if not isinstance(dataStat, dict):
                    assert False  # every graphData entry should carry a stats dict
                    continue
                dataPairs = []
                for key in sorted(dataStat.keys()):
                    value = dataStat[key]
                    if np.isscalar(value):
                        dataPairs.append((key, value))
                extraCsv = Files.returnInstance('CSV', self)
                extraCsv.initialize(newFileName, self.messageHandler)
                extraCsv.open("w")
                extraCsv.write(",".join(
                    ['"' + str(x[0]) + '"' for x in dataPairs]))
                extraCsv.write("\n")
                extraCsv.write(",".join([str(x[1]) for x in dataPairs]))
                extraCsv.write("\n")
                extraCsv.close()
            utils.printCsv(output)
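
As an aside, the nested delist helper simply flattens an arbitrarily nested list into an underscore-joined string, which the code above uses to build CSV file names from data-pull tuples. A minimal standalone check, with hypothetical input values:

def delist(l):
  """Flatten arbitrarily nested lists into an underscore-joined string."""
  if isinstance(l, list):
    return '_'.join([delist(x) for x in l])
  return str(l)

print(delist([['ans', 'Output', ['x']], 'y']))  # -> ans_Output_x_y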