コード例 #1
0
    def _toDataColumn_dateTime(self, data, mask):
        """Build a DataColumn by parsing every element with C{stringToValue}.

        Elements flagged by the incoming mask, float NaNs, and strings
        equal to "NAN" (in any case) are marked MISSING.  Elements for
        which C{stringToValue} raises ValueError or TypeError are
        marked INVALID.  Both kinds of element are filled with
        C{defs.PADDING}.

        @param data: The input values.
        @param mask: The input mask, or None.
        @rtype: DataColumn
        @return: The converted column, after C{_checkValues} and C{_checkIntervals} have been applied.
        """
        data, mask = self._checkNumpy(data, mask, tryToCast=False)
        data, inputMask = self._checkNonNumpy(data, mask)

        converted = NP("empty", len(data), dtype=self.dtype)
        flags = NP("zeros", len(data), dtype=defs.maskType)

        def isMissing(index, item):
            # same short-circuit order as a single "or" chain
            if inputMask is not None and inputMask[index]:
                return True
            if isinstance(item, float) and math.isnan(item):
                return True
            return isinstance(item, basestring) and item.upper() == "NAN"

        for index, item in enumerate(data):
            if isMissing(index, item):
                converted[index] = defs.PADDING
                flags[index] = defs.MISSING
                continue

            try:
                converted[index] = self.stringToValue(item)
            except (ValueError, TypeError):
                converted[index] = defs.PADDING
                flags[index] = defs.INVALID

        # an all-zero mask is represented as None
        if flags.any():
            outputMask = flags
        else:
            outputMask = None

        checked, outputMask = self._checkValues(converted, outputMask)
        checked, outputMask = self._checkIntervals(checked, outputMask)
        return DataColumn(self, checked, outputMask)
コード例 #2
0
    def _checkNumpy(self, data, mask, tryToCast=True):
        """Normalize Numpy inputs before they are turned into a DataColumn.

        A MaskedArray is split into its plain data and (if no explicit
        C{mask} was given) its own mask.  One-dimensional ndarrays are
        optionally cast to C{self.dtype}, and an ndarray mask is
        converted to C{defs.maskType} conventions.

        @param data: The input data; anything that is not a Numpy array passes through unchanged.
        @param mask: The input mask, or None.
        @type tryToCast: bool
        @param tryToCast: If True, attempt to cast ndarray data to C{self.dtype}; a failed cast is ignored and the data are left as they were.
        @rtype: 2-tuple
        @return: The normalized C{(data, mask)} pair.
        @raise TypeError: If the data array is not one-dimensional or the mask shape differs from the data shape.
        """
        if isinstance(data, NP.ma.MaskedArray):
            if mask is None:
                m = NP.ma.getmask(data)
                # getmask never returns None: a MaskedArray without a mask
                # yields the "nomask" sentinel, which must not be mistaken
                # for a real mask array
                if m is not NP.ma.nomask:
                    mask = m
            data = NP.ma.getdata(data)

        if isinstance(data, NP.ndarray):
            if len(data.shape) != 1:
                raise TypeError("DataColumns cannot be built from n > 1 dimensional arrays")
            if tryToCast and data.dtype != self.dtype:
                try:
                    data = NP("array", data, dtype=self.dtype)
                except (TypeError, ValueError):
                    # deliberate best effort: keep the uncast data
                    pass

        if isinstance(mask, NP.ndarray):
            if mask.shape != data.shape:
                raise TypeError("Mask, if provided, must have the same shape as data")
            if mask.dtype != defs.maskType:
                # any nonzero entry of a foreign-typed mask means MISSING
                mask = NP(NP(mask != 0) * defs.MISSING)

        return data, mask
コード例 #3
0
    def mapper(self, dataTable):
        """Score a DataTable with the clustering model and emit per-cluster sums.

        For every cluster, the rows whose (valid) score equals the
        cluster's name are selected, and one record is emitted that
        contains, per field name, the sum of the selected values
        ("numer") and the number of selected rows ("denom").

        @type dataTable: DataTable
        @param dataTable: The input data; the calculation runs on a sub-table of it.
        """
        # work on a sub-table so that the results of this calculation do not get propagated
        table = dataTable.subTable()

        self.metadata["ClusteringModel"].calculate(table, performanceTable=self.performanceTable)

        scoreColumn = table.score
        scores = scoreColumn.data
        scoreMask = scoreColumn.mask
        toValue = scoreColumn.fieldType.stringToValue

        for position, cluster in enumerate(self.clusters):
            # clusters without an "id" are named by their 1-based position
            clusterName = cluster.get("id", "%d" % (position + 1))

            members = NP(scores == toValue(clusterName))
            if scoreMask is not None:
                NP("logical_and", members, NP(scoreMask == defs.VALID), members)

            count = members.sum()

            numer = dict.fromkeys(self.fieldNames, 0.0)
            denom = dict.fromkeys(self.fieldNames, 0.0)
            for fieldName in self.fieldNames:
                numer[fieldName] += table.fields[fieldName].data[members].sum()
                denom[fieldName] += count

            self.emit(clusterName, {"numer": numer, "denom": denom})
コード例 #4
0
    def evaluate(self, dataTable, functionTable, performanceTable):
        """Evaluate the expression, using a DataTable as input.

        The result indicates, row by row, whether the field named by
        the "field" attribute equals the "value" attribute.  It is
        cast to this expression's field type and then passed through
        C{FieldCastMethods.applyMapMissingTo} with the "mapMissingTo"
        attribute.

        @type dataTable: DataTable
        @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable, containing any functions that might be called in this expression.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: DataColumn
        @return: The result of the calculation as a DataColumn.
        """

        performanceTable.begin("NormDiscrete")

        inputColumn = dataTable.fields[self["field"]]
        target = inputColumn.fieldType.stringToValue(self["value"])

        matches = NP(inputColumn.data == target)
        indicator = NP("array", matches, dtype=self._fieldType.dtype)

        indicator, indicatorMask = FieldCastMethods.applyMapMissingTo(
            self._fieldType, indicator, inputColumn.mask, self.get("mapMissingTo"))

        performanceTable.end("NormDiscrete")
        return DataColumn(self._fieldType, indicator, indicatorMask)
コード例 #5
0
ファイル: FieldCastMethods.py プロジェクト: soedjais/augustus
    def applyInvalidValueTreatment(mask, invalidValueTreatment, overwrite=False):
        """Replace INVALID values with MISSING if invalidValueTreatment is "asMissing".

        Unless C{overwrite} is True, the original mask is left
        untouched and a modified copy is returned.  Example use::

            mask = dataColumn.mask
            mask = FieldCastMethods.applyInvalidValueTreatment(mask, pmml.get("invalidValueTreatment"))
            return DataColumn(dataColumn.fieldType, dataColumn.data, mask)

        It can also be used in conjunction with other FieldCastMethods.

        @type mask: 1d Numpy array of dtype defs.maskType, or None
        @param mask: The mask.
        @type invalidValueTreatment: string
        @param invalidValueTreatment: One of "returnInvalid", "asIs", "asMissing"; only "asMissing" has an effect.
        @type overwrite: bool
        @param overwrite: If True, make the original mask writable and overwrite it in place (it is not re-locked here).
        @rtype: 1d Numpy array of dtype defs.maskType, or None
        @return: The new mask; the input is returned as-is when it is None or the treatment is not "asMissing".
        """

        # nothing to do without a mask or for any other treatment
        if mask is None or invalidValueTreatment != "asMissing":
            return mask

        target = mask if overwrite else NP("copy", mask)
        target.setflags(write=True)
        target[NP(target == defs.INVALID)] = defs.MISSING
        return target
コード例 #6
0
ファイル: PlotCurve.py プロジェクト: Huskyeder/augustus
    def generateSamples(self, low, high):
        """Used by C{prepare} to generate an array of samples.

        The number of samples and the sampling method are read from
        the "numSamples" and "samplingMethod" attributes.

        @type low: number
        @param low: Minimum value to sample.
        @type high: number
        @param high: Maximum value to sample.
        @rtype: 1d Numpy array
        @return: An array of uniform (evenly spaced, endpoints included) or random (sorted) samples of the interval.
        @raise NotImplementedError: For any sampling method other than "uniform" or "random".
        """

        numSamples = self.get("numSamples", defaultFromXsd=True, convertType=True)
        samplingMethod = self.get("samplingMethod", defaultFromXsd=True)

        if samplingMethod == "uniform":
            return NP("linspace", low, high, numSamples, endpoint=True)

        if samplingMethod == "random":
            unit = NP(NP.random.rand(numSamples))
            scaled = NP(unit * (high - low))
            samples = NP(scaled + low)
            samples.sort()
            return samples

        raise NotImplementedError("TODO: add 'adaptive'")
コード例 #7
0
    def select(self, dataTable, functionTable, performanceTable):
        """Evaluate the child predicate or expression and return a selection.

        A child predicate, if present, is evaluated and its result is
        returned directly.  Otherwise the child expression is
        evaluated; it must be boolean, and rows whose mask is not
        VALID are set to False in its data.

        @type dataTable: DataTable
        @param dataTable: Contains the data to plot.
        @type functionTable: FunctionTable
        @param functionTable: Defines functions that may be used to transform data for plotting.
        @type performanceTable: PerformanceTable
        @param performanceTable: Measures and records performance (time and memory consumption) of the drawing process.
        @rtype: 1d Numpy array of bool
        @return: The result of the expression or predicate as a Numpy mask.
        @raise defs.PmmlValidationError: If the expression does not evaluate to a boolean field type.
        """

        predicate = self.childOfClass(PmmlPredicate)
        if predicate is not None:
            return predicate.evaluate(dataTable, functionTable, performanceTable)

        column = self.childOfClass(PmmlExpression).evaluate(
            dataTable, functionTable, performanceTable)

        if not column.fieldType.isboolean():
            raise defs.PmmlValidationError(
                "PlotSelection must evaluate to boolean, not %r" %
                column.fieldType)

        # the data are modified in place below, so unlock the column first
        column._unlock()
        columnMask = column.mask
        if columnMask is not None:
            isValid = NP(columnMask == defs.VALID)
            NP("logical_and", column.data, isValid, column.data)

        return column.data
コード例 #8
0
    def _toDataColumn_dateTimeNumber(self, data, mask):
        """Build a DataColumn from numbers, rescaled by C{_factor} and shifted by C{_offset}.

        @param data: The input values, converted with C{_toDataColumn_number}.
        @param mask: The input mask, or None.
        @rtype: DataColumn
        @return: The converted column, after C{_checkValues} and C{_checkIntervals} have been applied.
        """
        numberColumn = self._toDataColumn_number(data, mask)

        scaled = NP(numberColumn.data * self._factor)
        shifted = NP(scaled + self._offset)

        checked, checkedMask = self._checkValues(shifted, numberColumn.mask)
        checked, checkedMask = self._checkIntervals(checked, checkedMask)
        return DataColumn(self, checked, checkedMask)
コード例 #9
0
ファイル: DataColumn.py プロジェクト: soedjais/augustus
    def subDataColumn(self, selection=None):
        """Return or filter this DataColumn with C{selection}.

        If C{selection} is None, this function returns a shallow copy
        of the DataColumn.  It has a new Python C{id}, but the
        potentially large numerical array is not copied.  This
        function can therefore be used in performance-critical
        situations.

        If indexing with C{selection} yields a single element rather
        than an array, the element (and its mask entry, if any) is
        wrapped in a one-element array.

        @type selection: 1d Numpy array of dtype bool, or None
        @param selection: If None, simply return the DataColumn; otherwise, use the boolean array to filter it.
        @rtype: DataColumn
        @return: A DataColumn of the same length or shorter.
        """

        if selection is None:
            return DataColumn(self._fieldType, self._data, self._mask)

        subData = self.data[selection]
        if self.mask is None:
            subMask = None
        else:
            subMask = self.mask[selection]

        if not isinstance(subData, NP.ndarray):
            # a scalar came back: promote it to a length-1 column
            subData = NP("array", [subData])
            # identity test; "!=" on Numpy objects is an element-wise comparison
            if subMask is not None:
                subMask = NP("array", [subMask])

        return DataColumn(self._fieldType, subData, subMask)
コード例 #10
0
ファイル: KMeans.py プロジェクト: soedjais/augustus
    def mapReduce(self):
        """Build a MapReduce-Ready K-means producer.

        Used by C{optimize} and C{hadoopOptimize}.  The application
        class is also stored on C{self.KMeansMapReduceApplication}.
        Its C{metadata} holds the clustering model and a dictionary of
        cluster center vectors keyed by cluster name.

        @rtype: MapReduce
        @return: An instance of MapReduce that can either be run in pure-Python mode or submitted to Hadoop.
        """
        changeThreshold = self.allChangeThreshold

        class KMeansMapReduceApplication(MapReduceKMeans):
            metadata = {}
            allChangeThreshold = changeThreshold

        sharedMetadata = KMeansMapReduceApplication.metadata
        sharedMetadata["ClusteringModel"] = self.clusteringModel

        def namedVector(position, cluster):
            # clusters without an "id" are named by their 1-based position
            name = cluster.get("id", "%d" % (position + 1))
            vector = NP("array", cluster.childOfTag("Array").values(), dtype=NP.dtype(float))
            return name, vector

        clusters = self.clusteringModel.xpath("pmml:Cluster")
        sharedMetadata["clusterVectors"] = dict(
            namedVector(position, cluster) for position, cluster in enumerate(clusters))

        self.KMeansMapReduceApplication = KMeansMapReduceApplication

        return MapReduce(KMeansMapReduceApplication)
コード例 #11
0
 def applyWithoutMask(self, data, mask, argument):
     """Fold one argument into a running exclusive-or over all rows, in place.

     @param data: A C{(values, allbad)} pair; both arrays are updated in place.
     @param mask: Passed through unchanged.
     @param argument: The next operand; its C{data} is xor-ed into the values, and C{allbad} stays True only where its mask is not VALID.
     @return: C{((values, allbad), mask)}.
     """
     values, allbad = data
     NP("logical_xor", values, argument.data, values)

     argumentMask = argument.mask
     if argumentMask is not None:
         argumentBad = NP(argumentMask != defs.VALID)
         NP("logical_and", allbad, argumentBad, allbad)

     return (values, allbad), mask
コード例 #12
0
    def evaluate(self,
                 dataTable,
                 functionTable,
                 performanceTable,
                 returnUnknowns=False):
        """Evaluate the predicate, using a DataTable as input.

        This predicate selects no rows: the selection is all False.

        @type dataTable: DataTable
        @param dataTable: The input DataTable; only its length is used.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable, containing any functions that might be called in this predicate.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @type returnUnknowns: bool
        @param returnUnknowns: If True, return a "mask" for the selection that indicates which rows are unknown, rather than True or False.
        @rtype: 1d Numpy array of bool or 3-tuple of arrays
        @return: Either a simple selection array or selection, unknowns, encounteredUnknowns (the last two are the same all-False array object).
        """

        performanceTable.begin("Predicate False")

        boolType = NP.dtype(bool)
        selection = NP("zeros", len(dataTable), dtype=boolType)

        if returnUnknowns:
            nothingUnknown = NP("zeros", len(dataTable), dtype=NP.dtype(bool))
            output = (selection, nothingUnknown, nothingUnknown)
        else:
            output = selection

        performanceTable.end("Predicate False")
        return output
コード例 #13
0
ファイル: Aggregate.py プロジェクト: soedjais/augustus
    def functionAverageFake(self, value, howmany, fieldType):
        """Averages rows in a DataColumn when it is known that there are no matches.

        Every row of the result is C{value[0] / value[1]}.  If
        C{value[1]} is zero, every row is marked INVALID.

        @type value: indexable pair of numbers
        @param value: The numerator (index 0) and denominator (index 1) of the constant average.
        @type howmany: int
        @param howmany: Number of rows.
        @type fieldType: FieldType
        @param fieldType: Ignored; the result always has a double, continuous field type.
        @rtype: DataColumn
        @return: The faked results.
        """

        averageType = FakeFieldType("double", "continuous")

        tops = NP("empty", howmany, dtype=averageType.dtype)
        bottoms = NP("empty", howmany, dtype=averageType.dtype)
        tops[:] = value[0]
        bottoms[:] = value[1]
        averages = NP(tops / bottoms)

        allInvalid = None
        if value[1] == 0:
            # division by zero: nothing in the column is usable
            allInvalid = NP("empty", howmany, dtype=defs.maskType)
            allInvalid[:] = defs.INVALID

        return DataColumn(averageType, averages, allInvalid)
コード例 #14
0
ファイル: BaselineModel.py プロジェクト: Huskyeder/augustus
    def cusum(self, testDistributions, fieldName, dataColumn, state, performanceTable):
        """Calculate the score of a CUSUM TestStatistic.

        The CUSUM cumulative sum is a stateful calculation: each row
        depends on the result of the previous row.  To continue
        calculations through multiple calls to C{calc} or
        C{calculate}, pass a DataTableState object and give the
        BaselineModel a C{stateId} attribute.  The C{stateId} is not
        valid in strict PMML, but it can be inserted after validation
        or used in custom-ODG models (C{from augustus.odg import *}).

        Rows whose mask is not VALID do not update the sum; they
        repeat the previous row's value.

        @type testDistributions: PmmlBinding
        @param testDistributions: The <TestDistributions> element.
        @type fieldName: string
        @param fieldName: The field name (for error messages).
        @type dataColumn: DataColumn
        @param dataColumn: The field.
        @type state: DataTableState
        @param state: The persistent state object, which is used to initialize the start state and save the end state of the cumulative sum.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: dict
        @return: A dictionary mapping PMML "feature" strings to DataColumns; CUSUM only defines the None key ("predictedValue").
        @raise defs.PmmlValidationError: If no suitable Baseline or Alternate distribution is found.
        """

        baseline = testDistributions.xpath("pmml:Baseline/pmml:GaussianDistribution | pmml:Baseline/pmml:PoissonDistribution")
        alternate = testDistributions.xpath("pmml:Alternate/pmml:GaussianDistribution | pmml:Alternate/pmml:PoissonDistribution")

        if len(baseline) == 0 or len(alternate) == 0:
            raise defs.PmmlValidationError("BaselineModel CUSUM requires a Baseline and an Alternate that are either GaussianDistribution or PoissonDistribution")

        # log-likelihood ratio of alternate to baseline, row by row
        logRatios = alternate[0].logpdf(dataColumn.data) - baseline[0].logpdf(dataColumn.data)

        if dataColumn.mask is not None:
            usable = NP(dataColumn.mask == defs.VALID)
        else:
            usable = NP("ones", len(dataColumn), dtype=NP.dtype(bool))

        # resume from the saved state, if there is one
        stateId = self.get("stateId")
        if stateId is not None:
            running = state.get(stateId)
        else:
            running = None
        if running is None:
            running = 0.0

        resetValue = testDistributions.get("resetValue", defaultFromXsd=True, convertType=True)

        sums = NP("empty", len(dataColumn), dtype=NP.dtype(float))

        performanceTable.begin("fill CUSUM")
        for row in xrange(len(dataColumn)):
            if usable[row]:
                running = max(resetValue, running + logRatios[row])
            sums[row] = running
        performanceTable.end("fill CUSUM")

        if stateId is not None:
            state[stateId] = running

        return {None: DataColumn(self.scoreType, sums, None)}
コード例 #15
0
 def applyWithMask(self, data, mask, argument, mask2):
     """Fold one argument into a running exclusive-or, restricted to the rows selected by C{mask2}.

     This mirrors C{applyWithoutMask}: within the selected rows,
     C{allbad} stays True only where it was already True and the
     argument's mask is not VALID.

     @param data: A C{(values, allbad)} pair; the selected rows of both arrays are updated in place.
     @param mask: Passed through unchanged.
     @param argument: The next operand, providing C{data} and C{mask}.
     @param mask2: Selection of the rows to update.
     @return: C{((values, allbad), mask)}.
     """
     data, allbad = data
     data[mask2] = NP("logical_xor", data[mask2], argument.data[mask2])
     if argument.mask is not None:
         # compare the argument's mask (not the boolean accumulator)
         # against VALID, exactly as applyWithoutMask does
         allbad[mask2] = NP("logical_and",
                            allbad[mask2],
                            NP(argument.mask[mask2] != defs.VALID))
     return (data, allbad), mask
コード例 #16
0
ファイル: Aggregate.py プロジェクト: Huskyeder/augustus
    def functionMax(self, dataColumn, whereMask, groupSelection, getstate, setstate):
        """Finds the maximum of rows in a DataColumn, possibly with an SQL where mask and groupField.

        The result is a running maximum: each row holds the largest
        selected value seen so far (including any starting state).
        Rows that precede the first selected value, when there is no
        starting state, are marked INVALID.

        @type dataColumn: DataColumn
        @param dataColumn: The input data column.
        @type whereMask: 1d Numpy array of bool, or None
        @param whereMask: The result of the SQL where selection.
        @type groupSelection: 1d Numpy array of bool, or None.
        @param groupSelection: Rows corresponding to a particular value of the groupField.
        @type getstate: callable function
        @param getstate: Retrieve starting values from the DataTableState.
        @type setstate: callable function
        @param setstate: Store ending values to the DataTableState.
        @rtype: DataColumn
        @return: A column of maximized rows.
        @raise defs.PmmlValidationError: If the input field is neither continuous nor ordinal.
        """

        fieldType = dataColumn.fieldType

        if fieldType.optype not in ("continuous", "ordinal"):
            # the message names this aggregate ("max"), not its sibling "min"
            raise defs.PmmlValidationError("Aggregate function \"max\" requires a continuous or ordinal input field")

        if dataColumn.mask is None:
            selection = NP("ones", len(dataColumn), dtype=NP.dtype(bool))
        else:
            selection = NP(dataColumn.mask == defs.VALID)

        if whereMask is not None:
            NP("logical_and", selection, whereMask, selection)

        if groupSelection is not None:
            NP("logical_and", selection, groupSelection, selection)

        maximum = None
        if getstate is not None:
            startingState = getstate()
            if startingState is not None:
                maximum = startingState

        data = NP("empty", len(dataColumn), dtype=fieldType.dtype)
        mask = NP("zeros", len(dataColumn), dtype=defs.maskType)

        for i, x in enumerate(dataColumn.data):
            if selection[i]:
                if maximum is None or x > maximum:
                    maximum = x
            if maximum is None:
                # no maximum exists yet, so this row has no defined value
                mask[i] = defs.INVALID
            else:
                data[i] = maximum

        # an all-zero mask is represented as None
        if not mask.any():
            mask = None

        if setstate is not None:
            setstate(maximum)

        return DataColumn(fieldType, data, mask)
コード例 #17
0
ファイル: PmmlModel.py プロジェクト: soedjais/augustus
    def calculate(self, dataTable, functionTable=None, performanceTable=None):
        """Perform a calculation directly, without constructing a
        DataTable first.

        This method is intended for performance-critical cases where
        the DataTable would be built without having to analyze the
        PMML for field type context.

        This method modifies the input DataTable: it sets its
        C{score}, adds score fields under the model's name (if the
        model has one), and fills its C{output}.

        @type dataTable: DataTable
        @param dataTable: The pre-built DataTable.
        @type functionTable: FunctionTable or None
        @param functionTable: A table of functions; a new FunctionTable is created if None.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation; a FakePerformanceTable is used if None.
        @rtype: DataTable or DataColumn
        @return: The input DataTable if the model is not scorable (its score is then entirely INVALID); otherwise the score DataColumn, C{dataTable.score}.
        """

        if functionTable is None:
            functionTable = FunctionTable()
        if performanceTable is None:
            performanceTable = FakePerformanceTable()

        isScorable = self.get("isScorable", defaultFromXsd=True, convertType=True)
        if not isScorable:
            # unscorable models produce a fully padded, fully INVALID score
            scoreType = self.scoreType
            padding = NP(NP("ones", len(dataTable), dtype=self.scoreType.dtype) * defs.PADDING)
            allInvalid = NP(NP("ones", len(dataTable), dtype=defs.maskType) * defs.INVALID)
            dataTable.score = DataColumn(scoreType, padding, allInvalid)
            return dataTable

        workTable = dataTable.subTable()

        for miningField in self.xpath("pmml:MiningSchema/pmml:MiningField"):
            miningField.replaceField(workTable, functionTable, performanceTable)

        for calculable in self.calculableTrans():
            calculable.calculate(workTable, functionTable, performanceTable)

        scores = self.calculateScore(workTable, functionTable, performanceTable)
        dataTable.score = scores[None]

        if self.name is not None:
            for feature, column in scores.items():
                # the None feature is stored under the bare model name
                if feature is None:
                    target = self.name
                else:
                    target = "%s.%s" % (self.name, feature)
                dataTable.fields[target] = column

        for outputField in self.xpath("pmml:Output/pmml:OutputField"):
            label = outputField.get("displayName", outputField["name"])
            dataTable.output[label] = outputField.format(workTable, functionTable, performanceTable, scores)

        # copy over any output produced on the working sub-table
        for outputName in workTable.output:
            dataTable.output[outputName] = workTable.output[outputName]

        return dataTable.score
コード例 #18
0
    def finalizeDistance(self, state, adjustM, distributionBased, covarianceMatrix):
        """Third and final step in a vectorized metric calculation, called once after all fields and cluster centers.

        Only modifies the C{state} object: for distribution-based
        clustering, C{state.displacements} is normalized in place, and
        when C{adjustM} is None the returned array is
        C{state.sumInQuadrature} itself, which is then also scaled in
        place.

        @type state: ad-hoc Python object
        @param state: State information that persists long enough to span the three steps of a metric calculation.  This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState.
        @type adjustM: 1d Numpy array of numbers, or None
        @param adjustM: The "adjustM" value, intended to adjust for missing values, as defined in the PMML specification.
        @type distributionBased: bool
        @param distributionBased: If True, use a covariance matrix to scale the distance result.
        @type covarianceMatrix: Numpy matrix
        @param covarianceMatrix: The covariance matrix to scale the result if C{distributionBased}.
        @rtype: 1d Numpy array of numbers
        @return: The array of distances or similarities for center-based clustering, and number of standard deviations for distribution-based clustering.
        """

        if adjustM is not None:
            result = NP(state.sumInQuadrature * adjustM)
        else:
            result = state.sumInQuadrature

        if not distributionBased:
            return result

        displacements = state.displacements

        # normalize each nonzero displacement vector to unit length
        lengths = NP("sqrt", NP("sum", NP(displacements**2), axis=1))
        nonzero = NP(lengths > 0.0)
        displacements[nonzero] = displacements[nonzero] / (lengths[:, NP.newaxis])[nonzero]

        projected = NP(displacements.dot(covarianceMatrix))
        lengthOfSigma = NP("sum", NP(projected * displacements), axis=1)

        result[nonzero] = NP(result[nonzero] / lengthOfSigma[nonzero])

        return result
コード例 #19
0
ファイル: FieldType.py プロジェクト: Huskyeder/augustus
    def _checkValues(self, data, mask):
        """Apply this field type's Value specifications to data, producing an updated mask.

        Each entry of C{self.values} declares a value as "valid" (the
        default), "missing", or "invalid".  If at least one valid
        value is declared, every row that is neither a declared valid
        value nor missing becomes INVALID.  Otherwise rows are VALID
        unless they are missing or match a declared invalid value.
        Display values are recorded in C{self._displayValue} as a
        side effect.

        @param data: The data to check.
        @param mask: The incoming mask, or None.
        @rtype: 2-tuple
        @return: C{(data, mask)}; the mask is None if every row is valid, and the input pair is returned unchanged if no values are declared.
        @raise defs.PmmlValidationError: If a declared value cannot be parsed by C{stringToValue}.
        """
        declared = self.values
        if len(declared) == 0:
            return data, mask

        boolType = NP.dtype(bool)
        if mask is not None:
            missing = NP(mask == defs.MISSING)
            invalid = NP(mask == defs.INVALID)
        else:
            missing = NP("zeros", len(data), dtype=boolType)
            invalid = NP("zeros", len(data), dtype=NP.dtype(bool))
        valid = NP("zeros", len(data), dtype=NP.dtype(bool))

        # each recognized property accumulates matches into its own array
        flagsByProperty = {"valid": valid, "missing": missing, "invalid": invalid}

        validDeclarations = 0
        for declaration in declared:
            raw = declaration.get("value")
            label = declaration.get("displayValue")
            if label is not None:
                self._displayValue[raw] = label

            prop = declaration.get("property", "valid")
            try:
                parsed = self.stringToValue(raw)
            except ValueError:
                raise defs.PmmlValidationError("Improper value in Value specification: \"%s\"" % raw)

            flags = flagsByProperty.get(prop)
            if flags is None:
                # unrecognized properties are ignored
                continue
            NP("logical_or", flags, NP(data == parsed), flags)
            if flags is valid:
                validDeclarations += 1

        if validDeclarations > 0:
            # guilty until proven innocent
            NP("logical_and", valid, NP("logical_not", missing), valid)
            if valid.all():
                return data, None
            newMask = NP(NP("ones", len(data), dtype=defs.maskType) * defs.INVALID)
            newMask[missing] = defs.MISSING
            newMask[valid] = defs.VALID
            return data, newMask

        # innocent until proven guilty
        NP("logical_and", invalid, NP("logical_not", missing), invalid)
        if not NP("logical_or", invalid, missing).any():
            return data, None
        newMask = NP("zeros", len(data), dtype=defs.maskType)
        newMask[missing] = defs.MISSING
        newMask[invalid] = defs.INVALID
        return data, newMask
コード例 #20
0
ファイル: Aggregate.py プロジェクト: Huskyeder/augustus
    def functionAverage(self, dataColumn, whereMask, groupSelection, getstate, setstate):
        """Averages rows in a DataColumn, possibly with an SQL where mask and groupField.

        The result is a running average: each row holds the
        cumulative sum of the selected values divided by the
        cumulative count of selected rows, both including any
        starting state.  Rows whose average is not finite are marked
        INVALID.

        @type dataColumn: DataColumn
        @param dataColumn: The input data column.
        @type whereMask: 1d Numpy array of bool, or None
        @param whereMask: The result of the SQL where selection.
        @type groupSelection: 1d Numpy array of bool, or None.
        @param groupSelection: Rows corresponding to a particular value of the groupField.
        @type getstate: callable function
        @param getstate: Retrieve starting values from the DataTableState.
        @type setstate: callable function
        @param setstate: Store ending values to the DataTableState.
        @rtype: DataColumn
        @return: A column of averaged rows.
        @raise defs.PmmlValidationError: If the input field is not numeric.
        """

        averageType = FakeFieldType("double", "continuous")

        if dataColumn.fieldType.dataType not in ("integer", "float", "double"):
            raise defs.PmmlValidationError("Aggregate function \"average\" requires a numeric input field: \"integer\", \"float\", \"double\"")

        # 1.0 for every row that counts toward the average, 0.0 otherwise
        counts = NP("ones", len(dataColumn), dtype=averageType.dtype)

        filters = []
        if dataColumn.mask is not None:
            filters.append(NP(dataColumn.mask == defs.VALID))
        filters.append(whereMask)
        filters.append(groupSelection)
        for rowFilter in filters:
            if rowFilter is not None:
                NP("logical_and", counts, rowFilter, counts)

        totals = NP("multiply", counts, dataColumn.data)

        hasRows = len(dataColumn) > 0

        if getstate is not None and hasRows:
            startingState = getstate()
            if startingState is not None:
                # seed the first row so the cumulative sums carry the state
                startingTotal, startingCount = startingState
                totals[0] += startingTotal
                counts[0] += startingCount

        totals = NP("cumsum", totals)
        counts = NP("cumsum", counts)

        averages = NP(totals / counts)
        notFinite = NP("logical_not", NP("isfinite", averages))
        mask = NP(notFinite * defs.INVALID)
        if not mask.any():
            mask = None

        if setstate is not None and hasRows:
            setstate((totals[-1], counts[-1]))

        return DataColumn(averageType, averages, mask)
コード例 #21
0
ファイル: Aggregate.py プロジェクト: Huskyeder/augustus
    def functionMultiset(self, dataColumn, whereMask, groupSelection, getstate, setstate):
        """Derives a multiset of rows in a DataColumn, possibly with an SQL where mask and groupField.

        Each row of the result holds a dictionary that maps every
        selected value seen so far to its number of occurrences.
        Consecutive rows that add nothing share the same dictionary
        object.  A starting state, if any, is updated in place.

        @type dataColumn: DataColumn
        @param dataColumn: The input data column.
        @type whereMask: 1d Numpy array of bool, or None
        @param whereMask: The result of the SQL where selection.
        @type groupSelection: 1d Numpy array of bool, or None.
        @param groupSelection: Rows corresponding to a particular value of the groupField.
        @type getstate: callable function
        @param getstate: Retrieve starting values from the DataTableState.
        @type setstate: callable function
        @param setstate: Store ending values to the DataTableState.
        @rtype: DataColumn of dict objects
        @return: A column of multisetted rows.
        """

        multisetType = FakeFieldType("object", "any")

        included = NP("ones", len(dataColumn), dtype=NP.dtype(bool))
        if dataColumn.mask is not None:
            included = NP("logical_and", included, NP(dataColumn.mask == defs.VALID))

        for rowFilter in (whereMask, groupSelection):
            if rowFilter is not None:
                NP("logical_and", included, rowFilter, included)

        counts = {}
        if getstate is not None:
            saved = getstate()
            if saved is not None:
                counts = saved
        snapshot = dict(counts)

        rows = NP("empty", len(dataColumn), dtype=NP.dtype(object))

        toPython = dataColumn.fieldType.valueToPython
        for position, raw in enumerate(dataColumn.data):
            if included[position]:
                key = toPython(raw)
                counts[key] = counts.get(key, 0) + 1
                # take a fresh copy only when the counts actually change
                snapshot = dict(counts)
            rows[position] = snapshot

        if setstate is not None:
            setstate(counts)

        return DataColumn(multisetType, rows, None)
コード例 #22
0
ファイル: PlotCurve.py プロジェクト: Huskyeder/augustus
    def pointsToSmoothCurve(xarray, yarray, samples, smoothingScale, loop):
        """Fit a smooth line through a set of given numeric points
        with a characteristic smoothing scale.

        This is a non-parametric locally linear fit, used to plot data
        as a smooth line: at every sample position, a straight line is
        fitted by weighted least squares, with Gaussian weights
        centered on the sample.

        @type xarray: 1d Numpy array of numbers
        @param xarray: Array of x values.
        @type yarray: 1d Numpy array of numbers
        @param yarray: Array of y values.
        @type samples: 1d Numpy array of numbers
        @param samples: Locations at which to fit the C{xarray} and C{yarray} with best-fit positions and derivatives.
        @type smoothingScale: number
        @param smoothingScale: Standard deviation of the Gaussian kernel used to smooth the locally linear fit.
        @type loop: bool
        @param loop: If False, disconnect the end of the fitted curve from the beginning.
        @rtype: 4-tuple of 1d Numpy arrays
        @return: C{xlist}, C{ylist}, C{dxlist}, C{dylist} appropriate for C{formatPathdata}.
        """

        fitted = []
        slopes = []

        for sample in samples:
            # Gaussian kernel centered on this sample
            squaredDistance = NP("power", NP(xarray - sample), 2)
            exponent = NP(NP(-0.5 * squaredDistance) / NP(smoothingScale * smoothingScale))
            weights = NP(NP(NP("exp", exponent) / smoothingScale) / (math.sqrt(2.0*math.pi)))

            # weighted moments for the least-squares line
            sum1 = weights.sum()
            sumx = NP(weights * xarray).sum()
            sumxx = NP(weights * NP(xarray * xarray)).sum()
            sumy = NP(weights * yarray).sum()
            sumxy = NP(weights * NP(xarray * yarray)).sum()

            delta = (sum1 * sumxx) - (sumx * sumx)
            intercept = ((sumxx * sumy) - (sumx * sumxy)) / delta
            slope = ((sum1 * sumxy) - (sumx * sumy)) / delta

            fitted.append(intercept + (sample * slope))
            slopes.append(slope)

        xlist = samples
        ylist = NP("array", fitted, dtype=NP.dtype(float))

        # half the distance between each sample's two neighbors (wrapping around)
        nextX = NP("roll", xlist, -1)
        previousX = NP("roll", xlist, 1)
        dxlist = NP((nextX - previousX) / 2.0)
        dylist = NP("array", slopes, dtype=NP.dtype(float)) * dxlist

        if not loop:
            for tangents in (dxlist, dylist):
                tangents[0] = 0.0
                tangents[-1] = 0.0

        return xlist, ylist, dxlist, dylist
コード例 #23
0
ファイル: PlotCurve.py プロジェクト: soedjais/augustus
    def generateSamples(self, low, high):
        """Build the array of sample positions used by C{prepare}.

        The number of samples and the sampling method are read from
        this element's "numSamples" and "samplingMethod" attributes.

        @type low: number
        @param low: Lower end of the interval to sample.
        @type high: number
        @param high: Upper end of the interval to sample.
        @rtype: 1d Numpy array
        @return: Evenly spaced samples (both endpoints included) for "uniform", or sorted uniformly-distributed random samples for "random".
        @raise NotImplementedError: If the sampling method is anything other than "uniform" or "random".
        """

        count = self.get("numSamples", defaultFromXsd=True, convertType=True)
        method = self.get("samplingMethod", defaultFromXsd=True)

        if method == "uniform":
            return NP("linspace", low, high, count, endpoint=True)

        if method == "random":
            # stretch unit-interval random numbers onto [low, high)
            unit = NP(NP.random.rand(count))
            stretched = NP(unit * (high - low))
            points = NP(stretched + low)
            points.sort()
            return points

        raise NotImplementedError("TODO: add 'adaptive'")
コード例 #24
0
 def _fromDataColumn_number(self, dataColumn):
     """Convert a numeric DataColumn into a Numpy array of Python objects.

     Without a mask the data are simply re-cast as an object array.
     With a mask, VALID rows keep their value, MISSING rows become
     C{defs.NAN}, and all other rows become None.
     """
     flags = dataColumn.mask
     if flags is None:
         return NP("array", dataColumn.data, dtype=NP.dtype(object))

     converted = NP("empty", len(dataColumn), dtype=NP.dtype(object))
     for index, value in enumerate(dataColumn.data):
         flag = flags[index]
         if flag == defs.VALID:
             converted[index] = value
         elif flag == defs.MISSING:
             converted[index] = defs.NAN
         else:
             converted[index] = None
     return converted
コード例 #25
0
    def logpdf(self, array):
        """Vectorized natural logarithm of the Gaussian probability
        density function, using this element's "mean" and "variance"
        attributes.

        @type array: 1d Numpy array of numbers
        @param array: The input vector.
        @rtype: 1d Numpy array of numbers
        @return: ln(PDF_Gaussian(x)) for every input value x.
        """

        mu = float(self.attrib["mean"])
        twiceVariance = 2.0 * float(self.attrib["variance"])

        # -(x - mean)^2 / (2 variance)
        deviation = NP(array - mu)
        exponent = NP(NP("negative", NP("square", deviation)) / twiceVariance)

        # ln(sqrt(2 pi variance)), the logarithm of the normalization
        logNormalization = math.log(math.sqrt(math.pi * twiceVariance))

        return NP(exponent - logNormalization)
コード例 #26
0
ファイル: FieldCastMethods.py プロジェクト: soedjais/augustus
    def applyMapMissingTo(fieldType, data, mask, mapMissingTo, overwrite=False):
        """Substitute a given value for every MISSING entry.

        Unless C{overwrite} is True, the arrays passed in are left
        untouched and modified copies are returned.  If there is no
        mask, or C{mapMissingTo} is None, the inputs are returned
        as they are.  Example use::

            data, mask = dataColumn.data, dataColumn.mask
            data, mask = FieldCastMethods.applyMapMissingTo(dataColumn.fieldType, data, mask, "-999")
            return DataColumn(dataColumn.fieldType, data, mask)

        It can also be used in conjunction with other FieldCastMethods.

        @type fieldType: FieldType
        @param fieldType: The data fieldType (used to interpret C{mapMissingTo}).
        @type data: 1d Numpy array
        @param data: The data.
        @type mask: 1d Numpy array of dtype defs.maskType, or None
        @param mask: The mask.
        @type mapMissingTo: string or None
        @param mapMissingTo: The replacement value, represented as a string (e.g. directly from a PMML attribute).
        @type overwrite: bool
        @param overwrite: If True, unlock the original data and mask and write into them instead of copying.
        @rtype: 2-tuple of 1d Numpy arrays
        @return: The new data and mask; the mask is None if no non-VALID entries remain.
        @raise PmmlValidationError: If C{mapMissingTo} cannot be converted by C{fieldType.stringToValue}.
        """

        if mask is None or mapMissingTo is None:
            return data, mask

        missingRows = NP(mask == defs.MISSING)
        try:
            replacement = fieldType.stringToValue(mapMissingTo)
        except ValueError as err:
            raise defs.PmmlValidationError("mapMissingTo string \"%s\" cannot be cast as %r: %s" % (mapMissingTo, fieldType, str(err)))

        if not overwrite:
            data = NP("copy", data)
            mask = NP("copy", mask)
        else:
            data.setflags(write=True)
            mask.setflags(write=True)

        data[missingRows] = replacement
        mask[missingRows] = defs.VALID

        # a mask with nothing left to flag is represented as None
        return data, (mask if mask.any() else None)
コード例 #27
0
ファイル: FieldType.py プロジェクト: Huskyeder/augustus
    def _toDataColumn_number(self, data, mask):
        """Build a DataColumn of this numeric FieldType from raw data.

        After C{_checkNumpy}, one of three paths is taken:

          1. The data are already a Numpy array of C{self.dtype} (and
             the mask is None or a Numpy array): NaN entries are
             labeled MISSING.
          2. Otherwise the data go through C{_checkNonNumpy} and are
             cast in bulk; NaN entries and entries with a non-zero
             input mask are labeled MISSING.
          3. If the bulk cast raises ValueError or TypeError, the
             values are converted one at a time: NaN (float or the
             string "NaN", case-insensitive) is labeled MISSING, and
             anything else that cannot be converted (including None)
             is replaced by C{defs.PADDING} and labeled INVALID.

        Finally the result is passed through C{_checkValues} and
        C{_checkIntervals}.

        @type data: Numpy array or other sequence (whatever C{_checkNumpy}/C{_checkNonNumpy} accept)
        @param data: The raw values.
        @type mask: Numpy array, other sequence, or None
        @param mask: The input mask; non-zero entries are treated as MISSING.
        @rtype: DataColumn
        @return: The new DataColumn.
        """
        data, mask = self._checkNumpy(data, mask)
        if isinstance(data, NP.ndarray) and (mask is None or isinstance(mask, NP.ndarray)) and data.dtype == self.dtype:
            mask2 = NP("isnan", data)
            if mask is None:
                mask = NP("array", mask2, defs.maskType) * defs.MISSING
            else:
                # NOTE(review): this writes into the mask array that was passed in
                mask[mask2] = defs.MISSING

        else:
            data, mask = self._checkNonNumpy(data, mask)
            try:
                data = NP("array", data, dtype=self.dtype)
                # mask is handled in the else statement after the except block

            except (ValueError, TypeError):
                # bulk cast failed: fall back to converting one value at a time
                data2 = NP("empty", len(data), dtype=self.dtype)
                if mask is None:
                    mask2 = NP("zeros", len(data), dtype=defs.maskType)
                else:
                    mask2 = NP("fromiter", ((defs.VALID if not m else defs.MISSING) for m in mask), dtype=defs.maskType, count=len(mask))

                for i, v in enumerate(data):
                    try:
                        data2[i] = v
                        if mask2[i] == defs.VALID and ((isinstance(v, float) and math.isnan(v)) or (isinstance(v, basestring) and v.upper() == "NAN")):
                            mask2[i] = defs.MISSING
                        if v is None:
                            # force None down the same path as unconvertible values
                            raise TypeError
                    except (ValueError, TypeError):
                        data2[i] = defs.PADDING
                        # never override a row that the input mask already flagged
                        if mask2[i] == defs.VALID:
                            if (isinstance(v, float) and math.isnan(v)) or (isinstance(v, basestring) and v.upper() == "NAN"):
                                mask2[i] = defs.MISSING
                            else:
                                mask2[i] = defs.INVALID

                if not mask2.any():
                    mask2 = None

                data, mask = data2, mask2

            else:
                mask2 = NP("isnan", data)
                if mask is None:
                    # scale the boolean NaN flags by defs.MISSING, as the other two branches do,
                    # so that the mask holds defs.MISSING rather than a bare 1
                    mask = NP("array", mask2, defs.maskType) * defs.MISSING
                else:
                    mask = NP(NP("array", NP("logical_or", mask2, NP("fromiter", (m != 0 for m in mask), dtype=NP.dtype(bool), count=len(mask))), defs.maskType) * defs.MISSING)
                if not mask.any():
                    mask = None

        data, mask = self._checkValues(data, mask)
        data, mask = self._checkIntervals(data, mask)
        return DataColumn(self, data, mask)
コード例 #28
0
ファイル: DataTable.py プロジェクト: soedjais/augustus
    def singleton(self, inputData, inputMask=None, inputState=None):
        """Create a single-row DataTable for event-based processes.

        This method is an alternative to the DataTable constructor:
        it creates a DataTable with only one row and it uses the
        Python data type of each C{inputData} value to define the
        field type, rather than an explicit C{context}.  Strings map
        to "string", floats to "double", booleans to "boolean",
        integers to "integer", and everything else to "object".

        @type inputData: dict-like mapping from strings to single values (not lists)
        @param inputData: A single data record.
        @type inputMask: dict-like mapping from strings to single C{defs.maskType} values (not lists), or None
        @param inputMask: A single mask.  Fields that are absent (or map to None) get no mask.
        @type inputState: DataTableState or None
        @param inputState: Initial state of the DataTable.  To continue a previous calculation, use the C{dataTable.state} from the previous calculation.
        @rtype: DataTable
        @return: The new one-row DataTable, with its fields in sorted name order.
        """

        dataColumns = OrderedDict()
        for fieldName in sorted(inputData.keys()):
            value = inputData[fieldName]

            if isinstance(value, basestring):
                fieldType = FakeFieldType("string", "continuous")
            elif isinstance(value, float):
                fieldType = FakeFieldType("double", "continuous")
            elif isinstance(value, bool):
                # bool must be tested before int: bool is a subclass of int,
                # so the reverse order would type every boolean as "integer"
                fieldType = FakeFieldType("boolean", "continuous")
            elif isinstance(value, int):
                fieldType = FakeFieldType("integer", "continuous")

            # TODO: PMML date types (when passed a datetime.datetime object)

            else:
                fieldType = FakeFieldType("object", "any")

            data = NP("empty", 1, dtype=fieldType.dtype)
            data[0] = value

            if inputMask is None or inputMask.get(fieldName) is None:
                mask = None
            else:
                mask = NP("empty", 1, dtype=defs.maskType)
                mask[0] = inputMask.get(fieldName)

            dataColumns[fieldName] = DataColumn(fieldType, data, mask)

        # bypass DataTable.__init__ and configure the instance directly
        dataTable = DataTable.__new__(DataTable)
        dataTable._configure(dataColumns, inputState)
        return dataTable
コード例 #29
0
    def _stringToValue_date(self, string):
        """Parse a date string into this FieldType's internal integer
        representation.

        The string is matched against C{self._iso8601_date}; a missing
        month or day defaults to 1.  The result is the whole number of
        days since C{self._dateTimeOrigin}, expressed in seconds and
        multiplied by C{self._dateTimeResolution}.

        @type string: string
        @param string: The date to parse.
        @rtype: Numpy int64
        @return: The internal value of the date.
        @raise ValueError: If the string does not match the pattern or does not name a real calendar date.
        """
        matched = re.match(self._iso8601_date, string)
        if matched is None:
            raise ValueError("invalid ISO 8601 date string: \"%s\"" % string)

        # groups 1, 3, and 5 of the pattern hold year, month, and day
        year, month, day = matched.group(1, 3, 5)

        try:
            if year is None:
                raise ValueError
            if month is None:
                parts = (int(year), 1, 1)
            elif day is None:
                parts = (int(year), int(month), 1)
            else:
                parts = (int(year), int(month), int(day))
            moment = datetime.datetime(*parts)

        except ValueError:
            raise ValueError("invalid ISO 8601 date string: \"%s\"" % string)

        elapsed = moment - self._dateTimeOrigin
        return NP.int64(elapsed.days*86400 * self._dateTimeResolution)
コード例 #30
0
    def _toDataColumn_string(self, data, mask):
        """Build a DataColumn of strings from raw data.

        The data are first converted by C{_toDataColumn_object}.
        Then None and float NaN entries are labeled MISSING (only
        where the mask still says VALID, if a mask already exists),
        and remaining values that are not strings are replaced by
        their C{repr}.  The result is passed through C{_checkValues}
        and C{_checkIntervals}.
        """
        objectColumn = self._toDataColumn_object(data, mask)
        data, mask = objectColumn.data, objectColumn.mask

        # the arrays are modified in place below
        data.setflags(write=True)
        if mask is not None:
            mask.setflags(write=True)

        def absent(value):
            return value is None or (isinstance(value, float) and math.isnan(value))

        if mask is None:
            for index, value in enumerate(objectColumn.data):
                if absent(value):
                    # create the mask lazily, on the first missing value
                    if mask is None:
                        mask = NP("zeros", len(data), dtype=defs.maskType)
                    mask[index] = defs.MISSING
                elif not isinstance(value, basestring):
                    data[index] = repr(value)

            if mask is not None:
                objectColumn._mask = mask

        else:
            for index, value in enumerate(objectColumn.data):
                if absent(value) and mask[index] == defs.VALID:
                    mask[index] = defs.MISSING
                elif not isinstance(value, basestring):
                    data[index] = repr(value)

        data, mask = self._checkValues(data, mask)
        data, mask = self._checkIntervals(data, mask)

        return DataColumn(self, data, mask)
コード例 #31
0
ファイル: PlotRange.py プロジェクト: soedjais/augustus
    def zmaxPush(self, zmax, fieldType, sticky=False):
        """Possibly enlarge the z range of the bounding box by raising
        its maximum.

        The candidate is ignored if it is not finite, if
        C{zStrictlyPositive} is set and the candidate is not greater
        than zero, or if it does not exceed the current C{zmax}.

        A "sticky" extremum is one beyond which the final bounding
        box will not be expanded if it turns out to be the most
        extreme value.  For example, in the layout of a vertical
        histogram the xmin and xmax of the plot window should align
        with the xmin and xmax of the histogram unless an overlaying
        graphic pushes the boundary farther, whereas the ymax should
        be inflated beyond the tallest bin so that it can be clearly
        seen.

        @type zmax: number
        @param zmax: The candidate for the new C{zmax}.
        @type fieldType: FieldType
        @param fieldType: The FieldType of z.  Only homogeneous FieldTypes are allowed.
        @type sticky: bool
        @param sticky: Also record an accepted candidate as the "sticky" zmax.
        @raise PmmlValidationError: If any z FieldTypes differ (presumably raised by C{_checkFieldTypeZ}).
        """

        self._checkFieldTypeZ(fieldType)

        if not NP("isfinite", zmax):
            return
        if self.zStrictlyPositive and not (zmax > 0.0):
            return
        if self.zmax is not None and not (zmax > self.zmax):
            return

        self.zmax = zmax
        if sticky:
            self.zmaxSticky = zmax
コード例 #32
0
ファイル: PlotRange.py プロジェクト: soedjais/augustus
    def yminPush(self, ymin, fieldType, sticky=False):
        """Possibly enlarge the y range of the bounding box by
        lowering its minimum.

        The candidate is ignored if it is not finite, if
        C{yStrictlyPositive} is set and the candidate is not greater
        than zero, or if it is not below the current C{ymin}.

        A "sticky" extremum is one beyond which the final bounding
        box will not be expanded if it turns out to be the most
        extreme value.  For example, in the layout of a vertical
        histogram the xmin and xmax of the plot window should align
        with the xmin and xmax of the histogram unless an overlaying
        graphic pushes the boundary farther, whereas the ymax should
        be inflated beyond the tallest bin so that it can be clearly
        seen.

        @type ymin: number
        @param ymin: The candidate for the new C{ymin}.
        @type fieldType: FieldType
        @param fieldType: The FieldType of y.  Only homogeneous FieldTypes are allowed.
        @type sticky: bool
        @param sticky: Also record an accepted candidate as the "sticky" ymin.
        @raise PmmlValidationError: If any y FieldTypes differ (presumably raised by C{_checkFieldTypeY}).
        """

        self._checkFieldTypeY(fieldType)

        if not NP("isfinite", ymin):
            return
        if self.yStrictlyPositive and not (ymin > 0.0):
            return
        if self.ymin is not None and not (ymin < self.ymin):
            return

        self.ymin = ymin
        if sticky:
            self.yminSticky = ymin
コード例 #33
0
    def accumulate(self, state, cxy, fieldWeight, distributionBased):
        """Second step in a vectorized metric calculation, called for
        each field and cluster center.

        Keeps a running element-wise maximum of the weighted
        comparison values in C{state.maximumComponent}, which is
        updated in place.  Nothing else is modified and nothing is
        returned.

        @type state: ad-hoc Python object
        @param state: State information that persists long enough to span the three steps of a metric calculation.  This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState.
        @type cxy: 1d Numpy array of numbers
        @param cxy: Comparison distance or similarity for all rows.
        @type fieldWeight: number
        @param fieldWeight: The weight of this field.
        @type distributionBased: bool
        @param distributionBased: If True, use a covariance matrix to scale the distance result.  (Not used by this implementation.)
        """

        running = state.maximumComponent
        weighted = NP(cxy * fieldWeight)
        # the third array argument is the output: the running maximum is overwritten in place
        NP("maximum", running, weighted, state.maximumComponent)
コード例 #34
0
 def _fromDataColumn(self, dataColumn):
     """Convert a DataColumn into a Numpy array of Python objects.

     VALID rows (all rows, if there is no mask) are converted with
     C{self.valueToPython}; MISSING rows become C{defs.NAN}; all
     other rows become None.
     """
     # an explicit loop filling a preallocated array was measured to use less memory and
     # slightly less time than a list comprehension (80 ns instead of 100 ns per record)
     result = NP("empty", len(dataColumn), dtype=NP.dtype(object))
     flags = dataColumn.mask
     for index, raw in enumerate(dataColumn.data):
         if flags is None or flags[index] == defs.VALID:
             result[index] = self.valueToPython(raw)
         elif flags[index] == defs.MISSING:
             result[index] = defs.NAN
         else:
             result[index] = None
     return result
コード例 #35
0
    def endReducerKey(self, key):
        """Emit the new position of the cluster whose name equals C{key}.

        Each coordinate is C{numer / denom} for the corresponding
        field, or 0.0 where the denominator is not positive.  Nothing
        is emitted if no cluster name matches.
        """
        for clusterName in self.clusterVectors.keys():
            if not (clusterName == key):
                continue

            coordinates = []
            for fieldName in self.fieldNames:
                if self.denom[fieldName] > 0.0:
                    coordinates.append(self.numer[fieldName] / self.denom[fieldName])
                else:
                    coordinates.append(0.0)

            self.emit(clusterName, NP("array", coordinates, dtype=NP.dtype(float)))
            break
コード例 #36
0
ファイル: PlotHistogram.py プロジェクト: soedjais/augustus
    def determineScaleBins(numBins, low, high, array):
        """Settle the C{numBins}, C{low}, and C{high} of a histogram,
        keeping every explicitly given value and deriving the rest
        from the dataset.

        Derivation rules:
          - A missing C{low} or C{high} starts as the minimum or
            maximum of the dataset.  If the edges are then in the
            right order, each derived edge is pushed outward by 20%
            of the range; if they coincide, both are moved outward by
            1.0; if they are reversed, the derived edge is placed 1.0
            beyond the other one.
          - A missing C{numBins} comes from the Freedman-Diaconis
            heuristic, but is never less than 10.

        @type numBins: int or None
        @param numBins: Input number of bins.
        @type low: number or None
        @param low: Low edge.
        @type high: number or None
        @param high: High edge.
        @type array: 1d Numpy array of numbers
        @param array: Dataset to use to implicitly derive values.
        @rtype: 3-tuple
        @return: C{numBins}, C{low}, C{high}
        @raise PmmlValidationError: If C{low} and C{high} are both given explicitly and C{high} is less than C{low}.
        """

        lowIsImplicit = low is None
        highIsImplicit = high is None

        if lowIsImplicit:
            low = float(array.min())
        if highIsImplicit:
            high = float(array.max())

        if low == high:
            low -= 1.0
            high += 1.0

        elif high < low:
            if lowIsImplicit:
                low = high - 1.0
            elif highIsImplicit:
                high = low + 1.0
            else:
                raise defs.PmmlValidationError("PlotHistogram attributes low and high must be in the right order: low = %g, high = %g" % (low, high))

        else:
            # pad only the edges that were derived from the data
            padding = 0.2 * (high - low)
            if lowIsImplicit:
                low = low - padding
            if highIsImplicit:
                high = high + padding

        if numBins is None:
            # the Freedman-Diaconis rule: bin width = 2 * IQR / n^(1/3)
            firstQuartile, thirdQuartile = NP("percentile", array, [25.0, 75.0])
            width = 2.0 * (thirdQuartile - firstQuartile) / math.pow(len(array), 1.0 / 3.0)
            numBins = 10
            if width > 0.0:
                numBins = max(numBins, int(math.ceil((high - low) / width)))

        return numBins, low, high
コード例 #37
0
        def evaluate(self, dataTable, functionTable, performanceTable, arguments):
            """Evaluate both arguments and return their element-wise
            floor division as a DataColumn.

            Rows with a zero denominator are labeled INVALID; that
            flag is merged with the masks of both arguments by
            C{DataColumn.mapAnyMissingInvalid}.
            """
            evaluated = [argument.evaluate(dataTable, functionTable, performanceTable) for argument in arguments]

            timerLabel = "built-in \"%s\"" % self.name
            performanceTable.begin(timerLabel)

            fieldType = self.fieldTypeFromSignature(evaluated)
            numerator, denominator = evaluated

            divideByZero = NP(NP(denominator.data == 0.0) * defs.INVALID)
            if not divideByZero.any():
                divideByZero = None

            mask = DataColumn.mapAnyMissingInvalid([divideByZero, numerator.mask, denominator.mask])

            quotient = NP("floor_divide", numerator.data, denominator.data)
            result = DataColumn(fieldType, quotient, mask)

            performanceTable.end(timerLabel)
            return result
コード例 #38
0
 def evaluate(self, dataTable, functionTable, performanceTable, arguments):
     """Evaluate C{Between} on the arguments and return its logical
     negation.

     The DataColumn produced by C{Between.evaluate} is unlocked, its
     data are negated in place, and it is locked again before being
     returned.
     """
     # NOTE(review): Between.evaluate is called without an instance argument -- confirm
     # that Between is something for which this call form is valid.
     negated = Between.evaluate(dataTable, functionTable, performanceTable, arguments)

     negated._unlock()
     values = negated.data
     NP("logical_not", values, negated.data)
     negated._lock()

     return negated
コード例 #39
0
        def evaluate(self, dataTable, functionTable, performanceTable, arguments):
            """Evaluate all arguments and combine them (at least two,
            all boolean) through C{self.applySkipMissing}.

            Rows that are still flagged in C{allbad} after the
            combination are labeled MISSING, unless the mask already
            labels them as something other than VALID.
            """
            columns = []
            for argument in arguments:
                columns.append(argument.evaluate(dataTable, functionTable, performanceTable))

            timerLabel = "built-in \"%s\"" % self.name
            performanceTable.begin(timerLabel)

            fieldType = self.allBooleanType(columns, atleast=2)

            numRows = len(dataTable)
            data = NP("zeros", numRows, dtype=fieldType.dtype)
            mask = None
            allbad = NP("ones", numRows, dtype=NP.dtype(bool))

            (data, allbad), mask = self.applySkipMissing((data, allbad), mask, columns)

            if allbad.any():
                if mask is not None:
                    # only relabel rows that the mask still considers VALID
                    NP("logical_and", allbad, NP(mask == defs.VALID), allbad)
                    mask[allbad] = defs.MISSING
                else:
                    mask = allbad * defs.MISSING

            performanceTable.end(timerLabel)
            return DataColumn(fieldType, data, mask)
コード例 #40
0
    def __call__(self, x, y):
        """Transform the point x, y from this inner coordinate system
        all the way out to the outermost global coordinates, the
        coordinates of the SVG file.

        The point is first mapped through C{self._fx} and
        C{self._fy}; infinite results are then replaced by this
        window's stand-in values for plus and minus infinity before
        the parent transformation is applied.

        @type x: number
        @param x: The horizontal position in this coordinate system.
        @type y: number
        @param y: The vertical position in this coordinate system.
        @rtype: 2-tuple of numbers
        @return: The X, Y position in the outermost global coordinates.
        """

        def replaceInfinities(values):
            # NOTE(review): the same outerY*Infinity attributes are applied to both the
            # x and the y coordinate, exactly as in the original code -- confirm whether
            # the x coordinate was meant to use X-specific values instead.
            if isinstance(values, NP.ndarray):
                infinite = NP("isinf", values)
                negative = NP("logical_and", infinite, NP(values < 0.0))
                values[infinite] = self.outerYPlusInfinity
                values[negative] = self.outerYMinusInfinity
            elif values == float("inf"):
                values = self.outerYPlusInfinity
            elif values == float("-inf"):
                values = self.outerYMinusInfinity
            return values

        if not isinstance(x, (NP.ndarray, NP.double)):
            x = NP.double(x)
        if not isinstance(y, (NP.ndarray, NP.double)):
            y = NP.double(y)

        x, y = self._fx(x), self._fy(y)

        x = replaceInfinities(x)
        y = replaceInfinities(y)

        outerX, outerY = super(PlotCoordinatesWindow, self).__call__(x, y)
        return outerX, outerY
コード例 #41
0
ファイル: FieldType.py プロジェクト: Huskyeder/augustus
    def _toDataColumn_object(self, data, mask):
        """Build a DataColumn of arbitrary Python objects from raw data.

        If, after C{_checkNumpy}, the data are already a Numpy array
        of C{self.dtype} (and the mask is None or a Numpy array),
        they are used as they are.  Otherwise the data are cast to
        C{self.dtype} and a mask is built in which float NaN entries
        and entries with a non-zero input mask are MISSING; a mask
        with nothing flagged becomes None.  The result is passed
        through C{_checkValues} and C{_checkIntervals}.
        """
        data, mask = self._checkNumpy(data, mask)

        readyToUse = isinstance(data, NP.ndarray) and (mask is None or isinstance(mask, NP.ndarray)) and data.dtype == self.dtype
        if not readyToUse:
            data, mask = self._checkNonNumpy(data, mask)
            data = NP.array(data, dtype=self.dtype)

            def isNaN(item):
                return isinstance(item, float) and math.isnan(item)

            if mask is None:
                flags = (defs.MISSING if isNaN(item) else defs.VALID for item in data)
                mask = NP("fromiter", flags, dtype=defs.maskType, count=len(data))
            else:
                flags = (defs.MISSING if (flag != 0 or isNaN(data[index])) else defs.VALID for index, flag in enumerate(mask))
                mask = NP("fromiter", flags, dtype=defs.maskType, count=len(mask))

            if not mask.any():
                mask = None

        data, mask = self._checkValues(data, mask)
        data, mask = self._checkIntervals(data, mask)
        return DataColumn(self, data, mask)
コード例 #42
0
    def outliersAsMissing(mask, originalMask, selection, overwrite=False):
        """Label all rows specified by a selection as MISSING.

        Rows that the mask already labels as something other than
        VALID keep their label.  If C{mask} is still the original
        mask object, it is copied before being changed (unless
        C{overwrite} is True); a mask that is already a substitute is
        changed directly.  Example use::

            mask = dataColumn.mask
            mask = FieldCastMethods.outliersAsMissing(mask, dataColumn.mask, dataColumn.data < MINIMUM_CUT)
            mask = FieldCastMethods.outliersAsMissing(mask, dataColumn.mask, dataColumn.data > MAXIMUM_CUT)
            return DataColumn(dataColumn.fieldType, dataColumn.data, mask)

        It can also be used in conjunction with other FieldCastMethods.

        @type mask: 1d Numpy array of type defs.maskType, or None
        @param mask: The mask to be updated.
        @type originalMask: 1d Numpy array of type defs.maskType, or None
        @param originalMask: The original mask.
        @type selection: 1d Numpy array of bool
        @param selection: The rows to label as MISSING.  When a mask is present, this array is narrowed in place to the rows that are still VALID.
        @type overwrite: bool
        @param overwrite: If True, unlock and overwrite the original mask instead of copying it.
        @rtype: 1d Numpy array of type defs.maskType
        @return: The new mask.
        """

        if mask is None:
            return selection * defs.MISSING

        # leave rows alone that are already MISSING or INVALID
        NP("logical_and", selection, NP(mask == defs.VALID), selection)

        if mask is originalMask:
            if not overwrite:
                mask = NP("copy", mask)
            mask.setflags(write=True)

        mask[selection] = defs.MISSING
        return mask
コード例 #43
0
ファイル: FieldType.py プロジェクト: Huskyeder/augustus
    def _checkIntervals(self, data, mask):
        """Label as INVALID every value that violates one of this
        FieldType's intervals.

        A value is out of range if it lies beyond the left or right
        margin of any interval, with open margins excluding the
        margin value itself.  Rows that the mask already labels as
        something other than VALID keep their label.

        @type data: 1d Numpy array
        @param data: The data to check.
        @type mask: 1d Numpy array of dtype defs.maskType, or None
        @param mask: The mask; if present and something is out of range, it is updated in place.
        @rtype: 2-tuple
        @return: The data and the (possibly new) mask.
        @raise PmmlValidationError: If a margin cannot be converted by C{self.stringToValue}.
        """
        intervals = self.intervals
        if len(intervals) == 0:
            return data, mask

        leftOpen = ("openClosed", "openOpen")
        leftClosed = ("closedOpen", "closedClosed")
        rightOpen = ("openOpen", "closedOpen")
        rightClosed = ("openClosed", "closedClosed")

        # nothing is out of range until an interval says so
        outOfRange = NP("zeros", len(data), dtype=NP.dtype(bool))
        for interval in intervals:
            closure = interval["closure"]
            lowerText = interval.get("leftMargin")
            upperText = interval.get("rightMargin")

            if lowerText is not None:
                try:
                    lower = self.stringToValue(lowerText)
                except ValueError:
                    raise defs.PmmlValidationError("Improper value in Interval leftMargin specification: \"%s\"" % lowerText)

                if closure in leftOpen:
                    outOfRange[NP(data <= lower)] = True
                elif closure in leftClosed:
                    outOfRange[NP(data < lower)] = True

            if upperText is not None:
                try:
                    upper = self.stringToValue(upperText)
                except ValueError:
                    raise defs.PmmlValidationError("Improper value in Interval rightMargin specification: \"%s\"" % upperText)

                if closure in rightOpen:
                    outOfRange[NP(data >= upper)] = True
                elif closure in rightClosed:
                    outOfRange[NP(data > upper)] = True

        if not outOfRange.any():
            return data, mask

        if mask is None:
            return data, NP(outOfRange * defs.INVALID)

        # only change what wasn't already marked as MISSING
        NP("logical_and", outOfRange, NP(mask == defs.VALID), outOfRange)
        mask[outOfRange] = defs.INVALID
        return data, mask
コード例 #44
0
        def evaluate(self, dataTable, functionTable, performanceTable, arguments):
            """Evaluate every argument, then combine the (two or
            more, all boolean) results with C{self.applySkipMissing}.

            Any row left flagged in C{allbad} by the combination is
            labeled MISSING, provided the mask does not already give
            it a label other than VALID.
            """
            evaluated = [each.evaluate(dataTable, functionTable, performanceTable) for each in arguments]

            label = "built-in \"%s\"" % self.name
            performanceTable.begin(label)

            fieldType = self.allBooleanType(evaluated, atleast=2)

            rowCount = len(dataTable)
            data = NP("zeros", rowCount, dtype=fieldType.dtype)
            mask = None
            allbad = NP("ones", rowCount, dtype=NP.dtype(bool))

            combined, mask = self.applySkipMissing((data, allbad), mask, evaluated)
            data, allbad = combined

            if allbad.any() and mask is None:
                mask = allbad * defs.MISSING
            elif allbad.any():
                # keep labels that the mask has already assigned
                NP("logical_and", allbad, NP(mask == defs.VALID), allbad)
                mask[allbad] = defs.MISSING

            performanceTable.end(label)
            return DataColumn(fieldType, data, mask)
コード例 #45
0
    def __call__(self, x, y):
        """Transform the point x, y from this inner coordinate system
        all the way out to the outermost global coordinates, the
        coordinates of the SVG file.

        This coordinate system adds C{self.xoffset} and
        C{self.yoffset} to the point before handing it to the parent
        transformation.

        @type x: number
        @param x: The horizontal position in this coordinate system.
        @type y: number
        @param y: The vertical position in this coordinate system.
        @rtype: 2-tuple of numbers
        @return: The X, Y position in the outermost global coordinates.
        """

        numeric = (NP.ndarray, NP.double)
        if not isinstance(x, numeric):
            x = NP.double(x)
        if not isinstance(y, numeric):
            y = NP.double(y)

        shiftedX = self.xoffset + x
        shiftedY = self.yoffset + y

        outerX, outerY = super(PlotCoordinatesOffset, self).__call__(shiftedX, shiftedY)
        return outerX, outerY
コード例 #46
0
ファイル: Function.py プロジェクト: Huskyeder/augustus
    def maskInvalid(self, data, mask):
        """Helper method to replace NaN and infinite values with
        INVALID after a potentially dangerous operation.

        Example::

            result = NP("log", dataColumn.data)    # log(0) = -inf, log(-x) = nan
            resultMask = self.maskInvalid(result, dataColumn.mask)
            return DataColumn(fakeFieldType, result, resultMask)

        The input C{data} and C{mask} are not modified by this
        method; a substitute mask is returned.  Rows that the input
        mask already labels as something other than VALID keep their
        label.

        @type data: 1d Numpy array
        @param data: The dataset that may contain NaN and infinite values.
        @type mask: 1d Numpy array of C{defs.maskType}, or None
        @param mask: The original mask.
        @rtype: 1d Numpy array of C{defs.maskType}, or None
        @return: The new mask, or None if no row is labeled as anything but VALID.
        """

        bad = NP("logical_not", NP("isfinite", data))
        if bad.any():
            if mask is None:
                mask = bad * defs.INVALID
            else:
                # only relabel rows that the mask still considers VALID
                NP("logical_and", bad, NP(mask == defs.VALID), bad)
                if bad.any():
                    # Always write into a copy: the caller's mask must stay untouched
                    # even when it happens to be writeable, as the docstring promises.
                    mask = NP("copy", mask)
                    mask.setflags(write=True)
                    mask[bad] = defs.INVALID
        if mask is not None and not mask.any():
            mask = None
        return mask
コード例 #47
0
ファイル: FieldType.py プロジェクト: Huskyeder/augustus
    def _toDataColumn_internal(self, data, mask):
        """Build a DataColumn by converting every value with
        C{self.stringToValue}.

        The whole dataset is first converted in one pass.  If any
        value fails to convert, the values are converted one at a
        time instead: float NaN entries are replaced by
        C{defs.PADDING} and labeled MISSING, and values that cannot
        be converted are replaced by C{defs.PADDING} and labeled
        INVALID.  Non-zero entries of the input mask are treated as
        MISSING on that path.

        @type data: Numpy array or other sequence (whatever C{_checkNumpy}/C{_checkNonNumpy} accept)
        @param data: The raw values.
        @type mask: Numpy array, other sequence, or None
        @param mask: The input mask.
        @rtype: DataColumn
        @return: The new DataColumn.
        """
        data, mask = self._checkNumpy(data, mask, tryToCast=False)
        data, mask = self._checkNonNumpy(data, mask)

        try:
            data = NP("fromiter", (self.stringToValue(d) for d in data), dtype=self.dtype, count=len(data))
            # mask is handled in the else statement after the except block

        except (ValueError, TypeError):
            # TypeError is caught as well as ValueError so that a value of the wrong
            # Python type falls back to the per-value loop below (which already
            # handles both exceptions) instead of aborting the whole conversion.
            data2 = NP("empty", len(data), dtype=self.dtype)
            if mask is None:
                mask2 = NP("zeros", len(data), dtype=defs.maskType)
            else:
                mask2 = NP("fromiter", (defs.VALID if not m else defs.MISSING for m in mask), dtype=defs.maskType, count=len(mask))

            for i, v in enumerate(data):
                if isinstance(v, float) and math.isnan(v):
                    data2[i] = defs.PADDING
                    mask2[i] = defs.MISSING
                else:
                    try:
                        data2[i] = self.stringToValue(v)
                    except (ValueError, TypeError):
                        data2[i] = defs.PADDING
                        mask2[i] = defs.INVALID

            if not mask2.any():
                mask2 = None

            data, mask = data2, mask2

        else:
            if mask is not None and not isinstance(mask, NP.ndarray):
                mask = NP("array", mask, dtype=defs.maskType)

        # this is the only _toDataColumn that doesn't check values and intervals because these were checked in _setup for categorical and ordinal strings

        return DataColumn(self, data, mask)
コード例 #48
0
ファイル: FALSE.py プロジェクト: justinrichie/augustus
    def evaluate(self, dataTable, functionTable, performanceTable, returnUnknowns=False):
        """Evaluate the always-false predicate over every row of a DataTable.

        No row is ever selected and no row is ever unknown, so every
        returned array is filled with False.

        @type dataTable: DataTable
        @param dataTable: The input DataTable; only its length is used here.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable; not used by this predicate.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @type returnUnknowns: bool
        @param returnUnknowns: If True, return the 3-tuple form that includes the unknowns arrays.
        @rtype: 1d Numpy array of bool or 3-tuple of arrays
        @return: Either a simple selection array or selection, unknowns, encounteredUnknowns (the last two are the same array object).
        """

        performanceTable.begin("Predicate False")

        selection = NP("zeros", len(dataTable), dtype=NP.dtype(bool))

        if not returnUnknowns:
            output = selection
        else:
            noUnknowns = NP("zeros", len(dataTable), dtype=NP.dtype(bool))
            output = (selection, noUnknowns, noUnknowns)

        performanceTable.end("Predicate False")
        return output
コード例 #49
0
ファイル: Chebychev.py プロジェクト: Huskyeder/augustus
    def initialize(self, state, numberOfRecords, numberOfFields, distributionBased):
        """Prepare C{state} for a vectorized metric calculation (first of three steps).

        Called once before all fields and cluster centers.  Only the
        C{state} object is modified: it receives a zero-filled
        C{maximumComponent} accumulator with one entry per record.

        @type state: ad-hoc Python object
        @param state: State information that persists long enough to span the three steps of a metric calculation.  This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState.
        @type numberOfRecords: int
        @param numberOfRecords: The number of rows in the dataset.
        @type numberOfFields: int
        @param numberOfFields: The number of columns in the dataset.
        @type distributionBased: bool
        @param distributionBased: If True, a covariance-scaled distance is requested; this metric does not support it.
        @raise NotImplementedError: If C{distributionBased} is True (raised after C{state} has been initialized).
        """

        floatType = NP.dtype(float)
        state.maximumComponent = NP("zeros", numberOfRecords, dtype=floatType)

        if not distributionBased:
            return

        raise NotImplementedError("Distribution-based clustering has not been implemented for the %s metric" % self.t)
コード例 #50
0
    def evaluate(self, dataTable, functionTable, performanceTable, returnUnknowns=False):
        """Evaluate a set-membership predicate, using a DataTable as input.

        The strings in this element's Array child are converted with
        the field's own C{stringToValue} and each row of the field is
        tested for membership in that set; the result is negated when
        the C{booleanOperator} attribute is "isNotIn".

        @type dataTable: DataTable
        @param dataTable: The input DataTable, containing the field named by this predicate's C{field} attribute.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable; not used by this predicate.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @type returnUnknowns: bool
        @param returnUnknowns: If True, return a "mask" for the selection that indicates which rows are unknown, rather than folding them into the selection as False.
        @rtype: 1d Numpy array of bool or 3-tuple of arrays
        @return: Either a simple selection array or selection, unknowns, encounteredUnknowns (the last two are the same array object).
        """

        performanceTable.begin("SimpleSetPredicate")

        nameOfField = self.get("field")
        dataColumn = dataTable.fields[nameOfField]

        # Convert the set members into the field's internal representation.
        convert = dataColumn.fieldType.stringToValue
        members = []
        for text in self.childOfClass(Array).values(convertType=False):
            members.append(convert(text))

        selection = NP("in1d", dataColumn.data, members)

        negate = (self.get("booleanOperator") == "isNotIn")
        if negate:
            # In-place negation of the membership test.
            NP("logical_not", selection, selection)

        if not returnUnknowns:
            # Rows that are not VALID can never be selected.
            if dataColumn.mask is not None:
                NP("logical_and", selection, NP(dataColumn.mask == defs.VALID), selection)

            performanceTable.end("SimpleSetPredicate")
            return selection

        # Report every non-VALID row as unknown.
        if dataColumn.mask is not None:
            unknowns = NP(dataColumn.mask != defs.VALID)
        else:
            unknowns = NP("zeros", len(dataTable), dtype=NP.dtype(bool))

        performanceTable.end("SimpleSetPredicate")
        return selection, unknowns, unknowns
コード例 #51
0
ファイル: Euclidean.py プロジェクト: Huskyeder/augustus
    def initialize(self, state, numberOfRecords, numberOfFields, distributionBased):
        """Prepare C{state} for a vectorized metric calculation (first of three steps).

        Called once before all fields and cluster centers.  Only the
        C{state} object is modified: it always receives a zero-filled
        C{sumInQuadrature} accumulator and, for distribution-based
        clustering, an uninitialized C{displacements} matrix plus a
        C{displacementIndex} counter starting at zero.

        @type state: ad-hoc Python object
        @param state: State information that persists long enough to span the three steps of a metric calculation.  This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState.
        @type numberOfRecords: int
        @param numberOfRecords: The number of rows in the dataset.
        @type numberOfFields: int
        @param numberOfFields: The number of columns in the dataset.
        @type distributionBased: bool
        @param distributionBased: If True, use a covariance matrix to scale the distance result.
        """

        state.sumInQuadrature = NP("zeros", numberOfRecords, dtype=NP.dtype(float))

        if not distributionBased:
            return

        # One row per record, one column per field.
        shape = (numberOfRecords, numberOfFields)
        state.displacements = NP("empty", shape, dtype=NP.dtype(float))
        state.displacementIndex = 0
コード例 #52
0
ファイル: FieldType.py プロジェクト: Huskyeder/augustus
    def _stringToValue_time(self, string):
        """Convert an ISO 8601 time-of-day string to an integer offset from C{self._dateTimeOrigin}.

        The string is matched against C{self._iso8601_time}; the hour,
        minute, optional second, optional sub-second and optional
        timezone groups are read from the match.  The time is placed on
        1970-01-01 and the timezone offset, if it matches
        C{self._timezone}, is subtracted from the result.

        @type string: string
        @param string: The time string to convert.
        @rtype: Numpy int64
        @return: C{seconds * self._dateTimeResolution + microseconds - timezoneOffset} of the difference from the origin.
        @raise ValueError: If the string does not match the pattern or describes an impossible time.
        """

        regex = re.match(self._iso8601_time, string)
        if regex is None:
            raise ValueError("invalid ISO 8601 time string: \"%s\"" % string)

        hour = regex.group(1)
        minute = regex.group(2)
        second = regex.group(4)
        subsecond = regex.group(5)
        timezone = regex.group(6)

        timezoneOffset = 0
        try:
            if hour is not None and minute is not None and second is not None:
                if subsecond is None:
                    microsecond = 0
                else:
                    # presumably subsecond is a fraction of a second such as ".25" -- verify against _iso8601_time
                    microsecond = int(round(float(subsecond) * 1e6))
                dateTimeObject = datetime.datetime(1970, 1, 1, int(hour), int(minute), int(second), microsecond)

            elif hour is not None and minute is not None:
                # A fractional part without a seconds field is not meaningful.
                if subsecond is not None:
                    raise ValueError
                dateTimeObject = datetime.datetime(1970, 1, 1, int(hour), int(minute))

            else:
                # Without both hour and minute there is nothing to build a time
                # from.  Raising here (converted to the standard message below)
                # avoids falling through with dateTimeObject unbound.
                raise ValueError

            if timezone is not None:
                regex2 = re.match(self._timezone, timezone)
                # A timezone that does not match self._timezone leaves the offset at zero.
                if regex2 is not None:
                    sign, hourOffset, minuteOffset = regex2.groups()
                    timezoneOffset = ((int(hourOffset) * 60) + int(minuteOffset)) * 60 * self._dateTimeResolution   # microseconds
                    if sign == "-":
                        timezoneOffset *= -1

        except ValueError:
            # int(), float() and datetime.datetime() all signal bad input with ValueError.
            raise ValueError("invalid ISO 8601 time string: \"%s\"" % string)

        td = dateTimeObject - self._dateTimeOrigin
        # NOTE(review): td.days is not included; this looks like it assumes the origin is 1970-01-01 -- confirm.
        return NP.int64(td.seconds * self._dateTimeResolution + td.microseconds - timezoneOffset)
コード例 #53
0
ファイル: ClusteringModel.py プロジェクト: Huskyeder/augustus
    def calculateScore(self, dataTable, functionTable, performanceTable):
        """Calculate the score of this model.

        This method is called by C{calculate} to separate operations
        that are performed by all models (in C{calculate}) from
        operations that are performed by specific models (in
        C{calculateScore}).

        For every Cluster, the distance from each row to the cluster
        center is computed with the model's ComparisonMeasure metric;
        the cluster for which C{distance < best so far} holds last wins
        (ties keep the earlier cluster).

        @type dataTable: DataTable
        @param dataTable: The DataTable representing this model's lexical scope.
        @type functionTable: FunctionTable or None
        @param functionTable: A table of functions.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: dict of DataColumns
        @return: A dictionary whose C{None} key maps to the predicted cluster identifier and whose other keys ("predictedDisplayValue", "entity", "clusterId", "entityId", "clusterAffinity", "affinity", "all.<id>") are present only for the sub-fields requested in C{self.subFields}.
        @raise defs.PmmlValidationError: If a field weight is negative, a clustering field has dataType "string", a Cluster has no center array or one of the wrong length, or a distribution-based Cluster lacks a usable Covariances/Matrix.
        """

        performanceTable.begin("ClusteringModel")

        performanceTable.begin("set up")

        distributionBased = (self["modelClass"] == "distributionBased")
        clusteringFields = self.xpath("pmml:ClusteringField[not(@isCenterField='false')]")
        fieldWeights = [clusteringField.get("fieldWeight", defaultFromXsd=True, convertType=True) for clusteringField in clusteringFields]
        for fieldWeight in fieldWeights:
            if fieldWeight < 0.0:
                raise defs.PmmlValidationError("ClusteringField fieldWeights must all be non-negative (encountered %g)" % fieldWeight)
        clusters = self.xpath("pmml:Cluster")
        comparisonMeasure = self.childOfClass(ComparisonMeasure)
        defaultCompareFunction = comparisonMeasure.get("compareFunction", defaultFromXsd=True)
        metric = comparisonMeasure.childOfClass(PmmlClusteringMetric)
        metrictag = metric.t

        performanceTable.end("set up")

        for clusteringField in clusteringFields:
            dataType = dataTable.fields[clusteringField["field"]].fieldType.dataType
            if dataType == "string":
                raise defs.PmmlValidationError("ClusteringField \"%s\" has dataType \"%s\", which cannot be used for clustering" % (clusteringField["field"], dataType))

        missingValueWeights = self.childOfTag("MissingValueWeights")
        if missingValueWeights is None:
            adjustM = None

        else:
            performanceTable.begin("MissingValueWeights")

            missingWeights = missingValueWeights.childOfClass(PmmlArray).values(convertType=True)

            # Each clustering field adds its contribution to sumNMqi in place.
            sumNMqi = NP("zeros", len(dataTable), dtype=NP.dtype(float))
            for clusteringField, missingWeight in zip(clusteringFields, missingWeights):
                clusteringField.addToAdjustM(dataTable, functionTable, performanceTable, sumNMqi, missingWeight)

            adjustM = NP(sum(missingWeights) / sumNMqi)
            # Rows whose denominator is zero get a neutral adjustment factor.
            adjustM[NP(sumNMqi == 0.0)] = 1.0

            performanceTable.end("MissingValueWeights")

        # A row is invalid if any of its clustering fields is INVALID.
        anyInvalid = NP("zeros", len(dataTable), dtype=NP.dtype(bool))
        for clusteringField in clusteringFields:
            mask = dataTable.fields[clusteringField["field"]].mask
            if mask is not None:
                NP("logical_or", anyInvalid, NP(mask == defs.INVALID), anyInvalid)

        bestClusterId = None
        bestClusterAffinity = None
        allClusterAffinities = {}

        for index, cluster in enumerate(clusters):
            array = cluster.childOfClass(PmmlArray)
            if array is None:
                raise defs.PmmlValidationError("Cluster must have an array to designate its center")

            centerStrings = array.values(convertType=False)
            if len(centerStrings) != len(clusteringFields):
                raise defs.PmmlValidationError("Cluster array has %d components, but there are %d ClusteringFields with isCenterField=true" % (len(centerStrings), len(clusteringFields)))

            performanceTable.begin(metrictag)

            if distributionBased:
                matrix = cluster.xpath("pmml:Covariances/pmml:Matrix")
                if len(matrix) != 1:
                    raise defs.PmmlValidationError("In distribution-based clustering, all clusters must have a Covariances/Matrix")
                try:
                    covarianceMatrix = NP("array", matrix[0].values(), dtype=NP.dtype(float))
                except ValueError:
                    raise defs.PmmlValidationError("Covariances/Matrix must contain real numbers for distribution-based clustering")

            else:
                covarianceMatrix = None

            # Three-step metric calculation: initialize, accumulate per field, finalize.
            state = self._State()
            metric.initialize(state, len(dataTable), len(clusteringFields), distributionBased)

            for clusteringField, centerString, fieldWeight in zip(clusteringFields, centerStrings, fieldWeights):
                if isinstance(metric, PmmlClusteringMetricBinary):
                    metric.accumulateBinary(state, dataTable.fields[clusteringField["field"]], centerString, distributionBased)
                else:
                    # The field comparison is timed separately from the metric itself.
                    performanceTable.pause(metrictag)
                    cxy = clusteringField.compare(dataTable, functionTable, performanceTable, centerString, defaultCompareFunction, anyInvalid)
                    performanceTable.unpause(metrictag)
                    metric.accumulate(state, cxy, fieldWeight, distributionBased)

            distance = metric.finalizeDistance(state, adjustM, distributionBased, covarianceMatrix)
            del state

            performanceTable.end(metrictag)

            if index == 0:
                bestClusterId = NP("ones", len(dataTable), dtype=NP.dtype(int))   # 1-based index
                # Copy rather than alias: bestClusterAffinity is updated in place
                # below, and the first cluster's distance array is also stored in
                # allClusterAffinities, where it must keep its own values.
                bestClusterAffinity = NP("copy", distance)

            better = NP(distance < bestClusterAffinity)
            bestClusterId[better] = index + 1   # 1-based index
            bestClusterAffinity[better] = distance[better]

            allClusterAffinities[cluster.get("id", "%d" % (index + 1))] = distance

        # The same mask is shared by every output column.
        if not anyInvalid.any():
            scoreMask = None
        else:
            scoreMask = NP(anyInvalid * defs.INVALID)

        performanceTable.begin("set scores")
        score = {}

        performanceTable.begin("predictedValue")
        fieldType = FakeFieldType("string", "categorical")
        clusterIdentifiers = NP("empty", len(dataTable), dtype=fieldType.dtype)
        for index, cluster in enumerate(clusters):
            # Clusters without an explicit id are identified by their 1-based position.
            value = fieldType.stringToValue(cluster.get("id", "%d" % (index + 1)))
            clusterIdentifiers[NP(bestClusterId == (index + 1))] = value
        score[None] = DataColumn(fieldType, clusterIdentifiers, scoreMask)
        performanceTable.end("predictedValue")

        if self.subFields["predictedDisplayValue"]:
            performanceTable.begin("predictedDisplayValue")
            fieldType = FakeFieldType("string", "categorical")
            clusterNames = NP("empty", len(dataTable), dtype=fieldType.dtype)
            for index, cluster in enumerate(clusters):
                value = fieldType.stringToValue(cluster.get("name", ""))
                clusterNames[NP(bestClusterId == (index + 1))] = value
            score["predictedDisplayValue"] = DataColumn(fieldType, clusterNames, scoreMask)
            performanceTable.end("predictedDisplayValue")

        if self.subFields["entity"]:
            performanceTable.begin("entity")
            fieldType = FakeFieldType("object", "any")
            entities = NP("empty", len(dataTable), dtype=fieldType.dtype)
            for index, cluster in enumerate(clusters):
                indexPlusOne = index + 1
                # The Cluster element itself is stored, one row at a time.
                # NOTE(review): presumably element-wise because a boolean-indexed
                # assignment would try to unpack the element as a sequence -- confirm.
                for i in xrange(len(entities)):
                    if bestClusterId[i] == indexPlusOne:
                        entities[i] = cluster
            score["entity"] = DataColumn(fieldType, entities, scoreMask)
            performanceTable.end("entity")

        if self.subFields["clusterId"]:
            performanceTable.begin("clusterId")
            fieldType = FakeFieldType("integer", "continuous")
            score["clusterId"] = DataColumn(fieldType, bestClusterId, scoreMask)
            performanceTable.end("clusterId")

        if self.subFields["entityId"]:
            performanceTable.begin("entityId")
            fieldType = FakeFieldType("integer", "continuous")
            score["entityId"] = DataColumn(fieldType, bestClusterId, scoreMask)
            performanceTable.end("entityId")

        if self.subFields["clusterAffinity"]:
            performanceTable.begin("clusterAffinity")
            fieldType = FakeFieldType("double", "continuous")
            score["clusterAffinity"] = DataColumn(fieldType, bestClusterAffinity, scoreMask)
            performanceTable.end("clusterAffinity")

        if self.subFields["affinity"]:
            performanceTable.begin("affinity")
            fieldType = FakeFieldType("double", "continuous")
            score["affinity"] = DataColumn(fieldType, bestClusterAffinity, scoreMask)
            performanceTable.end("affinity")

        if self.subFields["all"]:
            performanceTable.begin("all")
            fieldType = FakeFieldType("double", "continuous")
            # One output column per cluster, holding that cluster's own distances.
            for identifier, distance in allClusterAffinities.items():
                score["all.%s" % identifier] = DataColumn(fieldType, distance, scoreMask)
            performanceTable.end("all")

        performanceTable.end("set scores")
        performanceTable.end("ClusteringModel")
        return score