def __init__(self, context, inputData, inputMask=None, inputState=None):
    """Create a DataTable from a type-context, input data, possible
    input masks, and possible input states.

    For maximum flexibility, very few assumptions are made about the
    format of C{inputData}.  It need only have a structure that is
    equivalent to a dictionary mapping strings (field names) to lists
    of values (data columns).  Numpy record arrays, NpzFiles, and
    Pandas data frames effectively present their data in this format
    because::

        inputData[fieldName]

    yields a column of values.  Regardless of the input type, these
    values are then interpreted by the C{context} to set their PMML
    type.  The length of the resulting DataTable is equal to the
    length of the shortest DataColumn.  Generally, one should use
    equal-length arrays to build a DataTable.

    @type context: PmmlBinding, FieldType, string, dict, or None
    @param context: If a rooted PmmlBinding, use the PMML's DataDictionary to interpret C{inputData}.  If a FieldType, use that FieldType to interpret all fields.  If a string, use that dataType (e.g. "integer", "dateDaysSince[1960]") to interpret all fields.  If a dictionary from field names to FieldTypes or dataType strings, use them on a per-field basis.  Otherwise, assume a FieldType from the Numpy C{dtype}.  The last option only works if all C{inputData} columns are Numpy arrays.
    @type inputData: any dict-like mapping from strings to lists
    @param inputData: Maps field names (strings) to columns of data (lists or Numpy arrays) that are interpreted by C{context}.
    @type inputMask: dict-like mapping from strings to lists of bool, or None
    @param inputMask: If None, missing data are identified by C{NaN} values in the C{inputData} (Pandas convention).  Otherwise, C{NaN} or a True value in the corresponding C{inputMask} would label a data item as MISSING.
    @type inputState: DataTableState or None
    @param inputState: Initial state of the DataTable.  To continue a previous calculation, use the C{dataTable.state} from the previous calculation.
    @raise TypeError: If the C{inputData} columns are not Numpy arrays and a C{context} is not given, this method raises an error.
    """
    if isinstance(context, PmmlBinding) and len(context.xpath("ancestor-or-self::pmml:PMML")) != 0:
        # Rooted PMML document: field names and types come from its DataDictionary.
        dataColumns = OrderedDict()
        for fieldName, fieldDefinition in context.fieldContext().items():
            fieldType = FieldType(fieldDefinition)
            try:
                dataField = inputData[fieldName]
            except KeyError:
                dataField = None
            else:
                # inputMask may be None (TypeError on subscript) or lack this
                # field (KeyError); either way the column is simply unmasked.
                try:
                    maskField = inputMask[fieldName]
                except (KeyError, TypeError):
                    maskField = None
            if dataField is not None:
                dataColumns[fieldName] = fieldType.toDataColumn(dataField, maskField)

    else:
        if not isinstance(context, dict):
            # A single FieldType or dataType string applies to every field.
            context = dict((x, context) for x in inputData)

        def maskFor(fieldName):
            # Mask column for this field, or None when no masks were given.
            # A field missing from a non-None inputMask raises KeyError, as
            # in the original per-branch lookups.
            if inputMask is None:
                return None
            return inputMask[fieldName]

        def fieldTypeFromDtype(dtype):
            # Map a NumPy dtype to a FakeFieldType.  The regex is a raw string:
            # "\|" is not a valid escape sequence in a plain string literal.
            if dtype in (NP.object, NP.object0, NP.object_, NP.str, NP.str_, NP.string0, NP.string_) or re.match(r"\|S[0-9]+", str(dtype)) is not None:
                return FakeFieldType("string", "categorical")
            elif dtype in (NP.int, NP.int0, NP.int8, NP.int16, NP.int32, NP.int64, NP.int_, NP.integer):
                return FakeFieldType("integer", "continuous")
            elif dtype in (NP.float, NP.__getattr__("float16", noneIfMissing=True), NP.float32):
                return FakeFieldType("float", "continuous")
            elif dtype in (NP.float64, NP.float128, NP.float_, NP.double):
                return FakeFieldType("double", "continuous")
            elif dtype in (NP.bool, NP.bool8, NP.bool_):
                return FakeFieldType("boolean", "continuous")
            else:
                raise TypeError("Unrecognized NumPy dtype: %r" % dtype)

        if all(isinstance(x, FieldType) for x in context.values()):
            # FieldTypes provided explicitly.
            dataColumns = OrderedDict()
            for fieldName in sorted(context.keys()):
                dataColumns[fieldName] = context[fieldName].toDataColumn(inputData[fieldName], maskFor(fieldName))

        elif all(isinstance(x, basestring) for x in context.values()):
            # FieldTypes provided by dataType name; only "string" is categorical.
            dataColumns = OrderedDict()
            for fieldName in sorted(context.keys()):
                dataType = context[fieldName]
                if dataType == "string":
                    fieldType = FakeFieldType(dataType, "categorical")
                else:
                    fieldType = FakeFieldType(dataType, "continuous")
                dataColumns[fieldName] = fieldType.toDataColumn(inputData[fieldName], maskFor(fieldName))

        elif all(isinstance(inputData[x], NP.ndarray) for x in inputData.keys()):
            # FieldTypes inferred from the NumPy dtypes.
            dataColumns = OrderedDict()
            for fieldName in sorted(context.keys()):
                data = inputData[fieldName]
                mask = maskFor(fieldName)
                dataColumns[fieldName] = fieldTypeFromDtype(data.dtype).toDataColumn(data, mask)

        else:
            raise TypeError("Context must be PMML (anchored by a <PMML> ancestor), a dictionary of FieldType objects, dataType strings, or inputData must consist entirely of NumPy arrays")

    self._configure(dataColumns, inputState)
def __init__(self, context, inputData, inputMask=None, inputState=None):
    """Build a DataTable by interpreting C{inputData} through a type-context.

    C{inputData} only needs to behave like a dictionary from field names to
    columns of values (lists or Numpy arrays); Numpy record arrays, NpzFiles,
    and Pandas data frames all qualify.  The resulting DataTable is as long
    as its shortest DataColumn, so equal-length columns are recommended.

    @type context: PmmlBinding, FieldType, string, dict, or None
    @param context: A rooted PmmlBinding supplies types through its DataDictionary; a single FieldType or dataType string (e.g. "integer", "dateDaysSince[1960]") applies to every field; a dict maps field names to FieldTypes or dataType strings individually; with no context, types are deduced from Numpy C{dtype}s (all columns must then be Numpy arrays).
    @type inputData: any dict-like mapping from strings to lists
    @param inputData: Field names mapped to data columns, interpreted by C{context}.
    @type inputMask: dict-like mapping from strings to lists of bool, or None
    @param inputMask: When None, missing data are flagged by C{NaN} values (Pandas convention); otherwise C{NaN} or a True mask entry marks an item as MISSING.
    @type inputState: DataTableState or None
    @param inputState: Starting state; pass a previous calculation's C{dataTable.state} to continue it.
    @raise TypeError: Raised when no context is given and the C{inputData} columns are not all Numpy arrays.
    """
    if isinstance(context, PmmlBinding) and len(context.xpath("ancestor-or-self::pmml:PMML")) != 0:
        # Types are dictated by the PMML DataDictionary.
        dataColumns = OrderedDict()
        for fieldName, fieldDefinition in context.fieldContext().items():
            fieldType = FieldType(fieldDefinition)
            try:
                dataField = inputData[fieldName]
            except KeyError:
                continue    # field declared in PMML but absent from inputData
            # A None inputMask (TypeError) or a missing mask entry (KeyError)
            # both mean "no mask for this column".
            try:
                maskField = inputMask[fieldName]
            except (KeyError, TypeError):
                maskField = None
            if dataField is not None:
                dataColumns[fieldName] = fieldType.toDataColumn(dataField, maskField)

    else:
        if not isinstance(context, dict):
            # Broadcast a single context over all fields.
            context = dict((fieldName, context) for fieldName in inputData)

        contextValues = context.values()

        if all(isinstance(value, FieldType) for value in contextValues):
            # Explicit FieldType objects.
            dataColumns = OrderedDict()
            for fieldName in sorted(context):
                mask = None if inputMask is None else inputMask[fieldName]
                dataColumns[fieldName] = context[fieldName].toDataColumn(inputData[fieldName], mask)

        elif all(isinstance(value, basestring) for value in contextValues):
            # dataType names; "string" is the only categorical case.
            dataColumns = OrderedDict()
            for fieldName in sorted(context):
                dataType = context[fieldName]
                optype = "categorical" if dataType == "string" else "continuous"
                mask = None if inputMask is None else inputMask[fieldName]
                dataColumns[fieldName] = FakeFieldType(dataType, optype).toDataColumn(inputData[fieldName], mask)

        elif all(isinstance(inputData[fieldName], NP.ndarray) for fieldName in inputData):
            # Deduce each field's type from its Numpy dtype.
            stringTypes = (NP.object, NP.object0, NP.object_, NP.str, NP.str_, NP.string0, NP.string_)
            integerTypes = (NP.int, NP.int0, NP.int8, NP.int16, NP.int32, NP.int64, NP.int_, NP.integer)
            floatTypes = (NP.float, NP.__getattr__("float16", noneIfMissing=True), NP.float32)
            doubleTypes = (NP.float64, NP.float128, NP.float_, NP.double)
            booleanTypes = (NP.bool, NP.bool8, NP.bool_)

            dataColumns = OrderedDict()
            for fieldName in sorted(context):
                data = inputData[fieldName]
                mask = None if inputMask is None else inputMask[fieldName]
                dtype = data.dtype
                if dtype in stringTypes or re.match(r"\|S[0-9]+", str(dtype)) is not None:
                    fieldType = FakeFieldType("string", "categorical")
                elif dtype in integerTypes:
                    fieldType = FakeFieldType("integer", "continuous")
                elif dtype in floatTypes:
                    fieldType = FakeFieldType("float", "continuous")
                elif dtype in doubleTypes:
                    fieldType = FakeFieldType("double", "continuous")
                elif dtype in booleanTypes:
                    fieldType = FakeFieldType("boolean", "continuous")
                else:
                    raise TypeError("Unrecognized NumPy dtype: %r" % dtype)
                dataColumns[fieldName] = fieldType.toDataColumn(data, mask)

        else:
            raise TypeError("Context must be PMML (anchored by a <PMML> ancestor), a dictionary of FieldType objects, dataType strings, or inputData must consist entirely of NumPy arrays")

    self._configure(dataColumns, inputState)