Example #1
0
    def __iter__(self):
        types = {}
        for name, fieldType in self.namesToFieldTypes.items():
            name = self._removeUnicode(name)

            if fieldType.dataType == "string":
                if fieldType.optype == "continuous":
                    types[name] = "string"
                elif fieldType.optype in ("categorical", "ordinal"):
                    types[name] = "category"
            elif fieldType.dataType in ("integer", "dateDaysSince[0]",
                                        "dateDaysSince[1960]",
                                        "dateDaysSince[1970]",
                                        "dateDaysSince[1980]", "timeSeconds",
                                        "dateTimeSecondsSince[0]",
                                        "dateTimeSecondsSince[1960]",
                                        "dateTimeSecondsSince[1970]",
                                        "dateTimeSecondsSince[1980]"):
                types[name] = "integer"
            elif fieldType.dataType == "double":
                types[name] = "double"

            if name not in types:
                raise TypeError("Cannot match %r to an extraction type" %
                                fieldType)

        if len(types) == 0:
            raise TypeError("At least one field must be selected")

        namesToAvroPaths = self._removeUnicode(self.namesToAvroPaths)

        if self.inputState is None:
            self.inputState = DataTableState()

        for fileName in self.fileNames:
            inputStream = InputStream()

            inputStream.start(fileName, self.chunkSize, namesToAvroPaths,
                              types)
            try:
                while True:
                    arrays = inputStream.next()
                    yield DataTable.buildManually(self.namesToFieldTypes,
                                                  arrays,
                                                  inputState=self.inputState)
                    if len(arrays.values()[0]) < self.chunkSize:
                        break
            except Exception:
                raise
            finally:
                inputStream.close()
    def __iter__(self):
        types = {}
        for name, fieldType in self.namesToFieldTypes.items():
            name = self._removeUnicode(name)

            if fieldType.dataType == "string":
                if fieldType.optype == "continuous":
                    types[name] = "string"
                elif fieldType.optype in ("categorical", "ordinal"):
                    types[name] = "category"
            elif fieldType.dataType in ("integer", "dateDaysSince[0]", "dateDaysSince[1960]", "dateDaysSince[1970]", "dateDaysSince[1980]", "timeSeconds", "dateTimeSecondsSince[0]", "dateTimeSecondsSince[1960]", "dateTimeSecondsSince[1970]", "dateTimeSecondsSince[1980]"):
                types[name] = "integer"
            elif fieldType.dataType == "double":
                types[name] = "double"

            if name not in types:
                raise TypeError("Cannot match %r to an extraction type" % fieldType)

        if len(types) == 0:
            raise TypeError("At least one field must be selected")

        namesToAvroPaths = self._removeUnicode(self.namesToAvroPaths)

        if self.inputState is None:
            self.inputState = DataTableState()

        for fileName in self.fileNames:
            inputStream = InputStream()

            inputStream.start(fileName, self.chunkSize, namesToAvroPaths, types)
            try:
                while True:
                    arrays = inputStream.next()
                    yield DataTable.buildManually(self.namesToFieldTypes, arrays, inputState=self.inputState)
                    if len(arrays.values()[0]) < self.chunkSize:
                        break
            except Exception:
                raise
            finally:
                inputStream.close()
Example #3
0
    def __init__(self,
                 fileNames,
                 namesToFieldTypes=None,
                 namesToAvroPaths=None,
                 inputState=None,
                 chunkSize=1000000):
        if InputStream is None:
            raise RuntimeError(
                "The optional augustus.avrostream module is required for \"AvroDataTableStream\" but it hasn't been installed or the Avro C++ library is not accessible;%sRecommendations: re-build Augustus with \"python setup.py install --with-avrostream\" or correct your LD_LIBRARY_PATH"
                % os.linesep)

        if isinstance(fileNames, basestring):
            self.fileNames = glob.glob(fileNames)
            if len(self.fileNames) == 0:
                raise IOError("No files matched the fileName pattern \"%s\"" %
                              fileNames)
        else:
            self.fileNames = fileNames

        self.schema = None
        for fileName in self.fileNames:
            inputStream = InputStream()
            inputStream.start(fileName, 0, {}, {})
            try:
                schema = json.loads(inputStream.schema())
                if self.schema is not None and schema != self.schema:
                    raise ValueError(
                        "these files do not all have the same schema")
                self.schema = schema
            except Exception:
                raise
            finally:
                inputStream.close()

        if self.schema["type"] != "record":
            raise TypeError(
                "Top level of schema must describe a record, not %r" %
                self.schema)

        if namesToFieldTypes is None:
            if namesToAvroPaths is None:
                namesToFieldTypes = dict(
                    (x["name"], None) for x in self.schema["fields"])

                # If no parameters are given and this is a map-reduce result, drill down and get the values.
                if set(namesToFieldTypes.keys()) == set(["key", "value"]) and [
                        x["type"]
                        for x in self.schema["fields"] if x["name"] == "key"
                ][0] == "string" and [
                        x["type"]
                        for x in self.schema["fields"] if x["name"] == "value"
                ][0]["type"] == "record":
                    del namesToFieldTypes["value"]
                    namesToAvroPaths = {"key": ("key", )}
                    for x in [
                            x["type"] for x in self.schema["fields"]
                            if x["name"] == "value"
                    ][0]["fields"]:
                        name = x["name"]
                        if name != "key":
                            namesToFieldTypes[name] = None
                            namesToAvroPaths[name] = ("value", name)

            else:
                namesToFieldTypes = dict((x, None) for x in namesToAvroPaths)

        if isinstance(namesToFieldTypes, (list, tuple)):
            namesToFieldTypes = dict((x, None) for x in namesToFieldTypes)

        if namesToAvroPaths is None:
            self.namesToAvroPaths = {}
            for name in namesToFieldTypes:
                self.namesToAvroPaths[name] = (name, )
        else:
            self.namesToAvroPaths = dict(namesToAvroPaths)
            for name, path in self.namesToAvroPaths.items():
                if isinstance(path, basestring):
                    self.namesToAvroPaths[name] = (path, )

        self.namesToFieldTypes = dict(namesToFieldTypes)
        for name, fieldType in namesToFieldTypes.items():
            schemaObject = self.schema
            path = self.namesToAvroPaths[name]

            for pathname in path:
                if schemaObject["type"] == "record":
                    pass
                elif isinstance(
                        schemaObject["type"],
                        dict) and schemaObject["type"].get("type") == "record":
                    schemaObject = schemaObject["type"]
                else:
                    raise LookupError("path %r not found in the schema" %
                                      (path, ))

                fieldNames = [x["name"] for x in schemaObject["fields"]]
                if pathname not in fieldNames:
                    raise LookupError("path %r not found in the schema" %
                                      (path, ))

                schemaObject, = (x for x in schemaObject["fields"]
                                 if x["name"] == pathname)

            avroType = schemaObject["type"]
            if isinstance(avroType, dict):
                avroType = avroType["type"]

            if avroType == "enum":
                values = [
                    FakeFieldValue(x) for x in schemaObject["type"]["symbols"]
                ]
            else:
                values = []

            if fieldType == "string":
                self.namesToFieldTypes[name] = FakeFieldType(
                    "string", "continuous")
            elif fieldType == "categorical":
                self.namesToFieldTypes[name] = FakeFieldType("string",
                                                             "categorical",
                                                             values=values)
                self._setupMaps(self.namesToFieldTypes[name])
            elif fieldType == "ordinal":
                self.namesToFieldTypes[name] = FakeFieldType("string",
                                                             "ordinal",
                                                             values=values)
                self._setupMaps(self.namesToFieldTypes[name])
            elif isinstance(fieldType, basestring):
                self.namesToFieldTypes[name] = FakeFieldType(
                    fieldType, "continuous")
            elif fieldType is None:
                if avroType in ("null", "record", "array", "map", "fixed"):
                    del self.namesToFieldTypes[name]
                    del self.namesToAvroPaths[name]
                elif avroType in ("boolean", "int", "long"):
                    self.namesToFieldTypes[name] = FakeFieldType(
                        "integer", "continuous")
                elif avroType in ("float", "double"):
                    self.namesToFieldTypes[name] = FakeFieldType(
                        "double", "continuous")
                elif avroType in ("bytes", "string"):
                    self.namesToFieldTypes[name] = FakeFieldType(
                        "string", "continuous")
                elif avroType == "enum":
                    self.namesToFieldTypes[name] = FakeFieldType("string",
                                                                 "categorical",
                                                                 values=values)
                    self._setupMaps(self.namesToFieldTypes[name])
                else:
                    raise TypeError("Unrecognized Avro type: %s" % avroType)

            if name in self.namesToFieldTypes:
                fieldType = self.namesToFieldTypes[name]
                if not isinstance(fieldType, FieldType):
                    raise TypeError("namesToFieldTypes must map to FieldTypes")

                # TODO: make this more sensible

                if fieldType.dataType in ("date", "time", "dateTime",
                                          "dateDaysSince[0]",
                                          "dateDaysSince[1960]",
                                          "dateDaysSince[1970]",
                                          "dateDaysSince[1980]", "timeSeconds",
                                          "dateTimeSecondsSince[0]",
                                          "dateTimeSecondsSince[1960]",
                                          "dateTimeSecondsSince[1970]",
                                          "dateTimeSecondsSince[1980]"):
                    raise NotImplementedError

                if fieldType.dataType == "object":
                    raise TypeError(
                        "PMML type %r and Avro type \"%s\" are incompatible" %
                        (fieldType, avroType))

                elif fieldType.dataType == "string":
                    if fieldType.optype == "continuous":
                        if avroType not in ("boolean", "int", "long", "float",
                                            "double", "string", "bytes"):
                            raise TypeError(
                                "PMML type %r and Avro type \"%s\" are incompatible"
                                % (fieldType, avroType))
                    elif fieldType.optype == "categorical":
                        if avroType != "enum":
                            raise TypeError(
                                "PMML type %r and Avro type \"%s\" are incompatible"
                                % (fieldType, avroType))
                    elif fieldType.optype == "ordinal":
                        if avroType != "enum":
                            raise TypeError(
                                "PMML type %r and Avro type \"%s\" are incompatible"
                                % (fieldType, avroType))

                elif fieldType.dataType in ("boolean", "integer",
                                            "dateDaysSince[0]",
                                            "dateDaysSince[1960]",
                                            "dateDaysSince[1970]",
                                            "dateDaysSince[1980]",
                                            "timeSeconds",
                                            "dateTimeSecondsSince[0]",
                                            "dateTimeSecondsSince[1960]",
                                            "dateTimeSecondsSince[1970]",
                                            "dateTimeSecondsSince[1980]"):
                    if avroType not in ("boolean", "int", "long"):
                        raise TypeError(
                            "PMML type %r and Avro type \"%s\" are incompatible"
                            % (fieldType, avroType))

                elif fieldType.dataType in ("float", "double"):
                    if avroType not in ("boolean", "int", "long", "float",
                                        "double"):
                        raise TypeError(
                            "PMML type %r and Avro type \"%s\" are incompatible"
                            % (fieldType, avroType))

                elif fieldType.dataType == "boolean":
                    raise TypeError(
                        "PMML type %r and Avro type \"%s\" are incompatible" %
                        (fieldType, avroType))

                elif fieldType.dataType in ("date", "time", "dateTime"):
                    if avroType != "string":
                        raise TypeError(
                            "PMML type %r and Avro type \"%s\" are incompatible"
                            % (fieldType, avroType))

        self.inputState = inputState
        self.chunkSize = chunkSize
    def __init__(self, fileNames, namesToFieldTypes=None, namesToAvroPaths=None, inputState=None, chunkSize=1000000):
        if InputStream is None:
            raise RuntimeError("The optional augustus.avrostream module is required for \"AvroDataTableStream\" but it hasn't been installed or the Avro C++ library is not accessible;%sRecommendations: re-build Augustus with \"python setup.py install --with-avrostream\" or correct your LD_LIBRARY_PATH" % os.linesep)

        if isinstance(fileNames, basestring):
            self.fileNames = glob.glob(fileNames)
            if len(self.fileNames) == 0:
                raise IOError("No files matched the fileName pattern \"%s\"" % fileNames)
        else:
            self.fileNames = fileNames

        self.schema = None
        for fileName in self.fileNames:
            inputStream = InputStream()
            inputStream.start(fileName, 0, {}, {})
            try:
                schema = json.loads(inputStream.schema())
                if self.schema is not None and schema != self.schema:
                    raise ValueError("these files do not all have the same schema")
                self.schema = schema
            except Exception:
                raise
            finally:
                inputStream.close()

        if self.schema["type"] != "record":
            raise TypeError("Top level of schema must describe a record, not %r" % self.schema)

        if namesToFieldTypes is None:
            if namesToAvroPaths is None:
                namesToFieldTypes = dict((x["name"], None) for x in self.schema["fields"])

                # If no parameters are given and this is a map-reduce result, drill down and get the values.
                if set(namesToFieldTypes.keys()) == set(["key", "value"]) and [x["type"] for x in self.schema["fields"] if x["name"] == "key"][0] == "string" and [x["type"] for x in self.schema["fields"] if x["name"] == "value"][0]["type"] == "record":
                    del namesToFieldTypes["value"]
                    namesToAvroPaths = {"key": ("key",)}
                    for x in [x["type"] for x in self.schema["fields"] if x["name"] == "value"][0]["fields"]:
                        name = x["name"]
                        if name != "key":
                            namesToFieldTypes[name] = None
                            namesToAvroPaths[name] = ("value", name)

            else:
                namesToFieldTypes = dict((x, None) for x in namesToAvroPaths)

        if isinstance(namesToFieldTypes, (list, tuple)):
            namesToFieldTypes = dict((x, None) for x in namesToFieldTypes)

        if namesToAvroPaths is None:
            self.namesToAvroPaths = {}
            for name in namesToFieldTypes:
                self.namesToAvroPaths[name] = (name,)
        else:
            self.namesToAvroPaths = dict(namesToAvroPaths)
            for name, path in self.namesToAvroPaths.items():
                if isinstance(path, basestring):
                    self.namesToAvroPaths[name] = (path,)

        self.namesToFieldTypes = dict(namesToFieldTypes)
        for name, fieldType in namesToFieldTypes.items():
            schemaObject = self.schema
            path = self.namesToAvroPaths[name]

            for pathname in path:
                if schemaObject["type"] == "record":
                    pass
                elif isinstance(schemaObject["type"], dict) and schemaObject["type"].get("type") == "record":
                    schemaObject = schemaObject["type"]
                else:
                    raise LookupError("path %r not found in the schema" % (path,))
                
                fieldNames = [x["name"] for x in schemaObject["fields"]]
                if pathname not in fieldNames:
                    raise LookupError("path %r not found in the schema" % (path,))

                schemaObject, = (x for x in schemaObject["fields"] if x["name"] == pathname)

            avroType = schemaObject["type"]
            if isinstance(avroType, dict):
                avroType = avroType["type"]

            if avroType == "enum":
                values = [FakeFieldValue(x) for x in schemaObject["type"]["symbols"]]
            else:
                values = []

            if fieldType == "string":
                self.namesToFieldTypes[name] = FakeFieldType("string", "continuous")
            elif fieldType == "categorical":
                self.namesToFieldTypes[name] = FakeFieldType("string", "categorical", values=values)
                self._setupMaps(self.namesToFieldTypes[name])
            elif fieldType == "ordinal":
                self.namesToFieldTypes[name] = FakeFieldType("string", "ordinal", values=values)
                self._setupMaps(self.namesToFieldTypes[name])
            elif isinstance(fieldType, basestring):
                self.namesToFieldTypes[name] = FakeFieldType(fieldType, "continuous")
            elif fieldType is None:
                if avroType in ("null", "record", "array", "map", "fixed"):
                    del self.namesToFieldTypes[name]
                    del self.namesToAvroPaths[name]
                elif avroType in ("boolean", "int", "long"):
                    self.namesToFieldTypes[name] = FakeFieldType("integer", "continuous")
                elif avroType in ("float", "double"):
                    self.namesToFieldTypes[name] = FakeFieldType("double", "continuous")
                elif avroType in ("bytes", "string"):
                    self.namesToFieldTypes[name] = FakeFieldType("string", "continuous")
                elif avroType == "enum":
                    self.namesToFieldTypes[name] = FakeFieldType("string", "categorical", values=values)
                    self._setupMaps(self.namesToFieldTypes[name])
                else:
                    raise TypeError("Unrecognized Avro type: %s" % avroType)

            if name in self.namesToFieldTypes:
                fieldType = self.namesToFieldTypes[name]
                if not isinstance(fieldType, FieldType):
                    raise TypeError("namesToFieldTypes must map to FieldTypes")

                # TODO: make this more sensible

                if fieldType.dataType in ("date", "time", "dateTime", "dateDaysSince[0]", "dateDaysSince[1960]", "dateDaysSince[1970]", "dateDaysSince[1980]", "timeSeconds", "dateTimeSecondsSince[0]", "dateTimeSecondsSince[1960]", "dateTimeSecondsSince[1970]", "dateTimeSecondsSince[1980]"):
                    raise NotImplementedError

                if fieldType.dataType == "object":
                    raise TypeError("PMML type %r and Avro type \"%s\" are incompatible" % (fieldType, avroType))

                elif fieldType.dataType == "string":
                    if fieldType.optype == "continuous":
                        if avroType not in ("boolean", "int", "long", "float", "double", "string", "bytes"):
                            raise TypeError("PMML type %r and Avro type \"%s\" are incompatible" % (fieldType, avroType))
                    elif fieldType.optype == "categorical":
                        if avroType != "enum":
                            raise TypeError("PMML type %r and Avro type \"%s\" are incompatible" % (fieldType, avroType))
                    elif fieldType.optype == "ordinal":
                        if avroType != "enum":
                            raise TypeError("PMML type %r and Avro type \"%s\" are incompatible" % (fieldType, avroType))

                elif fieldType.dataType in ("boolean", "integer", "dateDaysSince[0]", "dateDaysSince[1960]", "dateDaysSince[1970]", "dateDaysSince[1980]", "timeSeconds", "dateTimeSecondsSince[0]", "dateTimeSecondsSince[1960]", "dateTimeSecondsSince[1970]", "dateTimeSecondsSince[1980]"):
                    if avroType not in ("boolean", "int", "long"):
                        raise TypeError("PMML type %r and Avro type \"%s\" are incompatible" % (fieldType, avroType))

                elif fieldType.dataType in ("float", "double"):
                    if avroType not in ("boolean", "int", "long", "float", "double"):
                        raise TypeError("PMML type %r and Avro type \"%s\" are incompatible" % (fieldType, avroType))

                elif fieldType.dataType == "boolean":
                    raise TypeError("PMML type %r and Avro type \"%s\" are incompatible" % (fieldType, avroType))

                elif fieldType.dataType in ("date", "time", "dateTime"):
                    if avroType != "string":
                        raise TypeError("PMML type %r and Avro type \"%s\" are incompatible" % (fieldType, avroType))

        self.inputState = inputState
        self.chunkSize = chunkSize