Example #1
    def __init__(self, fileLocation, sorter=None, blocksize=4096):
        self.fileNames = getfiles(fileLocation, sorter)
        if len(self.fileNames) == 0:
            raise IOError("No files match \"%s\" (even with wildcards)" %
                          fileLocation)

        self.fields = None
        self.types = None
        self.args = {}
        self.strings = {}

        for fileName in self.fileNames:
            with open(fileName, "rb") as tmpfile:
                header = tmpfile.readline().rstrip()

            headerfields = header.decode("utf-8").split()
            if headerfields[0] != "RecArray":
                raise BadlyFormattedInputData(
                    "NAB file \"%s\" does not begin with 'RecArray'" %
                    fileName)

            self.args[fileName] = dict(
                asciistr(f).split("=") for f in headerfields[1:])

            if "masktype" in self.args.keys():
                raise NotImplementedError(
                    "No support yet for NAB files (such as \"%s\") with masked NumPy arrays"
                    % fileName)

            if set(self.args[fileName].keys()) != set(["formats", "names"]):
                raise BadlyFormattedInputData(
                    "NAB file \"%s\" headers are %s, rather than set([\"formats\", \"names\"])"
                    % (fileName, str(set(self.args[fileName].keys()))))

            thisfields = self.args[fileName]["names"].split(",")
            thistypes = self.args[fileName]["formats"].split(",")
            for i in range(len(thistypes)):
                if thistypes[i][0] == "a":
                    thistypes[i] = "string"
                    self.strings[thisfields[i]] = True
                else:
                    self.strings[thisfields[i]] = False

            if self.fields is None:
                self.fields = thisfields
                self.types = thistypes
            else:
                if self.fields != thisfields:
                    raise IncompatibleFilesInChain(
                        "NAB file \"%s\" header has fields %s, which differ from the first %s"
                        % (fileName, str(thisfields), str(self.fields)))
                if self.types != thistypes:
                    raise IncompatibleFilesInChain(
                        "NAB file \"%s\" header has types %s, which differ from the first %s"
                        % (fileName, str(thistypes), str(self.types)))

            self.args[fileName]["shape"] = blocksize

        self.types = dict(zip(self.fields, self.types))
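
For reference, the header line this constructor parses is a single ASCII line: the literal token RecArray followed by key=value pairs for names and formats. A minimal sketch of such a header (the field names and NumPy format codes below are invented for illustration):

    RecArray names=x,y,label formats=f8,f8,a6

Formats that begin with "a" (fixed-width byte strings, like a6 above) are rewritten to "string" and flagged in self.strings; all other format codes pass through unchanged.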
Example #2
    def __iter__(self):
        parser = self.setupParser()

        if self._checkfile is not None: fileNames = [self._checkfile]
        else: fileNames = self.fileNames

        for fileName in fileNames:
            self._depth = 0
            self._rowtag = None
            self._rowflatten = None
            self._rowdepth = 0
            self._indata = False

            if self._checkfile is not None: self._checktypes = {}
            else: self._checktypes = None
            self._checkfields = None

            with open(fileName) as file:
                while True:
                    self._rows = []
                    data = file.read(self.blocksize)
                    try:
                        parser.Parse(data)
                    except xml.parsers.expat.ExpatError as err:
                        raise BadlyFormattedInputData(
                            "XML reader encountered an error: %s" % str(err))

                    for record in self._rows:
                        if self._checkfile is not None:
                            if self._checkfields is not None:
                                _checktypes = dict(
                                    (f, "string") for f in self._checkfields)
                                _checktypes.update(self._checktypes)
                                yield self._checkfields, _checktypes
                            else:
                                yield record.keys(), self._checktypes

                        elif self.types is not None:
                            for f in self.fields:
                                try:
                                    record[f] = self._types[f](record[f])
                                except KeyError:
                                    pass
                        yield record

                    if len(data) < self.blocksize:
                        break
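
The read/Parse loop above is the standard expat streaming pattern: feed the document in blocksize chunks and let the handlers accumulate rows. A minimal self-contained sketch of that pattern, independent of this class (the element names and chunk boundaries are invented):

    import xml.parsers.expat

    def start(tag, attrib):
        # called once per opening tag as the chunks arrive
        print("start:", tag, attrib)

    parser = xml.parsers.expat.ParserCreate()
    parser.StartElementHandler = start

    for chunk in (b"<table><row><x>1", b"</x></row></table>"):
        parser.Parse(chunk, False)  # feed the document piecewise
    parser.Parse(b"", True)         # signal end of document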
Example #3
    def __iter__(self):
        try:
            reader = csv.reader(self.stream, self.dialect)
            if self.skipHeader:
                next(reader)  # get past the header

            for record in reader:
                if self.types is None:
                    yield record
                else:
                    yield [
                        self._types[f](r) for f, r in zip(self.fields, record)
                    ]

        except csv.Error as err:
            raise BadlyFormattedInputData(
                "CSV reader encountered an error: %s" % str(err))
Example #4
        def start_element(tag, attrib):
            if self._depth == 0:
                if "tag" in attrib:
                    self._rowtag = attrib["tag"]
                if "structure" in attrib:
                    if attrib["structure"] == "flatten":
                        self._rowflatten = True
                    elif attrib["structure"] == "ignore":
                        self._rowflatten = False
                    else:
                        raise BadlyFormattedInputData(
                            "'structure' attribute must either be \"flatten\" or \"ignore\"."
                        )
                if "fields" in attrib:
                    self._checkfields = [
                        x.lstrip().rstrip()
                        for x in attrib["fields"].split(",")
                    ]
                    if "types" in attrib:
                        self._checktypes = dict(
                            zip(self._checkfields, [
                                x.lstrip().rstrip()
                                for x in attrib["types"].split(",")
                            ]))

            elif self._depth == 1 and self._rowtag is None:
                self._rowdepth = 1
                self._thisrow = {}

            elif tag == self._rowtag:
                self._rowdepth = 1
                self._thisrow = {}

            elif self._rowdepth >= 1:
                self._rowdepth += 1
                if self._rowflatten or self._rowdepth == 2:
                    if self._checktypes is not None:
                        if tag not in self._checktypes:
                            if "type" in attrib:
                                self._checktypes[tag] = attrib["type"]
                            else:
                                self._checktypes[tag] = "string"
                    self._thistext = []

            self._depth += 1
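
Taken together, the branches above imply a document shape like the following sketch (element names and values invented for illustration): the root element may carry tag, structure, fields, and types attributes; a record is any element at depth 1 (or any element whose tag matches the root's tag attribute), and its children become fields, each optionally annotated with a type attribute:

    <table structure="flatten" fields="x, y, label" types="double, double, string">
      <row>
        <x>1.0</x>
        <y>2.0</y>
        <label type="string">signal</label>
      </row>
    </table>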
Example #5
    def __iter__(self):
        for fileName in self.fileNames:
            try:
                # use a context manager so each file is closed promptly
                with open(fileName, "r") as csvfile:
                    reader = csv.reader(csvfile, self.dialect)
                    if self.explicitHeader is None:
                        next(reader)  # get past the header

                    for record in reader:
                        if self.types is None:
                            yield record
                        else:
                            yield [
                                self._types[f](r)
                                for f, r in zip(self.fields, record)
                            ]

            except csv.Error as err:
                raise BadlyFormattedInputData(
                    "CSV reader encountered an error: %s" % str(err))
Example #6
    def __iter__(self):
        self._checktypes = None
        parser = self.setupParser()

        self._depth = 0
        self._rowtag = None
        self._rowflatten = None
        self._rowdepth = 0
        self._indata = False

        while True:
            self._rows = []
            data = self.stream.read(self.blocksize)
            try:
                parser.Parse(data)
            except xml.parsers.expat.ExpatError as err:
                raise BadlyFormattedInputData(
                    "XML reader encountered an error: %s" % str(err))

            for record in self._rows:
                if len(set(record.keys()).difference(self.fields)) > 0:
                    self.fields = self.fields.union(record.keys())
                    if self._autoUpdateTypes:
                        if self.types is not None:
                            typeItems = self.types.items()
                        else:
                            typeItems = [(f, "string") for f in self.fields]
                        self._types = dict((f, cast[t]) for f, t in typeItems)

                if self.types is not None:
                    for f in self.fields:
                        if f in record:
                            record[f] = self._types[f](record[f])

                yield record

            if len(data) < self.blocksize:
                break
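
The cast table referenced as cast[t] is not shown in these excerpts; it maps type names to converter callables. A minimal hypothetical stand-in, consistent with how it is used here:

    # hypothetical stand-in; the real mapping lives elsewhere in the package
    cast = {
        "string": str,
        "integer": int,
        "double": float,
    }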
Example #7
def readUniTable(fileLocation,
                 format=None,
                 sorter=None,
                 pageSize=None,
                 mapInvalid=None,
                 mapMissing=None,
                 **parameters):
    format = getformat(fileLocation, format)

    ################################################################ CSV
    if format == "CSV":
        csvInput = CSVStream(fileLocation, sorter, **parameters)

        if csvInput.types is not None:
            types = csvInput.types
        else:
            types = dict((f, "string") for f in csvInput.fields)

        _mapInvalid = dict(
            (f, str("INVALID") if types[f] in ("category",
                                               "string") else -1000)
            for f in csvInput.fields)
        if mapInvalid is None:
            mapInvalid = _mapInvalid
        else:
            _mapInvalid.update(mapInvalid)
            mapInvalid = _mapInvalid

        _mapMissing = dict(
            (f, str("MISSING") if types[f] in ("category",
                                               "string") else -1000)
            for f in csvInput.fields)
        if mapMissing is None:
            mapMissing = _mapMissing
        else:
            _mapMissing.update(mapMissing)
            mapMissing = _mapMissing

        table = UniTable(csvInput.fields, types)
        table.initMemory(pageSize)

        for record in csvInput:
            table.fill([
                mapInvalid[f]
                if r is INVALID else mapMissing[f] if r is MISSING else r
                for f, r in zip(csvInput.fields, record)
            ])

        return table

    ################################################################ XML
    if format == "XML":
        xmlInput = XMLStream(fileLocation, sorter, **parameters)

        if xmlInput.types is not None:
            types = xmlInput.types
        else:
            types = dict((f, "string") for f in xmlInput.fields)

        _mapInvalid = dict(
            (f, str("INVALID") if types[f] in ("category",
                                               "string") else -1000)
            for f in xmlInput.fields)
        if mapInvalid is None:
            mapInvalid = _mapInvalid
        else:
            _mapInvalid.update(mapInvalid)
            mapInvalid = _mapInvalid

        _mapMissing = dict(
            (f, str("MISSING") if types[f] in ("category",
                                               "string") else -1000)
            for f in xmlInput.fields)
        if mapMissing is None:
            mapMissing = _mapMissing
        else:
            _mapMissing.update(mapMissing)
            mapMissing = _mapMissing

        table = UniTable(xmlInput.fields, types)
        table.initMemory(pageSize)

        for record in xmlInput:
            table.fill([
                mapInvalid[f] if r is INVALID else r
                for f, r in [(f, record.get(f, mapMissing[f]))
                             for f in xmlInput.fields]
            ])

        return table

    ################################################################ NAB
    elif format == "NAB":
        fileNames = getfiles(fileLocation, sorter)
        if len(fileNames) == 0:
            raise IOError("No files match \"%s\" (even with wildcards)" %
                          fileLocation)

        fields = None
        types = None
        strings = {}
        args = {}
        for fileName in fileNames:
            with open(fileName, "rb") as nabfile:
                header = nabfile.readline().rstrip()

            headerfields = header.decode("utf-8").split()
            if headerfields[0] != "RecArray":
                raise BadlyFormattedInputData(
                    "NAB file \"%s\" does not begin with 'RecArray'" %
                    fileName)

            args[fileName] = dict(
                asciistr(f).split("=") for f in headerfields[1:])

            if "masktype" in args.keys():
                raise NotImplementedError(
                    "No support yet for NAB files (such as \"%s\") with masked NumPy arrays"
                    % fileName)

            if set(args[fileName].keys()) != set(["formats", "names"]):
                raise BadlyFormattedInputData(
                    "NAB file \"%s\" headers are %s, rather than set([\"formats\", \"names\"])"
                    % (fileName, str(set(args[fileName].keys()))))

            thisfields = args[fileName]["names"].split(",")
            thistypes = args[fileName]["formats"].split(",")
            for i in range(len(thistypes)):
                if thistypes[i][0] == "a":
                    thistypes[i] = "string"
                    strings[thisfields[i]] = True
                else:
                    strings[thisfields[i]] = False

            if fields is None:
                fields = thisfields
                types = thistypes
            else:
                if fields != thisfields:
                    raise IncompatibleFilesInChain(
                        "NAB file \"%s\" header has fields %s, which differ from the first %s"
                        % (fileName, str(thisfields), str(fields)))
                if types != thistypes:
                    raise IncompatibleFilesInChain(
                        "NAB file \"%s\" header has types %s, which differ from the first %s"
                        % (fileName, str(thistypes), str(types)))

        table = UniTable(fields, dict(zip(fields, types)))
        table.pages = []
        table.starts = []
        table.length = 0

        for fileName in fileNames:
            with open(fileName, "rb") as nabfile:
                nabfile.readline()  # skip the ASCII header line
                data = numpy.rec.fromfile(nabfile, **args[fileName])

            table.pageSize = len(data)
            page = UniPage(table.fields, table.types)

            arrays = {}
            for f in table.fields:
                arr = data.field(f)
                if strings[f]:
                    arr = [i.decode("utf-8") for i in arr]
                arrays[f] = arr

            page.initExisting(table.pageSize,
                              arrays,
                              copy=False,
                              stringToCategory=True)
            table.pages.append(page)
            table.starts.append(table.length)
            table.length += len(data)

        return table

    ################################################################ XTBL
    elif format == "XTBL":
        fileNames = getfiles(fileLocation, sorter)
        if len(fileNames) == 0:
            raise IOError("No files match \"%s\" (even with wildcards)" %
                          fileLocation)

        limitGB = parameters.get("limitGB", None)
        memoryMap = parameters.get("memoryMap", False)

        # get the footers from each file (XML) and make sure they have identical DataDictionaries
        footers = []
        for i, fileName in enumerate(fileNames):
            fileSize = os.stat(fileName).st_size
            file = open(fileName, "rb")

            file.seek(max(0, fileSize - 1024))
            text = file.read()
            m = re.search(rb'<SeekFooter\s+byteOffset="([0-9]+)"\s+/>', text)
            if m is not None:
                textStart = int(m.group(1))
            else:
                raise IOError(
                    "File \"%s\" does not have the right format (the <SeekFooter /> element was not found in the last kilobyte)"
                    % fileName)

            file.seek(textStart)

            footer = load(file.read(), xtbl.XTBL)
            footers.append(footer)
            if len(footers) > 1:
                thisDataDictionary = footer.child(xtbl.DataDictionary)
                firstDataDictionary = footers[0].child(xtbl.DataDictionary)

                if thisDataDictionary != firstDataDictionary:
                    for x in thisDataDictionary.matches(
                            xtbl.LookupTable,
                            maxdepth=None) + firstDataDictionary.matches(
                                xtbl.LookupTable, maxdepth=None):
                        x.serialize()
                    raise IncompatibleFilesInChain(
                        "XTBL file \"%s\" is incompatible with the first file \"%s\":%s%s%s%s"
                        % (fileNames[i], fileNames[0], os.linesep,
                           thisDataDictionary.xml(), os.linesep,
                           firstDataDictionary.xml()))

            file.close()

        # set up the UniTable's fields, types, pages, starts, and length
        fields = []
        types = {}
        dtypes = {}
        lookups = {}

        for dataField in footers[0].child(xtbl.DataDictionary).matches(
                xtbl.DataField):
            field = dataField.attrib["name"]
            fields.append(field)
            types[field] = dataField.attrib["type"]
            dtypes[field] = dataField.attrib["dtype"]

            lookup = dataField.child(xtbl.LookupTable, exception=False)
            if lookup is not None:
                lookups[field] = lookup.n_to_v
            else:
                lookups[field] = None

        categories = []
        for f in fields:
            n_to_v = lookups[f]
            if n_to_v is None:
                categories.append(None)
            else:
                v_to_n = dict((v, n) for n, v in n_to_v.items())
                categories.append((v_to_n, n_to_v))

        table = UniTable(fields, types)
        table.pages = []
        table.starts = []
        table.length = 0

        uniPageDiskCacheManager = UniPageDiskCacheManager(limitGB, memoryMap)

        for i, fileName in enumerate(fileNames):
            for xtblpage in footers[i].child(xtbl.Pages).matches(xtbl.Page):
                length = int(xtblpage.attrib["length"])  # coerce in case the attribute is a string

                byteOffsets = {}
                for pageFieldOffset in xtblpage.matches(xtbl.PageFieldOffset):
                    byteOffsets[pageFieldOffset.attrib[
                        "name"]] = pageFieldOffset.attrib["byteOffset"]

                uniPage = UniPageOnDisk(fields, table.types)
                uniPage.initDisk(length, fileName, byteOffsets, dtypes,
                                 categories, uniPageDiskCacheManager)

                table.pages.append(uniPage)
                table.starts.append(table.length)
                table.length += length

        return table
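
A minimal usage sketch for readUniTable, assuming the package and its helpers are importable (the file name and field name below are invented):

    # read a CSV into an in-memory UniTable, substituting 0.0 wherever the
    # hypothetical "score" column is MISSING; other fields keep the defaults
    # built above ("MISSING" for string/category fields, -1000 otherwise)
    table = readUniTable("events.csv", format="CSV",
                         mapMissing={"score": 0.0})

Passing format=None instead lets getformat resolve the format (CSV, XML, NAB, or XTBL) from the file location.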