def __init__(self, fileLocation, sorter=None, blocksize=4096):
    self.fileNames = getfiles(fileLocation, sorter)
    if len(self.fileNames) == 0:
        raise IOError("No files match \"%s\" (even with wildcards)" % fileLocation)

    self.fields = None
    self.types = None
    self.args = {}
    self.strings = {}
    for fileName in self.fileNames:
        # read only the one-line header of each file; the binary payload is read later
        tmpfile = open(fileName, "rb")
        header = tmpfile.readline().rstrip()
        tmpfile.close()

        headerfields = header.decode("utf-8").split()
        if headerfields[0] != "RecArray":
            raise BadlyFormattedInputData("NAB file \"%s\" does not begin with 'RecArray'" % fileName)

        self.args[fileName] = dict(asciistr(f).split("=") for f in headerfields[1:])
        # a "masktype" entry in the header marks a masked array, which is unsupported
        if "masktype" in self.args[fileName]:
            raise NotImplementedError("No support yet for NAB files (such as \"%s\") with masked NumPy arrays" % fileName)
        if set(self.args[fileName].keys()) != set(["formats", "names"]):
            raise BadlyFormattedInputData("NAB file \"%s\" headers are %s, rather than set([\"formats\", \"names\"])" % (fileName, str(set(self.args[fileName].keys()))))

        thisfields = self.args[fileName]["names"].split(",")
        thistypes = self.args[fileName]["formats"].split(",")
        for i in range(len(thistypes)):
            if thistypes[i][0] == "a":
                thistypes[i] = "string"
                self.strings[thisfields[i]] = True
            else:
                self.strings[thisfields[i]] = False

        # every file in the chain must declare the same fields and types as the first
        if self.fields is None:
            self.fields = thisfields
            self.types = thistypes
        else:
            if self.fields != thisfields:
                raise IncompatibleFilesInChain("NAB file \"%s\" header has fields %s, which differ from the first %s" % (fileName, str(thisfields), str(self.fields)))
            if self.types != thistypes:
                raise IncompatibleFilesInChain("NAB file \"%s\" header has types %s, which differ from the first %s" % (fileName, str(thistypes), str(self.types)))

        # read the payload in blocks of `blocksize` records
        self.args[fileName]["shape"] = blocksize

    self.types = dict(zip(self.fields, self.types))
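# Illustrative only (inferred from the parsing above): a NAB file is expected to
# start with a one-line ASCII header naming the record-array layout, e.g.
#
#     RecArray names=x,y,label formats=f8,i8,a10
#
# followed by the binary record data, which is later read with numpy.rec.fromfile.
# Formats beginning with "a" (fixed-width byte strings) are exposed as the "string"
# type. The field names and formats in this example are hypothetical.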
def __iter__(self):
    parser = self.setupParser()

    if self._checkfile is not None:
        fileNames = [self._checkfile]
    else:
        fileNames = self.fileNames

    for fileName in fileNames:
        self._depth = 0
        self._rowtag = None
        self._rowflatten = None
        self._rowdepth = 0
        self._indata = False
        if self._checkfile is not None:
            self._checktypes = {}
        else:
            self._checktypes = None
        self._checkfields = None

        with open(fileName) as file:
            while True:
                self._rows = []
                data = file.read(self.blocksize)
                try:
                    parser.Parse(data)
                except xml.parsers.expat.ExpatError as err:
                    raise BadlyFormattedInputData("XML reader encountered an error: %s" % str(err))

                for record in self._rows:
                    if self._checkfile is not None:
                        if self._checkfields is not None:
                            _checktypes = dict((f, "string") for f in self._checkfields)
                            _checktypes.update(self._checktypes)
                            yield self._checkfields, _checktypes
                        else:
                            yield record.keys(), self._checktypes
                    elif self.types is not None:
                        for f in self.fields:
                            try:
                                record[f] = self._types[f](record[f])
                            except KeyError:
                                pass
                        yield record

                if len(data) < self.blocksize:
                    break
def __iter__(self):
    try:
        reader = csv.reader(self.stream, self.dialect)
        if self.skipHeader:
            next(reader)   # get past the header
        for record in reader:
            if self.types is None:
                yield record
            else:
                yield [self._types[f](r) for f, r in zip(self.fields, record)]
    except csv.Error as err:
        raise BadlyFormattedInputData("CSV reader encountered an error: %s" % str(err))
def start_element(tag, attrib):
    # expat StartElementHandler, defined as a closure so it can update the
    # enclosing reader's state through `self`
    if self._depth == 0:
        # root element: read the table-level attributes
        if "tag" in attrib:
            self._rowtag = attrib["tag"]
        if "structure" in attrib:
            if attrib["structure"] == "flatten":
                self._rowflatten = True
            elif attrib["structure"] == "ignore":
                self._rowflatten = False
            else:
                raise BadlyFormattedInputData("'structure' attribute must either be \"flatten\" or \"ignore\".")
        if "fields" in attrib:
            self._checkfields = [x.lstrip().rstrip() for x in attrib["fields"].split(",")]
            if "types" in attrib:
                self._checktypes = dict(zip(self._checkfields, [x.lstrip().rstrip() for x in attrib["types"].split(",")]))

    elif self._depth == 1 and self._rowtag is None:
        # without a declared row tag, every element at depth 1 starts a new row
        self._rowdepth = 1
        self._thisrow = {}

    elif tag == self._rowtag:
        self._rowdepth = 1
        self._thisrow = {}

    elif self._rowdepth >= 1:
        # inside a row: start collecting text for a field element
        self._rowdepth += 1
        if self._rowflatten or self._rowdepth == 2:
            if self._checktypes is not None:
                if tag not in self._checktypes:
                    if "type" in attrib:
                        self._checktypes[tag] = attrib["type"]
                    else:
                        self._checktypes[tag] = "string"
            self._thistext = []

    self._depth += 1
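# Illustrative only (inferred from the handler above, element names are
# hypothetical): the reader expects documents shaped roughly like
#
#     <table fields="x,y" types="float,int">
#       <row> <x>1.5</x> <y>2</y> </row>
#       <row> <x>3.0</x> <y>4</y> </row>
#     </table>
#
# where the optional "tag", "structure", "fields", and "types" attributes on the
# root element control how row elements are recognized and how fields are typed.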
def __iter__(self):
    for fileName in self.fileNames:
        try:
            reader = csv.reader(open(fileName, "r"), self.dialect)
            if self.explicitHeader is None:
                next(reader)   # get past the header
            for record in reader:
                if self.types is None:
                    yield record
                else:
                    yield [self._types[f](r) for f, r in zip(self.fields, record)]
        except csv.Error as err:
            raise BadlyFormattedInputData("CSV reader encountered an error: %s" % str(err))
def __iter__(self):
    self._checktypes = None
    parser = self.setupParser()

    self._depth = 0
    self._rowtag = None
    self._rowflatten = None
    self._rowdepth = 0
    self._indata = False

    while True:
        self._rows = []
        data = self.stream.read(self.blocksize)
        try:
            parser.Parse(data)
        except xml.parsers.expat.ExpatError as err:
            raise BadlyFormattedInputData("XML reader encountered an error: %s" % str(err))

        for record in self._rows:
            if len(set(record.keys()).difference(self.fields)) > 0:
                self.fields = self.fields.union(record.keys())
                if self._autoUpdateTypes:
                    self._types = dict(
                        (f, cast[t])
                        for f, t in (self.types.items() if self.types is not None
                                     else [(f, "string") for f in self.fields]))

            if self.types is not None:
                for f in self.fields:
                    if f in record:
                        record[f] = self._types[f](record[f])

            yield record

        if len(data) < self.blocksize:
            break
def readUniTable(fileLocation, format=None, sorter=None, pageSize=None, mapInvalid=None, mapMissing=None, **parameters):
    format = getformat(fileLocation, format)

    ################################################################ CSV
    if format == "CSV":
        csvInput = CSVStream(fileLocation, sorter, **parameters)

        if csvInput.types is not None:
            types = csvInput.types
        else:
            types = dict((f, "string") for f in csvInput.fields)

        # default replacements for invalid and missing values, overridable per field
        _mapInvalid = dict((f, str("INVALID") if types[f] in ("category", "string") else -1000) for f in csvInput.fields)
        if mapInvalid is None:
            mapInvalid = _mapInvalid
        else:
            _mapInvalid.update(mapInvalid)
            mapInvalid = _mapInvalid

        _mapMissing = dict((f, str("MISSING") if types[f] in ("category", "string") else -1000) for f in csvInput.fields)
        if mapMissing is None:
            mapMissing = _mapMissing
        else:
            _mapMissing.update(mapMissing)
            mapMissing = _mapMissing

        table = UniTable(csvInput.fields, types)
        table.initMemory(pageSize)
        for record in csvInput:
            table.fill([mapInvalid[f] if r is INVALID else mapMissing[f] if r is MISSING else r
                        for f, r in zip(csvInput.fields, record)])
        return table

    ################################################################ XML
    elif format == "XML":
        xmlInput = XMLStream(fileLocation, sorter, **parameters)

        if xmlInput.types is not None:
            types = xmlInput.types
        else:
            types = dict((f, "string") for f in xmlInput.fields)

        _mapInvalid = dict((f, str("INVALID") if types[f] in ("category", "string") else -1000) for f in xmlInput.fields)
        if mapInvalid is None:
            mapInvalid = _mapInvalid
        else:
            _mapInvalid.update(mapInvalid)
            mapInvalid = _mapInvalid

        _mapMissing = dict((f, str("MISSING") if types[f] in ("category", "string") else -1000) for f in xmlInput.fields)
        if mapMissing is None:
            mapMissing = _mapMissing
        else:
            _mapMissing.update(mapMissing)
            mapMissing = _mapMissing

        table = UniTable(xmlInput.fields, types)
        table.initMemory(pageSize)
        for record in xmlInput:
            table.fill([mapInvalid[f] if r is INVALID else r
                        for f, r in [(f, record.get(f, mapMissing[f])) for f in xmlInput.fields]])
        return table

    ################################################################ NAB
    elif format == "NAB":
        fileNames = getfiles(fileLocation, sorter)
        if len(fileNames) == 0:
            raise IOError("No files match \"%s\" (even with wildcards)" % fileLocation)

        fields = None
        types = None
        strings = {}
        args = {}
        for fileName in fileNames:
            # read only the one-line header; the binary payload is read below
            file = open(fileName, "rb")
            header = file.readline().rstrip()
            file.close()

            headerfields = header.decode("utf-8").split()
            if headerfields[0] != "RecArray":
                raise BadlyFormattedInputData("NAB file \"%s\" does not begin with 'RecArray'" % fileName)

            args[fileName] = dict(asciistr(f).split("=") for f in headerfields[1:])
            if "masktype" in args[fileName]:
                raise NotImplementedError("No support yet for NAB files (such as \"%s\") with masked NumPy arrays" % fileName)
            if set(args[fileName].keys()) != set(["formats", "names"]):
                raise BadlyFormattedInputData("NAB file \"%s\" headers are %s, rather than set([\"formats\", \"names\"])" % (fileName, str(set(args[fileName].keys()))))

            thisfields = args[fileName]["names"].split(",")
            thistypes = args[fileName]["formats"].split(",")
            for i in range(len(thistypes)):
                if thistypes[i][0] == "a":
                    thistypes[i] = "string"
                    strings[thisfields[i]] = True
                else:
                    strings[thisfields[i]] = False

            if fields is None:
                fields = thisfields
                types = thistypes
            else:
                if fields != thisfields:
                    raise IncompatibleFilesInChain("NAB file \"%s\" header has fields %s, which differ from the first %s" % (fileName, str(thisfields), str(fields)))
                if types != thistypes:
                    raise IncompatibleFilesInChain("NAB file \"%s\" header has types %s, which differ from the first %s" % (fileName, str(thistypes), str(types)))
        table = UniTable(fields, dict(zip(fields, types)))
        table.pages = []
        table.starts = []
        table.length = 0

        for fileName in fileNames:
            file = open(fileName, "rb")
            file.readline()   # skip the header; the rest of the file is the record array
            data = numpy.rec.fromfile(file, **args[fileName])

            table.pageSize = len(data)
            page = UniPage(table.fields, table.types)

            arrays = {}
            for f in table.fields:
                arr = data.field(f)
                if strings[f]:
                    arr = [i.decode("utf-8") for i in arr]
                arrays[f] = arr

            page.initExisting(table.pageSize, arrays, copy=False, stringToCategory=True)
            table.pages.append(page)
            table.starts.append(table.length)
            table.length += len(data)

        return table

    ################################################################ XTBL
    elif format == "XTBL":
        fileNames = getfiles(fileLocation, sorter)
        if len(fileNames) == 0:
            raise IOError("No files match \"%s\" (even with wildcards)" % fileLocation)

        limitGB = parameters.get("limitGB", None)
        memoryMap = parameters.get("memoryMap", False)

        # get the footers from each file (XML) and make sure they have identical DataDictionaries
        footers = []
        for i, fileName in enumerate(fileNames):
            fileSize = os.stat(fileName).st_size
            file = open(fileName, "rb")
            file.seek(max(0, fileSize - 1024))

            # the <SeekFooter /> element lies within the last kilobyte; the file is
            # opened in binary mode, so match against a bytes pattern
            text = file.read()
            m = re.search(br'<SeekFooter\s+byteOffset="([0-9]+)"\s+/>', text)
            if m is not None:
                textStart = int(m.group(1))
            else:
                raise IOError("File \"%s\" does not have the right format (the <SeekFooter /> element was not found in the last kilobyte)" % fileName)

            file.seek(textStart)
            footer = load(file.read(), xtbl.XTBL)
            footers.append(footer)

            if len(footers) > 1:
                thisDataDictionary = footer.child(xtbl.DataDictionary)
                firstDataDictionary = footers[0].child(xtbl.DataDictionary)
                if thisDataDictionary != firstDataDictionary:
                    for x in thisDataDictionary.matches(xtbl.LookupTable, maxdepth=None) + firstDataDictionary.matches(xtbl.LookupTable, maxdepth=None):
                        x.serialize()
                    raise IncompatibleFilesInChain("XTBL file \"%s\" is incompatible with the first file \"%s\":%s%s%s%s" % (fileNames[i], fileNames[0], os.linesep, thisDataDictionary.xml(), os.linesep, firstDataDictionary.xml()))

            file.close()

        # set up the UniTable's fields, types, pages, starts, and length
        fields = []
        types = {}
        dtypes = {}
        lookups = {}
        for dataField in footers[0].child(xtbl.DataDictionary).matches(xtbl.DataField):
            field = dataField.attrib["name"]
            fields.append(field)
            types[field] = dataField.attrib["type"]
            dtypes[field] = dataField.attrib["dtype"]

            lookup = dataField.child(xtbl.LookupTable, exception=False)
            if lookup is not None:
                lookups[field] = lookup.n_to_v
            else:
                lookups[field] = None

        categories = []
        for f in fields:
            n_to_v = lookups[f]
            if n_to_v is None:
                categories.append(None)
            else:
                v_to_n = dict((v, n) for n, v in n_to_v.items())
                categories.append((v_to_n, n_to_v))

        table = UniTable(fields, types)
        table.pages = []
        table.starts = []
        table.length = 0

        uniPageDiskCacheManager = UniPageDiskCacheManager(limitGB, memoryMap)

        for i, fileName in enumerate(fileNames):
            for xtblpage in footers[i].child(xtbl.Pages).matches(xtbl.Page):
                length = xtblpage.attrib["length"]

                byteOffsets = {}
                for pageFieldOffset in xtblpage.matches(xtbl.PageFieldOffset):
                    byteOffsets[pageFieldOffset.attrib["name"]] = pageFieldOffset.attrib["byteOffset"]

                uniPage = UniPageOnDisk(fields, table.types)
                uniPage.initDisk(length, fileName, byteOffsets, dtypes, categories, uniPageDiskCacheManager)

                table.pages.append(uniPage)
                table.starts.append(table.length)
                table.length += length
        return table
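# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the library): the file names below
# are hypothetical, and "format" can be omitted if getformat() can infer it
# from fileLocation.
#
#     table = readUniTable("events-*.csv", format="CSV", pageSize=10000)
#     table = readUniTable("events.xtbl", format="XTBL", memoryMap=True)
#
# CSV, XML, and NAB input is loaded into in-memory pages; XTBL input is paged
# from disk through a UniPageDiskCacheManager (optionally memory-mapped and
# capped with limitGB).
# ---------------------------------------------------------------------------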