def initMemory(self, pageSize=None):
    """Initialize an empty in-memory table consisting of one blank page.

    Parameters:
        pageSize: records per page; None selects default["pageSize"].

    Raises:
        ValueError: if the resolved pageSize is not positive.
    """
    if pageSize is None:
        pageSize = default["pageSize"]
    if pageSize <= 0:
        raise ValueError("UniTable pageSize must be positive (not %d)" % pageSize)
    self.pageSize = pageSize
    # start with a single empty page; starts[0] is its record offset
    firstPage = UniPage(self.fields, self.types)
    firstPage.initMemory(self.pageSize)
    self.pages, self.starts, self.length = [firstPage], [0], 0
def initExisting(self, values, copy=True):
    """Wrap pre-built, equal-length field arrays as a one-page table.

    Parameters:
        values: dict mapping each field name to its array of values.
        copy: if True, the page copies the arrays; if False, it keeps
            references to them.

    Raises:
        ValueError: if the keys of values differ from this table's fields,
            or the arrays do not all have the same length.
    """
    if set(values.keys()) != set(self.fields):
        raise ValueError("fields in initExisting (%s) differ from original fields (%s)" % (str(set(values.keys())), str(set(self.fields))))
    lengths = [len(arr) for arr in values.values()]
    if any(l != lengths[0] for l in lengths):
        raise ValueError("arrays have different lengths: %s" % lengths)
    # one fully-populated page; page size equals the (common) array length
    self.pageSize = lengths[0]
    onlyPage = UniPage(self.fields, self.types)
    onlyPage.initExisting(self.pageSize, values, copy=copy)
    self.pages = [onlyPage]
    self.starts = [0]
    self.length = self.pageSize
def fill(self, values):
    """Append one record to the table, growing a new page when the
    current page is full.

    Parameters:
        values: one record's values (passed through to UniPage.fill).

    Raises:
        UninitializedError: if initMemory/initExisting was never called
            (so self.pages does not exist).
    """
    try:
        # fast path: the last page still has room
        self.pages[-1].fill(values)
        self.length += 1
    except BeyondPageException:
        # last page is full: record the next page's record offset first
        # (previous start + the full page's allocation), flush, then
        # open a fresh page that inherits the category mappings
        self.starts.append(self.starts[-1] + self.pages[-1].allocation)
        self._writing()
        page = UniPage(self.fields, self.types)
        page.initMemory(self.pageSize)
        page.categories = self.pages[-1].categories
        self.pages.append(page)
        self.pages[-1].fill(values)
        self.length += 1
        self._cullPages()
    except AttributeError:
        # NOTE(review): this assumes the only AttributeError comes from a
        # missing self.pages; any other AttributeError raised inside
        # UniPage.fill would be masked as UninitializedError — confirm.
        raise UninitializedError(
            "UniTable initMemory or initExisting must be called before fill"
        )
def fillpage(self, values, copy=True):
    """Append a whole page of pre-built, equal-length arrays to the table.

    Parameters:
        values: dict mapping each field name to its array of values.
        copy: if True, the new page copies the arrays; if False, it keeps
            references to them.

    Raises:
        ValueError: if the keys of values differ from this table's fields,
            or the arrays do not all have the same length.
        UninitializedError: if initMemory/initExisting was never called.
    """
    if set(values.keys()) != set(self.fields):
        # bug fix: this message previously said "initExisting" — a
        # copy-paste from that method — which misdirected debugging
        raise ValueError(
            "fields in fillpage (%s) differ from original fields (%s)"
            % (str(set(values.keys())), str(set(self.fields))))
    lengths = [len(arr) for arr in values.values()]
    if any(l != lengths[0] for l in lengths):
        raise ValueError("arrays have different lengths: %s" % lengths)
    if not hasattr(self, "pages"):
        raise UninitializedError(
            "UniTable initMemory or initExisting must be called before fillpage"
        )
    page = UniPage(self.fields, self.types)
    page.initExisting(lengths[0], values, copy=copy)
    if self.pages[-1].length == 0:
        # empty page (probably just called initMemory); replace it and
        # leave the starts list as it is
        self.pages[-1] = page
    else:
        # non-empty page (either full from fillpage() or partially full
        # from fill()); write it and add this new page.
        # _writing() has protection against being called twice
        self._writing()
        self.pages.append(page)
        self.starts.append(self.starts[-1] + lengths[0])
    # either way, update lengths
    self.length += lengths[0]
    # write out the new page and cull any excess
    # (_cullPages() also has protection against being called twice)
    self._writing()
    self._cullPages()
def readUniTable(fileLocation, format=None, sorter=None, pageSize=None, mapInvalid=None, mapMissing=None, **parameters):
    """Read a UniTable from fileLocation in one of four formats.

    The format is resolved by getformat(fileLocation, format); the branches
    below handle "CSV", "XML", "NAB" (NumPy binary record arrays, possibly
    a chain of files), and "XTBL" (paged on-disk tables with XML footers).
    Returns a populated UniTable.

    Parameters:
        fileLocation: file path; NAB/XTBL accept wildcards via getfiles.
        format: explicit format name, or None to auto-detect.
        sorter: passed through to the input stream / getfiles.
        pageSize: in-memory page size (used by the CSV and XML branches).
        mapInvalid: per-field replacements for INVALID cells, merged over
            built-in defaults ("INVALID" for category/string, -1000 else).
        mapMissing: per-field replacements for MISSING cells, merged over
            built-in defaults ("MISSING" for category/string, -1000 else).
        **parameters: extra stream options; the XTBL branch also reads
            "limitGB" and "memoryMap" from here.

    Raises:
        IOError: no files match, or an XTBL footer cannot be located.
        BadlyFormattedInputData, IncompatibleFilesInChain,
        NotImplementedError: malformed or inconsistent NAB/XTBL inputs.
    """
    format = getformat(fileLocation, format)

    ################################################################ CSV
    if format == "CSV":
        csvInput = CSVStream(fileLocation, sorter, **parameters)
        # fall back to "string" for every field if the stream has no types
        if csvInput.types is not None:
            types = csvInput.types
        else:
            types = dict((f, "string") for f in csvInput.fields)
        # defaults for INVALID cells, then overlay any caller-supplied map
        _mapInvalid = dict((f, str("INVALID") if types[f] in ("category", "string") else -1000) for f in csvInput.fields)
        if mapInvalid is None:
            mapInvalid = _mapInvalid
        else:
            _mapInvalid.update(mapInvalid)
            mapInvalid = _mapInvalid
        # defaults for MISSING cells, then overlay any caller-supplied map
        _mapMissing = dict((f, str("MISSING") if types[f] in ("category", "string") else -1000) for f in csvInput.fields)
        if mapMissing is None:
            mapMissing = _mapMissing
        else:
            _mapMissing.update(mapMissing)
            mapMissing = _mapMissing
        table = UniTable(csvInput.fields, types)
        table.initMemory(pageSize)
        for record in csvInput:
            # substitute per-field INVALID/MISSING replacements as we fill
            table.fill([mapInvalid[f] if r is INVALID else mapMissing[f] if r is MISSING else r for f, r in zip(csvInput.fields, record)])
        return table

    ################################################################ XML
    if format == "XML":
        xmlInput = XMLStream(fileLocation, sorter, **parameters)
        # fall back to "string" for every field if the stream has no types
        if xmlInput.types is not None:
            types = xmlInput.types
        else:
            types = dict((f, "string") for f in xmlInput.fields)
        # defaults for INVALID cells, then overlay any caller-supplied map
        _mapInvalid = dict((f, str("INVALID") if types[f] in ("category", "string") else -1000) for f in xmlInput.fields)
        if mapInvalid is None:
            mapInvalid = _mapInvalid
        else:
            _mapInvalid.update(mapInvalid)
            mapInvalid = _mapInvalid
        # defaults for MISSING cells, then overlay any caller-supplied map
        _mapMissing = dict((f, str("MISSING") if types[f] in ("category", "string") else -1000) for f in xmlInput.fields)
        if mapMissing is None:
            mapMissing = _mapMissing
        else:
            _mapMissing.update(mapMissing)
            mapMissing = _mapMissing
        table = UniTable(xmlInput.fields, types)
        table.initMemory(pageSize)
        for record in xmlInput:
            # records are dicts here: absent keys become the MISSING value
            table.fill([mapInvalid[f] if r is INVALID else r for f, r in [(f, record.get(f, mapMissing[f])) for f in xmlInput.fields]])
        return table

    ################################################################ NAB
    elif format == "NAB":
        fileNames = getfiles(fileLocation, sorter)
        if len(fileNames) == 0:
            raise IOError("No files match \"%s\" (even with wildcards)" % fileLocation)
        fields = None
        types = None
        strings = {}
        args = {}
        # first pass: read and validate every file's one-line header
        for fileName in fileNames:
            file = open(fileName, "rb")
            header = file.readline().rstrip()
            file.close()
            headerfields = header.decode("utf-8").split()
            if headerfields[0] != "RecArray":
                raise BadlyFormattedInputData("NAB file \"%s\" does not begin with 'RecArray'" % fileName)
            args[fileName] = dict(asciistr(f).split("=") for f in headerfields[1:])
            # NOTE(review): this tests the outer args dict (keyed by file
            # name), so it can never contain "masktype" — presumably
            # args[fileName].keys() was intended; confirm before relying
            # on masked-array rejection.
            if "masktype" in args.keys():
                raise NotImplementedError("No support yet for NAB files (such as \"%s\") with masked NumPy arrays" % fileName)
            if set(args[fileName].keys()) != set(["formats", "names"]):
                raise BadlyFormattedInputData("NAB file \"%s\" headers are %s, rather than set([\"formats\", \"names\"])" % (fileName, str(set(args[fileName].keys()))))
            thisfields = args[fileName]["names"].split(",")
            thistypes = args[fileName]["formats"].split(",")
            # "a..." formats are fixed-width byte strings; remember which
            # fields need decoding later
            for i in xrange(len(thistypes)):  # NOTE(review): xrange is Python 2 only
                if thistypes[i][0] == "a":
                    thistypes[i] = "string"
                    strings[thisfields[i]] = True
                else:
                    strings[thisfields[i]] = False
            # all files in the chain must agree with the first header
            if fields is None:
                fields = thisfields
                types = thistypes
            else:
                if fields != thisfields:
                    raise IncompatibleFilesInChain("NAB file \"%s\" header has fields %s, which differ from the first %s" % (fileName, str(thisfields), str(fields)))
                if types != thistypes:
                    raise IncompatibleFilesInChain("NAB file \"%s\" header has types %s, which differ from the first %s" % (fileName, str(thistypes), str(types)))
        table = UniTable(fields, dict(zip(fields, types)))
        table.pages = []
        table.starts = []
        table.length = 0
        # second pass: load each file's record array as one page
        for fileName in fileNames:
            # NOTE(review): this handle is never closed — resource leak;
            # consider closing after numpy.rec.fromfile.
            file = open(fileName, "rb")
            file.readline()
            data = numpy.rec.fromfile(file, **args[fileName])
            table.pageSize = len(data)
            page = UniPage(table.fields, table.types)
            arrays = {}
            for f in table.fields:
                arr = data.field(f)
                if strings[f]:
                    # decode fixed-width byte strings to text
                    arr = [i.decode("utf-8") for i in arr]
                arrays[f] = arr
            page.initExisting(table.pageSize, arrays, copy=False, stringToCategory=True)
            table.pages.append(page)
            table.starts.append(table.length)
            table.length += len(data)
        return table

    ################################################################ XTBL
    elif format == "XTBL":
        fileNames = getfiles(fileLocation, sorter)
        if len(fileNames) == 0:
            raise IOError("No files match \"%s\" (even with wildcards)" % fileLocation)
        limitGB = parameters.get("limitGB", None)
        memoryMap = parameters.get("memoryMap", False)
        # get the footers from each file (XML) and make sure they have identical DataDictionaries
        footers = []
        for i, fileName in enumerate(fileNames):
            fileSize = os.stat(fileName).st_size
            file = open(fileName, "rb")
            # the <SeekFooter/> pointer lives in the last kilobyte
            file.seek(max(0, fileSize - 1024))
            text = file.read()
            # NOTE(review): non-raw pattern (works because "\s" is not a
            # recognized string escape) matched against bytes from a
            # binary read — on Python 3 a str pattern vs bytes would
            # raise TypeError; presumably Python 2. Confirm.
            m = re.search("<SeekFooter\s+byteOffset=\"([0-9]+)\"\s+/>", text)
            if m is not None:
                textStart = int(m.group(1))
            else:
                raise IOError("File \"%s\" does not have the right format (the <SeekFooter /> element was not found in the last kilobyte)" % fileName)
            file.seek(textStart)
            footer = load(file.read(), xtbl.XTBL)
            footers.append(footer)
            if len(footers) > 1:
                thisDataDictionary = footer.child(xtbl.DataDictionary)
                firstDataDictionary = footers[0].child(xtbl.DataDictionary)
                if thisDataDictionary != firstDataDictionary:
                    # serialize lookup tables so the xml() dumps below are complete
                    for x in thisDataDictionary.matches(xtbl.LookupTable, maxdepth=None) + firstDataDictionary.matches(xtbl.LookupTable, maxdepth=None):
                        x.serialize()
                    raise IncompatibleFilesInChain("XTBL file \"%s\" is incompatible with the first file \"%s\":%s%s%s%s" % (fileNames[i], fileNames[0], os.linesep, thisDataDictionary.xml(), os.linesep, firstDataDictionary.xml()))
            file.close()
        # set up the UniTable's fields, types, pages, starts, and length
        fields = []
        types = {}
        dtypes = {}
        lookups = {}
        for dataField in footers[0].child(xtbl.DataDictionary).matches(xtbl.DataField):
            field = dataField.attrib["name"]
            fields.append(field)
            types[field] = dataField.attrib["type"]
            dtypes[field] = dataField.attrib["dtype"]
            lookup = dataField.child(xtbl.LookupTable, exception=False)
            if lookup is not None:
                lookups[field] = lookup.n_to_v
            else:
                lookups[field] = None
        # build (value->number, number->value) pairs for category fields
        categories = []
        for f in fields:
            n_to_v = lookups[f]
            if n_to_v is None:
                categories.append(None)
            else:
                v_to_n = dict((v, n) for n, v in n_to_v.items())
                categories.append((v_to_n, n_to_v))
        table = UniTable(fields, types)
        table.pages = []
        table.starts = []
        table.length = 0
        uniPageDiskCacheManager = UniPageDiskCacheManager(limitGB, memoryMap)
        # one on-disk page per <Page> element, across all files in the chain
        for i, fileName in enumerate(fileNames):
            for xtblpage in footers[i].child(xtbl.Pages).matches(xtbl.Page):
                # NOTE(review): assumes attrib["length"] is already numeric
                # (table.length += length below) — confirm the XML loader
                # converts it; if it is a string this would concatenate.
                length = xtblpage.attrib["length"]
                byteOffsets = {}
                for pageFieldOffset in xtblpage.matches(xtbl.PageFieldOffset):
                    byteOffsets[pageFieldOffset.attrib["name"]] = pageFieldOffset.attrib["byteOffset"]
                uniPage = UniPageOnDisk(fields, table.types)
                uniPage.initDisk(length, fileName, byteOffsets, dtypes, categories, uniPageDiskCacheManager)
                table.pages.append(uniPage)
                table.starts.append(table.length)
                table.length += length
        return table