def main():
    """Command-line entry point: memory-map an SDS file and print every SDS
    entry record it contains.

    Exits via argparse on bad arguments; raises IOError/OSError if the input
    file cannot be opened or mapped.
    """
    import mmap
    import contextlib
    import argparse

    parser = argparse.ArgumentParser(description='Get an SDS record by index.')
    parser.add_argument('-v', action="store_true", dest="verbose",
                        help="Print debugging information")
    parser.add_argument('SDS', action="store", help="Input SDS file path")
    parser.add_argument('index', action="store", type=int,
                        help="Entry index to fetch")
    results = parser.parse_args()
    # NOTE(review): `results.index` and `results.verbose` are parsed but never
    # used below — every entry is printed regardless; confirm intended behavior.
    # BUGFIX: mmap requires a binary-mode file object; 'r' (text mode) breaks
    # on Windows and is rejected under Python 3, so open with 'rb'.
    with open(results.SDS, 'rb') as f:
        with contextlib.closing(
                mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)) as buf:
            s = SDS(buf, 0, None)
            print("SDS")
            for e in s.sds_entries():
                print(" SDS_ENTRY")
                print(e.get_all_string(indent=2))
def __init__(self, map_file, operation, hdf_object, output_format, verbose):
    """
    Constructor.

    @param map_file: HDF4 xml map file generated by the HDF4 map writer
    @param operation: read / dump content of the HDF file
    @param hdf_object: target objects inside the HDF file (SDS, VData, RIS, ALL)
    @param output_format: binary / ascii in CSV files / numpy table
    @param verbose: when not None, extra debugging information is printed
    """
    self.xml_file = map_file
    self.depth = 0
    # sds_dump_headers aggregates dimension information as well as the data
    # type, but it slows down the dumping process; if the user just needs the
    # data without the headers, --no-headers applies to SDS arrays.
    self.dump_format = output_format
    self.tree = None
    self.verbose = verbose
    try:
        # Parse the XML document and get the root tag.
        self.tree = etree.parse(self.xml_file).getroot()
    except Exception:  # BUGFIX: was a bare except; don't swallow SystemExit/KeyboardInterrupt
        print("The Map file could not be found or contains not well-formed XML, please verify it " + str(self.xml_file))
        return
    self.schema = "{http://www.hdfgroup.org/HDF4/XML/schema/HDF4map/1.0.1}"
    try:
        # BUGFIX: was `file_node_info = self.hdf_file_name = ...`, which
        # transiently bound the child-node *list* to self.hdf_file_name.
        file_node_info = self.tree.find(self.schema + "HDF4FileInformation").getchildren()
        hdf_file_name = file_node_info[0].text
        hdf_file_path = file_node_info[1].getchildren()[1].text
        self.hdf_file_name = hdf_file_path + "/" + hdf_file_name
    except Exception:
        print("The HDF file described in the map file was not found or has an incorrect path ")
        return
    self.hdf_object = hdf_object
    self.hdf_operation = operation
    self.map_path = os.path.relpath(self.xml_file).replace(self.xml_file, "")
    self.hdf_handler = HDFfile(self.schema, self.hdf_file_name)
    self.group_stack = []
    self.external_files = {}  # references to external data files, keyed by id
    self.palletes = {}        # RIS palette bytes, keyed by palette id
    self.vdata = VData(self.schema)
    self.SDS = SDS()
    self.vdata_table = []     # accumulates extracted VData tables
    self.dataValidator = dataValidator()
    self.utils = utils()
    self.return_code = 0
def main():
    """Command-line entry point: memory-map an SDS file and print every SDS
    entry record it contains.

    Exits via argparse on bad arguments; raises IOError/OSError if the input
    file cannot be opened or mapped.
    """
    import mmap
    import contextlib
    import argparse

    parser = argparse.ArgumentParser(description='Get an SDS record by index.')
    parser.add_argument('-v', action="store_true", dest="verbose",
                        help="Print debugging information")
    parser.add_argument('SDS', action="store", help="Input SDS file path")
    parser.add_argument('index', action="store", type=int,
                        help="Entry index to fetch")
    results = parser.parse_args()
    # NOTE(review): `results.index` and `results.verbose` are parsed but never
    # used below — every entry is printed regardless; confirm intended behavior.
    # BUGFIX: mmap requires a binary-mode file object; 'r' (text mode) breaks
    # on Windows and is rejected under Python 3, so open with 'rb'.
    with open(results.SDS, 'rb') as f:
        with contextlib.closing(
                mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)) as buf:
            s = SDS(buf, 0, None)
            print("SDS")
            for e in s.sds_entries():
                print(" SDS_ENTRY")
                print(e.get_all_string(indent=2))
class XMLparser:
    """
    Recursively parses an XML map file looking for supported xml tags.
    These tags contain metadata about an HDF object in an HDF file.
    If a supported tag is found the class creates an instance of "HDFhandler";
    this class will load the HDF file and return the object data in a
    normalized buffer. The buffer and the xml tag are then passed to the
    handler class (in this iteration just VData). The handler classes are in
    charge of reconstructing the HDF objects and returning them as Python
    data structures.
    """

    def __init__(self, map_file, operation, hdf_object, output_format, verbose):
        """
        Constructor.

        @param map_file: HDF4 xml map file generated by the HDF4 map writer
        @param operation: read / dump content of the HDF file ("l" lists only)
        @param hdf_object: target objects inside the HDF file (SDS, VData, RIS, ALL)
        @param output_format: binary / ascii in CSV files / numpy table
        @param verbose: when not None, extra debugging information is printed
        """
        self.xml_file = map_file
        self.depth = 0
        # sds_dump_headers aggregates dimension information as well as the data
        # type, but it slows down the dumping process; if the user just needs
        # the data without the headers, --no-headers applies to SDS arrays.
        self.dump_format = output_format
        self.tree = None
        self.verbose = verbose
        try:
            # Parse the XML document and get the root tag.
            self.tree = etree.parse(self.xml_file).getroot()
        except Exception:  # BUGFIX: was a bare except
            print("The Map file could not be found or contains not well-formed XML, please verify it " + str(self.xml_file))
            return
        self.schema = "{http://www.hdfgroup.org/HDF4/XML/schema/HDF4map/1.0.1}"
        try:
            # BUGFIX: was `file_node_info = self.hdf_file_name = ...`, which
            # transiently bound the child-node *list* to self.hdf_file_name.
            file_node_info = self.tree.find(self.schema + "HDF4FileInformation").getchildren()
            hdf_file_name = file_node_info[0].text
            hdf_file_path = file_node_info[1].getchildren()[1].text
            self.hdf_file_name = hdf_file_path + "/" + hdf_file_name
        except Exception:
            print("The HDF file described in the map file was not found or has an incorrect path ")
            return
        self.hdf_object = hdf_object
        self.hdf_operation = operation
        self.map_path = os.path.relpath(self.xml_file).replace(self.xml_file, "")
        self.hdf_handler = HDFfile(self.schema, self.hdf_file_name)
        self.group_stack = []
        self.external_files = {}  # references to external data files, keyed by id
        self.palletes = {}        # RIS palette bytes, keyed by palette id
        self.vdata = VData(self.schema)
        self.SDS = SDS()
        self.vdata_table = []     # accumulates extracted VData tables
        self.dataValidator = dataValidator()
        self.utils = utils()
        self.return_code = 0

    def parseAndDumpMapContent(self):
        """
        Parses the XML map and dumps or lists the HDF4 objects that are found.

        @return: 0 on success, -1 if any object could not be extracted.
        """
        print("Processing : " + self.xml_file)
        self.val = []
        self.lastId = []
        # We maintain a hierarchy to name the extracted objects; this is the
        # first prefix.
        self.group_stack.append("Root--")
        self.recursiveWalk(self.tree, 1)
        return self.return_code

    def recursiveWalk(self, root_node, depth):
        """
        Recursively traverses the XML document using the ElementTree API.
        If a tag is recognized the method uses the configured operation to
        either print a short listing or extract the object (CSV or binary).
        If a 'Group' tag is found, its 'id' attribute is pushed on a stack and
        used as a file-name prefix; this is cumulative, e.g. a VData under
        groups ID_ABC and ID_DEF dumps to G-ID_ABC-G-ID_DEF.csv.

        @param root_node: lxml root node of the map file
        @param depth: recursion level, 1 on the first call
        """
        self.depth = depth
        for node in root_node.getchildren():
            if node.tag == (self.schema + "ExternalFile"):
                # Store the location and ID of external files in a dictionary.
                self.external_files[str(node.attrib["id"])] = (
                    str(node.attrib["location"]) + "/" + str(node.attrib["filename"])
                )
                if self.verbose is not None:
                    print(self.external_files)
            if node.tag == (self.schema + "Palette") and self.hdf_object in ["ALL", "RIS"]:
                # Store the 256*3-byte palette, keyed by its id.
                data_node = node.find(self.schema + "paletteData")
                palette_location = self.getNodeAttribute(data_node, self.schema + "byteStream", "offset")
                palette_data = self.hdf_handler.get_data(int(palette_location), 768)
                self.palletes[str(node.attrib["id"])] = palette_data.getvalue()
            if node.tag == (self.schema + "Raster") and self.hdf_object in ["ALL", "RIS"]:
                # Extract the image using its referenced palette.
                raster_name = node.attrib["name"]
                if raster_name is None:
                    raster_name = self.hdf_file_name + node.attrib["id"]
                palette = self.palletes[node.find(self.schema + "paletteRef").attrib["ref"]]
                raster_bytes = self.hdf_handler.linearizeDataSpace(node, "RIS")
                image = RIS(palette, raster_bytes, node)
                temp_file_name = self.xml_file + "_dump/" + "".join(self.group_stack) + node.attrib["id"]
                image.save(temp_file_name + raster_name)
            if node.tag == (self.schema + "Group"):
                # Groups only affect the naming hierarchy.
                if self.hdf_operation == "l":
                    print("-" * self.depth + "Group: " + node.attrib["name"])
                else:
                    if self.depth >= len(self.group_stack):
                        self.group_stack.append(" G-" + node.attrib["id"] + " ")
                    else:
                        self.group_stack.pop()
            # VData
            if node.tag == (self.schema + "Table") and self.hdf_object in ["ALL", "VData"]:
                VData = None
                if self.hdf_operation == "l":
                    print("-" * self.depth + "VData: " + node.attrib["name"])
                else:
                    if self.verbose is not None:
                        print("-" * self.depth + "VData: " + node.attrib["name"])
                    # Iterate the comments to extract validation data.
                    self.ExtractValidationData(node)
                    data_node = node.find(self.schema + "tableData")
                    if data_node is None:
                        print(" " * self.depth + "data node not found, skipping VData " + node.attrib["name"])
                        self.return_code = -1
                    if data_node is not None:
                        inExternalFile_nodes = data_node.getchildren()
                        if inExternalFile_nodes[0].tag == (self.schema + "dataInExternalFile"):
                            # The table lives in an external file: buffer it
                            # through a temporary HDFfile instance.
                            # NOTE(review): self.hdf_path is never assigned in
                            # __init__ (only self.map_path is) — confirm this
                            # branch is exercised / the attribute is set elsewhere.
                            data_buffer = HDFfile(
                                self.schema,
                                self.hdf_path + self.external_files[inExternalFile_nodes[0].attrib["ref"]]
                            ).linearizeDataSpace(inExternalFile_nodes[0], "VData")
                        else:
                            # Extract the object into a linear buffer.
                            data_buffer = self.hdf_handler.linearizeDataSpace(data_node, "VData")
                        VData = self.vdata.Extract(node, data_buffer, self.dump_format)
                        valid_values = str(self.dataValidator.validateVData(node.attrib["id"], VData))
                        if self.verbose is not None:
                            print(" " * self.depth + "Valid values: " + valid_values)
                        temp_file_name = (
                            self.xml_file + "_dump/" + "".join(self.group_stack)
                            + node.attrib["name"] + "-" + node.attrib["id"]
                        )
                        if self.dump_format == False or self.dump_format is None:
                            # ASCII dump into CSV.
                            self.utils.createCSVfromTable(VData[0], VData[1:], temp_file_name)
                        else:
                            # Binary dump into .dat files.
                            self.utils.createPlainDatFile(VData, temp_file_name)
            # SDS
            elif node.tag == (self.schema + "Array") and self.hdf_object in ["ALL", "SDS"]:
                if self.hdf_operation == "l":
                    print("-" * self.depth + "Array: " + node.attrib["name"])
                else:
                    # Iterate the comments to extract validation data.
                    self.ExtractValidationData(node)
                    if self.verbose is not None:
                        print("-" * self.depth + "Array: " + node.attrib["name"])
                    data_node = node.find(self.schema + "arrayData")
                    if not etree.iselement(data_node):
                        # No arrayData tag: record the failure and move on.
                        print(" " * self.depth + "arrayData not found, skipping SDS " + node.attrib["name"])
                        self.return_code = -1
                    else:
                        inExternalFile_nodes = data_node.getchildren()
                        if inExternalFile_nodes[0].tag == (self.schema + "dataInExternalFile"):
                            # Data stored in an external file (see NOTE above
                            # about self.hdf_path).
                            if self.verbose is not None:
                                print("External data")
                            sds_array = HDFfile(
                                self.schema,
                                self.hdf_path + self.external_files[inExternalFile_nodes[0].attrib["ref"]]
                            ).linearizeDataSpace(node, "SDS")
                        else:
                            # Data in the same HDF file: extract into a linear buffer.
                            sds_array = self.hdf_handler.linearizeDataSpace(node, "SDS")
                        temp_file_name = (
                            self.xml_file + "_dump/" + "".join(self.group_stack)
                            + node.attrib["name"].replace("/", "") + "-" + node.attrib["id"]
                        )
                        sds_info = SDS_info(self.schema, node)
                        if self.dump_format is None:
                            # ASCII dump into CSV.
                            if sds_array is not None:
                                valid_values = str(self.dataValidator.validateSDS(node.attrib["id"], sds_array))
                                if self.verbose is not None:
                                    print(" " * self.depth + "Valid values: " + valid_values)
                                self.SDS.extract(sds_array, sds_info, "csv", temp_file_name)
                            else:
                                self.return_code = -1
                        else:
                            # Binary dump into .dat files.
                            if sds_array is not None:
                                self.SDS.extract(sds_array, sds_info, "binary", temp_file_name)
                            else:
                                # BUGFIX: was `self.return_code - 1`, a no-op
                                # expression; the failure was never recorded.
                                self.return_code = -1
            if len(node) > 0:
                self.recursiveWalk(node, self.depth + 1)
        self.depth = self.depth - 1

    # Aux methods
    def getNodeAttribute(self, node, tag, attrib_name):
        """
        Returns the xml attribute of a descendant node matching a given tag,
        or None when no match / no such attribute / any lookup error.

        @param node: an HDF object node (lxml node instance)
        @param tag: xml tag
        @param attrib_name: xml attribute
        """
        try:
            for element in node.iter():
                if element.tag == tag:
                    return element.get(attrib_name)
            return None
        except Exception:  # BUGFIX: was a bare except
            return None

    def getNodeText(self, node, tag):
        """
        Returns the text of the first child matching a given tag, or None.

        @param tag: xml tag
        @param node: an HDF object node (lxml node instance)
        """
        try:
            return node.find(tag).text
        except Exception:  # AttributeError when find() returns None
            return None

    def ExtractValidationData(self, node):
        """
        Appends the validation values to a dictionary on the dataValidator
        class. Validation entries live in XML comments containing the word
        "verification", one `[coords] value` pair per line.

        @param node: an HDF object (lxml node instance)
        """
        objectName = node.attrib["id"]
        for items in node.getiterator(tag=etree.Comment):
            if "verification" in items.text:
                self.val = []
                for lines in items.text.split("\n"):
                    m = re.split(r"\[(.*?)\]", lines)  # raw string for the regex
                    if len(m) == 3:
                        coords = m[1].split(",")
                        value = m[2][1:]  # drop the separator char after ']'
                        self.val.append([coords, value])
                self.dataValidator.validationDictionary[objectName] = self.val