def parse_opnsense(input_path, document):
    """Parse an OPNsense XML source with a handler built around ``document``.

    Args:
        input_path: Path of the XML file to read, or ``'-'`` to read the XML
            from standard input.
        document: Object passed to ``OpnSenseContentHandler``; the resulting
            handler is what ``parse`` drives.
    """
    handler = OpnSenseContentHandler(document)
    if input_path == '-':
        # Use the underlying byte stream when there is one (Python 3) so the
        # XML parser decodes the input itself, the same way it does for the
        # 'rb' file branch below, instead of relying on the locale encoding.
        # stdin is intentionally not wrapped in ``with``: it belongs to the
        # process, and closing it here would break any later read from it.
        parse(getattr(sys.stdin, 'buffer', sys.stdin), handler)
    else:
        with open(input_path, 'rb') as input_file:
            parse(input_file, handler)
def __init__(self, err, data, namespace=None):
    """Check that ``data`` is well-formed XML, then load it as an RDF graph.

    Args:
        err: Stored on ``self.err`` and handed to ``AddonRDFEntityResolver``.
        data: Either a string (wrapped in ``StringIO``) or an object that
            ``parse`` can read and that supports ``seek``.
        namespace: Value for ``self.namespace``; a falsy value selects
            ``http://www.mozilla.org/2004/em-rdf``.

    Raises:
        RDFException: If the SAX pre-check or the rdflib parse fails.
    """
    self.err = err
    self.manifest = u"urn:mozilla:install-manifest"
    self.namespace = namespace or "http://www.mozilla.org/2004/em-rdf"

    # Strings are wrapped in a pseudo-file so both passes can read them.
    if isinstance(data, types.StringTypes):
        data = StringIO(data)

    # Pass one: an empty ContentHandler, used only to prove the input parses.
    try:
        parse(data, ContentHandler())
    except SAXParseException as exc:
        # Carry the SAX exception along so line information is preserved.
        raise RDFException(orig_exception=exc)

    # The pre-check consumed the stream; rewind it for the real parse.
    data.seek(0)

    from rdflib.plugins.parsers import rdfxml
    stock_create_parser = rdfxml.create_parser

    def create_parser(*args, **kwargs):
        # Same parser rdflib would build, but with our entity resolver so
        # URL entities are not resolved.
        parser = stock_create_parser(*args, **kwargs)
        parser.setEntityResolver(AddonRDFEntityResolver(err))
        return parser

    try:
        rdfxml.create_parser = create_parser

        # Pass two: load the document in XML format.
        rdf_graph = Graph()
        rdf_graph.parse(data, format="xml")
        self.rdf = rdf_graph
    except ParserError as exc:
        # Translate rdflib's failure into the local exception type.
        raise RDFException(message=exc.message)
    except SAXParseException as exc:
        raise RDFException(orig_exception=exc)
    finally:
        # Success or failure, rdflib gets its own factory back.
        rdfxml.create_parser = stock_create_parser
def __init__(self, err, data, namespace=None):
    """Check that ``data`` is well-formed XML, then load it as an RDF graph.

    Args:
        err: Stored on ``self.err`` and handed to ``AddonRDFEntityResolver``.
        data: XML text, or an object with ``read``/``readline`` (or a
            ``StringIO``) whose full contents are read after ``seek(0)``.
        namespace: Value for ``self.namespace``; a falsy value selects
            ``http://www.mozilla.org/2004/em-rdf``.

    Raises:
        RDFException: If the SAX pre-check or the rdflib parse fails.
    """
    self.err = err
    self.manifest = u'urn:mozilla:install-manifest'
    self.namespace = namespace or 'http://www.mozilla.org/2004/em-rdf'

    # File-like input is read into a string up front; each pass below then
    # wraps that string in a fresh StringIO, so no rewinding is needed.
    looks_like_file = hasattr(data, 'read') and hasattr(data, 'readline')
    if looks_like_file or isinstance(data, StringIO):
        data.seek(0)
        data = data.read()

    # Pass one: an empty ContentHandler, used only to prove the input parses.
    try:
        parse(StringIO(data), ContentHandler())
    except SAXParseException as exc:
        # Carry the SAX exception along so line information is preserved.
        raise RDFException(orig_exception=exc)

    from rdflib.plugins.parsers import rdfxml
    stock_create_parser = rdfxml.create_parser

    def create_parser(*args, **kwargs):
        # Same parser rdflib would build, but with our entity resolver so
        # URL entities are not resolved.
        parser = stock_create_parser(*args, **kwargs)
        parser.setEntityResolver(AddonRDFEntityResolver(err))
        return parser

    try:
        rdfxml.create_parser = create_parser

        # Pass two: load the document in XML format.
        rdf_graph = Graph()
        rdf_graph.parse(StringIO(data), format='xml')
        self.rdf = rdf_graph
    except ParserError as exc:
        # Translate rdflib's failure into the local exception type.
        raise RDFException(message=exc.message)
    except SAXParseException as exc:
        raise RDFException(orig_exception=exc)
    finally:
        # Success or failure, rdflib gets its own factory back.
        rdfxml.create_parser = stock_create_parser
def __init__(self, err, data, namespace=None):
    """Validate ``data`` as XML and store the parsed RDF graph on ``self.rdf``.

    Args:
        err: Kept as ``self.err`` and passed to ``AddonRDFEntityResolver``.
        data: XML text, or an object exposing ``read`` and ``readline`` (or a
            ``StringIO``); such objects are rewound and read in full.
        namespace: Overrides the default ``self.namespace`` of
            ``http://www.mozilla.org/2004/em-rdf`` when truthy.

    Raises:
        RDFException: On a SAX parse error in either pass, or on an rdflib
            ``ParserError``.
    """
    self.err = err
    self.manifest = u'urn:mozilla:install-manifest'
    self.namespace = namespace or 'http://www.mozilla.org/2004/em-rdf'

    # Reduce file-like input to its text so it can be re-opened at will.
    has_file_api = hasattr(data, 'read') and hasattr(data, 'readline')
    if has_file_api or isinstance(data, StringIO):
        data.seek(0)
        data = data.read()

    # Well-formedness check only: the handler ignores every event.
    try:
        parse(StringIO(data), ContentHandler())
    except SAXParseException as sax_error:
        # Wrapping the original exception keeps its line information.
        raise RDFException(orig_exception=sax_error)

    from rdflib.plugins.parsers import rdfxml
    unpatched_factory = rdfxml.create_parser

    def create_parser(*args, **kwargs):
        # Delegate to rdflib's factory, then swap in our entity resolver so
        # URL entities are not resolved.
        built = unpatched_factory(*args, **kwargs)
        built.setEntityResolver(AddonRDFEntityResolver(err))
        return built

    try:
        rdfxml.create_parser = create_parser

        # Parse the same text again, this time into an RDF graph.
        loaded = Graph()
        loaded.parse(StringIO(data), format='xml')
        self.rdf = loaded
    except ParserError as parser_error:
        # Surface rdflib failures as the local exception type.
        raise RDFException(message=parser_error.message)
    except SAXParseException as sax_error:
        raise RDFException(orig_exception=sax_error)
    finally:
        # Always undo the monkey-patch, even when parsing failed.
        rdfxml.create_parser = unpatched_factory
def __init__(self, err, data, namespace=None):
    """Validate ``data`` as XML and store the parsed RDF graph on ``self.rdf``.

    Args:
        err: Kept as ``self.err`` and passed to ``AddonRDFEntityResolver``.
        data: A string (wrapped in ``StringIO``) or an object that ``parse``
            can read and that supports ``seek``.
        namespace: Overrides the default ``self.namespace`` of
            ``http://www.mozilla.org/2004/em-rdf`` when truthy.

    Raises:
        RDFException: On a SAX parse error in either pass, or on an rdflib
            ``ParserError``.
    """
    self.err = err
    self.manifest = u'urn:mozilla:install-manifest'
    self.namespace = namespace or 'http://www.mozilla.org/2004/em-rdf'

    # Give string input a file interface.
    if isinstance(data, types.StringTypes):
        data = StringIO(data)

    # Well-formedness check only: the handler ignores every event.
    try:
        parse(data, ContentHandler())
    except SAXParseException as sax_error:
        # Wrapping the original exception keeps its line information.
        raise RDFException(orig_exception=sax_error)

    # The check above read the stream to the end; start over for rdflib.
    data.seek(0)

    from rdflib.plugins.parsers import rdfxml
    unpatched_factory = rdfxml.create_parser

    def create_parser(*args, **kwargs):
        # Delegate to rdflib's factory, then swap in our entity resolver so
        # URL entities are not resolved.
        built = unpatched_factory(*args, **kwargs)
        built.setEntityResolver(AddonRDFEntityResolver(err))
        return built

    try:
        rdfxml.create_parser = create_parser

        # Parse the rewound stream into an RDF graph.
        loaded = Graph()
        loaded.parse(data, format='xml')
        self.rdf = loaded
    except ParserError as parser_error:
        # Surface rdflib failures as the local exception type.
        raise RDFException(message=parser_error.message)
    except SAXParseException as sax_error:
        raise RDFException(orig_exception=sax_error)
    finally:
        # Always undo the monkey-patch, even when parsing failed.
        rdfxml.create_parser = unpatched_factory
def xmlxls2pd(attachedDoc, table=0, header=0):
    """Load one table of an XML spreadsheet into a pandas DataFrame.

    Args:
        attachedDoc: Path (``str``) of the XML document, or any object that
            ``parse`` accepts directly.
        table: Index into the handler's ``tables`` list.
        header: Index, within the table's ``showRowList``, of the row whose
            cells name the columns. Rows after it are treated as data.

    Returns:
        A DataFrame with one record per data row. Rows covered by an
        ``ss:MergeDown`` span are folded into the row that starts the span,
        joined with newlines. Each column is converted according to the
        ``ss:Type`` of its cells.

    Raises:
        AssertionError: If the cells of one column carry different
            ``ss:Type`` values.
        KeyError: If a column's ``ss:Type`` has no entry in the conversion
            table below.
    """
    xxh = XmlXlsHandler()
    if isinstance(attachedDoc, str):
        # Binary mode lets the XML parser apply the document's own encoding
        # declaration rather than the platform's default text encoding.
        with open(attachedDoc, 'rb') as ad:
            parse(ad, xxh)
    else:
        parse(attachedDoc, xxh)

    sheet = xxh.tables[table]
    header_row = sheet.showRowList[header]

    # Column name -> ss:Index from the header row, then inverted so that data
    # cells can be mapped back to a column name by their index.
    columns_index = {
        c.data.data: c.attr["ss:Index"]
        for c in header_row.cells
        if c.data is not None
    }
    index_columns = {v: k for k, v in columns_index.items()}

    # Only rows below the header are data; the header itself is excluded.
    rows = sheet.showRowList[header + 1:]

    # Column name -> ss:Type seen for that column (None when untyped).
    coll = {}

    def show_rows(cells):
        """Yield ``(column, text)`` for each populated cell under a header."""
        for cell in cells:
            index = cell.attr["ss:Index"]
            if index not in index_columns or cell.data is None:
                continue
            column = index_columns[index]
            dtype = cell.data.attr.get("ss:Type", None)
            if column in coll:
                assert coll[column] == dtype, (
                    "column %r mixes types %r and %r"
                    % (column, coll[column], dtype))
            else:
                coll[column] = dtype
            yield column, cell.data.data

    def show_items(rows):
        """Yield one dict per logical row, folding merged-down rows in."""
        rows = iter(rows)
        # A ``for`` loop ends cleanly at exhaustion; calling ``next()`` in a
        # ``while True`` would leak StopIteration out of the generator,
        # which is a RuntimeError under PEP 479.
        for row in rows:
            cells = row.cells
            item = dict(show_rows(cells))
            spans = [int(c.attr.get("ss:MergeDown", 0)) for c in cells]
            merge_down = max(spans) if spans else 0
            if merge_down > 0:
                merger = defaultdict(list)
                for _ in range(merge_down):
                    # Consume the rows that belong to this merged block.
                    merged_row = next(rows, None)
                    if merged_row is None:
                        break  # span runs past the end of the table
                    for k, v in show_rows(merged_row.cells):
                        merger[k].append(v)
                for k, v in merger.items():
                    # The first row may have no value for this column.
                    if k in item:
                        v.insert(0, item[k])
                    item[k] = '\n'.join(v)
            yield item

    df = pd.DataFrame(show_items(rows))

    conversions = {
        None: lambda x: x,
        'DateTime': lambda x: pd.to_datetime(x, utc=True),
        'String': lambda x: x,
        'Number': pd.to_numeric
    }
    for column in df.columns:
        df[column] = conversions[coll[column]](df[column])
    return df
def parse_pfsense(input_path, document):
    """Parse the XML file at ``input_path`` with a pfSense content handler.

    Args:
        input_path: Path of the XML file; it is opened in binary mode.
        document: Object passed to ``PfSenseContentHandler``; the resulting
            handler is what ``parse`` drives.
    """
    # The handler is built before the file is opened.
    sax_handler = PfSenseContentHandler(document)
    with open(input_path, 'rb') as xml_stream:
        parse(xml_stream, sax_handler)
def xmlxls_to_pd(attachment, table=0, header=0, skiprows=None, skip_footer=0,
                 merge_down=True):
    """Load one table of an XML spreadsheet into a pandas DataFrame.

    Args:
        attachment: Path (``str``) of the XML document, or any object that
            ``parse`` accepts directly.
        table: Index into the handler's ``tables`` list.
        header: Index of the row whose cells name the columns.
        skiprows: Index of the first data row; defaults to ``header + 1``.
        skip_footer: Number of trailing rows to drop.
        merge_down: When true (the default), rows covered by an
            ``ss:MergeDown`` span are folded into the row that starts the
            span, joined with newlines. When false, spans are ignored and
            every row becomes its own record.

    Returns:
        A DataFrame with one record per logical row, each column converted
        through the module-level ``conversions`` table keyed by ``ss:Type``.

    Raises:
        AssertionError: If the cells of one column carry different
            ``ss:Type`` values.
    """
    if skiprows is None:
        skiprows = header + 1

    excel_handler = ExcelHandler()
    if isinstance(attachment, str):
        # Binary mode lets the XML parser apply the document's own encoding
        # declaration rather than the platform's default text encoding.
        with open(attachment, 'rb') as xml_file:
            parse(xml_file, excel_handler)
    else:
        parse(attachment, excel_handler)

    sheet = excel_handler.tables[table]
    header_row = sheet.rows[header]

    # Column name -> ss:Index from the header row, then inverted so that data
    # cells can be mapped back to a column name by their index.
    columns_index = {
        c.data.data: c.attr["ss:Index"]
        for c in header_row.cells
        if c.data is not None
    }
    index_columns = {v: k for k, v in columns_index.items()}

    rows = sheet.rows[skiprows:]
    # Clamp at zero: a footer larger than the data must leave no rows, not
    # turn into a negative slice bound that keeps some of them.
    rows = rows[:max(len(rows) - skip_footer, 0)]

    # Column name -> ss:Type seen for that column (None when untyped).
    coll_dtype = {}

    def row_gen(cells):
        """Yield ``(column, text)`` for each populated cell under a header."""
        for cell in cells:
            index = cell.attr["ss:Index"]
            if index not in index_columns or cell.data is None:
                continue
            column = index_columns[index]
            dtype = cell.data.attr.get("ss:Type", None)
            if column in coll_dtype:
                assert coll_dtype[column] == dtype, (
                    "column %r mixes types %r and %r"
                    % (column, coll_dtype[column], dtype))
            else:
                coll_dtype[column] = dtype
            yield column, cell.data.data

    def item_gen(rows):
        """Yield one dict per logical row, folding merged-down rows in."""
        rows = iter(rows)
        # A ``for`` loop ends cleanly at exhaustion; calling ``next()`` in a
        # ``while True`` would leak StopIteration out of the generator,
        # which is a RuntimeError under PEP 479.
        for row in rows:
            cells = row.cells
            item = dict(row_gen(cells))
            # Named ``span`` so it no longer hides the ``merge_down`` flag,
            # which the original never actually consulted.
            span = 0
            if merge_down:
                spans = [int(c.attr.get("ss:MergeDown", 0)) for c in cells]
                span = max(spans) if spans else 0
            if span > 0:
                merger = defaultdict(list)
                for _ in range(span):
                    # Consume the rows that belong to this merged block.
                    merged_row = next(rows, None)
                    if merged_row is None:
                        break  # span runs past the end of the table
                    for k, v in row_gen(merged_row.cells):
                        merger[k].append(v)
                for k, v in merger.items():
                    # The first row may have no value for this column.
                    if k in item:
                        v.insert(0, item[k])
                    item[k] = '\n'.join(v)
            yield item

    # Build the frame, then apply the per-column type conversions.
    df = pd.DataFrame(item_gen(rows))
    for col in df.columns:
        df[col] = conversions[coll_dtype[col]](df[col])
    return df