def parse_xml(file_name, size=0, method="json"):
    """Parse one XML file that holds several XML documents.

    Every document yielded by ``xml_documents`` is fed to a SAX parser
    driven by a ``PatentHandler``. A deep copy of the handler's
    serialization is collected per document and the collected list is
    exported once the file has been processed.

    Args:
        file_name (str): Path of the file to open.
        size (int): Stop after this many documents. Only honoured when the
            module-level ``MAX_NUMBER_OF_PATENTS`` is falsy. The default
            ``0`` never equals the running count (which starts at 1), so
            the whole file is read.
        method (str): Export method. Only ``"json"`` triggers an export
            (through ``export2json``); any other value exports nothing.

    Returns:
        int: ``0`` on success.

    Raises:
        IOError: Propagated unchanged when opening the file fails.
    """
    count = 0
    results = []
    bm = BenchMark()

    parser = xml.sax.make_parser()
    # Plain (non-namespace-aware) SAX events.
    parser.setFeature(xml.sax.handler.feature_namespaces, 0)
    # Do not include external general entities while parsing.
    parser.setFeature(xml.sax.handler.feature_external_ges, False)

    xml_patent_handler = PatentHandler()
    parser.setContentHandler(xml_patent_handler)

    # No try/except here on purpose: the previous handler only re-raised
    # IOError, and ``raise e`` discards the original traceback on Python 2.
    with open(file_name) as citation:
        bm.toggleOn('Start processing [ ]')
        for xml_part in xml_documents(citation):
            # The parser needs a file-like object, so wrap the string.
            parser.parse(cStringIO.StringIO(xml_part))
            # Deep copy so the later handler reset cannot alter the
            # record already stored.
            results.append(
                copy.deepcopy(xml_patent_handler.serialization()))
            count += 1
            # The size limit only applies when MAX_NUMBER_OF_PATENTS is
            # falsy.
            if not MAX_NUMBER_OF_PATENTS and count == int(size):
                break
            # Clear the handler state before the next document.
            xml_patent_handler.reset()
            bm.add(0)
        bm.toggleOff(' \bOK] - ' + str(count) + ' patents ')

    if method == "json":
        export2json(results)
    return 0
# NOTE(review): a parse_xml with the same logic is defined earlier in this
# file; this later definition is the one that takes effect. Consider
# removing one of the two.
def parse_xml(file_name, size=0, method="json"):
    """Parse one XML file that holds several XML documents.

    Every document yielded by ``xml_documents`` is fed to a SAX parser
    driven by a ``PatentHandler``. A deep copy of the handler's
    serialization is collected per document and the collected list is
    exported once the file has been processed.

    Args:
        file_name (str): Path of the file to open.
        size (int): Stop after this many documents. Only honoured when the
            module-level ``MAX_NUMBER_OF_PATENTS`` is falsy. The default
            ``0`` never equals the running count (which starts at 1), so
            the whole file is read.
        method (str): Export method. Only ``"json"`` triggers an export
            (through ``export2json``); any other value exports nothing.

    Returns:
        int: ``0`` on success.

    Raises:
        IOError: Propagated unchanged when opening the file fails.
    """
    count = 0
    results = []
    bm = BenchMark()

    parser = xml.sax.make_parser()
    # Plain (non-namespace-aware) SAX events.
    parser.setFeature(xml.sax.handler.feature_namespaces, 0)
    # Do not include external general entities while parsing.
    parser.setFeature(xml.sax.handler.feature_external_ges, False)

    xml_patent_handler = PatentHandler()
    parser.setContentHandler(xml_patent_handler)

    # No try/except here on purpose: the previous handler only re-raised
    # IOError, and ``raise e`` discards the original traceback on Python 2.
    with open(file_name) as citation:
        bm.toggleOn('Start processing [ ]')
        for xml_part in xml_documents(citation):
            # The parser needs a file-like object, so wrap the string.
            parser.parse(cStringIO.StringIO(xml_part))
            # Deep copy so the later handler reset cannot alter the
            # record already stored.
            results.append(
                copy.deepcopy(xml_patent_handler.serialization()))
            count += 1
            # The size limit only applies when MAX_NUMBER_OF_PATENTS is
            # falsy.
            if not MAX_NUMBER_OF_PATENTS and count == int(size):
                break
            # Clear the handler state before the next document.
            xml_patent_handler.reset()
            bm.add(0)
        bm.toggleOff(' \bOK] - ' + str(count) + ' patents ')

    if method == "json":
        export2json(results)
    return 0