def testErrors(self):
    """Feed unreadable or malformed dump files to ``self.parser.parseFile``.

    Each input must make ``parseFile`` raise; when no exception is raised
    the test is failed explicitly via ``self.fail``.
    """

    def resource(file_name):
        # Resolved against the current working directory at call time.
        return File(os.path.join(os.getcwd(), "resources"), file_name)

    # Missing file.
    try:
        self.parser.parseFile(File("", "missing_dump$@%.xml"))
        self.fail("WiktionaryException/IOException expected")
    except (IOException, WiktionaryException) as error:
        self.assertTrue(
            isinstance(error, (IOException, WiktionaryException)))

    # Each remaining input is expected to surface as a SAX parse error:
    # invalid XML, an empty Bzip2 file, and an erroneous Bzip2 header.
    sax_cases = (
        ("XMLDumpParserErrorXMLTest.xml",
         "WiktionaryException/SAXException expected"),
        ("XMLDumpParserErrorEmptyTest.xml.bz2",
         "WiktionaryException/IOException expected"),
        ("XMLDumpParserErrorHeaderTest.xml.bz2",
         "WiktionaryException/IOException expected"),
    )
    for file_name, failure_message in sax_cases:
        try:
            self.parser.parseFile(resource(file_name))
            self.fail(failure_message)
        except xml.sax.SAXException as error:
            self.assertTrue(isinstance(error, xml.sax.SAXParseException))
def setUp(self):  # throws Exception
    """Prepare the counters, dump file handles and parser under test."""
    super().setUp()

    # Page counter plus a lock created next to it.
    # NOTE(review): presumably the lock guards pageCount -- confirm.
    self.pageCount = 0  # atomic
    self._lock = threading.Lock()

    resources_dir = os.path.join(os.getcwd(), "resources")
    self.multistreamDump = File(
        resources_dir,
        "enwiktionary-20150224-pages-articles-multistream.xml.bz2")
    self.multistreamDumpIndex = File(
        resources_dir,
        "enwiktionary-20150224-pages-articles-multistream-index.txt.bz2")

    # The wrapped parser receives this test case as its only argument.
    wrapped_parser = MultistreamXMLDumpParserTest._XMLDumpParser(self)
    self.subject = MultistreamXMLDumpParser(wrapped_parser)
def testParsedInformation(self):
    """Parse ``XMLDumpParserTest.xml`` and check the callback sequence.

    Structure of the document, as recorded with this test:

        <dump_parser_test>
          <header>
            <param name="param1" value="value1" />
            <param name="param1" value="value2" />
          </header>
          <element id="1"> Some text content </element>
          <element id="2"> Some text content </element>
        </dump_parser_test>
    """
    expected_events = [
        "onParserStart",
        "onElementStart: dump_parser_test",
        "onElementStart: header",
        "onElementStart: param",
        "onElementEnd: param",
        "onElementStart: param",
        "onElementEnd: param",
        "onElementEnd: header",
        "onElementStart: element",
        "onElementEnd: element",
        "onElementStart: element",
        "onElementEnd: element",
        "onElementEnd: dump_parser_test",
        "onParserEnd",
    ]
    self.parser = XMLDumpParserTest._XMLDumpParser(self, expected_events)

    resources_dir = os.path.join(os.getcwd(), "resources")
    self.parser.parseFile(File(resources_dir, "XMLDumpParserTest.xml"))

    # The list handed to the parser must be empty once parsing is done.
    self.assertTrue(not expected_events)
def testParsedInformation(self):
    """Parse ``WiktionaryDumpParserTest.xml`` and check the callback sequence.

    The expected values cover the base URL, one namespace and two pages.
    """
    expected_events = [
        "setBaseURL: http://de.wiktionary.org/wiki/Wiktionary:Hauptseite",
        "addNamespace: Diskussion",
        # First page.
        "onPageStart",
        "setTitle: Page 1",
        "setPageId: 9",
        "setRevision: 10763",
        "setTimestamp: 2004-09-17T08:23:57Z",
        "setAuthor: TJ",
        "setText: Text 1",
        "onPageEnd",
        # Second page.
        "onPageStart",
        "setTitle: Page 2",
        "setPageId: 10",
        "setRevision: 10764",
        "setTimestamp: 2004-09-17T08:34:29Z",
        "setAuthor: TJ",
        "setText: Text 2\n\n Test Test",
        "onPageEnd",
    ]
    parser = WiktionaryDumpParserTest._WiktionaryDumpParser(
        self, expected_events)

    resources_dir = os.path.join(os.getcwd(), "resources")
    parser.parseFile(File(resources_dir, "WiktionaryDumpParserTest.xml"))

    # The list handed to the parser must be empty once parsing is done.
    self.assertTrue(not expected_events)
class WiktionaryTestCase:
    """Abstract test case for PyWKTL.

    ``setUp`` creates a fresh work directory named after the test class
    (plus ``getName()``) below ``<cwd>/target/test-output/``;
    ``tearDown`` deletes it again.
    """

    # Directory holding the test resources, resolved when the class is defined.
    RESOURCE_PATH = os.path.join(os.getcwd(), "resources")
    # Work directory of the running test; assigned in setUp().
    workDir = None

    # noinspection PyMethodMayBeStatic
    def getName(self):
        """Return the suffix appended to the class name for ``workDir``."""
        return ""

    def setUp(self):  # throws Exception
        """Create an empty work directory for the test."""
        # super().setUp()
        self.workDir = File(
            os.path.join(os.getcwd(), "target/test-output/"),
            self.__class__.__name__ + "_" + self.getName())
        # Remove leftovers of an earlier run before recreating the directory.
        self.deleteDirectory(self.workDir)
        self.workDir.mkdir()

    def tearDown(self):  # throws Exception
        """Delete the work directory created by ``setUp``."""
        self.deleteDirectory(self.workDir)
        # super().tearDown()

    @classmethod
    def deleteDirectory(cls, path):
        """Recursively delete *path* together with everything below it.

        Entries that cannot be deleted are reported with ``print`` and
        the deletion continues (best effort).

        Args:
            path: object offering ``exists()``, ``listFiles()`` and
                ``delete()``; its entries additionally offer
                ``isDirectory()``.

        Returns:
            The result of ``path.delete()``, or ``False`` when *path* does
            not exist.
        """
        if not path.exists():
            return False
        # listFiles() may yield nothing usable for a plain file; treat a
        # None result like an empty listing instead of crashing.
        for entry in path.listFiles() or ():
            if entry.isDirectory():
                if not cls.deleteDirectory(entry):
                    # str() because the entry need not be a str itself.
                    print("Unable to delete dir: " + str(entry))
            elif not entry.delete():
                print("Unable to delete file: " + str(entry))
        return path.delete()

    # The three methods below accept no arguments besides self and do
    # nothing.
    # NOTE(review): presumably placeholders that concrete test classes
    # replace with the unittest implementations -- confirm. ``assertFals``
    # looks like a typo for ``assertFalse``; the name is kept so existing
    # references keep working.
    def assertTrue(self):
        pass

    def assertFals(self):
        pass

    def assertEqual(self):
        pass
def _testParseMultistream(self):  # throws Exception
    """Parse the multistream dump filtered to "aardvark" and check the result.

    A throw-away parser class is assembled whose ``onSiteInfoComplete``
    and ``setPageId`` callbacks record what the parser reports. The test
    then checks that site info arrived, that the dump language is
    English, and that 100 page ids were seen, starting with 177 and
    ending with 306.
    """
    pageIds = []
    # One-element list used as a mutable cell for the nested callback.
    # It must start out as None so the first callback can be told apart
    # from a repeated one.
    siteInfo = [None]

    def onSiteInfoComplete(_, dumpInfo):
        if siteInfo[0] is not None:
            raise IllegalStateException(
                "received onSiteInfoComplete more than once")
        siteInfo[0] = dumpInfo

    def setPageId(_, pageId):
        pageIds.append(pageId)

    parser = type(
        "_EmptyParser",
        (WiktionaryDumpParser, WiktionaryDumpParserTest.EmptyParser),
        {
            "onSiteInfoComplete": onSiteInfoComplete,
            "setPageId": setPageId,
        })()

    resources_dir = os.path.join(os.getcwd(), "resources")
    multistreamDump = File(
        resources_dir,
        "enwiktionary-20150224-pages-articles-multistream.xml.bz2")
    multistreamDumpIndex = File(
        resources_dir,
        "enwiktionary-20150224-pages-articles-multistream-index.txt.bz2")
    parser.parseMultistream(multistreamDump, multistreamDumpIndex,
                            MultistreamFilter.IncludingNames("aardvark"))

    # unittest takes the checked object first and the message second.
    self.assertIsNotNone(siteInfo[0], "did not parse siteInfo")
    self.assertEqual(Language.ENGLISH, siteInfo[0].getDumpLanguage())
    self.assertEqual(100, len(pageIds))
    self.assertEqual(177, pageIds[0])
    self.assertEqual(306, pageIds[-1])
def main():
    """Run the example: parse a dump and print the entries for "Wiktionary".

    Command-line arguments, read from ``sys.argv[1:]``:
        1. name of the dump file,
        2. output directory for parsed data,
        3. boolean value that specifies if existing parsed data should be
           deleted; ``true``, ``1``, ``yes``, ``y`` and ``on`` (any case)
           mean yes, every other value means no.

    Raises:
        IllegalArgumentException: if the number of arguments is not three.
    """
    import sys

    args = sys.argv[1:]
    for arg in args:
        print(arg)

    if len(args) != 3:
        raise IllegalArgumentException(
            "Too few arguments. "
            + "Required arguments: <DUMP_FILE> <OUTPUT_DIRECTORY> "
            + "<OVERWRITE_EXISTING_DATA>")

    dumpFile = File(os.path.dirname(args[0]), os.path.basename(args[0]))
    outputDirectory = File(args[1], "")
    # bool("false") is True because any non-empty string is truthy, so the
    # flag has to be interpreted textually.
    overwriteExisting = args[2].strip().lower() in (
        "true", "1", "yes", "y", "on")

    # Parse dump file.
    PyWKTL.parseWiktionaryDump(dumpFile, outputDirectory, overwriteExisting)

    # Create IWiktionaryEdition for our parsed data.
    wkt = PyWKTL.openEdition(outputDirectory)
    try:
        # Retrieve all IWiktionaryEntries for the word "Wiktionary" and
        # print the information of the parsed entries.
        for entry in wkt.getEntriesForWord("Wiktionary"):
            print(WiktionaryFormatter.formatHeaderForEntry(entry))
    finally:
        # Close the Wiktionary edition even if lookup or printing failed.
        wkt.close()
def getSimpleENDump(self):
    """Build the test-data case for ``WiktionaryTestData_en_20080613.xml``.

    Returns:
        A ``WiktionaryDataTestCase`` constructed from the dump file in
        ``WiktionaryTestCase.RESOURCE_PATH``, ``Language.ENGLISH``, the
        code ``"en"`` and the list of ``EN_*`` entries of this object.
    """
    entries = [
        self.EN_PARAMETER,
        self.EN_PLACE1,
        self.EN_PLACE2,
        self.EN_PLACE3,
        self.EN_PLACE4,
        self.EN_PLACE5,
        self.EN_PLACE6,
        self.EN_PLACE7,
    ]
    dump_file = File(WiktionaryTestCase.RESOURCE_PATH,
                     "WiktionaryTestData_en_20080613.xml")
    return WiktionaryDataTestCase(dump_file, Language.ENGLISH, "en", entries)
def testParseEmptyFields(self):
    """Parse ``WiktionaryDumpParserNullTest.xml`` with its expected values.

    The expected values describe two pages: one whose fields are empty
    or zero, and one for which only a ``null`` timestamp is listed.
    """
    expected_events = [
        # First page.
        "onPageStart",
        "setTitle: ",
        "setPageId: 0",
        "setRevision: 0",
        "setTimestamp: null",
        "setText: ",
        "onPageEnd",
        # Second page.
        "onPageStart",
        "setTimestamp: null",
        "onPageEnd",
    ]
    parser = WiktionaryDumpParserTest._WiktionaryDumpParser(
        self, expected_events)

    resources_dir = os.path.join(os.getcwd(), "resources")
    parser.parseFile(File(resources_dir, "WiktionaryDumpParserNullTest.xml"))
def testBzip2Stream(self):
    """Parse ``XMLDumpParserTest.xml.bz2`` and check the callback sequence."""
    expected_events = [
        "onParserStart",
        "onElementStart: dump_parser_test",
        "onElementStart: header",
        "onElementStart: param",
        "onElementEnd: param",
        "onElementStart: param",
        "onElementEnd: param",
        "onElementEnd: header",
        "onElementStart: element",
        "onElementEnd: element",
        "onElementStart: element",
        "onElementEnd: element",
        "onElementEnd: dump_parser_test",
        "onParserEnd",
    ]
    self.parser = XMLDumpParserTest._XMLDumpParser(self, expected_events)

    resources_dir = os.path.join(os.getcwd(), "resources")
    self.parser.parseFile(File(resources_dir, "XMLDumpParserTest.xml.bz2"))

    # The list handed to the parser must be empty once parsing is done.
    self.assertTrue(not expected_events)
def PyWiktionaryCli():
    """Offer a command line interface to Wiktionary.

    You can type a word and after pressing <enter> the information of
    corresponding entries will be printed. In order to quit the interface
    just hit enter.

    The single command-line argument (``sys.argv[1]``) is the path to the
    parsed Wiktionary data.

    Raises:
        IllegalArgumentException: if the number of command-line arguments
            is not exactly one.
    """
    if len(sys.argv) != 2:
        raise IllegalArgumentException(
            "Too few arguments. Required arguments: <PARSED-WIKTIONARY>")

    PROMPT = "> "
    END = ""
    wktPath = sys.argv[1]

    from api.WiktionaryFormatter import WiktionaryFormatter
    formatter = WiktionaryFormatter.instance()

    from PyWKTL import PyWKTL
    wkt = PyWKTL.openEdition(File("", wktPath))
    try:
        while True:
            line = input(PROMPT)
            if line == END:
                break
            page = wkt.getPageForWord(line)
            if page is None or page.getEntryCount() == 0:
                print(line + " is not in Wiktionary")
            else:
                print(formatter.formatPage(page))
    except EOFError:
        # End of input (e.g. a closed stdin) ends the session as well.
        print("exit")
    finally:
        # Release the opened edition on every way out of the loop.
        wkt.close()
def testParseMultistreamWithNormalParser(self):  # throws Exception
    """Hand the multistream bz2 dump to ``self.parser.parseFile``.

    There are no assertions; the test passes when parsing raises nothing.
    """
    resources_dir = os.path.join(os.getcwd(), "resources")
    multistream_dump = File(
        resources_dir,
        "enwiktionary-20150224-pages-articles-multistream.xml.bz2")
    self.parser.parseFile(multistream_dump)
def setUp(self):  # throws Exception
    """Create an empty work directory named after the test class."""
    # super().setUp()
    output_root = os.path.join(os.getcwd(), "target/test-output/")
    directory_name = "_".join((self.__class__.__name__, self.getName()))
    self.workDir = File(output_root, directory_name)

    # Clear whatever is already there, then create the directory.
    self.deleteDirectory(self.workDir)
    self.workDir.mkdir()
def setUp(self):  # throws Exception
    """Run the parent ``setUp`` and parse the simple EN dump into ``workDir``."""
    super().setUp()

    english_dump = self.getSimpleENDump()
    self.wktEN = english_dump

    # NOTE(review): the child name starts with a slash; confirm that File
    # resolves "/en" below workDir rather than as an absolute path.
    target_directory = File(self.workDir, "/en")
    english_dump.parseFile(target_directory)