def testErrors(self):
        """Check that unreadable or malformed dump files make parseFile raise.

        Four inputs are tried in order: a file that does not exist, a file
        with invalid XML, an empty Bzip2 file and a Bzip2 file with an
        erroneous header. Each case fails the test if no exception is raised.
        """
        # Missing file.
        try:
            self.parser.parseFile(File("", "missing_dump$@%.xml"))
            self.fail("WiktionaryException/IOException expected")
        except (IOException, WiktionaryException) as error:
            self.assertTrue(isinstance(error, (IOException, WiktionaryException)))

        # The remaining cases all expect a SAX parse error and differ only in
        # the fixture name and the message reported when nothing is raised.
        resourceDir = os.path.join(os.getcwd(), "resources")
        saxCases = (
            # Invalid XML.
            ("XMLDumpParserErrorXMLTest.xml",
             "WiktionaryException/SAXException expected"),
            # Empty Bzip2 file.
            ("XMLDumpParserErrorEmptyTest.xml.bz2",
             "WiktionaryException/IOException expected"),
            # Erroneous Bzip2 header.
            ("XMLDumpParserErrorHeaderTest.xml.bz2",
             "WiktionaryException/IOException expected"),
        )
        for fixtureName, failureMessage in saxCases:
            try:
                self.parser.parseFile(File(resourceDir, fixtureName))
                self.fail(failureMessage)
            except xml.sax.SAXException as error:
                self.assertTrue(isinstance(error, xml.sax.SAXParseException))
    def setUp(self):  # throws Exception
        """Reset the page counter and build the multistream fixtures and parser."""
        super().setUp()

        # The counter was marked "atomic" in the original note; a lock is
        # created next to it (presumably to guard updates - confirm in callers).
        self.pageCount = 0
        self._lock = threading.Lock()

        # Both fixture files live in "resources" below the working directory.
        resourceDir = os.path.join(os.getcwd(), "resources")
        self.multistreamDump = File(
            resourceDir,
            "enwiktionary-20150224-pages-articles-multistream.xml.bz2")
        self.multistreamDumpIndex = File(
            resourceDir,
            "enwiktionary-20150224-pages-articles-multistream-index.txt.bz2")

        # The parser under test wraps this test class's own _XMLDumpParser.
        wrappedParser = MultistreamXMLDumpParserTest._XMLDumpParser(self)
        self.subject = MultistreamXMLDumpParser(wrappedParser)
    def testParsedInformation(self):
        """Parse XMLDumpParserTest.xml and check the expected callback list.

        The original note documents the fixture as::

            <dump_parser_test>
              <header>
                <param name="param1" value="value1" />
                <param name="param1" value="value2" />
              </header>
              <element id="1">
                Some text content
              </element>
              <element id="2">
                Some text content
              </element>
            </dump_parser_test>

        The test passes only if the expectation list is empty after parsing.
        """
        expectedValues = [
            "onParserStart",
            "onElementStart: dump_parser_test",
            "onElementStart: header",
            "onElementStart: param",
            "onElementEnd: param",
            "onElementStart: param",
            "onElementEnd: param",
            "onElementEnd: header",
            "onElementStart: element",
            "onElementEnd: element",
            "onElementStart: element",
            "onElementEnd: element",
            "onElementEnd: dump_parser_test",
            "onParserEnd",
        ]

        self.parser = XMLDumpParserTest._XMLDumpParser(self, expectedValues)
        fixture = File(os.path.join(os.getcwd(), "resources"), "XMLDumpParserTest.xml")
        self.parser.parseFile(fixture)
        self.assertTrue(not expectedValues)
    def testParsedInformation(self):
        """Parse WiktionaryDumpParserTest.xml and check the expected callbacks.

        The expectation list names the base URL, one namespace and two pages
        with their title, id, revision, timestamp, author and text. The test
        passes only if the list is empty after parsing.
        """
        expectedValues = [
            "setBaseURL: http://de.wiktionary.org/wiki/Wiktionary:Hauptseite",
            "addNamespace: Diskussion",
            # First page.
            "onPageStart",
            "setTitle: Page 1",
            "setPageId: 9",
            "setRevision: 10763",
            "setTimestamp: 2004-09-17T08:23:57Z",
            "setAuthor: TJ",
            "setText: Text 1",
            "onPageEnd",
            # Second page.
            "onPageStart",
            "setTitle: Page 2",
            "setPageId: 10",
            "setRevision: 10764",
            "setTimestamp: 2004-09-17T08:34:29Z",
            "setAuthor: TJ",
            "setText: Text 2\n\n      Test Test",
            "onPageEnd",
        ]

        parser = WiktionaryDumpParserTest._WiktionaryDumpParser(
            self, expectedValues)
        fixture = File(os.path.join(os.getcwd(), "resources"),
                       "WiktionaryDumpParserTest.xml")
        parser.parseFile(fixture)
        self.assertTrue(not expectedValues)
class WiktionaryTestCase:
    """Abstract test case for PyWKTL.

    Provides a per-test work directory below ``target/test-output/`` that is
    recreated in :meth:`setUp` and removed again in :meth:`tearDown`.
    """

    # Directory with the test fixtures, resolved against the working directory
    # at class-definition time.
    RESOURCE_PATH = os.path.join(os.getcwd(), "resources")

    # Work directory of the current test; assigned in setUp().
    workDir = None

    # noinspection PyMethodMayBeStatic
    def getName(self):
        """Return the name appended to the work directory name (empty here)."""
        return ""

    def setUp(self):  # throws Exception
        """Delete any existing work directory for this test and create it anew."""
        # super().setUp()
        self.workDir = File(os.path.join(os.getcwd(), "target/test-output/"),
                            self.__class__.__name__ + "_" + self.getName())
        self.deleteDirectory(self.workDir)
        self.workDir.mkdir()

    def tearDown(self):  # throws Exception
        """Remove the work directory created by setUp()."""
        self.deleteDirectory(self.workDir)
        # super().tearDown()

    @classmethod
    def deleteDirectory(cls, path):
        """Delete ``path`` after deleting everything it lists, best effort.

        Entries that cannot be deleted are reported on stdout and skipped.

        :param path: object offering ``exists()``, ``listFiles()``,
            ``isDirectory()`` and ``delete()``.
        :return: the result of ``path.delete()``, which is called even when
            ``path`` does not exist.
        """
        if path.exists():
            for child in path.listFiles():
                if child.isDirectory():
                    if not cls.deleteDirectory(child):
                        # str() keeps the report working when the entry is
                        # not a plain string; "text" + object raises TypeError.
                        print("Unable to delete dir: " + str(child))
                elif not child.delete():
                    print("Unable to delete file: " + str(child))
        return path.delete()

    # NOTE(review): the three stubs below accept no arguments and do nothing.
    # Presumably concrete tests get the real assertions from
    # unittest.TestCase earlier in the MRO - confirm. "assertFals" looks like
    # a typo for "assertFalse"; it is kept because renaming could change
    # which method subclasses resolve.
    def assertTrue(self):
        pass

    def assertFals(self):
        pass

    def assertEqual(self):
        pass
    def _testParseMultistream(self):  # throws Exception
        """Parse the multistream dump filtered to "aardvark" and check the result.

        A throw-away parser class records the dump info and every page id it
        is given. The test then expects an English dump and 100 page ids,
        the first being 177 and the last 306.

        Note: the leading underscore means unittest's default ``test`` prefix
        does not match, so this method is not collected automatically.
        """
        pageIds = []
        # One-slot holder so the nested callback can store the dump info.
        # It must start as None, otherwise the very first callback is
        # mistaken for a repeated one.
        siteInfo = [None]

        def onSiteInfoComplete(_, dumpInfo):
            # Site info must be reported exactly once per parse.
            if siteInfo[0] is None:
                siteInfo[0] = dumpInfo
            else:
                raise IllegalStateException(
                    "received onSiteInfoComplete more than once")

        def setPageId(_, pageId):
            pageIds.append(pageId)

        # Build and instantiate an ad-hoc parser class with the two recording
        # callbacks installed as methods.
        parser = type(
            "_EmptyParser",
            (WiktionaryDumpParser, WiktionaryDumpParserTest.EmptyParser), {
                "onSiteInfoComplete": onSiteInfoComplete,
                "setPageId": setPageId
            })()

        resourceDir = os.path.join(os.getcwd(), "resources")
        multistreamDump = File(
            resourceDir,
            "enwiktionary-20150224-pages-articles-multistream.xml.bz2")
        multistreamDumpIndex = File(
            resourceDir,
            "enwiktionary-20150224-pages-articles-multistream-index.txt.bz2")

        parser.parseMultistream(multistreamDump, multistreamDumpIndex,
                                MultistreamFilter.IncludingNames("aardvark"))

        # unittest takes the checked object first and the message second.
        self.assertIsNotNone(siteInfo[0], "did not parse siteInfo")
        self.assertEqual(Language.ENGLISH, siteInfo[0].getDumpLanguage())

        self.assertEqual(100, len(pageIds))
        self.assertEqual(177, pageIds[0])
        self.assertEqual(306, pageIds[-1])
    def main():
        """Run the example: parse a dump, then print the entries for "Wiktionary".

        Reads three command-line arguments from ``sys.argv[1:]``:

        * ``DUMP_FILE`` - name of the dump file,
        * ``OUTPUT_DIRECTORY`` - output directory for parsed data,
        * ``OVERWRITE_EXISTING_DATA`` - boolean value that specifies if
          existing parsed data should be deleted. ``true``, ``1``, ``yes``,
          ``y`` and ``on`` (case-insensitive) mean True, anything else False.

        Raises:
            IllegalArgumentException: if not exactly three arguments are given.
        """
        import sys

        args = sys.argv[1:]
        for arg in args:
            print(arg)

        if len(args) != 3:
            raise IllegalArgumentException(
                "Too few arguments. " +
                "Required arguments: <DUMP_FILE> <OUTPUT_DIRECTORY> " +
                "<OVERWRITE_EXISTING_DATA>")

        dumpFile = File(os.path.dirname(args[0]), os.path.basename(args[0]))
        outputDirectory = File(args[1], "")
        # bool() on a string is True for every non-empty value, including
        # "false", so the flag has to be parsed from its text.
        overwriteExisting = args[2].strip().lower() in (
            "true", "1", "yes", "y", "on")

        # Parse dump file
        PyWKTL.parseWiktionaryDump(dumpFile, outputDirectory,
                                   overwriteExisting)

        # Create IWiktionaryEdition for our parsed data.
        wkt = PyWKTL.openEdition(outputDirectory)
        try:
            # Retrieve all IWiktionaryEntries for the word "Wiktionary".
            entries = wkt.getEntriesForWord("Wiktionary")

            # Print the information of the parsed entries.
            for entry in entries:
                print(WiktionaryFormatter.formatHeaderForEntry(entry))
        finally:
            # Close the Wiktionary edition even if a lookup or print fails.
            wkt.close()
 def getSimpleENDump(self):
     """Build the English WiktionaryDataTestCase from the EN_* sample entries.

     The entries are EN_PARAMETER followed by EN_PLACE1 to EN_PLACE7, paired
     with the dump file WiktionaryTestData_en_20080613.xml.
     """
     sampleEntries = [
         self.EN_PARAMETER,
         self.EN_PLACE1,
         self.EN_PLACE2,
         self.EN_PLACE3,
         self.EN_PLACE4,
         self.EN_PLACE5,
         self.EN_PLACE6,
         self.EN_PLACE7,
     ]
     dumpFile = File(WiktionaryTestCase.RESOURCE_PATH,
                     "WiktionaryTestData_en_20080613.xml")
     return WiktionaryDataTestCase(dumpFile, Language.ENGLISH, "en",
                                   sampleEntries)
    def testParseEmptyFields(self):
        """Parse WiktionaryDumpParserNullTest.xml against the empty-field callbacks.

        The expectation list covers one page with empty or zero values and a
        second page that only reports a null timestamp.
        """
        # NOTE(review): unlike the testParsedInformation tests, nothing here
        # asserts that the expectation list is empty after parsing - confirm
        # whether that is intentional.
        expectedValues = [
            # Page with empty fields.
            "onPageStart",
            "setTitle: ",
            "setPageId: 0",
            "setRevision: 0",
            "setTimestamp: null",
            "setText: ",
            "onPageEnd",
            # Page with only a timestamp callback.
            "onPageStart",
            "setTimestamp: null",
            "onPageEnd",
        ]

        parser = WiktionaryDumpParserTest._WiktionaryDumpParser(
            self, expectedValues)
        fixture = File(os.path.join(os.getcwd(), "resources"),
                       "WiktionaryDumpParserNullTest.xml")
        parser.parseFile(fixture)
    def testBzip2Stream(self):
        """Parse XMLDumpParserTest.xml.bz2 and check the expected callback list.

        The test passes only if the expectation list is empty after parsing.
        """
        expectedValues = [
            "onParserStart",
            "onElementStart: dump_parser_test",
            "onElementStart: header",
            "onElementStart: param",
            "onElementEnd: param",
            "onElementStart: param",
            "onElementEnd: param",
            "onElementEnd: header",
            "onElementStart: element",
            "onElementEnd: element",
            "onElementStart: element",
            "onElementEnd: element",
            "onElementEnd: dump_parser_test",
            "onParserEnd",
        ]

        self.parser = XMLDumpParserTest._XMLDumpParser(self, expectedValues)
        compressedFixture = File(os.path.join(os.getcwd(), "resources"),
                                 "XMLDumpParserTest.xml.bz2")
        self.parser.parseFile(compressedFixture)
        self.assertTrue(not expectedValues)
Example #11
0
def PyWiktionaryCli():
    """Offer a command line interface to Wiktionary.

    Type a word and press <enter> to print the information of the
    corresponding entries. To quit the interface, just hit enter on an
    empty line; end-of-input prints "exit" and also quits.

    The path to the parsed Wiktionary data is read from ``sys.argv[1]``.

    Raises:
        IllegalArgumentException: if not exactly one argument is given.
    """
    if len(sys.argv) != 2:
        raise IllegalArgumentException("Too few arguments. Required arguments: <PARSED-WIKTIONARY>")

    PROMPT = "> "
    END = ""

    wktPath = sys.argv[1]

    from api.WiktionaryFormatter import WiktionaryFormatter
    formatter = WiktionaryFormatter.instance()

    from PyWKTL import PyWKTL
    wkt = PyWKTL.openEdition(File("", wktPath))
    try:
        while True:
            line = input(PROMPT)
            if line == END:
                break

            page = wkt.getPageForWord(line)
            if page is None or page.getEntryCount() == 0:
                print(line + " is not in Wiktionary")
            else:
                print(formatter.formatPage(page))
    except EOFError:
        # Input was closed (e.g. Ctrl-D or a finished pipe).
        print("exit")
    finally:
        # Release the edition on every way out of the loop.
        wkt.close()
 def testParseMultistreamWithNormalParser(self):  # throws Exception
     """Pass the multistream Bzip2 dump to self.parser.parseFile.

     Nothing is asserted; the test only requires the call not to raise.
     """
     resourceDir = os.path.join(os.getcwd(), "resources")
     multistreamDump = File(
         resourceDir,
         "enwiktionary-20150224-pages-articles-multistream.xml.bz2")
     self.parser.parseFile(multistreamDump)
 def setUp(self):  # throws Exception
     """Point workDir at this test's output directory, delete it and create it."""
     # super().setUp()
     outputRoot = os.path.join(os.getcwd(), "target/test-output/")
     # Directory name is "<test class name>_<getName()>".
     directoryName = self.__class__.__name__ + "_" + self.getName()
     self.workDir = File(outputRoot, directoryName)
     self.deleteDirectory(self.workDir)
     self.workDir.mkdir()
 def setUp(self):  # throws Exception:
     """Run the inherited set-up, then build and parse the simple English dump."""
     super().setUp()
     englishDump = self.getSimpleENDump()
     self.wktEN = englishDump
     # NOTE(review): the child path "/en" starts with a slash; whether File
     # resolves it below workDir depends on File's implementation - confirm.
     targetDirectory = File(self.workDir, "/en")
     englishDump.parseFile(targetDirectory)