    def testLoadRegexes(self):
        api = DataPreparationAPI(None, None)
        api.setRegexFile(self.regexFile)

        api.getRegexes()

        self.assertTrue(api.substitutionRegexFormula.hasPatterns())
        self.assertTrue(len(api.validationPatternList) > 0)
        self.assertTrue(
            len(api.substitutionRegexFormula.substitutionPatternList[0]) > 1)
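
The assertions above pin down the shape of the loaded structures. As an illustration only (the attribute names come from the test itself; the dummy values are assumptions, not the toolkit's real data):

    # Illustrative shapes implied by the assertions (dummy values):
    # validationPatternList is a non-empty list of validation regexes,
    # and each substitution entry carries at least a pattern and a
    # replacement, hence the len(...) > 1 check.
    validationPatternList = [r"^[0-9a-z' -]+$"]
    substitutionPatternList = [(r"\bDr\.", u"Doktor")]

    assert len(validationPatternList) > 0
    assert len(substitutionPatternList[0]) > 1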
    def testPrepareDocumentSimple(self):
        api = DataPreparationAPI(None, None)
        api.setRegexFile(self.regexFile)
        api.setLMModeling(True)
        try:
            api.setFormattedText(u'"')
            api.prepareDocument(1)
        except Exception:
            self.fail("Should not raise an exception")
Example #3
    def outputSentencesToFiles(self, textDocumentsList):
        """Output the original sentences with language
           information to the database.
        """
        sentencesDict = {FRENCH_LABEL: [], GERMAN_LABEL: [],
                         ITALIAN_LABEL: [], ENGLISH_LABEL: [],
                         UNKNOWN_LABEL: []}

        for textDocument in textDocumentsList:
            DataPreparationAPI.appendDocumentSentences(textDocument, sentencesDict)

        DataPreparationAPI.outputPerLanguage(sentencesDict, self.getTempDirectory())
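
`appendDocumentSentences` and `outputPerLanguage` are helpers of `DataPreparationAPI` that are not shown here. A minimal sketch of what `outputPerLanguage` plausibly does, assuming one UTF-8 text file per language label with one sentence per line (the file naming and format are assumptions):

    import codecs, os

    def outputPerLanguage(sentencesDict, outputDir):
        # Write one UTF-8 file per language label, one sentence per line.
        for label, sentences in sentencesDict.items():
            if len(sentences) == 0:
                continue
            # Hypothetical file name; the real toolkit may use another scheme.
            path = os.path.join(outputDir, "sentences_%s.txt" % label)
            with codecs.open(path, "w", "utf-8") as f:
                f.write(u"\n".join(sentences))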
    def testPrepareDocument(self):
        api = DataPreparationAPI(None, None)
        api.setRegexFile(self.regexFile)
        api.setLMModeling(True)
        for languageId, strFileName in self.testFileList:
            self.logger.info("Testing %s" % strFileName)
            testList = self.getTestList(strFileName)
            for test, gt, bDiscard in testList:
                if int(bDiscard):
                    continue
                # Main call
                api.setFormattedText(test)
                api.prepareDocument(languageId)
                formattedText = api.getCleanedText()
                self.assertEqual(
                    formattedText.encode('utf-8'), gt.encode('utf-8'),
                    "'%s' is not '%s':%s" % (formattedText.encode('utf-8'),
                                             gt.encode('utf-8'), strFileName))
Example #8
    # Parse arguments
    args = parser.parse_args()
    inputFile = args.inputFile[0]
    outputDir = args.outputDir[0]
    language = int(args.language[0])
    regexFile = args.regexFile[0]

    # Flags
    debug = args.debug
    filterSentences = args.filter
    removePunctuation = args.rmpunct
    verbalizePunctuation = args.vbpunct
    lmModeling = args.lm

    setupLogging(logging.INFO, outputDir + "/task_log.txt")

    # API setup
    api = DataPreparationAPI(inputFile, outputDir)
    api.setRegexFile(regexFile)
    api.setFilterSentences(filterSentences)
    api.setLMModeling(lmModeling)
    api.setRemovePunctuation(removePunctuation)
    api.setVerbalizePunctuation(verbalizePunctuation)

    if language == 0:
        api.trainClassifier()

    # Main processing
    api.prepareDocument(language)
    api.outputSentencesToFiles(outputDir)
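
The `parser` referenced above is built elsewhere in the script. A minimal sketch of an `argparse` setup that would produce these attributes; `nargs=1` is inferred from the `args.xxx[0]` indexing, while the short option names and help strings are assumptions:

    import argparse

    parser = argparse.ArgumentParser(description="Data preparation for ASR language modeling")
    parser.add_argument("-i", "--inputFile", nargs=1, required=True, help="input text document")
    parser.add_argument("-o", "--outputDir", nargs=1, required=True, help="output directory")
    parser.add_argument("-l", "--language", nargs=1, default=["0"], help="language id; 0 triggers automatic detection")
    parser.add_argument("-r", "--regexFile", nargs=1, required=True, help="regular expression substitution file")
    parser.add_argument("-d", "--debug", action="store_true", help="verbose debug output")
    parser.add_argument("-f", "--filter", action="store_true", help="filter sentences")
    parser.add_argument("--rmpunct", action="store_true", help="remove punctuation")
    parser.add_argument("--vbpunct", action="store_true", help="verbalize punctuation")
    parser.add_argument("--lm", action="store_true", help="prepare text for language modeling")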
    def testPrepareDocumentBasic(self):
        testString = ur"/ HES-SO und AdG/LA - auch im Winter / Sommer -"
        gtString = ur"hes-so und adg/la auch im winter sommer"

        api = DataPreparationAPI(None, None)
        api.setRegexFile(self.regexFile)
        api.setLMModeling(True)
        api.setKeepNewWords(True)
        api.setFormattedText(testString)
        api.prepareDocument(2)
        formattedText = api.getCleanedText()
        self.assertEqual(gtString.encode('utf-8'),
                         formattedText.encode('utf-8'))
Example #12
    def doWork(self):
        """The actual upload of sentences.
        """
        self._log(logging.INFO, "Do work!")

        if len(self.mapLists) > 1:
            self._log(logging.CRITICAL, "Only one map list accepted!")

        documentUrl = None

        try:
            # All pdf documents
            textDocumentsList = []
            dictMap = self.mapLists[0].getDictionaryMap()

            totalCount = len(dictMap.keys())
            count = 0

            self._log(logging.INFO, "Temp dir is: %s" % self.getTempDirectory())
            self._log(logging.INFO, "Output dir is: %s" % self.getOutputDirectory())
            self._log(logging.INFO, "%d files to process!" % totalCount)

            # Setup once for all documents
            api = DataPreparationAPI(None, self.getOutputDirectory())
            if self.regexFile is not None and len(self.regexFile) > 0:
                api.setRegexFile(self.regexFile)

            api.setFilterSentences(self.textFiltering)
            api.setDebugMode(self.debug)
            api.setRemovePunctuation(self.removePunctuation)
            api.setVerbalizePunctuation(self.verbalizePunctuation)
            api.setLMModeling(self.lmModeling)

            # Loop through map file
            for documentName in dictMap.keys():
                for language in dictMap[documentName]:
                    documentUrl = self.inputList.getPath(documentName)

                    # Set the current document information
                    api.setInputFile(documentUrl)

                    # Main processing
                    api.prepareDocument(LANGUAGE2ID[language])
                    textDocumentsList.append(api.getDocument())

                count += 1
                self._log(logging.INFO, "%d remaining files to process!" % (totalCount - count))

            self._log(logging.INFO, "Output results to language files.")
            self.outputSentencesToFiles(textDocumentsList)

            # Outcome of the work to be saved
            self.setResult(False, "Success importing sentences from %s" % self.mapLists[0].getDataMapFile())

        except Exception as e:
            errorMessage = "An error has occurred when importing sentences from %s" % documentUrl
            self._log(logging.CRITICAL, getErrorMessage(e, errorMessage))
            raise e
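
`LANGUAGE2ID` and the `*_LABEL` constants are imported from the surrounding package. One plausible shape, for orientation only: the examples establish that id 0 triggers classifier-based detection and that id 2 is used for German text; everything else is an assumption:

    FRENCH_LABEL, GERMAN_LABEL = "french", "german"
    ITALIAN_LABEL, ENGLISH_LABEL = "italian", "english"
    UNKNOWN_LABEL = "unknown"

    # Hypothetical mapping; only 2 == German is evidenced by the tests
    # above, and 0 is reserved for automatic language detection.
    LANGUAGE2ID = {FRENCH_LABEL: 1, GERMAN_LABEL: 2,
                   ITALIAN_LABEL: 3, ENGLISH_LABEL: 4}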
Example #14
    def doWork(self):
        """The actual upload of sentences.
        """
        self._log(logging.INFO, "Do work!")

        if len(self.mapLists) > 1:
            self._log(logging.CRITICAL,"Only one map list accepted!")

        documentUrl = None

        try:
            #All pdf documents
            textDocumentsList = []
            dictMap = self.mapLists[0].getDictionaryMap()

            totalCount = len(dictMap.keys())
            count = 0

            self._log(logging.INFO, "Temp dir is: %s" % self.getTempDirectory())
            self._log(logging.INFO, "Output dir is: %s" % self.getOutputDirectory())
            self._log(logging.INFO, "%d files to process!" % totalCount)

            #Setup once for all documents
            api = DataPreparationAPI(None, self.getOutputDirectory())
            if self.regexFile != None and len(self.regexFile) > 0:
                api.setRegexFile(self.regexFile)

            api.setFilterSentences(self.textFiltering)
            api.setDebugMode(self.debug)
            api.setRemovePunctuation(self.removePunctuation)
            api.setVerbalizePunctuation(self.verbalizePunctuation)
            api.setSegmentWithNLTK(self.segmentWithNLTK)
            api.setLMModeling(self.lmModeling)
            api.trainClassifier()

            #Loop trough map file
            for documentName in dictMap.keys():
                for language in dictMap[documentName]:
                    documentUrl = self.inputList.getPath(documentName)

                    #Set the current document information
                    api.setInputFile(documentUrl)
                   
                    #Main processing
                    api.prepareDocument(LANGUAGE2ID[language])
                    textDocumentsList.append(api.getDocument())

                count += 1
                self._log(logging.INFO, "%d remaining files to process!" % (totalCount-count))

            self._log(logging.INFO, "Output results to language files.")
            self.outputSentencesToFiles(textDocumentsList)

            #Outcome of the work to be saved
            self.setResult(False, "Success importing sentences from %s" % self.mapLists[0].getDataMapFile())

        except Exception, e:
            errorMessage = "An error as occurred when importing sentences from %s" % documentUrl
            self._log(logging.CRITICAL, getErrorMessage(e, errorMessage))
            raise e
    regexFile = args.regexFile[0]

    # Flags
    debug = bool(args.debug)
    filterSentences = bool(args.filter)
    filterSentences2ndStage = bool(args.filter2ndStage)
    removePunctuation = bool(args.rmpunct)
    verbalizePunctuation = bool(args.vbpunct)
    rawSeg = bool(args.rawseg)
    lmModeling = bool(args.lm)
    expandNumberInWords = bool(not args.trim)

    setupLogging(logging.INFO, outputDir + "/data_preparation_log.txt")

    # API setup
    api = DataPreparationAPI(None, outputDir)
    api.setRegexFile(regexFile)
    api.setFilterSentences(filterSentences)
    api.setFilterSentences2ndStage(filterSentences2ndStage)
    api.setLMModeling(lmModeling)
    api.setRemovePunctuation(removePunctuation)
    api.setVerbalizePunctuation(verbalizePunctuation)
    api.setSegmentWithNLTK(not rawSeg)
    api.setExpandNumberInWords(expandNumberInWords)

    if language == 0:
        api.trainClassifier()

    # Main processing
    MyFile.checkDirExists(outputDir)
    regexFile = args.regexFile[0]

    # Flags
    debug = bool(args.debug)
    filterSentences = bool(args.filter)
    filterSentences2ndStage = bool(args.filter2ndStage)
    removePunctuation = bool(args.rmpunct)
    verbalizePunctuation = bool(args.vbpunct)
    rawSeg = bool(args.rawseg)
    lmModeling = bool(args.lm)
    keepNewWords = bool(not args.trim)

    setupLogging(logging.INFO, outputDir + "/data_preparation_log.txt")

    # API setup
    api = DataPreparationAPI(None, outputDir)
    api.setRegexFile(regexFile)
    api.setFilterSentences(filterSentences)
    api.setFilterSentences2ndStage(filterSentences2ndStage)
    api.setLMModeling(lmModeling)
    api.setRemovePunctuation(removePunctuation)
    api.setVerbalizePunctuation(verbalizePunctuation)
    api.setSegmentWithNLTK(not rawSeg)
    api.setKeepNewWords(keepNewWords)

    if language == 0:
        api.trainClassifier()

    # Main processing
    MyFile.checkDirExists(outputDir)
Example #17
    regexFile = args.regexFile[0]

    # Flags
    debug = bool(args.debug)
    filterSentences = bool(args.filter)
    filterSentences2ndStage = bool(args.filter2ndStage)
    removePunctuation = bool(args.rmpunct)
    verbalizePunctuation = bool(args.vbpunct)
    rawSeg = bool(args.rawseg)
    lmModeling = bool(args.lm)
    keepNewWords = bool(not args.trim)

    setupLogging(logging.INFO, outputDir + "/task_log.txt")

    # API setup
    api = DataPreparationAPI(inputFile, outputDir)
    api.setRegexFile(regexFile)
    api.setFilterSentences(filterSentences)
    api.setFilterSentences2ndStage(filterSentences2ndStage)
    api.setLMModeling(lmModeling)
    api.setRemovePunctuation(removePunctuation)
    api.setVerbalizePunctuation(verbalizePunctuation)
    api.setSegmentWithNLTK(not rawSeg)
    api.setKeepNewWords(keepNewWords)

    if language == 0:
        api.trainClassifier()

    # Main processing
    api.prepareDocument(language)
    api.outputSentencesToFiles(outputDir)
    def testPrepareDocumentBasic(self):
        testString = r"/ HES-SO und AdG/LA - auch im Winter / Sommer -"
        gtString = r"hes-so und adg/la auch im winter sommer"

        api = DataPreparationAPI(None, None)
        api.setRegexFile(self.regexFile)
        api.setLMModeling(True)
        api.setExpandNumberInWords(False)
        api.setFormattedText(testString)
        api.prepareDocument(2)
        formattedText = api.getCleanedText()
        self.assertEqual(gtString, formattedText)