コード例 #1
0
ファイル: run_data_preparation.py プロジェクト: idiap/asrt
    inputFile = args.inputFile[0]
    outputDir = args.outputDir[0]
    language = int(args.language[0])
    regexFile = args.regexFile[0]

    #Flags
    debug = bool(args.debug)
    filterSentences = bool(args.filter)
    filterSentences2ndStage  = bool( args.filter2ndStage )
    removePunctuation = bool(args.rmpunct)
    verbalizePunctuation = bool(args.vbpunct)
    rawSeg = bool(args.rawseg)
    lmModeling = bool(args.lm)
    keepNewWords = bool(not args.trim)

    setupLogging(logging.INFO, outputDir + "/task_log.txt")

    #Api setup
    api = DataPreparationAPI(inputFile, outputDir)
    api.setRegexFile(regexFile)
    api.setFilterSentences(filterSentences)
    api.setFilterSentences2ndStage(filterSentences2ndStage)
    api.setLMModeling(lmModeling)
    api.setRemovePunctuation(removePunctuation)
    api.setVerbalizePunctuation(verbalizePunctuation)
    api.setSegmentWithNLTK(not rawSeg)
    api.setKeepNewWords(keepNewWords)

    if language == 0:
        api.trainClassifier()
コード例 #2
0
        count += 1
        if count % 50000 == 0:
            print "Processed %d values" % count

        #Read next line
        l = fd.readline()

    io.closeFile(fd)

    strContent = u"\n".join(linesList)
    io.writeFileContent(outputFile, strContent)

################
# main
#
if __name__ == "__main__" :
    parser = argparse.ArgumentParser(description=usage)
    parser.add_argument("-i", "--input", help="input file", nargs=1, dest="inputFile", required=True)
    parser.add_argument("-o", "--output", help="output file", nargs=1, dest="outputFile", required=True)
    parser.add_argument("-r", "--regex", help="regular expression file", nargs=1, dest="regexFile", required=True)
    
    args = parser.parse_args()

    inputFile = os.path.abspath(args.inputFile[0])
    outputFile = os.path.abspath(args.outputFile[0])
    regexFile = os.path.abspath(args.regexFile[0])

    setupLogging(logging.INFO)

    applyRegexes(inputFile, outputFile, regexFile)
    
コード例 #3
0
# along with asrt. If not, see <http://opensource.org/licenses/>.

__author__ = "Alexandre Nanchen"
__version__ = "Revision: 1.0 "
__date__ = "Date: 2015/09"
__copyright__ = "Copyright (c) 2015 Idiap Research Institute"
__license__ = "BSD 3-Clause"

import unittest, re, string, logging

from asrt.common.formula.FormulaLMPreparation import LMPreparationFormula
from asrt.common.AsrtConstants import UTF8MAP, SPACEPATTERN, DOTCOMMAEXCLUDE, PUNCTUATIONEXCLUDE
from asrt.common.AsrtConstants import ABBREVIATIONS
from asrt.common.LoggingSetup import setupLogging

setupLogging(logging.INFO, "./output.log")


class TestFormulaLMPreparation(unittest.TestCase):
    allPunctList = DOTCOMMAEXCLUDE + PUNCTUATIONEXCLUDE

    def verifyEqual(self, testList, f, callback):
        for t, gt in testList:
            f.strText = t
            callback()
            self.assertEquals(gt.encode('utf-8'), f.strText.encode('utf-8'))

    ############
    #Tests
    #
    def testNormalizeUtf8(self):
コード例 #4
0
#
if __name__ == "__main__":
    #Setup parser
    parser = argparse.ArgumentParser(description=usage)
    parser.add_argument("-t", "--target", help="target directory containing the data.olist and data.omap", 
                         nargs=1, dest="targetDir", required=True)
    parser.add_argument("-o", "--output", help="output directory", nargs=1, dest="outputDir", required=True)
    parser.add_argument("-r", "--regex", help="regex file", nargs=1, dest="regexFile", required=True)
    parser.add_argument("-f", "--filter", help="filter sentences", dest="filter",action="store_true")
    parser.add_argument("-d", "--debug", help="enable debug output", action="store_true")
    parser.add_argument("-n", "--rmpunctuation", help="remove punctuation", action="store_true")
    parser.add_argument("-p", "--vbpunctuation", help="verbalize punctuation", action="store_true")
    parser.add_argument("-s", "--rawseg", help="do not segment sentences with NLTK", dest="rawseg",action="store_true")
    parser.add_argument("-m", "--lm", help="prepare for lm modeling", dest="lm",action="store_true")

    #Parse arguments
    args = parser.parse_args()
    targetDir = args.targetDir[0]
    outputDir = args.outputDir[0]
    regexFile = args.regexFile[0]

    segmentWithNLTK = "True" if not args.rawseg else "False"

    setupLogging(logging.INFO, outputDir + "/task_log.txt")

    task = ImportDocumentTask(TaskInfo(STRPARAMETERS % (regexFile, str(args.debug), 
                                                        args.rmpunctuation, args.vbpunctuation,
                                                        segmentWithNLTK, args.filter, args.lm), 
                                       outputDir, targetDir))
    task.execute()
コード例 #5
0
    inputList = args.inputList[0]
    outputDir = args.outputDir[0]
    language = int(args.language[0])
    regexFile = args.regexFile[0]

    #Flags
    debug = bool(args.debug)
    filterSentences = bool(args.filter)
    filterSentences2ndStage  = bool( args.filter2ndStage )
    removePunctuation = bool(args.rmpunct)
    verbalizePunctuation = bool(args.vbpunct)
    rawSeg = bool(args.rawseg)
    lmModeling = bool(args.lm)
    keepNewWords = bool(not args.trim)

    setupLogging(logging.INFO, outputDir + "/data_preparation_log.txt")

    #Api setup
    api = DataPreparationAPI(None, outputDir)
    api.setRegexFile(regexFile)
    api.setFilterSentences(filterSentences)
    api.setFilterSentences2ndStage(filterSentences2ndStage)
    api.setLMModeling(lmModeling)
    api.setRemovePunctuation(removePunctuation)
    api.setVerbalizePunctuation(verbalizePunctuation)
    api.setSegmentWithNLTK(not rawSeg)
    api.setKeepNewWords(keepNewWords)

    if language == 0:
        api.trainClassifier()
コード例 #6
0
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=usage)
    parser.add_argument("-i",
                        "--input",
                        help="input file",
                        nargs=1,
                        dest="inputFile",
                        required=True)
    parser.add_argument("-o",
                        "--output",
                        help="output file",
                        nargs=1,
                        dest="outputFile",
                        required=True)
    parser.add_argument("-r",
                        "--regex",
                        help="regular expression file",
                        nargs=1,
                        dest="regexFile",
                        required=True)

    args = parser.parse_args()

    inputFile = os.path.abspath(args.inputFile[0])
    outputFile = os.path.abspath(args.outputFile[0])
    regexFile = os.path.abspath(args.regexFile[0])

    setupLogging(logging.INFO)

    applyRegexes(inputFile, outputFile, regexFile)
コード例 #7
0
# along with asrt. If not, see <http://opensource.org/licenses/>.

__author__ = "Alexandre Nanchen"
__version__ = "Revision: 1.0 "
__date__ = "Date: 2015/09"
__copyright__ = "Copyright (c) 2015 Idiap Research Institute"
__license__ = "BSD 3-Clause"

import unittest, re, string, logging

from asrt.common.formula.FormulaLMPreparation import LMPreparationFormula
from asrt.common.AsrtConstants import UTF8MAP, SPACEPATTERN, DOTCOMMAEXCLUDE, PUNCTUATIONEXCLUDE
from asrt.common.AsrtConstants import ABBREVIATIONS
from asrt.common.LoggingSetup import setupLogging

setupLogging(logging.INFO, "./output.log")

class TestFormulaLMPreparation(unittest.TestCase):
    allPunctList = DOTCOMMAEXCLUDE + PUNCTUATIONEXCLUDE

    def verifyEqual(self, testList, f, callback):
        for t, gt in testList:
            f.strText = t
            callback()
            self.assertEquals(gt.encode('utf-8'), f.strText.encode('utf-8'))

    ############
    #Tests
    #
    def testNormalizeUtf8(self):
        languages = ['0', '1', '2']
コード例 #8
0
__date__ = "Date: 2015/09"
__copyright__ = "Copyright (c) 2015 Idiap Research Institute"
__license__ = "BSD 3-Clause"

import unittest
import re
import string
import logging

from asrt.common.formula.FormulaLMPreparation import LMPreparationFormula
from asrt.common.AsrtConstants import UTF8MAP, SPACEPATTERN, DOTCOMMAEXCLUDE, PUNCTUATIONEXCLUDE
from asrt.common.AsrtConstants import ABBREVIATIONS
from asrt.common.LoggingSetup import setupLogging
from asrt.config.AsrtConfig import TEMPDIRUNITTEST

setupLogging(logging.INFO, TEMPDIRUNITTEST + "/output.log")


class TestFormulaLMPreparation(unittest.TestCase):
    allPunctList = DOTCOMMAEXCLUDE + PUNCTUATIONEXCLUDE

    def verifyEqual(self, testList, f, callback):
        for t, gt in testList:
            f.strText = t
            callback()
            self.assertEqual(gt.encode('utf-8'), f.strText.encode('utf-8'))

    ############
    # Tests
    #
    def testNormalizeUtf8(self):
コード例 #9
0
    inputList = args.inputList[0]
    outputDir = args.outputDir[0]
    language = int(args.language[0])
    regexFile = args.regexFile[0]

    # Flags
    debug = bool(args.debug)
    filterSentences = bool(args.filter)
    filterSentences2ndStage = bool(args.filter2ndStage)
    removePunctuation = bool(args.rmpunct)
    verbalizePunctuation = bool(args.vbpunct)
    rawSeg = bool(args.rawseg)
    lmModeling = bool(args.lm)
    expandNumberInWords = bool(not args.trim)

    setupLogging(logging.INFO, outputDir + "/data_preparation_log.txt")

    # Api setup
    api = DataPreparationAPI(None, outputDir)
    api.setRegexFile(regexFile)
    api.setFilterSentences(filterSentences)
    api.setFilterSentences2ndStage(filterSentences2ndStage)
    api.setLMModeling(lmModeling)
    api.setRemovePunctuation(removePunctuation)
    api.setVerbalizePunctuation(verbalizePunctuation)
    api.setSegmentWithNLTK(not rawSeg)
    api.setExpandNumberInWords(expandNumberInWords)

    if language == 0:
        api.trainClassifier()