Esempio n. 1
0
def Training(args = sys.argv[2:]):
    """
    Training RDRPOSTagger using initialized corpus against golden corpus!
    The initialized corpus is already generated by the use of an external initial tagger.

    For every (improveThreshold, matchThreshold) pair in the module-level
    ``thresholds``, a PosTaggingRDRTree is built from the two corpora and
    written to ``<args[0]>/T<improve>-<match>/<args[3]>``.

    args -- sequence of at least four strings:
        args[0]  directory containing the corpora; the per-threshold output
                 directories are created inside it
        args[1]  file name of the golden (correct) training corpus, joined onto args[0]
        args[2]  file name of the initialized training corpus, joined onto args[0]
        args[3]  file name under which each learnt rule tree is written

    NOTE: the default value is evaluated once, when the function is defined,
    so it reflects sys.argv as it was at import time.

    Raises IndexError if fewer than four arguments are supplied.
    """

    from src.tagger.SCRDRlearner.PosTaggingRDRTree import PosTaggingRDRTree

    dirPath = args[0]
    correctTrain = os.path.join(dirPath, args[1])
    initializedTrain = os.path.join(dirPath, args[2])
    learntRules = args[3]

    print ('\nTraining RDRPOSTagger in the use of initialized corpus against golden corpus....')
    print ('Building SCRDR-based POS tagging tree of rules...')

    for (improveThreshold, matchThreshold) in thresholds:
        outputDir = os.path.join(dirPath, "T%d-%d" % (improveThreshold, matchThreshold))
        # A previous (possibly interrupted) run may have left this directory
        # behind; reuse it instead of aborting the whole training run.
        os.makedirs(outputDir, exist_ok=True)

        rdrTree = PosTaggingRDRTree(improveThreshold, matchThreshold)
        rdrTree.buildTreeFromCorpus(initializedTrain, correctTrain)

        print ("Write the tree to file...")
        # rdrTree.writeToFile(...) is the disabled alternative kept from the original code.
        rdrTree.writeToFileWithoutSeenCases(os.path.join(outputDir, learntRules))

    print ('Completed!')
Esempio n. 2
0
def VnTraining(args = sys.argv[2:]):
    """
    Train RDRPOSTagger for Vietnamese POS tagging from a golden corpus.

    Steps, in the order performed below:
      1. getRawTextFromFile is called with the golden corpus and
         "<corpus>.RAW" (presumably input and output paths).
      2. readDictionary loads the dictionary at args[0]; VnInitTagger4Corpus
         is called with it, "<corpus>.RAW" and "<corpus>.INIT".
      3. For every (improveThreshold, matchThreshold) pair in the module-level
         ``thresholds``, a PosTaggingRDRTree is built from "<corpus>.INIT"
         and the golden corpus, written to
         ``<args[1]>/T<improve>-<match>/<args[3]>``, and the elapsed time
         for that pair is printed.

    args -- sequence of at least four strings:
        args[0]  path to the dictionary file
        args[1]  directory containing the golden corpus; the per-threshold
                 output directories are created inside it
        args[2]  file name of the golden (correct) training corpus, joined onto args[1]
        args[3]  file name under which each learnt rule tree is written

    NOTE: the default value is evaluated once, when the function is defined,
    so it reflects sys.argv as it was at import time.

    Raises IndexError if fewer than four arguments are supplied.
    """
    # PosTaggingRDRTree, thresholds and the helper functions used here are
    # resolved from module scope; this function does not import them itself.
    pathToDict = args[0]
    dirPath = args[1]
    correctTrain = os.path.join(dirPath, args[2])
    learntRules = args[3]

    rawCorpus = correctTrain + ".RAW"
    initCorpus = correctTrain + ".INIT"

    print( '\nTraining RDRPOSTagger for Vietnamese POS Tagging...')
    print( "Initial tagging...")

    getRawTextFromFile(correctTrain, rawCorpus)
    DICT = readDictionary(pathToDict)
    VnInitTagger4Corpus(DICT, rawCorpus, initCorpus)

    print ("Done Initialization!")

    print ('Building SCRDR-based POS tagging tree of rules...')

    for (improveThreshold, matchThreshold) in thresholds:
        timeStart = time.time()
        outputDir = os.path.join(dirPath, "T%d-%d" % (improveThreshold, matchThreshold))
        # A previous (possibly interrupted) run may have left this directory
        # behind; reuse it instead of aborting the whole training run.
        os.makedirs(outputDir, exist_ok=True)

        rdrTree = PosTaggingRDRTree(improveThreshold, matchThreshold)
        rdrTree.buildTreeFromCorpus(initCorpus, correctTrain)

        print ("Write the tree to file...")
        # rdrTree.writeToFile(...) is the disabled alternative kept from the original code.
        rdrTree.writeToFileWithoutSeenCases(os.path.join(outputDir, learntRules))

        print ("\nTraining time for threshold %d-%d: %f seconds\n" % (improveThreshold, matchThreshold, time.time() - timeStart))

    print ('\nCompleted!')
Esempio n. 3
0
def TaggingInitializedCorpus(args = sys.argv[2:]):
    """
    Tag an initialized corpus using a rules file learnt earlier.

    args[0] is the path of the learnt rules file and args[1] the path of the
    initialized corpus. A PosTaggingRDRTree is constructed from the rules
    file and its tagInitializedCorpus_new method is invoked with the corpus
    path and that same path with ".TAGGED" appended.

    Raises IndexError if fewer than two arguments are supplied.
    """
    rulesFile, corpusFile = args[0], args[1]
    taggedFile = corpusFile + ".TAGGED"

    print ("\nTagging initialized corpus:", corpusFile)

    # Imported lazily, only once the arguments have been read and announced.
    from src.tagger.SCRDRlearner.PosTaggingRDRTree import PosTaggingRDRTree

    tree = PosTaggingRDRTree()
    tree.constructTreeFromRulesFile(rulesFile)
    tree.tagInitializedCorpus_new(corpusFile, taggedFile)

    print ('Completed!')