def VnTraining(args = sys.argv[2:]):
    """Train RDRPOSTagger rule trees for Vietnamese POS tagging.

    ``args`` is a positional list:

    * ``args[0]`` -- path to the dictionary passed to ``readDictionary``.
    * ``args[1]`` -- directory holding the training corpus; outputs are
      written beneath it.
    * ``args[2]`` -- file name of the gold-standard training corpus inside
      that directory.
    * ``args[3]`` -- file name under which each learnt rule tree is saved.

    A ``.RAW`` and an ``.INIT`` file are written next to the training corpus.
    Then, for every ``(improveThreshold, matchThreshold)`` pair in the
    ``thresholds`` sequence (defined outside this function), one tree is
    built and written to ``<dir>/T<improve>-<match>/<learntRules>``.

    Raises ``IndexError`` when fewer than four arguments are supplied.
    """
    pathToDict = args[0]
    # Plain concatenation keeps every derived path identical to what the
    # helper functions have always been given.
    dirPath = args[1] + "/"
    correctTrain = args[2]
    learntRules = args[3]

    goldCorpus = dirPath + correctTrain
    rawCorpus = goldCorpus + ".RAW"
    initCorpus = goldCorpus + ".INIT"

    print( '\nTraining RDRPOSTagger for Vietnamese POS Tagging...')

    print( "Initial tagging...")
    getRawTextFromFile(goldCorpus, rawCorpus)
    DICT = readDictionary(pathToDict)
    VnInitTagger4Corpus(DICT, rawCorpus, initCorpus)
    print ("Done Initialization!")

    print ('Building SCRDR-based POS tagging tree of rules...')
    for (improveThreshold, matchThreshold) in thresholds:
        timeStart = time.time()

        outputDir = "T%d-%d/" % (improveThreshold, matchThreshold)
        outputPath = dirPath + outputDir
        # A previous run may already have created this directory; an
        # unconditional os.mkdir would abort the whole training run here.
        if not os.path.isdir(outputPath):
            os.mkdir(outputPath)

        rdrTree = PosTaggingRDRTree(improveThreshold, matchThreshold)
        rdrTree.buildTreeFromCorpus(initCorpus, goldCorpus)

        print ("Write the tree to file...")
        rdrTree.writeToFileWithoutSeenCases(outputPath + learntRules)

        print ("\nTraining time for threshold %d-%d: %f seconds\n" % (improveThreshold, matchThreshold, time.time() - timeStart))

    print ('\nCompleted!')
def runVnRDRPOSTagger(args = sys.argv[1:]):
    """Command-line dispatcher for the Vietnamese RDRPOSTagger.

    ``args[0]`` selects the mode (case-insensitive substring match, with
    "train" checked before "tag"):

    * contains ``"train"`` -- the remaining arguments are forwarded to
      ``VnTraining`` (dictionary path, corpus directory, training corpus
      file name, rules file name).
    * contains ``"tag"`` -- ``args[1]`` is the rules file, ``args[2]`` the
      dictionary path and ``args[3]`` the raw corpus to tag.

    Usage instructions are printed when no arguments are given, when the
    mode is not recognised, or when too few arguments follow the mode.
    """
    if len(args) == 0:
        printInstructions()
        return

    mode = args[0].lower()
    if "train" in mode:
        if len(args) < 5:
            printInstructions()
            return
        # Forward the caller's arguments explicitly. Calling VnTraining()
        # bare would silently fall back to sys.argv captured at import time
        # and ignore any list passed to this function.
        VnTraining(args[1:])
    elif "tag" in mode:
        if len(args) < 4:
            printInstructions()
            return
        r = VnRDRTree()
        r.constructTreeFromRulesFile(args[1])
        DICT = readDictionary(args[2])
        r.tagRawCorpus(DICT, args[3])
    else:
        printInstructions()
# -*- coding: utf-8 -*- import re # from Utility.Utils import readDictionary, isAbbre, isVnProperNoun, isVnUpperChar from src.tagger.Utility.Utils import readDictionary, isAbbre, isVnProperNoun, isVnUpperChar # VNUNKNWORDS = readDictionary("../jSCRDRtagger/addDicts/VNOTHERS.DICT") # VNNAMES = readDictionary("../jSCRDRtagger/addDicts/VNNAMES.DICT") VNUNKNWORDS = readDictionary("./resource/VNOTHERS.DICT") VNNAMES = readDictionary("./resource/VNNAMES.DICT") def VnInitTagger4Sentence(VNFREQ, sentence): """ Initial tagger for Vietnamese sentence. VNUNKNWORDS and VNNAMES were not utilized in the version as described in our CICLing 2011 paper """ words = sentence.strip().split() taggedSen = '' for word in words: if word in VNFREQ: taggedSen += word + "/" + VNFREQ[word] + " " elif word in VNUNKNWORDS: taggedSen += word + "/" + VNUNKNWORDS[word] + " " elif word in VNNAMES: taggedSen += word + "/Np " else: if (re.search(r"[0-9]+", word) != None): taggedSen += word + "/M " elif(len(word) == 1 and isVnUpperChar(word[0])): taggedSen += word + "/Y " else: