def run(args=sys.argv[1:]):
    """Command-line entry point: run the "train" or the "tag" workflow.

    ``args[0]`` is compared case-insensitively against the mode names:

    * ``train`` -- ``args[1]`` is the gold standard training corpus. The
      function calls ``createLexicon`` (``'full'`` then ``'short'``),
      ``getRawText``, ``readDictionary`` on ``<corpus>.sDict``,
      ``initializeCorpus``, learns a tree with ``SCRDRTreeLearner`` and
      writes it to ``<corpus>.RDR``.  On success the intermediate files
      ``<corpus>.INIT``, ``<corpus>.RAW`` and ``<corpus>.sDict`` are
      removed; on failure they are left in place.
    * ``tag`` -- ``args[1]`` is the model file, ``args[2]`` the lexicon
      file and ``args[3]`` the raw corpus to tag.

    With an empty ``args`` or any other mode only ``printHelp()`` is called.
    Any exception raised inside a workflow (a missing argument included) is
    caught, printed after ``ERROR ==>`` and followed by the help text.

    Note that the default for ``args`` is evaluated once, when the function
    is defined.
    """
    if len(args) == 0:
        printHelp()
        return

    mode = args[0].lower()

    if mode == "train":
        try:
            print("\n====== Start ======")
            # Read the corpus argument only after the banner so a missing
            # argument is reported at the same point as before.
            corpus = args[1]
            raw_path = corpus + ".RAW"
            init_path = corpus + ".INIT"
            short_dict_path = corpus + ".sDict"
            rdr_path = corpus + ".RDR"

            print(
                "\nGenerate from the gold standard training corpus a lexicon "
                + corpus + ".DICT")
            for lexicon_kind in ('full', 'short'):
                createLexicon(corpus, lexicon_kind)

            print(
                "\nExtract from the gold standard training corpus a raw text corpus "
                + raw_path)
            getRawText(corpus, raw_path)

            print(
                "\nPerform initially POS tagging on the raw text corpus, to generate "
                + init_path)
            lexicon = readDictionary(short_dict_path)
            initializeCorpus(lexicon, raw_path, init_path)

            print(
                '\nLearn a tree model of rules for POS tagging from %s and %s'
                % (corpus, init_path))
            learner = SCRDRTreeLearner(THRESHOLD[0], THRESHOLD[1])
            learner.learnRDRTree(init_path, corpus)

            print("\nWrite the learned tree model to file " + rdr_path)
            learner.writeToFile(rdr_path)

            print('\nDone!')
            # Intermediate artefacts are only cleaned up after a full run.
            for leftover in (init_path, raw_path, short_dict_path):
                os.remove(leftover)
        except Exception as e:
            print("\nERROR ==> ", e)
            printHelp()
        return

    if mode == "tag":
        try:
            tagger = RDRPOSTagger()

            model_path = args[1]
            print("\n=> Read a POS tagging model from " + model_path)
            tagger.constructSCRDRtreeFromRDRfile(model_path)

            lexicon_path = args[2]
            print("\n=> Read a lexicon from " + lexicon_path)
            lexicon = readDictionary(lexicon_path)

            corpus_path = args[3]
            print("\n=> Perform POS tagging on " + corpus_path)
            tagger.tagRawCorpus(lexicon, corpus_path)
        except Exception as e:
            print("\nERROR ==> ", e)
            printHelp()
        return

    printHelp()
def run(args=sys.argv[1:]):
    """Command-line entry point for training an English POS tagging model.

    With an empty ``args`` the help text is printed via ``printHelp()``.
    When ``args[0]`` equals ``"train"`` (case-insensitive), ``args[1]`` is
    taken as the gold standard training corpus and the function calls, in
    order: ``createLexicon`` (``'full'`` then ``'short'``), ``getRawText``,
    ``readDictionary`` on ``<corpus>.sDict``, ``initializeEnCorpus``, and
    ``SCRDRTreeLearner`` to learn a tree that is written to
    ``<corpus>.RDR``.  After a successful run ``<corpus>.INIT``,
    ``<corpus>.RAW`` and ``<corpus>.sDict`` are removed.

    Any exception raised during training (a missing ``args[1]`` included)
    is caught, printed after ``ERROR ==>`` and followed by the help text.
    Any other value of ``args[0]`` is ignored and the function returns
    ``None`` without output.

    The body uses only syntax accepted by both Python 2.6+ and Python 3
    (single-argument ``print(...)`` and ``except ... as``); the printed text
    matches what the Python 2 ``print`` statements produced.

    Note that the default for ``args`` is evaluated once, when the function
    is defined.
    """
    if len(args) == 0:
        printHelp()
    elif args[0].lower() == "train":
        try:
            print("\n====== Start ======")
            corpus = args[1]
            raw_path = corpus + ".RAW"
            init_path = corpus + ".INIT"
            short_dict_path = corpus + ".sDict"
            rdr_path = corpus + ".RDR"

            print("\nGenerate from the gold standard training corpus "
                  "an English lexicon " + corpus + ".DICT")
            createLexicon(corpus, 'full')
            createLexicon(corpus, 'short')

            print("\nExtract from the gold standard training corpus "
                  "a raw text corpus " + raw_path)
            getRawText(corpus, raw_path)

            print("\nPerform initially POS tagging on the raw text corpus, "
                  "to create " + init_path)
            DICT = readDictionary(short_dict_path)
            initializeEnCorpus(DICT, raw_path, init_path)

            print('\nLearn a tree model of rules for English POS tagging '
                  'from %s and %s' % (corpus, init_path))
            rdrTree = SCRDRTreeLearner(THRESHOLD[0], THRESHOLD[1])
            rdrTree.learnRDRTree(init_path, corpus)

            # Two spaces before the path: the original message ended in a
            # space and the print statement added its own separator.
            print("\nWrite the learned tree model to file  " + rdr_path)
            rdrTree.writeToFile(rdr_path)

            print('\nDone!')
            # Intermediate files are only removed after a complete run.
            os.remove(init_path)
            os.remove(raw_path)
            os.remove(short_dict_path)
        except Exception as e:
            # Same two-space output as the original `print "... ", e`.
            print("\nERROR ==>  " + str(e))
            printHelp()
def run(args=sys.argv[1:]):
    """Train an English POS tagging model when invoked with ``train``.

    Behaviour by ``args``:

    * empty -- ``printHelp()`` is called;
    * ``args[0]`` is ``"train"`` in any letter case -- ``args[1]`` names the
      gold standard training corpus.  The steps are ``createLexicon``
      (``'full'``, then ``'short'``), ``getRawText`` into ``<corpus>.RAW``,
      ``readDictionary`` on ``<corpus>.sDict``, ``initializeEnCorpus`` into
      ``<corpus>.INIT``, then ``SCRDRTreeLearner.learnRDRTree`` and
      ``writeToFile`` to ``<corpus>.RDR``.  The ``.INIT``, ``.RAW`` and
      ``.sDict`` files are deleted once every step has succeeded;
    * anything else -- nothing happens and ``None`` is returned.

    Exceptions raised while training (including ``IndexError`` for a missing
    corpus argument) are caught; the message is printed after ``ERROR ==>``
    and the help text follows.

    Written in syntax shared by Python 2.6+ and Python 3: each ``print``
    receives one pre-built string so the emitted text is the same as with
    the former Python 2 ``print`` statements, and the handler uses
    ``except ... as``.

    Note that the default for ``args`` is evaluated once, when the function
    is defined.
    """
    if len(args) == 0:
        printHelp()
    elif args[0].lower() == "train":
        try:
            print("\n====== Start ======")
            goldCorpus = args[1]
            rawCorpus = goldCorpus + ".RAW"
            initCorpus = goldCorpus + ".INIT"
            shortLexicon = goldCorpus + ".sDict"
            modelFile = goldCorpus + ".RDR"

            print("\nGenerate from the gold standard training corpus "
                  "an English lexicon " + goldCorpus + ".DICT")
            createLexicon(goldCorpus, 'full')
            createLexicon(goldCorpus, 'short')

            print("\nExtract from the gold standard training corpus "
                  "a raw text corpus " + rawCorpus)
            getRawText(goldCorpus, rawCorpus)

            print("\nPerform initially POS tagging on the raw text corpus, "
                  "to create " + initCorpus)
            DICT = readDictionary(shortLexicon)
            initializeEnCorpus(DICT, rawCorpus, initCorpus)

            print('\nLearn a tree model of rules for English POS tagging '
                  'from %s and %s' % (goldCorpus, initCorpus))
            rdrTree = SCRDRTreeLearner(THRESHOLD[0], THRESHOLD[1])
            rdrTree.learnRDRTree(initCorpus, goldCorpus)

            # The doubled space reproduces the original output, where the
            # literal's trailing space met the print-statement separator.
            print("\nWrite the learned tree model to file  " + modelFile)
            rdrTree.writeToFile(modelFile)

            print('\nDone!')
            # Clean-up happens only after the model has been written.
            os.remove(initCorpus)
            os.remove(rawCorpus)
            os.remove(shortLexicon)
        except Exception as e:
            # Doubled space kept for identical output.
            print("\nERROR ==>  " + str(e))
            printHelp()
def run(args=sys.argv[1:]):
    """Command-line entry point: tag the text read from standard input.

    With an empty ``args`` the help text is printed via ``printHelp()``.
    When ``args[0]`` equals ``"tag"`` (case-insensitive), an
    ``RDRPOSTagger`` is created, the model in ``args[1]`` is loaded with
    ``constructSCRDRtreeFromRDRfile``, the lexicon in ``args[2]`` is read
    with ``readDictionary`` and ``tagRawCorpus`` is called with that lexicon
    and the list of lines from ``sys.stdin.readlines()``.

    Any exception raised while doing so (a missing argument included) is
    caught, printed after ``ERROR ==>`` and followed by the help text.
    Any other value of ``args[0]`` is ignored and ``None`` is returned.

    The handler uses ``except ... as`` and a single-argument ``print(...)``
    so the function parses on both Python 2.6+ and Python 3 while printing
    the same text as the former Python 2 ``print`` statement.

    Note that the default for ``args`` is evaluated once, when the function
    is defined.
    """
    if len(args) == 0:
        printHelp()
    elif args[0].lower() == "tag":
        try:
            tagger = RDRPOSTagger()
            tagger.constructSCRDRtreeFromRDRfile(args[1])
            lexicon = readDictionary(args[2])
            tagger.tagRawCorpus(lexicon, sys.stdin.readlines())
        except Exception as e:
            # Two spaces after the arrow: the original literal ended in a
            # space and the print statement inserted one more.
            print("\nERROR ==>  " + str(e))
            printHelp()
def __init__(self, language):
    """Load the POS tagging model and lexicon registered for ``language``.

    ``language`` is looked up in ``self.models`` and ``self.dicts``; both
    entries are joined onto ``multilingual_posTagger_home`` to locate the
    files.  On success the instance exposes ``self.language``,
    ``self.tagger`` (an ``RDRPOSTagger`` with the model loaded) and
    ``self.dict`` (the result of ``readDictionary``).

    Raises:
        Exception: if ``language`` has no (truthy) entry in ``self.models``
            or in ``self.dicts``.  Both lookups are validated before any
            file is loaded, so a language with a model but no lexicon is
            reported with this message instead of failing later inside
            ``os.path.join`` with an unrelated error.
    """
    self.language = language
    model = self.models.get(language)
    lexicon = self.dicts.get(language)
    if not model or not lexicon:
        raise Exception(
            "Unsupported language for POS tagging: {}".format(language))

    self.tagger = RDRPOSTagger()
    # Load the POS tagging model for the requested language
    self.tagger.constructSCRDRtreeFromRDRfile(
        os.path.join(multilingual_posTagger_home, model))
    # Load the lexicon for the requested language
    self.dict = readDictionary(
        os.path.join(multilingual_posTagger_home, lexicon))
def computeAccuracies(fullDictFile, goldStandardCorpus, taggedCorpus):
    """
    Return known-word accuracy, unknown-word accuracy and the overall accuracy

    Both corpora are read as whitespace-separated tokens and compared
    position by position; each token is split with ``getWordTag``.  A word
    counts as "known" when the word taken from the tagged corpus is
    contained in the dictionary returned by ``readDictionary(fullDictFile)``.
    Tags are compared case-insensitively.

    Returns:
        A tuple ``(known, unknown, overall)`` of percentages (floats).  A
        category without any tokens yields ``0.0`` rather than raising
        ``ZeroDivisionError``.

        The integer ``0`` is returned, after printing a message, when the
        two corpora contain a different number of tokens or when the words
        at some index differ (a word equal to ``''`` on either side is
        exempt from that check).
    """
    # Context managers make sure both file handles are closed.
    with open(taggedCorpus, "r") as taggedFile:
        tagged = taggedFile.read().split()
    with open(goldStandardCorpus, "r") as goldFile:
        goldStandard = goldFile.read().split()

    if len(tagged) != len(goldStandard):
        print("The numbers of word tokens in %s and %s are not equal!" %
              (goldStandardCorpus, taggedCorpus))
        return 0

    fullDICT = readDictionary(fullDictFile)
    count = 0
    countKN = countUNKN = 0
    countCorrectKN = countCorrectUNKN = 0

    for i, (taggedToken, goldToken) in enumerate(zip(tagged, goldStandard)):
        word1, tag1 = getWordTag(taggedToken)
        word2, tag2 = getWordTag(goldToken)
        if word1 != word2 and word1 != "''" and word2 != "''":
            print(
                "Words are not the same in gold standard and tagged corpora, at the index "
                + str(i))
            return 0

        # Compare the tags once and reuse the result for every counter.
        isCorrect = tag1.lower() == tag2.lower()
        if isCorrect:
            count += 1
        if word1 in fullDICT:
            countKN += 1
            if isCorrect:
                countCorrectKN += 1
        else:
            countUNKN += 1
            if isCorrect:
                countCorrectUNKN += 1

    numwords = len(tagged)
    # Guard every ratio: empty corpora or a corpus without known/unknown
    # words would otherwise divide by zero.
    knownAccuracy = countCorrectKN * 100.0 / countKN if countKN else 0.0
    unknownAccuracy = (countCorrectUNKN * 100.0 / countUNKN
                       if countUNKN else 0.0)
    overallAccuracy = count * 100.0 / numwords if numwords else 0.0
    return knownAccuracy, unknownAccuracy, overallAccuracy
def computeAccuracies(fullDictFile, goldStandardCorpus, taggedCorpus):
    """
    Return known-word accuracy, unknown-word accuracy and the overall accuracy

    The tagged and the gold standard corpus are each read as a list of
    whitespace-separated tokens and walked in parallel; ``getWordTag``
    splits every token into word and tag.  A word is "known" when the word
    from the tagged corpus is contained in the dictionary returned by
    ``readDictionary(fullDictFile)``.  Tags match case-insensitively.

    Returns:
        ``(known, unknown, overall)`` as float percentages.  When a
        category has no tokens its accuracy is ``0.0`` instead of a
        ``ZeroDivisionError``.

        The integer ``0`` is returned, after a message is printed, if the
        token counts of the two corpora differ or if the words at some
        index differ (a word equal to ``''`` on either side is exempt).

    Uses only syntax valid on both Python 2.6+ and Python 3; the printed
    messages are the same as those of the former ``print`` statements.
    """
    # `with` closes both files even if reading fails.
    with open(taggedCorpus, "r") as taggedFile:
        tagged = taggedFile.read().split()
    with open(goldStandardCorpus, "r") as goldFile:
        goldStandard = goldFile.read().split()

    if len(tagged) != len(goldStandard):
        print("The numbers of word tokens in %s and %s are not equal!" %
              (goldStandardCorpus, taggedCorpus))
        return 0

    fullDICT = readDictionary(fullDictFile)
    count = 0
    countKN = countUNKN = 0
    countCorrectKN = countCorrectUNKN = 0

    for i, (taggedToken, goldToken) in enumerate(zip(tagged, goldStandard)):
        word1, tag1 = getWordTag(taggedToken)
        word2, tag2 = getWordTag(goldToken)
        if word1 != word2 and word1 != "''" and word2 != "''":
            # One space before the index, as the print statement's comma
            # separator produced.
            print("Words are not the same in gold standard and tagged "
                  "corpora, at the index " + str(i))
            return 0

        # Evaluate the tag comparison once per token.
        sameTag = tag1.lower() == tag2.lower()
        if sameTag:
            count += 1
        if word1 in fullDICT:
            countKN += 1
            if sameTag:
                countCorrectKN += 1
        else:
            countUNKN += 1
            if sameTag:
                countCorrectUNKN += 1

    numwords = len(tagged)
    # Each denominator may be zero (empty corpora, no known words, no
    # unknown words); report 0.0 for such a category.
    knownAccuracy = countCorrectKN * 100.0 / countKN if countKN else 0.0
    unknownAccuracy = (countCorrectUNKN * 100.0 / countUNKN
                       if countUNKN else 0.0)
    overallAccuracy = count * 100.0 / numwords if numwords else 0.0
    return knownAccuracy, unknownAccuracy, overallAccuracy
rdrTree = SCRDRTreeLearner(THRESHOLD[0], THRESHOLD[1]) rdrTree.learnRDRTree(args[1] + ".INIT", args[1]) print "\nWrite the learned tree model to file ", args[1] + ".RDR" rdrTree.writeToFile(args[1] + ".RDR") print '\nDone!' os.remove(args[1] + ".INIT") os.remove(args[1] + ".RAW") os.remove(args[1] + ".sDict") except Exception, e: print "\nERROR ==> ", e printHelp() elif args[0].lower() == "tag": try: r = RDRPOSTagger4En() print "\n=> Read an English POS tagging model from", args[1] r.constructSCRDRtreeFromRDRfile(args[1]) print "\n=> Read an English lexicon from", args[2] DICT = readDictionary(args[2]) print "\n=> Perform English POS tagging on", args[3] r.tagRawEnCorpus(DICT, args[3]) except Exception, e: print "\nERROR ==> ", e printHelp() else: printHelp() if __name__ == "__main__": run() pass
print '\nLearn a tree model of rules for English POS tagging from %s and %s' % (args[1], args[1] + ".INIT") rdrTree = SCRDRTreeLearner(THRESHOLD[0], THRESHOLD[1]) rdrTree.learnRDRTree(args[1] + ".INIT", args[1]) print "\nWrite the learned tree model to file ", args[1] + ".RDR" rdrTree.writeToFile(args[1] + ".RDR") print '\nDone!' os.remove(args[1] + ".INIT") os.remove(args[1] + ".RAW") os.remove(args[1] + ".sDict") except Exception, e: print "\nERROR ==> ", e printHelp() elif args[0].lower() == "tag": try: r = RDRPOSTagger4En() print "\n=> Read an English POS tagging model from", args[1] r.constructSCRDRtreeFromRDRfile(args[1]) print "\n=> Read an English lexicon from", args[2] DICT = readDictionary(args[2]) print "\n=> Perform English POS tagging on", args[3] r.tagRawEnCorpus(DICT, args[3]) except Exception, e: print "\nERROR ==> ", e printHelp() else: printHelp() if __name__ == "__main__": run() pass
def englishSetup(self):
    """Configure this instance for English (rule tree and ``DICT``).

    Loads the tree from ``Models/POS/English.RDR`` and stores the result of
    ``readDictionary`` for ``Models/POS/English.DICT`` in ``self.DICT``;
    both paths are built by appending to ``current_python_file_dir``.
    """
    rdr_path = current_python_file_dir + "/Models/POS/English.RDR"
    dict_path = current_python_file_dir + "/Models/POS/English.DICT"

    self.constructSCRDRtreeFromRDRfile(rdr_path)
    self.DICT = readDictionary(dict_path)
if e in e_to_f: e_to_f[e][f] = p else: e_to_f[e] = {} e_to_f[e][f] = p return e_to_f configs = read_config(sys.argv[1], sys.argv[2]) foreign_language = configs["foreign_language"] model_rdr = "./Models/UniPOS/UD_"+foreign_language+"/train.UniPOS.RDR" model_dict = "./Models/UniPOS/UD_"+foreign_language+"/train.UniPOS.DICT" foreign_language_tagger = RDRPOSTagger() foreign_language_tagger.constructSCRDRtreeFromRDRfile(model_rdr) foreign_language_dictionary = readDictionary(model_dict) k = int(configs["k"]) mu = float(configs["mu"]) sigma = float(configs["sigma"]) file = codecs.open(configs["input_file"],"r","utf-8") bilingual_dictionary = read_dictionary(configs["bilingual_dictionary"]) output_file = codecs.open(configs["output_ranking"],"w","utf-8") english_lines = [] foreign_lines = [] for line in file: tokens = line.split("\t") english_lines.append(tokens[1].strip()) foreign_lines.append(tokens[2].strip()) output = defaultdict()