def simpleConll(inputfilename):
    """
    Reads a Rhapsodie tokenized conll file, keeps a single (the first)
    function per token, and writes a standard conll format to
    inputfilename + '.simpl'.
    """
    trees = conllFile2trees(inputfilename)
    with codecs.open(inputfilename + '.simpl', 'w', 'utf-8') as f:
        for treedic in trees:
            treedic = completeInformation(treedic)
            # NOTE(review): the result of passcomp is never used; the loop
            # below iterates over treedic, not ntreedic -- verify intent
            ntreedic = passcomp(treedic)
            # treedic looks like {1: {'lemma': u'bonjour', 'gov': {0: u'root'},
            # 'tag': u'B_I', 'id': 1, 't': u'bonjour'}, 3: {...}}
            for tokid in sorted(treedic):
                tdic = treedic[tokid]
                # take the first governor/function pair, or empty fields when
                # the token has no governor at all
                if tdic["gov"] != {}:
                    tgov = tdic["gov"].keys()[0]
                    tfonc = tdic["gov"][tgov]
                else:
                    tgov, tfonc = "", ""
                # columns: id, token, lemma, tag without its 2-char prefix
                # (tags look like "B_I", "B_N"), two empty columns, gov, func
                f.write("\t".join([
                    str(tdic["id"]), tdic["t"], tdic["lemma"],
                    tdic["tag"][2:], "", "",
                    str(tgov), tfonc
                ]) + "\n")
            f.write("\n")
def makeTrainTestSets(infolder,pattern="*conll",train="train",test="test",empty="emptytest",testsize=10, lemma=True): tottec,tottrc,toks=0,0,0 with codecs.open(os.path.join(infolder, test),"w","utf-8") as testf, codecs.open(os.path.join(infolder, empty),"w","utf-8") as emptyf, codecs.open(os.path.join(infolder, train),"w","utf-8") as trainf: for infilename in glob.glob(os.path.join(infolder, pattern)): print "newconvert: looking at",infilename tec,trc=0,0 allsentences=conll.conllFile2trees(infilename) print len(allsentences),"sentences" testselection=random.sample(range(len(allsentences)),len(allsentences)*testsize/100) for i,s in enumerate(allsentences): toks+=len(s) if i in testselection: testf.write(treeToConll14Text(s)+"\n") emptyf.write(treeToEmptyConll14Text(s,lemma)+"\n") tec+=1 else: trainf.write(treeToConll14Text(s)+"\n") trc+=1 print "testing with",tec,"sentences. training with",trc,"sentences" tottec+=tec tottrc+=trc if not lemma: shutil.copyfile(os.path.join(infolder, empty), os.path.join(infolder, empty)+"_lem") print "tottec,tottrc,toks=",tottec,tottrc,toks
def degradeConllfile(conllfile, removeFuncs=["para"], removeDeps=0.4): trees = conll.conllFile2trees(conllfile) nbgovs = 0 for arbre in trees: for i, node in arbre.iteritems(): if "gov" in node and node["gov"].keys( )[0] != -1 and node["gov"].values()[0] not in removeFuncs: nbgovs += 1 print int(nbgovs * removeDeps) tobeRemoved = sorted(random.sample(range(nbgovs), int(nbgovs * removeDeps))) print "nbgovs:", nbgovs, "tobeRemoved:", tobeRemoved nbgovs = 0 for arbre in trees: for i, node in arbre.iteritems(): if "gov" in node and node["gov"].keys()[0] != -1: if node["gov"].values()[0] in removeFuncs: node["gov"] = {} else: nbgovs += 1 if nbgovs in tobeRemoved: node["gov"] = {} newname = conllfile if conllfile.endswith(".conll"): newname = conllfile[:-len(".conll")] shutil.move(conllfile, newname + ".orig") conll.trees2conllFile(trees, newname + ".deg", columns=10)
def passcompUp(infilename, outfilename): trees = conllFile2trees(infilename) print "read trees" with codecs.open(outfilename, "w", "utf-8") as outfile: for tree in trees: tree = completeInformation(tree) tree = passcomp(tree) for i, tokenid in enumerate(sorted(tree)): node = tree[tokenid] gov = node.get("gov", {}).items() govid, func = gov[0] outfile.write("\t".join([ str(tokenid), node.get("t", "_"), node.get("lemma", "_"), node.get("lemma", "_"), node.get("tag", "_"), node.get("tag", "_"), node.get("morph", "_"), node.get("morph", "_"), str(govid), str(govid), func, func, "_", "_" ]) + "\n") outfile.write("\n")
def funcsearch(infolder):
    """
    Counts, over all conll files of infolder, the tokens whose dependency
    function is "dm" although their tag is not "INT", and writes the counts
    (most frequent first) to dm.txt.
    """
    errors = {}
    numerrors = 0
    for infile in sorted(glob.glob(os.path.join(infolder, "*"))):  # .conll
        if not os.path.isfile(infile):
            continue
        basename = os.path.basename(infile)
        print "reading", basename
        trees = conll.conllFile2trees(infile)
        for tree in trees:
            for i, node in tree.iteritems():
                # suspicious combination: "dm" function on a non-INT tag
                if node["gov"].values()[0] in ["dm"] and node["tag"] not in [
                        "INT"
                ]:
                    key = " ".join([node["t"], node["lemma"], node["tag"]])
                    errors[key] = errors.get(key, 0) + 1
                    numerrors += 1
    with codecs.open("dm.txt", "w", "utf-8") as outf:
        for key in sorted(errors, key=errors.get, reverse=True):
            print errors[key], key
            outf.write(str(errors[key]) + " " + key + "\n")
        print "total of", numerrors, "cases"
        outf.write(" ".join(["total of", str(numerrors), "cases"]))
def transform(infolder, outfolder, mixOldNew=True): createNonExistingFolders(outfolder) spaceToks = {} #for infile in sorted(glob.glob(os.path.join(infolder,"test.conll"))): for infile in sorted(glob.glob(os.path.join(infolder, "*.conll"))): basename = os.path.basename(infile) print "reading", basename trees = conll.conllFile2trees(infile) newtrees = [] for tree in trees: if mixOldNew: newtrees += [tree] newtree = copy.deepcopy(tree) newtree = platinum(newtree) newtrees += [newtree] findSpaces(spaceToks, tree) conll.trees2conllFile(newtrees, os.path.join(outfolder, fixOutname(basename))) #corrdic = correctionDics("corrConll.txt") #for c in corrdic: #print c #qsdf for i, tok in enumerate(sorted(spaceToks)): print i, tok, spaceToks[tok]
def dicoTiretsRhapsodie():
    """
    Builds a dictionary mapping each token that starts with "-" in the
    Rhapsodie gold conll file to the first tag it was seen with.
    """
    dico = {}
    for arbre in conll.conllFile2trees("mate/fr/Rhaps.gold.conll14"):
        for i, node in arbre.iteritems():
            tok = node["t"]
            # keep only the tag of the first occurrence
            if tok[0] == "-" and tok not in dico:
                dico[tok] = node["tag"]
    return dico
def updateParseResult(projectname, conlldirpath, filepattern="*.trees.conll14", annotatorName="parser", removeToGetDB="-one-word-per-line.conll14_parse"): sql = database.SQL(projectname) db,cursor=sql.open() print "updateTrees:",glob(os.path.join(conlldirpath, filepattern)) for filename in glob(os.path.join(conlldirpath, filepattern)): print "entering",filename sentences=conll.conllFile2trees(filename) dbtextname = os.path.basename(filename)[:-len(removeToGetDB)] textid = sql.enter(cursor, "texts",["textname"],(dbtextname,)) if not textid: print "couldn't find the database named",textid return enterNewAnnotation(sql, db,cursor, sentences, textid, annotatorName=annotatorName)
def transform(infolder, outfolder, mixOldNew=True): createNonExistingFolders(outfolder) #for infile in sorted(glob.glob(os.path.join(infolder,"test.conll"))): for infile in sorted(glob.glob(os.path.join(infolder, "*.conll"))): basename = os.path.basename(infile) print "reading", basename trees = conll.conllFile2trees(infile) newtrees = [] for tree in trees: if mixOldNew: newtrees += [tree] newtree = copy.deepcopy(tree) newtree = platinum(newtree) newtrees += [newtree] conll.trees2conllFile(newtrees, os.path.join(outfolder, fixOutname(basename)))
def tagFuncConfig(infolder,pattern="*conll"): tags,funcs={},{} for infilename in glob.glob(os.path.join(infolder, pattern)): print infilename allsentences=conll.conllFile2trees(infilename) for tree in allsentences: for i in tree.keys(): node = tree[i] gov = node.get("gov",{}).items() tag=node.get("tag",None) if tag: tags[tag]=None for govid,func in gov: funcs[func]=None for tag in sorted(tags): print tag, '{"fill": "#69399d"}' for func in sorted(funcs): print func, '{"stroke": "#000000","stroke-width":"1","stroke-dasharray": ""}'
def transform(infolder, outfolder, mixOldNew=False): createNonExistingFolders(outfolder) spaceToks = {} #for infile in sorted(glob.glob(os.path.join(infolder,"test.conll"))): for infile in sorted(glob.glob(os.path.join(infolder, "*"))): if not os.path.isfile(infile): continue basename = os.path.basename(infile) print "reading", basename trees = conll.conllFile2trees(infile) newtrees = [] for tree in trees: if mixOldNew: newtrees += [tree] newtree = copy.deepcopy(tree) newtree = correct(newtree) newtrees += [newtree] conll.trees2conllFile(newtrees, os.path.join(outfolder, fixOutname(basename)))
def transform(infolder, outfolder, mixOldNew=False): createNonExistingFolders(outfolder) corrinst = compil('corrinst.txt') print len(corrinst), "rules" for infile in sorted(glob.glob(os.path.join(infolder, "*"))): if not os.path.isfile(infile): continue basename = os.path.basename(infile) print "reading", basename trees = conll.conllFile2trees(infile) newtrees = [] for tree in trees: if mixOldNew: newtrees += [tree] newtree = copy.deepcopy(tree) newtree = correct(newtree, corrinst) newtrees += [newtree] conll.trees2conllFile(newtrees, os.path.join(outfolder, fixOutname(basename)))
def search(infolder,fun): goodtrees=[] print "doing", fun.__name__ #try: os.mkdir(outdir) #except OSError: pass for infile in sorted(glob.glob(os.path.join(infolder,"*"))): # .conll if not os.path.isfile(infile): continue basename=os.path.basename(infile) print "reading",basename trees = conll.conllFile2trees(infile) for tree in trees: #if hasVerbalDm(tree): #if isNonProjective(tree): if fun(tree): goodtrees+=[tree] print "found",len(goodtrees) if goodtrees: conll.trees2conllFile(goodtrees,fun.__name__+".conll")
def retokeniser(nomdufichier, path="", addtoout=""):
    """
    Re-tokenizes a conll file through a fixed pipeline of corrections
    (renumbering, compound numbers, digits, multi-word expressions, clitics,
    inaudible segments, ...). Writes the result next to the input file (or
    into path) with addtoout appended to the basename, and returns the new
    file name.
    """
    if not path:
        path, _ = os.path.split(
            nomdufichier)  # take the same path as the input file
    if path and path[-1] != "/": path = path + "/"
    trees = conll.conllFile2trees(nomdufichier)  # read the file
    print "le fichier", nomdufichier, "a", len(trees), "arbres"
    newtrees = []
    # word list used by the digits() correction step
    digitsandnumbers = codecs.open(droporfeo + "lexique/gg", "r",
                                   "utf-8").read().split('\n')
    for i, arbre in enumerate(trees):  # loop over the trees
        racines = addinfototree(arbre)
        # keep a copy to report which nodes were modified below
        oldtree = copy.deepcopy(arbre)
        arbre = corrigerNumerotation(arbre)
        arbre = nombresComposes(arbre)
        arbre = digits(arbre, digitsandnumbers)
        arbre = corrigerArbreCompos(
            arbre)  # decomposition of multi-word expressions
        arbre = recomposerMultimots(arbre, expressions_multimots)
        arbre = corrigerNumerotationSplice(arbre)
        arbre = corrigerSegmentationClitiques(arbre, dico_clitiques)
        arbre = corrigerInaudibles(arbre)
        arbre = corrigerClitiques(arbre)
        arbre = retoken(arbre)
        # print every node changed by the pipeline (debugging aid)
        if arbre != oldtree:
            print i
            for ii in arbre:
                if arbre[ii] != oldtree.get(ii, None):
                    print ii, arbre[ii]['t'], arbre[ii], oldtree.get(ii, None)
        newtrees.append(arbre)
    newname = path + os.path.basename(nomdufichier + addtoout)
    conll.trees2conllFile(newtrees, newname, columns=10)
    return newname
def addArbitraryPuncs(infolder, outfolder): createNonExistingFolders(outfolder) for conllinfile in glob.glob(os.path.join(infolder, '*')): print conllinfile trees = conll.conllFile2trees(conllinfile) for i, tree in enumerate(trees): m = max(tree) splitcode = ".,!?;:()" p = splitcode[i % len(splitcode)] tree[m + 1] = { u'tag': u'PUNC', u'lemma': p, u't': p, 'gov': { 0: u'punc' } } conll.trees2conllFile(trees, os.path.join(outfolder, os.path.basename(conllinfile)), columns=14)
def conll2phrasestructure(conllinfilename, phrasestructureoutname, args):
    """
    Converts a dependency conll file into phrase-structure trees and writes
    them in the format selected by args: plain bracketing, bracketing pasted
    next to the source, latex (then compiled with pdflatex and opened in the
    browser), or pretty-printed XML by default. With args.graphs each
    constituent tree is also drawn on screen.
    """
    beginning = ""
    rhaps = False
    # sniff the file header to detect the Rhapsodie / orfeo input variants
    with codecs.open(conllinfilename, "r", "utf-8") as f:
        beginning = f.read(50)
    if beginning.startswith("Text ID Tree ID Token ID"):
        rhaps = True
    elif args.orfeo:
        rhaps = "orfeo"
    trees = conll.conllFile2trees(conllinfilename, {"tag": "cat"},
                                  rhaps=rhaps)
    ctrees, xmldoc = makePhraseStructure(trees)
    out = codecs.open(phrasestructureoutname, "w", "utf-8")
    if args.bracketing:
        for ctree in ctrees:
            out.write(ctree.pprint() + "\n\n")
    elif args.pasteBracketing:
        pasteBracketing(trees, ctrees, conllinfilename, out)
    elif args.latex:
        out.write(startlatex)
        for ctree in ctrees:
            out.write(
                unicode(ctree.pprint_latex_qtree()).replace("#", "\\#") +
                "\n\n")
        out.write(endlatex)
        # best effort: compile the latex output and show the resulting pdf
        try:
            import subprocess, webbrowser
            proc = subprocess.Popen(['pdflatex', phrasestructureoutname])
            proc.communicate()
            webbrowser.open_new_tab(
                os.path.abspath(
                    ".".join(phrasestructureoutname.split(".")[:-1]) +
                    ".pdf"))
        except:
            print "is pdflatex and qtree installed? try 'sudo apt-get install texlive-humanities'"
    else:
        out.write(xmldoc.toprettyxml())
    out.close()
    if args.graphs:
        for ctree in ctrees:
            ctree.draw()
def degradeConllfile(conllfile, removeFuncs=["para"], removeDeps=0.2): trees = conll.conllFile2trees(conllfile) nbgovs = 0 for arbre in trees: for i, node in arbre.iteritems(): if "gov" in node and node["gov"].keys( )[0] != -1 and node["gov"].values()[0] not in removeFuncs: nbgovs += 1 print int(nbgovs * removeDeps) tobeRemoved = sorted(random.sample(range(nbgovs), int(nbgovs * removeDeps))) print nbgovs, tobeRemoved nbgovs = 0 for arbre in trees: for i, node in arbre.iteritems(): if "gov" in node and node["gov"].keys( )[0] != -1 and node["gov"].values()[0] not in removeFuncs: nbgovs += 1 if nbgovs in tobeRemoved: node["gov"] = {} shutil.move(conllfile, conllfile + ".orig") conll.trees2conllFile(trees, conllfile, columns=10)
def search(infolder):
    """
    Looks for NOM-tagged tokens whose lemma has no entry at all in the
    lexicon (lireToutLex) over every file of infolder; counts each
    (token, lemma, tag, lexicon-info) combination and writes the counts,
    most frequent first, to problems.txt.
    """
    allLem = lireToutLex()
    errors = {}
    numerrors = 0
    for infile in sorted(glob.glob(os.path.join(infolder, "*"))):  # .conll
        if not os.path.isfile(infile):
            continue
        basename = os.path.basename(infile)
        print "reading", basename
        trees = conll.conllFile2trees(infile)
        for tree in trees:
            for i, node in tree.iteritems():
                # a NOM token whose lemma is completely absent from the
                # lexicon (the middle condition is redundant with the last
                # one but kept as in the original)
                if node["tag"] in ["NOM"] and node["tag"] not in allLem.get(
                        node["lemma"], []) and allLem.get(node["lemma"],
                                                          []) == []:
                    key = " ".join([
                        node["t"], node["lemma"], node["tag"], "lexique:",
                        " ".join(
                            sorted(
                                set(
                                    allLem.get(node["lemma"],
                                               ["pas dans le lexique"]))))
                    ])
                    errors[key] = errors.get(key, 0) + 1
                    numerrors += 1
    with codecs.open("problems.txt", "w", "utf-8") as outf:
        for key in sorted(errors, key=errors.get, reverse=True):
            print errors[key], key
            outf.write(str(errors[key]) + " " + key + "\n")
        print "total of", numerrors, "errors"
        outf.write(" ".join(["total of", str(numerrors), "errors"]))
def split(conllfile, maxi):
    """
    Splits conllfile into chunks of at most maxi trees; chunk number j is
    written to conllfile + str(j).
    """
    trees = conll.conllFile2trees(conllfile)
    for j, start in enumerate(range(0, len(trees), maxi)):
        conll.trees2conllFile(trees[start:start + maxi], conllfile + str(j))
def fusionForgottenTrees(project="Platinum",
                         fusdir="../projects/OrfeoGold2016/platinum/*",
                         annotators=["admin"],
                         ):
    """
    Takes trees from the project database, preferring annotators in the
    given priority order; fuses them into the conll files found in fusdir
    (database trees win when a sentence exists in both). The fused files get
    the extension ".cool.conll" in ../projects/<project>/exportcool and the
    list of written file names is returned.
    """
    sys.path.insert(0, '../tools')
    import difflib
    outfiles = []
    sql = SQL(project)
    db, cursor = sql.open()
    goodTexts = {}
    outdir = os.path.join("..", "projects", project, "exportcool")
    try:
        os.mkdir(outdir)
    except OSError:
        pass
    for annotator in annotators:
        print[
            list(
                cursor.execute("select rowid from users where user =?;",
                               (annotator, )))
        ]
    # database ids of the requested annotators, in priority order
    annotatorIds = tuple(a for (a, ) in [
        list(
            cursor.execute("select rowid from users where user =?;", (
                annotator, )))[0] for annotator in annotators
    ])
    print annotators, annotatorIds
    for textid, textname, nrtokens in list(
            cursor.execute("select rowid, * from texts;")):  # for each text
        print "\n__________________________doing", textname, "with", nrtokens, "tokens"
        # nrutids: sentence nr -> {userid: treeid}
        nrutids = {}
        for nr, userid, treeid in list(
                cursor.execute(
                    "select nr,userid,trees.rowid as treeid from trees, sentences where sentenceid=sentences.rowid and userid in {annotatorIds} and textid = ? order by nr;"
                    .format(annotatorIds=annotatorIds), (textid, ))):
            nrutids[nr] = nrutids.get(nr, {})
            nrutids[nr][userid] = treeid
        trees = {}
        for nr in sorted(nrutids):  # for each sentence
            tree = None
            # take the tree of the first annotator (priority order) who has one
            for aid in annotatorIds:
                if aid in nrutids[nr]:
                    tree = sql.gettree(treeid=nrutids[nr][aid],
                                       indb=db,
                                       incursor=cursor)["tree"]
                    trees[nr] = tree
                    break
        print len(trees), "trees from", project
        print textname, textname.split(".")[0]
        btextname = os.path.basename(textname).split(".")[0]
        if btextname.endswith("-one-word-per-line"):
            btextname = btextname[:-len("-one-word-per-line")]
        cooltrees = []
        ptrees, ftrees = 0, 0
        for fi in glob.glob(fusdir):
            if btextname == os.path.basename(fi).split(".")[0]:
                print "yes", btextname
                fustrees = conll.conllFile2trees(fi)
                print len(fustrees), "ftrees", fi
                for nr, ftree in enumerate(fustrees):
                    if nr + 1 in trees:
                        # the database tree wins over the file tree
                        ptree = trees[nr + 1]
                        for iii in ptree:
                            ptree[iii]["tag2"] = "_"
                            if ptree[iii]["lemma"] in lemmacorrection:
                                ptree[iii]["lemma"] = lemmacorrection[
                                    ptree[iii]["lemma"]]
                        cooltrees += [ptree]
                        ptrees += 1
                        # warn when the two token sequences disagree
                        if ftree.sentence() != u" ".join(
                            [ptree[i].get("t", "") for i in sorted(ptree)]):
                            print "\n_________", nr + 1
                            print ftree.sentence()
                            print u" ".join(
                                [ptree[i].get("t", "") for i in sorted(ptree)])
                    else:
                        # no database tree: keep the file tree
                        for iii in ftree:
                            ftree[iii]["tag2"] = "_"
                            if ftree[iii]["lemma"] in lemmacorrection:
                                ftree[iii]["lemma"] = lemmacorrection[
                                    ftree[iii]["lemma"]]
                        ftrees += 1
                        cooltrees += [ftree]
                outfile = os.path.join(outdir, textname + ".cool.conll")
                conll.trees2conllFile(cooltrees, outfile=outfile, columns=10)
                print "wrote", outfile
                print ptrees, "ptrees, ", ftrees, "ftrees"
                break
        if len(cooltrees) == 0:
            print "nothing for", btextname
        # NOTE(review): outfile is only (re)assigned inside the matching
        # branch above; a text without matching file repeats the previous
        # outfile here -- verify intent
        outfiles += [outfile]
    return outfiles
    # ---- fragment: tail of a reorder function whose def line is not in
    # this chunk (`sortable`, `prefix` and `outfile` are defined above) ----
    new_trees = list()
    for nr, tree in sortable:
        # adding metadata: the raw sentence and a renamed sent_id starting at 0
        tree.sentencefeatures["text"] = tree.sentence()
        tree.sentencefeatures["sent_id"] = prefix + "_" + str(nr - 1)
        # removing useless metadata
        del tree.sentencefeatures["nr"]
        new_trees.append(tree)
    conll.trees2conllFile(new_trees, outfile)


if __name__ == "__main__":
    ## Open project database
    sql = SQL("NaijaSUD")  # project name
    db, cursor = sql.open()
    ## Use 2 functions :
    # - exportLastBestAnnotations in lib/database.py -> writes a file with trees and their rank
    # - reorder in lib/yuchen.py -> reorder trees based on their rank, write a file with the output
    users, c = sql.exportLastBestAnnotations(
        115, "P_ABJ_GWA_06_Ugo-lifestory_PRO"
    )  # textid and text name; the textid can be looked up at
    # https://arborator.ilpga.fr/editor.cgi?project=NaijaSUD&textid=74&opensentence=1
    print(users, c)
    fpath = "E:/TAL/Stage/arborator/projects/NaijaSUD/export/P_ABJ_GWA_06_Ugo.lifestory_PRO.most.recent.trees.with.feats.conllu"  # path of the exported file
    trees = conll.conllFile2trees(fpath)
    # re-sort the conll trees and rename their sent_id
    reorder(trees, fpath + "_reordered")
# ---- fragment: tail of a translate() helper whose beginning is not in this
# chunk; batches the query list q into groups of maxiAtOnce words and sends
# each group to the translation service ----
                      q[i:i + maxiAtOnce] for i in xrange(0, len(q), maxiAtOnce)
    ]:
        res = service.translations().list(source=source,
                                          target=target,
                                          q=wordgroup).execute()
        translations += [td["translatedText"] for td in res["translations"]]
    return translations


#translate(u"准许 一 位 人士 入境 的 权力".split())
# script: add pinyin as tag2 to every token of the CONV* conll files and
# build a token -> translation dictionary for each file
for conllinfile in glob.glob(os.path.join("corpus/conll/", 'CONV*.*')):
    print conllinfile
    trees = conllFile2trees(conllinfile)
    path, base = os.path.split(conllinfile)
    translateDic = {}
    counter = 0
    for tree in trees:
        for i, node in tree.iteritems():
            node["tag2"] = pinyin.get(node["t"])
            translateDic[node["t"]] = None
            counter += 1
            if not counter % 100:
                print counter, "trees"
    words = sorted(translateDic)
    print len(words), "words"
    trads = translate(words)
    translateDic = dict(zip(words, trads))
    print len(translateDic), "translations"
langConllFiles[lcode] = langConllFiles.get( lcode, []) + [os.path.join(dirpath, f)] #print langConllFiles return langConllFiles conllfiles = getAllConllFiles("../sud-treebanks-v2.4") #print() unigrams = {} bigrams = {} bigtype = {} for fi in tqdm.tqdm(conllfiles['ja']): if "FTB" in fi: continue print('analyzing', fi) trees = conll.conllFile2trees(fi) for tree in trees: toks = [ tree[i]['t'] for i in sorted(tree.keys()) if tree[i]['tag'] != "PUNCT" ] for t in toks: unigrams[t] = unigrams.get(t, 0) + 1 maxtree = max(tree.keys()) for i in sorted(tree): if i == maxtree: continue na = tree[i] nb = tree[i + 1] a = na['t'] b = nb['t']
def parsing(infile,
            lemodel="LemModel",
            tagmodel="TagModel",
            parsemodel="ParseModel",
            outfolder="parses",
            memory="4G",
            depparse=True,
            parserType="graph",
            lemmatized=False):
    """
    parsing function

    Runs the mate lemmatizer, tagger and (optionally) dependency parser as
    external java processes over infile, producing <outfolder>/<basename>
    with the suffixes _lem, _tag and _parse; returns the _parse file name.
    If lemodel is empty or ends with "/" (no real model), lemmatization is
    skipped: the input is either copied as the _lem file (lemmatized=True)
    or the tokens are doubled as lemmas.
    TODO: parserType is ignored
    """
    print "parsing now i'm at os.getcwd()", os.getcwd(), os.path.dirname(
        __file__), __file__
    if outfolder[-1] != "/": outfolder = outfolder + "/"
    outfile = outfolder + os.path.basename(infile)
    anna, lemclass, tagclass, parseclass = getClasses("graph")
    lemcommand = "java -Xmx{memory} -cp {anna} {lemclass} -model {lemodel} -test {infile} -out {outfile}_lem".format(
        memory=memory,
        anna=anna,
        lemclass=lemclass,
        infile=infile,
        lemodel=lemodel,
        outfile=outfile)
    tagcommand = "java -Xmx{memory} -cp {anna} {tagclass} -model {tagmodel} -test {outfile}_lem -out {outfile}_tag".format(
        memory=memory,
        anna=anna,
        tagclass=tagclass,
        tagmodel=tagmodel,
        outfile=outfile)
    parsecommand = "java -Xmx{memory} -cp {anna} {parseclass} -model {parsemodel} -test {outfile}_tag -out {outfile}_parse".format(
        memory=memory,
        anna=anna,
        parseclass=parseclass,
        parsemodel=parsemodel,
        outfile=outfile)
    if lemodel and lemodel[-1] != "/":
        if verbose: print "\n\n========== lemmatizing...", lemcommand
        # NOTE(review): hard-coded cwd -- only works on the original
        # developer's machine; verify before deploying
        p1 = subprocess.Popen(
            [lemcommand],
            shell=True,
            stdout=subprocess.PIPE,
            cwd=
            '/home/kim/Dropbox/programmation/arborator/trunk/tools/mate/mate_components'
        )
        out, err = p1.communicate()
        if verbose: print out, err
    else:
        if lemmatized:
            print "copying", outfolder + os.path.basename(
                infile), "as lemma file"
            shutil.copyfile(outfolder + os.path.basename(infile),
                            outfolder + os.path.basename(infile) + "_lem")
        else:
            print "adding toks as lems", outfolder + os.path.basename(infile)
            trees = conll.conllFile2trees(infile)
            with codecs.open(outfolder + os.path.basename(infile) + "_lem",
                             "w", "utf-8") as lemf:
                for tree in trees:
                    lemf.write(
                        newconvert.treeToEmptyConll14Text(tree, lemma=False) +
                        "\n")
    if verbose: print "\n\n========== tagging...", tagcommand
    p1 = subprocess.Popen([tagcommand], shell=True, stdout=subprocess.PIPE)
    out, err = p1.communicate()
    if verbose: print out, err
    if depparse:
        if verbose: print "\n\n========== dep analysis...", parsecommand
        p1 = subprocess.Popen([parsecommand],
                              shell=True,
                              stdout=subprocess.PIPE)
        print p1.stdout.read()
        if verbose: print "\n\n========== parsed"
    if checkIntegrity(outfile + '_parse') == False:
        print "*********ERROR IN FILE", outfile + "_parse", "Please Try again*********"
    return outfile + "_parse"
def trainingEvaluationParsing(project=u"OrfeoGold2016",
                              parserType="graph",
                              whoseTrees="validator",
                              evaluationPercent=10,
                              additionnalLexicon=None,
                              resultAnnotator="mate",
                              getFromFolder=False,
                              parseDB=False,
                              memory="40G",
                              stopOnError=False):
    """
    Full pipeline: extract gold trees (from the database or from the folder
    getFromFolder), build train/test splits, train partial models for
    evaluation, train full models, then (with parseDB) parse the project's
    texts and push the results back into the database as resultAnnotator.

    if additionnalLexicon is given, it is joined to the training file for
    lemmatization and tagging.
    change memory here!
    todo :
    - add function to choose parser type (lang=)
    - create mate.log for progress (end = "Ready.")
    """
    mateLogs("Begin")
    ti = time.time()
    if getFromFolder:
        parseDB = False  # TODO: correct this so that all options are available
    # normalize possibly-None option values
    parserType = (parserType or "graph")
    whoseTrees = whoseTrees or "validator"
    evaluationPercent = evaluationPercent or 10
    resultAnnotator = resultAnnotator or "mate"
    try:
        os.chmod("mate/parse.log", 0666)  # just in case...
    except:
        pass
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H:%M')
    #####
    # defining project and creation of saves directories
    #####
    basepath = createDailyPath("./mate/", project)
    if parseDB:
        backupbase = backupOldDatabase(project, basepath)
        mateLogs(
            "A copy of the database has been stored in {backupbase}. Getting validated trees..."
            .format(backupbase=backupbase))
    traindir = createDirectory(basepath + "training")
    modeldir = createDirectory(basepath + "models")
    logdir = createDirectory(basepath + "logs")
    parsedir = createDirectory(basepath + "parses")
    #####
    # getting gold trees for training
    #####
    if getFromFolder:  # getFromFolder contains folder name containing only conll files
        error = False
        goldtrees = []
        for infile in glob.glob(os.path.join(getFromFolder, '*')):
            if os.path.isfile(infile):
                print "reading", infile
                gtrees = conll.conllFile2trees(infile)
                for tree in gtrees:
                    # drop nodes whose governor index points outside the tree
                    problemkeys = []
                    for i in tree:
                        for gi in tree[i]["gov"]:
                            if not 0 <= gi <= len(tree):
                                print infile
                                print tree
                                print "has a problematic governor:", gi
                                error = True
                                problemkeys += [i]
                    for problemk in problemkeys:
                        del tree[problemk]
                goldtrees += gtrees
        if error and stopOnError: sys.exit()
    else:
        goldtrees = trees2train.getValidatedTrees(project, basepath,
                                                  whoseTrees)
    mateLogs(
        u"{nrtrees} validated trees extracted".format(nrtrees=len(goldtrees)))
    lemma = None
    if goldtrees:
        # see whether the first token of the first tree has a lemma.
        # if lemma==None: we'll skip lemmatization
        lemma = goldtrees[0][sorted(goldtrees[0])[0]].get(
            "lemma", None)  # just trying to get the first lemma value
        if lemma == "_": lemma = None
        print "found lemma in first tree:", lemma
        #TODO: double tokens as lemmas for chinese, see makeTrainTestSets
    else:
        print "no trees from:", getFromFolder
        sys.exit()
    print "goldtrees:", len(goldtrees)
    #####
    # creating trainingfiles
    #####
    alldeptraining = traindir + "alldeptraining.conll"
    conll.trees2conllFile(goldtrees, alldeptraining, columns=14)
    traintrees = makeTrainTestSets(traindir,
                                   pattern=os.path.basename(alldeptraining),
                                   train="partialdeptrain.conll",
                                   test="test.conll",
                                   empty="emptytest.conll",
                                   testsize=int(evaluationPercent),
                                   lemma=lemma)
    print "traintrees:", len(traintrees)
    if additionnalLexicon:
        # join the lexicon trees to the lemmatization/tagging training data
        lexicontrees = conll.conllFile2trees(additionnalLexicon)
        print "lexicontrees:", len(lexicontrees)
        alllemtagtrain = traindir + "alllemtagtrain.conll"
        conll.trees2conllFile(goldtrees + lexicontrees,
                              alllemtagtrain,
                              columns=14)
        partiallemtagtrain = traindir + "partiallemtagtrain.conll"
        conll.trees2conllFile(traintrees + lexicontrees,
                              partiallemtagtrain,
                              columns=14)
    else:
        alllemtagtrain = alldeptraining
        partiallemtagtrain = traindir + "partialdeptrain.conll"
    mateLogs("trainfiles created")
    if verbose:
        print "just testing whether i can load them..."
        conll.conllFile2trees(traindir + "partialdeptrain.conll")
        conll.conllFile2trees(traindir + "emptytest.conll")
        conll.conllFile2trees(traindir + "test.conll")
    mateLogs("training of partial tree file for evaluation... ====")
    lemodelpartial, tagmodelpartial, parsemodelpartial = makeTrainingModels(
        basepath,
        lemtagin=partiallemtagtrain,
        depin=traindir + "partialdeptrain.conll",
        outfolder=modeldir,
        memory=memory,
        testfile=traindir + "emptytest.conll",
        evalfile=traindir + "test.conll",
        lemma=lemma,
        parserType=parserType)
    mateLogs("evaluation...")
    # evaluation
    evaluFileName = detailedEvaluation(parserType=parserType,
                                       memory=memory,
                                       testfile=traindir +
                                       "emptytest.conll_parse",
                                       evalfile=traindir + "test.conll",
                                       path=logdir,
                                       evaluationPercent=evaluationPercent)
    evalu = unicode(evaluFileName) + "\n"
    with codecs.open(evaluFileName, "r", "utf-8") as f:
        evalu += f.read()
    # full training
    mateLogs("training of full tree file for parsing... ====")
    lemodel, tagmodel, parsemodel = makeTrainingModels(
        basepath,
        lemtagin=alllemtagtrain,
        depin=alldeptraining,
        outfolder=modeldir,
        memory=memory,
        lemma=lemma,
        parserType=parserType)
    # getting texts to parse
    mateLogs(
        "training and evaluation complete. Starting the parse...\n\n{evalu}".
        format(evalu=evalu))
    if parseDB:
        filenames = getTextsForParsing.extractConllFiles(project, parsedir)
        # parsing
        for infile in filenames:
            mateLogs(
                "Training and evaluation complete. Starting the parse of {}\n\n"
                .format(infile))
            parsedfile = parsing(infile,
                                 lemodel=lemodel,
                                 tagmodel=tagmodel,
                                 parsemodel=parsemodel,
                                 outfolder=parsedir,
                                 parserType=parserType,
                                 memory=memory)
            # update on base
            newname = os.path.basename(parsedfile)
            updateTrees.updateParseResult(project,
                                          parsedir,
                                          filepattern=newname,
                                          annotatorName=resultAnnotator,
                                          removeToGetDB="_parse")
    # make it easy for everyone to erase all this stuff:
    for root, dirs, files in os.walk(basepath):
        for momo in dirs:
            try:
                os.chmod(os.path.join(root, momo), 0777)
            except:
                pass
        for momo in files:
            try:
                os.chmod(os.path.join(root, momo), 0666)
            except:
                pass
    totaltime = (time.time() - ti) / 60
    mateLogs(
        "Ready. It took {totaltime} minutes for the whole process\n\n{evalu}".
        format(totaltime=round(totaltime, 1), evalu=evalu))
def parsing(infile,
            lemodel="LemModel",
            tagmodel="TagModel",
            parsemodel="ParseModel",
            outfolder="parses",
            memory="4G",
            depparse=True,
            parserType="graph",
            lemmatized=False):
    """
    parsing function

    Runs the mate lemmatizer, tagger and (optionally) dependency parser as
    external java processes over infile, producing <outfolder>/<basename>
    with the suffixes _lem, _tag and _parse; returns the _parse file name.
    If lemodel is empty or ends with "/" (no real model), lemmatization is
    skipped: the input is either copied as the _lem file (lemmatized=True)
    or the tokens are doubled as lemmas.
    TODO: parserType is ignored
    """
    if outfolder[-1] != "/": outfolder = outfolder + "/"
    outfile = outfolder + os.path.basename(infile)
    anna, lemclass, tagclass, parseclass = getClasses("graph")
    lemcommand = "java -Xmx{memory} -cp {anna} {lemclass} -model {lemodel} -test {infile} -out {outfile}_lem".format(
        memory=memory,
        anna=anna,
        lemclass=lemclass,
        infile=infile,
        lemodel=lemodel,
        outfile=outfile)
    tagcommand = "java -Xmx{memory} -cp {anna} {tagclass} -model {tagmodel} -test {outfile}_lem -out {outfile}_tag".format(
        memory=memory,
        anna=anna,
        tagclass=tagclass,
        tagmodel=tagmodel,
        outfile=outfile)
    parsecommand = "java -Xmx{memory} -cp {anna} {parseclass} -model {parsemodel} -test {outfile}_tag -out {outfile}_parse".format(
        memory=memory,
        anna=anna,
        parseclass=parseclass,
        parsemodel=parsemodel,
        outfile=outfile)
    if lemodel and lemodel[-1] != "/":
        if verbose: print "\n\n========== lemmatizing...", lemcommand
        p1 = subprocess.Popen([lemcommand],
                              shell=True,
                              stdout=subprocess.PIPE)
        out, err = p1.communicate()
        if verbose: print out, err
    else:
        if lemmatized:
            # the input is already lemmatized: just copy it as the _lem file
            print "copying", outfolder + os.path.basename(
                infile), "as lemma file"
            shutil.copyfile(outfolder + os.path.basename(infile),
                            outfolder + os.path.basename(infile) + "_lem")
        else:
            # no lemmatizer available: use the tokens as their own lemmas
            print "adding toks as lems", outfolder + os.path.basename(infile)
            trees = conll.conllFile2trees(infile)
            with codecs.open(outfolder + os.path.basename(infile) + "_lem",
                             "w", "utf-8") as lemf:
                for tree in trees:
                    lemf.write(
                        newconvert.treeToEmptyConll14Text(tree, lemma=False) +
                        "\n")
    if verbose: print "\n\n========== tagging...", tagcommand
    p1 = subprocess.Popen([tagcommand], shell=True, stdout=subprocess.PIPE)
    out, err = p1.communicate()
    if verbose: print out, err
    if depparse:
        if verbose: print "\n\n========== dep analysis...", parsecommand
        p1 = subprocess.Popen([parsecommand],
                              shell=True,
                              stdout=subprocess.PIPE)
        print p1.stdout.read()
        if verbose: print "\n\n========== parsed"
    if checkIntegrity(outfile + '_parse') == False:
        print "*********ERROR IN FILE", outfile + "_parse", "Please Try again*********"
    return outfile + "_parse"