def enterConll(dbname, filename, annotatorName=None, eraseAllAnnos=False, dbtextname=None, corrdic={"tag": "cat", "t": "orthotext"}):
    """
    TODO: think about when to erase all tokens and features of existing annotations.
    maybe put warnings and confirm?
    """
    sql = database.SQL(dbname)
    db, cursor = sql.open()
    sentences = conll.conll2trees(filename, corrdic)
    if dbtextname:
        textid = sql.enter(cursor, "texts", ["textname"], (dbtextname,))
    else:
        # example filename: M016.XML.Paola.trees.conll10
        if filename[-14:-2] == '.trees.conll':
            dbtextname = ".".join(filename.split("/")[-1].split(".")[:-3])
            # TODO: remove:
            dbtextname = "Rhap-" + dbtextname[:2] + "0" + dbtextname[2:4] + "-Synt.xml"
            #print dbtextname
            textid = sql.enter(cursor, "texts", ["textname"], (dbtextname,))
            #print textid
            if not annotatorName:
                annotatorName = filename.split(".")[-3]
        else:
            dbtextname = filename.split("/")[-1]
            textid = sql.enter(cursor, "texts", ["textname"], (dbtextname,))
    enterSentences(sql, cursor, sentences, filename, textid, annotatorName, eraseAllAnnos)
    db.commit()
    db.close()
    return len(sentences)
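# A hypothetical usage sketch of enterConll (the project name and file path below are made-up examples):
#   n = enterConll("myproject", "corpus/M016.XML.Paola.trees.conll10", eraseAllAnnos=False)
#   print n, "sentences entered"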
def readinallmates(projectname, conlldirpath, filepattern="*.trees.conll14", eraseAllAnnos=True, steps=1000):
    sql = database.SQL(projectname)
    db, cursor = sql.open()
    for filename in glob(os.path.join(conlldirpath, filepattern)):
        print "entering", filename
        allsentences = conll.conll2trees(filename)
        # enter the trees in chunks of 'steps' sentences
        for i in range(len(allsentences) / steps + 1):
            sentences = allsentences[steps * i:steps * (i + 1)]
            #print sentences
            annotatorName = sql.baseAnnotatorName
            dbtextname = filename.split("/")[-1].decode("utf-8")
            # strip known conll extensions from the text name
            if dbtextname.endswith(".conll07") or dbtextname.endswith(".conll10") or dbtextname.endswith(".conll14") or dbtextname.endswith(".malt") or dbtextname.endswith(".tab"):
                dbtextname = ".".join(dbtextname.split(".")[:-1])
            if dbtextname.endswith(".trees"):
                dbtextname = ".".join(dbtextname.split(".")[:-1])
            # when the file is split into several chunks, number the resulting texts
            if steps < len(allsentences):
                dbtextname += ".no" + str(i)
            print dbtextname
            textid = sql.enter(cursor, "texts", ["textname"], (dbtextname,))
            enterSentences(sql, cursor, sentences, filename, textid, annotatorName, eraseAllAnnos, tokname="t")
    db.commit()
    db.close()
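# Illustration of the chunking above (numbers are made up): with steps=1000 and a file
# sample.trees.conll14 containing 2500 trees, three texts named "sample.no0", "sample.no1"
# and "sample.no2" are created, holding 1000, 1000 and 500 trees respectively.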
def passcompUp(infilename, outfilename):
    trees = conll2trees(infilename)
    print "read trees"
    with codecs.open(outfilename, "w", "utf-8") as outfile:
        for tree in trees:
            tree = completeInformation(tree)
            tree = passcomp(tree)
            for i, tokenid in enumerate(sorted(tree)):
                node = tree[tokenid]
                gov = node.get("gov", {}).items()
                govid, func = gov[0]
                outfile.write(
                    "\t".join(
                        [
                            str(tokenid),
                            node.get("t", "_"),
                            node.get("lemma", "_"),
                            node.get("lemma", "_"),
                            node.get("tag", "_"),
                            node.get("tag", "_"),
                            node.get("morph", "_"),
                            node.get("morph", "_"),
                            str(govid),
                            str(govid),
                            func,
                            func,
                            "_",
                            "_",
                        ]
                    )
                    + "\n"
                )
            outfile.write("\n")
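# passcompUp writes 14 tab-separated columns per token: id, form, then lemma, tag, morph,
# governor id and function each written twice (presumably to fill both the gold and the
# predicted column pairs of the conll14/mate layout), and two final "_" columns.
# A line could look like this (made-up token):
#   3	mange	manger	manger	V	V	_	_	1	1	root	root	_	_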
def simpleConll(inputfilename):
    """
    reads the Rhapsodie tok format, chooses a single function per token,
    and writes out a standard conll format
    """
    trees = conll2trees(inputfilename)
    with codecs.open(inputfilename + ".simpl", "w", "utf-8") as f:
        for treedic in trees:
            treedic = completeInformation(treedic)
            ntreedic = passcomp(treedic)
            # treedic looks like {1: {'lemma': u'bonjour', 'gov': {0: u'root'}, 'tag': u'B_I', 'id': 1, 't': u'bonjour'}, 3: {'lemma': u'Eric', 'gov': {0: u'root'}, 'tag': u'B_N', 'id': 3, 't': u'Eric'}}
            for tokid in sorted(treedic):
                tdic = treedic[tokid]
                if tdic["gov"] != {}:
                    tgov = tdic["gov"].keys()[0]
                    tfonc = tdic["gov"][tgov]
                else:
                    tgov, tfonc = "", ""
                f.write("\t".join([str(tdic["id"]), tdic["t"], tdic["lemma"], tdic["tag"][2:], "", "", str(tgov), tfonc]) + "\n")
            f.write("\n")
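# The .simpl file written above has 8 tab-separated columns per token: id, form, lemma,
# tag with its first two characters stripped (e.g. u'B_I' becomes 'I'), two empty columns,
# governor id and function. For the example node above, the line would be:
#   1	bonjour	bonjour	I			0	root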
def uploadConll(sql, filename, annotatorName=None, eraseAllAnnos=True):
    """
    important function, called from project.cgi!!!
    """
    db, cursor = sql.open()
    sentences = conll.conll2trees(filename)
    #print sentences
    if not annotatorName:
        annotatorName = sql.baseAnnotatorName
    dbtextname = filename.split("/")[-1].decode("utf-8")
    # strip known conll extensions from the text name
    if dbtextname.endswith(".conll10") or dbtextname.endswith(".conll14") or dbtextname.endswith(".malt") or dbtextname.endswith(".tab"):
        dbtextname = ".".join(dbtextname.split(".")[:-1])
    textid = sql.enter(cursor, "texts", ["textname"], (dbtextname,))
    enterSentences(sql, cursor, sentences, filename, textid, annotatorName, eraseAllAnnos, tokname="t")
    db.commit()
    db.close()
    return len(sentences)
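# A hypothetical usage sketch (the project name and file path are made up):
#   sql = database.SQL("myproject")
#   nbsent = uploadConll(sql, "/tmp/uploads/sample.conll14")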
def conll2phrasestructure(conllinfilename, phrasestructureoutname, args):
    beginning = ""
    rhaps = False
    with codecs.open(conllinfilename, "r", "utf-8") as f:
        beginning = f.read(50)
    # files whose first line starts with this header are treated as Rhapsodie-style exports
    if beginning.startswith("Text ID Tree ID Token ID"):
        rhaps = True
    trees = conll.conll2trees(conllinfilename, {"tag": "cat"}, rhaps=rhaps)
    ctrees, xmldoc = makePhraseStructure(trees)
    out = codecs.open(phrasestructureoutname, "w", "utf-8")
    if args.bracketing:
        for ctree in ctrees:
            out.write(ctree.pprint() + "\n\n")
    elif args.pasteBracketing:
        pasteBracketing(trees, ctrees, conllinfilename, out)
    elif args.latex:
        out.write(startlatex)
        for ctree in ctrees:
            out.write(ctree.pprint_latex_qtree() + "\n\n")
        out.write(endlatex)
    else:
        out.write(xmldoc.toprettyxml())
    out.close()
    if args.graphs:
        for ctree in ctrees:
            ctree.draw()
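# A hypothetical call, assuming args is an argparse-style namespace providing the flags
# read above (bracketing, pasteBracketing, latex, graphs); the file names are made up:
#   opts = argparse.Namespace(bracketing=True, pasteBracketing=False, latex=False, graphs=False)
#   conll2phrasestructure("sample.conll", "sample.brackets.txt", opts)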
def readInTestResults(projectname, conllfilename, gold="gold", parser="parser"):
    """
    reads in the result of the mate transition parser into visually comparable results
    """
    print "________________________________________current file is: " + conllfilename
    shortname = conllfilename.split("/")[-1]
    if projectname:
        sql = database.SQL(projectname)
        db, cursor = sql.open()
        textid = sql.enter(cursor, "texts", ["textname"], (shortname,))
        goldid = sql.enter(cursor, "users", ["user", "realname"], (gold, gold,))
        parserid = sql.enter(cursor, "users", ["user", "realname"], (parser, parser,))
        sents = {}
        goldsentences, parsesentences, sentencefeatures = [], [], []
        l, t, g, f, length = 0.0, 0.0, 0.0, 0.0, 0
        allcats, allfuncs = {}, {}
        sentences = conll.conll2trees(conllfilename)
        for tree in sentences:
            # per-sentence scores: lemma, tag, unlabeled attachment, labeled attachment, length
            tl, tt, tg, tf, tlength = computeDifference(tree)
            info = " LAS:" + str(round(tf / tlength, 3)) + " UAS:" + str(round(tg / tlength, 3)) + " pos precision:" + str(round(tt / tlength, 3)) + u" n° tokens:" + str(tlength)
            # split each node into a gold tree and a parsed tree (the parser's values are in the *2 features)
            goldtree, parsetree = {}, {}
            for i in tree:
                node = tree[i]
                goldtree[i] = {"id": node["id"], "t": node["t"], "lemma": node["lemma"], "tag": node["tag"], "morph": node["morph"], "gov": node["gov"]}
                parsetree[i] = {"id": node["id"], "t": node["t"], "lemma": node["lemma2"], "tag": node["tag2"], "morph": node["morph2"], "gov": node["gov2"]}
                allcats[node["tag"]] = None
                allcats[node["tag2"]] = None
                allfuncs[node["gov"].values()[0]] = None
                allfuncs[node["gov2"].values()[0]] = None
            sents[tf / tlength] = sents.get(tf / tlength, []) + [(goldtree, parsetree, info)]
            (l, t, g, f, length) = map(sum, zip((l, t, g, f, length), (tl, tt, tg, tf, tlength)))
        globalinfo = "lemma precision:" + str(round(l / length, 2)) + " pos precision:" + str(round(t / length, 2)) + " UAS:" + str(round(g / length, 2)) + " LAS:" + str(round(f / length, 2)) + u" n° tokens:" + str(length) + u" n° sentences:" + str(len(sentences))
        print globalinfo
        treec = 0
        # enter the sentences ordered by LAS, worst-parsed sentences first
        for f in sorted(sents):
            for (goldtree, parsetree, info) in sents[f]:
                goldsentences += [goldtree]
                parsesentences += [parsetree]
                sentencefeatures += [{"info": info}]
                treec += 1
        print "entering", len(parsesentences), "sentences"
        enterSentences(sql, cursor, goldsentences, shortname, textid, gold, eraseAllAnnos=True, sentencefeatures=sentencefeatures, tokname="t")
        enterSentences(sql, cursor, parsesentences, shortname, textid, parser, eraseAllAnnos=False, defaultAnnotatorName="gold", tokname="t")
        db.commit()
        db.close()
        writeConfigs(projectname, allcats, allfuncs)
    else:
        # only testing: compute and print the global scores without touching the database
        sentences = conll.conll2trees(conllfilename)
        l, t, g, f, length = 0.0, 0.0, 0.0, 0.0, 0
        for s in sentences:
            (l, t, g, f, length) = map(sum, zip((l, t, g, f, length), computeDifference(s)))
        print "____"
        print "lemma precision:", l / length, "tag precision:", t / length, "UAS:", g / length, "LAS:", f / length
        print "n° tokens:", length, "n° sentences:", len(sentences)
        print sentences[1]
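# A hypothetical usage sketch (project and file names are made up):
#   readInTestResults("myproject", "eval/test-result.conll14")  # stores gold and parser trees in the database
#   readInTestResults("", "eval/test-result.conll14")           # only computes and prints the scores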