Beispiel #1
0
def enterConll(dbname,filename,annotatorName=None, eraseAllAnnos=False, dbtextname=None, corrdic=None):
	"""
	Parse a CoNLL file and enter its sentences into the database `dbname`.

	dbname: name handed to database.SQL to select the database.
	filename: path of the CoNLL file to read.
	annotatorName: annotator the trees are entered for. When it is empty and
		the file name has the form <text>.<annotator>.trees.conllNN, the
		annotator is taken from the file name.
	eraseAllAnnos: passed through unchanged to enterSentences.
	dbtextname: text name to register. When empty it is derived from the
		file name (see below).
	corrdic: correspondence dictionary handed to conll.conll2trees;
		defaults to {"tag":"cat","t":"orthotext"}.

	Returns the number of sentences read from the file.

	TODO: think about when to erase all tokens and features of existing
	annotations. maybe put warnings and confirm?
	"""
	# Build the default per call: a dict literal in the signature would be a
	# single object shared (and possibly mutated) across all calls.
	if corrdic is None:
		corrdic = {"tag":"cat","t":"orthotext"}

	sql = database.SQL(dbname)
	db,cursor=sql.open()
	try:
		sentences=conll.conll2trees(filename,corrdic)

		if not dbtextname:
			# e.g. M016.XML.Paola.trees.conll10
			if filename[-14:-2]=='.trees.conll':
				# drop the last three dot-separated parts: annotator, "trees", "conllNN"
				dbtextname=".".join(filename.split("/")[-1].split(".")[:-3])

				# TODO: remove this project-specific renaming
				# ("M016.XML" becomes "Rhap-M0016-Synt.xml")
				dbtextname= "Rhap-"+dbtextname[:2] + "0" + dbtextname[2:4]+"-Synt.xml"

				if not annotatorName: annotatorName=filename.split(".")[-3]
			else:
				dbtextname=filename.split("/")[-1]

		textid = sql.enter(cursor, "texts",["textname"],(dbtextname,))
		enterSentences(sql,cursor,sentences,filename, textid,annotatorName,eraseAllAnnos )
		db.commit()
	finally:
		# release the connection even when parsing or entering fails
		db.close()
	return len(sentences)
Beispiel #2
0
def readinallmates(projectname,conlldirpath,filepattern="*.trees.conll14",eraseAllAnnos=True, steps=1000):
	"""
	Enter every CoNLL file matching `filepattern` in `conlldirpath` into the
	database of `projectname`, split into texts of at most `steps` sentences.

	The text name is the file's base name without a trailing format extension
	(.conll07, .conll10, .conll14, .malt, .tab) and without a trailing
	".trees". A file holding more than `steps` sentences is split into several
	texts named <name>.no0, <name>.no1, ...
	All trees are entered for sql.baseAnnotatorName; `eraseAllAnnos` is passed
	through to enterSentences. Everything is committed once, after the last
	file.

	Raises ValueError if `steps` is smaller than 1.
	"""
	if steps < 1:
		raise ValueError("steps must be a positive integer, got %r" % (steps,))

	formatSuffixes = (".conll07", ".conll10", ".conll14", ".malt", ".tab")

	sql = database.SQL(projectname)
	db,cursor=sql.open()
	try:
		annotatorName=sql.baseAnnotatorName

		for filename in glob(os.path.join(conlldirpath, filepattern)):
			print("entering " + filename)
			allsentences=conll.conll2trees(filename)

			# The base text name does not depend on the chunk: compute it once per file.
			basename = filename.split("/")[-1]
			if isinstance(basename, bytes):
				# only byte strings need decoding; glob returns unicode for unicode paths
				basename = basename.decode("utf-8")
			if basename.endswith(formatSuffixes):
				basename = basename.rsplit(".", 1)[0]
			if basename.endswith(".trees"):
				basename = basename.rsplit(".", 1)[0]

			multipleChunks = steps < len(allsentences)
			# Ceiling division, so that a sentence count that is an exact multiple
			# of `steps` does not produce an empty trailing chunk. A file without
			# any sentence still yields one (empty) text.
			chunkCount = max(1, (len(allsentences) + steps - 1) // steps)

			for i in range(chunkCount):
				sentences=allsentences[steps*i:steps*(i+1)]

				dbtextname = basename
				if multipleChunks:
					dbtextname+=".no"+str(i)
					print(dbtextname)
				textid = sql.enter(cursor, "texts",["textname"],(dbtextname,))

				enterSentences(sql,cursor,sentences,filename, textid,annotatorName,eraseAllAnnos, tokname="t" )
		db.commit()
	finally:
		# release the connection even when a file fails to parse or enter
		db.close()
def passcompUp(infilename, outfilename):
    """
    Read the trees of `infilename`, run each one through completeInformation
    and passcomp, and write the result to `outfilename` as UTF-8.

    Every token becomes one tab-separated line of 14 columns: id, form, then
    lemma, tag, morph, governor id and function each written twice, followed
    by two "_" columns. Sentences are separated by an empty line.
    Missing token features are written as "_"; a token without any governor
    gets "_" for both the governor id and the function.
    """
    trees = conll2trees(infilename)
    print("read trees")
    with codecs.open(outfilename, "w", "utf-8") as outfile:
        for tree in trees:
            tree = passcomp(completeInformation(tree))
            for tokenid in sorted(tree):
                node = tree[tokenid]
                # Take the first governor, if any. Indexing items()[0] would
                # raise on an empty governor dict.
                govid, func = next(iter(node.get("gov", {}).items()), ("_", "_"))
                lemma = node.get("lemma", "_")
                tag = node.get("tag", "_")
                morph = node.get("morph", "_")
                columns = [
                    str(tokenid),
                    node.get("t", "_"),
                    lemma,
                    lemma,
                    tag,
                    tag,
                    morph,
                    morph,
                    str(govid),
                    str(govid),
                    func,
                    func,
                    "_",
                    "_",
                ]
                outfile.write("\t".join(columns) + "\n")

            outfile.write("\n")
def simpleConll(inputfilename):
    """
    Read a Rhapsodie token file, keep a single governor/function per token
    and write the result in a simple CoNLL-like format.

    The output goes to `inputfilename` + ".simpl" (UTF-8). Each token is one
    tab-separated line: id, form, lemma, tag without its first two
    characters, two empty columns, governor id, function. Tokens without a
    governor get empty governor and function columns. Sentences are
    separated by an empty line.
    """
    parsedtrees = conll2trees(inputfilename)
    with codecs.open(inputfilename + ".simpl", "w", "utf-8") as out:
        for sentence in parsedtrees:
            sentence = completeInformation(sentence)
            # NOTE(review): the value returned by passcomp is not used below;
            # the loop reads `sentence`. Confirm that passcomp works in place.
            passcomp(sentence)

            # a sentence looks like {1: {'lemma': u'bonjour', 'gov': {0: u'root'}, 'tag': u'B_I', 'id': 1, 't': u'bonjour'}, 3: {'lemma': u'Eric', 'gov': {0: u'root'}, 'tag': u'B_N', 'id': 3, 't': u'Eric'}}
            for key in sorted(sentence):
                token = sentence[key]
                governors = token["gov"]

                head, relation = "", ""
                if governors != {}:
                    # first governor in the dict's iteration order
                    head = list(governors.keys())[0]
                    relation = governors[head]

                fields = [
                    str(token["id"]),
                    token["t"],
                    token["lemma"],
                    token["tag"][2:],
                    "",
                    "",
                    str(head),
                    relation,
                ]
                out.write("\t".join(fields) + "\n")
            out.write("\n")
Beispiel #5
0
def uploadConll(sql, filename, annotatorName=None, eraseAllAnnos=True):
	"""
	important function, called from project.cgi!!!

	Parse the CoNLL file `filename` and enter its sentences through `sql`.

	sql: database object providing open(), enter() and baseAnnotatorName.
	filename: path of the CoNLL file. The text is registered under the
		file's base name, without a trailing .conll10, .conll14, .malt or
		.tab extension.
	annotatorName: annotator the trees are entered for; defaults to
		sql.baseAnnotatorName when empty.
	eraseAllAnnos: passed through unchanged to enterSentences.

	Returns the number of sentences read from the file.
	"""
	db,cursor=sql.open()
	try:
		sentences=conll.conll2trees(filename)
		if not annotatorName: annotatorName=sql.baseAnnotatorName

		dbtextname = filename.split("/")[-1]
		if isinstance(dbtextname, bytes):
			# only byte strings need decoding; decoding an already-unicode
			# name fails as soon as it contains non-ASCII characters
			dbtextname = dbtextname.decode("utf-8")
		if dbtextname.endswith((".conll10", ".conll14", ".malt", ".tab")):
			dbtextname = dbtextname.rsplit(".", 1)[0]

		textid = sql.enter(cursor, "texts",["textname"],(dbtextname,))

		enterSentences(sql,cursor,sentences,filename, textid,annotatorName,eraseAllAnnos, tokname="t" )
		db.commit()
	finally:
		# release the connection even when parsing or entering fails
		db.close()
	return len(sentences)
Beispiel #6
0
def conll2phrasestructure(conllinfilename, phrasestructureoutname, args):
	"""
	Convert the dependency trees of a CoNLL file into phrase structures and
	write them to `phrasestructureoutname` (UTF-8).

	The input is read with rhaps=True when its first characters are the
	header "Text ID<tab>Tree ID<tab>Token ID".

	`args` selects the output format, tested in this order:
	  args.bracketing      one pprint() bracketing per tree
	  args.pasteBracketing output written by pasteBracketing
	  args.latex           pprint_latex_qtree() of each tree between
	                       startlatex and endlatex
	  otherwise            xmldoc.toprettyxml()
	If args.graphs is set, every tree is additionally drawn with draw() once
	the output file has been written.
	"""
	with codecs.open(conllinfilename,"r","utf-8") as infile:
		rhaps = infile.read(50).startswith("Text ID	Tree ID	Token ID")

	trees=conll.conll2trees(conllinfilename, {"tag":"cat"}, rhaps=rhaps)
	ctrees,xmldoc=makePhraseStructure(trees)

	# `with` closes the output file even when one of the writers raises
	with codecs.open(phrasestructureoutname,"w","utf-8") as out:
		if args.bracketing:
			for ctree in ctrees: out.write(ctree.pprint()+"\n\n")
		elif args.pasteBracketing:
			pasteBracketing(trees,ctrees,conllinfilename, out)
		elif args.latex:
			out.write(startlatex)
			for ctree in ctrees: out.write(ctree.pprint_latex_qtree()+"\n\n")
			out.write(endlatex)
		else:
			out.write(xmldoc.toprettyxml())

	if args.graphs:
		for ctree in ctrees:
			ctree.draw()
Beispiel #7
0
def _safeRatio(numerator, denominator):
	"""Return numerator/denominator as a float, or 0.0 when the denominator is zero."""
	if not denominator:
		return 0.0
	return float(numerator) / denominator


def readInTestResults(projectname,conllfilename,gold="gold",parser="parser"):
	"""
	reads in the result of the mate transition parser into visually comparable results

	Every token of `conllfilename` is expected to carry the gold analysis in
	"lemma", "tag", "morph", "gov" and the parser analysis in "lemma2",
	"tag2", "morph2", "gov2".

	With a `projectname`: the file is registered as a text, `gold` and
	`parser` are registered as users, and the gold and parser trees are
	entered for these two users, ordered by increasing per-sentence LAS. The
	scores of each sentence are attached to the gold tree as its "info"
	sentence feature. Finally writeConfigs is called with the categories and
	functions found in the file.

	Without a `projectname`: only the global scores are printed.

	Ratios over zero tokens are reported as 0.0.
	"""
	print("________________________________________current file is: " + conllfilename)
	shortname = conllfilename.split("/")[-1]
	if projectname:

		sql = database.SQL(projectname)
		db,cursor=sql.open()
		try:
			textid = sql.enter(cursor, "texts",["textname"],(shortname,))
			# the returned ids are not needed; the calls register the two users
			sql.enter(cursor, "users",["user","realname"],(gold,gold,))
			sql.enter(cursor, "users",["user","realname"],(parser,parser,))

			sents={}
			l,t,g,f,length=0.0,0.0,0.0,0.0,0
			# dicts used as sets, in the form handed to writeConfigs
			allcats,allfuncs={},{}

			sentences=conll.conll2trees(conllfilename)
			for tree in sentences:
				tl,tt,tg,tf,tlength=computeDifference(tree)
				# _safeRatio forces a float division (the counts may be
				# integers) and tolerates trees without tokens
				las = _safeRatio(tf, tlength)
				info = " LAS:"+str(round(las,3))+" UAS:"+str(round(_safeRatio(tg,tlength),3))+" pos precision:"+str(round(_safeRatio(tt,tlength),3))+u" n° tokens:"+str(tlength)

				goldtree,parsetree={},{}
				for i in tree:
					node=tree[i]
					goldtree[i]= {"id":node["id"],"t":node["t"],"lemma":node["lemma"],"tag":node["tag"],"morph":node["morph"],"gov":node["gov"]}
					parsetree[i]={"id":node["id"],"t":node["t"],"lemma":node["lemma2"],"tag":node["tag2"],"morph":node["morph2"],"gov":node["gov2"]}
					allcats[node["tag"]]=None
					allcats[node["tag2"]]=None
					# record the function of the first governor, if there is one
					for governors in (node["gov"], node["gov2"]):
						for func in governors.values():
							allfuncs[func]=None
							break

				sents.setdefault(las, []).append( ( goldtree,parsetree,info ) )
				l+=tl
				t+=tt
				g+=tg
				f+=tf
				length+=tlength

			globalinfo = "lemma precision:"+str(round(_safeRatio(l,length),2))+" pos precision:"+str(round(_safeRatio(t,length),2))+" UAS:"+str(round(_safeRatio(g,length),2))+" LAS:"+str(round(_safeRatio(f,length),2))+u" n° tokens:"+str(length)+u" n° sentences:"+str(len(sentences))
			print(globalinfo)

			# worst sentences first; sentences with equal LAS keep their file order
			goldsentences,parsesentences,sentencefeatures=[],[],[]
			for las in sorted(sents):
				for ( goldtree,parsetree,info ) in sents[las]:
					goldsentences.append(goldtree)
					parsesentences.append(parsetree)
					sentencefeatures.append({"info":info})
			print("entering %d sentences" % len(parsesentences))
			enterSentences(sql,cursor,goldsentences,shortname, textid,gold,eraseAllAnnos=True, sentencefeatures=sentencefeatures,tokname="t" )
			# NOTE(review): defaultAnnotatorName is the literal "gold", not the
			# `gold` parameter -- confirm this is intended for non-default names.
			enterSentences(sql,cursor,parsesentences,shortname, textid,parser,eraseAllAnnos=False, defaultAnnotatorName="gold", tokname="t" )
			db.commit()
		finally:
			# release the connection even when parsing or entering fails
			db.close()

		writeConfigs(projectname,allcats,allfuncs)

	else:
		#only testing
		sentences=conll.conll2trees(conllfilename)
		l,t,g,f,length=0.0,0.0,0.0,0.0,0

		for s in sentences:
			tl,tt,tg,tf,tlength=computeDifference(s)
			l+=tl
			t+=tt
			g+=tg
			f+=tf
			length+=tlength

		print("____")
		print("lemma precision: %s tag precision: %s UAS: %s LAS: %s" % (_safeRatio(l,length), _safeRatio(t,length), _safeRatio(g,length), _safeRatio(f,length)))
		print("n° tokens: %s n° sentences: %s" % (length, len(sentences)))
		# debugging aid: show the second sentence when the file has one
		if len(sentences) > 1:
			print(sentences[1])