Beispiel #1
0
def searchConllView(grammar, inconll, outhtmlfolder="html/", includeNonMatchingTrees=False):
	"""
	applies a search grammar to a conll file and writes the selected trees into an html view file
	
	grammar: path of the search grammar file, "or" relation between grammar trees
		(the text of the file is handed to transconll.SearchGrammar)
	inconll: a conll file as input
	outhtmlfolder: prefix of the output path, the view is written to
		outhtmlfolder + grammar basename + "." + conll basename + ".html"
	includeNonMatchingTrees: if True, all trees are written, not only the matching ones
	
	the html file is only written if at least one tree was collected,
	otherwise "no matches found" is printed
	the heading of each tree is its 1-based position in the input file
	"""
	conllbasename = os.path.basename(inconll).split(".conllu")[0]
	print("doing",conllbasename,"...")
	
	grammarbasename = os.path.basename(grammar).split(".")[0]
	outhtml = outhtmlfolder+grammarbasename+"."+conllbasename+".html"
	intrees = conll.conllFile2trees(inconll)
	# read the grammar inside a with block so that the file handle is closed right away
	with open(grammar) as grammarfile:
		sg = transconll.SearchGrammar(grammarfile.read())
	
	# the progress bar is only a convenience: if it cannot be created, go on without it
	try:
		pbar = tqdm.tqdm(total=len(intrees))
	except Exception:
		pbar = None
	
	collectedConll = []
	for i, intree in enumerate(intrees):
		if pbar is not None:
			pbar.update()
		# findall is called for every tree, even if includeNonMatchingTrees is set,
		# in case the search leaves information on the tree
		matchingRoots = sg.findall(intree)
		if matchingRoots or includeNonMatchingTrees:
			collectedConll.append(htmlconll.format(heading=str(i+1), conll=intree.conllu(), stat=""))
	if pbar is not None:
		pbar.close()
	
	if collectedConll:
		with open(outhtml,"w") as outf:
			outf.write(htmltemplate.replace("{conll}","".join(collectedConll)))
		print("written",outhtml)
	else:
		print("no matches found")
Beispiel #2
0
def _foldTreeStat(totals, treestat):
	"""
	helper of transConllView: folds the statistics of one tree into the running totals
	
	both arguments hold the six values (nbrel, nbsynt, distrel, distsynt, righttot, rightsynt)
	distrel and righttot are averaged with nbrel as weight, distsynt and rightsynt with nbsynt
	returns the new totals as a tuple
	"""
	tnbrel, tnbsynt, tdistrel, tdistsynt, trighttot, trightsynt = totals
	nbrel, nbsynt, distrel, distsynt, righttot, rightsynt = treestat
	
	def weighted(average, weight, value, valueweight):
		# if nothing has been counted so far (and nothing in this tree), the update
		# would be a 0/0: keep the average as it is instead of crashing
		if not weight+valueweight:
			return average
		return (average*weight+value*valueweight)/(weight+valueweight)
	
	return (
		tnbrel+nbrel,
		tnbsynt+nbsynt,
		weighted(tdistrel, tnbrel, distrel, nbrel),
		weighted(tdistsynt, tnbsynt, distsynt, nbsynt),
		weighted(trighttot, tnbrel, righttot, nbrel),
		weighted(trightsynt, tnbsynt, rightsynt, nbsynt),
	)


def _statHtml(title, treestat, ndigits=None):
	"""
	helper of transConllView: fills the htmlstat template with six statistics values
	if ndigits is given, the four weighted values (not the two counts) are rounded to ndigits
	"""
	nbrel, nbsynt, distrel, distsynt, righttot, rightsynt = treestat
	if ndigits is not None:
		distrel, distsynt, righttot, rightsynt = [round(value, ndigits) for value in (distrel, distsynt, righttot, rightsynt)]
	return htmlstat.format(title=title, nbrel=nbrel, nbsynt=nbsynt, distrel=distrel, distsynt=distsynt, righttot=righttot, rightsynt=rightsynt)


def transConllView(grammar, inconll, outconllfolder="conll/", outhtmlfolder="html/", addstat=False):
	"""
	applies a transconll grammar to a conll file and writes out
	- the transformed conll file
	- the diff html view file, both preceded by the name of the grammar
	
	grammar: path of the transconll grammar file
		(the text of the file is handed to transconll.TransGrammar)
	inconll: a conll file as input
	outconllfolder: prefix of the path of the transformed conll file
	outhtmlfolder: prefix of the path of the html view file
	addstat: if True, the simpleStat values of each tree before and after the
		transformation are added to the view, together with the running values
		over all trees seen so far (the running values are rounded to 2 digits)
	
	every tree is transformed on a deep copy, the input tree and the transformed
	tree are shown one after the other in the view
	"""
	conllbasename = os.path.basename(inconll).split(".conllu")[0]
	print("doing",conllbasename,"...")
	grammarbasename = os.path.basename(grammar).split(".")[0]
	outconll = outconllfolder+grammarbasename+"."+conllbasename+".conllu"
	outhtml = outhtmlfolder+grammarbasename+"."+conllbasename+".html"
	intrees = conll.conllFile2trees(inconll)
	# read the grammar inside a with block so that the file handle is closed right away
	with open(grammar) as grammarfile:
		tr = transconll.TransGrammar(grammarfile.read())
	
	# the progress bar is only a convenience: if it cannot be created, go on without it
	try:
		pbar = tqdm.tqdm(total=len(intrees))
	except Exception:
		pbar = None
	
	stat = ""
	beforetotals = (0, 0, 0, 0, 0, 0)
	aftertotals = (0, 0, 0, 0, 0, 0)
	outconlls = []
	collectedDoubleConll = []
	for i, intree in enumerate(intrees):
		if pbar is not None:
			pbar.update()
		outtree = copy.deepcopy(intree)
		tr.transform(outtree)
		tr.adddiff(intree, outtree)
		beforeconll = intree.conllu()
		afterconll = outtree.conllu()
		outconlls.append(afterconll)
		if addstat:
			beforestat = simpleStat(intree)
			beforetotals = _foldTreeStat(beforetotals, beforestat)
			afterstat = simpleStat(outtree)
			aftertotals = _foldTreeStat(aftertotals, afterstat)
			stat = (
				_statHtml("before", beforestat)
				+ _statHtml("before so far", beforetotals, ndigits=2)
				+ _statHtml("after", afterstat)
				+ _statHtml("after so far", aftertotals, ndigits=2)
			)
		collectedDoubleConll.append(htmlconll.format(heading=str(i+1), conll=beforeconll +"\n\n"+afterconll, stat=stat))
	if pbar is not None:
		pbar.close()
	
	with open(outhtml,"w") as outf:
		outf.write(htmltemplate.replace("{conll}","".join(collectedDoubleConll)))
	# CoNLL-U files are UTF-8 by definition, do not depend on the locale of the machine
	with open(outconll,"w",encoding="utf-8") as outf:
		outf.write("\n\n".join(outconlls))
	print("written",outconll,"and",outhtml)
Beispiel #3
0
def syntacticize(inconll, filenameAddon=".SUD", skipIfPresent=True, outconllfolder="conll/"):
    """
    single thread of conllu transformation by means of a transconll grammar

    inconll: a conll file as input
    filenameAddon: inserted between the basename of the input and ".conllu"
        in the name of the output file
    skipIfPresent: if True and the output file exists already, nothing is
        computed (pass False to recompute)
    outconllfolder: folder the output file is written to

    every tree is transformed in place with the module-level grammar trgram,
    the results are written, separated by an empty line, to
    outconllfolder/<basename><filenameAddon>.conllu
    """
    basename = os.path.basename(inconll).split(".conllu")[0]
    print("doing", basename, "...")
    outconll = os.path.join(outconllfolder, basename + filenameAddon + ".conllu")
    if skipIfPresent and os.path.isfile(outconll):
        print("found already", basename)
        return
    outconlls = []
    for intree in conll.conllFile2trees(inconll):
        trgram.transform(intree)
        outconlls.append(intree.conllu())
    # CoNLL-U files are UTF-8 by definition, do not depend on the locale of the machine
    with open(outconll, "w", encoding="utf-8") as outf:
        outf.write("\n\n".join(outconlls))
    print("done with", basename)
Beispiel #4
0
def conll2html(inconll, sort=False, outhtmlfolder="html/"):
	"""
	simple function: just creates an html file to view a conllu file
	
	inconll: a conll file as input
	sort: if True, the html snippets of the trees are sorted as strings before writing
	outhtmlfolder: prefix of the output path, the view is written to
		outhtmlfolder + conll basename + ".html"
	
	the heading of each tree is its 1-based position in the file,
	followed by the sent_id of the tree if it has one
	"""
	basename = os.path.basename(inconll).split(".conllu")[0]
	print("doing",basename,"...")
	outhtml = outhtmlfolder+basename+".html"
	intrees = conll.conllFile2trees(inconll)
	
	# the progress bar is only a convenience: if it cannot be created, go on without it
	try:
		pbar = tqdm.tqdm(total=len(intrees))
	except Exception:
		pbar = None
	
	collectedConll = []
	for i, intree in enumerate(intrees):
		if pbar is not None:
			pbar.update()
		heading = str(i+1)+" "+intree.sentencefeatures.get('sent_id','')
		collectedConll.append(htmlconll.format(heading=heading, conll=intree.conllu(), stat=""))
	if pbar is not None:
		pbar.close()
	
	if sort:
		collectedConll.sort()
	with open(outhtml,"w") as outf:
		outf.write(htmltemplate.replace("{conll}","\n".join(collectedConll) ))
	print("written",outhtml)
Beispiel #5
0
     + files, "r")
 ###### unify the names of the new and the old file ######
 conll10 = str(files)
 conll10 = conll10.replace(".conll", ".most.recent.trees.conll10")
 conll10 = conll10.replace("-", ".")
 ###### use the unified name of the new (corrected) file to open the old (uncorrected) file
 f_ori = open(
     "E:/TAL/Stage/arborator/projects/NaijaSUD/export/newest_conll10/" +
     conll10, "r")
 # read the whole old file into one string, used below for substring tests
 contenu = ""
 for ligne in f_ori:
     contenu = contenu + ligne
 textename = files.replace(".conll", "")
 # f and phrase are set up before this fragment; lines of f are accumulated
 # into phrase until an empty line ends the sentence block
 for chaque_ligne in f:
     if chaque_ligne != "\n":
         phrase = phrase + chaque_ligne
     else:
         if phrase not in contenu:  # if a sentence of the corrected file does not occur in the old file, it has been modified and will be written to the database
             sentence = re.findall("# text = (.+)\n",
                                   phrase)  # get the sentence text via "# text = "
             sentence = str(sentence[0])
             scounter = re.findall("sent_id = P_.+PRO_(\d+)", phrase)
             scounter = int(scounter[0])  # get the sentence number from the sent_id
             # NOTE(review): sentence and scounter are not used in the visible part of the script
             w = open("chaque_phrase.conll", "w")
             w.write(phrase)
             w.close()
             # NOTE(review): the sentence is written to "chaque_phrase.conll" but the trees
             # are read from "test.conll" - looks like a file name mismatch, confirm
             trees = conllFile2trees("test.conll", encoding="utf-8")
             simpleEnterSentences(sql, trees, textename, "yuchen")
         phrase = ""
 f.close()
 f_ori.close()