def searchConllView(grammar, inconll, outhtmlfolder="html/", includeNonMatchingTrees=False):
    """
    Apply a search grammar to a conll file and write an html view of the matching trees.

    grammar: path of the search grammar file ("or" relation between grammar trees)
    inconll: path of the conll file used as input
    outhtmlfolder: prefix prepended as-is to the name of the html file
    includeNonMatchingTrees: if True, every tree is written, whether it matches or not

    The html file is named <grammar basename>.<conll basename>.html.
    It is only written if at least one tree was collected;
    otherwise "no matches found" is printed.
    """
    conllbasename = os.path.basename(inconll).split(".conllu")[0]
    print("doing",conllbasename,"...")
    grammarbasename = os.path.basename(grammar).split(".")[0]
    outhtml = outhtmlfolder+grammarbasename+"."+conllbasename+".html"
    intrees = conll.conllFile2trees(inconll)
    # read the grammar inside a with-block so that the file handle is closed right away
    with open(grammar) as grammarfile:
        sg = transconll.SearchGrammar(grammarfile.read())
    # the progress bar is optional: without tqdm (or without a len) we simply go on silently
    try:
        pbar = tqdm.tqdm(total=len(intrees))
    except Exception:
        pbar = None
    sections = []
    for i, intree in enumerate(intrees):
        if pbar is not None:
            pbar.update()
        matchingRoots = sg.findall(intree)
        if matchingRoots or includeNonMatchingTrees:
            sections.append(htmlconll.format(heading=str(i+1), conll=intree.conllu(), stat=""))
    if pbar is not None:
        pbar.close()
    collectedConll = "".join(sections)
    if collectedConll:
        with open(outhtml,"w") as outf:
            outf.write(htmltemplate.replace("{conll}",collectedConll))
        print("written",outhtml)
    else:
        print("no matches found")
def _weightedMean(meanSoFar, weightSoFar, mean, weight):
    """ merges a new mean into a running mean; an all-zero weight leaves the running mean untouched """
    totalWeight = weightSoFar+weight
    if not totalWeight:
        # nothing counted so far and nothing to add: avoid the division by zero
        return meanSoFar
    return (meanSoFar*weightSoFar+mean*weight)/totalWeight


def _accumulateStat(totals, treestat):
    """ adds the stat 6-tuple of one tree to the running totals and returns the new totals """
    tnbrel, tnbsynt, tdistrel, tdistsynt, trighttot, trightsynt = totals
    nbrel, nbsynt, distrel, distsynt, righttot, rightsynt = treestat
    # the *rel and righttot means are weighted by nbrel, the *synt means by nbsynt
    return (
        tnbrel+nbrel,
        tnbsynt+nbsynt,
        _weightedMean(tdistrel, tnbrel, distrel, nbrel),
        _weightedMean(tdistsynt, tnbsynt, distsynt, nbsynt),
        _weightedMean(trighttot, tnbrel, righttot, nbrel),
        _weightedMean(trightsynt, tnbsynt, rightsynt, nbsynt),
    )


def _statHtml(title, stat, rounded=False):
    """ renders one stat 6-tuple with the htmlstat template, optionally rounding the means to 2 digits """
    nbrel, nbsynt, distrel, distsynt, righttot, rightsynt = stat
    if rounded:
        distrel, distsynt, righttot, rightsynt = [round(v,2) for v in (distrel, distsynt, righttot, rightsynt)]
    return htmlstat.format(title=title, nbrel=nbrel, nbsynt=nbsynt, distrel=distrel, distsynt=distsynt, righttot=righttot, rightsynt=rightsynt)


def transConllView(grammar, inconll, outconllfolder="conll/", outhtmlfolder="html/", addstat=False):
    """
    Apply a transconll grammar to a conll file and write out
    - the transformed conll file
    - the diff html view file,
    both preceded by the name of the grammar.

    grammar: path of the transconll grammar file
    inconll: path of the conll file used as input
    outconllfolder, outhtmlfolder: prefixes prepended as-is to the output file names
    addstat: if True, the values returned by simpleStat are shown for every tree,
        before and after the transformation, each followed by the running values
        over all trees seen so far (means rounded to 2 digits)

    Every input tree is deep-copied before the transformation, so the original
    trees are shown unchanged next to the transformed ones.
    """
    conllbasename = os.path.basename(inconll).split(".conllu")[0]
    print("doing",conllbasename,"...")
    stat = ""
    # running (nbrel, nbsynt, distrel, distsynt, righttot, rightsynt), before and after transformation
    beforetotals = (0,0,0,0,0,0)
    aftertotals = (0,0,0,0,0,0)
    grammarbasename = os.path.basename(grammar).split(".")[0]
    outconll = outconllfolder+grammarbasename+"."+conllbasename+".conllu"
    outhtml = outhtmlfolder+grammarbasename+"."+conllbasename+".html"
    intrees = conll.conllFile2trees(inconll)
    # read the grammar inside a with-block so that the file handle is closed right away
    with open(grammar) as grammarfile:
        tr = transconll.TransGrammar(grammarfile.read())
    outconlls = []
    sections = []
    # the progress bar is optional: without tqdm (or without a len) we simply go on silently
    try:
        pbar = tqdm.tqdm(total=len(intrees))
    except Exception:
        pbar = None
    for i, intree in enumerate(intrees):
        if pbar is not None:
            pbar.update()
        outtree = copy.deepcopy(intree)
        tr.transform(outtree)
        tr.adddiff(intree, outtree)
        beforeconll = intree.conllu()
        afterconll = outtree.conllu()
        outconlls.append(afterconll)
        if addstat:
            beforestat = simpleStat(intree)
            beforetotals = _accumulateStat(beforetotals, beforestat)
            afterstat = simpleStat(outtree)
            aftertotals = _accumulateStat(aftertotals, afterstat)
            stat = (
                _statHtml("before", beforestat)
                + _statHtml("before so far", beforetotals, rounded=True)
                + _statHtml("after", afterstat)
                + _statHtml("after so far", aftertotals, rounded=True)
            )
        sections.append(htmlconll.format(heading=str(i+1), conll=beforeconll+"\n\n"+afterconll, stat=stat))
    if pbar is not None:
        pbar.close()
    with open(outhtml,"w") as outf:
        outf.write(htmltemplate.replace("{conll}","".join(sections)))
    with open(outconll,"w") as outf:
        outf.write("\n\n".join(outconlls))
    print("written",outconll,"and",outhtml)
def syntacticize(inconll, filenameAddon=".SUD", skipIfPresent=True):
    """
    single thread of conllu transformation by means of a transconll grammar

    inconll: path of the conllu file to transform
    filenameAddon: inserted between the base name and ".conllu" in the output name
    skipIfPresent: if True and the output file already exists, nothing is recomputed

    The result goes to conll/<basename><filenameAddon>.conllu.
    The grammar is taken from the name trgram, which is not defined in this function.
    """
    name = os.path.basename(inconll).split(".conllu")[0]
    print("doing", name, "...")
    target = "conll/" + name + filenameAddon + ".conllu"
    if skipIfPresent:
        if os.path.isfile(target):
            print("found already", name)
            return
    transformed = []
    for tree in conll.conllFile2trees(inconll):
        # the tree is transformed in place, then serialized
        trgram.transform(tree)
        transformed.append(tree.conllu())
    content = "\n\n".join(transformed)
    with open(target, "w") as outf:
        outf.write(content)
    print("done with", name)
def conll2html(inconll, sort=False):
    """
    simple function: just creates an html file to view a conllu file

    inconll: path of the conllu file to show
    sort: if True, the rendered html sections of the trees are sorted as strings
        before they are written

    The heading of each tree is its 1-based position in the file followed by its
    sent_id sentence feature (empty if absent). The result goes to html/<basename>.html.
    """
    basename = os.path.basename(inconll).split(".conllu")[0]
    print("doing",basename,"...")
    outhtml = "html/"+basename+".html"
    intrees = conll.conllFile2trees(inconll)
    # the progress bar is optional: without tqdm (or without a len) we simply go on silently
    try:
        pbar = tqdm.tqdm(total=len(intrees))
    except Exception:
        pbar = None
    collectedConll = []
    for i, intree in enumerate(intrees):
        if pbar is not None:
            pbar.update()
        heading = str(i+1)+" "+intree.sentencefeatures.get('sent_id','')
        collectedConll.append(htmlconll.format(heading=heading, conll=intree.conllu(), stat=""))
    if pbar is not None:
        pbar.close()
    if sort:
        collectedConll.sort()
    with open(outhtml,"w") as outf:
        outf.write(htmltemplate.replace("{conll}","\n".join(collectedConll)))
    print("written",outhtml)
+ files, "r") ###### 统一新旧文件名字 ###### conll10 = str(files) conll10 = conll10.replace(".conll", ".most.recent.trees.conll10") conll10 = conll10.replace("-", ".") ###### 通过统一后的新文件名(已改错)打开旧文件名(未改错) f_ori = open( "E:/TAL/Stage/arborator/projects/NaijaSUD/export/newest_conll10/" + conll10, "r") contenu = "" for ligne in f_ori: contenu = contenu + ligne textename = files.replace(".conll", "") for chaque_ligne in f: if chaque_ligne != "\n": phrase = phrase + chaque_ligne else: if phrase not in contenu: # 如果在已改错文件中的句子与旧文件不同,说明此句已修改,将会写入数据库 sentence = re.findall("# text = (.+)\n", phrase) # 通过# text = 获取句子文本 sentence = str(sentence[0]) scounter = re.findall("sent_id = P_.+PRO_(\d+)", phrase) scounter = int(scounter[0]) # 通过sent_id获取句子序号 w = open("chaque_phrase.conll", "w") w.write(phrase) w.close() trees = conllFile2trees("test.conll", encoding="utf-8") simpleEnterSentences(sql, trees, textename, "yuchen") phrase = "" f.close() f_ori.close()