def simpleConll(inputfilename):
    """Read a Rhapsodie "tok" file, keep a single (the first) function per
    token, and write a standard CoNLL file to <inputfilename>.simpl.

    NOTE(review): Python 2 only -- tdic["gov"].keys()[0] assumes keys()
    returns a list.
    """
    trees = conllFile2trees(inputfilename)
    #1/0
    with codecs.open(inputfilename + '.simpl', 'w', 'utf-8') as f:
        for treedic in trees:
            #print "\n"

            #for id in sorted(treedic):
            #node=treedic[id]
            #print "____",id, node
            #print "ooo"
            treedic = completeInformation(treedic)  # enrich the nodes
            ntreedic = passcomp(treedic)  # NOTE(review): return value unused; treedic itself is written below

            # treedic looks like {1: {'lemma': u'bonjour', 'gov': {0: u'root'}, 'tag': u'B_I', 'id': 1, 't': u'bonjour'}, 3: {'lemma': u'Eric', 'gov': {0: u'root'}, 'tag': u'B_N', 'id': 3, 't': u'Eric'}}
            for tokid in sorted(treedic):

                tdic = treedic[tokid]
                #print tokid,tdic
                if tdic["gov"] != {}:
                    # keep only the first governor/function pair
                    tgov = tdic["gov"].keys()[0]
                    tfonc = tdic["gov"][tgov]
                else:
                    tgov, tfonc = "", ""
                # columns: id, form, lemma, tag (2-char prefix stripped), _, _, gov, func
                f.write("\t".join([
                    str(tdic["id"]), tdic["t"], tdic["lemma"], tdic["tag"][2:],
                    "", "",
                    str(tgov), tfonc
                ]) + "\n")
            f.write("\n")
def makeTrainTestSets(infolder,pattern="*conll",train="train",test="test",empty="emptytest",testsize=10, lemma=True):
	"""Split the CoNLL files matching pattern in infolder into train/test sets.

	Writes three files into infolder: train, test, and empty (the test
	sentences written without gold annotation).  testsize is the percentage
	of sentences sampled into the test set (Python 2 integer division).
	NOTE(review): when lemma is False the empty file is copied to
	<empty>_lem while still open for writing -- the copy may miss buffered
	data; confirm intent.
	"""
	tottec,tottrc,toks=0,0,0
	with codecs.open(os.path.join(infolder, test),"w","utf-8") as testf, codecs.open(os.path.join(infolder, empty),"w","utf-8") as emptyf, codecs.open(os.path.join(infolder, train),"w","utf-8") as trainf:
		for infilename in glob.glob(os.path.join(infolder, pattern)):
			print "newconvert: looking at",infilename

			tec,trc=0,0
			allsentences=conll.conllFile2trees(infilename)
			print len(allsentences),"sentences"
			# sample testsize% of the sentence indices for the test set
			testselection=random.sample(range(len(allsentences)),len(allsentences)*testsize/100)

			for i,s in enumerate(allsentences):
				toks+=len(s)
				if i in testselection:
					testf.write(treeToConll14Text(s)+"\n")
					emptyf.write(treeToEmptyConll14Text(s,lemma)+"\n")
					tec+=1
				else:
					trainf.write(treeToConll14Text(s)+"\n")
					trc+=1

			print "testing with",tec,"sentences. training with",trc,"sentences"
			tottec+=tec
			tottrc+=trc
			if not lemma: shutil.copyfile(os.path.join(infolder, empty), os.path.join(infolder, empty)+"_lem")
	print "tottec,tottrc,toks=",tottec,tottrc,toks
def degradeConllfile(conllfile, removeFuncs=["para"], removeDeps=0.4):
    trees = conll.conllFile2trees(conllfile)
    nbgovs = 0
    for arbre in trees:
        for i, node in arbre.iteritems():
            if "gov" in node and node["gov"].keys(
            )[0] != -1 and node["gov"].values()[0] not in removeFuncs:
                nbgovs += 1
    print int(nbgovs * removeDeps)
    tobeRemoved = sorted(random.sample(range(nbgovs),
                                       int(nbgovs * removeDeps)))
    print "nbgovs:", nbgovs, "tobeRemoved:", tobeRemoved
    nbgovs = 0
    for arbre in trees:
        for i, node in arbre.iteritems():
            if "gov" in node and node["gov"].keys()[0] != -1:
                if node["gov"].values()[0] in removeFuncs:
                    node["gov"] = {}
                else:
                    nbgovs += 1
                    if nbgovs in tobeRemoved:
                        node["gov"] = {}
    newname = conllfile
    if conllfile.endswith(".conll"): newname = conllfile[:-len(".conll")]
    shutil.move(conllfile, newname + ".orig")
    conll.trees2conllFile(trees, newname + ".deg", columns=10)
def passcompUp(infilename, outfilename):
    trees = conllFile2trees(infilename)
    print "read trees"
    with codecs.open(outfilename, "w", "utf-8") as outfile:
        for tree in trees:
            tree = completeInformation(tree)
            tree = passcomp(tree)
            for i, tokenid in enumerate(sorted(tree)):
                node = tree[tokenid]
                gov = node.get("gov", {}).items()
                govid, func = gov[0]
                outfile.write("\t".join([
                    str(tokenid),
                    node.get("t", "_"),
                    node.get("lemma", "_"),
                    node.get("lemma", "_"),
                    node.get("tag", "_"),
                    node.get("tag", "_"),
                    node.get("morph", "_"),
                    node.get("morph", "_"),
                    str(govid),
                    str(govid), func, func, "_", "_"
                ]) + "\n")

            outfile.write("\n")
Exemple #5
0
def funcsearch(infolder):
    """Collect tokens attached with function "dm" whose tag is not INT.

    Scans every file in infolder, counts (form, lemma, tag) triples and
    writes the frequency-sorted list to dm.txt.
    NOTE(review): node["gov"].values()[0] raises IndexError on nodes with
    an empty "gov" dict -- presumably such nodes never occur here; confirm.
    """
    errors = {}
    numerrors = 0
    for infile in sorted(glob.glob(os.path.join(infolder, "*"))):  # .conll
        if not os.path.isfile(infile): continue
        basename = os.path.basename(infile)
        print "reading", basename
        trees = conll.conllFile2trees(infile)

        for tree in trees:
            for i, node in tree.iteritems():

                # "dm" dependents not tagged INT are treated as suspicious
                if node["gov"].values()[0] in ["dm"] and node["tag"] not in [
                        "INT"
                ]:
                    key = " ".join([node["t"], node["lemma"], node["tag"]])
                    #print key,"!!!"
                    errors[key] = errors.get(key, 0) + 1
                    numerrors += 1
    with codecs.open("dm.txt", "w", "utf-8") as outf:
        # most frequent cases first
        for key in sorted(errors, key=errors.get, reverse=True):
            print errors[key], key
            outf.write(str(errors[key]) + " " + key + "\n")
        print "total of", numerrors, "cases"
        outf.write(" ".join(["total of", str(numerrors), "cases"]))
Exemple #6
0
def transform(infolder, outfolder, mixOldNew=True):
    """Apply the platinum() correction to every *.conll file in infolder.

    Writes the corrected trees to outfolder (each original tree kept right
    before its corrected copy when mixOldNew is True) and finally prints
    every token recorded by findSpaces().
    NOTE(review): several functions named transform exist in this file;
    later definitions shadow earlier ones at import time.
    """
    createNonExistingFolders(outfolder)
    spaceToks = {}
    #for infile in sorted(glob.glob(os.path.join(infolder,"test.conll"))):
    for infile in sorted(glob.glob(os.path.join(infolder, "*.conll"))):
        basename = os.path.basename(infile)
        print "reading", basename
        trees = conll.conllFile2trees(infile)
        newtrees = []
        for tree in trees:
            if mixOldNew: newtrees += [tree]  # keep the original before the corrected copy
            newtree = copy.deepcopy(tree)
            newtree = platinum(newtree)
            newtrees += [newtree]

            findSpaces(spaceToks, tree)  # record tokens containing spaces

        conll.trees2conllFile(newtrees,
                              os.path.join(outfolder, fixOutname(basename)))

    #corrdic = correctionDics("corrConll.txt")
    #for c in corrdic:
    #print c
    #qsdf
    for i, tok in enumerate(sorted(spaceToks)):
        print i, tok, spaceToks[tok]
def dicoTiretsRhapsodie():
    """Build a {token: tag} dictionary for the Rhapsodie gold corpus,
    covering only tokens that start with a hyphen; the recorded tag is the
    one attached to the first occurrence of the token."""
    tiret2tag = {}
    for arbre in conll.conllFile2trees("mate/fr/Rhaps.gold.conll14"):
        for nodeindex in arbre:
            node = arbre[nodeindex]
            token = node["t"]
            if token[0] == "-":
                # setdefault keeps the tag of the first occurrence
                tiret2tag.setdefault(token, node["tag"])
    return tiret2tag
def updateParseResult(projectname, conlldirpath, filepattern="*.trees.conll14", annotatorName="parser", removeToGetDB="-one-word-per-line.conll14_parse"):
	"""Store freshly parsed CoNLL files back into the project database.

	The database text name is recovered by stripping removeToGetDB from
	each file name; the file's sentences are then entered as annotations
	owned by annotatorName.  Stops at the first file whose text cannot be
	found in the database.
	"""
	sql = database.SQL(projectname)
	db,cursor=sql.open()
	print  "updateTrees:",glob(os.path.join(conlldirpath, filepattern))
	for filename in glob(os.path.join(conlldirpath, filepattern)):
		print "entering",filename
		sentences=conll.conllFile2trees(filename)
		dbtextname = os.path.basename(filename)[:-len(removeToGetDB)]
		textid = sql.enter(cursor, "texts",["textname"],(dbtextname,))

		if not textid:
			print "couldn't find the database named",textid
			return
		enterNewAnnotation(sql, db,cursor, sentences, textid, annotatorName=annotatorName)
Exemple #9
0
def transform(infolder, outfolder, mixOldNew=True):
    """Apply the platinum() correction to every *.conll file in infolder
    and write the results to outfolder.

    When mixOldNew is True each original tree is kept right before its
    corrected copy in the output.
    NOTE(review): redefines transform() declared earlier in this file.
    """
    createNonExistingFolders(outfolder)
    #for infile in sorted(glob.glob(os.path.join(infolder,"test.conll"))):
    for infile in sorted(glob.glob(os.path.join(infolder, "*.conll"))):
        basename = os.path.basename(infile)
        print "reading", basename
        trees = conll.conllFile2trees(infile)
        newtrees = []
        for tree in trees:
            if mixOldNew: newtrees += [tree]
            newtree = copy.deepcopy(tree)
            newtree = platinum(newtree)
            newtrees += [newtree]
        conll.trees2conllFile(newtrees,
                              os.path.join(outfolder, fixOutname(basename)))
def tagFuncConfig(infolder,pattern="*conll"):
	"""Print an Arborator configuration snippet: one colour line per tag
	and one style line per dependency function found in the corpus under
	infolder."""
	tags,funcs={},{}
	for infilename in glob.glob(os.path.join(infolder, pattern)):
		print infilename
		allsentences=conll.conllFile2trees(infilename)
		for tree in allsentences:
			for i in tree.keys():
				node = tree[i]
				gov = node.get("gov",{}).items()
				tag=node.get("tag",None)
				if tag: tags[tag]=None
				for govid,func in gov:
					 funcs[func]=None
	for tag in sorted(tags):
		print tag, '{"fill": "#69399d"}'
	for func in sorted(funcs):
		print func, '{"stroke": "#000000","stroke-width":"1","stroke-dasharray": ""}'
Exemple #11
0
def transform(infolder, outfolder, mixOldNew=False):
    """Apply correct() to every file in infolder and write the resulting
    trees to outfolder.

    When mixOldNew is True each original tree is kept right before its
    corrected copy.
    NOTE(review): redefines transform() declared earlier in this file;
    spaceToks is initialised but never used in this variant.
    """
    createNonExistingFolders(outfolder)
    spaceToks = {}
    #for infile in sorted(glob.glob(os.path.join(infolder,"test.conll"))):
    for infile in sorted(glob.glob(os.path.join(infolder, "*"))):
        if not os.path.isfile(infile): continue
        basename = os.path.basename(infile)
        print "reading", basename
        trees = conll.conllFile2trees(infile)
        newtrees = []
        for tree in trees:
            if mixOldNew: newtrees += [tree]
            newtree = copy.deepcopy(tree)
            newtree = correct(newtree)
            newtrees += [newtree]

        conll.trees2conllFile(newtrees,
                              os.path.join(outfolder, fixOutname(basename)))
def transform(infolder, outfolder, mixOldNew=False):
    """Apply the correction rules compiled from corrinst.txt to every file
    in infolder and write the corrected trees to outfolder.

    NOTE(review): redefines transform() declared earlier in this file.
    """
    createNonExistingFolders(outfolder)

    corrinst = compil('corrinst.txt')  # compile the correction rules once
    print len(corrinst), "rules"
    for infile in sorted(glob.glob(os.path.join(infolder, "*"))):
        if not os.path.isfile(infile): continue
        basename = os.path.basename(infile)
        print "reading", basename
        trees = conll.conllFile2trees(infile)
        newtrees = []
        for tree in trees:
            if mixOldNew: newtrees += [tree]
            newtree = copy.deepcopy(tree)
            newtree = correct(newtree, corrinst)
            newtrees += [newtree]

        conll.trees2conllFile(newtrees,
                              os.path.join(outfolder, fixOutname(basename)))
Exemple #13
0
def search(infolder,fun):
	"""Collect every tree of infolder for which fun(tree) is truthy and
	write them to <fun.__name__>.conll (only if at least one matched)."""
	goodtrees=[]
	print "doing", fun.__name__
	#try: os.mkdir(outdir)	
	#except OSError: pass
	for infile in sorted(glob.glob(os.path.join(infolder,"*"))): # .conll
		if not os.path.isfile(infile): continue
		basename=os.path.basename(infile)
		print "reading",basename
		trees = conll.conllFile2trees(infile)

		for tree in trees:
			#if hasVerbalDm(tree):
			#if isNonProjective(tree):
			if fun(tree):
				goodtrees+=[tree]
	print "found",len(goodtrees)
	if goodtrees:
		conll.trees2conllFile(goodtrees,fun.__name__+".conll")
def retokeniser(nomdufichier, path="", addtoout=""):
    """Re-tokenise a CoNLL file (numbering, compound numbers, multi-word
    expressions, clitics, inaudible tokens...) through a pipeline of
    correction passes, write the result to path + basename + addtoout and
    return that file name.

    Node-level differences between old and new trees are printed for
    inspection.
    """
    if not path:
        path, _ = os.path.split(
            nomdufichier)  # take the same path as the nomdufichier
    if path and path[-1] != "/": path = path + "/"
    trees = conll.conllFile2trees(nomdufichier)  # read the file
    print "le fichier", nomdufichier, "a", len(trees), "arbres"
    #newtrees, alltrees=[], []
    newtrees = []
    # reference list of digits/numbers used by digits() below
    digitsandnumbers = codecs.open(droporfeo + "lexique/gg", "r",
                                   "utf-8").read().split('\n')
    for i, arbre in enumerate(trees):  # loop over the trees
        #alltrees+=[copy.deepcopy(arbre)]
        #oldtree=copy.deepcopy(arbre)
        racines = addinfototree(arbre)
        oldtree = copy.deepcopy(arbre)
        # successive normalisation passes; each returns the modified tree
        arbre = corrigerNumerotation(arbre)
        arbre = nombresComposes(arbre)
        arbre = digits(arbre, digitsandnumbers)
        arbre = corrigerArbreCompos(
            arbre)  # split multi-word expressions
        #for i, node in arbre.items(): # reconfigure the children
        #if node["gov"] == {}:
        #print "crap"

        arbre = recomposerMultimots(arbre, expressions_multimots)
        arbre = corrigerNumerotationSplice(arbre)

        arbre = corrigerSegmentationClitiques(arbre, dico_clitiques)
        arbre = corrigerInaudibles(arbre)
        arbre = corrigerClitiques(arbre)
        arbre = retoken(arbre)
        if arbre != oldtree:
            # show which nodes changed
            print i
            for ii in arbre:
                if arbre[ii] != oldtree.get(ii, None):
                    print ii, arbre[ii]['t'], arbre[ii], oldtree.get(ii, None)
        newtrees.append(arbre)
    newname = path + os.path.basename(nomdufichier + addtoout)
    conll.trees2conllFile(newtrees, newname, columns=10)
    return newname
Exemple #15
0
def addArbitraryPuncs(infolder, outfolder):
    createNonExistingFolders(outfolder)
    for conllinfile in glob.glob(os.path.join(infolder, '*')):
        print conllinfile
        trees = conll.conllFile2trees(conllinfile)
        for i, tree in enumerate(trees):
            m = max(tree)
            splitcode = ".,!?;:()"
            p = splitcode[i % len(splitcode)]
            tree[m + 1] = {
                u'tag': u'PUNC',
                u'lemma': p,
                u't': p,
                'gov': {
                    0: u'punc'
                }
            }
        conll.trees2conllFile(trees,
                              os.path.join(outfolder,
                                           os.path.basename(conllinfile)),
                              columns=14)
def conll2phrasestructure(conllinfilename, phrasestructureoutname, args):
    """Convert a dependency CoNLL file into phrase structure.

    The output format depends on args: bracketing, pasted bracketing,
    LaTeX (qtree, compiled with pdflatex when available) or, by default,
    pretty-printed XML.  With args.graphs every constituent tree is also
    drawn on screen.
    """
    beginning = ""
    rhaps = False
    with codecs.open(conllinfilename, "r", "utf-8") as f:
        beginning = f.read(50)  # sniff the header to detect a Rhapsodie export
    if beginning.startswith("Text ID	Tree ID	Token ID"): rhaps = True
    elif args.orfeo: rhaps = "orfeo"
    trees = conll.conllFile2trees(conllinfilename, {"tag": "cat"}, rhaps=rhaps)
    #print len(trees),trees[0]
    ctrees, xmldoc = makePhraseStructure(trees)
    out = codecs.open(phrasestructureoutname, "w", "utf-8")
    if args.bracketing:
        for ctree in ctrees:
            out.write(ctree.pprint() + "\n\n")
    elif args.pasteBracketing:
        pasteBracketing(trees, ctrees, conllinfilename, out)
    elif args.latex:
        out.write(startlatex)
        for ctree in ctrees:
            out.write(
                unicode(ctree.pprint_latex_qtree()).replace("#", "\\#") +
                "\n\n")
        out.write(endlatex)
        try:
            import subprocess, webbrowser
            proc = subprocess.Popen(['pdflatex', phrasestructureoutname])
            proc.communicate()
            webbrowser.open_new_tab(
                os.path.abspath(
                    ".".join(phrasestructureoutname.split(".")[:-1]) + ".pdf"))
        # NOTE(review): bare except silently swallows every error here
        except:
            print "is pdflatex and qtree installed? try 'sudo apt-get install texlive-humanities'"

    else:
        out.write(xmldoc.toprettyxml())
    out.close()
    if args.graphs:
        for ctree in ctrees:
            ctree.draw()
def degradeConllfile(conllfile, removeFuncs=["para"], removeDeps=0.2):
    trees = conll.conllFile2trees(conllfile)
    nbgovs = 0
    for arbre in trees:
        for i, node in arbre.iteritems():
            if "gov" in node and node["gov"].keys(
            )[0] != -1 and node["gov"].values()[0] not in removeFuncs:
                nbgovs += 1
    print int(nbgovs * removeDeps)
    tobeRemoved = sorted(random.sample(range(nbgovs),
                                       int(nbgovs * removeDeps)))
    print nbgovs, tobeRemoved
    nbgovs = 0
    for arbre in trees:
        for i, node in arbre.iteritems():
            if "gov" in node and node["gov"].keys(
            )[0] != -1 and node["gov"].values()[0] not in removeFuncs:
                nbgovs += 1
                if nbgovs in tobeRemoved:
                    node["gov"] = {}
    shutil.move(conllfile, conllfile + ".orig")
    conll.trees2conllFile(trees, conllfile, columns=10)
Exemple #18
0
def search(infolder):
    """Find NOM-tagged tokens whose lemma is absent from the lexicon.

    Scans every file of infolder, counts the problematic
    (form, lemma, tag) triples and writes them, most frequent first, to
    problems.txt.
    NOTE(review): redefines search() declared earlier in this file.
    """
    allLem = lireToutLex()  # presumably lemma -> list of admissible tags; confirm
    errors = {}
    numerrors = 0
    for infile in sorted(glob.glob(os.path.join(infolder, "*"))):  # .conll
        if not os.path.isfile(infile): continue
        basename = os.path.basename(infile)
        print "reading", basename
        trees = conll.conllFile2trees(infile)

        for tree in trees:
            #if hasVerbalDm(tree):
            #if isNonProjective(tree):
            for i, node in tree.iteritems():
                #if (node["lemma"] in allLem and node["tag"] in allLem[node["lemma"]]) and node["tag"] not in ["NOM"] and allLem.get(node["lemma"],None):
                #pass
                ##print node["t"],node["lemma"],node["tag"],"ok"
                #else:
                # NOM token whose lemma has no tags at all in the lexicon
                if node["tag"] in ["NOM"] and node["tag"] not in allLem.get(
                        node["lemma"], []) and allLem.get(node["lemma"],
                                                          []) == []:
                    key = " ".join([
                        node["t"], node["lemma"], node["tag"], "lexique:",
                        " ".join(
                            sorted(
                                set(
                                    allLem.get(node["lemma"],
                                               ["pas dans le lexique"]))))
                    ])
                    #print key,"!!!"
                    errors[key] = errors.get(key, 0) + 1
                    numerrors += 1
    with codecs.open("problems.txt", "w", "utf-8") as outf:
        # most frequent cases first
        for key in sorted(errors, key=errors.get, reverse=True):
            print errors[key], key
            outf.write(str(errors[key]) + " " + key + "\n")
        print "total of", numerrors, "errors"
        outf.write(" ".join(["total of", str(numerrors), "errors"]))
Exemple #19
0
def split(conllfile, maxi):
    """Split a CoNLL file into chunks of at most maxi trees; chunk number j
    is written to <conllfile>j (0-based)."""
    trees = conll.conllFile2trees(conllfile)
    chunknumber = 0
    for start in range(0, len(trees), maxi):
        chunk = trees[start:start + maxi]
        conll.trees2conllFile(chunk, conllfile + str(chunknumber))
        chunknumber += 1
Exemple #20
0
def fusionForgottenTrees(project="Platinum",
                         fusdir="../projects/OrfeoGold2016/platinum/*",
                         annotators=["admin"]):
    """
	Take trees from `project`, ordered by annotators; whenever a database
	tree exists for a sentence it replaces the corresponding tree read
	from the files in fusdir.  The fused result is written with the
	extension "cool.conll" and the list of output files is returned.
	(other annotator examples: "Sy", "Marion")
	"""

    #print lemmacorrection
    sys.path.insert(0, '../tools')
    import difflib
    outfiles = []
    sql = SQL(project)
    db, cursor = sql.open()
    goodTexts = {}
    outdir = os.path.join("..", "projects", project, "exportcool")
    try:
        os.mkdir(outdir)
    except OSError:
        pass  # output directory already exists
    for annotator in annotators:
        print[
            list(
                cursor.execute("select rowid from users where user =?;",
                               (annotator, )))
        ]
    # map annotator names to their database row ids
    annotatorIds = tuple(a for (a, ) in [
        list(
            cursor.execute("select rowid from users where user =?;", (
                annotator, )))[0] for annotator in annotators
    ])
    print annotators, annotatorIds

    for textid, textname, nrtokens in list(
            cursor.execute("select rowid, * from texts;")):  # for each text
        print "\n__________________________doing", textname, "with", nrtokens, "tokens"
        # nrutids: sentence number -> {userid: treeid}
        nrutids = {}
        for nr, userid, treeid in list(
                cursor.execute(
                    "select nr,userid,trees.rowid as treeid from trees, sentences where sentenceid=sentences.rowid and userid in {annotatorIds} and  textid = ? order by nr;"
                    .format(annotatorIds=annotatorIds), (textid, ))):
            nrutids[nr] = nrutids.get(nr, {})
            nrutids[nr][userid] = treeid
        trees = {}
        for nr in sorted(nrutids):  # for each sentence
            tree = None
            # the first annotator (in the order given) who has a tree wins
            for aid in annotatorIds:  # for each interesting annotator id
                if aid in nrutids[nr]:
                    tree = sql.gettree(treeid=nrutids[nr][aid],
                                       indb=db,
                                       incursor=cursor)["tree"]
                    trees[nr] = tree
                    #print "atree:",tree
                    break
            #if not tree:
            #print "problem: no tree for nr",nr,"type",type(nr)
            #print "annotatorIds",annotatorIds
            #raise Exception('no tree', nr)
        #print trees
        print len(trees), "trees from", project
        print textname, textname.split(".")[0]
        btextname = os.path.basename(textname).split(".")[0]
        if btextname.endswith("-one-word-per-line"):
            btextname = btextname[:-len("-one-word-per-line")]
        #print glob.glob(fusdir),[os.path.basename(fi).split(".")[0] for fi in glob.glob(fusdir)]
        cooltrees = []
        ptrees, ftrees = 0, 0
        for fi in glob.glob(fusdir):
            if btextname == os.path.basename(fi).split(".")[0]:
                print "yes", btextname
                fustrees = conll.conllFile2trees(fi)
                print len(fustrees), "ftrees", fi
                for nr, ftree in enumerate(fustrees):
                    if nr + 1 in trees:
                        # a database tree exists for this sentence: prefer it
                        #print "added tree",nr+1,"from database"
                        #ptree=platinum(trees[nr+1])
                        ptree = trees[nr + 1]
                        for iii in ptree:
                            ptree[iii]["tag2"] = "_"
                            if ptree[iii]["lemma"] in lemmacorrection:
                                ptree[iii]["lemma"] = lemmacorrection[
                                    ptree[iii]["lemma"]]
                        cooltrees += [ptree]
                        #print nr+1,"tree from",project#,tree
                        ptrees += 1
                        # warn when database tree and file tree disagree on the sentence text
                        if ftree.sentence() != u" ".join(
                            [ptree[i].get("t", "") for i in sorted(ptree)]):
                            print "\n_________", nr + 1
                            print ftree.sentence()
                            print u" ".join(
                                [ptree[i].get("t", "") for i in sorted(ptree)])
                            #for l in difflib.context_diff(ftree.sentence() ,u" ".join([ptree[i].get("t","") for i in sorted(ptree)])):print l

                        #print "dbtree",platinum(trees[nr+1])
                    else:
                        # no database tree: keep the file tree, normalised
                        for iii in ftree:
                            ftree[iii]["tag2"] = "_"
                            if ftree[iii]["lemma"] in lemmacorrection:
                                ftree[iii]["lemma"] = lemmacorrection[
                                    ftree[iii]["lemma"]]
                        #print nr+1,"tree from",fusdir#,tree
                        ftrees += 1
                        cooltrees += [ftree]
                        #print "added tree",nr+1,"from fustrees",fi
                outfile = os.path.join(outdir, textname + ".cool.conll")
                conll.trees2conllFile(cooltrees, outfile=outfile, columns=10)
                print "wrote", outfile
                print ptrees, "ptrees, ", ftrees, "ftrees"
                break
        if len(cooltrees) == 0: print "nothing for", btextname
        # NOTE(review): outfile is unbound here if no file in fusdir ever matched
        outfiles += [outfile]
        #qsdf
    return outfiles
    # NOTE(review): everything below the return above is unreachable dead
    # code, apparently pasted from another function -- "sortable" and
    # "prefix" are not defined in this scope.
    new_trees = list()
    for nr, tree in sortable:

        # adding metadata: this renames sent_id, starting from 0
        tree.sentencefeatures["text"] = tree.sentence()
        tree.sentencefeatures["sent_id"] = prefix + "_" + str(nr - 1)

        # removing useless metadata
        del tree.sentencefeatures["nr"]
        new_trees.append(tree)
    conll.trees2conllFile(new_trees, outfile)


if __name__ == "__main__":

    ## Open project database

    sql = SQL("NaijaSUD")  # project name
    db, cursor = sql.open()

    ## Use 2 functions :
    # - exportLastBestAnnotations in lib/database.py -> writes a file with trees and their rank
    # - reorder in lib/yuchen.py -> reorder trees based on their rank, write a file with the output

    users, c = sql.exportLastBestAnnotations(
        115, "P_ABJ_GWA_06_Ugo-lifestory_PRO"
    )  # text id and text name; the text id can be read from the URL https://arborator.ilpga.fr/editor.cgi?project=NaijaSUD&textid=74&opensentence=1
    print(users, c)
    fpath = "E:/TAL/Stage/arborator/projects/NaijaSUD/export/P_ABJ_GWA_06_Ugo.lifestory_PRO.most.recent.trees.with.feats.conllu"  # path of the exported file
    trees = conll.conllFile2trees(fpath)  # re-read the conll trees to reorder them and rename sent_id
    reorder(trees, fpath + "_reordered")
Exemple #22
0
            q[i:i + maxiAtOnce] for i in xrange(0, len(q), maxiAtOnce)
    ]:
        res = service.translations().list(source=source,
                                          target=target,
                                          q=wordgroup).execute()
        translations += [td["translatedText"] for td in res["translations"]]

    return translations


#translate(u"准许 一 位 人士 入境 的 权力".split())

# Top-level script: for every CONV* file, store the pinyin reading of each
# token in tag2 and build a token -> translation dictionary via translate().
for conllinfile in glob.glob(os.path.join("corpus/conll/", 'CONV*.*')):

    print conllinfile
    trees = conllFile2trees(conllinfile)
    path, base = os.path.split(conllinfile)
    translateDic = {}
    counter = 0
    for tree in trees:
        for i, node in tree.iteritems():
            node["tag2"] = pinyin.get(node["t"])  # pinyin reading of the token
            translateDic[node["t"]] = None  # collect unique tokens
        counter += 1
        if not counter % 100: print counter, "trees"

    words = sorted(translateDic)
    print len(words), "words"
    trads = translate(words)
    translateDic = dict(zip(words, trads))  # token -> translation
    print len(translateDic), "translations"
                langConllFiles[lcode] = langConllFiles.get(
                    lcode, []) + [os.path.join(dirpath, f)]
    #print langConllFiles
    return langConllFiles


conllfiles = getAllConllFiles("../sud-treebanks-v2.4")

#print()
unigrams = {}
bigrams = {}
bigtype = {}
for fi in tqdm.tqdm(conllfiles['ja']):
    if "FTB" in fi: continue
    print('analyzing', fi)
    trees = conll.conllFile2trees(fi)

    for tree in trees:
        toks = [
            tree[i]['t'] for i in sorted(tree.keys())
            if tree[i]['tag'] != "PUNCT"
        ]
        for t in toks:
            unigrams[t] = unigrams.get(t, 0) + 1
        maxtree = max(tree.keys())
        for i in sorted(tree):
            if i == maxtree: continue
            na = tree[i]
            nb = tree[i + 1]
            a = na['t']
            b = nb['t']
Exemple #24
0
def parsing(infile,
            lemodel="LemModel",
            tagmodel="TagModel",
            parsemodel="ParseModel",
            outfolder="parses",
            memory="4G",
            depparse=True,
            parserType="graph",
            lemmatized=False):
    """
	Run the mate pipeline (lemmatizer, tagger, optional dependency parser)
	on infile, producing <outfolder>/<basename>_lem/_tag/_parse, and
	return the path of the _parse file.

	If lemodel is empty or ends with "/", the lemmatizing step is skipped:
	the lemma file is either copied from infile (lemmatized=True) or built
	by using the tokens as lemmas.
	NOTE(review): relies on a module-level "verbose" flag.
	TODO: parserType is ignored
	"""

    print "parsing now i'm at os.getcwd()", os.getcwd(), os.path.dirname(
        __file__), __file__

    if outfolder[-1] != "/": outfolder = outfolder + "/"
    outfile = outfolder + os.path.basename(infile)

    # jar path and the three mate class names for the java invocations
    anna, lemclass, tagclass, parseclass = getClasses("graph")

    #anna = ".:./mate/mate_components/anna-3.3.jar"
    #anna = os.path.join(os.path.dirname(__file__), 'mate/mate_components/anna-3.3.jar')
    #print anna
    #print "/home/kim/Dropbox/programmation/arborator/trunk/tools/mate/mate_components/anna-3.3.jar"
    #assert anna=="/home/kim/Dropbox/programmation/arborator/trunk/tools/mate/mate_components/anna-3.3.jar"
    #print anna
    #anna = "anna-3.3.jar"
    #anna =        "/home/kim/Dropbox/programmation/arborator/trunk/tools/mate/mate_components/anna-3.3.jar"
    #print anna
    #lemodel="../platinum.2017-03-26_03:10/models/LemModel"
    #print lemodel
    #print "/home/kim/Dropbox/programmation/arborator/trunk/tools/mate/platinum.2016-11-29_00:22/models/LemModel"
    #assert lemodel=="/home/kim/Dropbox/programmation/arborator/trunk/tools/mate/platinum.2016-11-29_00:22/models/LemModel"
    #lemodel="/home/kim/Dropbox/programmation/arborator/trunk/tools/mate/platinum.2016-11-29_00:22/models/LemModel"
    #infile="/home/kim/Dropbox/programmation/arborator/trunk/tools/parses/upload.fr.txt.sents.conll"

    # shell command lines for the three java stages
    lemcommand = "java -Xmx{memory} -cp {anna} {lemclass} -model {lemodel} -test {infile} -out {outfile}_lem".format(
        memory=memory,
        anna=anna,
        lemclass=lemclass,
        infile=infile,
        lemodel=lemodel,
        outfile=outfile)
    tagcommand = "java -Xmx{memory} -cp {anna} {tagclass} -model {tagmodel} -test {outfile}_lem -out {outfile}_tag".format(
        memory=memory,
        anna=anna,
        tagclass=tagclass,
        tagmodel=tagmodel,
        outfile=outfile)
    parsecommand = "java -Xmx{memory} -cp {anna} {parseclass} -model {parsemodel} -test {outfile}_tag -out {outfile}_parse".format(
        memory=memory,
        anna=anna,
        parseclass=parseclass,
        parsemodel=parsemodel,
        outfile=outfile)

    # a lemodel ending in "/" (a directory) means "no lemmatizer model"
    if lemodel and lemodel[-1] != "/":
        if verbose: print "\n\n========== lemmatizing...", lemcommand
        p1 = subprocess.Popen(
            [lemcommand],
            shell=True,
            stdout=subprocess.PIPE,
            cwd=
            '/home/kim/Dropbox/programmation/arborator/trunk/tools/mate/mate_components'
        )
        out, err = p1.communicate()
        if verbose:
            print out, err
    else:
        if lemmatized:
            # input already carries lemmas: just duplicate it as the _lem file
            print "copying", outfolder + os.path.basename(
                infile), "as lemma file"
            shutil.copyfile(outfolder + os.path.basename(infile),
                            outfolder + os.path.basename(infile) + "_lem")
        else:
            # no lemmas available: write tokens in place of lemmas
            print "adding toks as lems", outfolder + os.path.basename(infile)
            trees = conll.conllFile2trees(infile)
            with codecs.open(outfolder + os.path.basename(infile) + "_lem",
                             "w", "utf-8") as lemf:
                for tree in trees:
                    lemf.write(
                        newconvert.treeToEmptyConll14Text(tree, lemma=False) +
                        "\n")
    if verbose:
        print "\n\n========== tagging...", tagcommand
    p1 = subprocess.Popen([tagcommand], shell=True, stdout=subprocess.PIPE)
    out, err = p1.communicate()
    if verbose: print out, err
    if depparse:
        if verbose: print "\n\n========== dep analysis...", parsecommand
        p1 = subprocess.Popen([parsecommand],
                              shell=True,
                              stdout=subprocess.PIPE)
        print p1.stdout.read()
        if verbose: print "\n\n========== parsed"
        #java -Xmx40G -cp mate/mate_components/anna-3.3.jar is2.parser.Parser -model ParseModel -test annotatedCorpora.corr.conll_tag -out naija_parse
        #anna = ""
        #lemclass = "is2.lemmatizer.Lemmatizer"
        #tagclass = "is2.tag.Tagger"

        #if lang=="graph":
        #parseclass = ""

    # sanity check on the produced parse file
    if checkIntegrity(outfile + '_parse') == False:
        print "*********ERROR IN FILE", outfile + "_parse", "Please Try again*********"
    return outfile + "_parse"
Exemple #25
0
def trainingEvaluationParsing(project=u"OrfeoGold2016",
                              parserType="graph",
                              whoseTrees="validator",
                              evaluationPercent=10,
                              additionnalLexicon=None,
                              resultAnnotator="mate",
                              getFromFolder=False,
                              parseDB=False,
                              memory="40G",
                              stopOnError=False):
    """
	Full train / evaluate / parse pipeline for a project.

	Steps:
	  1. collect gold (validated) trees, either from the database or from
	     the folder given in getFromFolder;
	  2. write training files, train partial models on a train/test split
	     for evaluation, then train full models on all gold trees;
	  3. if parseDB, parse the project's texts with the full models and
	     push the results back into the database under resultAnnotator.

	If additionnalLexicon is given, it is joined to the training file for
	lemmatization and tagging (not for the dependency training).
	Change the default memory here!
	todo:
		- add function to choose parser type (lang=)
		- create mate.log to follow progress (end = "Ready.")
	"""
    mateLogs("Begin")
    ti = time.time()  # wall-clock start; total duration is reported at the end

    if getFromFolder:
        parseDB = False  # TODO: correct this so that all options are available
    # fall back to the documented defaults when callers pass None/empty values
    parserType = (parserType or "graph")
    whoseTrees = whoseTrees or "validator"
    evaluationPercent = evaluationPercent or 10
    resultAnnotator = resultAnnotator or "mate"

    try:
        os.chmod("mate/parse.log", 0666)  # just in case...
    except:
        # best effort: the log may not exist yet or belong to another user
        pass

    # NOTE(review): timestamp appears unused in the rest of this function —
    # confirm whether it is needed (e.g. by a helper) before removing it
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H:%M')

    #####
    # set up the dated working directory and its subdirectories
    #####

    basepath = createDailyPath("./mate/", project)
    if parseDB:
        backupbase = backupOldDatabase(project, basepath)
        mateLogs(
            "A copy of the database has been stored in {backupbase}. Getting validated trees..."
            .format(backupbase=backupbase))
    traindir = createDirectory(basepath + "training")
    modeldir = createDirectory(basepath + "models")
    logdir = createDirectory(basepath + "logs")
    parsedir = createDirectory(basepath + "parses")

    #####
    # getting gold trees for training
    #####

    if getFromFolder:  # getFromFolder contains folder name containing only conll files
        error = False
        goldtrees = []
        for infile in glob.glob(os.path.join(getFromFolder, '*')):
            if os.path.isfile(infile):
                print "reading", infile
                gtrees = conll.conllFile2trees(infile)
                for tree in gtrees:
                    # drop tokens whose governor index points outside the
                    # sentence (0 = root is allowed, anything beyond the
                    # sentence length is not)
                    problemkeys = []
                    for i in tree:
                        for gi in tree[i]["gov"]:
                            if not 0 <= gi <= len(tree):
                                print infile
                                print tree
                                print "has a problematic governor:", gi
                                error = True
                                problemkeys += [i]
                    for problemk in problemkeys:
                        del tree[problemk]
                goldtrees += gtrees
        if error and stopOnError: sys.exit()
    else:
        # default: pull validated trees out of the project database
        goldtrees = trees2train.getValidatedTrees(project, basepath,
                                                  whoseTrees)
    mateLogs(
        u"{nrtrees} validated trees extracted".format(nrtrees=len(goldtrees)))

    lemma = None
    if goldtrees:  # see whether the first token of the first tree has a lemma. if lemma==None: we'll skip lemmatization
        lemma = goldtrees[0][sorted(goldtrees[0])[0]].get(
            "lemma", None)  # just trying to get the first lemma value
        if lemma == "_": lemma = None  # "_" is the conll placeholder for "no lemma"
        print "found lemma in first tree:", lemma
        #TODO: do something here: double tokens as lemmas for chinese, see function makeTrainTestSets
    else:
        print "no trees from:", getFromFolder
        sys.exit()
    print "goldtrees:", len(goldtrees)

    #####
    # creating training files
    #####

    # all gold trees go into the dependency training file; makeTrainTestSets
    # then splits off evaluationPercent% into test/emptytest files
    alldeptraining = traindir + "alldeptraining.conll"
    conll.trees2conllFile(goldtrees, alldeptraining, columns=14)
    traintrees = makeTrainTestSets(traindir,
                                   pattern=os.path.basename(alldeptraining),
                                   train="partialdeptrain.conll",
                                   test="test.conll",
                                   empty="emptytest.conll",
                                   testsize=int(evaluationPercent),
                                   lemma=lemma)
    print "traintrees:", len(traintrees)
    if additionnalLexicon:
        # the lexicon is appended only to the lemmatization/tagging training
        # material, never to the dependency training files
        lexicontrees = conll.conllFile2trees(additionnalLexicon)
        print "lexicontrees:", len(lexicontrees)
        alllemtagtrain = traindir + "alllemtagtrain.conll"
        conll.trees2conllFile(goldtrees + lexicontrees,
                              alllemtagtrain,
                              columns=14)
        partiallemtagtrain = traindir + "partiallemtagtrain.conll"
        conll.trees2conllFile(traintrees + lexicontrees,
                              partiallemtagtrain,
                              columns=14)
    else:
        # no extra lexicon: lem/tag training uses the same files as dep training
        alllemtagtrain = alldeptraining
        partiallemtagtrain = traindir + "partialdeptrain.conll"

    #creating files used for evaluation
    #if isinstance(evaluationPercent, str): evaluationPercent = int(evaluationPercent)
    mateLogs("trainfiles created")
    if verbose:
        # sanity check: make sure the generated conll files parse back cleanly
        print "just testing whether i can load them..."
        conll.conllFile2trees(traindir + "partialdeptrain.conll")
        conll.conllFile2trees(traindir + "emptytest.conll")
        conll.conllFile2trees(traindir + "test.conll")

    # train models on the partial (train-only) split, then parse the held-out
    # test sentences so they can be scored against the gold test file
    mateLogs("training of partial tree file for evaluation... ====")
    lemodelpartial, tagmodelpartial, parsemodelpartial = makeTrainingModels(
        basepath,
        lemtagin=partiallemtagtrain,
        depin=traindir + "partialdeptrain.conll",
        outfolder=modeldir,
        memory=memory,
        testfile=traindir + "emptytest.conll",
        evalfile=traindir + "test.conll",
        lemma=lemma,
        parserType=parserType)
    mateLogs("evaluation...")
    #evaluation
    evaluFileName = detailedEvaluation(parserType=parserType,
                                       memory=memory,
                                       testfile=traindir +
                                       "emptytest.conll_parse",
                                       evalfile=traindir + "test.conll",
                                       path=logdir,
                                       evaluationPercent=evaluationPercent)

    # keep the evaluation report in memory: it is appended to later log messages
    evalu = unicode(evaluFileName) + "\n"
    with codecs.open(evaluFileName, "r", "utf-8") as f:
        evalu += f.read()

    #full training: retrain on ALL gold trees for the production models
    mateLogs("training of full tree file for parsing... ====")
    lemodel, tagmodel, parsemodel = makeTrainingModels(basepath,
                                                       lemtagin=alllemtagtrain,
                                                       depin=alldeptraining,
                                                       outfolder=modeldir,
                                                       memory=memory,
                                                       lemma=lemma,
                                                       parserType=parserType)
    #getting texts to parse
    mateLogs(
        "training and evaluation complete. Starting the parse...\n\n{evalu}".
        format(evalu=evalu))
    #filenames=getTextsForParsing.main(project, parsedir)
    if parseDB:
        filenames = getTextsForParsing.extractConllFiles(project, parsedir)
        #parsing
        for infile in filenames:
            #mateLogs("Training and evaluation complete. Starting the parse of {infile}\n\n{evalu}".format(infile=infile, evalu=evalu))
            mateLogs(
                "Training and evaluation complete. Starting the parse of {}\n\n"
                .format(infile))
            parsedfile = parsing(infile,
                                 lemodel=lemodel,
                                 tagmodel=tagmodel,
                                 parsemodel=parsemodel,
                                 outfolder=parsedir,
                                 parserType=parserType,
                                 memory=memory)
            #update on base: store the parse result under resultAnnotator
            newname = os.path.basename(parsedfile)
            updateTrees.updateParseResult(project,
                                          parsedir,
                                          filepattern=newname,
                                          annotatorName=resultAnnotator,
                                          removeToGetDB="_parse")

    # make it easy for everyone to erase all this stuff:
    for root, dirs, files in os.walk(basepath):
        for momo in dirs:
            try:
                os.chmod(os.path.join(root, momo), 0777)
            except:
                # best effort: ignore files/dirs we don't own
                pass
        for momo in files:
            try:
                os.chmod(os.path.join(root, momo), 0666)
            except:
                pass

    totaltime = (time.time() - ti) / 60
    mateLogs(
        "Ready. It took {totaltime} minutes for the whole process\n\n{evalu}".
        format(totaltime=round(totaltime, 1), evalu=evalu))
# Exemple #26
# 0
def parsing(infile,
            lemodel="LemModel",
            tagmodel="TagModel",
            parsemodel="ParseModel",
            outfolder="parses",
            memory="4G",
            depparse=True,
            parserType="graph",
            lemmatized=False):
    """
	parsing function
	TODO: parserType is ignored
	"""
    if outfolder[-1] != "/": outfolder = outfolder + "/"
    outfile = outfolder + os.path.basename(infile)

    anna, lemclass, tagclass, parseclass = getClasses("graph")

    lemcommand = "java -Xmx{memory} -cp {anna} {lemclass} -model {lemodel} -test {infile} -out {outfile}_lem".format(
        memory=memory,
        anna=anna,
        lemclass=lemclass,
        infile=infile,
        lemodel=lemodel,
        outfile=outfile)
    tagcommand = "java -Xmx{memory} -cp {anna} {tagclass} -model {tagmodel} -test {outfile}_lem -out {outfile}_tag".format(
        memory=memory,
        anna=anna,
        tagclass=tagclass,
        tagmodel=tagmodel,
        outfile=outfile)

    parsecommand = "java -Xmx{memory} -cp {anna} {parseclass} -model {parsemodel} -test {outfile}_tag -out {outfile}_parse".format(
        memory=memory,
        anna=anna,
        parseclass=parseclass,
        parsemodel=parsemodel,
        outfile=outfile)

    if lemodel and lemodel[-1] != "/":
        if verbose: print "\n\n========== lemmatizing...", lemcommand
        p1 = subprocess.Popen([lemcommand], shell=True, stdout=subprocess.PIPE)
        out, err = p1.communicate()
        if verbose:
            print out, err
    else:
        if lemmatized:
            print "copying", outfolder + os.path.basename(
                infile), "as lemma file"
            shutil.copyfile(outfolder + os.path.basename(infile),
                            outfolder + os.path.basename(infile) + "_lem")
        else:
            print "adding toks as lems", outfolder + os.path.basename(infile)
            trees = conll.conllFile2trees(infile)
            with codecs.open(outfolder + os.path.basename(infile) + "_lem",
                             "w", "utf-8") as lemf:
                for tree in trees:
                    lemf.write(
                        newconvert.treeToEmptyConll14Text(tree, lemma=False) +
                        "\n")
    if verbose:
        print "\n\n========== tagging...", tagcommand
    p1 = subprocess.Popen([tagcommand], shell=True, stdout=subprocess.PIPE)
    out, err = p1.communicate()
    if verbose: print out, err
    if depparse:
        if verbose: print "\n\n========== dep analysis...", parsecommand
        p1 = subprocess.Popen([parsecommand],
                              shell=True,
                              stdout=subprocess.PIPE)
        print p1.stdout.read()
        if verbose: print "\n\n========== parsed"

    if checkIntegrity(outfile + '_parse') == False:
        print "*********ERROR IN FILE", outfile + "_parse", "Please Try again*********"
    return outfile + "_parse"