Example #1
0
def exportUniqueSentences(project, mode="lasttree", pattern=False):
    """
    Export one tree per unique sentence: the first time a sentence (token
    sequence) is encountered, its newest tree (max timestamp) is kept.

    project -- project name under ../projects/
    mode    -- unused here; kept for interface compatibility
    pattern -- optional SQL LIKE pattern on textname
               (NOTE(review): interpolated unescaped into the query --
               only pass trusted values)

    Returns the path of the written export/allSentences.conll file.
    """
    sql = SQL(project)
    db, cursor = sql.open()
    sentences = {}  # toks -> tree (first/newest tree seen for that token tuple)
    outdir = os.path.join("..", "projects", project, "export")
    try:
        os.mkdir(outdir)
    except OSError:
        pass  # export folder already exists
    outfile = os.path.join(outdir, "allSentences.conll")
    # one result row per sentence: its most recent tree (max(timestamp))
    if pattern:
        command = """select trees.rowid,userid,max(timestamp) from trees, sentences, texts where texts.rowid=sentences.textid and sentences.rowid=trees.sentenceid
		and textname like "{pattern}"
		group by sentenceid order by trees.rowid;""".format(pattern=pattern)
    else:
        command = """select trees.rowid,userid,max(timestamp) from trees, sentences, texts where texts.rowid=sentences.textid and sentences.rowid=trees.sentenceid
		group by sentenceid order by trees.rowid;"""
    for i, (
            treeid,
            userid,
            timestamp,
    ) in enumerate(cursor.execute(command).fetchall()):
        tree = sql.gettree(treeid=treeid, indb=db, incursor=cursor)["tree"]
        toks = tuple(tree[i]["t"] for i in tree)  # sentence identity = its token sequence
        print "___", i, "\r",
        if toks not in sentences:
            sentences[toks] = tree  # keep only the first tree found for these tokens
    print "writing file with", len(sentences), "sentences..."
    conll.trees2conllFile([sentences[toks] for toks in sorted(sentences)],
                          outfile=outfile,
                          columns=10)
    return outfile
Example #2
0
def transform(infolder, outfolder, mixOldNew=True):
    createNonExistingFolders(outfolder)
    spaceToks = {}
    #for infile in sorted(glob.glob(os.path.join(infolder,"test.conll"))):
    for infile in sorted(glob.glob(os.path.join(infolder, "*.conll"))):
        basename = os.path.basename(infile)
        print "reading", basename
        trees = conll.conllFile2trees(infile)
        newtrees = []
        for tree in trees:
            if mixOldNew: newtrees += [tree]
            newtree = copy.deepcopy(tree)
            newtree = platinum(newtree)
            newtrees += [newtree]

            findSpaces(spaceToks, tree)

        conll.trees2conllFile(newtrees,
                              os.path.join(outfolder, fixOutname(basename)))

    #corrdic = correctionDics("corrConll.txt")
    #for c in corrdic:
    #print c
    #qsdf
    for i, tok in enumerate(sorted(spaceToks)):
        print i, tok, spaceToks[tok]
def makeConllUfromNaijaFile(infilename):
    trees = []
    tree = {}
    with codecs.open(infilename, "r", "utf-8") as infile:
        for line in infile:
            #print "$$$$",line
            line = line.strip()
            if line and line[0] != "#":
                cells = line.split('\t')
                nrCells = len(cells)
                if nrCells != 10:
                    print line
                    continue
                nr, t, x, tag = cells[:4]
                nr = int(nr)
                newf = {'id': nr, 't': t, 'tag': tag}
                x = x.strip()
                if "=" in x:
                    mf = dict([(av.split("=")[0], av.split("=")[-1])
                               for av in x.split("|")])
                    newf = update({"features": mf}, newf)

                elif x != ".":
                    newf = update({"lemma": x}, newf)
                if nr == 1:
                    trees += [tree.copy()]
                    tree = {}
                tree[nr] = update(tree.get(nr, {}), newf)
    print len(trees), "trees"
    trees2conllFile(trees,
                    os.path.basename(infilename).split(".")[0] + ".conllu",
                    columns="u")
def degradeConllfile(conllfile, removeFuncs=["para"], removeDeps=0.4):
    trees = conll.conllFile2trees(conllfile)
    nbgovs = 0
    for arbre in trees:
        for i, node in arbre.iteritems():
            if "gov" in node and node["gov"].keys(
            )[0] != -1 and node["gov"].values()[0] not in removeFuncs:
                nbgovs += 1
    print int(nbgovs * removeDeps)
    tobeRemoved = sorted(random.sample(range(nbgovs),
                                       int(nbgovs * removeDeps)))
    print "nbgovs:", nbgovs, "tobeRemoved:", tobeRemoved
    nbgovs = 0
    for arbre in trees:
        for i, node in arbre.iteritems():
            if "gov" in node and node["gov"].keys()[0] != -1:
                if node["gov"].values()[0] in removeFuncs:
                    node["gov"] = {}
                else:
                    nbgovs += 1
                    if nbgovs in tobeRemoved:
                        node["gov"] = {}
    newname = conllfile
    if conllfile.endswith(".conll"): newname = conllfile[:-len(".conll")]
    shutil.move(conllfile, newname + ".orig")
    conll.trees2conllFile(trees, newname + ".deg", columns=10)
Example #5
0
def lastTreeForAllSamples(project, onlyHuman=True, combine=False):
    outdir = os.path.join("..", "projects", project, "export")
    try:
        os.mkdir(outdir)
    except OSError:
        pass
    sql = SQL(project)
    db, cursor = sql.open()
    if onlyHuman:
        parserid = 0
        for pid, in cursor.execute(
                "select rowid from users where user='******';"):
            parserid = pid
    else:
        parserid = -1
    sents = sorted(
        cursor.execute(
            "select texts.textname, sentences.rowid, sentences.nr from sentences, texts where texts.rowid=sentences.textid;"
        ).fetchall())
    print "todo:", len(sents), "sentences"
    pbar = tqdm.tqdm(total=len(sents))
    annotators = {}

    if combine:
        trees = []
        getTreesForSents(sents,
                         trees,
                         annotators,
                         parserid,
                         cursor,
                         db,
                         sql,
                         pbar,
                         project=project)
        outfile = os.path.join(outdir,
                               project + ".lastHumanTreeForAllSamples.conllu")
        conll.trees2conllFile(trees, outfile=outfile)
        print "wrote", outfile

    else:
        for tid, textname, nrtokens in list(
                cursor.execute("select rowid, * from texts;")):
            print tid, textname, nrtokens
            sents = list(
                cursor.execute(
                    "select rowid, * from sentences where textid=?;",
                    (tid, )).fetchall())
            trees = []
            getTreesForSents(sents, trees, annotators, parserid, cursor, db,
                             sql, pbar)
            if textname.endswith(".conll_parse"):
                textname = textname[:len(".conll_parse")]
            outfile = os.path.join(outdir, textname + ".lastHumanTrees.conllu")
            conll.trees2conllFile(trees, outfile=outfile)
            print "wrote", outfile
    for a in annotators:
        print a, annotators[a]
Example #6
0
def exportConllByAnnotators(project, annotators=["prof", "Sy", "parser"]):
    """
	exports complete project
	for every sentence, trees of annotators in given order.
	if no tree: throw error 
	
	"""
    outfiles = []
    sql = SQL(project)
    db, cursor = sql.open()
    goodTexts = {}
    outdir = os.path.join("..", "projects", project, "export")
    try:
        os.mkdir(outdir)
    except OSError:
        pass

    annotatorIds = tuple(a for (a, ) in [
        list(
            cursor.execute("select rowid from users where user =?;", (
                annotator, )))[0] for annotator in annotators
    ])
    #print annotators, annotatorIds

    for textid, textname, nrtokens in list(
            cursor.execute("select rowid, * from texts;")):  # for each text
        print "doing", textname, "with", nrtokens, "tokens"
        nrutids = {}
        for nr, userid, treeid in list(
                cursor.execute(
                    "select nr,userid,trees.rowid as treeid from trees, sentences where sentenceid=sentences.rowid and userid in {annotatorIds} and  textid = ? order by nr;"
                    .format(annotatorIds=annotatorIds), (textid, ))):
            nrutids[nr] = nrutids.get(nr, {})
            nrutids[nr][userid] = treeid
        trees = []
        for nr in sorted(nrutids):  # for each sentence
            tree = None
            for aid in annotatorIds:  # for each interesting annotator id
                if aid in nrutids[nr]:
                    tree = sql.gettree(treeid=nrutids[nr][aid],
                                       indb=db,
                                       incursor=cursor)["tree"]
                    trees += [tree]
                    #print "atree:",tree
                    break
            if not tree:
                print "problem: no tree for nr", nr, "type", type(nr)
                print "annotatorIds", annotatorIds
                raise Exception('no tree', nr)

        if textname.endswith(".conll"): textname = textname[:-len(".conll")]
        outfile = os.path.join(outdir, textname)
        conll.trees2conllFile(trees, outfile=outfile, columns=10)
        print len(trees), "trees"
        outfiles += [outfile]
    return outfiles
Example #7
0
def exportConllByAnnotators(project,
                            annotators=["prof", "Sy", "parser"],
                            fileExtension=".conllu"):
    """
    Export the complete project, one file per text.

    For every sentence, the tree of the first annotator (in the given order
    of preference) who annotated it is taken (via getSpecificTrees). Texts
    with no matching trees are skipped.

    Returns the list of files written to ../projects/<project>/export, or
    None if an annotator name is unknown.
    """
    outfiles = []
    sql = SQL(project)
    db, cursor = sql.open()
    goodTexts = {}
    outdir = os.path.join("..", "projects", project, "export")
    try:
        os.mkdir(outdir)
    except OSError:
        pass  # export folder already exists
    try:
        # map annotator names to their rowids, keeping the preference order
        annotatorIds = tuple(a for (a, ) in [
            list(
                cursor.execute("select rowid from users where user =?;", (
                    annotator, )))[0] for annotator in annotators
        ])
    except:
        # an IndexError here means one of the names is absent from `users`
        print "some required annotator IDs are not in the database"
        return
    print annotators, annotatorIds

    for textid, textname, nrtokens in list(
            cursor.execute("select rowid, * from texts;")):  # for each text
        print "doing", textname, "with", nrtokens, "tokens"
        nrutids = {}  # sentence nr -> {userid: treeid}
        for nr, userid, treeid in list(
                cursor.execute(
                    "select nr,userid,trees.rowid as treeid from trees, sentences where sentenceid=sentences.rowid and userid in {annotatorIds} and  textid = ? order by nr;"
                    # a python 1-tuple would render as "(5,)" (invalid SQL)
                    .format(annotatorIds=annotatorIds if len(annotatorIds) > 1
                            else '(' + str(annotatorIds[0]) + ')'),
                    (textid, ))):
            nrutids[nr] = nrutids.get(nr, {})
            nrutids[nr][userid] = treeid
        trees = getSpecificTrees(sql, db, cursor, nrutids, annotatorIds)
        if trees:
            if textname.endswith(".conll"):
                textname = textname[:-len(".conll")]
            if textname.endswith(".conllu"):
                textname = textname[:-len(".conllu")]
            outfile = os.path.join(outdir, textname + fileExtension)
            conll.trees2conllFile(trees, outfile=outfile, columns=10)
            print len(trees), "trees"
            outfiles += [outfile]
        else:
            print "skipped", textname
    return outfiles
Example #8
0
def transform(infolder, outfolder, mixOldNew=True):
    createNonExistingFolders(outfolder)
    #for infile in sorted(glob.glob(os.path.join(infolder,"test.conll"))):
    for infile in sorted(glob.glob(os.path.join(infolder, "*.conll"))):
        basename = os.path.basename(infile)
        print "reading", basename
        trees = conll.conllFile2trees(infile)
        newtrees = []
        for tree in trees:
            if mixOldNew: newtrees += [tree]
            newtree = copy.deepcopy(tree)
            newtree = platinum(newtree)
            newtrees += [newtree]
        conll.trees2conllFile(newtrees,
                              os.path.join(outfolder, fixOutname(basename)))
Example #9
0
def transform(infolder, outfolder, mixOldNew=False):
    createNonExistingFolders(outfolder)
    spaceToks = {}
    #for infile in sorted(glob.glob(os.path.join(infolder,"test.conll"))):
    for infile in sorted(glob.glob(os.path.join(infolder, "*"))):
        if not os.path.isfile(infile): continue
        basename = os.path.basename(infile)
        print "reading", basename
        trees = conll.conllFile2trees(infile)
        newtrees = []
        for tree in trees:
            if mixOldNew: newtrees += [tree]
            newtree = copy.deepcopy(tree)
            newtree = correct(newtree)
            newtrees += [newtree]

        conll.trees2conllFile(newtrees,
                              os.path.join(outfolder, fixOutname(basename)))
Example #10
0
def search(infolder,fun):
	goodtrees=[]
	print "doing", fun.__name__
	#try: os.mkdir(outdir)	
	#except OSError: pass
	for infile in sorted(glob.glob(os.path.join(infolder,"*"))): # .conll
		if not os.path.isfile(infile): continue
		basename=os.path.basename(infile)
		print "reading",basename
		trees = conll.conllFile2trees(infile)
		
		for tree in trees:
			#if hasVerbalDm(tree):
			#if isNonProjective(tree):
			if fun(tree):
				goodtrees+=[tree]
	print "found",len(goodtrees)
	if goodtrees:
		conll.trees2conllFile(goodtrees,fun.__name__+".conll")
Example #11
0
def transform(infolder, outfolder, mixOldNew=False):
    createNonExistingFolders(outfolder)

    corrinst = compil('corrinst.txt')
    print len(corrinst), "rules"
    for infile in sorted(glob.glob(os.path.join(infolder, "*"))):
        if not os.path.isfile(infile): continue
        basename = os.path.basename(infile)
        print "reading", basename
        trees = conll.conllFile2trees(infile)
        newtrees = []
        for tree in trees:
            if mixOldNew: newtrees += [tree]
            newtree = copy.deepcopy(tree)
            newtree = correct(newtree, corrinst)
            newtrees += [newtree]

        conll.trees2conllFile(newtrees,
                              os.path.join(outfolder, fixOutname(basename)))
def retokeniser(nomdufichier, path="", addtoout=""):
    """
    Re-tokenize a conll file: fix numbering, compound numbers, digits,
    multi-word expressions, clitics and inaudible tokens, then write the
    result as a 10-column conll file.

    nomdufichier -- input conll file
    path         -- output folder (defaults to the input file's folder)
    addtoout     -- suffix appended to the output file name

    Returns the name of the file written.
    """
    if not path:
        path, _ = os.path.split(
            nomdufichier)  # take the same path as the nomdufichier
    if path and path[-1] != "/": path = path + "/"
    trees = conll.conllFile2trees(nomdufichier)  # read the input file
    print "le fichier", nomdufichier, "a", len(trees), "arbres"
    #newtrees, alltrees=[], []
    newtrees = []
    # reference list of digits/numbers from the lexicon
    digitsandnumbers = codecs.open(droporfeo + "lexique/gg", "r",
                                   "utf-8").read().split('\n')
    for i, arbre in enumerate(trees):  # loop over the trees
        #alltrees+=[copy.deepcopy(arbre)]
        #oldtree=copy.deepcopy(arbre)
        racines = addinfototree(arbre)
        # keep a copy to report what changed after all correction passes
        oldtree = copy.deepcopy(arbre)
        arbre = corrigerNumerotation(arbre)
        arbre = nombresComposes(arbre)
        arbre = digits(arbre, digitsandnumbers)
        arbre = corrigerArbreCompos(
            arbre)  # split multi-word expressions
        #for i, node in arbre.items(): # children reconfiguration
        #if node["gov"] == {}:
        #print "crap"

        arbre = recomposerMultimots(arbre, expressions_multimots)
        arbre = corrigerNumerotationSplice(arbre)

        arbre = corrigerSegmentationClitiques(arbre, dico_clitiques)
        arbre = corrigerInaudibles(arbre)
        arbre = corrigerClitiques(arbre)
        arbre = retoken(arbre)
        if arbre != oldtree:  # print every node that the passes modified
            print i
            for ii in arbre:
                if arbre[ii] != oldtree.get(ii, None):
                    print ii, arbre[ii]['t'], arbre[ii], oldtree.get(ii, None)
        newtrees.append(arbre)
    newname = path + os.path.basename(nomdufichier + addtoout)
    conll.trees2conllFile(newtrees, newname, columns=10)
    return newname
Example #13
0
def addArbitraryPuncs(infolder, outfolder):
    createNonExistingFolders(outfolder)
    for conllinfile in glob.glob(os.path.join(infolder, '*')):
        print conllinfile
        trees = conll.conllFile2trees(conllinfile)
        for i, tree in enumerate(trees):
            m = max(tree)
            splitcode = ".,!?;:()"
            p = splitcode[i % len(splitcode)]
            tree[m + 1] = {
                u'tag': u'PUNC',
                u'lemma': p,
                u't': p,
                'gov': {
                    0: u'punc'
                }
            }
        conll.trees2conllFile(trees,
                              os.path.join(outfolder,
                                           os.path.basename(conllinfile)),
                              columns=14)
def degradeConllfile(conllfile, removeFuncs=["para"], removeDeps=0.2):
    trees = conll.conllFile2trees(conllfile)
    nbgovs = 0
    for arbre in trees:
        for i, node in arbre.iteritems():
            if "gov" in node and node["gov"].keys(
            )[0] != -1 and node["gov"].values()[0] not in removeFuncs:
                nbgovs += 1
    print int(nbgovs * removeDeps)
    tobeRemoved = sorted(random.sample(range(nbgovs),
                                       int(nbgovs * removeDeps)))
    print nbgovs, tobeRemoved
    nbgovs = 0
    for arbre in trees:
        for i, node in arbre.iteritems():
            if "gov" in node and node["gov"].keys(
            )[0] != -1 and node["gov"].values()[0] not in removeFuncs:
                nbgovs += 1
                if nbgovs in tobeRemoved:
                    node["gov"] = {}
    shutil.move(conllfile, conllfile + ".orig")
    conll.trees2conllFile(trees, conllfile, columns=10)
def reorder(trees, outfile):
    """
    Reorder the trees based on the "nr" sentence feature, add updated "text"
    and "sent_id" metadata (renumbered from 0, keeping the original prefix),
    and write the trees to a new file.

    input: List(Tree), Str
    does: Writes <outfile>
    output: None
    """
    # assumes every tree has "sent_id" and "nr" sentence features -- TODO confirm
    prefix = "_".join(trees[0].sentencefeatures.get("sent_id").split("_")[:-1])
    # sort on the numeric nr only: keying on (nr, tree) tuples would compare
    # the tree objects themselves whenever two trees share the same nr
    sortable = sorted(((int(t.sentencefeatures.get("nr")), t) for t in trees),
                      key=lambda pair: pair[0])
    new_trees = list()
    for nr, tree in sortable:

        # adding metadata: rename sent_id, starting from 0
        tree.sentencefeatures["text"] = tree.sentence()
        tree.sentencefeatures["sent_id"] = prefix + "_" + str(nr - 1)

        # removing useless metadata
        del tree.sentencefeatures["nr"]
        new_trees.append(tree)
    conll.trees2conllFile(new_trees, outfile)
Example #16
0
def split(conllfile, maxi):
    """Split conllfile into chunks of at most maxi trees, written to
    <conllfile>0, <conllfile>1, ..."""
    trees = conll.conllFile2trees(conllfile)
    chunks = [trees[start:start + maxi]
              for start in range(0, len(trees), maxi)]
    for index, chunk in enumerate(chunks):
        conll.trees2conllFile(chunk, conllfile + str(index))
Example #17
0
def trainingEvaluationParsing(project=u"OrfeoGold2016",
                              parserType="graph",
                              whoseTrees="validator",
                              evaluationPercent=10,
                              additionnalLexicon=None,
                              resultAnnotator="mate",
                              getFromFolder=False,
                              parseDB=False,
                              memory="40G",
                              stopOnError=False):
    """
    Train the mate parser on gold trees, evaluate it, and optionally parse
    the project's remaining texts and store the results in the database.

    project            -- project name (tree extraction and parsing)
    parserType         -- mate parser flavour (defaults to "graph")
    whoseTrees         -- whose validated trees to train on
    evaluationPercent  -- percentage of trees held out for evaluation
    additionnalLexicon -- if given, this conll file is joined to the training
                          file for lemmatization and tagging (not for the
                          dependency training)
    resultAnnotator    -- annotator name under which parses are stored
    getFromFolder      -- folder of conll files to use as gold trees instead
                          of the database
    parseDB            -- if true, parse the database texts and write the
                          results back (implies a database backup first)
    memory             -- java heap size for mate; change memory here!
    stopOnError        -- abort when a gold tree has an out-of-range governor

    todo :
        - add function to choose parser type (lang=)
        - create mate.log for progression (end = "Ready.")
    """
    mateLogs("Begin")
    ti = time.time()

    if getFromFolder:
        parseDB = False  # TODO: correct this so that all options are available
    parserType = (parserType or "graph")
    whoseTrees = whoseTrees or "validator"
    evaluationPercent = evaluationPercent or 10
    resultAnnotator = resultAnnotator or "mate"

    try:
        os.chmod("mate/parse.log", 0666)  # just in case...
    except:
        pass

    timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H:%M')

    #####
    #defining project and creation of saves directories
    #####

    basepath = createDailyPath("./mate/", project)
    if parseDB:
        backupbase = backupOldDatabase(project, basepath)
        mateLogs(
            "A copy of the database has been stored in {backupbase}. Getting validated trees..."
            .format(backupbase=backupbase))
    traindir = createDirectory(basepath + "training")
    modeldir = createDirectory(basepath + "models")
    logdir = createDirectory(basepath + "logs")
    parsedir = createDirectory(basepath + "parses")

    #####
    #getting gold trees for training
    #####

    if getFromFolder:  # getFromFolder contains folder name containing only conll files
        error = False
        goldtrees = []
        for infile in glob.glob(os.path.join(getFromFolder, '*')):
            if os.path.isfile(infile):
                print "reading", infile
                gtrees = conll.conllFile2trees(infile)
                for tree in gtrees:
                    problemkeys = []
                    for i in tree:
                        for gi in tree[i]["gov"]:
                            if not 0 <= gi <= len(tree):
                                print infile
                                print tree
                                print "has a problematic governor:", gi
                                error = True
                                problemkeys += [i]
                    # drop the nodes with out-of-range governors
                    for problemk in problemkeys:
                        del tree[problemk]
                goldtrees += gtrees
        if error and stopOnError: sys.exit()
    else:
        goldtrees = trees2train.getValidatedTrees(project, basepath,
                                                  whoseTrees)
    mateLogs(
        u"{nrtrees} validated trees extracted".format(nrtrees=len(goldtrees)))

    lemma = None
    if goldtrees:  # see whether the first token of the first tree has a lemma. if lemma==None: we'll skip lemmatization
        lemma = goldtrees[0][sorted(goldtrees[0])[0]].get(
            "lemma", None)  # just trying to get the first lemma value
        if lemma == "_": lemma = None
        print "found lemma in first tree:", lemma
        #TODO: do something here: double tokens as lemmas for chinese, see function makeTrainTestSets
    else:
        print "no trees from:", getFromFolder
        sys.exit()
    print "goldtrees:", len(goldtrees)

    #####
    #creating trainingfiles
    #####

    alldeptraining = traindir + "alldeptraining.conll"
    conll.trees2conllFile(goldtrees, alldeptraining, columns=14)
    traintrees = makeTrainTestSets(traindir,
                                   pattern=os.path.basename(alldeptraining),
                                   train="partialdeptrain.conll",
                                   test="test.conll",
                                   empty="emptytest.conll",
                                   testsize=int(evaluationPercent),
                                   lemma=lemma)
    print "traintrees:", len(traintrees)
    if additionnalLexicon:
        # the lexicon enriches only the lem/tag training files, never the
        # dependency training file
        lexicontrees = conll.conllFile2trees(additionnalLexicon)
        print "lexicontrees:", len(lexicontrees)
        alllemtagtrain = traindir + "alllemtagtrain.conll"
        conll.trees2conllFile(goldtrees + lexicontrees,
                              alllemtagtrain,
                              columns=14)
        partiallemtagtrain = traindir + "partiallemtagtrain.conll"
        conll.trees2conllFile(traintrees + lexicontrees,
                              partiallemtagtrain,
                              columns=14)
    else:
        alllemtagtrain = alldeptraining
        partiallemtagtrain = traindir + "partialdeptrain.conll"

    #creating files used for evaluation
    #if isinstance(evaluationPercent, str): evaluationPercent = int(evaluationPercent)
    mateLogs("trainfiles created")
    if verbose:
        print "just testing whether i can load them..."
        conll.conllFile2trees(traindir + "partialdeptrain.conll")
        conll.conllFile2trees(traindir + "emptytest.conll")
        conll.conllFile2trees(traindir + "test.conll")

    # first training pass on the partial (held-out) set, for evaluation only
    mateLogs("training of partial tree file for evaluation... ====")
    lemodelpartial, tagmodelpartial, parsemodelpartial = makeTrainingModels(
        basepath,
        lemtagin=partiallemtagtrain,
        depin=traindir + "partialdeptrain.conll",
        outfolder=modeldir,
        memory=memory,
        testfile=traindir + "emptytest.conll",
        evalfile=traindir + "test.conll",
        lemma=lemma,
        parserType=parserType)
    mateLogs("evaluation...")
    #evaluation
    evaluFileName = detailedEvaluation(parserType=parserType,
                                       memory=memory,
                                       testfile=traindir +
                                       "emptytest.conll_parse",
                                       evalfile=traindir + "test.conll",
                                       path=logdir,
                                       evaluationPercent=evaluationPercent)

    # keep the evaluation report text to include in later log messages
    evalu = unicode(evaluFileName) + "\n"
    with codecs.open(evaluFileName, "r", "utf-8") as f:
        evalu += f.read()

    #full training
    mateLogs("training of full tree file for parsing... ====")
    lemodel, tagmodel, parsemodel = makeTrainingModels(basepath,
                                                       lemtagin=alllemtagtrain,
                                                       depin=alldeptraining,
                                                       outfolder=modeldir,
                                                       memory=memory,
                                                       lemma=lemma,
                                                       parserType=parserType)
    #getting texts to parse
    mateLogs(
        "training and evaluation complete. Starting the parse...\n\n{evalu}".
        format(evalu=evalu))
    #filenames=getTextsForParsing.main(project, parsedir)
    if parseDB:
        filenames = getTextsForParsing.extractConllFiles(project, parsedir)
        #parsing
        for infile in filenames:
            #mateLogs("Training and evaluation complete. Starting the parse of {infile}\n\n{evalu}".format(infile=infile, evalu=evalu))
            mateLogs(
                "Training and evaluation complete. Starting the parse of {}\n\n"
                .format(infile))
            parsedfile = parsing(infile,
                                 lemodel=lemodel,
                                 tagmodel=tagmodel,
                                 parsemodel=parsemodel,
                                 outfolder=parsedir,
                                 parserType=parserType,
                                 memory=memory)
            #update on base
            newname = os.path.basename(parsedfile)
            updateTrees.updateParseResult(project,
                                          parsedir,
                                          filepattern=newname,
                                          annotatorName=resultAnnotator,
                                          removeToGetDB="_parse")

    # make it easy for everyone to erase all this stuff:
    for root, dirs, files in os.walk(basepath):
        for momo in dirs:
            try:
                os.chmod(os.path.join(root, momo), 0777)
            except:
                pass
        for momo in files:
            try:
                os.chmod(os.path.join(root, momo), 0666)
            except:
                pass

    totaltime = (time.time() - ti) / 60
    mateLogs(
        "Ready. It took {totaltime} minutes for the whole process\n\n{evalu}".
        format(totaltime=round(totaltime, 1), evalu=evalu))
Example #18
0
def exportGoodTexts(project,
                    lastHuman=False,
                    onlyValidated=True,
                    pattern=False):
    """
	TODO :
	- ajouter parametre p/selectionner Texte
	ex : "UD_ZH_[number]"
	"""
    outfiles = []
    sql = SQL(project)
    db, cursor = sql.open()
    goodTexts = {}
    if onlyValidated: onlyValidated = "and todos.status=1"
    else: onlyValidated = ""
    # take all texts where a validator has validated
    if pattern:
        command = "select distinct * from texts, todos, users where texts.rowid=todos.textid and users.rowid=todos.userid and texts.textname {pattern};".format(
            pattern=pattern)  # like 'UD_ZH%'
    else:
        command = "select distinct * from texts, todos, users where texts.rowid=todos.textid and todos.type=1 {onlyValidated} and users.rowid=todos.userid;".format(
            onlyValidated=onlyValidated)
    for row in cursor.execute(command):
        textname, nrtokens, userid, textid, validator, status, comment, user, realname = row
        goodTexts[textid] = (textname, userid, user)
        print "i'll take", textname, "validated by", user, "with", nrtokens, "tokens"
    sentenceValidationInValidatedText(cursor, sql, db)
    outdir = os.path.join("..", "projects", project, "export")
    try:
        os.mkdir(outdir)
    except OSError:
        pass

    for textid, (textname, userid, user) in goodTexts.iteritems():
        textname = textname.replace("-one-word-per-line.conll14_Parse", "")

        if lastHuman:
            outfile = os.path.join(outdir, textname + ".lastHuman.conll")
        else:
            outfile = os.path.join(
                outdir, "validated." + textname + "." + user + ".conll")
        print "doing", textname, textid
        trees = []

        if lastHuman:
            snr2all = {}
            for row in cursor.execute(
                    """
			select sentences.nr as snr, trees.rowid as treeid, users.user, trees.timestamp 
			from sentences, trees, users 
			where sentences.textid=? 
			and sentences.rowid=trees.sentenceid 
			and users.rowid = trees.userid; """, (textid, )):
                snr, treeid, user, timestamp = row
                snr2all[snr] = snr2all.get(snr,
                                           []) + [(timestamp, user, treeid)]
            lastpourc = -1
            for c, snr in enumerate(sorted(snr2all)):
                pourc = int(float(c) / len(snr2all) * 100)
                if pourc != lastpourc:
                    print "___{pourc}%___\r".format(pourc=pourc),

                lastusersnotparser = sorted([
                    (timestamp, user, treeid)
                    for (timestamp, user, treeid) in snr2all[snr]
                    if user not in ["parser", "mate"]
                ])
                if len(lastusersnotparser) > 0:
                    time, u, tid = lastusersnotparser[-1]  # last tree by human
                else:
                    time, u, tid = sorted(
                        snr2all[snr])[-1]  # last tree by whoever
                #print "je prends l'arbre de",u
                trees += [
                    sql.gettree(treeid=treeid, indb=db,
                                incursor=cursor)["tree"]
                ]

        else:

            for (
                    treeid,
                    sentencenr,
            ) in cursor.execute(
                    "select trees.rowid, sentences.nr from texts, trees, sentences where texts.rowid=? and trees.userid=? and trees.sentenceid = sentences.rowid and sentences.textid=texts.rowid order by sentences.nr;",
                (
                    textid,
                    userid,
                )).fetchall():
                #print "ooo",sentencenr,"\r",
                print "nr", sentencenr, "_____\r",
                trees += [
                    sql.gettree(treeid=treeid, indb=db,
                                incursor=cursor)["tree"]
                ]

        print "exporting", len(trees), "trees into", outfile
        outfiles += [outfile]
        conll.trees2conllFile(trees, outfile, columns=10)
    return outfiles
Example #19
0
def fusionForgottenTrees(project="Platinum",
                         fusdir="../projects/OrfeoGold2016/platinum/*",
                         annotators=["admin"]):
    """
	takes trees from project ordered by annotators. if they exist fuse them into the fusdir
	result has the extension "cool.conll"
	,"Sy","Marion"
	"""

    #print lemmacorrection
    sys.path.insert(0, '../tools')
    import difflib
    outfiles = []
    sql = SQL(project)
    db, cursor = sql.open()
    goodTexts = {}
    outdir = os.path.join("..", "projects", project, "exportcool")
    try:
        os.mkdir(outdir)
    except OSError:
        pass
    for annotator in annotators:
        print[
            list(
                cursor.execute("select rowid from users where user =?;",
                               (annotator, )))
        ]
    annotatorIds = tuple(a for (a, ) in [
        list(
            cursor.execute("select rowid from users where user =?;", (
                annotator, )))[0] for annotator in annotators
    ])
    print annotators, annotatorIds

    for textid, textname, nrtokens in list(
            cursor.execute("select rowid, * from texts;")):  # for each text
        print "\n__________________________doing", textname, "with", nrtokens, "tokens"
        nrutids = {}
        for nr, userid, treeid in list(
                cursor.execute(
                    "select nr,userid,trees.rowid as treeid from trees, sentences where sentenceid=sentences.rowid and userid in {annotatorIds} and  textid = ? order by nr;"
                    .format(annotatorIds=annotatorIds), (textid, ))):
            nrutids[nr] = nrutids.get(nr, {})
            nrutids[nr][userid] = treeid
        trees = {}
        for nr in sorted(nrutids):  # for each sentence
            tree = None
            for aid in annotatorIds:  # for each interesting annotator id
                if aid in nrutids[nr]:
                    tree = sql.gettree(treeid=nrutids[nr][aid],
                                       indb=db,
                                       incursor=cursor)["tree"]
                    trees[nr] = tree
                    #print "atree:",tree
                    break
            #if not tree:
            #print "problem: no tree for nr",nr,"type",type(nr)
            #print "annotatorIds",annotatorIds
            #raise Exception('no tree', nr)
        #print trees
        print len(trees), "trees from", project
        print textname, textname.split(".")[0]
        btextname = os.path.basename(textname).split(".")[0]
        if btextname.endswith("-one-word-per-line"):
            btextname = btextname[:-len("-one-word-per-line")]
        #print glob.glob(fusdir),[os.path.basename(fi).split(".")[0] for fi in glob.glob(fusdir)]
        cooltrees = []
        ptrees, ftrees = 0, 0
        for fi in glob.glob(fusdir):
            if btextname == os.path.basename(fi).split(".")[0]:
                print "yes", btextname
                fustrees = conll.conllFile2trees(fi)
                print len(fustrees), "ftrees", fi
                for nr, ftree in enumerate(fustrees):
                    if nr + 1 in trees:
                        #print "added tree",nr+1,"from database"
                        #ptree=platinum(trees[nr+1])
                        ptree = trees[nr + 1]
                        for iii in ptree:
                            ptree[iii]["tag2"] = "_"
                            if ptree[iii]["lemma"] in lemmacorrection:
                                ptree[iii]["lemma"] = lemmacorrection[
                                    ptree[iii]["lemma"]]
                        cooltrees += [ptree]
                        #print nr+1,"tree from",project#,tree
                        ptrees += 1
                        if ftree.sentence() != u" ".join(
                            [ptree[i].get("t", "") for i in sorted(ptree)]):
                            print "\n_________", nr + 1
                            print ftree.sentence()
                            print u" ".join(
                                [ptree[i].get("t", "") for i in sorted(ptree)])
                            #for l in difflib.context_diff(ftree.sentence() ,u" ".join([ptree[i].get("t","") for i in sorted(ptree)])):print l

                        #print "dbtree",platinum(trees[nr+1])
                    else:
                        for iii in ftree:
                            ftree[iii]["tag2"] = "_"
                            if ftree[iii]["lemma"] in lemmacorrection:
                                ftree[iii]["lemma"] = lemmacorrection[
                                    ftree[iii]["lemma"]]
                        #print nr+1,"tree from",fusdir#,tree
                        ftrees += 1
                        cooltrees += [ftree]
                        #print "added tree",nr+1,"from fustrees",fi
                outfile = os.path.join(outdir, textname + ".cool.conll")
                conll.trees2conllFile(cooltrees, outfile=outfile, columns=10)
                print "wrote", outfile
                print ptrees, "ptrees, ", ftrees, "ftrees"
                break
        if len(cooltrees) == 0: print "nothing for", btextname
        outfiles += [outfile]
        #qsdf
    return outfiles
Example #20
0

#translate(u"准许 一 位 人士 入境 的 权力".split())

for conllinfile in glob.glob(os.path.join("corpus/conll/", 'CONV*.*')):

    print conllinfile
    trees = conllFile2trees(conllinfile)
    path, base = os.path.split(conllinfile)
    translateDic = {}
    counter = 0
    for tree in trees:
        for i, node in tree.iteritems():
            node["tag2"] = pinyin.get(node["t"])
            translateDic[node["t"]] = None
        counter += 1
        if not counter % 100: print counter, "trees"

    words = sorted(translateDic)
    print len(words), "words"
    trads = translate(words)
    translateDic = dict(zip(words, trads))
    print len(translateDic), "translations"
    for tree in trees:
        for i, node in tree.iteritems():
            node["gloss"] = translateDic[node["t"]]
        counter += 1
        if not counter % 100: print counter, "trees"
        #lines+=[u" ".join(words+[u"."])]
    trees2conllFile(trees, path + "/" + "UD-" + base[len("CONV-CORREC-"):])