Example #1
0
def getTreesForSents(sents, trees, annotators, parserid, cursor, db, sql):
    """
	for each entry in sents,
	adds new trees to trees and annotators
	"""
    for i, (sid, nr, sentence, teid) in enumerate(sents):
        print "\r", i,
        if parserid >= 0:
            tr = list(
                cursor.execute(
                    "select trees.rowid,* from trees, users where sentenceid=? and userid<>? and userid=users.rowid order by timestamp DESC limit 1;",
                    (sid, parserid)).fetchall())
        else:
            tr = list(
                cursor.execute(
                    "select trees.rowid,* from trees, users where sentenceid=? and userid=users.rowid order by timestamp DESC limit 1;",
                    (sid, )).fetchall())
        #print trees
        if len(tr):
            treeid, sidd, usid, annotype, status, comment, ts, user, realname = tr[
                0]
            tree = sql.gettree(treeid=treeid, indb=db, incursor=cursor)["tree"]
            #print tree
            trees += [conll.Tree(tree)]
            annotators[user] = annotators.get(user, 0) + 1
Example #2
0
def getSpecificTrees(sql, db, cursor, nrutids, annotatorIds):
    trees = []
    for nr in sorted(nrutids):  # for each sentence
        tree = None
        for aid in annotatorIds:  # for each interesting annotator id
            if aid in nrutids[nr]:
                #tree=)
                tree = conll.Tree(
                    sql.gettree(treeid=nrutids[nr][aid],
                                indb=db,
                                incursor=cursor)["tree"])
                trees += [tree]
                #print "atree:",tree
                break
        if not tree:
            print "problem: no tree for nr", nr, "type", type(nr)
            print "annotatorIds", annotatorIds
            return []
            #raise Exception('no tree', nr)
    return trees
Example #3
0
def getValidatedTrees(project, folder, whoseTrees="validator"):
    sql = SQL(project)
    db, cursor = sql.open()
    sentenceValidationInValidatedText(cursor, sql, db)
    #on récupère les nouveaux arbres
    b = databaseQuery(cursor, table=whoseTrees)
    print len(b), u"trees to extract"
    sids2all = {}
    trees = []
    error_trees = []
    textnames = {}
    for nr, (treeid, textname, user, snr, sid, uid, annotype, status, comment,
             timestamp) in enumerate(b):
        # TODO: remove:
        #if textname.startswith("mandarinParsed"):continue
        sids2all[sid] = sids2all.get(
            sid, []) + [(timestamp, textname, user, snr, treeid)]
        textnames[textname] = None
    #print len(sids2all)
    print u"trees extracted from the samples", ", ".join(sorted(textnames))
    lastpourc = -1
    for c, sid in enumerate(sids2all):
        pourc = int(float(c) / len(sids2all) * 100)
        if pourc != lastpourc:
            sys.stdout.write("{pourc}%\r".format(pourc=pourc))
        sys.stdout.flush()

        snr, treeid2get = sorted(sids2all[sid])[0][-2:]
        #print treeid2get, type(treeid2get)
        #lknlk
        dic = sql.gettree(None, None, treeid2get, indb=db,
                          incursor=cursor)  # dic -> get tree
        #if treeid2get==9669:
        #print 9669,dic

        if dic:
            sentencetree = dic["tree"]
            sentencetree = corrigerNumerotation(sentencetree)

            tree = conll.Tree(sentencetree)

            trees.append(tree)
            #print " ".join(node["t"] for i,node in sentencetree.iteritems())
            if checkTree(tree)[0] == False:
                if checkTree(tree)[1] == "self":
                    error_trees += [
                        "\t".join([
                            textname,
                            str(snr), user, "node " + str(checkTree(tree)[2]) +
                            " points to itself"
                        ])
                    ]
                else:
                    error_trees += [
                        "\t".join([
                            textname,
                            str(snr), user,
                            "no gov at node " + str(checkTree(tree)[2])
                        ])
                    ]
                trees.remove(tree)
                #print "nr arbres",len(trees)
        lastpourc = pourc
    print len(error_trees), "arbre(s) avec erreurs."
    if len(error_trees) > 0:
        print "\t".join(["Texte", "num phrase", "correcteur", "cause"])
        for x in sorted(list(set(error_trees))):
            print x
        f = codecs.open(
            folder + "logs/log_erreurs." +
            datetime.datetime.now().strftime('%Y-%m-%d') + ".tsv", "w",
            "utf-8")
        f.write("\t".join(["Texte", "num phrase", "correcteur", "cause"]) +
                '\n')
        for e in error_trees:
            f.write(e + '\n')
        f.close()
        print "Erreurs dans", f.name
    print len(trees), "arbres restants pour entrainement"
    #Creation d'un fichier log
    db.commit()
    db.close()
    return trees