def getTreesForSents(sents, trees, annotators, parserid, cursor, db, sql): """ for each entry in sents, adds new trees to trees and annotators """ for i, (sid, nr, sentence, teid) in enumerate(sents): print "\r", i, if parserid >= 0: tr = list( cursor.execute( "select trees.rowid,* from trees, users where sentenceid=? and userid<>? and userid=users.rowid order by timestamp DESC limit 1;", (sid, parserid)).fetchall()) else: tr = list( cursor.execute( "select trees.rowid,* from trees, users where sentenceid=? and userid=users.rowid order by timestamp DESC limit 1;", (sid, )).fetchall()) #print trees if len(tr): treeid, sidd, usid, annotype, status, comment, ts, user, realname = tr[ 0] tree = sql.gettree(treeid=treeid, indb=db, incursor=cursor)["tree"] #print tree trees += [conll.Tree(tree)] annotators[user] = annotators.get(user, 0) + 1
def getSpecificTrees(sql, db, cursor, nrutids, annotatorIds): trees = [] for nr in sorted(nrutids): # for each sentence tree = None for aid in annotatorIds: # for each interesting annotator id if aid in nrutids[nr]: #tree=) tree = conll.Tree( sql.gettree(treeid=nrutids[nr][aid], indb=db, incursor=cursor)["tree"]) trees += [tree] #print "atree:",tree break if not tree: print "problem: no tree for nr", nr, "type", type(nr) print "annotatorIds", annotatorIds return [] #raise Exception('no tree', nr) return trees
def getValidatedTrees(project, folder, whoseTrees="validator"): sql = SQL(project) db, cursor = sql.open() sentenceValidationInValidatedText(cursor, sql, db) #on récupère les nouveaux arbres b = databaseQuery(cursor, table=whoseTrees) print len(b), u"trees to extract" sids2all = {} trees = [] error_trees = [] textnames = {} for nr, (treeid, textname, user, snr, sid, uid, annotype, status, comment, timestamp) in enumerate(b): # TODO: remove: #if textname.startswith("mandarinParsed"):continue sids2all[sid] = sids2all.get( sid, []) + [(timestamp, textname, user, snr, treeid)] textnames[textname] = None #print len(sids2all) print u"trees extracted from the samples", ", ".join(sorted(textnames)) lastpourc = -1 for c, sid in enumerate(sids2all): pourc = int(float(c) / len(sids2all) * 100) if pourc != lastpourc: sys.stdout.write("{pourc}%\r".format(pourc=pourc)) sys.stdout.flush() snr, treeid2get = sorted(sids2all[sid])[0][-2:] #print treeid2get, type(treeid2get) #lknlk dic = sql.gettree(None, None, treeid2get, indb=db, incursor=cursor) # dic -> get tree #if treeid2get==9669: #print 9669,dic if dic: sentencetree = dic["tree"] sentencetree = corrigerNumerotation(sentencetree) tree = conll.Tree(sentencetree) trees.append(tree) #print " ".join(node["t"] for i,node in sentencetree.iteritems()) if checkTree(tree)[0] == False: if checkTree(tree)[1] == "self": error_trees += [ "\t".join([ textname, str(snr), user, "node " + str(checkTree(tree)[2]) + " points to itself" ]) ] else: error_trees += [ "\t".join([ textname, str(snr), user, "no gov at node " + str(checkTree(tree)[2]) ]) ] trees.remove(tree) #print "nr arbres",len(trees) lastpourc = pourc print len(error_trees), "arbre(s) avec erreurs." if len(error_trees) > 0: print "\t".join(["Texte", "num phrase", "correcteur", "cause"]) for x in sorted(list(set(error_trees))): print x f = codecs.open( folder + "logs/log_erreurs." + datetime.datetime.now().strftime('%Y-%m-%d') + ".tsv", "w", "utf-8") f.write("\t".join(["Texte", "num phrase", "correcteur", "cause"]) + '\n') for e in error_trees: f.write(e + '\n') f.close() print "Erreurs dans", f.name print len(trees), "arbres restants pour entrainement" #Creation d'un fichier log db.commit() db.close() return trees