def __init__(self, doc, export_fname, corpus_name):
    """Read an export-format file and attach annotation-DB resources.

    :param doc: target document passed through to ExportCorpusReader
    :param export_fname: path of the export-format input file
    :param corpus_name: name of the corpus database to open via get_corpus()
    """
    ExportCorpusReader.__init__(self, doc, export_fname)
    db = get_corpus(corpus_name)
    self.db = db
    self.sentences = db.corpus.attribute("s", 's')
    self.words = db.words
    # dependency layers (deprel/attach) are optional — some corpora
    # do not carry them, in which case both stay None
    try:
        self.deprel = db.corpus.attribute("deprel", "p")
        self.attach = db.corpus.attribute("attach", "p")
    except KeyError:
        self.deprel = None
        self.attach = None
    # the lemma layer is likewise optional
    try:
        self.lemmas = db.corpus.attribute('lemma', 'p')
    except KeyError:
        self.lemmas = None
    self.discourse = db.db.discourse
    # Gather the (start, end) spans of all configured tasks, deduplicated
    # and sorted in corpus order.  NB: task_names is a module-level name —
    # presumably a list of task ids; confirm against the module preamble.
    # (was set([...]): no need to build an intermediate list inside set())
    tasks = [self.db.get_task(x) for x in task_names]
    self.spans = sorted(set(
        tuple(span)
        for task in tasks if task is not None
        for span in task.spans))
    print >> sys.stderr, "%d spans found" % (len(self.spans), )
    # cursor into self.spans used during the linear sweep over the corpus
    self.span_idx = 0
def __init__(self, corpus_name):
    """Open a corpus database and preload task spans (and WSD annotations).

    :param corpus_name: name of the corpus database to open via get_corpus()
    """
    db = get_corpus(corpus_name)
    self.db = db
    self.sentences = db.corpus.attribute("s", 's')
    self.words = db.words
    # the lemma layer is optional in some corpora
    try:
        self.lemmas = db.corpus.attribute('lemma', 'p')
    except KeyError:
        self.lemmas = None
    # Gather the (start, end) spans of all configured tasks, deduplicated
    # and sorted in corpus order.  NB: task_names is a module-level name —
    # presumably a list of task ids; confirm against the module preamble.
    # (was set([...]): no need to build an intermediate list inside set())
    tasks = [self.db.get_task(x) for x in task_names]
    self.spans = sorted(set(
        tuple(span)
        for task in tasks if task is not None
        for span in task.spans))
    print >> sys.stderr, "%d spans found" % (len(self.spans), )
    self.span_idx = 0
    # optionally preload gold word-sense annotations, ordered by span start
    # (want_wsd is a module-level flag — confirm where it is set)
    if want_wsd:
        self.wsd = sorted(self.db.db.annotation.find({
            'level': 'wsd',
            'annotator': 'wsdgold'
        }), key=lambda x: x['span'][0])
        print >> sys.stderr, "%d WSD annotations found" % (len(self.wsd), )
    else:
        self.wsd = []
    # cursors used during the linear sweep over the corpus
    self.wsd_idx = 0
    self.sent_start = 0
def corpus(self):
    """Return the corpus database selected for this request.

    The corpus name comes from the 'corpus' cookie, falling back to
    ``default_database`` when the cookie is missing or not in
    ``allowed_corpora_admin``.  A ``force_corpus`` request argument
    overrides the cookie when it names a corpus in ``allowed_corpora``.
    """
    chosen = self.cookies.get('corpus')
    # missing or stale cookie -> use the default corpus
    if not chosen or chosen not in allowed_corpora_admin:
        chosen = default_database
    if 'force_corpus' in self.args:
        forced = self.args['force_corpus']
        if forced in allowed_corpora and forced != chosen:
            chosen = forced
    return get_corpus(chosen)
def archive_user(user):
    """Rewrite all of *user*'s discourse annotations under an archive name.

    In every corpus listed in ``allowed_corpora_nologin``, each discourse
    document owned by *user* is re-keyed to the annotator name
    ``user + '*old'`` (document ids have the form ``<docno>~<annotator>``).
    """
    from annodb.database import get_corpus
    archived = user + '*old'
    for corpus_name in allowed_corpora_nologin:
        coll = get_corpus(corpus_name).db.discourse
        for doc in coll.find({'_user': user}):
            previous_id = doc['_id']
            doc['_user'] = archived
            doc['_id'] = '%s~%s' % (doc['_docno'], archived)
            # NOTE(review): replacing a document with a different _id via
            # update() may be rejected by MongoDB — verify this actually
            # archives (insert-new + remove-old is the safe pattern).
            coll.update({'_id': previous_id}, doc)
def index(request):
    """Render the start page with the task lists for the selected corpus.

    The corpus name is taken from the request argument (if present),
    else from the 'corpus' cookie, with a fallback to
    ``default_database``; the selection is written back as a cookie
    valid for 30 days.
    """
    corpus_name = request.cookies.get('corpus')
    if 'corpus' in request.args:
        corpus_name = request.args['corpus']
    if not corpus_name or corpus_name not in allowed_corpora_admin:
        corpus_name = default_database
    db = get_corpus(corpus_name)
    tasks_ready = []
    if request.user:
        user = request.user
        tasks = []
        # split the user's tasks into finished (status set) and pending
        for task in sorted(db.get_tasks(user), key=lambda t: t._id):
            if task.get_status(user):
                tasks_ready.append(task)
            else:
                tasks.append(task)
        corpora = allowed_corpora_admin if user in ADMINS else allowed_corpora
    else:
        # anonymous visitors see all tasks and the no-login corpora only
        tasks = sorted(db.get_tasks(), key=lambda t: t._id)
        corpora = allowed_corpora_nologin
    response = render_template('index.html',
                               user=request.user,
                               tasks=tasks,
                               tasks_ready=tasks_ready,
                               corpus_name=corpus_name,
                               corpora=corpora)
    # remember the chosen corpus for 30 days
    expire_date = datetime.datetime.now() + datetime.timedelta(30)
    response.set_cookie('corpus', corpus_name, expires=expire_date)
    return response
# Tail of a copy method (its def-line lies outside this view): copy one
# annotation record into the target corpus unless it already exists there
# (or --force was given).  Reconstructed indentation — TODO confirm nesting.
        anno2 = self.db2.db.annotation.find_one({'_id': anno['_id']})
        if force or anno2 is None:
            self.db2.db.annotation.save(anno)


# Command-line interface of the annotation-copying script.
oparse = optparse.OptionParser()
oparse.add_option('-f', '--force', dest="force",
                  default=False, action='store_true',
                  help="overwrite existing annotations")
oparse.add_option('-l', '--list', dest="list",
                  default=False, action='store_true',
                  help="list annotations, don't do anything")

if __name__ == '__main__':
    opts, args = oparse.parse_args()
    print opts, args
    # source corpus, target corpus, and an optional regex restricting
    # which task ids are affected
    db1 = get_corpus(args[0])
    db2 = get_corpus(args[1])
    if len(args) >= 3:
        task_re = args[2]
    else:
        task_re = None
    all_names = []
    if task_re is None:
        all_names = [x['_id'] for x in db1.db.tasks.find()]
    else:
        all_names = [x['_id']
                     for x in db1.db.tasks.find({'_id': {'$regex': task_re}})]
    if opts.list:
        # dry run: only report which tasks would be affected
        print >> sys.stderr, "Affected tasks: (RE=%s, n=%s)" % (task_re, len(all_names))
        for x in sorted(all_names):
            print x
    else:
        # continuation (the actual copying) lies outside this view
# Tail of a document-checking routine (its def-line lies outside this view).
# Reconstructed indentation — TODO confirm nesting against the full source.
        # matched: persist the repaired copy under the original document id
        doc2['_id'] = doc['_id']
        print doc2
        db.db.discourse.save(doc2)
        return None
    else:
        # the document text does not match the text id it claims to belong
        # to; probe neighbouring text ids to find where it really belongs
        old_id = doc['_id']
        print "doc %s does not match text %s" % (old_id, t_id)
        for offset in [-1, 1, -2, 2, -3, 3]:
            similar, same = matches_doc(db, doc, t_id + offset)
            if similar:
                print "is really %s" % (t_id + offset, )
                (part1, annotator) = old_id.split('~')
                print 'id => %s~%s' % (t_id + offset, annotator)
                # destructive repair is deliberately disabled (commented out)
                #db.db.discourse.remove({'_id':old_id})
                doc['_docno'] = t_id + offset
                doc['_id'] = '%s~%s' % (t_id + offset, annotator)
                #db.db.discourse.save(doc)
                break


if __name__ == '__main__':
    # check every discourse annotation of corpus argv[1] against the
    # text content of corpus argv[2]
    db = get_corpus(sys.argv[1])
    db2 = get_corpus(sys.argv[2])
    for doc in list(db.db.discourse.find()):
        try:
            check_doc(db2, doc)
        except KeyError, e:
            print e
            # we could actually delete these?
            pass
# Interior of a discourse-conversion routine (its def-line lies outside this
# view): register an EDU markable per sub-EDU span, then parse relations.
# Reconstructed indentation — TODO confirm nesting against the full source.
            pass
        else:
            # create an EDU markable for this span, offset into the
            # surrounding context window
            edu_markable = Edu()
            edu_markable.span = (start + ctx_start, end + ctx_start)
            edu_markable.xml_id = 'edu_%s_%d_%d' % (t_id, next_sent, sub_edu)
            text_markable.edus['%d.%d' % (next_sent, sub_edu)] = edu_markable
            edu_markable.edu_idx = len(text_markable.edu_list)
            text_markable.edu_list.append(edu_markable)
            ctx.register_object(edu_markable)
    parse_relations(doc['relations'], text_markable, ctx)


if __name__ == '__main__':
    # export the discourse annotations of one annotator (default: gold)
    # as an EXML document on stdout
    db = database.get_corpus('TUEBA4')
    text_ids = db.corpus.attribute(
        corpus_d_sattr.get(db.corpus_name, 'text_id'), 's')
    if len(sys.argv) > 1:
        annotator = sys.argv[1]
    else:
        annotator = '*gold*'
    results = db.db.discourse.find({'_user': annotator})
    doc = make_implicit_doc()
    print '<?xml version="1.0" encoding="ISO-8859-15"?>'
    print '<exml-doc>'
    reader = DiscourseReader(doc, db)
    # do the actual conversion
    for r in results:
        try:
            docid = int(r['_docno'])
            # continuation lies outside this view
def action_list_empty_tasks(dbname='R9PRE1'): for task in get_corpus(dbname).get_tasks(): if not task.annotators: print task._id, print
def action_remove_task(dbname='xxx', taskname='task1'):
    """Delete the task *taskname* from the corpus database *dbname*."""
    db = get_corpus(dbname)
    db.remove_task(taskname)