def discourse_rels(request):
    """Render an overview of the current user's discourse relations.

    Tallies how often each relation label occurs across all of the user's
    annotated documents and, per label, records which documents it occurs
    in.  The result is rendered sorted by descending frequency.
    """
    db = request.corpus
    words = db.words
    text_ids = db.corpus.attribute(
        corpus_d_sattr.get(db.corpus_name, 'text_id'), 's')
    annotations = db.db.discourse.find({'_user': request.user})
    rel_counts = defaultdict(int)
    rel_occurrences = defaultdict(list)
    sum_all = 0
    for anno in annotations:
        try:
            docid = int(anno['_docno'])
        except KeyError:
            # Records without a document number carry no relations we can
            # attribute to a text; skip them.
            continue
        span = text_ids[docid]
        # Label a document by its id attribute plus its first five words.
        label = "%s: %s" % (span[2], ' '.join(words[span[0]:span[0] + 5]))
        label = label.decode('ISO-8859-15')
        rels = isolate_relations(anno['relations'])
        for rel_name, occurrences in rels.iteritems():
            n_occ = len(occurrences)
            sum_all += n_occ
            rel_counts[rel_name] += n_occ
            rel_occurrences[rel_name].append((docid, label, occurrences))
    # Most frequent relation first (stable for equal counts).
    by_frequency = sorted(rel_counts, key=rel_counts.get, reverse=True)
    result = [(rel, rel_counts[rel], rel_occurrences[rel])
              for rel in by_frequency]
    return render_template('discourse_rels.html',
                           corpus_name=db.corpus_name,
                           results=result,
                           sum_all=sum_all)
def __init__(self, ctx, db):
    """Cache the corpus attribute handles this reader needs.

    ctx -- conversion/serialization context, stored as-is
    db  -- corpus database wrapper exposing ``corpus`` and ``words``
    """
    corpus = db.corpus
    self.ctx = ctx
    self.words = db.words
    # Structural ('s') attributes: document spans and sentence spans.
    self.texts = corpus.attribute(
        corpus_d_sattr.get(db.corpus_name, 'text_id'), 's')
    self.sentences = corpus.attribute("s", 's')
    # Positional ('p') attribute: per-token part-of-speech tags.
    self.postags = corpus.attribute("pos", 'p')
def list_discourse(request):
    """List all discourse documents visible to the current user.

    Collects every document annotated by the user or the gold standard,
    and for each document the annotators whose versions the user may open:
    admins see everyone; regular users see '*gold*', their own annotation,
    and their own derived '<user>*...' annotations.
    """
    db = request.corpus
    words = db.words
    text_ids = db.corpus.attribute(
        corpus_d_sattr.get(db.corpus_name, 'text_id'), 's')
    # Set comprehension instead of sorted(set([...])): no throwaway list.
    docids = sorted({
        r['_docno']
        for r in db.db.discourse.find(
            {'_user': {'$in': [request.user, '*gold*']}})
        if '_docno' in r
    })
    # The admin check is loop-invariant; evaluate it once.
    is_admin = request.user in ADMINS
    doc_lst = []
    for docid in docids:
        txt0 = text_ids[docid]
        # Label a document by its id attribute plus its first five words.
        txt = "%s: %s" % (txt0[2], ' '.join(words[txt0[0]:txt0[0] + 5]))
        annotators = db.db.discourse.find({'_docno': docid})
        if is_admin:
            users = [doc['_user'] for doc in annotators]
        else:
            users = [
                doc['_user'] for doc in annotators
                if (doc['_user'] in ['*gold*', request.user]
                    or request.user is not None
                    and doc['_user'].startswith(request.user + '*'))
            ]
        doc_lst.append((request.user, docid, txt.decode('ISO-8859-15'),
                        users))
    return render_template('discourse_list.html',
                           corpus_name=db.corpus_name,
                           user=request.user,
                           results=doc_lst)
def render_discourse(request, disc_no):
    """Render the discourse annotation view for document *disc_no*.

    Loads the stored annotation state for the document (for the current
    user) and hands it to the template as JSON-encoded fields.
    """
    db = request.corpus
    corpus = db.corpus
    t_id = int(disc_no)
    doc = db.get_discourse(t_id, request.user)
    texts = corpus.attribute(corpus_d_sattr.get(db.corpus_name, 'text_id'),
                             's')
    sents = corpus.attribute("s", 's')
    start, _end, _attrs = texts[t_id]
    # Sentence index of the document's first corpus position.
    sent_id = sents.cpos2struc(start)
    # Mandatory fields are read directly; optional ones fall back to an
    # empty default of the appropriate shape.
    payload = dict((key, json.dumps(doc[key]))
                   for key in ('sentences', 'edus', 'tokens', 'indent'))
    payload['relations'] = json.dumps(doc.get('relations', ''))
    payload['nonedu'] = json.dumps(doc.get('nonedu', {}))
    payload['uedus'] = json.dumps(doc.get('uedus', {}))
    payload['topics'] = json.dumps(doc.get('topics', []))
    response = render_template_nocache(
        'discourse.html',
        corpus_name=json.dumps(request.corpus.corpus_name),
        disc_id=disc_no,
        sent_id=sent_id,
        **payload)
    request.set_corpus_cookie(response)
    return response
def __init__(self, ctx, db):
    """Cache all corpus attribute handles used during conversion.

    ctx -- conversion/serialization context, stored as-is
    db  -- corpus database wrapper, kept for later queries
    """
    self.ctx = ctx
    self.db = db
    corpus = db.corpus
    self.words = db.words
    # Structural ('s') attributes: document spans and sentence spans.
    self.texts = corpus.attribute(
        corpus_d_sattr.get(db.corpus_name, 'text_id'), 's')
    self.sentences = corpus.attribute("s", 's')
    # Positional ('p') attributes: one value per token.
    self.postags = corpus.attribute("pos", 'p')
    self.morph = corpus.attribute("morph", 'p')
    self.deprel = corpus.attribute("deprel", "p")
    self.attach = corpus.attribute("attach", "p")
    self.lemma = corpus.attribute("lemma", "p")
def render_sentence(request, sent_no):
    """Render a single sentence view (minimal variant, no parse tabs).

    *sent_no* is 1-based as seen by the user; sentences are addressed
    0-based internally, hence ``sno = int(sent_no) - 1``.

    Fix: removed dead locals ``max_sent`` and ``trees_out`` that were
    assigned but never used in this variant.
    """
    db = request.corpus
    tueba_corpus = db.corpus
    sno = int(sent_no) - 1
    words = db.words
    sents = db.sentences
    texts = tueba_corpus.attribute(
        corpus_sattr.get(db.corpus_name, 'text_id'), 's')
    texts_d = tueba_corpus.attribute(
        corpus_d_sattr.get(db.corpus_name, 'text_id'), 's')
    start, end = sents[sno][:2]
    tokens = []
    for i in xrange(start, end + 1):
        tokens.append(words[i].decode('ISO-8859-1'))
    # Map the sentence's last position to its text (and discourse text).
    t_id = texts.cpos2struc(end - 1)
    t_id_d = texts_d.cpos2struc(end - 1)
    unused_start, unused_end, t_attrs = texts[t_id]
    if db.corpus_name in corpus_urls:
        text_url = corpus_urls[db.corpus_name](t_attrs, db.corpus_name)
    else:
        text_url = '#'
    # NOTE(review): these results were never used in this variant; the
    # calls are kept in case the lookups matter — confirm they are pure
    # reads, then drop them.
    parses = db.get_parses(sno)
    alignments = db.get_alignments(sno)
    parses_html = ''
    response = render_template('sentence.tmpl',
                               sent_id=sno + 1,
                               sent_text=' '.join(tokens),
                               parses_html=parses_html,
                               text_id=t_attrs,
                               text_url=text_url,
                               prev_sent='/pycwb/sentence/%d' % (sno, ),
                               next_sent='/pycwb/sentence/%d' % (sno + 2, ),
                               disc_id=t_id_d,
                               corpus_name=request.corpus.corpus_name,
                               has_gold=False)
    request.set_corpus_cookie(response)
    return response
# NOTE(review): the statements below are the tail of a function whose start
# lies outside this view; the original indentation is not recoverable here.
# Register one EDU markable and attach it to its text markable.
edu_markable = Edu()
edu_markable.span = (start + ctx_start, end + ctx_start)
edu_markable.xml_id = 'edu_%s_%d_%d' % (t_id, next_sent, sub_edu)
text_markable.edus['%d.%d' % (next_sent, sub_edu)] = edu_markable
# edu_idx is the markable's position in the per-text EDU list.
edu_markable.edu_idx = len(text_markable.edu_list)
text_markable.edu_list.append(edu_markable)
ctx.register_object(edu_markable)
parse_relations(doc['relations'], text_markable, ctx)
if __name__ == '__main__':
    # Export discourse annotations for one annotator as an EXML document.
    db = database.get_corpus('TUEBA4')
    text_ids = db.corpus.attribute(
        corpus_d_sattr.get(db.corpus_name, 'text_id'), 's')
    # Annotator name from argv; defaults to the gold standard.
    if len(sys.argv) > 1:
        annotator = sys.argv[1]
    else:
        annotator = '*gold*'
    results = db.db.discourse.find({'_user': annotator})
    doc = make_implicit_doc()
    print '<?xml version="1.0" encoding="ISO-8859-15"?>'
    print '<exml-doc>'
    reader = DiscourseReader(doc, db)
    # do the actual conversion
    for r in results:
        try:
            docid = int(r['_docno'])
        except KeyError:
            # Record has no document number; ignore it.
            # NOTE(review): the loop body continues beyond this view.
            pass
def render_sentence(request, sent_no):
    """Render a single sentence view with parse/alignment/coref/level tabs.

    *sent_no* is 1-based as seen by the user; sentences are addressed
    0-based internally.  Builds the tabbed HTML for all available
    annotation layers into a StringIO buffer and hands it to the template.
    """
    db = request.corpus
    tueba_corpus = db.corpus
    sno = int(sent_no) - 1
    words = db.words
    sents = db.sentences
    # Text spans by the plain and by the discourse-level text_id attribute.
    texts = tueba_corpus.attribute(
        corpus_sattr.get(db.corpus_name, 'text_id'), 's')
    texts_d = tueba_corpus.attribute(
        corpus_d_sattr.get(db.corpus_name, 'text_id'), 's')
    max_sent = len(sents)
    start, end = sents[sno][:2]
    tokens = []
    for i in xrange(start, end + 1):
        tokens.append(words[i].decode('ISO-8859-1'))
    # Map the sentence's last position to its text; fetch the stored
    # coreference record and the gold discourse record for that text.
    t_id = texts.cpos2struc(end - 1)
    coref = db.db.referential.find_one({'_id': t_id})
    discourse = db.db.discourse.find_one({'_id': '%s~*gold*' % (t_id, )})
    t_id_d = texts_d.cpos2struc(end - 1)
    unused_start, unused_end, t_attrs = texts[t_id]
    if db.corpus_name in corpus_urls:
        text_url = corpus_urls[db.corpus_name](t_attrs, db.corpus_name)
    else:
        text_url = '#'
    parses = db.get_parses(sno)
    alignments = db.get_alignments(sno)
    trees_out = StringIO()
    # '_id' is the Mongo document key, not an annotation layer name.
    names_parses = sorted([k for k in parses.iterkeys() if k != '_id'])
    names_alignments = sorted(
        [k for k in alignments.iterkeys() if k != '_id'])
    if coref is not None:
        names_coref = sorted([k for k in coref.iterkeys() if k != '_id'])
    else:
        names_coref = []
    annotations = db.find_annotations([start, end], '*gold*')
    if names_parses or names_alignments or annotations:
        # First pass: emit the tab headers for every available layer.
        print >> trees_out, '<div id="parses-tabs">'
        print >> trees_out, '<ul class="nav nav-tabs">'
        for k in names_parses:
            print >> trees_out, '<li><a href="#parses-%s">%s (parse)</a></li>' % (
                k, k)
        for k in names_alignments:
            print >> trees_out, '<li><a href="#alignments-%s">%s (align)</a></li>' % (
                k, k)
        for k in names_coref:
            print >> trees_out, '<li><a href="#coref-%s">%s (coref)</a></li>' % (
                k, k)
        # Render each schema-level annotation into its own buffer so tabs
        # can be emitted per level below.
        levels = defaultdict(StringIO)
        for anno in annotations:
            level = anno['level']
            schema.schemas[level].make_display(anno, db, levels[level], None)
        names = sorted(levels.iterkeys())
        for k in names:
            print >> trees_out, '<li><a href="#level-tabs-%s">%s</a></li>' % (
                k, k)
        print >> trees_out, '</ul>'
        # Second pass: emit the tab bodies in the same order as the headers.
        for k in names_parses:
            v = parses[k]
            print >> trees_out, '<div id="parses-%s">' % (k, )
            #trees_out.write('<b>%s</b> <a href="javascript:$(\'tree:%s\').toggle()">[show]</a><br/>\n'%(k,k))
            t = export.from_json(v)
            csstree.write_html(t, trees_out, _id='tree-' + k)
            print >> trees_out, '</div>'
        for k in names_alignments:
            v = alignments[k]
            print >> trees_out, '<div id="alignments-%s">' % (k, )
            write_alignment(v, trees_out)
            print >> trees_out, '</div>'
        for k in names_coref:
            v = coref[k]
            print >> trees_out, '<div id="coref-%s">' % (k, )
            write_coref(db, v, trees_out, start, end + 1)
            print >> trees_out, '</div>'
        for k in names:
            print >> trees_out, '<div id="level-tabs-%s">' % (k, )
            trees_out.write(''.join(levels[k].getvalue()))
            print >> trees_out, "</div>"
        print >> trees_out, '</div>'
        parses_html = trees_out.getvalue().decode('ISO-8859-15')
    else:
        parses_html = ''
    response = render_template('sentence.tmpl',
                               sent_id=sno + 1,
                               sent_text=' '.join(tokens),
                               parses_html=parses_html,
                               text_id=t_attrs,
                               text_url=text_url,
                               prev_sent='/pycwb/sentence/%d' % (sno, ),
                               next_sent='/pycwb/sentence/%d' % (sno + 2, ),
                               disc_id=t_id_d,
                               corpus_name=request.corpus.corpus_name,
                               has_gold=(discourse is not None))
    request.set_corpus_cookie(response)
    return response