# Python 2 module. Standard-library imports needed by the views below;
# project-internal names (render_template, escape, NotFound, ADMINS,
# corpus_sattr, corpus_d_sattr, corpus_urls, isolate_relations,
# make_comparison, render_document_html, write_alignment, write_coref,
# schema, export, csstree) are assumed to be imported elsewhere in the
# original module.
import sys
from collections import defaultdict
from cStringIO import StringIO


def discourse_rels(request):
    """Lists the current user's discourse relations, grouped by relation
    label and sorted by frequency."""
    db = request.corpus
    words = db.words
    text_ids = db.corpus.attribute(
        corpus_d_sattr.get(db.corpus_name, 'text_id'), 's')
    results = db.db.discourse.find({'_user': request.user})
    rel_counts = defaultdict(int)
    rel_occurrences = defaultdict(list)
    sum_all = 0
    for r in results:
        try:
            docid = int(r['_docno'])
        except KeyError:
            # annotations without a document number are skipped
            pass
        else:
            txt0 = text_ids[docid]
            # label each document with its text id and its first five tokens
            txt = "%s: %s" % (txt0[2], ' '.join(words[txt0[0]:txt0[0] + 5]))
            txt = txt.decode('ISO-8859-15')
            rels = isolate_relations(r['relations'])
            for k in rels:
                sum_all += len(rels[k])
                rel_counts[k] += len(rels[k])
                rel_occurrences[k].append((docid, txt, rels[k]))
    result = []
    for rel in sorted(rel_counts.keys(), key=lambda x: -rel_counts[x]):
        result.append((rel, rel_counts[rel], rel_occurrences[rel]))
    return render_template('discourse_rels.html',
                           corpus_name=db.corpus_name,
                           results=result,
                           sum_all=sum_all)
def list_discourse(request):
    """Lists, per document, which users (plus *gold*) have discourse
    annotations; admins see all annotators."""
    db = request.corpus
    words = db.words
    text_ids = db.corpus.attribute(
        corpus_d_sattr.get(db.corpus_name, 'text_id'), 's')
    docids = sorted(set([
        r['_docno']
        for r in db.db.discourse.find(
            {'_user': {'$in': [request.user, '*gold*']}})
        if '_docno' in r
    ]))
    doc_lst = []
    for docid in docids:
        txt0 = text_ids[docid]
        txt = "%s: %s" % (txt0[2], ' '.join(words[txt0[0]:txt0[0] + 5]))
        if request.user in ADMINS:
            users = [doc['_user']
                     for doc in db.db.discourse.find({'_docno': docid})]
        else:
            # non-admins only see *gold*, themselves, and their own
            # derived annotation sets ("<user>*...")
            users = [doc['_user']
                     for doc in db.db.discourse.find({'_docno': docid})
                     if (doc['_user'] in ['*gold*', request.user]
                         or request.user is not None
                         and doc['_user'].startswith(request.user + '*'))]
        doc_lst.append((request.user, docid,
                        txt.decode('ISO-8859-15'), users))
    return render_template('discourse_list.html',
                           corpus_name=db.corpus_name,
                           user=request.user,
                           results=doc_lst)
def render_search(request, word):
    """Searches the corpus for a single word form and returns all matching
    sentences with the hit highlighted."""
    db = request.corpus
    tueba_corpus = db.corpus
    print >> sys.stderr, repr(tueba_corpus)
    words = tueba_corpus.attribute("word", "p")
    sents = tueba_corpus.attribute("s", 's')
    matches = []
    try:
        idlist = words.find(word)
        message = '%d Treffer.' % (len(idlist),)
        for k in idlist:
            sno = sents.cpos2struc(k)
            tokens = []
            start, end = sents[sno][:2]
            for i in xrange(start, end + 1):
                w = words[i].decode('ISO-8859-1')
                if i == k:
                    # highlight the matched token
                    tokens.append(u'<b>%s</b>' % (escape(w),))
                else:
                    tokens.append(escape(w))
            matches.append((sno + 1, ' '.join(tokens)))
    except KeyError:
        # the word form does not occur in the corpus lexicon
        message = 'Nichts gefunden.'
    return render_template('matches.tmpl',
                           word=escape(word.decode('ISO-8859-15')),
                           matches=matches,
                           message=message)
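# The lookup pattern above recurs throughout this module: a positional
# attribute maps word forms to corpus positions, and a structural attribute
# maps corpus positions to sentence spans. A minimal illustrative helper,
# assuming the cwb-python bindings used above (attribute(), find(),
# cpos2struc(), and index access on structural attributes); it is not called
# by any view:
def _sentence_span_for(corpus, cpos):
    """Returns (sentence_number, start_cpos, end_cpos) for a corpus position."""
    sents = corpus.attribute('s', 's')  # structural attribute: sentences
    sno = sents.cpos2struc(cpos)        # corpus position -> sentence index
    start, end = sents[sno][:2]         # sentence boundaries as positions
    return sno, start, end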
def compare_discourse(request, disc_no):
    """Compares two users' discourse annotations of the same document:
    EDU segmentation, topic boundaries, and discourse relations."""
    db = request.corpus
    t_id = int(disc_no)
    if 'user1' not in request.args and 'user2' not in request.args:
        # default: compare the current user against the gold annotation
        user1 = request.user
        user2 = '*gold*'
    else:
        if 'user1' not in request.args or 'user2' not in request.args:
            return NotFound('need user1, user2')
        user1 = request.args['user1']
        user2 = request.args['user2']
    doc1 = db.get_discourse(t_id, user1)
    doc2 = db.get_discourse(t_id, user2)
    tokens = doc1['tokens']
    sentences = doc1['sentences']
    sent_gold = sentences[:]
    sent_gold.append(len(tokens))
    # sentence boundaries are EDU boundaries by definition; only
    # sentence-internal boundaries are interesting for the comparison
    exclude = set(sent_gold)
    edus1 = doc1['edus']
    edus2 = doc2['edus']
    interesting1 = set(edus1).difference(exclude)
    interesting2 = set(edus2).difference(exclude)
    common = interesting1.intersection(interesting2)
    edu_only1 = interesting1.difference(interesting2)
    edu_only2 = interesting2.difference(interesting1)
    diffs_seg = []
    markers = []
    sent_idx = 0
    for n in sorted(edu_only1.union(edu_only2)):
        while sent_gold[sent_idx] < n:
            sent_idx += 1
        if n in edu_only1:
            diagnosis = "Nur %s" % (user1,)
            markers.append((n, '1', 'edu'))
        else:
            diagnosis = "Nur %s" % (user2,)
            markers.append((n, '2', 'edu'))
        # show two tokens of context on either side of the boundary
        diffs_seg.append((diagnosis,
                          "[%d] %s | %s" % (sent_idx,
                                            ' '.join(tokens[n - 2:n]),
                                            ' '.join(tokens[n:n + 2]))))
    n_common = len(common)
    if n_common == 0:
        f_val_seg = 0
    else:
        # F1 over sentence-internal EDU boundaries; the factor must be a
        # float to avoid Python 2 integer division (which always yields 0
        # here, since 2*n_common never exceeds the denominator)
        f_val_seg = 2.0 * n_common / (len(interesting1) + len(interesting2))
    diffs_topic = []
    topics1 = dict(doc1.get('topics', []))
    topics2 = dict(doc2.get('topics', []))
    topics = []
    sent_idx = 0
    for start, topic_str in sorted(topics1.iteritems()):
        if start not in topics2:
            while sent_gold[sent_idx] < start:
                sent_idx += 1
            diffs_topic.append(("Nur %s" % (user1,),
                                "[%s] %s" % (sent_idx, topic_str)))
            topics.append((start,
                           '<span class="marker1">[%s]</span> %s'
                           % (user1, topic_str)))
        else:
            topics.append((start, '%s / %s' % (topic_str, topics2[start])))
    sent_idx = 0  # rewind for the second pass over topics2
    for start, topic_str in sorted(topics2.iteritems()):
        if start not in topics1:
            while sent_gold[sent_idx] < start:
                sent_idx += 1
            diffs_topic.append(("Nur %s" % (user2,),
                                "[%s] %s" % (sent_idx, topic_str)))
            topics.append((start,
                           '<span class="marker2">[%s]</span> %s'
                           % (user2, topic_str)))
    topics.sort()
    users = [doc['_user'] for doc in db.db.discourse.find({'_docno': t_id})]
    comp_result = make_comparison(db, t_id, user1, user2)
    rels = comp_result.rels_compare.make_display_rels()
    # render common view of the discourse; doc1 serves as the textual basis
    # (the original passed an undefined "doc" here)
    display = render_document_html(doc1, rels, markers,
                                   replacement_topics=topics)
    return render_template('discourse_diff.html',
                           display=display.decode('ISO-8859-15'),
                           all_users=users,
                           docid=t_id,
                           user1=user1,
                           user2=user2,
                           sentences=sentences,
                           f_val_seg=f_val_seg,
                           diffs_seg=diffs_seg,
                           diffs_topic=diffs_topic)
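# For reference, the segmentation score above is the F1 measure over
# sentence-internal EDU boundaries: with precision |common|/|b| and recall
# |common|/|a|, the harmonic mean simplifies to 2*|common| / (|a| + |b|).
# A minimal standalone sketch (hypothetical helper, mirroring the
# computation in compare_discourse; not called by any view):
def _segmentation_f1(boundaries1, boundaries2, sentence_boundaries):
    """F1 between two EDU boundary sets, ignoring sentence boundaries."""
    exclude = set(sentence_boundaries)
    a = set(boundaries1) - exclude
    b = set(boundaries2) - exclude
    common = a & b
    if not common:
        return 0.0
    return 2.0 * len(common) / (len(a) + len(b))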
def render_sentence(request, sent_no):
    """Renders a single sentence together with all available annotation
    layers: parses, alignments, coreference, and span annotations."""
    db = request.corpus
    tueba_corpus = db.corpus
    sno = int(sent_no) - 1
    words = db.words
    sents = db.sentences
    texts = tueba_corpus.attribute(
        corpus_sattr.get(db.corpus_name, 'text_id'), 's')
    texts_d = tueba_corpus.attribute(
        corpus_d_sattr.get(db.corpus_name, 'text_id'), 's')
    start, end = sents[sno][:2]
    tokens = []
    for i in xrange(start, end + 1):
        tokens.append(words[i].decode('ISO-8859-1'))
    t_id = texts.cpos2struc(end - 1)
    coref = db.db.referential.find_one({'_id': t_id})
    discourse = db.db.discourse.find_one({'_id': '%s~*gold*' % (t_id,)})
    t_id_d = texts_d.cpos2struc(end - 1)
    unused_start, unused_end, t_attrs = texts[t_id]
    if db.corpus_name in corpus_urls:
        text_url = corpus_urls[db.corpus_name](t_attrs, db.corpus_name)
    else:
        text_url = '#'
    parses = db.get_parses(sno)
    alignments = db.get_alignments(sno)
    trees_out = StringIO()
    names_parses = sorted([k for k in parses.iterkeys() if k != '_id'])
    names_alignments = sorted([k for k in alignments.iterkeys()
                               if k != '_id'])
    if coref is not None:
        names_coref = sorted([k for k in coref.iterkeys() if k != '_id'])
    else:
        names_coref = []
    annotations = db.find_annotations([start, end], '*gold*')
    if names_parses or names_alignments or annotations:
        # build one tab per annotation layer
        print >> trees_out, '<div id="parses-tabs">'
        print >> trees_out, '<ul class="nav nav-tabs">'
        for k in names_parses:
            print >> trees_out, ('<li><a href="#parses-%s">%s (parse)'
                                 '</a></li>' % (k, k))
        for k in names_alignments:
            print >> trees_out, ('<li><a href="#alignments-%s">%s (align)'
                                 '</a></li>' % (k, k))
        for k in names_coref:
            print >> trees_out, ('<li><a href="#coref-%s">%s (coref)'
                                 '</a></li>' % (k, k))
        levels = defaultdict(StringIO)
        for anno in annotations:
            level = anno['level']
            schema.schemas[level].make_display(anno, db, levels[level], None)
        names = sorted(levels.iterkeys())
        for k in names:
            print >> trees_out, ('<li><a href="#level-tabs-%s">%s</a></li>'
                                 % (k, k))
        print >> trees_out, '</ul>'
        for k in names_parses:
            v = parses[k]
            print >> trees_out, '<div id="parses-%s">' % (k,)
            t = export.from_json(v)
            csstree.write_html(t, trees_out, _id='tree-' + k)
            print >> trees_out, '</div>'
        for k in names_alignments:
            v = alignments[k]
            print >> trees_out, '<div id="alignments-%s">' % (k,)
            write_alignment(v, trees_out)
            print >> trees_out, '</div>'
        for k in names_coref:
            v = coref[k]
            print >> trees_out, '<div id="coref-%s">' % (k,)
            write_coref(db, v, trees_out, start, end + 1)
            print >> trees_out, '</div>'
        for k in names:
            print >> trees_out, '<div id="level-tabs-%s">' % (k,)
            trees_out.write(levels[k].getvalue())
            print >> trees_out, '</div>'
        print >> trees_out, '</div>'
        parses_html = trees_out.getvalue().decode('ISO-8859-15')
    else:
        parses_html = ''
    response = render_template('sentence.tmpl',
                               sent_id=sno + 1,
                               sent_text=' '.join(tokens),
                               parses_html=parses_html,
                               text_id=t_attrs,
                               text_url=text_url,
                               prev_sent='/pycwb/sentence/%d' % (sno,),
                               next_sent='/pycwb/sentence/%d' % (sno + 2,),
                               disc_id=t_id_d,
                               corpus_name=request.corpus.corpus_name,
                               has_gold=(discourse is not None))
    request.set_corpus_cookie(response)
    return response
def senseEditor(request):
    """Entry page for the word-sense annotation editor."""
    db = request.corpus
    return render_template('senses.html', corpus_name=db.corpus_name)