def get_evaluation_stats(self):
    """
    Collects the statistics that are shown on the top right.

    Both counts refer to the currently logged-in user, identified by the
    e-mail address of users.get_current_user().

    @return: a tuple (num_annotated, num_approved): the number of
             DocumentAnnotation entities stored for that user, and how many
             of them have approved set to True
    """
    email = users.get_current_user().email()

    def annotations_of_user():
        # A fresh query per count, so the extra filter added for the
        # second count never affects the first one.
        return DocumentAnnotation.all().filter('user_id =', email)

    total = annotations_of_user().count()
    approved = annotations_of_user().filter('approved =', True).count()
    return (total, approved)
def get_evaluation_stats(self):
    """
    Collects the statistics that are shown on the top right.

    The counts are taken for the e-mail address of the user returned by
    users.get_current_user().

    @return: a tuple (num_annotated, num_approved), where num_annotated is
             the number of DocumentAnnotation entities of that user and
             num_approved the number of those whose approved flag is True
    """
    current_email = users.get_current_user().email()

    # Everything the user has stored so far.
    started_query = DocumentAnnotation.all()
    started_query = started_query.filter('user_id =', current_email)
    num_annotated = started_query.count()

    # The approved subset, counted with a separate query.
    approved_query = DocumentAnnotation.all()
    approved_query = approved_query.filter('user_id =', current_email)
    approved_query = approved_query.filter('approved =', True)
    num_approved = approved_query.count()

    return (num_annotated, num_approved)
def store_annotations(self, user_id, data):
    """
    Create or overwrite the annotation of one user for one document.

    The document is looked up by data['doc']; the code reads its filename
    attribute directly, so the lookup is expected to succeed.

    @param user_id: value stored in, and queried by, the user_id property
    @param data: mapping with the keys 'doc', 'approved', 'concepts',
                 'relations', 'arg_units' and 'notes'; the items of the
                 three lists and the notes are wrapped in Text
    """
    document = Document.all().filter("filename =", data['doc']).get()
    filename = document.filename

    record = DocumentAnnotation.all().filter(
        "user_id =", user_id).filter("document =", filename).get()
    if not record:
        # Nothing stored yet for this user/document pair.
        record = DocumentAnnotation(user_id=user_id, document=filename)

    record.approved = data["approved"]
    record.concepts = [Text(entry) for entry in data["concepts"]]
    record.relations = [Text(entry) for entry in data["relations"]]
    record.arg_units = [Text(entry) for entry in data["arg_units"]]
    record.notes = Text(data['notes'])

    approval_tag = "[Approved]" if record.approved else ""
    log("".join([
        "Storing annotations ", approval_tag,
        " for ", str(filename),
        ": ", str(record.arg_units),
        # Line breaks in the notes are replaced so the entry stays on one line.
        " - notes: ", record.notes.encode("utf-8").replace("\n", " NL "),
    ]))

    # The entity is read back by the key that put() returns.
    # NOTE(review): presumably this forces the write to be applied before
    # the request continues - confirm.
    db.get(record.put())
def dump_corpus(self):
    """
    Write the whole corpus, including all stored annotations, to the
    response as a JSON attachment.

    The JSON document is a list. If a CorpusMetadata entity exists, the
    first element describes it; it is followed by one element per Document
    with the annotations of every user for that document.
    """
    def annotation_as_dict(annotation):
        return {
            'annotator': annotation.user_id,
            'arg_units': annotation.arg_units,
            'relations': annotation.relations,
            'concepts': annotation.concepts,
            # The approval flag is exported in its str() form.
            'approved': str(annotation.approved),
            'notes': annotation.notes,
        }

    documents = Document.all().run()
    entries = []

    metadata = CorpusMetadata.all().get()
    if metadata:
        entries.append({
            "corpus_metadata": "true",
            "segmenter": metadata.segmenter,
            "preprocessing_date": metadata.preprocessing_date,
        })

    for document in documents:
        stored = DocumentAnnotation.all().filter(
            "document =", document.filename).run()
        entries.append({
            'file': document.filename,
            'text': document.text,
            'url': document.url,
            'user_annotations': [annotation_as_dict(item) for item in stored],
            'num_tokens': document.num_tokens,
            'num_sentences': document.num_sentences,
        })

    # The download name carries the time at which the dump was produced.
    dump_filename = "dump_" + time.strftime("%Y-%m-%d_%H:%M:%S") + ".json"
    headers = self.response.headers
    headers['Content-Type'] = 'application/json'
    headers['Content-Disposition'] = "attachment; filename=%s" % dump_filename

    body = json.dumps(entries, indent=2, sort_keys=False,
                      separators=(',', ':'))
    self.response.write(body)
def load_annotations(self, user_id, doc):
    """
    Look up the annotation that a user stored for a document.

    Both conditions are applied in the datastore query, so only the
    matching entity is fetched instead of every annotation of the document.

    @param user_id: value of the user_id property to match
    @param doc: object whose filename attribute identifies the document
    @return: the first matching DocumentAnnotation, or None if the query
             has no result
    """
    return DocumentAnnotation.all().filter(
        "user_id =", user_id).filter("document =", doc.filename).get()
def get_processing_state(user_id):
    """
    Creates a dictionary {document-filename => processing state}

    Documents with an approved annotation of the user map to COMPLETE,
    documents with an unapproved one to IN_PROGRESS. The result is a
    defaultdict: looking up any other filename yields UNPROCESSED (and, as
    with every defaultdict, stores that entry).

    @param user_id: value of the user_id property to match
    @return: collections.defaultdict keyed by annotation.document
    """
    # Bound once, so the factory keeps returning the value seen at call time.
    unprocessed = UNPROCESSED
    processed_docs = collections.defaultdict(lambda: unprocessed)

    for annotation in DocumentAnnotation.all().filter("user_id =", user_id).run():
        if annotation.approved:
            processed_docs[annotation.document] = COMPLETE
        else:
            processed_docs[annotation.document] = IN_PROGRESS
    return processed_docs
def dump_corpus(self):
    """
    Export all documents and their annotations as a downloadable JSON file.

    The output is a JSON list: an optional leading element with the corpus
    metadata (only when a CorpusMetadata entity exists), then one element
    per Document containing its text, URL, sizes and the annotations of
    all users.
    """
    all_docs = Document.all().run()
    dump = []

    corpus_info = CorpusMetadata.all().get()
    if corpus_info:
        metadata_entry = {
            "corpus_metadata": "true",
            "segmenter": corpus_info.segmenter,
            "preprocessing_date": corpus_info.preprocessing_date,
        }
        dump.append(metadata_entry)

    for doc in all_docs:
        annotation_query = DocumentAnnotation.all().filter(
            "document =", doc.filename)
        user_annotations = []
        for anno in annotation_query.run():
            user_annotations.append({
                'annotator': anno.user_id,
                'arg_units': anno.arg_units,
                'relations': anno.relations,
                'concepts': anno.concepts,
                # Serialized as a string, not as a JSON boolean.
                'approved': str(anno.approved),
                'notes': anno.notes,
            })

        doc_entry = {
            'file': doc.filename,
            'text': doc.text,
            'url': doc.url,
            'user_annotations': user_annotations,
            'num_tokens': doc.num_tokens,
            'num_sentences': doc.num_sentences,
        }
        dump.append(doc_entry)

    timestamp = time.strftime("%Y-%m-%d_%H:%M:%S")
    dump_filename = "dump_" + timestamp + ".json"
    disposition = "attachment; filename=%s" % dump_filename

    self.response.headers['Content-Type'] = 'application/json'
    self.response.headers['Content-Disposition'] = disposition
    self.response.write(
        json.dumps(dump, indent=2, sort_keys=False, separators=(',', ':')))
def store_annotations(self, user_id, data):
    """
    Save what a user annotated in a document, replacing an earlier save.

    @param user_id: identifies the annotator; matched against the user_id
                    property
    @param data: mapping that provides 'doc' (filename of the document),
                 'approved', the lists 'concepts', 'relations' and
                 'arg_units', and 'notes'
    """
    doc = Document.all().filter("filename =", data['doc']).get()

    lookup = DocumentAnnotation.all()
    lookup = lookup.filter("user_id =", user_id)
    lookup = lookup.filter("document =", doc.filename)
    # Reuse the stored entity if there is one, otherwise start a new one.
    doc_annotation = lookup.get() or DocumentAnnotation(
        user_id=user_id, document=doc.filename)

    doc_annotation.approved = data["approved"]
    for list_field in ("concepts", "relations", "arg_units"):
        setattr(doc_annotation, list_field,
                [Text(item) for item in data[list_field]])
    doc_annotation.notes = Text(data['notes'])

    status = "[Approved]" if doc_annotation.approved else ""
    message = "Storing annotations " + status + " for " + str(doc.filename)
    message += ": " + str(doc_annotation.arg_units)
    # Newlines in the notes are spelled out to keep the log entry on one line.
    message += " - notes: " + doc_annotation.notes.encode("utf-8").replace(
        "\n", " NL ")
    log(message)

    # put() returns the key; the entity is immediately fetched again by it.
    saved_key = doc_annotation.put()
    db.get(saved_key)
def get(self):
    """
    Handle GET requests of the annotation view.

    Nothing is written when validate_user() returns a false value, or when
    the path ends with neither of the two suffixes below.

    - '/annotate': renders annotate.html for the current user, or for the
      user named in the 'user' request parameter if that name is one of
      access_control.get_all_users().
    - '/selecttopic': stores the 'topic' request values as the current
      topics of the user and redirects to '/argunit/annotate'.
    """
    if not self.validate_user():
        return

    path = self.request.path
    if path.endswith('/annotate'):
        user_id = access_control.get_current_user_id()
        # Allow to take the role of another user
        requested_user = self.request.get('user', None)
        if requested_user in access_control.get_all_users():
            user_id = requested_user

        documents_sorted_by_id = get_docs_ordered_by_id(user_id)
        processing_state = get_processing_state(user_id)
        # get_evaluation_stats() takes no user argument.
        num_annotated, num_approved = self.get_evaluation_stats()

        # Values needed by the template whether or not a document is shown.
        template_values = {
            'user': user_id,
            'logout_url': users.create_logout_url('/argunit/'),
            'all_views': access_control.get_view_ids(user_id),
            'current_view': access_control.ANNOTATE_VIEW_ID,
            'num_annotated': num_annotated,
            'num_approved': num_approved,
        }

        if documents_sorted_by_id:
            # The first document is handed over as third argument.
            # NOTE(review): presumably the fallback when 'doc' is missing
            # or unknown - confirm in mark_as_current_doc.
            doc = mark_as_current_doc(user_id,
                                      self.request.get('doc', None),
                                      documents_sorted_by_id[0])
            topics = get_all_topics()
            current_topic = get_current_topic(user_id)
            opening_time = time.strftime("%H:%M:%S %Z", time.localtime())
            navigation_docs = [(entry, processing_state[entry.filename])
                               for entry in documents_sorted_by_id]

            # Get arg_units to show
            stored = DocumentAnnotation.all().filter(
                'user_id =', user_id).filter(
                'document =', doc.filename).get()
            if stored:
                doc_approved = stored.approved
                notes = stored.notes
                arg_units = [str(item) for item in stored.arg_units]
                relations = [str(item) for item in stored.relations]
                concepts = [str(item) for item in stored.concepts]
            else:
                doc_approved = False
                notes = ""
                arg_units = []
                relations = []
                concepts = []

            template_values.update({
                'navigation_docs': navigation_docs,
                'documents_sorted_by_id': documents_sorted_by_id,
                'text': doc.text,
                'doc_url': doc.url,
                'doc_filename': doc.filename,
                'doc_approved': doc_approved,
                'num_sentences': doc.num_sentences,
                'num_tokens': doc.num_tokens,
                'time': opening_time,
                'arg_units': json.dumps(arg_units),
                'relations': json.dumps(relations),
                'concepts': json.dumps(concepts),
                'all_topics': topics,
                'current_topic': current_topic,
                'notes': notes,
                'has_document': True,
                'message': "",
            })
        else:
            template_values.update({
                'has_document': False,
                'navigation_docs': [],
                'message': "No documents to display.",
            })

        template = JINJA_ENVIRONMENT.get_template('annotate.html')
        self.response.write(template.render(template_values))
    elif path.endswith('/selecttopic'):
        user_id = access_control.get_current_user_id()
        set_as_current_topics(user_id, self.request.get_all('topic', None))
        self.redirect('/argunit/annotate')
def setApproval(self, annotator, document, isApproved):
    """
    Set the approved flag on the annotation of one user for one document.

    Does nothing if no such annotation is stored.

    @param annotator: value of the user_id property to match
    @param document: value of the document property to match
    @param isApproved: new value of the approved property
    """
    query = DocumentAnnotation.all()
    query = query.filter("user_id =", annotator)
    query = query.filter("document =", document)

    stored = query.get()
    if not stored:
        return

    stored.approved = isApproved
    stored.put()
def get(self):
    """
    Handle GET requests of the data management view.

    Nothing happens when validate_user() returns a false value. Otherwise
    the first matching path suffix decides, checked in this order:

    - 'dump': dump_corpus()
    - 'dropall': drop_all_data(), then redirect with a message
    - 'dropanno': drop_all_annotations(), then redirect with a message
    - 'loaddata': load_documents(), then redirect with its result
    - 'forceupdate': load_documents(force_update=True), then redirect
    - 'unapprove': clear the approval of the annotation named by the
      'annotator' and 'doc' request parameters, then redirect
    - '/managedata': render managedata.html
    - anything else: redirect to '/argunit/managedata'
    """
    user_id = access_control.get_current_user_id()
    if not self.validate_user():
        return

    def redirect_with(parameter, text):
        # Back to base_path(), passing the text on as one query parameter.
        self.redirect('%s?%s' % (self.base_path(),
                                 urllib.urlencode({parameter: text})))

    def state_of(user, filename):
        # One datastore lookup per user/document pair.
        anno = DocumentAnnotation.all().filter(
            "user_id =", user).filter("document =", filename).get()
        if not anno:
            return UNPROCESSED
        if not anno.approved:
            return IN_PROGRESS
        return COMPLETE

    path = self.request.path
    if path.endswith('dump'):
        self.dump_corpus()
        return
    if path.endswith('dropall'):
        self.drop_all_data()
        redirect_with("message", 'Dropped all data.')
        return
    if path.endswith('dropanno'):
        self.drop_all_annotations()
        redirect_with("message", 'Dropped all annotations.')
        return
    if path.endswith('loaddata'):
        redirect_with("verbatim_message", self.load_documents())
        return
    if path.endswith('forceupdate'):
        redirect_with("verbatim_message",
                      self.load_documents(force_update=True))
        return
    if path.endswith('unapprove'):
        annotator = self.request.get("annotator")
        document = self.request.get("doc")
        self.setApproval(annotator, document, False)
        redirect_with("message", "Unapproved: %s:%s" % (annotator, document))
        return
    if not path.endswith('/managedata'):
        self.redirect('/argunit/managedata')
        return

    all_documents = sorted(entry.filename for entry in Document.all())
    all_users = access_control.get_all_users()
    # Sorted in place, exactly as the list is handed out by access_control.
    all_users.sort()

    status_table = dict()
    for user in all_users:
        row = dict()
        for filename in all_documents:
            row[filename] = state_of(user, filename)
        status_table[user] = row

    # Cut the filename list into rows of at most documents_per_line entries.
    documents_per_line = 44
    num_docs = len(all_documents)
    partitioned_docs = [
        all_documents[start:start + documents_per_line]
        for start in range(0, num_docs, documents_per_line)
    ]

    message = self.request.get('message', "")
    verbatim_message = self.request.get('verbatim_message', "")

    metadata = CorpusMetadata.all().get()
    if metadata:
        segmenter = metadata.segmenter
        preprocessing_date = metadata.preprocessing_date
    else:
        segmenter = "unknown"
        preprocessing_date = "unknown"

    template_values = {
        'user': user_id,
        'logout_url': users.create_logout_url('/argunit/'),
        'all_views': access_control.get_view_ids(user_id),
        'current_view': access_control.MANAGE_DATA_VIEW_ID,
        'num_documents': len(all_documents),
        'segmenter': segmenter,
        'preprocessing_date': preprocessing_date,
        'all_documents': all_documents,
        'docs_per_line': documents_per_line,
        'partitioned_docs': partitioned_docs,
        'all_users': all_users,
        'status_table': status_table,
        'message': message,
        'verbatim_message': verbatim_message,
    }
    template = JINJA_ENVIRONMENT.get_template('managedata.html')
    self.response.write(template.render(template_values))
def drop_all_annotations(self):
    """
    Delete every DocumentAnnotation, ArgumentationUnit and UserData entity.

    The kinds are cleared in that order. Each kind is read with a keys-only
    query that has no result limit, so nothing is left behind when a kind
    holds more than 10000 entities, and no entity bodies are loaded just to
    be deleted.
    """
    def delete_all_of(model_class, chunk_size=500):
        # Collect all keys first, then delete them in bounded batches so a
        # single delete call never grows with the size of the kind.
        keys = list(model_class.all(keys_only=True).run(batch_size=chunk_size))
        for start in range(0, len(keys), chunk_size):
            db.delete(keys[start:start + chunk_size])

    for model_class in (DocumentAnnotation, ArgumentationUnit, UserData):
        delete_all_of(model_class)
def drop_all_data(self):
    """
    Delete every entity of all five kinds used by the application.

    The kinds are cleared in this order: DocumentAnnotation,
    ArgumentationUnit, Document, UserData, CorpusMetadata. Each kind is
    read with a keys-only query that has no result limit, so nothing is
    left behind when a kind holds more than 10000 entities, and no entity
    bodies are loaded just to be deleted.
    """
    def delete_all_of(model_class, chunk_size=500):
        # Collect all keys first, then delete them in bounded batches so a
        # single delete call never grows with the size of the kind.
        keys = list(model_class.all(keys_only=True).run(batch_size=chunk_size))
        for start in range(0, len(keys), chunk_size):
            db.delete(keys[start:start + chunk_size])

    kinds = (DocumentAnnotation, ArgumentationUnit, Document, UserData,
             CorpusMetadata)
    for model_class in kinds:
        delete_all_of(model_class)
def setApproval(self, annotator, document, isApproved):
    """
    Change the approval state of a stored annotation.

    The annotation is selected by annotator (user_id property) and document
    (document property). If the query has no result, nothing is written.

    @param annotator: user whose annotation is changed
    @param document: document the annotation belongs to
    @param isApproved: value assigned to the approved property
    """
    matching = (DocumentAnnotation.all()
                .filter("user_id =", annotator)
                .filter("document =", document))
    record = matching.get()
    if record:
        record.approved = isApproved
        record.put()
def get(self):
    """
    Serve the annotation page and the topic selection.

    Requests are ignored when validate_user() returns a false value and
    when the path matches neither suffix:

    - '/annotate' renders annotate.html. The page is built for the current
      user unless the 'user' request parameter names one of
      access_control.get_all_users(), in which case that user is shown.
    - '/selecttopic' passes the 'topic' request values to
      set_as_current_topics() and redirects to '/argunit/annotate'.
    """
    if not self.validate_user():
        return

    if self.request.path.endswith('/selecttopic'):
        # Only reached for paths that do not end with '/annotate', because
        # a path cannot end with both suffixes.
        user_id = access_control.get_current_user_id()
        topics = self.request.get_all('topic', None)
        set_as_current_topics(user_id, topics)
        self.redirect('/argunit/annotate')
        return

    if not self.request.path.endswith('/annotate'):
        return

    user_id = access_control.get_current_user_id()
    # Allow to take the role of another user
    if self.request.get('user', None) in access_control.get_all_users():
        user_id = self.request.get('user', None)

    documents_sorted_by_id = get_docs_ordered_by_id(user_id)
    processing_state = get_processing_state(user_id)
    # The statistics are taken from get_evaluation_stats() as they come.
    num_annotated, num_approved = self.get_evaluation_stats()

    template_values = {}
    template_values['user'] = user_id
    template_values['logout_url'] = users.create_logout_url('/argunit/')
    template_values['all_views'] = access_control.get_view_ids(user_id)
    template_values['current_view'] = access_control.ANNOTATE_VIEW_ID
    template_values['num_annotated'] = num_annotated
    template_values['num_approved'] = num_approved

    if not documents_sorted_by_id:
        template_values['has_document'] = False
        template_values['navigation_docs'] = []
        template_values['message'] = "No documents to display."
    else:
        doc_filename = self.request.get('doc', None)
        doc = mark_as_current_doc(user_id, doc_filename,
                                  documents_sorted_by_id[0])
        topics = get_all_topics()
        current_topic = get_current_topic(user_id)
        opening_time = time.strftime("%H:%M:%S %Z", time.localtime())

        docs_with_processing_state = []
        for listed in documents_sorted_by_id:
            docs_with_processing_state.append(
                (listed, processing_state[listed.filename]))

        # Get arg_units to show; empty defaults apply when the user has
        # not stored anything for this document yet.
        shown = {'arg_units': [], 'relations': [], 'concepts': []}
        doc_approved = False
        notes = ""
        personal_annotations = DocumentAnnotation.all().filter(
            'user_id =', user_id).filter('document =', doc.filename).get()
        if personal_annotations:
            doc_approved = personal_annotations.approved
            notes = personal_annotations.notes
            for field in ('arg_units', 'relations', 'concepts'):
                shown[field] = [
                    str(item)
                    for item in getattr(personal_annotations, field)
                ]

        template_values.update(
            navigation_docs=docs_with_processing_state,
            documents_sorted_by_id=documents_sorted_by_id,
            text=doc.text,
            doc_url=doc.url,
            doc_filename=doc.filename,
            doc_approved=doc_approved,
            num_sentences=doc.num_sentences,
            num_tokens=doc.num_tokens,
            time=opening_time,
            arg_units=json.dumps(shown['arg_units']),
            relations=json.dumps(shown['relations']),
            concepts=json.dumps(shown['concepts']),
            all_topics=topics,
            current_topic=current_topic,
            notes=notes,
            has_document=True,
            message="")

    template = JINJA_ENVIRONMENT.get_template('annotate.html')
    self.response.write(template.render(template_values))
def get(self):
    """
    Dispatch GET requests of the data management pages by path suffix.

    After validate_user() succeeded, the suffixes are tested in this order
    and the first match wins: 'dump', 'dropall', 'dropanno', 'loaddata',
    'forceupdate', 'unapprove', '/managedata'. All actions except 'dump'
    and '/managedata' end with a redirect to base_path() that carries a
    'message' or 'verbatim_message' query parameter. '/managedata' renders
    managedata.html; any other path is redirected to '/argunit/managedata'.
    """
    user_id = access_control.get_current_user_id()
    if not self.validate_user():
        return

    path = self.request.path
    if path.endswith('dump'):
        self.dump_corpus()
    elif path.endswith('dropall'):
        self.drop_all_data()
        params = {"message": 'Dropped all data.'}
        self.redirect(
            '%s?%s' % (self.base_path(), urllib.urlencode(params)))
    elif path.endswith('dropanno'):
        self.drop_all_annotations()
        params = {"message": 'Dropped all annotations.'}
        self.redirect(
            '%s?%s' % (self.base_path(), urllib.urlencode(params)))
    elif path.endswith('loaddata'):
        params = {"verbatim_message": self.load_documents()}
        self.redirect(
            '%s?%s' % (self.base_path(), urllib.urlencode(params)))
    elif path.endswith('forceupdate'):
        params = {
            "verbatim_message": self.load_documents(force_update=True)
        }
        self.redirect(
            '%s?%s' % (self.base_path(), urllib.urlencode(params)))
    elif path.endswith('unapprove'):
        annotator = self.request.get("annotator")
        document = self.request.get("doc")
        self.setApproval(annotator, document, False)
        params = {"message": "Unapproved: %s:%s" % (annotator, document)}
        self.redirect(
            '%s?%s' % (self.base_path(), urllib.urlencode(params)))
    elif path.endswith('/managedata'):
        all_documents = []
        for stored_doc in Document.all():
            all_documents.append(stored_doc.filename)
        all_documents.sort()

        all_users = access_control.get_all_users()
        all_users.sort()

        # status_table[user][filename] is the processing state of that
        # document for that user; each cell costs one datastore lookup.
        status_table = {}
        for user in all_users:
            per_user = status_table[user] = {}
            for filename in all_documents:
                anno = DocumentAnnotation.all().filter(
                    "user_id =", user).filter("document =", filename).get()
                if anno and anno.approved:
                    per_user[filename] = COMPLETE
                elif anno:
                    per_user[filename] = IN_PROGRESS
                else:
                    per_user[filename] = UNPROCESSED

        documents_per_line = 44
        num_docs = len(all_documents)
        # Number of rows, rounded up; slicing clamps the last, shorter row.
        num_lines = (num_docs + documents_per_line - 1) // documents_per_line
        partitioned_docs = [
            all_documents[line * documents_per_line:
                          (line + 1) * documents_per_line]
            for line in range(num_lines)
        ]

        message = self.request.get('message', "")
        verbatim_message = self.request.get('verbatim_message', "")

        metadata = CorpusMetadata.all().get()
        segmenter = metadata.segmenter if metadata else "unknown"
        preprocessing_date = (metadata.preprocessing_date
                              if metadata else "unknown")

        template_values = dict(
            user=user_id,
            logout_url=users.create_logout_url('/argunit/'),
            all_views=access_control.get_view_ids(user_id),
            current_view=access_control.MANAGE_DATA_VIEW_ID,
            num_documents=len(all_documents),
            segmenter=segmenter,
            preprocessing_date=preprocessing_date,
            all_documents=all_documents,
            docs_per_line=documents_per_line,
            partitioned_docs=partitioned_docs,
            all_users=all_users,
            status_table=status_table,
            message=message,
            verbatim_message=verbatim_message)

        template = JINJA_ENVIRONMENT.get_template('managedata.html')
        self.response.write(template.render(template_values))
    else:
        self.redirect('/argunit/managedata')