def store_annotations(self, user_id, data):
    """Persist the annotation payload in *data* for *user_id*.

    Looks up the document named by data['doc'], fetches (or creates) the
    matching DocumentAnnotation for this user, overwrites its fields from
    the payload, logs a one-line summary, and writes it to the datastore.
    """
    filename = data['doc']
    doc = Document.all().filter("filename =", filename).get()

    annotation = DocumentAnnotation.all().filter(
        "user_id =", user_id).filter("document =", doc.filename).get()
    if not annotation:
        annotation = DocumentAnnotation(user_id=user_id,
                                        document=doc.filename)

    annotation.approved = data["approved"]
    annotation.concepts = [Text(entry) for entry in data["concepts"]]
    annotation.relations = [Text(entry) for entry in data["relations"]]
    annotation.arg_units = [Text(entry) for entry in data["arg_units"]]
    annotation.notes = Text(data['notes'])

    approval_tag = "[Approved]" if annotation.approved else ""
    flat_notes = annotation.notes.encode("utf-8").replace("\n", " NL ")
    log("Storing annotations " + approval_tag + " for " + str(doc.filename)
        + ": " + str(annotation.arg_units) + " - notes: " + flat_notes)

    # NOTE(review): reading the entity back right after put() presumably
    # forces the datastore write to apply before returning — confirm.
    db.get(annotation.put())
def get_docs_in_topics_by_id(topics):
    """Return all documents whose topic is in *topics*, ordered by filename.

    An empty/falsy *topics* collection matches every document.
    """
    ordered_docs = Document.all().order('filename').run()
    return [doc for doc in ordered_docs if not topics or doc.topic in topics]
def get_all_topics():
    """Collect and sort the unique topics in the document collection.

    The ALL_TOPICS_PLACEHOLDER pseudo-topic is always included.
    """
    unique_topics = set(doc.topic for doc in Document.all().run())
    unique_topics.add(ALL_TOPICS_PLACEHOLDER)
    return sorted(unique_topics)
def dump_corpus(self):
    """Serve the whole corpus (metadata, documents, annotations) as a
    JSON attachment download."""
    payload = []

    # Optional corpus-level metadata record goes first.
    metadata = CorpusMetadata.all().get()
    if metadata:
        payload.append({"corpus_metadata": "true",
                        "segmenter": metadata.segmenter,
                        "preprocessing_date": metadata.preprocessing_date})

    for doc in Document.all().run():
        annotations = []
        annotation_query = DocumentAnnotation.all().filter(
            "document =", doc.filename)
        for annotation in annotation_query.run():
            annotations.append({'annotator': annotation.user_id,
                                'arg_units': annotation.arg_units,
                                'relations': annotation.relations,
                                'concepts': annotation.concepts,
                                'approved': str(annotation.approved),
                                'notes': annotation.notes})
        payload.append({'file': doc.filename,
                        'text': doc.text,
                        'url': doc.url,
                        'user_annotations': annotations,
                        'num_tokens': doc.num_tokens,
                        'num_sentences': doc.num_sentences})

    dump_filename = "dump_" + time.strftime("%Y-%m-%d_%H:%M:%S") + ".json"
    self.response.headers['Content-Type'] = 'application/json'
    self.response.headers['Content-Disposition'] = (
        "attachment; filename=%s" % dump_filename)
    self.response.write(json.dumps(payload, indent=2, sort_keys=False,
                                   separators=(',', ':')))
def dump_corpus(self):
    """Serve the whole corpus (metadata, documents, annotations) as a
    JSON attachment download.

    Builds one JSON array: an optional corpus-metadata record followed by
    one record per document carrying all of its user annotations.
    """
    docs = Document.all().run()
    jsonResponse = []
    metadata = CorpusMetadata.all().get()
    if metadata:
        jsonResponse.append({
            "corpus_metadata": "true",
            "segmenter": metadata.segmenter,
            "preprocessing_date": metadata.preprocessing_date
        })
    for doc in docs:
        annotations = []
        for annotation in DocumentAnnotation.all().filter(
                "document =", doc.filename).run():
            anno_dict = {}
            anno_dict['annotator'] = annotation.user_id
            anno_dict['arg_units'] = annotation.arg_units
            anno_dict['relations'] = annotation.relations
            anno_dict['concepts'] = annotation.concepts
            anno_dict['approved'] = str(annotation.approved)
            anno_dict['notes'] = annotation.notes
            annotations.append(anno_dict)
        jsonResponse.append({
            'file': doc.filename,
            'text': doc.text,
            'url': doc.url,
            'user_annotations': annotations,
            'num_tokens': doc.num_tokens,
            'num_sentences': doc.num_sentences
        })
    # BUGFIX: ':' is not a valid character for an unquoted HTTP
    # Content-Disposition filename parameter (RFC 6266 token rules) and is
    # illegal in Windows file names — use '-' separators and quote the value.
    dump_filename = "dump_" + time.strftime("%Y-%m-%d_%H-%M-%S") + ".json"
    self.response.headers['Content-Type'] = 'application/json'
    self.response.headers[
        'Content-Disposition'] = 'attachment; filename="%s"' % dump_filename
    self.response.write(
        json.dumps(jsonResponse, indent=2, sort_keys=False,
                   separators=(',', ':')))
def mark_as_current_doc(user_id, doc_filename, default_doc):
    """Select and persist the current document for *user_id*.

    Precedence:
      1. An explicit *doc_filename* wins; if no document with that
         filename exists, *default_doc* is used instead.
      2. Otherwise the user's previously stored current document.
      3. Otherwise *default_doc*.

    Returns the chosen document after storing it on the user's record.
    """
    user_data = load_user_data(user_id)

    if doc_filename:
        found = Document.all().filter("filename =", doc_filename).get()
        doc = found if found else default_doc
    elif user_data.current_doc:
        doc = user_data.current_doc
    else:
        doc = default_doc

    user_data.current_doc = doc
    # NOTE(review): get() after put() presumably forces the write to
    # apply before we return — confirm.
    db.get(user_data.put())
    return doc
def store_annotations(self, user_id, data):
    """Save the annotation fields carried in *data* for *user_id*.

    Creates the DocumentAnnotation entity on first save for this
    user/document pair, otherwise updates the existing one in place.
    """
    doc = Document.all().filter("filename =", data['doc']).get()

    query = DocumentAnnotation.all()
    query.filter("user_id =", user_id)
    query.filter("document =", doc.filename)
    doc_annotation = query.get()
    if not doc_annotation:
        doc_annotation = DocumentAnnotation(user_id=user_id,
                                            document=doc.filename)

    doc_annotation.approved = data["approved"]
    # All list-valued fields are wrapped item-by-item in datastore Text.
    for field in ("concepts", "relations", "arg_units"):
        setattr(doc_annotation, field, [Text(v) for v in data[field]])
    doc_annotation.notes = Text(data['notes'])

    flattened_notes = doc_annotation.notes.encode("utf-8").replace(
        "\n", " NL ")
    log("Storing annotations "
        + ("[Approved]" if doc_annotation.approved else "")
        + " for " + str(doc.filename)
        + ": " + str(doc_annotation.arg_units)
        + " - notes: " + flattened_notes)

    db.get(doc_annotation.put())
def drop_all_data(self):
    """Delete every stored entity of each application model kind."""
    # fetch(10000) caps each bulk delete at the same limit the original
    # per-model calls used.
    for model in (DocumentAnnotation, ArgumentationUnit, Document,
                  UserData, CorpusMetadata):
        db.delete(model.all().fetch(10000))
def post(self):
    """Handle manage-data POSTs: import annotations or a full corpus dump.

    Routes on the request path suffix:
      * 'importannotations' — drop all stored annotations, then recreate
        them from the uploaded JSON dump (documents must already exist).
      * 'importdump' — drop the entire datastore contents and rebuild
        documents, annotations and corpus metadata from the dump.
    Either way, redirects back to the base path with status messages.
    """
    if not self.validate_user():
        return
    if self.request.path.endswith('importannotations'):
        dump_content = self.request.get("dumpfile")
        if dump_content:
            # Existing annotations are replaced wholesale by the import.
            self.drop_all_annotations()
            verbatim_message = ""
            json_data = json.loads(dump_content)
            for doc_data in json_data:
                # Metadata records carry no annotations; skip them.
                if "corpus_metadata" in doc_data:
                    continue
                filename = doc_data["file"]
                doc = Document.all().filter("filename =", filename).get()
                if doc:
                    arg_units = []
                    relations = []
                    concepts = []
                    for annotation_data in doc_data["user_annotations"]:
                        anno = self.annotation_from_json(
                            annotation_data, doc)
                        arg_units.extend(anno.arg_units)
                        relations.extend(anno.relations)
                        concepts.extend(anno.concepts)
                        anno.put()
                    verbatim_message += "IMPORTED %25s: %4d arg. units\n" % (
                        filename, len(arg_units))
                else:
                    # Annotations for unknown documents are not imported.
                    verbatim_message += "SKIPPED %25s: Document not in collection.\n" % (
                        filename)
            message = "Annotations imported."
        else:
            verbatim_message = ""
            message = "No file to import!"
        self.redirect(
            '%s?%s' % (self.base_path(),
                       urllib.urlencode({
                           "message": message,
                           "verbatim_message": verbatim_message
                       })))
    if self.request.path.endswith('importdump'):
        dump_content = self.request.get("dumpfile")
        if dump_content:
            # A dump import replaces everything currently stored.
            self.drop_all_data()
            json_data = json.loads(dump_content)
            docs = []
            annos = []
            for doc_data in json_data:
                if "corpus_metadata" in doc_data:
                    metadata = extract_metadata(doc_data)
                    metadata.put()
                else:
                    doc = Document()
                    initialize_document(doc, doc_data)
                    docs.append(doc)
                    for annotation_data in doc_data["user_annotations"]:
                        anno = self.annotation_from_json(
                            annotation_data, doc)
                        annos.append(anno)
            # Batched writes: one put for all documents, one for all
            # annotations.
            db.put(docs)
            db.put(annos)
            message = "Corpus dump imported."
        else:
            message = "No file to import!"
        self.redirect(
            '%s?%s' % (self.base_path(),
                       urllib.urlencode({"message": message})))
def get(self):
    """Dispatch manage-data GET actions based on the request path suffix.

    Supported suffixes: dump, dropall, dropanno, loaddata, forceupdate,
    unapprove, and '/managedata' (which renders the management overview
    page). Anything else redirects to /argunit/managedata.
    """
    user_id = access_control.get_current_user_id()
    if not self.validate_user():
        return
    elif self.request.path.endswith('dump'):
        self.dump_corpus()
    elif self.request.path.endswith('dropall'):
        self.drop_all_data()
        self.redirect(
            '%s?%s' % (self.base_path(),
                       urllib.urlencode({"message": 'Dropped all data.'})))
    elif self.request.path.endswith('dropanno'):
        self.drop_all_annotations()
        self.redirect('%s?%s' % (
            self.base_path(),
            urllib.urlencode({"message": 'Dropped all annotations.'})))
    elif self.request.path.endswith('loaddata'):
        response_text = self.load_documents()
        self.redirect(
            '%s?%s' % (self.base_path(),
                       urllib.urlencode({"verbatim_message": response_text})))
    elif self.request.path.endswith('forceupdate'):
        response_text = self.load_documents(force_update=True)
        self.redirect(
            '%s?%s' % (self.base_path(),
                       urllib.urlencode({"verbatim_message": response_text})))
    elif self.request.path.endswith('unapprove'):
        annotator = self.request.get("annotator")
        document = self.request.get("doc")
        self.setApproval(annotator, document, False)
        response_text = "Unapproved: %s:%s" % (annotator, document)
        self.redirect(
            '%s?%s' % (self.base_path(),
                       urllib.urlencode({"message": response_text})))
    elif self.request.path.endswith('/managedata'):
        all_documents = [doc.filename for doc in Document.all()]
        all_documents.sort()
        all_users = access_control.get_all_users()
        all_users.sort()
        # Per-user, per-document annotation status for the overview table.
        status_table = dict()
        for user in all_users:
            status_table[user] = dict()
            for doc in all_documents:
                anno = DocumentAnnotation.all().filter(
                    "user_id =", user).filter("document =", doc).get()
                if not anno:
                    status_table[user][doc] = UNPROCESSED
                elif not anno.approved:
                    status_table[user][doc] = IN_PROGRESS
                else:
                    status_table[user][doc] = COMPLETE
        documents_per_line = 44
        num_docs = len(all_documents)
        # BUGFIX: use floor division so the line count stays an int under
        # Python 3 division semantics (plain '/' would yield a float and
        # break range() below); result is unchanged on Python 2.
        num_lines = (num_docs + documents_per_line - 1) // documents_per_line
        partitioned_docs = []
        for i in range(0, num_lines):
            partitioned_docs.append(
                all_documents[i * documents_per_line:
                              min(num_docs, (i + 1) * documents_per_line)])
        message = self.request.get('message', "")
        verbatim_message = self.request.get('verbatim_message', "")
        metadata = CorpusMetadata.all().get()
        segmenter = "unknown"
        preprocessing_date = "unknown"
        if metadata:
            segmenter = metadata.segmenter
            preprocessing_date = metadata.preprocessing_date
        template_values = {'user': user_id,
                           'logout_url': users.create_logout_url('/argunit/'),
                           'all_views': access_control.get_view_ids(user_id),
                           'current_view': access_control.MANAGE_DATA_VIEW_ID,
                           'num_documents': len(all_documents),
                           'segmenter': segmenter,
                           'preprocessing_date': preprocessing_date,
                           'all_documents': all_documents,
                           'docs_per_line': documents_per_line,
                           'partitioned_docs': partitioned_docs,
                           'all_users': all_users,
                           'status_table': status_table,
                           'message': message,
                           'verbatim_message': verbatim_message}
        template = JINJA_ENVIRONMENT.get_template('managedata.html')
        self.response.write(template.render(template_values))
    else:
        self.redirect('/argunit/managedata')
def post(self):
    """Handle manage-data POSTs: import annotations or a full corpus dump.

    Routes on the request path suffix:
      * 'importannotations' — drop all stored annotations, then recreate
        them from the uploaded JSON dump (documents must already exist).
      * 'importdump' — drop the entire datastore contents and rebuild
        documents, annotations and corpus metadata from the dump.
    Either way, redirects back to the base path with status messages.
    """
    if not self.validate_user():
        return
    if self.request.path.endswith('importannotations'):
        dump_content = self.request.get("dumpfile")
        if dump_content:
            # Existing annotations are replaced wholesale by the import.
            self.drop_all_annotations()
            verbatim_message = ""
            json_data = json.loads(dump_content)
            for doc_data in json_data:
                # Metadata records carry no annotations; skip them.
                if "corpus_metadata" in doc_data:
                    continue
                filename = doc_data["file"]
                doc = Document.all().filter("filename =", filename).get()
                if doc:
                    # Only arg_units are counted for the report; the
                    # relations/concepts accumulators were dead code.
                    arg_units = []
                    for annotation_data in doc_data["user_annotations"]:
                        anno = self.annotation_from_json(annotation_data,
                                                         doc)
                        arg_units.extend(anno.arg_units)
                        anno.put()
                    verbatim_message += "IMPORTED %25s: %4d arg. units\n" % (
                        filename, len(arg_units))
                else:
                    # Annotations for unknown documents are not imported.
                    verbatim_message += "SKIPPED %25s: Document not in collection.\n" % (
                        filename)
            message = "Annotations imported."
        else:
            verbatim_message = ""
            message = "No file to import!"
        self.redirect('%s?%s' % (self.base_path(), urllib.urlencode(
            {"message": message, "verbatim_message": verbatim_message})))
    if self.request.path.endswith('importdump'):
        dump_content = self.request.get("dumpfile")
        if dump_content:
            # A dump import replaces everything currently stored.
            self.drop_all_data()
            json_data = json.loads(dump_content)
            docs = []
            annos = []
            for doc_data in json_data:
                if "corpus_metadata" in doc_data:
                    metadata = extract_metadata(doc_data)
                    metadata.put()
                else:
                    doc = Document()
                    initialize_document(doc, doc_data)
                    docs.append(doc)
                    for annotation_data in doc_data["user_annotations"]:
                        anno = self.annotation_from_json(annotation_data,
                                                         doc)
                        annos.append(anno)
            # Batched writes: one put for all documents, one for all
            # annotations. (Stray statement-terminating semicolons removed.)
            db.put(docs)
            db.put(annos)
            message = "Corpus dump imported."
        else:
            message = "No file to import!"
        self.redirect('%s?%s' % (self.base_path(),
                                 urllib.urlencode({"message": message})))
def load_documents(self, force_update=False):
    """Load corpus documents (and metadata) from the local 'data/' folder.

    New JSON files are inserted; files whose URL already exists in the
    datastore are skipped unless *force_update* is set, in which case
    they are re-initialized and documents no longer present on disk are
    dropped.

    Returns a human-readable per-file report plus a summary block.
    """
    response_text = ""
    data_folder = 'data/'

    metadata_filename = data_folder + 'metadata.properties'
    if os.path.exists(metadata_filename):
        db.delete(CorpusMetadata.all())
        metadata_map = {}
        # Java-style .properties: 'key=value' lines, '#' starts a comment.
        with open(metadata_filename, 'r') as metadata_file:  # BUGFIX: close file
            for line in metadata_file:
                if not line.startswith("#"):
                    parts = line.split("=", 1)
                    # BUGFIX: was 'len(parts) is 2' — identity comparison
                    # against an int literal only works by CPython's
                    # small-int caching accident; use equality.
                    if len(parts) == 2:
                        metadata_map[parts[0]] = parts[1].strip()
        metadata = extract_metadata(metadata_map)
        metadata.put()

    files = glob.glob(data_folder + '*.json')
    doc_ids = []
    new_documents = 0
    skipped_documents = 0
    updated_documents = 0
    dropped_documents = 0
    if force_update:
        response_text += "Update forced!\n"

    for f in sorted(files):
        basename = os.path.basename(f)
        with open(f, 'r') as json_file:  # BUGFIX: close file handle
            jdata = json.load(json_file)
        documents_with_same_url = Document.all().filter("url =", jdata['url'])
        is_document_in_datastore = 0 != documents_with_same_url.count()
        jdata['file'] = basename
        doc_ids.append(basename)
        if is_document_in_datastore:
            existing_doc = documents_with_same_url.get()
            if force_update:
                initialize_document(existing_doc, jdata)
                existing_doc.put()
                response_text += 'UPDATED: ' + str(basename) + " " + str(
                    jdata['url'])
                updated_documents += 1
            else:
                response_text += 'SKIPPED: ' + str(basename) + " " + str(
                    jdata['url'])
                skipped_documents += 1
        else:
            doc = Document()
            initialize_document(doc, jdata)
            doc.put()
            response_text += ' NEW: ' + str(basename) + " " + str(
                jdata['url'])
            new_documents += 1
        response_text += '\n'

    response_text += "----\n"
    if force_update:
        # Drop documents that are no longer present on disk.
        for document in Document.all():
            if document.filename not in doc_ids:
                dropped_documents += 1
                db.delete(document)
                response_text += "DROPPED: " + document.filename + "\n"

    response_text += "=" * 100 + "\n"
    response_text += "Summary:\n"
    response_text += "\tNew: " + str(new_documents) + "\n"
    response_text += "\tUpdated:" + str(updated_documents) + "\n"
    response_text += "\tSkipped:" + str(skipped_documents) + "\n"
    response_text += "\tDropped:" + str(dropped_documents) + "\n"
    return response_text
def load_documents(self, force_update=False):
    """Synchronize the datastore with the JSON documents in 'data/'.

    Inserts documents for previously unseen URLs; when *force_update* is
    set, also re-initializes documents already stored and drops any that
    no longer exist on disk. Also (re)loads corpus metadata from
    'data/metadata.properties' when present.

    Returns a per-file report string followed by a summary block.
    """
    response_text = ""
    data_folder = 'data/'
    metadata_filename = data_folder + 'metadata.properties'
    if os.path.exists(metadata_filename):
        db.delete(CorpusMetadata.all())
        metadata_map = {}
        # '.properties' format: 'key=value' lines, '#' starts a comment.
        with open(metadata_filename, 'r') as metadata_file:  # BUGFIX: close file
            for line in metadata_file.readlines():
                if not line.startswith("#"):
                    parts = line.split("=", 1)
                    # BUGFIX: 'len(parts) is 2' compared identity, not
                    # value — it works only because CPython caches small
                    # ints. Use '==' for a value comparison.
                    if len(parts) == 2:
                        metadata_map[parts[0]] = parts[1].strip()
        metadata = extract_metadata(metadata_map)
        metadata.put()
    files = glob.glob(data_folder + '*.json')
    doc_ids = []
    new_documents = 0
    skipped_documents = 0
    updated_documents = 0
    dropped_documents = 0
    if force_update:
        response_text += "Update forced!\n"
    for f in sorted(files):
        basename = os.path.basename(f)
        with open(f, 'r') as json_file:  # BUGFIX: close file handle
            jdata = json.load(json_file)
        documents_with_same_url = Document.all().filter(
            "url =", jdata['url'])
        is_document_in_datastore = 0 != documents_with_same_url.count()
        jdata['file'] = basename
        doc_ids.append(basename)
        if is_document_in_datastore:
            existing_doc = documents_with_same_url.get()
            if force_update:
                initialize_document(existing_doc, jdata)
                existing_doc.put()
                response_text += 'UPDATED: ' + str(basename) + " " + str(
                    jdata['url'])
                updated_documents += 1
            else:
                response_text += 'SKIPPED: ' + str(basename) + " " + str(
                    jdata['url'])
                skipped_documents += 1
        else:
            doc = Document()
            initialize_document(doc, jdata)
            doc.put()
            response_text += ' NEW: ' + str(basename) + " " + str(
                jdata['url'])
            new_documents += 1
        response_text += '\n'
    response_text += "----\n"
    if force_update:
        # Remove stored documents whose source file has disappeared.
        for document in Document.all():
            if document.filename not in doc_ids:
                dropped_documents += 1
                db.delete(document)
                response_text += "DROPPED: " + document.filename + "\n"
    response_text += "=" * 100 + "\n"
    response_text += "Summary:\n"
    response_text += "\tNew: " + str(new_documents) + "\n"
    response_text += "\tUpdated:" + str(updated_documents) + "\n"
    response_text += "\tSkipped:" + str(skipped_documents) + "\n"
    response_text += "\tDropped:" + str(dropped_documents) + "\n"
    return response_text
def get(self):
    """Route manage-data GET requests by path suffix.

    Handles: dump, dropall, dropanno, loaddata, forceupdate, unapprove,
    and '/managedata' (the management overview page); any other path
    redirects to /argunit/managedata.
    """
    user_id = access_control.get_current_user_id()
    if not self.validate_user():
        return
    elif self.request.path.endswith('dump'):
        self.dump_corpus()
    elif self.request.path.endswith('dropall'):
        self.drop_all_data()
        self.redirect('%s?%s' % (self.base_path(), urllib.urlencode(
            {"message": 'Dropped all data.'})))
    elif self.request.path.endswith('dropanno'):
        self.drop_all_annotations()
        self.redirect(
            '%s?%s' % (self.base_path(),
                       urllib.urlencode({"message":
                                         'Dropped all annotations.'})))
    elif self.request.path.endswith('loaddata'):
        response_text = self.load_documents()
        self.redirect(
            '%s?%s' % (self.base_path(),
                       urllib.urlencode({"verbatim_message": response_text})))
    elif self.request.path.endswith('forceupdate'):
        response_text = self.load_documents(force_update=True)
        self.redirect(
            '%s?%s' % (self.base_path(),
                       urllib.urlencode({"verbatim_message": response_text})))
    elif self.request.path.endswith('unapprove'):
        annotator = self.request.get("annotator")
        document = self.request.get("doc")
        self.setApproval(annotator, document, False)
        response_text = "Unapproved: %s:%s" % (annotator, document)
        self.redirect('%s?%s' % (self.base_path(), urllib.urlencode(
            {"message": response_text})))
    elif self.request.path.endswith('/managedata'):
        all_documents = [doc.filename for doc in Document.all()]
        all_documents.sort()
        all_users = access_control.get_all_users()
        all_users.sort()
        # Annotation status per (user, document) for the overview table.
        status_table = dict()
        for user in all_users:
            status_table[user] = dict()
            for doc in all_documents:
                anno = DocumentAnnotation.all().filter(
                    "user_id =", user).filter("document =", doc).get()
                if not anno:
                    status_table[user][doc] = UNPROCESSED
                elif not anno.approved:
                    status_table[user][doc] = IN_PROGRESS
                else:
                    status_table[user][doc] = COMPLETE
        documents_per_line = 44
        num_docs = len(all_documents)
        # BUGFIX: floor division keeps the line count an int under
        # Python 3 semantics (plain '/' would produce a float and break
        # range() below); identical result on Python 2.
        num_lines = (num_docs + documents_per_line - 1) // documents_per_line
        partitioned_docs = []
        for i in range(0, num_lines):
            partitioned_docs.append(all_documents[
                i * documents_per_line:min(num_docs,
                                           (i + 1) * documents_per_line)])
        message = self.request.get('message', "")
        verbatim_message = self.request.get('verbatim_message', "")
        metadata = CorpusMetadata.all().get()
        segmenter = "unknown"
        preprocessing_date = "unknown"
        if metadata:
            segmenter = metadata.segmenter
            preprocessing_date = metadata.preprocessing_date
        template_values = {
            'user': user_id,
            'logout_url': users.create_logout_url('/argunit/'),
            'all_views': access_control.get_view_ids(user_id),
            'current_view': access_control.MANAGE_DATA_VIEW_ID,
            'num_documents': len(all_documents),
            'segmenter': segmenter,
            'preprocessing_date': preprocessing_date,
            'all_documents': all_documents,
            'docs_per_line': documents_per_line,
            'partitioned_docs': partitioned_docs,
            'all_users': all_users,
            'status_table': status_table,
            'message': message,
            'verbatim_message': verbatim_message
        }
        template = JINJA_ENVIRONMENT.get_template('managedata.html')
        self.response.write(template.render(template_values))
    else:
        self.redirect('/argunit/managedata')