def extract_metadata(dictionary):
    """Build a CorpusMetadata entity from a parsed properties mapping.

    Missing keys default to the empty string, so callers never see None.

    Args:
        dictionary: mapping of property name -> value (both strings),
            as parsed from metadata.properties.

    Returns:
        An unsaved CorpusMetadata instance (caller is expected to put()).
    """
    metadata = CorpusMetadata()
    # dict.get with a default replaces the `x in d` check + subscript pair.
    metadata.segmenter = dictionary.get("segmenter", "")
    metadata.preprocessing_date = dictionary.get("preprocessing_date", "")
    return metadata
def extract_metadata(dictionary):
    """Create a CorpusMetadata entity from a properties mapping.

    Absent keys fall back to "" rather than raising or storing None.

    Args:
        dictionary: str -> str mapping parsed from metadata.properties.

    Returns:
        A populated, unsaved CorpusMetadata instance.
    """
    metadata = CorpusMetadata()
    # Single dict.get lookup instead of the membership-test-then-index idiom.
    metadata.segmenter = dictionary.get("segmenter", "")
    metadata.preprocessing_date = dictionary.get("preprocessing_date", "")
    return metadata
def dump_corpus(self):
    """Serve the entire corpus (metadata, documents, annotations) as JSON.

    Writes the serialized corpus to the HTTP response as a downloadable
    attachment named dump_<timestamp>.json.
    """
    json_response = []
    metadata = CorpusMetadata.all().get()
    if metadata:
        # Metadata record is marked so consumers can tell it apart from
        # document records in the flat list.
        json_response.append({
            "corpus_metadata": "true",
            "segmenter": metadata.segmenter,
            "preprocessing_date": metadata.preprocessing_date
        })
    for doc in Document.all().run():
        annotations = []
        query = DocumentAnnotation.all().filter("document =", doc.filename)
        for annotation in query.run():
            # One flat record per annotator for this document; a dict
            # literal replaces six sequential key assignments.
            annotations.append({
                'annotator': annotation.user_id,
                'arg_units': annotation.arg_units,
                'relations': annotation.relations,
                'concepts': annotation.concepts,
                'approved': str(annotation.approved),
                'notes': annotation.notes,
            })
        json_response.append({
            'file': doc.filename,
            'text': doc.text,
            'url': doc.url,
            'user_annotations': annotations,
            'num_tokens': doc.num_tokens,
            'num_sentences': doc.num_sentences
        })
    dump_filename = "dump_" + time.strftime("%Y-%m-%d_%H:%M:%S") + ".json"
    self.response.headers['Content-Type'] = 'application/json'
    self.response.headers[
        'Content-Disposition'] = "attachment; filename=%s" % dump_filename
    self.response.write(
        json.dumps(json_response, indent=2, sort_keys=False,
                   separators=(',', ':')))
def dump_corpus(self):
    """Dump the whole corpus to the response as an attached JSON file.

    Emits one optional corpus-metadata record followed by one record per
    document (with all user annotations inlined), then sets the response
    headers so the browser downloads it as dump_<timestamp>.json.
    """
    json_response = []
    metadata = CorpusMetadata.all().get()
    if metadata:
        json_response.append({
            "corpus_metadata": "true",
            "segmenter": metadata.segmenter,
            "preprocessing_date": metadata.preprocessing_date
        })
    for doc in Document.all().run():
        annotations = []
        for annotation in DocumentAnnotation.all().filter(
                "document =", doc.filename).run():
            # Dict literal instead of building the record key-by-key.
            annotations.append({
                'annotator': annotation.user_id,
                'arg_units': annotation.arg_units,
                'relations': annotation.relations,
                'concepts': annotation.concepts,
                'approved': str(annotation.approved),
                'notes': annotation.notes,
            })
        json_response.append({
            'file': doc.filename,
            'text': doc.text,
            'url': doc.url,
            'user_annotations': annotations,
            'num_tokens': doc.num_tokens,
            'num_sentences': doc.num_sentences
        })
    dump_filename = "dump_" + time.strftime("%Y-%m-%d_%H:%M:%S") + ".json"
    self.response.headers['Content-Type'] = 'application/json'
    self.response.headers[
        'Content-Disposition'] = "attachment; filename=%s" % dump_filename
    self.response.write(
        json.dumps(json_response, indent=2, sort_keys=False,
                   separators=(',', ':')))
def get(self):
    """Route GET requests for the manage-data admin view.

    The request-path suffix selects the action: 'dump' streams the
    corpus as JSON, 'dropall'/'dropanno' delete data, 'loaddata' /
    'forceupdate' (re)load documents from disk, 'unapprove' clears one
    annotator's approval, and '/managedata' renders the status page.
    Unknown paths redirect back to /argunit/managedata.
    """
    user_id = access_control.get_current_user_id()
    if not self.validate_user():
        return
    elif self.request.path.endswith('dump'):
        self.dump_corpus()
    elif self.request.path.endswith('dropall'):
        self.drop_all_data()
        self.redirect('%s?%s' % (self.base_path(), urllib.urlencode(
            {"message": 'Dropped all data.'})))
    elif self.request.path.endswith('dropanno'):
        self.drop_all_annotations()
        self.redirect('%s?%s' % (self.base_path(), urllib.urlencode(
            {"message": 'Dropped all annotations.'})))
    elif self.request.path.endswith('loaddata'):
        response_text = self.load_documents()
        self.redirect('%s?%s' % (self.base_path(), urllib.urlencode(
            {"verbatim_message": response_text})))
    elif self.request.path.endswith('forceupdate'):
        response_text = self.load_documents(force_update=True)
        self.redirect('%s?%s' % (self.base_path(), urllib.urlencode(
            {"verbatim_message": response_text})))
    elif self.request.path.endswith('unapprove'):
        annotator = self.request.get("annotator")
        document = self.request.get("doc")
        # (stray trailing semicolon removed)
        self.setApproval(annotator, document, False)
        response_text = "Unapproved: %s:%s" % (annotator, document)
        self.redirect('%s?%s' % (self.base_path(), urllib.urlencode(
            {"message": response_text})))
    elif self.request.path.endswith('/managedata'):
        all_documents = [doc.filename for doc in Document.all()]
        all_documents.sort()
        all_users = access_control.get_all_users()
        all_users.sort()
        # status_table[user][doc] in {UNPROCESSED, IN_PROGRESS, COMPLETE}.
        status_table = dict()
        for user in all_users:
            status_table[user] = dict()
            for doc in all_documents:
                anno = DocumentAnnotation.all().filter(
                    "user_id =", user).filter("document =", doc).get()
                if not anno:
                    status_table[user][doc] = UNPROCESSED
                elif not anno.approved:
                    status_table[user][doc] = IN_PROGRESS
                else:
                    status_table[user][doc] = COMPLETE
        documents_per_line = 44
        num_docs = len(all_documents)
        # Ceiling division; '//' keeps this an int under Python 3 as well.
        num_lines = (num_docs + documents_per_line - 1) // documents_per_line
        # Slicing already clamps at the end of the list, so no min() needed.
        partitioned_docs = [
            all_documents[i * documents_per_line:(i + 1) * documents_per_line]
            for i in range(num_lines)
        ]
        message = self.request.get('message', "")
        verbatim_message = self.request.get('verbatim_message', "")
        metadata = CorpusMetadata.all().get()
        segmenter = "unknown"
        preprocessing_date = "unknown"
        if metadata:
            segmenter = metadata.segmenter
            preprocessing_date = metadata.preprocessing_date
        template_values = {
            'user': user_id,
            'logout_url': users.create_logout_url('/argunit/'),
            'all_views': access_control.get_view_ids(user_id),
            'current_view': access_control.MANAGE_DATA_VIEW_ID,
            'num_documents': len(all_documents),
            'segmenter': segmenter,
            'preprocessing_date': preprocessing_date,
            'all_documents': all_documents,
            'docs_per_line': documents_per_line,
            'partitioned_docs': partitioned_docs,
            'all_users': all_users,
            'status_table': status_table,
            'message': message,
            'verbatim_message': verbatim_message
        }
        template = JINJA_ENVIRONMENT.get_template('managedata.html')
        self.response.write(template.render(template_values))
    else:
        self.redirect('/argunit/managedata')
def drop_all_data(self):
    """Delete every stored entity of each corpus model kind.

    Each fetch is capped at 10000 entities per kind, mirroring the
    datastore fetch limit used throughout this handler.
    """
    # Same deletion order as before: annotations first, metadata last.
    for model in (DocumentAnnotation, ArgumentationUnit, Document,
                  UserData, CorpusMetadata):
        db.delete(model.all().fetch(10000))
def load_documents(self, force_update=False):
    """Load corpus metadata and documents from the local data/ folder.

    New documents (matched by URL) are stored; existing ones are skipped
    unless force_update is set, in which case they are re-initialized
    from disk and documents no longer present on disk are dropped.

    Args:
        force_update: overwrite existing documents and drop stale ones.

    Returns:
        A human-readable multi-line report of what was done.
    """
    response_text = ""
    data_folder = 'data/'
    metadata_filename = data_folder + 'metadata.properties'
    if os.path.exists(metadata_filename):
        db.delete(CorpusMetadata.all())
        metadata_map = {}
        # Minimal .properties parsing: '#' comments, key=value lines.
        with open(metadata_filename, 'r') as metadata_file:
            for line in metadata_file:
                if not line.startswith("#"):
                    parts = line.split("=", 1)
                    # '==', not 'is': int identity is a CPython accident.
                    if len(parts) == 2:
                        metadata_map[parts[0]] = parts[1].strip()
        metadata = extract_metadata(metadata_map)
        metadata.put()
    files = glob.glob(data_folder + '*.json')
    doc_ids = []
    new_documents = 0
    skipped_documents = 0
    updated_documents = 0
    dropped_documents = 0
    if force_update:
        response_text += "Update forced!\n"
    for f in sorted(files):
        basename = os.path.basename(f)
        # 'with' closes the file handle; the old open() never did.
        with open(f, 'r') as json_file:
            jdata = json.load(json_file)
        documents_with_same_url = Document.all().filter("url =", jdata['url'])
        is_document_in_datastore = documents_with_same_url.count() != 0
        jdata['file'] = basename
        doc_ids.append(basename)
        if is_document_in_datastore:
            existing_doc = documents_with_same_url.get()
            if force_update:
                initialize_document(existing_doc, jdata)
                existing_doc.put()
                response_text += 'UPDATED: ' + str(basename) + " " + str(
                    jdata['url'])
                updated_documents += 1
            else:
                response_text += 'SKIPPED: ' + str(basename) + " " + str(
                    jdata['url'])
                skipped_documents += 1
        else:
            doc = Document()
            initialize_document(doc, jdata)
            doc.put()
            response_text += ' NEW: ' + str(basename) + " " + str(
                jdata['url'])
            new_documents += 1
        response_text += '\n'
    response_text += "----\n"
    if force_update:
        # Anything in the datastore but not on disk is stale: drop it.
        for document in Document.all():
            if document.filename not in doc_ids:
                dropped_documents += 1
                db.delete(document)
                response_text += "DROPPED: " + document.filename + "\n"
    response_text += "=" * 100 + "\n"
    response_text += "Summary:\n"
    response_text += "\tNew: " + str(new_documents) + "\n"
    response_text += "\tUpdated:" + str(updated_documents) + "\n"
    response_text += "\tSkipped:" + str(skipped_documents) + "\n"
    response_text += "\tDropped:" + str(dropped_documents) + "\n"
    return response_text
def get(self):
    """Dispatch GET requests for the manage-data admin view.

    Path suffix selects the action: corpus dump, destructive drops,
    (re)loading documents from disk, unapproving one annotation, or
    rendering the per-user/per-document status overview; any other
    path redirects to /argunit/managedata.
    """
    user_id = access_control.get_current_user_id()
    if not self.validate_user():
        return
    elif self.request.path.endswith('dump'):
        self.dump_corpus()
    elif self.request.path.endswith('dropall'):
        self.drop_all_data()
        self.redirect('%s?%s' % (self.base_path(), urllib.urlencode(
            {"message": 'Dropped all data.'})))
    elif self.request.path.endswith('dropanno'):
        self.drop_all_annotations()
        self.redirect('%s?%s' % (self.base_path(), urllib.urlencode(
            {"message": 'Dropped all annotations.'})))
    elif self.request.path.endswith('loaddata'):
        response_text = self.load_documents()
        self.redirect('%s?%s' % (self.base_path(), urllib.urlencode(
            {"verbatim_message": response_text})))
    elif self.request.path.endswith('forceupdate'):
        response_text = self.load_documents(force_update=True)
        self.redirect('%s?%s' % (self.base_path(), urllib.urlencode(
            {"verbatim_message": response_text})))
    elif self.request.path.endswith('unapprove'):
        annotator = self.request.get("annotator")
        document = self.request.get("doc")
        self.setApproval(annotator, document, False)
        response_text = "Unapproved: %s:%s" % (annotator, document)
        self.redirect('%s?%s' % (self.base_path(), urllib.urlencode(
            {"message": response_text})))
    elif self.request.path.endswith('/managedata'):
        all_documents = [doc.filename for doc in Document.all()]
        all_documents.sort()
        all_users = access_control.get_all_users()
        all_users.sort()
        # One cell per (user, document): UNPROCESSED / IN_PROGRESS / COMPLETE.
        status_table = dict()
        for user in all_users:
            status_table[user] = dict()
            for doc in all_documents:
                anno = DocumentAnnotation.all().filter(
                    "user_id =", user).filter("document =", doc).get()
                if not anno:
                    status_table[user][doc] = UNPROCESSED
                elif not anno.approved:
                    status_table[user][doc] = IN_PROGRESS
                else:
                    status_table[user][doc] = COMPLETE
        documents_per_line = 44
        num_docs = len(all_documents)
        # Ceiling division; '//' stays integral under Python 3 as well.
        num_lines = (num_docs + documents_per_line - 1) // documents_per_line
        # List slices clamp to the sequence end, so min() is unnecessary.
        partitioned_docs = [
            all_documents[i * documents_per_line:(i + 1) * documents_per_line]
            for i in range(num_lines)
        ]
        message = self.request.get('message', "")
        verbatim_message = self.request.get('verbatim_message', "")
        metadata = CorpusMetadata.all().get()
        segmenter = "unknown"
        preprocessing_date = "unknown"
        if metadata:
            segmenter = metadata.segmenter
            preprocessing_date = metadata.preprocessing_date
        template_values = {
            'user': user_id,
            'logout_url': users.create_logout_url('/argunit/'),
            'all_views': access_control.get_view_ids(user_id),
            'current_view': access_control.MANAGE_DATA_VIEW_ID,
            'num_documents': len(all_documents),
            'segmenter': segmenter,
            'preprocessing_date': preprocessing_date,
            'all_documents': all_documents,
            'docs_per_line': documents_per_line,
            'partitioned_docs': partitioned_docs,
            'all_users': all_users,
            'status_table': status_table,
            'message': message,
            'verbatim_message': verbatim_message
        }
        template = JINJA_ENVIRONMENT.get_template('managedata.html')
        self.response.write(template.render(template_values))
    else:
        self.redirect('/argunit/managedata')
def load_documents(self, force_update=False):
    """Import corpus metadata and JSON documents from the data/ folder.

    Documents are matched to the datastore by URL. Existing matches are
    skipped unless force_update is set, in which case they are refreshed
    from disk and datastore documents absent from disk are deleted.

    Args:
        force_update: refresh existing documents and drop stale ones.

    Returns:
        A multi-line textual report (NEW/UPDATED/SKIPPED/DROPPED + summary).
    """
    response_text = ""
    data_folder = 'data/'
    metadata_filename = data_folder + 'metadata.properties'
    if os.path.exists(metadata_filename):
        db.delete(CorpusMetadata.all())
        metadata_map = {}
        # Minimal .properties parsing: skip '#' comments, split key=value.
        with open(metadata_filename, 'r') as metadata_file:
            for line in metadata_file:
                if not line.startswith("#"):
                    parts = line.split("=", 1)
                    # Compare with '==': 'is' on ints relies on interning.
                    if len(parts) == 2:
                        metadata_map[parts[0]] = parts[1].strip()
        metadata = extract_metadata(metadata_map)
        metadata.put()
    files = glob.glob(data_folder + '*.json')
    doc_ids = []
    new_documents = 0
    skipped_documents = 0
    updated_documents = 0
    dropped_documents = 0
    if force_update:
        response_text += "Update forced!\n"
    for f in sorted(files):
        basename = os.path.basename(f)
        # Context manager closes the handle; bare open() leaked it.
        with open(f, 'r') as json_file:
            jdata = json.load(json_file)
        documents_with_same_url = Document.all().filter(
            "url =", jdata['url'])
        is_document_in_datastore = documents_with_same_url.count() != 0
        jdata['file'] = basename
        doc_ids.append(basename)
        if is_document_in_datastore:
            existing_doc = documents_with_same_url.get()
            if force_update:
                initialize_document(existing_doc, jdata)
                existing_doc.put()
                response_text += 'UPDATED: ' + str(basename) + " " + str(
                    jdata['url'])
                updated_documents += 1
            else:
                response_text += 'SKIPPED: ' + str(basename) + " " + str(
                    jdata['url'])
                skipped_documents += 1
        else:
            doc = Document()
            initialize_document(doc, jdata)
            doc.put()
            response_text += ' NEW: ' + str(basename) + " " + str(
                jdata['url'])
            new_documents += 1
        response_text += '\n'
    response_text += "----\n"
    if force_update:
        # Drop datastore documents whose source file disappeared.
        for document in Document.all():
            if document.filename not in doc_ids:
                dropped_documents += 1
                db.delete(document)
                response_text += "DROPPED: " + document.filename + "\n"
    response_text += "=" * 100 + "\n"
    response_text += "Summary:\n"
    response_text += "\tNew: " + str(new_documents) + "\n"
    response_text += "\tUpdated:" + str(updated_documents) + "\n"
    response_text += "\tSkipped:" + str(skipped_documents) + "\n"
    response_text += "\tDropped:" + str(dropped_documents) + "\n"
    return response_text