def post(self):
    if not self.validate_user():
        return

    # Import annotations only: the referenced documents must already exist
    # in the datastore; all existing annotations are dropped first.
    if self.request.path.endswith('importannotations'):
        dump_content = self.request.get("dumpfile")
        if dump_content:
            self.drop_all_annotations()
            verbatim_message = ""
            json_data = json.loads(dump_content)
            for doc_data in json_data:
                if "corpus_metadata" in doc_data:
                    continue
                filename = doc_data["file"]
                doc = Document.all().filter("filename =", filename).get()
                if doc:
                    arg_units = []
                    relations = []
                    concepts = []
                    for annotation_data in doc_data["user_annotations"]:
                        anno = self.annotation_from_json(annotation_data, doc)
                        arg_units.extend(anno.arg_units)
                        relations.extend(anno.relations)
                        concepts.extend(anno.concepts)
                        anno.put()
                    verbatim_message += "IMPORTED %25s: %4d arg. units\n" % (
                        filename, len(arg_units))
                else:
                    verbatim_message += "SKIPPED %25s: Document not in collection.\n" % (
                        filename)
            message = "Annotations imported."
        else:
            verbatim_message = ""
            message = "No file to import!"
        self.redirect('%s?%s' % (self.base_path(),
                                 urllib.urlencode({
                                     "message": message,
                                     "verbatim_message": verbatim_message
                                 })))

    # Import a full corpus dump: wipes all data, then recreates corpus
    # metadata, documents, and annotations from the uploaded file.
    if self.request.path.endswith('importdump'):
        dump_content = self.request.get("dumpfile")
        if dump_content:
            self.drop_all_data()
            json_data = json.loads(dump_content)
            docs = []
            annos = []
            for doc_data in json_data:
                if "corpus_metadata" in doc_data:
                    metadata = extract_metadata(doc_data)
                    metadata.put()
                else:
                    doc = Document()
                    initialize_document(doc, doc_data)
                    docs.append(doc)
                    for annotation_data in doc_data["user_annotations"]:
                        anno = self.annotation_from_json(annotation_data, doc)
                        annos.append(anno)
            # Batch-write documents and annotations to the datastore.
            db.put(docs)
            db.put(annos)
            message = "Corpus dump imported."
        else:
            message = "No file to import!"
        self.redirect('%s?%s' % (self.base_path(),
                                 urllib.urlencode({"message": message})))
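
# Illustrative only: a minimal sketch of the JSON dump that the
# 'importdump'/'importannotations' branches above appear to expect, inferred
# from the keys they read ("corpus_metadata", "file", "user_annotations").
# The values below are made up, and the shape of each entry in
# "user_annotations" is whatever annotation_from_json() consumes, which is
# not reproduced here:
#
#   [
#     {"corpus_metadata": {...}},
#     {"file": "doc_001.json",
#      "url": "http://example.org/doc_001",
#      "user_annotations": [...]}
#   ]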
def load_documents(self, force_update=False):
    response_text = ""
    data_folder = 'data/'

    # Refresh the corpus metadata from data/metadata.properties, if present.
    metadata_filename = data_folder + 'metadata.properties'
    if os.path.exists(metadata_filename):
        db.delete(CorpusMetadata.all())
        metadata_map = {}
        with open(metadata_filename, 'r') as metadata_file:
            for line in metadata_file:
                if not line.startswith("#"):
                    parts = line.split("=", 1)
                    if len(parts) == 2:
                        metadata_map[parts[0]] = parts[1].strip()
        metadata = extract_metadata(metadata_map)
        metadata.put()

    # Load every JSON document from the data folder; documents are matched
    # against the datastore by URL.
    files = glob.glob(data_folder + '*.json')
    doc_ids = []
    new_documents = 0
    skipped_documents = 0
    updated_documents = 0
    dropped_documents = 0
    if force_update:
        response_text += "Update forced!\n"
    for f in sorted(files):
        basename = os.path.basename(f)
        with open(f, 'r') as json_file:
            jdata = json.load(json_file)
        documents_with_same_url = Document.all().filter("url =", jdata['url'])
        is_document_in_datastore = 0 != documents_with_same_url.count()
        jdata['file'] = basename
        doc_ids.append(basename)
        if is_document_in_datastore:
            existing_doc = documents_with_same_url.get()
            if force_update:
                initialize_document(existing_doc, jdata)
                existing_doc.put()
                response_text += 'UPDATED: ' + str(basename) + " " + str(jdata['url'])
                updated_documents += 1
            else:
                response_text += 'SKIPPED: ' + str(basename) + " " + str(jdata['url'])
                skipped_documents += 1
        else:
            doc = Document()
            initialize_document(doc, jdata)
            doc.put()
            response_text += '    NEW: ' + str(basename) + " " + str(jdata['url'])
            new_documents += 1
        response_text += '\n'
    response_text += "----\n"

    # On a forced update, drop documents that no longer have a source file.
    if force_update:
        for document in Document.all():
            if document.filename not in doc_ids:
                dropped_documents += 1
                db.delete(document)
                response_text += "DROPPED: " + document.filename + "\n"

    response_text += "=" * 100 + "\n"
    response_text += "Summary:\n"
    response_text += "\tNew:     " + str(new_documents) + "\n"
    response_text += "\tUpdated: " + str(updated_documents) + "\n"
    response_text += "\tSkipped: " + str(skipped_documents) + "\n"
    response_text += "\tDropped: " + str(dropped_documents) + "\n"
    return response_text
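
# Illustrative only: a sketch of the data/metadata.properties format that
# load_documents() parses above. Lines starting with '#' are ignored, each
# remaining line is split on the first '=', and the value is stripped of
# surrounding whitespace. The keys shown here are placeholders; the real
# keys are whatever extract_metadata() expects.
#
#   # corpus metadata
#   corpus.name=Example Corpus
#   corpus.version=1.0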