Example #1
0
    def post(self):
        """Handle corpus-import POST requests.

        Dispatches on the request path:

        * ``...importannotations`` -- drops ALL stored annotations, then
          re-creates annotations from the uploaded JSON dump for documents
          already in the datastore (unknown filenames are reported as
          skipped), and redirects with a per-file report.
        * ``...importdump`` -- drops ALL data, then rebuilds metadata,
          documents, and their annotations from the uploaded JSON dump,
          and redirects with a status message.

        NOTE(review): both ``if`` branches are checked in sequence; a path
        matching neither suffix falls through with no redirect -- presumably
        the URL routing guarantees exactly one matches; verify against the
        handler mapping.
        """

        if not self.validate_user():
            return
        # Branch 1: re-import annotations only, into the existing documents.
        if self.request.path.endswith('importannotations'):
            dump_content = self.request.get("dumpfile")
            if dump_content:

                # Destructive: every existing annotation is removed before
                # the import starts.
                self.drop_all_annotations()
                verbatim_message = ""

                json_data = json.loads(dump_content)
                for doc_data in json_data:
                    # Corpus-metadata entries in the dump carry no
                    # annotations; skip them here.
                    if "corpus_metadata" in doc_data:
                        continue

                    filename = doc_data["file"]

                    # Dump entries are matched to stored documents by filename.
                    doc = Document.all().filter("filename =", filename).get()
                    if doc:

                        # Accumulated per document, only to report counts;
                        # the annotations themselves are persisted via put().
                        arg_units = []
                        relations = []
                        concepts = []

                        for annotation_data in doc_data["user_annotations"]:
                            anno = self.annotation_from_json(
                                annotation_data, doc)
                            arg_units.extend(anno.arg_units)
                            relations.extend(anno.relations)
                            concepts.extend(anno.concepts)
                            anno.put()

                        verbatim_message += "IMPORTED %25s: %4d arg. units\n" % (
                            filename, len(arg_units))
                    else:
                        verbatim_message += "SKIPPED  %25s: Document not in collection.\n" % (
                            filename)

                message = "Annotations imported."
            else:
                verbatim_message = ""
                message = "No file to import!"

            # Redirect back with both the summary and the per-file report.
            self.redirect(
                '%s?%s' %
                (self.base_path(),
                 urllib.urlencode({
                     "message": message,
                     "verbatim_message": verbatim_message
                 })))
        # Branch 2: full corpus import (metadata + documents + annotations).
        if self.request.path.endswith('importdump'):
            dump_content = self.request.get("dumpfile")
            if dump_content:

                # Destructive: wipes the whole datastore contents first.
                self.drop_all_data()

                json_data = json.loads(dump_content)
                docs = []
                annos = []
                for doc_data in json_data:
                    if "corpus_metadata" in doc_data:
                        metadata = extract_metadata(doc_data)
                        metadata.put()
                    else:
                        doc = Document()
                        initialize_document(doc, doc_data)
                        docs.append(doc)

                        for annotation_data in doc_data["user_annotations"]:
                            anno = self.annotation_from_json(
                                annotation_data, doc)
                            annos.append(anno)

                # Batched writes: one datastore call per entity kind.
                db.put(docs)
                db.put(annos)

                message = "Corpus dump imported."
            else:
                message = "No file to import!"

            self.redirect(
                '%s?%s' %
                (self.base_path(), urllib.urlencode({"message": message})))
Example #2
0
    def load_documents(self, force_update=False):
        """Sync JSON documents from the local ``data/`` folder into the datastore.

        If ``data/metadata.properties`` exists, all stored corpus metadata is
        replaced from it. Then, for every ``data/*.json`` file, a Document is
        created, updated (when ``force_update`` is set), or skipped, keyed by
        the document's ``url``. With ``force_update`` set, documents whose
        backing JSON file no longer exists are deleted.

        Args:
            force_update: when True, re-initialize existing documents and
                drop datastore documents with no matching JSON file.

        Returns:
            A human-readable multi-line report of what was done.
        """
        response_text = ""

        data_folder = 'data/'
        metadata_filename = data_folder + 'metadata.properties'

        # Replace any previously stored corpus metadata from the .properties
        # file, parsed as simple "key=value" lines ('#' lines are comments).
        if os.path.exists(metadata_filename):
            db.delete(CorpusMetadata.all())
            metadata_map = {}
            # FIX: 'with' closes the handle (the original leaked it); iterate
            # the file directly instead of materializing readlines().
            with open(metadata_filename, 'r') as metadata_file:
                for line in metadata_file:
                    if not line.startswith("#"):
                        parts = line.split("=", 1)
                        # FIX: was 'len(parts) is 2' -- an identity check on
                        # an int that only works by CPython small-int caching;
                        # equality is the correct comparison.
                        if len(parts) == 2:
                            metadata_map[parts[0]] = parts[1].strip()
            metadata = extract_metadata(metadata_map)
            metadata.put()

        files = glob.glob(data_folder + '*.json')
        doc_ids = []
        new_documents = 0
        skipped_documents = 0
        updated_documents = 0
        dropped_documents = 0

        if force_update:
            response_text += "Update forced!\n"

        for f in sorted(files):
            basename = os.path.basename(f)
            # FIX: close the JSON file deterministically (the original relied
            # on garbage collection to close the anonymous handle).
            with open(f, 'r') as json_file:
                jdata = json.load(json_file)

            # Existing documents are identified by their 'url' property.
            documents_with_same_url = Document.all().filter(
                "url =", jdata['url'])
            is_document_in_datastore = 0 != documents_with_same_url.count()

            jdata['file'] = basename
            doc_ids.append(basename)

            if is_document_in_datastore:

                existing_doc = documents_with_same_url.get()

                if force_update:
                    initialize_document(existing_doc, jdata)
                    existing_doc.put()

                    response_text += 'UPDATED: ' + str(basename) + " " + str(
                        jdata['url'])
                    updated_documents += 1

                else:

                    response_text += 'SKIPPED: ' + str(basename) + " " + str(
                        jdata['url'])
                    skipped_documents += 1
            else:
                doc = Document()
                initialize_document(doc, jdata)

                doc.put()

                response_text += '    NEW: ' + str(basename) + " " + str(
                    jdata['url'])
                new_documents += 1

            response_text += '\n'

        response_text += "----\n"

        # On a forced update, drop documents whose backing file is gone.
        if force_update:
            for document in Document.all():
                if document.filename not in doc_ids:
                    dropped_documents += 1
                    db.delete(document)
                    response_text += "DROPPED: " + document.filename + "\n"

        response_text += "=" * 100 + "\n"
        response_text += "Summary:\n"
        response_text += "\tNew:    " + str(new_documents) + "\n"
        response_text += "\tUpdated:" + str(updated_documents) + "\n"
        response_text += "\tSkipped:" + str(skipped_documents) + "\n"
        response_text += "\tDropped:" + str(dropped_documents) + "\n"

        return response_text