Example #1
0
    def store_annotations(self, user_id, data):
        doc_filename = data['doc']
        doc = Document.all().filter("filename =", doc_filename).get()

        doc_annotation = DocumentAnnotation.all().filter(
            "user_id =", user_id).filter("document =", doc.filename).get()
        if not doc_annotation:
            doc_annotation = DocumentAnnotation(user_id=user_id,
                                                document=doc.filename)

        doc_annotation.approved = data["approved"]

        doc_annotation.concepts = [Text(item) for item in data["concepts"]]
        doc_annotation.relations = [Text(item) for item in data["relations"]]
        doc_annotation.arg_units = [Text(item) for item in data["arg_units"]]

        doc_annotation.notes = Text(data['notes'])

        log("Storing annotations " +
            ("[Approved]" if doc_annotation.approved else "") + " for " +
            str(doc.filename) + ": " + str(doc_annotation.arg_units) +
            " - notes: " +
            doc_annotation.notes.encode("utf-8").replace("\n", " NL "))

        db.get(doc_annotation.put())
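For reference, a minimal sketch of the datastore models these handlers appear to assume; the property names follow the filters and assignments used throughout the examples, but the real schema may differ. The `db.get(entity.put())` idiom above re-reads the entity by the key returned from `put()`, presumably to make sure the write has landed before the handler returns.

# Hypothetical model definitions inferred from the queries above -- an
# assumption for illustration, not the project's actual models.py.
from google.appengine.ext import db

class Document(db.Model):
    filename = db.StringProperty()
    url = db.StringProperty()
    topic = db.StringProperty()
    text = db.TextProperty()
    num_tokens = db.IntegerProperty()
    num_sentences = db.IntegerProperty()

class DocumentAnnotation(db.Model):
    user_id = db.StringProperty()
    document = db.StringProperty()
    approved = db.BooleanProperty()
    concepts = db.ListProperty(db.Text)
    relations = db.ListProperty(db.Text)
    arg_units = db.ListProperty(db.Text)
    notes = db.TextProperty()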
Example #2
0
def get_docs_in_topics_by_id(topics):
    result = []
    for doc in Document.all().order('filename').run():
        if not topics or doc.topic in topics:
            result.append(doc)

    return result
Example #3
0
def get_all_topics():
    """
    Collects and sorts the unique topics in the document collection.
    """
    topics = [document.topic for document in Document.all().run()]
    topics.append(ALL_TOPICS_PLACEHOLDER)
    return sorted(list(set(topics)))
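A hedged usage sketch showing how these two helpers might be combined inside a request handler's get() method; the "topic" parameter name and the ALL_TOPICS_PLACEHOLDER handling are assumptions, not taken from the original handlers.

# Hypothetical call site (inside a webapp2 handler method).
selected_topics = self.request.get_all("topic")
if not selected_topics or ALL_TOPICS_PLACEHOLDER in selected_topics:
    selected_topics = []   # empty list -> get_docs_in_topics_by_id returns all documents
template_values = {"topics": get_all_topics(),
                   "documents": get_docs_in_topics_by_id(selected_topics)}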
Example #4
0
def get_docs_in_topics_by_id(topics):
    result = []
    for doc in Document.all().order('filename').run():
        if not topics or doc.topic in topics:
            result.append(doc)

    return result
Example #5
0
def get_all_topics():
    """
    Collects and sorts the unique topics in the document collection.
    """
    topics = [document.topic for document in Document.all().run()]
    topics.append(ALL_TOPICS_PLACEHOLDER)
    return sorted(list(set(topics)))
Example #6
0
    def dump_corpus(self):
        docs = Document.all().run()
        jsonResponse = []
        metadata = CorpusMetadata.all().get()
        if metadata:
            jsonResponse.append({"corpus_metadata": "true",
                                 "segmenter": metadata.segmenter,
                                 "preprocessing_date": metadata.preprocessing_date
                                 })

        for doc in docs:
            annotations = []
            for annotation in DocumentAnnotation.all().filter("document =", doc.filename).run():
                anno_dict = {}
                anno_dict['annotator'] = annotation.user_id
                anno_dict['arg_units'] = annotation.arg_units
                anno_dict['relations'] = annotation.relations
                anno_dict['concepts'] = annotation.concepts
                anno_dict['approved'] = str(annotation.approved)
                anno_dict['notes'] = annotation.notes
                annotations.append(anno_dict)

            jsonResponse.append({'file': doc.filename,
                                 'text': doc.text,
                                 'url': doc.url,
                                 'user_annotations': annotations,
                                 'num_tokens': doc.num_tokens,
                                 'num_sentences': doc.num_sentences
                                 })

        dump_filename = "dump_" + time.strftime("%Y-%m-%d_%H:%M:%S") + ".json"
        self.response.headers['Content-Type'] = 'application/json'
        self.response.headers['Content-Disposition'] = "attachment; filename=%s" % dump_filename
        self.response.write(
            json.dumps(jsonResponse, indent=2, sort_keys=False, separators=(',', ':')))
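For orientation, the dump written above is a single JSON array: an optional corpus-metadata record followed by one record per document, each carrying the per-annotator entries. A sketch of the shape, with placeholder values:

[
  {"corpus_metadata": "true",
   "segmenter": "<segmenter name>",
   "preprocessing_date": "<date>"},
  {"file": "<document filename>",
   "text": "<document text>",
   "url": "<source url>",
   "user_annotations": [
     {"annotator": "<user_id>",
      "arg_units": ["..."],
      "relations": ["..."],
      "concepts": ["..."],
      "approved": "True",
      "notes": "..."}
   ],
   "num_tokens": 0,
   "num_sentences": 0}
]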
Example #7
0
    def dump_corpus(self):
        docs = Document.all().run()
        jsonResponse = []
        metadata = CorpusMetadata.all().get()
        if metadata:
            jsonResponse.append({
                "corpus_metadata":
                "true",
                "segmenter":
                metadata.segmenter,
                "preprocessing_date":
                metadata.preprocessing_date
            })

        for doc in docs:
            annotations = []
            for annotation in DocumentAnnotation.all().filter(
                    "document =", doc.filename).run():
                anno_dict = {}
                anno_dict['annotator'] = annotation.user_id
                anno_dict['arg_units'] = annotation.arg_units
                anno_dict['relations'] = annotation.relations
                anno_dict['concepts'] = annotation.concepts
                anno_dict['approved'] = str(annotation.approved)
                anno_dict['notes'] = annotation.notes
                annotations.append(anno_dict)

            jsonResponse.append({
                'file': doc.filename,
                'text': doc.text,
                'url': doc.url,
                'user_annotations': annotations,
                'num_tokens': doc.num_tokens,
                'num_sentences': doc.num_sentences
            })

        dump_filename = "dump_" + time.strftime("%Y-%m-%d_%H:%M:%S") + ".json"
        self.response.headers['Content-Type'] = 'application/json'
        self.response.headers[
            'Content-Disposition'] = "attachment; filename=%s" % dump_filename
        self.response.write(
            json.dumps(jsonResponse,
                       indent=2,
                       sort_keys=False,
                       separators=(',', ':')))
Example #8
0
def mark_as_current_doc(user_id, doc_filename, default_doc):
    """
    According to the given document filename, select the current document for the given user_id
    1. If doc_filename is not None, it takes precedence
    2. If the user_id has no current document or there is no document for the given filename, 
       resort to the given default value
    """
    user_data = load_user_data(user_id)

    if doc_filename:
        doc = Document.all().filter("filename =", doc_filename).get()
        if not doc:
            doc = default_doc
    elif user_data.current_doc:
        doc = user_data.current_doc
    else:
        doc = default_doc

    user_data.current_doc = doc
    db.get(user_data.put())

    return doc
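A minimal usage sketch, assuming the handler passes the optional "doc" request parameter and picks the first document as the default; both choices are illustrative assumptions.

# Hypothetical call site (inside a webapp2 handler method).
default_doc = Document.all().order('filename').get()
current_doc = mark_as_current_doc(user_id, self.request.get("doc"), default_doc)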
Example #9
0
def mark_as_current_doc(user_id, doc_filename, default_doc):
    """
    According to the given document filename, select the current document for the given user_id
    1. If doc_filename is not None, it takes precedence
    2. If the user_id has no current document or there is no document for the given filename, 
       resort to the given default value
    """
    user_data = load_user_data(user_id)

    if doc_filename:
        doc = Document.all().filter("filename =", doc_filename).get()
        if not doc:
            doc = default_doc
    elif user_data.current_doc:
        doc = user_data.current_doc
    else:
        doc = default_doc

    user_data.current_doc = doc
    db.get(user_data.put())

    return doc
Example #10
0
    def store_annotations(self, user_id, data):
        doc_filename = data['doc']
        doc = Document.all().filter("filename =", doc_filename).get()

        doc_annotation = DocumentAnnotation.all().filter("user_id =", user_id).filter("document =",
                                                                                      doc.filename).get()
        if not doc_annotation:
            doc_annotation = DocumentAnnotation(user_id=user_id, document=doc.filename)

        doc_annotation.approved = data["approved"]

        doc_annotation.concepts = [Text(item) for item in data["concepts"]]
        doc_annotation.relations = [Text(item) for item in data["relations"]]
        doc_annotation.arg_units = [Text(item) for item in data["arg_units"]]

        doc_annotation.notes = Text(data['notes'])

        log("Storing annotations " + (
        "[Approved]" if doc_annotation.approved else "") + " for " + str(doc.filename) + ": " + str(
            doc_annotation.arg_units) + " - notes: " + doc_annotation.notes.encode("utf-8").replace(
            "\n", " NL "))

        db.get(doc_annotation.put())
Example #11
0
 def drop_all_data(self):
     db.delete(DocumentAnnotation.all().fetch(10000))
     db.delete(ArgumentationUnit.all().fetch(10000))
     db.delete(Document.all().fetch(10000))
     db.delete(UserData.all().fetch(10000))
     db.delete(CorpusMetadata.all().fetch(10000))
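Note that fetch(10000) deletes at most 10,000 entities per kind in a single call. If a kind can grow beyond that, a keys-only loop is a complete and cheaper alternative; a hedged sketch, not part of the original handler:

def _delete_all(model_class, batch_size=500):
    # Delete every entity of the given kind in batches, using keys-only
    # queries so entity payloads are never loaded.
    while True:
        keys = model_class.all(keys_only=True).fetch(batch_size)
        if not keys:
            break
        db.delete(keys)

# e.g. _delete_all(DocumentAnnotation); _delete_all(Document)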
Example #12
0
    def post(self):

        if not self.validate_user():
            return
        if self.request.path.endswith('importannotations'):
            dump_content = self.request.get("dumpfile")
            if dump_content:

                self.drop_all_annotations()
                verbatim_message = ""

                json_data = json.loads(dump_content)
                for doc_data in json_data:
                    if "corpus_metadata" in doc_data:
                        continue

                    filename = doc_data["file"]

                    doc = Document.all().filter("filename =", filename).get()
                    if doc:

                        arg_units = []
                        relations = []
                        concepts = []

                        for annotation_data in doc_data["user_annotations"]:
                            anno = self.annotation_from_json(
                                annotation_data, doc)
                            arg_units.extend(anno.arg_units)
                            relations.extend(anno.relations)
                            concepts.extend(anno.concepts)
                            anno.put()

                        verbatim_message += "IMPORTED %25s: %4d arg. units\n" % (
                            filename, len(arg_units))
                    else:
                        verbatim_message += "SKIPPED  %25s: Document not in collection.\n" % (
                            filename)

                message = "Annotations imported."
            else:
                verbatim_message = ""
                message = "No file to import!"

            self.redirect(
                '%s?%s' %
                (self.base_path(),
                 urllib.urlencode({
                     "message": message,
                     "verbatim_message": verbatim_message
                 })))
        if self.request.path.endswith('importdump'):
            dump_content = self.request.get("dumpfile")
            if dump_content:

                self.drop_all_data()

                json_data = json.loads(dump_content)
                docs = []
                annos = []
                for doc_data in json_data:
                    if "corpus_metadata" in doc_data:
                        metadata = extract_metadata(doc_data)
                        metadata.put()
                    else:
                        doc = Document()
                        initialize_document(doc, doc_data)
                        docs.append(doc)

                        for annotation_data in doc_data["user_annotations"]:
                            anno = self.annotation_from_json(
                                annotation_data, doc)
                            annos.append(anno)

                db.put(docs)
                db.put(annos)

                message = "Corpus dump imported."
            else:
                message = "No file to import!"

            self.redirect(
                '%s?%s' %
                (self.base_path(), urllib.urlencode({"message": message})))
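Both import branches rely on annotation_from_json, which is not shown in these examples. A plausible sketch, assuming it simply mirrors one entry of the "user_annotations" list written by dump_corpus; the real helper may differ.

    def annotation_from_json(self, annotation_data, doc):
        # Hypothetical reconstruction of the missing helper: rebuilds a
        # DocumentAnnotation from one per-annotator dictionary of the dump.
        anno = DocumentAnnotation(user_id=annotation_data['annotator'],
                                  document=doc.filename)
        anno.arg_units = [Text(item) for item in annotation_data['arg_units']]
        anno.relations = [Text(item) for item in annotation_data['relations']]
        anno.concepts = [Text(item) for item in annotation_data['concepts']]
        anno.approved = annotation_data['approved'] == "True"
        anno.notes = Text(annotation_data['notes'])
        return anno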
Example #13
0
    def get(self):

        user_id = access_control.get_current_user_id()

        if not self.validate_user():
            return
        elif self.request.path.endswith('dump'):
            self.dump_corpus()
        elif self.request.path.endswith('dropall'):
            self.drop_all_data()
            self.redirect(
                '%s?%s' % (self.base_path(), urllib.urlencode({"message": 'Dropped all data.'})))

        elif self.request.path.endswith('dropanno'):
            self.drop_all_annotations()
            self.redirect('%s?%s' % (
            self.base_path(), urllib.urlencode({"message": 'Dropped all annotations.'})))

        elif self.request.path.endswith('loaddata'):
            response_text = self.load_documents()
            self.redirect(
                '%s?%s' % (self.base_path(), urllib.urlencode({"verbatim_message": response_text})))

        elif self.request.path.endswith('forceupdate'):
            response_text = self.load_documents(force_update=True)
            self.redirect(
                '%s?%s' % (self.base_path(), urllib.urlencode({"verbatim_message": response_text})))
        elif self.request.path.endswith('unapprove'):
            annotator = self.request.get("annotator")
            document = self.request.get("doc")
            self.setApproval(annotator, document, False)
            response_text = "Unapproved: %s:%s" % (annotator, document)
            self.redirect(
                '%s?%s' % (self.base_path(), urllib.urlencode({"message": response_text})))
        elif self.request.path.endswith('/managedata'):

            all_documents = [doc.filename for doc in Document.all()]
            all_documents.sort()
            all_users = access_control.get_all_users()
            all_users.sort()
            status_table = dict()
            for user in all_users:
                status_table[user] = dict()
                for doc in all_documents:
                    anno = DocumentAnnotation.all().filter("user_id =", user).filter("document =",
                                                                                     doc).get()
                    if not anno:
                        status_table[user][doc] = UNPROCESSED
                    elif not anno.approved:
                        status_table[user][doc] = IN_PROGRESS
                    else:
                        status_table[user][doc] = COMPLETE

            documents_per_line = 44
            num_docs = len(all_documents)
            num_lines = (num_docs + documents_per_line - 1) / documents_per_line
            partitioned_docs = []
            for i in range(0, num_lines):
                partitioned_docs.append(all_documents[i * documents_per_line:min(num_docs, (
                i + 1) * documents_per_line)])

            message = self.request.get('message', "")
            verbatim_message = self.request.get('verbatim_message', "")

            metadata = CorpusMetadata.all().get()
            segmenter = "unknown"
            preprocessing_date = "unknown"
            if metadata:
                segmenter = metadata.segmenter
                preprocessing_date = metadata.preprocessing_date

            template_values = {'user': user_id,
                               'logout_url': users.create_logout_url('/argunit/'),
                               'all_views': access_control.get_view_ids(user_id),
                               'current_view': access_control.MANAGE_DATA_VIEW_ID,
                               'num_documents': len(all_documents),
                               'segmenter': segmenter,
                               'preprocessing_date': preprocessing_date,
                               'all_documents': all_documents,
                               'docs_per_line': documents_per_line,
                               'partitioned_docs': partitioned_docs,
                               'all_users': all_users,
                               'status_table': status_table,
                               'message': message,
                               'verbatim_message': verbatim_message}
            template = JINJA_ENVIRONMENT.get_template('managedata.html')
            self.response.write(template.render(template_values))
        else:
            self.redirect('/argunit/managedata')
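In the managedata branch, (num_docs + documents_per_line - 1) / documents_per_line is Python 2 integer division, i.e. a ceiling division, and the subsequent slicing splits the sorted filenames into rows of at most 44 entries. A tiny worked sketch of the same arithmetic, written with // so it also holds under Python 3:

# e.g. 100 documents at 44 per line -> 3 rows of 44, 44 and 12 entries
num_docs = 100
documents_per_line = 44
num_lines = (num_docs + documents_per_line - 1) // documents_per_line   # == 3
rows = [list(range(num_docs))[i * documents_per_line:(i + 1) * documents_per_line]
        for i in range(num_lines)]
assert [len(r) for r in rows] == [44, 44, 12]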
Example #14
0
    def post(self):

        if not self.validate_user():
            return
        if self.request.path.endswith('importannotations'):
            dump_content = self.request.get("dumpfile")
            if dump_content:

                self.drop_all_annotations()
                verbatim_message = ""

                json_data = json.loads(dump_content)
                for doc_data in json_data:
                    if "corpus_metadata" in doc_data:
                        continue

                    filename = doc_data["file"]

                    doc = Document.all().filter("filename =", filename).get()
                    if doc:

                        arg_units = []
                        relations = []
                        concepts = []

                        for annotation_data in doc_data["user_annotations"]:
                            anno = self.annotation_from_json(annotation_data, doc)
                            arg_units.extend(anno.arg_units)
                            relations.extend(anno.relations)
                            concepts.extend(anno.concepts)
                            anno.put()

                        verbatim_message += "IMPORTED %25s: %4d arg. units\n" % (
                        filename, len(arg_units))
                    else:
                        verbatim_message += "SKIPPED  %25s: Document not in collection.\n" % (
                        filename)

                message = "Annotations imported."
            else:
                verbatim_message = ""
                message = "No file to import!"

            self.redirect('%s?%s' % (self.base_path(), urllib.urlencode(
                {"message": message, "verbatim_message": verbatim_message})))
        if self.request.path.endswith('importdump'):
            dump_content = self.request.get("dumpfile")
            if dump_content:

                self.drop_all_data()

                json_data = json.loads(dump_content)
                docs = []
                annos = []
                for doc_data in json_data:
                    if "corpus_metadata" in doc_data:
                        metadata = extract_metadata(doc_data)
                        metadata.put()
                    else:
                        doc = Document()
                        initialize_document(doc, doc_data)
                        docs.append(doc)

                        for annotation_data in doc_data["user_annotations"]:
                            anno = self.annotation_from_json(annotation_data, doc)
                            annos.append(anno)

                db.put(docs)
                db.put(annos)

                message = "Corpus dump imported."
            else:
                message = "No file to import!"

            self.redirect('%s?%s' % (self.base_path(), urllib.urlencode({"message": message})))
Example #15
0
 def drop_all_data(self):
     db.delete(DocumentAnnotation.all().fetch(10000))
     db.delete(ArgumentationUnit.all().fetch(10000))
     db.delete(Document.all().fetch(10000))
     db.delete(UserData.all().fetch(10000))
     db.delete(CorpusMetadata.all().fetch(10000))
Example #16
0
    def load_documents(self, force_update=False):
        response_text = ""

        data_folder = 'data/'
        metadata_filename = data_folder + 'metadata.properties'

        if os.path.exists(metadata_filename):
            db.delete(CorpusMetadata.all())
            metadata_map = {}
            metadata_file = open(metadata_filename, 'r')
            for line in metadata_file.readlines():
                if not line.startswith("#"):
                    parts = line.split("=", 1)
                    if len(parts) == 2:
                        metadata_map[parts[0]] = parts[1].strip()
            metadata = extract_metadata(metadata_map)
            metadata.put()

        files = glob.glob(data_folder + '*.json')
        doc_ids = []
        new_documents = 0
        skipped_documents = 0
        updated_documents = 0
        dropped_documents = 0

        if force_update:
            response_text += "Update forced!\n"

        for f in sorted(files):
            basename = os.path.basename(f)
            jdata = json.load(open(f, 'r'))

            documents_with_same_url = Document.all().filter("url =", jdata['url'])
            is_document_in_datastore = 0 != documents_with_same_url.count()

            jdata['file'] = basename
            doc_ids.append(basename)

            if is_document_in_datastore:

                existing_doc = documents_with_same_url.get()

                if force_update:
                    initialize_document(existing_doc, jdata)
                    existing_doc.put()

                    response_text += 'UPDATED: ' + str(basename) + " " + str(jdata['url'])
                    updated_documents += 1

                else:

                    response_text += 'SKIPPED: ' + str(basename) + " " + str(jdata['url'])
                    skipped_documents += 1
            else:
                doc = Document()
                initialize_document(doc, jdata)

                doc.put()

                response_text += '    NEW: ' + str(basename) + " " + str(jdata['url'])
                new_documents += 1

            response_text += '\n'

        response_text += "----\n"

        if force_update:
            for document in Document.all():
                if document.filename not in doc_ids:
                    dropped_documents += 1
                    db.delete(document)
                    response_text += "DROPPED: " + document.filename + "\n"

        response_text += "=" * 100 + "\n"
        response_text += "Summary:\n"
        response_text += "\tNew:    " + str(new_documents) + "\n"
        response_text += "\tUpdated:" + str(updated_documents) + "\n"
        response_text += "\tSkipped:" + str(skipped_documents) + "\n"
        response_text += "\tDropped:" + str(dropped_documents) + "\n"

        return response_text
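load_documents expects a Java-style properties file next to the document JSON files: lines starting with # are comments, and everything after the first = is the value. A hedged example of what data/metadata.properties might contain; the keys follow the fields used by extract_metadata and the manage-data view, but the actual file may differ.

# data/metadata.properties (illustrative values)
segmenter=example-sentence-segmenter v1.0
preprocessing_date=2016-01-01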
Example #17
0
    def load_documents(self, force_update=False):
        response_text = ""

        data_folder = 'data/'
        metadata_filename = data_folder + 'metadata.properties'

        if os.path.exists(metadata_filename):
            db.delete(CorpusMetadata.all())
            metadata_map = {}
            metadata_file = open(metadata_filename, 'r')
            for line in metadata_file.readlines():
                if not line.startswith("#"):
                    parts = line.split("=", 1)
                    if len(parts) == 2:
                        metadata_map[parts[0]] = parts[1].strip()
            metadata = extract_metadata(metadata_map)
            metadata.put()

        files = glob.glob(data_folder + '*.json')
        doc_ids = []
        new_documents = 0
        skipped_documents = 0
        updated_documents = 0
        dropped_documents = 0

        if force_update:
            response_text += "Update forced!\n"

        for f in sorted(files):
            basename = os.path.basename(f)
            jdata = json.load(open(f, 'r'))

            documents_with_same_url = Document.all().filter(
                "url =", jdata['url'])
            is_document_in_datastore = 0 != documents_with_same_url.count()

            jdata['file'] = basename
            doc_ids.append(basename)

            if is_document_in_datastore:

                existing_doc = documents_with_same_url.get()

                if force_update:
                    initialize_document(existing_doc, jdata)
                    existing_doc.put()

                    response_text += 'UPDATED: ' + str(basename) + " " + str(
                        jdata['url'])
                    updated_documents += 1

                else:

                    response_text += 'SKIPPED: ' + str(basename) + " " + str(
                        jdata['url'])
                    skipped_documents += 1
            else:
                doc = Document()
                initialize_document(doc, jdata)

                doc.put()

                response_text += '    NEW: ' + str(basename) + " " + str(
                    jdata['url'])
                new_documents += 1

            response_text += '\n'

        response_text += "----\n"

        if force_update:
            for document in Document.all():
                if document.filename not in doc_ids:
                    dropped_documents += 1
                    db.delete(document)
                    response_text += "DROPPED: " + document.filename + "\n"

        response_text += "=" * 100 + "\n"
        response_text += "Summary:\n"
        response_text += "\tNew:    " + str(new_documents) + "\n"
        response_text += "\tUpdated:" + str(updated_documents) + "\n"
        response_text += "\tSkipped:" + str(skipped_documents) + "\n"
        response_text += "\tDropped:" + str(dropped_documents) + "\n"

        return response_text
Example #18
0
    def get(self):

        user_id = access_control.get_current_user_id()

        if not self.validate_user():
            return
        elif self.request.path.endswith('dump'):
            self.dump_corpus()
        elif self.request.path.endswith('dropall'):
            self.drop_all_data()
            self.redirect('%s?%s' %
                          (self.base_path(),
                           urllib.urlencode({"message": 'Dropped all data.'})))

        elif self.request.path.endswith('dropanno'):
            self.drop_all_annotations()
            self.redirect(
                '%s?%s' %
                (self.base_path(),
                 urllib.urlencode({"message": 'Dropped all annotations.'})))

        elif self.request.path.endswith('loaddata'):
            response_text = self.load_documents()
            self.redirect(
                '%s?%s' %
                (self.base_path(),
                 urllib.urlencode({"verbatim_message": response_text})))

        elif self.request.path.endswith('forceupdate'):
            response_text = self.load_documents(force_update=True)
            self.redirect(
                '%s?%s' %
                (self.base_path(),
                 urllib.urlencode({"verbatim_message": response_text})))
        elif self.request.path.endswith('unapprove'):
            annotator = self.request.get("annotator")
            document = self.request.get("doc")
            self.setApproval(annotator, document, False)
            response_text = "Unapproved: %s:%s" % (annotator, document)
            self.redirect('%s?%s' %
                          (self.base_path(),
                           urllib.urlencode({"message": response_text})))
        elif self.request.path.endswith('/managedata'):

            all_documents = [doc.filename for doc in Document.all()]
            all_documents.sort()
            all_users = access_control.get_all_users()
            all_users.sort()
            status_table = dict()
            for user in all_users:
                status_table[user] = dict()
                for doc in all_documents:
                    anno = DocumentAnnotation.all().filter(
                        "user_id =", user).filter("document =", doc).get()
                    if not anno:
                        status_table[user][doc] = UNPROCESSED
                    elif not anno.approved:
                        status_table[user][doc] = IN_PROGRESS
                    else:
                        status_table[user][doc] = COMPLETE

            documents_per_line = 44
            num_docs = len(all_documents)
            num_lines = (num_docs + documents_per_line -
                         1) / documents_per_line
            partitioned_docs = []
            for i in range(0, num_lines):
                partitioned_docs.append(all_documents[
                    i * documents_per_line:min(num_docs, (i + 1) *
                                               documents_per_line)])

            message = self.request.get('message', "")
            verbatim_message = self.request.get('verbatim_message', "")

            metadata = CorpusMetadata.all().get()
            segmenter = "unknown"
            preprocessing_date = "unknown"
            if metadata:
                segmenter = metadata.segmenter
                preprocessing_date = metadata.preprocessing_date

            template_values = {
                'user': user_id,
                'logout_url': users.create_logout_url('/argunit/'),
                'all_views': access_control.get_view_ids(user_id),
                'current_view': access_control.MANAGE_DATA_VIEW_ID,
                'num_documents': len(all_documents),
                'segmenter': segmenter,
                'preprocessing_date': preprocessing_date,
                'all_documents': all_documents,
                'docs_per_line': documents_per_line,
                'partitioned_docs': partitioned_docs,
                'all_users': all_users,
                'status_table': status_table,
                'message': message,
                'verbatim_message': verbatim_message
            }
            template = JINJA_ENVIRONMENT.get_template('managedata.html')
            self.response.write(template.render(template_values))
        else:
            self.redirect('/argunit/managedata')