Python DocumentAnnotation Examples, au.data.datastore.DocumentAnnotation Python Examples

Example #1

0

Show file

    def store_annotations(self, user_id, data):
        """
        Creates or updates the annotation record of one user for one document.

        The document is looked up by data['doc']. An existing
        DocumentAnnotation for (user_id, document filename) is reused,
        otherwise a new one is created. Its fields are then overwritten from
        data, a summary line is logged and the record is written to the
        datastore.

        @param user_id: value matched against / stored in the "user_id" property
        @param data: mapping providing 'doc', 'approved', 'concepts',
                     'relations', 'arg_units' and 'notes'
        """
        document = Document.all().filter("filename =", data['doc']).get()
        # NOTE(review): there is no guard for an unknown filename - presumably
        # get() yields None then and the next line fails; confirm with callers.
        filename = document.filename

        lookup = DocumentAnnotation.all()
        lookup = lookup.filter("user_id =", user_id)
        lookup = lookup.filter("document =", filename)
        record = lookup.get()
        if not record:
            record = DocumentAnnotation(user_id=user_id, document=filename)

        record.approved = data["approved"]
        record.concepts = [Text(entry) for entry in data["concepts"]]
        record.relations = [Text(entry) for entry in data["relations"]]
        record.arg_units = [Text(entry) for entry in data["arg_units"]]
        record.notes = Text(data['notes'])

        # Pieces are evaluated in the order they appear in the log line.
        pieces = [
            "Storing annotations ",
            "[Approved]" if record.approved else "",
            " for ",
            str(filename),
            ": ",
            str(record.arg_units),
            " - notes: ",
            # keep the log entry on a single line
            record.notes.encode("utf-8").replace("\n", " NL "),
        ]
        log("".join(pieces))

        # The key returned by put() is fetched again; the result is unused.
        stored_key = record.put()
        db.get(stored_key)

Example #2

0

Show file

File: annotate.py Project: UKPLab/argmin2015-DiGAT

 def get_evaluation_stats(self):
     """
     Collects the statistics that are shown on the top right.

     Both counts are taken for the e-mail address of the user returned by
     users.get_current_user().

     @return: a tuple (num_annotated, num_approved)
     """
     email = users.get_current_user().email()

     # All annotation records of this user, approved or not.
     annotated_query = DocumentAnnotation.all().filter('user_id =', email)
     num_annotated = annotated_query.count()

     # A separate query object for the approved subset.
     approved_query = DocumentAnnotation.all().filter('user_id =', email)
     approved_query = approved_query.filter('approved =', True)
     num_approved = approved_query.count()

     return num_annotated, num_approved

Example #3

0

Show file

 def get_evaluation_stats(self):
     """
     Collects the statistics that are shown on the top right.

     The counts refer to the e-mail address of the user returned by
     users.get_current_user().

     @return: a tuple (num_annotated, num_approved)
     """
     current_user = users.get_current_user()
     user_email = current_user.email()

     # Every annotation record of this user, regardless of approval.
     all_records = DocumentAnnotation.all().filter('user_id =', user_email)
     num_annotated = all_records.count()

     # Built from scratch rather than derived from the query above.
     approved_records = DocumentAnnotation.all().filter('user_id =', user_email)
     approved_records = approved_records.filter('approved =', True)
     num_approved = approved_records.count()

     return num_annotated, num_approved

Example #4

0

Show file

File: managedata.py Project: UKPLab/argmin2015-DiGAT

    def dump_corpus(self):
        """
        Writes all documents and their annotations to the response as JSON.

        The payload is a list. If a CorpusMetadata entity exists, the first
        element describes it; every further element describes one document
        together with the annotations stored for its filename. The response
        is marked as an attachment named "dump_<timestamp>.json".
        """
        docs = Document.all().run()
        payload = []

        metadata = CorpusMetadata.all().get()
        if metadata:
            header = {
                "corpus_metadata": "true",
                "segmenter": metadata.segmenter,
                "preprocessing_date": metadata.preprocessing_date,
            }
            payload.append(header)

        for doc in docs:
            annotation_query = DocumentAnnotation.all().filter(
                "document =", doc.filename)
            user_annotations = [
                {
                    'annotator': anno.user_id,
                    'arg_units': anno.arg_units,
                    'relations': anno.relations,
                    'concepts': anno.concepts,
                    # the approval flag is exported as a string
                    'approved': str(anno.approved),
                    'notes': anno.notes,
                }
                for anno in annotation_query.run()
            ]
            payload.append({
                'file': doc.filename,
                'text': doc.text,
                'url': doc.url,
                'user_annotations': user_annotations,
                'num_tokens': doc.num_tokens,
                'num_sentences': doc.num_sentences,
            })

        timestamp = time.strftime("%Y-%m-%d_%H:%M:%S")
        dump_filename = "dump_" + timestamp + ".json"

        headers = self.response.headers
        headers['Content-Type'] = 'application/json'
        headers['Content-Disposition'] = "attachment; filename=%s" % dump_filename

        body = json.dumps(payload, indent=2, sort_keys=False,
                          separators=(',', ':'))
        self.response.write(body)

Example #5

0

Show file

File: compare.py Project: UKPLab/argmin2015-DiGAT

    def load_annotations(self, user_id, doc):
        """
        Returns the annotation stored by one user for one document.

        @param user_id: value compared with the "user_id" property
        @param doc: document whose filename is compared with the "document"
                    property
        @return: the first matching DocumentAnnotation, or None if there is
                 none
        """
        # Both conditions go into the query, so only the matching record is
        # fetched instead of every annotation of the document.
        query = DocumentAnnotation.all()
        query = query.filter("document =", doc.filename)
        query = query.filter("user_id =", user_id)
        return query.get()

Example #6

0

Show file

    def load_annotations(self, user_id, doc):
        """
        Looks up the annotation that a user stored for a document.

        @param user_id: value compared with the "user_id" property
        @param doc: document whose filename is compared with the "document"
                    property
        @return: the first matching DocumentAnnotation, or None when no
                 record matches
        """
        # Filtering on user_id inside the query avoids loading the
        # annotations of all other users just to skip over them.
        return DocumentAnnotation.all().filter(
            "document =", doc.filename).filter("user_id =", user_id).get()

Example #7

0

Show file

    def annotation_from_json(self, annotation_data, doc):
        """
        Builds an unsaved DocumentAnnotation from one imported annotation.

        @param annotation_data: mapping; "annotator" is required, as is one of
                                "arg_units" / "propositions". "approved",
                                "notes", "relations" and "concepts" are
                                optional.
        @param doc: document whose filename is stored on the annotation
        @return: the populated DocumentAnnotation (put() is not called here)
        """
        anno = DocumentAnnotation()

        if "approved" in annotation_data:
            anno.approved = self.bool_from_string(annotation_data["approved"])
        else:
            anno.approved = False
        anno.user_id = annotation_data["annotator"]

        if "notes" in annotation_data:
            anno.notes = annotation_data["notes"]
        else:
            anno.notes = ""

        # "propositions" is read whenever "arg_units" is absent.
        if "arg_units" in annotation_data:
            units_key = "arg_units"
        else:
            units_key = "propositions"
        anno.arg_units = [Text(unit) for unit in annotation_data[units_key]]

        # Optional lists are only assigned when present in the input.
        for optional_key in ("relations", "concepts"):
            if optional_key in annotation_data:
                values = [Text(v) for v in annotation_data[optional_key]]
                setattr(anno, optional_key, values)

        self.convert_from_legacy_arg_units(anno)
        anno.document = doc.filename
        return anno

Example #8

0

Show file

File: managedata.py Project: UKPLab/argmin2015-DiGAT

    def annotation_from_json(self, annotation_data, doc):
        """
        Converts one imported annotation mapping into a DocumentAnnotation.

        @param annotation_data: mapping; must contain "annotator" and either
                                "arg_units" or "propositions"; may contain
                                "approved", "notes", "relations", "concepts"
        @param doc: document whose filename is stored on the annotation
        @return: the populated DocumentAnnotation; it is not saved here
        """
        result = DocumentAnnotation()

        approved = False
        if "approved" in annotation_data:
            approved = self.bool_from_string(annotation_data["approved"])
        result.approved = approved

        result.user_id = annotation_data["annotator"]

        notes = ""
        if "notes" in annotation_data:
            notes = annotation_data["notes"]
        result.notes = notes

        # Falls back to the "propositions" key when "arg_units" is missing.
        source_key = "arg_units" if "arg_units" in annotation_data else "propositions"
        result.arg_units = [Text(entry) for entry in annotation_data[source_key]]

        if "relations" in annotation_data:
            result.relations = [Text(entry) for entry in annotation_data["relations"]]

        if "concepts" in annotation_data:
            result.concepts = [Text(entry) for entry in annotation_data["concepts"]]

        self.convert_from_legacy_arg_units(result)
        result.document = doc.filename
        return result

Example #9

0

Show file

def get_processing_state(user_id):
    """
    Creates a dictionary {document-filename => processing state}

    Documents with an approved annotation by user_id map to COMPLETE, those
    with an unapproved annotation to IN_PROGRESS. The result is a
    defaultdict: any other key yields UNPROCESSED (and is added on lookup).
    """
    default_state = UNPROCESSED

    # A closure replaces itertools.repeat(value).next, which relies on the
    # Python 2 iterator method name.
    processed_docs = collections.defaultdict(lambda: default_state)

    query = DocumentAnnotation.all().filter("user_id =", user_id)
    for annotation in query.run():
        processed_docs[annotation.document] = (
            COMPLETE if annotation.approved else IN_PROGRESS)

    return processed_docs

Example #10

0

Show file

File: utils.py Project: UKPLab/argmin2015-DiGAT

def get_processing_state(user_id):
    """
    Creates a dictionary {document-filename => processing state}

    An approved annotation by user_id marks its document COMPLETE, an
    unapproved one IN_PROGRESS. The returned defaultdict answers every other
    key with UNPROCESSED (inserting it on lookup).
    """
    unprocessed = UNPROCESSED

    def default_state():
        # Plain factory instead of itertools.repeat(value).next, which only
        # exists under the Python 2 iterator protocol.
        return unprocessed

    processed_docs = collections.defaultdict(default_state)

    for annotation in DocumentAnnotation.all().filter("user_id =", user_id).run():
        state = COMPLETE if annotation.approved else IN_PROGRESS
        processed_docs[annotation.document] = state

    return processed_docs

Example #11

0

Show file

    def dump_corpus(self):
        """
        Sends the whole corpus, including annotations, as a JSON download.

        The JSON value is a list: an optional leading entry for the
        CorpusMetadata entity, followed by one entry per document with the
        annotations stored under its filename. The attachment is named
        "dump_<timestamp>.json".
        """
        docs = Document.all().run()
        entries = []

        metadata = CorpusMetadata.all().get()
        if metadata:
            entries.append({
                "corpus_metadata": "true",
                "segmenter": metadata.segmenter,
                "preprocessing_date": metadata.preprocessing_date,
            })

        for doc in docs:
            matching = DocumentAnnotation.all().filter("document =", doc.filename)
            annotations = [
                {
                    'annotator': record.user_id,
                    'arg_units': record.arg_units,
                    'relations': record.relations,
                    'concepts': record.concepts,
                    # exported as a string, not as a JSON boolean
                    'approved': str(record.approved),
                    'notes': record.notes,
                }
                for record in matching.run()
            ]
            entries.append({
                'file': doc.filename,
                'text': doc.text,
                'url': doc.url,
                'user_annotations': annotations,
                'num_tokens': doc.num_tokens,
                'num_sentences': doc.num_sentences,
            })

        dump_filename = "dump_" + time.strftime("%Y-%m-%d_%H:%M:%S") + ".json"
        disposition = "attachment; filename=%s" % dump_filename

        self.response.headers['Content-Type'] = 'application/json'
        self.response.headers['Content-Disposition'] = disposition

        serialized = json.dumps(entries, indent=2, sort_keys=False,
                                separators=(',', ':'))
        self.response.write(serialized)

Example #12

0

Show file

File: annotate.py Project: UKPLab/argmin2015-DiGAT

    def store_annotations(self, user_id, data):
        """
        Saves the annotations submitted by one user for one document.

        data['doc'] names the document. The DocumentAnnotation matching
        (user_id, document filename) is updated, or a new one is created when
        none exists. All fields are taken from data, a one-line summary is
        logged and the record is written to the datastore.

        @param user_id: value matched against / stored in the "user_id" property
        @param data: mapping providing 'doc', 'approved', 'concepts',
                     'relations', 'arg_units' and 'notes'
        """
        doc = Document.all().filter("filename =", data['doc']).get()
        # NOTE(review): an unknown filename is not handled - presumably get()
        # returns None and the attribute access below fails; confirm.
        target = doc.filename

        existing = DocumentAnnotation.all().filter("user_id =", user_id)
        existing = existing.filter("document =", target).get()
        if existing:
            doc_annotation = existing
        else:
            doc_annotation = DocumentAnnotation(user_id=user_id, document=target)

        doc_annotation.approved = data["approved"]
        doc_annotation.concepts = [Text(value) for value in data["concepts"]]
        doc_annotation.relations = [Text(value) for value in data["relations"]]
        doc_annotation.arg_units = [Text(value) for value in data["arg_units"]]
        doc_annotation.notes = Text(data['notes'])

        # Assembled piece by piece in the order of the final log line.
        message = "Storing annotations "
        message += "[Approved]" if doc_annotation.approved else ""
        message += " for " + str(target)
        message += ": " + str(doc_annotation.arg_units)
        # newlines are replaced so that the entry stays on one line
        message += " - notes: " + doc_annotation.notes.encode("utf-8").replace(
            "\n", " NL ")
        log(message)

        # put() returns the key, which is fetched again; the result is unused.
        key = doc_annotation.put()
        db.get(key)

Example #13

0

Show file

File: managedata.py Project: UKPLab/argmin2015-DiGAT

 def drop_all_data(self):
     """
     Deletes the stored entities of the five kinds listed below.

     At most 10000 entities per kind are removed by one call; a larger
     store needs repeated calls. Only keys are fetched, because db.delete()
     needs nothing but the key to remove an entity.
     """
     kinds = (DocumentAnnotation, ArgumentationUnit, Document, UserData,
              CorpusMetadata)
     for kind in kinds:
         db.delete(kind.all(keys_only=True).fetch(10000))

Example #14

0

Show file

File: annotate.py Project: UKPLab/argmin2015-DiGAT

    def get(self):
        """
        Handles GET requests of the annotation view.

        Paths ending in '/annotate' render annotate.html for the current user,
        or for the user named by the 'user' request parameter when that name
        is among access_control.get_all_users(). Paths ending in
        '/selecttopic' store the submitted 'topic' values for the current
        user and redirect to '/argunit/annotate'. Other paths produce no
        output, and nothing is done when validate_user() fails.
        """
        if not self.validate_user():
            return

        request_path = self.request.path

        if request_path.endswith('/annotate'):
            user_id = access_control.get_current_user_id()
            # Allow to take the role of another user
            requested_user = self.request.get('user', None)
            if requested_user in access_control.get_all_users():
                user_id = requested_user

            documents_sorted_by_id = get_docs_ordered_by_id(user_id)
            processing_state = get_processing_state(user_id)

            # Annotation statistics.
            # NOTE(review): called without user_id - confirm that the numbers
            # follow the role taken above.
            num_annotated, num_approved = self.get_evaluation_stats()

            template_values = {
                'user': user_id,
                'logout_url': users.create_logout_url('/argunit/'),
                'all_views': access_control.get_view_ids(user_id),
                'current_view': access_control.ANNOTATE_VIEW_ID,
            }

            if documents_sorted_by_id:
                doc_filename = self.request.get('doc', None)
                # The first document is handed over along with the request.
                first_doc = documents_sorted_by_id[0]
                doc = mark_as_current_doc(user_id, doc_filename, first_doc)

                topics = get_all_topics()
                current_topic = get_current_topic(user_id)

                opening_time = time.strftime("%H:%M:%S %Z", time.localtime())

                docs_with_processing_state = []
                for candidate in documents_sorted_by_id:
                    state = processing_state[candidate.filename]
                    docs_with_processing_state.append((candidate, state))

                # Get arg_units to show
                own_query = DocumentAnnotation.all().filter('user_id =', user_id)
                stored = own_query.filter('document =', doc.filename).get()
                if stored:
                    doc_approved = stored.approved
                    notes = stored.notes
                    arg_units = [str(unit) for unit in stored.arg_units]
                    relations = [str(rel) for rel in stored.relations]
                    concepts = [str(con) for con in stored.concepts]
                else:
                    # Nothing saved yet for this user and document.
                    doc_approved = False
                    notes = ""
                    arg_units = []
                    relations = []
                    concepts = []

                template_values.update({
                    'navigation_docs': docs_with_processing_state,
                    'documents_sorted_by_id': documents_sorted_by_id,
                    'text': doc.text,
                    'doc_url': doc.url,
                    'doc_filename': doc.filename,
                    'doc_approved': doc_approved,
                    'num_sentences': doc.num_sentences,
                    'num_tokens': doc.num_tokens,
                    'time': opening_time,
                    'arg_units': json.dumps(arg_units),
                    'relations': json.dumps(relations),
                    'concepts': json.dumps(concepts),
                    'all_topics': topics,
                    'current_topic': current_topic,
                    'num_annotated': num_annotated,
                    'num_approved': num_approved,
                    'notes': notes,
                    'has_document': True,
                    'message': "",
                })
            else:
                template_values.update(
                    has_document=False,
                    navigation_docs=[],
                    num_annotated=num_annotated,
                    num_approved=num_approved,
                    message="No documents to display.",
                )

            page = JINJA_ENVIRONMENT.get_template('annotate.html')
            self.response.write(page.render(template_values))

        elif request_path.endswith('/selecttopic'):
            user_id = access_control.get_current_user_id()
            chosen_topics = self.request.get_all('topic', None)
            set_as_current_topics(user_id, chosen_topics)
            self.redirect('/argunit/annotate')

Example #15

0

Show file

 def setApproval(self, annotator, document, isApproved):
     """
     Sets the approval flag on the annotation of one user for one document.

     Nothing is changed when no such annotation exists.

     @param annotator: value matched against the "user_id" property
     @param document: value matched against the "document" property
     @param isApproved: new value of the "approved" property
     """
     matches = DocumentAnnotation.all()
     matches = matches.filter("user_id =", annotator)
     matches = matches.filter("document =", document)
     record = matches.get()
     if not record:
         return

     record.approved = isApproved
     record.put()

Example #16

0

Show file

File: managedata.py Project: UKPLab/argmin2015-DiGAT

    def get(self):
        """
        Dispatches GET requests of the data management view by path suffix.

        'dump' writes the corpus dump; 'dropall', 'dropanno', 'loaddata',
        'forceupdate' and 'unapprove' perform their action and redirect to
        base_path() with a message; '/managedata' renders managedata.html
        with the per-user/per-document status table; every other path is
        redirected to '/argunit/managedata'. Nothing is done when
        validate_user() fails.

        NOTE(review): the destructive actions are reachable through plain GET
        requests - confirm that this is intended.
        """
        user_id = access_control.get_current_user_id()

        if not self.validate_user():
            return

        def redirect_with(param, text):
            # Back to the base page, carrying the text as a query parameter.
            query_string = urllib.urlencode({param: text})
            self.redirect('%s?%s' % (self.base_path(), query_string))

        path = self.request.path

        if path.endswith('dump'):
            self.dump_corpus()

        elif path.endswith('dropall'):
            self.drop_all_data()
            redirect_with("message", 'Dropped all data.')

        elif path.endswith('dropanno'):
            self.drop_all_annotations()
            redirect_with("message", 'Dropped all annotations.')

        elif path.endswith('loaddata'):
            redirect_with("verbatim_message", self.load_documents())

        elif path.endswith('forceupdate'):
            redirect_with("verbatim_message",
                          self.load_documents(force_update=True))

        elif path.endswith('unapprove'):
            annotator = self.request.get("annotator")
            document = self.request.get("doc")
            self.setApproval(annotator, document, False)
            redirect_with("message",
                          "Unapproved: %s:%s" % (annotator, document))

        elif path.endswith('/managedata'):
            all_documents = sorted(doc.filename for doc in Document.all())
            all_users = access_control.get_all_users()
            all_users.sort()

            # status_table[user][filename] -> processing state constant
            status_table = {}
            for user in all_users:
                row = status_table[user] = {}
                for filename in all_documents:
                    anno = DocumentAnnotation.all().filter(
                        "user_id =", user).filter("document =", filename).get()
                    if not anno:
                        row[filename] = UNPROCESSED
                    elif not anno.approved:
                        row[filename] = IN_PROGRESS
                    else:
                        row[filename] = COMPLETE

            # Split the filenames into rows of documents_per_line entries.
            # Stepping through the list avoids computing the row count with
            # "/", which is only an integer under floor division.
            documents_per_line = 44
            num_docs = len(all_documents)
            partitioned_docs = [
                all_documents[start:start + documents_per_line]
                for start in range(0, num_docs, documents_per_line)
            ]

            message = self.request.get('message', "")
            verbatim_message = self.request.get('verbatim_message', "")

            metadata = CorpusMetadata.all().get()
            segmenter = "unknown"
            preprocessing_date = "unknown"
            if metadata:
                segmenter = metadata.segmenter
                preprocessing_date = metadata.preprocessing_date

            template_values = {
                'user': user_id,
                'logout_url': users.create_logout_url('/argunit/'),
                'all_views': access_control.get_view_ids(user_id),
                'current_view': access_control.MANAGE_DATA_VIEW_ID,
                'num_documents': num_docs,
                'segmenter': segmenter,
                'preprocessing_date': preprocessing_date,
                'all_documents': all_documents,
                'docs_per_line': documents_per_line,
                'partitioned_docs': partitioned_docs,
                'all_users': all_users,
                'status_table': status_table,
                'message': message,
                'verbatim_message': verbatim_message,
            }
            template = JINJA_ENVIRONMENT.get_template('managedata.html')
            self.response.write(template.render(template_values))

        else:
            self.redirect('/argunit/managedata')

Example #17

0

Show file

File: managedata.py Project: UKPLab/argmin2015-DiGAT

 def drop_all_annotations(self):
     """
     Deletes the stored entities of the three kinds listed below.

     At most 10000 entities per kind are removed by one call; a larger
     store needs repeated calls. Only keys are fetched, because db.delete()
     needs nothing but the key to remove an entity.
     """
     for kind in (DocumentAnnotation, ArgumentationUnit, UserData):
         db.delete(kind.all(keys_only=True).fetch(10000))

Example #18

0

Show file

    def get(self):
        """
        Routes GET requests of the data management view by path suffix.

        'dump' writes the corpus dump; 'dropall', 'dropanno', 'loaddata',
        'forceupdate' and 'unapprove' run their action and then redirect to
        base_path() with a message; '/managedata' renders managedata.html
        including the status of every document for every user; any other
        path is redirected to '/argunit/managedata'. Nothing is done when
        validate_user() fails.

        NOTE(review): the destructive actions are triggered by plain GET
        requests - confirm that this is intended.
        """
        user_id = access_control.get_current_user_id()

        if not self.validate_user():
            return

        def back_to_base(param, text):
            # Redirect to the base page with the text as a query parameter.
            self.redirect(
                '%s?%s' % (self.base_path(), urllib.urlencode({param: text})))

        suffix_of = self.request.path.endswith

        if suffix_of('dump'):
            self.dump_corpus()

        elif suffix_of('dropall'):
            self.drop_all_data()
            back_to_base("message", 'Dropped all data.')

        elif suffix_of('dropanno'):
            self.drop_all_annotations()
            back_to_base("message", 'Dropped all annotations.')

        elif suffix_of('loaddata'):
            response_text = self.load_documents()
            back_to_base("verbatim_message", response_text)

        elif suffix_of('forceupdate'):
            response_text = self.load_documents(force_update=True)
            back_to_base("verbatim_message", response_text)

        elif suffix_of('unapprove'):
            annotator = self.request.get("annotator")
            document = self.request.get("doc")
            self.setApproval(annotator, document, False)
            response_text = "Unapproved: %s:%s" % (annotator, document)
            back_to_base("message", response_text)

        elif suffix_of('/managedata'):
            all_documents = sorted(doc.filename for doc in Document.all())
            all_users = access_control.get_all_users()
            all_users.sort()

            # One lookup per (user, document) pair fills the status table.
            status_table = {}
            for user in all_users:
                per_document = {}
                for filename in all_documents:
                    anno = DocumentAnnotation.all().filter(
                        "user_id =", user).filter("document =", filename).get()
                    if not anno:
                        per_document[filename] = UNPROCESSED
                    elif not anno.approved:
                        per_document[filename] = IN_PROGRESS
                    else:
                        per_document[filename] = COMPLETE
                status_table[user] = per_document

            # Rows of documents_per_line filenames each. Slicing in fixed
            # steps needs no row count, so nothing depends on "/" performing
            # floor division.
            documents_per_line = 44
            num_docs = len(all_documents)
            partitioned_docs = [
                all_documents[offset:offset + documents_per_line]
                for offset in range(0, num_docs, documents_per_line)
            ]

            message = self.request.get('message', "")
            verbatim_message = self.request.get('verbatim_message', "")

            metadata = CorpusMetadata.all().get()
            if metadata:
                segmenter = metadata.segmenter
                preprocessing_date = metadata.preprocessing_date
            else:
                segmenter = "unknown"
                preprocessing_date = "unknown"

            template_values = {
                'user': user_id,
                'logout_url': users.create_logout_url('/argunit/'),
                'all_views': access_control.get_view_ids(user_id),
                'current_view': access_control.MANAGE_DATA_VIEW_ID,
                'num_documents': num_docs,
                'segmenter': segmenter,
                'preprocessing_date': preprocessing_date,
                'all_documents': all_documents,
                'docs_per_line': documents_per_line,
                'partitioned_docs': partitioned_docs,
                'all_users': all_users,
                'status_table': status_table,
                'message': message,
                'verbatim_message': verbatim_message,
            }
            template = JINJA_ENVIRONMENT.get_template('managedata.html')
            self.response.write(template.render(template_values))

        else:
            self.redirect('/argunit/managedata')

Example #19

0

Show file

File: managedata.py Project: UKPLab/argmin2015-DiGAT

 def setApproval(self, annotator, document, isApproved):
     """
     Updates the approval flag of one user's annotation for one document.

     Does nothing when no matching annotation is stored.

     @param annotator: value matched against the "user_id" property
     @param document: value matched against the "document" property
     @param isApproved: new value of the "approved" property
     """
     by_user = DocumentAnnotation.all().filter("user_id =", annotator)
     found = by_user.filter("document =", document).get()
     if not found:
         return

     found.approved = isApproved
     found.put()

Example #20

0

Show file

    def get(self):
        """
        Serves GET requests of the annotation view.

        A path ending in '/annotate' renders annotate.html for the current
        user, or for the user given in the 'user' request parameter if that
        name is among access_control.get_all_users(). A path ending in
        '/selecttopic' stores the submitted 'topic' values for the current
        user and redirects to '/argunit/annotate'. Any other path produces no
        output; nothing is done when validate_user() fails.
        """
        if not self.validate_user():
            return

        path = self.request.path

        if path.endswith('/annotate'):
            user_id = access_control.get_current_user_id()
            # Allow to take the role of another user
            role = self.request.get('user', None)
            if role in access_control.get_all_users():
                user_id = role

            documents_sorted_by_id = get_docs_ordered_by_id(user_id)
            processing_state = get_processing_state(user_id)

            # Annotation statistics.
            # NOTE(review): no user_id is passed here - confirm that the
            # numbers belong to the role taken above.
            stats = self.get_evaluation_stats()
            num_annotated, num_approved = stats

            template_values = {
                'user': user_id,
                'logout_url': users.create_logout_url('/argunit/'),
                'all_views': access_control.get_view_ids(user_id),
                'current_view': access_control.ANNOTATE_VIEW_ID,
            }

            if documents_sorted_by_id:
                doc_filename = self.request.get('doc', None)
                # The first document is passed in addition to the request value.
                doc = mark_as_current_doc(
                    user_id, doc_filename, documents_sorted_by_id[0])

                topics = get_all_topics()
                current_topic = get_current_topic(user_id)

                opening_time = time.strftime("%H:%M:%S %Z", time.localtime())

                docs_with_processing_state = []
                for entry in documents_sorted_by_id:
                    docs_with_processing_state.append(
                        (entry, processing_state[entry.filename]))

                # Get arg_units to show
                lookup = DocumentAnnotation.all()
                lookup = lookup.filter('user_id =', user_id)
                lookup = lookup.filter('document =', doc.filename)
                personal = lookup.get()
                if personal:
                    doc_approved = personal.approved
                    notes = personal.notes
                    arg_units = [str(value) for value in personal.arg_units]
                    relations = [str(value) for value in personal.relations]
                    concepts = [str(value) for value in personal.concepts]
                else:
                    # No annotation saved for this user and document so far.
                    doc_approved = False
                    notes = ""
                    arg_units = []
                    relations = []
                    concepts = []

                template_values.update({
                    'navigation_docs': docs_with_processing_state,
                    'documents_sorted_by_id': documents_sorted_by_id,
                    'text': doc.text,
                    'doc_url': doc.url,
                    'doc_filename': doc.filename,
                    'doc_approved': doc_approved,
                    'num_sentences': doc.num_sentences,
                    'num_tokens': doc.num_tokens,
                    'time': opening_time,
                    'arg_units': json.dumps(arg_units),
                    'relations': json.dumps(relations),
                    'concepts': json.dumps(concepts),
                    'all_topics': topics,
                    'current_topic': current_topic,
                    'num_annotated': num_annotated,
                    'num_approved': num_approved,
                    'notes': notes,
                    'has_document': True,
                    'message': "",
                })
            else:
                template_values.update(
                    has_document=False,
                    navigation_docs=[],
                    num_annotated=num_annotated,
                    num_approved=num_approved,
                    message="No documents to display.",
                )

            rendered = JINJA_ENVIRONMENT.get_template('annotate.html').render(
                template_values)
            self.response.write(rendered)

        elif path.endswith('/selecttopic'):
            user_id = access_control.get_current_user_id()
            selected = self.request.get_all('topic', None)
            set_as_current_topics(user_id, selected)
            self.redirect('/argunit/annotate')

Example #21

0

Show file

 def drop_all_data(self):
     """
     Removes the stored entities of the five kinds listed below.

     One call deletes at most 10000 entities per kind, so a larger store
     needs repeated calls. The queries fetch keys only, because db.delete()
     needs nothing but the key to remove an entity.
     """
     for model in (DocumentAnnotation, ArgumentationUnit, Document,
                   UserData, CorpusMetadata):
         keys = model.all(keys_only=True).fetch(10000)
         db.delete(keys)

Example #22

0

Show file

 def drop_all_annotations(self):
     """
     Removes the stored entities of the three kinds listed below.

     One call deletes at most 10000 entities per kind, so a larger store
     needs repeated calls. The queries fetch keys only, because db.delete()
     needs nothing but the key to remove an entity.
     """
     for model in (DocumentAnnotation, ArgumentationUnit, UserData):
         keys = model.all(keys_only=True).fetch(10000)
         db.delete(keys)