Example 1
def extract_metadata(dictionary):
    metadata = CorpusMetadata()
    metadata.segmenter = dictionary["segmenter"] if "segmenter" in dictionary else ""
    metadata.preprocessing_date = dictionary[
        "preprocessing_date"] if "preprocessing_date" in dictionary else ""

    return metadata
Example 2
def extract_metadata(dictionary):
    metadata = CorpusMetadata()
    metadata.segmenter = dictionary[
        "segmenter"] if "segmenter" in dictionary else ""
    metadata.preprocessing_date = dictionary[
        "preprocessing_date"] if "preprocessing_date" in dictionary else ""

    return metadata
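
Both variants above guard each optional key with an explicit membership test. A minimal, behaviourally equivalent sketch (using a plain stand-in class so it runs outside App Engine) can rely on dict.get with a default instead:

class CorpusMetadata(object):
    """Stand-in for the App Engine model used above (illustration only)."""
    segmenter = ""
    preprocessing_date = ""


def extract_metadata(dictionary):
    metadata = CorpusMetadata()
    # dict.get supplies the "" default, replacing the explicit "key in dict" checks.
    metadata.segmenter = dictionary.get("segmenter", "")
    metadata.preprocessing_date = dictionary.get("preprocessing_date", "")
    return metadata


print(extract_metadata({"segmenter": "example-segmenter"}).preprocessing_date == "")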
Example 3
    def dump_corpus(self):
        docs = Document.all().run()
        jsonResponse = []
        metadata = CorpusMetadata.all().get()
        if metadata:
            jsonResponse.append({"corpus_metadata": "true",
                                 "segmenter": metadata.segmenter,
                                 "preprocessing_date": metadata.preprocessing_date
                                 })

        for doc in docs:
            annotations = []
            for annotation in DocumentAnnotation.all().filter("document =", doc.filename).run():
                anno_dict = {}
                anno_dict['annotator'] = annotation.user_id
                anno_dict['arg_units'] = annotation.arg_units
                anno_dict['relations'] = annotation.relations
                anno_dict['concepts'] = annotation.concepts
                anno_dict['approved'] = str(annotation.approved)
                anno_dict['notes'] = annotation.notes
                annotations.append(anno_dict)

            jsonResponse.append({'file': doc.filename,
                                 'text': doc.text,
                                 'url': doc.url,
                                 'user_annotations': annotations,
                                 'num_tokens': doc.num_tokens,
                                 'num_sentences': doc.num_sentences
                                 })

        dump_filename = "dump_" + time.strftime("%Y-%m-%d_%H:%M:%S") + ".json"
        self.response.headers['Content-Type'] = 'application/json'
        self.response.headers['Content-Disposition'] = "attachment; filename=%s" % dump_filename
        self.response.write(
            json.dumps(jsonResponse, indent=2, sort_keys=False, separators=(',', ':')))
Example 4
    def dump_corpus(self):
        docs = Document.all().run()
        jsonResponse = []
        metadata = CorpusMetadata.all().get()
        if metadata:
            jsonResponse.append({
                "corpus_metadata":
                "true",
                "segmenter":
                metadata.segmenter,
                "preprocessing_date":
                metadata.preprocessing_date
            })

        for doc in docs:
            annotations = []
            for annotation in DocumentAnnotation.all().filter(
                    "document =", doc.filename).run():
                anno_dict = {}
                anno_dict['annotator'] = annotation.user_id
                anno_dict['arg_units'] = annotation.arg_units
                anno_dict['relations'] = annotation.relations
                anno_dict['concepts'] = annotation.concepts
                anno_dict['approved'] = str(annotation.approved)
                anno_dict['notes'] = annotation.notes
                annotations.append(anno_dict)

            jsonResponse.append({
                'file': doc.filename,
                'text': doc.text,
                'url': doc.url,
                'user_annotations': annotations,
                'num_tokens': doc.num_tokens,
                'num_sentences': doc.num_sentences
            })

        dump_filename = "dump_" + time.strftime("%Y-%m-%d_%H:%M:%S") + ".json"
        self.response.headers['Content-Type'] = 'application/json'
        self.response.headers[
            'Content-Disposition'] = "attachment; filename=%s" % dump_filename
        self.response.write(
            json.dumps(jsonResponse,
                       indent=2,
                       sort_keys=False,
                       separators=(',', ':')))
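
For reference, both dump_corpus variants write a single JSON array: an optional corpus-metadata object followed by one object per document. The snippet below (placeholder values, not taken from a real corpus) reproduces that shape with the same json.dumps arguments; note that separators=(',', ':') drops the space after each colon while indent=2 still breaks lines.

import json

sample = [
    {"corpus_metadata": "true", "segmenter": "example-segmenter",
     "preprocessing_date": "2016-01-01"},
    {"file": "doc_001.json", "text": "...", "url": "http://example.com/doc_001",
     "user_annotations": [], "num_tokens": 0, "num_sentences": 0},
]
print(json.dumps(sample, indent=2, sort_keys=False, separators=(',', ':')))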
Example 5
    def get(self):

        user_id = access_control.get_current_user_id()

        if not self.validate_user():
            return
        elif self.request.path.endswith('dump'):
            self.dump_corpus()
        elif self.request.path.endswith('dropall'):
            self.drop_all_data()
            self.redirect(
                '%s?%s' % (self.base_path(), urllib.urlencode({"message": 'Dropped all data.'})))

        elif self.request.path.endswith('dropanno'):
            self.drop_all_annotations()
            self.redirect('%s?%s' % (
            self.base_path(), urllib.urlencode({"message": 'Dropped all annotations.'})))

        elif self.request.path.endswith('loaddata'):
            response_text = self.load_documents()
            self.redirect(
                '%s?%s' % (self.base_path(), urllib.urlencode({"verbatim_message": response_text})))

        elif self.request.path.endswith('forceupdate'):
            response_text = self.load_documents(force_update=True)
            self.redirect(
                '%s?%s' % (self.base_path(), urllib.urlencode({"verbatim_message": response_text})))
        elif self.request.path.endswith('unapprove'):
            annotator = self.request.get("annotator")
            document = self.request.get("doc")
            self.setApproval(annotator, document, False)
            response_text = "Unapproved: %s:%s" % (annotator, document)
            self.redirect(
                '%s?%s' % (self.base_path(), urllib.urlencode({"message": response_text})))
        elif self.request.path.endswith('/managedata'):

            all_documents = [doc.filename for doc in Document.all()]
            all_documents.sort()
            all_users = access_control.get_all_users()
            all_users.sort()
            status_table = dict()
            for user in all_users:
                status_table[user] = dict()
                for doc in all_documents:
                    anno = DocumentAnnotation.all().filter("user_id =", user).filter("document =",
                                                                                     doc).get()
                    if not anno:
                        status_table[user][doc] = UNPROCESSED
                    elif not anno.approved:
                        status_table[user][doc] = IN_PROGRESS
                    else:
                        status_table[user][doc] = COMPLETE

            documents_per_line = 44
            num_docs = len(all_documents)
            num_lines = (num_docs + documents_per_line - 1) / documents_per_line
            partitioned_docs = []
            for i in range(0, num_lines):
                partitioned_docs.append(all_documents[i * documents_per_line:min(num_docs, (
                i + 1) * documents_per_line)])

            message = self.request.get('message', "")
            verbatim_message = self.request.get('verbatim_message', "")

            metadata = CorpusMetadata.all().get()
            segmenter = "unknown"
            preprocessing_date = "unknown"
            if metadata:
                segmenter = metadata.segmenter
                preprocessing_date = metadata.preprocessing_date

            template_values = {'user': user_id,
                               'logout_url': users.create_logout_url('/argunit/'),
                               'all_views': access_control.get_view_ids(user_id),
                               'current_view': access_control.MANAGE_DATA_VIEW_ID,
                               'num_documents': len(all_documents),
                               'segmenter': segmenter,
                               'preprocessing_date': preprocessing_date,
                               'all_documents': all_documents,
                               'docs_per_line': documents_per_line,
                               'partitioned_docs': partitioned_docs,
                               'all_users': all_users,
                               'status_table': status_table,
                               'message': message,
                               'verbatim_message': verbatim_message}
            template = JINJA_ENVIRONMENT.get_template('managedata.html')
            self.response.write(template.render(template_values))
        else:
            self.redirect('/argunit/managedata')
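
Every redirect above builds its target the same way: the handler's base path plus a urlencoded query string. A small illustration (Python 2 urllib, matching the code above; the base path value is a placeholder for whatever self.base_path() returns):

import urllib

base_path = '/argunit/managedata'  # placeholder for self.base_path()
print('%s?%s' % (base_path, urllib.urlencode({"message": 'Dropped all data.'})))
# prints: /argunit/managedata?message=Dropped+all+data.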
Example 6
    def drop_all_data(self):
        db.delete(DocumentAnnotation.all().fetch(10000))
        db.delete(ArgumentationUnit.all().fetch(10000))
        db.delete(Document.all().fetch(10000))
        db.delete(UserData.all().fetch(10000))
        db.delete(CorpusMetadata.all().fetch(10000))
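
drop_all_data relies on Query.fetch(10000), which returns at most 10,000 entities per kind, so a single call can leave entities behind on a larger datastore. A hedged sketch of a loop that keeps deleting keys until a kind is empty, using the same google.appengine.ext.db API (the helper name is made up here):

from google.appengine.ext import db


def delete_all_of_kind(model_class, batch_size=500):
    # Fetch and delete keys in batches until nothing is left,
    # instead of relying on a single capped fetch(10000).
    while True:
        keys = model_class.all(keys_only=True).fetch(batch_size)
        if not keys:
            break
        db.delete(keys)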
Example 7
    def load_documents(self, force_update=False):
        response_text = ""

        data_folder = 'data/'
        metadata_filename = data_folder + 'metadata.properties'

        if os.path.exists(metadata_filename):
            db.delete(CorpusMetadata.all())
            metadata_map = {}
            metadata_file = open(metadata_filename, 'r')
            for line in metadata_file.readlines():
                if not line.startswith("#"):
                    parts = line.split("=", 1)
                    if len(parts) == 2:
                        metadata_map[parts[0]] = parts[1].strip()
            metadata = extract_metadata(metadata_map)
            metadata.put()

        files = glob.glob(data_folder + '*.json')
        doc_ids = []
        new_documents = 0
        skipped_documents = 0
        updated_documents = 0
        dropped_documents = 0

        if force_update:
            response_text += "Update forced!\n"

        for f in sorted(files):
            basename = os.path.basename(f)
            jdata = json.load(open(f, 'r'))

            documents_with_same_url = Document.all().filter("url =", jdata['url'])
            is_document_in_datastore = 0 != documents_with_same_url.count()

            jdata['file'] = basename
            doc_ids.append(basename)

            if is_document_in_datastore:

                existing_doc = documents_with_same_url.get()

                if force_update:
                    initialize_document(existing_doc, jdata)
                    existing_doc.put()

                    response_text += 'UPDATED: ' + str(basename) + " " + str(jdata['url'])
                    updated_documents += 1

                else:

                    response_text += 'SKIPPED: ' + str(basename) + " " + str(jdata['url'])
                    skipped_documents += 1
            else:
                doc = Document()
                initialize_document(doc, jdata)

                doc.put()

                response_text += '    NEW: ' + str(basename) + " " + str(jdata['url'])
                new_documents += 1

            response_text += '\n'

        response_text += "----\n"

        if force_update:
            for document in Document.all():
                if document.filename not in doc_ids:
                    dropped_documents += 1
                    db.delete(document)
                    response_text += "DROPPED: " + document.filename + "\n"

        response_text += "=" * 100 + "\n"
        response_text += "Summary:\n"
        response_text += "\tNew:    " + str(new_documents) + "\n"
        response_text += "\tUpdated:" + str(updated_documents) + "\n"
        response_text += "\tSkipped:" + str(skipped_documents) + "\n"
        response_text += "\tDropped:" + str(dropped_documents) + "\n"

        return response_text
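
The metadata block in load_documents expects a Java-style properties file: lines starting with '#' are comments, and only the first '=' separates key from value. The self-contained snippet below replays that parsing logic on placeholder content:

sample = """# corpus preprocessing info (placeholder values)
segmenter=example-segmenter v1.0
preprocessing_date=2016-01-01
"""

metadata_map = {}
for line in sample.splitlines():
    if not line.startswith("#"):
        parts = line.split("=", 1)
        if len(parts) == 2:
            metadata_map[parts[0]] = parts[1].strip()

print(sorted(metadata_map.items()))
# [('preprocessing_date', '2016-01-01'), ('segmenter', 'example-segmenter v1.0')]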
Example 8
    def get(self):

        user_id = access_control.get_current_user_id()

        if not self.validate_user():
            return
        elif self.request.path.endswith('dump'):
            self.dump_corpus()
        elif self.request.path.endswith('dropall'):
            self.drop_all_data()
            self.redirect('%s?%s' %
                          (self.base_path(),
                           urllib.urlencode({"message": 'Dropped all data.'})))

        elif self.request.path.endswith('dropanno'):
            self.drop_all_annotations()
            self.redirect(
                '%s?%s' %
                (self.base_path(),
                 urllib.urlencode({"message": 'Dropped all annotations.'})))

        elif self.request.path.endswith('loaddata'):
            response_text = self.load_documents()
            self.redirect(
                '%s?%s' %
                (self.base_path(),
                 urllib.urlencode({"verbatim_message": response_text})))

        elif self.request.path.endswith('forceupdate'):
            response_text = self.load_documents(force_update=True)
            self.redirect(
                '%s?%s' %
                (self.base_path(),
                 urllib.urlencode({"verbatim_message": response_text})))
        elif self.request.path.endswith('unapprove'):
            annotator = self.request.get("annotator")
            document = self.request.get("doc")
            self.setApproval(annotator, document, False)
            response_text = "Unapproved: %s:%s" % (annotator, document)
            self.redirect('%s?%s' %
                          (self.base_path(),
                           urllib.urlencode({"message": response_text})))
        elif self.request.path.endswith('/managedata'):

            all_documents = [doc.filename for doc in Document.all()]
            all_documents.sort()
            all_users = access_control.get_all_users()
            all_users.sort()
            status_table = dict()
            for user in all_users:
                status_table[user] = dict()
                for doc in all_documents:
                    anno = DocumentAnnotation.all().filter(
                        "user_id =", user).filter("document =", doc).get()
                    if not anno:
                        status_table[user][doc] = UNPROCESSED
                    elif not anno.approved:
                        status_table[user][doc] = IN_PROGRESS
                    else:
                        status_table[user][doc] = COMPLETE

            documents_per_line = 44
            num_docs = len(all_documents)
            num_lines = (num_docs + documents_per_line -
                         1) / documents_per_line
            partitioned_docs = []
            for i in range(0, num_lines):
                partitioned_docs.append(all_documents[
                    i * documents_per_line:min(num_docs, (i + 1) *
                                               documents_per_line)])

            message = self.request.get('message', "")
            verbatim_message = self.request.get('verbatim_message', "")

            metadata = CorpusMetadata.all().get()
            segmenter = "unknown"
            preprocessing_date = "unknown"
            if metadata:
                segmenter = metadata.segmenter
                preprocessing_date = metadata.preprocessing_date

            template_values = {
                'user': user_id,
                'logout_url': users.create_logout_url('/argunit/'),
                'all_views': access_control.get_view_ids(user_id),
                'current_view': access_control.MANAGE_DATA_VIEW_ID,
                'num_documents': len(all_documents),
                'segmenter': segmenter,
                'preprocessing_date': preprocessing_date,
                'all_documents': all_documents,
                'docs_per_line': documents_per_line,
                'partitioned_docs': partitioned_docs,
                'all_users': all_users,
                'status_table': status_table,
                'message': message,
                'verbatim_message': verbatim_message
            }
            template = JINJA_ENVIRONMENT.get_template('managedata.html')
            self.response.write(template.render(template_values))
        else:
            self.redirect('/argunit/managedata')
Example 9
    def drop_all_data(self):
        db.delete(DocumentAnnotation.all().fetch(10000))
        db.delete(ArgumentationUnit.all().fetch(10000))
        db.delete(Document.all().fetch(10000))
        db.delete(UserData.all().fetch(10000))
        db.delete(CorpusMetadata.all().fetch(10000))
Example 10
    def load_documents(self, force_update=False):
        response_text = ""

        data_folder = 'data/'
        metadata_filename = data_folder + 'metadata.properties'

        if os.path.exists(metadata_filename):
            db.delete(CorpusMetadata.all())
            metadata_map = {}
            metadata_file = open(metadata_filename, 'r')
            for line in metadata_file.readlines():
                if not line.startswith("#"):
                    parts = line.split("=", 1)
                    if len(parts) == 2:
                        metadata_map[parts[0]] = parts[1].strip()
            metadata = extract_metadata(metadata_map)
            metadata.put()

        files = glob.glob(data_folder + '*.json')
        doc_ids = []
        new_documents = 0
        skipped_documents = 0
        updated_documents = 0
        dropped_documents = 0

        if force_update:
            response_text += "Update forced!\n"

        for f in sorted(files):
            basename = os.path.basename(f)
            jdata = json.load(open(f, 'r'))

            documents_with_same_url = Document.all().filter(
                "url =", jdata['url'])
            is_document_in_datastore = 0 != documents_with_same_url.count()

            jdata['file'] = basename
            doc_ids.append(basename)

            if is_document_in_datastore:

                existing_doc = documents_with_same_url.get()

                if force_update:
                    initialize_document(existing_doc, jdata)
                    existing_doc.put()

                    response_text += 'UPDATED: ' + str(basename) + " " + str(
                        jdata['url'])
                    updated_documents += 1

                else:

                    response_text += 'SKIPPED: ' + str(basename) + " " + str(
                        jdata['url'])
                    skipped_documents += 1
            else:
                doc = Document()
                initialize_document(doc, jdata)

                doc.put()

                response_text += '    NEW: ' + str(basename) + " " + str(
                    jdata['url'])
                new_documents += 1

            response_text += '\n'

        response_text += "----\n"

        if force_update:
            for document in Document.all():
                if document.filename not in doc_ids:
                    dropped_documents += 1
                    db.delete(document)
                    response_text += "DROPPED: " + document.filename + "\n"

        response_text += "=" * 100 + "\n"
        response_text += "Summary:\n"
        response_text += "\tNew:    " + str(new_documents) + "\n"
        response_text += "\tUpdated:" + str(updated_documents) + "\n"
        response_text += "\tSkipped:" + str(skipped_documents) + "\n"
        response_text += "\tDropped:" + str(dropped_documents) + "\n"

        return response_text