Example #1
0
    def test_custom_headers(self):
        ''' verifies that custom headers are preserved and no headers added '''
        # document 123 carries two custom dc: headers
        doc_with_headers = {
            'id': 123,
            'body': 'Hier wird die Zensur zuschlagen. Der zweite Satz ist aber okay.',
            'title': 'Testdokument :)',
            'format': 'text/html',
            'header': {'dc:author': 'Ana', 'dc:source': 'http://test.org'},
        }
        # document 124 is submitted with an empty header dictionary
        doc_without_headers = {
            'id': 124,
            'body': 'Das zweite Dokument.',
            'title': 'Zweites Testdokument :)',
            'format': 'text/html',
            'header': {},
        }
        # complete opening tag expected for document 124
        expected_page_tag = '<wl:page xmlns:wl="http://www.weblyzard.com/wl/2013#" xmlns:dc="http://purl.org/dc/elements/1.1/" wl:id="124" dc:format="text/html" xml:lang="de" wl:nilsimsa="8030473ac029f400680409349e47100e00a29585c04a25ec808342b4c0a1aec8">'

        client = Jeremia()
        first, second = client.submit_documents(
            (doc_with_headers, doc_without_headers))
        # make sure `first` refers to document 123: swap when the result
        # list starts with document 124
        if first['content_id'] == '124':
            first, second = second, first

        xml_with_headers = first['xml_content']
        assert 'dc:source="http://test.org"' in xml_with_headers
        assert 'dc:author="Ana"' in xml_with_headers

        assert expected_page_tag in second['xml_content']
    def test_missing_space_tokenattribute(self):
        ''' checks the token count of the first sentence of each test text '''
        def text_as_doc(text):
            # wrap the text into a single-element document list
            return [{
                'id': 'alpha',
                'body': text,
                'title': '',
                'format': 'text/html',
                'header': {},
            }]

        client = Jeremia()

        # text -> number of tokens expected in its first sentence
        expected_token_counts = {
            'Min. 25 000 Kč - řidiči nákladních automobilů, tahačů a… Úřad práce Písek http://t.co/QowX6PQjrR':
            17,
            'Retos de la #RSE (II): 1. Más autocrítica en las memorias de sostenibilidad':
            17,
        }

        for text, expected_count in expected_token_counts.items():
            result = client.submit_documents(documents=text_as_doc(text))
            first_result = list(result)[0]
            first_sentence = XMLContent(first_result['xml_content']).sentences[0]
            assert len(list(first_sentence.tokens)) == expected_count
Example #3
0
    def test_docs_serialization_format(self):
        ''' compares the service output with the stored reference JSON files '''
        import json
        from eWRT.util.module_path import get_resource

        DOCS = [{'id': 7,
                 'body': 'Ehre sei Gott.',
                 'title': '',
                 'format': 'text/html',
                 'header': {'test': 'testvalue'}},
                {'id': 8,
                 'body': '',
                 'title': 'Guten Tag!',
                 'format': 'text/html',
                 'header': {}}]

        def load_reference(relative_path):
            # the context manager closes the file handle right after parsing
            with open(get_resource(__file__, relative_path)) as reference_file:
                return json.load(reference_file)

        def canonical(document):
            # dicts cannot be ordered with `<` on Python 3, so both lists are
            # sorted by a key-sorted JSON dump of each element instead
            return json.dumps(document, sort_keys=True)

        REFERENCE_MULTI = load_reference(
            'data/jeremia_reference_output_documents.json')
        REFERENCE_SINGLE = load_reference(
            'data/jeremia_reference_output_single_document.json')

        # document list: the order of the returned documents is irrelevant,
        # hence both sides are brought into the same canonical order
        j = Jeremia()
        result = sorted(j.submit_documents(DOCS), key=canonical)
        assert sorted(REFERENCE_MULTI, key=canonical) == result

        # single document
        result = j.submit_document(DOCS[0])
        assert REFERENCE_SINGLE == result
    def test_custom_headers(self):
        ''' verifies that custom headers are preserved and no headers added '''
        # document 123 carries two custom dc: headers, document 124 none
        header_doc = {
            'id': 123,
            'body': 'Hier wird die Zensur zuschlagen. Der zweite Satz ist aber okay.',
            'title': 'Testdokument :)',
            'format': 'text/html',
            'header': {'dc:author': 'Ana', 'dc:source': 'http://test.org'},
        }
        plain_doc = {
            'id': 124,
            'body': 'Das zweite Dokument.',
            'title': 'Zweites Testdokument :)',
            'format': 'text/html',
            'header': {},
        }
        # opening tag expected for document 124; compared case-insensitively
        expected_page_tag = '<wl:page xmlns:wl="http://www.weblyzard.com/wl/2013#" xmlns:dc="http://purl.org/dc/elements/1.1/" wl:id="124" dc:format="text/html" xml:lang="de" wl:nilsimsa="8030473ac029f400680409349e47100e00a29585c04a25ec808342b4c0a1aec8">'.lower()

        service = Jeremia()
        first, second = service.submit_documents((header_doc, plain_doc))
        # `first` must be document 123: swap when document 124 came first
        if first['content_id'] == '124':
            first, second = second, first

        header_xml = first['xml_content']
        assert 'dc:source="http://test.org"' in header_xml
        assert 'dc:author="Ana"' in header_xml

        assert expected_page_tag in second['xml_content'].lower()
def get_weblyzard_xml_documents(corpus_documents):
    '''
    Submits the corpus documents to Jeremia and wraps the `xml_content`
    of every returned document in an XMLContent object.

    Returns a dictionary that maps each object's content_id to the object.
    '''
    service = Jeremia()
    parsed_documents = []
    for result in service.submit_documents(corpus_documents):
        parsed_documents.append(XMLContent(result['xml_content']))
    return dict((parsed.content_id, parsed) for parsed in parsed_documents)
Example #6
0
def get_weblyzard_xml_documents(corpus_documents):
    '''
    Pre-processes the corpus documents: they are submitted to Jeremia and
    the `xml_content` of each result is turned into an XMLContent object.

    Returns a dictionary keyed by the content_id of these objects.
    '''
    submitted = Jeremia().submit_documents(corpus_documents)
    xml_documents = [XMLContent(item['xml_content']) for item in submitted]

    documents_by_id = {}
    for xml_document in xml_documents:
        documents_by_id[xml_document.content_id] = xml_document
    return documents_by_id
    def test_sentence_splitting(self):
        ''' checks that the XML of the first document contains wl:is_title '''
        client = Jeremia()
        first_document_only = self.DOCS[:1]

        for result in client.submit_documents(first_document_only):
            print(result)
            xml_text = result['xml_content']
            # extract the sentences; they are only printed, not asserted on
            parsed = XMLContent(xml_text)
            sentence_texts = []
            for sentence in parsed.sentences:
                sentence_texts.append(sentence.sentence)
            print(xml_text)
            assert 'wl:is_title' in xml_text
            print(sentence_texts)
Example #8
0
    def test_sentence_splitting(self):
        ''' submits the first document and expects wl:is_title in its XML '''
        service = Jeremia()

        for response in service.submit_documents(self.DOCS[:1]):
            print(response)
            # the parsed sentences are collected for the debug output below
            document = XMLContent(response['xml_content'])
            extracted = list(map(lambda item: item.sentence,
                                 document.sentences))
            print(response['xml_content'])
            assert 'wl:is_title' in response['xml_content']
            print(extracted)
    def test_blacklist(self):
        ''' tests the blacklist-based sentence filtering '''
        # the two sentences of the test document's body
        censored = 'Hier wird die Zensur zuschlagen.'
        untouched = 'Der zweite Satz ist aber okay.'

        source_id = 1
        blacklist = ['6e44889df94d6408bbeeab8837bfbe01',
                     '422d7f2000393b8c50a37f9d363ad511']
        docs = [{'id': 123,
                 'body': 'Hier wird die Zensur zuschlagen. Der zweite Satz ist aber okay.',
                 'title': 'Testdokument :)',
                 'format': 'text/html',
                 'header': {}}]

        client = Jeremia()

        # with the blacklist installed, the first sentence must be gone
        client.update_blacklist(source_id=source_id, blacklist=blacklist)
        filtered = client.submit_documents(docs, source_id=source_id)
        sentences = self._get_sentences(filtered).pop()
        assert censored not in sentences
        assert untouched in sentences

        # the stored blacklist equals the one that has been uploaded
        assert blacklist == client.get_blacklist(source_id)

        # submitting without a source_id: both sentences are kept
        unfiltered = client.submit_documents(docs)
        sentences = self._get_sentences(unfiltered).pop()
        assert censored in sentences
        assert untouched in sentences

        # after clearing the blacklist both sentences are kept as well
        client.clear_blacklist(source_id)
        after_clear = client.submit_documents(docs, source_id=source_id)
        sentences = self._get_sentences(after_clear).pop()
        assert censored in sentences
        assert untouched in sentences

        # the cleared blacklist is reported as an empty list
        assert [] == client.get_blacklist(source_id)
Example #10
0
    def test_illegal_xml_format_filtering(self):
        ''' submits a body containing the character \\x1a and expects that
            the first sentence of the returned XML is still available '''
        DOCS = [{'id': 'alpha',
                 'body': 'This is an illegal XML Sequence: J\x1amica',
                 'title': 'Hello "world" more ',
                 'format': 'text/html',
                 'header': {}}]

        j = Jeremia()
        for doc in j.submit_documents(DOCS):
            xml = XMLContent(doc['xml_content'])
            print(doc['xml_content'])
            # None is a singleton: compare by identity rather than with `!=`,
            # which could be overridden by the compared object
            assert xml.sentences[0].sentence is not None
    def test_illegal_xml_format_filtering(self):
        ''' submits a body containing the character \\x1a and expects that
            the first sentence of the returned XML is still available '''
        DOCS = [{
            'id': 'alpha',
            'body': 'This is an illegal XML Sequence: J\x1amica',
            'title': 'Hello "world" more ',
            'format': 'text/html',
            'header': {}
        }]

        j = Jeremia()
        for doc in j.submit_documents(DOCS):
            xml = XMLContent(doc['xml_content'])
            print(doc['xml_content'])
            # None is a singleton: compare by identity rather than with `!=`,
            # which could be overridden by the compared object
            assert xml.sentences[0].sentence is not None
Example #12
0
    def test_blacklist(self):
        ''' tests the blacklist-based sentence filtering '''
        source_id = 1
        blacklist = [
            '6e44889df94d6408bbeeab8837bfbe01',
            '422d7f2000393b8c50a37f9d363ad511',
        ]
        document = {
            'id': 123,
            'body': 'Hier wird die Zensur zuschlagen. Der zweite Satz ist aber okay.',
            'title': 'Testdokument :)',
            'format': 'text/html',
            'header': {},
        }
        docs = [document]
        # the two sentences of the body above
        first_sentence = 'Hier wird die Zensur zuschlagen.'
        second_sentence = 'Der zweite Satz ist aber okay.'

        service = Jeremia()

        # blacklist in use: only the second sentence remains
        service.update_blacklist(source_id=source_id, blacklist=blacklist)
        response = service.submit_documents(docs, source_id=source_id)
        sentences = self._get_sentences(response).pop()
        assert first_sentence not in sentences
        assert second_sentence in sentences

        # the service returns exactly the uploaded blacklist items
        assert blacklist == service.get_blacklist(source_id)

        # no source_id, hence no blacklist: both sentences remain
        response = service.submit_documents(docs)
        sentences = self._get_sentences(response).pop()
        assert first_sentence in sentences
        assert second_sentence in sentences

        # cleared blacklist: both sentences remain for this source_id, too
        service.clear_blacklist(source_id)
        response = service.submit_documents(docs, source_id=source_id)
        sentences = self._get_sentences(response).pop()
        assert first_sentence in sentences
        assert second_sentence in sentences

        # the blacklist of this source_id is empty now
        assert [] == service.get_blacklist(source_id)
Example #13
0
    def test_missing_space_tokenattribute(self):
        ''' checks the token count of the first sentence of each test text '''
        def text_as_doc(text):
            # wrap the text into a single-element document list
            docs = [{'id': 'alpha',
                     'body': text,
                     'title': '',
                     'format': 'text/html',
                     'header': {}}]
            return docs

        j = Jeremia()

        # text -> number of tokens expected in its first sentence
        test_texts = {
            'Min. 25 000 Kč - řidiči nákladních automobilů, tahačů a… Úřad práce Písek http://t.co/QowX6PQjrR': 17,
            'Retos de la #RSE (II): 1. Más autocrítica en las memorias de sostenibilidad': 17,
        }

        # items() exists on Python 2 and 3; iteritems() is Python 2 only
        for text, token_number in test_texts.items():
            result = j.submit_documents(documents=text_as_doc(text))
            res_xml = list(result)[0]['xml_content']
            assert len(
                list(XMLContent(res_xml).sentences[0].tokens)) == token_number
    def test_docs_serialization_format(self):
        ''' compares the service output with the stored reference JSON files '''
        import json
        from eWRT.util.module_path import get_resource

        DOCS = [{
            'id': 7,
            'body': 'Ehre sei Gott.',
            'title': '',
            'format': 'text/html',
            'header': {
                'test': 'testvalue'
            }
        }, {
            'id': 8,
            'body': '',
            'title': 'Guten Tag!',
            'format': 'text/html',
            'header': {}
        }]

        def load_reference(relative_path):
            # the context manager closes the file handle right after parsing
            with open(get_resource(__file__, relative_path)) as reference_file:
                return json.load(reference_file)

        def canonical(document):
            # dicts cannot be ordered with `<` on Python 3, so both lists are
            # sorted by a key-sorted JSON dump of each element instead
            return json.dumps(document, sort_keys=True)

        REFERENCE_MULTI = load_reference(
            'data/jeremia_reference_output_documents.json')
        REFERENCE_SINGLE = load_reference(
            'data/jeremia_reference_output_single_document.json')

        # document list: the order of the returned documents is irrelevant,
        # hence both sides are brought into the same canonical order
        j = Jeremia()
        result = sorted(j.submit_documents(DOCS), key=canonical)
        assert sorted(REFERENCE_MULTI, key=canonical) == result

        # single document
        result = j.submit_document(DOCS[0])
        assert REFERENCE_SINGLE == result
 def test_batch_processing(self):
     ''' submits self.DOCS in one call and expects 20 returned documents '''
     client = Jeremia()
     processed = client.submit_documents(self.DOCS)
     returned_count = len(processed)
     self.assertEqual(returned_count, 20)
    def test_illegal_input_args(self):
        ''' an empty document list must be rejected with a ValueError '''
        client = Jeremia()

        # callable form of assertRaises: invokes submit_documents([])
        self.assertRaises(ValueError, client.submit_documents, [])
Example #17
0
    def test_illegal_input_args(self):
        ''' submitting an empty list is expected to raise a ValueError '''
        service = Jeremia()
        no_documents = []

        # assertRaises calls submit_documents with the empty list itself
        self.assertRaises(ValueError, service.submit_documents, no_documents)
Example #18
0
 def test_batch_processing(self):
     ''' expects 20 results when all of self.DOCS are submitted at once '''
     service = Jeremia()
     results = service.submit_documents(self.DOCS)
     self.assertEqual(len(results), 20)