    def test_sparv_corpus_stats(self):
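        """Generate PoS frequency statistics for the zipped Sparv XML corpus and
        write them to Excel: first over all tokens, then over lemmatized NN/PM
        tokens with stopwords and short tokens removed. Requires the Daedalus
        archive under the test data directory."""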

        from corpora.corpus_statistics import CorpusPoSStatistics, SUC_POS_TAGS
        import re

        filepath = join_test_data_path('temp/daedalus_articles_pos_xml_1931-2017.zip')

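        # Each filename encodes volume year, article number and segment number;
        # extract them into separate columns of the statistics frame.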
        column_functions = [
            ('year', lambda df: df.filename.apply(lambda x: int(re.search(r'daedalus_volume_(\d{4})', x).group(1)))),
            ('article', lambda df: df.filename.apply(lambda x: int(re.search(r'article_(\d{2})', x).group(1)))),
            ('segment', lambda df: df.filename.apply(lambda x: int(re.search(r'(\d{2})\.txt', x).group(1))))
        ]

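        # First pass: all tokens, surface forms only, with the PoS tag appended via '_'.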
        transforms = [ csr.lower_case_transform(), csr.remove_empty_filter() ]
        opts = dict(transforms=transforms, postags='', lemmatize=False, append_pos="_", ignores="")
        stream = SparvCorpusSourceReader(source=ZipReader(filepath, '*.xml'), **opts)
        df = CorpusPoSStatistics(tag_set=SUC_POS_TAGS).generate(stream, column_functions=column_functions)
        df.to_excel('daedalus_full_pos_stats.xlsx')

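        # Second pass: lemmatized nouns and proper nouns only, with Swedish
        # stopwords and tokens shorter than three characters removed.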
        transforms = [ csr.lower_case_transform(), csr.min_token_size_filter(3), csr.remove_empty_filter(), csr.stopwords_filter('swedish') ]
        opts = dict(transforms=transforms, postags='|NN|PM|', lemmatize=True, append_pos="_")
        stream = SparvCorpusSourceReader(source=ZipReader(filepath, '*.xml'), **opts)
        df = CorpusPoSStatistics(tag_set=SUC_POS_TAGS).generate(stream, column_functions=column_functions)
        df.to_excel('daedalus_nn_pm_pos_lemma_nostop_min3_stats.xlsx')

        print(df.columns)

    def test_extract_specific_pos(self):
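        """Extract only tokens whose PoS tag matches the filter: first nouns,
        then verbs, then both, all lemmatized and lower-cased."""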

        source = [('test.xml', self.xml_data)]
        expected_nouns = ['föremål', 'vertikalfräsmaskinen', 'styrskåp', 'omformare', 'steg', 'utveckling', 'verkstadsindustri']
        expected_verbs = ['bestå', 'utgöra' ]

        opts = dict(source=source, chunk_size=None, transforms=[ to_lower ], lemmatize=True)
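        # Nouns only.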
        stream = SparvCorpusSourceReader(postags="'|NN|'", **opts)
        document, tokens = next(iter(stream))
        print(tokens)
        self.assertEqual('test_01.txt', document)
        self.assertIsNotNone(tokens)
        self.assertSetEqual(set(expected_nouns), set(tokens))

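        # Verbs only.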
        stream = SparvCorpusSourceReader(postags="'|VB|'", **opts)
        document, tokens = next(iter(stream))

        self.assertEqual('test_01.txt', document)
        self.assertIsNotNone(tokens)
        self.assertSetEqual(set(expected_verbs), set(tokens))

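        # Nouns and verbs combined.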
        stream = SparvCorpusSourceReader(postags="'|NN|VB|'", **opts)
        document, tokens = next(iter(stream))

        self.assertEqual('test_01.txt', document)
        self.assertIsNotNone(tokens)
        self.assertSetEqual(set(expected_nouns + expected_verbs), set(tokens))

    def test_lowercase_options(self):
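        """Verify that tokens keep their original casing by default and are
        lower-cased when a lower-casing transform is supplied."""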
        transforms = [ (lambda tokens: [ x.lower() for x in tokens ]) ]
        xml_data = '''
        <corpus><paragraph>
        <sentence id="ecacfd-e8c6ce">
        <w pos="NN" lemma="|">Mårtensons</w>
        <w pos="NN" lemma="|skapelse|">skapelse</w>
        <w pos="MID" lemma="|">-</w>
        <w pos="JJ" lemma="|svensk|">Svensk</w>
        <w pos="NN" lemma="|">Celluloidindustri</w>
        <w pos="PM" lemma="|AB|">AB</w>
        <w pos="MID" lemma="|">-</w>
        <w pos="VB" lemma="|vara|">är</w>
        <w pos="DT" lemma="|en|">ett</w>
        <w pos="NN" lemma="|exempel|">exempel</w>
        <w pos="PP" lemma="|på|">på</w>
        <w pos="JJ" lemma="|småländsk|">småländsk</w>
        <w pos="NN" lemma="|företagaranda|">företagaranda</w>
        <w pos="MAD" lemma="|" >.</w>
        </sentence>
        </paragraph></corpus>
        '''
        expected_tokens = ['Mårtensons', 'skapelse', 'Celluloidindustri', 'exempel', 'företagaranda']
        source = [('test.xml', xml_data)]
        stream = SparvCorpusSourceReader(source=source, postags="'|NN|'", chunk_size=None)
        _, tokens = next(iter(stream))
        self.assertSetEqual(set(expected_tokens), set(tokens))

        stream = SparvCorpusSourceReader(source=source, postags="'|NN|'", chunk_size=None, transforms=transforms)
        _, tokens = next(iter(stream))
        self.assertSetEqual(set(map(lambda x: x.lower(), expected_tokens)), set(tokens))

    def test_can_extract_original_text(self):
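        """With no PoS filtering and lemmatize=False, the reader should yield
        the original surface forms of all words."""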
        source = [('test.xml', self.xml_data)]
        stream = SparvCorpusSourceReader(source=source, postags="''", chunk_size=None, lemmatize=False)
        _, tokens = next(iter(stream))
        expected_tokens = ['Föremålet', 'som', 'förutom', 'själva', 'vertikalfräsmaskinen', 'består', 'av', 'ett', 'styrskåp',
            'och', 'en', 'omformare', 'utgör', 'ett', 'viktigt', 'steg', 'i', 'utvecklingen', 'av', 'den', 'moderna', 'verkstadsindustrin']
        self.assertSetEqual(set(expected_tokens), set(tokens))

    def test_extract_all_lemma(self):
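        """With no PoS filtering and lemmatize=True, all words of at least
        three characters should come back as lower-cased lemmas."""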
        source = [('test.xml', self.xml_data)]
        opts = dict(source=source, chunk_size=None, transforms=[ csr.lower_case_transform(), csr.min_token_size_filter(3) ], lemmatize=True)
        expected_tokens = ['föremål', 'som', 'förutom', 'själv', 'vertikalfräsmaskinen', 'bestå', 'styrskåp', 'och',
            'omformare', 'utgöra', 'viktig', 'steg', 'utveckling', 'den', 'modern', 'verkstadsindustri']
        stream = SparvCorpusSourceReader(postags="''", **opts)
        _, tokens = next(iter(stream))
        self.assertSetEqual(set(expected_tokens), set(tokens))

    def test_pos_extract_of_larger_file(self):
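        """Extract proper nouns from a full-size article and check the count.
        Requires 1987_article_08.xml in the test data directory."""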
        filename = '1987_article_08.xml'
        filepath = join_test_data_path(filename)
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
        source = [(filename, content)]
        pos_tags = '|PM|'
        stream = SparvCorpusSourceReader(source=source, postags="'{}'".format(pos_tags), chunk_size=9999, transforms=[ to_lower ])
        _, tokens = next(iter(stream))
        self.assertEqual(197, len(tokens))

    def test_tagged_token_read(self):
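        """With append_pos set, every token should come back as
        '<lower-cased surface form>_<lower-cased PoS tag>'."""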

        xml_data = '''
        <corpus><paragraph>
        <sentence id="xxx">
        <w pos="NN"  lemma="|">Humlab</w>
        <w pos="VB"  lemma="|vara|">är</w>
        <w pos="DT"  lemma="|en|">ett</w>
        <w pos="NN"  lemma="|exempel|">exempel</w>
        <w pos="PP"  lemma="|på|">på</w>
        <w pos="DT"  lemma="|en|">ett</w>
        <w pos="NN"  lemma="|arbetsenhet|">arbetsenhet</w>
        <w pos="HP"  lemma="|">som</w>
        <w pos="JJ"  lemma="|digital|">digital</w>
        <w pos="VB"  lemma="|utför|">utför</w>
        <w pos="NN"  lemma="|forskning|">forskning</w>
        <w pos="MAD" lemma="|" >.</w>
        </sentence>
        <sentence id="xxx">
        <w pos="NN"  lemma="|">Humlab</w>
        <w pos="VB"  lemma="|vara|">är</w>
        <w pos="DT"  lemma="|en|">en</w>
        <w pos="NN"  lemma="|arbetsenhet|">arbetsenhet</w>
        <w pos="KN"  lemma="|och|">och</w>
        <w pos="AB"  lemma="|inte|">inte</w>
        <w pos="DT"  lemma="|en|">en</w>
        <w pos="NN"  lemma="|centrumbildning|">centrumbildning</w>
        <w pos="MAD" lemma="|" >.</w>
        </sentence>
        </paragraph></corpus>
        '''

        expected_tokens = ['humlab_nn', 'är_vb', 'ett_dt', 'exempel_nn', 'på_pp', 'ett_dt', 'arbetsenhet_nn', 'som_hp', 'digital_jj', 'utför_vb',
            'forskning_nn', 'humlab_nn', 'är_vb', 'en_dt', 'arbetsenhet_nn', 'och_kn', 'inte_ab', 'en_dt', 'centrumbildning_nn']

        source = [ ('test.xml', xml_data)]
        transforms = [ csr.lower_case_transform() ]
        stream = SparvCorpusSourceReader(source=source, postags="", chunk_size=None, transforms=transforms, lemmatize=False, append_pos="_" )

        corpus = SparvTextCorpus(stream=stream)
        self.assertIsNotNone(corpus)

        _, tokens = next(iter(stream))

        self.assertIsNotNone(tokens)
        self.assertSetEqual(set(expected_tokens), set(tokens))

    def test_zip_archive(self):
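        """Stream all documents from the zipped corpus, tally PoS tag frequencies
        per document and write them to Excel. Requires the Daedalus archive under
        the test data directory."""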

        filename = 'temp/daedalus_articles_pos_xml_1931-2017.zip'
        filepath = join_test_data_path(filename)
        source = ZipReader(filepath, '*.xml')
        pos_delimiter = "_"

        import collections
        import pandas as pd
        import re

        stream = SparvCorpusSourceReader(
            source=source,
            transforms=[ lambda tokens: [ x.lower()  for x in tokens] ],
            postags="''",
            chunk_size=None,
            lemmatize=False,
            append_pos=pos_delimiter,
            ignores=""
        )
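        # All SUC PoS tags, initialized to zero counts; '???' collects tokens
        # without an appended tag.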
        pos_tags = {

            'AB': 0,   # Adverb
            'DT': 0,   # Determiner
            'HA': 0,   # WH-adverb
            'HD': 0,   # WH-determiner
            'HP': 0,   # WH-pronoun
            'HS': 0,   # WH-possessive
            'IE': 0,   # Infinitival marker
            'IN': 0,   # Interjection
            'JJ': 0,   # Adjective
            'KN': 0,   # Coordinating conjunction
            'NN': 0,   # Noun
            'PC': 0,   # Participle
            'PL': 0,   # Particle
            'PM': 0,   # Proper noun
            'PN': 0,   # Pronoun
            'PP': 0,   # Preposition
            'PS': 0,   # Possessive pronoun
            'RG': 0,   # Cardinal number
            'RO': 0,   # Ordinal number
            'SN': 0,   # Subordinating conjunction
            'VB': 0,   # Verb
            'UO': 0,   # Foreign word

            'MAD': 0,  # Major delimiter
            'MID': 0,  # Minor delimiter
            'PAD': 0,  # Pairwise delimiter
            '???': 0
        }

        pos_statistics = []
        pos_total_counter = collections.Counter()
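        # For each document, count tokens per PoS tag by splitting off the
        # appended tag suffix; tokens without a tag are counted as '???'.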
        for document, tokens in stream:

            counter = collections.Counter([ x.split(pos_delimiter)[-1].upper() if pos_delimiter in x else '???' for x in tokens ])
            pos_total_counter.update(counter)

            counter_dict = dict(counter)

            pos_counts = extend(pos_tags, { k: v for k, v in counter_dict.items() if k in pos_tags.keys() })
            unknown_tags = [ k for k in counter_dict.keys() if k not in pos_tags.keys() ]

            if len(unknown_tags) > 0:
                logger.warning('Unknown PoS tags in file %s: %s', document, unknown_tags)

            pos_statistics.append(extend(pos_counts, filename=document))

        df = pd.DataFrame(pos_statistics)
        df['year'] = df.filename.apply(lambda x: int(re.search(r'daedalus_volume_(\d{4})', x).group(1)))
        df['article'] = df.filename.apply(lambda x: int(re.search(r'article_(\d{2})', x).group(1)))
        df['segment'] = df.filename.apply(lambda x: int(re.search(r'(\d{2})\.txt', x).group(1)))
        df.to_excel("stats.xlsx")

    def test_sparv_corpus(self):
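        """Build a SparvTextCorpus from lemmatized nouns, then round-trip the
        corpus and its dictionary through disk and verify they load back
        identically."""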

        xml_data = '''
        <corpus><paragraph>
        <sentence id="xxx">
        <w pos="NN"  lemma="|">Humlab</w>
        <w pos="VB"  lemma="|vara|">är</w>
        <w pos="DT"  lemma="|en|">ett</w>
        <w pos="NN"  lemma="|exempel|">exempel</w>
        <w pos="PP"  lemma="|på|">på</w>
        <w pos="DT"  lemma="|en|">ett</w>
        <w pos="NN"  lemma="|arbetsenhet|">arbetsenhet</w>
        <w pos="HP"  lemma="|">som</w>
        <w pos="JJ"  lemma="|digital|">digital</w>
        <w pos="VB"  lemma="|utför|">utför</w>
        <w pos="NN"  lemma="|forskning|">forskning</w>
        <w pos="MAD" lemma="|" >.</w>
        </sentence>
        <sentence id="xxx">
        <w pos="NN"  lemma="|">Humlab</w>
        <w pos="VB"  lemma="|vara|">är</w>
        <w pos="DT"  lemma="|en|">en</w>
        <w pos="NN"  lemma="|arbetsenhet|">arbetsenhet</w>
        <w pos="KN"  lemma="|och|">och</w>
        <w pos="AB"  lemma="|inte|">inte</w>
        <w pos="DT"  lemma="|en|">en</w>
        <w pos="NN"  lemma="|centrumbildning|">centrumbildning</w>
        <w pos="MAD" lemma="|" >.</w>
        </sentence>
        </paragraph></corpus>
        '''

        expected_tokens = ['humlab', 'exempel', 'arbetsenhet', 'forskning', 'centrumbildning']

        source = [ ('test.xml', xml_data)]
        transforms = [ csr.lower_case_transform(), csr.min_token_size_filter(3) ]
        stream = SparvCorpusSourceReader(source=source, postags="'|NN|'", chunk_size=None, transforms=transforms, lemmatize=True)

        corpus = SparvTextCorpus(stream=stream)
        self.assertIsNotNone(corpus)
        document, tokens = next(iter(stream))
        self.assertIsNotNone(tokens)
        self.assertEqual('test_01.txt', document)
        self.assertSetEqual(set(expected_tokens), set(tokens))

        self.assertEqual(len(expected_tokens), len(corpus.dictionary.token2id.keys()))
        self.assertEqual(len(corpus.dictionary.token2id.keys()), len(corpus.dictionary.id2token.keys()))

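        # Round-trip: serialize the corpus and dictionary, reload, and compare.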
        temp_corpus_filename = generate_temp_filename('corpus.mm')
        temp_dictionary_filename = generate_temp_filename('corpus.dict.gz')

        corpora.MmCorpus.serialize(temp_corpus_filename, corpus)
        corpus.dictionary.save(temp_dictionary_filename)

        loaded_dictionary = corpora.Dictionary.load(temp_dictionary_filename)
        loaded_corpus = corpora.MmCorpus(temp_corpus_filename)

        self.assertDictEqual(corpus.dictionary.token2id, loaded_dictionary.token2id)
        self.assertDictEqual(corpus.dictionary.id2token, loaded_dictionary.id2token)

        doc0_expected = set((corpus.dictionary[x], y) for x, y in next(iter(corpus)))
        doc0_stored = set((loaded_dictionary[x], y) for x, y in next(iter(loaded_corpus)))

        self.assertSetEqual(doc0_expected, doc0_stored)
        os.remove(temp_corpus_filename)
        os.remove(temp_dictionary_filename)