Example #1
0
def test_text_tokenized_tagged_both():
    file = new_file(text_type=('tokenized', 'tagged_both'))

    text_flat_tokens = wordless_text.Wordless_Text(main,
                                                   file,
                                                   flat_tokens=True)
    text = wordless_text.Wordless_Text(main, file, flat_tokens=False)

    assert text_flat_tokens.tokens_flat != []
    assert text_flat_tokens.tags_pos != [[]] * len(
        text_flat_tokens.tokens_flat)
    assert text_flat_tokens.tags_non_pos != [[]] * len(
        text_flat_tokens.tokens_flat)
    assert text_flat_tokens.tags_all != [[]] * len(
        text_flat_tokens.tokens_flat)

    assert text_flat_tokens.offsets_paras == [0]
    assert text_flat_tokens.offsets_sentences == [0]
    assert text_flat_tokens.offsets_clauses == [0]

    assert (len(text_flat_tokens.tokens_flat) == len(text_flat_tokens.tags_pos)
            == len(text_flat_tokens.tags_non_pos) == len(
                text_flat_tokens.tags_all))

    assert len(text_flat_tokens.offsets_paras) == len(
        text_flat_tokens.tokens_hierarchical)
    assert len(text_flat_tokens.offsets_sentences) == sum(
        [len(para) for para in text_flat_tokens.tokens_hierarchical])
    assert len(text_flat_tokens.offsets_clauses) == sum([
        len(sentence) for para in text_flat_tokens.tokens_hierarchical
        for sentence in para
    ])

    assert text.tokens_flat != []
    assert text.tags_pos != [[]] * len(text.tokens_flat)
    assert text.tags_non_pos != [[]] * len(text.tokens_flat)
    assert text.tags_all != [[]] * len(text.tokens_flat)

    assert text.offsets_paras != [0]
    assert text.offsets_sentences != [0]
    assert text.offsets_clauses != [0]

    assert (len(text.tokens_flat) == len(text.tags_pos) == len(
        text.tags_non_pos) == len(text.tags_all))

    assert len(text.offsets_paras) == len(text.tokens_hierarchical)
    assert len(text.offsets_sentences) == sum(
        [len(para) for para in text.tokens_hierarchical])
    assert len(text.offsets_clauses) == sum([
        len(sentence) for para in text.tokens_hierarchical for sentence in para
    ])
Example #2
0
def testing_text(title, file, tokens_only=True):
    text = wordless_text.Wordless_Text(main, file, tokens_only=tokens_only)

    if tokens_only:
        print(f'---------- {title} [Tokens Only] ----------')
    else:
        print(f'---------- {title} ----------')

    print(f'Tokens ({len(text.tokens)}):')
    print(f'\t{text.tokens}')

    print(f'POS Tags ({len(text.tags_pos)}):')
    print(f'\t{text.tags_pos}')

    print(f'Non-POS Tags ({len(text.tags_non_pos)}):')
    print(f'\t{text.tags_non_pos}')

    print(f'All Tags ({len(text.tags_all)}):')
    print(f'\t{text.tags_all}')

    print(f'Paragraph Offsets ({len(text.para_offsets)}):')
    print(f'\t{text.para_offsets}')

    print(f'Sentence Offsets ({len(text.sentence_offsets)}):')
    print(f'\t{text.sentence_offsets}')
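A minimal invocation sketch for the debugging helper above; `new_file` and `main` are assumed to come from the surrounding test setup, as in the other test examples here.

file = new_file(text_type=('tokenized', 'tagged_both'))  # hypothetical fixture call

testing_text('Tokenized & Tagged (Both)', file, tokens_only=True)
testing_text('Tokenized & Tagged (Both)', file, tokens_only=False)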
Example #3
0
    def process_data(self):
        texts = []

        settings = self.main.settings_custom['wordlist']
        files = self.main.wordless_files.get_selected_files()

        # Frequency
        for file in files:
            text = wordless_text.Wordless_Text(self.main, file)

            tokens = wordless_token_processing.wordless_process_tokens_wordlist(
                text, token_settings=settings['token_settings'])

            texts.append(text)
            self.tokens_freq_files.append(collections.Counter(tokens))

        # Total
        if len(files) > 1:
            text_total = wordless_text.Wordless_Text_Blank()
            text_total.tokens = [
                token for text in texts for token in text.tokens
            ]

            texts.append(text_total)
            self.tokens_freq_files.append(
                sum(self.tokens_freq_files, collections.Counter()))

        self.progress_updated.emit(self.tr('Processing data ...'))

        # Dispersion & Adjusted Frequency
        text_measure_dispersion = settings['generation_settings'][
            'measure_dispersion']
        text_measure_adjusted_freq = settings['generation_settings'][
            'measure_adjusted_freq']

        measure_dispersion = self.main.settings_global['measures_dispersion'][
            text_measure_dispersion]['func']
        measure_adjusted_freq = self.main.settings_global[
            'measures_adjusted_freq'][text_measure_adjusted_freq]['func']

        tokens_total = self.tokens_freq_files[-1].keys()

        for text in texts:
            tokens_stats_file = {}

            # Dispersion
            number_sections = self.main.settings_custom['measures'][
                'dispersion']['general']['number_sections']

            sections_freq = [
                collections.Counter(section)
                for section in wordless_text_utils.to_sections(
                    text.tokens, number_sections)
            ]

            for token in tokens_total:
                counts = [
                    section_freq[token] for section_freq in sections_freq
                ]

                tokens_stats_file[token] = [measure_dispersion(counts)]

            # Adjusted Frequency
            if not self.main.settings_custom['measures']['adjusted_freq'][
                    'general']['use_same_settings_dispersion']:
                number_sections = self.main.settings_custom['measures'][
                    'adjusted_freq']['general']['number_sections']

                sections_freq = [
                    collections.Counter(section)
                    for section in wordless_text_utils.to_sections(
                        text.tokens, number_sections)
                ]

            for token in tokens_total:
                counts = [
                    section_freq[token] for section_freq in sections_freq
                ]

                tokens_stats_file[token].append(measure_adjusted_freq(counts))

            self.tokens_stats_files.append(tokens_stats_file)

        if len(files) == 1:
            self.tokens_freq_files *= 2
            self.tokens_stats_files *= 2
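The per-section counts fed to the dispersion and adjusted frequency measures above can be sketched in isolation; the chunking helper below is only a stand-in for `wordless_text_utils.to_sections`, whose exact splitting behaviour is assumed.

import collections


def to_sections(tokens, number_sections):
    # Simplified stand-in for wordless_text_utils.to_sections: split the token
    # list into `number_sections` roughly equal, contiguous sections.
    size, remainder = divmod(len(tokens), number_sections)
    sections = []
    start = 0

    for i in range(number_sections):
        end = start + size + (1 if i < remainder else 0)
        sections.append(tokens[start:end])
        start = end

    return sections


tokens = ['the', 'cat', 'sat', 'on', 'the', 'mat', 'and', 'the', 'dog', 'slept']
sections_freq = [
    collections.Counter(section) for section in to_sections(tokens, 5)
]

# Per-section counts for one token, in the form the measures above receive
counts = [section_freq['the'] for section_freq in sections_freq]
print(counts)  # [1, 0, 1, 1, 0]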
Example #4
0
def test_text_tokenized_tagged_both():
    file = new_file(file_name='tokenized_tagged_both',
                    text_type=('tokenized', 'tagged_both'))

    text_tokens_only = wordless_text.Wordless_Text(main,
                                                   file,
                                                   tokens_only=True)
    text = wordless_text.Wordless_Text(main, file, tokens_only=False)

    assert text_tokens_only.tokens == [
        '', 'English', 'is', 'a', 'West', 'Germanic', 'language', 'that',
        'was', 'first', 'spoken', 'in', 'early', 'medieval', 'England', 'and',
        'eventually', 'became', 'a', 'global', 'lingua', 'franca', '.',
        'Named', 'after', 'the', 'Angles', ',', 'one', 'of', 'the', 'Germanic',
        'tribes', 'that', 'migrated', 'to', 'the', 'area', 'of', 'Great',
        'Britain', 'that', 'would', 'later', 'take', 'their', 'name', ',',
        'England', ',', 'both', 'names', 'ultimately', 'deriving', 'from',
        'the', 'Anglia', 'peninsula', 'in', 'the', 'Baltic', 'Sea', '.', 'It',
        'is', 'closely', 'related', 'to', 'the', 'Frisian', 'languages', ',',
        'but', 'its', 'vocabulary', 'has', 'been', 'significantly',
        'influenced', 'by', 'other', 'Germanic', 'languages', ',',
        'particularly', 'Norse', '(', 'a', 'North', 'Germanic', 'language',
        ')', ',', 'and', 'to', 'a', 'greater', 'extent', 'Latin', 'and',
        'French', '.'
    ]
    assert text_tokens_only.tags_pos == [[], ['_JJ'], ['_VBZ'],
                                         ['_DT'], ['_JJ'], ['_JJ'], ['_NN'],
                                         ['_IN'], ['_VBD'], ['_RB'], ['_VBN'],
                                         ['_IN'], ['_RB'], ['_JJ'], ['_NN'],
                                         ['_CC'], ['_RB'], ['_VBD'], ['_DT'],
                                         ['_JJ'], ['_FW'], ['_FW'], ['_.'],
                                         ['_VBN'], ['_IN'], ['_DT'], ['_NNS'],
                                         ['_,'], ['_CD'], ['_IN'], ['_DT'],
                                         ['_JJ'], ['_NNS'], ['_IN'], ['_VBN'],
                                         ['_IN'], ['_DT'], ['_NN'], ['_IN'],
                                         ['_JJ'], ['_NNP'], ['_IN'], ['_MD'],
                                         ['_RB'], ['_VB'], ['_PRP$'], ['_NN'],
                                         ['_,'], ['_NN'], ['_,'], ['_DT'],
                                         ['_NNS'], ['_RB'], ['_VBG'], ['_IN'],
                                         ['_DT'], ['_NNP'], ['_NN'], ['_IN'],
                                         ['_DT'], ['_JJ'], ['_NNP'], ['_.'],
                                         ['_PRP'], ['_VBZ'], ['_RB'], ['_VBN'],
                                         ['_IN'], ['_DT'], ['_NNP'], ['_NNS'],
                                         ['_,'], ['_CC'], ['_PRP$'], ['_NN'],
                                         ['_VBZ'], ['_VBN'], ['_RB'], ['_VBN'],
                                         ['_IN'], ['_JJ'], ['_JJ'], ['_NNS'],
                                         ['_,'], ['_RB'], ['_NNP'], ['_-LRB-'],
                                         ['_DT'], ['_JJ'], ['_JJ'], ['_NN'],
                                         ['_-RRB-'], ['_,'], ['_CC'], ['_IN'],
                                         ['_DT'], ['_JJR'], ['_NN'], ['_JJ'],
                                         ['_CC'], ['_JJ'], ['_.']]
    assert text_tokens_only.tags_non_pos == [
        ['<TAG1>',
         '<TAG2>'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [],
        [], [], [], [], [], [], ['[4]', '[5]'], [], [], [], [], [], [], [], [],
        [], ['<TAG3>'], [], [], [], [], [], [], [], [], [], [], [], [], [], [],
        [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [],
        ['<TAG3>'], [], [], [], [], [], [], [], [], [], [], [], [], [], [],
        ['<TAG3>'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [],
        [], [], [], [], [], ['[6]', '<TAG4>', '<TAG5>']
    ]
    assert text_tokens_only.tags_all == [['<TAG1>', '<TAG2>'], ['_JJ'],
                                         ['_VBZ'], ['_DT'], ['_JJ'], ['_JJ'],
                                         ['_NN'], ['_IN'], ['_VBD'], ['_RB'],
                                         ['_VBN'], ['_IN'], ['_RB'], ['_JJ'],
                                         ['_NN'], ['_CC'], ['_RB'], ['_VBD'],
                                         ['_DT'], ['_JJ'], ['_FW'], ['_FW'],
                                         ['_.', '[4]',
                                          '[5]'], ['_VBN'], ['_IN'], ['_DT'],
                                         ['_NNS'], ['_,'], ['_CD'], ['_IN'],
                                         ['_DT'], ['_JJ'], ['_NNS', '<TAG3>'],
                                         ['_IN'], ['_VBN'], ['_IN'], ['_DT'],
                                         ['_NN'], ['_IN'], ['_JJ'], ['_NNP'],
                                         ['_IN'], ['_MD'], ['_RB'], ['_VB'],
                                         ['_PRP$'], ['_NN'], ['_,'], ['_NN'],
                                         ['_,'], ['_DT'], ['_NNS'], ['_RB'],
                                         ['_VBG'], ['_IN'], ['_DT'], ['_NNP'],
                                         ['_NN'], ['_IN'], ['_DT'], ['_JJ'],
                                         ['_NNP'], ['_.'], ['_PRP'], ['_VBZ'],
                                         ['_RB', '<TAG3>'], ['_VBN'], ['_IN'],
                                         ['_DT'], ['_NNP'], ['_NNS'], ['_,'],
                                         ['_CC'], ['_PRP$'], ['_NN'], ['_VBZ'],
                                         ['_VBN'], ['_RB'], ['_VBN'], ['_IN'],
                                         ['_JJ', '<TAG3>'], ['_JJ'], ['_NNS'],
                                         ['_,'], ['_RB'], ['_NNP'], ['_-LRB-'],
                                         ['_DT'], ['_JJ'], ['_JJ'], ['_NN'],
                                         ['_-RRB-'], ['_,'], ['_CC'], ['_IN'],
                                         ['_DT'], ['_JJR'], ['_NN'], ['_JJ'],
                                         ['_CC'], ['_JJ'],
                                         ['_.', '[6]', '<TAG4>', '<TAG5>']]
    assert text_tokens_only.offsets_paras == [0]
    assert text_tokens_only.offsets_sentences == [0]
    assert text_tokens_only.offsets_clauses == [0, 28, 48, 50, 72, 84, 93]

    assert (len(text_tokens_only.tokens) == len(text_tokens_only.tags_pos) ==
            len(text_tokens_only.tags_non_pos) == len(
                text_tokens_only.tags_all))
    assert len(text_tokens_only.offsets_paras) == len(
        text_tokens_only.tokens_sentences_paras)
    assert len(text_tokens_only.offsets_sentences) == sum(
        [len(para) for para in text_tokens_only.tokens_sentences_paras])

    assert text.tokens == [
        '', 'English', 'is', 'a', 'West', 'Germanic', 'language', 'that',
        'was', 'first', 'spoken', 'in', 'early', 'medieval', 'England', 'and',
        'eventually', 'became', 'a', 'global', 'lingua', 'franca', '.',
        'Named', 'after', 'the', 'Angles', ',', 'one', 'of', 'the', 'Germanic',
        'tribes', 'that', 'migrated', 'to', 'the', 'area', 'of', 'Great',
        'Britain', 'that', 'would', 'later', 'take', 'their', 'name', ',',
        'England', ',', 'both', 'names', 'ultimately', 'deriving', 'from',
        'the', 'Anglia', 'peninsula', 'in', 'the', 'Baltic', 'Sea', '.', 'It',
        'is', 'closely', 'related', 'to', 'the', 'Frisian', 'languages', ',',
        'but', 'its', 'vocabulary', 'has', 'been', 'significantly',
        'influenced', 'by', 'other', 'Germanic', 'languages', ',',
        'particularly', 'Norse', '(', 'a', 'North', 'Germanic', 'language',
        ')', ',', 'and', 'to', 'a', 'greater', 'extent', 'Latin', 'and',
        'French', '.'
    ]
    assert text.tags_pos == [[], ['_JJ'], ['_VBZ'], ['_DT'], ['_JJ'], ['_JJ'],
                             ['_NN'], ['_IN'], ['_VBD'], ['_RB'], ['_VBN'],
                             ['_IN'], ['_RB'], ['_JJ'], ['_NN'], ['_CC'],
                             ['_RB'], ['_VBD'], ['_DT'], ['_JJ'], ['_FW'],
                             ['_FW'], ['_.'], ['_VBN'], ['_IN'], ['_DT'],
                             ['_NNS'], ['_,'], ['_CD'], ['_IN'], ['_DT'],
                             ['_JJ'], ['_NNS'], ['_IN'], ['_VBN'], ['_IN'],
                             ['_DT'], ['_NN'], ['_IN'], ['_JJ'], ['_NNP'],
                             ['_IN'], ['_MD'], ['_RB'], ['_VB'], ['_PRP$'],
                             ['_NN'], ['_,'], ['_NN'], ['_,'], ['_DT'],
                             ['_NNS'], ['_RB'], ['_VBG'], ['_IN'], ['_DT'],
                             ['_NNP'], ['_NN'], ['_IN'], ['_DT'], ['_JJ'],
                             ['_NNP'], ['_.'], ['_PRP'], ['_VBZ'], ['_RB'],
                             ['_VBN'], ['_IN'], ['_DT'], ['_NNP'], ['_NNS'],
                             ['_,'], ['_CC'], ['_PRP$'], ['_NN'], ['_VBZ'],
                             ['_VBN'], ['_RB'], ['_VBN'], ['_IN'], ['_JJ'],
                             ['_JJ'], ['_NNS'], ['_,'], ['_RB'], ['_NNP'],
                             ['_-LRB-'], ['_DT'], ['_JJ'], ['_JJ'], ['_NN'],
                             ['_-RRB-'], ['_,'], ['_CC'], ['_IN'], ['_DT'],
                             ['_JJR'], ['_NN'], ['_JJ'], ['_CC'], ['_JJ'],
                             ['_.']]
    assert text.tags_non_pos == [
        ['<TAG1>',
         '<TAG2>'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [],
        [], [], [], [], [], [], ['[4]', '[5]'], [], [], [], [], [], [], [], [],
        [], ['<TAG3>'], [], [], [], [], [], [], [], [], [], [], [], [], [], [],
        [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [],
        ['<TAG3>'], [], [], [], [], [], [], [], [], [], [], [], [], [], [],
        ['<TAG3>'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [],
        [], [], [], [], [], ['[6]', '<TAG4>', '<TAG5>']
    ]
    assert text.tags_all == [['<TAG1>', '<TAG2>'], ['_JJ'], ['_VBZ'], ['_DT'],
                             ['_JJ'], ['_JJ'], ['_NN'], ['_IN'], ['_VBD'],
                             ['_RB'], ['_VBN'], ['_IN'], ['_RB'], ['_JJ'],
                             ['_NN'], ['_CC'], ['_RB'], ['_VBD'], ['_DT'],
                             ['_JJ'], ['_FW'], ['_FW'], ['_.', '[4]', '[5]'],
                             ['_VBN'], ['_IN'], ['_DT'], ['_NNS'], ['_,'],
                             ['_CD'], ['_IN'], ['_DT'], ['_JJ'],
                             ['_NNS', '<TAG3>'], ['_IN'], ['_VBN'], ['_IN'],
                             ['_DT'], ['_NN'], ['_IN'], ['_JJ'], ['_NNP'],
                             ['_IN'], ['_MD'], ['_RB'], ['_VB'], ['_PRP$'],
                             ['_NN'], ['_,'], ['_NN'], ['_,'], ['_DT'],
                             ['_NNS'], ['_RB'], ['_VBG'], ['_IN'], ['_DT'],
                             ['_NNP'], ['_NN'], ['_IN'], ['_DT'], ['_JJ'],
                             ['_NNP'], ['_.'], ['_PRP'], ['_VBZ'],
                             ['_RB', '<TAG3>'], ['_VBN'], ['_IN'], ['_DT'],
                             ['_NNP'], ['_NNS'], ['_,'], ['_CC'], ['_PRP$'],
                             ['_NN'], ['_VBZ'], ['_VBN'], ['_RB'], ['_VBN'],
                             ['_IN'], ['_JJ', '<TAG3>'], ['_JJ'], ['_NNS'],
                             ['_,'], ['_RB'], ['_NNP'], ['_-LRB-'], ['_DT'],
                             ['_JJ'], ['_JJ'], ['_NN'], ['_-RRB-'], ['_,'],
                             ['_CC'], ['_IN'], ['_DT'], ['_JJR'], ['_NN'],
                             ['_JJ'], ['_CC'], ['_JJ'],
                             ['_.', '[6]', '<TAG4>', '<TAG5>']]
    assert text.offsets_paras == [0]
    assert text.offsets_sentences == [0, 23, 63]
    assert text.offsets_clauses == [0, 23, 28, 48, 50, 63, 72, 84, 93]

    assert (len(text.tokens) == len(text.tags_pos) == len(text.tags_non_pos) ==
            len(text.tags_all))
    assert len(text.offsets_paras) == len(text.tokens_sentences_paras)
    assert len(text.offsets_sentences) == sum(
        [len(para) for para in text.tokens_sentences_paras])
Example #5
0
def test_text_untokenized_untagged():
    file = new_file(file_name='untokenized_untagged',
                    text_type=('untokenized', 'untagged'))

    text_tokens_only = wordless_text.Wordless_Text(main,
                                                   file,
                                                   tokens_only=True)
    text = wordless_text.Wordless_Text(main, file, tokens_only=False)

    assert text_tokens_only.tokens == [
        'English', 'is', 'a', 'West', 'Germanic', 'language', 'that', 'was',
        'first', 'spoken', 'in', 'early', 'medieval', 'England', 'and',
        'eventually', 'became', 'a', 'global', 'lingua', 'franca', '.',
        'Named', 'after', 'the', 'Angles', ',', 'one', 'of', 'the', 'Germanic',
        'tribes', 'that', 'migrated', 'to', 'the', 'area', 'of', 'Great',
        'Britain', 'that', 'would', 'later', 'take', 'their', 'name', ',',
        'England', ',', 'both', 'names', 'ultimately', 'deriving', 'from',
        'the', 'Anglia', 'peninsula', 'in', 'the', 'Baltic', 'Sea', '.', 'It',
        'is', 'closely', 'related', 'to', 'the', 'Frisian', 'languages', ',',
        'but', 'its', 'vocabulary', 'has', 'been', 'significantly',
        'influenced', 'by', 'other', 'Germanic', 'languages', ',',
        'particularly', 'Norse', '(', 'a', 'North', 'Germanic', 'language',
        ')', ',', 'and', 'to', 'a', 'greater', 'extent', 'Latin', 'and',
        'French', '.'
    ]
    assert text_tokens_only.tags_pos == [[]] * len(text_tokens_only.tokens)
    assert text_tokens_only.tags_non_pos == [[]] * len(text_tokens_only.tokens)
    assert text_tokens_only.tags_all == [[]] * len(text_tokens_only.tokens)
    assert text_tokens_only.offsets_paras == [0]
    assert text_tokens_only.offsets_sentences == [0]
    assert text_tokens_only.offsets_clauses == [0, 27, 47, 49, 71, 83, 92]

    assert (len(text_tokens_only.tokens) == len(text_tokens_only.tags_pos) ==
            len(text_tokens_only.tags_non_pos) == len(
                text_tokens_only.tags_all))
    assert len(text_tokens_only.offsets_paras) == len(
        text_tokens_only.tokens_sentences_paras)
    assert len(text_tokens_only.offsets_sentences) == sum(
        [len(para) for para in text_tokens_only.tokens_sentences_paras])

    assert text.tokens == [
        'English', 'is', 'a', 'West', 'Germanic', 'language', 'that', 'was',
        'first', 'spoken', 'in', 'early', 'medieval', 'England', 'and',
        'eventually', 'became', 'a', 'global', 'lingua', 'franca', '.',
        'Named', 'after', 'the', 'Angles', ',', 'one', 'of', 'the', 'Germanic',
        'tribes', 'that', 'migrated', 'to', 'the', 'area', 'of', 'Great',
        'Britain', 'that', 'would', 'later', 'take', 'their', 'name', ',',
        'England', ',', 'both', 'names', 'ultimately', 'deriving', 'from',
        'the', 'Anglia', 'peninsula', 'in', 'the', 'Baltic', 'Sea', '.', 'It',
        'is', 'closely', 'related', 'to', 'the', 'Frisian', 'languages', ',',
        'but', 'its', 'vocabulary', 'has', 'been', 'significantly',
        'influenced', 'by', 'other', 'Germanic', 'languages', ',',
        'particularly', 'Norse', '(', 'a', 'North', 'Germanic', 'language',
        ')', ',', 'and', 'to', 'a', 'greater', 'extent', 'Latin', 'and',
        'French', '.'
    ]
    assert text.tags_pos == [[]] * len(text.tokens)
    assert text.tags_non_pos == [[]] * len(text.tokens)
    assert text.tags_all == [[]] * len(text.tokens)
    assert text.offsets_paras == [0]
    assert text.offsets_sentences == [0, 22, 62]
    assert text.offsets_clauses == [0, 22, 27, 47, 49, 62, 71, 83, 92]

    assert (len(text.tokens) == len(text.tags_pos) == len(text.tags_non_pos) ==
            len(text.tags_all))
    assert len(text.offsets_paras) == len(text.tokens_sentences_paras)
    assert len(text.offsets_sentences) == sum(
        [len(para) for para in text.tokens_sentences_paras])
Example #6
0
    def process_data(self):
        texts = []

        settings = self.main.settings_custom['keywords']
        ref_file = self.main.wordless_files.find_file_by_name(
            settings['generation_settings']['ref_file'], selected_only=True)

        files = [
            file for file in self.main.wordless_files.get_selected_files()
            if file != ref_file
        ]

        # Frequency
        for i, file in enumerate([ref_file] + files):
            text = wordless_text.Wordless_Text(self.main, file)

            tokens = wordless_token_processing.wordless_process_tokens_wordlist(
                text, token_settings=settings['token_settings'])

            self.keywords_freq_files.append(collections.Counter(tokens))

            if i > 0:
                texts.append(text)
            else:
                tokens_ref = text.tokens
                len_tokens_ref = len(tokens_ref)

        # Total
        if len(files) > 1:
            text_total = wordless_text.Wordless_Text_Blank()
            text_total.tokens = [
                token for text in texts for token in text.tokens
            ]

            texts.append(text_total)
            self.keywords_freq_files.append(
                sum(self.keywords_freq_files, collections.Counter()))

            self.keywords_freq_files[0] = {
                token: freq
                for token, freq in self.keywords_freq_files[0].items()
                if token in text_total.tokens
            }
        else:
            self.keywords_freq_files[0] = {
                token: freq
                for token, freq in self.keywords_freq_files[0].items()
                if token in self.keywords_freq_files[1]
            }

        self.progress_updated.emit(self.tr('Processing data ...'))

        # Keyness
        text_test_significance = settings['generation_settings'][
            'test_significance']
        text_measure_effect_size = settings['generation_settings'][
            'measure_effect_size']

        test_significance = self.main.settings_global['tests_significance'][
            'keywords'][text_test_significance]['func']
        measure_effect_size = self.main.settings_global[
            'measures_effect_size']['keywords'][text_measure_effect_size][
                'func']

        keywords_freq_file_observed = self.keywords_freq_files[-1]
        keywords_freq_file_ref = self.keywords_freq_files[0]

        for text in texts:
            keywords_stats_file = {}

            tokens_observed = text.tokens
            len_tokens_observed = len(tokens_observed)

            if text_test_significance in [
                    self.tr('Student\'s t-test (Two-sample)'),
                    self.tr('Mann-Whitney U Test')
            ]:
                # Test Statistic, p-value & Bayes Factor
                if text_test_significance == self.tr(
                        'Student\'s t-test (Two-sample)'):
                    number_sections = self.main.settings_custom['measures'][
                        'statistical_significance'][
                            'students_t_test_2_sample']['number_sections']
                    use_data = self.main.settings_custom['measures'][
                        'statistical_significance'][
                            'students_t_test_2_sample']['use_data']
                elif text_test_significance == self.tr('Mann-Whitney U Test'):
                    number_sections = self.main.settings_custom['measures'][
                        'statistical_significance']['mann_whitney_u_test'][
                            'number_sections']
                    use_data = self.main.settings_custom['measures'][
                        'statistical_significance']['mann_whitney_u_test'][
                            'use_data']

                sections_observed = wordless_text_utils.to_sections(
                    tokens_observed, number_sections)
                sections_ref = wordless_text_utils.to_sections(
                    tokens_ref, number_sections)

                sections_freq_observed = [
                    collections.Counter(section)
                    for section in sections_observed
                ]
                sections_freq_ref = [
                    collections.Counter(section)
                    for section in sections_ref
                ]

                len_sections_observed = [
                    len(section) for section in sections_observed
                ]
                len_sections_ref = [len(section) for section in sections_ref]

                if use_data == self.tr('Absolute Frequency'):
                    for token in keywords_freq_file_observed:
                        counts_observed = [
                            section_freq.get(token, 0)
                            for section_freq in sections_freq_observed
                        ]
                        counts_ref = [
                            section_freq.get(token, 0)
                            for section_freq in sections_freq_ref
                        ]

                        keywords_stats_file[token] = test_significance(
                            self.main, counts_observed, counts_ref)
                elif use_data == self.tr('Relative Frequency'):
                    for token in keywords_freq_file_observed:
                        counts_observed = [
                            section_freq.get(token, 0) /
                            len_sections_observed[i]
                            for i, section_freq in enumerate(
                                sections_freq_observed)
                        ]
                        counts_ref = [
                            section_freq.get(token, 0) / len_sections_ref[i]
                            for i, section_freq in enumerate(sections_freq_ref)
                        ]

                        keywords_stats_file[token] = test_significance(
                            self.main, counts_observed, counts_ref)

                # Effect Size
                for token in keywords_freq_file_observed:
                    c11 = keywords_freq_file_observed.get(token, 0)
                    c12 = keywords_freq_file_ref.get(token, 0)
                    c21 = len_tokens_observed - c11
                    c22 = len_tokens_ref - c12

                    keywords_stats_file[token].append(
                        measure_effect_size(self.main, c11, c12, c21, c22))
            else:
                for token in keywords_freq_file_observed:
                    c11 = keywords_freq_file_observed.get(token, 0)
                    c12 = keywords_freq_file_ref.get(token, 0)
                    c21 = len_tokens_observed - c11
                    c22 = len_tokens_ref - c12

                    # Test Statistic, p-value & Bayes Factor
                    keywords_stats_file[token] = test_significance(
                        self.main, c11, c12, c21, c22)

                    # Effect Size
                    keywords_stats_file[token].append(
                        measure_effect_size(self.main, c11, c12, c21, c22))

            self.keywords_stats_files.append(keywords_stats_file)

        if len(files) == 1:
            self.keywords_freq_files.append(self.keywords_freq_files[1])
            self.keywords_stats_files *= 2
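The 2x2 contingency counts passed to the significance test and effect size measure in the loop above can be illustrated with plain Counters; the two token lists below are invented purely for illustration.

import collections

# Illustrative observed and reference corpora (stand-ins for the Wordless texts)
tokens_observed = ['english', 'is', 'a', 'germanic', 'language', 'germanic']
tokens_ref = ['french', 'is', 'a', 'romance', 'language']

freq_observed = collections.Counter(tokens_observed)
freq_ref = collections.Counter(tokens_ref)

len_tokens_observed = len(tokens_observed)
len_tokens_ref = len(tokens_ref)

for token in freq_observed:
    # Same contingency table as above:
    #     c11: token in observed corpus    c12: token in reference corpus
    #     c21: other observed tokens       c22: other reference tokens
    c11 = freq_observed.get(token, 0)
    c12 = freq_ref.get(token, 0)
    c21 = len_tokens_observed - c11
    c22 = len_tokens_ref - c12

    print(token, (c11, c12, c21, c22))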
Example #7
0
    def process_data(self):
        texts = []

        settings = self.main.settings_custom['overview']
        files = self.main.wordless_files.get_selected_files()

        for i, file in enumerate(files):
            text = wordless_text.Wordless_Text(self.main,
                                               file,
                                               flat_tokens=False)
            wordless_token_processing.wordless_process_tokens_overview(
                text, token_settings=settings['token_settings'])

            texts.append(text)

        if len(files) > 1:
            text_total = wordless_text.Wordless_Text_Blank()
            text_total.offsets_paras = [
                offset for text in texts for offset in text.offsets_paras
            ]
            text_total.offsets_sentences = [
                offset for text in texts for offset in text.offsets_sentences
            ]
            text_total.offsets_clauses = [
                offset for text in texts for offset in text.offsets_clauses
            ]
            text_total.tokens_hierarchical = [
                para for text in texts for para in text.tokens_hierarchical
            ]
            text_total.tokens_flat = [
                token for text in texts for token in text.tokens_flat
            ]

            texts.append(text_total)
        else:
            texts.append(texts[0])

        self.progress_updated.emit(self.tr('Processing data ...'))

        base_sttr = settings['generation_settings']['base_sttr']

        for text in texts:
            texts_stats_file = []

            # Paragraph length (in sentences, clauses and tokens respectively)
            len_paras_in_sentence = [
                len(para) for para in text.tokens_hierarchical
            ]
            len_paras_in_clause = [
                sum([len(sentence) for sentence in para])
                for para in text.tokens_hierarchical
            ]
            len_paras_in_token = [
                sum([len(clause) for sentence in para for clause in sentence])
                for para in text.tokens_hierarchical
            ]

            # Sentence length
            len_sentences = [
                sum([len(clause) for clause in sentence])
                for para in text.tokens_hierarchical for sentence in para
            ]

            # Clause length
            len_clauses = [
                len(clause) for para in text.tokens_hierarchical
                for sentence in para for clause in sentence
            ]

            # Token length
            len_tokens = [len(token) for token in text.tokens_flat]
            # Type length
            len_types = [
                len(token_type) for token_type in set(text.tokens_flat)
            ]

            count_tokens = len(len_tokens)
            count_types = len(len_types)

            # TTR
            if count_tokens == 0:
                ttr = 0
            else:
                ttr = count_types / count_tokens

            # STTR
            if count_tokens < base_sttr:
                sttr = ttr
            else:
                token_sections = wordless_text_utils.to_sections_unequal(
                    text.tokens_flat, base_sttr)

                # Discard the last section if the number of tokens in it is smaller than the STTR base
                if len(token_sections[-1]) < base_sttr:
                    ttrs = [
                        len(set(token_section)) / len(token_section)
                        for token_section in token_sections[:-1]
                    ]
                else:
                    ttrs = [
                        len(set(token_section)) / len(token_section)
                        for token_section in token_sections
                    ]

                sttr = sum(ttrs) / len(ttrs)

            texts_stats_file.append(len_paras_in_sentence)
            texts_stats_file.append(len_paras_in_clause)
            texts_stats_file.append(len_paras_in_token)
            texts_stats_file.append(len_sentences)
            texts_stats_file.append(len_clauses)
            texts_stats_file.append(len_tokens)
            texts_stats_file.append(len_types)
            texts_stats_file.append(ttr)
            texts_stats_file.append(sttr)

            self.texts_stats_files.append(texts_stats_file)
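The TTR/STTR logic above can also be read as a standalone function; the chunking here is a stand-in for `wordless_text_utils.to_sections_unequal`, assumed to cut the token list into consecutive chunks of `base_sttr` tokens with a possibly shorter final chunk.

def sttr(tokens, base_sttr=1000):
    # Type-token ratio over the whole text
    if not tokens:
        return 0

    ttr = len(set(tokens)) / len(tokens)

    # Fall back to plain TTR for texts shorter than the STTR base
    if len(tokens) < base_sttr:
        return ttr

    # Stand-in for wordless_text_utils.to_sections_unequal
    sections = [
        tokens[i:i + base_sttr] for i in range(0, len(tokens), base_sttr)
    ]

    # Discard the last section if it is shorter than the STTR base
    if len(sections[-1]) < base_sttr:
        sections = sections[:-1]

    return sum(len(set(section)) / len(section)
               for section in sections) / len(sections)


print(sttr(['the', 'the', 'cat', 'on', 'the', 'mat'], base_sttr=3))  # (2/3 + 1) / 2 = 0.83...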
Example #8
0
    def process_data(self):
        texts = []

        settings = self.main.settings_custom['overview']
        files = self.main.wordless_files.get_selected_files()

        for i, file in enumerate(files):
            text = wordless_text.Wordless_Text(self.main, file, tokens_only=False)
            text.tokens = wordless_token_processing.wordless_process_tokens_overview(
                text, token_settings=settings['token_settings'])

            texts.append(text)

        if len(files) > 1:
            text_total = wordless_text.Wordless_Text_Blank()
            text_total.para_offsets = [offset for text in texts for offset in text.para_offsets]
            text_total.sentence_offsets = [offset for text in texts for offset in text.sentence_offsets]
            text_total.tokens = [token for text in texts for token in text.tokens]

            texts.append(text_total)
        else:
            texts.append(texts[0])

        self.progress_updated.emit(self.tr('Processing data ...'))

        base_sttr = settings['generation_settings']['base_sttr']

        for text in texts:
            texts_stats_file = []

            count_paras = len(text.para_offsets)
            count_sentences = len(text.sentence_offsets)
            count_tokens = len(text.tokens)
            count_types = len(set(text.tokens))

            len_tokens = [len(token) for token in text.tokens]
            self.texts_len_tokens_files.append(collections.Counter(len_tokens))

            count_chars = sum(len_tokens)

            if count_tokens == 0:
                ttr = 0
            else:
                ttr = count_types / count_tokens

            if count_tokens < base_sttr:
                sttr = ttr
            else:
                token_sections = wordless_text_utils.to_sections_unequal(text.tokens, base_sttr)

                # Discard the last section if the number of tokens in it is smaller than the STTR base
                if len(token_sections[-1]) < base_sttr:
                    ttrs = [len(set(token_section)) / len(token_section) for token_section in token_sections[:-1]]
                else:
                    ttrs = [len(set(token_section)) / len(token_section) for token_section in token_sections]

                sttr = sum(ttrs) / len(ttrs)

            texts_stats_file.append(count_paras)
            texts_stats_file.append(count_sentences)
            texts_stats_file.append(count_tokens)
            texts_stats_file.append(count_types)
            texts_stats_file.append(count_chars)
            texts_stats_file.append(ttr)
            texts_stats_file.append(sttr)

            self.texts_stats_files.append(texts_stats_file)