# Worker methods from the Wordless corpus tool (keyword and wordlist
# generation). The imports below are assumptions based on the Wordless
# source tree:
import collections

from wordless_text import wordless_text, wordless_text_utils, wordless_token_processing

def process_data(self):
        texts = []

        settings = self.main.settings_custom['keywords']
        ref_file = self.main.wordless_files.find_file_by_name(
            settings['generation_settings']['ref_file'], selected_only=True)

        files = [
            file for file in self.main.wordless_files.get_selected_files()
            if file != ref_file
        ]

        # Frequency
        for i, file in enumerate([ref_file] + files):
            text = wordless_text.Wordless_Text(self.main, file)

            tokens = wordless_token_processing.wordless_process_tokens_wordlist(
                text, token_settings=settings['token_settings'])

            self.keywords_freq_files.append(collections.Counter(tokens))

            # The reference file is processed first (i == 0)
            if i > 0:
                texts.append(text)
            else:
                tokens_ref = text.tokens
                len_tokens_ref = len(tokens_ref)

        # Total
        if len(files) > 1:
            text_total = wordless_text.Wordless_Text_Blank()
            text_total.tokens = [
                token for text in texts for token in text.tokens
            ]

            texts.append(text_total)
            self.keywords_freq_files.append(
                sum(self.keywords_freq_files, collections.Counter()))

            # Keep only reference tokens that also occur in the observed
            # files; a set makes the membership test O(1) per token
            tokens_observed_total = set(text_total.tokens)

            self.keywords_freq_files[0] = {
                token: freq
                for token, freq in self.keywords_freq_files[0].items()
                if token in tokens_observed_total
            }
        else:
            self.keywords_freq_files[0] = {
                token: freq
                for token, freq in self.keywords_freq_files[0].items()
                if token in self.keywords_freq_files[1]
            }

        self.progress_updated.emit(self.tr('Processing data ...'))

        # Keyness
        text_test_significance = settings['generation_settings'][
            'test_significance']
        text_measure_effect_size = settings['generation_settings'][
            'measure_effect_size']

        test_significance = self.main.settings_global['tests_significance'][
            'keywords'][text_test_significance]['func']
        measure_effect_size = self.main.settings_global[
            'measures_effect_size']['keywords'][text_measure_effect_size][
                'func']

        keywords_freq_file_observed = self.keywords_freq_files[-1]
        keywords_freq_file_ref = self.keywords_freq_files[0]

        for text in texts:
            keywords_stats_file = {}

            tokens_observed = text.tokens
            len_tokens_observed = len(tokens_observed)

            if text_test_significance in [
                    self.tr('Student\'s t-test (Two-sample)'),
                    self.tr('Mann-Whitney U Test')
            ]:
                # Test Statistic, p-value & Bayes Factor
                if text_test_significance == self.tr(
                        'Student\'s t-test (Two-sample)'):
                    number_sections = self.main.settings_custom['measures'][
                        'statistical_significance'][
                            'students_t_test_2_sample']['number_sections']
                    use_data = self.main.settings_custom['measures'][
                        'statistical_significance'][
                            'students_t_test_2_sample']['use_data']
                elif text_test_significance == self.tr('Mann-Whitney U Test'):
                    number_sections = self.main.settings_custom['measures'][
                        'statistical_significance']['mann_whitney_u_test'][
                            'number_sections']
                    use_data = self.main.settings_custom['measures'][
                        'statistical_significance']['mann_whitney_u_test'][
                            'use_data']

                sections_observed = wordless_text_utils.to_sections(
                    tokens_observed, number_sections)
                sections_ref = wordless_text_utils.to_sections(
                    tokens_ref, number_sections)

                sections_freq_observed = [
                    collections.Counter(section)
                    for section in sections_observed
                ]
                sections_freq_ref = [
                    collections.Counter(section)
                    for section in sections_ref
                ]

                len_sections_observed = [
                    len(section) for section in sections_observed
                ]
                len_sections_ref = [len(section) for section in sections_ref]

                if use_data == self.tr('Absolute Frequency'):
                    for token in keywords_freq_file_observed:
                        counts_observed = [
                            section_freq.get(token, 0)
                            for section_freq in sections_freq_observed
                        ]
                        counts_ref = [
                            section_freq.get(token, 0)
                            for section_freq in sections_freq_ref
                        ]

                        keywords_stats_file[token] = test_significance(
                            self.main, counts_observed, counts_ref)
                elif use_data == self.tr('Relative Frequency'):
                    for token in keywords_freq_file_observed:
                        counts_observed = [
                            section_freq.get(token, 0) /
                            len_sections_observed[i]
                            for i, section_freq in enumerate(
                                sections_freq_observed)
                        ]
                        counts_ref = [
                            section_freq.get(token, 0) / len_sections_ref[i]
                            for i, section_freq in enumerate(sections_freq_ref)
                        ]

                        keywords_stats_file[token] = test_significance(
                            self.main, counts_observed, counts_ref)

                # Effect Size
                for token in keywords_freq_file_observed:
                    c11 = keywords_freq_file_observed.get(token, 0)
                    c12 = keywords_freq_file_ref.get(token, 0)
                    c21 = len_tokens_observed - c11
                    c22 = len_tokens_ref - c12

                    keywords_stats_file[token].append(
                        measure_effect_size(self.main, c11, c12, c21, c22))
            else:
                for token in keywords_freq_file_observed:
                    c11 = keywords_freq_file_observed.get(token, 0)
                    c12 = keywords_freq_file_ref.get(token, 0)
                    c21 = len_tokens_observed - c11
                    c22 = len_tokens_ref - c12

                    # Test Statistic, p-value & Bayes Factor
                    keywords_stats_file[token] = test_significance(
                        self.main, c11, c12, c21, c22)

                    # Effect Size
                    keywords_stats_file[token].append(
                        measure_effect_size(self.main, c11, c12, c21, c22))

            self.keywords_stats_files.append(keywords_stats_file)

        # With a single observed file, reuse its frequencies and statistics
        # as the "Total" data as well
        if len(files) == 1:
            self.keywords_freq_files.append(self.keywords_freq_files[1])
            self.keywords_stats_files *= 2
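
# The significance tests and effect-size measures above are looked up as
# 'func' entries in settings_global, which are not shown here. Below is a
# minimal sketch of what one such pair might look like for the
# contingency-table branch: a log-likelihood-ratio test and an odds-ratio
# effect size. The function names and return formats are assumptions for
# illustration, not the actual Wordless implementations (per the comments
# above, the real test functions may also return a Bayes factor).
import math

import scipy.stats

def log_likelihood_ratio_test(main, c11, c12, c21, c22):
    # c11/c12: frequency of the token in the observed/reference corpus
    # c21/c22: frequency of all other tokens in the observed/reference corpus
    len_observed = c11 + c21
    len_ref = c12 + c22

    # Expected frequencies under the null hypothesis that the token is
    # equally likely in both corpora
    e11 = len_observed * (c11 + c12) / (len_observed + len_ref)
    e12 = len_ref * (c11 + c12) / (len_observed + len_ref)

    # G2 = 2 * sum(O * ln(O / E)), treating 0 * ln(0) as 0
    g2 = 2 * sum(o * math.log(o / e)
                 for o, e in ((c11, e11), (c12, e12))
                 if o > 0)

    # One degree of freedom; p-value from the chi-squared survival function
    p_value = scipy.stats.chi2.sf(g2, 1)

    return [g2, p_value]

def odds_ratio(main, c11, c12, c21, c22):
    # Diverges when either off-diagonal count is zero
    if c12 == 0 or c21 == 0:
        return math.inf

    return (c11 * c22) / (c12 * c21)
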
def process_data(self):
        texts = []

        settings = self.main.settings_custom['wordlist']
        files = self.main.wordless_files.get_selected_files()

        # Frequency
        for file in files:
            text = wordless_text.Wordless_Text(self.main, file)

            tokens = wordless_token_processing.wordless_process_tokens_wordlist(
                text, token_settings=settings['token_settings'])

            texts.append(text)
            self.tokens_freq_files.append(collections.Counter(tokens))

        # Total
        if len(files) > 1:
            text_total = wordless_text.Wordless_Text_Blank()
            text_total.tokens = [
                token for text in texts for token in text.tokens
            ]

            texts.append(text_total)
            self.tokens_freq_files.append(
                sum(self.tokens_freq_files, collections.Counter()))

        self.progress_updated.emit(self.tr('Processing data ...'))

        # Dispersion & Adjusted Frequency
        text_measure_dispersion = settings['generation_settings'][
            'measure_dispersion']
        text_measure_adjusted_freq = settings['generation_settings'][
            'measure_adjusted_freq']

        measure_dispersion = self.main.settings_global['measures_dispersion'][
            text_measure_dispersion]['func']
        measure_adjusted_freq = self.main.settings_global[
            'measures_adjusted_freq'][text_measure_adjusted_freq]['func']

        tokens_total = self.tokens_freq_files[-1].keys()

        for text in texts:
            tokens_stats_file = {}

            # Dispersion
            number_sections = self.main.settings_custom['measures'][
                'dispersion']['general']['number_sections']

            sections_freq = [
                collections.Counter(section)
                for section in wordless_text_utils.to_sections(
                    text.tokens, number_sections)
            ]

            for token in tokens_total:
                # Counter returns 0 for tokens absent from a section
                counts = [
                    section_freq[token] for section_freq in sections_freq
                ]

                tokens_stats_file[token] = [measure_dispersion(counts)]

            # Adjusted Frequency
            if not self.main.settings_custom['measures']['adjusted_freq'][
                    'general']['use_same_settings_dispersion']:
                number_sections = self.main.settings_custom['measures'][
                    'adjusted_freq']['general']['number_sections']

                sections_freq = [
                    collections.Counter(section)
                    for section in wordless_text_utils.to_sections(
                        text.tokens, number_sections)
                ]

            for token in tokens_total:
                counts = [
                    section_freq[token] for section_freq in sections_freq
                ]

                tokens_stats_file[token].append(measure_adjusted_freq(counts))

            self.tokens_stats_files.append(tokens_stats_file)

        # With a single file, duplicate the frequencies and statistics so the
        # "Total" data is available as well
        if len(files) == 1:
            self.tokens_freq_files *= 2
            self.tokens_stats_files *= 2
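
# measure_dispersion and measure_adjusted_freq are likewise 'func' entries
# looked up from settings_global. As one plausible pair (an assumption for
# illustration, not necessarily the measures selected at runtime), Juilland's
# D and Juilland's U can be computed from the same per-section counts:
import math
import statistics

def juillands_d(counts):
    # counts: absolute frequencies of one token across the sections of a text
    mean = statistics.mean(counts)

    if mean == 0:
        return 0

    # D = 1 - V / sqrt(n - 1), where V is the coefficient of variation
    cv = statistics.pstdev(counts) / mean

    return 1 - cv / math.sqrt(len(counts) - 1)

def juillands_u(counts):
    # U = D * F: the total frequency weighted by its dispersion
    return juillands_d(counts) * sum(counts)
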
def test_to_sections():
    tokens = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

    token_sections = wordless_text_utils.to_sections(tokens, num_sections=5)

    assert token_sections == [[1, 2, 3], [4, 5, 6], [7, 8], [9, 10], [11, 12]]
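
# wordless_text_utils.to_sections itself is not shown here. A minimal sketch
# that satisfies the assertion above (when the tokens do not divide evenly,
# the earlier sections absorb the remainder):
def to_sections(tokens, num_sections):
    sections = []

    len_section, remainder = divmod(len(tokens), num_sections)
    start = 0

    for i in range(num_sections):
        # The first `remainder` sections get one extra token each
        len_current = len_section + 1 if i < remainder else len_section

        sections.append(tokens[start:start + len_current])
        start += len_current

    return sections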