def test_text_tokenized_tagged_both():
    file = new_file(text_type=('tokenized', 'tagged_both'))

    text_flat_tokens = wl_text.Wl_Text(main, file, flat_tokens=True)
    text = wl_text.Wl_Text(main, file, flat_tokens=False)

    assert text_flat_tokens.tokens_flat != []
    assert text_flat_tokens.tags_pos != [[]] * len(text_flat_tokens.tokens_flat)
    assert text_flat_tokens.tags_non_pos != [[]] * len(text_flat_tokens.tokens_flat)
    assert text_flat_tokens.tags_all != [[]] * len(text_flat_tokens.tokens_flat)

    assert text_flat_tokens.offsets_paras == [0]
    assert text_flat_tokens.offsets_sentences == [0]
    assert text_flat_tokens.offsets_clauses == [0]

    assert (len(text_flat_tokens.tokens_flat)
            == len(text_flat_tokens.tags_pos)
            == len(text_flat_tokens.tags_non_pos)
            == len(text_flat_tokens.tags_all))
    assert len(text_flat_tokens.offsets_paras) == len(text_flat_tokens.tokens_hierarchical)
    assert len(text_flat_tokens.offsets_sentences) == sum([
        len(para)
        for para in text_flat_tokens.tokens_hierarchical
    ])
    assert len(text_flat_tokens.offsets_clauses) == sum([
        len(sentence)
        for para in text_flat_tokens.tokens_hierarchical
        for sentence in para
    ])

    assert text.tokens_flat != []
    assert text.tags_pos != [[]] * len(text.tokens_flat)
    assert text.tags_non_pos != [[]] * len(text.tokens_flat)
    assert text.tags_all != [[]] * len(text.tokens_flat)

    assert text.offsets_paras != [0]
    assert text.offsets_sentences != [0]
    assert text.offsets_clauses != [0]

    assert (len(text.tokens_flat)
            == len(text.tags_pos)
            == len(text.tags_non_pos)
            == len(text.tags_all))
    assert len(text.offsets_paras) == len(text.tokens_hierarchical)
    assert len(text.offsets_sentences) == sum([
        len(para)
        for para in text.tokens_hierarchical
    ])
    assert len(text.offsets_clauses) == sum([
        len(sentence)
        for para in text.tokens_hierarchical
        for sentence in para
    ])
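# A minimal, standalone sketch (not part of the test suite above) of the data layout
# these assertions rely on: tokens_hierarchical nests paragraphs > sentences > clauses,
# tokens_flat holds the same tokens flattened, and each offsets_* list records the index
# in tokens_flat at which every paragraph/sentence/clause starts. All names below are
# hypothetical and for illustration only.
tokens_hierarchical_example = [          # one paragraph ...
    [                                    # ... containing one sentence ...
        ['This', 'is', 'a', 'clause'],   # ... split into two clauses
        [',', 'and', 'another', 'one', '.'],
    ],
]
tokens_flat_example = [
    token
    for para in tokens_hierarchical_example
    for sentence in para
    for clause in sentence
    for token in clause
]
# With a single paragraph and sentence, the paragraph and sentence offsets both start at
# token 0; this mirrors the flat_tokens=True case above, where the whole text is treated
# as one paragraph/sentence/clause and every offsets_* list equals [0].
assert len(tokens_flat_example) == 9
assert tokens_flat_example[4] == ','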
def run(self):
    texts = []

    settings = self.main.settings_custom['overview']
    files = self.main.wl_files.get_selected_files()

    for i, file in enumerate(files):
        text = wl_text.Wl_Text(self.main, file, flat_tokens=False)
        text = wl_token_processing.wl_process_tokens_overview(
            text,
            token_settings=settings['token_settings']
        )

        texts.append(text)

    if len(files) > 1:
        text_total = wl_text.Wl_Text_Blank()
        text_total.offsets_paras = [
            offset
            for text in texts
            for offset in text.offsets_paras
        ]
        text_total.offsets_sentences = [
            offset
            for text in texts
            for offset in text.offsets_sentences
        ]
        text_total.offsets_clauses = [
            offset
            for text in texts
            for offset in text.offsets_clauses
        ]
        text_total.tokens_multilevel = [
            para
            for text in texts
            for para in text.tokens_multilevel
        ]
        text_total.tokens_flat = [
            token
            for text in texts
            for token in text.tokens_flat
        ]

        texts.append(text_total)
    else:
        texts.append(texts[0])

    self.progress_updated.emit(self.tr('Processing data ...'))

    base_sttr = settings['generation_settings']['base_sttr']

    for text in texts:
        texts_stats_file = []

        # Paragraph length
        len_paras_in_sentence = [len(para) for para in text.tokens_multilevel]
        len_paras_in_clause = [
            sum([len(sentence) for sentence in para])
            for para in text.tokens_multilevel
        ]
        len_paras_in_token = [
            sum([len(clause) for sentence in para for clause in sentence])
            for para in text.tokens_multilevel
        ]

        # Sentence length
        len_sentences = [
            sum([len(clause) for clause in sentence])
            for para in text.tokens_multilevel
            for sentence in para
        ]

        # Clause length
        len_clauses = [
            len(clause)
            for para in text.tokens_multilevel
            for sentence in para
            for clause in sentence
        ]

        # Token length
        len_tokens = [len(token) for token in text.tokens_flat]
        # Type length
        len_types = [len(token_type) for token_type in set(text.tokens_flat)]

        count_tokens = len(len_tokens)
        count_types = len(len_types)

        # TTR
        if count_tokens:
            ttr = count_types / count_tokens
        else:
            ttr = 0

        # STTR
        if count_tokens < base_sttr:
            sttr = ttr
        else:
            token_sections = wl_text_utils.to_sections_unequal(text.tokens_flat, base_sttr)

            # Discard the last section if the number of tokens in it is smaller than the STTR base
            if len(token_sections[-1]) < base_sttr:
                ttrs = [
                    len(set(token_section)) / len(token_section)
                    for token_section in token_sections[:-1]
                ]
            else:
                ttrs = [
                    len(set(token_section)) / len(token_section)
                    for token_section in token_sections
                ]

            sttr = sum(ttrs) / len(ttrs)

        texts_stats_file.append(len_paras_in_sentence)
        texts_stats_file.append(len_paras_in_clause)
        texts_stats_file.append(len_paras_in_token)
        texts_stats_file.append(len_sentences)
        texts_stats_file.append(len_clauses)
        texts_stats_file.append(len_tokens)
        texts_stats_file.append(len_types)
        texts_stats_file.append(ttr)
        texts_stats_file.append(sttr)

        self.texts_stats_files.append(texts_stats_file)
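# A minimal, self-contained sketch of the STTR computation above, assuming
# wl_text_utils.to_sections_unequal(tokens, size) splits tokens into chunks of `size`
# tokens with a possibly shorter remainder. The helper name and the example values below
# are hypothetical and only illustrate the arithmetic, not the project's actual API.
def sttr_example(tokens, base_sttr=1000):
    if len(tokens) < base_sttr:
        # Fall back to the plain type-token ratio for texts shorter than one section
        return len(set(tokens)) / len(tokens) if tokens else 0

    sections = [
        tokens[i:i + base_sttr]
        for i in range(0, len(tokens), base_sttr)
    ]
    # Drop a short trailing section so every per-section TTR uses the same base
    if len(sections[-1]) < base_sttr:
        sections = sections[:-1]

    ttrs = [len(set(section)) / len(section) for section in sections]

    return sum(ttrs) / len(ttrs)

# e.g. 3000 tokens alternating between two types -> 2 types per 1000-token section
assert sttr_example(['a', 'b'] * 1500, base_sttr=1000) == 0.002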
def run(self):
    texts = []

    settings = self.main.settings_custom['wordlist']
    files = self.main.wl_files.get_selected_files()

    # Frequency
    for file in files:
        text = wl_text.Wl_Text(self.main, file)
        text = wl_token_processing.wl_process_tokens_wordlist(
            text,
            token_settings=settings['token_settings']
        )

        # Remove empty tokens
        tokens = [token for token in text.tokens_flat if token]

        self.tokens_freq_files.append(collections.Counter(tokens))
        texts.append(text)

    # Total
    if len(files) > 1:
        text_total = wl_text.Wl_Text_Blank()
        text_total.tokens_flat = [
            token
            for text in texts
            for token in text.tokens_flat
        ]

        self.tokens_freq_files.append(sum(self.tokens_freq_files, collections.Counter()))
        texts.append(text_total)

    self.progress_updated.emit(self.tr('Processing data ...'))

    # Dispersion & Adjusted Frequency
    text_measure_dispersion = settings['generation_settings']['measure_dispersion']
    text_measure_adjusted_freq = settings['generation_settings']['measure_adjusted_freq']

    measure_dispersion = self.main.settings_global['measures_dispersion'][text_measure_dispersion]['func']
    measure_adjusted_freq = self.main.settings_global['measures_adjusted_freq'][text_measure_adjusted_freq]['func']

    tokens_total = self.tokens_freq_files[-1].keys()

    for text in texts:
        tokens_stats_file = {}

        # Dispersion
        number_sections = self.main.settings_custom['measures']['dispersion']['general']['number_sections']

        sections_freq = [
            collections.Counter(section)
            for section in wl_text_utils.to_sections(text.tokens_flat, number_sections)
        ]

        for token in tokens_total:
            counts = [section_freq[token] for section_freq in sections_freq]

            tokens_stats_file[token] = [measure_dispersion(counts)]

        # Adjusted Frequency
        if not self.main.settings_custom['measures']['adjusted_freq']['general']['use_same_settings_dispersion']:
            number_sections = self.main.settings_custom['measures']['adjusted_freq']['general']['number_sections']

            sections_freq = [
                collections.Counter(section)
                for section in wl_text_utils.to_sections(text.tokens_flat, number_sections)
            ]

        for token in tokens_total:
            counts = [section_freq[token] for section_freq in sections_freq]

            tokens_stats_file[token].append(measure_adjusted_freq(counts))

        self.tokens_stats_files.append(tokens_stats_file)

    if len(files) == 1:
        self.tokens_freq_files *= 2
        self.tokens_stats_files *= 2
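# A minimal sketch of the per-token dispersion input built above, assuming
# wl_text_utils.to_sections(tokens, n) splits the token list into n roughly equal parts.
# The splitting helper below is a hypothetical stand-in for illustration; the actual
# dispersion and adjusted-frequency measures come from self.main.settings_global.
import collections

def to_sections_example(tokens, number_sections):
    # Distribute tokens over number_sections parts; earlier parts get one extra token
    # when len(tokens) is not evenly divisible
    quotient, remainder = divmod(len(tokens), number_sections)

    sections = []
    start = 0

    for i in range(number_sections):
        end = start + quotient + (1 if i < remainder else 0)
        sections.append(tokens[start:end])
        start = end

    return sections

tokens_example = ['the', 'cat', 'sat', 'on', 'the', 'mat', 'the', 'end']
sections_freq_example = [
    collections.Counter(section)
    for section in to_sections_example(tokens_example, 4)
]
# The per-section counts of a token are what measure_dispersion/measure_adjusted_freq
# receive, e.g. for 'the':
counts_example = [section_freq['the'] for section_freq in sections_freq_example]
assert counts_example == [1, 0, 1, 1]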
def run(self):
    texts = []

    settings = self.main.settings_custom['keyword']

    ref_file = self.main.wl_files.find_file_by_name(
        settings['generation_settings']['ref_file'],
        selected_only=True
    )
    files = [
        file
        for file in self.main.wl_files.get_selected_files()
        if file != ref_file
    ]

    # Frequency
    for i, file in enumerate([ref_file] + files):
        text = wl_text.Wl_Text(self.main, file)
        text = wl_token_processing.wl_process_tokens_keyword(
            text,
            token_settings=settings['token_settings']
        )

        # Remove empty tokens
        tokens = [token for token in text.tokens_flat if token]

        self.keywords_freq_files.append(collections.Counter(tokens))

        if i > 0:
            texts.append(text)
        else:
            tokens_ref = text.tokens_flat.copy()
            len_tokens_ref = len(tokens_ref)

    # Total
    if len(files) > 1:
        text_total = wl_text.Wl_Text_Blank()
        text_total.tokens_flat = [
            token
            for text in texts
            for token in text.tokens_flat
        ]

        self.keywords_freq_files.append(sum(self.keywords_freq_files, collections.Counter()))
        self.keywords_freq_files[0] = {
            token: freq
            for token, freq in self.keywords_freq_files[0].items()
            if token in text_total.tokens_flat
        }

        texts.append(text_total)
    else:
        self.keywords_freq_files[0] = {
            token: freq
            for token, freq in self.keywords_freq_files[0].items()
            if token in self.keywords_freq_files[1]
        }

    self.progress_updated.emit(self.tr('Processing data ...'))

    # Keyness
    text_test_significance = settings['generation_settings']['test_significance']
    text_measure_effect_size = settings['generation_settings']['measure_effect_size']

    test_significance = self.main.settings_global['tests_significance']['keyword'][text_test_significance]['func']
    measure_effect_size = self.main.settings_global['measures_effect_size']['keyword'][text_measure_effect_size]['func']

    keywords_freq_file_observed = self.keywords_freq_files[-1]
    keywords_freq_file_ref = self.keywords_freq_files[0]

    for text in texts:
        keywords_stats_file = {}

        tokens_observed = text.tokens_flat
        len_tokens_observed = len(tokens_observed)

        if text_test_significance in [
            self.tr('Student\'s t-test (Two-sample)'),
            self.tr('Mann-Whitney U Test')
        ]:
            # Test Statistic, p-value & Bayes Factor
            if text_test_significance == self.tr('Student\'s t-test (Two-sample)'):
                number_sections = self.main.settings_custom['measures']['statistical_significance']['students_t_test_2_sample']['number_sections']
                use_data = self.main.settings_custom['measures']['statistical_significance']['students_t_test_2_sample']['use_data']
            elif text_test_significance == self.tr('Mann-Whitney U Test'):
                number_sections = self.main.settings_custom['measures']['statistical_significance']['mann_whitney_u_test']['number_sections']
                use_data = self.main.settings_custom['measures']['statistical_significance']['mann_whitney_u_test']['use_data']

            sections_observed = wl_text_utils.to_sections(tokens_observed, number_sections)
            sections_ref = wl_text_utils.to_sections(tokens_ref, number_sections)

            sections_freq_observed = [
                collections.Counter(section)
                for section in sections_observed
            ]
            sections_freq_ref = [
                collections.Counter(section)
                for section in sections_ref
            ]

            len_sections_observed = [len(section) for section in sections_observed]
            len_sections_ref = [len(section) for section in sections_ref]

            if use_data == self.tr('Absolute Frequency'):
                for token in keywords_freq_file_observed:
                    counts_observed = [
                        section_freq.get(token, 0)
                        for section_freq in sections_freq_observed
                    ]
                    counts_ref = [
                        section_freq.get(token, 0)
                        for section_freq in sections_freq_ref
                    ]

                    keywords_stats_file[token] = test_significance(self.main, counts_observed, counts_ref)
            elif use_data == self.tr('Relative Frequency'):
                for token in keywords_freq_file_observed:
                    counts_observed = [
                        section_freq.get(token, 0) / len_sections_observed[i]
                        for i, section_freq in enumerate(sections_freq_observed)
                    ]
                    counts_ref = [
                        section_freq.get(token, 0) / len_sections_ref[i]
                        for i, section_freq in enumerate(sections_freq_ref)
                    ]

                    keywords_stats_file[token] = test_significance(self.main, counts_observed, counts_ref)

            # Effect Size
            for token in keywords_freq_file_observed:
                c11 = keywords_freq_file_observed.get(token, 0)
                c12 = keywords_freq_file_ref.get(token, 0)
                c21 = len_tokens_observed - c11
                c22 = len_tokens_ref - c12

                keywords_stats_file[token].append(measure_effect_size(self.main, c11, c12, c21, c22))
        else:
            for token in keywords_freq_file_observed:
                c11 = keywords_freq_file_observed.get(token, 0)
                c12 = keywords_freq_file_ref.get(token, 0)
                c21 = len_tokens_observed - c11
                c22 = len_tokens_ref - c12

                # Test Statistic, p-value & Bayes Factor
                keywords_stats_file[token] = test_significance(self.main, c11, c12, c21, c22)
                # Effect Size
                keywords_stats_file[token].append(measure_effect_size(self.main, c11, c12, c21, c22))

        self.keywords_stats_files.append(keywords_stats_file)

    if len(files) == 1:
        self.keywords_freq_files.append(self.keywords_freq_files[1])
        self.keywords_stats_files *= 2
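# A minimal sketch of the data fed into the two-sample tests above (Student's t-test /
# Mann-Whitney U): both corpora are cut into the same number of sections and the test
# compares each token's per-section absolute or relative frequencies. The section
# contents below are made up purely for illustration.
import collections

sections_observed_example = [['a', 'b', 'a'], ['a', 'c'], ['b', 'a', 'a', 'c']]
sections_ref_example = [['a', 'b'], ['c', 'c', 'b'], ['b']]

sections_freq_observed_example = [collections.Counter(s) for s in sections_observed_example]
sections_freq_ref_example = [collections.Counter(s) for s in sections_ref_example]

# Per-section relative frequencies of 'a' in each corpus; these two lists are what
# test_significance(self.main, counts_observed, counts_ref) receives in the
# 'Relative Frequency' branch above.
counts_observed_example = [
    freq['a'] / len(section)
    for freq, section in zip(sections_freq_observed_example, sections_observed_example)
]
counts_ref_example = [
    freq['a'] / len(section)
    for freq, section in zip(sections_freq_ref_example, sections_ref_example)
]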
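# A minimal sketch of the 2x2 contingency table built above for the remaining
# significance tests and for every effect-size measure: c11/c12 are a token's
# frequencies in the observed and reference corpora, c21/c22 the counts of all other
# tokens in each corpus. The log-ratio below is only an illustrative stand-in for the
# measures looked up from self.main.settings_global; all names here are hypothetical.
import math

def contingency_table_example(freq_observed, freq_ref, len_observed, len_ref, token):
    c11 = freq_observed.get(token, 0)
    c12 = freq_ref.get(token, 0)
    c21 = len_observed - c11
    c22 = len_ref - c12

    return c11, c12, c21, c22

def log_ratio_example(c11, c12, c21, c22):
    # log2 of the ratio of relative frequencies, a simple keyness-style effect size
    freq_observed = c11 / (c11 + c21)
    freq_ref = c12 / (c12 + c22)

    return math.log2(freq_observed / freq_ref) if freq_observed and freq_ref else float('nan')

# e.g. a token occurring 30 times per 10,000 observed tokens vs. 10 times per 10,000
# reference tokens has a log-ratio of log2(3) ≈ 1.58
c11, c12, c21, c22 = contingency_table_example({'risk': 30}, {'risk': 10}, 10000, 10000, 'risk')
assert round(log_ratio_example(c11, c12, c21, c22), 2) == 1.58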