def test_letter_count():
    textstat.set_lang("en_US")
    count = textstat.letter_count(long_test)
    count_spaces = textstat.letter_count(long_test, ignore_spaces=False)
    assert count == 1688
    assert count_spaces == 2061

def test_char_count():
    textstat.set_lang("en_US")
    count = textstat.char_count(long_test)
    count_spaces = textstat.char_count(long_test, ignore_spaces=False)
    assert count == 1750
    assert count_spaces == 2123

def test_lexicon_count():
    textstat.set_lang("en_US")
    count = textstat.lexicon_count(long_test)
    count_punc = textstat.lexicon_count(long_test, removepunct=False)
    assert count == 372
    assert count_punc == 376

def __init__(self, language):
    # Map human-readable language names to textstat language codes and set
    # the language once. (Putting set_lang(...) calls inside the dict literal
    # would execute every setter eagerly, leaving the last one to win.)
    switcher = {
        "dutch": "nl",
        "english": "en",
        "german": "de",
    }
    code = switcher.get(language)
    if code is None:
        raise ValueError("Invalid language: {}".format(language))
    textstat.set_lang(code)

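# For context: a minimal, self-contained sketch of what set_lang changes for
# the snippets in this file. The language config drives syllable rules and
# formula constants, so the same text can score differently per language.
# (Toy sample text; printed values are illustrative, not asserted.)
import textstat

sample = "Dit is een korte zin."

textstat.set_lang("en")            # English syllable rules and constants
print(textstat.syllable_count(sample))

textstat.set_lang("nl")            # Dutch config can change the counts
print(textstat.syllable_count(sample))
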
def save_other_features(data, parse_lst_path, config, path,
                        context=True, parse=True, multi=False):
    if multi:
        # Split multiword targets into head and tail tokens, compute
        # features for each half separately, then merge the results.
        if 'complexity' in data:
            data_head, data_tail = multi_data(
                data[['id', 'corpus', 'sentence', 'token', 'complexity', 'class']])
        else:
            data_head, data_tail = multi_data(
                data[['id', 'corpus', 'sentence', 'token']])
        base, ext = os.path.splitext(path)
        path_head = base + '_head' + ext
        path_tail = base + '_tail' + ext
        omit = save_other_features(data_head, parse_lst_path, config, path_head,
                                   context=context, parse=parse)
        _ = save_other_features(data_tail, parse_lst_path, config, path_tail,
                                context=context, parse=parse)
        multi_compute(data, path, path_head, path_tail, omit)
        return

    # based on aspect word
    data['word_len'] = data['token'].str.len().to_numpy()
    data['num_syllables'] = data['token'].apply(
        lambda x: syllables.estimate(str(x))).to_numpy()
    data['num_hyponyms'] = data.apply(
        lambda x: len(get_hyponyms(
            x['sentence'] if context else None, x['token'],
            disambiguate=config['disambiguate'] if context else False)),
        axis=1).to_numpy()
    data['num_hypernyms'] = data.apply(
        lambda x: len(get_hypernyms(
            x['sentence'] if context else None, x['token'],
            disambiguate=config['disambiguate'] if context else False)),
        axis=1).to_numpy()
    data['is_acronym'] = (data['token'].str.isupper() * 1).to_numpy()
    # A capitalized first letter is used as a proxy here.
    data['is_pronoun'] = (data['token'].apply(lambda x: x[0].isupper()) * 1).to_numpy()

    # based on context
    omit = set()
    if context:
        corpus_dummies = pd.get_dummies(data['corpus'], prefix='corpus')
        for corpus_name in corpus_dummies:
            data[corpus_name] = corpus_dummies[corpus_name]
            omit.add(corpus_name)

        # One-hot POS tags from the Penn Treebank tagset, skipping punctuation tags.
        tagdict = load('help/tagsets/upenn_tagset.pickle')
        tags = [tag for tag in tagdict.keys() if tag[0] not in punctuation]
        POS = data.apply(lambda x: get_POS(x['sentence'], x['token']), axis=1)
        for tag in tags:
            data['POS_' + tag] = (POS == tag) * 1

        # Apply every textstat metric to the sentence, skipping non-numeric
        # helpers and metrics computed elsewhere.
        funcs = ["textstat." + func[0]
                 for func in inspect.getmembers(textstat, predicate=inspect.ismethod)]
        for elem in tqdm(funcs):
            method = eval(elem)
            if method.__name__ in [
                    'difficult_words_list', 'set_lang', 'text_standard',
                    'dale_chall_readability_score_v2', 'dale_chall_readability_score',
                    'gunning_fog', 'spache_readability', 'avg_sentence_length',
                    'avg_sentence_per_word', 'sentence_count', 'difficult_words',
                    'is_difficult_word', 'is_easy_word', 'smog_index']:
                continue
            textstat.set_lang("en")
            data[method.__name__] = data['sentence'].apply(
                lambda x: method(x)).to_numpy()
            omit.add(method.__name__)

        # SMOG and Dale-Chall come from the readability package instead.
        data['SMOGIndex'] = data['sentence'].apply(
            lambda x: readability.getmeasures(x, lang='en')
            ['readability grades']['SMOGIndex']).to_numpy()
        data['DaleChallIndex'] = data['sentence'].apply(
            lambda x: readability.getmeasures(x, lang='en')
            ['readability grades']['DaleChallIndex']).to_numpy()
        omit.add('SMOGIndex')
        omit.add('DaleChallIndex')

    if parse and parse_lst_path is not None:
        # Depth-based features from precomputed constituency parse trees.
        parse_lst = pkl.load(open(parse_lst_path, 'rb'))
        parse_tree_depths = []
        token_depths = []
        num_words_at_depths = []
        for parse_tree, token in tqdm(zip(parse_lst, data['token'])):
            parse_tree_depths.append(parse_tree.height())
            token_depths.append(token_depth(parse_tree, token))
            num_words_at_depths.append(num_words_at_depth(parse_tree, token_depths[-1]))
        data['parse_tree_depth'] = np.array(parse_tree_depths).astype(np.int64)
        omit.add('parse_tree_depth')
        data['token_depth'] = np.array(token_depths).astype(np.int64)
        data['num_words_at_depth'] = np.array(num_words_at_depths).astype(np.int64)

    data.to_csv(path, sep='\t')
    return omit

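# A hedged usage sketch for save_other_features, assuming its helpers
# (multi_data, get_hyponyms, get_POS, ...) are importable from this module.
# The DataFrame columns match the ones the function reads; the output file
# name and the config shape are hypothetical.
import pandas as pd

df = pd.DataFrame({
    'id': [0, 1],
    'corpus': ['wiki', 'news'],
    'sentence': ['The cat sat on the mat.', 'Readability varies by corpus.'],
    'token': ['cat', 'Readability'],
})
config = {'disambiguate': False}   # assumed config shape

# parse=False with parse_lst_path=None skips the parse-tree features.
omitted = save_other_features(df, None, config, 'features.tsv',
                              context=True, parse=False)
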
def test_text_standard():
    textstat.set_lang("en_US")
    standard = textstat.text_standard(long_test)
    assert standard == "9th and 10th grade"
    standard = textstat.text_standard(short_test)
    assert standard == "2nd and 3rd grade"

def test_gunning_fog():
    textstat.set_lang("en_US")
    score = textstat.gunning_fog(long_test)
    assert score == 11.26

    # FOG-PL
    textstat.set_lang("pl_PL")
    score_pl = textstat.gunning_fog(long_test)
    assert score_pl == 10.40

def test_changing_lang_clears_cache():
    textstat.set_lang("en_US")
    # Clear any cache and call reading ease
    textstat.flesch_reading_ease.cache_clear()
    textstat.flesch_reading_ease(short_test)
    # Check the cache has only been missed once
    assert textstat.flesch_reading_ease.cache_info().misses == 1
    # Change the language and recall reading ease
    textstat.set_lang("fr")
    textstat.flesch_reading_ease(short_test)
    # set_lang should have cleared the cache (resetting its counters), so
    # this call registers as a single fresh miss rather than a cache hit
    assert textstat.flesch_reading_ease.cache_info().misses == 1

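# A minimal sketch of the pattern the test above exercises, with toy
# stand-ins (reading_ease and set_lang below are NOT textstat's internals):
# a setter that invalidates a language-dependent lru_cache. cache_clear()
# also resets the hit/miss counters, which is why misses == 1 holds both times.
import functools

_lang = "en_US"

@functools.lru_cache(maxsize=128)
def reading_ease(text):
    return len(text) / 10.0       # stand-in for a language-dependent metric

def set_lang(lang):
    global _lang
    _lang = lang
    reading_ease.cache_clear()    # cached results depend on the language

reading_ease("short text")
assert reading_ease.cache_info().misses == 1
set_lang("fr")
reading_ease("short text")
assert reading_ease.cache_info().misses == 1   # counters reset: fresh miss
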
def test_lru_caching():
    textstat.set_lang("en_US")
    # Clear any cache
    textstat.sentence_count._cache.clear()
    textstat.avg_sentence_length._cache.clear()
    # Make a call that uses `sentence_count`
    textstat.avg_sentence_length(long_test)
    # Test that `sentence_count` was called
    assert textstat.sentence_count._cache.misses == 1
    # Call `avg_sentence_length` again
    textstat.avg_sentence_length(long_test)
    # Test that `sentence_count` wasn't called again
    assert textstat.sentence_count._cache.lookups == 1

def test_lru_caching():
    textstat.set_lang("en_US")
    # Clear any cache
    textstat.sentence_count.cache_clear()
    textstat.avg_sentence_length.cache_clear()
    # Make a call that uses `sentence_count`
    textstat.avg_sentence_length(long_test)
    # Test that `sentence_count` was called
    assert textstat.sentence_count.cache_info().misses == 1
    # Call `avg_sentence_length` again, but clear its cache first
    textstat.avg_sentence_length.cache_clear()
    textstat.avg_sentence_length(long_test)
    # Test that `sentence_count` was served from its cache, not called again
    assert textstat.sentence_count.cache_info().hits == 1

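# For reference, a self-contained sketch of the nested-cache behavior both
# test_lru_caching variants assert, with toy stand-ins for the two textstat
# functions: clearing only the outer cache leaves the inner one warm.
import functools

@functools.lru_cache(maxsize=128)
def sentence_count(text):
    return text.count(".") or 1

@functools.lru_cache(maxsize=128)
def avg_sentence_length(text):
    return len(text.split()) / sentence_count(text)

avg_sentence_length("One two. Three.")   # misses both caches
avg_sentence_length.cache_clear()        # clear only the outer cache
avg_sentence_length("One two. Three.")   # recomputed; sentence_count hits
assert sentence_count.cache_info().hits == 1
assert sentence_count.cache_info().misses == 1
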
def main(filepath):
    article_df = pd.read_csv(filepath, delimiter=',', encoding='utf-8')

    # Set language: English
    textstat.set_lang("en")

    # Get text stat data
    temp_df = article_df.apply(lambda x: textstat_stats(x['text']), axis=1)
    textstat_df = pd.concat([article_df, temp_df], axis=1, sort=False)

    # Save output
    outputPath = "../Outputs/textstat/"
    check_path(outputPath)
    left = filepath.find('_')
    right = filepath.find('.csv')
    outputPath = outputPath + "textstat_" + str(filepath[left + 1:right]) + ".csv"
    textstat_df.to_csv(outputPath, index=False, header=True)

def test_polysyllabcount():
    textstat.set_lang("en_US")
    count = textstat.polysyllabcount(long_test)
    assert count == 32

def test_rix():
    textstat.set_lang("en_US")
    score = textstat.rix(long_test)
    assert score == 5.13

def test_lix():
    textstat.set_lang("en_US")
    score = textstat.lix(long_test)
    assert score == 45.11

def test_difficult_words_list():
    textstat.set_lang("en_US")
    result = textstat.difficult_words_list(short_test)
    assert result == ["sunglasses"]

def test_linsear_write_formula():
    textstat.set_lang("en_US")
    result = textstat.linsear_write_formula(long_test)
    assert result == 14.5

def test_difficult_words():
    textstat.set_lang("en_US")
    result = textstat.difficult_words(long_test)
    assert result == 49

def test_syllable_count():
    textstat.set_lang("en_US")
    count = textstat.syllable_count(long_test)
    assert count == 521

def test_avg_letter_per_word():
    textstat.set_lang("en_US")
    avg = textstat.avg_letter_per_word(long_test)
    assert avg == 4.54

def test_dale_chall_readability_score_v2():
    textstat.set_lang("en_US")
    score = textstat.dale_chall_readability_score_v2(long_test)
    assert score == 6.87

def test_unicode_support():
    textstat.set_lang("en_US")
    textstat.text_standard(
        "\u3042\u308a\u304c\u3068\u3046\u3054\u3056\u3044\u307e\u3059")
    textstat.text_standard(u"ありがとうございます")

def test_flesch_kincaid_grade():
    textstat.set_lang("en_US")
    score = textstat.flesch_kincaid_grade(long_test)
    assert score == 10.0

def test_flesch_reading_ease():
    textstat.set_lang("en_US")
    score = textstat.flesch_reading_ease(long_test)
    assert score == 64.75

    textstat.set_lang("de_DE")
    score = textstat.flesch_reading_ease(long_test)
    assert score == 63.1

    textstat.set_lang("es_ES")
    score = textstat.flesch_reading_ease(long_test)
    assert score == 84.37

    textstat.set_lang("fr_FR")
    score = textstat.flesch_reading_ease(long_test)
    assert score == 80.31

    textstat.set_lang("it_IT")
    score = textstat.flesch_reading_ease(long_test)
    assert score == 89.27

    textstat.set_lang("nl_NL")
    score = textstat.flesch_reading_ease(long_test)
    assert score == 61.97

    textstat.set_lang("ru_RU")
    score = textstat.flesch_reading_ease(long_test)
    assert score == 116.45

def test_avg_sentence_per_word():
    textstat.set_lang("en_US")
    avg = textstat.avg_sentence_per_word(long_test)
    assert avg == 0.04

def test_spache_readability():
    textstat.set_lang("en_US")
    spache = textstat.spache_readability(easy_text, False)
    assert spache == 2

def test_smog_index():
    textstat.set_lang("en_US")
    index = textstat.smog_index(long_test)
    assert index == 11.2

def test_default_lang_configs():
    # Config from default en_US should be used
    textstat.set_lang("en_GB")
    score = textstat.flesch_reading_ease(long_test)
    assert score == 64.75

def test_coleman_liau_index():
    textstat.set_lang("en_US")
    index = textstat.coleman_liau_index(long_test)
    assert index == 9.35

import ssl

import pandas as pd
import datetime
import calendar
import textstat
from detoxify import Detoxify
from sklearn.preprocessing import RobustScaler

# Workaround for environments with broken certificate chains
# (needed so Detoxify can download its model weights).
ssl._create_default_https_context = ssl._create_unverified_context

week_days = [
    'day_Friday', 'day_Monday', 'day_Saturday', 'day_Sunday',
    'day_Thursday', 'day_Tuesday', 'day_Wednesday'
]

textstat.set_lang("en")
# Each Detoxify model takes in either a string or a list of strings.
toxicity_model = Detoxify('original')


def subjective(text):
    # Count first-person markers as a crude subjectivity signal.
    words = ["i", "my"]
    count = 0
    text = text.lower().split()
    for word in words:
        count += text.count(word)
    return count


def weekday_from_date(date):
    # Expects an ISO-style "YYYY-MM-DD" string.
    year, month, day = date.split("-")
    return calendar.day_name[datetime.date(int(year), int(month), int(day)).weekday()]

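# A hedged example of how weekday_from_date lines up with the day_* dummy
# columns above; the reindex against week_days is an assumption about how
# the features are later assembled.
dates = pd.Series(["2021-03-01", "2021-03-06"])   # a Monday and a Saturday
days = dates.apply(weekday_from_date)
dummies = pd.get_dummies(days, prefix="day").reindex(columns=week_days,
                                                     fill_value=0)
print(dummies)
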
def test_automated_readability_index():
    textstat.set_lang("en_US")
    index = textstat.automated_readability_index(long_test)
    assert index == 12.3