Example 1
def test_letter_count():
    textstat.set_lang("en_US")
    count = textstat.letter_count(long_test)
    count_spaces = textstat.letter_count(long_test, ignore_spaces=False)

    assert count == 1688
    assert count_spaces == 2061
Example 2
def test_char_count():
    textstat.set_lang("en_US")
    count = textstat.char_count(long_test)
    count_spaces = textstat.char_count(long_test, ignore_spaces=False)

    assert count == 1750
    assert count_spaces == 2123
Example 3
def test_lexicon_count():
    textstat.set_lang("en_US")
    count = textstat.lexicon_count(long_test)
    count_punc = textstat.lexicon_count(long_test, removepunct=False)

    assert count == 372
    assert count_punc == 376
Example 4
    def __init__(self, language):
        # Map language names to textstat codes; storing plain strings
        # (not set_lang calls) avoids eagerly switching the language
        # three times while the dict literal is built.
        switcher = {
            "dutch": "nl",
            "english": "en",
            "german": "de"
        }
        code = switcher.get(language)
        if code is None:
            raise ValueError("Invalid language: " + language)
        textstat.set_lang(code)
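The original version was a no-op selector: Python evaluates dict values eagerly, so all three set_lang calls ran while the dict literal was built (leaving "de" active), and switcher.get() merely returned a value that was discarded. A minimal demonstration of the eager-evaluation pitfall (illustrative, independent of textstat):

def trace(x):
    # Print to show when the call actually happens
    print("called with", x)
    return x

# All three calls run immediately, while the dict is being built:
d = {"a": trace(1), "b": trace(2), "c": trace(3)}
d.get("a")  # no further call; just returns the stored 1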
Example 5
import inspect
import os
import pickle as pkl
from string import punctuation

import numpy as np
import pandas as pd
import readability
import syllables
import textstat
from nltk.data import load
from tqdm import tqdm

# Project-local helpers assumed defined elsewhere: multi_data, multi_compute,
# get_hyponyms, get_hypernyms, get_POS, token_depth, num_words_at_depth


def save_other_features(data, parse_lst_path, config, path, context=True, parse=True, multi=False):
    if multi:
        if 'complexity' in data:
            data_head, data_tail = multi_data(data[['id','corpus','sentence','token','complexity','class']])
        else:
            data_head, data_tail = multi_data(data[['id','corpus','sentence','token']])
        base, ext = os.path.splitext(path)
        path_head = base+'_head'+ext
        path_tail = base+'_tail'+ext
        omit = save_other_features(data_head, parse_lst_path, config, path_head, context=context, parse=parse)
        _ = save_other_features(data_tail, parse_lst_path, config, path_tail, context=context, parse=parse)
        multi_compute(data, path, path_head, path_tail, omit)
        return
    # Features based on the target (aspect) word itself
    data['word_len'] = data['token'].str.len().to_numpy()
    data['num_syllables'] = data['token'].apply(lambda x: syllables.estimate(str(x))).to_numpy()
    data['num_hyponyms'] = data.apply(
        lambda x: len(get_hyponyms(
            x['sentence'] if context else None, x['token'],
            disambiguate=config['disambiguate'] if context else False)),
        axis=1).to_numpy()
    data['num_hypernyms'] = data.apply(
        lambda x: len(get_hypernyms(
            x['sentence'] if context else None, x['token'],
            disambiguate=config['disambiguate'] if context else False)),
        axis=1).to_numpy()
    # Capitalisation heuristics: all-caps token ~ acronym; leading capital ~ proper noun
    data['is_acronym'] = (data['token'].str.isupper() * 1).to_numpy()
    data['is_pronoun'] = (data['token'].apply(lambda x: x[0].isupper()) * 1).to_numpy()
    # Features based on the sentence context
    omit = set()
    if context:
        corpus_dummies = pd.get_dummies(data['corpus'], prefix='corpus')
        for corpus_name in corpus_dummies:
            data[corpus_name] = corpus_dummies[corpus_name]
            omit.add(corpus_name)
        # One indicator column per Penn Treebank POS tag (punctuation tags excluded)
        tagdict = load('help/tagsets/upenn_tagset.pickle')
        tags = [tag for tag in tagdict.keys() if tag[0] not in punctuation]
        POS = data.apply(lambda x: get_POS(x['sentence'], x['token']), axis=1)
        for tag in tags:
            data['POS_' + tag] = (POS == tag) * 1
        # Apply every textstat metric exposed as a bound method to the
        # sentence, skipping those unsupported here or computed separately
        textstat.set_lang("en")
        skip = {'difficult_words_list', 'set_lang', 'text_standard',
                'dale_chall_readability_score_v2', 'dale_chall_readability_score',
                'gunning_fog', 'spache_readability', 'avg_sentence_length',
                'avg_sentence_per_word', 'sentence_count', 'difficult_words',
                'is_difficult_word', 'is_easy_word', 'smog_index'}
        methods = [m for _, m in inspect.getmembers(textstat, predicate=inspect.ismethod)]
        for method in tqdm(methods):
            if method.__name__ in skip:
                continue
            data[method.__name__] = data['sentence'].apply(method).to_numpy()
            omit.add(method.__name__)
        data['SMOGIndex'] = data['sentence'].apply(
            lambda x: readability.getmeasures(
                x, lang='en')['readability grades']['SMOGIndex']).to_numpy()
        data['DaleChallIndex'] = data['sentence'].apply(
            lambda x: readability.getmeasures(
                x, lang='en')['readability grades']['DaleChallIndex']).to_numpy()
        omit.add('SMOGIndex')
        omit.add('DaleChallIndex')
        if parse and parse_lst_path is not None:
            # Load the pre-computed parse trees
            with open(parse_lst_path, 'rb') as f:
                parse_lst = pkl.load(f)
            parse_tree_depths = []
            token_depths = []
            num_words_at_depths = []
            for parse_tree, token in tqdm(zip(parse_lst, data['token'])):
                parse_tree_depths.append(parse_tree.height())
                token_depths.append(token_depth(parse_tree, token))
                num_words_at_depths.append(num_words_at_depth(parse_tree, token_depths[-1]))
            data['parse_tree_depth'] = np.array(parse_tree_depths).astype(np.int64)
            omit.add('parse_tree_depth')
            data['token_depth'] = np.array(token_depths).astype(np.int64)
            data['num_words_at_depth'] = np.array(num_words_at_depths).astype(np.int64)
    data.to_csv(path, sep='\t')
    return omit
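The parse-tree helpers token_depth and num_words_at_depth used above are project-local and not shown. A minimal sketch of what they might look like, assuming each parse_tree is an nltk.Tree (a reconstruction for illustration, not the project's actual code):

from nltk import Tree

def token_depth(parse_tree, token):
    # Depth of the first leaf equal to `token` (length of its tree position)
    for pos in parse_tree.treepositions('leaves'):
        if parse_tree[pos] == token:
            return len(pos)
    return 0

def num_words_at_depth(parse_tree, depth):
    # Number of leaves sitting at the given depth
    return sum(1 for pos in parse_tree.treepositions('leaves')
               if len(pos) == depth)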
Example 6
def test_text_standard():
    textstat.set_lang("en_US")
    standard = textstat.text_standard(long_test)

    assert standard == "9th and 10th grade"

    standard = textstat.text_standard(short_test)

    assert standard == "2nd and 3rd grade"
Example 7
def test_gunning_fog():
    textstat.set_lang("en_US")
    score = textstat.gunning_fog(long_test)

    assert score == 11.26

    # FOG-PL
    textstat.set_lang("pl_PL")
    score_pl = textstat.gunning_fog(long_test)

    assert score_pl == 10.40
Example 8
def test_changing_lang_clears_cache():
    textstat.set_lang("en_US")

    # Clear any cache and call reading ease
    textstat.flesch_reading_ease.cache_clear()
    textstat.flesch_reading_ease(short_test)

    # Check the cache has only been missed once
    assert textstat.flesch_reading_ease.cache_info().misses == 1

    # Change the language and call reading ease again
    textstat.set_lang("fr")
    textstat.flesch_reading_ease(short_test)

    # Check the cleared cache registered only the one fresh miss
    assert textstat.flesch_reading_ease.cache_info().misses == 1
Example 9
def test_lru_caching():
    textstat.set_lang("en_US")
    # Clear any cache
    textstat.sentence_count._cache.clear()
    textstat.avg_sentence_length._cache.clear()

    # Make a call that uses `sentence_count`
    textstat.avg_sentence_length(long_test)

    # Test that `sentence_count` was called
    assert textstat.sentence_count._cache.misses == 1

    # Call `avg_sentence_length` again
    textstat.avg_sentence_length(long_test)

    # Test that `sentence_count` wasn't looked up again
    assert textstat.sentence_count._cache.lookups == 1
Example 10
def test_lru_caching():
    textstat.set_lang("en_US")
    # Clear any cache
    textstat.sentence_count.cache_clear()
    textstat.avg_sentence_length.cache_clear()

    # Make a call that uses `sentence_count`
    textstat.avg_sentence_length(long_test)

    # Test that `sentence_count` was called
    assert textstat.sentence_count.cache_info().misses == 1

    # Call `avg_sentence_length` again, but clear its cache first
    textstat.avg_sentence_length.cache_clear()
    textstat.avg_sentence_length(long_test)

    # Test that `sentence_count` was served from its cache (a hit, not a recomputation)
    assert textstat.sentence_count.cache_info().hits == 1
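The cache_clear / cache_info interface used in this version is the standard functools.lru_cache one. A self-contained illustration of the same hit/miss bookkeeping, independent of textstat:

from functools import lru_cache

@lru_cache(maxsize=None)
def square(x):
    return x * x

square(3)                      # miss: computed
square(3)                      # hit: served from cache
info = square.cache_info()
assert (info.misses, info.hits) == (1, 1)

square.cache_clear()           # empties the cache and resets the counters
assert square.cache_info().misses == 0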
Example 11
def main(filepath):
    article_df = pd.read_csv(filepath, delimiter=',', encoding='utf-8')

    # Set language: English
    textstat.set_lang("en")

    temp_df = article_df.apply(lambda x: textstat_stats(x['text']), axis=1)
    textstat_df = pd.concat([article_df, temp_df], axis=1, sort=False)

    # Ensure the output directory exists
    outputPath = "../Outputs/textstat/"
    check_path(outputPath)

    # Derive the output file name from the input file name
    left = filepath.find('_')
    right = filepath.find('.csv')
    outputPath = outputPath + "textstat_" + filepath[left + 1:right] + ".csv"

    # Save the combined textstat data
    textstat_df.to_csv(outputPath, index=False, header=True)
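textstat_stats and check_path are helpers defined elsewhere in the project. A plausible sketch of the former, assuming it returns one pd.Series of readability metrics per article (the exact metric set is an assumption):

import pandas as pd
import textstat

def textstat_stats(text):
    # One row of readability metrics for a single article
    return pd.Series({
        'flesch_reading_ease': textstat.flesch_reading_ease(text),
        'flesch_kincaid_grade': textstat.flesch_kincaid_grade(text),
        'gunning_fog': textstat.gunning_fog(text),
        'smog_index': textstat.smog_index(text),
        'coleman_liau_index': textstat.coleman_liau_index(text),
    })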
Example 12
def test_polysyllabcount():
    textstat.set_lang("en_US")
    count = textstat.polysyllabcount(long_test)

    assert count == 32
Example 13
def test_rix():
    textstat.set_lang("en_US")
    score = textstat.rix(long_test)

    assert score == 5.13
Example 14
def test_lix():
    textstat.set_lang("en_US")
    score = textstat.lix(long_test)

    assert score == 45.11
Example 15
def test_difficult_words_list():
    textstat.set_lang("en_US")
    result = textstat.difficult_words_list(short_test)

    assert result == ["sunglasses"]
Example 16
def test_linsear_write_formula():
    textstat.set_lang("en_US")
    result = textstat.linsear_write_formula(long_test)

    assert result == 14.5
Example 17
def test_difficult_words():
    textstat.set_lang("en_US")
    result = textstat.difficult_words(long_test)

    assert result == 49
Example 18
def test_syllable_count():
    textstat.set_lang("en_US")
    count = textstat.syllable_count(long_test)

    assert count == 521
Example 19
def test_avg_letter_per_word():
    textstat.set_lang("en_US")
    avg = textstat.avg_letter_per_word(long_test)

    assert avg == 4.54
Example 20
def test_dale_chall_readability_score_v2():
    textstat.set_lang("en_US")
    score = textstat.dale_chall_readability_score_v2(long_test)

    assert score == 6.87
Example 21
def test_unicode_support():
    textstat.set_lang("en_US")
    # Both calls pass the same Japanese string ("thank you"),
    # once as escaped code points and once as a literal
    textstat.text_standard(
        "\u3042\u308a\u304c\u3068\u3046\u3054\u3056\u3044\u307e\u3059")

    textstat.text_standard(u"ありがとうございます")
Example 22
def test_flesch_kincaid_grade():
    textstat.set_lang("en_US")
    score = textstat.flesch_kincaid_grade(long_test)

    assert score == 10.0
Example 23
def test_flesch_reading_ease():
    textstat.set_lang("en_US")
    score = textstat.flesch_reading_ease(long_test)

    assert score == 64.75

    textstat.set_lang("de_DE")
    score = textstat.flesch_reading_ease(long_test)

    assert score == 63.1

    textstat.set_lang("es_ES")
    score = textstat.flesch_reading_ease(long_test)

    assert score == 84.37

    textstat.set_lang("fr_FR")
    score = textstat.flesch_reading_ease(long_test)

    assert score == 80.31

    textstat.set_lang("it_IT")
    score = textstat.flesch_reading_ease(long_test)

    assert score == 89.27

    textstat.set_lang("nl_NL")
    score = textstat.flesch_reading_ease(long_test)

    assert score == 61.97

    textstat.set_lang("ru_RU")
    score = textstat.flesch_reading_ease(long_test)

    assert score == 116.45
Example 24
def test_avg_sentence_per_word():
    textstat.set_lang("en_US")
    avg = textstat.avg_sentence_per_word(long_test)

    assert avg == 0.04
Example 25
def test_spache_readability():
    textstat.set_lang("en_US")
    spache = textstat.spache_readability(easy_text, False)

    assert spache == 2
Example 26
def test_smog_index():
    textstat.set_lang("en_US")
    index = textstat.smog_index(long_test)

    assert index == 11.2
Example 27
def test_default_lang_configs():
    # Config from default en_US should be used
    textstat.set_lang("en_GB")
    score = textstat.flesch_reading_ease(long_test)

    assert score == 64.75
Example 28
def test_coleman_liau_index():
    textstat.set_lang("en_US")
    index = textstat.coleman_liau_index(long_test)

    assert index == 9.35
Example 29
import calendar
import datetime
import ssl

import pandas as pd
import textstat
from detoxify import Detoxify
from sklearn.preprocessing import RobustScaler

# Disable SSL certificate verification so model downloads don't fail on
# certificate errors (avoid in production code)
ssl._create_default_https_context = ssl._create_unverified_context

# Detoxify models accept either a single string or a list of strings
week_days = [
    'day_Friday', 'day_Monday', 'day_Saturday', 'day_Sunday', 'day_Thursday',
    'day_Tuesday', 'day_Wednesday'
]

textstat.set_lang("en")
toxicity_model = Detoxify('original')


def subjective(text):
    # Count first-person words ("i", "my") as a crude subjectivity signal
    words = ["i", "my"]
    count = 0
    text = text.lower().split()
    for word in words:
        count += text.count(word)

    return count


def weekday_from_date(date):
    # Convert an ISO date string (YYYY-MM-DD) into its weekday name
    year, month, day = date.split("-")
    return calendar.day_name[
        datetime.date(int(year), int(month), int(day)).weekday()]
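A short usage sketch of the helpers above (the sample text and date are arbitrary):

text = "I think my feed is unusually toxic today"
print(subjective(text))                          # 2 first-person words
print(toxicity_model.predict(text)['toxicity'])  # Detoxify toxicity score
print(weekday_from_date("2021-03-15"))           # "Monday"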
Example 30
def test_automated_readability_index():
    textstat.set_lang("en_US")
    index = textstat.automated_readability_index(long_test)

    assert index == 12.3