def test_letter_count():
    textstat.set_lang("en_US")
    count = textstat.letter_count(long_test)
    count_spaces = textstat.letter_count(long_test, ignore_spaces=False)
    assert count == 1688
    assert count_spaces == 2061

def test_char_count():
    textstat.set_lang("en_US")
    count = textstat.char_count(long_test)
    count_spaces = textstat.char_count(long_test, ignore_spaces=False)
    assert count == 1750
    assert count_spaces == 2123

def test_lexicon_count():
    textstat.set_lang("en_US")
    count = textstat.lexicon_count(long_test)
    count_punc = textstat.lexicon_count(long_test, removepunct=False)
    assert count == 372
    assert count_punc == 376

def __init__(self, language):
    # Map human-readable language names to textstat language codes and set
    # the language once. (Putting set_lang(...) calls inside the dict literal
    # would execute every setter eagerly, leaving the last one to win.)
    switcher = {
        "dutch": "nl",
        "english": "en",
        "german": "de",
    }
    code = switcher.get(language)
    if code is None:
        raise ValueError("Invalid language: {}".format(language))
    textstat.set_lang(code)

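# For context: a minimal, self-contained sketch of what set_lang changes for
# the snippets in this file. The language config drives syllable rules and
# formula constants, so the same text can score differently per language.
# (Toy sample text; printed values are illustrative, not asserted.)
import textstat

sample = "Dit is een korte zin."

textstat.set_lang("en")            # English syllable rules and constants
print(textstat.syllable_count(sample))

textstat.set_lang("nl")            # Dutch config can change the counts
print(textstat.syllable_count(sample))
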
def save_other_features(data, parse_lst_path, config, path,
                        context=True, parse=True, multi=False):
    if multi:
        # Split multiword targets into head and tail tokens, compute
        # features for each half separately, then merge the results.
        if 'complexity' in data:
            data_head, data_tail = multi_data(
                data[['id', 'corpus', 'sentence', 'token', 'complexity', 'class']])
        else:
            data_head, data_tail = multi_data(
                data[['id', 'corpus', 'sentence', 'token']])
        base, ext = os.path.splitext(path)
        path_head = base + '_head' + ext
        path_tail = base + '_tail' + ext
        omit = save_other_features(data_head, parse_lst_path, config, path_head,
                                   context=context, parse=parse)
        _ = save_other_features(data_tail, parse_lst_path, config, path_tail,
                                context=context, parse=parse)
        multi_compute(data, path, path_head, path_tail, omit)
        return

    # based on aspect word
    data['word_len'] = data['token'].str.len().to_numpy()
    data['num_syllables'] = data['token'].apply(
        lambda x: syllables.estimate(str(x))).to_numpy()
    data['num_hyponyms'] = data.apply(
        lambda x: len(get_hyponyms(
            x['sentence'] if context else None, x['token'],
            disambiguate=config['disambiguate'] if context else False)),
        axis=1).to_numpy()
    data['num_hypernyms'] = data.apply(
        lambda x: len(get_hypernyms(
            x['sentence'] if context else None, x['token'],
            disambiguate=config['disambiguate'] if context else False)),
        axis=1).to_numpy()
    data['is_acronym'] = (data['token'].str.isupper() * 1).to_numpy()
    # A capitalized first letter is used as a proxy here.
    data['is_pronoun'] = (data['token'].apply(lambda x: x[0].isupper()) * 1).to_numpy()

    # based on context
    omit = set()
    if context:
        corpus_dummies = pd.get_dummies(data['corpus'], prefix='corpus')
        for corpus_name in corpus_dummies:
            data[corpus_name] = corpus_dummies[corpus_name]
            omit.add(corpus_name)

        # One-hot POS tags from the Penn Treebank tagset, skipping punctuation tags.
        tagdict = load('help/tagsets/upenn_tagset.pickle')
        tags = [tag for tag in tagdict.keys() if tag[0] not in punctuation]
        POS = data.apply(lambda x: get_POS(x['sentence'], x['token']), axis=1)
        for tag in tags:
            data['POS_' + tag] = (POS == tag) * 1

        # Apply every textstat metric to the sentence, skipping non-numeric
        # helpers and metrics computed elsewhere.
        funcs = ["textstat." + func[0]
                 for func in inspect.getmembers(textstat, predicate=inspect.ismethod)]
        for elem in tqdm(funcs):
            method = eval(elem)
            if method.__name__ in [
                    'difficult_words_list', 'set_lang', 'text_standard',
                    'dale_chall_readability_score_v2', 'dale_chall_readability_score',
                    'gunning_fog', 'spache_readability', 'avg_sentence_length',
                    'avg_sentence_per_word', 'sentence_count', 'difficult_words',
                    'is_difficult_word', 'is_easy_word', 'smog_index']:
                continue
            textstat.set_lang("en")
            data[method.__name__] = data['sentence'].apply(
                lambda x: method(x)).to_numpy()
            omit.add(method.__name__)

        # SMOG and Dale-Chall come from the readability package instead.
        data['SMOGIndex'] = data['sentence'].apply(
            lambda x: readability.getmeasures(x, lang='en')
            ['readability grades']['SMOGIndex']).to_numpy()
        data['DaleChallIndex'] = data['sentence'].apply(
            lambda x: readability.getmeasures(x, lang='en')
            ['readability grades']['DaleChallIndex']).to_numpy()
        omit.add('SMOGIndex')
        omit.add('DaleChallIndex')

    if parse and parse_lst_path is not None:
        # Depth-based features from precomputed constituency parse trees.
        parse_lst = pkl.load(open(parse_lst_path, 'rb'))
        parse_tree_depths = []
        token_depths = []
        num_words_at_depths = []
        for parse_tree, token in tqdm(zip(parse_lst, data['token'])):
            parse_tree_depths.append(parse_tree.height())
            token_depths.append(token_depth(parse_tree, token))
            num_words_at_depths.append(num_words_at_depth(parse_tree, token_depths[-1]))
        data['parse_tree_depth'] = np.array(parse_tree_depths).astype(np.int64)
        omit.add('parse_tree_depth')
        data['token_depth'] = np.array(token_depths).astype(np.int64)
        data['num_words_at_depth'] = np.array(num_words_at_depths).astype(np.int64)

    data.to_csv(path, sep='\t')
    return omit

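# A hedged usage sketch for save_other_features, assuming its helpers
# (multi_data, get_hyponyms, get_POS, ...) are importable from this module.
# The DataFrame columns match the ones the function reads; the output file
# name and the config shape are hypothetical.
import pandas as pd

df = pd.DataFrame({
    'id': [0, 1],
    'corpus': ['wiki', 'news'],
    'sentence': ['The cat sat on the mat.', 'Readability varies by corpus.'],
    'token': ['cat', 'Readability'],
})
config = {'disambiguate': False}   # assumed config shape

# parse=False with parse_lst_path=None skips the parse-tree features.
omitted = save_other_features(df, None, config, 'features.tsv',
                              context=True, parse=False)
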
def test_text_standard():
    textstat.set_lang("en_US")
    standard = textstat.text_standard(long_test)
    assert standard == "9th and 10th grade"
    standard = textstat.text_standard(short_test)
    assert standard == "2nd and 3rd grade"

def test_gunning_fog():
    textstat.set_lang("en_US")
    score = textstat.gunning_fog(long_test)
    assert score == 11.26

    # FOG-PL
    textstat.set_lang("pl_PL")
    score_pl = textstat.gunning_fog(long_test)
    assert score_pl == 10.40

def test_changing_lang_clears_cache():
    textstat.set_lang("en_US")
    # Clear any cache and call reading ease
    textstat.flesch_reading_ease.cache_clear()
    textstat.flesch_reading_ease(short_test)
    # Check the cache has only been missed once
    assert textstat.flesch_reading_ease.cache_info().misses == 1
    # Change the language and recall reading ease
    textstat.set_lang("fr")
    textstat.flesch_reading_ease(short_test)
    # set_lang should have cleared the cache (resetting its counters), so
    # this call registers as a single fresh miss rather than a cache hit
    assert textstat.flesch_reading_ease.cache_info().misses == 1

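# A minimal sketch of the pattern the test above exercises, with toy
# stand-ins (reading_ease and set_lang below are NOT textstat's internals):
# a setter that invalidates a language-dependent lru_cache. cache_clear()
# also resets the hit/miss counters, which is why misses == 1 holds both times.
import functools

_lang = "en_US"

@functools.lru_cache(maxsize=128)
def reading_ease(text):
    return len(text) / 10.0       # stand-in for a language-dependent metric

def set_lang(lang):
    global _lang
    _lang = lang
    reading_ease.cache_clear()    # cached results depend on the language

reading_ease("short text")
assert reading_ease.cache_info().misses == 1
set_lang("fr")
reading_ease("short text")
assert reading_ease.cache_info().misses == 1   # counters reset: fresh miss
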
def test_lru_caching():
    textstat.set_lang("en_US")
    # Clear any cache
    textstat.sentence_count._cache.clear()
    textstat.avg_sentence_length._cache.clear()
    # Make a call that uses `sentence_count`
    textstat.avg_sentence_length(long_test)
    # Test that `sentence_count` was called
    assert textstat.sentence_count._cache.misses == 1
    # Call `avg_sentence_length` again
    textstat.avg_sentence_length(long_test)
    # Test that `sentence_count` wasn't called again
    assert textstat.sentence_count._cache.lookups == 1

def test_lru_caching():
    textstat.set_lang("en_US")
    # Clear any cache
    textstat.sentence_count.cache_clear()
    textstat.avg_sentence_length.cache_clear()
    # Make a call that uses `sentence_count`
    textstat.avg_sentence_length(long_test)
    # Test that `sentence_count` was called
    assert textstat.sentence_count.cache_info().misses == 1
    # Call `avg_sentence_length` again, but clear its cache first
    textstat.avg_sentence_length.cache_clear()
    textstat.avg_sentence_length(long_test)
    # Test that `sentence_count` was served from its cache, not called again
    assert textstat.sentence_count.cache_info().hits == 1

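# For reference, a self-contained sketch of the nested-cache behavior both
# test_lru_caching variants assert, with toy stand-ins for the two textstat
# functions: clearing only the outer cache leaves the inner one warm.
import functools

@functools.lru_cache(maxsize=128)
def sentence_count(text):
    return text.count(".") or 1

@functools.lru_cache(maxsize=128)
def avg_sentence_length(text):
    return len(text.split()) / sentence_count(text)

avg_sentence_length("One two. Three.")   # misses both caches
avg_sentence_length.cache_clear()        # clear only the outer cache
avg_sentence_length("One two. Three.")   # recomputed; sentence_count hits
assert sentence_count.cache_info().hits == 1
assert sentence_count.cache_info().misses == 1
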
def main(filepath):
    article_df = pd.read_csv(filepath, delimiter=',', encoding='utf-8')

    # Set language: English
    textstat.set_lang("en")

    # Get text stat data
    temp_df = article_df.apply(lambda x: textstat_stats(x['text']), axis=1)
    textstat_df = pd.concat([article_df, temp_df], axis=1, sort=False)

    # Save output
    outputPath = "../Outputs/textstat/"
    check_path(outputPath)
    left = filepath.find('_')
    right = filepath.find('.csv')
    outputPath = outputPath + "textstat_" + str(filepath[left + 1:right]) + ".csv"
    textstat_df.to_csv(outputPath, index=False, header=True)

def test_polysyllabcount():
    textstat.set_lang("en_US")
    count = textstat.polysyllabcount(long_test)
    assert count == 32

def test_rix():
    textstat.set_lang("en_US")
    score = textstat.rix(long_test)
    assert score == 5.13

def test_lix():
    textstat.set_lang("en_US")
    score = textstat.lix(long_test)
    assert score == 45.11

def test_difficult_words_list():
    textstat.set_lang("en_US")
    result = textstat.difficult_words_list(short_test)
    assert result == ["sunglasses"]

def test_linsear_write_formula():
    textstat.set_lang("en_US")
    result = textstat.linsear_write_formula(long_test)
    assert result == 14.5

def test_difficult_words():
    textstat.set_lang("en_US")
    result = textstat.difficult_words(long_test)
    assert result == 49

def test_syllable_count():
    textstat.set_lang("en_US")
    count = textstat.syllable_count(long_test)
    assert count == 521

def test_avg_letter_per_word():
    textstat.set_lang("en_US")
    avg = textstat.avg_letter_per_word(long_test)
    assert avg == 4.54

def test_dale_chall_readability_score_v2():
    textstat.set_lang("en_US")
    score = textstat.dale_chall_readability_score_v2(long_test)
    assert score == 6.87

def test_unicode_support():
    textstat.set_lang("en_US")
    textstat.text_standard(
        "\u3042\u308a\u304c\u3068\u3046\u3054\u3056\u3044\u307e\u3059")
    textstat.text_standard(u"ありがとうございます")

def test_flesch_kincaid_grade():
    textstat.set_lang("en_US")
    score = textstat.flesch_kincaid_grade(long_test)
    assert score == 10.0

def test_flesch_reading_ease():
    textstat.set_lang("en_US")
    score = textstat.flesch_reading_ease(long_test)
    assert score == 64.75

    textstat.set_lang("de_DE")
    score = textstat.flesch_reading_ease(long_test)
    assert score == 63.1

    textstat.set_lang("es_ES")
    score = textstat.flesch_reading_ease(long_test)
    assert score == 84.37

    textstat.set_lang("fr_FR")
    score = textstat.flesch_reading_ease(long_test)
    assert score == 80.31

    textstat.set_lang("it_IT")
    score = textstat.flesch_reading_ease(long_test)
    assert score == 89.27

    textstat.set_lang("nl_NL")
    score = textstat.flesch_reading_ease(long_test)
    assert score == 61.97

    textstat.set_lang("ru_RU")
    score = textstat.flesch_reading_ease(long_test)
    assert score == 116.45

def test_avg_sentence_per_word():
    textstat.set_lang("en_US")
    avg = textstat.avg_sentence_per_word(long_test)
    assert avg == 0.04

def test_spache_readability():
    textstat.set_lang("en_US")
    spache = textstat.spache_readability(easy_text, False)
    assert spache == 2

def test_smog_index():
    textstat.set_lang("en_US")
    index = textstat.smog_index(long_test)
    assert index == 11.2

def test_default_lang_configs():
    # Config from default en_US should be used
    textstat.set_lang("en_GB")
    score = textstat.flesch_reading_ease(long_test)
    assert score == 64.75

def test_coleman_liau_index():
    textstat.set_lang("en_US")
    index = textstat.coleman_liau_index(long_test)
    assert index == 9.35

import ssl

import pandas as pd
import datetime
import calendar
import textstat
from detoxify import Detoxify
from sklearn.preprocessing import RobustScaler

# Workaround for environments with broken certificate chains
# (needed so Detoxify can download its model weights).
ssl._create_default_https_context = ssl._create_unverified_context

week_days = [
    'day_Friday', 'day_Monday', 'day_Saturday', 'day_Sunday',
    'day_Thursday', 'day_Tuesday', 'day_Wednesday'
]

textstat.set_lang("en")
# Each Detoxify model takes in either a string or a list of strings.
toxicity_model = Detoxify('original')


def subjective(text):
    # Count first-person markers as a crude subjectivity signal.
    words = ["i", "my"]
    count = 0
    text = text.lower().split()
    for word in words:
        count += text.count(word)
    return count


def weekday_from_date(date):
    # Expects an ISO-style "YYYY-MM-DD" string.
    year, month, day = date.split("-")
    return calendar.day_name[datetime.date(int(year), int(month), int(day)).weekday()]

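# A hedged example of how weekday_from_date lines up with the day_* dummy
# columns above; the reindex against week_days is an assumption about how
# the features are later assembled.
dates = pd.Series(["2021-03-01", "2021-03-06"])   # a Monday and a Saturday
days = dates.apply(weekday_from_date)
dummies = pd.get_dummies(days, prefix="day").reindex(columns=week_days,
                                                     fill_value=0)
print(dummies)
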
def test_automated_readability_index():
    textstat.set_lang("en_US")
    index = textstat.automated_readability_index(long_test)
    assert index == 12.3