Esempio n. 1
0
def test_stopwords():
    from b4msa.textmodel import TextModel
    tm = TextModel(lang='es', del_dup=False)
    text = tm.text_transformations('como esta mi carro')
    print(text)
    text1 = tm.lang.transform(text, stopwords='delete')
    print(text1)
    assert text1 == '~carro~'
    text1 = tm.lang.transform(text, stopwords='group')
    print(text1)
    assert text1 == '~_sw~_sw~_sw~carro~'
Esempio n. 2
0
def test_stopwords():
    from b4msa.textmodel import TextModel
    tm = TextModel(lang='es', del_dup=False)
    text = tm.text_transformations('como esta mi carro')
    print(text)
    text1 = tm.lang.transform(text, stopwords='delete')
    print(text1)
    assert text1 == '~carro~'
    text1 = tm.lang.transform(text, stopwords='group')
    print(text1)
    assert text1 == '~_sw~_sw~_sw~carro~'
Esempio n. 3
0
text = 'I like playing football'
output = []
for word in text.split():
    w = stemmer.stem(word)
    output.append(w)
output = " ".join(output)
output

text = 'I like playing football on Saturday'
words = text.split()
n = 3
n_grams = []
for a in zip(*[words[i:] for i in range(n)]):
    n_grams.append("~".join(a))
n_grams

text = 'I like playing'
q = 4
q_grams = []
for a in zip(*[text[i:] for i in range(q)]):
    q_grams.append("".join(a))
q_grams

text = 'I like playing football with @mgraffg'
tm = TextModel(token_list=[-1, 5],
               lang='english',
               usr_option=OPTION_GROUP,
               stemming=True)
tm.text_transformations(text)

tm.tokenize(text)