Ejemplo n.º 1
0
def get_bigram_list(full_sentence_list, stem=False):
    """Detect bigram phrases in a list of sentences and count them.

    Trains a gensim ``Phrases`` model on the whitespace-tokenized
    sentences, collects every merged bigram token (exactly one ``_``
    delimiter), and returns the occurrence count of each unique bigram.

    Args:
        full_sentence_list: iterable of sentence strings.
        stem: when True, each bigram token is stemmed with ``RafiStemmer``
            before counting (so counts are per stemmed form).

    Returns:
        list of ``[bigram, count]`` pairs, one per unique bigram.
    """
    from collections import Counter

    sentence_stream = [doc.split(" ") for doc in full_sentence_list]
    stemmer = RafiStemmer()

    # min_count/threshold control how aggressively word pairs are merged.
    bigram = Phrases(sentence_stream, min_count=2, threshold=5, delimiter=b'_')
    bigram_phraser = Phraser(bigram)

    bigram_list = []
    for sent in sentence_stream:
        for token in bigram_phraser[sent]:
            # A merged bigram contains exactly one delimiter character.
            if token.count('_') == 1:
                bigram_list.append(stemmer.stem_word(token) if stem else token)

    # Counter counts in one O(n) pass; the original set() + list.count()
    # loop was O(n^2) over the collected bigrams.
    return [[token, count] for token, count in Counter(bigram_list).items()]
Ejemplo n.º 2
0
def stemmer(words):
    """Return a list with the RafiStemmer stem of every word in *words*."""
    rafi = RafiStemmer()
    return [rafi.stem_word(word) for word in words]
Ejemplo n.º 3
0
def preprocess_documents(doc):
    """Tokenize, filter, and stem a single document string.

    Splits the punctuation-stripped document on whitespace, drops stop
    words and tokens shorter than 3 characters, and keeps the RafiStemmer
    stem of each surviving token when that stem is at least 2 characters.

    Args:
        doc: the document text; any non-str input yields an empty list.

    Returns:
        list of stemmed tokens.
    """
    stemmer = RafiStemmer()
    stop_words = load_stop_word()
    doc_token = []

    if isinstance(doc, str):
        for token in punctuation_remover(doc).split():
            if token not in stop_words and len(token) >= 3:
                # Stem once and reuse — the original called stem_word twice
                # per token (once for the length check, once to append).
                stemmed = stemmer.stem_word(token)
                if len(stemmed) >= 2:
                    doc_token.append(stemmed)

    return doc_token
Ejemplo n.º 4
0
def preprocess_documents(doc):
    """Tokenize, filter, and stem a single document string.

    Splits the punctuation-stripped document on whitespace, drops stop
    words and tokens shorter than 3 characters, and keeps the RafiStemmer
    stem of each surviving token when that stem is at least 2 characters.

    Args:
        doc: the document text; any non-str input yields an empty list.

    Returns:
        list of stemmed tokens.

    Note:
        The original body carried a large unreachable section after the
        ``return`` (it referenced undefined names ``dictionary`` and
        ``bow_corpus``); that dead code has been removed.
    """
    stemmer = RafiStemmer()
    stop_words = load_stop_word()
    doc_token = []

    if isinstance(doc, str):
        for token in punctuation_remover(doc).split():
            if token not in stop_words and len(token) >= 3:
                # Stem once and reuse — the original called stem_word twice
                # per token (once for the length check, once to append).
                stemmed = stemmer.stem_word(token)
                if len(stemmed) >= 2:
                    doc_token.append(stemmed)

    return doc_token
Ejemplo n.º 5
0
from bengali_stemmer.rafikamal2014 import RafiStemmer

# Stem a single Bengali word and print the resulting stem.
bengali_stemmer = RafiStemmer()
print(bengali_stemmer.stem_word('বাংলায়'))
Ejemplo n.º 6
0
        for each_bigram in tokens_:
            if '_' in each_bigram:
                #print(each_bigram)
                bigram_list.append(each_bigram)

        # for each_trigram in trigram_tokens_:
        #     #print(each_trigram)
        #     if each_trigram.count('_') == 2:
        #         trigram_list.append(each_trigram)

    #print(len(bigram_list))

    stemmed_bigram_count_list = []

    stemmer = RafiStemmer()
    stemmed_bigram_list = [
        stemmer.stem_word(each_non_stemmed_bigram)
        for each_non_stemmed_bigram in bigram_list
    ]
    for each_unique_stemmed_bigram in set(stemmed_bigram_list):
        stemmed_bigram_count_list.append([
            each_unique_stemmed_bigram,
            stemmed_bigram_list.count(each_unique_stemmed_bigram)
        ])

    bigram_count_list = []

    for each_unique_bigram in set(bigram_list):
        bigram_count_list.append(
            [each_unique_bigram,
Ejemplo n.º 7
0
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

from bengali_stemmer.rafikamal2014 import RafiStemmer



"""
Stem is the form of a word before any inflectional affixes are added
it is like word

"""



# Bengali sample words to stem.
example_words = ["করাল", "কইরালাইতেছে", "নাম্বারটা", "উচ্চারন", "করিজাইও"]

# NOTE(review): the original also built `ps = PorterStemmer()` but never
# used it — only RafiStemmer stems the Bengali words, so the unused
# instance is not created here.
stemmer = RafiStemmer()

for word in example_words:
    print(stemmer.stem_word(word))