コード例 #1
0
def stemm(token_words):
    """Stem a sequence of Malayalam tokens and return the stemmer's result.

    The tokens are written one per line to ``temp_file.txt`` in the current
    working directory, read back as a single string, stripped of the
    characters ``[ ] ' ,`` and newlines (each replaced by a space), and the
    resulting text is handed to ``Stemmer.stem``.

    Args:
        token_words: iterable of tokens; each one is formatted with ``%s``.

    Returns:
        Whatever ``Stemmer().stem(language='malayalam', text=...)`` returns.
    """
    print("\nStemmed Text..")

    # The temp file is kept because it is an observable side effect of this
    # function. UTF-8 is forced so that Malayalam text round-trips regardless
    # of the platform's default encoding, and ``with`` guarantees both
    # handles are closed (the read handle used to be leaked).
    with open('temp_file.txt', 'w', encoding='utf-8') as out_file:
        out_file.writelines("%s\n" % item for item in token_words)
    with open('temp_file.txt', 'r', encoding='utf-8') as in_file:
        text = in_file.read()

    # Replace every noise character with a space in a single pass.
    noise = "[]',\n"
    text = text.translate(str.maketrans(dict.fromkeys(noise, " ")))

    stemmer = Stemmer()
    return stemmer.stem(language='malayalam', text=text)
コード例 #2
0
 def __init__(self):
     """
     Initialize necessary resources.

     Reads ``data/ml_rootwords.txt`` (resolved relative to this module's
     directory), builds a ``marisa_trie.Trie`` from its stripped lines and
     stores it in ``self.dictionary``. Then creates the stemmer, inflector,
     soundex, syllabifier and n-gram helper objects.
     """
     dictionary_path = os.path.join(
         os.path.dirname(__file__), 'data/ml_rootwords.txt')
     # ``with`` closes the handle even if reading the file raises.
     with open(dictionary_path) as dictionary_file:
         lines = dictionary_file.readlines()
     # The (now closed) file object is still exposed as an attribute, as
     # before, in case other code refers to it.
     self.dictionary_file = dictionary_file

     words = [x.strip() for x in lines]
     try:
         # Byte strings are decoded to text before building the trie.
         self.dictionary = marisa_trie.Trie(
             [w.decode('utf-8') for w in words])
     except (AttributeError, UnicodeError):
         # Text strings have no ``decode`` (AttributeError) and undecodable
         # bytes raise UnicodeError; in both cases use the words as read.
         self.dictionary = marisa_trie.Trie(words)

     self.stemmer = Stemmer()
     self.inflector = inflector.Inflector(lang='ml')
     self.soundex = Soundex()
     self.syllabalizer = Syllabifier()
     self.ngrammer = Ngram()
コード例 #3
0
ファイル: mean.py プロジェクト: roseanil/SA
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
wordnet_lemmatizer = WordNetLemmatizer()

# NOTE(review): f and f2 are not closed in the visible part of this script;
# confirm they are closed further down.
f = open("output.txt", "w", encoding="utf-8")
f2 = open("test.txt", "w", encoding="utf-8")
x = "\n"
# Read the input text. UTF-8 is given explicitly so the input is decoded the
# same way the output files are encoded, and ``with`` closes the handle.
with open('malayalam.txt', 'r', encoding='utf-8') as source_file:
    mal = source_file.read()
# Echo the raw input followed by a blank line.
f.write(mal)
f.write("\n\n")

# NOTE(review): Stemmer is not imported in the visible lines of this script;
# it must already be in scope for this call to work.
stemmer = Stemmer()
sentence_mal = mal.split(".")
# Stem each word: every stem is prefixed with a space and every sentence is
# terminated with "." again. Pieces are collected and joined once instead of
# growing the string by repeated concatenation.
stem_parts = []
for sentence in sentence_mal:
    result = stemmer.stem(language='ml_IN', text=sentence)
    for word, output in result.items():
        stem_parts.append(" " + output['stem'])
    stem_parts.append(".")
strings1 = "".join(stem_parts)

#remove punctuations and stop words
strings2 = ""
punctuations = ['?', ':', '!', ',', ';']
stopwords_mal = ["ൽ", "ഉം", "മാ൪", "ആം", "കൾ"]
for i in strings1:
    if i not in punctuations and i not in stopwords_mal:
コード例 #4
0
from libindic.stemmer import Stemmer
# Stem a sample Malayalam phrase and print, for every entry of the result,
# the key followed by its 'stem' and 'inflection' values.
stemmer = Stemmer()
result = stemmer.stem(
    language='malayalam',
    text='രാമന്റെ വീട്ടിലേക്ക്',
)
for word, output in result.items():
    stem = output['stem']
    inflection = output['inflection']
    print(word, " : ", stem, " : ", inflection)