def stemm(token_words):
    """Stem a sequence of Malayalam tokens with ``Stemmer``.

    The tokens are written one per line to ``temp_file.txt`` in the current
    working directory (a side effect kept from the original implementation),
    read back as a single string, cleaned of list-formatting noise and passed
    to ``Stemmer.stem``.

    Args:
        token_words: Iterable of tokens; each one is formatted with ``%s``.

    Returns:
        The value returned by
        ``Stemmer().stem(language='malayalam', text=...)``.
    """
    print("\nStemmed Text..")

    # Explicit UTF-8 so Malayalam text round-trips regardless of the platform
    # default encoding; ``with`` guarantees both handles are closed.
    with open('temp_file.txt', 'w', encoding='utf-8') as out_file:
        out_file.writelines("%s\n" % item for item in token_words)

    with open('temp_file.txt', 'r', encoding='utf-8') as in_file:
        text = in_file.read()

    # Replace brackets, quotes, commas and newlines with spaces in one pass
    # instead of rescanning the whole string once per character.
    noise = "[]',\n"
    text = text.translate(str.maketrans({char: " " for char in noise}))

    stemmer = Stemmer()
    return stemmer.stem(language='malayalam', text=text)
def __init__(self):
    """
    Initialize necessary resources.

    Loads the root words from ``data/ml_rootwords.txt`` (resolved relative to
    this module's directory) into a ``marisa_trie.Trie`` stored on
    ``self.dictionary``, then instantiates the helper objects this class
    uses: ``Stemmer``, ``inflector.Inflector(lang='ml')``, ``Soundex``,
    ``Syllabifier`` and ``Ngram``.
    """
    dictionary_path = os.path.join(
        os.path.dirname(__file__), 'data/ml_rootwords.txt')

    # ``with`` closes the file even if reading fails part-way through.
    with open(dictionary_path) as dictionary_file:
        root_words = [line.strip() for line in dictionary_file]
    # Kept (as a closed file object) for backward compatibility with any
    # code that inspects this attribute.
    self.dictionary_file = dictionary_file

    try:
        # Byte strings (Python 2) must be decoded before building the trie.
        self.dictionary = marisa_trie.Trie(
            [word.decode('utf-8') for word in root_words])
    except (AttributeError, UnicodeError):
        # Python 3 ``str`` has no ``decode`` (AttributeError); undecodable
        # input falls back to the raw stripped lines as well.
        self.dictionary = marisa_trie.Trie(root_words)

    self.stemmer = Stemmer()
    self.inflector = inflector.Inflector(lang='ml')
    self.soundex = Soundex()
    self.syllabalizer = Syllabifier()
    self.ngrammer = Ngram()
# Script fragment (truncated, original line breaks and indentation lost).
#
# What the visible statements do:
#   * import NLTK tokenizers, WordNetLemmatizer and wordnet, and build a
#     WordNetLemmatizer instance;
#   * open output.txt and test.txt for writing as UTF-8;
#   * read all of malayalam.txt, write it to output.txt followed by two
#     newlines;
#   * split the text on "." and call Stemmer.stem(language='ml_IN', text=...)
#     on each piece, appending every returned output['stem'] (and a ".")
#     to `strings1`;
#   * define punctuation and Malayalam stop-word lists and start a loop over
#     `strings1` that tests membership in both lists.
#
# The fragment ends at the `if ...:` header; its body is not visible here.
#
# NOTE(review): loop nesting (e.g. whether the "." is appended once per
#   sentence) cannot be confirmed from this collapsed form.
# NOTE(review): malayalam.txt is opened without an explicit encoding and its
#   handle is never closed; f and f2 are not closed in the visible part.
# NOTE(review): `for i in strings1` iterates over a str, i.e. one character
#   at a time, so the multi-character entries in stopwords_mal can never
#   match -- confirm whether iterating over words was intended.
# NOTE(review): `Stemmer` is used but not imported in the visible part;
#   presumably imported elsewhere -- verify.
from nltk.tokenize import sent_tokenize from nltk.tokenize import word_tokenize from nltk.stem import WordNetLemmatizer from nltk.corpus import wordnet wordnet_lemmatizer = WordNetLemmatizer() f = open("output.txt", "w", encoding="utf-8") f2 = open("test.txt", "w", encoding="utf-8") x = "\n" #File read mal = open('malayalam.txt', 'r').read() f.write(mal) f.write("\n") f.write("\n") stemmer = Stemmer() sentence_mal = mal.split(".") #lenmatize each word strings1 = "" for i in sentence_mal: result = stemmer.stem(language='ml_IN', text=i) for word, output in result.items(): strings1 = strings1 + " " + output['stem'] strings1 = strings1 + "." #remove punctuations and stop words strings2 = "" punctuations = ['?', ':', '!', ',', ';'] stopwords_mal = ["ൽ", "ഉം", "മാ൪", "ആം", "കൾ"] for i in strings1: if i not in punctuations and i not in stopwords_mal:
from libindic.stemmer import Stemmer

# Usage example: stem a short Malayalam phrase and print, for every entry in
# the result, the word followed by its 'stem' and 'inflection' values.
stemmer = Stemmer()
result = stemmer.stem(language='malayalam', text='രാമന്റെ വീട്ടിലേക്ക്')

for token, analysis in result.items():
    stem = analysis['stem']
    inflection = analysis['inflection']
    print(token, " : ", stem, " : ", inflection)