def porterStemmer(self):
    ps = stemmer.Stemmer()
    self.dataMailStem = []
    for d in self.dataMailSet:
        try:
            self.dataMailStem.append(ps.stem(d))
        except BaseException as Argument:
            print("error", Argument)

def __init__(self, Stemming=False):
    self.stop_words = stopwords.words('english')
    # expand the stopwords
    extra_stop_words = ["i'll", "i'd", "i'm", "i've"]
    self.stop_words = self.stop_words + extra_stop_words
    self.Doc_ID = 0
    self.Stemmer = None
    if Stemming:
        self.Stemmer = stemmer.Stemmer()

def __init__(self):
    self.stop_words = stopwords.words('english')
    self.stop_words.extend([
        "rt", "n't", "'re", "gon", "na", "covid", "coronavirus", "covid-19"
    ])
    # keep '#', '@', '%', and '$' so hashtags, mentions, and amounts survive
    self.punctuation_to_remove = punctuation.replace('#', '').replace(
        '@', '').replace('%', '').replace('$', '')
    self.symbols = "<>:\"/\\|!?*~.'`-_()^,+=;"
    self.token_stemmer = stemmer.Stemmer()

import os
import sqlite3

import stemmer


def virtualdictinit(words, vd):
    db_name = r'speller.db'
    conn = sqlite3.connect(
        os.path.join(os.path.dirname(os.path.abspath(__file__)), db_name))
    cursor = conn.cursor()
    st = stemmer.Stemmer()
    # store each word in table vd keyed by its stem
    for w in words:
        cursor.execute("insert into {} values (?, ?)".format(vd),
                       (st.stem(w), w))
    conn.commit()
    conn.close()

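# A minimal usage sketch for virtualdictinit. Assumption: the target table
# must already exist with two text columns (stem, word); the table name
# "virtualdict" and the sample words are illustrative, not from the source.
_db = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'speller.db')
_conn = sqlite3.connect(_db)
_conn.execute("create table if not exists virtualdict (stem text, word text)")
_conn.commit()
_conn.close()
virtualdictinit(['running', 'runner', 'runs'], 'virtualdict')
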
def perform_operation():
    hst = stemmer.Stemmer()
    hst.stem_init(e1.get())
    e2.delete(0, END)
    e3.delete(0, END)
    try:
        hst.hstem()
        e2.insert(0, hst.rem)
        e3.insert(0, hst.output)
    except Exception:
        e2.insert(0, 'Not Found')
        e3.insert(0, 'Not Found')

def stem_words(filteredText):
    stem = stemmer.Stemmer('english')
    keywords = []
    for token in filteredText:
        # keep alphabetic tokens longer than two characters
        if not token.isdigit() and len(token) > 2 and token.isalpha():
            word = stem.stemWord(token).lower()
            if word:
                keywords.append(word)
    return keywords

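# A quick, illustrative call to stem_words, assuming the 'english' Stemmer
# is Snowball-like (e.g. the PyStemmer API). "42" is dropped as a digit and
# "ox" as too short; a typical English stemmer maps "Running" -> "run" and
# "quickly" -> "quick" after lowercasing.
print(stem_words(["Running", "42", "ox", "quickly"]))
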
def print_sorted_tfidf(self, sentence):
    porter = stemmer.Stemmer()
    stemed_sentence = []
    for word in porter.remove_symbol(sentence.lower()).replace("\n", "").split():
        stemed_sentence.append(porter.stem(word, 0, len(word) - 1))
    stemed_sentence = " ".join(stemed_sentence)
    sc_lst = self.calc_sent_tfidf(stemed_sentence)
    sc_lst = sorted(sc_lst.items(), key=lambda x: x[1], reverse=True)
    print("=" * 50)
    print("input query: %s\n" % sentence)
    print("stemed query: %s\n" % stemed_sentence)
    print(" [doc_path | tf-idf]")
    # show the five highest-scoring documents
    for doc, score in sc_lst[:5]:
        print(" [%s | %f]" % (doc, score))

def __init__(self):
    self.porter = stemmer.Stemmer()

def initiality_stem(wrd, initial):
    if not initial:
        st = stemmer.Stemmer()
        wrd = st.stem(wrd)
    return wrd

#!/usr/bin/env python
# coding: utf-8

# In[1]:
import stemmer

# In[8]:
myStemmer = stemmer.Stemmer()
output = myStemmer.stemWord("ladkaa")
if output == "ladka":
    print("Function stemWord passed!")

# In[9]:
output = myStemmer.stemListOfWords(["ladkii", "ladkaaaa", "firaaangii"])
if output[0] == 'ladki' and output[1] == 'ladka' and output[2] == 'firangi':
    print("Function stemListOfWords passed!")

# In[18]:
output = myStemmer.stem2dListOfWords([["merii", "merraa"],
                                      ["terii", "terraaa", "aaajjjaa"]])
if (output[0][0] == 'meri' and output[0][1] == 'mera'
        and output[1][0] == 'teri' and output[1][2] == 'aja'):
    print("Function stem2dListOfWords passed!")

    unknown_processed_words = set()
    for token in text_tokens:
        if is_bg_word(checked_word=token, bul_words=bg_words):
            continue
        synonym = check_and_get_match_for_synonyms(
            checked_word=token, synonyms=foreign_synonyms)
        if synonym:
            words_suggestions[token] = synonym
        else:
            unknown_processed_words.add(token)
    return words_suggestions, list(unknown_processed_words)


if __name__ == '__main__':
    stem = stemmer.Stemmer()
    tokens = tokenizer.tokenize_text("assets/input.txt")
    tokens = list(set(tokens))
    print(f'Our input text contains {len(tokens)} significant words')
    print('Loading the foreign words and their synonyms')
    foreign_synonyms = load_synonims("assets/synonyms.txt")
    print('Loading the corpus of Bulgarian words')
    bg_words = load_bulgarian_words("assets/bg_words.txt")
    print('Loading the corpus of English words')
    en_words = load_bulgarian_words("assets/en_words.txt")
    # iterate over a copy of the keys: the dict is mutated inside the loop
    for word in list(foreign_synonyms.keys()):
        value = foreign_synonyms[word]
        foreign_synonyms.pop(word)

import stemmer

stemming = stemmer.Stemmer()


def filter(s):
    s = s.lower()
    s = s.strip()
    s = rm_encoding(s)
    s = rm_punctuation(s)
    s = mv_tags(s)
    return s


def rm_encoding(s):
    # Peregrine
    # return s.decode('utf-8').encode('ascii', 'ignore')
    # Hadoop
    return s.encode('ascii', 'ignore')


def rm_punctuation(s):
    # Python 2 translate: delete every character in stemming.punctuation
    s = s.encode('utf-8')
    s = s.translate(None, stemming.punctuation)
    return s


def mv_tags(tweet):
    import re
    _digits = re.compile(r'\d')
    words = tweet.split()
    for i, word in enumerate(words):
        # word = word.strip()

import os
import re

import stemmer


class Translator:
    # text file containing all the tagalog-english translations
    __WORDS_DIR = os.path.dirname(
        os.path.realpath("translator")) + "\\trainingData\\tag-eng.txt"
    __tagalog_words = {}
    __stemmer = stemmer.Stemmer()

    def __init__(self):
        self.train()

    """ method used to train the model """

    def train(self, tag_eng=__WORDS_DIR):
        freader = open(tag_eng, "r")
        contents = freader.readlines()
        freader.close()
        for line in contents:
            word_def = line.split(" : ")
            # the definition is always in the second index of word_def;
            # replace any remaining ":"
            defn = word_def[1].replace(":", "").strip()
            defn = defn.replace(word_def[0], "").strip()
            # remove the line's other transformation,
            # ex: ... (word1, word2, word3) ...
            defn = re.sub("[(].+?[)]", "", defn).strip()
            # regular expression to detect the tag for each entry
            tags_re = (r"n\.|adv\.|adj\.|v\.|intrj\.|comp\.|gram\.|conj\.|"
                       r"expr\.|prep\.|pref\.|imp\.|coll\.|interrog\.|idiom.")
            # some pos tags cannot be found
            try:
                pos_tag = re.findall(tags_re, defn)[0]
                # remove pos tag, numberings and special characters
                defn = re.sub("[A-Za-z0-9]{1,10}[.],?|^!|[?!@.,]", "",
                              defn).strip()
                defn = re.sub("([/][A-Za-z]+? )|([/][A-Za-z]+?$)", "",
                              defn).strip()
                # split the different definitions, clean each of unnecessary
                # whitespace; lowercase for consistency
                defn = [
                    self.clean_string(i).strip().lower()
                    for i in defn.split(";")
                ]
                # if the dictionary has already registered the word
                if word_def[0] in self.__tagalog_words:
                    # if the word has already registered this specific pos tag
                    if pos_tag in self.__tagalog_words[word_def[0]]:
                        # append to the current list
                        self.__tagalog_words[word_def[0]][pos_tag] += defn
                    else:
                        # initialize the list with defn
                        self.__tagalog_words[word_def[0]][pos_tag] = defn
                else:
                    self.__tagalog_words[word_def[0]] = {}
                    self.__tagalog_words[word_def[0]][pos_tag] = defn
            except Exception:
                pass

    """
    *model should be trained first
    method used for tagalog translation; accepts a string word and a string pos_tag
    word is the word to be translated
    pos_tag is the pos tag of the word to be translated; by default it is ""
    returns a list of strings containing the english translations
    """

    def translate(self, word, pos_tag=""):
        # if the translation fails (dictionary lookup), stem it
        try:
            # if the pos tag is unspecified
            if pos_tag == "" or pos_tag == "AMB" or pos_tag == "UNK":
                # initialize the translations container
                translations = []
                # append all translations, regardless of pos tag
                for key in self.__tagalog_words[word].keys():
                    translations += self.__tagalog_words[word][key]
                return translations
            else:
                if pos_tag.lower() + "." in self.__tagalog_words[word].keys():
                    # return the translation for a specific pos tag
                    return self.__tagalog_words[word][pos_tag.lower() + "."]
                elif len(self.__tagalog_words[word].keys()) > 0:
                    return self.translate(word)
        # if the translation errors due to an index not found
        except Exception:
            try:
                if self.stem2x(word) == word:
                    return word
                return self.translate(self.stem2x(word)) + ["~"]
            except Exception:
                return []

    def stem2x(self, word):
        # apply the stemmer twice
        word = self.__stemmer.stem(word)
        return self.__stemmer.stem(word)

    # remove non-alphabet characters
    def clean_string(self, word):
        return re.sub("[^A-Za-z0-9 ]", "", word)

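# A minimal usage sketch for the Translator above, assuming the
# trainingData\tag-eng.txt file is present; the word "takbo" and the pos
# tag "v" are illustrative, not taken from the source.
translator = Translator()
print(translator.translate("takbo"))       # every translation, any pos tag
print(translator.translate("takbo", "v"))  # only translations tagged "v."
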
class Translator:
    # text file containing all the tagalog-english translations
    __WORDS_DIR = os.path.dirname(
        os.path.realpath("translator")) + "\\trainingData\\tag-eng.txt"
    __tagalog_words = {}
    __stemmer = stemmer.Stemmer()

    def __init__(self):
        self.train()

    def train(self, tag_eng=__WORDS_DIR):
        freader = open(tag_eng, "r")
        contents = freader.readlines()
        freader.close()
        for line in contents:
            word_def = line.split(" : ")
            # the definition is always in the second index of word_def;
            # replace any remaining ":"
            defn = word_def[1].replace(":", "").strip()
            """tag = re.findall("(adj[.])|(v[.])|(n[.])|(adv[.])|(conj[.])|(prep[.])", defn[1])[0]
            try:
                tag = re.findall("(adj[.])|(v[.])|(n[.])|(adv[.])|(conj[.])|(prep[.])", defn[1])[0]
            except:
                print defn"""
            defn = defn.replace(word_def[0], "").strip()
            # remove the line's other transformation,
            # ex: ... (word1, word2, word3) ...
            defn = re.sub("[(].+?[)]", "", defn).strip()
            # separate the different translations; synonyms
            defn = defn.split(";")
            # remove POS tags like n., v., inf., and numberings 1. 2. ...
            for index in range(len(defn)):
                defn[index] = re.sub("[A-Za-z0-9]{1,4}[.],?", "",
                                     defn[index]).strip()
                defn[index] = re.sub("([/][A-Za-z]+? )|([/][A-Za-z]+?$)", "",
                                     defn[index])
            # if the word is in the dictionary, add the definitions/translations
            if word_def[0] in self.__tagalog_words:
                for definition in defn:
                    self.__tagalog_words[word_def[0]].append(definition)
            # if not, create a new entry, then add the definitions/translations
            else:
                self.__tagalog_words[word_def[0]] = []
                for definition in defn:
                    self.__tagalog_words[word_def[0]].append(definition)

    def translate(self, word, pos_tag=""):
        try:
            return self.__tagalog_words[word]
        except KeyError:
            # fall back to the stemmed form if the raw word is unknown
            try:
                return self.__tagalog_words[self.__stemmer.stem(word)]
            except KeyError:
                return []

def __init__(self):
    self.stop_words = stopwords.words('english')
    self.token_stemmer = stemmer.Stemmer()