def hi_stem1(tokens):
    # Stem every whitespace-separated token; if the whole input string is itself
    # a stop word, stem it as a single token instead of splitting it.
    if tokens not in stop_words:
        return [hi_stem(token) for token in tokens.split(" ")]
    else:
        return [hi_stem(tokens)]
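# Usage sketch (assumptions: hi_stem and the module-level stop_words set come from
# the Hindi stemmer helpers used elsewhere in this repo; the phrases are illustrative):
print(hi_stem1(u"लड़के खेल रहे हैं"))   # one stem per whitespace-separated word
print(hi_stem1(u"और"))                  # a lone stop word is stemmed as a single token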
def tokenizer_hindi(document, sentence):
    # Clean and tokenize one sentence, stem each token, drop Hindi stop words,
    # and record on document.stemSentenceMap which sentences each stem occurred in.
    stopWords = getHindiStopWords()
    text = cfg.clean_text(sentence)
    if len(text) < 1:
        return []
    tokens = Text(text)
    tokens = [hi_stem(tkn) for tkn in tokens.words]
    tokens = [t for t in tokens if t not in stopWords]
    for token in tokens:
        if token in document.stemSentenceMap:
            document.stemSentenceMap[token].append(sentence)
        else:
            document.stemSentenceMap[token] = [sentence]
    return tokens
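# Usage sketch for tokenizer_hindi (assumptions: the "document" argument only needs
# a stemSentenceMap dict, so a minimal stand-in class is used here; the sentences are
# illustrative; getHindiStopWords, cfg.clean_text, Text and hi_stem are the same
# helpers the function itself relies on):
class _Doc(object):
    def __init__(self):
        self.stemSentenceMap = {}

doc = _Doc()
for sent in [u"राम स्कूल जाता है", u"राम घर जाता है"]:
    tokenizer_hindi(doc, sent)
# Each stem now maps to the list of sentences it appeared in, e.g. the stem of
# u"स्कूल" maps to the first sentence only, while shared stems map to both.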
import codecs
import string

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer

# Module-level lemmatizer, assumed to be NLTK's WordNet lemmatizer.
lemmatizer = WordNetLemmatizer()


def fileringSentences(fName, lemm=True, stemm=True, stop=True, lan="english"):
    # Read a "label,sentence" .conf file and write a cleaned copy whose file name
    # records which of stemming / lemmatisation / stop-word removal were applied.
    if stemm and lan in ["spanish", "english"]:
        stemmer = SnowballStemmer(lan)
    # Stop words are only loaded for Spanish and English; for other languages
    # (e.g. Hindi) stop-word removal is effectively a no-op here.
    stop_words = set(stopwords.words(lan)) if lan in ["spanish", "english"] else set()
    with codecs.open(fName, 'r', encoding="utf-8") as f:
        f_start = fName.replace(".conf", "")
        ending = ""
        if stemm:
            ending += "_stemmed"
        if lemm:
            ending += "_lemmed"
        if stop:
            ending += "_stop"
        if not stemm and not lemm:
            ending += "_raw"
        ending += ".conf"
        with codecs.open(f_start + ending, "w", encoding="utf-8") as write_file:
            for line in f:
                l = line.split(",")
                # Drop the leading label, then strip punctuation and sentence marks.
                l2 = line.replace(l[0] + ",", '')
                l3 = l2.replace("-", " ")
                for rem in ["\n", "\t", "\r", ".", "?", "!", u'¿', u'¡', u'।', u"\u0964", "|"]:
                    l3 = l3.replace(rem, "")
                for pun in string.punctuation:
                    l3 = l3.replace(pun, "")
                l3 = l3.lower()
                # Collapse runs of spaces down to single spaces.
                while "  " in l3:
                    l3 = l3.replace("  ", " ")
                if line != "" and l3 != "":
                    filtered_sentence = l3.split(" ")
                    if stop:
                        filtered_sentence = [w for w in filtered_sentence if w not in stop_words]
                    if stemm:
                        if lan == "hindi":
                            filtered_sentence = [hi_stem(w) for w in filtered_sentence]
                        else:
                            filtered_sentence = [stemmer.stem(w) for w in filtered_sentence]
                    if lemm:
                        filtered_sentence = [lemmatizer.lemmatize(w) for w in filtered_sentence]
                    if len(filtered_sentence) > 0:
                        out_string = l[0] + "," + " ".join(filtered_sentence)
                        write_file.write(out_string + "\n")
    print(lan, lemm, stemm, stop)
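# Usage sketch (assumption: the input .conf files are CSV-style, with a label in the
# first comma-separated field and the sentence in the rest, as the split/replace
# logic above implies; the file names are illustrative):
fileringSentences("train_en.conf", lemm=True, stemm=True, stop=True, lan="english")
fileringSentences("train_hi.conf", lemm=False, stemm=True, stop=False, lan="hindi")
# Output is written next to the input, e.g. train_en_stemmed_lemmed_stop.conf
# and train_hi_stemmed.conf.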
import sqlite3

import hindi_stemmer


def search(word):
    # Look the word up in the local words.db cache; stem it and insert it on a miss.
    # Parameterized queries are used so that quotes in the word cannot break the SQL.
    check_for_db()
    conn = sqlite3.connect("words.db")
    cursor = conn.cursor()
    cursor.execute("SELECT * FROM words WHERE word = ?", (word,))
    x = cursor.fetchall()
    if len(x) != 0:
        print("word found in the database:", x)
    else:
        x = hindi_stemmer.hi_stem(word)
        cursor.execute("INSERT INTO words (word, stemmed) VALUES (?, ?)", (word, x))
        conn.commit()
        print("word not found in the database:", x, "inserted into the database")
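# Usage sketch (assumptions: check_for_db has created words.db with a
# words(word, stemmed) table in the working directory; the word is illustrative):
search(u"लड़कों")   # first call stems the word and caches it in words.db
search(u"लड़कों")   # second call is answered straight from the database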
def Remove_stem(tweet):
    # Replace every word in the tweet with its Hindi stem
    # (hs is presumably the hindi_stemmer module imported under that alias).
    tweet_list = tweet.split()
    tweet_list = [hs.hi_stem(word) for word in tweet_list]
    tweet_new = " ".join(tweet_list)
    return tweet_new
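# Usage sketch (assumption: hs is the hindi_stemmer module; the tweet is illustrative):
print(Remove_stem(u"लड़के क्रिकेट खेल रहे हैं"))   # the tweet with each word replaced by its stem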
def tokenizer_hindi(document):
    # Clean the raw document text, then return the Hindi stem of every token.
    tokens = Text(cfg.clean_text(document))
    tokens = [hi_stem(tkn) for tkn in tokens.words]
    return tokens
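# Usage sketch (assumptions: Text is polyglot's Text class and cfg.clean_text is the
# repo's own normaliser, as in the sentence-level tokenizer above; the text is illustrative):
stems = tokenizer_hindi(u"लड़के मैदान में खेल रहे हैं")
print(stems)   # one Hindi stem per word of the cleaned text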