def des_extrect():
    filename_list = []
    file_stopwords = file('stopwords.txt', "r")
    stopwords = [line.strip() for line in file_stopwords.readlines()]
    for file_name in os.listdir(DESCRIPTION_DIR):
        filename_list.append(file_name)
    for filename in filename_list:
        path = os.path.join(DESCRIPTION_DIR, filename)
        fr = file(path, 'r')
        fw = file(filename + '.des', 'w')
        soup = BeautifulSoup(fr.read())
        docs = soup.findAll('doc')
        for doc in docs:
            content = str(doc['title'] + doc.snippet.text)
            content = re.sub("[\.\@\,\:\;\!\?\(\)]".decode("utf8"), "".decode("utf8"), content)
            stemmer = SnowballStemmer('english')
            content = content.split()
            pro_content = ''
            for w in content:
                w = stemmer.stem(w)
                # remove stop words
                if w not in stopwords:
                    pro_content += w + ' '
            fw.write(doc['rank'] + ' ' + pro_content + '\n')
        fw.close()
        fr.close()
def text_token_data_generator():
    global id_text_index_map
    translation_table = string.maketrans(
        string.punctuation + string.uppercase,
        " " * len(string.punctuation) + string.lowercase
    )
    snowball_stemmer = SnowballStemmer("english")
    for f in glob.glob("json/text/*.json"):
        for line in open(f).readlines():
            extract_row = json.loads(line)
            id_text_index_map[extract_row["file_id"]] = len(id_text_index_map)

            visible_text = extract_row["visible_text"].encode("ascii", "ignore")
            visible_text = visible_text.translate(translation_table)
            visible_text = [
                snowball_stemmer.stem(word)
                for word in visible_text.split()
                if word not in ENGLISH_STOP_WORDS and len(word) > 1
            ]

            title = extract_row["title"].encode("ascii", "ignore")
            title = title.translate(translation_table)
            title = [
                "t^{}".format(snowball_stemmer.stem(word))
                for word in title.split()
                if word not in ENGLISH_STOP_WORDS and len(word) > 1
            ]

            visible_text.extend(title)
            yield " ".join(visible_text)
def normalized_token(token):
    """
    Use the stemmer to normalize the token.
    Call this function when building the graph, instead of changing how word forms
    are stored in file_text.
    """
    stemmer = SnowballStemmer("english")
    return stemmer.stem(token.lower())
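# --- Illustrative usage (editor's addition, not part of the original snippet) ---
# Assumes `from nltk.stem import SnowballStemmer` is already in scope in the module,
# since normalized_token() constructs the stemmer itself.
print(normalized_token("Running"))  # -> "run"
print(normalized_token("cats"))     # -> "cat"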
class ModelBuilder():
    def __init__(self):
        self.model = {}
        self.stemmer = SnowballStemmer('english')

    def build(self):
        with open('data/candidate_synonyms.txt') as f:
            all_words = f.read().split('\n')
            for words in all_words:
                if words:
                    word, similar = words.split(',')
                    word, similar = self.stemmer.stem(word), self.stemmer.stem(similar)
                    if word not in self.model:
                        self.model[word] = {}
                    self.model[word][similar] = 1
        return self

    def condense(self):
        condensed_model = {}
        for word, similars in self.model.items():
            for similar in similars:
                if word in self.model.get(similar, {}):
                    if word in condensed_model:
                        condensed_model[word].append(similar)
                    else:
                        condensed_model[word] = [similar]
        self.model = condensed_model
        return self
def procesar(request, identificador):
    lmtzr = WordNetLemmatizer()
    d = Documento.objects.get(id=identificador)
    #nltk.corpus.cess_esp.words()
    tokens = nltk.word_tokenize(d.contenido.replace('.', ' . '))
    #print tokens
    #scentence = d.contenido
    #scentence = scentence.lower()
    words = tokens
    spanish_stemmer = SnowballStemmer('spanish')
    # This is the simple way to remove stop words
    important_words = []
    for word in words:
        if word not in stopwords.words('spanish'):
            important_words.append([word, lmtzr.lemmatize(word), spanish_stemmer.stem(word)])
    return render_to_response('templates/documentoProcesado.html', {
        'original': d.contenido,
        'tokens': tokens,
        'important_words': important_words,
        #'pos_tags': pos_tags,
        #'ne_chunks': ne_chunks.subtrees(),
    })
def stemmed(text, language):
    stemmer = SnowballStemmer(language)
    tas = text.split()
    text = ""
    for word in tas:
        text = " ".join((text, stemmer.stem(word)))
    return text.lstrip()
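# --- Illustrative usage (editor's addition, not from the original source) ---
# Assumes SnowballStemmer is imported; `language` must be one of SnowballStemmer.languages,
# e.g. 'english', 'spanish' or 'russian'.
print(stemmed("cats running quickly", "english"))  # expected: "cat run quick"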
def norm_corpus(document_list):
    norm_doc_list = []

    # lowercase
    document_list = [word.lower() for word in document_list]

    # remove symbols in text
    symbols = ",.?!"
    for sym in symbols:
        document_list = [word.replace(sym, '') for word in document_list]

    # loop through each string i.e. review in the column
    for doc in document_list:
        doc = nltk.word_tokenize(doc)

        # remove stopwords
        doc = [word for word in doc if word not in stopwords.words('english')]

        # stem words
        stemmer = SnowballStemmer("english")
        doc = [stemmer.stem(word) for word in doc]

        # make tokenised text one string
        norm_doc = " ".join(doc)
        norm_doc_list.append(norm_doc)

    return norm_doc_list
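# --- Illustrative usage (editor's addition, not from the original source) ---
# Assumes nltk, SnowballStemmer and the stopwords corpus are available;
# norm_corpus expects a list of documents and returns a list of normalised strings.
print(norm_corpus(["The cats are running!"]))  # expected: ['cat run']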
def frequency_analysis(input_path, output_path, stopwords=None, n_most_common=50):
    recipes = []
    with open(input_path, 'r') as f:
        for i, line in enumerate(f):
            if line == '\n':
                break
            if i == 0:
                continue  # skip header
            fields = line.split('\t')
            recipes.append(fields[1].replace("\n", ""))

    recipe_text = re.sub("[^a-z ]", "", ' '.join(recipes))
    recipe_words = re.split("\s+", recipe_text)

    stemmer = SnowballStemmer("english")
    recipe_stems = [stemmer.stem(w) for w in recipe_words]
    if stopwords is not None:
        recipe_stems = filter(None, [s for s in recipe_stems if s not in stopwords])

    top_words = Counter(recipe_stems).most_common(n_most_common)

    # write to a file
    # do a second pass of the recipes to determine how many of the documents the term is in
    freq_table = open(output_path, 'wb')
    for elt in top_words:
        doc_freq = sum([elt[0] in recipe for recipe in recipes])
        freq_table.write(','.join([str(e) for e in elt]) + ',' + str(doc_freq) + '\n')
    freq_table.close()
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.

    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]

    text = " ".join(text)

    # Remove special characters
    text = special_character_removal.sub('', text)

    # Replace numbers
    text = replace_numbers.sub('n', text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)

    # Return a list of words
    return(text)
def preprocessing(doc):
    # stop word removal is optional
    x = re.sub("[^a-zA-Z]", " ", doc)  # keep only words
    x = x.lower().split()
    stemmer = SnowballStemmer("english")  # use snowball
    stops = set(stopwords.words("english"))  # set is faster than list
    x = [stemmer.stem(word) for word in x if word not in stops]
    return(x)
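# --- Illustrative usage (editor's addition, not from the original source) ---
# Assumes re, SnowballStemmer and stopwords are imported, as the snippet requires.
print(preprocessing("The 3 cats were running quickly!"))  # expected: ['cat', 'run', 'quick']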
def __call__(self, doc):
    snowball_stemmer = SnowballStemmer('english')
    #tokenizer = RegexpTokenizer(r'\w+')
    #words = [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
    words = [snowball_stemmer.stem(t) for t in word_tokenize(doc)]
    stop_words = set(stopwords.words('english'))
    stop_words.update(self.mystops)
    stop_words = list(stop_words)
    return [i.lower() for i in words if i not in stop_words]
def preprocess_tweets(tweets):
    stemmer = SnowballStemmer("english")
    stop = set(stopwords.words("english"))
    tweet_texts = [
        " ".join(stemmer.stem(i) if len(i) > 1 else i
                 for i in ("".join(c for c in word if c not in string.punctuation)
                           for word in tweet["text"].lower().split())
                 if i and i not in stop)
        for tweet in tweets
    ]
    return list(set(tweet_texts))
def stemLem(w):
    lemmatizer = WordNetLemmatizer()
    stemmer = SnowballStemmer("english")
    #stemmer = PorterStemmer()
    lem = lemmatizer.lemmatize(w)
    if len(w) > len(lem):
        return lem
    return stemmer.stem(w)
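# --- Illustrative usage (editor's addition, not from the original source) ---
# stemLem returns the WordNet lemma only when it is shorter than the input word,
# otherwise it falls back to the Snowball stem. Assumes the wordnet corpus is downloaded.
print(stemLem("corpora"))  # lemma "corpus" is shorter than "corpora", so "corpus" is returned
print(stemLem("geese"))    # lemma "goose" is not shorter, so the Snowball stem is returned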
def stemWordMatch2(question, sentence):
    question_tokens = set(nltk.word_tokenize(question))
    sentence_tokens = set(nltk.word_tokenize(sentence))

    # Finding the match between two words from the same root using the Lancaster stemmer
    '''stemmer = LancasterStemmer()

    for i in sentence_tokens:
        stem_words_list.append(stemmer.stem(i))

    for i in question_tokens:
        question_words_list.append(stemmer.stem(i))

    #print 'Stem word list', stem_words_list
    #print 'Question word list', question_words_list

    stem_count = 0
    for i in stem_words_list:
        # Finding the exact word match
        if i.lower() in [x.lower() for x in question_words_list]:
            #print 'Question word is', x
            #print 'Sentence word stem is :', i
            #print 'Match'
            stem_count = stem_count + 6
    stem_word_match_counter.append(count)'''

    stem_word_match_counter = []
    stem_words_list = []
    question_words_list = []

    # Finding the match between two words from the same root using the Snowball stemmer
    snowball_stemmer = SnowballStemmer('english')

    for i in sentence_tokens:
        stem_words_list.append(snowball_stemmer.stem(i))

    for i in question_tokens:
        question_words_list.append(snowball_stemmer.stem(i))

    #print 'Stem word list', stem_words_list
    #print 'Question word list', question_words_list

    stem_count = 0
    for i in stem_words_list:
        # Finding the exact word match
        if i.lower() in [x.lower() for x in question_words_list]:
            #print 'Question word is', x
            #print 'Sentence word stem is :', i
            #print 'Match'
            stem_count = stem_count + 6

    #print 'Stem word count match score is :', stem_count
    return stem_count
def stem(self, content):
    import re
    original_string = content
    new_content = re.sub('[^a-zA-Z0-9\n\.]', ' ', original_string)
    words = new_content.split()
    stemmer = SnowballStemmer('english')
    singles = [stemmer.stem(wordsa) for wordsa in words]
    return (' '.join(singles))
def stemmed_top_user_words(usertxt, num=10):
    wl_usertxt = word_tokenize(usertxt.lower())
    num = min(num, len(wl_usertxt))
    snowball_stemmer = SnowballStemmer("english")
    stemmed_fl_usertxt = [snowball_stemmer.stem(w) for w in wl_usertxt if (len(w) > 4 and w not in ewl)]
    fd_user_ls = [w[0] for w in FreqDist(Text(stemmed_fl_usertxt)).most_common(num)]
    return fd_user_ls
def main(input_file, dbname):
    """
    Main function. Connects to a database and reads a CSV with the arousal
    and valence. Uses the sentiment library to compute the sentiment of a
    news item.

    :param input_file: the ANEW file
    :param dbname: the name of the database
    """
    # read ANEW file
    if not os.path.exists(input_file):
        logging.error('File %s does not exist', input_file)
        sys.exit(1)
    else:
        csvfile = open(input_file, 'r')
        reader = csv.reader(csvfile, delimiter=',')
        reader.next()  # skip headers
        stemmer = SnowballStemmer('spanish')
        anew = dict([(stemmer.stem(unicode(row[2], 'utf-8')),
                      {'valence': float(row[3]), 'arousal': float(row[5])})
                     for row in reader])

    couch = couchdb.Server()
    database = couch[dbname]
    logging.info('Established connection with the db %s', dbname)

    for element in database:
        doc = database.get(element)
        comments = " ".join([comment['cleaned_summary'] for comment in doc['comments']])
        description = " ".join([database.get(element)['title'], doc['description']])
        sentiment_comments = get_sentiment(anew, comments)
        sentiment_description = get_sentiment(anew, description)
        if sentiment_comments is not None and sentiment_description is not None:
            logging.info('%s val: %.2f - %.2f aro: %.2f - %.2f : %s', doc.id,
                         sentiment_comments[0], sentiment_description[0],
                         sentiment_comments[1], sentiment_description[1],
                         doc['title'])
            doc['sentiments'] = {'comments': {'valence': sentiment_comments[0],
                                              'arousal': sentiment_comments[1]},
                                 'description': {'valence': sentiment_description[0],
                                                 'arousal': sentiment_description[1]}}
            database.save(doc)
        else:
            logging.warn('%s could not be analyzed. skipping ...',
                         database.get(element)['title'])
def stem_text(self):
    '''
    Perform stemming
    '''
    stemmer = SnowballStemmer("english")
    stemmed_sents = []
    for sent in self.tok_text:
        stemmed_sents.append([stemmer.stem(tok) for tok in sent])
    self.stem_text = stemmed_sents
def process_spanish_owned():
    from inflector import Inflector, Spanish
    inflector = Inflector(Spanish)
    from nltk.stem import SnowballStemmer
    stemmer = SnowballStemmer("spanish")

    file_valid = open('valid_words.txt', "r")
    lines = file_valid.readlines()
    valid_words = lines[0].split(' ')
    print len(valid_words)
    file_valid.close()
    #valid_words = set(valid_words)

    owned_words = ['cúster', 'custer', 'cústers', 'custers', 'combi', 'combis',
                   'susana', 'villaran', 'villarán', 'castañeda']

    file = open("raw_words.txt", 'r')
    fileout = open("spanish_words_owned.txt", 'w')
    fout_sing = open("spanish_words_sing.txt", 'w')
    fout_stem = open("spanish_words_stem.txt", 'w')
    nline = 0
    for line in file:
        nline += 1
        words = line.split(' ')
        processed = []
        ini_line = True
        for word in words:
            if (word != '') & (word != '\n') & (word != 'servicio') & (word != 'servicio\n'):
                word = word.replace('\n', '')
                if (word in valid_words) | (word in owned_words):
                    processed.append(word)
                    if word != 'bus':
                        word_singular = inflector.singularize(word)
                        #word_singular = word_singular.replace(u'\xF3'.encode('utf-8'), 'o')
                    else:
                        word_singular = word
                    word_stemmed = stemmer.stem(word.decode('utf-8')).encode('utf-8')
                    if ini_line:
                        fileout.write(word)
                        fout_sing.write(word_singular)
                        fout_stem.write(word_stemmed)
                        ini_line = False
                    else:
                        fileout.write(' ' + word)
                        fout_sing.write(' ' + word_singular)
                        fout_stem.write(' ' + word_stemmed)
                    print nline, word, word_singular, word_stemmed
        fileout.write('\n')
        fout_sing.write('\n')
        fout_stem.write('\n')
    file.close()
    fileout.close()
    fout_sing.close()
    fout_stem.close()
def prepare_request(request, synonyms=False):
    #request = translate(request)
    request = re.sub(r"(\n)", " ", request.lower())
    request = re.sub(r"(-\n)", "", request)
    request = re.split("[^a-z0-9]", request)
    stop_words = stopwords.words('english')
    stemmer = SnowballStemmer('english')
    if synonyms == True:
        request = add_synonyms([word for word in request if word not in stop_words])
    request = [stemmer.stem(word) for word in request
               if (word not in stop_words) & (len(word) > 1) & (len(word) < 20)]
    return ' '.join(request)
def stemming(self, words):
    '''
    Make a stem for each word in the array
    @return array of stemmed words
    '''
    russian_stemmer = SnowballStemmer('russian')
    stemming = list()
    for w in words:
        try:
            stemming.append(russian_stemmer.stem(w))
        except Exception, e:
            pass
    return stemming
def tokenize(resultList1):
    entrada = []
    for i in range(0, len(resultList1)):
        sentence = resultList1[i]
        tokens = word_tokenize(sentence)
        filtered_words = [w for w in tokens if not w in stopwords.words('spanish')]
        stemmer = SnowballStemmer('spanish')
        for i in filtered_words:
            entrada.append(stemmer.stem(i))
    return entrada
def tokenize(resultList1):
    entrada = []
    tokens = word_tokenize(resultList1)
    filtered_words = [w for w in tokens if not w in stopwords.words('spanish')]
    stemmer = SnowballStemmer('spanish')
    for i in filtered_words:
        stri = unicode(i, errors='replace')
        entrada.append(stemmer.stem(stri))
    return entrada
def proc_text(text):
    s = remove_punctuation(text)
    ls = word_tokenize(s)
    # remove stop words
    sw = set(stopwords.words('spanish'))
    ls = filter(lambda x: x not in sw, ls)
    # stem
    stemmer = SnowballStemmer('spanish')
    ls = map(lambda x: stemmer.stem(x), ls)
    return ls
def asr_to_bow(asr_file_path, vocab, dfs):
    stemmer = SnowballStemmer('english')
    vec = [0 for i in range(len(vocab))]
    for line in open(asr_file_path):
        word = line.split()[4]
        word = stemmer.stem(word)
        if word not in vocab:
            continue
        tid = vocab[word]
        vec[tid] += 1
    for i in range(len(vec)):
        vec[i] *= math.log(883.0 / dfs[i])
    return vec
def main():
    from nltk.stem import SnowballStemmer  # Imported to perform stemming on the data
    stemmer = SnowballStemmer('english')
    stop_words = stopwords.words("english")
    for line in sys.stdin:
        line = line.strip()
        id, label, review = line.split('||')  # Separates each line into id, label, review
        html_strip = BeautifulSoup(review, 'html.parser')
        words = re.sub("[^a-zA-Z]", " ", html_strip.get_text())
        words = words.split()
        words = [w.lower() for w in words if w.lower() not in stop_words]  # collecting words which are not stop words
        words = [stemmer.stem(word) for word in words]
        print '%s\t%s\t%s' % (label, id, ' '.join(words))  # Mapper output with Label as key and the rest are values
def wordnet_sim(query, db):
    """
    This function implements simple wordnet definition lookup and compares it
    with a different block of text. For every word match between the definition
    token and text token the doc receives +1.

    INPUT:
        query -- string that represents user query expanded with wordnet defs
        db -- dict representation of database xml file
    OUTPUT:
        maxdoc -- the document with the highest score
    """
    # print('QUERY:', query)
    # initializing SnowballStemmer from nltk
    sst = SnowballStemmer("english")
    # taking stopwords from nltk
    stop = stopwords.words("english")
    # creating translation table to remove punctuation
    transnone = {ord(c): None for c in string.punctuation}
    # first we remove any punctuation and concatenate specific nodes into one
    query_nopunct = query.lower().translate(transnone)
    query_stems = [sst.stem(token) for token in query_nopunct.split()
                   if token not in stop]
    doc_scores = defaultdict(float)
    for doc in db:
        for block, text in db[doc].items():
            # normalize block text
            if not text:
                continue
            text_nopunct = text.lower().translate(transnone)
            text = [sst.stem(t) for t in text_nopunct.split() if t not in stop]
            if len(text) == 0:
                text += " "
            # here we can finetune the block score multiplicators
            # some blocks are more important than the others
            if block == "description":
                for s in query_stems:
                    doc_scores[doc] += text.count(s) / len(text) * 2
            elif block == "trivia":
                for s in query_stems:
                    doc_scores[doc] += text.count(s) / len(text) * 0.5
            elif block == "history":
                for s in query_stems:
                    doc_scores[doc] += text.count(s) / len(text) * 0.5
            elif block == "comments":
                for s in query_stems:
                    doc_scores[doc] += text.count(s) / len(text)
    maxdoc = max(doc_scores, key=lambda x: doc_scores[x])
    debug = sorted([(k, v) for k, v in doc_scores.items()], key=lambda x: x[1])
    return (debug, maxdoc)
def clean_data(data):
    punctuations = list(string.punctuation)
    data = data.replace("\n", " ").replace(":", " ").replace(",", "").replace(".", "").replace("'s", "").replace("?", "")
    stemmer = PorterStemmer()
    stemmer2 = SnowballStemmer('english')
    tokenizer = RegexpTokenizer(r'\w+')
    tokenizer.tokenize(data)
    ndata1 = list(mysplit(data))
    ndata1 = [[stemmer.stem(xi) for xi in y.split(" ")] for y in ndata1]
    ndata1 = [[stemmer2.stem(xi) for xi in y] for y in ndata1]
    ndata = [x for x in ndata1 if not x == ":"]
    ndata = [filter(None, x) for x in ndata]
    ndata = [x for x in ndata if x != []]
    return ndata
def get_stems(articles):
    stems = collections.defaultdict(list)
    stopwords = get_stop_words()
    stemmer = SnowballStemmer('english')
    for i in articles:
        for word, stem in [(word, stemmer.stem(word))
                           for word in clean_text(i.title + ' ' + i.abstract).split()
                           if word not in stopwords]:
            if stem not in stems[i.id]:
                stems[i.id].append(stem)
    for k in stems:
        stems[k].sort()
    return dict(stems)
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.

    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]

    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"60k", " 60000 ", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)

    # Return a list of words
    return(text)
import argparse
import numpy as np
import pandas as pd
import math
from gensim.models import Word2Vec
from scipy.spatial.distance import cosine
from sentiment_sampling import linear_svm
from utils import noise_generator
from tqdm import tqdm
from random import random, choice
from six.moves import cPickle
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

stop = set(stopwords.words('english'))
snowball_stemmer = SnowballStemmer("english")


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("-m", "--mode", help="random, word2vec, robust", type=str, default="random")
    parser.add_argument("-s", "--save-dir", help="directory with stored robust model", type=str, default="save")
    parser.add_argument("-w",
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from matplotlib import figure
from sklearn.metrics import accuracy_score, fbeta_score, classification_report
from wordcloud import WordCloud
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
stop = stopwords.words("english")
from nltk.stem import SnowballStemmer
ss = SnowballStemmer("english")

data = pd.read_csv('C:/Users/user/Downloads/emails.csv')
data.shape
data.columns
data = data.iloc[0:10000, 3:5]
data.describe()
stop
data.groupby('Class').describe().T
data["Class"].value_counts().plot(kind='pie', explode=[0, 0.1], figsize=(6, 6), autopct='%1.2f%%')
plt.ylabel("Abusive vs Non Abusive")
plt.legend(["Abusive", " Non Abusive"])
plt.show()
def processCluster(Dir):
    global senlist
    global toklist
    global senvec
    senlist = []
    toklist = []
    filelist = os.listdir(Dir)
    articles = ""
    for fil in filelist:
        with open(Dir + "/" + fil) as f:
            text = f.read()
            if '<text>' in text:
                res_tr = r'<text>(.*?)</text>'
                m_tr = re.findall(res_tr, text, re.S | re.M)
                text = m_tr[0]
            articles += " " + text
    articles = filterDoc(articles)
    senStart = []
    senEnd = []
    lenth = len(articles)
    isStart = True
    for i in range(lenth):
        if isStart and articles[i] != ' ':
            senStart.append(i)
            isStart = False
        if articles[i] == '.' or articles[i] == '?' or articles[i] == '!':
            senEnd.append(i)
            isStart = True
    for i in range(len(senEnd)):
        senlist.append(articles[senStart[i]:senEnd[i] + 1])
    stemmer = SnowballStemmer("english")
    for s in senlist:
        toklist.append(s.split(' '))
    tmplist = []
    siglist = ['.', ':', '?', '!', "'s", '"']
    for s in senlist:
        s = s.lower()
        for sig in siglist:
            s = s.replace(sig, '')
        s = s.split(' ')
        tempsen = [stemmer.stem(w) for w in s]
        sen = ""
        for w in tempsen:
            sen += w + " "
        tmplist.append(sen)
    vectorizer = TfidfVectorizer(stop_words='english')
    senvec = vectorizer.fit_transform(tmplist)
    #toklist = vectorizer.inverse_transform(toklist)
    senvec = senvec.toarray()
    print list(senvec[1])
    print len(senvec[1])
    '''
    with open("test", 'w') as fw:
        fw.write(str(len(senlist)))
        fw.write('\n')
        # fw.write(str(len(toklist)) + '\n')
        fw.write(str(toklist) + '\n')
    '''
    return (senlist, toklist, senvec)
# coding: utf-8

# In[5]:

#get_ipython().system(u'pip install --upgrade pip')
#get_ipython().system(u'pip install tensorflow')
#get_ipython().system(u'pip install tflearn')


# In[9]:

# things we need for NLP
import nltk
#nltk.download('punkt')  # to fix the missing 'tokenizers/punkt/PY3/english.pickle' resource
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer("spanish")

# things we need for Tensorflow
import numpy as np
import tflearn
import tensorflow as tf
import random


# In[10]:

# import our chat-bot intents file
import json
with open('c:\\Users\\Lucs\\Desktop\\ChatPy\\intents_esp.json') as json_data:
    intents = json.load(json_data)
#intents


# In[11]:
# Convert words to lower case
keywords = keywords.lower()

# Tokenize document and remove all non-characters
tokenizer = RegexpTokenizer('[a-z]\w+')
tokened_text = tokenizer.tokenize(keywords)

text_no_sw = []
# TODO: Remove stopwords
stop_words = set(stopwords.words('english'))
for word in tokened_text:
    if word not in stop_words:
        text_no_sw.append(word)
# print('before', text_no_sw)

# TODO: Stem words
stemmer = SnowballStemmer('english')
tokens_stemmed = [stemmer.stem(x) for x in text_no_sw]

frequency_dict = {}
for keyword in tokens_stemmed:
    if not keyword in set(frequency_dict.keys()):
        frequency_dict[keyword] = 0
    frequency_dict[keyword] += 1

processed_raw_data[date] = frequency_dict

print('------------- processed keywords/frequencies (news.db) -------------')

##################################################################
# Insert entries into tables
for date in processed_raw_data:
    frequency_dict = processed_raw_data[date]
import os
import time
import spacy
import platform
import functools
import KeyExt.config
from keybert import KeyBERT
from string import punctuation
from nltk.stem import SnowballStemmer
from stempel import StempelStemmer

# Initialize all required stemmers once.
stemmers = {
    'english': SnowballStemmer('english'),
    'french': SnowballStemmer('french'),
    'spanish': SnowballStemmer('spanish'),
    'portuguese': SnowballStemmer('portuguese'),
    'polish': StempelStemmer.default()
}


def load_models():
    """
    Function which loads the english NLP model, and the Keybert model.
    This needs to run once since all models need a few seconds to load.
    """
    return (spacy.load('en_core_web_sm'),
            KeyBERT('distiluse-base-multilingual-cased-v2'))


def preprocess(lis, language):
def clean_text(text, remove_stopwords=False, stem_words=False, count_null_words=True, clean_wiki_tokens=True):
    # Clean the text, with the option to remove stopwords and to stem words.

    # dirty words
    text = text.lower()
    text = re.sub(
        r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)",
        "", text)
    text = re.sub(
        r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}",
        "", text)

    if clean_wiki_tokens:
        # Drop the images
        text = re.sub(r"image:[a-zA-Z0-9]*\.jpg", " ", text)
        text = re.sub(r"image:[a-zA-Z0-9]*\.png", " ", text)
        text = re.sub(r"image:[a-zA-Z0-9]*\.gif", " ", text)
        text = re.sub(r"image:[a-zA-Z0-9]*\.bmp", " ", text)

        # Drop css
        text = re.sub(r"#([A-Fa-f0-9]{6}|[A-Fa-f0-9]{3})", " ", text)
        text = re.sub(r"\{\|[^\}]*\|\}", " ", text)

        # Clean templates
        text = re.sub(r"\[?\[user:.*\]", " ", text)
        text = re.sub(r"\[?\[user:.*\|", " ", text)
        text = re.sub(r"\[?\[wikipedia:.*\]", " ", text)
        text = re.sub(r"\[?\[wikipedia:.*\|", " ", text)
        text = re.sub(r"\[?\[special:.*\]", " ", text)
        text = re.sub(r"\[?\[special:.*\|", " ", text)
        text = re.sub(r"\[?\[category:.*\]", " ", text)
        text = re.sub(r"\[?\[category:.*\|", " ", text)

    for typo, correct in clean_word_dict.items():
        text = re.sub(typo, " " + correct + " ", text)

    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\?", " ? ", text)
    text = re.sub(r"\!", " ! ", text)
    text = re.sub(r"\"", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = replace_numbers.sub(' ', text)
    text = special_character_removal.sub('', text)

    if count_null_words:
        text = text.split()
        for t in text:
            word_count_dict[t] += 1
        text = " ".join(text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)

    return (text)
def text_to_wordlist(text, remove_stop_words=True, stem_words=False):
    # Clean the text, with the option to remove stop_words and to stem words.

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    text = re.sub(r"what's", "", text)
    text = re.sub(r"What's", "", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"I'm", "I am", text)
    text = re.sub(r" m ", " am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"60k", " 60000 ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", " 911 ", text)
    text = re.sub(r"e-mail", "email", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"quikly", "quickly", text)
    text = re.sub(r" usa ", " America ", text)
    text = re.sub(r" USA ", " America ", text)
    text = re.sub(r" u s ", " America ", text)
    text = re.sub(r" uk ", " England ", text)
    text = re.sub(r" UK ", " England ", text)
    text = re.sub(r"india", "India", text)
    text = re.sub(r"switzerland", "Switzerland", text)
    text = re.sub(r"china", "China", text)
    text = re.sub(r"chinese", "Chinese", text)
    text = re.sub(r"imrovement", "improvement", text)
    text = re.sub(r"intially", "initially", text)
    text = re.sub(r"quora", "Quora", text)
    text = re.sub(r" dms ", " direct messages ", text)
    text = re.sub(r"demonitization", "demonetization", text)
    text = re.sub(r"actived", "active", text)
    text = re.sub(r"kms", " kilometers ", text)
    text = re.sub(r"KMs", " kilometers ", text)
    text = re.sub(r" cs ", " computer science ", text)
    text = re.sub(r" upvotes ", " up votes ", text)
    text = re.sub(r" iPhone ", " phone ", text)
    text = re.sub(r"\0rs ", " rs ", text)
    text = re.sub(r"calender", "calendar", text)
    text = re.sub(r"ios", "operating system", text)
    text = re.sub(r"gps", "GPS", text)
    text = re.sub(r"gst", "GST", text)
    text = re.sub(r"programing", "programming", text)
    text = re.sub(r"bestfriend", "best friend", text)
    text = re.sub(r"dna", "DNA", text)
    text = re.sub(r"III", "3", text)
    text = re.sub(r"the US", "America", text)
    text = re.sub(r"Astrology", "astrology", text)
    text = re.sub(r"Method", "method", text)
    text = re.sub(r"Find", "find", text)
    text = re.sub(r"banglore", "Banglore", text)
    text = re.sub(r" J K ", " JK ", text)

    # Remove punctuation from text
    text = ''.join([c for c in text if c not in punctuation])

    # Optionally, remove stop words
    if remove_stop_words:
        text = text.split()
        text = [w for w in text if not w in stop_words]
        #text = [w for w in text if not w in ['a', 'an', 'the']]
        text = " ".join(text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)

    # Return a list of words
    return (text)
print(w)

# Porter stemmer
pStemmer = PorterStemmer()
print("Porter stemming output \n")
for p in wordtokens:
    print(pStemmer.stem(str(p)))

# Lancaster stemmer
lStemmer = LancasterStemmer()
print(" Lancaster stemming output\n")
for t in wordtokens:
    print(lStemmer.stem(str(t)))

# Snowball stemmer
sStemmer = SnowballStemmer('english')
print("Snowball stemming output \n")
for s in wordtokens:
    print(sStemmer.stem(str(s)))

# Parts of speech
print("Parts of Speech \n")
print(nltk.pos_tag(wordtokens))

# Lemmatizer
print("Lemmatizer \n")
lemmatizer = WordNetLemmatizer()
for l in wordtokens:
    print(lemmatizer.lemmatize(str(l)))

# Trigram
from utilities.db_manager import DBManager

# Import Packages for NLP
import gensim
from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
# import nltk.stem as stemmer
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')

stemmer = SnowballStemmer('english')

# DWH = os.getenv('MIMIC_DWH')
# engine = create_engine(DWH)

pd.options.display.max_columns = 1000
pd.options.display.max_rows = 1000
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# Currently set limit to 200,000 for just testing purposes. Will want to remove later.
QUERY = """
select subject_id, hadm_id, chartdate, text
def lemmatize_stemming(text): stemmer = SnowballStemmer("english") #text = text.decode('utf-8') return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def my_clean(text, stops=False, stemming=False, minLength=2):
    text = str(text)
    text = re.sub(r" US ", " u s ", text)
    text = text.lower().split()
    if stemming and stops:
        text = [word for word in text if word not in stopwords.words('english')]
        wordnet_lemmatizer = WordNetLemmatizer()
        englishStemmer = SnowballStemmer("english", ignore_stopwords=False)
        text = [englishStemmer.stem(word) for word in text]
        text = [wordnet_lemmatizer.lemmatize(word) for word in text]
        text = [word for word in text if word not in stopwords.words('english')]
    elif stops:
        text = [word for word in text if word not in stopwords.words('english')]
    elif stemming:
        wordnet_lemmatizer = WordNetLemmatizer()
        englishStemmer = SnowballStemmer("english", ignore_stopwords=False)
        text = [englishStemmer.stem(word) for word in text]
        text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    text = " ".join(text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"don't", "do not ", text)
    text = re.sub(r"aren't", "are not ", text)
    text = re.sub(r"isn't", "is not ", text)
    text = re.sub(r"%", " percent ", text)
    text = re.sub(r"that's", "that is ", text)
    text = re.sub(r"doesn't", "does not ", text)
    text = re.sub(r"he's", "he is ", text)
    text = re.sub(r"she's", "she is ", text)
    text = re.sub(r"it's", "it is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r" e - mail ", " email ", text)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ", text)
    text = re.sub(r";", " ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ", text)
    text = re.sub(r"\+", " ", text)
    text = re.sub(r"\-", " ", text)
    text = re.sub(r"\=", " ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r" j k ", " jk ", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", text)  # Removes every number
    text = text.lower().split()
    text = [w for w in text if len(w) >= minLength]
    if stemming and stops:
        text = [word for word in text if word not in stopwords.words('english')]
        wordnet_lemmatizer = WordNetLemmatizer()
        englishStemmer = SnowballStemmer("english", ignore_stopwords=False)
        text = [englishStemmer.stem(word) for word in text]
        text = [wordnet_lemmatizer.lemmatize(word) for word in text]
        text = [word for word in text if word not in stopwords.words('english')]
    elif stops:
        text = [word for word in text if word not in stopwords.words('english')]
    elif stemming:
        wordnet_lemmatizer = WordNetLemmatizer()
        englishStemmer = SnowballStemmer("english", ignore_stopwords=False)
        text = [englishStemmer.stem(word) for word in text]
        text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    text = " ".join(text)
    return text
from sklearn.model_selection import train_test_split
import collections
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import classification_report, roc_curve, auc, confusion_matrix
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
from textblob import TextBlob
from langdetect import detect_langs
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
import statistics

spanishStemmer = SnowballStemmer("spanish", ignore_stopwords=True)
exclude = set(string.punctuation)

WORD2VECMODEL = "/Users/frandm/Documents/Tesis/Code/SBW-vectors-300-min5.bin.gz"
THRESHOLD = 15
VOCAB_SIZE = 5000
stop_words = set(stopwords.words('spanish'))


def dblite_connect(dbname):
    conn = sqlite3.connect(dbname)
    return conn.cursor(), conn
def main():
    parser = argparse.ArgumentParser(description='PyTorch PennTreeBank RNN/LSTM Language Model')
    parser.add_argument('--data', type=str, default='../data/', help='location of the data corpus')
    parser.add_argument('--presaved', action='store_true', help='use presaved data')
    parser.add_argument('--glovedata', type=str, default='../data/glove.6B', help='location of the pretrained glove embeddings')
    parser.add_argument('--din', type=int, default=30, help='length of LSTM')
    parser.add_argument('--demb', type=int, default=300, help='size of word embeddings')
    parser.add_argument('--dhid', type=int, default=300, help='number of hidden units per layer')
    parser.add_argument('--dlin', type=int, default=500, help='number linear transformation nodes')
    parser.add_argument('--dout', type=int, default=2, help='number of output classes')
    parser.add_argument('--nlayers', type=int, default=1, help='number of layers')
    parser.add_argument('--lr', type=float, default=0.001, help='initial learning rate')
    parser.add_argument('--wd', type=float, default=0.0, help='adam l2 weight decay')
    parser.add_argument('--clip', type=float, default=0.25, help='gradient clipping')
    parser.add_argument('--embinit', type=str, default='random', help='embedding weight initialization type')
    parser.add_argument('--decinit', type=str, default='random', help='decoder weight initialization type')
    parser.add_argument('--hidinit', type=str, default='random', help='recurrent hidden weight initialization type')
    parser.add_argument('--dropout', type=float, default=0.0, help='dropout applied to layers (0 = no dropout)')
    parser.add_argument('--rnn', type=str, default='lstm', help='lstm or gru')
    parser.add_argument('--epochs', type=int, default=40, help='upper epoch limit')
    parser.add_argument('--batchsize', type=int, default=20, metavar='N', help='batch size')
    parser.add_argument('--seed', type=int, default=3, help='random seed')
    parser.add_argument('--vocabsize', type=int, default=200000, help='random seed')
    parser.add_argument('--optimizer', action='store_true', help='use ADAM optimizer')
    parser.add_argument('--pipeline', action='store_true', help='use pipeline file')
    parser.add_argument('--psw', type=int, default=1, help='remove stop words')
    parser.add_argument('--ppunc', action='store_true', help='remove punctuation')
    parser.add_argument('--pntok', action='store_true', help='use number tokens')
    parser.add_argument('--pkq', action='store_true', help='keep question words')
    parser.add_argument('--stem', action='store_true', help='use stemmer')
    parser.add_argument('--lemma', action='store_true', help='use lemmatizer')
    parser.add_argument('--bidir', action='store_false', help='bidirectional')
    parser.add_argument('--freezeemb', action='store_false', help='freezes embeddings')
    parser.add_argument('--cuda', action='store_true', help='use CUDA')
    parser.add_argument('--loginterval', type=int, default=100, metavar='N', help='report interval')
    parser.add_argument('--save', type=str, default='', help='path to save the final model')
    args = parser.parse_args()

    if not args.presaved:
        pipe = None
        if args.pipeline:
            stemmer, lemmatizer = None, None
            if args.stem:
                stemmer = SnowballStemmer('english')
            elif args.lemma:
                lemmatizer = WordNetLemmatizer()
            pipe = functools.partial(pipeline, rm_stop_words=args.psw, rm_punc=args.ppunc,
                                     number_token=args.pntok, keep_questions=args.pkq,
                                     stemmer=stemmer, lemmatizer=lemmatizer)

        corpus = TacoText(args.vocabsize, lower=True, vocab_pipe=pipe)
        print('Loading Data')
        # train_data = pd.read_csv(args.data)
        # Shuffle order of training data
        train_data = pd.read_csv('../data/train_data_shuffle.csv')
        val_data = pd.read_csv('../data/val_data_shuffle.csv')

        print('Cleaning and Tokenizing')
        q1, q2, y = clean_and_tokenize(train_data, corpus)
        q1_val, q2_val, y_val = clean_and_tokenize(val_data, corpus)

        train_feat = list(map(feature_gen, zip(q1, q2)))
        val_feat = list(map(feature_gen, zip(q1_val, q2_val)))
        scalar = preprocessing.StandardScaler()
        train_feat = scalar.fit_transform(train_feat)
        val_feat = scalar.transform(val_feat)

        print('Piping Data')
        q1 = corpus.pipe_data(q1)
        q2 = corpus.pipe_data(q2)
        q1_val = corpus.pipe_data(q1_val)
        q2_val = corpus.pipe_data(q2_val)
        corpus.gen_vocab(q1 + q2 + q2_val + q1_val)

        n_feat = train_feat.shape[1]
        d_in = args.din
        feat_max = int(np.max([n_feat, d_in]))

        X = torch.Tensor(len(train_data), 1, 3, feat_max)
        X[:, 0, 0, :] = torch.from_numpy(corpus.pad_numericalize(q1, feat_max)).long()
        X[:, 0, 1, :] = torch.from_numpy(corpus.pad_numericalize(q2, feat_max)).long()
        X[:, 0, 2, :n_feat] = torch.from_numpy(np.array(train_feat))
        y = torch.from_numpy(np.array(y)).long()

        X_val = torch.Tensor(len(val_data), 1, 3, feat_max)
        X_val[:, 0, 0, :] = torch.from_numpy(corpus.pad_numericalize(q1_val, feat_max)).long()
        X_val[:, 0, 1, :] = torch.from_numpy(corpus.pad_numericalize(q2_val, feat_max)).long()
        X_val[:, 0, 2, :n_feat] = torch.from_numpy(np.array(val_feat))
        y_val = torch.from_numpy(np.array(y_val)).long()

        torch.save(X, '../data/X_feat.t')
        torch.save(y, '../data/y_feat.t')
        torch.save(X_val, '../data/X_val_feat.t')
        torch.save(y_val, '../data/y_val_feat.t')
        with open(args.save + '_corpus_feat.pkl', 'wb') as corp_f:
            pkl.dump(corpus, corp_f, protocol=pkl.HIGHEST_PROTOCOL)
    else:
        n_feat = 22
        d_in = args.din
        print('Loading Presaved Data')
        X = torch.load(args.data + 'X_feat.t')
        y = torch.load(args.data + 'y_feat.t')
        X_val = torch.load(args.data + 'X_val_feat.t')
        y_val = torch.load(args.data + 'y_val_feat.t')
        with open('../data/corpus_feat.pkl', 'rb') as f:
            corpus = pkl.load(f)

    if args.cuda:
        X, y = X.cuda(), y.cuda()
        X_val, y_val = X_val.cuda(), y_val.cuda()

    print('Generating Data Loaders')
    #X.size len(train_data),1,2,fix_length
    train_dataset = TensorDataset(X, y)
    train_loader = DataLoader(train_dataset, batch_size=args.batchsize, shuffle=True)
    valid_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=args.batchsize, shuffle=False)

    ntokens = len(corpus)
    glove_embeddings = None
    if args.embinit == 'glove':
        assert args.demb in (50, 100, 200, 300)
        glove_embeddings = get_glove_embeddings(args.glovedata, corpus.dictionary.word2idx, ntokens, args.demb)

    model = ConvRNNFeat(args.din, args.dhid, args.dout, args.demb, args.dlin, args.vocabsize,
                        args.dropout, args.embinit, args.hidinit, args.decinit,
                        glove_embeddings, args.cuda, args.rnn, args.bidir, n_feat)
    if args.cuda:
        model.cuda()

    criterion = nn.NLLLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wd)

    model_config = '\t'.join([str(x) for x in (torch.__version__, args.rnn, args.bidir, args.clip,
                                               args.nlayers, args.din, args.demb, args.dhid, args.dlin,
                                               args.embinit, args.decinit, args.hidinit, args.dropout,
                                               args.optimizer, args.lr, args.wd, args.vocabsize,
                                               args.pipeline, args.psw, args.ppunc, args.pntok,
                                               args.pkq, args.stem, args.lemma)])

    print('Pytorch | RNN | BiDir | Clip | #Layers | InSize | EmbDim | HiddenDim | LinearDim | EncoderInit | DecoderInit | WeightInit | Dropout | Optimizer| LR | WeightDecay | VocabSize | pipeline | stop | punc | ntoken | keep_ques | stem | lemma')
    print(model_config)

    # best_val_acc = 0.78
    best_ll = 0.5
    for epoch in range(args.epochs):
        model.train()
        total_cost = 0
        start_time = time.time()
        cur_loss = 0
        for ind, (qs, duplicate) in enumerate(train_loader):
            model.zero_grad()
            pred = model(qs[:, 0, 0, :d_in].long(), qs[:, 0, 1, :d_in].long(), qs[:, 0, 2, :n_feat])
            if args.cuda:
                pred = pred.cuda()
                duplicate = duplicate.cuda()
            duplicate = Variable(duplicate)
            loss = criterion(pred, duplicate)
            loss.backward()
            clip_grad_norm(model.parameters(), args.clip)
            if optimizer:
                optimizer.step()
            else:
                for p in model.parameters():
                    p.data.add_(-args.lr, p.grad.data)
            total_cost += loss.data[0]
            cur_loss += loss.data[0]
            if ind % args.loginterval == 0 and ind > 0:
                cur_loss = loss.data[0] / args.loginterval
                elapsed = time.time() - start_time
                print('| Epoch {:3d} | {:5d}/{:5d} Batches | ms/batch {:5.2f} | '
                      'Loss {:.6f}'.format(epoch, ind, len(X) // args.batchsize,
                                           elapsed * 1000.0 / args.loginterval, cur_loss))
                start_time = time.time()
                cur_loss = 0

        model.eval()

        train_acc, train_ll = evaluate(model, train_loader, args.cuda, d_in, n_feat)
        val_acc, val_ll = evaluate(model, valid_loader, args.cuda, d_in, n_feat)

        # if args.save and (val_acc > best_val_acc):
        if args.save and (val_ll < best_ll):
            with open(args.save + '_corpus.pkl', 'wb') as corp_f:
                pkl.dump(corpus, corp_f, protocol=pkl.HIGHEST_PROTOCOL)
            torch.save(model.cpu(), args.save)
            torch.save(model.cpu().state_dict(), args.save + ".state_dict")
            with open(args.save + ".state_dict.config", "w") as f:
                f.write(model_config)
            best_ll = val_ll
            if args.cuda:
                model.cuda()

        print('Epoch: {} | Train Loss: {:.4f} | Train Accuracy: {:.4f} | Val Accuracy: {:.4f} | Train LL: {:.4f} | Val LL: {:.4f}'.format(
            epoch, total_cost, train_acc, val_acc, train_ll, val_ll))
        print('-' * 89)
if __name__ == "__main__": train_df = pd.read_csv('data/training_tweets_es.txt', sep='\t', header=0) classlabels = pd.read_csv('data/training_truth_es.txt', header=0) raw_docs_train = train_df['phrase'].values classlabels1 = classlabels['label'].values #print(classlabels1.shape) #print(len(raw_docs_train)) #print(raw_docs_train[4318]) stop_words = set(stopwords.words('dutch')) stop_words.update( ['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}']) stemmer = SnowballStemmer('dutch') print "pre-processing train docs..." processed_docs_train = [] for doc in raw_docs_train: doc = doc.decode("utf8") tokens = word_tokenize(doc) filtered = [word for word in tokens if word not in stop_words] stemmed = [stemmer.stem(word) for word in filtered] processed_docs_train.append(stemmed) processed_docs_all = processed_docs_train dictionary = corpora.Dictionary(processed_docs_all) dictionary_size = len(dictionary.keys()) print "dictionary size: ", dictionary_size
# print(cleaned)
# stemmer = SnowballStemmer("english")
# print(stemmer.stem("hygenist"))
# print(" ".join([stemmer.stem(word) for word in cleaned.split()]))
# print(review)
# v.parse_words()
# v.save_vocabulary()
# t = Tokenizer("tokenizer", "vocabulary.txt")
# print(len(t.word2Index))
# print(clean_sentence("I really enjoyed my stay at this hotel on 5/12/2020!! This is jibberish abcd but here is this cool website: https://www.berkeley.edu"))

sent = "I really enjoyed my stay at this hotel on five one two two zero two zero ! ! I am sure we will come again! Do not mind this jibberish abcd but here is this cool website :"
stemmer = SnowballStemmer("english")
cleaned_review = " ".join([stemmer.stem(word) for word in sent.split()])
tokenized_review = tokenizer.word_tokenizer(cleaned_review)
print(tokenized_review)

# bpe = ByteBPETokenizer("yelp_bpe/yelp-bpe-vocab.json", "yelp_bpe/yelp-bpe-merges.txt")
# enc = bpe.encode("I really enjoyed my stay at this hotel on 5/12/2020!! This is jibberish abcd but here is this cool website: https://www.berkeley.edu")
# print(enc.tokens)
# bpe.trainBPE(paths=["cleaned_reviews.txt"], vocab_size=25000)
tknzr = TweetTokenizer()
# make a list of tokens instead of a list of tweets
tokenizedTweets = [tknzr.tokenize(i) for i in tweets]
print("Tokenized Tweets: ", tokenizedTweets)

# remove stop words (to get these stopwords, uncomment the following lines; only has to be run once)
#import nltk
#nltk.download("stopwords")
stopwords = stopwords.words('english')

# remove every word from tokenized tweets which is in stopwords (keep the rest)
filteredTweets = [[word for word in tweet if word not in stopwords] for tweet in tokenizedTweets]
print("Filtered Tweets: ", filteredTweets)

# Stemming
st = SnowballStemmer("english")
stemmedTweets = [[st.stem(word) for word in tweet if word not in stopwords] for tweet in tokenizedTweets]
print("Stemmed Tweets: ", stemmedTweets)

flatstemmedTweets = [" ".join(tweets) for tweets in stemmedTweets]
print("Stemmed Tweets flattened: ", flatstemmedTweets)

# PART 2: Noise Removal

# TF-IDF representation
vectorizer = TfidfVectorizer(min_df=1)
tweetsTF = vectorizer.fit_transform(flatstemmedTweets)

# DBScan
# function using DBSCAN function from scikit-learn
def lemmatize_stemming(text): stemmer = SnowballStemmer("english") return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
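# --- Illustrative usage (editor's addition, not from the original source) ---
# Assumes WordNetLemmatizer and SnowballStemmer are imported and the wordnet data is downloaded.
# The word is lemmatized as a verb first ("went" -> "go") and the lemma is then stemmed.
print(lemmatize_stemming("went"))   # -> "go"
print(lemmatize_stemming("going"))  # -> "go"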
class TextSlack(BaseEstimator, TransformerMixin):
    def __init__(self, variety='BrE', user_abbrevs={}, lang='english'):
        try:
            self.variety = variety
            self.user_abbrevs = user_abbrevs
            self.lang = lang
            if self.lang in stopwords.fileids() and self.lang in SnowballStemmer.languages:
                self.stop_words = stopwords.words(lang)
            else:
                raise LanguageNotFoundException(
                    '{} is currently not supported by textslack.'.format(self.lang),
                    'Keep checking for support in the future updates.')
            self.lemmatizer = WordNetLemmatizer()
            self.stemmer = SnowballStemmer(lang, ignore_stopwords=True)
        except LanguageNotFoundException as e:
            print(str(e))
            print('Details: {}'.format(e.details))

    def fit(self, X, y=None):
        return self

    def transform(self, X, *_):
        if isinstance(X, pd.Series):
            return X.apply(self._preprocess_text)
        elif isinstance(X, list):
            return [self._preprocess_text(x) for x in X]
        else:
            return self._preprocess_text(X)

    def _preprocess_text(self, text):
        if self.lang == 'english':
            normalised_text = self._normalise(text)
            normalised_text = re.sub(' +', ' ', normalised_text)
            words = regexp_tokenize(normalised_text.lower(), r'[A-Za-z]+')
            removed_punct = self._remove_punct(words)
            removed_stopwords = self._remove_stopwords(removed_punct)
            return self._lemmatize(removed_stopwords)
        else:
            words = word_tokenize(text.lower())
            removed_punct = self._remove_punct(words)
            removed_stopwords = self._remove_stopwords(removed_punct)
            return ' '.join([w for w in removed_stopwords])

    def _normalise(self, text):
        try:
            return ' '.join(
                normalise(word_tokenize(text),
                          variety=self.variety,
                          user_abbrevs=self.user_abbrevs,
                          verbose=False))
        except:
            return text

    def _remove_punct(self, words):
        return [w for w in words if w not in string.punctuation]

    def _remove_stopwords(self, words):
        return [w for w in words if w not in self.stop_words and len(w) > 1]

    def _lemmatize(self, words):
        return ' '.join([self.lemmatizer.lemmatize(w, pos='v') for w in words])

    def _stem(self, words):
        return ' '.join([self.stemmer.stem(w) for w in words])

    def extract_nouns(self, text):
        try:
            if self.lang == 'english':
                processed_text = self._preprocess_text(text)
                pos_tags, _ = self._blob_features(processed_text)
                return ' '.join([w for w, p in pos_tags if p == 'NN'])
            else:
                raise LanguageNotFoundException(
                    'Sorry for the inconvenience, textslack is still learning {}.'.format(self.lang),
                    'Keep checking for support in the future updates.')
        except LanguageNotFoundException as e:
            print(str(e))
            print('Details: {}'.format(e.details))

    def extract_verbs(self, text):
        try:
            if self.lang == 'english':
                processed_text = self._preprocess_text(text)
                pos_tags, _ = self._blob_features(processed_text)
                return ' '.join([w for w, p in pos_tags if p == 'VB'])
            else:
                raise LanguageNotFoundException(
                    'Sorry for the inconvenience, textslack is still learning {}.'.format(self.lang),
                    'Keep checking for support in the future updates.')
        except LanguageNotFoundException as e:
            print(str(e))
            print('Details: {}'.format(e.details))

    def extract_adjectives(self, text):
        try:
            if self.lang == 'english':
                processed_text = self._preprocess_text(text)
                pos_tags, _ = self._blob_features(processed_text)
                return ' '.join([w for w, p in pos_tags if p == 'JJ'])
            else:
                raise LanguageNotFoundException(
                    'Sorry for the inconvenience, textslack is still learning {}.'.format(self.lang),
                    'Keep checking for support in the future updates.')
        except LanguageNotFoundException as e:
            print(str(e))
            print('Details: {}'.format(e.details))

    def extract_adverbs(self, text):
        try:
            if self.lang == 'english':
                processed_text = self._preprocess_text(text)
                pos_tags, _ = self._blob_features(processed_text)
                return ' '.join([w for w, p in pos_tags if p == 'RB'])
            else:
                raise LanguageNotFoundException(
                    'Sorry for the inconvenience, textslack is still learning {}.'.format(self.lang),
                    'Keep checking for support in the future updates.')
        except LanguageNotFoundException as e:
            print(str(e))
            print('Details: {}'.format(e.details))

    def sentiment(self, text):
        try:
            if self.lang == 'english':
                processed_text = self._preprocess_text(text)
                _, polarity = self._blob_features(processed_text)
                return 'pos' if polarity > 0.0 else 'neg' if polarity < 0.0 else 'neu'
            else:
                raise LanguageNotFoundException(
                    'Sorry for the inconvenience, textslack is still learning {}.'.format(self.lang),
                    'Keep checking for support in the future updates.')
        except LanguageNotFoundException as e:
            print(str(e))
            print('Details: {}'.format(e.details))

    def _blob_features(self, text):
        blob = TextBlob(text)
        return blob.tags, blob.polarity

    def word_occurances(self, word, text):
        word_count_dic = dict(Counter([w for w in word_tokenize(text)]))
        return [c for w, c in word_count_dic.items() if w == word][0]
def stemmization(text, stemmer=SnowballStemmer('russian')):
    stem = [stemmer.stem(w) for w in remove_punctuation(text).split()]
    return ' '.join(stem)
def __init__(self):
    self.bl_tokenizer = LineTokenizer()
    self.re_tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
    self.stemmer = SnowballStemmer('english')
    self.NGRAM_RANGE = 3
import os
import re
import nltk
import argparse
import time

import torch
from torch.autograd import Variable
import torch.nn as nn

from models import NLINet
from mutils import get_optimizer
from data import get_nli, get_batch

from nltk.stem import SnowballStemmer
snowball_stemmer1 = SnowballStemmer('spanish')
snowball_stemmer2 = SnowballStemmer('english')
snowball_stemmer1.stem
snowball_stemmer2.stem

# from nltk.corpus import stopwords
# stops1 = set(stopwords.words("spanish"))
# stops2 = set(stopwords.words("english"))

#################### READ DATA ####################
df_train_en_sp = pd.read_csv('./input/cikm_english_train_20180516.txt', sep=' ', header=None, error_bad_lines=False)
df_train_sp_en = pd.read_csv('./input/cikm_spanish_train_20180516.txt', sep=' ', header=None, error_bad_lines=False)
df_train_en_sp.columns = ['english1', 'spanish1', 'english2', 'spanish2', 'result']
plt.figure(figsize=(20, 5))
lemmatized_words_freqdist.plot(len(lemmatized_words_freqdist.most_common(50)))
plt.show()

# Convert Lemmatized Words FreqDist Object into a Dictionary
lemmatized_words_freq_dict = dict(lemmatized_words_freqdist)
print(len(lemmatized_words_freq_dict))
print(lemmatized_words_freq_dict)

# Further Cleaning Using NLTK-Stem (From Word List after removing Stopwords)

# Snowball Stemmer
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('english')
words_stemmed = [stemmer.stem(word) for word in words_no_stopwords]
print(len(words_stemmed))
print(words_stemmed[:100])

# Calculate the Frequency of Words
stemmed_words_freqdist = nltk.FreqDist(words_stemmed)
print(len(stemmed_words_freqdist))
print(stemmed_words_freqdist.items())
print(stemmed_words_freqdist.most_common(50))

import matplotlib.pyplot as plt
plt.figure(figsize=(20, 5))
stemmed_words_freqdist.plot(len(stemmed_words_freqdist.most_common(50)))
tokens = convertStemms(tokensStop)
print(text[:100])
vocabulary = getVocabulary(text)
positions = initializeContext(tokens, vocabulary)  # Initialize context

# Get contexts
contexts = {}
print("Context:")
for term in vocabulary:
    contexts[term] = getContext(term, positions, 4, tokens)

# Get frequency
vectors = {}
vectors = getFrecuency(vocabulary, contexts)

word = "grande"
ss = SnowballStemmer("spanish")
stemWord = ss.stem(word)

similitud = {}
similitud = getSimilitud(vocabulary, vectors, stemWord)

l = list()
for key, val in similitud.items():
    l.append((val, key))
l.sort(reverse=True)
print(l[:10])

createFileDic(nameFile, l)
import nltk
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('english')
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

with open('trial2.txt', 'r') as myfile:
    data2 = myfile.read().replace('\n', ' ')

sp = set(stopwords.words("english"))
variable = nltk.word_tokenize(data2)
sp2 = [stemmer.stem(w) for w in sp]
variable2 = [stemmer.stem(w) for w in variable]
filtered_sentence = [w for w in variable2 if not w in sp2]
a = " ".join(filtered_sentence)
sent = nltk.sent_tokenize(a)
b = "\n".join(sent)
open('naya.txt', 'w').write(b)
def convertStemms(tokens):
    ss = SnowballStemmer("spanish")
    text = []
    for t in tokens:
        text.append(ss.stem(t))
    return text
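# --- Illustrative usage (editor's addition, not from the original source) ---
# Assumes SnowballStemmer is imported. The Spanish stemmer strips inflectional endings,
# so singular and plural forms are expected to collapse to the same stem.
print(convertStemms(["grande", "grandes"]))  # both items should share one stem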
from gensim import corpora, models, similarities
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.wrappers import LdaMallet
from gensim.models import CoherenceModel
from gensim.models.phrases import Phrases, Phraser
from gensim.parsing import preprocessing
from gensim.utils import simple_preprocess
import contractions
import os
from tqdm import tqdm
from pprint import pprint
import pickle
import spacy
#from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

snowball = SnowballStemmer(language='english')
# disable parse, named entity recognition to speed it up.
nlp = spacy.load('en', disable=['parser', 'ner'])

# Add new stop words: https://stackoverflow.com/questions/41170726/add-remove-stop-words-with-spacy
# |= : syntactic sugar for taking a union with the set {}
nlp.Defaults.stop_words |= {
    'table', 'ref', 'formula', 'citation', 'cit', 'references',
    'fig', 'figure', 'abstract', 'introduction', 'description',
    'conclusion', 'results', 'discussion'
}

# Load the Mallet LDA Java program
mallet_path = '/home/ashwath/mallet-2.0.8/bin/mallet'

'''
nltk.download('wordnet')

data = pd.read_csv('abcnews-date-text.csv', error_bad_lines=False)
data_text = data[['headline_text']]
data_text['index'] = data_text.index
documents = data_text

print(len(documents))
print(documents[:5])

np.random.seed(2018)

print(WordNetLemmatizer().lemmatize('went', pos='v'))

### Performing stem operation
stemmer = SnowballStemmer('english')
original_words = ['caresses', 'flies', 'dies', 'mules', 'denied', 'died', 'agreed', 'owned',
                  'humbled', 'sized', 'meeting', 'stating', 'siezing', 'itemization',
                  'sensational', 'traditional', 'reference', 'colonizer', 'plotted']
singles = [stemmer.stem(plural) for plural in original_words]
chk = pd.DataFrame(data={'original word': original_words, 'stemmed': singles})
print(chk)


def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))


def preprocess(text):
        word_counts[word] = 1
    else:
        word_counts[word] += 1

print("Size of Vocabulary:", len(word_counts))

# Create a list of tuples of unique words and their corresponding frequencies
word_counts_list = [(key, value) for key, value in word_counts.items()]

# Sort the list of word_counts in descending order
word_counts_list = sorted(word_counts_list, key=lambda x: x[1], reverse=True)

# Print the top 100 words
print(word_counts_list[:100])

# Define a snowball stemmer object for English
stemmer = SnowballStemmer("english")

# Load GloVe's embeddings
embeddings_index = {}
with open(directory + folder + "glove.840B.300d.txt", encoding='utf-8') as f:
    for line in f:
        values = line.split(' ')
        word = stemmer.stem(values[0])
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embedding

print('Word embeddings:', len(embeddings_index))

import numpy as np
import json
printall += "Fichero " + docIndex.get(d) + "\n" # Obtain article and print title article = re.split(delimiter_noticia, data)[p + 1] printall += re.split(delimiter_title, article)[1] + "\n" cont += 1 if len(res) <= 2: # Print whole article text = re.split(delimiter_text, article)[1] printall += text + "\n" elif len(res) <= 5: # Print snippets toprint = snippet(re.split(delimiter_text, article)[1], wordlist) printall += toprint + "\n" # Print number of results and timing total_time = time.time() - start_time printall += "%d resultados obtenidos en %.9f segundos\n" % (len(res), total_time) return printall print("Loading mini_enero.data...") # Retrieve data from file with open("mini_enero.data", "rb") as f: (index, docIndex, titleIndex, catIndex, dateIndex, universe, stems, permuterm) = pickle.load(f) f.close() print("Loaded!") # Prepare variables stemmer = SnowballStemmer('spanish')