def text_summarization_main(ORIGINAL_TEXT):
    STOPWORDS = set(stopwords.words('english'))
    STOPWORDS.add("-")
    ORIGINAL_TEXT = str(ORIGINAL_TEXT)
    # Mark sentence boundaries with a sentinel token before stripping punctuation.
    intermedia_text = ORIGINAL_TEXT.lower().replace(". ", " qwertyuiop")
    intermedia_text = re.sub('[^a-zA-Z]', ' ', intermedia_text)
    intermedia_text = re.sub(r'\s+', ' ', intermedia_text)
    intermedia_text = intermedia_text.split(" qwertyuiop")
    sentence_count = len(intermedia_text)
    sum_word_count = 0
    for c, text in enumerate(intermedia_text):
        intermedia_text[c] = ' '.join(
            [word for word in text.split() if word not in STOPWORDS])
        sum_word_count += len(intermedia_text[c].split(" "))
    average_sentence_word_count = sum_word_count / sentence_count
    sentence_scores = get_text_weighted_score(intermedia_text,
                                              average_sentence_word_count)
    original_dict = {}
    ORIGINAL_TEXT = ORIGINAL_TEXT.split(". ")
    for i, sentences in enumerate(sentence_scores.items()):
        original_dict[ORIGINAL_TEXT[i]] = sentences[1]
    # Keep the ten highest-scoring sentences, in score order.
    sorted_sentences = sorted(original_dict.items(), key=lambda x: x[1], reverse=True)
    final_list = []
    for i, s in enumerate(sorted_sentences):
        if i < 10:
            final_list.append(s[0])
    return final_list
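# The summarizer above relies on get_text_weighted_score, which is not defined
# in this section. A minimal, hypothetical sketch of such a scorer, assuming a
# simple frequency-based sentence score (the project's real implementation may
# differ):
def get_text_weighted_score(sentences, average_sentence_word_count):
    # Count how often each remaining (non stop-word) token occurs in the text.
    word_freq = {}
    for sentence in sentences:
        for word in sentence.split():
            word_freq[word] = word_freq.get(word, 0) + 1
    # Score each sentence by its summed word frequencies, normalised so that
    # sentences much longer than average do not dominate.
    scores = {}
    for i, sentence in enumerate(sentences):
        words = sentence.split()
        if not words:
            scores[i] = 0.0
            continue
        scores[i] = sum(word_freq[w] for w in words) / max(
            len(words), average_sentence_word_count)
    return scores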
def pre_process_text(ORIGINAL_TEXT):
    """Polishes text: lowercases, drops stop words, and strips non-letters."""
    STOPWORDS = set(stopwords.words('english'))
    STOPWORDS.add("-")
    frp = []
    for c in ORIGINAL_TEXT:
        reg = c.lower()
        reg = ' '.join([word for word in reg.split() if word not in STOPWORDS])
        reg = re.sub('[^a-zA-Z]', ' ', reg)
        reg = re.sub(r'\s+', ' ', reg)
        frp.append(reg)
    return frp
def preprocess(self, text, min_len=2, max_len=240, remove_common=False):
    '''
    Function to remove stop words and perform lemmatization.
    INPUT:
        - text (str): tweet text.
        - min_len (int): words with fewer characters than min_len will be removed.
        - max_len (int): words with more characters than max_len will be removed.
        - remove_common (bool): add common words in the corpus to the stopwords list.
    OUTPUT: list of cleaned, lemmatized tokens
    '''
    result = []
    stopwords = set(STOPWORDS)
    spanish = self._get_spanish_stopwords()
    stopwords.update(spanish)
    stopwords.update(['http', 'f**k', 'rt'])
    if remove_common:
        stopwords.update(['google', 'apple', 'twitter', 'microsoft'])
    for token in gensim.utils.simple_preprocess(text, min_len=min_len, max_len=max_len):
        if token not in stopwords:
            result.append(self._lemmatize_stemming(token))
    return result
def make_stop_words():
    global stop_words
    letters = list('abcdefghijklmnopqrstuvwxyz')
    numbers = list('0123456789')
    words = ['oz', 'ml', 'pour', 'poured', 'bottle', 'can', 'ounce',
             'bomber', 'botttle', 'stubby', 'ouncer', 'pouring', 'growler', 'snifter',
             'tulip', 'bottled', 'brewery', 'pint', 'glass', 'cap', 'cork']
    stopwords = stop_words.union(set(letters)).union(set(numbers)).union(set(words))
    my_stop_words = text.ENGLISH_STOP_WORDS.union(stopwords)
    return my_stop_words
def make_wordcloud(recipe_list):
    fig = plt.figure()
    stop_words = stopwords.words('english')
    stop_words.extend(['i', 'ive', "i've", 'didnt', 'them', 'little', 'use', 'added',
                       'good', 'great', 'think', 'taste', 'recipe', 'used', 'made',
                       'make', 'still', 'also', 'baked', 'bake', 'thank', 'thanks', 'cup'])
    stop_words = STOPWORDS.union(set(stop_words))
    review_list = interactions[interactions['recipe_id'].isin(recipe_list.id)]['review']
    text = " ".join(str(review) for review in review_list)
    wordcloud = WordCloud(stopwords=stop_words, background_color="white").generate(text)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    fig.savefig('/images/wordcloud.png')
def lemm_tokenize_doc_spacy_pos(doc):
    '''
    INPUT: string that corresponds to a document in a raw corpus.
    OUTPUT: (1) a list of tokens that corresponds to a corpus document.
    Strings are byte decoded; punctuation, digits, and newlines are removed;
    words are lowered and lemmatized (brought back to their 'base' form);
    only nouns and verbs are kept; non-words and stop-words are removed.
    PACKAGE USED: spaCy
    '''
    # decode bytes to utf-8 from doc
    ascii_doc = unidecode(doc.decode('utf-8'))
    # remove punctuation, digits, newlines, and lower the text
    clean_doc = ascii_doc.translate(None, punctuation).translate(
        None, digits).replace('\n', '').lower()
    # spaCy expects a unicode object
    spacy_doc = nlp(clean_doc.decode('utf-8'))
    # lemmatize, only keep nouns and verbs, transform to ascii as will no longer use spaCy
    noun_tokens = [
        unidecode(token.lemma_) for token in spacy_doc
        if token.pos_ == 'NOUN' or token.pos_ == 'VERB'
    ]
    # keep tokens between 3 and 14 characters long
    long_tokens = [
        token for token in noun_tokens if len(token) >= 3 and len(token) < 15
    ]
    # remove tokens that have 3 equal consecutive characters
    triples = [
        ''.join(triple)
        for triple in zip(ascii_lowercase, ascii_lowercase, ascii_lowercase)
    ]
    good_tokens = [
        token for token in long_tokens
        if not [triple for triple in triples if triple in token]
    ]
    # remove tokens that are present in the stoplist
    stop_specific = [
        'date', 'state', 'surface', 'location', 'oil', 'operator', 'commission',
        'colorado', 'conservation', 'denver', 'ogcc', 'cogcc'
    ]
    # broader stoplist kept for reference:
    # stop_specific = ['wattenberg', 'yes', 'acre', 'number', 'mum', 'nwse', 'swne', 'lease', 'rule', 'drilling', 'permit', 'application', 'form', 'felfwl', 'fnlfsl', 'fnl', 'fsl', 'page', 'file', 'date', 'state', 'surface', 'location', 'oil', 'operator', 'commission', 'colorado', 'conservation', 'prod', 'formation', 'denver', 'ogcc', 'cogcc']
    NLTKstopwords = sw.words('english')
    stoplist = STOPWORDS.union(NLTKstopwords).union(stop_specific)
    final_tokens = [token for token in good_tokens if token not in stoplist]
    return final_tokens
def _preprocess(self, text, min_len=2, max_len=240, custom_stopwords=False):
    result = []
    if custom_stopwords:
        stopwords = set(STOPWORDS)
        spanish = self._get_spanish_stopwords()
        custom = self._get_custom_stopwords()
        stopwords.update(spanish)
        # stopwords.update(['http', 'f**k', 'rt'])
        stopwords.update(custom)
    else:
        stopwords = STOPWORDS.copy()
    for token in gensim.utils.simple_preprocess(text, min_len=min_len, max_len=max_len):
        if token not in stopwords:
            result.append(self._lemmatize_stemming(token))
    return result
def pd(test):
    tests = []
    tests.append([
        word for word in re.sub("_", " ", test.lower()).split()
        if word not in STOPWORDS.union(stoplist)
    ])
    tests = tests[0]
    w2v = [word2vec.wv[word] for word in tests if word in word2vec.wv]
    # Flatten the word vectors and zero-pad to a fixed feature length of 8000.
    w2v = np.reshape(w2v, (-1))
    zero = np.zeros(8000 - int(w2v.size))
    X = np.concatenate((w2v, zero))
    ans = int(sgd.predict(X.reshape(1, -1))[0])
    return ans
def tokenize_text(self, text):
    tokens = []
    # Adding to stopwords
    stopwords = set(STOPWORDS)
    spanish = self._get_spanish_stopwords()
    stopwords.update(spanish)
    stopwords.update(['http', 'f**k', 'rt'])
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            # if word not in stopwords:
            if len(word) < 2:
                continue
            # tokens.append(self._lemmatize_stemming(word.lower()))
            tokens.append(word.lower())
    return tokens
def extract_text_n_corpus(docs, remove_uniq=True):
    stoplist = set('bitcoin bitcoins s m d t u ll ur ve'.split())
    texts = [[word for word in re.split(r"\W+", re.sub(r"[,.]", "", doc.lower()))
              if word not in STOPWORDS.union(stoplist) and word != ""]
             for doc in docs]
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    if remove_uniq:
        # Remove all empty strings
        frequency[''] = 0
        # Keep only tokens that occur more than once across the corpus
        texts = [[token for token in text if frequency[token] > 1] for text in texts]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)
    return texts, corpus
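# A quick, hedged usage sketch of extract_text_n_corpus above. With
# remove_uniq=True only tokens that occur more than once across the corpus
# survive, so the exact output depends on the stop word lists, but the shared
# token remains. Note the call also writes the corpus to /tmp/deerwester.mm.
sample_docs = ["Bitcoin rallied today, bitcoin dipped yesterday",
               "Markets dipped on the news"]
sample_texts, sample_corpus = extract_text_n_corpus(sample_docs)
print(sample_texts)   # e.g. [['dipped'], ['dipped']]
print(sample_corpus)  # bag-of-words vectors over the shared dictionary, e.g. [[(0, 1)], [(0, 1)]]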
def processing(body_text):
    p = PorterStemmer()
    stopset = set([
        'doi', 'preprint', 'copyright', 'org', 'https', 'et', 'al', 'author',
        'figure', 'table', 'rights', 'reserved', 'permission', 'use', 'used',
        'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.', 'al.',
        'Elsevier', 'PMC', 'CZI', '-PRON-', 'usually',
        r'\usepackage{amsbsy', r'\usepackage{amsfonts', r'\usepackage{mathrsfs',
        r'\usepackage{amssymb', r'\usepackage{wasysym',
        r'\setlength{\oddsidemargin}{-69pt', r'\usepackage{upgreek',
        r'\documentclass[12pt]{minimal'
    ])
    cStopwords = STOPWORDS.union(stopset)
    for text in body_text:
        tokens = []
        for item in gensim.parsing.preprocess_string(text):
            if item not in cStopwords:
                # stem the token before appending it
                tokens.append(p.stem(item))
        yield model.infer_vector(tokens)
def nettoyer_texte(text):
    # Replace special chars and specific strings like "http"
    result = text.lower()
    result = result.replace('\n', ' ')
    result = re.sub("www", " ", result)
    result = re.sub("http", " ", result)
    result = re.sub(r"\.com", " ", result)
    result = re.sub(r"\.gg", " ", result)
    result = re.sub(r"[0-9,.;@\-\*\(\)/#?%!:|&$]+\ *", " ", result)
    result = re.sub(r"\[.*?\]", " ", result)
    result = re.sub(" +", " ", result)
    # Remove stopwords
    # list found on https://gist.github.com/sebleier/554280
    stopwords = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves",
                 "you", "your", "yours", "yourself", "yourselves", "he", "him",
                 "his", "himself", "she", "her", "hers", "herself", "it", "its",
                 "itself", "they", "them", "their", "theirs", "themselves",
                 "what", "which", "who", "whom", "this", "that", "these",
                 "those", "am", "is", "are", "was", "were", "be", "been",
                 "being", "have", "has", "had", "having", "do", "does", "did",
                 "doing", "a", "an", "the", "and", "but", "if", "or", "because",
                 "as", "until", "while", "of", "at", "by", "for", "with",
                 "about", "against", "between", "into", "through", "during",
                 "before", "after", "above", "below", "to", "from", "up",
                 "down", "in", "out", "on", "off", "over", "under", "again",
                 "further", "then", "once", "here", "there", "when", "where",
                 "why", "how", "all", "any", "both", "each", "few", "more",
                 "most", "other", "some", "such", "no", "nor", "not", "only",
                 "own", "same", "so", "than", "too", "very", "s", "t", "can",
                 "will", "just", "don", "should", "now"]
    all_stopwords = STOPWORDS.union(stopwords)
    words = result.split()
    result = [word for word in words if word not in all_stopwords]
    result = ' '.join([word for word in result if len(word) > 1])
    return result
def lemm_tokenize_doc(doc, stop):
    '''
    INPUT: string that corresponds to a document in a raw corpus and a list of
        stop words.
    OUTPUT: (1) a list of tokens that corresponds to a corpus document.
    Strings are byte decoded; punctuation, digits, and newlines are removed;
    words are lowered and lemmatized (brought back to their 'base' form);
    non-words and stop-words are removed.
    PACKAGE USED: spaCy
    '''
    # decode bytes to utf-8 from doc
    ascii_doc = unidecode(doc.decode('utf-8'))
    # remove punctuation, digits, newlines, and lower the text
    clean_doc = ascii_doc.translate(None, punctuation).translate(
        None, digits).replace('\n', '').lower()
    # spaCy expects a unicode object
    spacy_doc = nlp(clean_doc.decode('utf-8'))
    # lemmatize, transform to ascii as will no longer use spaCy
    # (noun-only variant kept for reference)
    # noun_tokens = [unidecode(token.lemma_) for token in spacy_doc if token.pos_ == 'NOUN']
    noun_tokens = [unidecode(token.lemma_) for token in spacy_doc]
    # keep tokens longer than 2 characters
    long_tokens = [token for token in noun_tokens if len(token) >= 3]
    # remove tokens that have 3 equal consecutive characters
    triples = [
        ''.join(triple)
        for triple in zip(ascii_lowercase, ascii_lowercase, ascii_lowercase)
    ]
    good_tokens = [
        token for token in long_tokens
        if not [triple for triple in triples if triple in token]
    ]
    # remove tokens that are present in the stoplist
    NLTKstopwords = sw.words('english')
    stoplist = STOPWORDS.union(NLTKstopwords).union(stop)
    final_tokens = [token for token in good_tokens if token not in stoplist]
    return final_tokens
import sqlite3
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import STOPWORDS
import pandas as pd
import csv

my_stop_words = STOPWORDS.union(
    set(['I', 'The', 'If', 'But', 'This', 'like', 'going']))
word_list = []
flair_list = ['Loss', 'Gain']
d = {}


def freq_for_all():
    conn = sqlite3.connect('stonks.db')
    c = conn.cursor()
    c.execute("select text from posts where text <> '[removed]'")
    total = c.fetchall()
    for i in total:
        for x in i:
            filtered = remove_stopwords(x)
            split = filtered.split()
            for z in split:
                if z not in my_stop_words:
                    word_list.append(z)
    conn.commit()
    conn.close()
# Parse docs into individual words, ignoring words less than 4 letters long
# and stopwords (him, her, them, for, there, etc.), since "their" is not a topic.
# Then append the tokens into a list.
with open('../data/more_stop_words.txt', 'r') as f:
    customize_stop_words = f.read().split()
with open('../data/add_num_stops.txt', 'r') as f:
    customize_stop_nums = f.read().split()
combined_stops = customize_stop_words + customize_stop_nums

from gensim.parsing.preprocessing import STOPWORDS

expanded_stop_words = STOPWORDS.union(set(combined_stops))
# print(expanded_stop_words, type(expanded_stop_words))


def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in expanded_stop_words and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

# look at a random row 4310 and see if things worked out
# note that the document created was already preprocessed
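# Hedged usage sketch of preprocess() above. lemmatize_stemming and the two
# stop word files are assumed to exist as in the surrounding project, so the
# exact tokens depend on the stemmer/lemmatizer configured there.
sample = "Their properties were inspected before the committee meeting"
print(preprocess(sample))
# e.g. ['properti', 'inspect', 'committe', 'meet'] with a Snowball stemmer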
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from scipy.stats import entropy
from tempfile import TemporaryFile
from scipy.special import (entr, rel_entr)
from numpy import (arange, putmask, ravel, ones, shape, ndarray, zeros, floor,
                   logical_and, log, sqrt, place, argmax, vectorize, asarray,
                   nan, inf, isinf, NINF, empty)
from libs.my_paths import base_model_lda, base_model_ngram, base_model

MY_STOP_WORDS = STOPWORDS.union(
    set([
        'use', 'be', 'work', 'user', 'try', 'cell', 'row', 'want', 'item',
        'go', 'get', 'add', 'went', 'tried', 'return', 'sort', 'test', 'run',
        'check', 'click', 'hour', 'minute', 'second', 'version', 'app',
        'paragraph', 'error', 'log', 'press', 'need', 'feed', 'thank', 'way',
        'like', 'kill', 'help'
    ]))


def clear_text(text):
    text = re.sub('<code>(.|\n)*?<\/code>', '', text)
    text = re.sub(r'(\<(/?[^>]+)>)', '', text)
    text = re.sub("[\'\"\\/\@\%\(\)\~\`\{\}]", '', text)
    text = re.sub('\s+', ' ', text)
    return text


def lemmatize_stemming(text, stemmer):
def preprocess(texts_list):
    my_stopwords = STOPWORDS.union({'\n'})
    texts_tokens = [tokenize(text, lower=True) for text in texts_list]
    texts_no_stop = [[word for word in text if word not in my_stopwords]
                     for text in texts_tokens]
    return texts_no_stop
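# Hedged usage sketch of preprocess() above, assuming tokenize is
# gensim.utils.tokenize as imported elsewhere in this module.
docs = ["The gradient\ndescent converged", "Stopwords are removed here"]
print(preprocess(docs))
# e.g. [['gradient', 'descent', 'converged'], ['stopwords', 'removed']]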
def clean_df(text):
    print(text)
    # remove urls
    text0 = remove_URL(text)
    # remove the phrase '{{item documentation}}'
    if text0[:22].lower() == '{{item documentation}}':
        text1 = text0[22:]
    else:
        text1 = text0
    # split into words
    #text_tokenization = word_tokenize(text0)
    #text_regular_expresion = regexp_tokenize(text0, pattern='\w+|\$[\d\.]+|\S+')
    text_wordpunct = wordpunct_tokenize(text1)
    #text_whitespace = WhitespaceTokenizer().tokenize(text0)
    #text_stanford = StanfordTokenizer().tokenize(text0)
    # convert to lower case
    text_lowercase = [w.lower() for w in text_wordpunct]
    # remove punctuation from each word
    table = str.maketrans('', '', string.punctuation)
    #text_punctuation1 = [w.translate(table) for w in text_lowercase]
    text_punctuation = [w for w in text_lowercase if w.translate(table)]
    # filter out stop words
    #stop_words = set(stopwords.words('english'))
    #text_stopwords = [w for w in text_lowercase if not w in stop_words]
    all_stopwords_gensim = STOPWORDS.union(set(['likes', 'play']))
    text_stopwords = [
        w for w in text_punctuation if w not in all_stopwords_gensim
    ]
    # remove extra stop words
    text_extra_stopwords = extend_stopwords(text_stopwords)
    # remove markup
    text_markup_words = markup_words_WikidataSymbols(text_extra_stopwords)
    # remove months
    months = {m.lower() for m in month_name[1:]}  # create a set of month names
    text_no_months = [word for word in text_markup_words if word not in months]
    # remove non-English words
    #words_engl = set(nltk.corpus.words.words())
    #text_non_english_words = [w for w in text_no_months if w in words_engl or not w.isalpha()]
    # replace q identifiers (e.g. q42) with 'item'
    text_item1 = [
        'item' if (val[:1] == 'q') and any(chr.isdigit() for chr in val) else val
        for k, val in enumerate(text_no_months)
    ]
    text_item2 = [
        'item' if val == 'q' else val for k, val in enumerate(text_item1)
    ]
    text_item3 = [
        val for k, val in enumerate(text_item2)
        if not ((val == 'item') and (text_item2[k - 1] == 'item'))
    ]
    # replace p identifiers (e.g. p31) with 'property'
    text_property1 = [
        'property' if (val[:1] == 'p') and any(chr.isdigit() for chr in val) else val
        for k, val in enumerate(text_item3)
    ]
    text_property2 = [
        'property' if val == 'p' else val for k, val in enumerate(text_property1)
    ]
    text_property3 = [
        val for k, val in enumerate(text_property2)
        if not ((val == 'property') and (text_property2[k - 1] == 'property'))
    ]
    # remove remaining tokens that are not alphabetic
    text_not_alphabetic = [word for word in text_property3 if word.isalpha()]
    # spelling check
    #spells = [spell(w) for w in (nltk.word_tokenize(text))]
    # remove single letters and two-letter tokens
    text_single_letters = [w for w in text_not_alphabetic if len(w) > 2]
    # remove non-English words (lowercase words like 'april' are considered false)
    text_non_english = remove_non_english_words(text_single_letters)
    # stemming of words
    porter = PorterStemmer()
    text_stemmed = [porter.stem(word) for word in text_non_english]
    # lemmatization
    lemmatizer = WordNetLemmatizer()
    text_lemma = [lemmatizer.lemmatize(t, pos="v") for t in text_stemmed]
    return text_lemma
import pandas as pd
import os
import operator
import gensim
import numpy as np
import matplotlib.pyplot as plt
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

np.random.seed(2018)
import pickle

stemmer = PorterStemmer()
from spellchecker import SpellChecker

spell = SpellChecker()
STOPWORDS = list(STOPWORDS)
STOPWORDS.append('covid')
STOPWORDS.append('coronavirus')
STOPWORDS.append('corona')
STOPWORDS.append('uganda')
from utils import preprocess, BreakIt, produce_mapping
from apiclient import discovery
from google.oauth2 import service_account
from datetime import datetime
from numpy.random import multinomial
from numpy import log, exp
from numpy import argmax


class MovieGroupProcess:
    def __init__(self, K=8, alpha=0.1, beta=0.1, n_iters=30):
        '''
    from spacy.lang.en import English
    parser = English()
    unfiltered_tokens = parser(text)
    tokens = [
        preprocess_token(token) for token in unfiltered_tokens
        if is_token_allowed(token)
    ]
    return tokens


###### Tokenization with Gensim #####
from gensim import utils
import gensim.parsing.preprocessing as gsp
from gensim.parsing.preprocessing import STOPWORDS

my_stop_words = STOPWORDS.union(set(['http', 'com', 'www']))


def preprocess(text):
    result = []
    for token in utils.simple_preprocess(text):
        if token not in my_stop_words:
            result.append(token)
    return ' '.join(result)


filters = [
    gsp.strip_tags, gsp.strip_punctuation, gsp.strip_multiple_whitespaces,
    gsp.strip_numeric, gsp.remove_stopwords, gsp.strip_short, gsp.stem_text
]
def suggest_next_video(original_id, input_chunks, search_term):
    if search_term == '':
        global last_search
        search_term = last_search
    # This video_id is just a test case
    #if (original_id == 'R9npBuS9AsE'):
    #    output_id_list = get_canned_search_results()
    #else:
    output_video_list = query_video_ids(search_term)
    output_name_map = dict(output_video_list)
    output_id_list = [video[0] for video in output_video_list]
    # Truncate possible video list to 40 for performance reasons
    try:
        output_id_list.remove(original_id)
    except ValueError:
        pass
    output_id_list = output_id_list[:40]
    chunk_lookup_dict = {}
    start = time.time()
    chunk_counter = 0
    output_chunks = []
    myq = queue.Queue()
    threads = list()
    for video_id in output_id_list:
        thread = threading.Thread(target=queueTranscript, args=(video_id, myq))
        threads.append(thread)
        thread.start()
    for thread in threads:
        thread.join()
    for transcript in list(myq.queue):
        transcript_counter = 0
        #try:
        #    output_video_list = yttapi.get_transcript(str(video_id))
        #except yttapi.CouldNotRetrieveTranscript:
        #    continue
        output_video_list = transcript[1]
        video_length = len(transcript[1])
        video_id = transcript[0]
        # Group each transcript into chunks of ten caption lines
        for i in range(video_length // 10):
            chunk_text_list = []
            for j in range(10):
                try:
                    chunk_text_list.append(output_video_list[transcript_counter]['text'])
                except Exception:
                    break
                chunk_text = ' '.join(chunk_text_list)
                transcript_counter += 1
            output_chunks.append(chunk_text)
            chunk_lookup_dict[chunk_counter] = video_id
            chunk_counter += 1
    print("After chunking output: " + str(time.time() - start))
    start = time.time()
    # Exclude common stop words and those used frequently in YouTube transcripts
    my_stop_words = STOPWORDS.union(
        set(['[Music]', '[music]', '[Applause]', 'subscribe', 'channel', 'youtube']))
    #stoplist = set('for a of the and to in [music]'.split())
    texts = [
        [word for word in document.lower().split() if word not in my_stop_words]
        for document in output_chunks
    ]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=10)
    # generates an index of the corpus, need only do this once
    index = similarities.MatrixSimilarity(lsi[corpus])
    print("After building index: " + str(time.time() - start))
    video_average_score = {}
    for video_id in output_id_list:
        video_average_score[video_id] = []
    start = time.time()
    # Go through each input chunk and get an average score for each video
    for i in range(len(input_chunks)):
        # Skip over chunks the user didn't watch
        watched_score = input_chunks[i][1]
        if watched_score == 0:
            continue
        doc = input_chunks[i][0]
        #doc = input_chunks[0][0]
        vec_bow = dictionary.doc2bow(doc.lower().split())
        vec_lsi = lsi[vec_bow]
        similarity_score = index[vec_lsi]
        # sorts based on descending relevance (earlier sort order = more useful)
        similarity_scores = sorted(enumerate(similarity_score), key=lambda item: -item[1])
        #chunk_ranking = [(documents[x], y) for (x, y) in similarity_scores]
        video_total_score = {}
        video_chunk_counts = {}
        for video_id in output_id_list:
            video_total_score[video_id] = 0
            video_chunk_counts[video_id] = 0
        for chunk_id, score in similarity_scores:
            video_total_score[chunk_lookup_dict[chunk_id]] += score
            video_chunk_counts[chunk_lookup_dict[chunk_id]] += 1
        # Multiply the similarity ranking by the 'score' given to us that represents
        # how slowly they watched the video chunk and how many times they repeated it.
        # We append this to a list of average scores for the video.
        for video_id in output_id_list:
            if video_chunk_counts[video_id] == 0:
                video_average_score[video_id].append(0)
            else:
                avg_score = video_total_score[video_id] / video_chunk_counts[video_id]
                video_average_score[video_id].append(avg_score)
    print("After looping through input chunks: " + str(time.time() - start))
    video_sum = {}
    for idx, video_id in enumerate(video_average_score.keys()):
        total_score = sum(x for x in video_average_score[video_id])
        #video_sum[video_id] = (total_score * (1 + RL_WEIGHT_FACTOR * rl_network.weights['param_' + str(idx)]), output_name_map[video_id])
        video_sum[video_id] = (total_score, output_name_map[video_id])
    sorted_videos = list(sorted(video_sum.items(), key=lambda kv: -kv[1][0]))
    # now apply geva
    return_videos = []
    for i in range(0, 10):
        return_videos.append((
            sorted_videos[i][0],
            (sorted_videos[i][1][0] *
             (1.0 + RL_WEIGHT_FACTOR * rl_network.weights['param_' + str(i)]),
             sorted_videos[i][1][1])))
    return return_videos
    return perplexity, coherence_lda


if __name__ == '__main__':
    # Get Singletracks trail summary data
    X = get_st_descriptions()
    # Create initial stopwords to remove before creating n-grams
    not_stops_firstpass = [
        'not', 'bottom', 'few', 'many', 'more', 'less', 'most', 'least',
        'never', 'off', 'out', 'very', 'too', 'overly', 'so'
    ]
    new_stops_firstpass = ['br']
    first_stopwords = (
        STOPWORDS.difference(not_stops_firstpass)).union(new_stops_firstpass)
    # Create second set of stopwords to use after creating n-grams
    my_stopwords = set([
        'climb', 'mountain', 'road', 'singletrack', 'loop', 'trail', 'trails',
        'ride', 'area', 'route', 'way', 'feature', 'section', 'sections',
        'riding', 'loop', 'br', 'mile', 'miles', 'right', 'left', 'www',
        'http', 'https', 'bike', 'bikes', 'bicycle', 'bicycles', 'continue',
        'rider', 'riders', 'parking', 'lot', 'turn', 'start', 'starts',
        'description', 'cross', 'north', 'south', 'east', 'west', '-PRON-',
        'pron', 'nee', 'regard', 'shall', 'use', 'win', 'park', 'point',
        'biking', 'follow', 'single', 'track', 'intersection', 'trailhead',
        'head', 'good', 'great', 'nice', 'time', 'include', 'place', 'come',
        'downhill', 'look', 'near'
    ])
    bitri_stops = set([
noContLines = []
for line in lines:
    noContLines += [decontracted(line)]

noPuncLines = []
for line in noContLines:
    noPuncLines += [re.sub(r'[^\w\s]', ' ', line)]

noNumbLines = []
for line in noPuncLines:
    noNumbLines += [re.sub(r'[0-9]+', ' ', line)]

from gensim.parsing.preprocessing import STOPWORDS

stopwords_gensim = STOPWORDS.union(set(['like', 'also', 'let', 'lot', 'hi']))
# stopwords_gensim.add("like")
# stopwords_gensim.add("also")
# stopwords_gensim.add("let")

noStopLines = []
for line in noNumbLines:
    noStopLines += [
        ' '.join([
            word for word in line.split()
            if word not in stopwords_gensim and len(word) > 1
        ])
    ]

from nltk.stem.wordnet import WordNetLemmatizer

noLemmaLines = []
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.parsing.preprocessing import remove_stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.corpora import Dictionary
from gensim.models import LdaModel
import json
from pprint import pprint
from gensim.parsing.preprocessing import STOPWORDS
from matplotlib.ticker import MaxNLocator
from entity_properties.wikiapi import get_text
import timeit

STOPWORDS = list(STOPWORDS)
STOPWORDS.extend('add pp new ed isbn year time'.split())

with open('entity_properties/property_blacklist.txt') as f:
    prop_blacklist = f.readlines()
prop_blacklist = [p.rstrip() for p in prop_blacklist]

with open('entity_properties/property_frequencies.json') as f:
    data = json.load(f)

'''
TFIDF = tf(t,d) * log(N / (df + 1))  => http://www.tfidf.com/
tf(t,d) = count of t in d / number of words in d
df(t)   = occurrences of t across docs

1. build a matrix with the tfidf of every word-property pair
2. add up the row for each word
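# The notes above outline the TF-IDF plan; below is a minimal, self-contained
# sketch of the same formula (tf(t, d) * log(N / (df + 1))), independent of the
# property data loaded above and intended only as an illustration.
import math
from collections import Counter


def tfidf_scores(docs):
    # docs: list of token lists; returns one {token: tfidf} dict per document
    n_docs = len(docs)
    df = Counter()
    for doc in docs:
        df.update(set(doc))
    scores = []
    for doc in docs:
        counts = Counter(doc)
        scores.append({t: (c / len(doc)) * math.log(n_docs / (df[t] + 1))
                       for t, c in counts.items()})
    return scores

# e.g. tfidf_scores([['isbn', 'year'], ['year', 'award'], ['award', 'award']])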
import preprocessor as p
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from sklearn.model_selection import GridSearchCV
import gensim
from gensim.parsing.preprocessing import STOPWORDS
from joblib import dump, load
from datetime import datetime

n_samples = 2000
n_features = 1000
n_components = 30
n_top_words = 5
stop_words = STOPWORDS.union(
    set(['', 'ive', 'im', 'amp', 'like', 'f**k', 'shit']))
p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.HASHTAG, p.OPT.EMOJI)
stemmer = SnowballStemmer('english')
punct_str = '''!"$%&'()*+,-./:;<=>?[\]^_`{|}~’'''


def lemmatize_stemming(text):
    '''Stem the lemmatized (verb) form of a token.'''
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))


def split_count(text):
    '''
    '''
""" This file contains all the stopword removal steps. """ from gensim.parsing.preprocessing import STOPWORDS from gensim.parsing.preprocessing import remove_stopwords STOPWORDS = STOPWORDS.union(set(["no", "not", "never"])) class StopwordRemoval: """ This class contains all the methods which will handle numeric preprocessing. """ @classmethod def consider_negative_stopwords(cls, text): """ This method will remove the stopwords from text but not remove negative words. """ temp_str = text.split() new_string = [word for word in temp_str if word not in STOPWORDS] return text @classmethod def donot_consider_negative_stopwords(cls, text): """ THis method will remove all stopwords. """ return remove_stopwords(text) @classmethod def run_stopwords(cls, text, stopwords): """
    'social',
    'governance',
    'corporate',
    'responsibility',
    'million',
    'billion',
]

# add company names as stop words
organisations = esg_corpus.select("company").distinct().toPandas().company
for organisation in organisations:
    for t in organisation.split(' '):
        org_stop_words.append(t)

# our list contains all English stop words + company names + specific keywords
stop_words = STOPWORDS.union(org_stop_words)

# COMMAND ----------

# DBTITLE 1,Lemmatize content
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from pyspark.sql.functions import pandas_udf, PandasUDFType
from gensim.utils import simple_preprocess


def lemmatize(text):
    results = []
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    def _lemmatize(self, text):
        pos = self._get_wordnet_pos(text)
        return WordNetLemmatizer().lemmatize(text, pos=pos)

    def _preprocess(self, text):
        result = []
        for token in text:
            if (token not in self.second_stopwords) and (len(token) > 3):
                lem = self._lemmatize(token)
                if lem not in self.second_stopwords:
                    result.append(lem)
        return result

    def print_params(self):
        print(f'Bigrams={self.bigrams}')
        print(f'Trigrams={self.trigrams}\n')
        print(f'First set of stopwords: {self.first_stopwords}\n')
        print(f'Second set of stopwords: {self.second_stopwords}.')


if __name__ == '__main__':
    added_stopwords = set(['bike', 'trail', 'mountain'])
    my_featurizer = Featurizer(STOPWORDS, STOPWORDS.union(added_stopwords),
                               bigrams=False, trigrams=False)
    my_featurizer.print_params()
    my_featurizer.update_stopwords(['ride', 'road'])
    my_featurizer.update_ngrams(grams='bigrams', set_to=True)
    my_featurizer.print_params()
#### LDA Preprocessing
'''
Augmenting stopwords with words used to filter the tweets originally.
Since they show up in almost every tweet, they aren't useful for
differentiating between topics. Stopwords are all lowercase.
'''
COVID_STOPWORDS = set([
    'coronavirus', '2019ncov', 'coronaviruspandemic', 'coronaoutbreak',
    'wuhanvirus', 'covid19', 'covid-19', 'ncov', 'ncov2019', 'corona',
    'virus', 'covid', 'covidー', 'cov', 'sarscov', 'sarscov2', 'amp'
])
FILTER_WORDS = STOPWORDS.union(COVID_STOPWORDS)


def decontract(tweet):
    '''
    Helper function for splitting contractions. \'s is removed because we
    can't disambiguate between possession (Julia's) and is (Julia is ...).
    '''
    tweet = re.sub(r"\b([A-Za-z]+)'([A-Za-z]+)\b", r"\1\2", tweet)
    return tweet


# Source: https://medium.com/@gaurav5430/using-nltk-for-lemmatizing-sentences-c1bfff963258
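# Quick check of the decontraction helper above:
print(decontract("they're saying it won't spread"))   # -> "theyre saying it wont spread"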
from nltk.tokenize import word_tokenize
import gensim
from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS

text = "Nick likes to play football, however he is not too fond of tennis."
# Documentation: https://radimrehurek.com/gensim/
filtered_sentence = remove_stopwords(text)
print(filtered_sentence)

# Adding and removing stop words in Gensim's default stop word list.
all_stopwords = gensim.parsing.preprocessing.STOPWORDS
print(all_stopwords)

# Adding stop words to Gensim's default stop word list.
# To add an element, apply the union function to the set and pass it the set of new words.
# The union method returns a new set that contains the newly added words.
all_stopwords_gensim = STOPWORDS.union(set(['likes', 'play']))
text = "Nick likes to play football, however he is not too fond of tennis."
text_tokens = word_tokenize(text)
tokens_without_sw = [
    word for word in text_tokens if word not in all_stopwords_gensim
]
print(tokens_without_sw)

# Removing stop words from Gensim's default stop word list.
all_stopwords_gensim = STOPWORDS
sw_list = {"not"}
# To remove stop words from the Gensim list, call difference().
all_stopwords_gensim = STOPWORDS.difference(sw_list)
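# A short, illustrative continuation of the snippet above: after removing "not"
# from the stop word list with difference(), it survives filtering.
text_tokens = word_tokenize(text)
tokens_without_sw = [
    word for word in text_tokens if word not in all_stopwords_gensim
]
print(tokens_without_sw)
# e.g. ['Nick', 'likes', 'play', 'football', ',', 'not', 'fond', 'tennis', '.']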