def word_clouds(list_, title):
    # Transform the list into a single string
    str_ = ' '.join(list_)
    stopwords = set(STOPWORDS)
    # Remove a few extra high-frequency words from these word clouds
    stop_words = ["want", "company", "lot", "many", "work"]
    new_stopwords = stopwords.union(stop_words)
    # Define the word cloud parameters and generate the word cloud
    wc = WordCloud(background_color="white", max_words=200, max_font_size=40,
                   scale=3, random_state=42,
                   stopwords=new_stopwords).generate(str_)
    plt.figure(figsize=(30, 30))
    # Store to file
    wc.to_file('company.png')
    # Show the cloud
    plt.imshow(wc)
    plt.axis('off')
    plt.title(title)
    plt.show()
    return
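# Hypothetical usage sketch (not part of the original snippet). word_clouds()
# expects a list of strings and a plot title, and assumes WordCloud, STOPWORDS
# and plt are already imported, e.g.:
#   from wordcloud import WordCloud, STOPWORDS
#   import matplotlib.pyplot as plt
#   word_clouds(df['review'].astype(str).tolist(), 'Company reviews')
# where df and the 'review' column are illustrative names only.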
def plot_wordcloud(text, mask=None, max_words=200, max_font_size=100,
                   figure_size=(10, 10), title=None, title_size=40,
                   image_color=False):
    stopwords = set(STOPWORDS)
    more_stopwords = {'one', 'br', 'Po', 'th', 'sayi', 'fo', 'Unknown'}
    stopwords = stopwords.union(more_stopwords)
    wordcloud = WordCloud(background_color='white',
                          stopwords=stopwords,
                          max_words=max_words,
                          max_font_size=max_font_size,
                          random_state=42,
                          width=800,
                          height=400,
                          mask=mask)
    wordcloud.generate(str(text))
    plt.figure(figsize=figure_size)
    if image_color:
        image_colors = ImageColorGenerator(mask)
        plt.imshow(wordcloud.recolor(color_func=image_colors),
                   interpolation="bilinear")
        plt.title(title, fontdict={'size': title_size,
                                   'verticalalignment': 'bottom'})
    else:
        plt.imshow(wordcloud)
        plt.title(title, fontdict={'size': title_size, 'color': 'black',
                                   'verticalalignment': 'bottom'})
    plt.axis('off')
    plt.tight_layout()
def make_tokens(df):
    """Removes stopwords, stems and lemmatizes. Returns clean tokens."""
    stopwords = set(nltk.corpus.stopwords.words("english"))
    # Turn the text in the dataframe into a long list of words
    TotalText = list(df.text.values)
    # Extra stopwords, with plurals (otherwise the lemmatizing step puts some of
    # the stopwords back); NEW_STOP_WORDS is defined elsewhere
    stopwords = stopwords.union(NEW_STOP_WORDS)
    TotalText = " ".join(TotalText)
    # Tokenization
    tokens = [
        w for w in word_tokenize(TotalText.lower()) if w.isalpha()
    ]  # isalpha() keeps only alphabetical words, lower() transforms everything to lowercase
    no_stop = [
        t.strip() for t in tokens if t.strip() not in stopwords
    ]  # stopwords already comes with a built-in list of words to remove
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmatized = [wordnet_lemmatizer.lemmatize(t) for t in no_stop]
    return lemmatized
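# Hypothetical usage sketch (not part of the original snippet). make_tokens()
# expects a DataFrame with a "text" column and a NEW_STOP_WORDS collection in scope:
#   import pandas as pd
#   NEW_STOP_WORDS = {"school", "schools"}   # illustrative extra stopwords
#   df = pd.DataFrame({"text": ["Schools support pupils in their learning."]})
#   make_tokens(df)   # -> ['support', 'pupil', 'learning']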
def addStopWords(extrastopfile="../../data/supplementalremovedwords.txt"):
    global stopwords
    readRegionalisms()
    regionalisms = getRegionalisms()
    extrastopfile = open(extrastopfile, "r+")
    extrastopfile_text = extrastopfile.read()
    extrastopfile.close()
    # Add the supplemental words to the stop word set
    # (set.union returns a new set, so the result must be assigned back)
    stopwords = stopwords.union(set(extrastopfile_text.split()))
    # Avoid filtering out part of a regionalism if it's two words
    for word in list(regionalisms):
        regionalisms = regionalisms.union(set(word.split()))
    # Filter the regionalisms out of the stop words
    stopwords.difference_update(regionalisms)
def sentiment_swn(doc):
    operators = set(['not', 'down'])
    stopwords = set(ENGLISH_STOP_WORDS) - operators
    stopwords = stopwords.union(['gonna', 'does', 'the', 'of', 'and', 'to',
                                 'in', 'a', 'is', 'that', 'for', 'it'])
    # TF-IDF with both unigrams and bigrams and a maximum of 3000 words (features).
    # It uses IDF and stop words, and discards numbers (token_pattern).
    # sublinear_tf=True further penalizes long documents, which may not be
    # required in our case.
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), lowercase=True,
                         min_df=1, max_df=2, max_features=MAX_FEATURES,
                         norm=u'l2', use_idf=True, smooth_idf=True,
                         sublinear_tf=True, stop_words=stopwords,
                         token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b')
    weight = 0
    try:
        # Apply TF-IDF
        tfidf_matrix = tf.fit_transform(doc)
        # Get the pre-processed terms
        feature_names = tf.get_feature_names()
        if lemmatize_first:
            # Lemmatize first, then POS-tag the lemmatized terms
            lema_stem = [lematizer.lemmatize(w) for w in feature_names]
            tokens_pos = pos_tag(lema_stem)
            # Transpose the tagged terms
            tokens_pos = [transpose(term) for term in tokens_pos]
        else:
            # Apply POS tags
            tokens_pos = pos_tag(feature_names)
            # Lemmatize using the POS tags, then tag again and transpose
            tokens_pos = pos_tag(lemmatize(tokens_pos))
            tokens_pos = [transpose(term) for term in tokens_pos]
        logging.debug("POS-Tag {}".format(tokens_pos))
        # Finally, get the sentiment weight
        weight = sentiment_weight(tokens_pos)
    except Exception:
        logging.debug("Error in sentiment ...")
    return weight
def plot_wordcloud(text, mask=None, max_words=200, max_font_size=100,
                   figure_size=(24.0, 16.0), color='white', title=None,
                   title_size=40, image_color=False):
    stopwords = set(STOPWORDS)
    more_stopwords = {
        'u', "im", "thi", "ji", "us", "ha", "um", "hi", "be", "will", "by",
        "is", "of", "to"
    }
    stopwords = stopwords.union(more_stopwords)
    wordcloud = WordCloud(background_color=color,
                          stopwords=stopwords,
                          max_words=max_words,
                          max_font_size=max_font_size,
                          random_state=42,
                          width=200,
                          height=200,
                          mask=mask,
                          colormap=matplotlib.cm.inferno)
    wordcloud.generate(text)
    plt.figure(figsize=figure_size)
    if image_color:
        image_colors = ImageColorGenerator(mask)
        plt.imshow(wordcloud.recolor(color_func=image_colors),
                   interpolation="bilinear")
        plt.title(title, fontdict={
            'size': title_size,
            'verticalalignment': 'bottom'
        })
    else:
        plt.imshow(wordcloud)
        plt.title(title, fontdict={
            'size': title_size,
            'color': 'black',
            'verticalalignment': 'bottom'
        })
    plt.axis('off')
    plt.tight_layout()
    return wordcloud
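# Hypothetical usage sketch (not part of the original snippet). This variant of
# plot_wordcloud() assumes WordCloud, STOPWORDS, ImageColorGenerator, matplotlib
# and plt are imported, and returns the WordCloud object:
#   wc = plot_wordcloud("free-form text to visualize", title="Sample cloud")
#   plt.show()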
def plot_wordcloud(text, mask=None, max_words=400, max_font_size=120,
                   figure_size=(24.0, 16.0), title=None, title_size=40,
                   image_color=False):
    stopwords = set(STOPWORDS)
    more_stopwords = {'com', 'http'}
    stopwords = stopwords.union(more_stopwords)
    wordcloud = WordCloud(background_color='white',
                          stopwords=stopwords,
                          max_words=max_words,
                          max_font_size=max_font_size,
                          random_state=42,
                          mask=mask)
    wordcloud.generate(text)
    plt.figure(figsize=figure_size)
    if image_color:
        image_colors = ImageColorGenerator(mask)
        plt.imshow(wordcloud.recolor(color_func=image_colors),
                   interpolation="bilinear")
        plt.title(title, fontdict={
            'size': title_size,
            'verticalalignment': 'bottom'
        })
    else:
        plt.imshow(wordcloud)
        plt.title(title, fontdict={
            'size': title_size,
            'color': 'green',
            'verticalalignment': 'bottom'
        })
    plt.axis('off')
    plt.tight_layout()
def make_tokens(df):
    """Removes stopwords, stems and lemmatizes. Returns clean tokens."""
    stopwords = set(nltk.corpus.stopwords.words('english'))
    # Turn the text in the dataframe into a long list of words
    TotalText = []
    for index, row in df.iterrows():
        text = row['text']
        TotalText.append(text)
    # Extra stopwords, with plurals (otherwise the lemmatizing step puts some of
    # the stopwords back)
    newStopWords = [
        'school', 'learning', 'student', 'pupil', 'teacher', 'management',
        'teaching', 'support', 'lesson', 'board'
    ]
    newStopWords_plur = [
        'schools', 'learnings', 'students', 'pupils', 'teachers',
        'managements', 'teachings', 'supports', 'lessons', 'boards'
    ]
    newStopWords += newStopWords_plur
    stopwords = stopwords.union(newStopWords)
    TotalText = " ".join(TotalText)
    # Tokenization
    tokens = [
        w for w in word_tokenize(TotalText.lower()) if w.isalpha()
    ]  # isalpha() keeps only alphabetical words, lower() transforms everything to lowercase
    no_stop = [
        t.strip() for t in tokens if t.strip() not in stopwords
    ]  # stopwords already comes with a built-in list of words to remove
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmatized = [wordnet_lemmatizer.lemmatize(t) for t in no_stop]
    return lemmatized
import nltk
import math
from nltk.stem import WordNetLemmatizer
import string
from nltk.corpus import stopwords
from collections import Counter
import time

dirpath = './reviews_summary_ALL_txt'  # data folder
# f_list = os.listdir(dirpath)
f_list = ['tt1201607', 'tt0111161']

wordnet_lemmatizer = WordNetLemmatizer()
stopwords = set(stopwords.words('english'))
# Additional custom stopwords
stopwords = stopwords.union({
    'movie', 'film', 'time', 'ha', 'wa', 'dont', 'much', 'thing', 'many',
    'watch', 'thats'
})


# Preprocessing
def my_tokenizer(s):
    s = s.lower()  # downcase
    # Build a {symbol: None} mapping; string.punctuation holds the punctuation characters
    remove_punctuation_map = dict(
        (ord(char), None) for char in string.punctuation)
    no_punctuation = s.translate(remove_punctuation_map)  # remove punctuation via the mapping
    tokens = nltk.tokenize.word_tokenize(no_punctuation)  # tokenize with nltk
    tokens = [t for t in tokens if len(t) > 2]  # keep only words longer than two characters
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens]  # lemmatize
    tokens = [t for t in tokens if t not in stopwords]  # remove stopwords
    tokens = [t for t in tokens if not any(c.isdigit() for c in t)]  # drop words containing digits
import nltk
from nltk.corpus import stopwords

# Read input file
# User will enter the file name here
file = open('teddysou1908.txt')
a = file.read()

# Stopwords. These are the 250 most common words according to
# http://www.anglik.net/english250.htm, combined with nltk's English stop word list.
stopwords = set(line.strip() for line in open('trumpsou2017.txt'))
stopwords = stopwords.union(nltk.corpus.stopwords.words('english'))
"""
stopwords.union(set(['mr','mrs','one','two','said','the','of','to','and','a','in','is','it',
'you','that','he','was','for','on','are','with','as','i','his','they','be','at','one','have','this','from','or',
'had','by','hot','but','some','what','there','we','can','out','other','were','all','your','when','up','use','word',
'how','said','an','each','she','which','do','their','time','if','will','way','about','many','then','them','would',
'write','like','so','these','her','long','make','thing','see','him','two','has','look','more','day','could','go',
'come','did','my','sound','no','most','number','who','over','know','water','than','call','first','people','may',
'down','side','been','now','find','any','new','work','part','take','get','place','made','live','where','after',
'back','little','only','round','man','year','came','show','every','good','me','give','our','under','name','very',
'through','just','form','much','great','think','say','help','low','line','before','turn','cause','same','mean',
'differ','move','right','boy','old','too','does','tell','sentence','set','three','want','air','well','also','play',
'small','end','put','home','read','hand','port','large','spell','add','even','land','here','must','big','high','such',
'follow','act','why','ask','men','change','went','light','kind','off','need','house','picture','try','us','again','animal',
'point','mother','world','near','build','self','earth','father','head','stand','own','page','should','country','found','answer',
'school','grow','study','still','learn','plant','cover','food','sun','four','thought','let','keep','eye','never','last','door',
'between','city','tree','cross','since','hard','start','might','story','saw','far','sea','draw','left','late','run',"don't",
# Create a new list of irrelevant (stop) words
nova_stopwords = [
    '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'q', 'w', 'e', 'r', 't',
    'y', 'u', 'i', 'o', 'p', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'z',
    'x', 'v', 'b', 'n', 'm', 'da', 'de', 'com', 'é', 'então', 'ja', 'já',
    'sao', 'ai', 'vao', 'so', 'acho', 'até', 'daqui', 'dessa', 'desse',
    'dessas', 'desses', 'assim', 'ia', 'tão', 'devem', 'fica', 'ficou', 'la',
    'lá', 'ate', 'até', 'desde', 'só', 'pra', 'há', 'ha', 'hà', 'são', 'só',
    'já', 'deixou', 'aí', 'sobre', 'que', 'durante', 'vai', 'dia', 'ainda',
    'estão', 'deu', 'dar', 'para', 'r', 'o', 'e', 'após', 'sr', 'sra', 'tudo',
    'q', 'tão', 'sendo', 'sem', 'me', 'as', 'os', 'isso', 'mas', 'quase',
    'estar', 'ta', 'tá', 'ai', 'vão', 'lá', 'vá', 'tô'
]
# Union of the default stopword list with the new list
nova_stopwords_list = stopwords.union(nova_stopwords)
# Regex patterns for removing irrelevant content
nolink = r"http\S+"
caracters = r"[^@#_a-záéíóúàèìòùâêîôûãõçA-ZÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÃÕÇ0-9 ]"
nospaces = r'\s+'
# Create the file where the transformed data will be stored
saida = open('C:/users/onlyone/desktop/prefeito/prefeito.txt',
             mode='w',
             encoding='UTF-8')
# Loop to run the processing over all posts
for page in tweepy.Cursor(api.user_timeline,
                          screen_name=usuario.screen_name,
                          count=200,
# In[14]:

import collections
import pandas as pd
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

# Read input file; note the encoding is specified here.
# It may be different in your text file.
a = s

# Stopwords
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))
stopwords = stopwords.union(set(['mr', 'mrs', 'one', 'two', 'said', 'phone',
                                 'amazon', 'iphone', 'product', 'apple',
                                 'vivo', 'redmi', 'lenovo', 'x', ' ']))

# Instantiate a dictionary, and for every word in the file,
# add it to the dictionary if it doesn't exist; if it does, increase the count.
wordcount = {}

# To eliminate duplicates, remember to split by punctuation and use case delimiters.
for word in a.lower().split():
    word = word.replace(".", "")
    word = word.replace(",", "")
    word = word.replace(":", "")
    word = word.replace("\"", "")
    word = word.replace("!", "")
    word = word.replace("“", "")
    word = word.replace("‘", "")
    word = word.replace("*", "")
    if word not in wordcount:
random_seed = 3613
test_percentage = 0.20

mongo_user = urllib.parse.quote_plus(mcred.USERNAME)
mongo_pass = urllib.parse.quote_plus(mcred.PASSWORD)

stopwords = set(stopwords.words('english'))
punctuation_list = {'.', ',', '?', '!', '\'', '"', ':', ';', '-', '–'}
special_list = {
    '`', '~', '@', '#', '$', '%', '^', '&', '+', '*', '/', '=', '>', '<', '(',
    ')', '{', '}', '[', ']', '|', '\\'
}
other_sym_list = {'...', '…', '’', '..', '“', '”'}
stop_url_symbol_list = stopwords.union(punctuation_list).union(
    special_list).union(other_sym_list).union({'#url'})


class CustomTweetTokenizer(TweetTokenizer):
    def __init__(self,
                 preserve_case=True,
                 reduce_len=False,
                 strip_handles=False,
                 convert_urls=True,
                 remove_stopwords=False):
        super().__init__(preserve_case=preserve_case,
                         reduce_len=reduce_len,
                         strip_handles=strip_handles)
        self.convert_urls = convert_urls
        self.remove_stopwords = remove_stopwords
import scipy.io
import select
import shutil
import sys
import time

import tensorflow as tf

from embedding_evaluation import write_embedding_to_file, evaluate, EmbeddingTaskEvaluator
from gensim_utils import batch_generator, batch_generator2
from tensor_embedding import PMIGatherer, PpmiSvdEmbedding
from tensor_decomp import CPDecomp, SymmetricCPDecomp, JointSymmetricCPDecomp
from nltk.corpus import stopwords

stopwords = set(stopwords.words('english'))
grammar_stopwords = {',', "''", '``', '.', 'the'}
stopwords = stopwords.union(grammar_stopwords)


class GensimSandbox(object):
    def __init__(self, method, embedding_dim, num_sents, min_count, gpu=True):
        self.method = method
        self.embedding_dim = int(embedding_dim)
        self.min_count = int(min_count)
        self.num_sents = int(num_sents)
        self.gpu = gpu
        if '--buildvocab' in sys.argv:
            self.buildvocab = True
        else:
            self.buildvocab = False

        # To be assigned later
import nltk
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import TruncatedSVD

lemm = WordNetLemmatizer()

titles = [line.rstrip() for line in open('book_title.txt')]

stopwords = set(w.rstrip() for w in stopwords.words('english'))
stopwords = stopwords.union({
    'introduction', 'edition', 'series', 'approach', 'card', 'access',
    'application', 'package', 'brief', 'vol', 'fundamental', 'second',
    'third', 'fourth', 'first', 'guide', 'essential', 'print'
})


def tokenize_words(s):
    s = s.lower()
    tokens = nltk.tokenize.word_tokenize(s)
    tokens = [t for t in tokens if len(t) > 3]  # remove short words
    tokens = [lemm.lemmatize(t) for t in tokens]
    tokens = [t for t in tokens if t not in stopwords]
    tokens = [t for t in tokens if not any(c.isdigit() for c in t)]
    return tokens


word_index_map = {}
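# Hypothetical usage sketch (not part of the original snippet): with the custom
# stopword set above, tokenize_words() strips short words, numbers and noise
# terms from a book title, e.g.
#   tokenize_words("A First Introduction to Data Mining, 2nd Edition")
#   # -> ['data', 'mining']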
import matplotlib.pyplot as plt
import nltk, re
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter

stopwords = set(nltk.corpus.stopwords.words('english'))

TotalText = []
for index, row in df_FilesProperlyConverted.iterrows():
    text = row['Text']
    TotalText.append(text)

newStopWords = ['school', 'learning', 'student', 'pupil', 'teacher',
                'management', 'teaching', 'support', 'lesson', 'board']
stopwords = stopwords.union(newStopWords)

TotalText = " ".join(TotalText)
tokens = [w for w in word_tokenize(TotalText.lower()) if w.isalpha()]  # isalpha() checks if each word is alphabetical, lower() transforms everything to lowercase
no_stop = [t.strip() for t in tokens if t.strip() not in stopwords]  # stopwords already comes with a built-in list of words to remove
wordnet_lemmatizer = WordNetLemmatizer()
lemmatized = [wordnet_lemmatizer.lemmatize(t) for t in no_stop]

bow = Counter(lemmatized)
MostCommon = dict(bow.most_common(10))
plt.bar(*zip(*MostCommon.items()))
plt.title('Whole sample')
plt.xlabel('Most common words')
plt.ylabel('Number of times the word appears')
plt.xticks(rotation='vertical')
plt.savefig("Results\\Word count\\Whole sample.png")
plt.show()
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from wordcloud import WordCloud
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
from nltk import word_tokenize
import pandas as pd
import json
import os
import re

VEC_PATH = '/Users/anthonysicilia/Desktop/GoogleNews-vectors-negative300.bin'

stopwords = set(stopwords.words('english'))
stopwords = stopwords.union(set([word.strip() for word in open('stopwords.txt')]))

vectors = KeyedVectors.load_word2vec_format(datapath(VEC_PATH), binary=True)

comments = dict()
for comment_path in os.listdir('comments/'):
    with open('comments/' + comment_path) as f:
        try:
            x = json.load(f)
        except:
            print('Error Loading File.')
            exit()
        try:
            x = x['comments']
        except:
            print('Expected "comments" field. Field not found.')
            exit()
soup = get_soup(bp_transcripts)
alltxt = get_text(soup)
alltxt = filter_bolsonaro(alltxt)
text = "".join(alltxt)
text = punctuation_stop(text)

# Load extra stopwords from file and add them to the base stopword set
with open("brazilianwords.txt", 'r', encoding='utf-8') as f:
    new_words = [word for line in f for word in line.split()]

new_stopwords = stopwords.union(new_words)

text = ' '.join(text)

wc = WordCloud(background_color="white",
               width=1600,
               height=800,
               max_words=100,
               max_font_size=200,
               min_font_size=1,
               stopwords=new_stopwords)
wc.generate(text)

plt.figure(figsize=[20, 20])
plt.imshow(wc)
plt.axis('off')
proceedings = get_proceedings(min_year=1980, max_year=2019)

keywords = set(["embodied", "embody", "body", "bodies"])

cnt = Counter()
nouns = Counter()
for paper in proceedings:
    sentences = sent_tokenize(paper.clean_text)
    for sentence in sentences:
        words = e(sentence).split()
        for word in words:
            if word in keywords:
                r = [wordnet_lemmatizer.lemmatize(w) for w in words
                     if w not in stopwords.union(keywords)]
                cnt.update(r)
                nouns.update([x for x in r if x in all_nouns])
                break

for w, c in cnt.most_common(500):
    print(w, c)
print(70 * "*")
for w, c in nouns.most_common(500):
    print(w, c)

# Generate a word cloud image
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# lower max_font_size
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import string
from nltk import corpus
import numpy

# Import the corpus into the data variable
data = corpus.brown

# Use the Porter stemmer
stemmer = PorterStemmer()

# Build the set of stop words;
# we will filter the tokens against it
stopwords = set(stopwords.words('english'))
stopwords = stopwords.union(string.punctuation)

# Limit the number of files we will use.
# Uncomment the following line to use all of the corpus files:
# fileIds = data.fileids()
fileids = data.fileids()[:30]

idf_matrix = []
dictionary = dict()
# Total count of words in the corpus
words_count = 0
# Total count of documents in the corpus
documents_count = len(fileids)
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import os
import PyPDF2
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Set the working directory
os.chdir("/home/allomorph/Downloads")

# Ingest and read the PDF of the textbook
intext = open("Keith Johnson-Acoustic and auditory phonetics (Kindle friendly).pdf", 'rb')
read_text = PyPDF2.PdfFileReader(intext)
num_pages = read_text.getNumPages()
# Print the number of pages
# print(num_pages)

# Build a stopword set that also includes punctuation characters
stopwords = set(stopwords.words('english'))
stopwords = stopwords.union(punctuation)

# Instantiate the lemmatizer
wnl = WordNetLemmatizer()

# Build a lexicon of words from the textbook in the `instances` list
instances = []
for i in range(0, num_pages):
    page = read_text.getPage(i)
    content = page.extractText()
    content.encode('utf-8')
    tokens = word_tokenize(content)
    filtered_tokens = [word.lower() for word in tokens if word not in stopwords]
    for instance in filtered_tokens:
        instances.append(instance)