def review_cleaner(df, new_stopwords=None):
    '''Takes a df, cleans the reviews in it and returns an updated df.'''
    reviews = list(df["Spell-checked review"])
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.extend(["u", "also", "mok", "eric"])
    if new_stopwords is not None and isinstance(new_stopwords, list):
        stopwords.extend(new_stopwords)
    cleaned_reviews = []
    for review in tqdm(reviews):
        review = review.lower()
        # strip URLs, non-letter characters, times, dashes, and run-on punctuation
        review = re.sub(r'(?!\s)((\S*)((\.com)|(\.com\/))(\S*))', ' ', review)
        review = re.sub(r'[^A-Za-z\s-]', '', review)
        review = re.sub(r'(\W?)(\d+)(\S?)((\d+)?)(((a|p)[m])|((\s)((a|p)[m]))?)', ' ', review)
        review = re.sub(r'(?!\w+|\s)--+(?=\s|\w+)', ' ', review)
        review = re.sub(r'(?!\w+)([,]+|[.][.]+|\/+)(?=\w+)', ' ', review)
        review = re.sub(r'([A-Z]([a-z]+))((\s[A-Z]([a-z]+))+)', ' ', review)
        review = re.sub('-', '', review)
        review = " ".join([word for word in review.split() if word not in stopwords])
        # lemmatize with spaCy, dropping pronoun placeholders
        doc = nlp(review)
        lemmatized_review = " ".join([token.lemma_ for token in doc if token.lemma_ != "-PRON-"])
        cleaned_reviews.append(lemmatized_review)
    df["Cleaned review"] = cleaned_reviews
    return df
def lyrics_to_words(lyrics):
    '''Helper function to clean song lyrics: removes non-English words and
    stopwords, then applies the Porter stemmer.'''
    stopwords = nltk.corpus.stopwords.words('english')
    newStopWords = ['verse', '1', '2', 'chorus', 'bridge', 'talking',
                    'refrain', 'explain', 'request']
    stopwords.extend(newStopWords)
    stemmer = PorterStemmer()
    words_english = set(nltk.corpus.words.words())
    remove_non_english = " ".join(
        w for w in nltk.wordpunct_tokenize(lyrics)
        if w.lower() in words_english or not w.isalpha())
    text = re.sub(r"[^a-zA-Z0-9]", " ", remove_non_english.lower())  # convert to lower case
    words = text.split()                                 # split string into words
    words = [w for w in words if w not in stopwords]     # remove stopwords
    words = [stemmer.stem(w) for w in words]             # stem, reusing the single PorterStemmer instance
    return words
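# Minimal usage sketch for lyrics_to_words, assuming nltk, re and PorterStemmer are
# imported and the NLTK 'stopwords' and 'words' corpora have been downloaded.
example_lyrics = "Chorus: the rain keeps falling on the road tonight"
print(lyrics_to_words(example_lyrics))  # stemmed, stopword-free, English-only tokens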
def create_maps():
    '''Generates geojson files for creating maps of twitter activity, broken
    down into weekend vs. weekday and by major chunks of time.'''
    df = mm.pipeline.retrieve_and_merge_tweet_data()
    df = mm.pipeline.transform_timestamp(df, hour=True)
    df = get_tweet_rate(df)
    # remove geoids that are in the ocean
    odd_ids = ['060750601001016', '060750179021003', '060759901000003',
               '060759901000002', '060750179021000', '060750601001000',
               '060759804011003', '060750201001001']
    df = df[~df['geoid10'].isin(odd_ids)]
    # customize stopwords for editing tokens
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.extend(['...', ',,', ',,,', '..', 't', 'y', '(@', ')',
                      'c', 'i', 'I', 'a', '@', '.', 'co', 'com', 'amp',
                      'via', 'http', 'htt', 'https', '()', ']'])
    sstopwords = [unicode(word) for word in stopwords]
    # break dataframe into four-hour chunks of time throughout the day
    df_hour = tweets_by_hour(df)
    # obtain geometry data for each geoid for mapping
    df_hour = retrieve_geometry_information(df_hour)
    # get top ten tokens for each group
    df_hour['top_ten'] = df_hour.tokens.apply(top_tokens)
    # generate geojsons
    for time in df_hour.hr_bin.unique():
        time_df = df_hour[df_hour['hr_bin'] == time]
        outfilename = 'data/' + time + '.json'
        dataframe_to_geojson(time_df, outfilename)
def text_preprocessing(textIter):
    """
    Remove stopwords, punctuation, etc., and stem/tokenize text strings.
    :param textIter: iterable of text (e.g. list, dataframe, etc.)
    :return: list of tokens, grouped by document
    """
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.extend(string.punctuation)
    # addstopwords = [None, 'nan', '']
    # stopwords.extend(addstopwords)
    reviewsList = textIter
    # clean up text to remove punctuation and empty reviews (Python 2 str.translate)
    reviewsList[:] = [s.translate(None, string.punctuation).lower().split()
                      if str(s) not in (None, 'nan', '') else ''
                      for s in reviewsList]
    # group the list of strings together and remove stopwords
    reviewsList[:] = list(itertools.chain(*reviewsList))
    print "count of tokens before stopword removal: ", len(reviewsList)
    reviewsList = [word for word in reviewsList if word not in stopwords]
    reviewsList = [word for word in reviewsList
                   if re.search(r'[a-zA-Z]', word) is not None]
    print "count of tokens after stopword removal: ", len(reviewsList)
    # join all reviews together into one string and retokenize
    tokens = nltk.word_tokenize(" ".join(reviewsList))
    return tokens
def stop_words(table): #We need to remove the stop words stopwords = nltk.corpus.stopwords.words('english') #spacy's stop words newStopWords = ['im','oh','lol','whence','id', 'here', 'show', 'were', 'why', 'n’t', 'the', 'whereupon', 'not', 'more', 'how', 'eight', 'indeed', 'i', 'only', 'via', 'nine', 're', 'themselves', 'almost', 'to', 'already', 'front', 'least', 'becomes', 'thereby', 'doing', 'her', 'together', 'be', 'often', 'then', 'quite', 'less', 'many', 'they', 'ourselves', 'take', 'its', 'yours', 'each', 'would', 'may', 'namely', 'do', 'whose', 'whether', 'side', 'both', 'what', 'between', 'toward', 'our', 'whereby', "'m", 'formerly', 'myself', 'had', 'really', 'call', 'keep', "'re", 'hereupon', 'can', 'their', 'eleven', '’m', 'even', 'around', 'twenty', 'mostly', 'did', 'at', 'an', 'seems', 'serious', 'against', "n't", 'except', 'has', 'five', 'he', 'last', '‘ve', 'because', 'we', 'himself', 'yet', 'something', 'somehow', '‘m', 'towards', 'his', 'six', 'anywhere', 'us', '‘d', 'thru', 'thus', 'which', 'everything', 'become', 'herein', 'one', 'in', 'although', 'sometime', 'give', 'cannot', 'besides', 'across', 'noone', 'ever', 'that', 'over', 'among', 'during', 'however', 'when', 'sometimes', 'still', 'seemed', 'get', "'ve", 'him', 'with', 'part', 'beyond', 'everyone', 'same', 'this', 'latterly', 'no', 'regarding', 'elsewhere', 'others', 'moreover', 'else', 'back', 'alone', 'somewhere', 'are', 'will', 'beforehand', 'ten', 'very', 'most', 'three', 'former', '’re', 'otherwise', 'several', 'also', 'whatever', 'am', 'becoming', 'beside', '’s', 'nothing', 'some', 'since', 'thence', 'anyway', 'out', 'up', 'well', 'it', 'various', 'four', 'top', '‘s', 'than', 'under', 'might', 'could', 'by', 'too', 'and', 'whom', '‘ll', 'say', 'therefore', "'s", 'other', 'throughout', 'became', 'your', 'put', 'per', "'ll", 'fifteen', 'must', 'before', 'whenever', 'anyone', 'without', 'does', 'was', 'where', 'thereafter', "'d", 'another', 'yourselves', 'n‘t', 'see', 'go', 'wherever', 'just', 'seeming', 'hence', 'full', 'whereafter', 'bottom', 'whole', 'own', 'empty', 'due', 'behind', 'while', 'onto', 'wherein', 'off', 'again', 'a', 'two', 'above', 'therein', 'sixty', 'those', 'whereas', 'using', 'latter', 'used', 'my', 'herself', 'hers', 'or', 'neither', 'forty', 'thereupon', 'now', 'after', 'yourself', 'whither', 'rather', 'once', 'from', 'until', 'anything', 'few', 'into', 'such', 'being', 'make', 'mine', 'please', 'along', 'hundred', 'should', 'below', 'third', 'unless', 'upon', 'perhaps', 'ours', 'but', 'never', 'whoever', 'fifty', 'any', 'all', 'nobody', 'there', 'have', 'anyhow', 'of', 'seem', 'down', 'is', 'every', '’ll', 'much', 'none', 'further', 'me', 'who', 'nevertheless', 'about', 'everywhere', 'name', 'enough', '’d', 'next', 'meanwhile', 'though', 'through', 'on', 'first', 'been', 'hereby', 'if', 'move', 'so', 'either', 'amongst', 'for', 'twelve', 'nor', 'she', 'always', 'these', 'as', '’ve', 'amount', '‘re', 'someone', 'afterwards', 'you', 'nowhere', 'itself', 'done', 'hereafter', 'within', 'made', 'ca', 'them'] stopwords.extend(newStopWords) table = ' '.join([word for word in table.split() if word not in (stopwords)]) return table
def remove_stopwords(words):
    """Remove stop words from a list of tokenized words."""
    stopwords = nltk.corpus.stopwords.words('english')
    myStopWords = []          # add any domain-specific stopwords here
    stopwords.extend(myStopWords)
    new_words = []
    for word in words:
        if word not in stopwords:
            new_words.append(word)
    return new_words
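# Minimal usage sketch, assuming nltk and its 'stopwords' corpus are available;
# example_tokens is a hypothetical token list as produced by nltk.word_tokenize.
example_tokens = ['this', 'is', 'a', 'small', 'example', 'sentence']
print(remove_stopwords(example_tokens))  # e.g. ['small', 'example', 'sentence']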
def stopword_removal(words):
    stopwords = nltk.corpus.stopwords.words('english')
    newStopWords = ['said', 'say', 'says', 'mr']
    stopwords.extend(newStopWords)
    word_filtered = []
    for w in words:
        if w not in stopwords:
            word_filtered.append(w)
    # dict.fromkeys preserves order while dropping duplicates
    unique = list(dict.fromkeys(word_filtered))
    return " ".join(unique)
def get_similarity(df):
    stopwords = nltk.corpus.stopwords.words('english')
    newStopWords = ['description', 'developers', 'describe']
    stopwords.extend(newStopWords)
    # pass the full extended list, not just the new words, to the vectorizer
    count = CountVectorizer(stop_words=stopwords)
    count_matrix = count.fit_transform(df['soup'])
    cosine_sim = cosine_similarity(count_matrix, count_matrix)
    df = df.reset_index()
    indices = pd.Series(df.index, index=df['name'])
    return cosine_sim, indices
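# Minimal usage sketch for get_similarity, assuming the pandas/sklearn/nltk imports
# used above; 'games' is a hypothetical dataframe with 'soup' and 'name' columns.
games = pd.DataFrame({'name': ['game_a', 'game_b'],
                      'soup': ['puzzle indie relaxing', 'puzzle logic relaxing']})
cosine_sim, indices = get_similarity(games)
print(cosine_sim[indices['game_a']])  # similarity of game_a to every row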
def calc_freq(file):
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.extend(["metaenddot", "metanumberref", "metaendquestion", "metanumberrefs"])
    f = open(file, 'r')
    # note: nltk.clean_html() was removed in newer NLTK releases (use BeautifulSoup instead)
    raw = nltk.clean_html(f.read())
    raw = ''.join(ch for ch in raw if ch not in set(string.punctuation))
    # generate a list of lowercased tokens with stopwords removed
    tokens = [token.lower() for token in raw.split() if token.lower() not in stopwords]
    fdist = FreqDist(tokens)
    return fdist
def text_process(text):
    # strip punctuation and lowercase
    strip_punc = ''.join(c for c in text if c not in string.punctuation)
    strip_punc = strip_punc.lower()
    stopwords = nltk.corpus.stopwords.words('english')
    newstopwords = ['4', '2', '7', '3', '5', '1', '8', '0', '9', 'f', 'n', 'g',
                    'u', 'w', 'b', 'p', 'r', '6', 'k', 'x', 'cs', 'kp', 'kn',
                    'fa', 'ua', 'fo', 'st', 'jt', 'rr', 'pr', 'ey', 'gt', 'ff',
                    'lk', 'yo', 'um', 'jj', 'jh', 'ya', 'cr', 'th', 'lh', 'http']
    stopwords.extend(newstopwords)
    return [word for word in strip_punc.split() if word.lower() not in stopwords]
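# Minimal usage sketch, assuming string and nltk (with the 'stopwords' corpus) are imported.
sample_message = "Go until jurong point, crazy.. Available only in bugis n great world"
print(text_process(sample_message))
# e.g. ['go', 'jurong', 'point', 'crazy', 'available', 'bugis', 'great', 'world']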
def stemm_stop(text):
    ps = PorterStemmer()
    # stop_words = stopwords.words("english")
    stopwords = nltk.corpus.stopwords.words('english')
    newStopWords = ['num', 'na', '#']
    stopwords.extend(newStopWords)
    filtered_words = []
    for i in text.split():
        if i not in stopwords:
            filtered_words.append(ps.stem(i))
    return " ".join(filtered_words)
def remove_stop_words(self):
    all_stopwords = nltk.corpus.stopwords.words('english')
    if self.stop_words_list:
        # extend the local list built above, not an undefined module-level name
        all_stopwords.extend(self.stop_words_list)
    for item in self.items:
        for field in self.class_properties:
            current_field_value = getattr(item, field)
            setattr(item, field, [word for word in current_field_value
                                  if word not in all_stopwords])
def filter_it(self, titles):
    global l
    # filtered = re.sub(r'[^A-Za-z0-9 ]+', ' ', titles)
    newStopWords = ['Extractation Failed', 'mercado', 'libre', 'amazonde',
                    'amazon', 'amazonca', 'en', 'amazonde', 'amazonfr',
                    'amazoncom', 'amazoncouk', 'mercadolibre', 'ebay', 'ebaycom']
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.extend(newStopWords)
    word_tokens = word_tokenize(titles.lower())
    l = [w for w in word_tokens if w not in stopwords]
def clean_words(job_type_list, stopwords):
    # note: the original re-imported nltk.corpus.stopwords here, which shadowed the
    # `stopwords` argument; the parameter (a list of words) is used directly instead
    stopwords = ' '.join(stopwords)
    stopwords = stopwords.translate(str.maketrans('', '', string.punctuation)).lower()
    stopwords = stopwords.split(' ')
    stopwords.extend(['food', 'restaurant', 'get', 'place', 'really', 'menu', 'also', 'one',
                      'got', 'two', 'us', 'around', 'san', 'francisco', 'sf', '', 'la',
                      'order', 'ordered', 'eat', 'good', 'come', 'first', 'go', 'even',
                      'would', 'hour', 'well', 'time', 'way', 'spot', 'like', 'make',
                      'worth', 'back', 'never', 'seven', 'close', 'back', 'etc', 'using',
                      'including', 'use', "you'll", '·', 'job', 'qualifications', 'plus',
                      'experience', 'work', 'working', 'scientist', 'science', 'company',
                      'skills', 'eg', 'equal', 'scientists', 'role', 'industry', 'data',
                      'engeineer', 'engineering'])
    special_chars = ['--', '...', '\n', '•', '®', '●', '\n']
    a = ' '.join(job_type_list)
    # remove punctuation and make lower case
    a = a.translate(str.maketrans('', '', string.punctuation)).lower()
    for char in special_chars:
        a = a.replace(char, ' ')  # replace special char with a space
    resultwords = [word for word in a.split(' ') if word.lower() not in stopwords]
    return resultwords
def main():
    nlp = spacy.load('en_core_web_lg')
    path = os.getcwd() + '/data/'
    data_list = sorted(os.listdir(path))
    if '.DS_Store' in data_list:
        data_list.remove('.DS_Store')
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.extend(['–', '=', '>', '↩'])
    lemma = WordNetLemmatizer()
    holder = []
    for i in range(len(data_list)):
        fullpath = path + data_list[i]
        data = codecs.open(fullpath, 'r', 'utf-8')
        data_text = data.read()
        data_tokens = data_text.strip().split()
        lemma_data_tokens = [lemma.lemmatize(word.lower()) for word in data_tokens]
        content = [lemma.lemmatize(word.lower()) for word in data_tokens
                   if word.lower() not in stopwords]
        fdist_nostop = nltk.FreqDist(content)
        fdist_stop = nltk.FreqDist(lemma_data_tokens)
        holder.append(data_text)
        print('-----------------------{}-----------------------'.format(data_list[i]))
        print('1 gram with stopwords: {}'.format(fdist_stop.most_common(10)))
        print('1 gram without stopwords: {}'.format(fdist_nostop.most_common(10)))
        # n-gram frequencies (use a separate loop variable to avoid shadowing i)
        for n in range(2, 6):
            bgs = nltk.ngrams(data_tokens, n)
            fdist = nltk.FreqDist(bgs)
            print('{} gram: {}'.format(n, fdist.most_common(10)))
    # pairwise document similarity with spaCy vectors
    for k in range(len(holder)):
        nlp_doc = nlp(holder[k])
        for y in range(len(holder)):
            nlp_other_doc = nlp(holder[y])
            print("{} {}: {}".format(data_list[k], data_list[y],
                                     round(nlp_doc.similarity(nlp_other_doc), 4)))
def remove_stopwords(text):
    '''
    Removes English stopwords.
    input:  text: string
    output: string
    '''
    from nltk.corpus import stopwords
    stopwords = stopwords.words('english')  # English stopword list
    # stopwords.extend(stop_pt)  # optionally extend with Portuguese stopwords here
    return ' '.join([word for word in str(text).split() if word not in stopwords])
def cleanText(x):
    soup = BeautifulSoup(x, 'lxml')
    no_html_text = soup.get_text()
    tokens = nltk.word_tokenize(no_html_text)
    tokens = [w.lower() for w in tokens]
    # strip punctuation from each token
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    stopwords = nltk.corpus.stopwords.words('italian')
    stopwords.extend(string.punctuation)
    stopwords.extend(nltk.corpus.stopwords.words('english'))
    words = [w for w in stripped if w.isalpha() and w not in stopwords]
    return " ".join(words)
def get_value(self, text):
    if self.executable is None:
        raise Exception('An executable is necessary.')
    stopwords = []
    if self.stem_patterns:
        for pattern in self.patterns:
            stopwords.extend(pattern.split(' '))
    # remove duplicates
    stopwords = list(set(stopwords))
    return self.executable(CleanedText(text, additional_stopwords=stopwords))
def get_stopwords(self, language):
    """
    Loads the list of stopwords used for processing.

    Params:
        language (string): language used to look up the stopwords
    Returns:
        stopwords (list): list of loaded stopwords
    """
    stopwords = nltk.corpus.stopwords.words(language)
    stopwords.extend(['?', '.', ',', '(', ')', '!'])
    return stopwords
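# Minimal usage sketch, assuming this method lives on some class instance (here called
# `preprocessor`, a hypothetical name) and the NLTK stopword corpus is downloaded.
stops = preprocessor.get_stopwords('portuguese')
print(len(stops), stops[:5])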
def freqdst(ks, stopwords=stopwords, leaveout=[]):
    # copy before extending so repeated calls do not mutate the shared default list
    stopwords = list(stopwords)
    stopwords.extend(leaveout)
    tokenizer = nltk.word_tokenize
    txtout = dict.fromkeys(ks.keys())
    for k in ks.keys():
        txt = ' '.join([i['TI'] for i in ks[k]])
        txt = txt.lower()
        try:
            tok = tokenizer(UnicodeDammit(txt).unicode_markup)
        except UnicodeEncodeError:
            continue  # skip keys that cannot be decoded
        tok = [t for t in tok if t not in stopwords]
        tok = [t for t in tok if len(t) > 2]
        txtout[k] = tok
    return txtout
def get_all_stopwords(character_names=True):
    stopwords = []
    for file in os.listdir("resources"):
        with open(os.path.join(my_path, "resources", file)) as infile:
            if file == "char_stopwords.txt" and not character_names:
                continue  # skip character names when they are not wanted
            words = [line.strip() for line in infile.readlines()]
            stopwords.extend(words)
    return list(set(stopwords))
def remove_stopwords(input_str, add_stopwords=[], exclude_stopwords=[]):
    '''Returns a string with stopwords removed. Optionally add a list of words to
    the stopword list, or exclude a list of words from it.'''
    stopwords = nltk.corpus.stopwords.words('english')
    if len(add_stopwords) > 0:
        stopwords.extend(add_stopwords)
    if len(exclude_stopwords) > 0:
        stopwords = [word for word in stopwords if word not in exclude_stopwords]
    input_list = input_str.split()
    without_stopwords = [word for word in input_list if word not in stopwords]
    return ' '.join(without_stopwords)
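# Minimal usage sketch, assuming nltk and its 'stopwords' corpus are available:
# add a domain word to the stopword list and keep 'not' in the text.
cleaned = remove_stopwords("this movie was not that good at all",
                           add_stopwords=['movie'],
                           exclude_stopwords=['not'])
print(cleaned)  # e.g. "not good"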
def create_maps():
    '''Generates geojson files for creating maps of twitter activity, broken
    down into weekend vs. weekday and by major chunks of time.'''
    df = mm.pipeline.retrieve_and_merge_tweet_data()
    wkd_df = mm.pipeline.transform_timestamp(df, DOW=True)
    wkd_df = get_tweets_per_day(wkd_df)
    # remove geoids that are in the ocean
    odd_ids = ['060750601001016', '060750179021003', '060759901000003',
               '060759901000002', '060750179021000', '060750601001000',
               '060759804011003', '060750201001001']
    df = df[~df['geoid10'].isin(odd_ids)]
    # get the average number of tweets per day for every day of the week
    wkd_df = wkd_df.groupby(['geoid10', 'DOW']).agg(np.mean).reset_index()
    # get a grouped sum of the words
    wkd_df_txt = wkd_df.groupby(['geoid10', 'DOW'])['tokens'].apply(lambda x: ','.join(x)).reset_index()
    # merge these two dataframes together
    wkd_df['tokens'] = wkd_df_txt['tokens']
    # create dataframes of only weekend and only weekday values
    df_weekend = seperate_weekends(wkd_df, True)
    df_weekday = seperate_weekends(wkd_df, False)
    # customize stopwords for editing tokens
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.extend(['...', ',,', ',,,', '..', 't', 'y', '(@', ')', 'c', 'i', 'I', 'a', ',',
                      '@', '.', 'co', 'com', 'amp', '?', 'via', 'http', 'htt', 'https', '()', ']'])
    stopwords.extend([str(char) for char in punctuation])
    sstopwords = [unicode(word) for word in stopwords]
    # get the most frequent words for visualization
    df_weekday['top_ten'] = df_weekday.tokens.apply(top_tokens)
    df_weekend['top_ten'] = df_weekend.tokens.apply(top_tokens)
    # get geometry information for each san francisco block
    df_end = retrieve_geometry_information(df_weekend)
    df_day = retrieve_geometry_information(df_weekday)
    # generate geojsons
    dataframe_to_geojson(df_end, 'data/weekend.json')
    dataframe_to_geojson(df_day, 'data/weekday.json')
def stem_tokenize(str_use):
    """
    Takes a string and tokenizes it, stripping it of punctuation and stopwords.
    Returns a list of strings.
    """
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.extend(string.punctuation)
    stopwords.append('')
    addstopwords = ["in", "on", "of", "''"]
    stopwords.extend(addstopwords)  # extend (not append) so the words are added individually
    stemmer = wordnet.WordNetLemmatizer()
    # PunktWordTokenizer was removed in newer NLTK releases; this requires an older version
    tokenizer = punkt.PunktWordTokenizer()
    # removes stopwords and punctuation, then splits the string into a list of words
    token = [token.lower().strip(string.punctuation) for token in tokenizer.tokenize(str_use)
             if token.lower().strip(string.punctuation) not in stopwords]
    text = [word for word in token if re.search(r'[a-zA-Z]', word) is not None]
    stem = [stemmer.lemmatize(word) for word in text]
    # Returns a list of strings
    return stem
def generate_word_list(text_col, nr_words=n):  # n is assumed to be defined at module scope
    tokens = word_tokenize(text_col.to_string())           # tokenize
    lower_tokens = [t.lower() for t in tokens]              # convert the tokens into lowercase
    alpha_only = [t for t in lower_tokens if t.isalpha()]   # retain alphabetic words
    # remove all stop words
    stopwords = nltk.corpus.stopwords.words('english')
    newStopWords = ["rt", "bitcoin", "crypto", "cryptocurrency", "blockchain", "blockcha",
                    "btc", "bitcoi", "bitcoins", "daily", "say", "could", "price", "ethereum",
                    "eth", "classic", "exchange", "market", "cryptocurrencie", "one", "first",
                    "short", "check", "cryptocurrencies", "http", "htttp", "hour", "list", "u",
                    "new", "vi", "ccn", "etc", "usd"]
    stopwords.extend(newStopWords)
    no_stops = [t for t in alpha_only if t not in stopwords]
    wordnet_lemmatizer = WordNetLemmatizer()
    # lemmatize all tokens, then remove stopwords again after lemmatization
    lemmatized = [wordnet_lemmatizer.lemmatize(t) for t in no_stops if len(t) > 1]
    lemmatized = [t for t in lemmatized if t not in stopwords]
    bow = Counter(lemmatized)  # create the bag-of-words
    word = []
    word_count = []
    for i in range(nr_words):
        word.append(bow.most_common(nr_words)[i][0])
        word_count.append(bow.most_common(nr_words)[i][1])
    # return the nr_words most common tokens and their counts
    words_and_counts_df = pd.DataFrame({"word": word, "word_count": word_count})
    return words_and_counts_df
def is_ci_stem_stopword_set_match(self, a, b, threshold=0.5):
    """Check if a and b are matches (case-insensitive, stemmed, stopwords removed)."""
    # Get default English stopwords and extend with punctuation
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.extend(string.punctuation)
    stopwords.append('')
    # Create tokenizer and stemmer
    # (PunktWordTokenizer was removed in newer NLTK releases)
    tokenizer = nltk.tokenize.punkt.PunktWordTokenizer()
    stemmer = nltk.stem.snowball.SnowballStemmer('english')
    tokens_a = [token.lower().strip(string.punctuation) for token in tokenizer.tokenize(a)
                if token.lower().strip(string.punctuation) not in stopwords]
    tokens_b = [token.lower().strip(string.punctuation) for token in tokenizer.tokenize(b)
                if token.lower().strip(string.punctuation) not in stopwords]
    stems_a = [stemmer.stem(token) for token in tokens_a]
    stems_b = [stemmer.stem(token) for token in tokens_b]
    # Calculate Jaccard similarity of the stem sets
    ratio = len(set(stems_a).intersection(stems_b)) / float(len(set(stems_a).union(stems_b)))
    return ratio >= threshold
def get_reviews(fname):
    """
    Get review text from the data set.
    :param fname: file name of data set; expecting csv
    :return: list of text reviews (strings)
    """
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.extend(string.punctuation)
    try:
        with open(fname, "rb") as infile:
            df = pd.DataFrame.from_csv(infile, header=0, index_col=False)
        # drop any review entries that are blank
        print "length of df: ", len(df)
        df = list(df['r_text'].dropna())
        print "... after removing NAs: ", len(df)
        # clean up text: remove newlines and lowercase the reviews
        reviewsList = [s.replace('\n', '').lower() for s in df]
        return reviewsList
    except:
        raise IOError
def parse_text(txt):
    txt = txt.lower()  # converting to lowercase
    # removing punctuation and digits
    p = string.punctuation
    d = string.digits
    tables = str.maketrans(p, len(p) * " ")
    text1 = txt.translate(tables)
    tables = str.maketrans(d, len(d) * " ")
    text1 = text1.translate(tables)
    words = word_tokenize(text1)  # tokenization
    # lemmatization
    wordnet_lemmatizer = WordNetLemmatizer()
    words1 = [wordnet_lemmatizer.lemmatize(token) for token in words]
    # removing stopwords (the NLTK corpus name must be lowercase 'english')
    stopwords = nltk.corpus.stopwords.words("english")
    extra_stopwords = ['rt', 'RT', 'TakeTheKnee', 'taketheknee', 'TakeAKnee', 'takeaknee']
    stopwords.extend(extra_stopwords)  # adding stopwords
    words = [w for w in words1 if w not in stopwords]
    return " ".join(words)
import re
import string
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
import seaborn

## 2. Tokenizing
## 3. Removal of stopwords
## 4. Stemming
stopwords = nltk.corpus.stopwords.words("english")
# extending the stopwords to include other words used in twitter such as retweet (rt) etc.
other_exclusions = ["#ff", "ff", "rt"]
stopwords.extend(other_exclusions)
stemmer = PorterStemmer()

def preprocess(comments_dataset):
    # removal of extra spaces
    regex_pat = re.compile(r'\s+')
    comments_dataset_space = comments_dataset.str.replace(regex_pat, ' ')
    # removal of @name [mention]
    regex_pat = re.compile(r'@[\w\-]+')
    comments_dataset_name = comments_dataset_space.str.replace(regex_pat, '')
    # removal of links [https://abc.com]
    giant_url_regex = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
# Get list of each track lyric text files
# ---------------------------------------
# ------------------NELSON---------------------------------------
import os
import nltk
from itertools import chain
from nelsonfunctions import inforemove, infocheck, nelsonlinewrite

# ------------------MODIFY HERE-----------------------------------
path = os.getcwd() + "\\Counts Program\\SourcePath.txt"
infobuzzwords = ["typed", "artist", "album", "title", "song"]
stopwords = nltk.corpus.stopwords.words("english")
outfile = "top_20_words_per_lyric_file.csv"
# ------------------------------------------------------
stopwords.extend([x.strip() for x in open("newstopwords.txt")])  # combine with new words

tracks = []
FILE = open(path, "r")
for line in FILE:
    line = line.replace("\n", "")
    tracks.append(line)
FILE.close()

# For every track get count of words that match keywords
# ------------------------------------------------------
word = []
for track in tracks:
    word = []  # Clear word buffer
if len(df) < 1:
    st.error("Não foi possível recuperar dados do twitter do parlamentar")
else:
    # build a single string with all the words from the last 200 tweets
    big_string = ''
    for i in range(len(df)):
        big_string = big_string + df[i]['text']

    # define stopwords
    stopwords = stopwords.words('portuguese') + list(punctuation)
    stopwords.extend(['https', 'http', 'sobre', 'vamos', 'co', 'rt', 'todos', 'todo',
                      'rs', 'vc', 'ser', 'pra', 'tudo', 'vai', 'vcs', 'www', 'br',
                      'coisa', 'hoje', 'dia', 'saiba', 'html', 'htm', 'via'])

    # insert title and hyperlink above the word cloud
    end = df[0]['link'].split('/status')[0]
    link_perfil = 'Nuvem de Palavras (<a href= %(end)s target="_blank">Twitter</a>)' % {'end': end}
    st.markdown("### " + link_perfil, unsafe_allow_html=True)

    # create the WordCloud
    wordcloud = WordCloud(stopwords=stopwords,
                          background_color='white').generate(big_string.lower())
# Morgan Smith
# This is a program to classify Github Repositories and label them with possible
# points of contribution. NLP is used on README files as well as the most recent
# issues published; LDA is used to find the needed contributions.
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, PunktSentenceTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary

# filter with stopwords
stopwords = nltk.corpus.stopwords.words('english')
newStops = ['.', '#', ',', 'txt']
stopwords.extend(newStops)

# opening file to be tokenized
f = open('C:/Users/jmorg/390/390SNA/README.txt')
raw = f.read()

# creating a tokenized, lemmatized word list with stopwords removed
tokens = word_tokenize(raw)
lemma = WordNetLemmatizer()
wordList = []
lower = (w.lower() for w in tokens)
for i in lower:
    if i not in stopwords:
        wordList.append(lemma.lemmatize(i))
    con_words.append(tokens)
con_words = [sl for li in con_words for sl in li]

adv_words = []
for sentence in adv_sentences:
    tokens = word_tokenize(sentence)
    adv_words.append(tokens)
adv_words = [sl for li in adv_words for sl in li]

# removing stopwords and adding a few more stopwords to this list
stopwords = stopwords.words('english')
addstopwords = ['please', 'make', 'still']
stopwords.extend(addstopwords)

# filter word tokens by removing stopwords
filtered_pos_words = []
for word in pos_words:
    if word not in stopwords:
        filtered_pos_words.append(word)

filtered_con_words = []
for word in con_words:
    if word not in stopwords:
        filtered_con_words.append(word)

filtered_adv_words = []
for word in adv_words:
from collections import defaultdict
from tqdm import tqdm
import glob
import pandas as pd
from itertools import product
import string
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import numpy as np
from time import time
import argparse

stopwords = stopwords.words('english')
stopwords.extend(['sp', 'ssp', 'var'])

MIN_CONF = 0.5
ALPHA = 20

def clean_string(text):
    # drop punctuation characters, lowercase, then remove stopwords
    text = ''.join([word for word in text if word not in string.punctuation])
    text = text.lower()
    text = ' '.join([word for word in text.split() if word not in stopwords])
    return text
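# Minimal usage sketch for clean_string, assuming the imports and stopword setup above.
print(clean_string("Quercus robur L. ssp. robur, the common oak"))
# e.g. "quercus robur l robur common oak"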
'Watch', 'Podcast', 'Foreign', 'Edition', 'Podcast', 'Opinion', 'Notable', 'Quotable', 'opinion', \ 'Best', 'Web', 'Newsletter', 'opinion', 'Morning', 'Editorial', 'Report', 'Newsletter', 'section-link',\ '3qFFDClt', 'Life', 'Arts', 'Arts', 'Books', 'Cars', 'Food', 'Drink', 'Health', 'Ideas', 'Science', 'Sports',\ 'Style', 'Fashion', 'Travel', 'Magazine', 'Puzzles', 'Future', 'Everything', 'Far', 'Away', 'Life', 'Arts',\ 'section-link', '3qFFDClt', 'House', 'Day', 'section-link', '3qFFDClt', 'Magazine', 'Fashion', 'Art', 'Design',\ 'Travel', 'Food', 'Culture', 'returnLink', '235Zspdg', 'mailto', 'support', '@', 'support', '@', 'strap',\ 'Articles', 'img', 'U.S.', 'Ban', 'Travel', 'From', 'image', '2srBg4oD', '1x', '2x', '3x','/h3', '1zGPJwbt',\ 'div', 'image-container', '3SkfuWVV', '/', '/div', '1zGPJwbt', 'h3', 'episode-name', '3Xrkqwfv', '/h3', \ '1zGPJwbt', 'div', 'image-container', '3SkfuWVV', '/', '/div', '1zGPJwbt', 'h3', 'episode-name', '3Xrkqwfv',\ 'Cookie', 'Policy', 'Copyright','3qZEiy_G', 'skipToMainButton', '-1', 'Skip', 'Main','instagram', '1nV6js1B', \ 'Instagram', 'Instagram', 'youtube','$','brand-link', '21t2Ybqa', 'masthead-strap-link', '3Kba64tv', 'Print',\ 'masthead-strap-link', '3Kba64tv','Privacy', 'Data', 'Subscriber', 'Agreement', 'Terms', 'Use', \ 'cookies-advertising', 'Choices', ] stopwords.extend(newstpwrds) with open(f'ZMSMDummy_{today}.txt', 'a') as fo: fo.write(text) with open(f'ZMSMDummy_{today}.txt', 'r', errors='ignore') as fo1: csvWriter = csv.writer(fo1) msm = fo1.readlines() for i in msm: clean = [] tokenized_var = word_tokenize(i) for word in tokenized_var: if not word in stopwords and "https" not in word \ and '.com' not in word \ and 'index' not in word \
import os
import numpy as np
from PIL import Image
import json
import multiprocessing as mp
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

source_directory = "books"
data_directory = "bookdata"
source_files = [f.split(".")[0] for f in os.listdir(source_directory)]
done_files = [f.split(".")[0] for f in os.listdir(data_directory)]
files = []

# extend the NLTK stopword list with custom stopwords read from a file
gfile = open("custom_stopwords.txt", "r")
to_remove = [l.strip() for l in gfile.readlines()]
stopwords = stopwords.words('english')
stopwords.extend(to_remove)

# Creating a list of the files not yet done, in case the process gets interrupted
for file in source_files:
    if file not in done_files:
        files.append(file)
file_count = len(files)

def process(i):
    data = {}
    file = open(f"{source_directory}/{files[i]}.txt", "r")
    print(f"Starting {i}: {files[i]}")
    text = file.read()
    # Not doing text.lower() to maintain the case of the words
    tokens = word_tokenize(text)
    for word in tokens:
import math
import time
import sys
from nltk.corpus import stopwords
from nltk.stem.snowball import EnglishStemmer

# Assuming English everything for now; it would require a bit of refactoring
# to try this sampler on a corpus in another language.
stemmer = EnglishStemmer()

# Default stopwords from NLTK
stopwords = stopwords.words('english')
# Add some of our own.
stopwords.extend(['european', 'commission', 'like', 'must', 'also', 'would',
                  'mr', 'mrs', 'go', '.', ',', '?', 'new', 'put', 'way', 'use',
                  'policy', 'europe', 'need', 'member', 'preside', 'state',
                  'parliament', 'union', 'make', 'propose', 'country', 'council',
                  'report', 'take', 'develop', 'right', 'question', 'therefore'])
stopwords = set([stemmer.stem(x) for x in stopwords])

def should_exclude(word, sw=stopwords):
    """
    Should we exclude word?
    :param word: a word
    :param sw: a set of stopwords to exclude
    :return: True iff we want to exclude word.
    """
    return (word.lower() in sw) or (len(word) <= 3)
# from cochranenlp.textprocessing.drugbank import Drugbank
import nltk
from nltk.corpus import stopwords
from noaho import NoAho
# import fuzzywuzzy

# drugbank = Drugbank()

stopwords = stopwords.words('english')
# hand-crafted and intended for targeting interventions!
# some of these are just words that are likely to be shared
# between cdsr text and abstract, even though not describing
# interventions specifically
stopwords.extend(["either", "time", "patients", "or"])

def distantly_annotate(n=None):
    '''
    e.g.,
    > tagged_pmids, tagged_abstracts, tokens_and_lbls, intervention_texts = \
            distant_intervention_tag.distantly_annotate(500)
    '''
    bviewer = BiViewer()
    return ds_interventions_abstracts(bviewer, num_studies=n)

def _tag_drugs(study):
    intervention_text = _iv_for_study(study)
    abstract = _abstract_for_study(study)
    interventions_tokens = nltk.word_tokenize(intervention_text)
import datetime
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
# from nltk.stem import SnowballStemmer
from pattern.text.en import lemma
from pymongo import MongoClient
import collections
import re

# stemmer = SnowballStemmer("english")
splits = re.compile(r'[\s\[\]\?().,;:\'"/]+')
lemmatizer = WordNetLemmatizer()

stopwords = list(stopwords.words('english'))
other_words = ['used', 'propose', 'provide', 'show', 'set', 'also']
stopwords.extend(other_words)
# note: the line below replaces the list built above with the contents of stop_words.txt
stopwords = [unicode(line.strip('\n')) for line in open('./stop_words.txt')]
# 13041 entries as str; 13042 after unicode conversion
# C, R, go
is_num = re.compile(r'^[\d|-|=]+$')

class Count(object):
    def __init__(self, host='127.0.0.1:27017', db_name='esi', doc_name='test',
                 key=None, result=None, show_result=True, reset_result=True,
#!/usr/bin/env python from nltk.corpus import stopwords stopwords = stopwords.words('english') stopwords.extend(['table', 'tables','host', 'hosts', 'delicious', 'anything', 'everything', 'something', 'host', 'lunch', 'aka', 'menu', 'menus', 'fare', 'buffet', 'lunches', 'dinner', 'dinners', "a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also","although","always","am","among", "amongst", "amoungst", "amount", "an", "and", "another", "any","anyhow","anyone","anything","anyway", "anywhere", "are", "around", "as", "at", "back","be","became", "because","become","becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom","but", "by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven","else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own","part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thickv", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves", "the", 'ingredient', 'ingredients', 'cuisine', 'cuisines', 'restaurant', 'restaurants', 'board', 'boards', 'waitstaff', 'waiter', 'waitress', 'waiters', 'waitresses', 'waitperson', 'scratch', 'scratches', 'potluck', 'potlucks', 'msg', 'feed', 'feeds', 'feeding', 'drinkable', 'drinkables', 'dishses', 'dietary', 'dieting', 'diets', 'colouring', 'coloured', 'colour', 'conserve', 'center', 'centre', 'centers', 
'centres', 'bite', 'spoon', 'spoons', 'spoonful', 'yumm', 'yummy', 'edible', 'drunk', 'drank', 'drunken', 'vegetarianism', 'vegetarian', 'vegetarians', 'takeout', 'mess', 'messy', 'messing', 'messes', 'leftovers', 'leftover', 'end', 'ends', 'ending', 'joint', 'joints', 'carb', 'carbs', 'carbohydrate', 'carbohydrates'])
def csw(stopwords, filename):
    '''Given an existing stopword list and a file of new stopwords (one per line),
    return the combined list.'''
    listofwords = [x.strip() for x in open(filename)]
    stopwords.extend(listofwords)
    return stopwords
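# Minimal usage sketch, assuming nltk's stopword corpus is available and a file
# 'newstopwords.txt' (hypothetical) contains one extra stopword per line.
base = nltk.corpus.stopwords.words('english')
combined = csw(base, 'newstopwords.txt')
print(len(combined))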
# Find where the nltk package is installed
import nltk
from nltk.corpus import gutenberg
from nltk.tokenize import word_tokenize
print(nltk.__file__)

# sample text
sample = gutenberg.raw("bible-kjv.txt")
## fp = open('C:/Users/MyStyle/Desktop/WordAnalyze/Text/Trump.txt', 'r', encoding='utf-8')
## sample = fp.readline()

tok = word_tokenize(sample)
stopwords = nltk.corpus.stopwords.words('english')
newStopWords = [',', '.', ':', ';', '?', 'And', 'I', '!', '``', '\'s', '-', '—']
stopwords.extend(newStopWords)

filtered_sentence = []
for w in tok:
    if w not in stopwords:
        filtered_sentence.append(w)

mytext = nltk.Text(filtered_sentence)
filter_dist = nltk.FreqDist(filtered_sentence)
print(filter_dist.most_common(50))

# lexical diversity (number of distinct words / total number of words)
def lexical_diversity(text):
    return len(set(text)) / len(text)
__author__ = 'prashantravi'
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import TreebankWordTokenizer
from nltk.tag.stanford import StanfordPOSTagger
from nltk.corpus import stopwords
from readData import DataModel
from itertools import chain, imap
from sentiWord import get_scores

# the NLTK corpus name must be lowercase 'english'
stopwords = stopwords.words("english")
stopwords.extend(['#', ',', '+', '.'])
punctuation = ".,:;!?\""

def transformTweetData(tweet):
    content = unicode(tweet.sentence.lower(), errors='ignore')
    words = content.strip().split()
    tokenizer = TreebankWordTokenizer()
    extra_features = []
    content = " ".join(words + extra_features)
    tokens = tokenizer.tokenize(content)
    tokens = [t for t in tokens if t not in stopwords]
    return tokens

def remove_punctuation(input_string):
    for item in punctuation:
        input_string = input_string.replace(item, '')
    # print input_string
    return input_string

def main():
    sentence = raw_input("What's your sentence? ")
    dataModel = DataModel(None, None, None, None, None, None, sentence.lower())