def main(sheet, stopw):
    tweetlist = sheet_extractor(sheet)
    new_tweetlist = punctuation_splitter(tweetlist)
    tweet_words = just_tweet_words(new_tweetlist)
    clean_tweetlist = clean_tweets(new_tweetlist)
    tweet_stopw = stopwords(tweet_words, stopw)
    tweet_topw = topwords(tweet_stopw)
    tweet_hasht = hashtags(tweet_words)
    tweet_bigram = bigrams(clean_tweetlist)
    tweet_trigram = trigrams(clean_tweetlist)
    t_s = tweet_sent(new_tweetlist)

    # all lists as one so that they can be passed as one argument
    all_list = []
    all_list.append(tweet_topw)
    all_list.append(tweet_hasht)
    all_list.append(tweet_bigram)
    all_list.append(tweet_trigram)
    all_list.append(emojis)

    # apply the feature extractor to the tweets
    featuresets = [(extract_tweet_features(tweet, all_list), sent)
                   for (tweet, sent) in t_s]

    # divide the data into a train_set (90% of the data) and a devtest_set (10% of the data);
    # the test_set is separate
    train_set = featuresets[:int(len(featuresets) * 0.9)]
    devtest_set = featuresets[int(len(featuresets) * 0.9):]

    # train the classifier with the train_set
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print("accuracy: " + str(nltk.classify.accuracy(classifier, devtest_set)))
    classifier.show_most_informative_features(20)
def processAndVader():
    tweetProcessor_vader = TwitterProcessing.TwitterProcessing(
        TweetTokenizer(), stopwords())
    lSentiment_vader = vaderSentimentAnalysis(getTweets(), True, tweetProcessor_vader)
    return lSentiment_vader
def main():
    import os
    folder = "egos/"
    output = "Documentos Finais/"
    for doc in os.listdir(folder):
        print("\n--> " + doc)
        file = folder + doc
        f = open(file, 'r')
        doc_save = output + doc
        outfile = open(doc_save, 'a+')
        lines = f.readlines()
        termos = {}
        for line in lines:
            tweet = stopwords(line)
            if len(tweet) > 0:
                cont = 0
                for termo in tweet:
                    if termo not in termos.keys():
                        termos[termo] = contagem(lines, termo)
                    # Save terms that appear more than once
                    if termos[termo] > 1:
                        outfile.write(termo + " ")
                        cont += 1
                if cont > 0:
                    outfile.write("\n")
        f.close()
        outfile.close()
def cleaner(text):
    text = remove_punctuations(text)
    text = small(text)
    text = num_remove(text)
    text = stopwords(text)
    text = lemmatize(text)
    return text
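The cleaner() pipeline above chains five helpers that are not shown. A minimal sketch of what they might look like, using NLTK stop words and WordNetLemmatizer; the implementations below are illustrative assumptions, not the original author's code:

import re
import string
from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem import WordNetLemmatizer

_stop_words = set(nltk_stopwords.words('english'))  # assumed: English stop word list
_lemmatizer = WordNetLemmatizer()

def remove_punctuations(text):
    # strip punctuation characters
    return text.translate(str.maketrans('', '', string.punctuation))

def small(text):
    # lowercase the whole string
    return text.lower()

def num_remove(text):
    # drop digits
    return re.sub(r'\d+', '', text)

def stopwords(text):
    # remove common English stop words
    return ' '.join(w for w in text.split() if w not in _stop_words)

def lemmatize(text):
    # reduce each word to its lemma
    return ' '.join(_lemmatizer.lemmatize(w) for w in text.split())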
def cluster_texts(texts, filenames, clusters):
    # TF-IDF: transform texts to tf-idf coordinates
    stop_words = lemm(stopwords())
    vectorizer = TfidfVectorizer(tokenizer=process_text,
                                 stop_words=stop_words,
                                 max_df=0.5,
                                 min_df=0.03,
                                 lowercase=True)
    vectorizer = vectorizer.fit(texts)
    tfidf_model = vectorizer.transform(texts)
    first_vector_tfidfvectorizer = tfidf_model[0]
    tfidf_df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(),
                            index=vectorizer.get_feature_names(),
                            columns=["tfidf"])
    tfidf_df = tfidf_df.sort_values(by=["tfidf"], ascending=False)
    print('#####################################')
    print('TF-IDF - collective frequencies:')
    print(tfidf_df.to_string())
    print('#####################################')
    print('TF-IDF - frequencies per doc:')
    print(tfidf_model)

    # COUNT: raw term counts with the same tokenizer and stop words
    count_vectorizer = CountVectorizer(tokenizer=process_text,
                                       stop_words=stop_words,
                                       max_df=0.5,
                                       min_df=0.03,
                                       lowercase=True)
    count_vectorizer = count_vectorizer.fit(texts)
    count = count_vectorizer.transform(texts)
    first_vector_countvectorizer = count[0]
    count_df = pd.DataFrame(first_vector_countvectorizer.T.todense(),
                            index=count_vectorizer.get_feature_names())
    count_df = count_df.sort_values(by=0, ascending=False)
    print('#####################################')
    print('Term count:')
    print(count_df.to_string())

    # Cluster texts using K-Means
    km_model = KMeans(n_clusters=clusters)
    km_model.fit(tfidf_model)
    clustering = collections.defaultdict(list)
    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(idx)
    print('#####################################')
    print('Clusters:')
    print(dict(clustering))
def sc_transform(df_col):
    # clean text in the column
    df_col = df_col.apply(lambda x: stopwords(remove_punctuation(x)))
    # get frequencies and filter out uncommon words
    freq_d = pd.Series(' '.join(df_col).split()).value_counts()
    rare_words = freq_d[freq_d <= 4]
    freq_d = freq_d[freq_d > 4]
    # compute a BoW vector and take its norm to get a single value per row
    df_col = df_col.apply(
        lambda x: [word for word in x.split() if word not in rare_words])
    dict_d = corpora.Dictionary(df_col)
    df_col = df_col.apply(lambda desc: LA.norm(dict_d.doc2bow(desc)))
    return df_col
def preprocess(fdict):
    result = {}
    for key in fdict.keys():
        docs = []
        for doc in fdict[key]:
            clean = cleanlines(doc)
            tokens = tokener(clean)
            lemmas = lemmatize(tokens)
            syns = sy(lemmas)
            stems = stem(syns)
            # remove stop words and keep the processed document
            filtered = stopwords(stems)
            docs.append(filtered)
        result.setdefault(key, docs)
    return result
def TF_IDF(df, colname, min_df, ngram_range_tuple):
    from sklearn.feature_extraction.text import TfidfVectorizer
    a = list(df[colname])
    vectorizer = TfidfVectorizer(min_df=min_df,
                                 max_features=10000,
                                 tokenizer=tokenizer_tf_idf,
                                 stop_words=stopwords(),
                                 ngram_range=ngram_range_tuple)
    vz = vectorizer.fit_transform(a)
    tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
    tfidf = pd.DataFrame(columns=['tfidf']).from_dict(tfidf, orient='index')
    tfidf.columns = ['tfidf']
    return tfidf
def final_tokenzied_list(df, colname, ngrams):
    stop_words = stopwords()
    paragraph_list = df[colname].tolist()
    df['tokens'] = df[colname].map(tokenizer_tf_idf)
    n_grams = 2
    while n_grams < ngrams + 1:
        col_name = str(n_grams) + '_' + 'grams'
        n_gram_df = text_to_ngrams(paragraph_list, stop_words, n_grams)
        df = pd.merge(df, n_gram_df, on='doct_no', how='left')
        df['tokens'] = np.where(df[col_name].isnull(),
                                df['tokens'],
                                df['tokens'] + df[col_name])
        df.drop([col_name], axis=1, inplace=True)
        n_grams = n_grams + 1
    return df
def __call__(self, docs):
    terms_field = self.terms_field
    get_lang = self.get_lang
    stopwords = self._stopwords
    for doc in docs:
        lang = get_lang(doc)
        lang_stopwords = stopwords(lang)
        tfield = doc[terms_field]
        to_del = []
        for term in tfield:
            if term in lang_stopwords:
                to_del.append(term)
        self._logger.debug("Remove %s stopwords" % (len(to_del)))
        for key in to_del:
            del tfield[key]
    return docs
def preprocess(phrase):
    cleaned_phrase = re.sub(re.compile("<.*?>"), "", phrase)  # remove HTML tags
    cleaned_phrase = re.sub("[^A-Za-z0-9]+", " ", cleaned_phrase)  # keep only alphanumeric words
    cleaned_phrase = cleaned_phrase.lower()
    tokens = nltk.word_tokenize(cleaned_phrase)
    stop_words = stopwords("english")  # stop words to remove
    filtered_phrase = [word for word in tokens if word not in stop_words]  # remove stop words
    lemmatizer = WordNetLemmatizer()
    lemmed_phrase = [lemmatizer.lemmatize(word) for word in filtered_phrase]
    phrase = " ".join(lemmed_phrase)
    return phrase
def preprocess_text(sentence):
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords('english')
    updated_text = ''
    text_data = sentence
    text_data = re.sub(r"[^\w\s]", '', text_data)  # remove punctuation
    text_tokens = nltk.word_tokenize(text_data)  # tokenization
    updated_text_tokens = [
        token for token in text_tokens if token not in stop_words
    ]  # stop word removal
    for t in updated_text_tokens:
        updated_text = updated_text + ' ' + str(lemmatizer.lemmatize(t)).lower()
    return updated_text
def split_and_process_lines(string,
                            correct_spelling=False,
                            stemming=False,
                            remove_stopwords=False,
                            expand_contractions=False):
    if string is None:
        return None
    lines = []
    for line in re.split("[.?!]", string):
        line = remove_punctuation(line)
        line = remove_redundant_spaces(line)
        if expand_contractions:
            line = contractions(line)
        line_array = line.split(" ")
        if correct_spelling:
            line_array = autocorrect(line_array)
        if stemming:
            line_array = stem(line_array)
        if remove_stopwords:
            line_array = stopwords(line_array)
        if len(line_array) > 1:
            lines.append(" ".join(line_array).lower())
    return lines
def ent_getword(text, rel):
    word_ent = {}
    word_list = set()
    # build the vocabulary from all documents, without stop words
    for i in text:
        tokens = nltk.word_tokenize(i)
        filtered = stopwords(tokens)
        for j in filtered:
            if j not in word_list:
                word_list.add(j)
    # score each word by the entropy of its presence across documents
    for word in word_list:
        X = []
        for text_i in text:
            if word in text_i:
                X.append(1)
            else:
                X.append(0)
        word_ent[word] = calc_ent_grap(X, rel)
    # keep the 20 lowest-scoring words
    word_ent = sorted(word_ent.items(), key=lambda d: d[1])[:20]
    word_list = {}
    for word in word_ent:
        for i in wordnet(word[0]):  # WordNet expansion
            if i not in word_list:
                word_list[i] = word[1]
    return word_list
    return


# print(dictionary)
Train = "split.train"
Test = "split.test"

print("Answer to Question 1:")
features_matrix, labels_train = extract_features(Train)
print("")
print("Answer to Question 2:")
features = topwords(Train)
print("")
print("Answer to Question 3:")
features_matrix = stopwords(Train, 10)
print("Words after excluding stop words have been saved to nbStopWords.txt")
print("")
print("Answer to Question 5:")
features_matrix = programtopwordsLogOdds(Train)

model = MultinomialNB()
# model.fit(features_matrix, labels_train)

# def extract_features(root_dir):
#     with open('C:/Users/cool dude/PycharmProjects/hello/hw9 (1)/split.test', 'r') as fin:
#         mylist = [line.rstrip('\n') for line in fin]
#     # features_matrix = np.zeros((len(mylist), 3000))
#     train_labels = np.zeros(len(mylist))
#     print(train_labels)
#
# Author: Maya Shaked
# Created: 02/15/2018
#-------------------------------------------------------------------------------
import pandas as pd
import sqlite3
import aggregate_numerical_data as agg_num
from nltk.corpus import stopwords
import dyadic_partitioning as dy

EVALS_PART_1 = 'evals_json_version_5_part1'
EVALS_PART_2 = 'evals_json_version_5_part2'
SQL_DB_PATH = 'reevaluations.db'
STOPWORDS = stopwords.words("english") + ['class', 'classes', 'professor', \
    'professors', 'course', 'courses', 'ta', 'tas']

def pre_process(sql_db_path, evals_part_1, evals_part_2):
    '''
    Takes the SQL database path as well as the two json files containing all
    the evaluations, adds the numerical scores and sentiment analysis scores
    from aggregate_numerical_scores, adds the dyadic partitioning results,
    and cleans the dataframe.

    - sql_db_path is a string
    - evals_part_1 is a string
    - evals_part_2 is a string

    Returns a database object and a pandas dataframe
    '''
length = len(book.text1)
actorpot = [book.text1[length//2 + indexes[i]] for i in range(len(indexes))]
actorpot
motspertinents(book.text2, 30)
motspertinents(book.text2, 100)
runfile('C:/Users/rapha/.spyder-py3/temp.py', wdir='C:/Users/rapha/.spyder-py3')
testX = createtestX(book.text2)
predictions2 = DTC.predict(testX)
length = len(book.text2)
indexes = [i for i in range(len(predictions2)) if predictions2[i] == 1]
actorpot = [book.text1[length//2 + indexes[i]] for i in range(len(indexes))]
actorpot = [book.text2[length//2 + indexes[i]] for i in range(len(indexes))]
actorpot
'Edward' in actorpot
stopwords()
stopwords('english')
stopwords('en')
runfile('C:/Users/rapha/.spyder-py3/temp.py', wdir='C:/Users/rapha/.spyder-py3')
actors(book.text1)
runfile('C:/Users/rapha/.spyder-py3/temp.py', wdir='C:/Users/rapha/.spyder-py3')
actors(book.text1)
actors(book.text2)
actors(book.text3)
runfile('C:/Users/rapha/.spyder-py3/temp.py', wdir='C:/Users/rapha/.spyder-py3')
actors(book.text3)
actors(book.text4)
runfile('C:/Users/rapha/.spyder-py3/temp.py', wdir='C:/Users/rapha/.spyder-py3')
scored[1]
scored[(',', 'and')]
fo.write(l.name)
line = reader.readline()
fo.close()

file = open("saibaba.csv")
reader = csv.reader(file)
sentencetokenize(reader)

fp1 = open('sentence.txt', 'r')
line = fp1.readline()
while line:
    line = fp1.readline()

fp2 = open('clean.txt', 'r')
shorthanddata(fp2)
fp3 = open('expansion.txt', 'r')
wordtokenize(fp3)
fp4 = open('words.txt', 'r')
stopwords(fp4)
fp5 = open('wordswithoutstopwords.txt', 'r')
stemy(fp5)
fp6 = open("stemming.txt", 'r')
stemreplacer(fp6)
fp7 = open("stemreplacer.txt", "a+")
antonymfilter(fp7)
for i in keyword: word = stopword.remove(i) if word != '': hasil_keyword.append(word) hasil_label = [] for i in abstrak: word = stopword.remove(i) if word != '': hasil_label.append(word) return (hasil_judul, hasil_abstrak, hasil_keyword, hasil_label) hasil_stopwords = [] for data in range(len(hasil_tokenize)): hasil = stopwords(hasil_tokenize[data][0], hasil_tokenize[data][1], hasil_tokenize[data][2], hasil_tokenize[data][3]) hasil_stopwords.append(hasil) print(hasil_stopwords) factory = StemmerFactory() stemmer = factory.create_stemmer() def stemming(judul, abstrak, keyword, label): hasil_judul = [] for i in judul: word = stemmer.stem(i) if word != '': hasil_judul.append(word) hasil_abstrak = []
3) 'maketrans' is used to translate characters according to a predefined mapping.
4) string.punctuation means that we are targeting the punctuation marks present inside the sentence.
5) ('a', 'b', 'c') maps as ('characters that need to be replaced', 'characters to put in their place', 'characters to be removed')
'''
cleaned_text = lower_case.translate(str.maketrans('', '', string.punctuation))

# Tokenisation: splitting the words in a sentence and storing these words in a list.
tokenised_words = word_tokenize(cleaned_text, "english")

# Removing the stop words, which are redundant in a sentence.
# Creating an empty list for storing the words kept after removing the stop words.
new_words = []

# Traversing through the words obtained after tokenisation.
for word in tokenised_words:
    if word not in stopwords('english'):
        # Appending the word to the list.
        new_words.append(word)

# The list which will contain all the emotion attributes from the read.txt file and the emotions file
emotion_list = []

# Opening the emotions file
with open('emotions.txt', 'r') as file:
    for line in file:
        # Removing newlines, commas and quotes from the emotions file.
        clean_line = line.replace('\n', '').replace(',', '').replace("'", '').strip()
        # Assigning the word before the colon to word and the text after it to emotion.
        word, emotion = clean_line.split(':')
for str in text.split(): if str.startswith('#'): return str.split() >>> term_split("hey yoo me #dayofthedream") ['#dayofthedream'] >>> >>> >>> >>> from nltk.corpus import stopwords >>> stopword=stopwords.words('english') >>> def stopwords(text): return [i for i in text.split() if i in stopword] >>> stopwords("this is a foo bar sentence and example sentence for us to learn this subject") ['this', 'is', 'a', 'and', 'for', 'to', 'this'] >>> >>> >>> def stopwords(text): if [i for i in text.split() if i in stopword]: return True else: return False >>> stopwords("foo bar sentence example sentence") False >>> >>> >>>
def filter_stopwords(tokens):
    global stopwordp
    sw = stopwords()
    # drop tokens found in the stop word list
    w1 = [x for x in tokens if x not in sw]
    # keep only the remaining tokens that match the stopwordp pattern
    return [y for y in w1 if stopwordp.match(y) is not None]
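filter_stopwords() relies on a global compiled pattern stopwordp and a stopwords() helper that are not shown; a hedged sketch of one possible setup (the pattern and helper below are assumptions for illustration, not the original code):

import re
from nltk.corpus import stopwords as nltk_stopwords

# assumed: keep only plain lowercase alphabetic tokens
stopwordp = re.compile(r'^[a-z]+$')

def stopwords():
    # assumed: return the English stop word list
    return set(nltk_stopwords.words('english'))

print(filter_stopwords(['the', 'Quick', 'brown', 'fox', '123']))
# with these assumptions -> ['brown', 'fox']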
tweets_target = list(tweets['target'])
tweets_list = list(tweets['text'])

# tweets_list = tweets_list.apply(remove_punctuation)
tweets = [remove_punctuation(x) for x in tweets_list]

sw = stopwords.words('english')

def stopwords(text):
    '''a function which removes English stop words from the given text'''
    text = [word.lower() for word in text.split() if word.lower() not in sw]
    return " ".join(text)

# tweets_list = tweets_list.apply(stopwords)
# apply stop word removal to the punctuation-free tweets so the steps chain
tweets = [stopwords(x) for x in tweets]

# create an object of stemming function
stemmer = SnowballStemmer("english")

def stemming(text):
    '''a function which stems each word in the given text'''
    text = [stemmer.stem(word) for word in text.split()]
    return " ".join(text)

# tweets_list = tweets_list.apply(stemming)
tweets = [stemming(x) for x in tweets]

# tweets.head(10)
import sys, nltk
from collections import defaultdict
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

METHODS = ['simplified', 'best-avg', 'orig', 'leading']
stops = stopwords.words('english')


def getDocuments(paths):
    docs = []
    docs_tokens = []
    for path in paths:
        with open(path, 'r', encoding='utf8') as f:
            s = f.read()
        sents = sent_tokenize(s)
        tokens = [word_tokenize(sent) for sent in sents]
        docs.append(sents)
        docs_tokens.append(tokens)
    return docs, docs_tokens


def sumBasic(documents, docs_tokens, method='orig', length=100):
    # Get probs
    probs = defaultdict(int)
    sentprobs = defaultdict(int)
    result = ""
    if method == 'leading':
        sentences = [sen for doc in documents for sen in doc]
I believe that India got its first vision of this in 1857, when we started the war of Independence. It is this freedom that we must protect and nurture and build on. If we are not free, no one will respect us.
My second vision for India’s DEVELOPMENT. For fifty years we have been a developing nation. It is time we see ourselves as a developed nation. We are among top five nations of the world in terms of GDP. We have 10 per cent growth rate in most areas. Our poverty levels are falling. Our achievements are being globally recognised today. Yet we lack the self-confidence to see ourselves as a developed nation, self-reliant and self-assured. Isn’t this incorrect"""

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

ps = PorterStemmer()
Wordnet = WordNetLemmatizer()
sentences = nltk.sent_tokenize(paragraph)
corpus = []
for i in range(len(sentences)):
    review = re.sub('[^a-zA-Z]', ' ', sentences[i])
    review = review.lower()
    review = review.split()
    review = [
        ps.stem(word) for word in review
        if word not in set(stopwords.words('english'))
    ]
    review = ' '.join(review)
    corpus.append(review)

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
# Cursor setting
cursor = tweepy.Cursor(api.search,
                       q=keyword,
                       since=dsince,
                       until=duntil,
                       tweet_mode='extended',
                       count=number,
                       lang='en',
                       geocode=location,
                       include_entities=True)

for i, tweet in enumerate(cursor.items(3000)):
    if i == 0:
        # write the CSV header once
        wfile.write("{},{},{},{},{}".format('no', 'time', 'favorite_count',
                                            'tweet_retweet_count', 'tweet_text' + '\n'))
    try:
        # clean the retweet text: strip the RT prefix, links and entities, stop words and emojis
        text = emoji(stopwords(strip_all_entities(strip_links(
            removeRT(tweet.retweeted_status.full_text.lower().replace('\n', ''))))))
        wfile.write("{},{},{},{},{}".format(i, tweet.created_at, tweet.favorite_count,
                                            tweet.retweet_count, text) + '\n')
    except:
        pass

wfile.close()
import nltk

# Tokenization
from nltk.tokenize import word_tokenize, sent_tokenize

text = "Mary had a little lamb. Her fleece was white as snow"
sents = sent_tokenize(text)
print(sents)
words = [word_tokenize(sent) for sent in sents]
print(words)

# Stopwords removal
from nltk.corpus import stopwords
from string import punctuation

customStopWords = set(stopwords.words('english') + list(punctuation))
wordsNotStopWords = [word for word in word_tokenize(text) if word not in customStopWords]
print(customStopWords)

# N-grams
from nltk.collocations import *

bigramMeasures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(wordsNotStopWords)
print(sorted(finder.ngram_fd.items()))

# Stemming
text2 = "Mary closed on closing night when she was in the mood to close"
from nltk.stem.lancaster import LancasterStemmer

st = LancasterStemmer()
stemmedWords = [st.stem(word) for word in word_tokenize(text2)]
stemmedNotStopWords = [word for word in word_tokenize(text2) if word not in customStopWords]
print(stemmedWords)
print(stemmedNotStopWords)
def K_m_cluster(num_clusters, df, text_col_name, top_n_terms): import numpy as np import pandas as pd import bokeh.plotting as bp from bokeh.models import HoverTool, BoxSelectTool from bokeh.plotting import figure, show, output_notebook from sklearn.feature_extraction.text import TfidfVectorizer import warnings warnings.filterwarnings("ignore", category=DeprecationWarning) from sklearn.cluster import MiniBatchKMeans vectorizer = TfidfVectorizer(min_df=0.01, max_features=10000, tokenizer=tokenizer_tf_idf, stop_words=stopwords(), ngram_range=(1, 1)) vz = vectorizer.fit_transform(list(df[text_col_name])) kmeans_model = MiniBatchKMeans(n_clusters=num_clusters, init='k-means++', n_init=1, init_size=1000, batch_size=1000, verbose=False, max_iter=1000) kmeans = kmeans_model.fit(vz) kmeans_clusters = kmeans.predict(vz) kmeans_distances = kmeans.transform(vz) high_impact_cluster_terms = kmeans.cluster_centers_.argsort()[:, ::-1] terms = vectorizer.get_feature_names() from sklearn.manifold import TSNE tsne_model = TSNE(n_components=2, verbose=1, random_state=1) tsne_kmeans = tsne_model.fit_transform(kmeans_distances) kmeans_df = pd.DataFrame(tsne_kmeans, columns=['x', 'y']) kmeans_df['cluster'] = kmeans_clusters kmeans_df['text'] = df[text_col_name] cumpercent = 0 Km_file = [] for i in range(num_clusters): clust_num = i percent_in_doc = 100 * len( kmeans_df[kmeans_df['cluster'] == i]) / len(kmeans_df) hot_terms = '' for j in high_impact_cluster_terms[i, :top_n_terms]: hot_terms += terms[j] + ' | ' Km_file.append( dict(clust_num=clust_num, clust_terms=hot_terms, percent_in_doc=round(percent_in_doc, 2))) Km_file = pd.DataFrame(Km_file) name_csv = 'Km_clusters' + '_' + str(num_clusters) + '.csv' Km_file.to_csv(name_csv) colormap = np.array([ "#6d8dca", "#69de53", "#723bca", "#c3e14c", "#c84dc9", "#68af4e", "#6e6cd5", "#e3be38", "#4e2d7c", "#5fdfa8", "#d34690", "#3f6d31", "#d44427", "#7fcdd8", "#cb4053", "#5e9981", "#803a62", "#9b9e39", "#c88cca", "#e1c37b", "#34223b", "#bdd8a3", "#6e3326", "#cfbdce", "#d07d3c", "#52697d", "#7d6d33", "#d27c88", "#36422b", "#b68f79" ]) kmeans_df['colors'] = colormap[kmeans_clusters] plot_kmeans = bp.figure( plot_width=700, plot_height=600, title="KMeans clustering of the news", tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave", x_axis_type=None, y_axis_type=None, min_border=1) plot_kmeans.scatter(x='x', y='y', color='colors', source=kmeans_df) hover = plot_kmeans.select(dict(type=HoverTool)) hover.tooltips = {"text": "@text", "cluster": "@cluster"} show(plot_kmeans) return kmeans_df
continue subject = entries[0] count = int(entries[1].replace("\n", "")) dict[subject] = count return dict if __name__ == "__main__": tokenizer = RegexpTokenizer(r'\w+') stemmer = SnowballStemmer("english") ## get stop words and normalize initial_stopwords = stopwords() stopwords = list() for s in initial_stopwords: s = normalize_string(s) stopwords.append(s) print("Loading index") mention_dict = load_index("../data/surface_forms_new.txt") subject_predicates_dict = [ ] # load_subject_predicates("data/SimpleQuestions_v2/freebase-FB2M.txt") subject_triple_counts = [ ] #load_subject_triple_counts("data/subject_triple_counts.txt") dataset_names = ["test"] max_ngram_size = 10 exclude_small_ngrams = True
""" Module which tokenizes the given data, i.e. filters out stopwords(prepositions, conjunctions and so on), punctuation marks, and gives us sentence and word arrays""" import string from nltk.tokenize import sent_tokenize, word_tokenize from nltk.corpus import stopwords def new_tokenize(input_string): """ Returns a list containing a list of words and a list of sentences given by nltk tokenize function and cleans up the punctuation marks and common stop words like 'or' and 'and' """ stop_words = set(stopwords.words('english') + list(string.punctuation)) unfiltered_words = word_tokenize(input_string) words = [word for word in unfiltered_words if word not in stop_words] sentences = sent_tokenize(input_string) return [words, sentences]