def find_politician_names(debate_soup_dict):
    for debate in debate_soup_dict.keys():
        raw = debate_soup_dict[debate]["soup"].get_text()
        raw = raw.replace("\\", "")
        raw = raw.replace(".", ". ")
        raw = raw.replace("?", "? ")
        raw = raw.replace("!", "! ")
        raw = raw.replace("  ", " ")  # collapse double spaces introduced above
        raw = raw.replace("[applause]", "")
        raw = raw.replace("[crosstalk]", "")
        for marker in ("[laughter]", "[Laughter]", "(LAUGHTER)"):
            raw = raw.replace(marker, "")
        tokens = nltk.word_tokenize(raw)
        speech = nltk.Text(tokens)
        sent_detector = nltk.data.load("tokenizers/punkt/english.pickle")
        sents = sent_detector.tokenize(raw.strip())
        # find candidate names: the most commonly repeated first words of sentences
        # (before a colon), excluding common words
        colon_names = []
        dumbWords = stopwords.words("english")  # currently unused
        for sent in sents:
            if ":" in sent:
                possible_name = sent.split(":")[0]
                if len(possible_name) < 25:
                    colon_names.append(possible_name)
        fdist1 = FreqDist(colon_names)
        fdist2 = FreqDist(sents)
        mostFreq = fdist1.most_common(1)[0][1]
        if mostFreq > 20:
            debate_soup_dict[debate]["names"] = fdist1.most_common(10)
        else:
            debate_soup_dict[debate]["names"] = fdist2.most_common(10)
def follow_description(api, friend_list, screen_name):
    all_tags = []
    for friend in friend_list:
        username = friend[0]
        frequency = friend[1]
        print(username)
        try:
            user = api.get_user(screen_name=username)
            for list_obj in user.lists_memberships(screen_name=username, count=50):
                for w in list_obj.name.lower().split(" "):
                    all_tags.append(w)
        except TweepError as err:
            print(err.reason)
            break
    the_list_name = strip_words(all_tags)
    the_list_dist = FreqDist(the_list_name)
    print(the_list_dist.most_common(20))
    return the_list_dist.most_common(20)
def get_hosts(year):
    '''Hosts is a list of one or more strings. Do NOT change the name of this
    function or what it returns.'''
    file_name = 'gg%s.json' % year
    with open(file_name, 'r') as data:
        db = json.load(data)
    hosts = []
    pairs = []
    for f in db:
        e = f['text']
        if 'and' in e.lower():
            for proper in strip_proper_pairs(normalize_str(e).split()):
                pair = proper.split('and')
                if len(pair) == 2 and pair[0] != ' ' and pair[1] != ' ':
                    pairs.append((pair[0].lower().replace('\'', '\"').strip(' '),
                                  pair[1].lower().replace('\'', '\"').strip(' ')))
    pairs_freq = FreqDist(pairs)
    top_pairs = pairs_freq.most_common(10)
    # If the most common "pair" starts with a single word, fall back to the second most common pair.
    if len(top_pairs[0][0][0].split(' ')) < 2:
        hosts.append(top_pairs[1][0][0])
        hosts.append(top_pairs[1][0][1])
    else:
        hosts.append(top_pairs[0][0][0])
        hosts.append(top_pairs[0][0][1])
    return hosts
def get_top_followings(screen_name):
    # authorize twitter, initialize tweepy
    api = TwitterGrabber.initialise_api(0)
    print(api.get_status)

    # initialize a list to hold all the tweepy Tweets
    all_tweets = []

    # make initial request for most recent tweets (200 is the maximum allowed count)
    new_tweets = api.user_timeline(screen_name=screen_name, count=200)

    # save most recent tweets
    all_tweets.extend(new_tweets)

    # save the id of the oldest tweet less one
    oldest = all_tweets[-1].id - 1

    # keep grabbing tweets until there are no tweets left to grab
    while len(new_tweets) > 0:
        # all subsequent requests use the max_id param to prevent duplicates
        new_tweets = api.user_timeline(screen_name=screen_name, count=200, max_id=oldest)
        # save most recent tweets
        all_tweets.extend(new_tweets)
        # update the id of the oldest tweet less one
        oldest = all_tweets[-1].id - 1
        print("...%s tweets downloaded so far" % (len(all_tweets)))

    tweet_text = [tweet.text for tweet in all_tweets]

    content = []
    retweets = []
    for tweet in tweet_text:
        words = word_tokenize(tweet, 'english')
        content.extend(strip_words(words))
        if words[0] == 'RT':
            retweets.append(words[2])

    tweet_distribution = FreqDist(retweets)
    print(tweet_distribution.most_common(20))
    a = follow_description(api, tweet_distribution.most_common(20), screen_name)
    return a
def get_monograms_freqdist(tokens):
    freq_dist = FreqDist(tokens)
    print 'Returned monograms'
    print freq_dist.most_common(10)
    temp_list = freq_dist.most_common(100)
    temp_dict = dict((item[0], item[1]) for item in temp_list)
    ordered_freq_dist = OrderedDict(sorted(temp_dict.items(), key=lambda x: x[1], reverse=True))
    return ordered_freq_dist
def get_trigrams_freqdist(tokens):
    tri_grams = trigrams(tokens)
    print 'Returned trigrams'
    freq_dist_trigrams = FreqDist(tri_grams)
    print freq_dist_trigrams.most_common(10)
    freq_dist_trigrams_new = dict()
    for item in freq_dist_trigrams.items():
        temp_str = item[0]
        temp_key = temp_str[0] + ' ' + temp_str[1] + ' ' + temp_str[2]
        freq_dist_trigrams_new[temp_key] = item[1]
    freq_dist_trigrams_new = OrderedDict(sorted(freq_dist_trigrams_new.items(), key=lambda x: x[1], reverse=True))
    return freq_dist_trigrams_new
def create_word_freq(db):
    db = getattr(db, "Posts")
    # client.command("CREATE CLASS concepted EXTENDS E")
    client.command("DELETE EDGE concepted")
    data = db.find().batch_size(50)
    concept = client.command("SELECT name FROM concept")
    c = [c.name for c in concept]
    for d in data:
        if 'Body' not in d:
            display = ''
        else:
            display = cleanhtml(d['Body'].replace('\n', ' ').replace('\r', '').replace('\\', ''))
        tokens = nltk.word_tokenize(display)
        fdist = FreqDist(tokens)
        for k in fdist.most_common():
            if k[0].lower() in c:
                try:
                    client.command(
                        "CREATE EDGE concepted FROM (SELECT FROM concept WHERE name = '{0}') "
                        "TO (SELECT FROM Content WHERE PostId = {1}) SET strength = {2}".format(
                            k[0].lower(), d['_id'], k[1]))
                except Exception:
                    continue
def get_list_dists_for(member_id):
    print(member_id, file=sys.stderr)
    cursor.execute(get_listinfo_for_member, [member_id])
    tstrout = ''
    rows = cursor.fetchall()
    for row in rows:
        c_line = str(row)
        c_line = ''.join(filter(lambda x: x in string.printable, c_line))
        if len(c_line):
            parsed_text = f.parse(c_line, True)
            strout = ''
            if parsed_text is not None:
                # the original iterated over an undefined name "s"; parsed_text is assumed here
                for item in parsed_text.items():
                    for word in item[1]:
                        strout = strout + ' ' + word
            if len(strout):
                tstrout = tstrout + ' ' + strout
    words = nltk.tokenize.word_tokenize(tstrout)
    the_list_dist = FreqDist(words)
    return str(member_id) + " on " + str(len(rows)) + " lists: " + str(the_list_dist.most_common(20))
def transmit_vocabulary(t_token, t_lang):
    languages = ['danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian',
                 'italian', 'norwegian', 'portuguese', 'russian', 'spanish', 'swedish', 'turkish']
    voc_stopwords = set()
    if t_lang in languages:
        voc_stopwords = set(stopwords.words(t_lang))
    i_f = codecs.open('csv/' + t_token + '.csv', 'r', 'utf-8')
    lines = i_f.readlines()
    all_tweets = []
    corpus_size = 0
    for line in lines:
        row = line.split('\t')
        words = word_tokenize(row[1])
        all_tweets.extend([w.lower() for w in words])
        corpus_size += 1
    freq_distribution = FreqDist(all_tweets)
    cats_vocabulary_elements = []
    for word, frequency in freq_distribution.most_common(1000):
        if word not in voc_stopwords:
            cats_vocabulary_elements.append('["' + word + '", ' + str(frequency) + ']')
    cats_vocabulary = '[' + ','.join(cats_vocabulary_elements) + ']'
    print(cats_vocabulary)
    result_data = {'token': t_token, 'result': cats_vocabulary}
    json_data = json.dumps(result_data)
    results_request = urllib2.Request('http://mediamining.univ-lyon2.fr/cats/module/resultFile')
    results_request.add_header('Content-Type', 'application/json')
    results_request.data = json_data.encode('utf-8')
    urllib2.urlopen(results_request)
    print('Transmitted vocabulary for token ' + t_token)
    os.remove('csv/' + t_token + '.csv')
def analyzeTitles():
    fulltitles = []
    titles = []
    with open('../top100clean.csv', 'rb') as bookfile:
        reader = csv.reader(bookfile)
        for row in reader:
            if "..." in row[0]:
                row[0] = " ".join(row[0].split(" ")[:-1])
            words = nltk.word_tokenize(row[0])
            for w in words:
                if w.isalpha() and w.lower() not in ['the', 'a']:
                    titles.append(w.lower())
            fulltitles.append(row[0])
    titleset = nltk.Text(titles)
    wordsintitle = [len(f.split(" ")) for f in fulltitles]
    wit_fd = FreqDist(wordsintitle)
    print "\nw.i.t.\tfreq"
    print "--------------------"
    for numwords, times in wit_fd.iteritems():
        print str(numwords) + "\t" + str(times)
    print "\n"
    print "\nword\t\tfreq"
    print "--------------------"
    fd = FreqDist(titleset)
    common_words = fd.most_common(25)
    for k, v in common_words:
        print str(k) + "\t\t" + str(v)
def cleaner(filename):
    textfile = open(os.path.join(app.config['UPLOAD_FOLDER'], filename), 'r')
    text = []
    all_dates = []
    complete_text = []
    words_list = []
    nodes = []
    for line in textfile:
        datetime, chat = line.split('-')
        date, time = datetime.split(',')
        loc = chat.find(':')
        user, text = chat[:loc], chat[loc + 2:]
        text = text.replace("\n", '')
        words = text.split(' ')
        for i in words:
            words_list.append(i)
        complete_text.append(text)
        nodes.append(user)
        all_dates.append(date)
    fdist = FreqDist(words_list)
    f1 = fdist.most_common(100)
    create_csv('wordcloud.csv', f1)
    textfile.close()
def find_names(self):
    """creates a frequency distribution of the most common names in the texts"""
    names_list = LIST_OF_NAMES
    name_tokens = [w for w in self.tokens if w in names_list]
    fd = FreqDist(name_tokens)
    return fd.most_common(50)
def setUpOwnSubjectStopWords():
    for topic in topics_table_noun_only_title:
        # only keep tokens longer than a specified length
        # (might want to look into the numeric part)
        all_description = [ds for ds in topics_table_noun_only_description[topic] if len(ds) > 5]
        all_topics = [t for t in topics_table_noun_only_title[topic] if len(t) > 5]
        fdist_description = FreqDist(all_description)
        fdist_topics = FreqDist(all_topics)
        ten_most_common_descr = fdist_description.most_common(10)
        ten_most_common_topic = fdist_topics.most_common(10)
        built_topic_stop_words[topic] = [word for word, freq in ten_most_common_descr]
        built_topic_stop_words[topic].extend([word for word, freq in ten_most_common_topic])
    # here we set up the top 5-10 words (we need to look into the data more to find
    # a good numerical cut-off, but for simplicity's sake we pick 5 for now;
    # let's see how our accuracy changes when we change the most frequent words)
    for topic in built_topic_stop_words:
        print built_topic_stop_words[topic]
        print "\n"
def getTopNFreqWords(textArr, N):
    fdist = FreqDist(textArr)
    topWordsWithFreq = fdist.most_common(N)
    topWords = []
    for word in topWordsWithFreq:
        topWords.append(word[0])
    return topWords
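# A minimal usage sketch for getTopNFreqWords above (not from the original source):
# it assumes nltk is installed and the function is in scope; the token list and N
# are illustrative only.
sample_tokens = "the cat sat on the mat and the cat slept".split()
print(getTopNFreqWords(sample_tokens, 2))  # -> ['the', 'cat']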
def generate_ngrams_profile(self, text, profile_size, min_size=2, max_size=3):
    """
    Reads incoming text, generates all possible N-grams with sizes ranging
    between min_size and max_size, and counts the occurrences of all N-grams.

    Parameters
    ----------
    text : unicode
    profile_size : int
    min_size : int, optional (default=2)
    max_size : int, optional (default=3)

    Returns
    -------
    ngram_profile : list of (ngram, count) tuples, the profile_size most common N-grams
    """
    raw_ngrams = []
    text = self.sanitize_text(text)
    for n in range(min_size, max_size + 1):
        for ngram in ngrams(text, n):
            raw_ngrams.append(''.join(unicode(i) for i in ngram))
    fdist = FreqDist(raw_ngrams)
    ngram_profile = fdist.most_common(n=profile_size)
    return ngram_profile
def main():
    for file in glob.glob('./*/*/*.json'):
        data = loadFile(file)
        processMessages(data)
    fdist = FreqDist(all_tokens)
    # Output top 50 words
    for word, frequency in fdist.most_common(50):
        print ('%s;%d' % (word, frequency)).encode('utf-8')
def __commonWords(self, pos, number=100):
    """ Find common words in the text. """
    from nltk import FreqDist
    vocab = FreqDist(pos)
    common = [word[0] for (word, _) in vocab.most_common(number)
              if word[1] in ('NN', 'NNS', 'NNP', 'NNPS')]
    return common
def freq():
    movies = Movie.query.all()
    all_string = ''
    for movie in movies:
        all_string += movie.stemmed
    all_list = all_string.split('/')
    fdist = FreqDist([w for w in all_list if len(w) > 9])
    common_l = fdist.most_common(300)
    return render_template('freq.html', commons=common_l)
def populate_comments(self, face_post):
    comments_cleaned = []
    for comment in face_post.get_comments():
        tokens = nltk.word_tokenize(comment)
        base_cleaned = [w for w in tokens if w not in self.stopwords and len(w) > 1]
        comments_cleaned.extend(base_cleaned)
        self.cleaned.extend(base_cleaned)
    dist = FreqDist(comments_cleaned)
    distribution = dist.most_common(30)
    face_post.set_distribution(distribution)
def main():
    # frequency distribution class
    fd_text1 = FreqDist(book.text1)
    print(str.format('Frequency distribution object: {}', fd_text1))
    print(str.format(
        '50 most common words: {}', fd_text1.most_common(50)
    ))
    fd_text1.plot(50, cumulative=True)
def Estadisticas():
    print('total', len(movie_reviews.fileids()))
    print('categories', movie_reviews.categories())
    print('total positive', len(movie_reviews.fileids('pos')))
    print('total negative', len(movie_reviews.fileids('neg')))
    all_words = [word.lower() for word in movie_reviews.words()]
    all_words_frequency = FreqDist(all_words)
    print('10 most frequent words', all_words_frequency.most_common(10))
    print('number of times the word happy appears', all_words_frequency['happy'])
def __commonWords(self, pos, number=100):
    """ Find common words in the text. """
    from nltk import FreqDist
    vocab = FreqDist(pos)
    common = [(word[0], count) for (word, count) in vocab.most_common(number)
              if word[1] in ('NN', 'NNS', 'NNP', 'NNPS')]
    return common
def top_word_frequency_graph(text, k, label):
    fdist1 = FreqDist([w for w in text])
    word_freq = fdist1.most_common()
    # get frequency statistics
    total = 0
    for w in word_freq:
        total += w[1]
    word_freq_per = {}
    for w in word_freq:
        word_freq_per[w[0]] = (100 * w[1] / total)
def Datos():
    documents = []
    for category in movie_reviews.categories():
        for fileid in movie_reviews.fileids(category):
            documents.append((movie_reviews.words(fileid), category))
    shuffle(documents)
    stopwords_english = stopwords.words('english')
    all_words = [word.lower() for word in movie_reviews.words()]
    all_words_clean = []
    for word in all_words:
        if word not in stopwords_english and word not in string.punctuation:
            all_words_clean.append(word)
    all_words_frequency = FreqDist(all_words_clean)
    print(all_words_frequency.most_common(10))
    most_common_words = all_words_frequency.most_common(2000)
    print(most_common_words[:10])
    word_features = [item[0] for item in most_common_words]
    return documents, word_features
def get_most_common_words(cls, textual_data, n):
    """
    Extract the most common words from the submitted textual information.

    :param textual_data: Textual information
    :param n: The number of words to extract from the input.
    :return: The most common words from the input.
    :rtype: list
    """
    words = word_tokenize(textual_data.lower())
    frequency_distribution = FreqDist(words)
    most_common = frequency_distribution.most_common(n)
    return most_common
def __init__(self, normaliser, conllu, direction='ltr', cutoff=0.9, maximise_information=True):
    check_context = FreqDist(normaliser.print_full())
    size = int(len(check_context) * cutoff)
    check_context = [i[0] for i in check_context.most_common(size)]
    if direction == 'ltr':
        self.check_context = {i.split(':')[1]: i.split(':')[0] for i in check_context}
    else:
        self.check_context = {i.split(':')[0]: i.split(':')[1] for i in check_context}
    self.check_free = set(normaliser.print_stream())
    self.conllu = conllu
    self.maximise_information = maximise_information
def classify(message):
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')
    tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []
    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    all_pos_words = get_all_words(positive_cleaned_tokens_list)
    freq_dist_pos = FreqDist(all_pos_words)
    print(freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive") for tweet_dict in positive_tokens_for_model]
    negative_dataset = [(tweet_dict, "Negative") for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)
    train_data = dataset[:7000]
    test_data = dataset[7000:]

    classifier = NaiveBayesClassifier.train(train_data)
    # print("Accuracy is:", classify.accuracy(classifier, test_data))
    # print(classifier.show_most_informative_features(10))

    custom_tweet = message
    custom_tokens = remove_noise(word_tokenize(custom_tweet))
    return classifier.classify(dict([token, True] for token in custom_tokens))
def printTop(n, min_len):
    out_file = "output_min_" + str(n) + "_len_gt_" + str(min_len) + ".csv"
    fdist1 = FreqDist(tokens)
    most_common = fdist1.most_common(n)
    out_fh = open(out_file, "w")
    for tup in most_common:
        if len(tup[0]) >= min_len:
            line = "\"" + tup[0] + "\"," + str(tup[1]) + "\n"
            out_fh.write(line)
    out_fh.close()
def most_hashtag(df_tweets):
    import pandas as pd
    import numpy as np
    from mlxtend.frequent_patterns import apriori, association_rules
    from mlxtend.preprocessing import TransactionEncoder

    data = df_tweets.hashtags.apply(lambda x: np.nan if len(x) <= 0 else x)
    all_hashtags = list(data.dropna())

    hashtags = []
    for i in all_hashtags:
        for j in i:
            hashtags.append(j)

    hash_str = ''
    for i in hashtags:
        hash_str += i + ' '
    hash_str = hash_str.lower()
    hashtags2 = hash_str.split()

    freq = FreqDist(hashtags2)
    hash_most_freq = pd.DataFrame(data=freq.most_common(10), columns=['Hashtag', 'Frequency'])
    list_freq = list(hash_most_freq.Hashtag)
    all_hashtags_lower = [[h.lower() for h in line] for line in all_hashtags]

    def select_hashtag(freq, all_hash):
        select = []
        for list_hash in all_hash:
            for f in freq:
                if len(list_hash) >= 2 and (f in list_hash):
                    select.append(list_hash)
                    break
        return select

    select = select_hashtag(list_freq, all_hashtags_lower)
    te = TransactionEncoder()
    te_ary = te.fit(select).transform(select)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequent_itemsets = apriori(df, min_support=0.2, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
    return rules
def form_word_embeddings_samples(X_train, X_test):
    # creating vocabulary for training
    X_train = [word_tokenize(x.lower()) for x in X_train]
    X_test = [word_tokenize(x.lower()) for x in X_test]
    x_distr = FreqDist(np.concatenate(X_train + X_test))
    x_vocab = x_distr.most_common(min(len(x_distr), 10000))
    x_idx2word = [word[0] for word in x_vocab]
    x_idx2word.insert(0, '<PADDING>')
    x_idx2word.append('<NA>')
    x_word2idx = {word: idx for idx, word in enumerate(x_idx2word)}

    # padding implicitly present, as the index of the padding token is 0
    x_train_seq = np.zeros((len(X_train), 20), dtype=np.int32)

    # using an embedding for samples training data
    for i, da in enumerate(X_train):
        for j, token in enumerate(da):
            # truncate long titles
            if j >= 20:
                break
            # represent each token with the corresponding index
            if token in x_word2idx:
                x_train_seq[i][j] = x_word2idx[token]
            else:
                x_train_seq[i][j] = x_word2idx['<NA>']

    # padding implicitly present, as the index of the padding token is 0
    x_test_seq = np.zeros((len(X_test), 20), dtype=np.int32)

    # form embeddings for samples testing data
    for i, da in enumerate(X_test):
        for j, token in enumerate(da):
            # truncate long titles
            if j >= 20:
                break
            # represent each token with the corresponding index
            if token in x_word2idx:
                x_test_seq[i][j] = x_word2idx[token]
            else:
                x_test_seq[i][j] = x_word2idx['<NA>']

    print("---------------------------formed word-embeddings for samples-----------------------------------")
    return x_train_seq, x_test_seq
def top_words(self, num_words=10):
    '''
    Takes an int (num_words) and prints the (num_words) most common words
    in the corpus.
    '''
    cleaned_reviews = self.reviews
    one_big_string = " ".join(cleaned_reviews)
    splits = one_big_string.split()
    freq_splits = FreqDist(splits)
    print("\n")
    for i, term in enumerate(freq_splits.most_common(num_words)):
        print(f'{i+1}. {term}')
def nameTheTopic(self, message, tokenizer):
    message = self.concatenate_list_data(message, tokenizer)
    finder = BigramCollocationFinder.from_words(word_tokenize(message))
    f = finder.ngram_fd.items()
    flipped = list([(v, k) for k, v in f])
    flipped.sort(reverse=True)
    if float(flipped[0][0]) > 1:
        s = " "
        return s.join(flipped[0][1]).title()
    else:
        frequency = FreqDist(word_tokenize(message))
        return frequency.most_common(1)[0][0].title()
def tf(text):
    # term frequency
    tf_score = {}
    freq_dist = FreqDist(text)
    kwords = freq_dist.most_common(None)
    # note: total_words is the number of *distinct* words, not the total token count
    total_words = len(kwords)
    for k in kwords:
        if k is not None:
            tf_score[k[0]] = k[1] / total_words
    return tf_score
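# Hedged usage sketch for tf() above (not from the original source; assumes Python 3
# division). It illustrates that each count is divided by the number of distinct words.
sample_tokens = "to be or not to be".split()
print(tf(sample_tokens))  # {'to': 0.5, 'be': 0.5, 'or': 0.25, 'not': 0.25}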
def _makeProfile(self, auth, text):
    wdlist = self._preprocess(text)
    totalvocab = len(wdlist)
    fd = FreqDist(wdlist)
    commonwords = dict(fd.most_common(self.topN))
    df = pd.DataFrame.from_dict(commonwords, orient='index', columns=[auth])
    df = df / totalvocab  # normalize
    df = df.transpose()
    return df
def featex(data):
    res = []
    for d in data:
        fdist = FreqDist(w.lower() for w in d[0])
        # skip the 5 most frequent words, keep the next 95
        words = [w for w, _ in fdist.most_common(100)[5:]]
        feat_dict = {w: fdist[w] for w in words}
        res = res + [(feat_dict, d[1], d[2])]
    return res
def learnColors(colorNamesDf, n=43):
    '''
    Learn the basic color names and their RGB values from the data frame.
    Return a list of basic_color_term objects (name + RGB tuple).
    '''
    words = []
    results = []
    red_n = defaultdict(int)
    red_sum = defaultdict(int)
    green_n = defaultdict(int)
    green_sum = defaultdict(int)
    blue_n = defaultdict(int)
    blue_sum = defaultdict(int)

    for index, row in colorNamesDf.iterrows():
        tokens = tok.tokenize(row['color_name_raw'])
        # Store data for average RGB values per token
        for t in tokens:
            red_n[t] += 1
            red_sum[t] += row['red']
            green_n[t] += 1
            green_sum[t] += row['green']
            blue_n[t] += 1
            blue_sum[t] += row['blue']
        words += tokens

    fd = FreqDist(words)
    basic_color_terms = [c for (c, f) in fd.most_common(n)]
    drop = ['of', 'mist', 'sea', 'sweet', 'spring', 'ice', 'sky', 'light', 'garden',
            'stone', 'deep', 'golden', 'dark', 'pale', 'soft', 'the', 'fresh',
            'mountain', 'sage', 'desert']
    basic_color_terms = [color for color in basic_color_terms if color not in drop]

    # Take the average R, G, B values for each of the basic color terms
    for basic_color_term in basic_color_terms:
        r = red_sum[basic_color_term] / red_n[basic_color_term]
        g = green_sum[basic_color_term] / green_n[basic_color_term]
        b = blue_sum[basic_color_term] / blue_n[basic_color_term]
        newColor = Color(basic_color_term, r, g, b)
        # Add related words to the basic color object
        relatedWords = findRelatedWords(basic_color_term)
        newColor.setRelatedWords(relatedWords)
        results.append(newColor)
    return results
def print_top_words_from_text(self, text):
    print("\n<b>Top 10 words</b>")
    tokens = self.tokenize_without_punctuation(text)
    nltkText = nltk.Text(tokens)
    fdist = FreqDist(nltkText)
    print("<ul>")
    for word, frequency in fdist.most_common(10):
        print("  <li>")
        print("    %s : <b>%d</b>" % (word, frequency))
        print("  </li>")
    print("</ul>")
def getMostCommonWords(allTweets):
    regex = re.compile(r'[^\w ]')
    string = ''
    for tweet in allTweets:
        string += (" " + tweet.text)
    string = string.lower()
    string = regex.sub('', string)
    tokens = pos_tag(word_tokenize(string))
    tokens = cleanUpPosTags(tokens)
    fdist = FreqDist(tokens)
    return fdist.most_common(100)
def train_social():
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')
    # tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]
    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []
    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    all_pos_words = get_all_words(positive_cleaned_tokens_list)
    freq_dist_pos = FreqDist(all_pos_words)
    print(freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive") for tweet_dict in positive_tokens_for_model]
    negative_dataset = [(tweet_dict, "Negative") for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)
    train_data = dataset[:7000]
    test_data = dataset[7000:]

    # Classifier - TODO Add persistence
    classifier = NaiveBayesClassifier.train(train_data)
    print("Accuracy is:", classify.accuracy(classifier, test_data))
    print(classifier.show_most_informative_features(100))

    custom_tweet = "I ordered just once from TerribleCo, they screwed up, never used the app again."
    custom_tokens = remove_noise(word_tokenize(custom_tweet))
    print(custom_tweet, classifier.classify(dict([token, True] for token in custom_tokens)))
    return classifier
def assign_relations(dataset, name="cross_relations"):
    num_rows, num_cols = len(dataset), len(dataset[0])
    cross_relations = {}
    for u, v in product(xrange(num_cols), repeat=2):
        if u == v:
            continue

        # Collect the deduplicated sentences that relate column u to column v.
        cross_sents = []
        deduplicated_set = set()
        for i, row in enumerate(dataset):
            if isinstance(row[u], TEntry) and isinstance(row[v], TEntry):
                for reln in row.attrs["cross_relations"]:
                    if (row[u].text.lower() in set(obj.lower() for obj in reln.subjects)
                            and row[v].text.lower() in set(obj.lower() for obj in reln.objects)):
                        if (reln.doc_name, reln.sent_num) in deduplicated_set:
                            continue
                        deduplicated_set.add((reln.doc_name, reln.sent_num))
                        cross_sents.append(reln.sent)

        # The five most common non-stopword verb lemmas in those sentences.
        freq_dict = FreqDist([
            token["lemma"].lower()
            for tokens in cross_sents
            for token in tokens
            if token["word"].lower() not in STOPWORDS and token["pos"].startswith("VB")
        ])
        verb_set = {word for word, _ in freq_dict.most_common()[:5]}

        # Keep only the relations whose relation span contains one of those verbs.
        cross_reln = []
        deduplicated_set = set()
        for i, row in enumerate(dataset):
            if isinstance(row[u], TEntry) and isinstance(row[v], TEntry):
                for reln in row.attrs["cross_relations"]:
                    if (row[u].text.lower() in set(obj.lower() for obj in reln.subjects)
                            and row[v].text.lower() in set(obj.lower() for obj in reln.objects)):
                        if (reln.doc_name, reln.sent_num) in deduplicated_set:
                            continue
                        deduplicated_set.add((reln.doc_name, reln.sent_num))
                        accepted = False
                        for token in reln.sent[slice(*reln.relation_span)]:
                            if token["lemma"] in verb_set and token["pos"].startswith("V"):
                                accepted = True
                        if accepted:
                            cross_reln.append((reln, reln.relation))
        cross_relations[u, v] = cross_reln
    dataset.attrs[name] = cross_relations
def new_terms(texts):
    # df = pd.read_csv(csv_file)
    # texts = list(df.iloc[:, 0])
    tokens = preprocess(texts)
    tokens_join = list(chain.from_iterable(tokens))
    fdist = FreqDist(str(w) for w in tokens_join)
    mostCommon = fdist.most_common(100)
    return mostCommon
def get_token_list(df, col, freq=False):
    """Takes in a DataFrame and a column that contains tokenized texts and returns
    a list containing all the tokens (including duplicates) from that column.

    If freq=True, the function will also print out the number of unique tokens
    and the top 25 most common words, as well as their counts, based on nltk's
    FreqDist.

    Args:
        df (Pandas DataFrame): DataFrame from which to obtain tokenized text.
        col (str): Name of the column that contains the tokenized text.
        freq (bool, default=False): Whether to print a summary of the token list.

    Returns:
        iterable: List of tokens as strings.

    Example:
        >>> df = pd.DataFrame({'numbers': [2, 4],
        ...                    'text': [['an', 'example'], ['another', 'example']]})
        >>> example_tokens = get_token_list(df, col='text', freq=True)
        >>> example_tokens

        ********** text Summary **********

        Number of unique words = 3

             token  count
        0  example      2
        1       an      1
        2  another      1

        ['an', 'example', 'another', 'example']
    """
    import pandas as pd
    from nltk import FreqDist

    # Create list of all tokens
    tokens = []
    for text in df[col].to_list():
        tokens.extend(text)

    if freq:
        # Make a FreqDist from the token list
        fd = FreqDist(tokens)
        # Display the number of unique tokens and the 25 most common words
        print('\n********** {} Summary **********\n'.format(col))
        print('Number of unique words = {}'.format(len(fd)))
        display(pd.DataFrame(fd.most_common(25), columns=['token', 'count']))

    return tokens
def word_freq(corpus, article):
    wordfreq = []
    wf = FreqDist(corpus)
    for word, freq in wf.most_common():
        # (word frequency / body token count) * 100
        rel = (freq / len(corpus)) * 100
        wwf = word, freq, rel
        wordfreq.append(wwf)
    return wordfreq
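# Illustrative call for word_freq() above (not from the original source; assumes Python 3
# division). The second argument `article` is unused by the function, so None is passed.
sample_corpus = "a a a b b c".split()
for word, freq, rel in word_freq(sample_corpus, None):
    print(word, freq, round(rel, 1))  # a 3 50.0 / b 2 33.3 / c 1 16.7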
def lr_metric(self, word, top5):
    num_neg = 0
    num_pos = 0
    if top5:
        freq = FreqDist([w for rew in self.clean_neg for w in rew])
        best_neg, _ = zip(*freq.most_common(5))
        freq = FreqDist([w for rew in self.clean_pos for w in rew])
        best_pos, _ = zip(*freq.most_common(5))
        print("Top 5 words used in negative reviews:")
        print(best_neg)
        print("Top 5 words used in positive reviews:")
        print(best_pos)
    # default to 0 so the comparison below does not fail on unseen words
    num_neg = self.neg_counts.get(word, 0)
    num_pos = self.pos_counts.get(word, 0)
    if num_pos >= 10 and num_neg >= 10:
        return float(num_pos / num_neg)
    else:
        return -1
def create_vocabulary(self, column, num_words_per_text):
    vocabulary = []
    for lemmatized_text in self.data[column]:
        freq_dist = FreqDist(lemmatized_text)
        count = 0
        for word, times in freq_dist.most_common(50):
            if count == num_words_per_text:
                break
            elif word not in vocabulary and times > 1:
                vocabulary.append(word)
                count += 1
    self.vocabulary = vocabulary
def extract_ngrams(text, low=1, high=2, lowercase=False, filter_punctuation=True,
                   binary=False, least_common=None, most_common=None,
                   normalize=False, sample=False):
    # text = ' '.join(review.paragraphs)
    tokens = None
    # Make lowercase
    if lowercase:
        tokens = word_tokenize(text.lower())
    else:
        tokens = word_tokenize(text)

    # Remove punctuation
    if filter_punctuation:
        words = [t for t in tokens if t not in PUNCTUATION]
    else:
        words = [t for t in tokens]

    # Do the N-gram thing
    ngram_counts = {}
    assert not (sample and binary), \
        "Please don't make sample and binary True. One or the other or neither pls"
    for n in range(low, high + 1):
        ngram_freqdist = FreqDist(ngrams(words, n))
        grams_to_consider = ngram_freqdist
        if least_common:
            assert least_common > 0.0 and least_common <= 1.0, \
                'Least common must be a proportion, not %.3f' % least_common
            num_least_common = int(least_common * ngram_freqdist.N())
            grams_to_consider = []
            for gram, count in ngram_freqdist.most_common()[-1 * num_least_common:]:
                grams_to_consider.append(gram)
        for gram in grams_to_consider:
            if sample:
                ngram_counts[gram] = ngram_freqdist.freq(gram)
            elif binary:
                ngram_counts[gram] = True
            else:
                ngram_counts[gram] = ngram_freqdist[gram]

    if normalize:
        total_counts = sum(count for ngram, count in ngram_counts.items())
        for gram, count in ngram_counts.items():
            ngram_counts[gram] = count / total_counts
    return ngram_counts
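# Hypothetical usage of extract_ngrams() above (not from the original source). It assumes
# the NLTK punkt tokenizer data is available and that a module-level PUNCTUATION constant
# exists; the definition below is an illustrative assumption only.
import string
PUNCTUATION = set(string.punctuation)

counts = extract_ngrams("I like this movie. I really like it.", low=1, high=2)
print(counts[('like',)])      # unigram count for "like"
print(counts[('I', 'like')])  # bigram count for ("I", "like"); keys are tuples from nltk.ngrams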
def RetornNucleo(frase):
    # Instantiate spaCy with the Portuguese model
    nlp = spacy.load("pt")
    # run the sentence through the spaCy model
    doc = nlp(frase)
    # same sentence without the irrelevant words (stopwords)
    frase2 = principal.RemoverIrrelevantes(frase)
    doc2 = nlp(frase2)

    # Build the dependency dataset
    dep_palavra = []
    for token in doc:
        while token.head != token:
            x = token.text
            dep_palavra.append(x)
            token = token.head

    # Build the dependency dataset without the irrelevant words
    dep_PalvraIrr = []
    if len(frase2.split()) > 1:
        for token in doc2:
            while token.head != token:
                x = token.text
                dep_PalvraIrr.append(x)
                token = token.head
    else:
        dep_PalvraIrr.append(frase2)

    maiorfreq = FreqDist(item for item in dep_palavra)
    maiorfreq2 = FreqDist(item for item in dep_PalvraIrr)
    mf1 = maiorfreq.most_common(1)
    mf2 = maiorfreq2.most_common(1)
    if mf1 == mf2:
        return mf1
    else:
        return ValidarNucleos(frase, mf1[0][0], mf2[0][0])
def plt_keyword_frequency(title='{} most frequent keywords', x_label='Keyword', y_label='Frequency',
                          limit=20, save=False, is_long_title=False,
                          file='plt_keyword_frequency.png', fig_size=PLOT_DIMENSIONS):
    """
    Display a plot that shows the most frequent keywords.
    The number of words is determined by the limit.
    """
    # set the style
    sns.set(style=SN_STYLE, font=FONT_NAME)
    plt.figure(figsize=fig_size)
    title = title.format(limit)

    # get data
    df = roll.roll_with_entities_df()
    # filter out rows without keywords
    df_keywords = df[df[common.KEYWORDS_COL].notnull()]
    # get keywords as a list
    all_keywords = df_keywords[common.KEYWORDS_COL].to_list()

    # remove the semicolon delimiter
    kw = []
    for k in all_keywords:
        for i in k.split(';'):
            kw.append(i)

    kw_freq = FreqDist(kw)
    # create a pandas DataFrame
    kw_df = pd.DataFrame(kw_freq.most_common(limit), columns=['Word', 'Frequency']).set_index('Word')

    sns.set()
    kw_df.plot(kind='bar', legend=None)
    plt.xticks(fontsize=10, fontname=FONT_NAME)
    plt.yticks(fontsize=10, fontname=FONT_NAME)
    # plot labels
    set_labels(x_label, y_label)
    # plot title
    title_text(title, is_long_title)
    # show or save the image to file
    save_or_show(save=save, plot_file_name=file)
def load_words(source, vocab_size=10000, limit=None, max_length=None):
    """
    Loads sentences (or other natural language sequences) from a text file.
    Assumes a single sequence per line.

    :param source: Text file to read from
    :param vocab_size: Maximum number of words to retain. If there are more unique
        words than this, the most frequent "vocab_size" words are used, and the rest
        are replaced by the <UNK> symbol
    :param limit: If not None, only the first "limit" characters are read.
        Useful for debugging on large corpora.
    :param max_length: If not None, any sentence containing more words than this is removed.
    :return: (1) A list of lists of integers representing the encoded sentences,
             (2) a dict from strings to ints representing the mapping from words to indices,
             (3) a list of strings representing the mapping from indices to words.
    """
    # Reading raw text from the source file
    f = open(source, 'r')
    x_data = f.read()
    f.close()
    print('raw data read')

    if limit is not None:
        x_data = x_data[:limit]

    # Splitting raw text into an array of sequences
    x = [text_to_word_sequence(x) for x in x_data.split('\n') if len(x) > 0]
    if max_length is not None:
        x = [s for s in x if len(s) <= max_length]

    # Creating the vocabulary set with the most common words (leaving room for PAD, START, UNK)
    dist = FreqDist(np.hstack(x))
    x_vocab = dist.most_common(vocab_size - len(EXTRA_SYMBOLS))

    # Creating an array of words from the vocabulary set; used as the index-to-word dictionary
    i2w = [word[0] for word in x_vocab]
    # Adding the extra symbols to the beginning of the array
    i2w = EXTRA_SYMBOLS + i2w
    # Creating the word-to-index dictionary from the array created above
    w2i = {word: ix for ix, word in enumerate(i2w)}

    # Converting each word to its index value
    for i, sentence in enumerate(x):
        for j, word in enumerate(sentence):
            if word in w2i:
                x[i][j] = w2i[word]
            else:
                x[i][j] = w2i['<UNK>']
    return x, w2i, i2w
def text_similarity():
    """
    Text similarity
    :return:
    """
    text1 = 'I like the movie so much '
    text2 = 'That is a good movie '
    text3 = 'This is a great one '
    text4 = 'That is a really bad movie '
    text5 = 'This is a terrible movie'
    text = text1 + text2 + text3 + text4 + text5
    words = nltk.word_tokenize(text)
    freq_dist = FreqDist(words)
    print(freq_dist['is'])

    # take the n = 5 most common words
    n = 5
    # build the "common word list"
    most_common_words = freq_dist.most_common(n)
    print(most_common_words)

    def lookup_pos(most_common_words):
        """Look up the position of each common word."""
        result = {}
        pos = 0
        for word in most_common_words:
            result[word[0]] = pos
            pos += 1
        return result

    # record the positions
    std_pos_dict = lookup_pos(most_common_words)
    print(std_pos_dict)

    # new text
    new_text = 'That one is a good movie. This is so good!'
    # initialise the frequency vector
    freq_vec = [0] * n
    # tokenize
    new_words = nltk.word_tokenize(new_text)
    # count word frequencies over the "common word list"
    for new_word in new_words:
        if new_word in list(std_pos_dict.keys()):
            freq_vec[std_pos_dict[new_word]] += 1
    print(freq_vec)
def analyzeAuthors():
    authors = []
    with open('../top100clean.csv', 'rb') as bookfile:
        reader = csv.reader(bookfile)
        for row in reader:
            authors.append(row[4])
    authorset = nltk.Text(authors)
    fd = FreqDist(authorset)
    prolific = fd.most_common(10)
    for k, v in prolific:
        print str(k) + "\t" + str(v)
def removeStopwords(t, n, label="No label"):
    frequency = FreqDist(t)
    # the n most frequent tokens in the text itself
    s1 = frequency.most_common(n)
    s1 = [x[0] for x in s1]
    # plus the standard English stopword list
    s2 = stopwords.words("english")
    s = set(s1 + s2)
    return [x for x in t if x not in s]
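# Brief usage sketch for removeStopwords() above (not from the original source); it assumes
# the NLTK stopwords corpus has been downloaded. Both the n most frequent in-text tokens
# and the standard English stopwords are removed.
sample_tokens = "spam spam spam eggs and ham".split()
print(removeStopwords(sample_tokens, 1))  # 'spam' (most frequent) and 'and' (stopword) removed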
def ViewWords(self, event):
    self.processing.SetLabel('Processing....')
    global filePath
    if filePath == "":
        self.OutLabel.SetLabel("Output\n\nPlease select file.")
        return
    stopwords = nltk.corpus.stopwords.words('english')
    f = open(filePath, 'r').read()
    tokens = nltk.word_tokenize(f)
    words = [w.lower() for w in tokens]
    alphawords = [w for w in words if not alpha_filter(w)]
    stoppedwords = [w for w in alphawords if w not in stopwords]
    fdist = FreqDist(stoppedwords)
    inp = self.editname.GetValue()
    if inp == '' or inp == 0 or inp is None:
        inp = 10000
    val = int(inp)
    outstr = "Output : \nWord\t\t --> \t Frequency\n"
    for word in fdist.most_common(val):
        print word
        outstr = outstr + str(word[0]).ljust(40) + str(word[1]) + "\n"
    self.OutLabel.SetLabel(str(outstr))
    self.processing.SetLabel('Done.')
def getDict(filename):
    fileComment = open(filename, 'rU')
    # our data is in latin-1 format
    rawData = fileComment.read().decode('latin-1')
    tokens = word_tokenize(rawData)
    # remove the common and non-alpha tokens
    ignoredWords = stopwords.words('english')
    filterNoAlpha = [i.lower() for i in tokens if i.isalpha()]
    filtered_words = [i for i in filterNoAlpha if i not in ignoredWords]
    fdist1 = FreqDist(filtered_words)
    final = fdist1.most_common(200)
    fileComment.close()
    return dict(final)
def processTexts(processedListOfTexts):
    # FreqDist
    print('\n--------------------------------------------------\nCompute most frequent indicative words')
    # requires tokenized nltk/string; can also use ' '.join(processedListOfTexts)
    fullDist = FreqDist(str(word) for word in processedListOfTexts)
    # Use items() because most_common() doesn't work on mac.
    # Note: please confirm whether the output of items() is sorted by value
    top = [word[0] for word in fullDist.most_common(50) if word[0] not in stopWordSet]
    for word in top:
        print(word)

    relevantNouns = []
    print('\n--------------------------------------------------\nRelevant Nouns:')
    for word in top:
        if word in nounList:
            relevantNouns.append(word)
            print(word)
def common_tri(textt):
    word = word_tokenize(textt)
    fdist = FreqDist(trigrams(word))
    h = fdist.most_common(1)
    h = str(h)
    trans = maketrans(symbols, whitespace)
    x = h.translate(trans)
    x = x.strip()
    print(x)
    # The function operates on trigrams; a simple adjustment can be made to use
    # whatever number of terms (n-grams).
def get_palavras_frequentes(self):
    """To be documented."""
    if self._palavras_frequentes is None:
        print "-- Checking the most frequent words of the corpus."
        # Test - returns only the 2000 most frequent words of the corpus
        todas_palavras = [word.lower() for word in self._corpus.words()]
        freq_dist_palavras = FreqDist(todas_palavras)
        frequencia_palavras = freq_dist_palavras.most_common(2000)  # 2000 most frequent words
        self._palavras_frequentes = [palavra for palavra, frequencia in frequencia_palavras]
        # all_words = FreqDist(word.lower() for word in self.corpus.words())
        # self.word_features = list(all_words)[:2000]
    return self._palavras_frequentes
def check_svc_bef_aft(list_line, command):
    # check the freq of words before and after a bus service
    # check the freq of words before and after a word (number) which is not a bus svc
    text = ''
    for i in range(0, len(list_line), 3):
        split_first = list_line[i].strip().split('\t')
        j = i + 1
        split_second = list_line[j].strip().split('\t')
        for k in range(0, len(split_second)):
            if command == 'before_svc':
                if int(split_second[k]) == 1:  # means bus svc
                    if k > 0:  # bus svc doesn't appear at the first position of the sentence
                        text = text + split_first[k - 1].lower() + ' '  # take the word before
                        print i, k, split_first[k]
            if command == 'after_svc':
                if int(split_second[k]) == 1:  # means bus svc
                    if k < len(split_second) - 1:  # bus svc doesn't appear at the last position
                        text = text + split_first[k + 1].lower() + ' '  # take the word after
            if command == 'before_notsvc':
                # text is a number and not a bus svc
                if RepresentsInt(split_first[k]) is True and int(split_second[k]) != 1:
                    if k > 0:
                        text = text + split_first[k - 1].lower() + ' '
            if command == 'after_notsvc':
                # text is a number and not a bus svc
                if RepresentsInt(split_first[k]) is True and int(split_second[k]) != 1:
                    if k < len(split_second) - 1:
                        text = text + split_first[k + 1].lower() + ' '
    fdist = FreqDist()
    tokens = word_tokenize(str(text))
    fdist.update(tokens)
    for value in fdist.most_common(len(fdist)):
        print value[0], '\t', value[1]
    print text
def featuresextr(self, set_name='featurespace.csv'):
    new_set = []
    for tweet_tokens, features, weight in self.new_tweetset:
        # features are pending till the performance methodology is settled
        total_tokens = len(tweet_tokens)
        frequencies = FreqDist(tweet_tokens)
        words_tfidf = [(t_word, round(self.termfrequency(count_w, total_tokens) * self.inversdocfreq(t_word), 2))
                       for t_word, count_w in frequencies.most_common()]
        tfidf_vector = tuple([value for unigram, value in words_tfidf])
        feat_bigrams = self.posbigrams(tweet_tokens)
        ortony_occur = self.wordsoccurrences(tweet_tokens)
        profane_occur = self.wordsoccurrences(tweet_tokens, option='profane')
        preprocessed_twits = self.new_tweetset[:, 0]
        guardar_csv(preprocessed_twits, 'recursos/processed_twits_slang.csv')
        new_set.append((sum(tfidf_vector), ortony_occur, profane_occur, sum(feat_bigrams), weight))
    guardar_csv(new_set, 'recursos/{}'.format(set_name))
    self.features_space = np.array(new_set)
def create_document_properties(cat, doc):
    words = [clean_string(x) for x in corpus.words(fileids=doc) if clean_string(x) != '']
    tot_words = len(words)
    sentences = len(corpus.sents(fileids=doc))
    word_sent = [len(x) for x in corpus.sents(doc)]
    avg_word = sum(word_sent) / len(word_sent)
    char_word = [len(x) for x in words]
    avg_char = sum(char_word) / len(char_word)
    words_ex_stop = [x for x in words if x not in stop]
    freq_dist = FreqDist(words_ex_stop)
    common = freq_dist.most_common(5)
    return {
        "category": cat,
        "doc": doc,
        "tot_words": tot_words,
        "avg_char": avg_char,
        "sentences": sentences,
        "avg_word": avg_word,
        "most_common": common[0][0],
        "most_common_freq": common[0][1],
    }