def find_abbreviations():
    # Rank tokens by how often they appear immediately before a period,
    # to surface likely abbreviations in the article corpus.
    import db
    from tokenizers import es
    from nltk import FreqDist

    corpus = db.connect()
    #text = '\n'.join([a['text'] for a in corpus.articles.find().limit(10)])
    text = '\n'.join([a['text'] for a in corpus.articles.find()])
    tokens = es.tokenize(text, ignore_abbreviations=True)

    fd = FreqDist()
    fd_abbr = FreqDist()
    fd_n_abbr = FreqDist()
    n_tokens = len(tokens)
    for i in range(n_tokens):
        fd.inc(tokens[i])
        if i < (n_tokens - 1) and tokens[i + 1] == u'.':
            fd_abbr.inc(tokens[i])
        else:
            fd_n_abbr.inc(tokens[i])

    adjusted = {}
    f_avg = len(fd.keys()) / fd.N()
    for t, n in fd_abbr.iteritems():
        f = fd.get(t, 0) / fd.N()
        deviation = 1 + (f - f_avg)
        adjusted[t] = n * deviation / fd_n_abbr.get(t, 1) / len(t)

    items = adjusted.items()
    items.sort(key=lambda i: i[1], reverse=True)
    for t, n in items[:100]:
        print u'%s. %f (%d, %d)' % (t, n, fd_abbr[t], fd_n_abbr.get(t, 0))

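# Note (added; not part of the original snippet): FreqDist.inc() and
# FreqDist.iteritems() only exist in NLTK 2 under Python 2. With NLTK 3 the
# equivalents are fd[token] += 1 and fd.items() / fd.most_common().
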
def analyzeTitles():
    # Tabulate how many words book titles contain ("w.i.t." = words in title)
    # and which title words are most common. Assumes module-level imports:
    # csv, nltk, and FreqDist from nltk.
    fulltitles = []
    titles = []
    with open('../top100clean.csv', 'rb') as bookfile:
        reader = csv.reader(bookfile)
        for row in reader:
            if "..." in row[0]:
                row[0] = " ".join(row[0].split(" ")[:-1])
            words = nltk.word_tokenize(row[0])
            for w in words:
                if w.isalpha() and w.lower() not in ['the', 'a']:
                    titles.append(w.lower())
            fulltitles.append(row[0])

    titleset = nltk.Text(titles)
    wordsintitle = [len(f.split(" ")) for f in fulltitles]
    wit_fd = FreqDist(wordsintitle)
    print "\nw.i.t.\tfreq"
    print "--------------------"
    for numwords, times in wit_fd.iteritems():
        print str(numwords) + "\t" + str(times)
    print "\n"

    print "\nword\t\tfreq"
    print "--------------------"
    fd = FreqDist(titleset)
    common_words = fd.most_common(25)
    for k, v in common_words:
        print str(k) + "\t\t" + str(v)

def get_stats(self, output_fname):
    fd = FreqDist()
    for text in self.texts:
        fd.update(set(text))
    fh = open(output_fname, 'w')
    text = Text(self.paragraph_tokens)
    fdist = FreqDist(text)
    for (w, f) in fdist.iteritems():
        print >> fh, "%s\t%i" % (w, f)
    fh.close()

def description_and_tokens(self, id_, timestamp, soup):
    # Extract the listing description from the page, strip embedded <script>
    # tags, and store the text plus token frequencies in Redis.
    overview = soup.find(id="description")
    for scr in overview.find_all('script'):
        scr.clear()
    desc = overview.text
    tokens = word_tokenize(desc)
    freqdist = FreqDist(tokens)
    self.redis.set('daftpunk:%s:description' % id_, desc)
    for token, freq in freqdist.iteritems():
        self.redis.zadd('daftpunk:%s:tokens' % id_, freq, token)

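# Note (added; not part of the original snippet): the zadd() call above uses the
# redis-py 2.x argument order (name, score, member). With redis-py >= 3.0 the
# scores are passed as a mapping instead, e.g.:
#     self.redis.zadd('daftpunk:%s:tokens' % id_, {token: freq})
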
def mapapalabras(issn, titulo):
    # Build and save a word cloud ("mapa de palabras") of a journal's keyword
    # frequencies. Expects wordcloud, matplotlib.pyplot as plt,
    # operator.itemgetter, and FreqDist to be imported at module level.
    print 'Analizando ' + titulo
    kw = keywordsrevista(issn)
    print titulo + ' ya analizado'
    saveoutput(kw, issn, titulo)
    fd = FreqDist(kw)
    lista = [(k, v) for k, v in fd.iteritems()]
    ordenado = sorted(lista, key=itemgetter(1), reverse=True)
    woc = wordcloud.WordCloud(max_font_size=40,
                              relative_scaling=.5).generate_from_frequencies(ordenado)
    plt.figure()
    plt.suptitle(titulo)
    #ax = plt.add_subplot(111, autoscale_on=False, xlim=(-1, 5), ylim=(-3, 5))
    #ax.annotate('pixels', xy=(0, 0), xycoords='figure fraction')
    plt.imshow(woc)
    plt.axis("off")
    plt.savefig(issn + '-' + titulo + '.png')

def main():
    # Plot a word-length vs. frequency curve for each corpus, read from its
    # tab-separated unigram counts, and save the figure to char.png.
    corpora = ['idwiki', 'kaskus', 'kompas', 'twitter']
    corpora_dict = {}
    for corpus in corpora:
        fd = FreqDist()
        for line in codecs.open('../data/' + corpus + '.1gram', 'r', 'utf-8'):
            (word, freq) = line.split('\t')
            fd[len(word)] += int(freq.strip())
        sorted_fd = sorted(fd.iteritems(), key=operator.itemgetter(0))
        lengths = [0] + [x for x, y in sorted_fd]
        freqs = [0] + [y for x, y in sorted_fd]
        plt.plot(lengths, freqs, label=corpus)
    plt.grid(True)
    plt.xlabel('length', fontsize=14, fontweight='bold')
    plt.ylabel('frequency', fontsize=14, fontweight='bold')
    plt.legend(loc='upper right')
    plt.savefig('char.png')
    plt.close()

def create_word_scores():
    tweets = get_tweets_from_db()
    postweets = tweets[800001:]
    negtweets = tweets[:800001]

    posWords = []
    negWords = []
    for tweet in postweets:
        posWords.append(tweet[0])
    for tweet in negtweets:
        negWords.append(tweet[0])
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word.lower()] += 1
        cond_word_fd['pos'][word.lower()] += 1
    for word in negWords:
        word_fd[word.lower()] += 1
        cond_word_fd['neg'][word.lower()] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores

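# Usage sketch (added; the helper name and cutoff are illustrative assumptions,
# not part of the original code): the chi-square scores returned by
# create_word_scores() are typically used to keep only the highest-scoring
# words as classifier features, as later snippets in this collection also do.
def find_best_words(word_scores, number=1000):
    best = sorted(word_scores.iteritems(), key=lambda (w, s): s, reverse=True)[:number]
    return set(w for w, s in best)
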
def get_frequent_features(self, min_support):
    # get n item sets
    wnl = WordNetLemmatizer()
    features = [wnl.lemmatize(token) for token in self.candidate_feature_list()]
    dist = FreqDist(features)
    return [(item, count) for (item, count) in dist.iteritems()
            if count >= min_support]

def get_frequent_features_list(self, min_support):
    dist = FreqDist(self.get_candidate_feature_list())
    features = [(item, count) for (item, count) in dist.iteritems()
                if count >= min_support]
    return self.prune_features(features, 3)

import re, os, glob
from sys import argv
from nltk import FreqDist, PorterStemmer, ingrams

assert len(argv) == 4, "usage: %s inputdir outputdir dev|test" % argv[0]
assert os.path.isdir(argv[1])
indir = argv[1]
wordposngram = "%s/" % argv[2]
assert argv[3] in ("dev", "test")
devortest = argv[3]

leaves = re.compile(r" ([^ )]+)\)")
pos = re.compile(r"\(([^ ]+) [^ )]+\)")
porter = PorterStemmer()

print "extracting ngrams"
for train in glob.glob("%s/*.*.train" % indir):
    fold = int(train.split(".")[-2])
    if fold > 3:
        continue
    wordpostrigrams = FreqDist(
        ingrams((porter.stem(word) + "/" + tag
                 for t in open(train)
                 for word, tag in zip(leaves.findall(t), pos.findall(t))), 3))
    for test in glob.glob("%s/*/*.%d.%s*" % (indir, fold, devortest)):
        output = "%s_%s" % (train.split("/")[-1], test.split("/")[-1])
        testtrigrams = FreqDist(
            ingrams((porter.stem(word) + "/" + tag
                     for t in open(test).readlines()
                     for word, tag in zip(leaves.findall(t), pos.findall(t))), 3))
        open(wordposngram + output, "w").writelines(
            "%s\t%d\n" % (" ".join(a), b)
            for a, b in testtrigrams.iteritems() if wordpostrigrams[a])
        print output
print "done"

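# Note (added; not part of the original script): nltk.ingrams is an NLTK 2
# helper; in NLTK 3 the same call is spelled nltk.ngrams(sequence, 3), which
# likewise yields an iterator of n-gram tuples.
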
word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()
for item in train_set:
    tweet = item[0].lower()
    words = word_tokenize(item[0])
    word_fd.update(words)
    label_word_fd[item[1]].update(words)

pos_word_count = label_word_fd['pos'].N()
neg_word_count = label_word_fd['neg'].N()
total_word_count = pos_word_count + neg_word_count

word_scores = {}
for word, freq in word_fd.iteritems():
    pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                           (freq, pos_word_count),
                                           total_word_count)
    neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                           (freq, neg_word_count),
                                           total_word_count)
    word_scores[word] = pos_score + neg_score

best = sorted(word_scores.iteritems(), key=lambda (w, s): s, reverse=True)[:50]
print best
bestwords = set([w for w, s in best])
print bestwords

def tweet_features(tweet):
    tweet_words = word_tokenize(tweet)
    features = {}
    for word in all_words:

def FeatureChoose(pos_wordlist, neg_wordlist,
                  method=BigramAssocMeasures.chi_sq,
                  featuregram='one', n=6000):
    pos_feature = list()
    neg_feature = list()
    pos_all_words = list()
    neg_all_words = list()
    # pos_all_feature = dict()
    # neg_all_feature = dict()
    if featuregram == 'one':
        for each in pos_wordlist:
            cur = UniGramFeature(each)
            pos_feature.append(cur)
            # pos_all_feature.update(cur)
            pos_all_words.extend(cur)
        for each in neg_wordlist:
            cur = UniGramFeature(each)
            neg_feature.append(cur)
            # neg_all_feature.update(cur)
            neg_all_words.extend(cur)
    elif featuregram == 'two':
        for each in pos_wordlist:
            cur = Mixup2Feature(each)
            pos_feature.append(cur)
            # pos_all_feature.update(cur)
            pos_all_words.extend(cur)
        for each in neg_wordlist:
            cur = Mixup2Feature(each)
            neg_feature.append(cur)
            # neg_all_feature.update(cur)
            neg_all_words.extend(cur)
    elif featuregram == 'three':
        for each in pos_wordlist:
            cur = Mixup3Feature(each)
            pos_feature.append(cur)
            # pos_all_feature.update(cur)
            pos_all_words.extend(cur)
        for each in neg_wordlist:
            cur = Mixup3Feature(each)
            neg_feature.append(cur)
            # neg_all_feature.update(cur)
            neg_all_words.extend(cur)
    else:
        return []

    fd = FreqDist()
    cfd = ConditionalFreqDist()
    for word in pos_all_words:
        fd[word] += 1
        cfd['pos'][word] += 1
    for word in neg_all_words:
        fd[word] += 1
        cfd['neg'][word] += 1
    pos_N = cfd['pos'].N()
    neg_N = cfd['neg'].N()
    N = fd.N()

    score_list = dict()
    for word, freq in fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cfd['pos'][word], (freq, pos_N), N)
        neg_score = BigramAssocMeasures.chi_sq(cfd['neg'][word], (freq, neg_N), N)
        score_list[word] = pos_score + neg_score
    best_topwords = sorted(score_list.iteritems(), key=lambda kk: kk[1], reverse=True)
    # print json.dumps(best_topwords[-100:-1], ensure_ascii=False)
    best_topwords = best_topwords[:n]
    # print json.dumps(best_topwords[:100], ensure_ascii=False)
    best_topwords = set(word for word, freq in best_topwords)
    return pos_feature, neg_feature, best_topwords

from nltk import FreqDist

# Download all books (http://www.nltk.org/data.html)
# NOTE: if this does not work, run this code in Python from the Terminal (not from inside an IDE)
# nltk.download()

# Import a text and examine its words
from nltk.corpus import brown
brown.words()

# Find the frequency of each word in a text
fd = FreqDist(brown.words())

# Find the most frequent words in a text:
# http://stackoverflow.com/questions/268272/getting-key-with-maximum-value-in-dictionary
import operator
max(fd.iteritems(), key=operator.itemgetter(1))
sorted(fd.iteritems(), key=operator.itemgetter(1), reverse=True)[:10]

# Or use the wrapper function
fd.most_common(10)

# Plot the most frequent words
fd.plot(10)
fd.plot(10, cumulative=True)

# See the words with the lowest frequency (these words are called hapaxes)
fd.hapaxes()

# Count all the words
len(brown.words())

# Count unique words
len(set(brown.words()))

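# Note (added; not part of the original walkthrough): under Python 3 / NLTK 3,
# FreqDist subclasses collections.Counter, so iteritems() is gone. Equivalent calls:
#     fd = FreqDist(brown.words())
#     fd.most_common(10)
#     sorted(fd.items(), key=operator.itemgetter(1), reverse=True)[:10]
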
train_word_pos = get_word_pos(train_sents)
train_words = get_words(train_word_pos)
train_pos = get_pos(train_word_pos)
train_pos_unique = get_num_tags(train_pos)

#NUMBER OF POS TAGS
train_num_pos_tags = len(train_pos_unique)

#NUMBER OF TRAINING SENTENCES
num_train_sent = get_number_of_sentences(train_pos_unique)

train_num_of_words = get_number_of_words(train_words)

train_unigrams = ngrams(train_pos, 1)
train_unigram_freq_dist = FreqDist(train_unigrams)
train_unigram_dict = {k[0]: v for k, v in train_unigram_freq_dist.iteritems()}
#print unigram_dict

train_trans_bigrams = ngrams(train_pos, 2)
train_trans_bigram_freq_dist = FreqDist(train_trans_bigrams)
train_trans_bigram_dict = {
    k[0] + "|" + k[1]: v
    for k, v in train_trans_bigram_freq_dist.iteritems()
}

train_word_pos_bigrams = ngrams(train_word_pos, 2)
train_word_pos_bigram_freq_dist = FreqDist(train_word_pos_bigrams)
train_word_pos_bigram_dict = {
    k[0][0] + "|" + k[0][1]: v
    for k, v in train_word_pos_bigram_freq_dist.iteritems()
}

def get_frequent_features(self, min_support):
    # get n item sets
    dist = FreqDist(self.candidate_feature_list())
    features = [(item, count) for (item, count) in dist.iteritems()
                if count >= min_support]
    return self.prune_features(features, 3)