Example #1
def find_abbreviations():
    import db
    from tokenizers import es
    from nltk import FreqDist

    corpus = db.connect()
    #text = '\n'.join([a['text'] for a in corpus.articles.find().limit(10)])
    text = '\n'.join([a['text'] for a in corpus.articles.find()])
    tokens = es.tokenize(text, ignore_abbreviations=True)

    fd = FreqDist()
    fd_abbr = FreqDist()
    fd_n_abbr = FreqDist()
    n_tokens = len(tokens)
    for i in range(n_tokens):
        fd.inc(tokens[i])
        if i < (n_tokens - 1) and tokens[i + 1] == u'.':
            fd_abbr.inc(tokens[i])
        else:
            fd_n_abbr.inc(tokens[i])

    adjusted = {}
    f_avg = len(fd.keys()) / fd.N()
    for t, n in fd_abbr.iteritems():
        f = fd.get(t, 0) / fd.N()
        deviation = 1 + (f - f_avg)
        adjusted[t] = n * deviation / fd_n_abbr.get(t, 1) / len(t)

    items = adjusted.items()
    items.sort(key=lambda i: i[1], reverse=True)
    for t, n in items[:100]:
        print u'%s. %f (%d, %d)' % (t, n, fd_abbr[t], fd_n_abbr.get(t, 0))
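FreqDist.inc() and iteritems() above come from the old NLTK 2 / Python 2 API. A minimal sketch of the same counting loop with the current NLTK 3 interface, assuming tokens is the token list produced above:

from nltk import FreqDist

fd, fd_abbr, fd_n_abbr = FreqDist(), FreqDist(), FreqDist()
for i, token in enumerate(tokens):
    fd[token] += 1                                    # overall token counts
    if i + 1 < len(tokens) and tokens[i + 1] == '.':
        fd_abbr[token] += 1                           # token immediately followed by a period
    else:
        fd_n_abbr[token] += 1                         # token not followed by a period

# most frequent abbreviation candidates
for token, count in fd_abbr.most_common(100):
    print('%s. %d (%d)' % (token, count, fd_n_abbr[token]))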
Example #2
def analyzeTitles():
    fulltitles = []
    titles = []
    with open('../top100clean.csv', 'rb') as bookfile:
        reader = csv.reader(bookfile)
        for row in reader:
            if "..." in row[0]:
                row[0] = " ".join(row[0].split(" ")[:-1])
            words = nltk.word_tokenize(row[0])
            for w in words:
                if w.isalpha() and w.lower() not in ['the','a']:
                    titles.append(w.lower())
            fulltitles.append(row[0])

    titleset = nltk.Text(titles)
    wordsintitle = [len(f.split(" ")) for f in fulltitles]
    wit_fd = FreqDist(wordsintitle)
    print "\nw.i.t.\tfreq"
    print "--------------------"
    for numwords, times in wit_fd.iteritems():
        print str(numwords) + "\t" + str(times)
    print "\n"

    print "\nword\t\tfreq"
    print "--------------------"
    fd = FreqDist(titleset)
    common_words = fd.most_common(25)
    for k, v in common_words:
        print str(k) + "\t\t" + str(v)
Example #3
    def get_stats(self, output_fname):
        fd = FreqDist()
        for text in self.texts:
            fd.update(set(text))

        fh = open(output_fname, 'w')
        text = Text(self.paragraph_tokens)
        fdist = FreqDist(text)
        for (w,f) in fdist.iteritems():
            print >> fh, "%s\t%i" % (w, f)
        fh.close()
Example #4
    def description_and_tokens(self, id_, timestamp, soup):
        overview = soup.find(id="description")
        for scr in overview.find_all('script'):
            scr.clear()

        desc = overview.text
        tokens = word_tokenize(desc)
        freqdist = FreqDist(tokens)

        self.redis.set('daftpunk:%s:description' % id_, desc)
        for token, freq in freqdist.iteritems():
            self.redis.zadd('daftpunk:%s:tokens' % id_, freq, token)
Example #5
def mapapalabras(issn, titulo):
    print 'Analyzing ' + titulo
    kw = keywordsrevista(issn)
    print titulo + ' analyzed'
    saveoutput(kw,issn, titulo)
    fd = FreqDist(kw)
    lista = [(k,v) for k, v in fd.iteritems()]
    ordenado = sorted(lista, key=itemgetter(1), reverse=True)
    woc = wordcloud.WordCloud(max_font_size = 40, relative_scaling=.5).generate_from_frequencies(ordenado)        
    plt.figure()
    plt.suptitle(titulo)
    #ax = plt.add_subplot(111, autoscale_on=False, xlim=(-1, 5), ylim=(-3, 5))
    #ax.annotate('pixels', xy=(0, 0), xycoords='figure fraction')
    plt.imshow(woc)
    plt.axis("off")
    plt.savefig(issn + '-' + titulo + '.png')
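Newer releases of the wordcloud package expect a word-to-frequency dict rather than a sorted list of tuples in generate_from_frequencies; a sketch of the call under that assumption, reusing the FreqDist built above:

import wordcloud

# a FreqDist is a dict-like mapping of word -> count, so it can be passed directly
woc = wordcloud.WordCloud(max_font_size=40, relative_scaling=0.5)
woc.generate_from_frequencies(dict(fd))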
    
Example #6
def main():
    corpora = ['idwiki', 'kaskus', 'kompas', 'twitter']
    corpora_dict = {}

    for corpus in corpora:
        fd = FreqDist()
        for line in codecs.open('../data/' + corpus + '.1gram', 'r', 'utf-8'):
            (word, freq) = line.split('\t')
            fd[len(word)] += int(freq.strip())

        sorted_fd = sorted(fd.iteritems(), key=operator.itemgetter(0))
        lengths = [0] + [x for x, y in sorted_fd]
        freqs = [0] + [y for x, y in sorted_fd]
        plt.plot(lengths, freqs, label=corpus)

    plt.grid(True)
    plt.xlabel('length', fontsize=14, fontweight='bold')
    plt.ylabel('frequency', fontsize=14, fontweight='bold')
    plt.legend(loc='upper right')
    plt.savefig('char.png')
    plt.close()
Example #7
def create_word_scores():
    tweets = get_tweets_from_db()
    postweets = tweets[800001:]
    negtweets = tweets[:800001]
 
    posWords = []
    negWords = []
    for tweet in postweets:
        posWords.append(tweet[0])
    for tweet in negtweets:
        negWords.append(tweet[0])

    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()

    for word in posWords:
        word_fd[word.lower()] += 1
        cond_word_fd['pos'][word.lower()] += 1
    for word in negWords:
        word_fd[word.lower()] += 1
        cond_word_fd['neg'][word.lower()] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
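The loop above is the usual chi-square recipe for ranking words by how strongly they separate the two classes. A small self-contained sketch of the same idea (the word lists are toy data, not the tweet corpus):

from nltk import FreqDist, ConditionalFreqDist
from nltk.metrics import BigramAssocMeasures

pos_words = ['good', 'great', 'great', 'fine']
neg_words = ['bad', 'awful', 'good', 'bad']

word_fd = FreqDist(w.lower() for w in pos_words + neg_words)
cond_word_fd = ConditionalFreqDist()
for w in pos_words:
    cond_word_fd['pos'][w.lower()] += 1
for w in neg_words:
    cond_word_fd['neg'][w.lower()] += 1

pos_n, neg_n = cond_word_fd['pos'].N(), cond_word_fd['neg'].N()
total_n = pos_n + neg_n

# score each word by how unevenly it is distributed across the two classes
word_scores = {}
for word, freq in word_fd.items():
    pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_n), total_n)
    neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_n), total_n)
    word_scores[word] = pos_score + neg_score

best = sorted(word_scores.items(), key=lambda kv: kv[1], reverse=True)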
Example #8
	def get_frequent_features(self, min_support):
		#get n item sets	
		wnl = WordNetLemmatizer()
		features = [wnl.lemmatize(token) for token in self.candidate_feature_list()]
		dist = FreqDist(features)
		return [(item, count) for (item, count) in dist.iteritems() if count >= min_support]
Example #9
    def get_frequent_features_list(self, min_support):
        dist = FreqDist(self.get_candidate_feature_list())
        features = [(item, count) for (item, count) in dist.iteritems() if count >= min_support]
        return self.prune_features(features, 3)
Example #10
import re, os, glob
from sys import argv
from nltk import FreqDist, PorterStemmer, ingrams
assert len(argv) == 4, "usage: %s inputdir outputdir dev|test" % argv[0]
assert os.path.isdir(argv[1])
indir = argv[1]
wordposngram = "%s/" % argv[2]
assert argv[3] in ("dev", "test")
devortest=argv[3]
leaves = re.compile(r" ([^ )]+)\)")
pos = re.compile(r"\(([^ ]+) [^ )]+\)")
porter = PorterStemmer()
print "extracting ngrams"
for train in glob.glob("%s/*.*.train" % indir):
	fold = int(train.split(".")[-2])
	if fold > 3: continue
	wordpostrigrams  = FreqDist(ingrams((porter.stem(word)+"/"+tag
		for t in open(train)
		for word, tag in zip(leaves.findall(t), pos.findall(t))), 3))
	for test in glob.glob("%s/*/*.%d.%s*" % (indir, fold, devortest)):
		output = "%s_%s" % (train.split("/")[-1], test.split("/")[-1])
		testtrigrams = FreqDist(ingrams((porter.stem(word)+"/"+tag
			for t in open(test).readlines()
			for word,tag in zip(leaves.findall(t), pos.findall(t))), 3))
		open(wordposngram+output, "w").writelines("%s\t%d\n" % (" ".join(a), b)
			for a, b in testtrigrams.iteritems() if wordpostrigrams[a])
		print output
print "done"
Example #11
word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()

for item in train_set:
    tweet = item[0].lower()
    words = word_tokenize(tweet)
    word_fd.update(words)
    label_word_fd[item[1]].update(words)

pos_word_count = label_word_fd['pos'].N()
neg_word_count = label_word_fd['neg'].N()
total_word_count = pos_word_count + neg_word_count

word_scores = {}
 
for word, freq in word_fd.iteritems():
    pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
        (freq, pos_word_count), total_word_count)
    neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
        (freq, neg_word_count), total_word_count)
    word_scores[word] = pos_score + neg_score

best = sorted(word_scores.iteritems(), key=lambda (w,s): s, reverse=True)[:50]
print best
bestwords = set([w for w, s in best])
print bestwords

def tweet_features(tweet):
    tweet_words = word_tokenize(tweet)
    features = {}
    for word in all_words:
Example #12
def FeatureChoose(pos_wordlist, neg_wordlist, method=BigramAssocMeasures.chi_sq, featuregram='one', n=6000):
	pos_feature = list()
	neg_feature = list()
	pos_all_words = list()
	neg_all_words = list()
	# pos_all_feature = dict()
	# neg_all_feature = dict()
	if featuregram == 'one':
		for each in pos_wordlist:
			cur = UniGramFeature(each)
			pos_feature.append(cur)
			# pos_all_feature.update(cur)
			pos_all_words.extend(cur)
		for each in neg_wordlist:
			cur = UniGramFeature(each)
			neg_feature.append(cur)
			# neg_all_feature.update(cur)
			neg_all_words.extend(cur)
	elif featuregram == 'two':
		for each in pos_wordlist:
			cur = Mixup2Feature(each)
			pos_feature.append(cur)
			# pos_all_feature.update(cur)
			pos_all_words.extend(cur)
		for each in neg_wordlist:
			cur = Mixup2Feature(each)
			neg_feature.append(cur)
			# neg_all_feature.update(cur)
			neg_all_words.extend(cur)
	elif featuregram == 'three':
		for each in pos_wordlist:
			cur = Mixup3Feature(each)
			pos_feature.append(cur)
			# pos_all_feature.update(cur)
			pos_all_words.extend(cur)
		for each in neg_wordlist:
			cur = Mixup3Feature(each)
			neg_feature.append(cur)
			# neg_all_feature.update(cur)
			neg_all_words.extend(cur)
	else:
		return []

	fd = FreqDist()
	cfd = ConditionalFreqDist()
	for word in pos_all_words:
		fd[word] += 1
		cfd['pos'][word] += 1
	for word in neg_all_words:
		fd[word] += 1
		cfd['neg'][word] += 1
	pos_N = cfd['pos'].N()
	neg_N = cfd['neg'].N()
	N = fd.N()
	score_list = dict()
	for word, freq in fd.iteritems():
		pos_score = BigramAssocMeasures.chi_sq(cfd['pos'][word], (freq, pos_N), N)
		neg_score = BigramAssocMeasures.chi_sq(cfd['neg'][word], (freq, neg_N), N)
		score_list[word] = pos_score + neg_score

	best_topwords = sorted(score_list.iteritems(), key=lambda kk:kk[1], reverse=True)
	# print json.dumps(best_topwords[-100:-1], ensure_ascii=False)
	best_topwords = best_topwords[:n]
	# print json.dumps(best_topwords[:100], ensure_ascii=False)
	best_topwords = set(word for word, freq in best_topwords)
	return pos_feature, neg_feature, best_topwords
Example #13
from nltk import FreqDist
# Download all books (http://www.nltk.org/data.html)
# NOTE: if this does not work, run this code in Python from the terminal (not from inside an IDE)
# nltk.download()

# Import a text and examine its words
from nltk.corpus import brown
brown.words()

# Find the frequency of each word in a text
fd = FreqDist(brown.words())

# Find the most frequent words in a text:
# http://stackoverflow.com/questions/268272/getting-key-with-maximum-value-in-dictionary
import operator
max(fd.iteritems(), key=operator.itemgetter(1))
sorted(fd.iteritems(), key=operator.itemgetter(1), reverse=True)[:10]
# Or use the wrapper function
fd.most_common(10)

# plot the most frequent words
fd.plot(10)
fd.plot(10, cumulative=True)

# See the words that occur only once (these are called hapaxes)
fd.hapaxes()

# Count all the words
len(brown.words())
# Count unique words
len(set(brown.words()))
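On Python 3 the iteritems() calls above become items(); a minimal equivalent of the two frequency lookups, assuming the same fd:

import operator
max(fd.items(), key=operator.itemgetter(1))                         # single most frequent (word, count) pair
sorted(fd.items(), key=operator.itemgetter(1), reverse=True)[:10]   # top ten, same result as fd.most_common(10)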
Example #14
train_word_pos = get_word_pos(train_sents)
train_words = get_words(train_word_pos)
train_pos = get_pos(train_word_pos)

train_pos_unique = get_num_tags(train_pos)
#NUMBER OF POS TAGS
train_num_pos_tags = len(train_pos_unique)

#NUMBER OF TRAINING SENTENCES
num_train_sent = get_number_of_sentences(train_pos_unique)

train_num_of_words = get_number_of_words(train_words)

train_unigrams = ngrams(train_pos, 1)
train_unigram_freq_dist = FreqDist(train_unigrams)
train_unigram_dict = {k[0]: v for k, v in train_unigram_freq_dist.iteritems()}
#print unigram_dict

train_trans_bigrams = ngrams(train_pos, 2)
train_trans_bigram_freq_dist = FreqDist(train_trans_bigrams)
train_trans_bigram_dict = {
    k[0] + "|" + k[1]: v
    for k, v in train_trans_bigram_freq_dist.iteritems()
}

train_word_pos_bigrams = ngrams(train_word_pos, 2)
train_word_pos_bigram_freq_dist = FreqDist(train_word_pos_bigrams)
train_word_pos_bigram_dict = {
    k[0][0] + "|" + k[0][1]: v
    for k, v in train_word_pos_bigram_freq_dist.iteritems()
}
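These dictionaries look like the counts needed for maximum-likelihood transition estimates in an HMM-style tagger, P(t2 | t1) = count(t1, t2) / count(t1); a small sketch under that assumption, using the dictionaries defined above:

def transition_prob(t1, t2):
    # MLE estimate of P(t2 | t1) from the tag bigram / unigram counts built above
    bigram_count = train_trans_bigram_dict.get(t1 + "|" + t2, 0)
    unigram_count = train_unigram_dict.get(t1, 0)
    return float(bigram_count) / unigram_count if unigram_count else 0.0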
Example #15
import re, os, glob
from sys import argv
from nltk import FreqDist, PorterStemmer, ingrams
assert len(argv) == 4, "usage: %s inputdir outputdir dev|test" % argv[0]
assert os.path.isdir(argv[1])
indir = argv[1]
wordposngram = "%s/" % argv[2]
assert argv[3] in ("dev", "test")
devortest = argv[3]
leaves = re.compile(r" ([^ )]+)\)")
pos = re.compile(r"\(([^ ]+) [^ )]+\)")
porter = PorterStemmer()
print "extracting ngrams"
for train in glob.glob("%s/*.*.train" % indir):
    fold = int(train.split(".")[-2])
    if fold > 3: continue
    wordpostrigrams = FreqDist(
        ingrams((porter.stem(word) + "/" + tag for t in open(train)
                 for word, tag in zip(leaves.findall(t), pos.findall(t))), 3))
    for test in glob.glob("%s/*/*.%d.%s*" % (indir, fold, devortest)):
        output = "%s_%s" % (train.split("/")[-1], test.split("/")[-1])
        testtrigrams = FreqDist(
            ingrams(
                (porter.stem(word) + "/" + tag for t in open(test).readlines()
                 for word, tag in zip(leaves.findall(t), pos.findall(t))), 3))
        open(wordposngram + output,
             "w").writelines("%s\t%d\n" % (" ".join(a), b)
                             for a, b in testtrigrams.iteritems()
                             if wordpostrigrams[a])
        print output
print "done"
Example #16
    def get_frequent_features(self, min_support):
        # get n item sets
        dist = FreqDist(self.candidate_feature_list())
        features = [(item, count) for (item, count) in dist.iteritems()
                    if count >= min_support]
        return self.prune_features(features, 3)
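The same min-support filter can also be written against most_common(), which returns the (item, count) pairs already sorted by count; a small self-contained sketch (the candidate list is toy data):

from nltk import FreqDist

candidates = ['battery', 'screen', 'battery', 'price', 'battery', 'screen']
min_support = 2
dist = FreqDist(candidates)
frequent = [(item, count) for item, count in dist.most_common() if count >= min_support]
# -> [('battery', 3), ('screen', 2)]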