Example #1
         num_words += result['WORDS']
         min_words = min(min_words, result['WORDS'])
         max_words = max(max_words, result['WORDS'])
         all_words.extend(words)
         num_bigrams += result['BIGRAMS']
         all_bigrams.extend(bigrams)
         write_status(i + 1, num_tweets)
 num_emojis = num_pos_emojis + num_neg_emojis
 unique_words = list(set(all_words))
 with open(sys.argv[1][:-4] + '-unique.txt', 'w') as uwf:
     uwf.write('\n'.join(unique_words))
 num_unique_words = len(unique_words)
 num_unique_bigrams = len(set(all_bigrams))
 print('\nCalculating frequency distribution')
 # Unigrams
 freq_dist = FreqDist(all_words)
 pkl_file_name = sys.argv[1][:-4] + '-freqdist.pkl'
 with open(pkl_file_name, 'wb') as pkl_file:
     pickle.dump(freq_dist, pkl_file)
     print(freq_dist)
 print('Saved uni-frequency distribution to %s' % pkl_file_name)
 # Bigrams
 bigram_freq_dist = get_bigram_freqdist(all_bigrams)
 bi_pkl_file_name = sys.argv[1][:-4] + '-freqdist-bi.pkl'
 with open(bi_pkl_file_name, 'wb') as pkl_file:
     pickle.dump(bigram_freq_dist, pkl_file)
 print('Saved bi-frequency distribution to %s' % bi_pkl_file_name)
 print('\n[Analysis Statistics]')
 print(
     'Reviews => Total: %d, Very Positive: %d, Positive: %d, Neutral: %d, Negative: %d, Very Negative: %d'
     % (num_tweets, num_xpos_tweets, num_pos_tweets, num_neu_tweets,
        num_neg_tweets, num_xneg_tweets))  # trailing counts inferred from the format string
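The helper get_bigram_freqdist used above is not included in this snippet; a minimal sketch, assuming all_bigrams is a list of (word, word) tuples:

from nltk import FreqDist

def get_bigram_freqdist(bigrams):
    # Count each (w1, w2) pair; for a list of hashable tuples this is
    # equivalent to FreqDist(bigrams).
    freq_dict = FreqDist()
    for bigram in bigrams:
        freq_dict[bigram] += 1
    return freq_dict
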
Example #2
# Convert all characters to Lower case
tokens_lower = [t.lower() for t in tokens_nop]
print(tokens_lower[:50])
len(set(tokens_lower))

# Create a stopword list from the standard list of stopwords available in nltk
stop = stopwords.words('english')
print(stop)

# Remove all these stopwords from the text
tokens_nostop = [t for t in tokens_lower if t not in stop]
print(tokens_nostop[:50])
len(tokens_lower)
len(tokens_nostop)
FreqDist(tokens_nostop).most_common(50)

# Now, let's do some Stemming!
# There are different stemmers available in Python. Let's take a look at a few

# The most popular stemmer
porter = nltk.PorterStemmer()
tokens_porter = [porter.stem(t) for t in tokens_nostop]
print(tokens_nostop[:50])
print(tokens_porter[:50])

# The Lancaster Stemmer - developed at Lancaster University
lancaster = nltk.LancasterStemmer()
tokens_lanc = [lancaster.stem(t) for t in tokens_nostop]
print(tokens_lanc[:50])
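
To see how the stemmers the comments mention actually differ, here is a small side-by-side sketch; the SnowballStemmer line is an addition, not part of the original snippet:

snowball = nltk.SnowballStemmer('english')
for w in ['running', 'generously', 'communication', 'organization']:
    # Porter is conservative, Lancaster is aggressive, Snowball sits in between.
    print(w, porter.stem(w), lancaster.stem(w), snowball.stem(w))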
Example #3
def createMatrices(sentences, mappings, padOneTokenSentence):
    """Map each sentence's entries to indices via the given str->idx mappings,
    tracking unknown tokens and optionally padding one-token sentences."""
    data = []
    numTokens = 0
    numUnknownTokens = 0
    missingTokens = FreqDist()
    paddedSentences = 0

    for sentence in sentences:
        row = {name: [] for name in list(mappings.keys()) + ['raw_tokens']}

        for mapping, str2Idx in mappings.items():
            if mapping not in sentence:
                continue

            for entry in sentence[mapping]:
                if mapping.lower() == 'tokens':
                    numTokens += 1
                    idx = str2Idx['UNKNOWN_TOKEN']

                    if entry in str2Idx:
                        idx = str2Idx[entry]
                    elif entry.lower() in str2Idx:
                        idx = str2Idx[entry.lower()]
                    elif wordNormalize(entry) in str2Idx:
                        idx = str2Idx[wordNormalize(entry)]
                    else:
                        numUnknownTokens += 1
                        missingTokens[wordNormalize(entry)] += 1

                    row['raw_tokens'].append(entry)
                elif mapping.lower() == 'characters':
                    idx = []
                    for c in entry:
                        if c in str2Idx:
                            idx.append(str2Idx[c])
                        else:
                            idx.append(str2Idx['UNKNOWN'])

                else:
                    idx = str2Idx[entry]

                row[mapping].append(idx)

        if len(row['tokens']) == 1 and padOneTokenSentence:
            paddedSentences += 1
            for mapping, str2Idx in mappings.items():
                if mapping.lower() == 'tokens':
                    row['tokens'].append(mappings['tokens']['PADDING_TOKEN'])
                    row['raw_tokens'].append('PADDING_TOKEN')
                elif mapping.lower() == 'characters':
                    row['characters'].append([0])
                else:
                    row[mapping].append(0)

        data.append(row)

    if numTokens > 0:
        logging.info("Unknown-Tokens: %.2f%%" %
                     (numUnknownTokens / float(numTokens) * 100))

    return data
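
A hypothetical call with a toy mapping, to show the expected shapes; the dictionary layout here is an assumption inferred from how the function reads its arguments:

import logging
from nltk import FreqDist  # createMatrices needs FreqDist in scope

logging.basicConfig(level=logging.INFO)
toy_mappings = {'tokens': {'PADDING_TOKEN': 0, 'UNKNOWN_TOKEN': 1, 'the': 2, 'cat': 3}}
toy_sentences = [{'tokens': ['The', 'cat']}]
# Expected: [{'tokens': [2, 3], 'raw_tokens': ['The', 'cat']}]
print(createMatrices(toy_sentences, toy_mappings, padOneTokenSentence=False))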
Example #4
from nltk.corpus import gutenberg
from nltk import FreqDist

import matplotlib
import matplotlib.pyplot as plt

fd = FreqDist()

for text in gutenberg.fileids():
    print(text, end=' ')
    for word in gutenberg.words(text):
        fd[word] += 1
    print("......done")

samples = fd.most_common()

freqs = [freq for _, freq in samples]
ranks = [i for i in range(1, fd.B() + 1)]
# print(freqs)
# print(ranks)

plt.loglog(ranks, freqs)
plt.xlabel('rank (r)', fontsize=14, fontweight='bold')
plt.ylabel('frequency (f)', fontsize=14, fontweight='bold')
plt.grid(True)
plt.show()
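
As an aside, the counting loop above can be collapsed into a single pass, since FreqDist accepts any iterable of samples; a sketch with identical results:

fd = FreqDist(word for fileid in gutenberg.fileids() for word in gutenberg.words(fileid))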
Example #5
import string
#Step through each index of the list with all file names and texts to output
#  corresponding lexical diversity and load a FreqDist list.
while (i * 2 < (len(all_texts))):
    #Load filtered_texts with the string list to be worked on
    filtered_texts = all_texts[i * 2 + 1]
    #Remove all punctuation from filtered_texts
    filtered_texts = [
        ''.join(c for c in s if c not in string.punctuation)
        for s in filtered_texts
    ]
    #Remove all empty strings left over from removing punctuation
    filtered_texts = [s for s in filtered_texts if s]
    #Calculate and store FreqDist of each text in an index of freqdist
    #Lowercase all words so capital and lowercase are counted together
    freqdist.append(FreqDist([x.lower() for x in filtered_texts]))
    #Write to new list to be shuffled
    randomized_text = filtered_texts
    #Counter set for next while loop
    j = 0
    #List to hold MTLD values that will be averaged for final score
    random_mtlds = list()
    #Calculate MTLD score more times for shorter texts to get a more
    #  consistent average.  Calculate a minimum of 25 times.

    if (100000 / (len(randomized_text)) < 25):
        while j < 25:
            #Shuffle list
            random.shuffle(randomized_text)
            #Loading random_mtlds list of length 'j' with mtld values
            random_mtlds.append(mtld(randomized_text))
Example #6
import nltk
from nltk import FreqDist
from nltk.corpus import stopwords
from os import listdir
from os.path import isfile, join
mypath = './corpus/.'
onlyfiles=[ f for f in listdir(mypath) if isfile(join(mypath,f)) ]
filewords = []
for f in onlyfiles:
	frequencies = []
	print(f+', '+str(onlyfiles.index(f)+1)+'/'+str(len(onlyfiles)))
	if f=='.DS_Store' or f=='all.txt' or f=='all_ascii.txt':
		continue
	current = open('corpus/'+f).read()
	tokens = nltk.word_tokenize(current)
	words = [w.lower() for w in tokens]
	words = [w for w in words if not w in stopwords.words('english')]
	words = [w for w in words if (len(w) != 1 and len(w) != 2)]
	fdist1 = FreqDist(words)
	# dict_keys cannot be indexed in Python 3 and is not sorted by count,
	# so take the top 30 (word, count) pairs directly from most_common().
	for word, count in fdist1.most_common(30):
		frequencies.append([word, count])
	filewords.append([f,frequencies])

print(filewords)
print('done!')

import operator
most_freqs_dict = {}
most_freqs_sorted = []
def most_frequent():
	for f in filewords:
		fname = f[0]
		print('handling '+fname)
Example #7
 def analize(self, text):
     tokens = self.normalized_words(text)
     counts = [c for k, c in FreqDist(tokens).most_common(5)]
     self.loathing = round(sum(counts) / len(tokens), 4)
     return self.have_fake, self.loathing
def eval(tst_s, tagset, ep , tp, wds):
    if VERBOSE: print("\nStarting Testing ...")
    correct = 0
    false = 0
    count = 0
    confusion_matrix = create_table(len(ts), len(ts))
    
    counter = 0
    unks = {}
    for s in tst_s:
        
        print("Testing progress: %d%% \r" % (counter * 100/len(tst_s) + 1), end='', flush=True)
        counter += 1
        s =[('<s>', '<s>')]+ s + [('</s>', '</s>')]
        
        for i in range(len(s)):
            w = s[i][0]
            if len(w) > 1 and w not in wds and w[0].isupper() and i > 1:
                s[i] = ('N-UNK', s[i][1])
            if w not in wds:
                for suf in suffixes:
                    if w.endswith(suf):
                        s[i] = ("UNK-" + suf, s[i][1])
                        # count this occurrence (initialise to 1, not 0, on first sight)
                        unks[suf] = unks.get(suf, 0) + 1

        
        ws = [w for (w, _) in s]
        tgs = [t for (_, t) in s]
        
        ptags = viterbi(ws, tagset, ep, tp)
        for i in range(len(tgs)):
            count += 1
            if tgs[i] == ptags[i]:
                correct += 1
            else:
                false += 1

            confusion_matrix[ts.index(tgs[i])][ts.index(ptags[i])] += 1
    print("")
    #print stats
    
    if VERBOSE:
        ntwords= [w for s in test_sents for (w, _) in s]
        ntwc = FreqDist(ntwords)
        print("* Test type count:\t" + str(len(ntwc.keys())))
        print("* Test token count:\t" + str(len(ntwords)))
        print("* UNK occurences (% of total words)")
        for unk in unks:
            print("{}\t{} ({}%)".format(unk, str(unks[unk]), str(unks[unk] * 100/len(ntwords))))
        print("")

   # maxval = max([max(l) for l in confusion_matrix])
   # confusion_matrix = [[v/maxval for v in l] for l in confusion_matrix]
   # fig, ax = plt.subplots()
   # im = ax.imshow(confusion_matrix, cmap='Reds', vmin=0.05)
   # ax.set_xticks(range(len(ts)))
   # ax.set_yticks(range(len(ts)))
   # ax.set_xticklabels(ts)
   # ax.set_yticklabels(ts)
   # plt.setp(ax.get_xticklabels(), rotation=45, rotation_mode='anchor')
   # ax.set_title("Confusion Matrix Heatmap")
   # fig.tight_layout()
   # plt.show()
    print("-" * 48 + "Confusion Matrix" + '-' * 48)
    print("a\p\t" + "\t".join(ts))
    for i in range(len(confusion_matrix)):
        line = ts[i] + "\t"
        for j in range(len(confusion_matrix)):
                line += str(confusion_matrix[i][j])+ "\t"
        print(line)
    print("Recall :" + str((correct * 100)/count))
Example #9
from __future__ import print_function
from nltk.metrics import *

reference = 'DET NN VB DET JJ NN NN IN DET NN'.split()
test = 'DET VB VB DET NN NN NN IN DET NN'.split()
print(accuracy(reference, test))

reference_set = set(reference)
test_set = set(test)
print(precision(reference_set, test_set))
print(recall(reference_set, test_set))
print(f_measure(reference_set, test_set))

from nltk import FreqDist, MLEProbDist
pdist1 = MLEProbDist(FreqDist("aldjfalskfjaldsf"))
pdist2 = MLEProbDist(FreqDist("aldjfalssjjlldss"))
print(log_likelihood(['a', 'd'], [pdist1, pdist2]))

edit_distance("rain", "shine")

s1 = set([1,2,3,4])
s2 = set([3,4,5])
print(binary_distance(s1, s2))
print(jaccard_distance(s1, s2))
print(masi_distance(s1, s2))

print(spearman_correlation({'e':1, 't':2, 'a':3}, {'e':1, 'a':2, 't':3}))

s1 = "000100000010"
s2 = "000010000100"
Example #10
def stats(samples):
    _, labels = zip(*samples)
    dist = FreqDist(labels)
    size = len(samples)
    for key in dist.keys():
        logging.info("%s\t%f" % (key, dist[key] / float(size)))
def train(train_sents):
    # initial count (before preprocessing)
    words = [w for s in train_sents for (w, _) in s]
    wc = FreqDist(words)
    type_count = len(wc.keys())


    train_duration = time.time()
    pp_duration = 0
 
    tags = []
    total_bigrams = []
    wt_pairs = []
    nwords= [] #processed words

    counter = 0
    for sentence in train_sents:
        sentence = [('<s>', '<s>')] + sentence + [('</s>', '</s>')]
        stags = []
        print("Training progress: %d%% \r" % (counter * 100/len(train_sents) + 1), end='', flush=True)
        counter += 1
        for i in range(len(sentence)):
            if PREPROCESS:
                start = time.time()
                word = sentence[i][0]
                # --- start preprocessing
                if wc[word] <= RARETHRESH :
                    if PP_CAP:
                        if len(word) > 1 and word[0].isupper() and i > 1:
                            sentence[i]= ('N-UNK', sentence[i][1])
                    if PP_SUF:
                        for suf in suffixes:
                            if word.endswith(suf):
                                sentence[i]= ('UNK-' + suf, sentence[i][1])
                pp_duration += time.time() - start
                # ----

            stags += [sentence[i][1]]
            nwords += [sentence[i][0]]
            wt_pairs += [sentence[i]]


        total_bigrams += list(bigrams(stags))
        tags += stags
    words = nwords    
    wc = FreqDist(words)
    types = list(wc.keys())
    print("")
    tag_count = FreqDist(tags)
    ts = list(tag_count.keys())

    train_duration = time.time() - train_duration
    if VERBOSE:
        # tag stats
        print("* Training lasted:\t" + str(train_duration) + "ms")
        print("* Preprocessing lasted:\t{}ms ({}%)".format(str(pp_duration), str(int(pp_duration * 100/train_duration))))
        print("* Tags occurences :")
        for t in tag_count:
            if t in ['<s>', '</s>']:
                continue
            print("{}\t{}".format(t, str(tag_count[t])))
        print("* Type count :\t" + str(type_count))
        print("* Token Count:\t" + str(len(words)))
        # graph generation
        # tag distribution
        #plt.barh(list(tag_count.keys()), list(tag_count.values()))
        #plt.xlabel("Number of Occurences")
        #plt.ylabel("Tags")
        #plt.title("Distribution of tags in the training data")
       # plt.show()
        # word frequency distribution
        
        #print(sorted(list(wc.keys()), key= lambda key: wc[key], reverse=True)[:9])
        #plt.hist(x=list(wc.values()), bins='auto', log=True)
        #plt.ylabel("Frequency")
        #plt.xlabel("Number of word occurences")
        #plt.title("Word frequency distribution")
        #plt.savefig("wordfreq.png", dpi=400)
        
    return wt_pairs, words, types, wc, tags, total_bigrams, ts
Example #12
>>> print(x)
['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an', 'accident', '.', 'one', 'of', 'the', 'guys', 'dies', ',', 'but', 'his', 'girlfriend', 'continues', 'to', 'see', 'him', 'in', 'her', 'life', ',', 'and', 'has', 'nightmares', '.', 'what', "'", 's', 'the', 'deal', '?', 'watch', 'the', 'movie', 'and', '"', 'sorta', '"', 'find', 'out', '.', '.', '.', 'critique', ':', 'a', 'mind', '-', 'f**k', 'movie', 'for', 'the', 'teen', 'generation', 'that', 'touches', 'on', 'a', 'very', 'cool', 'idea', ',', 'but', 'presents', 'it', 'in', 'a', 'very', 'bad', 'package', '.', 'which', 'is', 'what', 'makes', 'this', 'review', 'an', 'even', 'harder', 'one', 'to', 'write', ',', 'since', 'i', 'generally', 'applaud', 'films', 'which', 'attempt', 'to', 'break', 'the', 'mold', ',', 'mess', 'with', 'your', 'head', 'and', 'such', '(', 'lost', 'highway', '&', 'memento', ')', ',', 'but', 'there', 'are', 'good', 'and', 'bad', 'ways', 'of', 'making', 'all', 'types', 'of', 'films', ',', 'and', 'these', 'folks', 'just', 'didn', "'", 't', 'snag', 'this', 'one', 'correctly', '.', 'they', 'seem', 'to', 'have', 'taken', 'this', 'pretty', 'neat', 'concept', ',', 'but', 'executed', 'it', 'terribly', '.', 'so', 'what', 'are', 'the', 'problems', 'with', 'the', 'movie', '?', 'well', ',', 'its', 'main', 'problem', 'is', 'that', 'it', "'", 's', 'simply', 'too', 'jumbled', '.', 'it', 'starts', 'off', '"', 'normal', '"', 'but', 'then', 'downshifts', 'into', 'this', '"', 'fantasy', '"', 'world', 'in', 'which', 'you', ',', 'as', 'an', 'audience', 'member', ',', 'have', 'no', 'idea', 'what', "'", 's', 'going', 'on', '.', 'there', 'are', 'dreams', ',', 'there', 'are', 'characters', 'coming', 'back', 'from', 'the', 'dead', ',', 'there', 'are', 'others', 'who', 'look', 'like', 'the', 'dead', ',', 'there', 'are', 'strange', 'apparitions', ',', 'there', 'are', 'disappearances', ',', 'there', 'are', 'a', 'looooot', 'of', 'chase', 'scenes', ',', 'there', 'are', 'tons', 'of', 'weird', 'things', 'that', 'happen', ',', 'and', 'most', 'of', 'it', 'is', 'simply', 'not', 'explained', '.', 'now', 'i', 'personally', 'don', "'", 't', 'mind', 'trying', 'to', 'unravel', 'a', 'film', 'every', 'now', 'and', 'then', ',', 'but', 'when', 'all', 'it', 'does', 'is', 'give', 'me', 'the', 'same', 'clue', 'over', 'and', 'over', 'again', ',', 'i', 'get', 'kind', 'of', 'fed', 'up', 'after', 'a', 'while', ',', 'which', 'is', 'this', 'film', "'", 's', 'biggest', 'problem', '.', 'it', "'", 's', 'obviously', 'got', 'this', 'big', 'secret', 'to', 'hide', ',', 'but', 'it', 'seems', 'to', 'want', 'to', 'hide', 'it', 'completely', 'until', 'its', 'final', 'five', 'minutes', '.', 'and', 'do', 'they', 'make', 'things', 'entertaining', ',', 'thrilling', 'or', 'even', 'engaging', ',', 'in', 'the', 'meantime', '?', 'not', 'really', '.', 'the', 'sad', 'part', 'is', 'that', 'the', 'arrow', 'and', 'i', 'both', 'dig', 'on', 'flicks', 'like', 'this', ',', 'so', 'we', 'actually', 'figured', 'most', 'of', 'it', 'out', 'by', 'the', 'half', '-', 'way', 'point', ',', 'so', 'all', 'of', 'the', 'strangeness', 'after', 'that', 'did', 'start', 'to', 'make', 'a', 'little', 'bit', 'of', 'sense', ',', 'but', 'it', 'still', 'didn', "'", 't', 'the', 'make', 'the', 'film', 'all', 'that', 'more', 'entertaining', '.', 'i', 'guess', 'the', 'bottom', 'line', 'with', 'movies', 'like', 'this', 'is', 'that', 'you', 'should', 'always', 'make', 'sure', 'that', 'the', 'audience', 'is', '"', 'into', 'it', '"', 'even', 
'before', 'they', 'are', 'given', 'the', 'secret', 'password', 'to', 'enter', 'your', 'world', 'of', 'understanding', '.', 'i', 'mean', ',', 'showing', 'melissa', 'sagemiller', 'running', 'away', 'from', 'visions', 'for', 'about', '20', 'minutes', 'throughout', 'the', 'movie', 'is', 'just', 'plain', 'lazy', '!', '!', 'okay', ',', 'we', 'get', 'it', '.', '.', '.', 'there', 'are', 'people', 'chasing', 'her', 'and', 'we', 'don', "'", 't', 'know', 'who', 'they', 'are', '.', 'do', 'we', 'really', 'need', 'to', 'see', 'it', 'over', 'and', 'over', 'again', '?', 'how', 'about', 'giving', 'us', 'different', 'scenes', 'offering', 'further', 'insight', 'into', 'all', 'of', 'the', 'strangeness', 'going', 'down', 'in', 'the', 'movie', '?', 'apparently', ',', 'the', 'studio', 'took', 'this', 'film', 'away', 'from', 'its', 'director', 'and', 'chopped', 'it', 'up', 'themselves', ',', 'and', 'it', 'shows', '.', 'there', 'might', "'", 've', 'been', 'a', 'pretty', 'decent', 'teen', 'mind', '-', 'f**k', 'movie', 'in', 'here', 'somewhere', ',', 'but', 'i', 'guess', '"', 'the', 'suits', '"', 'decided', 'that', 'turning', 'it', 'into', 'a', 'music', 'video', 'with', 'little', 'edge', ',', 'would', 'make', 'more', 'sense', '.', 'the', 'actors', 'are', 'pretty', 'good', 'for', 'the', 'most', 'part', ',', 'although', 'wes', 'bentley', 'just', 'seemed', 'to', 'be', 'playing', 'the', 'exact', 'same', 'character', 'that', 'he', 'did', 'in', 'american', 'beauty', ',', 'only', 'in', 'a', 'new', 'neighborhood', '.', 'but', 'my', 'biggest', 'kudos', 'go', 'out', 'to', 'sagemiller', ',', 'who', 'holds', 'her', 'own', 'throughout', 'the', 'entire', 'film', ',', 'and', 'actually', 'has', 'you', 'feeling', 'her', 'character', "'", 's', 'unraveling', '.', 'overall', ',', 'the', 'film', 'doesn', "'", 't', 'stick', 'because', 'it', 'doesn', "'", 't', 'entertain', ',', 'it', "'", 's', 'confusing', ',', 'it', 'rarely', 'excites', 'and', 'it', 'feels', 'pretty', 'redundant', 'for', 'most', 'of', 'its', 'runtime', ',', 'despite', 'a', 'pretty', 'cool', 'ending', 'and', 'explanation', 'to', 'all', 'of', 'the', 'craziness', 'that', 'came', 'before', 'it', '.', 'oh', ',', 'and', 'by', 'the', 'way', ',', 'this', 'is', 'not', 'a', 'horror', 'or', 'teen', 'slasher', 'flick', '.', '.', '.', 'it', "'", 's', 'just', 'packaged', 'to', 'look', 'that', 'way', 'because', 'someone', 'is', 'apparently', 'assuming', 'that', 'the', 'genre', 'is', 'still', 'hot', 'with', 'the', 'kids', '.', 'it', 'also', 'wrapped', 'production', 'two', 'years', 'ago', 'and', 'has', 'been', 'sitting', 'on', 'the', 'shelves', 'ever', 'since', '.', 'whatever', '.', '.', '.', 'skip', 'it', '!', 'where', "'", 's', 'joblo', 'coming', 'from', '?', 'a', 'nightmare', 'of', 'elm', 'street', '3', '(', '7', '/', '10', ')', '-', 'blair', 'witch', '2', '(', '7', '/', '10', ')', '-', 'the', 'crow', '(', '9', '/', '10', ')', '-', 'the', 'crow', ':', 'salvation', '(', '4', '/', '10', ')', '-', 'lost', 'highway', '(', '10', '/', '10', ')', '-', 'memento', '(', '10', '/', '10', ')', '-', 'the', 'others', '(', '9', '/', '10', ')', '-', 'stir', 'of', 'echoes', '(', '8', '/', '10', ')']
>>> print (documents[0])
(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg')
>>> from random import shuffle
>>> shuffle(documents) #shuffle the document list
>>> 
>>> #Feature Extraction
>>> all_words = [word.lower() for word in movie_reviews.words()]
>>> print (all_words[:10]) #print first 10 words
['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party']
>>> 
>>> #Creating a frequency distribution
>>> from nltk import FreqDist

>>> all_words_frequency = FreqDist(all_words)
>>> print (all_words_frequency)
<FreqDist with 39768 samples and 1583820 outcomes>
>>> print (all_words_frequency.most_common(10)) #print 10 most frequently occurring words
[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822)]
>>> 
>>> #Data Cleaning
>>> from nltk.corpus import stopwords
>>> stopwords_english = stopwords.words('english')
>>> print(stopwords_english)
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
>>> all_words_without_stopwords = [word for word in all_words if word not in stopwords_english]
>>> print (all_words_without_stopwords[:10]) #print first 10 words
['plot', ':', 'two', 'teen', 'couples', 'go', 'church', 'party', ',', 'drink']
>>> 
>>> #Remove punctuations
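The transcript breaks off at the punctuation step; a minimal continuation sketch, assuming string.punctuation covers the tokens to drop:
>>> import string
>>> all_words_clean = [word for word in all_words_without_stopwords if word not in string.punctuation]
>>> print (all_words_clean[:10])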
Example #13
    def calculate_emission_prob(self):

        for tag in self.tag_type_set:
            words = [w for (w, t) in self.unified_sentences if t == tag]
            self.emission_prob[tag] = WittenBellProbDist(FreqDist(words), bins=1e6)
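
Each entry built this way follows nltk's ProbDistI interface, so a later lookup might read (tag and word chosen arbitrarily):

            # p_emit = self.emission_prob['NN'].prob('dog')  # smoothed P(word | tag)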
Example #14
cleaned_text_tokens = []

for tokens in text_tokens:
    cleaned_text_tokens.append(remove_noise(tokens, stop_words))

# compare before with after
print(text_tokens[2510])
print(cleaned_text_tokens[2510])
'''
Determining Word Density
'''

# not sensible to do this for one entry, so lets look at all...


def get_all_words(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token


all_pos_words = get_all_words(cleaned_text_tokens)

from nltk import FreqDist

freq_dist_pos = FreqDist(all_pos_words)
print(freq_dist_pos.most_common(10))
'''
Preparing data for the model
'''
#!/usr/bin/python3
# coding: utf-8
from nltk.corpus import sinica_treebank
##################################################################
## Simple sanity check
print(type(sinica_treebank))  # <class 'nltk.corpus.reader.sinica_treebank.SinicaTreebankCorpusReader'>
print(len(sinica_treebank.words()))  # 91627
print(sinica_treebank.words())  # ['一', '友情', '嘉珍', '和', '我', '住在', '同一條', '巷子', '我們', ...]
# Look inside ~/nltk_data/corpora/sinica_treebank/ directly and you will see many other characters
##################################################################
## Generate the Chinese-pinyin file 38k-cn-words-pinyin-sorted-by-frequency.txt
import re
from nltk import FreqDist
from pypinyin import pinyin, lazy_pinyin, Style
fd = FreqDist(sinica_treebank.words())
print(len(list(fd.keys())))  # 17273; count after deduplication
print(len(fd.most_common()))  # 17273
chars = ''.join([x[0] for x in fd.most_common()])  # renamed so the built-in str is not shadowed
print(len(chars))  # 38844
chars = re.sub('[^\u4e00-\u9fa5]', '', chars)
print(len(chars))  # 38225; punctuation stripped
with open('38k-cn-words-pinyin-sorted-by-frequency.txt', 'w') as f:
    f.write('\n'.join(lazy_pinyin(chars)))
Example #16
 def test_empty_string(self):
     test = NgramCounter("")
     self.assertNotIn(2, test)
     self.assertEqual(test[1], FreqDist())
Example #17
"""
with open('moby.csv', encoding='utf-8-sig', mode='w') as fp:
    fp.write('pal|freq\n')
    for tag, count in ocm.items():
        fp.write('{}|{}\n'.format(tag, count))

with open('zara.csv', encoding='utf-8-sig', mode='w') as fp:
    fp.write('pal|freq\n')
    for tag, count in octt.items():
        fp.write('{}|{}\n'.format(tag, count))
"""
Frequency
"""
from nltk import FreqDist

fdist1 = FreqDist(ocm)
fdist1.plot(25, cumulative=True)

fdist2 = FreqDist(octt)
fdist2.plot(25, cumulative=True)
"""
Dispersion map
"""
from nltk.text import Text

textListNLTK = Text(m)
textListNLTK.concordance('also')
marcommon = []
for item in comt[0:30]:  # 'item' instead of shadowing the built-in list
    for word in item[0]:
        marcommon.append(word)
Example #18
 def test_empty_list(self):
     test = NgramCounter([])
     self.assertNotIn(2, test)
     self.assertEqual(test[1], FreqDist())
Example #19
from operator import itemgetter
from nltk import FreqDist

# Read the given corpus.
words = []
with open('text8', 'r') as f:
    for line in f:
        for word in line.split():
            words.append(word)
# no f.close() needed: the with-block closes the file automatically

ABSS_word = words[:100000]

# Find the most frequent words (top 1,000 here) in the corpus sample.
f = open('Most_Fre_data.txt', 'w')
f.write('Words frequency in given corpus')
f.write('\n')
fdist = FreqDist(ABSS_word)
# most_common() already returns (word, count) pairs sorted by descending count,
# so the dict()/sorted() round-trip below only preserves that order.
dict1 = fdist.most_common(1000)
dict1 = dict(dict1)

sorted_dict = sorted(dict1.items(), key=itemgetter(1), reverse=True)
lis = [item[0] for item in sorted_dict]
lis.append("UNK")

# Write the most frequent words with their frequencies.
for item in sorted_dict:
    f.write(str(item[0]))
    f.write(' = ')
    f.write(str(item[1]))
    f.write('\n')
f.close()
Example #20
 def test_None(self):
     test = NgramCounter(None)
     self.assertNotIn(2, test)
     self.assertEqual(test[1], FreqDist())
Example #21
    def classify(self, sentence, k):
        self.query = sentence
        #idf
        self.idf = []
        self.query = self.query.split()
        for i in range(len(self.query)):
            # count how many documents contain this term
            self.numDocWithThisTerm = 0
            for j in range(self.allDoc):
                # loop variable renamed to m so the kNN parameter k is not clobbered
                for m in range(len(self.tf[j])):
                    if self.query[i] in self.tf[j][m]:
                        self.numDocWithThisTerm = self.numDocWithThisTerm + 1
            if self.numDocWithThisTerm > 0:
                self.temp = 1.0 + math.log(
                    float(self.allDoc / self.numDocWithThisTerm))
            else:
                self.temp = 1.0
            self.idf = self.idf + [(self.query[i], self.temp)]

        #tfidf
        self.tfidf = []
        for i in range(len(self.query)):
            for j in range(self.allDoc):
                # get the tf entry for this term (again using m instead of k)
                for m in range(len(self.tf[j])):
                    if self.query[i] in self.tf[j][m]:
                        self.tfTemp = self.tf[j][m][1]
                        break
                    else:
                        self.tfTemp = 0
                # get the idf entry for this term
                for l in range(len(self.idf)):
                    if self.query[i] in self.idf[l]:
                        self.idfTemp = self.idf[l][1]
                self.tfidf = self.tfidf + [self.tfTemp * self.idfTemp]
        self.tfidf = [i for i in zip(*[iter(self.tfidf)] * self.allDoc)]

        #tf query
        self.freqQuery = FreqDist(self.query)
        self.tfQuery = self.freqQuery.most_common()
        #normalized tf query
        for i in range(len(self.tfQuery)):
            self.tfQuery[i] = (self.tfQuery[i][0],
                               self.tfQuery[i][1] / len(self.query))
        # idf of the query = the idf computed above (identical)
        #tf idf query
        self.tfidfQuery = []
        for i in range(len(self.tfQuery)):
            self.tfidfQuery = self.tfidfQuery + [
                self.tfQuery[i][1] * self.idf[i][1]
            ]

        #cosine similarity
        self.cosSim = []
        for i in range(self.allDoc):
            self.dotProduct = 0
            self.queryAll = 0
            self.docI = 0
            for j in range(len(self.tfQuery)):
                self.dotProduct = self.dotProduct + (self.tfidfQuery[j] *
                                                     self.tfidf[j][i])
                self.queryAll = self.queryAll + (self.tfidfQuery[j]**2)
                self.docI = self.docI + (self.tfidf[j][i]**2)
            if self.docI == 0:
                self.temp = 0  # handle a term that never appears in any document, so the result does not become infinite
            else:
                self.temp = self.dotProduct / (math.sqrt(self.queryAll) *
                                               math.sqrt(self.docI))
            self.cosSim = self.cosSim + [self.temp]

        self.indexMax = [
            i for i, val in enumerate(self.cosSim) if val == max(self.cosSim)
        ]
        self.closestClass = []
        for i in range(len(self.indexMax)):
            if self.indexMax[i] >= len(self.allTermNeg):
                self.closestClass = self.closestClass + ['Positif']
            else:
                self.closestClass = self.closestClass + ['Negatif']

        self.freqClass = Counter(self.closestClass).most_common()
        if len(self.freqClass
               ) == 2 and self.freqClass[0][1] == self.freqClass[1][1]:
            return 'Positif/Negatif'
        elif k > len(self.closestClass):
            self.closestClass = sorted(self.closestClass,
                                       key=self.closestClass.count,
                                       reverse=True)
            return self.closestClass[0]
        else:
            #self.closestClass = sorted(self.closestClass, key = self.closestClass.count)
            return self.closestClass[k - 1]
Example #22

def pmi(a, b):
    return log(pairs[a, b]) - log(pairs.N()) - log(unigrams[a]) - log(
        unigrams[b]) + 2 * log(unigrams.N())


h = FrameHierarchy.load()
# training data contains a bad frame
valid_names = {f.name for f in h._frames.values()}

with codecs.open("../../../training/data/naacl2012/cv.train.sentences.json",
                 encoding="utf8") as train_file:
    train = [json.loads(line) for line in train_file]

unsorted_frames = ([(f['target']['spans'][0]['start'], f['target']['name'])
                    for f in s['frames']] for s in train)
frames = [[name for start, name in sorted(s) if name in valid_names]
          for s in unsorted_frames]
del unsorted_frames
unigrams = FreqDist(chain(*frames))
pairs = FreqDist(
    chain(*[[tuple(sorted(b)) for b in combinations(f, 2)] for f in frames]))
pmis = FreqDist({(a, b): pmi(a, b)
                 for a, b in pairs.keys()
                 if unigrams[a] >= THRESHOLD and unigrams[b] >= THRESHOLD})

unigrams_with_ancestors = FreqDist(unigrams)
for u in unigrams:
    for a in h.ancestors(h._frames[u]):
        # FreqDist.inc() was removed in NLTK 3; increment by indexing instead
        unigrams_with_ancestors[a.name] += 1
Example #23
print(len(text1) /
      len(set(text1)))  # On average, each word in Moby Dick occurs 13 times

print(
    len(text6) / len(set(text6))
)  # As a comedic movie, we expect Monty Python to be less 'lexically rich' than
# the novel Moby Dick, and we can see this is true from this statement!

# === Part 3: Analysing frequency of words ===
# NLTK has a built in function that lets us find the frequency of words
# 'Distribution' is a fancy word for what we get if we gather up all the individual frequencies of words and store
# them somewhere. Hence the 'Frequency Distribution' of the words in a text is a sort of table of all the frequencies of
# each word in a text

from nltk import FreqDist
fd = FreqDist(
    text1
)  # We create a frequency distribution of all the words in text1 ie Moby Dick

# Now try:
print(fd['the'])  # Returns the number of times 'the' appears in the text

# In NLTK 3, fd.keys() is a plain dict view: it cannot be indexed and is not sorted
# by frequency, so use most_common() instead.
print([word for word, _ in fd.most_common(30)])  # The 30 most frequent words, in decreasing order of frequency
print(fd.most_common(30))  # Does the same as above but also shows the frequency of each word

# === Part 4: Your task ===
# At the end of corupuses.py (you can comment out the above code if it's not needed), write python code that:
# 1.) Returns the 10 most frequent words in Obama's 2009 inaugural speech, including their frequencies
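
A minimal sketch of one possible answer to task 1, assuming the standard fileid used by nltk's inaugural corpus:

from nltk.corpus import inaugural
from nltk import FreqDist

fd_obama = FreqDist(inaugural.words('2009-Obama.txt'))
print(fd_obama.most_common(10))  # the 10 most frequent words with their frequencies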
Example #24
#print(texts[:300])

texts = texts.replace('\n', '')
tokenizer = re.compile('[^ ㄱ-힣]+')
texts = tokenizer.sub('', texts)

tokens = word_tokenize(texts)

noun_token = []
for token in tokens:
    token_pos = okt.pos(token)
    temp = [txt_tag[0] for txt_tag in token_pos if txt_tag[1] == "Noun"]
    if len(''.join(temp)) > 1:
        noun_token.append("".join(temp))
texts = " ".join(noun_token)

with open(filename2, 'r', encoding='UTF-8') as f:
    stopwords = f.read()

stopwords = stopwords.split(' ')

texts = [text for text in tokens if text not in stopwords]
freqtxt = pd.Series(dict(FreqDist(texts))).sort_values(ascending=False)
print(freqtxt[:30])

wcloud = WordCloud(font, relative_scaling=0.2,
                   background_color='white').generate(" ".join(texts))
plt.figure(figsize=(140, 80))
plt.imshow(wcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
Example #25
 def __init__(self, text, id=None, category=None):
     self.text = text
     self.token_counts = FreqDist(normalized_tokens(text))
     self.id = id
     self.category = category
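
normalized_tokens is not shown in this snippet; a minimal stand-in (an assumption, not the original helper) could be:

from nltk import word_tokenize

def normalized_tokens(text):
    # lowercase, tokenize, and keep alphabetic tokens only
    return [t.lower() for t in word_tokenize(text) if t.isalpha()]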
Example #26
from nltk.corpus import PlaintextCorpusReader
from nltk import FreqDist
from gensim import corpora
import gensim

corpus_root = '/path/to/text/files/directory'
file = open("terrier-stop.txt")
stopwords = file.read().splitlines()
wordlists = PlaintextCorpusReader(corpus_root, '.*')

texts = []
for item in wordlists.fileids():
    try:
        temp = [word.lower() for word in wordlists.words(item) if word.lower() not in stopwords and word.isalpha()]
        texts.append(temp)
    except UnicodeError:
        # the comprehension variable 'word' is not visible here in Python 3; report the file instead
        print('Unicode error in', item)
        

flat_list = [item for sublist in texts for item in sublist]
fdist = FreqDist(flat_list)


dictionary = corpora.Dictionary(texts)
dictionary.save('/path/to/saving/dictionary.dict')

corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('/path/to/save/corpus/new_corpus.mm', corpus)


mm = gensim.corpora.MmCorpus('/path/to/save/corpus/new_corpus.mm')  # load the corpus saved above


lda = gensim.models.ldamodel.LdaModel(corpus=mm, id2word=dictionary, num_topics= 5, update_every=2, chunksize=10, passes=1)
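
A hypothetical follow-up to inspect the trained model (print_topics is part of gensim's LdaModel API):

for topic in lda.print_topics(num_topics=5, num_words=10):
    print(topic)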

Example #27
    def __init__(self, tokenizer=None, max_size=None, min_freq=1):
        """Basic Vocabulary object"""

        self.vocab_size = 0
        self.freqdist = FreqDist()
        self.tokenizer = tokenizer
from nltk import FreqDist
from nltk.book import text3 as book_of_genesis

# frequency distribution
fdist = FreqDist(book_of_genesis)
print(fdist)
fdist.most_common(50)  # 50 most common tokens

# cumulative frequency plot
fdist.plot(50, cumulative=True)

# consider capitalization and ignore duplicates
# words_set = set(book_of_genesis)

# ignore capitalization and duplicates
# words_set = set(word.lower() for word in book_of_genesis)

# ignore capitalization, duplicates and non-alphabetic items (numbers and punctuation characters)
words_set = set(word.lower() for word in book_of_genesis if word.isalpha())

# number of words
len(words_set)

# get words longer than 10
minimum_characters = 10
long_words = [word for word in words_set if len(word) > minimum_characters]
sorted(long_words)  # sorted alphabetically (capital letters first)

# get words longer than 7 that occur more than 7 times
minimum_characters = 7
minimum_frequency = 7
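
A sketch of the step those comments describe, using the raw token set so the counts line up with fdist:

long_frequent_words = sorted(w for w in set(book_of_genesis)
                             if len(w) > minimum_characters and fdist[w] > minimum_frequency)
print(long_frequent_words)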
Example #29
def GetMostFrequentWords(extracted, textProcesser, filterFun, topN):
    allwords = textProcesser(extracted)  # textProcesser take in a piece of text and return words or phrases (n-gram)
    # remove punctuation, numeric value and ignoreWords
    cleanedWords = [word.lower() for word in allwords if filterFun(word.lower())]
    all_words = FreqDist(cleanedWords)
    return all_words.most_common()[:topN]
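
A hypothetical call; word_tokenize as the processor and a plain alphabetic filter are assumptions, not part of the original:

from nltk import FreqDist, word_tokenize  # FreqDist must be importable for the function above

sample = "the cat sat on the mat while the other cat slept"
print(GetMostFrequentWords(sample, word_tokenize, lambda w: w.isalpha(), 5))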
Example #30
print(stopwords)

nostopwords = [word for word in allTokens if word not in stopwords]

# NLTK.Text
text = nltk.Text(nostopwords)

# Collocations (words that frequently appear together)
# Text.collocations() prints its result and returns None, so there is nothing to capture
text.collocations()

# count
print(text.count('inlet'))

# words in similar contexts (similar() also prints its result and returns None)
text.similar('ship')

text.dispersion_plot(['north', 'south', 'east', 'west'])

text.dispersion_plot(['ship', 'dock', 'boat', 'canoe', 'steamboat'])

# Frequency distributions!

from nltk import FreqDist
fdist = FreqDist(text)

print(fdist.hapaxes())  # words that occur only once

print(fdist.most_common(50))

fdist.plot(30)