num_words += result['WORDS']
min_words = min(min_words, result['WORDS'])
max_words = max(max_words, result['WORDS'])
all_words.extend(words)
num_bigrams += result['BIGRAMS']
all_bigrams.extend(bigrams)
write_status(i + 1, num_tweets)

num_emojis = num_pos_emojis + num_neg_emojis
unique_words = list(set(all_words))
with open(sys.argv[1][:-4] + '-unique.txt', 'w') as uwf:
    uwf.write('\n'.join(unique_words))
num_unique_words = len(unique_words)
num_unique_bigrams = len(set(all_bigrams))

print('\nCalculating frequency distribution')

# Unigrams
freq_dist = FreqDist(all_words)
pkl_file_name = sys.argv[1][:-4] + '-freqdist.pkl'
with open(pkl_file_name, 'wb') as pkl_file:
    pickle.dump(freq_dist, pkl_file)
print(freq_dist)
print('Saved uni-frequency distribution to %s' % pkl_file_name)

# Bigrams
bigram_freq_dist = get_bigram_freqdist(all_bigrams)
bi_pkl_file_name = sys.argv[1][:-4] + '-freqdist-bi.pkl'
with open(bi_pkl_file_name, 'wb') as pkl_file:
    pickle.dump(bigram_freq_dist, pkl_file)
print('Saved bi-frequency distribution to %s' % bi_pkl_file_name)

print('\n[Analysis Statistics]')
print(
    'Reviews => Total: %d, Very Positive: %d, Positive: %d, Neutral: %d, Negative: %d, Very Negative: %d'
    % (num_tweets, num_xpos_tweets, num_pos_tweets, num_neu_tweets,
# Convert all characters to Lower case
tokens_lower = [t.lower() for t in tokens_nop]
print(tokens_lower[:50])
len(set(tokens_lower))

# Create a stopword list from the standard list of stopwords available in nltk
stop = stopwords.words('english')
print(stop)

# Remove all these stopwords from the text
tokens_nostop = [t for t in tokens_lower if t not in stop]
print(tokens_nostop[:50])
len(tokens_lower)
len(tokens_nostop)
FreqDist(tokens_nostop).most_common(50)

# Now, let's do some Stemming!
# There are different stemmers available in Python. Let's take a look at a few

# The most popular stemmer
porter = nltk.PorterStemmer()
tokens_porter = [porter.stem(t) for t in tokens_nostop]
print(tokens_nostop[:50])
print(tokens_porter[:50])

# The Lancaster Stemmer - developed at Lancaster University
lancaster = nltk.LancasterStemmer()
tokens_lanc = [lancaster.stem(t) for t in tokens_nostop]
print(tokens_lanc[:50])
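# A hedged addition (not in the original snippet): NLTK also ships the Snowball
# stemmer, often described as an improved Porter stemmer; it can be compared the
# same way as the two stemmers above.
snowball = nltk.SnowballStemmer('english')
tokens_snow = [snowball.stem(t) for t in tokens_nostop]
print(tokens_snow[:50])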
def createMatrices(sentences, mappings, padOneTokenSentence):
    data = []
    numTokens = 0
    numUnknownTokens = 0
    missingTokens = FreqDist()
    paddedSentences = 0

    for sentence in sentences:
        row = {name: [] for name in list(mappings.keys()) + ['raw_tokens']}

        for mapping, str2Idx in mappings.items():
            if mapping not in sentence:
                continue

            for entry in sentence[mapping]:
                if mapping.lower() == 'tokens':
                    numTokens += 1
                    idx = str2Idx['UNKNOWN_TOKEN']

                    if entry in str2Idx:
                        idx = str2Idx[entry]
                    elif entry.lower() in str2Idx:
                        idx = str2Idx[entry.lower()]
                    elif wordNormalize(entry) in str2Idx:
                        idx = str2Idx[wordNormalize(entry)]
                    else:
                        numUnknownTokens += 1
                        missingTokens[wordNormalize(entry)] += 1

                    row['raw_tokens'].append(entry)
                elif mapping.lower() == 'characters':
                    idx = []
                    for c in entry:
                        if c in str2Idx:
                            idx.append(str2Idx[c])
                        else:
                            idx.append(str2Idx['UNKNOWN'])
                else:
                    idx = str2Idx[entry]

                row[mapping].append(idx)

        if len(row['tokens']) == 1 and padOneTokenSentence:
            paddedSentences += 1
            for mapping, str2Idx in mappings.items():
                if mapping.lower() == 'tokens':
                    row['tokens'].append(mappings['tokens']['PADDING_TOKEN'])
                    row['raw_tokens'].append('PADDING_TOKEN')
                elif mapping.lower() == 'characters':
                    row['characters'].append([0])
                else:
                    row[mapping].append(0)

        data.append(row)

    if numTokens > 0:
        logging.info("Unknown-Tokens: %.2f%%" % (numUnknownTokens / float(numTokens) * 100))

    return data
from nltk.corpus import gutenberg
from nltk import FreqDist
import matplotlib
import matplotlib.pyplot as plt

fd = FreqDist()
for text in gutenberg.fileids():
    print(text, end=' ')
    for word in gutenberg.words(text):
        fd[word] += 1
print("......done")

samples = fd.most_common()
freqs = [freq for _, freq in samples]
ranks = [i for i in range(1, fd.B() + 1)]
# print(freqs)
# print(ranks)

# rank is plotted on the x-axis, frequency on the y-axis
plt.loglog(ranks, freqs)
plt.xlabel('rank(r)', fontsize=14, fontweight='bold')
plt.ylabel('frequency(f)', fontsize=14, fontweight='bold')
plt.grid(True)
plt.show()
import string

# Step through each index of the list with all file names and texts to output
# corresponding lexical diversity and load a FreqDist list.
while (i * 2 < (len(all_texts))):
    # Load filtered_texts with the string list to be worked on
    filtered_texts = all_texts[i * 2 + 1]
    # Remove all punctuation from filtered_texts
    filtered_texts = [
        ''.join(c for c in s if c not in string.punctuation)
        for s in filtered_texts
    ]
    # Remove all empty strings left over from removing punctuation
    filtered_texts = [s for s in filtered_texts if s]
    # Calculate and store the FreqDist of each text in an index of freqdist
    # Lowercase all words so capital and lowercase forms are counted together
    freqdist.append(FreqDist([x.lower() for x in filtered_texts]))
    # Write to a new list to be shuffled
    randomized_text = filtered_texts
    # Counter set for the next while loop
    j = 0
    # List to hold MTLD values that will be averaged for the final score
    random_mtlds = list()
    # Calculate the MTLD score more times for shorter texts to get a more
    # consistent average. Calculate a minimum of 25 times.
    if (100000 / (len(randomized_text)) < 25):
        while j < 25:
            # Shuffle the list
            random.shuffle(randomized_text)
            # Load random_mtlds with MTLD values
            random_mtlds.append(mtld(randomized_text))
import nltk
from nltk import FreqDist
from nltk.corpus import stopwords
from os import listdir
from os.path import isfile, join

mypath = './corpus/.'
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]

filewords = []
for f in onlyfiles:
    frequencies = []
    print(f + ', ' + str(onlyfiles.index(f) + 1) + '/' + str(len(onlyfiles)))
    if f == '.DS_Store' or f == 'all.txt' or f == 'all_ascii.txt':
        continue
    current = open('corpus/' + f).read()
    tokens = nltk.word_tokenize(current)
    words = [w.lower() for w in tokens]
    words = [w for w in words if w not in stopwords.words('english')]
    words = [w for w in words if (len(w) != 1 and len(w) != 2)]
    fdist1 = FreqDist(words)
    # take the 30 most frequent words; in Python 3, FreqDist.keys() is not
    # indexable, so use most_common() rather than keys()[i]
    for word, count in fdist1.most_common(30):
        frequencies.append([word, count])
    filewords.append([f, frequencies])
print(filewords)
print('done!')

import operator

most_freqs_dict = {}
most_freqs_sorted = []


def most_frequent():
    for f in filewords:
        fname = f[0]
        print('handling ' + fname)
def analize(self, text):
    tokens = self.normalized_words(text)
    counts = [c for k, c in FreqDist(tokens).most_common(5)]
    self.loathing = round(sum(counts) / len(tokens), 4)
    return self.have_fake, self.loathing
def eval(tst_s, tagset, ep, tp, wds):
    if VERBOSE:
        print("\nStarting Testing ...")
    correct = 0
    false = 0
    count = 0
    confusion_matrix = create_table(len(ts), len(ts))
    counter = 0
    unks = {}
    for s in tst_s:
        print("Testing progress: %d%% \r" % (counter * 100 / len(tst_s) + 1), end='', flush=True)
        counter += 1
        s = [('<s>', '<s>')] + s + [('</s>', '</s>')]
        for i in range(len(s)):
            w = s[i][0]
            if len(w) > 1 and w not in wds and w[0].isupper() and i > 1:
                s[i] = ('N-UNK', s[i][1])
            if w not in wds:
                for suf in suffixes:
                    if w.endswith(suf):
                        s[i] = ("UNK-" + suf, s[i][1])
                        try:
                            unks[suf] += 1
                        except:
                            unks[suf] = 0
        ws = [w for (w, _) in s]
        tgs = [t for (_, t) in s]
        ptags = viterbi(ws, tagset, ep, tp)
        for i in range(len(tgs)):
            count += 1
            if tgs[i] == ptags[i]:
                correct += 1
            else:
                false += 1
            confusion_matrix[ts.index(tgs[i])][ts.index(ptags[i])] += 1
    print("")

    # print stats
    if VERBOSE:
        ntwords = [w for s in test_sents for (w, _) in s]
        ntwc = FreqDist(ntwords)
        print("* Test type count:\t" + str(len(ntwc.keys())))
        print("* Test token count:\t" + str(len(ntwords)))
        print("* UNK occurrences (% of total words)")
        for unk in unks:
            print("{}\t{} ({}%)".format(unk, str(unks[unk]), str(unks[unk] * 100 / len(ntwords))))
        print("")

    # maxval = max([max(l) for l in confusion_matrix])
    # confusion_matrix = [[v/maxval for v in l] for l in confusion_matrix]
    # fig, ax = plt.subplots()
    # im = ax.imshow(confusion_matrix, cmap='Reds', vmin=0.05)
    # ax.set_xticks(range(len(ts)))
    # ax.set_yticks(range(len(ts)))
    # ax.set_xticklabels(ts)
    # ax.set_yticklabels(ts)
    # plt.setp(ax.get_xticklabels(), rotation=45, rotation_mode='anchor')
    # ax.set_title("Confusion Matrix Heatmap")
    # fig.tight_layout()
    # plt.show()

    print("-" * 48 + "Confusion Matrix" + '-' * 48)
    print("a\p\t" + "\t".join(ts))
    for i in range(len(confusion_matrix)):
        line = ts[i] + "\t"
        for j in range(len(confusion_matrix)):
            line += str(confusion_matrix[i][j]) + "\t"
        print(line)

    print("Recall :" + str((correct * 100) / count))
from __future__ import print_function
from nltk.metrics import *

reference = 'DET NN VB DET JJ NN NN IN DET NN'.split()
test = 'DET VB VB DET NN NN NN IN DET NN'.split()
print(accuracy(reference, test))

reference_set = set(reference)
test_set = set(test)
precision(reference_set, test_set)
print(recall(reference_set, test_set))
print(f_measure(reference_set, test_set))

from nltk import FreqDist, MLEProbDist

pdist1 = MLEProbDist(FreqDist("aldjfalskfjaldsf"))
pdist2 = MLEProbDist(FreqDist("aldjfalssjjlldss"))
print(log_likelihood(['a', 'd'], [pdist1, pdist2]))

edit_distance("rain", "shine")

s1 = set([1, 2, 3, 4])
s2 = set([3, 4, 5])
binary_distance(s1, s2)
print(jaccard_distance(s1, s2))
print(masi_distance(s1, s2))

spearman_correlation({'e': 1, 't': 2, 'a': 3}, {'e': 1, 'a': 2, 't': 3})

s1 = "000100000010"
s2 = "000010000100"
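# Plausible continuation (hedged, not in the excerpt): the two binary strings look
# like segmentation boundary encodings, which the windowdiff metric from
# nltk.metrics compares.
print(windowdiff(s1, s1, 3))  # identical segmentations -> 0.0
print(windowdiff(s1, s2, 3))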
import logging

from nltk import FreqDist


def stats(samples):
    _, labels = zip(*samples)
    dist = FreqDist(labels)
    size = len(samples)
    for key in dist.keys():
        logging.info("%s\t%f" % (key, dist[key] / float(size)))
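# Hypothetical usage (assumed toy data, not from the original source):
# samples are (item, label) pairs, so stats() logs the share of each label.
logging.basicConfig(level=logging.INFO)
stats([("good movie", "pos"), ("great plot", "pos"), ("boring", "neg")])
# e.g. INFO:root:pos  0.666667  /  INFO:root:neg  0.333333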
def train(train_sents):
    # initial count (before preprocessing)
    words = [w for s in train_sents for (w, _) in s]
    wc = FreqDist(words)
    type_count = len(wc.keys())
    train_duration = time.time()
    pp_duration = 0
    tags = []
    total_bigrams = []
    wt_pairs = []
    nwords = []  # processed words
    counter = 0
    for sentence in train_sents:
        sentence = [('<s>', '<s>')] + sentence + [('</s>', '</s>')]
        stags = []
        print("Training progress: %d%% \r" % (counter * 100 / len(train_sents) + 1), end='', flush=True)
        counter += 1
        for i in range(len(sentence)):
            if PREPROCESS:
                start = time.time()
                word = sentence[i][0]
                # --- start preprocessing
                if wc[word] <= RARETHRESH:
                    if PP_CAP:
                        if len(word) > 1 and word[0].isupper() and i > 1:
                            sentence[i] = ('N-UNK', sentence[i][1])
                    if PP_SUF:
                        for suf in suffixes:
                            if word.endswith(suf):
                                sentence[i] = ('UNK-' + suf, sentence[i][1])
                pp_duration += time.time() - start
                # ----
            stags += [sentence[i][1]]
            nwords += [sentence[i][0]]
            wt_pairs += [sentence[i]]
        total_bigrams += list(bigrams(stags))
        tags += stags
    words = nwords
    wc = FreqDist(words)
    types = list(wc.keys())
    print("")
    tag_count = FreqDist(tags)
    ts = list(tag_count.keys())
    train_duration = time.time() - train_duration

    if VERBOSE:
        # tag stats
        print("* Training lasted:\t" + str(train_duration) + "ms")
        print("* Preprocessing lasted:\t{}ms ({}%)".format(str(pp_duration), str(int(pp_duration * 100 / train_duration))))
        print("* Tags occurrences :")
        for t in tag_count:
            if t in ['<s>', '</s>']:
                continue
            print("{}\t{}".format(t, str(tag_count[t])))
        print("* Type count :\t" + str(type_count))
        print("* Token Count:\t" + str(len(words)))

        # graph generation
        # tag distribution
        # plt.barh(list(tag_count.keys()), list(tag_count.values()))
        # plt.xlabel("Number of Occurences")
        # plt.ylabel("Tags")
        # plt.title("Distribution of tags in the training data")
        # plt.show()

        # word frequency distribution
        # print(sorted(list(wc.keys()), key=lambda key: wc[key], reverse=True)[:9])
        # plt.hist(x=list(wc.values()), bins='auto', log=True)
        # plt.ylabel("Frequency")
        # plt.xlabel("Number of word occurences")
        # plt.title("Word frequency distribution")
        # plt.savefig("wordfreq.png", dpi=400)

    return wt_pairs, words, types, wc, tags, total_bigrams, ts
>>> print(x) ['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an', 'accident', '.', 'one', 'of', 'the', 'guys', 'dies', ',', 'but', 'his', 'girlfriend', 'continues', 'to', 'see', 'him', 'in', 'her', 'life', ',', 'and', 'has', 'nightmares', '.', 'what', "'", 's', 'the', 'deal', '?', 'watch', 'the', 'movie', 'and', '"', 'sorta', '"', 'find', 'out', '.', '.', '.', 'critique', ':', 'a', 'mind', '-', 'f**k', 'movie', 'for', 'the', 'teen', 'generation', 'that', 'touches', 'on', 'a', 'very', 'cool', 'idea', ',', 'but', 'presents', 'it', 'in', 'a', 'very', 'bad', 'package', '.', 'which', 'is', 'what', 'makes', 'this', 'review', 'an', 'even', 'harder', 'one', 'to', 'write', ',', 'since', 'i', 'generally', 'applaud', 'films', 'which', 'attempt', 'to', 'break', 'the', 'mold', ',', 'mess', 'with', 'your', 'head', 'and', 'such', '(', 'lost', 'highway', '&', 'memento', ')', ',', 'but', 'there', 'are', 'good', 'and', 'bad', 'ways', 'of', 'making', 'all', 'types', 'of', 'films', ',', 'and', 'these', 'folks', 'just', 'didn', "'", 't', 'snag', 'this', 'one', 'correctly', '.', 'they', 'seem', 'to', 'have', 'taken', 'this', 'pretty', 'neat', 'concept', ',', 'but', 'executed', 'it', 'terribly', '.', 'so', 'what', 'are', 'the', 'problems', 'with', 'the', 'movie', '?', 'well', ',', 'its', 'main', 'problem', 'is', 'that', 'it', "'", 's', 'simply', 'too', 'jumbled', '.', 'it', 'starts', 'off', '"', 'normal', '"', 'but', 'then', 'downshifts', 'into', 'this', '"', 'fantasy', '"', 'world', 'in', 'which', 'you', ',', 'as', 'an', 'audience', 'member', ',', 'have', 'no', 'idea', 'what', "'", 's', 'going', 'on', '.', 'there', 'are', 'dreams', ',', 'there', 'are', 'characters', 'coming', 'back', 'from', 'the', 'dead', ',', 'there', 'are', 'others', 'who', 'look', 'like', 'the', 'dead', ',', 'there', 'are', 'strange', 'apparitions', ',', 'there', 'are', 'disappearances', ',', 'there', 'are', 'a', 'looooot', 'of', 'chase', 'scenes', ',', 'there', 'are', 'tons', 'of', 'weird', 'things', 'that', 'happen', ',', 'and', 'most', 'of', 'it', 'is', 'simply', 'not', 'explained', '.', 'now', 'i', 'personally', 'don', "'", 't', 'mind', 'trying', 'to', 'unravel', 'a', 'film', 'every', 'now', 'and', 'then', ',', 'but', 'when', 'all', 'it', 'does', 'is', 'give', 'me', 'the', 'same', 'clue', 'over', 'and', 'over', 'again', ',', 'i', 'get', 'kind', 'of', 'fed', 'up', 'after', 'a', 'while', ',', 'which', 'is', 'this', 'film', "'", 's', 'biggest', 'problem', '.', 'it', "'", 's', 'obviously', 'got', 'this', 'big', 'secret', 'to', 'hide', ',', 'but', 'it', 'seems', 'to', 'want', 'to', 'hide', 'it', 'completely', 'until', 'its', 'final', 'five', 'minutes', '.', 'and', 'do', 'they', 'make', 'things', 'entertaining', ',', 'thrilling', 'or', 'even', 'engaging', ',', 'in', 'the', 'meantime', '?', 'not', 'really', '.', 'the', 'sad', 'part', 'is', 'that', 'the', 'arrow', 'and', 'i', 'both', 'dig', 'on', 'flicks', 'like', 'this', ',', 'so', 'we', 'actually', 'figured', 'most', 'of', 'it', 'out', 'by', 'the', 'half', '-', 'way', 'point', ',', 'so', 'all', 'of', 'the', 'strangeness', 'after', 'that', 'did', 'start', 'to', 'make', 'a', 'little', 'bit', 'of', 'sense', ',', 'but', 'it', 'still', 'didn', "'", 't', 'the', 'make', 'the', 'film', 'all', 'that', 'more', 'entertaining', '.', 'i', 'guess', 'the', 'bottom', 'line', 'with', 'movies', 'like', 'this', 'is', 'that', 'you', 'should', 'always', 'make', 'sure', 'that', 'the', 'audience', 'is', '"', 'into', 'it', 
'"', 'even', 'before', 'they', 'are', 'given', 'the', 'secret', 'password', 'to', 'enter', 'your', 'world', 'of', 'understanding', '.', 'i', 'mean', ',', 'showing', 'melissa', 'sagemiller', 'running', 'away', 'from', 'visions', 'for', 'about', '20', 'minutes', 'throughout', 'the', 'movie', 'is', 'just', 'plain', 'lazy', '!', '!', 'okay', ',', 'we', 'get', 'it', '.', '.', '.', 'there', 'are', 'people', 'chasing', 'her', 'and', 'we', 'don', "'", 't', 'know', 'who', 'they', 'are', '.', 'do', 'we', 'really', 'need', 'to', 'see', 'it', 'over', 'and', 'over', 'again', '?', 'how', 'about', 'giving', 'us', 'different', 'scenes', 'offering', 'further', 'insight', 'into', 'all', 'of', 'the', 'strangeness', 'going', 'down', 'in', 'the', 'movie', '?', 'apparently', ',', 'the', 'studio', 'took', 'this', 'film', 'away', 'from', 'its', 'director', 'and', 'chopped', 'it', 'up', 'themselves', ',', 'and', 'it', 'shows', '.', 'there', 'might', "'", 've', 'been', 'a', 'pretty', 'decent', 'teen', 'mind', '-', 'f**k', 'movie', 'in', 'here', 'somewhere', ',', 'but', 'i', 'guess', '"', 'the', 'suits', '"', 'decided', 'that', 'turning', 'it', 'into', 'a', 'music', 'video', 'with', 'little', 'edge', ',', 'would', 'make', 'more', 'sense', '.', 'the', 'actors', 'are', 'pretty', 'good', 'for', 'the', 'most', 'part', ',', 'although', 'wes', 'bentley', 'just', 'seemed', 'to', 'be', 'playing', 'the', 'exact', 'same', 'character', 'that', 'he', 'did', 'in', 'american', 'beauty', ',', 'only', 'in', 'a', 'new', 'neighborhood', '.', 'but', 'my', 'biggest', 'kudos', 'go', 'out', 'to', 'sagemiller', ',', 'who', 'holds', 'her', 'own', 'throughout', 'the', 'entire', 'film', ',', 'and', 'actually', 'has', 'you', 'feeling', 'her', 'character', "'", 's', 'unraveling', '.', 'overall', ',', 'the', 'film', 'doesn', "'", 't', 'stick', 'because', 'it', 'doesn', "'", 't', 'entertain', ',', 'it', "'", 's', 'confusing', ',', 'it', 'rarely', 'excites', 'and', 'it', 'feels', 'pretty', 'redundant', 'for', 'most', 'of', 'its', 'runtime', ',', 'despite', 'a', 'pretty', 'cool', 'ending', 'and', 'explanation', 'to', 'all', 'of', 'the', 'craziness', 'that', 'came', 'before', 'it', '.', 'oh', ',', 'and', 'by', 'the', 'way', ',', 'this', 'is', 'not', 'a', 'horror', 'or', 'teen', 'slasher', 'flick', '.', '.', '.', 'it', "'", 's', 'just', 'packaged', 'to', 'look', 'that', 'way', 'because', 'someone', 'is', 'apparently', 'assuming', 'that', 'the', 'genre', 'is', 'still', 'hot', 'with', 'the', 'kids', '.', 'it', 'also', 'wrapped', 'production', 'two', 'years', 'ago', 'and', 'has', 'been', 'sitting', 'on', 'the', 'shelves', 'ever', 'since', '.', 'whatever', '.', '.', '.', 'skip', 'it', '!', 'where', "'", 's', 'joblo', 'coming', 'from', '?', 'a', 'nightmare', 'of', 'elm', 'street', '3', '(', '7', '/', '10', ')', '-', 'blair', 'witch', '2', '(', '7', '/', '10', ')', '-', 'the', 'crow', '(', '9', '/', '10', ')', '-', 'the', 'crow', ':', 'salvation', '(', '4', '/', '10', ')', '-', 'lost', 'highway', '(', '10', '/', '10', ')', '-', 'memento', '(', '10', '/', '10', ')', '-', 'the', 'others', '(', '9', '/', '10', ')', '-', 'stir', 'of', 'echoes', '(', '8', '/', '10', ')'] >>> print (documents[0]) (['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg') >>> from random import shuffle >>> shuffle(documents) #shuffle the document list >>> >>> #Feature Exctraction >>> all_words = [word.lower() for word in movie_reviews.words()] >>> print (all_words[:10]) #print first 10 words ['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party'] >>> 
>>> #Creating a frequency distribution
>>> from nltk import FreqDist
>>> all_words_frequency = FreqDist(all_words)
>>> print (all_words_frequency)
<FreqDist with 39768 samples and 1583820 outcomes>
>>> print (all_words_frequency.most_common(10)) #print 10 most frequently occurring words
[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822)]
>>>
>>> #Data Cleaning
>>> from nltk.corpus import stopwords
>>> stopwords_english = stopwords.words('english')
>>> print(stopwords_english)
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
>>> all_words_without_stopwords = [word for word in all_words if word not in stopwords_english]
>>> print (all_words_without_stopwords[:10]) #print first 10 words
['plot', ':', 'two', 'teen', 'couples', 'go', 'church', 'party', ',', 'drink']
>>>
>>> #Remove punctuations
def calculate_emission_prob(self):
    for tag in self.tag_type_set:
        words = [w for (w, t) in self.unified_sentences if t == tag]
        self.emission_prob[tag] = WittenBellProbDist(FreqDist(words), bins=1e6)
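# Hedged illustration (toy data, not part of the original class): each entry in
# emission_prob is a Witten-Bell smoothed distribution, so it can be queried with
# .prob(), including for words never seen with that tag.
from nltk import FreqDist
from nltk.probability import WittenBellProbDist

words_tagged_noun = ['dog', 'dog', 'cat', 'telescope']  # assumed toy counts
emission = WittenBellProbDist(FreqDist(words_tagged_noun), bins=1e6)
print(emission.prob('dog'))      # smoothed P(word='dog' | tag)
print(emission.prob('unicorn'))  # non-zero mass reserved for unseen words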
cleaned_text_tokens = []
for tokens in text_tokens:
    cleaned_text_tokens.append(remove_noise(tokens, stop_words))

# compare before with after
print(text_tokens[2510])
print(cleaned_text_tokens[2510])

''' Determining Word Density '''


# not sensible to do this for one entry, so lets look at all...
def get_all_words(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token


all_pos_words = get_all_words(cleaned_text_tokens)

from nltk import FreqDist

freq_dist_pos = FreqDist(all_pos_words)
print(freq_dist_pos.most_common(10))

''' Preparing data for the model '''
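# The section header above announces model preparation but the excerpt ends here.
# A hedged sketch of one common pattern (assuming an NLTK-style classifier is used
# downstream, which the excerpt does not confirm): turn each cleaned token list
# into a {token: True} feature dict.
def tokens_to_features(tokens):
    return {token: True for token in tokens}

feature_sets = [tokens_to_features(tokens) for tokens in cleaned_text_tokens]
print(feature_sets[0])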
#!/usr/bin/python3
# coding: utf-8
from nltk.corpus import sinica_treebank

##################################################################
## Quick test
print(type(sinica_treebank))  # <class 'nltk.corpus.reader.sinica_treebank.SinicaTreebankCorpusReader'>
print(len(sinica_treebank.words()))  # 91627
print(sinica_treebank.words())  # ['一', '友情', '嘉珍', '和', '我', '住在', '同一條', '巷子', '我們', ...]
# Looking directly inside ~/nltk_data/corpora/sinica_treebank/ you will see many other characters

##################################################################
## Generate 38k-cn-words-pinyin-sorted-by-frequency.txt (Chinese characters transliterated to pinyin)
import re
from nltk import FreqDist
from pypinyin import pinyin, lazy_pinyin, Style

fd = FreqDist(sinica_treebank.words())
print(len(list(fd.keys())))  # 17273; count after deduplication
print(len(fd.most_common()))  # 17273
str = ''.join([x[0] for x in fd.most_common()])
print(len(str))  # 38844
str = re.sub('[^\u4e00-\u9fa5]', '', str)
print(len(str))  # 38225; punctuation removed
with open('38k-cn-words-pinyin-sorted-by-frequency.txt', 'w') as f:
    f.write('\n'.join(lazy_pinyin(str)))
def test_empty_string(self):
    test = NgramCounter("")
    self.assertNotIn(2, test)
    self.assertEqual(test[1], FreqDist())
""" with open('moby.csv', encoding='utf-8-sig', mode='w') as fp: fp.write('pal|freq\n') for tag, count in ocm.items(): fp.write('{}|{}\n'.format(tag, count)) with open('zara.csv', encoding='utf-8-sig', mode='w') as fp: fp.write('pal|freq\n') for tag, count in octt.items(): fp.write('{}|{}\n'.format(tag, count)) """ Frecuencia """ from nltk import FreqDist fdist1 = FreqDist(ocm) fdist1.plot(25, cumulative=True) fdist2 = FreqDist(octt) fdist2.plot(25, cumulative=True) """ Mapa de dispersión """ from nltk.text import Text textListNLTK = Text(m) textListNLTK.concordance('also') marcommon = [] for list in comt[0:30]: for word in list[0]: marcommon.append(word)
def test_empty_list(self):
    test = NgramCounter([])
    self.assertNotIn(2, test)
    self.assertEqual(test[1], FreqDist())
from operator import itemgetter

from nltk import FreqDist

# read the given corpus
words = []
with open('text8', 'r') as f:
    for line in f:
        for word in line.split():
            words.append(word)
f.close()

ABSS_word = words[:100000]

# find the most frequent words in the given corpus (the top 1000 via most_common)
f = open('Most_Fre_data.txt', 'w')
f.write('Words frequency in given corpus')
f.write('\n')
fdist = FreqDist(ABSS_word)
dict1 = fdist.most_common(1000)
dict1 = dict(dict1)
sorted_dict = sorted(dict1.items(), key=itemgetter(1), reverse=True)
lis = [item[0] for item in sorted_dict]
lis.append("UNK")

# store the most frequent words with their frequencies
for item in sorted_dict:
    f.write(str(item[0]))
    f.write(' = ')
    f.write(str(item[1]))
    f.write('\n')
f.close()
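# The trailing lis.append("UNK") suggests the vocabulary is later used to replace
# out-of-vocabulary tokens. A hedged sketch of that (assumed) follow-up step:
vocab = set(lis)
mapped_words = [w if w in vocab else "UNK" for w in ABSS_word]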
def test_None(self):
    test = NgramCounter(None)
    self.assertNotIn(2, test)
    self.assertEqual(test[1], FreqDist())
def classify(self, sentence, k):
    self.query = sentence
    # idf
    self.idf = []
    self.query = self.query.split()
    for i in range(len(self.query)):
        # count how many documents contain this term
        self.numDocWithThisTerm = 0
        for j in range(self.allDoc):
            for k in range(len(self.tf[j])):
                if self.query[i] in self.tf[j][k]:
                    self.numDocWithThisTerm = self.numDocWithThisTerm + 1
        if self.numDocWithThisTerm > 0:
            self.temp = 1.0 + math.log(float(self.allDoc / self.numDocWithThisTerm))
        else:
            self.temp = 1.0
        self.idf = self.idf + [(self.query[i], self.temp)]

    # tf-idf
    self.tfidf = []
    for i in range(len(self.query)):
        for j in range(self.allDoc):
            # get the tf entry for this term
            for k in range(len(self.tf[j])):
                if self.query[i] in self.tf[j][k]:
                    self.tfTemp = self.tf[j][k][1]
                    break
                else:
                    self.tfTemp = 0
            # get the idf entry for this term
            for l in range(len(self.idf)):
                if self.query[i] in self.idf[l]:
                    self.idfTemp = self.idf[l][1]
            self.tfidf = self.tfidf + [self.tfTemp * self.idfTemp]
    self.tfidf = [i for i in zip(*[iter(self.tfidf)] * self.allDoc)]

    # tf of the query
    self.freqQuery = FreqDist(self.query)
    self.tfQuery = self.freqQuery.most_common()
    # normalized tf of the query
    for i in range(len(self.tfQuery)):
        self.tfQuery[i] = (self.tfQuery[i][0], self.tfQuery[i][1] / len(self.query))
    # idf of the query = idf above (the same values)
    # tf-idf of the query
    self.tfidfQuery = []
    for i in range(len(self.tfQuery)):
        self.tfidfQuery = self.tfidfQuery + [
            self.tfQuery[i][1] * self.idf[i][1]
        ]

    # cosine similarity
    self.cosSim = []
    for i in range(self.allDoc):
        self.dotProduct = 0
        self.queryAll = 0
        self.docI = 0
        for j in range(len(self.tfQuery)):
            self.dotProduct = self.dotProduct + (self.tfidfQuery[j] * self.tfidf[j][i])
            self.queryAll = self.queryAll + (self.tfidfQuery[j] ** 2)
            self.docI = self.docI + (self.tfidf[j][i] ** 2)
        if self.docI == 0:
            # guard against a term that never appears, so the result does not blow up
            self.temp = 0
        else:
            self.temp = self.dotProduct / (math.sqrt(self.queryAll) * math.sqrt(self.docI))
        self.cosSim = self.cosSim + [self.temp]

    self.indexMax = [
        i for i, val in enumerate(self.cosSim) if val == max(self.cosSim)
    ]
    self.closestClass = []
    for i in range(len(self.indexMax)):
        if self.indexMax[i] >= len(self.allTermNeg):
            self.closestClass = self.closestClass + ['Positif']
        else:
            self.closestClass = self.closestClass + ['Negatif']
    self.freqClass = Counter(self.closestClass).most_common()
    if len(self.freqClass) == 2 and self.freqClass[0][1] == self.freqClass[1][1]:
        return 'Positif/Negatif'
    elif k > len(self.closestClass):
        self.closestClass = sorted(self.closestClass,
                                   key=self.closestClass.count,
                                   reverse=True)
        return self.closestClass[0]
    else:
        # self.closestClass = sorted(self.closestClass, key=self.closestClass.count)
        return self.closestClass[k - 1]
def pmi(a, b):
    return log(pairs[a, b]) - log(pairs.N()) - log(unigrams[a]) - log(
        unigrams[b]) + 2 * log(unigrams.N())


h = FrameHierarchy.load()

# training data contains a bad frame
valid_names = {f.name for f in h._frames.values()}

with codecs.open("../../../training/data/naacl2012/cv.train.sentences.json",
                 encoding="utf8") as train_file:
    train = [json.loads(line) for line in train_file]

unsorted_frames = ([(f['target']['spans'][0]['start'], f['target']['name'])
                    for f in s['frames']] for s in train)
frames = [[name for start, name in sorted(s) if name in valid_names]
          for s in unsorted_frames]
del unsorted_frames

unigrams = FreqDist(chain(*frames))
pairs = FreqDist(
    chain(*[[tuple(sorted(b)) for b in combinations(f, 2)] for f in frames]))
pmis = FreqDist({(a, b): pmi(a, b)
                 for a, b in pairs.keys()
                 if unigrams[a] >= THRESHOLD and unigrams[b] >= THRESHOLD})

unigrams_with_ancestors = FreqDist(unigrams)
for u in unigrams:
    for a in h.ancestors(h._frames[u]):
        # FreqDist.inc() is NLTK 2-era API; on NLTK 3 the equivalent is
        # unigrams_with_ancestors[a.name] += 1
        unigrams_with_ancestors.inc(a.name)
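# Sanity check (hedged, with made-up counts): the count-based expression in pmi()
# above is algebraically the same as log( P(a,b) / (P(a) * P(b)) ).
from math import log, isclose

c_ab, n_pairs = 12, 480        # assumed joint count and total pair count
c_a, c_b, n_uni = 30, 40, 900  # assumed unigram counts and total unigram count

count_form = log(c_ab) - log(n_pairs) - log(c_a) - log(c_b) + 2 * log(n_uni)
prob_form = log((c_ab / n_pairs) / ((c_a / n_uni) * (c_b / n_uni)))
assert isclose(count_form, prob_form)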
len(set(text1)))  # On average, each word in Moby Dick occurs 13 times
print(
    len(text6) / len(set(text6))
)
# As a comedic movie, we expect Monty Python to be less 'lexically rich' than
# the novel Moby Dick and we can see this is true from this statement!

# === Part 3: Analysing frequency of words ===
# NLTK has a built in function that lets us find the frequency of words
# 'Distribution' is a fancy word for what we get if we gather up all the individual frequencies of words and store
# them somewhere. Hence the 'Frequency Distribution' of the words in a text is a sort of table of all the frequencies of
# each word in a text
from nltk import FreqDist

fd = FreqDist(
    text1
)  # We create a frequency distribution of all the words in text1, i.e. Moby Dick

# Now try:
print(fd['the'])  # Returns the number of times 'the' appears in the text
# Note: on NLTK 3 / Python 3, fd.keys() is not sliceable or frequency-sorted;
# fd.most_common(30) gives the equivalent of the two calls below.
print(
    fd.keys()[0:30]
)  # Prints the first 30 unique words of the text in decreasing order of frequency
print(
    fd.items()[0:30]
)  # Does the same as above but also shows the frequency of each word

# === Part 4: Your task ===
# At the end of corpuses.py (you can comment out the above code if it's not needed), write python code that:
# 1.) Returns the 10 most frequent words in Obama's 2009 inaugural speech, including their frequencies
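# A hedged sketch of task 1 above (assumes the standard nltk inaugural corpus is
# downloaded and that the 2009 speech is stored under the file id '2009-Obama.txt'):
from nltk import FreqDist
from nltk.corpus import inaugural

obama_words = inaugural.words('2009-Obama.txt')
obama_fd = FreqDist(obama_words)
print(obama_fd.most_common(10))  # the 10 most frequent words with their frequencies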
# print(texts[:300])
texts = texts.replace('\n', '')
tokenizer = re.compile('[^ ㄱ-힣]+')
texts = tokenizer.sub('', texts)
tokens = word_tokenize(texts)

noun_token = []
for token in tokens:
    token_pos = okt.pos(token)
    temp = [txt_tag[0] for txt_tag in token_pos if txt_tag[1] == "Noun"]
    if len(''.join(temp)) > 1:
        noun_token.append("".join(temp))
texts = " ".join(noun_token)

with open(filename2, 'r', encoding='UTF-8') as f:
    stopwords = f.read()
stopwords = stopwords.split(' ')
texts = [text for text in tokens if text not in stopwords]

freqtxt = pd.Series(dict(FreqDist(texts))).sort_values(ascending=False)
print(freqtxt[:30])

wcloud = WordCloud(font, relative_scaling=0.2,
                   background_color='white').generate(" ".join(texts))
plt.figure(figsize=(140, 80))
plt.imshow(wcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
def __init__(self, text, id=None, category=None):
    self.text = text
    self.token_counts = FreqDist(normalized_tokens(text))
    self.id = id
    self.category = category
import gensim
from gensim import corpora
from nltk import FreqDist
from nltk.corpus import PlaintextCorpusReader

corpus_root = '/path/to/text/files/directory'

file = open("terrier-stop.txt")
stopwords = file.read().splitlines()

wordlists = PlaintextCorpusReader(corpus_root, '.*')
texts = []
for item in wordlists.fileids():
    try:
        temp = [word.lower() for word in wordlists.words(item)
                if word.lower() not in stopwords and word.isalpha()]
        texts.append(temp)
    except UnicodeError:
        print(item)

flat_list = [item for sublist in texts for item in sublist]
fdist = FreqDist(flat_list)

dictionary = corpora.Dictionary(texts)
dictionary.save('/path/to/saving/dictionary.dict')

corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('/path/to/save/corpus/new_corpus.mm', corpus)

# load the corpus that was just serialized (the original used a mismatched
# filename, 'new_corpuss.mm')
mm = gensim.corpora.MmCorpus('/path/to/save/corpus/new_corpus.mm')
lda = gensim.models.ldamodel.LdaModel(corpus=mm, id2word=dictionary,
                                      num_topics=5, update_every=2,
                                      chunksize=10, passes=1)
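# A short hedged follow-up (not in the original snippet): once trained, gensim's
# LdaModel can report its topics, which is the usual next step here.
for topic_id, topic in lda.print_topics(num_topics=5, num_words=10):
    print(topic_id, topic)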
def __init__(self, tokenizer=None, max_size=None, min_freq=1):
    """Basic Vocabulary object"""
    self.vocab_size = 0
    self.freqdist = FreqDist()
    self.tokenizer = tokenizer
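# Hypothetical sketch only (method names and behaviour are assumptions, not the
# original class's API; it also assumes the constructor goes on to store max_size
# and min_freq, which the excerpt cuts off): the FreqDist accumulates token counts,
# and max_size / min_freq presumably prune it when the final vocabulary is built.
def add_sentence(self, sentence):
    tokens = self.tokenizer(sentence) if self.tokenizer else sentence.split()
    self.freqdist.update(tokens)  # accumulate counts in the FreqDist

def build(self):
    # keep tokens with count >= min_freq, most frequent first, capped at max_size
    kept = [w for w, c in self.freqdist.most_common(self.max_size)
            if c >= self.min_freq]
    self.vocab_size = len(kept)
    return kept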
from nltk import FreqDist
from nltk.book import text3 as book_of_genesis

# frequency distribution
fdist = FreqDist(book_of_genesis)
print(fdist)
fdist.most_common(50)  # 50 most common tokens

# cumulative frequency plot
fdist.plot(50, cumulative=True)

# consider capitalization and ignore duplicates
# words_set = set(book_of_genesis)
# ignore capitalization and duplicates
# words_set = set(word.lower() for word in book_of_genesis)
# ignore capitalization, duplicates and non-alphabetic items (numbers and punctuation characters)
words_set = set(word.lower() for word in book_of_genesis if word.isalpha())

# number of words
len(words_set)

# get words longer than 10 characters
minimum_characters = 10
long_words = [word for word in words_set if len(word) > minimum_characters]
sorted(long_words)  # sorted alphabetically (capital letters first)

# get words longer than 7 characters that occur more than 7 times
minimum_characters = 7
minimum_frequency = 7
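# The excerpt stops before the final filter; a plausible completion (hedged) in the
# style of the NLTK book. Note that fdist was built on the original, case-sensitive
# tokens, so the original token forms are used for the frequency lookup.
frequent_long_words = sorted(word for word in set(book_of_genesis)
                             if len(word) > minimum_characters
                             and fdist[word] > minimum_frequency)
print(frequent_long_words)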
from nltk import FreqDist


def GetMostFrequentWords(extracted, textProcesser, filterFun, topN):
    # textProcesser takes in a piece of text and returns words or phrases (n-grams)
    allwords = textProcesser(extracted)
    # remove punctuation, numeric values and ignoreWords
    cleanedWords = [word.lower() for word in allwords if filterFun(word.lower())]
    all_words = FreqDist(cleanedWords)
    return all_words.most_common()[:topN]
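# Hypothetical usage (the tokenizer and filter below are stand-ins, not from the
# original code): plain word tokenization with an alphabetic-only filter.
from nltk import word_tokenize

top_words = GetMostFrequentWords(
    "The quick brown fox jumps over the lazy dog. The dog sleeps.",
    textProcesser=word_tokenize,
    filterFun=str.isalpha,
    topN=5,
)
print(top_words)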
print(stopwords)
nostopwords = [word for word in allTokens if word not in stopwords]

# NLTK.Text
text = nltk.Text(nostopwords)

# Collocations (words that frequently appear together)
colos = text.collocations()
print(colos)

# count
print(text.count('inlet'))

# words in similar contexts
print(text.similar('ship'))

text.dispersion_plot(['north', 'south', 'east', 'west'])
text.dispersion_plot(['ship', 'dock', 'boat', 'canoe', 'steamboat'])

# Frequency distributions!
from nltk import FreqDist

fdist = FreqDist(text)
print(fdist.hapaxes())  # words that occur only once
print(fdist.most_common(50))
fdist.plot(30)