import nltk
from nltk import FreqDist
from nltk.corpus import brown, stopwords


def chapter2_exercise17():
    # 50 most frequent non-stopword words in the Brown 'romance' category.
    stop_words = set(stopwords.words("english"))
    top_50 = FreqDist(brown.words(categories='romance'))
    for word in top_50.copy():
        if word in stop_words:
            top_50.pop(word)
    return top_50.most_common(50)
def generate_vocab(tokens: list, min_token_len: int = 2, threshold: int = 2, remove_numbers=True):
    # Build a vocabulary as a FreqDist, dropping numeric, short, and rare tokens.
    # remove_digit_tokens and preprocess_tokens are helpers defined elsewhere in the source file.
    freq_dist = FreqDist(tokens)
    if remove_numbers:
        remove_digit_tokens(freq_dist)
    tokens = preprocess_tokens(tokens=list(freq_dist.keys()), min_token_len=min_token_len)
    removed_tokens = set(freq_dist.keys()).difference(tokens)
    for t in removed_tokens:
        freq_dist.pop(t, None)
    # Drop tokens that occur fewer than `threshold` times.
    for t in tokens:
        if freq_dist[t] < threshold:
            freq_dist.pop(t, None)
    return freq_dist
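# The two helpers used by generate_vocab are defined elsewhere in the original file.
# The bodies below are only a sketch, inferred from how generate_vocab calls them:
# remove_digit_tokens is assumed to drop purely numeric tokens in place, and
# preprocess_tokens is assumed to keep tokens of at least min_token_len characters.

def remove_digit_tokens(freq_dist):
    # Remove tokens made up entirely of digits (in place).
    for t in list(freq_dist.keys()):
        if t.isdigit():
            freq_dist.pop(t, None)


def preprocess_tokens(tokens, min_token_len=2):
    # Keep tokens that meet the minimum length requirement.
    return [t for t in tokens if len(t) >= min_token_len]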
def chapter2_exercise18():
    # 50 most frequent bigrams in the Brown 'humor' category, ignoring
    # stopwords and non-alphabetic tokens.
    freq_dist = FreqDist(brown.words(categories='humor'))
    stopwords_list = stopwords.words("english")
    for word in freq_dist.copy():
        if not word.isalpha() or word in stopwords_list:
            freq_dist.pop(word)
    # Keep only bigrams whose members both survived the filtering above.
    bigrams_dist = FreqDist(
        (item1, item2)
        for item1, item2 in nltk.bigrams(brown.words(categories='humor'))
        if item1 in freq_dist and item2 in freq_dist
    )
    return bigrams_dist.most_common(50)
print("Collected data from " + str(count_submission) + " submissions and " + str(count_comment) + " comments.") #Use nltk to convert string into tokens (words and puncuation) tokens = word_tokenize(raw_text) #Use nltk to find frequency distribution of words fdist = FreqDist(tokens) #Remove tokens with only 1 or 2 characters short_words = [] for i in fdist: if len(i) < 3: short_words.append(i) for s in short_words: fdist.pop(s) #Remove common but useless tokens stop = set(stopwords.words('english')) #get pre-defined stop words additional_stop = ['https', 'http', 'n\'t', 'The', 'This', 'That', '...'] #add additional stop words for a in additional_stop: stop.add(a) for s in stop: try: fdist.pop(s) except: pass #Needs some work here to stem the tokens
punctuations = [",", ".", '" "', ';', '-', ':', '."', '"', "'"] # here for and if loop are used to filter out punctuation and stop words for words in clintonwords: #to filter stop words if words not in englishstopwords: #to filter stopwords (first letter capitalzie) if words not in englishcapitalize: #to filter punctuations if words not in punctuations: frequentlyOccuring.append(words) else: pass #print(frequentlyOccuring) #create Freuency Distribution Class frequencydist = FreqDist(frequentlyOccuring) #print 50 frequent words print(frequencydist.most_common(50)) print("Plot the top 50 words") #creating plot frequencydist.plot(50) print( "Find out how many times the words world and america were used in the speech :" ) worldcount = frequencydist.pop("world") americacount = frequencydist.pop('America') print("Count of America :", americacount) print("Count of world :", worldcount)
totalNumOfWords = len(inaugural.words('1993-Clinton.txt'))
print("Total number of words in 1993-Clinton's Speech = ", totalNumOfWords)
print("Total Distinct Words = ", len(set(inaugural.words('1993-Clinton.txt'))))

# Average word length in the speech.
count = 0
for i in inaugural.words('1993-Clinton.txt'):
    count = count + len(i)
print("Average Length of Words = ", round(count / totalNumOfWords))

########################################
print(inaugural.words('1993-Clinton.txt'))
allWords = inaugural.words('1993-Clinton.txt')

# Frequency distribution over lowercase words only.
lowerCase = [i for i in allWords if i.islower()]
freqDist = FreqDist(lowerCase)
print(freqDist)
print(freqDist.most_common(10))

# Frequency distribution with English stop words removed.
stopWords = stopwords.words("english")
notStopWords = [i for i in allWords if i not in stopWords]
freqDist02 = FreqDist(notStopWords)
print(freqDist02.most_common(10))

# Plot the 10 most frequent words overall.
plotWords = FreqDist(allWords)
plotWords.plot(10)

print("world " + str(freqDist02.pop("world")))
print("America " + str(freqDist02.pop("America")))