Example #1
0
def chapter2_exercise17():
    """Return the 50 most frequent non-stopword tokens of Brown's 'romance' category.

    Tokens are compared to the English stopword list exactly as they appear
    in the corpus (no case folding is applied).

    Returns:
        The ``(token, count)`` pairs produced by ``most_common(50)`` on the
        filtered frequency distribution.
    """
    # The stopword list must NOT be bound to the name ``stopwords``: assigning
    # to that name inside the function makes it local, so the lookup
    # ``stopwords.words`` would raise UnboundLocalError before it ever ran.
    english_stopwords = set(stopwords.words("english"))
    top_50 = FreqDist(brown.words(categories='romance'))
    # Walk a snapshot of the keys so entries can be removed while looping.
    for word in list(top_50.keys()):
        if word in english_stopwords:
            top_50.pop(word)
    return top_50.most_common(50)
def generate_vocab(tokens: list, min_token_len: int = 2, threshold: int = 2, remove_numbers=True):
    """Count ``tokens`` and prune the resulting frequency distribution.

    Args:
        tokens: The tokens to count.
        min_token_len: Forwarded unchanged to ``preprocess_tokens``.
        threshold: Entries whose count is below this value are removed.
        remove_numbers: When true, ``remove_digit_tokens`` is applied to the
            distribution first (presumably pruning it in place; its return
            value is not used here).

    Returns:
        The pruned ``FreqDist``.
    """
    freq_dist = FreqDist(tokens)
    if remove_numbers:
        remove_digit_tokens(freq_dist)

    # Ask the preprocessing step which of the counted tokens it keeps.
    kept = preprocess_tokens(tokens=list(freq_dist.keys()), min_token_len=min_token_len)

    # Drop every counted token that preprocessing did not hand back.
    for dropped in set(freq_dist.keys()).difference(kept):
        freq_dist.pop(dropped, None)

    # Drop the kept tokens that occur fewer than ``threshold`` times.
    for token in kept:
        if freq_dist[token] < threshold:
            freq_dist.pop(token, None)

    return freq_dist
Example #3
0
def chapter2_exercise18():
    """Return the 50 most common content-word bigrams of Brown's 'humor' category.

    A bigram is counted only when both of its tokens are purely alphabetic
    and neither appears in the English stopword list. Bigrams are taken from
    the unfiltered token stream, so both tokens are adjacent in the corpus.

    Returns:
        The ``(bigram, count)`` pairs produced by ``most_common(50)``.
    """
    vocabulary = FreqDist(brown.words(categories='humor'))
    ignored = stopwords.words("english")

    # Collect the unwanted tokens from a copy first, then remove them, so the
    # distribution is never modified while it is being iterated.
    rejected = [
        token for token in vocabulary.copy()
        if token in ignored or not token.isalpha()
    ]
    for token in rejected:
        vocabulary.pop(token)

    # Keep only the bigrams whose two members both survived the filtering.
    pair_counts = FreqDist([
        (left, right)
        for left, right in nltk.bigrams(brown.words(categories='humor'))
        if left in vocabulary and right in vocabulary
    ])
    return pair_counts.most_common(50)
Example #4
0
print("Collected data from " + str(count_submission) + " submissions and " +
      str(count_comment) + " comments.")

# Use nltk to convert the string into tokens (words and punctuation)
tokens = word_tokenize(raw_text)

# Use nltk to find the frequency distribution of the tokens
fdist = FreqDist(tokens)

# Remove tokens with only 1 or 2 characters. The short tokens are collected
# first so the distribution is not modified while it is being iterated.
short_words = [token for token in fdist.keys() if len(token) < 3]
for s in short_words:
    fdist.pop(s)

# Remove common but useless tokens
stop = set(stopwords.words('english'))  # get pre-defined stop words
additional_stop = ['https', 'http', 'n\'t', 'The', 'This', 'That',
                   '...']  # add additional stop words
stop.update(additional_stop)
for s in stop:
    # A stop word may be absent from the text (or already removed above as a
    # short token); the default makes pop a no-op in that case, without a
    # bare ``except`` that would also hide unrelated errors.
    fdist.pop(s, None)

# Needs some work here to stem the tokens
Example #5
0
punctuations = [",", ".", '" "', ';', '-', ':', '."', '"', "'"]
# Keep only the tokens that are not stop words, not in ``englishcapitalize``
# (presumably the stop words with a capitalized first letter -- defined
# outside this section) and not punctuation.
for words in clintonwords:
    if (words not in englishstopwords
            and words not in englishcapitalize
            and words not in punctuations):
        frequentlyOccuring.append(words)

# Create the frequency distribution of the remaining tokens
frequencydist = FreqDist(frequentlyOccuring)
# Print the 50 most frequent words
print(frequencydist.most_common(50))
print("Plot the top 50 words")
# Create the plot
frequencydist.plot(50)
print(
    "Find out how many times the words world and america were used in the speech :"
)
# Look the counts up by index instead of ``pop``: indexing leaves the
# distribution intact and yields 0 for a word that never occurs, whereas
# ``pop`` removes the entry and raises KeyError when the word is missing.
worldcount = frequencydist["world"]
americacount = frequencydist['America']
print("Count of America :", americacount)
print("Count of world   :", worldcount)

# Read the speech once and reuse it for all of the statistics below.
clinton_speech_words = inaugural.words('1993-Clinton.txt')
totalNumOfWords = len(clinton_speech_words)
print("Total number of words in 1993-Clinton's Speech = ", totalNumOfWords)

print("Total Distinct Words = ", len(set(clinton_speech_words)))

# ``count`` is the total number of characters over all tokens.
count = sum(len(word) for word in clinton_speech_words)
print("Average Length of Words = ", round(count / totalNumOfWords))

########################################

# Read the speech once; printing the corpus view shows a preview of it.
allWords = inaugural.words('1993-Clinton.txt')
print(allWords)

# Frequency distribution of the tokens that are entirely lower-case
lowerCase = [i for i in allWords if i.islower()]
freqDist = FreqDist(lowerCase)
print(freqDist)
print(freqDist.most_common(10))

# Frequency distribution of the tokens that are not English stop words
stopWords = stopwords.words("english")
stopWordSet = set(stopWords)  # set gives constant-time membership tests
notStopWords = [i for i in allWords if i not in stopWordSet]
freqDist02 = FreqDist(notStopWords)
print(freqDist02.most_common(10))

# Plot the 10 most frequent tokens of the unfiltered speech
plotWords = FreqDist(allWords)
plotWords.plot(10)

# Index instead of ``pop``: a lookup should neither remove the entry nor
# raise KeyError when the word is absent (indexing yields 0 in that case).
print("World " + str(freqDist02["world"]))
print("america " + str(freqDist02["America"]))