Example #1

from nltk import FreqDist
from stop_words import get_stop_words


def filter_words(words):
    # Build a frequency distribution, then drop Arabic stop words and
    # very short tokens (two characters or fewer).
    new_words = FreqDist(words)
    stopwords = set(get_stop_words('ar'))

    # Iterate over a snapshot of the keys: popping entries while
    # iterating the live keys() view raises a RuntimeError in Python 3,
    # and the original back-to-back ifs could pop the same word twice.
    for word in list(new_words.keys()):
        if word in stopwords or len(word) <= 2:
            new_words.pop(word)

    return new_words
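
A quick usage sketch (the sample tokens are illustrative, not from the original project):

tokens = ["كتاب", "جميل", "في", "لا", "ok"]
filtered = filter_words(tokens)
# Two-character tokens such as "ok", "في" and "لا" are dropped by the
# length check; which longer tokens survive depends on the stop-words
# package's Arabic list.
print(filtered.most_common())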
Example #2
from nltk import FreqDist
from nltk.corpus import movie_reviews


def worst_errors_many_wrong_decisions(self, k, feature_extractor):
    # Collect the feature names of every misclassified document and
    # return the k most frequent ones.
    worst_errors = []
    features = []
    wrongDocs = self.error_prediction_docs(self.maintest, self.testClassify)
    for doc in wrongDocs:
        feature_dic = feature_extractor(movie_reviews.words(fileids=[doc]))
        # keys() returns a view in Python 3; convert it before concatenating
        features = features + list(feature_dic.keys())
    fd = FreqDist(feature.lower() for feature in features)
    for _ in range(k):
        x = fd.max()            # most frequent remaining feature
        fd.pop(x)               # remove it so max() yields the next one
        worst_errors.append(x)
    return worst_errors
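
The max()/pop() loop re-implements top-k extraction by hand; FreqDist.most_common gives the same result without mutating the distribution. A minimal standalone sketch with made-up data:

from nltk import FreqDist

fd = FreqDist(["bad", "bad", "plot", "bad", "plot", "dull"])
print([feature for feature, count in fd.most_common(2)])
# ['bad', 'plot'] -- the same top-2 result, with fd left intact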
Example #3

from nltk import FreqDist
from nltk.corpus import stopwords


def word_count(text, exclude_inputlist):
    # Count alphabetic tokens, then drop English stop words,
    # caller-supplied exclusions, and words occurring fewer than twice.
    frequency = FreqDist(wd.lower() for wd in text if wd.isalpha())
    excludelist = set(stopwords.words('english') + exclude_inputlist)
    # Snapshot the keys: popping while iterating the live view raises
    # a RuntimeError in Python 3.
    for word in list(frequency.keys()):
        if word in excludelist or frequency[word] < 2:
            frequency.pop(word)
    return frequency
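
A usage sketch for word_count (the sample tokens are illustrative):

tokens = "The cat sat . The cat ran . A dog barked".split()
print(word_count(tokens, ["dog"]).most_common())
# [('cat', 2)] -- stop words, 'dog', and words seen only once are filtered out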
Example #4
from nltk import FreqDist

# filteredStopwords (a stop-word-filtered token list), text (the raw
# corpus string) and tokenizeWord are defined earlier in the original
# script and are not shown here.

# Removing numeric tokens from the list of words.
filteredStopwords = [i for i in filteredStopwords if not i.isdigit()]
freqDist = FreqDist(filteredStopwords)

print("In HHBD Hindi Bible")
print(f"प्रेम appears {freqDist['प्रेम']} times")
print(f"डर appears {freqDist['डर']} times")

checkWords = ["यीशु", "मसीह", "उद्धारकर्ता", "उद्धार", "क्रूस"]
checkWordFreq = {}
for checkWord in checkWords:
    checkWordFreq[checkWord] = freqDist[checkWord]

print(checkWordFreq)

# Drop "राजा" if present; the default of None avoids a KeyError.
freqDist.pop("राजा", None)

# Split the text into sentences on the Devanagari double and single
# danda punctuation marks.
sents = []
for i in text.split("॥"):
    sents.append(i.split("।"))
sents = [item for sublist in sents for item in sublist]

from collections import defaultdict

# Score each sentence by the summed frequencies of its tokens.
ranking = defaultdict(int)
for i, sent in enumerate(sents):
    for token in tokenizeWord(sent):
        if token in freqDist:
            ranking[i] += freqDist[token]

from heapq import nlargest
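
The original snippet cuts off right after importing nlargest. A plausible continuation, an assumption about intent rather than part of the source, would select the highest-scoring sentences as a summary:

# Hypothetical next step: indices of the three best-scoring sentences.
top_idx = nlargest(3, ranking, key=ranking.get)
summary = "।".join(sents[i] for i in sorted(top_idx))
print(summary)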
Example #5
import itertools


def ranked_freq_dist(fdist):
    # Rank the (word, count) pairs of a FreqDist so that tied counts
    # share a rank. Only the loop body survived in the original
    # snippet; the header and setup here are a reconstruction.
    f_list = fdist.most_common()
    f_list2 = f_list[1:]  # the same list shifted by one, to compare neighbours
    ranked_list = ()
    rank = 1

    for x, y in itertools.zip_longest(f_list, f_list2, fillvalue=(0, 0)):
        rank_tup = tuple(['Rank:' + str(rank)])
        new_element = tuple([rank_tup + x])

        if x[1] > y[1]:
            # Strictly more frequent than the next entry: advance the rank.
            ranked_list += new_element
            rank += 1
        elif x[1] == y[1]:
            # Tied with the next entry: the same rank is reused.
            ranked_list += new_element

    return ranked_list


# fdist is assumed to be a FreqDist built earlier in the original script.
print('Ranked frequency distribution in descending order of frequency',
      ranked_freq_dist(fdist))

# Removing stop words and punctuation from the word list.
filtered_words = [w for w in words if w not in stop_words and w.isalnum()]

# Creating a FreqDist object from the filtered list.
filtered_fdist = FreqDist(filtered_words)

# Plotting the top 10 words. I didn't plot the top 50 as the graph gets very hard to read on a small screen.
#filtered_fdist.plot(10, title='Frequency Distribution')

# Reporting (and removing) the counts of the words 'America' and 'world'.
# FreqDist.pop() returns the count and deletes the entry; passing a
# default of 0 avoids a KeyError if a word never occurred.
print("Occurrences of 'America':", filtered_fdist.pop('America', 0),
      "\nOccurrences of 'world':", filtered_fdist.pop('world', 0))
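
If the counts only need to be inspected, indexing is the non-destructive alternative to the pop() calls above; as a Counter subclass, FreqDist returns 0 for unseen words instead of raising:

print("Occurrences of 'America':", filtered_fdist['America'])
print("Occurrences of 'world':", filtered_fdist['world'])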