def __freqs_dict(self, raw_text):
    t_start = time()
    print('Making filtered text...')
    stopset = set(stopwords.words('russian'))
    ad = AlphabetDetector()
    tokens = word_tokenize(raw_text)
    # Compare against the stopword sets in lowercase so that capitalised
    # stopwords (e.g. at sentence starts) are also filtered out; this assumes
    # the custom stopwords are stored in lowercase as well.
    tokens_filtered = [w.lower() for w in tokens
                       if w.lower() not in stopset
                       and w.lower() not in self.__custom_stopwords
                       and w.isalpha()
                       and len(w) >= self.__min_word_len
                       and ad.is_cyrillic(w)]
    freqs_tokenized_text = FreqDist(tokens_filtered)
    freqs_most_common = OrderedDict(freqs_tokenized_text.most_common(self.__max_words))
    # Repeat each word in proportion to its frequency so downstream tools can
    # infer weights from the raw text.
    res_text = ''
    for word, freq in freqs_most_common.items():
        res_text += (word + ' ') * freq
    t_end = time()
    print('TIME = %.2f s' % (t_end - t_start))
    return res_text
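# A minimal standalone sketch of the same idea, outside the class context.
# The sample sentence, the minimum word length and the top-N limit below are
# illustrative assumptions, not values from the original code: build a
# FreqDist over filtered tokens, then repeat each top word in proportion to
# its count.
from collections import OrderedDict
from nltk import FreqDist, word_tokenize

sample = 'the cat sat on the mat and the cat slept'
sample_tokens = [w.lower() for w in word_tokenize(sample)
                 if w.isalpha() and len(w) >= 3]
top = OrderedDict(FreqDist(sample_tokens).most_common(5))
weighted_text = ''.join((word + ' ') * freq for word, freq in top.items())
print(weighted_text)  # e.g. 'the the the cat cat sat mat and ' (ties ordered arbitrarily)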
print(s)

# Remove verbs from the tagged input (s is a list of (word, tag) pairs);
# only the VBG, VBZ and VBN tags are filtered here.
file_without_verbs = [word for word, tag in s if tag not in ('VBG', 'VBZ', 'VBN')]
z = ' '.join(file_without_verbs)  # z is the text without those verbs
print(z)
s1 = nltk.pos_tag(nltk.word_tokenize(z))
print(s1)  # the tagged output shows that those verbs have been removed

# Word frequency of the remaining words. FreqDist needs a list of tokens;
# passing the string z directly would count individual characters instead.
fdist = FreqDist(nltk.word_tokenize(z))
print(fdist)
q = fdist.most_common(5)
print(q)

def tokens(text):
    """Get all words from the corpus."""
    return re.findall('[a-z]+', text.lower())

WORD_COUNTS = collections.Counter(tokens(z))
print(WORD_COUNTS)
print(WORD_COUNTS.most_common(5))

# Go through the original file
def most_freq_words(self, text, number):
    word_freq = FreqDist(text)
    words_counts = word_freq.most_common(number)
    words = [pair[0] for pair in words_counts]
    return words
else:
    data['Abstract'][i] = ''

# In[415]:

s = re.sub('[^A-Za-z0-9 ]+', '', s)
# Calculate word frequencies using the NLTK library
from nltk import FreqDist
# newlist is the list of all words
newlist = s.split()
# Normalize case so that 'The' and 'the' count as the same word
newlist = [w.lower() for w in newlist]
fdist = FreqDist(newlist)
# Extract the 10 most common (most frequent) words
remlist = fdist.most_common(10)
# print(remlist)
rem = [pair[0] for pair in remlist]
# print(rem)

# TASK 2

# In[416]:

# Remove the 10 most frequent words from the whole corpus
newlist = [w for w in newlist if w not in rem]
# Removal from each abstract
for i in range(100):
def main():
    # Parse user input
    parser = ap.ArgumentParser()
    parser.add_argument('-i', '--input', nargs='?', type=str, required=True,
                        help='Input filename.')
    parser.add_argument('-c', '--concordance', nargs='?', type=str, default=None,
                        help='Word concordance.')
    parser.add_argument('-d', '--dispersion', nargs='*', type=str, default=None,
                        help='Word dispersion.')
    parser.add_argument('-f', '--frequency', nargs='?', type=int, default=None,
                        help='Word frequency.')
    parser.add_argument('-a', '--acro', action='store_true',
                        help='Acronyms only.')
    args = parser.parse_args()

    with open(args.input, 'r') as f:
        plain = f.read()
    plain = remove_comments(plain)
    words = nltk.word_tokenize(plain)
    if args.acro:
        words = [w for w in words if is_acro(w)]
    print('%d unique words out of %d total words.' % (len(set(words)), len(words)))

    text = nltk.Text(words)
    if args.concordance is not None:
        text.concordance(args.concordance)
        return
    if args.dispersion is not None:
        text.dispersion_plot(args.dispersion)
        return
    if args.frequency is not None:
        freq = FreqDist(text)
        for i, f in enumerate(freq.most_common(args.frequency)):
            print('%9d%9d %s' % (i, f[1], f[0]))
        freq.plot(args.frequency)
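# Hedged usage note: the script and input file names below are hypothetical
# (the original file name is not shown). With the -f flag the tool prints the
# unique/total word counts, then an indexed table of the N most frequent words
# from FreqDist.most_common, and finally plots them with FreqDist.plot:
#
#   python word_stats.py --input report.txt --frequency 20
#   python word_stats.py --input report.txt --acro --concordance NASA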
# ◑ Write a function that takes a list of words (containing duplicates) and returns
# a list of words (with no duplicates) sorted by decreasing frequency. E.g. if the
# input list contained 10 instances of the word table and 9 instances of the word
# chair, then table would appear before chair in the output list.
from nltk import FreqDist

words = ['this', 'is', 'my', 'list', 'of', 'list', 'of', 'list', 'is', 'this',
         'of', 'list', 'of', 'list', 'of', 'list', 'of', 'list', 'of', 'words']
fdist = FreqDist(words)
# most_common() already returns (word, count) pairs in decreasing order of
# frequency, so no reversal is needed to meet the exercise's requirement.
answer = [word for word, count in fdist.most_common()]
print(answer)
lemmatizer = nltk.WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(w, 'v') for w in text1]
len(set(lemmatized))

# ANSWER 1
ratio = len(set(text1)) / len(text1)

# ANSWER 2
freq = FreqDist(text1)
freq['whale']
freq['Whale']

# ANSWER 3
freq = FreqDist(text1)
freq.most_common(n=20)

# ANSWER 4
freq = FreqDist(text1)
freq_150 = sorted([key for key in freq if len(key) > 5 and freq[key] > 150])
freq_150

# ANSWER 5
words = list(set(text1))
longest = ''
for word in words:
    if len(word) > len(longest):
        longest = word
(longest, len(longest))
        tokens1 += word
    else:
        tokens1.append(word)

tokens1 = sorted(tokens1)

def lexical_diversity(text):
    return len(set(text)) / len(text)

def percentage(word, tokens):
    return 100 * (tokens.count(word) / len(tokens))

fdist = FreqDist(tokens1)
tokens1 = fdist.most_common(len(tokens1))
print(fdist.most_common(10))
print('the word %s makes up' % 'comme', percentage('comme', tokens), '% of the text')
print('the lexical diversity of the text is', lexical_diversity(tokens), '%')

from pylab import *

x = array([1, 3, 4, 6])
y = array([2, 3, 5, 1])
plot(x, y)
def question_three():
    dist = FreqDist(text1)
    return dist.most_common(20)
def question_eight():
    pos_tags = nltk.pos_tag(moby_tokens)
    pos_freq = FreqDist([pos_tag for (word, pos_tag) in pos_tags])
    return pos_freq.most_common(5)
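# A self-contained sketch of the same POS-frequency technique on a tiny example.
# The sentence below is illustrative; moby_tokens comes from the surrounding
# notebook and is not reproduced here.
import nltk
from nltk import FreqDist

demo_tokens = nltk.word_tokenize('The whale swims and the whale dives deep.')
demo_tags = nltk.pos_tag(demo_tokens)              # [(word, tag), ...]
tag_freq = FreqDist(tag for _, tag in demo_tags)   # count only the POS tags
print(tag_freq.most_common(3))                     # e.g. [('DT', 2), ('NN', 2), ('VBZ', 2)]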