def __freqs_dict(self, raw_text):

        t_start = time()
        print('Making filtered text...')

        stopset = set(stopwords.words('russian'))
        ad = AlphabetDetector()

        tokens = word_tokenize(raw_text)
        # lower-case before the stopword checks so capitalized tokens are filtered too
        tokens_filtered = [w.lower() for w in tokens
                           if w.lower() not in stopset
                           and w.lower() not in self.__custom_stopwords
                           and w.isalpha()
                           and len(w) >= self.__min_word_len
                           and ad.is_cyrillic(w)]


        freqs_tokenized_text = FreqDist(tokens_filtered)
        freqs_most_common = OrderedDict(freqs_tokenized_text.most_common(self.__max_words))

        # repeat each word according to its frequency and join into one string
        res_text = ' '.join(word for word, freq in freqs_most_common.items()
                            for _ in range(freq))

        t_end = time()
        print("TIME = %.2f s" % (t_end - t_start))

        return res_text
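
For reference, here is a minimal sketch of the imports and class attributes the method above appears to rely on; the TextProcessor name and the default attribute values are assumptions, not part of the original code.

from time import time
from collections import OrderedDict

from alphabet_detector import AlphabetDetector
from nltk import FreqDist, word_tokenize
from nltk.corpus import stopwords


class TextProcessor:
    # hypothetical class; __freqs_dict above would be defined as one of its methods,
    # which is what the name-mangled self.__... attribute accesses assume
    def __init__(self, custom_stopwords=(), min_word_len=3, max_words=100):
        self.__custom_stopwords = set(custom_stopwords)
        self.__min_word_len = min_word_len
        self.__max_words = max_words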
Example #2
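This snippet starts from a variable s that is defined earlier in its source file. A plausible setup, sketched here with an assumed filename and variable names, would be:

# hypothetical setup: s is assumed to be the POS-tagged token list of the input file
import re
import collections
import nltk
from nltk import FreqDist

with open('input.txt') as f:     # 'input.txt' is a placeholder name
    raw = f.read()
s = nltk.pos_tag(nltk.word_tokenize(raw))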
print(s)

#removing verbs from the input file (all VB* tags)
file_without_verbs = [word for word, tag in s if not tag.startswith('VB')]
z = ' '.join(file_without_verbs)       # z is the text without verbs
print(z)
s1 = nltk.pos_tag(nltk.word_tokenize(z))
print(s1)                              # the output shows that all the verbs have been removed





# build the distribution over the token list; FreqDist(z) would count characters, not words
fdist = FreqDist(file_without_verbs)
print(fdist)
q = fdist.most_common(5)
print(q)

#word frequency of remaining words
def tokens(text):
    """
    Get all words from the corpus
    """
    return re.findall('[a-z]+', text.lower())
WORD_COUNTS = collections.Counter(tokens(z))
print(WORD_COUNTS)
print(WORD_COUNTS.most_common(5))


##go through the original file
Example #3
def most_freq_words(self, text, number):
    word_freq = FreqDist(text)
    words_counts = word_freq.most_common(number)
    words = [pair[0] for pair in words_counts]
    return words

# In[415]:

s = re.sub('[^A-Za-z0-9 ]+', '', s)
#calculate frequency of word using nltk library
from nltk.book import FreqDist
#newlist is list of all words
newlist = s.split()
#normalize case: 'The' and 'the' count as the same word
newlist = [w.lower() for w in newlist]
fdist = FreqDist(newlist)
#extracting the most common i.e. most frequent words
remlist = fdist.most_common(10)
# print(remlist)
rem = [pair[0] for pair in remlist]    # keep only the words, drop their counts
# print(rem)

# TASK 2-

# In[416]:

#this will remove frequent 10 words from whole corpus
newlist = [w for w in newlist if w.lower() not in rem]

#removal from each abstract
for i in range(100):
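
The loop above is truncated in this listing. A hedged sketch of what it presumably does follows; the data variable, its 'Abstract' column, and the string check are assumptions, not recovered code.

# hypothetical reconstruction, not the original code:
# strip the 10 most frequent words from each of the 100 abstracts
for i in range(100):
    if isinstance(data['Abstract'][i], str):
        kept = [w for w in data['Abstract'][i].split() if w.lower() not in rem]
        data['Abstract'][i] = ' '.join(kept)
    else:
        data['Abstract'][i] = ''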
Example #5
def main():

    # Parsing user input
    parser = ap.ArgumentParser()
    parser.add_argument('-i',
                        '--input',
                        nargs='?',
                        type=str,
                        required=True,
                        help='Input filename.')
    parser.add_argument('-c',
                        '--concordance',
                        nargs='?',
                        type=str,
                        default=None,
                        help='Word concordance.')
    parser.add_argument('-d',
                        '--dispersion',
                        nargs='*',
                        type=str,
                        default=None,
                        help='Word dispersion.')
    parser.add_argument('-f',
                        '--frequency',
                        nargs='?',
                        type=int,
                        default=None,
                        help='Word frequency.')
    parser.add_argument('-a',
                        '--acro',
                        action='store_true',
                        help='Acronyms only.')
    args = parser.parse_args()

    with open(args.input, 'r') as f:
        plain = f.read()

    plain = remove_comments(plain)

    words = nltk.word_tokenize(plain)

    if args.acro:
        words = [w for w in words if is_acro(w)]

    print('%d unique words out of %d total words.' % (len(set(words)), len(words)))

    text = nltk.Text(words)

    if args.concordance is not None:
        text.concordance(args.concordance)
        return

    if args.dispersion is not None:
        text.dispersion_plot(args.dispersion)
        return

    if args.frequency is not None:
        freq = FreqDist(text)
        for i, (word, count) in enumerate(freq.most_common(args.frequency)):
            print('%9d%9d %s' % (i, count, word))
        freq.plot(args.frequency)
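
The function above assumes argparse is imported as ap and that remove_comments and is_acro are helpers defined elsewhere in the same file. A hedged sketch of the assumed imports and a hypothetical invocation (the script name is an assumption):

import argparse as ap
import nltk
from nltk import FreqDist

# hypothetical command line: print the 20 most frequent acronyms in report.txt
#   python wordstats.py --input report.txt --acro --frequency 20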
Example #6
# ◑ Write a function that takes a list of words (containing duplicates) and returns a list of words (with no duplicates) sorted by decreasing frequency. E.g. if the input list contained 10 instances of the word table and 9 instances of the word chair, then table would appear before chair in the output list.

from nltk.book import FreqDist


words = ['this', 'is', 'my', 'list', 'of', 'list', 'of', 'list', 'is', 'this', 'of', 'list', 'of', 'list', 'of', 'list', 'of', 'list', 'of', 'words']

fdist = FreqDist(words)
# most_common() already yields (word, count) pairs in decreasing order of frequency,
# so the words only need to be unpacked, not re-sorted
answer = [word for word, count in fdist.most_common()]
print(answer)
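
The exercise asks for a function; a minimal sketch that wraps the same idea (the function name is an assumption):

from nltk import FreqDist

def sorted_by_decreasing_freq(words):
    """Return the unique words, most frequent first."""
    return [word for word, count in FreqDist(words).most_common()]

print(sorted_by_decreasing_freq(words))   # most frequent words come first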
Example #7
import nltk
from nltk import FreqDist
from nltk.book import text1

lemmatizer = nltk.WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(w, 'v') for w in text1]
len(set(lemmatized))

# ANSWER1
ratio = len(set(text1)) / len(text1)

#ANSWER2
freq = FreqDist(text1)
freq['whale']
freq['Whale']

#ANSWER3
freq = FreqDist(text1)
freq.most_common(n=20)

#ANSWER4

freq = FreqDist(text1)
freq_150 = sorted([key for key in freq if len(key) > 5 and freq[key] > 150])
freq_150
#ANSWER 5

words = list(set(text1))
longest = ''
for word in words:
    if len(word) > len(longest):
        longest = word
(longest, len(longest))
Example #8
        tokens1 += word
    else:
        tokens1.append(word)


tokens1 = sorted(tokens1)

def lexical_diversity(text):
    return len(set(text)) / len(text)

def percentage(word, tokens):
    return 100 * (tokens.count(word) / len(tokens))


fdist = FreqDist(tokens1)
most_common_tokens = fdist.most_common(len(tokens1))   # keep tokens1 as a token list for the stats below

print(fdist.most_common(10))


print('the word %s makes up' % 'comme', percentage('comme', tokens1), '% of the text')
print('the lexical diversity of the text is', lexical_diversity(tokens1), '%')



from pylab import *

x = array([1, 3, 4, 6])
y = array([2, 3, 5, 1])
plot(x, y)


def question_three():
    dist = FreqDist(text1)
    return dist.most_common(20)


def question_eight():
    pos_tags = nltk.pos_tag(moby_tokens)
    pos_freq = FreqDist([pos_tag for (word, pos_tag) in pos_tags])
    return pos_freq.most_common(5)
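
question_three and question_eight assume that text1 and moby_tokens already exist. A minimal hedged setup, treating text1 from nltk.book as the Moby Dick text and reusing its tokens:

import nltk
from nltk import FreqDist
from nltk.book import text1          # may require nltk.download('book')

# hypothetical setup: moby_tokens as the raw token list of the same text
moby_tokens = list(text1)

# pos_tag may additionally need nltk.download('averaged_perceptron_tagger')
print(question_three())   # 20 most common tokens
print(question_eight())   # 5 most common part-of-speech tags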