Example #1
# 18. ◑ Generate some statistics for tagged data to answer the following questions:
# a. What proportion of word types are always assigned the same part-of-speech tag?
# b. How many words are ambiguous, in the sense that they appear with at least two tags?
# c. What percentage of word tokens in the Brown Corpus involve these ambiguous words?

import nltk

brown = nltk.corpus.brown.tagged_words()

# Condition on the word type; each condition's FreqDist counts its tags.
cfd = nltk.ConditionalFreqDist(brown)

tot_types = len(cfd)
print(tot_types)

# a. Word types that always receive the same tag.
# (equivalent loop formulation)
# onetag = 0
# for w in cfd:
#     if len(cfd[w]) == 1:
#         onetag += 1
onetag = len([w for w in cfd if len(cfd[w]) == 1])
print(onetag)

print(onetag / tot_types * 100)

# b. Word types that appear with at least two tags.
ambig = [w for w in cfd if len(cfd[w]) > 1]
print(len(ambig))
Example #2
# ◑ Obtain some tagged data for another language, and train and evaluate a variety of taggers on it. If the language is morphologically complex, or if there are any orthographic clues (e.g. capitalization) to word classes, consider developing a regular expression tagger for it (ordered after the unigram tagger, and before the default tagger). How does the accuracy of your tagger(s) compare with the same taggers run on English data? Discuss any issues you encounter in applying these methods to the language.

import nltk
from nltk.corpus import floresta

text = floresta.words()
floresta_tagged_sents = floresta.tagged_sents()
floresta_tagged_words = floresta.tagged_words()
fd = nltk.FreqDist(text)
cfd = nltk.ConditionalFreqDist(floresta_tagged_words)
most_freq_words = fd.most_common(100)

# lookup tagger for likely tags
likely_tags = dict((word, cfd[word].max()) for (word, _) in most_freq_words)
baseline_tagger = nltk.UnigramTagger(model=likely_tags)

# trained unigram tagger
size = int(len(floresta_tagged_sents) * 0.9)
training_data = floresta_tagged_sents[:size]
test_data = floresta_tagged_sents[size:]

uni_tagger = nltk.UnigramTagger(training_data)
print(uni_tagger.evaluate(test_data))
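# The lookup (baseline) tagger built above can be scored the same way.
print(baseline_tagger.evaluate(test_data))

# The exercise also asks for a regular-expression tagger ordered after the
# unigram tagger and before the default tagger. A minimal sketch: the suffix /
# capitalization patterns and the 'n' default tag below are illustrative guesses
# for Portuguese, not a tuned tagger for the Floresta tagset.
patterns = [
    (r'.*mente$', 'adv'),            # adverbs ending in -mente
    (r'.*ção$', 'n'),                # nouns ending in -ção
    (r'.*(ar|er|ir)$', 'v-inf'),     # infinitive verbs
    (r'^[A-ZÁÉÍÓÚÂÊÔÃÕ]', 'prop'),   # capitalized words as proper nouns
    (r'^\d+([.,]\d+)?$', 'num'),     # numbers
]
default_tagger = nltk.DefaultTagger('n')
regexp_tagger = nltk.RegexpTagger(patterns, backoff=default_tagger)
combined_tagger = nltk.UnigramTagger(training_data, backoff=regexp_tagger)
print(combined_tagger.evaluate(test_data))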
Example #3
import nltk


def getkmer(s, k):
    """Split a string into its overlapping k-mers, separated by spaces."""
    result = ''
    for i in range(len(s) - k + 1):
        result = result + s[i:i + k] + ' '
    return result


all_sent = ''
word_train = open(r'./temp/word_train.txt', mode='r', encoding='utf-8')
for line in word_train.readlines():
    sent = getkmer(line.strip(), 2)
    all_sent = all_sent + sent

all_token = nltk.word_tokenize(all_sent)
bigram_kmer = nltk.bigrams(all_token)
#print(all_token)
#print(list(bigram_kmer))
cfd = nltk.ConditionalFreqDist(bigram_kmer)
print(len(cfd.conditions()))  #fd = cfd['ny']
# list_1mer() and dict_2mer() are helpers defined elsewhere (not shown here).
l_1mer = list_1mer()
l_2mer = dict_2mer(l_1mer)
mers = []
for mer in l_2mer:
    if mer not in cfd.conditions():
        mers.append(mer)
print(mers)
print(len(mers))

#fd.plot(10)
classfile = open(r'./input/tokens.txt', mode='r', encoding='utf-8')
scorefile = open(r'./output/scores.txt', mode='w', encoding='utf-8')
for line in classfile.readlines():
    kmer_string = nltk.word_tokenize(getkmer(line.strip(), 2))
Example #4
import nltk
from nltk.corpus import gutenberg, brown, state_union
from nltk.corpus import wordnet as wn

#2 Use the corpus module to explore austen-persuasion.txt. How many word tokens does this book have? How many word types?
austen_persuasion = gutenberg.words('austen-persuasion.txt')
print("Number of word tokens = ", len(austen_persuasion))
print("Number of word types = ", len(set(austen_persuasion)))

#3 Use the Brown corpus reader nltk.corpus.brown.words() or the Web text corpus reader nltk.corpus.webtext.words() to access some sample text in two different genres.
print(brown.categories())
news_data = brown.words(categories='news')
religion_data = brown.words(categories='religion')

#4 Read in the texts of the State of the Union addresses, using the state_union corpus reader. Count occurrences of men, women, and people in each document. What has happened to the usage of these words over time?
print(state_union.fileids())
# CFD over the State of the Union addresses: for each target word, count its
# occurrences by year (the exercise asks about "men", "women", and "people").
cfd = nltk.ConditionalFreqDist((target, fileid[:4])
                               for fileid in state_union.fileids()
                               for w in state_union.words(fileid)
                               for target in ['men', 'women', 'people']
                               if w.lower().startswith(target))
#cfd.plot()

#5 Investigate the holonym-meronym relations for some nouns. Remember that there are three kinds of holonym-meronym relation, so you need to use: member_meronyms(), part_meronyms(), substance_meronyms(), member_holonyms(), part_holonyms(), and substance_holonyms().
house = wn.synsets('house')
print(house)
house = wn.synset('house.n.01')
print(house.lemma_names())
print(house.definition())
print(house.examples())
print(house.member_meronyms())
print(house.part_meronyms())
print(house.substance_meronyms())
print(house.member_holonyms())
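# The exercise also lists part_holonyms() and substance_holonyms(); they can be
# inspected the same way (for this synset they may simply be empty lists).
print(house.part_holonyms())
print(house.substance_holonyms())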
Example #5
import os
import pickle
import nltk

# Load the cached bigram distribution if it is already pickled; otherwise build it from the lyrics.
if os.path.isfile('fdist.pkl'):
    f = open('fdist.pkl', 'rb')
    fdist = pickle.load(f)
    f.close()
else:
    f = open('lyrics.pkl', 'rb')
    data = pickle.load(f)
    f.close()
    lis = []
    cnt = 0
    for key in data:
        lis += list(data[key])
        cnt += 1
        print(cnt)

    bigram = list(nltk.bigrams(lis))
    fdist = nltk.ConditionalFreqDist(bigram)
    f = open('fdist.pkl', 'wb')
    pickle.dump(fdist, f, -1)
    f.close()

f = open('test_data_long.pkl', 'rb')
test_data = pickle.load(f)
f.close()

Ans = []
cnt = 0

for in_data, out_data in test_data:
    ans = ''.join(in_data)
    bg = in_data[-1]
    for _ in range(10):
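        # A possible continuation (assumption, hypothetical): extend the answer
        # with the most frequent successor of the current token, stopping if the
        # bigram model has never seen it.
        if not fdist[bg]:
            break
        bg = fdist[bg].max()
        ans += bg
    Ans.append(ans)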
Example #6
# `reader` is a categorized corpus reader and `generate_model` a helper
# (like the one in Example #16), both defined elsewhere in this project.
def model_from_bigrams():
    text = reader.words(categories='critique_a')
    bigrams = nltk.bigrams(text)
    cfd = nltk.ConditionalFreqDist(bigrams)
    generate_model(cfd, 'lui')
Example #7
        sum(pp_lengths) / len(pp_lengths),
        max(pp_lengths),
        sum(avp_lengths) / len(avp_lengths),
        max(avp_lengths),
        len(doc)
    ]


##############
##Perplexity##
##############

import nltk
from nltk.corpus import brown

freq_brown_1gram = nltk.FreqDist(brown.words())
len_brown = len(brown.words())

cfreq_brown_2gram = nltk.ConditionalFreqDist(nltk.bigrams(brown.words()))
cprob_brown_2gram = nltk.ConditionalProbDist(cfreq_brown_2gram,
                                             nltk.MLEProbDist)

brown_trigrams = nltk.ngrams(brown.words(), 3)
condition_pairs = (((w0, w1), w2) for w0, w1, w2 in brown_trigrams)
cfreq_brown_3gram = nltk.ConditionalFreqDist(condition_pairs)
cprob_brown_3gram = nltk.ConditionalProbDist(cfreq_brown_3gram,
                                             nltk.MLEProbDist)


def unigram_prob(word):
    return freq_brown_1gram[word] / len_brown


def bigram_prob(word1, word2):
    # MLE estimate of P(word2 | word1) from the Brown bigram counts.
    return cprob_brown_2gram[word1].prob(word2)
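# A minimal perplexity sketch using the bigram MLE model above (assumes the
# token list has at least two tokens; any unseen bigram has probability 0, so
# the perplexity becomes infinite):
import math


def bigram_perplexity(tokens):
    logprob = 0.0
    for w1, w2 in nltk.bigrams(tokens):
        p = bigram_prob(w1, w2)
        if p == 0:
            return float('inf')
        logprob += math.log(p)
    return math.exp(-logprob / (len(tokens) - 1))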
Example #8
def get_trigram_freq(tokens):
    tgs = list(nltk.trigrams(tokens))

    a, b, c = list(zip(*tgs))
    bgs = list(zip(a, b))
    return nltk.ConditionalFreqDist(list(zip(bgs, c)))
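# Example usage (a sketch): look up next-word counts conditioned on the two
# previous tokens (assumes `import nltk`, which the function above already needs).
tokens = "the quick brown fox jumps over the lazy dog".split()
trigram_cfd = get_trigram_freq(tokens)
print(trigram_cfd[('the', 'quick')].most_common())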
Example #9
def get_bigram_freq(tokens):
    bgs = list(nltk.bigrams(tokens))

    return nltk.ConditionalFreqDist(bgs)
Example #10
import nltk
from nltk.util import ngrams
from random import randint

convertedFile = []
bigrams = []

data = open("data2.csv", newline='', encoding='utf-8')
my_list = data.readlines()
for row in my_list:
    newVec = row.split("|")
    if len(newVec) == 7 and newVec[5] == "false":
        token = nltk.word_tokenize(newVec[1])
        convertedFile.append(token)
        bigrams.append(list(ngrams(token,2)))
#         bigrams.append(list(ngrams(token,3)))
            


# In[3]:


cfd = nltk.ConditionalFreqDist()

for each_list_of_gram in bigrams:
    for words in each_list_of_gram:
        condition = words[0]
        cfd[condition][words[1]]+=1
        


# In[4]:


numWords = randint(10, 40)

tweetArr = []
wordFreq = {}
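# A possible continuation (assumption, hypothetical): sample numWords successors
# from the conditional distribution, starting from a random seed word, to build
# a tweet-like string.
from random import choice

word = choice(list(cfd.conditions()))
for _ in range(numWords):
    tweetArr.append(word)
    if not cfd[word]:
        break
    word = choice(list(cfd[word].elements()))  # frequency-weighted pick

print(' '.join(tweetArr))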
Example #11
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 13 18:35:12 2018

@author: prathumarayatamanee
"""
# 18. Generate some statistics for tagged data to answer the following questions:
# 18: which words are ambiguous (i.e. tagged with two or more tags)?
from nltk.corpus import brown
import nltk

#setting things up
brown_tagged_words = brown.tagged_words(categories='news')
cfd = nltk.ConditionalFreqDist(brown_tagged_words)
conditions = cfd.conditions()

# creates a new array of word types that only have one distinct word tag
mono_tags = [condition for condition in conditions if len(cfd[condition]) == 1]

# 1. What proportion of word types are always assigned the same part-of-speech tag?
# answers number one - the proportion of word types that have only one POS tag.
proportion_mono_tags = len(mono_tags) / len(conditions)
#print(proportion_mono_tags)
#print('_______________________________________')

# 2. How many words are ambiguous, in the sense that they appear with at least two tags?
# answers number two - the number of ambiguous words.

freq2D = nltk.ConditionalFreqDist(
    nltk.corpus.brown.tagged_words(tagset='universal'))
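# A sketch of the remaining answers using the universal-tagset distribution:
# count the ambiguous word types and the share of tokens they account for.
ambiguous = [w for w in freq2D if len(freq2D[w]) >= 2]
print(len(ambiguous))

# 3. What percentage of word tokens involve these ambiguous words?
tagged = nltk.corpus.brown.tagged_words(tagset='universal')
ambiguous_set = set(ambiguous)
ambiguous_tokens = sum(1 for (w, _) in tagged if w in ambiguous_set)
print(ambiguous_tokens / len(tagged) * 100)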
Example #12
import nltk
from nltk.corpus import brown

# simplify_tags is the pre-NLTK-3 API; NLTK 3 uses tagset='universal' instead
# (with different tag names).
brown_news_tagged = brown.tagged_words(categories='news', simplify_tags=True)
tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged)
tag_fd.keys()

#find most frequent tags before a noun
word_tag_pairs = nltk.bigrams(brown_news_tagged)
list(nltk.FreqDist(a[1] for (a, b) in word_tag_pairs if b[1] == 'N'))

#find most frequent verbs in news text
wsj = nltk.corpus.treebank.tagged_words(simplify_tags=True)
word_tag_fd = nltk.FreqDist(wsj)
[word + '/' + tag for (word, tag) in word_tag_fd if tag.startswith('V')]

#build a conditional frequency distribution in which words are conditions, can find most possible tags given a word
cfd1 = nltk.ConditionalFreqDist(wsj)
cfd1['field'].keys()
cfd1['cut'].keys()

#use tag as condition, can find most possible words given a tag
cfd2 = nltk.ConditionalFreqDist((tag, word) for (word, tag) in wsj)
cfd2['VN'].keys()

#explore the tags after 'often'
brown_lrnd_tagged = brown.tagged_words(categories='learned',
                                       simplify_tags=True)
tags = [b[1] for (a, b) in brown_lrnd_tagged if a[0] == 'often']
fd = nltk.FreqDist(tags)
fd.tabulate()

Example #13
 def create_trigram(self):
     self.trigram_model = list(ngrams(self.tokens, 3))
     trigrams_as_bigrams = []
     trigrams_as_bigrams.extend([((t[0],t[1]), t[2]) for t in self.trigram_model])
     self.trigram_freq = nltk.ConditionalFreqDist(trigrams_as_bigrams)
Example #14
 def create_bigram(self):
     self.bigram_model = list(ngrams(self.tokens, 2))
     self.bigram_freq = nltk.ConditionalFreqDist(self.bigram_model)
Example #15
#This is for my corpus (indian)

import nltk
from nltk.corpus import indian
import matplotlib  # matplotlib must be installed for cfd.plot() to work

print(indian.raw())
print(indian.fileids())
print(indian.sents())

word1 = 'country'
word2 = 'city'
cfd = nltk.ConditionalFreqDist((target, fileid[:4])
                               for fileid in indian.fileids()
                               for w in indian.words(fileid)
                               for target in [word1, word2]
                               if w.lower().startswith(target))
cfd.plot()
Example #16
import nltk
# Natural Language Toolkit: code_random_text


def generate_model(cfdist, word, num=15):
    # Emit the current word, then greedily move to its most likely successor.
    for i in range(num):
        print(word, end=' ')
        word = cfdist[word].max()


text = nltk.corpus.genesis.words('english-kjv.txt')
bigrams = nltk.bigrams(text)
cfd = nltk.ConditionalFreqDist(bigrams)  # [_bigram-condition]
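# Kick off the generation with a seed word; 'living' is the NLTK book's example
# seed for this corpus (any word from the text would do).
generate_model(cfd, 'living')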
Example #17
import nltk

# Finding the transition probabilities
tagged_words = []
all_tags = []
#nltk.corpus.brown.tagged_sents(tagset='universal')[0]

for sent in nltk.corpus.brown.tagged_sents(tagset='universal'):
    tagged_words.append(("START", "START"))
    all_tags.append("START")
    for (word, tag) in sent:
        all_tags.append(tag)
        tagged_words.append((tag, word))
    tagged_words.append(("END", "END"))
    all_tags.append("END")

cfd_tags = nltk.ConditionalFreqDist(nltk.bigrams(all_tags))
cpd_tags = nltk.ConditionalProbDist(cfd_tags, nltk.MLEProbDist)

print("Count('DET','NOUN') =", cfd_tags['DET']['NOUN'])
print("P('NOUN | 'DET') =", cpd_tags['DET'].prob('NOUN'))

# Finding the emission probabilities
cfd_tagwords = nltk.ConditionalFreqDist(tagged_words)

cpd_tagwords = nltk.ConditionalProbDist(cfd_tagwords, nltk.MLEProbDist)

print("Count('DET','the') =", cfd_tagwords['DET']['the'])
print("P('the'|'DET')=", cpd_tagwords['DET'].prob('the'))


#p56
Example #18
# -*- coding: utf-8 -*-
"""
@author: WQ
"""
# 2. Use the corpus module to process austen-persuasion.txt. How many word tokens does the book have? How many word types?
import nltk
nltk.corpus.gutenberg.fileids()
tmp1 = nltk.corpus.gutenberg.words('austen-persuasion.txt')
print('Number of word tokens:', len(tmp1))
print('Number of word types:', len(set(tmp1)))

# 8. Define a conditional frequency distribution over the Names Corpus showing which initial letters are more frequent for male names than for female names.
from nltk.corpus import names
cfd = nltk.ConditionalFreqDist(
    (fileid, name[0])
    for fileid in names.fileids()
    for name in names.words(fileid)
)
cfd.plot()

# 9. Pick two texts and study the differences between them in vocabulary, vocabulary richness, genre, etc.
# Compare the vocabulary size and lexical diversity of the two texts.
from nltk.book import *

print(len(set(text1)))      # vocabulary size of Moby Dick
print(len(set(text2)))      # vocabulary size of Sense and Sensibility

def lexical_diversity(text):
    return len(text) / float(len(set(text)))

print(lexical_diversity(text1))
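# For the comparison the exercise asks about, the same measure for the second text:
print(lexical_diversity(text2))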
Example #19
# ű u'\u0171'

vowel_to_ascii = [(u'á', 'a'), (u'é', 'e'), (u'\xf3', 'o'), (u'\u0151', 'o'),
                  (u'\u0171', 'u')]

# this listcomp doesn't work, and i don't feel like figuring out why
#ascii_bigrams = [ b.replace(unicode_vowel, ascii_vowel)
#                  for unicode_vowel, ascii_vowel in vowel_to_ascii
#                  for b in bigrams ]
# and watch out for modifying lists you're looping over! C-style indexing is less confusing here.
ascii_bigrams = list(bigrams)  # a copy, so the originals are preserved for the comparison below
for i in range(len(ascii_bigrams)):
    ascii_bigram = ascii_bigrams[i]
    for unicode_vowel, ascii_vowel in vowel_to_ascii:
        ascii_bigram = ascii_bigram.replace(unicode_vowel, ascii_vowel)
    ascii_bigrams[i] = ascii_bigram

for i in range(len(bigrams)):
    if bigrams[i] != ascii_bigrams[i]:
        print(bigrams[i], ascii_bigrams[i])

print(nltk.ConditionalFreqDist(ascii_bigrams).items())
nltk.ConditionalFreqDist(ascii_bigrams).tabulate()

# results
#      a    e    i    u
# a    0    1   12    1
# e    3    3    9    0
# i    8    4    3    0
# o    1    0    1    0
# u    0    1    0    0
Example #20
import re
import json
import nltk

# `splitlemma(line)` is assumed to be defined elsewhere in the original module;
# it splits a token line into a (form, lemma) pair.


def analyze_single_file(fname, fixflag=False):
    total_tokens = 0
    lines = {}

    # Meta info line
    lines["meta"] = {}
    # Non-alphabetical tokens
    lines["non_alpha_form"] = {}
    # Unreadable lemmas (Non-utf8 lemma)
    list_non_utf8 = []
    # Form-Lemma pairs
    fl_pairs = []

    if fixflag:
        filefix = open(fname + '-fix', "w")

    with open(fname, encoding='latin') as file:
        for index, line in enumerate(file):
            # Read the file in latin and encode it again to use the readline but bypass the utf-8 error
            line_bytes = line.encode('latin')
            while True:
                try:
                    line = line_bytes.decode('utf-8')
                    break
                except UnicodeDecodeError as inst:
                    # After we tackle the UnicodeDecodeError it should be all fine.
                    old_char = line_bytes[inst.args[2]]
                    line_bytes = line_bytes.replace(
                        bytes([old_char]),
                        bytes([old_char - (ord('a') - ord('A'))]))
                    # Finish save!
                    # Do the statistic
                    list_non_utf8 += [(index, old_char)]

            # Filter out the lines of the begin|end document meta info
            if bool(re.match(r"#(?:begin|end)\sdocument", line)):
                lines["meta"][index] = line
                if fixflag:
                    filefix.writelines(line)
                continue

            # Start to count the tokens
            total_tokens += 1

            form, lemma = splitlemma(line)

            # lowercase lemma
            lemma = lemma.lower()

            if not form.isalpha() and lemma.isalpha():
                lines["non_alpha_form"][index] = line

            fl_pairs += [(lemma, form)]
            if fixflag:
                filefix.writelines(form + '\t' + lemma + '\n')

    if fixflag:
        filefix.close()

    print("===")
    print(fname)
    # Show the lines statistics
    print(list((x, len(lines[x])) for x in lines))

    print("How many lines contain non-utf8:")
    print(len(nltk.Index([(x, y) for x, y in list_non_utf8])))

    print("How many kinds of first byte error non-utf8")
    print(sorted(list(nltk.Index([(hex(y), x) for x, y in list_non_utf8]))))

    print("---")
    # Start Lemma statistics
    print("Total Terms: {}".format(len(fl_pairs)))
    idx = nltk.Index(fl_pairs)
    list_lemma_freq = [(x, len(idx[x])) for x in idx]
    list_lemma_freq.sort(key=lambda x: x[1], reverse=True)
    topn = 20
    print("Lemma frequncy Top{}:".format(topn))
    print(list_lemma_freq[:topn])
    print(sum([x[1] for x in list_lemma_freq[:topn]]) /
          len(fl_pairs))  # Top20 40.5% Top100 53%

    idx_set = nltk.Index(set(fl_pairs))
    list_form_uni = [x for x in idx_set if len(idx[x]) == 1]
    print("Lemma just has one form ratio:")  # About 50%
    print(len(list_form_uni) / len(idx_set))

    print("Lemma et form est identical:")
    print(len([x
               for (x, y) in fl_pairs if x == y]) / len(fl_pairs))  # About 68%

    cfd = nltk.ConditionalFreqDist(fl_pairs)
    print("Show lemma 'be' show in which form and how many times for each")
    print(json.dumps(cfd['be'], ensure_ascii=False))

    list_lemma_forme_freq = [(x, len(cfd[x])) for x in cfd]
    list_lemma_forme_freq.sort(key=lambda x: x[1], reverse=True)

    return fl_pairs
Example #21
import random
import nltk


def sent_gen():
    def generate(word):
        sentence = []
        sentence.append(word)

        for i in range(random.randint(8, 16)):
            #print(word)
            if word in cfd:
                word = random.choice(list(cfd[word].keys()))
                sentence.append(word.strip())
            else:
                break
        return sentence

    def connect(sentence, connectors):

        for i in range(len(sentence)):
            remove = random.random()
            if len(sentence[i]) in (2, 3) and (
                    remove < 0.3
            ):  # 30% chance of replacing a short word with a standard Shakespeare connector
                sentence[i] = random.choice(connectors)

        return sentence

    def format(sentence):

        print(sentence)
        run = True
        i = 1
        if (ord(sentence[0][0]) in list(range(34, 65))):
            del sentence[0]
        if sentence[0].islower() == True:  #Capitalize the first word
            sentence[0] = sentence[0].capitalize()

        while run:
            try:
                #print(sentence[i][0])
                if (ord(sentence[i][0]) in list(range(34,46))) or\
                    (ord(sentence[i][0]) in list(range(47,65))) or\
                    (ord(sentence[i][0]) in list(range(91,97))):
                    del sentence[i]
                    continue

                if sentence[i - 1] in (".", "?", "!"):
                    sentence[i] = sentence[i].capitalize()
                else:
                    if len(sentence[i]) < 5:
                        sentence[i] = sentence[i].lower(
                        )  #If the word length is less than 4(probably not a name), set to lowercase

                if i == len(sentence) - 1:
                    run = False
                i += 1
            except:
                break

        if sentence[-1][0] not in ".?!":  # if the last element is not sentence-final punctuation, append '.'
            sentence.append(".")

        return sentence

    #------------------------------------------------------------------------------
    nltk.corpus.gutenberg.fileids()

    text = nltk.corpus.gutenberg.words('shakespeare-hamlet.txt')

    bigrams = nltk.bigrams(text)  #builds a list of consecutive word pairs
    cfd = nltk.ConditionalFreqDist(bigrams)  #tabulates/counts each bigram

    word = random.choice(text)
    connectors = ['thou', 'thee', 'thou', 'thine',
                  'thy']  #Hardcoded words/phrases

    love_phrases = [
        "Thou art more lovely and more temperate.",
        "There’s beggary in love that can be reckoned.", "Love sought is good",
        "My heart fly to your service.", "One half of me is yours.",
        "Who ever loved that loved not at first sight?.",
        "Men’s vows are women’s traitors.", "You have witchcraft in your lips."
    ]

    sentence = generate(word)
    sentence = connect(sentence, connectors)
    sentence = format(sentence)

    sentpt0 = "Here is a random piece of Shakespeare:"
    sentpt1 = ' '.join(sentence)
    sentpt2 = random.choice(love_phrases)
    sentpt3 = "Happy Valentines day."

    return sentpt0 + "\n\n" + sentpt1 + "\n" + sentpt2 + "\n\n" + sentpt3 + "\n"
Example #22
import pandas as pd
import random 
import nltk
df=pd.read_csv('cleanedtweets.csv')
def f(x):
    return x.split()

def makepairs(arr):
    pairs=[]
    for i in range(len(arr)):
        if i<len(arr)-1:
            temp=(arr[i],arr[i+1])
            pairs.append(temp)
    return pairs

def generate(cfd, word='the', num=50):
    for i in range(num):
        # Build a list in which each successor appears as many times as it was
        # observed, so a uniform random pick is frequency-weighted.
        arr = []
        for j in cfd[word]:
            for k in range(cfd[word][j]):
                arr.append(j)
        word = arr[int(len(arr) * random.random())]
        print(word, end=' ')
pairs = makepairs(f(' '.join(df['tweets'])))  # split the tweets into words so the CFD conditions on single words
cfd=nltk.ConditionalFreqDist(pairs)
generate(cfd)
Example #23
import sys
import nltk

# `unzip_corpus(input_file)` is assumed to be defined elsewhere; it extracts the
# zip archive and returns its documents as a list of strings.


def process_corpus(corpus_name):
    print("Corpus to examine: " + corpus_name)
    input_file = corpus_name + ".zip"
    corpus_contents = unzip_corpus(input_file)
    sentences = []
    words = []
    pos_results = open(corpus_name + "-pos.txt", 'w+')
    cur_sentence = []
    all_pos = []
    for entry in corpus_contents:
        sentences.append(nltk.sent_tokenize(entry))
    for story in sentences:
        for sent in story:
            word_sent = nltk.word_tokenize(sent)
            words.extend(word_sent)
            cur_sentence = nltk.pos_tag(word_sent)
            all_pos.extend(cur_sentence)
            for pair in cur_sentence:
                pos_results.write(pair[0] + "/" + pair[1])
                pos_results.write('\n')
        pos_results.write('\n')
    print("Number of words: " + str(len(words)))
    i = 0
    for word in words:
        words[i] = word.casefold()
        i += 1
    print("The vocabulary size is: " + str(len(set(words))))
    most_common = nltk.FreqDist(pos for (word, pos) in all_pos)
    freq_list = most_common.most_common()
    print("The most common part of speech is " + str(freq_list[0][0]) +
          " which occurs " + str(freq_list[0][1]) + " times.")
    print("")
    word_dist = nltk.FreqDist(word for word in words)
    word_freq = word_dist.most_common()
    freq_results = open(corpus_name + "-word-freq.txt", 'w+')
    for pair in word_freq:
        freq_results.write(str(pair))
        freq_results.write('\n')
    chart_freq = nltk.ConditionalFreqDist(
        (word.casefold(), tag) for (word, tag) in all_pos)
    con_freq = nltk.ConditionalFreqDist(
        (tag, word.casefold()) for (word, tag) in all_pos)
    copy = sys.stdout
    sys.stdout = open(corpus_name + "-pos-word-freq.txt", 'w+')
    chart_freq.tabulate()
    sys.stdout = copy
    common_words_by_pos = [
        con_freq['NN'].most_common()[0], con_freq['VBD'].most_common()[0],
        con_freq['JJ'].most_common()[0], con_freq['RB'].most_common()[0]
    ]
    text_words = nltk.Text(words)
    print("The most common Noun is " + common_words_by_pos[0][0] +
          ". Similar words include:")
    text_words.similar(common_words_by_pos[0][0])
    print("The most common Past Tense Verb is " + common_words_by_pos[1][0] +
          ". Similar words include:")
    text_words.similar(common_words_by_pos[1][0])
    print("The most common Adjective is " + common_words_by_pos[2][0] +
          ". Similar words include:")
    text_words.similar(common_words_by_pos[2][0])
    print("The most common Adverb is " + common_words_by_pos[3][0] +
          ". Similar words include:")
    text_words.similar(common_words_by_pos[3][0])
    print("")
    print("The found collocations are:")
    text_words.collocations()

    pass
Example #24
import nltk
from nltk.corpus import brown


def count_words_by_genre():
    cfd = nltk.ConditionalFreqDist((genre, word)
                                   for genre in brown.categories()
                                   for word in brown.words(categories=genre))
    return cfd
Example #25
import nltk

corpus = u"<s> hello how are you doing ? Hope you find the book interesting. </s>".split()
sentence = u"<s> how are you doing </s>".split()
vocabulary = set(corpus)
print(len(vocabulary))
cfd = nltk.ConditionalFreqDist(nltk.bigrams(corpus))
print([cfd[a][b] for (a, b) in nltk.bigrams(sentence)])
print([cfd[a].N() for (a, b) in nltk.bigrams(sentence)])
print([cfd[a].freq(b) for (a, b) in nltk.bigrams(sentence)])
print([1 + cfd[a][b] for (a, b) in nltk.bigrams(sentence)])
print([len(vocabulary) + cfd[a].N() for (a, b) in nltk.bigrams(sentence)])
print([
    1.0 * (1 + cfd[a][b]) / (len(vocabulary) + cfd[a].N())
    for (a, b) in nltk.bigrams(sentence)
])
cpd_mle = nltk.ConditionalProbDist(cfd, nltk.MLEProbDist, bins=len(vocabulary))
print([cpd_mle[a].prob(b) for (a, b) in nltk.bigrams(sentence)])
cpd_laplace = nltk.ConditionalProbDist(cfd,
                                       nltk.LaplaceProbDist,
                                       bins=len(vocabulary))
print([cpd_laplace[a].prob(b) for (a, b) in nltk.bigrams(sentence)])
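# Sketch: the smoothed probability of the whole test sentence is simply the
# product of the per-bigram Laplace estimates computed above.
sentence_prob = 1.0
for (a, b) in nltk.bigrams(sentence):
    sentence_prob *= cpd_laplace[a].prob(b)
print(sentence_prob)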
Example #26
import nltk
import pickle

fact_ru_eval_path = "factrueval-2016-tokenized/factrueval-2016-tokenized.txt"
idiom_path = "idioma1-tokenized/idioma1-tokenized.txt"
libru_path = "librusec-tokenized/librusec-tokenized.txt"
wiki_2017_path = "ruwiki-2017-tokenized/ruwiki-2017-tokenized.txt"
without_wiki2017 = [fact_ru_eval_path, idiom_path, libru_path]


def _read_file(path):
    with open(path, 'r', encoding='utf-8') as file:
        return [j for i in file.read().split("\n") for j in i.split()]


list_of_words = []
for path in without_wiki2017:
    list_of_words.extend(_read_file(path))

freq = nltk.FreqDist(list_of_words)
with open('no_wiki_ru_.pickle', 'wb') as f:
    pickle.dump(freq, f)

wiki = _read_file(wiki_2017_path)
freq_wiki = nltk.FreqDist(wiki)
with open('wiki_ru_.pickle', 'wb') as f_w:
    pickle.dump(freq_wiki, f_w)

cfreq = nltk.ConditionalFreqDist(nltk.bigrams(list_of_words))
with open('no_wiki_ru_cfreq.pickle', 'wb') as cf:
    pickle.dump(cfreq, cf)
Example #27
import re
import nltk

# Keep any word-initial or word-final vowel sequence plus all consonants,
# i.e. drop word-internal vowels (the NLTK book's "compress" example).
regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'


def compress(word):
    pieces = re.findall(regexp, word)
    return ''.join(pieces)


compress('IiIiIi')
english_udhr = nltk.corpus.udhr.words('English-Latin1')
english_tmp = [compress(w) for w in english_udhr]
len(english_tmp)
len(''.join(english_tmp))
re.findall(r'[AEIOUaeiou]', ''.join(english_tmp))
print(nltk.tokenwrap(compress(w) for w in english_udhr[:75]))

# P111: extract all consonant-vowel pairs and count how often each occurs in the word list
rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')
cvs = [cv for w in rotokas_words for cv in re.findall(r'[PTKSVRptksvr][AEIOUaeiou]', w.lower())]
cfd = nltk.ConditionalFreqDist(cvs)
cfd.tabulate()

# Build an index from each consonant-vowel pair to the words that contain it
cv_word_pairs = [(cv, w) for w in rotokas_words for cv in re.findall(r'[ptksvr][aeiou]', w)]
cv_index = nltk.Index(cv_word_pairs)
cv_index['su']
cv_index['po']


# P112: find the stem of a word (ignore the ending, keep only the stem)
def stem(word):
    for suffix in ['ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']:
        if word.endswith(suffix):
            return word[:-len(suffix)]
    return word
Example #28
import nltk
from nltk.corpus import udhr

text = udhr.words('Hungarian_Magyar-Latin1')

def is_vowel(letter):
	"""Checks to see if a letter is a vowel."""
	if letter in "aeiou":
		return True
	else:
		return False

def pull_out_vowels(word):
	"""Takes in a word and pulls out all vowels for it."""
	vowels = []
	for letter in word:
		if is_vowel(letter):
			vowels.extend(letter)
	vowels = nltk.bigrams(vowels)
	return vowels

def vowels_for_all_words(text):
	"""pulls out all vowels for all words."""
	vowels = []

	for word in text:
		vowels.extend(pull_out_vowels(word))

	return vowels

vowel_bigrams = vowels_for_all_words(text)

cfd = nltk.ConditionalFreqDist(vowel_bigrams)
cfd.tabulate()
Example #29
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jan  2 17:37:33 2018

@author: Mohnish_Devadiga
"""

import nltk

alice = nltk.corpus.gutenberg.words("carroll-alice.txt")

alice_norm = [word.lower() for word in alice if word.isalpha()]

alice_tag = nltk.pos_tag(alice_norm, tagset="universal")

alice_cfd = nltk.ConditionalFreqDist(alice_tag)

print(alice_cfd["over"]["ADV"])
print(alice_cfd["gloves"]["ADV"])
print(alice_cfd["savage"]["ADV"])
Example #30
 def __init__(self, tagged_words):
     self.CndFreqDist = nltk.ConditionalFreqDist(
         (tag, word) for (word, tag) in tagged_words
     )  #a conditional frequency distribution that holds all required information