def initialization(self):
    """Initialize the model."""
    corpusName = 'emma'
    genre = 'news'
    corpus = self.getCorpus(corpusName, genre)
    tagged_corpus = self.getCorpus(corpusName, genre, tagged=True)
    # n-gram frequency distributions
    self.trigrams = nltk.trigrams(tagged_corpus)
    self.tricfd = nltk.ConditionalFreqDist()
    self.trifd = nltk.FreqDist()
    self.postrifreq = nltk.ConditionalFreqDist()
    for ((word2, tag2), (word1, tag1), (word0, tag0)) in self.trigrams:
        self.tricfd[word2, word1][word0] += 1
        self.trifd[(word2, word1, word0)] += 1
        self.postrifreq[tag2, tag1][tag0] += 1
    self.bicfd = nltk.ConditionalFreqDist(nltk.bigrams(corpus))
    self.bifd = nltk.FreqDist(nltk.bigrams(corpus))
    self.unifd = nltk.FreqDist(corpus)
    self.taggedFreq = nltk.FreqDist(tagged_corpus)
    # n-gram probability distributions
    self.tricpd = nltk.ConditionalProbDist(self.tricfd, nltk.ELEProbDist)
    self.tripd = nltk.ELEProbDist(self.trifd)
    self.bicpd = nltk.ConditionalProbDist(self.bicfd, nltk.ELEProbDist)
    self.bipd = nltk.ELEProbDist(self.bifd)
    self.unipd = nltk.ELEProbDist(self.unifd)
    # POS n-gram
    self.postriprob = nltk.ConditionalProbDist(self.postrifreq, nltk.ELEProbDist)
def train_fullset(x):
    global full_tags
    global full_words
    global full_tag_set
    global full_cpd_word_tag
    global full_cpd_tags
    global full_cfd_word_tag
    global full_cfd_tags
    global full_training_set
    full_training_set_words = []
    for sent in full_training_set:
        if x == 0:
            full_training_set_words.append(('<s>', '<s>'))
        full_training_set_words.extend([(tag, word) for (word, tag) in sent])
        if x == 0:
            full_training_set_words.append(('</s>', '</s>'))
    full_tags = [tag for (tag, word) in full_training_set_words]
    full_words = [word for (tag, word) in full_training_set_words]
    full_tag_set = set(full_tags)
    full_cfd_word_tag = nltk.ConditionalFreqDist(full_training_set_words)
    full_cfd_tags = nltk.ConditionalFreqDist(nltk.bigrams(full_tags))
    full_obs_set = []
    for sent in full_training_set:
        full_obs_set.append([word for (word, tag) in sent])
    if x == 0:
        for i in range(len(full_obs_set)):  # range, not the Python 2 xrange
            full_obs_set[i].append('</s>')
            full_obs_set[i].insert(0, '<s>')
    full_cpd_word_tag = nltk.ConditionalProbDist(full_cfd_word_tag, nltk.MLEProbDist)
    full_cpd_tags = nltk.ConditionalProbDist(full_cfd_tags, nltk.MLEProbDist)
def train():
    train_words = [i.strip() for i in open('train.txt', 'r').readlines()]
    train_words.append('')
    train = []
    temp_sent = []
    for i in train_words:
        if not i:
            if temp_sent:
                train.append(temp_sent)
            temp_sent = []
        else:
            temp_word = i.split()
            temp_word = (temp_word[0],
                         map_tag(temp_word[1]) + ' ' + temp_word[2] + ' ' + temp_word[3])
            temp_sent.append(temp_word)
    tags_words = []
    for sent in train:
        tags_words.append(('START', 'START'))
        tags_words.extend([(tag, word) for (word, tag) in sent])
        tags_words.append(('END', 'END'))
    cfd_tagwords = nltk.ConditionalFreqDist(tags_words)
    cpd_tagwords = nltk.ConditionalProbDist(cfd_tagwords, nltk.MLEProbDist)
    universal_tags = [tag for (tag, word) in tags_words]
    cfd_tags = nltk.ConditionalFreqDist(nltk.bigrams(universal_tags))
    cpd_tags = nltk.ConditionalProbDist(cfd_tags, nltk.MLEProbDist)
    distinct_tags = set(universal_tags)  # was missing, but returned below
    return (cpd_tagwords, cpd_tags, distinct_tags)
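# Usage sketch for the train() above (hypothetical: assumes train.txt exists
# in the four-column format it expects). sequence_prob scores one tagged
# sentence under the returned distributions, mirroring the START/END padding
# used during training; the tags passed in must be the same composite tags
# train() builds.
cpd_tagwords, cpd_tags, distinct_tags = train()

def sequence_prob(words, tags):
    # P(tags, words) = prod_i P(tag_i | tag_{i-1}) * P(word_i | tag_i)
    prob = 1.0
    prev = 'START'
    for word, tag in zip(words, tags):
        prob *= cpd_tags[prev].prob(tag) * cpd_tagwords[tag].prob(word)
        prev = tag
    return prob * cpd_tags[prev].prob('END')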
def train(train_file, rareWords, replaceRare, useTrigrams, trigramBackoff):
    """
    Read the file and populate the various frequency and conditional
    frequency distributions and build the HMM off these data structures.
    """
    acc = Accumulator(rareWords, replaceRare, useTrigrams)
    reader = nltk.corpus.reader.TaggedCorpusReader(".", train_file)
    for sent in reader.tagged_sents():
        unigrams = pad(sent)
        acc.addSentence(unigrams, normalizeRareWord)
    if useTrigrams:
        if trigramBackoff:
            backoffCPD = calculateBackoffTransCPD(acc.tagsFD, acc.transitionsCFD,
                                                  acc.transitions2CFD)
            return nltk.HiddenMarkovModelTagger(
                list(acc.words), list(acc.tags),
                backoffCPD,
                nltk.ConditionalProbDist(acc.outputsCFD, nltk.ELEProbDist),
                nltk.ELEProbDist(acc.priorsFD))
        else:
            return nltk.HiddenMarkovModelTagger(
                list(acc.words), list(acc.tags),
                nltk.ConditionalProbDist(acc.transitions2CFD, nltk.ELEProbDist,
                                         len(acc.transitions2CFD.conditions())),
                nltk.ConditionalProbDist(acc.outputsCFD, nltk.ELEProbDist),
                nltk.ELEProbDist(acc.priorsFD))
    else:
        return nltk.HiddenMarkovModelTagger(
            list(acc.words), list(acc.tags),
            nltk.ConditionalProbDist(acc.transitionsCFD, nltk.ELEProbDist,
                                     len(acc.transitionsCFD.conditions())),
            nltk.ConditionalProbDist(acc.outputsCFD, nltk.ELEProbDist),
            nltk.ELEProbDist(acc.priorsFD))
def train(self, tagged_sentences):
    # call super so we'll know which words are seen in training
    super(HMM, self).train(tagged_sentences)
    # get (tag, word) and (tag, tag) pairs for each sentence
    tag_word = []
    tag_tag = []
    for sent in tagged_sentences:
        tag_word.extend([(tag, word) for (word, tag) in sent])
        tags = [tag for (_word, tag) in sent]
        # add <s> and </s>
        tags.insert(0, self.INITIAL)
        tags.append(self.FINAL)
        tag_tag.extend(nltk.bigrams(tags))
    # get counts as conditional frequency distributions
    tag_word_cfd = nltk.ConditionalFreqDist(tag_word)
    tag_tag_cfd = nltk.ConditionalFreqDist(tag_tag)
    # get probabilities as conditional probability distributions
    pd_factory = lambda fd: WittenBellProbDist(fd, 50000)
    tag_word_cpd = nltk.ConditionalProbDist(tag_word_cfd, pd_factory)
    tag_tag_cpd = nltk.ConditionalProbDist(tag_tag_cfd, MLEProbDist)
    # add HMM states using these distributions
    make_dict = lambda pd: dict([(sample, pd.prob(sample)) for sample in pd.samples()])
    self.add(self.INITIAL, make_dict(tag_tag_cpd[self.INITIAL]))
    for tag in tag_word_cpd.conditions():
        tag_tag_dict = make_dict(tag_tag_cpd[tag])
        tag_word_dict = make_dict(tag_word_cpd[tag])
        tag_word_dict[self.UNK] = tag_word_cpd[tag].prob(self.UNK)
        self.add(tag, tag_tag_dict, tag_word_dict)
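# A quick, self-contained check of the WittenBellProbDist(fd, 50000) choice
# above: Witten-Bell reserves probability mass for unseen events in
# proportion to the number of observed types. Toy counts, illustrative only.
import nltk
from nltk.probability import WittenBellProbDist

fd = nltk.FreqDist({'cat': 3, 'dog': 1})
wb = WittenBellProbDist(fd, bins=50000)
print(wb.prob('cat'))          # discounted below the MLE value of 0.75
print(wb.prob('unseen-word'))  # small but nonzero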
def buildProbDist(self, corpus):
    """ Build tag probability distributions for the Viterbi algorithm """
    corpus_tags_words = []
    # Build array containing all tags and words of all sentences, in order
    for sent in corpus.tagged_sents():
        corpus_tags_words.append(("BEGIN", "BEGIN"))
        corpus_tags_words.extend([(tag, word) for (word, tag) in sent])
        corpus_tags_words.append(("STOP", "STOP"))
    # Build a conditional frequency distribution based on all tags/words of all sentences
    fd_tagwords = nltk.ConditionalFreqDist(corpus_tags_words)
    # Build conditional probability of each word given its tag, based on the
    # frequency distribution above
    pd_tagwords = nltk.ConditionalProbDist(fd_tagwords, nltk.MLEProbDist)
    # Build array containing all tags of all sentences, in order
    corpus_tags = [tag for (tag, word) in corpus_tags_words]
    # Build a conditional frequency distribution based ONLY on tag bigrams
    fd_tags = nltk.ConditionalFreqDist(nltk.bigrams(corpus_tags))
    # Build conditional probability of each tag given the previous tag
    pd_tags = nltk.ConditionalProbDist(fd_tags, nltk.MLEProbDist)
    all_tags = set(corpus_tags)
    self.corpora_prob_dists.append((pd_tagwords, pd_tags, all_tags, corpus_tags_words))
def nltk_2gram_mix(author1_index, author2_index, ratio):
    global corpus_list
    tokens1 = corpus_list[author1_index][:]
    tokens2 = corpus_list[author2_index][:]
    cfreq_2gram1 = nltk.ConditionalFreqDist(nltk.bigrams(tokens1))
    cprob_2gram1 = nltk.ConditionalProbDist(cfreq_2gram1, nltk.MLEProbDist)
    cfreq_2gram2 = nltk.ConditionalFreqDist(nltk.bigrams(tokens2))
    cprob_2gram2 = nltk.ConditionalProbDist(cfreq_2gram2, nltk.MLEProbDist)
    # pick from author1 with probability ratio/100, else from author2
    # (random.choice avoids the off-by-one of tokens[randint(0, len(tokens))])
    r = random.randint(0, 100)
    if r < ratio:
        word = random.choice(tokens1)
    else:
        word = random.choice(tokens2)
    print(word)
    t = ''
    word_count = 0
    for i in range(0, 100):
        # Only add words made of alphanumeric/punctuation characters
        match = re.match('^[a-zA-Z.,-/?;:!0123456789]+', word)
        if match is not None:
            t += match.group(0) + ' '
            if r < ratio:  # same author1/author2 convention as the initial pick
                print(cprob_2gram1[word])
                if word not in tokens1:
                    word = random.choice(tokens1)
                word = cprob_2gram1[word].generate()
            else:
                if word not in tokens2:
                    word = random.choice(tokens2)
                print(cprob_2gram2[word])
                word = cprob_2gram2[word].generate()
            word_count += 1
        else:
            print('*', word, end='* ')
            if r < ratio:
                word = random.choice(tokens1)
            else:
                word = random.choice(tokens2)
        # re-roll so the author mix varies word by word
        r = random.randint(0, 100)
    return t
def probDist(corpus):
    tags_words = get_tags(corpus)
    corpus_tags = [tag for (tag, word) in tags_words]
    # conditional frequency distribution of (tag, word) pairs
    cfd_tagwords = nltk.ConditionalFreqDist(tags_words)
    # conditional probability distribution P(word | tag)
    cpd_tagwords = nltk.ConditionalProbDist(cfd_tagwords, nltk.MLEProbDist)
    # conditional frequency distribution of tag bigrams (transitions)
    cfd_tags = nltk.ConditionalFreqDist(nltk.bigrams(corpus_tags))
    cpd_tags = nltk.ConditionalProbDist(cfd_tags, nltk.MLEProbDist)
    return cpd_tagwords, cpd_tags
def train(self):
    print('Training model A...')
    tri_cfd = nltk.ConditionalFreqDist()  # conditional frequency distributions
    bi_cfd = nltk.ConditionalFreqDist()
    uni_fd = nltk.FreqDist()
    wordtag_tri_cfd = nltk.ConditionalFreqDist()  # tagged-word cfds
    wordtag_bi_cfd = nltk.ConditionalFreqDist()
    wordtag_uni_fd = nltk.FreqDist()
    genres = ['news', 'religion', 'reviews', 'romance', 'science_fiction']
    for genre in genres:
        corpus = brown.tagged_words(categories=genre)
        size = int(len(corpus) * 0.90)  # 90% for training, 10% for testing
        corpus = corpus[:size]
        trigrams = nltk.trigrams(corpus)
        bigrams = nltk.bigrams(corpus)
        for ((word2, tag2), (word1, tag1), (word0, tag0)) in trigrams:
            tri_cfd[word2, word1][word0] += 1
            wordtag_tri_cfd[word2, tag2, word1, tag1][word0] += 1
        for ((word1, tag1), (word0, tag0)) in bigrams:
            bi_cfd[word1][word0] += 1
            wordtag_bi_cfd[word1, tag1][word0] += 1
        for (word0, tag0) in corpus:
            uni_fd[word0] += 1
            wordtag_uni_fd[word0, tag0] += 1
    # n-gram probability distributions with add-one (Laplace) smoothing
    self.tri_cpd = nltk.ConditionalProbDist(tri_cfd, nltk.LaplaceProbDist)
    self.bi_cpd = nltk.ConditionalProbDist(bi_cfd, nltk.LaplaceProbDist)
    self.uni_pd = nltk.LaplaceProbDist(uni_fd)
    # POS n-grams, also smoothed with the add-one method
    self.wordtag_uni_pd = nltk.LaplaceProbDist(wordtag_uni_fd)
    self.wordtag_bi_cpd = nltk.ConditionalProbDist(wordtag_bi_cfd, nltk.LaplaceProbDist)
    self.wordtag_tri_cpd = nltk.ConditionalProbDist(wordtag_tri_cfd, nltk.LaplaceProbDist)
    print('Done!')
def main():
    sents = create_tokens(CORPUS_FILENAME)
    train_corpus, test_corpus = train_test_split(sents)
    # conditional frequency distribution for bigrams
    cfd_2gram = ngram_freq_dist(train_corpus, ngram=2)
    # conditional probability distribution for bigrams
    cpd_2gram = nltk.ConditionalProbDist(cfd_2gram, nltk.MLEProbDist)
    print(cpd_2gram)
    rev_sents = create_tokens(REV_CORPUS_FILENAME)
    rev_train_corpus, rev_test_corpus = train_test_split(rev_sents)
    rev_cfd_2gram = ngram_freq_dist(rev_train_corpus, ngram=2)
    rev_cpd_2gram = nltk.ConditionalProbDist(rev_cfd_2gram, nltk.MLEProbDist)
    text_wiki = generate_txt_bigram_model(cpd_2gram, rev_cpd_2gram, 'trump', numwords=10)
    print('Test sentence for trump:', text_wiki)
def __init__(self, is_efficient, data_frame):
    self.relevant_referrals_1gram_prob_calc = dict()
    self.other_ref_text_1gram_prob_calc = dict()
    if is_efficient:
        self.relevant_referrals = data_frame.loc[data_frame.IsEfficient == 1]
        self.other_ref = data_frame.loc[data_frame.IsEfficient == -1]
    else:
        self.relevant_referrals = data_frame.loc[data_frame.IsEfficient == -1]
        self.other_ref = data_frame.loc[data_frame.IsEfficient == 1]
    # work on efficient referrals:
    self.relevant_referrals_text = pd.Series(self.relevant_referrals['comment_body'].values)
    self.other_ref_text = pd.Series(self.other_ref['comment_body'].values)
    # concatenate all the comments
    self.relevant_referrals_all_text = self.relevant_referrals_text.str.cat(sep=' ')
    # remove \n and other punctuation from the string
    # TODO: we can look for more cases, but for a start this is enough
    for ch in ['\n', '.', ',', ':', '(', ')']:
        self.relevant_referrals_all_text = self.relevant_referrals_all_text.replace(ch, '')
    # self.relevant_referrals_all_text = self.relevant_referrals_all_text.replace('/', ' ')
    self.relevant_referrals_all_text = self.relevant_referrals_all_text.lower()
    # create a list of all the words in all the comments
    self.relevant_referrals_all_text_list = self.relevant_referrals_all_text.split(' ')
    # self.relevant_referrals_all_text_list = self.relevant_referrals_all_text_list.remove('')
    # length of all comments in each group
    self.relevant_referrals_len = len(self.relevant_referrals_all_text_list)
    # create 1-gram FreqDist (a kind of dictionary)
    self.relevant_referrals_freq_1gram = nltk.FreqDist(self.relevant_referrals_all_text_list)
    # create 2-gram ConditionalFreqDist (a kind of dictionary)
    self.relevant_referrals_freq_2gram = nltk.ConditionalFreqDist(
        nltk.bigrams(self.relevant_referrals_all_text_list))
    # create 2-gram conditional probability: maps each pair of words to a probability
    self.relevant_referrals_prob_2gram = nltk.ConditionalProbDist(
        self.relevant_referrals_freq_2gram, nltk.MLEProbDist)
def transition_model(self, train_data):
    """
    Compute a transition model using a ConditionalProbDist.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The transition probability distribution
    :rtype: ConditionalProbDist
    """
    # The data object is a list of (condition, observation) tuples, in our
    # case of the form (tag_(i), tag_(i+1)), with the START symbol <s> and
    # the END symbol </s> added around each sentence.
    padded_data = []
    for s in train_data:
        padded_data.append([('<s>', '<s>')] + s + [('</s>', '</s>')])
    tagGenerators = (((s[i][1], s[i + 1][1]) for i in range(len(s) - 1))
                     for s in padded_data)
    data = list(itertools.chain.from_iterable(tagGenerators))
    # compute the transition model with Lidstone smoothing
    transition_FD = nltk.ConditionalFreqDist(data)
    lidstone_estimator = lambda fd: nltk.LidstoneProbDist(fd, 0.01, fd.B() + 1)
    self.transition_PD = nltk.ConditionalProbDist(transition_FD, lidstone_estimator)
    return self.transition_PD
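# Why fd.B() + 1 bins: with gamma = 0.01 and one extra bin, LidstoneProbDist
# leaves a little probability for a transition never seen after this tag,
# where MLE would assign exactly zero. Toy counts, illustrative only.
import nltk

fd = nltk.FreqDist({'NOUN': 8, 'VERB': 2})
lid = nltk.LidstoneProbDist(fd, 0.01, fd.B() + 1)
print(lid.prob('NOUN'))  # slightly below the MLE value of 0.8
print(lid.prob('ADJ'))   # small but nonzero for the unseen outcome
print(nltk.MLEProbDist(fd).prob('ADJ'))  # exactly 0.0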
def nltk_2gram(author1_index):
    global corpus_list
    tokens = corpus_list[author1_index][:]
    cfreq_2gram = nltk.ConditionalFreqDist(nltk.bigrams(tokens))
    cprob_2gram = nltk.ConditionalProbDist(cfreq_2gram, nltk.MLEProbDist)
    # random.choice avoids the off-by-one of tokens[randint(0, len(tokens))]
    word = random.choice(tokens)
    t = ''
    word_count = 0
    for i in range(0, 100):
        # Only add words made of alphanumeric/punctuation characters
        match = re.match('^[a-zA-Z.,-/?;:!0123456789]+', word)
        if match is not None:
            t += match.group(0) + ' '
            print(word, end=' ')
            word = cprob_2gram[word].generate()
            word_count += 1
        else:
            print('*', word, end='* ')
            word = random.choice(tokens)
    return t
def create_trigram_table(data_words):
    # recast each trigram (a, b, c) as ((a, b), c), so the leading bigram
    # becomes the condition of the ConditionalFreqDist
    trigrams = nltk.trigrams(data_words)
    pairs = [((a, b), c) for (a, b, c) in trigrams]
    cfreq_3gram = nltk.ConditionalFreqDist(pairs)
    cprob_3gram = nltk.ConditionalProbDist(cfreq_3gram, nltk.MLEProbDist)
    return cfreq_3gram, cprob_3gram
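# Usage sketch for create_trigram_table(): condition on a two-word context
# and query or sample the continuation. Toy tokens, illustrative only.
tokens = "the united states of america and the united kingdom".split()
cfreq, cprob = create_trigram_table(tokens)
context = ('the', 'united')
print(cfreq[context])                 # FreqDist({'states': 1, 'kingdom': 1})
print(cprob[context].prob('states'))  # 0.5 under MLE
print(cprob[context].generate())      # samples 'states' or 'kingdom'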
def transition_model(self, train_data):
    """
    Compute a transition model using a ConditionalProbDist.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The transition probability distribution
    :rtype: ConditionalProbDist
    """
    # data is a list of (tag_(i), tag_(i+1)) tuples, with the START symbol
    # <s> and the END symbol </s> added around each sentence.
    data = []
    for s in train_data:
        for i, word_tag in enumerate(s):
            tag = word_tag[1]
            if i == 0:
                data.append(("<s>", tag))
            if i != len(s) - 1:
                data.append((tag, s[i + 1][1]))
            else:
                data.append((tag, "</s>"))
    transition_FD = nltk.ConditionalFreqDist(data)
    self.transition_PD = nltk.ConditionalProbDist(transition_FD, LidstoneProbDistFactory)
    return self.transition_PD
def createConditionalProbabilityModel(folder, fileids, model, sep='/', encoding='utf8'):
    """
    Creates a tagging probability model to be used along with the
    FeatureEstimator object. Files of tagged data must contain one sentence
    per line, and each line must follow the following format:
    <word_1><separator><tag_1> <word_2><separator><tag_2> ... <word_n><separator><tag_n>

    @param folder: Folder containing files of tagged sentences.
    @param fileids: A list or regular expression specifying the file names
        with tagged data in "folder".
    @param model: File in which to save the trained model.
    @param sep: Separator between words and tags in the files with tagged data.
    @param encoding: Encoding of the files with tagged data.
    """
    print('Reading files...')
    tcr = nltk.corpus.reader.tagged.TaggedCorpusReader(folder, fileids, sep=sep,
                                                       encoding=encoding)
    print('Extracting tagged data...')
    data = tcr.tagged_words()
    print('Creating conditional probability maps...')
    cfd_tagwords = nltk.ConditionalFreqDist(data)
    cpd_tagwords = nltk.ConditionalProbDist(cfd_tagwords, nltk.MLEProbDist)
    print('Saving model...')
    pickle.dump(cpd_tagwords, open(model, "wb"))
    print('Finished!')
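# Usage sketch (hypothetical paths and tag names): train a model from a
# folder of word/tag files, then reload the pickled ConditionalProbDist
# and query P(word | tag).
import pickle

createConditionalProbabilityModel('tagged_data', r'.*\.txt', 'pos_model.bin', sep='/')
with open('pos_model.bin', 'rb') as f:
    cpd_tagwords = pickle.load(f)
print(cpd_tagwords['NN'].prob('dog'))  # the tag inventory depends on the data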
def emission_model(self, train_data):
    """
    Compute an emission model using a ConditionalProbDist.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The emission probability distribution and a list of the states
    :rtype: Tuple[ConditionalProbDist, list(str)]
    """
    # Prepare the data as (tag, word) pairs. Lowercase each observation,
    # otherwise it mismatches the test data. Do NOT add <s> or </s> to the
    # input sentences.
    data = []
    for s in train_data:
        for (word, tag) in s:
            data.append((tag, word.lower()))
    # compute the emission model with Lidstone smoothing
    emission_FD = nltk.ConditionalFreqDist(data)
    lidstone_estimator = lambda fd: nltk.LidstoneProbDist(fd, 0.01, fd.B() + 1)
    self.emission_PD = nltk.ConditionalProbDist(emission_FD, lidstone_estimator)
    self.states = list(set([tag for (tag, word) in data]))
    return self.emission_PD, self.states
def transition_model(self, train_data):
    """
    Compute a transition model using a ConditionalProbDist.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The transition probability distribution
    :rtype: ConditionalProbDist
    """
    data = [[(x[1], x[0].lower()) if x[1] != "." else (x[1], x[0]) for x in s]
            for s in train_data]
    # Build (tag_(i), tag_(i+1)) tuples, with the START symbol <s> and the
    # END symbol </s> added around each sentence.
    for s in data:
        s.insert(0, ("<s>", "<s>"))
        s.append(("</s>", "</s>"))
    tagGenerators = (((s[i][0], s[i + 1][0]) for i in range(len(s) - 1)) for s in data)
    data = itertools.chain.from_iterable(tagGenerators)
    transition_FD = nltk.ConditionalFreqDist(data)
    self.transition_PD = nltk.ConditionalProbDist(transition_FD, HMM.LidProDist)
    return self.transition_PD
def train_markov_model_from_constraint_matrix(self, csv_path, mm_path, delim="\t"):
    with open(csv_path) as f:
        table = [line.split(delim) for line in f]
    tags = []
    range_states = table.pop(0)[1:]
    for row in table:
        domain = row[0]
        for i, r in enumerate(row[1:]):
            s = r.replace(" ", "").strip("\n")
            if s == '':
                continue
            if int(s) > 0:
                for _ in range(0, int(s)):
                    tags.append((domain, range_states[i]))
    self.cfd_tags = nltk.ConditionalFreqDist(tags)
    print("cfd trained, counts:")
    self.cfd_tags.tabulate()
    print("test:")
    print(tabulate_cfd(self.cfd_tags))
    # save this new cfd for later use
    pickle.dump(self.cfd_tags, open(mm_path, "wb"))
    # initialize the cpd
    self.cpd_tags = nltk.ConditionalProbDist(self.cfd_tags, nltk.MLEProbDist)
    print(tabulate_cfd(self.cpd_tags))
    all_outcomes = [v.keys() for v in self.cfd_tags.values()]
    self.tag_set = set(list(self.cfd_tags.keys()) + [y for x in all_outcomes for y in x])
    self.viterbi_init()  # initialize viterbi
def makeModel(percentageSplit):
    brown_tags_words = []
    cutoff = int(len(corpus) * percentageSplit / 100)
    tagged_data = corpus[:cutoff]
    for sent in tagged_data:
        # sent is a list of word/tag pairs:
        # add START/START at the beginning,
        brown_tags_words.append(("START", "START"))
        # then all the tag/word pairs for the word/tag pairs in the sentence,
        brown_tags_words.extend([(tag, word) for (word, tag) in sent])
        # then END/END
        brown_tags_words.append(("END", "END"))
    # conditional frequency distribution
    global cfd_tagwords
    cfd_tagwords = nltk.ConditionalFreqDist(brown_tags_words)
    # conditional probability distribution
    global cpd_tagwords
    cpd_tagwords = nltk.ConditionalProbDist(cfd_tagwords, nltk.MLEProbDist)
    # Estimating P(ti | t{i-1}) from corpus data using Maximum Likelihood Estimation (MLE):
    # P(ti | t{i-1}) = count(t{i-1}, ti) / count(t{i-1})
    brown_tags = [tag for (tag, word) in brown_tags_words]
    # make conditional frequency distribution: count(t{i-1}, ti)
    global cfd_tags
    cfd_tags = nltk.ConditionalFreqDist(nltk.bigrams(brown_tags))
    # make conditional probability distribution, using the
    # maximum likelihood estimate: P(ti | t{i-1})
    global cpd_tags
    cpd_tags = nltk.ConditionalProbDist(cfd_tags, nltk.MLEProbDist)
    #####
    # Viterbi:
    # If we have a word sequence, what is the best tag sequence?
    #
    # The method above lets us determine the probability for a single tag sequence.
    # But in order to find the best tag sequence, we need the probability
    # for _all_ tag sequences. What Viterbi gives us is just a good way of
    # computing all those many probabilities as fast as possible.
    # what is the set of all tags?
    global distinct_tags
    distinct_tags = set(brown_tags)
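# A minimal Viterbi sketch over the globals built by makeModel() above
# (cpd_tagwords, cpd_tags, distinct_tags). Illustrative only: it works in
# raw probabilities rather than log space, and an unseen word drives every
# path to probability 0 under MLE.
def viterbi_decode(sentence):
    # best maps each tag to (probability of the best path ending in that
    # tag, the path itself)
    best = {tag: (cpd_tags["START"].prob(tag) * cpd_tagwords[tag].prob(sentence[0]),
                  [tag])
            for tag in distinct_tags}
    for word in sentence[1:]:
        best = {tag: max(((p * cpd_tags[path[-1]].prob(tag) *
                           cpd_tagwords[tag].prob(word), path + [tag])
                          for (p, path) in best.values()),
                         key=lambda entry: entry[0])
                for tag in distinct_tags}
    # close off with the END transition and return (probability, best path)
    return max(((p * cpd_tags[path[-1]].prob("END"), path)
                for (p, path) in best.values()),
               key=lambda entry: entry[0])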
def getTagExpt(data):
    expt1 = Experiment()
    expt1.cfd_tags = nltk.ConditionalFreqDist(
        nltk.bigrams(tag for (word, tag) in getTagProb(data.train_items)))
    expt1.cpd_tags = nltk.ConditionalProbDist(expt1.cfd_tags, nltk.MLEProbDist)
    expt1.tagset = set(tag for (word, tag) in getTagProb(data.train_items))
    return expt1
def cal_cpd_tagwords(tags_words):
    cfd_tagwords = nltk.ConditionalFreqDist(tags_words)
    cpd_tagwords = nltk.ConditionalProbDist(cfd_tagwords, nltk.MLEProbDist)
    print("The probability of an adjective (JJ) being 'new' is",
          cpd_tagwords["JJ"].prob("new"))
    print("The probability of a verb (VB) being 'duck' is",
          cpd_tagwords["VB"].prob("duck"))
    return cpd_tagwords
def train(self, train_sents):
    cvc = self.check_corpus_validity(train_sents)
    if not cvc[1]:
        raise TypeError(cvc[0])
    train_tags_words = []
    i = 0
    for sent in train_sents[self.sentstart:min(len(train_sents), self.sentend)]:
        print("Training: {}".format(i))
        i += 1
        # sent is a list of word/tag pairs:
        # add START/START at the beginning,
        train_tags_words.append(("START", "START"))
        # then all the tag/word pairs, trimming each tag to self.trimtag characters,
        train_tags_words.extend([(tag[:self.trimtag], word) for (word, tag) in sent])
        # then END/END
        train_tags_words.append(("END", "END"))
    # conditional frequency distribution
    cfd_tagwords = nltk.ConditionalFreqDist(train_tags_words)
    # conditional probability distribution:
    # cpd_tagwords contains the emission probabilities
    self.cpd_tagwords = nltk.ConditionalProbDist(cfd_tagwords, nltk.MLEProbDist)
    # Estimating P(ti | t{i-1}) from corpus data using Maximum Likelihood Estimation (MLE):
    # P(ti | t{i-1}) = count(t{i-1}, ti) / count(t{i-1})
    train_tags = [tag for (tag, word) in train_tags_words]
    self.tags_list = list(set(train_tags))
    # make conditional frequency distribution: count(t{i-1}, ti)
    cfd_tags = nltk.ConditionalFreqDist(nltk.bigrams(train_tags))
    # make conditional probability distribution, using the maximum
    # likelihood estimate: P(ti | t{i-1}).
    # cpd_tags contains the transition probabilities.
    self.cpd_tags = nltk.ConditionalProbDist(cfd_tags, nltk.MLEProbDist)
def findCPD(self, typecfd=None):
    if typecfd is None:
        self.cpdTag = nltk.ConditionalProbDist(self.cfdTag, nltk.MLEProbDist)
        return self.cpdTag
    elif typecfd == "bi":
        return nltk.ConditionalProbDist(self.cfdBigram, nltk.MLEProbDist)
    else:
        print("invalid method")
def generate_text(text, initialword, numwords):
    bigrams = list(nltk.ngrams(text, 2))
    cpd = nltk.ConditionalProbDist(nltk.ConditionalFreqDist(bigrams), nltk.MLEProbDist)
    word = initialword
    words = [initialword]
    for i in range(numwords):
        word = cpd[word].generate()
        words.append(word)
    print(' '.join(words) + '.')
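# Usage sketch: bigram generation seeded with a word from the text (assumes
# the nltk gutenberg corpus is downloaded; generation can fail if it reaches
# a word that only ever appears as the final token).
from nltk.corpus import gutenberg

emma_words = list(gutenberg.words('austen-emma.txt'))
generate_text(emma_words, 'Emma', 20)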
def train():
    print('Training HMM...')
    # Use the first 1000 sentences from the 'news' category of the Brown corpus
    labelled_sequences, states, symbols = get_pos_data(1000)
    # Define the estimator to be used for probability computation
    estimator = lambda fd, bins: nltk.LidstoneProbDist(fd, 0.1, bins)
    # count occurrences of starting states, transitions out of each state
    # and output symbols observed in each state
    freq_starts = nltk.FreqDist()
    freq_transitions = nltk.ConditionalFreqDist()
    freq_emissions = nltk.ConditionalFreqDist()
    for sequence in labelled_sequences:
        lasts = None
        for token in sequence:
            state = token[1]
            symbol = token[0]
            if lasts is None:
                freq_starts[state] += 1  # FreqDist.inc() is gone in NLTK 3
            else:
                freq_transitions[lasts][state] += 1
            freq_emissions[state][symbol] += 1
            lasts = state
            # update the state and symbol lists
            if state not in states:
                states.append(state)
            if symbol not in symbols:
                symbols.append(symbol)
    # create probability distributions (with smoothing)
    N = len(states)
    starts = estimator(freq_starts, N)
    transitions = nltk.ConditionalProbDist(freq_transitions, estimator, N)
    emissions = nltk.ConditionalProbDist(freq_emissions, estimator, len(symbols))
    # Return the start, transition and emission probabilities along with
    # the list of all the states and output symbols
    return starts, transitions, emissions, states, symbols
def calculate_probability(trigrams):
    # recast each trigram as ((w1, w2), w3), so the leading bigram becomes
    # the condition of the ConditionalFreqDist
    trigrams_as_bigrams = [((t[0], t[1]), t[2]) for t in trigrams]
    cfd = nltk.ConditionalFreqDist(trigrams_as_bigrams)
    cpd = nltk.ConditionalProbDist(cfd, nltk.MLEProbDist)
    # for trigram in trigrams_as_bigrams:
    #     if cpd[trigram[0]].prob(trigram[1]) == 1:
    #         print("{1} has probability {0}".format(cpd[trigram[0]].prob(trigram[1]), trigram))
    return cpd
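# Usage sketch for calculate_probability(): feed it nltk.trigrams over any
# token list and query the leading-bigram condition. Toy tokens only.
tokens = "the quick brown fox jumps over the lazy dog".split()
cpd = calculate_probability(nltk.trigrams(tokens))
print(cpd[('the', 'quick')].prob('brown'))  # 1.0: the only continuation seen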
def cal_cpd_tags(tags_words):
    brown_tags = [tag for (tag, word) in tags_words]
    cfd_tags = nltk.ConditionalFreqDist(nltk.bigrams(brown_tags))
    cpd_tags = nltk.ConditionalProbDist(cfd_tags, nltk.MLEProbDist)
    print("If we have just seen 'DT', the probability of 'NN' is",
          cpd_tags["DT"].prob("NN"))
    print("If we have just seen 'VB', the probability of 'JJ' is",
          cpd_tags["VB"].prob("JJ"))
    print("If we have just seen 'VB', the probability of 'NN' is",
          cpd_tags["VB"].prob("NN"))
    return cpd_tags, list(set(brown_tags))
def hmm():
    # calculate the frequency distribution of the (tag, word) pairs
    conditionFreqDisttag = nltk.ConditionalFreqDist(brownTagsWords)
    # calculate the probability distribution P(word | tag) using
    # Maximum Likelihood Estimation
    global conditionProbdistTag
    conditionProbdistTag = nltk.ConditionalProbDist(conditionFreqDisttag, nltk.MLEProbDist)
    # extract the tags used to train the HMM
    global brownTagsTrained
    brownTagsTrained = [tag for (tag, word) in brownTagsWords]
    # calculate the bigram frequency distribution of the tags:
    # P(ti | ti-1) = (C(ti-1 ti) + 1) / (C(ti-1) + V), where V is the
    # number of bins (add-one smoothing)
    freqDistTags = nltk.ConditionalFreqDist(nltk.bigrams(brownTagsTrained))
    # calculate the probability distribution of the tags with Laplace
    # (add-one) smoothing; the HMM is now trained
    global probdistTags
    probdistTags = nltk.ConditionalProbDist(freqDistTags, nltk.LaplaceProbDist,
                                            bins=len(brownTagsTrained))
    viterbi()
def train_set1(x):
    global training_set1
    global set1_tags
    global set1_words
    global set1_cfd_word_tag
    global set1_cfd_tags
    global set1_cpd_word_tag
    global set1_cpd_tags
    set1_training_set_words = []
    for sent in training_set1:
        if x == 0:
            set1_training_set_words.append(('<s>', '<s>'))
        set1_training_set_words.extend([(tag, word) for (word, tag) in sent])
        if x == 0:
            set1_training_set_words.append(('</s>', '</s>'))
    set1_tags = [tag for (tag, word) in set1_training_set_words]
    set1_words = [word for (tag, word) in set1_training_set_words]
    set1_cfd_word_tag = nltk.ConditionalFreqDist(set1_training_set_words)
    set1_cfd_tags = nltk.ConditionalFreqDist(nltk.bigrams(set1_tags))
    set1_cpd_word_tag = nltk.ConditionalProbDist(set1_cfd_word_tag, nltk.MLEProbDist)
    set1_cpd_tags = nltk.ConditionalProbDist(set1_cfd_tags, nltk.MLEProbDist)