Example #1
    def initialization(self):
        'initialize the model'
        corpusName = 'emma'
        genre = 'news'
        corpus = self.getCorpus(corpusName, genre)
        tagged_corpus = self.getCorpus(corpusName, genre, tagged=True)

        # n-gram frequency distributions
        self.trigrams = nltk.trigrams(tagged_corpus)
        self.tricfd = nltk.ConditionalFreqDist()
        self.trifd = nltk.FreqDist()
        self.postrifreq = nltk.ConditionalFreqDist()
        for ((word2, tag2), (word1, tag1), (word0, tag0)) in self.trigrams:
            self.tricfd[word2, word1][word0] += 1
            self.trifd[(word2, word1, word0)] += 1
            self.postrifreq[tag2, tag1][tag0] += 1
        self.bicfd = nltk.ConditionalFreqDist(nltk.bigrams(corpus))
        self.bifd = nltk.FreqDist(nltk.bigrams(corpus))
        self.unifd = nltk.FreqDist(corpus)
        self.taggedFreq = nltk.FreqDist(tagged_corpus)

        # n-gram probability distributions
        self.tricpd = nltk.ConditionalProbDist(self.tricfd, nltk.ELEProbDist)
        self.tripd = nltk.ELEProbDist(self.trifd)
        self.bicpd = nltk.ConditionalProbDist(self.bicfd, nltk.ELEProbDist)
        self.bipd = nltk.ELEProbDist(self.bifd)
        self.unipd = nltk.ELEProbDist(self.unifd)

        # POS n-gram
        self.postriprob = nltk.ConditionalProbDist(self.postrifreq,
                                                   nltk.ELEProbDist)
Example #2
def train_fullset(x):
    global full_tags
    global full_words
    global full_tag_set
    global full_cpd_word_tag
    global full_cpd_tags
    global full_cfd_word_tag
    global full_cfd_tags
    global full_training_set
    full_training_set_words = []
    for sent in full_training_set:
        if x==0:
            full_training_set_words.append(('<s>','<s>'))
        full_training_set_words.extend([ (tag, word) for (word, tag) in sent ])
        if x==0:
            full_training_set_words.append(('</s>','</s>'))
    full_tags = [tag for (tag, word) in full_training_set_words]
    full_words = [word for (tag, word) in full_training_set_words]
    full_tag_set = set(full_tags)
    full_cfd_word_tag = nltk.ConditionalFreqDist(full_training_set_words)
    full_cfd_tags = nltk.ConditionalFreqDist(nltk.bigrams(full_tags))
    full_obs_set = []
    for sent in full_training_set:
        full_obs_set.append([ word for (word, tag) in sent ])
    if x==0:
        for i in range(len(full_obs_set)):
            full_obs_set[i].append('</s>')
            full_obs_set[i].insert(0, '<s>')
    full_cpd_word_tag = nltk.ConditionalProbDist(full_cfd_word_tag, nltk.MLEProbDist)
    full_cpd_tags = nltk.ConditionalProbDist(full_cfd_tags, nltk.MLEProbDist)
Example #3
def train():
    train_words = [i.strip() for i in open('train.txt', 'r').readlines()]
    train_words.append('')

    train = []
    temp_sent = []
    for i in train_words:
        if not i:
            if temp_sent:
                train.append(temp_sent)
            temp_sent = []
        else:
            temp_word = i.split()
            temp_word = (temp_word[0], map_tag(temp_word[1]) + ' ' +
                         temp_word[2] + ' ' + temp_word[3])
            temp_sent.append(temp_word)

    tags_words = []

    for sent in train:
        tags_words.append(('START', 'START'))
        tags_words.extend([(tag, word) for (word, tag) in sent])
        tags_words.append(('END', 'END'))

    cfd_tagwords = nltk.ConditionalFreqDist(tags_words)
    cpd_tagwords = nltk.ConditionalProbDist(cfd_tagwords, nltk.MLEProbDist)

    universal_tags = [tag for (tag, word) in tags_words]

    cfd_tags = nltk.ConditionalFreqDist(nltk.bigrams(universal_tags))
    cpd_tags = nltk.ConditionalProbDist(cfd_tags, nltk.MLEProbDist)

    distinct_tags = set(universal_tags)

    return (cpd_tagwords, cpd_tags, distinct_tags)
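A minimal usage sketch for the tuple returned by train() above (the sentence and its tags are assumptions, and real tags must come from the training tagset): a tagged sentence is scored as the product of transition and emission probabilities under the START/END convention used during training.

# Hypothetical usage; assumes train.txt and map_tag are available so train() can run.
cpd_tagwords, cpd_tags, distinct_tags = train()
tagged = [('the', 'DET'), ('dog', 'NOUN')]  # example sentence, tags assumed
prob = 1.0
prev = 'START'
for word, tag in tagged:
    # P(tag | previous tag) * P(word | tag)
    prob *= cpd_tags[prev].prob(tag) * cpd_tagwords[tag].prob(word)
    prev = tag
prob *= cpd_tags[prev].prob('END')  # close the sequence
print(prob)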
Example #4
def train(train_file, rareWords, replaceRare, useTrigrams, trigramBackoff):
  """
  Read the file and populate the various frequency and
  conditional frequency distributions and build the HMM
  off these data structures.
  """
  acc = Accumulator(rareWords, replaceRare, useTrigrams)
  reader = nltk.corpus.reader.TaggedCorpusReader(".", train_file)
  for sent in reader.tagged_sents():
    unigrams = pad(sent)
    acc.addSentence(unigrams, normalizeRareWord)
  if useTrigrams:
    if trigramBackoff:
      backoffCPD = calculateBackoffTransCPD(acc.tagsFD, acc.transitionsCFD,
        acc.transitions2CFD)
      return nltk.HiddenMarkovModelTagger(list(acc.words), list(acc.tags),
        backoffCPD,
        nltk.ConditionalProbDist(acc.outputsCFD, nltk.ELEProbDist),
        nltk.ELEProbDist(acc.priorsFD))
    else:
      return nltk.HiddenMarkovModelTagger(list(acc.words), list(acc.tags),
        nltk.ConditionalProbDist(acc.transitions2CFD, nltk.ELEProbDist,
        len(acc.transitions2CFD.conditions())),
        nltk.ConditionalProbDist(acc.outputsCFD, nltk.ELEProbDist),
        nltk.ELEProbDist(acc.priorsFD))
  else:
    return nltk.HiddenMarkovModelTagger(list(acc.words), list(acc.tags),
      nltk.ConditionalProbDist(acc.transitionsCFD, nltk.ELEProbDist,
      len(acc.transitionsCFD.conditions())),
      nltk.ConditionalProbDist(acc.outputsCFD, nltk.ELEProbDist),
      nltk.ELEProbDist(acc.priorsFD))
Example #5
 def train(self, tagged_sentences):
     # call super so we'll know which words are seen in training
     super(HMM, self).train(tagged_sentences)
     # get (tag, word) and (tag, tag) pairs for each sentence
     tag_word = []
     tag_tag = []
     for sent in tagged_sentences:
         tag_word.extend([(tag, word) for (word, tag) in sent])
         tags = [tag for (_word, tag) in sent]
         # add <s> and </s>
         tags.insert(0, self.INITIAL)
         tags.append(self.FINAL)
         tag_tag.extend(nltk.bigrams(tags))
     # get counts as conditional frequency distributions
     tag_word_cfd = nltk.ConditionalFreqDist(tag_word)
     tag_tag_cfd = nltk.ConditionalFreqDist(tag_tag)
     # get probabilities as conditional probability distributions
     pd_factory = lambda fd: WittenBellProbDist(fd, 50000)
     tag_word_cpd = nltk.ConditionalProbDist(tag_word_cfd, pd_factory)
     tag_tag_cpd = nltk.ConditionalProbDist(tag_tag_cfd, MLEProbDist)
     # add HMM states using these distributions
     make_dict = lambda pd: dict([(sample, pd.prob(sample))
                                  for sample in pd.samples()])
     self.add(self.INITIAL, make_dict(tag_tag_cpd[self.INITIAL]))
     for tag in tag_word_cpd.conditions():
         tag_tag_dict = make_dict(tag_tag_cpd[tag])
         tag_word_dict = make_dict(tag_word_cpd[tag])
         tag_word_dict[self.UNK] = tag_word_cpd[tag].prob(self.UNK)
         self.add(tag, tag_tag_dict, tag_word_dict)
Example #6
    def buildProbDist(self, corpus):
        """ Build tag probability distribution for Viterbi algorithm """

        corpus_tags_words = []

        # Build array containing all tags and words of all sentences, in order
        for sent in corpus.tagged_sents():
            corpus_tags_words.append(("BEGIN", "BEGIN"))
            corpus_tags_words.extend([(tag, word) for (word, tag) in sent])
            corpus_tags_words.append(("STOP", "STOP"))

        # Build a conditional frequency distribution based on all tags/words of all sentences
        fd_tagwords = nltk.ConditionalFreqDist(corpus_tags_words)
        # Build conditional probability of each tag/word based on the frequency distribution above
        pd_tagwords = nltk.ConditionalProbDist(fd_tagwords, nltk.MLEProbDist)

        # Build array containing all tags of all sentences, in order
        corpus_tags = [tag for (tag, word) in corpus_tags_words]

        # Build a frequency distribution based ONLY on bigrams tags
        fd_tags = nltk.ConditionalFreqDist(nltk.bigrams(corpus_tags))
        # Build conditional probability of each tag based on the frequency distribution above
        pd_tags = nltk.ConditionalProbDist(fd_tags, nltk.MLEProbDist)
        all_tags = set(corpus_tags)

        self.corpora_prob_dists.append(
            (pd_tagwords, pd_tags, all_tags, corpus_tags_words))
Example #7
def nltk_2gram_mix(author1_index, author2_index, ratio):
    global corpus_list

    tokens1 = corpus_list[author1_index][:]
    tokens2 = corpus_list[author2_index][:]
 
    cfreq_2gram1 = nltk.ConditionalFreqDist(nltk.bigrams(tokens1))
    cprob_2gram1 = nltk.ConditionalProbDist(cfreq_2gram1, nltk.MLEProbDist)
    
    cfreq_2gram2 = nltk.ConditionalFreqDist(nltk.bigrams(tokens2))
    cprob_2gram2 = nltk.ConditionalProbDist(cfreq_2gram2, nltk.MLEProbDist)

    r = random.randint(0, 100)
    if r < ratio:
        word = random.choice(tokens1)  # random.choice avoids the off-by-one of randint(0, len(tokens1))
    else:
        word = random.choice(tokens2)
    # r = random.randint(0,100)
    print(word)
    t = ''
    word_count = 0
    for i in range(0,100):
        # print(word, end=' ')
        # Only keep tokens made of letters, digits, or basic punctuation
        match = re.match('^[a-zA-Z.,-/?;:!0123456789]+', word)
        if match:
            t += match.group(0) + ' '
            # print(word, end=' ')
            if r < ratio:  # match the seeding logic above: r < ratio means author1
                if word not in tokens1:
                    word = random.choice(tokens1)
                word = cprob_2gram1[word].generate()
            else:
                if word not in tokens2:
                    word = random.choice(tokens2)
                word = cprob_2gram2[word].generate()

            word_count += 1
            # print('word count :', word_count)
        else:
            print('*', word, end='* ')
            if r < ratio:
                word = random.choice(tokens1)
            else:
                word = random.choice(tokens2)

        r = random.randint(0, 100)

    return t
Example #8
def probDist(corpus):
    tags_words = get_tags(corpus)
    corpus_tags = [tag for (tag, word) in tags_words ]
    # conditional frequency distribution
    cfd_tagwords = nltk.ConditionalFreqDist(tags_words)
    # conditional probability distribution
    cpd_tagwords = nltk.ConditionalProbDist(cfd_tagwords, nltk.MLEProbDist)

    # make conditional frequency distribution of tag transitions:
    cfd_tags = nltk.ConditionalFreqDist(nltk.bigrams(corpus_tags))
    cpd_tags = nltk.ConditionalProbDist(cfd_tags, nltk.MLEProbDist)

    return cpd_tagwords, cpd_tags
Example #9
    def train(self):
        print('Training model A...')

        tri_cfd = nltk.ConditionalFreqDist()  # conditional frequency distribution
        bi_cfd = nltk.ConditionalFreqDist()
        uni_fd = nltk.FreqDist()

        wordtag_tri_cfd = nltk.ConditionalFreqDist()  # tagged word's cfd
        wordtag_bi_cfd = nltk.ConditionalFreqDist()
        wordtag_uni_fd = nltk.FreqDist()

        genres = ['news', 'religion', 'reviews', 'romance', 'science_fiction']

        for genre in genres:

            corpus = brown.tagged_words(categories=genre)
            size = int(len(corpus) * 0.90)  # 90% for training, 10% for testing
            corpus = corpus[:size]
            trigrams = nltk.trigrams(corpus)
            bigrams = nltk.bigrams(corpus)

            for ((word2, tag2), (word1, tag1), (word0, tag0)) in trigrams:
                # count each trigram combination as it is found
                tri_cfd[word2, word1][word0] += 1
                wordtag_tri_cfd[word2, tag2, word1, tag1][word0] += 1

            for ((word1, tag1), (word0, tag0)) in bigrams:
                bi_cfd[word1][word0] += 1
                wordtag_bi_cfd[word1, tag1][word0] += 1

            for (word0, tag0) in corpus:
                uni_fd[word0] += 1
                wordtag_uni_fd[word0, tag0] += 1

        # n-gram probability distributions
        self.tri_cpd = nltk.ConditionalProbDist(
            tri_cfd, nltk.LaplaceProbDist)  # add one smoothing
        self.bi_cpd = nltk.ConditionalProbDist(bi_cfd, nltk.LaplaceProbDist)
        self.uni_pd = nltk.LaplaceProbDist(uni_fd)

        # POS n-gram
        self.wordtag_uni_pd = nltk.LaplaceProbDist(
            wordtag_uni_fd)  # also needs smoothing, using the add-one method
        self.wordtag_bi_cpd = nltk.ConditionalProbDist(wordtag_bi_cfd,
                                                       nltk.LaplaceProbDist)
        self.wordtag_tri_cpd = nltk.ConditionalProbDist(
            wordtag_tri_cfd, nltk.LaplaceProbDist)

        print('Done!')
Example #10
def main():
    sents = create_tokens(CORPUS_FILENAME)
    train_corpus, test_corpus = train_test_split(sents)

    cfd_2gram = ngram_freq_dist(train_corpus, ngram=2)  # conditional frequency distribution for bigrams
    cpd_2gram = nltk.ConditionalProbDist(cfd_2gram, nltk.MLEProbDist)  # conditional probability distribution for bigrams
    print(cpd_2gram)

    rev_sents = create_tokens(REV_CORPUS_FILENAME)
    rev_train_corpus, rev_test_corpus = train_test_split(rev_sents)
    rev_cfd_2gram = ngram_freq_dist(rev_train_corpus, ngram=2)
    rev_cpd_2gram = nltk.ConditionalProbDist(rev_cfd_2gram, nltk.MLEProbDist)

    text_wiki = generate_txt_bigram_model(cpd_2gram, rev_cpd_2gram, 'trump', numwords=10)
    print('Test sentence for trump:', text_wiki)
Example #11
    def __init__(self, is_efficient, data_frame):
        self.relevant_referrals_1gram_prob_calc = dict()
        self.other_ref_text_1gram_prob_calc = dict()

        if is_efficient:
            self.relevant_referrals = data_frame.loc[data_frame.IsEfficient ==
                                                     1]
            self.other_ref = data_frame.loc[data_frame.IsEfficient == -1]
        else:
            self.relevant_referrals = data_frame.loc[data_frame.IsEfficient ==
                                                     -1]
            self.other_ref = data_frame.loc[data_frame.IsEfficient == 1]

        # work on efficient referrals:
        self.relevant_referrals_text = pd.Series(
            self.relevant_referrals['comment_body'].values)
        self.other_ref_text = pd.Series(self.other_ref['comment_body'].values)

        # concat all the comments
        self.relevant_referrals_all_text = self.relevant_referrals_text.str.cat(
            sep=' ')

        # remove \n and other punctuation characters from the string
        # TODO: we can look for more cases, but I think that for the start it's enough
        for char in ('\n', '.', ',', ':', '(', ')'):
            self.relevant_referrals_all_text = self.relevant_referrals_all_text.replace(char, '')
        # self.relevant_referrals_all_text = self.relevant_referrals_all_text.replace('/', ' ')
        self.relevant_referrals_all_text = self.relevant_referrals_all_text.lower()

        # create a list of all the words in the all the comments
        self.relevant_referrals_all_text_list = self.relevant_referrals_all_text.split(' ')
        # self.relevant_referrals_all_text_list = self.relevant_referrals_all_text_list.remove('')

        # len of all comments in each group
        self.relevant_referrals_len = len(
            self.relevant_referrals_all_text_list)

        # create 1gram FreqDist (kind of dictionary)
        self.relevant_referrals_freq_1gram = nltk.FreqDist(
            self.relevant_referrals_all_text_list)

        # create 2gram FreqDist (kind of dictionary)
        self.relevant_referrals_freq_2gram = nltk.ConditionalFreqDist(
            nltk.bigrams(self.relevant_referrals_all_text_list))

        # create 2gram condition probability: maps each pair of words to probability
        self.relevant_referrals_prob_2gram = nltk.ConditionalProbDist(
            self.relevant_referrals_freq_2gram, nltk.MLEProbDist)
Example #12
    def transition_model(self, train_data):
        """
        Compute a transition model using a ConditionalProbDist.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The transition probability distribution
        :rtype: ConditionalProbDist
        """
        data = []

        # The data object should be an array of tuples of conditions and observations,
        # in our case the tuples will be of the form (tag_(i), tag_(i+1)).
        # Don't forget to add the START symbol <s> and the END symbol </s>.
        padded_data = []
        for s in train_data:
            padded_data.append([('<s>', '<s>')] + s + [('</s>', '</s>')])

        tagGenerators = (((s[i][1], s[i + 1][1]) for i in range(len(s) - 1))
                         for s in padded_data)
        data = list(itertools.chain.from_iterable(tagGenerators))

        # compute the transition model
        transition_FD = nltk.ConditionalFreqDist(data)
        lidstone_estimator = lambda fd: nltk.LidstoneProbDist(fd, 0.01, fd.B() + 1)
        self.transition_PD = nltk.ConditionalProbDist(transition_FD, lidstone_estimator)

        return self.transition_PD
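A quick, hypothetical sanity check against the distribution returned above (model, train_data, and the tag names are assumptions, not part of the original):

# Query the trained transition model; <s> is the padding symbol added above.
tm = model.transition_model(train_data)
print(tm['<s>'].prob('NOUN'))   # P(a sentence starts with NOUN)
print(tm['NOUN'].prob('VERB'))  # P(VERB follows NOUN)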
Example #13
def nltk_2gram(author1_index):
    global corpus_list
    tokens = corpus_list[author1_index][:]
    cfreq_2gram = nltk.ConditionalFreqDist(nltk.bigrams(tokens))
    cprob_2gram = nltk.ConditionalProbDist(cfreq_2gram, nltk.MLEProbDist)
    word = random.choice(tokens)  # random.choice avoids the off-by-one of randint(0, len(tokens))
    # print(word)
    t = ''
    word_count = 0
    for i in range(0,100):
        # print(word, end=' ')
        # Only keep tokens made of letters, digits, or basic punctuation
        match = re.match('^[a-zA-Z.,-/?;:!0123456789]+', word)
        if match:
            t += match.group(0) + ' '
            print(word, end=' ')
            word = cprob_2gram[word].generate()
            word_count += 1
            # print('word count :', word_count)
        else:
            print('*', word, end='* ')
            word = random.choice(tokens)

    return t
Example #14
def create_trigram_table(data_words):
    trigrams = nltk.trigrams(data_words)
    sen = list()
    sen = [((a, b), c) for (a, b, c) in trigrams]
    cfreq_3gram = nltk.ConditionalFreqDist(sen)
    cprob_3gram = nltk.ConditionalProbDist(cfreq_3gram, nltk.MLEProbDist)
    return cfreq_3gram, cprob_3gram
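A hypothetical usage sketch for the trigram table above (data_words and the context pair are assumptions): condition on a word pair, then score or sample the next word.

cfreq_3gram, cprob_3gram = create_trigram_table(data_words)
context = ('of', 'the')     # assumed bigram context
if context in cprob_3gram:  # guard against unseen contexts
    print(cprob_3gram[context].prob('world'))  # P('world' | 'of', 'the')
    print(cprob_3gram[context].generate())     # sample a continuation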
Example #15
    def transition_model(self, train_data):
        """
        Compute a transition model using a ConditionalProbDist.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The transition probability distribution
        :rtype: ConditionalProbDist
        """
        data = []
        # The data object should be an array of tuples of conditions and observations,
        # in our case the tuples will be of the form (tag_(i), tag_(i+1)).
        # Don't forget to add the START symbol <s> and the END symbol </s>.
        for s in train_data:
            for i, word_tag in enumerate(s):
                tag = word_tag[1]
                if i == 0:
                    data.append(("<s>", tag))
                if i != len(s) - 1:
                    data.append((tag, s[i + 1][1]))
                else:
                    data.append((tag, "</s>"))
        transition_FD = nltk.ConditionalFreqDist(data)
        self.transition_PD = nltk.ConditionalProbDist(transition_FD,
                                                      LidstoneProbDistFactory)
        return self.transition_PD
Example #16
def createConditionalProbabilityModel(folder,
                                      fileids,
                                      model,
                                      sep='/',
                                      encoding='utf8'):
    """
    Creates a tagging probability model to be used along with the FeatureEstimator object.
    Files of tagged data must contain one sentence per line, and each line must follow the following format:
    <word_1><separator><tag_1> <word_2><separator><tag_2> ... <word_n-1><separator><tag_n-1> <word_n><separator><tag_n>

    @param folder: Folder containing files of tagged sentences.
    @param fileids: A list of regular expressions specifying the file names with tagged data in "folder".
    @param model: File in which to save the trained model.
    @param sep: Separator between words and tags in the files with tagged data.
    @param encoding: Encoding of the files with tagged data.
    """
    print('Reading files...')
    tcr = nltk.corpus.reader.tagged.TaggedCorpusReader(folder,
                                                       fileids,
                                                       sep=sep,
                                                       encoding=encoding)

    print('Extracting tagged data...')
    data = tcr.tagged_words()

    print('Creating conditional probability maps...')
    cfd_tagwords = nltk.ConditionalFreqDist(data)
    cpd_tagwords = nltk.ConditionalProbDist(cfd_tagwords, nltk.MLEProbDist)

    print('Saving model...')
    pickle.dump(cpd_tagwords, open(model, "wb"))
    print('Finished!')
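A hypothetical round trip (the folder, file pattern, word, and tag below are all assumptions): train from a directory of tagged files, then reload the pickled distribution. Since the data is (word, tag) pairs, the condition is the word and the sample is the tag.

createConditionalProbabilityModel('tagged_data', [r'.*\.txt'], 'pos_model.pickle')
cpd = pickle.load(open('pos_model.pickle', 'rb'))
print(cpd['dog'].prob('NN'))  # P(tag 'NN' | word 'dog')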
Example #17
    def emission_model(self, train_data):
        """
        Compute an emission model using a ConditionalProbDist.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The emission probability distribution and a list of the states
        :rtype: Tuple[ConditionalProbDist, list(str)]
        """

        # Don't forget to lowercase the observation otherwise it mismatches the test data
        # Do NOT add <s> or </s> to the input sentences
        data = []
        for s in train_data:
            for (word, tag) in s:
                data.append((tag, word.lower()))

        # compute the emission model
        emission_FD = nltk.ConditionalFreqDist(data)
        lidstone_estimator = lambda fd: nltk.LidstoneProbDist(fd, 0.01, fd.B() + 1)
        self.emission_PD = nltk.ConditionalProbDist(emission_FD, lidstone_estimator)
        self.states = list(set([ tag for (tag, word) in data]))

        return self.emission_PD, self.states
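A hypothetical query against the trained emission model (model, train_data, and the tag/word names are assumptions); note that observations were lowercased during training, so queries should be lowercase too.

emission_PD, states = model.emission_model(train_data)
print(emission_PD['NN'].prob('dog'))  # P(word 'dog' | tag 'NN')
print(states[:5])                     # a few of the tags seen in training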
Example #18
    def transition_model(self, train_data):
        """
        Compute a transition model using a ConditionalProbDist.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The transition probability distribution
        :rtype: ConditionalProbDist
        """

        data = [[(x[1], x[0].lower()) if x[1] != "." else (x[1], x[0])
                 for x in s] for s in train_data]

        # The data object should be an array of tuples of conditions and observations,
        # in our case the tuples will be of the form (tag_(i), tag_(i+1)).
        # Don't forget to add the START symbol <s> and the END symbol </s>.
        for s in data:
            s.insert(0, ("<s>", "<s>"))
            s.append(("</s>", "</s>"))

        tagGenerators = (((s[i][0], s[i + 1][0]) for i in range(len(s) - 1))
                         for s in data)
        data = itertools.chain.from_iterable(tagGenerators)

        transition_FD = nltk.ConditionalFreqDist(data)
        self.transition_PD = nltk.ConditionalProbDist(transition_FD,
                                                      HMM.LidProDist)

        return self.transition_PD
Example #19
 def train_markov_model_from_constraint_matrix(self,
                                               csv_path,
                                               mm_path,
                                               delim="\t"):
     table = [line.split(delim) for line in open(csv_path)]
     tags = []
     range_states = table.pop(0)[1:]
     for row in table:
         domain = row[0]
         for i, r in enumerate(row[1:]):
             s = r.replace(" ", "").strip("\n")
             if (s == ''):
                 continue
             if int(s) > 0:
                 for _ in range(0, int(s)):
                     tags.append((domain, range_states[i]))
     self.cfd_tags = nltk.ConditionalFreqDist(tags)
     print "cfd trained, counts:"
     self.cfd_tags.tabulate()
     print "test:"
     print tabulate_cfd(self.cfd_tags)
     # save this new cfd for later use
     pickle.dump(self.cfd_tags, open(mm_path, "wb"))
     # initialize the cpd
     self.cpd_tags = nltk.ConditionalProbDist(self.cfd_tags,
                                              nltk.MLEProbDist)
     # print "cpd summary:"
     # print self.cpd_tags.viewitems()
     print(tabulate_cfd(self.cpd_tags))
     all_outcomes = [list(v.keys()) for v in self.cfd_tags.values()]
     self.tag_set = set(list(self.cfd_tags.keys()) +
                        [y for x in all_outcomes for y in x])
     self.viterbi_init()  # initialize viterbi
Example #20
def makeModel(percentageSplit):
    brown_tags_words = []
    cutoff = int(len(corpus) * percentageSplit / 100)
    tagged_data = corpus[:cutoff]
    for sent in tagged_data:
        # sent is a list of word/tag pairs
        # add START/START at the beginning
        brown_tags_words.append(("START", "START"))
        # then all the tag/word pairs for the word/tag pairs in the sentence
        brown_tags_words.extend([(tag, word) for (word, tag) in sent])
        # then END/END
        brown_tags_words.append(("END", "END"))
    # conditional frequency distribution
    global cfd_tagwords
    cfd_tagwords = nltk.ConditionalFreqDist(brown_tags_words)
    # conditional probability distribution
    global cpd_tagwords
    cpd_tagwords = nltk.ConditionalProbDist(cfd_tagwords, nltk.MLEProbDist)

    # Estimating P(ti | t{i-1}) from corpus data using Maximum Likelihood Estimation (MLE):
    # P(ti | t{i-1}) = count(t{i-1}, ti) / count(t{i-1})
    brown_tags = [tag for (tag, word) in brown_tags_words]

    # make conditional frequency distribution:
    # count(t{i-1} ti)
    global cfd_tags
    cfd_tags = nltk.ConditionalFreqDist(nltk.bigrams(brown_tags))
    # make conditional probability distribution, using
    # maximum likelihood estimate:
    # P(ti | t{i-1})
    global cpd_tags
    cpd_tags = nltk.ConditionalProbDist(cfd_tags, nltk.MLEProbDist)

    #####
    # Viterbi:
    # If we have a word sequence, what is the best tag sequence?
    #
    # The method above lets us determine the probability for a single tag sequence.
    # But in order to find the best tag sequence, we need the probabilities
    # for _all_ tag sequences.
    # What Viterbi gives us is just a good way of computing all those many probabilities
    # as fast as possible.

    # what is the list of all tags?
    global distinct_tags
    distinct_tags = set(brown_tags)
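As a concrete companion to the comment block above, here is a minimal Viterbi sketch over the globals that makeModel() fills in (the helper name, the 90% split, and the input sentence are assumptions; with MLEProbDist, any unseen word zeroes out every path):

# Hypothetical helper; assumes makeModel(90) has populated the globals above.
def viterbi_best_tags(sentence):
    # One dict per word: tag -> best path probability / best previous tag.
    viterbi = [{}]
    backpointer = [{}]
    for tag in distinct_tags:
        if tag in ("START", "END"):
            continue
        viterbi[0][tag] = (cpd_tags["START"].prob(tag) *
                           cpd_tagwords[tag].prob(sentence[0]))
        backpointer[0][tag] = "START"
    for word in sentence[1:]:
        prev_v = viterbi[-1]
        this_v, this_b = {}, {}
        for tag in distinct_tags:
            if tag in ("START", "END"):
                continue
            best_prev = max(prev_v, key=lambda p: prev_v[p] * cpd_tags[p].prob(tag))
            this_v[tag] = (prev_v[best_prev] * cpd_tags[best_prev].prob(tag) *
                           cpd_tagwords[tag].prob(word))
            this_b[tag] = best_prev
        viterbi.append(this_v)
        backpointer.append(this_b)
    # Termination: best transition into END, then follow the backpointers.
    prev_v = viterbi[-1]
    best_prev = max(prev_v, key=lambda p: prev_v[p] * cpd_tags[p].prob("END"))
    tags = ["END", best_prev]
    for bp in reversed(backpointer):
        tags.append(bp[tags[-1]])
    tags.reverse()
    return tags  # includes the START/END markers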
Example #21
def getTagExpt(data):
    expt1 = Experiment()
    expt1.cfd_tags = nltk.ConditionalFreqDist(
        nltk.bigrams((tag for (word, tag) in getTagProb(data.train_items))))
    expt1.cpd_tags = nltk.ConditionalProbDist(expt1.cfd_tags, nltk.MLEProbDist)
    expt1.tagset = set((tag for (word, tag) in \
                        getTagProb(data.train_items)))
    return expt1
Example #22
def cal_cpd_tagwords(tags_words):
    cfd_tagwords = nltk.ConditionalFreqDist(tags_words)
    cpd_tagwords = nltk.ConditionalProbDist(cfd_tagwords, nltk.MLEProbDist)
    print("The probability of an adjective (JJ) being 'new' is",
          cpd_tagwords["JJ"].prob("new"))
    print("The probability of a verb (VB) being 'duck' is",
          cpd_tagwords["VB"].prob("duck"))
    return cpd_tagwords
Example #23
 def train(self, train_sents):
     cvc = self.check_corpus_validity(train_sents)
     if not cvc[1]:
         raise TypeError(cvc[0])
     train_tags_words = [ ]
     i = 0
     for sent in train_sents[self.sentstart:min(len(train_sents), self.sentend)]:
         print("Training: {}".format(i))
         i += 1
         # sent is a list of word/tag pairs
         # add START/START at the beginning
         train_tags_words.append( ("START", "START") )
         # then all the tag/word pairs for the word/tag pairs in the sentence.
         # shorten tags to 2 characters each
         train_tags_words.extend([ (tag[:self.trimtag], word) for (word, tag) in sent ])
         # then END/END
         train_tags_words.append( ("END", "END") )
     
     # conditional frequency distribution
     cfd_tagwords = nltk.ConditionalFreqDist(train_tags_words)
     # conditional probability distribution
     """cpd_tagwords contains emission probabilities"""
     self.cpd_tagwords = nltk.ConditionalProbDist(cfd_tagwords, nltk.MLEProbDist)
     
     """print("The probability of an adjective (JJ) being 'new' is", cpd_tagwords["JJ"].prob("new"))
     print("The probability of a verb (VB) being 'duck' is", cpd_tagwords["VB"].prob("duck"))
     """
     
     # Estimating P(ti | t{i-1}) from corpus data using Maximum Likelihood Estimation (MLE):
     # P(ti | t{i-1}) = count(t{i-1}, ti) / count(t{i-1})
     train_tags = [tag for (tag, word) in train_tags_words ]
     self.tags_list = list(set(train_tags))
     
     # make conditional frequency distribution:
     # count(t{i-1} ti)
     cfd_tags= nltk.ConditionalFreqDist(nltk.bigrams(train_tags))
     # make conditional probability distribution, using
     # maximum likelihood estimate:
     # P(ti | t{i-1})
     
     """cpd_tags contains transition probabilities"""
     self.cpd_tags = nltk.ConditionalProbDist(cfd_tags, nltk.MLEProbDist)
     
     """print("If we have just seen 'DT', the probability of 'NN' is", cpd_tags["DT"].prob("NN"))
Example #24
 def findCPD(self, typecfd=None):
     if typecfd is None:
         self.cpdTag = nltk.ConditionalProbDist(self.cfdTag,
                                                nltk.MLEProbDist)
         return self.cpdTag
     elif typecfd == "bi":
         return nltk.ConditionalProbDist(self.cfdBigram, nltk.MLEProbDist)
     else:
         print("invalid method")
Example #25
def generate_text(text, initialword, numwords):
    bigrams = list(nltk.ngrams(text, 2))
    cpd = nltk.ConditionalProbDist(nltk.ConditionalFreqDist(bigrams),
                                   nltk.MLEProbDist)
    word = initialword
    words = [initialword]
    for i in range(numwords):
        word = cpd[word].generate()
        words.append(word)
    print(' '.join(words) + '.')
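A hypothetical call (the corpus and seed word are assumptions; genesis ships with NLTK's data package):

generate_text(nltk.corpus.genesis.words('english-kjv.txt'), 'living', 20)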
Example #26
def train():
    print('Training HMM...')

    # Use the first 1000 sentences from the 'news' category of the Brown corpus
    labelled_sequences, states, symbols = get_pos_data(1000)

    # Define the estimator to be used for probability computation
    estimator = lambda fd, bins: nltk.LidstoneProbDist(fd, 0.1, bins)

    # count occurrences of starting states, transitions out of each state,
    # and output symbols observed in each state
    freq_starts = nltk.FreqDist()
    freq_transitions = nltk.ConditionalFreqDist()
    freq_emissions = nltk.ConditionalFreqDist()
    for sequence in labelled_sequences:
        lasts = None
        for token in sequence:
            state = token[1]
            symbol = token[0]
            if lasts is None:
                freq_starts[state] += 1
            else:
                freq_transitions[lasts][state] += 1
            freq_emissions[state][symbol] += 1
            lasts = state

            # update the state and symbol lists
            if state not in states:
                states.append(state)
            if symbol not in symbols:
                symbols.append(symbol)

    # create probability distributions (with smoothing)
    N = len(states)
    starts = estimator(freq_starts, N)
    transitions = nltk.ConditionalProbDist(freq_transitions, estimator, N)
    emissions = nltk.ConditionalProbDist(freq_emissions, estimator,
                                         len(symbols))

    # Return the transition and emissions probabilities along with
    # the list of all the states and output symbols
    return starts, transitions, emissions, states, symbols
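A hypothetical follow-up: the five returned values line up with the constructor of NLTK's HMM tagger (symbols, states, transitions, outputs, priors), so the trained distributions can drive tagging directly (the input words are assumptions).

starts, transitions, emissions, states, symbols = train()
tagger = nltk.HiddenMarkovModelTagger(symbols, states, transitions, emissions, starts)
print(tagger.tag(['the', 'dog', 'barked']))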
Example #27
def calculate_probability(trigrams):
    trigrams_as_bigrams = [((t[0], t[1]), t[2]) for t in trigrams]

    cfd = nltk.ConditionalFreqDist(trigrams_as_bigrams)
    cpd = nltk.ConditionalProbDist(cfd, nltk.MLEProbDist)

    # for trigram in trigrams_as_bigrams:
    #    if cpd[trigram[0]].prob(trigram[1]) == 1:
    #        print("{1} has probability {0}".format(cpd[trigram[0]].prob(trigram[1]), trigram))
    return cpd
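A hypothetical query sketch (the words variable and the word choices are assumptions): the returned distribution conditions on the first two words of each trigram.

cpd = calculate_probability(nltk.trigrams(words))
pair = ('in', 'the')  # assumed context
if pair in cpd:
    print(cpd[pair].prob('beginning'))  # P('beginning' | 'in', 'the')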
Example #28
def cal_cpd_tags(tags_words):
    brown_tags = [tag for (tag, word) in tags_words]
    cfd_tags = nltk.ConditionalFreqDist(nltk.bigrams(brown_tags))
    cpd_tags = nltk.ConditionalProbDist(cfd_tags, nltk.MLEProbDist)
    print("If we have just seen 'DT', the probability of 'NN' is",
          cpd_tags["DT"].prob("NN"))
    print("If we have just seen 'VB', the probability of 'JJ' is",
          cpd_tags["VB"].prob("DT"))
    print("If we have just seen 'VB', the probability of 'NN' is",
          cpd_tags["VB"].prob("NN"))
    return cpd_tags, list(set(brown_tags))
Example #29
def hmm():
    # calculate the frequency distribution for the tags, words
    conditionFreqDisttag = nltk.ConditionalFreqDist(brownTagsWords)
    # calculate the probability distribution for the tags,words by using Maximum Likelihood Estimation
    global conditionProbdistTag
    conditionProbdistTag = nltk.ConditionalProbDist(conditionFreqDisttag,
                                                    nltk.MLEProbDist)
    #extract the tags and train it in HMM
    global brownTagsTrained
    brownTagsTrained = [tag for (tag, word) in brownTagsWords]
    # calculate the bigram frequency distribution for the tags
    # P(ti | ti-1) = (C(ti-1, ti) + 1) / (C(ti-1) + V), where V is the number of distinct tags in the corpus
    freqDistTags = nltk.ConditionalFreqDist(nltk.bigrams(brownTagsTrained))
    # calculate the probability distribution for the tags using Laplace (add-one) smoothing
    # the HMM is now trained
    global probdistTags
    probdistTags = nltk.ConditionalProbDist(freqDistTags,
                                            nltk.LaplaceProbDist,
                                            bins=len(set(brownTagsTrained)))
    viterbi()
Example #30
def train_set1(x):
    global training_set1
    global set1_tags
    global set1_words
    global set1_cfd_word_tag
    global set1_cfd_tags
    global set1_cpd_word_tag
    global set1_cpd_tags
    set1_training_set_words = []
    for sent in training_set1:
        if x==0:
            set1_training_set_words.append(('<s>','<s>'))
        set1_training_set_words.extend([ (tag, word) for (word, tag) in sent ])
        if x==0:
            set1_training_set_words.append(('</s>','</s>'))
    set1_tags = [tag for (tag, word) in set1_training_set_words]
    set1_words = [word for (tag, word) in set1_training_set_words]
    set1_cfd_word_tag = nltk.ConditionalFreqDist(set1_training_set_words)
    set1_cfd_tags = nltk.ConditionalFreqDist(nltk.bigrams(set1_tags))
    set1_cpd_word_tag = nltk.ConditionalProbDist(set1_cfd_word_tag, nltk.MLEProbDist)
    set1_cpd_tags = nltk.ConditionalProbDist(set1_cfd_tags, nltk.MLEProbDist)