def emission_model(self, train_data):
    """
    Compute an emission model using a ConditionalProbDist.

    Tags act as the conditions and lowercased words as the observations.
    The model is stored on ``self.emission_PD`` and the tag set on
    ``self.states``; both are also returned.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The emission probability distribution and a list of the states
    :rtype: Tuple[ConditionalProbDist, list(str)]
    """
    # Lowercase the observation, otherwise it mismatches the test data.
    # No <s> or </s> markers are added to the input sentences.
    data = [
        (tag, word.lower())
        for sent in train_data
        for (word, tag) in sent
    ]

    emission_FD = ConditionalFreqDist(data)

    # Lidstone smoothing with gamma 0.01; the bin count is one more than the
    # number of bins the per-tag distribution reports via B().
    est = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)
    self.emission_PD = ConditionalProbDist(emission_FD, est)

    # Materialise the keys: on Python 3 ``keys()`` is a view, which is not
    # indexable and does not match the documented ``list(str)`` return type.
    self.states = list(emission_FD.keys())

    return self.emission_PD, self.states
def emission_model(self, train_data):
    """
    Build the emission model as a ConditionalProbDist.

    Every (word, tag) pair of every sentence is turned into a
    (tag, lowercased word) sample; tags are the conditions.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The emission probability distribution and a list of the states
    :rtype: Tuple[ConditionalProbDist, list(str)]
    """
    # Observations are lowercased so they match the test data; no <s> or
    # </s> markers are added to the input sentences.
    samples = [
        (pair[1], pair[0].lower())
        for sentence in train_data
        for pair in sentence
    ]

    cond_freqs = ConditionalFreqDist(samples)

    def lidstone(freq_dist):
        # Estimator applied to each tag's frequency distribution.
        return LidstoneProbDist(freq_dist, 0.01, freq_dist.B() + 1)

    self.emission_PD = ConditionalProbDist(cond_freqs, lidstone)
    self.states = list(cond_freqs.keys())

    return self.emission_PD, self.states
def transition_model(self, train_data):
    """
    Compute a transition model using a ConditionalProbDist.

    Each sentence contributes the tag bigrams (tag_i, tag_(i+1)), preceded
    by ("<s>", first_tag) and followed by (last_tag, "</s>"). The model is
    stored on ``self.transition_PD`` and returned.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The transition probability distribution
    :rtype: ConditionalProbDist
    """
    # The data object is a list of (condition, observation) tuples, here
    # (tag_(i), tag_(i+1)), including the start symbol <s> and the end
    # symbol </s>.
    data = []
    for sent in train_data:
        if not sent:
            # An empty sentence has no first or last tag; skip it instead
            # of failing on sent[0].
            continue
        tags = [pair[1] for pair in sent]
        data.append(("<s>", tags[0]))  # start symbol
        data.extend(zip(tags, tags[1:]))  # adjacent tag pairs
        data.append((tags[-1], "</s>"))  # end symbol

    transition_FD = ConditionalFreqDist(data)

    # Lidstone smoothing with gamma 0.01; the bin count is one more than the
    # number of bins the per-tag distribution reports via B().
    est = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)
    self.transition_PD = ConditionalProbDist(transition_FD, est)

    return self.transition_PD
def emission_model(self, train_data):
    """
    Compute an emission model using a ConditionalProbDist.

    Tags act as the conditions and lowercased words as the observations.
    The model is stored on ``self.emission_PD`` and the tag set on
    ``self.states``; both are also returned.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The emission probability distribution and a list of the states
    :rtype: Tuple[ConditionalProbDist, list(str)]
    """
    data = [
        (tag, word.lower())
        for sent in train_data
        for (word, tag) in sent
    ]

    emission_FD = ConditionalFreqDist(data)

    # Lidstone smoothing with gamma 0.01; the bin count is one more than the
    # number of bins the per-tag distribution reports via B().
    lidstone_estimator = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)
    self.emission_PD = ConditionalProbDist(emission_FD, lidstone_estimator)

    # Take the states from the distribution's conditions rather than
    # appending to a pre-existing self.states: this needs no prior
    # initialisation, drops states left over from an earlier call, and
    # gives a first-seen order instead of an arbitrary set order.
    self.states = list(emission_FD.keys())

    return self.emission_PD, self.states