Example #1
0
    def test(self, word_seq_path, output_path):
        """Tag a word sequence with Viterbi and write the result to a file.

        The word sequence is loaded through ``self.__prepare_word_seq``,
        which yields the original words and a processed version of them.
        The processed words are decoded, and each original word is written
        to ``output_path`` as ``word<TAB>tag`` on its own line. A falsy
        word (e.g. an empty string) is written as a blank line instead --
        presumably a sentence boundary; confirm against the loader.

        :param word_seq_path: path handed to ``__prepare_word_seq``
        :param output_path: path of the file to (over)write
        """
        raw_words, prepared_words = self.__prepare_word_seq(word_seq_path)
        tagger = Viterbi(
            self.vocab_list,
            self.tags,
            self.trans_prob,
            self.emit_prob,
        )
        # decode() returns a (tags, probability) pair; only the tags are
        # needed for the output file.
        predicted_tags, _ = tagger.decode(prepared_words)

        with open(output_path, "w") as out:
            for token, label in zip(raw_words, predicted_tags):
                line = "{0}\t{1}\n".format(token, label) if token else "\n"
                out.write(line)
Example #2
0
    def train(self, sequences, iterations=3):
        """Re-estimate the model by alternating Viterbi decoding and counting.

        Each iteration switches the model to log space, decodes every
        sequence with Viterbi (storing the prediction under ``'Z'``),
        switches back out of log space and retrains by counting on the
        freshly labelled sequences.

        :param sequences: mapping whose values are dicts with the
            observations under ``'X'``; each value gets its ``'Z'`` entry
            overwritten in place with the decoded sequence
        :param iterations: number of decode/count rounds to run
        :return: a ``Model`` built from ``self.keys``, ``self.model`` and
            ``self.labels`` after the last round
        """
        vit = Viterbi()
        for _ in range(iterations):
            self.log_space()
            for _name, seq in sequences.items():
                seq['Z'] = vit.decode(self, seq['X'])
                # Parenthesised single-argument print behaves the same
                # under Python 2 and Python 3.
                print(seq['Z'])
            # Leave log space again before counting.
            self.delog()
            self.train_by_counting(sequences)
            print(Model(self.keys, self.model, self.labels))

        return Model(self.keys, self.model, self.labels)
Example #3
0
 def train(self, sequences, iterations=3):
     """Re-estimate the model by alternating Viterbi decoding and counting.

     Each iteration switches the model to log space, decodes every
     sequence with Viterbi (storing the prediction under ``'Z'``),
     switches back out of log space and retrains by counting on the
     freshly labelled sequences.

     :param sequences: mapping whose values are dicts with the
         observations under ``'X'``; each value gets its ``'Z'`` entry
         overwritten in place with the decoded sequence
     :param iterations: number of decode/count rounds to run
     :return: a ``Model`` built from ``self.keys``, ``self.model`` and
         ``self.labels`` after the last round
     """
     vit = Viterbi()
     for _ in range(iterations):
         self.log_space()
         for _name, seq in sequences.items():
             seq['Z'] = vit.decode(self, seq['X'])
             # Parenthesised single-argument print behaves the same
             # under Python 2 and Python 3.
             print(seq['Z'])
         # Leave log space again before counting.
         self.delog()
         self.train_by_counting(sequences)
         print(Model(self.keys, self.model, self.labels))

     return Model(self.keys, self.model, self.labels)
Example #4
0
    def test(self):
        """Tag the test words with Viterbi and write them to ``PRED_T_POS``.

        Loads the test split via ``dataloader``, decodes the preprocessed
        words, stores the ``(word, tag)`` pairs in ``self.pred_tags`` and
        writes them out one ``word<TAB>tag`` pair per line. A falsy word
        is written as a blank line. Progress and elapsed time (seconds,
        from ``time.time()``) are printed to stdout.
        """
        print('Test started...')
        start_test = time.time()
        # Reset first so a failure while loading/decoding does not leave
        # predictions from an earlier run behind.
        self.pred_tags = []
        test_orig, test_prep = dataloader(self.corpus + TEST_WORDS, 'test')
        tagger = Viterbi(self.vocab, self.tags, test_prep, self.A, self.B)
        preds = tagger.decode()
        self.pred_tags = list(zip(test_orig, preds))

        # The with-block closes the file on exit; no explicit close() is
        # needed afterwards.
        with open(PRED_T_POS, 'w') as out:
            for word, tag in self.pred_tags:
                if not word:
                    out.write("\n")
                else:
                    out.write("{0}\t{1}\n".format(word, tag))
        elapsed = time.time() - start_test
        print('Test finished, file has been written in ' + str(elapsed))
Example #5
0
    def validate(self):
        """Tag the validation words with Viterbi and write them to ``PRED_V_POS``.

        Loads the validation split via ``dataloader``, decodes the
        preprocessed words, stores the ``(word, tag)`` pairs in
        ``self.pred_tags`` and writes them out one ``word<TAB>tag`` pair
        per line. A falsy word is written as a blank line. Progress and
        elapsed time (seconds, from ``time.time()``) are printed to stdout.
        """
        print('Validation started...')
        start_val = time.time()
        # Reset first so a failure while loading/decoding does not leave
        # predictions from an earlier run behind.
        self.pred_tags = []
        valid_orig, valid_prep = dataloader(self.corpus + VALIDATE_WORDS,
                                            'validate')
        tagger = Viterbi(self.vocab, self.tags, valid_prep, self.A, self.B)
        preds = tagger.decode()
        self.pred_tags = list(zip(valid_orig, preds))

        # The with-block closes the file on exit; no explicit close() is
        # needed afterwards.
        with open(PRED_V_POS, 'w') as out:
            for word, tag in self.pred_tags:
                if not word:
                    out.write("\n")
                else:
                    out.write("{0}\t{1}\n".format(word, tag))
        elapsed = time.time() - start_val
        print('Validation ended, file has been written in ' + str(elapsed))
Example #6
0
def test_viterbi_decode():
    """Check Viterbi decoding on the small Cow/Duck HMM from the HW3 question.

    Builds a two-state model (plus the start/end tag), decodes the
    observation sequence ``moo hello quack <end>`` and verifies both the
    predicted tag sequence and its log-probability.
    """
    log = logging.getLogger('test viterbi')

    # Tiny stand-in used wherever the model has a zero probability.
    ZERO = 0.000000000000000001
    obs_space = ["moo", "hello", "quack", START_END_OBS]
    states = ["Cow", "Duck", START_END_TAG]
    # Row i / column j: probability of moving from states[i] to states[j].
    trans_prob = [[0.5, 0.3, 0.2], [0.3, 0.5, 0.2], [1.0, ZERO, ZERO]]
    # Row i / column j: probability of states[i] emitting obs_space[j].
    emit_prob = [[0.9, 0.1, ZERO, ZERO], [ZERO, 0.4, 0.6, ZERO],
                 [ZERO, ZERO, ZERO, 1.0]]

    decoder = Viterbi(obs_space, states, trans_prob, emit_prob)

    obs = ["moo", "hello", "quack", START_END_OBS]
    seq, prob = decoder.decode(obs)

    log.debug("seq: %s", seq)
    log.debug("log_prob: %s", prob)

    # The reference value is only given to five decimal places, so compare
    # with an absolute tolerance. The previous one-sided check
    # (``prob - expected < ZERO``) also accepted any log-probability that
    # was far *below* the expected one.
    expected_log_prob = -5.03903
    tolerance = 1e-4
    assert abs(prob - expected_log_prob) < tolerance
    assert seq == ["Cow", "Duck", "Duck", START_END_TAG]
Example #7
0
outputs.to_project_1_probs_file(sequences.get(), probs, 'viterbi-probs.txt')


"""
if __name__ == '__main__':
    model = hmm.Model(KEYS)
    model.load(HMMFILE)
    sequences = sequences.Sequences(SEQUENCEFILE)
    # load methods
    vit = Viterbi()
    post = Posterior()

    # viterbi
    probs = {}
    for key, sequence in sequences.get().items():
        probs[key] = vit.decode(model, sequence)

    outputs.to_project_2_viterbi(sequences.get(), probs,
                                 'pred-test-sequences-project2-viterbi.txt')

    probs = {}
    for key, value in sequences.get().items():
        sequence = {'Z': post.decode(model, value), 'X': value}
        log_joint = compute_hmm(model, sequence)

        probs[key] = (log_joint, sequence['Z'])

    #outputs.to_project_2_posterior(sequences.get(), probs, 'posterior-output.txt')
    outputs.to_project_2_posterior(
        sequences.get(), probs, 'pred-test-sequences-project2-posterior.txt')
    # testing
Example #8
0
def cross_validation(sequences, training_method, decoder):
    """
    Cross-validate a training method with Viterbi and posterior decoding.

    One fold is run per element of ``sequences`` (so ten groups give a
    10-fold cross-validation): that element is held out for validation, a
    model is trained on the merged remainder, and the held-out sequences
    are decoded with both a ``Viterbi`` and a ``Posterior`` decoder. The
    tp/fp/tn/fn counts are accumulated per fold and the fourth value
    returned by ``compare_tm_pred.compute_stats`` is recorded per fold.
    The mean and variance of that value over all folds are printed for
    both decoders; nothing is returned.

    :param sequences: list of sequence dictionaries, one per fold; each
        merged entry must provide the observations under ``'X'`` and the
        true labelling under ``'Z'``
    :param training_method: callable taking the merged training
        dictionary and returning a model
    :param decoder: unused -- both decoders are always evaluated; kept so
        existing callers keep working
    """
    fold_count = len(sequences)
    # Per-fold summary statistic for each decoder.
    vit_total_ac = np.zeros(fold_count)
    post_total_ac = np.zeros(fold_count)
    vit = Viterbi()
    post = Posterior()

    for i in range(fold_count):
        vit_total_scores = np.zeros([4])
        post_total_scores = np.zeros([4])
        # Copy the list so popping the validation fold leaves the caller's
        # list untouched.
        training_data_array = sequences[:]
        validation_data_array = [training_data_array.pop(i)]

        # Merge the lists of dictionaries into single dictionaries.
        training_data = merge(training_data_array)
        validation_data = merge(validation_data_array)
        # The training function returns a model.
        model = training_method(training_data)

        # Predict fold i with both decoders.
        for key, sequence in validation_data.items():
            true_seq = sequence['Z']
            vit_pred_seq = vit.decode(model, sequence['X'])
            post_pred_seq = post.decode(model, sequence['X'])

            tp, fp, tn, fn = compare_tm_pred.count(true_seq, vit_pred_seq)
            vit_total_scores += np.array([tp, fp, tn, fn])

            tp, fp, tn, fn = compare_tm_pred.count(true_seq, post_pred_seq)
            post_total_scores += np.array([tp, fp, tn, fn])

            if VERBOSE:
                # NOTE: tp/fp/tn/fn hold the posterior counts here, since
                # they were overwritten just above.
                print(">" + key)
                compare_tm_pred.print_stats(tp, fp, tn, fn)
                print("")

        vit_total_ac[i] = compare_tm_pred.compute_stats(*vit_total_scores)[3]
        post_total_ac[i] = compare_tm_pred.compute_stats(*post_total_scores)[3]

        if VERBOSE:
            print("Summary 10-fold cross validation over index %i :" % (i))
            print("")
            print("")
            print("")
            print("-------------------------------------------------------")
            if DEBUG:
                # raw_input only exists on Python 2; input is its
                # Python 3 equivalent.
                try:
                    pause = raw_input
                except NameError:
                    pause = input
                pause("press any key to continue\n")

    print("Overall viterbi result mean: %s, variance: %s" % (
        np.mean(vit_total_ac), np.var(vit_total_ac)))
    print("Posterior mean: %s, variance %s" % (np.mean(post_total_ac),
                                               np.var(post_total_ac)))
Example #9
0
def cross_validation(sequences, training_method, decoder):
    """
    Cross-validate a training method with Viterbi and posterior decoding.

    One fold is run per element of ``sequences`` (so ten groups give a
    10-fold cross-validation): that element is held out for validation, a
    model is trained on the merged remainder, and the held-out sequences
    are decoded with both a ``Viterbi`` and a ``Posterior`` decoder. The
    tp/fp/tn/fn counts are accumulated per fold and the fourth value
    returned by ``compare_tm_pred.compute_stats`` is recorded per fold.
    The mean and variance of that value over all folds are printed for
    both decoders; nothing is returned.

    :param sequences: list of sequence dictionaries, one per fold; each
        merged entry must provide the observations under ``'X'`` and the
        true labelling under ``'Z'``
    :param training_method: callable taking the merged training
        dictionary and returning a model
    :param decoder: unused -- both decoders are always evaluated; kept so
        existing callers keep working
    """
    fold_count = len(sequences)
    # Per-fold summary statistic for each decoder.
    vit_total_ac = np.zeros(fold_count)
    post_total_ac = np.zeros(fold_count)
    vit = Viterbi()
    post = Posterior()

    for i in range(fold_count):
        vit_total_scores = np.zeros([4])
        post_total_scores = np.zeros([4])
        # Copy the list so popping the validation fold leaves the caller's
        # list untouched.
        training_data_array = sequences[:]
        validation_data_array = [training_data_array.pop(i)]

        # Merge the lists of dictionaries into single dictionaries.
        training_data = merge(training_data_array)
        validation_data = merge(validation_data_array)
        # The training function returns a model.
        model = training_method(training_data)

        # Predict fold i with both decoders.
        for key, sequence in validation_data.items():
            true_seq = sequence['Z']
            vit_pred_seq = vit.decode(model, sequence['X'])
            post_pred_seq = post.decode(model, sequence['X'])

            tp, fp, tn, fn = compare_tm_pred.count(true_seq, vit_pred_seq)
            vit_total_scores += np.array([tp, fp, tn, fn])

            tp, fp, tn, fn = compare_tm_pred.count(true_seq, post_pred_seq)
            post_total_scores += np.array([tp, fp, tn, fn])

            if VERBOSE:
                # NOTE: tp/fp/tn/fn hold the posterior counts here, since
                # they were overwritten just above.
                print(">" + key)
                compare_tm_pred.print_stats(tp, fp, tn, fn)
                print("")

        vit_total_ac[i] = compare_tm_pred.compute_stats(*vit_total_scores)[3]
        post_total_ac[i] = compare_tm_pred.compute_stats(*post_total_scores)[3]

        if VERBOSE:
            print("Summary 10-fold cross validation over index %i :" % (i))
            print("")
            print("")
            print("")
            print("-------------------------------------------------------")
            if DEBUG:
                # raw_input only exists on Python 2; input is its
                # Python 3 equivalent.
                try:
                    pause = raw_input
                except NameError:
                    pause = input
                pause("press any key to continue\n")

    print("Overall viterbi result mean: %s, variance: %s" % (
        np.mean(vit_total_ac), np.var(vit_total_ac)))
    print("Posterior mean: %s, variance %s" % (np.mean(post_total_ac),
                                               np.var(post_total_ac)))