Example #1
def predict(train_path, threshold, reg_lambda, test_path, conf, beam_width, file_name):
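    # Load the trained MEMM weight vector and rebuild feature statistics from the training corpus.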
    v = MaximumEntropyMarkovModel.load_v_from_pickle(dump_weights_path='weights', threshold=threshold,
                                                     reg_lambda=reg_lambda)
    ft_statistics = FeatureStatistics(input_file_path=train_path, threshold=threshold, config=conf)
    ft_statistics.pre_process(fill_possible_tag_dict=False)
    is_comp = 'comp' in file_name
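    # 'comp' files use the competition (untagged) reader; other test files use the tagged reader.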
    if is_comp:
        test_sentence_hist_list = FeatureStatistics.fill_comp_ordered_history_list(file_path=test_path)
    else:
        test_sentence_hist_list = FeatureStatistics.fill_tagged_ordered_history_list(file_path=test_path, is_test=True)
    tag_set = ft_statistics.tags_set
    all_possible_tags_dict = ft_statistics.hist_to_feature_vec_dict
    get_ft_from_hist_func = ft_statistics.get_non_zero_feature_vec_indices_from_history
    word_possible_tag_set = ft_statistics.word_possible_tag_set
    word_possible_tag_with_threshold_dict = ft_statistics.word_possible_tag_with_threshold_dict
    rare_words_tags = ft_statistics.rare_words_tags

    viterbi = Viterbi(
        v=v, sentence_hist_list=test_sentence_hist_list, tags_list=tag_set,
        all_possible_tags_dict=all_possible_tags_dict, get_feature_from_hist=get_ft_from_hist_func,
        word_possible_tag_set=word_possible_tag_set,
        word_possible_tag_with_threshold_dict=word_possible_tag_with_threshold_dict,
        rare_words_tags=rare_words_tags,
        threshold=threshold,
        reg_lambda=reg_lambda,
        file_name=file_name,
        beam_width=beam_width
    )
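    # Decode all test sentences across parallel worker processes.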
    viterbi.predict_all_test(num_workers=4, is_comp=is_comp)
Example #2
def main(logger):

    print "Initializing Data Parser..."
    data_parser         = OrwellDataParser(logger)

    print "Initializing HMM..."
    hmm_obj             = HMM(logger, data_parser, skew_unseen=True)

    print "Initializing Viterbi..."
    viterbi_obj         = Viterbi(logger, hmm_obj)

    print "Initializing Accuracy Estimator..."
    accuracy_estimator  = AccuracyEstimator(logger, data_parser)

    for language, language_file in DATA_FILES:
        print "******************************************************************************************"
        print language
        print "******************************************************************************************"

        print "Training HMM with %s data..." % language
        viterbi_obj.train(language_file, START_LINE - 1)

        print "Estimating accuracy of the model..."
        total_accuracy, unseen_accuracy = accuracy_estimator.compute_parameters(viterbi_obj, language_file, START_LINE)

        print "TOTAL ACCURACY           : %.10f" % total_accuracy
        print "UNSEEN_ACCURACY          : %.10f" % unseen_accuracy
        print "Resetting model and estimator parameters..."

        viterbi_obj.reset()
        accuracy_estimator.reset()
Example #3
    def test(self):
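        # Decode each test sentence and score slot-level precision, recall, and F1.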
        v = Viterbi("model.txt")
        predicted_slot_count = 0
        actual_slot_count = 0
        hit_count = 0
        test_set_size = len(self.test_set[0])

        print("poccessing...")
        for i in range(test_set_size):
            if i != 0 and i % 100 == 0:
                print(str(i) + " done")
            sentence = list()
            for wordidx in self.test_set[0][i]:
                sentence.append(self.__idx2words[wordidx])
            predicted_seq = v.poccess(sentence)
            predicted_slot = extract_slot(predicted_seq)

            label_seq = list()
            for labelidx in self.test_set[2][i]:
                label_seq.append(self.__idx2labels[labelidx])
            actual_slot = extract_slot(label_seq)

            for item in predicted_slot:
                if item in actual_slot:
                    hit_count += 1
            predicted_slot_count += len(predicted_slot)
            actual_slot_count += len(actual_slot)

        print("test set size:" + str(test_set_size))
        print("predicted slot:" + str(predicted_slot_count) + " actual slot:" +
              str(actual_slot_count) + " hit:" + str(hit_count))
        print("Precision:" + str(hit_count / predicted_slot_count))
        print("Recall:" + str(hit_count / actual_slot_count))
        print("F1score:" + str(2 * hit_count /
                               (actual_slot_count + predicted_slot_count)))
Example #4
    def test_wikipedia(self):
        hmm = {
            'Rainy': [('Rainy', 0.7), ('Sunny', 0.3)],
            'Sunny': [('Rainy', 0.4), ('Sunny', 0.6)]
        }

        start_probabilities = {'Rainy': 0.6, 'Sunny': 0.4}

        emission_probabilities = {
            'Rainy': {
                'walk': 0.1,
                'shop': 0.4,
                'clean': 0.5
            },
            'Sunny': {
                'walk': 0.6,
                'shop': 0.3,
                'clean': 0.1
            }
        }

        vit = Viterbi(hmm,
                      lambda state, obs: emission_probabilities[state][obs])

        (v, p) = vit.step('walk', start_probabilities)
        (v, p) = vit.step('shop', v, p)
        (v, p) = vit.step('clean', v, p)

        max_state = max(v, key=lambda x: v[x])
        assert (p[max_state] == ['Sunny', 'Rainy', 'Rainy'])
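
A minimal sketch of the incremental decoder this test assumes (the class name and the step() signature come from the test above; the body is an illustrative assumption, not the project's actual implementation):

class Viterbi:
    """Incremental Viterbi: hmm maps each state to a list of
    (next_state, transition_prob) pairs; emission is a callable
    (state, observation) -> probability."""

    def __init__(self, hmm, emission):
        self.hmm = hmm
        self.emission = emission

    def step(self, obs, v, paths=None):
        if paths is None:
            # First observation: v holds the start distribution.
            v = {s: p * self.emission(s, obs) for s, p in v.items()}
            return v, {s: [s] for s in v}
        new_v, new_paths = {}, {}
        for state in self.hmm:
            # Best predecessor: prior probability times transition probability.
            prev, prob = max(
                ((src, v[src] * t_prob)
                 for src in self.hmm
                 for dst, t_prob in self.hmm[src] if dst == state),
                key=lambda x: x[1])
            new_v[state] = prob * self.emission(state, obs)
            new_paths[state] = paths[prev] + [state]
        return new_v, new_paths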
Example #5
def main():
    bijen = Bijenkhan(BIJEN_CORPUS)
    sents_tags = []
    for sents, tags in bijen.sent_tag_gen(100):
        s = zip(sents, tags)
        sents_tags.extend(s)
    random.shuffle(sents_tags)
    test_sents_tags = sents_tags[:NUM_TEST_SAMPES]
    train_sents_tags = sents_tags[NUM_TEST_SAMPES:]
    viterbi = Viterbi(len(bijen.get_tags()),
                      len(bijen.get_vocab()),
                      bijen.get_tags(),
                      bijen.get_bigram_tags(),
                      train_sents_tags)

    for i in range(len(test_sents_tags)):
        true_labels = test_sents_tags[i][1]
        print(GREEN + 'True labels: ', true_labels)
        tmp = test_sents_tags[i][0]
        pred_labels = viterbi.viterbi(tmp[1:-1])
        print(RED + 'Pred labels: ', pred_labels)
        print(CYAN + f'Accuracy: {accuracy_score(true_labels, pred_labels)}')
        print(CYAN + f'Precision: {precision_score(true_labels, pred_labels, average="macro")}')
        print(CYAN + f'Recall: {recall_score(true_labels, pred_labels, average="macro")}')
        print('\n'*2)
Example #6
    def run_viterbi(self):
        #print "Fin"
        #print self.observations

        viterbi = Viterbi(self.observations, self.states,
                          self.start_probability, self.transition_probability,
                          self.emission_probability)
        (junk, deduced_path) = viterbi.run_viterbi()
        self.checkSolutions(deduced_path)
Example #7
def example2():
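    # Align each frame of the (frames x states) likelihood matrix to the transcript.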
    likelihood = np.loadtxt('likelihood.txt')
    print('probs shape: %s ' % str(likelihood.shape))
    transcript = [2, 1, 3, 1, 3]

    viterbi = Viterbi(transcript, likelihood)
    alignement = viterbi.inference()
    assert len(alignement) == likelihood.shape[0]
    counter = count(alignement, transcript)
    print(alignement)
    print(counter)
Example #8
    def test(self, word_seq_path, output_path):
        original_seq, processed_seq = self.__prepare_word_seq(word_seq_path)
        decoder = Viterbi(self.vocab_list, self.tags, self.trans_prob,
                          self.emit_prob)
        tags_pred, prob = decoder.decode(processed_seq)

        with open(output_path, "w") as out:
            for word, tag in zip(original_seq, tags_pred):
                if not word:
                    out.write("\n")
                else:
                    out.write("{0}\t{1}\n".format(word, tag))
Example #9
def example3():
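    # Like example2, but transcript states are named and mapped to likelihood columns via state2idx.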
    likelihood = np.loadtxt('likelihood.txt')
    print('probs shape: %s ' % str(likelihood.shape))
    transcript = ['a', 'b', 'c', 'b', 'c']
    state2idx = {'a': 2, 'b': 1, 'c': 3}

    viterbi = Viterbi(transcript, likelihood, state2idx=state2idx)
    alignement = viterbi.inference()
    assert len(alignement) == likelihood.shape[0]
    counter = count(alignement, transcript)
    print(alignement)
    print(counter)
Example #10
    def get_observations(self):
        self.print_conditions()

        contents = self.contents[self.index + 1:]

        correct_answers = []
        correct_answer = ()
        observations = []
        observation = ()

        for (i, line) in enumerate(contents):
            read_line = line.rstrip()
            letters = read_line.split(" ")

            if read_line == "_ _":
                observations.append(observation)
                observation = ()
                continue
            if i + 1 == len(contents):
                observation = observation + (letters[1], )
                correct_answers.append(letters[0])
                observations.append(observation)
                observation = ()
                break

            observation = observation + (letters[1], )
            correct_answers.append(letters[0])

        corrected_letters = []

        hit = 0
        total = 0
        for (i, observation) in enumerate(observations):
            viterbi = Viterbi(observation, self.states, self.start_probability,
                              self.transition_probability,
                              self.emission_probability)
            corrected_letters = corrected_letters + viterbi.run_viterbi()[1]

        print "Some of the reconstructed state sequence: "
        for (i, letter) in enumerate(corrected_letters):
            if letter == correct_answers[i]:
                hit += 1
            if self.iteration < 100:
                print letter,
                self.iteration += 1
            total += 1

        print "\nPercent correctness:", hit / float(total) * 100
Example #11
    def train(self, sequences, iterations=3):
        # Viterbi (hard-EM) training: decode each sequence, then
        # re-estimate the model by counting.
        vit = Viterbi()
        for x in range(iterations):
            self.log_space()
            for name, seq in sequences.items():
                seq['Z'] = vit.decode(self, seq['X'])
                print seq['Z']
            # we return from log space
            self.delog()
            self.train_by_counting(sequences)
            print Model(self.keys, self.model, self.labels)

        return Model(self.keys, self.model, self.labels)
Example #14
    def __init__(self,
                 hmm,
                 emission_probability,
                 constraint_length=10,
                 MAX_DIST=500,
                 priors=None,
                 smallV=0.00000000001):
        # initialize spatial index
        self.previous_obs = None

        if priors is None:
            priors = dict([(state, 1.0 / len(hmm)) for state in hmm])

        state_spatial_index = Rtree()
        unlocated_states = []
        id_to_state = {}
        id = 0
        for state in hmm:
            geom = self.geometry_of_state(state)
            if not geom:
                unlocated_states.append(state)
            else:
                ((lat1, lon1), (lat2, lon2)) = geom
                state_spatial_index.insert(id, (min(lon1, lon2), min(
                    lat1, lat2), max(lon1, lon2), max(lat1, lat2)))
                id_to_state[id] = state
                id = id + 1

        def candidate_states(obs):  #was (lat,lon) in place of obs
            geom = self.geometry_of_observation(obs)
            if geom is None:
                return hmm.keys()
            else:
                (lat, lon) = geom
                nearby_states = state_spatial_index.intersection(
                    (lon - MAX_DIST / METERS_PER_DEGREE_LONGITUDE,
                     lat - MAX_DIST / METERS_PER_DEGREE_LATITUDE,
                     lon + MAX_DIST / METERS_PER_DEGREE_LONGITUDE,
                     lat + MAX_DIST / METERS_PER_DEGREE_LATITUDE))

                candidates = [id_to_state[id]
                              for id in nearby_states] + unlocated_states
                return candidates

        self.viterbi = Viterbi(hmm,
                               emission_probability,
                               constraint_length=constraint_length,
                               priors=priors,
                               candidate_states=candidate_states,
                               smallV=smallV)
Example #15
def main():
    init_prob = [0, 0.5, 0.5]

    num_state = 2

    a = [[0, 0.5, 0.5], [0, 0.6, 0.4], [0, 0.7, 0.3]]

    b = [[0, 0, 0], [0, 0.7, 0.3], [0, 0.2, 0.8]]

    T = 2

    offset = 1

    Viterbi.get_optimal_state_sequence(init_prob, a, b, num_state, T, offset)
Example #16
    def _writer_viterbi(self):
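        # Buffer words until the end-of-sentence marker '.', run Viterbi on the buffered sentence, then emit word/tag lines.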
        sentence = []
        original_sentence = []
        tag_set = []
        lines_to_write = []

        with open(self.input_file, "r") as f:
            data = f.readlines()

            for line in data:

                words = line.split()
                if words and words[1] != '.':
                    current_word = words[1]
                    local_tag_set = []
                    for k, v in emission_probability.iteritems():
                        keys = k.split('|')
                        if keys[0] == words[1]:
                            local_tag_set.append(keys[1])
                    if not local_tag_set:
                        words[1] = UNKNOWN_WORD
                        local_tag_set = get_unknown_word_tags()
                    sentence.append(words[1])
                    original_sentence.append(current_word)
                    tag_set.extend(local_tag_set)
                elif words and words[1] == '.':
                    # send sentence to viterbi to compute tags.
                    viterbi = Viterbi(
                        tag_set=list(set(tag_set)),
                        word_set=sentence,
                        transition_probability=transition_probability,
                        emission_probability=emission_probability)
                    viterbi_states = viterbi.get_viterbi_states()

                    for word in range(0, len(sentence)):
                        lines_to_write.append(
                            str(word + 1) + '\t' + original_sentence[word] +
                            '\t' + viterbi_states[word] + '\n')

                    lines_to_write.append(
                        str(len(sentence) + 1) + '\t' + '.' + '\t' + '.' +
                        '\n')
                    lines_to_write.append('\n')
                    sentence = []
                    original_sentence = []
                    tag_set = []

        with open(self.output_path, 'w') as of:
            of.writelines(lines_to_write)
Example #17
    def __init__(self, ref_frames_data_filename, ref_pickle_filename, test_pickle_filename):
        print "init..."
        self.previous_obs = None
        self.image_processor = ImageProcessor()

        self.descriptors_ref = self.image_processor.load_sift(ref_pickle_filename)
        self.descriptors_test = self.image_processor.load_sift(test_pickle_filename)

        hmm = self.ref_frames_data_to_hmm(ref_frames_data_filename)
        
        #emission_probabilities = map(lambda x: complementary_normal_distribution_cdf(x,0,EMISSION_SIGMA),range(0,int(3.0*EMISSION_SIGMA)))
            
        priors = dict([(state, 1.0 / len(hmm)) for state in hmm])

        self.viterbi = Viterbi(hmm, self.emission_probability,
                               constraint_length=2500,  # BE CAREFUL: walking can take a long time, so a higher value may be needed here
                               priors=priors)
Example #18
    def cross_validation(self):
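        # 10-fold CV: Naive Bayes predictions become the Viterbi observations;
        # emission probabilities are estimated per fold from predicted vs. actual labels.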
        cv_data = self.group_data(self.data)
        # Transition probabilities are estimated on the entire training set, not per fold.
        trans_probs = self.comp_transition_prob(self.data)
        # Do 10-fold cross validation below.
        k = 10
        for i in range(0, k):
            train_set = []
            valid_set = cv_data[i]
            print("Validation Fold ", i + 1)
            for j in range(0, k):
                if j != i:
                    train_set += cv_data[j]
            # Do the Naive Bayes Classification here
            self.estimate_nb(train_set)
            nb_pred_labels = self.predict(valid_set)
            nb_act_labels = [item[0] for item in valid_set]
            nb_acc = len(
                np.where(
                    np.array(nb_pred_labels) == np.array(nb_act_labels))[0])
            print("Validation Accuracy of Naive Bayes ",
                  nb_acc / len(nb_act_labels))

            # The emission probabilities are done for each cv dataset.
            emission_probs = self.comp_emission_prob(nb_pred_labels,
                                                     nb_act_labels)
            valid_words = self.dt.build_test_words(valid_set)
            # Do the Viterbi step here
            vt_pred_labels = []
            vt_act_labels = []
            nb_pred_labels = []
            itr = 0
            for w in valid_words:
                nb_pred_word = self.predict(valid_set[itr:(itr + len(w))])
                nb_pred_labels += nb_pred_word
                vit = Viterbi(emission_probs, trans_probs, nb_pred_word)
                vt_pred_labels += vit.hmmWord()
                itr += len(w)
            nb_acc = len(
                np.where(
                    np.array(nb_pred_labels) == np.array(nb_act_labels))[0])
            vt_acc = len(
                np.where(
                    np.array(vt_pred_labels) == np.array(nb_act_labels))[0])
            print("Validation Accuracy of Viterbi ",
                  vt_acc / len(nb_act_labels))
Example #19
def main(logger):
    
    print "Initializing Data Parser..."
    data_parser         = OrwellDataParser(logger)

    print "Initializing Linear Sequence Model..."
    ls_obj              = LinearSequence(logger, data_parser, use_avg=True, use_suffix=True)

    print "Initializing Viterbi..."
    viterbi_obj         = Viterbi(logger, ls_obj)

    print "Initializing Accuracy Estimator..."
    accuracy_estimator  = AccuracyEstimator(logger, data_parser)

    for language, language_file in DATA_FILES:
        print "******************************************************************************************"
        print language
        print "******************************************************************************************"

        print "Training Linear Sequence Linear Sequence Model with %s data..." % language
        viterbi_obj.train(language_file, START_LINE - 1)

        #import pdb;pdb.set_trace()
        print viterbi_obj.predict_sequence(["his", "breast", "rose", "and", "fell", "a", "little", "faster", "."])
    
        print "Estimating accuracy of the model..."
        total_accuracy, unseen_accuracy = accuracy_estimator.compute_parameters(viterbi_obj, language_file, START_LINE)
        
        print "TOTAL ACCURACY           : %.10f" % total_accuracy
        print "UNSEEN_ACCURACY          : %.10f" % unseen_accuracy
        print "Resetting model and estimator parameters..."
        
        viterbi_obj.reset()
        accuracy_estimator.reset()
Example #20
    def viterbi(self, train_path, test_path, output_path):

        self._nerdic = NERDic(train_path)
        io = self._io
        train_sentences = []
        test_sentences = []
        for words, poss, labels in io.read_sentences(train_path):
            train_sentences.append(Sentence(labels, words, poss, self._nerdic))

        for words, poss, labels in io.read_sentences(test_path):
            test_sentences.append(Sentence(labels, words, poss, self._nerdic))

        viterbi = Viterbi(9)
        viterbi.train(train_sentences)
        for sent in test_sentences:
            predict_ids = viterbi.search(sent)
            sent.add_predict(predict_ids)

        io.write_sentences(output_path, test_sentences)
Example #21
    def test(self):
        print('Test started...')
        start_test = time.time()
        self.pred_tags = []
        test_orig, test_prep = dataloader(self.corpus + TEST_WORDS, 'test')
        tagger = Viterbi(self.vocab, self.tags, test_prep, self.A, self.B)
        preds = tagger.decode()
        for word, tag in zip(test_orig, preds):
            self.pred_tags.append((word, tag))

        with open(PRED_T_POS, 'w') as out:
            for word, tag in self.pred_tags:
                if not word:
                    out.write("\n")
                else:
                    out.write("{0}\t{1}\n".format(word, tag))
        print('Test finished, file has been written in ' +
              str(time.time() - start_test) + ' seconds')
Example #22
    def validate(self):
        print('Validation started...')
        start_val = time.time()
        self.pred_tags = []
        valid_orig, valid_prep = dataloader(self.corpus + \
                VALIDATE_WORDS, 'validate')
        tagger = Viterbi(self.vocab, self.tags, valid_prep, self.A, self.B)
        preds = tagger.decode()
        for word, tag in zip(valid_orig, preds):
            self.pred_tags.append((word, tag))

        with open(PRED_V_POS, 'w') as out:
            for word, tag in self.pred_tags:
                if not word:
                    out.write("\n")
                else:
                    out.write("{0}\t{1}\n".format(word, tag))
        print('Validation ended, file has been written in ' +
              str(time.time() - start_val) + ' seconds')
Example #23
    def train_iteration(self, filepath):
        # Perceptron-style update: decode with the current weights, then
        # add gold feature counts and subtract predicted feature counts.
        viterbi = Viterbi("EMPTY")
        viterbi.v = self.v
        with open(filepath) as train_file:
            corpus = gen_sentence_train(train_file)
            count = 0
            for doc in corpus:
                count += 1
                if count % 1000 == 0:
                    _logger.debug("%d sentences processed" % count)
                sent = [s[0] for s in doc]
                tags = [s[1] for s in doc]
                tags_pred = viterbi.decode_one(list(sent))
                assert len(sent) == len(tags) == len(tags_pred)
                feat_gold = feat_vect(sent, tags)
                feat_pred = feat_vect(sent, tags_pred)
                for feat in feat_pred:
                    self.v[feat] -= feat_pred[feat]
                for feat in feat_gold:
                    self.v[feat] += feat_gold[feat]
Example #24
    def get_example(self, idx):
        valid_indices = np.arange(self.dataset_length)[self._label_mask(idx)]
        # Computing distances over all states takes too long, so sample a subset.
        valid_indices_choice = np.random.choice(valid_indices, size=1000)
        valid_states = self.states[valid_indices_choice][:, 0, ...]
        observation = self.states[idx]
        hidden_states = Viterbi(valid_states)(observation, self.blanket_size)
        example = self.base_dataset[idx]
        example["neighbours"] = self.base_dataset[hidden_states[:, 0].astype(int)]
        return example
Example #25
def main(logger):

    out_ptr             = open(OUTPUT, "w")
    print "Initializing Data Parser..."
    data_parser         = OrwellDataParser(logger)

    print "Initializing Linear Sequence Model..."
    ls_obj              = LinearSequence(logger, data_parser)

    print "Initializing Viterbi..."
    viterbi_obj         = Viterbi(logger, ls_obj)

    print "Initializing Accuracy Estimator..."
    accuracy_estimator  = AccuracyEstimator(logger, data_parser)

    for language, language_file in TRAINING_FILES:
        language_file   = "Data/%s" % language_file
        print "******************************************************************************************"
        print language
        print "******************************************************************************************"

        print "Training Linear Sequence Linear Sequence Model with %s data..." % language
        viterbi_obj.train(language_file, START_LINE - 1)

        #import pdb;pdb.set_trace()
        #print viterbi_obj.predict_sequence(["his", "breast", "rose", "and", "fell", "a", "little", "faster", "."])

        print "Estimating accuracy of the model..."
        total_accuracy, unseen_accuracy = accuracy_estimator.compute_parameters(viterbi_obj, language_file, "inaccurate_words_%s.txt"%language, language, START_LINE)

        print "TOTAL ACCURACY           : %.10f" % total_accuracy
        print "UNSEEN_ACCURACY          : %.10f" % unseen_accuracy

        o_line = "Language : %s\nTotal Accuracy : %.10f\nUnseen Accuracy : %.10f\n\n\n" % (language, total_accuracy, unseen_accuracy)
        out_ptr.write(o_line)
        print "Resetting model and estimator parameters..."

        viterbi_obj.reset()
        accuracy_estimator.reset()

    out_ptr.close()
Example #26
def test_viterbi_decode():
    '''
    Test case based on HW3 question.
    '''
    log = logging.getLogger('test viterbi')

    ZERO = 0.000000000000000001
    obs_space = ["moo", "hello", "quack", START_END_OBS]
    states = ["Cow", "Duck", START_END_TAG]
    trans_prob = [[0.5, 0.3, 0.2], [0.3, 0.5, 0.2], [1.0, ZERO, ZERO]]
    emit_prob = [[0.9, 0.1, ZERO, ZERO], [ZERO, 0.4, 0.6, ZERO],
                 [ZERO, ZERO, ZERO, 1.0]]

    decoder = Viterbi(obs_space, states, trans_prob, emit_prob)

    obs = ["moo", "hello", "quack", START_END_OBS]
    seq, prob = decoder.decode(obs)

    log.debug("seq: " + str(seq))
    log.debug("log_prob: " + str(prob))
    assert prob - (-5.03903) < ZERO and \
           seq == ["Cow", "Duck", "Duck", START_END_TAG]
Example #27
def main():
    p = optparse.OptionParser()
    p.add_option('-r', action = 'store_true', dest = "redo", default = False)
    opts, args = p.parse_args()
    
    output_file = ''
    if len(args) == 1:
        fileName = args[0]
    elif len(args) == 2:
        fileName = args[0]
        output_file = args[1]
    elif not args:
        sys.stderr.write("Error: please specify a file name\n")
        raise SystemExit(1)
    elif len(args) > 2:
        sys.stderr.write("Error: too much argument\n")
        raise SystemExit(1)
    
    # split the sentences
    processor = Preprocessor(fileName)
    sentences = processor.getSentences()
    
    # create the likelihood table, prior probability table and so on
    if opts.redo or not (os.path.isfile("likelihood.pkl")
        and os.path.isfile("prior_prob.pkl")
        and os.path.isfile("tags.pkl")
        and os.path.isfile("vocabulary.pkl")):
        viterbi_util.compute_table("training.pos")
        
    # run viterbi algorithm
    viterbi = Viterbi()
    output = []
    
    for sentence in sentences:
        tag_seq = viterbi.go(sentence)
        output.append((sentence, tag_seq))
    
    # write the result into a file
    viterbi_util.write_out(output, output_file)
Example #28
    def label(self):
        vit_obs = []
        hidden_states = []
        vit = Viterbi()
        with open(self.testing_file, 'r') as in_file:
            text = in_file.read()
            for word_pos in text.split():
                word_pos_split = (word_pos.split('_'))
                word = word_pos_split[0]
                pos = word_pos_split[1]
                self.real.append(pos)  # record the true value
                self.words.append(word)
                if pos not in hidden_states:
                    hidden_states.append(pos)
                vit_obs.append(word)

        # print "observation: ", vit_obs
        # print "hidden states: ", hidden_states
        # print "transition: ", self.transition
        # print "emission: ", self.emission
        # print "start: ", self.start
        print "Beginning viterbi algorithm"
        probability, self.predicted = vit.viterbi(vit_obs, hidden_states, self.start, self.transition, self.emission)
Example #29
    def run_viterbi(self):
        contents = self.contents[self.index + 1:]

        observations = ()
        correct_path = []

        for line in contents:
            read_line = line.rstrip()
            if read_line == ".":
                viterbi = Viterbi(observations, self.states, \
                                  self.start_probability, \
                                  self.transition_probability, \
                                  self.emission_probability)
                deduced_path = viterbi.run_viterbi()
                junk, guessed_path = deduced_path
                self.iteration += 1
                self.check_correctness(guessed_path, correct_path)
                observations = ()
                correct_path = []
                continue

            coordinate, color = read_line.split(" ")
            observations = observations + (color,)
            correct_path.append(coordinate)
Example #31
    def __init__(self, logger, data_parser, use_avg=False, use_suffix=False, training_level=5, start_tag="START", stop_tag="STOP"):
        self.logger         = logger
        self.data_parser    = data_parser
        self.start_tag      = start_tag
        self.stop_tag       = stop_tag
        self.training_level = training_level
        self.tag_features   = set()
        self.word_features  = set()
        self.weights        = {}
        self.seen_words     = set()
        self.viterbi_obj    = Viterbi(logger, self)
        self.KEY_TAG        = "TAG_FEATURE"
        self.KEY_WORD       = "WORD_FEATURE"
        self.KEY_SUFFIX     = "SUFFIX_FEATURE"
        self.special_tags   = [start_tag, stop_tag]
        self.hidden_states  = []
        self.avg_weights    = {}
        self.use_avg        = use_avg
        self.use_suffix     = use_suffix
        self.trained        = False
        self.suffix_features = set()
Example #32
    def predict(self):
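        # Perceptron-style training: decode with Viterbi, then raise weights
        # of gold features and lower weights of predicted features.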

        nvi = 12

        for i in range(nvi):

            predicted_labels_training_set = []

            print("starting viterbi run {}...".format(i))

            for j, sent in enumerate(self.training_set):

                predicted_labels_training_set.append(
                    Viterbi(sent, self.event_names, self.fweights).run())

                tmp_sent = copy.deepcopy(sent)
                tmp_sent["events"] = predicted_labels_training_set[j]

                for idx, w in enumerate(sent["words"]):
                    # extract features from each word from the correctly labelled sentence..
                    ff = self.create_features(sent, idx, "train")
                    # and the labelling by Viterbi
                    ff_pr = self.create_features(tmp_sent, idx, "train")

                    if sent["events"][idx] != tmp_sent["events"][idx]:

                        for k in ff_pr:
                            if k in self.fweights:
                                self.fweights[k] -= 1
                        for g in ff:
                            if g in self.fweights:
                                self.fweights[g] += 1

            # now get scores for this Viterbi iteration
            training_labels = [st["events"] for st in self.training_set]
            # print("have {} training sentences and {} predicted ones".format(len(training_labels), len(predicted_labels_training_set)))
            Scores(training_labels, predicted_labels_training_set).show()

            with open(self.feature_trained_file_path, "w+") as f:
                json.dump(self.fweights, f)
Example #33
import argparse

import train
from opt_results1 import simple_vec
from viterbi import Viterbi

if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--lamb', type=float, default=0)
    parser.add_argument('-f',
                        '--families',
                        nargs='+',
                        type=int,
                        default=[0, 1, 2, 3, 4, 5, 7, 8, 15, 16, 17, 19, 20])
    args = parser.parse_args()

    print(args.lamb, " ", args.families)

    vec = train.calc_weight_vector("train.wtag",
                                   families=args.families,
                                   lamb=args.lamb)
    path = train.create_and_get_path(args.families, args.lamb)
    file = open(path, "w")
    file.write("simple_vec = %s\n" % vec.x.tolist())
    file.close()

    #vec = simple_vec

    vit = Viterbi(vec.x.tolist(), args.families)
    vit.evaluate("test.wtag", 3, 0, args.lamb)

    print(args.lamb)
Example #34
File: toy.py Project: w-dq/PY2HZ
import utils
from viterbi import Viterbi

from utils import HmmParam

hmm = HmmParam()
# print(hmm.emission_table['data']['北京'])
# print(hmm.transition_table['data']['爱']['北京'])
# print(hmm.transition_table['data']['爱'])
# print(hmm.py2hz_dict['beijing'])

print(Viterbi(hmm, ['baobao', 'ye', 'tai', 'bang', 'le', 'ba'], 5))
Example #35
class GPSMatcher:
    def __init__(self, hmm, emission_probability, constraint_length=10, MAX_DIST=500, priors=None, smallV=0.00000000001):                
        # initialize spatial index
        self.previous_obs = None

        if priors is None:
            priors = dict([(state, 1.0 / len(hmm)) for state in hmm])

        state_spatial_index = Rtree()
        unlocated_states = []
        id_to_state = {}
        id = 0
        for state in hmm: 
            geom=self.geometry_of_state(state)            
            if not geom:
                unlocated_states.append(state)
            else:
                ((lat1,lon1),(lat2,lon2))=geom
                state_spatial_index.insert(id,
                                           (min(lon1, lon2), min(lat1, lat2), 
                                            max(lon1, lon2), max(lat1, lat2)))
                id_to_state[id]=state
                id=id+1
            
        def candidate_states(obs): #was (lat,lon) in place of obs 
            geom = self.geometry_of_observation(obs)
            if geom is None:
                return hmm.keys()
            else:
                (lat,lon)=geom
                nearby_states = state_spatial_index.intersection((lon-MAX_DIST/METERS_PER_DEGREE_LONGITUDE,
                                                                  lat-MAX_DIST/METERS_PER_DEGREE_LATITUDE,
                                                                  lon+MAX_DIST/METERS_PER_DEGREE_LONGITUDE,
                                                                  lat+MAX_DIST/METERS_PER_DEGREE_LATITUDE))

                candidates = [id_to_state[id] for id in nearby_states]+unlocated_states
                return candidates

        self.viterbi = Viterbi(hmm,emission_probability,
                               constraint_length=constraint_length,
                               priors=priors,
                               candidate_states=candidate_states,
                               smallV=smallV)

    def step(self, obs, V, p):
        if self.previous_obs is not None:
            for int_obs in self.interpolated_obs(self.previous_obs, obs):
                V, p = self.viterbi.step(int_obs, V, p)
        V, p = self.viterbi.step(obs, V, p)
        self.previous_obs = obs
        return V, p

    def interpolated_obs(self,prev,obs):
        return []

    def geometry_of_observation(self, obs):
        return obs

    def geometry_of_state(self, state):
        """ Subclasses should override this method to return the geometry of a given state, typically an edge."""
        if state == 'unknown': return None
        else:
            return state
Example #36
    def run(self, verbose):
        data = self.data
        hmm = self.hmm
        viterbi = Viterbi(hmm)

        # Start probabilitites
        print("Start probabilities:")
        for state in data.statekeys:
            print(state, ':\t', "{0:.3f}".format(hmm.start_prob(state)))

        # Transition probabilities
        print("\nTransition probabilities:")

        for state in data.states:
            sys.stdout.write('\t' + state)
        sys.stdout.write('\n')
        sys.stdout.flush()
        for from_state in data.states:
            sys.stdout.write(from_state + ' :')
            for to_state in data.states:
                trans_prob = hmm.trans_prob(from_state, to_state)
                sys.stdout.write('\t' + "{0:.3f}".format(trans_prob))
            sys.stdout.write('\n')
            sys.stdout.flush()

        # Output probabilities
        print("\nOutput probabilities:")

        print_outputs = (len(data.outputs) < 30) or verbose

        if not print_outputs:
            print("*" * 32)
            print("Too many outputs to display... calculating the outputs...")
            print("Run with '-v' to see all outputs")
            print("*" * 32)

        if print_outputs:
            for output in sorted(data.outputs):
                sys.stdout.write('\t' + output)
        if print_outputs:
            sys.stdout.write('\n')
            sys.stdout.flush()

        for state in data.states:
            sys.stdout.write(state + ' :')
            for output in sorted(data.outputs):
                out_prob = hmm.output_prob(state, output)
                if print_outputs:
                    sys.stdout.write('\t' + "{0:.3f}".format(float(out_prob)))
            if print_outputs:
                sys.stdout.write('\n')
                sys.stdout.flush()

        # Most likely sequence
        overall_error = 0
        for i, sequence in enumerate(data.testing.sequences):
            print_mls = (i < 4) or verbose
            if (i == 4) and not verbose:
                print("")
                print("*" * 32)
                print("There are too many sequences to display... Calculating")
                print("Run with '-v' to see all outputs")
                print("*" * 32)

            outputs = sequence.outputs()
            inputs = sequence.inputs()
            _, mls = viterbi.most_likely_sequence(outputs)

            if print_mls:
                print("\nMost likely sequence #"+str(i)+":")
                print('input\tcalc\toutput')

            errors = 0
            inputs_len = len(inputs)
            for j in range(inputs_len):
                if print_mls:
                    print(inputs[j], '\t', mls[j], '\t', outputs[j])
                if inputs[j] != mls[j]:
                    errors += 1

            err_percentage = errors / float(inputs_len)

            if print_mls:
                print('Errors:', errors, '/', len(inputs), '=', err_percentage)

            seq_len = float(len(data.testing.sequences))
            overall_error += err_percentage / seq_len

        correct_percent = (1 - overall_error) * 100

        print("\nThe overall percent correct is " +
              "{0:.3f}".format(correct_percent) + "%")
Example #37
class LinearSequence:
    def __init__(self, logger, data_parser, use_avg=False, use_suffix=False, training_level=5, start_tag="START", stop_tag="STOP"):
        self.logger         = logger
        self.data_parser    = data_parser
        self.start_tag      = start_tag
        self.stop_tag       = stop_tag
        self.training_level = training_level
        self.tag_features   = set()
        self.word_features  = set()
        self.weights        = {}
        self.seen_words     = set()
        self.viterbi_obj    = Viterbi(logger, self)
        self.KEY_TAG        = "TAG_FEATURE"
        self.KEY_WORD       = "WORD_FEATURE"
        self.KEY_SUFFIX     = "SUFFIX_FEATURE"
        self.special_tags   = [start_tag, stop_tag]
        self.hidden_states  = []
        self.avg_weights    = {}
        self.use_avg        = use_avg
        self.use_suffix     = use_suffix
        self.trained        = False
        self.suffix_features= set()

    def reset(self):
        self.tag_features   = set()
        self.word_features  = set()
        self.weights        = {}
        self.seen_words     = set()
        self.trained        = False
        self.avg_weights    = {}
        self.hidden_states  = []
        self.suffix_features=set()

    def is_unseen(self, word):
        if word in self.seen_words:
            return False
        return True

    def train(self, training_file, end_line=5500):
        self.logger.info("Started training data from %s upto line %d" %(training_file, end_line))
        tags_info = {}
        for line_no, word_list in self.data_parser.next(training_file):

            if line_no > end_line:
                break

            prev_tag = None
            for index, (word, tag) in enumerate(word_list):
                #create feature space
                if prev_tag is not None:
                    self.tag_features.add((prev_tag, tag))
                self.word_features.add((tag, word))

                #suffix features
                if len(word) > 1:
                    self.suffix_features.add((word[-1:], tag))
                    if len(word) > 2:
                        self.suffix_features.add((word[-2:], tag))
                        if len(word) > 3:
                            self.suffix_features.add((word[-3:], tag))

                prev_tag = tag

                #if tag not in self.hidden_states:
                #    self.hidden_states.append(tag)
                tags_info[tag] = tags_info.setdefault(tag, 0) + 1

                self.seen_words.add(word)
        #self.viterbi_obj.tag_list = [tag for tag, count in sorted(tags_info.iteritems(), key=lambda x:x[1])]
        #self.hidden_states = [tag for tag, count in sorted(tags_info.iteritems(), key=lambda x:x[1])]
        self.viterbi_obj.tag_list = tags_info.keys()
        self.hidden_states        = tags_info.keys()
        print self.hidden_states
        self.logger.info("Completed parsing the training data to form feature space")
        self.logger.info("Tag features : %d" % len(self.tag_features))
        self.logger.info("Word features : %d" % len(self.word_features))
        self.logger.info("Suffix features : %d" % len(self.suffix_features))
        self.logger.info("Hidden States  : %d" % len(self.hidden_states))
        self.estimate_weights(training_file, end_line)
        self.trained = True

    def get_suffix_feature(self, tag, word):
        if self.trained and self.use_avg:
            weights = self.avg_weights
        else:
            weights = self.weights
        wt = 0
        if len(word) > 1:
            wt += weights.get(self.KEY_SUFFIX, {}).get((word[-1:], tag), 0)
            if len(word) > 2:
                wt += weights.get(self.KEY_SUFFIX, {}).get((word[-2:], tag), 0)
                if len(word) > 3:
                    wt += weights.get(self.KEY_SUFFIX, {}).get((word[-3:], tag), 0)
        return wt

    def get_cand_suffix_feature(self, tag, word):
        if self.trained and self.use_avg:
            weights = self.avg_weights
        else:
            weights = self.weights
        wt = 0
        if len(word) > 1:
            wt += weights.get("CAND_SUF", {}).get((word[-1:], tag), 0)
            if len(word) > 2:
                wt += weights.get("CAND_SUF", {}).get((word[-2:], tag), 0)
                if len(word) > 3:
                    wt += weights.get("CAND_SUF", {}).get((word[-3:], tag), 0)
        return wt

    def get_transition_feature(self, prev_tag, next_tag=None):
        if self.trained and self.use_avg:
            weights = self.avg_weights
        else:
            weights = self.weights
        if next_tag is None:
            next_tag = self.stop_tag

        return weights.get(self.KEY_TAG, {}).get((prev_tag, next_tag), 0)



    def get_emission_feature(self, tag, word):
        if self.trained and self.use_avg:
            weights = self.avg_weights
        else:
            weights = self.weights

        return weights.get(self.KEY_WORD, {}).get((tag, word), 0)

    def get_cand_emission_feature(self, tag, word):
        if self.trained and self.use_avg:
            weights = self.avg_weights
        else:
            weights = self.weights

        return weights.get("CAND_EMI", {}).get((tag, word), 0)



    def main_compute_prev(self, result_list, t, i, j, word_list, tag_list):
        addn_wt = 0
        if self.use_suffix:
            addn_wt = self.get_suffix_feature(tag_list[j], word_list[t + 1])

        if t < 0:
            if tag_list[i] == self.start_tag:
                return self.get_transition_feature(tag_list[i], tag_list[j]) + self.get_emission_feature(tag_list[j], word_list[t + 1]) + addn_wt
            else:
                return -10000000 + self.get_transition_feature(tag_list[i], tag_list[j]) + self.get_emission_feature(tag_list[j], word_list[t + 1]) + addn_wt

        return result_list[t][i][0] + self.get_transition_feature(tag_list[i], tag_list[j]) + self.get_emission_feature(tag_list[j], word_list[t + 1]) + addn_wt

    def main_compute_final(self, result_list, i, num_words, tag_list):
        return result_list[num_words - 1][i][0] + self.get_transition_feature(tag_list[i])

    def compute_prev(self, result_list, t, j, word_list, tag_list):
        addn_wt = self.get_cand_suffix_feature(tag_list[j], word_list[t + 1])

        return self.get_cand_emission_feature(tag_list[j], word_list[t + 1]) + addn_wt

    def estimate_weights(self, training_file, end_line=5500):
        self.logger.info("Started estimating weights...")
        parse_level = 0
        multiplier = self.training_level * end_line
        for r_level in xrange(self.training_level):
            parse_level += 1
            for line_no, word_list in self.data_parser.next(training_file):

                if line_no > end_line:
                    break

                if line_no % 500 == 0:
                    print "LEVEL: %d : Processed %d lines..." % (r_level, line_no)

                new_word_list = [(word, tag) for word, tag in word_list if tag not in self.special_tags]
                predicted_tags  = self.viterbi_obj.predict_sequence([word for word, tag in new_word_list])
                self.reestimate_weights(new_word_list, predicted_tags, multiplier)

                multiplier -= 1

            self.logger.info("Completed parsing %d time(s) for estimating weights" % parse_level)
        self.logger.info("Completed estimating weights")
        self.logger.info("TAG WEIGHTS  : %d" % len(self.weights.get(self.KEY_TAG, {})))
        self.logger.info("WORD WEIGHTS  : %d" % len(self.weights.get(self.KEY_WORD, {})))


    def reestimate_weights(self, word_list, predicted_tags, multiplier):
        prev_tag        = self.start_tag
        pred_prev_tag   = self.start_tag
        prev_main_tag        = self.start_tag
        pred_prev_main_tag   = self.start_tag
        local_diff      = {}
        for index, (word, tag) in enumerate(word_list):
            if tag in self.special_tags:
                pred_tag = tag
            else:
                if predicted_tags:
                    pred_tag = predicted_tags[index]
                else:
                    pred_tag = None

            if tag in ["PUN", "START", "STOP"]:
                main_tag = tag
            else:
                main_tag = tag[0]

            if pred_tag in ["PUN", "START", "STOP"]:
                pred_main_tag = pred_tag
            elif pred_tag is None:
                pred_main_tag = None
            else:
                pred_main_tag = pred_tag[0]


            #weights of tags
            if prev_main_tag is not None:
                #count = self.weights.setdefault(self.KEY_TAG, {}).setdefault((prev_tag, tag), 0)
                #self.weights[self.KEY_TAG][(prev_tag, tag)] = count + 1
                count = local_diff.setdefault(self.KEY_TAG, {}).setdefault((prev_main_tag, main_tag), 0)
                local_diff[self.KEY_TAG][(prev_main_tag, main_tag)] = count + 1



            if predicted_tags and pred_prev_main_tag is not None:
                #count = self.weights.setdefault(self.KEY_TAG, {}).setdefault((pred_prev_tag, pred_tag), 0)
                #self.weights[self.KEY_TAG][(pred_prev_tag, pred_tag)] = count - 1
                count = local_diff.setdefault(self.KEY_TAG, {}).setdefault((pred_prev_main_tag, pred_main_tag), 0)
                local_diff[self.KEY_TAG][(pred_prev_main_tag, pred_main_tag)] = count - 1


            #weights of words
            if tag not in self.special_tags:
                #count = self.weights.setdefault(self.KEY_WORD, {}).setdefault((tag, word), 0)
                #self.weights[self.KEY_WORD][(tag, word)] = count + 1
                count = local_diff.setdefault(self.KEY_WORD, {}).setdefault((main_tag, word), 0)
                local_diff[self.KEY_WORD][(main_tag, word)] = count + 1

                count = local_diff.setdefault("CAND_EMI", {}).setdefault((tag, word), 0)
                local_diff["CAND_EMI"][(tag, word)] = count + 1


                if predicted_tags:
                    #count = self.weights.setdefault(self.KEY_WORD, {}).setdefault((pred_tag, word), 0)
                    #self.weights[self.KEY_WORD][(pred_tag, word)] = count - 1
                    count = local_diff.setdefault(self.KEY_WORD, {}).setdefault((pred_main_tag, word), 0)
                    local_diff[self.KEY_WORD][(pred_main_tag, word)] = count - 1

                    count = local_diff.setdefault("CAND_EMI", {}).setdefault((pred_tag, word), 0)
                    local_diff["CAND_EMI"][(pred_tag, word)] = count - 1

            #weight of suffix
            if len(word) > 1:
                count = local_diff.setdefault(self.KEY_SUFFIX, {}).setdefault((word[-1:], main_tag), 0)
                local_diff[self.KEY_SUFFIX][(word[-1:], main_tag)] = count + 1
                count = local_diff.setdefault(self.KEY_SUFFIX, {}).setdefault((word[-1:], pred_main_tag), 0)
                local_diff[self.KEY_SUFFIX][(word[-1:], pred_main_tag)] = count - 1
                count = local_diff.setdefault("CAND_SUF", {}).setdefault((word[-1:], tag), 0)
                local_diff["CAND_SUF"][(word[-1:], tag)] = count + 1
                count = local_diff.setdefault("CAND_SUF", {}).setdefault((word[-1:], pred_tag), 0)
                local_diff["CAND_SUF"][(word[-1:], pred_tag)] = count - 1
                if len(word) > 2:
                    count = local_diff.setdefault(self.KEY_SUFFIX, {}).setdefault((word[-2:], main_tag), 0)
                    local_diff[self.KEY_SUFFIX][(word[-2:], main_tag)] = count + 1
                    count = local_diff.setdefault(self.KEY_SUFFIX, {}).setdefault((word[-2:], pred_main_tag), 0)
                    local_diff[self.KEY_SUFFIX][(word[-2:], pred_main_tag)] = count - 1
                    count = local_diff.setdefault("CAND_SUF", {}).setdefault((word[-2:], tag), 0)
                    local_diff["CAND_SUF"][(word[-2:], tag)] = count + 1
                    count = local_diff.setdefault("CAND_SUF", {}).setdefault((word[-2:], pred_tag), 0)
                    local_diff["CAND_SUF"][(word[-2:], pred_tag)] = count - 1
                    if len(word) > 3:
                        count = local_diff.setdefault(self.KEY_SUFFIX, {}).setdefault((word[-3:], main_tag), 0)
                        local_diff[self.KEY_SUFFIX][(word[-3:], main_tag)] = count + 1
                        count = local_diff.setdefault(self.KEY_SUFFIX, {}).setdefault((word[-3:], pred_main_tag), 0)
                        local_diff[self.KEY_SUFFIX][(word[-3:], pred_main_tag)] = count - 1
                        count = local_diff.setdefault("CAND_SUF", {}).setdefault((word[-3:], tag), 0)
                        local_diff["CAND_SUF"][(word[-3:], tag)] = count + 1
                        count = local_diff.setdefault("CAND_SUF", {}).setdefault((word[-3:], pred_tag), 0)
                        local_diff["CAND_SUF"][(word[-3:], pred_tag)] = count - 1


            prev_tag        = tag
            prev_main_tag   = main_tag
            pred_prev_tag   = pred_tag
            pred_prev_main_tag = pred_main_tag

        count = local_diff.setdefault(self.KEY_TAG, {}).setdefault((main_tag, self.stop_tag), 0)
        local_diff[self.KEY_TAG][(main_tag, self.stop_tag)] = count + 1

        count = local_diff.setdefault(self.KEY_TAG, {}).setdefault((pred_main_tag, self.stop_tag), 0)
        local_diff[self.KEY_TAG][(pred_main_tag, self.stop_tag)] = count - 1


        for tag_type, info_hash in local_diff.iteritems():
            for key, value in info_hash.iteritems():
                count = self.weights.setdefault(tag_type, {}).setdefault(key, 0)
                self.weights[tag_type][key] = count + value
                wt = self.avg_weights.setdefault(tag_type, {}).setdefault(key, 0)
                self.avg_weights[tag_type][key] = wt + multiplier * value
Example #38
    except FileNotFoundError:
        print("Weights were not found")
        exit(0)

    predictions = list()
    incorrect_count = 0
    correct_count = 0
    start_time = time()
    incorrect_tags = dict()
    confusion_matrix = pd.DataFrame(0, index=tags, columns=tags)  # rows: true tag, columns: predicted tag

    for i in range(test_data_size):
        # print(i+1,'/',test_data_size)
        sentence = [x[0] for x in test_data.data[i]]
        test_tags = [x[1] for x in test_data.data[i]]
        viterbi = Viterbi(tags, gen.transform, sentence, w_0, 5)
        predicted_tags = viterbi.run()
        predictions.append((sentence, predicted_tags))

        for t, p in zip(test_tags, predicted_tags):
            if t == p:
                correct_count += 1
            else:
                incorrect_count += 1
                incorrect_tags[t] = incorrect_tags.get(t, 0) + 1
            confusion_matrix.loc[t, p] += 1

    end_time = time() - start_time
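The counts and the pandas confusion matrix built above can then be reduced to summary numbers; a short continuation sketch under the same variable names, assuming (as the loop above does) that rows hold the true tag and columns the predicted tag:

# Continuation sketch of the evaluation above (same variable names assumed).
accuracy = correct_count / (correct_count + incorrect_count)
print("accuracy: %.4f (%.1fs)" % (accuracy, end_time))
row_totals = confusion_matrix.sum(axis=1).replace(0, 1)  # avoid division by zero
per_tag_recall = pd.Series(confusion_matrix.values.diagonal(), index=tags) / row_totals
print(per_tag_recall.sort_values())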
Beispiel #39
0
    # Parse the observation sequence from the command line. A required
    # positional argument is enough: argparse itself prints usage and exits
    # when it is missing, and the original -s/--sequence store_const option
    # was broken (its const referenced the not-yet-defined name `sequence`).
    parser = argparse.ArgumentParser(description='Output Viterbi Sequence.')
    parser.add_argument('sequence',
                        metavar='N',
                        type=str,
                        nargs='+',
                        help='the observation sequence to decode')
    args = parser.parse_args()

    sequence = args.sequence[0]

    print('Observation (input sequence):', sequence)

    element = Viterbi(sequence)
    element.viterbi()
Beispiel #40
0
import numpy as np
from viterbi import Viterbi, add_one_smoothing

TAGS = ['N', 'C', 'V', 'J']                           # tag set
LEXICON = ['that', 'is', 'not', 'it', 'good', 'bad']  # vocabulary

Pi = [1 / 8, 3 / 8, 3 / 8, 1 / 8]                     # initial tag distribution

# tag-to-tag transition counts (4 tags x 4 tags)
count_A = np.array([[2., 0., 3., 1.], [2., 0., 0., 0.], [4., 0., 1., 0.],
                    [0., 0., 0., 0.]])

# tag-to-word emission counts (one row per tag, one column per lexicon word)
count_B = np.array([[4., 0., 2., 2., 0., 0.], [2., 0., 0., 0., 0., 0.],
                    [0., 6., 0., 0., 0., 0.], [0., 0., 0., 0., 1., 0.]])

if __name__ == '__main__':
    A = add_one_smoothing(count_A)
    B = add_one_smoothing(count_B)

    viterbi = Viterbi(Pi, A, B, TAGS, LEXICON)

    sentence1 = 'bad is not good'
    sentence2 = 'is it bad'
    pred1 = viterbi.predict_tags(sentence1)
    pred2 = viterbi.predict_tags(sentence2)
    print(pred1)
    print(pred2)
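`add_one_smoothing` is imported above but not shown; a plausible implementation, assuming it applies Laplace (add-one) smoothing and row-normalizes the count matrix into probabilities:

# Plausible sketch of the imported add_one_smoothing (the real implementation
# is not shown): Laplace smoothing followed by row normalization.
import numpy as np

def add_one_smoothing(counts):
    smoothed = counts + 1.0                                 # add one to every count
    return smoothed / smoothed.sum(axis=1, keepdims=True)   # each row sums to 1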
Beispiel #41
0
	def callViterbi(self, observations):
		# run Viterbi decoding; returns (probabilities, best state sequence)
		vit = Viterbi()
		probs, state_res = vit.execute(observations, self.state, self.startProb, self.transSt, self.emission)
		return probs, state_res
Beispiel #42
0
import json  # needed for json.load below
import utils
from ChineseTone import PinyinHelper
from tqdm import tqdm
# HmmParam and Viterbi are assumed to come from local modules whose imports
# are not shown in this excerpt.

with open('./test/test_set.json') as f:
	test_set = json.load(f)

hmm = HmmParam()
# hmm.py2hz_dict['不']
# hmm.emmission['不']
count_single = 0
correct_single = 0
count_sentence = 0
correct_sentence = 0
for i in tqdm(range(len(test_set))):
	count_sentence += 1
	test = test_set[i]
	flag = True
	# decode the pinyin sequence; keep the best of the returned candidates
	answer = Viterbi(hmm, test['py'], 5)[0].path
	# print(answer)
	for idx,an in enumerate(answer):
		count_single += 1
		if an == test['hz'][idx]:
			correct_single += 1
		else:
			flag = False
	if flag:
		correct_sentence += 1

print('single:',correct_single/count_single)
print('sentence:',correct_sentence/count_sentence)
Beispiel #43
0
class GPSMatcher:
    def __init__(self,
                 hmm,
                 emission_probability,
                 constraint_length=10,
                 MAX_DIST=500,
                 priors=None,
                 smallV=1e-11):
        self.previous_obs = None

        if priors is None:
            priors = {state: 1.0 / len(hmm) for state in hmm}

        # initialize the spatial index over located states
        state_spatial_index = Rtree()
        unlocated_states = []
        id_to_state = {}
        state_id = 0  # avoids shadowing the id() builtin
        for state in hmm:
            geom = self.geometry_of_state(state)
            if not geom:
                unlocated_states.append(state)
            else:
                ((lat1, lon1), (lat2, lon2)) = geom
                state_spatial_index.insert(state_id, (min(lon1, lon2), min(
                    lat1, lat2), max(lon1, lon2), max(lat1, lat2)))
                id_to_state[state_id] = state
                state_id += 1

        def candidate_states(obs):
            # restrict candidates to states whose bounding box lies within
            # MAX_DIST of the observation; fall back to all states when the
            # observation has no geometry
            geom = self.geometry_of_observation(obs)
            if geom is None:
                return hmm.keys()
            else:
                (lat, lon) = geom
                nearby_states = state_spatial_index.intersection(
                    (lon - MAX_DIST / METERS_PER_DEGREE_LONGITUDE,
                     lat - MAX_DIST / METERS_PER_DEGREE_LATITUDE,
                     lon + MAX_DIST / METERS_PER_DEGREE_LONGITUDE,
                     lat + MAX_DIST / METERS_PER_DEGREE_LATITUDE))

                candidates = [id_to_state[i]
                              for i in nearby_states] + unlocated_states
                return candidates

        self.viterbi = Viterbi(hmm,
                               emission_probability,
                               constraint_length=constraint_length,
                               priors=priors,
                               candidate_states=candidate_states,
                               smallV=smallV)

    def step(self, obs, V, p):
        if self.previous_obs is not None:
            for int_obs in self.interpolated_obs(self.previous_obs, obs):
                V, p = self.viterbi.step(int_obs, V, p)
        V, p = self.viterbi.step(obs, V, p)
        self.previous_obs = obs
        return V, p

    def interpolated_obs(self, prev, obs):
        """Subclasses may override this to densify sparse traces; the default adds no points."""
        return []

    def geometry_of_observation(self, obs):
        return obs

    def geometry_of_state(self, state):
        """ Subclasses should override this method to return the geometry of a given state, typically an edge."""
        if state == 'unknown':
            return None
        return state
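The emission_probability passed into GPSMatcher is supplied by the caller and is not shown here; a hypothetical example, assuming the underlying Viterbi calls it as emission_probability(state, obs) and that a distance helper is available:

# Hypothetical emission model (the call signature and the distance helper are
# assumptions, not taken from the original): probability falls off as a
# Gaussian of the distance between the GPS fix and the candidate state.
import math

def make_gaussian_emission(distance_meters, sigma=30.0):
    """distance_meters(state, obs) -> distance in meters (assumed helper)."""
    def emission_probability(state, obs):
        if state == 'unknown':
            return 1e-6  # small constant mass for the off-road state
        d = distance_meters(state, obs)
        return math.exp(-0.5 * (d / sigma) ** 2)
    return emission_probability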
Beispiel #44
0
    # Set up the Viterbi decoder over Up/Down price movements
    states = ["Buy", "Sell"]
    obs = []
    obs_prices = []
    obs_delta = []
    prev_price = hist_prices["Adj Close"][0]
    for price in hist_prices["Adj Close"]:
        if price >= prev_price:
            obs.append("Up")
        else:
            obs.append("Down")
        obs_prices.append(price)
        obs_delta.append(price - prev_price)
        prev_price = price
    possible_obs = ["Up", "Down"]
    v = Viterbi(initial, states, obs, possible_obs, trans, emiss)
    v.run()
    #v.print_table()
    #v.print_backtrack_table()
    #v.print_backtrack()

    #make a graph
    backtrack = v.get_backtrack()
    backtrack.pop(0)
    to_print = pd.DataFrame(hist_prices['Adj Close'])
    to_print["Delta"] = obs_delta
    to_print["Output"] = backtrack
    print(to_print)
    fig = hist_prices['Adj Close'].plot(grid="True")
    i = start
    tmp_backtrack = backtrack
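initial, trans, and emiss are defined outside this excerpt; plausible shapes, assuming dict-based distributions over the two states and the two observation symbols:

# Plausible shapes for the parameters used above (assumed, not shown in the
# original excerpt): dict-based initial/transition/emission distributions.
initial = {"Buy": 0.5, "Sell": 0.5}
trans = {
    "Buy":  {"Buy": 0.7, "Sell": 0.3},
    "Sell": {"Buy": 0.3, "Sell": 0.7},
}
emiss = {
    "Buy":  {"Up": 0.6, "Down": 0.4},
    "Sell": {"Up": 0.4, "Down": 0.6},
}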
Beispiel #45
0
def cross_validation(sequences, training_method, decoder):
    """
    Performs cross-validation with one fold per sequence (10-fold when
    ten sequences are given).
    Requires an array of dict sequences and a training function.
    Note: the decoder argument is currently unused; Viterbi and Posterior
    decoders are constructed internally and both are evaluated.
    """
    # per-fold total accuracy for each decoder
    vit_total_ac = np.zeros(len(sequences))
    post_total_ac = np.zeros(len(sequences))
    vit = Viterbi()
    post = Posterior()

    for i in range(len(sequences)):
        vit_total_scores = np.zeros([4])
        post_total_scores = np.zeros([4])
        # arrays with the sequences for training and for validation
        training_data_array = sequences[:]
        validation_data_array = [ training_data_array.pop(i) ]

        # merging the arrays into dictionaries
        training_data = merge(training_data_array)
        validation_data = merge(validation_data_array)
        # the training function returns a model
        model = training_method(training_data)

        # decode fold i with both Viterbi and posterior decoding
        for key, sequence in validation_data.items():
            # the sequence from the file
            true_seq = sequence['Z']
            # the sequences decoded with Viterbi and posterior decoding under the trained model
            vit_pred_seq = vit.decode(model, sequence['X'])
            post_pred_seq = post.decode(model, sequence['X'])
            """
            print key
            print "PREDICTED"
            print pred_seq
            print "TRUE"
            print true_seq
            """
            tp, fp, tn, fn = compare_tm_pred.count(true_seq, vit_pred_seq)

            vit_total_scores += np.array([tp, fp, tn, fn])
            
            tp, fp, tn, fn = compare_tm_pred.count(true_seq, post_pred_seq)

            post_total_scores += np.array([tp, fp, tn, fn])
            if VERBOSE:
                print(">" + key)
                compare_tm_pred.print_stats(tp, fp, tn, fn)
                print()

        vit_total_ac[i] = compare_tm_pred.compute_stats(*vit_total_scores)[3]
        post_total_ac[i] = compare_tm_pred.compute_stats(*post_total_scores)[3]
        if VERBOSE:
            print("Summary of cross-validation for fold %i:" % i)
            print()
            print("-------------------------------------------------------")
            if DEBUG:
                input("press any key to continue\n")

    print "Overall viterbi result mean: %s, variance: %s"%(np.mean(vit_total_ac), np.var(vit_total_ac))
    print "Posterior mean: %s, variance %s"%(np.mean(post_total_ac), np.var(post_total_ac))