Exemple #1
0
class Parser:
    """Re-estimate dependency and stop multinomials from a sentence corpus.

    The starting multinomials are loaded through ``PickleHandler`` from
    ``initial_values_path``; ``run_em`` re-estimates them and pickles the
    result to ``final_value_path``.
    """

    def __init__(self, corpus_path, initial_values_path,
                 final_value_path, debug_mode):
        """Load the corpus and the initial multinomials.

        corpus_path: text file; every line is later treated as one sentence.
        initial_values_path: handed to ``PickleHandler`` to load the
            starting dependency and stop multinomials.
        final_value_path: handed to ``PickleHandler`` by ``run_em`` when it
            writes the final multinomials.
        debug_mode: when true, ``validate_multinomials`` prints every
            distribution it checks.
        """
        self.sentences = self.get_sentences(corpus_path)
        self.debug_mode = debug_mode
        self.final_value_path = final_value_path
        self.pickle_handler = PickleHandler(initial_values_path)
        dep_mult_list, stop_mult_list = \
            self.pickle_handler.init_all_dicts()
        self.stop_multinomial_holder = MultinomialHolder()
        self.stop_multinomial_holder.mult_list = stop_mult_list
        self.dep_multinomial_holder = MultinomialHolder()
        self.dep_multinomial_holder.mult_list = dep_mult_list

    def run_em(self, iterations=10):
        """Run ``iterations`` rounds of re-estimation and pickle the result.

        Each round parses every non-blank sentence with ``ParsingAlgo``,
        passes the resulting marginals and hypergraph edges to
        ``update_counts``, then re-estimates and validates both multinomial
        holders.  After the last round the multinomials are written through
        ``PickleHandler(self.final_value_path)`` and the per-round totals
        are pretty-printed.

        Raises AssertionError if the summed log of ``total_potentials``
        (presumably the corpus log-likelihood) drops from one round to the
        next.
        """
        # Start every round at 0.0 so the printed totals are the real sums
        # and no magic value is needed to recognise "no previous round".
        sum_probs = defaultdict(float)
        for i in range(iterations):
            # Two spaces on purpose: same output as `print "iteration ", i`.
            print("iteration  %s" % i)
            for sentence in self.sentences:
                if not sentence.strip():
                    continue
                parsing_algo = ParsingAlgo(
                    sentence,
                    self.dep_multinomial_holder.mult_list,
                    self.stop_multinomial_holder.mult_list)
                marginals = parsing_algo.get_marginals()
                sum_probs[i] += math.log(parsing_algo.total_potentials)
                edges = parsing_algo.hypergraph.edges
                self.update_counts(marginals, edges)

            if i > 0:
                # EM never lowers its objective, but it may leave it
                # unchanged once converged, so equality must be accepted.
                assert sum_probs[i] >= sum_probs[i - 1], \
                    "The prob are %r, %r" % (sum_probs[i], sum_probs[i - 1])

            self.update_parameters()
            self.validate_multinomials(self.dep_multinomial_holder)
            self.validate_multinomials(self.stop_multinomial_holder)

        pickle_hand = PickleHandler(self.final_value_path)
        pickle_hand.write_to_pickle(
            self.dep_multinomial_holder.mult_list,
            self.stop_multinomial_holder.mult_list)
        pprint.pprint(sum_probs)

    def update_counts(self, marginals, edges):
        """Add each edge's marginal to the stop and dependency counts.

        marginals: mapping indexed by ``edge.id``.
        edges: iterable of edges whose ``label`` is an arc exposing
            ``is_cont``, ``modifier_word``, ``head_word``, ``dir`` and
            ``is_adj``.

        Arcs with ``is_cont`` set and an empty ``modifier_word`` are
        ignored (their marginal is never looked up).  Every other arc
        updates the stop counts; arcs with ``is_cont`` set additionally
        update the dependency counts.
        """
        for edge in edges:
            arc = edge.label
            if arc.is_cont and arc.modifier_word == "":
                continue

            weight = marginals[edge.id]
            self.stop_multinomial_holder.inc_counts(
                arc.is_cont,
                (arc.head_word, arc.dir, arc.is_adj),
                weight)
            if arc.is_cont:
                self.dep_multinomial_holder.inc_counts(
                    arc.modifier_word,
                    (arc.head_word, arc.dir),
                    weight)

    def update_parameters(self):
        """Call ``estimate()`` on the dependency and stop holders."""
        self.dep_multinomial_holder.estimate()
        self.stop_multinomial_holder.estimate()

    def get_sentences(self, file_path):
        """Return all lines of ``file_path``, line terminators included."""
        with open(file_path, "r") as fp:
            return fp.readlines()

    def validate_multinomials(self, multinomial_holder):
        """Assert that every multinomial in the holder sums to about 1 or 0.

        The sum of each ``mult.prob`` is rounded to one decimal place
        before the comparison.  When ``self.debug_mode`` is true, each key
        and its distribution are printed first.

        Raises AssertionError naming the first offending key.
        """
        # items() rather than iteritems() so this runs on Python 2 and 3.
        for key, mult in multinomial_holder.mult_list.items():
            if self.debug_mode:
                print(key)
                pprint.pprint(mult.prob)

            total = sum(mult.prob.values())
            assert round(total, 1) in (1.0, 0), \
                "The mult for " + str(key) + " is not totalling to 1 " \
                + str(total)
    def initialize_dep(self):
        """Return a new MultinomialHolder seeded with random counts.

        One ``random.random()`` draw is added for every probability key of
        every multinomial in ``self.harmonic_dep_mult``; ``estimate()`` is
        called on the holder before it is returned.
        """
        holder = MultinomialHolder()
        for condition, multinomial in self.harmonic_dep_mult.iteritems():
            for outcome in multinomial.prob:
                draw = random.random()
                holder.inc_counts(outcome, condition, draw)

        holder.estimate()
        return holder

    def initialize_stop_mult_cont(self):
        """Return a new MultinomialHolder with random two-way counts.

        For every conditioning key in ``self.harmonic_stop_cont_mult`` a
        single random value ``r`` is drawn; outcome ``0`` receives ``r`` and
        outcome ``1`` receives ``1 - r``.  ``estimate()`` is called on the
        holder before it is returned.
        """
        # NOTE(review): 0/1 presumably mirror the is_cont flag - confirm.
        holder = MultinomialHolder()
        for condition, _mult in self.harmonic_stop_cont_mult.iteritems():
            draw = random.random()
            complement = 1 - draw
            holder.inc_counts(0, condition, draw)
            holder.inc_counts(1, condition, complement)

        holder.estimate()
        return holder

if __name__ == "__main__":
    # Load the multinomials from "data/dummy", hand them to
    # RandomInitializer, and pickle the holders it exposes afterwards to
    # "data/random_init".
    source_handler = PickleHandler("data/dummy")
    loaded = source_handler.init_all_dicts()
    dep_mult, stop_cont_mult = loaded

    random_init = RandomInitializer(dep_mult, stop_cont_mult)
    random_init.initialize_multinomials()

    target_handler = PickleHandler("data/random_init")
    target_handler.write_to_pickle(
        random_init.dep_mult_holder.mult_list,
        random_init.stop_cont_mult_holder.mult_list,
    )
Exemple #3
0
      
        if(sent_acc <= 2 and len(actual_dep) >= 2):
            self.incorrect_sent.append(sentence)
            self.incorrect_dep[incorrect_dep_key] += 1

    def get_sentences(self, file_path):
        """Return every line of ``file_path`` as a list of strings.

        Line terminators are kept, exactly as ``readlines`` returns them.
        """
        with open(file_path, "r") as corpus:
            return corpus.readlines()

    def write_to_file(self, file_name, data):
        """Write each item of ``data`` to ``file_name``, one per line.

        The file is opened in binary write mode (truncating any previous
        content) and every item is formatted with ``"%s\\n"``, so a newline
        is appended even if the item already ends with one.
        """
        with open(file_name, "wb") as out:
            for item in data:
                out.write("%s\n" % item)

if __name__ == "__main__":
    # Load the multinomials stored under "data/harmonic_final", evaluate
    # the training sentences with them, then save the sentences and the
    # dependency tallies the evaluator recorded as incorrect.
    pickle_handler = PickleHandler("data/harmonic_final")
    loaded = pickle_handler.init_all_dicts()
    dep_mult_holder, cont_stop_mult_holder = loaded

    evaluator = Evaluator(
        "data/sentences_train.txt",
        "data/dep_index_train.txt",
        dep_mult_holder,
        cont_stop_mult_holder,
    )
    evaluator.evaluate_sentences()

    evaluator.write_to_file("incorrect_sent_rule", evaluator.incorrect_sent)

    with open("incorrect_dep_dict_new", "wb") as dump_file:
        pickle.dump(evaluator.incorrect_dep, dump_file)