class Parser:
    """Train a dependency model with EM over a sentence corpus (Python 2).

    Initial multinomial parameters are loaded from a pickle, refined with
    inside-outside expectation-maximization, and the final parameters are
    written back to a pickle at ``final_value_path``.
    """

    def __init__(self, corpus_path, initial_values_path, final_value_path, debug_mode):
        # Raw sentences, one per line (blank lines are skipped later in run_em).
        self.sentences = self.get_sentences(corpus_path)
        self.debug_mode = debug_mode
        self.final_value_path = final_value_path
        # Load the starting dependency / stop-continue multinomials from disk.
        self.pickle_handler = PickleHandler(initial_values_path)
        dep_mult_list, stop_mult_list = \
            self.pickle_handler.init_all_dicts()
        # Holders accumulate expected counts and re-estimate probabilities.
        self.stop_multinomial_holder = MultinomialHolder()
        self.stop_multinomial_holder.mult_list = stop_mult_list
        self.dep_multinomial_holder = MultinomialHolder()
        self.dep_multinomial_holder.mult_list = dep_mult_list

    def run_em(self):
        """Run a fixed 10 iterations of EM, then validate and pickle the result.

        Each iteration accumulates per-edge marginals into the holders
        (E-step via update_counts) and re-normalizes (M-step via
        update_parameters). The per-iteration corpus log-likelihood is
        tracked in sum_probs and asserted to be non-decreasing.
        """
        # NOTE(review): entries start at 1.0 (not 0.0), so every stored
        # log-likelihood is offset by +1.0; 1.0 also doubles as the
        # "no previous iteration" sentinel in the guard below, which would
        # misfire if a real sum ever equalled exactly 1.0.
        sum_probs = defaultdict(lambda: 1.0)
        for i in range(10):
            print "iteration ", i
            for sentence in self.sentences:
                if(sentence.strip() == ""):
                    continue  # skip blank corpus lines
                # Build the hypergraph / run inside-outside for this sentence
                # under the current parameters.
                parsing_algo = ParsingAlgo(sentence,
                                           self.dep_multinomial_holder.mult_list,
                                           self.stop_multinomial_holder.mult_list)
                marginals = parsing_algo.get_marginals()
                # Accumulate the sentence's log partition value.
                sum_probs[i] += math.log(parsing_algo.total_potentials)
                edges = parsing_algo.hypergraph.edges
                # E-step: fold edge marginals into the expected counts.
                self.update_counts(marginals, edges)
            # EM guarantees monotone non-decreasing likelihood; check it
            # against the previous iteration (skipped on iteration 0 via
            # the 1.0 default sentinel).
            if(sum_probs[i-1] != 1.0):
                assert sum_probs[i] > sum_probs[i-1], \
                    "The prob are %r, %r" % (sum_probs[i], sum_probs[i-1])
            # M-step: re-normalize counts into probabilities.
            self.update_parameters()
        # Sanity-check the final distributions, then persist them.
        self.validate_multinomials(self.dep_multinomial_holder)
        self.validate_multinomials(self.stop_multinomial_holder)
        pickle_hand = PickleHandler(self.final_value_path)
        pickle_hand.write_to_pickle(self.dep_multinomial_holder.
                                    mult_list, self.stop_multinomial_holder.mult_list)
        pprint.pprint(sum_probs)

    def update_counts(self, marginals, edges):
        """E-step accumulation: add each edge's marginal to its multinomials.

        Continuing arcs with a non-empty modifier contribute to both the
        stop-continue and dependency multinomials; stopping arcs contribute
        to the stop-continue multinomial only.
        """
        for edge in edges:
            arc = edge.label
            if arc.is_cont and arc.modifier_word != "":
                # "continue" decision, conditioned on (head, direction, adjacency).
                self.stop_multinomial_holder.inc_counts(arc.is_cont,
                    (arc.head_word, arc.dir, arc.is_adj), marginals[edge.id])
                # modifier attachment, conditioned on (head, direction).
                self.dep_multinomial_holder.inc_counts(arc.
                    modifier_word, (arc.head_word, arc.dir), marginals[edge.id])
            if not arc.is_cont:
                # "stop" decision — counted regardless of modifier_word.
                self.stop_multinomial_holder.\
                    inc_counts(arc.is_cont,
                    (arc.head_word, arc.dir, arc.is_adj), marginals[edge.id])

    def update_parameters(self):
        """M-step: re-normalize accumulated counts into probabilities."""
        self.dep_multinomial_holder.estimate()
        self.stop_multinomial_holder.estimate()

    def get_sentences(self, file_path):
        """Return the file's lines (newlines retained by readlines)."""
        sentences = []
        with open(file_path, "r") as fp:
            sentences = fp.readlines()
        return sentences

    def validate_multinomials(self, multinomial_holder):
        """Assert every multinomial sums to ~1.0 (or 0 for unused entries).

        When debug_mode is set, each key and its probability table are
        printed before the check.
        """
        for key, mult in multinomial_holder.mult_list.iteritems():
            if(self.debug_mode):
                print key
                pprint.pprint(mult.prob)
            total = sum(mult.prob.values())
            # round(.., 1) tolerates small floating-point drift; 0 is
            # accepted for distributions that received no counts.
            assert round(total, 1) == 1.0 or round(total, 1) == 0, \
                "The mult for " + str(key) + " is not totalling to 1 " \
                + str(total)
    # NOTE(review): these two methods belong to a class whose header is
    # outside this view (presumably RandomInitializer, per the script below);
    # they read self.harmonic_dep_mult / self.harmonic_stop_cont_mult, which
    # are assumed to be dicts of Multinomial objects — confirm against the
    # class __init__.

    def initialize_dep(self):
        """Build a dependency multinomial holder with random initial counts.

        Mirrors the key structure of self.harmonic_dep_mult: every
        (prob_key | cond_key) cell gets an independent random count, then
        estimate() normalizes each conditional distribution.
        """
        dep_mult_holder = MultinomialHolder()
        for cond_key, mult in self.harmonic_dep_mult.iteritems():
            for prob_key in mult.prob:
                dep_mult_holder.\
                    inc_counts(prob_key, cond_key, random.random())
        dep_mult_holder.estimate()
        return dep_mult_holder

    def initialize_stop_mult_cont(self):
        """Build a stop/continue multinomial holder with random initial counts.

        For each conditioning key the two outcomes (0 = stop, 1 = continue —
        presumably; verify against callers) get complementary counts r and
        1 - r, so each pair already sums to 1 before estimate().
        """
        stop_cont_mult_holder = MultinomialHolder()
        for cond_key, mult in self.harmonic_stop_cont_mult.iteritems():
            random_value = random.random()
            stop_cont_mult_holder.\
                inc_counts(0, cond_key, random_value)
            stop_cont_mult_holder.\
                inc_counts(1, cond_key, 1 - random_value)
        stop_cont_mult_holder.estimate()
        return stop_cont_mult_holder


if __name__ == "__main__":
    # Script entry: load the harmonic-initialized multinomials only to copy
    # their key structure, randomize them, and pickle the random start point.
    pickle_handler = PickleHandler("data/dummy")
    dep_mult, stop_cont_mult = pickle_handler.init_all_dicts()
    random_init = RandomInitializer(dep_mult, stop_cont_mult)
    random_init.initialize_multinomials()
    pickle_handler = PickleHandler("data/random_init")
    pickle_handler.write_to_pickle(random_init.dep_mult_holder.
                                   mult_list, random_init.stop_cont_mult_holder.mult_list)
        # NOTE(review): tail of an evaluation method whose start (and loop
        # structure) is outside this view; the original indentation was lost,
        # so the nesting of the increment relative to the `if` is a best
        # guess — confirm whether incorrect_dep is bumped per-sentence (as
        # written) or unconditionally per-dependency.
        # Record sentences that parsed badly: at most 2 correct attachments
        # out of at least 2 gold dependencies.
        if(sent_acc <= 2 and len(actual_dep) >= 2):
            self.incorrect_sent.append(sentence)
            self.incorrect_dep[incorrect_dep_key] += 1

    def get_sentences(self, file_path):
        """Return the file's lines (newlines retained by readlines)."""
        sentences = []
        with open(file_path, "r") as fp:
            sentences = fp.readlines()
        return sentences

    def write_to_file(self, file_name, data):
        """Write each item of data as its own line to file_name."""
        with open(file_name, "wb") as fp:
            # Generator keeps this lazy; "%s" stringifies non-str items.
            fp.writelines(("%s\n" % line for line in data))


if __name__ == "__main__":
    # Script entry: evaluate the harmonic-trained parameters on the training
    # corpus and dump the misparsed sentences / dependency error counts.
    pickle_handler = PickleHandler("data/harmonic_final")
    dep_mult_holder, cont_stop_mult_holder = \
        pickle_handler.init_all_dicts()
    evaluator = Evaluator("data/sentences_train.txt",
                          "data/dep_index_train.txt", dep_mult_holder,
                          cont_stop_mult_holder)
    evaluator.evaluate_sentences()
    evaluator.write_to_file("incorrect_sent_rule",
                            evaluator.incorrect_sent)
    with open("incorrect_dep_dict_new", "wb") as fp:
        pickle.dump(evaluator.incorrect_dep, fp)