    def train_markov_model_from_constraint_matrix(self, csv_path, mm_path,
                                                  delim="\t"):
        """Trains the cfd/cpd from a constraint matrix csv: the header row
        gives the range (column) states, and each following row gives a
        domain state and an integer count per range state.
        """
        # strip newlines here so the final column's state name stays clean
        table = [line.strip("\n").split(delim) for line in open(csv_path)]
        tags = []
        range_states = table.pop(0)[1:]
        for row in table:
            domain = row[0]
            for i, r in enumerate(row[1:]):
                s = r.replace(" ", "").strip("\n")
                if s == '':
                    continue
                if int(s) > 0:
                    # a count of n yields n (domain, range) training pairs
                    for _ in range(int(s)):
                        tags.append((domain, range_states[i]))
        self.cfd_tags = nltk.ConditionalFreqDist(tags)
        print "cfd trained, counts:"
        self.cfd_tags.tabulate()
        print "test:"
        print tabulate_cfd(self.cfd_tags)
        # save this new cfd for later use
        pickle.dump(self.cfd_tags, open(mm_path, "wb"))
        # initialize the cpd
        self.cpd_tags = nltk.ConditionalProbDist(self.cfd_tags,
                                                 nltk.MLEProbDist)
        # print "cpd summary:"
        # print self.cpd_tags.viewitems()
        print tabulate_cfd(self.cpd_tags)
        all_outcomes = [v.keys() for v in self.cfd_tags.values()]
        self.tag_set = set(self.cfd_tags.keys() +
                           [y for x in all_outcomes for y in x])
        self.viterbi_init()  # initialize viterbi
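
    # Illustrative sketch of the constraint-matrix csv this method expects
    # (layout inferred from the parsing code above; the state names here
    # are hypothetical, shown with delim=","): the header row lists the
    # range (column) states, and each following row gives a domain state
    # and one integer count per column.
    #
    #     ,s,<f/>,<e/>,se
    #     s,0,5,1,0
    #     <f/>,0,3,2,4
    #     <e/>,0,1,3,2
    #
    # A cell count of n contributes n (domain, range) pairs to the cfd, so
    # under the MLE cpd e.g. P(<e/> | <f/>) = 2 / (0 + 3 + 2 + 4).
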
    def train_markov_model_from_file(self, corpus_path, mm_path,
                                     update=False, non_sparse=False):
        """Adds to the self.cfd_tags conditional frequency distribution
        loaded, if there is one, else starts afresh. Recalculates the
        conditional probability distribution afresh.

        args:
        --corpus_path : filepath to newline-separated file to learn
            sequence probabilities from.
        --mm_path : filepath of the markov model distribution to write to.
        --update : whether to update the current cfd; if not, start anew.
        --non_sparse : whether to omit lines in the corpus without
            repairs, which gives higher probability to repairs.
        """
        tags = []
        # expects line-separated sequences
        corpus_file = open(corpus_path)
        print "training decoder from", corpus_path
        for line in corpus_file:
            if line.strip("\n") == "":
                continue
            if non_sparse and ("<r" not in line):
                continue
            labels_data = line.strip("\n").split(",")
            if "<r" in labels_data[0]:
                continue  # TODO error with corpus creation
            previous = "s"  # start state
            # print "length sequence", len(labels_data)
            for i in range(len(labels_data)):
                if labels_data[i] not in self.observation_tags:
                    print labels_data[i], "not in obs tags"
                    continue
                if any(["<i" in t for t in self.observation_tags]):
                    # relabel the run of edit tags ("<e") leading up to a
                    # repair onset ("<rm") as interregna ("<i")
                    if "<e" in labels_data[i] and i < len(labels_data) - 1:
                        rps_onset = None
                        for j in range(i, len(labels_data)):
                            if "<rm" in labels_data[j]:
                                rps_onset = j
                                break
                            if "<e" not in labels_data[j]:
                                break
                        if rps_onset is not None:
                            for k in range(i, rps_onset):
                                labels_data[k] = labels_data[k].replace(
                                    "<e", "<i")
                # print labels_data[i]
                # adjust interregna (older, disabled approach):
                # if any(["<i" in t for t in self.observation_tags]):
                #     if "<rm-" in labels_data[i]:
                #         b = len(tags) - 1
                #         while ("e" in tags[b][1] and
                #                (not tags[b][1] == "se") and b > 0):
                #             if "i" not in tags[b][1]:
                #                 new_1 = tags[b][1].replace('eR', 'i').\
                #                     replace('e', 'i')
                #                 tags[b] = (tags[b][0], new_1)
                #             if "e" in tags[b][0] and \
                #                     "i" not in tags[b][0]:
                #                 new_0 = tags[b][0].replace('eR', 'i').\
                #                     replace('e', 'i')
                #                 tags[b] = (new_0, tags[b][1])
                #             b -= 1
                #         previous = tags[-1][1]
                tag = self.convert_tag(previous, labels_data[i])
                tags.append((previous, tag))
                previous = tag
            if "se" in self.observation_tags:
                # add end tag
                tags.append((previous, 'se'))
        corpus_file.close()
        # print "If we have just seen 'DET', \
        #     the probability of 'N' is", cpd_tags["DET"].prob("N")
        # assumes these are added to the existing one
        if update:
            self.cfd_tags += nltk.ConditionalFreqDist(tags)
        else:
            self.cfd_tags = nltk.ConditionalFreqDist(tags)
        print "cfd trained, counts:"
        self.cfd_tags.tabulate()
        print "test:"
        print tabulate_cfd(self.cfd_tags)
        # save this new cfd for later use
        pickle.dump(self.cfd_tags, open(mm_path, "wb"))
        # initialize the cpd
        self.cpd_tags = nltk.ConditionalProbDist(self.cfd_tags,
                                                 nltk.MLEProbDist)
        # print "cpd summary:"
        # print self.cpd_tags.viewitems()
        print tabulate_cfd(self.cpd_tags)
        all_outcomes = [v.keys() for v in self.cfd_tags.values()]
        self.tag_set = set(self.cfd_tags.keys() +
                           [y for x in all_outcomes for y in x])
        self.viterbi_init()  # initialize viterbi
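
    # Illustrative sketch of the corpus format this method consumes
    # (inferred from the parsing code; the tag names are hypothetical):
    # one comma-joined tag sequence per line, where "<r"-prefixed tags
    # mark repairs, e.g.
    #
    #     <f/>,<e/>,<rm-1/>,<f/>
    #     <f/>,<f/>
    #
    # With non_sparse=True the second line is skipped (it contains no
    # "<r" tag), biasing the learned transitions towards repair
    # sequences; any "<e" tags running up to a "<rm" onset are relabelled
    # "<i" (interregnum) before the (previous, tag) bigrams are counted.
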
def load_tags(filepath):
    """Loads a tag dictionary from a csv file where each tag is mapped to
    by an integer.
    """
    tag_dictionary = defaultdict(int)
    f = open(filepath)
    for line in f:
        l = line.strip('\n').split(",")
        tag_dictionary[l[1]] = int(l[0])
    f.close()
    return tag_dictionary


tags_name = "swbd_disf1_uttseg_simple_033"
tags = load_tags(
    "../data/tag_representations/{}_tags.csv".format(tags_name))
if "disf" in tags_name:
    interreg_ind = len(tags.keys())
    interreg_tag = "<i/><cc/>" if "uttseg" in tags_name else "<i/>"
    tags[interreg_tag] = interreg_ind  # add the interregnum tag
print tags
h = FirstOrderHMM(tags, markov_model_file=None)
mm_path = "models/{}_tags.pkl".format(tags_name)
# corpus_path = "../data/tag_representations/{}_tag_corpus.csv".format(
#     tags_name).replace("_021", "")
# h.train_markov_model_from_file(corpus_path, mm_path, non_sparse=True)
csv_file = "models/{}.csv".format(tags_name)
h.train_markov_model_from_constraint_matrix(csv_file, mm_path, delim=",")
table = tabulate_cfd(h.cpd_tags)
test_f = open("models/{}_tags_table.csv".format(tags_name), "w")
test_f.write(table)
test_f.close()
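
# Sketch of how the pickled model written to mm_path could be reloaded
# and queried later (assumed usage, mirroring the commented
# cpd_tags["DET"].prob("N") example in the training method above):
#
#     cfd = pickle.load(open(mm_path, "rb"))
#     cpd = nltk.ConditionalProbDist(cfd, nltk.MLEProbDist)
#     print "P(se | s) =", cpd["s"].prob("se")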