def make_gec_instances(raw_file, pos_file, correct_file=None):
    """Build one labeled chain factor graph per line of ``raw_file``.

    Every line of each input file is whitespace-tokenised and wrapped in
    ``BOS`` / ``EOS``.  For each aligned (raw token, POS token) pair a
    predicted ``Var`` is created whose states are the values returned by
    ``generate_correction_candidates(pos_token, raw_token)``; consecutive
    variables are linked by a ``"BIGRAM"`` ``CRFFactor``.

    NOTE(review): a second ``make_gec_instances`` taking ``aspect_file`` as
    its third parameter is defined later in this file; the later ``def``
    rebinds the module-level name.  Confirm which one callers expect.

    :param raw_file: path of a UTF-8 text file, one token sequence per line.
    :param pos_file: path of a UTF-8 file parallel to ``raw_file``.
    :param correct_file: optional path of a UTF-8 file parallel to
        ``raw_file``; its tokens are stored in the ``VarConfig`` as the gold
        state of each variable.  When ``None`` no gold state is stored.
    :returns: an ``FgExampleMemoryStore`` holding one ``LabeledFgExample``
        per line of ``raw_file``.
    :raises IndexError: if ``pos_file`` (or ``correct_file``) has fewer lines
        than ``raw_file``.
    """

    def _read_padded(path):
        # Read all lines up front so the handle is closed before returning
        # (the previous code never closed the files it opened).
        with codecs.open(path, 'r', 'utf8') as handle:
            return [[BOS] + line.strip().split() + [EOS]
                    for line in handle.readlines()]

    instances = FgExampleMemoryStore()
    raw_lines = _read_padded(raw_file)
    pos_lines = _read_padded(pos_file)
    if correct_file is not None:
        correct_lines = _read_padded(correct_file)
    else:
        correct_lines = None

    for line_idx, raw_tokens in enumerate(raw_lines):
        pos_tokens = pos_lines[line_idx]
        if correct_lines is None:
            # Previously this case zipped against ``None`` and raised
            # TypeError; use one ``None`` placeholder per token instead.
            gold_tokens = [None] * len(raw_tokens)
        else:
            gold_tokens = correct_lines[line_idx]

        factor_graph = FactorGraph()
        vc = VarConfig()
        prev_hc = None
        for w_idx, (w, p, c) in enumerate(zip(raw_tokens, pos_tokens, gold_tokens)):
            candidates = generate_correction_candidates(p, w)
            # Debug trace; the single-argument print(...) form emits the same
            # text under Python 2 and also parses under Python 3.
            print('%s %s %s' % (w, p, c))
            print('can %s %s' % (candidates, c))
            hc = Var(Var.VarType.PREDICTED, len(candidates),
                     "TAG_" + str(w_idx), candidates)
            if c is not None:
                # NOTE(review): assumes an unlabeled variable should simply be
                # left out of the VarConfig -- confirm against the trainer.
                vc.put(hc, c)
            if prev_hc:
                # Transition factor between the previous and current variable.
                t_varset = VarSet(hc)
                t_varset.add(prev_hc)
                t_factor = CRFFactor(t_varset, [prev_hc, hc], "BIGRAM")
                factor_graph.addFactor(t_factor)
            prev_hc = hc
        instances.add(LabeledFgExample(factor_graph, vc))
    return instances
def make_gec_instances(raw_file, pos_file, aspect_file):
    """Create a labeled chain factor graph for every aligned input line.

    The three UTF-8 files are read in lockstep.  Each line is
    whitespace-tokenised and wrapped in ``BOS`` / ``EOS``.  Every position
    gets a predicted ``Var`` whose states come from
    ``get_candidates(cl, word, pos_tag)`` (``cl`` is a module-level global),
    and whose gold state is ``word + '|||' + aspect``.  Neighbouring
    variables are connected by a ``"BIGRAM"`` ``CRFFactor``.

    A ``.`` is written to stderr per line, and a ``no cans:`` message is
    written for every position that has exactly one candidate.

    :param raw_file: path of the token file.
    :param pos_file: path of the parallel POS file.
    :param aspect_file: path of the parallel aspect file.
    :returns: an ``FgExampleMemoryStore`` with one ``LabeledFgExample`` per
        aligned line.
    :raises AssertionError: if a gold state is not among its candidates.
    """
    global cl

    def _padded_tokens(text):
        # Tokenise on whitespace and add the sentence boundary markers.
        return [BOS] + text.strip().split() + [EOS]

    store = FgExampleMemoryStore()
    with codecs.open(aspect_file, 'r', 'utf8') as aspect_f, \
            codecs.open(raw_file, 'r', 'utf8') as raw_f, \
            codecs.open(pos_file, 'r', 'utf8') as pos_f:
        for raw_line, pos_line, aspect_line in zip(raw_f, pos_f, aspect_f):
            words = _padded_tokens(raw_line)
            tags = _padded_tokens(pos_line)
            aspects = _padded_tokens(aspect_line)
            sys.stderr.write('.')  # progress marker, one per line

            graph = FactorGraph()
            gold_config = VarConfig()
            previous = None
            for position, (word, tag, aspect) in enumerate(zip(words, tags, aspects)):
                # The raw word carries no aspect; candidates and the gold
                # label are both of the form word|||aspect.
                options = get_candidates(cl, word, tag)
                if len(options) == 1:
                    sys.stderr.write('no cans:' + word + ' ' + tag + '\n')
                node = Var(Var.VarType.PREDICTED, len(options),
                           "TAG_" + str(position), options)
                gold_label = word + '|||' + aspect
                gold_config.put(node, gold_label)
                assert gold_label in options
                if previous:
                    # Link this variable to its left neighbour.
                    scope = VarSet(node)
                    scope.add(previous)
                    graph.addFactor(CRFFactor(scope, [previous, node], "BIGRAM"))
                previous = node
            store.add(LabeledFgExample(graph, gold_config))
    return store
def make_instances(txt_file, tag_list, obs_list):
    """Build one labeled chain factor graph per sequence in ``txt_file``.

    The file content is split into sequences on the literal separator
    ``###/###``; empty sequences are dropped.  Each line of a sequence is
    split on ``/``: field 0 is the observed state and field 1 the hidden
    state.  Every line yields a predicted ``Var`` over ``tag_list`` whose
    gold value is the hidden state, a ``'TAG-TAG'`` ``CRFFactor`` linking it
    to the previous variable, and a ``'TAG-OBS'`` ``ObservedFactor`` carrying
    the observed state.

    :param txt_file: path of the input file.
    :param tag_list: state names used for every hidden variable.
    :param obs_list: currently unused; kept so existing callers keep working.
    :returns: an ``FgExampleMemoryStore`` with one ``LabeledFgExample`` per
        sequence.
    :raises IndexError: if a line inside a sequence contains no ``/``.
    """
    instances = FgExampleMemoryStore()

    # Close the file deterministically; the previous code leaked the handle.
    with open(txt_file) as handle:
        raw_text = handle.read()
    sequences = [chunk.strip() for chunk in raw_text.split('###/###')
                 if chunk.strip() != '']

    for sequence in sequences:
        factor_graph = FactorGraph()
        vc = VarConfig()
        prev_hc = None
        for i, line in enumerate(sequence.split('\n')):
            fields = line.split('/')
            hidden_state = fields[1].strip()
            observed_state = fields[0].strip()

            # Hidden variable and its gold label.
            hc = Var(Var.VarType.PREDICTED, len(tag_list), "TAG_" + str(i), tag_list)
            vc.put(hc, hidden_state)

            # Transition factor to the previous hidden variable, if any.
            if prev_hc:
                t_varset = VarSet(hc)
                t_varset.add(prev_hc)
                t_factor = CRFFactor(t_varset, [prev_hc, hc], 'TAG-TAG')
                factor_graph.addFactor(t_factor)
            prev_hc = hc

            # Emission factor: the observation is passed to the factor
            # directly, so no separate observed variable is created.
            e_varset = VarSet(hc)
            e_factor = ObservedFactor(e_varset, [hc], 'TAG-OBS', observed_state)
            factor_graph.addFactor(e_factor)
        instances.add(LabeledFgExample(factor_graph, vc))
    return instances
def get(self, i):
    """Build the labeled factor graph for training instance ``i``.

    One predicted ``Var`` over ``self.en_vocab`` is created per entry of the
    instance's ``current_sent``, ordered by integer ``position``.
    ``self.get_hs(entry, revealed_guesses, guesses)`` returns ``(c, hs)``;
    a truthy ``c`` is treated as "revealed" (observed), a falsy one as hidden.

    * Hidden variables get their gold state ``hs`` stored in the
      ``VarConfig`` and an ``ObservedEFactor`` on the entry's ``l2_word``.
    * For every pair of positions that differ by exactly one:
      both hidden -> a ``TTFactor``; exactly one hidden -> an
      ``ObservedTFactor`` on the hidden variable, carrying the revealed
      neighbour's ``hs``; both revealed -> no factor.

    Progress and debug text is written to stdout / stderr, and ``pdb`` is
    entered if storing a gold state or adding a factor raises.

    :param i: index into ``self.training_instances``.
    :returns: a ``LabeledFgExample`` wrapping the factor graph and the gold
        ``VarConfig``.
    """
    ti = self.training_instances[i]
    tag_subset = self.en_vocab
    factor_graph = FactorGraph()
    vc = VarConfig()

    # Index guesses by their id converted to a tuple.
    curr_rg = {}
    for crg in ti.current_revealed_guesses:
        curr_rg[tuple(crg.id)] = crg
    curr_g = {}
    for cg in ti.current_guesses:
        curr_g[tuple(cg.id)] = cg
    curr_s_order = sorted((int(cs.position), cs) for cs in ti.current_sent)

    var_map = {}  # position -> (variable, hs, c)
    fac_map = {}  # factor id -> list of factors

    for cso_id0, cso0 in curr_s_order:
        c, hs = self.get_hs(cso0, curr_rg, curr_g)
        # Debug trace; single-argument print(...) emits the same text under
        # Python 2 and also parses under Python 3.
        print('%s %s' % (c, hs))
        hc_var = Var(Var.VarType.PREDICTED, len(tag_subset),
                     'TAG_' + str(cso_id0), tag_subset)
        var_map[cso_id0] = (hc_var, hs, c)
        if not c:
            # Deliberate debugging hook: the bare except is kept so that
            # anything raised by the factor-graph library still lands in pdb.
            try:
                vc.put(hc_var, hs)
            except:
                print(hs)
                print('vc is broken...')
                pdb.set_trace()
            # NOTE(review): curr_rg is keyed by tuple(id) but this checks the
            # raw id -- confirm the two forms are comparable.
            assert cso0.id not in curr_rg
            e_factor = ObservedEFactor(VarSet(hc_var), hc_var, cso0.l2_word)
            fac_map.setdefault((cso_id0, 'er'), []).append(e_factor)

    # Link each position to the one immediately before it.  A direct lookup
    # replaces the previous all-pairs scan; var_map holds every position.
    for cso_id1, _ in curr_s_order:
        cso_id2 = cso_id1 - 1
        if cso_id2 not in var_map:
            continue
        v1_hc, v1_hs, v1_c = var_map[cso_id1]
        v2_hc, v2_hs, v2_c = var_map[cso_id2]
        if not v1_c and not v2_c:
            # Both hidden: at most one TT factor per pair.
            fid = tuple(sorted([(cso_id1, 'h'), (cso_id2, 'h')]))
            if fid not in fac_map:
                print('%s both are hidden' % (fid,))
                t_varset = VarSet(v1_hc)
                t_varset.add(v2_hc)
                t_factor = TTFactor(t_varset, var1=v1_hc, var1pos=cso_id1,
                                    var2=v2_hc, var2pos=cso_id2)
                fac_map[fid] = [t_factor]
        elif not v1_c and v2_c:
            # Variable 2 is revealed, i.e. observed.
            fid = tuple(sorted([(cso_id1, 'h'), (cso_id2, 'r')]))
            if fid not in fac_map:
                print('%s  is hidden %s is reveled' % (cso_id1, cso_id2))
                t_factor = ObservedTFactor(VarSet(v1_hc), var1=v1_hc,
                                           var1pos=cso_id1, var2=None,
                                           var2pos=None, observed_state=v2_hs)
                fac_map[fid] = [t_factor]
        elif v1_c and not v2_c:
            # Variable 1 is revealed, i.e. observed.
            fid = tuple(sorted([(cso_id1, 'r'), (cso_id2, 'h')]))
            if fid not in fac_map:
                print('%s is hidden %s is reveled' % (cso_id2, cso_id1))
                t_factor = ObservedTFactor(VarSet(v2_hc), var1=None,
                                           var1pos=None, var2=v2_hc,
                                           var2pos=cso_id2, observed_state=v1_hs)
                fac_map[fid] = [t_factor]
        # Both revealed: nothing to add.

    for fid, factors in fac_map.items():
        for _factor in factors:
            try:
                factor_graph.addFactor(_factor)
            except:
                print('something broken when adding factor to factor_graph')
                pdb.set_trace()
    sys.stderr.write('.')
    return LabeledFgExample(factor_graph, vc)