def make_gec_instances(raw_file, pos_file, correct_file=None):
    """Build one labeled factor-graph example per input sentence.

    Each line of ``raw_file`` is whitespace-tokenised and wrapped in the
    ``BOS``/``EOS`` markers; ``pos_file`` (and ``correct_file`` when given)
    are read the same way and are expected to be line-parallel to it.
    For every token position a PREDICTED ``Var`` named ``TAG_<index>`` is
    created whose states are the values returned by
    ``generate_correction_candidates(pos, word)``, and each pair of adjacent
    variables is linked by a ``CRFFactor`` named ``"BIGRAM"``.

    Args:
        raw_file: path to the UTF-8 token file.
        pos_file: path to the UTF-8 file with one tag per token of
            ``raw_file``.
        correct_file: optional path to a UTF-8 file whose tokens are stored
            as the assignment of each variable (presumably the corrected
            form of each token -- confirm with the data). When omitted, no
            assignment is stored.

    Returns:
        The ``FgExampleMemoryStore`` holding one ``LabeledFgExample`` per
        line of ``raw_file``.

    Raises:
        IndexError: if ``pos_file`` or ``correct_file`` has fewer lines than
            ``raw_file``.

    Within a line, ``zip`` stops at the shortest of the token lists, so
    surplus tokens in the longer lists are silently ignored.

    NOTE(review): a second function with this same name is defined later in
    the visible source; if both live in one module the later one wins.
    """

    def _read_padded(path):
        # Read everything inside a context manager so the handle is closed
        # deterministically instead of being left to the garbage collector.
        with codecs.open(path, 'r', 'utf8') as handle:
            return [[BOS] + line.strip().split() + [EOS]
                    for line in handle.readlines()]

    instances = FgExampleMemoryStore()
    raw_lines = _read_padded(raw_file)
    pos_lines = _read_padded(pos_file)
    if correct_file is not None:
        correct_lines = _read_padded(correct_file)
    else:
        # One placeholder per token. The previous per-line ``None`` made
        # ``zip`` raise TypeError, so the default argument was unusable.
        correct_lines = [[None] * len(raw) for raw in raw_lines]

    for line_idx, raw in enumerate(raw_lines):
        # Index (rather than zip) the parallel lists so that a short
        # pos/correct file still fails loudly.
        pos = pos_lines[line_idx]
        gold = correct_lines[line_idx]

        factor_graph = FactorGraph()
        vc = VarConfig()
        prev_hc = None
        for w_idx, (w, p, c) in enumerate(zip(raw, pos, gold)):
            candidates = generate_correction_candidates(p, w)
            # Debug trace kept from the original; written as single-argument
            # calls so the line parses under both Python 2 and Python 3.
            print('%s %s %s' % (w, p, c))
            print('can %s %s' % (candidates, c))
            hc = Var(Var.VarType.PREDICTED, len(candidates),
                     "TAG_" + str(w_idx), candidates)
            if c is not None:
                # Only record an assignment when a gold token is available.
                vc.put(hc, c)
            if prev_hc:
                # Pairwise factor over the previous and current variable.
                t_varset = VarSet(hc)
                t_varset.add(prev_hc)
                t_factor = CRFFactor(t_varset, [prev_hc, hc], "BIGRAM")
                factor_graph.addFactor(t_factor)
            prev_hc = hc
        instances.add(LabeledFgExample(factor_graph, vc))
    return instances
def make_instances(txt_file, tag_list, obs_list):
    """Build one labeled factor-graph example per sentence of a tagged corpus.

    The file is split into sentences on the ``###/###`` separator; empty
    chunks are discarded. Every line of a sentence is split on ``/``: the
    first field is the observation and the second field is the tag. For
    each line a PREDICTED ``Var`` named ``TAG_<index>`` ranging over
    ``tag_list`` is created and assigned that tag, an ``ObservedFactor``
    named ``'TAG-OBS'`` carrying the observation is attached to it, and a
    ``CRFFactor`` named ``'TAG-TAG'`` links it to the previous variable.

    Args:
        txt_file: path to the corpus file.
        tag_list: states for every tag variable.
        obs_list: unused by the current implementation; kept so existing
            callers keep working.

    Returns:
        The ``FgExampleMemoryStore`` holding one ``LabeledFgExample`` per
        sentence.

    Raises:
        IndexError: if a line inside a sentence contains no ``/``.
    """
    instances = FgExampleMemoryStore()

    # Close the corpus file deterministically; the original left the handle
    # returned by open() to be reclaimed by the garbage collector.
    with open(txt_file) as corpus:
        corpus_text = corpus.read()
    sentences = [chunk.strip() for chunk in corpus_text.split('###/###')
                 if chunk.strip() != '']

    for sentence in sentences:
        factor_graph = FactorGraph()
        vc = VarConfig()
        prev_hc = None
        for i, line in enumerate(sentence.split('\n')):
            # Split once; field 0 is the observation, field 1 the tag.
            fields = line.split('/')
            hidden_state = fields[1].strip()
            observed_state = fields[0].strip()

            # Tag variable with its gold assignment.
            hc = Var(Var.VarType.PREDICTED, len(tag_list),
                     "TAG_" + str(i), tag_list)
            vc.put(hc, hidden_state)

            # Transition factor between the previous and the current tag.
            if prev_hc:
                t_varset = VarSet(hc)
                t_varset.add(prev_hc)
                t_factor = CRFFactor(t_varset, [prev_hc, hc], 'TAG-TAG')
                factor_graph.addFactor(t_factor)
            prev_hc = hc

            # Emission factor: the observation is passed to the factor
            # itself, so no separate observation variable is created.
            e_varset = VarSet(hc)
            e_factor = ObservedFactor(e_varset, [hc], 'TAG-OBS',
                                      observed_state)
            factor_graph.addFactor(e_factor)
        instances.add(LabeledFgExample(factor_graph, vc))
    return instances
def make_gec_instances(raw_file, pos_file, aspect_file):
    """Build one labeled factor-graph example per line of three parallel files.

    The three UTF-8 files are read in lockstep. Each line is
    whitespace-tokenised and wrapped in the ``BOS``/``EOS`` markers. For
    every token position a PREDICTED ``Var`` named ``TAG_<index>`` is
    created over ``get_candidates(cl, word, pos)`` and assigned the label
    ``word + '|||' + aspect``; adjacent variables are joined by a
    ``CRFFactor`` named ``"BIGRAM"``.

    Args:
        raw_file: path to the token file (tokens carry no aspect).
        pos_file: path to the file with one tag per token.
        aspect_file: path to the file with one aspect per token.

    Returns:
        The ``FgExampleMemoryStore`` holding one ``LabeledFgExample`` per
        line read.

    Raises:
        AssertionError: if the gold label of a token is not among its
            candidates. The check runs after the label has been stored and
            is skipped when Python runs with ``-O``.

    Side effects: writes one ``.`` to stderr per line, plus a
    ``no cans:<word> <pos>`` line whenever exactly one candidate is
    returned. ``zip`` stops at the shortest input, both across files and
    across the tokens of a line.
    """
    global cl  # module-level state handed to get_candidates; only read here

    def _with_boundaries(text_line):
        # Tokenise on whitespace and add the sentence-boundary markers.
        return [BOS] + text_line.strip().split() + [EOS]

    store = FgExampleMemoryStore()
    with codecs.open(aspect_file, 'r', 'utf8') as aspect_f, codecs.open(
            raw_file, 'r', 'utf8') as raw_f, codecs.open(
            pos_file, 'r', 'utf8') as pos_f:
        for raw_line, pos_line, aspect_line in zip(raw_f, pos_f, aspect_f):
            words = _with_boundaries(raw_line)
            tags = _with_boundaries(pos_line)
            aspects = _with_boundaries(aspect_line)
            sys.stderr.write('.')  # progress marker, one per sentence

            graph = FactorGraph()
            gold = VarConfig()
            previous = None
            for position, (w, p, a) in enumerate(zip(words, tags, aspects)):
                options = get_candidates(cl, w, p)
                if len(options) == 1:
                    sys.stderr.write('no cans:' + w + ' ' + p + '\n')

                current = Var(Var.VarType.PREDICTED, len(options),
                              "TAG_" + str(position), options)
                label = w + '|||' + a
                gold.put(current, label)
                assert label in options

                if previous:
                    # Pairwise factor over the previous and current variable.
                    pair = VarSet(current)
                    pair.add(previous)
                    graph.addFactor(
                        CRFFactor(pair, [previous, current], "BIGRAM"))
                previous = current

            store.add(LabeledFgExample(graph, gold))
    return store