Ejemplo n.º 1
0
def make_gec_instances(raw_file, pos_file, correct_file=None):
    """Build one chain-structured factor-graph example per input sentence.

    Each of the given files is read as UTF-8, one sentence per line. Every
    line is split on whitespace and wrapped in ``BOS`` / ``EOS`` markers.
    For every token a ``PREDICTED`` variable named ``TAG_<index>`` is
    created whose states are the candidates returned by
    ``generate_correction_candidates(pos, word)``, and each pair of
    consecutive variables is linked by a ``"BIGRAM"`` ``CRFFactor``.

    Args:
        raw_file: Path to the file with the raw (uncorrected) sentences.
        pos_file: Path to the file with the POS tags, aligned with
            ``raw_file`` line by line and token by token.
        correct_file: Optional path to the file with the gold corrections,
            aligned the same way. When omitted, no gold assignment is
            stored for any variable.

    Returns:
        The ``FgExampleMemoryStore`` holding one ``LabeledFgExample`` per
        line of ``raw_file``.

    Raises:
        IndexError: If ``pos_file`` (or ``correct_file``) has fewer lines
            than ``raw_file``.

    Note:
        Tokens are paired with ``zip``, so within a sentence the shortest
        of the aligned token lists decides how many variables are created.
    """
    def read_padded(path):
        # ``with`` closes the handle deterministically; the previous
        # version left all three files open until garbage collection.
        with codecs.open(path, 'r', 'utf8') as handle:
            return [[BOS] + line.strip().split() + [EOS]
                    for line in handle.readlines()]

    instances = FgExampleMemoryStore()
    raw_lines = read_padded(raw_file)
    pos_lines = read_padded(pos_file)
    if correct_file is not None:
        correct_lines = read_padded(correct_file)
    else:
        # One ``None`` per token. A bare ``None`` per sentence (as before)
        # made ``zip`` raise TypeError, so the unlabeled path never worked.
        correct_lines = [[None] * len(tokens) for tokens in raw_lines]

    for line_idx, raw_tokens in enumerate(raw_lines):
        # Index the aligned lists explicitly so a missing line still fails
        # loudly instead of being silently dropped.
        pos_tokens = pos_lines[line_idx]
        gold_tokens = correct_lines[line_idx]
        factor_graph = FactorGraph()
        vc = VarConfig()
        prev_hc = None
        for w_idx, (w, p, c) in enumerate(
                zip(raw_tokens, pos_tokens, gold_tokens)):
            candidates = generate_correction_candidates(p, w)
            # Debug trace; single-argument print() emits the same text on
            # Python 2 and Python 3.
            print('%s %s %s' % (w, p, c))
            print('can %s %s' % (candidates, c))
            hc = Var(Var.VarType.PREDICTED, len(candidates),
                     "TAG_" + str(w_idx), candidates)
            if c is not None:
                vc.put(hc, c)
            # NOTE(review): without a gold file the VarConfig stays empty;
            # confirm LabeledFgExample accepts that for decoding-only use.
            if prev_hc is not None:
                # Transition factor between the previous and current tag.
                t_varset = VarSet(hc)
                t_varset.add(prev_hc)
                t_factor = CRFFactor(t_varset, [prev_hc, hc], "BIGRAM")
                factor_graph.addFactor(t_factor)
            prev_hc = hc
        instances.add(LabeledFgExample(factor_graph, vc))
    return instances
Ejemplo n.º 2
0
def make_instances(txt_file, tag_list, obs_list):
    """Build one chain-structured factor-graph example per sentence.

    ``txt_file`` is read in full and split into sentences on the
    ``###/###`` separator; empty sentences are dropped. Each remaining
    sentence is split into lines of the form ``observation/tag``. For every
    line a ``PREDICTED`` variable named ``TAG_<index>`` with states
    ``tag_list`` is created and assigned the line's tag, an
    ``ObservedFactor`` (``'TAG-OBS'``) ties the variable to the observation,
    and consecutive variables are linked by a ``'TAG-TAG'`` ``CRFFactor``.

    Args:
        txt_file: Path to the annotated text file.
        tag_list: The states every tag variable can take.
        obs_list: Currently unused; kept so existing callers keep working.

    Returns:
        The ``FgExampleMemoryStore`` holding one ``LabeledFgExample`` per
        non-empty sentence.

    Raises:
        IndexError: If a line inside a sentence contains no ``/``.
    """
    instances = FgExampleMemoryStore()
    # Read inside a context manager so the handle is closed right away;
    # the previous ``open(...).read()`` never closed it.
    with open(txt_file) as handle:
        stripped = [chunk.strip() for chunk in handle.read().split('###/###')]
    sentences = [chunk for chunk in stripped if chunk != '']

    for sentence in sentences:
        factor_graph = FactorGraph()
        vc = VarConfig()
        prev_hc = None
        for i, line in enumerate(sentence.split('\n')):
            # Only the first two '/'-separated fields are used.
            fields = line.split('/')
            observed_state = fields[0].strip()
            hidden_state = fields[1].strip()

            hc = Var(Var.VarType.PREDICTED, len(tag_list),
                     "TAG_" + str(i), tag_list)
            vc.put(hc, hidden_state)

            # Transition factor between the previous and current tag.
            if prev_hc is not None:
                t_varset = VarSet(hc)
                t_varset.add(prev_hc)
                t_factor = CRFFactor(t_varset, [prev_hc, hc], 'TAG-TAG')
                factor_graph.addFactor(t_factor)
            prev_hc = hc

            # Emission factor: the observation is passed to the factor
            # directly rather than modelled as a separate clamped variable.
            e_varset = VarSet(hc)
            e_factor = ObservedFactor(e_varset, [hc], 'TAG-OBS',
                                      observed_state)
            factor_graph.addFactor(e_factor)
        instances.add(LabeledFgExample(factor_graph, vc))
    return instances
Ejemplo n.º 3
0
def make_gec_instances(raw_file, pos_file, aspect_file):
    """Build one chain-structured factor-graph example per aligned line.

    The three UTF-8 files are read in lockstep. Each line is split on
    whitespace and wrapped in ``BOS`` / ``EOS``. For every aligned
    (word, POS, aspect) triple a ``PREDICTED`` variable named
    ``TAG_<index>`` is created over ``get_candidates(cl, word, pos)`` and
    assigned the gold state ``word|||aspect``; consecutive variables are
    joined by a ``"BIGRAM"`` ``CRFFactor``.

    Progress is reported on stderr: one ``.`` per sentence, plus a
    ``no cans:`` line whenever a token has exactly one candidate.

    Args:
        raw_file: Path to the file with the raw sentences.
        pos_file: Path to the file with the aligned POS tags.
        aspect_file: Path to the file with the aligned aspect labels.

    Returns:
        The ``FgExampleMemoryStore`` holding one ``LabeledFgExample`` per
        aligned line.

    Raises:
        AssertionError: If the gold state of a token is not among its
            candidates.
    """
    global cl

    def pad(text):
        # Tokenise on whitespace and add the sentence boundary markers.
        return [BOS] + text.strip().split() + [EOS]

    store = FgExampleMemoryStore()
    with codecs.open(aspect_file, 'r', 'utf8') as aspect_f, \
            codecs.open(raw_file, 'r', 'utf8') as raw_f, \
            codecs.open(pos_file, 'r', 'utf8') as pos_f:
        for raw_line, pos_line, aspect_line in zip(raw_f, pos_f, aspect_f):
            words = pad(raw_line)
            tags = pad(pos_line)
            aspects = pad(aspect_line)
            sys.stderr.write('.')

            graph = FactorGraph()
            gold_config = VarConfig()
            previous = None
            for position, (w, p, a) in enumerate(zip(words, tags, aspects)):
                # ``w`` is the bare word; it carries no aspect label.
                candidates = get_candidates(cl, w, p)
                if len(candidates) == 1:
                    sys.stderr.write('no cans:' + w + ' ' + p + '\n')

                current = Var(Var.VarType.PREDICTED, len(candidates),
                              "TAG_" + str(position), candidates)
                gold = w + '|||' + a
                gold_config.put(current, gold)
                assert gold in candidates

                if previous:
                    # Transition factor between the previous and current tag.
                    pair = VarSet(current)
                    pair.add(previous)
                    graph.addFactor(
                        CRFFactor(pair, [previous, current], "BIGRAM"))
                previous = current
            store.add(LabeledFgExample(graph, gold_config))
    return store