Beispiel #1
0
def make_gec_instances(raw_file, pos_file, correct_file=None):
	global nf, vf, df, pf, prof
	instances = FgExampleMemoryStore()
	raw_lines = [[BOS] + rl.strip().split() + [EOS] for rl in codecs.open(raw_file, 'r', 'utf8').readlines()]
	pos_lines = [[BOS] + pl.strip().split() + [EOS] for pl in codecs.open(pos_file, 'r', 'utf8').readlines()]
	if correct_file is not None:
		correct_lines = [[BOS] + cl.strip().split() + [EOS]  for cl in codecs.open(correct_file, 'r', 'utf8').readlines()]
		pass
	else:
		correct_lines = [None] * len(raw_lines)
		pass
	for line_idx in range(len(raw_lines)):
		factor_graph = FactorGraph()
		vc = VarConfig()
		prev_hc = None
		for w_idx, (w,p,c) in enumerate(zip(raw_lines[line_idx], pos_lines[line_idx], correct_lines[line_idx])):
			candidates = generate_correction_candidates(p,w)	
			print w, p, c
			print 'can', candidates, c
			hc = Var(Var.VarType.PREDICTED, len(candidates), "TAG_" + str(w_idx) , candidates)
			vc.put(hc, c)
			if prev_hc:
				t_varset = VarSet(hc)
				t_varset.add(prev_hc)
				t_factor = CRFFactor(t_varset, [prev_hc, hc], "BIGRAM")
				factor_graph.addFactor(t_factor)
			else:
				pass
			prev_hc = hc
			pass
		instances.add(LabeledFgExample(factor_graph, vc))
	return instances
Beispiel #2
0
def make_instances(txt_file, tag_list, obs_list):
    """Build one labeled factor-graph example per sentence of ``txt_file``.

    The file is split into sentences on the literal separator ``###/###``;
    blank sentences are dropped. Each line of a sentence is expected to look
    like ``observation/tag``. For every line a PREDICTED ``Var`` named
    ``TAG_<index>`` over ``tag_list`` is created and set to the line's tag in
    the gold config; consecutive variables are linked by a 'TAG-TAG'
    ``CRFFactor`` and every variable gets a 'TAG-OBS' ``ObservedFactor``
    carrying the observation.

    Args:
        txt_file: path of the training text.
        tag_list: state names shared by every hidden variable.
        obs_list: not used by this function; kept so existing callers keep
            working.

    Returns:
        An FgExampleMemoryStore with one LabeledFgExample per sentence.

    Raises:
        IndexError: if a line inside a sentence contains no '/'.
    """
    instances = FgExampleMemoryStore()
    # Read inside a context manager so the file handle is closed promptly.
    with open(txt_file) as txt_handle:
        raw_text = txt_handle.read()
    sentences = [chunk.strip() for chunk in raw_text.split('###/###')
                 if chunk.strip() != '']

    for sentence in sentences:
        factor_graph = FactorGraph()
        vc = VarConfig()
        prev_hc = None
        for i, line in enumerate(sentence.split('\n')):
            fields = line.split('/')
            observed_state = fields[0].strip()
            hidden_state = fields[1].strip()

            hc = Var(Var.VarType.PREDICTED, len(tag_list),
                     "TAG_" + str(i), tag_list)
            vc.put(hc, hidden_state)

            # Transition factor between the previous and the current tag.
            if prev_hc:
                t_varset = VarSet(hc)
                t_varset.add(prev_hc)
                factor_graph.addFactor(
                    CRFFactor(t_varset, [prev_hc, hc], 'TAG-TAG'))
            prev_hc = hc

            # Emission factor: the observation is passed in as a fixed value,
            # not modelled as a variable of its own.
            e_factor = ObservedFactor(VarSet(hc), [hc], 'TAG-OBS',
                                      observed_state)
            factor_graph.addFactor(e_factor)
        instances.add(LabeledFgExample(factor_graph, vc))
    return instances
Beispiel #3
0
def make_gec_instances(raw_file, pos_file, aspect_file):
    """Stream three aligned files into a store of labeled chain examples.

    The files are read in lockstep, one line at a time. Each line is split on
    whitespace and wrapped in BOS/EOS. For every token position a PREDICTED
    ``Var`` named ``TAG_<index>`` is built over the candidates returned by
    ``get_candidates(cl, word, tag)`` (``cl`` is a module-level global), its
    gold state is set to ``word|||aspect``, and it is tied to the previous
    variable with a "BIGRAM" ``CRFFactor``. A '.' is written to stderr per
    line, plus a 'no cans:' notice whenever exactly one candidate comes back.

    Returns:
        An FgExampleMemoryStore holding one LabeledFgExample per line triple.

    Raises:
        AssertionError: if the gold state is not among the candidates.
    """
    store = FgExampleMemoryStore()
    with codecs.open(aspect_file, 'r', 'utf8') as aspect_f, \
            codecs.open(raw_file, 'r', 'utf8') as raw_f, \
            codecs.open(pos_file, 'r', 'utf8') as pos_f:
        for raw_line, pos_line, aspect_line in zip(raw_f, pos_f, aspect_f):
            words = [BOS] + raw_line.strip().split() + [EOS]
            tags = [BOS] + pos_line.strip().split() + [EOS]
            aspects = [BOS] + aspect_line.strip().split() + [EOS]
            sys.stderr.write('.')

            graph = FactorGraph()
            gold = VarConfig()
            previous = None
            for position, (word, tag, aspect) in enumerate(
                    zip(words, tags, aspects)):
                # The raw word carries no aspect; candidates are looked up
                # from the word and its tag only.
                candidates = get_candidates(cl, word, tag)
                if len(candidates) == 1:
                    sys.stderr.write('no cans:' + word + ' ' + tag + '\n')

                current = Var(Var.VarType.PREDICTED, len(candidates),
                              "TAG_" + str(position), candidates)
                gold_state = word + '|||' + aspect
                gold.put(current, gold_state)
                assert gold_state in candidates

                if previous:
                    pair = VarSet(current)
                    pair.add(previous)
                    graph.addFactor(
                        CRFFactor(pair, [previous, current], "BIGRAM"))
                previous = current
            store.add(LabeledFgExample(graph, gold))
    return store
Beispiel #4
0
    def get(self, i):
        global tag2id, id2tag, obs2id, id2obs, obs_list
        #pdb.set_trace()
        ti = self.training_instances[i]
        tag_subset = self.en_vocab
        factor_graph = FactorGraph()
        vc = VarConfig()
        curr_g = {}
        curr_s = {}
        past_g_sent = {}
        curr_rg = {}
        curr_s_order = []

        for crg in ti.current_revealed_guesses:
            curr_rg[tuple(crg.id)] = crg

        for psg in ti.past_guesses_for_current_sent:
            past_g_sent[tuple(psg.id)] = psg

        for cg in ti.current_guesses:
            curr_g[tuple(cg.id)] = cg

        for cs in ti.current_sent:
            curr_s[tuple(cs.id)] = cs
            curr_s_order.append((int(cs.position), cs))
        curr_s_order.sort()
        var_map = {}
        fac_map = {}
        #pdb.set_trace()
        # fac_map_summary = {}
        for cso_id0, cso0 in curr_s_order:
            c, hs = self.get_hs(cso0, curr_rg, curr_g)
            print c, hs
            hc_var = Var(Var.VarType.PREDICTED, len(tag_subset),
                         'TAG_' + str(cso_id0), tag_subset)
            var_map[cso_id0] = (hc_var, hs, c)

            if not c:
                try:
                    vc.put(hc_var, hs)
                except:
                    print hs
                    print 'vc is broken...'
                    pdb.set_trace()
                assert cso0.id not in curr_rg
                e_factor = ObservedEFactor(VarSet(hc_var), hc_var,
                                           cso0.l2_word)
                fid = (cso_id0, 'er')
                fac_map[fid] = fac_map.get(fid, []) + [e_factor]
                # fac_map_summary[fid] = fac_map_summary.get(fid, []) + [cso0.l2_word + '->emission->' + str(cso_id0)]
            else:
                pass

        for cso_id1, cso1 in curr_s_order:
            for cso_id2, cso2 in curr_s_order:
                if cso_id1 != cso_id2 and cso_id1 - cso_id2 == 1:
                    (v1_hc, v1_hs, v1_c) = var_map[cso_id1]
                    (v2_hc, v2_hs, v2_c) = var_map[cso_id2]
                    if not v1_c and not v2_c:  # both vars are not revealed
                        fid = tuple(sorted([(cso_id1, 'h'), (cso_id2, 'h')]))
                        # there can be only 1 TT factor between 2 unobserved vars
                        if fid not in fac_map:
                            print fid, 'both are hidden'
                            t_varset = VarSet(v1_hc)
                            t_varset.add(v2_hc)
                            t_factor = TTFactor(t_varset,
                                                var1=v1_hc,
                                                var1pos=cso_id1,
                                                var2=v2_hc,
                                                var2pos=cso_id2)

                            fac_map[fid] = fac_map.get(fid, []) + [t_factor]
                            # fac_map_summary[fid] = fac_map_summary.get(fid, []) + [
                            #    str(cso_id1) + '->trans->' + str(cso_id2)]
                    elif not v1_c and v2_c:  # var 2 is reveled i.e. observed
                        fid = tuple(sorted([(cso_id1, 'h'), (cso_id2, 'r')]))
                        if fid not in fac_map:
                            print cso_id1, ' is hidden', cso_id2, 'is reveled'
                            t_factor = ObservedTFactor(VarSet(v1_hc),
                                                       var1=v1_hc,
                                                       var1pos=cso_id1,
                                                       var2=None,
                                                       var2pos=None,
                                                       observed_state=v2_hs)

                            # there can be multiple factors connected to a v1_hc
                            fac_map[fid] = fac_map.get(fid, []) + [t_factor]
                            # fac_map_summary[fid] = fac_map_summary.get(fid, []) + [
                            #    v2_hs + '->trans->' + str(cso_id1)]
                    elif v1_c and not v2_c:  # var 1 is reveled i.e. observed
                        fid = tuple(sorted([(cso_id1, 'r'), (cso_id2, 'h')]))
                        if fid not in fac_map:
                            print cso_id2, 'is hidden', cso_id1, 'is reveled'
                            t_factor = ObservedTFactor(VarSet(v2_hc),
                                                       var1=None,
                                                       var1pos=None,
                                                       var2=v2_hc,
                                                       var2pos=cso_id2,
                                                       observed_state=v1_hs)

                            fac_map[fid] = fac_map.get(fid, []) + [t_factor]
                            # fac_map_summary[fid] = fac_map_summary.get(fid, []) + [v1_hs + '->trans->' + str(cso_id2)]
                    else:
                        # this means both v1_hc and v2_hc are reveled, so we dont do anything..
                        pass

        for fid, factors in fac_map.items():
            for _factor in factors:
                try:
                    factor_graph.addFactor(_factor)
                except:
                    print 'something broken when adding factor to factor_graph'
                    pdb.set_trace()
        sys.stderr.write('.')
        return LabeledFgExample(factor_graph, vc)
Beispiel #5
0
    def get(self, i):
        global tag2id, id2tag, obs2id, id2obs, obs_list
        #pdb.set_trace()
        ti = self.training_instances[i]
        tag_subset = self.en_vocab
        factor_graph = FactorGraph()
        vc = VarConfig()
        curr_g = {}
        curr_s = {}
        past_g_sent = {}
        curr_rg = {}
        curr_s_order = []

        for crg in ti.current_revealed_guesses:
            curr_rg[tuple(crg.id)] = crg

        for psg in ti.past_guesses_for_current_sent:
            past_g_sent[tuple(psg.id)] = psg

        for cg in ti.current_guesses:
            curr_g[tuple(cg.id)] = cg

        for cs in ti.current_sent:
            curr_s[tuple(cs.id)] = cs
            curr_s_order.append((int(cs.position), cs))
        curr_s_order.sort()
        var_map = {}
        fac_map = {}
        #pdb.set_trace()
        # fac_map_summary = {}
        for cso_id0, cso0 in curr_s_order:
            c, hs = self.get_hs(cso0, curr_rg, curr_g)
            print c, hs
            hc_var = Var(Var.VarType.PREDICTED, len(tag_subset), 'TAG_' + str(cso_id0), tag_subset)
            var_map[cso_id0] = (hc_var, hs, c)

            if not c:
                try:
                    vc.put(hc_var, hs)
                except:
                    print hs
                    print 'vc is broken...'
                    pdb.set_trace()
                assert cso0.id not in curr_rg
                e_factor = ObservedEFactor(VarSet(hc_var), hc_var, cso0.l2_word)
                fid = (cso_id0, 'er')
                fac_map[fid] = fac_map.get(fid, []) + [e_factor]
                # fac_map_summary[fid] = fac_map_summary.get(fid, []) + [cso0.l2_word + '->emission->' + str(cso_id0)]
            else:
                pass

        for cso_id1, cso1 in curr_s_order:
            for cso_id2, cso2 in curr_s_order:
                if cso_id1 != cso_id2 and cso_id1 - cso_id2 == 1:
                    (v1_hc, v1_hs, v1_c) = var_map[cso_id1]
                    (v2_hc, v2_hs, v2_c) = var_map[cso_id2]
                    if not v1_c and not v2_c:  # both vars are not revealed
                        fid = tuple(sorted([(cso_id1, 'h'), (cso_id2, 'h')]))
                        # there can be only 1 TT factor between 2 unobserved vars
                        if fid not in fac_map:
                            print fid, 'both are hidden'
                            t_varset = VarSet(v1_hc)
                            t_varset.add(v2_hc)
                            t_factor = TTFactor(t_varset, var1=v1_hc,
                                                var1pos=cso_id1,
                                                var2=v2_hc,
                                                var2pos=cso_id2)

                            fac_map[fid] = fac_map.get(fid, []) + [t_factor]
                            # fac_map_summary[fid] = fac_map_summary.get(fid, []) + [
                            #    str(cso_id1) + '->trans->' + str(cso_id2)]
                    elif not v1_c and v2_c:  # var 2 is reveled i.e. observed
                        fid = tuple(sorted([(cso_id1, 'h'), (cso_id2, 'r')]))
                        if fid not in fac_map:
                            print cso_id1, ' is hidden', cso_id2, 'is reveled'
                            t_factor = ObservedTFactor(VarSet(v1_hc),
                                                       var1=v1_hc,
                                                       var1pos=cso_id1,
                                                       var2=None,
                                                       var2pos=None,
                                                       observed_state=v2_hs)

                            # there can be multiple factors connected to a v1_hc
                            fac_map[fid] = fac_map.get(fid, []) + [t_factor]
                            # fac_map_summary[fid] = fac_map_summary.get(fid, []) + [
                            #    v2_hs + '->trans->' + str(cso_id1)]
                    elif v1_c and not v2_c:  # var 1 is reveled i.e. observed
                        fid = tuple(sorted([(cso_id1, 'r'), (cso_id2, 'h')]))
                        if fid not in fac_map:
                            print cso_id2, 'is hidden', cso_id1, 'is reveled'
                            t_factor = ObservedTFactor(VarSet(v2_hc),
                                                       var1=None,
                                                       var1pos=None,
                                                       var2=v2_hc,
                                                       var2pos=cso_id2,
                                                       observed_state=v1_hs)

                            fac_map[fid] = fac_map.get(fid, []) + [t_factor]
                            # fac_map_summary[fid] = fac_map_summary.get(fid, []) + [v1_hs + '->trans->' + str(cso_id2)]
                    else:
                        # this means both v1_hc and v2_hc are reveled, so we dont do anything..
                        pass

        for fid, factors in fac_map.items():
            for _factor in factors:
                try:
                    factor_graph.addFactor(_factor)
                except:
                    print 'something broken when adding factor to factor_graph'
                    pdb.set_trace()
        sys.stderr.write('.')
        return LabeledFgExample(factor_graph, vc)