Beispiel #1
0
    def print_features(self, options):
        """Write the train and test feature files for this instance.

        Candidate caseframes are fetched for both predicates, pruned by their
        similarity score, and combined with every candidate alignment.  Each
        (alignment, caseframe pair) combination yields a note line and a
        feature line in ``TEST_DIR/<num>.txt``; combinations whose alignment
        equals ``self.gold_aligns`` are additionally written to
        ``TRAIN_DIR/<num>.txt``.  If no combination reached the train file,
        the lines produced by ``self.print_train_feature()`` are used instead.

        Args:
            options: object whose truthy attributes ``binary``, ``basic``,
                ``num``, ``cfsim``, ``supsim``, ``corearg``, ``origsim``,
                ``conflict`` and ``compliment`` select the feature groups
                that are emitted.
        """
        # get all align.
        all_aligns = get_all_align("withoutDD" if WITHOUT_DD else "expanded_single")
        # get materials for supsim feature.
        nowPAS = PAStructure(self.num)
        supsim_dict = nowPAS.get_supsim_dictionary()
        self.get_compliment_feature_str({}, nowPAS.PA2["arguments"]["all"], self.gold_aligns, write_log=True)
        # get candidate caseframes.
        cfs = []
        for PA in [nowPAS.PA1, nowPAS.PA2]:
            inst = CaseFrames(
                PA["verb"],
                PA["arguments"]["supporting"],
                top_k=TOP_N_CF,
                obligatory=PA["arguments"]["word_replace"],
            )
            cfs.append(inst.cfs)
        # Prune, for each verb, every caseframe from the first one whose score
        # drops below a tenth of the best score.  Only the first TOP_N_CF
        # entries are inspected; slicing (rather than indexing up to TOP_N_CF)
        # keeps this safe when fewer candidates were returned, and an empty
        # candidate list is simply left as is.
        for which in (0, 1):
            candidates = cfs[which]
            if not candidates or candidates[0]["similarity_socre"] == 0:
                continue
            threshold = candidates[0]["similarity_socre"] / 10
            for i, cf in enumerate(candidates[:TOP_N_CF]):
                if cf["similarity_socre"] < threshold:
                    cfs[which] = candidates[:i]
                    break
        # log the string form of every surviving caseframe pair.
        for cf1 in cfs[0]:
            for cf2 in cfs[1]:
                cf_str = self.get_cfsim_dictionary(cf1, cf2)["str"]
                sys.stderr.write("%s\n" % (cf_str))

        # The context manager closes both files even if feature generation
        # raises part-way through.
        with open("%s/%s.txt" % (TRAIN_DIR, self.num), "w") as train_file, \
                open("%s/%s.txt" % (TEST_DIR, self.num), "w") as test_file:
            train_file.write("@boi\n")
            test_file.write("@boi\n")
            # Label for the next train line: the first gold line written is
            # "+", every later one is "-".
            pos_or_neg_train = "+"
            for nowAlign in all_aligns:
                # features that depend on the alignment only.
                binary_fstr = self.get_binary_feature_str(nowAlign)
                num_fstr = self.get_num_feature_str(nowAlign)
                supsim_fstr = self.get_supsim_feature_str(supsim_dict, nowAlign)

                is_gold = nowAlign == self.gold_aligns
                pos_or_neg = "+" if is_gold else "-"

                for cf1 in cfs[0]:
                    for cf2 in cfs[1]:
                        # note string.
                        cfs_id = "%s_%s" % (cf1["id"], cf2["id"])
                        eng_align_id = "-".join([re.sub("-", "", x) for x in nowAlign])
                        instance_id = "%s_%s_%s" % (self.num, self.get_instanceID(cfs_id), eng_align_id)
                        hira_align_id = ",".join([replaceByDict(x, INV_GOLD_ALIGN_DICT) for x in nowAlign])
                        note = "# %s cfs:%s align:%s\n" % (instance_id, cfs_id, hira_align_id)
                        # cfsim feature string.
                        cfsim_dict = self.get_cfsim_dictionary(cf1, cf2)
                        cfsim_fstr = self.get_cfsim_feature_str(cfsim_dict, nowAlign)
                        # construct list of all features needed.
                        all_feature_list = []
                        if options.binary or options.basic:
                            all_feature_list.append(binary_fstr)
                        if options.num:
                            all_feature_list.append(num_fstr)
                        if options.cfsim or options.basic:
                            all_feature_list.append(cfsim_fstr)
                        if options.supsim:
                            all_feature_list.append(supsim_fstr)
                        if options.corearg:
                            all_feature_list.append(
                                self.get_corearg_feature_str(cf1["core"], cf2["core"], nowAlign)
                            )
                        if options.origsim or options.basic:
                            all_feature_list.append(self.get_origsim_feature_str(cfsim_dict))
                        if options.conflict:
                            all_feature_list.append(self.get_conflict_feature_str(supsim_dict))
                        if options.compliment:
                            all_feature_list.append(
                                self.get_compliment_feature_str(
                                    cf2["core"], nowPAS.PA2["arguments"]["all"], nowAlign
                                )
                            )
                        joined = " ".join(all_feature_list)
                        # write instance to test file.
                        test_file.write(note)
                        test_file.write("%s%s %s\n" % (pos_or_neg, instance_id, joined))
                        # write to train file (gold alignment only).
                        if is_gold:
                            train_file.write(note)
                            train_file.write("%s %s\n" % (pos_or_neg_train, joined))
                            pos_or_neg_train = "-"
            # Nothing reached the train file: fall back to the lines from
            # print_train_feature(), dropping its own @boi/@eoi markers.
            if pos_or_neg_train == "+":
                for line in self.print_train_feature():
                    if "@boi" in line or "@eoi" in line:
                        continue
                    train_file.write(line)

            train_file.write("@eoi\n")
            test_file.write("@eoi\n")
Beispiel #2
0
    def print_test_feature(self):
        """Build the test-instance lines for this example.

        Candidate alignments are enumerated according to ``TEST_CANDIDATE``
        ("multiple" or "single"; any other value yields no candidates) and
        crossed with the pruned caseframe candidates of both predicates.

        Returns:
            list of str: ``"@boi\\n"``, then a note line and a feature line for
            every (alignment, caseframe pair) combination, then ``"@eoi\\n"``.
            A feature line starts with "+" when the alignment equals
            ``self.gold_aligns`` and with "-" otherwise.
        """
        # get all align.  `seen` mirrors `all_aligns` as hashable tuples so the
        # duplicate check is a set lookup instead of a scan over a list of
        # lists; insertion order of `all_aligns` is unchanged.
        all_aligns = []
        seen = set()
        if TEST_CANDIDATE == "multiple":
            single_aligns = list(product(CASE_ENG, CASE_ENG))
            for size in [1, 2, 3, 4]:
                for candi in combinations(single_aligns, size):
                    align = ["%s-%s" % (a[0], a[1]) for a in candi]
                    key = tuple(align)
                    if key not in seen:
                        seen.add(key)
                        all_aligns.append(align)
        elif TEST_CANDIDATE == "single":
            for size in [1, 2, 3, 4]:
                for p1 in combinations(CASE_ENG, size):
                    for p2 in permutations(CASE_ENG, size):
                        align = ["%s-%s" % (p1[x], p2[x]) for x in range(size)]
                        key = tuple(align)
                        if key not in seen:
                            seen.add(key)
                            all_aligns.append(align)
        nowPAS = PAStructure(self.num)
        supsim_dict = nowPAS.get_supsim_dictionary()
        # get candidate caseframes.
        cfs = []
        for PA in [nowPAS.PA1, nowPAS.PA2]:
            inst = CaseFrames(
                PA["verb"],
                PA["arguments"]["supporting"],
                top_k=TOP_N_CF,
                obligatory=PA["arguments"]["word_replace"],
            )
            cfs.append(inst.cfs)

        # trim cf: for each verb, cut the list at the first caseframe whose
        # score is below a tenth of the best score.  Only the first TOP_N_CF
        # entries are inspected; slicing avoids an IndexError when fewer
        # candidates exist, and an empty list is left untouched.
        for which in (0, 1):
            candidates = cfs[which]
            if not candidates or candidates[0]["similarity_socre"] == 0:
                continue
            threshold = candidates[0]["similarity_socre"] / 10
            for i, cf in enumerate(candidates[:TOP_N_CF]):
                if cf["similarity_socre"] < threshold:
                    cfs[which] = candidates[:i]
                    break

        rtn = ["@boi\n"]
        for nowAlign in all_aligns:
            # binary feature string.
            binary_fstr = self.get_binary_feature_str(nowAlign)
            # NOTE(review): the num feature string is computed but not emitted
            # by this method; the call is kept in case it has side effects.
            num_fstr = self.get_num_feature_str(nowAlign)
            # supsim feature string.
            supsim_fstr = self.get_supsim_feature_str(supsim_dict, nowAlign)
            pos_or_neg = "+" if nowAlign == self.gold_aligns else "-"

            for cf1 in cfs[0]:
                for cf2 in cfs[1]:
                    # note string.
                    cfs_id = "%s_%s" % (cf1["id"], cf2["id"])
                    eng_align_id = "-".join([re.sub("-", "", x) for x in nowAlign])
                    instance_id = "%s_%s_%s" % (self.num, self.get_instanceID(cfs_id), eng_align_id)
                    hira_align_id = ",".join([replaceByDict(x, INV_GOLD_ALIGN_DICT) for x in nowAlign])
                    note = "# %s cfs:%s align:%s" % (instance_id, cfs_id, hira_align_id)
                    # cfsim feature string.
                    cfsim_dict = self.get_cfsim_dictionary(cf1, cf2)
                    cfsim_fstr = self.get_cfsim_feature_str(cfsim_dict, nowAlign)
                    all_fstr_type = [pos_or_neg + instance_id, binary_fstr, cfsim_fstr, supsim_fstr]
                    if CORE_ARG:
                        all_fstr_type.append(
                            self.get_corearg_feature_str(cf1["core"], cf2["core"], nowAlign)
                        )
                    rtn.append("%s\n" % (note))
                    rtn.append("%s\n" % (" ".join(all_fstr_type)))
        rtn.append("@eoi\n")
        return rtn
Beispiel #3
0
 def debug(self, options):
     """Run the feature pipeline for the gold alignment only.

     Fetches and prunes the caseframe candidates of both predicates, writes
     the string form of every surviving caseframe pair to stderr, and then
     evaluates the feature groups selected by ``options`` for
     ``self.gold_aligns`` on every pair.

     Args:
         options: object whose truthy attributes ``binary``, ``basic``,
             ``num``, ``cfsim``, ``supsim``, ``corearg``, ``origsim`` and
             ``compliment`` select the feature groups to evaluate.

     NOTE(review): the assembled feature list is not used further inside
     this method and nothing is returned; presumably the feature getters
     are called for their logging -- confirm.
     """
     # get materials for supsim feature.
     nowPAS = PAStructure(self.num)
     supsim_dict = nowPAS.get_supsim_dictionary()
     self.get_compliment_feature_str({}, nowPAS.PA2["arguments"]["all"], self.gold_aligns, write_log=True)
     # get candidate caseframes.
     cfs = []
     for PA in [nowPAS.PA1, nowPAS.PA2]:
         inst = CaseFrames(
             PA["verb"],
             PA["arguments"]["supporting"],
             top_k=TOP_N_CF,
             obligatory=PA["arguments"]["word_replace"],
         )
         cfs.append(inst.cfs)
     # Prune, for each verb, every caseframe from the first one whose score
     # is below a tenth of the best score.  Only the first TOP_N_CF entries
     # are inspected; slicing avoids an IndexError when fewer candidates
     # exist, and an empty list is left untouched.
     for which in (0, 1):
         candidates = cfs[which]
         if not candidates or candidates[0]["similarity_socre"] == 0:
             continue
         threshold = candidates[0]["similarity_socre"] / 10
         for i, cf in enumerate(candidates[:TOP_N_CF]):
             if cf["similarity_socre"] < threshold:
                 cfs[which] = candidates[:i]
                 break
     # get materials for cfsim feature.
     for cf1 in cfs[0]:
         for cf2 in cfs[1]:
             cf_str = self.get_cfsim_dictionary(cf1, cf2)["str"]
             sys.stderr.write("%s\n" % (cf_str))
     # features that depend on the (gold) alignment only.
     nowAlign = self.gold_aligns
     binary_fstr = self.get_binary_feature_str(nowAlign)
     num_fstr = self.get_num_feature_str(nowAlign)
     supsim_fstr = self.get_supsim_feature_str(supsim_dict, nowAlign)
     for cf1 in cfs[0]:
         for cf2 in cfs[1]:
             # cfsim feature string.
             cfsim_dict = self.get_cfsim_dictionary(cf1, cf2)
             cfsim_fstr = self.get_cfsim_feature_str(cfsim_dict, nowAlign)
             # construct list of all features needed.
             all_feature_list = []
             if options.binary or options.basic:
                 all_feature_list.append(binary_fstr)
             if options.num:
                 all_feature_list.append(num_fstr)
             if options.cfsim or options.basic:
                 all_feature_list.append(cfsim_fstr)
             if options.supsim:
                 all_feature_list.append(supsim_fstr)
             if options.corearg:
                 all_feature_list.append(
                     self.get_corearg_feature_str(cf1["core"], cf2["core"], nowAlign)
                 )
             if options.origsim or options.basic:
                 all_feature_list.append(self.get_origsim_feature_str(cfsim_dict))
             if options.compliment:
                 all_feature_list.append(
                     self.get_compliment_feature_str(cf2["core"], nowPAS.PA2["arguments"]["all"], nowAlign)
                 )
Beispiel #4
0
    def print_train_feature(self):
        """Build the training-instance lines for the gold alignment.

        Every surviving caseframe pair is combined with ``self.gold_aligns``;
        one pair, chosen at random, is labelled "+" and all others "-".

        NOTE(review): ``options`` is not a parameter of this method; it is
        resolved from an enclosing/module scope -- confirm it is defined
        there before this method is called.

        Returns:
            list of str: ``"@boi\\n"``, then a note line and a labelled
            feature line per caseframe pair, then ``"@eoi\\n"``.
        """
        # features that depend on the gold alignment only.
        binary_fstr = self.get_binary_feature_str(self.gold_aligns)
        num_fstr = self.get_num_feature_str(self.gold_aligns)
        nowPAS = PAStructure(self.num)
        supsim_dict = nowPAS.get_supsim_dictionary()
        supsim_fstr = self.get_supsim_feature_str(supsim_dict, self.gold_aligns)
        # load candidate caseframes.
        cfs = []
        for PA in [nowPAS.PA1, nowPAS.PA2]:
            inst = CaseFrames(
                PA["verb"],
                PA["arguments"]["supporting"],
                top_k=TOP_N_CF,
                obligatory=PA["arguments"]["word_replace"],
            )
            cfs.append(inst.cfs)
        # Prune, for each verb, every caseframe from the first one whose score
        # is below a tenth of the best score.  Only the first TOP_N_CF entries
        # are inspected; slicing avoids an IndexError when fewer candidates
        # exist, and an empty list is left untouched.
        for which in (0, 1):
            candidates = cfs[which]
            if not candidates or candidates[0]["similarity_socre"] == 0:
                continue
            threshold = candidates[0]["similarity_socre"] / 10
            for i, cf in enumerate(candidates[:TOP_N_CF]):
                if cf["similarity_socre"] < threshold:
                    cfs[which] = candidates[:i]
                    break
        sys.stderr.write("%s cf left for verb1, %s cf left for verb2.\n" % (len(cfs[0]), len(cfs[1])))
        # Randomly choose which pair becomes the positive example.  With no
        # pair left, randint(0, -1) would raise ValueError, so use an index
        # that never matches and return only the @boi/@eoi markers.
        pair_count = len(cfs[0]) * len(cfs[1])
        pos = randint(0, pair_count - 1) if pair_count else -1
        rtn = ["@boi\n"]
        count = 0
        for cf1 in cfs[0]:
            for cf2 in cfs[1]:
                # generate note string.
                cfs_id = "%s_%s" % (cf1["id"], cf2["id"])
                eng_align_id = "-".join([re.sub("-", "", x) for x in self.gold_aligns])
                instance_id = "%s_%s_%s" % (self.num, self.get_instanceID(cfs_id), eng_align_id)
                hira_align_id = ",".join([replaceByDict(x, INV_GOLD_ALIGN_DICT) for x in self.gold_aligns])
                note = "# %s cfs:%s align:%s" % (instance_id, cfs_id, hira_align_id)
                # pos & neg issue.
                pos_or_neg = "+" if count == pos else "-"
                count += 1
                # get caseframe information.
                cfsim_dict = self.get_cfsim_dictionary(cf1, cf2)
                cfsim_fstr = self.get_cfsim_feature_str(cfsim_dict, self.gold_aligns)
                # list of all features.
                all_feature_list = []
                if options.binary or options.basic:
                    all_feature_list.append(binary_fstr)
                if options.num:
                    all_feature_list.append(num_fstr)
                if options.cfsim or options.basic:
                    all_feature_list.append(cfsim_fstr)
                if options.supsim:
                    all_feature_list.append(supsim_fstr)
                if options.corearg:
                    all_feature_list.append(
                        self.get_corearg_feature_str(cf1["core"], cf2["core"], self.gold_aligns)
                    )
                # `or`, matching print_features/debug: with `and`, the
                # origsim group was dropped from train lines whenever only
                # one of the two flags was set, so train and test lines
                # carried different feature groups.
                if options.origsim or options.basic:
                    all_feature_list.append(self.get_origsim_feature_str(cfsim_dict))
                if options.conflict:
                    all_feature_list.append(self.get_conflict_feature_str(supsim_dict))
                if options.compliment:
                    all_feature_list.append(
                        self.get_compliment_feature_str(
                            cf2["core"], nowPAS.PA2["arguments"]["all"], self.gold_aligns
                        )
                    )

                fstr = " ".join(all_feature_list)
                rtn.append("%s\n" % (note))
                rtn.append("%s %s\n" % (pos_or_neg, fstr))
        rtn.append("@eoi\n")
        return rtn