def print_features(self, options):
    """Write the train and test feature files for this instance.

    Two files are produced, ``TRAIN_DIR/<self.num>.txt`` and
    ``TEST_DIR/<self.num>.txt``; each is framed by an ``@boi`` line and an
    ``@eoi`` line.  The test file receives one note line and one feature
    line per (alignment, caseframe1, caseframe2) combination.  The train
    file receives only the combinations whose alignment equals
    ``self.gold_aligns``; if there are none, the lines returned by
    ``self.print_train_feature()`` are written instead.

    Args:
        options: object whose boolean attributes (``binary``, ``basic``,
            ``num``, ``cfsim``, ``supsim``, ``corearg``, ``origsim``,
            ``conflict``, ``compliment``) select which feature strings
            are emitted.
    """
    # get all align.
    if WITHOUT_DD:
        ALL_ALIGN = get_all_align("withoutDD")
    else:
        ALL_ALIGN = get_all_align("expanded_single")

    # get materials for supsim feature.
    nowPAS = PAStructure(self.num)
    supsim_dict = nowPAS.get_supsim_dictionary()
    self.get_compliment_feature_str(
        {}, nowPAS.PA2["arguments"]["all"], self.gold_aligns, write_log=True
    )

    # get candidate caseframes (index 0: verb1, index 1: verb2).
    cfs = []
    for PA in [nowPAS.PA1, nowPAS.PA2]:
        inst = CaseFrames(
            PA["verb"],
            PA["arguments"]["supporting"],
            top_k=TOP_N_CF,
            obligatory=PA["arguments"]["word_replace"],
        )
        cfs.append(inst.cfs)

    # get rid of caseframes inconsistent with original sentences: cut each
    # list at the first frame scoring below a tenth of the top score.
    # Iterating over the frames that actually exist (instead of indexing
    # range(TOP_N_CF)) avoids an IndexError on short or empty lists.
    for idx, frames in enumerate(cfs):
        if not frames or frames[0]["similarity_socre"] == 0:
            continue
        threshold = frames[0]["similarity_socre"] / 10
        for i, cf in enumerate(frames[:TOP_N_CF]):
            if cf["similarity_socre"] < threshold:
                cfs[idx] = frames[:i]
                break

    # log the surviving caseframe pairs.
    for cf1 in cfs[0]:
        for cf2 in cfs[1]:
            cf_str = self.get_cfsim_dictionary(cf1, cf2)["str"]
            sys.stderr.write("%s\n" % (cf_str))

    # initialize train & test files; `with` closes both even when a
    # feature helper raises part-way through.
    train_path = "%s/%s.txt" % (TRAIN_DIR, self.num)
    test_path = "%s/%s.txt" % (TEST_DIR, self.num)
    with open(train_path, "w") as train_file, open(test_path, "w") as test_file:
        train_file.write("@boi\n")
        test_file.write("@boi\n")

        # start printing features.
        # Label for the next train line: "+" until the first gold line has
        # been written, "-" afterwards.
        pos_or_neg_train = "+"
        for nowAlign in ALL_ALIGN:
            # binary feature string.
            binary_fstr = self.get_binary_feature_str(nowAlign)
            num_fstr = self.get_num_feature_str(nowAlign)
            # supsim feature string.
            supsim_fstr = self.get_supsim_feature_str(supsim_dict, nowAlign)

            # handle positive/negative instance issue.
            is_gold = nowAlign == self.gold_aligns
            pos_or_neg = "+" if is_gold else "-"

            for cf1 in cfs[0]:
                for cf2 in cfs[1]:
                    # note string.
                    cfs_id = "%s_%s" % (cf1["id"], cf2["id"])
                    eng_align_id = "-".join([re.sub("-", "", x) for x in nowAlign])
                    instance_id = "%s_%s_%s" % (
                        self.num,
                        self.get_instanceID(cfs_id),
                        eng_align_id,
                    )
                    hira_align_id = ",".join(
                        [replaceByDict(x, INV_GOLD_ALIGN_DICT) for x in nowAlign]
                    )
                    note = "# %s cfs:%s align:%s\n" % (instance_id, cfs_id, hira_align_id)

                    # cfsim feature string.
                    cfsim_dict = self.get_cfsim_dictionary(cf1, cf2)
                    cfsim_fstr = self.get_cfsim_feature_str(cfsim_dict, nowAlign)

                    # construct list of all features needed.
                    all_feature_list = []
                    if options.binary or options.basic:
                        all_feature_list.append(binary_fstr)
                    if options.num:
                        all_feature_list.append(num_fstr)
                    if options.cfsim or options.basic:
                        all_feature_list.append(cfsim_fstr)
                    if options.supsim:
                        all_feature_list.append(supsim_fstr)
                    if options.corearg:
                        # corearg feature string.
                        corearg_fstr = self.get_corearg_feature_str(
                            cf1["core"], cf2["core"], nowAlign
                        )
                        all_feature_list.append(corearg_fstr)
                    if options.origsim or options.basic:
                        # origsim feature string.
                        origsim_fstr = self.get_origsim_feature_str(cfsim_dict)
                        all_feature_list.append(origsim_fstr)
                    if options.conflict:
                        # conflict feature string.
                        conflict_fstr = self.get_conflict_feature_str(supsim_dict)
                        all_feature_list.append(conflict_fstr)
                    if options.compliment:
                        comp_fstr = self.get_compliment_feature_str(
                            cf2["core"], nowPAS.PA2["arguments"]["all"], nowAlign
                        )
                        all_feature_list.append(comp_fstr)

                    # write instance to test file.
                    joined = " ".join(all_feature_list)
                    test_file.write(note)
                    test_file.write("%s%s %s\n" % (pos_or_neg, instance_id, joined))

                    # write to train file.
                    if is_gold:
                        train_file.write(note)
                        train_file.write("%s %s\n" % (pos_or_neg_train, joined))
                        # NOTE(review): the flip to "-" is assumed to belong
                        # inside this gold branch (the emptiness check below
                        # relies on it) -- confirm.
                        pos_or_neg_train = "-"

        # tackle train file empty issue: no gold line was written above, so
        # fall back to print_train_feature() without its @boi/@eoi framing.
        if pos_or_neg_train == "+":
            for line in self.print_train_feature():
                if "@boi" in line or "@eoi" in line:
                    continue
                train_file.write(line)

        # terminate train & test files (closed by the `with` block).
        train_file.write("@eoi\n")
        test_file.write("@eoi\n")
def print_test_feature(self):
    """Build the test-instance lines for this example.

    Candidate alignments are enumerated according to ``TEST_CANDIDATE``
    ("multiple" or "single"; any other value yields no candidates), then
    one note line and one feature line are produced per
    (alignment, caseframe1, caseframe2) combination.

    Returns:
        list of str: ``"@boi\\n"``, the note/feature lines, ``"@eoi\\n"``.
    """
    # get all align.
    ALL_ALIGN = []
    # Tuples of already-collected alignments: same de-duplication as a
    # list membership test, without rescanning ALL_ALIGN every time.
    seen = set()
    if TEST_CANDIDATE == "multiple":
        ALL_SINGLE_ALIGN = list(product(CASE_ENG, CASE_ENG))
        for i in [1, 2, 3, 4]:
            for candi in combinations(ALL_SINGLE_ALIGN, i):
                align = ["%s-%s" % (a[0], a[1]) for a in candi]
                key = tuple(align)
                if key not in seen:
                    seen.add(key)
                    ALL_ALIGN.append(align)
    elif TEST_CANDIDATE == "single":
        for i in [1, 2, 3, 4]:
            for p1 in combinations(CASE_ENG, i):
                for p2 in permutations(CASE_ENG, i):
                    align = ["%s-%s" % (p1[x], p2[x]) for x in range(i)]
                    key = tuple(align)
                    if key not in seen:
                        seen.add(key)
                        ALL_ALIGN.append(align)

    nowPAS = PAStructure(self.num)
    supsim_dict = nowPAS.get_supsim_dictionary()

    # candidate caseframes (index 0: verb1, index 1: verb2).
    cfs = []
    for PA in [nowPAS.PA1, nowPAS.PA2]:
        inst = CaseFrames(
            PA["verb"],
            PA["arguments"]["supporting"],
            top_k=TOP_N_CF,
            obligatory=PA["arguments"]["word_replace"],
        )
        cfs.append(inst.cfs)

    # trim cf: cut each list at the first frame scoring below a tenth of
    # the top score.  Iterating over the frames that actually exist avoids
    # an IndexError on lists shorter than TOP_N_CF (or empty).
    for idx, frames in enumerate(cfs):
        if not frames or frames[0]["similarity_socre"] == 0:
            continue
        threshold = frames[0]["similarity_socre"] / 10
        for i, cf in enumerate(frames[:TOP_N_CF]):
            if cf["similarity_socre"] < threshold:
                cfs[idx] = frames[:i]
                break

    rtn = ["@boi\n"]
    for nowAlign in ALL_ALIGN:
        # binary feature string.
        binary_fstr = self.get_binary_feature_str(nowAlign)
        # computed as in the original; not included in the output line.
        num_fstr = self.get_num_feature_str(nowAlign)
        # supsim feature string.
        supsim_fstr = self.get_supsim_feature_str(supsim_dict, nowAlign)
        pos_or_neg = "+" if nowAlign == self.gold_aligns else "-"

        for cf1 in cfs[0]:
            for cf2 in cfs[1]:
                # note string.
                cfs_id = "%s_%s" % (cf1["id"], cf2["id"])
                eng_align_id = "-".join([re.sub("-", "", x) for x in nowAlign])
                instance_id = "%s_%s_%s" % (
                    self.num,
                    self.get_instanceID(cfs_id),
                    eng_align_id,
                )
                hira_align_id = ",".join(
                    [replaceByDict(x, INV_GOLD_ALIGN_DICT) for x in nowAlign]
                )
                note = "# %s cfs:%s align:%s" % (instance_id, cfs_id, hira_align_id)

                # cfsim feature string.
                cfsim_dict = self.get_cfsim_dictionary(cf1, cf2)
                cfsim_fstr = self.get_cfsim_feature_str(cfsim_dict, nowAlign)

                all_fstr_type = [
                    pos_or_neg + instance_id,
                    binary_fstr,
                    cfsim_fstr,
                    supsim_fstr,
                ]
                if CORE_ARG:
                    corearg_fstr = self.get_corearg_feature_str(
                        cf1["core"], cf2["core"], nowAlign
                    )
                    all_fstr_type.append(corearg_fstr)

                fstr = " ".join(all_fstr_type)
                rtn.append("%s\n" % (note))
                rtn.append("%s\n" % (fstr))
    rtn.append("@eoi\n")
    return rtn
def debug(self, options):
    """Run the feature helpers for the gold alignment only.

    Writes the ``"str"`` entry of every surviving caseframe pair's cfsim
    dictionary to stderr, then calls the feature-string helpers selected
    by ``options`` for ``self.gold_aligns``.  Nothing is returned and
    nothing is written to file.

    Args:
        options: object whose boolean attributes (``binary``, ``basic``,
            ``num``, ``cfsim``, ``supsim``, ``corearg``, ``origsim``,
            ``compliment``) select which feature strings are built.
    """
    # get materials for supsim feature.
    nowPAS = PAStructure(self.num)
    supsim_dict = nowPAS.get_supsim_dictionary()
    self.get_compliment_feature_str(
        {}, nowPAS.PA2["arguments"]["all"], self.gold_aligns, write_log=True
    )

    # get candidate caseframes (index 0: verb1, index 1: verb2).
    cfs = []
    for PA in [nowPAS.PA1, nowPAS.PA2]:
        inst = CaseFrames(
            PA["verb"],
            PA["arguments"]["supporting"],
            top_k=TOP_N_CF,
            obligatory=PA["arguments"]["word_replace"],
        )
        cfs.append(inst.cfs)

    # get rid of caseframes inconsistent with original sentences: cut each
    # list at the first frame scoring below a tenth of the top score.
    # Iterating over the frames that actually exist avoids an IndexError
    # on lists shorter than TOP_N_CF (or empty).
    for idx, frames in enumerate(cfs):
        if not frames or frames[0]["similarity_socre"] == 0:
            continue
        threshold = frames[0]["similarity_socre"] / 10
        for i, cf in enumerate(frames[:TOP_N_CF]):
            if cf["similarity_socre"] < threshold:
                cfs[idx] = frames[:i]
                break

    # get materials for cfsim feature.
    for cf1 in cfs[0]:
        for cf2 in cfs[1]:
            cf_str = self.get_cfsim_dictionary(cf1, cf2)["str"]
            sys.stderr.write("%s\n" % (cf_str))

    # binary feature string.
    nowAlign = self.gold_aligns
    binary_fstr = self.get_binary_feature_str(nowAlign)
    num_fstr = self.get_num_feature_str(nowAlign)
    # supsim feature string.
    supsim_fstr = self.get_supsim_feature_str(supsim_dict, nowAlign)

    for cf1 in cfs[0]:
        for cf2 in cfs[1]:
            # cfsim feature string.
            cfsim_dict = self.get_cfsim_dictionary(cf1, cf2)
            cfsim_fstr = self.get_cfsim_feature_str(cfsim_dict, nowAlign)

            # construct list of all features needed.
            # NOTE(review): the list is never written or returned; the
            # helper calls are kept, presumably for their logging side
            # effects -- confirm before removing.
            all_feature_list = []
            if options.binary or options.basic:
                all_feature_list.append(binary_fstr)
            if options.num:
                all_feature_list.append(num_fstr)
            if options.cfsim or options.basic:
                all_feature_list.append(cfsim_fstr)
            if options.supsim:
                all_feature_list.append(supsim_fstr)
            if options.corearg:
                # corearg feature string.
                corearg_fstr = self.get_corearg_feature_str(
                    cf1["core"], cf2["core"], nowAlign
                )
                all_feature_list.append(corearg_fstr)
            if options.origsim or options.basic:
                # origsim feature string.
                origsim_fstr = self.get_origsim_feature_str(cfsim_dict)
                all_feature_list.append(origsim_fstr)
            if options.compliment:
                comp_fstr = self.get_compliment_feature_str(
                    cf2["core"], nowPAS.PA2["arguments"]["all"], nowAlign
                )
                all_feature_list.append(comp_fstr)
def print_train_feature(self):
    """Build train-instance lines for the gold alignment.

    One note line and one feature line are produced per surviving
    (caseframe1, caseframe2) pair, all for ``self.gold_aligns``.  Exactly
    one pair, chosen uniformly at random, is labelled ``+``; the others
    are labelled ``-``.

    NOTE(review): ``options`` is not a parameter here; it is resolved from
    an enclosing/module scope at call time -- confirm it is defined there.

    Returns:
        list of str: ``"@boi\\n"``, the note/feature lines, ``"@eoi\\n"``.
    """
    # get binary feature string.
    binary_fstr = self.get_binary_feature_str(self.gold_aligns)
    # get num feature string.
    num_fstr = self.get_num_feature_str(self.gold_aligns)
    # get support similarity string.
    nowPAS = PAStructure(self.num)
    supsim_dict = nowPAS.get_supsim_dictionary()
    supsim_fstr = self.get_supsim_feature_str(supsim_dict, self.gold_aligns)

    # load candidate caseframes (index 0: verb1, index 1: verb2).
    cfs = []
    for PA in [nowPAS.PA1, nowPAS.PA2]:
        inst = CaseFrames(
            PA["verb"],
            PA["arguments"]["supporting"],
            top_k=TOP_N_CF,
            obligatory=PA["arguments"]["word_replace"],
        )
        cfs.append(inst.cfs)

    # keep only the leading caseframes scoring at least a tenth of the
    # top score.  Iterating over the frames that actually exist avoids an
    # IndexError on lists shorter than TOP_N_CF (or empty).
    for idx, frames in enumerate(cfs):
        if not frames or frames[0]["similarity_socre"] == 0:
            continue
        threshold = frames[0]["similarity_socre"] / 10
        for i, cf in enumerate(frames[:TOP_N_CF]):
            if cf["similarity_socre"] < threshold:
                cfs[idx] = frames[:i]
                break
    sys.stderr.write(
        "%s cf left for verb1, %s cf left for verb2.\n" % (len(cfs[0]), len(cfs[1]))
    )

    # randomly choose which pair becomes the positive example.  With no
    # pairs at all randint(0, -1) would raise ValueError, so skip the draw.
    pair_total = len(cfs[0]) * len(cfs[1])
    pos = randint(0, pair_total - 1) if pair_total else -1

    rtn = ["@boi\n"]
    count = 0
    for cf1 in cfs[0]:
        for cf2 in cfs[1]:
            # generate note string.
            cfs_id = "%s_%s" % (cf1["id"], cf2["id"])
            eng_align_id = "-".join([re.sub("-", "", x) for x in self.gold_aligns])
            instance_id = "%s_%s_%s" % (
                self.num,
                self.get_instanceID(cfs_id),
                eng_align_id,
            )
            hira_align_id = ",".join(
                [replaceByDict(x, INV_GOLD_ALIGN_DICT) for x in self.gold_aligns]
            )
            note = "# %s cfs:%s align:%s" % (instance_id, cfs_id, hira_align_id)

            # pos & neg issue.
            pos_or_neg = "+" if count == pos else "-"
            count += 1

            # get caseframe information.
            cfsim_dict = self.get_cfsim_dictionary(cf1, cf2)
            cfsim_fstr = self.get_cfsim_feature_str(cfsim_dict, self.gold_aligns)

            # list of all features.
            all_feature_list = []
            if options.binary or options.basic:
                all_feature_list.append(binary_fstr)
            if options.num:
                all_feature_list.append(num_fstr)
            if options.cfsim or options.basic:
                all_feature_list.append(cfsim_fstr)
            if options.supsim:
                all_feature_list.append(supsim_fstr)
            if options.corearg:
                # corearg feature string.
                corearg_fstr = self.get_corearg_feature_str(
                    cf1["core"], cf2["core"], self.gold_aligns
                )
                all_feature_list.append(corearg_fstr)
            # `or`, not `and`: origsim is gated the same way as in
            # print_features, so fallback train lines carry the same
            # feature set as the test lines.
            if options.origsim or options.basic:
                # origsim feature string.
                origsim_fstr = self.get_origsim_feature_str(cfsim_dict)
                all_feature_list.append(origsim_fstr)
            if options.conflict:
                # conflict feature string.
                conflict_fstr = self.get_conflict_feature_str(supsim_dict)
                all_feature_list.append(conflict_fstr)
            if options.compliment:
                comp_fstr = self.get_compliment_feature_str(
                    cf2["core"], nowPAS.PA2["arguments"]["all"], self.gold_aligns
                )
                all_feature_list.append(comp_fstr)

            fstr = " ".join(all_feature_list)
            rtn.append("%s\n" % (note))
            rtn.append("%s %s\n" % (pos_or_neg, fstr))
    rtn.append("@eoi\n")
    return rtn