def get_ood_data(templates, train_dev_partition, test_partition, fo_test_ovit, fo_test_ovot, output_header):
    """Generate out-of-distribution test examples and write two CSV splits.

    For each template, samples 300 unique examples (re-sampling on
    duplicates), assigns sequential guids, then writes the rows filtered by
    the train/dev template partition (ovit) and by the test template
    partition (ovot).
    """
    set_vocab_by_type('ood')
    print('ood')
    num_examples = 300
    rows = []
    next_guid = 0
    for template in templates:
        seen = []
        for _ in range(num_examples):
            # Re-draw until the sample is not a duplicate for this template.
            sample = template.generate_one_example()
            while sample in seen:
                sample = template.generate_one_example()
            rows.append([next_guid] + sample)
            next_guid += 1
            seen.append(sample)
    # Keep only rows whose template falls in the requested partition.
    rows_ovit = filter_by_template_partition(train_dev_partition, rows)
    rows_ovot = filter_by_template_partition(test_partition, rows)
    write_csv(fo_test_ovit, rows_ovit, output_header)
    write_csv(fo_test_ovot, rows_ovot, output_header)
def main():
    """Annotate esnli_train_subseq.csv rows with matching HANS template names.

    Reads the support-case CSV, POS-tags premise (col 2) and hypothesis
    (col 3), and for every row whose (label, POS) pair matches a HANS
    template, stores the template name in the last column and keeps the row.
    Matched rows are written to ./esnli_train_subseq_with_templates.csv with
    the original header.
    """
    hans_found_support_case = './esnli_train_subseq.csv'
    output_file = './esnli_train_subseq_with_templates.csv'
    output_header = None
    output_rows = []
    hans_templates = get_hans_templates()
    with open(hans_found_support_case) as f:
        reader = csv.reader(f)
        for (i, line) in enumerate(reader):
            if i == 0:
                # First row is the header; passed through to write_csv.
                output_header = line
                continue
            label = line[1]
            premise_pos = get_pos_tags(line[2])
            hypothesis_pos = get_pos_tags(line[3])
            template_name = match_templates(label, premise_pos,
                                            hypothesis_pos, hans_templates)
            # Fixed: identity comparison with None (was `!= None`).
            if template_name is not None:
                line[-1] = template_name
                print('template_name: ', template_name)
                output_rows.append(line)
    write_csv(output_file, output_rows, output_header)
def main():
    """Copy templates.csv to templates_new.csv with every data cell trimmed
    of leading and trailing spaces.

    The header row (row 0) is passed through untouched as the CSV header;
    all later rows have each cell space-trimmed.
    """
    fi = './templates.csv'
    fo = './templates_new.csv'
    rows = []
    # Initialized so an empty input file no longer raises NameError at
    # write_csv time.
    header = None
    with open(fi) as f:
        reader = csv.reader(f)
        for (i, line) in enumerate(reader):
            if i == 0:
                header = line
            else:
                # str.strip(' ') trims leading/trailing spaces in one pass
                # and, unlike the original index-based while loops
                # (`while item[0] == ' '`), does not raise IndexError on an
                # empty or all-space cell.
                rows.append([item.strip(' ') for item in line])
    write_csv(fo, rows, header)
def main():
    """Derive three explanation variants of every split CSV.

    For each seed/partition/split file, writes three 19-column e-SNLI-format
    CSVs: one carrying the natural-language explanation (_nl), one carrying
    the pointer-only explanation (_pt), and one with the explanation column
    left empty (_empty_expl).
    """
    data_dir_name = 'split_abundant_words_subcases'
    num_seeds = 1
    fi_name_list = ['dev_1', 'dev_2', 'dev_4', 'dev_7', 'dev_13', 'dev_32',
                    'train_1', 'train_2', 'train_4', 'train_8', 'train_16',
                    'train_32', 'train_64', 'test_ivit_300', 'test_ivot_300',
                    'test_ovit_300', 'test_ovot_300']

    def esnli_row(guid, label, premise, hypothesis, expl):
        # Build one 19-column e-SNLI row; only the first five cells are used.
        row = [""] * 19
        row[0], row[1], row[2], row[3], row[4] = guid, label, premise, hypothesis, expl
        return row

    for seed in range(num_seeds):
        for partition in range(5):
            path = './%s/seed%d/partition%d/' % (data_dir_name, seed, partition)
            for fi in fi_name_list:
                fi_path = path + fi + '.csv'
                fo_nl = path + fi + '_nl.csv'  # natural language explanation
                fo_pt = path + fi + '_pt.csv'  # pointer-only explanations
                fo_empty_expl = path + fi + '_empty_expl.csv'  # empty_expl
                nl_rows = []
                pt_rows = []
                empty_expl_rows = []
                with open(fi_path) as f:
                    reader = csv.reader(f)
                    for (i, line) in enumerate(reader):
                        if i == 0:
                            continue  # skip header row
                        guid, label = line[0], line[5]
                        p, h = line[-4], line[-3]
                        nl, pt = line[-2], line[-1]
                        nl_rows.append(esnli_row(guid, label, p, h, nl))
                        pt_rows.append(esnli_row(guid, label, p, h, pt))
                        empty_expl_rows.append(esnli_row(guid, label, p, h, ""))
                write_csv(fo_nl, nl_rows, esnli_format_header)
                write_csv(fo_pt, pt_rows, esnli_format_header)
                write_csv(fo_empty_expl, empty_expl_rows, esnli_format_header)
def get_ind_data(templates, train_dev_partition, test_partition, fo_dir, fo_train, fo_dev, fo_test_ivit, fo_test_ivot, output_header):
    """Generate the in-distribution splits and write them to CSV.

    Per template, samples 492 unique examples: the first 160 go to train,
    the next 32 to dev, the remaining 300 to test. Each split gets its own
    guid sequence. Writes the full train/dev splits and two test splits
    (ivit filtered by the train/dev partition, ivot by the test partition),
    then few-shot train/dev subsamples taken per template.
    """
    set_vocab_by_type('ind')
    print('ind')
    num_examples = 492
    output_rows_train = []
    output_rows_dev = []
    output_rows_test = []
    guid_train = 0
    guid_dev = 0
    guid_test = 0
    output_rows_train_by_template = {}
    output_rows_dev_by_template = {}
    for template in templates:
        seen = []
        for i in range(num_examples):
            # Re-draw until the sample is unique for this template.
            sample = template.generate_one_example()
            while sample in seen:
                sample = template.generate_one_example()
            if i < 160:
                row = [guid_train] + sample
                # sample[0] is the template id; group train rows by it.
                output_rows_train_by_template.setdefault(sample[0], []).append(row)
                output_rows_train.append(row)
                guid_train += 1
            elif i < 192:
                row = [guid_dev] + sample
                output_rows_dev_by_template.setdefault(sample[0], []).append(row)
                output_rows_dev.append(row)
                guid_dev += 1
            else:
                output_rows_test.append([guid_test] + sample)
                guid_test += 1
            seen.append(sample)
    # Filter each split by the relevant template partition and write it out.
    write_csv(fo_train,
              filter_by_template_partition(train_dev_partition, output_rows_train),
              output_header)
    write_csv(fo_dev,
              filter_by_template_partition(train_dev_partition, output_rows_dev),
              output_header)
    write_csv(fo_test_ivit,
              filter_by_template_partition(train_dev_partition, output_rows_test),
              output_header)
    write_csv(fo_test_ivot,
              filter_by_template_partition(test_partition, output_rows_test),
              output_header)
    # Few-shot subsamples for train and dev: take the first k rows of each
    # template's group, then apply the train/dev partition filter.
    train_sample_sizes = [1, 2, 4, 8, 16, 32, 64]
    dev_sample_sizes = list(set([int(0.2 * k) + 1 for k in train_sample_sizes]))
    for train_size in train_sample_sizes:
        fo_train = '%strain_%d.csv' % (fo_dir, train_size)
        sampled = []
        for group in output_rows_train_by_template.values():
            sampled.extend(group[:train_size])
        write_csv(fo_train,
                  filter_by_template_partition(train_dev_partition, sampled),
                  output_header)
    for dev_size in dev_sample_sizes:
        fo_dev = '%sdev_%d.csv' % (fo_dir, dev_size)
        sampled = []
        for group in output_rows_dev_by_template.values():
            sampled.extend(group[:dev_size])
        write_csv(fo_dev,
                  filter_by_template_partition(train_dev_partition, sampled),
                  output_header)
hyp_words = [] for word in premise.split(): if word not in [".", "?", "!"]: prem_words.append(word.lower()) for word in hypothesis.split(): if word not in [".", "?", "!"]: hyp_words.append(word.lower()) prem_filtered = " ".join(prem_words) hyp_filtered = " ".join(hyp_words) hypo_len = len(hypothesis.strip().split(" ")) expl1_len = len(expl1.strip().split(" ")) if hypo_len >= 3 and expl1_len >= 3: if hyp_filtered in prem_filtered: if label == "entailment": count_entailment += 1 if label == "neutral": count_neutral += 1 if label == "contradiction": count_contradiction += 1 fo_row_data.append([guid, label, premise, hypothesis, expl1, ""]) print("Entailment:", count_entailment) print("Contradiction:", count_contradiction) print("Neutral:", count_neutral) write_csv(fo_path, fo_row_data, fo_header)