def generate_datasets_with_dependency():
    datasets = [TRAIN_DATA, DEV_DATA, TEST_DATA, TEST_DATA_HARD]
    DEP_DIR = './DATA/vsnli/DEP'
    TAGGED_DIR = './DATA/vsnli/TAGGED'
    mkdir(DEP_DIR)
    for filename in datasets:
        new_dataset = ""
        name = filename.split("/")[-1][:-4]
        P_file = TAGGED_DIR + "/premises_{}.txt".format(name)
        H_file = TAGGED_DIR + "/hypothesis_{}.txt".format(name)
        with open(filename) as f, open(P_file) as P_f, open(H_file) as H_f:
            lines = f.readlines()[1:]  # skip header
            P_reader = csv.reader(P_f, delimiter="\t")
            H_reader = csv.reader(H_f, delimiter="\t")
            for row in lines:  # for each sentence pair
                levels = ""
                relations = ""
                for P_word in P_reader:  # for each word in the premise
                    if not P_word:  # blank line marks the end of this sentence
                        break
                    levels += P_word[6] + "_"
                    relations += P_word[7] + "_"
                levels = levels[:-1] + "#"
                relations = relations[:-1] + "#"
                for H_word in H_reader:  # for each word in the hypothesis
                    if not H_word:
                        break
                    levels += H_word[6] + "_"
                    relations += H_word[7] + "_"
                levels = levels[:-1]
                relations = relations[:-1]
                new_dataset += row.strip("\n") + "\t" + levels + "\t" + relations + "\n"
        with open(DEP_DIR + "/{}.tsv".format(name), "w+") as f:
            f.write(new_dataset)
def generate_shuffled_datasets():
    datasets = [X_TRAIN_DATA, X_DEV_DATA, X_TEST_DATA, X_TEST_DATA_HARD]
    SHUFFLED_DIR = './DATA/vsnli/SHUFFLED'
    mkdir(SHUFFLED_DIR)
    for filename in datasets:
        with open(filename) as f:
            lines = f.readlines()
        header, lines = lines[0], lines[1:]
        random.shuffle(lines)
        name = os.path.basename(filename)
        shuffled_file = os.path.join(SHUFFLED_DIR, name)
        with open(shuffled_file, 'w') as f:
            f.writelines([header] + lines)
def dataset_without_stopwords():
    datasets = [TRAIN_DATA, DEV_DATA, TEST_DATA, TEST_DATA_HARD]
    stopwords = _load_stopwords()
    DEP_DIR = './DATA/vsnli/DEP'
    NO_STOPWORDS_DIR = DEP_DIR + "/NO_STOPWORDS"
    mkdir(NO_STOPWORDS_DIR)
    for filename in datasets:
        name = filename.split("/")[-1][:-4]
        with open(filename) as in_file, \
                open(NO_STOPWORDS_DIR + "/" + name + ".tsv", "w+") as out_file:
            reader = csv.reader(in_file, delimiter="\t")
            writer = csv.writer(out_file, delimiter="\t")
            header = next(reader, None)
            writer.writerow(header)
            for row in reader:
                P = row[1].strip().split()  # premise tokens
                H = row[2].strip().split()  # hypothesis tokens
                levels = row[7].strip().split("#")  # "<P levels>#<H levels>"
                P_level = levels[0].split("_")
                H_level = levels[1].split("_")
                relations = row[8].strip().split("#")  # "<P relations>#<H relations>"
                P_rel = relations[0].split("_")
                H_rel = relations[1].split("_")
                # Drop stopwords from the premise, together with the
                # corresponding dependency levels and relations.
                to_remove = []
                for index, word in enumerate(P):
                    if word.lower() in stopwords:
                        to_remove += [index]
                P = ' '.join([i for j, i in enumerate(P) if j not in to_remove])
                P_level = '_'.join(
                    [i for j, i in enumerate(P_level) if j not in to_remove])
                P_rel = '_'.join(
                    [i for j, i in enumerate(P_rel) if j not in to_remove])
                # Same for the hypothesis.
                to_remove = []
                for index, word in enumerate(H):
                    if word.lower() in stopwords:
                        to_remove += [index]
                H = ' '.join([i for j, i in enumerate(H) if j not in to_remove])
                H_level = '_'.join(
                    [i for j, i in enumerate(H_level) if j not in to_remove])
                H_rel = '_'.join(
                    [i for j, i in enumerate(H_rel) if j not in to_remove])
                row[1] = P
                row[2] = H
                row[7] = P_level + "#" + H_level
                row[8] = P_rel + "#" + H_rel
                writer.writerow(row)
def generate_non_token_datasets():
    datasets = [TRAIN_DATA, DEV_DATA, TEST_DATA, TEST_DATA_HARD]
    TO_BE_TAGGED_DIR = './DATA/vsnli/TO_BE_TAGGED'
    mkdir(TO_BE_TAGGED_DIR)
    for filename in datasets:
        P = ""
        H = ""
        with open(filename) as f:
            reader = csv.reader(f, delimiter="\t")
            next(reader, None)  # skip header
            for row in reader:
                P += row[4].strip() + "\n"  # non-tokenised premise
                H += row[5].strip() + "\n"  # non-tokenised hypothesis
        name = filename.split("/")[-1][:-4]
        with open(TO_BE_TAGGED_DIR + "/premises_{}.txt".format(name), "w+") as f:
            f.write(P)
        with open(TO_BE_TAGGED_DIR + "/hypothesis_{}.txt".format(name), "w+") as f:
            f.write(H)
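# Assumed order of use for the helpers above (a sketch inferred from the paths
# they read and write, not part of the original documentation): the raw
# sentences are first dumped for an external dependency tagger, whose per-word
# output in ./DATA/vsnli/TAGGED is then merged back into the datasets.
#
#   generate_non_token_datasets()        # -> ./DATA/vsnli/TO_BE_TAGGED/*.txt
#   # run the external dependency tagger: TO_BE_TAGGED -> TAGGED
#   generate_datasets_with_dependency()  # -> ./DATA/vsnli/DEP/*.tsv
#   dataset_without_stopwords()          # -> ./DATA/vsnli/DEP/NO_STOPWORDS/*.tsv
#   generate_shuffled_datasets()         # -> ./DATA/vsnli/SHUFFLED/*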
def _save(self, img, vectors):
    if not os.path.exists(IMG_FEATS):
        mkdir(IMG_FEATS)
    np.savetxt(IMG_FEATS + "/{}.txt".format(img[:-4]), vectors)