Example #1
def generate_datasets_with_dependency():
    datasets = [TRAIN_DATA, DEV_DATA, TEST_DATA, TEST_DATA_HARD]
    DEP_DIR = './DATA/vsnli/DEP'
    TAGGED_DIR = './DATA/vsnli/TAGGED'
    mkdir(DEP_DIR)
    for filename in datasets:
        new_dataset = ""
        name = filename.split("/")[-1][:-4]
        P_file = TAGGED_DIR + "/premises_{}.txt".format(name)
        H_file = TAGGED_DIR + "/hypothesis_{}.txt".format(name)
        with open(filename) as f, open(P_file) as P_f, open(H_file) as H_f:
            lines = f.readlines()[1:]  #skip header
            P_reader = csv.reader(P_f, delimiter="\t")
            H_reader = csv.reader(H_f, delimiter="\t")
            for row in lines:  #for each sentence
                levels = ""
                relations = ""
                for P_word in P_reader:  #for each word in P
                    if not P_word:  #blank line marks the end of this sentence
                        break
                    levels += P_word[6] + "_"
                    relations += P_word[7] + "_"
                levels = levels[:-1] + "#"
                relations = relations[:-1] + "#"
                for H_word in H_reader:  #for each word in H
                    if not H_word:
                        break
                    levels += H_word[6] + "_"
                    relations += H_word[7] + "_"
                levels = levels[:-1]
                relations = relations[:-1]
                new_dataset += row.strip(
                    "\n") + "\t" + levels + "\t" + relations + "\n"
        with open(DEP_DIR + "/{}.tsv".format(name), "w+") as f:
            f.write(new_dataset)
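This function leans on module-level names it does not define. A minimal sketch of the assumed setup, for illustration only (all paths and the mkdir helper are assumptions, not taken from the original module):

import csv
import os

TRAIN_DATA = './DATA/vsnli/train.tsv'  #assumed dataset paths
DEV_DATA = './DATA/vsnli/dev.tsv'
TEST_DATA = './DATA/vsnli/test.tsv'
TEST_DATA_HARD = './DATA/vsnli/test_hard.tsv'

def mkdir(path):
    #assumed helper: create the directory, tolerating an existing one
    os.makedirs(path, exist_ok=True)

With that in place, the function expects one tagged file per dataset in TAGGED_DIR: tab-separated, one word per line, the dependency level and relation in fields 7 and 8 (0-indexed 6 and 7), and a blank line between sentences; the blank-line break is what advances each reader to the next sentence.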
Example #2
def generate_shuffled_datasets():
    datasets = [X_TRAIN_DATA, X_DEV_DATA, X_TEST_DATA, X_TEST_DATA_HARD]
    SHUFFLED_DIR = './DATA/vsnli/SHUFFLED'
    mkdir(SHUFFLED_DIR)
    for filename in datasets:
        with open(filename) as f:
            lines = f.readlines()
        header = lines[0]
        lines = lines[1:]
        random.shuffle(lines)  #shuffle everything except the header
        name = os.path.basename(filename)
        shuffled_file = os.path.join(SHUFFLED_DIR, name)
        with open(shuffled_file, 'w') as f:
            f.writelines([header] + lines)
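Because the shuffle is driven by Python's global random module, seeding it before the call makes the output reproducible; a minimal sketch (the seed value is arbitrary):

import random

random.seed(42)  #any fixed seed makes the shuffled files repeatable
generate_shuffled_datasets()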
Example #3
def dataset_without_stopwords():
    datasets = [TRAIN_DATA, DEV_DATA, TEST_DATA, TEST_DATA_HARD]
    stopwords = _load_stopwords()
    DEP_DIR = './DATA/vsnli/DEP'
    NO_STOPWORDS_DIR = DEP_DIR + "/NO_STOPWORDS"
    mkdir(NO_STOPWORDS_DIR)
    for filename in datasets:
        name = filename.split("/")[-1][:-4]
        with open(filename) as in_file, open(
                NO_STOPWORDS_DIR + "/" + name + ".tsv", "w+") as out_file:
            reader = csv.reader(in_file, delimiter="\t")
            writer = csv.writer(out_file, delimiter="\t")
            header = next(reader, None)
            writer.writerow(header)
            for row in reader:
                P = row[1].strip().split()
                H = row[2].strip().split()
                #columns 7 and 8 hold '_'-joined per-word values,
                #with premise and hypothesis parts separated by '#'
                levels = row[7].strip().split("#")
                P_level = levels[0].split("_")
                H_level = levels[1].split("_")
                relations = row[8].strip().split("#")
                P_rel = relations[0].split("_")
                H_rel = relations[1].split("_")
                to_remove = []
                for index, word in enumerate(P):  #stopword positions in P
                    if word.lower() in stopwords:
                        to_remove.append(index)
                P = ' '.join(
                    [i for j, i in enumerate(P) if j not in to_remove])
                P_level = '_'.join(
                    [i for j, i in enumerate(P_level) if j not in to_remove])
                P_rel = '_'.join(
                    [i for j, i in enumerate(P_rel) if j not in to_remove])
                to_remove = []
                for index, word in enumerate(H):  #stopword positions in H
                    if word.lower() in stopwords:
                        to_remove.append(index)
                H = ' '.join(
                    [i for j, i in enumerate(H) if j not in to_remove])
                H_level = '_'.join(
                    [i for j, i in enumerate(H_level) if j not in to_remove])
                H_rel = '_'.join(
                    [i for j, i in enumerate(H_rel) if j not in to_remove])
                row[1] = P
                row[2] = H
                row[7] = P_level + "#" + H_level
                row[8] = P_rel + "#" + H_rel
                writer.writerow(row)
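The core idiom above is filtering several parallel sequences with one set of indices. A toy illustration of just that step, with made-up data:

words = ['A', 'dog', 'in', 'the', 'park']
levels = ['1', '2', '3', '3', '4']
stopwords = {'in', 'the'}

to_remove = [i for i, w in enumerate(words) if w.lower() in stopwords]
words = [w for i, w in enumerate(words) if i not in to_remove]
levels = [l for i, l in enumerate(levels) if i not in to_remove]
print(words)   # ['A', 'dog', 'park']
print(levels)  # ['1', '2', '4']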
Example #4
def generate_non_token_datasets():
    datasets = [TRAIN_DATA, DEV_DATA, TEST_DATA, TEST_DATA_HARD]
    TO_BE_TAGGED_DIR = './DATA/vsnli/TO_BE_TAGGED'
    mkdir(TO_BE_TAGGED_DIR)
    for filename in datasets:
        P = ""
        H = ""
        with open(filename) as f:
            reader = csv.reader(f, delimiter="\t")
            next(reader, None)  #skip header
            for row in reader:
                P += (row[4].strip() + "\n")  #premise column
                H += (row[5].strip() + "\n")  #hypothesis column
        name = filename.split("/")[-1][:-4]
        with open(TO_BE_TAGGED_DIR + "/premises_{}.txt".format(name),
                  "w+") as f:
            f.write(P)
        with open(TO_BE_TAGGED_DIR + "/hypothesis_{}.txt".format(name),
                  "w+") as f:
            f.write(H)
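The file naming suggests this function feeds the pipeline that Example #1 completes: text written to TO_BE_TAGGED is presumably tagged by an external tool whose output lands in ./DATA/vsnli/TAGGED. A hypothetical ordering (the tagging step itself is an assumption; no tool is named in the source):

#1. write untagged premises/hypotheses for the external tagger
generate_non_token_datasets()

#2. run the dependency tagger out of band, writing its output
#   to ./DATA/vsnli/TAGGED (tool and invocation unknown)

#3. merge the dependency columns back into the datasets
generate_datasets_with_dependency()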
Example #5
def _save(self, img, vectors):
    if not os.path.exists(IMG_FEATS):
        mkdir(IMG_FEATS)
    #drop the 4-character extension (e.g. '.jpg') from the image name
    np.savetxt(IMG_FEATS + "/{}.txt".format(img[:-4]), vectors)
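Reading a saved matrix back is symmetric; a minimal sketch, assuming the same IMG_FEATS directory and an image name like 'photo.jpg' (both assumptions):

import numpy as np

vectors = np.loadtxt(IMG_FEATS + '/photo.txt')  #features saved by _save()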