Example 1
def setup(action_space=-1, navigable_locs_path=None):
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)
    # Check for vocabs
    if not os.path.exists(RESULT_DIR):
        create_folders(RESULT_DIR)
    if not os.path.exists(PLOT_DIR):
        create_folders(PLOT_DIR)
    if not os.path.exists(SNAPSHOT_DIR):
        create_folders(SNAPSHOT_DIR)
    # Guard: navigable_locs_path defaults to None, so only create it when a path is given.
    if navigable_locs_path and not os.path.exists(navigable_locs_path):
        create_folders(navigable_locs_path)

    if not os.path.exists(TRAIN_VOCAB):
        write_vocab(build_vocab(splits=['train']), TRAIN_VOCAB)
    if not os.path.exists(TRAINVAL_VOCAB):
        write_vocab(build_vocab(splits=['train', 'val_seen', 'val_unseen']), TRAINVAL_VOCAB)

    if navigable_locs_path:
        #if philly:
        #    navigable_locs_path = os.path.join(os.getenv('PT_OUTPUT_DIR'), "tasks/NDH/data")
        #    if not os.path.exists(navigable_locs_path):
        #        create_folders(navigable_locs_path)

        navigable_locs_path += '/navigable_locs.json'

        print('navigable_locs_path', navigable_locs_path)
    preprocess_get_pano_states(navigable_locs_path)
    global nav_graphs
    nav_graphs = None
    if action_space == -1:  # load navigable location cache
        with open(navigable_locs_path, 'r') as f:
            nav_graphs = json.load(f)
    return nav_graphs
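Every example on this page funnels into the same pair of helpers, build_vocab and write_vocab, whose definitions live elsewhere in each project and whose signatures differ from project to project. Purely for orientation, here is a minimal self-contained sketch of what an R2R/NDH-style pair could look like (one token per line, a few special tokens, a frequency cutoff); the data path, special tokens, and counting rule below are assumptions, not any of these projects' actual code.

import json
from collections import Counter

# Hypothetical special tokens and data layout; real projects differ.
BASE_VOCAB = ['<PAD>', '<UNK>', '<EOS>']

def build_vocab(splits=('train',), min_count=5, data_template='data/R2R_%s.json'):
    """Count words in the instructions of the given splits and keep those
    seen at least min_count times (sketch only)."""
    counter = Counter()
    for split in splits:
        with open(data_template % split) as f:
            for item in json.load(f):
                for instr in item.get('instructions', []):
                    counter.update(instr.lower().split())
    vocab = list(BASE_VOCAB)
    vocab.extend(w for w, c in counter.most_common() if c >= min_count)
    return vocab

def write_vocab(vocab, path):
    """Write one token per line."""
    print('Writing vocab of size %d to %s' % (len(vocab), path))
    with open(path, 'w') as f:
        for word in vocab:
            f.write(word + '\n')

def read_vocab(path):
    """Read a file written by write_vocab back into a list of tokens."""
    with open(path) as f:
        return [line.strip() for line in f]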
Example 2
def setup():
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)
    # Check for vocabs
    if not os.path.exists(TRAIN_VOCAB):
        write_vocab(build_vocab(splits=['train']), TRAIN_VOCAB)
    if not os.path.exists(TRAINVAL_VOCAB):
        write_vocab(build_vocab(splits=['train', 'val_seen', 'val_unseen']), TRAINVAL_VOCAB)
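These setup functions seed only torch's CPU and CUDA generators. For fully reproducible runs, the Python and NumPy generators usually need seeding as well; the helper below is sketched as an illustration and is not part of any of the projects shown here.

import random
import numpy as np
import torch

def seed_everything(seed=1):
    """Seed the RNGs a training run typically touches; the setup()
    functions above only cover the two torch calls."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)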
Example 3
def setup():
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)
    # Check for vocabs
    if not os.path.exists(TRAIN_VOCAB):
        write_vocab(build_vocab(splits=['train']), TRAIN_VOCAB)
    if not os.path.exists(TRAINVAL_VOCAB):
        write_vocab(build_vocab(splits=['train','val_seen','val_unseen']), TRAINVAL_VOCAB)
Example 4
def setup():
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)
    # Check for vocabs
    if not os.path.exists(train_vocab):
        write_vocab(build_vocab(splits=['train']), train_vocab)
    if not os.path.exists(trainval_vocab):
        write_vocab(build_vocab(splits=['train', 'val_seen', 'val_unseen']),
                    trainval_vocab)
Example 5
def setup(seed=None):

    if seed is not None:
        hparams.seed = seed
    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    # Check for vocabs
    train_vocab_path = os.path.join(hparams.data_path, 'train_vocab.txt')
    if not os.path.exists(train_vocab_path):
        write_vocab(
            build_vocab(hparams.data_path,
                        splits=['train'],
                        min_count=hparams.min_word_count,
                        max_length=hparams.max_input_length,
                        split_by_spaces=hparams.split_by_spaces,
                        prefix='noroom' if hasattr(hparams, 'no_room')
                        and hparams.no_room else 'asknav'), train_vocab_path)
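hparams here is the project's configuration object (a comment in Example 9 below says its values come from a JSON config such as verbal_hard.json). To run this snippet in isolation, the fields it reads can be faked with a SimpleNamespace; the values below are placeholders, not the project's defaults.

from types import SimpleNamespace

# Hypothetical stand-in for the project's hparams; only the fields that
# setup() reads above are filled in, with made-up values.
hparams = SimpleNamespace(
    seed=1,
    data_path='data/asknav',
    min_word_count=5,
    max_input_length=80,
    split_by_spaces=False,
    no_room=False,
)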
Example 6
def preprocess1(top=0, val_rate=0.1, test_rate=0.1):
    # X_text, Y, _, _ = data.load_data_and_labels_from_csv(dataset="yelp_review_polarity")
    # print("Y:", Y[:10])
    X_1, X_2, Y = data.load_quora_data(file_name=DATA_FILE, top=top)

    max_X1 = max([len(x.split(" ")) for x in X_1])
    max_X2 = max([len(x.split(" ")) for x in X_2])

    vocab1 = learn.preprocessing.VocabularyProcessor(MAX_LEN)
    vocab2 = learn.preprocessing.VocabularyProcessor(MAX_LEN)

    X1 = np.array(list(vocab1.fit_transform(X_1)))
    X2 = np.array(list(vocab2.fit_transform(X_2)))
    Y = np.array(Y)

    write_vocab(vocab1, VOCAB_FILE_1)
    write_vocab(vocab2, VOCAB_FILE_2)

    print("==================")
    print("Train/Test split")
    # X = np.stack((X1, X2), axis=1)
    # print("X1.shape:", X1)
    # print("X2.shape:", X2)
    shuffle_idx = np.random.permutation(np.arange(len(Y)))
    x1_all = X1[shuffle_idx]
    x2_all = X2[shuffle_idx]
    y_all = Y[shuffle_idx]

    test_sample_idx = -1 * int(test_rate * float(len(y_all)))
    x1_train, x1_test = x1_all[:test_sample_idx], x1_all[test_sample_idx:]
    x2_train, x2_test = x2_all[:test_sample_idx], x2_all[test_sample_idx:]
    y_train, y_test = y_all[:test_sample_idx], y_all[test_sample_idx:]

    val_sample_idx = -1 * int(val_rate * float(len(y_train)))
    x1_train, x1_val = x1_train[:val_sample_idx], x1_train[val_sample_idx:]
    x2_train, x2_val = x2_train[:val_sample_idx], x2_train[val_sample_idx:]
    y_train, y_val = y_train[:val_sample_idx], y_train[val_sample_idx:]

    print("Vocab 1 Size: {:d}".format(len(vocab1.vocabulary_)))
    print("Vocab 2 Size: {:d}".format(len(vocab2.vocabulary_)))
    print("Train/Val/Test split: {:d}/{:d}/{:d}".format(
        len(y_train), len(y_val), len(y_test)))
    return (x1_train, x2_train, y_train, x1_val, x2_val, y_val, x1_test,
            x2_test, y_test, vocab1, vocab2)
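The split logic relies on negative slice indices: with 10,000 pairs and test_rate=0.1, test_sample_idx is -1000, so the last 1,000 shuffled rows become the test set and everything before them stays in training, from which another val_rate fraction is then peeled off the same way. A tiny self-contained illustration of the same slicing:

import numpy as np

y_all = np.arange(10)                                # pretend these are 10 shuffled labels
test_rate = 0.2
test_idx = -1 * int(test_rate * float(len(y_all)))   # -2
y_train, y_test = y_all[:test_idx], y_all[test_idx:]
print(y_train)   # [0 1 2 3 4 5 6 7]
print(y_test)    # [8 9]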
Example 7
def setup(args, clear=False):
    '''
    Mainly builds the vocabs.
    '''
    TRAIN_VOCAB_EN, TRAIN_VOCAB_ZH = args.TRAIN_VOCAB_EN, args.TRAIN_VOCAB_ZH  # paths of the Chinese and English vocab files
    if clear:  ## delete existing vocabs
        for file in [TRAIN_VOCAB_EN, TRAIN_VOCAB_ZH]:
            if os.path.exists(file):
                os.remove(file)
    # Build English vocabs
    if not os.path.exists(TRAIN_VOCAB_EN):
        write_vocab(build_vocab(args.DATA_DIR, language='en'), TRAIN_VOCAB_EN)
    # Build Chinese vocabs
    if not os.path.exists(TRAIN_VOCAB_ZH):
        write_vocab(build_vocab(args.DATA_DIR, language='zh'), TRAIN_VOCAB_ZH)

    # Set the random seed
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
Example 8
def setup(args, clear=False):
    '''
    Build vocabs from train or train/val set.
    '''
    TRAIN_VOCAB_EN, TRAIN_VOCAB_ZH = args.TRAIN_VOCAB_EN, args.TRAIN_VOCAB_ZH
    if clear:  ## delete previous vocab
        for file in [TRAIN_VOCAB_EN, TRAIN_VOCAB_ZH]:
            if os.path.exists(file):
                os.remove(file)
    # Build English vocabs
    if not os.path.exists(TRAIN_VOCAB_EN):
        write_vocab(build_vocab(args.DATA_DIR, language='en'), TRAIN_VOCAB_EN)
    # Build Chinese vocabs
    if not os.path.exists(TRAIN_VOCAB_ZH):
        write_vocab(build_vocab(args.DATA_DIR, language='zh'), TRAIN_VOCAB_ZH)

    # set up seed
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
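A call site for this bilingual setup could look like the following; the args object is normally built with argparse, and the paths below are placeholders rather than the projects' real defaults.

from types import SimpleNamespace

# Hypothetical args object with only the fields setup() reads.
args = SimpleNamespace(
    TRAIN_VOCAB_EN='data/train_vocab_en.txt',
    TRAIN_VOCAB_ZH='data/train_vocab_zh.txt',
    DATA_DIR='data/',
    seed=1,
)
setup(args, clear=True)   # force both vocabs to be rebuilt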
Example 9
File: train.py Project: ray-97/vnla
def setup(seed=None):

    if seed is not None:
        hparams.seed = seed
    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    # Check for vocabs
    train_vocab_path = os.path.join(hparams.data_path, 'train_vocab.txt')
    print("train_vocab_path: " + train_vocab_path)
    if not os.path.exists(train_vocab_path):
        # the txt file contains the names of a list of household objects
        write_vocab(build_vocab(
                    hparams.data_path,
                    splits=['train'],
                    min_count=hparams.min_word_count,
                    max_length=hparams.max_input_length, # these vals are inside verbal_hard.json
                    split_by_spaces=hparams.split_by_spaces,
                    prefix='noroom' if hasattr(hparams, 'no_room') and
                           hparams.no_room else 'asknav'),
            train_vocab_path) # build using .json files in same dir: data/asknav/...train
Example 10
def build_data(config):
    """
    Procedure to build data

    Args:
        config: defines attributes needed in the function
    Returns:
        creates vocab files from the datasets
        creates a npz embedding file from trimmed glove vectors
    """
    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev = CoNLLDataset(config.dev_filename, processing_word)
    test = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)
    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)
    vocab = list(vocab)
    vocab.insert(0, PAD)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    # Build and save char vocab
    train = CoNLLDataset(config.train_filename, processing_word)
    vocab_chars = get_char_vocab(train)
    vocab_chars = list(vocab_chars)
    vocab_chars.insert(0, PAD)
    write_vocab(vocab_chars, config.chars_filename)

    # Build and save type vocab
    vocab_types = set()
    print(len(vocab_tags))
    for tag in vocab_tags:
        if tag != 'O':
            vocab_types.add(tag[2:])
    write_vocab(vocab_types, config.types_filename)
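build_data assumes a particular division of labour between write_vocab and load_vocab: the former dumps an iterable of tokens one per line, and the latter reads the same file back into a token-to-index dict whose indices are simply line numbers. A minimal sketch consistent with that usage, not necessarily the project's actual implementation:

def write_vocab(vocab, filename):
    """Write one token per line; the line number becomes the token id."""
    with open(filename, 'w') as f:
        f.write('\n'.join(vocab))

def load_vocab(filename):
    """Read a vocab file back into a {token: index} dict."""
    with open(filename) as f:
        return {word: idx for idx, word in enumerate(line.strip() for line in f)}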
Example 11
def main():
    config = utils.Config()

    utils.mkdir(os.path.join(config.getpath("data"), "rstdt-vocab"))

    filenames = []
    for filename in os.listdir(os.path.join(config.getpath("data"), "rstdt", "wsj", "train")):
        filenames.append(os.path.join(config.getpath("data"), "rstdt", "wsj", "train", filename))
    for filename in os.listdir(os.path.join(config.getpath("data"), "rstdt", "wsj", "test")):
        filenames.append(os.path.join(config.getpath("data"), "rstdt", "wsj", "test", filename))
    filenames = [n for n in filenames if n.endswith(".labeled.bin.ctree")]
    filenames.sort()

    relation_mapper = treetk.rstdt.RelationMapper()

    frelations = []
    crelations = []
    nuclearities = []

    for filename in pyprind.prog_bar(filenames):
        sexp = utils.read_lines(filename, process=lambda line: line)
        sexp = treetk.preprocess(sexp)
        tree = treetk.rstdt.postprocess(treetk.sexp2tree(sexp, with_nonterminal_labels=True, with_terminal_labels=False))

        nodes = treetk.traverse(tree, order="pre-order", include_terminal=False, acc=None)

        part_frelations = []
        part_crelations = []
        part_nuclearities = []
        for node in nodes:
            relations_ = node.relation_label.split("/")
            part_frelations.extend(relations_)
            part_crelations.extend([relation_mapper.f2c(r) for r in relations_])
            part_nuclearities.append(node.nuclearity_label)

        part_frelations.append("<root>")
        part_crelations.append("<root>")

        frelations.append(part_frelations)
        crelations.append(part_crelations)
        nuclearities.append(part_nuclearities)

    fcounter = utils.get_word_counter(lines=frelations)
    ccounter = utils.get_word_counter(lines=crelations)
    ncounter = utils.get_word_counter(lines=nuclearities)

    frelations = fcounter.most_common() # list of (str, int)
    crelations = ccounter.most_common() # list of (str, int)
    nuclearities = ncounter.most_common() # list of (str, int)

    utils.write_vocab(os.path.join(config.getpath("data"), "rstdt-vocab", "relations.fine.vocab.txt"),
                      frelations)
    utils.write_vocab(os.path.join(config.getpath("data"), "rstdt-vocab", "relations.coarse.vocab.txt"),
                      crelations)
    utils.write_vocab(os.path.join(config.getpath("data"), "rstdt-vocab", "nuclearities.vocab.txt"),
                      nuclearities)
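Note that utils.write_vocab in this project takes the output path first and the items second, the reverse of the earlier examples, and the items are (token, count) pairs produced by Counter.most_common(). A plausible sketch of such a writer, assuming the count is stored alongside each token:

def write_vocab(path, items):
    """Write 'token<TAB>count' lines from a list of (token, count) pairs.
    (Sketch only; the real utils.write_vocab may use another format.)"""
    with open(path, 'w') as f:
        for token, count in items:
            f.write('%s\t%d\n' % (token, count))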
Example 12
def preprocess1(top=0):
    print("Load data.")
    # X_text, Y, _, _ = data.load_data_and_labels_from_csv(dataset="yelp_review_polarity")
    # print("Y:", Y[:10])
    X_1, X_2, Y = data.load_quora_data(top=top)
    print("X_1.size:", len(X_1))
    print("X_2.size:", len(X_2))
    print("Y.size:", len(Y))

    max_X1 = max([len(x.split(" ")) for x in X_1])
    max_X2 = max([len(x.split(" ")) for x in X_2])

    vocab1 = learn.preprocessing.VocabularyProcessor(MAX_LEN)
    vocab2 = learn.preprocessing.VocabularyProcessor(MAX_LEN)

    X1 = np.array(list(vocab1.fit_transform(X_1)))
    X2 = np.array(list(vocab2.fit_transform(X_2)))
    Y = np.array(Y)

    write_vocab(vocab1, "./data/quora/vocab1.csv")
    write_vocab(vocab2, "./data/quora/vocab2.csv")

    print("X_1.size:", X1.shape)
    print("X_2.size:", X2.shape)
    print("Y.size:", Y.shape)

    print("==================")
    print("Train/Test split")
    # X = np.stack((X1, X2), axis=1)
    # print("X1.shape:", X1)
    # print("X2.shape:", X2)
    shuffle_idx = np.random.permutation(np.arange(len(Y)))
    x1_all = X1[shuffle_idx]
    x2_all = X2[shuffle_idx]
    y_all = Y[shuffle_idx]

    test_sample_idx = -1 * int(TEST_SPLIT * float(len(y_all)))
    x1_train, x1_test = x1_all[:test_sample_idx], x1_all[test_sample_idx:]
    x2_train, x2_test = x2_all[:test_sample_idx], x2_all[test_sample_idx:]
    y_train, y_test = y_all[:test_sample_idx], y_all[test_sample_idx:]

    val_sample_idx = -1 * int(VALIDATION_SPLIT * float(len(y_train)))
    x1_train, x1_val = x1_train[:val_sample_idx], x1_train[val_sample_idx:]
    x2_train, x2_val = x2_train[:val_sample_idx], x2_train[val_sample_idx:]
    y_train, y_val = y_train[:val_sample_idx], y_train[val_sample_idx:]

    # ret = train_test_split(X, Y, test_size=TEST_SPLIT, random_state=RNG_SEED)
    # X_train, X_test, y_train, y_test = ret

    # ret = train_test_split(X_train, y_train, test_size=VALIDATION_SPLIT,
    #                        random_state=RNG_SEED)
    # X_train, X_val, y_train, y_val = ret
    """
    x1_train = X_train[:, 0]
    x2_train = X_train[:, 1]

    x1_val = X_train[:, 0]
    x2_val = X_train[:, 1]

    x1_test = X_test[:, 0]
    x2_test = X_test[:, 1]
    """
    """

    # Shuffle the data
    print("Shuffle the data.")
    # np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(Y)))
    x1_shuffled = X1[shuffle_indices]
    x2_shuffled = X2[shuffle_indices]
    y_shuffled = Y[shuffle_indices]

    # Split train/test set
    print("Split train/test set")
    dev_sample_index = -1 * int(0.1 * float(len(Y)))
    x1_train, x1_dev = x1_shuffled[:dev_sample_index], x1_shuffled[dev_sample_index:]
    x2_train, x2_dev = x2_shuffled[:dev_sample_index], x2_shuffled[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]

    del X1, X2, Y, x1_shuffled, x2_shuffled, y_shuffled
    """

    print("Vocab 1 Size: {:d}".format(len(vocab1.vocabulary_)))
    print("Vocab 2 Size: {:d}".format(len(vocab2.vocabulary_)))
    print("Train/Val/Test split: {:d}/{:d}/{:d}".format(
        len(y_train), len(y_val), len(y_test)))
    return (x1_train, x2_train, y_train, x1_val, x2_val, y_val, x1_test,
            x2_test, y_test, vocab1, vocab2)
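learn.preprocessing.VocabularyProcessor in Examples 6 and 12 is presumably tf.contrib.learn's, which was deprecated and removed in TensorFlow 2. Its fit_transform maps each document to a fixed-length array of word ids, truncating at MAX_LEN and padding with 0. The class below is a small dependency-free stand-in with the same interface, assuming whitespace tokenization; it is offered as a sketch, not a drop-in replacement.

import numpy as np

class SimpleVocabularyProcessor:
    """Minimal stand-in for tf.contrib.learn's VocabularyProcessor;
    id 0 is reserved for padding/unknown tokens."""

    def __init__(self, max_len):
        self.max_len = max_len
        self.vocabulary_ = {'<UNK>': 0}

    def fit_transform(self, texts):
        for text in texts:
            ids = []
            for word in text.split(' ')[:self.max_len]:
                if word not in self.vocabulary_:
                    self.vocabulary_[word] = len(self.vocabulary_)
                ids.append(self.vocabulary_[word])
            ids += [0] * (self.max_len - len(ids))   # pad to fixed length
            yield np.array(ids, dtype=np.int64)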
Example 13
def main(args):
    config = utils.Config()

    utils.mkdir(os.path.join(config.getpath("data"), "rstdt-vocab"))

    filenames = os.listdir(
        os.path.join(config.getpath("data"), "rstdt", "renamed"))
    filenames = [n for n in filenames if n.endswith(".edus")]
    filenames.sort()

    # Concat
    filepaths = [
        os.path.join(config.getpath("data"), "rstdt", "tmp.preprocessing",
                     filename + ".tokenized.lowercased.replace_digits")
        for filename in filenames
    ]
    textpreprocessor.concat.run(
        filepaths,
        os.path.join(config.getpath("data"), "rstdt", "tmp.preprocessing",
                     "concat.tokenized.lowercased.replace_digits"))

    # Build vocabulary
    if args.with_root:
        special_words = ["<root>"]
    else:
        special_words = []
    textpreprocessor.create_vocabulary.run(
        os.path.join(config.getpath("data"), "rstdt", "tmp.preprocessing",
                     "concat.tokenized.lowercased.replace_digits"),
        os.path.join(config.getpath("data"), "rstdt-vocab", "words.vocab.txt"),
        prune_at=50000,
        min_count=-1,
        special_words=special_words,
        with_unk=True)

    # Build vocabulary for fine-grained/coarse-grained relations
    relation_mapper = treetk.rstdt.RelationMapper()
    frelations = []
    crelations = []
    nuclearities = []
    for filename in filenames:
        sexp = utils.read_lines(os.path.join(
            config.getpath("data"), "rstdt", "renamed",
            filename.replace(".edus", ".labeled.bin.ctree")),
                                process=lambda line: line)
        sexp = treetk.preprocess(sexp)
        tree = treetk.rstdt.postprocess(
            treetk.sexp2tree(sexp,
                             with_nonterminal_labels=True,
                             with_terminal_labels=False))
        nodes = treetk.traverse(tree,
                                order="pre-order",
                                include_terminal=False,
                                acc=None)
        part_frelations = []
        part_crelations = []
        part_nuclearities = []
        for node in nodes:
            relations_ = node.relation_label.split("/")
            part_frelations.extend(relations_)
            part_crelations.extend(
                [relation_mapper.f2c(r) for r in relations_])
            part_nuclearities.append(node.nuclearity_label)
        if args.with_root:
            part_frelations.append("<root>")
            part_crelations.append("<root>")
        frelations.append(part_frelations)
        crelations.append(part_crelations)
        nuclearities.append(part_nuclearities)

    fcounter = utils.get_word_counter(lines=frelations)
    ccounter = utils.get_word_counter(lines=crelations)
    ncounter = utils.get_word_counter(lines=nuclearities)
    frelations = fcounter.most_common()  # list of (str, int)
    crelations = ccounter.most_common()  # list of (str, int)
    nuclearities = ncounter.most_common()  # list of (str, int)
    utils.write_vocab(
        os.path.join(config.getpath("data"), "rstdt-vocab",
                     "relations.fine.vocab.txt"), frelations)
    utils.write_vocab(
        os.path.join(config.getpath("data"), "rstdt-vocab",
                     "relations.coarse.vocab.txt"), crelations)
    utils.write_vocab(
        os.path.join(config.getpath("data"), "rstdt-vocab",
                     "nuclearities.vocab.txt"), nuclearities)