def setup(action_space=-1, navigable_locs_path=None):
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)
    # Check for vocabs
    if not os.path.exists(RESULT_DIR):
        create_folders(RESULT_DIR)
    if not os.path.exists(PLOT_DIR):
        create_folders(PLOT_DIR)
    if not os.path.exists(SNAPSHOT_DIR):
        create_folders(SNAPSHOT_DIR)
    if not os.path.exists(navigable_locs_path):
        create_folders(navigable_locs_path)

    if not os.path.exists(TRAIN_VOCAB):
        write_vocab(build_vocab(splits=['train']), TRAIN_VOCAB)
    if not os.path.exists(TRAINVAL_VOCAB):
        write_vocab(build_vocab(splits=['train', 'val_seen', 'val_unseen']), TRAINVAL_VOCAB)

    if navigable_locs_path:
        # if philly:
        #     navigable_locs_path = os.path.join(os.getenv('PT_OUTPUT_DIR'), "tasks/NDH/data")
        #     if not os.path.exists(navigable_locs_path):
        #         create_folders(navigable_locs_path)
        navigable_locs_path += '/navigable_locs.json'
        print('navigable_locs_path', navigable_locs_path)
        preprocess_get_pano_states(navigable_locs_path)

    global nav_graphs
    nav_graphs = None
    if action_space == -1:  # load navigable location cache
        with open(navigable_locs_path, 'r') as f:
            nav_graphs = json.load(f)
    return nav_graphs

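# Hypothetical usage sketch (not part of the original source). It assumes the
# module-level constants used by setup() above (RESULT_DIR, PLOT_DIR, SNAPSHOT_DIR,
# TRAIN_VOCAB, TRAINVAL_VOCAB) and the helpers it calls are already defined; the
# data directory below is borrowed from the commented-out philly branch and is
# only illustrative.
nav_graphs = setup(action_space=-1, navigable_locs_path='tasks/NDH/data')
if nav_graphs is not None:
    print('loaded navigable-location cache with {:d} scans'.format(len(nav_graphs)))
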
def setup():
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)
    # Check for vocabs
    if not os.path.exists(TRAIN_VOCAB):
        write_vocab(build_vocab(splits=['train']), TRAIN_VOCAB)
    if not os.path.exists(TRAINVAL_VOCAB):
        write_vocab(build_vocab(splits=['train', 'val_seen', 'val_unseen']), TRAINVAL_VOCAB)

def setup():
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)
    # Check for vocabs
    if not os.path.exists(TRAIN_VOCAB):
        write_vocab(build_vocab(splits=['train']), TRAIN_VOCAB)
    if not os.path.exists(TRAINVAL_VOCAB):
        write_vocab(build_vocab(splits=['train', 'val_seen', 'val_unseen']), TRAINVAL_VOCAB)

def setup():
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)
    # Check for vocabs
    if not os.path.exists(train_vocab):
        write_vocab(build_vocab(splits=['train']), train_vocab)
    if not os.path.exists(trainval_vocab):
        write_vocab(build_vocab(splits=['train', 'val_seen', 'val_unseen']), trainval_vocab)

def setup(seed=None):
    if seed is not None:
        hparams.seed = seed
    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)
    # Check for vocabs
    train_vocab_path = os.path.join(hparams.data_path, 'train_vocab.txt')
    if not os.path.exists(train_vocab_path):
        write_vocab(
            build_vocab(hparams.data_path,
                        splits=['train'],
                        min_count=hparams.min_word_count,
                        max_length=hparams.max_input_length,
                        split_by_spaces=hparams.split_by_spaces,
                        prefix='noroom' if hasattr(hparams, 'no_room') and hparams.no_room else 'asknav'),
            train_vocab_path)

def preprocess1(top=0, val_rate=0.1, test_rate=0.1):
    # X_text, Y, _, _ = data.load_data_and_labels_from_csv(dataset="yelp_review_polarity")
    # print("Y:", Y[:10])
    X_1, X_2, Y = data.load_quora_data(file_name=DATA_FILE, top=top)

    max_X1 = max([len(x.split(" ")) for x in X_1])
    max_X2 = max([len(x.split(" ")) for x in X_2])

    vocab1 = learn.preprocessing.VocabularyProcessor(MAX_LEN)
    vocab2 = learn.preprocessing.VocabularyProcessor(MAX_LEN)
    X1 = np.array(list(vocab1.fit_transform(X_1)))
    X2 = np.array(list(vocab2.fit_transform(X_2)))
    Y = np.array(Y)
    write_vocab(vocab1, VOCAB_FILE_1)
    write_vocab(vocab2, VOCAB_FILE_2)

    print("==================")
    print("Train/Test split")
    # X = np.stack((X1, X2), axis=1)
    # print("X1.shape:", X1)
    # print("X2.shape:", X2)
    shuffle_idx = np.random.permutation(np.arange(len(Y)))
    x1_all = X1[shuffle_idx]
    x2_all = X2[shuffle_idx]
    y_all = Y[shuffle_idx]
    test_sample_idx = -1 * int(test_rate * float(len(y_all)))
    x1_train, x1_test = x1_all[:test_sample_idx], x1_all[test_sample_idx:]
    x2_train, x2_test = x2_all[:test_sample_idx], x2_all[test_sample_idx:]
    y_train, y_test = y_all[:test_sample_idx], y_all[test_sample_idx:]

    val_sample_idx = -1 * int(val_rate * float(len(y_train)))
    x1_train, x1_val = x1_train[:val_sample_idx], x1_train[val_sample_idx:]
    x2_train, x2_val = x2_train[:val_sample_idx], x2_train[val_sample_idx:]
    y_train, y_val = y_train[:val_sample_idx], y_train[val_sample_idx:]

    print("Vocab 1 Size: {:d}".format(len(vocab1.vocabulary_)))
    print("Vocab 2 Size: {:d}".format(len(vocab2.vocabulary_)))
    print("Train/Val/Test split: {:d}/{:d}/{:d}".format(
        len(y_train), len(y_val), len(y_test)))
    return (x1_train, x2_train, y_train,
            x1_val, x2_val, y_val,
            x1_test, x2_test, y_test,
            vocab1, vocab2)

def setup(args, clear=False):
    ''' Mainly builds the vocabs. '''
    TRAIN_VOCAB_EN, TRAIN_VOCAB_ZH = args.TRAIN_VOCAB_EN, args.TRAIN_VOCAB_ZH  # paths to the English and Chinese vocabs
    if clear:  # delete existing vocabs
        for file in [TRAIN_VOCAB_EN, TRAIN_VOCAB_ZH]:
            if os.path.exists(file):
                os.remove(file)
    # Build English vocab
    if not os.path.exists(TRAIN_VOCAB_EN):
        write_vocab(build_vocab(args.DATA_DIR, language='en'), TRAIN_VOCAB_EN)
    # Build Chinese vocab
    if not os.path.exists(TRAIN_VOCAB_ZH):
        write_vocab(build_vocab(args.DATA_DIR, language='zh'), TRAIN_VOCAB_ZH)
    # Set the random seed
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

def setup(args, clear=False):
    ''' Build vocabs from train or train/val set. '''
    TRAIN_VOCAB_EN, TRAIN_VOCAB_ZH = args.TRAIN_VOCAB_EN, args.TRAIN_VOCAB_ZH
    if clear:  # delete previous vocabs
        for file in [TRAIN_VOCAB_EN, TRAIN_VOCAB_ZH]:
            if os.path.exists(file):
                os.remove(file)
    # Build English vocab
    if not os.path.exists(TRAIN_VOCAB_EN):
        write_vocab(build_vocab(args.DATA_DIR, language='en'), TRAIN_VOCAB_EN)
    # Build Chinese vocab
    if not os.path.exists(TRAIN_VOCAB_ZH):
        write_vocab(build_vocab(args.DATA_DIR, language='zh'), TRAIN_VOCAB_ZH)
    # Set the random seed
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

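# Hypothetical usage sketch (not part of the original source). The attribute names
# mirror exactly what setup(args, clear) above reads from args; the concrete paths
# and seed value here are placeholders.
from argparse import Namespace

example_args = Namespace(
    TRAIN_VOCAB_EN='data/train_vocab.en',   # illustrative path
    TRAIN_VOCAB_ZH='data/train_vocab.zh',   # illustrative path
    DATA_DIR='data/',                       # illustrative path
    seed=1)
setup(example_args, clear=False)            # rebuilds vocabs only if the files are missing
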
def setup(seed=None):
    if seed is not None:
        hparams.seed = seed
    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)
    # Check for vocabs
    train_vocab_path = os.path.join(hparams.data_path, 'train_vocab.txt')
    print("train_vocab_path: " + train_vocab_path)
    if not os.path.exists(train_vocab_path):
        # txt file contains the names of a list of household objects
        write_vocab(build_vocab(
            hparams.data_path,
            splits=['train'],
            min_count=hparams.min_word_count,
            max_length=hparams.max_input_length,  # these vals are inside verbal_hard.json
            split_by_spaces=hparams.split_by_spaces,
            prefix='noroom' if hasattr(hparams, 'no_room') and hparams.no_room else 'asknav'),
            train_vocab_path)  # built using the .json files in the same dir: data/asknav/...train

def build_data(config):
    """
    Procedure to build data

    Args:
        config: defines attributes needed in the function

    Returns:
        creates vocab files from the datasets
        creates a npz embedding file from trimmed glove vectors
    """
    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev = CoNLLDataset(config.dev_filename, processing_word)
    test = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build word and tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)
    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)
    vocab = list(vocab)
    vocab.insert(0, PAD)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    # Build and save char vocab
    train = CoNLLDataset(config.train_filename, processing_word)
    vocab_chars = get_char_vocab(train)
    vocab_chars = list(vocab_chars)
    vocab_chars.insert(0, PAD)
    write_vocab(vocab_chars, config.chars_filename)

    # Build and save type vocab
    vocab_types = set()
    print(len(vocab_tags))
    for tag in vocab_tags:
        if tag != 'O':
            vocab_types.add(tag[2:])
    write_vocab(vocab_types, config.types_filename)

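# Hypothetical usage sketch (not part of the original source). The config object
# only needs the attributes build_data() above actually reads; every filename and
# the embedding dimension here are placeholders.
from types import SimpleNamespace

example_config = SimpleNamespace(
    train_filename='data/train.conll', dev_filename='data/dev.conll',
    test_filename='data/test.conll',
    glove_filename='data/glove.6B.300d.txt', trimmed_filename='data/glove.trimmed.npz',
    words_filename='data/words.txt', tags_filename='data/tags.txt',
    chars_filename='data/chars.txt', types_filename='data/types.txt',
    dim=300)
build_data(example_config)
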
def main():
    config = utils.Config()

    utils.mkdir(os.path.join(config.getpath("data"), "rstdt-vocab"))

    filenames = []
    for filename in os.listdir(os.path.join(config.getpath("data"), "rstdt", "wsj", "train")):
        filenames.append(os.path.join(config.getpath("data"), "rstdt", "wsj", "train", filename))
    for filename in os.listdir(os.path.join(config.getpath("data"), "rstdt", "wsj", "test")):
        filenames.append(os.path.join(config.getpath("data"), "rstdt", "wsj", "test", filename))
    filenames = [n for n in filenames if n.endswith(".labeled.bin.ctree")]
    filenames.sort()

    relation_mapper = treetk.rstdt.RelationMapper()

    frelations = []
    crelations = []
    nuclearities = []
    for filename in pyprind.prog_bar(filenames):
        sexp = utils.read_lines(filename, process=lambda line: line)
        sexp = treetk.preprocess(sexp)
        tree = treetk.rstdt.postprocess(
            treetk.sexp2tree(sexp, with_nonterminal_labels=True, with_terminal_labels=False))
        nodes = treetk.traverse(tree, order="pre-order", include_terminal=False, acc=None)
        part_frelations = []
        part_crelations = []
        part_nuclearities = []
        for node in nodes:
            relations_ = node.relation_label.split("/")
            part_frelations.extend(relations_)
            part_crelations.extend([relation_mapper.f2c(r) for r in relations_])
            part_nuclearities.append(node.nuclearity_label)
        part_frelations.append("<root>")
        part_crelations.append("<root>")
        frelations.append(part_frelations)
        crelations.append(part_crelations)
        nuclearities.append(part_nuclearities)

    fcounter = utils.get_word_counter(lines=frelations)
    ccounter = utils.get_word_counter(lines=crelations)
    ncounter = utils.get_word_counter(lines=nuclearities)
    frelations = fcounter.most_common()  # list of (str, int)
    crelations = ccounter.most_common()  # list of (str, int)
    nuclearities = ncounter.most_common()  # list of (str, int)

    utils.write_vocab(os.path.join(config.getpath("data"), "rstdt-vocab", "relations.fine.vocab.txt"), frelations)
    utils.write_vocab(os.path.join(config.getpath("data"), "rstdt-vocab", "relations.coarse.vocab.txt"), crelations)
    utils.write_vocab(os.path.join(config.getpath("data"), "rstdt-vocab", "nuclearities.vocab.txt"), nuclearities)

def preprocess1(top=0):
    print("Load data.")
    # X_text, Y, _, _ = data.load_data_and_labels_from_csv(dataset="yelp_review_polarity")
    # print("Y:", Y[:10])
    X_1, X_2, Y = data.load_quora_data(top=top)
    print("X_1.size:", len(X_1))
    print("X_2.size:", len(X_2))
    print("Y.size:", len(Y))

    max_X1 = max([len(x.split(" ")) for x in X_1])
    max_X2 = max([len(x.split(" ")) for x in X_2])

    vocab1 = learn.preprocessing.VocabularyProcessor(MAX_LEN)
    vocab2 = learn.preprocessing.VocabularyProcessor(MAX_LEN)
    X1 = np.array(list(vocab1.fit_transform(X_1)))
    X2 = np.array(list(vocab2.fit_transform(X_2)))
    Y = np.array(Y)
    write_vocab(vocab1, "./data/quora/vocab1.csv")
    write_vocab(vocab2, "./data/quora/vocab2.csv")
    print("X_1.size:", X1.shape)
    print("X_2.size:", X2.shape)
    print("Y.size:", Y.shape)

    print("==================")
    print("Train/Test split")
    # X = np.stack((X1, X2), axis=1)
    # print("X1.shape:", X1)
    # print("X2.shape:", X2)
    shuffle_idx = np.random.permutation(np.arange(len(Y)))
    x1_all = X1[shuffle_idx]
    x2_all = X2[shuffle_idx]
    y_all = Y[shuffle_idx]
    test_sample_idx = -1 * int(TEST_SPLIT * float(len(y_all)))
    x1_train, x1_test = x1_all[:test_sample_idx], x1_all[test_sample_idx:]
    x2_train, x2_test = x2_all[:test_sample_idx], x2_all[test_sample_idx:]
    y_train, y_test = y_all[:test_sample_idx], y_all[test_sample_idx:]

    val_sample_idx = -1 * int(VALIDATION_SPLIT * float(len(y_train)))
    x1_train, x1_val = x1_train[:val_sample_idx], x1_train[val_sample_idx:]
    x2_train, x2_val = x2_train[:val_sample_idx], x2_train[val_sample_idx:]
    y_train, y_val = y_train[:val_sample_idx], y_train[val_sample_idx:]

    # ret = train_test_split(X, Y, test_size=TEST_SPLIT, random_state=RNG_SEED)
    # X_train, X_test, y_train, y_test = ret
    # ret = train_test_split(X_train, y_train, test_size=VALIDATION_SPLIT,
    #                        random_state=RNG_SEED)
    # X_train, X_val, y_train, y_val = ret
    """
    x1_train = X_train[:, 0]
    x2_train = X_train[:, 1]
    x1_val = X_train[:, 0]
    x2_val = X_train[:, 1]
    x1_test = X_test[:, 0]
    x2_test = X_test[:, 1]
    """
    """
    # Shuffle the data
    print("Shuffle the data.")
    # np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(Y)))
    x1_shuffled = X1[shuffle_indices]
    x2_shuffled = X2[shuffle_indices]
    y_shuffled = Y[shuffle_indices]

    # Split train/test set
    print("Split train/test set")
    dev_sample_index = -1 * int(0.1 * float(len(Y)))
    x1_train, x1_dev = x1_shuffled[:dev_sample_index], x1_shuffled[dev_sample_index:]
    x2_train, x2_dev = x2_shuffled[:dev_sample_index], x2_shuffled[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]

    del X1, X2, Y, x1_shuffled, x2_shuffled, y_shuffled
    """

    print("Vocab 1 Size: {:d}".format(len(vocab1.vocabulary_)))
    print("Vocab 2 Size: {:d}".format(len(vocab2.vocabulary_)))
    print("Train/Val/Test split: {:d}/{:d}/{:d}".format(
        len(y_train), len(y_val), len(y_test)))
    return (x1_train, x2_train, y_train,
            x1_val, x2_val, y_val,
            x1_test, x2_test, y_test,
            vocab1, vocab2)

def main(args):
    config = utils.Config()

    utils.mkdir(os.path.join(config.getpath("data"), "rstdt-vocab"))

    filenames = os.listdir(os.path.join(config.getpath("data"), "rstdt", "renamed"))
    filenames = [n for n in filenames if n.endswith(".edus")]
    filenames.sort()

    # Concat
    filepaths = [os.path.join(config.getpath("data"), "rstdt", "tmp.preprocessing",
                              filename + ".tokenized.lowercased.replace_digits")
                 for filename in filenames]
    textpreprocessor.concat.run(
        filepaths,
        os.path.join(config.getpath("data"), "rstdt", "tmp.preprocessing",
                     "concat.tokenized.lowercased.replace_digits"))

    # Build vocabulary
    if args.with_root:
        special_words = ["<root>"]
    else:
        special_words = []
    textpreprocessor.create_vocabulary.run(
        os.path.join(config.getpath("data"), "rstdt", "tmp.preprocessing",
                     "concat.tokenized.lowercased.replace_digits"),
        os.path.join(config.getpath("data"), "rstdt-vocab", "words.vocab.txt"),
        prune_at=50000, min_count=-1, special_words=special_words, with_unk=True)

    # Build vocabulary for fine-grained/coarse-grained relations
    relation_mapper = treetk.rstdt.RelationMapper()
    frelations = []
    crelations = []
    nuclearities = []
    for filename in filenames:
        sexp = utils.read_lines(os.path.join(config.getpath("data"), "rstdt", "renamed",
                                             filename.replace(".edus", ".labeled.bin.ctree")),
                                process=lambda line: line)
        sexp = treetk.preprocess(sexp)
        tree = treetk.rstdt.postprocess(
            treetk.sexp2tree(sexp, with_nonterminal_labels=True, with_terminal_labels=False))
        nodes = treetk.traverse(tree, order="pre-order", include_terminal=False, acc=None)
        part_frelations = []
        part_crelations = []
        part_nuclearities = []
        for node in nodes:
            relations_ = node.relation_label.split("/")
            part_frelations.extend(relations_)
            part_crelations.extend([relation_mapper.f2c(r) for r in relations_])
            part_nuclearities.append(node.nuclearity_label)
        if args.with_root:
            part_frelations.append("<root>")
            part_crelations.append("<root>")
        frelations.append(part_frelations)
        crelations.append(part_crelations)
        nuclearities.append(part_nuclearities)

    fcounter = utils.get_word_counter(lines=frelations)
    ccounter = utils.get_word_counter(lines=crelations)
    ncounter = utils.get_word_counter(lines=nuclearities)
    frelations = fcounter.most_common()  # list of (str, int)
    crelations = ccounter.most_common()  # list of (str, int)
    nuclearities = ncounter.most_common()  # list of (str, int)

    utils.write_vocab(os.path.join(config.getpath("data"), "rstdt-vocab", "relations.fine.vocab.txt"), frelations)
    utils.write_vocab(os.path.join(config.getpath("data"), "rstdt-vocab", "relations.coarse.vocab.txt"), crelations)
    utils.write_vocab(os.path.join(config.getpath("data"), "rstdt-vocab", "nuclearities.vocab.txt"), nuclearities)