# data (preproc) max_len = cfg['data']['max_len'] min_freq = cfg['data']['min_freq'] # load dataset logger.info("### Load dataset ###") in_lang_path = f"cache/in-fra-{max_len}-{min_freq}.pkl" out_lang_path = f"cache/out-eng-{max_len}-{min_freq}.pkl" pair_path = f"cache/fra2eng-{max_len}.pkl" exist_all = all( os.path.exists(path) for path in [in_lang_path, out_lang_path, pair_path]) if not exist_all: data_prepare.prepare(max_len, min_freq) input_lang = Lang.load_from_file("fra", in_lang_path) output_lang = Lang.load_from_file("eng", out_lang_path) pairs = pickle.load(open(pair_path, "rb")) logger.info("\tinput_lang.n_words = {}".format(input_lang.n_words)) logger.info("\toutput_lang.n_words = {}".format(output_lang.n_words)) logger.info("\t# of pairs = {}".format(len(pairs))) dset = TranslationDataset(input_lang, output_lang, pairs, max_len) logger.info(random.choice(pairs)) # split dset by valid indices N_pairs = len(pairs) val_indices_path = f"cache/valid_indices-{N_pairs}.npy" if not os.path.exists(val_indices_path): data_prepare.gen_valid_indices(N_pairs, 0.1, val_indices_path) valid_indices = np.load(val_indices_path) train_indices = list(set(range(len(dset))) - set(valid_indices))