= dataprocessor.load_translation_data(dataset=args.dataset, bleu=args.bleu, args=args) dataprocessor.write_sentences(test_tgt_sentences, os.path.join(args.save_dir, 'test_gt.txt')) data_train = data_train.transform(lambda src, tgt: (src, tgt, len(src), len(tgt)), lazy=False) data_val = gluon.data.SimpleDataset([(ele[0], ele[1], len(ele[0]), len(ele[1]), i) for i, ele in enumerate(data_val)]) data_test = gluon.data.SimpleDataset([(ele[0], ele[1], len(ele[0]), len(ele[1]), i) for i, ele in enumerate(data_test)]) data_train_lengths, data_val_lengths, data_test_lengths = [ dataprocessor.get_data_lengths(x) for x in [data_train, data_val, data_test] ] detokenizer = nlp.data.SacreMosesDetokenizer() # model prepare ctx = [mx.cpu()] if args.gpu is None else [mx.gpu(args.gpu)] if args.src_max_len <= 0 or args.tgt_max_len <= 0: max_len = np.max([ np.max(data_train_lengths, axis=0), np.max(data_val_lengths, axis=0), np.max(data_test_lengths, axis=0) ], axis=0)
# NOTE(review): this chunk was collapsed onto one physical line; statement
# boundaries restored below, tokens unchanged.
logging_config(args.save_dir)
logging.info(args)

# Data pipeline: load the parallel corpus for the chosen dataset and dump
# the reference (ground-truth) test targets for later BLEU scoring.
data_train, data_val, data_test, val_tgt_sentences, test_tgt_sentences, src_vocab, tgt_vocab \
    = dataprocessor.load_translation_data(dataset=args.dataset, bleu=args.bleu, args=args)
dataprocessor.write_sentences(test_tgt_sentences, os.path.join(args.save_dir, 'test_gt.txt'))
# Attach source/target lengths to each training pair (eager, so lengths are
# available for bucketing); val/test additionally carry their sample index
# so predictions can be re-ordered back to corpus order later.
data_train = data_train.transform(lambda src, tgt: (src, tgt, len(src), len(tgt)), lazy=False)
data_val = gluon.data.SimpleDataset([(ele[0], ele[1], len(ele[0]), len(ele[1]), i)
                                     for i, ele in enumerate(data_val)])
data_test = gluon.data.SimpleDataset([(ele[0], ele[1], len(ele[0]), len(ele[1]), i)
                                      for i, ele in enumerate(data_test)])
data_train_lengths, data_val_lengths, data_test_lengths = \
    [dataprocessor.get_data_lengths(x) for x in [data_train, data_val, data_test]]
detokenizer = nlp.data.SacreMosesDetokenizer()

# model prepare
# Multi-GPU variant: args.gpus is a comma-separated id string; empty/None
# falls back to CPU.
ctx = [mx.cpu()] if args.gpus is None or args.gpus == '' else \
    [mx.gpu(int(x)) for x in args.gpus.split(',')]
num_ctxs = len(ctx)
if args.src_max_len <= 0 or args.tgt_max_len <= 0:
    # Element-wise max over [src_len, tgt_len] across all three splits,
    # used as the fallback when no explicit max length was given.
    max_len = np.max(
        [np.max(data_train_lengths, axis=0), np.max(data_val_lengths, axis=0),
         np.max(data_test_lengths, axis=0)], axis=0)
# NOTE(review): collapsed onto one physical line and truncated — it ends at a
# dangling `else:` whose suite (presumably `tgt_max_len = max_len[1]`, by
# symmetry with the src branch) lies outside this view; left byte-identical
# rather than guessing the missing tail. Content appears to be a third copy of
# the setup chunk above, this one also writing the validation references
# (val_gt.txt) and resolving src_max_len/tgt_max_len from the data when the
# CLI values are non-positive. Presumably dead/merge residue — confirm which
# variant the script actually uses.
data_train, data_val, data_test, val_tgt_sentences, test_tgt_sentences, src_vocab, tgt_vocab \ = dataprocessor.load_translation_data(dataset=args.dataset, bleu=args.bleu, args=args) dataprocessor.write_sentences(val_tgt_sentences, os.path.join(args.save_dir, 'val_gt.txt')) dataprocessor.write_sentences(test_tgt_sentences, os.path.join(args.save_dir, 'test_gt.txt')) data_train = data_train.transform(lambda src, tgt: (src, tgt, len(src), len(tgt)), lazy=False) data_val = gluon.data.SimpleDataset([(ele[0], ele[1], len(ele[0]), len(ele[1]), i) for i, ele in enumerate(data_val)]) data_test = gluon.data.SimpleDataset([(ele[0], ele[1], len(ele[0]), len(ele[1]), i) for i, ele in enumerate(data_test)]) ctx = [mx.cpu()] if args.gpus is None or args.gpus == '' else \ [mx.gpu(int(x)) for x in args.gpus.split(',')] data_train_lengths, data_val_lengths, data_test_lengths = [dataprocessor.get_data_lengths(x) for x in [data_train, data_val, data_test]] if args.src_max_len <= 0 or args.tgt_max_len <= 0: max_len = np.max( [np.max(data_train_lengths, axis=0), np.max(data_val_lengths, axis=0), np.max(data_test_lengths, axis=0)], axis=0) if args.src_max_len > 0: src_max_len = args.src_max_len else: src_max_len = max_len[0] if args.tgt_max_len > 0: tgt_max_len = args.tgt_max_len else: