def single_decoding(self, data, model):
    self.onDecodingStart()
    for i, line in enumerate(data):
        # Wrap the single sentence into a batch of size one, then unwrap it
        inp = list(batch_generator(self.loader([line.strip()], self._inp_vocab),
                                   (self._inp_vocab,), 1))[0][0]
        out = model.classify(inp, **self._decoding_options)
        self.onSingleUpdate(i, inp, out)
def batched_decoding(self, data, model):
    # Load data
    with open(data) as inp_fp:
        data = self.loader(inp_fp, self._inp_vocab)
    self.onDecodingStart()
    # Start Decoding
    output = {}
    data = batch_generator(data, (self._inp_vocab,), batch_size=self._batch)
    ctr = 0
    for src, src_id in data:
        trg = model.classify(src, **self._decoding_options)
        # Collecting output
        for src_i, trg_i, id_i in zip(src, trg, src_id):
            output[id_i] = src_i, trg_i
        self.onBatchUpdate(ctr, src, trg)
        ctr += len(src)
    self.onDecodingFinish(data, output)
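# Both decoding paths above depend on batch_generator, whose definition is not
# shown in this excerpt. The sketch below is only a guess at its contract,
# assuming it converts each example's fields to ids with the matching
# vocabulary, groups examples into batches of batch_size, and yields each
# batch together with the original example indices (matching the
# "for src, src_id in data" loop above). The name batch_generator_sketch and
# all details are assumptions, not the repo's actual implementation.
def batch_generator_sketch(data, vocabs, batch_size=1):
    batch, ids = [], []
    for index, example in enumerate(data):
        # Assumed: each vocabulary maps a token to an integer id via __getitem__
        batch.append(tuple([vocab[tok] for tok in field]
                           for field, vocab in zip(example, vocabs)))
        ids.append(index)
        if len(batch) == batch_size:
            yield batch, ids
            batch, ids = [], []
    if batch:
        yield batch, ids  # flush the final, possibly smaller batch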
if args.use_cpu:
    args.gpu = -1
if args.save_models:
    args.save_len = 1

""" Training """
trainer = ParallelTrainer(args.seed, args.gpu)

# data
UF.trace("Loading corpus + dictionary")
with open(args.src) as src_fp:
    with open(args.trg) as trg_fp:
        SRC, TRG, train_data = load_nmt_train_data(src_fp, trg_fp,
                                                   cut_threshold=args.unk_cut)
        train_data = list(batch_generator(train_data, (SRC, TRG), args.batch))
UF.trace("SRC size:", len(SRC))
UF.trace("TRG size:", len(TRG))
UF.trace("Data loaded.")

# dev data
dev_data = None
if args.src_dev and args.trg_dev:
    with open(args.src_dev) as src_fp:
        with open(args.trg_dev) as trg_fp:
            UF.trace("Loading dev data")
            # Pass the training vocabularies so the dev set is indexed with them
            _, _, dev_data = load_nmt_train_data(src_fp, trg_fp, SRC, TRG)
            dev_data = list(batch_generator(dev_data, (SRC, TRG), args.batch))
            UF.trace("Dev data loaded")

""" Setup model """
parser.add_argument("--model",type=str,choices=["lstm"], default="lstm", help="Type of model being trained.") parser.add_argument("--unk_cut", type=int, default=1, help="Threshold for words in corpora to be treated as unknown.") parser.add_argument("--dropout", type=positive_decimal, default=0.2, help="Dropout ratio for LSTM.") parser.add_argument("--seed", type=int, default=0, help="Seed for RNG. 0 for totally random seed.") args = parser.parse_args() if args.use_cpu: args.gpu = -1 """ Training """ trainer = ParallelTrainer(args.seed, args.gpu) # data UF.trace("Loading corpus + dictionary") X, Y, data = load_pos_train_data(sys.stdin, cut_threshold=args.unk_cut) data = list(batch_generator(data, (X, Y), args.batch)) UF.trace("INPUT size:", len(X)) UF.trace("LABEL size:", len(Y)) UF.trace("Data loaded.") """ Setup model """ UF.trace("Setting up classifier") opt = optimizers.Adam() model = ParallelTextClassifier(args, X, Y, opt, args.gpu, activation=F.relu, collect_output=args.verbose) """ Training Callback """ def onEpochStart(epoch): UF.trace("Starting Epoch", epoch+1) def report(output, src, trg, trained, epoch): for index in range(len(src)):
def setUp(self):
    # Toy parallel corpus; cut_threshold=0 keeps every word in the vocabularies
    src = ["I am Philip .", "I am a student ."]
    trg = ["私 は フィリップ です .", "私 は 学生 です ."]
    SRC, TRG, data = load_nmt_train_data(src, trg, cut_threshold=0)
    self.model = Attentional(SRC, TRG, Args(SRC, TRG))
    self.data = batch_generator(data, (SRC, TRG), 1)
parser.add_argument("--unk_cut", type=int, default=1, help="Threshold for words in corpora to be treated as unknown.") parser.add_argument("--dropout", type=positive_decimal, default=0.2, help="Dropout ratio for LSTM.") parser.add_argument("--seed", type=int, default=0, help="Seed for RNG. 0 for totally random seed.") parser.add_argument("--dev", type=str, help="Development data.") args = parser.parse_args() if args.use_cpu: args.gpu = -1 """ Training """ trainer = ParallelTrainer(args.seed, args.gpu) # data UF.trace("Loading corpus + dictionary") X, data = load_lm_data(sys.stdin, cut_threshold=args.unk_cut) data = list(batch_generator(data, (X, X), args.batch)) UF.trace("INPUT size:", len(X)) UF.trace("Data loaded.") # dev data dev_data = None if args.dev: with open(args.dev) as dev_fp: UF.trace("Loading dev data") _, dev_data = load_lm_data(dev_fp, X) dev_data = list(batch_generator(dev_data, (X, X), args.batch)) UF.trace("Dev data loaded") """ Setup model """ UF.trace("Setting up classifier") opt = optimizers.Adam()
    pass

def onSingleUpdate(ctr, src, trg):
    if op == "gen":
        print(VOC.str_rpr(trg[0]))
    elif op == "sppl":
        print(PPL(trg))

def onDecodingFinish(data, output):
    if op == "gen":
        for src_id, (inp, out) in sorted(output.items(), key=lambda x: x[0]):
            print(VOC.str_rpr(out))  # VOC is the only vocabulary in this script
    elif op == "cppl":
        UF.trace("Corpus PPL:", PPL(output))
        print(PPL(output))

tester = Tester(load_lm_gen_data, VOC, onDecodingStart, onBatchUpdate,
                onSingleUpdate, onDecodingFinish, batch=args.batch,
                out_vocab=VOC, options=decoding_options)

if op == "sppl" or op == "cppl":
    if not args.src:
        _, data = load_lm_data(sys.stdin, VOC)
    else:
        with open(args.src) as src_fp:
            _, data = load_lm_data(src_fp, VOC)
    data = list(batch_generator(data, (VOC, VOC), args.batch))
    tester.eval(data, model)
elif op == "gen":
    tester.test(args.src, model)
else:
    raise NotImplementedError("Undefined operation:", op)
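# PPL above is assumed to compute perplexity. For a sequence of per-word
# probabilities p(w_1), ..., p(w_N), perplexity is
# exp(-(1/N) * sum_i log p(w_i)). A minimal sketch under that assumption;
# the repo's actual PPL may expect a different input shape (e.g. the
# decoder's raw output rather than plain probabilities).
import math

def perplexity(word_probs):
    """word_probs: iterable of per-word probabilities in (0, 1]."""
    log_probs = [math.log(p) for p in word_probs]
    return math.exp(-sum(log_probs) / len(log_probs))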