def test_read_train(self):
    """Check load_lm_data on a two-sentence training corpus with cut_threshold=1.

    The expectations encoded below: "i" and "am" (seen in both sentences)
    keep their own ids, while "Philip" and "student" (seen once each) are
    expected to come back as the unknown id. Each sample is expected to be
    an (input ids, next-word ids) pair, where the input starts with "<s>"
    and the next-word sequence ends with "</s>".
    """
    sentences = ["I am Philip", "I am student"]
    vocab, samples = load_lm_data(sentences, cut_threshold=1)

    # Build the reference vocabulary; indexing is used purely to register
    # the tokens, in this exact order.
    expected_vocab = Vocabulary()
    for token in ("<s>", "</s>", "i", "am"):
        expected_vocab[token]

    # Both sentences share the same expected id sequences.
    expected_inputs = [
        [
            expected_vocab["<s>"],
            expected_vocab["i"],
            expected_vocab["am"],
            expected_vocab.unk_id(),
        ]
        for _ in sentences
    ]
    expected_targets = [
        [
            expected_vocab["i"],
            expected_vocab["am"],
            expected_vocab.unk_id(),
            expected_vocab["</s>"],
        ]
        for _ in sentences
    ]
    expected_samples = list(zip(expected_inputs, expected_targets))

    self.assertVocEqual(vocab, expected_vocab)
    self.assertEqual(samples, expected_samples)
# Remaining command-line options for language-model training.
parser.add_argument(
    "--model",
    type=str,
    choices=["lstm"],
    default="lstm",
    help="Type of model being trained.",
)
parser.add_argument(
    "--unk_cut",
    type=int,
    default=1,
    help="Threshold for words in corpora to be treated as unknown.",
)
parser.add_argument(
    "--dropout",
    type=positive_decimal,
    default=0.2,
    help="Dropout ratio for LSTM.",
)
parser.add_argument(
    "--seed",
    type=int,
    default=0,
    help="Seed for RNG. 0 for totally random seed.",
)
parser.add_argument(
    "--dev",
    type=str,
    help="Development data.",
)
args = parser.parse_args()

# --use_cpu overrides whatever GPU was selected by forcing the id to -1
# (presumably the "no GPU" value -- confirm against ParallelTrainer).
if args.use_cpu:
    args.gpu = -1

# ---- Training ----
trainer = ParallelTrainer(args.seed, args.gpu)

# Training data: the corpus is read from standard input, then grouped
# into batches of args.batch.
UF.trace("Loading corpus + dictionary")
X, data = load_lm_data(sys.stdin, cut_threshold=args.unk_cut)
data = list(batch_generator(data, (X, X), args.batch))
UF.trace("INPUT size:", len(X))
UF.trace("Data loaded.")

# Development data is optional. X is passed to the loader, presumably so
# the dev sentences are indexed with the training vocabulary.
if args.dev:
    with open(args.dev) as dev_fp:
        UF.trace("Loading dev data")
        _, dev_data = load_lm_data(dev_fp, X)
        dev_data = list(batch_generator(dev_data, (X, X), args.batch))
        UF.trace("Dev data loaded")
else:
    dev_data = None

# ---- Setup model ----
UF.trace("Setting up classifier")
pass  # NOTE(review): tail of a definition that starts above this excerpt; left untouched.


def onSingleUpdate(ctr, src, trg):
    """Print the result for a single decoded item.

    For the "gen" operation the first element of ``trg`` is rendered with
    ``VOC.str_rpr``; for "sppl" the value of ``PPL(trg)`` is printed. Other
    operations print nothing here. ``ctr`` and ``src`` are part of the
    callback signature but are not used.
    """
    if op == "gen":
        print(VOC.str_rpr(trg[0]))
    elif op == "sppl":
        print(PPL(trg))


def onDecodingFinish(data, output):
    """Print the final result once decoding is finished.

    For "gen", ``output`` is iterated as a mapping from an id to an
    ``(input, output)`` pair and every output is printed in ascending id
    order. For "cppl", the corpus perplexity ``PPL(output)`` is traced and
    printed. ``data`` is part of the callback signature but is not used.
    """
    if op == "gen":
        # Render with VOC: it is the vocabulary handed to Tester as
        # out_vocab below and the one onSingleUpdate uses. The previous
        # reference to TRG does not match anything else in this section.
        # NOTE(review): confirm no separate TRG vocabulary is intended.
        for _src_id, (_inp, out) in sorted(output.items(), key=lambda item: item[0]):
            print(VOC.str_rpr(out))
    elif op == "cppl":
        # Compute the perplexity once and reuse it for the trace and the print.
        corpus_ppl = PPL(output)
        UF.trace("Corpus PPL:", corpus_ppl)
        print(corpus_ppl)


tester = Tester(
    load_lm_gen_data,
    VOC,
    onDecodingStart,
    onBatchUpdate,
    onSingleUpdate,
    onDecodingFinish,
    batch=args.batch,
    out_vocab=VOC,
    options=decoding_options,
)

if op in ("sppl", "cppl"):
    # Perplexity operations read the corpus from --src when it is given,
    # otherwise from standard input.
    if args.src:
        with open(args.src) as src_fp:
            _, data = load_lm_data(src_fp, VOC)
    else:
        _, data = load_lm_data(sys.stdin, VOC)
    data = list(batch_generator(data, (VOC, VOC), args.batch))
    tester.eval(data, model)
elif op == "gen":
    tester.test(args.src, model)
else:
    # Build one message string; passing op as a second positional argument
    # made the exception text render as a tuple.
    raise NotImplementedError("Undefined operation: %s" % (op,))