def setup_model(args):
    sys.stderr.write(json.dumps(args.__dict__, indent=2) + '\n')
    setup = json.load(open(args.model_setup))
    sys.stderr.write(json.dumps(setup, indent=2) + '\n')

    # Load a dataset
    dataset = setup['dataset']
    if dataset == 'dbpedia':
        train, test, vocab = text_datasets.get_dbpedia(
            char_based=setup['char_based'])
    elif dataset == 'snli':
        train, test, vocab = text_datasets.get_snli(
            char_based=setup['char_based'])
    elif dataset.startswith('imdb.'):
        train, test, vocab = text_datasets.get_imdb(
            fine_grained=dataset.endswith('.fine'),
            char_based=setup['char_based'])
    elif dataset in ['TREC', 'stsa.binary', 'stsa.fine',
                     'custrev', 'mpqa', 'rt-polarity', 'subj']:
        train, test, vocab = text_datasets.get_other_text_dataset(
            dataset, char_based=setup['char_based'])
    vocab = json.load(open(setup['vocab_path']))

    n_class = setup['n_class']
    print('# train data: {}'.format(len(train)))
    print('# test data: {}'.format(len(test)))
    print('# vocab: {}'.format(len(vocab)))
    print('# class: {}'.format(n_class))

    # Setup a model
    if setup['model'] == 'rnn':
        Encoder = nets.RNNEncoder
    elif setup['model'] == 'bilstm':
        Encoder = nets.BiLSTMEncoder
    elif setup['model'] == 'cnn':
        Encoder = nets.CNNEncoder
    elif setup['model'] == 'bow':
        Encoder = nets.BOWMLPEncoder
    encoder = Encoder(n_layers=setup['layer'], n_vocab=len(vocab),
                      n_units=setup['unit'], dropout=setup['dropout'])
    if dataset == 'snli':
        model = nets.SNLIClassifier(encoder)
    else:
        model = nets.TextClassifier(encoder, n_class)
    chainer.serializers.load_npz(setup['model_path'], model)

    if args.gpu >= 0:
        # Make a specified GPU current
        chainer.backends.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()  # Copy the model to the GPU
    return model, train, test, vocab, setup
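# --- Hedged usage sketch (not part of the original source) ---
# setup_model expects an argparse-style namespace with at least `model_setup`
# (the path to the args.json written by the training script below) and `gpu`.
# The wrapper and flag names here are assumptions for illustration only.
def load_trained_model(model_setup_path, gpu=-1):
    parser = argparse.ArgumentParser()
    parser.add_argument('--model-setup',
                        help='Path to args.json saved at training time')
    parser.add_argument('--gpu', type=int, default=-1)
    cli_args = parser.parse_args(['--model-setup', model_setup_path,
                                  '--gpu', str(gpu)])
    return setup_model(cli_args)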
def main():
    parser = create_parser()
    args = parser.parse_args()
    current_datetime = '{}'.format(datetime.datetime.today())

    # Load a dataset
    if args.dataset == 'dbpedia':
        train, test, vocab = text_datasets.get_dbpedia(
            char_based=args.char_based)
    elif args.dataset == 'snli':
        train, test, vocab = text_datasets.get_snli(
            char_based=args.char_based)
    elif args.dataset.startswith('imdb.'):
        train, test, vocab = text_datasets.get_imdb(
            fine_grained=args.dataset.endswith('.fine'),
            char_based=args.char_based)
    elif args.dataset in ['TREC', 'stsa.binary', 'stsa.fine',
                          'custrev', 'mpqa', 'rt-polarity', 'subj']:
        train, test, vocab = text_datasets.get_other_text_dataset(
            args.dataset, char_based=args.char_based)

    # Calibration data is held out of training for calibrated DkNN /
    # temperature scaling
    train_idx = list(range(len(train)))
    calibration_idx = sorted(random.sample(train_idx, 1000))
    calibration = [train[i] for i in calibration_idx]
    train = [x for i, x in enumerate(train) if i not in calibration_idx]

    print('# train data: {}'.format(len(train)))
    print('# test data: {}'.format(len(test)))
    print('# vocab: {}'.format(len(vocab)))
    if args.dataset == 'snli':
        n_class = 3
    else:
        n_class = len(set([int(d[1]) for d in train]))
    print('# class: {}'.format(n_class))

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test, args.batchsize,
                                                 repeat=False, shuffle=False)

    # Save vocabulary and model's setting
    current = os.path.dirname(os.path.abspath(__file__))
    save_path = os.path.join(
        current, args.out, '{}_{}'.format(args.dataset, args.model))
    if not os.path.isdir(args.out):
        os.mkdir(args.out)
    if not os.path.isdir(save_path):
        os.mkdir(save_path)
    args.save_path = save_path
    vocab_path = os.path.join(save_path, 'vocab.json')
    model_path = os.path.join(save_path, 'best_model.npz')
    setup_path = os.path.join(save_path, 'args.json')
    calib_path = os.path.join(save_path, 'calib.json')
    with open(calib_path, 'w') as f:
        json.dump(calibration_idx, f)
    with open(vocab_path, 'w') as f:
        json.dump(vocab, f)
    model_setup = args.__dict__
    model_setup['vocab_path'] = vocab_path
    model_setup['model_path'] = model_path
    model_setup['n_class'] = n_class
    model_setup['datetime'] = current_datetime
    with open(setup_path, 'w') as f:
        json.dump(model_setup, f)
    print(json.dumps(model_setup, indent=2))

    # Setup a model
    if args.model == 'rnn':
        Encoder = nets.RNNEncoder
    elif args.model == 'bilstm':
        Encoder = nets.BiLSTMEncoder
    elif args.model == 'cnn':
        Encoder = nets.CNNEncoder
    elif args.model == 'bow':
        Encoder = nets.BOWMLPEncoder
    encoder = Encoder(n_layers=args.layer, n_vocab=len(vocab),
                      n_units=args.unit, dropout=args.dropout)
    if args.dataset == 'snli':
        model = nets.SNLIClassifier(encoder)
    else:
        model = nets.TextClassifier(encoder, n_class)

    # Load pre-trained word vectors, if given
    if args.word_vectors:
        print('loading word vectors')
        with open(args.word_vectors, 'r') as fi:
            for line in fi:
                line_list = line.strip().split(' ')
                word = line_list[0]
                if word in vocab:
                    vec = model.xp.array(line_list[1:], dtype=np.float32)
                    model.encoder.embed.W.data[vocab[word]] = vec
    else:
        print('WARNING: no word vectors loaded')

    if args.gpu >= 0:
        # Make a specified GPU current
        chainer.backends.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()  # Copy the model to the GPU

    # Setup an optimizer
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)
    # optimizer.add_hook(chainer.optimizer.WeightDecay(1e-4))

    # Set up a trainer
    if args.dataset == 'snli':
        converter = convert_snli_seq
    else:
        converter = convert_seq
    updater = training.updaters.StandardUpdater(
        train_iter, optimizer, converter=converter, device=args.gpu)
    trainer = training.Trainer(
        updater, (args.epoch, 'epoch'),
        out=os.path.join(args.out, '{}_{}'.format(args.dataset, args.model)))

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(extensions.Evaluator(
        test_iter, model, converter=converter, device=args.gpu))

    # Take a best snapshot
    record_trigger = training.triggers.MaxValueTrigger(
        'validation/main/accuracy', (1, 'epoch'))
    trainer.extend(extensions.snapshot_object(model, 'best_model.npz'),
                   trigger=record_trigger)

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.PrintReport([
        'epoch', 'main/loss', 'validation/main/loss',
        'main/accuracy', 'validation/main/accuracy', 'elapsed_time']))

    # Print a progress bar to stdout
    trainer.extend(extensions.ProgressBar())

    # Build a reverse dictionary (index -> word)
    idx2word = {}
    for word, idx in vocab.items():
        idx2word[idx] = word

    # Run the training
    trainer.run()
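# Note (added cross-reference, based only on the code above): the files this
# training script writes under save_path -- vocab.json, best_model.npz (via the
# best-accuracy snapshot trigger), args.json, and calib.json -- are the
# artifacts that setup_model at the top reads back through setup['vocab_path']
# and setup['model_path'].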
def main():
    current_datetime = '{}'.format(datetime.datetime.today())
    parser = argparse.ArgumentParser(
        description='Chainer example: Text Classification')
    parser.add_argument('--batchsize', '-b', type=int, default=64,
                        help='Number of examples in each mini-batch')
    parser.add_argument('--epoch', '-e', type=int, default=30,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--unit', '-u', type=int, default=300,
                        help='Number of units')
    parser.add_argument('--layer', '-l', type=int, default=1,
                        help='Number of layers of RNN or MLP following CNN')
    parser.add_argument('--dropout', '-d', type=float, default=0.4,
                        help='Dropout rate')
    parser.add_argument('--dataset', '-data', default='imdb.binary',
                        choices=['dbpedia', 'imdb.binary', 'imdb.fine',
                                 'TREC', 'stsa.binary', 'stsa.fine',
                                 'custrev', 'mpqa', 'rt-polarity', 'subj'],
                        help='Name of dataset.')
    parser.add_argument('--model', '-model', default='cnn',
                        choices=['cnn', 'rnn', 'bow'],
                        help='Name of encoder model type.')
    parser.add_argument('--char-based', action='store_true')
    args = parser.parse_args()
    print(json.dumps(args.__dict__, indent=2))

    # Load a dataset
    if args.dataset == 'dbpedia':
        train, test, vocab = text_datasets.get_dbpedia(
            char_based=args.char_based)
    elif args.dataset.startswith('imdb.'):
        train, test, vocab = text_datasets.get_imdb(
            fine_grained=args.dataset.endswith('.fine'),
            char_based=args.char_based)
    elif args.dataset in ['TREC', 'stsa.binary', 'stsa.fine',
                          'custrev', 'mpqa', 'rt-polarity', 'subj']:
        train, test, vocab = text_datasets.get_other_text_dataset(
            args.dataset, char_based=args.char_based)

    print('# train data: {}'.format(len(train)))
    print('# test data: {}'.format(len(test)))
    print('# vocab: {}'.format(len(vocab)))
    n_class = len(set([int(d[1]) for d in train]))
    print('# class: {}'.format(n_class))

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test, args.batchsize,
                                                 repeat=False, shuffle=False)

    # Setup a model
    if args.model == 'rnn':
        Encoder = nets.RNNEncoder
    elif args.model == 'cnn':
        Encoder = nets.CNNEncoder
    elif args.model == 'bow':
        Encoder = nets.BOWMLPEncoder
    encoder = Encoder(n_layers=args.layer, n_vocab=len(vocab),
                      n_units=args.unit, dropout=args.dropout)
    model = nets.TextClassifier(encoder, n_class)
    if args.gpu >= 0:
        # Make a specified GPU current
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()  # Copy the model to the GPU

    # Setup an optimizer
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(1e-4))

    # Set up a trainer
    updater = training.StandardUpdater(
        train_iter, optimizer, converter=convert_seq, device=args.gpu)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(extensions.Evaluator(
        test_iter, model, converter=convert_seq, device=args.gpu))

    # Take a best snapshot
    record_trigger = training.triggers.MaxValueTrigger(
        'validation/main/accuracy', (1, 'epoch'))
    trainer.extend(extensions.snapshot_object(model, 'best_model.npz'),
                   trigger=record_trigger)

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.PrintReport([
        'epoch', 'main/loss', 'validation/main/loss',
        'main/accuracy', 'validation/main/accuracy', 'elapsed_time']))

    # Print a progress bar to stdout
    trainer.extend(extensions.ProgressBar())

    # Save vocabulary and model's setting
    if not os.path.isdir(args.out):
        os.mkdir(args.out)
    current = os.path.dirname(os.path.abspath(__file__))
    vocab_path = os.path.join(current, args.out, 'vocab.json')
    with open(vocab_path, 'w') as f:
        json.dump(vocab, f)
    model_path = os.path.join(current, args.out, 'best_model.npz')
    model_setup = args.__dict__
    model_setup['vocab_path'] = vocab_path
    model_setup['model_path'] = model_path
    model_setup['n_class'] = n_class
    model_setup['datetime'] = current_datetime
    with open(os.path.join(args.out, 'args.json'), 'w') as f:
        json.dump(args.__dict__, f)

    # Run the training
    trainer.run()
def main():
    current_datetime = '{}'.format(datetime.datetime.today())
    parser = argparse.ArgumentParser(
        description='Chainer example: Text Classification')
    parser.add_argument('--batchsize', '-b', type=int, default=64,
                        help='Number of examples in each mini-batch')
    parser.add_argument('--epoch', '-e', type=int, default=30,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--unit', '-u', type=int, default=300,
                        help='Number of units')
    parser.add_argument('--layer', '-l', type=int, default=1,
                        help='Number of layers of RNN or MLP following CNN')
    parser.add_argument('--dropout', '-d', type=float, default=0.4,
                        help='Dropout rate')
    parser.add_argument('--dataset', '-data', default='imdb.binary',
                        choices=['dbpedia', 'imdb.binary', 'imdb.fine',
                                 'TREC', 'stsa.binary', 'stsa.fine',
                                 'custrev', 'mpqa', 'rt-polarity', 'subj'],
                        help='Name of dataset.')
    parser.add_argument('--model', '-model', default='cnn',
                        choices=['cnn', 'rnn', 'bow'],
                        help='Name of encoder model type.')
    parser.add_argument('--char-based', action='store_true')
    parser.add_argument('--test', dest='test', action='store_true')
    parser.set_defaults(test=False)
    args = parser.parse_args()
    print(json.dumps(args.__dict__, indent=2))

    # Load a dataset
    if args.dataset == 'dbpedia':
        train, test, vocab = text_datasets.get_dbpedia(
            char_based=args.char_based)
    elif args.dataset.startswith('imdb.'):
        train, test, vocab = text_datasets.get_imdb(
            fine_grained=args.dataset.endswith('.fine'),
            char_based=args.char_based)
    elif args.dataset in ['TREC', 'stsa.binary', 'stsa.fine',
                          'custrev', 'mpqa', 'rt-polarity', 'subj']:
        train, test, vocab = text_datasets.get_other_text_dataset(
            args.dataset, char_based=args.char_based)
    if args.test:
        train = train[:100]
        test = test[:100]

    print('# train data: {}'.format(len(train)))
    print('# test data: {}'.format(len(test)))
    print('# vocab: {}'.format(len(vocab)))
    n_class = len(set([int(d[1]) for d in train]))
    print('# class: {}'.format(n_class))

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test, args.batchsize,
                                                 repeat=False, shuffle=False)

    # Setup a model
    if args.model == 'rnn':
        Encoder = nets.RNNEncoder
    elif args.model == 'cnn':
        Encoder = nets.CNNEncoder
    elif args.model == 'bow':
        Encoder = nets.BOWMLPEncoder
    encoder = Encoder(n_layers=args.layer, n_vocab=len(vocab),
                      n_units=args.unit, dropout=args.dropout)
    model = nets.TextClassifier(encoder, n_class)
    if args.gpu >= 0:
        # Make a specified GPU current
        chainer.backends.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()  # Copy the model to the GPU

    # Setup an optimizer
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(1e-4))

    # Set up a trainer
    updater = training.updaters.StandardUpdater(
        train_iter, optimizer, converter=convert_seq, device=args.gpu)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(extensions.Evaluator(
        test_iter, model, converter=convert_seq, device=args.gpu))

    # Take a best snapshot
    record_trigger = training.triggers.MaxValueTrigger(
        'validation/main/accuracy', (1, 'epoch'))
    trainer.extend(extensions.snapshot_object(model, 'best_model.npz'),
                   trigger=record_trigger)

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.PrintReport(
        ['epoch', 'main/loss', 'validation/main/loss',
         'main/accuracy', 'validation/main/accuracy', 'elapsed_time']))

    # Print a progress bar to stdout
    trainer.extend(extensions.ProgressBar())

    # Save vocabulary and model's setting
    if not os.path.isdir(args.out):
        os.mkdir(args.out)
    vocab_path = os.path.join(args.out, 'vocab.json')
    with open(vocab_path, 'w') as f:
        json.dump(vocab, f)
    model_path = os.path.join(args.out, 'best_model.npz')
    model_setup = args.__dict__
    model_setup['vocab_path'] = vocab_path
    model_setup['model_path'] = model_path
    model_setup['n_class'] = n_class
    model_setup['datetime'] = current_datetime
    with open(os.path.join(args.out, 'args.json'), 'w') as f:
        json.dump(args.__dict__, f)

    # Run the training
    trainer.run()
def main():
    args = {
        'gpu': -1,
        'dataset': 'imdb.binary',
        'model': 'rnn',
        'batchsize': 64,
        'epoch': 3,
        'out': 'result',
        'unit': 100,
        'layer': 1,
        'dropout': 0.4,
        'char_based': False
    }

    # Load a dataset
    if args['dataset'] == 'dbpedia':
        train, test, vocab = text_datasets.get_dbpedia(
            char_based=args['char_based'])
    elif args['dataset'].startswith('imdb.'):
        print("IMDB datasets")
        train, test, vocab = text_datasets.get_imdb(
            fine_grained=args['dataset'].endswith('.fine'),
            char_based=args['char_based'])
    elif args['dataset'] in ['TREC', 'stsa.binary', 'stsa.fine',
                             'custrev', 'mpqa', 'rt-polarity', 'subj']:
        train, test, vocab = text_datasets.get_other_text_dataset(
            args['dataset'], char_based=args['char_based'])

    print('# train data: {}'.format(len(train)))
    print('# test data: {}'.format(len(test)))
    print('# vocab: {}'.format(len(vocab)))
    n_class = len(set([int(d[1]) for d in train]))
    print('# class: {}'.format(n_class))

    # Use only the first 1000 examples to keep this run small
    train_iter = chainer.iterators.SerialIterator(train[:1000],
                                                  args['batchsize'])
    test_iter = chainer.iterators.SerialIterator(test[:1000],
                                                 args['batchsize'],
                                                 repeat=False, shuffle=False)
    # return train_iter, test_iter

    # Setup a model
    if args['model'] == 'rnn':
        Encoder = nets.RNNEncoder
        print(type(Encoder))
    elif args['model'] == 'cnn':
        Encoder = nets.CNNEncoder
    elif args['model'] == 'bow':
        Encoder = nets.BOWMLPEncoder
    encoder = Encoder(n_layers=args['layer'], n_vocab=len(vocab),
                      n_units=args['unit'], dropout=args['dropout'])
    model = nets.TextClassifier(encoder, n_class)
    if args['gpu'] >= 0:
        # Make a specified GPU current
        chainer.backends.cuda.get_device_from_id(args['gpu']).use()
        model.to_gpu()  # Copy the model to the GPU

    # Setup an optimizer
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(1e-4))

    # Set up a trainer
    updater = training.updaters.StandardUpdater(train_iter, optimizer,
                                                converter=convert_seq,
                                                device=args['gpu'])
    trainer = training.Trainer(updater, (args['epoch'], 'epoch'),
                               out=args['out'])

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(extensions.Evaluator(test_iter, model,
                                        converter=convert_seq,
                                        device=args['gpu']))

    # Take a best snapshot
    record_trigger = training.triggers.MaxValueTrigger(
        'validation/main/accuracy', (1, 'epoch'))
    trainer.extend(extensions.snapshot_object(model, 'best_model.npz'),
                   trigger=record_trigger)

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.PrintReport([
        'epoch', 'main/loss', 'validation/main/loss',
        'main/accuracy', 'validation/main/accuracy', 'elapsed_time']))

    # Print a progress bar to stdout
    trainer.extend(extensions.ProgressBar())

    print("START Training!")
    # Run the training
    trainer.run()
    print("Finished!")
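# The evaluation script below wraps the loaded args.json in a `Bunch` so the
# dictionary can be read and updated with attribute syntax (args.dataset,
# args.batchsize = 64, ...). A minimal sketch of such a helper, in case it is
# not defined elsewhere in the repository, could look like this (an assumption,
# not necessarily the original class):
class Bunch(object):
    """Thin attribute-access wrapper around a dict."""

    def __init__(self, adict):
        self.__dict__.update(adict)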
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--load', required=True)
    args_dir = os.path.join(parser.parse_args().load, 'args.json')
    with open(args_dir) as f:
        args = Bunch(json.load(f))
    print(json.dumps(args.__dict__, indent=2))

    # Load a dataset
    with open(args.vocab_path) as f:
        vocab = json.load(f)
    if args.dataset == 'dbpedia':
        train, test, vocab = text_datasets.get_dbpedia(
            vocab=vocab, char_based=args.char_based)
    elif args.dataset == 'sst':
        train, test, vocab = text_datasets.get_sst(
            char_based=args.char_based)
    elif args.dataset.startswith('imdb.'):
        train, test, vocab = text_datasets.get_imdb(
            vocab=vocab, fine_grained=args.dataset.endswith('.fine'),
            char_based=args.char_based)
    elif args.dataset in ['TREC', 'stsa.binary', 'stsa.fine',
                          'custrev', 'mpqa', 'rt-polarity', 'subj']:
        train, test, vocab = text_datasets.get_other_text_dataset(
            args.dataset, vocab=vocab, char_based=args.char_based)

    print('# train data: {}'.format(len(train)))
    print('# test data: {}'.format(len(test)))
    print('# vocab: {}'.format(len(vocab)))
    n_class = len(set([int(d[1]) for d in train]))
    print('# class: {}'.format(n_class))

    # i_to_word = {v: k for k, v in vocab.items()}

    # FIXME
    args.batchsize = 64
    max_beam_size = 5

    # train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test, args.batchsize,
                                                 repeat=False, shuffle=False)

    if args.dataset == 'snli':
        model = nets.DoubleMaxClassifier(n_layers=args.layer,
                                         n_vocab=len(vocab),
                                         n_units=args.unit,
                                         n_class=n_class,
                                         dropout=args.dropout)
    else:
        model = nets.SingleMaxClassifier(n_layers=args.layer,
                                         n_vocab=len(vocab),
                                         n_units=args.unit,
                                         n_class=n_class,
                                         dropout=args.dropout)
    if args.gpu >= 0:
        # Make a specified GPU current
        chainer.backends.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()  # Copy the model to the GPU
    chainer.serializers.load_npz(args.model_path, model)

    checkpoint = []
    for batch_idx, batch in enumerate(tqdm(test_iter)):
        # if batch_idx > 10:
        #     break
        batch = convert_seq(batch, device=args.gpu)
        xs = batch['xs']
        reduced_xs, removed_indices = get_rawr(
            model, xs, max_beam_size=max_beam_size)
        xp = cupy.get_array_module(*xs)
        n_finals = [len(r) for r in reduced_xs]
        reduced_xs = list(itertools.chain(*reduced_xs))
        removed_indices = list(itertools.chain(*removed_indices))
        reduced_xs = [xp.asarray(x) for x in reduced_xs]
        reduced_xs = convert_seq(reduced_xs, device=args.gpu,
                                 with_label=False)
        with chainer.using_config('train', False):
            ss_0 = xp.asnumpy(model.predict(xs, softmax=True))
            ss_1 = xp.asnumpy(model.predict(reduced_xs, softmax=True))
            ys_0 = np.argmax(ss_0, axis=1)
            ys_1 = np.argmax(ss_1, axis=1)

        start = 0
        for example_idx in range(len(xs)):
            oi = xs[example_idx].tolist()  # original input
            op = int(ys_0[example_idx])  # original prediction
            oos = ss_0[example_idx]  # original output distribution
            label = int(batch['ys'][example_idx])
            checkpoint.append([])
            for i in range(start, start + n_finals[example_idx]):
                ri = reduced_xs[i].tolist()
                rp = int(ys_1[i])
                rs = ss_1[i]
                rr = removed_indices[i]
                entry = {
                    'original_input': oi,
                    'reduced_input': ri,
                    'original_prediction': op,
                    'reduced_prediction': rp,
                    'original_scores': oos,
                    'reduced_scores': rs,
                    'removed_indices': rr,
                    'label': label
                }
                checkpoint[-1].append(entry)
            # Advance the offset into the flattened reduced examples
            start += n_finals[example_idx]

    with open(os.path.join(args.out, 'rawr_dev.pkl'), 'wb') as f:
        pickle.dump(checkpoint, f)
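# --- Hedged follow-up sketch (not part of the original source) ---
# rawr_dev.pkl holds one list per test example, each containing the reduced-input
# candidates produced by get_rawr with the keys stored in `entry` above. A small
# helper to load it back and summarize the reductions might look like:
def summarize_rawr_checkpoint(path):
    """Load rawr_dev.pkl and report how much of each input was removed."""
    with open(path, 'rb') as f:
        checkpoint = pickle.load(f)
    for entries in checkpoint:
        for entry in entries:
            kept = len(entry['reduced_input'])
            total = len(entry['original_input'])
            flipped = entry['original_prediction'] != entry['reduced_prediction']
            print('kept {}/{} tokens, prediction flipped: {}'.format(
                kept, total, flipped))
    return checkpoint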