def __init__(self, config):
    """Create the NCF network, optionally move it to CUDA, then init the base engine.

    ``config`` is read here through the keys ``'use_cuda'`` and
    ``'device_id'``; it is also handed unchanged to ``NCF`` and to the
    parent constructor.
    """
    network = NCF(config)
    self.model = network

    # Identity comparison on purpose: only the literal ``True`` enables CUDA.
    gpu_requested = config['use_cuda'] is True
    if gpu_requested:
        use_cuda(True, config['device_id'])
        self.model.cuda()

    # The model is assigned before the parent constructor runs.
    super(NCFEngine, self).__init__(config)
    print(self.model)
def __init__(self, config):
    """Create the MLP network, initialise the base engine, then load pretrained weights.

    Keys read from ``config`` in this method: ``'use_cuda'``,
    ``'device_id'`` and ``'pretrain'``. ``config`` itself is passed on to
    ``MLP`` and to the parent constructor.
    """
    network = MLP(config)
    self.model = network

    # Identity comparison on purpose: only the literal ``True`` enables CUDA.
    gpu_requested = config['use_cuda'] is True
    if gpu_requested:
        use_cuda(True, config['device_id'])
        self.model.cuda()

    super(MLPEngine, self).__init__(config)

    # Pretrained weights are loaded only after the parent constructor has run.
    if not config['pretrain']:
        return
    self.model.load_pretrain_weights()
def __init__(self, config, model):
    """Instantiate ``model`` from ``config`` and prepare it for use.

    Args:
        config: mapping passed to ``model``; its ``"normal_config"`` entry
            is read for ``'pretrain'``, ``'use_cuda'`` and ``'device_id'``.
        model: callable invoked as ``model(config)`` to build the network.
    """
    self.model = model(config)
    # The parent constructor is called without arguments in this engine.
    super(ModelEngine, self).__init__()

    wants_pretrain = config["normal_config"]['pretrain']
    if wants_pretrain:
        self.model.load_pretrain_weights()

    # Identity comparison on purpose: only the literal ``True`` enables CUDA.
    wants_gpu = config["normal_config"]['use_cuda'] is True
    if wants_gpu:
        gpu_index = config["normal_config"]['device_id']
        use_cuda(True, gpu_index)
        self.model.cuda()

    print(self.model)
def __init__(self, config):
    """Create the MLP network, optionally restore a checkpoint, then init the base engine.

    Keys read from ``config`` in this method: ``'use_cuda'``,
    ``'device_id'``, ``'pretrain'`` and ``'pretrain_mlp'``. ``config`` is
    also passed to ``MLP`` and to the parent constructor.
    """
    network = MLP(config)
    self.model = network

    # Identity comparison on purpose: only the literal ``True`` enables CUDA.
    gpu_requested = config['use_cuda'] is True
    if gpu_requested:
        use_cuda(True, config['device_id'])
        self.model.cuda()

    if config['pretrain']:
        # Weights come from a saved checkpoint via ``resume_checkpoint``
        # rather than from ``self.model.load_pretrain_weights()``.
        resume_checkpoint(
            self.model,
            model_dir=config['pretrain_mlp'],
            device_id=config['device_id'],
        )

    # The checkpoint is restored before the parent constructor runs.
    super(MLPEngine, self).__init__(config)
    print(self.model)
def __init__(self, config):
    """Create the model, init the base engine, then load optional pretrained parts.

    Keys read from ``config`` in this method: ``'use_cuda'``,
    ``'device_id'``, ``'pretrain'`` and ``'pretrain_grouping'``.
    ``config`` is also passed to ``New_Gloabl_sum_embedding`` and to the
    parent constructor.
    """
    network = New_Gloabl_sum_embedding(config)
    self.model = network

    # Identity comparison on purpose: only the literal ``True`` enables CUDA.
    gpu_requested = config['use_cuda'] is True
    if gpu_requested:
        use_cuda(True, config['device_id'])
        self.model.cuda()

    super(New_Gloabl_sum_embedding_shareEngine, self).__init__(config)

    # The two pretrained pieces are gated by separate config flags.
    load_weights = config['pretrain']
    if load_weights:
        print("load pretrained model...")
        self.model.load_pretrain_weights()

    load_grouping = config['pretrain_grouping']
    if load_grouping:
        print("load pretrained grouping embedding")
        self.model.load_pretrain_grouping()

    print(self.model)
def __init__(self, config):
    """Build two MLP networks with their own optimizers and an MSE criterion.

    Keys read from ``config`` in this method: ``'use_cuda'``,
    ``'device_id'``, ``'pretrain'`` and ``'alpha'``. ``config`` is also
    passed to ``MLP`` and ``use_optimizer``.

    Attributes set: ``config``, ``modelA``, ``modelB``, ``optA``, ``optB``,
    ``crit`` and ``alpha``.
    """
    self.config = config  # model configuration
    self.modelA = MLP(config)
    self.modelB = MLP(config)

    # Identity comparison on purpose: only the literal ``True`` enables CUDA.
    if config['use_cuda'] is True:
        use_cuda(True, config['device_id'])
        self.modelA.cuda()
        self.modelB.cuda()
    print(self.modelA)

    if config['pretrain']:
        # The previous code called ``self.model.load_pretrain_weights()``,
        # but this constructor never assigns ``self.model`` -- only
        # ``modelA`` and ``modelB`` exist -- so the pretrain path raised
        # AttributeError. Load the pretrained weights into both networks.
        # NOTE(review): confirm that both networks (not just one) are meant
        # to start from the pretrained weights.
        for network in (self.modelA, self.modelB):
            network.load_pretrain_weights()

    # One optimizer per network so each can be stepped independently.
    self.optA = use_optimizer(self.modelA, config)
    self.optB = use_optimizer(self.modelB, config)
    self.crit = torch.nn.MSELoss()
    self.alpha = config['alpha']
def main():
    """Evaluate a saved dialogue model on the test set and print its losses.

    Parses the command line, loads the corpus and the model named by
    ``--model_file``, accumulates the language-model and selection losses
    over every test batch, and prints each loss together with its
    exponential (reported as perplexity).
    """
    parser = argparse.ArgumentParser(description='testing script')
    add = parser.add_argument
    add('--data', type=str, default='data/negotiate',
        help='location of the data corpus')
    add('--unk_threshold', type=int, default=20,
        help='minimum word frequency to be in dictionary')
    add('--model_file', type=str,
        help='pretrained model file')
    add('--seed', type=int, default=1,
        help='random seed')
    add('--hierarchical', action='store_true', default=False,
        help='use hierarchical model')
    add('--bsz', type=int, default=16,
        help='batch size')
    add('--cuda', action='store_true', default=False,
        help='use CUDA')
    args = parser.parse_args()

    device_id = utils.use_cuda(args.cuda)
    utils.set_seed(args.seed)

    corpus = data.WordCorpus(
        args.data, freq_cutoff=args.unk_threshold, verbose=True)
    model = utils.load_model(args.model_file)

    lang_crit = Criterion(model.word_dict, device_id=device_id)
    select_crit = Criterion(
        model.item_dict,
        device_id=device_id,
        bad_toks=['<disconnect>', '<disagree>'])

    testset, testset_stats = corpus.test_dataset(args.bsz, device_id=device_id)

    vocab_size = len(corpus.word_dict)
    test_loss = 0
    test_select_loss = 0
    for batch in testset:
        # The forward pass yields output, hidden state, target, selection
        # output and selection target; the hidden state is not used here.
        out, _hid, tgt, sel_out, sel_tgt = Engine.forward(
            model, batch, volatile=False)

        # The LM loss is weighted by the target length before summing.
        batch_lang_loss = lang_crit(out.view(-1, vocab_size), tgt).data[0]
        test_loss += tgt.size(0) * batch_lang_loss
        test_select_loss += select_crit(sel_out, sel_tgt).data[0]

    # LM loss is normalised by the 'nonpadn' statistic, selection loss by
    # the number of batches.
    test_loss /= testset_stats['nonpadn']
    test_select_loss /= len(testset)

    print('testloss %.3f | testppl %.3f' % (test_loss, np.exp(test_loss)))
    print('testselectloss %.3f | testselectppl %.3f'
          % (test_select_loss, np.exp(test_select_loss)))
save_dir = './saved models/' for bs in batch_size: # Create corpus train_corpus = utils.Corpus() ids_train = train_corpus.get_data(train_path, bs) ids_valid = train_corpus.get_data(valid_path, bs) train_vocab_size = len(train_corpus.dictionary) for seq_len in seq_lengths: num_train_batches = ids_train.size(1) // seq_len num_valid_batches = ids_valid.size(1) // seq_len for lr in learning_rate: model = utils.initialize_model(model_num, train_vocab_size, embed_size) model = utils.use_cuda(model) print('Training vocabulary size: {}'.format(train_vocab_size)) print('Model: {}'.format(model.name)) print('Number of parameters = {}'.format(sum(p.numel() for p in model.parameters()))) run_name = "{}, seq_len={}, lr={}, bs={}".format(model.name, seq_len, lr, bs) file_path = os.path.join(save_dir,run_name + '.pkl') # Loss and Optimizer criterion = nn.CrossEntropyLoss() optimizer = torch.optim.Adam(model.parameters(), lr=lr) #TODO: CHANGE PARAMETERS - EPS, PATIENCE, etc.. CHECK OTHER TYPES OF SCHEDULER!! https://pytorch.org/docs/stable/optim.html lr_scheduler = LRscheduler.ReduceLROnPlateau(optimizer,eps= 1e-5) # Load model parameters and optimizer condition if available if os.path.exists(file_path):
train_path = './data/train.txt' valid_path = './data/valid.txt' sample_path = './sample.txt' save_dir = './saved models/' model_name = 'LSTM - 249 hidden cells, 1 layers' # Create the corpus corpus = utils.Corpus() ids_train = corpus.get_data(train_path) ids_valid = corpus.get_data(valid_path) vocab_size = len(corpus.dictionary) # Load best model model_num = 4 model = utils.initialize_model(model_num, vocab_size, embed_size) model = utils.use_cuda(model) file_path = os.path.join(save_dir, model_name + '.pkl') load_state = torch.load(file_path, lambda storage, loc: storage) model.load_state_dict(load_state['state_dict']) model.eval() # Turn to eval mode - so there won't be any dropouts! # Sampling with open(sample_path, 'w') as f: for t in temperature: f.write('Sentence wit temperature = {}:\n'.format(t)) # Set initial hidden and memory states state = (utils.use_cuda( torch.zeros(model.num_layers, 1, model.hidden_size)), utils.use_cuda(
def main():
    """Train a ``DialogModel`` from command-line settings and save the result.

    Parses the hyper-parameters, builds the corpus and model, moves the
    model to the GPU when ``utils.use_cuda`` returned a device id, runs
    ``Engine.train`` and finally writes the trained model to
    ``--model_file``.
    """
    parser = argparse.ArgumentParser(description='training script')
    add = parser.add_argument

    # Data and model sizes.
    add('--data', type=str, default='data/negotiate',
        help='location of the data corpus')
    add('--nembed_word', type=int, default=256,
        help='size of word embeddings')
    add('--nembed_ctx', type=int, default=64,
        help='size of context embeddings')
    add('--nhid_lang', type=int, default=256,
        help='size of the hidden state for the language module')
    add('--nhid_ctx', type=int, default=64,
        help='size of the hidden state for the context module')
    add('--nhid_strat', type=int, default=64,
        help='size of the hidden state for the strategy module')
    add('--nhid_attn', type=int, default=64,
        help='size of the hidden state for the attention module')
    add('--nhid_sel', type=int, default=64,
        help='size of the hidden state for the selection module')

    # Optimisation settings.
    add('--lr', type=float, default=20.0,
        help='initial learning rate')
    add('--min_lr', type=float, default=1e-5,
        help='min threshold for learning rate annealing')
    add('--decay_rate', type=float, default=9.0,
        help='decrease learning rate by this factor')
    add('--decay_every', type=int, default=1,
        help='decrease learning rate after decay_every epochs')
    add('--momentum', type=float, default=0.0,
        help='momentum for sgd')
    add('--nesterov', action='store_true', default=False,
        help='enable nesterov momentum')
    add('--clip', type=float, default=0.2,
        help='gradient clipping')
    add('--dropout', type=float, default=0.5,
        help='dropout rate in embedding layer')
    add('--init_range', type=float, default=0.1,
        help='initialization range')
    add('--max_epoch', type=int, default=30,
        help='max number of epochs')
    add('--bsz', type=int, default=25,
        help='batch size')

    # Miscellaneous run options.
    add('--unk_threshold', type=int, default=20,
        help='minimum word frequency to be in dictionary')
    add('--temperature', type=float, default=0.1,
        help='temperature')
    add('--sel_weight', type=float, default=1.0,
        help='selection weight')
    add('--seed', type=int, default=1,
        help='random seed')
    add('--cuda', action='store_true', default=False,
        help='use CUDA')
    add('--model_file', type=str, default='',
        help='path to save the final model')
    add('--visual', action='store_true', default=False,
        help='plot graphs')
    add('--domain', type=str, default='object_division',
        help='domain for the dialogue')
    add('--rnn_ctx_encoder', action='store_true', default=False,
        help='wheather to use RNN for encoding the context')
    args = parser.parse_args()

    device_id = utils.use_cuda(args.cuda)
    utils.set_seed(args.seed)

    corpus = data.WordCorpus(
        args.data, freq_cutoff=args.unk_threshold, verbose=True)
    model = DialogModel(
        corpus.word_dict,
        corpus.item_dict,
        corpus.context_dict,
        corpus.output_length,
        args,
        device_id)
    # A ``None`` device id means the model stays where it was created.
    if device_id is not None:
        model.cuda(device_id)

    engine = Engine(model, args, device_id, verbose=True)
    # ``train`` returns three losses; only the selection loss is reported.
    _train_loss, _valid_loss, select_loss = engine.train(corpus)
    print('final selectppl %.3f' % np.exp(select_loss))

    utils.save_model(engine.get_model(), args.model_file)
def workflow():
    """Train an MNIST classifier, optionally save it, then classify local PNG files.

    Steps, in order:
      1. Build train/test ``MNIST_local`` datasets and wrap them in
         ``DataLoader`` objects.
      2. Create a fresh ``MnistSmallNN`` or load the previously saved model
         from ``./model_files`` (controlled by the local
         ``model_create_new`` flag).
      3. Run ``train_loop`` / ``test_loop`` for a fixed number of epochs.
      4. Save the whole model object when ``model_save`` is set.
      5. Run the model on every file matching ``./datasets/mnist_my/*png``
         and print the file name, the raw prediction and its argmax.
    """
    utils.use_cuda()

    batch_size = 64
    learning_rate = 1e-2

    training_data = MNIST_local(
        root="./datasets/mnist",
        train=True,
        transform=ToTensor(),
        folder="./datasets/mnist_local",
    )
    test_data = MNIST_local(
        root="./datasets/mnist",
        train=False,
        transform=ToTensor(),
        folder="./datasets/mnist_local",
    )

    train_dataloader = DataLoader(training_data, batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(test_data, batch_size=batch_size)

    model_type = MnistSmallNN
    model_create_new = False
    model_save = True
    model_file_name = "mnist_small.pth" if model_type == MnistSmallNN else "mnist.pth"
    model_file_path = f'./model_files/{model_file_name}'

    if model_create_new:
        model = model_type().to(used_device())
    else:
        # NOTE(review): torch.load unpickles a whole model object; only load
        # files that come from a trusted source.
        model = torch.load(model_file_path).to(used_device())

    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    epochs = 1000
    for t in range(epochs):
        print(f"Epoch {t + 1}\n-------------------------------")
        train_loop(train_dataloader, model, loss_fn, optimizer)
        test_loop(test_dataloader, model, loss_fn)
    print("Training finished!")

    if model_save:
        torch.save(model, model_file_path)
        print(f"Saved model to {model_file_path}")

    # Transforms do not depend on the image, so build them once, not per file.
    to_tensor = ToTensor()
    to_pil = ToPILImage()
    to_gray = Grayscale(num_output_channels=1)

    # Forward slashes work on every platform; the previous backslash pattern
    # only matched on Windows, so no image was ever found on POSIX systems.
    images = glob.glob("./datasets/mnist_my/*png")
    for image in images:
        # The context manager closes the file handle that Image.open keeps
        # open; the tensor conversion reads the pixel data before it closes.
        with Image.open(image) as img:
            # Round-trip through tensor -> PIL -> grayscale -> tensor so the
            # model receives a single-channel tensor.
            tens = to_tensor(to_gray(to_pil(to_tensor(img))))
        pred = model(tens.to(used_device()))
        print(image)
        print(pred)
        print(pred.argmax().item())
        print()
    print("Finished!")


# workflow()
def main():
    """Train reference-resolution models from command-line settings.

    One model is trained per entry in the seed list: seeds ``0..9`` when
    ``--repeat_train`` is given, otherwise the single seed ``1``. The loop
    seed selects the ``*_reference_shift_<seed>.txt`` data files and the
    output file name; the random seed itself is always ``--seed``.
    """
    parser = argparse.ArgumentParser(
        description='training script for reference resolution')
    add = parser.add_argument

    # Data and model selection.
    add('--data', type=str, default='data/onecommon',
        help='location of the data corpus')
    add('--model_type', type=str, default='rnn_reference_model',
        help='type of model to use',
        choices=models.get_model_names())
    add('--ctx_encoder_type', type=str, default='mlp_encoder',
        help='type of context encoder to use',
        choices=models.get_ctx_encoder_names())
    add('--attention', action='store_true', default=False,
        help='use attention')

    # Layer sizes.
    add('--nembed_word', type=int, default=128,
        help='size of word embeddings')
    add('--nhid_rel', type=int, default=64,
        help='size of the hidden state for the language module')
    add('--nembed_ctx', type=int, default=128,
        help='size of context embeddings')
    add('--nembed_cond', type=int, default=128,
        help='size of condition embeddings')
    add('--nhid_lang', type=int, default=128,
        help='size of the hidden state for the language module')
    add('--nhid_strat', type=int, default=128,
        help='size of the hidden state for the strategy module')
    add('--nhid_attn', type=int, default=64,
        help='size of the hidden state for the attention module')
    add('--nhid_sel', type=int, default=64,
        help='size of the hidden state for the selection module')
    add('--share_attn', action='store_true', default=False,
        help='share attention modules for selection and language output')

    # Optimisation settings.
    add('--optimizer', choices=['adam', 'rmsprop'], default='adam',
        help='optimizer to use')
    add('--lr', type=float, default=0.001,
        help='initial learning rate')
    add('--min_lr', type=float, default=1e-5,
        help='min threshold for learning rate annealing')
    add('--decay_rate', type=float, default=9.0,
        help='decrease learning rate by this factor')
    add('--decay_every', type=int, default=1,
        help='decrease learning rate after decay_every epochs')
    add('--momentum', type=float, default=0.0,
        help='momentum for sgd')
    add('--clip', type=float, default=0.5,
        help='gradient clipping')
    add('--dropout', type=float, default=0.5,
        help='dropout rate in embedding layer')
    add('--init_range', type=float, default=0.01,
        help='initialization range')
    add('--max_epoch', type=int, default=20,
        help='max number of epochs')
    add('--bsz', type=int, default=16,
        help='batch size')
    add('--unk_threshold', type=int, default=20,
        help='minimum word frequency to be in dictionary')
    add('--temperature', type=float, default=0.1,
        help='temperature')

    # Loss weights.
    add('--lang_weight', type=float, default=1.0,
        help='language loss weight')
    add('--ref_weight', type=float, default=1.0,
        help='reference loss weight')
    add('--num_ref_weight', type=float, default=1.0,
        help='reference loss weight')
    add('--sel_weight', type=float, default=1.0,
        help='selection loss weight')

    # Run options.
    add('--seed', type=int, default=1,
        help='random seed')
    add('--cuda', action='store_true', default=False,
        help='use CUDA')
    add('--model_file', type=str, default='tmp.th',
        help='path to save the final model')
    add('--domain', type=str, default='one_common',
        help='domain for the dialogue')
    add('--tensorboard_log', action='store_true', default=False,
        help='log training with tensorboard')
    add('--repeat_train', action='store_true', default=False,
        help='repeat training n times')
    add('--corpus_type', choices=['full', 'uncorrelated', 'success_only'],
        default='full',
        help='type of training corpus to use')

    # Input ablations.
    add('--remove_location', action='store_true', default=False,
        help='remove locative information from input')
    add('--remove_size', action='store_true', default=False,
        help='remove size information from input')
    add('--remove_color', action='store_true', default=False,
        help='remove color information from input')
    add('--remove_size_color', action='store_true', default=False,
        help='remove size and color information from input')
    args = parser.parse_args()

    seeds = list(range(10)) if args.repeat_train else [1]

    for seed in seeds:
        utils.use_cuda(args.cuda)
        # The random seed comes from the command line, not from the loop.
        utils.set_seed(args.seed)

        domain = get_domain(args.domain)
        model_ty = models.get_model_type(args.model_type)

        # Per-seed data files, e.g. 'train_reference_shift_3.txt'.
        split_files = {
            split: '{}_reference_shift_{}.txt'.format(split, seed)
            for split in ('train', 'valid', 'test')
        }
        corpus = model_ty.corpus_ty(
            domain,
            args.data,
            freq_cutoff=args.unk_threshold,
            verbose=True,
            **split_files)

        model = model_ty(corpus.word_dict, args)
        if args.cuda:
            model.cuda()

        engine = model_ty.engine_ty(model, args, verbose=True)

        # argparse restricts --optimizer to these two choices.
        if args.optimizer == 'adam':
            run_training = engine.train
        elif args.optimizer == 'rmsprop':
            run_training = engine.train_scheduled
        _best_valid_loss, best_model = run_training(corpus)

        # Full model goes to a per-seed file; the state dict goes to one
        # fixed file name that each seed overwrites.
        utils.save_model(best_model, '{}_{}.th'.format(args.model_file, seed))
        utils.save_model(best_model.state_dict(), 'stdict_' + args.model_file)
def main(): parser = argparse.ArgumentParser( description='testing script for reference resolution') parser.add_argument('--data', type=str, default='data/onecommon', help='location of the data corpus') parser.add_argument('--unk_threshold', type=int, default=10, help='minimum word frequency to be in dictionary') parser.add_argument('--model_file', type=str, required=True, help='pretrained model file') parser.add_argument('--seed', type=int, default=1, help='random seed') parser.add_argument('--hierarchical', action='store_true', default=False, help='use hierarchical model') parser.add_argument('--bsz', type=int, default=16, help='batch size') parser.add_argument('--cuda', action='store_true', default=False, help='use CUDA') parser.add_argument('--domain', type=str, default='one_common', help='domain for the dialogue') parser.add_argument('--vocab_corpus', choices=['full', 'uncorrelated', 'success_only'], default='full', help='vocabulary of the corpus to use') parser.add_argument('--corpus_type', choices=['full', 'uncorrelated', 'success_only'], default='full', help='type of test corpus to use') parser.add_argument('--bleu_n', type=int, default=0, help='test ngram bleu') parser.add_argument('--temperature', type=float, default=0.1, help='temperature') # for error analysis parser.add_argument('--transcript_file', type=str, default='final_transcripts.json', help='scenario file') parser.add_argument('--markable_file', type=str, default='markable_annotation.json', help='scenario file') parser.add_argument('--show_errors', action='store_true', default=False, help='show errors') # analysis parameters parser.add_argument('--fix_misspellings', action='store_true', default=False, help='fix misspellings') parser.add_argument('--shuffle_utterance', action='store_true', default=False, help='shuffle order of words in the utterance') parser.add_argument('--shuffle_word_types', type=str, nargs='*', default=[], help='shuffle specified class of words in the output') 
parser.add_argument('--drop_word_types', type=str, nargs='*', default=[], help='drop specified class of words in the output') parser.add_argument('--replace_word_types', type=str, nargs='*', default=[], help='replace specified class of words in the output') parser.add_argument('--repeat_test', action='store_true', default=False, help='repeat training n times') parser.add_argument('--test_ref_forward', action='store_true', default=False, help='test forward reference instead') args = parser.parse_args() if args.bleu_n > 0: # current support args.bsz = 1 if args.repeat_test: seeds = list(range(10)) else: seeds = [args.seed] repeat_results = defaultdict(list) model_referent_annotation = {} init2num_referents = defaultdict(Counter) for seed in seeds: device_id = utils.use_cuda(args.cuda) utils.set_seed(args.seed) domain = get_domain(args.domain) model = utils.load_model(args.model_file + '_' + str(seed) + '.th') if args.cuda: model.cuda() else: device = torch.device("cpu") model.to(device) model.eval() corpus = model.corpus_ty( domain, args.data, train='train_reference_shift_{}.txt'.format(seed), valid='valid_reference_shift_{}.txt'.format(seed), test='test_reference_shift_{}.txt'.format(seed), freq_cutoff=args.unk_threshold, verbose=True) with open(os.path.join(args.data, args.transcript_file), "r") as f: dialog_corpus = json.load(f) with open(os.path.join(args.data, args.markable_file), "r") as f: markable_annotation = json.load(f) with open( os.path.join(args.data, "aggregated_referent_annotation.json"), "r") as f: aggregated_referent_annotation = json.load(f) scenarios = { scenario['scenario_uuid']: scenario for scenario in dialog_corpus } crit = Criterion(model.word_dict, device_id=device_id) sel_crit = nn.CrossEntropyLoss() ref_crit = nn.BCEWithLogitsLoss() testset, testset_stats = corpus.test_dataset(args.bsz) test_lang_loss, test_select_loss, test_reference_loss, test_select_correct, test_select_total, test_reference_correct, test_reference_total, 
test_num_ref_correct, test_num_ref_total = 0, 0, 0, 0, 0, 0, 0, 0, 0 """ Variables to keep track of the results for analysis """ # num_referents --> count, count correct num_markables = 0 num_markables_counter = Counter() num_markables_correct = Counter() exact_match = 0 exact_match_counter = Counter() # location of markable --> count, count correct, count exact match location_counter = Counter() location_correct = Counter() location_exact_match = Counter() # information to compute correlation between selection and reference score select_correct = {} reference_correct = {} reference_total = {} # markable text --> count, count correct, count exact match text_counter = Counter() text_correct = Counter() text_exact_match = Counter() # init token --> count, count correct init_counter = Counter() init_correct = Counter() init_exact_match = Counter() # num ref confusion num_ref_confusion = np.zeros([8, 8], dtype=int) anaphora_list = [ "it", "that", "thats", "this", "its", "they", "their", "itself", "them", "those", "it's" ] total_anaphora = 0 correct_anaphora = 0 bleu_scores = [] for batch in testset: ctx, inpt, tgt, ref_inpt, ref_tgt, num_ref_tgt, sel_tgt, scenario_ids, real_ids, agents, chat_ids, sel_idx = batch ctx = Variable(ctx) inpt = Variable(inpt) if ref_inpt is not None: ref_inpt = Variable(ref_inpt) out, ref_out, num_ref_out, sel_out = model.forward( ctx, inpt, ref_inpt, sel_idx) tgt = Variable(tgt) sel_tgt = Variable(sel_tgt) lang_loss = crit(out, tgt) if ref_inpt is not None: ref_tgt = Variable(ref_tgt) ref_tgt = torch.transpose(ref_tgt, 0, 1).contiguous().float() ref_loss = ref_crit(ref_out, ref_tgt) t = Variable(torch.FloatTensor([0])) # threshold if model.args.num_ref_weight > 0: num_ref_pred = num_ref_out.max(dim=2)[1] ref_results = torch.zeros_like(ref_tgt) ref_correct = 0 for i in range(ref_out.size(0)): for j in range(ref_out.size(1)): ref_pred = torch.zeros_like(ref_tgt[i][j]) for ref_idx in range(ref_pred.size(0)): if ref_idx in ref_out[i][j].topk( 
num_ref_pred[i][j])[1]: ref_pred[ref_idx] = 1.0 ref_results[i][j] = ( ref_pred.long() == ref_tgt[i][j].long()) ref_correct += (ref_pred.long() == ref_tgt[i] [j].long()).sum().item() ref_total = ref_tgt.size(0) * ref_tgt.size( 1) * ref_tgt.size(2) else: ref_results = ((ref_out > 0).long() == ref_tgt.long()) ref_correct = ((ref_out > 0).long() == ref_tgt.long()).sum().item() ref_total = ref_tgt.size(0) * ref_tgt.size( 1) * ref_tgt.size(2) # compute more details of reference resolution for i in range(ref_tgt.size(0)): # markable idx for j in range(ref_tgt.size(1)): # batch idx chat_id = chat_ids[j] # add chat level details if not exists if chat_id not in reference_correct: reference_correct[chat_id] = ref_results[:, j, :].sum( ).item() if chat_id not in reference_total: reference_total[ chat_id] = ref_results[:, j, :].size( 0) * ref_results[:, j, :].size(1) if chat_id not in model_referent_annotation: model_referent_annotation[chat_id] = {} markables = [] # markables information from aggregated_referent_annotation for markable in markable_annotation[chat_id][ "markables"]: markable_id = markable["markable_id"] if markable_id in aggregated_referent_annotation[ chat_id] and markable["speaker"] == agents[ j]: if "unidentifiable" in aggregated_referent_annotation[ chat_id][ markable_id] and aggregated_referent_annotation[ chat_id][markable_id][ "unidentifiable"]: if markable_id not in model_referent_annotation[ chat_id] and markable[ "speaker"] == agents[j]: model_referent_annotation[chat_id][ markable_id] = { 'ambiguous': False, 'referents': [], 'unidentifiable': True } continue markables.append(markable) assert len(markables) == ref_tgt.size(0) if model.args.num_ref_weight > 0: ref_pred = torch.zeros_like(ref_tgt[i][j]) for ref_idx in range(ref_pred.size(0)): #if ref_idx in ref_out[i][j].topk(num_ref_tgt[i][j])[1]: if ref_idx in ref_out[i][j].topk( num_ref_pred[i][j])[1]: ref_pred[ref_idx] = 1.0 correct_result = (ref_pred.long() == ref_tgt[i] [j].long()).sum().item() 
exact_match_result = torch.equal( ref_pred.long(), ref_tgt[i][j].long()) num_referents = ref_tgt[i][j].long().sum().item() else: correct_result = ((ref_out > 0).long( )[i][j] == ref_tgt.long())[i][j].sum().item() exact_match_result = torch.equal( (ref_out > 0).long()[i][j], ref_tgt.long()[i][j]) num_referents = ref_tgt.long()[i][j].sum().item() ref_pred = (ref_out > 0).long()[i][j] """ Add information to variables """ num_markables += 1 num_markables_counter[num_referents] += 1 num_markables_correct[num_referents] += correct_result # compute exact match if exact_match_result: exact_match += 1 exact_match_counter[ref_tgt.long()[i] [j].sum().item()] += 1 location_exact_match[i] += 1 if num_referents == 1: # temporal condition text_exact_match[markables[i] ["text"].lower()] += 1 init_exact_match[markables[i]["text"].lower(). split(" ")[0]] += 1 location_correct[i] += correct_result location_counter[i] += 1 if num_referents == 1: # temporal condition text_counter[markables[i]["text"].lower()] += 1 text_correct[markables[i] ["text"].lower()] += correct_result init_counter[markables[i]["text"].lower().split( " ")[0]] += 1 init_correct[markables[i]["text"].lower().split( " ")[0]] += correct_result init2num_referents[markables[i]["text"].lower().split( " ")[0]][num_referents] += 1 # test anaphora if markables[i]["text"].lower() in anaphora_list: total_anaphora += 1 if exact_match_result: correct_anaphora += 1 # keep track of model predictions for later visualization chat = [ chat for chat in dialog_corpus if chat['uuid'] == chat_id ] chat = chat[0] if markables[i][ 'markable_id'] not in model_referent_annotation[ chat_id]: model_referent_annotation[chat_id][ markables[i]['markable_id']] = {} model_referent_annotation[chat_id][ markables[i]['markable_id']]['referents'] = [] model_referent_annotation[chat_id][markables[i][ 'markable_id']]['ambiguous'] = False model_referent_annotation[chat_id][markables[i][ 'markable_id']]['unidentifiable'] = False for ent, is_referent in 
zip( chat['scenario']['kbs'][agents[j]], ref_pred.long().tolist()): #for ent, is_referent in zip(chat['scenario']['kbs'][agents[j]], (ref_out > 0).long()[i][j].tolist()): if is_referent: model_referent_annotation[chat_id][ markables[i] ['markable_id']]['referents'].append( "agent_{}_{}".format( agents[j], ent['id'])) else: ref_loss = None ref_correct = 0 ref_total = 0 sel_loss = sel_crit(sel_out, sel_tgt) sel_correct = (sel_out.max(dim=1)[1] == sel_tgt).sum().item() sel_total = sel_out.size(0) for i in range(sel_tgt.size(0)): # batch idx chat_id = chat_ids[i] sel_resuts = (sel_out.max(dim=1)[1] == sel_tgt) if sel_resuts[i]: select_correct[chat_id] = 1 else: select_correct[chat_id] = 0 if model.args.num_ref_weight > 0 and num_ref_out is not None: num_ref_out = num_ref_out.view(-1, num_ref_out.size(2)) num_ref_tgt = torch.transpose(num_ref_tgt, 0, 1).contiguous() num_ref_tgt = num_ref_tgt.view(-1) num_ref_loss = sel_crit(num_ref_out, num_ref_tgt) num_ref_correct = (num_ref_out.max( dim=1)[1] == num_ref_tgt).sum().item() num_ref_total = num_ref_tgt.size(0) for mi in range(num_ref_out.size(0)): model_pred = num_ref_out[mi].max(dim=0)[1].item() ground_truth = num_ref_tgt[mi].item() num_ref_confusion[ground_truth][model_pred] += 1 else: num_ref_correct = 0 num_ref_total = 0 test_lang_loss += lang_loss.item() test_select_loss += sel_loss.item() if ref_loss: test_reference_loss += ref_loss.item() test_select_correct += sel_correct test_select_total += sel_total test_reference_correct += ref_correct test_reference_total += ref_total test_num_ref_correct += num_ref_correct test_num_ref_total += num_ref_total if args.bleu_n > 0: ctx_h = model.ctx_encoder(ctx.transpose(0, 1)) my_utterance = None idx = 0 while True: if inpt[idx] == model.word_dict.word2idx['YOU:']: start = idx my_utterance = model.read_and_write( inpt[:idx], ctx_h, 30, temperature=args.temperature) my_utterance = model.word_dict.i2w(my_utterance) #print(my_utterance) while not inpt[idx] in [ 
model.word_dict.word2idx[stop_token] for stop_token in data.STOP_TOKENS ]: idx += 1 end = idx golden_utterance = inpt[start:end] golden_utterance = model.word_dict.i2w( golden_utterance) bleu_scores.append(100 * sentence_bleu( [golden_utterance], my_utterance, weights=[ 1 for i in range(4) if args.bleu_n == i ], #weights=[1 / args.bleu_n] * args.bleu_n, smoothing_function=SmoothingFunction().method7)) if inpt[idx] == model.word_dict.word2idx['<selection>']: break idx += 1 # Main results: # Dividing by the number of words in the input, not the tokens modeled, # because the latter includes padding test_lang_loss /= testset_stats['nonpadn'] test_select_loss /= len(testset) test_select_accuracy = test_select_correct / test_select_total test_reference_accuracy = test_reference_correct / test_reference_total if test_num_ref_total > 0: test_num_ref_accuracy = test_num_ref_correct / test_num_ref_total else: test_num_ref_accuracy = 0 print('testlangloss %.8f | testlangppl %.8f' % (test_lang_loss, np.exp(test_lang_loss))) print('testselectloss %.8f | testselectaccuracy %.6f' % (test_select_loss, test_select_accuracy)) print('testreferenceloss %.8f | testreferenceaccuracy %.6f' % (test_reference_loss, test_reference_accuracy)) print('reference_exact_match %.6f' % (exact_match / num_markables)) for k in num_markables_counter.keys(): print('{}: {:.4f} {:.4f} (out of {})'.format( k, num_markables_correct[k] / (num_markables_counter[k] * 7), exact_match_counter[k] / num_markables_counter[k], num_markables_counter[k])) print('test anaphora: {} (out of {})'.format( correct_anaphora / total_anaphora, total_anaphora)) if args.bleu_n > 0: print('average bleu score {}'.format(np.mean(bleu_scores))) # reference/selection correlation reference_score = [] selection_score = [] for chat_id in reference_correct.keys(): reference_score.append(reference_correct[chat_id] / reference_total[chat_id]) selection_score.append(select_correct[chat_id]) plt.xlabel('reference score', fontsize=14) 
plt.ylabel('selection score', fontsize=14) sns.regplot(x=reference_score, y=selection_score) plt.savefig('reference_selection_{}.png'.format(seed), dpi=300) plt.clf() reference_score = np.array(reference_score) selection_score = np.array(selection_score) print("reference selection correlation: {}".format( np.corrcoef(reference_score, selection_score))) # keep track of results for this run repeat_results["test_lang_loss"].append(test_lang_loss) repeat_results["test_select_loss"].append(test_select_loss) repeat_results["test_select_accuracy"].append(test_select_accuracy) repeat_results["test_reference_loss"].append(test_reference_loss) repeat_results["test_reference_accuracy"].append( test_reference_accuracy) repeat_results["test_num_ref_accuracy"].append(test_num_ref_accuracy) repeat_results["correlation_score"].append( np.corrcoef(reference_score, selection_score)[0][1]) repeat_results["num_markables_counter"].append( copy.copy(num_markables_counter)) repeat_results["exact_match_counter"].append( copy.copy(exact_match_counter)) repeat_results["num_markables_correct"].append( copy.copy(num_markables_correct)) repeat_results["reference_exact_match"].append(exact_match / num_markables) repeat_results["test_perplexity"].append(np.exp(test_lang_loss)) repeat_results["location_counter"].append(copy.copy(location_counter)) repeat_results["location_correct"].append(copy.copy(location_correct)) repeat_results["location_exact_match"].append( copy.copy(location_exact_match)) repeat_results["init_counter"].append(copy.copy(init_counter)) repeat_results["init_correct"].append(copy.copy(init_correct)) repeat_results["init_exact_match"].append(copy.copy(init_exact_match)) print("=================================\n\n") print("repeat test lang loss %.8f" % np.mean(repeat_results["test_lang_loss"])) print("repeat test select loss %.8f" % np.mean(repeat_results["test_select_loss"])) print("repeat test select accuracy %.8f ( %.8f )" % (np.mean(repeat_results["test_select_accuracy"]), 
np.std(repeat_results["test_select_accuracy"]))) print("repeat test reference loss %.8f" % np.mean(repeat_results["test_reference_loss"])) print("repeat test reference accuracy %.8f ( %.8f )" % (np.mean(repeat_results["test_reference_accuracy"]), np.std(repeat_results["test_reference_accuracy"]))) print("repeat test num ref accuracy %.8f ( %.8f )" % (np.mean(repeat_results["test_num_ref_accuracy"]), np.std(repeat_results["test_reference_accuracy"]))) print("repeat correlation score %.8f ( %.8f )" % (np.mean(repeat_results["correlation_score"]), np.std(repeat_results["correlation_score"]))) print("repeat correlation score %.8f ( %.8f )" % (np.mean(repeat_results["correlation_score"]), np.std(repeat_results["correlation_score"]))) print("repeat reference exact match %.8f ( %.8f )" % (np.mean(repeat_results["reference_exact_match"]), np.std(repeat_results["reference_exact_match"]))) print("repeat test perplexity %.8f ( %.8f )" % (np.mean(repeat_results["test_perplexity"]), np.std(repeat_results["test_perplexity"]))) for k in num_markables_counter.keys(): print("repeat accuracy and exact match:") num_markables = [] exact_match = [] exact_match_rate = [] num_markables_correct = [] for seed in range(len(seeds)): num_markables.append( repeat_results["num_markables_counter"][seed][k]) exact_match.append(repeat_results["exact_match_counter"][seed][k]) exact_match_rate.append( repeat_results["exact_match_counter"][seed][k] / repeat_results["num_markables_counter"][seed][k]) num_markables_correct.append( repeat_results["num_markables_correct"][seed][k] / (repeat_results["num_markables_counter"][seed][k] * 7)) print('{}: {:.5f} (std {}) {:.5f} (std {}) (count {})'.format( k, np.mean(num_markables_correct), np.std(num_markables_correct), np.mean(exact_match_rate), np.std(exact_match_rate), np.mean(num_markables))) dump_json(model_referent_annotation, "{}_referent_annotation.json".format(args.model_file)) print("exact match at each location:") markable_location_plot = [] 
exact_match_rate_plot = [] accuracy_plot = [] for loc in range(12): accuracy = [] exact_match_rate = [] total_count = 0 for seed in range(len(seeds)): if repeat_results["location_counter"][seed][loc] > 0: exact_match_rate.append( repeat_results["location_exact_match"][seed][loc] / repeat_results["location_counter"][seed][loc]) total_count += repeat_results["location_counter"][seed][loc] markable_location_plot.append(loc + 1) exact_match_rate_plot.append( repeat_results["location_exact_match"][seed][loc] / repeat_results["location_counter"][seed][loc]) accuracy_plot.append( repeat_results["location_correct"][seed][loc] / (7 * repeat_results["location_counter"][seed][loc])) if len(exact_match_rate) > 0: print('Loc @ {}: {:.5f} (std {:.5f}) (valid runs: {}, total: {})'. format(loc + 1, np.mean(exact_match_rate), np.std(exact_match_rate), len(exact_match_rate), total_count)) plt.xlabel('markable location', fontsize=14) plt.ylabel('exact match rate', fontsize=14) sns.lineplot(x=markable_location_plot, y=exact_match_rate_plot) plt.savefig('location_exact_match_rate.png', dpi=300) plt.clf() plt.xlabel('markable location', fontsize=14) plt.ylabel('accuracy', fontsize=14) sns.lineplot(x=markable_location_plot, y=accuracy_plot) plt.savefig('location_accuracy.png', dpi=300) plt.clf() plt.xlabel('markable position', fontsize=14) plt.ylabel('percentage', fontsize=14) sns.lineplot(x=markable_location_plot, y=accuracy_plot, legend="brief", label="accuracy") sns.lineplot(x=markable_location_plot, y=exact_match_rate_plot, legend="brief", label="exact match") plt.savefig('location_results.png', dpi=300) plt.clf() print("compute results based on initial token:") #for tok in model.word_dict.w2i.keys(): definite_toks = ["the"] indefinite_toks = ["a", "an"] definite_accuracies = [] indefinite_accuracies = [] other_accuracies = [] definite_exact_matches = [] indefinite_exact_matches = [] other_exact_matches = [] definite_counts = [] indefinite_counts = [] other_counts = [] for seed in 
range(len(seeds)): num_correct = 0 num_exact_match = 0 num_total = 0 for tok in definite_toks: num_total += repeat_results["init_counter"][seed][tok] num_correct += repeat_results["init_correct"][seed][tok] num_exact_match += repeat_results["init_exact_match"][seed][tok] definite_accuracies.append(num_correct / (7 * num_total)) definite_exact_matches.append(num_exact_match / num_total) definite_counts.append(num_total) num_correct = 0 num_exact_match = 0 num_total = 0 for tok in indefinite_toks: num_total += repeat_results["init_counter"][seed][tok] num_correct += repeat_results["init_correct"][seed][tok] num_exact_match += repeat_results["init_exact_match"][seed][tok] indefinite_accuracies.append(num_correct / (7 * num_total)) indefinite_exact_matches.append(num_exact_match / num_total) indefinite_counts.append(num_total) num_correct = 0 num_exact_match = 0 num_total = 0 for tok in repeat_results["init_counter"][seed].keys(): if tok not in definite_toks + indefinite_toks: num_total += repeat_results["init_counter"][seed][tok] num_correct += repeat_results["init_correct"][seed][tok] num_exact_match += repeat_results["init_exact_match"][seed][ tok] other_accuracies.append(num_correct / (7 * num_total)) other_exact_matches.append(num_exact_match / num_total) other_counts.append(num_total) print( "definite: accuracies {} (std {}), exact match rate {} (std {}), total count {} (std {})" .format(np.mean(definite_accuracies), np.std(definite_accuracies), np.mean(definite_exact_matches), np.std(definite_exact_matches), np.mean(definite_counts), np.std(definite_counts))) print( "indefinite: accuracies {} (std {}), exact match rate {} (std {}), total count {} (std {})" .format(np.mean(indefinite_accuracies), np.std(indefinite_accuracies), np.mean(indefinite_exact_matches), np.std(indefinite_exact_matches), np.mean(indefinite_counts), np.std(indefinite_counts))) print( "other: accuracies {} (std {}), exact match rate {} (std {}), total count {} (std {})" 
.format(np.mean(other_accuracies), np.std(other_accuracies), np.mean(other_exact_matches), np.std(other_exact_matches), np.mean(other_counts), np.std(other_counts))) valid_markables = 0 for chat_id in model_referent_annotation.keys(): for markable_id in model_referent_annotation[chat_id].keys(): if 'unidentifiable' in aggregated_referent_annotation[chat_id][ markable_id] and aggregated_referent_annotation[chat_id][ markable_id]['unidentifiable']: continue valid_markables += 1 print("model valid markables: {}".format(valid_markables)) valid_markables = 0 for chat_id in aggregated_referent_annotation.keys(): for markable_id in aggregated_referent_annotation[chat_id].keys(): if 'unidentifiable' in aggregated_referent_annotation[chat_id][ markable_id] and aggregated_referent_annotation[chat_id][ markable_id]['unidentifiable']: continue valid_markables += 1 print("aggregated valid markables: {}".format(valid_markables))
def main():
    """Train the dialogue model selected by ``--model_type`` and save it.

    Parses the command line, builds the corpus and model through the model
    type's ``corpus_ty`` / ``engine_ty`` hooks, runs ``engine.train`` and
    passes the trained model to ``utils.save_model`` together with
    ``--model_file``.
    """
    parser = argparse.ArgumentParser(description='training script')
    parser.add_argument('--data', type=str, default='data/negotiate',
                        help='location of the data corpus')
    parser.add_argument('--nembed_word', type=int, default=256,
                        help='size of word embeddings')
    parser.add_argument('--nembed_ctx', type=int, default=64,
                        help='size of context embeddings')
    parser.add_argument('--nhid_lang', type=int, default=256,
                        help='size of the hidden state for the language module')
    # Help text used to be a copy of the --nhid_lang description.
    parser.add_argument('--nhid_cluster', type=int, default=256,
                        help='size of the hidden state for the cluster module')
    parser.add_argument('--nhid_ctx', type=int, default=64,
                        help='size of the hidden state for the context module')
    parser.add_argument('--nhid_strat', type=int, default=64,
                        help='size of the hidden state for the strategy module')
    parser.add_argument('--nhid_attn', type=int, default=64,
                        help='size of the hidden state for the attention module')
    parser.add_argument('--nhid_sel', type=int, default=64,
                        help='size of the hidden state for the selection module')
    parser.add_argument('--lr', type=float, default=20.0,
                        help='initial learning rate')
    parser.add_argument('--min_lr', type=float, default=1e-5,
                        help='min threshold for learning rate annealing')
    parser.add_argument('--decay_rate', type=float, default=9.0,
                        help='decrease learning rate by this factor')
    parser.add_argument('--decay_every', type=int, default=1,
                        help='decrease learning rate after decay_every epochs')
    parser.add_argument('--momentum', type=float, default=0.0,
                        help='momentum for sgd')
    parser.add_argument('--clip', type=float, default=0.2,
                        help='gradient clipping')
    parser.add_argument('--dropout', type=float, default=0.5,
                        help='dropout rate in embedding layer')
    parser.add_argument('--init_range', type=float, default=0.1,
                        help='initialization range')
    parser.add_argument('--max_epoch', type=int, default=30,
                        help='max number of epochs')
    parser.add_argument('--num_clusters', type=int, default=50,
                        help='number of clusters')
    parser.add_argument('--bsz', type=int, default=25,
                        help='batch size')
    parser.add_argument('--unk_threshold', type=int, default=20,
                        help='minimum word frequency to be in dictionary')
    parser.add_argument('--temperature', type=float, default=0.1,
                        help='temperature')
    # Help text used to be a copy of the --sel_weight description.
    parser.add_argument('--partner_ctx_weight', type=float, default=0.0,
                        help='partner context weight')
    parser.add_argument('--sel_weight', type=float, default=0.6,
                        help='selection weight')
    parser.add_argument('--seed', type=int, default=1,
                        help='random seed')
    parser.add_argument('--cuda', action='store_true', default=False,
                        help='use CUDA')
    parser.add_argument('--model_file', type=str, default='',
                        help='path to save the final model')
    parser.add_argument('--prediction_model_file', type=str, default='',
                        help='path to save the prediction model')
    parser.add_argument('--selection_model_file', type=str, default='',
                        help='path to save the selection model')
    parser.add_argument('--cluster_model_file', type=str, default='',
                        help='path to save the cluster model')
    parser.add_argument('--lang_model_file', type=str, default='',
                        help='path to save the language model')
    parser.add_argument('--visual', action='store_true', default=False,
                        help='plot graphs')
    parser.add_argument('--skip_values', action='store_true', default=False,
                        help='skip values in ctx encoder')
    parser.add_argument('--model_type', type=str, default='rnn_model',
                        help='model type', choices=models.get_model_names())
    parser.add_argument('--domain', type=str, default='object_division',
                        help='domain for the dialogue')
    parser.add_argument('--clustering', action='store_true', default=False,
                        help='use clustering')
    parser.add_argument('--sep_sel', action='store_true', default=False,
                        help='use separate classifiers for selection')
    args = parser.parse_args()

    utils.use_cuda(args.cuda)
    utils.set_seed(args.seed)

    domain = get_domain(args.domain)
    model_ty = models.get_model_type(args.model_type)

    # The model class supplies its own corpus and engine types.
    corpus = model_ty.corpus_ty(domain, args.data,
                                freq_cutoff=args.unk_threshold,
                                verbose=True, sep_sel=args.sep_sel)
    model = model_ty(corpus.word_dict, corpus.item_dict_old,
                     corpus.context_dict, corpus.count_dict, args)
    if args.cuda:
        model.cuda()

    engine = model_ty.engine_ty(model, args, verbose=True)
    # The values returned by train() were never used here, so they are not bound.
    engine.train(corpus)

    utils.save_model(engine.get_model(), args.model_file)
def main():
    """Train the RNN dialogue model on the word corpus and save its weights.

    Command-line defaults are read from the project ``config`` module. After
    training, the final selection perplexity is logged and the model's
    ``state_dict`` is written to ``--model_file`` with ``torch.save``.
    """
    parser = argparse.ArgumentParser(description='training script')
    parser.add_argument('--data', type=str, default=config.data_dir,
                        help='location of the data corpus')
    parser.add_argument('--nembed_word', type=int, default=config.nembed_word,
                        help='size of word embeddings')
    parser.add_argument('--nembed_ctx', type=int, default=config.nembed_ctx,
                        help='size of context embeddings')
    parser.add_argument('--nhid_lang', type=int, default=config.nhid_lang,
                        help='size of the hidden state for the language module')
    parser.add_argument('--nhid_ctx', type=int, default=config.nhid_ctx,
                        help='size of the hidden state for the context module')
    parser.add_argument('--nhid_strat', type=int, default=config.nhid_strat,
                        help='size of the hidden state for the strategy module')
    parser.add_argument('--nhid_attn', type=int, default=config.nhid_attn,
                        help='size of the hidden state for the attention module')
    parser.add_argument('--nhid_sel', type=int, default=config.nhid_sel,
                        help='size of the hidden state for the selection module')
    parser.add_argument('--lr', type=float, default=config.lr,
                        help='initial learning rate')
    parser.add_argument('--min_lr', type=float, default=config.min_lr,
                        help='min threshold for learning rate annealing')
    parser.add_argument('--decay_rate', type=float, default=config.decay_rate,
                        help='decrease learning rate by this factor')
    parser.add_argument('--decay_every', type=int, default=config.decay_every,
                        help='decrease learning rate after decay_every epochs')
    parser.add_argument('--momentum', type=float, default=config.momentum,
                        help='momentum for sgd')
    parser.add_argument('--nesterov', action='store_true',
                        default=config.nesterov,
                        help='enable nesterov momentum')
    parser.add_argument('--clip', type=float, default=config.clip,
                        help='gradient clipping')
    parser.add_argument('--dropout', type=float, default=config.dropout,
                        help='dropout rate in embedding layer')
    parser.add_argument('--init_range', type=float, default=config.init_range,
                        help='initialization range')
    parser.add_argument('--max_epoch', type=int, default=config.max_epoch,
                        help='max number of epochs')
    parser.add_argument('--bsz', type=int, default=config.bsz,
                        help='batch size')
    parser.add_argument('--unk_threshold', type=int,
                        default=config.unk_threshold,
                        help='minimum word frequency to be in dictionary')
    parser.add_argument('--temperature', type=float,
                        default=config.temperature,
                        help='temperature')
    parser.add_argument('--sel_weight', type=float, default=config.sel_weight,
                        help='selection weight')
    parser.add_argument('--seed', type=int, default=config.seed,
                        help='random seed')
    parser.add_argument('--cuda', action='store_true', default=config.cuda,
                        help='use CUDA')
    parser.add_argument('--model_file', type=str, default='',
                        help='path to save the final model')
    parser.add_argument('--visual', action='store_true',
                        default=config.plot_graphs,
                        help='plot graphs')
    parser.add_argument('--domain', type=str, default=config.domain,
                        help='domain for the dialogue')
    parser.add_argument('--rnn_ctx_encoder', action='store_true',
                        default=config.rnn_ctx_encoder,
                        help='whether to use RNN for encoding the context')
    args = parser.parse_args()

    device_id = utils.use_cuda(args.cuda)
    logging.info("Starting training using pytorch version:%s",
                 torch.__version__)
    if args.cuda:
        # Name the device that was actually selected rather than GPU 0.
        cuda_status = ("enabled. Using device_id:" + str(device_id) +
                       " version:" + str(torch.version.cuda) +
                       " on gpu:" + torch.cuda.get_device_name(device_id))
    else:
        cuda_status = "disabled"
    logging.info("CUDA is %s", cuda_status)

    utils.set_seed(args.seed)

    logging.info(
        "Building word corpus, requiring minimum word frequency of %d for dictionary",
        args.unk_threshold)
    corpus = data.WordCorpus(args.data, freq_cutoff=args.unk_threshold,
                             verbose=True)

    logging.info("Building RNN-based dialogue model from word corpus")
    model = DialogModel(corpus.word_dict, corpus.item_dict,
                        corpus.context_dict, corpus.output_length, args,
                        device_id)
    # use_cuda's result is treated as "no GPU" when it is None.
    if device_id is not None:
        model.cuda(device_id)

    engine = Engine(model, args, device_id, verbose=True)
    logging.info("Training model")
    # Only the selection loss is reported; the other two results are unused.
    _, _, select_loss = engine.train(corpus)
    logging.info('final select_ppl %.3f', np.exp(select_loss))

    # Only the parameters (state_dict) are persisted, not the whole module.
    torch.save(engine.get_model().state_dict(), args.model_file)
def main():
    """Run self-play dialogues between two pretrained agents, Alice and Bob.

    Loads both models from the given files, wraps each in the agent type
    returned by ``get_agent_type`` (neither agent is trained here), and hands
    the resulting dialog, context generator and logger to ``SelfPlay``.
    """
    parser = argparse.ArgumentParser(description='selfplaying script')
    parser.add_argument('--alice_model_file', type=str,
                        help='Alice model file')
    parser.add_argument('--alice_forward_model_file', type=str,
                        help='Alice forward model file')
    parser.add_argument('--bob_model_file', type=str,
                        help='Bob model file')
    parser.add_argument('--context_file', type=str,
                        help='context file')
    parser.add_argument('--temperature', type=float, default=1.0,
                        help='temperature')
    parser.add_argument('--pred_temperature', type=float, default=1.0,
                        help='temperature')
    parser.add_argument('--verbose', action='store_true', default=False,
                        help='print out conversations')
    parser.add_argument('--seed', type=int, default=1,
                        help='random seed')
    parser.add_argument(
        '--score_threshold', type=int, default=6,
        help='successful dialog should have more than score_threshold in score')
    parser.add_argument('--max_turns', type=int, default=20,
                        help='maximum number of turns in a dialog')
    parser.add_argument('--log_file', type=str, default='',
                        help='log successful dialogs to file for training')
    parser.add_argument('--smart_alice', action='store_true', default=False,
                        help='make Alice smart again')
    # Help text used to be a copy of the --smart_alice description.
    parser.add_argument('--diverse_alice', action='store_true', default=False,
                        help='make Alice diverse')
    parser.add_argument('--rollout_bsz', type=int, default=3,
                        help='rollout batch size')
    parser.add_argument('--rollout_count_threshold', type=int, default=3,
                        help='rollout count threshold')
    parser.add_argument('--smart_bob', action='store_true', default=False,
                        help='make Bob smart again')
    parser.add_argument('--selection_model_file', type=str, default='',
                        help='path to save the final model')
    parser.add_argument('--rollout_model_file', type=str, default='',
                        help='path to save the final model')
    # Help text used to describe Alice's smart mode instead of Bob's diversity.
    parser.add_argument('--diverse_bob', action='store_true', default=False,
                        help='make Bob diverse')
    parser.add_argument('--ref_text', type=str,
                        help='file with the reference text')
    parser.add_argument('--cuda', action='store_true', default=False,
                        help='use CUDA')
    parser.add_argument('--domain', type=str, default='object_division',
                        help='domain for the dialogue')
    parser.add_argument('--visual', action='store_true', default=False,
                        help='plot graphs')
    parser.add_argument('--eps', type=float, default=0.0,
                        help='eps greedy')
    parser.add_argument('--data', type=str, default='data/negotiate',
                        help='location of the data corpus')
    parser.add_argument('--unk_threshold', type=int, default=20,
                        help='minimum word frequency to be in dictionary')
    parser.add_argument('--bsz', type=int, default=16,
                        help='batch size')
    # NOTE(review): this help text looks copied from --visual; the flag's real
    # meaning is not visible in this function, so it is left unchanged.
    parser.add_argument('--validate', action='store_true', default=False,
                        help='plot graphs')
    args = parser.parse_args()

    utils.use_cuda(args.cuda)
    utils.set_seed(args.seed)

    alice_model = utils.load_model(args.alice_model_file)
    alice_ty = get_agent_type(alice_model, args.smart_alice)
    alice = alice_ty(alice_model, args, name='Alice', train=False,
                     diverse=args.diverse_alice)
    alice.vis = args.visual

    bob_model = utils.load_model(args.bob_model_file)
    bob_ty = get_agent_type(bob_model, args.smart_bob)
    bob = bob_ty(bob_model, args, name='Bob', train=False,
                 diverse=args.diverse_bob)
    # Only Alice follows --visual; Bob's flag is always off.
    bob.vis = False

    dialog = Dialog([alice, bob], args)
    logger = DialogLogger(verbose=args.verbose, log_file=args.log_file)
    ctx_gen = ContextGenerator(args.context_file)

    selfplay = SelfPlay(dialog, ctx_gen, args, logger)
    selfplay.run()
def main():
    """Fine-tune Alice against Bob with ``Reinforce`` and save her model.

    The command-line options are declared as an ordered table and registered
    in that order. Alice is created with ``train=True`` and Bob with
    ``train=False``; after ``Reinforce.run()`` returns, Alice's model is
    passed to ``utils.save_model`` with ``--output_model_file``.
    """
    # Keyword arguments shared by every boolean switch.
    switch = {'action': 'store_true', 'default': False}
    cli_options = (
        ('--alice_model_file', dict(type=str, help='Alice model file')),
        ('--bob_model_file', dict(type=str, help='Bob model file')),
        ('--output_model_file', dict(type=str, help='output model file')),
        ('--context_file', dict(type=str, help='context file')),
        ('--temperature', dict(type=float, default=1.0, help='temperature')),
        ('--pred_temperature',
         dict(type=float, default=1.0, help='temperature')),
        ('--cuda', dict(switch, help='use CUDA')),
        ('--verbose', dict(switch, help='print out converations')),
        ('--seed', dict(type=int, default=1, help='random seed')),
        ('--score_threshold',
         dict(type=int, default=6,
              help='successful dialog should have more than score_threshold in score')),
        ('--log_file',
         dict(type=str, default='',
              help='log successful dialogs to file for training')),
        ('--smart_bob', dict(switch, help='make Bob smart again')),
        ('--gamma', dict(type=float, default=0.99, help='discount factor')),
        ('--eps', dict(type=float, default=0.5, help='eps greedy')),
        ('--momentum',
         dict(type=float, default=0.1, help='momentum for sgd')),
        ('--lr', dict(type=float, default=0.1, help='learning rate')),
        ('--clip', dict(type=float, default=0.1, help='gradient clip')),
        ('--rl_lr',
         dict(type=float, default=0.002, help='RL learning rate')),
        ('--rl_clip',
         dict(type=float, default=2.0, help='RL gradient clip')),
        ('--ref_text',
         dict(type=str, help='file with the reference text')),
        ('--sv_train_freq',
         dict(type=int, default=-1, help='supervision train frequency')),
        ('--nepoch', dict(type=int, default=1, help='number of epochs')),
        ('--hierarchical', dict(switch, help='use hierarchical training')),
        ('--visual', dict(switch, help='plot graphs')),
        ('--domain',
         dict(type=str, default='object_division',
              help='domain for the dialogue')),
        ('--selection_model_file',
         dict(type=str, default='', help='path to save the final model')),
        ('--data',
         dict(type=str, default='data/negotiate',
              help='location of the data corpus')),
        ('--unk_threshold',
         dict(type=int, default=20,
              help='minimum word frequency to be in dictionary')),
        ('--bsz', dict(type=int, default=16, help='batch size')),
        ('--validate', dict(switch, help='plot graphs')),
        ('--scratch', dict(switch, help='erase prediciton weights')),
        ('--sep_sel',
         dict(switch, help='use separate classifiers for selection')),
    )

    parser = argparse.ArgumentParser(description='Reinforce')
    for flag, spec in cli_options:
        parser.add_argument(flag, **spec)
    args = parser.parse_args()

    utils.use_cuda(args.cuda)
    utils.set_seed(args.seed)

    # Alice is the agent being trained.
    learner_model = utils.load_model(args.alice_model_file)
    learner_cls = get_agent_type(learner_model)
    alice = learner_cls(learner_model, args, name='Alice', train=True)
    alice.vis = args.visual

    # Bob is created with train=False.
    partner_model = utils.load_model(args.bob_model_file)
    partner_cls = get_agent_type(partner_model)
    bob = partner_cls(partner_model, args, name='Bob', train=False)

    dialog = Dialog([alice, bob], args)
    dialog_logger = DialogLogger(verbose=args.verbose,
                                 log_file=args.log_file)
    contexts = ContextGenerator(args.context_file)

    # Corpus and engine types are taken from Alice's model.
    dialogue_domain = get_domain(args.domain)
    corpus = learner_model.corpus_ty(dialogue_domain, args.data,
                                     freq_cutoff=args.unk_threshold,
                                     verbose=True, sep_sel=args.sep_sel)
    engine = learner_model.engine_ty(learner_model, args)

    trainer = Reinforce(dialog, contexts, args, engine, corpus,
                        dialog_logger)
    trainer.run()

    utils.save_model(alice.model, args.output_model_file)
def main():
    """Start an interactive chat between a human and a loaded model.

    The options are declared as an ordered table and registered in that
    order. ``--smart_ai`` selects the AI agent class, ``--ai_starts`` decides
    which agent comes first in the dialog, and an empty ``--context_file``
    switches to ``ManualContextGenerator``.
    """
    # Keyword arguments shared by every boolean switch.
    toggle = {'action': 'store_true', 'default': False}
    cli_options = (
        ('--model_file', dict(type=str, help='model file')),
        ('--domain',
         dict(type=str, default='object_division',
              help='domain for the dialogue')),
        ('--context_file',
         dict(type=str, default='', help='context file')),
        ('--temperature', dict(type=float, default=1.0, help='temperature')),
        ('--num_types',
         dict(type=int, default=3, help='number of object types')),
        ('--num_objects',
         dict(type=int, default=6, help='total number of objects')),
        ('--max_score',
         dict(type=int, default=10, help='max score per object')),
        ('--score_threshold',
         dict(type=int, default=6,
              help='successful dialog should have more than score_threshold in score')),
        ('--seed', dict(type=int, default=1, help='random seed')),
        ('--smart_ai', dict(toggle, help='make AI smart again')),
        ('--ai_starts', dict(toggle, help='allow AI to start the dialog')),
        ('--ref_text',
         dict(type=str, help='file with the reference text')),
        ('--cuda', dict(toggle, help='use CUDA')),
    )

    parser = argparse.ArgumentParser(description='chat utility')
    for flag, spec in cli_options:
        parser.add_argument(flag, **spec)
    args = parser.parse_args()

    utils.use_cuda(args.cuda)
    utils.set_seed(args.seed)

    human = HumanAgent(domain.get_domain(args.domain))

    if args.smart_ai:
        ai_cls = RnnRolloutAgent
    else:
        ai_cls = HierarchicalAgent
    ai = ai_cls(utils.load_model(args.model_file), args)

    # The first agent in the list is the one given priority by --ai_starts.
    if args.ai_starts:
        speakers = [ai, human]
    else:
        speakers = [human, ai]
    dialog = Dialog(speakers, args)
    chat_logger = DialogLogger(verbose=True)

    # Without a context file, contexts come from the manual generator.
    if args.context_file != '':
        contexts = ContextGenerator(args.context_file)
    else:
        contexts = ManualContextGenerator(args.num_types, args.num_objects,
                                          args.max_score)

    Chat(dialog, contexts, chat_logger).run()
# Iterate over every combination of batch size, sequence length and learning
# rate, building a fresh model, loss, optimizer and LR scheduler for each.
# NOTE(review): the nesting below is reconstructed from the data dependencies
# (ids_* depend on bs; run_name uses seq_len, lr and bs) -- confirm against
# the original layout.
for bs in batch_size:
    # Create corpuses: training and validation data are both read through the
    # same Corpus object, and the vocabulary size is taken from its dictionary
    # after both calls.
    train_corpus = utils.Corpus()
    ids_train = train_corpus.get_data(train_path, bs)
    ids_valid = train_corpus.get_data(valid_path, bs)
    train_vocab_size = len(train_corpus.dictionary)
    for seq_len in seq_lengths:
        # Number of whole seq_len-sized chunks along dimension 1 of each set.
        num_train_batches = ids_train.size(1) // seq_len
        num_valid_batches = ids_valid.size(1) // seq_len
        for lr in learning_rate:
            model = utils.initialize_model(model_num, train_vocab_size, embed_size)
            # utils.use_cuda returns the model object used from here on.
            model = utils.use_cuda(model)
            print('Training vocabulary size: {}'.format(train_vocab_size))
            print('Model: {}'.format(model.name))
            print('Number of parameters = {}'.format(
                sum(p.numel() for p in model.parameters())))
            # One pickle path per (model, seq_len, lr, bs) combination.
            run_name = "{}, seq_len={}, lr={}, bs={}".format(
                model.name, seq_len, lr, bs)
            file_path = os.path.join(save_dir, run_name + '.pkl')
            # Loss and Optimizer
            criterion = nn.CrossEntropyLoss()
            optimizer = torch.optim.Adam(model.parameters(), lr=lr)
            # TODO: tune the scheduler parameters (eps, patience, ...) and
            # evaluate other scheduler types; see
            # https://pytorch.org/docs/stable/optim.html
            lr_scheduler = LRscheduler.ReduceLROnPlateau(optimizer, eps=1e-7)
def main():
    """Fine-tune Alice with reinforcement learning against a frozen Bob.

    Command-line defaults are read from the project ``config`` module. Both
    models are loaded and put in ``eval()`` mode, Alice is wrapped in an
    ``RlAgent``, Bob in ``LstmRolloutAgent`` or ``LstmAgent`` depending on
    ``--smart_bob``, and ``Reinforce.run()`` drives the training. Alice's
    model is finally passed to ``utils.save_model`` with
    ``--output_model_file``.
    """
    parser = argparse.ArgumentParser(description='Reinforce')
    parser.add_argument('--data', type=str, default=config.data_dir,
                        help='location of the data corpus')
    parser.add_argument('--unk_threshold', type=int,
                        default=config.unk_threshold,
                        help='minimum word frequency to be in dictionary')
    parser.add_argument('--alice_model_file', type=str,
                        help='Alice model file')
    parser.add_argument('--bob_model_file', type=str,
                        help='Bob model file')
    parser.add_argument('--output_model_file', type=str,
                        help='output model file')
    parser.add_argument('--context_file', type=str,
                        help='context file')
    parser.add_argument('--temperature', type=float,
                        default=config.rl_temperature,
                        help='temperature')
    parser.add_argument('--cuda', action='store_true', default=config.cuda,
                        help='use CUDA')
    parser.add_argument('--verbose', action='store_true',
                        default=config.verbose,
                        help='print out conversations')
    parser.add_argument('--seed', type=int, default=config.seed,
                        help='random seed')
    parser.add_argument(
        '--score_threshold', type=int, default=config.rl_score_threshold,
        help='successful dialog should have more than score_threshold in score')
    parser.add_argument('--log_file', type=str, default='',
                        help='log successful dialogs to file for training')
    parser.add_argument('--smart_bob', action='store_true', default=False,
                        help='make Bob smart again')
    parser.add_argument('--gamma', type=float, default=config.rl_gamma,
                        help='discount factor')
    parser.add_argument('--eps', type=float, default=config.rl_eps,
                        help='eps greedy')
    parser.add_argument('--nesterov', action='store_true',
                        default=config.nesterov,
                        help='enable nesterov momentum')
    parser.add_argument('--momentum', type=float, default=config.rl_momentum,
                        help='momentum for sgd')
    parser.add_argument('--lr', type=float, default=config.rl_lr,
                        help='learning rate')
    parser.add_argument('--clip', type=float, default=config.rl_clip,
                        help='gradient clip')
    parser.add_argument('--rl_lr', type=float,
                        default=config.rl_reinforcement_lr,
                        help='RL learning rate')
    parser.add_argument('--rl_clip', type=float,
                        default=config.rl_reinforcement_clip,
                        help='RL gradient clip')
    parser.add_argument('--ref_text', type=str,
                        help='file with the reference text')
    parser.add_argument('--bsz', type=int, default=config.rl_bsz,
                        help='batch size')
    parser.add_argument('--sv_train_freq', type=int,
                        default=config.rl_sv_train_freq,
                        help='supervision train frequency')
    parser.add_argument('--nepoch', type=int, default=config.rl_nepoch,
                        help='number of epochs')
    parser.add_argument('--visual', action='store_true',
                        default=config.plot_graphs,
                        help='plot graphs')
    parser.add_argument('--domain', type=str, default=config.domain,
                        help='domain for the dialogue')
    parser.add_argument('--reward', type=str,
                        choices=['margin', 'fair', 'length'],
                        default='margin',
                        help='reward function')
    args = parser.parse_args()

    device_id = utils.use_cuda(args.cuda)
    logging.info("Starting training using pytorch version:%s",
                 torch.__version__)
    if args.cuda:
        # Name the device that was actually selected rather than GPU 0.
        cuda_status = ("enabled. Using device_id:" + str(device_id) +
                       " version:" + str(torch.version.cuda) +
                       " on gpu:" + torch.cuda.get_device_name(device_id))
    else:
        cuda_status = "disabled"
    logging.info("CUDA is %s", cuda_status)

    # --seed was parsed but never applied, so the flag had no effect.
    utils.set_seed(args.seed)

    alice_model = utils.load_model(args.alice_model_file)
    # we don't want to use Dropout during RL
    alice_model.eval()
    # Alice is a RL based agent, meaning that she will be learning while
    # selfplaying
    logging.info("Creating RlAgent from alice_model: %s",
                 args.alice_model_file)
    alice = RlAgent(alice_model, args, name='Alice')

    # we keep Bob frozen, i.e. we don't update his parameters
    if args.smart_bob:
        logging.info("Creating Bob's (--smart_bob) LstmRolloutAgent")
        bob_ty = LstmRolloutAgent
    else:
        logging.info("Creating Bob's (not --smart_bob) LstmAgent")
        bob_ty = LstmAgent
    bob_model = utils.load_model(args.bob_model_file)
    bob_model.eval()
    bob = bob_ty(bob_model, args, name='Bob')

    logging.info("Initializing communication dialogue between Alice and Bob")
    dialog = Dialog([alice, bob], args)
    logger = DialogLogger(verbose=args.verbose, log_file=args.log_file)
    ctx_gen = ContextGenerator(args.context_file)

    logging.info(
        "Building word corpus, requiring minimum word frequency of %d for dictionary",
        args.unk_threshold)
    corpus = data.WordCorpus(args.data, freq_cutoff=args.unk_threshold)
    engine = Engine(alice_model, args, device_id, verbose=False)

    logging.info("Starting Reinforcement Learning")
    reinforce = Reinforce(dialog, ctx_gen, args, engine, corpus, logger)
    reinforce.run()

    logging.info("Saving updated Alice model to %s", args.output_model_file)
    utils.save_model(alice.model, args.output_model_file)
def main():
    """Run self-play dialogues between Alice and Bob and report the results.

    Command-line options are parsed with argparse.  With ``--repeat_selfplay``
    the whole procedure is repeated for seeds 0..9, otherwise it runs once for
    ``--seed``.  For every seed the Alice and Bob models are loaded from
    ``<model_file>_<seed>.th``, wrapped in agents, and played against each
    other by ``SelfPlay``; the value returned by ``SelfPlay.run()`` is
    collected.  After all runs the markables and referents recorded by the
    last ``Dialog`` are dumped to JSON and the mean and standard deviation of
    the collected results are printed.

    Raises:
        FileNotFoundError: if ``--record_markables`` is given and the markable
            detector checkpoint for the current seed does not exist.
    """
    parser = argparse.ArgumentParser(description='selfplaying script')
    parser.add_argument('--alice_model_file', type=str,
                        help='Alice model file')
    parser.add_argument('--alice_forward_model_file', type=str,
                        help='Alice forward model file')
    parser.add_argument('--bob_model_file', type=str,
                        help='Bob model file')
    parser.add_argument('--context_file', type=str,
                        help='context file')
    parser.add_argument('--temperature', type=float, default=1.0,
                        help='temperature')
    parser.add_argument('--pred_temperature', type=float, default=1.0,
                        help='temperature')
    parser.add_argument('--log_attention', action='store_true', default=False,
                        help='log attention')
    parser.add_argument('--verbose', action='store_true', default=False,
                        help='print out converations')
    parser.add_argument('--seed', type=int, default=1,
                        help='random seed')
    parser.add_argument('--max_turns', type=int, default=20,
                        help='maximum number of turns in a dialog')
    parser.add_argument('--log_file', type=str, default='selfplay.log',
                        help='log dialogs to file')
    parser.add_argument('--smart_alice', action='store_true', default=False,
                        help='make Alice smart again')
    parser.add_argument('--rollout_bsz', type=int, default=3,
                        help='rollout batch size')
    parser.add_argument('--rollout_count_threshold', type=int, default=3,
                        help='rollout count threshold')
    parser.add_argument('--smart_bob', action='store_true', default=False,
                        help='make Bob smart again')
    parser.add_argument('--selection_model_file', type=str, default='',
                        help='path to save the final model')
    parser.add_argument('--rollout_model_file', type=str, default='',
                        help='path to save the final model')
    parser.add_argument('--ref_text', type=str,
                        help='file with the reference text')
    parser.add_argument('--cuda', action='store_true', default=False,
                        help='use CUDA')
    parser.add_argument('--domain', type=str, default='one_common',
                        help='domain for the dialogue')
    parser.add_argument('--visual', action='store_true', default=False,
                        help='plot graphs')
    parser.add_argument('--eps', type=float, default=0.0,
                        help='eps greedy')
    parser.add_argument('--data', type=str, default='data/onecommon',
                        help='location of the data corpus')
    parser.add_argument('--unk_threshold', type=int, default=10,
                        help='minimum word frequency to be in dictionary')
    parser.add_argument('--bsz', type=int, default=16,
                        help='batch size')
    parser.add_argument('--plot_metrics', action='store_true', default=False,
                        help='plot metrics')
    parser.add_argument('--markable_detector_file', type=str,
                        default="markable_detector",
                        help='visualize referents')
    parser.add_argument('--record_markables', action='store_true',
                        default=False,
                        help='record markables and referents')
    parser.add_argument('--repeat_selfplay', action='store_true',
                        default=False,
                        help='repeat selfplay')
    args = parser.parse_args()

    if args.repeat_selfplay:
        seeds = list(range(10))
    else:
        seeds = [args.seed]

    repeat_results = []

    for seed in seeds:
        utils.use_cuda(args.cuda)
        # The RNG is seeded from --seed on every iteration; the loop variable
        # `seed` only selects which checkpoint / data split files are used.
        utils.set_seed(args.seed)

        if args.record_markables:
            detector_path = (args.markable_detector_file + '_' + str(seed)
                             + '.th')
            # Fail with an explicit, descriptive error instead of a bare
            # `assert False`, which carries no message and is removed
            # entirely when Python runs with -O.
            if not os.path.exists(detector_path):
                raise FileNotFoundError(
                    "markable detector model not found: %s" % detector_path)
            markable_detector = utils.load_model(detector_path)
            if args.cuda:
                markable_detector.cuda()
            else:
                device = torch.device("cpu")
                markable_detector.to(device)
            markable_detector.eval()
            # NOTE(review): `domain` is not bound anywhere in this function,
            # so it must resolve at module scope -- confirm that it is
            # defined there and is the object corpus_ty expects.
            markable_detector_corpus = markable_detector.corpus_ty(
                domain, args.data,
                train='train_markable_{}.txt'.format(seed),
                valid='valid_markable_{}.txt'.format(seed),
                # alternative test split:
                # 'selfplay_reference_{}.txt'.format(seed)
                test='test_markable_{}.txt'.format(seed),
                freq_cutoff=args.unk_threshold, verbose=True)
        else:
            markable_detector = None
            markable_detector_corpus = None

        alice_model = utils.load_model(
            args.alice_model_file + '_' + str(seed) + '.th')
        alice_ty = get_agent_type(alice_model, args.smart_alice)
        alice = alice_ty(alice_model, args, name='Alice', train=False)

        bob_model = utils.load_model(
            args.bob_model_file + '_' + str(seed) + '.th')
        bob_ty = get_agent_type(bob_model, args.smart_bob)
        bob = bob_ty(bob_model, args, name='Bob', train=False)

        dialog = Dialog([alice, bob], args, markable_detector,
                        markable_detector_corpus)
        ctx_gen = ContextGenerator(
            os.path.join(args.data, args.context_file + '.txt'))
        with open(os.path.join(args.data, args.context_file + '.json'),
                  "r") as f:
            scenario_list = json.load(f)
        # Index the scenarios by uuid for the dialog logger.
        scenarios = {scenario['uuid']: scenario for scenario in scenario_list}
        logger = DialogLogger(verbose=args.verbose, log_file=args.log_file,
                              scenarios=scenarios)

        selfplay = SelfPlay(dialog, ctx_gen, args, logger)
        result = selfplay.run()
        repeat_results.append(result)

    # Only the Dialog of the last processed seed is dumped.
    print("dump selfplay_markables.json")
    dump_json(dialog.selfplay_markables, "selfplay_markables.json")
    print("dump selfplay_referents.json")
    dump_json(dialog.selfplay_referents, "selfplay_referents.json")

    print("repeat selfplay results %.8f ( %.8f )" %
          (np.mean(repeat_results), np.std(repeat_results)))
def main():
    """Entry point of the 'Reinforce' script.

    Parses the command line, loads the Alice and Bob models (both switched
    to eval mode), wraps Alice in an ``RlAgent`` and Bob in either a
    ``LstmRolloutAgent`` (``--smart_bob``) or an ``LstmAgent``, runs
    ``Reinforce`` over dialogues between the two, and finally saves Alice's
    model to ``--output_model_file``.
    """
    # Every command-line option as (flag, add_argument keyword arguments),
    # registered below in exactly this order.
    option_specs = [
        ('--data', dict(type=str, default='./data/negotiate',
                        help='location of the data corpus')),
        ('--unk_threshold', dict(
            type=int, default=20,
            help='minimum word frequency to be in dictionary')),
        ('--alice_model_file', dict(type=str, help='Alice model file')),
        ('--bob_model_file', dict(type=str, help='Bob model file')),
        ('--output_model_file', dict(type=str, help='output model file')),
        ('--context_file', dict(type=str, help='context file')),
        ('--temperature', dict(type=float, default=1.0,
                               help='temperature')),
        ('--cuda', dict(action='store_true', default=False,
                        help='use CUDA')),
        ('--verbose', dict(action='store_true', default=False,
                           help='print out converations')),
        ('--seed', dict(type=int, default=1, help='random seed')),
        ('--score_threshold', dict(
            type=int, default=6,
            help='successful dialog should have more than score_threshold in score')),
        ('--log_file', dict(
            type=str, default='',
            help='log successful dialogs to file for training')),
        ('--smart_bob', dict(action='store_true', default=False,
                             help='make Bob smart again')),
        ('--gamma', dict(type=float, default=0.99,
                         help='discount factor')),
        ('--eps', dict(type=float, default=0.5, help='eps greedy')),
        ('--nesterov', dict(action='store_true', default=False,
                            help='enable nesterov momentum')),
        ('--momentum', dict(type=float, default=0.0,
                            help='momentum for sgd')),
        ('--lr', dict(type=float, default=0.1, help='learning rate')),
        ('--clip', dict(type=float, default=0.1, help='gradient clip')),
        ('--rl_lr', dict(type=float, default=0.1,
                         help='RL learning rate')),
        ('--rl_clip', dict(type=float, default=0.1,
                           help='RL gradient clip')),
        ('--ref_text', dict(type=str,
                            help='file with the reference text')),
        ('--bsz', dict(type=int, default=8, help='batch size')),
        ('--sv_train_freq', dict(type=int, default=-1,
                                 help='supervision train frequency')),
        ('--nepoch', dict(type=int, default=1, help='number of epochs')),
        ('--visual', dict(action='store_true', default=False,
                          help='plot graphs')),
        ('--domain', dict(type=str, default='object_division',
                          help='domain for the dialogue')),
    ]

    cli = argparse.ArgumentParser(description='Reinforce')
    for flag, options in option_specs:
        cli.add_argument(flag, **options)
    args = cli.parse_args()

    device_id = utils.use_cuda(args.cuda)
    utils.set_seed(args.seed)

    # Alice: evaluation mode so that Dropout is not applied during RL.
    alice_net = utils.load_model(args.alice_model_file)
    alice_net.eval()
    alice = RlAgent(alice_net, args, name='Alice')

    # Bob: the agent class depends on --smart_bob.
    if args.smart_bob:
        bob_cls = LstmRolloutAgent
    else:
        bob_cls = LstmAgent
    bob_net = utils.load_model(args.bob_model_file)
    bob_net.eval()
    bob = bob_cls(bob_net, args, name='Bob')

    dialog = Dialog([alice, bob], args)
    dialog_logger = DialogLogger(verbose=args.verbose,
                                 log_file=args.log_file)
    context_source = ContextGenerator(args.context_file)

    corpus = data.WordCorpus(args.data, freq_cutoff=args.unk_threshold)
    engine = Engine(alice_net, args, device_id, verbose=False)

    trainer = Reinforce(dialog, context_source, args, engine, corpus,
                        dialog_logger)
    trainer.run()

    utils.save_model(alice.model, args.output_model_file)
def _evaluate_markable_detector(model, dataset):
    """Return (summed loss, correct tag count, total count) over `dataset`.

    Runs under ``torch.no_grad()``.  The caller is responsible for putting
    `model` into eval mode beforehand.
    """
    loss_sum, num_correct, num_total = 0, 0, 0
    with torch.no_grad():
        for batch in tqdm(dataset):
            # Only the words and their markable tags are needed here.
            _ctx, words, markables, _scenario_ids, _agents, _chat_ids = batch
            loss_sum += model.neg_log_likelihood(
                words, markables).sum().item()
            _score, tag_seq = model(words)
            num_correct += (torch.Tensor(tag_seq).long()
                            == markables).sum().item()
            num_total += len(tag_seq)
    return loss_sum, num_correct, num_total


def main():
    """Train and/or test a BiLSTM-CRF markable detector.

    With ``--repeat_train`` the procedure is repeated for seeds 0..9,
    otherwise it runs once for seed 1; the seed selects the
    ``{train,valid,test}_markable_<seed>.txt`` files and the checkpoint
    suffix, while the RNG is always seeded from ``--seed``.

    Unless ``--test_only`` is given, a fresh ``BiLSTM_CRF`` is trained with
    Adam for ``--max_epoch`` epochs, keeping a copy of the model with the
    lowest summed validation loss.  The best (or, with ``--test_only``, the
    loaded) model is then evaluated on the test set; after training it is
    also saved to ``<model_file>_<seed>.th`` and its state dict to
    ``stdict_<model_file>``.

    Several parsed options (e.g. ``--optimizer``, ``--clip``, ``--min_lr``,
    ``--decay_rate``, ``--dropout``) are not read by this function.
    """
    parser = argparse.ArgumentParser(
        description='training script for markable detection')
    parser.add_argument('--data', type=str, default='data/onecommon',
                        help='location of the data corpus')
    parser.add_argument('--nembed_word', type=int, default=128,
                        help='size of word embeddings')
    parser.add_argument('--nembed_ctx', type=int, default=128,
                        help='size of context embeddings')
    parser.add_argument(
        '--nhid_lang', type=int, default=128,
        help='size of the hidden state for the language module')
    parser.add_argument('--optimizer', choices=['adam', 'rmsprop'],
                        default='adam',
                        help='optimizer to use')
    parser.add_argument('--lr', type=float, default=0.001,
                        help='initial learning rate')
    parser.add_argument('--min_lr', type=float, default=1e-5,
                        help='min threshold for learning rate annealing')
    parser.add_argument('--decay_rate', type=float, default=9.0,
                        help='decrease learning rate by this factor')
    parser.add_argument(
        '--decay_every', type=int, default=1,
        help='decrease learning rate after decay_every epochs')
    parser.add_argument('--momentum', type=float, default=0.0,
                        help='momentum for sgd')
    parser.add_argument('--clip', type=float, default=0.5,
                        help='gradient clipping')
    parser.add_argument('--dropout', type=float, default=0.5,
                        help='dropout rate in embedding layer')
    parser.add_argument('--init_range', type=float, default=0.01,
                        help='initialization range')
    parser.add_argument('--max_epoch', type=int, default=10,
                        help='max number of epochs')
    parser.add_argument('--bsz', type=int, default=1,
                        help='batch size')
    parser.add_argument('--unk_threshold', type=int, default=20,
                        help='minimum word frequency to be in dictionary')
    parser.add_argument('--seed', type=int, default=1,
                        help='random seed')
    parser.add_argument('--cuda', action='store_true', default=False,
                        help='use CUDA')
    parser.add_argument('--model_file', type=str,
                        default='markable_detector',
                        help='path to save the final model')
    parser.add_argument('--domain', type=str, default='one_common',
                        help='domain for the dialogue')
    parser.add_argument('--tensorboard_log', action='store_true',
                        default=False,
                        help='log training with tensorboard')
    parser.add_argument('--repeat_train', action='store_true', default=False,
                        help='repeat training n times')
    parser.add_argument('--test_only', action='store_true', default=False,
                        help='test only')
    parser.add_argument('--corpus_type',
                        choices=['full', 'uncorrelated', 'success_only'],
                        default='full',
                        help='type of training corpus to use')
    args = parser.parse_args()

    if args.repeat_train:
        seeds = list(range(10))
    else:
        seeds = [1]

    for seed in seeds:
        utils.use_cuda(args.cuda)
        utils.set_seed(args.seed)

        domain = get_domain(args.domain)
        corpus = BiLSTM_CRF.corpus_ty(
            domain, args.data,
            train='train_markable_{}.txt'.format(seed),
            valid='valid_markable_{}.txt'.format(seed),
            test='test_markable_{}.txt'.format(seed),
            verbose=True)

        # Number of training epochs completed for this seed.  It stays 0
        # with --test_only (or --max_epoch < 1), so the final report below
        # never refers to an unbound loop variable.
        last_epoch = 0

        if args.test_only:
            best_model = utils.load_model(
                args.model_file + '_' + str(seed) + '.th')
            if args.cuda:
                best_model.cuda()
            else:
                device = torch.device("cpu")
                best_model.to(device)
            best_model.eval()
        else:
            model = BiLSTM_CRF(len(corpus.word_dict), corpus.bio_dict,
                               args.nembed_word, args.nhid_lang)
            # Move the model to the GPU before building the optimizer, as
            # the torch.optim documentation recommends.
            if args.cuda:
                model.cuda()
            optimizer = optim.Adam(model.parameters(), lr=args.lr)

            best_model, best_valid_loss = copy.deepcopy(model), float('inf')

            # The validation set is built once and reused in every epoch.
            validset, _validset_stats = corpus.valid_dataset(args.bsz)

            for epoch in range(1, args.max_epoch + 1):
                # The training set is rebuilt at the start of each epoch.
                trainset, _trainset_stats = corpus.train_dataset(args.bsz)

                # train pass
                model.train()
                for batch in tqdm(trainset):
                    model.zero_grad()

                    (_ctx, words, markables, _scenario_ids, _agents,
                     _chat_ids) = batch
                    words = Variable(words)
                    markables = Variable(markables)

                    loss = model.neg_log_likelihood(words, markables)
                    loss.sum().backward()
                    optimizer.step()

                # valid pass
                model.eval()
                valid_loss, correct, total = _evaluate_markable_detector(
                    model, validset)

                print("epoch {}".format(epoch))
                print("valid loss: {:.5f}".format(valid_loss))
                print("valid accuracy: {:.5f}".format(correct / total))

                if valid_loss < best_valid_loss:
                    print("update best model")
                    best_model = copy.deepcopy(model)
                    best_valid_loss = valid_loss

                last_epoch = epoch

        # test pass
        testset, _testset_stats = corpus.test_dataset(args.bsz)
        best_model.eval()
        test_loss, correct, total = _evaluate_markable_detector(
            best_model, testset)

        print("final test {}".format(last_epoch))
        print("test loss: {:.5f}".format(test_loss))
        print("test accuracy: {:.5f}".format(correct / total))

        if not args.test_only:
            utils.save_model(best_model,
                             args.model_file + '_' + str(seed) + '.th')
            utils.save_model(best_model.state_dict(),
                             'stdict_' + args.model_file)