def test_gettopk(data_dir='data/memes/',
                 dim_proj=256,
                 dim_att=128,
                 maxlen=None,
                 batch_size=700,
                 keep_ratio=1.,
                 shuffle_data=False,
                 learning_rate=0.0005,
                 global_steps=50000,
                 disp_freq=100,
                 save_freq=300,
                 test_freq=300,
                 saveto_file='params.npz',
                 weight_decay=0.0005,
                 sigmasqr=1,
                 tdim=1.,
                 reload_model=False,
                 train=True):
    """
    Topo-LSTM model evaluation: reloads saved parameters and reports top-k
    scores on the test set.
    tdim: scale time down by how many times
    """
    options = locals().copy()
    saveto = data_dir + saveto_file
    tmsaveto = data_dir + 'timeparams.npz'

    # loads graph
    Gp, node_index, node_reverse_index = data_utils.load_graph_withtrack(data_dir)
    options['n_events'] = len(node_index)
    print options

    # creates and initializes shared variables, then reloads the saved model.
    print 'Initializing variables...'
    params = init_params(options)
    print 'reusing saved model.'
    load_params(saveto, params)
    tparams = init_tparams(params)

    timeparams = init_timeparams(options)
    print 'reusing saved model.'
    load_params(tmsaveto, timeparams)
    timetparams = init_tparams(timeparams)

    # builds Topo-LSTM model
    print 'Building model...'
    model = tpgru_model.build_model(tparams, timetparams, options)

    print 'Loading test data...'
    test_examples = data_utils.load_examples_seq(data_dir,
                                                 dataset='test',
                                                 node_index=node_index,
                                                 maxlen=maxlen,
                                                 Gp=Gp)
    test_loader = data_utils.Loader(test_examples, options=options)
    print 'Loaded %d test examples' % len(test_examples)

    scores = evaluate_topk(model['f_prob'], test_loader, model['f_tprob'],
                           options['tdim'], node_reverse_index, data_dir)
    print 'eval scores: ', scores
    pprint.pprint(scores)
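# Hedged usage sketch (not part of the original file): test_gettopk() assumes a
# model has already been trained, since it unconditionally reloads params.npz
# and timeparams.npz from data_dir. The directory below is illustrative.
if __name__ == '__main__':
    test_gettopk(data_dir='data/memes/')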
print("Shape of test_mask: {}".format(test_mask.shape)) print("Average train cascade size: {}".format( np.mean(np.sum(train_mask, axis=1)))) print("Average validation cascade size: {}".format( np.mean(np.sum(validation_mask, axis=1)))) print("Average test cascade size: {}".format(np.mean(np.sum(test_mask, axis=1)))) print("***** Hyper Parameters *****") print("Learning rate: {}".format(FLAGS.learning_rate)) print("Batch size: {}".format(FLAGS.batch_size)) print("Max steps: {}".format(FLAGS.max_steps)) print("Regularization scale: {}".format(FLAGS.regularization_scale)) print("hidden_dim: {}".format(FLAGS.hidden_dim)) train_batches = data_utils.Loader(train_examples, train_mask, FLAGS.batch_size) print("Number of train batches: {}".format(len(train_batches))) # Define placeholders placeholders = { 'contents': tf.placeholder(tf.float32, shape=(None, FLAGS.hidden_dim)), 'sequences': tf.placeholder(tf.int32, shape=(None, FLAGS.max_steps + 1)), 'seq_mask': tf.placeholder(tf.int32, shape=(None, FLAGS.max_steps)), 'hit_at': tf.placeholder(tf.int32) } # Create model model = CascadeRNN(number_of_nodes, FLAGS.hidden_dim, FLAGS.max_steps, nx.to_numpy_matrix(G).astype(np.float32),
def train(data_dir='data/memes/',
          dim_proj=512,
          maxlen=30,
          batch_size=256,
          keep_ratio=1.,
          shuffle_data=True,
          learning_rate=0.001,
          global_steps=50000,
          disp_freq=100,
          save_freq=1000,
          test_freq=1000,
          saveto_file='params.npz',
          weight_decay=0.0005,
          reload_model=False,
          train=True):
    """
    Topo-LSTM model training.
    """
    options = locals().copy()
    saveto = data_dir + saveto_file

    # loads graph
    G, node_index = data_utils.load_graph(data_dir)
    print nx.info(G)
    options['n_words'] = len(node_index)
    print options

    # creates and initializes shared variables.
    print 'Initializing variables...'
    params = init_params(options)
    if reload_model:
        print 'reusing saved model.'
        load_params(saveto, params)
    tparams = init_tparams(params)

    # builds Topo-LSTM model
    print 'Building model...'
    model = tprnn_model.build_model(tparams, options)

    print 'Loading test data...'
    test_examples = data_utils.load_examples(data_dir,
                                             dataset='test',
                                             node_index=node_index,
                                             maxlen=maxlen,
                                             G=G)
    test_loader = data_utils.Loader(test_examples, options=options)
    print 'Loaded %d test examples' % len(test_examples)

    if train:
        # prepares training data.
        print 'Loading train data...'
        train_examples = data_utils.load_examples(data_dir,
                                                  dataset='train',
                                                  keep_ratio=options['keep_ratio'],
                                                  node_index=node_index,
                                                  maxlen=maxlen,
                                                  G=G)
        train_loader = data_utils.Loader(train_examples, options=options)
        print 'Loaded %d training examples.' % len(train_examples)

        # compiles updates.
        optimizer = downhill.build(algo='adam',
                                   loss=model['cost'],
                                   params=tparams.values(),
                                   inputs=model['data'])
        updates = optimizer.get_updates(max_gradient_elem=5.,
                                        learning_rate=learning_rate)
        f_update = theano.function(model['data'],
                                   model['cost'],
                                   updates=list(updates))

        # training loop.
        start_time = timeit.default_timer()

        n_examples = len(train_examples)
        batches_per_epoch = n_examples // options['batch_size'] + 1
        n_epochs = global_steps // batches_per_epoch + 1

        global_step = 0
        cost_history = []
        for _ in range(n_epochs):
            for _ in range(batches_per_epoch):
                cost = f_update(*train_loader())
                cost_history += [cost]

                if global_step % disp_freq == 0:
                    print 'global step %d, cost: %f' % (global_step, cost)

                # dump model parameters.
                if global_step % save_freq == 0:
                    params = unzip(tparams)
                    np.savez(saveto, **params)
                    pickle.dump(options, open('%s.pkl' % saveto, 'wb'), -1)

                # evaluate on test data.
                if global_step % test_freq == 0:
                    scores = evaluate(model['f_prob'], test_loader)
                    print 'eval scores: ', scores
                    end_time = timeit.default_timer()
                    print 'time used: %d seconds.' % (end_time - start_time)

                global_step += 1

    scores = evaluate(model['f_prob'], test_loader)
    pprint.pprint(scores)
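# A minimal standalone sketch (not part of the repo) of the downhill/Theano
# pattern used in train() above: build an Adam optimizer over a symbolic loss,
# turn its update pairs into a compiled step function, and call that function
# once per mini-batch. The toy loss, shapes, and variable names are
# illustrative assumptions only.
import numpy as np
import theano
import theano.tensor as T
import downhill

x = T.matrix('x')
w = theano.shared(np.zeros((5, 1), dtype=theano.config.floatX), name='w')
toy_loss = T.sqr(T.dot(x, w)).mean()

opt = downhill.build(algo='adam', loss=toy_loss, params=[w], inputs=[x])
toy_updates = opt.get_updates(max_gradient_elem=5., learning_rate=0.001)
step = theano.function([x], toy_loss, updates=list(toy_updates))

print step(np.random.randn(8, 5).astype(theano.config.floatX))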
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default='./data', type=str,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default='bert-base-uncased', type=str,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--pre_training_path", default='./pre_training', type=str,
                        help="model pre-training path")
    parser.add_argument("--save_path", default='./output', type=str,
                        help="model save path")
    parser.add_argument("--ngpu", default=1, type=int,
                        help="number of GPUs to use")
    parser.add_argument("--load_model", default=False, action='store_true',
                        help="load a saved model")
    parser.add_argument("--save_model", default=False, action='store_true',
                        help="save the trained model")
    parser.add_argument("--load_path", default='./output', type=str,
                        help="model load path")
    parser.add_argument("--is_test", default='./output', type=str,
                        help="model save path")
    parser.add_argument("--task_name", default='cloth', type=str,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir", default='EXP/', type=str, required=True,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--do_train", default=False, action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", default=False, action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size", default=4, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--cache_size", default=256, type=int,
                        help="Cache size for the data loader.")
    parser.add_argument("--eval_batch_size", default=8, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--num_log_steps", default=10, type=int,
                        help="Log training loss/accuracy every this many steps.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", default=False, action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--optimize_on_cpu', default=False, action='store_true',
                        help="Whether to perform optimization and keep the optimizer averages on CPU")
    parser.add_argument('--fp16', default=False, action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=128,
                        help='Loss scaling, positive power of 2 values can improve fp16 convergence.')

    args = parser.parse_args()

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    suffix = time.strftime('%Y%m%d-%H%M%S')
    args.output_dir = os.path.join(args.output_dir, suffix)
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    bert_list = []
    model_list = []
    for m in args.bert_model.split('+'):
        bert_list.append(m)
        model_list.append(chose_model_model(m, args))

    logging = get_logger(os.path.join(args.output_dir, 'log.txt'))

    data_file = []
    for m in bert_list:
        data_file.append({'train': 'train', 'valid': 'dev', 'test': 'test'})
        for key in data_file[-1].keys():
            data_file[-1][key] = data_file[-1][key] + '-' + m + '.pt'

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
        if args.fp16:
            logging("16-bits training currently not supported in distributed training")
            args.fp16 = False  # (see https://github.com/pytorch/pytorch/pull/13496)
    logging("device {} n_gpu {} distributed training {}".format(device, n_gpu, bool(args.local_rank != -1)))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    task_name = args.task_name.lower()

    num_train_steps = []
    train_data = []
    if args.do_train:
        for id, m in enumerate(bert_list):
            train_data.append(data_utils.Loader(args.data_dir, data_file[id]['train'],
                                                args.cache_size, args.train_batch_size, device))
            num_train_steps.append(int(
                train_data[-1].data_num / args.train_batch_size /
                args.gradient_accumulation_steps * args.num_train_epochs))
    # Move models to the right device and wrap them for fp16 / distributed /
    # multi-GPU training.
    if args.fp16:
        for id, model in enumerate(model_list):
            model_list[id].half()
    for id, model in enumerate(model_list):
        model_list[id].to(device)
    if args.local_rank != -1:
        for id, model in enumerate(model_list):
            model_list[id] = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        for id, model in enumerate(model_list):
            model_list[id] = torch.nn.DataParallel(model)

    # Prepare one optimizer parameter list per model.
    param_optimizer = []
    if args.fp16:
        for model in model_list:
            param_optimizer.append([(n, param.clone().detach().to('cpu').float().requires_grad_())
                                    for n, param in model.named_parameters()])
    elif args.optimize_on_cpu:
        for model in model_list:
            param_optimizer.append([(n, param.clone().detach().to('cpu').requires_grad_())
                                    for n, param in model.named_parameters()])
    else:
        for model in model_list:
            param_optimizer.append(list(model.named_parameters()))

    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = []
    for p_o in param_optimizer:
        optimizer_grouped_parameters.append([
            {'params': [p for n, p in p_o if not any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.01},
            {'params': [p for n, p in p_o if any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.0}
        ])

    global_step = 0

    if args.load_model:
        if args.ngpu > 1:
            for id, m in enumerate(bert_list):
                print(' model is loading...... PATH:' + args.load_path + '/' + m + '_' + str(args.ngpu) + '.bin')
                model_list[id] = torch.load(args.load_path + '/' + m + '_' + str(args.ngpu) + '.bin')
        else:
            for id, m in enumerate(bert_list):
                print(' model is loading...... PATH:' + args.load_path + '/' + m + '.bin')
                model_list[id] = torch.load(args.load_path + '/' + m + '.bin')

    if args.do_train:
        for id, model in enumerate(model_list):
            start = time.time()
            logging("***** Running training *****")
            logging("  Batch size = {}".format(args.train_batch_size))
            logging("  Num steps = {}".format(num_train_steps[id]))
            model.train()

            loss_history = []
            acc_history = []

            t_total = num_train_steps[id]
            if args.local_rank != -1:
                t_total = t_total // torch.distributed.get_world_size()
            optimizer = BertAdam(optimizer_grouped_parameters[id],
                                 lr=args.learning_rate,
                                 warmup=args.warmup_proportion,
                                 t_total=t_total)

            for _ in range(int(args.num_train_epochs)):
                tr_loss = 0
                tr_acc = 0
                nb_tr_examples, nb_tr_steps = 0, 0
                for inp, tgt in train_data[id].data_iter():
                    loss, acc = model(inp, tgt)
                    if n_gpu > 1:
                        loss = loss.mean()  # mean() to average on multi-gpu.
                        acc = acc.sum()
                    if args.fp16 and args.loss_scale != 1.0:
                        # rescale loss for fp16 training
                        # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                        loss = loss * args.loss_scale
                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps
                    loss.backward()

                    tr_loss += loss.item()
                    tr_acc += acc.item()
                    nb_tr_examples += inp[-1].sum()
                    nb_tr_steps += 1
                    if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0:
                        if args.fp16 or args.optimize_on_cpu:
                            if args.fp16 and args.loss_scale != 1.0:
                                # scale down gradients for fp16 training
                                for param in model.parameters():
                                    if param.grad is not None:
                                        param.grad.data = param.grad.data / args.loss_scale
                            is_nan = set_optimizer_params_grad(param_optimizer[id],
                                                               model.named_parameters(),
                                                               test_nan=True)
                            if is_nan:
                                logging("FP16 TRAINING: Nan in gradients, reducing loss scaling")
                                args.loss_scale = args.loss_scale / 2
                                model.zero_grad()
                                continue
                            optimizer.step()
                            copy_optimizer_params_to_model(model.named_parameters(), param_optimizer[id])
                        else:
                            optimizer.step()
                        model.zero_grad()
                        global_step += 1

                    if global_step % args.num_log_steps == 0:
                        logging('step: {} | train loss: {} | train acc {}'.format(
                            global_step, tr_loss / nb_tr_examples, tr_acc / nb_tr_examples))
                        loss_history.append([global_step, tr_loss])
                        acc_history.append([global_step, tr_acc])
                        tr_loss = 0
                        tr_acc = 0
                        nb_tr_examples = 0

            save_history_path = "./Cord_Pic"
            end = time.time()
            print(end - start)
            loss_history = np.array(loss_history)
            acc_history = np.array(acc_history)
            np.save(save_history_path + '/' + bert_list[id] + '.loss_history.npy', loss_history)  # save as .npy
            np.save(save_history_path + '/' + bert_list[id] + '.acc_history.npy', acc_history)  # save as .npy
            # to reload: np.load(path).tolist()

            if args.save_model:
                if args.ngpu > 1:
                    print(' model is saving...... PATH:' + args.load_path + '/' + bert_list[id] + '_' + str(args.ngpu) + '.bin')
                    torch.save(model, args.load_path + '/' + bert_list[id] + '_' + str(args.ngpu) + '.bin')
                else:
                    print(' model is saving...... PATH:' + args.load_path + '/' + bert_list[id] + '.bin')
                    torch.save(model, args.load_path + '/' + bert_list[id] + '.bin')

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        logging("***** Running evaluation *****")
        logging("  Batch size = {}".format(args.eval_batch_size))
        valid_data = []
        for id, m in enumerate(bert_list):
            valid_data.append(data_utils.Loader(args.data_dir, data_file[id]['valid'],
                                                args.cache_size, args.eval_batch_size, device))

        # Run prediction for full data
        for id, model in enumerate(model_list):
            out = []
            for inp, tgt in valid_data[id].data_iter(shuffle=False):
                with torch.no_grad():
                    one_out = model(inp, tgt)
                out.append(one_out)
            output = torch.zeros(valid_data[id].data_num, 4)
            for batch in range(int(valid_data[id].data_num / args.eval_batch_size)):
                output[batch * args.eval_batch_size: (batch + 1) * args.eval_batch_size] = out[batch]
            torch.save(output, bert_list[id] + '_out.pt')
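# The script's entry-point guard is not shown in this excerpt; a minimal one,
# assuming main() above is the intended entry point, would be:
if __name__ == "__main__":
    main()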
def train(data_dir='data/memes/',
          dim_proj=256,
          dim_att=128,
          maxlen=30,
          batch_size=256,
          keep_ratio=1.,
          shuffle_data=True,
          learning_rate=0.001,
          global_steps=50000,
          disp_freq=100,
          save_freq=100,
          test_freq=100,
          saveto_file='params.npz',
          tmsaveto_file='timeparams.npz',
          weight_decay=0.0005,
          sigmasqr=1,
          tdim=1.,
          reload_model=False,
          train=True):
    """
    Topo-LSTM model training.
    tdim: scale time down by how many times
    """
    options = locals().copy()
    saveto = data_dir + saveto_file
    tmsaveto = data_dir + tmsaveto_file

    # loads graph
    Gp, node_index = data_utils.load_graph(data_dir)
    options['n_events'] = len(node_index)
    print options

    # creates and initializes shared variables.
    print 'Initializing variables...'
    params = init_params(options)
    if reload_model:
        print 'reusing saved model.'
        load_params(saveto, params)
    tparams = init_tparams(params)

    timeparams = init_timeparams(options)
    if reload_model:
        print 'reusing saved model.'
        load_params(tmsaveto, timeparams)
    timetparams = init_tparams(timeparams)

    # builds Topo-LSTM model
    print 'Building model...'
    model = tpgru_model.build_model(tparams, timetparams, options)

    print 'Loading test data...'
    test_examples = data_utils.load_examples(data_dir,
                                             dataset='test',
                                             node_index=node_index,
                                             maxlen=maxlen,
                                             Gp=Gp)
    test_loader = data_utils.Loader(test_examples, options=options)
    print 'Loaded %d test examples' % len(test_examples)

    if train:
        # prepares training data.
        print 'Loading train data...'
        train_examples = data_utils.load_examples(data_dir,
                                                  dataset='train',
                                                  keep_ratio=options['keep_ratio'],
                                                  node_index=node_index,
                                                  maxlen=maxlen,
                                                  Gp=Gp)
        train_loader = data_utils.Loader(train_examples, options=options)
        print 'Loaded %d training examples.' % len(train_examples)

        # compiles updates for the event model and the time model.
        optimizer = downhill.build(algo='adam',
                                   loss=model['cost'],
                                   params=tparams.values(),
                                   inputs=model['data'])
        updates = optimizer.get_updates(max_gradient_elem=5.,
                                        learning_rate=learning_rate)
        f_update = theano.function(model['data'],
                                   model['cost'],
                                   updates=list(updates))

        toptimizer = downhill.build(algo='adam',
                                    loss=model['timecost'],
                                    params=timetparams.values(),
                                    inputs=model['timedata'])
        tupdates = toptimizer.get_updates(max_gradient_elem=5.,
                                          learning_rate=0.005)
        f_t_update = theano.function(model['timedata'],
                                     model['timecost'],
                                     updates=list(tupdates))

        # training loop.
        start_time = timeit.default_timer()

        n_examples = len(train_examples)
        batches_per_epoch = n_examples // options['batch_size'] + 1
        n_epochs = global_steps // batches_per_epoch + 1

        global_step = 0
        for _ in range(n_epochs):
            for _ in range(batches_per_epoch):
                batch_data = train_loader()
                cost = f_update(*(batch_data[:-3] + (batch_data[-2], )))
                timecost = f_t_update(*(batch_data[:-2] + (batch_data[-1], )))

                if global_step % disp_freq == 0:
                    print 'global step %d, cost: %f' % (global_step, cost)
                    print 'timecost: %f' % timecost

                # dump model parameters.
                if global_step % save_freq == 0:
                    params = unzip(tparams)
                    np.savez(saveto, **params)
                    pickle.dump(options, open('%s.pkl' % saveto, 'wb'), -1)
                    timeparams = unzip(timetparams)
                    np.savez(tmsaveto, **timeparams)

                # evaluate on test data.
                if global_step % test_freq == 0:
                    scores = evaluate(model['f_prob'], test_loader,
                                      model['f_tprob'], options['tdim'])
                    print 'eval scores: ', scores
                    end_time = timeit.default_timer()
                    print 'time used: %d seconds.' % (end_time - start_time)

                global_step += 1

    scores = evaluate(model['f_prob'], test_loader,
                      model['f_tprob'], options['tdim'])
    pprint.pprint(scores)
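# Hedged illustration (the actual field layout of a data_utils.Loader batch is
# not visible in this file): the slicing in the training loop feeds everything
# but the last three elements plus the second-to-last element to f_update, and
# everything but the last two plus the last element to f_t_update. With a
# hypothetical six-element batch:
_batch = ('x0', 'x1', 'x2', 'x3', 'x4', 'x5')
assert _batch[:-3] + (_batch[-2],) == ('x0', 'x1', 'x2', 'x4')        # f_update inputs
assert _batch[:-2] + (_batch[-1],) == ('x0', 'x1', 'x2', 'x3', 'x5')  # f_t_update inputs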