def test_Batch():
    b1 = Batch(None, None, 1500)
    b2 = Batch(None, None, 5)
    b3 = Batch(None, None, 123)
    batch_list = [b1, b2, b3]
    s = sorted(batch_list)
    assert s[0].batch_loss == 5
    assert s[1].batch_loss == 123
    assert s[2].batch_loss == 1500
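# A minimal sketch (not the project's actual implementation) of a Batch that would
# satisfy the test above, assuming the constructor stores (inputs, targets, batch_loss)
# and that ordering is defined by batch_loss.
from dataclasses import dataclass
from typing import Any


@dataclass
class Batch:
    inputs: Any
    targets: Any
    batch_loss: float

    # sorted() only needs __lt__: batches compare by their recorded loss
    def __lt__(self, other: "Batch") -> bool:
        return self.batch_loss < other.batch_loss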
def compute_loss(model, model_info, device, data, loss_fn):
    model.eval()
    all_losses = np.empty((0, 35))

    # LOOP THROUGH MINIBATCHES
    for step, (x, y) in tqdm.tqdm(
            enumerate(ptb_iterator(data, model.batch_size, model.seq_len)),
            total=(len(data) // model.batch_size - 1) // model.seq_len):

        if model_info.model == 'TRANSFORMER':
            batch = Batch(torch.from_numpy(x).long().to(device))
            model.zero_grad()
            outputs = model.forward(batch.data, batch.mask).transpose(1, 0)
        else:
            inputs = torch.from_numpy(x.astype(np.int64)).transpose(0, 1).contiguous().to(device)
            model.zero_grad()
            hidden = model.init_hidden().to(device)
            outputs, hidden = model(inputs, hidden)

        # Target
        targets = torch.from_numpy(y.astype(np.int64)).transpose(0, 1).contiguous().to(device)

        # Loss computation
        outputs = outputs.contiguous()
        losses_in_batch = []
        for output_t, target_t in zip(outputs, targets):
            losses_in_batch.append(loss_fn(output_t, target_t).data.item())
        all_losses = np.vstack((all_losses, losses_in_batch))

    # Return the mean loss per time step across all minibatches
    return np.mean(all_losses, axis=0)
def loadexpt(cellidx, filename, method, history, fraction=1., mean_adapt=False, roll=True):
    """
    Loads an experiment from disk

    Parameters
    ----------
    cellidx : int
        Index of the cell to load

    filename : string
        Name of the hdf5 file to load

    method : string
        The key in the hdf5 file to load ('train' or 'test')

    history : int
        Number of samples of history to include in the toeplitz stimulus

    fraction : float, optional
        Fraction of the experiment to load, must be between 0 and 1. (Default: 1.0)
    """
    assert 0 < fraction <= 1, "Fraction of data to load must be between 0 and 1"

    # currently only works with the Oct. 07, 15 experiment
    expt = '15-10-07'

    with notify('Loading {}ing data'.format(method)):

        # load the hdf5 file
        f = h5py.File(os.path.join(datadirs[os.uname()[1]], expt, filename + '.h5'), 'r')

        # length of the experiment
        expt_length = f[method]['time'].size
        num_samples = int(np.floor(expt_length * fraction))

        # load the stimulus
        stim = zscore(np.array(f[method]['stimulus'][:num_samples]).astype('float32'))

        # photoreceptor model of mean adaptation
        if mean_adapt:
            stim = pr_filter(10e-3, stim)

        # reshaped stimulus (nsamples, time/channel, space, space)
        if roll:
            stim_reshaped = np.rollaxis(np.rollaxis(rolling_window(stim, history, axis=0), 2), 3, 1)
        else:
            stim_reshaped = stim

        # get the response for this cell
        resp = np.array(f[method]['response/firing_rate_10ms'][cellidx, history:num_samples])

    return Batch(stim_reshaped, resp)
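# Hypothetical usage of loadexpt. The field names X and y are assumptions based on the
# trainer below, which reads training.X / training.y from the returned Batch.
train = loadexpt(0, 'whitenoise', 'train', history=40)
print(train.X.shape)  # toeplitz stimulus: (nsamples, time/channel, space, space)
print(train.y.shape)  # firing rate trace for the selected cell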
def compute_loss_one_batch(model):
    if len(model.megabatch) == 0:
        if model.megabatch_anneal == 0:
            for i in range(model.max_megabatch_size):
                if model.curr_idx < len(model.mb):
                    model.megabatch.append(model.mb[model.curr_idx][1])
                    model.curr_idx += 1
        else:
            if model.increment and model.curr_megabatch_size < model.max_megabatch_size:
                model.curr_megabatch_size += 1
                model.increment = False
                print("Increasing megabatch size to {0}".format(model.curr_megabatch_size))

            for i in range(model.curr_megabatch_size):
                if model.curr_idx < len(model.mb):
                    model.megabatch.append(model.mb[model.curr_idx][1])
                    model.curr_idx += 1
                    if model.curr_idx % model.megabatch_anneal == 0:
                        model.increment = True

        megabatch = []
        for n, i in enumerate(model.megabatch):
            arr = [model.data[t] for t in i]
            example_arr = []
            for j in arr:
                example = (j[0], j[1])
                if len(example[0].embeddings) > 0 and len(example[1].embeddings) > 0:
                    example_arr.append(example)
                    continue
                example[0].populate_embeddings(model.vocab, model.zero_unk, model.ngrams,
                                               model.scramble_rate)
                if not model.share_vocab:
                    example[1].populate_embeddings(model.vocab_fr, model.zero_unk, model.ngrams,
                                                   model.scramble_rate)
                else:
                    example[1].populate_embeddings(model.vocab, model.zero_unk, model.ngrams,
                                                   model.scramble_rate)
                example_arr.append(example)
            megabatch.append(example_arr)

        model.megabatch = megabatch

        if len(model.megabatch) == 0:
            return None

        sents1_list = []
        sents2_list = []
        sents1_lengths_list = []
        sents2_lengths_list = []

        for j in model.megabatch:
            sents1 = [i[0] for i in j]
            sents2 = [i[1] for i in j]

            sents_1_torch, lengths_1_torch = model.torchify_batch(sents1)
            if model.gpu:
                sents_1_torch = sents_1_torch.cuda()
                lengths_1_torch = lengths_1_torch.cuda()

            sents_2_torch, lengths_2_torch = model.torchify_batch(sents2)
            if model.gpu:
                sents_2_torch = sents_2_torch.cuda()
                lengths_2_torch = lengths_2_torch.cuda()

            sents1_list.append(sents_1_torch)
            sents2_list.append(sents_2_torch)
            sents1_lengths_list.append(lengths_1_torch)
            sents2_lengths_list.append(lengths_2_torch)

        p1_sents_list, p1_lengths_list, p2_sents_list, p2_lengths_list = get_pairs_batch(
            model, sents1_list, sents1_lengths_list, sents2_list, sents2_lengths_list)

        model.megabatch = []

        for i in range(len(p1_sents_list)):
            new_batch = Batch()
            new_batch.g1 = sents1_list[i]
            new_batch.g1_l = sents1_lengths_list[i]
            new_batch.g2 = sents2_list[i]
            new_batch.g2_l = sents2_lengths_list[i]
            new_batch.p1 = p1_sents_list[i]
            new_batch.p1_l = p1_lengths_list[i]
            new_batch.p2 = p2_sents_list[i]
            new_batch.p2_l = p2_lengths_list[i]
            model.megabatch.append(new_batch)

    curr_batch = model.megabatch.pop(0)
    g1, g2, p1, p2 = model.forward(curr_batch)

    return model.loss_function(g1, g2, p1, p2)
def run_epoch(model, data, is_train=False, device='cuda:0', n_devices=1):
    if is_train:
        model.train()  # Set model to training mode
        print("Training..")
        phase = 'train'
    else:
        model.eval()  # Set model to evaluate mode
        print("Evaluating..")
        phase = 'valid'

    start_time = time.time()
    loss = 0.0
    total_loss = 0.0
    total_tokens = 0
    batch_tokens = 0.0
    total_seqs = 0
    tokens = 0
    total_correct = 0.0
    n_correct = 0.0
    wer_score = 0.0
    total_wer_score = 0.0
    count = 0
    gt = []
    hyp = []

    # For progress bar
    bar = progressbar.ProgressBar(maxval=dataset_sizes[phase],
                                  widgets=[progressbar.Bar('=', '[', ']'), ' ',
                                           progressbar.Percentage()])
    bar.start()
    j = 0

    # Loop over minibatches
    for step, (x, x_lengths, y, y_lengths, hand_regions, hand_lengths) in enumerate(data):

        # Update progress bar with every iter
        j += len(x)
        bar.update(j)

        y = torch.from_numpy(y).to(device)
        x = x.to(device)

        if args.hand_query:
            hand_regions = hand_regions.to(device)
        else:
            hand_regions = None

        batch = Batch(x_lengths, y_lengths, hand_lengths, trg=None,
                      emb_type=args.emb_type, DEVICE=device,
                      fixed_padding=args.fixed_padding, rel_window=args.rel_window)

        if args.distributed:
            # Zeroing gradients
            feature_extractor.zero_grad()
            encoder.zero_grad()
            position.zero_grad()
            output_layer.zero_grad()

            src_emb, _, _ = feature_extractor(x)
            src_emb = position(src_emb)
            src_emb = encoder(src_emb, None, batch.src_mask)
            output_context = output_layer(src_emb)

            if args.hand_query:
                hand_extractor.zero_grad()
                hand_emb = hand_extractor(hand_regions)
                hand_emb = position(hand_emb)
                hand_emb = encoder(hand_emb, None, batch.src_mask)
                output_hand = output_layer(hand_emb)

                comb_emb = encoder(src_emb, hand_emb, batch.rel_mask)
                output = output_layer(comb_emb)
            else:
                output = None
                output_hand = None
        else:
            # Zeroing gradients
            model.zero_grad()

            # Shape (batch_size, tgt_seq_length, tgt_vocab_size)
            # NOTE: no need for trg since we don't have a decoder
            output, output_context, output_hand = model.forward(
                x, batch.src_mask, batch.rel_mask, hand_regions, args.arch)

        # CTC loss expects (Seq, batch, vocab)
        if args.hand_query:
            output = output.transpose(0, 1)
            output_context = output_context.transpose(0, 1)
            output_hand = output_hand.transpose(0, 1)
        else:
            output = output_context.transpose(0, 1)

        x_lengths = torch.IntTensor(x_lengths)
        y_lengths = torch.IntTensor(y_lengths)

        if not is_train:
            # Run CTC beam decoder using tensorflow
            # NOTE: blank token in Tensorflow must be (N-classes - 1)
            # Returns a tuple of sentences and probs
            decodes, _ = tf.nn.ctc_beam_search_decoder(
                inputs=output.cpu().detach().numpy(),
                sequence_length=x_lengths.cpu().detach().numpy(),
                merge_repeated=False,
                beam_width=10,
                top_paths=1)

            # Get top 1 path: (batch, Seq)
            pred = decodes[0]
            # Transform sparse tensor to numpy
            pred = tf.sparse.to_dense(pred).numpy()

            for i in range(len(y)):
                # NOTE: we are doing log inside the ctc decoder
                # pred = (seq, beam, batch)
                ys = y[i, :y_lengths[i]]
                p = pred[i]

                hyp = ' '.join([vocab[x.item()] for x in p])
                gt = ' '.join([vocab[x.item()] for x in ys])

                total_wer_score += wer(gt, hyp, standardize=True)
                count += 1

        # output (Seq, batch, vocab_size)
        # y (batch, trg_size)
        # x_lengths (batch)
        # y_lengths (batch)
        # NOTE: CTC loss produces NaN values if x lengths are shorter than y lengths.
        # When extracting keyframes, make sure your src lengths are long enough,
        # or simply use zero_infinity.
        # Doing average loss here
        # IMPORTANT: use the PyTorch CTCLoss
        loss = ctc_loss(output, y.cpu(), x_lengths.cpu(), y_lengths.cpu())

        if args.hand_query:
            loss += ctc_loss(output_context, y.cpu(), x_lengths.cpu(), y_lengths.cpu())
            loss += ctc_loss(output_hand, y.cpu(), x_lengths.cpu(), y_lengths.cpu())
            loss = loss / 3

        total_loss += loss
        total_seqs += batch.seq
        total_tokens += (y != blank_index).data.sum()
        tokens += (y != blank_index).data.sum()
        batch_tokens += (y != blank_index).data.sum()

        if is_train:
            loss.backward()
            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()

        if step % 100 == 0:
            elapsed = time.time() - start_time
            print("Step: %d, Loss: %f, Frame per Sec: %f, Token per sec: %f" %
                  (step, loss / batch_tokens,
                   total_seqs * batch_size / elapsed, tokens / elapsed))
            start_time = time.time()
            total_seqs = 0
            tokens = 0
            batch_tokens = 0.0

        # Free some memory
        # NOTE: this helps a lot in avoiding cuda out of memory
        del loss, output, output_context, output_hand, y, hand_regions, batch

    if is_train:
        print("Average Loss: %f" % (total_loss.item() / total_tokens.item()))
        return total_loss.item() / total_tokens.item()
    else:
        # Measure WER over the whole dataset
        print('Measuring WER..')
        print("Average WER: %f" % (total_wer_score / count))
        return total_loss.item() / total_tokens.item(), total_wer_score / count
def run_epoch(model, data, is_train=False, device='cuda:0', n_devices=1):
    if is_train:
        model.train()  # Set model to training mode
        print("Training..")
        phase = 'train'
    else:
        model.eval()  # Set model to evaluate mode
        print("Evaluating..")
        phase = 'valid'

    start_time = time.time()
    loss = 0.0
    total_loss = 0.0
    total_tokens = 0
    total_seqs = 0
    tokens = 0
    total_correct = 0.0
    n_correct = 0.0
    total_wer_score = 0.0
    sentence_count = 0
    targets = []
    hypotheses = []

    # For progress bar
    bar = progressbar.ProgressBar(maxval=dataset_sizes[phase],
                                  widgets=[progressbar.Bar('=', '[', ']'), ' ',
                                           progressbar.Percentage()])
    bar.start()
    j = 0

    # Loop over minibatches
    for step, (x, x_lengths, y, y_lengths, gloss, gloss_lengths) in enumerate(data):

        # Update progress bar with every iter
        j += len(x)
        bar.update(j)

        if gloss is not None:
            gloss = torch.from_numpy(gloss).to(device)

        y = torch.from_numpy(y).to(device)
        x = x.to(device)

        # NOTE: clone y to avoid overriding it
        batch = Batch(x_lengths, y_lengths, None, y.clone(), emb_type=args.emb_type,
                      DEVICE=device, fixed_padding=args.fixed_padding,
                      rel_window=args.rel_window)

        model.zero_grad()

        # Returns a tuple of (output, encoder_output)
        # output = (batch_size, tgt_seq_length, tgt_vocab_size)
        # encoder_output = (batch_size, input_seq_length, trg_vocab_size)
        if args.hybrid:
            output, encoder_output = model.forward(x, batch.trg, batch.src_mask,
                                                   batch.trg_mask, batch.rel_mask, None)
            # CTC loss expects (batch, trg_seq, trg_vocab)
            encoder_output = encoder_output.transpose(0, 1)
        else:
            output = model.forward(x, batch.trg, batch.src_mask, batch.trg_mask,
                                   batch.rel_mask, None)

        # Produce translations for the BLEU score
        # Evaluate on dev
        if not is_train:
            x = Variable(x)
            translations = greedy_decode(model, x, None, batch.rel_mask, batch.src_mask,
                                         max_len=20, start_symbol=1, device=device)

            # Loop over batch to create sentences
            for i in range(len(y)):
                ys = y[i, :]
                ys = ys[ys != 0]
                # NOTE: keep eos, drop sos
                ys = ys[1:]
                translation = translations[i]

                hyp_trans = [vocab[x.item()] for x in translation]
                gt_trans = [vocab[x.item()] for x in ys]

                translation_corpus.append(hyp_trans)
                # NOTE: must be a list of lists (since we have 1 reference per gt sentence)
                reference_corpus.append([gt_trans])

        x_lengths = torch.IntTensor(x_lengths)
        y_lengths = torch.IntTensor(y_lengths)
        if gloss_lengths is not None:
            gloss_lengths = torch.IntTensor(gloss_lengths)

        # Get the CTC loss of the batch without averaging
        if args.hybrid:
            loss_ctc = ctc_loss(encoder_output, gloss.cpu(), x_lengths.cpu(),
                                gloss_lengths.cpu())

        # Remove sos tokens from y
        y = y[:, 1:]

        # Predicted words with highest prob
        _, pred = torch.max(output, dim=-1)

        # NOTE: don't count pad
        for i in range(y.shape[0]):
            n_correct += (pred[i, :y_lengths[i] - 1] == y[i, :y_lengths[i] - 1]).sum()

        # NOTE: The transformer is an auto-regressive model: it makes predictions one part
        # at a time and uses its output so far to decide what to do next. Teacher forcing
        # passes the true output to the next time step regardless of what the model
        # predicts at the current time step.
        # Input of decoder (with sos and without eos)
        # Target (without sos and with eos)

        # NOTE: pred must be the same shape as y
        y = y.contiguous().view(-1)
        pred = pred.contiguous().view(-1)
        output = output.view(-1, vocab_size)
        assert y.shape == pred.shape

        # Get the cross-entropy loss (from the decoder) of the batch without averaging
        loss = loss_fn(output, y)

        if args.hybrid:
            # Joint CTC/decoder loss
            loss = loss + loss_ctc

        total_loss += loss
        total_seqs += batch.seq
        total_tokens += batch.ntokens
        tokens += batch.ntokens
        total_correct += n_correct

        if is_train:
            loss.backward()
            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()

        if step % 100 == 0:
            elapsed = time.time() - start_time
            print("Step: %d, Loss: %f, Frame per Sec: %f, Token per sec: %f, Word Accuracy: %f" %
                  (step, loss / batch.ntokens, total_seqs * batch_size / elapsed,
                   tokens / elapsed, n_correct.item() / tokens.item()))
            start_time = time.time()
            total_seqs = 0
            tokens = 0
            n_correct = 0.0

        # Free some memory
        # NOTE: this helps a lot in avoiding cuda out of memory
        del loss, output, y

    if is_train:
        print("Total word Accuracy: %f" % (total_correct.item() / total_tokens.item()))
        return total_loss.item() / total_tokens.item()
    else:
        return (translation_corpus, reference_corpus,
                total_loss.item() / total_tokens.item(),
                total_correct.item() / total_tokens.item())
# Update progress bar with every iter
i += len(x)
bar.update(i)

if args.hand_query:
    hand_regions = hand_regions.to(device)
else:
    hand_regions = None

y = torch.from_numpy(y).to(device)
x = x.to(device)

batch = Batch(x_lengths, y_lengths, hand_lengths, trg=None, DEVICE=device,
              emb_type=args.emb_type, fixed_padding=None, rel_window=args.rel_window)

# with torch.no_grad():
output, output_context, output_hand = model.forward(
    x, batch.src_mask, batch.rel_mask, hand_regions)

# CTC loss expects (Seq, batch, vocab)
if args.hand_query:
    output = output.transpose(0, 1)
    output_context = output_context.transpose(0, 1)
    output_hand = output_hand.transpose(0, 1)
else:
# Loop over minibatches
for step, (x, x_lengths, y, y_lengths, gloss, gloss_lengths) in enumerate(dataloader):

    # Update progress bar with every iter
    i += len(x)
    bar.update(i)

    y = torch.from_numpy(y).to(device)
    x = x.to(device)

    batch = Batch(x_lengths, y_lengths, y, DEVICE=device, emb_type='2d',
                  fixed_padding=None, rel_window=args.rel_window)

    with torch.no_grad():
        # Return translation using our trained model
        translations = decoding(model, x, batch, None, start_symbol=1,
                                max_len=args.decoding_length, method=args.decoding,
                                n_beam=args.n_beam, device=device)
def cnndm_test_full(args, model, logger):
    model = AutoExtSummarizer(args)
    model.to(args.device)
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained(args.model_name)

    train_dataset = CNNDMBlobNoTokens(prefix='test', data_path=args.data_dir,
                                      label_key=args.label_key)
    train_sampler = SequentialSampler(train_dataset)
    model_collate_fn = functools.partial(collate, pad_token_id=tokenizer.pad_token_id)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=1,
                                  collate_fn=single_collate,
                                  num_workers=args.num_workers)

    logger.info("***** Running CNNDM evaluation *****")
    logger.info("  Num examples = %d", len(train_dataset))

    gold = []
    pred = []

    for batch in tqdm(train_dataloader, desc="Evaluating"):
        summary = batch[0][2]
        story = batch[0][0]
        blocks = create_labeled_blocks(args, batch[0], tokenizer)

        block_scores = []
        memory = None
        for block in blocks:
            _batch = Batch([block], pad_token_id=tokenizer.pad_token_id)

            source = _batch.src.to(args.device)
            encoder_mask = _batch.mask.to(args.device)
            clss = _batch.clss.to(args.device)
            cls_mask = _batch.mask_cls.to(args.device).bool()

            with torch.no_grad():
                sent_scores, mask, memory = model(source, encoder_mask, clss, cls_mask,
                                                  memory=memory)

                # Separates padding from the scores that are actually 0
                sent_scores = sent_scores + mask.float()
                sent_scores = sent_scores.cpu().data.numpy()
                block_scores.extend(sent_scores[0])

        selected_ids = np.argsort(block_scores)[::-1]

        _pred = []
        for i in selected_ids:
            candidate = story[i].strip()
            if not _block_tri(candidate, _pred):
                _pred.append(candidate)
            if len(_pred) == 3:
                break

        pred.append(_pred)
        gold.append(summary)

    # python rouge implementation
    rouge = Rouge()
    rouge_score = rouge.get_scores([" ".join(p) for p in pred],
                                   [" ".join(g) for g in gold], avg=True)
    rouge_score_formatted = format_rouge_scores(rouge_score)
    rouge_table = format_rouge_table(rouge_score)

    similarity_score = calc_sbert_similarity(pred, gold)

    print(rouge_score_formatted)
    print("Similarity score (sbert): %.3f" % similarity_score)
    print(rouge_table + " & %.3f" % similarity_score)

    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    rouge_output_file = os.path.join(
        args.output_dir,
        "cnndm_test_full_results_{}_{}.txt".format(
            args.model_name,
            os.path.basename(args.model_path).split(".")[0]))

    with open(rouge_output_file, 'w', encoding="utf-8") as f:
        f.write(rouge_score_formatted)
        f.write("Similarity score (sbert): %.3f\n" % similarity_score)
        f.write(rouge_table + " & %.3f" % similarity_score)
def collate(batch, pad_token_id=0, device=None, is_test=False):
    return Batch(batch, pad_token_id=pad_token_id, device=device, is_test=is_test)
def collate(batch, pad_token_id=0, device=None):
    return Batch(batch, pad_token_id=pad_token_id, device=device)
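# Hypothetical wiring of collate into a PyTorch DataLoader: pad_token_id is bound with
# functools.partial (as in the evaluation code above), so the loader hands collate the raw
# list of examples and gets back a padded Batch. `dataset` and `tokenizer` are placeholders.
import functools

from torch.utils.data import DataLoader

loader = DataLoader(dataset,
                    batch_size=8,
                    collate_fn=functools.partial(collate, pad_token_id=tokenizer.pad_token_id))

for batch in loader:
    src, mask = batch.src, batch.mask  # padded token ids and the matching attention mask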
def train_full(args, model, tokenizer, writer):
    """ Fine-tune the pretrained model on the corpus. """
    set_seed(args)

    # Load the data
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_dataset = CNNDMBlobNoTokens(prefix='train', data_path=args.data_dir,
                                      label_key=args.label_key)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=1,
                                  collate_fn=single_collate,
                                  num_workers=args.num_workers)

    # Training schedule
    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = t_total // (
            len(train_dataloader) // args.gradient_accumulation_steps + 1)
    else:
        t_total = (len(train_dataloader) // args.gradient_accumulation_steps
                   * args.num_train_epochs)

    # BertSum optimizer and scheduler
    if args.optim == 'bertsum':
        optimizer = build_optim(args, model, None)
    else:
        # Prepare optimizer and schedule (linear warmup and decay)
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': args.weight_decay
        }, {
            'params': [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr)
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=t_total * 0.1,
                                                    num_training_steps=t_total)

    if 'score' in args.label_key:
        criterion = torch.nn.MSELoss(reduction='sum')
    else:
        criterion = torch.nn.BCELoss(reduction='sum')

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Train
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps)
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    model.zero_grad()
    train_iterator = trange(args.num_train_epochs, desc="Epoch", disable=True)

    global_step = 0
    tr_loss = 0.0
    logging_loss = 0.0
    start_time = time.time()
    num_docs = 0
    real_batch = []

    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            num_docs += 1
            blocks = create_labeled_blocks(args, batch[0], tokenizer)

            free_slots = args.train_batch_size - len(real_batch)
            real_batch.extend(blocks[:free_slots])

            if len(real_batch) == args.train_batch_size:
                _batch = Batch(real_batch, pad_token_id=tokenizer.pad_token_id)
                source, encoder_mask, target, clss, cls_mask = (
                    _batch.src, _batch.mask, _batch.labels, _batch.clss, _batch.mask_cls)

                source = source.to(args.device)
                target = target.to(args.device)
                encoder_mask = encoder_mask.to(args.device)
                cls_mask = cls_mask.to(args.device).bool()
                clss = clss.to(args.device)

                model.train()
                outputs, mask, _ = model(source, encoder_mask, clss, cls_mask)

                loss = criterion(outputs, target.float())

                if args.n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu parallel training

                # Only do this if using a mean loss
                # if args.gradient_accumulation_steps > 1:
                #     loss /= args.gradient_accumulation_steps

                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                tr_loss += loss.item()

                real_batch = []
                real_batch.extend(blocks[free_slots:])

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.max_grad_norm:
                        if args.fp16:
                            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                                           args.max_grad_norm)
                        else:
                            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                           args.max_grad_norm)

                    optimizer.step()
                    if args.optim != 'bertsum':
                        scheduler.step()
                    model.zero_grad()
                    global_step += 1

                    if args.max_steps > 0 and global_step > args.max_steps:
                        epoch_iterator.close()
                        break

                    if global_step % args.logging_steps == 0:
                        elapsed = time.time() - start_time
                        logger.info("##STEP: %i", global_step)
                        logger.info("Unscaled loss: %f", tr_loss)
                        logger.info("Scaled loss: %f",
                                    tr_loss / (global_step * args.train_batch_size
                                               * args.gradient_accumulation_steps))
                        if args.optim == 'bertsum':
                            logger.info(
                                "loss: %4.2f; lr: %7.7f; %3.0f docs/s;",
                                (tr_loss - logging_loss) / args.logging_steps,
                                optimizer.learning_rate,
                                (global_step * args.train_batch_size
                                 * args.gradient_accumulation_steps) / elapsed,
                            )
                        else:
                            logger.info(
                                "loss: %4.2f; lr: %7.7f; %3.0f docs/s;",
                                (tr_loss - logging_loss) / args.logging_steps,
                                scheduler.get_lr()[0],
                                (global_step * args.train_batch_size
                                 * args.gradient_accumulation_steps) / elapsed,
                            )
                        logger.info("num docs: %f", num_docs)
                        logger.info("docs per sec: %f", num_docs / elapsed)

                        if args.optim == 'bertsum':
                            writer.add_scalar("train/lr", optimizer.learning_rate, global_step)
                        else:
                            writer.add_scalar("train/lr", scheduler.get_lr()[0], global_step)
                        writer.add_scalar("train/loss",
                                          (tr_loss - logging_loss) / args.logging_steps,
                                          global_step)
                        writer.add_scalar("train/loss_norm",
                                          tr_loss / (global_step * args.train_batch_size
                                                     * args.gradient_accumulation_steps),
                                          global_step)
                        logging_loss = tr_loss

                    if global_step % args.eval_save_steps == 0 or global_step == 2000:
                        if not os.path.isdir(args.output_dir):
                            os.mkdir(args.output_dir)
                        checkpoint_path = os.path.join(
                            args.output_dir, "model_step_{}.bin".format(global_step))
                        checkpoint = model.state_dict()
                        if args.n_gpu > 1:
                            from collections import OrderedDict
                            new_state_dict = OrderedDict()
                            for k, v in checkpoint.items():
                                name = k[7:]  # remove 'module.' prefix added by DataParallel
                                new_state_dict[name] = v
                            checkpoint = new_state_dict
                        torch.save(checkpoint, checkpoint_path)

        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    # Final logging and checkpoint
    if args.optim == 'bertsum':
        writer.add_scalar("train/lr", optimizer.learning_rate, global_step)
    else:
        writer.add_scalar("train/lr", scheduler.get_lr()[0], global_step)
    writer.add_scalar("train/loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
    writer.add_scalar("train/loss_norm",
                      tr_loss / (global_step * args.train_batch_size
                                 * args.gradient_accumulation_steps),
                      global_step)
    logging_loss = tr_loss

    checkpoint_path = os.path.join(args.output_dir, "model_step_{}.bin".format(global_step))
    checkpoint = model.state_dict()
    if args.n_gpu > 1:
        from collections import OrderedDict
        new_state_dict = OrderedDict()
        for k, v in checkpoint.items():
            name = k[7:]  # remove 'module.' prefix added by DataParallel
            new_state_dict[name] = v
        checkpoint = new_state_dict
    torch.save(checkpoint, checkpoint_path)
    torch.save(args, os.path.join(args.output_dir,
                                  "training_arguments_{}.bin".format(global_step)))
    torch.save(optimizer, os.path.join(args.output_dir,
                                       "optimizer_step_{}.bin".format(global_step)))

    return global_step, tr_loss / global_step
def __init__(self, cell_index, stimulus_type, loss, optimizer, mean_adapt):
    """
    Superclass for managing keras models

    Parameters
    ----------
    cell_index : int

    stimulus_type : string
        Either 'naturalscene' or 'whitenoise'

    loss : string or object, optional
        The loss function to use. (Default: poisson_loss)
        See http://keras.io/objectives/ for more information

    optimizer : string or object
        The optimizer to use. (Default: sgd)
        See http://keras.io/optimizers/ for more information
    """
    # compile the model
    with notify('Compiling'):
        self.model.compile(loss=loss, optimizer=optimizer)

    # save architecture as a json file
    self.savedir = mksavedir(prefix=str(self))
    with notify('Saving architecture'):
        with open(join(self.savedir, 'architecture.json'), 'w') as f:
            f.write(self.model.to_json())

    # function to write data to a CSV file
    self.save_csv = partial(tocsv, join(self.savedir, 'performance'))
    self.save_csv(['Epoch', 'Iteration', 'Training CC', 'Test CC'])

    # load experimental data
    self.stimulus_type = stimulus_type
    if str(self) == 'lstm':
        numTime = self.stim_shape[0]
        self.holdout = loadexpt(cell_index, self.stimulus_type, 'test',
                                self.stim_shape[1], mean_adapt=mean_adapt)
        self.training = loadexpt(cell_index, self.stimulus_type, 'train',
                                 self.stim_shape[1], mean_adapt=mean_adapt)

        X_train = self.training.X
        y_train = self.training.y
        X_test = self.holdout.X
        y_test = self.holdout.y

        numTrain = (int(X_train.shape[0] / numTime)) * numTime
        numTest = (int(X_test.shape[0] / numTime)) * numTime

        X_train = X_train[:numTrain]
        y_train = y_train[:numTrain]
        X_test = X_test[:numTest]
        y_test = y_test[:numTest]

        X_train = np.reshape(X_train, (int(numTrain / numTime), numTime,
                                       self.stim_shape[1], self.stim_shape[2],
                                       self.stim_shape[3]))
        y_train = np.reshape(y_train, (int(numTrain / numTime), numTime, 1))
        X_test = np.reshape(X_test, (int(numTest / numTime), numTime,
                                     self.stim_shape[1], self.stim_shape[2],
                                     self.stim_shape[3]))
        y_test = np.reshape(y_test, (int(numTest / numTime), numTime, 1))

        self.training = Batch(X_train, y_train)
        self.holdout = Batch(X_test, y_test)
    else:
        self.holdout = loadexpt(cell_index, self.stimulus_type, 'test',
                                self.stim_shape[0], mean_adapt=mean_adapt)
        self.training = loadexpt(cell_index, self.stimulus_type, 'train',
                                 self.stim_shape[0], mean_adapt=mean_adapt)

    # save model information to a markdown file
    if 'architecture' not in self.__dict__:
        self.architecture = 'No architecture information specified'

    metadata = [
        '# ' + str(self), '## ' + strftime('%B %d, %Y'),
        'Started training on: ' + strftime('%I:%M:%S %p'),
        '### Architecture', self.architecture,
        '### Stimulus', 'Experiment 10-07-15', stimulus_type,
        'Mean adaptation: ' + str(mean_adapt),
        'Cell #{}'.format(cell_index),
        '### Optimization', str(loss), str(optimizer)
    ]
    tomarkdown(join(self.savedir, 'README'), metadata)
def compute_loss_one_batch(model):
    if len(model.megabatch) == 0:
        if model.megabatch_anneal == 0:
            for i in range(model.max_megabatch_size):
                if model.curr_idx < len(model.mb):
                    model.megabatch.append(model.mb[model.curr_idx][1])
                    model.curr_idx += 1
        else:
            if model.increment and model.curr_megabatch_size < model.max_megabatch_size:
                model.curr_megabatch_size += 1
                model.increment = False
                print("Increasing megabatch size to {0}".format(model.curr_megabatch_size))

            for i in range(model.curr_megabatch_size):
                if model.curr_idx < len(model.mb):
                    model.megabatch.append(model.mb[model.curr_idx][1])
                    model.curr_idx += 1
                    if model.curr_idx % model.megabatch_anneal == 0:
                        model.increment = True

        megabatch = []
        for n, i in enumerate(model.megabatch):
            arr = [model.data[t] for t in i]
            example_arr = []
            for j in arr:
                example = (BigExample(j[0], model.vocab, model.rev_vocab, model.scramble_rate),
                           BigExample(j[1], model.vocab, model.rev_vocab, model.scramble_rate))
                if model.args.debug:
                    print("Logging Pairing: {0} {1}".format(j[0].sentence, j[1].sentence))
                example_arr.append(example)
            megabatch.append(example_arr)

        model.megabatch = megabatch

        if len(model.megabatch) == 0:
            return None

        sents1_list = []
        sents2_list = []
        sents1_lengths_list = []
        sents2_lengths_list = []

        for j in model.megabatch:
            sents1 = [i[0] for i in j]
            sents2 = [i[1] for i in j]

            sents_1_torch, lengths_1_torch = model.torchify_batch(sents1)
            if model.gpu:
                sents_1_torch = sents_1_torch.cuda()
                lengths_1_torch = lengths_1_torch.cuda()

            sents_2_torch, lengths_2_torch = model.torchify_batch(sents2)
            if model.gpu:
                sents_2_torch = sents_2_torch.cuda()
                lengths_2_torch = lengths_2_torch.cuda()

            sents1_list.append(sents_1_torch)
            sents2_list.append(sents_2_torch)
            sents1_lengths_list.append(lengths_1_torch)
            sents2_lengths_list.append(lengths_2_torch)

        p1_sents_list, p1_lengths_list, p2_sents_list, p2_lengths_list = get_pairs_batch(
            model, sents1_list, sents1_lengths_list, sents2_list, sents2_lengths_list)

        model.megabatch = []

        for i in range(len(p1_sents_list)):
            new_batch = Batch()
            new_batch.g1 = sents1_list[i]
            new_batch.g1_l = sents1_lengths_list[i]
            new_batch.g2 = sents2_list[i]
            new_batch.g2_l = sents2_lengths_list[i]
            new_batch.p1 = p1_sents_list[i]
            new_batch.p1_l = p1_lengths_list[i]
            new_batch.p2 = p2_sents_list[i]
            new_batch.p2_l = p2_lengths_list[i]
            model.megabatch.append(new_batch)

    curr_batch = model.megabatch.pop(0)
    g1, g2, p1, p2 = model.forward(curr_batch)

    return model.loss_function(g1, g2, p1, p2)
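# Hypothetical outer training loop: compute_loss_one_batch refills model.megabatch lazily
# and returns None once the data is exhausted, so it can be called once per optimizer step.
# `optimizer` is assumed to have been built over model.parameters() elsewhere.
while True:
    optimizer.zero_grad()
    loss = compute_loss_one_batch(model)
    if loss is None:
        break
    loss.backward()
    optimizer.step()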