def get_batch_train(source, label, i, seq_len, evaluation=False):
    """Get the first 10 tracks in a session."""
    from tensorflow.contrib.keras import preprocessing

    # add a mask to ignore skipped tracks and padding tracks
    skip_mask = ((label >= 0) * (label < 2)).long()
    source = source * skip_mask
    # reshape
    seq_len = min(seq_len, source.size(0) - 1 - i)
    data = source[i:int(i + seq_len / 2)]
    # move 0 to the left
    data = data.t()
    sessions_list = []
    # pack_len = []
    for session in data:  # loop over the batch
        session_remove0 = session[session != 0]
        sessions_list.append(session_remove0)
        # # length filter
        # if len(session_remove0) >= 5:
        #     sessions_list.append(session_remove0)
        # pack_len.append(len(session_remove0) - 1)  # length 10 to 9 for next-word prediction
    # re-pad the sessions; maxlen is taken from the last session of the loop above
    data = preprocessing.sequence.pad_sequences(
        sessions_list, len(session), padding='post', truncating='post')
    data = torch.Tensor(data).long().t()
    if evaluation:
        data.requires_grad = False
    target = torch.cat([data[1:], torch.zeros(data.shape[1]).long().unsqueeze(0)])
    return data, target
def train(epoch, data_source, label):
    model.train()
    total_loss = 0
    start_time = time.time()
    ntokens = len(track_dic)
    for batch, i in enumerate(range(0, data_source.size(0) - 1, seq_len)):
        data, targets = get_batch_train(data_source, label, i, seq_len)
        data = data.t()
        # if data.shape[0]:  # to prevent the length filter in get_batch_train removing all batches
        optimizer.zero_grad()
        model.hidden = model.init_hidden()  # important: the hidden state must be re-initialised for every batch
        output = model(data)
        output = output.transpose(0, 1)
        targets = targets.contiguous().view(-1)
        final_decoded = output.contiguous().view(-1, nout)

        # remove padding rows
        mask_targets = targets != 0
        targets = targets[targets != 0]
        loc = torch.ByteTensor(mask_targets)  # <IndexBackward> <ViewBackward>
        final_decoded = final_decoded[loc]
        # mask_decoded = mask_targets.unsqueeze(1).repeat(1, final_decoded.shape[1])
        # final_decoded = final_decoded * mask_decoded.float()

        if final_decoded.shape[0]:
            loss = criterion(final_decoded, track_weight[targets])
            loss.backward(retain_graph=True)
            optimizer.step()
            total_loss += loss.data

            if batch % log_interval == 0 and batch > 0:
                cur_loss = total_loss.item() / log_interval
                elapsed = time.time() - start_time
                print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                      'raw_loss {:5.2f} | ppl {:8.2f}'.format(
                          epoch, batch, len(data_source) // seq_len, lr,
                          elapsed * 1000 / log_interval, cur_loss, math.exp(cur_loss)))
                total_loss = 0
                start_time = time.time()

    output = None
    targets = None
    final_decoded = None
    return None
def rank(data_source, label):
    model.eval()
    with torch.no_grad():
        ndcg_acc = 0
        ndcg_count = 0
        ntokens = len(track_dic)
        batch_size = data_source.size(1)
        for i in range(0, data_source.size(0) - 1, seq_len):
            data, targets = get_batch_past(data_source, label, i, seq_len, evaluation=True)
            data = data.t()
            targets = targets.t()
            tracks_future, targets_future = get_batch_future(data_source, label, i, seq_len, evaluation=True)
            tracks_future = tracks_future.t()
            targets_future = targets_future.t()
            for j in range(batch_size):
                track_f = tracks_future[j]
                # remove padding elements
                track_f = track_f[track_f != 0]
                score = []
                for ii in tracks_future[j]:
                    if ii != 0:
                        score.append(track_features['us_popularity_estimate'][int(ii)])
                # build a data frame without padding elements
                df_future = pd.DataFrame({'track': np.array(track_f),
                                          'score': np.array(score),
                                          'skip_info': np.array(targets_future[j][0:len(track_f)])})
                # remove padding elements
                df_future = df_future.loc[df_future['track'] != 0]
                # sort by popularity
                df_future = df_future.sort_values(by='score', ascending=False)  # 0.8090114383851303
                # sort to the worst case
                # df_future = df_future.sort_values(by='skip_info', ascending=False)  # 0.6693316113866979
                # NDCG
                actual = dcg_score(df_future['skip_info'])
                best = dcg_score(df_future['skip_info'].sort_values(ascending=True))
                if best:  # best might be 0 while skip_info is 3, 3, 3, ...
                    ndcg = actual / best
                    ndcg_acc = ndcg_acc + ndcg
                else:  # avoid nan
                    ndcg_acc = ndcg_acc + 1
                ndcg_count = ndcg_count + 1
        ndcg_avg = ndcg_acc / ndcg_count
    return ndcg_avg
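# dcg_score is called by the rank() functions in this file but is not defined here. A minimal
# sketch follows, assuming the gain of a track is 3 - skip_info (so unskipped tracks score
# highest and an all-skipped session with skip_info = 3, 3, 3, ... gives a DCG of 0, matching
# the "best might be 0" comment above) and the standard log2(rank + 1) discount; the real
# helper may use a different gain or discount.
import numpy as np

def dcg_score(skip_info):
    relevance = 3 - np.asarray(skip_info, dtype=float)      # assumed gain: lower skip_info = more relevant
    discounts = np.log2(np.arange(2, len(relevance) + 2))   # log2 of positions 2 .. n+1
    return float(np.sum(relevance / discounts))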
def evaluate(data_source):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    with torch.no_grad():
        total_loss = 0
        ntokens = len(corpus.dictionary)
        batch_size = data_source.size(1)
        hidden = model.init_hidden(batch_size)
        eff_history_mode = (args.seq_len > args.horizon and not args.repack)

        if eff_history_mode:
            validseqlen = args.seq_len - args.horizon
            seq_len = args.seq_len
        else:
            validseqlen = args.horizon
            seq_len = args.horizon

        processed_data_size = 0
        for i in range(0, data_source.size(0) - 1, validseqlen):
            eff_history = args.horizon if eff_history_mode else 0
            if i + eff_history >= data_source.size(0) - 1:
                continue
            data, targets = get_batch(data_source, i, seq_len, evaluation=True)

            if args.repack:
                hidden = repackage_hidden(hidden)
            else:
                hidden = model.init_hidden(data.size(1))

            data = data.t()
            net = nn.DataParallel(model, device_ids=devices) if batch_size > 10 else model
            (_, output, decoded), hidden, _ = net(data, hidden)
            decoded = decoded.transpose(0, 1)

            targets = targets[eff_history:].contiguous().view(-1)
            final_decoded = decoded[eff_history:].contiguous().view(-1, ntokens)

            loss = criterion(final_decoded, targets)
            loss = loss.data

            total_loss += (data.size(1) - eff_history) * loss
            processed_data_size += data.size(1) - eff_history

        output = None
        decoded = None
        targets = None
        final_output = None
        final_decoded = None

        return total_loss.item() / processed_data_size
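# get_batch is used by several of the train()/evaluate() variants in this file but is not
# defined here, and its call signature differs between snippets (some callers pass only
# (source, i)). A minimal sketch of the usual language-model batcher, assuming `source` is a
# (time x batch) tensor and that targets are returned unflattened, as the seq_len/horizon
# variants above slice them along the time dimension; the evaluation flag is accepted only
# for signature compatibility.
def get_batch(source, i, seq_len=None, evaluation=False):
    seq_len = seq_len if seq_len is not None else args.bptt  # assumed fallback for 2-argument callers
    seq_len = min(seq_len, source.size(0) - 1 - i)
    data = source[i:i + seq_len]                # inputs: positions i .. i+seq_len-1
    target = source[i + 1:i + 1 + seq_len]      # targets: the next token at each position
    return data, target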
def evaluate(data_source, verbose=False):
    # Turn on evaluation mode which disables dropout.
    if verbose:
        from collections import Counter
        counter = Counter()
        train_file = f"{args.data}/train.txt"
        lines = [
            counter.update(line.strip().split())
            for line in open(train_file, 'r').readlines()
        ]
        # fh_out = open(args.verbose_test_file, "w")
        verbose_criterion = nn.CrossEntropyLoss(reduce=False)

    model.eval()
    total_loss = 0.
    total_freq_loss = 0.
    total_freq_count = 0
    total_infreq_loss = 0.
    total_infreq_count = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(eval_batch_size)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, args.bptt):
            data, targets = get_batch(data_source, i)
            output, hidden = model(data, hidden)
            output_flat = output.view(-1, ntokens)
            total_loss += len(data) * criterion(output_flat, targets).item()
            hidden = repackage_hidden(hidden)
            if verbose:
                verbose_loss = verbose_criterion(output_flat, targets)
                verbose_loss = verbose_loss.view(data.size(0), -1)
                print_contents, [freq_loss, freq_count], [infreq_loss, infreq_count] = verbose_test(
                    corpus.dictionary.idx2word, counter, data.t(), verbose_loss.t())
                total_freq_loss += freq_loss
                total_freq_count += freq_count
                total_infreq_loss += infreq_loss
                total_infreq_count += infreq_count
                # for print_line in print_contents:
                #     fh_out.write(f"{print_line}\n")
    if verbose:
        # fh_out.close()
        return math.exp(total_freq_loss / total_freq_count), math.exp(
            total_infreq_loss / total_infreq_count)
    return total_loss / len(data_source)
def evaluate(data_source):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    batch_size = data_source.size(1)
    hidden = model.init_hidden(batch_size)
    eff_history_mode = (args.seq_len > args.horizon and not args.repack)

    if eff_history_mode:
        validseqlen = args.seq_len - args.horizon
        seq_len = args.seq_len
    else:
        validseqlen = args.horizon
        seq_len = args.horizon

    processed_data_size = 0
    for i in range(0, data_source.size(0) - 1, validseqlen):
        eff_history = args.horizon if eff_history_mode else 0
        if i + eff_history >= data_source.size(0) - 1:
            continue
        data, targets = get_batch(data_source, i, seq_len, evaluation=True)

        if args.repack:
            hidden = repackage_hidden(hidden)
        else:
            hidden = model.init_hidden(data.size(1))

        data = data.t()
        net = nn.DataParallel(model) if batch_size > 10 else model
        (_, _, output), hidden, _ = net(data, hidden, decode=False)
        output = output.transpose(0, 1)

        targets = targets[eff_history:].contiguous().view(-1)
        final_output = output[eff_history:].contiguous().view(-1, output.size(2))

        loss = criterion(model.decoder.weight, model.decoder.bias, final_output, targets)
        # loss = loss.data

        total_loss += (data.size(1) - eff_history) * float(loss)
        processed_data_size += data.size(1) - eff_history

        del loss, data, targets
        gc.collect()
        torch.cuda.empty_cache()

    data = None
    output = None
    targets = None
    final_output = None

    return total_loss / processed_data_size
def evaluate(data_source, label_, session_feature):
    model.eval()
    with torch.no_grad():
        total_loss = 0
        ntokens = len(track_dic)
        batch_size = data_source.size(1)
        processed_data_size = 0
        for i in range(0, data_source.size(0) - 1, seq_len):
            data, targets, label, sf = get_batch_train(data_source, label_, session_feature, i, seq_len)
            data = data.t()
            sf = sf.t()
            model.hidden = model.init_hidden()
            output = model(data, sf)
            output = output.transpose(0, 1)
            targets = targets.contiguous().view(-1)
            label = label.contiguous().view(-1)
            final_decoded = output.contiguous().view(-1, nout + 1)

            # remove padding rows
            # mask_targets = targets != 0
            # targets = targets[targets != 0]
            # label = label[label != -1]
            # loc = torch.ByteTensor(mask_targets)  # <IndexBackward> <ViewBackward>
            # final_decoded = final_decoded[loc]

            if final_decoded.shape[0]:
                final_targets = torch.cat([track_weight[targets], label.unsqueeze(1).float()], dim=1)
                loss = criterion(final_decoded, final_targets)
                loss = loss.data
                total_loss += data.size(1) * loss
                processed_data_size += data.size(1)

        output = None
        targets = None
        final_decoded = None
    return total_loss.item() / processed_data_size
def evaluate(loader):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(eval_batch_size)
    with torch.no_grad():
        for i, batch in enumerate(loader):
            (data, targets) = batch
            data = data.t()
            targets = targets.t()
            output, hidden = model(data, hidden)
            output_flat = output.view(-1, ntokens)
            total_loss += len(data) * criterion(output_flat, targets.flatten()).item()
            hidden = repackage_hidden(hidden)
    return total_loss / (len(loader) - 1)
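# repackage_hidden is referenced by several functions in this file but not defined here. A
# minimal sketch of the usual word-language-model helper, assuming the hidden state is either
# a Tensor or a (possibly nested) tuple of Tensors, as for an LSTM: it detaches the state from
# its history so backpropagation does not reach back to the start of the dataset.
import torch

def repackage_hidden(h):
    """Wrap hidden states in new Tensors, detached from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    return tuple(repackage_hidden(v) for v in h)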
def evaluate(data_source, label):
    model.eval()
    with torch.no_grad():
        total_loss = 0
        ntokens = len(track_dic)
        batch_size = data_source.size(1)
        processed_data_size = 0
        for i in range(0, data_source.size(0) - 1, seq_len):
            data, targets = get_batch_train(data_source, label, i, seq_len, evaluation=True)
            data = data.t()
            model.hidden = model.init_hidden(data.shape[0])  # adapt to the batch size
            output = model(data)
            output = output.transpose(0, 1)
            targets = targets.contiguous().view(-1)
            final_decoded = output.contiguous().view(-1, nout)

            # remove padding rows
            mask_targets = targets != 0
            targets = targets[targets != 0]
            loc = torch.ByteTensor(mask_targets)  # <IndexBackward> <ViewBackward>
            final_decoded = final_decoded[loc]

            if final_decoded.shape[0]:
                loss = criterion(final_decoded, track_weight[targets])
                loss = loss.data
                total_loss += data.size(1) * loss
                processed_data_size += data.size(1)

        output = None
        targets = None
        final_decoded = None
    return total_loss.item() / processed_data_size
def evaluate(data_source, label_, session_feature):
    model.eval()
    with torch.no_grad():
        total_loss = 0
        ntokens = len(track_dic)
        batch_size = data_source.size(1)
        processed_data_size = 0
        for i in range(0, data_source.size(0) - 1, seq_len):
            data, targets, label, sf = get_batch_train(data_source, label_, session_feature, i, seq_len)
            data = data.t()
            sf = sf.t()
            model.hidden = model.init_hidden(data.shape[0])
            output = model(data, sf)
            output = output.transpose(0, 1)
            targets = targets.contiguous().view(-1)
            label = label.contiguous().view(-1)
            final_decoded = output.contiguous().view(-1, ntokens)

            # remove skipped rows
            mask_targets = label < 2
            loc = torch.ByteTensor(mask_targets)  # <IndexBackward> <ViewBackward>
            targets = targets[loc]
            final_decoded = final_decoded[loc]

            if final_decoded.shape[0]:
                loss = criterion(final_decoded, targets)
                loss = loss.data
                total_loss += data.size(1) * loss
                processed_data_size += data.size(1)

        output = None
        targets = None
        final_decoded = None
    return total_loss.item() / processed_data_size
def rank(data_source, label):
    model.eval()
    with torch.no_grad():
        ndcg_acc = 0
        ndcg_count = 0
        ndcg_acc_1 = 0
        ndcg_count_1 = 0
        ndcg_acc_2 = 0
        ndcg_count_2 = 0
        total_loss = 0
        ntokens = len(track_dic)
        batch_size = data_source.size(1)
        for i in range(0, data_source.size(0) - 1, seq_len):
            data, targets = get_batch_past(data_source, label, i, seq_len, evaluation=True)
            data = data.t()
            targets = targets.t()
            tracks_future, targets_future = get_batch_future(data_source, label, i, seq_len, evaluation=True)
            tracks_future = tracks_future.t()
            targets_future = targets_future.t()

            # music_rnn; music_lstm
            model.hidden = model.init_hidden()
            rank_vec = model(data)[:, -1, :]  # batch, ntokens  [12, 1, 50704]

            # TODO: avoid this per-session for loop
            for j in range(batch_size):
                track_f = tracks_future[j]
                # remove padding elements
                # track_f = track_f[track_f != 0]
                cos = nn.CosineSimilarity(dim=1, eps=1e-6)
                score = cos(rank_vec[j].unsqueeze(0), track_weight[track_f])
                # build a data frame, then drop the padding elements
                df_future = pd.DataFrame({
                    'track': np.array(track_f),
                    'score': np.array(score),
                    'skip_info': np.array(targets_future[j][0:len(track_f)])
                })
                df_future = df_future.loc[df_future['track'] != 0]
                # sort tracks_future according to score
                df_future = df_future.sort_values(by='score', ascending=False)  # 0.8154440681444343  # 0.8227163023038474
                # df_future = df_future.sample(frac=1)  # 0.8115378563756852  # 0.7787248338261271

                # NDCG over all sessions
                actual = dcg_score(df_future['skip_info'])
                best = dcg_score(df_future['skip_info'].sort_values(ascending=True))
                if best:  # best might be 0 while skip_info is 3, 3, 3, ...
                    ndcg = actual / best
                    ndcg_acc = ndcg_acc + ndcg
                else:  # avoid nan
                    ndcg_acc = ndcg_acc + 1
                ndcg_count = ndcg_count + 1

                if (targets[j] == 0).sum() < x:
                    track_f = tracks_future[j]
                    cos = nn.CosineSimilarity(dim=1, eps=1e-6)
                    score_1 = cos(rank_vec[j].unsqueeze(0), track_weight[track_f])
                    df_future_1 = pd.DataFrame({
                        'track': np.array(track_f),
                        'score': np.array(score_1),
                        'skip_info': np.array(targets_future[j][0:len(track_f)])
                    })
                    df_future_1 = df_future_1.loc[df_future_1['track'] != 0]
                    df_future_1 = df_future_1.sort_values(by='score', ascending=False)  # 0.8154440681444343  # 0.8227163023038474
                    # NDCG
                    actual_1 = dcg_score(df_future_1['skip_info'])
                    best_1 = dcg_score(df_future_1['skip_info'].sort_values(ascending=True))
                    if best_1:  # best might be 0 while skip_info is 3, 3, 3, ...
                        ndcg_1 = actual_1 / best_1
                        ndcg_acc_1 = ndcg_acc_1 + ndcg_1
                    else:  # avoid nan
                        ndcg_acc_1 = ndcg_acc_1 + 1
                    ndcg_count_1 = ndcg_count_1 + 1
                else:
                    track_f = tracks_future[j]
                    cos = nn.CosineSimilarity(dim=1, eps=1e-6)
                    score_2 = cos(rank_vec[j].unsqueeze(0), track_weight[track_f])
                    df_future_2 = pd.DataFrame({
                        'track': np.array(track_f),
                        'score': np.array(score_2),
                        'skip_info': np.array(targets_future[j][0:len(track_f)])
                    })
                    df_future_2 = df_future_2.loc[df_future_2['track'] != 0]
                    df_future_2 = df_future_2.sort_values(by='score', ascending=False)  # 0.8154440681444343  # 0.8227163023038474
                    # NDCG
                    actual_2 = dcg_score(df_future_2['skip_info'])
                    best_2 = dcg_score(df_future_2['skip_info'].sort_values(ascending=True))
                    if best_2:  # best might be 0 while skip_info is 3, 3, 3, ...
                        ndcg_2 = actual_2 / best_2
                        ndcg_acc_2 = ndcg_acc_2 + ndcg_2
                    else:  # avoid nan
                        ndcg_acc_2 = ndcg_acc_2 + 1
                    ndcg_count_2 = ndcg_count_2 + 1

        ndcg_avg = ndcg_acc / ndcg_count
        ndcg_avg_1 = ndcg_acc_1 / ndcg_count_1
        ndcg_avg_2 = ndcg_acc_2 / ndcg_count_2
    return ndcg_avg, ndcg_avg_1, ndcg_avg_2
def evaluate(data_source, label):
    # eval() notifies all layers that we are in evaluation mode, so batchnorm and dropout
    # layers behave accordingly instead of as in training mode.
    model.eval()
    # no_grad() deactivates the autograd engine, which reduces memory usage and speeds up
    # computation; backprop is not possible (and not wanted) in an eval script.
    with torch.no_grad():
        total_loss = 0
        ntokens = len(track_dic)
        batch_size = data_source.size(1)
        hidden = model.init_hidden(batch_size)
        eff_history_mode = (args.seq_len > args.horizon and not args.repack)

        if eff_history_mode:
            validseqlen = args.seq_len - args.horizon
            seq_len = args.seq_len
        else:
            validseqlen = args.horizon
            seq_len = args.horizon

        processed_data_size = 0
        for i in range(0, data_source.size(0) - 1, args.seq_len):
            eff_history = args.horizon if eff_history_mode else 0
            if i + eff_history >= data_source.size(0) - 1:
                continue
            data, targets = get_batch_train(data_source, label, i, seq_len, evaluation=True)
            data = data.t()

            if args.repack:
                # repacking does not work in the new model, since the batch size differs between batches
                hidden = repackage_hidden(hidden)
            else:
                hidden = model.init_hidden(data.size(0))

            net = nn.DataParallel(model) if batch_size > 10 else model
            (_, output, decoded), hidden, _ = net(data, hidden)  # output; decoded => vector
            decoded = decoded.transpose(0, 1)

            targets = targets[eff_history:].contiguous().view(-1)
            final_decoded = decoded[eff_history:].contiguous().view(-1, ntokens)

            # remove padding rows
            mask_targets = targets != 0
            targets = targets[targets != 0]
            loc = torch.ByteTensor(mask_targets)  # <IndexBackward> <ViewBackward>
            final_decoded = final_decoded[loc]

            if final_decoded.shape[0]:
                loss = criterion(final_decoded, targets)
                loss = loss.data
                total_loss += (data.size(1) - eff_history) * loss
                processed_data_size += data.size(1) - eff_history

        output = None
        decoded = None
        targets = None
        final_output = None
        final_decoded = None

        return total_loss.item() / processed_data_size
def train(epoch, data_source, label):
    model.train()
    total_loss = 0
    total_aux_losses = 0
    start_time = time.time()
    ntokens = len(track_dic)
    hidden = model.init_hidden(args.batch_size)
    eff_history_mode = (args.seq_len > 0 or not args.repack)

    if eff_history_mode:
        validseqlen = args.seq_len - args.horizon
        seq_len = args.seq_len
    else:
        validseqlen = args.horizon
        seq_len = args.horizon

    for batch, i in enumerate(range(0, data_source.size(0) - 1, args.seq_len)):
        # When not using repackaging mode, we DISCARD the first args.horizon outputs in backprop
        # (they are the "effective history").
        eff_history = args.horizon if eff_history_mode else 0
        if i + eff_history >= data_source.size(0) - 1:
            continue
        data, targets = get_batch_train(data_source, label, i, seq_len)
        data = data.t()
        # if data.shape[0]:  # to prevent the length filter in get_batch_train removing all batches

        if args.repack:
            # repacking does not work in the new model, since the batch size differs between batches
            hidden = repackage_hidden(hidden)
        else:
            hidden = model.init_hidden(data.size(0))

        optimizer.zero_grad()
        net = nn.DataParallel(model) if data.size(0) > 10 else model
        (raw_output, output, decoded), hidden, all_decoded = net(data, hidden)  # slow
        # raw_output.shape = [12, 20, 29]
        # output.shape = [12, 20, 29]
        # decoded.shape = [12, 20, 50704]
        # hidden[0].shape = hidden[1].shape = [12, 1029, 1]
        # all_decoded.shape = [12, 1, 20, 50704]

        decoded = decoded.transpose(0, 1)                                      # decoded.shape = [20, 12, 50704]
        targets = targets[eff_history:].contiguous().view(-1)                  # targets.shape = torch.Size([180])
        final_decoded = decoded[eff_history:].contiguous().view(-1, ntokens)   # final_decoded.shape = torch.Size([180, 50704])

        # remove padding rows
        mask_targets = targets != 0
        targets = targets[targets != 0]
        loc = torch.ByteTensor(mask_targets)  # <IndexBackward> <ViewBackward>
        final_decoded = final_decoded[loc]

        if final_decoded.shape[0]:
            # Loss 1: CE loss
            raw_loss = criterion(final_decoded, targets)
            # qiqi's check
            # if raw_loss > 20:
            #     print(raw_loss, batch, i, final_decoded.shape, targets.shape)

            # Loss 2: Aux loss
            aux_losses = 0
            if args.aux > 0:
                all_decoded = all_decoded[:, :, eff_history:].permute(1, 2, 0, 3).contiguous()  # (N, M, L, C) --> (M, L, N, C)
                aux_size = all_decoded.size(0)
                all_decoded = all_decoded.view(aux_size, -1, ntokens)
                # remove padding rows
                all_decoded = all_decoded.transpose(0, 1)[loc].transpose(0, 1)
                aux_losses = args.aux * sum(
                    [criterion(all_decoded[i], targets) for i in range(aux_size)])

            # Combine losses
            loss = raw_loss + aux_losses  # + alpha_loss + beta_loss
            loss.backward(retain_graph=True)  # slow
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
            optimizer.step()

            total_loss += raw_loss.data
            if args.aux:
                total_aux_losses += aux_losses.data

            if batch % args.log_interval == 0 and batch > 0:
                cur_loss = total_loss.item() / args.log_interval
                cur_aux_loss = total_aux_losses.item() / args.log_interval if args.aux else 0
                elapsed = time.time() - start_time
                print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                      'raw_loss {:5.2f} | aux_loss {:5.2f} | ppl {:8.2f}'.format(
                          epoch, batch, len(data_source) // validseqlen, lr,
                          elapsed * 1000 / args.log_interval,
                          cur_loss, cur_aux_loss, math.exp(cur_loss)))
                total_loss = 0
                total_aux_losses = 0
                start_time = time.time()

    raw_output = None
    output = None
    decoded = None
    targets = None
    final_output = None
    final_decoded = None
    all_decoded = None
    all_outputs = None
    final_raw_output = None
    return None
def train(epoch):
    model.train()
    total_loss = 0
    total_aux_losses = 0
    start_time = time.time()
    hidden = model.init_hidden(args.batch_size)
    eff_history_mode = (args.seq_len > args.horizon and not args.repack)

    if eff_history_mode:
        validseqlen = args.seq_len - args.horizon
        seq_len = args.seq_len
    else:
        validseqlen = args.horizon
        seq_len = args.horizon

    for batch, i in enumerate(range(0, train_data.size(0) - 1, validseqlen)):
        # When not using repackaging mode, we DISCARD the first args.horizon outputs in backprop
        # (they are the "effective history").
        eff_history = args.horizon if eff_history_mode else 0
        if i + eff_history >= train_data.size(0) - 1:
            continue
        data, targets = get_batch(train_data, i, seq_len)

        if args.repack:
            hidden = repackage_hidden(hidden)
        else:
            hidden = model.init_hidden(args.batch_size)

        optimizer.zero_grad()
        data = data.t()
        net = nn.DataParallel(model) if data.size(0) > 10 else model
        (raw_output, _, output), hidden, all_outputs = net(data, hidden, decode=False)
        raw_output = raw_output.transpose(0, 1)
        output = output.transpose(0, 1)

        targets = targets[eff_history:].contiguous().view(-1)
        final_output = output[eff_history:].contiguous().view(-1, output.size(2))
        dec_weight, dec_bias = model.decoder.weight, model.decoder.bias

        # Loss 1: CE loss
        raw_loss = criterion(dec_weight, dec_bias, final_output, targets)

        # Loss 2: Aux loss
        aux_losses = 0
        if args.aux > 0:
            all_outputs = all_outputs[:, :, eff_history:].permute(1, 2, 0, 3).contiguous()
            aux_size = all_outputs.size(0)  # the number of auxiliary losses
            all_outputs = all_outputs.view(aux_size, -1, all_outputs.size(3))
            aux_losses = args.aux * sum(
                [criterion(dec_weight, dec_bias, all_outputs[i], targets) for i in range(aux_size)])

        # Combine losses
        loss = raw_loss + aux_losses
        loss.backward()
        torch.nn.utils.clip_grad_norm_(params, args.clip)
        optimizer.step()

        total_loss += raw_loss.data
        if args.aux:
            total_aux_losses += aux_losses.data

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss.item() / args.log_interval
            cur_aux_loss = total_aux_losses.item() / args.log_interval if args.aux else 0
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | ms/batch {:5.2f} | '
                  'raw_loss {:5.2f} | aux_loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, batch, len(train_data) // validseqlen, lr,
                      elapsed * 1000 / args.log_interval,
                      cur_loss, cur_aux_loss, math.exp(cur_loss)))
            total_loss = 0
            total_aux_losses = 0
            start_time = time.time()
            sys.stdout.flush()

    data = None
    raw_output = None
    output = None
    targets = None
    final_output = None
    all_outputs = None
def train(cumulative_steps=None, cumulative_time=None):
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    if args.distributed:
        hidden = model.module.init_hidden(args.batch_size)
    else:
        hidden = model.init_hidden(args.batch_size)
    done = False
    for i, batch in enumerate(train_loader):
        total_duration_tracker_start = time.time()
        # Batch size should be the second dimension, not the first.
        (data, targets) = batch
        data = data.t()
        targets = targets.t()
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to the start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        # Shapes of output and targets need to align.
        loss = criterion(output.view(-1, ntokens), targets.flatten())
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()

        total_loss += loss.item()

        if i % args.log_interval == 0 and i > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, i, len(train_loader), lr,
                      elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
        if cumulative_steps is not None:
            cumulative_steps += 1
            if (args.throughput_estimation_interval is not None and
                    cumulative_steps % args.throughput_estimation_interval == 0):
                print('[THROUGHPUT_ESTIMATION]\t%s\t%d' % (time.time(), cumulative_steps))
            if args.steps is not None and cumulative_steps >= args.steps:
                done = True
                break
        if args.max_duration is not None:
            cumulative_time += time.time() - total_duration_tracker_start
            total_duration_tracker_start = time.time()
            if cumulative_time >= args.max_duration:
                done = True
                break
    return (cumulative_steps, cumulative_time, done)
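# A minimal sketch of an outer epoch loop that could drive this train() together with the
# loader-based evaluate() defined earlier. The names `args.epochs` and `val_loader`, and the
# choice of printing validation perplexity, are assumptions for illustration and do not come
# from this file.
cumulative_steps, cumulative_time = 0, 0.0
for epoch in range(1, args.epochs + 1):
    cumulative_steps, cumulative_time, done = train(cumulative_steps, cumulative_time)
    val_loss = evaluate(val_loader)
    print('| end of epoch {:3d} | valid loss {:5.2f} | valid ppl {:8.2f}'.format(
        epoch, val_loss, math.exp(val_loss)))
    if done:  # step or wall-clock budget exhausted inside train()
        break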
def rank(data_source, label):
    model.eval()
    with torch.no_grad():
        ndcg_acc = 0
        ndcg_count = 0
        ndcg_acc_1 = 0
        ndcg_count_1 = 0
        ndcg_acc_2 = 0
        ndcg_count_2 = 0
        total_loss = 0
        ntokens = len(track_dic)
        batch_size = data_source.size(1)
        for i in range(0, data_source.size(0) - 1, seq_len):
            data, data_skiped, targets = get_batch_past(data_source, label, i, seq_len, evaluation=True)
            data = data.t()
            data_skiped = data_skiped.t()
            targets = targets.t()
            tracks_future, targets_future = get_batch_future(data_source, label, i, seq_len, evaluation=True)
            tracks_future = tracks_future.t()
            targets_future = targets_future.t()

            rank_vec = model(data)[:, -1, :]
            rank_vec_skipped = model(data_skiped)[:, -1, :]

            # TODO: avoid this per-session for loop
            for j in range(batch_size):
                if (targets[j] == 0).sum() <= 5:
                    track_f = tracks_future[j]
                    # remove padding elements
                    # track_f = track_f[track_f != 0]
                    score = rank_vec[j][track_f]
                    # build a data frame, then drop the padding elements
                    df_future = pd.DataFrame({
                        'track': np.array(track_f),
                        'score': np.array(score),
                        'skip_info': np.array(targets_future[j][0:len(track_f)])
                    })
                    df_future = df_future.loc[df_future['track'] != 0]
                    # sort tracks_future according to score
                    df_future = df_future.sort_values(by='score', ascending=False)  # 0.8154440681444343  # 0.8227163023038474
                    # df_future = df_future.sample(frac=1)  # 0.8115378563756852  # 0.7787248338261271
                    # NDCG
                    actual = dcg_score(df_future['skip_info'])
                    best = dcg_score(df_future['skip_info'].sort_values(ascending=True))
                    if best:  # best might be 0 while skip_info is 3, 3, 3, ...
                        ndcg = actual / best
                        ndcg_acc = ndcg_acc + ndcg
                    else:  # avoid nan
                        ndcg_acc = ndcg_acc + 1
                    ndcg_count = ndcg_count + 1
                else:
                    track_f = tracks_future[j]
                    # remove padding elements
                    # track_f = track_f[track_f != 0]
                    score = rank_vec_skipped[j][track_f]
                    # build a data frame, then drop the padding elements
                    df_future = pd.DataFrame({
                        'track': np.array(track_f),
                        'score': np.array(score),
                        'skip_info': np.array(targets_future[j][0:len(track_f)])
                    })
                    df_future = df_future.loc[df_future['track'] != 0]
                    # sort tracks_future according to score
                    df_future = df_future.sort_values(by='score', ascending=True)  # 0.8154440681444343  # 0.8227163023038474
                    # df_future = df_future.sample(frac=1)  # 0.8115378563756852  # 0.7787248338261271
                    # NDCG
                    actual = dcg_score(df_future['skip_info'])
                    best = dcg_score(df_future['skip_info'].sort_values(ascending=True))
                    if best:  # best might be 0 while skip_info is 3, 3, 3, ...
                        ndcg = actual / best
                        ndcg_acc = ndcg_acc + ndcg
                    else:  # avoid nan
                        ndcg_acc = ndcg_acc + 1
                    ndcg_count = ndcg_count + 1

        ndcg_avg = ndcg_acc / ndcg_count
    return ndcg_avg
def train(epoch):
    model.train()
    total_loss = 0
    total_aux_losses = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    eff_history_mode = (args.seq_len > 0 or not args.repack)

    if eff_history_mode:
        validseqlen = args.seq_len - args.horizon
        seq_len = args.seq_len
    else:
        validseqlen = args.horizon
        seq_len = args.horizon

    for batch, i in enumerate(range(0, train_data.size(0) - 1, validseqlen)):
        # When not using repackaging mode, we DISCARD the first args.horizon outputs in backprop
        # (they are the "effective history").
        eff_history = args.horizon if eff_history_mode else 0
        if i + eff_history >= train_data.size(0) - 1:
            continue
        data, targets = get_batch(train_data, i, seq_len)

        if args.repack:
            hidden = repackage_hidden(hidden)
        else:
            hidden = model.init_hidden(args.batch_size)

        optimizer.zero_grad()
        data = data.t()
        net = nn.DataParallel(model) if data.size(0) > 10 else model
        (raw_output, output, decoded), hidden, all_decoded = net(data, hidden)
        decoded = decoded.transpose(0, 1)

        targets = targets[eff_history:].contiguous().view(-1)
        final_decoded = decoded[eff_history:].contiguous().view(-1, ntokens)

        # Loss 1: CE loss
        raw_loss = criterion(final_decoded, targets)

        # Loss 2: Aux loss
        aux_losses = 0
        if args.aux > 0:
            all_decoded = all_decoded[:, :, eff_history:].permute(1, 2, 0, 3).contiguous()  # (N, M, L, C) --> (M, L, N, C)
            aux_size = all_decoded.size(0)
            all_decoded = all_decoded.view(aux_size, -1, ntokens)
            aux_losses = args.aux * sum(
                [criterion(all_decoded[i], targets) for i in range(aux_size)])

        # Loss 3: AR & TAR
        alpha_loss = 0
        beta_loss = 0
        if args.alpha > 0:
            output = output.transpose(0, 1)
            final_output = output[eff_history:]
            alpha_loss = args.alpha * final_output.pow(2).mean()
        if args.beta > 0:
            raw_output = raw_output.transpose(0, 1)
            final_raw_output = raw_output[eff_history:]
            beta_loss = args.beta * (final_raw_output[1:] - final_raw_output[:-1]).pow(2).mean()

        # Combine losses
        loss = raw_loss + aux_losses + alpha_loss + beta_loss
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()

        total_loss += raw_loss.data
        if args.aux:
            total_aux_losses += aux_losses.data

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss[0] / args.log_interval
            cur_aux_loss = total_aux_losses[0] / args.log_interval if args.aux else 0
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'raw_loss {:5.2f} | aux_loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, batch, len(train_data) // validseqlen, lr,
                      elapsed * 1000 / args.log_interval,
                      cur_loss, cur_aux_loss, math.exp(cur_loss)))
            total_loss = 0
            total_aux_losses = 0
            start_time = time.time()
            sys.stdout.flush()

    raw_output = None
    output = None
    decoded = None
    targets = None
    final_output = None
    final_decoded = None
    all_decoded = None
    all_outputs = None
    final_raw_output = None
def rank(data_source, label_, sessions_feature):
    model.eval()
    with torch.no_grad():
        ndcg_acc = 0
        ndcg_count = 0
        ndcg_acc_1 = 0
        ndcg_count_1 = 0
        ndcg_acc_2 = 0
        ndcg_count_2 = 0
        total_loss = 0
        ntokens = len(track_dic)
        batch_size = data_source.size(1)
        for i in range(0, data_source.size(0) - 1, seq_len):
            data, label, sf = get_batch_past(data_source, label_, sessions_feature, i, seq_len, evaluation=True)
            data = data.t()
            label = label.t()
            sf = sf.t()
            tracks_future, targets_future = get_batch_future(data_source, label_, i, seq_len, evaluation=True)
            tracks_future = tracks_future.t()
            targets_future = targets_future.t()

            # music_rnn; music_lstm
            hidden = model.init_hidden()
            rank_vec = model(data, sf)[:, -1, :]
            # rank_vec = model(data, sf, hidden)[0][2][:, -1, :]

            # TODO: avoid this per-session for loop
            for j in range(batch_size):
                track_f = tracks_future[j]
                # remove padding elements
                # track_f = track_f[track_f != 0]
                score = rank_vec[j][track_f]
                # build a data frame, then drop the padding elements
                df_future = pd.DataFrame({
                    'track': np.array(track_f),
                    'score': np.array(score),
                    'skip_info': np.array(targets_future[j][0:len(track_f)])
                })
                df_future = df_future.loc[df_future['track'] != 0]
                # sort tracks_future according to score
                df_future = df_future.sort_values(by='score', ascending=False)  # 0.8154440681444343  # 0.8227163023038474
                # df_future = df_future.sample(frac=1)  # 0.8115378563756852  # 0.7787248338261271

                # NDCG over all sessions
                actual = dcg_score(df_future['skip_info'])
                best = dcg_score(df_future['skip_info'].sort_values(ascending=True))
                if best:  # best might be 0 while skip_info is 3, 3, 3, ...
                    ndcg = actual / best
                    ndcg_acc = ndcg_acc + ndcg
                else:  # avoid nan
                    ndcg_acc = ndcg_acc + 1
                ndcg_count = ndcg_count + 1

                if (label[j] >= 2).sum() < x:  # fewer than x skipped tracks
                    track_f = tracks_future[j]
                    score_1 = rank_vec[j][track_f]
                    df_future_1 = pd.DataFrame({
                        'track': np.array(track_f),
                        'score': np.array(score_1),
                        'skip_info': np.array(targets_future[j][0:len(track_f)])
                    })
                    df_future_1 = df_future_1.loc[df_future_1['track'] != 0]
                    df_future_1 = df_future_1.sort_values(by='score', ascending=False)  # 0.8154440681444343  # 0.8227163023038474
                    # NDCG
                    actual_1 = dcg_score(df_future_1['skip_info'])
                    best_1 = dcg_score(df_future_1['skip_info'].sort_values(ascending=True))
                    if best_1:  # best might be 0 while skip_info is 3, 3, 3, ...
                        ndcg_1 = actual_1 / best_1
                        ndcg_acc_1 = ndcg_acc_1 + ndcg_1
                    else:  # avoid nan
                        ndcg_acc_1 = ndcg_acc_1 + 1
                    ndcg_count_1 = ndcg_count_1 + 1
                else:
                    track_f = tracks_future[j]
                    score_2 = rank_vec[j][track_f]
                    df_future_2 = pd.DataFrame({
                        'track': np.array(track_f),
                        'score': np.array(score_2),
                        'skip_info': np.array(targets_future[j][0:len(track_f)])
                    })
                    df_future_2 = df_future_2.loc[df_future_2['track'] != 0]
                    df_future_2 = df_future_2.sort_values(by='score', ascending=False)  # 0.8154440681444343  # 0.8227163023038474
                    # NDCG
                    actual_2 = dcg_score(df_future_2['skip_info'])
                    best_2 = dcg_score(df_future_2['skip_info'].sort_values(ascending=True))
                    if best_2:  # best might be 0 while skip_info is 3, 3, 3, ...
                        ndcg_2 = actual_2 / best_2
                        ndcg_acc_2 = ndcg_acc_2 + ndcg_2
                    else:  # avoid nan
                        ndcg_acc_2 = ndcg_acc_2 + 1
                    ndcg_count_2 = ndcg_count_2 + 1

        ndcg_avg = ndcg_acc / ndcg_count
        ndcg_avg_1 = ndcg_acc_1 / ndcg_count_1
        ndcg_avg_2 = ndcg_acc_2 / ndcg_count_2
    return ndcg_avg, ndcg_avg_1, ndcg_avg_2