    # -- Appears to be the tail of a training function; its `def` is above
    # this chunk (not visible here), so the 4-space indent is a
    # reconstruction — TODO confirm against the full file. --
    # Report the best validation results found during training.
    # NOTE(review): '{:4f}' is width-4 fixed-point (6 default decimals);
    # '{:.4f}' (4 decimal places) was probably intended — confirm.
    print()
    print('Best val loss: {:4f}'.format(best_loss))
    print('Best val metrics: {}'.format(best_metrics))
    print('At epoch: {:d}'.format(best_epoch))
    # Best-effort: drop references to the last batch so empty_cache() can
    # actually release that GPU memory.
    # NOTE(review): bare `except:` swallows everything, not just an
    # undefined name — `except NameError: pass` would be safer.
    try:
        del inputs, labels
    except:
        pass
    torch.cuda.empty_cache()
    return model, optimizer, epoch, loss


# In[9]:


# Project-local tokenizer configured from the global `cfg` dict.
tokenizer = MyTokenizer(cfg=cfg)


# In[11]:


# Training set uses multiple-negative-sampling (neg_k negatives per
# positive, per cfg); validation set is the plain dataset.
# NOTE(review): presumably HDF5-backed project datasets defined/imported
# earlier in the file — verify.
ds = MNSAllDataset('../data/Kdd/train_all_processed_label.h5',
                   neg_k=cfg['num_negative_sampling'],
                   single_thread=False)
val_ds = BasicAllDataset('../data/Kdd/valid_all_processed_label.h5',
                         single_thread=True)


# In[12]:


# Keep the first `train_fraction` of the index range for training and
# sample it in random order each epoch.
train_size = len(ds)
train_split = int(train_size * cfg['train_fraction'])
train_indices = list(range(train_size))
train_sampler = data.sampler.SubsetRandomSampler(train_indices[:train_split])
# def LabelPassBert():


def read_label_info(path='../data/Kdd/multimodal_labels.txt'):
    """Read the label file and return label names indexed by class number.

    Each data line is expected to be ``<num>\\t<label>``; the first line is
    skipped as a header.

    Args:
        path: tab-separated label file (UTF-8).

    Returns:
        list[str]: ``num_to_label[k]`` is the label name for class ``k``
        (class numbers start at 0 and follow file order).
    """
    num_to_label = []
    # Explicit encoding: the default is locale-dependent and this file may
    # contain non-ASCII label text.
    with open(path, 'r', encoding='utf-8') as f:
        f.readline()  # skip header line (presumably a header — confirm)
        for line in f:
            fields = line.strip().split('\t')
            # Guard against blank/malformed lines (e.g. a trailing newline
            # at EOF), which would otherwise raise IndexError.
            if len(fields) > 1:
                num_to_label.append(fields[1])
    return num_to_label


myTokenizer = MyTokenizer(cfg)


def convert_label_to_token_id(class_num_to_label, mytokenizer):
    """Tokenize each class-label string into a ``(1, seq_len)`` id tensor.

    Args:
        class_num_to_label: label strings indexed by class number
            (class labels start at 0).
        mytokenizer: tokenizer exposing ``convert_str_to_ids``.

    Returns:
        list: ``label_to_token_id[k]`` is the unpadded token-id tensor for
        label ``k``, reshaped to ``(1, -1)``.
    """
    label_to_token_id = []
    for label in class_num_to_label:
        ids = mytokenizer.convert_str_to_ids(
            label, tensor=True, max_len=512, pad=False)
        label_to_token_id.append(ids.view(1, -1))
    return label_to_token_id