def Encoding_table(table_data, data_type, id):
    """Encode every cell of each table with BERT and save the embeddings.

    Args:
        table_data: list of tables; each table is a list of rows, each row
            a list of cell strings.
        data_type: dataset split tag (e.g. "train"/"dev"/"test") used in
            the output filename.
        id: shard index used in the output filename. (NOTE: shadows the
            builtin ``id``; name kept for caller compatibility.)

    Side effects:
        Writes a list with one tensor per table (the BERT
        ``last_hidden_state`` over that table's cells) to
        ``{args.data}/processed_datasets/embedding_data/{data_type}_table_{id}.pt``.
    """
    args = parse_opt()
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    model.to(device)
    model = DDP(model)

    # Flatten each table into a row-major list of its cell strings.
    tables = [[cell for row in table for cell in row] for table in table_data]

    table_out = []
    # Inference only: no_grad avoids building autograd graphs, which the
    # original kept alive inside every saved tensor (large GPU-memory waste).
    with torch.no_grad():
        for cells in tables:
            table_tokens = tokenizer(cells,
                                     padding='max_length',
                                     truncation=True,
                                     max_length=args.table_max_len,
                                     return_tensors='pt')
            table_tokens.to(device)
            table_out.append(model(**table_tokens).last_hidden_state)

    torch.save(
        table_out,
        args.data + "/processed_datasets/embedding_data/{}_table_{}.pt".format(
            data_type, id))
def Encoding_sentence(sent_data, data_type, id):
    """Encode support and statement sentences with BERT and save embeddings.

    Args:
        sent_data: sequence of records where index 1 holds the support
            sentence and index 2 holds the statement sentence.
        data_type: dataset split tag used in the output filenames.
        id: shard index used in the output filenames. (NOTE: shadows the
            builtin ``id``; name kept for caller compatibility.)

    Side effects:
        Writes ``{data_type}_support_{id}.pt`` and
        ``{data_type}_statement_{id}.pt`` under
        ``{args.data}/processed_datasets/embedding_data/``.
    """
    args = parse_opt()
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    model.to(device)
    model = DDP(model)

    supports = [record[1] for record in sent_data]
    statements = [record[2] for record in sent_data]

    support_tokens = tokenizer(supports,
                               padding='max_length',
                               truncation=True,
                               max_length=args.sent_max_len,
                               return_tensors='pt')
    statement_tokens = tokenizer(statements,
                                 padding='max_length',
                                 truncation=True,
                                 max_length=args.sent_max_len,
                                 return_tensors='pt')
    support_tokens.to(device)
    statement_tokens.to(device)

    # Inference only: skip autograd bookkeeping so the saved tensors do not
    # drag a computation graph along.
    with torch.no_grad():
        support_out = model(**support_tokens).last_hidden_state
        statement_out = model(**statement_tokens).last_hidden_state

    torch.save(
        support_out,
        args.data +
        "/processed_datasets/embedding_data/{}_support_{}.pt".format(
            data_type, id))
    torch.save(
        statement_out,
        args.data +
        "/processed_datasets/embedding_data/{}_statement_{}.pt".format(
            data_type, id))
def Encoding_column(column_data, data_type, id):
    """Encode table column headers with BERT and save the embeddings.

    Args:
        column_data: list where each element is the list of column strings
            for one table.
        data_type: dataset split tag used in the output filename.
        id: shard index used in the output filename. (NOTE: shadows the
            builtin ``id``; name kept for caller compatibility.)

    Side effects:
        Writes a list with one ``last_hidden_state`` tensor per table to
        ``{args.data}/processed_datasets/embedding_data/{data_type}_column_{id}.pt``.
    """
    args = parse_opt()
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    model.to(device)
    model = DDP(model)

    # Shallow copy is enough here; the original copied element by element.
    columns = list(column_data)

    column_out = []
    # Inference only: no_grad avoids retaining autograd graphs inside the
    # saved tensors.
    with torch.no_grad():
        for column in columns:
            column_tokens = tokenizer(column,
                                      padding='max_length',
                                      truncation=True,
                                      max_length=args.table_max_len,
                                      return_tensors='pt')
            column_tokens.to(device)
            column_out.append(model(**column_tokens).last_hidden_state)

    torch.save(
        column_out,
        args.data +
        "/processed_datasets/embedding_data/{}_column_{}.pt".format(
            data_type, id))
scores = model(sentences, words_per_sentence) # (batch_size, n_classes) # accuracy _, predictions = scores.max(dim=1) # (n_documents) correct_predictions = torch.eq(predictions, labels).sum().item() accuracy = correct_predictions / labels.size(0) # keep track of metrics accs.update(accuracy, labels.size(0)) # final test accuracy print('\n * TEST ACCURACY - %.1f percent\n' % (accs.avg * 100)) if __name__ == '__main__': config = parse_opt() # load model checkpoint_path = os.path.join(config.checkpoint_path, config.checkpoint_basename + '.pth.tar') model, _, _, _, _, _ = load_checkpoint(checkpoint_path, device) model = model.to(device) model.eval() # load test data test_loader = load_data(config, 'test') test(model, config.model_name, test_loader)
    Predict the emotion of an audio file.

    Args:
        config: configuration options
        audio_path (str): path of the audio file to predict
        model: the loaded model
    """
    # utils.play_audio(audio_path)

    if config.feature_method == 'o':
        # temporary workaround for an unexplained bug
        of.get_data(config, audio_path, train=False)
        test_feature = of.load_feature(config, train=False)
    elif config.feature_method == 'l':
        test_feature = lf.get_data(config, audio_path, train=False)

    # predicted class index and per-class probabilities
    result = model.predict(test_feature)
    result_prob = model.predict_proba(test_feature)
    print('Recogntion: ', config.class_labels[int(result)])
    print('Probability: ', result_prob)
    # radar chart of the class probabilities
    utils.radar(result_prob, config.class_labels)


if __name__ == '__main__':
    audio_path = '/Users/zou/Renovamen/Developing/Speech-Emotion-Recognition/datasets/CASIA/angry/201-angry-liuchanhg.wav'
    config = utils.parse_opt()
    model = models.load(config)
    predict(config, audio_path, model)
def Encoding(dataset, data_type):
    """Pre-encode a dataset (supports, statements, tables, columns) with BERT.

    For each mini-batch ``j`` the encoded tensors are grouped per example as
    ``[support, statement, table, column, label]`` and saved to
    ``{args.data}/processed_datasets/embedding_data/{data_type}_part0_{j}.pt``.

    Args:
        dataset: indexable dataset whose ``__getitem__`` returns a dict with
            keys 'support', 'statement', 'label', 'column', 'table'.
        data_type: split tag used in the output filenames.
    """
    print(20 * "=" + "pre encoding" + 10 * '=')
    args = parse_opt()
    num_data = dataset.__len__()
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    model.to(device)
    model = DDP(model)

    supports, statements, tables, columns, labels = [], [], [], [], []
    for i in tqdm(range(num_data)):
        data_dict = dataset.__getitem__(i)
        supports.append(data_dict['support'])
        statements.append(data_dict['statement'])
        labels.append(data_dict['label'])
        columns.append(data_dict['column'])
        # Flatten the table into a row-major list of its cell strings.
        tables.append(
            [cell for row in data_dict['table'] for cell in row])

    # Batch boundaries. NOTE(review): this yields args.batch - 1 slices
    # (0, s, 2s, ..., (batch-2)*s, num_data) with the last slice absorbing
    # the remainder — kept as-is to preserve the on-disk layout.
    split_data_num = num_data // (args.batch)
    split_list = [split_data_num * n for n in range(args.batch - 1)]
    split_list.append(num_data)

    # Inference only: no_grad keeps autograd graphs out of the saved tensors.
    with torch.no_grad():
        for j in tqdm(range(len(split_list) - 1)):
            lo, hi = split_list[j], split_list[j + 1]

            support_tokens = tokenizer(supports[lo:hi],
                                       padding='max_length',
                                       truncation=True,
                                       max_length=args.sent_max_len,
                                       return_tensors='pt')
            statement_tokens = tokenizer(statements[lo:hi],
                                         padding='max_length',
                                         truncation=True,
                                         max_length=args.sent_max_len,
                                         return_tensors='pt')
            support_tokens.to(device)
            statement_tokens.to(device)
            support_out = model(**support_tokens).last_hidden_state
            statement_out = model(**statement_tokens).last_hidden_state

            column_out = []
            for column in tqdm(columns[lo:hi]):
                column_tokens = tokenizer(column,
                                          padding='max_length',
                                          truncation=True,
                                          max_length=args.table_max_len,
                                          return_tensors='pt')
                column_tokens.to(device)
                column_out.append(model(**column_tokens).last_hidden_state)

            table_out = []
            for table in tqdm(tables[lo:hi]):
                table_tokens = tokenizer(table,
                                         padding='max_length',
                                         truncation=True,
                                         max_length=args.table_max_len,
                                         return_tensors='pt')
                table_tokens.to(device)
                table_out.append(model(**table_tokens).last_hidden_state)

            data = []
            for m in range(len(support_out)):
                # BUG FIX: labels must be indexed globally (lo + m).  The
                # original used labels[m], pairing every batch with labels
                # from the START of the dataset.  A fresh list is also built
                # per example so records do not alias one shared list.
                data.append([
                    support_out[m],
                    statement_out[m],
                    table_out[m],
                    column_out[m],
                    torch.tensor(labels[lo + m]),
                ])
            torch.save(
                data,
                args.data +
                "/processed_datasets/embedding_data/{}_part0_{}.pt".format(
                    data_type, j))
    # NOTE(review): fragment — the enclosing `def` is outside this chunk;
    # `data_type` comes from that function's parameters. Indentation
    # reconstructed at one level — confirm against the full file.
    datapath = '/home/DATA/TabFact'
    sent_data, table_ids, table_data = read_dataset(datapath, data_type)
    num_data = len(sent_data)
    # last element of each sentence record is its label
    labels = []
    for i in range(num_data):
        labels.append(sent_data[i][-1])
    with open(
            datapath +
            '/processed_datasets/embedding_data/{}_label.json'.format(
                data_type), 'w') as f:
        json.dump(labels, f)
        # redundant: the `with` block already closes the file
        f.close()


if __name__ == "__main__":
    args = parse_opt()
    '''
    write_dataset("train")
    write_dataset("test")
    write_dataset("dev")
    write_dataset("example")
    '''
    #write_embedding_dataset("example")
    '''
    for i in tqdm(range(512)):
        write_embedding_sent_dataset('train', i)
    write_embedding_sent_dataset_end('train')
    '''
    '''
    for j in tqdm(range(512)):
        write_embedding_table_dataset('train', j)