'dropout_prob': 0, 'lstm_layer_num': 1, 'num_labels': 45 } mymodel = REL_BLSTM_CRF(config=model_config, show_param=True) ###=========================================================== ###模型参数测试 ###=========================================================== ##pass ###=========================================================== ###试训练 -- train_part ###=========================================================== data_set = AutoKGDataset('./d1/') train_dataset = data_set.train_dataset[:20] eval_dataset = data_set.dev_dataset[:10] # train_dataset = data_set.train_dataset # eval_dataset = data_set.dev_dataset os.makedirs('result', exist_ok=True) data_loader = KGDataLoader(data_set, rebuild=False, temp_dir='result/') # print(data_loader.embedding_info_dicts['entity_type_dict']) print(data_loader.embedding_info_dicts['label_location_dict']) show_metadata(data_loader.metadata_) print('start_tags:', data_loader.rel_seq_map_dict[data_loader.START_TAG]) print('end_tags:', data_loader.rel_seq_map_dict[data_loader.END_TAG]) train_param = {
# for rel, v in temp.items(): # print(rel) # print(v[0]) # print(v[1]) return temp if __name__ == '__main__': # load_bert_pretrained_dict() result_dir = './result/' data_set = AutoKGDataset('./data/d4/') train_dataset = data_set.train_dataset[:200] import os os.makedirs(result_dir, exist_ok=True) data_loader = KGDataLoader2(data_set, rebuild=False, temp_dir=result_dir) show_dict_info(data_loader) # train_data_mat_dict = data_loader.transform_rel(train_dataset, istest=False, ratio=0) train_data_mat_dict = data_loader.transform(train_dataset, istest=False, data_type='rel', ratio=0) data_generator = Batch_Generator(train_data_mat_dict, batch_size=4, data_type='rel', isshuffle=True) # # data_generator = Batch_Generator(train_data_mat_dict, batch_size=4, data_type='ent', isshuffle=True) pred = data_loader.transform_back(train_data_mat_dict, data_type='rel')
def main():
    """Entry point for the BERT_Hierarchical pipeline.

    Parses CLI args, loads the dataset and its metadata, builds a
    KGDataLoader3 and a BERT_Hierarchical model, then dispatches on
    ``args.mode`` to train, evaluate, or predict.
    """
    LOGGER.info("===== Start program")
    LOGGER.info('===== Initialize args')
    args = _parse_args()
    _init_python_path(args)
    LOGGER.info(f'===== task mode: {args.mode}')

    ## Load the dataset
    dataset = AutoKGDataset(args.dataset_dir)
    # show_metadata(dataset.metadata_)
    LOGGER.info('===== Load metadata')
    LOGGER.info(f'===== use_cuda: {args.use_cuda}')
    metadata = dataset.get_metadata()
    # Dataset metadata may override the CLI-provided time budget.
    args.time_budget = metadata.get('time_budget', args.time_budget)
    LOGGER.info(f'Time budget: {args.time_budget}')

    # Each loader variant pairs with a specific model family:
    # data_loader = KGDataLoader1(dataset, rebuild=False, temp_dir=args.result_dir)  ## For model REL_BLSTM_CRF
    # data_loader = KGDataLoader2(dataset, rebuild=False, temp_dir=args.result_dir)  ## For novel tagging model
    data_loader = KGDataLoader3(dataset, rebuild=False, temp_dir=args.result_dir)  ## For model BERT_Hierarchical
    print('max sentence length: ', data_loader.sentence_max_len)
    # show_dict_info(data_loader)
    # print(data_loader.entity_type_dict)
    # print(list(data_loader.rel_seq_map_dict))

    ## Reload model
    model_params = {
        # 'embedding_dim' : 768,
        # 'hidden_dim' : 64,
        'n_ent_tags' : len(data_loader.ent_seq_map_dict),
        # 'n_rel_tags' : len(data_loader.rel_seq_map_dict),
        'n_rels' : len(data_loader.relation_type_dict),
        'n_words' : len(data_loader.character_location_dict),
        'use_cuda':args.use_cuda,
        'dropout_prob': 0,
        'lstm_layer_num': 1
    }
    ##TODO:
    ##!!! need KGDataLoader3
    mymodel = BERT_Hierarchical(model_params, show_param=True)

    # On CPU, cap the train/eval set sizes so a run finishes in reasonable time.
    if args.use_cuda:
        train_dataset = dataset.train_dataset
        test_dataset = dataset.test_dataset
        eval_dataset = dataset.dev_dataset
    else:
        train_dataset = dataset.train_dataset[:CPU_TRAIN]
        test_dataset = dataset.test_dataset
        eval_dataset = dataset.dev_dataset[:CPU_EVAL]

    # Gold answers for the final test evaluation.
    test_dataset_final = dataset._read_dataset(
        os.path.join(args.answer_dir, 'test.solution')
    )
    test_dataset_final = dataset.check_repeat_sentence(test_dataset_final)  ## remove the repeat sentences

    if args.mode == 'train':
        LOGGER.info('===== Start Train')
        _train(mymodel, args, data_loader, train_dataset=train_dataset, eval_dataset=eval_dataset,
               RELOAD_MODEL='model_test.p', use_cuda=args.use_cuda)
        # After training, immediately evaluate on the held-out test solution.
        LOGGER.info('===== Start Eval')
        _eval(mymodel, args, data_loader, data_set=test_dataset_final,
              RELOAD_MODEL='model_test.p', use_cuda=args.use_cuda)
    if args.mode == 'eval':
        LOGGER.info('===== Start Eval')
        _eval(mymodel, args, data_loader, data_set=eval_dataset,
              RELOAD_MODEL='model_lr_0.01.p', use_cuda=args.use_cuda)
    if args.mode == 'predict':
        LOGGER.info('===== Start Predict')
        _predict(mymodel, args, data_loader, data_set=test_dataset,
                 RELOAD_MODEL='model_test.p', use_cuda=args.use_cuda)
ranges = torch.arange(0, max_len).long() #(max_len) if lens.is_cuda: ranges = ranges.cuda() ranges = ranges.unsqueeze(0).expand(batch_size, max_len) #(batch_size, max_len) lens_exp = lens.unsqueeze(1).expand_as(ranges) #(batch_size, max_len) mask = ranges < lens_exp return mask if __name__ == '__main__': ###=========================================================== ###试训练 ###=========================================================== data_set = AutoKGDataset('./data/newdata2/d10') train_dataset = data_set.train_dataset eval_dataset = data_set.dev_dataset # train_dataset = data_set.train_dataset # eval_dataset = data_set.dev_dataset os.makedirs('result', exist_ok=True) data_loader = KGDataLoader3(data_set, rebuild=False, temp_dir='result/') model_config = { 'embedding_dim': 768, 'n_rels': len(data_loader.relation_type_dict), 'use_cuda': 1, 'dropout_prob': 0, } mymodel = BERT_Hierarchical(model_config, show_param=True)
def main():
    """Entry point for the sequence-tagging pipeline (BERT_MLP2 by default).

    Parses CLI args, loads the dataset and its metadata, builds a
    KGDataLoader and the selected model, then dispatches on ``args.mode``
    to train, evaluate, or predict.
    """
    LOGGER.info("===== Start program")
    LOGGER.info('===== Initialize args')
    args = _parse_args()
    _init_python_path(args)
    LOGGER.info(f'===== task mode: {args.mode}')

    ## Load the dataset
    dataset = AutoKGDataset(args.dataset_dir)
    # show_metadata(dataset.metadata_)
    LOGGER.info('===== Load metadata')
    LOGGER.info(f'===== use_cuda: {args.use_cuda}')
    metadata = dataset.get_metadata()
    # Dataset metadata may override the CLI-provided time budget.
    args.time_budget = metadata.get('time_budget', args.time_budget)
    LOGGER.info(f'Time budget: {args.time_budget}')

    data_loader = KGDataLoader(dataset, rebuild=False, temp_dir=args.result_dir)
    show_dict_info(data_loader)
    print(data_loader.sentence_max_len)

    ## Reload model
    model_params = {
        # 'embedding_dim' : 768,
        # 'hidden_dim' : 64,
        'n_ent_tags': len(data_loader.ent_seq_map_dict),
        'n_rel_tags': len(data_loader.rel_seq_map_dict),
        # +1 presumably reserves an extra "no relation" slot — TODO confirm.
        'n_rels': len(data_loader.label_location_dict) + 1,
        'n_words': len(data_loader.character_location_dict),
        'use_cuda': args.use_cuda,
        'dropout_prob': 0,
        'lstm_layer_num': 1,
        # 'start_ent_idx': data_loader.ent_seq_map_dict[data_loader.START_TAG],  ## <start> tag index for entity tag seq
        # 'end_ent_idx': data_loader.ent_seq_map_dict[data_loader.END_TAG],  ## <end> tag index for entity tag seq
        # 'start_rel_idx': data_loader.rel_seq_map_dict[data_loader.START_TAG],  ## <start> tag index for relation tag seq
        # 'end_rel_idx': data_loader.rel_seq_map_dict[data_loader.END_TAG],  ## <end> tag index for relation tag seq
    }
    ##TODO:

    ### Models built on a hand-written CRF: similar results, kept out to simplify the code; not used now ========
    ## mymodel = BLSTM_CRF(model_params, show_param=True)
    ## mymodel = BERT_LSTM_CRF(model_params, show_param=True)
    ## mymodel = BERT_CRF(model_params, show_param=True)
    ## mymodel = REL_BLSTM_CRF(model_params, show_param=True)  ## relation-extraction model, not used

    ### Main comparison models ============================
    # mymodel = BASELINE(model_params, show_param=True)
    # mymodel = BERT_MLP(model_params, show_param=True)
    mymodel = BERT_MLP2(model_params, show_param=True)
    # mymodel = BERT_CRF2(model_params, show_param=True)
    # mymodel = BERT_LSTM_CRF2(model_params, show_param=True)

    # On CPU, cap the train/eval set sizes so a run finishes in reasonable time.
    if args.use_cuda:
        train_dataset = dataset.train_dataset
        test_dataset = dataset.test_dataset
        eval_dataset = dataset.dev_dataset
    else:
        train_dataset = dataset.train_dataset[:CPU_TRAIN]
        test_dataset = dataset.test_dataset
        eval_dataset = dataset.dev_dataset[:CPU_EVAL]

    # Gold answers for the final test evaluation.
    test_dataset_final = dataset._read_dataset(
        os.path.join(args.answer_dir, 'test.solution'))
    test_dataset_final = dataset.check_repeat_sentence(
        test_dataset_final)  ## remove the repeat sentences

    if args.mode == 'train':
        LOGGER.info('===== Start Train')
        _train(mymodel, args, data_loader, train_dataset=train_dataset, eval_dataset=eval_dataset,
               RELOAD_MODEL='model_test.p', use_cuda=args.use_cuda)
        # After training, immediately evaluate on the held-out test solution.
        LOGGER.info('===== Start Eval')
        _eval(mymodel, args, data_loader, data_set=test_dataset_final,
              RELOAD_MODEL='model_test.p', use_cuda=args.use_cuda)
    if args.mode == 'eval':
        LOGGER.info('===== Start Eval')
        _eval(mymodel, args, data_loader, data_set=eval_dataset,
              RELOAD_MODEL='model_test.p', use_cuda=args.use_cuda)
    if args.mode == 'predict':
        LOGGER.info('===== Start Predict')
        _predict(mymodel, args, data_loader, data_set=test_dataset,
                 RELOAD_MODEL='model_test.p', use_cuda=args.use_cuda)