def main():
    """Train/evaluate KG-embedding models (ConvKB/TransE variants) over a
    sequence of data splits, warm-starting each split from the previous one.

    Relies on the module-level ``args`` namespace (parsed CLI options) and
    project helpers ``build_all_data``/``build_data``/``Corpus``/
    ``load_model``/``train``/``evaluate``.
    """
    # Resolve dataset-specific input/output directories.
    args.data_dir = os.path.join(args.data_dir, args.dataset)
    args.output_dir = os.path.join(args.output_dir, args.dataset)
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        print("Output directory ({}) already exists and is not empty.".format(
            args.output_dir))
    else:
        os.makedirs(args.output_dir, exist_ok=True)

    CUDA = torch.cuda.is_available()
    if CUDA:
        print("using CUDA")

    # Seed every RNG in play for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    print("args = ", args)

    ori_model = None   # model carried over from the previous split, if any
    ori_load = True    # load args.load only once, for the first split

    for idx in range(args.s_N, args.N):
        data_idx = idx
        # Build data for this split; loader choice depends on the mode flags.
        if args.all_data or args.up_bound:
            (train_data, validation_data, test_data, entity2id, relation2id,
             sub_entity2id, test_sub_triples, valid_triples_list,
             valid_train_triples_list) = build_all_data(
                args.data_dir, seed=args.seed, up_bound=args.up_bound,
                data_idx=data_idx)
        else:
            (train_data, validation_data, test_data, entity2id, relation2id,
             sub_entity2id, test_sub_triples, valid_triples_list,
             valid_train_triples_list) = build_data(
                args.data_dir, seed=args.seed, data_idx=data_idx,
                test_idx=args.test_idx, process=args.process,
                low_th=args.low_th)

        # Random init; "_" model variants use top_n-wide relation embeddings.
        entity_embeddings = np.random.randn(
            len(entity2id), args.embedding_size * args.k_factors)
        if "_" in args.model_name:
            relation_embeddings = np.random.randn(
                len(relation2id), args.embedding_size * args.top_n)
        else:
            relation_embeddings = np.random.randn(len(relation2id),
                                                  args.embedding_size)
        print("Initialised relations and entities randomly")
        entity_embeddings = torch.FloatTensor(entity_embeddings)
        relation_embeddings = torch.FloatTensor(relation_embeddings)
        print("Initial entity dimensions {} , relation dimensions {}".format(
            entity_embeddings.size(), relation_embeddings.size()))

        train_loader = Corpus(args, train_data, validation_data, test_data,
                              sub_entity2id, relation2id, args.batch_size,
                              args.valid_invalid_ratio, valid_triples_list,
                              valid_train_triples_list)

        # Encode the hyper-parameters in the run name (same layout as before:
        # "<label>_<value>" segments joined with "_").
        name_parts = [
            ("model_name", args.model_name),
            ("embedding_size", args.embedding_size),
            ("lr", args.lr),
            ("epochs", args.epochs),
            ("k_factors", args.k_factors),
            ("batch_size", args.batch_size),
            ("step_size", args.step_size),
            ("l1", args.l1),
            ("use_second_nei", args.use_second_nei),
            ("w1", args.w1),
            ("up_bound", args.up_bound),
            ("top_n", args.top_n),
            ("att_lr", args.att_lr),
        ]
        file_name = "_".join("{}_{}".format(k, v) for k, v in name_parts)

        if args.all_data:
            model_path = os.path.join(args.output_dir, file_name)
        else:
            model_path = os.path.join(args.output_dir, str(data_idx),
                                      file_name)
        # exist_ok avoids the check-then-create race of the original.
        os.makedirs(model_path, exist_ok=True)

        model_classes = {'ConvKB': ConvKB, 'TransE': TransE,
                         'ConvKB_2': ConvKB_2, 'TransE_2': TransE_2}
        if args.model_name not in model_classes:
            # Fail fast instead of printing and later crashing on an
            # undefined ``model``.
            raise ValueError("no such model name: {}".format(args.model_name))
        model = model_classes[args.model_name](
            entity_embeddings, relation_embeddings, config=args)

        print("load path", args.load)
        if args.load != 'None' and ori_load:
            model = load_model(model, args.load)
            print("model loaded")
            ori_load = False
        if ori_model is not None:
            # Warm-start from the model trained on the previous split.
            model = copy.deepcopy(ori_model)
            print("load model from", idx - 1)
        if CUDA:
            # Original called model.cuda() unconditionally, which crashes on
            # CPU-only machines.
            model.cuda()

        # Un-freeze any parameter that was accidentally left frozen.
        for name, param in model.named_parameters():
            if not param.requires_grad:
                print("False", name)
                param.requires_grad = True

        best_epoch = 0
        if args.evaluate == 0:
            best_epoch = train(args, train_loader, model, model_path,
                               data_idx)
        ori_model = copy.deepcopy(model)
        evaluate(args, model, model_path, train_loader, file_name, data_idx,
                 best_epoch=best_epoch, test_sub_triples=test_sub_triples)
        evaluate(args, model, model_path, train_loader, file_name, data_idx,
                 best_epoch=best_epoch, test_sub_triples=test_sub_triples,
                 best_or_final='final')
        # The next split resumes from this split's final checkpoint.
        args.load = os.path.join(model_path, 'trained_final.pth')
def main():
    """Train/evaluate GAT-style node classifiers over a sequence of data
    splits, warm-starting each split from the model of the previous split.

    Relies on the module-level ``args`` namespace and ``CUDA`` flag, plus
    the project helpers ``load_data``/``Corpus``/``load_model``/``train``/
    ``evaluate`` and the model classes ``SpGAT``/``SpGAT_2``/``GAT``.
    """
    # Resolve dataset-specific input/output directories.
    args.data_dir = os.path.join(args.data_dir, args.dataset)
    args.output_dir = os.path.join(args.output_dir, args.dataset)
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        print("Output directory ({}) already exists and is not empty.".format(
            args.output_dir))
    else:
        os.makedirs(args.output_dir, exist_ok=True)

    # Seed every RNG in play for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if CUDA:  # NOTE(review): CUDA appears to be a module-level flag — confirm
        args.use_cuda = CUDA
        torch.cuda.manual_seed_all(args.seed)
        torch.backends.cudnn.deterministic = True
    print("args = ", args)

    ori_model = None   # model carried over from the previous split, if any
    ori_load = True    # load args.load only once, for the first split

    for idx in range(args.N):
        data_idx = idx
        # Load data
        (adj, features, labels, idx_train, idx_val, idx_test, test_sub_idx,
         ori_adj, ori_idx_train, ori_idx_valid) = load_data(
            args, data_idx, base_path=args.data_dir, dataset=args.dataset)

        # Encode the hyper-parameters in the run name (same layout as before:
        # "<label>_<value>" segments joined with "_").
        name_parts = [
            ("model_name", args.model_name),
            ("lr", args.lr),
            ("epochs", args.epochs),
            ("k_factors", args.k_factors),
            ("up_bound", args.up_bound),
            ("top_n", args.top_n),
            ("att_lr", args.att_lr),
            ("hidden", args.hidden),
            ("w1", args.w1),
        ]
        file_name = "_".join("{}_{}".format(k, v) for k, v in name_parts)

        if args.all_data:
            model_path = os.path.join(args.output_dir, file_name)
        else:
            model_path = os.path.join(args.output_dir, str(data_idx),
                                      file_name)
        # exist_ok avoids the check-then-create race of the original.
        os.makedirs(model_path, exist_ok=True)

        # Model and optimizer
        if args.model_name == "SpGAT":
            model = SpGAT(nfeat=features.shape[1],
                          nhid=args.hidden,
                          nclass=int(labels.max()) + 1,
                          dropout=args.dropout,
                          nheads=args.nb_heads,
                          alpha=args.alpha)
        elif args.model_name in ("SpGAT_2", "SpGAT2"):
            # Both spellings select the same class (the original had two
            # identical branches).
            model = SpGAT_2(nfeat=features.shape[1],
                            nclass=int(labels.max()) + 1,
                            config=args)
        else:
            model = GAT(nfeat=features.shape[1],
                        nhid=args.hidden,
                        nclass=int(labels.max()) + 1,
                        dropout=args.dropout,
                        nheads=args.nb_heads,
                        alpha=args.alpha)

        print("load path", args.load)
        if args.load != 'None' and ori_load:
            model = load_model(model, args.load)
            print("model loaded")
            ori_load = False
        if ori_model is not None:
            # Warm-start from the model trained on the previous split.
            model = copy.deepcopy(ori_model)
            print("load model from", idx - 1)
        print(model.state_dict().keys())

        if CUDA:
            model.cuda()
            features = Variable(features.cuda())
            adj = Variable(adj.cuda())
            labels = Variable(labels.cuda())
            idx_train = idx_train.cuda()
            idx_val = idx_val.cuda()
            idx_test = idx_test.cuda()
            # NOTE(review): the original's flattened formatting makes the
            # nesting ambiguous; the .cuda() calls below are only valid with
            # CUDA available, so they are kept inside this guard.
            if ("_" in args.model_name and not args.all_data
                    and data_idx > 0 and ori_adj is not None):
                ori_adj = Variable(ori_adj.cuda())
                ori_idx_train = ori_idx_train.cuda()
                ori_idx_valid = ori_idx_valid.cuda()

        loader = Corpus(features, adj, labels, idx_train, idx_val, idx_test,
                        ori_adj, ori_idx_train, ori_idx_valid)

        # Un-freeze any parameter that was accidentally left frozen.
        for name, param in model.named_parameters():
            if not param.requires_grad:
                print("False", name)
                param.requires_grad = True

        best_epoch = 0
        if args.evaluate == 0:
            best_epoch = train(model, model_path, loader, data_idx)
        ori_model = copy.deepcopy(model)
        evaluate(model, model_path, loader, data_idx, best_epoch=best_epoch,
                 test_sub_idx=test_sub_idx)
        evaluate(model, model_path, loader, data_idx, best_epoch=best_epoch,
                 test_sub_idx=test_sub_idx, best_or_final='final')
        # The next split resumes from this split's final checkpoint.
        args.load = os.path.join(model_path, 'trained_final.pth')
def main():
    """Train and evaluate a NASE model with a fixed architecture genotype.

    Relies on the module-level ``args`` namespace and the project helpers
    ``build_data``/``init_embeddings``/``Corpus``/``train``/``evaluate``,
    the ``genotypes`` module and the ``NASE`` model class.
    """
    # Resolve dataset-specific input/output directories.
    args.data_dir = os.path.join(args.data_dir, args.dataset)
    args.output_dir = os.path.join(args.output_dir, args.dataset)
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        print("Output directory ({}) already exists and is not empty.".format(
            args.output_dir))
    else:
        os.makedirs(args.output_dir, exist_ok=True)

    CUDA = torch.cuda.is_available()
    if CUDA:
        print("using CUDA")

    # Seed every RNG in play for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    print("args = ", args)

    train_data, validation_data, test_data, entity2id, relation2id = \
        build_data(args.data_dir)

    if args.pretrained_emb:
        entity_embeddings, relation_embeddings = init_embeddings(
            os.path.join(args.data_dir, 'entity2vec.txt'),
            os.path.join(args.data_dir, 'relation2vec.txt'),
            args.k_factors, args.embedding_size)
        print("Initialised relations and entities from TransE")
    else:
        entity_embeddings = np.random.randn(len(entity2id),
                                            args.embedding_size)
        relation_embeddings = np.random.randn(len(relation2id),
                                              args.embedding_size)
        print("Initialised relations and entities randomly")
    entity_embeddings = torch.FloatTensor(entity_embeddings)
    relation_embeddings = torch.FloatTensor(relation_embeddings)
    print("Initial entity dimensions {} , relation dimensions {}".format(
        entity_embeddings.size(), relation_embeddings.size()))

    train_loader = Corpus(args, train_data, validation_data, test_data,
                          entity2id, relation2id, args.batch_size,
                          args.valid_invalid_ratio)

    # Encode the hyper-parameters in the run name (same layout as before:
    # "train_<model>" then "<label>_<value>" segments joined with "_").
    name_parts = [
        ("embedding_size", args.embedding_size),
        ("lr", args.lr),
        ("epochs", args.epochs),
        ("batch_size", args.batch_size),
        ("dropout", args.dropout),
        ("step_size", args.step_size),
        ("arch", args.arch),
        ("layers", args.layers),
        ("margin", args.margin),
        ("do_margin_loss", args.do_margin_loss),
    ]
    file_name = "_".join(["train_{}".format(args.model_name)]
                         + ["{}_{}".format(k, v) for k, v in name_parts])

    model_path = os.path.join(args.output_dir, file_name)
    output_file = os.path.join(args.output_dir,
                               "results_" + file_name + ".txt")
    # exist_ok avoids the check-then-create race of the original.
    os.makedirs(model_path, exist_ok=True)

    if args.model_name != 'NASE':
        # Fail fast instead of printing and later crashing on an undefined
        # ``model``.
        raise ValueError("no such model name: {}".format(args.model_name))
    # getattr replaces the original eval() on a CLI-supplied string — same
    # result for valid genotype names, no arbitrary-code-execution surface.
    arc = getattr(genotypes, args.arch)
    print("\narc:", arc)
    model = NASE(entity_embeddings, relation_embeddings, arc, config=args)

    if args.load != 'None':
        model.load_state_dict(torch.load(args.load))
        print("model loaded")
    if CUDA:
        # Original called model.cuda() unconditionally, which crashes on
        # CPU-only machines.
        model.cuda()

    best_epoch = 0
    if args.evaluate == 0:
        best_epoch = train(args, train_loader, model, model_path)
    evaluate(args, model, model_path, train_loader, output_file,
             best_epoch=best_epoch, best_or_final='best')
    evaluate(args, model, model_path, train_loader, output_file,
             best_epoch=best_epoch, best_or_final='final')
def main():
    """Run NASE architecture search: build data, construct the search model
    and its :class:`Architect`, then train and evaluate.

    Relies on the module-level ``args`` namespace and the project helpers
    ``build_data``/``init_embeddings``/``Corpus``/``train``/``evaluate``
    plus the ``KG_search`` model and ``Architect`` classes.
    """
    # Resolve dataset-specific input/output directories.
    args.data_dir = os.path.join(args.data_dir, args.dataset)
    args.output_dir = os.path.join(args.output_dir, args.dataset)
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        print("Output directory ({}) already exists and is not empty.".format(
            args.output_dir))
    else:
        os.makedirs(args.output_dir, exist_ok=True)

    CUDA = torch.cuda.is_available()
    if CUDA:
        print("using CUDA")

    # Seed every RNG in play for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    print("args = ", args)

    train_data, validation_data, test_data, entity2id, relation2id = \
        build_data(args.data_dir)

    if args.pretrained_emb:
        entity_embeddings, relation_embeddings = init_embeddings(
            os.path.join(args.data_dir, 'entity2vec.txt'),
            os.path.join(args.data_dir, 'relation2vec.txt'),
            args.k_factors, args.embedding_size)
        print("Initialised relations and entities from TransE")
    else:
        entity_embeddings = np.random.randn(len(entity2id),
                                            args.embedding_size)
        relation_embeddings = np.random.randn(len(relation2id),
                                              args.embedding_size)
        print("Initialised relations and entities randomly")
    entity_embeddings = torch.FloatTensor(entity_embeddings)
    relation_embeddings = torch.FloatTensor(relation_embeddings)
    print("Initial entity dimensions {} , relation dimensions {}".format(
        entity_embeddings.size(), relation_embeddings.size()))

    train_loader = Corpus(args, train_data, validation_data, test_data,
                          entity2id, relation2id, args.batch_size,
                          args.valid_invalid_ratio)

    # Encode the hyper-parameters in the run name (same layout as before:
    # "search_<model>" then "<label>_<value>" segments joined with "_").
    name_parts = [
        ("embedding_size", args.embedding_size),
        ("lr", args.lr),
        ("epochs", args.epochs),
        ("batch_size", args.batch_size),
        ("dropout", args.dropout),
        ("step_size", args.step_size),
        ("layers", args.layers),
        ("margin", args.margin),
    ]
    file_name = "_".join(["search_{}".format(args.model_name)]
                         + ["{}_{}".format(k, v) for k, v in name_parts])

    model_path = os.path.join(args.output_dir, file_name)
    output_file = os.path.join(args.output_dir,
                               "results_" + file_name + ".txt")
    # exist_ok avoids the check-then-create race of the original.
    os.makedirs(model_path, exist_ok=True)

    if args.model_name != 'NASE':
        # Fail fast instead of printing and later crashing on an undefined
        # ``model``.
        raise ValueError("no such model name: {}".format(args.model_name))
    model = KG_search(entity_embeddings, relation_embeddings, config=args)

    if args.load != 'None':
        model.load_state_dict(torch.load(args.load))
        print("model loaded")
    if CUDA:
        # Original called model.cuda() unconditionally, which crashes on
        # CPU-only machines.
        model.cuda()

    architect = Architect(model, args)

    # Builtin sum() replaces np.sum() over a generator, which is deprecated
    # and ill-defined; parameter count reported in millions.
    cnt_params = sum(np.prod(v.size())
                     for name, v in model.named_parameters()
                     if "auxiliary" not in name) / 1e6
    print("param size = ", cnt_params, "MB")

    # Un-freeze any parameter that was accidentally left frozen.
    for name, param in model.named_parameters():
        if not param.requires_grad:
            print("name", name)
            param.requires_grad = True

    best_epoch = 0
    if args.evaluate == 0:
        best_epoch = train(args, train_loader, model, model_path, architect)
    evaluate(args, model, model_path, train_loader, output_file,
             best_epoch=best_epoch, best_or_final='best')
    evaluate(args, model, model_path, train_loader, output_file,
             best_epoch=best_epoch, best_or_final='final')
def main():
    """Train and evaluate a KG-embedding model (ConvKB/TransE/DisenE
    variants) on a single dataset.

    Relies on the module-level ``args`` namespace and the project helpers
    ``build_data``/``init_embeddings``/``Corpus``/``train``/``evaluate``
    plus the model classes.
    """
    # Resolve dataset-specific input/output directories.
    args.data_dir = os.path.join(args.data_dir, args.dataset)
    args.output_dir = os.path.join(args.output_dir, args.dataset)
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        print("Output directory ({}) already exists and is not empty.".format(
            args.output_dir))
    else:
        os.makedirs(args.output_dir, exist_ok=True)

    CUDA = torch.cuda.is_available()

    # Seed every RNG in play for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    print("args = ", args)

    train_data, validation_data, test_data, entity2id, relation2id = \
        build_data(args.data_dir)

    if args.pretrained_emb:
        entity_embeddings, relation_embeddings = init_embeddings(
            os.path.join(args.data_dir, 'entity2vec.txt'),
            os.path.join(args.data_dir, 'relation2vec.txt'),
            args.k_factors, args.embedding_size)
        print("Initialised relations and entities from TransE")
    else:
        # Entities get k_factors-wide embeddings; relations a single factor.
        entity_embeddings = np.random.randn(
            len(entity2id), args.embedding_size * args.k_factors)
        relation_embeddings = np.random.randn(len(relation2id),
                                              args.embedding_size)
        print("Initialised relations and entities randomly")
    entity_embeddings = torch.FloatTensor(entity_embeddings)
    relation_embeddings = torch.FloatTensor(relation_embeddings)
    print("Initial entity dimensions {} , relation dimensions {}".format(
        entity_embeddings.size(), relation_embeddings.size()))

    train_loader = Corpus(args, train_data, validation_data, test_data,
                          entity2id, relation2id, args.batch_size,
                          args.valid_invalid_ratio)

    # Encode the hyper-parameters in the run name (same layout as before:
    # "<label>_<value>" segments joined with "_").
    name_parts = [
        ("model_name", args.model_name),
        ("embedding_size", args.embedding_size),
        ("k_factors", args.k_factors),
        ("lr", args.lr),
        ("epochs", args.epochs),
        ("out_channels", args.out_channels),
        ("batch_size", args.batch_size),
        ("dropout", args.dropout),
        ("pretrained_emb", args.pretrained_emb),
        ("step_size", args.step_size),
        ("gamma", args.gamma),
        ("w1", args.w1),
        ("w2", args.w2),
        ("sample_num", args.sample_num),
        ("top_n", args.top_n),
    ]
    file_name = "_".join("{}_{}".format(k, v) for k, v in name_parts)

    model_path = os.path.join(args.output_dir, file_name)
    output_file = os.path.join(args.output_dir,
                               "results_" + file_name + ".txt")
    # exist_ok avoids the check-then-create race of the original.
    os.makedirs(model_path, exist_ok=True)

    model_classes = {'ConvKB': ConvKB, 'TransE': TransE,
                     'DisenE': DisenE, 'DisenE_Trans': DisenE_Trans}
    if args.model_name not in model_classes:
        # Fail fast instead of printing and later crashing on an undefined
        # ``model``.
        raise ValueError("no such model name: {}".format(args.model_name))
    model = model_classes[args.model_name](
        entity_embeddings, relation_embeddings, config=args)

    if args.load != 'None':
        model.load_state_dict(torch.load(args.load))
        print("model loaded")
    if CUDA:
        print("using CUDA")
        model.cuda()

    best_epoch = 0
    if args.evaluate == 0:
        best_epoch = train(args, train_loader, model, CUDA, model_path)
    evaluate(args, model, model_path, train_loader, output_file,
             best_epoch=best_epoch, best_or_final='best')
    evaluate(args, model, model_path, train_loader, output_file,
             best_epoch=best_epoch, best_or_final='final')
default='../data/', help='path to corpus dir') parser.add_argument('-src', '--corpus-name', type=str, default='data.txt', help='path to corpus data') parser.add_argument('--save-dir', type=str, default='./data/', help='path to save processed data') parser.add_argument('--pre-w2v', type=str, default='../data/w2v') args = parser.parse_args() args.corpus_data = args.corpus_dir + args.corpus_name corpus = Corpus(args.corpus_data, args.pre_w2v, args.save_dir, train_dev=0) dl = DataLoader(args.save_dir, batch_size=128, train_dev=0)() # dl_train, dl_test = train_test_split(dl, test_size=0.33) pre_w2v = torch.load(args.save_dir + 'pre_w2v') pre_w2v = torch.Tensor(pre_w2v).to(device) model_ckpt = torch.load(os.path.join( args.save_dir, '{}.pyt'.format("Transformer_NER_best")), map_location=torch.device(device)) config = load_obj(args.save_dir + 'Config.json') model = Transformer_Mix(config, pre_w2v).to(device) model.load_state_dict(model_ckpt['model']) # pred_tags = [] # true_tags = []