def loadData(dataname, treeDic, fold_x_train, fold_x_test, droprate):
    # `cwd` and `GraphDataset` are assumed to be defined at module level.
    data_path = os.path.join(cwd, 'data', dataname + 'graph')
    print("loading train set")
    traindata_list = GraphDataset(fold_x_train, treeDic, droprate=droprate,
                                  data_path=data_path)
    print("train no:", len(traindata_list))
    print("loading test set")
    testdata_list = GraphDataset(fold_x_test, treeDic, data_path=data_path)
    print("test no:", len(testdata_list))
    return traindata_list, testdata_list
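# Note: droprate is forwarded only for the training split above, which is
# consistent with edge-dropout-style augmentation applied at load time; how
# GraphDataset uses it internally is an assumption. A minimal sketch of such
# edge dropping (the drop_edges helper below is hypothetical):
import random

def drop_edges(edge_index, droprate):
    # Keep each edge independently with probability (1 - droprate).
    src, dst = edge_index
    kept = [i for i in range(len(src)) if random.random() >= droprate]
    return [src[i] for i in kept], [dst[i] for i in kept]

src, dst = drop_edges(([0, 1, 2, 3], [1, 2, 3, 0]), droprate=0.2)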
def predict(args):
    model_config, optimizer_config, _ = Config.from_json(args.config)
    model_name = model_config.name
    model_class = getattr(models, model_name)
    if model_config.init_weight_path is None:
        model_config.init_weight = None
    else:
        model_config.init_weight = t.from_numpy(
            pickle.load(open(model_config.init_weight_path, 'rb'))).float()
    phase = 'test'
    fea_filename = os.path.join(args.data, '{}.fea'.format(phase))
    pos_filename = os.path.join(args.data, '{}.pos'.format(phase))
    fea_file = open(fea_filename, 'rb')
    with open(pos_filename, 'r') as f:
        positions = [int(v.strip()) for v in f]
    dataset = GraphDataset(fea_file, positions)
    dataloader = t.utils.data.DataLoader(dataset, batch_size=args.batch_size,
                                         shuffle=False, collate_fn=collect_single,
                                         num_workers=1)
    kfold_models = []
    args.models = ['best'] * FOLD if not args.models else args.models
    for i, model_name in enumerate(args.models):
        model = model_class(**model_config.values)
        ckpt_file = os.path.join(args.save_dir, f'fold{i}', f'model.{model_name}.pt.tar')
        if os.path.isfile(ckpt_file):
            load_ckpt(ckpt_file, model)
        else:
            raise Exception("No such path {}".format(ckpt_file))
        if args.cuda:
            model = model.cuda()
        model.eval()
        kfold_models.append(model)
    curr_preds = defaultdict(set)
    pbar = tqdm(dataloader)
    for data in pbar:
        with t.no_grad():
            results = infer(data, kfold_models, args.cuda)
        for key in results:
            curr_preds[key].update(results[key])
    idxs = []
    entities = []
    for key in curr_preds:
        # print(key)
        idxs.append(key)
        curr_preds[key].discard('')  # discard() avoids a KeyError when '' is absent
        entities.append(';'.join([v for v in curr_preds[key] if len(v) > 1]))
    preds = pd.DataFrame({'id': idxs, 'unknownEntities': entities})
    preds.to_csv(os.path.join(args.save_dir, 'submit.csv'), index=False)
def predict(args):
    # vocabs = load_vocab(args.vocab)
    # inv_vocabs = {v: k for k, v in vocabs.items()}
    phase = 'test'
    fea_filename = os.path.join(args.data, '{}.fea'.format(phase))
    pos_filename = os.path.join(args.data, '{}.pos'.format(phase))
    fea_file = open(fea_filename, 'rb')
    with open(pos_filename, 'r') as f:
        positions = [int(v.strip()) for v in f]
    dataset = GraphDataset(fea_file, positions)
    dataloader = t.utils.data.DataLoader(dataset, batch_size=args.batch_size,
                                         shuffle=False, collate_fn=collect_single,
                                         num_workers=1)
    model_list = []
    for name in args.names.split(','):
        config_path = os.path.join('outputs', name, 'model_config.json')
        model_config, optimizer_config, _ = Config.from_json(config_path)
        model_name = model_config.name
        model_class = getattr(models, model_name)
        model = model_class(**model_config.values)
        ckpt_file = os.path.join('outputs', name, 'model.best.pt.tar')
        if os.path.isfile(ckpt_file):
            load_ckpt(ckpt_file, model)
        else:
            raise Exception("No such path {}".format(ckpt_file))
        if args.cuda:
            model = model.cuda()
        model.eval()
        model_list.append(model)
    curr_preds = defaultdict(set)
    pbar = tqdm(dataloader)
    for data in pbar:
        with t.no_grad():
            results = infer(data, model_list, args)
        for key in results:
            curr_preds[key].update(results[key])
    idxs = []
    entities = []
    for key in curr_preds:
        # print(key)
        idxs.append(key)
        curr_preds[key].discard('')  # discard() avoids a KeyError when '' is absent
        entities.append(';'.join([v for v in curr_preds[key] if len(v) > 1]))
    preds = pd.DataFrame({'id': idxs, 'unknownEntities': entities})
    preds.to_csv(args.save_name, index=False)
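# Both predict() variants above merge per-id entity predictions across models
# by set union, then serialize them with ';'. A minimal self-contained sketch
# of that merge (the fold_outputs data below is made up for illustration):
from collections import defaultdict

fold_outputs = [
    {'a1': {'Acme', ''}, 'a2': {'Beta'}},   # hypothetical fold-0 predictions
    {'a1': {'Acme', 'Apex'}},               # hypothetical fold-1 predictions
]
merged = defaultdict(set)
for results in fold_outputs:
    for key, ents in results.items():
        merged[key].update(ents)
for key in merged:
    merged[key].discard('')  # safe even when the empty marker is absent
    print(key, ';'.join(v for v in merged[key] if len(v) > 1))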
def construct_dataset(self, data):
    return GraphDataset(data)
import dgl
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.model_selection import KFold as kfold
import torch as th

import multifractal


def collate(samples):
    # The input `samples` is a list of (graph, label) pairs.
    graphs, labels = map(list, zip(*samples))
    batched_graph = dgl.batch(graphs)
    return batched_graph, th.tensor(labels)  # use the `th` alias imported above


# GraphDataset is assumed to be defined or imported elsewhere in this module.
dataset = GraphDataset()
graph, label = dataset[0]
print(label)
# embed = nn.Embedding()


def nodeFeatures(g, types):
    # g = dgl.add_self_loop(g)
    # graph = dgl.DGLGraph.to_networkx(g)
    if types == "simple":
        return g.in_degrees()
    elif types == "weight":
        return dgl.khop_adj(g, 1)  # g.ndata['w']
    elif types == "multifractal":
        return multifractal.multifractal(g)


class GCNClassifier(nn.Module):
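# The collate function above batches DGL graphs with dgl.batch so a standard
# DataLoader can serve graph minibatches. A runnable sketch with toy graphs
# (the edge lists below are made up for illustration):
import dgl
import torch
from torch.utils.data import DataLoader

def collate_pairs(samples):
    graphs, labels = map(list, zip(*samples))
    return dgl.batch(graphs), torch.tensor(labels)

pairs = [(dgl.graph(([0, 1], [1, 2])), 0), (dgl.graph(([0], [1])), 1)]
loader = DataLoader(pairs, batch_size=2, collate_fn=collate_pairs)
batched, labels = next(iter(loader))
print(batched.batch_size, labels)  # 2 tensor([0, 1])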
def train(args):
    Log = log_info(os.path.join(args.save_dir, 'process.info'))
    Log(args)
    model_config, optimizer_config, scheduler_config = Config.from_json(args.config)
    model_name = model_config.name
    model_class = getattr(models, model_name)
    if model_config.init_weight_path is None:
        model_config.init_weight = None
    else:
        model_config.init_weight = t.from_numpy(
            pickle.load(open(model_config.init_weight_path, 'rb'))).float()
    model = model_class(**model_config.values)
    phase = 'dev'
    dataloaders = {}
    datasets = {}
    collate_fn = collect_single
    fea_filename = os.path.join(args.data, '{}.fea'.format(phase))
    pos_filename = os.path.join(args.data, '{}.pos'.format(phase))
    fea_file = open(fea_filename, 'rb')
    with open(pos_filename, 'r') as f:
        positions = [int(v.strip()) for v in f]
    dataset = GraphDataset(fea_file, positions)
    dataloader = t.utils.data.DataLoader(dataset, batch_size=args.batch_size,
                                         shuffle=True, collate_fn=collate_fn,
                                         num_workers=1)
    dataloaders[phase] = dataloader
    datasets[phase] = dataset
    if model_config.freeze:
        for param in model.bert4pretrain.parameters():
            param.requires_grad = False
    optimizer_config.lr = optimizer_config.lr * args.lr_scale
    if hasattr(optim, optimizer_config.name):
        optimizer = getattr(optim, optimizer_config.name)(model.parameters(),
                                                          **optimizer_config.values)
        scheduler = getattr(optim.lr_scheduler, scheduler_config.name)(
            optimizer, **scheduler_config.values)
    else:
        t_total = len(dataloaders['dev']) * args.epoch
        # no_decay = ['bias', 'LayerNorm.weight']
        # optimizer_grouped_parameters = [
        #     {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
        #     {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        # ]
        optimizer = getattr(optimization, optimizer_config.name)(
            model.parameters(), **optimizer_config.values)
        scheduler = getattr(optimization, scheduler_config.name)(
            optimizer, t_total=t_total, **scheduler_config.values)
    ckpt_file = os.path.join(args.load_dir, 'model.best.pt.tar')
    if os.path.isfile(ckpt_file):
        load_ckpt(ckpt_file, model, optimizer, scheduler, args.cuda)
    else:
        raise Exception("No such path {}".format(ckpt_file))
    # pdb.set_trace()
    for epoch in range(1, 1 + args.epoch):
        model.train()
        pbar = tqdm(dataloaders[phase])
        pbar.set_description("[{} Epoch {}]".format(phase, epoch))
        running_loss = 0.
        running_size = 0.
        for data in pbar:
            optimizer.zero_grad()
            size, loss = infer(data, model, args.cuda)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            running_size += size
            pbar.set_postfix(mean_loss=running_loss / running_size)
        save_ckpt(os.path.join(args.save_dir, 'model.best.pt.tar'), epoch,
                  model.state_dict(), optimizer.state_dict(), scheduler.state_dict())
#%%
if __name__ == "__main__":
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    # hyper parameters
    # train_data = GraphDataset(TRAIN_DIR)
    # val_data = GraphDataset(VAL_DIR)
    # if small_dataset:
    #     train_loader = DataLoader(train_data[:1000], batch_size=batch_size)
    #     val_loader = DataLoader(val_data[:200], batch_size=batch_size)
    # else:
    #     train_loader = DataLoader(train_data, batch_size=batch_size)
    #     val_loader = DataLoader(val_data, batch_size=batch_size)
    test_data = GraphDataset(TEST_DIR)
    test_loader = DataLoader(test_data, batch_size=batch_size)

    model = HGNN(in_channels, out_channels)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=decay_lr_every,
                                          gamma=decay_lr_factor)
    if checkpoint_dir:
        load_checkpoint(checkpoint_dir, model)
    model = model.to(device)

    norm_centers_dict = None
    with open(NORM_CENTERS_DICT_DIR, 'rb') as f:
        norm_centers_dict = pickle.load(f)
    norm_centers_ls = sorted(norm_centers_dict.items())
# %%
if __name__ == "__main__":
    epochs = 100
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    batch_size = 2
    decay_lr_factor = 0.9
    decay_lr_every = 10
    lr = 0.005
    in_channels, out_channels = 8, 60
    show_every = 10
    os.chdir('..')
    # get model
    model = HGNN(in_channels, out_channels).to(device)
    dataset = GraphDataset('.')
    data_iter = DataLoader(dataset, batch_size=batch_size)
    for data in data_iter:
        out = model(data)

# %%
'''
def get_data_path_ls(dir_):
    return [os.path.join(dir_, data_path) for data_path in os.listdir(dir_)]

# data preparation
DIR = 'input_data'
data_path_ls = get_data_path_ls(DIR)

# hyper parameters
epochs = 100
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.load_state_dict(state['state_dict'])
    optimizer.load_state_dict(state['optimizer'])
    print('model loaded from %s' % checkpoint_path)
    return state['end_epoch']  # the checkpoint dict, not the path string, holds the epoch


#%%
if __name__ == "__main__":
    # training envs
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    device = torch.device(f'cuda:{gpus[0]}' if torch.cuda.is_available() else 'cpu')

    # prepare data
    train_data = GraphDataset(TRAIN_DIR).shuffle()
    val_data = GraphDataset(VAL_DIR)
    if small_dataset:
        train_loader = DataListLoader(train_data[:1000], batch_size=batch_size, shuffle=True)
        val_loader = DataListLoader(val_data[:200], batch_size=batch_size)
    else:
        train_loader = DataListLoader(train_data, batch_size=batch_size, shuffle=True)
        val_loader = DataListLoader(val_data, batch_size=batch_size)

    model = HGNN(in_channels, out_channels)
    model = nn.DataParallel(model, device_ids=gpus, output_device=gpus[0])
    model = model.to(device=device)
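# The loader above expects a checkpoint dict with 'state_dict', 'optimizer',
# and 'end_epoch' keys. A save counterpart consistent with those keys could
# look like this (save_checkpoint is a hypothetical helper, not from the source):
import torch
import torch.nn as nn
import torch.optim as optim

def save_checkpoint(path, model, optimizer, end_epoch):
    torch.save({'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'end_epoch': end_epoch}, path)

model = nn.Linear(4, 2)
optimizer = optim.Adam(model.parameters())
save_checkpoint('ckpt.pt.tar', model, optimizer, end_epoch=3)
state = torch.load('ckpt.pt.tar')
model.load_state_dict(state['state_dict'])
print(state['end_epoch'])  # 3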
def train(args):
    Log = log_info(os.path.join(args.save_dir, 'process{}.info'.format(args.fold)))
    Log(args)
    model_config, optimizer_config, scheduler_config = Config.from_json(args.config)
    model_name = model_config.name
    model_class = getattr(models, model_name)
    Log(model_config.values)
    model = model_class(**model_config.values)
    if os.path.exists(args.ckpt_path):
        load_ckpt(args.ckpt_path, model)
    dataloaders = {}
    datasets = {}
    sampler = None
    collate_fn = collect_single
    phases = ['train']
    if args.do_eval:
        phases.append('dev')
    if args.do_test:
        phases.append('test')
    for phase in phases:
        if phase != 'test' and args.fold:
            fea_filename = os.path.join(args.data, 'fold{}'.format(args.fold), '{}.fea'.format(phase))
            pos_filename = os.path.join(args.data, 'fold{}'.format(args.fold), '{}.pos'.format(phase))
        else:
            fea_filename = os.path.join(args.data, '{}.fea'.format(phase))
            pos_filename = os.path.join(args.data, '{}.pos'.format(phase))
        fea_file = open(fea_filename, 'rb')
        with open(pos_filename, 'r') as f:
            positions = [int(v.strip()) for v in f]
        dataset = GraphDataset(fea_file, positions)
        if args.multi_gpu and phase == 'train':
            sampler = t.utils.data.RandomSampler(dataset)
            dataloader = t.utils.data.DataLoader(dataset, batch_size=args.batch_size,
                                                 shuffle=False, collate_fn=collate_fn,
                                                 sampler=sampler, num_workers=1)
        else:
            dataloader = t.utils.data.DataLoader(dataset, batch_size=args.batch_size,
                                                 shuffle=(phase == 'train'),
                                                 collate_fn=collate_fn, num_workers=1)
        dataloaders[phase] = dataloader
        datasets[phase] = dataset
    if args.multi_gpu:
        args.n_gpu = t.cuda.device_count()
        model = model.cuda()
        model = t.nn.DataParallel(model)
    elif args.cuda:
        args.n_gpu = 1
        model = model.cuda()
    bert_parameters = list(map(id, model.bert4pretrain.parameters()))
    other_parameters = filter(lambda p: id(p) not in bert_parameters, model.parameters())
    if hasattr(optim, optimizer_config.name):
        optimizer = getattr(optim, optimizer_config.name)([
            {'params': other_parameters, 'lr': optimizer_config.lr * args.scale_rate},
            {'params': model.bert4pretrain.parameters()}
        ], **optimizer_config.values)
        scheduler = getattr(optim.lr_scheduler, scheduler_config.name)(
            optimizer, **scheduler_config.values)
    else:
        t_total = len(dataloaders['train']) * args.epoch
        # no_decay = ['bias', 'LayerNorm.weight']
        # optimizer_grouped_parameters = [
        #     {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
        #     {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        # ]
        optimizer = getattr(optimization, optimizer_config.name)([
            {'params': other_parameters, 'lr': optimizer_config.lr * args.scale_rate},
            {'params': model.bert4pretrain.parameters()}
        ], **optimizer_config.values)
        scheduler = getattr(optimization, scheduler_config.name)(
            optimizer, t_total=t_total, **scheduler_config.values)
    if not os.path.isdir(args.save_dir):
        os.mkdir(args.save_dir)
    # pdb.set_trace()
    if args.log:
        writer = SummaryWriter(os.path.join(args.save_dir, 'logs'))
    else:
        writer = None
    pre_fn, step_fn, post_fn = tm.acc_metric_builder(args, scheduler_config, model,
                                                     optimizer, scheduler, writer, Log)
    for epoch in range(1, 1 + args.epoch):
        for phase in phases:
            pre_fn()
            if phase == 'train':
                model.train()
            else:
                model.eval()
            pbar = tqdm(dataloaders[phase])
            pbar.set_description("[{} Epoch {}]".format(phase, epoch))
            for data in pbar:
                optimizer.zero_grad()
                with t.set_grad_enabled(phase == 'train'):
                    result, loss = infer(data, model, args.cuda,
                                         is_evaluate=phase != 'train')
                    if args.multi_gpu and args.n_gpu > 1:
                        loss = loss.mean()
                    if phase == 'train':
                        loss.backward()
                        # t.nn.utils.clip_grad_norm_(model.parameters(), 7)
                        optimizer.step()
                step_fn(result, loss, pbar, phase)
            post_fn(phase, epoch)
    if args.log:
        writer.close()
    with open(os.path.join(args.save_dir, 'invalid_entities'), 'wb') as f:
        pickle.dump(tm.Invalid_entities, f)
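# The id-filtering above implements per-group learning rates: the non-BERT
# parameters get a scaled lr while bert4pretrain falls back to the
# optimizer-wide default. A minimal standalone sketch of the same trick
# (the two-part model below is hypothetical):
import torch.nn as nn
import torch.optim as optim

model = nn.ModuleDict({'encoder': nn.Linear(10, 10), 'head': nn.Linear(10, 2)})
encoder_ids = set(map(id, model['encoder'].parameters()))
head_params = [p for p in model.parameters() if id(p) not in encoder_ids]
optimizer = optim.Adam([
    {'params': head_params, 'lr': 1e-3 * 10},   # scaled-up head lr
    {'params': model['encoder'].parameters()},  # inherits the base lr below
], lr=1e-3)
print([g['lr'] for g in optimizer.param_groups])  # [0.01, 0.001]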
def predict(args):
    model_config, *_ = Config.from_json(args.config)
    model_name = model_config.name
    model_class = getattr(models, model_name)
    if model_config.init_weight_path is None:
        model_config.init_weight = None
    else:
        model_config.init_weight = t.from_numpy(
            pickle.load(open(model_config.init_weight_path, 'rb'))).float()
    if model_config.activation is None:
        pass
    elif model_config.activation == 'identical':
        model_config.activation = lambda v: v
    elif model_config.activation == 'gelu':
        model_config.activation = models.layers.activation.gelu
    else:
        model_config.activation = (getattr(t, model_config.activation, None)
                                   or getattr(F, model_config.activation, None))
    collate_fn = lambda batch: collect_multigraph(model_config.need_norm,
                                                  model_config.concat_ab, batch)
    phase = 'test'
    fea_filename = os.path.join(args.data, '{}.fea'.format(phase))
    tgt_filename = os.path.join(args.data, '{}.tgt'.format(phase))
    pos_filename = os.path.join(args.data, '{}.pos'.format(phase))
    fea_file = open(fea_filename, 'rb')
    with open(tgt_filename, 'r') as f:
        targets = [int(v.strip()) for v in f]
    with open(pos_filename, 'r') as f:
        positions = [int(v.strip()) for v in f]
    dataset = GraphDataset(fea_file, targets, positions)
    dataloader = t.utils.data.DataLoader(dataset, batch_size=args.batch_size,
                                         shuffle=False, collate_fn=collate_fn,
                                         num_workers=1)
    epoch = args.best_epoch
    total_proba = None
    model = model_class(**model_config.values)
    ckpt_file = os.path.join(args.save_dir, 'model.epoch{}.pt.tar'.format(epoch))
    if os.path.isfile(ckpt_file):
        load_ckpt(ckpt_file, model)
    else:
        raise Exception("No such path {}".format(ckpt_file))
    if args.cuda:
        model = model.cuda()
    model.eval()
    running_loss = 0.
    running_results = Counter()
    curr_proba = []
    pbar = tqdm(dataloader)
    for data in pbar:
        with t.no_grad():
            proba = infer(data, model, model_config.seq_len, args.cuda)
        curr_proba.append(proba)
    curr_proba = np.concatenate(curr_proba, axis=0)
    if total_proba is None:
        total_proba = curr_proba
    else:
        assert total_proba.shape == curr_proba.shape
        total_proba += curr_proba
    df = pd.DataFrame(data=total_proba, columns=['proba0', 'proba1'])
    predictions = total_proba.argmax(1)
    df['predictions'] = predictions
    df['targets'] = dataset.targets
    df.to_csv(os.path.join(args.save_dir, 'result.csv'))
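# total_proba above accumulates per-checkpoint class probabilities; since
# argmax is invariant under division by the number of models, summing is
# equivalent to averaging for prediction purposes. Illustrated with made-up
# probabilities:
import numpy as np

probas = [np.array([[0.2, 0.8], [0.6, 0.4]]),    # hypothetical model A
          np.array([[0.3, 0.7], [0.7, 0.3]])]    # hypothetical model B
total = np.zeros_like(probas[0])
for p in probas:
    total += p
print(total.argmax(axis=1))  # [1 0]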