Example #1
def loadData(dataname, treeDic, fold_x_train, fold_x_test, droprate):
    data_path = os.path.join(cwd, 'data', dataname + 'graph')
    print("loading train set", )
    traindata_list = GraphDataset(fold_x_train,
                                  treeDic,
                                  droprate=droprate,
                                  data_path=data_path)
    print("train no:", len(traindata_list))
    print("loading test set", )
    testdata_list = GraphDataset(fold_x_test, treeDic, data_path=data_path)
    print("test no:", len(testdata_list))
    return traindata_list, testdata_list
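
A hedged invocation sketch for loadData above; the dataset name, treeDic and the fold index lists are assumptions standing in for values prepared elsewhere in the project:

# Hypothetical call: 'Twitter15', treeDic, fold_x_train and fold_x_test are placeholders.
train_list, test_list = loadData('Twitter15', treeDic,
                                 fold_x_train, fold_x_test, droprate=0.2)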
Example #2
def predict(args):
    model_config, optimizer_config, _ = Config.from_json(args.config)
    model_name = model_config.name
    model_class = getattr(models, model_name)

    if model_config.init_weight_path is None:
        model_config.init_weight = None
    else:
        model_config.init_weight = t.from_numpy(pickle.load(open(model_config.init_weight_path, 'rb'))).float()

    phase = 'test'
    fea_filename = os.path.join(args.data, '{}.fea'.format(phase))
    pos_filename = os.path.join(args.data, '{}.pos'.format(phase))
    fea_file = open(fea_filename, 'rb')
    with open(pos_filename, 'r') as f:
        positions = [int(v.strip()) for v in f]
    dataset = GraphDataset(fea_file, positions)
    dataloader = t.utils.data.DataLoader(dataset, batch_size=args.batch_size,
                                         shuffle=False, collate_fn=collect_single,
                                         num_workers=1)

    kfold_models = []
    args.models = ['best'] * FOLD if not args.models else args.models
    for i, model_name in enumerate(args.models):
        model = model_class(**model_config.values)
        ckpt_file = os.path.join(args.save_dir, f'fold{i}', f'model.{model_name}.pt.tar')
        if os.path.isfile(ckpt_file):
            load_ckpt(ckpt_file, model)
        else:
            raise Exception("No such path {}".format(ckpt_file))
        if args.cuda:
            model = model.cuda()
        model.eval()
        kfold_models.append(model)

    curr_preds = defaultdict(set)
    pbar = tqdm(dataloader)
    for data in pbar:
        with t.no_grad():
            results = infer(data, kfold_models, args.cuda)
            for key in results:
                curr_preds[key].update(results[key])
    idxs = []
    entities = []
    for key in curr_preds:
        print(key)
        idxs.append(key)
        curr_preds[key].discard('')  # discard avoids KeyError when '' is absent
        entities.append(';'.join([v for v in curr_preds[key] if len(v) > 1]))
    preds = pd.DataFrame({'id': idxs, 'unknownEntities': entities})
    preds.to_csv(os.path.join(args.save_dir, 'submit.csv'), index=False)
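
One plausible command-line wiring for this k-fold predictor; every flag name below is an assumption, not taken from the source:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--config', required=True)
parser.add_argument('--data', required=True)
parser.add_argument('--save_dir', required=True)
parser.add_argument('--batch_size', type=int, default=32)
parser.add_argument('--cuda', action='store_true')
parser.add_argument('--models', nargs='*', default=None)  # empty -> ['best'] * FOLD above
predict(parser.parse_args())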
Example #3
def predict(args):
    # vocabs = load_vocab(args.vocab)
    # inv_vocabs = {v: k for k, v in vocabs.items()}
    phase = 'test'
    fea_filename = os.path.join(args.data, '{}.fea'.format(phase))
    pos_filename = os.path.join(args.data, '{}.pos'.format(phase))
    fea_file = open(fea_filename, 'rb')
    with open(pos_filename, 'r') as f:
        positions = [int(v.strip()) for v in f]
    dataset = GraphDataset(fea_file, positions)
    dataloader = t.utils.data.DataLoader(dataset,
                                         batch_size=args.batch_size,
                                         shuffle=False,
                                         collate_fn=collect_single,
                                         num_workers=1)

    model_list = []
    for name in args.names.split(','):
        config_path = os.path.join('outputs', name, 'model_config.json')
        model_config, optimizer_config, _ = Config.from_json(config_path)
        model_name = model_config.name
        model_class = getattr(models, model_name)

        model = model_class(**model_config.values)
        ckpt_file = os.path.join('outputs', name, 'model.best.pt.tar')
        if os.path.isfile(ckpt_file):
            load_ckpt(ckpt_file, model)
        else:
            raise Exception("No such path {}".format(ckpt_file))
        if args.cuda:
            model = model.cuda()
        model.eval()
        model_list.append(model)
    curr_preds = defaultdict(set)
    pbar = tqdm(dataloader)
    for data in pbar:
        with t.no_grad():
            results = infer(data, model_list, args)
            for key in results:
                curr_preds[key].update(results[key])
    idxs = []
    entities = []
    for key in curr_preds:
        # print(key)
        idxs.append(key)
        curr_preds[key].discard('')  # discard avoids KeyError when '' is absent
        entities.append(';'.join([v for v in curr_preds[key] if len(v) > 1]))
    preds = pd.DataFrame({'id': idxs, 'unknownEntities': entities})
    preds.to_csv(args.save_name, index=False)
Example #4
def construct_dataset(self, data):
    return GraphDataset(data)
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.model_selection import KFold as kfold
import torch as th
import multifractal


def collate(samples):
    # The input `samples` is a list of pairs
    #  (graph, label).
    graphs, labels = map(list, zip(*samples))
    batched_graph = dgl.batch(graphs)
    return batched_graph, torch.tensor(labels)

dataset = GraphDataset()
graph, label = dataset[0]
print(label)
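
A minimal batching sketch for the collate function above, assuming the dataset yields (DGLGraph, label) pairs; the batch size is arbitrary:

loader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate)
for batched_graph, labels in loader:
    # batched_graph is one dgl.batch-ed graph; labels is a 1-D tensor
    break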
#embed = nn.Embedding()

def nodeFeatures(g, types):
    # g = dgl.add_self_loop(g)
    # graph = dgl.DGLGraph.to_networkx(g)
    if types == "simple":
        return g.in_degrees()
    elif types == "weight":
        return dgl.khop_adj(g, 1)  # alternatively: g.ndata['w']
    elif types == "multifractal":
        return multifractal.multifractal(g)
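
A quick usage sketch for nodeFeatures on a toy graph (dgl.rand_graph builds a random graph for illustration):

g = dgl.rand_graph(10, 30)               # 10 nodes, 30 random edges
deg_feats = nodeFeatures(g, "simple")    # in-degrees, shape (10,)
adj_feats = nodeFeatures(g, "weight")    # dense 1-hop adjacency, shape (10, 10)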

class GCNClassifier(nn.Module):
    ...  # class body truncated in the source listing

def train(args):
    Log = log_info(os.path.join(args.save_dir, 'process.info'))
    Log(args)
    model_config, optimizer_config, scheduler_config = Config.from_json(
        args.config)
    model_name = model_config.name
    model_class = getattr(models, model_name)

    if model_config.init_weight_path is None:
        model_config.init_weight = None
    else:
        model_config.init_weight = t.from_numpy(
            pickle.load(open(model_config.init_weight_path, 'rb'))).float()

    model = model_class(**model_config.values)

    phase = 'dev'
    dataloaders = {}
    datasets = {}
    collate_fn = collect_single
    fea_filename = os.path.join(args.data, '{}.fea'.format(phase))
    pos_filename = os.path.join(args.data, '{}.pos'.format(phase))
    fea_file = open(fea_filename, 'rb')
    with open(pos_filename, 'r') as f:
        positions = [int(v.strip()) for v in f]
    dataset = GraphDataset(fea_file, positions)
    dataloader = t.utils.data.DataLoader(dataset,
                                         batch_size=args.batch_size,
                                         shuffle=True,
                                         collate_fn=collate_fn,
                                         num_workers=1)
    dataloaders[phase] = dataloader
    datasets[phase] = dataset

    if model_config.freeze:
        for param in model.bert4pretrain.parameters():
            param.requires_grad = False
    optimizer_config.lr = optimizer_config.lr * args.lr_scale
    if hasattr(optim, optimizer_config.name):
        optimizer = getattr(optim,
                            optimizer_config.name)(model.parameters(),
                                                   **optimizer_config.values)
        scheduler = getattr(optim.lr_scheduler,
                            scheduler_config.name)(optimizer,
                                                   **scheduler_config.values)
    else:
        t_total = len(dataloaders['dev']) * args.epoch
        # no_decay = ['bias', 'LayerNorm.weight']
        # optimizer_grouped_parameters = [
        #     {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
        #     {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        #     ]
        optimizer = getattr(optimization,
                            optimizer_config.name)(model.parameters(),
                                                   **optimizer_config.values)
        scheduler = getattr(optimization,
                            scheduler_config.name)(optimizer,
                                                   t_total=t_total,
                                                   **scheduler_config.values)

    ckpt_file = os.path.join(args.load_dir, 'model.best.pt.tar')
    if os.path.isfile(ckpt_file):
        load_ckpt(ckpt_file, model, optimizer, scheduler, args.cuda)
    else:
        raise Exception("No such path {}".format(ckpt_file))

    # pdb.set_trace()
    for epoch in range(1, 1 + args.epoch):
        model.train()

        pbar = tqdm(dataloaders[phase])
        pbar.set_description("[{} Epoch {}]".format(phase, epoch))
        running_loss = 0.
        running_size = 0.
        for data in pbar:
            optimizer.zero_grad()

            size, loss = infer(data, model, args.cuda)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            running_size += size
            pbar.set_postfix(mean_loss=running_loss / running_size)
    save_ckpt(os.path.join(args.save_dir, 'model.best.pt.tar'), epoch,
              model.state_dict(), optimizer.state_dict(),
              scheduler.state_dict())
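
save_ckpt and load_ckpt are not shown in this listing; a minimal sketch consistent with how they are called here, using the 'state_dict' / 'optimizer' / 'end_epoch' keys that Example #9 reads back:

def save_ckpt(path, epoch, model_state, optimizer_state, scheduler_state):
    t.save({'end_epoch': epoch, 'state_dict': model_state,
            'optimizer': optimizer_state, 'scheduler': scheduler_state}, path)

def load_ckpt(path, model, optimizer=None, scheduler=None, cuda=False):
    state = t.load(path, map_location='cuda' if cuda else 'cpu')
    model.load_state_dict(state['state_dict'])
    if optimizer is not None:
        optimizer.load_state_dict(state['optimizer'])
    if scheduler is not None:
        scheduler.load_state_dict(state['scheduler'])
    return state.get('end_epoch', 0)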
Example #7
#%%
if __name__ == "__main__":
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    # hyper parameters

    # train_data = GraphDataset(TRAIN_DIR)
    # val_data = GraphDataset(VAL_DIR)
    # if small_dataset:
    #     train_loader = DataLoader(train_data[:1000], batch_size=batch_size)
    #     val_loader = DataLoader(val_data[:200], batch_size=batch_size)
    # else:
    #     train_loader = DataLoader(train_data, batch_size=batch_size)
    #     val_loader = DataLoader(val_data, batch_size=batch_size)
    test_data = GraphDataset(TEST_DIR)
    test_loader = DataLoader(test_data, batch_size=batch_size)

    model = HGNN(in_channels, out_channels)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    scheduler = optim.lr_scheduler.StepLR(optimizer,
                                          step_size=decay_lr_every,
                                          gamma=decay_lr_factor)
    if checkpoint_dir:
        load_checkpoint(checkpoint_dir, model)
    model = model.to(device)  # move to device regardless of checkpoint loading

    norm_centers_dict = None
    with open(NORM_CENTERS_DICT_DIR, 'rb') as f:
        norm_centers_dict = pickle.load(f)
    norm_centers_ls = sorted(norm_centers_dict.items())
# %%
if __name__ == "__main__":
    epochs = 100
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    batch_size = 2
    decay_lr_factor = 0.9
    decay_lr_every = 10
    lr = 0.005
    in_channels, out_channels = 8, 60
    show_every = 10
    os.chdir('..')
    # get model
    model = HGNN(in_channels, out_channels).to(device)

    dataset = GraphDataset('.')
    data_iter = DataLoader(dataset, batch_size=batch_size)
    for data in data_iter:
        out = model(data)
# %%
    '''
    def get_data_path_ls(dir_):
        return [os.path.join(dir_, data_path) for data_path in os.listdir(dir_)]

    # data preparation
    DIR = 'input_data'
    data_path_ls = get_data_path_ls(DIR)

    # hyper parameters
    epochs = 100
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    '''
Example #9
    model.load_state_dict(state['state_dict'])
    optimizer.load_state_dict(state['optimizer'])
    print('model loaded from %s' % checkpoint_path)
    return state['end_epoch']


#%%
if __name__ == "__main__":
    # training envs
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    device = torch.device(
        f'cuda:{gpus[0]}' if torch.cuda.is_available() else 'cpu')

    # prepare data
    train_data = GraphDataset(TRAIN_DIR).shuffle()
    val_data = GraphDataset(VAL_DIR)
    if small_dataset:
        train_loader = DataListLoader(train_data[:1000],
                                      batch_size=batch_size,
                                      shuffle=True)
        val_loader = DataListLoader(val_data[:200], batch_size=batch_size)
    else:
        train_loader = DataListLoader(train_data,
                                      batch_size=batch_size,
                                      shuffle=True)
        val_loader = DataListLoader(val_data, batch_size=batch_size)

    model = HGNN(in_channels, out_channels)
    model = nn.DataParallel(model, device_ids=gpus, output_device=gpus[0])
    model = model.to(device=device)
Example #10
def train(args):
    Log = log_info(os.path.join(args.save_dir, 'process{}.info'.format(args.fold)))
    Log(args)
    model_config, optimizer_config, scheduler_config = Config.from_json(args.config)
    model_name = model_config.name
    model_class = getattr(models, model_name)
    Log(model_config.values)
    model = model_class(**model_config.values)
    if os.path.exists(args.ckpt_path):
        load_ckpt(args.ckpt_path, model)

    dataloaders = {}
    datasets = {}
    sampler = None
    collate_fn = collect_single
    phases = ['train']
    if args.do_eval:
        phases.append('dev')
    if args.do_test:
        phases.append('test')
    for phase in phases:
        if phase != 'test' and args.fold:
            fea_filename = os.path.join(args.data, 'fold{}'.format(args.fold), '{}.fea'.format(phase))
            pos_filename = os.path.join(args.data, 'fold{}'.format(args.fold), '{}.pos'.format(phase))
        else:
            fea_filename = os.path.join(args.data, '{}.fea'.format(phase))
            pos_filename = os.path.join(args.data, '{}.pos'.format(phase))
        fea_file = open(fea_filename, 'rb')
        with open(pos_filename, 'r') as f:
            positions = [int(v.strip()) for v in f]
        dataset = GraphDataset(fea_file, positions)
        if args.multi_gpu and phase == 'train':
            sampler = t.utils.data.RandomSampler(dataset)
            dataloader = t.utils.data.DataLoader(dataset, batch_size=args.batch_size,
                                                 shuffle=False, collate_fn=collate_fn,
                                                 sampler=sampler, num_workers=1)
        else:
            dataloader = t.utils.data.DataLoader(dataset, batch_size=args.batch_size,
                                                 shuffle=(phase == 'train'),
                                                 collate_fn=collate_fn, num_workers=1)
        dataloaders[phase] = dataloader
        datasets[phase] = dataset

    if args.multi_gpu:
        args.n_gpu = t.cuda.device_count()
        model = model.cuda()
        model = t.nn.DataParallel(model)
    elif args.cuda:
        args.n_gpu = 1
        model = model.cuda()

    # Unwrap DataParallel before accessing submodules such as bert4pretrain.
    base_model = model.module if args.multi_gpu else model
    bert_parameters = list(map(id, base_model.bert4pretrain.parameters()))
    other_parameters = filter(lambda p: id(p) not in bert_parameters, model.parameters())
    if hasattr(optim, optimizer_config.name):
        optimizer = getattr(optim, optimizer_config.name)([
            {'params': other_parameters, 'lr': optimizer_config.lr * args.scale_rate},
            {'params': base_model.bert4pretrain.parameters()}
        ], **optimizer_config.values)
        scheduler = getattr(optim.lr_scheduler, scheduler_config.name)(optimizer, **scheduler_config.values)
    else:
        t_total = len(dataloaders['train']) * args.epoch
        # no_decay = ['bias', 'LayerNorm.weight']
        # optimizer_grouped_parameters = [
        #     {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
        #     {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        #     ]
        optimizer = getattr(optimization, optimizer_config.name)([
            {'params': other_parameters, 'lr': optimizer_config.lr * args.scale_rate},
            {'params': base_model.bert4pretrain.parameters()}
        ], **optimizer_config.values)
        scheduler = getattr(optimization, scheduler_config.name)(optimizer, t_total=t_total, **scheduler_config.values)

    if not os.path.isdir(args.save_dir):
        os.mkdir(args.save_dir)

    # pdb.set_trace()
    if args.log:
        writer = SummaryWriter(os.path.join(args.save_dir, 'logs'))
    else:
        writer = None
    pre_fn, step_fn, post_fn = tm.acc_metric_builder(args, scheduler_config, model,
                                                        optimizer, scheduler, writer, Log)

    for epoch in range(1, 1+args.epoch):
        for phase in phases:
            pre_fn()
            if phase == 'train':
                model.train()
            else:
                model.eval()

            pbar = tqdm(dataloaders[phase])
            pbar.set_description("[{} Epoch {}]".format(phase, epoch))
            for data in pbar:
                optimizer.zero_grad()

                with t.set_grad_enabled(phase == 'train'):
                    result, loss = infer(data, model, args.cuda, is_evaluate=phase!='train')
                    if args.multi_gpu and args.n_gpu > 1:
                        loss = loss.mean()
                    if phase == 'train':
                        loss.backward()
                        # t.nn.utils.clip_grad_norm_(model.parameters(), 7)
                        optimizer.step()
                step_fn(result, loss, pbar, phase)
            post_fn(phase, epoch)
    if args.log:
        writer.close()
    with open(os.path.join(args.save_dir, 'invalid_entities'), 'wb') as f:
        pickle.dump(tm.Invalid_entities, f)
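
log_info is likewise external to this listing; one hedged sketch of a logger factory matching the Log(args) and Log(model_config.values) calls above:

def log_info(log_path):
    # Return a callable that echoes a message and appends it to log_path.
    def Log(msg):
        print(msg)
        with open(log_path, 'a') as f:
            f.write('{}\n'.format(msg))
    return Log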
Example #11
def predict(args):

    model_config, *_ = Config.from_json(args.config)
    model_name = model_config.name
    model_class = getattr(models, model_name)

    if model_config.init_weight_path is None:
        model_config.init_weight = None
    else:
        model_config.init_weight = t.from_numpy(pickle.load(open(model_config.init_weight_path, 'rb'))).float()

    if model_config.activation is None:
        pass
    elif model_config.activation == 'identical':
        model_config.activation = lambda v: v
    elif model_config.activation == 'gelu':
        model_config.activation = models.layers.activation.gelu
    else:
        model_config.activation = getattr(t, model_config.activation, None) or getattr(F, model_config.activation, None)

    collate_fn = lambda batch: collect_multigraph(model_config.need_norm, model_config.concat_ab, batch)

    phase = 'test'
    fea_filename = os.path.join(args.data, '{}.fea'.format(phase))
    tgt_filename = os.path.join(args.data, '{}.tgt'.format(phase))
    pos_filename = os.path.join(args.data, '{}.pos'.format(phase))
    fea_file = open(fea_filename, 'rb')
    with open(tgt_filename, 'r') as f:
        targets = [int(v.strip()) for v in f]
    with open(pos_filename, 'r') as f:
        positions = [int(v.strip()) for v in f]
    dataset = GraphDataset(fea_file, targets, positions)
    dataloader = t.utils.data.DataLoader(dataset, batch_size=args.batch_size,
                                         shuffle=False, collate_fn=collate_fn,
                                         num_workers=1)

    epoch = args.best_epoch
    total_proba = None
    model = model_class(**model_config.values)
    ckpt_file = os.path.join(args.save_dir, 'model.epoch{}.pt.tar'.format(epoch))
    if os.path.isfile(ckpt_file):
        load_ckpt(ckpt_file, model)
    else:
        raise Exception("No such path {}".format(ckpt_file))
    if args.cuda:
        model = model.cuda()

    model.eval()
    running_loss = 0.
    running_results = Counter()

    curr_proba = []
    pbar = tqdm(dataloader)
    for data in pbar:
        with t.no_grad():
            proba = infer(data, model, model_config.seq_len, args.cuda)
            curr_proba.append(proba)
    curr_proba = np.concatenate(curr_proba, axis=0)
    if total_proba is None:
        total_proba = curr_proba
    else:
        assert total_proba.shape == curr_proba.shape
        total_proba += curr_proba

    df = pd.DataFrame(data=total_proba, columns=['proba0', 'proba1'])
    predictions = total_proba.argmax(1)
    df['predictions'] = predictions
    df['targets'] = dataset.targets
    df.to_csv(os.path.join(args.save_dir, 'result.csv'))
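
The saved result.csv can be scored directly; a small follow-up sketch using the column names written above:

import pandas as pd

df = pd.read_csv('result.csv', index_col=0)
accuracy = (df['predictions'] == df['targets']).mean()
print('accuracy: {:.4f}'.format(accuracy))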