import os
import pickle
from collections import defaultdict

import numpy as np
import pandas as pd
import torch as t
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter  # the repo may use tensorboardX instead
from tqdm import tqdm

# Project-local names referenced below and assumed to come from the
# surrounding repo (not shown in these snippets): models, optimization,
# Config, MRCDataset, GraphDataset, collect_mrc, collect_single,
# collect_multigraph, infer, and the metrics module tm. Hedged sketches of a
# few other helpers (load_ckpt, save_ckpt, log_info) appear between examples.


def predict(args):
    # vocabs = load_vocab(args.vocab)
    # inv_vocabs = {v: k for k, v in vocabs.items()}
    model_config, optimizer_config, _ = Config.from_json(args.config)
    model_name = model_config.name
    model_class = getattr(models, model_name)

    if model_config.init_weight_path is None:
        model_config.init_weight = None
    else:
        model_config.init_weight = t.from_numpy(
            pickle.load(open(model_config.init_weight_path, 'rb'))).float()

    phase = 'test'
    fea_filename = os.path.join(args.data, '{}.fea'.format(phase))
    pos_filename = os.path.join(args.data, '{}.pos'.format(phase))
    fea_file = open(fea_filename, 'rb')
    with open(pos_filename, 'r') as f:
        positions = [int(v.strip()) for v in f]
    dataset = MRCDataset(fea_file, positions)
    dataloader = t.utils.data.DataLoader(dataset,
                                         batch_size=args.batch_size,
                                         shuffle=False,
                                         collate_fn=collect_mrc,
                                         num_workers=1)

    model = model_class(**model_config.values)
    ckpt_file = os.path.join(args.save_dir,
                             'model.{}.pt.tar'.format(args.model))
    if os.path.isfile(ckpt_file):
        load_ckpt(ckpt_file, model)
    else:
        raise FileNotFoundError("No such path {}".format(ckpt_file))
    if args.cuda:
        model = model.cuda()

    model.eval()
    curr_preds = defaultdict(set)
    pbar = tqdm(dataloader)
    for data in pbar:
        with t.no_grad():
            results = infer(data, model, args)
            for key in results:
                curr_preds[key].update(results[key])
    idxs = []
    entities = []
    for key in curr_preds:
        # print(key)
        idxs.append(key)
        curr_preds[key].discard('')  # discard() avoids KeyError when '' is absent
        entities.append(';'.join([v for v in curr_preds[key] if len(v) > 1]))
    preds = pd.DataFrame({'id': idxs, 'unknownEntities': entities})
    preds.to_csv(os.path.join(args.save_dir, 'submit.csv'), index=False)
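
# load_ckpt (used above) and save_ckpt (used in the train() examples below)
# are project-local helpers not shown in these snippets. A minimal sketch
# consistent with both call sites, assuming the checkpoint is a plain dict
# written by t.save; the repo's real helpers may store more fields:
def save_ckpt(path, epoch, model_state, optimizer_state, scheduler_state):
    t.save({'epoch': epoch,
            'model': model_state,
            'optimizer': optimizer_state,
            'scheduler': scheduler_state}, path)


def load_ckpt(path, model, optimizer=None, scheduler=None, cuda=False):
    ckpt = t.load(path, map_location='cuda' if cuda else 'cpu')
    model.load_state_dict(ckpt['model'])
    if optimizer is not None:
        optimizer.load_state_dict(ckpt['optimizer'])
    if scheduler is not None:
        scheduler.load_state_dict(ckpt['scheduler'])
    return ckpt.get('epoch')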

# Example #2

def predict(args):
    # vocabs = load_vocab(args.vocab)
    # inv_vocabs = {v: k for k, v in vocabs.items()}
    phase = 'test'
    fea_filename = os.path.join(args.data, '{}.fea'.format(phase))
    pos_filename = os.path.join(args.data, '{}.pos'.format(phase))
    fea_file = open(fea_filename, 'rb')
    with open(pos_filename, 'r') as f:
        positions = [int(v.strip()) for v in f]
    dataset = GraphDataset(fea_file, positions)
    dataloader = t.utils.data.DataLoader(dataset,
                                         batch_size=args.batch_size,
                                         shuffle=False,
                                         collate_fn=collect_single,
                                         num_workers=1)

    model_list = []
    for name in args.names.split(','):
        config_path = os.path.join('outputs', name, 'model_config.json')
        model_config, optimizer_config, _ = Config.from_json(config_path)
        model_name = model_config.name
        model_class = getattr(models, model_name)

        model = model_class(**model_config.values)
        ckpt_file = os.path.join('outputs', name, 'model.best.pt.tar')
        if os.path.isfile(ckpt_file):
            load_ckpt(ckpt_file, model)
        else:
            raise FileNotFoundError("No such path {}".format(ckpt_file))
        if args.cuda:
            model = model.cuda()
        model.eval()
        model_list.append(model)
    curr_preds = defaultdict(set)
    pbar = tqdm(dataloader)
    for data in pbar:
        with t.no_grad():
            results = infer(data, model_list, args)
            for key in results:
                curr_preds[key].update(results[key])
    idxs = []
    entities = []
    for key in curr_preds:
        # print(key)
        idxs.append(key)
        curr_preds[key].discard('')  # discard() avoids KeyError when '' is absent
        entities.append(';'.join([v for v in curr_preds[key] if len(v) > 1]))
    preds = pd.DataFrame({'id': idxs, 'unknownEntities': entities})
    preds.to_csv(args.save_name, index=False)
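
# In this variant infer() receives a list of models, i.e. a simple ensemble.
# A minimal sketch of what that wrapper could look like, purely illustrative
# and assuming a hypothetical single-model helper infer_one(data, model, args)
# that returns {example_id: set_of_entity_strings}:
def infer_ensemble(data, model_list, args):
    merged = defaultdict(set)
    for model in model_list:
        for key, ents in infer_one(data, model, args).items():
            merged[key].update(ents)  # union of entity sets across models
    return merged
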
def train(args):
    Log = log_info(os.path.join(args.save_dir, 'process.info'))
    Log(args)
    model_config, optimizer_config, scheduler_config = Config.from_json(
        args.config)
    model_name = model_config.name
    model_class = getattr(models, model_name)

    if model_config.init_weight_path is None:
        model_config.init_weight = None
    else:
        model_config.init_weight = t.from_numpy(
            pickle.load(open(model_config.init_weight_path, 'rb'))).float()

    model = model_class(**model_config.values)

    phase = 'dev'
    dataloaders = {}
    datasets = {}
    collate_fn = collect_single
    fea_filename = os.path.join(args.data, '{}.fea'.format(phase))
    pos_filename = os.path.join(args.data, '{}.pos'.format(phase))
    fea_file = open(fea_filename, 'rb')
    with open(pos_filename, 'r') as f:
        positions = [int(v.strip()) for v in f]
    dataset = GraphDataset(fea_file, positions)
    dataloader = t.utils.data.DataLoader(dataset,
                                         batch_size=args.batch_size,
                                         shuffle=True,
                                         collate_fn=collate_fn,
                                         num_workers=1)
    dataloaders[phase] = dataloader
    datasets[phase] = dataset

    if model_config.freeze:
        for param in model.bert4pretrain.parameters():
            param.requires_grad = False
    optimizer_config.lr = optimizer_config.lr * args.lr_scale
    if hasattr(optim, optimizer_config.name):
        optimizer = getattr(optim,
                            optimizer_config.name)(model.parameters(),
                                                   **optimizer_config.values)
        scheduler = getattr(optim.lr_scheduler,
                            scheduler_config.name)(optimizer,
                                                   **scheduler_config.values)
    else:
        t_total = len(dataloaders['dev']) * args.epoch
        # no_decay = ['bias', 'LayerNorm.weight']
        # optimizer_grouped_parameters = [
        #     {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
        #     {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        #     ]
        optimizer = getattr(optimization,
                            optimizer_config.name)(model.parameters(),
                                                   **optimizer_config.values)
        scheduler = getattr(optimization,
                            scheduler_config.name)(optimizer,
                                                   t_total=t_total,
                                                   **scheduler_config.values)

    ckpt_file = os.path.join(args.load_dir, 'model.best.pt.tar')
    if os.path.isfile(ckpt_file):
        load_ckpt(ckpt_file, model, optimizer, scheduler, args.cuda)
    else:
        raise FileNotFoundError("No such path {}".format(ckpt_file))

    # pdb.set_trace()
    for epoch in range(1, 1 + args.epoch):
        model.train()

        pbar = tqdm(dataloaders[phase])
        pbar.set_description("[{} Epoch {}]".format(phase, epoch))
        running_loss = 0.
        running_size = 0.
        for data in pbar:
            optimizer.zero_grad()

            size, loss = infer(data, model, args.cuda)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            running_size += size
            pbar.set_postfix(mean_loss=running_loss / running_size)
    save_ckpt(os.path.join(args.save_dir, 'model.best.pt.tar'), epoch,
              model.state_dict(), optimizer.state_dict(),
              scheduler.state_dict())
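
# log_info is a project-local factory used as `Log = log_info(path); Log(msg)`.
# A minimal sketch matching that usage, assuming it both prints and appends to
# the given file; the real helper may add timestamps or formatting:
def log_info(path):
    def Log(msg):
        line = str(msg)
        print(line)
        with open(path, 'a') as f:
            f.write(line + '\n')
    return Log
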
def train(args):
    Log = log_info(os.path.join(args.save_dir, 'process{}.info'.format(args.fold)))
    Log(args)
    model_config, optimizer_config, scheduler_config = Config.from_json(args.config)
    model_name = model_config.name
    model_class = getattr(models, model_name)
    Log(model_config.values)
    model = model_class(**model_config.values)

    dataloaders = {}
    datasets = {}
    sampler = None
    collate_fn = collect_mrc
    phases = ['train']
    if args.do_eval:
        phases.append('dev')
    if args.do_test:
        phases.append('test')
    for phase in phases:
        if phase != 'test' and args.fold:
            fea_filename = os.path.join(args.data, 'fold{}'.format(args.fold), '{}.fea'.format(phase))
            pos_filename = os.path.join(args.data, 'fold{}'.format(args.fold), '{}.pos'.format(phase))
        else:
            fea_filename = os.path.join(args.data, '{}.fea'.format(phase))
            pos_filename = os.path.join(args.data, '{}.pos'.format(phase))
        fea_file = open(fea_filename, 'rb')
        with open(pos_filename, 'r') as f:
            positions = [int(v.strip()) for v in f]
        dataset = MRCDataset(fea_file, positions)
        if args.multi_gpu and phase == 'train':
            sampler = t.utils.data.RandomSampler(dataset)
            dataloader = t.utils.data.DataLoader(dataset, batch_size=args.batch_size, shuffle=False,
                                                 collate_fn=collate_fn, sampler=sampler, num_workers=0)
        else:
            dataloader = t.utils.data.DataLoader(dataset, batch_size=args.batch_size,
                                                 shuffle=(phase == 'train'), collate_fn=collate_fn, num_workers=0)
        dataloaders[phase] = dataloader
        datasets[phase] = dataset

    if args.multi_gpu:
        args.n_gpu = t.cuda.device_count()
        model = model.cuda()
        model = t.nn.DataParallel(model)
    elif args.cuda:
        args.n_gpu = 1
        model = model.cuda()

    # Unwrap DataParallel so bert4pretrain stays reachable in the multi-GPU case.
    base_model = model.module if isinstance(model, t.nn.DataParallel) else model
    bert_param_ids = set(map(id, base_model.bert4pretrain.parameters()))
    other_parameters = [p for p in model.parameters() if id(p) not in bert_param_ids]
    if hasattr(optim, optimizer_config.name):
        optimizer = getattr(optim, optimizer_config.name)([
            {'params': other_parameters, 'lr': optimizer_config.lr * args.scale_rate},
            {'params': base_model.bert4pretrain.parameters()}
        ], **optimizer_config.values)
        scheduler = getattr(optim.lr_scheduler, scheduler_config.name)(optimizer, **scheduler_config.values)
    else:
        t_total = len(dataloaders['train']) * args.epoch
        # no_decay = ['bias', 'LayerNorm.weight']
        # optimizer_grouped_parameters = [
        #     {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
        #     {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        #     ]
        optimizer = getattr(optimization, optimizer_config.name)([
            {'params': other_parameters, 'lr': optimizer_config.lr * args.scale_rate},
            {'params': base_model.bert4pretrain.parameters()}
        ], **optimizer_config.values)
        scheduler = getattr(optimization, scheduler_config.name)(optimizer, t_total=t_total, **scheduler_config.values)

    if not os.path.isdir(args.save_dir):
        os.mkdir(args.save_dir)

    # pdb.set_trace()
    if args.log:
        writer = SummaryWriter(os.path.join(args.save_dir, 'logs'))
    else:
        writer = None
    pre_fn, step_fn, post_fn = tm.mrc_acc_metric_builder(args, scheduler_config, model,
                                                         optimizer, scheduler, writer, Log)

    for epoch in range(1, 1+args.epoch):
        for phase in phases:
            pre_fn()
            if phase == 'train':
                model.train()
            else:
                model.eval()

            pbar = tqdm(dataloaders[phase])
            pbar.set_description("[{} Epoch {}]".format(phase, epoch))
            for data in pbar:
                optimizer.zero_grad()

                with t.set_grad_enabled(phase == 'train'):
                    result, loss = infer(data, model, args.cuda, is_evaluate=phase!='train')
                    if args.multi_gpu and args.n_gpu > 1:
                        loss = loss.mean()
                    if phase == 'train':
                        loss.backward()
                        # t.nn.utils.clip_grad_norm_(model.parameters(), 7)
                        optimizer.step()
                step_fn(result, loss, pbar, phase)
            post_fn(phase, epoch)
    if args.log:
        writer.close()
    with open(os.path.join(args.save_dir, 'invalid_entities'), 'wb') as f:
        pickle.dump(tm.Invalid_entities, f)
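
# tm.mrc_acc_metric_builder is project-local; it returns the three callbacks
# driving the loop above. A minimal sketch of that shape, purely illustrative:
# here step_fn only tracks a running mean loss, while the real builder also
# computes accuracy, steps the scheduler, writes to the SummaryWriter, and
# saves checkpoints:
def mrc_acc_metric_builder(args, scheduler_config, model, optimizer,
                           scheduler, writer, Log):
    state = {'loss': 0.0, 'steps': 0}

    def pre_fn():
        state['loss'], state['steps'] = 0.0, 0

    def step_fn(result, loss, pbar, phase):
        state['loss'] += loss.item()
        state['steps'] += 1
        pbar.set_postfix(mean_loss=state['loss'] / state['steps'])

    def post_fn(phase, epoch):
        Log('[{} epoch {}] mean loss {:.4f}'.format(
            phase, epoch, state['loss'] / max(state['steps'], 1)))

    return pre_fn, step_fn, post_fn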

# Example #5

def predict(args):

    model_config, *_ = Config.from_json(args.config)
    model_name = model_config.name
    model_class = getattr(models, model_name)

    if model_config.init_weight_path is None:
        model_config.init_weight = None
    else:
        model_config.init_weight = t.from_numpy(pickle.load(open(model_config.init_weight_path, 'rb'))).float()

    if model_config.activation is None:
        pass
    elif model_config.activation == 'identical':
        model_config.activation = lambda v: v
    elif model_config.activation == 'gelu':
        model_config.activation = models.layers.activation.gelu
    else:
        model_config.activation = getattr(t, model_config.activation, None) or getattr(F, model_config.activation, None)

    collate_fn = lambda batch: collect_multigraph(model_config.need_norm, model_config.concat_ab, batch)

    phase = 'test'
    fea_filename = os.path.join(args.data, '{}.fea'.format(phase))
    tgt_filename = os.path.join(args.data, '{}.tgt'.format(phase))
    pos_filename = os.path.join(args.data, '{}.pos'.format(phase))
    fea_file = open(fea_filename, 'rb')
    with open(tgt_filename, 'r') as f:
        targets = [int(v.strip()) for v in f]
    with open(pos_filename, 'r') as f:
        positions = [int(v.strip()) for v in f]
    dataset = GraphDataset(fea_file, targets, positions)
    dataloader = t.utils.data.DataLoader(dataset, batch_size=args.batch_size,
                                            shuffle=False, collate_fn=collate_fn, num_workers=1)

    epoch = args.best_epoch
    total_proba = None  # accumulator kept so probabilities from several checkpoints could be summed below
    model = model_class(**model_config.values)
    ckpt_file = os.path.join(args.save_dir, 'model.epoch{}.pt.tar'.format(epoch))
    if os.path.isfile(ckpt_file):
        load_ckpt(ckpt_file, model)
    else:
        raise FileNotFoundError("No such path {}".format(ckpt_file))
    if args.cuda:
        model = model.cuda()

    model.eval()

    curr_proba = []
    pbar = tqdm(dataloader)
    for data in pbar:
        with t.no_grad():
            proba = infer(data, model, model_config.seq_len, args.cuda)
            curr_proba.append(proba)
    curr_proba = np.concatenate(curr_proba, axis=0)
    if total_proba is None:
        total_proba = curr_proba
    else:
        assert total_proba.shape == curr_proba.shape
        total_proba += curr_proba

    df = pd.DataFrame(data=total_proba, columns=['proba0', 'proba1'])
    predictions = total_proba.argmax(1)
    df['predictions'] = predictions
    df['targets'] = dataset.targets
    df.to_csv(os.path.join(args.save_dir, 'result.csv'))
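
# A quick way to sanity-check the dump above, assuming the result.csv layout
# written by this predict() (proba0/proba1/predictions/targets columns):
def report_accuracy(save_dir):
    df = pd.read_csv(os.path.join(save_dir, 'result.csv'), index_col=0)
    acc = (df['predictions'] == df['targets']).mean()
    print('accuracy: {:.4f} over {} examples'.format(acc, len(df)))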