Example #1
0
def main(cfg):
    """Load the dev shards and launch one worker process per GPU.

    Reads ``cfg.filenum`` dev shards from ``data/raw/dev-<i>.npy``, joins them
    along axis 0 and splits the result into ``cfg.gpus`` pieces with
    ``split_valid_dataset``. Each worker is started through ``init_processes``
    and receives the path of its own training shard, its validation piece,
    the news info array and a shared integer counter.

    Side effects on ``cfg``: sets ``mc``, ``result_path`` and
    ``checkpoint_path``.

    :param cfg: run configuration; ``filenum`` and ``gpus`` are read here
    :return: None (blocks until every worker process has exited)
    """
    set_seed(7)

    print('load dev')
    dev_shards = [
        np.load("data/raw/dev-{}.npy".format(shard_idx))
        for shard_idx in range(cfg.filenum)
    ]
    validate_dataset = np.concatenate(dev_shards, axis=0)
    print('load config')
    model_cfg = ModelConfig()
    print('load news info')
    news_title = np.load('data/news_info.npy')

    cfg.mc = model_cfg
    cfg.result_path = './result/'
    cfg.checkpoint_path = './checkpoint/'
    # Shared int counter handed to every worker.
    finished = mp.Value('i', 0)

    # This entry point only supports the multi-GPU setup.
    assert (cfg.gpus > 1)
    valid_dataset_list = split_valid_dataset(validate_dataset, cfg.gpus)

    workers = []
    for rank in range(cfg.gpus):
        train_path = "data/raw/train-{}-new.npy".format(rank)
        worker_args = (cfg, rank, None, train_path, valid_dataset_list[rank],
                       news_title, finished, run, "nccl")
        worker = mp.Process(target=init_processes, args=worker_args)
        worker.start()
        workers.append(worker)

    for worker in workers:
        worker.join()
Example #2
0
def main(cfg):
    """Evaluate a saved DSSM/GRU checkpoint on the test shards, one process per GPU.

    Loads the news and word dictionaries under ``cfg.root`` to set
    ``cfg.news_num`` and ``cfg.word_num``, builds the model selected by
    ``cfg.model`` and restores ``./checkpoint/model.ep<cfg.epoch>`` on CPU.
    For every shard ``<cfg.root>/raw/test-<i>.npy`` the data is split into
    ``cfg.gpus`` parts, one ``run`` process is started per part, and
    ``gather`` is called once all of them have exited. ``gather_all`` is
    called after the last shard.

    :param cfg: run configuration; reads ``filenum``, ``root``, ``model``,
        ``epoch`` and ``gpus``; sets ``result_path``, ``news_num`` and
        ``word_num``
    :return: ``[]`` when the checkpoint file does not exist, otherwise None
    :raises ValueError: if ``cfg.model`` is neither ``'dssm'`` nor ``'gru'``
    """
    set_seed(7)

    file_num = cfg.filenum
    cfg.result_path = './result/'
    print('load dict')
    # Context managers close the JSON files instead of leaking the handles.
    with open('./{}/news.json'.format(cfg.root), 'r', encoding='utf-8') as f:
        news_dict = json.load(f)
    cfg.news_num = len(news_dict)
    print('load words dict')
    with open('./{}/word.json'.format(cfg.root), 'r', encoding='utf-8') as f:
        word_dict = json.load(f)
    cfg.word_num = len(word_dict)

    if cfg.model == 'dssm':
        model = DSSM(cfg)
    elif cfg.model == 'gru':
        model = GRURec(cfg)
    else:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError("Unknown model type: {!r}".format(cfg.model))

    saved_model_path = os.path.join('./checkpoint/',
                                    'model.ep{0}'.format(cfg.epoch))
    print("Load from:", saved_model_path)
    if not os.path.exists(saved_model_path):
        print("Not Exist: {}".format(saved_model_path))
        return []
    model.cpu()
    pretrained_model = torch.load(saved_model_path, map_location='cpu')
    # strict=False: the printed result lists any missing/unexpected keys.
    print(model.load_state_dict(pretrained_model, strict=False))

    for point_num in range(file_num):
        print("processing {}/raw/test-{}.npy".format(cfg.root, point_num))
        valid_dataset = FMData(
            np.load("{}/raw/test-{}.npy".format(cfg.root, point_num)))

        dataset_list = split_dataset(valid_dataset, cfg.gpus)

        processes = []
        for rank in range(cfg.gpus):
            cur_device = torch.device("cuda:{}".format(rank))

            p = mp.Process(target=run,
                           args=(cfg, rank, dataset_list[rank], cur_device,
                                 model))
            p.start()
            processes.append(p)

        for p in processes:
            p.join()

        # All workers for this shard have exited at this point.
        gather(cfg, point_num)

    gather_all(cfg.result_path, file_num, validate=True, save=True)
Example #3
0
def main(cfg):
    """Evaluate a saved DFN checkpoint on data shards, one process per GPU.

    Restores ``./checkpoint/model.ep<cfg.epoch>`` on CPU, then walks the
    shards ``<cfg.root>/raw/<cfg.type>-<i>.npy`` for ``i`` in
    ``[cfg.start_dev, cfg.filenum)``. Each shard is split into ``cfg.gpus``
    parts, one ``run`` process is started per part, and ``gather`` is called
    once they have all exited. ``gather_all`` is called after the last shard.

    :param cfg: run configuration; reads ``filenum``, ``root``, ``epoch``,
        ``start_dev``, ``type`` and ``gpus``; sets ``result_path`` and ``mc``
    :return: ``[]`` when the checkpoint file does not exist, otherwise None
    """
    set_seed(7)

    file_num = cfg.filenum
    cfg.result_path = './result/'
    print('load config')
    model_cfg = ModelConfig(cfg.root)
    cfg.mc = model_cfg
    print('load news info')
    news_title = np.load('{}/news_info.npy'.format(cfg.root))

    model = DFN(model_cfg)

    checkpoint_dir = './checkpoint/'
    saved_model_path = os.path.join(checkpoint_dir,
                                    'model.ep{0}'.format(cfg.epoch))
    print("Load from:", saved_model_path)
    checkpoint_found = os.path.exists(saved_model_path)
    if not checkpoint_found:
        print("Not Exist: {}".format(saved_model_path))
        return []
    model.cpu()
    state = torch.load(saved_model_path, map_location='cpu')
    load_report = model.load_state_dict(state, strict=False)
    print(load_report)

    for point_num in range(cfg.start_dev, file_num):
        print("processing {}/raw/{}-{}.npy".format(cfg.root, cfg.type,
                                                   point_num))
        shard = np.load("{}/raw/{}-{}.npy".format(
            cfg.root, cfg.type, point_num))

        shard_parts = split_dataset(shard, cfg.gpus)

        workers = []
        for rank in range(cfg.gpus):
            worker_device = torch.device("cuda:{}".format(rank))
            worker_args = (cfg, rank, shard_parts[rank], worker_device,
                           model, news_title)
            worker = mp.Process(target=run, args=worker_args)
            worker.start()
            workers.append(worker)

        for worker in workers:
            worker.join()

        # Every worker for this shard has exited before gather runs.
        gather(cfg, point_num)

    gather_all(cfg.result_path,
               file_num,
               start_file=cfg.start_dev,
               validate=True,
               save=True)
Example #4
0
def run(cfg, rank, device, finished, train_dataset_path, valid_dataset_file, news_title):
    """
    Train a DFN model in one worker process and validate after every epoch.

    :param cfg: run configuration; reads ``mc``, ``batch_size``, ``epoch``,
        ``lr``, ``gpus``, ``result_path`` and ``checkpoint_path``
    :param rank: process id
    :param device: device the model is moved to
    :param finished: shared counter of workers that finished validating the
        current epoch; must expose ``.value`` and ``get_lock()`` (e.g. a
        ``multiprocessing.Value`` created with the default lock)
    :param train_dataset_path: path of the ``.npy`` training data loaded here
    :param valid_dataset_file: validation data passed to ``RecoData`` as-is
    :param news_title: news information passed to ``RecoData``
    :return: None
    """

    set_seed(7)
    print("Worker %d is setting dataset ... " % rank)
    # Build Dataloader
    train_dataset = RecoData(cfg.mc, np.load(train_dataset_path), news_title)
    train_data_loader = DataLoader(train_dataset, batch_size=cfg.batch_size, shuffle=True, drop_last=True)
    valid_dataset = RecoData(cfg.mc, valid_dataset_file, news_title)
    valid_data_loader = DataLoader(valid_dataset, batch_size=cfg.batch_size, shuffle=False)

    # Build model.
    model = DFN(cfg.mc)
    model.to(device)
    # Build optimizer.
    steps_one_epoch = len(train_data_loader)
    train_steps = cfg.epoch * steps_one_epoch
    print("Total train steps: ", train_steps)
    optimizer = torch.optim.Adam(params=model.parameters(), lr=cfg.lr)
    print("Worker %d is working ... " % rank)

    # The single-process run, or rank 0 of a multi-process run, does the
    # bookkeeping (fast check, checkpointing, result gathering).
    is_main = cfg.gpus < 2 or rank == 0

    # Fast check the validation process
    if is_main:
        validate(cfg, -1, model, device, rank, valid_data_loader, fast_dev=True)
        logging.warning(model)
        gather_all(cfg.result_path, 1, validate=True, save=False)

    # Training and validation
    for epoch in range(cfg.epoch):
        train(cfg, epoch, rank, model, train_data_loader,
              optimizer, steps_one_epoch, device)

        validate(cfg, epoch, model, device, rank, valid_data_loader)
        # ``+=`` on a shared Value is a separate read and write; without the
        # lock two workers can lose an increment and rank 0 would then wait
        # forever in the loop below.
        with finished.get_lock():
            finished.value += 1

        if is_main:
            save_checkpoint_by_epoch(model.state_dict(), epoch, cfg.checkpoint_path)

            # Wait until every worker has reported its validation as done.
            while finished.value < cfg.gpus:
                time.sleep(1)
            gather_all(cfg.result_path, cfg.gpus, validate=True, save=False)
            # Reset the counter for the next epoch.
            with finished.get_lock():
                finished.value = 0
Example #5
0
def run(cfg, rank, dev_dataset_file, device, model, news_title):
    """Score one data split with ``model`` and dump the results to JSON.

    Each batch is read as a 2-D tensor: column 0 is collected as the
    impression id, column 1 as the label, and the remaining columns are fed
    to the model. The collected lists are written to
    ``<cfg.result_path>tmp_small_<rank>.json`` under the keys ``imp``,
    ``labels`` and ``preds``.

    :param cfg: run configuration; reads ``mc``, ``batch_size``, ``gpus``
        and ``result_path``
    :param rank: process id; also used in the output file name
    :param dev_dataset_file: data passed to ``RecoData`` as-is
    :param device: device the model and batches are moved to
    :param model: model to evaluate (switched to eval mode here)
    :param news_title: news information passed to ``RecoData``
    :return: None
    """
    set_seed(7)

    model.to(device)
    model.eval()

    dev_dataset = RecoData(cfg.mc, dev_dataset_file, news_title)
    valid_data_loader = DataLoader(dev_dataset,
                                   batch_size=cfg.batch_size,
                                   shuffle=False)

    # Only one process draws a progress bar.
    if cfg.gpus < 2 or rank == 0:
        data_iter = tqdm(enumerate(valid_data_loader),
                         desc="EP_dev:%d" % 1,
                         total=len(valid_data_loader),
                         bar_format="{l_bar}{r_bar}")
    else:
        data_iter = enumerate(valid_data_loader)

    with torch.no_grad():
        preds, truths, imp_ids = [], [], []
        for _, data in data_iter:

            imp_ids += data[:, 0].cpu().numpy().tolist()
            data = data.to(device)

            # 1. Forward
            pred = model(data[:, 2:])
            if pred.dim() > 1:
                pred = pred.squeeze()
            # squeeze() turns a single-sample batch into a 0-d tensor whose
            # tolist() is a bare scalar; restore the batch dimension so the
            # score is kept as-is rather than truncated by int().
            if pred.dim() == 0:
                pred = pred.unsqueeze(0)
            preds += pred.cpu().numpy().tolist()
            truths += data[:, 1].long().cpu().numpy().tolist()

        tmp_dict = {
            'imp': imp_ids,
            'labels': truths,
            'preds': preds,
        }

        with open(cfg.result_path + 'tmp_small_{}.json'.format(rank),
                  'w',
                  encoding='utf-8') as f:
            json.dump(tmp_dict, f)
Example #6
0
def main(cfg):
    """Load the validation shards and launch one worker process per GPU.

    Reads ``cfg.filenum`` shards from ``<cfg.root>/raw/<cfg.vtype>-<i>.npy``,
    joins them along axis 0, wraps the result in ``FMData`` and splits it
    into ``cfg.gpus`` pieces with ``split_valid_dataset``. The news and word
    dictionaries under ``cfg.root`` are loaded only for their sizes. Each
    worker is started through ``init_processes`` and receives the path of
    its own training shard, its validation piece and a shared integer
    counter.

    Side effects on ``cfg``: sets ``news_num``, ``word_num``,
    ``result_path`` and ``checkpoint_path``.

    :param cfg: run configuration; reads ``filenum``, ``root``, ``vtype``
        and ``gpus``
    :return: None (blocks until every worker process has exited)
    """
    set_seed(7)

    print('load dev')
    dev_list = [
        np.load("{}/raw/{}-{}.npy".format(cfg.root, cfg.vtype, i))
        for i in range(cfg.filenum)
    ]
    validate_dataset = FMData(np.concatenate(dev_list, axis=0))
    print('load news dict')
    # Context managers close the JSON files instead of leaking the handles.
    with open('./{}/news.json'.format(cfg.root), 'r', encoding='utf-8') as f:
        news_dict = json.load(f)
    print('load words dict')
    with open('./{}/word.json'.format(cfg.root), 'r', encoding='utf-8') as f:
        word_dict = json.load(f)
    cfg.news_num = len(news_dict)
    cfg.word_num = len(word_dict)
    cfg.result_path = './result/'
    cfg.checkpoint_path = './checkpoint/'
    # Shared int counter handed to every worker.
    finished = mp.Value('i', 0)

    # This entry point only supports the multi-GPU setup.
    assert (cfg.gpus > 1)
    valid_dataset_list = split_valid_dataset(validate_dataset, cfg.gpus)

    processes = []
    for rank in range(cfg.gpus):
        # Training data is not loaded here; only the per-rank shard path is
        # handed to the worker.
        p = mp.Process(target=init_processes,
                       args=(cfg, rank, None,
                             "{}/raw/train-{}-new.npy".format(cfg.root, rank),
                             valid_dataset_list[rank], finished, run, "nccl"))
        p.start()
        processes.append(p)

    for p in processes:
        p.join()