Example #1
0
def main(cfg):
    """Evaluate a saved checkpoint on the test shards, one worker process per GPU.

    Reads ``news.json`` and ``word.json`` under ``./<cfg.root>/`` to fill in
    ``cfg.news_num`` and ``cfg.word_num``, builds the model selected by
    ``cfg.model``, loads ``./checkpoint/model.ep<cfg.epoch>`` onto the CPU and
    then, for each of the ``cfg.filenum`` shards ``<cfg.root>/raw/test-<i>.npy``,
    spawns ``cfg.gpus`` processes running ``run`` before calling ``gather``.
    ``gather_all`` is called once after every shard has been processed.

    :param cfg: configuration object; reads ``filenum``, ``root``, ``model``,
        ``epoch`` and ``gpus``; writes ``result_path``, ``news_num`` and
        ``word_num``.
    :return: ``[]`` when the checkpoint file does not exist, otherwise ``None``.
    :raises ValueError: if ``cfg.model`` is neither ``'dssm'`` nor ``'gru'``.
    """
    set_seed(7)

    file_num = cfg.filenum
    cfg.result_path = './result/'
    print('load dict')
    # Context managers close the JSON files deterministically; the bare
    # json.load(open(...)) form leaves the handle to the garbage collector.
    with open('./{}/news.json'.format(cfg.root), 'r',
              encoding='utf-8') as news_file:
        news_dict = json.load(news_file)
    cfg.news_num = len(news_dict)
    print('load words dict')
    with open('./{}/word.json'.format(cfg.root), 'r',
              encoding='utf-8') as word_file:
        word_dict = json.load(word_file)
    cfg.word_num = len(word_dict)

    if cfg.model == 'dssm':
        model = DSSM(cfg)
    elif cfg.model == 'gru':
        model = GRURec(cfg)
    else:
        # Fail with a clear message instead of an unbound-local error later on.
        raise ValueError(
            "Unsupported model {!r}; expected 'dssm' or 'gru'".format(
                cfg.model))

    saved_model_path = os.path.join('./checkpoint/',
                                    'model.ep{0}'.format(cfg.epoch))
    print("Load from:", saved_model_path)
    if not os.path.exists(saved_model_path):
        print("Not Exist: {}".format(saved_model_path))
        return []
    model.cpu()
    pretrained_model = torch.load(saved_model_path, map_location='cpu')
    # strict=False tolerates key mismatches; printing the result surfaces any
    # missing/unexpected keys reported by load_state_dict.
    print(model.load_state_dict(pretrained_model, strict=False))

    for point_num in range(file_num):
        print("processing {}/raw/test-{}.npy".format(cfg.root, point_num))
        valid_dataset = FMData(
            np.load("{}/raw/test-{}.npy".format(cfg.root, point_num)))

        # One entry per worker is indexed below by rank.
        dataset_list = split_dataset(valid_dataset, cfg.gpus)

        processes = []
        for rank in range(cfg.gpus):
            cur_device = torch.device("cuda:{}".format(rank))

            p = mp.Process(target=run,
                           args=(cfg, rank, dataset_list[rank], cur_device,
                                 model))
            p.start()
            processes.append(p)

        # Wait for every worker before combining this shard's results.
        for p in processes:
            p.join()

        gather(cfg, point_num)

    gather_all(cfg.result_path, file_num, validate=True, save=True)
Example #2
0
def main(cfg):
    """Score the ``cfg.type`` shards with a saved DFN checkpoint, one process per GPU.

    Builds a ``ModelConfig`` from ``cfg.root`` (stored on ``cfg.mc``), loads
    ``<cfg.root>/news_info.npy``, restores ``./checkpoint/model.ep<cfg.epoch>``
    onto the CPU and, for every shard index in ``[cfg.start_dev, cfg.filenum)``,
    launches ``cfg.gpus`` processes running ``run`` and then calls ``gather``.
    ``gather_all`` is called once at the end.

    :param cfg: configuration object; reads ``filenum``, ``root``, ``epoch``,
        ``start_dev``, ``type`` and ``gpus``; writes ``result_path`` and ``mc``.
    :return: ``[]`` when the checkpoint file does not exist, otherwise ``None``.
    """
    set_seed(7)

    file_num = cfg.filenum
    cfg.result_path = './result/'

    print('load config')
    model_cfg = ModelConfig(cfg.root)
    cfg.mc = model_cfg

    print('load news info')
    news_title = np.load('{}/news_info.npy'.format(cfg.root))

    model = DFN(model_cfg)

    checkpoint_file = os.path.join('./checkpoint/',
                                   'model.ep{0}'.format(cfg.epoch))
    print("Load from:", checkpoint_file)
    if not os.path.exists(checkpoint_file):
        # Nothing to evaluate without a checkpoint.
        print("Not Exist: {}".format(checkpoint_file))
        return []

    model.cpu()
    state = torch.load(checkpoint_file, map_location='cpu')
    # Printing the result shows any missing/unexpected keys (strict=False).
    load_report = model.load_state_dict(state, strict=False)
    print(load_report)

    for point_num in range(cfg.start_dev, file_num):
        print("processing {}/raw/{}-{}.npy".format(cfg.root, cfg.type,
                                                   point_num))
        shard = np.load("{}/raw/{}-{}.npy".format(cfg.root, cfg.type,
                                                  point_num))

        # One slice per worker, indexed by rank below.
        shard_parts = split_dataset(shard, cfg.gpus)

        workers = []
        for rank in range(cfg.gpus):
            gpu = torch.device("cuda:{}".format(rank))
            worker = mp.Process(
                target=run,
                args=(cfg, rank, shard_parts[rank], gpu, model, news_title),
            )
            worker.start()
            workers.append(worker)

        # Block until every worker has exited before gathering this shard.
        for worker in workers:
            worker.join()

        gather(cfg, point_num)

    gather_all(cfg.result_path,
               file_num,
               start_file=cfg.start_dev,
               validate=True,
               save=True)
Example #3
0
def run(cfg, rank, device, finished, train_dataset_path, valid_dataset_file, news_title):
    """
    Train a DFN model in one worker and validate it after every epoch.

    :param cfg: config; reads mc, batch_size, epoch, lr, gpus, result_path
        and checkpoint_path
    :param rank: process id; rank 0 (or the only worker) saves checkpoints
        and gathers results
    :param device: device the model is moved to
    :param finished: shared counter exposing ``.value``; incremented by each
        worker after validation and reset to 0 by the lead worker
    :param train_dataset_path: path of the training array, read with np.load
    :param valid_dataset_file: validation data handed to RecoData as-is
    :param news_title: news title data handed to RecoData
    :return: None
    """
    set_seed(7)
    print("Worker %d is setting dataset ... " % rank)

    # Data loaders: training is shuffled and drops the last partial batch.
    train_data_loader = DataLoader(
        RecoData(cfg.mc, np.load(train_dataset_path), news_title),
        batch_size=cfg.batch_size,
        shuffle=True,
        drop_last=True,
    )
    valid_data_loader = DataLoader(
        RecoData(cfg.mc, valid_dataset_file, news_title),
        batch_size=cfg.batch_size,
        shuffle=False,
    )

    # Model.
    model = DFN(cfg.mc)
    model.to(device)

    # Optimizer.
    steps_one_epoch = len(train_data_loader)
    total_steps = cfg.epoch * steps_one_epoch
    print("Total train steps: ", total_steps)
    optimizer = torch.optim.Adam(params=model.parameters(), lr=cfg.lr)
    print("Worker %d is working ... " % rank)

    # Single-worker runs, or rank 0 of a multi-worker run, act as the lead.
    is_lead = cfg.gpus < 2 or rank == 0

    # Fast check of the validation path before any training happens.
    if is_lead:
        validate(cfg, -1, model, device, rank, valid_data_loader, fast_dev=True)
        logging.warning(model)
        gather_all(cfg.result_path, 1, validate=True, save=False)

    # Training and validation.
    for epoch in range(cfg.epoch):
        train(cfg, epoch, rank, model, train_data_loader,
              optimizer, steps_one_epoch, device)
        validate(cfg, epoch, model, device, rank, valid_data_loader)

        # Signal that this worker finished the epoch.
        # NOTE(review): if `finished` is a multiprocessing.Value, `+=` is a
        # non-atomic read-modify-write across processes - confirm whether the
        # caller's object needs `get_lock()` here.
        finished.value += 1

        if not is_lead:
            continue

        save_checkpoint_by_epoch(model.state_dict(), epoch, cfg.checkpoint_path)

        # Poll until every worker has reported, then gather and reset.
        while finished.value < cfg.gpus:
            time.sleep(1)
        gather_all(cfg.result_path, cfg.gpus, validate=True, save=False)
        finished.value = 0