Esempio n. 1
0
def main(args):
    """Run the train / test / eval stages selected by the flags on ``args``.

    The function performs, in order:

    1. Calls ``set_seed`` with a fixed seed and selects the device (CUDA when
       available; ``cuda:<local_rank>`` when ``args.parallel`` is set, after
       initialising the NCCL process group).
    2. Loads the train/validation data and the vocabulary object through
       ``_DATA().f_load_data_yelp``.
    3. Builds ``_GEN_NETWORK`` and prints the size of every parameter that
       requires gradients, plus the total.
    4. When ``args.train``: builds ``_ENC_NETWORK``, loads its weights from
       ``args.model_path / args.E_model_file`` (key ``'model'`` of the
       checkpoint), optionally wraps the generator in
       ``DistributedDataParallel`` and hands everything to ``_TRAINER``.
    5. When ``args.test`` / ``args.eval``: hands the generator and
       ``args.model_file`` to ``_INFER`` / ``_EVAL`` with ``reload_model=True``.

    Args:
        args: Parsed options object. Attributes read here: ``parallel``,
            ``local_rank`` (only when ``parallel`` is set), ``train``,
            ``test``, ``eval``, ``model_path``, ``data_name``, ``model_name``,
            ``E_model_file`` and ``model_file``. ``args.model_file`` is
            overwritten with a new time-stamped path when ``args.train`` is
            set.
    """
    seed = 1111
    set_seed(seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("device", device)

    # Single-process runs use rank 0. Without this default the trainer call
    # below raised NameError whenever args.parallel was not set.
    local_rank = 0
    if args.parallel:
        local_rank = args.local_rank
        torch.distributed.init_process_group(backend="nccl")
        device = torch.device('cuda:{}'.format(local_rank))

    data_obj = _DATA()
    train_data, valid_data, vocab_obj = data_obj.f_load_data_yelp(args)

    if args.train:
        now_time = datetime.datetime.now()
        time_name = "{}_{}_{}_{}".format(
            now_time.month, now_time.day, now_time.hour, now_time.minute)
        model_file = os.path.join(args.model_path,
                                  args.data_name + "_" + args.model_name)

        if not os.path.isdir(model_file):
            print("create a directory ", model_file)
            # makedirs also creates missing parents; exist_ok tolerates
            # another rank creating the directory between the check and here.
            os.makedirs(model_file, exist_ok=True)

        args.model_file = os.path.join(
            model_file, "model_best_" + time_name + ".pt")
        print("model_file", model_file)

    print("vocab_size", vocab_obj.vocab_size)
    print("user num", vocab_obj.user_size)

    network = _GEN_NETWORK(vocab_obj, args)

    # Report the size of every parameter that requires gradients.
    total_param_num = 0
    for name, param in network.named_parameters():
        if param.requires_grad:
            param_num = param.numel()
            total_param_num += param_num
            print(name, "\t", param_num)

    print("total parameters num", total_param_num)

    if args.train:
        logger_obj = _LOGGER()
        logger_obj.f_add_writer(args)

        E_network = _ENC_NETWORK(vocab_obj, args)
        E_network = E_network.to(device)

        E_model_abs_file = os.path.join(args.model_path, args.E_model_file)
        print("E_model_abs_file", E_model_abs_file)

        # map_location remaps the stored tensors onto this process's device,
        # so a checkpoint saved on another GPU also loads on CPU-only hosts
        # and on non-zero ranks.
        check_point = torch.load(E_model_abs_file, map_location=device)
        E_network.load_state_dict(check_point['model'])

        network = network.to(device)
        if args.parallel:
            network = torch.nn.parallel.DistributedDataParallel(
                network,
                device_ids=[local_rank],
                output_device=local_rank,
                find_unused_parameters=True)

        de_parameters = network.parameters()
        de_optimizer = _OPTIM(de_parameters, args)

        trainer = _TRAINER(vocab_obj, args, device)
        trainer.f_train_M(train_data, valid_data, E_network, network,
                          de_optimizer, logger_obj, local_rank)

        logger_obj.f_close_writer()

    if args.test:
        print("=" * 10, "test", "=" * 10)
        infer_obj = _INFER(vocab_obj, args, device)

        infer_obj.f_init_infer(network, args.model_file, reload_model=True)

        infer_obj.f_inference(train_data, valid_data)

    if args.eval:
        print("=" * 10, "eval", "=" * 10)

        eval_obj = _EVAL(vocab_obj, args, device)

        eval_obj.f_init_eval(network, args.model_file, reload_model=True)

        eval_obj.f_eval(train_data, valid_data)
Esempio n. 2
0
def main(args):
    """Run the GPT-2 decoder based train / test / eval stages chosen on ``args``.

    The function performs, in order:

    1. Calls ``set_seed`` with a fixed seed and selects the device (CUDA when
       available, otherwise CPU).
    2. Loads the decoder model from
       ``<checkpoint_dir>/checkpoint-decoder-<global_step_eval>`` and the
       decoder tokenizer from ``args.decoder_tokenizer_name`` (falling back to
       ``args.decoder_model_name_or_path``).
    3. Clamps ``args.block_size`` to the tokenizer's
       ``max_len_single_sentence``, registers the ``<PAD>``/``<BOS>``/``<EOS>``
       special tokens and resizes the decoder embeddings accordingly.
    4. Loads the data through ``_DATA().f_load_data_yelp_GPT`` and builds
       ``_GEN_NETWORK``, printing the size of every parameter that requires
       gradients.
    5. When ``args.train``: attaches the tokenizer/decoder to the network,
       builds an ``AdamW`` optimizer (no weight decay for biases and
       ``LayerNorm.weight``) with a ``WarmupLinearSchedule`` and runs
       ``_TRAINER``.
    6. When ``args.test`` / ``args.eval``: hands the network and
       ``args.model_file`` to ``_INFER`` / ``_EVAL`` with ``reload_model=True``.

    Args:
        args: Parsed options object. Attributes read here include
            ``decoder_model_type``, ``global_step_eval``, ``checkpoint_dir``,
            ``latent_size``, ``decoder_tokenizer_name``,
            ``decoder_model_name_or_path``, ``do_lower_case``, ``block_size``,
            ``train``, ``test``, ``eval``, ``model_path``, ``data_name``,
            ``model_name``, ``model_file``, ``weight_decay``,
            ``gradient_accumulation_steps``, ``num_train_epochs``,
            ``learning_rate``, ``adam_epsilon`` and ``warmup_steps``.
            ``decoder_model_type`` (lower-cased), ``block_size`` and, when
            training, ``model_file`` are overwritten in place.

    Raises:
        KeyError: If ``args.decoder_model_type`` is not a key of the local
            ``MODEL_CLASSES`` table (only ``'gpt2'`` is registered).
    """
    seed = 1111
    set_seed(seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("device", device)

    args.decoder_model_type = args.decoder_model_type.lower()
    global_step = args.global_step_eval

    print("checkpoint dir", args.checkpoint_dir)

    output_decoder_dir = os.path.join(
        args.checkpoint_dir, "checkpoint-decoder-{}".format(global_step))
    output_full_dir = os.path.join(
        args.checkpoint_dir, "checkpoint-full-{}".format(global_step))

    print("output_decoder_dir: ", output_decoder_dir)
    print("output_full_dir: ", output_full_dir)

    MODEL_CLASSES = {'gpt2': (GPT2Config, GPT2ForLatentConnector, GPT2Tokenizer)}

    # The config class is not needed here; only the model and tokenizer are.
    _, decoder_model_class, decoder_tokenizer_class = MODEL_CLASSES[args.decoder_model_type]

    model_decoder = decoder_model_class.from_pretrained(
        output_decoder_dir, latent_size=args.latent_size)

    # Prefer the explicit tokenizer name; fall back to the model name/path.
    tokenizer_source = args.decoder_tokenizer_name or args.decoder_model_name_or_path
    tokenizer_decoder = decoder_tokenizer_class.from_pretrained(
        tokenizer_source, do_lower_case=args.do_lower_case)
    print("decoder_tokenizer_name ", args.decoder_tokenizer_name)
    print("decoder_model_name_or_path ", args.decoder_model_name_or_path)

    model_decoder.to(device)

    # A non-positive block size means "use the tokenizer's maximum".
    if args.block_size <= 0:
        args.block_size = tokenizer_decoder.max_len_single_sentence

    print("max_len_single_sentence: ", tokenizer_decoder.max_len_single_sentence)
    args.block_size = min(args.block_size, tokenizer_decoder.max_len_single_sentence)
    print("block size: ", args.block_size)

    # NOTE: "training.bin" under output_full_dir is intentionally not loaded:
    # its contents were never used by this function.

    special_tokens_dict = {'pad_token': '<PAD>', 'bos_token': '<BOS>', 'eos_token': '<EOS>'}
    num_added_toks = tokenizer_decoder.add_special_tokens(special_tokens_dict)

    print('We have added', num_added_toks, 'tokens to GPT2')
    # The embedding matrix must match the tokenizer size after adding tokens.
    model_decoder.resize_token_embeddings(len(tokenizer_decoder))

    assert tokenizer_decoder.pad_token == '<PAD>'

    data_obj = _DATA()
    train_data, valid_data, vocab_obj = data_obj.f_load_data_yelp_GPT(tokenizer_decoder, args)

    if args.train:
        now_time = datetime.datetime.now()
        time_name = "{}_{}_{}_{}".format(
            now_time.month, now_time.day, now_time.hour, now_time.minute)
        model_file = os.path.join(args.model_path, args.data_name + "_" + args.model_name)

        if not os.path.isdir(model_file):
            print("create a directory ", model_file)
            # makedirs also creates missing parents; exist_ok tolerates the
            # directory appearing between the check and the call.
            os.makedirs(model_file, exist_ok=True)

        args.model_file = os.path.join(model_file, "model_best_" + time_name + ".pt")
        print("model_file", model_file)

    network = _GEN_NETWORK(vocab_obj, args)

    # Report the size of every parameter that requires gradients.
    total_param_num = 0
    for name, param in network.named_parameters():
        if param.requires_grad:
            param_num = param.numel()
            total_param_num += param_num
            print(name, "\t", param_num)

    print("total parameters num", total_param_num)

    if args.train:
        logger_obj = _LOGGER()
        logger_obj.f_add_writer(args)

        # The encoder is passed to the trainer as constructed; no checkpoint
        # is loaded into it in this function.
        E_network = _ENC_NETWORK(vocab_obj, args)
        E_network = E_network.to(device)

        network.init_tokenizer_decoder(tokenizer_decoder, model_decoder)
        network = network.to(device)

        # Biases and LayerNorm weights are excluded from weight decay.
        no_decay = ['bias', 'LayerNorm.weight']
        named_params = list(network.named_parameters())
        decay_params = [
            p for n, p in named_params if not any(nd in n for nd in no_decay)
        ]
        no_decay_params = [
            p for n, p in named_params if any(nd in n for nd in no_decay)
        ]
        optimizer_grouped_parameters = [
            {'params': decay_params, 'weight_decay': args.weight_decay},
            {'params': no_decay_params, 'weight_decay': 0.0},
        ]
        # Total number of optimizer steps handed to the scheduler.
        t_total = len(train_data) // args.gradient_accumulation_steps * args.num_train_epochs

        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)

        local_rank = 0
        trainer = _TRAINER(vocab_obj, args, device)
        trainer.f_train_M(train_data, valid_data, E_network, network, optimizer, scheduler, logger_obj, local_rank, args)

        logger_obj.f_close_writer()

    if args.test:
        print("="*10, "test", "="*10)
        infer_obj = _INFER(vocab_obj, args, device)

        infer_obj.f_init_infer(network, args.model_file, reload_model=True)

        infer_obj.f_inference(train_data, valid_data)

    if args.eval:
        print("="*10, "eval", "="*10)

        eval_obj = _EVAL(vocab_obj, args, device)

        eval_obj.f_init_eval(network, args.model_file, reload_model=True)

        eval_obj.f_eval(train_data, valid_data)