Esempio n. 1
0
def run(proc_id, n_gpus, args, devices, dataset):
    """Train the rating network with sampled minibatches in one worker process.

    The worker builds DGL edge dataloaders over the training graph, trains
    ``Net`` with a cross-entropy loss over the rating classes, and every
    ``args.train_valid_interval`` epochs lets process 0 evaluate on the
    validation set (and on the test set whenever validation improves).
    Process 0 also drives learning-rate decay and early stopping; when
    several GPUs are used its decision is broadcast so that all workers
    stay in lock-step.

    Args:
        proc_id: Index of this worker. Used to pick its device from
            ``devices`` and as its rank in the process group. Worker 0 does
            the evaluation and reporting.
        n_gpus: Number of GPUs in use. ``> 1`` enables ``torch.distributed``
            and ``DistributedDataParallel``; ``> 0`` selects a CUDA device.
        args: Configuration object. Reads ``minibatch_size``, ``train_lr``,
            ``train_optimizer``, ``train_max_epoch``, ``train_grad_clip``,
            ``train_valid_interval``, ``train_early_stopping_patience``,
            ``train_decay_patience``, ``train_lr_decay_factor`` and
            ``train_min_lr``.
        devices: Sequence of device ids indexed by ``proc_id``.
        dataset: Object exposing the train/valid/test encoder and decoder
            graphs and ``possible_rating_values``.
    """
    dev_id = devices[proc_id]
    if n_gpus > 1:
        dist_init_method = 'tcp://{master_ip}:{master_port}'.format(
            master_ip='127.0.0.1', master_port='12345')
        world_size = n_gpus
        # The rank has to lie in [0, world_size). The worker index always
        # does, whereas a device id need not (e.g. devices=[2, 3]).
        th.distributed.init_process_group(backend="nccl",
                                          init_method=dist_init_method,
                                          world_size=world_size,
                                          rank=proc_id)
    if n_gpus > 0:
        th.cuda.set_device(dev_id)

    sampler = dgl.dataloading.MultiLayerNeighborSampler([None],
                                                        return_eids=True)
    # One id range per rating edge type of the training graph.
    train_eids = {
        to_etype_name(k): th.arange(
            dataset.train_enc_graph.number_of_edges(etype=to_etype_name(k)))
        for k in dataset.possible_rating_values
    }
    dataloader = dgl.dataloading.EdgeDataLoader(
        dataset.train_enc_graph,
        train_eids,
        sampler,
        use_ddp=n_gpus > 1,
        batch_size=args.minibatch_size,
        shuffle=True,
        drop_last=False)

    # Only worker 0 evaluates, so only it needs the evaluation loaders.
    if proc_id == 0:
        valid_dataloader = dgl.dataloading.EdgeDataLoader(
            dataset.valid_dec_graph,
            th.arange(dataset.valid_dec_graph.number_of_edges()),
            sampler,
            g_sampling=dataset.valid_enc_graph,
            batch_size=args.minibatch_size,
            shuffle=False,
            drop_last=False)
        test_dataloader = dgl.dataloading.EdgeDataLoader(
            dataset.test_dec_graph,
            th.arange(dataset.test_dec_graph.number_of_edges()),
            sampler,
            g_sampling=dataset.test_enc_graph,
            batch_size=args.minibatch_size,
            shuffle=False,
            drop_last=False)

    nd_possible_rating_values = \
        th.FloatTensor(dataset.possible_rating_values)
    nd_possible_rating_values = nd_possible_rating_values.to(dev_id)

    net = Net(args=args, dev_id=dev_id)
    net = net.to(dev_id)
    if n_gpus > 1:
        net = DistributedDataParallel(net,
                                      device_ids=[dev_id],
                                      output_device=dev_id)
    rating_loss_net = nn.CrossEntropyLoss()
    learning_rate = args.train_lr
    optimizer = get_optimizer(args.train_optimizer)(net.parameters(),
                                                    lr=learning_rate)
    print("Loading network finished ...\n")

    ### declare the loss information
    best_valid_rmse = np.inf
    # Initialised so the final report works even if validation never ran.
    best_test_rmse = np.inf
    no_better_valid = 0
    best_epoch = -1
    count_rmse = 0
    count_num = 0
    count_loss = 0
    # Stays None until worker 0 has produced a validation message.
    logging_str = None
    print("Start training ...")
    iter_idx = 1

    for epoch in range(1, args.train_max_epoch):
        if n_gpus > 1:
            dataloader.set_epoch(epoch)
        # The first epoch is not timed.
        if epoch > 1:
            t0 = time.time()
        net.train()
        with tqdm.tqdm(dataloader) as tq:
            for input_nodes, pair_graph, blocks in tq:
                head_feat, tail_feat, blocks = load_subtensor(
                    input_nodes, pair_graph, blocks, dataset,
                    dataset.train_enc_graph)
                frontier = blocks[0]
                compact_g = flatten_etypes(pair_graph, dataset,
                                           'train').to(dev_id)
                true_relation_labels = compact_g.edata['label']
                true_relation_ratings = compact_g.edata['rating']

                head_feat = head_feat.to(dev_id)
                tail_feat = tail_feat.to(dev_id)
                frontier = frontier.to(dev_id)

                pred_ratings = net(compact_g, frontier, head_feat, tail_feat,
                                   dataset.possible_rating_values)
                loss = rating_loss_net(pred_ratings,
                                       true_relation_labels.to(dev_id)).mean()
                count_loss += loss.item()
                optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(net.parameters(),
                                         args.train_grad_clip)
                optimizer.step()

                if proc_id == 0 and iter_idx == 1:
                    print("Total #Param of net: %d" %
                          (torch_total_param_num(net)))

                # Expected rating under the predicted class distribution.
                real_pred_ratings = (
                    th.softmax(pred_ratings, dim=1) *
                    nd_possible_rating_values.view(1, -1)).sum(dim=1)
                # Sum of squared errors; the progress bar shows the running
                # mean of these (no square root is taken here).
                rmse = ((real_pred_ratings -
                         true_relation_ratings.to(dev_id))**2).sum()
                count_rmse += rmse.item()
                count_num += pred_ratings.shape[0]

                tq.set_postfix(
                    {
                        'loss': '{:.4f}'.format(count_loss / iter_idx),
                        'rmse': '{:.4f}'.format(count_rmse / count_num)
                    },
                    refresh=False)

                iter_idx += 1

        if epoch > 1:
            epoch_time = time.time() - t0
            print("Epoch {} time {}".format(epoch, epoch_time))

        stop_training = False
        if epoch % args.train_valid_interval == 0:
            if n_gpus > 1:
                th.distributed.barrier()
            if proc_id == 0:
                valid_rmse = evaluate(args=args,
                                      dev_id=dev_id,
                                      net=net,
                                      dataset=dataset,
                                      dataloader=valid_dataloader,
                                      segment='valid')
                logging_str = 'Val RMSE={:.4f}'.format(valid_rmse)

                if valid_rmse < best_valid_rmse:
                    best_valid_rmse = valid_rmse
                    no_better_valid = 0
                    best_epoch = epoch
                    test_rmse = evaluate(args=args,
                                         dev_id=dev_id,
                                         net=net,
                                         dataset=dataset,
                                         dataloader=test_dataloader,
                                         segment='test')
                    best_test_rmse = test_rmse
                    logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
                else:
                    no_better_valid += 1
                    if no_better_valid > args.train_early_stopping_patience\
                        and learning_rate <= args.train_min_lr:
                        logging.info(
                            "Early stopping threshold reached. Stop training.")
                        stop_training = True
                    elif no_better_valid > args.train_decay_patience:
                        new_lr = max(
                            learning_rate * args.train_lr_decay_factor,
                            args.train_min_lr)
                        if new_lr < learning_rate:
                            logging.info("\tChange the LR to %g" % new_lr)
                            learning_rate = new_lr
                            for p in optimizer.param_groups:
                                p['lr'] = learning_rate
                            no_better_valid = 0
                            print("Change the LR to %g" % new_lr)
            if n_gpus > 1:
                # Worker 0 alone decides on early stopping and LR decay.
                # Share both with every worker; otherwise the others would
                # block forever on the next collective after worker 0 leaves
                # the loop, or keep stepping with a stale learning rate.
                # The broadcast also acts as the post-evaluation sync point.
                sync_state = th.tensor(
                    [float(stop_training), learning_rate],
                    dtype=th.float64,
                    device=dev_id)
                th.distributed.broadcast(sync_state, src=0)
                stop_training = bool(sync_state[0].item())
                learning_rate = sync_state[1].item()
                for p in optimizer.param_groups:
                    p['lr'] = learning_rate
            if stop_training:
                break

        if proc_id == 0 and logging_str is not None:
            print(logging_str)
    if proc_id == 0:
        print(
            'Best epoch Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.
            format(best_epoch, best_valid_rmse, best_test_rmse))
Esempio n. 2
0
def train(args):
    """Train the rating network on the full graph and log metrics to CSV.

    Every iteration runs the network once over the whole training
    encoder/decoder graph, optimises a cross-entropy loss over the rating
    classes, and periodically evaluates on the validation set (and on the
    test set whenever validation improves). Learning-rate decay and early
    stopping are driven by the validation result.

    Args:
        args: Configuration object. Reads the data settings
            (``data_name``, ``device``, ``use_one_hot_fea``,
            ``gcn_agg_norm_symm``, ``data_test_ratio``,
            ``data_valid_ratio``), the output location (``save_dir``,
            ``save_id``) and the ``train_*`` hyper-parameters. The fields
            ``src_in_units``, ``dst_in_units`` and ``rating_vals`` are
            written onto it before the network is built.
    """
    print(args)
    dataset = DataSetLoader(args.data_name, args.device,
                use_one_hot_fea=args.use_one_hot_fea,
                symm=args.gcn_agg_norm_symm,
                test_ratio=args.data_test_ratio,
                valid_ratio=args.data_valid_ratio)

    print("Loading data finished ...\n")

    args.src_in_units = dataset.user_feature_shape[1]
    args.dst_in_units = dataset.movie_feature_shape[1]
    args.rating_vals = dataset.possible_rating_values

    ### build the net
    net = Net(args=args)
    net = net.to(args.device)
    nd_possible_rating_values = th.FloatTensor(dataset.possible_rating_values).to(args.device)
    rating_loss_net = nn.CrossEntropyLoss()
    learning_rate = args.train_lr
    optimizer = get_optimizer(args.train_optimizer)(net.parameters(), lr=learning_rate)
    print("Loading network finished ...\n")

    ### prepare training data
    train_gt_labels = dataset.train_labels
    train_gt_ratings = dataset.train_truths

    ### prepare the logger
    train_loss_logger = MetricLogger(['iter', 'loss', 'rmse'], ['%d', '%.4f', '%.4f'],
                                     os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id))
    valid_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
                                     os.path.join(args.save_dir, 'valid_loss%d.csv' % args.save_id))
    test_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
                                    os.path.join(args.save_dir, 'test_loss%d.csv' % args.save_id))

    # Close the loggers even when training aborts with an exception.
    try:
        ### declare the loss information
        best_valid_rmse = np.inf
        # Initialised so the final report works even if validation never ran.
        best_test_rmse = np.inf
        no_better_valid = 0
        best_iter = -1
        count_rmse = 0
        count_num = 0
        count_loss = 0
        # Initialised because validation may run before the first log line.
        logging_str = ""

        dataset.train_enc_graph = dataset.train_enc_graph.int().to(args.device)
        dataset.train_dec_graph = dataset.train_dec_graph.int().to(args.device)
        # Validation reuses the training encoder graph.
        dataset.valid_enc_graph = dataset.train_enc_graph
        dataset.valid_dec_graph = dataset.valid_dec_graph.int().to(args.device)
        dataset.test_enc_graph = dataset.test_enc_graph.int().to(args.device)
        dataset.test_dec_graph = dataset.test_dec_graph.int().to(args.device)

        print("Start training ...")
        dur = []
        for iter_idx in range(1, args.train_max_iter):
            # The first three iterations are excluded from the timing.
            if iter_idx > 3:
                t0 = time.time()
            net.train()
            pred_ratings = net(dataset.train_enc_graph, dataset.train_dec_graph,
                               dataset.user_feature, dataset.movie_feature)
            loss = rating_loss_net(pred_ratings, train_gt_labels).mean()
            count_loss += loss.item()
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(net.parameters(), args.train_grad_clip)
            optimizer.step()

            if iter_idx > 3:
                dur.append(time.time() - t0)

            if iter_idx == 1:
                print("Total #Param of net: %d" % (torch_total_param_num(net)))
                print(torch_net_info(net, save_path=os.path.join(args.save_dir, 'net%d.txt' % args.save_id)))

            # Expected rating under the predicted class distribution.
            real_pred_ratings = (th.softmax(pred_ratings, dim=1) *
                                 nd_possible_rating_values.view(1, -1)).sum(dim=1)
            # Sum of squared errors; the logged value is the mean of these
            # since the last log line (no square root is taken here).
            rmse = ((real_pred_ratings - train_gt_ratings) ** 2).sum()
            count_rmse += rmse.item()
            count_num += pred_ratings.shape[0]

            if iter_idx % args.train_log_interval == 0:
                # count_loss holds exactly iter_idx summed losses, so the CSV
                # and the console line use the same denominator.
                train_loss_logger.log(iter=iter_idx,
                                      loss=count_loss/iter_idx, rmse=count_rmse/count_num)
                logging_str = "Iter={}, loss={:.4f}, rmse={:.4f}, time={:.4f}".format(
                    iter_idx, count_loss/iter_idx, count_rmse/count_num,
                    np.average(dur))
                count_rmse = 0
                count_num = 0

            if iter_idx % args.train_valid_interval == 0:
                valid_rmse = evaluate(args=args, net=net, dataset=dataset, segment='valid')
                valid_loss_logger.log(iter = iter_idx, rmse = valid_rmse)
                logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse)

                if valid_rmse < best_valid_rmse:
                    best_valid_rmse = valid_rmse
                    no_better_valid = 0
                    best_iter = iter_idx
                    test_rmse = evaluate(args=args, net=net, dataset=dataset, segment='test')
                    best_test_rmse = test_rmse
                    test_loss_logger.log(iter=iter_idx, rmse=test_rmse)
                    logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
                else:
                    no_better_valid += 1
                    if no_better_valid > args.train_early_stopping_patience\
                        and learning_rate <= args.train_min_lr:
                        logging.info("Early stopping threshold reached. Stop training.")
                        break
                    if no_better_valid > args.train_decay_patience:
                        new_lr = max(learning_rate * args.train_lr_decay_factor, args.train_min_lr)
                        if new_lr < learning_rate:
                            learning_rate = new_lr
                            logging.info("\tChange the LR to %g" % new_lr)
                            for p in optimizer.param_groups:
                                p['lr'] = learning_rate
                            no_better_valid = 0
            if iter_idx  % args.train_log_interval == 0:
                print(logging_str)
        print('Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.format(
            best_iter, best_valid_rmse, best_test_rmse))
    finally:
        train_loss_logger.close()
        valid_loss_logger.close()
        test_loss_logger.close()
Esempio n. 3
0
def run(proc_id, n_gpus, args, devices, dataset):
    """Train the rating network with ``GCMCSampler`` minibatches in one worker.

    Training edges are batched by a torch ``DataLoader`` whose collate
    function is ``GCMCSampler.sample_blocks``. The network is trained with a
    cross-entropy loss over the rating classes; every
    ``args.train_valid_interval`` epochs process 0 evaluates on the
    validation set (and on the test set whenever validation improves) and
    drives learning-rate decay and early stopping. With several GPUs that
    decision is broadcast so that all workers stay in lock-step.

    Args:
        proc_id: Index of this worker. Used to pick its device from
            ``devices`` and as its rank in the process group. Worker 0 does
            the evaluation and the final report.
        n_gpus: Number of GPUs in use. ``> 1`` enables ``torch.distributed``
            and ``DistributedDataParallel``; ``> 0`` selects a CUDA device.
        args: Configuration object. Reads ``minibatch_size``,
            ``num_workers_per_gpu``, ``train_lr``, ``train_optimizer``,
            ``train_max_epoch``, ``train_grad_clip``, ``train_log_interval``,
            ``train_valid_interval``, ``train_early_stopping_patience``,
            ``train_decay_patience``, ``train_lr_decay_factor`` and
            ``train_min_lr``.
        devices: Sequence of device ids indexed by ``proc_id``.
        dataset: Object exposing ``train_truths``, ``valid_truths``,
            ``test_truths`` and ``possible_rating_values``; it is also handed
            to ``GCMCSampler`` and ``evaluate``.
    """
    dev_id = devices[proc_id]
    num_edges = dataset.train_truths.shape[0]
    sampler = GCMCSampler(dataset, 'train')

    seeds = th.arange(num_edges)
    dataloader = DataLoader(dataset=seeds,
                            batch_size=args.minibatch_size,
                            collate_fn=sampler.sample_blocks,
                            shuffle=True,
                            pin_memory=True,
                            drop_last=False,
                            num_workers=args.num_workers_per_gpu)

    # Only worker 0 evaluates, so only it needs the evaluation loaders.
    if proc_id == 0:
        valid_sampler = GCMCSampler(dataset, 'valid')
        valid_seeds = th.arange(dataset.valid_truths.shape[0])
        valid_dataloader = DataLoader(dataset=valid_seeds,
                                      batch_size=args.minibatch_size,
                                      collate_fn=valid_sampler.sample_blocks,
                                      shuffle=False,
                                      pin_memory=True,
                                      drop_last=False,
                                      num_workers=args.num_workers_per_gpu)

        test_sampler = GCMCSampler(dataset, 'test')
        test_seeds = th.arange(dataset.test_truths.shape[0])
        test_dataloader = DataLoader(dataset=test_seeds,
                                     batch_size=args.minibatch_size,
                                     collate_fn=test_sampler.sample_blocks,
                                     shuffle=False,
                                     pin_memory=True,
                                     drop_last=False,
                                     num_workers=args.num_workers_per_gpu)

    if n_gpus > 1:
        dist_init_method = 'tcp://{master_ip}:{master_port}'.format(
            master_ip='127.0.0.1', master_port='12345')
        world_size = n_gpus
        # The rank has to lie in [0, world_size). The worker index always
        # does, whereas a device id need not (e.g. devices=[2, 3]).
        th.distributed.init_process_group(backend="nccl",
                                          init_method=dist_init_method,
                                          world_size=world_size,
                                          rank=proc_id)
    if n_gpus > 0:
        th.cuda.set_device(dev_id)

    nd_possible_rating_values = \
        th.FloatTensor(dataset.possible_rating_values)
    nd_possible_rating_values = nd_possible_rating_values.to(dev_id)

    net = Net(args=args, dev_id=dev_id)
    net = net.to(dev_id)
    if n_gpus > 1:
        net = DistributedDataParallel(net,
                                      device_ids=[dev_id],
                                      output_device=dev_id)
    rating_loss_net = nn.CrossEntropyLoss()
    learning_rate = args.train_lr
    optimizer = get_optimizer(args.train_optimizer)(net.parameters(),
                                                    lr=learning_rate)
    print("Loading network finished ...\n")

    ### declare the loss information
    best_valid_rmse = np.inf
    # Initialised so the final report works even if validation never ran.
    best_test_rmse = np.inf
    no_better_valid = 0
    best_epoch = -1
    count_rmse = 0
    count_num = 0
    count_loss = 0
    # Initialised because an epoch may end before the first log interval.
    logging_str = ""
    print("Start training ...")
    iter_idx = 1

    for epoch in range(1, args.train_max_epoch):
        # The first epoch is not timed.
        if epoch > 1:
            t0 = time.time()
        net.train()
        for sample_data in dataloader:
            compact_g, frontier, head_feat, tail_feat, \
                true_relation_labels, true_relation_ratings = sample_data
            head_feat = head_feat.to(dev_id)
            tail_feat = tail_feat.to(dev_id)
            frontier = frontier.to(dev_id)

            pred_ratings = net(compact_g, frontier, head_feat, tail_feat,
                               dataset.possible_rating_values)
            loss = rating_loss_net(pred_ratings,
                                   true_relation_labels.to(dev_id)).mean()
            count_loss += loss.item()
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(net.parameters(), args.train_grad_clip)
            optimizer.step()

            if proc_id == 0 and iter_idx == 1:
                print("Total #Param of net: %d" % (torch_total_param_num(net)))

            # Expected rating under the predicted class distribution.
            real_pred_ratings = (th.softmax(pred_ratings, dim=1) *
                                 nd_possible_rating_values.view(1, -1)).sum(
                                     dim=1)
            # Sum of squared errors; the logged value is the mean of these
            # since the last log line (no square root is taken here).
            rmse = ((real_pred_ratings -
                     true_relation_ratings.to(dev_id))**2).sum()
            count_rmse += rmse.item()
            count_num += pred_ratings.shape[0]

            if iter_idx % args.train_log_interval == 0:
                logging_str = "Iter={}, loss={:.4f}, rmse={:.4f}".format(
                    iter_idx, count_loss / iter_idx, count_rmse / count_num)
                count_rmse = 0
                count_num = 0
                print("[{}] {}".format(proc_id, logging_str))

            iter_idx += 1
        if epoch > 1:
            epoch_time = time.time() - t0
            print("Epoch {} time {}".format(epoch, epoch_time))

        stop_training = False
        if epoch % args.train_valid_interval == 0:
            if n_gpus > 1:
                th.distributed.barrier()
            if proc_id == 0:
                valid_rmse = evaluate(args=args,
                                      dev_id=dev_id,
                                      net=net,
                                      dataset=dataset,
                                      dataloader=valid_dataloader,
                                      segment='valid')
                logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse)

                if valid_rmse < best_valid_rmse:
                    best_valid_rmse = valid_rmse
                    no_better_valid = 0
                    best_epoch = epoch
                    test_rmse = evaluate(args=args,
                                         dev_id=dev_id,
                                         net=net,
                                         dataset=dataset,
                                         dataloader=test_dataloader,
                                         segment='test')
                    best_test_rmse = test_rmse
                    logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
                else:
                    no_better_valid += 1
                    if no_better_valid > args.train_early_stopping_patience\
                        and learning_rate <= args.train_min_lr:
                        logging.info(
                            "Early stopping threshold reached. Stop training.")
                        stop_training = True
                    elif no_better_valid > args.train_decay_patience:
                        new_lr = max(
                            learning_rate * args.train_lr_decay_factor,
                            args.train_min_lr)
                        if new_lr < learning_rate:
                            logging.info("\tChange the LR to %g" % new_lr)
                            learning_rate = new_lr
                            for p in optimizer.param_groups:
                                p['lr'] = learning_rate
                            no_better_valid = 0
                            print("Change the LR to %g" % new_lr)
            if n_gpus > 1:
                # Worker 0 alone decides on early stopping and LR decay.
                # Share both with every worker; otherwise the others would
                # block forever on the next collective after worker 0 leaves
                # the loop, or keep stepping with a stale learning rate.
                # The broadcast also acts as the post-evaluation sync point.
                sync_state = th.tensor(
                    [float(stop_training), learning_rate],
                    dtype=th.float64,
                    device=dev_id)
                th.distributed.broadcast(sync_state, src=0)
                stop_training = bool(sync_state[0].item())
                learning_rate = sync_state[1].item()
                for p in optimizer.param_groups:
                    p['lr'] = learning_rate
            if stop_training:
                break

        if logging_str:
            print(logging_str)
    if proc_id == 0:
        print(
            'Best epoch Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.
            format(best_epoch, best_valid_rmse, best_test_rmse))
Esempio n. 4
0
File: train.py Progetto: ghk829/dgl
def train(args):
    """Train the encoder/decoder with a pairwise ranking loss on sampled negatives.

    On the first iteration one negative item is drawn for every training
    interaction, yielding ``(user, positive item, negative item)`` triples.
    Each iteration then walks over those triples in chunks of ``2**14``,
    scores them with the encoder outputs and ``net.decoder.Q``, and
    optimises the ranking loss plus a small norm penalty. Validation
    metrics are logged every ``args.train_valid_interval`` iterations and
    drive learning-rate decay and early stopping.

    Args:
        args: Configuration object. Reads the data settings
            (``data_name``, ``device``, ``use_one_hot_fea``,
            ``gcn_agg_norm_symm``, ``data_test_ratio``,
            ``data_valid_ratio``), the output location (``save_dir``,
            ``save_id``) and the ``train_*`` hyper-parameters. The fields
            ``src_in_units``, ``dst_in_units`` and ``rating_vals`` are
            written onto it before the network is built.
    """
    # Imported once per call instead of once per training iteration.
    from tqdm import tqdm

    print(args)
    if args.data_name == 'jukebox':
        dataset = JukeboxDataset('dataset/listen_count.txt')
    else:
        dataset = MovieLens(args.data_name, args.device, use_one_hot_fea=args.use_one_hot_fea, symm=args.gcn_agg_norm_symm,
                        test_ratio=args.data_test_ratio, valid_ratio=args.data_valid_ratio)
    print("Loading data finished ...\n")

    args.src_in_units = dataset.user_feature_shape[1]
    args.dst_in_units = dataset.movie_feature_shape[1]
    args.rating_vals = dataset.possible_rating_values

    ### build the net
    net = Net(args=args)
    net = net.to(args.device)
    learning_rate = args.train_lr
    optimizer = get_optimizer(args.train_optimizer)(net.parameters(), lr=learning_rate)
    print("Loading network finished ...\n")

    ### prepare the logger
    # NOTE(review): the format lists below are longer than the attribute
    # lists; presumably MetricLogger ignores the surplus -- confirm against
    # its implementation before trimming them.
    train_loss_logger = MetricLogger(['iter', 'loss'], ['%d', '%.4f', '%.4f'],
                                     os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id))
    valid_loss_logger = MetricLogger(['iter', 'ndcg','precision','recall','fscore','support'], ['%d','%.4f', '%.4f','%s','%s','%s','%s'],
                                     os.path.join(args.save_dir, 'valid_loss%d.csv' % args.save_id))
    test_loss_logger = MetricLogger(['iter'], ['%d', '%.4f'],
                                    os.path.join(args.save_dir, 'test_loss%d.csv' % args.save_id))

    # Close the loggers even when training aborts with an exception.
    try:
        ### declare the loss information
        best_valid_rmse = np.inf
        # Initialised so the final report works even if validation never ran.
        best_test_rmse = np.inf
        no_better_valid = 0
        best_iter = -1
        # No training RMSE is computed in this loop; these placeholders only
        # keep the "rmse=" field of the log line populated.
        count_rmse = 1
        count_num = 1
        count_loss = 0
        count_step = 0
        # Initialised because validation may run before the first log line.
        logging_str = ""

        dataset.train_enc_graph = dataset.train_enc_graph.int().to(args.device)
        dataset.train_dec_graph = dataset.train_dec_graph.int().to(args.device)
        # Validation reuses the training encoder graph.
        dataset.valid_enc_graph = dataset.train_enc_graph
        dataset.valid_dec_graph = dataset.valid_dec_graph.int().to(args.device)
        dataset.test_enc_graph = dataset.test_enc_graph.int().to(args.device)
        dataset.test_dec_graph = dataset.test_dec_graph.int().to(args.device)

        def batch(iterable, n=1):
            """Yield successive lists of at most ``n`` items from ``iterable``."""
            current_batch = []
            for item in iterable:
                current_batch.append(item)
                if len(current_batch) == n:
                    yield current_batch
                    current_batch = []
            if current_batch:
                yield current_batch

        # (user index, positive item index, negative item index) triples.
        batches = []
        # Weight of the norm penalty added to the ranking loss.
        lmbd = 1e-5
        print("Start training ...")
        dur = []
        for iter_idx in range(1, args.train_max_iter):
            # The first three iterations are excluded from the timing.
            if iter_idx > 3:
                t0 = time.time()
            net.train()

            ufeat, ifeat = net.encoder(dataset.train_enc_graph,
                                       dataset.user_feature, dataset.movie_feature)
            if iter_idx == 1:
                # Draw one negative item per training interaction, once.
                unique_item_list = dataset.train['item_id'].unique().tolist()
                # Items seen per user, built in a single pass so the lookup
                # inside the loop is a set membership test instead of a
                # DataFrame filter per row.
                observed_by_user = {}
                for seen_user, seen_item in zip(dataset.train['user_id'].tolist(),
                                                dataset.train['item_id'].tolist()):
                    observed_by_user.setdefault(seen_user, set()).add(seen_item)

                for row in tqdm(list(dataset.train.itertuples())):
                    user, item = row.user_id, row.item_id
                    userid = dataset.global_user_id_map[user]
                    observed = observed_by_user.get(user, ())
                    if len(observed) >= len(unique_item_list):
                        # This user interacted with every item, so no
                        # negative exists; sampling would never terminate.
                        continue
                    while True:
                        sample = random.choice(unique_item_list)
                        if sample not in observed:
                            break
                    batches.append((userid, dataset.global_item_id_map[item], dataset.global_item_id_map[sample]))

            for bt in tqdm(list(batch(batches, 2**14))):
                uidfeat = ufeat[[e[0] for e in bt]]
                posfeat = ifeat[[e[1] for e in bt]]
                negfeat = ifeat[[e[2] for e in bt]]

                # NOTE(review): these products score every user in the chunk
                # against every item in the chunk, not only the matching
                # pairs -- confirm this is intended.
                pos_scores = uidfeat @ net.decoder.Q @ posfeat.T
                neg_scores = uidfeat @ net.decoder.Q @ negfeat.T

                mf_loss = -nn.BCELoss()(th.sigmoid(pos_scores), th.ones_like(pos_scores)) + nn.LogSigmoid()(pos_scores - neg_scores).mean()
                mf_loss = -1 * mf_loss

                regularizer = (th.norm(uidfeat,dim=1)**2).mean() + (th.norm(posfeat,dim=1)**2).mean() + (th.norm(negfeat,dim=1)**2).mean() + (th.norm(net.decoder.Q))
                emb_loss = lmbd * regularizer
                print('mf_loss', mf_loss)
                print('emb_loss', emb_loss)
                optimizer.zero_grad()
                loss = mf_loss + emb_loss
                count_loss += loss.item()
                loss.backward()
                nn.utils.clip_grad_norm_(net.parameters(), args.train_grad_clip)
                optimizer.step()
                # Re-encode with the updated parameters for the next chunk.
                ufeat, ifeat = net.encoder(dataset.train_enc_graph,
                                           dataset.user_feature, dataset.movie_feature)
                count_step += 1

            print('train done')

            if iter_idx > 3:
                dur.append(time.time() - t0)

            if iter_idx == 1:
                print("Total #Param of net: %d" % (torch_total_param_num(net)))
                print(torch_net_info(net, save_path=os.path.join(args.save_dir, 'net%d.txt' % args.save_id)))

            if iter_idx % args.train_log_interval == 0:
                train_loss_logger.log(iter=iter_idx,
                                      loss=count_loss / (count_step + 1))
                logging_str = "Iter={}, loss={:.4f}, rmse={:.4f}, time={:.4f}".format(
                    iter_idx, count_loss/(count_step + 1), count_rmse/count_num,
                    np.average(dur))
                count_rmse = 1
                count_num = 1

            if iter_idx % args.train_valid_interval == 0:
                valid_rmse = evaluate(args=args, net=net, dataset=dataset, segment='valid')
                precision, recall, fscore, support = evaluate_others(args=args, net=net, dataset=dataset, segment='valid')
                ndcg = evaluate_ndcg(args=args, net=net, dataset=dataset, segment='valid')
                print('ndcg', ndcg, 'precision', precision, 'recall', recall, 'fscore', fscore, 'support', support)
                valid_loss_logger.log(iter=iter_idx, ndcg=ndcg, precision=precision, recall=recall, fscore=fscore,
                                      support=support)
                logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse)

                if valid_rmse < best_valid_rmse:
                    best_valid_rmse = valid_rmse
                    no_better_valid = 0
                    best_iter = iter_idx
                    test_rmse = evaluate(args=args, net=net, dataset=dataset, segment='test')
                    best_test_rmse = test_rmse
                    test_loss_logger.log(iter=iter_idx)
                    logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
                else:
                    no_better_valid += 1
                    if no_better_valid > args.train_early_stopping_patience\
                        and learning_rate <= args.train_min_lr:
                        logging.info("Early stopping threshold reached. Stop training.")
                        break
                    if no_better_valid > args.train_decay_patience:
                        new_lr = max(learning_rate * args.train_lr_decay_factor, args.train_min_lr)
                        if new_lr < learning_rate:
                            learning_rate = new_lr
                            logging.info("\tChange the LR to %g" % new_lr)
                            for p in optimizer.param_groups:
                                p['lr'] = learning_rate
                            no_better_valid = 0
            if iter_idx  % args.train_log_interval == 0:
                print(logging_str)
        print('Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.format(
            best_iter, best_valid_rmse, best_test_rmse))
    finally:
        train_loss_logger.close()
        valid_loss_logger.close()
        test_loss_logger.close()
Esempio n. 5
0
def train(args):
    """Train ``Net`` full-batch and report the test NDCG of the best iteration.

    Each iteration runs one forward/backward pass over the whole training
    encoder/decoder graph pair.  Every ``args.train_valid_interval``
    iterations the validation RMSE and ranking metrics are computed; model
    selection uses the first entry returned by ``evaluate_metric`` (logged as
    ``ndcg_20``).  When that value improves, the test split is evaluated and
    logged; otherwise the patience counters drive learning-rate decay and
    early stopping.

    Args:
        args: Run configuration.  Fields read here: ``data_name``, ``device``,
            ``use_one_hot_fea``, ``gcn_agg_norm_symm``, ``data_test_ratio``,
            ``data_valid_ratio``, ``sample_rate``, ``train_lr``,
            ``train_optimizer``, ``save_dir``, ``save_id``, ``train_max_iter``,
            ``loss_func``, ``ARR``, ``train_grad_clip``, ``train_log_interval``,
            ``train_valid_interval``, ``train_early_stopping_patience``,
            ``train_min_lr``, ``train_decay_patience`` and
            ``train_lr_decay_factor``.  ``src_in_units``, ``dst_in_units`` and
            ``rating_vals`` are written onto it from the loaded dataset.

    Raises:
        ValueError: If ``args.loss_func`` is not ``"CE"``, ``"Hinge"`` or
            ``"MSE"``.
    """
    print(args)

    dataset = DataSetLoader(args.data_name,
                            args.device,
                            use_one_hot_fea=args.use_one_hot_fea,
                            symm=args.gcn_agg_norm_symm,
                            test_ratio=args.data_test_ratio,
                            valid_ratio=args.data_valid_ratio,
                            sample_rate=args.sample_rate)

    print("Loading data finished ...\n")

    args.src_in_units = dataset.user_feature_shape[1]
    args.dst_in_units = dataset.movie_feature_shape[1]
    args.rating_vals = dataset.possible_rating_values

    ### build the net
    net = Net(args=args)
    net = net.to(args.device)
    nd_possible_rating_values = th.FloatTensor(
        dataset.possible_rating_values).to(args.device)
    rating_loss_net = nn.CrossEntropyLoss()
    learning_rate = args.train_lr
    optimizer = get_optimizer(args.train_optimizer)(net.parameters(),
                                                    lr=learning_rate)
    print("Loading network finished ...\n")

    ### prepare training data
    train_gt_labels = dataset.train_labels
    train_gt_ratings = dataset.train_truths

    ### prepare the logger
    train_loss_logger = MetricLogger(
        ['iter', 'loss', 'rmse'], ['%d', '%.4f', '%.4f'],
        os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id))
    valid_loss_logger = MetricLogger(
        ['iter', 'rmse', "ndcg_20", "ndcg_40", "ndcg_80"],
        ['%d', '%.4f', '%.4f', '%.4f', '%.4f'],
        os.path.join(args.save_dir, 'valid_loss%d.csv' % args.save_id))
    test_loss_logger = MetricLogger(
        ['iter', 'rmse', "ndcg_20", "ndcg_40", "ndcg_80"],
        ['%d', '%.4f', '%.4f', '%.4f', '%.4f'],
        os.path.join(args.save_dir, 'test_loss%d.csv' % args.save_id))

    ### declare the loss information
    best_valid_ndcg = -np.inf
    # Kept indexable so the final summary works even when no validation step
    # ever improved (or ran).
    best_test_ndcg = [-np.inf] * 3
    no_better_valid = 0
    best_iter = -1
    count_rmse = 0
    count_num = 0
    count_loss = 0
    # Pre-initialised because the validation branch appends to it and may run
    # before the first logging step has assigned it.
    logging_str = ''

    # try/finally so the loggers are closed even when training raises.
    try:
        dataset.train_enc_graph = dataset.train_enc_graph.int().to(args.device)
        dataset.train_dec_graph = dataset.train_dec_graph.int().to(args.device)

        # Validation reuses the training encoder graph.
        dataset.valid_enc_graph = dataset.train_enc_graph
        dataset.valid_dec_graph = dataset.valid_dec_graph.int().to(args.device)
        dataset.valid_recall_dec_graph = dataset.valid_recall_dec_graph.to(
            args.device)

        dataset.test_enc_graph = dataset.test_enc_graph.int().to(args.device)
        dataset.test_dec_graph = dataset.test_dec_graph.int().to(args.device)
        dataset.test_recall_dec_graph = dataset.test_recall_dec_graph.to(
            args.device)

        print("Start training ...")
        dur = []
        # NOTE: starts at 1, so at most ``train_max_iter - 1`` iterations run.
        for iter_idx in range(1, args.train_max_iter):
            # The first three iterations are excluded from the timing average.
            if iter_idx > 3:
                t0 = time.time()
            net.train()
            # Two-stage mode (previously switched on after iteration 250) is
            # disabled: the flag is always False.
            Two_Stage = False
            pred_ratings, reg_loss, _user_out, _movie_out, _W = net(
                dataset.train_enc_graph, dataset.train_dec_graph,
                dataset.user_feature, dataset.movie_feature, Two_Stage)

            if args.loss_func == "CE":
                loss = rating_loss_net(
                    pred_ratings,
                    train_gt_labels).mean() + args.ARR * reg_loss
            elif args.loss_func == "Hinge":
                real_pred_ratings = (th.softmax(pred_ratings, dim=1) *
                                     nd_possible_rating_values.view(
                                         1, -1)).sum(dim=1)
                # NOTE(review): the gap is taken against the label tensor, not
                # ``train_gt_ratings`` - confirm this is intended.
                gap = (real_pred_ratings - train_gt_labels)**2
                loss = th.where(gap > 1.0, gap * gap, gap).mean()
            elif args.loss_func == "MSE":
                loss = th.mean(
                    (pred_ratings[:, 0] -
                     nd_possible_rating_values[train_gt_labels])**
                    2) + args.ARR * reg_loss
            else:
                raise ValueError(
                    "Unsupported loss_func: {!r}".format(args.loss_func))
            count_loss += loss.item()
            optimizer.zero_grad()
            # NOTE(review): a fresh forward pass is made every iteration, so
            # retain_graph=True looks unnecessary - confirm before removing.
            loss.backward(retain_graph=True)
            nn.utils.clip_grad_norm_(net.parameters(), args.train_grad_clip)
            optimizer.step()

            if iter_idx > 3:
                dur.append(time.time() - t0)

            if iter_idx == 1:
                print("Total #Param of net: %d" % (torch_total_param_num(net)))
                print(
                    torch_net_info(net,
                                   save_path=os.path.join(
                                       args.save_dir,
                                       'net%d.txt' % args.save_id)))

            # Scalar rating prediction used for the training error.  For
            # "Hinge" the value computed in the loss branch above is reused.
            if args.loss_func == "CE":
                real_pred_ratings = (th.softmax(pred_ratings, dim=1) *
                                     nd_possible_rating_values.view(
                                         1, -1)).sum(dim=1)
            elif args.loss_func == "MSE":
                real_pred_ratings = pred_ratings[:, 0]
            # Sum of squared errors; the value logged as "rmse" below is this
            # sum divided by the sample count (no square root is applied).
            rmse = ((real_pred_ratings - train_gt_ratings)**2).sum()
            count_rmse += rmse.item()
            count_num += pred_ratings.shape[0]

            if iter_idx % args.train_log_interval == 0:
                # ``count_loss`` holds exactly ``iter_idx`` terms, so the CSV
                # and the console line now share the same running mean.
                mean_loss = count_loss / iter_idx
                mean_sq_err = count_rmse / count_num
                train_loss_logger.log(iter=iter_idx,
                                      loss=mean_loss,
                                      rmse=mean_sq_err)
                logging_str = "Iter={}, loss={:.4f}, rmse={:.4f}, time={:.4f}".format(
                    iter_idx, mean_loss, mean_sq_err,
                    np.average(dur) if dur else np.nan)
                count_rmse = 0
                count_num = 0

            if iter_idx % args.train_valid_interval == 0:
                valid_rmse = evaluate(args=args,
                                      net=net,
                                      dataset=dataset,
                                      segment='valid')
                ndcg_valid = evaluate_metric(args=args,
                                             net=net,
                                             dataset=dataset,
                                             segment='valid',
                                             debug=False)
                print("ndcg_valid:", ndcg_valid)
                valid_loss_logger.log(iter=iter_idx,
                                      rmse=valid_rmse,
                                      ndcg_20=ndcg_valid[0],
                                      ndcg_40=ndcg_valid[1],
                                      ndcg_80=ndcg_valid[2])
                print("-" * 80)

                logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse)
                logging_str += ',\tndcg_valid_20={:.4f}'.format(ndcg_valid[0])
                logging_str += ',\tndcg_valid_40={:.4f}'.format(ndcg_valid[1])
                logging_str += ',\tndcg_valid_80={:.4f}'.format(ndcg_valid[2])

                ndcg_valid_20 = ndcg_valid[0]

                if ndcg_valid_20 > best_valid_ndcg:
                    best_valid_ndcg = ndcg_valid_20
                    print("************best_valid_ndcg:", best_valid_ndcg)
                    print("************ndcg_valid_20:", ndcg_valid_20)
                    no_better_valid = 0
                    best_iter = iter_idx
                    test_rmse = evaluate(args=args,
                                         net=net,
                                         dataset=dataset,
                                         segment='test',
                                         debug=True,
                                         idx=iter_idx)
                    ndcg_test = evaluate_metric(args=args,
                                                net=net,
                                                dataset=dataset,
                                                segment='test',
                                                debug=False)
                    logging_str += ',\tbest ndcg_test={:.4f}'.format(
                        ndcg_test[0])
                    logging_str += ',\tbest ndcg_test={:.4f}'.format(
                        ndcg_test[1])
                    logging_str += ',\tbest ndcg_test={:.4f}'.format(
                        ndcg_test[2])
                    best_test_ndcg = ndcg_test
                    test_loss_logger.log(iter=iter_idx,
                                         rmse=test_rmse,
                                         ndcg_20=ndcg_test[0],
                                         ndcg_40=ndcg_test[1],
                                         ndcg_80=ndcg_test[2])
                else:
                    no_better_valid += 1
                    # Stop only once the LR has already hit its floor.
                    if no_better_valid > args.train_early_stopping_patience\
                        and learning_rate <= args.train_min_lr:
                        logging.info(
                            "Early stopping threshold reached. Stop training.")
                        break
                    if no_better_valid > args.train_decay_patience:
                        new_lr = max(
                            learning_rate * args.train_lr_decay_factor,
                            args.train_min_lr)
                        if new_lr < learning_rate:
                            learning_rate = new_lr
                            logging.info("\tChange the LR to %g", new_lr)
                            for p in optimizer.param_groups:
                                p['lr'] = learning_rate
                            no_better_valid = 0
            if iter_idx % args.train_log_interval == 0:
                print(logging_str)
        print(
            'Best Iter Idx={}, best ndcg_20={:.4f}, best ndcg_40={:.4f}, best ndcg_80={:.4f}'
            .format(best_iter, best_test_ndcg[0], best_test_ndcg[1],
                    best_test_ndcg[2]))
    finally:
        train_loss_logger.close()
        valid_loss_logger.close()
        test_loss_logger.close()
Esempio n. 6
0
def train(args):
    """Train ``Net`` full-batch and report the test RMSE of the best iteration.

    Each iteration runs one forward/backward pass over the whole training
    encoder/decoder graph pair.  Every ``args.train_valid_interval``
    iterations the validation and test splits are evaluated; model selection
    uses the validation value returned by ``evaluate``.  When it does not
    improve, the patience counters drive learning-rate decay and early
    stopping.

    Args:
        args: Run configuration.  Fields read here: ``data_name``, ``device``,
            ``use_one_hot_fea``, ``gcn_agg_norm_symm``, ``data_test_ratio``,
            ``data_valid_ratio``, ``sample_rate``, ``train_lr``,
            ``train_optimizer``, ``save_dir``, ``save_id``, ``train_max_iter``,
            ``loss_func``, ``ARR``, ``train_grad_clip``, ``train_log_interval``,
            ``train_valid_interval``, ``train_early_stopping_patience``,
            ``train_min_lr``, ``train_decay_patience`` and
            ``train_lr_decay_factor``.  ``src_in_units``, ``dst_in_units`` and
            ``rating_vals`` are written onto it from the loaded dataset.

    Raises:
        ValueError: If ``args.loss_func`` is not ``"CE"``, ``"Hinge"`` or
            ``"MSE"``.
    """
    print(args)
    dataset = DataSetLoader(args.data_name, args.device,
                use_one_hot_fea=args.use_one_hot_fea,
                symm=args.gcn_agg_norm_symm,
                test_ratio=args.data_test_ratio,
                valid_ratio=args.data_valid_ratio,
                sample_rate = args.sample_rate)
    print("Loading data finished ...\n")

    args.src_in_units = dataset.user_feature_shape[1]
    args.dst_in_units = dataset.movie_feature_shape[1]
    args.rating_vals = dataset.possible_rating_values

    ### build the net
    net = Net(args=args)
    net = net.to(args.device)
    nd_possible_rating_values = th.FloatTensor(dataset.possible_rating_values).to(args.device)
    rating_loss_net = nn.CrossEntropyLoss()
    learning_rate = args.train_lr
    optimizer = get_optimizer(args.train_optimizer)(net.parameters(), lr=learning_rate)
    print("Loading network finished ...\n")

    ### prepare training data
    train_gt_labels = dataset.train_labels
    train_gt_ratings = dataset.train_truths

    ### prepare the logger
    # Unlike 'net%d.txt' below, these file names do not include args.save_id.
    train_loss_logger = MetricLogger(['iter', 'loss', 'rmse'], ['%d', '%.4f', '%.4f'],
                                     os.path.join(args.save_dir, 'train_loss.csv'))
    valid_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
                                     os.path.join(args.save_dir, 'valid_loss.csv'))
    test_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
                                    os.path.join(args.save_dir, 'test_loss.csv'))

    ### declare the loss information
    best_valid_rmse = np.inf
    # Initialised so the final summary works even when no validation step
    # ever improved (or ran).
    best_test_rmse = np.inf
    no_better_valid = 0
    best_iter = -1
    count_rmse = 0
    count_num = 0
    count_loss = 0
    # Pre-initialised because the validation branch appends to it and may run
    # before the first logging step has assigned it.
    logging_str = ''

    # try/finally so the loggers are closed even when training raises.
    try:
        dataset.train_enc_graph = dataset.train_enc_graph.int().to(args.device)
        dataset.train_dec_graph = dataset.train_dec_graph.int().to(args.device)
        # Validation reuses the training encoder graph.
        dataset.valid_enc_graph = dataset.train_enc_graph
        dataset.valid_dec_graph = dataset.valid_dec_graph.int().to(args.device)
        dataset.test_enc_graph = dataset.test_enc_graph.int().to(args.device)
        dataset.test_dec_graph = dataset.test_dec_graph.int().to(args.device)

        print("Start training ...")
        dur = []

        # ------------------------------------------------------------------
        # Negative-sampling scaffolding.  The active training loop below does
        # not call these helpers.  Design notes from the original author:
        #   * First collect the zero (negative) samples of every user; this
        #     only has to be done once.
        #   * Then, each time, randomly draw some of those samples to serve
        #     as zero-rated edges.
        #   * Start from the most basic encoder graph and add edges to it,
        #     which requires rebuilding ``train_enc_graph``.
        #   * One-shot sampling has an acceptable time cost; per-iteration
        #     sampling would draw from a stored table of all negatives.
        # ------------------------------------------------------------------
        train_rating_pairs, train_rating_values = dataset._generate_pair_value(dataset.train_rating_info)

        def update_encode_graph(dataset, train_rating_pairs, train_rating_values, sampled_data):
            """Rebuild the training encoder graph with sampled zero edges added.

            The zero-rated pairs produced by
            ``dataset._generate_pair_value_for_zero`` are appended to the given
            pairs/values, a new encoder graph is generated from the result and
            stored on ``dataset`` as both ``train_enc_graph`` and
            ``valid_enc_graph``.  Returns the new training encoder graph.
            """
            train_rating_pairs_zeros, train_rating_values_zeros = dataset._generate_pair_value_for_zero(dataset.train_rating_info, sampled_data)
            train_rating_pairs = (np.append(train_rating_pairs[0], train_rating_pairs_zeros[0]), np.append(train_rating_pairs[1], train_rating_pairs_zeros[1]))
            train_rating_values = np.append(train_rating_values, train_rating_values_zeros)
            dataset.train_enc_graph = dataset._generate_enc_graph(train_rating_pairs, train_rating_values, add_support = True)
            dataset.train_enc_graph = dataset.train_enc_graph.int().to(args.device)
            dataset.valid_enc_graph = dataset.train_enc_graph
            return dataset.train_enc_graph

        def sample_data(interact_status, random_number, sample_rate):
            """Draw ``sample_rate`` negative items per row of ``interact_status``.

            Reseeds the global ``random`` module with ``random_number`` and
            writes a ``negative_samples`` column into ``interact_status`` in
            place (presumably a pandas DataFrame - verify against caller).
            ``random.sample`` raises ``ValueError`` when a row holds fewer than
            ``sample_rate`` negative items.
            """
            random.seed(random_number)
            interact_status['negative_samples'] = interact_status['negative_items'].apply(lambda x: random.sample(x, sample_rate))
            return interact_status[['user_id', 'negative_items', 'negative_samples']]

        # One seed per iteration for the (disabled) per-iteration sampling.
        # Kept because drawing it advances the global NumPy RNG state.
        seed_list = np.random.randint(0, 10000, (args.train_max_iter,))

        # NOTE: starts at 1, so at most ``train_max_iter - 1`` iterations run.
        for iter_idx in range(1, args.train_max_iter):
            # The first three iterations are excluded from the timing average.
            if iter_idx > 3:
                t0 = time.time()
            net.train()
            # Two-stage mode (previously switched on after iteration 250) is
            # disabled: the flag is always False.
            Two_Stage = False
            pred_ratings, reg_loss, _user_out, _movie_out, _W = net(
                dataset.train_enc_graph, dataset.train_dec_graph,
                dataset.user_feature, dataset.movie_feature, Two_Stage)

            if args.loss_func == "CE":
                loss = rating_loss_net(pred_ratings, train_gt_labels).mean() + args.ARR * reg_loss
            elif args.loss_func == "Hinge":
                real_pred_ratings = (th.softmax(pred_ratings, dim=1) *
                                    nd_possible_rating_values.view(1, -1)).sum(dim=1)
                # NOTE(review): the gap is taken against the label tensor, not
                # ``train_gt_ratings`` - confirm this is intended.
                gap = (real_pred_ratings - train_gt_labels) ** 2
                loss = th.where(gap > 1.0, gap*gap, gap).mean()
            elif args.loss_func == "MSE":
                loss = th.mean((pred_ratings[:, 0] - nd_possible_rating_values[train_gt_labels]) ** 2) + args.ARR * reg_loss
            else:
                raise ValueError(
                    "Unsupported loss_func: {!r}".format(args.loss_func))
            count_loss += loss.item()
            optimizer.zero_grad()
            # NOTE(review): a fresh forward pass is made every iteration, so
            # retain_graph=True looks unnecessary - confirm before removing.
            loss.backward(retain_graph=True)
            nn.utils.clip_grad_norm_(net.parameters(), args.train_grad_clip)
            optimizer.step()

            if iter_idx > 3:
                dur.append(time.time() - t0)

            if iter_idx == 1:
                print("Total #Param of net: %d" % (torch_total_param_num(net)))
                print(torch_net_info(net, save_path=os.path.join(args.save_dir, 'net%d.txt' % args.save_id)))

            # Scalar rating prediction used for the training error.  For
            # "Hinge" the value computed in the loss branch above is reused.
            if args.loss_func == "CE":
                real_pred_ratings = (th.softmax(pred_ratings, dim=1) *
                                    nd_possible_rating_values.view(1, -1)).sum(dim=1)
            elif args.loss_func == "MSE":
                real_pred_ratings = pred_ratings[:, 0]

            # Sum of squared errors; the value logged as "rmse" below is this
            # sum divided by the sample count (no square root is applied).
            rmse = ((real_pred_ratings - train_gt_ratings) ** 2).sum()
            count_rmse += rmse.item()
            count_num += pred_ratings.shape[0]

            if iter_idx % args.train_log_interval == 0:
                # ``count_loss`` holds exactly ``iter_idx`` terms, so the CSV
                # and the console line now share the same running mean.
                mean_loss = count_loss / iter_idx
                mean_sq_err = count_rmse / count_num
                train_loss_logger.log(iter=iter_idx,
                                      loss=mean_loss, rmse=mean_sq_err)
                logging_str = "Iter={}, loss={:.4f}, rmse={:.4f}, time={:.4f}".format(
                    iter_idx, mean_loss, mean_sq_err,
                    np.average(dur) if dur else np.nan)
                count_rmse = 0
                count_num = 0

            if iter_idx % args.train_valid_interval == 0:
                valid_rmse = evaluate(args=args, net=net, dataset=dataset, segment='valid')
                valid_loss_logger.log(iter = iter_idx, rmse = valid_rmse)
                test_rmse = evaluate(args=args, net=net, dataset=dataset, segment='test')
                logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
                test_loss_logger.log(iter=iter_idx, rmse=test_rmse)
                logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse)
                dev_step(args, net, dataset=dataset, segment='test', debug = False)

                if valid_rmse < best_valid_rmse:
                    best_valid_rmse = valid_rmse
                    no_better_valid = 0
                    best_iter = iter_idx
                    # NOTE(review): the test split was already evaluated and
                    # logged above for this iteration; this second pass runs
                    # with debug=True and logs a second row - confirm the
                    # duplicate is wanted.
                    test_rmse = evaluate(args=args, net=net, dataset=dataset, segment='test', debug = True, idx = iter_idx)

                    best_test_rmse = test_rmse
                    test_loss_logger.log(iter=iter_idx, rmse=test_rmse)
                    logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
                else:
                    no_better_valid += 1
                    # Stop only once the LR has already hit its floor.
                    if no_better_valid > args.train_early_stopping_patience\
                        and learning_rate <= args.train_min_lr:
                        logging.info("Early stopping threshold reached. Stop training.")
                        break
                    if no_better_valid > args.train_decay_patience:
                        new_lr = max(learning_rate * args.train_lr_decay_factor, args.train_min_lr)
                        if new_lr < learning_rate:
                            learning_rate = new_lr
                            logging.info("\tChange the LR to %g", new_lr)
                            for p in optimizer.param_groups:
                                p['lr'] = learning_rate
                            no_better_valid = 0
            if iter_idx  % args.train_log_interval == 0:
                print(logging_str)
        print('Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.format(
            best_iter, best_valid_rmse, best_test_rmse))
    finally:
        train_loss_logger.close()
        valid_loss_logger.close()
        test_loss_logger.close()