Example #1
def test_InferDataset():
    config_file = "../../../config.yaml"
    ip_list_file = "../../../ip_list.txt"
    config = prepare_config(config_file)

    ds = InferDataset(config, ip_list_file)
    loader = Dataloader(ds, batch_size=1, num_workers=1)
    for data in loader:
        print(data[0])
        break
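`InferDataset` is defined elsewhere in the project. As a minimal sketch of a map-style dataset that works with PGL's `Dataloader` (the class and data below are illustrative, assuming the `pgl.utils.data.Dataset` interface of `__getitem__` plus `__len__`):

import numpy as np
from pgl.utils.data import Dataset, Dataloader

class ToyInferDataset(Dataset):
    """Hypothetical stand-in for InferDataset: one feature row per example."""

    def __init__(self, num_examples=8, feat_dim=4):
        self.feats = np.random.rand(num_examples, feat_dim).astype("float32")

    def __getitem__(self, idx):
        return self.feats[idx]

    def __len__(self):
        return len(self.feats)

loader = Dataloader(ToyInferDataset(), batch_size=1, num_workers=1)
for data in loader:
    print(data[0])
    break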
Example #2
def test_PairDataset():
    config_file = "../../../config.yaml"
    ip_list_file = "../../../ip_list.txt"
    config = prepare_config(config_file)

    ds = TrainPairDataset(config, ip_list_file)

    loader = Dataloader(ds,
                        batch_size=4,
                        num_workers=1,
                        stream_shuffle_size=100,
                        collate_fn=CollateFn())
    pairs = []
    start = time.time()
    for batch_data in loader:
        pairs.extend(batch_data)
        print(batch_data)
        time.sleep(10)
    print("total time: %s" % (time.time() - start))
Example #3
File: test.py Project: WenjinW/PGL
                             num_workers=1,
                             collate_fn=DS.CollateFn(config))

    ### automatic evaluator; takes the dataset name as input
    evaluator = PCQM4MEvaluator()

    # ---------------- test ----------------------- #
    log.info("testing ...")
    pred_dict = evaluate(model, test_loader)

    test_output_path = os.path.join(config.output_dir, config.task_name)
    make_dir(test_output_path)
    test_output_file = os.path.join(test_output_path, "test_pred.npz")

    log.info("saving test result to %s" % test_output_file)
    np.savez_compressed(test_output_file,
                        pred_dict['y_pred'].astype(np.float32))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='gnn')
    parser.add_argument("--config", type=str, default="./config.yaml")
    parser.add_argument("--task_name", type=str, default="task_name")
    parser.add_argument("--mode", type=str, default="train")
    parser.add_argument("--output_path", type=str, default="./")
    args = parser.parse_args()

    config = prepare_config(args.config, isCreate=False, isSave=False)
    make_dir(args.output_path)
    infer(config, args.output_path)
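Since the prediction array is passed to `np.savez_compressed` positionally, it is stored under the default key "arr_0" and can be reloaded like this:

import numpy as np

data = np.load("test_pred.npz")  # file written by the script above
y_pred = data["arr_0"]           # the float32 predictions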
Example #4
    ds = MolDataset(config)
    split_idx = ds.get_idx_split()
    train_ds = Subset(ds, split_idx['train'], mode='train')
    valid_ds = Subset(ds, split_idx['valid'], mode='valid')
    test_ds = Subset(ds, split_idx['test'], mode='test')
    print("Train exapmles: %s" % len(train_ds))
    print("Valid exapmles: %s" % len(valid_ds))
    print("Test exapmles: %s" % len(test_ds))

    for i in range(len(train_ds)):
        gdata = train_ds[i]
        print("nfeat: ", np.sum(gdata['node_feat']))
        print("edges: ", np.sum(gdata['edge_index']))
        print("label: ", gdata['label'])
        if i == 10:
            break

    print("valid data")
    for i in range(len(valid_ds)):
        gdata = valid_ds[i]
        print("nfeat: ", np.sum(gdata['node_feat']))
        print("edges: ", np.sum(gdata['edge_index']))
        print("label: ", gdata['label'])
        if i == 10:
            break


if __name__ == "__main__":
    config = prepare_config("./config.yaml", isCreate=False, isSave=False)
    test_dataset(config)
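`Subset` here is the usual index-view pattern over the full `MolDataset`: each split keeps only a list of indices into the parent dataset. A minimal sketch of the idea, leaving out the project-specific `mode` argument:

class Subset:
    """Read-only view of a parent dataset restricted to the given indices."""

    def __init__(self, dataset, indices):
        self.dataset = dataset
        self.indices = indices

    def __getitem__(self, idx):
        return self.dataset[self.indices[idx]]

    def __len__(self):
        return len(self.indices)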
Example #5
File: main.py Project: WenjinW/PGL
import paddle.nn.functional as F
import paddle.distributed as dist

import pgl
from pgl.utils.data import Dataloader
from pgl.utils.logger import log

from ogb.lsc import PCQM4MEvaluator
from ogb.utils import smiles2graph

from utils.config import prepare_config, make_dir
from utils.logger import prepare_logger, log_to_file
import model as M
import dataset as DS

config = prepare_config("./config.yaml", isCreate=False, isSave=False)
env = dist.ParallelEnv()
rank = env.rank
ip_address = config.ip_address.split(',')
os.environ['PADDLE_CURRENT_ENDPOINT'] = ip_address[rank]
os.environ['PADDLE_TRAINER_ENDPOINTS'] = config.ip_address

reg_criterion = paddle.nn.loss.L1Loss()


def data2tensor(batch_dict):
    feed_dict = {}
    for key, value in batch_dict.items():
        if isinstance(value, pgl.Graph):
            feed_dict[key] = value.tensor()
        elif isinstance(value, np.ndarray):
            # plain numpy arrays are converted to paddle tensors
            feed_dict[key] = paddle.to_tensor(value)
    return feed_dict
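With the completion above, `data2tensor` turns each numpy batch from the loader into paddle tensors right before the forward pass; a toy call (the key name is illustrative):

import numpy as np

feed = data2tensor({"labels": np.zeros([4, 1], dtype="float32")})
print(type(feed["labels"]))  # now a paddle Tensor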
Example #6
            writer.write("\t".join(item) + "\n")

    p2c_edges_file = os.path.join(config.processed_path,
                                  'paper2conf_edges.txt')
    log.info("saving paper2conf edges to %s" % p2c_edges_file)
    with open(p2c_edges_file, 'w') as writer:
        for item in tqdm.tqdm(paper2conf_edges):
            writer.write("\t".join(item) + "\n")

    author_label_file = os.path.join(config.processed_path, 'author_label.txt')
    log.info("saving author label to %s" % author_label_file)
    with open(author_label_file, 'w') as writer:
        for item in tqdm.tqdm(author_label):
            writer.write("\t".join(item) + "\n")

    conf_label_file = os.path.join(config.processed_path, 'conf_label.txt')
    log.info("saving conf label to %s" % conf_label_file)
    with open(conf_label_file, 'w') as writer:
        for item in tqdm.tqdm(conf_label):
            writer.write("\t".join(item) + "\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='metapath2vec')
    parser.add_argument('--config', default="./config.yaml", type=str)
    args = parser.parse_args()

    config = prepare_config(args.config)

    main(config)
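The four save blocks above repeat one write pattern; a small helper (a refactoring sketch, not part of the original script) captures it:

def save_tsv(path, rows, desc):
    """Write an iterable of string tuples as tab-separated lines."""
    log.info("saving %s to %s" % (desc, path))
    with open(path, "w") as writer:
        for item in tqdm.tqdm(rows):
            writer.write("\t".join(item) + "\n")

save_tsv(conf_label_file, conf_label, "conf label")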
Example #7
def main(data_path, data, out_path, config_path, eval_every):
    start_time = time.time()

    # read paths
    trn_data = os.path.join(data_path, f'MIND{data}_train')
    vld_data = os.path.join(data_path, f'MIND{data}_dev')
    util_data = os.path.join(data_path, 'utils')

    trn_paths = set_data_paths(trn_data)
    vld_paths = set_data_paths(vld_data)
    util_paths = set_util_paths(util_data)

    trn_pickle_path = os.path.join(trn_data, 'dataset.pickle')
    vld_pickle_path = os.path.join(vld_data, 'dataset.pickle')

    # read configuration file
    config = prepare_config(config_path,
                            wordEmb_file=util_paths['embedding'],
                            wordDict_file=util_paths['word_dict'],
                            userDict_file=util_paths['uid2index'])

    # out path
    num_global = config['pop'] # 7
    num_fresh = config['fresh'] # 1 
    out_path = os.path.join(out_path, f'MIND{data}_dev_pop{num_global}_fresh{num_fresh}')
    os.makedirs(out_path, exist_ok=True)

    # set the random seed and basic training settings
    seed = config['seed']
    set_seed(seed)
    epochs = config['epochs']
    metrics = {metric: 0. for metric in config['metrics']}

    # load dictionaries
    word2idx = load_dict(config['wordDict_file'])
    uid2idx = load_dict(config['userDict_file'])

    # load datasets and define dataloaders
    if os.path.exists(trn_pickle_path):
        with open(trn_pickle_path, 'rb') as f:
            trn_set = pickle.load(f)
    else:
        trn_selector = NewsSelector(data_type1=data, data_type2='train',
                                    num_pop=20,
                                    num_fresh=20)
        trn_set = DataSetTrn(trn_paths['news'], trn_paths['behaviors'],
                             word2idx=word2idx, uid2idx=uid2idx,
                             selector=trn_selector, config=config)
        with open(trn_pickle_path, 'wb') as f:
            pickle.dump(trn_set, f)

    if os.path.exists(vld_pickle_path):
        with open(vld_pickle_path, 'rb') as f:
            vld_set = pickle.load(f)
    else:
        vld_selector = NewsSelector(data_type1=data, data_type2='dev',
                                    num_pop=20,
                                    num_fresh=20)
        vld_set = DataSetTest(vld_paths['news'], vld_paths['behaviors'],
                              word2idx=word2idx, uid2idx=uid2idx,
                              selector=vld_selector, config=config,
                              label_known=True)
        with open(vld_pickle_path, 'wb') as f:
            pickle.dump(vld_set, f)

    trn_loader = DataLoader(trn_set, batch_size=config['batch_size'],
                            shuffle=True, num_workers=8)
    vld_impr_idx, vld_his, vld_impr, vld_label, vld_pop, vld_fresh =\
        vld_set.raw_impr_idxs, vld_set.histories_words, vld_set.imprs_words,\
        vld_set.labels, vld_set.pops_words, vld_set.freshs_words

    # define models, optimizer, loss
    # TODO: w2v --> BERT model
    word2vec_emb = np.load(config['wordEmb_file'])
    model = NRMS(config, word2vec_emb).to(DEVICE)
    optimizer = optim.Adam(model.parameters(), lr=float(config['learning_rate']),
                           weight_decay=float(config['weight_decay']))
    criterion = nn.CrossEntropyLoss()

    print(f'[{time.time()-start_time:5.2f} Sec] Ready for training...')

    # train and evaluate
    for epoch in range(1, epochs+1):
        start_time = time.time()
        batch_loss = 0.
        '''
        training
        '''
        for i, (trn_his, trn_pos, trn_neg, trn_pop, trn_fresh) \
                in tqdm(enumerate(trn_loader), desc='Training', total=len(trn_loader)):
            # ready for training
            model.train()
            optimizer.zero_grad()

            # prepare data
            trn_his, trn_pos, trn_neg, trn_pop, trn_fresh = \
                trn_his.to(DEVICE), trn_pos.to(DEVICE), trn_neg.to(DEVICE),\
                trn_pop.to(DEVICE), trn_fresh.to(DEVICE)
            trn_pop = trn_pop[:, :config['pop'], :]
            trn_fresh = trn_fresh[:, :config['fresh'], :]
            trn_cand = torch.cat((trn_pos, trn_neg), dim=1)
            trn_global = torch.cat((trn_pop, trn_fresh), dim=1)
            trn_gt = torch.zeros(size=(trn_cand.shape[0],)).long().to(DEVICE)

            # inference
            if config['global']:
                trn_user_out = model((trn_his, trn_global), source='pgt')
            else:
                trn_user_out = model(trn_his, source='history')
            trn_cand_out = model(trn_cand, source='candidate')
            prob = torch.matmul(trn_cand_out, trn_user_out.unsqueeze(2)).squeeze()

            # training
            loss = criterion(prob, trn_gt)
            loss.backward()
            optimizer.step()
            batch_loss += loss.item()

        inter_time = time.time()
        epoch_loss = batch_loss/(i+1)

        if epoch % eval_every != 0:
            result = f'Epoch {epoch:3d} [{inter_time - start_time:5.2f}Sec]' \
                     f', TrnLoss:{epoch_loss:.4f}'
            print(result)
            continue

        '''
        evaluation
        '''
        with open(os.path.join(out_path, f'prediction-{epoch}.txt'), 'w') as f:
            for j in tqdm(range(len(vld_impr)), desc='Evaluation', total=len(vld_impr)):
                impr_idx_j = vld_impr_idx[j]
                vld_his_j = torch.tensor(vld_his[j]).long().to(DEVICE).unsqueeze(0)
                vld_pop_j = torch.tensor(vld_pop[j]).long().to(DEVICE).unsqueeze(0)
                vld_fresh_j = torch.tensor(vld_fresh[j]).long().to(DEVICE).unsqueeze(0)
                vld_pop_j = vld_pop_j[:, :config['pop'], :]
                vld_fresh_j = vld_fresh_j[:, :config['fresh'], :]
                vld_global_j = torch.cat((vld_pop_j, vld_fresh_j), dim=1)
                if config['global']:
                    vld_user_out_j = model((vld_his_j, vld_global_j), source='pgt')
                else:
                    vld_user_out_j = model(vld_his_j, source='history')
                vld_cand_j = torch.tensor(vld_impr[j]).long().to(DEVICE).unsqueeze(0)
                vld_cand_out_j = model(vld_cand_j, source='candidate')

                scores_j = torch.matmul(vld_cand_out_j, vld_user_out_j.unsqueeze(2)).squeeze()
                scores_j = scores_j.detach().cpu().numpy()
                argmax_idx = (-scores_j).argsort()
                ranks = np.empty_like(argmax_idx)
                ranks[argmax_idx] = np.arange(1, scores_j.shape[0]+1)
                ranks_str = ','.join([str(r) for r in list(ranks)])
                f.write(f'{impr_idx_j} [{ranks_str}]\n')

                vld_gt_j = np.array(vld_label[j])

                for metric, _ in metrics.items():
                    if metric == 'auc':
                        score = roc_auc_score(vld_gt_j, scores_j)
                        metrics[metric] += score
                    elif metric == 'mrr':
                        score = mrr_score(vld_gt_j, scores_j)
                        metrics[metric] += score
                    elif metric.startswith('ndcg'):  # format like: ndcg@5;10
                        k = int(metric.split('@')[1])
                        score = ndcg_score(vld_gt_j, scores_j, k=k)
                        metrics[metric] += score

        for metric, _ in metrics.items():
            metrics[metric] /= len(vld_impr)

        end_time = time.time()

        result = f'Epoch {epoch:3d} [{inter_time - start_time:5.2f} / {end_time - inter_time:5.2f} Sec]' \
                 f', TrnLoss:{epoch_loss:.4f}, '
        for enum, (metric, _) in enumerate(metrics.items(), start=1):
            result += f'{metric}:{metrics[metric]:.4f}'
            if enum < len(metrics):
                result += ', '
        print(result)
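The rank computation inside the evaluation loop is a compact numpy idiom; a worked micro-example of the same three lines:

import numpy as np

scores = np.array([0.2, 0.9, 0.5])
order = (-scores).argsort()             # indices by descending score -> [1, 2, 0]
ranks = np.empty_like(order)
ranks[order] = np.arange(1, len(scores) + 1)
print(ranks)                            # [3 1 2]: item 1 ranks first, item 0 last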
Example #8
    log.info("saving features")
    np.save(
        "dataset/%s/soft_mgf_feat.npy" % (args.dataset_name.replace("-", "_")),
        mgf)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='gnn')
    parser.add_argument("--config", type=str, default="./config.yaml")
    parser.add_argument("--task_name", type=str, default="task_name")
    parser.add_argument("--infer_model", type=str, default=None)
    parser.add_argument("--log_id", type=str, default=None)
    args = parser.parse_args()

    if args.infer_model is not None:
        config = prepare_config(args.config, isCreate=False, isSave=False)
        config.model_path_for_infer = args.infer_model
        infer(config)
    else:
        config = prepare_config(args.config, isCreate=True, isSave=True)

        log_to_file(log, config.log_dir, config.log_filename)

        if config.warm_start_from is not None:
            log.info("loading model config from %s" %
                     config.pretrained_config_file)
            pretrained_config = prepare_config(config.pretrained_config_file)
            pretrained_model_config = pretrained_config.pretrained_model_config
        else:
            pretrained_model_config = config.model_config
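The features saved at the top of this example can be reloaded with the matching numpy call (the dataset name below is hypothetical, mirroring the `args.dataset_name.replace("-", "_")` in the save path):

import numpy as np

dataset_name = "ogbg-molhiv".replace("-", "_")  # illustrative value
mgf = np.load("dataset/%s/soft_mgf_feat.npy" % dataset_name)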