Example #1
def test_samples_independence_in_batch():
    vocab = {'cat': 1, 'dog': 2, 'bird': 3}  # 0 reserved for padding
    model = HAN(vocab=vocab, freeze_emb=True, load_glove=False)
    model.eval()

    def get_rand_sample() -> torch.Tensor:
        return torch.randint(low=1,
                             high=len(vocab) + 1,  # high is exclusive; +1 lets every vocab id 1..3 appear
                             size=(1, 12, 10),
                             dtype=torch.int64)

    n_test = 5
    for _ in range(n_test):
        x_sample = get_rand_sample()
        y_sample = get_rand_sample()
        z_sample = get_rand_sample()

        # z_sample sits at index 1 in both batches, so its outputs must match
        # regardless of which other sample shares the batch
        batch_a = torch.cat([x_sample, z_sample])
        batch_b = torch.cat([y_sample, z_sample])

        output_a = model(batch_a)
        output_b = model(batch_b)

        for key in ['logits', 'w_scores', 's_scores']:
            # `torch.allclose` returns a bool; without `assert` the check was silently discarded
            assert torch.allclose(output_a[key][1], output_b[key][1])
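The tests here imply a fixed input contract: an integer tensor of shape (n_docs, n_sentences, n_words) with 0 reserved for padding. A minimal sketch of building such a batch by hand, under that assumption:

import torch

# Hypothetical tokenized document: two sentences of word ids.
vocab = {'cat': 1, 'dog': 2, 'bird': 3}
doc = [['cat', 'dog'], ['bird']]

n_snt, n_wrd = 2, 3  # pad/clip targets
batch = torch.zeros((1, n_snt, n_wrd), dtype=torch.int64)  # 0 is the pad id
for i, sent in enumerate(doc):
    for j, word in enumerate(sent):
        batch[0, i, j] = vocab[word]
# batch.shape == (1, 2, 3), ready to pass to model(batch)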
Example #2
def test_batch_size() -> None:
    # smoke test: a large batch must survive a single forward pass on GPU
    bs = 384

    vocab = ImdbReviewsDataset.get_imdb_vocab(IMBD_ROOT)
    model = HAN(vocab=vocab, freeze_emb=True, load_glove=False)
    model.cuda()

    batch = torch.ones((bs, TXT_CLIP, SNT_CLIP), dtype=torch.int64).cuda()
    model(batch)
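As written, the test assumes a CUDA device and fails on CPU-only machines. A guarded variant (a sketch reusing the same names, with a `pytest` skip and `torch.no_grad`, since a smoke test needs no autograd graph):

import pytest
import torch

@pytest.mark.skipif(not torch.cuda.is_available(), reason='requires CUDA')
def test_batch_size_guarded() -> None:
    bs = 384
    vocab = ImdbReviewsDataset.get_imdb_vocab(IMBD_ROOT)
    model = HAN(vocab=vocab, freeze_emb=True, load_glove=False).cuda()
    batch = torch.ones((bs, TXT_CLIP, SNT_CLIP), dtype=torch.int64).cuda()
    with torch.no_grad():
        model(batch)  # forward only; raises (e.g. OOM) if the batch does not fit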
Example #3
def main(args: Namespace) -> None:
    set_global_seed(args.seed)
    is_wandb = setup_wandb()

    train_loader, test_loader, vocab = get_loaders(batch_size=args.batch_size)
    loaders = OrderedDict([('train', train_loader), ('valid', test_loader)])

    model = HAN(vocab=vocab, freeze_emb=args.freeze_emb)

    criterion = torch.nn.BCELoss()  # expects probabilities; HAN's 'logits' already lie in [0, 1]
    optimizer = torch.optim.SGD(lr=1e-2,
                                momentum=.9,
                                params=model.parameters())
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)

    if is_wandb:
        Runner = runner_pkg.SupervisedWandbRunner
        extra_args = {'monitoring_params': {'project': 'neuro_imdb'}}
    else:
        Runner = runner_pkg.SupervisedRunner
        extra_args = {}

    runner = Runner(input_key='features',
                    output_key=None,
                    input_target_key='targets',
                    device=args.device if is_available() else tdevice('cpu'))

    callbacks = [
        clb.AccuracyCallback(prefix='accuracy',
                             input_key='targets',
                             output_key='logits',
                             accuracy_args=[1],
                             threshold=.5,
                             num_classes=1,
                             activation=None),
        clb.EarlyStoppingCallback(patience=5,
                                  minimize=False,
                                  min_delta=0.02,
                                  metric='accuracy01')
    ]
    runner.train(model=model,
                 criterion=criterion,
                 optimizer=optimizer,
                 scheduler=scheduler,
                 loaders=loaders,
                 logdir=str(args.logdir),
                 num_epochs=args.n_epoch,
                 verbose=True,
                 main_metric='accuracy01',
                 valid_loader='valid',
                 callbacks=callbacks,
                 minimize_metric=False,
                 checkpoint_data={'params': model.init_params},
                 **extra_args)
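`setup_wandb` is not shown in this excerpt; a hypothetical sketch of what such a helper might do, returning whether Weights & Biases logging is usable:

def setup_wandb() -> bool:
    # hypothetical helper: enable W&B logging only if the package imports
    try:
        import wandb  # noqa: F401
        return True
    except ImportError:
        return False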
Example #4
def test_model() -> None:
    vocab = {'cat': 1, 'dog': 2, 'bird': 3}  # 0 reserved for padding
    model = HAN(vocab=vocab, freeze_emb=True, load_glove=False)
    batch = torch.randint(low=1,
                          high=len(vocab) + 1,  # high is exclusive; +1 covers all vocab ids
                          size=(16, 12, 10),
                          dtype=torch.int64)
    logits = model(batch)['logits']

    # the model outputs probabilities, so every value must lie in [0, 1]
    assert torch.all(0 <= logits)
    assert torch.all(logits <= 1)
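Because the outputs are probabilities, hard labels come from a plain threshold. A small sketch; the 0.5 matches the `threshold` passed to `AccuracyCallback` in the training script above:

import torch

def to_labels(probs: torch.Tensor, threshold: float = 0.5) -> torch.Tensor:
    # probabilities -> hard 0/1 labels
    return (probs >= threshold).long()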
Example #5
def test_forward_for_dataset() -> None:
    # data
    dataset = get_test_dataset()
    ids = [2, 5]
    docs = collate_docs([dataset[i] for i in ids])['features']
    n_doc, n_snt, n_wrd = docs.shape

    # model
    model = HAN(vocab=dataset.vocab, freeze_emb=True, load_glove=False)

    # forward
    output = model(docs)
    pred = output['logits']
    w_scores = output['w_scores']
    s_scores = output['s_scores']

    assert pred.numel() == n_doc
    assert w_scores.shape == docs.shape
    assert s_scores.shape == (n_doc, n_snt)
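`collate_docs` is not shown here; a minimal sketch of such a collate function, assuming it zero-pads variable-length documents (lists of sentences of word ids) into a common (n_docs, max_sentences, max_words) tensor under the 'features' key:

import torch

def collate_docs_sketch(docs):
    # docs: list of documents, each a list of sentences of int word ids
    n_snt = max(len(d) for d in docs)
    n_wrd = max(len(s) for d in docs for s in d)
    feats = torch.zeros((len(docs), n_snt, n_wrd), dtype=torch.int64)  # 0 pads
    for i, d in enumerate(docs):
        for j, s in enumerate(d):
            feats[i, j, :len(s)] = torch.tensor(s, dtype=torch.int64)
    return {'features': feats}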
Example #6
def main():
    args = init_para()
    config_file = ["./src/config.ini"]
    config = Config(config_file, args)

    g_hin = HIN(config.input_fold, config.data_type, config.relation_list)

    # Model selection
    if args.model == "RHINE":
        g_hin.load_matrix()
        g_hin.generate_matrix(config.combination)
        RHINEdp = RHINEDataProcess(config, g_hin)
        RHINEdp.generate_triples()
        RHINEdp.merge_triples(config.relation_category)
        print("Train")
        TrainRHINE(config, g_hin.node2id_dict)
    elif args.model == "Metapath2vec":
        config.temp_file += args.dataset + '_' + config.metapath + '.txt'
        config.out_emd_file += args.dataset + '_' + config.metapath + '.txt'

        random_walk_based_mp(g_hin, config.metapath, config.num_walks,
                             config.walk_length, config.temp_file)
        m2v = Metapath2VecTrainer(config, g_hin)
        m2v.train()

    elif args.model == "HeteSpaceyWalk":
        config.temp_file += args.dataset + '_' + config.metapath + '.txt'
        config.out_emd_file += args.dataset + '_' + config.metapath + '.txt'
        random_walk_spacey_mp(g_hin, config.metapath, config.data_type,
                              config.num_walks, config.walk_length,
                              config.temp_file, config.beta)
        m2v = Metapath2VecTrainer(config)
        m2v.train()
    elif args.model == "DHNE":
        hyper_edge_sample(g_hin,
                          output_datafold=config.temp_file,
                          scale=config.scale,
                          tup=config.triple_hyper)
        dataset = read_data_sets(train_dir=config.temp_file)
        dim_feature = [
            sum(dataset.train.nums_type) - n for n in dataset.train.nums_type
        ]
        Process(dataset,
                dim_feature,
                embedding_size=config.dim,
                hidden_size=config.hidden_size,
                learning_rate=config.alpha,
                alpha=config.alpha,
                batch_size=config.batch_size,
                num_neg_samples=config.neg_num,
                epochs_to_train=config.epochs,
                output_embfold=config.out_emd_file,
                output_modelfold=config.output_modelfold,
                prefix_path=config.prefix_path,
                reflect=g_hin.matrix2id_dict)
    # elif args.model == "HHNE":
    #     random_walk_txt = config.temp_file + args.dataset + '-' + config.metapath + '.txt'
    #     node_type_mapping_txt = config.temp_file + 'node_type_mapping.txt'
    #     config.out_emd_file += args.dataset + '-' + config.metapath + '.txt'
    #     print("Metapath walking!")
    #     if len(config.metapath) == 3:
    #         # data = random_walk_three(config.num_walks, config.walk_length, config.metapath, g_hin, random_walk_txt)
    #         data = random_walk_three(1, 5, config.metapath, g_hin, random_walk_txt)
    #     elif len(config.metapath) == 5:
    #         data = random_walk_five(config.num_walks, config.walk_length, config.metapath, g_hin, random_walk_txt)
    #
    #     node_type_mapping_txt = g_hin.node_type_mapping(node_type_mapping_txt)
    #     dataset = HHNE.Dataset(random_walk_txt=random_walk_txt,window_size=config.window_size)
    #     print("Train" + str(len(dataset.index2nodeid)))
    #     pos_holder, tar_holder, tag_holder, pro_holder, grad_pos, grad_tar = HHNE.bulid_model(EMBED_SIZE=config.dim)
    #     HHNE.TrainHHNE(pos_holder, tar_holder, tag_holder, pro_holder, grad_pos, grad_tar, dataset,
    #               BATCH_SIZE=config.batch_size, NUM_EPOCHS=config.epochs, NUM_SAMPLED=config.neg_num,
    #               VOCAB_SIZE=len(dataset.nodeid2index), EMBED_SIZE=config.dim, startingAlpha=config.alpha,
    #               lr_decay=config.lr_decay, output_embfold=config.out_emd_file)
    elif args.model == "MetaGraph2vec":
        config.temp_file += 'graph_rw.txt'
        config.out_emd_file += args.dataset + '_node.txt'
        mgg = MetaGraphGenerator()
        if args.dataset == "acm":
            mgg.generate_random_three(config.temp_file, config.num_walks,
                                      config.walk_length, g_hin.node,
                                      g_hin.relation_dict)
        elif args.dataset == "dblp":
            mgg.generate_random_four(config.temp_file, config.num_walks,
                                     config.walk_length, g_hin.node,
                                     g_hin.relation_dict)
        model = Metapath2VecTrainer(config, g_hin)
        print("Training")
        model.train()
    # elif args.model == "PME":
    #     pme = PME(
    #         g_hin.input_edge,
    #         g_hin.node2id_dict,
    #         g_hin.relation2id_dict,
    #         config.dim,
    #         config.dimensionR,
    #         config.loadBinaryFlag,
    #         config.outBinaryFlag,
    #         config.num_workers,
    #         config.nbatches,
    #         config.epochs,
    #         config.no_validate,
    #         config.alpha,
    #         config.margin,
    #         config.M,
    #         config.out_emd_file
    #     )
    #     # pme.load()
    #     pme.train()
    #     pme.out()
    elif args.model == "PTE":
        config.temp_file += args.dataset + '.txt'
        config.out_emd_file += args.dataset + '_node.txt'
        print('PTE')
        data = PTEDataReader(g_hin, config)
        alias_table = AliasSampling(data)
        pte = PTETrainer(g_hin, config, data, alias_table)
        print('Training')
        pte.train()
    elif args.model == "HERec":
        mp_list = config.metapath_list.split("|")
        for mp in mp_list:
            HERec_gen_neighbour(g_hin, mp, config.temp_file)
            config.input = config.temp_file + mp + ".txt"
            config.out_put = config.out_emd_file + mp + ".txt"
            DW(config)
        HERec_union_metapth(config.out_emd_file, mp_list,
                            len(g_hin.node[mp_list[0][0]]), config.dim)
    elif args.model == "HIN2vec":
        HIN2vec(g_hin, config.out_emd_file, config)
    elif args.model == "HAN":
        data_process = HAN_process(g_hin, config.mp_list, args.dataset,
                                   config.featype)
        config.out_emd_file += args.dataset + '_node.txt'
        m = HAN(config, data_process)
        m.train()
    elif args.model == "HeGAN":
        model = HeGAN(g_hin, args, config)
        model.train(config, g_hin.node2id_dict)
    else:
        pass  # unrecognized model names fail silently here
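`init_para` is not included in the excerpt; a hypothetical sketch of the argument parser the dispatch code needs (only the two flags it actually reads):

from argparse import ArgumentParser, Namespace

def init_para() -> Namespace:
    # hypothetical: the dispatch above reads args.model and args.dataset
    parser = ArgumentParser()
    parser.add_argument('--model', type=str, default='Metapath2vec')
    parser.add_argument('--dataset', type=str, default='acm')
    return parser.parse_args()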
Example #7
def main():
    args = init_para()
    config_file = ["./src/config.ini"]
    config = Config(config_file, args)

    g_hin = HIN(args.dataset, config.data_type, config.relation_list)

    # Model selection
    if args.model == "RHINE":
        g_hin.load_matrix()
        g_hin.generate_matrix(config.combination)
        RHINEdp = RHINEDataProcess(config, g_hin)
        RHINEdp.generate_triples()
        RHINEdp.merge_triples(config.relation_category)
        print("Train")
        TrainRHINE(config, g_hin.node2id_dict)
    elif args.model == "Metapath2vec":
        config.temp_file += args.dataset + '-' + config.metapath + '.txt'
        config.out_emd_file += args.dataset + '-' + config.metapath + '.txt'
        print("Metapath walking!")
        if len(config.metapath) == 3:
            data = random_walk_three(config.num_walks, config.walk_length, config.metapath, g_hin, config.temp_file)
        elif len(config.metapath) == 5:
            data = random_walk_five(config.num_walks, config.walk_length, config.metapath, g_hin, config.temp_file)
        m2v = Metapath2VecTrainer(config)
        print("Training")
        m2v.train()
    elif args.model == "DHNE":
        hyper_edge_sample(g_hin, output_datafold=config.temp_file, scale=config.scale, tup=config.triple_hyper)
        dataset = read_data_sets(train_dir=config.temp_file)
        dim_feature = [sum(dataset.train.nums_type) - n for n in dataset.train.nums_type]
        Process(dataset, dim_feature, embedding_size=config.dim, hidden_size=config.hidden_size,
                learning_rate=config.alpha, alpha=config.alpha, batch_size=config.batch_size,
                num_neg_samples=config.neg_num, epochs_to_train=config.epochs, output_embfold=config.out_emd_file,
                output_modelfold=config.output_modelfold, prefix_path=config.prefix_path, reflect=g_hin.matrix2id_dict)

    elif args.model == "MetaGraph2vec":
        config.temp_file += 'graph_rw.txt'
        config.out_emd_file += 'node.txt'
        mgg = MetaGraphGenerator()
        if args.dataset == "acm":
            mgg.generate_random_three(config.temp_file, config.num_walks, config.walk_length, g_hin.node,
                                      g_hin.relation_dict)
        elif args.dataset == "dblp":
            mgg.generate_random_four(config.temp_file, config.num_walks, config.walk_length, g_hin.node,
                                     g_hin.relation_dict)
        model = Metapath2VecTrainer(config)
        print("Training")
        model.train()
    elif args.model == "HERec":
        mp_list = config.metapath_list.split("|")
        for mp in mp_list:
            # HERec_gen_neighbour(g_hin, mp, config.temp_file)
            config.input = config.temp_file + mp + ".txt"
            config.out_put = config.out_emd_file + mp + ".txt"
            DW(config)
        HERec_union_metapth(config.out_emd_file, mp_list, len(g_hin.node[mp_list[0][0]]), config.dim)
    elif args.model == "HIN2vec":
        HIN2vec(g_hin, config.out_emd_file, config)
    elif args.model == "HAN":
        data_process = HAN_process(g_hin, config.mp_list, args.dataset)
        config.out_emd_file += 'node.txt'
        m = HAN(config, data_process)
        m.train()
    elif args.model == "HeGAN":
        model = HeGAN(g_hin, args, config)
        model.train(config, g_hin.node2id_dict)
    else:
        pass  # unrecognized model names fail silently here
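Both `main` variants grow as a long if/elif chain; a dispatch table keeps each model's setup in its own function and makes unknown names fail loudly. A sketch with one handler filled in:

def run_hegan(args, config, g_hin):
    model = HeGAN(g_hin, args, config)
    model.train(config, g_hin.node2id_dict)

DISPATCH = {'HeGAN': run_hegan}  # extend with the remaining models

def dispatch(args, config, g_hin):
    handler = DISPATCH.get(args.model)
    if handler is None:
        raise ValueError(f'unknown model: {args.model}')
    handler(args, config, g_hin)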