Example #1
 def init_model(self):
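     # build the KGE model from the trainer's entity / attribute / value counts
     # and move it to the configured device (hidden_dim and gamma are fixed here)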
     self.model = KGEModel(
         self.t.train_seeds,
         nentity=self.entity_count,
         nrelation=self.attr_count,
         nvalue=self.value_count,
         hidden_dim=200,
         gamma=24.0,
     ).to(self.device)
Example #2
    def load_model(self, time):
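        # rebuild the KGE model and its Adam optimizer, then restore both from
        # the checkpoint saved at ./result/model_<time>.pth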
        model_path = "./result/model_" + str(time) + ".pth"
        checkpoint = torch.load(model_path)

        model = KGEModel(ent_tot=self.nentity,
                         rel_tot=self.nrelation,
                         dim_e=50,
                         dim_r=50)
        model.load_state_dict(checkpoint['net'])
        current_learning_rate = 0.0001
        optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                            model.parameters()),
                                     lr=current_learning_rate)
        optimizer.load_state_dict(checkpoint['optimizer'])

        return model, optimizer
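A minimal usage sketch for the loader above, assuming it lives on some experiment object (here called `experiment`, an assumption) and that a checkpoint for time step 10 exists under ./result/:

    # hypothetical call site; the object name and time step are illustrative only
    model, optimizer = experiment.load_model(10)
    model.eval()  # switch to evaluation mode before scoring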
Example #3
def run_grid(nentity,
             nrelation,
             train_triples,
             valid_triples,
             test_triples,
             all_true_triples,
             args,
             rule_iterators=None,
             adv_model=None):
    ntriples = len(train_triples)

    if args.inject:
        print('injecting rules')
    else:
        print('rules not injected')

    if args.ruge:
        print('Using RUGE injection model')

    reset_empty_values(args)
    current_learning_rate = args.learning_rate

    if args.negative_adversarial_sampling:
        print('Temperature - ', args.adversarial_temperature)
        print()

    info = 'Model - {}; opt - {}; batch size - {}; dataset - {}; lr - {}, gamma = {}; '.format(
        args.model, args.opt, args.batch_size, args.data_path,
        current_learning_rate, args.gamma)
    info2 = 'Loss fnc - {}; inv - {}; impl - {}; sym - {}; eq - {}'.format(
        args.loss, args.inv, args.impl, args.sym, args.eq)
    print(info)
    print(info2)

    current_learning_rate = args.learning_rate
    # materialize the grids as lists: itertools.product returns one-shot
    # iterators that would otherwise be exhausted after the first pass of the
    # outer loops below
    EPSILONS = list(
        itertools.product(EPSILONS_INV, EPSILONS_IMPL, EPSILONS_SYM,
                          EPSILONS_EQ))
    WEIGHTS = list(
        itertools.product(WEIGHTS_INV, WEIGHTS_IMPL, WEIGHTS_SYM, WEIGHTS_EQ))

    idx = -1  # for saving models with several parameters
    for g1, g2 in zip(GAMMA1, GAMMA2):
        for eps_inv, eps_impl, eps_sym, eps_eq in EPSILONS:
            for w_inv, w_impl, w_sym, w_eq in WEIGHTS:
                for dim, n_negs, steps in itertools.product(
                        DIMENSIONS, N_NEGS_LIST, N_STEPS_LIST):
                    idx += 1
                    # re-initialize the model
                    kge_model = KGEModel(model_name=args.model,
                                         nentity=nentity,
                                         nrelation=nrelation,
                                         ntriples=ntriples,
                                         hidden_dim=dim,
                                         args=args)
                    if 'inverse' in RULE_TYPES:
                        kge_model.rule_weight['inverse'] = w_inv
                        kge_model.epsilon_inv = eps_inv
                    if 'implication' in RULE_TYPES:
                        kge_model.rule_weight['implication'] = w_impl
                        kge_model.epsilon_impl = eps_impl
                    if 'symmetry' in RULE_TYPES:
                        kge_model.rule_weight['symmetry'] = w_sym
                        kge_model.epsilon_sym = eps_sym
                    if 'equality' in RULE_TYPES:
                        kge_model.rule_weight['equality'] = w_eq
                        kge_model.epsilon_eq = eps_eq

                    kge_model.set_loss(args.loss)
                    logging.info('Model: %s' % args.model)
                    logging.info('Data Path: %s' % args.data_path)
                    logging.info('#entity: %d' % nentity)
                    logging.info('#relation: %d' % nrelation)
                    logging.info('optimizer: %s' % args.opt)
                    logging.info('learning rate: %f' % current_learning_rate)
                    logging.info('loss: %s' % args.loss)
                    if args.inv:
                        logging.info(
                            'using inverse rules: eps = %f, weight = %f' %
                            (kge_model.epsilon_inv,
                             kge_model.rule_weight['inverse']))
                    if args.impl:
                        logging.info(
                            'using implication rules: eps = %f, weight = %f' %
                            (kge_model.epsilon_impl,
                             kge_model.rule_weight['implication']))
                    if args.sym:
                        logging.info(
                            'using symmetry rules: eps = %f, weight = %f' %
                            (kge_model.epsilon_sym,
                             kge_model.rule_weight['symmetry']))
                    if args.eq:
                        logging.info(
                            'using equality rules: eps = %f, weight = %f' %
                            (kge_model.epsilon_eq,
                             kge_model.rule_weight['equality']))
                    logging.info('Model Parameter Configuration:')
                    for name, param in kge_model.named_parameters():
                        logging.info('Parameter %s: %s, requires_grad = %s' %
                                     (name, str(param.size()),
                                      str(param.requires_grad)))
                    logging.info('Loss function %s' % args.loss)
                    if args.cuda:
                        kge_model = kge_model.cuda()

                    logging.info('Randomly Initializing %s Model...' %
                                 args.model)

                    print_rules_info(kge_model, args)
                    args.max_steps = steps
                    args.negative_sample_size = n_negs
                    #out_line = '#steps = {}, #negs = {};'.format(args.max_steps, args.negative_sample_size)
                    logging.info('Max steps - %d' % args.max_steps)
                    logging.info('Negative sample %d ' %
                                 args.negative_sample_size)
                    assert kge_model.inject == args.inject, 'Inject is wrong'
                    # train
                    train_iterator = construct_dataloader(
                        args, train_triples, nentity, nrelation)
                    step = train_model(0, valid_triples, all_true_triples,
                                       kge_model, adv_model, train_iterator,
                                       rule_iterators, args, str(idx))

                    # valid
                    logging.info('Evaluating on Valid Dataset...')
                    metrics = kge_model.test_step(kge_model, valid_triples,
                                                  all_true_triples, args)
                    #metrics1 = kge_model.getScore(kge_model, valid_triples, all_true_triples, args)
                    log_metrics('Valid', step, metrics)
                    info = 'Validation (%d): ' % step
                    for key, val in metrics.items():
                        info = info + key + ' - ' + str(val) + ';'
                    print(info)
                    # test
                    out_line = '#steps = {}, #negs = {}, dim = {};'.format(
                        step, args.negative_sample_size, kge_model.hidden_dim)
                    metrics = kge_model.test_step(kge_model, test_triples,
                                                  all_true_triples, args)
                    print("Hello")
                    metrics1 = kge_model.getScore(kge_model, test_triples,
                                                  all_true_triples, args)
                    log_metrics('Test', step, metrics)
                    values = [
                        str(metrics['MRR']),
                        str(metrics['MR']),
                        str(metrics['HITS@1']),
                        str(metrics['HITS@3']),
                        str(metrics['HITS@10'])
                    ]
                    out_line = out_line + ';'.join(values)
                    print(out_line)

                    logging.info(
                        '\n-----------------------------------------------')
            print()
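A hedged sketch of how run_grid might be driven once the triples and arguments have been prepared (the data loading and argument parsing are not part of this example):

    # hypothetical driver; every variable is assumed to be built as in the
    # other examples on this page (read_triple, argparse, rule iterators, ...)
    run_grid(nentity, nrelation,
             train_triples, valid_triples, test_triples,
             all_true_triples, args,
             rule_iterators=rule_iterators, adv_model=None)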
Example #4
def main(args):
    # if (not args.do_train) and (not args.do_valid) and (not args.do_test) and (not args.do_case) and (not args.fire_test) and (not args.rel_do_test) :
    #     raise ValueError('one of train/val/test mode must be choosed.')

    if args.init_checkpoint:
        override_config(args)
    elif args.data_path is None:
        raise ValueError('one of init_checkpoint/data_path must be chosen.')

    if args.do_train and args.save_path is None:
        raise ValueError('Where do you want to save your trained model?')

    if args.save_path and not os.path.exists(args.save_path):
        os.makedirs(args.save_path)

    # Write logs to checkpoint and console
    set_logger(args)

    with open(os.path.join(args.data_path, 'entities.dict')) as fin:
        entity2id = dict()
        for line in fin:
            eid, entity = line.strip().split('\t')
            entity2id[entity] = int(eid)

    with open(os.path.join(args.data_path, 'relations.dict')) as fin:
        relation2id = dict()
        for line in fin:
            rid, relation = line.strip().split('\t')
            relation2id[relation] = int(rid)

    # Read regions for Countries S* datasets
    if args.countries:
        regions = list()
        with open(os.path.join(args.data_path, 'regions.list')) as fin:
            for line in fin:
                region = line.strip()
                regions.append(entity2id[region])
        args.regions = regions

    nentity = len(entity2id)
    nrelation = len(relation2id)

    args.nentity = nentity
    args.nrelation = nrelation

    logging.info('Model: %s' % args.model)
    logging.info('Data Path: %s' % args.data_path)
    logging.info('#entity: %d' % nentity)
    logging.info('#relation: %d' % nrelation)

    train_triples = read_triple(os.path.join(args.data_path, 'train.txt'),
                                entity2id, relation2id)
    # train_triples = read_triple(os.path.join(args.data_path, 'train_1900.txt'), entity2id, relation2id)
    logging.info('#train: %d' % len(train_triples))
    valid_triples = read_triple(os.path.join(args.data_path, 'valid.txt'),
                                entity2id, relation2id)
    logging.info('#valid: %d' % len(valid_triples))
    test_triples = read_triple(os.path.join(args.data_path, 'test.txt'),
                               entity2id, relation2id)
    # seen_test_triples = read_triple(os.path.join(args.data_path, 'seen_test.txt'), entity2id, relation2id)
    # test_triples = read_triple(os.path.join(args.data_path, 'test_alone_triples.txt'), entity2id, relation2id)

    # def file_name(file_dir):
    #     for root, dirs, files in os.walk(file_dir):
    #         return files
    # rel_dataset = file_name("/scratch/mengyali/workspace/rotate/data/wn18rr/rel_dataset_txt/")
    # for rel in rel_dataset:
    #     test_triples = read_triple(os.path.join(args.data_path, "rel_dataset_txt/"+str(rel)), entity2id, relation2id)
    #     logging.info('#test: %d' % len(test_triples))

    #All true triples
    all_true_triples = train_triples + valid_triples + test_triples

    kge_model = KGEModel(
        model_name=args.model,
        nentity=nentity,
        nrelation=nrelation,
        hidden_dim=args.hidden_dim,
        gamma=args.gamma,
        double_entity_embedding=args.double_entity_embedding,
        double_relation_embedding=args.double_relation_embedding)

    logging.info('Model Parameter Configuration:')
    for name, param in kge_model.named_parameters():
        logging.info('Parameter %s: %s, requires_grad = %s' %
                     (name, str(param.size()), str(param.requires_grad)))

    if args.cuda:
        kge_model = kge_model.cuda()

    if args.do_train:
        # Set training dataloader iterator
        train_dataloader_head = DataLoader(
            TrainDataset(train_triples, nentity, nrelation,
                         args.negative_sample_size, 'head-batch', entity2id,
                         relation2id, args.data_path, args.typecons),
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=max(1, args.cpu_num // 2),
            collate_fn=TrainDataset.collate_fn)

        train_dataloader_tail = DataLoader(
            TrainDataset(train_triples, nentity, nrelation,
                         args.negative_sample_size, 'tail-batch', entity2id,
                         relation2id, args.data_path, args.typecons),
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=max(1, args.cpu_num // 2),
            collate_fn=TrainDataset.collate_fn)

        train_iterator = BidirectionalOneShotIterator(train_dataloader_head,
                                                      train_dataloader_tail)

        # Set training configuration
        current_learning_rate = args.learning_rate
        optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                            kge_model.parameters()),
                                     lr=current_learning_rate)
        if args.warm_up_steps:
            warm_up_steps = args.warm_up_steps
        else:
            warm_up_steps = args.max_steps // 2

    if args.init_checkpoint:
        # Restore model from checkpoint directory
        logging.info('Loading checkpoint %s...' % args.init_checkpoint)
        checkpoint = torch.load(
            os.path.join(args.init_checkpoint, 'checkpoint'))
        init_step = checkpoint['step']
        kge_model.load_state_dict(checkpoint['model_state_dict'])
        if args.do_train:
            current_learning_rate = checkpoint['current_learning_rate']
            warm_up_steps = checkpoint['warm_up_steps']
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    else:
        logging.info('Randomly Initializing %s Model...' % args.model)
        init_step = 0

    step = init_step

    logging.info('Start Training...')
    logging.info('init_step = %d' % init_step)
    logging.info('batch_size = %d' % args.batch_size)
    logging.info('negative_adversarial_sampling = %d' %
                 args.negative_adversarial_sampling)
    logging.info('hidden_dim = %d' % args.hidden_dim)
    logging.info('gamma = %f' % args.gamma)
    logging.info('negative_adversarial_sampling = %s' %
                 str(args.negative_adversarial_sampling))
    if args.negative_adversarial_sampling:
        logging.info('adversarial_temperature = %f' %
                     args.adversarial_temperature)

    # Set valid dataloader as it would be evaluated during training

    if args.do_train:
        logging.info('learning_rate = %f' % current_learning_rate)

        training_logs = []

        #Training Loop
        for step in range(init_step, args.max_steps):

            log = kge_model.train_step(kge_model, optimizer, train_iterator,
                                       args)

            training_logs.append(log)

            if step >= warm_up_steps:
                current_learning_rate = current_learning_rate / 10
                logging.info('Change learning_rate to %f at step %d' %
                             (current_learning_rate, step))
                optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                                    kge_model.parameters()),
                                             lr=current_learning_rate)
                warm_up_steps = warm_up_steps * 3

            if step % args.save_checkpoint_steps == 0:
                save_variable_list = {
                    'step': step,
                    'current_learning_rate': current_learning_rate,
                    'warm_up_steps': warm_up_steps
                }
                save_model(kge_model, optimizer, save_variable_list, args)

            if step % args.log_steps == 0:
                metrics = {}
                for metric in training_logs[0].keys():
                    metrics[metric] = sum(
                        [log[metric]
                         for log in training_logs]) / len(training_logs)
                log_metrics('Training average', step, metrics)
                training_logs = []

            if args.do_valid and step % args.valid_steps == 0:
                logging.info('Evaluating on Valid Dataset...')
                metrics = kge_model.test_step(kge_model, valid_triples,
                                              all_true_triples, args)
                log_metrics('Valid', step, metrics)

        save_variable_list = {
            'step': step,
            'current_learning_rate': current_learning_rate,
            'warm_up_steps': warm_up_steps
        }
        save_model(kge_model, optimizer, save_variable_list, args)

    if args.do_valid:
        logging.info('Evaluating on Valid Dataset...')
        metrics = kge_model.test_step(kge_model, valid_triples,
                                      all_true_triples, args)
        log_metrics('Valid', step, metrics)

    if args.do_test:
        logging.info('Evaluating on Test Dataset...')
        metrics = kge_model.test_step(kge_model, test_triples,
                                      all_true_triples, args)
        # logging.info("----------------------"+str(rel)+"---------------------\n")
        log_metrics('Test', step, metrics)

    if args.get_metric:
        logging.info(
            'Evaluating on Test Dataset and Show the metric in two sides...')
        head_metrics, tail_metrics = kge_model.get_metric(
            kge_model, test_triples, all_true_triples, args)
        logging.info("--------------- Head ------------\n")
        log_metrics('Test-Head', step, head_metrics)
        logging.info("--------------- Tail ------------\n")
        log_metrics('Test-Tail', step, tail_metrics)

    if args.evaluate_train:
        logging.info('Evaluating on Training Dataset...')
        metrics = kge_model.test_step(kge_model, train_triples,
                                      all_true_triples, args)
        log_metrics('Test', step, metrics)

    # Codes about StAR
    if args.get_scores:
        for type in ['dev', 'test']:
            kge_model.get_scores(kge_model, type, valid_triples,
                                 all_true_triples, args)

    if args.get_model_dataset:
        kge_model.get_model_dataset(kge_model, 'train', valid_triples,
                                    all_true_triples, args)

    if args.get_cases:
        kge_model.get_cases(kge_model, test_triples, all_true_triples, args)

    if args.rel_do_test:
        train_rel_dict = collections.Counter([ex[1] for ex in train_triples])
        rel_dict = dict()
        logging.info('Evaluating on Each Test Dataset Divided by Relation...')
        test_ex_dict_departby_rel = dict.fromkeys(relation2id.keys(), [])
        for _rel in relation2id:
            test_ex_dict_departby_rel[_rel] = [
                _ex for _ex in test_triples if _ex[1] == relation2id[_rel]
            ]

        for _rel in test_ex_dict_departby_rel.keys():
            _rel_test_triples = test_ex_dict_departby_rel[_rel]
            _rel_data = [
                train_rel_dict[relation2id[_rel]],
                len(_rel_test_triples)
            ]
            if len(_rel_test_triples) != 0:
                metrics = kge_model.test_step(kge_model, _rel_test_triples,
                                              all_true_triples, args)
                _rel_data.extend([
                    round(metrics['HITS@1'], 3),
                    round(metrics['HITS@3'], 3),
                    round(metrics['HITS@10'], 3),
                    round(metrics['MR'], 1),
                    round(metrics['MRR'], 3)
                ])
            else:
                _rel_data.extend([0, 0, 0, 0, 0])
            rel_dict[_rel] = _rel_data

        sorted_rel = sorted(rel_dict.items(),
                            key=lambda x: x[1][0],
                            reverse=True)

        save_dir = args.init_checkpoint
        with open(join(save_dir, "rel_unbalanced.txt"), "w",
                  encoding="utf-8") as fp:
            fp.write(str(sorted_rel))
        torch.save(sorted_rel, join(save_dir, "rel_unbalanced"))

        # SaveInExcle(sorted_rel, save_dir)
        print("explore unbalanced finished")
Example #5
    TrainDataset(test_triples,
                 nentity,
                 nrelation,
                 1024,
                 512,
                 entity_dict,
                 train_triples=train_triples)
)  #, filter_idx = filter_relations(test_triples)))
train_iterator = DatasetIterator(
    TrainDataset(
        train_triples, nentity, nrelation, 1024, 256,
        entity_dict))  #, filter_idx = filter_relations(train_triples)))

kge_model = KGEModel(model_name="QuatE",
                     nentity=nentity,
                     nrelation=nrelation,
                     hidden_dim=args["hidden_dim"],
                     evaluator=evaluator)
if args["cuda"]:
    kge_model.cuda()

learning_rate = args["lr"]  #learning_rate
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                    kge_model.parameters()),
                             lr=learning_rate)

training_logs = []
valid_logs = []
for step in range(args["n_epoch"] * train_iterator.epoch_size):

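    # one optimization step per iteration; the returned loss (and periodic
    # validation metrics) would normally be appended to training_logs /
    # valid_logs above, but the rest of this loop is not shown in the snippet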
    loss = kge_model.train_step(optimizer, train_iterator, args)

def main(args):
    if args.init_checkpoint:
        override_config(args)
    elif args.data_path is None:
        raise ValueError('one of init_checkpoint/data_path must be chosen.')

    if args.save_path is None:
        raise ValueError('Where do you want to save your trained model?')
    
    if args.save_path and not os.path.exists(args.save_path):
        os.makedirs(args.save_path)
    
    # Write logs to checkpoint and console
    set_logger(args)

    with open(args.data_path) as fin:
        entity2id = bidict()
        relation2id = bidict()
        train_triples = []
        for line in fin:
            _tmp = [x.strip() for x in re.split("[,\t]", line) if x.strip()][:3]
            if len(_tmp) < 3:
                continue
            e1, relation, e2 = tuple(_tmp)
            if e1 not in entity2id:
                entity2id[e1] = len(entity2id)
            if e2 not in entity2id:
                entity2id[e2] = len(entity2id)
            if relation not in relation2id:
                relation2id[relation] = len(relation2id)
            train_triples.append((entity2id[e1], relation2id[relation], entity2id[e2]))

    nentity = len(entity2id)
    nrelation = len(relation2id)
    
    args.nentity = nentity
    args.nrelation = nrelation
    
    logging.info('Model: %s' % args.model)
    logging.info('Data Path: %s' % args.data_path)
    logging.info('#entity: %d' % nentity)
    logging.info('#relation: %d' % nrelation)
    
    logging.info('#train: %d' % len(train_triples))
    
    #All true triples
    all_true_triples = train_triples
    
    kge_model = KGEModel(
        model_name=args.model,
        nentity=nentity,
        nrelation=nrelation,
        hidden_dim=args.hidden_dim,
        gamma=args.gamma,
        double_entity_embedding=args.double_entity_embedding,
        double_relation_embedding=args.double_relation_embedding
    )
    
    logging.info('Model Parameter Configuration:')
    for name, param in kge_model.named_parameters():
        logging.info('Parameter %s: %s, requires_grad = %s' % (name, str(param.size()), str(param.requires_grad)))

    if args.cuda:
        kge_model = kge_model.cuda()
    
    # Set training dataloader iterator
    train_dataloader_head = DataLoader(
        TrainDataset(train_triples, nentity, nrelation, args.negative_sample_size, 'head-batch'), 
        batch_size=args.batch_size,
        shuffle=True, 
        num_workers=max(1, args.cpu_num//2),
        collate_fn=TrainDataset.collate_fn
    )
    
    train_dataloader_tail = DataLoader(
        TrainDataset(train_triples, nentity, nrelation, args.negative_sample_size, 'tail-batch'), 
        batch_size=args.batch_size,
        shuffle=True, 
        num_workers=max(1, args.cpu_num//2),
        collate_fn=TrainDataset.collate_fn
    )
    
    train_iterator = BidirectionalOneShotIterator(train_dataloader_head, train_dataloader_tail)
    
    # Set training configuration
    current_learning_rate = args.learning_rate
    optimizer = torch.optim.Adam(
        filter(lambda p: p.requires_grad, kge_model.parameters()), 
        lr=current_learning_rate
    )
    if args.warm_up_steps:
        warm_up_steps = args.warm_up_steps
    else:
        warm_up_steps = args.max_steps // 2

    if args.init_checkpoint:
        # Restore model from checkpoint directory
        logging.info('Loading checkpoint %s...' % args.init_checkpoint)
        checkpoint = torch.load(os.path.join(args.init_checkpoint, 'checkpoint'))
        init_step = checkpoint['step']
        kge_model.load_state_dict(checkpoint['model_state_dict'])
        
        current_learning_rate = checkpoint['current_learning_rate']
        warm_up_steps = checkpoint['warm_up_steps']
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    else:
        logging.info('Randomly Initializing %s Model...' % args.model)
        init_step = 0
    
    step = init_step
    
    logging.info('Start Training...')
    logging.info('init_step = %d' % init_step)
    logging.info('learning_rate = %f' % current_learning_rate)
    logging.info('batch_size = %d' % args.batch_size)
    logging.info('negative_adversarial_sampling = %d' % args.negative_adversarial_sampling)
    logging.info('hidden_dim = %d' % args.hidden_dim)
    logging.info('gamma = %f' % args.gamma)
    logging.info('negative_adversarial_sampling = %s' % str(args.negative_adversarial_sampling))
    if args.negative_adversarial_sampling:
        logging.info('adversarial_temperature = %f' % args.adversarial_temperature)
    
    # Set valid dataloader as it would be evaluated during training
    
    training_logs = []
    
    #Training Loop
    for step in range(init_step, args.max_steps):
        
        log = kge_model.train_step(kge_model, optimizer, train_iterator, args)
        
        training_logs.append(log)
        
        if step >= warm_up_steps:
            current_learning_rate = current_learning_rate / 10
            logging.info('Change learning_rate to %f at step %d' % (current_learning_rate, step))
            optimizer = torch.optim.Adam(
                filter(lambda p: p.requires_grad, kge_model.parameters()), 
                lr=current_learning_rate
            )
            warm_up_steps = warm_up_steps * 3
        
        if step % args.save_checkpoint_steps == 0:
            save_variable_list = {
                'step': step, 
                'current_learning_rate': current_learning_rate,
                'warm_up_steps': warm_up_steps
            }
            save_model(kge_model, optimizer, save_variable_list, args, entity2id, relation2id)
            
        if step % args.log_steps == 0:
            metrics = {}
            for metric in training_logs[0].keys():
                metrics[metric] = sum([log[metric] for log in training_logs])/len(training_logs)
            log_metrics('Training average', step, metrics)
            training_logs = []
            
    save_variable_list = {
        'step': step, 
        'current_learning_rate': current_learning_rate,
        'warm_up_steps': warm_up_steps
    }
    save_model(kge_model, optimizer, save_variable_list, args, entity2id, relation2id)
        
    if args.evaluate_train:
        logging.info('Evaluating on Training Dataset...')
        metrics = kge_model.test_step(kge_model, train_triples, all_true_triples, args)
        log_metrics('Test', step, metrics)
Example #7
File: run.py Project: rpatil524/ogb
def main(args):
    if (not args.do_train) and (not args.do_valid) and (not args.do_test) and (
            not args.evaluate_train):
        raise ValueError('one of train/val/test mode must be chosen.')

    if args.init_checkpoint:
        override_config(args)

    args.save_path = 'log/%s/%s/%s-%s/%s' % (
        args.dataset, args.model, args.hidden_dim, args.gamma,
        time.time()) if args.save_path == None else args.save_path
    writer = SummaryWriter(args.save_path)

    # Write logs to checkpoint and console
    set_logger(args)

    dataset = LinkPropPredDataset(name='ogbl-biokg')
    split_edge = dataset.get_edge_split()
    train_triples, valid_triples, test_triples = split_edge[
        "train"], split_edge["valid"], split_edge["test"]
    nrelation = int(max(train_triples['relation'])) + 1
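    # build entity_dict: each entity type gets a contiguous (start, end) index
    # range in one global id space; nentity is the total across all types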
    entity_dict = dict()
    cur_idx = 0
    for key in dataset[0]['num_nodes_dict']:
        entity_dict[key] = (cur_idx,
                            cur_idx + dataset[0]['num_nodes_dict'][key])
        cur_idx += dataset[0]['num_nodes_dict'][key]
    nentity = sum(dataset[0]['num_nodes_dict'].values())

    evaluator = Evaluator(name=args.dataset)

    args.nentity = nentity
    args.nrelation = nrelation

    logging.info('Model: %s' % args.model)
    logging.info('Dataset: %s' % args.dataset)
    logging.info('#entity: %d' % nentity)
    logging.info('#relation: %d' % nrelation)

    # train_triples = split_dict['train']
    logging.info('#train: %d' % len(train_triples['head']))
    # valid_triples = split_dict['valid']
    logging.info('#valid: %d' % len(valid_triples['head']))
    # test_triples = split_dict['test']
    logging.info('#test: %d' % len(test_triples['head']))

    train_count, train_true_head, train_true_tail = defaultdict(
        lambda: 4), defaultdict(list), defaultdict(list)
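    # the count starts at 4 as a smoothing constant for the frequency-based
    # subsampling weights (a word2vec-style convention); train_true_head and
    # train_true_tail record the observed heads/tails for each partial triple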
    for i in tqdm(range(len(train_triples['head']))):
        head, relation, tail = train_triples['head'][i], train_triples[
            'relation'][i], train_triples['tail'][i]
        head_type, tail_type = train_triples['head_type'][i], train_triples[
            'tail_type'][i]
        train_count[(head, relation, head_type)] += 1
        train_count[(tail, -relation - 1, tail_type)] += 1
        train_true_head[(relation, tail)].append(head)
        train_true_tail[(head, relation)].append(tail)

    kge_model = KGEModel(
        model_name=args.model,
        nentity=nentity,
        nrelation=nrelation,
        hidden_dim=args.hidden_dim,
        gamma=args.gamma,
        double_entity_embedding=args.double_entity_embedding,
        double_relation_embedding=args.double_relation_embedding,
        evaluator=evaluator)

    logging.info('Model Parameter Configuration:')
    for name, param in kge_model.named_parameters():
        logging.info('Parameter %s: %s, requires_grad = %s' %
                     (name, str(param.size()), str(param.requires_grad)))

    if args.cuda:
        kge_model = kge_model.cuda()

    if args.init_checkpoint:
        # Restore model from checkpoint directory
        logging.info('Loading checkpoint %s...' % args.init_checkpoint)
        checkpoint = torch.load(
            os.path.join(args.init_checkpoint, 'checkpoint'))
        entity_dict = checkpoint['entity_dict']

    if args.do_train:
        # Set training dataloader iterator
        train_dataloader_head = DataLoader(
            TrainDataset(train_triples, nentity, nrelation,
                         args.negative_sample_size, 'head-batch', train_count,
                         train_true_head, train_true_tail, entity_dict),
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=max(1, args.cpu_num // 2),
            collate_fn=TrainDataset.collate_fn)

        train_dataloader_tail = DataLoader(
            TrainDataset(train_triples, nentity, nrelation,
                         args.negative_sample_size, 'tail-batch', train_count,
                         train_true_head, train_true_tail, entity_dict),
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=max(1, args.cpu_num // 2),
            collate_fn=TrainDataset.collate_fn)

        train_iterator = BidirectionalOneShotIterator(train_dataloader_head,
                                                      train_dataloader_tail)

        # Set training configuration
        current_learning_rate = args.learning_rate
        optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                            kge_model.parameters()),
                                     lr=current_learning_rate)
        if args.warm_up_steps:
            warm_up_steps = args.warm_up_steps
        else:
            warm_up_steps = args.max_steps // 2

    if args.init_checkpoint:
        # Restore model from checkpoint directory
        # logging.info('Loading checkpoint %s...' % args.init_checkpoint)
        # checkpoint = torch.load(os.path.join(args.init_checkpoint, 'checkpoint'))
        init_step = checkpoint['step']
        kge_model.load_state_dict(checkpoint['model_state_dict'])
        # entity_dict = checkpoint['entity_dict']
        if args.do_train:
            current_learning_rate = checkpoint['current_learning_rate']
            warm_up_steps = checkpoint['warm_up_steps']
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    else:
        logging.info('Randomly Initializing %s Model...' % args.model)
        init_step = 0

    step = init_step

    logging.info('Start Training...')
    logging.info('init_step = %d' % init_step)
    logging.info('batch_size = %d' % args.batch_size)
    logging.info('negative_adversarial_sampling = %d' %
                 args.negative_adversarial_sampling)
    logging.info('hidden_dim = %d' % args.hidden_dim)
    logging.info('gamma = %f' % args.gamma)
    logging.info('negative_adversarial_sampling = %s' %
                 str(args.negative_adversarial_sampling))
    if args.negative_adversarial_sampling:
        logging.info('adversarial_temperature = %f' %
                     args.adversarial_temperature)

    # Set valid dataloader as it would be evaluated during training

    if args.do_train:
        logging.info('learning_rate = %f' % current_learning_rate)

        training_logs = []

        #Training Loop
        for step in range(init_step, args.max_steps):

            log = kge_model.train_step(kge_model, optimizer, train_iterator,
                                       args)
            training_logs.append(log)

            if step >= warm_up_steps:
                current_learning_rate = current_learning_rate / 10
                logging.info('Change learning_rate to %f at step %d' %
                             (current_learning_rate, step))
                optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                                    kge_model.parameters()),
                                             lr=current_learning_rate)
                warm_up_steps = warm_up_steps * 3

            if step % args.save_checkpoint_steps == 0 and step > 0:  # ~ 41 seconds/saving
                save_variable_list = {
                    'step': step,
                    'current_learning_rate': current_learning_rate,
                    'warm_up_steps': warm_up_steps,
                    'entity_dict': entity_dict
                }
                save_model(kge_model, optimizer, save_variable_list, args)

            if step % args.log_steps == 0:
                metrics = {}
                for metric in training_logs[0].keys():
                    metrics[metric] = sum(
                        [log[metric]
                         for log in training_logs]) / len(training_logs)
                log_metrics('Train', step, metrics, writer)
                training_logs = []

            if args.do_valid and step % args.valid_steps == 0 and step > 0:
                logging.info('Evaluating on Valid Dataset...')
                metrics = kge_model.test_step(kge_model, valid_triples, args,
                                              entity_dict)
                log_metrics('Valid', step, metrics, writer)

        save_variable_list = {
            'step': step,
            'current_learning_rate': current_learning_rate,
            'warm_up_steps': warm_up_steps
        }
        save_model(kge_model, optimizer, save_variable_list, args)

    if args.do_valid:
        logging.info('Evaluating on Valid Dataset...')
        metrics = kge_model.test_step(kge_model, valid_triples, args,
                                      entity_dict)
        log_metrics('Valid', step, metrics, writer)

    if args.do_test:
        logging.info('Evaluating on Test Dataset...')
        metrics = kge_model.test_step(kge_model, test_triples, args,
                                      entity_dict)
        log_metrics('Test', step, metrics, writer)

    if args.evaluate_train:
        logging.info('Evaluating on Training Dataset...')
        small_train_triples = {}
        indices = np.random.choice(len(train_triples['head']),
                                   args.ntriples_eval_train,
                                   replace=False)
        for i in train_triples:
            if 'type' in i:
                small_train_triples[i] = [train_triples[i][x] for x in indices]
            else:
                small_train_triples[i] = train_triples[i][indices]
        metrics = kge_model.test_step(kge_model,
                                      small_train_triples,
                                      args,
                                      entity_dict,
                                      random_sampling=True)
        log_metrics('Train', step, metrics, writer)
    exit(0) # exit program

# print(len(words_indexes))
# print(len(entity2id))
# exit(0)

# load model here
kge_model = KGEModel(
    #model_name=args.model,
    #hidden_dim=args.hidden_dim,
    #hidden_dim=50,  # just for debugging (remove this line later)
    vocab_size=len(words_indexes),
    embedding_size=args.embedding_dim,
    gamma=args.gamma,
    #gamma=2.4, # just for debugging (remove this line later)
    batch_size = args.batch_size,
    neg_ratio=args.neg_ratio,
    dpo1=args.dpo1,
    dpo2=args.dpo2,
    bn1 = args.bn1,
    bn2 = args.bn2,
    channel1_num=args.ConvL1FiltersNum,
    channel2_num=args.ConvL2FiltersNum
)

#  args.name, args.model_name+fold+'-'+str(epoch+1)+'epochs'
model_path = os.path.join(os.getcwd(),'models',args.name+'_'+args.model_name+fold+'-'+str(args.num_epochs)+'epochs'+'.pth')
# model_path = os.path.join(os.getcwd(),'models',args.name+'_'+args.model_name+fold+'.pth')
kge_model.load_state_dict(torch.load(model_path))
kge_model = kge_model.cuda()
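After restoring the weights, the model would typically also be put into evaluation mode before it is used for scoring (a small addition, not part of the original snippet):

    kge_model.eval()  # disable dropout and freeze batch-norm statistics for inference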
Example #9
File: run.py Project: cdhx/RotatE
def main(args):
    # which mode to run
    if (not args.do_train) and (not args.do_valid) and (not args.do_test):
        raise ValueError('one of train/val/test mode must be chosen.')
    # whether to override some command-line arguments from a saved config
    if args.init_checkpoint:  # init_checkpoint is set (a path), so take overrides from its config
        override_config(args)  # override_config relies on init_checkpoint, which is guaranteed to exist in this branch
    elif args.data_path is None:  # without a checkpoint the config cannot supply data_path, and its default is None, so raise
        raise ValueError('one of init_checkpoint/data_path must be chosen.')
    # training was requested but no save path was given
    if args.do_train and args.save_path is None:
        raise ValueError('Where do you want to save your trained model?')
    # a save path was given but the directory does not exist yet, so create it
    if args.save_path and not os.path.exists(args.save_path):
        os.makedirs(args.save_path)

    # Write logs to checkpoint and console
    set_logger(args)
    # entities.dict / relations.dict list every entity/relation as "<id>\t<name>" lines
    # the resulting entity2id / relation2id map each entity/relation name to its integer id
    with open(os.path.join(args.data_path, 'entities.dict')) as fin:
        entity2id = dict()
        for line in fin:
            eid, entity = line.strip().split(
                '\t')  # e.g. "1\txx" gives eid='1', entity='xx'
            entity2id[entity] = int(eid)

    with open(os.path.join(args.data_path, 'relations.dict')) as fin:
        relation2id = dict()
        for line in fin:
            rid, relation = line.strip().split('\t')
            relation2id[relation] = int(rid)

    # Read regions for Countries S* datasets
    if args.countries:
        regions = list()
        # only the Countries datasets ship a regions.list file; other datasets do not
        with open(os.path.join(args.data_path, 'regions.list')) as fin:
            for line in fin:  # only five lines
                region = line.strip()
                regions.append(entity2id[region])  # store the region's entity id
        args.regions = regions

    nentity = len(entity2id)
    nrelation = len(relation2id)

    args.nentity = nentity
    args.nrelation = nrelation

    logging.info('Model: %s' % args.model)
    logging.info('Data Path: %s' % args.data_path)
    logging.info('#entity: %d' % nentity)
    logging.info('#relation: %d' % nrelation)
    # load the train/valid/test triples and log their sizes
    train_triples = read_triple(os.path.join(args.data_path, 'train.txt'),
                                entity2id, relation2id)
    logging.info('#train: %d' % len(train_triples))
    valid_triples = read_triple(os.path.join(args.data_path, 'valid.txt'),
                                entity2id, relation2id)
    logging.info('#valid: %d' % len(valid_triples))
    test_triples = read_triple(os.path.join(args.data_path, 'test.txt'),
                               entity2id, relation2id)
    logging.info('#test: %d' % len(test_triples))

    # All true triples
    all_true_triples = train_triples + valid_triples + test_triples
    # build the model
    kge_model = KGEModel(
        model_name=args.model,
        nentity=nentity,
        nrelation=nrelation,
        hidden_dim=args.hidden_dim,
        gamma=args.gamma,
        double_entity_embedding=args.double_entity_embedding,
        double_relation_embedding=args.double_relation_embedding)

    logging.info('Model Parameter Configuration:')
    for name, param in kge_model.named_parameters():
        logging.info('Parameter %s: %s, requires_grad = %s' %
                     (name, str(param.size()), str(param.requires_grad)))

    if args.cuda:
        kge_model = kge_model.cuda()

    if args.do_train:
        # Set training dataloader iterator
        train_dataloader_head = DataLoader(
            TrainDataset(train_triples, nentity, nrelation,
                         args.negative_sample_size, 'head-batch'),
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=max(1, args.cpu_num // 2),
            collate_fn=TrainDataset.collate_fn)

        train_dataloader_tail = DataLoader(
            TrainDataset(train_triples, nentity, nrelation,
                         args.negative_sample_size, 'tail-batch'),
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=max(1, args.cpu_num // 2),
            collate_fn=TrainDataset.collate_fn)

        train_iterator = BidirectionalOneShotIterator(train_dataloader_head,
                                                      train_dataloader_tail)

        # Set training configuration
        current_learning_rate = args.learning_rate
        optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad,
                   kge_model.parameters()),  # only parameters with requires_grad=True are optimized
            lr=current_learning_rate)
        if args.warm_up_steps:
            warm_up_steps = args.warm_up_steps
        else:
            warm_up_steps = args.max_steps // 2

    if args.init_checkpoint:
        # Restore model from checkpoint directory
        logging.info('Loading checkpoint %s...' % args.init_checkpoint)
        checkpoint = torch.load(
            os.path.join(args.init_checkpoint, 'checkpoint'))
        init_step = checkpoint['step']
        kge_model.load_state_dict(checkpoint['model_state_dict'])
        if args.do_train:
            current_learning_rate = checkpoint['current_learning_rate']
            warm_up_steps = checkpoint['warm_up_steps']
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    else:
        logging.info('Randomly Initializing %s Model...' % args.model)
        init_step = 0

    step = init_step

    logging.info('Start Training...')
    logging.info('init_step = %d' % init_step)
    logging.info('learning_rate = %f' % current_learning_rate)
    logging.info('batch_size = %d' % args.batch_size)
    logging.info('negative_adversarial_sampling = %d' %
                 args.negative_adversarial_sampling)
    logging.info('hidden_dim = %d' % args.hidden_dim)
    logging.info('gamma = %f' % args.gamma)
    logging.info('negative_adversarial_sampling = %s' %
                 str(args.negative_adversarial_sampling))
    if args.negative_adversarial_sampling:
        logging.info('adversarial_temperature = %f' %
                     args.adversarial_temperature)

    # Set valid dataloader as it would be evaluated during training

    if args.do_train:
        training_logs = []

        # Training Loop
        for step in range(init_step, args.max_steps):
            #train_iterator = BidirectionalOneShotIterator(train_dataloader_head, train_dataloader_tail)
            log = kge_model.train_step(kge_model, optimizer, train_iterator,
                                       args)

            training_logs.append(log)
            # dynamically adjust the learning rate
            if step >= warm_up_steps:  # after warm_up_steps, drop the learning rate to 1/10 of its value
                current_learning_rate = current_learning_rate / 10
                logging.info('Change learning_rate to %f at step %d' %
                             (current_learning_rate, step))
                optimizer = torch.optim.Adam(
                    filter(lambda p: p.requires_grad, kge_model.parameters()),
                    lr=current_learning_rate  # update the optimizer with the new learning rate
                )
                warm_up_steps = warm_up_steps * 3  # push the next decay further out
            # save a checkpoint every save_checkpoint_steps steps
            if step % args.save_checkpoint_steps == 0:
                save_variable_list = {
                    'step': step,
                    'current_learning_rate': current_learning_rate,
                    'warm_up_steps': warm_up_steps
                }
                save_model(kge_model, optimizer, save_variable_list, args)

            if step % args.log_steps == 0:
                metrics = {}
                for metric in training_logs[0].keys():
                    metrics[metric] = sum(
                        [log[metric]
                         for log in training_logs]) / len(training_logs)
                log_metrics('Training average', step, metrics)
                training_logs = []

            if args.do_valid and step % args.valid_steps == 0:
                logging.info('Evaluating on Valid Dataset...')
                metrics = kge_model.test_step(kge_model, valid_triples,
                                              all_true_triples, args)
                log_metrics('Valid', step, metrics)

        save_variable_list = {
            'step': step,
            'current_learning_rate': current_learning_rate,
            'warm_up_steps': warm_up_steps
        }
        save_model(kge_model, optimizer, save_variable_list, args)

    if args.do_valid:
        logging.info('Evaluating on Valid Dataset...')
        metrics = kge_model.test_step(kge_model, valid_triples,
                                      all_true_triples, args)
        log_metrics('Valid', step, metrics)

    if args.do_test:
        logging.info('Evaluating on Test Dataset...')
        metrics = kge_model.test_step(kge_model, test_triples,
                                      all_true_triples, args)
        log_metrics('Test', step, metrics)

    if args.evaluate_train:
        logging.info('Evaluating on Training Dataset...')
        metrics = kge_model.test_step(kge_model, train_triples,
                                      all_true_triples, args)
        log_metrics('Test', step, metrics)
Example #10
def main(args):
    if (
        (not args.do_train)
        and (not args.do_valid)
        and (not args.do_test)
        and (not args.evaluate_train)
    ):
        raise ValueError("one of train/val/test mode must be choosed.")

    if args.init_checkpoint:
        override_config(args)

    args.save_path = (
        "log/%s/%s/%s-%s/%s"
        % (args.dataset, args.model, args.hidden_dim, args.gamma, time.time())
        if args.save_path == None
        else args.save_path
    )
    writer = SummaryWriter(args.save_path)

    # Write logs to checkpoint and console
    set_logger(args)

    dataset = LinkPropPredDataset(name=args.dataset)
    split_dict = dataset.get_edge_split()
    nentity = dataset.graph["num_nodes"]
    nrelation = int(max(dataset.graph["edge_reltype"])[0]) + 1

    evaluator = Evaluator(name=args.dataset)

    args.nentity = nentity
    args.nrelation = nrelation

    logging.info("Model: %s" % args.model)
    logging.info("Dataset: %s" % args.dataset)
    logging.info("#entity: %d" % nentity)
    logging.info("#relation: %d" % nrelation)

    train_triples = split_dict["train"]
    logging.info("#train: %d" % len(train_triples["head"]))
    valid_triples = split_dict["valid"]
    logging.info("#valid: %d" % len(valid_triples["head"]))
    test_triples = split_dict["test"]
    logging.info("#test: %d" % len(test_triples["head"]))

    train_count, train_true_head, train_true_tail = (
        defaultdict(lambda: 4),
        defaultdict(list),
        defaultdict(list),
    )
    for i in tqdm(range(len(train_triples["head"]))):
        head, relation, tail = (
            train_triples["head"][i],
            train_triples["relation"][i],
            train_triples["tail"][i],
        )
        train_count[(head, relation)] += 1
        train_count[(tail, -relation - 1)] += 1
        train_true_head[(relation, tail)].append(head)
        train_true_tail[(head, relation)].append(tail)

    kge_model = KGEModel(
        model_name=args.model,
        nentity=nentity,
        nrelation=nrelation,
        hidden_dim=args.hidden_dim,
        gamma=args.gamma,
        double_entity_embedding=args.double_entity_embedding,
        double_relation_embedding=args.double_relation_embedding,
        evaluator=evaluator,
    )

    logging.info("Model Parameter Configuration:")
    for name, param in kge_model.named_parameters():
        logging.info(
            "Parameter %s: %s, require_grad = %s"
            % (name, str(param.size()), str(param.requires_grad))
        )

    if args.cuda:
        kge_model = kge_model.cuda()

    if args.do_train:
        # Set training dataloader iterator
        train_dataloader_head = DataLoader(
            TrainDataset(
                train_triples,
                nentity,
                nrelation,
                args.negative_sample_size,
                "head-batch",
                train_count,
                train_true_head,
                train_true_tail,
            ),
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=max(1, args.cpu_num // 2),
            collate_fn=TrainDataset.collate_fn,
        )

        train_dataloader_tail = DataLoader(
            TrainDataset(
                train_triples,
                nentity,
                nrelation,
                args.negative_sample_size,
                "tail-batch",
                train_count,
                train_true_head,
                train_true_tail,
            ),
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=max(1, args.cpu_num // 2),
            collate_fn=TrainDataset.collate_fn,
        )

        train_iterator = BidirectionalOneShotIterator(
            train_dataloader_head, train_dataloader_tail
        )

        # Set training configuration
        current_learning_rate = args.learning_rate
        optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, kge_model.parameters()),
            lr=current_learning_rate,
        )
        if args.warm_up_steps:
            warm_up_steps = args.warm_up_steps
        else:
            warm_up_steps = args.max_steps // 2

    if args.init_checkpoint:
        # Restore model from checkpoint directory
        logging.info("Loading checkpoint %s..." % args.init_checkpoint)
        checkpoint = torch.load(os.path.join(args.init_checkpoint, "checkpoint"))
        init_step = checkpoint["step"]
        kge_model.load_state_dict(checkpoint["model_state_dict"])
        if args.do_train:
            current_learning_rate = checkpoint["current_learning_rate"]
            warm_up_steps = checkpoint["warm_up_steps"]
            optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    else:
        logging.info("Ramdomly Initializing %s Model..." % args.model)
        init_step = 0

    step = init_step

    logging.info("Start Training...")
    logging.info("init_step = %d" % init_step)
    logging.info("batch_size = %d" % args.batch_size)
    logging.info(
        "negative_adversarial_sampling = %d" % args.negative_adversarial_sampling
    )
    logging.info("hidden_dim = %d" % args.hidden_dim)
    logging.info("gamma = %f" % args.gamma)
    logging.info(
        "negative_adversarial_sampling = %s" % str(args.negative_adversarial_sampling)
    )
    if args.negative_adversarial_sampling:
        logging.info("adversarial_temperature = %f" % args.adversarial_temperature)

    # Set valid dataloader as it would be evaluated during training

    if args.do_train:
        logging.info("learning_rate = %d" % current_learning_rate)

        training_logs = []

        # Training Loop
        for step in range(init_step, args.max_steps):

            log = kge_model.train_step(kge_model, optimizer, train_iterator, args)
            training_logs.append(log)

            if step >= warm_up_steps:
                current_learning_rate = current_learning_rate / 10
                logging.info(
                    "Change learning_rate to %f at step %d"
                    % (current_learning_rate, step)
                )
                optimizer = torch.optim.Adam(
                    filter(lambda p: p.requires_grad, kge_model.parameters()),
                    lr=current_learning_rate,
                )
                warm_up_steps = warm_up_steps * 3

            if (
                step % args.save_checkpoint_steps == 0 and step > 0
            ):  # ~ 41 seconds/saving
                save_variable_list = {
                    "step": step,
                    "current_learning_rate": current_learning_rate,
                    "warm_up_steps": warm_up_steps,
                }
                save_model(kge_model, optimizer, save_variable_list, args)

            if step % args.log_steps == 0:
                metrics = {}
                for metric in training_logs[0].keys():
                    metrics[metric] = sum([log[metric] for log in training_logs]) / len(
                        training_logs
                    )
                log_metrics("Train", step, metrics, writer)
                training_logs = []

            if args.do_valid and step % args.valid_steps == 0 and step > 0:
                logging.info("Evaluating on Valid Dataset...")
                metrics = kge_model.test_step(kge_model, valid_triples, args)
                log_metrics("Valid", step, metrics, writer)

        save_variable_list = {
            "step": step,
            "current_learning_rate": current_learning_rate,
            "warm_up_steps": warm_up_steps,
        }
        save_model(kge_model, optimizer, save_variable_list, args)

    if args.do_valid:
        logging.info("Evaluating on Valid Dataset...")
        metrics = kge_model.test_step(kge_model, valid_triples, args)
        log_metrics("Valid", step, metrics, writer)

    if args.do_test:
        logging.info("Evaluating on Test Dataset...")
        metrics = kge_model.test_step(kge_model, test_triples, args)
        log_metrics("Test", step, metrics, writer)

    if args.evaluate_train:
        logging.info("Evaluating on Training Dataset...")
        small_train_triples = {}
        indices = np.random.choice(
            len(train_triples["head"]), args.ntriples_eval_train, replace=False
        )
        for i in train_triples:
            small_train_triples[i] = train_triples[i][indices]
        metrics = kge_model.test_step(
            kge_model, small_train_triples, args, random_sampling=True
        )
        log_metrics("Train", step, metrics, writer)
Example #11
0
    args.model = argparse_dict['model']
    args.double_entity_embedding = argparse_dict['double_entity_embedding']
    args.double_relation_embedding = argparse_dict['double_relation_embedding']
    args.hidden_dim = argparse_dict['hidden_dim']
    args.test_batch_size = argparse_dict['test_batch_size']
    args.fake = argparse_dict['fake']
    args.method = argparse_dict['method']
    args.save_path = argparse_dict['save_path']


override_config(args)
checkpoint = torch.load(os.path.join(save_path, 'checkpoint'))
kge_model = KGEModel(
    model_name=model,
    nentity=args.nentity,
    nrelation=args.nrelation,
    hidden_dim=args.hidden_dim,
    gamma=argparse_dict["gamma"],
    double_entity_embedding=argparse_dict["double_entity_embedding"],
    double_relation_embedding=argparse_dict["double_relation_embedding"])
kge_model.load_state_dict(checkpoint['model_state_dict'])
kge_model = kge_model.cuda()
trainer = NoiGANTrainer(train_triples, fake_triples, args, kge_model, False)
trainer.classifier.load_state_dict(checkpoint['classifier'])
# trainer.generator.load_state_dict(checkpoint['generator'])
true_head, true_tail = TrainDataset.get_true_head_and_tail(all_true_triples)

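# Example query in tail-batch mode: score/rank candidate tails for the given (head, relation) pair.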
query_head, query_relation, query_tail, args.mode = "Joby_Talbot", "wroteMusicFor", "The_Hitchhiker's_Guide_to_the_Galaxy_(film)", "tail-batch"
head, relation, tail = entity2id[query_head], relation2id[
    query_relation], entity2id[query_tail]
args.negative_sample_size = 1024
Example #12
0
def main(args):

    if args.init_checkpoint:
        override_config(args)
    elif args.data_path is None:
        raise ValueError('one of init_checkpoint/data_path must be chosen.')

    if args.save_path is None:
        raise ValueError('Where do you want to save your trained model?')

    if args.save_path and not os.path.exists(args.save_path):
        os.makedirs(args.save_path)

    # Write logs to checkpoint and console
    set_logger(args)

    with open(os.path.join(args.data_path, 'entities.dict')) as fin:
        entity2id = dict()
        for line in fin:
            eid, entity = line.strip().split('\t')
            entity2id[entity] = int(eid)

    with open(os.path.join(args.data_path, 'relations.dict')) as fin:
        relation2id = dict()
        for line in fin:
            rid, relation = line.strip().split('\t')
            relation2id[relation] = int(rid)

    # Read regions for Countries S* datasets
    if args.countries:
        regions = list()
        with open(os.path.join(args.data_path, 'regions.list')) as fin:
            for line in fin:
                region = line.strip()
                regions.append(entity2id[region])
        args.regions = regions

    nentity = len(entity2id)
    nrelation = len(relation2id)

    args.nentity = nentity
    args.nrelation = nrelation

    train_triples = read_triple(os.path.join(args.data_path, 'train.txt'),
                                entity2id, relation2id)
    logging.info('#train: %d' % len(train_triples))
    valid_triples = read_triple(os.path.join(args.data_path, 'valid.txt'),
                                entity2id, relation2id)
    logging.info('#valid: %d' % len(valid_triples))
    test_triples = read_triple(os.path.join(args.data_path, 'test.txt'),
                               entity2id, relation2id)
    logging.info('#test: %d' % len(test_triples))

    #All true triples
    all_true_triples = train_triples + valid_triples + test_triples
    current_learning_rate = args.learning_rate

    ntriples = len(train_triples)
    '''print('Model: %s' % args.model)
                print('Data Path: %s' % args.data_path)
                print('#entity: %d' % nentity)
                print('#relation: %d' % nrelation)
                print('optimizer: ', OPT)
                if args.train_old: print('USING ORIGINAL TRAINING FUNCTION')

                print('learning_rate = %f' % current_learning_rate)
                print('batch_size = %d' % args.batch_size)
                print('negative_adversarial_sampling = %d' % args.negative_adversarial_sampling)
                print('hidden_dim = %d' % args.hidden_dim)
                print('negative_adversarial_sampling = %s' % str(args.negative_adversarial_sampling))
                if args.negative_adversarial_sampling:
                    print('adversarial_temperature = %f' % args.adversarial_temperature)
    '''

    logging.info('Model: %s' % args.model)
    logging.info('Data Path: %s' % args.data_path)
    logging.info('#entity: %d' % nentity)
    logging.info('#relation: %d' % nrelation)
    logging.info('optimizer: %s' % OPT)
    if args.train_old: logging.info('USING ORIGINAL TRAINING FUNCTION')
    #else: print('GRID TESTING\nUsing new loss function')

    info = 'Model - {}; opt - {}; batch size - {}, dim - {}; dataset - {}; lr - {}; '.format(
        args.model, OPT, str(args.batch_size), args.hidden_dim, args.data_path,
        str(current_learning_rate))
    print(info)
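    # Grid search: re-initialize and train the model for every combination of
    # (gamma1, gamma2), negative sample size, and step budget.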
    for g1, g2 in zip(GAMMA1, GAMMA2):
        for n_neg in N_NEGS_LIST:
            for steps in N_STEPS_LIST:
                current_learning_rate = args.learning_rate
                # re-initialize the model
                kge_model = KGEModel(
                    model_name=args.model,
                    nentity=nentity,
                    nrelation=nrelation,
                    ntriples=ntriples,
                    hidden_dim=args.hidden_dim,
                    gamma=args.gamma,
                    gamma1=g1,
                    gamma2=g2,
                    double_entity_embedding=args.double_entity_embedding,
                    double_relation_embedding=args.double_relation_embedding,
                )
                kge_model.set_loss(args.loss)

                logging.info('Model Parameter Configuration:')
                for name, param in kge_model.named_parameters():
                    logging.info(
                        'Parameter %s: %s, require_grad = %s' %
                        (name, str(param.size()), str(param.requires_grad)))
                logging.info('Loss function %s' % args.loss)
                if args.cuda:
                    kge_model = kge_model.cuda()

                logging.info('Randomly Initializing %s Model...' % args.model)

                args.max_steps = steps
                args.negative_sample_size = n_neg
                out_line = 'g1 = {}, g2 = {}, #steps = {}, #negs = {};'.format(
                    kge_model.gamma1, kge_model.gamma2, args.max_steps,
                    args.negative_sample_size)
                logging.info('gamma1 = %f, gamma2 = %f' % (g1, g2))
                logging.info('Max steps - %d' % args.max_steps)
                logging.info('Negative sample %d ' % args.negative_sample_size)

                train_iterator = construct_dataloader(args, train_triples,
                                                      nentity, nrelation)
                step = grid_train_model(0, valid_triples, all_true_triples,
                                        kge_model, train_iterator, args)
                metrics = kge_model.test_step(kge_model, test_triples,
                                              all_true_triples, args)
                log_metrics('Test', step, metrics)
                values = [
                    str(metrics['MRR']),
                    str(metrics['MR']),
                    str(metrics['HITS@1']),
                    str(metrics['HITS@3']),
                    str(metrics['HITS@10'])
                ]
                out_line = out_line + ';'.join(values)
                print(out_line)

                logging.info(
                    '\n-----------------------------------------------')
Example #13
0
def main(args):
    if (not args.do_train) and (not args.do_valid) and (not args.do_test) and (
            not args.evaluate_train):
        raise ValueError('one of train/val/test mode must be chosen.')

    if args.init_checkpoint:
        override_config(args)

    args.save_path = 'log/%s/%s/%s-%s/%s' % (
        args.dataset, args.model, args.hidden_dim, args.gamma,
        time.time()) if args.save_path == None else args.save_path
    writer = SummaryWriter(args.save_path)

    # Write logs to checkpoint and console
    set_logger(args)
    #     dataset = WikiKG90MDataset(root='/Users/chding/desktop/KDD/dataset_mini/')
    dataset = WikiKG90MDataset(root='dataset_all/')
    # dataset = LinkPropPredDataset(name = args.dataset)
    # split_dict = dataset.get_edge_split()
    # nentity = dataset.graph['num_nodes']
    # nrelation = int(max(dataset.graph['edge_reltype'])[0])+1
    evaluator = WikiKG90MEvaluator()  #Evaluator(name = args.dataset)

    # train_triples = split_dict['train']
    train_triples = dataset.train_hrt  #np.load('/Users/chding/Desktop/KDD/dataset/wikikg90m_kddcup2021/processed/train_hrt.npy')
    logging.info('#train: %d' % len(train_triples))

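    # The WikiKG90M h,r->t validation task provides (h, r) pairs, tail candidates,
    # and the index of the correct candidate; pack them into one array per query.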
    valid_task = dataset.valid_dict['h,r->t']
    hr = valid_task['hr']
    t_candidate = valid_task['t_candidate']
    t_correct_index = valid_task['t_correct_index'].reshape(len(hr), 1)
    valid_triples = np.concatenate((hr, t_candidate, t_correct_index), axis=1)
    logging.info('#valid: %d' % len(valid_triples))

    test_task = dataset.test_dict[
        'h,r->t']  # get a dictionary storing the h,r->t task.
    hr_test = test_task['hr']
    t_candidate_test = test_task['t_candidate']

    test_triples = np.concatenate((hr_test, t_candidate_test), axis=1)
    logging.info("验证的样本维度为:{}".format(valid_triples.shape))
    # logging.info("验证的样本维度为:{}".format(valid_triples[0]))
    logging.info("测试的样本维度为:{}".format(test_triples.shape))
    # np.load('/Users/chding/Desktop/KDD/dataset/wikikg90m_kddcup2021/processed/test_hr.npy')

    logging.info('#test: %d' % len(test_triples))
    nentity = dataset.num_entities  #len(np.unique(train_triples[:, [0, 2]]))
    nrelation = len(np.unique(train_triples[:, 1]))

    args.nentity = nentity
    args.nrelation = nrelation

    logging.info('Model: %s' % args.model)
    logging.info('Dataset: %s' % args.dataset)
    logging.info('#entity: %d' % nentity)
    logging.info('#relation: %d' % nrelation)

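    # (head, relation) frequency counts, initialized at 4 as a smoothing constant;
    # TrainDataset presumably uses them for word2vec-style subsampling weights.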
    train_count, train_true_head, train_true_tail = defaultdict(
        lambda: 4), defaultdict(list), defaultdict(list)
    for i in tqdm(range(len(train_triples))):
        head, relation, tail = train_triples[i][0], train_triples[i][
            1], train_triples[i][2]
        train_count[(head, relation)] += 1
        # train_count[(tail, -relation-1)] += 1
        # train_true_head[(relation, tail)].append(head)
        train_true_tail[(head, relation)].append(tail)

    kge_model = KGEModel(
        model_name=args.model,
        nentity=nentity,
        nrelation=nrelation,
        hidden_dim=args.hidden_dim,
        gamma=args.gamma,
        double_entity_embedding=args.double_entity_embedding,
        double_relation_embedding=args.double_relation_embedding,
        evaluator=evaluator)

    logging.info('Model Parameter Configuration:')
    for name, param in kge_model.named_parameters():
        logging.info('Parameter %s: %s, require_grad = %s' %
                     (name, str(param.size()), str(param.requires_grad)))

    logging.info('want to load cuda')
    if args.cuda:
        device = "cuda:0"
        kge_model = kge_model.to(device)


#         kge_model = kge_model.cuda()

    if args.do_train:
        # Set training dataloader iterator
        # train_dataloader_head = DataLoader(
        #     TrainDataset(train_triples, nentity, nrelation,
        #         args.negative_sample_size, 'head-batch',
        #         train_count, train_true_head, train_true_tail),
        #     batch_size=args.batch_size,
        #     shuffle=True,
        #     num_workers=0,  # max(1, args.cpu_num//2),
        #     collate_fn=TrainDataset.collate_fn
        # )
        logging.info('Loading Data')
        train_dataloader_tail = DataLoader(
            TrainDataset(train_triples, nentity, nrelation,
                         args.negative_sample_size, 'tail-batch', train_count,
                         train_true_head, train_true_tail),
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=max(1, args.cpu_num // 2),
            collate_fn=TrainDataset.collate_fn)
        print('read data OK!')
        train_iterator = OneShotIterator(
            train_dataloader_tail)  #train_dataloader_head,   #Bidirectional

        # Set training configuration
        current_learning_rate = args.learning_rate
        optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                            kge_model.parameters()),
                                     lr=current_learning_rate)
        if args.warm_up_steps:
            warm_up_steps = args.warm_up_steps
        else:
            warm_up_steps = args.max_steps // 2

    if args.init_checkpoint:
        # Restore model from checkpoint directory
        logging.info('Loading checkpoint %s...' % args.init_checkpoint)
        checkpoint = torch.load(
            os.path.join(args.init_checkpoint, 'checkpoint'))
        init_step = checkpoint['step']
        kge_model.load_state_dict(checkpoint['model_state_dict'])
        if args.do_train:
            current_learning_rate = checkpoint['current_learning_rate']
            warm_up_steps = checkpoint['warm_up_steps']
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    else:
        logging.info('Randomly Initializing %s Model...' % args.model)
        init_step = 0

    print('init OK!')

    step = init_step

    logging.info('Start Training...')
    logging.info('init_step = %d' % init_step)
    logging.info('batch_size = %d' % args.batch_size)
    logging.info('hidden_dim = %d' % args.hidden_dim)
    logging.info('gamma = %f' % args.gamma)
    logging.info('negative_adversarial_sampling = %s' %
                 str(args.negative_adversarial_sampling))
    if args.negative_adversarial_sampling:
        logging.info('adversarial_temperature = %f' %
                     args.adversarial_temperature)

    # Set valid dataloader as it would be evaluated during training

    if args.do_train:
        logging.info('learning_rate = %f' % current_learning_rate)

        training_logs = []

        #Training Loop
        for step in tqdm(range(init_step, args.max_steps)):

            log = kge_model.train_step(kge_model, optimizer, train_iterator,
                                       args)
            training_logs.append(log)

            if step >= warm_up_steps:
                current_learning_rate = current_learning_rate / 10
                logging.info('Change learning_rate to %f at step %d' %
                             (current_learning_rate, step))
                optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                                    kge_model.parameters()),
                                             lr=current_learning_rate)
                warm_up_steps = warm_up_steps * 3

            if step % args.save_checkpoint_steps == 0 and step > 0:  # ~ 41 seconds/saving
                save_variable_list = {
                    'step': step,
                    'current_learning_rate': current_learning_rate,
                    'warm_up_steps': warm_up_steps
                }
                save_model(kge_model, optimizer, save_variable_list, args)

            if step % args.log_steps == 0:
                metrics = {}
                logging.info("training_logs :{}".format(training_logs))
                # for metric in training_logs: #[0].keys():
                #     metrics[metric] = sum([log[metric] for log in training_logs])/len(training_logs)
                #     log_metrics('Train', step, metric, writer)
                training_logs = []

            if args.do_valid and step % args.valid_steps == 0 and step > 0:
                logging.info('Evaluating on Valid Dataset...')
                metrics = kge_model.test_step_0(kge_model, valid_triples, args)
                logging.info(metrics)
                # log_metrics('Valid', step, metrics, writer)

        save_variable_list = {
            'step': step,
            'current_learning_rate': current_learning_rate,
            'warm_up_steps': warm_up_steps
        }
        save_model(kge_model, optimizer, save_variable_list, args)

    if args.do_valid:
        print('do valid')
        logging.info('Evaluating on Valid Dataset...')
        metrics = kge_model.valid_step(kge_model, valid_triples,
                                       args)  # this is a single number (MRR)
        logging.info("Valid metrics: {}".format(metrics))
        # log_metrics('Valid', step, metrics, writer)

    if args.do_test:
        print('do test')
        logging.info('Evaluating on Test Dataset...')
        metrics = kge_model.test_step(kge_model, test_triples, args, 'result/')
        logging.info("Test metrics: {}".format(metrics))
        # log_metrics('Test', step, metrics, writer)

    if args.evaluate_train:
        print('do eval_train')
        logging.info('Evaluating on Training Dataset...')
        small_train_triples = []
        indices = np.random.choice(len(train_triples),
                                   args.ntriples_eval_train,
                                   replace=False)
        for i in indices:
            small_train_triples.append(train_triples[i])
        small_train_triples = np.array(small_train_triples)
        metrics = kge_model.test_step_0(kge_model,
                                        small_train_triples,
                                        args,
                                        random_sampling=True)
        logging.info("Training metrics: {}".format(metrics))
def main(args):
    if args.seed != -1:
        torch.manual_seed(args.seed)
        if args.cuda:
            torch.cuda.manual_seed(args.seed)

    if (not args.do_train) and (not args.do_valid) and (not args.do_test):
        raise ValueError('one of train/val/test mode must be chosen.')

    if args.init_checkpoint:
        override_config(args)
    elif args.data_path is None:
        raise ValueError('one of init_checkpoint/data_path must be chosen.')

    if args.do_train and args.save_path is None:
        raise ValueError('Where do you want to save your trained model?')

    if args.save_path and not os.path.exists(args.save_path):
        os.makedirs(args.save_path)
        if args.do_train and args.do_valid:
            if not os.path.exists("%s/best/" % args.save_path):
                os.makedirs("%s/best/" % args.save_path)

    # Write logs to checkpoint and console
    set_logger(args)

    with open(os.path.join(args.data_path, 'entities.dict')) as fin:
        entity2id = dict()
        for line in fin:
            eid, entity = line.strip().split('\t')
            entity2id[entity] = int(eid)

    with open(os.path.join(args.data_path, 'relations.dict')) as fin:
        relation2id = dict()
        for line in fin:
            rid, relation = line.strip().split('\t')
            relation2id[relation] = int(rid)

    # Read regions for Countries S* datasets
    if args.countries:
        regions = list()
        with open(os.path.join(args.data_path, 'regions.list')) as fin:
            for line in fin:
                region = line.strip()
                regions.append(entity2id[region])
        args.regions = regions

    nentity = len(entity2id)
    nrelation = len(relation2id)

    args.nentity = nentity
    args.nrelation = nrelation

    logging.info('Model: %s' % args.model)
    logging.info('Data Path: %s' % args.data_path)
    logging.info('#entity: %d' % nentity)
    logging.info('#relation: %d' % nrelation)

    train_triples = read_triple(os.path.join(args.data_path, 'train.txt'),
                                entity2id, relation2id)
    logging.info('#train: %d' % len(train_triples))
    valid_triples = read_triple(os.path.join(args.data_path, 'valid.txt'),
                                entity2id, relation2id)
    logging.info('#valid: %d' % len(valid_triples))
    test_triples = read_triple(os.path.join(args.data_path, 'test.txt'),
                               entity2id, relation2id)
    logging.info('#test: %d' % len(test_triples))

    train_triples_tsr = torch.LongTensor(train_triples).transpose(
        0, 1)  #idx X batch
    #All true triples
    all_true_triples = train_triples + valid_triples + test_triples
    #if args.use_gnn:
    #    assert False
    #    #kge_model = GNN_KGEModel(
    #    #    model_name=args.model,
    #    #    nentity=nentity,
    #    #    nrelation=nrelation,
    #    #    hidden_dim=args.hidden_dim,
    #    #    gamma=args.gamma,
    #    #    num_layers=args.gnn_layers,
    #    #    args = args,
    #    #    dropout=args.dropout,
    #    #    double_entity_embedding=args.double_entity_embedding,
    #    #    double_relation_embedding=args.double_relation_embedding,
    #    #)
    #else:
    kge_model = KGEModel(
        model_name=args.model,
        nentity=nentity,
        nrelation=nrelation,
        hidden_dim=args.hidden_dim,
        gamma=args.gamma,
        args=args,
        double_entity_embedding=args.double_entity_embedding,
        double_relation_embedding=args.double_relation_embedding,
    )

    logging.info('Model Configuration:')
    logging.info(str(kge_model))
    logging.info('Model Parameter Configuration:')
    for name, param in kge_model.named_parameters():
        logging.info('Parameter %s: %s, require_grad = %s' %
                     (name, str(param.size()), str(param.requires_grad)))

    if args.cuda:
        kge_model = kge_model.cuda()
        train_triples_tsr = train_triples_tsr.cuda()
    #kge_model.build_cxt_triple_map(train_triples)
    if args.do_train:
        # Set training dataloader iterator
        if args.same_head_tail:
            #shuffle train_triples first and no shuffle within dataloaders. So both head and tail will share the same idx
            shuffle(train_triples)
            train_dataloader_head = DataLoader(
                TrainDataset(train_triples, nentity, nrelation,
                             args.negative_sample_size, 'head-batch'),
                batch_size=args.batch_size,
                shuffle=False,
                num_workers=max(1, args.cpu_num // 2),
                collate_fn=TrainDataset.collate_fn)

            train_dataloader_tail = DataLoader(
                TrainDataset(train_triples, nentity, nrelation,
                             args.negative_sample_size, 'tail-batch'),
                batch_size=args.batch_size,
                shuffle=False,
                num_workers=max(1, args.cpu_num // 2),
                collate_fn=TrainDataset.collate_fn)
        else:
            train_dataloader_head = DataLoader(
                TrainDataset(train_triples, nentity, nrelation,
                             args.negative_sample_size, 'head-batch'),
                batch_size=args.batch_size,
                shuffle=True,
                num_workers=max(1, args.cpu_num // 2),
                collate_fn=TrainDataset.collate_fn)

            train_dataloader_tail = DataLoader(
                TrainDataset(train_triples, nentity, nrelation,
                             args.negative_sample_size, 'tail-batch'),
                batch_size=args.batch_size,
                shuffle=True,
                num_workers=max(1, args.cpu_num // 2),
                collate_fn=TrainDataset.collate_fn)
        train_iterator = BidirectionalOneShotIterator(train_dataloader_head,
                                                      train_dataloader_tail)
        #else:
        #    train_dataloader_rel = DataLoader(
        #        TrainDataset(train_triples, nentity, nrelation,
        #            args.negative_sample_head_size*args.negative_sample_tail_size,
        #            'rel-batch',
        #            negative_sample_head_size =args.negative_sample_head_size,
        #            negative_sample_tail_size =args.negative_sample_tail_size,
        #            half_correct=args.negative_sample_half_correct),
        #        batch_size=args.batch_size,
        #        shuffle=True,
        #        num_workers=max(1, args.cpu_num//2),
        #        collate_fn=TrainDataset.collate_fn
        #    )
        #    train_iterator = BidirectionalOneShotIterator.one_shot_iterator(train_dataloader_rel)
        #    tail_only = True

        # Set training configuration
        current_learning_rate = args.learning_rate
        optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, kge_model.parameters()),
            lr=current_learning_rate,
            weight_decay=args.weight_decay,
        )

        scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                    step_size=1,
                                                    gamma=0.5,
                                                    last_epoch=-1)
        #if args.warm_up_steps:
        #    warm_up_steps = args.warm_up_steps
        #else:
        #    warm_up_steps = args.max_steps // 2

    if args.init_checkpoint:
        # Restore model from checkpoint directory
        logging.info('Loading checkpoint %s...' % args.init_checkpoint)
        checkpoint = torch.load(
            os.path.join(args.init_checkpoint, 'checkpoint'))
        init_step = checkpoint['step']
        if 'score_weight' in kge_model.state_dict(
        ) and 'score_weight' not in checkpoint['model_state_dict']:
            checkpoint['model_state_dict'][
                'score_weights'] = kge_model.state_dict()['score_weights']
        kge_model.load_state_dict(checkpoint['model_state_dict'])
        if args.do_train:
            current_learning_rate = checkpoint['current_learning_rate']
            #warm_up_steps = checkpoint['warm_up_steps']
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        else:
            current_learning_rate = 0
    elif args.init_embedding:
        logging.info('Loading pretrained embedding %s ...' %
                     args.init_embedding)
        if kge_model.entity_embedding is not None:
            entity_embedding = np.load(
                os.path.join(args.init_embedding, 'entity_embedding.npy'))
            relation_embedding = np.load(
                os.path.join(args.init_embedding, 'relation_embedding.npy'))
            entity_embedding = torch.from_numpy(entity_embedding).to(
                kge_model.entity_embedding.device)
            relation_embedding = torch.from_numpy(relation_embedding).to(
                kge_model.relation_embedding.device)
            kge_model.entity_embedding.data[:entity_embedding.
                                            size(0)] = entity_embedding
            kge_model.relation_embedding.data[:relation_embedding.
                                              size(0)] = relation_embedding
        init_step = 1
        current_learning_rate = 0
    else:
        logging.info('Randomly Initializing %s Model...' % args.model)
        init_step = 1

    step = init_step

    logging.info('Start Training...')
    logging.info('init_step = %d' % init_step)
    logging.info('learning_rate = %.5f' % current_learning_rate)
    logging.info('batch_size = %d' % args.batch_size)
    logging.info('hidden_dim = %d' % args.hidden_dim)
    logging.info('gamma = %f' % args.gamma)
    logging.info('negative_adversarial_sampling = %s' %
                 str(args.negative_adversarial_sampling))
    if args.negative_adversarial_sampling:
        logging.info('adversarial_temperature = %f' %
                     args.adversarial_temperature)

    # Set valid dataloader as it would be evaluated during training

    #loss_func = nn.BCEWithLogitsLoss(reduction="none") if args.use_bceloss else nn.LogSigmoid()
    if args.use_bceloss:
        loss_func = nn.BCELoss(reduction="none")
    elif args.use_softmarginloss:
        loss_func = nn.SoftMarginLoss(reduction="none")
    else:
        loss_func = nn.LogSigmoid()
    #kge_model.cluster_relation_entity_embedding(args.context_cluster_num, args.context_cluster_scale)
    if args.do_train:
        training_logs = []
        best_metrics = None
        #Training Loop
        optimizer.zero_grad()
        for step in range(init_step, args.max_steps + 1):
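            # Gradient accumulation: zero the gradients at the start of every
            # update_freq-step window and only step the optimizer at its end.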
            if step % args.update_freq == 1 or args.update_freq == 1:
                optimizer.zero_grad()
            log = kge_model.train_step(kge_model, train_iterator,
                                       train_triples_tsr, loss_func, args)
            if step % args.update_freq == 0:
                optimizer.step()

            training_logs.append(log)

            #if step >= warm_up_steps:
            #    current_learning_rate = current_learning_rate / 10
            #    logging.info('Change learning_rate to %f at step %d' % (current_learning_rate, step))
            #    optimizer = torch.optim.Adam(
            #        filter(lambda p: p.requires_grad, kge_model.parameters()),
            #        lr=current_learning_rate
            #    )
            #    warm_up_steps = warm_up_steps * 3
            if step % args.schedule_steps == 0:
                scheduler.step()

            if step % args.save_checkpoint_steps == 0:
                save_variable_list = {
                    'step': step,
                    'current_learning_rate': current_learning_rate,
                    #'warm_up_steps': warm_up_steps
                }
                save_model(kge_model, optimizer, save_variable_list, args)

            if step % args.log_steps == 0:
                metrics = {}
                for metric in training_logs[0].keys():
                    metrics[metric] = sum(
                        [log[metric]
                         for log in training_logs]) / len(training_logs)
                log_metrics('Training average', step, metrics)
                training_logs = []

            if args.do_valid and step % args.valid_steps == 0:
                logging.info('Evaluating on Valid Dataset...')
                metrics = kge_model.test_step(kge_model, valid_triples,
                                              all_true_triples,
                                              train_triples_tsr, args)
                log_metrics('Valid', step, metrics)
                if is_better_metric(best_metrics, metrics):
                    save_variable_list = {
                        'step': step,
                        'current_learning_rate': current_learning_rate,
                        #'warm_up_steps': warm_up_steps
                    }
                    save_model(kge_model, optimizer, save_variable_list, args,
                               True)
                    best_metrics = metrics
                #kge_model.cluster_relation_entity_embedding(args.context_cluster_num, args.context_cluster_scale)

        save_variable_list = {
            'step': step,
            'current_learning_rate': current_learning_rate,
            #'warm_up_steps': warm_up_steps
        }
        save_model(kge_model, optimizer, save_variable_list, args)
    if args.do_valid and args.do_train:
        #load the best model
        best_checkpoint = torch.load("%s/best/checkpoint" % args.save_path)
        kge_model.load_state_dict(best_checkpoint['model_state_dict'])
        logging.info("Loading best model from step %d" %
                     best_checkpoint['step'])
        step = best_checkpoint['step']

    if args.do_valid:
        logging.info('Evaluating on Valid Dataset...')
        metrics = kge_model.test_step(kge_model, valid_triples,
                                      all_true_triples, train_triples_tsr,
                                      args)
        log_metrics('Valid', step, metrics)

    if args.do_test:
        logging.info('Evaluating on Test Dataset...')
        metrics = kge_model.test_step(kge_model, test_triples,
                                      all_true_triples, train_triples_tsr,
                                      args)
        log_metrics('Test', step, metrics)

    if args.evaluate_train:
        logging.info('Evaluating on Training Dataset...')
        metrics = kge_model.test_step(kge_model, train_triples,
                                      all_true_triples, train_triples_tsr,
                                      args)
        log_metrics('Train', step, metrics)
def main(args):
    if (not args.do_train) and (not args.do_valid) and (not args.do_test):
        raise ValueError('one of train/val/test mode must be chosen.')

    if args.init_checkpoint:
        override_config(args)
    elif args.data_path is None:
        raise ValueError('one of init_checkpoint/data_path must be chosen.')

    if args.do_train and args.save_path is None:
        raise ValueError('Where do you want to save your trained model?')

    if args.save_path and not os.path.exists(args.save_path):
        os.makedirs(args.save_path)

    # Write logs to checkpoint and console
    set_logger(args)

    with open(os.path.join(args.data_path, 'entities.dict')) as fin:
        entity2id = dict()
        id2entity = dict()
        for line in fin:
            eid, entity = line.strip().split('\t')
            entity2id[entity] = int(eid)
            id2entity[int(eid)] = entity

    with open(os.path.join(args.data_path, 'relations.dict')) as fin:
        relation2id = dict()
        id2relationship = dict()
        for line in fin:
            rid, relation = line.strip().split('\t')
            relation2id[relation] = int(rid)
            id2relationship[int(rid)] = relation

    # Read regions for Countries S* datasets
    if args.countries:
        regions = list()
        with open(os.path.join(args.data_path, 'regions.list')) as fin:
            for line in fin:
                region = line.strip()
                regions.append(entity2id[region])
        args.regions = regions

    e_vocab = Dictionary(tok2ind=entity2id, ind2tok=id2entity)
    r_vocab = Dictionary(tok2ind=relation2id, ind2tok=id2relationship)
    ## TODO: add graph file
    graph = KB(os.path.join(args.data_path, 'train.txt'),
               e_vocab=e_vocab,
               r_vocab=r_vocab)

    nentity = len(entity2id)
    nrelation = len(relation2id)

    args.nentity = nentity
    args.nrelation = nrelation

    logging.info('Model: %s' % args.model)
    logging.info('Data Path: %s' % args.data_path)
    logging.info('#entity: %d' % nentity)
    logging.info('#relation: %d' % nrelation)

    train_triples = read_triple(os.path.join(args.data_path, 'train.txt'),
                                entity2id, relation2id)
    logging.info('#train: %d' % len(train_triples))
    valid_triples = read_triple(os.path.join(args.data_path, 'valid.txt'),
                                entity2id, relation2id)
    logging.info('#valid: %d' % len(valid_triples))
    test_triples = read_triple(os.path.join(args.data_path, 'test.txt'),
                               entity2id, relation2id)
    logging.info('#test: %d' % len(test_triples))
    candidate_entities = None
    if args.rerank_minerva:
        candidate_entities = defaultdict(set)
        with open(
                "/home/shdhulia//Limits-of-Path-Reasoner/outputs/FB15K-237/thisone_test/all_answers.txt"
        ) as candidate_file:
            # with open("/home/shdhulia/minerva_answers/fb.txt") as candidate_file:
            for line in candidate_file:
                pt = line.strip().split("\t")
                e1 = entity2id[pt[0]]
                r = relation2id[pt[1]]
                predicted_es = set(
                    [entity2id[p] for p in pt[2:] if p in entity2id])
                candidate_entities[(e1, r)] = set(predicted_es)

    #All true triples
    all_true_triples = train_triples + valid_triples + test_triples

    kge_model = KGEModel(
        model_name=args.model,
        nentity=nentity,
        nrelation=nrelation,
        hidden_dim=args.hidden_dim,
        gamma=args.gamma,
        double_entity_embedding=args.double_entity_embedding,
        double_relation_embedding=args.double_relation_embedding)

    logging.info('Model Parameter Configuration:')
    for name, param in kge_model.named_parameters():
        logging.info('Parameter %s: %s, require_grad = %s' %
                     (name, str(param.size()), str(param.requires_grad)))

    if args.cuda:
        kge_model = kge_model.cuda()

    if args.do_train:
        # Set training dataloader iterator
        # train_dataloader_head = DataLoader(
        #     TrainDataset(train_triples, nentity, nrelation, args.negative_sample_size, 'head-batch'),
        #     batch_size=args.batch_size,
        #     shuffle=True,
        #     num_workers=max(1, args.cpu_num//2),
        #     collate_fn=TrainDataset.collate_fn
        # )

        train_dataloader_tail = DataLoader(
            TrainDataset(train_triples,
                         nentity,
                         nrelation,
                         args.negative_sample_size,
                         'tail-batch',
                         KB=graph),
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=max(1, args.cpu_num // 2),
            collate_fn=TrainDataset.collate_fn)

        # train_iterator = BidirectionalOneShotIterator(train_dataloader_head, train_dataloader_tail)
        train_iterator = OneShotIterator(train_dataloader_tail)

        # Set training configuration
        current_learning_rate = args.learning_rate
        optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                            kge_model.parameters()),
                                     lr=current_learning_rate)
        if args.warm_up_steps:
            warm_up_steps = args.warm_up_steps
        else:
            warm_up_steps = args.max_steps // 2

    if args.init_checkpoint:
        # Restore model from checkpoint directory
        logging.info('Loading checkpoint %s...' % args.init_checkpoint)
        checkpoint = torch.load(
            os.path.join(args.init_checkpoint, 'checkpoint'))
        init_step = checkpoint['step']
        kge_model.load_state_dict(checkpoint['model_state_dict'])
        if args.do_train:
            current_learning_rate = checkpoint['current_learning_rate']
            warm_up_steps = checkpoint['warm_up_steps']
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    else:
        logging.info('Randomly Initializing %s Model...' % args.model)
        init_step = 0

    step = init_step

    logging.info('Start Training...')
    logging.info('init_step = %d' % init_step)

    logging.info('batch_size = %d' % args.batch_size)
    logging.info('hidden_dim = %d' % args.hidden_dim)
    logging.info('gamma = %f' % args.gamma)
    logging.info('negative_adversarial_sampling = %s' %
                 str(args.negative_adversarial_sampling))
    if args.negative_adversarial_sampling:
        logging.info('adversarial_temperature = %f' %
                     args.adversarial_temperature)

    # Set valid dataloader as it would be evaluated during training

    if args.do_train:
        logging.info('learning_rate = %f' % current_learning_rate)

        training_logs = []

        #Training Loop
        for step in range(init_step, args.max_steps):

            log = kge_model.train_step(kge_model, optimizer, train_iterator,
                                       args)

            training_logs.append(log)

            if step >= warm_up_steps:
                current_learning_rate = current_learning_rate / 10
                logging.info('Change learning_rate to %f at step %d' %
                             (current_learning_rate, step))
                optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                                    kge_model.parameters()),
                                             lr=current_learning_rate)
                warm_up_steps = warm_up_steps * 3

            if step % args.save_checkpoint_steps == 0:
                save_variable_list = {
                    'step': step,
                    'current_learning_rate': current_learning_rate,
                    'warm_up_steps': warm_up_steps
                }
                save_model(kge_model, optimizer, save_variable_list, args)

            if step % args.log_steps == 0:
                metrics = {}
                for metric in training_logs[0].keys():
                    metrics[metric] = sum(
                        [log[metric]
                         for log in training_logs]) / len(training_logs)
                log_metrics('Training average', step, metrics)
                training_logs = []

            if args.do_valid and step % args.valid_steps == 0:
                logging.info('Evaluating on Valid Dataset...')
                metrics = kge_model.test_step(kge_model, valid_triples,
                                              all_true_triples, args)
                log_metrics('Valid', step, metrics)

        save_variable_list = {
            'step': step,
            'current_learning_rate': current_learning_rate,
            'warm_up_steps': warm_up_steps
        }
        save_model(kge_model, optimizer, save_variable_list, args)

    if args.do_valid:
        logging.info('Evaluating on Valid Dataset...')
        metrics = kge_model.test_step(kge_model,
                                      valid_triples,
                                      all_true_triples,
                                      args,
                                      candidate_entities,
                                      id2e=id2entity,
                                      id2rel=id2relationship)
        log_metrics('Valid', step, metrics)

    # if args.do_test:
    #     logging.info('Evaluating on Test Dataset...')
    #     metrics = kge_model.test_step(kge_model, test_triples, all_true_triples, args)
    #     log_metrics('Test', step, metrics)

    if args.do_test:
        logging.info('Evaluating on Test Dataset...')
        metrics = kge_model.test_step(kge_model,
                                      test_triples,
                                      all_true_triples,
                                      args,
                                      candidate_entities,
                                      id2e=id2entity,
                                      id2rel=id2relationship)
        log_metrics('Test', step, metrics)

    if args.evaluate_train:
        logging.info('Evaluating on Training Dataset...')
        metrics = kge_model.test_step(kge_model, train_triples,
                                      all_true_triples, args)
        log_metrics('Train', step, metrics)
Example #16
0
def main(args):
    if not torch.cuda.is_available():
        args.cuda = False

    if args.ruge:
        args.loss = 'ruge'

    if (not args.do_train) and (not args.do_valid) and (not args.do_test) and (
            not args.do_experiment) and (not args.do_grid):
        raise ValueError('one of train/val/test mode must be chosen.')

    if args.init_checkpoint:
        override_config(args)

    elif args.data_path is None:
        raise ValueError('one of init_checkpoint/data_path must be chosen.')

    if args.do_train and args.save_path is None:
        raise ValueError('Where do you want to save your trained model?')

    if args.save_path and not os.path.exists(args.save_path):
        os.makedirs(args.save_path)

    # Write logs to checkpoint and console

    set_logger(args)
    if args.regularization != 0:
        print('L3 regularization with coeff - ', args.regularization)
    if args.l2_r != 0:
        print('L2 regularization with coeff - ', args.l2_r)
    if args.project != 0:
        print('projecting before training')
    #logging.info('Inverse loss = premise - concl (reverse)')
    if OPT_STOPPING:
        logging.info('Opt stopping is ON')
        print('Opt stopping is on')

    with open(os.path.join(args.data_path, 'entities.dict')) as fin:
        entity2id = dict()
        for line in fin:
            eid, entity = line.strip().split('\t')
            entity2id[entity] = int(eid)

    with open(os.path.join(args.data_path, 'relations.dict')) as fin:
        relation2id = dict()
        for line in fin:
            rid, relation = line.strip().split('\t')
            relation2id[relation] = int(rid)

    # Read regions for Countries S* datasets
    if args.countries:
        regions = list()
        with open(os.path.join(args.data_path, 'regions.list')) as fin:
            for line in fin:
                region = line.strip()
                regions.append(entity2id[region])
        args.regions = regions

    nentity = len(entity2id)
    nrelation = len(relation2id)

    args.nentity = nentity
    args.nrelation = nrelation

    if args.inject:
        logging.info('With rule injection')
    else:
        logging.info('NO INJECTION')

    logging.info('Model: %s' % args.model)
    logging.info('Data Path: %s' % args.data_path)
    logging.info('#entity: %d' % nentity)
    logging.info('#relation: %d' % nrelation)

    train_triples = read_triple(os.path.join(args.data_path, 'train.txt'),
                                entity2id, relation2id)
    logging.info('#train: %d' % len(train_triples))
    valid_triples = read_triple(os.path.join(args.data_path, 'valid.txt'),
                                entity2id, relation2id)
    logging.info('#valid: %d' % len(valid_triples))
    test_triples = read_triple(
        os.path.join(args.data_path, 'test.txt'), entity2id,
        relation2id)  # For testing on Symmetric in WordNet: Symmetric_testWN18
    logging.info('#test: %d' % len(test_triples))

    #All true triples
    all_true_triples = train_triples + valid_triples + test_triples
    train_args = {}

    # set up rule iterators
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
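    # Number of training batches per epoch (ceiling division).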
    n_batches = len(train_triples) // args.batch_size
    if n_batches < len(train_triples) / args.batch_size: n_batches += 1
    rule_iterators = {}
    rules_info = ''
    if args.inv:
        n_inverse, inverse_batchsize, rule_iterators[
            'inverse'] = setup_rule_loader(n_batches, args.batch_size,
                                           args.data_path,
                                           'groundings_inverse.txt', device,
                                           RULE_BATCH_SIZE_INV)
        rules_info += 'Inverse: batch size %d out of %d rules' % (
            inverse_batchsize, n_inverse) + '\n'
    if args.eq:
        n_eq, eq_batchsize, rule_iterators['equality'] = setup_rule_loader(
            n_batches, args.batch_size, args.data_path,
            'groundings_equality.txt', device, RULE_BATCH_SIZE_EQ)
        rules_info += 'Equality: batch size %d out of %d rules' % (
            eq_batchsize, n_eq) + '\n'
    if args.impl:
        n_impl, impl_batchsize, rule_iterators[
            'implication'] = setup_rule_loader(n_batches, args.batch_size,
                                               args.data_path,
                                               'groundings_implication.txt',
                                               device, RULE_BATCH_SIZE_IMPL)
        rules_info += 'implication: batch size %d out of %d rules\n' % (
            impl_batchsize, n_impl)
    if args.sym:
        n_symmetry, sym_batchsize, rule_iterators[
            'symmetry'] = setup_rule_loader(n_batches, args.batch_size,
                                            args.data_path,
                                            'groundings_symmetric.txt', device,
                                            RULE_BATCH_SIZE_SYM)
        rules_info += 'symmetry: batch size %d out of %d rules\n' % (
            sym_batchsize, n_symmetry)
    if args.ruge or args.ruge_inject:
        n_rules, rule_iterators['ruge'] = construct_ruge_loader(
            n_batches, args)
        rules_info += 'RUGE: Total %d rules\n' % n_rules

    if rules_info:
        logging.info(rules_info)

    # ----------- adversarial ------------------
    if args.adversarial:
        clauses_filename = os.path.join(args.data_path, 'clauses_0.9.pl')
        adv_clauses, clentity2id = dt.read_clauses(clauses_filename,
                                                   relation2id)
        n_clause_entities = len(clentity2id)
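        # Match the adversarial model's dimension to the entity embedding width:
        # 1x hidden_dim for TransE/pRotatE, 2x by default (complex-valued models), 4x for QuatE.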
        mult = 2
        if args.model in ['TransE', 'pRotatE']: mult = 1
        if 'QuatE' in args.model: mult = 4
        adv_model = ADVModel(clauses=adv_clauses,
                             n_entities=len(clentity2id),
                             dim=mult * args.hidden_dim,
                             use_cuda=args.cuda)
        if args.cuda:
            adv_model = adv_model.cuda()
    else:
        adv_model = None

    if args.do_grid:
        if rules_info:
            print(rules_info)
        run_grid(nentity, nrelation, train_triples, valid_triples,
                 test_triples, all_true_triples, args, rule_iterators,
                 adv_model)
        exit()
    ntriples = len(train_triples)
    kge_model = KGEModel(
        model_name=args.model,
        nentity=nentity,
        nrelation=nrelation,
        ntriples=ntriples,
        hidden_dim=args.hidden_dim,
        gamma=args.gamma,
    )
    kge_model.set_loss(args.loss)

    logging.info('Model Parameter Configuration:')
    for name, param in kge_model.named_parameters():
        logging.info('Parameter %s: %s, require_grad = %s' %
                     (name, str(param.size()), str(param.requires_grad)))
    logging.info('Loss function %s' % args.loss)
    if args.cuda and args.parallel:
        gpus = [0, 1]
        os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(str(x) for x in gpus)
        kge_model.cuda()
        kge_model = torch.nn.DataParallel(kge_model, device_ids=[0, 1])

    elif args.cuda:
        kge_model = kge_model.cuda()

    if args.do_train or args.do_experiment:
        # Set training dataloader iterator
        train_dataloader_head = DataLoader(
            TrainDataset(train_triples, nentity, nrelation,
                         args.negative_sample_size, 'head-batch'),
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=max(1, args.cpu_num // 2),
            collate_fn=TrainDataset.collate_fn)

        train_dataloader_tail = DataLoader(
            TrainDataset(train_triples, nentity, nrelation,
                         args.negative_sample_size, 'tail-batch'),
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=max(1, args.cpu_num // 2),
            collate_fn=TrainDataset.collate_fn)

        train_iterator = BidirectionalOneShotIterator(train_dataloader_head,
                                                      train_dataloader_tail)

        # Set training configuration
        current_learning_rate = args.learning_rate
        optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                            kge_model.parameters()),
                                     lr=current_learning_rate)

    if args.init_checkpoint:
        # Restore model from checkpoint directory
        logging.info('Loading checkpoint %s...' % args.init_checkpoint)
        checkpoint = torch.load(
            os.path.join(args.init_checkpoint, 'checkpoint'))
        init_step = checkpoint['step']
        kge_model.load_state_dict(checkpoint['model_state_dict'])
        if args.do_train:
            current_learning_rate = checkpoint['current_learning_rate']
            warm_up_steps = checkpoint['warm_up_steps']
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    else:
        logging.info('Randomly Initializing %s Model...' % args.model)
        init_step = 0

    step = init_step

    logging.info('Start Training...')
    logging.info('init_step = %d' % init_step)
    logging.info('learning_rate = %f' % current_learning_rate)
    logging.info('batch_size = %d' % args.batch_size)
    logging.info('hidden_dim = %d' % args.hidden_dim)
    logging.info('gamma = %f' % args.gamma)
    logging.info('negative_adversarial_sampling = %s' %
                 str(args.negative_adversarial_sampling))
    if args.negative_adversarial_sampling:
        logging.info('adversarial_temperature = %f' %
                     args.adversarial_temperature)

    # Set valid dataloader as it would be evaluated during training

    if args.do_train:
        train_model(init_step, valid_triples, all_true_triples, kge_model,
                    train_iterator, len(train_triples), args)

    if args.evaluate_train:
        logging.info('Evaluating on Training Dataset...')
        model_module = kge_model.module if args.parallel else kge_model
        metrics = model_module.test_step(kge_model, train_triples,
                                         all_true_triples, args)
        #metrics1 = model_module.getScore(kge_model, train_triples, all_true_triples, args)
        log_metrics('Train', step, metrics)

    # experiment on the updated function
    if args.do_experiment:
        logging.info('\n\nSTARTING EXPERIMENT\n')

        train_model(init_step, valid_triples, all_true_triples, kge_model,
                    train_iterator, rule_iterators, args)

    if args.do_valid:
        logging.info('Evaluating on Valid Dataset...')
        model_module = kge_model.module if args.parallel else kge_model
        metrics = model_module.test_step(kge_model, valid_triples,
                                         all_true_triples, args)
        #metrics1 = model_module.getScore(kge_model, train_triples, all_true_triples, args)
        log_metrics('Valid', step, metrics)

    if args.do_test:
        logging.info('Evaluating on Test Dataset...')
        model_module = kge_model.module if args.parallel else kge_model
        metrics = model_module.test_step(kge_model, test_triples,
                                         all_true_triples, args)
        log_metrics('Test', step, metrics)

    if args.evaluate_train:
        logging.info('Evaluating on Training Dataset...')
        model_module = kge_model.module if args.parallel else kge_model
        metrics = model_module.test_step(kge_model, train_triples,
                                         all_true_triples, args)
        log_metrics('Train', step, metrics)
Example #17
0
File: run.py Project: zyksir/NoiGAN
def main(args):
    if (not args.do_train) and (not args.do_valid) and (not args.do_test):
        raise ValueError('one of train/val/test mode must be chosen.')

    if not args.do_train and args.init_checkpoint:
        override_config(args)
    elif args.data_path is None:
        raise ValueError('one of init_checkpoint/data_path must be chosen.')

    if args.do_train and args.save_path is None:
        raise ValueError('Where do you want to save your trained model?')

    if args.save_path and not os.path.exists(args.save_path):
        os.makedirs(args.save_path)

    # Write logs to checkpoint and console
    set_logger(args)

    with open(os.path.join(args.data_path, 'entities.dict')) as fin:
        entity2id = dict()
        for line in fin:
            eid, entity = line.strip().split('\t')
            entity2id[entity] = int(eid)

    with open(os.path.join(args.data_path, 'relations.dict')) as fin:
        relation2id = dict()
        for line in fin:
            rid, relation = line.strip().split('\t')
            relation2id[relation] = int(rid)

    nentity = len(entity2id)
    nrelation = len(relation2id)

    args.nentity = nentity
    args.nrelation = nrelation

    logging.info('Model: %s' % args.model)
    logging.info('Data Path: %s' % args.data_path)
    logging.info('#entity: %d' % nentity)
    logging.info('#relation: %d' % nrelation)

    train_triples = read_triple(os.path.join(args.data_path, "train.txt"),
                                entity2id, relation2id)
    if args.self_test:
        train_triples = train_triples[len(train_triples) // 5:]
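    # Optionally mix known-noisy (fake) triples into the training set; the
    # CLF/LT/NoiGAN trainers are presumably meant to learn to down-weight them.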
    if args.fake:
        fake_triples = pickle.load(
            open(os.path.join(args.data_path, "fake%s.pkl" % args.fake), "rb"))
        fake = torch.LongTensor(fake_triples)
        train_triples += fake_triples
    else:
        fake_triples = [(0, 0, 0)]
        fake = torch.LongTensor(fake_triples)
    if args.cuda:
        fake = fake.cuda()
    logging.info('#train: %d' % len(train_triples))
    valid_triples = read_triple(os.path.join(args.data_path, 'valid.txt'),
                                entity2id, relation2id)
    logging.info('#valid: %d' % len(valid_triples))
    test_triples = read_triple(os.path.join(args.data_path, 'test.txt'),
                               entity2id, relation2id)
    logging.info('#test: %d' % len(test_triples))

    all_true_triples = train_triples + valid_triples + test_triples

    kge_model = KGEModel(
        model_name=args.model,
        nentity=nentity,
        nrelation=nrelation,
        hidden_dim=args.hidden_dim,
        gamma=args.gamma,
        double_entity_embedding=args.double_entity_embedding,
        double_relation_embedding=args.double_relation_embedding)
    trainer = None
    if args.method == "CLF":
        trainer = ClassifierTrainer(train_triples, fake_triples, args,
                                    kge_model, args.hard)
    elif args.method == "LT":
        trainer = LTTrainer(train_triples, fake_triples, args, kge_model)
    elif args.method == "NoiGAN":
        trainer = NoiGANTrainer(train_triples, fake_triples, args, kge_model,
                                args.hard)

    logging.info('Model Parameter Configuration:')
    for name, param in kge_model.named_parameters():
        logging.info('Parameter %s: %s, require_grad = %s' %
                     (name, str(param.size()), str(param.requires_grad)))

    if args.cuda:
        kge_model = kge_model.cuda()

    if args.do_train:
        # Set training dataloader iterator
        train_dataloader_head = DataLoader(
            TrainDataset(train_triples, nentity, nrelation,
                         args.negative_sample_size, 'head-batch'),
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=max(1, args.cpu_num // 2),
            collate_fn=TrainDataset.collate_fn)

        train_dataloader_tail = DataLoader(
            TrainDataset(train_triples, nentity, nrelation,
                         args.negative_sample_size, 'tail-batch'),
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=max(1, args.cpu_num // 2),
            collate_fn=TrainDataset.collate_fn)

        train_iterator = BidirectionalOneShotIterator(train_dataloader_head,
                                                      train_dataloader_tail)

        # Set training configuration
        current_learning_rate = args.learning_rate
        optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                            kge_model.parameters()),
                                     lr=current_learning_rate)
        if args.warm_up_steps:
            warm_up_steps = args.warm_up_steps
        else:
            warm_up_steps = args.max_steps

    if args.init_checkpoint:
        # Restore model from checkpoint directory
        logging.info('Loading checkpoint %s...' % args.init_checkpoint)
        checkpoint = torch.load(
            os.path.join(args.init_checkpoint, 'checkpoint'))
        init_step = 0  #checkpoint['step']
        kge_model.load_state_dict(checkpoint['model_state_dict'])
        if args.do_train:
            # current_learning_rate = checkpoint['current_learning_rate']
            # warm_up_steps = checkpoint['warm_up_steps']
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    else:
        logging.info('Randomly Initializing %s Model...' % args.model)
        init_step = 0

    step = init_step

    logging.info('Start Training...')
    logging.info('init_step = %d' % init_step)
    logging.info('batch_size = %d' % args.batch_size)
    logging.info('hidden_dim = %d' % args.hidden_dim)
    logging.info('gamma = %f' % args.gamma)
    logging.info('negative_adversarial_sampling = %s' %
                 str(args.negative_adversarial_sampling))
    if args.negative_adversarial_sampling:
        logging.info('adversarial_temperature = %f' %
                     args.adversarial_temperature)

    # Set valid dataloader as it would be evaluated during training

    if args.do_train:
        logging.info('learning_rate = %f' % current_learning_rate)

        training_logs = []

        #Training Loop
        triple2confidence_weights = None
        for step in range(init_step, args.max_steps):
            if args.method == "CLF" and step % args.classify_steps == 0:
                logging.info('Train Classifier')
                metrics = trainer.train_classifier(trainer)
                log_metrics('Classifier', step, metrics)
                metrics = trainer.test_ave_score(trainer)
                log_metrics('Classifier', step, metrics)
                trainer.cal_confidence_weight()
            elif args.method == "NoiGAN" and step % args.classify_steps == 0:
                logging.info('Train NoiGAN')
                trainer.train_NoiGAN(trainer)
                metrics = trainer.test_ave_score(trainer)
                log_metrics('Classifier', step, metrics)
                trainer.cal_confidence_weight()

            log = kge_model.train_step(kge_model,
                                       optimizer,
                                       train_iterator,
                                       args,
                                       trainer=trainer)

            training_logs.append(log)

            if step >= warm_up_steps:
                current_learning_rate = current_learning_rate / 10
                logging.info('Change learning_rate to %f at step %d' %
                             (current_learning_rate, step))
                optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                                    kge_model.parameters()),
                                             lr=current_learning_rate)
                warm_up_steps = warm_up_steps * 3

            if step % args.save_checkpoint_steps == 0:
                save_variable_list = {
                    'step': step,
                    'current_learning_rate': current_learning_rate,
                    'warm_up_steps': warm_up_steps
                }
                save_model(kge_model, optimizer, save_variable_list, args,
                           trainer)

            if step % args.log_steps == 0:
                metrics = {}
                for metric in training_logs[0].keys():
                    metrics[metric] = sum(
                        [log[metric]
                         for log in training_logs]) / len(training_logs)
                log_metrics('Training average', step, metrics)
                training_logs = []

            if args.do_valid and step % args.valid_steps == 0:
                logging.info('Evaluating on Valid Dataset...')
                metrics = kge_model.test_step(kge_model, valid_triples,
                                              all_true_triples, args)
                log_metrics('Valid', step, metrics)

        save_variable_list = {
            'step': step,
            'current_learning_rate': current_learning_rate,
            'warm_up_steps': warm_up_steps
        }
        save_model(kge_model, optimizer, save_variable_list, args, trainer)

    if trainer is not None:
        logging.info("Evaluating Classifier on Train Dataset")
        metrics = trainer.test_ave_score(trainer)
        log_metrics('Train', step, metrics)

    if args.do_valid:
        logging.info('Evaluating on Valid Dataset...')
        metrics = kge_model.test_step(kge_model, valid_triples,
                                      all_true_triples, args)
        log_metrics('Valid', step, metrics)

    if args.do_test:
        logging.info('Evaluating on Test Dataset...')
        metrics = kge_model.test_step(kge_model, test_triples,
                                      all_true_triples, args)
        log_metrics('Test', step, metrics)
        # logging.info("\t".join([metric for metric in metrics.values()]))

    if args.evaluate_train:
        logging.info('Evaluating on Training Dataset...')
        metrics = kge_model.test_step(kge_model, train_triples,
                                      all_true_triples, args)
        log_metrics('Train', step, metrics)
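
The main() functions above and below all rely on a read_triple helper that is not shown. A minimal sketch, assuming the usual tab-separated (head, relation, tail) file layout implied by entities.dict and relations.dict:

def read_triple(file_path, entity2id, relation2id):
    # Map tab-separated (head, relation, tail) lines to integer id triples.
    triples = []
    with open(file_path) as fin:
        for line in fin:
            h, r, t = line.strip().split('\t')
            triples.append((entity2id[h], relation2id[r], entity2id[t]))
    return triples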
Example #18
0
relation2tail, relation2head = defaultdict(set), defaultdict(set)
for h, r, t in all_triples:
    true_head[(r, t)].add(h)
    true_tail[(h, r)].add(t)
    true_relation[(h, t)].add(r)
    relation2tail[r].add(t)
    relation2head[r].add(h)

model_path = "../models/TransE_YAGO3-10_CLF20_hard_2"
fake_triples = pickle.load(open(os.path.join(data_path, "fake20.pkl"), "rb"))
checkpoint = torch.load(os.path.join(model_path, 'checkpoint'))


hidden_dim = 250
gamma = 12.0
kge_model = KGEModel(model_name="TransE", nentity=nentity, nrelation=nrelation, hidden_dim=hidden_dim, gamma=gamma).cuda()
classifier = SimpleNN(hidden_dim).cuda()
generator = SimpleNN(hidden_dim).cuda()
kge_model.load_state_dict(checkpoint['model_state_dict'])
try:
    classifier.load_state_dict(checkpoint['classifier_state_dict'])
    generator.load_state_dict(checkpoint['generator_state_dict'])
except KeyError:
    # checkpoint was saved without classifier/generator state dicts
    pass
distance, predict, true_label = [], [], []
# for triple in tqdm(train_triples, total=len(train_triples)):
i = 0
while i < len(train_triples):
    sys.stdout.write("%d in %d\r" % (i, len(train_triples)))
    sys.stdout.flush()
    j = min(i+1024, len(train_triples))
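
The listing for Example #18 breaks off mid-loop; like the tail of Example #22, it scores the training triples in chunks of 1024 rather than one at a time. The sketch below only illustrates that chunking pattern with a hypothetical score_fn; it is not a reconstruction of the missing code.

import torch

def score_in_chunks(triples, score_fn, chunk_size=1024):
    # Score a long list of (h, r, t) id triples in fixed-size chunks to bound memory use.
    scores = []
    i = 0
    while i < len(triples):
        j = min(i + chunk_size, len(triples))
        batch = torch.LongTensor(triples[i:j])
        scores.append(score_fn(batch))  # score_fn stands in for the model's scoring call
        i = j
    return torch.cat(scores)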
Example #19
0
def main(arg):

    with open(
            r'C:\Users\pc\Desktop\编程\KnowledgeGraphEmbedding-master\data\FB15k\entities.dict'
    ) as fin:
        entity2id = dict()
        for line in fin:
            eid, entity = line.strip().split('\t')
            entity2id[entity] = int(eid)
    with open(
            r'C:\Users\pc\Desktop\编程\KnowledgeGraphEmbedding-master\data\FB15k\relations.dict'
    ) as fin:
        relation2id = dict()
        for line in fin:
            rid, relation = line.strip().split('\t')
            relation2id[relation] = int(rid)
    nentity = len(entity2id)
    nrelation = len(relation2id)

    arg.nentity = nentity
    arg.nrelation = nrelation

    logging.info('Model:%s' % arg.MODEL)
    logging.info('Data Path:%s' % arg.DATA_PATH)
    logging.info('entity:%d' % arg.nentity)
    logging.info('relation:%d' % arg.nrelation)

    #extract data from file
    train_triples = read_triple(os.path.join(arg.DATA_PATH, 'train.txt'),
                                entity2id, relation2id)
    logging.info('#train:%d' % len(train_triples))
    valid_triples = read_triple(os.path.join(arg.DATA_PATH, 'valid.txt'),
                                entity2id, relation2id)
    logging.info('#valid:%d' % len(valid_triples))
    test_triples = read_triple(os.path.join(arg.DATA_PATH, 'test.txt'),
                               entity2id, relation2id)
    logging.info('#test:%d' % len(test_triples))
    #all true triples
    all_true_triples = train_triples + valid_triples + test_triples

    #construct model
    kge_model = KGEModel(
        model_name=arg.MODEL,
        nentity=arg.nentity,
        nrelation=arg.nrelation,
        hidden_dim=arg.HIDDEN_DIM,
        gamma=arg.gamma,
        double_entity_embedding=arg.double_entity_embedding,
        double_relation_embedding=arg.double_relation_embedding)

    #print model para configuration
    logging.info('Model Parameter Configuration')
    for name, para in kge_model.named_parameters():
        #print(name,para.size(),para.requires_grad
        logging.info('Parameter %s:%s,require_grad=%s' %
                     (name, str(para.size()), str(para.requires_grad)))

    #do train
    train_dataloader_head = DataLoader(TrainDataset(train_triples, nentity,
                                                    nrelation,
                                                    arg.negative_sample_size,
                                                    'head-batch'),
                                       batch_size=arg.BATCH_SIZE,
                                       shuffle=True,
                                       num_workers=max(1, arg.cpu_num // 2),
                                       collate_fn=TrainDataset.collate_fn)
    train_dataloader_tail = DataLoader(TrainDataset(train_triples, nentity,
                                                    nrelation,
                                                    arg.negative_sample_size,
                                                    'tail-batch'),
                                       batch_size=arg.BATCH_SIZE,
                                       shuffle=True,
                                       num_workers=max(1, arg.cpu_num // 2),
                                       collate_fn=TrainDataset.collate_fn)

    train_iterator = BidirectionalOneShotIterator(train_dataloader_head,
                                                  train_dataloader_tail)

    #set train configuration
    current_learning_rate = arg.LR
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                        kge_model.parameters()),
                                 lr=current_learning_rate)

    warm_up_steps = arg.warm_up_steps if arg.warm_up_steps else arg.max_steps // 2
    init_step = 0
    step = init_step

    logging.info('Start Training...')
    logging.info('init_step = %d' % init_step)
    logging.info('learning_rate = %f' % current_learning_rate)
    logging.info('batch_size = %d' % arg.BATCH_SIZE)
    #logging.info('negative_adversarial_sampling = %d' % arg.negative_adversarial_sampling)
    logging.info('hidden_dim = %d' % arg.HIDDEN_DIM)
    logging.info('gamma = %f' % arg.gamma)
    #logging.info('negative_adversarial_sampling = %s' % str(arg.negative_adversarial_sampling))

    #start training
    training_logs = []

    for step in range(init_step, arg.max_steps):
        log = kge_model.train_step(kge_model, optimizer, train_iterator, arg)
        training_logs.append(log)
        #update warm-up-step
        if step >= warm_up_steps:  # once past warm_up_steps, cut the learning rate to 1/10
            current_learning_rate = current_learning_rate / 10
            logging.info('Change learning_rate to %f at step %d' %
                         (current_learning_rate, step))
            optimizer = torch.optim.Adam(
                filter(lambda p: p.requires_grad, kge_model.parameters()),
                lr=current_learning_rate  # update the learning rate inside the optimizer
            )
            warm_up_steps = warm_up_steps * 3  # push the next decay point further out
        #save model
        if step % arg.save_checkpoint_steps == 0:
            save_variable_list = {
                'step': step,
                'current_learning_rate': current_learning_rate,
                'warm_up_steps': warm_up_steps
            }
            save_model(kge_model, optimizer, save_variable_list, arg)
    #save once more after the final step
    save_variable_list = {
        'step': step,
        'current_learning_rate': current_learning_rate,
        'warm_up_steps': warm_up_steps
    }
    save_model(kge_model, optimizer, save_variable_list, arg)
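
save_model(model, optimizer, save_variable_list, args) appears in every example but is never shown; some projects pass extra objects (trainer, classifier, generator) and presumably serialize those as well. A minimal sketch that simply bundles the bookkeeping values with both state dicts under args.save_path, matching the keys the examples read back when restoring a checkpoint:

import os
import torch

def save_model(model, optimizer, save_variable_list, args):
    # Store step/current_learning_rate/warm_up_steps together with both state dicts.
    torch.save({
        **save_variable_list,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }, os.path.join(args.save_path, 'checkpoint'))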
Example #20
0
def main(args):
    if (not args.do_train) and (not args.do_valid) and (not args.do_test):
        raise ValueError('one of train/val/test modes must be chosen.')
    
    if args.init_checkpoint:
        override_config(args)
    elif args.data_path is None:
        raise ValueError('one of init_checkpoint/data_path must be chosen.')

    if args.do_train and args.save_path is None:
        raise ValueError('Where do you want to save your trained model?')
    
    if args.save_path and not os.path.exists(args.save_path):
        os.makedirs(args.save_path)
    
    # Write logs to checkpoint and console
    set_logger(args)
    
    with open(os.path.join(args.data_path, 'entities.dict')) as fin:
        entity2id = dict()
        id2entity = dict()
        for line in fin:
            eid, entity = line.strip().split('\t')
            entity2id[entity] = int(eid)
            id2entity[int(eid)] = entity

    with open(os.path.join(args.data_path, 'relations.dict')) as fin:
        relation2id = dict()
        id2relation = dict()
        for line in fin:
            rid, relation = line.strip().split('\t')
            relation2id[relation] = int(rid)
            id2relation[int(rid)] = relation
    
    # Read regions for Countries S* datasets
    if args.countries:
        regions = list()
        with open(os.path.join(args.data_path, 'regions.list')) as fin:
            for line in fin:
                region = line.strip()
                regions.append(entity2id[region])
        args.regions = regions

    nentity = len(entity2id)
    nrelation = len(relation2id)
    
    args.nentity = nentity
    args.nrelation = nrelation
    
    logging.info('Model: %s' % args.model)
    logging.info('Data Path: %s' % args.data_path)
    logging.info('#entity: %d' % nentity)
    logging.info('#relation: %d' % nrelation)
    
    # --------------------------------------------------
    # Comments by Meng:
    # During training, pLogicNet will augment the training triplets,
    # so here we load both the augmented triplets (train.txt) for training and
    # the original triplets (train_kge.txt) for evaluation.
    # Also, the hidden triplets (hidden.txt) are also loaded for annotation.
    # --------------------------------------------------
    train_triples = read_triple(os.path.join(args.workspace_path, 'train_kge.txt'), entity2id, relation2id)
    logging.info('#train: %d' % len(train_triples))
    train_original_triples = read_triple(os.path.join(args.data_path, 'train.txt'), entity2id, relation2id)
    logging.info('#train original: %d' % len(train_original_triples))
    # valid_triples is required below whenever args.do_valid is set
    valid_triples = read_triple(os.path.join(args.data_path, 'valid.txt'), entity2id, relation2id)
    logging.info('#valid: %d' % len(valid_triples))
    test_triples = read_triple(os.path.join(args.data_path, 'test.txt'), entity2id, relation2id)
    logging.info('#test: %d' % len(test_triples))
    hidden_triples = read_triple(os.path.join(args.workspace_path, 'hidden.txt'), entity2id, relation2id)
    logging.info('#hidden: %d' % len(hidden_triples))
    
    #All true triples
    all_true_triples = train_original_triples + test_triples
    
    kge_model = KGEModel(
        model_name=args.model,
        nentity=nentity,
        nrelation=nrelation,
        hidden_dim=args.hidden_dim,
        gamma=args.gamma,
        double_entity_embedding=args.double_entity_embedding,
        double_relation_embedding=args.double_relation_embedding
    )
    
    logging.info('Model Parameter Configuration:')
    for name, param in kge_model.named_parameters():
        logging.info('Parameter %s: %s, require_grad = %s' % (name, str(param.size()), str(param.requires_grad)))

    if args.cuda:
        kge_model = kge_model.cuda()
    
    if args.do_train:
        # Set training dataloader iterator
        train_dataloader_head = DataLoader(
            TrainDataset(train_triples, nentity, nrelation, args.negative_sample_size, 'head-batch'), 
            batch_size=args.batch_size,
            shuffle=True, 
            num_workers=max(1, args.cpu_num//2),
            collate_fn=TrainDataset.collate_fn
        )
        
        train_dataloader_tail = DataLoader(
            TrainDataset(train_triples, nentity, nrelation, args.negative_sample_size, 'tail-batch'), 
            batch_size=args.batch_size,
            shuffle=True, 
            num_workers=max(1, args.cpu_num//2),
            collate_fn=TrainDataset.collate_fn
        )
        
        train_iterator = BidirectionalOneShotIterator(train_dataloader_head, train_dataloader_tail)
        
        # Set training configuration
        current_learning_rate = args.learning_rate
        optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, kge_model.parameters()), 
            lr=current_learning_rate
        )
        if args.warm_up_steps:
            warm_up_steps = args.warm_up_steps
        else:
            warm_up_steps = args.max_steps // 2

    if args.init_checkpoint:
        # Restore model from checkpoint directory
        logging.info('Loading checkpoint %s...' % args.init_checkpoint)
        checkpoint = torch.load(os.path.join(args.init_checkpoint, 'checkpoint'))
        init_step = checkpoint['step']
        kge_model.load_state_dict(checkpoint['model_state_dict'])
        if args.do_train:
            current_learning_rate = checkpoint['current_learning_rate']
            warm_up_steps = checkpoint['warm_up_steps']
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    else:
        logging.info('Randomly Initializing %s Model...' % args.model)
        init_step = 0
    
    step = init_step
    
    logging.info('Start Training...')
    logging.info('init_step = %d' % init_step)
    logging.info('learning_rate = %f' % current_learning_rate)
    logging.info('batch_size = %d' % args.batch_size)
    logging.info('hidden_dim = %d' % args.hidden_dim)
    logging.info('gamma = %f' % args.gamma)
    logging.info('negative_adversarial_sampling = %s' % str(args.negative_adversarial_sampling))
    if args.negative_adversarial_sampling:
        logging.info('adversarial_temperature = %f' % args.adversarial_temperature)

    if args.record:
        local_path = args.workspace_path
        ensure_dir(local_path)

        opt = vars(args)
        with open(local_path + '/opt.txt', 'w') as fo:
            for key, val in opt.items():
                fo.write('{} {}\n'.format(key, val))
    
    # Set valid dataloader as it would be evaluated during training
    
    if args.do_train:
        training_logs = []
        
        #Training Loop
        for step in range(init_step, args.max_steps):
            
            log = kge_model.train_step(kge_model, optimizer, train_iterator, args)
            
            training_logs.append(log)
            
            if step >= warm_up_steps:
                current_learning_rate = current_learning_rate / 10
                logging.info('Change learning_rate to %f at step %d' % (current_learning_rate, step))
                optimizer = torch.optim.Adam(
                    filter(lambda p: p.requires_grad, kge_model.parameters()), 
                    lr=current_learning_rate
                )
                warm_up_steps = warm_up_steps * 3
            
            if step % args.save_checkpoint_steps == 0:
                save_variable_list = {
                    'step': step, 
                    'current_learning_rate': current_learning_rate,
                    'warm_up_steps': warm_up_steps
                }
                save_model(kge_model, optimizer, save_variable_list, args)
                
            if step % args.log_steps == 0:
                metrics = {}
                for metric in training_logs[0].keys():
                    metrics[metric] = sum([log[metric] for log in training_logs])/len(training_logs)
                log_metrics('Training average', step, metrics)
                training_logs = []
                
            if args.do_valid and (step + 1) % args.valid_steps == 0:
                logging.info('Evaluating on Valid Dataset...')
                metrics, preds = kge_model.test_step(kge_model, valid_triples, all_true_triples, args)
                log_metrics('Valid', step, metrics)
        
        save_variable_list = {
            'step': step, 
            'current_learning_rate': current_learning_rate,
            'warm_up_steps': warm_up_steps
        }
        save_model(kge_model, optimizer, save_variable_list, args)
        
    if args.do_valid:
        logging.info('Evaluating on Valid Dataset...')
        metrics, preds = kge_model.test_step(kge_model, valid_triples, all_true_triples, args)
        log_metrics('Valid', step, metrics)
        
        # --------------------------------------------------
        # Comments by Meng:
        # Save the prediction results of KGE on validation set.
        # --------------------------------------------------

        if args.record:
            # Save the final results
            with open(local_path + '/result_kge_valid.txt', 'w') as fo:
                for metric in metrics:
                    fo.write('{} : {}\n'.format(metric, metrics[metric]))

            # Save the predictions on test data
            with open(local_path + '/pred_kge_valid.txt', 'w') as fo:
                for h, r, t, f, rk, l in preds:
                    fo.write('{}\t{}\t{}\t{}\t{}\n'.format(id2entity[h], id2relation[r], id2entity[t], f, rk))
                    for e, val in l:
                        fo.write('{}:{:.4f} '.format(id2entity[e], val))
                    fo.write('\n')
    
    if args.do_test:
        logging.info('Evaluating on Test Dataset...')
        metrics, preds = kge_model.test_step(kge_model, test_triples, all_true_triples, args)
        log_metrics('Test', step, metrics)
        
        # --------------------------------------------------
        # Comments by Meng:
        # Save the prediction results of KGE on test set.
        # --------------------------------------------------

        if args.record:
            # Save the final results
            with open(local_path + '/result_kge.txt', 'w') as fo:
                for metric in metrics:
                    fo.write('{} : {}\n'.format(metric, metrics[metric]))

            # Save the predictions on test data
            with open(local_path + '/pred_kge.txt', 'w') as fo:
                for h, r, t, f, rk, l in preds:
                    fo.write('{}\t{}\t{}\t{}\t{}\n'.format(id2entity[h], id2relation[r], id2entity[t], f, rk))
                    for e, val in l:
                        fo.write('{}:{:.4f} '.format(id2entity[e], val))
                    fo.write('\n')

    # --------------------------------------------------
    # Comments by Meng:
    # Save the annotations on hidden triplets.
    # --------------------------------------------------

    if args.record:
        # Annotate hidden triplets
        scores = kge_model.infer_step(kge_model, hidden_triples, args)
        with open(local_path + '/annotation.txt', 'w') as fo:
            for (h, r, t), s in zip(hidden_triples, scores):
                fo.write('{}\t{}\t{}\t{}\n'.format(id2entity[h], id2relation[r], id2entity[t], s))
    
    if args.evaluate_train:
        logging.info('Evaluating on Training Dataset...')
        metrics, preds = kge_model.test_step(kge_model, train_triples, all_true_triples, args)
        log_metrics('Train', step, metrics)
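
Every training loop in this listing applies the same decay rule: once step reaches warm_up_steps, the learning rate is divided by 10, the Adam optimizer is rebuilt with the new rate, and warm_up_steps is tripled. The stand-alone helper below (not part of any example) merely traces the schedule this rule produces:

def lr_schedule(initial_lr, warm_up_steps, max_steps):
    # Divide the lr by 10 each time the step counter reaches the current warm_up_steps,
    # then push warm_up_steps three times further out, as the loops above do.
    lr, boundaries = initial_lr, []
    boundary = warm_up_steps
    while boundary < max_steps:
        lr /= 10
        boundaries.append((boundary, lr))
        boundary *= 3
    return boundaries

# e.g. lr_schedule(0.0005, 10000, 100000) decays at steps 10000, 30000 and 90000.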
Example #21
0
def main(args):
    # if (not args.do_train) and (not args.do_valid) and (not args.do_test):
    #     raise ValueError('one of train/val/test mode must be choosed.')
    
    if args.init_checkpoint:
        override_config(args)
    elif args.data_path is None:
        raise ValueError('one of init_checkpoint/data_path must be chosen.')

    if args.do_train and args.save_path is None:
        raise ValueError('Where do you want to save your trained model?')
    
    if args.save_path and not os.path.exists(args.save_path):
        os.makedirs(args.save_path)
    
    # Write logs to checkpoint and console
    set_logger(args)
    
    # with open(os.path.join(args.data_path, 'entities.dict')) as fin:
    #     entity2id = dict()
    #     id2entity = dict()
    #     for line in fin:
    #         eid, entity = line.strip().split('\t')
    #         entity2id[entity] = int(eid)
    #         id2entity[int(eid)] = entity

    # with open(os.path.join(args.data_path, 'relations.dict')) as fin:
    #     relation2id = dict()
    #     id2relation = dict()
    #     for line in fin:
    #         rid, relation = line.strip().split('\t')
    #         relation2id[relation] = int(rid)
    #         id2relation[int(rid)] = relation
    
    # # Read regions for Countries S* datasets
    # if args.countries:
    #     regions = list()
    #     with open(os.path.join(args.data_path, 'regions.list')) as fin:
    #         for line in fin:
    #             region = line.strip()
    #             regions.append(entity2id[region])
    #     args.regions = regions

    '''amazon dataset'''
    with open(os.path.join(args.data_path, 'entity2id.txt')) as fin:
        entity2id = dict()
        id2entity = dict()
        for line in fin:
            if len(line.strip().split('\t')) < 2:
                continue
            entity, eid = line.strip().split('\t')
            entity2id[entity] = int(eid)
            id2entity[int(eid)] = entity

    with open(os.path.join(args.data_path, 'relation2id.txt')) as fin:
        relation2id = dict()
        id2relation = dict()
        for line in fin:
            if len(line.strip().split('\t')) < 2:
                continue
            relation, rid = line.strip().split('\t')
            relation2id[relation] = int(rid)
            id2relation[int(rid)] = relation

    nentity = len(entity2id)
    nrelation = len(relation2id)
    
    args.nentity = nentity
    args.nrelation = nrelation
    
    logging.info('Model: %s' % args.model)
    logging.info('Data Path: %s' % args.data_path)
    logging.info('#entity: %d' % nentity)
    logging.info('#relation: %d' % nrelation)
    
    # --------------------------------------------------
    # Comments by Meng:
    # During training, pLogicNet will augment the training triplets,
    # so here we load both the augmented triplets (train.txt) for training and
    # the original triplets (train_kge.txt) for evaluation.
    # Also, the hidden triplets (hidden.txt) are also loaded for annotation.
    # --------------------------------------------------
    # train_triples = read_triple(os.path.join(args.workspace_path, 'train_kge.txt'), entity2id, relation2id)
    # logging.info('#train: %d' % len(train_triples))
    # train_original_triples = read_triple(os.path.join(args.data_path, 'train.txt'), entity2id, relation2id)
    # logging.info('#train original: %d' % len(train_original_triples))
    # valid_triples = read_triple(os.path.join(args.data_path, 'valid.txt'), entity2id, relation2id)
    # logging.info('#valid: %d' % len(valid_triples))
    # test_triples = read_triple(os.path.join(args.data_path, 'test.txt'), entity2id, relation2id)
    # logging.info('#test: %d' % len(test_triples))
    # hidden_triples = read_triple(os.path.join(args.workspace_path, 'hidden.txt'), entity2id, relation2id)
    # logging.info('#hidden: %d' % len(hidden_triples))

    train_triples = read_triple(os.path.join(args.workspace_path, 'train_kge.txt'), entity2id, relation2id)
    logging.info('#train: %d' % len(train_triples))
    train_original_triples = read_triple(os.path.join(args.data_path, 'train.txt'), entity2id, relation2id)
    logging.info('#train original: %d' % len(train_original_triples))
    valid_triples = read_triple(os.path.join(args.data_path, 'kg_val_triples_Cell_Phones_and_Accessories.txt'), entity2id, relation2id)
    logging.info('#valid: %d' % len(valid_triples))
    test_triples = read_triple(os.path.join(args.data_path, 'kg_test_triples_Cell_Phones_and_Accessories.txt'), entity2id, relation2id)
    logging.info('#test: %d' % len(test_triples))
    test_candidates = np.load(os.path.join(args.data_path, 'rec_test_candidate100.npz'))['candidates'][:, 1:]
    # test_candidates = np.load('/common/users/yz956/kg/code/OpenDialKG/cand.npy')
    # hidden_triples = read_triple(os.path.join(args.workspace_path, 'hidden.txt'), entity2id, relation2id)
    hidden_triples = read_triple("/common/users/yz956/kg/code/KBRD/data/cpa/cpa/hidden_50.txt", entity2id, relation2id)
    logging.info('#hidden: %d' % len(hidden_triples))
    
    #All true triples
    all_true_triples = train_original_triples + valid_triples + test_triples
    
    kge_model = KGEModel(
        model_name=args.model,
        nentity=nentity,
        nrelation=nrelation,
        hidden_dim=args.hidden_dim,
        gamma=args.gamma,
        double_entity_embedding=args.double_entity_embedding,
        double_relation_embedding=args.double_relation_embedding
    )
    
    logging.info('Model Parameter Configuration:')
    for name, param in kge_model.named_parameters():
        logging.info('Parameter %s: %s, require_grad = %s' % (name, str(param.size()), str(param.requires_grad)))

    if args.cuda:
        kge_model = kge_model.cuda()
    
    if args.do_train:
        # Set training dataloader iterator
        train_dataloader_head = DataLoader(
            TrainDataset(train_triples, nentity, nrelation, args.negative_sample_size, 'head-batch'), 
            batch_size=args.batch_size,
            shuffle=True, 
            num_workers=max(1, args.cpu_num//2),
            collate_fn=TrainDataset.collate_fn
        )
        
        train_dataloader_tail = DataLoader(
            TrainDataset(train_triples, nentity, nrelation, args.negative_sample_size, 'tail-batch'), 
            batch_size=args.batch_size,
            shuffle=True, 
            num_workers=max(1, args.cpu_num//2),
            collate_fn=TrainDataset.collate_fn
        )
        
        train_iterator = BidirectionalOneShotIterator(train_dataloader_head, train_dataloader_tail)
        
        # Set training configuration
        current_learning_rate = args.learning_rate
        optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, kge_model.parameters()), 
            lr=current_learning_rate
        )
        if args.warm_up_steps:
            warm_up_steps = args.warm_up_steps
        else:
            warm_up_steps = args.max_steps // 2

    if args.init_checkpoint:
        # Restore model from checkpoint directory
        logging.info('Loading checkpoint %s...' % args.init_checkpoint)
        checkpoint = torch.load(os.path.join(args.init_checkpoint, 'checkpoint'))
        init_step = checkpoint['step']
        kge_model.load_state_dict(checkpoint['model_state_dict'])
        if args.do_train:
            current_learning_rate = checkpoint['current_learning_rate']
            warm_up_steps = checkpoint['warm_up_steps']
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    else:
        logging.info('Randomly Initializing %s Model...' % args.model)
        init_step = 0
    
    step = init_step
    
    logging.info('Start Training...')
    logging.info('init_step = %d' % init_step)
    logging.info('learning_rate = %f' % current_learning_rate)
    logging.info('batch_size = %d' % args.batch_size)
    logging.info('hidden_dim = %d' % args.hidden_dim)
    logging.info('gamma = %f' % args.gamma)
    logging.info('negative_adversarial_sampling = %s' % str(args.negative_adversarial_sampling))
    if args.negative_adversarial_sampling:
        logging.info('adversarial_temperature = %f' % args.adversarial_temperature)

    if args.record:
        local_path = args.workspace_path
        ensure_dir(local_path)

        opt = vars(args)
        with open(local_path + '/opt.txt', 'w') as fo:
            for key, val in opt.items():
                fo.write('{} {}\n'.format(key, val))
    
    # Set valid dataloader as it would be evaluated during training
    
    if args.do_train:
        training_logs = []
        
        #Training Loop
        for step in range(init_step, args.max_steps):
            
            log = kge_model.train_step(kge_model, optimizer, train_iterator, args)
            
            training_logs.append(log)
            
            if step >= warm_up_steps:
                current_learning_rate = current_learning_rate / 10
                logging.info('Change learning_rate to %f at step %d' % (current_learning_rate, step))
                optimizer = torch.optim.Adam(
                    filter(lambda p: p.requires_grad, kge_model.parameters()), 
                    lr=current_learning_rate
                )
                warm_up_steps = warm_up_steps * 3
            
            if step % args.save_checkpoint_steps == 0:
                save_variable_list = {
                    'step': step, 
                    'current_learning_rate': current_learning_rate,
                    'warm_up_steps': warm_up_steps
                }
                save_model(kge_model, optimizer, save_variable_list, args)
                
            if step % args.log_steps == 0:
                metrics = {}
                for metric in training_logs[0].keys():
                    metrics[metric] = sum([log[metric] for log in training_logs])/len(training_logs)
                log_metrics('Training average', step, metrics)
                training_logs = []
                
            if args.do_valid and (step + 1) % args.valid_steps == 0:
                logging.info('Evaluating on Valid Dataset...')
                metrics, preds = kge_model.test_step(kge_model, valid_triples, all_true_triples, args)
                log_metrics('Valid', step, metrics)
        
        save_variable_list = {
            'step': step, 
            'current_learning_rate': current_learning_rate,
            'warm_up_steps': warm_up_steps
        }
        save_model(kge_model, optimizer, save_variable_list, args)
        
    if args.do_valid:
        logging.info('Evaluating on Valid Dataset...')
        metrics, preds = kge_model.test_step(kge_model, valid_triples, all_true_triples, args)
        log_metrics('Valid', step, metrics)
        
        # --------------------------------------------------
        # Comments by Meng:
        # Save the prediction results of KGE on validation set.
        # --------------------------------------------------

        if args.record:
            # Save the final results
            with open(local_path + '/result_kge_valid.txt', 'w') as fo:
                for metric in metrics:
                    fo.write('{} : {}\n'.format(metric, metrics[metric]))

            # Save the predictions on test data
            with open(local_path + '/pred_kge_valid.txt', 'w') as fo:
                for h, r, t, f, rk, l in preds:
                    fo.write('{}\t{}\t{}\t{}\t{}\n'.format(id2entity[h], id2relation[r], id2entity[t], f, rk))
                    for e, val in l:
                        fo.write('{}:{:.4f} '.format(id2entity[e], val))
                    fo.write('\n')
    
    if args.do_test:
        logging.info('Evaluating on Test Dataset...')
        # metrics, preds = kge_model.test_step(kge_model, test_triples, all_true_triples, args)
        metrics, preds = kge_model.test_step(kge_model, test_triples, test_candidates, all_true_triples, args)
        log_metrics('Test', step, metrics)
        
        # --------------------------------------------------
        # Comments by Meng:
        # Save the prediction results of KGE on test set.
        # --------------------------------------------------

        if args.record:
            # Save the final results
            with open(local_path + '/result_kge.txt', 'w') as fo:
                for metric in metrics:
                    fo.write('{} : {}\n'.format(metric, metrics[metric]))

            # Save the predictions on test data
            with open(local_path + '/pred_kge.txt', 'w') as fo:
                for h, r, t, f, rk, l in preds:
                    fo.write('{}\t{}\t{}\t{}\t{}\n'.format(id2entity[h], id2relation[r], id2entity[t], f, rk))
                    for e, val in l:
                        fo.write('{}:{:.4f} '.format(id2entity[e], val))
                    fo.write('\n')

    # --------------------------------------------------
    # Comments by Meng:
    # Save the annotations on hidden triplets.
    # --------------------------------------------------

    if args.record:
        # Annotate hidden triplets
        scores = kge_model.infer_step(kge_model, hidden_triples, args)
        # with open(local_path + '/annotation.txt', 'w') as fo:
        #     for (h, r, t), s in zip(hidden_triples, scores):
        #         fo.write('{}\t{}\t{}\t{}\n'.format(id2entity[h], id2relation[r], id2entity[t], s))

        # Annotate hidden triplets
        print('annotation')
        
        cand = {}
        with gzip.open('/common/users/yz956/kg/code/KBRD/data/cpa/cpa/kg_test_candidates_Cell_Phones_and_Accessories.txt.gz', 'rt') as f:
            for line in f:
                cells = line.split()
                uid = int(cells[0])
                item_ids = [int(i) for i in cells[1:]]
                cand[uid] = item_ids
        ann, train = [], []
        d = {}
        with open('/common/users/yz956/kg/code/KBRD/data/cpa/cpa/sample_pre.txt') as ft:
            for line in ft:
                line = line.strip().split('\t')
                train.append(line[1:])
        for u in range(61254):  # iterate over every user id (user count hard-coded for this dataset)
            hiddens = []
            for i in cand[u]:
            # for i in range(61254, 108858):
                hiddens.append((u, 0, i))
            scores = kge_model.infer_step(kge_model, hiddens, args)
            score_np = np.array(scores)
            d = dict(zip(cand[u], scores))
            # d = dict(zip(range(61254, 108858), scores))
            d = sorted(d.items(), key=lambda x: x[1], reverse=True)
            
            # d_50 = d[:50]
            # for idx, t in enumerate(train[u]):
            #     for (tt, prob) in d_50:
            #         if int(t) == tt:
            #             d_50.remove((tt, prob))
            #             d_50.append(d[50 + idx])
            # assert len(d_50) == 50
            # d = {}

            d_50 = d
            ann.append(d_50)
        with open(local_path + '/annotation_1000_htr.txt', 'w') as fo:
            for idx, d in enumerate(ann):
                for (t, score) in d:
                    fo.write(str(idx) + '\t' + str(t) + '\t0\t' + str(score) + '\n')

        # with open(local_path + '/hidden_50_p.txt', 'w') as fo:
        #     for idx, d in enumerate(ann):
        #         for (t, score) in d:
        #             fo.write(str(idx) + '\t' + str(t) + '\t0\n')
        
        scores = kge_model.infer_step(kge_model, hidden_triples, args)
        with open(local_path + '/annotation_htr.txt', 'w') as fo:
            for (h, r, t), s in zip(hidden_triples, scores):
                # fo.write('{}\t{}\t{}\t{}\n'.format(id2entity[h], id2relation[r], id2entity[t], s))
                fo.write('{}\t{}\t{}\t{}\n'.format(str(h), str(t), str(r), s))
    
    if args.evaluate_train:
        logging.info('Evaluating on Training Dataset...')
        metrics, preds = kge_model.test_step(kge_model, train_triples, all_true_triples, args)
        log_metrics('Train', step, metrics)
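
BidirectionalOneShotIterator wraps the head-batch and tail-batch DataLoaders in every example. A minimal sketch, assuming it simply alternates between the two loaders and restarts each one when exhausted; keeping the raw loaders as attributes matches the train_iterator.dataloader_head.dataset accesses in the NoiGAN example below, but the remaining internals are guesses.

class BidirectionalOneShotIterator(object):
    # Alternate head-batch and tail-batch samples, cycling each DataLoader forever.
    def __init__(self, dataloader_head, dataloader_tail):
        self.dataloader_head = dataloader_head
        self.dataloader_tail = dataloader_tail
        self.iterator_head = self.one_shot_iterator(dataloader_head)
        self.iterator_tail = self.one_shot_iterator(dataloader_tail)
        self.step = 0

    def __next__(self):
        self.step += 1
        if self.step % 2 == 1:
            return next(self.iterator_head)
        return next(self.iterator_tail)

    @staticmethod
    def one_shot_iterator(dataloader):
        # Yield batches indefinitely by looping over the DataLoader again and again.
        while True:
            for data in dataloader:
                yield data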
Example #22
0
File: run.py Project: zyksir/NoiGAN
def main(args):
    if (not args.do_train) and (not args.do_valid) and (not args.do_test):
        raise ValueError('one of train/val/test modes must be chosen.')

    if args.init_checkpoint:
        override_config(args)
    elif args.data_path is None:
        raise ValueError('one of init_checkpoint/data_path must be chosen.')

    if args.do_train and args.save_path is None:
        raise ValueError('Where do you want to save your trained model?')

    if args.save_path and not os.path.exists(args.save_path):
        os.makedirs(args.save_path)

    # Write logs to checkpoint and console
    set_logger(args)

    with open(os.path.join(args.data_path, 'entities.dict')) as fin:
        entity2id = dict()
        for line in fin:
            eid, entity = line.strip().split('\t')
            entity2id[entity] = int(eid)

    with open(os.path.join(args.data_path, 'relations.dict')) as fin:
        relation2id = dict()
        for line in fin:
            rid, relation = line.strip().split('\t')
            relation2id[relation] = int(rid)

    nentity = len(entity2id)
    nrelation = len(relation2id)

    args.nentity = nentity
    args.nrelation = nrelation

    logging.info('Model: %s' % args.model)
    logging.info('Data Path: %s' % args.data_path)
    logging.info('#entity: %d' % nentity)
    logging.info('#relation: %d' % nrelation)

    train_triples = read_triple(os.path.join(args.data_path, args.train_set), entity2id, relation2id)
    if args.fake:
        fake_triples = pickle.load(open(os.path.join(args.data_path, "fake%s.pkl" % args.fake), "rb"))
        fake = torch.LongTensor(fake_triples)
        train_triples += fake_triples
    else:
        fake_triples = [(0, 0, 0)]
        fake = torch.LongTensor(fake_triples)
    if args.cuda:
        fake = fake.cuda()
    logging.info('#train: %d' % len(train_triples))
    valid_triples = read_triple(os.path.join(args.data_path, 'valid.txt'), entity2id, relation2id)
    logging.info('#valid: %d' % len(valid_triples))
    test_triples = read_triple(os.path.join(args.data_path, 'test.txt'), entity2id, relation2id)
    logging.info('#test: %d' % len(test_triples))

    all_true_triples = train_triples + valid_triples + test_triples

    kge_model = KGEModel(
        model_name=args.model,
        nentity=nentity,
        nrelation=nrelation,
        hidden_dim=args.hidden_dim,
        gamma=args.gamma,
        double_entity_embedding=args.double_entity_embedding,
        double_relation_embedding=args.double_relation_embedding
    )

    logging.info('Model Parameter Configuration:')
    for name, param in kge_model.named_parameters():
        logging.info('Parameter %s: %s, require_grad = %s' % (name, str(param.size()), str(param.requires_grad)))
    if args.cuda:
        kge_model = kge_model.cuda()

    # Set training dataloader iterator
    train_dataset_head = TrainDataset(train_triples, nentity, nrelation, args.negative_sample_size, 'head-batch')
    train_dataset_tail = TrainDataset(train_triples, nentity, nrelation, args.negative_sample_size, 'tail-batch')
    for triple in tqdm(train_dataset_head.triples, total=len(train_dataset_head.triples)):
        train_dataset_head.subsampling_weights[triple] = torch.FloatTensor([1.0])
    train_dataset_tail.subsampling_weights = train_dataset_head.subsampling_weights

    train_dataloader_head = DataLoader(
        train_dataset_head,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=max(1, args.cpu_num // 2),
        collate_fn=TrainDataset.collate_fn
    )

    train_dataloader_tail = DataLoader(
        train_dataset_tail,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=max(1, args.cpu_num // 2),
        collate_fn=TrainDataset.collate_fn
    )

    train_iterator = BidirectionalOneShotIterator(train_dataloader_head, train_dataloader_tail)
    classifier, generator = None, None
    if args.method == "clf" or args.method is None:
        args.gen_dim = args.hidden_dim
        clf_triples = random.sample(train_triples, len(train_triples)//10)
        clf_dataset_head = TrainDataset(clf_triples, nentity, nrelation,
                                        args.negative_sample_size, 'head-batch')
        clf_dataset_tail = TrainDataset(clf_triples, nentity, nrelation,
                                        args.negative_sample_size, 'tail-batch')
        clf_dataset_head.true_head, clf_dataset_head.true_tail = train_dataset_head.true_head, train_dataset_head.true_tail
        clf_dataset_tail.true_head, clf_dataset_tail.true_tail = train_dataset_tail.true_head, train_dataset_tail.true_tail
        clf_dataset_head.subsampling_weights = train_dataset_head.subsampling_weights
        clf_dataset_tail.subsampling_weights = train_dataset_head.subsampling_weights
        clf_dataloader_head = DataLoader(
            clf_dataset_head,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=max(1, args.cpu_num // 2),
            collate_fn=TrainDataset.collate_fn
        )

        clf_dataloader_tail = DataLoader(
            clf_dataset_tail,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=max(1, args.cpu_num // 2),
            collate_fn=TrainDataset.collate_fn
        )
        clf_iterator = BidirectionalOneShotIterator(clf_dataloader_head, clf_dataloader_tail)

        gen_dataset_head = TrainDataset(clf_triples, nentity, nrelation,
                                        args.negative_sample_size, 'head-batch')
        gen_dataset_tail = TrainDataset(clf_triples, nentity, nrelation,
                                        args.negative_sample_size, 'tail-batch')
        gen_dataset_head.true_head, gen_dataset_head.true_tail = train_dataset_head.true_head, train_dataset_head.true_tail
        gen_dataset_tail.true_head, gen_dataset_tail.true_tail = train_dataset_tail.true_head, train_dataset_tail.true_tail
        gen_dataset_head.subsampling_weights = train_dataset_head.subsampling_weights
        gen_dataset_tail.subsampling_weights = train_dataset_head.subsampling_weights
        gen_dataloader_head = DataLoader(
            gen_dataset_head,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=max(1, args.cpu_num // 2),
            collate_fn=TrainDataset.collate_fn
        )

        gen_dataloader_tail = DataLoader(
            gen_dataset_tail,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=max(1, args.cpu_num // 2),
            collate_fn=TrainDataset.collate_fn
        )
        gen_iterator = BidirectionalOneShotIterator(gen_dataloader_head, gen_dataloader_tail)

        # if args.double_entity_embedding:
        #     classifier = SimpleNN(input_dim=args.hidden_dim, hidden_dim=5)
        #     generator = SimpleNN(input_dim=args.hidden_dim, hidden_dim=5)
        # else:
        classifier = SimpleNN(input_dim=args.hidden_dim, hidden_dim=5)
        generator = SimpleNN(input_dim=args.hidden_dim, hidden_dim=5)

        if args.cuda:
            classifier = classifier.cuda()
            generator = generator.cuda()
        clf_lr = 0.005 # if "FB15k" in args.data_path else 0.01
        clf_opt = torch.optim.Adam(classifier.parameters(), lr=clf_lr)
        gen_opt = torch.optim.SGD(generator.parameters(), lr=0.0001)
    elif args.method == "KBGAN":
        generator = KGEModel(
            model_name=args.model,
            nentity=nentity,
            nrelation=nrelation,
            hidden_dim=args.gen_dim,
            gamma=args.gamma,
            double_entity_embedding=args.double_entity_embedding,
            double_relation_embedding=args.double_relation_embedding
        )
        if args.cuda:
            generator = generator.cuda()
        # if args.gen_init is not None:
        #     checkpoint = torch.load(os.path.join(args.gen_init, 'checkpoint'))
        #     generator.load_state_dict(checkpoint['model_state_dict'])
        gen_opt = torch.optim.Adam(generator.parameters(), lr=args.learning_rate)

    # Set training configuration
    current_learning_rate = args.learning_rate
    optimizer = torch.optim.Adam(
        filter(lambda p: p.requires_grad, kge_model.parameters()),
        lr=current_learning_rate
    )
    if args.warm_up_steps:
        warm_up_steps = args.warm_up_steps
    else:
        warm_up_steps = args.max_steps # // 2

    if args.init_checkpoint:
        # Restore model from checkpoint directory
        logging.info('Loading checkpoint %s...' % args.init_checkpoint)
        checkpoint = torch.load(os.path.join(args.init_checkpoint, 'checkpoint'))
        init_step = 0
        kge_model.load_state_dict(checkpoint['model_state_dict'])
        if args.do_train:
            warm_up_steps = checkpoint['warm_up_steps']
            logging.info("warm_up_steps = %d" % warm_up_steps)
        else:
            current_learning_rate = args.learning_rate
    else:
        logging.info('Randomly Initializing %s Model...' % args.model)
        init_step = 0

    step = init_step

    logging.info('Start Training...')
    logging.info('init_step = %d' % init_step)
    logging.info('learning_rate = %f' % current_learning_rate)
    logging.info('batch_size = %d' % args.batch_size)
    logging.info('hidden_dim = %d' % args.hidden_dim)
    logging.info('gamma = %f' % args.gamma)
    logging.info('negative_adversarial_sampling = %s' % str(args.negative_adversarial_sampling))
    if args.negative_adversarial_sampling:
        logging.info('adversarial_temperature = %f' % args.adversarial_temperature)

    # Set valid  as it would be evaluated during training
    if args.do_train:
        if args.method == "clf" and args.init_checkpoint:
            # classifier.find_topK_triples(kge_model, classifier, train_iterator, clf_iterator, GAN_iterator)
            # logging.info("fake triples in classifier training %d / %d" % (
            #     len(set(fake_triples).intersection(set(clf_iterator.dataloader_head.dataset.triples))),
            #     len(clf_iterator.dataloader_head.dataset.triples)))
            for epoch in range(1200):
                log = classifier.train_classifier_step(kge_model, classifier, clf_opt, clf_iterator, args, generator=None, model_name=args.model)
                if (epoch+1) % 200 == 0:
                    logging.info(log)
                if epoch == 4000:
                    clf_opt = torch.optim.Adam(classifier.parameters(), lr=clf_lr/10)
            clf_opt = torch.optim.Adam(classifier.parameters(), lr=clf_lr)


        training_logs = []

        # Training Loop
        logging.info(optimizer)
        soft = False
        epoch_reward, epoch_loss, avg_reward, log = 0, 0, 0, {}
        for step in range(init_step, args.max_steps):
            if args.method == "clf" and step % 10001 == 0:
                if args.num == 1:
                    soft = True
                elif args.num == 1000:
                    soft = False
                else:
                    soft = not soft
                head, relation, tail = classifier.get_embedding(kge_model, fake)
                if args.model == "RotatE":
                    fake_score = classifier.forward(RotatE(head, relation, tail, "single", kge_model))
                elif args.model == "DistMult":
                    fake_score = classifier.forward(head * relation * tail)
                elif args.model == "TransE":
                    fake_score = classifier.forward(head + relation - tail)
                all_weight = classifier.find_topK_triples(kge_model, classifier, train_iterator, clf_iterator,
                                                           gen_iterator, soft=soft, model_name=args.model)
                logging.info("fake percent %f in %d" % (fake_score.sum().item() / all_weight, all_weight))
                logging.info("fake triples in classifier training %d / %d" % (
                    len(set(fake_triples).intersection(set(clf_iterator.dataloader_head.dataset.triples))),
                    len(clf_iterator.dataloader_head.dataset.triples)))

                epoch_reward, epoch_loss, avg_reward = 0, 0, 0
                for epoch in tqdm(range(200)):
                    classifier.train_GAN_step(kge_model, generator, classifier, gen_opt, clf_opt, gen_iterator, epoch_reward, epoch_loss, avg_reward, args, model_name=args.model)

                clf_train_num = 200
                for epoch in range(clf_train_num):
                    log = classifier.train_classifier_step(kge_model, classifier, clf_opt, clf_iterator, args, generator=None, model_name=args.model)
                    if epoch % 100 == 0:
                        logging.info(log)

            if step % 300 == 0 and step > 0 and args.method == "KBGAN":
                avg_reward = epoch_reward / batch_num
                epoch_reward, epoch_loss = 0, 0
                logging.info('Training average reward at step %d: %f' % (step, avg_reward))
                logging.info('Training average loss at step %d: %f' % (step, epoch_loss / batch_num))

            if args.method == "KBGAN":
                epoch_reward, epoch_loss, batch_num = kge_model.train_GAN_step(generator, kge_model, gen_opt, optimizer, train_iterator, epoch_reward, epoch_loss, avg_reward, args)
            else:
                log = kge_model.train_step(kge_model, optimizer, train_iterator, args, generator=generator)

            training_logs.append(log)

            if step >= warm_up_steps:
                current_learning_rate = current_learning_rate / 10
                logging.info('Change learning_rate to %f at step %d' % (current_learning_rate, step))
                optimizer = torch.optim.Adam(
                    filter(lambda p: p.requires_grad, kge_model.parameters()),
                    lr=current_learning_rate
                )
                warm_up_steps = warm_up_steps * 3

            if step % args.save_checkpoint_steps == 0:
                save_variable_list = {
                    'step': step,
                    'current_learning_rate': current_learning_rate,
                    'warm_up_steps': warm_up_steps
                }
                if args.method is not None:
                    save_variable_list["confidence"] = train_iterator.dataloader_head.dataset.subsampling_weights
                save_model(kge_model, optimizer, save_variable_list, args, classifier=classifier, generator=generator)

            if step % args.log_steps == 0:
                metrics = {}
                for metric in training_logs[0].keys():
                    metrics[metric] = sum([log[metric] for log in training_logs]) / len(training_logs)
                log_metrics('Training average', step, metrics)
                training_logs = []

            if args.do_valid and step % args.valid_steps == 0:
                logging.info('Evaluating on Valid Dataset...')
                metrics = kge_model.test_step(kge_model, valid_triples, all_true_triples, args)
                log_metrics('Valid', step, metrics)
        save_variable_list = {
            'step': step,
            'current_learning_rate': current_learning_rate,
            'warm_up_steps': warm_up_steps
        }
        if args.method is not None:
            save_variable_list["confidence"] = train_iterator.dataloader_head.dataset.subsampling_weights
        save_model(kge_model, optimizer, save_variable_list, args, classifier=classifier, generator=generator)

    if args.do_valid:
        logging.info('Evaluating on Valid Dataset...')
        metrics = kge_model.test_step(kge_model, valid_triples, all_true_triples, args)
        log_metrics('Valid', step, metrics)

    if args.do_test:
        logging.info('Evaluating on Test Dataset...')
        metrics = kge_model.test_step(kge_model, test_triples, all_true_triples, args)
        log_metrics('Test', step, metrics)
        if args.method is not None:
            classifier.find_topK_triples(kge_model, classifier, train_iterator, clf_iterator,
                                         gen_iterator, soft=True, model_name=args.model)
            # torch.save(train_iterator.dataloader_head.dataset.subsampling_weights,
            #            os.path.join(args.save_path, 'weight'))
            true_triples = set(train_triples) - set(fake_triples)
            scores, label = [], []
            for triple in true_triples:
                if triple != (0, 0, 0):
                    scores.append(train_iterator.dataloader_head.dataset.subsampling_weights[triple].item())
                    label.append(1)
            for triple in fake_triples:
                if triple != (0, 0, 0):
                    scores.append(train_iterator.dataloader_head.dataset.subsampling_weights[triple].item())
                    label.append(0)
        else:
            print("start to use sigmoid to translate distance to probability")
            scores, label = [], []
            true_triples = set(train_triples) - set(fake_triples)
            i = 0
            import sys
            while i < len(train_iterator.dataloader_head.dataset.triples):
                sys.stdout.write("%d in %d\r" % (i, len(train_iterator.dataloader_head.dataset.triples)))
                sys.stdout.flush()
                j = min(i + 1024, len(train_iterator.dataloader_head.dataset.triples))
                sample = torch.LongTensor(train_iterator.dataloader_head.dataset.triples[i: j]).cuda()
                score = kge_model(sample).detach().cpu().view(-1)
                for x, triple in enumerate(train_iterator.dataloader_head.dataset.triples[i: j]):
                    if triple in true_triples:
                        label.append(1)
                        scores.append(torch.sigmoid(score[x]).item())
                    elif triple in fake_triples:
                        label.append(0)
                        scores.append(torch.sigmoid(score[x]).item())
                i = j
                del sample
                del score
        scores, label = np.array(scores), np.array(label)
        from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
        # Metrics with the true triples as the positive class.
        p = precision_score(label, scores > 0.5)
        r = recall_score(label, scores > 0.5)
        f1 = f1_score(label, scores > 0.5)
        auc = roc_auc_score(label, scores)  # AUC ranks the raw scores, not thresholded predictions
        logging.info(f"""
        precision = {p}
        recall = {r}
        f1 score = {f1}
        auc score = {auc}
        """)
        # The same metrics with the fake triples treated as the positive class.
        p = precision_score(1 - label, scores < 0.5)
        r = recall_score(1 - label, scores < 0.5)
        f1 = f1_score(1 - label, scores < 0.5)
        auc = roc_auc_score(1 - label, -scores)  # invert the ranking when fakes are the positive class
        logging.info(f"""
        precision = {p}
        recall = {r}
        f1 score = {f1}
        auc score = {auc}
        """)

    if args.evaluate_train:
        logging.info('Evaluating on Training Dataset...')
        metrics = kge_model.test_step(kge_model, train_triples, all_true_triples, args)
        log_metrics('Train', step, metrics)
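
# Illustrative sketch, not part of the original example: the training loops in
# these examples decay the learning rate by dividing it by 10 whenever `step`
# reaches `warm_up_steps` and then push the next decay point three times
# further out. A minimal standalone version of that schedule;
# `simulate_lr_schedule` and its arguments are placeholder names.
def simulate_lr_schedule(initial_lr, max_steps, warm_up_steps):
    """Yield (step, lr) pairs under the divide-by-10, triple-the-horizon rule."""
    lr = initial_lr
    for step in range(max_steps):
        if step >= warm_up_steps:
            lr = lr / 10           # decay by an order of magnitude
            warm_up_steps *= 3     # next decay point is three times further out
        yield step, lr

# For example, with initial_lr=0.0001, max_steps=200000, warm_up_steps=100000,
# the rate drops to 1e-05 at step 100000 and would not drop again before max_steps.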
Example #23
0
def main(args):
    if (not args.do_train) and (not args.do_valid) and (not args.do_test):
        raise ValueError('At least one of train/valid/test modes must be chosen.')

    if args.init_checkpoint:
        override_config(args)
    elif args.data_path is None:
        raise ValueError('One of init_checkpoint/data_path must be chosen.')

    if args.do_train and args.save_path is None:
        raise ValueError('Where do you want to save your trained model?')
    
    if args.save_path and not os.path.exists(args.save_path):
        os.makedirs(args.save_path)
    
    # Write logs to checkpoint and console
    set_logger(args)
    
    with open(os.path.join(args.data_path, 'entities.dict')) as fin:
        entity2id = dict()
        for line in fin:
            eid, entity = line.strip().split('\t')
            entity2id[entity] = int(eid)

    with open(os.path.join(args.data_path, 'relations.dict')) as fin:
        relation2id = dict()
        for line in fin:
            rid, relation = line.strip().split('\t')
            relation2id[relation] = int(rid)
    
    # Read regions for Countries S* datasets
    if args.countries:
        regions = list()
        with open(os.path.join(args.data_path, 'regions.list')) as fin:
            for line in fin:
                region = line.strip()
                regions.append(entity2id[region])
        args.regions = regions

    nentity = len(entity2id)
    nrelation = len(relation2id)
    
    args.nentity = nentity
    args.nrelation = nrelation
    
    logging.info('Model: %s' % args.model)
    logging.info('Data Path: %s' % args.data_path)
    logging.info('#entity: %d' % nentity)
    logging.info('#relation: %d' % nrelation)
    
    train_triples = read_triple(os.path.join(args.data_path, 'train.txt'), entity2id, relation2id)
    logging.info('#train: %d' % len(train_triples))
    valid_triples = read_triple(os.path.join(args.data_path, 'valid.txt'), entity2id, relation2id)
    logging.info('#valid: %d' % len(valid_triples))
    test_triples = read_triple(os.path.join(args.data_path, 'test.txt'), entity2id, relation2id)
    logging.info('#test: %d' % len(test_triples))
    
    # All true triples
    all_true_triples = train_triples + valid_triples + test_triples
    
    kge_model = KGEModel(
        model_name=args.model,
        nentity=nentity,
        nrelation=nrelation,
        hidden_dim=args.hidden_dim,
        type_dim=args.type_dim,
        gamma=args.gamma,
        gamma_type=args.gamma_type,
        gamma_pair=args.gamma_pair,
        double_entity_embedding=args.double_entity_embedding,
        double_relation_embedding=args.double_relation_embedding
    )
    
    logging.info('Model Parameter Configuration:')
    for name, param in kge_model.named_parameters():
        logging.info('Parameter %s: %s, require_grad = %s' % (name, str(param.size()), str(param.requires_grad)))

    if args.cuda:
        kge_model = kge_model.cuda()
    
    if args.do_train:
        # Set training dataloader iterator
        train_dataloader_head = DataLoader(
            TrainDataset(train_triples, nentity, nrelation, args.negative_sample_size, args.pair_sample_size, 'head-batch'), 
            batch_size=args.batch_size,
            shuffle=True, 
            num_workers=max(1, args.cpu_num//2),
            collate_fn=TrainDataset.collate_fn
        )
        
        train_dataloader_tail = DataLoader(
            TrainDataset(train_triples, nentity, nrelation, args.negative_sample_size, args.pair_sample_size, 'tail-batch'), 
            batch_size=args.batch_size,
            shuffle=True, 
            num_workers=max(1, args.cpu_num//2),
            collate_fn=TrainDataset.collate_fn
        )
        
        train_iterator = BidirectionalOneShotIterator(train_dataloader_head, train_dataloader_tail)
        
        # Set training configuration
        current_learning_rate = args.learning_rate
        optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, kge_model.parameters()), 
            lr=current_learning_rate
        )
        if args.warm_up_steps:
            warm_up_steps = args.warm_up_steps
        else:
            warm_up_steps = args.max_steps // 2

    if args.init_checkpoint:
        # Restore model from checkpoint directory
        logging.info('Loading checkpoint %s...' % args.init_checkpoint)
        checkpoint = torch.load(os.path.join(args.init_checkpoint, 'checkpoint'))
        init_step = checkpoint['step']
        kge_model.load_state_dict(checkpoint['model_state_dict'])
        if args.do_train:
            current_learning_rate = checkpoint['current_learning_rate']
            warm_up_steps = checkpoint['warm_up_steps']
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    else:
        logging.info('Randomly Initializing %s Model...' % args.model)
        init_step = 0
    
    step = init_step
    
    logging.info('Start Training...')
    logging.info('init_step = %d' % init_step)
    logging.info('batch_size = %d' % args.batch_size)
    logging.info('hidden_dim = %d' % args.hidden_dim)
    logging.info('gamma = %f' % args.gamma)
    logging.info('type_dim = %d' % args.type_dim)
    logging.info('gamma_type = %f' % args.gamma_type)
    logging.info('alpha_1 = %f' % args.alpha_1)
    logging.info('gamma_pair = %f' % args.gamma_pair)
    logging.info('alpha_2 = %f' % args.alpha_2)
    logging.info('negative_adversarial_sampling = %s' % str(args.negative_adversarial_sampling))
    logging.info('pair_sample_size = %d' % args.pair_sample_size)
    if args.negative_adversarial_sampling:
        logging.info('adversarial_temperature = %f' % args.adversarial_temperature)
    
    # Set valid dataloader as it would be evaluated during training
    
    if args.do_train:
        logging.info('learning_rate = %f' % current_learning_rate)

        training_logs = []
        
        # Training Loop
        for step in range(init_step, args.max_steps):
            log = kge_model.train_step(kge_model, optimizer, train_iterator, args)
            
            training_logs.append(log)
            
            if step >= warm_up_steps:
                current_learning_rate = current_learning_rate / 10
                logging.info('Change learning_rate to %f at step %d' % (current_learning_rate, step))
                optimizer = torch.optim.Adam(
                    filter(lambda p: p.requires_grad, kge_model.parameters()), 
                    lr=current_learning_rate
                )
                warm_up_steps = warm_up_steps * 3
            
            if step % args.save_checkpoint_steps == 0:
                save_variable_list = {
                    'step': step, 
                    'current_learning_rate': current_learning_rate,
                    'warm_up_steps': warm_up_steps
                }
                save_model(kge_model, optimizer, save_variable_list, args)
                
            if step % args.log_steps == 0:
                metrics = {}
                for metric in training_logs[0].keys():
                    metrics[metric] = sum([log[metric] for log in training_logs])/len(training_logs)
                log_metrics('Training average', step, metrics)
                training_logs = []
                
            if args.do_valid and step % args.valid_steps == 0:
                logging.info('Evaluating on Valid Dataset...')
                metrics = kge_model.test_step(kge_model, valid_triples, all_true_triples, args)
                log_metrics('Valid', step, metrics)
        
        save_variable_list = {
            'step': step, 
            'current_learning_rate': current_learning_rate,
            'warm_up_steps': warm_up_steps
        }
        save_model(kge_model, optimizer, save_variable_list, args)
        
    if args.do_valid:
        logging.info('Evaluating on Valid Dataset...')
        metrics = kge_model.test_step(kge_model, valid_triples, all_true_triples, args)
        log_metrics('Valid', step, metrics)
    
    if args.do_test:
        logging.info('Evaluating on Test Dataset...')
        metrics = kge_model.test_step(kge_model, test_triples, all_true_triples, args)
        log_metrics('Test', step, metrics)
    
    if args.evaluate_train:
        logging.info('Evaluating on Training Dataset...')
        metrics = kge_model.test_step(kge_model, train_triples, all_true_triples, args)
        log_metrics('Train', step, metrics)
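
# Illustrative sketch, not part of the original example: Example #23 relies on
# a read_triple helper whose body is not shown here. Assuming the train/valid/
# test files contain tab-separated (head, relation, tail) lines, a minimal
# implementation could look like this; the original may differ.
def read_triple(file_path, entity2id, relation2id):
    """Map tab-separated (head, relation, tail) lines to integer id triples."""
    triples = []
    with open(file_path) as fin:
        for line in fin:
            h, r, t = line.strip().split('\t')
            triples.append((entity2id[h], relation2id[r], entity2id[t]))
    return triples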
Example #24
0
# endregion

# region Configuration
device = "cuda"
tensorboard_log_dir = "./result/log/"
checkpoint_path = "./result/fr_en/TransE/checkpoint.tar"
embedding_path = "./result/fr_en/TransE/ATentsembed.txt"

learning_rate = 0.001
# endregion

# region Model and optimizer
model = KGEModel(
    t.train_seeds,
    nentity=entity_count,
    nrelation=attr_count,
    nvalue=value_count,
    hidden_dim=200,
    gamma=24.0,
).to(device)

optim = torch.optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=learning_rate
)
# endregion

# region Visualization
summary_writer = tensorboard.SummaryWriter(log_dir=tensorboard_log_dir)
# endregion
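
# region Logging helper (illustrative sketch, not part of the original script)
# Shows how the SummaryWriter configured above is typically used inside the
# training region that follows: one scalar per metric per global step.
# The function name and tags are placeholders.
def log_training_scalars(writer, step, loss_value, lr_value):
    """Write the current loss and learning rate to TensorBoard."""
    writer.add_scalar("train/loss", loss_value, global_step=step)
    writer.add_scalar("train/learning_rate", lr_value, global_step=step)
# endregion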

# region Start training