Code example #1
File: mrr.py Project: mana-ysh/gaussian-embedding
def cal_mrr(model, dataset):
    # Mean Reciprocal Rank: average of 1/rank of the gold tail entity
    # over all (head, relation, tail) triples in the evaluation set.
    n_sample = len(dataset)
    sum_rr = 0.0
    for sample in data_iter(dataset, rand_flg=False):
        rank = model.cal_rank(sample[0], sample[1], sample[2])
        sum_rr += 1.0 / rank  # float literal avoids integer truncation under Python 2
    return sum_rr / n_sample
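
For intuition, the metric above is just the average of 1/rank over the evaluation triples. A minimal standalone sketch of that arithmetic, with made-up ranks (only the notion of a gold-entity rank is taken from the code above):

# toy MRR computation with hypothetical ranks of the gold tail entity
ranks = [1, 2, 5, 10]
mrr = sum(1.0 / r for r in ranks) / len(ranks)
print(mrr)  # (1 + 0.5 + 0.2 + 0.1) / 4 = 0.45
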
Code example #2
File: hits.py Project: mana-ysh/gaussian-embedding
def multi_cal_hits(model, dataset, nbest):
    # Hits@nbest computed in parallel; Pool is multiprocessing.Pool and
    # n_cpu is the worker count defined at module level in hits.py.
    n_sample = len(dataset)
    pool = Pool(n_cpu)
    # one (head, relation, nbest) argument list per evaluation triple
    args_list = [[sample[0], sample[1], nbest]
                 for sample in data_iter(dataset, rand_flg=False)]
    res = pool.map(model.most_similar_multi, args_list)
    # a triple counts as correct if its gold tail is among the top-nbest candidates
    n_corr = sum(1 for i in range(n_sample) if dataset.samples[i][2] in res[i])
    return float(n_corr) / n_sample
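
The parallel variant fans the per-triple queries out with multiprocessing.Pool.map. A minimal sketch of the same pattern with a plain top-level worker function (the worker and its inputs are hypothetical, not part of the project):

from multiprocessing import Pool

def score_query(args):
    head, rel, nbest = args
    # hypothetical worker: would call the model; here it just returns nbest dummy ids
    return list(range(nbest))

if __name__ == '__main__':
    queries = [(0, 1, 5), (2, 1, 5), (3, 0, 5)]
    with Pool(2) as pool:
        results = pool.map(score_query, queries)
    print(results)  # one top-5 candidate list per query

Note that handing a bound method such as model.most_similar_multi to pool.map, as the project code does, additionally requires the model object itself to be picklable.
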
Code example #3
File: hits.py Project: mana-ysh/gaussian-embedding
def cal_hits(model, dataset, nbest):
    # Hits@nbest: fraction of triples whose gold tail appears in the
    # model's top-nbest candidates for (head, relation).
    n_sample = len(dataset)
    n_corr = 0
    for sample in data_iter(dataset, rand_flg=False):
        res = model.most_similar(sample[0], sample[1], nbest)
        if sample[2] in res:
            n_corr += 1
    return float(n_corr) / n_sample
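
Hits@N is the fraction of evaluation triples whose gold tail shows up in the model's top-N candidates. A toy computation of the same quantity (the gold tails and candidate lists below are invented):

# hypothetical gold tails and top-3 candidate lists from a most_similar-style call
gold = [4, 7, 2]
topn = [[4, 1, 9], [3, 5, 8], [2, 4, 6]]
hits_at_3 = sum(1 for g, cand in zip(gold, topn) if g in cand) / float(len(gold))
print(hits_at_3)  # 2 of 3 gold tails are ranked in the top 3, so 0.666...
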
Code example #4
    def run(self, train_dat, valid_dat=None, eval_step=1):
        best_model = None
        best_val = -1
        best_epoch = 0
        for epoch in range(self.n_epoch):
            self.logger.info('start {} epoch'.format(epoch + 1))
            sum_loss = 0
            start = time.time()
            for i, pos_sample in enumerate(data_iter(train_dat)):
                # corrupt the tail of each positive triple n_negative times
                neg_samples = [(pos_sample[0], pos_sample[1],
                                self.sampler.sample())
                               for _ in range(self.n_negative)]
                for neg_sample in neg_samples:
                    loss = self.model.update(pos_sample, neg_sample)
                    sum_loss += loss
                print('processing {} samples in this epoch'.format(i + 1))
            self.logger.info('sum_loss: {}'.format(sum_loss))
            self.logger.info('{} sec/epoch for training'.format(time.time() -
                                                                start))
            model_path = os.path.join(self.log_dir,
                                      'model{}'.format(epoch + 1))
            self.model.save_model(model_path)

            if valid_dat and (epoch + 1) % eval_step == 0:  # evaluation
                val = self.evaluator.run(self.model, valid_dat)
                self.logger.info('validation: {}'.format(val))
                if val > best_val:
                    best_model = copy.deepcopy(self.model)
                    best_val = val
                    best_epoch = epoch + 1

        if valid_dat:
            self.logger.info('best model is {} epoch'.format(best_epoch))
            model_path = os.path.join(self.log_dir, 'bestmodel')
            best_model.save_model(model_path)

        self.logger.info('done all')
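
The trainer above only assumes a sampler object exposing sample(), which returns a corrupted tail entity id for negative sampling. A minimal uniform sampler of that shape (the class name and constructor are assumptions for illustration, not the project's actual sampler):

import numpy as np

class UniformNegativeSampler(object):
    # hypothetical sampler: draws a uniformly random entity id as a corrupted tail
    def __init__(self, n_entity, seed=None):
        self.n_entity = n_entity
        self.rng = np.random.RandomState(seed)

    def sample(self):
        return self.rng.randint(self.n_entity)

# usage sketch: sampler = UniformNegativeSampler(n_entity); sampler.sample() -> entity id
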
Code example #5
def train(args):

    if args.log:
        log_dir = args.log
    else:
        log_dir = os.path.join(
            os.path.abspath(os.path.dirname(__file__)),
            '{}'.format(datetime.now().strftime('%Y%m%d_%H:%M')))

    if not os.path.exists(log_dir):
        os.mkdir(log_dir)

    # setting for logging
    logger = logging.getLogger()
    logging.basicConfig(level=logging.INFO)
    log_path = os.path.join(log_dir, 'log')
    file_handler = logging.FileHandler(log_path)
    fmt = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    file_handler.setFormatter(fmt)
    logger.addHandler(file_handler)

    logger.info('Arguments...')
    for arg, val in vars(args).items():
        logger.info('{} : {}'.format(arg, val))

    logger.info('Preparing dataset...')
    if not args.entity or not args.relation:
        # make vocab from train set
        logger.info('Making entity/relation vocab from train data...')
        raise NotImplementedError()
    else:
        ent_vocab = Vocab.load(args.entity)
        rel_vocab = Vocab.load(args.relation)

    n_entity, n_relation = len(ent_vocab), len(rel_vocab)
    train_dat = TripletDataset.load(args.train, ent_vocab, rel_vocab)
    logger.info('Training data: {} triples'.format(len(train_dat)))
    if args.valid:
        assert args.metric, 'Please indicate evaluation metric for validation'
        assert args.metric in ['mrr', 'hits'], \
            'Invalid evaluation metric: {}'.format(args.metric)
        if args.metric == 'hits':
            assert args.nbest, 'Please indicate nbest for hits'
        valid_dat = TripletDataset.load(args.valid, ent_vocab, rel_vocab)

    if args.restart:
        logger.info('Restarting training: {}'.format(args.restart))
        model = GaussianBilinearModel.load_model(args.restart)
    else:
        logger.info('Building new model')
        opt = SGD(args.lr, args.gradclip)
        model = GaussianBilinearModel(n_entity, n_relation, args.dim,
                                      args.cmin, args.cmax, opt, args.tri,
                                      args.init_sigma)

    best_model = None
    best_val = -1
    best_epoch = 0
    for epoch in range(args.epoch):
        logger.info('start {} epoch'.format(epoch + 1))
        sum_loss = 0
        start = time.time()
        for i, pos_sample in enumerate(data_iter(train_dat)):
            # corrupt the tail with a uniformly sampled entity id, num_negative times
            neg_samples = [(pos_sample[0], pos_sample[1],
                            np.random.randint(n_entity))
                           for _ in range(args.num_negative)]
            for neg_sample in neg_samples:
                loss = model.update(pos_sample, neg_sample)
                sum_loss += loss
                # logger.info('loss: {}'.format(loss))
            # logger.info('processing {} samples in this epoch'.format(i+1))
            print('processing {} samples in this epoch'.format(i + 1))
        logger.info('sum loss: {}'.format(sum_loss))
        logger.info('{} sec/epoch for training'.format(time.time() - start))
        model_path = os.path.join(log_dir, 'model{}'.format(epoch + 1))
        model.save_model(model_path)
        if args.valid and (epoch + 1) % args.evalstep == 0:
            val = evaluation(model, valid_dat, args.metric, args.nbest)
            logger.info('{} in validation: {}'.format(args.metric, val))
            if val > best_val:
                best_model = copy.deepcopy(model)
                best_val = val
                best_epoch = epoch + 1

    if args.valid:
        logger.info('best model is {} epoch'.format(best_epoch))
        model_path = os.path.join(log_dir, 'bestmodel')
        best_model.save_model(model_path)

    logger.info('done all')
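
train() pulls its whole configuration from an argparse-style namespace. A hedged sketch of a parser that would produce the attributes used above; the flag spellings, defaults, and the store_true treatment of --tri are assumptions inferred from the attribute names, not the project's actual CLI:

import argparse

def parse_args():
    # hypothetical CLI producing the args.* attributes read by train();
    # flag names mirror the attribute names, everything else is an assumption
    p = argparse.ArgumentParser(description='train a Gaussian bilinear embedding model')
    p.add_argument('--train', required=True, help='training triples')
    p.add_argument('--valid', help='validation triples')
    p.add_argument('--entity', help='entity vocab file')
    p.add_argument('--relation', help='relation vocab file')
    p.add_argument('--metric', choices=['mrr', 'hits'], help='validation metric')
    p.add_argument('--nbest', type=int, help='N for Hits@N')
    p.add_argument('--epoch', type=int, default=100)
    p.add_argument('--num_negative', type=int, default=10)
    p.add_argument('--evalstep', type=int, default=1)
    p.add_argument('--lr', type=float, default=0.01)
    p.add_argument('--gradclip', type=float, default=5.0)
    p.add_argument('--dim', type=int, default=50)
    p.add_argument('--cmin', type=float)
    p.add_argument('--cmax', type=float)
    p.add_argument('--tri', action='store_true')
    p.add_argument('--init_sigma', type=float, default=1.0)
    p.add_argument('--restart', help='path to a saved model to resume training from')
    p.add_argument('--log', help='log directory')
    return p.parse_args()
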