def cal_mrr(model, dataset):
    """Return the mean reciprocal rank (MRR) of ``model`` over ``dataset``.

    Iterates the dataset in fixed order, asks the model for the rank of the
    gold entity of each (head, relation, tail) sample, and averages 1/rank.

    :param model: model exposing ``cal_rank(head, relation, tail) -> int``
    :param dataset: sized dataset iterable via ``data_iter``
    :return: float MRR in (0, 1]; 0.0 for an empty dataset
    """
    n_sample = len(dataset)
    if n_sample == 0:
        # avoid ZeroDivisionError on an empty evaluation set
        return 0.0
    sum_rr = 0.0
    for sample in data_iter(dataset, rand_flg=False):
        rank = model.cal_rank(sample[0], sample[1], sample[2])
        # 1.0 / rank: force true division — the original `float(1/rank)`
        # truncates to 0 for any rank > 1 under integer-division semantics
        sum_rr += 1.0 / rank
    return sum_rr / n_sample
def multi_cal_hits(model, dataset, nbest):
    """Return HITS@nbest over ``dataset`` using a multiprocessing pool.

    Each (head, relation) query is scored in parallel via
    ``model.most_similar_multi``; a sample counts as a hit when its gold
    tail (``dataset.samples[i][2]``) appears in the returned candidates.

    :param model: model exposing ``most_similar_multi([head, rel, nbest])``
    :param dataset: sized dataset with ``samples`` and iterable via ``data_iter``
    :param nbest: number of candidates to retrieve per query
    :return: float hit ratio in [0, 1]
    """
    n_sample = len(dataset)
    pool = Pool(n_cpu)
    try:
        args_list = [[sample[0], sample[1], nbest]
                     for sample in data_iter(dataset, rand_flg=False)]
        res = pool.map(model.most_similar_multi, args_list)
    finally:
        # fix: the original never released the pool, leaking worker processes
        pool.close()
        pool.join()
    n_corr = sum(1 for i in range(n_sample)
                 if dataset.samples[i][2] in res[i])
    # float(n_corr) / n_sample: true division — the original
    # `float(n_corr / n_sample)` truncates under integer division
    return float(n_corr) / n_sample
def cal_hits(model, dataset, nbest):
    """Return HITS@nbest over ``dataset`` (single-process version).

    A sample counts as a hit when its gold tail (``sample[2]``) appears in
    the top-``nbest`` candidates returned by ``model.most_similar``.

    :param model: model exposing ``most_similar(head, relation, nbest)``
    :param dataset: sized dataset iterable via ``data_iter``
    :param nbest: number of candidates to retrieve per query
    :return: float hit ratio in [0, 1]
    """
    n_sample = len(dataset)
    # idiom: generator + sum() replaces the manual counter loop
    n_corr = sum(1 for sample in data_iter(dataset, rand_flg=False)
                 if sample[2] in model.most_similar(sample[0], sample[1], nbest))
    # float(n_corr) / n_sample: true division — the original
    # `float(n_corr / n_sample)` truncates under integer division
    return float(n_corr) / n_sample
def run(self, train_dat, valid_dat=None, eval_step=1):
    """Train ``self.model`` for ``self.n_epoch`` epochs with negative sampling.

    Per epoch: iterate positive samples, corrupt each tail ``self.n_negative``
    times via ``self.sampler``, apply one model update per negative sample,
    and checkpoint the model. When ``valid_dat`` is given, evaluate every
    ``eval_step`` epochs and keep a deep copy of the best-scoring model,
    saved as ``bestmodel`` at the end.

    :param train_dat: training dataset iterable via ``data_iter``
    :param valid_dat: optional validation dataset for model selection
    :param eval_step: evaluate every this many epochs
    """
    best_model = None
    best_val = -1
    best_epoch = 0
    for epoch in range(self.n_epoch):
        self.logger.info('start {} epoch'.format(epoch + 1))
        sum_loss = 0
        start = time.time()
        for i, pos_sample in enumerate(data_iter(train_dat)):
            # negative sampling: keep (head, relation), corrupt the tail
            neg_samples = [(pos_sample[0], pos_sample[1], self.sampler.sample())
                           for _ in range(self.n_negative)]
            for neg_sample in neg_samples:
                # NOTE(review): spelled `updata` here but `update` elsewhere
                # in this file — confirm which name the model actually defines
                loss = self.model.updata(pos_sample, neg_sample)
                sum_loss += loss
            print('processing {} samples in this epoch'.format(i + 1))
        self.logger.info('sum_loss: {}'.format(sum_loss))
        self.logger.info('{} sec/epoch for training'.format(time.time() - start))
        model_path = os.path.join(self.log_dir, 'model{}'.format(epoch + 1))
        self.model.save_model(model_path)
        if valid_dat and (epoch + 1) % eval_step == 0:
            # evaluation
            val = self.evaluator.run(self.model, valid_dat)
            self.logger.info('validation: {}'.format(val))
            if val > best_val:
                best_model = copy.deepcopy(self.model)
                best_val = val
                best_epoch = epoch + 1
    # fix: guard against best_model being None (evaluation never ran,
    # e.g. eval_step > n_epoch) — original crashed with AttributeError
    if valid_dat and best_model is not None:
        self.logger.info('best model is {} epoch'.format(best_epoch))
        model_path = os.path.join(self.log_dir, 'bestmodel')
        best_model.save_model(model_path)
    self.logger.info('done all')
def train(args):
    """End-to-end training entry point driven by parsed CLI arguments.

    Sets up file logging, loads entity/relation vocabularies and the triplet
    datasets, builds (or restores) a ``GaussianBilinearModel``, then runs
    negative-sampling SGD training, periodically evaluating on the validation
    set and saving the best checkpoint when ``args.valid`` is given.

    :param args: argparse-style namespace (log, entity, relation, train,
        valid, metric, nbest, restart, lr, gradclip, dim, cmin, cmax, tri,
        init_sigma, epoch, num_negative, evalstep)
    :raises NotImplementedError: when vocab files are not supplied
        (building vocabs from the train set is not implemented)
    """
    # --- log directory: explicit path, or a timestamped dir next to this file
    if args.log:
        log_dir = args.log
    else:
        log_dir = os.path.join(
            os.path.abspath(os.path.dirname(__file__)),
            '{}'.format(datetime.now().strftime('%Y%m%d_%H:%M')))
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)

    # setting for logging: mirror INFO messages into <log_dir>/log
    logger = logging.getLogger()
    logging.basicConfig(level=logging.INFO)
    log_path = os.path.join(log_dir, 'log')
    file_handler = logging.FileHandler(log_path)
    fmt = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    file_handler.setFormatter(fmt)
    logger.addHandler(file_handler)

    logger.info('Arguments...')
    for arg, val in vars(args).items():
        logger.info('{} : {}'.format(arg, val))

    logger.info('Preparing dataset...')
    if not args.entity or not args.relation:
        # make vocab from train set
        logger.info('Making entity/relation vocab from train data...')
        raise NotImplementedError()
    else:
        ent_vocab = Vocab.load(args.entity)
        rel_vocab = Vocab.load(args.relation)
        n_entity, n_relation = len(ent_vocab), len(rel_vocab)
    train_dat = TripletDataset.load(args.train, ent_vocab, rel_vocab)
    logger.info('')
    if args.valid:
        # fix: check presence before validity, so an unset metric produces
        # the specific "please indicate" message (this assert was dead code
        # when ordered after the membership check)
        assert args.metric, 'Please indicate evaluation metric for validation'
        assert args.metric in ['mrr', 'hits'], \
            'Invalid evaluation metric: {}'.format(args.metric)
        if args.metric == 'hits':
            assert args.nbest, 'Please indicate nbest for hits'
        valid_dat = TripletDataset.load(args.valid, ent_vocab, rel_vocab)

    if args.restart:
        logger.info('Restarting training: {}'.format(args.restart))
        model = GaussianBilinearModel.load_model(args.restart)
    else:
        logger.info('Building new model')
        opt = SGD(args.lr, args.gradclip)
        model = GaussianBilinearModel(n_entity, n_relation, args.dim,
                                      args.cmin, args.cmax, opt, args.tri,
                                      args.init_sigma)

    best_model = None
    best_val = -1
    best_epoch = 0  # fix: was unbound (NameError) when validation never ran
    for epoch in range(args.epoch):
        logger.info('start {} epoch'.format(epoch + 1))
        sum_loss = 0
        start = time.time()
        for i, pos_sample in enumerate(data_iter(train_dat)):
            # negative sampling: corrupt the tail with a uniform random entity
            neg_samples = [(pos_sample[0], pos_sample[1],
                            np.random.randint(n_entity))
                           for _ in range(args.num_negative)]
            for neg_sample in neg_samples:
                loss = model.update(pos_sample, neg_sample)
                sum_loss += loss
            print('processing {} samples in this epoch'.format(i + 1))
        logger.info('sum loss: {}'.format(sum_loss))
        logger.info('{} sec/epoch for training'.format(time.time() - start))
        model_path = os.path.join(log_dir, 'model{}'.format(epoch + 1))
        model.save_model(model_path)
        if args.valid and (epoch + 1) % args.evalstep == 0:
            val = evaluation(model, valid_dat, args.metric, args.nbest)
            logger.info('{} in validation: {}'.format(args.metric, val))
            if val > best_val:
                best_model = copy.deepcopy(model)
                best_val = val
                best_epoch = epoch + 1
    # fix: guard against best_model being None (evalstep > epoch count) —
    # original crashed with AttributeError on save_model
    if args.valid and best_model is not None:
        logger.info('best model is {} epoch'.format(best_epoch))
        model_path = os.path.join(log_dir, 'bestmodel')
        best_model.save_model(model_path)
    logger.info('done all')