class Trainer(object):
    """Train and validate a LambdaRank model by holding out one set at a time.

    For each index in ``range(TRAIN_SET_NUM)`` the trainer feeds the model
    every other training set, then scores the held-out one. The scores are
    averaged and the model's ``dataspace`` is written to disk under a name
    derived from that average.
    """

    def __init__(self, dataset):
        """Keep the data source and create the model.

        dataset: object of DataInputer
        """
        self.dataset = dataset
        self.model = LambdaRank()

    def __call__(self):
        """Run the whole train/validate cycle; equivalent to ``run()``."""
        return self.run()

    def run(self):
        """Hold out each training set in turn, train on the rest, validate.

        Returns the mean of the per-fold validation scores. The same value
        is used (as a string) for the file name the model is saved under.
        """
        val_maps = []
        for val_idx in range(TRAIN_SET_NUM):
            # validate() reads the held-out index from the instance.
            self.val_idx = val_idx
            set_indexs = set(range(TRAIN_SET_NUM))
            set_indexs.discard(val_idx)
            # NOTE(review): self.model is created once in __init__ and never
            # reset here, so from the second fold on it has already been
            # trained on the set that is now held out -- confirm intended.
            self.train(set_indexs)
            val_res = self.validate()
            show_status(".. get map: " + str(val_res))
            val_maps.append(val_res)
        # float() avoids integer truncation of the mean on Python 2.
        map_res = sum(val_maps) / float(TRAIN_SET_NUM)
        show_status(".. get avage map: " + str(map_res))
        self.model.dataspace.tofile(data_path('models', str(map_res)))
        return map_res

    def train(self, set_indexs):
        """Feed every pair of the given training sets to the model.

        set_indexs: iterable of training-set indexes to train on.

        Each pair is two whitespace-separated strings of numbers; both are
        converted to float arrays before being passed to
        ``self.model.study_line``.
        """
        for set_idx in set_indexs:
            show_status(">>>" * 20)
            show_status(".. training %dth dataset" % set_idx)
            for dataset in self.dataset.get_dataset(set_idx):
                # Distinct names for the set index, the line index and the
                # comprehension variable: sharing one name made this lookup
                # use a stale value after the first pass through the pairs.
                dataset_len = len(self.dataset.train_pairs[set_idx])
                show_status("dataset len: %d" % dataset_len)
                for line_idx, (X1, X2) in enumerate(dataset):
                    print("train %dth line" % line_idx)
                    show_status(">> training data", line_idx, dataset_len)
                    vec1 = np.array([float(tok) for tok in X1.split()])
                    vec2 = np.array([float(tok) for tok in X2.split()])
                    self.model.study_line(vec1, vec2)

    def validate(self):
        """Score the held-out record and return the result of ``cal_map``.

        Reads ``self.dataset.trainset[self.val_idx]``, a comma-separated
        string with three fields (user id, positive papers, negative
        papers), predicts a score for every paper, orders the predictions by
        descending score and hands them to ``cal_map`` together with the
        positive papers. Nothing is saved here.
        """
        # TODO how to validate?
        vali_set = self.dataset.trainset[self.val_idx]
        uid, p_papers, n_papers = vali_set.split(',')
        uid = int(uid)
        # NOTE(review): these iterate the fields character by character, so
        # each paper id is a single digit and any separator inside a field
        # raises ValueError -- confirm the record format.
        p_papers = [int(ch) for ch in p_papers]
        n_papers = [int(ch) for ch in n_papers]
        predicts = []
        for paper in p_papers + n_papers:
            p_feature = self.dataset.get_data_line(uid, paper)
            score = self.model.predict(p_feature)
            predicts.append((paper, score))
        # Highest score first; the sort is stable, so equal scores keep
        # their original relative order.
        predicts.sort(key=lambda item: item[1], reverse=True)
        return cal_map(p_papers, predicts)
def __init__(self, dataset):
    """Attach the data source and a freshly built LambdaRank model.

    dataset: object of DataInputer
    """
    self.dataset = dataset
    ranker = LambdaRank()
    self.model = ranker