def __init__(self):
    '''
    Set up the database helper and the feature extractor, then cache the
    favor records and the entry ids returned by the database helper.
    '''
    helper = DBHelper()
    self.db = helper
    # The Word2Vec extractor stays disabled: self.w2v = Word2Vec()
    self.gally = Gallimaufry()
    self.favors = helper.get_all_favor_entrys()
    self.entryids = helper.get_entryids()
class RecommenderBase(object):
    '''
    Base class for recommendation algorithms.

    Subclasses must implement ``recommend`` and ``train``. This class
    provides the shared helpers that build training/test feature matrices
    and their accept/reject labels.
    '''

    def __init__(self):
        self.db = DBHelper()
        # Word2Vec features are disabled for now: self.w2v = Word2Vec()
        self.gally = Gallimaufry()
        # Read below as favors[uid][entryid] -> sequence of timestamps.
        self.favors = self.db.get_all_favor_entrys()
        self.entryids = self.db.get_entryids()

    def recommend(self, uid):
        '''
        Recommend entries for the given user.
        -----------------------------
        uid: ID of the user to recommend for
        -----------------------------
        return: [entryid,...]
        '''
        raise NotImplementedError()

    def train(self):
        '''
        Train the model.
        '''
        raise NotImplementedError()

    def load_trainset(self):
        '''
        Build the training set: one sample per (uid, entryid) pair.
        ----------------------------
        return: (features, results)
            features: float16 ndarray of shape (n_samples, featuresize)
            results:  ndarray of length n_samples; 0 means the entry was
                      not accepted, 1 means it was accepted
        '''
        uids = self.db.get_uids()
        trainsetsize = len(uids) * len(self.entryids)
        # One contiguous allocation; every row is filled by
        # _construct_features below.
        # NOTE(review): float16 here vs float32 in load_testset - confirm the
        # precision difference is intentional (memory saving?).
        trainset = np.empty((trainsetsize, self._get_featuresize()),
                            dtype=np.float16)
        results = np.zeros(trainsetsize)
        index = 0
        for uid in uids:
            # Progress output; print(x) behaves the same on Python 2 and 3.
            print(uid)
            for entryid in self.entryids:
                self._construct_features(uid, entryid, trainset[index],
                                         train=True)
                results[index] = self._receive_suggestion(
                    uid, entryid, config.trainset_timespan)
                index += 1
        return trainset, results

    def load_testset(self, uid):
        '''
        Load the test data for the given user: one sample per entry id.
        ------------------------------
        uid: user ID
        ------------------------------
        return: (features, results)
            features: float32 ndarray of shape (n_entries, featuresize)
            results:  [(entryid, result),...] with result 0 or 1
        '''
        # The feature size is computed once, not once per row.
        testset = np.empty((len(self.entryids), self._get_featuresize()),
                           dtype=np.float32)
        results = []
        for index, entryid in enumerate(self.entryids):
            self._construct_features(uid, entryid, testset[index],
                                     train=False)
            receive = self._receive_suggestion(uid, entryid,
                                               config.testset_timespan)
            results.append((entryid, receive))
        return testset, results

    def _receive_suggestion(self, uid, entryid, timespan):
        '''
        Tell whether the user accepted the entry within a time span.
        --------------------------------------------
        uid: user ID
        entryid: entry ID
        timespan: (start, end), both bounds inclusive
        --------------------------------------------
        return: 0 -> not accepted, 1 -> accepted

        A user or entry without favor records, or with an empty record
        list, yields 0. The timestamps need not be sorted.
        '''
        user_favors = self.favors.get(uid)
        if not user_favors or entryid not in user_favors:
            return 0
        start, end = timespan[0], timespan[1]
        for moment in user_favors[entryid]:
            if start <= moment <= end:
                return 1
        return 0

    def _construct_features(self, uid, entryid, features, train=False):
        '''
        Fill the feature vector using the available feature extractors.
        ----------------------------------
        uid: user ID
        entryid: entry ID
        features: ndarray that receives the feature values (written in place)
        train: whether the features are for a training sample
        '''
        # Only the Gallimaufry extractor is active; the Word2Vec one is
        # disabled.
        gallyfeatures = self.gally.extract(uid, entryid, train)
        for position, value in enumerate(gallyfeatures):
            features[position] = value

    def _get_featuresize(self):
        '''
        Get the dimension of the feature space.
        -------------------------
        return: feature size
        '''
        return self.gally.featuresize()