class CFModel(CFModelBase):
    def cbui(self, user_id, item_id, t):
        # Baseline prediction from the fixed estimates: global mean mu(t),
        # user and item baselines, and the time-dependent item component.
        return self.mu.query(t) + self.cbu[user_id] + self.cbi[item_id] + \
            self.cbit[item_id].query(t)

    def bui(self, user_id, item_id, t):
        # Same baseline, but with the trainable biases bu/bi in place of the
        # fixed cbu/cbi estimates.
        return self.mu.query(t) + self.bu[user_id] + self.bi[item_id] + \
            self.cbit[item_id].query(t)

    def load(self, filename):
        # The pickled model is binary, so read the file in binary mode.
        with open(filename, "rb") as file_read:
            self.mu, self.bu, self.bi, \
                self.cbu, self.cbi, self.cbit, self.q, self.x, \
                self.alpha, self.f = cPickle.loads(file_read.read())
        return self

    def save(self, filename):
        # protocol=-1 selects the highest (binary) pickle protocol, so the
        # file must be opened in binary mode.
        with open(filename, "wb") as file_write:
            file_write.write(cPickle.dumps((self.mu, self.bu, self.bi,
                                            self.cbu, self.cbi, self.cbit,
                                            self.q, self.x,
                                            self.alpha, self.f),
                                           protocol=-1))

    def baselines(self, data, reg_i, reg_u, reg_it, width_mu, width_it):
        # Time-windowed global mean mu(t); the user and item ids are not
        # needed here.
        logging.info("Computing mu...")
        self.mu = WindowedAverage(width_mu)
        for v1, v2, r, t in data.iter_ratings():
            self.mu.add(t, r)
        self.mu.process()

        # Regularized per-item baselines: mean residual against mu(t).
        logging.info("Computing item baselines...")
        self.cbi = []
        for ratings in data.r_i:
            t = 0
            n = 0
            for user_id, r, timestamp in ratings:
                t += (r - self.mu.query(timestamp))
                n += 1
            self.cbi.append(t / float(reg_i + n))
        self.cbi = array(self.cbi)

        # Time-dependent item baselines: windowed average of the residual
        # left after mu(t) and the static item baseline.
        logging.info("Computing item baseline functions...")
        self.cbit = []
        for item_id, ratings in enumerate(data.r_i):
            cb = WindowedAverage(width_it, reg_it)
            for user_id, r, t in ratings:
                cb.add(t, r - self.cbi[item_id] - self.mu.query(t))
            self.cbit.append(cb.process())

        # Regularized per-user baselines on the remaining residual.
        logging.info("Computing user baselines...")
        self.cbu = []
        for user_id, ratings in data.iter_users():
            t = 0
            n = 0
            for item_id, r, timestamp in ratings:
                t += (r - self.cbi[item_id] -
                      self.cbit[item_id].query(timestamp) -
                      self.mu.query(timestamp))
                n += 1
            self.cbu.append(t / float(reg_u + n))
        self.cbu = array(self.cbu)

        # The trainable biases start from the fixed baseline estimates.
        self.bi = array(self.cbi)
        self.bu = array(self.cbu)

    def train(self, data, reg, reg_i, reg_u, reg_it, width_mu, width_it,
              min_iter, max_iter, step_size):
        self.baselines(data, reg_i, reg_u, reg_it, width_mu, width_it)

        logging.info("Performing optimization...")
        # Initialize the per-item factor vectors x and q with small random values.
        if self.x is None:
            self.x = [array([(random.random() - 0.5) / 100000.0
                             for i in range(self.f)]) for j in range(data.m)]
        if self.q is None:
            self.q = [array([(random.random() - 0.5) / 100000.0
                             for i in range(self.f)]) for j in range(data.m)]

        last_tot = float('inf')
        for iter in range(max_iter):
            tot = 0
            rmse = 0
            n = 0
            for user_id in range(data.n):
                # Implicit user factor p, built from the items the user rated
                # and weighted by |R(u)|^(-alpha).
                cw = len(data.r_u[user_id]) ** (-self.alpha)
                p = cw * sum((r - self.cbui(user_id, item_id, t)) * self.x[item_id]
                             for item_id, r, t in data.r_u[user_id])
                s = 0
                for item_id, r, t in data.r_u[user_id]:
                    # Predicted rating and SGD updates for q, bu and bi;
                    # s accumulates e * q_i for the later x updates.
                    rp = self.bui(user_id, item_id, t) + dot(self.q[item_id], p)
                    e = r - rp
                    tot += e ** 2 + reg * (self.bu[user_id] ** 2 +
                                           self.bi[item_id] ** 2 +
                                           dot(self.q[item_id], self.q[item_id]))
                    s += e * self.q[item_id]
                    self.q[item_id] += step_size * (e * p - reg * self.q[item_id])
                    self.bu[user_id] += step_size * (e - reg * self.bu[user_id])
                    self.bi[item_id] += step_size * (e - reg * self.bi[item_id])
                    rmse += e ** 2
                    n += 1
                for item_id, r, t in data.r_u[user_id]:
                    # SGD update for the x factors, using the accumulated error s.
                    tot += reg * len(data.r_u[user_id]) * \
                        dot(self.x[item_id], self.x[item_id])
                    self.x[item_id] += step_size * (cw *
                        (r - self.cbui(user_id, item_id, t)) * s -
                        reg * self.x[item_id])
            logging.info("%s: %s, %s", iter, (rmse / n) ** 0.5, tot)
            # Stop early once the regularized objective stops improving.
            if iter >= min_iter and tot > last_tot:
                logging.info("Stopping early")
                return
            last_tot = tot
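
# Usage sketch (illustrative only). This section does not show how the rating
# data is loaded, how CFModelBase's constructor initializes f, alpha, q and x,
# or how WindowedAverage works, so the loader call, constructor arguments and
# hyperparameter values below are assumptions for illustration, not the
# module's actual interface or defaults.
#
# if __name__ == "__main__":
#     logging.basicConfig(level=logging.INFO)
#     data = load_ratings("ratings.dat")   # hypothetical loader returning an
#                                          # object with r_u, r_i, n, m,
#                                          # iter_ratings() and iter_users()
#     model = CFModel(f=50, alpha=0.5)     # hypothetical constructor arguments
#     model.train(data, reg=0.05, reg_i=25, reg_u=10, reg_it=5,
#                 width_mu=90, width_it=30,
#                 min_iter=5, max_iter=50, step_size=0.005)
#     model.save("cfmodel.pkl")
#     restored = CFModel(f=50, alpha=0.5).load("cfmodel.pkl")
#     print restored.bui(user_id=0, item_id=0, t=0)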