Ejemplo n.º 1
0
def test_classifier(model, filename=None, itemkey="track", selector="SELECT * FROM train"):
    """Estimate the RMSE of an SVD rating predictor by repeated random hold-out.

    The rows returned by ``selector`` are read once from ``db.sqlite``. Ten
    times over, a fresh copy of those rows is shuffled, the first 70% is used
    to fit an SVD model and the remaining 30% is scored with ``predict_item``.
    After every round the word ``iteration`` is printed; at the end the mean
    RMSE over all rounds and the total number of scored pairs are printed.

    :param model: not used by this function; kept so existing callers work.
    :param filename: optional path of a saved SVD model that is loaded into
        each fresh ``SVD`` instance before the training data is set.
    :param itemkey: name of the row column that identifies the rated item.
        It is used for both the training tuples and the evaluation lookups
        so that the two always refer to the same kind of item.
    :param selector: SQL statement run verbatim against the database; it must
        come from a trusted source. Every returned row must provide the
        ``rating`` and ``user`` columns plus the ``itemkey`` column.
    :return: None (results are printed).
    """
    rounds = 10
    train_fraction = 0.7
    K = 1000

    # The query result does not depend on the round, so fetch it once and
    # release the connection before the expensive computations start.
    conn = sqlite3.connect("db.sqlite")
    try:
        conn.row_factory = dict_factory
        rows = list(conn.cursor().execute(selector))
    finally:
        conn.close()

    split = int(len(rows) * train_fraction)

    s = 0
    c = 0
    t_p = 0
    for _ in range(rounds):
        svd = SVD()
        if filename is not None:
            svd.load_model(filename)

        # Shuffle a copy so every round starts from the rows in query order,
        # exactly as if the query had been executed again.
        shuffled = list(rows)
        random.shuffle(shuffled)

        svd.set_data([(x["rating"], x[itemkey], x["user"]) for x in shuffled[:split]])
        svd.compute(k=K, min_values=0.0, pre_normalize=None, mean_center=True, post_normalize=True)

        # (predicted rating, real rating) for every held-out row.
        pairs = [
            (predict_item(svd, item[itemkey], item["user"]), item["rating"])
            for item in shuffled[split:]
        ]
        t_p += len(pairs)
        s += RMSE(pairs).compute()
        c += 1.0
        print("iteration")
    print("%s %s" % (s / c, t_p))
Ejemplo n.º 2
0
def getSimilarityMatrix(svd_model_file):
    """Load the SVD model stored in ``svd_model_file`` and return the result
    of its ``get_matrix_similarity()`` call.
    """
    model = SVD()
    model.load_model(svd_model_file)
    similarity = model.get_matrix_similarity()
    return similarity
Ejemplo n.º 3
0
def test_load_model():
    """A model reloaded from the saved 'SVD_matrix' file must recommend the
    same items for USERID1 as the module-level ``svd`` instance."""
    model_path = os.path.join(MOVIELENS_DATA_PATH, 'SVD_matrix')
    reloaded = SVD()
    reloaded.load_model(model_path)

    expected = svd.recommend(USERID1, NUM_SIMILARS, is_row=False)
    actual = reloaded.recommend(USERID1, NUM_SIMILARS, is_row=False)
    assert_equal(expected, actual)
Ejemplo n.º 4
0
def test_load_model():
    """Check that loading the persisted 'SVD_matrix' model yields the same
    recommendations for USERID1 as the module-level ``svd`` instance."""
    saved_model = os.path.join(MOVIELENS_DATA_PATH, 'SVD_matrix')
    loaded_svd = SVD()
    loaded_svd.load_model(saved_model)

    from_memory = svd.recommend(USERID1, NUM_SIMILARS, is_row=False)
    from_disk = loaded_svd.recommend(USERID1, NUM_SIMILARS, is_row=False)
    assert_equal(from_memory, from_disk)
Ejemplo n.º 5
0
class RecommendSystem(object):
    """Rating predictor / recommender built on an ``SVD`` model.

    Typical flow: ``get_data()`` -> ``train(train)`` -> ``evaluation(test)``,
    then ``rs_predict`` / ``recommend_to_user``. The saved-model path
    (``tmpfile``) and the movie titles file (``moviefile``) are module-level
    names defined outside this class.
    """

    def __init__(self, filename, sep, **format):
        """
        :param filename: path of the ratings data file
        :param sep: field separator used in the ratings file
        :param format: keyword arguments forwarded as ``format`` to ``Data.load``
        """
        self.filename = filename
        self.sep = sep
        self.format = format

        # Training parameters passed to SVD.compute
        self.k = 100
        self.min_values = 10
        self.post_normalize = True

        self.svd = SVD()

        # True once a previously saved model has been loaded
        self.is_load = False

        # Data handling (loading and train/test split)
        self.data = Data()

        # Model evaluation
        self.rmse = RMSE()

    def get_data(self):
        """
        Load a saved model if one exists, otherwise load and split the ratings.

        :return: ``(train, test)`` when the ratings file was loaded, or
            ``(None, None)`` when a saved model was loaded instead
        :raises SystemExit: if neither the saved model nor the ratings file exists
        """
        # A saved model takes precedence over re-reading the raw data.
        if os.path.exists(tmpfile):
            self.svd.load_model(tmpfile)
            self.is_load = True
            return None, None

        if not os.path.exists(self.filename):
            # Exit with a message (and a non-zero status) instead of silently.
            sys.exit("ratings file not found: %s" % self.filename)

        self.data.load(self.filename, sep=self.sep, format=self.format)
        train, test = self.data.split_train_test(percent=80)
        return train, test

    def train(self, train):
        """
        Fit the SVD model unless a saved model was already loaded.

        :param train: training data set
        :return: None
        """
        if not self.is_load:
            self.svd.set_data(train)
            # savefile is passed without its extension; splitext also copes
            # with extensions that are not exactly four characters long.
            self.svd.compute(k=self.k,
                             min_values=self.min_values,
                             post_normalize=self.post_normalize,
                             savefile=os.path.splitext(tmpfile)[0])
        return None

    def rs_predict(self, itemid, userid):
        """
        Predict a rating and print it.

        :param itemid: movie id
        :param userid: user id
        :return: the predicted score
        """
        score = self.svd.predict(itemid, userid)
        print("推荐的分数为:%f" % score)
        return score

    @staticmethod
    def _load_movie_titles(path):
        """Return ``{movie id: title}`` read from a ``id::title::...`` file."""
        titles = {}
        with open(path, "r") as handle:
            for line in handle:
                fields = line.split("::")
                if len(fields) < 2:
                    continue
                try:
                    movie_id = int(fields[0])
                except ValueError:
                    # Skip lines whose first field is not a numeric id.
                    continue
                titles[movie_id] = fields[1]
        return titles

    def recommend_to_user(self, userid):
        """
        Print the recommended movies and their predicted scores for a user.

        :param userid: user id
        :return: None
        """
        recommend_list = self.svd.recommend(userid, is_row=False)

        # Titles are looked up by the id stored in the file, not by line
        # position: ids need not start at 0 or be contiguous.
        titles = self._load_movie_titles(moviefile)

        for itemid, rate in recommend_list:
            # Fall back to the raw id when the titles file has no such movie.
            title = titles.get(itemid, itemid)
            print("给您推荐了%s,我们预测分数为%s" % (title, rate))
        return None

    def evaluation(self, test):
        """
        Evaluate the freshly trained model on the test set and print the error.

        Nothing is done when the model was loaded from disk.

        :param test: test data set yielding (rating, item id, user id) tuples
        :return: None
        """
        if not self.is_load:
            for value, itemid, userid in test.get():
                try:
                    predict = self.rs_predict(itemid, userid)
                    self.rmse.add(value, predict)
                except KeyError:
                    # Pairs for which the lookup raises KeyError are skipped.
                    continue
            # Error as computed by the RMSE helper
            error = self.rmse.compute()

            print("模型误差为%s:" % error)

        return None
Ejemplo n.º 6
0
def load_svd(filename):
    """Create an ``SVD`` instance, load the model stored at ``filename`` into
    it and return the instance."""
    model = SVD()
    model.load_model(filename)
    return model
Ejemplo n.º 7
0
Archivo: day_07.py Proyecto: lmlzk/ML
class RecommendSystem(object):
    """Rating predictor / recommender built on an ``SVD`` matrix factorization.

    Typical flow: ``get_data()`` -> ``train(train)`` -> ``evaluation(test)``,
    then ``rs_predict`` / ``recommend_to_user``.

    NOTE(review): ``get_data`` and ``train`` read a bare ``filename`` name that
    is not the constructor argument; it is presumably a module-level path of
    the saved model defined elsewhere in the file - confirm.
    """

    # Titles file used by recommend_to_user; override on a subclass or an
    # instance to read the titles from another location.
    MOVIES_FILE = "./data/ml-1m/movies.dat"

    def __init__(self, filename, sep, **format):
        """
        :param filename: path of the ratings data file
        :param sep: field separator used in the ratings file
        :param format: keyword arguments forwarded as ``format`` to ``Data.load``
        """
        # File information
        self.filename = filename
        self.sep = sep
        self.format = format

        # Matrix factorization model
        self.svd = SVD()

        # Factorization settings passed to SVD.compute
        self.k = 100  # number of latent factors
        self.min_values = 10  # drop movies rated by fewer than 10 people
        self.post_normalize = False

        # Flag: True once a saved model has been loaded
        self.load_model = False

        # Root-mean-square-error accumulator
        self.rmse = RMSE()

    def get_data(self):
        """
        Load a saved model if one exists, otherwise load and split the ratings.

        :return: ``(train, test)`` when the ratings file was loaded, or
            ``(None, None)`` when a saved model was loaded instead
        :raises SystemExit: if neither the saved model nor the ratings file exists
        """
        # A saved model takes precedence over re-reading the raw data.
        if os.path.exists(filename):
            self.svd.load_model(filename)
            self.load_model = True
            return None, None

        if not os.path.exists(self.filename):
            # Exit with a message (and a non-zero status) instead of silently.
            sys.exit("ratings file not found: %s" % self.filename)

        data = Data()
        data.load(self.filename, sep=self.sep, format=self.format)

        # Split into training and test sets
        train, test = data.split_train_test(percent=80)
        return train, test

    def train(self, train):
        """
        Fit the SVD model unless a saved model was already loaded.

        :param train: training data set
        :return: None
        """
        if not self.load_model:
            self.svd.set_data(train)
            # The save name is passed without its extension; splitext also
            # copes with extensions that are not exactly four characters long.
            self.svd.compute(k=self.k,
                             min_values=self.min_values,
                             post_normalize=self.post_normalize,
                             savefile=os.path.splitext(filename)[0])
        return None

    @staticmethod
    def _load_movie_titles(path):
        """Return ``{movie id: title}`` read from a ``id::title::...`` file."""
        titles = {}
        with open(path, "r") as handle:
            for line in handle:
                fields = line.split("::")
                if len(fields) < 2:
                    continue
                try:
                    movie_id = int(fields[0])
                except ValueError:
                    # Skip lines whose first field is not a numeric id.
                    continue
                titles[movie_id] = fields[1]
        return titles

    def recommend_to_user(self, userid):
        """
        Print the recommended movies and their predicted ratings for a user.

        :param userid: user id
        :return: None
        """
        recommend_list = self.svd.recommend(userid, is_row=False)

        # Titles are looked up by the id stored in the file, not by line
        # position: ids need not start at 0 or be contiguous.
        titles = self._load_movie_titles(self.MOVIES_FILE)

        for itemid, rating in recommend_list:
            # Fall back to the raw id when the titles file has no such movie.
            title = titles.get(itemid, itemid)
            print("给你推荐的电影叫%s, 预测你对它的评分是%f" % (title, rating))

        return None

    def rs_predict(self, userid, itemid):
        """
        Predict a rating.

        :param userid: user id
        :param itemid: item id
        :return: the predicted score
        """
        score = self.svd.predict(itemid, userid)

        return score

    def evaluation(self, test):
        """
        Evaluate the freshly trained model on the test set and print the error.

        Nothing is done when the model was loaded from disk.

        :param test: test data yielding (rating, item id, user id) tuples
        :return: None
        """
        if not self.load_model:
            for rating, itemid, userid in test.get():
                try:
                    # rating is the real value, score the predicted one
                    score = self.rs_predict(userid, itemid)
                    self.rmse.add(rating, score)
                except KeyError:
                    # Pairs for which the lookup raises KeyError are skipped.
                    continue

            error = self.rmse.compute()

            print("均方误差为:%s" % error)

        return None