Beispiel #1
0
def main():
    """Run the Bayesian Poisson factorization sampler from the command line.

    Reads four positional arguments from ``sys.argv``:

    1. number of topics (int)
    2. total number of iterations (int)
    3. burn-in (float)
    4. thinning (int)

    The first matrix returned by ``get_split_review_mats()`` is handed to the
    model; the last three arguments are forwarded to ``sample``.
    """
    cli = sys.argv
    num_topics = int(cli[1])
    num_iterations = int(cli[2])
    burn_in_value = float(cli[3])
    thin_step = int(cli[4])

    # Only the first of the two returned matrices is used here.
    rating_matrix, _ignored = get_split_review_mats()
    sampler = BayesianPoissonFactorization(
        0.3, 0.3, 1.0,
        0.3, 0.3, 1.0,
        num_topics, rating_matrix)
    sampler.sample(num_iterations, burn_in_value, thin_step)
def main():
    """Entry point: parse ``sys.argv`` and run the sampler.

    Expected positional arguments, in order: topics (int), total iterations
    (int), burn-in (float) and thinning (int).  The model is built on the
    first matrix returned by ``get_split_review_mats()`` and then sampled
    with the remaining three settings.
    """
    # Convert argv[1..4] in order with the matching type constructor.
    converters = (int, int, float, int)
    parsed = []
    for position, convert in enumerate(converters, 1):
        parsed.append(convert(sys.argv[position]))

    # The second returned matrix is not needed here.
    rating_matrix, _ignored = get_split_review_mats()

    model_args = (0.3, 0.3, 1.0, 0.3, 0.3, 1.0, parsed[0], rating_matrix)
    model = BayesianPoissonFactorization(*model_args)
    model.sample(parsed[1], parsed[2], parsed[3])
Beispiel #3
0
    def __init__(self, numTopics, alpha, beta, gamma):
        """Configure logging, load the ratings and draw initial topics.

        Every nonzero (user, movie) entry of the first matrix returned by
        ``get_split_review_mats()`` is given a random topic, and all count
        tables are incremented to match that initial assignment.

        Args:
            numTopics: Number of topics; sizes the topic axis of every table.
            alpha: Stored unchanged as ``self.alpha``.
            beta: Stored unchanged as ``self.beta``.
            gamma: Stored unchanged as ``self.gamma``.
        """
        # Setup logger
        self.log = logging.getLogger("Gibbs")
        self.log.setLevel(logging.DEBUG)
        # getLogger() hands back the same logger object for every instance, so
        # handlers are attached only once; otherwise each new instance would
        # duplicate every log line and open another file handle.
        if not self.log.handlers:
            formatter = logging.Formatter("%(asctime)s %(message)s",
                                          datefmt="%m/%d/%Y %I:%M:%S %p")
            fh = logging.handlers.TimedRotatingFileHandler("logs/gibbs.log",
                                                           when="D",
                                                           interval=1,
                                                           backupCount=10)
            ch = logging.StreamHandler()
            fh.setFormatter(formatter)
            ch.setFormatter(formatter)
            self.log.addHandler(fh)
            self.log.addHandler(ch)

        self.numTopics = numTopics
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma

        self.info = getMeta()

        self.user_movies, _ = get_split_review_mats()
        user_indices, movie_indices = self.user_movies.nonzero()
        # Materialise the pairs: on Python 3 a bare zip() is a one-shot
        # iterator that the loop below would leave empty for later use.
        self.user_movie_indices = list(zip(user_indices, movie_indices))

        num_users = self.info["users"]
        num_movies = self.info["movies"]

        # The builtin ``int`` is what the removed ``np.int`` alias pointed to.
        self.CountMT = np.zeros((num_movies, numTopics), dtype=int)
        self.CountRUT = np.zeros((6, num_users, numTopics),
                                 dtype=int)  # ratings 1-5 and 0
        self.CountUT = np.zeros((num_users, numTopics), dtype=int)
        self.topic_assignments = np.zeros((num_users, num_movies), dtype=int)

        # Normalization factors
        self.CountT = np.zeros(numTopics, dtype=int)
        self.CountU = np.zeros(num_users, dtype=int)
        self.CountRU = np.zeros((6, num_users), dtype=int)

        for userid, movieid in self.user_movie_indices:
            # NOTE(review): the upper bound is inclusive for random.randint
            # but exclusive for numpy.random.randint -- confirm which one is
            # imported by this module.
            topic = randint(0, numTopics - 1)
            # The rating is used directly as an index into the first axis of
            # the per-rating tables (assumes integer ratings -- TODO confirm).
            rating = self.user_movies[userid, movieid]

            self.CountMT[movieid, topic] += 1
            self.CountRUT[rating, userid, topic] += 1
            self.CountUT[userid, topic] += 1
            self.topic_assignments[userid, movieid] = topic

            self.CountT[topic] += 1
            self.CountU[userid] += 1
            self.CountRU[rating, userid] += 1
Beispiel #4
0
    def __init__(self, numTopics, alpha, beta, gamma):
        """Initialise the sampler state from the training ratings.

        Sets up the shared "Gibbs" logger, stores the arguments on the
        instance, loads the first matrix returned by
        ``get_split_review_mats()`` and assigns a random topic to each of its
        nonzero (user, movie) entries while filling the count tables.

        Args:
            numTopics: Number of topics; length of the topic axis of each
                count table.
            alpha: Kept as ``self.alpha``; not otherwise used here.
            beta: Kept as ``self.beta``; not otherwise used here.
            gamma: Kept as ``self.gamma``; not otherwise used here.
        """
        # Setup logger
        self.log = logging.getLogger("Gibbs")
        self.log.setLevel(logging.DEBUG)
        if not self.log.handlers:
            # The named logger is shared by all instances; adding handlers on
            # every construction would repeat each message once per instance.
            formatter = logging.Formatter("%(asctime)s %(message)s",
                                          datefmt="%m/%d/%Y %I:%M:%S %p")
            fh = logging.handlers.TimedRotatingFileHandler("logs/gibbs.log",
                                                           when="D",
                                                           interval=1,
                                                           backupCount=10)
            ch = logging.StreamHandler()
            for handler in (fh, ch):
                handler.setFormatter(formatter)
                self.log.addHandler(handler)

        self.numTopics = numTopics
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma

        self.info = getMeta()

        self.user_movies, _ = get_split_review_mats()
        user_indices, movie_indices = self.user_movies.nonzero()
        # Stored as a list so the pairs can be iterated more than once; a
        # plain zip() object on Python 3 is exhausted after a single pass.
        self.user_movie_indices = list(zip(user_indices, movie_indices))

        # ``dtype=int`` replaces the ``np.int`` alias, which newer NumPy
        # releases no longer provide; it names the same builtin type.
        self.CountMT = np.zeros((self.info["movies"], numTopics), dtype=int)
        self.CountRUT = np.zeros((6, self.info["users"], numTopics), dtype=int)  # ratings 1-5 and 0
        self.CountUT = np.zeros((self.info["users"], numTopics), dtype=int)
        self.topic_assignments = np.zeros((self.info["users"], self.info["movies"]), dtype=int)

        # Normalization factors
        self.CountT = np.zeros(numTopics, dtype=int)
        self.CountU = np.zeros(self.info["users"], dtype=int)
        self.CountRU = np.zeros((6, self.info["users"]), dtype=int)

        for userid, movieid in self.user_movie_indices:
            # NOTE(review): whether ``numTopics - 1`` itself can be drawn
            # depends on which ``randint`` is imported (stdlib: inclusive,
            # NumPy: exclusive) -- confirm.
            topic = randint(0, numTopics - 1)
            self.CountMT[movieid, topic] += 1
            # Presumably an integer rating in 0-5, since it indexes the
            # size-6 axis of the per-rating tables -- verify the matrix dtype.
            rating = self.user_movies[userid, movieid]
            self.CountRUT[rating, userid, topic] += 1
            self.CountUT[userid, topic] += 1
            self.topic_assignments[userid, movieid] = topic

            self.CountT[topic] += 1
            self.CountU[userid] += 1
            self.CountRU[rating, userid] += 1
Beispiel #5
0
def test_iid_users():
    """Return the RMSE of a per-movie mean-rating baseline.

    Each movie's prediction is the mean of its nonzero entries in the first
    matrix returned by ``get_split_review_mats()``.  The error is measured on
    the nonzero entries of the second matrix; entries whose movie has no
    nonzero training entry are skipped.

    Returns:
        The root-mean-square error as a float, or ``nan`` when no entry of
        the second matrix could be scored.
    """
    train, test = get_split_review_mats()

    # Builtin ``float`` is what the removed ``np.float`` alias referred to.
    rating_counts = (train != 0).sum(axis=0).astype(float)
    # Movies without training ratings give 0/0; the resulting NaN is the
    # marker tested below, so the accompanying warning is silenced.
    with np.errstate(divide="ignore", invalid="ignore"):
        avg_ratings = train.sum(axis=0) / rating_counts

    squared_error = 0.0
    count = 0
    # zip() behaves the same as izip() here and also exists on Python 3.
    for user, movie in zip(*test.nonzero()):
        predicted = avg_ratings[movie]

        if np.isnan(predicted):
            # The movie wasn't rated by any users in the training data set
            continue

        squared_error += (predicted - test[user, movie]) ** 2
        count += 1

    if count == 0:
        # Nothing was scored, so the mean is undefined; avoid dividing by 0.
        return float("nan")
    return math.sqrt(squared_error / count)
Beispiel #6
0
def test_iid_users():
    """Evaluate the "average rating per movie" predictor by RMSE.

    The two matrices come from ``get_split_review_mats()``.  Column means
    over the nonzero entries of the first one serve as predictions for the
    nonzero entries of the second one.  Movies with no nonzero entry in the
    first matrix have no prediction and are left out of the score.

    Returns:
        Root-mean-square error over the scored entries, or ``nan`` if there
        are none.
    """
    train, test = get_split_review_mats()

    # ``float`` replaces the ``np.float`` alias dropped by recent NumPy.
    num_ratings = (train != 0).sum(axis=0).astype(float)
    with np.errstate(divide="ignore", invalid="ignore"):
        # 0/0 for never-rated movies yields NaN on purpose (checked below).
        avg_ratings = train.sum(axis=0) / num_ratings

    sum_squared_error = 0.0
    scored = 0
    test_users, test_movies = test.nonzero()
    # Builtin zip() is used instead of the Python 2-only itertools.izip.
    for user, movie in zip(test_users, test_movies):
        predicted = avg_ratings[movie]
        if np.isnan(predicted):
            # The movie wasn't rated by any users in the training data set
            continue

        true_rating = test[user, movie]
        sum_squared_error += (predicted - true_rating)**2
        scored += 1

    # Guard the division: with nothing scored the RMSE is undefined.
    if not scored:
        return float("nan")
    return math.sqrt(sum_squared_error / scored)
Beispiel #7
0
def top_recommendations_iid(top_n=1000):
    """Score a "highest average rating" recommender by mean user precision.

    The ``top_n`` movies with the highest mean nonzero rating in the first
    matrix returned by ``get_split_review_mats()`` form the recommended set.
    For every user with at least one nonzero entry in the second matrix, the
    fraction of those movies that lie in the recommended set is computed; the
    result is the mean of these fractions.

    Args:
        top_n: Size of the recommended set (defaults to 1000).

    Returns:
        The mean per-user fraction as a float, or ``nan`` when no user has a
        nonzero entry in the second matrix.
    """
    info = getMeta()
    train, reviews = get_split_review_mats()

    # Builtin ``float`` is what the removed ``np.float`` alias referred to.
    rating_counts = (train != 0).sum(axis=0).astype(float)
    with np.errstate(divide="ignore", invalid="ignore"):
        avg_ratings = train.sum(axis=0) / rating_counts

    # Movies nobody rated have a NaN mean.  NaN compares false with
    # everything, which makes the sort order ill-defined, so drop them first.
    ranked = sorted(((movie, rating)
                     for movie, rating in enumerate(avg_ratings)
                     if not np.isnan(rating)),
                    key=lambda pair: pair[1])
    # ``ranked[-0:]`` would be the whole list, hence the explicit guard.
    top_movies = set(movie for movie, _ in ranked[-top_n:]) if top_n > 0 else set()

    precision = 0.0
    num_users = 0
    for user in range(info["users"]):
        movies = reviews[user, :].nonzero()[0]
        if len(movies) == 0:
            continue
        hits = sum(1 for movie in movies if movie in top_movies)
        # float() keeps this a true division on Python 2 as well.
        precision += hits / float(len(movies))
        num_users += 1

    # The return belongs after the loop: inside it, only the first user was
    # ever scored (or a ZeroDivisionError was raised if that user was empty).
    if num_users == 0:
        return float("nan")
    return precision / num_users
Beispiel #8
0
def top_recommendations_iid(top_n=1000):
    """Return the mean per-user precision of a popularity-style recommender.

    Movies are ranked by their mean nonzero rating in the first matrix from
    ``get_split_review_mats()`` and the best ``top_n`` are recommended to
    everyone.  A user's precision is the share of the movies with a nonzero
    entry for that user in the second matrix that are in the recommended set.
    Users without any such movie are not counted.

    Args:
        top_n: How many of the best-rated movies to recommend (default 1000).

    Returns:
        Average of the per-user precisions, or ``nan`` if no user could be
        counted.
    """
    info = getMeta()
    train, reviews = get_split_review_mats()

    # ``float`` replaces the ``np.float`` alias dropped by recent NumPy.
    num_ratings = (train != 0).sum(axis=0).astype(float)
    with np.errstate(divide="ignore", invalid="ignore"):
        # Unrated movies divide 0 by 0 and come out as NaN.
        avg_ratings = train.sum(axis=0) / num_ratings

    # Leave NaN means out of the ranking; sorting them gives arbitrary order.
    rated_movies = [
        (movie, rating)
        for movie, rating in enumerate(avg_ratings)
        if not np.isnan(rating)
    ]
    rated_movies.sort(key=lambda x: x[1])
    if top_n > 0:
        top_movies = set(movie for movie, rating in rated_movies[-top_n:])
    else:
        # A slice of [-0:] would select every movie instead of none.
        top_movies = set()

    precision = 0.0
    num_users = 0
    for user in range(info["users"]):
        movies = reviews[user, :].nonzero()[0]
        user_precision = 0.0
        for movie in movies:
            if movie in top_movies:
                user_precision += 1
        if len(movies) > 0:
            num_users += 1
            precision += (user_precision / len(movies))

    # Previously this return sat inside the loop body, so the function left
    # after the first user and divided by zero when that user had no movies.
    if num_users == 0:
        return float("nan")
    return precision / num_users