Beispiel #1
0
def main(ratings_components=300, features_components=300, print_scores=False):
    """Run a 5-fold evaluation of the autoencoder recommender.

    SVD-reduces the book feature matrix and, per fold, the ratings
    matrix; joins them into item vectors; trains a fresh BookEncoder on
    each fold and scores it on the held-out entries.

    Args:
        ratings_components: SVD rank for the per-fold ratings matrix.
        features_components: SVD rank for the book feature matrix.
        print_scores: if True, print per-fold and mean evaluations.

    Returns:
        numpy array of shape (2,): the two scores averaged over folds.
    """
    # Fixed seeds so folds and TF initialization are reproducible.
    np.random.seed(42)
    tf.set_random_seed(1984)

    data_path = '../data/goodbooks-10k/'
    features = get_book_features(get_book_dataframe(data_path))
    reduced_item_features, _, _ = reduce_matrix(
        features, n_components=features_components)

    goodreads_path = '../data/goodbooks-10k/ratings.csv'
    amazon_path = '../data/amazon/ratings_amazon.csv'
    spr = get_ratings(goodreads_path, amazon_path, min_amazon_items=6)

    n_folds = 5
    fold_scores = np.zeros((n_folds, 2))
    splitter = ColumnwiseKFold(n_folds, random_seed=30)
    for fold, (X, (user_indices, item_indices)) in enumerate(splitter.split(spr)):
        # Re-reduce ratings on the fold's training matrix only.
        _, _, rating_VT = reduce_matrix(X, n_components=ratings_components)
        joint_items = get_reduced_joint(rating_VT.T, reduced_item_features)
        # TF 1.x: wipe the previous fold's graph before building a new model.
        tf.reset_default_graph()
        encoder = BookEncoder(user_input_dim=10000,
                              book_input_dim=joint_items.shape[1],
                              user_hidden=150, book_hidden=150)
        with tf.Session() as sess:
            encoder.initialize(sess)
            encoder.train(sess, X, joint_items)
            fold_scores[fold, :] = encoder.test(
                sess, spr, X, joint_items, user_indices, item_indices)
            if print_scores:
                print_evaluation(fold_scores[fold, 0], fold_scores[fold, 1])

    mean_scores = np.mean(fold_scores, axis=0)
    if print_scores:
        print('{0:d}-Fold Scores:'.format(n_folds))
        print_evaluation(mean_scores[0], mean_scores[1])

    return mean_scores
Beispiel #2
0
def main(ratings_components=100, features_components=100, print_scores=False):
    """Run a 5-fold evaluation of the cosine-similarity recommender.

    SVD-reduces the book feature matrix and, per fold, the ratings
    matrix; joins them into item vectors; scores each fold with a
    cosine-similarity model (rescaled to [0, 1]).

    Args:
        ratings_components: SVD rank for the per-fold ratings matrix.
        features_components: SVD rank for the book feature matrix.
        print_scores: if True, print per-fold and mean evaluations.

    Returns:
        numpy array of shape (2,): the two scores averaged over folds.
    """
    #data_path = '../data/goodbooks-10k/'
    data_path = '../../goodbooks-10k/'
    book_features = get_book_features(get_book_dataframe(data_path))
    reduced_item_features, _, _ = reduce_matrix(
        book_features, n_components=features_components)

    goodreads_path = data_path + 'ratings.csv'
    amazon_path = data_path + 'ratings_amazon.csv'
    spr = get_ratings(goodreads_path, amazon_path, min_amazon_items=6)

    n_folds = 5
    scores = np.zeros((n_folds, 2))
    kf = ColumnwiseKFold(n_folds, random_seed=30)
    # Fixed typo: 'user_incides' -> 'user_indices' (local name only).
    for i, (X, (user_indices, item_indices)) in enumerate(kf.split(spr)):
        _, _, rating_VT = reduce_matrix(X, n_components=ratings_components)
        reduced_item_ratings = rating_VT.T
        items = get_reduced_joint(reduced_item_ratings, reduced_item_features)
        # Cosine similarity is in [-1, 1]; shift/scale into [0, 1].
        sim = (cosine_similarity(items) + 1) / 2
        scores[i, :] = evaluate(spr, X, sim, user_indices, item_indices)
        if print_scores:
            print_evaluation(scores[i, 0], scores[i, 1])

    scores = np.mean(scores, axis=0)
    if print_scores:
        # Bug fix: the original printed the literal '{0:d}-Fold Scores:'
        # because .format(n_folds) was never called.
        print('{0:d}-Fold Scores:'.format(n_folds))
        print_evaluation(scores[0], scores[1])

    return scores
Beispiel #3
0
def get_user_vector(user_input):
    """Return a length-10000 rating vector for a Goodreads user.

    Tries a local cache first ('../.tmp/user_<input>.npy'); on a cache
    miss, fetches the user's 'read' shelf from the Goodreads API page by
    page, maps goodreads book ids to dense book ids, remaps the 1-5 star
    scale to a negative/positive scale, caches the vector, and returns it.

    Args:
        user_input: Goodreads user id (digits) or username.

    Returns:
        numpy int array of shape (10000,), or None if the username
        cannot be resolved to a user id.
    """
    try:
        q = np.load('../.tmp/user_'+user_input+'.npy')
        print('found user_vector...')
        return q
    # Bug fix: was a bare 'except:' that swallowed every exception
    # (including KeyboardInterrupt). Only cache-miss / unreadable-file
    # errors should fall through to the API path.
    except (OSError, ValueError):
        # Set this to where you save and load all data
        data_path = '../../goodbooks-10k/'

        # Get dataframe from books
        books = get_book_dataframe(data_path)

        # Maps goodreads book ids -> dense 1-based book ids.
        mapper = get_mapper(data_path + 'books.csv')

        # make an array for myself
        # Bug fix: np.int was removed in NumPy 1.24; plain int is the
        # equivalent (platform default integer) dtype.
        q = np.zeros((10000), dtype=int)

        # username = secret.USERNAME
        api_key = secret.API_KEY

        if not user_input.isdigit():
            user_id = get_id_from_username(user_input, api_key)
        else:
            user_id = user_input

        if user_id is None:
            return None

        # Page through the user's 'read' shelf, 200 reviews at a time.
        page = 1
        while True:
            response = requests.get('https://www.goodreads.com/review/list/?v=2&id='+user_id+'&shelf=read&format=xml&key='+api_key+'&per_page=200&page=' + str(page))
            tree = ElementTree.fromstring(response.content)
            reviews = tree.find('reviews')
            for review in reviews:
                goodreads_book_id = str(review.find('book').find('id').text)
                if goodreads_book_id in mapper:
                    # mapper ids are 1-based; q is 0-based.
                    book_id = int(mapper[goodreads_book_id])
                    rating = int(review.find('rating').text)
                    q[book_id-1] = float(rating)
            page += 1

            print(len(reviews))
            if len(reviews) < 1:
                break

        # Echo the rated titles for the user.
        for i in range(len(q)):
            if q[i] != 0:
                title = books.iloc[i]['title']
                print("%s --> %s" % (q[i], title))

        # Turn 1-5 rating scale into negative - positive scale
        ratings_mapper = {0:0, 1:-2, 2:-1, 3:1, 4:2, 5:3}
        for i in range(len(q)):
            q[i] = ratings_mapper[q[i]]

        print('saving user_vector...')
        np.save('../.tmp/user_'+user_input, q)

        return q
Beispiel #4
0
def main():
    """Sample program to verify the code.

    Loads the book features, preprocesses them, reduces them to 100
    dimensions with SVD, and prints the ten largest singular values.
    """
    # Set this to where you save and load all data
    # data_path = '../data/goodbooks-10k/'
    data_path = '../../goodbooks-10k/'
    features = get_book_features(get_book_dataframe(data_path))
    _, singular_values, _ = reduce_matrix(features, 100, random_state=42)
    print(singular_values[:10])
Beispiel #5
0
def main():
    """Sample program to verify the code.

    Loads and joins the goodreads and amazon ratings with the book
    features, then prints the shape of the joint matrix.
    """
    # Set this to where you save and load all data
    # data_path = '../data/goodbooks-10k/'
    data_path = '../../goodbooks-10k/'
    ratings = get_ratings(data_path + 'ratings.csv',
                          data_path + 'ratings_amazon.csv')

    book_features = get_book_features(get_book_dataframe(data_path))

    joint = get_joint(ratings.T, book_features, 30, 30)
    print(joint.shape)