Example 1
import numpy as np
from numpy.linalg import norm  # assumed source of the norm() used below

import rechelp  # project-local helper module; provides sum_components

def predict(user_info, content_array, num_partitions=30):
    """
    Creates a user preference profile by determining the rating of a particular vector item
    For example if we are looking at movies and a user highly rates sci-fi movies over drama, then the sci-fi row will be a higher number than the drama row
    Then this user preference vector is used to determine a prediction rating for each product

    There needs to be some cleaning still to renormalize the final answer so that predictions go from 0-1
    Something similar to this is shown in sum_components

    Args:
        user_info: user rdd of ratings (or interactions) which should be in the format of (user, item, rating)
        content_array: content feature array of the items which should be in the format of (item [content_feature vector])

    Returns:
        predictions_norm: an rdd which is in the format of (user, item, predicted_rating) normalized to be between 0 and the max prediction
    """

    # re-key ratings by item so they can be joined with the item content vectors
    user_keys = user_info.map(lambda (user, page, value): (page, (user, value)))
    user_prefs = content_array.join(user_keys).groupBy(lambda (page, ((array), (user, rating))): user)\
        .map(lambda (user, array): (user, rechelp.sum_components(array)))

    # ensure that there are no all-zero user preference or content vectors - these make the predictions nan
    user_prefs = user_prefs.filter(lambda (u_id, user_vect): any(v != 0 for v in user_vect))
    content_array = content_array.filter(lambda (c_id, cont_vect): any(v != 0 for v in cont_vect))

    max_rating = user_info.map(lambda (user, item, rating): rating).max()
    min_rating = user_info.map(lambda (user, item, rating): rating).min()

    # if every rating is identical, fall back to a 0..max_rating scale
    if max_rating == min_rating:
        min_rating = 0

    diff_ratings = float(max_rating - min_rating)

    # predicted rating = cosine similarity between the user preference vector and each item vector
    predictions = user_prefs.cartesian(content_array).map(lambda ((user_id, user_vect), (page_id, item_vector)):
            (user_id, page_id, np.dot(user_vect, item_vector) / (norm(item_vector) * norm(user_vect)))).coalesce(num_partitions)

    max_pred = predictions.map(lambda (user, item, pred): pred).max()
    min_pred = predictions.map(lambda (user, item, pred): pred).min()

    diff_pred = float(max_pred - min_pred)

    # rescale the raw cosine scores linearly onto the original rating range
    norm_predictions = predictions.map(lambda (user, item, pred): (user, item,
                    (pred - min_pred) * float(diff_ratings / diff_pred) + min_rating))

    return norm_predictions
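
The helper rechelp.sum_components is referenced but never shown on this page. Judging from how it is called above, and from the docstring of the k-means variants below ("weighting the item vectors by the user's rating of the item and summing them"), a minimal sketch could look like the following. The name is real; the body is a reconstruction under those assumptions, not the project's actual implementation.

import numpy as np

def sum_components(records):
    # records is one groupBy group from above: an iterable of
    # (item, (content_vector, (user, rating))) tuples for a single user
    profile = None
    for item, (vector, (user, rating)) in records:
        # weight each item's content vector by the user's rating of it
        weighted = rating * np.asarray(vector, dtype=float)
        profile = weighted if profile is None else profile + weighted
    return profile
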
Example 2
import numpy as np
from numpy.linalg import norm  # assumed source of the norm() used below

import rechelp  # project-local helper module; provides sum_components

def predict(user_info, content_array, num_partitions=30):
    """
    Creates a user preference profile by determining the rating of a particular vector item
    For example if we are looking at movies and a user highly rates sci-fi movies over drama, then the sci-fi row will be a higher number than the drama row
    Then this user preference vector is used to determine a prediction rating for each product

    There needs to be some cleaning still to renormalize the final answer so that predictions go from 0-1
    Something similar to this is shown in sum_components

    Args:
        user_info: user rdd of ratings (or interactions) which should be in the format of (user, item, rating)
        content_array: content feature array of the items which should be in the format of (item [content_feature vector])

    Returns:
        predictions_norm: an rdd which is in the format of (user, item, predicted_rating) normalized to be between 0 and the max prediction
    """

    user_keys = user_info.map(lambda (user, page, value): (page, (user, value)))
    user_prefs = content_array.join(user_keys).groupBy(lambda (page, ((array), (user, rating))): user)\
        .map(lambda (user, array): (user, rechelp.sum_components(array)))

    # ensure that there are no all-zero user preference or content vectors - these make the predictions nan
    # (this variant assumes non-negative features: a strictly positive sum implies a nonzero vector)
    user_prefs = user_prefs.filter(lambda (u_id, user_vect): sum(user_vect) > 0.0)
    content_array = content_array.filter(lambda (c_id, cont_vect): sum(cont_vect) > 0.0)

    max_rating = user_info.map(lambda (user, item, rating): rating).max()
    min_rating = user_info.map(lambda (user, item, rating): rating).min()

    if max_rating == min_rating:
        min_rating = 0

    diff_ratings = float(max_rating - min_rating)

    predictions = user_prefs.cartesian(content_array).map(lambda ((user_id, user_vect), (page_id, item_vector)):\
            (user_id, page_id, np.dot(user_vect, item_vector)/(norm(item_vector)*norm(user_vect)))).coalesce(num_partitions)

    max_pred = predictions.map(lambda (user, item, pred): pred).max()
    min_pred = predictions.map(lambda (user, item, pred): pred).min()

    diff_pred = float(max_pred - min_pred)

    norm_predictions = predictions.map(lambda (user, item, pred): (user, item,
                    (pred - min_pred) * float(diff_ratings / diff_pred) + min_rating))

    return norm_predictions
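
This example differs from Example 1 only in its zero-vector filter: it keeps a vector when its component sum is strictly positive rather than when any component is nonzero. The two checks agree only for non-negative feature vectors, as this toy comparison shows:

import numpy as np

vec = np.array([1.0, -1.0, 0.0])   # nonzero vector whose components sum to 0

keep_example_1 = any(v != 0 for v in vec)   # True: not all components are zero
keep_example_2 = sum(vec) > 0.0             # False: the sum is not strictly positive

print(keep_example_1, keep_example_2)
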
Example 3
from pyspark.mllib.clustering import KMeans  # assumed source of the KMeans.train used below

import rechelp  # project-local helper module; provides sum_components and friends

def predict(user_info, content_array, num_predictions, k=10):
    """Predict ratings for items using a k-means clustering content based
    algorithm designed to increase the diversity of recommended items.

    User profiles are generated by weighting the item vectors by the user's
    rating of the item and summing them.

    The clustering is performed on the item vectors. Items are then drawn from
    these clusters in proportion to the clusters prevalence in the dataset.

    Args:
        user_info (rdd): in the format of (user, item, rating)
        content_array (rdd): content feature array of the items which should be in
            the format of (item, [content_feature vector])
        num_predictions (int): Number of predictions to return

    Returns:
        rdd: in the format of (user, item, predicted_rating)
    """
    # Extract the vectors from the content array
    vectors = content_array.values()
    cluster_model = KMeans.train(vectors, k)
    clustered_content = content_array\
        .map(lambda (item, vector): (cluster_model.predict(vector), (item, vector)))

    # Calculate the fraction of recommendations to make from each cluster
    counts = clustered_content.countByKey()
    total = sum(counts.itervalues())
    fractions = {}
    for cluster, count in counts.iteritems():
        fractions[cluster] = round(float(count) / total, 2)

    # Make the user profiles
    user_keys = user_info.map(lambda (user, item, rating): (item, (user, rating)))
    user_prefs = content_array\
        .join(user_keys)\
        .groupBy(lambda (item, ((item_vector), (user, rating))): user)\
        .map(lambda (user, array): (user, rechelp.sum_components(array)))

    # Make predictions
    max_rating = user_info.map(lambda (user, item, rating): rating).max()
    min_rating = user_info.map(lambda (user, item, rating): rating).min()
    content_and_profiles = clustered_content.cartesian(user_prefs)
    predictions_with_clusters = content_and_profiles\
        .map(
            lambda (
                (cluster, (item, item_vector)),
                (user, user_vector)
            ): (
                user,
                cluster,
                item,
                round(rechelp.dot_product_predict_ratings(user_vector, item_vector, minimum=min_rating, maximum=max_rating), 3)
            )
        )

    clustered_predictions = predictions_with_clusters\
        .groupBy(lambda (user, cluster, item, rating): (user, cluster))\
        .flatMap(lambda row: rechelp.sort_and_cut_by_cluster(row, num_predictions, fractions))\
        .map(lambda (user, rating, item): (user, item, rating))

    return clustered_predictions
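
rechelp.sort_and_cut_by_cluster is likewise not shown. From the call above, it receives one groupBy row of the form ((user, cluster), iterable of (user, cluster, item, rating)) together with num_predictions and the fractions dict, and it must yield (user, rating, item) tuples. A hypothetical reconstruction under those assumptions:

def sort_and_cut_by_cluster(row, num_predictions, fractions):
    # row comes from groupBy: ((user, cluster), iterable of (user, cluster, item, rating))
    (user, cluster), preds = row
    # each cluster contributes recommendations in proportion to its prevalence
    cut = int(round(num_predictions * fractions.get(cluster, 0)))
    # keep the user's highest-rated items within this cluster
    ranked = sorted(preds, key=lambda p: p[3], reverse=True)
    return [(u, rating, item) for (u, c, item, rating) in ranked[:cut]]

The other helper used here, rechelp.dot_product_predict_ratings, is also unshown; from its keyword arguments it presumably maps the user/item dot product onto the [minimum, maximum] rating range.
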
Example 4
import numpy as np
from numpy.linalg import norm  # assumed source of the norm() used below
from pyspark.mllib.clustering import KMeans  # assumed source of the KMeans.train used below

import rechelp  # project-local helper module; provides sum_components and friends

def predict(user_info,
            content_array,
            num_predictions,
            k=10,
            num_partitions=20):
    """Predict ratings for items using a k-means clustering content based
    algorithm designed to increase the diversity of recommended items.

    User profiles are generated by weighting the item vectors by the user's
    rating of the item and summing them.

    The clustering is performed on the item vectors. Items are then drawn from
    these clusters in proportion to the clusters prevalence in the dataset.

    Args:
        user_info (rdd): in the format of (user, item, rating)
        content_array (rdd): content feature array of the items which should be in
            the format of (item, [content_feature vector])
        num_predictions (int): Number of predictions to return

    Returns:
        rdd: in the format of (user, item, predicted_rating)
    """
    # Extract the vectors from the content array
    vectors = content_array.values()
    cluster_model = KMeans.train(vectors, k)
    clustered_content = content_array\
        .map(lambda (item, vector): (cluster_model.predict(vector), (item, vector)))

    # Calculate the fraction of recommendations to make from each cluster
    counts = clustered_content.countByKey()
    total = sum(counts.itervalues())
    fractions = {}
    for cluster, count in counts.iteritems():
        fractions[cluster] = round(float(count) / total, 2)

    # Make the user profiles
    user_keys = user_info.map(lambda (user, item, rating): (item,
                                                            (user, rating)))
    user_prefs = content_array\
        .join(user_keys)\
        .groupBy(lambda (item, ((item_vector), (user, rating))): user)\
        .map(lambda (user, array): (user, rechelp.sum_components(array)))

    # ensure that there are no all-zero user preference or content vectors - these make the predictions nan
    user_prefs = user_prefs.filter(
        lambda (u_id, user_vect): any(v != 0 for v in user_vect))
    clustered_content = clustered_content.filter(
        lambda (cluster, (item, item_vector)): any(v != 0 for v in item_vector))

    # Make predictions
    max_rating = user_info.map(lambda (user, item, rating): rating).max()
    min_rating = user_info.map(lambda (user, item, rating): rating).min()
    diff_ratings = max_rating - min_rating
    content_and_profiles = clustered_content.cartesian(user_prefs).coalesce(
        num_partitions)
    predictions_with_clusters = content_and_profiles\
        .map(
            lambda (
                (cluster, (item, item_vector)),
                (user, user_vector)
            ): (
                user,
                cluster,
                item,
                round(np.dot(user_vector, item_vector)/(norm(item_vector)*norm(user_vector)), 3)
            )
        )

    clustered_predictions = predictions_with_clusters\
        .groupBy(lambda (user, cluster, item, rating): (user, cluster))\
        .flatMap(lambda row: rechelp.sort_and_cut_by_cluster(row, num_predictions, fractions))\
        .map(lambda (user, rating, item): (user, item, rating))

    max_pred = clustered_predictions.map(lambda (user, item, pred): pred).max()
    min_pred = clustered_predictions.map(lambda (user, item, pred): pred).min()

    diff_pred = float(max_pred - min_pred)

    # rescale the raw cosine scores linearly onto the original rating range
    norm_predictions = clustered_predictions.map(lambda (user, item, pred): (user, item,
                    (pred - min_pred) * float(diff_ratings / diff_pred) + min_rating))

    return norm_predictions
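
The final rescaling maps the raw cosine scores linearly onto the observed rating range. A short worked example with made-up numbers:

min_rating, max_rating = 1.0, 5.0        # observed rating range
diff_ratings = max_rating - min_rating   # 4.0

min_pred, max_pred = 0.10, 0.90          # raw cosine-score range
diff_pred = max_pred - min_pred          # 0.8

pred = 0.50                              # one raw score
rescaled = (pred - min_pred) * (diff_ratings / diff_pred) + min_rating
print(rescaled)                          # 3.0, the middle of the rating scale
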
Example 5
import numpy as np
from numpy.linalg import norm  # assumed source of the norm() used below
from pyspark.mllib.clustering import KMeans  # assumed source of the KMeans.train used below

import rechelp  # project-local helper module; provides sum_components and friends

def predict(user_info, content_array, num_predictions, k=10, num_partitions=20):
    """Predict ratings for items using a k-means clustering content based
    algorithm designed to increase the diversity of recommended items.

    User profiles are generated by weighting the item vectors by the user's
    rating of the item and summing them.

    The clustering is performed on the item vectors. Items are then drawn from
    these clusters in proportion to the clusters prevalence in the dataset.

    Args:
        user_info (rdd): in the format of (user, item, rating)
        content_array (rdd): content feature array of the items which should be in
            the format of (item, [content_feature vector])
        num_predictions (int): Number of predictions to return

    Returns:
        rdd: in the format of (user, item, predicted_rating)
    """
    # Extract the vectors from the content array
    vectors = content_array.values()
    cluster_model = KMeans.train(vectors, k)
    clustered_content = content_array\
        .map(lambda (item, vector): (cluster_model.predict(vector), (item, vector)))

    # Calculate the fraction of recommendations to make from each cluster
    counts = clustered_content.countByKey()
    total = sum(counts.itervalues())
    fractions = {}
    for cluster, count in counts.iteritems():
        fractions[cluster] = round(float(count) / total, 2)

    # Make the user profiles
    user_keys = user_info.map(lambda (user, item, rating): (item, (user, rating)))
    user_prefs = content_array\
        .join(user_keys)\
        .groupBy(lambda (item, ((item_vector), (user, rating))): user)\
        .map(lambda (user, array): (user, rechelp.sum_components(array)))

    # ensure that there are no all-zero user preference or content vectors - these make the predictions nan
    user_prefs = user_prefs.filter(lambda (u_id, user_vect): any(v != 0 for v in user_vect))
    clustered_content = clustered_content.filter(lambda (cluster, (item, item_vector)): any(v != 0 for v in item_vector))

    # Make predictions
    max_rating = user_info.map(lambda (user, item, rating): rating).max()
    min_rating = user_info.map(lambda (user, item, rating): rating).min()
    diff_ratings = max_rating - min_rating
    content_and_profiles = clustered_content.cartesian(user_prefs).coalesce(num_partitions)
    predictions_with_clusters = content_and_profiles\
        .map(
            lambda (
                (cluster, (item, item_vector)),
                (user, user_vector)
            ): (
                user,
                cluster,
                item,
                round(np.dot(user_vector, item_vector)/(norm(item_vector)*norm(user_vector)), 3)
            )
        )

    clustered_predictions = predictions_with_clusters\
        .groupBy(lambda (user, cluster, item, rating): (user, cluster))\
        .flatMap(lambda row: rechelp.sort_and_cut_by_cluster(row, num_predictions, fractions))\
        .map(lambda (user, rating, item): (user, item, rating))

    max_pred = clustered_predictions.map(lambda (user, item, pred): pred).max()
    min_pred = clustered_predictions.map(lambda (user, item, pred): pred).min()

    diff_pred = float(max_pred - min_pred)

    norm_predictions = clustered_predictions.map(lambda (user, item, pred): (user, item,
                    (pred - min_pred) * float(diff_ratings / diff_pred) + min_rating))

    return norm_predictions
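
To make the expected input shapes concrete, here is a hypothetical invocation on a toy dataset. It assumes an existing SparkContext sc and the rechelp helpers; the item names and feature values are made up.

user_info = sc.parallelize([
    (1, 'item_a', 5.0),
    (1, 'item_b', 1.0),
    (2, 'item_a', 3.0),
    (2, 'item_c', 4.0),
])
content_array = sc.parallelize([
    ('item_a', [1.0, 0.0, 1.0]),   # e.g. genre indicator features
    ('item_b', [0.0, 1.0, 0.0]),
    ('item_c', [1.0, 1.0, 0.0]),
])

predictions = predict(user_info, content_array, num_predictions=2, k=2)
print(predictions.collect())   # [(user, item, predicted_rating), ...]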