Esempio n. 1
0
def main(train_file, user_item_side_information_file, hierarchy_file,
         test_file):
    """Train SLIM with a hierarchical LDA model and report precision.

    Before training, a temporary diagnostic (marked REMOVE_IT by the original
    author) measures, for users 1..100, how many of each user's selected
    topics coincide with the selected topics of the cities found in that
    user's test row. The result is dumped to /tmp/coincidencias.json as a
    list of ``[coinc, topics_compared, perc]`` entries, one per user.

    Args:
        train_file: path handed to ``tsv_to_matrix`` for the training matrix.
        user_item_side_information_file: path handed to ``tsv_to_matrix`` for
            the side-information matrix given to ``LDAHierarquical``.
        hierarchy_file: path of a JSON file describing the hierarchy.
        test_file: path of the test data; used by the diagnostic and by
            ``compute_precision``.
    """
    A = tsv_to_matrix(train_file)
    B = tsv_to_matrix(user_item_side_information_file)
    # Context manager so the file handle is released even if parsing fails.
    with open(hierarchy_file) as hierarchy_fp:
        hierarchy = json.loads(hierarchy_fp.read())

    lda = LDAHierarquical(B, hierarchy, topics=15)

    ##### REMOVE_IT: temporary diagnostic
    def important_topics(x, topics):
        """Return the indices of the `topics` lowest-valued entries of x.

        Falsy input (e.g. an empty list) is returned unchanged.
        NOTE(review): the sort is ascending, so the *smallest* values are
        kept -- confirm that this is the intended notion of "important".
        """
        if not x:
            return x
        # key= replaces the Python-2-only cmp= argument; sorted() is stable,
        # so ties keep their original order exactly as before.
        ranked = sorted(enumerate(x), key=lambda pair: pair[1])
        return [index for index, _ in ranked[:topics]]

    topics = 3

    # The test matrix does not depend on the user, so load it only once.
    T = tsv_to_matrix(test_file)

    coincidencias = []
    for user in range(1, 101):
        # Topics of the current user
        user_topics = important_topics(lda.model['users'][user], topics)

        # Topics of the cities in the current user's test row
        cities = T[user].nonzero()[0]
        cities_topics = [
            important_topics(lda.model['cities'].get(city, []), topics)
            for city in cities
        ]

        topics_compared = 0
        coinc = 0
        for city_topic in cities_topics:
            if not city_topic:
                # Cities without topics are not part of the comparison.
                continue
            coinc += len(set(user_topics) & set(city_topic))
            topics_compared += len(user_topics)

        # Guard on the denominator itself: -1 flags "nothing to compare" and
        # also avoids a ZeroDivisionError when the user has no topics.
        if topics_compared:
            perc = coinc / float(topics_compared)
        else:
            perc = -1

        coincidencias.append([coinc, topics_compared, perc])

    with open('/tmp/coincidencias.json', 'w') as out:
        out.write(json.dumps(coincidencias))
    #####

    W = slim_train(A)

    recommendations = slim_lda_recommender(A, W, lda)

    compute_precision(recommendations, test_file)
Esempio n. 2
0
def main(train_file, user_item_side_information_file, hierarchy_file, test_file):
    """Train SLIM with a hierarchical LDA model and report precision.

    Before training, a temporary diagnostic (marked REMOVE_IT by the original
    author) measures, for users 1..100, how many of each user's selected
    topics coincide with the selected topics of the cities found in that
    user's test row. The result is dumped to /tmp/coincidencias.json as a
    list of ``[coinc, topics_compared, perc]`` entries, one per user.

    Args:
        train_file: path handed to ``tsv_to_matrix`` for the training matrix.
        user_item_side_information_file: path handed to ``tsv_to_matrix`` for
            the side-information matrix given to ``LDAHierarquical``.
        hierarchy_file: path of a JSON file describing the hierarchy.
        test_file: path of the test data; used by the diagnostic and by
            ``compute_precision``.
    """
    A = tsv_to_matrix(train_file)
    B = tsv_to_matrix(user_item_side_information_file)
    # Context manager so the file handle is released even if parsing fails.
    with open(hierarchy_file) as hierarchy_fp:
        hierarchy = json.loads(hierarchy_fp.read())

    lda = LDAHierarquical(B, hierarchy, topics=15)

    ##### REMOVE_IT: temporary diagnostic
    def important_topics(x, topics):
        """Return the indices of the `topics` lowest-valued entries of x.

        Falsy input (e.g. an empty list) is returned unchanged.
        NOTE(review): the sort is ascending, so the *smallest* values are
        kept -- confirm that this is the intended notion of "important".
        """
        if not x:
            return x
        # key= replaces the Python-2-only cmp= argument; sorted() is stable,
        # so ties keep their original order exactly as before.
        ranked = sorted(enumerate(x), key=lambda pair: pair[1])
        return [index for index, _ in ranked[:topics]]

    topics = 3

    # The test matrix does not depend on the user, so load it only once.
    T = tsv_to_matrix(test_file)

    coincidencias = []
    for user in range(1, 101):
        # Topics of the current user
        user_topics = important_topics(lda.model['users'][user], topics)

        # Topics of the cities in the current user's test row
        cities = T[user].nonzero()[0]
        cities_topics = [
            important_topics(lda.model['cities'].get(city, []), topics)
            for city in cities
        ]

        topics_compared = 0
        coinc = 0
        for city_topic in cities_topics:
            if not city_topic:
                # Cities without topics are not part of the comparison.
                continue
            coinc += len(set(user_topics) & set(city_topic))
            topics_compared += len(user_topics)

        # Guard on the denominator itself: -1 flags "nothing to compare" and
        # also avoids a ZeroDivisionError when the user has no topics.
        if topics_compared:
            perc = coinc / float(topics_compared)
        else:
            perc = -1

        coincidencias.append([coinc, topics_compared, perc])

    with open('/tmp/coincidencias.json', 'w') as out:
        out.write(json.dumps(coincidencias))
    #####

    W = slim_train(A)

    recommendations = slim_lda_recommender(A, W, lda)

    compute_precision(recommendations, test_file)
Esempio n. 3
0
def main(train_file, part_file, test_file):
    """Sweep the blend between two SLIM models and evaluate each mix.

    One SLIM weight matrix is trained on ``train_file`` and another on
    ``part_file``. For each weight 0.0, 0.1, ..., 1.0 the two matrices are
    linearly combined, recommendations are produced for the ``part_file``
    matrix, and ``compute_precision`` is called against ``test_file``.

    Args:
        train_file: path of the first training matrix.
        part_file: path of the second (partition) training matrix.
        test_file: path of the test data handed to ``compute_precision``.
    """
    # 942 and 1682 are passed through to tsv_to_matrix
    # (presumably the matrix dimensions -- TODO confirm).
    AG = tsv_to_matrix(train_file, 942, 1682)
    AP = tsv_to_matrix(part_file, 942, 1682)

    W1 = slim_train(AG)
    W2 = slim_train(AP)
    for i in range(0, 11):
        # The float literal forces true division even under Python 2, where
        # ``i / 10`` would truncate to 0 for every i below 10.
        alpha = i / 10.0
        W = alpha * W1 + (1 - alpha) * W2
        print(alpha)
        recommendations = slim_recommender(AP, W)
        compute_precision(recommendations, test_file)
Esempio n. 4
0
def main(train_file, part_file):
    """Train SLIM on two matrices and evaluate the partition-only model.

    Both ``train_file`` and ``part_file`` are loaded and trained on, but the
    blend uses weight 0 for the first model and 1 for the second.
    Recommendations are produced for the ``part_file`` matrix and
    ``compute_precision`` is called against ``part_file`` itself.
    """
    # 942 and 1682 are passed through to tsv_to_matrix
    # (presumably the matrix dimensions -- TODO confirm).
    global_ratings = tsv_to_matrix(train_file, 942, 1682)
    partition_ratings = tsv_to_matrix(part_file, 942, 1682)

    global_weights = slim_train(global_ratings)
    partition_weights = slim_train(partition_ratings)

    # Kept in blend form so the arithmetic matches the other weight sweeps.
    blended = 0 * global_weights + 1 * partition_weights

    compute_precision(slim_recommender(partition_ratings, blended), part_file)
Esempio n. 5
0
def main(train_file, user_sideinformation_file, test_file, normalize):
    """Train SSLIM with side information and return the computed precision.

    Args:
        train_file: path of the training matrix.
        user_sideinformation_file: path of the side-information matrix.
        test_file: path of the test data handed to ``compute_precision``.
        normalize: when truthy, the side-information matrix goes through
            ``normalize_values`` before training.

    Returns:
        Whatever ``compute_precision`` returns.
    """
    ratings = tsv_to_matrix(train_file)
    side_info = tsv_to_matrix(user_sideinformation_file)
    if normalize:
        side_info = normalize_values(side_info)

    # Both matrices go through make_compatible before training.
    ratings, side_info = make_compatible(ratings, side_info)
    weights = sslim_train(ratings, side_info)

    return compute_precision(slim_recommender(ratings, weights), test_file)
Esempio n. 6
0
def main(train_file, user_item_side_information_file, hierarchy_file,
         test_file):
    """Compare plain SLIM rankings with HSLIM rankings and plot the results.

    For every user in the SLIM recommendations, the first ``RANKING_UNTIL``
    items of both rankings are compared with ``kendalltau`` and by counting
    how many items are not shared. Two histograms are then drawn.

    Args:
        train_file: path of the training matrix.
        user_item_side_information_file: path of the side-information matrix.
        hierarchy_file: path of a JSON hierarchy file. It is parsed, but the
            HSLIM section below replaces the parsed value.
        test_file: not used by the active code (only by the disabled LDA
            section).
    """
    A = tsv_to_matrix(train_file)
    B = tsv_to_matrix(user_item_side_information_file)
    # Context manager so the file handle is released even if parsing fails.
    with open(hierarchy_file) as hierarchy_fp:
        hierarchy = json.loads(hierarchy_fp.read())

    W = slim_train(A)

    ## LDA (disabled)
    #lda = LDAHierarquical(B, hierarchy, topics=20)
    #recommendations_lda = slim_lda_recommender(A, W, lda)
    #compute_precision(recommendations_lda, test_file)
    ###

    recommendations_slim = slim_recommender(A, W)
    ## HSLIM
    from hslim import (handle_user_bias, hierarchy_factory, normalize_wline,
                       generate_subitem_hierarchy)
    # NOTE(review): hard-coded path; the hierarchy_file argument is ignored
    # here -- confirm whether hierarchy_factory(hierarchy_file) was intended.
    hierarchy = hierarchy_factory('data/hierarchy.json')
    K = slim_train(handle_user_bias(B))
    Wline = generate_subitem_hierarchy(K, W, hierarchy)
    WlineNorm = normalize_wline(Wline)
    recommendations_other = slim_recommender(A, WlineNorm)
    ###

    kendall_tau_values = []
    differences_values = []

    # Iterating the mapping directly yields its keys on Python 2 and 3 alike
    # (dict.iterkeys() no longer exists on Python 3).
    for u in recommendations_slim:
        ranking_slim = recommendations_slim[u][:RANKING_UNTIL]
        ranking_other = recommendations_other[u][:RANKING_UNTIL]

        kendall_tau_values.append(kendalltau(ranking_slim, ranking_other))
        # Number of items of the top list that the two rankings do not share.
        differences_values.append(
            RANKING_UNTIL - len(set(ranking_slim) & set(ranking_other)))

    # Differences
    plt.hist(differences_values)
    plt.xlabel('Size of difference')
    plt.ylabel('Amount of rankings')
    plt.title('Differences (novelty) between rankings')

    # Ranking comparison
    show_matplot_fig()
    plt.figure()
    # Only the first element of each kendalltau result is plotted.
    plt.hist([result[0] for result in kendall_tau_values])
    plt.xlabel('KendallTau Distance SLIM/SLIM LDA')
    plt.ylabel('Number of occurrences')
    plt.title('Comparison between rankings')
    show_matplot_fig()
Esempio n. 7
0
def main(train_file, user_sideinformation_file, test_file):
    """Train SSLIM with side information and evaluate it on the test data.

    Args:
        train_file: path of the training matrix.
        user_sideinformation_file: path of the side-information matrix.
        test_file: path of the test data handed to ``compute_precision``.
    """
    ratings = tsv_to_matrix(train_file)
    side_info = tsv_to_matrix(user_sideinformation_file)

    # Disabled export of the matrices (kept from the original for reference):
    #   from util import mm2csr
    #   mm2csr(A, '/tmp/train.mat')
    #   mm2csr(useritem_featureitem, '/tmp/train_feature.mat')
    #   C = tsv_to_matrix(test_file)
    #   mm2csr(C, '/tmp/test.mat')

    weights = sslim_train(ratings, side_info)

    compute_precision(slim_recommender(ratings, weights), test_file)
Esempio n. 8
0
def main(train_file, user_item_side_information_file, hierarchy_file, test_file):
    """Compare plain SLIM rankings with HSLIM rankings and plot the results.

    For every user in the SLIM recommendations, the first ``RANKING_UNTIL``
    items of both rankings are compared with ``kendalltau`` and by counting
    how many items are not shared. Two histograms are then drawn.

    Args:
        train_file: path of the training matrix.
        user_item_side_information_file: path of the side-information matrix.
        hierarchy_file: path of a JSON hierarchy file. It is parsed, but the
            HSLIM section below replaces the parsed value.
        test_file: not used by the active code (only by the disabled LDA
            section).
    """
    A = tsv_to_matrix(train_file)
    B = tsv_to_matrix(user_item_side_information_file)
    # Context manager so the file handle is released even if parsing fails.
    with open(hierarchy_file) as hierarchy_fp:
        hierarchy = json.loads(hierarchy_fp.read())

    W = slim_train(A)

    ## LDA (disabled)
    #lda = LDAHierarquical(B, hierarchy, topics=20)
    #recommendations_lda = slim_lda_recommender(A, W, lda)
    #compute_precision(recommendations_lda, test_file)
    ###

    recommendations_slim = slim_recommender(A, W)
    ## HSLIM
    from hslim import (handle_user_bias, hierarchy_factory, normalize_wline,
                       generate_subitem_hierarchy)
    # NOTE(review): hard-coded path; the hierarchy_file argument is ignored
    # here -- confirm whether hierarchy_factory(hierarchy_file) was intended.
    hierarchy = hierarchy_factory('data/hierarchy.json')
    K = slim_train(handle_user_bias(B))
    Wline = generate_subitem_hierarchy(K, W, hierarchy)
    WlineNorm = normalize_wline(Wline)
    recommendations_other = slim_recommender(A, WlineNorm)
    ###

    kendall_tau_values = []
    differences_values = []

    # Iterating the mapping directly yields its keys on Python 2 and 3 alike
    # (dict.iterkeys() no longer exists on Python 3).
    for u in recommendations_slim:
        ranking_slim = recommendations_slim[u][:RANKING_UNTIL]
        ranking_other = recommendations_other[u][:RANKING_UNTIL]

        kendall_tau_values.append(kendalltau(ranking_slim, ranking_other))
        # Number of items of the top list that the two rankings do not share.
        differences_values.append(
            RANKING_UNTIL - len(set(ranking_slim) & set(ranking_other)))

    # Differences
    plt.hist(differences_values)
    plt.xlabel('Size of difference')
    plt.ylabel('Amount of rankings')
    plt.title('Differences (novelty) between rankings')

    # Ranking comparison
    show_matplot_fig()
    plt.figure()
    # Only the first element of each kendalltau result is plotted.
    plt.hist([result[0] for result in kendall_tau_values])
    plt.xlabel('KendallTau Distance SLIM/SLIM LDA')
    plt.ylabel('Number of occurrences')
    plt.title('Comparison between rankings')
    show_matplot_fig()
Esempio n. 9
0
def main(train_file, test_file):
    """Train SLIM on ``train_file`` and run the oracle precision evaluation.

    The recommendations produced from the trained weights are handed to
    ``compute_precision_as_an_oracle`` together with ``test_file``.
    """
    ratings = tsv_to_matrix(train_file)
    weights = slim_train(ratings)

    compute_precision_as_an_oracle(slim_recommender(ratings, weights),
                                   test_file)
Esempio n. 10
0
def main(train_file, user_sideinformation_file, test_file, normalize):
    """Train SSLIM, save the weight matrix and return oracle precisions.

    Args:
        train_file: path of the training matrix.
        user_sideinformation_file: path of the side-information matrix.
        test_file: path of the test data for the oracle evaluation.
        normalize: when truthy, the side-information matrix goes through
            ``normalize_values`` before training.

    Returns:
        Whatever ``compute_precision_as_an_oracle`` returns.
    """
    ratings = tsv_to_matrix(train_file)
    side_info = tsv_to_matrix(user_sideinformation_file)
    if normalize:
        side_info = normalize_values(side_info)

    ratings, side_info = make_compatible(ratings, side_info)
    weights = sslim_train(ratings, side_info)

    # Persist the learned weights before producing recommendations.
    save_matrix(weights, 'sslim_oracle_wmatrix.tsv')

    return compute_precision_as_an_oracle(slim_recommender(ratings, weights),
                                          test_file)
Esempio n. 11
0
def main(train_file, user_sideinformation_file, test_file):
    """Train SSLIM with side information and evaluate it on the test data.

    Args:
        train_file: path of the training matrix.
        user_sideinformation_file: path of the side-information matrix.
        test_file: path of the test data handed to ``compute_precision``.
    """
    ratings = tsv_to_matrix(train_file)
    side_info = tsv_to_matrix(user_sideinformation_file)

    # Disabled export of the matrices (kept from the original for reference):
    #   from util import mm2csr
    #   mm2csr(A, '/tmp/train.mat')
    #   mm2csr(useritem_featureitem, '/tmp/train_feature.mat')
    #   C = tsv_to_matrix(test_file)
    #   mm2csr(C, '/tmp/test.mat')

    weights = sslim_train(ratings, side_info)

    compute_precision(slim_recommender(ratings, weights), test_file)
Esempio n. 12
0
def main(train_file, test_file):
    """Train SLIM on ``train_file`` and return the precision on ``test_file``.

    Returns:
        Whatever ``compute_precision`` returns for the recommendations.
    """
    ratings = tsv_to_matrix(train_file)
    weights = slim_train(ratings)

    return compute_precision(slim_recommender(ratings, weights), test_file)
Esempio n. 13
0
def main(train_file, user_sideinformation_file, test_file, normalize):
    """Train SSLIM, save the weight matrix and return oracle precisions.

    Args:
        train_file: path of the training matrix.
        user_sideinformation_file: path of the side-information matrix.
        test_file: path of the test data for the oracle evaluation.
        normalize: when truthy, the side-information matrix goes through
            ``normalize_values`` before training.

    Returns:
        Whatever ``compute_precision_as_an_oracle`` returns.
    """
    ratings = tsv_to_matrix(train_file)
    side_info = tsv_to_matrix(user_sideinformation_file)
    if normalize:
        side_info = normalize_values(side_info)

    ratings, side_info = make_compatible(ratings, side_info)
    weights = sslim_train(ratings, side_info)

    # Persist the learned weights before producing recommendations.
    save_matrix(weights, 'sslim_oracle_wmatrix.tsv')

    return compute_precision_as_an_oracle(slim_recommender(ratings, weights),
                                          test_file)
Esempio n. 14
0
def main(train_file):
    """Cluster the training matrix with scikit-learn KMeans and print results.

    Prints the shape of the loaded matrix and the fitted cluster centroids.

    Args:
        train_file: path of the training matrix.
    """
    # 942 and 1682 are passed through to tsv_to_matrix
    # (presumably the matrix dimensions -- TODO confirm).
    data = tsv_to_matrix(train_file, 942, 1682)
    kmeans = KMeans(n_clusters=2, random_state=0).fit(data)
    print(data.shape)

    # Fitted attributes of scikit-learn estimators end with an underscore;
    # ``cluster_centers`` (without it) raises AttributeError.
    centroids = kmeans.cluster_centers_
    print(centroids)
Esempio n. 15
0
def main(train_file):
    """Alternate cuckoo search and k-means, then save the final labels.

    For a fixed number of rounds, ``cuckoo_search`` proposes centroids and
    ``kmeans`` refines them; the centroids returned by one round seed the
    next. After the last round the labels are written, one per line, to
    ./data/k8/k8_label.txt and the size of each cluster is printed.

    Args:
        train_file: path of the training matrix.
    """
    # 942 and 1682 are passed through to tsv_to_matrix
    # (presumably the matrix dimensions -- TODO confirm).
    data = tsv_to_matrix(train_file, 942, 1682).toarray()

    k = 8
    dim = data.shape[1]
    iteration = 10

    # The first round is seeded with the first k rows of the data; later
    # rounds are seeded with the centroids returned by the previous k-means.
    seeds = data[:k]
    for _ in range(iteration):
        centroids = cuckoo_search(k, dim, seeds)

        print("shape[0]: " + str(centroids.shape[0]))
        print("shape[1]: " + str(centroids.shape[1]))

        clusters, labels, seeds = kmeans(k, centroids, data, "first")

    # Only the result of the final round is persisted and reported.
    with open('./data/k8/k8_label.txt', 'w') as label_file:
        for label in labels:
            label_file.write("%s\n" % (str(label)))

    for cluster_index in range(k):
        print(len(np.array(clusters[cluster_index])))
Esempio n. 16
0
def main(train_file, user_sideinformation_file, hierarchy_file, test_file):
    """Train HSLIM (SLIM plus a sub-item hierarchy) and evaluate precision.

    Two SLIM models are trained: ``K`` on the bias-handled side information
    and ``W`` on the training matrix. They are combined through
    ``generate_subitem_hierarchy``, normalized, and used to recommend.

    Args:
        train_file: path of the training matrix.
        user_sideinformation_file: path of the side-information matrix,
            loaded with the same dimensions as the training matrix.
        hierarchy_file: path handed to ``hierarchy_factory``.
        test_file: path of the test data handed to ``compute_precision``.
    """
    A = tsv_to_matrix(train_file)
    B = tsv_to_matrix(user_sideinformation_file, A.shape[0], A.shape[1])
    hierarchy = hierarchy_factory(hierarchy_file)

    # Learning using SLIM
    # We handle user bias only in B because in B we have explicit evaluations
    K = slim_train(handle_user_bias(B))
    W = slim_train(A)

    Wline = generate_subitem_hierarchy(K, W, hierarchy)
    WlineNorm = normalize_wline(Wline)

    # The leftover pdb.set_trace() breakpoint was removed: it stopped every
    # run (and hangs non-interactive ones) right before recommending.
    # Alternative kept from the original: slim_recommender(A, W + 0.2 * WlineNorm)
    recommendations = slim_recommender(A, WlineNorm)

    compute_precision(recommendations, test_file)
Esempio n. 17
0
def main(train_file):
    """Run k-means seeded with the first k rows and save the labels.

    The labels are written, one per line, to label.txt and the shape of the
    first cluster (as a NumPy array) is printed.

    Args:
        train_file: path of the training matrix.
    """
    # 942 and 1682 are passed through to tsv_to_matrix
    # (presumably the matrix dimensions -- TODO confirm).
    data = tsv_to_matrix(train_file, 942, 1682).toarray()

    k = 4
    # k-means picking the first k points as centroids
    centroids = data[:k]
    clusters, labels = kmeans(k, centroids, data, "first")
    train0 = np.array(clusters[0])

    # Context manager closes the file even if a write fails; it also avoids
    # shadowing the ``file`` builtin and the mixed tab/space indentation.
    with open('label.txt', 'w') as label_file:
        for label in labels:
            label_file.write("%s\n" % (str(label)))

    print(train0.shape)
Esempio n. 18
0
read "SLIM: Sparse LInear Methods for Top-N Recommender Systems".
"""
from sklearn.linear_model import SGDRegressor
from util import tsv_to_matrix, generate_slices
from util.metrics import compute_precision
from util.recommender import slim_recommender
import numpy as np
import multiprocessing
import ctypes
from util import parse_args
import simplejson as json

# Command-line options; only ``args.train`` is read in this section.
args = parse_args()

# Loading matrices
A = tsv_to_matrix(args.train)

# Loading shared array to be used in results
# A multiprocessing-shared buffer of doubles with A.shape[1] ** 2 entries.
shared_array_base = multiprocessing.Array(ctypes.c_double, A.shape[1] ** 2)
# Expose the shared buffer as a NumPy array, then give it a square
# (A.shape[1], A.shape[1]) shape.
shared_array = np.ctypeslib.as_array(shared_array_base.get_obj())
shared_array = shared_array.reshape(A.shape[1], A.shape[1])


# Because each column is independent in SLIM, this work can be done in
# parallel.
def work(params, W=shared_array):
    from_j = params[0]
    to_j = params[1]
    M = params[2]
    model = params[3]
    counter = 0
Esempio n. 19
0
from util import (tsv_to_matrix, generate_slices,
                  make_compatible, normalize_values, save_matrix)
from util.metrics import compute_precision
import multiprocessing
import ctypes
from scipy.sparse import vstack
import datetime
from util import parse_args
import simplejson as json

# Function-call form of print: identical output on Python 2 (single
# argument) and valid syntax on Python 3.
print('>>> Start: %s' % datetime.datetime.now())

# Command-line options, including the side-information and beta options.
args = parse_args(side_information=True, beta=True)

# Loading matrices
A = tsv_to_matrix(args.train)
B = tsv_to_matrix(args.side_information)

if args.normalize:
    B = normalize_values(B)

# Both matrices go through make_compatible before training.
A, B = make_compatible(A, B)

# Loading shared array to be used in results
# A multiprocessing-shared buffer of doubles with A.shape[1] ** 2 entries,
# exposed as a square (A.shape[1], A.shape[1]) NumPy array.
shared_array_base = multiprocessing.Array(ctypes.c_double, A.shape[1] ** 2)
shared_array = np.ctypeslib.as_array(shared_array_base.get_obj())
shared_array = shared_array.reshape(A.shape[1], A.shape[1])


# We create a work function to fit each one of the columns of our W matrix,
# because in SLIM each column is independent we can use make this work in
Esempio n. 20
0
def main(train_file, part_file, test_file):
    """Sweep the blend weight ``gu`` between two SLIM models and report.

    One SLIM model is trained on ``train_file`` and one on ``part_file``.
    For gu = 0.00, 0.05, ..., 1.00 the two weight matrices are blended,
    recommendations are produced for the ``part_file`` matrix, and the four
    results returned by ``compute_precision`` (unpacked as top-5/10/15/20)
    are stored. Finally the tables, the best value per cutoff (with the gu
    that produced it) and the values at gu = 1 are printed for column 0
    (reported as HR) and column 1 (reported as ARHR).
    """
    AG = tsv_to_matrix(train_file, 942, 1682)
    AP = tsv_to_matrix(part_file, 942, 1682)

    W1 = slim_train(AG)
    W2 = slim_train(AP)

    k = 2
    # One (21, k) table per cutoff, in the order top-5, top-10, top-15, top-20;
    # row r holds the scores obtained with gu = r * 0.05.
    tables = [np.zeros((21, k)) for _ in range(4)]

    for row, i in enumerate(range(0, 105, 5)):
        gu = i / 100
        W = gu * W1 + (1 - gu) * W2
        print("gu: " + str(gu))
        recommendations = slim_recommender(AP, W)
        top5, top10, top15, top20 = compute_precision(recommendations,
                                                      test_file)
        for table, scores in zip(tables, (top5, top10, top15, top20)):
            for j in range(2):
                table[row][j] = scores[j]

    def summarize(column):
        """Return ([gu, best, ...], [value at gu = 1, ...]) for a column."""
        best = []
        at_full_weight = []
        for table in tables:
            # max() keeps the first row that reaches the maximum.
            index, value = max(enumerate(table[:, column]),
                               key=operator.itemgetter(1))
            best.append(index * 0.05)
            best.append(value)
            at_full_weight.append(table[20][column])
        return best, at_full_weight

    hr_values, hr_values1 = summarize(0)
    arhr_values, arhr_values1 = summarize(1)

    matrix_5, matrix_10, matrix_15, matrix_20 = tables
    print('k8 top5: %s' % (matrix_5))
    print('k8 top10: %s' % (matrix_10))
    print('k8 top15: %s' % (matrix_15))
    print('k8 top20: %s' % (matrix_20))

    print('Max HR: %s' % (hr_values))
    print('HR at gu = 1: %s' % (hr_values1))
    print('Max ARHR: %s' % (arhr_values))
    print('ARHR at gu = 1: %s' % (arhr_values1))
Esempio n. 21
0
read "SLIM: Sparse LInear Methods for Top-N Recommender Systems".
"""
from sklearn.linear_model import SGDRegressor
from util import tsv_to_matrix, generate_slices
from util.metrics import compute_precision
from util.recommender import slim_recommender
import numpy as np
import multiprocessing
import ctypes
from util import parse_args
import simplejson as json

# Command-line options; only ``args.train`` is read in this section.
args = parse_args()

# Loading matrices
A = tsv_to_matrix(args.train)

# Loading shared array to be used in results
# A multiprocessing-shared buffer of doubles with A.shape[1] ** 2 entries.
shared_array_base = multiprocessing.Array(ctypes.c_double, A.shape[1]**2)
# Expose the shared buffer as a NumPy array, then give it a square
# (A.shape[1], A.shape[1]) shape.
shared_array = np.ctypeslib.as_array(shared_array_base.get_obj())
shared_array = shared_array.reshape(A.shape[1], A.shape[1])


# Because each column is independent in SLIM, this work can be done in
# parallel.
def work(params, W=shared_array):
    from_j = params[0]
    to_j = params[1]
    M = params[2]
    model = params[3]
    counter = 0
Esempio n. 22
0
read "Sparse Linear Methods with Side Information for Top-N Recommendations"
"""
from sklearn.linear_model import SGDRegressor
import numpy as np
from recommender import slim_recommender
from util import tsv_to_matrix, split_train_test, generate_slices
from metrics import compute_precision
import multiprocessing
import ctypes
import sys
from scipy.sparse import vstack

# Exactly three command-line arguments are expected; any other count makes
# this unpacking raise ValueError.
train_file, user_sideinformation_file, test_file = sys.argv[1:]

# Loading matrices
A = tsv_to_matrix(train_file)
B = tsv_to_matrix(user_sideinformation_file)

# Loading shared array to be used in results
# A multiprocessing-shared buffer of doubles with A.shape[1] ** 2 entries.
shared_array_base = multiprocessing.Array(ctypes.c_double, A.shape[1]**2)
# Expose the shared buffer as a NumPy array, then give it a square
# (A.shape[1], A.shape[1]) shape.
shared_array = np.ctypeslib.as_array(shared_array_base.get_obj())
shared_array = shared_array.reshape(A.shape[1], A.shape[1])


# We create a work function to fit each one of the columns of our W matrix.
# Because each column is independent in SLIM, this work can be done in
# parallel.
def work(params, W=shared_array):
    from_j = params[0]
    to_j = params[1]
    M = params[2]
Esempio n. 23
0
SLIM Parallel implementation. To understand deeply how it works we encourage you to
read "SLIM: Sparse LInear Methods for Top-N Recommender Systems".
"""
from sklearn.linear_model import SGDRegressor
from util import tsv_to_matrix, generate_slices
from metrics import compute_precision
from recommender import slim_recommender
import numpy as np
import multiprocessing
import ctypes
import sys

# Exactly two command-line arguments are expected; any other count makes
# this unpacking raise ValueError.
train_file, test_file = sys.argv[1:]

# Loading matrices
A = tsv_to_matrix(train_file)

# Loading shared array to be used in results
# A multiprocessing-shared buffer of doubles with A.shape[1] ** 2 entries.
shared_array_base = multiprocessing.Array(ctypes.c_double, A.shape[1]**2)
# Expose the shared buffer as a NumPy array, then give it a square
# (A.shape[1], A.shape[1]) shape.
shared_array = np.ctypeslib.as_array(shared_array_base.get_obj())
shared_array = shared_array.reshape(A.shape[1], A.shape[1])

# Because each column is independent in SLIM, this work can be done in
# parallel.
def work(params, W=shared_array):
    from_j = params[0]
    to_j = params[1]
    M = params[2]
    model = params[3]
    counter = 0
Esempio n. 24
0
read "Sparse Linear Methods with Side Information for Top-N Recommendations"
"""
from sklearn.linear_model import SGDRegressor
import numpy as np
from recommender import slim_recommender
from util import tsv_to_matrix, split_train_test, generate_slices
from metrics import compute_precision
import multiprocessing
import ctypes
import sys
from scipy.sparse import vstack

# Exactly three command-line arguments are expected; any other count makes
# this unpacking raise ValueError.
train_file, user_sideinformation_file, test_file = sys.argv[1:]

# Loading matrices
A = tsv_to_matrix(train_file)
B = tsv_to_matrix(user_sideinformation_file)

# Loading shared array to be used in results
# A multiprocessing-shared buffer of doubles with A.shape[1] ** 2 entries.
shared_array_base = multiprocessing.Array(ctypes.c_double, A.shape[1]**2)
# Expose the shared buffer as a NumPy array, then give it a square
# (A.shape[1], A.shape[1]) shape.
shared_array = np.ctypeslib.as_array(shared_array_base.get_obj())
shared_array = shared_array.reshape(A.shape[1], A.shape[1])

# We create a work function to fit each one of the columns of our W matrix.
# Because each column is independent in SLIM, this work can be done in
# parallel.
def work(params, W=shared_array):
    from_j = params[0]
    to_j = params[1]
    M = params[2]
    model = params[3]
Esempio n. 25
0
from util import (tsv_to_matrix, generate_slices, make_compatible,
                  normalize_values, save_matrix)
from util.metrics import compute_precision
import multiprocessing
import ctypes
from scipy.sparse import vstack
import datetime
from util import parse_args
import simplejson as json

# Function-call form of print: identical output on Python 2 (single
# argument) and valid syntax on Python 3.
print('>>> Start: %s' % datetime.datetime.now())

# Command-line options, including the side-information and beta options.
args = parse_args(side_information=True, beta=True)

# Loading matrices
A = tsv_to_matrix(args.train)
B = tsv_to_matrix(args.side_information)

if args.normalize:
    B = normalize_values(B)

# Both matrices go through make_compatible before training.
A, B = make_compatible(A, B)

# Loading shared array to be used in results
# A multiprocessing-shared buffer of doubles with A.shape[1] ** 2 entries,
# exposed as a square (A.shape[1], A.shape[1]) NumPy array.
shared_array_base = multiprocessing.Array(ctypes.c_double, A.shape[1]**2)
shared_array = np.ctypeslib.as_array(shared_array_base.get_obj())
shared_array = shared_array.reshape(A.shape[1], A.shape[1])


# We create a work function to fit each one of the columns of our W matrix,
# because in SLIM each column is independent we can use make this work in