コード例 #1
0
def main():
    """Recommend the top-10 movies for the user in user_rated_movies.tsv.

    Reads the user's own ratings, merges them into the ratings dataset,
    trains NMF on everything, predicts a rating for every movie, and
    prints the ten highest-rated movie ids.
    """
    # Parse the user's ratings; tuples mimic surprise's raw-rating format
    # (user, item, rating, timestamp-or-None).
    user_ratings = []
    with open("Python/user_rated_movies.tsv", "r") as f:
        for line in f:
            fields = line.rstrip("\n").split('\t')
            # rstrip handles the trailing newline explicitly; the original
            # sliced rating[:-1], which silently drops a digit on a final
            # line that has no newline.
            user_ratings.append((fields[0], fields[1], float(fields[2]), None))

    reader = Reader(line_format='user item rating', sep='\t')
    datain = pd.read_csv("ratings.tsv", sep="\t")
    data = Dataset.load_from_df(datain, reader=reader)
    # Inject this user's ratings into the dataset before training.
    data.raw_ratings.extend(user_ratings)

    movies = pd.read_csv("movies.tsv", sep="\t", header=None, low_memory=False)

    algo = NMF(n_factors=4, n_epochs=100, random_state=1)
    algo.fit(data.build_full_trainset())

    # Predict a rating for every movie for this user (column 1 of movies.tsv
    # is taken as the raw movie id, as in the original code).
    predictions = [algo.predict(user_ratings[0][0], row[1], r_ui=4)
                   for _, row in movies.iterrows()]

    # Prediction is a namedtuple; .est is the estimate, .iid the item id
    # (the original indexed them as pred[3] / pred[1]).
    top10 = sorted(predictions, key=lambda pred: pred.est)[-10:]
    for pred in top10:
        print(pred.iid)
コード例 #2
0
ファイル: blag.py プロジェクト: sivaramakrishnansr/BLAG
    def run_process(self, all_ips_data, ip_16_data, misclassifications, queue):
        """Score each IP's association with known misclassifications via NMF.

        Builds an IP x blacklist rating matrix from the historical prefix
        data, adds a synthetic "misclassifications" item with rating 10 for
        known-misclassified IPs, factorises the matrix with NMF, and pushes
        "ip,score" strings for every IP onto *queue*.
        """
        historical_item = generate_prefix_data(all_ips_data, ip_16_data,
                                               self.reference_end_time,
                                               self.half_life_duration)
        if len(historical_item) == 0:
            return
        # Too few IPs to factorise meaningfully: emit a zero score for each.
        if len(historical_item) < 5:
            for ip in historical_item:
                queue.put(ip + ",0")
            return

        # Collect CSV rows in a list and join once — the original grew one
        # string with +=, which is quadratic in the number of rows.
        rows = ["userId,itemId,rating"]
        all_blacklists = set()
        ip_order = set()
        for ip, bl_name_data in historical_item.items():
            ip_order.add(ip)
            for bl_name, score in bl_name_data.items():
                rows.append(ip + "," + bl_name + "," + str(score))
                all_blacklists.add(bl_name)
        for ip in misclassifications:
            if ip in ip_order:
                rows.append(ip + "," + "misclassifications,10")

        ratings = pd.read_csv(StringIO("\n".join(rows) + "\n"))

        ratings_dict = {
            'itemID': list(ratings.itemId),
            'userID': list(ratings.userId),
            'rating': list(ratings.rating)
        }
        df = pd.DataFrame(ratings_dict)
        reader = Reader(rating_scale=(0, 10.0))
        data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)

        # Grow n_epochs until cross-validated RMSE is acceptable (<= 1) or
        # the configured epoch budget is exhausted.
        epochs = 100
        while True:
            algo = NMF(n_epochs=epochs, n_factors=self.n_factors)
            try:
                res = model_selection.cross_validate(algo,
                                                     data,
                                                     measures=['RMSE'])
            # Narrowed from a bare `except:` (which also swallowed
            # KeyboardInterrupt/SystemExit); behaviour otherwise unchanged —
            # a failed cross-validation just stops the epoch search.
            except Exception:
                break
            mean_rmse = sum(res["test_rmse"]) / len(res["test_rmse"])
            if mean_rmse <= 1:
                break
            epochs = epochs + 100
            if epochs >= self.epochs:
                break
        # NOTE(review): if cross_validate failed on the very first pass,
        # `algo` is untrained here, exactly as in the original code.
        for ip in ip_order:
            prediction = algo.predict(ip, "misclassifications").est
            queue.put(ip + "," + str(round(prediction, 2)))
        return
コード例 #3
0
def nmf(train, test, ids, Xtest, Xids):
    """
    Non-Negative Matrix Factorization.
    Argument : train, the trainset
               test, the testset
               ids, unknown ratings
               Xtest, predicted ratings for testset, to be used for final blending
               Xids, predicted ratings for unknown ratings, to be used for final blending
    """
    print('NMF')
    algo = NMF(n_factors=20,
               n_epochs=50,
               random_state=15,
               reg_pu=0.5,
               reg_qi=0.05)

    # Fit on the training set, then report RMSE on train and test.
    algo.fit(train)

    train_predictions = algo.test(train.build_testset())
    print('   Training RMSE: ', accuracy.rmse(train_predictions, verbose=False))

    test_predictions = algo.test(test)
    rmse = accuracy.rmse(test_predictions, verbose=False)
    print('   Test RMSE: ', rmse)

    # Collect the point estimates for the test set as a float array.
    preds_test = np.array([p.est for p in test_predictions])

    # Estimate every unknown (user, item) pair for the final blending step.
    preds_ids = [algo.predict(str(u), str(i)).est
                 for u, i in zip(ids[0], ids[1])]

    Xtest.append(preds_test)
    Xids.append(preds_ids)
    return rmse, Xtest, Xids, preds_test, preds_ids
コード例 #4
0
ファイル: predit_service.py プロジェクト: LyHoaNam/chocola
 def nmf(self, namefile, uid, iid, rati, value_uid, value_iid):
     """Predict a single user/item rating with NMF trained on an uploaded file.

     namefile: CSV file name under ./container/
     uid, iid, rati: column names for user id, item id and rating
     value_uid, value_iid: the user/item pair to predict
     Returns a JSON-serialisable dict with keys "uid", "idd", "rati".
     """
     # read_csv already returns a DataFrame; the original wrapped it in
     # pd.DataFrame(...) again for no effect.
     test_data = pd.read_csv('./container/' + namefile)
     reader = Reader(rating_scale=(0, 100))
     data = Dataset.load_from_df(test_data[[uid, iid, rati]], reader)
     trainset = data.build_full_trainset()
     algo = NMF()
     algo.fit(trainset)
     pred = algo.predict(int(value_uid),
                         int(value_iid),
                         r_ui=1,
                         verbose=True)
     # Build the response once (the original initialised jsondata twice).
     jsondata = {
         "uid": pred.uid,
         "idd": pred.iid,
         "rati": round(pred.est, 2),
     }
     return jsondata
コード例 #5
0
ファイル: predit_service.py プロジェクト: LyHoaNam/chocola
    def nmf_from_to(self, namefile, uid, iid, rati, from_uid, to_uid, from_iid,
                    to_iid):
        """Predict ratings for every (user, item) pair in the given ranges.

        Trains NMF on the uploaded CSV and returns a list of
        [user, item, rounded_estimate] triples.
        """
        frame = pd.DataFrame(pd.read_csv('./container/' + namefile))
        # Train on the full data set using the caller-selected columns.
        reader = Reader(rating_scale=(0, 100))
        data = Dataset.load_from_df(frame[[uid, iid, rati]], reader)
        algo = NMF()
        algo.fit(data.build_full_trainset())

        results = []
        for value_uid in range(from_uid, to_uid):
            for value_iid in range(from_iid, to_iid):
                pred = algo.predict(value_uid, value_iid, r_ui=1, verbose=True)
                results.append([pred.uid, pred.iid, round(pred.est, 2)])
        # Return result as a JSON-friendly list of triples.
        return results
# One estimate bucket per trained algorithm, filled in lock-step below.
Pred_Test_SVD = []
Pred_Test_NMF = []
Pred_Test_SL1 = []
Pred_Test_KNN = []
Pred_Test_BSL = []

start = time.time()
# Raw user id is taken from line[1] and item id from line[0], matching the
# argument order of the original predict calls.
_model_buckets = (
    (alg_KNN, Pred_Test_KNN),
    (alg_SVD, Pred_Test_SVD),
    (alg_NMF, Pred_Test_NMF),
    (alg_SL1, Pred_Test_SL1),
    (alg_BSL, Pred_Test_BSL),
)
for line in data_test:
    uid, iid = str(line[1]), str(line[0])
    for model, bucket in _model_buckets:
        bucket.append(model.predict(uid, iid, clip=False).est)

end = time.time()
print("***********************************************")
print("Exe time:")
print(end - start)

# Stack the per-model predictions into one matrix (one row per model).
X_Test = np.matrix([
    Pred_Test_SVD, Pred_Test_NMF, Pred_Test_SL1, Pred_Test_KNN, Pred_Test_BSL
])
import zipfile

# One-off extraction of the ml-100k archive, kept for reference.
"""file = zipfile.ZipFile('/home/shanmukha/AnacondaProjects/Spyder_projects/Recommendation_trail/ml-100k.zip','r')
file.extractall()
file.close()
"""
# Read the ml-100k ratings with an explicit tab-separated schema.
reader = Reader(line_format='user item rating timestamp', sep='\t')
dataset = Dataset.load_from_file(file_path='./ml-100k/u.data', reader=reader)

# 5-fold split (old surprise <1.0 API; newer versions use model_selection).
dataset.split(n_folds=5)

# Two factorisation models, both trained on the full data set.
algo1 = SVD()
algo2 = NMF()

train_data = dataset.build_full_trainset()
algo1.fit(train_data)
algo2.fit(train_data)

# Predict how user 196 would rate item 302 (known true rating: 4) with both
# models; the third positional argument of predict() is r_ui.
user = str(196)
item = str(302)
actual_rating = 4
print(algo1.predict(user, item, actual_rating))
print(algo2.predict(user, item, actual_rating))
コード例 #8
0
ファイル: NMF.py プロジェクト: gusfelhberg/DataMining
    # Drop columns the predictor does not use.
    del dfTest['date']
    del dfTest['test_id']

    # Set the rating scale and create the data for Surprise to use.
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(
        dfRatings[['user_id', 'business_id', 'rating']], reader)

    # 5-fold split (old surprise API). NOTE(review): the split result is
    # never used below — kept as-is to preserve behaviour.
    data.split(5)

    # Train on all available data.
    train_set = data.build_full_trainset()
    algo = NMF()
    algo.train(train_set)

    # Write one predicted rating per test row. The context manager
    # guarantees the file is closed even if a prediction raises (the
    # original relied on a trailing f.close()).
    with open('PMFOutput.csv', 'w') as f:
        f.write("test_id,rating\n")
        for i in range(len(dfTest)):
            prediction = algo.predict(dfTest.at[i, 'user_id'],
                                      dfTest.at[i, 'business_id'],
                                      r_ui=4,
                                      verbose=True)
            f.write(str(i) + "," + str(prediction.est) + '\n')
コード例 #9
0
# 5-fold cross-validation of the SVD model, then refit on the full trainset.
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
trainset = data.build_full_trainset()
svd.fit(trainset)
# NOTE(review): the next two expressions' results are discarded —
# presumably left over from interactive (REPL/notebook) exploration.
ratings[ratings['userId'] == 1]
svd.predict(13, 238)
# Movie metadata: only the first five pipe-separated columns of u.item.
m_cols = ['id', 'Title', 'release_date', 'video_release_date', 'imdb_url']
moviesdb = pd.read_csv('./ml-100k/u.item',
                       sep='|',
                       names=m_cols,
                       usecols=range(5),
                       encoding='latin-1')

# Same cross-validate / fit / single-pair-predict sequence for three more
# algorithms; each predict(13, 238) return value is likewise discarded.
nmf = NMF()
cross_validate(nmf, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
nmf.fit(trainset)
nmf.predict(13, 238)

knnb = KNNBasic()
cross_validate(knnb, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
knnb.fit(trainset)
knnb.predict(13, 238)

knnm = KNNWithMeans()
cross_validate(knnm, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
knnm.fit(trainset)
knnm.predict(13, 238)

# Target user for the per-movie prediction loop that follows.
user = 13

# Accumulator for per-movie SVD predictions (filled by the loop below).
user_rating_svd = pd.DataFrame()
for i in range(0, moviesdb.shape[0]):
コード例 #10
0
# Load the movielens dataset  UserID::MovieID::Rating::Timestamp
# NOTE(review): the original comment said movielens-100k, but the code
# actually loads the 1M variant ('ml-1m').
data = Dataset.load_builtin('ml-1m')
trainset, testset = train_test_split(data, test_size=.15)

# Configure the algorithm. (Translated from Portuguese; the original note
# described KNN parameters — K = number of neighbours, name = similarity
# measure, user_based = user vs item filtering — none of which apply to
# NMF, which only receives n_epochs here.)
algoritmo = NMF(n_epochs=5)

algoritmo.fit(trainset)

# Select the user and the movie to analyse.
# User 49: between 18 and 24 years old, a programmer living in Houston, Texas.
uid = str(49)
# Movie seen and rated: Negotiator, The (1998)::Action|Thriller. Rating: 4.
iid = str(2058)  # raw item id

# get a prediction for specific users and items.
pred = algoritmo.predict(uid, iid, r_ui=4, verbose=True)

# run the trained model against the testset
test_pred = algoritmo.test(testset)

# RMSE evaluation
print("Avaliação RMSE: ")
accuracy.rmse(test_pred, verbose=True)

# MAE evaluation
print("Avaliação MAE: ")
accuracy.mae(test_pred, verbose=True)
コード例 #11
0
ファイル: surprise_NMF.py プロジェクト: AChelikani/CS156b
# Build the trainset, then drop the raw dataset to free memory before training.
trainset = data.build_full_trainset()
del data

print(time.asctime(), 'training set built, now training')
#
# MODEL DEFINITION
algo = NMF(verbose=True, biased=True, n_factors=fac, n_epochs=ep)
#
algo.fit(trainset)

print(time.asctime(), 'training complete, now loading prediction data')
# Keep only the first two columns (user, item) of the space-separated file.
to_predict = pd.read_csv(file_path_test, delimiter=' ', header=None)
to_predict = to_predict.values.T[0:2].T
predicted = np.zeros(len(to_predict))

print(time.asctime(), 'prediction data loaded, now predicting')
# Fill the output array one estimate at a time, logging progress
# every 500k rows.
for i, (user, item) in enumerate(to_predict):
    predicted[i] = algo.predict(uid=user, iid=item, verbose=0).est
    if i % 500000 == 0:
        print(i, 'of', len(predicted), 'predicted')

print(time.asctime(), 'now saving predictions')
np.savetxt('../custom_data/' + title + '.dta', predicted, fmt='%.3f')

print(time.asctime(), 'done')
コード例 #12
0
# -*- coding:utf-8 -*-
__author__ = 'neuclil'
import surprise
from surprise import NMF, evaluate
from surprise import Dataset, Reader
from model.convertor import Convertor
import os

if __name__ == '__main__':
    convertor = Convertor()

    # Ratings file in "user,item,rating,timestamp" CSV format.
    file_path = os.path.expanduser('../data/popular_music_suprise_format.txt')
    reader = Reader(line_format='user item rating timestamp', sep=',')
    music_data = Dataset.load_from_file(file_path, reader=reader)

    # Train NMF on the full data set (old surprise API: train, not fit).
    algo = NMF()
    trainset = music_data.build_full_trainset()
    algo.train(trainset)

    # Print a prediction and the song name for every song that
    # inner-user 4 has rated (trainset.ur maps inner uid -> (iid, rating)).
    user_inner_id = 4
    for song, _rating in trainset.ur[user_inner_id]:
        print(
            algo.predict(algo.trainset.to_raw_uid(user_inner_id),
                         algo.trainset.to_raw_iid(song),
                         r_ui=1),
            convertor.get_song_name_by_iid(algo.trainset.to_raw_iid(song)))
    # Persist the trained model for later reuse.
    surprise.dump.dump('./nmf.model', algo=algo)
from surprise import Reader, Dataset
from surprise import NMF, evaluate

# Schema of the ml-100k u.data file: tab-separated user/item/rating/timestamp.
data_reader = Reader(line_format="user item rating timestamp", sep="\t")

# u.data holds the raw ratings we want.
data = Dataset.load_from_file("./ml-100k/u.data", reader=data_reader)

# 5-fold cross-validation of NMF (old surprise <1.0 API).
data.split(n_folds=5)
algorithm = NMF()
evaluate(algorithm, data, measures=["RMSE", "MAE"])

# Retrain on the full data set (old API: train, not fit).
training_set = data.build_full_trainset()
algorithm.train(training_set)

# Predict how user 200 would rate movie 222 (actual rating: 5); the third
# positional argument of predict() is r_ui.
user_id = str(200)
item_id = str(222)
actual_rating = 5

print(algorithm.predict(user_id, item_id, actual_rating))
コード例 #14
0
predict = clf.predict(X_test)
print(
    f"SVM Accuracy Score: {metrics.accuracy_score(Y_test,predict)*100:0.4f}%")

#NMF
# Build the (reviewer, product, rating) frame once; keep a copy for
# iteration since Dataset.load_from_df rebinds `data` (the original
# ran the identical concat twice).
data = pd.concat([df['reviewerID'], df['asin'], df['overall']], axis=1)
data2 = data.copy()

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(data, reader)
NMFModel = NMF()
NMFModel.fit(data.build_full_trainset())

# Predict every known (user, item) pair back — this measures training
# accuracy, not generalisation.
predicted = [NMFModel.predict(uid=row['reviewerID'], iid=row['asin']).est
             for _, row in data2.iterrows()]

true = df['overall'].tolist()
# Count exact matches after rounding the estimate to the nearest star
# (replaces the manual index-based counting loop).
acc = sum(1 for p, t in zip(predicted, true) if int(round(p)) == t)
avg = acc / len(true)

print(f"NMF Accuracy: {avg*100:0.4f}%")

#K-nearest Neighbor
KNModel = KNeighborsClassifier(n_neighbors=3)
KNModel.fit(X_train, Y_train)
コード例 #15
0
# %% Look at the prior on the train data
file_path = "Data/data_train_preprocessed.csv"
data_train = utils.load_data_desired(file_path)

# %% Labels for training
Pred_NotCliped_label = []
Pred_Cliped_label = []
Real_label = []

Clip = False

for line in data_train:
    Real_label.append(line[2])
    Pred_NotCliped_label.append(
        alg.predict(str(line[1]), str(line[0]), clip=False).est)
    Pred_Cliped_label.append(
        alg.predict(str(line[1]), str(line[0]), clip=True).est)

Pred_NotCliped_label = np.array(Pred_NotCliped_label)
Pred_Cliped_label = np.array(Pred_Cliped_label)
Real_label = np.array(Real_label)

# %% Visualization
plt.figure()
plt.hist(Pred_NotCliped_label)
plt.grid()
plt.title('Histogram of Predicted Labels')
plt.xlabel('Label')

plt.figure()
コード例 #16
0
alg.fit(data_train.build_full_trainset())

end = time.time()
print("***********************************************")
print("Exe time:")
print(end - start)

# %% Loading Test Data
file_path = "Data/sample_submission.csv"
data_test = utils.load_data_desired(file_path)

# %% Prediction
# user id comes from line[1], item id from line[0].
Predict_Test = []

for line in data_test:
    Predict_Test.append(alg.predict(str(line[1]), str(line[0])).est)


# %% Save the cross-validation summary. The `with` block guarantees the
# handle is closed (the original opened "Details.txt" and never closed it).
with open("Details.txt", "w") as report:
    report.write("+ Best Score: \n \n")
    report.write(str(Train_CV.best_score) + "\n \n")
    report.write("************************************************************ \n")
    report.write("+ Best Param: \n \n")
    report.write(str(Train_CV.best_params) + "\n \n")
    report.write("************************************************************ \n")
    report.write("+ CV Summary: \n \n")
    report.write(str(Train_CV.cv_results) + "\n \n")
コード例 #17
0
    for hist in testset:
        if hist[0] in user_visiting_hist:
            user_visiting_hist[hist[0]].append(hist[1])
        else:
            user_visiting_hist[hist[0]] = [hist[1]]

    algo.fit(trainset)

    # Making recommendation for each user and all venues in test data set
    precision = 0.0
    recall = 0.0
    k = 20
    for user in user_list:
        est_item_rating = {}
        for item in item_list:
            est_item_rating[item] = algo.predict(user, item, clip=False).est
            # print(algo.predict(user, item, clip=False).est)
        sorted_items_dict = OrderedDict(sorted(est_item_rating.items()))
        sorted_items = list(sorted_items_dict.keys())
        count = 0
        for i in sorted_items[:k]:
            if i in user_visiting_hist[user]:
                count += 1
        precision += count / float(k)
        recall += count / float(len(user_visiting_hist[user]))
    print('precision: ', precision / len(user_list), ' recall: ',
          recall / len(user_list))
    overall_precision += precision / len(user_list)
    overall_recall += recall / len(user_list)
print('overall_precision: ', overall_precision / 5, ' overall_recall: ',
      overall_recall / 5)
コード例 #18
0
def compute_recommendations(user_id, prediction_table, numeric_prediction_table):
    """Train NMF on all known ratings and store top-10 recommendations.

    Trains on ratings from the database plus data/ratings.csv, predicts a
    rating for every item *user_id* has not yet rated, then appends three
    tables: the top-10 item ids (prediction_table), the top-20 rows with
    raw predicted ratings ('numeric_predictions'), and the top-10 predicted
    ratings (numeric_prediction_table). Signature, SQL statements and all
    table/column layouts are unchanged from the original.
    """
    algo = 'NMF'
    algorithm = NMF()

    engine = create_engine(config.DB_URI, echo=True)
    session = scoped_session(sessionmaker(bind=engine,
                                          autocommit=False,
                                          autoflush=False))

    # Combine ratings stored in the DB with the bundled CSV dump.
    df_ratings = pd.read_sql('SELECT * FROM ratings;', con=engine)
    df_ratings = df_ratings[['user_id', 'item_id', 'rating']].dropna().drop_duplicates()

    df_ratings2 = pd.read_csv('data/ratings.csv', low_memory=False)
    df_ratings2 = df_ratings2.rename(columns={'movie_id': 'item_id'})
    df_ratings2 = df_ratings2[['user_id', 'item_id', 'rating']].dropna().drop_duplicates()

    df_ratings = pd.concat([df_ratings, df_ratings2], axis=0)

    reader = Reader(line_format='user item rating', sep=',', rating_scale=(1, 10))
    data = Dataset.load_from_df(df_ratings, reader=reader)
    trainset = data.build_full_trainset()
    # Old surprise (<1.0) API; newer releases call this fit().
    algorithm.train(trainset)

    # Candidate items are those the user has not rated yet.
    items = pd.read_sql('SELECT distinct id FROM items;', con=engine)
    df_user_items = df_ratings.loc[df_ratings['user_id'] == user_id]
    total_items = items.id.unique()
    # Set lookup is O(1); the original scanned an array per item (O(n*m)).
    user_items = set(df_user_items.item_id.unique())
    prediction_items = [x for x in total_items if x not in user_items]

    # .est is the predicted rating (the original indexed the Prediction
    # namedtuple positionally as est[3]).
    predicted_ratings = [algorithm.predict(user_id, item).est
                         for item in prediction_items]

    predictions = pd.DataFrame(columns=['user_id', 'item_id', 'prediction'])
    predictions['item_id'] = prediction_items
    predictions['user_id'] = pd.Series([user_id] * len(predictions.index),
                                       index=predictions.index)
    predictions['prediction'] = predicted_ratings

    # Keep the full sorted frame for the numeric-predictions table, and the
    # top 10 for the recommendation tables.
    predictions = predictions.sort_values('prediction', ascending=False)
    test_prediction = predictions
    predictions = predictions.head(n=10)

    # NOTE(review): the transposes below require exactly 10 surviving
    # candidates; fewer rows make the column assignment fail (as before).
    cols = ['pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5',
            'pred_6', 'pred_7', 'pred_8', 'pred_9', 'pred_10']
    df_pred = predictions[['item_id']].T
    df_pred.columns = cols
    df_pred['id'] = user_id
    df_pred = df_pred[['id'] + cols]
    df_pred['id'] = df_pred['id'].astype(int)
    df_pred.to_sql(prediction_table, engine, if_exists='append', index=False)
    session.commit()

    # Top-20 rows with raw predicted ratings, tagged with the algorithm name.
    df_num_ratings = test_prediction.head(n=20)
    df_num_ratings['algorithm'] = algo
    df_num_ratings.rename(columns={'prediction': 'predicted_rating'}, inplace=True)
    df_num_ratings.to_sql('numeric_predictions', engine, if_exists='append', index=False)
    session.commit()

    # One row of the top-10 predicted ratings for this user.
    predcols = ['num_1', 'num_2', 'num_3', 'num_4', 'num_5',
                'num_6', 'num_7', 'num_8', 'num_9', 'num_10']
    df_num_ratings_transpose = predictions[['prediction']].T
    df_num_ratings_transpose.columns = predcols
    df_num_ratings_transpose['id'] = user_id
    df_num_ratings_transpose = df_num_ratings_transpose[['id'] + predcols]
    df_num_ratings_transpose['id'] = df_num_ratings_transpose['id'].astype(int)
    df_num_ratings_transpose.to_sql(numeric_prediction_table, engine,
                                    if_exists='append', index=False)
    session.commit()
コード例 #19
0
               n_epochs=params['n_epochs'],
               lr_all=params['lr_all'],
               reg_all=params['reg_all'])
algo_SVD.train(data_full)

#%%

# Fill every (movie, user) cell with the model estimates. Raw ids follow
# the 'u<N>' / 'i<N>' naming used at training time.
# FIX: np.float (a deprecated alias of the builtin float) was removed in
# NumPy 1.24 — use float directly; behaviour is identical.
datamat_filled_SVD = datamat_missing.copy().astype(float)
datamat_filled_NMF = datamat_missing.copy().astype(float)
for i in range(0, datamat_full.shape[0]):  # movie
    for j in range(0, datamat_full.shape[1]):  # user

        val = algo_SVD.predict('u%i' % (j + 1), 'i%i' % (i + 1)).est
        datamat_filled_SVD[i, j] = val

        val = algo_NMF.predict('u%i' % (j + 1), 'i%i' % (i + 1)).est
        datamat_filled_NMF[i, j] = val

#%% compute correlations between real and recovered ratings
corvals_SVD = np.zeros(datamat_full.shape[1])
corvals_NMF = np.zeros(datamat_full.shape[1])
corvals_SVD_fancy = np.zeros(datamat_full.shape[1])
corvals_NNM_fancy = np.zeros(datamat_full.shape[1])
corvals_SOFT_fancy = np.zeros(datamat_full.shape[1])
for j in range(0, datamat_full.shape[1]):  # user
    corvals_SVD[j] = np.corrcoef(datamat_full[:, j],
                                 datamat_filled_SVD[:, j])[0, 1]
    corvals_SVD_fancy[j] = np.corrcoef(datamat_full[:, j],
                                       datamat_filled_SVD_fancy[:, j])[0, 1]
    corvals_NMF[j] = np.corrcoef(datamat_full[:, j],
                                 datamat_filled_NMF[:, j])[0, 1]