import sys
from glob import glob

import pandas as pd

import mylib

min_songs_hour = 10
output_listeningHistoryFolder = "../data/process/listeningHistory-hours/"


def doExperiment(lisHistFile, songsetFile):
    df_lisHist = pd.read_csv(lisHistFile, delimiter=";", encoding="UTF-8")  # "ISO-8859-1"
    # debug: inspect the first row and stop here
    print(df_lisHist.head(1))
    sys.exit()


contents_lh = glob(f"{listeningHistoryFolder}*.csv")
contents_lh.sort()
for listeningHistoryFile in contents_lh:
    #df = pd.read_csv(listeningHistoryFile, delimiter=";", encoding="UTF-8")
    #print(df.head())
    df = mylib.loadData(listeningHistoryFile, min_msPlayed, delimiter="\t")
    df.sort_values(by=["date", "time", "msPlayed"], inplace=True)

    # Add a column with the correct playback time
    #df["MY_PLAYED_AT"] = [pd.Timestamp('2017-01-01T12') for i in range(0, df.shape[0])]
    df["played_at"] = [" " for i in range(0, df.shape[0])]
    df["datetime"] = [" " for i in range(0, df.shape[0])]
    for index, row in df.iterrows():
        # Move to the end of the minute and subtract the time actually played
        ##df.at[index, "MY_PLAYED_AT"] = row["endTime"] + pd.Timedelta(seconds=(59 - row["msPlayed"] // 1000))
        #df.at[index, "MY_PLAYED_AT"] = row["endTime"] + pd.Timedelta(seconds=59) - pd.Timedelta(milliseconds=row["msPlayed"])
        played_at_timestamp = row["endTime"] + pd.Timedelta(seconds=59) - pd.Timedelta(milliseconds=row["msPlayed"])
        # Problem: seconds and milliseconds are not printed when they are all 0
        #timeiso = played_at_timestamp.isoformat()
        #df.at[index, "played_at"] = timeiso[:-3] + "Z"
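
# Illustrative sketch (not part of the original script, sample values are made
# up): how the "played_at" reconstruction above behaves on a single row. The
# listening history stores endTime at minute resolution only, so the start of
# playback is approximated as endTime + 59 s - msPlayed.
import pandas as pd

_example_row = {
    "endTime": pd.Timestamp("2020-03-01 21:14:00"),  # minute-resolution end time
    "msPlayed": 215000,                              # ~3 min 35 s of playback
}
_played_at = (_example_row["endTime"]
              + pd.Timedelta(seconds=59)
              - pd.Timedelta(milliseconds=_example_row["msPlayed"]))
print(_played_at.isoformat())  # 2020-03-01T21:11:24 -> approximate playback start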
# In[] load 1M data
import mylib

[data_] = mylib.loadData('../1Mtrain')
train_data = data_[:900000]
valid_data = data_[900000:]

# In[] calculate with a lambda
from collections import defaultdict
from math import exp
import numpy


def getRui(data):
    Rui = {}
    for d in data:
        Rui[(d['reviewerID'], d['itemID'])] = d['rating']
    return Rui


def getIu_Ui(data):
    Iu, Ui = {}, {}
    for d in data:
        if d['reviewerID'] not in Iu:
            Iu[d['reviewerID']] = []
        if d['itemID'] not in Ui:
            Ui[d['itemID']] = []
        Iu[d['reviewerID']].append(d['itemID'])
        Ui[d['itemID']].append(d['reviewerID'])
    return [Iu, Ui]


Rui = getRui(train_data)
[Iu, Ui] = getIu_Ui(train_data)
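
# Illustrative sketch (not from the original assignment, toy data made up):
# what the lookup structures built above contain. Rui maps a (user, item) pair
# to its rating, Iu maps each user to the items they rated, and Ui maps each
# item to the users who rated it.
_toy_data = [
    {'reviewerID': 'U1', 'itemID': 'I9', 'rating': 4.0},
    {'reviewerID': 'U1', 'itemID': 'I7', 'rating': 2.0},
    {'reviewerID': 'U2', 'itemID': 'I9', 'rating': 5.0},
]
_toy_Rui = getRui(_toy_data)           # {('U1', 'I9'): 4.0, ('U1', 'I7'): 2.0, ('U2', 'I9'): 5.0}
[_toy_Iu, _toy_Ui] = getIu_Ui(_toy_data)
# _toy_Iu == {'U1': ['I9', 'I7'], 'U2': ['I9']}
# _toy_Ui == {'I9': ['U1', 'U2'], 'I7': ['U1']}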
def generateSongset(csv_file,
                    output_folder,
                    cluster_method="KM",
                    heuristic_method="LINEAR",
                    min_songs_hour=10,
                    min_ms_played=10000,
                    max_clusters=10,
                    num_tracks=100):
    print(csv_file)
    if ".csv" not in csv_file and ".tsv" not in csv_file:
        print("ERROR: only tsv and csv input files are allowed. Skip file.")
        return

    cluster_method = cluster_method.upper()
    heuristic_method = heuristic_method.upper()

    df = mylib.loadData(csv_file, min_ms_played, delimiter="\t")

    # 3.1
    ntna_ntka = computeNTNA_NTKA(df)

    for time_hour in range(0, 24):
        # (debug) only process hours from 21 onward
        if time_hour < 21:
            continue

        # 3.2 - FILTERING
        df_h = songs_byHour(df, time_hour)
        # Remove duplicate songs
        df_h.drop_duplicates(subset="TrackID", keep="first", inplace=True)

        # Check the number of songs in this time slot
        if df_h.shape[0] < min_songs_hour:
            print(f"* hour {time_hour}: skip ({df_h.shape[0]} songs).")
            continue
        print(f"* hour {time_hour}: {df_h.shape[0]} songs.")

        # 3.3 - CLUSTERING
        df_h_feat = df_h[[
            "Acousticness", "Danceability", "Energy", "Instrumentalness",
            "Key", "Liveness", "Loudeness", "Mode", "Speechiness", "Tempo",
            "Time_signature", "Valence"
        ]]
        if cluster_method == "KM":
            best_clustering = best_k_means(
                df_h_feat, max_clusters,
                "exclude_K_less_4_songs")  # "exclude_cluster_less_4_songs"
            #print(best_clustering)
        elif cluster_method == "FBF":
            print(
                f"ERROR in generateSongset(): {cluster_method} not yet implemented. Exit."
            )
            sys.exit()
        else:
            print(
                f"ERROR in generateSongset(): cluster method {cluster_method} not defined. Exit."
            )
            sys.exit()

        kLength = best_clustering["best-length"]
        numReqs_perPoint = int(num_tracks / (4 * kLength)) + 1
        feature_names = [
            "Acousticness", "Danceability", "Energy", "Speechiness",
            "Instrumentalness", "Liveness", "Valence", "Loudeness", "Tempo",
            "Time_signature", "Key", "Mode"
        ]

        ########################
        ### LINEAR HEURISTIC ###
        ########################
        if heuristic_method == "LINEAR":
            linear_kMeans = linearHeuristic(df_h, best_clustering)

            ###########################
            ### RECOMMENDER SPOTIFY ###
            ###########################
            results = list()
            kIndex = 0
            for df_group in linear_kMeans:
                # FIRST SONG
                print(
                    f"\t{time_hour}) CLUSTER KM #{kIndex}/{kLength} - SONG #0/4"
                )
                firstPoint = df_group.iloc[0]
                trackId = firstPoint["TrackID"]
                # get features
                centroidFeatures_list = firstPoint["kCentroid"]
                features = dict()
                for index in range(0, len(centroidFeatures_list)):
                    features[feature_names[index]] = centroidFeatures_list[index]

                tracks = recommenderGetSongs(trackId,
                                             features,
                                             numReqs_perPoint,
                                             results,
                                             retryLimit=2,
                                             sleepTime=recommender_sleepTime)
                results.extend(tracks)
                #print(len(results))
                #res = recommenderGetSongs("7CDaY0pk8qGFoahgxVVbaX", numReqs_perPoint, list(), retryLimit=2, sleepTime=recommender_sleepTime)

                # OTHER THREE SONGS
                for i in range(1, 4):
                    print(
                        f"\t{time_hour}) CLUSTER KM LINEAR #{kIndex}/{kLength} - SONG #{i}/4"
                    )
                    point = df_group.iloc[i]
                    trackId = point["TrackID"]
                    tracks = recommenderGetSongs(trackId,
                                                 point,
                                                 numReqs_perPoint,
                                                 results,
                                                 retryLimit=2,
                                                 sleepTime=recommender_sleepTime)
                    results.extend(tracks)
                    #print(len(results))
                kIndex += 1

        ########################
        ### SPHERE HEURISTIC ###
        ########################
        elif heuristic_method == "SPHERE":
            sphere_kMeans = sphereHeuristic(df_h, best_clustering)

            ###########################
            ### RECOMMENDER SPOTIFY ###
            ###########################
            results = list()
            for item in sphere_kMeans:
                print(
                    f"\t{time_hour}) {kLength} CLUSTER KM SPHERE - SONG #{item['index']}/{len(sphere_kMeans)}"
                )
                # {"index": minDistSongIndex, "randomPoint": currRandomPoint, "minDistSong": minDistSong, "minDist": minDist}
                randomPoint = item["randomPoint"]
                features = dict()
                for index in range(0, len(randomPoint)):
                    features[feature_names[index]] = randomPoint[index]
                #for index in range(0, len(centroidFeatures_list)):
                #    features[feature_names[index]] = centroidFeatures_list[index]

                clusterMinDistSong = item["minDistSong"]
                trackId = clusterMinDistSong["TrackID"]
                tracks = recommenderGetSongs(trackId,
                                             features,
                                             numReqs_perPoint,
                                             results,
                                             retryLimit=2,
                                             sleepTime=recommender_sleepTime)
                results.extend(tracks)
        else:
            print(
                f"ERROR in generateSongset(): heuristic {heuristic_method} not defined. Exit."
            )
            sys.exit()

        #print(results)
        output_file_start = ntpath.basename(csv_file).replace(".csv", "").replace(".tsv", "")
        saveSongset(results, output_folder, output_file_start, time_hour,
                    kLength, cluster_method, heuristic_method)
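
# Illustrative usage sketch (not part of the original module): how
# generateSongset() above might be driven over a folder of listening-history
# files. The folder paths and the glob pattern are assumptions made for this
# example, not values taken from the project.
if __name__ == "__main__":
    from glob import glob

    history_files = sorted(glob("../data/process/listeningHistory-hours/*.tsv"))  # assumed input location
    for history_file in history_files:
        # one songset per file: k-means clustering + linear heuristic
        generateSongset(history_file,
                        "../data/process/songsets/",  # assumed output folder
                        cluster_method="KM",
                        heuristic_method="LINEAR")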
        pairs_Rating.append(UIO)

rating_test_predict = []
for pr in pairs_Rating:
    if pr[0] in dirty_u.keys():
        rating_test_predict.append(dirty_u[pr[0]])
    elif pr[1] in dirty_i.keys():
        rating_test_predict.append(dirty_i[pr[1]])
    else:
        rating_test_predict.append(alpha + beta_u[pr[0]] + beta_i[pr[1]])

rating_test_result = [[pr[0] + '-' + pr[1], str(ptr)]
                      for pr, ptr in zip(pairs_Rating, rating_test_predict)]
saveCSV('rating_test_result.csv', rating_test_result)

# In[]================================================================================
# In[] main
[data_] = mylib.loadData('./assignment1/data/1Mtrain')
train_data = data_[:900000]
valid_data = data_[900000:]
del data_

dirty_limits = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
sds = [0.3, 0.5, 0.7, 0.9, 1.0, 2.0]
dirty_bound = [[2.6, 4.4], [2.8, 4.2], [3, 4], [3.5, 3.5]]
MSEs = [[[0 for dbi in range(len(dirty_bound))] for si in range(len(sds))]
        for dli in range(len(dirty_limits))]
thetas = [[[0 for dbi in range(len(dirty_bound))] for si in range(len(sds))]
          for dli in range(len(dirty_limits))]

for dli in range(len(dirty_limits)):
    dl = dirty_limits[dli]
    for si in range(len(sds)):
        s = sds[si]
        for dbi in range(len(dirty_bound)):
            db = dirty_bound[dbi]
""" Created on Thu Nov 12 22:56:19 2015 @author: ssc317 """ import mylib import csv filename = '1M_train_rating' pairs_Rating = [] f = open('./pairs_Rating.txt') for line in f: if line.startswith('userID'): pass elif line.startswith('U'): UIO = line.split('-') pairs_Rating.append(UIO) [alpha, beta_u, beta_i] = mylib.loadData(filename) rating_test_predict = [alpha+beta_u[pr[0]]+beta_i[pr[1]] for pr in pairs_Rating] rating_test_result = [[pr[0]+'-'+pr[1].rstrip(), str(ptr)] for pr,ptr in zip(pairs_Rating,rating_test_predict)] def saveCSV(filename, data): f = open(filename, 'wb') writer = csv.writer(f) writer.writerow(['userID-itemID', 'prediction']) for d in data: writer.writerow(d) f.close() saveCSV('rating_test_result.csv',rating_test_result)