def _loadRatings(self, sc, data_file):
    """Load a ratings file and return it as an RDD of ``Rating`` records.

    The file is read with ``sc.textFile``, parsed by ``format_triplets``,
    passed through ``by_max_count``, summarized on stdout, and finally
    mapped to ``Rating`` objects. ``self`` is not used by the body.

    Args:
        sc: Object exposing ``textFile`` (presumably a ``SparkContext`` --
            TODO confirm).
        data_file: Path/URI handed unchanged to ``sc.textFile``.

    Returns:
        The result of mapping each triplet ``t`` to
        ``Rating(t[0], t[1], t[2])``.
    """
    data = sc.textFile(data_file)

    # format_triplets returns a pair; only the second element is used here.
    _, data_triplet = format_triplets(data)
    data_triplet = by_max_count(data_triplet)

    # NOTE(review): each count() below is a separate action on data_triplet;
    # presumably the lineage is re-evaluated each time unless it is
    # persisted upstream -- confirm before adding a cache().
    num_ratings = data_triplet.count()
    num_users = data_triplet.map(lambda r: r[0]).distinct().count()
    num_songs = data_triplet.map(lambda r: r[1]).distinct().count()

    print(100 * '//')
    # The message names songs before users, so the arguments must follow the
    # same order (they were previously swapped, mislabelling both counts).
    print("Got {} ratings, with {} distinct songs and {} distinct users".format(
        num_ratings, num_songs, num_users))
    print(100 * '//')

    train_ratings = data_triplet.map(lambda row: Rating(row[0], row[1], row[2]))
    return train_ratings
def get_ratings(sc, data_file):
    """Load a ratings file and return it as an RDD of ``Rating`` records.

    The file is read with ``sc.textFile``, parsed by ``format_triplets``,
    passed through ``by_max_count``, summarized on stdout, and finally
    mapped to ``Rating`` objects.

    NOTE(review): another ``get_ratings`` with the same signature appears
    later in this source; if both live in the same module the later one
    shadows this definition -- confirm which one is intended to survive.

    Args:
        sc: Object exposing ``textFile`` (presumably a ``SparkContext`` --
            TODO confirm).
        data_file: Path/URI handed unchanged to ``sc.textFile``.

    Returns:
        The result of mapping each triplet ``t`` to
        ``Rating(t[0], t[1], t[2])``.
    """
    data = sc.textFile(data_file)

    # # Normalize start # #
    print('Training normalization started')
    # format_triplets returns a pair; only the second element is used here.
    _, data_triplet = format_triplets(data)
    data_triplet = by_max_count(data_triplet)
    print(' Training normalization ended')
    # # Normalize end # #

    # NOTE(review): each count() below is a separate action on data_triplet;
    # presumably the lineage is re-evaluated each time unless it is
    # persisted upstream -- confirm before adding a cache().
    num_ratings = data_triplet.count()
    num_users = data_triplet.map(lambda r: r[0]).distinct().count()
    num_songs = data_triplet.map(lambda r: r[1]).distinct().count()

    print(100 * '//')
    # The message names songs before users, so the arguments must follow the
    # same order (they were previously swapped, mislabelling both counts).
    print("Got {} ratings, with {} distinct songs and {} distinct users".format(
        num_ratings, num_songs, num_users))
    print(100 * '//')

    train_ratings = data_triplet.map(lambda row: Rating(row[0], row[1], row[2]))
    return train_ratings
def get_ratings(sc, data_file):
    """Load a ratings file and return it as an RDD of ``Rating`` records.

    The file is read with ``sc.textFile``, parsed by ``format_triplets``,
    passed through ``by_max_count``, summarized on stdout, and finally
    mapped to ``Rating`` objects.

    NOTE(review): a ``get_ratings`` with the same signature also appears
    earlier in this source; if both live in the same module this
    definition replaces the earlier one -- confirm the duplication is
    intended.

    Args:
        sc: Object exposing ``textFile`` (presumably a ``SparkContext`` --
            TODO confirm).
        data_file: Path/URI handed unchanged to ``sc.textFile``.

    Returns:
        The result of mapping each triplet ``t`` to
        ``Rating(t[0], t[1], t[2])``.
    """
    data = sc.textFile(data_file)

    # # Normalize start # #
    print('Training normalization started')
    # format_triplets returns a pair; only the second element is used here.
    _, data_triplet = format_triplets(data)
    data_triplet = by_max_count(data_triplet)
    print(' Training normalization ended')
    # # Normalize end # #

    # NOTE(review): each count() below is a separate action on data_triplet;
    # presumably the lineage is re-evaluated each time unless it is
    # persisted upstream -- confirm before adding a cache().
    num_ratings = data_triplet.count()
    num_users = data_triplet.map(lambda r: r[0]).distinct().count()
    num_songs = data_triplet.map(lambda r: r[1]).distinct().count()

    print(100 * '//')
    # The message names songs before users, so the arguments must follow the
    # same order (they were previously swapped, mislabelling both counts).
    print(
        "Got {} ratings, with {} distinct songs and {} distinct users".format(
            num_ratings, num_songs, num_users))
    print(100 * '//')

    train_ratings = data_triplet.map(lambda row: Rating(row[0], row[1], row[2]))
    return train_ratings