def main():
    """Print basic movie statistics and plot a histogram of movie ages.

    Loads the movie records with ``get_movie_data()``, prints the first
    record and the total record count, takes the third ``|``-separated field
    of every record, converts it to a year with ``convert_year`` and plots
    the counts per movie age (``1998 - year``) with matplotlib.
    """
    movie_data = get_movie_data()
    # print() with a single argument behaves the same on Python 2 and 3.
    print(movie_data.first())
    num_movies = movie_data.count()
    print("Movies: %d" % num_movies)

    movie_fields = movie_data.map(lambda line: line.split("|"))
    years = movie_fields.map(lambda fields: fields[2]).map(convert_year)
    # we filter out any 'bad' data points here
    # NOTE(review): 1900 is presumably the placeholder convert_year yields
    # for dates it cannot parse -- confirm against convert_year.
    years_filtered = years.filter(lambda year: year != 1900)

    # plot the movie ages histogram
    movie_ages = years_filtered.map(lambda year: 1998 - year).countByValue()
    values = list(movie_ages.values())
    # Histogram bin edges must increase monotonically, and the key order of
    # the countByValue() result is not guaranteed to be sorted.
    bins = sorted(movie_ages)
    # `density` replaces the deprecated `normed` keyword of hist().
    plt.hist(values, bins=bins, color='lightblue', density=True)
    fig = matplotlib.pyplot.gcf()
    fig.set_size_inches(16, 10)
    plt.show()
def main():
    """Print basic movie statistics and plot a histogram of movie ages.

    Loads the movie records with ``get_movie_data()``, prints the first
    record, the record count and the number of fields in the first record,
    then takes the third ``|``-separated field of every record, converts it
    to a year with ``convert_year`` and plots the counts per movie age
    (``1998 - year``) with matplotlib. The bin edges are printed before
    plotting.
    """
    movie_data = get_movie_data()
    print(movie_data.first())
    num_movies = movie_data.count()
    print("Movies: %d" % num_movies)

    # Register convert_year as a SQL UDF. A SQL/DataFrame based extraction of
    # the year column was abandoned because of a bug in pyspark 2.0.0, so the
    # RDD API is used below instead:
    # https://issues.apache.org/jira/browse/SPARK-17538
    # NOTE(review): `spark` is expected to be defined at module level; it is
    # not visible from this function -- confirm.
    spark.udf.register("convert_year", convert_year)

    movie_fields = movie_data.map(lambda line: line.split("|"))
    print(len(movie_fields.first()))
    years = movie_fields.map(lambda fields: fields[2]).map(convert_year)
    # we filter out any 'bad' data points here
    # NOTE(review): 1900 is presumably the placeholder convert_year yields
    # for dates it cannot parse -- confirm against convert_year.
    years_filtered = years.filter(lambda year: year != 1900)

    # plot the movie ages histogram
    movie_ages = years_filtered.map(lambda year: 1998 - year).countByValue()
    values = list(movie_ages.values())
    # Sort the distinct ages on the driver instead of sorting the whole RDD
    # beforehand: histogram bin edges must increase monotonically, and the
    # order of the values passed to hist() does not affect the result.
    bins = sorted(movie_ages)
    print("")
    print(bins)
    plt.hist(values, bins=bins, color='lightblue', density=True)
    plt.xticks(fontsize='12')
    plt.show()
def main():
    """Print summary statistics for the rating data.

    Prints the first raw rating record and the number of ratings, then the
    minimum, maximum, mean and median rating together with the average number
    of ratings per user and per movie. User and movie totals come from
    ``get_user_data()`` and ``get_movie_data()``.
    """
    raw_ratings = get_rating_data()
    print(raw_ratings.first())
    num_ratings = raw_ratings.count()
    print("Ratings: %d" % num_ratings)

    num_movies = get_movie_data().count()
    num_users = get_user_data().count()

    # Records are tab-separated; the third field is parsed as the rating.
    ratings = raw_ratings.map(lambda line: int(line.split("\t")[2]))

    max_rating = ratings.reduce(max)
    min_rating = ratings.reduce(min)
    rating_total = ratings.reduce(lambda left, right: left + right)
    mean_rating = rating_total / float(num_ratings)
    median_rating = np.median(ratings.collect())
    ratings_per_user = num_ratings / num_users
    ratings_per_movie = num_ratings / num_movies

    # (format string, value) pairs, printed in report order.
    report = (
        ("Min rating: %d", min_rating),
        ("Max rating: %d", max_rating),
        ("Average rating: %2.2f", mean_rating),
        ("Median rating: %d", median_rating),
        ("Average # of ratings per user: %2.2f", ratings_per_user),
        ("Average # of ratings per movie: %2.2f", ratings_per_movie),
    )
    for template, value in report:
        print(template % value)