class WarezbbDataAnalyser(): """This class is used to analyse the process warezbb data""" def __init__(self): self.filename = "data/processedWarezbbMovieFile.csv" self.my_analyser = Analyser(self.filename) self.my_printer = MyPrinter() self.years = ['2015', '2014', '2013', '2012', '2011', '2010', '2009'] def get_col(self, col_name): """returns a col for a given col name""" return self.my_analyser.get_col(col_name) def get_post_dates(self): """get all the posts dates""" my_dict = self.my_analyser.count_field('post_date', return_as_dict=True) new_dict = {} for date in my_dict: new_date = date[:16] if new_date in new_dict: new_dict[new_date] += my_dict[date] else: new_dict[new_date] = my_dict[date] return new_dict def get_posts_dates_by_year(self): """"gets all the posts sorted by year""" return self.my_analyser.count_field('year', return_as_dict=True) def get_author_counts(self): """get all the author counts""" return self.my_analyser.count_field('author', return_as_dict=True) def get_author_total_views(self): """Get all the views an author has ever had""" author_views = self.my_analyser.count_two_fields('author', 'views') author_total_views = {} for author in author_views: my_sum = 0 for view in author_views[author]: my_sum += int(view.replace(',', '')) author_total_views[author] = my_sum return author_total_views def caculate_author_view_averages(self): """Caculates the average amount of views each author has""" author_count = self.get_author_counts() author_views = self.get_author_total_views() author_average_views = {} for author in author_views: try: author_average_views[ author] = author_views[author] / author_count[author] except: author_average_views[author] = 0 return author_average_views def author_rank_by_no_of_views(self): """caculates the rank of the authors rank 1 = author with most threads rank n = author with least threads""" count_dict = self.get_author_total_views() sorted_dict = sorted(count_dict.items(), key=operator.itemgetter(1), reverse=True) my_array = [] for author in sorted_dict: my_array.append([author[1], author[0]]) return my_array def author_rank_by_average_views(self): """caculates the rank of the authors rank 1 = author with most threads rank n = author with least threads""" count_dict = self.caculate_author_view_averages() sorted_dict = sorted(count_dict.items(), key=operator.itemgetter(1), reverse=True) my_array = [] for author in sorted_dict: my_array.append([author[1], author[0]]) return my_array def caculate_author_rank(self): """ caculates author rank and information about the author """ total_views = self.get_author_total_views() total_replies = self.get_author_total_replies() total_threads = self.get_author_counts() average_views = self.caculate_author_replies_averages() average_replies = self.caculate_author_view_averages() qualities = self.get_author_qualities_count() unique = self.get_movies_for_author() #rank 1 = author with most threads # quality_dict = {"cam":0,"vhs":0,"dvd":0,"web":0,"hd":0,"not given":0} sorted_dict = sorted(total_threads.items(), key=operator.itemgetter(1), reverse=True) my_array = [] for x in range(0, len(sorted_dict)): # [author, rank, total_threads, total views # total_replies, average_replies, average_views ] author = sorted_dict[x][0] if author == "][,o0k~": print "f**k this n***a" else: my_array.append([ sorted_dict[x][0], x, sorted_dict[x][1], total_views[author], total_replies[author], average_views[author], average_replies[author], qualities[author]["hd"], qualities[author]["web"], qualities[author]["dvd"], qualities[author]["cam"], qualities[author]["vhs"], qualities[author]["n/a"], unique[author] ]) return my_array def caculate_author_rank_by_no_of_replies(self): """caculates the rank of the authors rank 1 = author with most threads rank n = author with least threads""" count_dict = self.get_author_total_replies() sorted_dict = sorted(count_dict.items(), key=operator.itemgetter(1), reverse=True) myArray = [] for author in sorted_dict: myArray.append([author[1], author[0]]) return myArray def caculate_author_rank_by_no_of_threads(self): """caculates the rank of the authors rank 1 = author with most threads rank n = author with least threads""" count_dict = self.get_author_counts() sorted_dict = sorted(count_dict.items(), key=operator.itemgetter(1), reverse=True) myArray = [] for author in sorted_dict: myArray.append([author[1], author[0]]) return myArray def print_table(self): """Year, # of authors, # of posts, # of movies, # of verions""" # self.get_author_posts_in_year myDict = {} for year in self.years: myDict[year] = [year] author_posts = self.get_author_posts_in_year(year) myDict[year].append(len(author_posts)) sum = 0 for author in author_posts: sum += author_posts[author] myDict[year].append(sum) movies = self.get_total_movies_in_year(year) myDict[year].append(len(movies)) movie_quality = self.get_movies_with_detected_quality_in_year(year) for movie in movie_quality: if len(movie_quality[movie]) > 1: movie_quality[movie] = set(movie_quality[movie].split(",")) quality_list = [] for quality in movie_quality[movie]: t = self.my_analyser.get_quality_type(quality) if t not in quality_list: quality_list.append(t) movie_quality[movie] = len(quality_list) sum = 0 for movie in movie_quality: if type(movie_quality[movie]) is int: sum += movie_quality[movie] else: sum += 1 myDict[year].append(sum) self.my_analyser.make_table(myDict, "warezbbTable.csv") def get_movies_with_detected_quality_in_year(self, year): movie_quality = self.my_analyser.count_two_fields_matching_third_field( 'movie title', 'detected_quality', 'year', year) my_dict = {} for movie in movie_quality: my_dict[movie] = movie_quality[movie][0] for i in range(1, len(movie_quality[movie])): my_dict[movie] = my_dict[movie] + "," + movie_quality[movie][i] return my_dict def get_author_total_replies(self): """get all the replies an author has ever had""" author_replies = self.my_analyser.count_two_fields('author', 'replies') author_total_replies = {} for author in author_replies: my_sum = 0 for view in author_replies[author]: my_sum += int(view.replace(',', '')) author_total_replies[author] = my_sum return author_total_replies def caculate_author_replies_averages(self): """Caculates the average amount of replies each author has""" author_count = self.get_author_counts() author_replies = self.get_author_total_replies() author_average_replies = {} for author in author_replies: try: author_average_replies[ author] = author_replies[author] / author_count[author] except: author_average_replies[author] = 0 pass return author_average_replies def get_author_qualities_count(self): quality_dict = { "cam": 0, "vhs": 0, "dvd": 0, "web": 0, "hd": 0, "n/a": 0 } myArray = self.my_analyser.count_two_fields('author', 'detected_quality', split_by_comma=False) authorDict = {} for author in myArray: array_of_qua = myArray[author] for qua in array_of_qua: qualities = qua.split(",") for q in qualities: q_dict = {} if author in authorDict: q_dict = authorDict[author] else: q_dict = { "cam": 0, "vhs": 0, "dvd": 0, "web": 0, "hd": 0, "n/a": 0 } t = self.my_analyser.get_quality_type(q) q_dict[t] += 1 authorDict[author] = q_dict return authorDict def get_author_posts_in_year(self, year): author_count = self.my_analyser.count_two_fields_matching_value( 'author', 'year', year) return author_count def get_author_total_replies_in_year(self, year): author_replies = self.my_analyser.count_two_fields_matching_third_field( 'author', 'replies', 'year', year) author_total_replies = {} for author in author_replies: my_sum = 0 for view in author_replies[author]: my_sum += int(view.replace(',', '')) author_total_replies[author] = my_sum return author_total_replies def get_qualities(self): """ gets the number of qualties in warezbb """ qualities = self.get_col("detected_quality") formated_list = [] for quality in qualities: for q in quality.split(","): formated_list.append(q) quality_dict = {} for quality in formated_list: quality_type = self.my_analyser.get_quality_type(quality) if quality_type in quality_dict: quality_dict[quality_type] += 1 else: quality_dict[quality_type] = 1 return quality_dict def caculate_movies_by_quality(self): """ gets movies with qualities information """ movie_quality = self.my_analyser.count_two_fields( "movie title", "detected_quality") hd_movie = {} cam_movie = {} vhs_movie = {} web_movie = {} dvd_movie = {} for movie in movie_quality: for quality in movie_quality[movie]: for qua in quality.split(","): t = self.my_analyser.get_quality_type(qua) if t == "hd": if movie in hd_movie: hd_movie[movie] += 1 else: hd_movie[movie] = 1 if t == "cam": if movie in cam_movie: cam_movie[movie] += 1 else: cam_movie[movie] = 1 if t == "vhs": if movie in vhs_movie: vhs_movie[movie] += 1 else: vhs_movie[movie] = 1 if t == "web": if movie in web_movie: web_movie[movie] += 1 else: web_movie[movie] = 1 if t == "dvd": if movie in dvd_movie: dvd_movie[movie] += 1 else: dvd_movie[movie] = 1 self.my_printer.print_dict_to_csv(hd_movie, "hd_movies.csv") self.my_printer.print_dict_to_csv(cam_movie, "cam_movies.csv") self.my_printer.print_dict_to_csv(vhs_movie, "vhs_movies.csv") self.my_printer.print_dict_to_csv(web_movie, "web_movies.csv") self.my_printer.print_dict_to_csv(dvd_movie, "dvd_movies.csv") def get_movies_for_author(self): """ get top five authors """ authors = self.get_author_counts() cols = self.my_analyser.count_two_fields('author', 'movie title', split_by_comma=False) new_cols = {} for author in authors: col = cols[author] movie_dict = Counter(col) new_cols[author] = len(set(movie_dict)) return new_cols def print_qualities(self): self.my_printer.print_dict_to_csv(self.get_qualities(), 'total_quality_types.csv') def print_total_post_dates(self): self.my_printer.print_dict_to_csv(self.get_post_dates(), 'alltime_posts.csv') def print_total_posts_by_year(self): self.my_printer.print_dict_to_csv(self.get_posts_dates_by_year(), 'alltime_posts_by_year.csv') def print_total_author_count(self): self.my_printer.print_dict_to_csv(self.get_author_counts(), 'alltime_author_count.csv') def print_author_view_averages(self): self.my_printer.print_dict_to_csv(self.caculate_author_view_averages(), 'alltime_author_view_averages.csv') def print_logged_author_rank_by_no_of_views(self): ranks = self.author_rank_by_no_of_views() self.my_printer.print_logged_array_to_csv( ranks, "loggedauthorRankbyviews.csv") def print_logged_author_rank_by_no_of_replies(self): ranks = self.caculate_author_rank_by_no_of_replies() self.my_printer.print_logged_array_to_csv( ranks, "loggedauthorRankbyreplies.csv") def print_logged_author_rank_by_no_of_threads(self): ranks = self.caculate_author_rank_by_no_of_threads() self.my_printer.print_logged_array_to_csv( ranks, "loggedauthorRankbythreads.csv") def print_author_rank_by_no_of_views(self): ranks = self.author_rank_by_no_of_views() self.my_printer.print_array_to_csv(ranks, "authorRankbyviews.csv") def print_author_rank_by_no_of_average_views(self): ranks = self.author_rank_by_average_views() self.my_printer.print_array_to_csv(ranks, "authorRankbyviews.csv") def print_author_rank_by_no_of_replies(self): ranks = self.caculate_author_rank_by_no_of_replies() self.my_printer.print_array_to_csv(ranks, "authorRankbyreplies.csv") def print_author_rank_by_no_of_threads(self): ranks = self.caculate_author_rank_by_no_of_threads() self.my_printer.print_array_to_csv(ranks, "authorRankbythreads.csv") def print_author_total_views(self): self.my_printer.print_dict_to_csv(self.get_author_total_views(), 'alltime_author_views_total.csv') def print_author_total_replies(self): self.my_printer.print_dict_to_csv(self.get_author_total_replies(), 'alltime_author_replies_total.csv') def print_author_replies_averages(self): self.my_printer.print_dict_to_csv( self.caculate_author_replies_averages(), "alltime_author_replies_averages.csv") def print_author_posts_all_years(self): for year in self.years: self.print_author_posts_in_year(year) def print_author_replies_all_years(self): for year in self.years: self.print_author_total_replies_in_year(year) def print_author_posts_in_year(self, year): self.my_printer.print_dict_to_csv(self.get_author_posts_in_year(year), year + '_author_count.csv') def print_author_total_replies_in_year(self, year): self.my_printer.print_dict_to_csv( self.get_author_total_replies_in_year(year), year + "_author_replies_total.csv") def print_author_rank(self): header = [ 'author', 'rank', 'total_threads', 'total_views', 'total_replies', 'average_replies', 'average_views', "hd", "web", "dvd", "cam", "vhs", "not given", "unique content" ] myArray = self.caculate_author_rank() self.my_printer.print_array_to_csv_with_header(myArray, header, "author_rank.csv")
class KickassDataAnalyser(): """ This script is used to analyse the processed kickass metadata Results of the queries in this class should be printed out to a .csv in the same directory """ def __init__(self, filename): self.filename = filename self.kickass_analyser = Analyser(self.filename) self.printer = MyPrinter() def get_number_of_authors(self, newfilename): """Gets number of authors that posted for every day/year (depending on post_date format).""" date_dictionary = self.kickass_analyser.count_two_fields( 'post_date', 'author') for date in date_dictionary: date_dictionary[date] = len(set(date_dictionary[date])) self.printer.print_dict_to_csv(date_dictionary, newfilename) def get_number_of_posts(self, newfilename): """Gets number of posts made for every day/year (depending on post_date format).""" date_dictionary = self.kickass_analyser.count_field( 'post_date', return_as_dict=True) self.printer.print_dict_to_csv(date_dictionary, newfilename) def get_number_of_downloads(self, newfilename): """Gets the number of donwloads made for every day/year (depending on post_date format).""" downloads_dictionary = self.kickass_analyser.count_two_fields( 'post_date', 'downloads') for post_date in downloads_dictionary: sum = 0 for item in downloads_dictionary[post_date]: sum += int(item) downloads_dictionary[post_date] = sum self.printer.print_dict_to_csv(downloads_dictionary, newfilename) def get_author_download_averages(self, newfilename): """Get average number of downloads per author.""" author_count = self.kickass_analyser.count_field('author', return_as_dict=True) author_downloads = self.kickass_analyser.count_two_fields( 'author', 'downloads') author_average_downloads = {} for author in author_downloads: sum = 0 for item in author_downloads[author]: sum += int(item) author_downloads[author] = sum for author in author_downloads: if author in author_count: author_average_downloads[ author] = author_downloads[author] / author_count[author] self.printer.print_dict_to_csv(author_average_downloads, newfilename) def get_year_download_averages(self, newfilename): """Get the average download numbers per year.""" year_count = self.kickass_analyser.count_field('post_date', return_as_dict=True) year_downloads = self.kickass_analyser.count_two_fields( 'post_date', 'downloads') for year in year_downloads: sum = 0 for item in year_downloads[year]: sum += int(item) year_downloads[year] = sum year_average_downloads = {} for year in year_downloads: year_average_downloads[ year] = year_downloads[year] / year_count[year] self.printer.print_dict_to_csv(year_average_downloads, newfilename) def total_number_of_downloads_per_author(self, newfilename): """Get the total number of downloads per author.""" author_view_dictionary = self.kickass_analyser.count_two_fields( 'author', 'downloads') for author in author_view_dictionary: sum = 0 for item in author_view_dictionary[author]: sum += int(item) author_view_dictionary[author] = sum self.printer.print_dict_to_csv(author_view_dictionary, newfilename) return author_view_dictionary def total_number_of_posts_per_author(self, newfilename): """Get the total number of posts per author.""" author_post_dictionary = self.kickass_analyser.count_field( 'author', return_as_dict=True) self.printer.print_dict_to_csv(author_post_dictionary, newfilename) def get_reputation_per_author(self, newfilename): """Get the reputation value per author.""" author_reputation_dictionary = self.kickass_analyser.count_field_unique( 'author', 'author_reputation') for reputation in author_reputation_dictionary: if author_reputation_dictionary[reputation] == "N/A": author_reputation_dictionary[reputation] = "0" self.printer.print_dict_to_csv(author_reputation_dictionary, newfilename) def count_unique_movies(self, newfilename): """Get the number of unique movies per day/year (depending on post_date format).""" year_movie_dictionary = self.kickass_analyser.count_two_fields( 'post_date', 'title') movie_count_per_year_dictionary = {} for year in year_movie_dictionary: movie_count_per_year_dictionary[year] = len( set(year_movie_dictionary[year])) self.printer.print_dict_to_csv(movie_count_per_year_dictionary, newfilename) def count_unique_content(self, newfilename): """Get the number of unique versions of movies per day/year (depending on post_date format).""" year_movie_dictionary = self.kickass_analyser.count_two_fields( 'post_date', 'title') for year in year_movie_dictionary: movie_quality_dictionary = self.kickass_analyser.count_two_fields_matching_third_field( 'title', 'detected_quality', 'post_date', year) for movie in movie_quality_dictionary: movie_quality_dictionary[movie] = set( movie_quality_dictionary[movie]) year_movie_dictionary[year] = movie_quality_dictionary content_count_per_year_dictionary = {} for year in year_movie_dictionary: number_of_unique_content = 0 movie_quality_dict = year_movie_dictionary[year] for movie in movie_quality_dict: quality_set = set() for quality in movie_quality_dict[movie]: quality_set.add( self.kickass_analyser.get_quality_type(quality)) number_of_unique_content = number_of_unique_content + len( quality_set) content_count_per_year_dictionary[year] = number_of_unique_content self.printer.print_dict_to_csv(content_count_per_year_dictionary, newfilename) def count_qualities(self, newfilename): """Count the number of quality types that each movie is uploaded in.""" movie_quality = self.kickass_analyser.count_two_fields( "title", "detected_quality") hd_movie = {} cam_movie = {} vhs_movie = {} web_movie = {} dvd_movie = {} for movie in movie_quality: for quality in movie_quality[movie]: for qua in quality.split(","): t = self.kickass_analyser.get_quality_type( qua ) # Assign the quality into one of HD,DVD,Web,VHS,Cam if t == "hd": if movie in hd_movie: hd_movie[movie] += 1 else: hd_movie[movie] = 1 if t == "cam": if movie in cam_movie: cam_movie[movie] += 1 else: cam_movie[movie] = 1 if t == "vhs": if movie in vhs_movie: vhs_movie[movie] += 1 else: vhs_movie[movie] = 1 if t == "web": if movie in web_movie: web_movie[movie] += 1 else: web_movie[movie] = 1 if t == "dvd": if movie in dvd_movie: dvd_movie[movie] += 1 else: dvd_movie[movie] = 1 # A separate csv file is created to show how many times each movie is uploaded for a particular quality self.printer.print_dict_to_csv(hd_movie, "hd_movies.csv") self.printer.print_dict_to_csv(cam_movie, "cam_movies.csv") self.printer.print_dict_to_csv(vhs_movie, "vhs_movies.csv") self.printer.print_dict_to_csv(web_movie, "web_movies.csv") self.printer.print_dict_to_csv(dvd_movie, "dvd_movies.csv")