def plot_exponent(series, ax, num_of_bins, color, label):
    """Draw the distribution of ``series`` and its binned version on ``ax``.

    The raw distribution returned by ``getDistribution`` is drawn as faint
    markers; the output of ``getBinnedDistribution`` is drawn on top as a
    connected curve with error bars.

    Args:
        series: Values handed to ``getDistribution``.
        ax: Axes-like object providing ``plot`` and ``errorbar``.
        num_of_bins: Bin count forwarded to ``getBinnedDistribution``.
        color: Single-letter matplotlib colour code; it is concatenated with
            ``'o'`` to build the format string of the scatter.
        label: Legend label of the binned curve.
    """
    xs, ps = getDistribution(series, True)
    edges, binned_ps, binned_err = getBinnedDistribution(xs, ps, num_of_bins)

    # Raw points, nearly transparent so the binned curve stays readable.
    ax.plot(xs, ps, color + 'o', alpha=0.1, markersize=7)

    # x positions are midpoints of consecutive entries of the first binned
    # output (presumably the bin edges -- confirm against the helper).
    centers = (edges[1:] + edges[:-1]) / 2
    ax.errorbar(
        centers,
        binned_ps,
        yerr=binned_err,
        color=color,
        fmt='o-',
        alpha=0.9,
        capsize=3,
        elinewidth=1,
        linewidth=3,
        label=label)
def plot_measure(average_ratings_year,
                 title,
                 num_of_bins,
                 ax,
                 color,
                 label,
                 music=False):
    """Plot a yearly measure and its binned trend on ``ax``.

    Args:
        average_ratings_year: Raw input handed to ``get_yearly_avg_data``.
        title: Axes title.
        num_of_bins: Bin count forwarded to ``getBinnedDistribution``.
        ax: Axes-like object to draw on.
        color: Single-letter matplotlib colour code (prefixed to the format
            strings).
        label: Legend label of the binned curve.
        music: When true, the unbinned per-point curve with its error bars is
            drawn underneath the binned curve as well.
    """
    xs, ys, ys_err = get_yearly_avg_data(average_ratings_year)
    edges, binned, binned_err = getBinnedDistribution(xs, ys, num_of_bins)

    ax.set_title(title, fontsize=20)

    if music:
        # Extra layer for music only: the raw (unbinned) curve.
        ax.errorbar(
            xs,
            ys,
            yerr=ys_err,
            fmt=color + '-',
            alpha=0.5,
            capsize=3,
            elinewidth=1,
            linewidth=2)

    # The binned curve is drawn identically in both modes.
    ax.errorbar(
        (edges[1:] + edges[:-1]) / 2,
        binned,
        yerr=binned_err,
        fmt=color + 'o-',
        alpha=0.6,
        capsize=3,
        elinewidth=1,
        linewidth=3,
        label=label)

    # NOTE(review): the x-range is applied in both modes here; confirm this
    # matches the intended layout for the music panels.
    ax.set_xlim([1880, 2020])
def plot_ccdf(file_avg_all, num_of_bins, ax, color, label, Nmin, title,
              marker):
    """Plot the binned curve parsed from ``file_avg_all`` and return its R^2.

    Args:
        file_avg_all: Input handed to ``parse_N_star_N_data``.
        num_of_bins: Bin count forwarded to ``getBinnedDistribution``.
        ax: Axes-like object to draw on.
        color: Single-letter matplotlib colour code (prefixed to the format
            string).
        label: Legend label; the R^2 value is appended to it.  When it
            contains ``'orig'`` the axes title is set as well.
        Nmin: Threshold forwarded to ``parse_N_star_N_data``.
        title: Title text, prefixed with the individual count from the parser.
        marker: Marker style of the curve.

    Returns:
        The ``r_square`` value reported by ``parse_N_star_N_data``.
    """
    # The third parsed field is not used by the active plotting code.
    xs, ps, _career_len, r_square, num_individuals = parse_N_star_N_data(
        file_avg_all, Nmin)

    # Only curves whose label contains 'orig' set the panel title.
    if 'orig' in label:
        ax.set_title(str(num_individuals) + ' ' + title, fontsize=19)

    edges, binned, binned_err = getBinnedDistribution(
        np.asarray(xs), np.asarray(ps), num_of_bins)

    centers = (edges[1:] + edges[:-1]) / 2
    legend_text = label + ' $R^2 = $' + str(round(r_square, 5))
    ax.errorbar(
        centers,
        binned,
        yerr=binned_err,
        fmt=color + '-',
        linewidth=1,
        markersize=9,
        label=legend_text,
        marker=marker,
        alpha=0.7)

    return r_square
def get_r_model_curves(data_file,
                       max_data_file,
                       ax,
                       label,
                       num_of_bins,
                       title,
                       xlabel,
                       ylabel,
                       log=False):
    """Compare observed career maxima with a shuffled ("R-model") baseline.

    The observed (career length, maximum) pairs are drawn as a light scatter
    and as a binned curve.  The baseline is built by shuffling all values of
    ``data_file`` 100 times, cutting each shuffled list into consecutive
    chunks of the observed career lengths, and averaging the chunk maxima per
    distinct career length.

    Args:
        data_file: Text file with one float per line.
        max_data_file: Tab-separated text file with two floats per line
            (maximum, career length); lines containing ``'nan'`` are skipped.
        ax: Axes-like object to draw on.
        label: Unused; kept for interface compatibility.
        num_of_bins: Bin count forwarded to the binning helpers.
        title: Axes title.
        xlabel: x-axis label.
        ylabel: y-axis label.
        log: When true, both axes are logarithmic and
            ``getLogBinnedDistribution`` is used instead of
            ``getBinnedDistribution``.

    Raises:
        ValueError: If a value cannot be parsed as float, or if a chunk is
            empty (zero career length, or fewer values in ``data_file`` than
            the career lengths add up to).
    """
    ax.set_title(title, fontsize=19)
    ax.set_xlabel(xlabel, fontsize=17)
    ax.set_ylabel(ylabel, fontsize=17)

    # Context managers close the input files deterministically.
    with open(data_file) as handle:
        data = [float(line.strip()) for line in handle]
    with open(max_data_file) as handle:
        rows = [[float(num) for num in line.strip().split('\t')]
                for line in handle
                if 'nan' not in line]
    (data_max, career_len) = zip(*rows)

    ax.plot(
        career_len,
        data_max,
        marker='o',
        color='lightgrey',
        alpha=0.15,
        linewidth=0)

    career_max_dict = {}
    for _ in range(100):
        data_new = data[:]
        random.shuffle(data_new)
        # Walk through the shuffled list with a moving offset rather than
        # deleting the consumed prefix each time (which copies the remainder
        # of the list on every step).  Career lengths are assumed to be
        # non-negative.
        start = 0
        for leng in career_len:
            stop = start + int(leng)
            chunk_max = max(data_new[start:stop])
            career_max_dict.setdefault(leng, []).append(chunk_max)
            start = stop

    sorted_len = sorted(set(career_len))
    career_max = [np.mean(career_max_dict[s]) for s in sorted_len]
    # Single-argument form prints the same text on Python 2 and Python 3.
    print('%d %d' % (len(sorted_len), len(career_max)))

    if log:
        ax.set_xscale('log')
        ax.set_yscale('log')
        xb_data, pb_data, pberr_data = getLogBinnedDistribution(
            np.asarray(career_len), np.asarray(data_max), num_of_bins)
        xb_gen, pb_gen, pberr_gen = getLogBinnedDistribution(
            np.asarray(sorted_len), np.asarray(career_max), num_of_bins)
        ax.errorbar(
            xb_data,
            pb_data,
            yerr=pberr_data,
            fmt='o-',
            color='grey',
            label='data',
            alpha=0.9)
        ax.errorbar(
            xb_gen,
            pb_gen,
            yerr=pberr_gen,
            fmt='-',
            color='r',
            label='R-model',
            alpha=0.9)
    else:
        xb_data, pb_data, pberr_data = getBinnedDistribution(
            np.asarray(career_len), np.asarray(data_max), num_of_bins)
        xb_gen, pb_gen, pberr_gen = getBinnedDistribution(
            np.asarray(sorted_len), np.asarray(career_max), num_of_bins)
        # In linear binning the x positions are midpoints of consecutive
        # entries of the first output.
        ax.errorbar(
            (xb_data[1:] + xb_data[:-1]) / 2,
            pb_data,
            yerr=pberr_data,
            fmt='o-',
            color='grey',
            label='data',
            alpha=0.9)
        ax.errorbar(
            (xb_gen[1:] + xb_gen[:-1]) / 2,
            pb_gen,
            yerr=pberr_gen,
            fmt='-',
            color='r',
            label='R-model',
            alpha=0.9)
def _read_rounded_values(path, ndigits=None):
    """Read one float per line from ``path`` and round every value.

    With ``ndigits=None`` the one-argument form ``round(value)`` is used.
    The file is closed before returning.
    """
    with open(path) as handle:
        if ndigits is None:
            return [round(float(line.strip())) for line in handle]
        return [round(float(line.strip()), ndigits) for line in handle]


def _career_count_label(folder):
    """Return the number of entries in ``folder`` in thousands, e.g. '12k'."""
    return str(int(round(len(os.listdir(folder)) / 1000.0))) + 'k'


def get_imapct_distr():
    """Plot the normalized impact distributions and save them as a PNG.

    A 2x3 figure is built.  Five panels show, per film profession, the
    distribution of average rating, rating count, metascore, critic-review
    count and user-review count.  The sixth panel shows the play-count
    distribution per music genre.  Inputs are read from
    ``ProcessedDataNormalized/1_impact_distributions``; career counts for the
    legends come from the ``Data/Film`` and ``Data/Music`` folders.  The
    figure is written to ``impact_distributions_normalized.png`` and closed.
    """
    professions = [('director', 'k'), ('producer', 'b'), ('writer', 'r'),
                   ('composer', 'g'), ('art-director', 'y')]
    num_of_bins = 20
    title_font = 25

    seaborn.set_style('white')
    f, ax = plt.subplots(2, 3, figsize=(25, 15))
    f.suptitle("IMDb normalized impact distributions", fontsize=title_font)

    FOLDER = 'ProcessedDataNormalized'
    dist_prefix = FOLDER + '/1_impact_distributions/'

    # ------------------------------ film ------------------------------
    for (label, color) in professions:
        num_car = _career_count_label('Data/Film/film-' + label +
                                      '-simple-careers')

        average_ratings = np.asarray(
            _read_rounded_values(
                dist_prefix + 'film_average_ratings_dist_' + label + '.dat',
                2))
        rating_counts = _read_rounded_values(
            dist_prefix + 'film_rating_counts_dist_' + label + '.dat', 2)
        metascores = _read_rounded_values(
            dist_prefix + 'film_metascores_dist_' + label + '.dat', 1)
        critic_review = _read_rounded_values(
            dist_prefix + 'film_critic_review_dist_' + label + '.dat', 2)
        user_review = _read_rounded_values(
            dist_prefix + 'film_user_review_dist_' + label + '.dat', 2)

        # average rating: raw scatter plus binned curve
        x_average_ratings, p_average_ratings = getDistribution(
            average_ratings, True)
        bx_average_ratings, bp_average_ratings, bperr_average_ratings = \
            getBinnedDistribution(x_average_ratings, p_average_ratings,
                                  num_of_bins)
        ax[0, 0].set_title('IMDb - average rating', fontsize=20)
        ax[0, 0].plot(
            x_average_ratings,
            p_average_ratings,
            color,
            marker='o',
            alpha=0.1,
            linewidth=0,
            label=label + ', ' + num_car)
        ax[0, 0].errorbar(
            (bx_average_ratings[1:] + bx_average_ratings[:-1]) / 2,
            bp_average_ratings,
            yerr=bperr_average_ratings,
            fmt=color + '-',
            linewidth=2)

        # rating counts: log-log scatter only
        x_rating_counts, p_rating_counts = getDistribution(
            rating_counts, True)
        # NOTE(review): the binned curve for this panel is not drawn, so the
        # result is discarded; the call is kept in case the helper validates
        # its input.
        getBinnedDistribution(x_rating_counts, p_rating_counts, num_of_bins)
        ax[0, 1].set_title('IMDb - rating count', fontsize=20)
        ax[0, 1].set_xscale('log')
        ax[0, 1].set_yscale('log')
        ax[0, 1].plot(
            x_rating_counts,
            p_rating_counts,
            color + 'o',
            alpha=0.8,
            label=label)

        # metascores: raw scatter plus binned curve
        x_metascores, p_metascores = getDistribution(metascores, True)
        bx_metascores, bp_metascores, bperr_metascores = \
            getBinnedDistribution(x_metascores, p_metascores, num_of_bins)
        ax[0, 2].set_title('IMDb - metascores', fontsize=20)
        ax[0, 2].plot(
            x_metascores,
            p_metascores,
            color + 'o',
            alpha=0.2,
            label=label)
        ax[0, 2].errorbar(
            (bx_metascores[1:] + bx_metascores[:-1]) / 2,
            bp_metascores,
            yerr=bperr_metascores,
            fmt=color + '-',
            linewidth=2)

        # critic review count: log-log scatter
        x_critic_review, p_critic_review = getDistribution(
            critic_review, True)
        ax[1, 0].set_title('IMDb - critic_review', fontsize=20)
        ax[1, 0].set_xscale('log')
        ax[1, 0].set_yscale('log')
        ax[1, 0].plot(
            x_critic_review,
            p_critic_review,
            color + 'o',
            alpha=0.8,
            label=label)

        # user review count: log-log scatter
        x_user_review, p_user_review = getDistribution(user_review, True)
        ax[1, 1].set_title('IMDb - user_review', fontsize=20)
        ax[1, 1].set_xscale('log')
        ax[1, 1].set_yscale('log')
        ax[1, 1].plot(
            x_user_review,
            p_user_review,
            color + 'o',
            alpha=0.8,
            label=label)

    # ------------------------------ music -----------------------------
    genres = [('electro', 'k'), ('pop', 'b')]
    for (genre, color) in genres:
        num_mus = _career_count_label('Data/Music/music-' + genre +
                                      '-simple-careers')

        playcounts = np.asarray(
            _read_rounded_values(
                dist_prefix + 'music_rating_counts_dist_' + genre + '.dat'))
        x_playcounts, p_playcounts = getDistribution(playcounts, True)
        # Single-argument form prints the same text on Python 2 and 3.
        print(len(playcounts))

        ax[1, 2].set_title('Music - playcount', fontsize=20)
        ax[1, 2].set_xscale('log')
        ax[1, 2].set_yscale('log')
        ax[1, 2].plot(
            x_playcounts,
            p_playcounts,
            color + 'o',
            alpha=0.2,
            label=genre + ', ' + num_mus)

    align_plot(ax)
    plt.savefig('impact_distributions_normalized.png')
    plt.close()
def get_impact_correlations():
    """Plot pairwise correlations between film impact measures.

    For the raw (``''``) and the ``'Normalized'`` data set a 2x3 figure is
    produced from ``film_multiple_impacts_<profession>.dat`` and saved as
    ``correlations_<mode>.png``.  Only the first profession of the list
    (director) is plotted.

    Columns of the tab-separated input, as used by the panels below:
    1 = average rating, 2 = rating count, 3 = metascore, 4 = critic-review
    count, 5 = user-review count.  Fields containing ``'tt'`` are kept as
    strings (presumably the title id in column 0 -- confirm against the
    data files).
    """
    num_of_bins = 12
    title_font = 25
    seaborn.set_style('white')

    professions = [('director', 'royalblue'), ('producer', 'b'),
                   ('writer', 'r'), ('composer', 'g'), ('art-director', 'y')]

    for mode in ['', 'Normalized']:
        f, ax = plt.subplots(2, 3, figsize=(25, 15))
        f.suptitle("IMDb impact correlations - " + mode, fontsize=title_font)

        for (label, color) in professions[0:1]:
            path = ('ProcessedData' + mode +
                    '/7_multiple_impacts/film_multiple_impacts_' + label +
                    '.dat')
            # Close the file deterministically; materialise the transposed
            # columns so that indexing also works where zip() is lazy.
            with open(path) as handle:
                rows = [[
                    float(field) if 'tt' not in field else field
                    for field in line.strip().split('\t')
                ] for line in handle]
            impacts = list(zip(*rows))

            Alpha = 0.05

            # [0, 0]: rating count (x, log) vs average rating (y).
            ax[0, 0].set_ylabel('avg rating', fontsize=20)
            ax[0, 0].set_xlabel('rating cnt', fontsize=20)
            ax[0, 0].set_xscale('log')
            cnt_nz, avg_nz = get_rid_of_zeros(impacts[2], impacts[1])
            # The scatter shows the unfiltered columns; only the binned
            # curve uses the zero-filtered values.
            ax[0, 0].plot(
                impacts[2],
                impacts[1],
                'o',
                color=color,
                alpha=Alpha,
                label=label)
            xb_cnt_avg, pb_cnt_avg, pberr_cnt_avg = getLogBinnedDistribution(
                np.asarray(cnt_nz), np.asarray(avg_nz), num_of_bins)
            ax[0, 0].errorbar(
                xb_cnt_avg, pb_cnt_avg, yerr=pberr_cnt_avg, fmt='^-',
                color='r')

            # [0, 1]: metascore (x) vs average rating (y).  The binned curve
            # is computed from the unfiltered columns, the scatter from the
            # zero-filtered ones.
            x_meta_avg, p_meta_avg, perr_meta_avg = getBinnedDistribution(
                np.asarray(impacts[3]), np.asarray(impacts[1]), num_of_bins)
            meta_nz, avg_nz = get_rid_of_zeros(impacts[3], impacts[1])
            ax[0, 1].set_xlabel('metascore', fontsize=20)
            ax[0, 1].plot(
                meta_nz,
                avg_nz,
                'o',
                color=color,
                alpha=Alpha,
                label=label)
            meta_centers = (x_meta_avg[1:] + x_meta_avg[:-1]) / 2
            # Pearson correlation of the binned curve, shown in the legend.
            corr = stats.pearsonr(meta_centers, p_meta_avg)[0]
            ax[0, 1].errorbar(
                meta_centers,
                p_meta_avg,
                yerr=perr_meta_avg,
                fmt='^-',
                color='r',
                label='$corr=$' + str(round(corr, 4)))

            # [0, 2]: critic-review count (x) vs user-review count (y).
            # Column 4 is plotted on x and column 5 on y; the axis labels
            # follow the column meaning used on panels [1, 1] and [1, 2].
            ax[0, 2].set_xlabel('#critic review', fontsize=20)
            ax[0, 2].set_ylabel('#user review', fontsize=20)
            ax[0, 2].set_xscale('log')
            ax[0, 2].set_yscale('log')
            crit_nz, user_nz = get_rid_of_zeros(impacts[4], impacts[5])
            xb_crit_user, pb_crit_user, pberr_crit_user = \
                getLogBinnedDistribution(
                    np.asarray(crit_nz), np.asarray(user_nz), num_of_bins)
            ax[0, 2].plot(
                crit_nz,
                user_nz,
                'o',
                color=color,
                alpha=Alpha,
                label=label)
            ax[0, 2].errorbar(
                xb_crit_user,
                pb_crit_user,
                yerr=pberr_crit_user,
                fmt='^-',
                color='r')

            # [1, 0]: rating count (x, log) vs metascore (y).
            ax[1, 0].set_xscale('log')
            cnt_nz, meta_nz = get_rid_of_zeros(impacts[2], impacts[3])
            xb_cnt_meta, pb_cnt_meta, pberr_cnt_meta = \
                getLogBinnedDistribution(
                    np.asarray(cnt_nz), np.asarray(meta_nz), num_of_bins)
            ax[1, 0].set_xlabel('rating cnt', fontsize=20)
            ax[1, 0].set_ylabel('metascore', fontsize=20)
            ax[1, 0].plot(
                cnt_nz,
                meta_nz,
                'o',
                color=color,
                alpha=Alpha,
                label=label)
            ax[1, 0].errorbar(
                xb_cnt_meta,
                pb_cnt_meta,
                yerr=pberr_cnt_meta,
                fmt='^-',
                color='r')

            # [1, 1]: rating count vs critic-review count (log-log); the
            # scatter shows the unfiltered columns.
            ax[1, 1].set_xlabel('rating cnt', fontsize=20)
            ax[1, 1].set_ylabel('#critic review', fontsize=20)
            cnt_nz, crit_nz = get_rid_of_zeros(impacts[2], impacts[4])
            xb_cnt_crit, pb_cnt_crit, pberr_cnt_crit = \
                getLogBinnedDistribution(
                    np.asarray(cnt_nz), np.asarray(crit_nz), num_of_bins)
            ax[1, 1].loglog(
                impacts[2],
                impacts[4],
                'o',
                color=color,
                alpha=Alpha,
                label=label)
            ax[1, 1].errorbar(
                xb_cnt_crit,
                pb_cnt_crit,
                yerr=pberr_cnt_crit,
                fmt='^-',
                color='r')

            # [1, 2]: rating count vs user-review count (log-log); the
            # scatter shows the unfiltered columns.
            ax[1, 2].set_xlabel('rating cnt', fontsize=20)
            ax[1, 2].set_ylabel('#user review', fontsize=20)
            cnt_nz, user_nz = get_rid_of_zeros(impacts[2], impacts[5])
            xb_cnt_user, pb_cnt_user, pberr_cnt_user = \
                getLogBinnedDistribution(
                    np.asarray(cnt_nz), np.asarray(user_nz), num_of_bins)
            ax[1, 2].loglog(
                impacts[2],
                impacts[5],
                'o',
                color=color,
                alpha=Alpha,
                label=label)
            ax[1, 2].errorbar(
                xb_cnt_user,
                pb_cnt_user,
                yerr=pberr_cnt_user,
                fmt='^-',
                color='r')

        align_plot(ax)
        plt.savefig('correlations_' + mode + '.png')
        plt.close()
def _read_stripped_lines(path):
    """Return the stripped lines of ``path`` as a NumPy array (file closed)."""
    with open(path) as handle:
        return np.asarray([line.strip() for line in handle])


def get_length_plots():
    """Plot the number of products per year and save the figure.

    A 2x3 figure is created.  Panel [0, 0] shows the per-year movie counts
    derived from the director file, panel [0, 1] the per-year track counts
    for the electro and pop files (both on a logarithmic y axis), and panel
    [0, 2] only receives its title.  The figure is written to
    ``num_of_products_length_data.png``.
    """
    title_font = 25
    num_of_bins = 20
    seaborn.set_style('white')
    f, ax = plt.subplots(2, 3, figsize=(22, 15))

    # --- movies ---
    file_avg_year = 'ProcessedData/3_inflation_curves/film_yearly_average_ratings_dist_director.dat'
    average_ratings_year = _read_stripped_lines(file_avg_year)
    x_movies, y_movies = get_num_per_year(average_ratings_year)

    ax[0, 0].set_yscale('log')
    ax[0, 0].plot(x_movies, y_movies, 'ko', label='movies', alpha=0.6)
    ax[0, 0].set_title('#movies', fontsize=title_font)

    # --- music tracks ---
    file_avg_year_electro = 'ProcessedData/3_inflation_curves/music_yearly_rating_counts_dist_electro.dat'
    file_avg_year_pop = 'ProcessedData/3_inflation_curves/music_yearly_rating_counts_dist_pop.dat'
    average_ratings_year_electro = _read_stripped_lines(file_avg_year_electro)
    average_ratings_year_pop = _read_stripped_lines(file_avg_year_pop)

    x_electro, y_electro = get_num_per_year(average_ratings_year_electro)
    x_pop, y_pop = get_num_per_year(average_ratings_year_pop)

    # NOTE(review): the binned electro curve is computed but never drawn;
    # the call is kept in case the helper validates its input.
    getBinnedDistribution(x_electro, y_electro, num_of_bins)

    ax[0, 1].set_yscale('log')
    ax[0, 1].plot(x_electro, y_electro, 'ko', label='electro', alpha=0.6)
    ax[0, 1].plot(x_pop, y_pop, 'bo', label='pop', alpha=0.6)
    ax[0, 1].set_title('#tracks', fontsize=title_font)

    # --- books: title only, no data plotted in this function ---
    ax[0, 2].set_title('#books', fontsize=title_font)

    align_plot(ax)
    plt.savefig('num_of_products_length_data.png')