def fitSkewedNormal(rand, ax, label, alpha_hist=0.4, color_line='r'):
    """Fit a skew-normal distribution to ``rand`` and draw it on ``ax``.

    The samples are drawn as a normalized histogram (25 bin edges spanning
    the smallest to the largest x value returned by ``getDistribution``) and
    the fitted density is plotted on top. The curve label reports the mean of
    the fitted distribution, the x position of the highest fitted density and
    the two values returned by ``stats.kstest``.

    NOTE(review): a second ``fitSkewedNormal`` taking a filename is defined
    further down in this file; if both live in the same module the later one
    replaces this one.

    Args:
        rand: sample values to fit.
        ax: matplotlib axes to draw on.
        label: title for the axes.
        alpha_hist: opacity of the histogram bars.
        color_line: color of the fitted curve.

    Returns:
        tuple: ``(mean, D, p, shape, loc, scale)`` -- the fitted mean, the two
        ``kstest`` results and the three fitted skew-normal parameters.
    """
    ax.set_title(label, fontsize=18)

    shape, loc, scale = stats.skewnorm.fit(rand)
    x_vals, _ = getDistribution(rand)

    density = stats.skewnorm.pdf(x_vals, shape, loc=loc, scale=scale)
    mean = stats.skewnorm.mean(shape, loc=loc, scale=scale)

    # x position of the first maximum of the fitted density
    peak_index = density.tolist().index(max(density))
    peak_x = x_vals[peak_index]

    edges = np.linspace(min(x_vals), max(x_vals), 25)
    ax.hist(rand, normed=True, bins=edges, alpha=alpha_hist)

    # NOTE(review): the callable ignores its argument and always evaluates
    # the fitted CDF at x_vals, and the "sample" handed to kstest is the
    # fitted density -- confirm this is the intended goodness-of-fit measure.
    def fitted_cdf(_ignored):
        return stats.skewnorm.cdf(x_vals, shape, loc=loc, scale=scale)

    ks = stats.kstest(np.asarray(density), fitted_cdf)

    legend_text = ''.join([
        '$\\mu$=', str(round(mean, 2)),
        ', $\\mu^{*}$=', str(peak_x),
        '\n$D$=', str(round(ks[0], 2)),
        ', $p$=', str(round(ks[1], 2)),
    ])
    ax.plot(x_vals, density, '-', color=color_line, linewidth=3,
            label=legend_text)

    return mean, ks[0], ks[1], shape, loc, scale
def fitPowerLaw(rand, ax, label):
    """Plot ``rand`` on log-spaced bins with a lognormal and a power-law fit.

    Draws (invisibly, alpha=0) a normalized histogram on 15 log-spaced bin
    edges, plots its bin heights at the arithmetic bin midpoints, overlays a
    fitted lognormal density and the power-law fit from the ``powerlaw``
    package, and sets the axis limits.

    NOTE(review): further ``fitPowerLaw`` definitions follow in this file; if
    they live in the same module the last one replaces this one.

    Args:
        rand: sample values (must be positive for the log-spaced bins).
        ax: matplotlib axes to draw on.
        label: title for the axes.

    Returns:
        tuple: ``(alpha, xmin, D)`` from ``powerlaw.Fit(...).power_law``.
    """
    ax.set_title(label, fontsize=18)

    # histogram
    print('Fitting lognormal...')
    x_rand, p_rand = getDistribution(rand)
    log_edges = 10**np.linspace(np.log10(min(x_rand)),
                                np.log10(max(x_rand)), 15)
    counts, bins, bars = ax.hist(rand, normed=True, bins=log_edges,
                                 log=True, alpha=0.0)
    ax.plot((bins[1:] + bins[:-1]) / 2, counts, 's-', color='royalblue',
            alpha=0.5, markersize=12, linewidth=2)

    # get the lognormal
    param = stats.lognorm.fit(rand)
    pdf_fitted = stats.lognorm.pdf(x_rand, param[0], loc=param[1],
                                   scale=param[2])
    # mu is needed for the legend label below; define it explicitly so the
    # label cannot hit an undefined name.
    mu = np.log(param[2])
    sigma = param[0]

    # NOTE(review): the callable ignores its argument and always evaluates
    # the fitted CDF at x_rand -- confirm this is the intended KS measure.
    sk_results_norm = stats.kstest(
        np.asarray(pdf_fitted),
        lambda x: stats.lognorm.cdf(x_rand, param[0], loc=param[1],
                                    scale=param[2]))
    ax.plot(x_rand, pdf_fitted, 'k-', linewidth=4,
            label='$\\mu$=' + str(round(mu, 2)) +
            ' $\\sigma$=' + str(round(sigma, 2)) +
            ', $D$=' + str(round(sk_results_norm[0], 2)))

    # fit and plot the powerlaw
    results = powerlaw.Fit(rand, xmin=min(x_rand), fit_method='KS')
    alpha = results.power_law.alpha
    xmin = results.power_law.xmin
    D = results.power_law.KS()
    results.power_law.plot_pdf(
        color='r', ax=ax, linestyle='-', linewidth=4,
        label='$\\alpha$= ' + str(round(alpha, 2)) +
        ', $x_{min}$=' + str(round(xmin, 2)) +
        '\n$D$=' + str(round(D, 2)))

    ax.set_ylim([min(counts), 1.1])
    ax.set_xlim([min(x_rand), max(bins)])
    return alpha, xmin, D
def plot_exponent(series, ax, num_of_bins, color, label):
    """Plot the distribution of ``series`` as faint raw points plus a binned curve.

    The raw ``getDistribution(series, True)`` output is drawn as faint
    circles; the ``getBinnedDistribution`` output is drawn with error bars at
    the midpoints of consecutive bin edges and carries the legend ``label``.

    Args:
        series: values whose distribution is plotted.
        ax: matplotlib axes to draw on.
        num_of_bins: bin count forwarded to ``getBinnedDistribution``.
        color: single-character matplotlib color code (it is concatenated
            with the ``'o'`` marker to form a format string).
        label: legend label of the binned curve.
    """
    xs, ps = getDistribution(series, True)
    edges, binned_p, binned_err = getBinnedDistribution(xs, ps, num_of_bins)

    # raw distribution, faint
    raw_fmt = color + 'o'
    ax.plot(xs, ps, raw_fmt, alpha=0.1, markersize=7)

    # binned distribution at the midpoint of each pair of adjacent edges
    midpoints = (edges[1:] + edges[:-1]) / 2
    errorbar_style = dict(
        yerr=binned_err,
        color=color,
        fmt='o-',
        alpha=0.9,
        capsize=3,
        elinewidth=1,
        linewidth=3,
        label=label,
    )
    ax.errorbar(midpoints, binned_p, **errorbar_style)
def fitPowerLaw(rand, ax, label):
    """Draw a log-binned histogram curve of ``rand`` and overlay a power-law fit.

    A normalized histogram on 15 log-spaced bin edges is drawn invisibly
    (alpha=0); its bin heights are plotted at the arithmetic bin midpoints.
    The power-law fit from the ``powerlaw`` package is then plotted and the
    axis limits are set.

    NOTE(review): this name is defined more than once in this file; if all
    definitions live in the same module only the last one is callable.

    Args:
        rand: sample values (must be positive for the log-spaced bins).
        ax: matplotlib axes to draw on.
        label: title for the axes.

    Returns:
        tuple: ``(alpha, xmin, D)`` from ``powerlaw.Fit(...).power_law``.
    """
    ax.set_title(label, fontsize=18)

    x_vals, _ = getDistribution(rand)

    # the progress message below is reproduced verbatim from the original
    print('Fitting lognormal...')
    low_exp = np.log10(min(x_vals))
    high_exp = np.log10(max(x_vals))
    heights, edges, _ = ax.hist(
        rand,
        normed=True,
        bins=10**np.linspace(low_exp, high_exp, 15),
        log=True,
        alpha=0.0,
    )
    midpoints = (edges[1:] + edges[:-1]) / 2
    ax.plot(midpoints, heights, 's-', color='royalblue', alpha=0.5,
            markersize=12, linewidth=2)

    # power-law fit with xmin pinned to the smallest x value
    fit = powerlaw.Fit(rand, xmin=min(x_vals), fit_method='KS')
    alpha = fit.power_law.alpha
    xmin = fit.power_law.xmin
    D = fit.power_law.KS()

    legend_text = ''.join([
        '$\\alpha$= ', str(round(alpha, 2)),
        ', $x_{min}$=', str(round(xmin, 2)),
        '\n$D$=', str(round(D, 2)),
    ])
    fit.power_law.plot_pdf(marker='o', color='r', ax=ax, linestyle='-',
                           linewidth=3, label=legend_text)

    ax.set_ylim([min(heights), 1.1])
    ax.set_xlim([min(x_vals), max(edges)])
    return alpha, xmin, D
def fitSkewedNormal(filename, ax, label, alpha_hist=0.2, color_line='r'):
    """Fit a skew-normal to the values stored in ``filename`` and plot it.

    The file is read as one float per line. The samples are drawn as a
    normalized histogram (25 bin edges), the fitted density is plotted on
    top, and the y tick labels are replaced by percentage strings.

    Args:
        filename: path of a text file with one numeric value per line.
        ax: matplotlib axes to draw on.
        label: title for the axes.
        alpha_hist: opacity of the histogram bars.
        color_line: color of the fitted curve.

    Raises:
        ValueError: if a line of the file cannot be parsed as a float.
    """
    # Read inside a context manager so the handle is closed deterministically
    # (the original left it to the garbage collector).
    with open(filename) as fin:
        rand = np.asarray([float(line.strip()) for line in fin])

    print('Fitting normal...')
    param = stats.skewnorm.fit(rand)
    x_rand, p_rand = getDistribution(rand)
    pdf_fitted = stats.skewnorm.pdf(x_rand, param[0], loc=param[1],
                                    scale=param[2])
    mean = stats.skewnorm.mean(param[0], loc=param[1], scale=param[2])

    # x position of the first maximum of the fitted density, as text
    maxx = str(x_rand[pdf_fitted.tolist().index(max(pdf_fitted))])

    counts, bins, bars = ax.hist(
        rand, bins=np.linspace(min(x_rand), max(x_rand), 25),
        normed=True, alpha=alpha_hist)

    # NOTE(review): the callable ignores its argument and always evaluates
    # the fitted CDF at x_rand -- confirm this is the intended KS measure.
    sk_results = stats.kstest(
        np.asarray(pdf_fitted),
        lambda x: stats.skewnorm.cdf(x_rand, param[0], loc=param[1],
                                     scale=param[2]))

    ax.plot(x_rand, pdf_fitted, '-', color=color_line, linewidth=3,
            label='$\\mu$=' + str(round(mean, 2)) +
            ', $\\mu^{*}$=' + maxx +
            '\n$D$=' + str(round(sk_results[0], 2)) +
            ', $p$=' + str(round(sk_results[1], 2)))
    ax.set_title(label, fontsize=18)

    # Five ticks from 0 to the tallest bar, labelled as percentages of the
    # summed bar heights.
    # NOTE(review): labels span up to 1.05 * max(counts) / sum(counts) while
    # ticks stop at max(counts) -- confirm the 1.05 factor is intended.
    ax.set_yticks(np.linspace(0, max(counts), 5))
    ax.set_yticklabels([
        str(int(100 * y)) + '%'
        for y in np.linspace(0, 1.05 * max(counts) / (sum(counts)), 5)
    ])
def fitAndStatsTransformedNormal(filename, ax, label, outfolder, name, statfile, filterparam, alpha_hist=0.2, color_line='r'):
    """Fit a normal to log-shifted data, plot it and write the fit to disk.

    Values are read from ``filename`` (second tab-separated column when
    ``'log_Q'`` is in ``name``, otherwise one float per line), transformed as
    ``log(value - min + 1)``, and fitted with ``stats.norm``. The histogram
    and the fitted density are drawn on ``ax``; several ``.dat`` files are
    written through ``write_row`` and one summary row is appended to
    ``statfile``.

    Args:
        filename: input data file.
        ax: matplotlib axes to draw on.
        label: first component of the output file names and of the stat row.
        outfolder: directory prefix of the output files.
        name: second component of the output file names; also selects the
            input format.
        statfile: path of the tab-separated summary file (opened for append).
        filterparam: slice step used for the down-sampled ``sample`` outputs.
        alpha_hist: opacity of the histogram bars.
        color_line: color of the fitted curve.
    """
    # Read under a context manager so the handle is always closed.
    with open(filename) as fin:
        if 'log_Q' in name:
            columns = [line.strip().split('\t') for line in fin]
            rand = np.asarray([float(c[1]) for c in columns if len(c) > 1])
        else:
            rand = np.asarray([float(line.strip()) for line in fin])

    # Shift so the smallest value maps to log(1) = 0.
    mmin = min(rand)
    rand = [math.log(rr - mmin + 1) for rr in rand]

    print('Fitting normal...')
    param = stats.norm.fit(rand)
    print(param)

    x_rand, p_rand = getDistribution(rand)
    pdf_fitted = stats.norm.pdf(x_rand, loc=param[0], scale=param[1])
    mean, variance = stats.norm.stats(loc=param[0], scale=param[1],
                                      moments='mv')
    # x position of the first maximum of the fitted density
    maxx = x_rand[pdf_fitted.tolist().index(max(pdf_fitted))]

    hist_edges = np.linspace(min(x_rand), max(x_rand), 25)
    counts, bins, bars = ax.hist(rand, bins=hist_edges, normed=True,
                                 alpha=alpha_hist)

    # NOTE(review): the callable ignores its argument and always evaluates
    # the fitted CDF at x_rand -- confirm this is the intended KS measure.
    D = stats.kstest(
        np.asarray(pdf_fitted),
        lambda x: stats.norm.cdf(x_rand, loc=param[0], scale=param[1]))[0]

    # NOTE(review): the original draws the same histogram a second time;
    # kept because the overlaid alpha changes the rendered figure -- confirm
    # whether the duplicate is intended.
    counts, bins, bars = ax.hist(rand, bins=hist_edges, normed=True,
                                 alpha=alpha_hist)
    ax.plot(x_rand, pdf_fitted, '-', color=color_line, linewidth=3,
            label='D = ' + str(D) + '\nvarQ = ' + str(variance))
    ax.set_title('Normal fit')
    # 'left' is not a valid legend location; older matplotlib warned and fell
    # back to 'best', so request 'best' explicitly.
    ax.legend(loc='best', shadow=True, fontsize=20)

    print('NORM %s %s' % (mean, variance))

    extra = ''
    if 'release' in filename:
        extra = '_release-max'

    def fit_rows(shift=None, step=1):
        """Return 'x<TAB>pdf' rows, optionally shifted in x and down-sampled."""
        indices = range(len(x_rand))[0::step]
        if shift is None:
            return [str(x_rand[i]) + '\t' + str(pdf_fitted[i])
                    for i in indices]
        return [str(x_rand[i] - shift) + '\t' + str(pdf_fitted[i])
                for i in indices]

    prefix = outfolder + '/' + label + '_' + name
    write_row(prefix + '_tnorm_original_fit' + extra + '.dat', fit_rows())
    write_row(prefix + '_tnorm_mean_centered_fit' + extra + '.dat',
              fit_rows(shift=mean))
    write_row(prefix + '_tnorm_peak_centered_fit' + extra + '.dat',
              fit_rows(shift=maxx))
    write_row(prefix + '_tnorm_mean_centered_fit' + extra + 'sample' + '.dat',
              fit_rows(shift=mean, step=filterparam))
    write_row(prefix + '_tnorm_peak_centered_fit' + extra + 'sample' + '.dat',
              fit_rows(shift=maxx, step=filterparam))
    write_row(prefix + '_tnorm_original_hist_' + extra + '.dat', rand)

    with open(statfile, 'a') as fout:
        fout.write(label + '\t' + name + '\t' + str(D) + '\t' + str(mean) +
                   '\t' + str(variance) + '\n')
def fitLognormal(filename, ax, label='', out_folder='', name='', cutoff=-sys.maxint, writeout=True, noise=False, norm='no'):
    """Fit a lognormal to the data in ``filename`` and optionally write it out.

    Input format depends on ``name``:

    * ``'log_p'`` in ``name``: one log-value per line; ``exp(value) + noise``
      is used.
    * ``'log_Q'`` in ``name``: second tab-separated column, exponentiated.
    * otherwise: one value per line, with ``random.random()`` added.

    Only values strictly greater than ``cutoff`` (compared before the
    transformation) are kept. A cumulative histogram is drawn invisibly
    (alpha=0) on ``ax`` to obtain its bin heights, a lognormal is fitted, and
    the KS two-sample statistic between the fitted CDF and
    ``cumsum(p_rand)`` is printed.

    NOTE(review): outside the ``log_p`` branch the ``noise`` flag has no
    effect -- the original ``elif noise`` and ``else`` branches were
    identical and both add ``random.random()``. Behavior kept; confirm.

    Args:
        filename: input data file.
        ax: matplotlib axes to draw on.
        label: axis label and first component of the output file names.
        out_folder: output directory (created when missing and ``writeout``).
        name: selects the input format; second component of the file names.
        cutoff: exclusive lower bound on the raw values.
        writeout: whether to write the ``.dat`` files.
        noise: additive term in the ``log_p`` branch (``False`` adds 0).
        norm: tag included in the printed row and in the output file names.
    """
    # Read under a context manager so the handle is always closed.
    with open(filename) as fin:
        if 'log_p' in name:
            rand = np.asarray([
                math.exp(float(line.strip())) + noise for line in fin
                if float(line.strip()) > cutoff
            ])
        elif 'log_Q' in name:
            rand = np.asarray([
                math.exp(float(line.strip().split('\t')[1])) for line in fin
                if len(line.strip().split('\t')) > 1
                and float(line.strip().split('\t')[1]) > cutoff
            ])
        else:
            rand = np.asarray([
                float(line.strip()) + random.random() for line in fin
                if float(line.strip()) > cutoff
            ])

    x_rand, p_rand = getDistribution(rand)

    # cumulative histogram on log-spaced bins (drawn with alpha=0)
    nbins = 20
    counts, bins, bars = ax.hist(
        rand, normed=True,
        bins=10**np.linspace(np.log10(min(x_rand)), np.log10(max(x_rand)),
                             nbins),
        log=True, alpha=0.0, cumulative=1)
    ax.plot((bins[1:] + bins[:-1]) / 2, counts, 's-', color='royalblue',
            alpha=0.0, markersize=0, linewidth=5)
    bins = (bins[1:] + bins[:-1]) / 2

    # fit the lognormal: start values, then two successive refinements
    print('Fit and plot the lognormal...' + label)
    p0 = stats.lognorm._fitstart(rand)
    p1 = stats.lognorm.fit(rand, p0[0], loc=p0[1], scale=p0[2])
    param = stats.lognorm.fit(rand, p1[0], loc=p1[1], scale=p1[2])

    # The original computed this CDF twice with identical arguments.
    cdf_fitted = stats.lognorm.cdf(x_rand, param[0], loc=param[1],
                                   scale=param[2])

    ax.set_xlabel(label, fontsize=20)
    ax.set_ylabel('CDF of ' + label, fontsize=20)

    sk_results_norm = stats.ks_2samp(cdf_fitted, np.cumsum(p_rand))
    print('%s \t%s \t%s' % (label, norm, sk_results_norm[0]))

    extra = ''
    if 'release' in filename:
        extra = '_release-max'

    if writeout:
        if not os.path.exists(out_folder):
            os.makedirs(out_folder)
        prefix = out_folder + '/' + label + '_' + name
        # NOTE: despite the '_pdf_' in the file name, the values written are
        # the fitted CDF, as in the original.
        write_row(
            prefix + '_lognormal_pdf_' + norm + extra + '.dat',
            [str(x_rand[i]) + '\t' + str(cdf_fitted[i])
             for i in range(len(x_rand))])
        write_row(
            prefix + '_lognormal_hist_pdf_' + str(nbins) + '_' + norm +
            extra + '.dat',
            [str(bins[i]) + '\t' + str(counts[i])
             for i in range(len(counts))])
    return
def fitPowerLaw(filename, ax, label='', out_folder='', name='', cutoff=-sys.maxint, writeout=True, noise=False, distancefile=''):
    """Compare power-law and lognormal fits of the data in ``filename``.

    Input format depends on ``name``:

    * ``'log_p'`` in ``name``: one log-value per line; ``exp(value) + noise``
      is used.
    * ``'log_Q'`` in ``name``: second tab-separated column, exponentiated.
    * otherwise: one value per line, with ``random.random()`` added.

    Only values strictly greater than ``cutoff`` (compared before the
    transformation) are kept. The empirical cumulative histogram, the
    power-law CDF and the fitted lognormal CDF are drawn on ``ax``.

    NOTE(review): outside the ``log_p`` branch the ``noise`` flag has no
    effect -- the original ``elif noise`` and ``else`` branches were
    identical and both add ``random.random()``. Behavior kept; confirm.

    Args:
        filename: input data file.
        ax: matplotlib axes to draw on.
        label: title/axis label and first component of the output names.
        out_folder: output directory (created when missing and ``writeout``).
        name: selects the input format; second component of the file names.
        cutoff: exclusive lower bound on the raw values.
        writeout: whether to write the ``.dat`` files and the distance row.
        noise: additive term in the ``log_p`` branch (``False`` adds 0).
        distancefile: file to append ``label``, power-law ``D`` and lognormal
            KS statistic to; skipped when empty.

    Returns:
        tuple: ``(lognormal_ks_statistic, powerlaw_D)``.
    """
    # Read under a context manager so the handle is always closed.
    with open(filename) as fin:
        if 'log_p' in name:
            rand = np.asarray([
                math.exp(float(line.strip())) + noise for line in fin
                if float(line.strip()) > cutoff
            ])
        elif 'log_Q' in name:
            rand = np.asarray([
                math.exp(float(line.strip().split('\t')[1])) for line in fin
                if len(line.strip().split('\t')) > 1
                and float(line.strip().split('\t')[1]) > cutoff
            ])
        else:
            rand = np.asarray([
                float(line.strip()) + random.random() for line in fin
                if float(line.strip()) > cutoff
            ])

    x_rand, p_rand = getDistribution(rand)
    ax.set_title(label, fontsize=18)

    # cumulative histogram on 1000 log-spaced bin edges (bars invisible)
    counts, bins, bars = ax.hist(
        rand, normed=True,
        bins=10**np.linspace(np.log10(min(x_rand)), np.log10(max(x_rand)),
                             1000),
        log=True, alpha=0.0, cumulative=1)
    ax.plot((bins[1:] + bins[:-1]) / 2, counts, 's-', color='royalblue',
            alpha=0.7, markersize=0, linewidth=5)
    bins = (bins[1:] + bins[:-1]) / 2
    ax.set_ylim([min(counts), 1.05 * max(counts)])
    ax.set_xlim([min(x_rand), max(bins)])

    # fit and plot the powerlaw
    print('Fit and plot the powerlaw...')
    results = powerlaw.Fit(rand, xmin=min(x_rand), fit_method='KS')
    alpha = results.power_law.alpha
    D = results.power_law.KS()
    powerlaw_axes = results.power_law.plot_cdf(
        color='r', ax=ax, linestyle='-', linewidth=3,
        label='$\\alpha$= ' + str(round(alpha, 2)) +
        ', $D$=' + str(round(D, 2)))

    # fit and plot the lognormal: start values, then two refinements
    print('Fit and plot the lognormal...' + label)
    p0 = stats.lognorm._fitstart(rand)
    p1 = stats.lognorm.fit(rand, p0[0], loc=p0[1], scale=p0[2])
    param = stats.lognorm.fit(rand, p1[0], loc=p1[1], scale=p1[2])
    cdf_fitted = stats.lognorm.cdf(x_rand, param[0], loc=param[1],
                                   scale=param[2])
    pdf_fitted = stats.lognorm.pdf(x_rand, param[0], loc=param[1],
                                   scale=param[2])
    mu = np.log(param[2])
    sigma = param[0]
    sk_results_norm = stats.ks_2samp(cdf_fitted, np.cumsum(p_rand))
    ax.plot(x_rand, cdf_fitted, 'k-', linewidth=4,
            label='Lognormal fit, $\\mu$=' + str(round(mu, 2)) +
            '\n$\\sigma$=' + str(round(sigma, 2)) +
            ', $D$=' + str(round(sk_results_norm[0], 2)))
    print('%s %s' % (sk_results_norm, D))

    ax.set_xlabel(label, fontsize=20)
    ax.set_ylabel('CDF of ' + label, fontsize=20)

    extra = ''
    if 'release' in filename:
        extra = '_release-max'

    if writeout:
        if not os.path.exists(out_folder):
            os.makedirs(out_folder)

        # Second line object on the axes returned by plot_cdf.
        # NOTE(review): presumably the power-law CDF curve (the histogram
        # curve plotted above being the first) -- this only holds if ax had
        # no lines before this call; confirm.
        xfit = powerlaw_axes.lines[1].get_xdata()
        yfit = powerlaw_axes.lines[1].get_ydata()

        prefix = out_folder + '/' + label + '_' + name
        write_row(
            prefix + '_powerlaw_hist_' + extra + '.dat',
            [str(bins[i]) + '\t' + str(counts[i])
             for i in range(len(counts))])
        write_row(
            prefix + '_powerlaw_fit_' + extra + '.dat',
            [str(xfit[i]) + '\t' + str(yfit[i]) for i in range(len(xfit))])
        # '_lognormal_' holds the fitted CDF, '_lognormal_pdf_' the density.
        write_row(
            prefix + '_lognormal_' + extra + '.dat',
            [str(x_rand[i]) + '\t' + str(cdf_fitted[i])
             for i in range(len(x_rand))])
        write_row(
            prefix + '_lognormal_pdf_' + extra + '.dat',
            [str(x_rand[i]) + '\t' + str(pdf_fitted[i])
             for i in range(len(x_rand))])

        # NOTE(review): the source's indentation was lost; the distance-file
        # append is assumed to belong to the writeout branch -- confirm.
        # Skipped when no path is given, since open('') cannot succeed.
        if distancefile:
            with open(distancefile, 'a') as f_Ddata:
                f_Ddata.write(label + '\t' + str(D) + '\t' +
                              str(sk_results_norm[0]) + '\n')

    return sk_results_norm[0], D
def get_imapct_distr():
    """Plot the normalized impact distributions and save them as a PNG.

    Builds a 2x3 grid of axes. For each film profession the average rating,
    rating count, metascore, critic-review and user-review distributions are
    read from ``ProcessedDataNormalized/1_impact_distributions`` and plotted
    on the first five axes; for each music genre the play-count distribution
    is plotted on the sixth. The figure is written to
    ``impact_distributions_normalized.png``.
    """

    def read_rounded(path, digits=None):
        """Read one float per line from ``path``, rounded to ``digits`` places."""
        # Context manager so every data file is closed after reading.
        with open(path) as fin:
            if digits is None:
                return [round(float(line.strip())) for line in fin]
            return [round(float(line.strip()), digits) for line in fin]

    def loglog_scatter(axis, title, x, p, color, alpha, label):
        """Scatter ``p`` against ``x`` on ``axis`` with log-log scales."""
        axis.set_title(title, fontsize=20)
        axis.set_xscale('log')
        axis.set_yscale('log')
        axis.plot(x, p, color + 'o', alpha=alpha, label=label)

    # ---------------------------------------------- film
    professions = [('director', 'k'), ('producer', 'b'), ('writer', 'r'),
                   ('composer', 'g'), ('art-director', 'y')]
    num_of_bins = 20
    title_font = 25
    seaborn.set_style('white')
    f, ax = plt.subplots(2, 3, figsize=(25, 15))
    f.suptitle("IMDb normalized impact distributions", fontsize=title_font)
    FOLDER = 'ProcessedDataNormalized'

    for (label, color) in professions:
        # number of career files, in thousands, for the legend
        num_car = str(
            int(
                round(
                    len(
                        os.listdir('Data/Film/film-' + label +
                                   '-simple-careers')) / 1000.0))) + 'k'

        file_avg = FOLDER + '/1_impact_distributions/film_average_ratings_dist_' + label + '.dat'
        file_cnt = FOLDER + '/1_impact_distributions/film_rating_counts_dist_' + label + '.dat'
        file_mets = FOLDER + '/1_impact_distributions/film_metascores_dist_' + label + '.dat'
        file_crit = FOLDER + '/1_impact_distributions/film_critic_review_dist_' + label + '.dat'
        file_user = FOLDER + '/1_impact_distributions/film_user_review_dist_' + label + '.dat'

        average_ratings = np.asarray(read_rounded(file_avg, 2))
        rating_counts = read_rounded(file_cnt, 2)
        metascores = read_rounded(file_mets, 1)
        critic_review = read_rounded(file_crit, 2)
        user_review = read_rounded(file_user, 2)

        # plot avg ratings
        x_average_ratings, p_average_ratings = getDistribution(
            average_ratings, True)
        bx_average_ratings, bp_average_ratings, bperr_average_ratings = getBinnedDistribution(
            x_average_ratings, p_average_ratings, num_of_bins)
        ax[0, 0].set_title('IMDb - average rating', fontsize=20)
        ax[0, 0].plot(x_average_ratings, p_average_ratings, color,
                      marker='o', alpha=0.1, linewidth=0,
                      label=label + ', ' + str(num_car))
        ax[0, 0].errorbar(
            (bx_average_ratings[1:] + bx_average_ratings[:-1]) / 2,
            bp_average_ratings, yerr=bperr_average_ratings,
            fmt=color + '-', linewidth=2)

        # plot rating counts
        x_rating_counts, p_rating_counts = getDistribution(rating_counts, True)
        # Binned values are computed but not drawn (the error-bar plot was
        # disabled in the original); the call is kept unchanged.
        bx_rating_counts, bp_rating_counts, bperr_rating_counts = getBinnedDistribution(
            x_rating_counts, p_rating_counts, num_of_bins)
        loglog_scatter(ax[0, 1], 'IMDb - rating count', x_rating_counts,
                       p_rating_counts, color, 0.8, label)

        # plot metascores
        x_metascores, p_metascores = getDistribution(metascores, True)
        bx_metascores, bp_metascores, bperr_metascores = getBinnedDistribution(
            x_metascores, p_metascores, num_of_bins)
        ax[0, 2].set_title('IMDb - metascores', fontsize=20)
        ax[0, 2].plot(x_metascores, p_metascores, color + 'o', alpha=0.2,
                      label=label)
        ax[0, 2].errorbar((bx_metascores[1:] + bx_metascores[:-1]) / 2,
                          bp_metascores, yerr=bperr_metascores,
                          fmt=color + '-', linewidth=2)

        # plot critic review count
        x_critic_review, p_critic_review = getDistribution(critic_review, True)
        loglog_scatter(ax[1, 0], 'IMDb - critic_review', x_critic_review,
                       p_critic_review, color, 0.8, label)

        # plot user review count
        x_user_review, p_user_review = getDistribution(user_review, True)
        loglog_scatter(ax[1, 1], 'IMDb - user_review', x_user_review,
                       p_user_review, color, 0.8, label)

    # ---------------------------------------------- music
    genres = [('electro', 'k'), ('pop', 'b')]
    for (genre, color) in genres:
        num_mus = str(
            int(
                round(
                    len(
                        os.listdir('Data/Music/music-' + genre +
                                   '-simple-careers')) / 1000.0))) + 'k'
        file_music = FOLDER + '/1_impact_distributions/music_rating_counts_dist_' + genre + '.dat'
        # play counts, rounded to whole numbers
        play_counts = np.asarray(read_rounded(file_music))
        x_rating_counts, p_rating_counts = getDistribution(play_counts, True)
        print(len(play_counts))
        loglog_scatter(ax[1, 2], 'Music - playcount', x_rating_counts,
                       p_rating_counts, color, 0.2, genre + ', ' + num_mus)

    align_plot(ax)
    plt.savefig('impact_distributions_normalized.png')
    plt.close()
def get_career_length():
    """Plot career-length distributions with power-law fits and save a PNG.

    Draws two axes: career lengths of film directors (left) and of
    ``electro`` musicians (right). Only careers longer than 10 are handed to
    ``fitPowerLaw``. The figure is written to ``career_length.png``.

    NOTE(review): ``fitPowerLaw`` is called here with a list of floats as its
    first argument, while the last ``fitPowerLaw`` definition visible in this
    file expects a filename and opens it -- confirm which definition is
    meant to be in effect.
    """

    def read_floats(path):
        """Read one float per line from ``path``."""
        # Context manager so the data file is closed after reading.
        with open(path) as fin:
            return [float(line.strip()) for line in fin]

    title_font = 25
    seaborn.set_style('white')
    f, ax = plt.subplots(1, 2, figsize=(22, 9))
    f.suptitle("Career length distributions", fontsize=title_font)

    professions = [
        ('director', 'k'),
        ('producer', 'b'),
        ('writer', 'r'),
        ('composer', 'g'),
        ('art-director', 'y'),
    ]
    # only the first profession (director) is plotted
    for (label, color) in professions[0:1]:
        career_length = read_floats(
            'ProcessedData/5_career_length/film_career_length_' + label +
            '.dat')
        # result unused; call kept from the original
        xcareer_length, pcareer_length = getDistribution(career_length)
        fitPowerLaw([c for c in career_length if c > 10], ax[0], label)

    ax[0].set_title('Length of director careers', fontsize=18)
    ax[1].set_title('Length of DJ careers', fontsize=18)

    professions = [('pop', 'k'), ('electro', 'b')]
    # only the second genre (electro) is plotted
    for (label, color) in professions[1:2]:
        career_length = read_floats(
            'ProcessedData/5_career_length/music_career_length_' + label +
            '.dat')
        # result unused; call kept from the original
        xcareer_length, pcareer_length = getDistribution(career_length)
        fitPowerLaw([c for c in career_length if c > 10], ax[1], label)

    align_plot1D(ax)
    plt.savefig('career_length.png')
    plt.close()