def plot_exponent(series, ax, num_of_bins, color, label):
    """Draw the distribution of ``series`` on ``ax``: raw points plus a binned curve.

    Args:
        series: values passed to ``getDistribution(series, True)``.
        ax: matplotlib axes to draw on.
        num_of_bins: bin count forwarded to ``getBinnedDistribution``.
        color: single-letter matplotlib color code; it is concatenated with
            ``'o'`` to build the marker format of the raw points.
        label: legend label attached to the binned curve only.
    """
    raw_x, raw_p = getDistribution(series, True)
    bin_x, bin_p, bin_p_err = getBinnedDistribution(raw_x, raw_p, num_of_bins)

    # Faint scatter of the unbinned distribution as a backdrop.
    ax.plot(raw_x, raw_p, color + 'o', alpha=0.1, markersize=7)

    # The binned curve is drawn at the midpoints of consecutive entries of
    # bin_x (presumably bin edges -- confirm against getBinnedDistribution).
    midpoints = (bin_x[1:] + bin_x[:-1]) / 2
    curve_style = dict(yerr=bin_p_err,
                       color=color,
                       fmt='o-',
                       alpha=0.9,
                       capsize=3,
                       elinewidth=1,
                       linewidth=3,
                       label=label)
    ax.errorbar(midpoints, bin_p, **curve_style)
def plot_measure(average_ratings_year,
                 title,
                 num_of_bins,
                 ax,
                 color,
                 label,
                 music=False):
    """Plot a binned per-year impact curve on ``ax``.

    The yearly series comes from ``get_yearly_avg_data`` and is binned with
    ``getBinnedDistribution``. The binned curve (with error bars) is always
    drawn; when ``music`` is true the unbinned yearly series is drawn
    underneath it first.

    Args:
        average_ratings_year: raw input handed to ``get_yearly_avg_data``.
        title: axes title.
        num_of_bins: bin count forwarded to ``getBinnedDistribution``.
        ax: matplotlib axes to draw on.
        color: single-letter matplotlib color code; it is concatenated with
            ``'-'`` / ``'o-'`` to build the format strings.
        label: legend label attached to the binned curve.
        music: when true, additionally draw the unbinned yearly series.
    """
    years, yearly_avg, yearly_err = get_yearly_avg_data(average_ratings_year)
    bin_x, binned_avg, binned_err = getBinnedDistribution(
        years, yearly_avg, num_of_bins)

    ax.set_title(title, fontsize=20)

    if music:
        # Raw (unbinned) yearly values, drawn first so the binned curve
        # ends up on top.
        ax.errorbar(years,
                    yearly_avg,
                    yerr=yearly_err,
                    fmt=color + '-',
                    alpha=0.5,
                    capsize=3,
                    elinewidth=1,
                    linewidth=2)

    # Binned curve, positioned at the midpoints of consecutive bin_x entries.
    # This call used to be duplicated verbatim in both branches of the `if`.
    ax.errorbar((bin_x[1:] + bin_x[:-1]) / 2,
                binned_avg,
                yerr=binned_err,
                fmt=color + 'o-',
                alpha=0.6,
                capsize=3,
                elinewidth=1,
                linewidth=3,
                label=label)

    # Fixed x-range shared by every panel drawn with this helper.
    ax.set_xlim([1880, 2020])
def plot_ccdf(file_avg_all, num_of_bins, ax, color, label, Nmin, title,
              marker):
    """Plot the binned curve parsed from ``file_avg_all`` and return its R^2.

    Args:
        file_avg_all: input handed to ``parse_N_star_N_data``.
        num_of_bins: bin count forwarded to ``getBinnedDistribution``.
        ax: matplotlib axes to draw on.
        color: single-letter matplotlib color code; concatenated with ``'-'``
            to build the line format.
        label: legend label prefix; the R^2 value is appended to it. If it
            contains ``'orig'`` the axes title is set as well.
        Nmin: threshold forwarded to ``parse_N_star_N_data``.
        title: title suffix, shown after the ``numInd`` value from the parser.
        marker: matplotlib marker used for the binned points.

    Returns:
        The ``r_square`` value reported by ``parse_N_star_N_data``.
    """
    # The third returned value is not used by this function.
    (x_vals, p_vals, _unused_len_career, r_square,
     numInd) = parse_N_star_N_data(file_avg_all, Nmin)

    # Only the series whose label contains 'orig' sets the panel title.
    if 'orig' in label:
        ax.set_title(' '.join([str(numInd), title]), fontsize=19)

    bin_x, binned_p, binned_err = getBinnedDistribution(
        np.asarray(x_vals), np.asarray(p_vals), num_of_bins)

    midpoints = (bin_x[1:] + bin_x[:-1]) / 2
    legend_text = label + ' $R^2 = $' + str(round(r_square, 5))
    ax.errorbar(midpoints,
                binned_p,
                yerr=binned_err,
                fmt=color + '-',
                linewidth=1,
                markersize=9,
                label=legend_text,
                marker=marker,
                alpha=0.7)

    return r_square
def get_r_model_curves(data_file,
                       max_data_file,
                       ax,
                       label,
                       num_of_bins,
                       title,
                       xlabel,
                       ylabel,
                       log=False):
    """Compare observed career maxima with a shuffled ("R-model") baseline.

    The observed (career length, maximum) pairs are drawn as a light scatter
    and as a binned curve. The baseline is built by shuffling the pooled
    values from ``data_file`` 100 times, cutting each shuffled list into
    consecutive chunks with the observed career lengths, taking the maximum
    of every chunk, and averaging those maxima per distinct career length.

    Args:
        data_file: text file with one float per line (the pooled values).
        max_data_file: text file with two tab-separated floats per line,
            read as (maximum, career length). Lines containing ``'nan'`` are
            skipped.
        ax: matplotlib axes to draw on.
        label: accepted for interface compatibility; not used.
        num_of_bins: bin count forwarded to the binning helper.
        title, xlabel, ylabel: axes title and axis labels.
        log: when true, use log-log axes and ``getLogBinnedDistribution``;
            otherwise linear axes and ``getBinnedDistribution``.

    Raises:
        ValueError: if the career lengths add up to more values than
            ``data_file`` provides (``max`` of an empty chunk), or if
            ``max_data_file`` yields no usable rows.

    Note:
        Shuffling uses the global ``random`` state, so the baseline varies
        between runs unless the caller seeds it.
    """
    ax.set_title(title, fontsize=19)
    ax.set_xlabel(xlabel, fontsize=17)
    ax.set_ylabel(ylabel, fontsize=17)

    # Context managers make sure both input files are closed after reading.
    with open(data_file) as data_handle:
        data = [float(line.strip()) for line in data_handle]

    with open(max_data_file) as max_handle:
        rows = [[float(num) for num in line.strip().split('\t')]
                for line in max_handle if 'nan' not in line]
    data_max, career_len = zip(*rows)

    # Backdrop: every observed (career length, maximum) pair.
    ax.plot(career_len,
            data_max,
            marker='o',
            color='lightgrey',
            alpha=0.15,
            linewidth=0)

    num_shuffles = 100
    maxima_by_length = {}

    for _ in range(num_shuffles):
        shuffled = data[:]
        random.shuffle(shuffled)

        # Walk through the shuffled values with a running offset instead of
        # deleting the consumed prefix; the chunks are the same, without the
        # repeated O(n) list deletions.
        start = 0
        for leng in career_len:
            stop = start + int(leng)
            maxima_by_length.setdefault(leng, []).append(
                max(shuffled[start:stop]))
            start = stop

    sorted_len = sorted(set(career_len))
    career_max = [np.mean(maxima_by_length[s]) for s in sorted_len]

    # Single pre-formatted argument: prints "N M" on both Python 2 and 3.
    print('%d %d' % (len(sorted_len), len(career_max)))

    observed_x = np.asarray(career_len)
    observed_y = np.asarray(data_max)
    model_x = np.asarray(sorted_len)
    model_y = np.asarray(career_max)

    if not log:
        xb_data, pb_data, pberr_data = getBinnedDistribution(
            observed_x, observed_y, num_of_bins)
        xb_gen, pb_gen, pberr_gen = getBinnedDistribution(
            model_x, model_y, num_of_bins)
        # The linear curves are drawn at the midpoints of consecutive
        # entries of the first returned array.
        x_data = (xb_data[1:] + xb_data[:-1]) / 2
        x_gen = (xb_gen[1:] + xb_gen[:-1]) / 2
    else:
        ax.set_xscale('log')
        ax.set_yscale('log')
        xb_data, pb_data, pberr_data = getLogBinnedDistribution(
            observed_x, observed_y, num_of_bins)
        xb_gen, pb_gen, pberr_gen = getLogBinnedDistribution(
            model_x, model_y, num_of_bins)
        # The log-binned x values are used as returned.
        x_data = xb_data
        x_gen = xb_gen

    ax.errorbar(x_data,
                pb_data,
                yerr=pberr_data,
                fmt='o-',
                color='grey',
                label='data',
                alpha=0.9)
    ax.errorbar(x_gen,
                pb_gen,
                yerr=pberr_gen,
                fmt='-',
                color='r',
                label='R-model',
                alpha=0.9)
def get_imapct_distr():
    """Plot normalized impact distributions for film professions and music genres.

    Builds one 2x3 figure: five IMDb panels (average rating, rating count,
    metascore, critic reviews, user reviews), each overlaid for every
    profession, plus one music play-count panel overlaid for every genre.
    Input is read from ``ProcessedDataNormalized/1_impact_distributions`` and
    from the career folders under ``Data/``. The figure is written to
    ``impact_distributions_normalized.png`` and then closed.

    Takes no arguments and returns nothing; for each genre the number of
    loaded music values is printed.
    """

    def read_rounded(path, *ndigits):
        # One float per line, rounded via round(value, *ndigits); the file is
        # closed as soon as it has been read.
        with open(path) as handle:
            return [round(float(line.strip()), *ndigits) for line in handle]

    def count_in_thousands(folder):
        # Number of directory entries, rounded to the nearest thousand and
        # rendered like '12k'.
        return str(int(round(len(os.listdir(folder)) / 1000.0))) + 'k'

    def loglog_scatter(axis, panel_title, xs, ps, fmt, alpha, legend):
        # Shared recipe for the panels drawn as points on log-log axes.
        axis.set_title(panel_title, fontsize=20)
        axis.set_xscale('log')
        axis.set_yscale('log')
        axis.plot(xs, ps, fmt, alpha=alpha, label=legend)

    professions = [('director', 'k'), ('producer', 'b'), ('writer', 'r'),
                   ('composer', 'g'), ('art-director', 'y')]
    genres = [('electro', 'k'), ('pop', 'b')]

    num_of_bins = 20
    title_font = 25
    seaborn.set_style('white')
    f, ax = plt.subplots(2, 3, figsize=(25, 15))
    f.suptitle("IMDb normalized impact distributions", fontsize=title_font)

    FOLDER = 'ProcessedDataNormalized'
    dist_prefix = FOLDER + '/1_impact_distributions/'

    # ---------------------------------------------- #
    #      MOVIES                                    #
    # ---------------------------------------------- #
    for (label, color) in professions:

        num_car = count_in_thousands('Data/Film/film-' + label +
                                     '-simple-careers')

        file_avg = dist_prefix + 'film_average_ratings_dist_' + label + '.dat'
        file_cnt = dist_prefix + 'film_rating_counts_dist_' + label + '.dat'
        file_mets = dist_prefix + 'film_metascores_dist_' + label + '.dat'
        file_crit = dist_prefix + 'film_critic_review_dist_' + label + '.dat'
        file_user = dist_prefix + 'film_user_review_dist_' + label + '.dat'

        average_ratings = np.asarray(read_rounded(file_avg, 2))
        rating_counts = read_rounded(file_cnt, 2)
        metascores = read_rounded(file_mets, 1)
        critic_review = read_rounded(file_crit, 2)
        user_review = read_rounded(file_user, 2)

        # Average ratings: faint raw points plus the binned curve.
        x_avg, p_avg = getDistribution(average_ratings, True)
        bx_avg, bp_avg, bperr_avg = getBinnedDistribution(
            x_avg, p_avg, num_of_bins)

        ax[0, 0].set_title('IMDb - average rating', fontsize=20)
        ax[0, 0].plot(x_avg,
                      p_avg,
                      color,
                      marker='o',
                      alpha=0.1,
                      linewidth=0,
                      label=label + ', ' + num_car)
        ax[0, 0].errorbar((bx_avg[1:] + bx_avg[:-1]) / 2,
                          bp_avg,
                          yerr=bperr_avg,
                          fmt=color + '-',
                          linewidth=2)

        # Rating counts: points only, log-log.
        x_cnt, p_cnt = getDistribution(rating_counts, True)
        # NOTE(review): the binned result is not plotted for this panel; the
        # call is kept only because the helper is not visible here -- confirm
        # it has no side effects and remove it.
        getBinnedDistribution(x_cnt, p_cnt, num_of_bins)
        loglog_scatter(ax[0, 1], 'IMDb - rating count', x_cnt, p_cnt,
                       color + 'o', 0.8, label)

        # Metascores: raw points plus the binned curve, linear axes.
        x_mets, p_mets = getDistribution(metascores, True)
        bx_mets, bp_mets, bperr_mets = getBinnedDistribution(
            x_mets, p_mets, num_of_bins)
        ax[0, 2].set_title('IMDb - metascores', fontsize=20)
        ax[0, 2].plot(x_mets, p_mets, color + 'o', alpha=0.2, label=label)
        ax[0, 2].errorbar((bx_mets[1:] + bx_mets[:-1]) / 2,
                          bp_mets,
                          yerr=bperr_mets,
                          fmt=color + '-',
                          linewidth=2)

        # Critic review counts: points only, log-log.
        x_crit, p_crit = getDistribution(critic_review, True)
        loglog_scatter(ax[1, 0], 'IMDb - critic_review', x_crit, p_crit,
                       color + 'o', 0.8, label)

        # User review counts: points only, log-log.
        x_user, p_user = getDistribution(user_review, True)
        loglog_scatter(ax[1, 1], 'IMDb - user_review', x_user, p_user,
                       color + 'o', 0.8, label)

    # ---------------------------------------------- #
    #      MUSIC                                     #
    # ---------------------------------------------- #
    for (genre, color) in genres:

        num_mus = count_in_thousands('Data/Music/music-' + genre +
                                     '-simple-careers')

        file_music = dist_prefix + 'music_rating_counts_dist_' + genre + '.dat'
        # Values are rounded with round(value), i.e. no digits argument.
        playcounts = np.asarray(read_rounded(file_music))
        x_play, p_play = getDistribution(playcounts, True)

        # Single argument: prints the same text on Python 2 and 3.
        print(len(playcounts))

        loglog_scatter(ax[1, 2], 'Music - playcount', x_play, p_play,
                       color + 'o', 0.2, genre + ', ' + num_mus)

    align_plot(ax)
    plt.savefig('impact_distributions_normalized.png')
    plt.close()
def get_impact_correlations():
    """Plot pairwise relations between the IMDb impact measures of directors.

    For each mode (``''`` and ``'Normalized'``) the tab-separated file
    ``ProcessedData<mode>/7_multiple_impacts/film_multiple_impacts_<label>.dat``
    is read, six panels are drawn on a 2x3 figure (raw points plus a binned
    curve with error bars), and the figure is saved as
    ``correlations_<mode>.png`` and closed.

    Column meaning, as implied by the axis labels and local names used
    below (NOTE(review): confirm against the code that writes the file):
    0 = title id (fields containing ``'tt'`` are kept as strings),
    1 = average rating, 2 = rating count, 3 = metascore,
    4 = number of critic reviews, 5 = number of user reviews.

    Only the first entry of ``professions`` (director) is plotted.
    Takes no arguments and returns nothing.
    """
    num_of_bins = 12
    title_font = 25
    point_alpha = 0.05
    seaborn.set_style('white')

    professions = [('director', 'royalblue'), ('producer', 'b'),
                   ('writer', 'r'), ('composer', 'g'), ('art-director', 'y')]

    # Style of every binned curve.
    binned_style = dict(fmt='^-', color='r')

    for mode in ['', 'Normalized']:

        f, ax = plt.subplots(2, 3, figsize=(25, 15))
        f.suptitle("IMDb impact correlations - " + mode, fontsize=title_font)

        for (label, color) in professions[0:1]:

            path = ('ProcessedData' + mode +
                    '/7_multiple_impacts/film_multiple_impacts_' + label +
                    '.dat')
            # The context manager closes the file once it has been parsed.
            with open(path) as handle:
                rows = [[
                    field if 'tt' in field else float(field)
                    for field in line.strip().split('\t')
                ] for line in handle]
            # list(...) keeps the columns indexable on Python 3 as well.
            impacts = list(zip(*rows))

            avg_rating = impacts[1]
            rating_cnt = impacts[2]
            metascore = impacts[3]
            critic_cnt = impacts[4]
            user_cnt = impacts[5]

            scatter_style = dict(color=color, alpha=point_alpha, label=label)

            # [0, 0] average rating vs rating count (log x).
            ax[0, 0].set_ylabel('avg rating', fontsize=20)
            ax[0, 0].set_xlabel('rating cnt', fontsize=20)
            ax[0, 0].set_xscale('log')
            # get_rid_of_zeros presumably drops zero entries so the values
            # can be log-binned -- confirm against its definition.
            cnt_nz, avg_nz = get_rid_of_zeros(rating_cnt, avg_rating)
            ax[0, 0].plot(rating_cnt, avg_rating, 'o', **scatter_style)
            xb, pb, pberr = getLogBinnedDistribution(
                np.asarray(cnt_nz), np.asarray(avg_nz), num_of_bins)
            ax[0, 0].errorbar(xb, pb, yerr=pberr, **binned_style)

            # [0, 1] average rating vs metascore (linear bins); the legend
            # carries the Pearson correlation of the binned curve.
            edges, pb, pberr = getBinnedDistribution(
                np.asarray(metascore), np.asarray(avg_rating), num_of_bins)
            meta_nz, avg_nz = get_rid_of_zeros(metascore, avg_rating)
            ax[0, 1].set_xlabel('metascore', fontsize=20)
            ax[0, 1].plot(meta_nz, avg_nz, 'o', **scatter_style)
            centers = (edges[1:] + edges[:-1]) / 2
            corr_text = '$corr=$' + str(
                round(stats.pearsonr(centers, pb)[0], 4))
            ax[0, 1].errorbar(centers,
                              pb,
                              yerr=pberr,
                              label=corr_text,
                              **binned_style)

            # [0, 2] user reviews vs critic reviews (log-log). Critic counts
            # are on x and user counts on y, so the labels are set that way
            # round (they used to be swapped).
            ax[0, 2].set_xlabel('#critic review', fontsize=20)
            ax[0, 2].set_ylabel('#user review', fontsize=20)
            ax[0, 2].set_xscale('log')
            ax[0, 2].set_yscale('log')
            crit_nz, user_nz = get_rid_of_zeros(critic_cnt, user_cnt)
            xb, pb, pberr = getLogBinnedDistribution(
                np.asarray(crit_nz), np.asarray(user_nz), num_of_bins)
            ax[0, 2].plot(crit_nz, user_nz, 'o', **scatter_style)
            ax[0, 2].errorbar(xb, pb, yerr=pberr, **binned_style)

            # [1, 0] metascore vs rating count (log x).
            ax[1, 0].set_xscale('log')
            cnt_nz, meta_nz = get_rid_of_zeros(rating_cnt, metascore)
            xb, pb, pberr = getLogBinnedDistribution(
                np.asarray(cnt_nz), np.asarray(meta_nz), num_of_bins)
            ax[1, 0].set_xlabel('rating cnt', fontsize=20)
            ax[1, 0].set_ylabel('metascore', fontsize=20)
            ax[1, 0].plot(cnt_nz, meta_nz, 'o', **scatter_style)
            ax[1, 0].errorbar(xb, pb, yerr=pberr, **binned_style)

            # [1, 1] critic reviews vs rating count (log-log, raw points
            # unfiltered).
            ax[1, 1].set_xlabel('rating cnt', fontsize=20)
            ax[1, 1].set_ylabel('#critic review', fontsize=20)
            cnt_nz, crit_nz = get_rid_of_zeros(rating_cnt, critic_cnt)
            xb, pb, pberr = getLogBinnedDistribution(
                np.asarray(cnt_nz), np.asarray(crit_nz), num_of_bins)
            ax[1, 1].loglog(rating_cnt, critic_cnt, 'o', **scatter_style)
            ax[1, 1].errorbar(xb, pb, yerr=pberr, **binned_style)

            # [1, 2] user reviews vs rating count (log-log, raw points
            # unfiltered).
            ax[1, 2].set_xlabel('rating cnt', fontsize=20)
            ax[1, 2].set_ylabel('#user review', fontsize=20)
            cnt_nz, user_nz = get_rid_of_zeros(rating_cnt, user_cnt)
            xb, pb, pberr = getLogBinnedDistribution(
                np.asarray(cnt_nz), np.asarray(user_nz), num_of_bins)
            ax[1, 2].loglog(rating_cnt, user_cnt, 'o', **scatter_style)
            ax[1, 2].errorbar(xb, pb, yerr=pberr, **binned_style)

        align_plot(ax)
        plt.savefig('correlations_' + mode + '.png')
        plt.close()
def get_length_plots():
    """Plot per-year product counts for movies and music tracks.

    Reads three files from ``ProcessedData/3_inflation_curves``, converts
    each with ``get_num_per_year`` and draws the results on the top row of a
    2x3 figure with a log y-axis: movies on panel [0, 0], electro and pop
    tracks on panel [0, 1]. Panel [0, 2] only gets its '#books' title. The
    figure is saved as ``num_of_products_length_data.png`` and then closed.

    Takes no arguments and returns nothing.
    """

    def read_stripped_lines(path):
        # All lines of the file, stripped, as a numpy array; the file is
        # closed as soon as it has been read.
        with open(path) as handle:
            return np.asarray([line.strip() for line in handle])

    title_font = 25
    num_of_bins = 20
    seaborn.set_style('white')

    f, ax = plt.subplots(2, 3, figsize=(22, 15))

    # Movies.
    file_avg_year = 'ProcessedData/3_inflation_curves/film_yearly_average_ratings_dist_director.dat'
    movie_x, movie_y = get_num_per_year(read_stripped_lines(file_avg_year))

    ax[0, 0].set_yscale('log')
    ax[0, 0].plot(movie_x, movie_y, 'ko', label='movies', alpha=0.6)
    ax[0, 0].set_title('#movies', fontsize=title_font)

    # Music tracks, one series per genre.
    file_avg_year_electro = 'ProcessedData/3_inflation_curves/music_yearly_rating_counts_dist_electro.dat'
    file_avg_year_pop = 'ProcessedData/3_inflation_curves/music_yearly_rating_counts_dist_pop.dat'

    electro_lines = read_stripped_lines(file_avg_year_electro)
    pop_lines = read_stripped_lines(file_avg_year_pop)

    electro_x, electro_y = get_num_per_year(electro_lines)
    pop_x, pop_y = get_num_per_year(pop_lines)

    # NOTE(review): the binned electro values are not plotted anywhere; the
    # call is kept only because the helper is not visible here -- confirm it
    # has no side effects and remove it.
    getBinnedDistribution(electro_x, electro_y, num_of_bins)

    ax[0, 1].set_yscale('log')
    ax[0, 1].plot(electro_x, electro_y, 'ko', label='electro', alpha=0.6)
    ax[0, 1].plot(pop_x, pop_y, 'bo', label='pop', alpha=0.6)
    ax[0, 1].set_title('#tracks', fontsize=title_font)

    # No book data is loaded here; the panel is only titled.
    ax[0, 2].set_title('#books', fontsize=title_font)

    align_plot(ax)
    plt.savefig('num_of_products_length_data.png')
    # Release the figure after saving, as the other figure-producing
    # functions in this file do.
    plt.close()