Ejemplo n.º 1
0
def plot_trend_season(dates, ndf_domain, x, x_trend, season, my_domain):
    """Plot a weekly volume decomposition (observed / trend / seasonal).

    Parameters
    ----------
    dates : sequence
        Date values assigned to the 'Date' column (one per row of ndf_domain).
    ndf_domain : array-like
        Two-column data (Date, Volume) stacked as observed + trend + season.
    x, x_trend, season : sequence
        The observed, trend, and seasonal series; only their lengths are used
        to label the stacked rows, so len(x) + len(x_trend) + len(season)
        must equal len(ndf_domain).
    my_domain : str
        Domain name used in the plot title.

    Returns
    -------
    ggplot object (faceted line plot, one panel per component).
    """
    # ---------------------- Prepare Data Frame ----------------------- #
    df_domain = pd.DataFrame(ndf_domain, columns=['Date', 'Volume'])
    df_domain['Date'] = dates

    # Label each stacked segment. List multiplication replaces the old
    # Python-2-only `xrange` comprehensions and works on both 2 and 3.
    x_lbl = ['Observed Volume'] * len(x)
    xt_lbl = ['Overall Trend'] * len(x_trend)
    xs_lbl = ['Repeat Sending Trend'] * len(season)
    col3 = pd.DataFrame(x_lbl + xt_lbl + xs_lbl)

    df_plot = pd.concat((df_domain, col3), axis=1)
    df_plot.columns = ['Date', 'Volume', 'Data']

    # ---------------------- Plot Decomposition ----------------------- #
    # NOTE(review): geom_line(color='blue') overrides the aes color mapping;
    # kept as-is since that is the original behavior.
    p = ggplot.ggplot(aes(x='Date', y='Volume', color='Data'), data=df_plot) + \
        ggplot.geom_line(color='blue', size=2) + \
        ggplot.scale_x_date(labels = date_format("%Y-%m-%d"), breaks="1 week") + \
        ggplot.xlab("Week (Marked on Mondays)") + \
        ggplot.ylab("Message Vol") + \
        ggplot.ggtitle("%s Message Volume by Week" % my_domain) + \
        ggplot.facet_grid('Data', scales='free_y') + \
        ggplot.theme_seaborn()

    return p
Ejemplo n.º 2
0
    def plot(self, what='cumulative_payouts', include_ci=True):
        """Build a ggplot line chart of simulation outcomes.

        `what` selects cumulative payouts, average accuracy, or both
        ('all', shown as two facets). `include_ci` adds a shaded
        confidence band. Raises ValueError for any other `what`.
        """
        import ggplot as gg  # This is hacky ... need to DRY out the imports

        if what not in ('cumulative_payouts', 'avg_accuracy', 'all'):
            raise ValueError('%s is not a valid option' % what)

        if what == 'cumulative_payouts':
            plt = self._plot_cumulative_payouts(include_ci=include_ci)
        elif what == 'avg_accuracy':
            plt = self._plot_avg_accuracy(include_ci=include_ci)
        else:
            # Build both sub-plots from one shared summary, then stack their
            # data frames into a single 'Outcome' column faceted by 'Plot'.
            summary = self.summary()
            frames = []
            specs = (
                (self._plot_cumulative_payouts, 'AverageCumulativePayout',
                 'Cumulative Payouts'),
                (self._plot_avg_accuracy, 'AverageAccuracy',
                 'Average Accuracy'),
            )
            for maker, outcome_col, label in specs:
                frame = maker(include_ci=include_ci, summary=summary).data
                frame['Outcome'] = frame[outcome_col]
                frame['Plot'] = label
                frames.append(frame)
            df = frames[0].append(frames[1], ignore_index=True)

            if include_ci:
                plt = gg.ggplot(gg.aes(x='Round', y='Outcome', ymin='ymin', ymax='ymax'), data=df) + \
                    gg.geom_area(alpha=0.5)
            else:
                plt = gg.ggplot(gg.aes(x='Round', y='Outcome'), data=df)

            plt += gg.facet_grid('Plot', scales='free')

        return plt + gg.geom_line()
Ejemplo n.º 3
0
def graph1(score_data):
    """ Average score as time goes on;
        Creates and returns graph 1, a line graph.

        score_data is a list of rows where row 0 holds the column headers
        and the remaining rows hold the responses. """

    # Column header that holds the timestamp for each response.
    date_column = score_data[0][find_time_stamp(score_data)]

    data = DataFrame(score_data[1:], columns=score_data[0])

    # Get all columns that are numerical (int64) questions so we know
    # what to graph.
    num_questions = data.select_dtypes(include=['int64']).columns.values

    # Melt data so that each question is in a separate row
    new_data = pd.melt(data,
                       id_vars=date_column,
                       value_vars=num_questions,
                       var_name="Question",
                       value_name="Score")

    # Convert date string into an actual date type
    new_data[date_column] = pd.to_datetime(new_data[date_column],
                                           format="%m/%d/%Y")

    # Group all rows with same date and question, and then take the average.
    new_data = new_data.groupby([date_column, 'Question']).mean().reset_index()
    new_data['All'] = "Indiviual Questions"

    # Second frame: the per-date average across all questions, shown as
    # its own facet alongside the individual questions.
    new_data2 = new_data.groupby(date_column).mean().reset_index()
    new_data2['Question'] = "All Questions"
    new_data2['All'] = "Average of All Questions"

    new_data = pd.concat([new_data, new_data2])

    # Cast dates to int64 so ggplot can place them on a continuous x axis
    # (the tick labels are suppressed below anyway).
    new_data[date_column] = new_data[date_column].astype('int64')

    # Create time graph with separate lines for each question
    ret = ggplot.ggplot(ggplot.aes(x=date_column, y="Score", colour="Question"), new_data) +\
        ggplot.geom_point() +\
        ggplot.geom_line() +\
        ggplot.facet_grid("All") +\
        ggplot.scale_x_continuous(labels=[""], breaks=0) +\
        ggplot.labs(x="Time", y="Average Question Score") +\
        ggplot.ggtitle("Question Scores Over Time")
    return ret
Ejemplo n.º 4
0
 def _make_grid(grid):
     # Combine a grid's plots into one faceted figure with a fixed scale.
     # NOTE(review): `grid.plot[0]` vs `grid.plots()` looks inconsistent —
     # presumably both refer to the same plot collection; confirm against
     # the grid API. Also verify whether `/` is intended as true division
     # here (under Python 2 this would be integer division before ceil).
     columns = ceil(grid.n_rows / len(grid.plots()))
     return grid.plot[0] + facet_grid(grid.n_rows, columns, scales="fixed")
Ejemplo n.º 5
0
def plot_weather_data(turnstile_weather):

    '''
    You are passed in a dataframe called turnstile_weather.
    Use turnstile_weather along with ggplot to make a data visualization
    focused on the MTA and weather data we used in assignment #3.

    This implementation plots average ridership (subway entries) per ISO
    weekday, faceted by rain status.

    To see all the columns and data points included in the turnstile_weather
    dataframe, check out:
    https://www.dropbox.com/s/meyki2wl9xfa7yk/\n
    turnstile_data_master_with_weather.csv

    ggplot documentation: https://pypi.python.org/pypi/ggplot/
    '''

    df = turnstile_weather.copy()

    # we will remove national holidays from the data. May 30 is Memorial Day,
    # the only national holiday in our data set. Normally this would be done
    # by passing in the data more elegantly, but since this is a bit more
    # constrained, we will simply hard code it into the function.
    national_holidays = ['2011-05-30']
    for holiday in national_holidays:
        df = df[df.DATEn != holiday]

    # add a column to represent the ISO day of the week for each data point.
    df[u'weekday'] = df[u'DATEn'].apply(
        lambda x: datetime.datetime.strptime(x, '%Y-%m-%d').isoweekday())

    # Introduce a multiplier so that the ENTRIESn_hourly values can be
    # modified when we have multiple data days. For example, if we have
    # 2 rainy Fridays the multiplier is 1/2, so that summing the modified
    # values gives the average number of riders entering the subway system
    # on a rainy Friday.
    for day in df.weekday.unique():
        for rain_status in df.rain.unique():
            # Select the (weekday, rain) group once instead of rebuilding
            # the same boolean mask three times per iteration.
            group = df[(df.weekday == day) & (df.rain == rain_status)]

            # number of unique dates with the same weekday and rain status
            u = group.DATEn.nunique()
            multiplier = 1.0 / u if u else 0

            # Sum only the column we need rather than the whole DataFrame.
            entries_sum = group.ENTRIESn_hourly.sum()

            df.loc[group.index, u'ENTRIESn_hourly'] = multiplier * entries_sum

    # The dataframe is now ready to be plotted.
    p = ggplot.ggplot(ggplot.aes(x = u'factor(weekday)', \
                                 weight = u'ENTRIESn_hourly', \
                                 fill = u'weekday'),\
                      data = df) +\
        ggplot.geom_bar() +\
        ggplot.facet_grid(x = u'rain', y = u'weekday') +\
        ggplot.ggtitle('Average Ridership on Sunny & Rainy ISO Weekdays')
    print(p)
    return p
Ejemplo n.º 6
0
import ggplot as gg
import ultrasignup as us
import numpy as np

# Fetch all results for UltraSignup event id 299 (network call).
d = us.event_results(299)

# Histogram of 50K finishing times (all years), binned in half-hour
# buckets, one facet per gender; times <= 1 hour are treated as bad data.
p1 = gg.ggplot(
    gg.aes(x='time_hour',fill='gender'),d[(d.distance=='50K')&(d.time_hour>1.0)]) + \
  gg.facet_grid(x='gender') + \
  gg.geom_bar(stat="bin",binwidth=.5,position="dodge",colour="black") + \
  gg.xlab("Time (hours)") + gg.ylab("Number of Finishers") + \
  gg.ggtitle("50K Finishing Times for All Years")

# Same plot for the 11 Miler distance.
p2 = gg.ggplot(
    gg.aes(x='time_hour',fill='gender'),d[(d.distance=='11 Miler')&(d.time_hour>1.0)]) + \
  gg.facet_grid(x='gender') + \
  gg.geom_bar(stat="bin",binwidth=.5,position="dodge",colour="black") + \
  gg.xlab("Time (hours)") + gg.ylab("Number of Finishers") + \
  gg.ggtitle("11M Finishing Times for All Years")
Ejemplo n.º 7
0
        },
        index=range(t * len(count_tops),
                    t * len(count_tops) + len(count_tops)))
    probs_list.append(probs_t)
    # Calculate KL divergences
    kl_mle_list.append(stats.entropy(true_bins_t, mle_probs_vals))
    kl_nn_list.append(stats.entropy(true_bins_t, nn_probs_t))

# Stack the per-timestep probability frames accumulated above into one frame.
probs = pd.concat(probs_list)

# In[44]:

# Restrict to the long-tenor tail (Tenor > 360).
probs_tail = probs[probs.Tenor > 360]

# Faceted bar chart of the true probabilities per tenor, with the MLE and
# NN estimates overlaid as step lines.
gg.ggplot(probs_tail, gg.aes(x='Count Top', weight='Probs True')
          ) + gg.facet_grid('Tenor') + gg.geom_bar() + gg.geom_step(
              gg.aes(y='Probs MLE', color='red')) + gg.geom_step(
                  gg.aes(y='Probs NN', color='blue')) + gg.scale_x_continuous(
                      limits=(0, len(count_tops)))

# In[57]:

# KL divergences of MLE and NN estimates vs. the true distribution,
# one row per tenor (collected in the loop above).
kl_df = pd.DataFrame({
    'Tenor': range(0, t_end + 1),
    'KL MLE': kl_mle_list,
    'KL NN': kl_nn_list
})

print kl_df.head()
                	           'Pi':windowPi(sorted(list(set(vcfdf['window']))))})


# Now try and plot graph
	p_MaxMinor = gg.ggplot(gg.aes('window', 'MaxMinor'),data=windowed_df) +gg.geom_point() +gg.theme_bw() +gg.labs(x="Genome Position (bp; windowsize="+ str(windowsize) +")", y="Minor Variant Frequency (%)") +gg.ggtitle(vcfoutput + "\n Valid Minor Variant Sites :" + str(len(minorvar))) 


# Plot Nucleotide Diversity (Pi) along genome 
	p_pi =gg.ggplot(gg.aes('window', 'Pi'),data=windowed_df) +gg.geom_point() +gg.theme_bw() +gg.labs(x="Genome Position (bp; windowsize="+ str(windowsize) +")", y="Mean nucleotide diversity (" + u"\u03c0" +")") +gg.scale_y_continuous(expand=(0,0),limits=(0, windowed_df['Pi'].max(axis=0)+0.001)) +gg.ggtitle(vcfoutput + "\n Genome-wide Mean Nucleotide Diversity (" +u"\u03c0"+ ") :" +str(round(gw_Pi,6))) 

#p_pi

# Facetted plot (still not sorted y axes labels yet)
	windowed_df_melt = pd.melt(windowed_df, id_vars=['window'])
	p_combi = gg.ggplot(gg.aes('window', 'value',colour='variable'),data=windowed_df_melt)
	p_combi = p_combi + gg.geom_point(colour='variable') + gg.facet_grid('variable',scales='free_y')+gg.theme_bw() +gg.labs(x="Genome Position (bp; windowsize="+ str(windowsize) +")")

# Print graphs to .png
	p_combi.save(vcfinput + ".MinorVar_combo.png")
	p_MaxMinor.save(vcfinput + ".MinorVar.png")
	p_pi.save(vcfinput + ".Pi-diversity.png")



# Write the full analysed dataframe and the minor-variant subset to
# separate tab-delimited files next to the input VCF.
vcfdf.to_csv(vcfinput + ".analysed.tsv",sep='\t', index=False)
minorvar.to_csv(vcfinput + ".minorvars.tsv",sep='\t', index=False)



Ejemplo n.º 9
0
 def density(self, inp1, inp2, inp3):
     """Faceted density plot of column `inp1`, colored and filled by
     `inp2`, with one facet per value of `inp3`."""
     aesthetics = gg.aes(x=inp1, color=inp2, fill=inp2)
     plot = gg.ggplot(self.data, aesthetics)
     plot += gg.geom_density(alpha=0.5, size=5)
     plot += gg.facet_grid(inp3)
     plot += gg.ggtitle('Density of Fare by Sex and Survival Status')
     plot += gg.ylab('Survival Status')
     return plot
Ejemplo n.º 10
0
 def test_ndim_1_facet_grid_col(self):
     # A column-only facet_grid over 'clarity' should lay out 1 row x 8 cols.
     plot = gg.ggplot(gg.aes(x='price'), gg.diamonds) + gg.facet_grid(None, 'clarity')
     self.assertEqual(plot.facets.nrow, 1)
     self.assertEqual(plot.facets.ncol, 8)
Ejemplo n.º 11
0
 def test_ndim_2facet_grid_reverse(self):
     # Faceting rows by 'clarity' (8 levels) and cols by 'cut' (5 levels).
     plot = gg.ggplot(gg.aes(x='price'), gg.diamonds) + gg.facet_grid('clarity', 'cut')
     self.assertEqual(plot.facets.nrow, 8)
     self.assertEqual(plot.facets.ncol, 5)
Ejemplo n.º 12
0
def plot_weather_data(turnstile_weather):
    '''
    You are passed in a dataframe called turnstile_weather.
    Use turnstile_weather along with ggplot to make a data visualization
    focused on the MTA and weather data we used in assignment #3.

    This implementation plots average ridership (subway entries) per ISO
    weekday, faceted by rain status.

    To see all the columns and data points included in the turnstile_weather
    dataframe, check out:
    https://www.dropbox.com/s/meyki2wl9xfa7yk/\n
    turnstile_data_master_with_weather.csv

    ggplot documentation: https://pypi.python.org/pypi/ggplot/
    '''

    df = turnstile_weather.copy()

    # we will remove national holidays from the data. May 30 is Memorial Day,
    # the only national holiday in our data set. Normally this would be done
    # by passing in the data more elegantly, but since this is a bit more
    # constrained, we will simply hard code it into the function.
    national_holidays = ['2011-05-30']
    for holiday in national_holidays:
        df = df[df.DATEn != holiday]

    # add a column to represent the ISO day of the week for each data point.
    df[u'weekday'] = df[u'DATEn'].apply(
        lambda x: datetime.datetime.strptime(x, '%Y-%m-%d').isoweekday())

    # Introduce a multiplier so that the ENTRIESn_hourly values can be
    # modified when we have multiple data days. For example, if we have
    # 2 rainy Fridays the multiplier is 1/2, so that summing the modified
    # values gives the average number of riders entering the subway system
    # on a rainy Friday.
    for day in df.weekday.unique():
        for rain_status in df.rain.unique():
            # Select the (weekday, rain) group once instead of rebuilding
            # the same boolean mask three times per iteration.
            group = df[(df.weekday == day) & (df.rain == rain_status)]

            # number of unique dates with the same weekday and rain status
            u = group.DATEn.nunique()
            multiplier = 1.0 / u if u else 0

            # Sum only the column we need rather than the whole DataFrame.
            entries_sum = group.ENTRIESn_hourly.sum()

            df.loc[group.index, u'ENTRIESn_hourly'] = multiplier * entries_sum

    # The dataframe is now ready to be plotted.
    p = ggplot.ggplot(ggplot.aes(x = u'factor(weekday)', \
                                 weight = u'ENTRIESn_hourly', \
                                 fill = u'weekday'),\
                      data = df) +\
        ggplot.geom_bar() +\
        ggplot.facet_grid(x = u'rain', y = u'weekday') +\
        ggplot.ggtitle('Average Ridership on Sunny & Rainy ISO Weekdays')
    print(p)
    return p
Ejemplo n.º 13
0
 def test_ndim_1_facet_grid_col(self):
     # A column-only facet_grid over 'clarity' should lay out 1 row x 8 cols.
     plot = gg.ggplot(gg.aes(x='price'), gg.diamonds)
     plot += gg.facet_grid(None, 'clarity')
     self.assertEqual(plot.facets.nrow, 1)
     self.assertEqual(plot.facets.ncol, 8)
Ejemplo n.º 14
0
 def test_ndim_2facet_grid_reverse(self):
     # Faceting rows by 'clarity' (8 levels) and cols by 'cut' (5 levels).
     plot = gg.ggplot(gg.aes(x='price'), gg.diamonds)
     plot += gg.facet_grid('clarity', 'cut')
     self.assertEqual(plot.facets.nrow, 8)
     self.assertEqual(plot.facets.ncol, 5)