# Imports assumed by this snippet; LOG_INFO / LOG_ERROR are project-specific logging helpers.
import datetime
import os
import pickle

import pandas as pd
import ggplot


def main(file_path):
    # Validate raw data path
    if not os.path.exists(file_path):
        LOG_ERROR('Could not find file: {}'.format(file_path))
        return

    # Validate raw data file type
    if not file_path.endswith('.pkl'):
        LOG_ERROR('File path must be a pickle file')
        return

    with open(file_path, 'rb') as f:
        LOG_INFO('Parsing pickle file: {}'.format(file_path))
        conversation = pickle.load(f)

        LOG_INFO('Found conversation: {}'.format(conversation['conversation_name']))

        df = pd.DataFrame(conversation['messages'])
        df.columns = ['Timestamp', 'Type', 'Participant']
        # df['Datetime'] = pd.to_datetime(df['Timestamp'])
        df['Datetime'] = df['Timestamp'].apply(lambda x:
                datetime.datetime.fromtimestamp(float(x)).toordinal())

        histogram = ggplot.ggplot(df, ggplot.aes(x='Datetime', fill='Participant')) \
                        + ggplot.geom_histogram(alpha=0.6, binwidth=2) \
                        + ggplot.scale_x_date(labels='%b %Y') \
                        + ggplot.ggtitle(conversation['conversation_name']) \
                        + ggplot.ylab('Number of messages') \
                        + ggplot.xlab('Date')

        print(histogram)
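
# A minimal invocation sketch (not part of the original snippet); the
# .pkl path below is a placeholder.
if __name__ == '__main__':
    import sys
    # Expects one argument: path to a pickled dict with
    # 'conversation_name' and 'messages' keys, as parsed above.
    main(sys.argv[1] if len(sys.argv) > 1 else 'conversation.pkl')
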
def plot_line(X, y, title=None, labelx=None, labely=None, save=False, colors=None):
    '''
    Show a line plot on screen. Can save it to a .pdf file too if specified.

    X, y - x-axis values, and a DataFrame whose columns are plotted as lines
    '''
    df = pandas.DataFrame()

    if title is not None:
        img_title = title.replace(" ", "").replace(".", "-") + ".pdf"

    df['X'] = X
    for i in range(y.shape[1]):
        df[str(i)] = y.iloc[:, i].values

    if colors is None:
        colors = list(dict(mcolors.BASE_COLORS, **mcolors.CSS4_COLORS).keys())

    df = df.iloc[0:df.shape[0] - 1, :]
    p = ggplot(df, aes(x='X'))

    # One line per column of y, each with its own colour. (The original
    # branched on `colors not in X.columns.values`, which compares the
    # colour list against column names and could never sensibly select
    # the geom_point branch, so the line branch is kept.)
    for i in range(y.shape[1]):
        p = p + geom_line(aes(y=str(i), color=colors[i]))

    p = p + xlab(labelx) + ylab(labely) + ggtitle(title)

    if save and title is not None:
        p.save(img_title)
    else:
        return p
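
# A minimal call sketch for plot_line; the imports below are assumptions
# (the snippet references pandas, mcolors and ggplot names without
# importing them), and the sample data is invented.
import pandas
import matplotlib.colors as mcolors
from ggplot import ggplot, aes, geom_line, xlab, ylab, ggtitle

X_demo = pandas.Series(range(10))
y_demo = pandas.DataFrame({'a': range(10), 'b': range(0, 20, 2)})
p_demo = plot_line(X_demo, y_demo, title="Example plot", labelx="x", labely="y")
print(p_demo)
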
def plot_update_frequency(result):    
    import pandas as pd

    # turn query results into a time series of changes
    d = []
    v = []
    for res in result:
        d.append(pd.Timestamp(res['_id']['timestamp']).to_pydatetime())
        v.append(res['count'])

    ts = pd.DataFrame(v, index=d, columns=['changes'])
    ts = ts.resample('W').sum()
    ts.index.names = ['date']

    import ggplot
    #plots timeseries of changes       
    p = ggplot.ggplot(ts, ggplot.aes(x = ts.index, y=ts['changes'])) +\
            ggplot.geom_point(color = 'blue') +\
            ggplot.xlab('Period') +\
            ggplot.ylab('Changes') +\
            ggplot.geom_smooth() +\
            ggplot.ylim(low = 0) +\
            ggplot.scale_x_date(breaks = ggplot.date_breaks("12 months"),  labels = ggplot.date_format('%Y-%m')) +\
            ggplot.ggtitle('OpenStreetMaps Denver-Boulder\nChanges per Week')
    return p
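
# For reference, a minimal sketch of the `result` shape this function
# expects -- a list of aggregation documents (e.g. from MongoDB) with
# _id.timestamp and count fields; the values here are invented.
result_demo = [
    {'_id': {'timestamp': '2014-01-06'}, 'count': 12},
    {'_id': {'timestamp': '2014-01-13'}, 'count': 30},
]
print(plot_update_frequency(result_demo))
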
    def density_plot(by='dpsi_zscore', categorical=True):

        if categorical:
            data_dict = {
                'muts increasing AAA':
                np.array([x[by] for x in variants['increase']]),
                'muts decreasing AAA':
                np.array([x[by] for x in variants['decrease']]),
                'muts not changing AAA length':
                np.array([x[by] for x in variants['constant']])
            }
        else:
            data_dict = OrderedDict(
                (change,
                 np.array(
                     [x[by] for x in variants['all']
                      if x['change'] == change])) for change in aaa_changes if
                len([x[by]
                     for x in variants['all'] if x['change'] == change]) > 1)

        plot = (
            ggplot(aes(x='value', colour='variable', fill='variable'),
                   data=prepare_data_frame(data_dict)) +
            ggtitle('Impact of variants affecting poly AAA sequences on %s' %
                    by) + xlab(by) + ylab('Kernel density estimate') +
            geom_density(alpha=0.6))

        return plot
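
# prepare_data_frame is an external helper not shown here; a plausible
# sketch, assuming it melts data_dict into the long-form 'variable' /
# 'value' columns that the aes() call above maps.
import pandas as pd

def prepare_data_frame(data_dict):
    # Hypothetical reconstruction: stack each named array into long form,
    # with the series name in 'variable' and its values in 'value'.
    frames = [pd.DataFrame({'variable': name, 'value': values})
              for name, values in data_dict.items()]
    return pd.concat(frames, ignore_index=True)
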
def plot_trend_season(dates, ndf_domain, x, x_trend, season, my_domain):
    # ---------------------- Prepare Data Frame ----------------------- #
    df_domain = pd.DataFrame(ndf_domain, columns=['Date', 'Volume'])
    df_domain['Date'] = dates

    x_lbl = ['Observed Volume' for i in range(len(x))]
    xt_lbl = ['Overall Trend' for i in range(len(x_trend))]
    xs_lbl = ['Repeat Sending Trend' for i in range(len(season))]
    col3 = pd.DataFrame(x_lbl + xt_lbl + xs_lbl)

    df_plot = pd.concat((df_domain, col3), axis=1)
    df_plot.columns = ['Date', 'Volume', 'Data']


    # ---------------------- Plot Decomposition ----------------------- #
    p = ggplot.ggplot(ggplot.aes(x='Date', y='Volume', color='Data'), data=df_plot) + \
        ggplot.geom_line(color='blue', size=2) + \
        ggplot.scale_x_date(labels=ggplot.date_format("%Y-%m-%d"), breaks="1 week") + \
        ggplot.xlab("Week (Marked on Mondays)") + \
        ggplot.ylab("Message Vol") + \
        ggplot.ggtitle("%s Message Volume by Week" % my_domain) + \
        ggplot.facet_grid('Data', scales='free_y') + \
        ggplot.theme_seaborn()

    return p
def render(data, bin_width, plot_density=False):
    if plot_density:
        plot = ggplot.ggplot(data, ggplot.aes(x='datetime', color='conversationWithName')) \
               + ggplot.geom_density() \
               + ggplot.scale_x_date(labels='%b %Y') \
               + ggplot.ggtitle('Conversation Densities') \
               + ggplot.ylab('Density') \
               + ggplot.xlab('Date')
    else:
        plot = ggplot.ggplot(data, ggplot.aes(x='datetime', fill='conversationWithName')) \
               + ggplot.geom_histogram(alpha=0.6, binwidth=bin_width) \
               + ggplot.scale_x_date(labels='%b %Y', breaks='6 months') \
               + ggplot.ggtitle('Message Breakdown') \
               + ggplot.ylab('Number of Messages') \
               + ggplot.xlab('Date')

    print(plot)
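
# A minimal usage sketch for render(); the frame carries the two columns
# the aes() calls above expect (datetime, conversationWithName), with
# invented values.
import datetime
import pandas as pd

data_demo = pd.DataFrame({
    'datetime': [datetime.date(2016, 1, 1), datetime.date(2016, 1, 8),
                 datetime.date(2016, 2, 1)],
    'conversationWithName': ['alice', 'bob', 'alice'],
})
render(data_demo, bin_width=7)                      # histogram branch
render(data_demo, bin_width=7, plot_density=True)   # density branch
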
 def plot_roc(self, experiment_type, to_plot):
     # turn this to string for categorical colour scheme
     to_plot.loc[:, "parameter"] = [str(par) for par in to_plot.loc[:, "parameter"]]
     p = gg.ggplot(data = to_plot, aesthetics = gg.aes(x = "FPR", y = "TPR", colour = "parameter")) + \
         gg.geom_line(gg.aes(x = "FPR", y = "TPR", colour = "parameter")) + \
         gg.ggtitle(experiment_type) + gg.xlab("FPR") + gg.ylab("TPR")
     gg.ggsave(filename = self.results_path + experiment_type + "_" + self.mode + ".png", plot = p)
     return
	def plotAverageLatency(self):
		averages = [d.averageLatency() for d in self.data]
		dat = { "device" : range(1, len(averages) + 1), "average" : averages }
		dataframe = pandas.DataFrame(dat)
		chart = ggplot.ggplot(ggplot.aes(x="device", weight="average"), dataframe) \
				+ ggplot.labs(title="Average Latency Per Device") + \
				ggplot.ylab("Average Latency (ms)") + \
				ggplot.xlab("Device Number")  + \
				ggplot.geom_bar(stat="identity")
		chart.show()
def plot_weather_data(df):
	df.DATEn = pd.to_datetime(df.DATEn)
	grouped = df.groupby('DATEn', as_index=False).sum()
	grouped.index.name = 'DATEn'
	plot = gp.ggplot(grouped, gp.aes(x='DATEn', y='EXITSn_hourly'))
	plot += gp.geom_line()
	plot += gp.ggtitle('Subway Ridership by Day')
	plot += gp.xlab('Date')
	plot += gp.ylab('Exits')
	return plot
def lineplot(hr_year_csv):
    df = pandas.read_csv(hr_year_csv)
    gg = (
        gp.ggplot(df, gp.aes(x="yearID", y="HR"))
        + gp.geom_point(color="red")
        + gp.geom_line(color="red")
        + gp.ggtitle("Homeruns by Year")
        + gp.xlab("Homeruns")
        + gp.ylab("Year")
    )
    return gg
def lineplot_compare(filename):
    df = pd.read_csv(filename)
    gg = (
        gp.ggplot(df, gp.aes(x="yearID", y="HR", color="teamID"))
        + gp.geom_point()
        + gp.geom_line()
        + gp.ggtitle("Homeruns by Year by Team")
        + gp.xlab("Homeruns")
        + gp.ylab("Year")
    )
    return gg
def plot(mydata, opts):
    # number of mutants killed by exactly 0 tests
    nd = sum(mydata[mydata.ntests == 0].exactly)
    d = sum(mydata[mydata.ntests != 0].exactly)
    total = nd + d
    print("Not detected = ", nd, "/", total)
    title = opts['title'] + (' ND=%d/%d (Mu: %3.1f%%)' %
                             (nd, total, (1 - nd / total) * 100.0))
    p = gg.ggplot(gg.aes(x=opts['x'], y=opts['y']), data=mydata) + gg.geom_point() + \
        gg.xlab(opts['x']) + gg.ylab(opts['y']) + gg.ggtitle(title)
    # p = p + gg.xlim(0, lim)

    p.save(opts['file'])
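
# A minimal usage sketch; the column names (ntests, exactly) and the opts
# keys follow the function body above, while the values are invented.
import pandas as pd
import ggplot as gg

mydata_demo = pd.DataFrame({'ntests': [0, 1, 2, 3],
                            'exactly': [5, 10, 7, 3]})
opts_demo = {'title': 'Mutants', 'x': 'ntests', 'y': 'exactly',
             'file': 'mutants.png'}
plot(mydata_demo, opts_demo)  # prints "Not detected =  5 / 25", saves mutants.png
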
def lineplot_compare(filename):  # Cleaner version with string vars
    df = pd.read_csv(filename)
    p_title = "Homeruns by Year by Team"
    p_xlab = "Homeruns"
    p_ylab = "Year"
    gg = (
        gp.ggplot(df, gp.aes(x="yearID", y="HR", color="teamID"))
        + gp.geom_point()
        + gp.geom_line()
        + gp.ggtitle(p_title)
        + gp.xlab(p_xlab)
        + gp.ylab(p_ylab)
    )
    return gg
    def plot_distance_VS_sp_per_bin_gt150(self):
        fname = "distance_VS_sp_per_bin_gt150"

        sp_gt150mask = self.d.len_contigs_per_bin["SNP-pairs"] > 150

        p = self._plot_scat_w_line(gp.ggplot(gp.aes(x='distance_bin',
                                              y='SNP-pairs'),
                                       data=self.d.len_contigs_per_bin[sp_gt150mask])
                             ) + gp.ylab('SNP-pairs per bin')

        # print p

        self.save_figs(base_dir=self.base_dir,
                       fname=fname,
                       save_types=self.formats,
                       is_ggplot=p
                       )
def googletrend_command(delta_t, threshold=0.0, inverse=False):
    """the command to run google trend algorithm.

	:param delta_t:   the upper bound for original delta_t parameter
    :param threshold: upper bound for the threshold of differentiating two classes
    :param inverse:   whether to inverse the classifier
	"""
    ## handle filepath and title based on parameter inverse
    filename = "googletrend"
    titlename = "ROC of google trend classifier"
    if inverse:
        filename += "_inverse"
        titlename += " (inverse version)"
    filepath = "./plots/%s.jpg" % filename
    ## generate data first
    data = googletrend.preprocess()
    ## store classifier evaluation metrics into dict
    output = {}
    output['tpr'] = []
    output['fpr'] = []
    output['plot'] = []
    for thre in np.arange(0, threshold + 0.1, 0.1):
        print("==> threshold: %f, inverse: %s" % (thre, inverse))
        for i in range(1, int(delta_t)):
            googletrend.algorithm(data, i, thre, inverse)
            tp_rate, fp_rate = googletrend.evaluate(data)
            # print "delta_t: %d, TPR: %f, FPR: %f" % (i, tp_rate, fp_rate)
            output['tpr'].append(tp_rate)
            output['fpr'].append(fp_rate)
            output['plot'].append('thre_' + str(thre))
    ## plot ROC graph
    ## add a y=x baseline for comparison
    output['tpr'].extend([0.0, 1.0])
    output['fpr'].extend([0.0, 1.0])
    output['plot'].extend(['baseline', 'baseline'])
    df = pd.DataFrame(output)
    graph = gg.ggplot(df, gg.aes('fpr', 'tpr', color='plot')) + \
        gg.theme_seaborn() + \
        gg.ggtitle(titlename) + \
        gg.xlab("FPR") + \
        gg.ylab("TPR") + \
        gg.xlim(0.0, 1.0) + \
        gg.ylim(0.0, 1.0) + \
        gg.geom_point() + \
        gg.geom_line()
    gg.ggsave(plot=graph, filename=filepath, width=6, height=6, dpi=100)
def data_output(data, chart_title):
    print("Good News! Your data has been returned. I'm happy to show it to you.")
    print("Just tell me how you want it - Table or Line Graph?")

    data_output = input("Choose table or line > ")

    if data_output[0].lower() == "t":
        print("Ok, here's your data.")
        print(data)
    elif data_output[0] == "l" or data_output[0].lower() == "g":
        import ggplot as gg

        plot = gg.ggplot(gg.aes(x='Month, Year', y='Value'), data=data) + \
            gg.geom_point(color='black') + \
            gg.geom_line(color='green') + \
            gg.ggtitle(chart_title) + \
            gg.xlab("Month, Year") + \
            gg.ylab("Value") + \
            gg.scale_x_date(breaks=gg.date_breaks('1 month'), labels=gg.date_format("%B"))

        print(plot + gg.theme_xkcd())

def lineplot(hr_year_csv):
    # Assume that we have a pandas dataframe file called hr_year,
    # which contains two columns -- yearID, and HR.
    #
    # The pandas dataframe contains the number of HR hit in the
    # Major League baseball in each year.  Can you write a function,
    # lineplot, that creates a chart with points connected by lines, both
    # colored 'red', showing the number of HR by year?
    #
    # You can check out the data loaded into the dataframe at the link below:
    # https://www.dropbox.com/s/awgdal71hc1u06d/hr_year.csv

    # your code here

    df = pd.read_csv('hr_year.csv')
    gg = gp.ggplot(df, gp.aes('yearID', 'HR'))
    gg += gp.geom_point(color='red')
    gg += gp.geom_line(color='red')
    gg += gp.ggtitle('Total HRs by Year')
    gg += gp.xlab('Year')
    gg += gp.ylab('HR')

    return gg
                           m=1,
                           alpha=alpha))

    #save results to results folder, with plot and printing to screen.
    metadata = datetime.datetime.now().strftime(
        "%Y-%m-%d %H:%M:%S") + "test_mode==" + str(test_mode)
    f = open("results/LSH_vs_KDT_%s.pkl" % metadata, mode='w')
    pkl.dump(obj=results, file=f)

    logtimes = [math.log(r.avg_time, 2) for r in results]
    distances = [r.avg_distance for r in results]
    methods = [r.method[0:3] for r in results]
    alpha = [r.alpha for r in results]
    m = [r.m for r in results]
    results_df = pd.DataFrame(
        data={
            "logtimes": logtimes,
            "distances": distances,
            "methods": methods,
            "m": m,
            "alpha": alpha
        })
    print(results_df)
    p = gg.ggplot(data = results_df, aesthetics = gg.aes(x = "logtimes",
                                                         y = "distances",
                                                         label = "methods")) + \
        gg.geom_text() + \
        gg.ggtitle("LSH and KD trees: tradeoffs") + \
        gg.xlab("Log2 average query time  ") + gg.ylab("Average L2 distance from query point)")
    gg.ggsave(filename="results/LSH_vs_KDT_%s.png" % metadata, plot=p)
ax.set_xlim(0, 23)

turnstile_rain = turnstile_weather[["rain", "ENTRIESn_hourly", "EXITSn_hourly"]]
turnstile_rain["rain2"] = np.where(turnstile_rain["rain"] == 1, "raining", "not raining")
turnstile_rain.groupby("rain2").describe()

turnstile_rain = turnstile_weather[["rain", "ENTRIESn_hourly", "EXITSn_hourly"]]
turnstile_rain["ENTRIESn_hourly_log10"] = np.log10(turnstile_rain["ENTRIESn_hourly"] + 1)
turnstile_rain["rain2"] = np.where(turnstile_rain["rain"] == 1, "raining", "not raining")
set1 = brewer2mpl.get_map('Set1', 'qualitative', 3).mpl_colors
plot = gg.ggplot(turnstile_rain, gg.aes(x="ENTRIESn_hourly_log10", color="rain2")) + \
       gg.geom_density() + \
       gg.facet_wrap("rain2", scales="fixed") + \
       gg.scale_colour_manual(values=set1) + \
       gg.xlab("log10(entries per hour)") + \
       gg.ylab("Number of turnstiles") + \
       gg.ggtitle("Entries per hour whilst raining and not raining")
plot

np.random.seed(42)
data = pd.Series(np.random.normal(loc=180, scale=40, size=600))
data.hist()

p = turnstile_weather["ENTRIESn_hourly"].hist()
pylab.suptitle("Entries per hour across all stations")
pylab.xlabel("Entries per hour")
pylab.ylabel("Number of occurrences")

turnstile_weather["grp"]=turnstile_weather["rain"]+turnstile_weather["fog"]
plot = ggplot(aes(y='ENTRIESn_hourly', x='Hour'), data=turnstile_weather) + geom_histogram() + xlab("Hour") + ylab("ENTRIESn_hourly") + ggtitle("T")
print(plot)
#total-based
dftmp = df[['n_sub']+brks[:5]].melt(id_vars=['n_sub'],value_vars=brks[:5], var_name = 'stat',value_name = 'value')
dftmp['method']=['(Total-Expected Total)/Expected Total']*dftmp['n_sub'].size
df_stacked = dftmp
#enhancement-based
dftmp = df[['n_sub']+brks[5:10]].melt(id_vars=['n_sub'],value_vars=brks[5:10], var_name = 'stat',value_name = 'value')
dftmp['method']=['(Enhanc-Expected Enhanc)/Expected Enhanc']*dftmp['n_sub'].size
df_stacked = df_stacked.append(dftmp)
#enhancements + full sample background
dftmp = df[['n_sub']+brks[10:]].melt(id_vars=['n_sub'],value_vars=brks[10:], var_name = 'stat',value_name = 'value')
dftmp['method']=['(Enhanc+Expected Backgr-Expected Total)/Expected Total']*dftmp['n_sub'].size
df_stacked = df_stacked.append(dftmp)
df_stacked['percentile']=['{0}th%'.format(a[1:3]) for a in df_stacked['stat']]
#plots
#compare all 3
plt1 = gg.ggplot(df_stacked, gg.aes(x='n_sub',y='value',color='percentile'))+gg.geom_line()+gg.xlab('N drives')+gg.ylab('Bias (%)')+gg.theme_bw()+gg.scale_color_manual(values=colors)+gg.geom_hline(y=[-25,25],linetype="dashed",color="gray")+gg.geom_vline(x=[10,15],linetype="dashed",color="gray")+gg.facet_wrap('method')+gg.ggtitle('Bias comparison {0}'.format(title))
plt1.save(filename = r'..\charts\drivebias_laqn_{0}.png'.format(species), width=None, height=None, dpi=300)

#plot total alone for presentation
plt2 = gg.ggplot(df_stacked[df_stacked['method']=='(Total-Expected Total)/Expected Total'], gg.aes(x='n_sub',y='value',color='percentile'))+gg.geom_line()+gg.xlab('N drives')+gg.ylab('Bias (%)')+gg.ylim(-100,100)+gg.scale_color_manual(values=colors)+gg.geom_hline(y=[-25,25],linetype="dashed",color="gray")+gg.geom_vline(x=[10,15],linetype="dashed",color="gray")+gg.ggtitle('Bias comparison {0}'.format(title))
t = gg.theme_bw()
t._rcParams['font.size']=16
plt2 = plt2+t
plt2.save(filename = r'..\charts\drivebias_laqn_{0}_total.png'.format(species), width=None, height=None, dpi=300)

#plot enhancement alone for presentation
plt3 = gg.ggplot(df_stacked[df_stacked['method']=='(Enhanc+Expected Backgr-Expected Total)/Expected Total'], gg.aes(x='n_sub',y='value',color='percentile'))+gg.geom_line()+gg.xlab('N drives')+gg.ylab('Bias (%)')+gg.ylim(-100,100)+gg.scale_color_manual(values=colors)+gg.geom_hline(y=[-25,25],linetype="dashed",color="gray")+gg.geom_vline(x=[10,15],linetype="dashed",color="gray")+gg.ggtitle('Bias comparison {0}'.format(title))
t = gg.theme_bw()
t._rcParams['font.size']=16
plt3 = plt3+t
plt3.save(filename = r'..\charts\drivebias_laqn_{0}_enhanc.png'.format(species), width=None, height=None, dpi=300)
 def test_ylab(self):
     p = gg.ggplot(gg.aes(x='mpg'),
                   gg.mtcars) + gg.geom_histogram() + gg.ylab("TEST")
     self.assertEqual(p.ylab, "TEST")
def quarterly_queries(keywords,
                      category,
                      cookies,
                      session,
                      domain,
                      throttle,
                      filing_date,
                      ggplot,
                      month_offset=[-12, 12],
                      trends_url=DEFAULT_TRENDS_URL):
    """Gets interest data (quarterly) for the 12 months before and 12 months after specified date, then gets interest data for the whole period and merges this data.

        month_offset: [no. months back, no. months forward] to query
    Returns daily data over the period.
    """

    aw_range = arrow.Arrow.range
    begin_period = aget(filing_date).replace(months=month_offset[0])
    ended_period = aget(filing_date).replace(months=month_offset[1])

    # Set up date ranges to iterate queries across
    start_range = aw_range('month', YYYY_MM(begin_period),
                           YYYY_MM(ended_period))
    ended_range = aw_range('month',
                           YYYY_MM(begin_period).replace(months=3),
                           YYYY_MM(ended_period).replace(months=3))

    start_range = [r.datetime for r in start_range][::3]
    ended_range = [r.datetime for r in ended_range][::3]

    # Fix last date if incomplete quarter (offset -1 week from today)
    last_week = arrow.utcnow().replace(weeks=-1).datetime
    start_range = [d for d in start_range if d < last_week]
    ended_range = [d for d in ended_range if d < last_week]
    if len(ended_range) < len(start_range):
        ended_range += [last_week]

    # Iterate attention queries through each quarter
    all_data = []
    missing_queries = []  # use this to scale IoT later.
    for start, end in zip(start_range, ended_range):
        if start > last_week:
            break

        print("Querying period: {s} ~ {e}".format(s=start.date(),
                                                  e=end.date()))
        throttle_rate(throttle)

        response_args = {
            'url': trends_url.format(domain=domain),
            'params': _query_parameters(start, end, keywords, category),
            'cookies': cookies,
            'session': session
        }

        query_data = _check_data(
            keywords, _process_response(_get_response(**response_args)))

        # from IPython import embed; embed()
        if query_data[1] == '':
            query_data = [[date, '0']
                          for date in arrow.Arrow.range('day', start, end)]
            missing_queries.append('missing')
        if all(int(vals) == 0 for date, vals in query_data):
            query_data = [[date, '0']
                          for date in arrow.Arrow.range('day', start, end)]
            missing_queries.append('missing')
        elif len(query_data[0][0]) > 10:
            missing_queries.append('weekly')
        else:
            missing_queries.append('daily')

        try:
            if not aligned_weekly(query_data, all_data):
                ## Workaround: shift filing date
                q1 = weekly_date(all_data[-1][-1][0])
                q2 = weekly_date(query_data[0][0])

                if q1 < q2:
                    start = arrow.get(start).replace(months=-1)
                    response_args['params'] = _query_parameters(
                        start, end, keywords, category)
                    ## Do a new 4month query, overlap/replace previous month.
                    query_data = _check_data(
                        keywords,
                        _process_response(_get_response(**response_args)))
                    if all_data[:-1] != []:
                        q2 = weekly_date(query_data[0][0], 'start')
                        all_data[-1] = [
                            d for d in all_data[-1] if q2 > weekly_date(d[0])
                        ]

                elif q1 >= q2:
                    # if q1 > 1st date in query_data, remove the first few entries
                    query_data = [
                        d for d in query_data if q1 < weekly_date(d[0])
                    ]

        except IndexError:
            pass
        except:
            from IPython import embed
            embed()

        finally:
            all_data.append(query_data)

    # Get overall long-term trend data across entire queried period
    s = begin_period.replace(weeks=-2).datetime
    e1 = arrow.get(ended_range[-1]).replace(months=+1).datetime
    e2 = arrow.utcnow().replace(weeks=-1).datetime
    e = min(e1, e2)
    print("\n=> Merging with overall period: {s} ~ {e}".format(s=s.date(),
                                                               e=e.date()))

    response_args = {
        'url': trends_url.format(domain=domain),
        'params': _query_parameters(s, e, keywords, category),
        'cookies': cookies,
        'session': session
    }

    query_data = _check_data(keywords,
                             _process_response(_get_response(**response_args)))

    if query_data[1] == '':
        adj_all_data = [[
            str(date.date()), int(zero)
        ] for date, zero in zip(*interpolate_ioi(*zip(*sum(all_data, []))))]

    elif len(query_data) > 1:
        # compute changes in IoI (interest over time) per quarter
        # and merged quarters together after interpolating data
        # with daily data.
        # We cannot mix quarters as Google normalizes each query
        all_ioi_delta = []
        qdat_interp = []
        for quarter_data in all_data:
            if quarter_data != []:
                quarter_data = [x for x in quarter_data if x[1] != '']
                all_ioi_delta += list(zip(*change_in_ioi(*zip(*quarter_data))))

                if ggplot:
                    qdat_interp += interpolate_ioi(*zip(*quarter_data))[1]
                    # for plotting only

        qdate = [date for date, delta_ioi in all_ioi_delta]
        delta_ioi = [delta_ioi for date, delta_ioi in all_ioi_delta]

        try:
            ydate = [
                date[-10:] if len(date) > 10 else date
                for date, ioi in query_data
            ]
            yIoI = [float(ioi) for date, ioi in query_data]
        except:
            from IPython import embed
            embed()
            yIoI = [float(ioi) for date, ioi in query_data[:-1]]
        ydate, yIoI = interpolate_ioi(ydate, yIoI)

        # match quarterly and yearly dates and get correct delta IoI
        # common_date = [x for x in ydate+qdate if x in ydate and x in qdate]
        common_date = sorted(set(ydate) & set(qdate))

        delta_ioi = [
            delta_ioi for date, delta_ioi in zip(qdate, delta_ioi)
            if date in common_date
        ]
        y_ioi = [y for x, y in zip(ydate, yIoI) if x in common_date]

        # calculate daily %change in IoI and adjust weekly values
        adj_IoI = [ioi * mult for ioi, mult in zip(y_ioi, delta_ioi)]

        adj_all_data = [[str(date.date()), round(ioi, 2)]
                        for date, ioi in zip(common_date, adj_IoI)]
    else:
        adj_all_data = [[
            str(date.date()), int(zero)
        ] for date, zero in zip(*interpolate_ioi(*zip(*sum(all_data, []))))]

    # from IPython import embed; embed()
    heading = ["Date", keywords[0].title]
    querycounts = list(zip((d.date() for d in start_range), missing_queries))
    keywords[0].querycounts = querycounts

    if not ggplot:
        return [heading] + adj_all_data

    ## GGplot Only
    else:
        # GGPLOT MERGED GTRENDS PLOTS:
        import pandas as pd
        from ggplot import ggplot, geom_line, ggtitle, ggsave, scale_colour_manual, ylab, xlab, aes
        try:
            ydat = pd.DataFrame(list(zip(common_date, y_ioi)),
                                columns=["Date", 'Weekly series'])
            mdat = pd.DataFrame(list(zip(common_date, adj_IoI)),
                                columns=['Date', 'Merged series'])
            qdat = pd.DataFrame(list(zip(common_date, qdat_interp)),
                                columns=['Date', 'Daily series'])
            ddat = ydat.merge(mdat, on='Date').merge(qdat, on='Date')
            ddat['Date'] = list(map(pd.to_datetime, ddat['Date']))

            ydat['Date'] = list(map(pd.to_datetime, ydat['Date']))
            mdat['Date'] = list(map(pd.to_datetime, mdat['Date']))
            qdat['Date'] = list(map(pd.to_datetime, qdat['Date']))
        except UnboundLocalError as e:
            raise (UnboundLocalError("No Interest-over-time to plot"))

        # meltkeys = ['Date','Weekly series','Merged series','Daily series']
        # melt = pd.melt(ddat[meltkeys], id_vars='Date')

        colors = [
            '#77bde0',  # blue
            '#b47bc6',  # purple
            '#d55f5f'  # red
        ]

        entity_type = keywords[0].desc

        g = ggplot(aes(x='Date', y='Daily series' ), data=ddat) + \
         geom_line(aes(x='Date', y='Daily series'), data=qdat, alpha=0.5, color=colors[0]) + \
         geom_line(aes(x='Date', y='Merged series'), data=mdat, alpha=0.9, color=colors[1]) + \
         geom_line(aes(x='Date', y='Weekly series'), data=ydat, alpha=0.5, color=colors[2], size=1.5) + \
         ggtitle("Interest over time for '{}' ({})".format(keywords[0].keyword, entity_type)) + \
         ylab("Interest Over Time") + xlab("Date")

        # from IPython import embed; embed()

        print(g)
        # ggsave(BASEDIR + "/iot_{}.png".format(keywords[0].keyword), width=15, height=5)
        return [heading] + adj_all_data
def quarterly_queries(keywords, category, cookies, session, domain, throttle, filing_date, ggplot, month_offset=[-12, 12], trends_url=DEFAULT_TRENDS_URL):
	"""Gets interest data (quarterly) for the 12 months before and 12 months after specified date, then gets interest data for the whole period and merges this data.

		month_offset: [no. months back, no. months forward] to query
	Returns daily data over the period.
	"""

	aw_range = arrow.Arrow.range
	begin_period = aget(filing_date).replace(months=month_offset[0])
	ended_period = aget(filing_date).replace(months=month_offset[1])

	# Set up date ranges to iterate queries across
	start_range = aw_range('month', YYYY_MM(begin_period),
									YYYY_MM(ended_period))
	ended_range = aw_range('month', YYYY_MM(begin_period).replace(months=3),
									YYYY_MM(ended_period).replace(months=3))

	start_range = [r.datetime for r in start_range][::3]
	ended_range = [r.datetime for r in ended_range][::3]

	# Fix last date if incomplete quarter (offset -1 week from today)
	last_week = arrow.utcnow().replace(weeks=-1).datetime
	start_range = [d for d in start_range if d < last_week]
	ended_range = [d for d in ended_range if d < last_week]
	if len(ended_range) < len(start_range):
		ended_range += [last_week]

	# Iterate attention queries through each quarter
	all_data = []
	missing_queries = []    # use this to scale IoT later.
	for start, end in zip(start_range, ended_range):
		if start > last_week:
			break

		print("Querying period: {s} ~ {e}".format(s=start.date(),
												  e=end.date()))
		throttle_rate(throttle)

		response_args = {'url': trends_url.format(domain=domain),
						'params': _query_parameters(start, end, keywords, category),
						'cookies': cookies,
						'session': session}

		query_data = _check_data(keywords,
						_process_response(
							_get_response(**response_args)))

		if all(int(vals)==0 for date,vals in query_data):
			query_data = [[date, '0'] for date in arrow.Arrow.range('day', start, end)]
			missing_queries.append('missing')
		elif len(query_data[0][0]) > 10:
			missing_queries.append('weekly')
		else:
			missing_queries.append('daily')

		try:
			if not aligned_weekly(query_data, all_data):
				## Workaround: shift filing date
				q1 = weekly_date(all_data[-1][-1][0])
				q2 = weekly_date(query_data[0][0])

				if q1 < q2:
					start = arrow.get(start).replace(months=-1)
					response_args['params'] = _query_parameters(start, end, keywords, category)
					## Do a new 4month query, overlap/replace previous month.
					query_data = _check_data(keywords,
									_process_response(
										_get_response(**response_args)))
					if all_data[:-1] != []:
						q2 = weekly_date(query_data[0][0], 'start')
						all_data[-1] = [d for d in all_data[-1] if q2 > weekly_date(d[0])]

				elif q1 >= q2:
					# if q1 > 1st date in query_data, remove the first few entries
					query_data = [d for d in query_data if q1 < weekly_date(d[0])]

		except IndexError:
			pass
		except:
			from IPython import embed; embed()

		finally:
			all_data.append(query_data)



	# Get overall long-term trend data across entire queried period
	s = begin_period.replace(weeks=-2).datetime
	e1 = arrow.get(ended_range[-1]).replace(months=+1).datetime
	e2 = arrow.utcnow().replace(weeks=-1).datetime
	e = min(e1,e2)
	print("\n=> Merging with overall period: {s} ~ {e}".format(s=s.date(), e=e.date()))

	response_args = {
		'url': trends_url.format(domain=domain),
		'params': _query_parameters(s, e, keywords, category),
		'cookies': cookies,
		'session': session
		}

	query_data = _check_data(keywords,
					_process_response(
						_get_response(**response_args)))



	if len(query_data) > 1:
		# compute changes in IoI (interest over time) per quarter
		# and merged quarters together after interpolating data
		# with daily data.
		# We cannot mix quarters as Google normalizes each query
		all_ioi_delta = []
		qdat_interp = []
		for quarter_data in all_data:
			if quarter_data != []:
				quarter_data = [x for x in quarter_data if x[1] != '']
				all_ioi_delta += list(zip(*change_in_ioi(*zip(*quarter_data))))

				if ggplot:
					qdat_interp += interpolate_ioi(*zip(*quarter_data))[1]
					# for plotting only

		qdate = [date for date, delta_ioi in all_ioi_delta]
		delta_ioi = [delta_ioi for date, delta_ioi in all_ioi_delta]
		ydate = [date[-10:] if len(date) > 10 else date for date, ioi in query_data]
		try:
			yIoI  = [float(ioi) for date, ioi in query_data]
		except:
			# from IPython import embed; embed()
			yIoI = [float(ioi) for date, ioi in query_data[:-1]]
		ydate, yIoI = interpolate_ioi(ydate, yIoI)

		# match quarterly and yearly dates and get correct delta IoI
		# common_date = [x for x in ydate+qdate if x in ydate and x in qdate]
		common_date = sorted(set(ydate) & set(qdate))

		delta_ioi = [delta_ioi for date,delta_ioi in zip(qdate, delta_ioi)
					if date in common_date]
		y_ioi = [y for x,y in zip(ydate, yIoI) if x in common_date]

		# calculate daily %change in IoI and adjust weekly values
		adj_IoI = [ioi*mult for ioi,mult in zip(y_ioi, delta_ioi)]

		adj_all_data = [[str(date.date()), round(ioi, 2)] for date,ioi in zip(common_date, adj_IoI)]
	else:
		adj_all_data = [[str(date.date()), int(zero)] for date, zero in zip(*interpolate_ioi(*zip(*sum(all_data,[]))))]

	# from IPython import embed; embed()
	heading = ["Date", keywords[0].title]
	querycounts = list(zip((d.date() for d in start_range), missing_queries))
	keywords[0].querycounts = querycounts

	if not ggplot:
		return [heading] + adj_all_data

	## GGplot Only
	else:
		# GGPLOT MERGED GTRENDS PLOTS:
		import pandas as pd
		from ggplot import ggplot, geom_line, ggtitle, ggsave, scale_colour_manual, ylab, xlab, aes
		try:
			ydat = pd.DataFrame(list(zip(common_date, y_ioi)), columns=["Date", 'Weekly series'])
			mdat = pd.DataFrame(list(zip(common_date, adj_IoI)), columns=['Date', 'Merged series'])
			qdat = pd.DataFrame(list(zip(common_date, qdat_interp)), columns=['Date', 'Daily series'])
			ddat = ydat.merge(mdat, on='Date').merge(qdat,on='Date')
			ddat['Date'] = list(map(pd.to_datetime, ddat['Date']))

			ydat['Date'] = list(map(pd.to_datetime, ydat['Date']))
			mdat['Date'] = list(map(pd.to_datetime, mdat['Date']))
			qdat['Date'] = list(map(pd.to_datetime, qdat['Date']))
		except UnboundLocalError as e:
			raise(UnboundLocalError("No Interest-over-time to plot"))

		# meltkeys = ['Date','Weekly series','Merged series','Daily series']
		# melt = pd.melt(ddat[meltkeys], id_vars='Date')

		colors = [
				'#77bde0', # blue
				'#b47bc6',   # purple
				'#d55f5f'    # red
				]

		entity_type = keywords[0].desc

		g = ggplot(aes(x='Date', y='Daily series' ), data=ddat) + \
			geom_line(aes(x='Date', y='Daily series'), data=qdat, alpha=0.5, color=colors[0]) + \
			geom_line(aes(x='Date', y='Merged series'), data=mdat, alpha=0.9, color=colors[1]) + \
			geom_line(aes(x='Date', y='Weekly series'), data=ydat, alpha=0.5, color=colors[2], size=1.5) + \
			ggtitle("Interest over time for '{}' ({})".format(keywords[0].keyword, entity_type)) + \
			ylab("Interest Over Time") + xlab("Date")

		# from IPython import embed; embed()

		print(g)
		# ggsave(BASEDIR + "/iot_{}.png".format(keywords[0].keyword), width=15, height=5)
		return [heading] + adj_all_data
import pandas as pd
import numpy as np
# from source import view_and_print_output
import ggplot as gg


df = pd.DataFrame()
for num_layers, num_nodes in [(2, 50), (2, 100), (2, 150), (2, 200), (4, 50), (4, 100), (4, 150), (4, 200)]:
    file_coarse = '../../data/coarse_lambda_dropout_' + str(num_layers) + '_' + str(num_nodes) + '.txt'
    newdata = pd.read_csv(file_coarse)
    newdata = newdata.sort_values(by='validation error', ascending=True)
    newdata['lambda'] = np.log10(newdata['lambda'])
    newdata['index'] = (np.arange(len(newdata), dtype='float')/len(newdata))**3
    newdata['config'] = str(num_layers * 100 + num_nodes) +  ' ' +  str(num_layers) + ' ' + str(num_nodes)
    df = df.append(newdata)
print(df.sort_values(by='validation error', ascending=False).head(20))
p = gg.ggplot(gg.aes(x='lambda', y='dropout prob', color='index'), data=df) + \
        gg.geom_point() + \
        gg.xlab('lambda') + \
        gg.ylab('dropout prob') + \
        gg.scale_x_continuous(limits=(-5, 2)) + \
        gg.facet_wrap('config')
print(p)

# Conclusion: ignore dropout
def plot_weather_data(df):  # older version
	df.DATEn = pd.to_datetime(df.DATEn)
	grouped = df.groupby('DATEn', as_index=False).sum()
	grouped.index.name = 'DATEn'
	
	p_title = 'Subway Ridership by Hour vs Raining'
	p_xlab = 'Hour of the Day'
	p_ylab = 'Subway Entries'
	plot = gp.ggplot(grouped, gp.aes(x='DATEn', y='EXITSn_hourly')) + gp.geom_line() + gp.ggtitle(p_title) + gp.xlab(p_xlab) + gp.ylab(p_ylab)
	return plot
def visualize_clusters(X, var, color = 'cluster'):
    '''
    Prints with ggplot a visualization of the different clusters.
    '''
    aux = pandas.DataFrame()
    
    aux['fecha'] = X.index
    aux.index = X.index
    
    aux[var] = X[var]
    aux['Cluster'] = X[color]
    
    return ggplot(aes(x='fecha', y=var, color='Cluster'), aux) + geom_point() + xlab(var) + ylab("Value") + ggtitle("Clustering of variable \"" + var + "\"") + theme(axis_text_x=element_text(color=[0, 0, 0, 0]))
def visualize_segmentation(X, var):
    '''
    Prints with ggplot a visualization of the different segments.
    '''
    aux = pandas.DataFrame(index = X.index)
    
    aux['fecha'] = X.index.values
    aux[var] = X[var]
    aux['Segmento'] = X['segmento'].astype(str)
    
    return ggplot(aes(x="fecha", y=var, color="Segmento"), aux) + geom_point() + xlab("Fecha") + ylab(var) + ggtitle("Segmentacion de la variable \"" + var + "\"") +  theme(axis_text_x  = element_text(color=[0,0,0,0]))
def plot_vol(dates, x, cp, my_domain):
    # -------------------- Prepare for Plotting -------------------------- #
    # Prepare DataFrame objects for graphing.
    # Add a column for the label to show in the legend in the graph.
    # Need to reshape it, from (124,) to (124,1) for example, so that it
    # will concatenate. This gives a df with [date, vol_data, 'Volume'].
    v = ['Volume' for i in range(x.shape[0])]
    #df_domain = np.concatenate((x, v), axis=1)
    ndf_vol = np.transpose(np.array([dates, x, v]))
    df_vol = pd.DataFrame(ndf_vol, columns=['Date', 'Volume', 'Data'])

    #Create pre-allocated lists for plotting means and cp
    xmin_list = [0 for i in range(len(cp))]  # holds left endpoint of vol_mean
    xmax_list = [0 for i in range(len(cp))]  # holds right endpoint of vol_mean
    yint_list = [0 for i in range(len(cp))]  # holds vol_means
    cp_date_list = [0 for i in range(len(cp))]  # holds date for cp
    cp_value_list = [0 for i in range(len(cp))]  # holds cp value

    ref_idx = 0  #used to keep track of vol_means
    #collect list data for plotting
    for i in range(len(cp)):
        cp_idx = cp[i][0] - 1  #-1 b/c 1-indexed (includes cp itself)
        xmin_list[i] = dates[ref_idx].toordinal()  #convert to match ggplot
        xmax_list[i] = dates[cp_idx].toordinal()  #convert to match ggplot
        yint_list[i] = cp[i][2]  #use value from_mean for vol_mean
        cp_date_list[i] = dates[cp_idx]  #date of cp
        #cp_value_list[i] = x[cp_idx] #value of cp
        cp_value_list[i] = cp[i][2]
        ref_idx = cp_idx + 1  #+1 b/c moving to next point

    #Reform lists into a data frame and attach to df_domains. The first two
    #lists can be created together since they are both numeric, but if I try
    #to create all three together all types will be downgraded to strings.
    #np.concatenate avoids this conversion. The transpose is needed to take
    #an item from each to form a single row.
    cp_lbl = ['Change Point' for i in range(len(yint_list))]

    #Need to create a dummy entry to put 'Volume Mean' into legend
    cp_date_list.append(dates[0])
    yint_list.append(x[0])
    cp_lbl.append('Volume Mean')
    ndf_cp = np.transpose(np.array([cp_date_list, yint_list, cp_lbl]))
    yint_list.pop(-1)
    cp_date_list.pop(-1)
    df_cp = pd.DataFrame(ndf_cp, columns=['Date', 'Volume', 'Data'])

    df_plot = pd.concat((df_vol, df_cp), axis=0)

    #Need to create a dummy entry to put 'Volume Mean' into legend
    #dummy = np.array([dates[0], x[0], 'Volume Mean']).reshape(1,-1)
    #df_cp = np.concatenate( (df_cp, dummy), axis=0) #add to bottom df_cp
    #df_domain = np.concatenate( (df_domain, df_cp), axis=0 ) #add df_domains

    #convert final array into a pd.DataFrame for printing and plotting
    #df_domain = pd.DataFrame(df_domain, columns=['Date','Volume','Data'])
    #df_domain.to_html(open('out.html','w'))
    #os.system('sudo cp out.html /usr/local/www/analytics/rwing')

    margin = 0.10 * (np.max(x) - np.min(x))
    p = ggplot.ggplot(ggplot.aes(x='Date', y='Volume', color='Data'), data=df_plot) + \
            ggplot.geom_line(color='blue', size=2) + \
            ggplot.geom_point(x=xmax_list, y=cp_value_list, color='black', \
                        shape='D', size=50) + \
            ggplot.geom_hline(xmin=xmin_list, \
                        xmax=xmax_list, \
                        yintercept=yint_list, color="red", size=3) + \
            ggplot.scale_x_date(labels=ggplot.date_format("%Y-%m-%d"), breaks="1 week") + \
            ggplot.scale_colour_manual(values = ["black", "blue", "red"]) + \
            ggplot.scale_y_continuous(labels='comma') + \
            ggplot.ylim(low=np.min(x)-margin/4.0, high=np.max(x)+margin) + \
            ggplot.xlab("Week (Marked on Mondays)") + \
            ggplot.ylab("Message Vol") + \
            ggplot.ggtitle("%s\nMessage Volume by Week" % my_domain) + \
            ggplot.theme_seaborn()

    return p
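
# A minimal usage sketch, assuming each entry of cp is
# (end position (1-indexed), unused, segment mean), as the indexing in the
# loop above implies; the data is invented.
import datetime
import numpy as np

dates_demo = [datetime.date(2015, 1, 5) + datetime.timedelta(weeks=i)
              for i in range(8)]
x_demo = np.array([10.0, 12.0, 11.0, 30.0, 32.0, 31.0, 33.0, 29.0])
cp_demo = [(3, None, 11.0), (8, None, 31.0)]  # hypothetical change points
print(plot_vol(dates_demo, x_demo, cp_demo, 'example.com'))
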
pd.value_counts(vehicles.trany)
vehicles["trany2"] = vehicles.trany.str[0]
pd.value_counts(vehicles.trany2)

#%% step 1 ~ 4 on Page 202
from ggplot import ggplot, aes, geom_point, xlab, ylab, ggtitle

grouped = vehicles.groupby("year")
averaged = grouped[['comb08', 'highway08', 'city08']].agg([np.mean])
averaged.columns = ['comb08_mean', 'highway08_mean', 'city08_mean']
averaged['year'] = averaged.index

print(ggplot(averaged, aes('year', 'comb08_mean')) +
      geom_point(color='steelblue') +
      xlab('Year') +
      ylab('Average MPG') +
      ggtitle('All cars'))

#%% step 5
criteria1 = vehicles.fuelType1.isin(['Regular Gasoline', 'Premium Gasoline', 'Midgrade Gasoline'])
criteria2 = vehicles.fuelType2.isnull()
criteria3 = vehicles.atvType != 'Hybrid'
vehicles_non_hybrid = vehicles[criteria1 & criteria2 & criteria3]
len(vehicles_non_hybrid)

#%% step 6
grouped = vehicles_non_hybrid.groupby(['year'])
averaged = grouped['comb08'].agg([np.mean])
print(averaged)

#%% step 7 ~ 9
    #Testing
    results = []
    for m in mvals:
        results.append(test_approx_nn(method = "hashing", traindata=docdata, testdata = testdata, m=m, alpha=1))
    for alpha in avals:
        results.append(test_approx_nn(method = "kdtree" , traindata=docdata, testdata = testdata, m=1, alpha=alpha))

    #save results to results folder, with plot and printing to screen.
    metadata = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "test_mode==" + str(test_mode)
    f = open("results/LSH_vs_KDT_%s.pkl" % metadata, mode = 'w')
    pkl.dump(obj=results, file=f)

    logtimes =  [math.log(r.avg_time, 2)     for r in results]
    distances = [r.avg_distance for r in results]
    methods =   [r.method[0:3]  for r in results]
    alpha =     [r.alpha  for r in results]
    m =         [r.m  for r in results]
    results_df = pd.DataFrame(data = {"logtimes" : logtimes,
                                      "distances" : distances,
                                      "methods" : methods,
                                      "m":m,
                                      "alpha": alpha})
    print(results_df)
    p = gg.ggplot(data = results_df, aesthetics = gg.aes(x = "logtimes",
                                                         y = "distances",
                                                         label = "methods")) + \
        gg.geom_text() + \
        gg.ggtitle("LSH and KD trees: tradeoffs") + \
        gg.xlab("Log2 average query time  ") + gg.ylab("Average L2 distance from query point)")
    gg.ggsave(filename="results/LSH_vs_KDT_%s.png" % metadata, plot = p)
import ggplot as gg
import ultrasignup as us
import numpy as np

d = us.event_results(299)

p1 = gg.ggplot(
    gg.aes(x='time_hour',fill='gender'),d[(d.distance=='50K')&(d.time_hour>1.0)]) + \
  gg.facet_grid(x='gender') + \
  gg.geom_bar(stat="bin",binwidth=.5,position="dodge",colour="black") + \
  gg.xlab("Time (hours)") + gg.ylab("Number of Finishers") + \
  gg.ggtitle("50K Finishing Times for All Years")

p2 = gg.ggplot(
    gg.aes(x='time_hour',fill='gender'),d[(d.distance=='11 Miler')&(d.time_hour>1.0)]) + \
  gg.facet_grid(x='gender') + \
  gg.geom_bar(stat="bin",binwidth=.5,position="dodge",colour="black") + \
  gg.xlab("Time (hours)") + gg.ylab("Number of Finishers") + \
  gg.ggtitle("11M Finishing Times for All Years")
    )
    group by pod_id_location
""")
qry_job = bqclient.query(qry_str, location='EU', job_config=job_config)
#save result as dataframe
df = qry_job.to_dataframe()
df_long = df.melt(id_vars=['pod_str', 'pod_idx'],
                  value_vars=['p05', 'p25', 'med', 'p75', 'p95'],
                  var_name='yparam',
                  value_name='value')
#plots
#plt1 = gg.ggplot(df, gg.aes(x='date_UTC',y='no2_ppb'))+gg.geom_line()+gg.xlab('Time')+gg.ylab('NO2 (ppb)')+gg.theme_bw()+gg.facet_wrap('pod_id_location',scales='free_y')
#plt1.save(filename = r'.\charts\ulezpodts.png', width=None, height=None, dpi=200)
plt2 = gg.ggplot(df_long, gg.aes(
    x='pod_str', y='value', color='yparam')) + gg.geom_point() + gg.xlab(
        'pod') + gg.ylab('NO2 (as % of median)') + gg.theme_bw() + gg.theme(
            figure_size=(12, 6)) + gg.scale_x_discrete()
plt2.save(filename=r'.\charts\ulezpodvar.png', width=10, height=6, dpi=200)

#repeat for mobile data using segid instead of podid where N = 10 and N = 40
#repeat for stationary data at mobile times
qry_str = ("""
    with cte0 as (
    --all data, ULEZ pods with 6000 hrs
    select date_UTC, a.pod_id_location, no2_ppb
    from AQMesh.NO2_scaled_hightimeres_ppb_20180901_20190630 a
    join AQMesh.NO2_site_metadata_v2_1_20180901_20190630 b
    on a.pod_id_location=b.pod_id_location
    where ULEZ = true and no2_ppb <> -999
    and a.pod_id_location in 
      --limit to pods with at least 6000 hours
cov = [(0.2**2, -0.064 / 2), (-0.064 / 2, 0.2**2)]
data = pd.DataFrame()

data['x'] = array(sorted(append([-3, 0, 3], linspace(-10, 10, 997))))  # for c)

# Generate analytical data via uncertainties
a_analytical = correlated_values((1, 1), cov)
data['Analytical Nom'] = unp.nominal_values(f(data['x'], a_analytical))
data['Analytical Std'] = unp.std_devs(f(data['x'], a_analytical))

# Monte Carlo
samples = 10000
a_mc = random.multivariate_normal((1, 1), cov, samples)
# a plot visualizing the 2d normal distribution
plot = gg.qplot(a_mc[:, 0], a_mc[:, 1]) + gg.xlab('a0') + gg.ylab('a1')
plot.save("fig/4b-a.pdf")


def std_dev_mc(x_array):
    return_value = []
    for x in x_array:
        values = [f(x, a) for a in a_mc]
        nominal = mean(values)
        std_d = std(values, ddof=1)
        return_value.append(ufloat(nominal, std_d))
    return return_value


noms_with_stds_mc = smd.parallel_slice(
    std_dev_mc, data['x'])  # std_dev_mc(list(data['x']))
#!/usr/bin/python
# coding=utf-8


### Source: http://nbviewer.ipython.org/gist/wrobstory/1eb8cb704a52d18b9ee8/Up%20and%20Down%20PyData%202014.ipynb

# import modules
import ggplot as gg
from ggplot import ggplot
import numpy as np
import pandas as pd

df = pd.read_csv('/Users/zhangbo/github/pydatasv2014/USGS_WindTurbine_201307_cleaned.csv')
min_heights = df[df['Rotor Diameter'] > 10]

(ggplot(gg.aes(x='Turbine MW', y='Rotor Swept Area'), data=min_heights[:500])
    + gg.geom_point(color='#75b5aa', size=75)
    + gg.ggtitle("Rotor Swept Area vs. Power")
    + gg.xlab("Power (MW)")
    + gg.ylab("Rotor Swept Area (m^2)"))
def firms_dynamics_plot(decision):
    data = pd.read_csv(os.path.join(
        parameters.OUTPUT_PATH,
        "temp_general_firms_pop_%s_decision_%s_time_%s.txt" %
        (parameters.pop_redutor, decision, parameters.final_Time)),
                       sep=",",
                       header=None,
                       decimal=",").astype(float)
    # rename the columns
    data.columns = [
        'time', 'total_firms', 'average_output', 'average_age', 'average_size',
        'new_firms', 'exit_firms', 'max_size', 'total_effort', 'average_effort'
    ]

    # logical test to drop the initial burn-in period from the plots
    if parameters.time_to_cut_plots > 0:
        data = data.loc[(
            data['time']).astype(int) >= parameters.time_to_cut_plots, :]

    # variable to add in the plot title
    title_pop_val = float(parameters.pop_redutor) * 100

    # create a list of a years to plot
    list_of_years_division = list(
        range(int(data['time'].min()), int(data['time'].max()),
              12)) + [data['time'].max() + 1]
    list_of_years = [int(i / 12) for i in list_of_years_division]

    # graph parameter variables
    dpi_var_plot = 700
    width_var_plot = 15
    height_var_plot = 10

    ###############################################################################################################
    # plotting AGENTS UTILITY
    # Total firms
    plot_data = gg.ggplot(data, gg.aes('time', 'total_firms')) + gg.geom_line() + gg.scale_y_continuous(breaks=11) + \
                gg.scale_x_discrete(breaks=list_of_years_division, labels=list_of_years) +\
                gg.ggtitle('Total firms') + gg.xlab('Years') + gg.ylab('Total of Firms')+ gg.theme_bw()

    # if the plot file already exists, delete it before saving the new one
    if os.path.isfile(
            os.path.join(
                parameters.OUTPUT_PATH,
                ('temp_general_total_firms_%s_%s_%s.png' %
                 (decision, title_pop_val, parameters.final_Time)))) is True:
        os.remove(
            os.path.join(parameters.OUTPUT_PATH,
                         ('temp_general_total_firms_%s_%s_%s.png' %
                          (decision, title_pop_val, parameters.final_Time))))
    # saving the plot
    gg.ggsave(plot_data,
              os.path.join(parameters.OUTPUT_PATH,
                           ('temp_general_total_firms_%s_%s_%s.png' %
                            (decision, title_pop_val, parameters.final_Time))),
              width=width_var_plot,
              height=height_var_plot,
              units="in")

    # Average of output
    plot_data = gg.ggplot(data, gg.aes('time', 'average_output')) + gg.geom_line() + gg.scale_y_continuous(breaks=11) + \
                gg.scale_x_discrete(breaks=list_of_years_division, labels=list_of_years)\
                +gg.ggtitle('Average of output') + gg.xlab('Years') + gg.ylab('Units')+ gg.theme_bw()

    # if the plot file already exists, delete it before saving the new one
    if os.path.isfile(
            os.path.join(
                parameters.OUTPUT_PATH,
                ('temp_general_average_output_%s_%s_%s.png' %
                 (decision, title_pop_val, parameters.final_Time)))) is True:
        os.remove(
            os.path.join(parameters.OUTPUT_PATH,
                         ('temp_general_average_output_%s_%s_%s.png' %
                          (decision, title_pop_val, parameters.final_Time))))
    # saving the plot
    gg.ggsave(plot_data,
              os.path.join(parameters.OUTPUT_PATH,
                           ('temp_general_average_output_%s_%s_%s.png' %
                            (decision, title_pop_val, parameters.final_Time))),
              width=width_var_plot,
              height=height_var_plot,
              units="in")

    # Average of age
    plot_data = gg.ggplot(data, gg.aes('time', 'average_age')) + gg.geom_line() + gg.scale_y_continuous(breaks=11) + \
                gg.scale_x_discrete(breaks=list_of_years_division, labels=list_of_years)\
                +gg.ggtitle('Average of age of firms') + gg.xlab('Years') + gg.ylab('Age of Firms')+ gg.theme_bw()

    # if the plot file already exists, delete it before saving the new one
    if os.path.isfile(
            os.path.join(
                parameters.OUTPUT_PATH,
                ('temp_general_average_age_%s_%s_%s.png' %
                 (decision, title_pop_val, parameters.final_Time)))) is True:
        os.remove(
            os.path.join(parameters.OUTPUT_PATH,
                         ('temp_general_average_age_%s_%s_%s.png' %
                          (decision, title_pop_val, parameters.final_Time))))
    # saving the plot
    gg.ggsave(plot_data,
              os.path.join(parameters.OUTPUT_PATH,
                           ('temp_general_average_age_%s_%s_%s.png' %
                            (decision, title_pop_val, parameters.final_Time))),
              width=width_var_plot,
              height=height_var_plot,
              units="in")

    # Average of size
    plot_data = gg.ggplot(data, gg.aes('time', 'average_size')) + gg.geom_line() + gg.scale_y_continuous(breaks=11) + \
                gg.scale_x_discrete(breaks=list_of_years_division, labels=list_of_years)\
                +gg.ggtitle('Average of size of firms') + gg.xlab('Years') + gg.ylab('Units')+ gg.theme_bw()

    # if the plot file already exists, delete it before saving the new one
    if os.path.isfile(
            os.path.join(
                parameters.OUTPUT_PATH,
                ('temp_general_average_size_%s_%s_%s.png' %
                 (decision, title_pop_val, parameters.final_Time)))) is True:
        os.remove(
            os.path.join(parameters.OUTPUT_PATH,
                         ('temp_general_average_size_%s_%s_%s.png' %
                          (decision, title_pop_val, parameters.final_Time))))
    # saving the plot
    gg.ggsave(plot_data,
              os.path.join(parameters.OUTPUT_PATH,
                           ('temp_general_average_size_%s_%s_%s.png' %
                            (decision, title_pop_val, parameters.final_Time))),
              width=width_var_plot,
              height=height_var_plot,
              units="in")

    # number of new firms
    plot_data = gg.ggplot(data, gg.aes('time', 'new_firms')) + gg.geom_line() + gg.scale_y_continuous(breaks=11) + \
                gg.scale_x_discrete(breaks=list_of_years_division, labels=list_of_years)\
                +gg.ggtitle('Number of new firms') + gg.xlab('Years') + gg.ylab('Units')+ gg.theme_bw()

    # if the plot file already exists, delete it before saving the new one
    if os.path.isfile(
            os.path.join(
                parameters.OUTPUT_PATH,
                ('temp_general_number_of_new_firms_%s_%s_%s.png' %
                 (decision, title_pop_val, parameters.final_Time)))) is True:
        os.remove(
            os.path.join(parameters.OUTPUT_PATH,
                         ('temp_general_number_of_new_firms_%s_%s_%s.png' %
                          (decision, title_pop_val, parameters.final_Time))))
    # saving the plot
    gg.ggsave(plot_data,
              os.path.join(parameters.OUTPUT_PATH,
                           ('temp_general_number_of_new_firms_%s_%s_%s.png' %
                            (decision, title_pop_val, parameters.final_Time))),
              width=width_var_plot,
              height=height_var_plot,
              units="in")

    # Number of exiting firms
    plot_data = gg.ggplot(data, gg.aes('time', 'exit_firms')) + gg.geom_line() + \
                gg.scale_y_continuous(breaks=11) + \
                gg.scale_x_discrete(breaks=list_of_years_division, labels=list_of_years) + \
                gg.ggtitle('Number of exiting firms') + gg.xlab('Years') + gg.ylab('Units') + gg.theme_bw()

    # if the plot file already exists, delete it before saving the new one
    if os.path.isfile(
            os.path.join(
                parameters.OUTPUT_PATH,
                ('temp_general_number_of_firms_out_%s_%s_%s.png' %
                 (decision, title_pop_val, parameters.final_Time)))):
        os.remove(
            os.path.join(parameters.OUTPUT_PATH,
                         ('temp_general_number_of_firms_out_%s_%s_%s.png' %
                          (decision, title_pop_val, parameters.final_Time))))
    # saving the plot
    gg.ggsave(plot_data,
              os.path.join(parameters.OUTPUT_PATH,
                           ('temp_general_number_of_firms_out_%s_%s_%s.png' %
                            (decision, title_pop_val, parameters.final_Time))),
              width=width_var_plot,
              height=height_var_plot,
              units="in")

    # Average and total effort of employees
    dat_merged = data[['average_effort', 'total_effort']]

    plot_data = dat_merged.plot(
        title='Average and total effort of employees')
    plot_data.set_xlabel('Years')
    plot_data.set_ylabel('Units of effort')
    plot_data.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    plot_data.set_xticks(list_of_years_division)
    plot_data.set_xticklabels(list_of_years)
    plot_data.set_facecolor('w')  # set_axis_bgcolor was removed in matplotlib 2.2
    fig = plot_data.get_figure()
    fig.set_size_inches(width_var_plot, height_var_plot)

    # if the plot file already exists, delete it before saving the new one
    if os.path.isfile(
            os.path.join(
                parameters.OUTPUT_PATH,
                ('temp_average_and_maximum_effort_of_firms_out_%s_%s_%s.png' %
                 (decision, title_pop_val, parameters.final_Time)))):
        os.remove(
            os.path.join(
                parameters.OUTPUT_PATH,
                ('temp_average_and_maximum_effort_of_firms_out_%s_%s_%s.png' %
                 (decision, title_pop_val, parameters.final_Time))))
    # saving the plot
    fig.savefig(os.path.join(
        parameters.OUTPUT_PATH,
        ('temp_average_and_maximum_effort_of_firms_out_%s_%s_%s.png' %
         (decision, title_pop_val, parameters.final_Time))),
                dpi=dpi_var_plot)

    dat_merged = data[['average_size', 'max_size']]

    plot_data = dat_merged.plot(title='Average and maximum size of firms')
    plot_data.set_xlabel('Years')
    plot_data.set_ylabel('Number of employees')
    plot_data.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    plot_data.set_xticks(list_of_years_division)
    plot_data.set_xticklabels(list_of_years)
    plot_data.set_facecolor('w')  # set_axis_bgcolor was removed in matplotlib 2.2
    fig = plot_data.get_figure()
    fig.set_size_inches(width_var_plot, height_var_plot)

    # if the plot file already exists, delete it before saving the new one
    if os.path.isfile(
            os.path.join(
                parameters.OUTPUT_PATH,
                ('temp_average_size_and_maximum_of_firms_out_%s_%s_%s.png' %
                 (decision, title_pop_val, parameters.final_Time)))):
        os.remove(
            os.path.join(
                parameters.OUTPUT_PATH,
                ('temp_average_size_and_maximum_of_firms_out_%s_%s_%s.png' %
                 (decision, title_pop_val, parameters.final_Time))))
    # saving the plot
    fig.savefig(os.path.join(
        parameters.OUTPUT_PATH,
        ('temp_average_size_and_maximum_of_firms_out_%s_%s_%s.png' %
         (decision, title_pop_val, parameters.final_Time))),
                dpi=dpi_var_plot)
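
    # The matplotlib figures above repeat the same delete-then-save dance; an
    # analogous helper (again a sketch, not in the original) would mirror
    # _save_gg_plot:
    def _save_mpl_fig(fig, filename):
        path = os.path.join(parameters.OUTPUT_PATH, filename)
        if os.path.isfile(path):
            os.remove(path)
        fig.savefig(path, dpi=dpi_var_plot)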
Exemple #39
# Assumed setup (not shown in this fragment): count_vect is a scikit-learn
# CountVectorizer and subjects_train a pandas Series of subject lines.
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
kk = count_vect.fit_transform(subjects_train)

analyze = count_vect.build_analyzer()

subjects_words_count = subjects_train.apply(lambda x: len(analyze(x)))

print(subjects_words_count.describe())

#%%
import ggplot as gg

df = pd.DataFrame({"count": subjects_words_count})

hist =  gg.ggplot(df, gg.aes(x = "count"))
hist += gg.xlab("# of words") +\
        gg.ylab("Frequency") +\
        gg.ggtitle("Frequency of words")

hist += gg.geom_vline(xintercept=df["count"].mean(), color="red")
hist += gg.geom_vline(xintercept=df["count"].median(), color="blue")
hist += gg.geom_density(color="green")
hist += gg.geom_histogram(binwidth=1, color="grey")

hist

#%%

# 1st attempt to classify subjects per tag

X_raw_train = subjects_train
X_raw_test = subjects_test
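
# A minimal sketch (not in the original snippet) of how this first attempt
# could continue, reusing CountVectorizer from above; y_train and y_test are
# hypothetical tag labels that the fragment never defines.
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

clf = make_pipeline(CountVectorizer(), MultinomialNB())
clf.fit(X_raw_train, y_train)          # y_train: hypothetical tag labels
print(clf.score(X_raw_test, y_test))   # y_test: hypothetical held-out labels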
Exemple #40
    def density(self, inp1, inp2, inp3):
        return gg.ggplot(self.data, gg.aes(x=inp1, color=inp2, fill=inp2)) + \
            gg.geom_density(alpha=0.5, size=5) + \
            gg.facet_grid(inp3) + \
            gg.ggtitle('Density of Fare by Sex and Survival Status') + \
            gg.ylab('Survival Status')
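
# Hypothetical usage (the owning class is not shown in this fragment):
# assuming self.data is a Titanic-style DataFrame with 'Fare', 'Sex' and
# 'Survived' columns, something like
#     plotter.density('Fare', 'Sex', 'Survived')
# would draw fare densities coloured by sex, faceted by survival status.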
Exemple #41
# -*- coding: utf-8 -*-
import pandas as pd
from ggplot import ggplot, aes, geom_point, geom_line, ggtitle, xlab, ylab

# Placeholder frame: the original snippet leaves `data` empty; real use would
# load home-run counts per year (columns 'yearID' and 'HR').
data = pd.DataFrame({'yearID': [], 'HR': []})

print(ggplot(
    data,
    aes(x='yearID', y='HR')) +
    geom_point(color='red') +
    geom_line(color='red') +
    ggtitle('Number of HR by year') +
    xlab('Year') +
    ylab('Number of HR'))
Exemple #42
turnstile_rain = turnstile_weather[[
    "rain", "ENTRIESn_hourly", "EXITSn_hourly"
]].copy()
turnstile_rain["ENTRIESn_hourly_log10"] = np.log10(
    turnstile_rain["ENTRIESn_hourly"] + 1)
turnstile_rain["rain2"] = np.where(turnstile_rain["rain"] == 1, "raining",
                                   "not raining")
turnstile_rain.groupby("rain2").describe()
set1 = brewer2mpl.get_map('Set1', 'qualitative', 3).mpl_colors
plot = gg.ggplot(turnstile_rain, gg.aes(x="ENTRIESn_hourly_log10", color="rain2")) + \
       gg.geom_density() + \
       gg.facet_wrap("rain2", scales="fixed") + \
       gg.scale_colour_manual(values=set1) + \
       gg.xlab("log10(entries per hour)") + \
       gg.ylab("Number of turnstiles") + \
       gg.ggtitle("Entries per hour whilst raining and not raining")
plot

np.random.seed(42)
data = pd.Series(np.random.normal(loc=180, scale=40, size=600))
data.hist()

p = turnstile_weather["ENTRIESn_hourly"].hist()
pylab.suptitle("Entries per hour across all stations")
pylab.xlabel("Entries per hour")
pylab.ylabel("Number of occurrences")

turnstile_weather["grp"] = turnstile_weather["rain"] + turnstile_weather["fog"]
plot = ggplot(aes(y='ENTRIESn_hourly', x='Hour'),
              data=turnstile_weather) + geom_histogram() + xlab("Hour") + ylab(
                  "Entries per hour")  # ylab argument inferred; it was truncated in the source
Exemple #43
# Tail of a PySpark helper whose header was lost at the splice above; the
# signature is reconstructed from the call site below, and Pham2004Row /
# rowContents are assumptions inferred from how they are used (requires an
# active SparkContext `sc`).
from pyspark.sql import Row

Pham2004Row = Row("K", "fK")  # assumed row type, inferred from aes(x="K", y="fK")


def makePham2004DF(ks, fKs):
    rowContents = zip(ks, fKs)  # assumed pairing of each K with its f(K)
    return sc.parallelize(
        [Pham2004Row(k, float(fK))
         for (k, fK) in rowContents]).toDF()


# In[42]:

pham2004DF = makePham2004DF(ks, fKs)
pham2004DF.show()


# We plot $f(K)$ against $K$.

# In[43]:

gg.ggplot(gg.aes(x="K", y="fK"), data=pham2004DF.toPandas()) + gg.geom_line() + gg.ylab("f(K)")


# We can see that the "elbow" is at $K = 3$, but the table shows that the difference in $f(K)$ between $K = 3$ and $K = 6$ is also substantial. The same can be seen in the plot of the values with $K >= 3$:

# In[44]:

pham2004DF.where(pham2004DF["K"] > 2).show()


# In[45]:

gg.ggplot(gg.aes(x="K", y="fK"), data=pham2004DF.where(pham2004DF["K"] > 2).toPandas()) + gg.geom_line() + gg.ylab("f(K)")


# Let's pick more values of $K$ in the interval $[3; 9]$.
Exemple #44
import sys
import datetime as dt

import ggplot as gg
import pandas as pd
from pandas.plotting import register_matplotlib_converters

register_matplotlib_converters()

species = 'no2'
df = pd.read_csv(r'.\charts\background_data_melted.csv',
                 index_col='idx',
                 dtype={
                     'timestamp': 'str',
                     'vidperiod': 'str',
                     'type': 'str',
                     'param': 'str',
                     'value': 'float64'
                 })
print(df[:10])
df['timestamp'] = pd.to_datetime(df['timestamp'], format="%Y-%m-%d %H:%M:%S")
#plots
plt1 = gg.ggplot(df, gg.aes(
    x='timestamp', y='value', color='type')) + gg.geom_line() + gg.xlab(
        'Time') + gg.ylab('Concentration') + gg.theme_bw() + gg.ylim(
            0, 100) + gg.facet_wrap('vidperiod', scales='free') + gg.ggtitle(
                'Regional background comparison {0}'.format(species))
#+gg.theme(axis_text_x=gg.element_text(angle=20))
plt1.save(filename=r'.\charts\background_{0}_ggtest_{1}.png'.format(
    species,
    dt.datetime.today().strftime('%Y%b%d')),
          width=None,
          height=None,
          dpi=300)
Exemple #45
        if (reward == 1):
            wins_for_player_1[i] += 1.0
        elif (reward == 0.5):
            draw_for_players[i] += 1.0

    print(i, wins_for_player_1[i], draw_for_players[i])
    data.append({
        'Type': 0,
        'Wins': wins_for_player_1[i],
        'Training': training_steps * (i - 1)
    })
    data.append({
        'Type': 1,
        'Wins': draw_for_players[i],
        'Training': training_steps * (i - 1)
    })
    learnitMC(training_steps, epsilon, alpha, n)
#   learnit(training_steps, epsilon, alpha) # the original learning code.

# Pandas gives you the power of R
learningdf = pd.DataFrame(data)
# I use ggplot when I generate figures in R and would like to use it with Python, HOWEVER:
# latest Pandas causes problems for ggplot so I needed these two patches:
# https://stackoverflow.com/questions/50591982/importerror-cannot-import-name-timestamp/52378663
# https://github.com/yhat/ggpy/issues/612
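# Roughly, both fixes edit ggplot's own modules so that Timestamp comes from
# the modern pandas namespace (pd.Timestamp) rather than the removed
# pandas.lib / pd.tslib locations; see the two links above for the exact diffs.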
p = gg.ggplot(gg.aes(x='Training', y='Wins', group='Type'), data=learningdf)+ gg.xlab('Learning games') + \
    gg.ylab('Wins for player 1') + gg.ggtitle("n="+str(n)) + gg.geom_point() + gg.stat_smooth(method='loess')
p.make()
filename = "experiment_" + str(n) + ".pdf"
p.save(filename)
Exemple #46
        site_list)]  # filter to ULEZ sites only, if applicable
    df_along = df_a.melt(id_vars=['site_str', 'n_passes'],
                         value_vars=['p05', 'p25', 'p50', 'p75', 'p95'],
                         var_name='yparam',
                         value_name='value')
    print(c['name'])
    #print(df_a)

    #plots
    #split percentiles into different charts, all sites
    #plt1 = gg.ggplot(df_along, gg.aes(x='n_passes',y='value',color='site_str'))+gg.geom_point()+gg.xlab('N drives')+gg.ylab('Bias (%)')+gg.theme_bw()+gg.xlim(0,100)+gg.facet_wrap('yparam',scales='free_y')
    #plt1.save(filename = r'..\charts\bias_{0}.png'.format(c['name']), width=None, height=None, dpi=200)
    #n_segments
    plt2 = gg.ggplot(
        df_a, gg.aes(x='n_passes', y='n_segments', color='site_str')
    ) + gg.geom_line() + gg.xlab('n, number drive periods') + gg.ylab(
        'Sample size (number of drive patterns)') + gg.theme_bw() + gg.xlim(
            0, 35) + gg.ylim(0, 2000)
    plt2.save(filename=r'..\charts\n_segments_{0}_{1}.png'.format(
        c['name'], dtstamp),
              width=None,
              height=None,
              dpi=200)
    #combine percentiles, split sites
    plt3 = gg.ggplot(
        df_along, gg.aes(x='n_passes', y='value', color='yparam')
    ) + gg.geom_line() + gg.xlab('n, number of drive periods') + gg.ylab(
        'Sample error (%)') + gg.theme_bw() + gg.xlim(0, 35) + gg.ylim(
            -100, 100) + gg.geom_hline(
                y=25, linetype="dashed", color="gray") + gg.geom_hline(
                    y=-25, linetype="dashed", color="gray") + gg.geom_vline(
                        x=[10, 15], linetype="dashed",