Exemple #1
0
def t_sne_visualize(latent_vectors, labels, epoch):
    """Embed latent vectors in 2-D with t-SNE and save a scatter plot.

    At most ``nsne`` (1000) randomly chosen rows are embedded.  The chart is
    written below ``<args.dataset>tsne-vae/2d-vec-miss<args.remove_label>/``
    as ``tsne<epoch>.png``; ``args`` is read from the enclosing module.

    :param latent_vectors: object exposing ``.shape`` and ``.data.numpy()``
        (presumably a 2-D torch tensor, one row per sample -- TODO confirm).
    :param labels: one label per row; stringified so the colour scale is
        categorical.
    :param epoch: value embedded in the output file name.
    :return: None
    """
    print(latent_vectors.shape)
    # NOTE(review): the /255 scaling looks inherited from pixel data; kept
    # unchanged because it affects the embedding.
    X_sample = latent_vectors.data.numpy() / 255
    feat_cols = ['pixel' + str(i) for i in range(X_sample.shape[1])]
    nsne = 1000
    df = pd.DataFrame(X_sample, columns=feat_cols)
    df['label'] = labels
    df['label'] = df['label'].apply(lambda i: str(i))

    # The original concatenated the permutation with an empty range(n, n),
    # which only turned the integer row labels into floats.  A plain
    # permutation selects the same rows with proper integer labels.
    rndperm = np.random.permutation(df.shape[0])
    sample_idx = rndperm[:nsne]

    tsne = TSNE(n_components=2, verbose=1, perplexity=30)
    print('INITIALIZED')
    tsne_results = tsne.fit_transform(df.loc[sample_idx, feat_cols].values)
    print('AFTER FITTING')
    df_tsne = df.loc[sample_idx, :].copy()
    df_tsne['x-tsne'] = tsne_results[:, 0]
    df_tsne['y-tsne'] = tsne_results[:, 1]

    chart = ggplot(df_tsne, aes(x='x-tsne', y='y-tsne', color='label')) \
        + geom_point(size=70, alpha=0.7) \
        + ggtitle("tSNE dimensions colored by digit")
    chart.save(
        str(args.dataset) + "tsne-vae/2d-vec-miss" + str(args.remove_label) +
        "/tsne" + str(epoch) + ".png")

    return
Exemple #2
0
def graph3(score_data):
    """Build graph 3: a box plot of question scores per date.

    ``score_data`` is a sequence whose first element is the header row and
    whose remaining elements are data rows.  The date column is located with
    ``find_time_stamp``; every ``int64`` column is treated as a question.

    Returns the ggplot box plot object.
    """
    header = score_data[0]
    date_column = header[find_time_stamp(score_data)]
    data = DataFrame(score_data[1:], columns=header)

    # Integer-typed columns are the numerical questions.
    num_questions = data.select_dtypes(include=['int64']).columns.values

    # Long format: one row per (date, question) pair.  "Name" is only needed
    # as a melt identifier and is dropped straight away.
    melt_options = dict(id_vars=[date_column, "Name"],
                        value_vars=num_questions,
                        var_name="Question",
                        value_name="Score")
    new_data = pd.melt(data, **melt_options).drop('Name', axis=1)

    # Parse the date strings into real datetimes.
    parsed_dates = pd.to_datetime(new_data[date_column], format="%m/%d/%Y")
    new_data[date_column] = parsed_dates

    # Assemble the box plot layer by layer.
    box_plot = ggplot.ggplot(ggplot.aes(x=date_column, y='Score'), new_data)
    box_plot = box_plot + ggplot.geom_boxplot()
    box_plot = box_plot + ggplot.ggtitle(
        "Distribution of Question Scores over Time")
    return box_plot
def plot_update_frequency(result):
    """Plot the weekly number of changes from an aggregation result.

    :param result: iterable of records; each record provides
        ``res['_id']['timestamp']`` and ``res['count']``.
    :return: ggplot object (points plus a smoother) of changes per week.
    """
    import pandas as pd
    import ggplot

    # Turn the query results into a time series of changes.
    dates = []
    counts = []
    for res in result:
        # Timestamp.to_datetime() was removed from pandas; to_pydatetime()
        # is the supported equivalent.
        dates.append(pd.Timestamp(res['_id']['timestamp']).to_pydatetime())
        counts.append(res['count'])

    ts = pd.DataFrame(counts, index=dates, columns=['changes'])
    # resample(..., how='sum') is no longer accepted; aggregate on the
    # resampler instead.
    ts = ts.resample('W').sum()
    ts.index.names = ['date']

    # Plot the weekly time series.
    # NOTE(review): aes receives the index and column objects rather than
    # column names, as in the original -- confirm ggplot accepts this.
    p = ggplot.ggplot(ts, ggplot.aes(x=ts.index, y=ts['changes'])) +\
        ggplot.geom_point(color='blue') +\
        ggplot.xlab('Period') +\
        ggplot.ylab('Changes') +\
        ggplot.geom_smooth() +\
        ggplot.ylim(low=0) +\
        ggplot.scale_x_date(breaks=ggplot.date_breaks("12 months"),
                            labels=ggplot.date_format('%Y-%m')) +\
        ggplot.ggtitle('OpenStreetMaps Denver-Boulder\nChanges per Week')
    return p
Exemple #4
0
def plot_trend_season(dates, ndf_domain, x, x_trend, season, my_domain):
    """Plot observed volume, overall trend and seasonal component as facets.

    :param dates: values assigned to the ``Date`` column.
    :param ndf_domain: two-column data (date, volume) holding the observed,
        trend and seasonal series stacked one after another (presumably in
        that order, matching the label column built here -- TODO confirm).
    :param x: observed series; only its length is used.
    :param x_trend: trend series; only its length is used.
    :param season: seasonal series; only its length is used.
    :param my_domain: domain name interpolated into the chart title.
    :return: ggplot object with one facet per series.
    """
    # ---------------------- Prepare Data Frame ----------------------- #
    df_domain = pd.DataFrame(ndf_domain, columns=['Date', 'Volume'])
    df_domain['Date'] = dates

    # One label per row.  List multiplication replaces the Python-2-only
    # xrange comprehensions and works on both Python 2 and 3.
    series_labels = (['Observed Volume'] * len(x)
                     + ['Overall Trend'] * len(x_trend)
                     + ['Repeat Sending Trend'] * len(season))
    col3 = pd.DataFrame(series_labels)

    df_plot = pd.concat((df_domain, col3), axis=1)
    df_plot.columns = ['Date', 'Volume', 'Data']

    # ---------------------- Plot Decomposition ----------------------- #
    p = ggplot.ggplot(aes(x='Date', y='Volume', color='Data'), data=df_plot) + \
        ggplot.geom_line(color='blue', size=2) + \
        ggplot.scale_x_date(labels=date_format("%Y-%m-%d"), breaks="1 week") + \
        ggplot.xlab("Week (Marked on Mondays)") + \
        ggplot.ylab("Message Vol") + \
        ggplot.ggtitle("%s Message Volume by Week" % my_domain) + \
        ggplot.facet_grid('Data', scales='free_y') + \
        ggplot.theme_seaborn()

    return p
def plot_line(X,y,title=None,labelx=None,labely=None,save=False, colors=None):
    '''
    Build a plot with one series per column of ``y`` against ``X``.

    X      - x values; also consulted via ``X.columns`` when choosing the
             geom, so it must expose ``.columns``.
    y      - table whose columns are the series (accessed with ``.shape``
             and ``.iloc``).
    title  - chart title; also used to derive the file name when saving.
    labelx - x axis label.
    labely - y axis label.
    save   - when true, write the plot to a .pdf file and return None;
             otherwise return the plot object.
    colors - colours, one per series; defaults to the matplotlib base and
             CSS4 colour names.
    '''
    df = pandas.DataFrame()

    df['X'] = X
    for i in range(y.shape[1]):
        df[str(i)] = y.iloc[:, i].values

    if colors is None:
        colors = list(dict(mcolors.BASE_COLORS, **mcolors.CSS4_COLORS).keys())

    # NOTE(review): the last row is dropped on purpose in the original;
    # the reason is not visible here.
    df = df.iloc[0:df.shape[0] - 1, :]
    p = ggplot(df, aes(x='X'))

    for i in range(y.shape[1]):
        # NOTE(review): this compares the whole ``colors`` value against the
        # column names of X; kept exactly as in the original.
        if colors not in X.columns.values:
            p = p + geom_line(aes(y=str(i), color=colors[i]))
        else:
            p = p + geom_point(aes(y=str(i), color=colors))

    p = p + xlab(labelx) + ylab(labely) + ggtitle(title)

    if save:
        # The original left the file name unbound when save=True and no
        # title was given; fall back to a generic stem instead of failing.
        if title is not None:
            stem = title.replace(" ", "").replace(".", "-")
        else:
            stem = "plot"
        p.save(stem + ".pdf")
        return None
    return p
Exemple #6
0
    def density_plot(by='dpsi_zscore', categorical=True):
        """Return a kernel-density plot of ``by`` across variant groups.

        Reads ``variants``, ``aaa_changes`` and ``prepare_data_frame`` from
        the enclosing scope.

        :param by: key looked up on every variant record.
        :param categorical: when true, compare the 'increase', 'decrease'
            and 'constant' groups; otherwise build one group per entry of
            ``aaa_changes`` that has more than one matching variant.
        :return: ggplot density plot.
        """
        if categorical:
            data_dict = {
                'muts increasing AAA':
                np.array([x[by] for x in variants['increase']]),
                'muts decreasing AAA':
                np.array([x[by] for x in variants['decrease']]),
                'muts not changing AAA length':
                np.array([x[by] for x in variants['constant']])
            }
        else:
            data_dict = OrderedDict()
            for change in aaa_changes:
                # Filter once per change; the original scanned
                # variants['all'] twice (for the length check and again for
                # the values).
                values = [x[by] for x in variants['all']
                          if x['change'] == change]
                # Groups with at most one value are skipped.
                if len(values) > 1:
                    data_dict[change] = np.array(values)

        plot = (
            ggplot(aes(x='value', colour='variable', fill='variable'),
                   data=prepare_data_frame(data_dict)) +
            ggtitle('Impact of variants affecting poly AAA sequences on %s' %
                    by) + xlab(by) + ylab('Kernel density estimate') +
            geom_density(alpha=0.6))

        return plot
Exemple #7
0
def t_sne_visualize(generated,n_sne,epoch):
    """Embed generated images in 2-D with t-SNE and save ``tsne<epoch>.png``.

    :param generated: object exposing ``.data.numpy()``; the array is
        reshaped to ``(n_rows, 28*28*3)``.  The labels assume exactly 49 rows
        ordered as 7 repetitions of labels 0..6 -- TODO confirm with caller.
    :param n_sne: currently ignored; see the note below.
    :param epoch: value embedded in the output file name.
    :return: None
    """
    n_label = 7
    X_sample = generated.data.numpy() / 255
    y_sample = list(range(n_label)) * n_label
    X_sample = np.reshape(np.ravel(X_sample), (X_sample.shape[0], 28*28*3))

    feat_cols = ['pixel' + str(i) for i in range(X_sample.shape[1])]
    df = pd.DataFrame(X_sample, columns=feat_cols)
    df['label'] = y_sample
    df['label'] = df['label'].apply(lambda i: str(i))
    # NOTE(review): the caller-supplied n_sne is overwritten here, as in the
    # original.  Kept so existing callers see identical output; honouring the
    # parameter would need perplexity to be adjusted as well.
    n_sne = 49
    # A plain permutation replaces the original concatenation with an empty
    # range(n, n), which only converted the row labels to floats.
    rndperm = np.random.permutation(df.shape[0])
    sample_idx = rndperm[:n_sne]
    tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
    print('INITIALIZED')
    tsne_results = tsne.fit_transform(df.loc[sample_idx, feat_cols].values)
    print('AFTER FITTING')
    df_tsne = df.loc[sample_idx, :].copy()
    df_tsne['x-tsne'] = tsne_results[:, 0]
    df_tsne['y-tsne'] = tsne_results[:, 1]

    chart = ggplot(df_tsne, aes(x='x-tsne', y='y-tsne', color='label')) \
        + geom_point(size=70, alpha=0.7) \
        + ggtitle("tSNE dimensions colored by digit")
    chart.save("tsne" + str(epoch) + ".png")

    return
Exemple #8
0
def extra(dataframe):
    """Show an xkcd-themed path plot of ``Speed`` against ``Time``.

    Sets the global matplotlib figure size before drawing; the y axis is
    reversed.
    """
    mpl.rcParams["figure.figsize"] = "18, 4"

    plot = ggplot.ggplot(dataframe, ggplot.aes(x='Time', y='Speed'))
    plot = plot + ggplot.geom_path(color='lightblue', size=5)
    plot = plot + ggplot.ggtitle('Ports & Speeds')
    plot = plot + ggplot.scale_y_reverse()
    plot = plot + ggplot.theme_xkcd()
    plot.show()
def main(file_path):
    """Load a pickled conversation and print a per-participant histogram.

    The pickle is expected to hold a mapping with ``'conversation_name'`` and
    ``'messages'``; the messages become a three-column table
    (timestamp, type, participant).

    :param file_path: path to a ``.pkl`` file.
    :return: None.  Logs an error and returns early when the path does not
        exist or does not end in ``.pkl``.
    """
    # Validate raw data path
    if not os.path.exists(file_path):
        LOG_ERROR('Could not find file: {}'.format(file_path))
        return

    # Validate raw data file type
    if not file_path.endswith('.pkl'):
        LOG_ERROR('File path must be a pickle file')
        return

    # SECURITY NOTE: pickle.load executes arbitrary code embedded in the
    # file.  Only run this on pickle files from a trusted source.
    with open(file_path, 'rb') as f:
        LOG_INFO('Parsing pickle file: {}'.format(file_path))
        conversation = pickle.load(f)
    # The file is closed here; it is not needed while building the plot.

    LOG_INFO('Found conversation: {}'.format(conversation['conversation_name']))

    df = pd.DataFrame(conversation['messages'])
    df.columns = ['Timestamp', 'Type', 'Participant']
    # Convert epoch timestamps to proleptic-Gregorian day ordinals.
    df['Datetime'] = df['Timestamp'].apply(
        lambda x: datetime.datetime.fromtimestamp(float(x)).toordinal())

    histogram = ggplot.ggplot(df, ggplot.aes(x='Datetime', fill='Participant')) \
        + ggplot.geom_histogram(alpha=0.6, binwidth=2) \
        + ggplot.scale_x_date(labels='%b %Y') \
        + ggplot.ggtitle(conversation['conversation_name']) \
        + ggplot.ylab('Number of messages') \
        + ggplot.xlab('Date')

    print(histogram)
 def plot_roc(self, experiment_type, to_plot):
     """Save an ROC line chart (TPR against FPR) coloured by parameter.

     The ``parameter`` column of ``to_plot`` is converted to strings in
     place so the colour scale is categorical.  The image is written to
     ``<results_path><experiment_type>_<mode>.png``.
     """
     # Stringify the parameter values for a categorical colour scheme.
     to_plot.loc[:, "parameter"] = list(map(str, to_plot.loc[:, "parameter"]))

     base = gg.ggplot(data=to_plot,
                      aesthetics=gg.aes(x="FPR", y="TPR", colour="parameter"))
     line_layer = gg.geom_line(gg.aes(x="FPR", y="TPR", colour="parameter"))
     p = base + line_layer + gg.ggtitle(experiment_type) \
         + gg.xlab("FPR") + gg.ylab("TPR")

     out_file = self.results_path + experiment_type + "_" + self.mode + ".png"
     gg.ggsave(filename=out_file, plot=p)
     return
Exemple #11
0
def render(data, bin_width, plot_density=False):
    """Print either a density plot or a histogram of messages over time.

    :param data: table with ``datetime`` and ``conversationWithName`` columns.
    :param bin_width: histogram bin width (unused for the density plot).
    :param plot_density: choose the density plot instead of the histogram.
    """
    if plot_density:
        plot = ggplot.ggplot(
            data, ggplot.aes(x='datetime', color='conversationWithName'))
        plot = plot + ggplot.geom_density()
        plot = plot + ggplot.scale_x_date(labels='%b %Y')
        title, y_label = 'Conversation Densities', 'Density'
    else:
        plot = ggplot.ggplot(
            data, ggplot.aes(x='datetime', fill='conversationWithName'))
        plot = plot + ggplot.geom_histogram(alpha=0.6, binwidth=bin_width)
        plot = plot + ggplot.scale_x_date(labels='%b %Y', breaks='6 months')
        title, y_label = 'Message Breakdown', 'Number of Messages'

    # Title and axis labels are added in the same order for both variants.
    plot = plot + ggplot.ggtitle(title) + ggplot.ylab(y_label) \
        + ggplot.xlab('Date')

    print(plot)
def plot_cost_history(alpha, cost_history):
    """Return a point-and-line chart of the cost at every iteration.

    ``alpha`` only appears in the chart title.
    """
    iterations = range(len(cost_history))
    cost_df = pandas.DataFrame({
        'Cost_History': cost_history,
        'Iteration': iterations,
    })

    chart = gp.ggplot(cost_df, gp.aes('Iteration', 'Cost_History'))
    chart = chart + gp.geom_point()
    chart = chart + gp.geom_line()
    return chart + gp.ggtitle('Cost History for alpha = %.3f' % alpha)
def plot_bin_dists(df, bin_def="distance_bin <= 500"):
    """Return histograms of ``R2``, one facet per ``distance_bin``.

    ``bin_def`` is a ``DataFrame.query`` expression selecting the rows to
    plot; it is also used as the chart title.  The global matplotlib figure
    size is changed as a side effect.
    """
    plt.rcParams['figure.figsize'] = np.array([16, 12]) * 0.65

    selected = df.query(bin_def)
    p = gp.ggplot(gp.aes(x='R2'), data=selected)
    p = p + gp.geom_histogram(fill='coral')
    p = p + gp.facet_wrap("distance_bin")
    p = p + gp.theme_seaborn(context='talk')
    p = p + gp.ggtitle(bin_def)

    return p
Exemple #14
0
    def histogram(self, dataframe, bins=100, width=None, height=None, palette=None, title='Histogram', values=None,
                  groups=None, legend=True):
        """Return a fill-positioned histogram of ``values`` split by ``groups``.

        ``palette`` falls back to the instance default options when omitted.
        ``width``, ``height`` and ``legend`` are accepted but not used here.
        """
        if palette is None:
            palette = self.__default_options__.get('palette', None)

        chart = ggplot(dataframe, aes(x=values, fill=groups, color=groups))
        chart = chart + geom_histogram(alpha=0.6, breaks=bins, position="fill")
        chart = chart + self._palette(palette)
        chart = chart + ggtitle(title)
        return chart + scale_y_continuous(name="Count (%s)" % values)
def plot_weather_data(df):  # older version
    """Return a line chart of summed ``EXITSn_hourly`` per ``DATEn``.

    The ``DATEn`` column of ``df`` is converted to datetimes in place.
    """
    # NOTE(review): title and axis labels mention hours and entries while
    # the data plotted is exits per date; left as in the original.
    df.DATEn = pd.to_datetime(df.DATEn)
    grouped = df.groupby('DATEn', as_index=False).sum()
    grouped.index.name = 'DATEn'

    plot = gp.ggplot(grouped, gp.aes(x='DATEn', y='EXITSn_hourly'))
    plot = plot + gp.geom_line()
    plot = plot + gp.ggtitle('Subway Ridership by Hour vs Raining')
    plot = plot + gp.xlab('Hour of the Day')
    plot = plot + gp.ylab('Subway Entries')
    return plot
def plot_weather_data(df):
    """Return a line chart of summed ``EXITSn_hourly`` per day.

    The ``DATEn`` column of ``df`` is converted to datetimes in place.
    """
    df.DATEn = pd.to_datetime(df.DATEn)
    grouped = df.groupby('DATEn', as_index=False).sum()
    grouped.index.name = 'DATEn'

    layers = (
        gp.geom_line,
        lambda: gp.ggtitle('Subway Ridership by Day'),
        lambda: gp.xlab('Date'),
        lambda: gp.ylab('Exits'),
    )
    plot = gp.ggplot(grouped, gp.aes(x='DATEn', y='EXITSn_hourly'))
    # Each layer is created right before it is added, in the same order as
    # the original chain of += statements.
    for make_layer in layers:
        plot += make_layer()
    return plot
def lineplot_compare(filename):
    """Return a per-team chart of home runs by year.

    :param filename: CSV file with ``yearID``, ``HR`` and ``teamID`` columns.
    :return: ggplot object with points and lines coloured by team.
    """
    df = pd.read_csv(filename)
    gg = (
        gp.ggplot(df, gp.aes(x="yearID", y="HR", color="teamID"))
        + gp.geom_point()
        + gp.geom_line()
        + gp.ggtitle("Homeruns by Year by Team")
        # The axis labels were swapped in the original: x carries yearID
        # and y carries HR.
        + gp.xlab("Year")
        + gp.ylab("Homeruns")
    )
    return gg
Exemple #18
0
 def plot(self):
     """Print a scatter plot of the data points and the cluster centres.

     A copy of ``self.data`` gets one extra row per cluster centre; the
     ``class_label`` column holds the assigned labels for the data rows and
     the string "center" for the appended rows.
     """
     prob231g_plot_df = self.data.copy()
     for k in range(self.num_clusters):
         # Append at the next positional label.
         n = prob231g_plot_df.shape[0]
         prob231g_plot_df.loc[n] = self.cluster_centers[k]
     prob231g_plot_df["class_label"] = [label for label in self.class_label] + \
                                       self.num_clusters * ["center"]
     p = gg.ggplot(prob231g_plot_df, gg.aes(x= "x1", y="x2", colour="class_label")) + \
         gg.geom_point() + gg.ggtitle("EM cluster assignments")
     # print(p) with a single argument behaves the same on Python 2 and 3;
     # the original print statement is a syntax error on Python 3.
     print(p)
     return
def lineplot(hr_year_csv):
    """Return a red point-and-line chart of home runs by year.

    :param hr_year_csv: CSV file with ``yearID`` and ``HR`` columns.
    :return: ggplot object.
    """
    df = pandas.read_csv(hr_year_csv)
    gg = (
        gp.ggplot(df, gp.aes(x="yearID", y="HR"))
        + gp.geom_point(color="red")
        + gp.geom_line(color="red")
        + gp.ggtitle("Homeruns by Year")
        # The axis labels were swapped in the original: x carries yearID
        # and y carries HR.
        + gp.xlab("Year")
        + gp.ylab("Homeruns")
    )
    return gg
Exemple #20
0
 def show_price(self):
     """Print a line chart of ``self.price`` against its time-step index."""
     frame_data = {
         'time_step': range(len(self.price)),
         'price': self.price,
     }
     price_table = pd.DataFrame(frame_data, columns=['time_step', 'price'])

     p = gp.ggplot(gp.aes(x='time_step', y='price'), data=price_table)
     p = p + gp.geom_line()
     p = p + gp.xlim(0, len(self.price))
     p = p + gp.ggtitle('Price trend')
     print(p)
def prob231cd_recover(initialization):
    """Reload saved k-means trials, re-save the plot and summarise objectives.

    Reads ``results/prob231cd<initialization>.pkl``, which holds a tuple
    ``(plot_df, kmcalls, num_trials)``, and writes the scatter plot next to
    it as a ``.png``.

    :param initialization: initialisation name used in the file name and
        the chart title.
    :return: dict with the ``mean``, ``sd`` and ``min`` of the ``obj`` values
        of the first ``num_trials`` entries of ``kmcalls``.
    """
    filename = "results/prob231cd" + initialization
    # Close the file deterministically; the original left the handle open.
    # NOTE: pickle must only be used on trusted files.
    with open(filename + ".pkl", "rb") as pkl_file:
        tuple_in = pkl.load(pkl_file)
    prob231c_plot_df = tuple_in[0]
    kmcalls = tuple_in[1]
    num_trials = tuple_in[2]
    p = gg.ggplot(prob231c_plot_df, gg.aes(x= "x1", y="x2", colour="data")) + \
        gg.geom_point() + gg.ggtitle(initialization + " initialization")
    gg.ggsave(filename + ".png", plot = p)
    obj = [kmcalls[i].obj for i in range(num_trials)]
    obj_stats = {"mean":np.mean(obj), "sd":np.std(obj), "min":np.min(obj)}
    return obj_stats
 def show_asset(self):
     """Print a line chart of ``self.asset_history`` by time-step index."""
     frame_data = {
         'time_step': range(len(self.asset_history)),
         'asset': self.asset_history,
     }
     asset_table = pd.DataFrame(frame_data, columns=['time_step', 'asset'])

     p = gp.ggplot(gp.aes(x='time_step', y='asset'), data=asset_table)
     p = p + gp.geom_line()
     p = p + gp.xlim(0, len(self.asset_history))
     p = p + gp.ggtitle('Asset trend')
     print(p)
def scatter_vis(costs, tss, path, f):
    """Save a scatter plot of cost against ``$N$`` with reference lines.

    Horizontal lines mark the mean (grey), maximum (red) and minimum (green)
    cost.  The chart is titled with ``f.__name__`` and saved under ``path``
    as ``<this function's name>.pdf``.  ``tss`` is not used.
    """
    plt.figure()

    p = ggplot(costs, aes(x="$N$", y="cost"))
    p = p + geom_point()
    p = p + geom_hline(y=costs.cost.mean(), color="grey")
    p = p + geom_hline(y=costs.cost.max(), color="red")
    p = p + geom_hline(y=costs.cost.min(), color="green")
    p = p + ggtitle(f.__name__)

    out_file = path + scatter_vis.__name__ + ".pdf"
    p.save(out_file)
def prob231cd_recover(initialization):
    """Reload saved k-means trials, re-save the plot and summarise objectives.

    Reads ``results/prob231cd<initialization>.pkl``, which holds a tuple
    ``(plot_df, kmcalls, num_trials)``, and writes the scatter plot next to
    it as a ``.png``.

    :param initialization: initialisation name used in the file name and
        the chart title.
    :return: dict with the ``mean``, ``sd`` and ``min`` of the ``obj`` values
        of the first ``num_trials`` entries of ``kmcalls``.
    """
    filename = "results/prob231cd" + initialization
    # Use a context manager so the pickle file is closed; the original
    # leaked the handle.  NOTE: pickle must only be used on trusted files.
    with open(filename + ".pkl", "rb") as pkl_file:
        tuple_in = pkl.load(pkl_file)
    prob231c_plot_df = tuple_in[0]
    kmcalls = tuple_in[1]
    num_trials = tuple_in[2]
    p = gg.ggplot(prob231c_plot_df, gg.aes(x= "x1", y="x2", colour="data")) + \
        gg.geom_point() + gg.ggtitle(initialization + " initialization")
    gg.ggsave(filename + ".png", plot=p)
    obj = [kmcalls[i].obj for i in range(num_trials)]
    obj_stats = {"mean": np.mean(obj), "sd": np.std(obj), "min": np.min(obj)}
    return obj_stats
Exemple #25
0
    def scatter(self, dataframe, x=None, y=None, width=None, height=None, color=None, title='Scatter', xaxis_label=None,
                yaxis_label=None):
        """Return a scatter plot of column ``y`` against column ``x``.

        :param dataframe: data to plot.
        :param x: name of the x column.
        :param y: name of the y column.
        :param width: accepted for interface compatibility; not used.
        :param height: accepted for interface compatibility; not used.
        :param color: point colour; falls back to the ``palette`` entry of
            the instance default options when omitted.
        :param title: chart title.
        :param xaxis_label: optional x axis name.
        :param yaxis_label: optional y axis name.
        :return: ggplot object.
        """
        color = self.__default_options__.get('palette', None) if color is None else color

        gg = ggplot(dataframe, aes(x, y)) + geom_point(color=color, alpha=0.6) + ggtitle(title)
        if xaxis_label:
            gg += scale_x_continuous(name=xaxis_label)
        if yaxis_label:
            # The original passed xaxis_label here, so the y axis was named
            # after the x axis (or got None).
            gg += scale_y_continuous(name=yaxis_label)

        return gg
Exemple #26
0
def plot_age_speed(df):
    """Print a scatter plot of speed against age and return ``df`` unchanged.

    Column names come from ``s.AGE_COL_NAME`` and ``s.SPEED_COL_NAME``; the
    title is built by ``_make_title`` from a fixed label and the row count.
    """
    num_rows = df.shape[0]
    title = 'age v speed'

    chart = ggplot(df, aes(s.AGE_COL_NAME, s.SPEED_COL_NAME)) + \
        ggtitle(_make_title(title, num_rows)) + \
        geom_point(colour='steelblue') + \
        scale_x_continuous()
    # print(chart) with a single argument is valid on Python 2 and 3; the
    # original print statement only parses on Python 2.
    print(chart)

    return df
def prob231b(initialization = "regular"):
    """Run k-means for several cluster counts and save one plot per count.

    Uses the module-level ``features_only`` and ``datestring``; each chart is
    written to ``results/k=<k>_<datestring>.png``.
    """
    kmcalls = []
    for num_clusters in [2,3,5,10,15,20]:
        call = KmeansCall(features_only, num_clusters, initialization)
        kmcalls.append(call)
        call.run_kmeans(verbose = False)

        # Plot a copy of the data coloured by the assigned cluster.
        df_to_plot = call.data.copy()
        df_to_plot["class_label"] = list(call.class_label)
        p = gg.ggplot(df_to_plot, gg.aes(x= "x1", y="x2", colour="class_label"))
        p = p + gg.geom_point()
        p = p + gg.ggtitle("Synth. data, k=" + str(num_clusters))

        metadata = "k=" + str(num_clusters) + "_" + datestring
        gg.ggsave(filename = "results/" + metadata +".png", plot = p)
Exemple #28
0
 def plot(self):
     """Build a line chart with one coloured line per trajectory.

     Every trajectory contributes its ``to_np_array()`` rows plus a column
     holding the trajectory name.
     """
     # NOTE(review): the plot object is built but neither returned nor
     # printed in the visible code -- confirm whether that is intended.
     dat = []
     for traj in self.trajs:
         rec = traj.to_np_array()
         # One name per record row, shaped as a column so it can be
         # concatenated next to the record.
         names = np.array([traj.name] * rec.shape[0])
         name_column = np.expand_dims(names, 1)
         dat.append(np.concatenate([rec, name_column], axis=1))

     df = pd.DataFrame(data=np.concatenate(dat, axis=0),
                       columns=['ep', 'value', 'type'])
     p = gp.ggplot(gp.aes(x='ep', y='value', color='type'), data=df)
     p = p + gp.geom_line()
     p = p + gp.ggtitle(self.title)
Exemple #29
0
def plot(mydata, opts):
    """Save a scatter plot of ``opts['y']`` against ``opts['x']``.

    The title is ``opts['title']`` followed by the number of undetected
    mutants (rows with ``ntests == 0``, summed over ``exactly``) and the
    resulting score in percent.  The chart is saved to ``opts['file']``.
    """
    undetected_rows = mydata[mydata.ntests == 0]
    detected_rows = mydata[mydata.ntests != 0]
    # Mutants killed by exactly 0 tests, and by at least one test.
    nd = sum(undetected_rows.exactly)
    d = sum(detected_rows.exactly)
    total = nd + d
    print("Not detected = ", nd, "/", total)

    score = (1 - nd / total) * 100.0
    title = opts['title'] + (' ND=%d/%d (Mu: %3.1f%%)' % (nd, total, score))

    p = gg.ggplot(gg.aes(x=opts['x'], y=opts['y']), data=mydata)
    p = p + gg.geom_point()
    p = p + gg.xlab(opts['x']) + gg.ylab(opts['y'])
    p = p + gg.ggtitle(title)

    p.save(opts['file'])
def models_llhd(pm_llhd):
    """
    Print a scatter plot tracking the total likelihood of reads in a model
    (cluster).

    :param pm_llhd: data passed to ggplot; it must provide the columns
        "iteration num" (x axis) and "log value" (y axis).

    x axis: iteration time
    y axis: sum likelihood log value
    """
    mapping = gp.aes(x="iteration num", y="log value")
    p = gp.ggplot(mapping, data=pm_llhd)
    p = p + gp.geom_point(color="blue")
    p = p + gp.ggtitle(u"model likelihood")
    print(p)
def prob231b(initialization="regular"):
    """Cluster the synthetic data for several values of k and save each plot.

    Reads the module-level ``features_only`` and ``datestring``; output goes
    to ``results/k=<k>_<datestring>.png``.
    """
    cluster_counts = [2, 3, 5, 10, 15, 20]
    kmcalls = [0] * len(cluster_counts)
    for position, num_clusters in enumerate(cluster_counts):
        current = KmeansCall(features_only, num_clusters, initialization)
        kmcalls[position] = current
        current.run_kmeans(verbose=False)

        # Colour a copy of the data by the assigned cluster label.
        df_to_plot = current.data.copy()
        df_to_plot["class_label"] = list(current.class_label)

        mapping = gg.aes(x= "x1", y="x2", colour="class_label")
        chart_title = "Synth. data, k=" + str(num_clusters)
        p = gg.ggplot(df_to_plot, mapping) + gg.geom_point() \
            + gg.ggtitle(chart_title)

        metadata = "k=" + str(num_clusters) + "_" + datestring
        gg.ggsave(filename="results/" + metadata + ".png", plot=p)
Exemple #32
0
    def plot_outcomes(self, chart_title=None, use_ggplot=False):
        """ Plot the outcomes of patients observed.

        Nothing is drawn when no patients have been observed.

        :param chart_title: optional chart title. Default is fairly verbose
        :type chart_title: str
        :param use_ggplot: True to use ggplot, else matplotlib
        :type use_ggplot: bool
        :return: the ggplot object when ``use_ggplot`` is true; the
                 matplotlib branch draws on the current figure and, in the
                 visible code, returns nothing.

        """

        if not chart_title:
            chart_title = "Each point represents a patient\nA circle indicates no toxicity, a cross toxicity\n"

        # Both back-ends only draw when there is at least one patient.
        if not self.size() > 0:
            return None

        if use_ggplot:
            from ggplot import (ggplot, ggtitle, geom_text, aes, ylim)
            import numpy as np
            import pandas as pd

            patient_number = range(1, self.size()+1)
            symbol = np.where(self.toxicities(), 'X', 'O')
            columns = {'Patient number': patient_number,
                       'Dose level': self.doses(),
                       'DLT': self.toxicities(),
                       'Symbol': symbol}
            data = pd.DataFrame(columns)

            p = ggplot(data, aes(x='Patient number', y='Dose level', label='Symbol'))
            p = p + ggtitle(chart_title)
            p = p + geom_text(aes(size=20, vjust=-0.07))
            p = p + ylim(1, 5)
            return p

        import matplotlib.pyplot as plt
        import numpy as np

        patient_number = np.arange(1, self.size()+1)
        doses_given = np.array(self.doses())
        tox_loc = np.array(self.toxicities()).astype('bool')
        no_tox_loc = ~tox_loc

        # Crosses for patients with toxicity, circles for those without.
        marker_style = dict(s=300, facecolors='none', edgecolors='k')
        if tox_loc.any():
            plt.scatter(patient_number[tox_loc], doses_given[tox_loc],
                        marker='x', **marker_style)
        if no_tox_loc.any():
            plt.scatter(patient_number[no_tox_loc], doses_given[no_tox_loc],
                        marker='o', **marker_style)

        plt.title(chart_title)
        plt.ylabel('Dose level')
        plt.xlabel('Patient number')
        plt.yticks(self.dose_levels())

        # Size the figure with a golden-ratio aspect.
        p = plt.gcf()
        phi = (np.sqrt(5)+1)/2.
        p.set_size_inches(12, 12/phi)
Exemple #33
0
def plot_distance_trip_time(df):
    """Print a smoothed scatter plot of distance against trip duration.

    Column names come from ``s.TRIP_DURATION_COL`` and
    ``s.DISTANCE_TRAVELED_COL_NAME``; the title is built by ``_make_title``.
    Returns ``df`` unchanged.
    """
    num_rows = df.shape[0]
    title = 'trip duration v distance travelled'

    chart = ggplot(df, aes(s.TRIP_DURATION_COL, s.DISTANCE_TRAVELED_COL_NAME)) + \
        ggtitle(_make_title(title, num_rows)) + \
        stat_smooth(colour="red") + \
        geom_point(colour='steelblue') + \
        scale_x_continuous()
    # print(chart) with a single argument is valid on Python 2 and 3; the
    # original print statement only parses on Python 2.
    print(chart)

    return df
def lineplot_compare(filename):  # Cleaner version with string vars
    """Return a per-team chart of home runs by year.

    :param filename: CSV file with ``yearID``, ``HR`` and ``teamID`` columns.
    :return: ggplot object with points and lines coloured by team.
    """
    df = pd.read_csv(filename)
    p_title = "Homeruns by Year by Team"
    # The axis labels were swapped in the original: x carries yearID and
    # y carries HR.
    p_xlab = "Year"
    p_ylab = "Homeruns"
    gg = (
        gp.ggplot(df, gp.aes(x="yearID", y="HR", color="teamID"))
        + gp.geom_point()
        + gp.geom_line()
        + gp.ggtitle(p_title)
        + gp.xlab(p_xlab)
        + gp.ylab(p_ylab)
    )
    return gg
def plot_cost_history(alpha, cost_history):
    """Return a scatter plot of the cost history, one point per iteration.

    To view it, uncomment the

        plot_cost_history(alpha, cost_history)

    call in predictions.  When running locally, print the value returned by
    this function.
    """
    frame_data = {
        'Cost_History': cost_history,
        'Iteration': range(len(cost_history)),
    }
    cost_df = pandas.DataFrame(frame_data)

    chart = ggplot(cost_df, aes('Iteration', 'Cost_History'))
    chart = chart + geom_point()
    return chart + ggtitle('Cost History for alpha = %.3f' % alpha)
Exemple #36
0
def googletrend_command(delta_t, threshold=0.0, inverse=False):
    """Run the google trend algorithm over a parameter grid and save its ROC.

    For every threshold in ``0, 0.1, ...`` up to ``threshold`` and every
    delta_t from 1 up to (but excluding) ``int(delta_t)``, the classifier is
    run and its TPR/FPR recorded.  The ROC graph, with a y=x baseline, is
    saved to ``./plots/googletrend[_inverse].jpg``.

    :param delta_t:   the upper bound for original delta_t parameter
    :param threshold: upper bound for the threshold of differentiating two classes
    :param inverse:   whether to inverse the classifier
    """
    ## handle filepath and title based on parameter inverse
    filename = "googletrend"
    titlename = "ROC of google trend classifier"
    if inverse:
        filename += "_inverse"
        titlename += " (inverse version)"
    filepath = "./plots/%s.jpg" % filename
    ## generate data first
    data = googletrend.preprocess()
    ## store classifier evaluation metrics into dict
    output = {}
    output['tpr'] = []
    output['fpr'] = []
    output['plot'] = []
    for thre in np.arange(0, threshold + 0.1, 0.1):
        # print(...) with one pre-formatted argument and range() behave the
        # same on Python 2 and 3; the original print statement and xrange
        # only work on Python 2.
        print("==> threshold: %f, inverse: %s" % (thre, inverse))
        for i in range(1, int(delta_t)):
            googletrend.algorithm(data, i, thre, inverse)
            tp_rate, fp_rate = googletrend.evaluate(data)
            output['tpr'].append(tp_rate)
            output['fpr'].append(fp_rate)
            output['plot'].append('thre_' + str(thre))
    ## plot ROC graph
    ## add a y=x baseline for comparison
    output['tpr'].extend([0.0, 1.0])
    output['fpr'].extend([0.0, 1.0])
    output['plot'].extend(['baseline', 'baseline'])
    df = pd.DataFrame(output)
    graph = gg.ggplot(df, gg.aes('fpr', 'tpr', color='plot')) + \
        gg.theme_seaborn() + \
        gg.ggtitle(titlename) + \
        gg.xlab("FPR") + \
        gg.ylab("TPR") + \
        gg.xlim(0.0, 1.0) + \
        gg.ylim(0.0, 1.0) + \
        gg.geom_point() + \
        gg.geom_line()
    gg.ggsave(plot=graph, filename=filepath, width=6, height=6, dpi=100)
Exemple #37
0
    def histogram(self,
                  dataframe,
                  bins=100,
                  width=None,
                  height=None,
                  palette=None,
                  title='Histogram',
                  values=None,
                  groups=None,
                  legend=True):
        """Build a fill-positioned histogram of ``values`` grouped by ``groups``.

        When ``palette`` is not given, the ``palette`` entry of the instance
        default options is used.  ``width``, ``height`` and ``legend`` are
        accepted but unused in this method.
        """
        if palette is None:
            palette = self.__default_options__.get('palette', None)

        mapping = aes(x=values, fill=groups, color=groups)
        figure = ggplot(dataframe, mapping)
        figure = figure + geom_histogram(alpha=0.6, breaks=bins, position="fill")
        figure = figure + self._palette(palette)
        figure = figure + ggtitle(title)
        figure = figure + scale_y_continuous(name="Count (%s)" % values)
        return figure
Exemple #38
0
def graph1(score_data):
    """ Average score as time goes on;
        Creates and returns graph 1, a line graph.

        ``score_data`` holds the header row first and the data rows after
        it; the date column is located with ``find_time_stamp``. """

    header = score_data[0]
    date_column = header[find_time_stamp(score_data)]

    data = DataFrame(score_data[1:], columns=header)

    # Integer-typed columns are the numerical questions to graph.
    num_questions = data.select_dtypes(include=['int64']).columns.values

    # Long format: one row per (date, question) pair.
    melt_options = dict(id_vars=date_column,
                        value_vars=num_questions,
                        var_name="Question",
                        value_name="Score")
    new_data = pd.melt(data, **melt_options)

    # Parse the date strings into real datetimes.
    new_data[date_column] = pd.to_datetime(new_data[date_column],
                                           format="%m/%d/%Y")

    # Average per (date, question) for the individual-question facet.
    per_question = new_data.groupby([date_column, 'Question']).mean()
    new_data = per_question.reset_index()
    new_data['All'] = "Indiviual Questions"

    # Average per date across all questions for the overall facet.
    new_data2 = new_data.groupby(date_column).mean().reset_index()
    new_data2['Question'] = "All Questions"
    new_data2['All'] = "Average of All Questions"

    new_data = pd.concat([new_data, new_data2])

    # Dates are plotted as integers on a continuous axis without tick labels.
    new_data[date_column] = new_data[date_column].astype('int64')

    # Time graph with a separate line for each question.
    ret = ggplot.ggplot(
        ggplot.aes(x=date_column, y="Score", colour="Question"), new_data)
    ret = ret + ggplot.geom_point()
    ret = ret + ggplot.geom_line()
    ret = ret + ggplot.facet_grid("All")
    ret = ret + ggplot.scale_x_continuous(labels=[""], breaks=0)
    ret = ret + ggplot.labs(x="Time", y="Average Question Score")
    ret = ret + ggplot.ggtitle("Question Scores Over Time")
    return ret
def prob231g():
    """Run EM with three clusters and save its diagnostics.

    Writes, under ``results/``: the log-likelihood curve
    (``prob231g_loglike.png``), the cluster-assignment scatter plot
    (``prob231g_clusters.png``) and the pickled ``EMCall`` object
    (``prob231g_a.pkl``).  Uses the module-level ``features_only`` and
    ``labels_only``.
    """
    filename = "results/prob231g"

    num_clusters_231g = 3
    emcall = EMCall(features_only, labels_only, num_clusters_231g)
    emcall.run_em()

    plt.plot(emcall.log_likelihood_record)
    plt.title("Likelihood over EM iterations")
    plt.savefig(filename + "_loglike.png")

    prob231g_plot_df = emcall.data.copy()
    prob231g_plot_df["class_label"] = [label for label in emcall.class_label]
    p = gg.ggplot(prob231g_plot_df, gg.aes(x= "x1", y="x2", colour="class_label")) + \
        gg.geom_point() + gg.ggtitle("EM cluster assignments")
    gg.ggsave(filename + "_clusters.png", plot=p)

    # Close the output file deterministically so the pickle is flushed to
    # disk; the original never closed the handle.
    with open(filename + "_a.pkl", "wb") as pkl_file:
        pkl.dump(obj=emcall, file=pkl_file)
    print("Done with 231g.")
    return
def prob231g():
    """Run EM with three clusters and save its diagnostics.

    Writes, under ``results/``: the log-likelihood curve
    (``prob231g_loglike.png``), the cluster-assignment scatter plot
    (``prob231g_clusters.png``) and the pickled ``EMCall`` object
    (``prob231g_a.pkl``).  Uses the module-level ``features_only`` and
    ``labels_only``.
    """
    filename = "results/prob231g"

    num_clusters_231g = 3
    emcall = EMCall(features_only, labels_only, num_clusters_231g)
    emcall.run_em()

    plt.plot(emcall.log_likelihood_record)
    plt.title("Likelihood over EM iterations")
    plt.savefig(filename + "_loglike.png")

    prob231g_plot_df = emcall.data.copy()
    prob231g_plot_df["class_label"] = [label for label in emcall.class_label]
    p = gg.ggplot(prob231g_plot_df, gg.aes(x= "x1", y="x2", colour="class_label")) + \
        gg.geom_point() + gg.ggtitle("EM cluster assignments")
    gg.ggsave(filename + "_clusters.png", plot = p)

    # A context manager guarantees the pickle file is flushed and closed;
    # the original left the handle open.
    with open(filename + "_a.pkl", "wb") as pkl_file:
        pkl.dump(obj = emcall, file = pkl_file)
    print("Done with 231g.")
    return
Exemple #41
0
def plotAlignmentStat(input, output):
    """Plot an alignment summary as a flipped, stacked bar chart.

    :param input: CSV file with a ``category`` column and one column per
        sample; numbers may use "," as thousands separator.
    :param output: path the chart is saved to.
    :return: None

    Only the rows at positions 2, 4 and 5 of the CSV are plotted.
    """
    # Imported locally, as in the original.
    from ggplot import ggplot, geom_bar, aes, theme_bw, ggtitle, coord_flip

    df = pd.read_csv(input, thousands=",")
    # print(x) with a single argument behaves the same on Python 2 and 3;
    # the original print statements only parse on Python 2.
    print(df.dtypes)
    print(df)

    # Keep only the rows of interest.
    df = df.iloc[[2, 4, 5]]
    dfm = pd.melt(df,
                  id_vars=['category'],
                  var_name='sampleName',
                  value_name='Value')
    print(dfm)

    p = ggplot(dfm, aes(x='sampleName', weight='Value', fill='category')) \
        + geom_bar() \
        + theme_bw() \
        + ggtitle("Alignment Summary stats") \
        + coord_flip()

    dirname, filename = os.path.split(output)
    print(dirname)
    print(filename)
    p.save(output)
    return
Exemple #42
0
def graph2(score_data):
    """ Average scores for each question on most recent date;
        Creates and returns graph 2, a bar graph.

        score_data is a list of rows whose first row holds the column
        headers.  Everything after the first '.' in a header is dropped
        before the headers are used as column names.  The caller's header
        row is left untouched.

        Returns a ggplot bar chart with one bar per question whose height
        is the mean score on the latest date found in the date column. """

    # Normalise the headers on a copy so the caller's data is not modified.
    columns_data = [name.split('.')[0] for name in score_data[0]]

    # Take the date column name from the normalised headers so it always
    # matches a column of the frame built below.
    date_column = columns_data[find_time_stamp(score_data)]

    data = DataFrame(score_data[1:], columns=columns_data)

    # Get all columns that are numerical questions so we know what to graph
    num_questions = data.select_dtypes(include=['int64']).columns.values

    # Melt data so that each question is in a separate row
    new_data = pd.melt(data,
                       id_vars=date_column,
                       value_vars=num_questions,
                       var_name="Question",
                       value_name="Score")

    # Convert date string into actual date type
    new_data[date_column] = pd.to_datetime(new_data[date_column],
                                           format="%m/%d/%Y")

    # Latest date present in the data
    recent_date = new_data[date_column].max()

    # Keep only the rows recorded on the most recent date.  The column is
    # looked up by name instead of assuming it is called "Timestamp".
    new_data = new_data[new_data[date_column] == recent_date]

    # Average the scores per question; only the Score column is aggregated,
    # which is all the chart below needs.
    new_data = new_data.groupby("Question")["Score"].mean().reset_index()

    # Create bar graph with the averages of the most recent date
    ret = ggplot.ggplot(ggplot.aes(x="Question", weight="Score"), new_data) +\
        ggplot.geom_bar() +\
        ggplot.ggtitle("Most Recent Average Scores")
    return ret
Exemple #43
0
def data_output(data, chart_title):
    """Show ``data`` as a printed table or as a ggplot line graph.

    The user is asked on standard input which form they want.  An answer
    starting with "t" (any case) prints the data; an answer starting with
    "l" or "g" (any case) prints a line graph of the 'Value' column against
    the 'Month, Year' column, titled ``chart_title``.  Any other answer,
    including an empty one, produces no output.

    data: frame with the columns 'Month, Year' and 'Value'.
    chart_title: title of the line graph.
    """
    # raw_input only exists on Python 2; fall back to input on Python 3.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    print("Good News! You're data has been returned. I'm happy to show it to you.")
    print("Just tell me how you want it - Table or Line Graph?")

    # Slicing instead of indexing keeps an empty answer from raising.
    choice = read_line("Choose table or line > ").strip()[:1].lower()

    if choice == "t":
        print("Ok, here's your data.")
        print(data)
    elif choice in ("l", "g"):
        import ggplot as gg

        # The date scale used to be a stand-alone statement after the chain
        # and therefore never reached the plot; it is now part of it.
        plot = gg.ggplot(gg.aes(x='Month, Year', y='Value'), data=data) + \
            gg.geom_point(color='black') + \
            gg.geom_line(color='green') + \
            gg.ggtitle(chart_title) + \
            gg.xlab("Month, Year") + \
            gg.ylab("Value") + \
            gg.scale_x_date(breaks=gg.date_breaks('1 month'), labels=gg.date_format("%B"))

        print(plot + gg.theme_xkcd())
			
def lineplot(hr_year_csv):
    """Plot the number of home runs per year as red points joined by a red line.

    hr_year_csv: path (or buffer) of a CSV file with the two columns
        yearID and HR -- the number of HR hit in Major League baseball in
        each year.  Sample data:
        https://www.dropbox.com/s/awgdal71hc1u06d/hr_year.csv

    Returns the ggplot object.
    """
    # Read the file the caller passed in rather than a hard-coded file name.
    df = pd.read_csv(hr_year_csv)
    gg = gp.ggplot(df, gp.aes('yearID', 'HR'))
    gg += gp.geom_point(color='red')
    gg += gp.geom_line(color='red')
    gg += gp.ggtitle('Total HRs by Year')
    gg += gp.xlab('Year')
    gg += gp.ylab('HR')

    return gg
Exemple #45
0
    def plot(self, title='The amazing peixinho\'s plots'):
        """Return a ggplot scatter plot of the dataset's 2-D projection.

        The projection stored under ``self._dataset['projection']`` is used
        when present; if it is missing or has three or more columns, a new
        one is computed from ``self._dataset['feats']`` with t-SNE and stored
        back.  Points are coloured by ``self._dataset['truelabel']``.
        """

        import ggplot
        import sklearn.manifold
        import pandas

        projection = self._dataset.get('projection', None)
        needs_projection = projection is None or projection.shape[1] >= 3
        if needs_projection:
            embedder = sklearn.manifold.TSNE()
            projection = embedder.fit_transform(self._dataset['feats'])
            # Cache the result on the dataset for later calls.
            self._dataset['projection'] = projection

        frame = pandas.DataFrame({
            'X': projection[:, 0].ravel(),
            'Y': projection[:, 1].ravel(),
            'truelabel': self._dataset['truelabel'].ravel()
        })
        frame['truelabel'] = frame['truelabel'].astype(object)

        mapping = ggplot.aes(x='X', y='Y', color='truelabel')
        chart = ggplot.ggplot(mapping, data=frame)
        chart = chart + ggplot.geom_point()
        return chart + ggplot.ggtitle(title)
def lineplot(hr_year_csv):
    """Plot the number of home runs per year as red points joined by a red line.

    hr_year_csv: path (or buffer) of a CSV file with the two columns
        yearID and HR -- the number of HR hit in Major League baseball in
        each year.  Sample data:
        https://www.dropbox.com/s/awgdal71hc1u06d/hr_year.csv

    Returns the ggplot object.
    """
    # Read the file the caller passed in rather than a hard-coded file name.
    df = pd.read_csv(hr_year_csv)
    gg = gp.ggplot(df, gp.aes('yearID', 'HR'))
    gg += gp.geom_point(color='red')
    gg += gp.geom_line(color='red')
    gg += gp.ggtitle('Total HRs by Year')
    gg += gp.xlab('Year')
    gg += gp.ylab('HR')

    return gg
Exemple #47
0
    def scatter(self,
                dataframe,
                x=None,
                y=None,
                width=None,
                height=None,
                color=None,
                title='Scatter',
                xaxis_label=None,
                yaxis_label=None,
                label=None):
        """Build a ggplot scatter plot of two columns of ``dataframe``.

        :param dataframe: data to plot.
        :param x: name of the column mapped to the x axis.
        :param y: name of the column mapped to the y axis.
        :param width: accepted for interface compatibility; not used here.
        :param height: accepted for interface compatibility; not used here.
        :param color: point colour; defaults to the ``'palette'`` entry of
            ``self.__default_options__`` when omitted.
        :param title: chart title.
        :param xaxis_label: optional name for the x axis.
        :param yaxis_label: optional name for the y axis.
        :param label: accepted for interface compatibility; not used here.
        :return: the ggplot object.
        """
        if color is None:
            color = self.__default_options__.get('palette', None)

        gg = ggplot(dataframe, aes(x, y)) + geom_point(
            color=color, alpha=0.6) + ggtitle(title)
        if xaxis_label:
            gg += scale_x_continuous(name=xaxis_label)
        if yaxis_label:
            # Name the y axis with its own label, not the x-axis label.
            gg += scale_y_continuous(name=yaxis_label)

        return gg
Exemple #48
0
# -*- coding:utf-8 -*-
# Prepare the data
import ggplot as gp  # a namespaced import is preferred over `import *`
import pandas as pd
meat = gp.meat


# Scatter plot of beef over time
p = (gp.ggplot(gp.aes(x='date', y='beef'), data=meat)
     + gp.geom_point(color='red')
     + gp.ggtitle(u'散点图'))
print(p)
# Line plot of beef over time
p = (gp.ggplot(gp.aes(x='date', y='beef'), data=meat)
     + gp.geom_line(color='blue')
     + gp.ggtitle(u'折线图'))
print(p)
# Points and line in the same chart
p = (gp.ggplot(gp.aes(x='date', y='beef'), data=meat)
     + gp.geom_point(color='red')
     + gp.geom_line(color='blue')
     + gp.ggtitle(u'散点图+折线图'))
print(p)

# Gather the variables to be shown into a single column
meat_lng = pd.melt(meat[['date', 'beef', 'pork', 'broilers']],
                   id_vars='date')
# meat_lng contains date, value (the column of variable values) and
# variable (the column of variable names)
p = (gp.ggplot(gp.aes(x='date', y='value', colour='variable'), data=meat_lng)
     + gp.geom_point()
     + gp.geom_line())
print(p)




# Same long-form data, one facet per variable
meat_lng = pd.melt(meat[['date', 'beef', 'pork', 'broilers']],
                   id_vars='date')
p = (gp.ggplot(gp.aes(x='date', y='value', colour='variable'), data=meat_lng)
     + gp.geom_point()
     + gp.facet_wrap('variable'))
print(p)

# Histogram of the beef column
p = gp.ggplot(gp.aes(x='beef'), data=meat) + gp.geom_histogram()
print(p)
# The _Python_ `ggplot` package works with the `pandas.DataFrame` data sets of the _Python_ `pandas` package. Calling the `toPandas` method of an _Apache Spark_ `pyspark.sql.DataFrame` gives us a `pandas.DataFrame` object, which we use to draw the scatter plot.

# In[22]:

# for drawing plots inside Jupyter Notebook cells
get_ipython().magic('matplotlib inline')
import ggplot as gg


# In[23]:

(
    gg.ggplot(gg.aes(x="x", y="y", color="prediction"), data=ca1WithPredictionDF.toPandas()) + 
    gg.geom_point() + 
    gg.ggtitle("K = 2")
)


# We can see that $K = 2$ is clearly not the optimal number of clusters. In the $\mathbb{R}^2$ case we can draw a scatter plot, but what should we do in the $\mathbb{R}^p$ case, when $p > 2$?
# One approach is to train several models with different values of $K$ and compare them using the values of characteristics defined in advance.

# We train several models with different values of $K$ and compute their $S_K$ values.

# In[24]:

kValues = [2, 3, 4, 5, 6, 7]
# One KMeans model per candidate K, each trained on the same featuresRDD.
models = [KMeans.train(featuresRDD, k=k) for k in kValues]
# Cost of every model, computed on the data it was trained on.
WSSSEs = [m.computeCost(featuresRDD) for m in models]

# (K, cost as float) pairs, in the order of kValues.
rowsWSSSSE = list(zip(kValues, map(float, WSSSEs)))
Exemple #50
0
def main():
    """Simulate the rolls of the selected ruleset and save its graphs.

    Parses the command line, lets the ruleset module prepare itself, runs
    ``rulemod.simulate_rolls`` for every entry of ``rulemod.variables`` in a
    process pool and, when ``args.graph`` is set, saves one PNG per entry of
    ``rulemod.graphs``.

    Labels and per-graph settings are resolved in this order: the value
    stored on the graph itself, then the module-level attribute of the
    ruleset, then the built-in default (``None`` when there is none).

    Returns:
        0 on completion.  The process exits with status 130 when it is
        interrupted while simulating.
    """
    global args, ruleset
    # Arguments Parser
    argparser, subparser = parser_setup()
    register_rules(subparser)
    args = argparser.parse_args()
    rulemod = sys.modules["rpgdice.rulesets.%s" % args.ruleset]
    rulemod.prepare(args, srand)

    # Single-argument print(...) calls produce the same output on Python 2
    # and Python 3.
    if args.debug:
        print("DEBUG: args %s" % (args,))
        print("")

    results = list()
    pool = multiprocessing.Pool()
    try:
        for result in pool.map(rulemod.simulate_rolls, rulemod.variables):
            results.extend(result)
        pool.close()
        pool.join()
    except KeyboardInterrupt:
        sys.exit(130)
    if args.debug:
        print("DEBUG: results:")
        pprint(results)
        print("")

    # Default axis/legend labels, each overridable by a ruleset attribute.
    conf = {"vlab": "Variables", "xlab": "Outcome", "ylab": "Probability %"}
    for item in conf:
        conf[item] = getattr(rulemod, item, conf[item])

    columns = ("Graph", conf["vlab"], conf["xlab"], "Count", conf["ylab"])
    data = pandas.DataFrame.from_records(results, columns=columns)

    # Create and save graphs
    for gkey in rulemod.graphs:
        # Graph Defaults
        graph_conf = conf.copy()
        graph_conf["file_prefix"] = "%s%02d" % (args.ruleset, gkey)
        graph_conf["file_suffix"] = str()
        # colors
        colors_lower = ["#ff0000", "#cc0000", "#993300", "#666600"]
        colors_upper = ["#006666", "#003399", "#0000cc", "#0000ff"]
        colors_mid = ["#000000"]
        color_count = len(rulemod.variables) - 1
        # Floor division keeps the slice bounds integers on Python 3 too.
        if color_count % 2 == 0:
            lower_slice = (color_count // 2) * -1
            upper_slice = color_count // 2
        else:
            lower_slice = ((color_count - 1) // 2) * -1
            upper_slice = (color_count + 1) // 2
        graph_conf["color_list"] = colors_lower[lower_slice:] + colors_mid + colors_upper[0:upper_slice]

        # graph_conf from graph
        graph_items = (
            "color_list",
            "file_prefix",
            "file_suffix",
            "graph_type",
            "limits",
            "x_breaks",
            "x_labels",
            "title",
            "vlab",
            "xlab",
            "ylab",
        )
        for item in graph_items:
            try:
                graph_conf[item] = rulemod.graphs[gkey][item]
            except (KeyError, IndexError, TypeError):
                # Not set on the graph: use the ruleset attribute, else keep
                # the value already present, else None.
                if hasattr(rulemod, item):
                    graph_conf[item] = getattr(rulemod, item)
                else:
                    graph_conf.setdefault(item, None)
        if args.debug:
            print("DEBUG: graph_conf:")
            pprint(graph_conf)
            print("")

        # plot_data: the rows of this graph, with its own column labels
        plot_data = data.copy()
        plot_data = plot_data[plot_data["Graph"] == gkey]
        plot_data.rename(
            columns={
                conf["vlab"]: graph_conf["vlab"],
                conf["xlab"]: graph_conf["xlab"],
                conf["ylab"]: graph_conf["ylab"],
            },
            inplace=True,
        )
        plot_data.index = range(1, len(plot_data) + 1)
        if args.debug:
            print("DEBUG: plot_data:")
            pprint(plot_data)
            print("")

        # Create plot
        if args.graph:
            plot = (
                ggplot.ggplot(
                    ggplot.aes(x=graph_conf["xlab"], y=graph_conf["ylab"], color=graph_conf["vlab"]), data=plot_data
                )
                + ggplot.ggtitle(graph_conf["title"])
                + ggplot.theme_gray()
                + ggplot.scale_colour_manual(values=graph_conf["color_list"])
            )
            plot.rcParams["font.family"] = "monospace"
            if graph_conf["x_breaks"] and graph_conf["x_labels"]:
                plot += ggplot.scale_x_discrete(breaks=graph_conf["x_breaks"], labels=graph_conf["x_labels"])
            if graph_conf["limits"]:
                plot += ggplot.ylim(graph_conf["limits"][0], graph_conf["limits"][1])
            if graph_conf["graph_type"] == "bars":
                # "Bars" are drawn as very thick lines, each annotated with
                # its percentage.
                plot += ggplot.geom_line(size=20)
                text_data = plot_data[plot_data["Count"] > 0]
                text_data.index = range(0, len(text_data))
                outcomes = dict(text_data[graph_conf["xlab"]])
                percents = dict(text_data[graph_conf["ylab"]])
                for k in outcomes:
                    percent = "%4.1f%%" % percents[k]
                    x = outcomes[k]
                    y = percents[k] + 4
                    color = graph_conf["color_list"][k]
                    plot += ggplot.geom_text(label=[percent], x=[x, x + 1], y=[y, y - 1], color=color)
            else:
                plot += ggplot.geom_line()
                plot += ggplot.geom_point(alpha=0.3, size=50)
            # Rulesets may post-process the finished plot.
            if hasattr(rulemod, "update_plot"):
                plot = rulemod.update_plot(gkey, graph_conf, plot, plot_data)
            if args.dumpsave:
                filename = "/dev/null"
            else:
                filename = "%s%s.png" % (graph_conf["file_prefix"], graph_conf["file_suffix"])
            ggplot.ggsave(filename, plot, format="png", dpi=300)

    return 0
Exemple #51
0
    def plot_toxicity_probabilities(self, chart_title=None, use_ggplot=False):
        """ Plot prior and posterior dose-toxicity curves.

        The ggplot version draws the prior and posterior curves and a
        horizontal line at ``self.target``.  The matplotlib version also
        draws the 5% and 95% posterior quantiles and draws on the current
        pyplot figure.

        :param chart_title: optional chart title. Default is fairly verbose
        :type chart_title: str
        :param use_ggplot: True to use ggplot, else matplotlib
        :type use_ggplot: bool
        :return: plot of toxicity curves (a ggplot object, or the matplotlib
                 figure when use_ggplot is False)

        """

        if not chart_title:
            chart_title = "Prior (dashed) and posterior (solid) dose-toxicity curves"
            chart_title = chart_title + "\n"

        if use_ggplot:
            from ggplot import (ggplot, ggtitle, geom_line, geom_hline, aes, ylim)
            import pandas as pd
            data = pd.DataFrame({'Dose level': self.dose_levels(),
                                 'Prior': self.prior,
                                 'Posterior': self.prob_tox(),
                                 })
            var_name = 'Type'
            value_name = 'Probability of toxicity'
            # Long form: one row per (dose level, curve type).
            melted_data = pd.melt(data, id_vars='Dose level', var_name=var_name, value_name=value_name)

            p = ggplot(melted_data, aes(x='Dose level', y=value_name, linetype=var_name)) + geom_line() \
                + ggtitle(chart_title) + ylim(0, 1) + geom_hline(yintercept=self.target, color='black')
            # Can add confidence intervals once I work out linetype=??? in ggplot

            return p
        else:
            import matplotlib.pyplot as plt
            import numpy as np
            dl = self.dose_levels()
            prior_tox = self.prior
            post_tox = self.prob_tox()
            post_tox_lower = self.get_tox_prob_quantile(0.05)
            post_tox_upper = self.get_tox_prob_quantile(0.95)
            plt.plot(dl, prior_tox, '--', c='black')
            plt.plot(dl, post_tox, '-', c='black')
            plt.plot(dl, post_tox_lower, '-.', c='black')
            plt.plot(dl, post_tox_upper, '-.', c='black')
            plt.scatter(dl, prior_tox, marker='x', s=300, facecolors='none', edgecolors='k')
            plt.scatter(dl, post_tox, marker='o', s=300, facecolors='none', edgecolors='k')
            plt.axhline(self.target)
            plt.ylim(0, 1)
            plt.xlim(np.min(dl), np.max(dl))
            plt.xticks(dl)
            plt.ylabel('Probability of toxicity')
            plt.xlabel('Dose level')
            plt.title(chart_title)

            p = plt.gcf()
            # Golden-ratio aspect: 12 inches wide.
            phi = (np.sqrt(5) + 1) / 2.
            p.set_size_inches(12, 12 / phi)
            # Hand the figure back, as the ggplot branch does with its plot.
            return p
#coding=utf-8
#!/usr/bin/python


### Source: http://nbviewer.ipython.org/gist/wrobstory/1eb8cb704a52d18b9ee8/Up%20and%20Down%20PyData%202014.ipynb

# Import the required modules
import ggplot as gg
from ggplot import ggplot
import numpy as np
import pandas as pd

# Load the turbine table and keep the rows whose 'Rotor Diameter' exceeds 10.
df = pd.read_csv('/Users/zhangbo/github/pydatasv2014/USGS_WindTurbine_201307_cleaned.csv')
min_heights = df.loc[df['Rotor Diameter'] > 10]

# Scatter plot of swept area against power for the first 500 remaining rows.
ggplot(gg.aes(x='Turbine MW', y='Rotor Swept Area'), data=min_heights.head(500)) + \
    gg.geom_point(color='#75b5aa', size=75) + \
    gg.ggtitle("Rotor Swept Area vs. Power") + \
    gg.xlab("Power (MW)") + \
    gg.ylab("Rotor Swept Area (m^2)")
# Summary statistics of entries/exits, split by a readable rain label.
# .copy() makes the column subset an independent frame, so adding columns to
# it does not trigger pandas' SettingWithCopyWarning.
turnstile_rain = turnstile_weather[["rain", "ENTRIESn_hourly", "EXITSn_hourly"]].copy()
turnstile_rain["rain2"] = np.where(turnstile_rain["rain"] == 1, "raining", "not raining")
turnstile_rain.groupby("rain2").describe()

# Density of log10(hourly entries + 1), one facet per rain label.
turnstile_rain = turnstile_weather[["rain", "ENTRIESn_hourly", "EXITSn_hourly"]].copy()
turnstile_rain["ENTRIESn_hourly_log10"] = np.log10(turnstile_rain["ENTRIESn_hourly"] + 1)
turnstile_rain["rain2"] = np.where(turnstile_rain["rain"] == 1, "raining", "not raining")
set1 = brewer2mpl.get_map('Set1', 'qualitative', 3).mpl_colors
plot = gg.ggplot(turnstile_rain, gg.aes(x="ENTRIESn_hourly_log10", color="rain2")) + \
       gg.geom_density() + \
       gg.facet_wrap("rain2", scales="fixed") + \
       gg.scale_colour_manual(values=set1) + \
       gg.xlab("log10(entries per hour)") + \
       gg.ylab("Number of turnstiles") + \
       gg.ggtitle("Entries per hour whilst raining and not raining")
plot

# Histogram of 600 seeded draws from a normal distribution
# (mean 180, standard deviation 40).
np.random.seed(42)
data = pd.Series(np.random.normal(loc=180, scale=40, size=600))
data.hist()

# Histogram of the hourly entries.
p = turnstile_weather["ENTRIESn_hourly"].hist()
pylab.suptitle("Entries per hour across all stations")
pylab.xlabel("Entries per hour")
pylab.ylabel("Number of occurrences")

turnstile_weather["grp"]=turnstile_weather["rain"]+turnstile_weather["fog"]
# NOTE(review): ggplot, aes, geom_histogram, xlab, ylab and ggtitle are used
# unqualified on the next line; confirm they are imported (e.g. via
# `from ggplot import ...`) wherever this snippet runs.
plot=ggplot(aes(y='ENTRIESn_hourly',x='Hour'), data=turnstile_weather)+geom_histogram()+xlab("Hour")+ylab("ENTRIESn_hourly")+ggtitle("T")
# print(x) with a single argument behaves identically on Python 2 and 3.
print(plot)
Exemple #54
0
def quarterly_queries(keywords, category, cookies, session, domain, throttle, filing_date, ggplot, month_offset=[-12, 12], trends_url=DEFAULT_TRENDS_URL):
	"""Gets interest data (quarterly) for the 12 months before and 12 months after specified date, then gets interest data for the whole period and merges this data.

	One query is issued per three-month window between filing_date shifted by
	month_offset[0] months and filing_date shifted by month_offset[1] months,
	followed by a single query over the whole period. The per-window changes
	are then applied to the whole-period series on the dates both share.

	Args:
		keywords: sequence of keyword objects; the first one supplies the
			`title`, `keyword` and `desc` attributes used here and
			receives a `querycounts` attribute.
		category: passed through to _query_parameters.
		cookies, session: passed through to _get_response.
		domain: substituted into trends_url.
		throttle: passed to throttle_rate before every per-window query.
		filing_date: reference date the query period is built around.
		ggplot: truthy to also build and print a plot of the series. The
			name is rebound to the ggplot plotting function by the import
			in the plotting branch.
		month_offset: [no. month back, no. months forward] to query. Only
			read, never modified.
		trends_url: URL template with a `{domain}` placeholder.

	Returns:
		A list whose first element is the heading ["Date", keywords[0].title],
		followed by [date string, value] rows (daily data over the period).
	"""

	aw_range = arrow.Arrow.range
	# NOTE(review): replace(months=n) with a plural unit is presumably the
	# relative shift of older arrow releases -- confirm against the pinned
	# arrow version.
	begin_period = aget(filing_date).replace(months=month_offset[0])
	ended_period = aget(filing_date).replace(months=month_offset[1])

	# Set up date ranges to iterate queries across
	start_range = aw_range('month', YYYY_MM(begin_period),
									YYYY_MM(ended_period))
	ended_range = aw_range('month', YYYY_MM(begin_period).replace(months=3),
									YYYY_MM(ended_period).replace(months=3))

	# Keep every third month: one window start and one window end per query.
	start_range = [r.datetime for r in start_range][::3]
	ended_range = [r.datetime for r in ended_range][::3]

	# Fix last date if incomplete quarter (offset -1 week from today)
	last_week = arrow.utcnow().replace(weeks=-1).datetime
	start_range = [d for d in start_range if d < last_week]
	ended_range = [d for d in ended_range if d < last_week]
	if len(ended_range) < len(start_range):
		ended_range += [last_week]

	# Iterate attention queries through each quarter
	all_data = []
	missing_queries = []    # use this to scale IoT later.
	for start, end in zip(start_range, ended_range):
		if start > last_week:
			break

		print("Querying period: {s} ~ {e}".format(s=start.date(),
												  e=end.date()))
		throttle_rate(throttle)

		response_args = {'url': trends_url.format(domain=domain),
						'params': _query_parameters(start, end, keywords, category),
						'cookies': cookies,
						'session': session}

		query_data = _check_data(keywords,
						_process_response(
							_get_response(**response_args)))

		# Label the window: all-zero values -> 'missing' (the rows are replaced
		# by one '0' row per day); date strings longer than 10 characters ->
		# 'weekly'; anything else -> 'daily'.
		if all(int(vals)==0 for date,vals in query_data):
			query_data = [[date, '0'] for date in arrow.Arrow.range('day', start, end)]
			missing_queries.append('missing')
		elif len(query_data[0][0]) > 10:
			missing_queries.append('weekly')
		else:
			missing_queries.append('daily')

		try:
			if not aligned_weekly(query_data, all_data):
				## Workaround: shift filing date
				# q1: last date of the previous window, q2: first date of this one.
				q1 = weekly_date(all_data[-1][-1][0])
				q2 = weekly_date(query_data[0][0])

				if q1 < q2:
					start = arrow.get(start).replace(months=-1)
					response_args['params'] = _query_parameters(start, end, keywords, category)
					## Do a new 4month query, overlap/replace previous month.
					query_data = _check_data(keywords,
									_process_response(
										_get_response(**response_args)))
					if all_data[:-1] != []:
						q2 = weekly_date(query_data[0][0], 'start')
						all_data[-1] = [d for d in all_data[-1] if q2 > weekly_date(d[0])]

				elif q1 >= q2:
					# if q1 > 1st date in query_data, remove the first few entries
					query_data = [d for d in query_data if q1 < weekly_date(d[0])]

		except IndexError:
			# Reached e.g. when all_data is still empty (first window).
			pass
		except:
			# NOTE(review): any other error opens an interactive IPython shell;
			# this looks like a debugging aid left in place.
			from IPython import embed; embed()

		finally:
			# The window's data is recorded whether or not the alignment
			# workaround succeeded.
			all_data.append(query_data)



	# Get overall long-term trend data across entire queried period
	s = begin_period.replace(weeks=-2).datetime
	e1 = arrow.get(ended_range[-1]).replace(months=+1).datetime
	e2 = arrow.utcnow().replace(weeks=-1).datetime
	e = min(e1,e2)
	print("\n=> Merging with overall period: {s} ~ {e}".format(s=s.date(), e=e.date()))

	response_args = {
		'url': trends_url.format(domain=domain),
		'params': _query_parameters(s, e, keywords, category),
		'cookies': cookies,
		'session': session
		}

	query_data = _check_data(keywords,
					_process_response(
						_get_response(**response_args)))



	if len(query_data) > 1:
		# compute changes in IoI (interest over time) per quarter
		# and merged quarters together after interpolating data
		# with daily data.
		# We cannot mix quarters as Google normalizes each query
		all_ioi_delta = []
		qdat_interp = []
		for quarter_data in all_data:
			if quarter_data != []:
				# Rows with an empty value are dropped before computing changes.
				quarter_data = [x for x in quarter_data if x[1] != '']
				all_ioi_delta += list(zip(*change_in_ioi(*zip(*quarter_data))))

				if ggplot:
					qdat_interp += interpolate_ioi(*zip(*quarter_data))[1]
					# for plotting only

		qdate = [date for date, delta_ioi in all_ioi_delta]
		delta_ioi = [delta_ioi for date, delta_ioi in all_ioi_delta]
		# Date strings longer than 10 characters are cut to their last 10.
		ydate = [date[-10:] if len(date) > 10 else date for date, ioi in query_data]
		try:
			yIoI  = [float(ioi) for date, ioi in query_data]
		except:
			# Fallback: retry without the last row.
			# NOTE(review): ydate still includes that row here -- confirm
			# interpolate_ioi copes with the length mismatch.
			# from IPython import embed; embed()
			yIoI = [float(ioi) for date, ioi in query_data[:-1]]
		ydate, yIoI = interpolate_ioi(ydate, yIoI)

		# match quarterly and yearly dates and get correct delta IoI
		# common_date = [x for x in ydate+qdate if x in ydate and x in qdate]
		common_date = sorted(set(ydate) & set(qdate))

		delta_ioi = [delta_ioi for date,delta_ioi in zip(qdate, delta_ioi)
					if date in common_date]
		y_ioi = [y for x,y in zip(ydate, yIoI) if x in common_date]

		# calculate daily %change in IoI and adjust weekly values
		adj_IoI = [ioi*mult for ioi,mult in zip(y_ioi, delta_ioi)]

		adj_all_data = [[str(date.date()), round(ioi, 2)] for date,ioi in zip(common_date, adj_IoI)]
	else:
		# No usable overall series: return the interpolated per-window values
		# as integers (the loop variable name suggests these are all zero).
		adj_all_data = [[str(date.date()), int(zero)] for date, zero in zip(*interpolate_ioi(*zip(*sum(all_data,[]))))]

	# from IPython import embed; embed()
	heading = ["Date", keywords[0].title]
	# Record, per window start date, how that window was labelled.
	querycounts = list(zip((d.date() for d in start_range), missing_queries))
	keywords[0].querycounts = querycounts

	if not ggplot:
		return [heading] + adj_all_data

	## GGplot Only
	else:
		# GGPLOT MERGED GTRENDS PLOTS:
		import pandas as pd
		# From here on the name `ggplot` refers to the plotting function,
		# no longer to the flag passed by the caller.
		from ggplot import ggplot, geom_line, ggtitle, ggsave, scale_colour_manual, ylab, xlab, aes
		try:
			ydat = pd.DataFrame(list(zip(common_date, y_ioi)), columns=["Date", 'Weekly series'])
			mdat = pd.DataFrame(list(zip(common_date, adj_IoI)), columns=['Date', 'Merged series'])
			qdat = pd.DataFrame(list(zip(common_date, qdat_interp)), columns=['Date', 'Daily series'])
			ddat = ydat.merge(mdat, on='Date').merge(qdat,on='Date')
			ddat['Date'] = list(map(pd.to_datetime, ddat['Date']))

			ydat['Date'] = list(map(pd.to_datetime, ydat['Date']))
			mdat['Date'] = list(map(pd.to_datetime, mdat['Date']))
			qdat['Date'] = list(map(pd.to_datetime, qdat['Date']))
		except UnboundLocalError as e:
			# common_date, y_ioi, adj_IoI and qdat_interp only exist when the
			# merge branch above ran.
			raise(UnboundLocalError("No Interest-over-time to plot"))

		# meltkeys = ['Date','Weekly series','Merged series','Daily series']
		# melt = pd.melt(ddat[meltkeys], id_vars='Date')

		colors = [
				'#77bde0', # blue
				'#b47bc6',   # purple
				'#d55f5f'    # red
				]

		entity_type = keywords[0].desc

		g = ggplot(aes(x='Date', y='Daily series' ), data=ddat) + \
			geom_line(aes(x='Date', y='Daily series'), data=qdat, alpha=0.5, color=colors[0]) + \
			geom_line(aes(x='Date', y='Merged series'), data=mdat, alpha=0.9, color=colors[1]) + \
			geom_line(aes(x='Date', y='Weekly series'), data=ydat, alpha=0.5, color=colors[2], size=1.5) + \
			ggtitle("Interest over time for '{}' ({})".format(keywords[0].keyword, entity_type)) + \
			ylab("Interest Over Time") + xlab("Date")

		# from IPython import embed; embed()

		print(g)
		# ggsave(BASEDIR + "/iot_{}.png".format(keywords[0].keyword), width=15, height=5)
		return [heading] + adj_all_data
Exemple #55
0
 def test_ggtitle(self):
     """Adding ggtitle() to a plot must set the plot's title attribute."""
     expected_title = "TEST"
     histogram = gg.ggplot(gg.aes(x='mpg'), gg.mtcars) + gg.geom_histogram()
     titled = histogram + gg.ggtitle(expected_title)
     self.assertEqual(titled.title, expected_title)
    # Testing: one run per m for the hashing method, then one run per alpha
    # for the k-d tree method.
    results = []
    for m in mvals:
        results.append(test_approx_nn(method = "hashing", traindata=docdata, testdata = testdata, m=m, alpha=1))
    for alpha in avals:
        results.append(test_approx_nn(method = "kdtree" , traindata=docdata, testdata = testdata, m=1, alpha=alpha))

    #save results to results folder, with plot and printing to screen.
    metadata = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "test_mode==" + str(test_mode)
    # Pickle data is binary, and the context manager closes the file even if
    # dumping fails.
    with open("results/LSH_vs_KDT_%s.pkl" % metadata, mode = 'wb') as f:
        pkl.dump(obj=results, file=f)

    # One column per quantity, one row per test run.
    logtimes =  [math.log(r.avg_time, 2)     for r in results]
    distances = [r.avg_distance for r in results]
    methods =   [r.method[0:3]  for r in results]
    alpha =     [r.alpha  for r in results]
    m =         [r.m  for r in results]
    results_df = pd.DataFrame(data = {"logtimes" : logtimes,
                                      "distances" : distances,
                                      "methods" : methods,
                                      "m":m,
                                      "alpha": alpha})
    # print(x) with a single argument behaves identically on Python 2 and 3.
    print(results_df)
    # Each run is drawn as the first three letters of its method name.
    p = gg.ggplot(data = results_df, aesthetics = gg.aes(x = "logtimes",
                                                         y = "distances",
                                                         label = "methods")) + \
        gg.geom_text() + \
        gg.ggtitle("LSH and KD trees: tradeoffs") + \
        gg.xlab("Log2 average query time  ") + gg.ylab("Average L2 distance from query point)")
    gg.ggsave(filename="results/LSH_vs_KDT_%s.png" % metadata, plot = p)
Exemple #57
0
def plot_weather_data(turnstile_weather):

    '''
    Plot the average daily subway entries per ISO weekday, split by rain.

    You are passed in a dataframe called turnstile_weather.
    Use turnstile_weather along with ggplot to make a data visualization
    focused on the MTA and weather data we used in assignment #3.

    The input is copied, rows dated 2011-05-30 (Memorial Day) are dropped,
    and the hourly entries of every (weekday, rain) group are divided by the
    number of distinct dates in that group.  Summing a group, which is what
    the weighted bars do, therefore yields the average entries per day for
    that weekday and rain status.

    The dataframe needs the columns DATEn ('YYYY-MM-DD' strings), rain and
    ENTRIESn_hourly.  To see all the columns and data points included in the
    turnstile_weather dataframe, check out:
    https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv

    Returns the ggplot object, which is also printed.
    '''

    df = turnstile_weather.copy()

    # we will remove national holidays from the data. May 30 is Memorial Day,
    # the only national holiday in our data set. Normally this would be done
    # by passing in the data more elegantly, but since this is a bit more
    # constrained, we will simply hard code it into the function.
    national_holidays = ['2011-05-30']
    for holiday in national_holidays:
        df = df[df.DATEn != holiday]

    # add a column to represent the ISO day of the week for each data point.
    df[u'weekday'] = df[u'DATEn'].apply(
        lambda x: datetime.datetime.strptime(x, '%Y-%m-%d').isoweekday())

    # the scaled values are fractional, so make sure the column can hold them.
    df[u'ENTRIESn_hourly'] = df[u'ENTRIESn_hourly'].astype(float)

    ##scale the ENTRIESn_hourly values when we have multiple data days. For
    ##example if we have 2 fridays with rain every value is halved so that
    ##summing the modified values will give us the average number of riders
    ##entering the subways system on a rainy friday.
    for day in df.weekday.unique():
        for rain_status in df.rain.unique():
            group = (df.weekday == day) & (df.rain == rain_status)

            # number of unique dates with the same weekday and rain status
            day_count = df.loc[group, u'DATEn'].nunique()
            if day_count == 0:
                # no data for this combination, nothing to scale
                continue

            # scale each row; overwriting every row with the group average
            # would make the bar show (rows in group) x (average).
            df.loc[group, u'ENTRIESn_hourly'] = \
                df.loc[group, u'ENTRIESn_hourly'] / float(day_count)

    ##now we have a dataframe which is ready to be utilized for making our
    ##plot using the data contained within.

    p = ggplot.ggplot(ggplot.aes(x = u'factor(weekday)',
                                 weight = u'ENTRIESn_hourly',
                                 fill = u'weekday'),
                      data = df) +\
        ggplot.geom_bar() +\
        ggplot.facet_grid(x = u'rain', y = u'weekday') +\
        ggplot.ggtitle('Average Ridership on Sunny & Rainy ISO Weekdays')
    # print(x) with a single argument behaves identically on Python 2 and 3.
    print(p)
    return p
Exemple #58
0
    plot_layer_params_weights(net, layer, tile_tiles)
# Weights of the layers listed in fc_layers; each weight array is passed
# through a transform that swaps its first two axes.
for layer, params in fc_layers:
    # Very slow, produces very large plots, and is rarely insightful...
    plot_layer_params_weights(net, layer, lambda x: x.transpose(1,0))

# Plot layer biases: one row per individual bias value of every layer in
# net.params, drawn as a violin per layer.
plot_gg(gg_layer(
    gg.ggplot(gg.aes(x='layer', y='bias'),
        pd.DataFrame([
            {'bias': bias, 'layer': layer}
            for layer, (weights, biases) in net.params.items()
            for bias in biases.data
        ])
    ),
    gg.geom_violin(),
    gg.ggtitle('layer params biases'),
))

#
# Image to classify
#

# Pick image (the commented-out assignments are alternative inputs)
#img_path  = '%(caffe_root)s/examples/images/cat.jpg' % locals()
#img_path  = 'data/img/cat-pizza.jpg'
#img_path = 'data/MLSP 2013/mlsp_contest_dataset/supplemental_data/spectrograms/PC1_20090705_070000_0040.bmp'
img_path = 'data/MLSP 2013/mlsp_contest_dataset/supplemental_data/spectrograms/PC1_20100705_050001_0040.bmp'
#img_path = ... # TODO

# Load image
# Descriptor for the image: 'img-' followed by the file name without its
# extension.
img_desc                    = 'img-%s' % os.path.basename(os.path.splitext(img_path)[0])
Exemple #59
0
    def _post_density_plot(self, func=None, x_name='', plot_title='', include_doses=None, boot_samps=1000):
        """Plot, per dose, the density of a quantity resampled from the posterior samples.

        For every included dose, ``func`` is evaluated on the posterior
        samples and ``boot_samps`` values are drawn from the result with
        replacement, using the normalised sample probabilities as weights.

        :param func: callable ``func(x, samp)`` mapping a scaled dose and the
            sample matrix to one value per sample.  Defaults to the utility
            ``self.metric(eff_probs, tox_probs)``.
        :param x_name: column name / x-axis variable of the plotted quantity.
        :param plot_title: chart title.
        :param include_doses: 1-based dose indices to plot; all doses when
            omitted.
        :param boot_samps: number of values drawn per dose.
        :return: ggplot density plot, filled by dose.
        """

        from ggplot import aes, ggplot, geom_density, ggtitle
        import pandas as pd

        if include_doses is None:
            include_doses = range(1, self.num_doses + 1)

        def my_func(x, samp):
            # Columns 0-1 parameterise toxicity, columns 2-4 efficacy.
            tox_probs = _pi_T(x, mu=samp[:, 0], beta=samp[:, 1])
            eff_probs = _pi_E(x, mu=samp[:, 2], beta1=samp[:, 3], beta2=samp[:, 4])
            u = self.metric(eff_probs, tox_probs)
            return u
        if func is None:
            func = my_func

        x_boot = []
        dose_indices = []
        samp = self.pds._samp
        # Normalise into a new array; an in-place division would modify the
        # probabilities stored on self.pds.
        probs = self.pds._probs
        probs = probs / probs.sum()
        for dose_index, dose in enumerate(self.scaled_doses(), start=1):
            if dose_index in include_doses:
                values = func(dose, samp)
                x_boot.extend(np.random.choice(values, size=boot_samps, replace=True, p=probs))
                dose_indices.extend(np.repeat(dose_index, boot_samps))
        df = pd.DataFrame({x_name: x_boot, 'Dose': dose_indices})
        return ggplot(aes(x=x_name, fill='Dose'), data=df) + geom_density(alpha=0.6) + ggtitle(plot_title)