Esempio n. 1
0
def displacement_plot(centered, limits=None, style=None):
    u"""Draw per-object displacement tracks as a ggplot path plot.

    params:
        centered (pd.DataFrame): needs cX, cY, Object, Frame columns, probably
            produced by calling center(). The frame passed in is left
            untouched; a converted, sorted copy is what gets plotted.
        limits (real): Sets the limits of the scales to a square window showing
            ±limits on each axis. Falsy values (None, 0) leave the scales alone.
        style (Iterable): Collection of strings. The only value acted on is
            'no-terminal-dot', which suppresses the black dot drawn at the
            last point of tracks that end before the final frame.
            NOTE: theme_bw is applied unconditionally, so 'theme-bw' currently
            has no effect.

    Returns:
        g (gg.ggplot): Plot object
    """
    style = () if style is None else style
    # assign() returns a new frame, so the caller's 'Object' column keeps its
    # dtype. sort_values() replaces DataFrame.sort(), which current pandas no
    # longer provides.
    centered = centered.assign(Object=centered['Object'].map(str))
    centered = centered.sort_values(['Frame', 'Object'])
    g = (gg.ggplot(centered, gg.aes(x='cX', y='cY', color='Object')) +
         gg.geom_path(size=0.3))
    g += gg.theme_bw()
    if limits:
        g = g + gg.ylim(-limits, limits) + gg.xlim(-limits, limits)
    if 'no-terminal-dot' not in style:
        max_frame = centered['Frame'].max()
        # Last frame seen for each object; keep only objects that stop early.
        endframe = centered.groupby('Object')['Frame'].max()
        endframe = endframe[endframe != max_frame].reset_index()
        # Merge back to recover the cX/cY coordinates of each terminal point.
        endframe = endframe.merge(centered, on=['Object', 'Frame'])
        # An empty layer must not be added:
        # https://github.com/yhat/ggplot/issues/425
        if not endframe.empty:
            g += gg.geom_point(data=endframe, color='black', size=1)
    return g
Esempio n. 2
0
def scatter(x, y, filename=""):
    """Scatter-plot ``y`` against ``x`` and either print or save the figure.

    Args:
        x: values for the horizontal axis; wrapped in a ``pd.Series``.
        y: values for the vertical axis; wrapped in a ``pd.Series``.
        filename: base name of the output image. When it is the empty string
            the plot is printed; otherwise it is written to
            ``graphs/scatter/<filename>.png``.
    """
    df = pd.DataFrame({'x': pd.Series(x), 'y': pd.Series(y)})
    p = gg.ggplot(gg.aes(x='x', y='y'), data=df) + gg.geom_point()
    if filename == "":
        # Function-call form parses on Python 2 and Python 3 alike; the
        # original `print p` statement is a SyntaxError on Python 3.
        print(p)
    else:
        gg.ggsave(filename="graphs/scatter/" + filename + ".png", plot=p)
Esempio n. 3
0
def plot_line(X, y, title=None, labelx=None, labely=None, save=False, colors=None):
    '''
    Build a line plot of every column of ``y`` against ``X``.

    X      - horizontal-axis data; the code reads ``X.columns``, so a
             DataFrame-like object is expected.
    y      - object exposing ``.shape`` and ``.iloc``; each column becomes one
             series, stored under the column's position as a string.
    title  - plot title; also the source of the output file name when saving
             (spaces removed, dots replaced by dashes, ``.pdf`` appended).
    labelx, labely - axis labels.
    save   - when truthy the plot is saved and nothing is returned; otherwise
             the plot object is returned.
    colors - colour names indexed per series; defaults to matplotlib's
             BASE_COLORS plus CSS4_COLORS names.

    Raises ValueError when ``save`` is requested without a ``title``, because
    there is then nothing to derive the file name from (previously this
    surfaced as a NameError after the whole plot had been built).

    Note: the last row of the assembled data is dropped before plotting.
    '''
    if save and title is None:
        raise ValueError("plot_line(save=True) requires a title to name the output file")

    df = pandas.DataFrame()
    df['X'] = X
    for i in range(y.shape[1]):
        df[str(i)] = y.iloc[:, i].values

    if colors is None:
        colors = list(dict(mcolors.BASE_COLORS, **mcolors.CSS4_COLORS).keys())

    df = df.iloc[0:df.shape[0] - 1, :]
    p = ggplot(df, aes(x='X'))

    for i in range(y.shape[1]):
        # NOTE(review): when `colors` names a column of X the series is drawn
        # as points coloured by that column; otherwise as a line with the i-th
        # colour. Confirm this is the intended switch.
        if colors not in X.columns.values:
            p = p + geom_line(aes(y=str(i), color=colors[i]))
        else:
            p = p + geom_point(aes(y=str(i), color=colors))

    p = p + xlab(labelx) + ylab(labely) + ggtitle(title)

    if save:
        img_title = title.replace(" ", "").replace(".", "-") + ".pdf"
        p.save(img_title)
    else:
        return p
Esempio n. 4
0
def t_sne_visualize(latent_vectors, labels, epoch, nsne=1000):
    """Project latent vectors to 2-D with t-SNE and save a labelled scatter plot.

    Args:
        latent_vectors: object whose ``.data.numpy()`` yields a 2-D array
            (rows are samples); values are divided by 255 before use.
        labels: one label per row; converted to strings for colouring.
        epoch: used only to build the output file name.
        nsne: maximum number of randomly chosen rows fed to t-SNE
            (defaults to the previously hard-coded 1000).

    The image is written to
    ``<args.dataset>tsne-vae/2d-vec-miss<args.remove_label>/tsne<epoch>.png``;
    ``args`` is read from the enclosing module.
    """
    print(latent_vectors.shape)
    X_sample = latent_vectors.data.numpy() / 255
    feat_cols = ['pixel' + str(i) for i in range(X_sample.shape[1])]
    df = pd.DataFrame(X_sample, columns=feat_cols)
    df['label'] = labels
    df['label'] = df['label'].apply(str)

    # A plain permutation of the row labels. The original concatenated it with
    # an empty range(n, n), which added nothing but turned the indices into
    # floats.
    sample_rows = np.random.permutation(df.shape[0])[:nsne]

    tsne = TSNE(n_components=2, verbose=1, perplexity=30)
    print('INITIALIZED')
    tsne_results = tsne.fit_transform(df.loc[sample_rows, feat_cols].values)
    print('AFTER FITTING')
    df_tsne = df.loc[sample_rows, :].copy()
    df_tsne['x-tsne'] = tsne_results[:, 0]
    df_tsne['y-tsne'] = tsne_results[:, 1]

    chart = ggplot(df_tsne, aes(x='x-tsne', y='y-tsne', color='label')) \
        + geom_point(size=70, alpha=0.7) \
        + ggtitle("tSNE dimensions colored by digit")
    chart.save(
        str(args.dataset) + "tsne-vae/2d-vec-miss" + str(args.remove_label) +
        "/tsne" + str(epoch) + ".png")
def plot_update_frequency(result):
    """Plot the weekly number of changes from an aggregation result.

    Args:
        result: iterable of records, each providing
            ``res['_id']['timestamp']`` and ``res['count']``. It is consumed
            in a single pass.

    Returns:
        A ggplot object: weekly sums of ``count`` as blue points with a
        smoother, on a date axis with breaks every 12 months.
    """
    import pandas as pd
    import ggplot

    # Turn the query results into a time series of changes.
    stamps = []
    counts = []
    for res in result:
        stamps.append(pd.Timestamp(res['_id']['timestamp']))
        counts.append(res['count'])

    # An explicit DatetimeIndex keeps resample() usable even for an empty
    # result. `.resample('W').sum()` is the current spelling of the removed
    # `resample('W', how='sum')`.
    ts = pd.DataFrame(counts, index=pd.DatetimeIndex(stamps), columns=['changes'])
    ts = ts.resample('W').sum()
    ts.index.names = ['date']

    # Plot the time series of changes.
    p = ggplot.ggplot(ts, ggplot.aes(x=ts.index, y=ts['changes'])) +\
        ggplot.geom_point(color='blue') +\
        ggplot.xlab('Period') +\
        ggplot.ylab('Changes') +\
        ggplot.geom_smooth() +\
        ggplot.ylim(low=0) +\
        ggplot.scale_x_date(breaks=ggplot.date_breaks("12 months"), labels=ggplot.date_format('%Y-%m')) +\
        ggplot.ggtitle('OpenStreetMaps Denver-Boulder\nChanges per Week')
    return p
Esempio n. 6
0
def t_sne_visualize(generated, n_sne, epoch):
    """Run t-SNE on generated images and save the 2-D embedding as a PNG.

    Args:
        generated: object whose ``.data.numpy()`` yields the generated images;
            each row is flattened to 28*28*3 values after dividing by 255.
        n_sne: accepted for interface compatibility only. Labels are
            synthesised for exactly ``n_label * n_label`` (= 49) rows, and the
            sample size is pinned to that value, as in the original code.
            NOTE(review): confirm whether callers expect this to be honoured.
        epoch: used to build the output file name ``tsne<epoch>.png``.
    """
    n_label = 7
    X_sample = generated.data.numpy() / 255
    # Labels 0..6 repeated 7 times, one per generated row.
    y_sample = list(range(n_label)) * n_label
    X_sample = X_sample.reshape(X_sample.shape[0], 28 * 28 * 3)

    feat_cols = ['pixel' + str(i) for i in range(X_sample.shape[1])]
    df = pd.DataFrame(X_sample, columns=feat_cols)
    df['label'] = y_sample
    df['label'] = df['label'].apply(str)

    n_sne = n_label * n_label
    # A plain permutation of the row labels. The original concatenated it with
    # an empty range(n, n), which only promoted the indices to float.
    sample_rows = np.random.permutation(df.shape[0])[:n_sne]

    tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
    print('INITIALIZED')
    tsne_results = tsne.fit_transform(df.loc[sample_rows, feat_cols].values)
    print('AFTER FITTING')
    df_tsne = df.loc[sample_rows, :].copy()
    df_tsne['x-tsne'] = tsne_results[:, 0]
    df_tsne['y-tsne'] = tsne_results[:, 1]

    chart = ggplot(df_tsne, aes(x='x-tsne', y='y-tsne', color='label')) \
        + geom_point(size=70, alpha=0.7) \
        + ggtitle("tSNE dimensions colored by digit")
    chart.save("tsne" + str(epoch) + ".png")
Esempio n. 7
0
def scatter(x, y, filename=""):
    """Draw a point plot of ``y`` versus ``x``.

    Args:
        x: horizontal-axis values (converted with ``pd.Series``).
        y: vertical-axis values (converted with ``pd.Series``).
        filename: when empty (the default) the plot is printed; otherwise the
            plot is saved as ``graphs/scatter/<filename>.png``.
    """
    df = pd.DataFrame({'x': pd.Series(x), 'y': pd.Series(y)})
    p = gg.ggplot(gg.aes(x='x', y='y'), data=df) + gg.geom_point()
    if filename == "":
        # `print p` only parses on Python 2; the call form works on both.
        print(p)
    else:
        gg.ggsave(filename="graphs/scatter/" + filename + ".png", plot=p)
Esempio n. 8
0
def plotSetOfArrays(arrays, names, fileName):
    """Plot several column arrays against a shared 0..1 axis, twice.

    A static ggplot image is written to ``./IMG/<fileName>`` and an
    interactive bokeh chart to ``iou_scores.html``.

    Args:
        arrays: sequence of 2-D arrays with the same number of rows; they are
            concatenated column-wise, so ``names`` must supply one name per
            resulting column.
        names: list of column names (a ``list``, since it is concatenated to
            ``['noise']``).
        fileName: file name of the ggplot image inside ``./IMG/``.

    The bokeh chart draws the first column of up to five arrays, one per
    palette colour. Fewer than five arrays no longer raises IndexError.
    """
    n_rows = arrays[0].shape[0]
    IDS = np.linspace(0, 1, n_rows)
    # One concatenation instead of re-allocating the matrix per array.
    A = np.concatenate([IDS.reshape(n_rows, 1)] + list(arrays), axis=1)
    Data = pd.DataFrame(A, columns=['noise'] + names)
    Melted = pd.melt(Data, id_vars=['noise'])

    pv = ggplot.ggplot(ggplot.aes(x='noise', y='value', colour='variable'),
                       data=Melted) + ggplot.geom_line() + ggplot.geom_point()
    ggplot.ggsave(pv, './IMG/' + fileName)

    output_file("iou_scores.html", title="correlation.py example")

    figure(tools="pan,wheel_zoom,box_zoom,reset,previewsave")
    hold()
    palette = ['#A6CEE3', '#1F78B4', '#B2DF8A', '#33A02C', '#fb9a99']
    for array, name, color in zip(arrays, names, palette):
        line(IDS, array[:, 0], color=color, legend=name)

    curplot().title = "Minimum IOU"
    grid().grid_line_alpha = 0.3
    show()
def plot_cost_history(alpha, cost_history):
   """Return a point-and-line plot of the cost at every iteration.

   alpha is only used in the title (formatted with three decimals);
   cost_history supplies the y values, its positions supply the x values.
   """
   iterations = range(len(cost_history))
   cost_df = pandas.DataFrame({'Cost_History': cost_history, 'Iteration': iterations})
   chart = gp.ggplot(cost_df, gp.aes('Iteration', 'Cost_History'))
   chart = chart + gp.geom_point()
   chart = chart + gp.geom_line()
   return chart + gp.ggtitle('Cost History for alpha = %.3f' % alpha)
Esempio n. 10
0
def signature_data_plot(sd):
    """Return a scatter plot of ``not_exp`` against ``set_exp`` coloured by ``pearson_r``.

    ``sd`` is passed to ggplot as the data source. Colours run from yellow
    (low) to red (high); both axes get a log scale followed by a continuous
    scale with fixed limits.
    """
    import ggplot as gg

    mapping = gg.aes(x='set_exp', y='not_exp', color='pearson_r')
    chart = gg.ggplot(mapping, data=sd)
    chart = chart + gg.geom_point(size=15)
    chart = chart + gg.scale_color_gradient(low='yellow', high='red')
    chart = chart + gg.scale_x_log()
    chart = chart + gg.scale_x_continuous(limits=(0.5, 10000))
    chart = chart + gg.scale_y_log()
    return chart + gg.scale_y_continuous(limits=(0.05, 10000))
Esempio n. 11
0
def signature_data_plot(sd):
    """Build the signature scatter plot for the data set ``sd``.

    x is ``set_exp``, y is ``not_exp`` and point colour follows ``pearson_r``
    on a yellow-to-red gradient. Log and limited continuous scales are added
    for both axes, in that order.
    """
    import ggplot as gg

    layers = (
        gg.ggplot(gg.aes(x='set_exp', y='not_exp', color='pearson_r'), data=sd)
        + gg.geom_point(size=15)
        + gg.scale_color_gradient(low='yellow', high='red')
        + gg.scale_x_log()
        + gg.scale_x_continuous(limits=(0.5, 10000))
        + gg.scale_y_log()
        + gg.scale_y_continuous(limits=(0.05, 10000))
    )
    return layers
Esempio n. 12
0
def lineplot(hr_year_csv):
    """Plot home runs per year as red points connected by a red line.

    Args:
        hr_year_csv: path (or buffer) of a CSV with ``yearID`` and ``HR``
            columns, read with ``pandas.read_csv``.

    Returns:
        The ggplot object.
    """
    df = pandas.read_csv(hr_year_csv)
    gg = (
        gp.ggplot(df, gp.aes(x="yearID", y="HR"))
        + gp.geom_point(color="red")
        + gp.geom_line(color="red")
        + gp.ggtitle("Homeruns by Year")
        # x carries yearID and y carries HR; the labels were swapped before.
        + gp.xlab("Year")
        + gp.ylab("Homeruns")
    )
    return gg
Esempio n. 13
0
 def plot(self):
     """Print a scatter plot of the data points together with the cluster centers.

     Reads ``self.data`` (copied, never modified), ``self.cluster_centers``,
     ``self.num_clusters`` and ``self.class_label``. Each center is appended
     as an extra row labelled "center". Points are drawn at (x1, x2) and
     coloured by label.
     """
     prob231g_plot_df = self.data.copy()
     for k in range(self.num_clusters):
         # Appends at label == current row count.
         # NOTE(review): presumes a default 0..n-1 index - confirm.
         prob231g_plot_df.loc[prob231g_plot_df.shape[0]] = self.cluster_centers[k]
     prob231g_plot_df["class_label"] = list(self.class_label) + \
                                       self.num_clusters * ["center"]
     p = gg.ggplot(prob231g_plot_df, gg.aes(x="x1", y="x2", colour="class_label")) + \
         gg.geom_point() + gg.ggtitle("EM cluster assignments")
     # Call form works on Python 2 and 3; `print p` is Python-2-only syntax.
     print(p)
     return
Esempio n. 14
0
def lineplot_compare(filename):
    """Plot yearly home runs per team, one coloured series per ``teamID``.

    Args:
        filename: CSV with ``yearID``, ``HR`` and ``teamID`` columns.

    Returns:
        The ggplot object (points plus connecting lines).
    """
    df = pd.read_csv(filename)
    gg = (
        gp.ggplot(df, gp.aes(x="yearID", y="HR", color="teamID"))
        + gp.geom_point()
        + gp.geom_line()
        + gp.ggtitle("Homeruns by Year by Team")
        # The x axis shows yearID and the y axis HR; the labels were reversed.
        + gp.xlab("Year")
        + gp.ylab("Homeruns")
    )
    return gg
Esempio n. 15
0
def visualize_segmentation(X, var):
    '''
    Return a ggplot scatter of column `var` over the index of X, coloured by
    the 'segmento' column (cast to string). The x tick labels are given a
    fully transparent colour.
    '''
    frame = pandas.DataFrame(index=X.index)
    frame['fecha'] = X.index.values
    frame[var] = X[var]
    frame['Segmento'] = X['segmento'].astype(str)

    chart = ggplot(aes(x="fecha", y=var, color="Segmento"), frame)
    chart = chart + geom_point()
    chart = chart + xlab("Fecha")
    chart = chart + ylab(var)
    chart = chart + ggtitle('Segmentacion de la variable "' + var + '"')
    return chart + theme(axis_text_x=element_text(color=[0, 0, 0, 0]))
def plot_weather_data(df):
    """Plot summed hourly exits per day, split by the ``rain`` flag.

    Args:
        df: frame with ``DATEn``, ``rain`` and ``EXITSn_hourly`` columns. It
            is not modified; the date conversion is done on a derived frame.

    Returns:
        A ggplot object with one line-and-point series per ``rain`` value.
    """
    # assign() returns a new frame, so the caller's DATEn column is untouched.
    df = df.assign(DATEn=pd.to_datetime(df.DATEn))
    # as_index=False keeps DATEn and rain as ordinary columns. The original
    # then set index.name to a list, which is unhashable and served no purpose
    # here, so that assignment is gone.
    grouped = df.groupby(['DATEn', 'rain'], as_index=False).sum()
    plot = gp.ggplot(grouped, gp.aes(x='DATEn', y='EXITSn_hourly', color='rain'))
    plot += gp.geom_line()
    plot += gp.geom_point()
    plot += gp.ggtitle('Subway Ridership by Day')
    plot += gp.xlab('Date')
    plot += gp.ylab('Exits')
    return plot
def scatter_vis(costs, tss, path, f):
    """Save a scatter of ``cost`` against ``$N$`` with mean/max/min guide lines.

    costs -- frame with "$N$" and "cost" columns
    tss   -- not used by this function
    path  -- prefix of the output file; "<function name>.pdf" is appended
    f     -- callable whose ``__name__`` becomes the plot title
    """
    plt.figure()
    cost_column = costs.cost
    chart = (
        ggplot(costs, aes(x="$N$", y="cost"))
        + geom_point()
        + geom_hline(y=cost_column.mean(), color="grey")
        + geom_hline(y=cost_column.max(), color="red")
        + geom_hline(y=cost_column.min(), color="green")
        + ggtitle(f.__name__)
    )

    target = path + scatter_vis.__name__ + ".pdf"
    chart.save(target)
Esempio n. 18
0
    def scatter(self, dataframe, x=None, y=None, width=None, height=None, color=None, title='Scatter', xaxis_label=None,
                yaxis_label=None):
        """Return a ggplot scatter of ``dataframe[y]`` against ``dataframe[x]``.

        ``color`` falls back to the 'palette' entry of
        ``self.__default_options__`` when not given. ``width`` is resolved
        from the defaults the same way, but neither ``width`` nor ``height``
        is applied to the plot here. Axis names are set only for labels that
        are truthy.
        """
        color = self.__default_options__.get('palette', None) if color is None else color
        width = self.__default_options__.get('width', None) if width is None else width

        gg = ggplot(dataframe, aes(x, y)) + geom_point(color=color, alpha=0.6) + ggtitle(title)
        if xaxis_label:
            gg += scale_x_continuous(name=xaxis_label)
        if yaxis_label:
            # Previously named the y scale with xaxis_label (copy/paste slip).
            gg += scale_y_continuous(name=yaxis_label)

        return gg
Esempio n. 19
0
def prob231cd_recover(initialization):
    """Reload pickled k-means results, re-save the plot and summarise objectives.

    Args:
        initialization: suffix of the result files; the pickle is read from
            ``results/prob231cd<initialization>.pkl`` and the plot written to
            the matching ``.png``.

    Returns:
        dict with the "mean", "sd" and "min" of the ``obj`` attribute over the
        first ``num_trials`` stored runs.
    """
    filename = "results/prob231cd" + initialization
    # The context manager closes the file even if unpickling fails.
    # NOTE: pickle must only be used on trusted files.
    with open(filename + ".pkl", "rb") as pkl_file:
        tuple_in = pkl.load(pkl_file)
    prob231c_plot_df = tuple_in[0]
    kmcalls = tuple_in[1]
    num_trials = tuple_in[2]
    p = gg.ggplot(prob231c_plot_df, gg.aes(x="x1", y="x2", colour="data")) + \
        gg.geom_point() + gg.ggtitle(initialization + " initialization")
    gg.ggsave(filename + ".png", plot=p)
    obj = [kmcalls[i].obj for i in range(num_trials)]
    obj_stats = {"mean": np.mean(obj), "sd": np.std(obj), "min": np.min(obj)}
    return obj_stats
Esempio n. 20
0
def plot_timeline(scenes):
    """Print a character-versus-scene timeline.

    The scene number is placed on the y axis so that the x ticks can be
    labelled with character names (one tick per category of the categorical
    ``character`` column, positioned by ``character_code``).
    """
    categories = scenes['character'].cat.categories
    character_axis = gg.scale_x_continuous(
        labels=categories.values.tolist(),
        breaks=range(len(categories)))
    tick_theme = gg.theme(axis_text_x=gg.element_text(angle=30, hjust=1, size=10))

    chart = gg.ggplot(gg.aes(y='scene', x='character_code'), data=scenes)
    chart = chart + gg.geom_point()
    chart = chart + gg.labs(x='Character', y='Scene')
    chart = chart + character_axis
    chart = chart + tick_theme
    print(chart)
Esempio n. 21
0
def prob231cd_recover(initialization):
    """Load a saved k-means experiment, redraw its plot and return objective stats.

    Args:
        initialization: name appended to ``results/prob231cd`` to locate the
            ``.pkl`` input and the ``.png`` output; also used in the title.

    Returns:
        dict mapping "mean", "sd" and "min" to the statistics of ``.obj``
        across the first ``num_trials`` stored runs.
    """
    filename = "results/prob231cd" + initialization
    # Explicitly scoped handle: the original left the file open.
    # NOTE: only unpickle files you produced yourself.
    with open(filename + ".pkl", "rb") as handle:
        tuple_in = pkl.load(handle)
    prob231c_plot_df = tuple_in[0]
    kmcalls = tuple_in[1]
    num_trials = tuple_in[2]
    p = gg.ggplot(prob231c_plot_df, gg.aes(x="x1", y="x2", colour="data")) + \
        gg.geom_point() + gg.ggtitle(initialization + " initialization")
    gg.ggsave(filename + ".png", plot=p)
    obj = [kmcalls[i].obj for i in range(num_trials)]
    obj_stats = {"mean": np.mean(obj), "sd": np.std(obj), "min": np.min(obj)}
    return obj_stats
Esempio n. 22
0
def visualize_clusters(X, var, color = 'cluster'):
    '''
    Return a ggplot scatter of column `var` over the index of X, coloured by
    the column named in `color`. The x tick labels are given a fully
    transparent colour.
    '''
    frame = pandas.DataFrame()

    # Order matters: 'fecha' is filled first, then the index is replaced so
    # that the following column assignments align on X's index.
    frame['fecha'] = X.index
    frame.index = X.index
    frame[var] = X[var]
    frame['Cluster'] = X[color]

    chart = ggplot(aes(x='fecha', y=var, color='Cluster'), frame)
    chart = chart + geom_point()
    # NOTE(review): the x axis carries 'fecha' but is labelled with `var` -
    # confirm this is intended.
    chart = chart + xlab(var)
    chart = chart + ylab("Valor")
    chart = chart + ggtitle('Clustering de la variable "' + var + '"')
    return chart + theme(axis_text_x=element_text(color=[0, 0, 0, 0]))
Esempio n. 23
0
def plot(mydata, opts):
    """Save a scatter plot of two columns, titled with the detection score.

    Args:
        mydata: frame with ``ntests`` and ``exactly`` columns plus the columns
            named by ``opts['x']`` and ``opts['y']``.
        opts: mapping with 'title', 'x', 'y' and 'file' entries; the plot is
            saved to ``opts['file']``.

    The title is extended with ``ND=<not detected>/<total>`` and the
    percentage of detected mutants. With no mutants at all the percentage is
    reported as ``nan`` instead of dividing by zero.
    """
    # Mutants killed by exactly 0 tests versus all the others.
    nd = sum(mydata[mydata.ntests == 0].exactly)
    d = sum(mydata[mydata.ntests != 0].exactly)
    total = nd + d
    print("Not detected = ", nd, "/", total)
    # float() forces true division on Python 2 as well.
    score = (1 - float(nd) / total) * 100.0 if total else float('nan')
    title = opts['title'] + (' ND=%d/%d (Mu: %3.1f%%)' % (nd, total, score))
    p = gg.ggplot(gg.aes(x=opts['x'], y=opts['y']), data=mydata) + gg.geom_point() +\
        gg.xlab(opts['x']) + gg.ylab(opts['y']) + gg.ggtitle(title)

    p.save(opts['file'])
Esempio n. 24
0
def prob231b(initialization = "regular"):
    """Run k-means for several cluster counts and save one plot per run.

    For k in 2, 3, 5, 10, 15, 20 a KmeansCall is built on the module-level
    ``features_only`` and run; its data, coloured by ``class_label``, is saved
    to ``results/k=<k>_<datestring>.png``.
    """
    kmcalls = []
    for num_clusters in [2, 3, 5, 10, 15, 20]:
        call = KmeansCall(features_only, num_clusters, initialization)
        kmcalls.append(call)
        call.run_kmeans(verbose=False)

        df_to_plot = call.data.copy()
        df_to_plot["class_label"] = list(call.class_label)
        mapping = gg.aes(x="x1", y="x2", colour="class_label")
        p = gg.ggplot(df_to_plot, mapping) + gg.geom_point()
        p = p + gg.ggtitle("Synth. data, k=" + str(num_clusters))
        metadata = "k=" + str(num_clusters) + "_" + datestring
        gg.ggsave(filename="results/" + metadata + ".png", plot=p)
Esempio n. 25
0
def plot_age_speed(df):
    """Print a scatter plot of the speed column against the age column.

    The column names come from ``s.AGE_COL_NAME`` and ``s.SPEED_COL_NAME``;
    the title is produced by ``_make_title`` from the fixed text and the row
    count.

    Returns:
        ``df`` unchanged, so calls can be chained.
    """
    num_rows = df.shape[0]
    title = 'age v speed'

    # print(...) with one argument parses on Python 2 and 3; the bare print
    # statement used before is a SyntaxError on Python 3.
    print(ggplot(df, aes(s.AGE_COL_NAME, s.SPEED_COL_NAME)) +
          ggtitle(_make_title(title, num_rows)) +
          geom_point(colour='steelblue') +
          scale_x_continuous())

    return df
Esempio n. 26
0
def prob231b(initialization="regular"):
    """Cluster the synthetic data for a range of k and save a plot per k.

    Uses the module-level ``features_only`` and ``datestring``. Each run is
    plotted at (x1, x2), coloured by the assigned ``class_label``, and written
    to ``results/k=<k>_<datestring>.png``.
    """
    cluster_counts = (2, 3, 5, 10, 15, 20)
    kmcalls = [None] * len(cluster_counts)
    for slot, num_clusters in enumerate(cluster_counts):
        run = KmeansCall(features_only, num_clusters, initialization)
        kmcalls[slot] = run
        run.run_kmeans(verbose=False)

        df_to_plot = run.data.copy()
        df_to_plot["class_label"] = [label for label in run.class_label]
        title = gg.ggtitle("Synth. data, k=" + str(num_clusters))
        p = gg.ggplot(df_to_plot, gg.aes(x="x1", y="x2", colour="class_label")) + \
            gg.geom_point() + title
        out_name = "results/" + ("k=" + str(num_clusters) + "_" + datestring) + ".png"
        gg.ggsave(filename=out_name, plot=p)
Esempio n. 27
0
def models_llhd(pm_llhd):
    """
    Print a plot tracking the total likelihood of reads in a model (cluster).

    :param pm_llhd: data passed to ggplot; must expose "iteration num" and
        "log value" columns.

    x axis: iteration number
    y axis: summed log-likelihood value
    """
    mapping = gp.aes(x="iteration num", y="log value")
    chart = gp.ggplot(mapping, data=pm_llhd)
    chart = chart + gp.geom_point(color="blue")
    chart = chart + gp.ggtitle(u"model likelihood")
    print(chart)
Esempio n. 28
0
def plot_distance_trip_time(df):
    """Print a scatter of distance travelled against trip duration with a smoother.

    The column names come from ``s.TRIP_DURATION_COL`` and
    ``s.DISTANCE_TRAVELED_COL_NAME``; the title is produced by ``_make_title``
    from the fixed text and the row count.

    Returns:
        ``df`` unchanged.
    """
    num_rows = df.shape[0]
    title = 'trip duration v distance travelled'

    # Call form of print: valid on Python 2 and Python 3.
    print(ggplot(df, aes(s.TRIP_DURATION_COL, s.DISTANCE_TRAVELED_COL_NAME)) +
          ggtitle(_make_title(title, num_rows)) +
          stat_smooth(colour="red") +
          geom_point(colour='steelblue') +
          scale_x_continuous())

    return df
Esempio n. 29
0
def lineplot_compare(filename):  # Cleaner version with string vars
    """Compare yearly home runs between teams.

    Args:
        filename: CSV with ``yearID``, ``HR`` and ``teamID`` columns.

    Returns:
        A ggplot object with one coloured point-and-line series per team.
    """
    df = pd.read_csv(filename)
    p_title = "Homeruns by Year by Team"
    # yearID is mapped to x and HR to y, so the labels follow that order
    # (they were reversed before).
    p_xlab = "Year"
    p_ylab = "Homeruns"
    gg = (
        gp.ggplot(df, gp.aes(x="yearID", y="HR", color="teamID"))
        + gp.geom_point()
        + gp.geom_line()
        + gp.ggtitle(p_title)
        + gp.xlab(p_xlab)
        + gp.ylab(p_ylab)
    )
    return gg
Esempio n. 30
0
    def point_chart(self, conn, column1, column2, table_chosen, title):
        """Fetch two columns from a table and print a point chart of them.

        The data comes from ``dfile.double_selector``. After building the
        chart the current time and its difference to the module-level ``a``
        are printed (presumably a start timestamp - defined outside this
        method), followed by the chart itself.
        """
        data_df = dfile.double_selector(
            conn=conn, table=table_chosen, col1=column1, col2=column2)

        mapping = aes(x=column1, y=column2)
        point_plot = ggplot(mapping, data=data_df) + geom_point()
        point_plot = point_plot + theme_gray()
        point_plot = point_plot + labs(title=title)

        finished = datetime.datetime.now()
        print(finished)
        print(finished - a)
        print(point_plot)
Esempio n. 31
0
def googletrend_command(delta_t, threshold=0.0, inverse=False):
    """Run the google trend classifier over a parameter sweep and save its ROC plot.

    :param delta_t:   exclusive upper bound of the delta_t sweep; values
                      1 .. int(delta_t) - 1 are evaluated
    :param threshold: upper bound of the threshold sweep, generated with
                      np.arange(0, threshold + 0.1, 0.1)
    :param inverse:   whether to inverse the classifier

    The plot is written to ./plots/googletrend.jpg, or
    ./plots/googletrend_inverse.jpg when ``inverse`` is set.
    """
    # File path and title depend on `inverse`.
    filename = "googletrend"
    titlename = "ROC of google trend classifier"
    if inverse:
        filename += "_inverse"
        titlename += " (inverse version)"
    filepath = "./plots/%s.jpg" % filename
    # Generate the data first.
    data = googletrend.preprocess()
    # Classifier evaluation metrics, one entry per (threshold, delta_t) pair.
    output = {'tpr': [], 'fpr': [], 'plot': []}
    for thre in np.arange(0, threshold + 0.1, 0.1):
        # print(...) and range() work on both Python 2 and 3, unlike the
        # print statement and xrange used before.
        print("==> threshold: %f, inverse: %s" % (thre, inverse))
        for i in range(1, int(delta_t)):
            googletrend.algorithm(data, i, thre, inverse)
            tp_rate, fp_rate = googletrend.evaluate(data)
            output['tpr'].append(tp_rate)
            output['fpr'].append(fp_rate)
            output['plot'].append('thre_' + str(thre))
    # Add a y = x baseline to the ROC graph for comparison.
    output['tpr'].extend([0.0, 1.0])
    output['fpr'].extend([0.0, 1.0])
    output['plot'].extend(['baseline', 'baseline'])
    df = pd.DataFrame(output)
    graph = gg.ggplot(df, gg.aes('fpr', 'tpr', color='plot')) + \
        gg.theme_seaborn() + \
        gg.ggtitle(titlename) + \
        gg.xlab("FPR") + \
        gg.ylab("TPR") + \
        gg.xlim(0.0, 1.0) + \
        gg.ylim(0.0, 1.0) + \
        gg.geom_point() + \
        gg.geom_line()
    gg.ggsave(plot=graph, filename=filepath, width=6, height=6, dpi=100)
Esempio n. 32
0
    def plot(self, inputs):
        """Return a scatter plot of two variables for one year, or None.

        None is returned when ``inputs.year`` does not occur in
        ``self.dat.Year`` or when either requested variable is not in
        ``self.dat``. Point labels (from ``self.ID_col``) and a red linear
        fit are added when ``inputs.shownames`` / ``inputs.linear`` are set.
        """
        year_known = inputs.year in self.dat.Year.values
        if not year_known:
            return

        if not (inputs.xvar in self.dat and inputs.yvar in self.dat):
            return

        rows_for_year = self.dat[self.dat.Year == inputs.year]
        chart = ggplot(rows_for_year, aes(x=inputs.xvar, y=inputs.yvar))
        chart = chart + geom_point()

        if inputs.shownames:
            chart = chart + geom_text(aes(label=self.ID_col), vjust=1, hjust=1)
        if inputs.linear:
            chart = chart + stat_smooth(color="red", method="lm")
        return chart
Esempio n. 33
0
def plot_cost_history(alpha, cost_history):
    """Return a point plot of the cost history.

    Each entry of ``cost_history`` is plotted against its position; ``alpha``
    appears in the title with three decimals. The function only builds the
    plot - print the returned object to display it.
    """
    history = {
        'Cost_History': cost_history,
        'Iteration': range(len(cost_history)),
    }
    cost_df = pandas.DataFrame(history)
    chart = ggplot(cost_df, aes('Iteration', 'Cost_History'))
    chart = chart + geom_point()
    return chart + ggtitle('Cost History for alpha = %.3f' % alpha)
Esempio n. 34
0
    def plot(self, inputs):
        """Scatter-plot ``inputs.yvar`` against ``inputs.xvar`` for ``inputs.year``.

        Returns None (no plot) if the year is absent from ``self.dat.Year``
        or if one of the two variables is not found in ``self.dat``.
        Optional layers: text labels taken from ``self.ID_col`` and a red
        "lm" smoother.
        """
        if inputs.year not in self.dat.Year.values:
            return None

        for variable in (inputs.xvar, inputs.yvar):
            if variable not in self.dat:
                return None

        selected = self.dat[self.dat.Year == inputs.year]
        figure_ = ggplot(selected, aes(x=inputs.xvar, y=inputs.yvar)) + geom_point()
        if inputs.shownames:
            labels = geom_text(aes(label=self.ID_col), vjust=1, hjust=1)
            figure_ = figure_ + labels
        if inputs.linear:
            fit = stat_smooth(color="red", method="lm")
            figure_ = figure_ + fit
        return figure_
Esempio n. 35
0
def graph1(score_data):
    """Create and return graph 1: average question score over time.

    score_data is a sequence of rows whose first row holds the column names;
    the remaining rows are the responses. The index of the date column is
    obtained from find_time_stamp(score_data).

    Returns a ggplot line graph with one series per question, faceted into
    the individual questions and the average of all questions.
    """

    date_column = score_data[0][find_time_stamp(score_data)]

    data = DataFrame(score_data[1:], columns=score_data[0])

    # Get all columns that are numerical (int64)
    # questions so we know what to graph
    num_questions = data.select_dtypes(include=['int64']).columns.values

    # Melt data so that each question is in a separate row
    new_data = pd.melt(data,
                       id_vars=date_column,
                       value_vars=num_questions,
                       var_name="Question",
                       value_name="Score")

    # Convert date string into an actual date type
    new_data[date_column] = pd.to_datetime(new_data[date_column],
                                           format="%m/%d/%Y")

    # Group all rows with same date and question, and then take the average.
    new_data = new_data.groupby([date_column, 'Question']).mean().reset_index()
    new_data['All'] = "Indiviual Questions"

    # Second frame: the average over all questions for each date.
    new_data2 = new_data.groupby(date_column).mean().reset_index()
    new_data2['Question'] = "All Questions"
    new_data2['All'] = "Average of All Questions"

    new_data = pd.concat([new_data, new_data2])

    # Dates are cast to integers so that a continuous x scale can be used.
    new_data[date_column] = new_data[date_column].astype('int64')

    # Create time graph with separate lines for each question
    ret = ggplot.ggplot(ggplot.aes(x=date_column, y="Score", colour="Question"), new_data) +\
        ggplot.geom_point() +\
        ggplot.geom_line() +\
        ggplot.facet_grid("All") +\
        ggplot.scale_x_continuous(labels=[""], breaks=0) +\
        ggplot.labs(x="Time", y="Average Question Score") +\
        ggplot.ggtitle("Question Scores Over Time")
    return ret
Esempio n. 36
0
def prob231g():
    """Run EM with three clusters and save its diagnostics.

    Uses the module-level ``features_only`` and ``labels_only``. Writes
    ``results/prob231g_loglike.png`` (log-likelihood per iteration),
    ``results/prob231g_clusters.png`` (cluster assignments) and pickles the
    fitted EMCall object to ``results/prob231g_a.pkl``.
    """
    filename = "results/prob231g"

    num_clusters_231g = 3
    emcall = EMCall(features_only, labels_only, num_clusters_231g)
    emcall.run_em()

    plt.plot(emcall.log_likelihood_record)
    plt.title("Likelihood over EM iterations")
    plt.savefig(filename + "_loglike.png")

    prob231g_plot_df = emcall.data.copy()
    prob231g_plot_df["class_label"] = [label for label in emcall.class_label]
    p = gg.ggplot(prob231g_plot_df, gg.aes(x="x1", y="x2", colour="class_label")) + \
        gg.geom_point() + gg.ggtitle("EM cluster assignments")
    gg.ggsave(filename + "_clusters.png", plot=p)

    # The context manager flushes and closes the pickle file; the original
    # left the handle open.
    with open(filename + "_a.pkl", "wb") as pkl_file:
        pkl.dump(emcall, pkl_file)
    print("Done with 231g.")
    return
Esempio n. 37
0
def prob231g():
    """Fit a 3-cluster EM model and persist plots plus the fitted object.

    Reads the module-level ``features_only`` and ``labels_only``. Outputs,
    all under ``results/``: ``prob231g_loglike.png``,
    ``prob231g_clusters.png`` and ``prob231g_a.pkl``.
    """
    filename = "results/prob231g"

    num_clusters_231g = 3
    emcall = EMCall(features_only, labels_only, num_clusters_231g)
    emcall.run_em()

    plt.plot(emcall.log_likelihood_record)
    plt.title("Likelihood over EM iterations")
    plt.savefig(filename + "_loglike.png")

    prob231g_plot_df = emcall.data.copy()
    prob231g_plot_df["class_label"] = [label for label in emcall.class_label]
    p = gg.ggplot(prob231g_plot_df, gg.aes(x="x1", y="x2", colour="class_label")) + \
        gg.geom_point() + gg.ggtitle("EM cluster assignments")
    gg.ggsave(filename + "_clusters.png", plot=p)

    # Close the output deterministically instead of relying on garbage
    # collection of an anonymous file object.
    with open(filename + "_a.pkl", "wb") as out:
        pkl.dump(emcall, out)
    print("Done with 231g.")
    return
Esempio n. 38
0
def _plot_and_save_local_ancestry(df, kmer, image_filename, num_chromosomes, id_vars, x_axis, y_scale):
	"""Melt a wide local-ancestry frame and save it as a point plot.

	df is melted on id_vars into a 'chromosome' / 'estimated_ancestry' long
	frame. The names 'test_<i>' and 'true_<i>' (i = 1..num_chromosomes) are
	replaced by numeric y positions, which are also used as axis breaks with
	the names as labels. The plot is saved to image_filename.

	kmer is not used inside this function.
	"""
	print('saving plot as: {}'.format(image_filename))
	var_name='chromosome'

	local_ancestry_df_long = pd.melt(df, id_vars=id_vars, var_name=var_name, value_name='estimated_ancestry')

	# Map each series name to its numeric position on the y axis.
	# NOTE(review): as written only the constant term is scaled
	# (2*i - (2 * y_scale)); confirm (2*i - 2) * y_scale was not intended.
	new_names = {}
	for i in range(1, num_chromosomes + 1):
		new_names['test_{}'.format(i)] = 2*i - 2 * y_scale
		new_names['true_{}'.format(i)] = 2*i - 1 * y_scale

	# Replace the names everywhere in the long frame, one name at a time.
	for key, value in new_names.items():
		local_ancestry_df_long.replace(key, value, inplace=True)

	plot = ggplot.ggplot(ggplot.aes(x=x_axis, y=var_name, color='estimated_ancestry'), data=local_ancestry_df_long) \
		+ ggplot.geom_point() \
		+ ggplot.scale_y_continuous(labels=list(new_names.keys()), breaks=list(new_names.values())) \
		+ ggplot.scale_color_manual(values=['#FF0000', '#0000FF', '#73008C']) \
		+ ggplot.theme(plot_margin={'top':0.7, 'bottom':0.3}) ### TODO: this should depend on scale

	plot.save(image_filename)
Esempio n. 39
0
def biplot(X, color='cluster'):
    '''
    Return a ggplot biplot of the first two principal components of X.

    The PCA is fitted on filter_numerical(X). Points are coloured by the
    column named in `color`: with the default 'cluster' the column is used
    as is; for any other column the values "1" and "-1" are relabelled
    "Normal" and "Anomalia" on a private copy, so X itself is never changed.
    '''
    pca = PCA(n_components=2)

    res = pca.fit_transform(filter_numerical(X))

    df = pandas.DataFrame(res)
    df.columns = ["x", "y"]

    if color == 'cluster':
        df['Cluster'] = X[color].values
        color = 'Cluster'
    else:
        # Copy first: `.values` can expose the frame's own buffer, and the
        # in-place relabelling below would then rewrite the caller's column.
        c = X[color].values.copy()
        c[c == "1"] = "Normal"
        c[c == "-1"] = "Anomalia"
        df['Detectado como:'] = c
        color = 'Detectado como:'

    return ggplot(aes("x", "y", color=color), df) + geom_point(aes(size=40))
Esempio n. 40
0
def data_output(data, chart_title):
    """Ask the user how to display ``data`` and show it as a table or a line graph.

    Args:
        data: the data to show; for the graph it must provide 'Month, Year'
            and 'Value' columns.
        chart_title: title of the line graph.

    An answer starting with "t" prints the table; one starting with "l" or
    "g" prints the graph. Matching is case-insensitive. Any other answer,
    including an empty one, does nothing.
    """
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3

    print("Good News! You're data has been returned. I'm happy to show it to you.")
    print("Just tell me how you want it - Table or Line Graph?")

    # Slicing instead of indexing: an empty answer yields "" rather than
    # raising IndexError.
    choice = read_line("Choose table or line > ")[:1].lower()

    if choice == "t":
        print("Ok, here's your data.")
        print(data)
    elif choice in ("l", "g"):
        import ggplot as gg

        # NOTE(review): the original also built
        # gg.scale_x_date(breaks=date_breaks('1 month'), labels=date_format("%B"))
        # as a separate statement that was never added to the plot. It is left
        # out so the rendered chart is unchanged; confirm whether monthly date
        # ticks were intended.
        plot = (gg.ggplot(gg.aes(x='Month, Year', y='Value'), data=data)
                + gg.geom_point(color='black')
                + gg.geom_line(color='green')
                + gg.ggtitle(chart_title)
                + gg.xlab("Month, Year")
                + gg.ylab("Value"))

        print(plot + gg.theme_xkcd())

			
def lineplot(hr_year_csv):
    """Return a chart of home runs by year: red points joined by a red line.

    hr_year_csv is a CSV file with two columns, 'HR' (the number of homerun
    hits) and 'yearID' (the year in which they were hit). It is loaded into a
    pandas dataframe and plotted with ggplot.

    Sample data: https://www.dropbox.com/s/awgdal71hc1u06d/hr_year.csv
    ggplot documentation: https://github.com/yhat/ggplot/
    """
    homeruns = pandas.read_csv(hr_year_csv)
    chart = gp.ggplot(homeruns, gp.aes('yearID', 'HR'))
    chart = chart + gp.geom_point(color='red')
    chart = chart + gp.geom_line(color='red')
    return chart
Esempio n. 42
0
    def plot(self, title='The amazing peixinho\'s plots'):
        """Return a 2-D scatter of the dataset coloured by its true label.

        A cached ``self._dataset['projection']`` is reused when it has fewer
        than three columns; otherwise a t-SNE embedding of
        ``self._dataset['feats']`` is computed and stored back under
        'projection'.
        """
        import ggplot
        import sklearn.manifold
        import pandas

        projection = self._dataset.get('projection', None)
        needs_embedding = projection is None or projection.shape[1] >= 3
        if needs_embedding:
            embedder = sklearn.manifold.TSNE()
            projection = embedder.fit_transform(self._dataset['feats'])
            self._dataset['projection'] = projection

        columns = {
            'X': projection[:, 0].ravel(),
            'Y': projection[:, 1].ravel(),
            'truelabel': self._dataset['truelabel'].ravel(),
        }
        frame = pandas.DataFrame(columns)
        # Cast to object so the label column is no longer a numeric dtype.
        frame['truelabel'] = frame['truelabel'].astype(object)

        mapping = ggplot.aes(x='X', y='Y', color='truelabel')
        chart = ggplot.ggplot(mapping, data=frame)
        chart = chart + ggplot.geom_point()
        return chart + ggplot.ggtitle(title)
def lineplot(hr_year_csv):
    """Plot total home runs per year as red points connected by a red line.

    Args:
        hr_year_csv: CSV with ``yearID`` and ``HR`` columns (the number of HR
            hit in Major League baseball in each year). Sample data:
            https://www.dropbox.com/s/awgdal71hc1u06d/hr_year.csv

    Returns:
        The ggplot object, titled and with labelled axes.
    """
    # Read the file the caller passed in. The argument used to be ignored in
    # favour of a hard-coded 'hr_year.csv'.
    df = pd.read_csv(hr_year_csv)
    gg = gp.ggplot(df, gp.aes('yearID', 'HR'))
    gg += gp.geom_point(color='red')
    gg += gp.geom_line(color='red')
    gg += gp.ggtitle('Total HRs by Year')
    gg += gp.xlab('Year')
    gg += gp.ylab('HR')

    return gg
def lineplot_compare(hr_by_team_year_sf_la_csv):
    """Return a chart comparing yearly home runs per team.

    The CSV has three columns - yearID, HR and teamID - holding the total
    number of HR hit each year by the SF Giants and LA Dodgers. Colour is
    passed to aes (not to the geoms) so that each team gets its own series.

    Sample data:
    https://www.dropbox.com/s/wn43cngo2wdle2b/hr_by_team_year_sf_la.csv
    """
    team_homeruns = pandas.read_csv(hr_by_team_year_sf_la_csv)
    mapping = gp.aes('yearID', 'HR', color='teamID')
    chart = gp.ggplot(team_homeruns, mapping)
    chart = chart + gp.geom_point()
    chart = chart + gp.geom_line()
    return chart
def lineplot(hr_year_csv):
    """Draw the number of home runs per year as a red point-and-line chart.

    The input table has two columns, ``yearID`` and ``HR``
    (sample: https://www.dropbox.com/s/awgdal71hc1u06d/hr_year.csv).

    Args:
        hr_year_csv: Path (or anything ``pd.read_csv`` accepts) of the CSV
            file with the ``yearID`` and ``HR`` columns.

    Returns:
        The ggplot object with point and line layers, a title and axis labels.
    """
    # Use the argument: the earlier code hard-coded 'hr_year.csv' and so
    # silently ignored whatever path the caller supplied.
    df = pd.read_csv(hr_year_csv)

    gg = gp.ggplot(df, gp.aes('yearID', 'HR'))
    gg += gp.geom_point(color='red')
    gg += gp.geom_line(color='red')
    gg += gp.ggtitle('Total HRs by Year')
    gg += gp.xlab('Year')
    gg += gp.ylab('HR')

    return gg
Esempio n. 46
0
    def scatter(self,
                dataframe,
                x=None,
                y=None,
                width=None,
                height=None,
                color=None,
                title='Scatter',
                xaxis_label=None,
                yaxis_label=None,
                label=None):
        """Build a scatter plot of two columns of ``dataframe``.

        Args:
            dataframe: Data passed to ``ggplot``.
            x: Column mapped to the x axis.
            y: Column mapped to the y axis.
            width: Falls back to the 'width' default option when None.
                The value is not used by the plot itself.
            height: Accepted but not used by this method.
            color: Point colour; falls back to the 'palette' default option
                when None.
            title: Plot title.
            xaxis_label: Optional name for the x scale.
            yaxis_label: Optional name for the y scale.
            label: Accepted but not used by this method.

        Returns:
            The assembled ggplot object.
        """
        if color is None:
            color = self.__default_options__.get('palette', None)
        if width is None:
            # Kept for parity with the original lookup; nothing below reads it.
            width = self.__default_options__.get('width', None)

        gg = ggplot(dataframe, aes(x, y)) + geom_point(
            color=color, alpha=0.6) + ggtitle(title)
        if xaxis_label:
            gg += scale_x_continuous(name=xaxis_label)
        if yaxis_label:
            # Fixed: the y scale used to be named with xaxis_label.
            gg += scale_y_continuous(name=yaxis_label)

        return gg
            try:
                session.run(run_script)
                truematch_mod.append(session.getvalue("mod_delta_m"+str(mc)))
                runtime_mod.append(gmodel_relaxed.Runtime)
            except RuntimeError:
                print ("unable to evaluate true matchin perf of gurobi model in "+fname)
        except (OSError, NameError, ValueError,RuntimeError):
            print "unable to process gurobi model in "+fname



# Persist the per-run results collected for the modified model.
np.savetxt("runtime.csv", runtime_mod, delimiter=",")
# Fixed: tm.csv used to receive a second copy of runtime_mod instead of the
# true-match scores.
np.savetxt("tm.csv", truematch_mod, delimiter=",")

# NOTE(review): "orig_or_mod" has len(runtime_mod) entries while "tm" and
# "runtime" also include the *_orig lists; the columns only line up when the
# *_orig lists are empty -- confirm against the code that fills them.
tm_df = pd_df({
    "tm": truematch_orig + truematch_mod,
    "runtime": runtime_orig + runtime_mod,
    "orig_or_mod": ["mod"]*len(runtime_mod) # ["orig"]*nmc
})
nmc = len(runtime_mod)




#print gg.ggplot(tm_df, aes('orig_or_mod', 'runtime')) + \
#  gg.geom_line(colour='steelblue')

# Runtime (log-scaled x axis) against true-match score, saved as a PDF.
comp_plot = gg.ggplot(data=tm_df, aesthetics=gg.aes(x='runtime', y='tm')) + gg.geom_point() + gg.scale_x_log10()
gg.ggsave("graphmatch_IP_runtime_vs_tm.pdf",plot = comp_plot)
# Parenthesised single-argument print gives the same output on Python 2 and 3.
print("mean of tm:" + str(np.mean(truematch_mod)))
Esempio n. 48
0
def plotSetOfArrays(arrays,names,fileName):
  """Plot a set of column arrays against an evenly spaced 'noise' axis.

  The arrays are joined column-wise behind a 'noise' column running from 0
  to 1, melted to long form and saved as a ggplot line+point chart under
  './IMG/'.  The first column of each of the first five arrays is then drawn
  as a line chart written to 'iou_scores.html'.

  Args:
    arrays: Sequence of 2-D arrays sharing the same number of rows; at least
      five are needed for the second chart.
    names: List of column names, one per column contributed by ``arrays``.
    fileName: File name of the saved ggplot image inside './IMG/'.
  """
  row_count = arrays[0].shape[0]
  IDS = np.linspace(0, 1, row_count)

  # Grow the table one array at a time, starting from the noise column.
  stacked = IDS.reshape(row_count, 1)
  for block in arrays:
    stacked = np.concatenate((stacked, block), axis=1)

  wide = pd.DataFrame(stacked, columns=['noise'] + names)
  long_form = pd.melt(wide, id_vars=['noise'])

  mapping = ggplot.aes(x='noise', y='value', colour='variable')
  pv = ggplot.ggplot(mapping, data=long_form) + ggplot.geom_line() + ggplot.geom_point()
  ggplot.ggsave(pv, './IMG/' + fileName)

  output_file("iou_scores.html", title="correlation.py example")

  figure(tools="pan,wheel_zoom,box_zoom,reset,previewsave")
  hold()
  # One fixed colour per series; only the first five arrays are drawn.
  palette = ('#A6CEE3', '#1F78B4', '#B2DF8A', '#33A02C', '#fb9a99')
  for idx, shade in enumerate(palette):
    line(IDS, arrays[idx][:,0], color=shade, legend=names[idx])

  curplot().title = "Minimum IOU"
  grid().grid_line_alpha=0.3
  show()
# Print a banner and a heading, then preview the first rows of df
# (df is defined outside this excerpt).
print("#######################################")
print("打印所挖掘的文本文件 text-movie.xls 前几行")
print(df.head())

#text = df.comments.iloc[0]   single-review sentiment analysis experiment; the iloc index selects which entry to use, numbered from 0
#s = SnowNLP(text)
#
#print(s.sentiments)


def get_sentiment_cn(text):
    """Return the ``sentiments`` value SnowNLP computes for ``text``."""
    return SnowNLP(text).sentiments


# Score every entry of the 'comments' column and store it as 'sentiment'.
df["sentiment"] = df.comments.apply(get_sentiment_cn)
print("#######################################")
print("打印所挖掘的文本文件 text-movie.xls 部分影评及其情感分析值")
print(df)

# Summary statistics of the sentiment column.
print("#######################################")
print("重要信息")
print("所有影评的平均值为:", df.sentiment.mean())
print("所有影评的中位数为:", df.sentiment.median())

# Sentiment over the 'date' column as points plus a blue line.
# NOTE(review): the plot is a bare expression (not assigned, printed or
# saved), so it presumably only displays in a notebook/interactive session.
ggplot.ggplot(ggplot.aes(x="date", y="sentiment"),
              data=df) + ggplot.geom_point() + ggplot.geom_line(
                  color='blue') + ggplot.scale_x_date(
                      labels=ggplot.date_format("%Y-%m-%d"))

# First five rows after sorting by sentiment (ascending by default); also a
# bare expression whose result is discarded when run as a script.
df.sort_values(['sentiment'])[:5]
#coding=utf-8
#!/usr/bin/python


### Source: http://nbviewer.ipython.org/gist/wrobstory/1eb8cb704a52d18b9ee8/Up%20and%20Down%20PyData%202014.ipynb

# Import modules
import ggplot as gg
from ggplot import ggplot
import numpy as np
import pandas as pd

# Load the wind-turbine table from a local CSV file.
df = pd.read_csv('/Users/zhangbo/github/pydatasv2014/USGS_WindTurbine_201307_cleaned.csv')
# Keep the rows whose 'Rotor Diameter' exceeds 10 (despite the variable
# name, the filter is on rotor diameter, not height).
min_heights = df[df['Rotor Diameter'] > 10]

# Scatter of the first 500 filtered rows: 'Turbine MW' on x against
# 'Rotor Swept Area' on y.
# NOTE(review): the plot is a bare expression, so it presumably only
# displays in a notebook/interactive session.
(ggplot(gg.aes(x='Turbine MW', y='Rotor Swept Area'), data=min_heights[:500])
    + gg.geom_point(color='#75b5aa', size=75)
    + gg.ggtitle("Rotor Swept Area vs. Power")
    + gg.xlab("Power (MW)")
    + gg.ylab("Rotor Swept Area (m^2)"))
Esempio n. 51
0
File: fft.py Progetto: smoly/bubo
#

# Signal (time domain): a sine of frequency ff sampled at Fs over [0, 1)
signal      = pd.DataFrame()
Fs          = 150.0                            # sampling rate
Ts          = 1.0/Fs                           # sampling interval
signal['t'] = np.arange(0, 1, Ts)              # time vector
ff          = 5                                # frequency of the signal
signal['y'] = np.sin(2*np.pi*ff * signal['t']) # the signal
n           = len(signal['y'])                 # length of the signal

# Spectrum (freq domain)
spectrum       = pd.DataFrame()
k              = np.arange(n)
T              = n/Fs
spectrum['f']  = k/T                       # frequency range
spectrum['Y']  = np.fft.fft(signal['y'])/n # fft (with 1/n normalization)
spectrum['a']  = np.abs(spectrum['Y'])     # amplitude spectrum
spectrum['p']  = np.abs(spectrum['Y'])**2  # power spectrum
# Floor division keeps the slice bound an int: n/2 is a float on Python 3,
# which DataFrame slicing rejects.  The result on Python 2 is unchanged.
spectrum_h     = spectrum[:n//2]           # positive half (real signal -> hermitian spectrum)

# Time-domain samples.
plot_gg(gg_layer(
    gg.ggplot(signal, gg.aes(x='t', y='y')),
    gg.geom_point(),
))

# Amplitude spectrum over the positive-frequency half.
plot_gg(gg_layer(
    gg.ggplot(spectrum_h, gg.aes(x='f', y='a')),
    gg.geom_point(),
))
Esempio n. 52
0
def plot_trace_points(df, **kws):
    """Return a ggplot point layer bound to ``df``.

    Args:
        df: Data handed to the layer through its ``data`` argument.
        **kws: Extra keyword arguments forwarded unchanged to ``geom_point``.

    Returns:
        The ``geom_point`` layer.
    """
    # Only geom_point is needed; the unused geom_line/geom_path imports
    # were dropped.
    from ggplot import geom_point
    return geom_point(data=df, **kws)
Esempio n. 53
0
import pandas as pd
import numpy as np
# from source import view_and_print_output
import ggplot as gg


# Collect one frame per (layers, nodes) configuration and concatenate once.
# DataFrame.append re-copied the accumulated data on every iteration and was
# removed in pandas 2.0.
frames = []
for num_layers, num_nodes in [(2, 50), (2, 100), (2, 150), (2, 200), (4, 50), (4, 100), (4, 150), (4, 200)]:
    file_coarse = '../../data/coarse_lambda_dropout_' + str(num_layers) + '_' + str(num_nodes) + '.txt'
    newdata = pd.read_csv(file_coarse)
    # Best (lowest) validation error first, so 'index' below ranks the rows.
    newdata = newdata.sort_values(by='validation error', ascending=True)
    newdata['lambda'] = np.log10(newdata['lambda'])
    # Rank scaled to [0, 1) and cubed; used as the colour of each point.
    newdata['index'] = (np.arange(len(newdata), dtype='float')/len(newdata))**3
    # Facet label: a numeric key followed by the layer and node counts.
    newdata['config'] = str(num_layers * 100 + num_nodes) +  ' ' +  str(num_layers) + ' ' + str(num_nodes)
    frames.append(newdata)
df = pd.concat(frames)
# The 20 rows with the highest validation error.
print(df.sort_values(by='validation error', ascending=False).head(20))
p = gg.ggplot(gg.aes(x='lambda', y='dropout prob', color='index'), data=df) + \
        gg.geom_point() + \
        gg.xlab('lambda') + \
        gg.ylab('dropout prob') + \
        gg.scale_x_continuous(limits=(-5, 2)) + \
        gg.facet_wrap('config')
print(p)

# Conclusion: ignore dropout
Esempio n. 54
0
    clusterAss.iloc[:,0] = clusterAss.iloc[:,0].fillna(0)
    clusterAss.iloc[:,1] = calcDistance(dataSet, centroid)
    minSSE = math.inf
    while(len(centroids) < k):
        numCurrCluster = len(centroids)
        for i in range(numCurrCluster):
            pointsInCurrCluster = dataSet.iloc[clusterAss.iloc[:,0][clusterAss.iloc[:,0] == i].index,:]
            splitClusterAss, tmpCent = kmeans(pointsInCurrCluster, 2)
            splitSSE = sum(splitClusterAss.iloc[:,1])
            notSplitSSE = sum(clusterAss.iloc[clusterAss.iloc[:,0][clusterAss.iloc[:,0] != i].index,1])
            currentSSE = splitSSE + notSplitSSE
            if(currentSSE < minSSE):
                minSSE = currentSSE
                bestClusterToSplit = i
                bestNewCentroids = tmpCent.copy()
                bestClusterAss = splitClusterAss.copy()
        bestClusterAss.loc[bestClusterAss.loc[:,'cluster']==1, 'cluster'] = numCurrCluster
        bestClusterAss.loc[bestClusterAss.loc[:,'cluster']==0, 'cluster'] = bestClusterToSplit
        centroids[bestClusterToSplit] = bestNewCentroids.iloc[0, :]
        centroids.append(bestNewCentroids.iloc[1, :])
        clusterAss.iloc[clusterAss.iloc[:,0][clusterAss.iloc[:,0]==bestClusterToSplit].index,:] = bestClusterAss
    print('Just finished')
    return(clusterAss, pd.DataFrame(centroids))

# Load the tab-separated point set (no header row) and name its two columns.
data = pd.read_table('/home/samael/learnML/kmeans/kmeans', sep='\t', header=None)
data.columns = ['V1', 'V2']
# Cluster the same points with both routines, requesting 4 clusters each.
clusterAss1, centroids1 = kmeans(data, 4)
clusterAss2, centroids2 = biKmeans(data, 4)
# One scatter per clustering, coloured by the 'cluster' column of the
# assignment table, with reference lines at y=0 and x=0.
# NOTE(review): both plots are bare expressions, so they presumably only
# display in a notebook/interactive session.
ggplot.ggplot(data, ggplot.aes('V1', 'V2')) + ggplot.geom_point(color=clusterAss1.cluster, size=50) + ggplot.geom_hline(y=0) + ggplot.geom_vline(x=0)
ggplot.ggplot(data, ggplot.aes('V1', 'V2')) + ggplot.geom_point(color=clusterAss2.cluster, size=50) + ggplot.geom_hline(y=0) + ggplot.geom_vline(x=0)
Esempio n. 55
0
        if (reward == 1):
            wins_for_player_1[i] += 1.0
        elif (reward == 0.5):
            draw_for_players[i] += 1.0

    print(i, wins_for_player_1[i], draw_for_players[i])
    data.append({
        'Type': 0,
        'Wins': wins_for_player_1[i],
        'Training': training_steps * (i - 1)
    })
    data.append({
        'Type': 1,
        'Wins': draw_for_players[i],
        'Training': training_steps * (i - 1)
    })
    learnitMC(training_steps, epsilon, alpha, n)
#   learnit(training_steps, epsilon, alpha) # the original learning code.

# Pandas gives you the power of R
# Turn the collected records (dicts with 'Type', 'Wins' and 'Training' keys)
# into a table.
learningdf = pd.DataFrame(data)
# I use ggplot when I generate figures in R and would like to use it with Python, HOWEVER:
# latest Pandas causes problems for ggplot so I needed these two patches:
# https://stackoverflow.com/questions/50591982/importerror-cannot-import-name-timestamp/52378663
# https://github.com/yhat/ggpy/issues/612
# Points plus a loess smoother of 'Wins' against 'Training', grouped by 'Type'.
# NOTE(review): rows with Type 1 carry draw counts in 'Wins', yet the y axis
# is labelled 'Wins for player 1' -- confirm the label is intended.
p = gg.ggplot(gg.aes(x='Training', y='Wins', group='Type'), data=learningdf)+ gg.xlab('Learning games') + \
    gg.ylab('Wins for player 1') + gg.ggtitle("n="+str(n)) + gg.geom_point() + gg.stat_smooth(method='loess')
p.make()
# Save the figure as a PDF named after n.
filename = "experiment_" + str(n) + ".pdf"
p.save(filename)
Esempio n. 56
0
# Benchmark every (method, model, rtol) combination, recording the error
# against the gold standard and the elapsed time.
data = []
for method in methods:
    for model in models:
        for rtol in rtols:
            print('method: {} model: {} rtol: {}'.format(method.name, model.name, rtol), end='')

            # Run.  perf_counter is a monotonic, high-resolution clock meant
            # for measuring intervals; time.time() can jump when the system
            # clock is adjusted.
            tic = time.perf_counter()
            result = method(model, rtol)
            toc = time.perf_counter() - tic

            # Compare to gold standard
            standard = gold_standards[model.name]
            diff = result - standard.values
            # NOTE(review): this is the largest *signed* relative difference;
            # a large negative error is not reported.  Confirm whether
            # np.abs(diff) was intended.
            max_rel_diff = np.max(diff/standard.max)

            # Append to table
            record = (method.name, model.name, rtol, max_rel_diff, toc)
            print(' err: {} toc: {}'.format(max_rel_diff, toc))
            data.append(record)


data = DataFrame(data, columns=['method', 'model', 'rtol', 'err', 'time'])

# Error against time on log-log axes, one colour per method.
print(gg.ggplot(data, gg.aes(x='err', y='time', color='method'))
      + gg.geom_point(size=60.0)
      + gg.geom_line()
      + gg.scale_x_log()
      + gg.scale_y_log()
      + gg.xlim(1e-10, 1e-2))
    "orig_or_mod": ["orig"]*nmc + ["mod"]*nmc
})

# Paired comparisons of the original and modified runs (st is presumably
# scipy.stats -- confirm against the imports).
# NOTE(review): the second pair of assignments rebinds the same two names,
# so only the st.wilcoxon results remain; the st.ttest_rel results are
# discarded.
p_val_truematch_diff = st.ttest_rel(truematch_orig, truematch_mod)
p_val_timediff = st.ttest_rel(runtime_orig, runtime_mod)

p_val_truematch_diff = st.wilcoxon(truematch_orig, truematch_mod)
p_val_timediff = st.wilcoxon(runtime_orig, runtime_mod)

#print gg.ggplot(tm_df, aes('orig_or_mod', 'tm')) + \
#  gg.geom_line(colour='steelblue')


#print gg.ggplot(tm_df, aes('orig_or_mod', 'runtime')) + \
#  gg.geom_line(colour='steelblue')

# Runtime (log-scaled x axis) against true-match score, coloured by the
# 'orig_or_mod' column, saved as a PDF.
comp_plot = gg.ggplot(data=tm_df, aesthetics=gg.aes(x='runtime', y='tm', colour='orig_or_mod')) + gg.geom_point() + gg.scale_x_log10()
gg.ggsave("orig_IP_vs_modified_IP_3.pdf",plot = comp_plot)












Esempio n. 58
0
 def test_groups_2_aes(self):
     """Mapping both color and shape must yield 8*5 plot-data groups."""
     mapping = gg.aes(x='carat', y='price', color='clarity', shape='cut')
     plot = gg.ggplot(mapping, gg.diamonds) + gg.geom_point()
     _plot_data, groups = plot._construct_plot_data()
     expected_group_count = 8*5
     self.assertEqual(len(groups), expected_group_count)
Esempio n. 59
0
import ggplot as gp
import pandas as pd
import numpy as np

# Load the crime-rate table and print a red scatter of murder vs. burglary.
crime = pd.read_csv('crimeRatesByState2005.csv')
murder_vs_burglary = gp.ggplot(gp.aes(x='murder', y='burglary'), data=crime)
murder_vs_burglary = murder_vs_burglary + gp.geom_point(color='red')
print(murder_vs_burglary)
Esempio n. 60
0
# optimization costs: print the cost of every stored parameter vector,
# reshaped to an (n, 1) column matrix first
print("optimization costs:")
for t in all_theta:
    t = np.mat(t.reshape(n, 1))
    print(decorateCost(t))

# calculate prediction efficiency
predict = np.round(mllogistic.sigmoid(X*theta))
# Percentage of predictions equal to the labels.
print("Prediction accuracy = {}".format((predict == y).mean()*100))
# Predictions restricted to the positions where y is non-zero.
detect = predict[np.where(y)]
print(" Detection accuracy = {}".format(detect.sum()/detect.size*100))

if False:
    # scatter ggplot (disabled)
    mtcars.cyl = mtcars.cyl.astype(str) # changes cyl to a discrete value
    point = gg.ggplot(mtcars, gg.aes("disp", "mpg", colour = "cyl")) + gg.geom_point(size = 35)
    # Fixed: the Python 2 print statement is a syntax error on Python 3 even
    # inside a dead branch; the call form works on both.
    print(point)

# scatter pyplot: label 0 drawn with blue marker faces, label 1 with red
neg = np.where(y.A1 == 0)
pos = np.where(y.A1 == 1)
fig, ax = plt.subplots()
ax.plot(X_prenorm[neg, 1].A, X_prenorm[neg, 2].A, "ko", markerfacecolor = "b", markersize = 7, label = "cyl4")
ax.plot(X_prenorm[pos, 1].A, X_prenorm[pos, 2].A, "ko", markerfacecolor = "r", markersize = 7, label = "cyl6")
ax.set_xlabel("disp")
ax.set_ylabel("mpg")

# here is the grid range: 50 evenly spaced values per axis, padded by
# margin beyond the observed minimum and maximum
margin = 20
u = np.linspace(min(X_prenorm[:,1]).A1 - margin, max(X_prenorm[:,1]).A1 + margin, 50)
v = np.linspace(min(X_prenorm[:,2]).A1 - margin, max(X_prenorm[:,2]).A1 + margin, 50)