コード例 #1
0
def plotTradeVsNews(tickName):
    """Plot log trading volume against news buzz and log news volume.

    The frame for ``tickName`` is fetched with ``getNewsNTradingVol`` using
    the path "resultsMKII". For each of the two x variables (raw ``NewsBuz``
    and log ``NewsVol``) a hexbin joint plot and a KDE joint plot against
    log ``tradingVol`` are drawn, then all figures are shown.

    :param tickName: ticker identifier forwarded to ``getNewsNTradingVol``
    """
    path2 = "resultsMKII"
    # The parameter is ``tickName``; the previous ``tick_Name`` was undefined.
    frame = getNewsNTradingVol(tickName, path2)

    # Whole-column operations replace the element-by-element append loop.
    newsBuz = np.array(frame['NewsBuz'].values)
    tradingVol = np.log(frame['tradingVol'].values)
    newsVol = np.log(frame['NewsVol'].values)

    sns.set(style="ticks")
    y = np.array(tradingVol)
    for x, x_label in ((newsBuz, "News Buz"), (newsVol, "News Volume")):
        hex_grid = sns.jointplot(x=x, y=y, kind="hex", stat_func=kendalltau,
                                 color="#4CB391")
        hex_grid.set_axis_labels(xlabel=x_label, ylabel="Trading Volume")

        kde_grid = sns.jointplot(x=x, y=y, kind="kde", size=7, space=0)
        kde_grid.set_axis_labels(xlabel=x_label, ylabel="Trading Volume")

    sns.plt.show()
コード例 #2
0
ファイル: doplot_stan.py プロジェクト: aasensio/axial_ratio
    def doplot(self, name):
        """
        Load a pickled trace from ``name`` and show three diagnostic plots:
        a corner plot of muCB/sdCB, a scatter of B against C, and a KDE joint
        plot of C against B.

        The loaded object is stored on ``self.trace``.
        """

        # NOTE: pickle.load can execute arbitrary code; only open trusted files.
        with open(name, "rb") as handle:
            self.trace = pickle.load(handle)

        var = np.vstack([self.trace['muCB'][:,0], self.trace['muCB'][:,1], self.trace['sdCB'][:,0], self.trace['sdCB'][:,1]]).T

        # Raw strings: "\m" and "\s" are not valid Python escape sequences.
        corner.corner(var, labels=[r'$\mu_C$', r'$\mu_B$', r'$\sigma_C$', r'$\sigma_B$'], show_titles=True)

        pl.show()

        # Take a random subset of up to 200 rows along the first axis. One
        # shared permutation keeps each C value paired with the B value from
        # the same row, and fancy indexing copies, so self.trace is not
        # reordered in place.
        cb = self.trace['CB']
        picked = np.random.permutation(cb.shape[0])[:200]
        C_slice = cb[picked, :, 0].flatten()
        B_slice = cb[picked, :, 1].flatten()

        # First option
        pl.plot(B_slice, C_slice, '.', alpha=0.002)
        pl.show()

        # KDE joint plot
        sns.jointplot(x=C_slice, y=B_slice, kind='kde')
        pl.show()
コード例 #3
0
ファイル: seaborn1.py プロジェクト: EnriqueU/M-L
def seaborn_join():
    """Show a hexbin joint plot of 2000 samples from a correlated 2-D Gaussian."""
    mean = [0, 0]
    covariance = [[5, 2], [2, 2]]
    samples = np.random.multivariate_normal(mean, covariance, size=2000)
    frame = pd.DataFrame(samples, columns=['x', 'y'])

    # The white axes style applies only while the plot is being built.
    with sns.axes_style('white'):
        sns.jointplot("x", "y", frame, kind='hex')

    plt.show()
コード例 #4
0
ファイル: beam_source.py プロジェクト: b-r-oleary/acme
 def histogram(self,x=None, y=None, l=None, t=None, **kwargs):
     """
     Shortcut for drawing histograms of the data returned by
     ``self.to_dataframe(l=l, t=t, latex=True)``.
     - neither x nor y given: a pair plot of every column, with the x tick
       labels rotated by 90 degrees
     - only x given: a 1d distribution plot of that column
     - x and y given: a joint plot of the two columns
     - only y given: nothing is drawn
     Extra keyword arguments are forwarded to the seaborn call. Returns None.
     """
     table = self.to_dataframe(l=l, t=t, latex=True)

     if x is None:
         if y is None:
             grid = sns.pairplot(table, **kwargs)
             for axis in grid.axes.flat:
                 plt.setp(axis.xaxis.get_majorticklabels(), rotation=90)
         return

     x = self._reformat_label(x)
     if y is None:
         sns.distplot(table[x], **kwargs)
         plt.xlabel(x)
         return

     y = self._reformat_label(y)
     sns.jointplot(x=x, y=y, data=table, **kwargs)
コード例 #5
0
ファイル: plotUtils.py プロジェクト: tarlen5/pisa
def make_scatter_plot(frame, name, **kwargs):
    """
    Makes joint scatter plots of column `name` against every other column of
    `frame`, skipping the 'hypo', 'llh' and 'mctrue' columns.

    'deltam31' values are multiplied by 100 and any column whose name
    contains 'theta' is converted with np.rad2deg before plotting. The
    scaling is applied to copies, so `frame` itself is left unchanged.

    Extra keyword arguments are forwarded to sns.jointplot.
    Returns the list of figures created, one per plotted column.
    """

    column_x = frame[name]
    if name == 'deltam31':
        # Not `*=`: an in-place multiply can write through to the caller's
        # frame and rescale it again on every call.
        column_x = column_x * 100.0

    exclude = {'hypo', 'llh', 'mctrue'}
    params = list(set(frame.columns).difference(exclude))

    figs = []
    # Plot correlation scatter plot for all other systematics
    for p in params:
        if p == name:
            continue
        column_y = frame[p]
        if p == 'deltam31':
            column_y = column_y * 100.0
        if 'theta' in p:
            column_y = np.rad2deg(column_y)

        with sns.axes_style("whitegrid"):
            sns.jointplot(x=column_x, y=column_y, size=8, color='b',
                          **kwargs)
            plt.tight_layout()
            figs.append(plt.gcf())

    return figs
コード例 #6
0
ファイル: plot_poi.py プロジェクト: mamsdiallo/ud120-projects
def plotBonusvsSalary(df):
    """Draw a bonus-vs-salary joint plot from ``df``, save it and show it.

    The current figure is resized to 18.5 x 10.5 inches and written to
    'bonusVSsalary.png' at 100 dpi before being displayed.
    """
    sns.jointplot(x="bonus", y="salary", data=df)

    figure = plt.gcf()
    figure.set_size_inches(18.5, 10.5)
    figure.savefig('bonusVSsalary.png', dpi=100)

    plt.show()
コード例 #7
0
	def plot_seaborn( self ):
		"""Show a KDE joint plot of two columns of 'movement.csv'.

		Only rows whose first column equals 0 and whose two selected columns
		lie strictly between -limit and +limit are plotted.
		"""

		# https://stanford.edu/~mwaskom/software/seaborn/tutorial/distributions.html

		# DataFrame.as_matrix() no longer exists in current pandas; .values
		# gives the same ndarray on old and new versions alike.
		data = pd.read_csv( 'movement.csv' ).values

		# 1/2 3/4 5/6 7/8  (presumably the available x/y column pairs — TODO confirm)
		x_column = 3
		y_column = 4

		limit = 100
		keep = (
			  ( data[:,0] == 0 )
			& ( data[:,x_column] > -limit )
			& ( data[:,x_column] < limit )
			& ( data[:,y_column] > -limit )
			& ( data[:,y_column] < limit )
		)
		data = data[keep]

		x = data[:,x_column]
		y = data[:,y_column]

		with sns.axes_style( 'white' ):
			sns.jointplot( x=x, y=y, kind='kde' )  # scatter, reg, resid, hex, kde

		sns.plt.show()
コード例 #8
0
def skill_vs_speed(prediction_mode, time_model, data):
    """Run an evaluation, then draw a KDE joint plot of skill against speed.

    A ``TimeCombiner`` of the two models is evaluated on ``data`` with
    ``force_run=True``. The plotted values come from calling ``get_skills``
    on each model for the students in ``data``.
    """
    combined = TimeCombiner(prediction_mode, time_model)
    Evaluator(data, combined).get_report(force_run=True)

    students = data.get_students()
    skill_values = prediction_mode.get_skills(students)
    speed_values = time_model.get_skills(students)

    grid = sns.jointplot(pd.Series(skill_values), pd.Series(speed_values),
                         kind='kde', space=0)
    grid.set_axis_labels("skill", "speed")
コード例 #9
0
ファイル: read_musicXML.py プロジェクト: rb-roomba/music
def show_graph(data):
    """ Show a time/height joint plot of given data.

    Each item contributes its first element as the time and
    ``height(item[1:])`` as the height; rows are ordered by time.
    """
    rows = [[item[0], height(item[1:])] for item in data]
    rows.sort(key=lambda row: row[0])

    df = pd.DataFrame(rows)
    df.columns = ["time","height"]

    seaborn.jointplot('time', 'height', data=df)
    plt.show()
コード例 #10
0
ファイル: joint_sampler.py プロジェクト: low-sky/colira
def sbratio(sampler):
    """Draw a KDE joint plot of the 'theta' and 'phi' columns of a sampler chain.

    ``sampler.flatchain`` is read as a six-column array. Columns 2 and 4
    ('scatter' and 'badsig') are replaced by their absolute values in place,
    so the array returned by ``sampler.flatchain`` is modified.
    """
    chain = sampler.flatchain
    chain[:, 2] = np.abs(chain[:, 2])
    chain[:, 4] = np.abs(chain[:, 4])
    dd = pd.DataFrame(data=chain,
                      columns=['theta','phi','scatter','badfrac','badsig','badmn'])
    with sns.axes_style("white"):
        # Plot the frame built above; the previous `data` name was undefined.
        sns.jointplot(x="theta", y="phi", data=dd, kind="kde")
コード例 #11
0
ファイル: instance.py プロジェクト: ansteh/multivariate
 def plot(self, samples, columns=None):
     """Draw a joint plot of two-column ``samples``.

     The columns are named "x" and "y" unless ``columns`` is given, in which
     case its first two entries are used as the column names.
     """
     if columns is None:
         x_name, y_name = "x", "y"
     else:
         x_name, y_name = columns[0], columns[1]

     df = pd.DataFrame(samples, columns=[x_name, y_name])
     sns.jointplot(x=x_name, y=y_name, data=df)
コード例 #12
0
ファイル: my_script.py プロジェクト: mosayebi/SAGE
def plot_scatter_hist_sns(x, y):
    """Save a hexbin joint plot of ``x`` against ``y`` to 'plot4.pdf'.

    The axes are labelled with the LaTeX symbols phi (x) and theta (y). The
    current figure is written to the PDF and then closed.
    """
    sns.set(style="ticks")
    grid = sns.jointplot(np.array(x), np.array(y), kind="hex", size=4, stat_func=None)
    # Raw strings: "\p" is not a valid escape sequence in a normal string.
    grid.set_axis_labels(r"$\phi$", r"$\theta$")
    with PdfPages('plot4.pdf') as pdf:
        pdf.savefig()
    sns.plt.close()
コード例 #13
0
ファイル: treesearch.py プロジェクト: MaxwellRebo/disco-dop
def plot(data, total, title, width=800.0, unit='', dosort=True,
		target=None, target2=None):
	"""A HTML bar plot given a dictionary and max value.

	With more than 30 entries and a ``target`` series, a seaborn figure is
	rendered instead and returned as an HTML ``<img>`` tag holding a base64
	PNG: a regression joint plot or lmplot for a numeric target, a bar plot
	otherwise. In every other case the result is an HTML snippet with one
	bar per key, whose width is ``width * value / total`` pixels.

	:param data: mapping of string keys to numeric values.
	:param total: value corresponding to a full-width bar.
	:param title: caption; also the column name used for the values.
	:param width: pixel width of a full bar.
	:param unit: suffix printed after each value.
	:param dosort: whether to order bars by descending value.
	:param target: optional pandas Series indexed by the keys of ``data``.
	:param target2: optional second Series, used as hue.
	:returns: a string of HTML."""
	if len(data) > 30 and target is not None:
		df = pandas.DataFrame(index=data)
		df[title] = pandas.Series(data, index=df.index)
		# Series.ix no longer exists in pandas; reindex selects by label
		# and yields NaN for keys missing from the target.
		df[target.name] = target.reindex(df.index)
		if target2 is not None:
			df[target2.name] = target2.reindex(df.index)
		# `dtype == numpy.number` does not match integer dtypes; test the
		# dtype kind instead (signed/unsigned int, float, complex).
		if target.dtype.kind in 'iufc':
			if target2 is None:
				seaborn.jointplot(target.name, title, data=df, kind='reg')
			else:
				seaborn.lmplot(target.name, title, data=df, hue=target2.name)
		else:  # X-axis is categorical
			df.sort_values(by=target.name, inplace=True)
			if target2 is None:
				seaborn.barplot(target.name, title, data=df)
			else:
				seaborn.barplot(target.name, title, data=df, hue=target2.name)
			fig = plt.gcf()
			fig.autofmt_xdate()

		# Convert to PNG
		figfile = io.BytesIO()
		plt.savefig(figfile, format='png')
		result = '<div><img src="data:image/png;base64, %s"/></div>' % (
				base64.b64encode(figfile.getvalue()).decode('utf8'))
		plt.clf()
		return result

	result = ['<div class=barplot>',
			('<text style="font-family: sans-serif; font-size: 16px; ">'
			'%s</text>' % title)]
	if target is not None:
		# Order the bars by target value, keeping only keys present in data.
		data = OrderedDict([(key, data[key]) for key in
				target.sort_values().index if key in data])
	# A key's colour group is its prefix before '_' or, failing that, its
	# first character; colours are only assigned when there are <= 5 groups.
	keys = {key.split('_')[0] if '_' in key else key[0] for key in data}
	color = {}
	if len(keys) <= 5:
		color.update(zip(keys, range(1, 6)))
	keys = list(data)
	if dosort:
		keys.sort(key=data.get, reverse=True)
	for key in keys:
		result.append('<br><div style="width:%dpx;" class=b%d></div>'
				'<span>%s: %g %s</span>' % (
				int(round(width * data[key] / total)) if data[key] else 0,
				color.get(key.split('_')[0] if '_' in key else key[0], 1)
					if data[key] else 0,
				htmlescape(key), data[key], unit,))
	result.append('</div>\n')
	return '\n'.join(result)
コード例 #14
0
ファイル: mplotter.py プロジェクト: dantrim/supersusy
def make_JointPlot(plot, region, data, backgrounds) :
    """Draw and save a seaborn joint plot for the sample named by ``plot.sample``.

    The sample is ``data`` if its name matches, otherwise the matching
    entries of ``backgrounds``; exactly one sample must match. Its tree is
    converted with ``tree2rec`` using the region cut weighted by
    ``eventweight`` and the sample's scale factor. The figure is saved as
    ``<plot.name>.eps`` and moved into ``indir + "/plots/" + outdir``.

    The process exits with a non-zero status when the number of matching
    samples is not exactly one, or when ``plot.cmap`` is not one of
    None / "default" / "cubehelix" / "blues".
    """

    sample_to_plot = []
    if data.name == plot.sample :
        sample_to_plot.append(data)
    if not sample_to_plot :
        for bk in backgrounds :
            if bk.name == plot.sample :
                sample_to_plot.append(bk)
    if len(sample_to_plot) != 1 :
        msg('ERROR make_JointPlot received %d samples to plot for plot with name %s'%(len(sample_to_plot), plot.name))
        # Signal the failure to the caller's shell: a bare sys.exit() reports success.
        sys.exit(1)

    # turn this tree into an array :)
    sample_to_plot = sample_to_plot[0]
    selection_ = '(' + region.tcut + ') * eventweight * ' + str(sample_to_plot.scale_factor)
    tree_array = tree2rec(sample_to_plot.tree, branches=[plot.x_var, plot.y_var],
                            selection=selection_)
    tree_array.dtype.names = (plot.x_var, plot.y_var)
    x_arr = tree_array[plot.x_var]
    y_arr = tree_array[plot.y_var]

    sns.set(style="white")

    # Statistic annotated on the plot: Kendall's tau on request, Pearson r
    # when none is configured, nothing for any other setting.
    stat_func_ = None
    if plot.stat_func == "kendalltau" :
        from scipy.stats import kendalltau
        stat_func_ = kendalltau
    elif plot.stat_func is None :
        from scipy.stats import pearsonr
        stat_func_ = pearsonr

    # Only the colour-related arguments differ between the colour maps.
    if plot.cmap is None or plot.cmap == "default" :
        extra_kws = {"color": plot.color, "linewidth": plot.line_width}
    elif plot.cmap == "cubehelix" :
        cmap_ = sns.cubehelix_palette(as_cmap=True, dark=0, light=1, reverse = True)
        extra_kws = {"linewidth": plot.line_width,
                     "joint_kws": {"cmap":cmap_, "n_levels":plot.n_levels, "shade":True}}
    elif plot.cmap == "blues" :
        extra_kws = {"linewidth": 1.0,
                     "joint_kws": {"cmap":"Blues", "n_levels":plot.n_levels, "shade":True, "shade_lowest":False}}
    else :
        msg("cmap attribute of joint plot not yet added")
        sys.exit(1)

    j_plot_grid = sns.jointplot(x_arr, y_arr, kind = plot.kind, stat_func=stat_func_,
                                ylim=[plot.y_range_min, plot.y_range_max],
                                xlim=[plot.x_range_min, plot.x_range_max],
                                **extra_kws)

    j_plot_grid.fig.suptitle(plot.title)
    j_plot_grid.fig.subplots_adjust(top=0.935)
    j_plot_grid.set_axis_labels(plot.x_label, plot.y_label)

    # save the plot to file
    outname = plot.name + ".eps"
    j_plot_grid.savefig(outname)
    out = indir + "/plots/" + outdir
    utils.mv_file_to_dir(outname, out, True)
    fullname = out + "/" + outname
    msg("%s saved to : %s"%(outname, os.path.abspath(fullname)))
コード例 #15
0
ファイル: plotter.py プロジェクト: verajohne/SEP_autoencoder
def plot_approx_posterior(cov, means, index):
	"""Draw a KDE joint plot of 200 samples from an approximate posterior.

	``means[index]`` and ``cov`` are combined with a zero-mean, identity
	covariance Gaussian through ``util.product_gaussians``. The samples are
	drawn from the result and plotted within 3 standard deviations of the
	mean on each axis.
	"""
	mean = means[index]
	# Call form of print: parses on Python 3 and prints the same on Python 2.
	print(mean.shape)
	mean, cov = util.product_gaussians(mean, np.zeros(2), cov, np.identity(2))
	data = np.random.multivariate_normal(mean, cov, 200)
	df = pd.DataFrame(data, columns=["x", "y"])
	std_x = np.sqrt(cov[0][0])
	std_y = np.sqrt(cov[1][1])
	xlim = (mean[0] - 3*std_x, mean[0] + 3*std_x)
	ylim = (mean[1] - 3*std_y, mean[1] + 3*std_y)
	sns.jointplot(x="x", y="y", data=df, kind="kde", stat_func= None, xlim = xlim, ylim = ylim)
コード例 #16
0
ファイル: show_graph.py プロジェクト: rb-roomba/music
def plot_var(times, pitches, ends, var_n):
    """ Show a time/height joint plot of variation [var_n]. """
    # var_n: 0 to 30 (0: Aria)
    # Keep the (time, pitch) pairs whose time lies in
    # (ends[var_n], ends[var_n+1]].
    selected = [pair for pair in zip(times, pitches)
                if ends[var_n] < pair[0] <= ends[var_n+1]]

    # seaborn
    df = pd.DataFrame(selected)
    df.columns = ["time","height"]
    seaborn.jointplot('time', 'height', data=df)
    plt.show()
コード例 #17
0
ファイル: decay.py プロジェクト: z01nl1o02/tests
 def show(self):
     """Show a joint plot of ``self._pr`` (flattened) against ``self._lams``.

     The title reports the sum of the element-wise products of the two.
     """
     probs = np.reshape(self._pr, (1, -1)).tolist()[0]
     lams = self._lams

     sns.jointplot(x='x', y='y', data=pd.DataFrame({'x': lams, 'y': probs}))

     mean = (np.asarray(lams) * np.asarray(probs)).sum()
     sns.plt.title('mean %f' % mean)
     sns.plt.show()
コード例 #18
0
ファイル: plotting.py プロジェクト: schevalier/MJHMC
def hist_2d(distribution, nsamples, **kwargs):
    """
    Plots a 2d kernel density joint plot of samples drawn from distribution

    The distribution is instantiated with ndims=2 and sampled nsamples times
    with MarkovJumpHMC; extra keyword arguments go to the sampler.
    """
    target = distribution(ndims=2)
    chain = MarkovJumpHMC(target.Xinit, target.E, target.dEdX, **kwargs).sample(nsamples)

    with sns.axes_style("white"):
        sns.jointplot(chain[0], chain[1], kind="kde", stat_func=None)
コード例 #19
0
def pairwise_joint_plots(df, cols):
    """Save a hexbin joint plot for every pair of columns in ``cols``.

    Pairs are formed from the sorted column names with the first name
    strictly less than the second. Each plot is written to
    'joint_<colA>_<colB>.png' in the current directory and its figure is
    closed afterwards.

    :param df: object indexable by column name (e.g. a DataFrame)
    :param cols: iterable of column names
    """
    logging.debug('Plotting pairwise joint distributions')
    cols = sorted(cols)
    for colA, colB in [(a,b) for a in cols for b in cols if a < b]:
        filename = 'joint_{}_{}.png'.format(colA, colB)
        logging.debug('joint plot: %s', filename)
        # jointplot creates its own figure, so no plt.figure() beforehand:
        # that would leave one empty, never-closed figure per pair.
        grid = sns.jointplot(x=df[colA], y=df[colB], kind='hex')
        grid.fig.savefig(filename)
        plt.close(grid.fig)
コード例 #20
0
ファイル: KernelDensity.py プロジェクト: ryscet/TopDown
def AnalyzeAllElectrodes():
    """From Jacek

    Read the all-electrodes CSV, add an '<band>_po' column for each band as
    '<band>_przed' + '<band>_roznica', draw regression joint plots of the
    'przed' column against both 'po' and 'roznica', and pass the przed/po
    columns to GeneralModel. Returns the augmented DataFrame.
    """
    path = '/Users/ryszardcetnarski/Desktop/Nencki/Badanie_NFB/Dane/wszystkie_elektrody_jacek.csv'
    db = pd.read_csv(path)

    for band in ['theta', 'alpha','smr', 'beta1', 'beta2']:
        # Polish column suffixes: przed = before, po = after, roznica = difference.
        before_col = band + '_przed'
        after_col = band + '_po'
        change_col = band + '_roznica'

        db[after_col] = db[before_col] + db[change_col]

        sns.jointplot(before_col, after_col, data=db, kind="reg")
        sns.jointplot(before_col, change_col, data=db, kind="reg")

        # Every row gets the same single condition.
        n_rows = len(db)
        conditions_str = ['mixed_conditions'] * n_rows
        conditions = [0] * n_rows
        GeneralModel(db[before_col], db[after_col], band, conditions, conditions_str)

    return db







#Kde using sklearn, returns object
   # kde = KernelDensity(kernel='tophat', bandwidth = 3).fit(initial[:, np.newaxis])
   # log_dens = kde.score_samples(x[:, np.newaxis])

    #Plot sklearn kernel estimate
   # kernel.plot(x, np.exp(log_dens), 'g')
    #Plot original data histogram


  #followUp = np.random.random_sample(100)
    #followUp= np.random.normal(20,10, 100)

    #followUp = np.random.normal(20,10, 100)#initial + np.random.normal(0,100,100)
    #followUp = np.random.random_sample(100)#initial + np.random.normal(0,100,100)
    #hist.hist(initial)

    #initial = np.random.normal(20,10, 100)
    #initial = np.random.random_sample(100)


    #Add noise to each observation
    #initial = #initial *0.95 + np.random.normal(100,100,100)
    #Make a follow-up by adding noise a second time to the same population
コード例 #21
0
ファイル: housing_prices.py プロジェクト: tarlen5/coursera_ml
def plotCorrelation(frame):
    """Show joint plots of "bedrooms" and "size" against "price" from ``frame``.

    Both figures are built first and then shown together by ``plt.show()``.
    """
    # One joint plot per predictor, to visualize each dimension against price.
    for predictor in ("bedrooms", "size"):
        sns.jointplot(predictor, "price", frame, size=8)
        plt.tight_layout()

    print("PAUSED...close figures to continue...")
    plt.show()
    return
コード例 #22
0
ファイル: plotting.py プロジェクト: schevalier/MJHMC
def gauss_2d(nsamples=1000):
    """
    Another simple test plot
    A 1d gaussian is sampled by the Control and ContinuousTimeHMC samplers;
    the two sets of samples are shown against each other as a hexbin joint plot
    """
    gaussian = misc.distributions.TestGaussian(ndims=1)
    control = Control(gaussian.Xinit, gaussian.E, gaussian.dEdX)
    experimental = ContinuousTimeHMC(gaussian.Xinit, gaussian.E, gaussian.dEdX)

    with sns.axes_style("white"):
        control_draws = control.sample(nsamples)[0]
        experimental_draws = experimental.sample(nsamples)[0]
        sns.jointplot(control_draws, experimental_draws, kind="hex", stat_func=None)
コード例 #23
0
ファイル: drawPlot.py プロジェクト: WQ-huziang/WQ-Testcode
 def drawJointPlot(self, se1, se2):
     """
     Draw a regression joint plot showing how series 1 and series 2 correlate
         :param self: the instance itself
         :param se1: series 1
         :param se2: series 2
     """
     line_color = self.linecolors[0]
     sns.jointplot(se1, se2, kind='reg', color=line_color)
     # plt.title(self.title)
     plt.legend()
     plt.show()
コード例 #24
0
ファイル: analysis.py プロジェクト: EhsanTadayon/alleninf
def fixed_effects(data, labels):
    """Report and plot the Pearson correlation between two columns of ``data``.

    The coefficient and two-tailed p value for ``data[labels[0]]`` against
    ``data[labels[1]]`` are printed. A hexbin joint plot is then drawn,
    followed by a regression joint plot that reuses the hexbin plot's axis
    limits.

    :param data: table indexable by column label
    :param labels: sequence whose first two items are the column labels
    :returns: ``(corcoeff, p_val)`` as returned by ``pearsonr``
    """
    x_label, y_label = labels[0], labels[1]

    corcoeff, p_val = pearsonr(data[x_label], data[y_label])
    # Call form of print: parses on Python 3 and prints the same on Python 2.
    print("Pearson correlation between %s and %s across all donors is %g (two tailed p value = %g)"%(x_label, y_label, corcoeff, p_val))

    grid = sns.jointplot(x=x_label, y=y_label, data=data, kind="hex")
    sns.jointplot(x=x_label, y=y_label, data=data, kind="reg",
                  xlim=grid.ax_joint.get_xlim(),
                  ylim=grid.ax_joint.get_ylim())
    plt.show()

    return corcoeff, p_val
コード例 #25
0
ファイル: assembly.py プロジェクト: Hensonmw/jcvi
def covlen(args):
    """
    %prog covlen covfile fastafile

    Plot coverage vs length. `covfile` is two-column listing contig id and
    depth of coverage.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is program output and is kept verbatim.
    import numpy as np
    import pandas as pd
    import seaborn as sns
    from jcvi.formats.base import DictFile

    p = OptionParser(covlen.__doc__)
    p.add_option("--maxsize", default=1000000, type="int", help="Max contig size")
    # This help text previously repeated "Max contig size".
    p.add_option("--maxcov", default=100, type="int", help="Max contig coverage")
    p.add_option("--color", default='m', help="Color of the data points")
    p.add_option("--kind", default="scatter",
                 choices=("scatter", "reg", "resid", "kde", "hex"),
                 help="Kind of plot to draw")
    opts, args, iopts = p.set_image_options(args, figsize="8x8")

    if len(args) != 2:
        sys.exit(not p.print_help())

    covfile, fastafile = args
    cov = DictFile(covfile, cast=float)
    s = Sizes(fastafile)
    maxsize, maxcov = opts.maxsize, opts.maxcov

    # Keep contigs within both limits; contigs missing from covfile count
    # as coverage 0.
    data = []
    for ctg, size in s.iter_sizes():
        c = cov.get(ctg, 0)
        if size > maxsize or c > maxcov:
            continue
        data.append((size, c))

    if not data:
        # zip(*[]) cannot be unpacked into x and y; report it instead.
        logging.error("No contigs within --maxsize/--maxcov; nothing to plot")
        return

    x, y = zip(*data)
    x = np.array(x)
    y = np.array(y)
    logging.debug("X size {0}, Y size {1}".format(x.size, y.size))

    df = pd.DataFrame()
    xlab, ylab = "Length", "Coverage of depth (X)"
    df[xlab] = x
    df[ylab] = y
    sns.jointplot(xlab, ylab, kind=opts.kind, data=df,
                  xlim=(0, maxsize), ylim=(0, maxcov),
                  stat_func=None, edgecolor="w", color=opts.color)

    figname = covfile + ".pdf"
    savefig(figname, dpi=iopts.dpi, iopts=iopts)
コード例 #26
0
ファイル: movie_data_handle_v1.py プロジェクト: fzhurd/fzwork
def main():
    """Explore '../input/movie_metadata.csv': print summaries and show plots.

    Prints the head, null counts, shapes and dtypes of the raw data and of
    the rows without missing values, filters by imdb_score, adds a 'profit'
    column ((gross - budget) / gross * 100) to the complete rows, and shows
    a correlation heat map plus two profit joint plots.
    """
    movie_raw_data = pd.read_csv('../input/movie_metadata.csv')
    # print is written in call form so the block parses on Python 3 and
    # prints the same text on Python 2.
    print(movie_raw_data.head(3))

    print(movie_raw_data.isnull().sum())

    print(movie_raw_data.shape)
    # Explicit copy: the 'profit' column is added to this frame below, which
    # would otherwise raise pandas' SettingWithCopyWarning.
    movie_raw_data_dropna = movie_raw_data.dropna().copy()
    print(movie_raw_data_dropna.shape)
    print(movie_raw_data.dtypes)

    movie_filterd_imdbscore_first = movie_raw_data.loc[movie_raw_data['imdb_score'] > 5]
    movie_filterd_imdbscore_from_raw = movie_raw_data.loc[movie_raw_data['imdb_score'] < 8]

    print(movie_filterd_imdbscore_first.shape)

    # Build the mask from the frame being filtered rather than the raw frame.
    movie_filterd_imdbscore_second = movie_filterd_imdbscore_first.loc[
        movie_filterd_imdbscore_first['imdb_score'] < 8]

    print(movie_filterd_imdbscore_second.shape)
    print(movie_filterd_imdbscore_from_raw.shape)

    print('*********************************')

    print(movie_raw_data_dropna.head(3))
    gross = movie_raw_data_dropna['gross'].values
    budget = movie_raw_data_dropna['budget'].values
    profit = (gross - budget) / gross * 100
    print(profit)

    movie_raw_data_dropna['profit'] = profit
    print(movie_raw_data_dropna.shape)
    print(movie_raw_data_dropna.head(3))

    # Correlate numeric columns only; newer pandas raises on text columns.
    corr = movie_raw_data_dropna.select_dtypes(include=['number', 'bool']).corr()
    print(corr)

    f, ax = plt.subplots(figsize=(11, 9))
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    sns.heatmap(corr, cmap=cmap, vmax=1,
            square=True,
            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

    sns.jointplot(x="title_year", y="profit",kind='scatter',size=10,ylim = [0,110],xlim=[1980,2020],data=movie_raw_data_dropna)
    sns.jointplot(x="imdb_score", y="profit",kind='reg',size=10,ylim = [0,110],data=movie_raw_data_dropna)

    plt.show()
コード例 #27
0
ファイル: c5.py プロジェクト: 3774257/abu
def sample_54_1():
    """
    5.4 Visualizing data with seaborn
    :return:
    """
    # Distribution of the p_change column
    price_change = tsla_df['p_change']
    sns.distplot(price_change, bins=80)
    plt.show()

    # p_change grouped by date_week
    sns.boxplot(x='date_week', y='p_change', data=tsla_df)
    plt.show()

    # high against low
    high_prices, low_prices = tsla_df['high'], tsla_df['low']
    sns.jointplot(high_prices, low_prices)
    plt.show()
def occupationAnalysis():
    """Accumulate robot positions from three game logs and plot their density.

    Every line of each log is split on ','. Field 0 is printed, and fields 1
    and 2 are read as the x and y position. Each position increments one
    cell of an (nbcols, nbrows) occupancy grid, and the raw positions are
    drawn as a KDE joint plot.

    NOTE(review): positions are presumably normalised to [0, 1) so that
    ``int(pos * nbcols)`` is a valid grid index — confirm against the logs.

    :returns: the occupancy grid (cell visit counts) as a numpy array
    """
    log_files = (
        "./csv/windfield_game1_green_withindex_position.csv",
        "./csv/windfield_game1_orange_withindex_position.csv",
        "./csv/windfield_game1_blue_withindex_position.csv",
    )

    data = np.zeros((nbcols, nbrows))
    xs = []
    ys = []
    for log_file in log_files:
        for robotp in readLog(log_file):
            robotp = robotp.split(',')
            print(robotp[0])
            x_pos = float(robotp[1])
            y_pos = float(robotp[2])
            data[int(x_pos * nbcols)][int(y_pos * nbrows)] += 1
            xs.append(x_pos)
            ys.append(y_pos)

    # jointplot needs the observations themselves: the count grid has no
    # "x"/"y" columns, so it cannot be passed as `data=` with column names.
    sns.jointplot(x=np.array(xs), y=np.array(ys), kind="kde")
    return data
コード例 #29
0
ファイル: visualise.py プロジェクト: vianziro/msc-thesis
def performance_vs_coverage(db, output=None, max_values=250, **kwargs):
    """Plot performance against coverage for the rows of ``param_stats``.

    The (performance, coverage) rows are read from ``db`` and loaded into a
    frame whose columns are named "Performance" and "Legality". A joint plot
    with both axes limited to [0, 1] is drawn and then handed to
    ``viz.finalise(output, **kwargs)``. ``max_values`` is not used.
    """
    query = (
        "SELECT "
        "    performance AS performance, "
        "    coverage "
        "FROM param_stats"
    )
    rows = list(db.execute(query))

    frame = pandas.DataFrame(rows, columns=("Performance", "Legality"))
    sns.jointplot("Legality", "Performance", data=frame,
                  xlim=(0, 1), ylim=(0, 1))
    viz.finalise(output, **kwargs)
コード例 #30
0
 def show(self):
     """Print the most probable hypotheses and plot probability by index.

     The 20 entries of ``self.pr`` (a single-row 2-D array) with the highest
     values are printed in ascending order with their hypotheses, followed
     by the overall maximum. A joint plot of hypothesis index against
     probability is then shown.
     """
     pos = np.argsort(self.pr)[0][-20:]
     for k in pos:
         # Formatted print call: the same text on Python 2 and Python 3.
         print('%s %s' % (self.hypos[k], self.pr[0,k]))
     pos = np.argmax(self.pr)
     print('max %s pr= %s' % (self.hypos[pos], self.pr[0,pos]))
     # x axis is simply the position of each hypothesis in self.hypos
     X = list(range(len(self.hypos)))
     Y = self.pr.tolist()[0]
     df = pd.DataFrame({'x':X,'y':Y})
     sns.jointplot(x='x',y='y',data=df)
     sns.plt.show()
コード例 #31
0
# In[25]:

# Compare how the target classes are distributed between the sexes
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x=df['target'], hue=df['sex'], ax=ax)
plt.xlabel('target')
plt.ylabel('sex')
plt.xticks(rotation=50)
# plt.show must be called; the bare name `plt.show` does nothing.
plt.show()

# In[26]:

nums = ['age', 'sex', 'trestbps', 'chol', 'trestbps', 'target']
for i in nums:
    # jointplot always creates its own figure, so a preceding plt.figure()
    # only leaves an empty figure behind. Size, labels and grid are set
    # through the returned grid instead of pyplot's current-axes state.
    g = sns.jointplot(x=df[i], y=df['target'], kind='reg', height=10)
    g.set_axis_labels(i, 'response')
    g.ax_joint.grid(True)
    plt.show()

# In[8]:

plt.bar(df['target'], df['age'], alpha=.5, width=0.8, label='chart')
plt.show()

# In[62]:

sns.catplot(x='sex', y='target', data=df, kind='box', hue='fbs')

# In[53]:
コード例 #32
0
# Summary statistics of the house cost computed earlier in the script
print("Minimum Cost: ${}".format(_min_cost))
print("Maximum Cost: ${}".format(_max_cost))
print("Mean Cost: ${}".format(_mean_cost))
print("Median Cost ${}".format(_median_cost))
print("Standard deviation of Cost: ${}".format(_stddev_cost))


# Bar chart of how many houses have each bedroom count
_housedata['bedrooms'].value_counts().plot(kind='bar')
plt.title('Total number of Bedroom')
plt.xlabel('Bedrooms')
plt.ylabel('Count of Bedrooms')
plt.show()

# Latitude against longitude. jointplot creates its own figure, so no
# plt.figure() is opened first (it would stay empty), and the labels are set
# on the returned grid rather than on whichever axes pyplot considers current.
_latlong_grid = sns.jointplot(x=_housedata.lat.values, y=_housedata.long.values, size=10)
_latlong_grid.set_axis_labels('Latitude of House', 'Longitude of House', fontsize=12)
plt.show()

plt.scatter(_housedata.price,_housedata.sqft_living)
plt.title("Price of House vs Square Feet of House")
plt.show()

plt.scatter(_housedata.price,_housedata.long)
plt.title("Price of House vs Location of the house area")
plt.show()

plt.scatter(_housedata.price,_housedata.lat)
コード例 #33
0
    names = ['variance','skewness','curtosis','entropy','class'])

# Quick inspection of the loaded frame. These are bare expressions: their
# results are only displayed in an interactive session and are discarded when
# the file runs as a script.
data.head(3)
data.describe()
data.shape
data.isna().any()
data.dtypes
data['class'].unique()
# Count of rows per class label
sns.countplot(x='class', data= data)
# Violin plot of each feature column
sns.violinplot( y=data['curtosis'])
sns.violinplot( y=data['entropy'])
sns.violinplot( y=data['variance'])
sns.violinplot( y=data['skewness'])
# Shaded density curves of 'curtosis' (red) and 'variance' (blue); p1 is
# rebound by the second call.
p1=sns.kdeplot(data['curtosis'], shade=True, color="r")
p1=sns.kdeplot(data['variance'], shade=True, color="b")
# Hexbin joint plots of selected feature pairs
sns.jointplot(x=data['curtosis'], y=data['entropy'], kind='hex', linewidth = 2)
sns.jointplot(x=data['skewness'], y=data['variance'], kind='hex', color = 'skyblue', linewidth = 2)
sns.jointplot(x=data['curtosis'], y=data['variance'], kind='hex', linewidth = 2)
# Feature matrix and label column; y is a one-column DataFrame because of the
# double brackets.
X = data[['variance', 'skewness' ,'curtosis', 'entropy']]
y = data[['class']]

# Hold out 20% of the rows for testing, with a fixed seed for repeatability
from sklearn.model_selection import train_test_split # Support Vector Machine
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

from sklearn.svm import SVC
# NOTE(review): this first SVC() instance is created and immediately
# discarded; only `svc` below is kept.
SVC()
svc = SVC()

# Creating a dictionary of parameters

parameters = {
コード例 #34
0
ファイル: Langevin-MC.py プロジェクト: XanderJC/MCMC-Project
    def kde(self, n=0):
        """Draw a KDE joint plot of ``self.samples[:, n, 0]`` against ``self.samples[:, n, 1]``."""
        first_coord = self.samples[:, n, 0]
        second_coord = self.samples[:, n, 1]
        sns.jointplot(x=first_coord, y=second_coord, kind="kde")
def generate_plots(plot_type=""):
    r"""
    Plot how graph sizes (|V| and |E|) are distributed in the test, valid and train splits.

    For each split, every graph's node count (first sequence length minus 2)
    and edge count (sum of the target adjacency entries) are collected and
    their min/mean/max printed. Histograms and joint plots are saved per
    split; the marginal KDE plots overlay all splits on one axes and are
    saved once at the end. All images are written under ``plots/``.

    :param plot_type: type of plot in {"histograms", "marginal_E", "marginal_V", "joint"}
    """
    assert plot_type in {"histograms", "marginal_E", "marginal_V", "joint"}

    all_node_counts = []
    all_edge_counts = []
    for split_name in ("test", "valid", "train"):
        dataset = ToulouseRoadNetworkDataset(split=split_name,
                                             step=0.001,
                                             max_prev_node=8)
        loader = DataLoader(dataset,
                            batch_size=1,
                            shuffle=False,
                            collate_fn=custom_collate_fn)

        node_counts = []
        edge_counts = []
        # Each datapoint is a 7-tuple; only the target adjacency (3rd item)
        # and the sequence lengths (6th item) are needed here.
        for _, _, y_adj, _, _, seq_len, _ in loader:
            edge_counts.append(int(y_adj.view(-1).sum().item()))
            node_counts.append(int(seq_len[0] - 2))

        all_edge_counts.extend(edge_counts)
        all_node_counts.extend(node_counts)
        node_counts = np.array(node_counts)
        edge_counts = np.array(edge_counts)

        print(f"{split_name} min/mean/max len nodes", np.min(node_counts),
              np.mean(node_counts), np.max(node_counts))
        print(f"{split_name} min/mean/max len edges", np.min(edge_counts),
              np.mean(edge_counts), np.max(edge_counts))

        if plot_type == "histograms":
            # One bin per integer value between the observed min and max.
            node_bins = np.max(node_counts) - np.min(node_counts) + 1
            plt.hist(node_counts, bins=node_bins)
            plt.title(f"Histogram of |V| for {split_name}")
            plt.savefig(f"plots/histogram_|V|_{split_name}.png")
            plt.clf()

            edge_bins = np.max(edge_counts) - np.min(edge_counts) + 1
            plt.hist(edge_counts, bins=edge_bins)
            plt.title(f"Histogram of |E| for {split_name}")
            plt.savefig(f"plots/histogram_|E|_{split_name}.png")
            plt.clf()
        elif plot_type == "marginal_V":
            node_axes = sns.kdeplot(node_counts, bw=.5, shade=True, label=split_name)
        elif plot_type == "marginal_E":
            edge_axes = sns.kdeplot(edge_counts, bw=.5, shade=True, label=split_name)
        else:
            joint_grid = sns.jointplot(np.log10(node_counts),
                                       np.log10(edge_counts),
                                       marginal_kws=dict(kernel="gau", bw=.02),
                                       kind="kde",
                                       bw=.05)
            joint_axes = joint_grid.ax_joint
            joint_axes.set_xlabel("log10 |V|", fontsize=15)
            joint_axes.set_ylabel("log10 |E|", fontsize=15)
            joint_grid.ax_marg_x.set_title(split_name, fontsize=20)
            joint_axes.set_xlim(0.6, 1.2)
            joint_axes.set_ylim(0.4, 1.2)
            joint_grid.savefig(f"plots/joint_{split_name}.png")

    all_node_counts = np.array(all_node_counts)
    all_edge_counts = np.array(all_edge_counts)
    print("min/mean/max len nodes", np.min(all_node_counts),
          np.mean(all_node_counts), np.max(all_node_counts))
    print("min/mean/max len edges\n", np.min(all_edge_counts),
          np.mean(all_edge_counts), np.max(all_edge_counts))

    # The marginal plots are finished and saved once, after all splits.
    if plot_type == "marginal_V":
        node_axes.set_xlabel("|V|")
        node_axes.set_ylabel("p(x)")
        node_axes.set_title("Distributions of |V|")
        node_axes.legend()
        node_axes.figure.savefig("plots/marginal_|V|.png")
        node_axes.figure.clf()

    if plot_type == "marginal_E":
        edge_axes.set_xlabel("|E|")
        edge_axes.set_ylabel("p(x)")
        edge_axes.set_title("Distributions of |E|")
        edge_axes.legend()
        edge_axes.figure.savefig("plots/marginal_|E|.png")
        edge_axes.figure.clf()

    print("Done!")
コード例 #36
0
# Two figures via the project helper createFigure: earnings yield against
# return on capital (saved as 'ey_roc.png'), and the same comparison on ranks
# (saved as 'ey_roc_rank.png').
createFigure(
    figure_data_without_zynex, 'EY_ROC', EARNINGS_YIELD, 'Return On Capital (%)',
    'Earnings Yield (%)', 'ey_roc.png', 'lower right',
    vscaling=1.2, hscaling=2)
createFigure(
    figure_data, 'total_rank', 'EY_rank', 'Rank Return On Capital',
    'Rank Earnings Yield', 'ey_roc_rank.png', 'upper right',
    number_format='%d', vscaling=1.2, hscaling=2)

# Drop outliers: keep rows between the 5% and 95% quantiles of earnings yield,
# then of ROC. The ROC quantiles are computed on the already-filtered frame.
df_capped = df[df[EARNINGS_YIELD].between(
    df[EARNINGS_YIELD].quantile(0.05), df[EARNINGS_YIELD].quantile(0.95))]
df_capped = df_capped[df_capped['ROC'].between(
    df_capped['ROC'].quantile(0.05), df_capped['ROC'].quantile(0.95))]

# Save density plot (KDE joint plot of earnings yield against ROC)
ax = sb.jointplot(EARNINGS_YIELD, 'ROC', data=df_capped, kind='kde', color="g")
ax.set_axis_labels('Earnings Yield (%)', 'Return On Capital (%)')
plt.tight_layout()
plt.savefig('density_plot.png', format='png')

# Clear the current figure before drawing the next plot
plt.clf()

# Create industry histogram: row count per SECTOR value, with the tick labels
# rotated 45 degrees
ax = sb.countplot(x=SECTOR, data=figure_data, palette='Blues_d')
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_ylabel('Amount')
ax.set_xlabel('Industry')
plt.tight_layout()
plt.savefig('industry_histogram.png', format='png')
コード例 #37
0
ファイル: tp5.py プロジェクト: anouel/cours-2017-2018
#%%
# Histogram of price
seaborn.distplot(ordis.price)

#%%
# Box plot of price
seaborn.factorplot("price", data=ordis, kind="box")

#%%
# Violin plot of price
seaborn.factorplot("price", data=ordis, kind="violin")

#%%
# Relationship between price and the quantitative variables (speed, hd)
seaborn.factorplot("speed", "price", data=ordis)
seaborn.jointplot("hd", "price", data=ordis, kind="reg")

#%%
# Relationship between price and the qualitative variables (ram, cd, premium, screen)
seaborn.factorplot("ram", "price", data=ordis, kind="box")
seaborn.factorplot("cd", "price", data=ordis, kind="box")
seaborn.factorplot("premium", "price", data=ordis, kind="box")
seaborn.factorplot("screen", "price", data=ordis, kind="box")

#%%
# price ~ speed and hd: mean price for each (hd bin, speed) combination,
# with hd cut into 6 bins, shown as a heat map
t = pandas.crosstab(pandas.cut(ordis.hd, 6, precision=0),
                    ordis.speed,
                    values=ordis.price,
                    aggfunc=numpy.mean)
seaborn.heatmap(t, cmap="Blues", cbar_kws={'label': 'mean price'})
コード例 #38
0
# In[18]:

# Grid of pairwise plots over the summary frame, colored by the EV column.
sns.pairplot(sub_task_summary_Output, hue='EV', palette='Set1')

# In[20]:

# Line plot of the EV column against the frame's index.
sub_task_summary_Output['EV'].plot(figsize=(20, 12))

# In[26]:

# In[65]:

plt.figure(figsize=(12, 8))

# One joint plot of EV against each of SPI, CPI and EAC, each in its own color.
for _metric, _shade in (('SPI', 'hotpink'), ('CPI', 'red'), ('EAC', 'blue')):
    sns.jointplot(data=sub_task_summary_Output, x=_metric, y='EV', color=_shade)

#

# In[41]:

# In[55]:

# In[56]:

# In[66]:

# In[67]:
コード例 #39
0
ファイル: plotsV1.py プロジェクト: bjonnh/AMBER
def heatscatter_sns(x, y, figsize=(8, 8)):
    """Draw a KDE joint plot of ``x`` against ``y`` on seaborn's white theme.

    Args:
        x: Values for the horizontal axis, forwarded to ``sns.jointplot``.
        y: Values for the vertical axis, forwarded to ``sns.jointplot``.
        figsize: ``(width, height)`` of the resulting figure, in inches.

    Returns:
        The ``JointGrid`` produced by ``sns.jointplot`` (previously nothing
        was returned, so existing callers are unaffected).
    """
    sns.set(style="white", color_codes=True, rc={'figure.figsize': figsize})
    grid = sns.jointplot(x=x, y=y, kind='kde', color="skyblue")
    # jointplot creates its own square figure and does not consult the
    # 'figure.figsize' rc entry, so the requested size is applied explicitly.
    grid.ax_joint.figure.set_size_inches(*figsize)
    return grid
コード例 #40
0
# Number of rows per country, drawn as horizontal bars.
plt.figure(figsize=(10, 25))
sns.countplot(y='country', data=dataset, alpha=alpha)
plt.title('Data by country')
plt.show()

# Number of rows per sex (male vs. female).
plt.figure(figsize=(7, 7))
sex = sns.countplot(x='sex', data=dataset)

# Correlation between the columns of the data set, annotated with values.
plt.figure(figsize=(16, 7))
cor = sns.heatmap(dataset.corr(), annot=True)

# KDE joint plot of year against suicides_no, saved to graph.png.
# NOTE(review): positional x/y and the `size` keyword are only accepted by
# older seaborn releases (newer ones expect x=/y= and `height`) -- confirm
# against the seaborn version this script is run with.
g = sns.jointplot(dataset.year,
                  dataset.suicides_no,
                  kind="kde",
                  color="#bfa9e0",
                  size=7)
plt.savefig('graph.png')

# Bar plot of suicides_no by sex, split by age group.
plt.figure(figsize=(16, 7))
bar_age = sns.barplot(x='sex', y='suicides_no', hue='age', data=dataset)

# Bar plot of suicides_no by sex, split by generation.
plt.figure(figsize=(16, 7))
bar_gen = sns.barplot(x='sex', y='suicides_no', hue='generation', data=dataset)

cat_accord_year = sns.catplot('sex',
                              'suicides_no',
                              hue='age',
コード例 #41
0
# Build a frame from the iris feature matrix and append the class as 'target'.
df = DataFrame(iris.data,columns = iris.feature_names)
df['target'] = iris.target
print(df)

# Data visualization
import pandas as pd
from scipy import stats,integrate
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes = True)
# Visualize the data distribution: histogram and density function.
# By default distplot() draws both the histogram and the density estimate.
sns.distplot(df['petal length (cm)'],bins = 15)

# jointplot() draws a scatter plot together with the marginal histograms.
sns.jointplot(x = 'sepal length (cm)',y = 'sepal width (cm)',data = df,size =8)


# Grouped scatter plot.
# seaborn.FacetGrid is used to mark the different classes (hue = 'target').
sns.FacetGrid(df,hue = 'target',size =8).map(plt.scatter,'sepal length (cm)','sepal width (cm)').add_legend()


# Hexbin plot.
# NOTE(review): axes_style() is called outside a `with` block and its return
# value is discarded, so this line probably has no effect -- confirm intent.
sns.axes_style('white')
sns.jointplot(x = 'sepal length (cm)',y = 'sepal width (cm)',data = df,kind = 'hex',color = 'r')

# Two-dimensional kernel density estimate plot.
g = sns.jointplot(x = 'sepal length (cm)',y = 'sepal width (cm)',data = df,kind = 'kde',color = 'm')
# Overlay the individual points as white '+' markers.
g.plot_joint(plt.scatter,c='w',s=30,linewidth=1,marker='+')
コード例 #42
0
# Distribution of age: normalised histogram only, then density curve only.
sns.distplot(bd['age'], kde=False, norm_hist=True, bins=10)
sns.distplot(bd['age'], hist=False)
sns.distplot(bd['age'], hist=False)

# NOTE(review): `myplot` is not assigned anywhere above in this excerpt;
# presumably it is set earlier in the script -- verify.
myimg = myplot.get_figure()
myimg.savefig('distplot.png')

sns.kdeplot(bd['age'])  # other distribution plot, less used
sns.kdeplot(bd['age'], shade=True)  # shade area
sns.kdeplot(bd['pdays'], shade=True)

# boxplot returns a matplotlib Axes, so get_figure() is available here.
myplot = sns.boxplot(y='age', data=bd)
myimg = myplot.get_figure()
myimg.savefig('boxplot.png')

# jointplot returns a JointGrid, which has no get_figure() method; the grid's
# own savefig() writes its figure instead.
myplot = sns.jointplot(x='age', y='balance', data=bd.iloc[:500, :])
myplot.savefig('jointplot.png')
myplot = sns.jointplot(x='age',
                       y='balance',
                       data=bd.iloc[:100, :],
                       kind='hex',
                       size=10)
# In the hex plot a lighter colour means lower density.
help(sns.jointplot)
sns.jointplot(x='age',
              y='duration',
              data=bd.iloc[:100, :],
              kind='kde',
              size=10)
myplot = sns.lmplot(x='age', y='balance', data=bd.iloc[1:10, :])
コード例 #43
0
# Distribution plots of the 'x' and 'y' columns, drawn one after the other.
sns.distplot(data['x'])
sns.distplot(data['y'])

# In[9]:

# Shaded density estimate for each of the two columns.
for col in 'xy':
    sns.kdeplot(data[col], shade=True)

# In[10]:

# kdeplot called with the whole frame instead of a single column.
sns.kdeplot(data)

# In[12]:

# KDE joint plot of x against y, with the white axes style applied only
# inside the `with` block.
with sns.axes_style('white'):
    sns.jointplot("x", "y", data, kind='kde')

# In[13]:

# Same joint plot, using hexagonal bins.
with sns.axes_style('white'):
    sns.jointplot("x", "y", data, kind='hex')

# In[14]:

# Grid of pairwise plots over all columns of the frame.
sns.pairplot(data)

# In[20]:

import plotly.graph_objs as go
import numpy as np
# 2000 samples drawn from the standard normal distribution.
x = np.random.randn(2000)
コード例 #44
0
ファイル: testseaborn.py プロジェクト: Alafazam/seabornplots
    5, 5, 5, 5, 5, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 1, 11, 10, 10, 10, 10,
    10, 10, 10, 8, 3, 7, 3, 2, 2, 2, 11, 7, 7, 11, 11, 9, 9, 8, 8, 8, 8, 7, 7,
    7, 7, 7, 7, 7, 7, 7, 6, 12, 12, 12, 11, 11, 11, 9, 9, 9, 9, 9, 11, 11, 10,
    1, 12, 12, 12, 3, 2, 12, 11, 11, 11, 11, 11, 11, 11, 10, 3, 11, 11, 2, 2,
    1, 1, 1, 12, 12, 12, 12, 12, 12, 12, 6, 6
]
# Values plotted on the vertical axis of the joint plot below.
y = [
    30, 29, 29, 24, 19, 11, 9, 8, 7, 3, 57, 54, 52, 34, 30, 29, 8, 1, 49, 44,
    33, 31, 29, 29, 28, 27, 2, 6, 5, 52, 41, 36, 18, 27, 26, 46, 32, 35, 33,
    15, 14, 10, 0, 51, 49, 44, 43, 28, 27, 26, 19, 16, 56, 21, 19, 16, 49, 43,
    39, 25, 23, 22, 21, 13, 23, 1, 13, 17, 59, 55, 54, 10, 59, 1, 59, 57, 27,
    25, 22, 21, 4, 49, 59, 31, 30, 5, 0, 8, 6, 0, 39, 37, 35, 31, 27, 25, 18,
    11, 9
]

# print rs
# x = rs.gamma(12, size=60)
# y = 2 + rs.gamma(60,size=60)
# x = rs.gamma(2, size=1000)

# print 'y = '+ str(y)

# Hexbin joint plot of x against y; kendalltau is passed as the statistic
# function for the plot.
graph = sns.jointplot(x, y, kind="hex", stat_func=kendalltau, color="#4CB391")

# x = np.random.normal(size=100)
# print 'x = '+ str(x)
# graph = sns.distplot(x);

# Save the figure next to the running script (script path + ".png"), then
# display it.
# NOTE(review): `sns.plt` and `stat_func` rely on an old seaborn API --
# confirm the seaborn version this script targets.
sns.plt.savefig(__main__.__file__ + ".png")
# graph.pyplot.show()
sns.plt.show()
コード例 #45
0
# Kurtosis of the T_MAX column.
print("Kurtosis:")
print(data_set['T_MAX'].kurtosis())

## Graph T MAX / CO & O3
# Rows are sorted by T_MAX (ties broken by the pollutant column) so that the
# line is drawn in order of increasing T_MAX.
df = data_set.sort_values(['T_MAX', 'CO'], ascending=True)
plt.plot(df['T_MAX'], df['CO'])
plt.title("La concentración de CO frente a la temperatura máxima")
plt.show()

df = data_set.sort_values(['T_MAX', 'O3'], ascending=True)
plt.plot(df['T_MAX'], df['O3'])
plt.title("La concentración de Ozono frente a la temperatura máxima")
plt.show()

## Joint plots with a regression fit: T_MAX vs. CO, then T_MAX vs. O3
sns.jointplot(data_set['T_MAX'], data_set['CO'], kind="reg")
plt.show()
plt.close()
sns.jointplot(data_set['T_MAX'], data_set['O3'], kind="reg")
plt.show()
plt.close()

## Correlation Matrix
# NOTE: this binds a second name to the same DataFrame object; it is not a
# copy, so changes made through data_set_corr are visible through data_set.
data_set_corr = data_set
data_set_corr['Mes'] = data_set_corr['Mes'].map({
    'ENE': 1,
    'FEB': 2,
    'MAR': 3,
    'ABR': 4,
    'MAY': 5,
    'JUN': 6,
コード例 #46
0
# Marginal histogram axes; the tick labels on the side facing the scatter
# plot are hidden.
ax_histx = plt.axes(rect_histx)
ax_histx.tick_params(direction='in', labelbottom=False)
ax_histy = plt.axes(rect_histy)
ax_histy.tick_params(direction='in', labelleft=False)

# the scatter plot:
ax_scatter.scatter(x, y)

# now determine nice limits by hand:
# round the largest absolute value in x and y up to a multiple of binwidth
# and use it as a symmetric limit on both axes.
binwidth = 0.25
lim = np.ceil(np.abs([x, y]).max() / binwidth) * binwidth
ax_scatter.set_xlim((-lim, lim))
ax_scatter.set_ylim((-lim, lim))

# Both histograms share the same bin edges, spanning [-lim, lim].
bins = np.arange(-lim, lim + binwidth, binwidth)
ax_histx.hist(x, bins=bins)
ax_histy.hist(y, bins=bins, orientation='horizontal')

# Align the histogram axes with the scatter axes.
ax_histx.set_xlim(ax_scatter.get_xlim())
ax_histy.set_ylim(ax_scatter.get_ylim())

plt.show()

# Seaborn version
import numpy as np
import seaborn as sns
#sns.set(style="ticks")

# Default joint plot, then a hexbin variant of the same data.
sns.jointplot(x, y)
sns.jointplot(x, y, kind="hex", color="#4CB391")
# Visualization
matplotlib.rcdefaults()

# Box plot of every column. plt.show() is called without arguments because
# current matplotlib accepts `block` by keyword only.
df.plot(kind='box')
plt.show()

# pandas removed the `display.mpl_style` option; matplotlib's 'ggplot' style
# sheet is the replacement for that ggplot-like theme.
plt.style.use('ggplot')
df.plot(kind='box')

sns.boxplot(data=df, width=0.5)
sns.violinplot(data=df, width=3.5)

# `.iloc` replaces the removed `DataFrame.ix` indexer for positional access.
sns.distplot(df.iloc[:, 2], rug=True, bins=15)
plt.show()

# x/y are passed by keyword: recent seaborn only accepts `data` positionally.
with sns.axes_style("white"):
    sns.jointplot(x=df.iloc[:, 1], y=df.iloc[:, 2], kind="kde")
    plt.show()

sns.lmplot(x="Benguet", y="Ifugao", data=df)
plt.show()

#Creating custom function
def add_2int(x, y):
    """Return the result of ``x + y``."""
    total = x + y
    return total
print(add_2int(2,2))

# an algorithm example
def case(n=10,mu=3,sigma=np.sqrt(5),p=0.025,rep=100):
    m=np.zeros((rep,4))

    for i in range(rep):
        norm = np.random.normal(loc = mu, scale = sigma, size = n)
        xbar = np.mean(norm)
コード例 #48
0
ファイル: RS.py プロジェクト: afcarl/PythonDS-MLBootcamp
df.head()

import matplotlib.pyplot as plt
import seaborn as sns

# Titles with the highest mean rating, then titles with the most ratings.
df.groupby('title')['rating'].mean().sort_values(ascending=False).head()
df.groupby('title')['rating'].count().sort_values(ascending=False).head()

# One row per title: mean rating plus the number of ratings it received.
ratings = pd.DataFrame(df.groupby('title')['rating'].mean())

ratings['numRatings'] = pd.DataFrame(df.groupby('title')['rating'].count())
ratings.head()

# Histograms of both columns, then their joint distribution.
ratings['numRatings'].hist(bins=100, figsize=(10, 6))
ratings['rating'].hist(bins=100, figsize=(10, 6))
sns.jointplot(x='rating', y='numRatings', data=ratings, alpha=0.6)
# as the number of ratings goes up, so does the average rating

# User-by-title matrix of ratings.
moviemat = df.pivot_table(index='user_id', columns='title', values='rating')

moviemat.head()

ratings.sort_values('numRatings', ascending=False).head(10)

# Per-user rating columns for the two reference movies.
starwars_user_ratings = moviemat['Star Wars (1977)']
liarliar_user_ratings = moviemat['Liar Liar (1997)']

# This will show how people who have seen star wars rate other movies:
# correlation of every title's column with the reference movie's column.
similar_to_starwars = moviemat.corrwith(starwars_user_ratings)
similar_to_liarliar = moviemat.corrwith(liarliar_user_ratings)
コード例 #49
0
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Read the data set from 'auto.csv' in the working directory.
auto = pd.read_csv('auto.csv')

# Joint plot of the 'hp' column (horizontal) against 'mpg' (vertical).
sns.jointplot(data=auto, x='hp', y='mpg')

# Show the figure.
plt.show()
コード例 #50
0
# Figure 2: observed dw_solar and clear-sky GHI against day, on one axes.
axes1 = fig.add_axes([0.1, 0.1, 0.8, 0.8])
axes1.scatter(j_day, dw_solar_everyday, label='Observed dw_solar', color='red')
axes1.scatter(j_day, ghi_everyday, label='Clear Sky GHI', color='green')

axes1.set_xlabel('Days')
axes1.set_ylabel('Solar Irradiance (Watts /m^2)')
axes1.set_title('Solar Irradiance - Test Year 2009')
axes1.legend(loc='best')

fig.savefig('RNN Paper Results/Exp2_1/' + test_location + '/' + test_year +
            'Figure 2.jpg',
            bbox_inches='tight')

# In[525]:

# Figure 3: regression joint plot of observed dw_solar against clear-sky GHI.
# The labels are set through the JointGrid so they always land on the joint
# axes; plt.xlabel/plt.ylabel act on whichever axes pyplot treats as current,
# which is not guaranteed to be the joint axes on every seaborn version.
sns.jointplot(x=dw_solar_everyday, y=ghi_everyday, kind='reg').set_axis_labels(
    'Observed global downwelling solar (Watts/m^2)',
    'Clear Sky GHI (Watts/m^2)')
plt.savefig('RNN Paper Results/Exp2_1/' + test_location + '/' + test_year +
            'Figure 3',
            bbox_inches='tight')

# ### making the Kt (clear sky index at time t) column by first removing rows with ghi==0

# In[526]:

if run_train:
    # TRAIN dataset
    df_train = df_train[df_train['ghi'] != 0]
    df_train['Kt'] = df_train['dw_solar'] / df_train['ghi']
    df_train.reset_index(inplace=True)
コード例 #51
0
#mu = np.array([-0.5, -2.5])
size = 1000000  # at 10 million my RAM is overloaded

### If a vector X is normally distributed, then exp(X) is lognormally
### distributed. mu and cov are the mean and covariance of the logs (the
### underlying normal), not of the exponentiated levels.

log_data = np.random.multivariate_normal(mu, cov, size=size)
level_data = np.exp(log_data)
# Column 1 is capital (k), column 0 is productivity (z).
k = level_data[:, 1]
z = level_data[:, 0]
lnk = log_data[:, 1]
lnz = log_data[:, 0]


### Plotting the joint density functions for levels and for logs
## First levels
# x/y are passed by keyword: seaborn >= 0.12 only accepts `data` positionally.
sns.jointplot(x=k, y=z, kind="hex").set_axis_labels("Capital", "Productivity")
plt.show()

## Then logs
sns.jointplot(x=lnk, y=lnz, kind="hex").set_axis_labels("Log Capital", "Log Productivity")
plt.show()
'''
## Plotting the raw joint density of lognormal variables does not make much sense as in 10,000,000 observations there will be massive outliers
### I atempt to get rid of these outliers for plotting purposes

meank = np.mean(k)
sdk = np.std(k)
final_k = [x for x in k if (x > meank - 2 * sdk)]
final_k = [x for x in final_k if (x < meank + 2 * sdk)]


meanz = np.mean(z)
コード例 #52
0
def viz_cont_cont(df, features, target):
    """Draw one seaborn joint plot of ``target`` against each entry of ``features``.

    Each feature name is used as the x column and ``target`` as the y column
    of ``df``. Nothing is returned.
    """
    for column in features:
        sns.jointplot(data=df, x=column, y=target)
# explore popularity distribution
merged_df.popularity.plot.hist(bins=50, color='green')
# explore vote_average distribution
# appear to be almost normal distribution
merged_df.vote_average.plot.hist(bins=50, color='red')
# to fix popularity, we will remove vote_count under 10 to prevent bias
merged_df = merged_df[~(merged_df.vote_count < 10)]
# replot
merged_df.popularity.plot.hist(bins=50, color='blue',
                               alpha=0.5)  # appear to be better
# plot scatter and find r2 for popularity versus domestic_gross columns
# before plot, we want to convert the scale into log10 and need to remove 0s
merged_df = merged_df[~(merged_df.domestic_gross == 0)]
merged_df = merged_df[~(merged_df.worldwide_gross == 0)]
# Persist the filtered frame before plotting.
merged_df.to_pickle('budget_popularity.pkl')
# Regression joint plots of each metric (popularity, vote_average) against
# log10 of domestic and worldwide gross; hf.r2 is passed as the statistic.
# NOTE(review): positional x/y and `stat_func` are only accepted by older
# seaborn releases -- confirm the seaborn version in use.
sns.jointplot(merged_df['popularity'],
              np.log10(merged_df['domestic_gross']),
              kind="reg",
              stat_func=hf.r2)
sns.jointplot(merged_df['popularity'],
              np.log10(merged_df['worldwide_gross']),
              kind="reg",
              stat_func=hf.r2)
sns.jointplot(merged_df['vote_average'],
              np.log10(merged_df['domestic_gross']),
              kind="reg",
              stat_func=hf.r2)
sns.jointplot(merged_df['vote_average'],
              np.log10(merged_df['worldwide_gross']),
              kind="reg",
              stat_func=hf.r2)
# popularity is R2 is 0.3 while vote_average is 0.051, we will use popularity as a metric to estimate gross income
# we will use popularity to estimate how well genres perform using tmdb data frame
コード例 #54
0
# For every (feature, plottable) pair, save either a bar chart of the
# feature's value counts (object dtype) or a hexbin joint plot of the pair.
# If the joint plot cannot be drawn, fall back to a histogram of the feature.
for a, b in product(features, plottables):
    msg('Making %s %s' % (a, b))
    x = with_elo[a]
    y = with_elo[b]
    msg('type = %s' % x.dtype)
    if x.dtype == 'object':
        plt.figure()
        x.value_counts().plot(kind='bar')
        plt.savefig('/data/' + a + '_hist.png')
        plt.close('all')
    else:
        try:
            # Limit both axes to the 1st-99th percentile range.
            xlim = tuple(np.percentile(x, [1, 99]))
            ylim = tuple(np.percentile(y, [1, 99]))
            with sns.axes_style("white"):
                # x/y by keyword: recent seaborn rejects positional x/y.
                sns.jointplot(x=x, y=y, kind="hex", xlim=xlim, ylim=ylim)
            plt.savefig('/data/scatter_' + a + '_' + b + '.png')
            plt.close('all')
        except Exception as exc:
            # `except Exception` (not a bare except) keeps KeyboardInterrupt
            # and SystemExit working; the failure is reported, not hidden.
            msg('jointplot failed for %s %s: %s' % (a, b, exc))
            # Drop any half-drawn figure before drawing the fallback.
            plt.close('all')
            plt.figure()
            x.plot(kind='hist')
            plt.savefig('/data/' + a + '_hist.png')
            plt.close('all')

do_indivs = True
if do_indivs:
    for a, b in product(features, plottables):
        msg('Making %s %s' % (a, b))
コード例 #55
0
# Output directory for the saved figures.
file_out_figures = 'C:/Users/lalc/Documents/Old Documents folder/PhD/Meetings/July 2020/'
# File-name prefixes, picked by the index of the matching entry in `limits`.
file = ['U','UN','N','SN']  
# [lower, upper] bounds used to bin the Ri values below.
# NOTE: the first assignment is overwritten immediately by the second one.
limits = [[-np.inf,-.1], [-.1,-.01], [-.01,.01], [.01,.21], [.21,np.inf]]  
limits = [[-np.inf,-.1], [-.1,-.01], [-.01,.01], [.01,.21]]
     
# Row mask: relscan above 0.25.
relind = L30min1.relscan>.25
# Column of Ri1 used for the binning (second to last).
j = -2
for i,l in enumerate(limits):
    # Row mask: Ri1[:, j] strictly inside the current (lower, upper) interval.
    stabind = ((Ri1[:,j]>l[0]) & (Ri1[:,j]<l[1]))
    # Rename the first six columns to LaTeX labels, keeping the rest as is
    # (repeated on every iteration).
    cols = np.r_[['$L_{u_1,x_1}$', '$L_{u_1,x_2}$','$L_{v_1,x_1}$', '$L_{v_1,x_2}$','$L_{h,x_1}$', '$L_{h,x_2}$'], L30min1.columns [6:]]
    L30min1.columns = cols
    xlim = 5*200
    ylim = 5*200
    # KDE joint plot of the two L_h columns for the rows selected by all
    # three masks (relind, stabind and ind1).
    g = sns.jointplot(x ='$L_{h,x_1}$', y = '$L_{h,x_2}$', data=L30min1.loc[relind & stabind & ind1], 
                            height = 8, kind="kde", cmap="jet", xlim = (0,xlim), ylim = (0,ylim),
                            color='k')#,cbar=True, cbar_kws={"format": formatter, "label": '$Density$'})
    g.set_axis_labels('$L_{h,x_1}$', '$L_{h,x_2}$', fontsize = 24)
    # Dashed diagonal from (0, 0) to (xlim, ylim) as a reference line.
    g.ax_joint.plot([0,xlim],[0,ylim],'--k', linewidth = 2)
    # Overlay the individual data points on top of the density.
    g.ax_joint.plot(L30min1.loc[relind & stabind & ind1]['$L_{h,x_1}$'].values,L30min1.loc[relind & stabind & ind1]['$L_{h,x_2}$'].values,'o', color = 'k', alpha=.2)
    # Annotate the plot with the interval bounds of the current bin.
    g.ax_joint.text(100, 800,'$'+'%.2f' % l[0] +'<Ri_f<'+'%.2f' % l[1] +'$',fontsize=30,color='r')
    plt.tight_layout()
    plt.savefig(file_out_figures+file[i]+'_phase_1.png')


file = ['U','UN','N','SN','VS']       
relind = L30min2.relscan>.25
for i,l in enumerate(limits):
    stabind = ((Ri2[:,-2]>l[0]) & (Ri2[:,-2]<l[1]))
    cols = np.r_[['$L_{u_1,x_1}$', '$L_{u_1,x_2}$','$L_{v_1,x_1}$', '$L_{v_1,x_2}$','$L_{h,x_1}$', '$L_{h,x_2}$'], L30min2.columns [6:]]
    L30min2.columns = cols
コード例 #56
0
# que pasa por los valores, lo desactivamos asi

# Histogram of total_bill with the density curve turned off (kde=False).
sns.distplot(tips['total_bill'],kde=False)
plt.show()

# The number of bins (the bars) can be changed with the `bins`
# parameter by passing an int; be careful with the resulting
# bin size.

sns.distplot(tips['total_bill'],kde=False,bins=40)
plt.show()

# jointplot compares two columns of a dataset with each other.

sns.jointplot(x='total_bill',y='tip',data=tips)
plt.show()

# The same comparison can be drawn in several ways through the
# `kind` parameter: hex, reg, kde

# pairplot shows a grid of plots comparing every column with every
# other one; a column compared with itself is shown as a histogram,
# and compared with another column as a jointplot()-style plot.

sns.pairplot(tips)
plt.show()

# si queremos dividir la informacion de cada grafica por otras
# columnas por ejemplo por sexo usamos el parametro hue, se le
# pasa una columa categorial, no que tenga un valor por eso 
コード例 #57
0
Next compare the distributions of the positive and negative examples over a few features. 
Good questions to ask yourself at this point are:

	* Do these distributions make sense?
		+ Yes. You've normalized the input and these are mostly concentrated in the +/- 2 range.
	* Can you see the difference between the ditributions?
		+ Yes the positive examples contain a much higher rate of extreme values.
-----------------------------------------------------------------------------------------
'''
# Split the training features into positive and negative examples using the
# boolean label mask, keeping the training frame's column names.
pos_df = pd.DataFrame(train_features[bool_train_labels], columns=train_df.columns)
neg_df = pd.DataFrame(train_features[~bool_train_labels], columns=train_df.columns)

# Hexbin joint plot of V5 against V6 for the positive examples.
# x/y are passed by keyword: seaborn >= 0.12 only accepts `data` positionally.
sns.jointplot(
    x=pos_df['V5'],
    y=pos_df['V6'],
    kind='hex',
    xlim=(-5, 5),
    ylim=(-5, 5)
)

plt.suptitle("Positive distribution")

# Same plot, same axis limits, for the negative examples.
sns.jointplot(
    x=neg_df['V5'],
    y=neg_df['V6'],
    kind='hex',
    xlim=(-5, 5),
    ylim=(-5, 5)
)

_ = plt.suptitle("Negative distribution")
コード例 #58
0

# Histogram of petal length (density curve disabled with kde=False)
sns.distplot(a = iris_data['Petal Length (cm)'], kde=False)



# Kernel Density Estimate (kde)
# This is the smoothed histogram

# kde plot of petal length, with the area under the curve shaded
sns.kdeplot(data=iris_data['Petal Length (cm)'], shade=True)


# We can create a two-dimensional kde plot:
# petal length on the x axis against sepal width on the y axis
sns.jointplot(x=iris_data['Petal Length (cm)'],
              y=iris_data['Sepal Width (cm)'], kind='kde')








# Load one file per species to compare the species with each other;
# each file is read with its "Id" column as the index.

iris_set_data = pd.read_csv('data/iris_setosa.csv', index_col="Id")
iris_ver_data = pd.read_csv('data/iris_versicolor.csv', index_col="Id")
iris_vir_data = pd.read_csv('data/iris_virginica.csv', index_col="Id")

コード例 #59
0
def explore_global_plot(data, label='label', n_feats=50, id=None, task='classification'):
    '''
    Draw a series of exploratory plots for a labelled data set.

    In order: a 2-D projection colored by label (classification only; FAMD
    when non-numeric features are present, PCA otherwise), a correlation
    heatmap, a printout of the rows flagged by ``mad_based_outlier``,
    missing-value matrix and bar charts, and a joint plot of the number of
    missing attributes per row.

    :param data: DataFrame holding the features and the label column
    :param label: label column name in the data
    :param n_feats: the number of leading feature columns used in the
        missing-value analysis.
    :param id: optional name of an identifier column; it is checked for
        duplicates and excluded from the analysis.
    :param task: regression or classification; the projection plot is only
        drawn for 'classification'.
    :return: None
    '''
    columns = data.columns.tolist()
    columns.remove(label)

    if id is not None:
        # `id` is a column name, so the values must be read from the frame;
        # indexing the list of column names with it raised a TypeError.
        if data[id].duplicated().sum():
            print('{} is duplicated !!!'.format(id))

        columns.remove(id)
        # Rebind instead of dropping in place so the caller's frame is
        # left untouched.
        data = data.drop(id, axis=1)

    # Integer and float columns count as numeric; everything else is
    # treated as categorical.
    numeric_names = [
        name for name, dtype in zip(columns, data[columns].dtypes)
        if ptypes.is_integer_dtype(dtype) or ptypes.is_float_dtype(dtype)
    ]
    category_names = list(set(columns) - set(numeric_names))

    if task == 'classification':
        # Rows with any missing value are left out of the projection.
        new_data = data.dropna(axis=0)
        if category_names:
            # data distribution for each class
            famd = prince.FAMD(
                n_components=2,
                n_iter=3,
                copy=True,
                check_input=True,
                engine='auto',
                random_state=42
            )
            famd = famd.fit(new_data[columns])
            ax = famd.plot_row_coordinates(
                new_data,
                ax=None,
                x_component=0,
                y_component=1,
                labels=new_data.index,
                color_labels=['{}'.format(t) for t in new_data[label]],
                ellipse_outline=False,
                ellipse_fill=True,
                show_points=True
            )
            plt.show()
        else:
            pca = PCA(n_components=2, random_state=seed)
            X_pca = pca.fit_transform(new_data[columns])
            sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=label, data=new_data)
            plt.show()

    # sort features for correlation plot: with more than six numeric
    # features, group them by k-means cluster so that similar features end
    # up next to each other in the heatmap.
    sorted_feat_name = numeric_names
    if len(numeric_names) > 6:
        n_clusters = 3
        new_data = data[[label] + numeric_names].dropna(axis=0)
        new_data_feat = new_data[numeric_names]
        new_data_stand = StandardScaler().fit_transform(new_data_feat)
        kmean_init = KMeans(n_clusters=n_clusters, random_state=seed)
        # Cluster the features, not the rows: transpose so each row holds one
        # standardized feature. (A reshape to the same shape would mix values
        # of different features into one row.)
        kmean_init.fit(new_data_stand.T)
        sorted_feat = sorted(zip(numeric_names, kmean_init.labels_), key=lambda x: x[1])
        sorted_feat_name = [i[0] for i in sorted_feat]

    # correlation plot for all features
    sns.heatmap(data[[label] + sorted_feat_name + category_names].corr())
    plt.show()

    # outlier detection just for numeric features: mad_based_outlier is
    # applied per column and the flagged rows are printed column by column.
    outlier = data[numeric_names].apply(mad_based_outlier)
    for i, column in enumerate(outlier.columns):
        print('outlier:\n {}'.format(data[[column]][outlier.iloc[:, i]]))

    # missing value pattern plot for all features
    msno.matrix(data[columns[:n_feats]])
    plt.show()

    msno.bar(data[columns[:n_feats]])
    plt.show()

    # Number of missing attributes per row, sorted ascending and plotted
    # against the row's rank in that ordering.
    miss_data = data[columns[:n_feats]].isnull().sum(axis=1)
    miss_data = miss_data.to_frame()
    miss_data.columns = ['number_of_missing_attributes']
    miss_data.sort_values('number_of_missing_attributes', inplace=True)
    miss_data['index'] = list(range(0, miss_data.shape[0]))
    sns.jointplot(x="index", y="number_of_missing_attributes", data=miss_data)
    plt.show()
コード例 #60
0
def analyze_zN(z, outdir, vg, skip_umap=False, num_pcs=2, num_ksamples=20):
    zdim = z.shape[1]

    # Principal component analysis
    log('Perfoming principal component analysis...')
    pc, pca = analysis.run_pca(z)  
    log('Generating volumes...')
    for i in range(num_pcs):
        start, end = np.percentile(pc[:,i],(5,95))
        z_pc = analysis.get_pc_traj(pca, z.shape[1], 10, i+1, start, end)
        vg.gen_volumes(f'{outdir}/pc{i+1}', z_pc)

    # kmeans clustering
    log('K-means clustering...')
    K = num_ksamples
    kmeans_labels, centers = analysis.cluster_kmeans(z, K)
    centers, centers_ind = analysis.get_nearest_point(z, centers)
    if not os.path.exists(f'{outdir}/kmeans{K}'): 
        os.mkdir(f'{outdir}/kmeans{K}')
    utils.save_pkl(kmeans_labels, f'{outdir}/kmeans{K}/labels.pkl')
    np.savetxt(f'{outdir}/kmeans{K}/centers.txt', centers)
    np.savetxt(f'{outdir}/kmeans{K}/centers_ind.txt', centers_ind, fmt='%d')
    log('Generating volumes...')
    vg.gen_volumes(f'{outdir}/kmeans{K}', centers)

    # UMAP -- slow step
    if zdim > 2 and not skip_umap:
        log('Running UMAP...')
        umap_emb = analysis.run_umap(z)
        utils.save_pkl(umap_emb, f'{outdir}/umap.pkl')

    # Make some plots
    log('Generating plots...')
    plt.figure(1)
    g = sns.jointplot(x=pc[:,0], y=pc[:,1], alpha=.1, s=2)
    g.set_axis_labels('PC1','PC2')
    plt.tight_layout()
    plt.savefig(f'{outdir}/z_pca.png')
    
    plt.figure(2)
    g = sns.jointplot(x=pc[:,0], y=pc[:,1], kind='hex')
    g.set_axis_labels('PC1','PC2')
    plt.tight_layout()
    plt.savefig(f'{outdir}/z_pca_hexbin.png')

    if zdim > 2 and not skip_umap:
        plt.figure(3)
        g = sns.jointplot(x=umap_emb[:,0], y=umap_emb[:,1], alpha=.1, s=2)
        g.set_axis_labels('UMAP1','UMAP2')
        plt.tight_layout()
        plt.savefig(f'{outdir}/umap.png')

        plt.figure(4)
        g = sns.jointplot(x=umap_emb[:,0], y=umap_emb[:,1], kind='hex')
        g.set_axis_labels('UMAP1','UMAP2')
        plt.tight_layout()
        plt.savefig(f'{outdir}/umap_hexbin.png')

    analysis.scatter_annotate(pc[:,0], pc[:,1], centers_ind=centers_ind, annotate=True)
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.savefig(f'{outdir}/kmeans{K}/z_pca.png')

    g = analysis.scatter_annotate_hex(pc[:,0], pc[:,1], centers_ind=centers_ind, annotate=True)
    g.set_axis_labels('PC1','PC2')
    plt.tight_layout()
    plt.savefig(f'{outdir}/kmeans{K}/z_pca_hex.png')

    if zdim > 2 and not skip_umap:
        analysis.scatter_annotate(umap_emb[:,0], umap_emb[:,1], centers_ind=centers_ind, annotate=True)
        plt.xlabel('UMAP1')
        plt.ylabel('UMAP2')
        plt.savefig(f'{outdir}/kmeans{K}/umap.png')

        g = analysis.scatter_annotate_hex(umap_emb[:,0], umap_emb[:,1], centers_ind=centers_ind, annotate=True)
        g.set_axis_labels('UMAP1','UMAP2')
        plt.tight_layout()
        plt.savefig(f'{outdir}/kmeans{K}/umap_hex.png')

    for i in range(num_pcs):
        if zdim > 2 and not skip_umap:
            analysis.scatter_color(umap_emb[:,0], umap_emb[:,1], pc[:,i], label=f'PC{i+1}')
            plt.xlabel('UMAP1')
            plt.ylabel('UMAP2')
            plt.tight_layout()
            plt.savefig(f'{outdir}/pc{i+1}/umap.png')