Example #1
def linearRegressionDemo(conn):
    '''
       Demonstrate Linear Regression
    '''
    #Train Model and Score
    lreg = LinearRegression(conn)
    mdl_dict, mdl_params = lreg.train('public.wine_training_set',['1','alcohol','proline','hue','color_intensity','flavanoids'],'quality')
    #Show model params
    print(mdl_params)
    #Now do prediction
    predictions = lreg.predict('public.wine_test_set','quality')
    #Show prediction results
    print(predictions.head())
    #Show Scatter Matrix of Actual Vs Predicted
    smat = scatter_matrix(predictions.get(['quality','prediction']), diagonal='kde')   
        
    # 1 b) Linear Regression with categorical variables 
    # We'll use the auto_mpg dataset from UCI : http://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.names
    # make, fuel_type, fuel_system are all categorical variables, rest are real.
    #Train Linear Regression Model on a mixture of Numeric and Categorical Variables
    mdl_dict, mdl_params = lreg.train('public.auto_mpg_train',['1','height','width','length','highway_mpg','engine_size','make','fuel_type','fuel_system'],'price')
    predictions = lreg.predict('public.auto_mpg_test','price')
    #Show sample predictions
    print(predictions.head())
    #Display Scatter Plot of Actual Vs Predicted Values
    smat = scatter_matrix(predictions.get(['price','prediction']), diagonal='kde')    
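The demo above depends on a custom LinearRegression(conn) wrapper that is not shown here; as a rough, library-independent sketch, the same actual-vs-predicted scatter matrix can be drawn on synthetic data (the column names below mirror the demo and are otherwise arbitrary):

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix  # pandas.tools.plotting on older pandas

# synthetic "actual" values and noisy "predictions"
quality = np.random.uniform(3, 9, 200)
predictions = pd.DataFrame({'quality': quality,
                            'prediction': quality + np.random.normal(0, 0.5, 200)})
scatter_matrix(predictions.get(['quality', 'prediction']), diagonal='kde')
plt.show()
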
 def scatter_plot(self, factor):
     f_and_c = self.get_factors_and_column_indices()
     # add in the atac seq column
     columns = [1,] + f_and_c[factor]
     scatter_matrix(self[columns].rank(),  
                    figsize=(16,16), 
                    alpha=0.05, color='black')
Example #3
    def corr_plots(self, append=True):
        """
        uses scatter_matrix to plot all columns pair-wise against each other 
        """
        plt.figure()
        # self._df.drop([x for ])
        # ptp.scatter_matrix(self._df, alpha=0.2, figsize=(6, 6), diagonal='kde')
        # ptp.scatter_matrix(self._df, alpha=0.2, diagonal='kde')
        # diagonal='kde'
        # ptp.scatter_matrix(self._df, alpha=0.2, diagonal=diagonal)
        # print(diagonal)
        # plt.savefig(self._output_dir + "correlation_matrix_%s.png" % diagonal)

        diagonal = "hist"
        print(sys._getframe().f_code.co_name, diagonal)
        # scatter_opt={'kind':'hexbin'}
        off_diagonal_opt = {"bins": "log"}  # draw options for off-diagonal elements
        # scatter_opt={}
        # scatter_matrix(self._df, alpha=0.2, hspace=0.2, wspace=0.2, diagonal=diagonal, **scatter_opt)
        # axes = ptp.scatter_matrix(self._df, alpha=0.2, diagonal=diagonal, **scatter_opt)
        ptp.scatter_matrix(self._df, alpha=0.2, diagonal=diagonal, **off_diagonal_opt)
        # scatter_matrix(self._df, alpha=0.2, diagonal=diagonal)

        fs = fig_summary()  # fs.mean = average(df[var_name])
        fs.label = "pair-wise correlation plots"
        fs.fig_path = self._output_dir
        fs.fig_rel_path = self._rel_dir + "correlation_plots_%s.png" % diagonal

        plt.savefig(fs.fig_path + fs.fig_rel_path)
        if not append:
            self.list_fig_summary.clear()
        self.list_fig_summary.append(fs)
def scatmat(df, category=None, colors='rgyb',
            num_plots=4, num_topics=100, num_columns=4,
            show=False, block=False, data_path=DATA_PATH, save=False, verbose=1):
    """FIXME: empty plots that dont go away, Plot and/save scatter matrix in groups of num_columns topics"""

    if category is None:
        category = list(df.columns)[-1]
    if category in df.columns:
        category = df[category]
    else:
        category = pd.Series(category)

    suffix = '{}x{}'.format(len(df), num_topics)
    save = bool(save)
    for i in range(int(min(num_plots * num_columns, num_topics) / float(num_plots))):
        scatter_matrix(df[df.columns[i * num_columns:(i + 1) * num_columns]],
                       marker='+', c=[colors[int(x) % len(colors)] for x in category.values],
                       figsize=(18, 12))
        if save:
            name = 'scatmat_topics_{}-{}_{}'.format(i * num_columns, (i + 1) * num_columns, suffix)
            plt.savefig(os.path.join(data_path, name + '.jpg'))
        if show:
            if block:
                plt.show()
            else:
                plt.show(block=False)
def plot_data(indf, prefix='html'):
    """
    create scatter matrix plot, histograms
    """
    list_of_plots = []
    column_groups = []
    column_list = []
    for col in indf.columns:
        if len(indf[col].unique()) > 5 and 'checkin' not in col:
            column_list.append(col)
    for idx in range(0, len(column_list), 3):
        print len(column_list), idx, (idx+3)
        column_groups.append(column_list[idx:(idx+3)])

    for idx in range(len(column_groups)):
        for idy in range(0, idx):
            if idx == idy:
                continue
            print column_groups[idx]+column_groups[idy]
            pl.clf()
            scatter_matrix(indf[column_groups[idx]+column_groups[idy]])
            pl.savefig('scatter_matrix_%d_%d.png' % (idx, idy))
            list_of_plots.append('scatter_matrix_%d_%d.png' % (idx, idy))
            pl.close()

    for col in indf:
        pl.clf()
        print col
        indf[col].hist(histtype='step', normed=True)
        pl.title(col)
        pl.savefig('%s_hist.png' % col)
        list_of_plots.append('%s_hist.png' % col)

    create_html_page_of_plots(list_of_plots, prefix)
    return
    def test_scatter_plot_legacy(self):
        df = pd.DataFrame(randn(100, 2))

        with tm.assert_produces_warning(FutureWarning):
            plotting.scatter_matrix(df)

        with tm.assert_produces_warning(FutureWarning):
            pd.scatter_matrix(df)
def auto_pairs(plot_cols, df):
    import matplotlib.pyplot as plt
    from pandas.tools.plotting import scatter_matrix
    fig = plt.figure(figsize=(12, 12))
    fig.clf()
    ax = fig.gca()
    scatter_matrix(df[plot_cols], alpha=0.3, 
               diagonal='kde', ax = ax)
    return 'Done'           
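A minimal usage sketch for auto_pairs on a synthetic DataFrame (the column names are hypothetical); it assumes a pandas version that still ships pandas.tools.plotting, which newer releases expose as pandas.plotting instead:

import numpy as np
import pandas as pd

demo_df = pd.DataFrame({'mpg': np.random.normal(25, 5, 200),
                        'weight': np.random.normal(3000, 400, 200),
                        'horsepower': np.random.normal(100, 20, 200)})
auto_pairs(['mpg', 'weight', 'horsepower'], demo_df)
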
def visualizeData(inputDF):

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    outputFILE = 'plot-scatter.png'
    myPlot = inputDF.plot(
        label    = 'population',
        kind     = 'scatter',
        x        = 'longitude',
        y        = 'latitude',
        s        = inputDF["population"] / 100,
        c        = 'median_house_value',
        cmap     = plt.get_cmap("jet"),
        colorbar = True,
        alpha    = 0.1
        )
    plt.savefig(outputFILE, bbox_inches='tight', pad_inches=0.2)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    outputFILE = 'plot-correlations.png'
    corrMatrix = inputDF.corr()
    attributes = ["median_house_value","median_income","total_rooms","housing_median_age"]
    myPlot = scatter_matrix(frame=inputDF[attributes], figsize=(12,8))
    plt.savefig(outputFILE, bbox_inches='tight', pad_inches=0.2)

    print('\ncorrMatrix["median_house_value"].sort_values(ascending=False)')
    print(   corrMatrix["median_house_value"].sort_values(ascending=False) )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    outputFILE = 'plot-medianIncome.png'
    myPlot = inputDF.plot(
        kind    = 'scatter',
        x       = "median_income",
        y       = "median_house_value",
        alpha   = 0.1,
        figsize = (12,8)
        )
    plt.savefig(outputFILE, bbox_inches='tight', pad_inches=0.2)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    outputFILE = 'plot-correlations-02.png'

    tempDF = inputDF.copy()

    tempDF[     "roomsPerHousehold"] = tempDF["total_rooms"]    / tempDF["households"]
    tempDF["populationPerHousehold"] = tempDF["population"]     / tempDF["households"]
    tempDF[       "bedroomsPerRoom"] = tempDF["total_bedrooms"] / tempDF["total_rooms"]

    corrMatrix = tempDF.corr()
    print('\ncorrMatrix["median_house_value"].sort_values(ascending=False)')
    print(   corrMatrix["median_house_value"].sort_values(ascending=False) )

    attributes = ["median_house_value","median_income","roomsPerHousehold","bedroomsPerRoom"]
    myPlot = scatter_matrix(frame=tempDF[attributes], figsize=(12,8))
    plt.savefig(outputFILE, bbox_inches='tight', pad_inches=0.2)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    return( None )
Example #9
	def pd_scatter_matrix(self):
		"""
		No parameters. Run on the object's attributes.
		A normal scatter plot matrix using pandas.scatter_matrix. Nothing
		new here.
		"""
		class_groups = self.ALL.groupby(self.classes)
		plt.figure()
		scatter_matrix(self.ALL, alpha=0.2, figsize=(60, 60), diagonal='kde')
		plt.savefig('scatter_matrix.png')
def plot_scatter_matrix(x,y,fname='scatter_matrix.png'):
    import pandas as pd
    from pandas.tools.plotting import scatter_matrix
    df = pd.DataFrame(np.hstack((x,y.reshape(len(y),1))), columns=['intensity', 'gaussian', 'gradient mag', 'grad dir', 'laplacian', 'imglog', 'label'])
    df['label'] = df['label'].astype(int)
    colors = ['red','green','blue','cyan','black']
    import matplotlib.pyplot as plt
    scatter_matrix(df,figsize=[9,7],marker='x',c=df.label.apply(lambda xx:colors[xx]))
    plt.savefig(fname)
    print "Saved scatter matrix to %s" % fname
	def plotPcaProjections(self, pca_components=(0, 4)):
		""" Plots the principal components projected on the data.

		**Parameters**

		pca_components : tuple of int
		    The (start, stop) range of principal components to project onto
		"""
		tmp = np.dot(self.evts,self.U[:, pca_components[0]:pca_components[1]])
		df = pd.DataFrame(tmp)
		scatter_matrix(df, alpha=.2, s=4, c='k', figsize=(10, 10), diagonal='kde', marker=".")
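plotPcaProjections assumes self.evts (the event matrix) and self.U (principal directions as columns); a standalone sketch of the same idea on synthetic data, obtaining the directions from an SVD:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

evts = np.random.randn(500, 20)                    # synthetic event matrix
U, s, Vt = np.linalg.svd(evts - evts.mean(0), full_matrices=False)
proj = pd.DataFrame(np.dot(evts, Vt.T[:, 0:4]))    # project onto the first 4 components
scatter_matrix(proj, alpha=.2, s=4, c='k', figsize=(10, 10), diagonal='kde', marker=".")
plt.show()
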
Example #12
 def plot(self, df=None, type=None, **parameter):
     if type == "bar":
         df.plot.bar(**parameter)
     if type == "barh":
         df.plot.barh(**parameter)
     if type == "hist":
         df.hist(**parameter)
     if type == "scatter":
         df.plot.scatter(**parameter)
     if type == "scatter matrix":
         scatter_matrix(df, diagonal="kde", **parameter)
Example #13
def data_visualization():
	# box and whisker plots
	dataset.plot(kind='box', subplots=True, layout=(2,2), sharex=False, sharey=False)
	plt.show()

	# histograms
	dataset.hist()
	plt.show()

	# scatter plot matrix
	scatter_matrix(dataset)
	plt.show()
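data_visualization reads a module-level dataset DataFrame; a minimal setup sketch with synthetic data (any numeric frame with four columns works, and scatter_matrix and plt must be importable at module level):

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix  # pandas.tools.plotting on older pandas

dataset = pd.DataFrame(np.random.randn(150, 4), columns=['a', 'b', 'c', 'd'])
data_visualization()
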
def create_xy():
    rawdata = pd.read_csv(get_fullpath('insurance.csv'), delimiter = ',')
    print('====== describe ======')    
    print rawdata.describe()
    print('====== head ======')    
    print rawdata.head()
    print('====== corr ======')    
    print rawdata.corr()
    print('====== plots ======')
    scatter_matrix(rawdata)
    rawdata.age.plot()
    rawdata.charges.hist()
    rawdata.boxplot()
def azureml_main(frame1):
    import matplotlib
    matplotlib.use("agg")
    matplotlib.style.use('ggplot')
    import pandas as pd
    from pandas.tools.plotting import scatter_matrix
    from mpl_toolkits.mplot3d import Axes3D
    import numpy as np
    import matplotlib.pyplot as plt
    Azure = False

## If not running in MAML read the data from a csv file.
    if(Azure == False):
        frame1 = pd.read_csv("C:\\Users\\Steve\\Documents\\AzureML\\Data Sets\\Forest_Fire\\forestfireslog.csv")

    
    fig2 = plt.figure(2, figsize = (10,6))
    ax = fig2.gca()
    plt.plot(frame1["X"], frame1["Y"], 'bo', alpha = 0.2)
    fig2.savefig("C:\\Users\\Steve\\Documents\\AzureML\\Data Sets\\Forest_Fire\\fig2.png")
    
    fig1 = plt.figure(1, figsize = (10,6))
    ax = fig1.gca()
    
    plotCols = ["FFMC", "DMC", "DC", "ISI", \
    "temp", "RH", "wind", "rain", "areaLog"]
#    print(pd.DataFrame.head(frame1[plotCols]))
    scatter_matrix(frame1[plotCols], ax = ax, alpha = 0.5)
    fig1.savefig("C:\\Users\\Steve\\Documents\\AzureML\\Data Sets\\Forest_Fire\\fig1.png")
    
    print(frame1.shape)
    trmCols = ["FFMC", "ISI", "rain"]
    trmLst = [[-3.0, 10.0], [-10.0, 4.0], [-10.0, 3.0]]
    frame2 = trimOutliers(frame1, trmCols, trmLst)    
    print(frame1.shape)
    
    fig4 = plt.figure(4, figsize = (10,6))
    ax = fig4.gca()
    scatter_matrix(frame2[plotCols], ax = ax, alpha = 0.5)
    fig4.savefig("C:\\Users\\Steve\\Documents\\AzureML\\Data Sets\\Forest_Fire\\fig4.png")
    
    fig5 = plt.figure(5, figsize = (12,8))
    ax = fig5.add_subplot(221, projection='3d')
    ax.scatter(frame2["X"], frame2["Y"], frame2["areaLog"], c = 'r')
    ax = fig5.add_subplot(222, projection='3d')
    ax.scatter(frame1["X"], frame1["Y"], frame1["areaLog"], c = 'r')
    fig5.savefig("C:\\Users\\Steve\\Documents\\AzureML\\Data Sets\\Forest_Fire\\fig5.png")
    
    return frame1
def plot_full_feature_scatter_matrix(X,Y,fname='scatter_matrix_full_feature.png'):
    print X.shape;
    print Y.shape
    import pandas as pd;
    from pandas.tools.plotting import scatter_matrix;
    COL = ['VAR1', 'VAR2', 'VAR3', 'VAR4', 'VAR5', 'VAR6', 'VAR7', 'VAR8', 'VAR9', 'VAR10', 'VAR11', 'VAR12', 'VAR13', 'VAR14', 'VAR15', 'VAR16', 'VAR17', 'VAR18', 'VAR19', 'VAR20', 'label'];
    df = pd.DataFrame(np.hstack((X,Y.reshape(Y.shape[0],1))), columns=COL);
    df['label'] = df['label'].astype(int);
    R = list(np.linspace(0,1,num=20));
    G = list(np.linspace(1,0,num=20));
    B = list(np.linspace(0,1,num=20));
    colors = ['r','g','blue','c','m','y','black','w','orange','darkgreen','r','g','blue','c','m','y','black','w','orange','darkgreen']
    import matplotlib.pyplot as plt;
    scatter_matrix(df,figsize=[25,25], marker='x',diagonal='kde',c=df.label.apply(lambda k:colors[k]));
    plt.savefig(fname);
Example #17
def azureml_main(frame1):
  import matplotlib
  matplotlib.use('agg')
  
  import pandas as pd
  import matplotlib.pyplot as plt
  from pandas.tools.plotting import scatter_matrix 

## Remove unwanted columns 
  frame1.drop(["X", "Y", "month", "day"], axis = 1, inplace = True)

## Create a scatter plot matrix 
  fig1 = plt.figure(1, figsize = (12,9))
  ax = fig1.gca()
  scatter_matrix(frame1, alpha=0.2, figsize=(10, 10), diagonal='kde', ax=ax)
  fig1.savefig('scatter2.png')
  return frame1
def factor_scatter_matrix(df, factor, factor_labels, legend_title=None,
                          palette=None, title=None):
    '''Create a scatter matrix of the variables in df, with differently colored
    points depending on the value of df[factor].
    inputs:
        df: pandas.DataFrame containing the columns to be plotted, as well
            as factor.
        factor: string or pandas.Series. The column indicating which group
            each row belongs to.
        palette: A list of hex codes, at least as long as the number of groups.
            If omitted, a predefined palette will be used, but it only includes
            9 groups.
    '''

    if isinstance(factor, basestring):
        factor_name = factor  # save off the name
        factor = df[factor]  # extract column
        df = df.drop(factor_name, axis=1)  # remove from df, so it
        # doesn't get a row and col in the plot.

    classes = list(set(factor))

    if palette is None:
        palette = sns.color_palette("gist_ncar", len(set(factor)))
    elif isinstance(palette, basestring):
        palette = sns.color_palette(palette, len((set(factor))))
    else:
        palette = sns.color_palette(palette)

    color_map = dict(zip(classes, palette))

    if len(classes) > len(palette):
        raise ValueError((
            "Too many groups for the number of colors provided."
            "We only have {} colors in the palette, but you have {}"
            "groups.").format(len(palette), len(classes)))

    colors = factor.apply(lambda group: color_map[group])
    axarr = scatter_matrix(df, figsize=(10, 10),
                           marker='o', c=np.array(list(colors)), diagonal=None,
                           alpha=1.0)

    if legend_title is not None:
        plt.grid('off')
        plt.legend([plt.Circle((0, 0), fc=color) for color in palette],
                factor_labels, title=legend_title, loc='best',
                ncol=3)
    if title is not None:
        plt.suptitle(title)

    # for rc in xrange(len(df.columns)):
    #     for group in classes:
    #         y = df[factor == group].icol(rc).values
    #         gkde = gaussian_kde(y)
    #         ind = np.linspace(y.min(), y.max(), 1000)
    #         axarr[rc][rc].plot(ind, gkde.evaluate(ind), c=color_map[group])

    return axarr, color_map
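A usage sketch for factor_scatter_matrix, assuming its module-level dependencies (np, plt, sns, scatter_matrix) are imported; the function was written for Python 2, so basestring is shimmed for Python 3 here, and the data are synthetic:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix

basestring = str  # Python 3 shim for the isinstance(factor, basestring) check

demo = pd.DataFrame(np.random.randn(90, 3), columns=['x1', 'x2', 'x3'])
demo['group'] = ['a', 'b', 'c'] * 30
axarr, color_map = factor_scatter_matrix(demo, 'group', factor_labels=['a', 'b', 'c'],
                                         legend_title='group')
plt.show()
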
Example #19
def scatterMatrix(dframe):
    '''
       Show Scatter Matrix
    '''
    df = DataFrame(dframe)
    #Rename columns so that the plot is not too cluttered.
    df.columns = range(len(df.columns))
    smatrix = scatter_matrix(df, alpha=0.2, figsize=(6, 6), diagonal='kde')
    plt.show()
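A usage sketch for scatterMatrix, assuming its module-level imports (DataFrame, scatter_matrix, plt); any array-like or DataFrame input works since the columns are renamed to integers before plotting:

import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame
from pandas.plotting import scatter_matrix

scatterMatrix(np.random.randn(100, 5))
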
def featurecomparison(cr,features,name,path,columns=None):
    if not os.path.exists(path):
        os.makedirs(path)
        
    sessions = cr[cr.trial > 0][features]
    if columns is not None:
        sessions.columns = columns
    sessions = sessions.groupby(level=['subject','session'])
    for (subject,session),group in sessions:
        scatter_matrix(group)
        plt.suptitle(str.format('{0} (session {1})',subject,session))
        fname = str.format("{0}_session_{1}_{2}.png",
                           subject, session, name)
        fpath = os.path.join(path,subject)
        if not os.path.exists(fpath):
            os.makedirs(fpath)
        fpath = os.path.join(fpath,fname)
        plt.savefig(fpath)
        plt.close('all')
Example #21
def scatter_matrix_bin_target(df, bin_col, numeric_cols):
    """Scatter matrix of numerical columns, showing colors based
    on a binary target variable

    Parameters
    ----------
    df : pandas.DataFrame
        Contains columnar data containing `bin_col` and `numeric_cols` columns

    bin_col : str
        Name of column containing binary data

    numeric_cols : [str]
        List containing column names containing numerical data

    Reference
    ---------
    http://stackoverflow.com/questions/28034424/pandas-scatter-matrix-plot-categorical-variables
    """
    _scatter_color = df[bin_col].apply(lambda v: ('red', 'blue')[v])
    scatter_matrix(df[numeric_cols], c=_scatter_color)
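A usage sketch, assuming scatter_matrix is imported at module level; churned is a hypothetical 0/1 target column on synthetic data:

import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix

demo = pd.DataFrame(np.random.randn(100, 3), columns=['age', 'income', 'score'])
demo['churned'] = np.random.randint(0, 2, 100)
scatter_matrix_bin_target(demo, 'churned', ['age', 'income', 'score'])
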
def plot_error_correlations(vs, fractions):
    import pandas as pd
    from pandas.tools.plotting import scatter_matrix
    from matplotlib import pyplot as plt


    vs2 = pd.DataFrame(vs)
    real = fractions
    # vs2 = vs2.subtract(real,axis=0)
    vs2 = (vs2.T - real).T
    fig,ax = plt.subplots(figsize=[plotinfo.TEXTWIDTH_IN*.5, plotinfo.TEXTWIDTH_IN*.5])
    axes = scatter_matrix(vs2, ax=ax, color='k', marker='.', s=2.)
    for ax in axes.ravel():
        ax.grid(False)
        ax.set_axis_bgcolor((1,1,1))
        ax.set_xticks([])
        ax.set_yticks([])
        ax.spines['bottom'].set_visible(False)
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        ax.spines['left'].set_visible(False)

    for a in range(vs2.shape[1]):
        for b in range(a+1, vs2.shape[1]):
            da,db = vs2[[a,b]].T.values
            r = np.corrcoef(da,db)[0,1]
            ax = axes[b,a]
            ylabel = ax.get_ylabel()
            xlabel = ax.get_xlabel()
            ax.clear()
            ax.set_ylabel(ylabel, fontsize=5)
            ax.set_xlabel(xlabel, fontsize=5)
            ax.set_xticks([])
            ax.set_yticks([])
            ax.set_ylim(0,1)
            ax.set_xlim(0,1)
            ax.text(.55, .4,
                    '{:.2}'.format(r),
                    fontsize=9,
                    horizontalalignment='center',
                    verticalalignment='center')
            # ax = axes[a,b]
            # ax.plot([0,0],[-.25,+.25], 'k-', lw=3)
            # ax.plot([-.25,+.25], [0,0], 'k-', lw=3)
            # ax.scatter(db, da, color='r', s=1)
    axes[0,0].set_ylabel(axes[0,0].get_ylabel(), fontsize=5)
    axes[5,5].set_xlabel(axes[5,5].get_xlabel(), fontsize=5)

    fig.tight_layout()
    fig.savefig('figures/error_scatter.eps')
    fig.savefig('figures/error_scatter.pdf')
    fig.savefig('figures/error_scatter.png', dpi=1200)
Example #23
def explore_data(dataframe, histograms=True, scattermatrix=True, export_summary=True):
    '''
    Explore a DataFrame: export descriptive statistics and, optionally, histograms
    and a scatter matrix to the output/ directory.
    '''
    # Create directory for output
    import os
    from pandas.tools.plotting import scatter_matrix
    if not os.path.exists('output'):
        os.mkdir('output')
    print('Descriptive statistics exported as "output/summary_original.csv"')
    summary = dataframe.describe(include='all').round(2).transpose()
    summary.insert(1, 'missing', len(dataframe) - summary['count'])
    summary.insert(0, 'type', dataframe.dtypes)
    if export_summary == True:
        summary.to_csv('output/summary_original.csv')
    if histograms == True:
        print('Histograms exported as "output/histograms.png"')
        dataframe.hist();
        plt.savefig('output/histograms.png')
    if scattermatrix == True:
        print('Scatter matrix exported as "output/scatter_matrix.png"')
        scatter_matrix(dataframe, diagonal='kde');
        plt.savefig('output/scatter_matrix.png')
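A usage sketch for explore_data on synthetic data; it expects matplotlib.pyplot at module level as plt, and its internal pandas.tools.plotting import assumes an older pandas release (newer versions use pandas.plotting):

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

demo = pd.DataFrame({'x': np.random.randn(200),
                     'y': np.random.randn(200),
                     'label': np.random.choice(['a', 'b'], 200)})
explore_data(demo)
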
def visualize(config):

    # Create various visualizations of the data, this would help to create a feature vector
    for dataset in config['datasets']:
        scatter_matrix(dataset['df'], alpha=0.2, figsize=(20, 20), diagonal='kde')
        fig_name = dataset['name'] + '_scatter_matrix' + '.png'
        plt.savefig(fig_name)
        plt.close()

        plt.figure(figsize=(20,20))
        parallel_coordinates(dataset['df'], 'quality')
        fig_name = dataset['name'] + '_parallel_coordinates' + '.png'
        plt.savefig(fig_name)
        plt.close()

        plt.figure(figsize=(20,20))
        radviz(dataset['df'], 'quality')
        fig_name = dataset['name'] + '_radviz' + '.png'
        plt.savefig(fig_name)
        plt.close()

    return OK
def plot_data(indf, prefix='html', do_scatter=False):
    """
    create scatter matrix plot, histograms
    """
    list_of_plots = []
    if do_scatter:
        column_groups = []
        for idx in range(0, len(indf.columns), 3):
            print len(indf.columns), idx, (idx+3)
            column_groups.append(indf.columns[idx:(idx+3)])

        for idx in range(len(column_groups)):
            for idy in range(0, idx):
                if idx == idy:
                    continue
                print column_groups[idx]+column_groups[idy]
                pl.clf()
                scatter_matrix(indf[column_groups[idx]+column_groups[idy]])
                pl.savefig('scatter_matrix_%d_%d.png' % (idx, idy))
                list_of_plots.append('scatter_matrix_%d_%d.png' % (idx, idy))
                pl.close()

    for col in indf:
        pl.clf()
        print col
        if 'WnvPresent' in indf.columns and col != 'WnvPresent':
            haswnv = indf['WnvPresent'] == 1
            notwnv = indf['WnvPresent'] == 0
            indf[notwnv][col].hist(histtype='step', normed=True)
            indf[haswnv][col].hist(histtype='step', normed=True)
        else:
            indf[col].hist(histtype='step', normed=True)
        pl.title(col)
        pl.savefig('%s_hist.png' % col)
        list_of_plots.append('%s_hist.png' % col)

    create_html_page_of_plots(list_of_plots, prefix)
    return
def save_scatter(best_pipe, df_X, y, start, shard):
    """
    Plots and saves scatter_matrix of data.

    Parameters
    ----------
    best_pipe : object
        Pipeline used.

    df_X : pandas.DataFrame
        Input data.

    y : list
        Target labels.

    start : float
        Time at start of run.

    shard : bool
        Indicates if shard of data is used.

    Returns
    -------
    None
    """

    df = pipe_transform(best_pipe, df_X)
    df['match'] = y

    colors = ['red', 'green']
    scatter_matrix(df, alpha=0.25, figsize=(20, 12),
                   c=df.match.apply(lambda x: colors[x]))

    fname = str(int(start - 1470348265)).zfill(7) + '_'
    if shard:
        fname = 'shard_' + fname
    plt.savefig('../output/%sscatter-matrix' % fname)
    plt.close('all')
def factor_scatter_matrix(df, factor, palette=None, title=None):
    '''Create a scatter matrix of the variables in df, with differently colored
    points depending on the value of df[factor].
    inputs:
        df: pandas.DataFrame containing the columns to be plotted, as well
            as factor.
        factor: string or pandas.Series. The column indicating which group
            each row belongs to.
        palette: A list of hex codes, at least as long as the number of groups.
            If omitted, a predefined palette will be used, but it only includes
            9 groups.
    '''

    if isinstance(factor, basestring):
        factor_name = factor  # save off the name
        factor = df[factor]  # extract column
        df = df.drop(factor_name, axis=1)  # remove from df, so it
        # doesn't get a row and col in the plot.

    classes = list(set(factor))

    if palette is None:
        #palette = matplotlib.colors.cnames.values()
        palette = ['#e41a1c', '#377eb8', '#4eae4b',
                   '#994fa1', '#ff8101', '#fdfc33',
                   '#a8572c', '#f482be', '#999999',
                   '#4B610B', '#DF013A', '#DF013A']

    color_map = dict(zip(classes, palette))

    if len(classes) > len(palette):
        raise ValueError((
            "Too many groups for the number of colors provided."
            "We only have {} colors in the palette, but you have {}"
            "groups.").format(len(palette), len(classes)))

    colors = factor.apply(lambda group: color_map[group])
    axarr = scatter_matrix(df, figsize=(10, 10),
                           marker='o', c=colors, diagonal=None)
    if title is not None:
        plt.title(title)

    # for rc in xrange(len(df.columns)):
    #     for group in classes:
    #         y = df[factor == group].icol(rc).values
    #         gkde = gaussian_kde(y)
    #         ind = np.linspace(y.min(), y.max(), 1000)
    #         axarr[rc][rc].plot(ind, gkde.evaluate(ind), c=color_map[group])

    return axarr, color_map
Example #28
def test_quantile_vs_tmm():
    """
    Test quantile normalization versus TMM
    in rank correlation of genes.
    """
    counts_fname = utils.load_testdata("pasilla")
    # Consider only a subset of the samples
    samples = OrderedDict()
    samples["Untreated 1"] = "untreated1"
    samples["Untreated 2"] = "untreated2"
    exp_obj = experiment.Experiment(counts_fname, samples)
    quantile_counts_df = normalizers.norm_q(exp_obj)
    tmm_counts_df = normalizers.norm_tmm(exp_obj)
    print "\nQuantile versus TMM Testing:"
    print "--------------"
    print "Normalized quantile counts: "
    print quantile_counts_df.head()
    print "Normalized TMM counts: "
    print tmm_counts_df.head()
    print "Correlating the genes."
    # Merge the dataframes together, indexing by gene
    combined_df = pandas.merge(quantile_counts_df, tmm_counts_df,
                               left_index=True,
                               right_index=True,
                               suffixes=["_q", "_tmm"],
                               how="outer")
    # Get log of counts: get rid of infinite values
    log_counts_df = combined_df.apply(np.log2).replace([-np.inf, np.inf],
                                                       np.nan)
    print "Combined dataframe: "
    print combined_df.head()
    print "Combined log dataframe: "
    print log_counts_df.head()
    # Plot correlation
    from pandas.tools.plotting import scatter_matrix
    scatter_matrix(log_counts_df, alpha=0.2, figsize=(8, 7))
    plot_utils.save_fig("quantile_vs_tmm_corr", ext="png")
Example #29
def plot_discern_distributions( aml, brca, luad ):
	'''
	Plot some useful visualizations of the DISCERN scores as a scatter matrix, where
	the diagonal is the kernel density of the scores, and the off-diagonals are
	scatter plots comparing two conditions. Pass in filenames for where the DISCERN
	scores are stored.
	'''

	from pandas.tools.plotting import scatter_matrix
	import seaborn as sns

	AML = pd.read_csv( aml, index_col=0 )
	BRCA = pd.read_csv( brca, index_col=0 )
	LUAD = pd.read_csv( luad, index_col=0 )
	AML['Gene'], BRCA['Gene'], LUAD['Gene'] = AML.index, BRCA.index, LUAD.index
	AML['AML'], BRCA['BRCA'], LUAD['LUAD'] = np.log10(AML['T2']), np.log10(BRCA['T2']), np.log10(LUAD['T2'])
	AML, BRCA, LUAD = AML[['Gene', 'AML']], BRCA[['Gene', 'BRCA']], LUAD[['Gene', 'LUAD']]

	data = pd.merge( AML, BRCA, on='Gene' )
	data = pd.merge( data, LUAD, on='Gene' )
	
	with sns.axes_style( "whitegrid" ):
		scatter_matrix( data, alpha=0.2, figsize=(6,6), diagonal='kde', color='c', density_kwds={'c': 'r', 'lw':1}, lw=0, grid=False ) 

	plt.savefig( 'DISCERN_Scores.pdf' )
	plt.clf()

	print "TOP 10 GENES SORTED BY EACH METHOD"
	print "AML"
	print data.sort( 'AML', ascending=False )[['Gene', 'AML']][:10]
	print
	print "BRCA"
	print data.sort( 'BRCA', ascending=False )[['Gene', 'BRCA']][:10]
	print
	print "LUAD"
	print data.sort( 'LUAD', ascending=False )[['Gene', 'LUAD']][:10]
Example #30
def scatterPlot(dataset, out="DISPLAY"):
    """
    Perform a scatter plot of dataset.
    
    Args:
        DataFrame:: dataset
        str::out: path to save the plot.
    """
    
    graph = scatter_matrix(dataset)
    if out=='DISPLAY':
        plt.show()
    else:
        plt.savefig(out+"scatter_matrix")
    plt.clf()
    return graph
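A usage sketch for scatterPlot (plt and scatter_matrix are assumed to be imported at module level); the output path prefix here is hypothetical:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

scatterPlot(pd.DataFrame(np.random.rand(50, 3), columns=list('abc')), out='./demo_')
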
Example #31
    label="population",
    c="median_house_value",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
)
plt.legend()
plt.show()

corr_matrix = housing.corr().round(2)
corr_matrix['median_house_value'].sort_values(ascending=False)

from pandas.tools.plotting import scatter_matrix
attributes = [
    "median_house_value", "median_income", "total_rooms", "housing_median_age"
]
scatter_matrix(housing[attributes], figsize=(12, 8))
plt.show()
housing.info()
housing.total_bedrooms.hist()
plt.show()

housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
housing[
    "bedrooms_per_room"] = housing["total_bedrooms"] / housing["total_rooms"]
housing[
    "population_per_household"] = housing["population"] / housing["households"]

corr_matrix = housing.corr()
corr_matrix['median_house_value'].sort_values(ascending=False)

housing = strat_train_set.copy()
    (symbol, DataReader(symbol, "yahoo", pause=1)) for symbol in symbols)
panel = Panel(data).swapaxes('items', 'minor')
closing = panel['Close'].dropna()
closing.head()

# Calculate log returns
rets = log(closing / closing.shift(1)).dropna()
rets.head()

# Correlation Matrix
corr_matrix = rets.corr()
corr_matrix

# Plot correlation and scatter
from pandas.tools.plotting import scatter_matrix
scatter_matrix(rets)

#Cholesky decomposition
from scipy.linalg import cholesky
upper_cholesky = cholesky(corr_matrix, lower=False)
upper_cholesky

# Simulation parameters
# business days
import numpy as np
from pandas import bdate_range  # business days

n_days = 21
dates = bdate_range(start=closing.ix[-1].name, periods=n_days)
n_assets = len(symbols)
n_sims = 50000
Example #33


# histograms
dataset.hist()
plt.show()

#multivariate plot
# scatter plot matrix
scatter_matrix(dataset)
plt.show()

# Split-out validation dataset
array = dataset.values
X = array[:,0:4]
Y = array[:,4]
validation_size = 0.15
# Import the housing information for analysis

housing = pd.DataFrame.from_csv('../data/housing.csv', index_col=0)
housing.head()

# In[5]:

# Use covariance to calculate the association

housing.cov()

# In[6]:

# Use correlation to calculate the association is more appropriate in this case
housing.corr()

# In[7]:

# scatter matrix plot
from pandas.tools.plotting import scatter_matrix
sm = scatter_matrix(housing, figsize=(10, 10))

# ## Let's do an analysis by yourself!
#
# ## Observe the association between LSTAT and MEDV:

# In[8]:

# This time we take a closer look at MEDV vs LSTAT. What association between MEDV and LSTAT do you observe?
housing.plot(kind='scatter', x='LSTAT', y='MEDV', figsize=(10, 10))
             alpha=0.4,
             s=dataset["population"] / 100,
             label="population",
             figsize=(10, 7),
             c="median_house_value",
             cmap=plt.get_cmap("gist_rainbow"),
             colorbar=True)
plt.legend()

corr_matrix = dataset.corr()
print(corr_matrix["median_house_value"].sort_values(ascending=False))

attributes = [
    "median_house_value", "median_income", "total_rooms", "housing_median_age"
]
scatter_matrix(dataset[attributes], figsize=(8, 8))

dataset["rooms_per_household"] = dataset["total_rooms"] / dataset["households"]
dataset[
    "bedrooms_per_room"] = dataset["total_bedrooms"] / dataset["total_rooms"]
dataset[
    "population_per_household"] = dataset["population"] / dataset["households"]
corr_matrix = dataset.corr()
print(corr_matrix["median_house_value"].sort_values(ascending=False))

dataset1 = dataset
dataset_labels = dataset["median_house_value"].copy()
dataset = dataset.drop("median_house_value", axis=1)
dataset = dataset.dropna(subset=["total_bedrooms"])

median = dataset["total_bedrooms"].median()
def DrawGraph8(df):
    from pandas.tools.plotting import scatter_matrix
    scatter_matrix(df[['Open', 'High', 'Low', 'Close']], alpha=0.2, figsize=(25,16), diagonal='kde')
    plt.show()
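A usage sketch for DrawGraph8 with a synthetic OHLC price frame; the function's pandas.tools.plotting import assumes an older pandas, and plt is expected at module level:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

prices = 100 + np.cumsum(np.random.randn(250, 4), axis=0)
DrawGraph8(pd.DataFrame(prices, columns=['Open', 'High', 'Low', 'Close']))
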
# In[735]:

df.columns


# Finding the co-relation between the features

# In[736]:

df.corr()


# In[737]:

scatter_matrix(df,alpha=0.5, figsize=(30,32));


# ### Splitting the dataset with all features

# In[738]:

df_allfeatures = df


# In[739]:

df.columns


# In[740]:
Example #38
##print(auto.tail())
print(auto.describe())
##print(list(auto))

## range, mean, and standard deviation for columns
## only use first 7 columns
rownames = list(['min', 'max', 'mean', 'sd'])
vals = pd.DataFrame([
    auto.ix[:, 0:6].min(), auto.ix[:, 0:6].max(), auto.ix[:, 0:6].mean(),
    auto.ix[:, 0:6].std()
],
                    index=rownames)
print(vals)

##compare data
axes = scatter_matrix(auto)
plt.show()
##Now with subplots :)
fig, axes = plt.subplots(nrows=2, ncols=2)

ax1 = auto.plot(x='weight',
                y='mpg',
                c='displacement',
                kind='scatter',
                ax=axes[0, 0],
                rot=45)

auto.plot(x='horsepower',
          y='mpg',
          c='acceleration',
          kind='scatter',
from sklearn.cluster import AffinityPropagation as Clusterer
clusterer = Clusterer()

X, y = data.get.people_xy()
vecs = []
names = []
couples = data.get.couples_raw()
for couple in couples:
    vecs.append(
        np.array(X[y.index(couple["male"])]) -
        np.array(X[y.index(couple["female"])]))
    names.append(couple["male"].split(' ')[0] + " - " +
                 couple["female"].split(" ")[0])

labels = Clusterer().fit(X).labels_

df = pandas.DataFrame(vecs,
                      columns=[
                          "Extroversion", "Emotional", "Agreeableness",
                          "Conscientiousness", "Intellect"
                      ])

plot = scatter_matrix(df,
                      figsize=(15, 15),
                      marker='o',
                      hist_kwds={'bins': 10},
                      s=60,
                      alpha=1,
                      cmap="nipy_spectral")

plt.show()
Example #40
print(
    "What is the value of the response variable for patient number 415?")
print(pd_diabetes['Y'][415])

print("¿Cuál es el resumen de datos?")
print(pd_diabetes.describe())

# import matplotlib.pyplot as plt
# plt.show()
#plt.plot(pd_diabetes['AGE'].plot.hist(x = 'Age',alpha=0.5))
plt.hist(pd_diabetes['AGE'])
plt.show()

plt.scatter(x=pd_diabetes['AGE'], y=pd_diabetes['Y'])
plt.show()

# plot the scatter matrix
from pandas.tools.plotting import scatter_matrix
scatter_matrix(pd_diabetes, alpha=0.2, figsize=(12, 12), diagonal='kde')

pd_diabetes.corr()

# # #Y vs SEX, scatter plot
# plt.boxplot(by='SEX',column = 'Y')

# fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(6, 6), sharey=True)
# axes[0, 0].boxplot(pd_diabetes, labels=labels)
# axes[0, 0].set_title('Default', fontsize=fs)

# plt.show()
Example #41
 def scat(**kwds):
     return plotting.scatter_matrix(df, **kwds)
Example #42
data = pandas.read_csv('brain_size.csv', sep=';', na_values=".")
t = np.linspace(-6, 6, 20)
sin_t = np.sin(t)
cos_t = np.cos(t)
pandas.DataFrame({'t': t, 'sin': sin_t, 'cos': cos_t})
data.shape
data.columns
print(data['Gender'])
data[data['Gender'] == 'Female']['VIQ'].mean()
groupby_gender = data.groupby('Gender')
for gender, value in groupby_gender['VIQ']:
    print((gender, value.mean()))
groupby_gender.mean()
from pandas.tools import plotting
plotting.scatter_matrix(data[['Weight', 'Height', 'MRI_Count']])
plotting.scatter_matrix(data[['PIQ', 'VIQ', 'FSIQ']])
stats.ttest_1samp(data['VIQ'], 0)
female_viq = data[data['Gender'] == 'Female']['VIQ']
male_viq = data[data['Gender'] == 'Male']['VIQ']
stats.ttest_ind(female_viq, male_viq)
stats.ttest_ind(data['FSIQ'], data['PIQ'])
stats.ttest_rel(data['FSIQ'], data['PIQ'])
stats.ttest_1samp(data['FSIQ'] - data['PIQ'], 0)
stats.wilcoxon(data['FSIQ'], data['PIQ'])
# end of section 23

x = np.linspace(-5, 5, 20)
np.random.seed(1)
# normal distributed noise
y = -5 + 3 * x + 4 * np.random.normal(size=x.shape)
Example #43
import numpy as np
colors = np.array(['red', 'green', 'blue', 'yellow'])

plt.scatter(beer["calories"], beer["alcohol"], c=colors[beer["cluster"]])
plt.scatter(centers.calories,
            centers.alcohol,
            linewidths=3,
            marker='+',
            s=300,
            c='black')
plt.xlabel("Calories")
plt.ylabel("Alcohol")

scatter_matrix(beer[["calories", "sodium", "alcohol", "cost"]],
               s=100,
               alpha=1,
               c=colors[beer["cluster"]],
               figsize=(10, 5))
plt.suptitle("With 3 centroids initialized")

scatter_matrix(beer[["calories", "sodium", "alcohol", "cost"]],
               s=100,
               alpha=1,
               c=colors[beer["cluster2"]],
               figsize=(10, 5))
plt.suptitle("With 2 centroids initialized")
plt.show()

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
def function(myData):
    print(myData.head(20))
    print()

    # Summary of data
    print(myData.describe())
    print()

    # Look at the number of instances of each class
    # class distribution
    print(myData.groupby('increase_rate').size())

    # Box and whisker plots
    myData.plot(kind='box',
                subplots=True,
                layout=(2, 2),
                sharex=False,
                sharey=False)
    plt.show()

    # Histogram
    myData.hist()
    plt.show()

    # Scatterplots to look at 2 variables at once
    # scatter plot matrix
    scatter_matrix(myData)
    plt.show()

    ######################################################
    # Evaluate algorithms
    ######################################################

    # Separate training and final validation data set. First remove class
    # label from data (X). Setup target class (Y)
    # Then make the validation set 20% of the entire
    # set of labeled data (X_validate, Y_validate)
    valueArray = myData.values
    X = valueArray[:, 0:4]
    Y = valueArray[:, 4]
    test_size = 0.20
    seed = 7
    X_train, X_validate, Y_train, Y_validate = cross_validation.train_test_split(
        X, Y, test_size=test_size, random_state=seed)

    # Setup 10-fold cross validation to estimate the accuracy of different models
    # Split data into 10 parts
    # Test options and evaluation metric
    num_folds = 10
    num_instances = len(X_train)
    seed = 7
    scoring = 'accuracy'

    #Normalize the Data
    X = preprocessing.normalize(X)

    ######################################################
    # Use different algorithms to build models
    ######################################################

    # Add each algorithm and its name to the model array
    models = []
    models.append(('KNN', KNeighborsClassifier()))
    models.append(('CART', DecisionTreeClassifier()))
    models.append(('NB', GaussianNB()))
    models.append(('SVM', SVC()))
    models.append(('RF', RandomForestClassifier()))

    # Evaluate each model, add results to a results array,
    # Print the accuracy results (remember these are averages and standard deviations across the CV folds)
    results = []
    names = []
    for name, model in models:
        kfold = cross_validation.KFold(n=num_instances,
                                       n_folds=num_folds,
                                       random_state=seed)
        cv_results = cross_validation.cross_val_score(model,
                                                      X_train,
                                                      Y_train,
                                                      cv=kfold,
                                                      scoring=scoring)
        results.append(cv_results)
        names.append(name)
        msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        print(msg)


######################################################
# For the best model, see how well it does on the
# validation test.  This is for KNeighborsClassifier
######################################################
# Make predictions on validation dataset
    knn = KNeighborsClassifier()
    knn.fit(X_train, Y_train)
    predictions = knn.predict(X_validate)

    print()
    print(accuracy_score(Y_validate, predictions))
    print(confusion_matrix(Y_validate, predictions))
    print(classification_report(Y_validate, predictions))

    ######################################################
    # For the best model, see how well it does on the
    # validation test. This is for DecisionTreeClassifier
    ######################################################
    # Make predictions on validation dataset
    cart = DecisionTreeClassifier()
    cart.fit(X_train, Y_train)
    predictions = cart.predict(X_validate)

    print()
    print(accuracy_score(Y_validate, predictions))
    print(confusion_matrix(Y_validate, predictions))
    print(classification_report(Y_validate, predictions))

    ######################################################
    # For the best model, see how well it does on the
    # validation test. This is for GaussianNB
    ######################################################
    # Make predictions on validation dataset
    nb = GaussianNB()
    nb.fit(X_train, Y_train)
    predictions = nb.predict(X_validate)

    print()
    print(accuracy_score(Y_validate, predictions))
    print(confusion_matrix(Y_validate, predictions))
    print(classification_report(Y_validate, predictions))

    ######################################################
    # For the best model, see how well it does on the
    # validation test. This is for SVM
    ######################################################
    # Make predictions on validation dataset
    svm = SVC()
    svm.fit(X_train, Y_train)
    predictions = svm.predict(X_validate)

    print()
    print(accuracy_score(Y_validate, predictions))
    print(confusion_matrix(Y_validate, predictions))
    print(classification_report(Y_validate, predictions))

    ######################################################
    # For the best model, see how well it does on the
    # validation test. This is for RandomForestClassifier
    ######################################################
    # Make predictions on validation dataset
    rf = RandomForestClassifier()
    rf.fit(X_train, Y_train)
    predictions = rf.predict(X_validate)

    print()
    print(accuracy_score(Y_validate, predictions))
    print(confusion_matrix(Y_validate, predictions))
    print(classification_report(Y_validate, predictions))
#-*- coding: utf-8 -*-
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from pandas.tools.plotting import scatter_matrix

df = DataFrame(np.random.randn(1000, 4), columns=['a', 'b', 'c', 'd'])
corr_mat = df.corr()
print corr_mat

scatter_matrix(df, alpha=0.2, figsize=(16, 16), diagonal='kde')

plt.show()
#plt.savefig('features.png')
Example #46
train["Fare"][train["Survived"] == 1] = train["Fare"][train["Survived"] == 1].fillna(train["Fare"][train["Survived"] == 1].median())


train["SibSp"][train["Survived"] == 1] = train["SibSp"][train["Survived"] == 1].fillna(train["SibSp"][train["Survived"] == 1].mode())
train["SibSp"][train["Survived"] == 0] = train["SibSp"][train["Survived"] == 0].fillna(train["SibSp"][train["Survived"] == 0].mode())
train["Parch"][train["Survived"] == 1] = train["Parch"][train["Survived"] == 1].fillna(train["Parch"][train["Survived"] == 1].mode())
train["Parch"][train["Survived"] == 0] = train["Parch"][train["Survived"] == 0].fillna(train["Parch"][train["Survived"] == 0].mode())

#creating new column "Relatives" containing the sum of the number of Siblings/Spouse/Parent or Children a person has onboard
train["Relatives"] = train["SibSp"] + train["Parch"]
train["Relatives"][train["Survived"] == 1] = train["Relatives"][train["Survived"] == 1].fillna(train["Relatives"][train["Survived"] == 1].mode())
train["Relatives"][train["Survived"] == 0] = train["Relatives"][train["Survived"] == 0].fillna(train["Relatives"][train["Survived"] == 0].mode())

#plotting scatter plot to get an idea of correlation within variables
numeric_cols = train[["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked","Relatives"]]
_ = scatter_matrix(numeric_cols, c = train["Survived"] ,alpha = 0.2, figsize=(8,8), diagonal = 'hist')
plt.show()


#converting "Age" into a discrete variable for decision tree functioning. Age of 16 was taken as all people below age 16 (children) had
#higher chances of getting saved as observed from training data.
train["Age"][train["Age"] < 16] = 0
train["Age"][train["Age"] >= 16][ train["Age"] < 60] = 1
train["Age"][train["Age"] >= 60] = 2

#taking log of Fare column as it has a very long range. This discretizes into only 3 values - 0,1 and 2.
train["Fare"] = np.log10(train["Fare"]+1).astype(int)


#plotting scatter plot again to see effect of above discretizations
numeric_cols = train[["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked","Relatives"]]
data.plot(kind='box', subplots=True, layout=(3, 3), sharex=False, sharey=False)
plt.show()

# In[7]:

# Correlation Matrix Plot

import numpy

correlations = data.corr()
# plot correlation matrix
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)
ticks = numpy.arange(0, 9, 1)
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(names)
ax.set_yticklabels(names)
plt.show()

# In[8]:

# Scatterplot Matrix
from pandas.tools.plotting import scatter_matrix
scatter_matrix(data)
plt.show()

# In[ ]:
plt.savefig("attribute_histogram_plots")
# plt.show()

sf.plot(kind="scatter", x="longitude", y="latitude", alpha=0.2)
plt.savefig('map1.png')

sf.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4, figsize=(10,7), c="lastsoldprice", cmap=plt.get_cmap("jet"), colorbar=True, sharex=False)
plt.savefig('map2.png')

corr_matrix = sf.corr()
corr_matrix["lastsoldprice"].sort_values(ascending=False)

from pandas.tools.plotting import scatter_matrix

attributes = ["lastsoldprice", "finishedsqft", "bathrooms", "zindexvalue"]
scatter_matrix(sf[attributes], figsize=(12, 8))
plt.savefig('matrix.png')

sf.plot(kind="scatter", x="finishedsqft", y="lastsoldprice", alpha=0.5)
plt.savefig('scatter.png')

sf['price_per_sqft'] = sf['lastsoldprice']/sf['finishedsqft']

corr_matrix = sf.corr()
corr_matrix["lastsoldprice"].sort_values(ascending=False)

len(sf['neighborhood'].value_counts())

freq = sf.groupby('neighborhood').count()['address']
mean = sf.groupby('neighborhood').mean()['price_per_sqft']
cluster = pd.concat([freq, mean], axis=1)


""" there seem to exist positive correlation between balance and income, limit and rating"""

dummies = pd.get_dummies(df['Married']).rename(columns = lambda x: 'Married_'+str(x))

df=pd.concat([df,dummies["Married_Yes"]],axis=1)

#2

dummies = pd.get_dummies(df['Ethnicity']).rename(columns = lambda x: 'Ethnicity_'+str(x))

df_fin=pd.concat([df,dummies[["Ethnicity_Asian" , "Ethnicity_Caucasian"]]],axis=1).drop(["Ethnicity","Married"],1)

scatter_matrix(df_fin,figsize=(10,10))
#3
est = smf.ols(formula = 'Balance ~ Unnamed: 0+ Income+ Limit+ Rating+ Cards+ Age+ Education+ Gender+ Student+ Married+ Married_Yes+ Ethnicity_Asian+ Ethnicity_Caucasian', data=df_fin).fit()
print est.summary()

#http://statsmodels.sourceforge.net/devel/examples/notebooks/generated/example_regression_plots.html

# fig = plt.figure(figsize=(12,8))
# fig = sm.graphics.plot_regress_exog(est, "Income", fig=fig)

student_resid=est.outlier_test()["student_resid"]

plt.plot(est.fittedvalues,student_resid)

#5
est1 = smf.ols(formula = 'Balance ~ Income+ Limit+ Rating+ Student', data=df_fin).fit()
with open('points1878.geojson') as f:
    point_data = json.load(f)

cells = 25, 50, 75
bandwidths = 25, 50, 100, 150, 250

data = {year: data.add_coordinates(value, point_data, coordinates_to_meters=False)
        for year, value in pop_data.items()}

plotting.plot_densities_all(
    data,
    cell_size=50,
    bw=100,
    kernel='epanechnikov',
    subplot_title_param=dict(year='vuosi'),
    labels='luterilaiset ortodoksit erotus'.split(),
    title='Tiheys'
).set_facecolor('white')

results = pd.read_csv('kaikki.csv')
print(results)
corr_values = results.loc[:, lambda results: 's km exposure isolation information'.split()]
corr_values.columns = 'S D hPg gPg H'.split()
print(corr_values.corr(method='spearman'))

plt.style.use('ggplot')
scatter_matrix(corr_values, figsize=(10, 10), diagonal='hist')
plt.suptitle('Hajontamatriisi', size=20)
plt.show()
# In[3]:

df

# In[4]:

df.index = list(range(1, len(df.index) + 1))

# In[5]:

color_codes = ["#FF0000", "#0000FF", "#00FF00"]
class_names = list(set(df.iloc[:, -7]))
colors = [color_codes[class_names.index(x)] for x in list(df.iloc[:, -7])]
plotting.scatter_matrix(df[list(df.columns[:])],
                        figsize=(30, 30),
                        color=colors)
plt.show()

# Red points are healthy subjects and blue are Parkinson's patients; some of the scatter plots suggest the two groups could be separated.

# In[6]:

# Create a boolean mask to exclude the name and score columns
libool = [True] * len(df.columns)
libool[-7] = False
libool[0] = False
# Normalize the matrix (z-score each column)
dfs = df.iloc[:, libool].apply(lambda x: (x - x.mean()) / x.std(),
                               axis=0).fillna(0)
Example #52
def scattergr(dataset):

	scatter_matrix(dataset)
	plt.show()
Example #53
predicted = clf.predict(x_test)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

sales_test['return'] = clf.predict(z)
print(sales_test['return'].value_counts())

Kscore = cross_val_score(clf, x, y, cv=10, scoring='accuracy')
print(Kscore)
print(Kscore.mean())

#correlation matrix
from pandas.tools.plotting import scatter_matrix
plt.style.use('ggplot')
scatter = scatter_matrix(attributes, alpha=0.2, figsize=(6, 6), diagonal='kde')
#plt.show()


def plot_corr(df, size=10):
    '''Function plots a graphical correlation matrix for each pair of columns in the dataframe.

    Input:
        df: pandas DataFrame
        size: vertical and horizontal size of the plot'''

    corr = df.corr()
    fig, ax = plt.subplots(figsize=(size, size))
    ax.matshow(corr)
    plt.xticks(range(len(corr.columns)), corr.columns)
    plt.yticks(range(len(corr.columns)), corr.columns)
Example #54
plt.scatter(tips['total_bill'], tips['tip'], marker='x')

plt.scatter(tips['total_bill'],
            tips['tip'],
            marker='x',
            alpha=0.5,
            s=100,
            color='green')

# loading the data
feliz = pd.read_csv('happy2015.csv')
feliz.columns

# importing libraries
from pandas.tools.plotting import scatter_matrix
scatter_matrix(feliz)

#subsetting
feliz.columns
scatter_matrix(feliz[['Happiness Score', 'Economy (GDP per Capita)']])

sub_feliz = feliz[[
    'Happiness Score', 'Economy (GDP per Capita)',
    'Trust (Government Corruption)', 'Generosity'
]]
scatter_matrix(sub_feliz)

iris.plot(kind="scatter", x="sepal_length", y="sepal_width")

g = sns.FacetGrid(iris, hue='species', size=5)
g.map(plt.scatter, 'sepal_length', 'sepal_width').add_legend()
Example #55
 def ScatterPlot(self, data_frame):
     scatter_matrix(data_frame, diagonal='kde', color='green', alpha=1)
     plt.savefig("ScaterCommon.png")
Example #56
			line = line.replace("None", "null")
			fp.write("db.rssInfo.insert("+ line +")\n")
			pp.pprint(line)
			#print(row.inserted_id)
		except:
			print(i)
			i = i + 1
			pass;

fp.close()
#or use sys.stdin if data is too large
#can be added depending on the size of json file

cursor = collection1.find()
client.close()

exit(0)

#to get description and summary of data
data = pd.DataFrame(list(collection1.find()))

for x in data:
	#whatever computation has to be done
	pass;

print(data.describe())
data.plot()

scatter_matrix(data, figsize = (10, 10))
plt.show()
Example #57
def visual_correlations(data_frame):
    scatter_matrix(data_frame[["median_house_value", "median_income"]],
                   figsize=(12, 8))
    pyplot.show()
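A usage sketch for visual_correlations, assuming module-level imports of pyplot (from matplotlib) and scatter_matrix; the column names match what the function selects, filled with synthetic values here:

import numpy as np
import pandas as pd
from matplotlib import pyplot
from pandas.plotting import scatter_matrix

demo = pd.DataFrame({'median_house_value': np.random.uniform(5e4, 5e5, 300),
                     'median_income': np.random.uniform(1, 15, 300)})
visual_correlations(demo)
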
rfecv.fit(trainData, trainLabel)

print("Optimal number of features : %d" % rfecv.n_features_)

# Plotting features with cross validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()


# After about an hour, the SVM model has been trained while optimizing the feature set. Using only these features
# reduces the model's training time, so only 373 features are used instead of the original 561 inputs.


print('Accuracy of the SVM model on test data is ', rfecv.score(testData,testLabel) )
# Getting the best features
best_features = []
for ix,val in enumerate(rfecv.support_):
    if val==True:
        best_features.append(testData[:,ix])


#The above yields an accuracy of approximately 97%. Following helps in visualization.
from pandas.tools.plotting import scatter_matrix
visualize = pd.DataFrame(np.asarray(best_features).T)
print(visualize.shape)
scatter_matrix(visualize.iloc[:,0:5], alpha=0.2, figsize=(6, 6), diagonal='kde')
@author: abrown09
"""

import pandas as pd
import matplotlib.pyplot as plt
from pandas.tools.plotting import scatter_matrix

churn_data = pd.read_csv('/Users/amybrown/Thinkful/Capstone/Data/WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Data exploration

churn_data.shape # check dimensions
churn_data.dtypes
print(churn_data.head(10)) 
# not sure what the tenure variable means - it must be the amount of time the customer has been with the company
print(churn_data.describe()) # looks like numerical data is complete. senior citizen is not an age variable but a binary categorical variable
# mean tenure is 32...i'm going to guess this is months and not years because that seems crazy

categorical = churn_data.dtypes[churn_data.dtypes == 'object'].index
print(categorical)

churn_data[categorical].describe() # all data appear complete
# I think total charges needs to be changed to float

churn_data.hist(column='tenure', figsize=(9,6))
churn_data.hist(column='MonthlyCharges', figsize=(9,6))

scatter_matrix(churn_data, alpha=0.2, figsize=(6, 6), diagonal='kde')

# should Yes and No responses be changed to 1s and 0s?
Example #60
from sklearn.svm import SVC

# Importing the dataset
titanic_train = pd.read_csv('../input/train.csv')
titanic_test = pd.read_csv('../input/test.csv')
titanic_train.info()
titanic_test.info()
titanic_train.describe()

# In[ ]:

#plotting the scatter matrix first
import matplotlib.pyplot as plt
from pandas.tools.plotting import scatter_matrix

scatter_matrix(titanic_train, figsize=(25, 25))
plt.show()

# In[ ]:

#dropping the columns which might not affect prediction
titanic_train = titanic_train.drop(['PassengerId', 'Ticket'], 1)
titanic_test = titanic_test.drop(['Ticket'], 1)

#To convert Sex to category datatype
titanic_train['Sex'] = titanic_train['Sex'].astype('category')
titanic_test['Sex'] = titanic_test['Sex'].astype('category')

#drop cabin because of too many NaN values
titanic_train = titanic_train.drop(['Cabin'], 1)
titanic_test = titanic_test.drop(['Cabin'], 1)