Python pairplot Examples, seaborn.pairplot Python Examples

Example #1

0

Show file

def plot_uturns(summary, critical_rad=2.9, save=False, condition='Condition',
    context='notebook'):
    """Pair-plot and report the tracks whose steepest turn exceeds critical_rad.

    The turn column is the first column of ``summary`` whose name starts with
    'Max. Turn Over'; the step count is the first all-digit word of that
    column name. For every condition that has at least one such track, the
    number of tracks and the share of U-turns is printed. The figure is
    written to a PNG in the working directory when ``save`` is true and is
    shown otherwise.
    """
    # First matching column wins; next() raises StopIteration if there is none.
    turn_candidates = (name for name in summary.columns
                       if name.startswith('Max. Turn Over'))
    turn_column = next(turn_candidates)

    is_uturn = summary[turn_column] > critical_rad
    uturns = summary[is_uturn]

    digit_words = (token for token in turn_column.split() if token.isdigit())
    skip_steps = int(next(digit_words))

    print('\nPlotting turns with more than {} rad over {} steps'.format(
        critical_rad, skip_steps))
    report = '  {} tracks in {} with {} U-Turns ({:2.2f} %).'
    for cond, cond_uturns in uturns.groupby(condition):
        n_tracks = len(summary[summary[condition] == cond])
        n_turns = len(cond_uturns)
        print(report.format(n_tracks, cond, n_turns, n_turns/n_tracks*100))

    sns.set(style='white', context=context)
    plot_columns = ['Skew Lines Distance', 'Mean Velocity',
                    'Arrest Coefficient', condition, turn_column]
    sns.pairplot(uturns[plot_columns], hue=condition, diag_kind='kde')
    plt.tight_layout()

    if not save:
        plt.show()
        return

    # File name: all condition labels (with '= ' stripped) joined by '-'.
    labels = [value.replace('= ', '')
              for value in summary[condition].unique()]
    suffix = '_{:1.1f}over{}steps.png'.format(critical_rad, skip_steps)
    plt.savefig('U-Turns_' + '-'.join(labels) + suffix, dpi=300)

Example #2

0

Show file

File: exam-bicycle.py Project: 52Pig/algorithm

def useDataAnalysis(df_train_origin):
    """Draw exploratory plots for the training frame and print its correlations.

    Plots the mean ``count`` grouped by wind speed, humidity and temperature,
    a temperature/humidity scatter, a 2x3 grid of ``count`` scatters, a
    seaborn pair plot and a correlation matrix (which is also printed), then
    shows all open figures.

    :param df_train_origin: DataFrame providing the columns referenced below
        ('windspeed', 'humidity', 'temp', 'atemp', 'month', 'hour', 'day',
        'weather', 'count').
    """
    # Mean of every column grouped by wind speed, humidity and temperature;
    # only the 'count' column is drawn.
    for key in ('windspeed', 'humidity', 'temp'):
        df_train_origin.groupby(key).mean().plot(y='count', marker='o')
    # Temperature against humidity.
    df_train_origin.plot(x='temp', y='humidity', kind='scatter')
    # Scatter 'count' against each dimension on a shared y axis.
    fig, axs = plt.subplots(2, 3, sharey=True)
    df_train_origin.plot(kind='scatter', x='temp', y='count', ax=axs[0, 0], figsize=(16, 8), color='magenta')
    df_train_origin.plot(kind='scatter', x='atemp', y='count', ax=axs[0, 1], color='cyan')
    df_train_origin.plot(kind='scatter', x='humidity', y='count', ax=axs[0, 2], color='red')
    df_train_origin.plot(kind='scatter', x='windspeed', y='count', ax=axs[1, 0], color='yellow')
    df_train_origin.plot(kind='scatter', x='month', y='count', ax=axs[1, 1], color='blue')
    df_train_origin.plot(kind='scatter', x='hour', y='count', ax=axs[1, 2], color='green')

    sns.pairplot(df_train_origin[["temp", "month", "humidity", "count"]], hue="count")

    # Correlation between the selected columns.
    corr = df_train_origin[['temp', 'weather', 'windspeed', 'day', 'month', 'hour', 'count']].corr()
    # print() with a single argument behaves the same on Python 2 and 3,
    # whereas the former print statement is a syntax error on Python 3.
    print(corr)

    # Show the correlation matrix, using colour intensity for the strength.
    plt.figure()
    plt.matshow(corr)
    plt.colorbar()
    plt.show()

Example #3

0

Show file

File: RMSD_Analysis_Mutant_PDBs.py Project: jaaamessszzz/DDGBenchmarking

            def pairplot():
                """Add one pair plot per PDB ID column (wild type first, then mutant) to output_pdf."""
                sns.set_style('white', {'axes.grid': True, 'axes.edgecolor':'0'})
                sns.set_context('paper', font_scale=1.5, rc={'lines.linewidth': 1})
                plot_vars = ['Experimental DDG', 'Predicted DDG', 'Absolute Error DDG', 'Mutant Complex REU', 'RMSD']
                # The PDF note is identical for both plots apart from the hue column name.
                note_template = ('This pairplot compares the various numerical variables contained within the mutant_df dataframe. '
                                 'The following variables are compared in a pairwise fashion where hue is %s: '
                                 'Experimental DDG, Predicted DDG, Absolute Error DDG, Mutant Complex Rosetta Energy, and RMSD')
                for hue_column in ('WT PDBID', 'Mutant PDBID'):
                    grid = sns.pairplot(df_subset,
                                        vars=plot_vars,
                                        size=3,
                                        hue=hue_column,
                                        hue_order=sorted(list(mutant_df[hue_column].unique())),
                                        kind='scatter',
                                        diag_kind='hist')
                    lgd = grid.fig.legend(handles=df_subset[hue_column], labels=df_subset[hue_column], bbox_to_anchor=(1.05, 0.5))
                    output_pdf.attach_note(note_template % hue_column)
                    title = grid.fig.suptitle('PairPlot for %s' % description, fontsize=24, y=1.05)
                    # Title and legend sit outside the axes, so they are passed as extra artists for the tight bounding box.
                    output_pdf.savefig(grid.fig, pad_inches=1, bbox_extra_artists=[title, lgd], bbox_inches='tight')

Example #4

0

Show file

File: Data_Gather_Preprocess.py Project: XiaolinZHONG/DATA

    def Plot_data_df(self,df_2,label,strac,pair=True):
        """Plot every column of ``df_2`` as a scatter and as a distribution.

        One figure row is drawn per column: on the left the values against
        their row position, coloured by ``label``; on the right the
        distribution of the values. Optionally a seaborn pair plot coloured by
        the label follows. All figures are shown at the end.

        :param df_2: DataFrame whose columns are plotted
        :param label: one label per row of ``df_2``; used as scatter colour
            and as the pair-plot hue
        :param strac: titles for the columns, indexed by column position;
            also passed as ``vars`` to the pair plot
        :param pair: when true, additionally draw the pair plot
        """
        import matplotlib.pylab as plt
        import seaborn as sns
        import pandas as pd

        x = range(df_2.shape[0])
        maxn = df_2.shape[1]
        fig = plt.figure(1,figsize=(10,maxn+10))
        sns.set(style='darkgrid',color_codes=True)
        # range() instead of the Python 2-only xrange().
        for i in range(maxn):
            # Positional column access; the former ``.ix`` indexer has been
            # removed from pandas.
            y = df_2.iloc[:, i]
            scatter_axes = fig.add_subplot(maxn, 2, 2*i + 1)
            scatter_axes.scatter(x, y, c=label, s=10, alpha=0.7)
            plt.title(strac[i])

            fig.add_subplot(maxn, 2, 2*i + 2)
            sns.distplot(y, axlabel=False)
            plt.title(strac[i])
        # Lay the figure out once, after all subplots and titles exist.
        fig.tight_layout()

        df_label = pd.DataFrame({'label': list(label)})
        df_2 = pd.concat([df_2, df_label], axis=1, join='inner')
        if pair:
            sns.pairplot(df_2, vars=strac, hue='label')
        plt.show()

Example #5

0

Show file

File: DataAnalysis.py Project: lanttern/Machine-Learning-Algorithm-Implementation

    def feature_analysis(self):
        """
        Save pair plots of all features and of a hand-picked feature subset.

        The voice column names, titles and file names are used when
        ``self.is_voicedata`` is true, the EEG ones otherwise. Points are
        coloured by the respective label column.
        """
        if self.is_voicedata:
            name, hue = "voice", "label"
            selected = ["skew", "kurt", "meanfun", "meanfreq", "IQR", "label"]
        else:
            name, hue = "EEG", "Self-defined label"
            selected = ["Delta", "Theta", "Alpha 1", "Beta 2", "Gamma1",
                        "Self-defined label"]

        seaborn.set_context("poster")
        # First pass plots every column, second pass only the selected ones.
        for columns, scope, suffix in ((None, "all", ""),
                                       (selected, "selected", "_selected")):
            plt.figure(figsize = (10,8))
            frame = self.data if columns is None else self.data[columns]
            grid = seaborn.pairplot(frame, hue = hue)
            plt.suptitle("Feature analysis for {} dataset - {} features".format(name, scope))
            grid.savefig("../Figures/{}_exploration{}.png".format(name, suffix), bbox_inches="tight")

Example #6

0

Show file

File: catalog.py Project: sannecottaar/slabpy

 def scatter_corrplot_parameters(self,params):
     """
     Print the data frame built for ``params`` and draw a pair plot of it.

     The frame comes from ``self.pandas_data_frame(params)``; missing values
     are dropped by seaborn before plotting.
     """
     frame = self.pandas_data_frame(params)
     print(frame)
     sns.pairplot(data=frame, size=10, dropna=True)

Example #7

0

Show file

File: C4_3_basicPrinciples.py Project: akansal1/statsintro_python

def scatterplot():
    '''Draw a seaborn pair plot of the iris data set, coloured by species,
    and pass 'multiScatterplot.png' to C2_8_mystyle.printout_plain.'''
    import seaborn as sns

    iris = sns.load_dataset("iris")
    sns.pairplot(data=iris, size=2.5, hue="species")
    C2_8_mystyle.printout_plain('multiScatterplot.png')

Example #8

0

Show file

File: housing.py Project: noprom/python-machine-learning

def visualize_data():
    '''
    Visualize the data: show a pair plot of the columns ``cols`` of the
    module-level frame ``df``.
    '''
    sns.set(context='notebook', style='whitegrid')
    selected = df[cols]
    sns.pairplot(selected, size=2.5)
    plt.show()

Example #9

0

Show file

File: class_crosssectiondataexplorer.py Project: plutoese/mars

 def pair(self,vars=None,save=False):
     """Pair-plot the columns ``vars`` of ``self._data``.

     The figure is handed to ``self._save()`` when ``save`` is true and is
     shown otherwise; in both cases the current figure is closed afterwards.
     """
     sns.pairplot(self._data[vars])
     finish = self._save if save else plt.show
     finish()
     plt.close()

Example #10

0

Show file

File: classifier.py Project: zenki2001cn/SnippetCode

def plot_2D_by_sns(subplot, data, target, target_names):
    """Draw a seaborn pair plot of ``data`` coloured by class.

    ``data`` is converted to a pandas DataFrame. The plot uses a 'species'
    column as hue; when the frame has no such column it is built from
    ``target`` and ``target_names``, so that plain feature arrays can be
    plotted as well. Frames that already carry a 'species' column are
    plotted as before.

    :param subplot: unused
    :param data: feature matrix or DataFrame
    :param target: class of every row of ``data``
    :param target_names: class names, indexed by the values of ``target``
    """
    sns.set()
    # Convert to pandas format.
    pdata = pd.DataFrame(data)
    if 'species' not in pdata.columns:
        # NOTE(review): assumes ``target`` holds integer indices into
        # ``target_names`` (the scikit-learn convention) - confirm with the
        # caller.
        pdata['species'] = [target_names[label] for label in target]
    sns.pairplot(data=pdata, hue='species', markers=["o", "s", "D"])

Example #11

0

Show file

File: simple_regression.py Project: JVP3122/Python-Machine-Learning-NFL-Game-Predictor

def statistical_analysis(df):
	"""Check correlation of features to spread.

	Draws an annotated heat map of the correlation matrix of ``df``, prints
	the absolute correlation of every column with 'result_spread' in
	ascending order, and draws a pair plot of the selected feature columns.
	"""
	# Correlation matrix; computed once and reused for the printed ranking.
	corrmat = df.corr()
	plt.subplots(figsize=(12, 9))
	sns.heatmap(corrmat, cbar=True, annot=True, square=True, fmt='.2f')
	plt.yticks(rotation=0)
	plt.xticks(rotation=90)

	corrvec = corrmat['result_spread'].abs()
	# print() with a single argument behaves the same on Python 2 and 3,
	# whereas the former print statement is a syntax error on Python 3.
	print(corrvec.sort_values())

	# Scatter plot matrix of the selected features.
	sns.set()
	cols = ['result_spread','rush_attempt_diff','turn_diff','yards_diff','third_diff','sack_diff','sack_ydiff','p_attempt_diff']
	sns.pairplot(df[cols], size = 2.5)

	# Author's note from earlier normality checks: rush attempt shows light
	# tails but otherwise these main features appear normally distributed.

Example #12

0

Show file

File: interact_analysis.py Project: chrissly31415/amimanera

def pairplot(df,hue_name=None):
    """Draw a seaborn pair plot of ``df``, optionally coloured by ``hue_name``.

    :param df: DataFrame to plot
    :param hue_name: when not None, stored as a 'target' column that is used
        as the hue. The column is added to a copy, so the caller's frame is
        left unmodified.
    """
    if hue_name is None:
        sns.pairplot(df)
        return

    # assign() returns a new frame instead of writing into the argument.
    sns.pairplot(df.assign(target=hue_name), hue='target')

Example #13

0

Show file

File: plot_linked_stats.py Project: ifiddes/notch2nl_10x

def do_pairplots(counts, base_dir, sample):
    """
    Write three pair plots as PDF files into ``base_dir``: a combined plot
    with both groups as hue, then one plot each for the "Unique" and the
    "Not Unique" rows.
    """
    markers = ["o", "s"]
    r, total_gems, assigned_gems, assigned_gems_by_para = assign_gems(counts)
    df = pd.DataFrame.from_dict(r)
    unique_gems = find_unique_gems(assigned_gems_by_para)
    num_unique = len(unique_gems)
    num_not_unique = len(df) - num_unique

    # Combined plot: the hue labels carry the group sizes.
    unique_label = "{:,} unique".format(num_unique)
    not_unique_label = "{:,} not unique".format(num_not_unique)
    df["Unique mappings"] = [unique_label if gem in unique_gems else not_unique_label
                             for gem in df["GemId"]]
    combined = sns.pairplot(df, hue="Unique mappings", markers=markers, plot_kws=dict(s=10))
    combined.fig.text(0.87, 0.6, "{:,} Total Gems".format(len(total_gems)))
    combined_path = os.path.join(base_dir, "{}_combined_plot.pdf".format(sample))
    combined.savefig(combined_path, format="pdf")

    # Re-label to plain "Unique"/"Not Unique" and plot each group on its own.
    df["Unique mappings"] = ["Unique" if gem in unique_gems else "Not Unique"
                             for gem in df["GemId"]]
    groups = zip(markers, sns.color_palette(), ["Unique", "Not Unique"])
    for marker, color, subset in groups:
        group_df = df[df["Unique mappings"] == subset]
        cmap = sns.light_palette(color, as_cmap=True)
        grid = sns.pairplot(group_df, markers=marker, plot_kws=dict(color=color, s=10))
        grid.map_lower(sns.kdeplot, cmap=cmap, n_levels=50)
        tag = subset.replace(" ", "_").lower()
        group_path = os.path.join(base_dir, "{}_{}_combined_plot.pdf".format(sample, tag))
        grid.savefig(group_path, format="pdf")
    plt.close('all')

Example #14

0

Show file

File: DataAnalysis.py Project: ravindrapanda/sml-suburb-similarity

def Pairplot(feature_mat,weight=None):
	'''Plot pairplot for given feature matrix.

	feature_mat -- DataFrame to plot
	weight -- optional hue for the plot; when given, all columns of
	          feature_mat except the last one are plotted and coloured by it
	'''
	# Local import: the ``seaborn.plt`` alias used previously no longer
	# exists in current seaborn releases.
	import matplotlib.pyplot as plt

	# ``is None`` instead of ``== None``: the latter compares element-wise
	# for array-likes and then has no unambiguous truth value.
	if weight is None:
		sns.pairplot(feature_mat)
	else:
		# hue is passed by keyword; newer seaborn accepts only ``data``
		# positionally.
		sns.pairplot(feature_mat,
			hue=weight,
			palette=sns.color_palette("GnBu_d", n_colors=len(feature_mat)),
			vars=feature_mat.columns.values[:-1])
	plt.show()

Example #15

0

Show file

File: figs_BasicPrinciples.py Project: fluxium/statsintro

def scatterplot():
    """Draw a poster-sized pair plot of the iris data set, coloured by
    species, and pass 'multiScatterplot.png' to mystyle.printout_plain."""
    import seaborn as sns

    sns.set()
    sns.set_context('poster')

    iris = sns.load_dataset("iris")
    sns.pairplot(data=iris, size=2.5, hue="species")
    mystyle.printout_plain('multiScatterplot.png')

Example #16

0

Show file

File: 第五章.py Project: aijiajia/million-eyes

def demo01():
    """Print the head of the iris data set and show its pair plot by species."""
    import matplotlib.pyplot as plt
    import seaborn as sns

    sns.set()
    flowers = sns.load_dataset('iris')
    preview = flowers.head()
    print(preview)
    sns.pairplot(data=flowers, size=1.5, hue='species')
    plt.show()

Example #17

0

Show file

File: visualize.py Project: tereka114/MachineLearningCombinator

def feature_correlation(x,filepath=None, visualize=False):
    """
    Draw a seaborn pair plot of ``x`` and optionally save and/or show it.

    :param x: data passed straight to ``seaborn.pairplot``
    :param filepath: when not None, the figure is written to this path
    :param visualize: when true, the figure is displayed
    """
    seaborn.pairplot(x)
    # Save before showing: once a blocking show() returns, the figure has
    # been closed and savefig() would write an empty canvas.
    if filepath is not None:
        plt.savefig(filepath)
    if visualize:
        # plt instead of the ``seaborn.plt`` alias, which no longer exists
        # in current seaborn releases.
        plt.show()

Example #18

0

Show file

File: housing.py Project: southpaw94/MachineLearning

def main():
    """Explore the UCI housing data and regress MEDV on RM.

    Downloads the data set, shows a pair plot and a correlation heat map of
    five columns, fits ``LinearRegressionGD`` on the standardized RM/MEDV
    columns, plots the cost per epoch and the regression line, and prints
    the predicted price for a house with five rooms.
    """

    df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data',
            header = None,
            # Raw string: '\s' is a regex class, not a string escape.
            sep = r'\s+')
    df.columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM',
            'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B',
            'LSTAT', 'MEDV']
    print(df.head())

    # Select a subset of the features and plot the correlation between features
    cols = ['LSTAT', 'INDUS', 'NOX', 'RM', 'MEDV']
    sns.pairplot(df[cols], size=2.5)
    plt.title('Correlations between 5 features')
    plt.show()

    # Plot a heatmap of the same subset of features
    cm = np.corrcoef(df[cols].values.T)
    sns.set(font_scale=2.5)
    sns.heatmap(cm,
            cbar = True,
            annot = True,
            square = True,
            fmt = '.2f',
            annot_kws = {'size': 15},
            yticklabels = cols,
            xticklabels = cols)
    plt.show()

    X = df[['RM']].values
    y = df['MEDV'].values

    sc_x = StandardScaler()
    sc_y = StandardScaler()

    X_std = sc_x.fit_transform(X)
    # StandardScaler works on 2-D input: scale y as a single column and
    # flatten it back to the 1-D shape used below.
    y_std = sc_y.fit_transform(y[:, np.newaxis]).flatten()

    lr = LinearRegressionGD()
    lr.fit(X_std, y_std)

    plt.plot(range(1, lr.n_iter + 1), lr.cost_)
    plt.ylabel('SSE')
    plt.xlabel('Epoch')
    plt.show()

    lin_regplot(X_std, y_std, lr)
    plt.xlabel('Average number of rooms [RM] (standardized)')
    plt.ylabel('Price in $1000\'s [MEDV] (standardized)')
    plt.show()

    # Example prediction for a house with 5 rooms; the scalers again need
    # 2-D input (one sample, one feature).
    num_rooms_std = sc_x.transform(np.array([[5.0]]))
    price_std = lr.predict(num_rooms_std)
    price = sc_y.inverse_transform(np.reshape(price_std, (-1, 1)))
    print("Price in $1000's: %.3f" % price[0, 0])

Example #19

0

Show file

File: general_utils.py Project: ian-whitestone/studies

def seaborn_plot(df,plot_type='pairplot',columns=False):
	"""Show a seaborn overview plot of ``df``.

	df -- DataFrame to plot
	plot_type -- 'pairplot' for a scatter plot matrix, 'corr_plot' for a
	             heat map of the column correlations; any other value
	             shows an empty figure
	columns -- optional list of column names to restrict the plot to
	"""
	# Local import: the ``seaborn.plt`` alias used previously no longer
	# exists in current seaborn releases.
	import matplotlib.pyplot as plt

	sns.set()
	mpl.rc("figure", figsize=(16, 8.65))
	plotting_df=(df[columns] if columns else df)
	if plot_type=='pairplot':
		sns.pairplot(plotting_df)
	elif plot_type=='corr_plot':
		# ``seaborn.corrplot`` was dropped in favour of ``heatmap`` applied
		# to the correlation matrix.
		sns.heatmap(plotting_df.corr(), annot=True, square=True)
	plt.show()
	return

Example #20

0

Show file

File: plotting.py Project: tomhettinger/clustering

def pairplot(df, group="group"):
    """Save a pair plot of age, weight, heart rate and height.

    Points are coloured by the ``group`` column. The PNG is written below
    FIG_PATH as 'pairplot.png' for the default grouping and as
    'pairplot_<group>.png' otherwise.
    """
    features = ['age', 'weight', 'heartrate', 'height']
    sns.pairplot(data=df.drop('id', axis=1),
                 vars=features,
                 hue=group,
                 diag_kind='kde',
                 size=5,
                 diag_kws={'shade': True, 'linewidth': 2},
                 plot_kws={'s': 50})

    filename = 'pairplot.png' if group == "group" else 'pairplot_%s.png' % group
    plt.savefig(os.path.join(FIG_PATH, filename), dpi=100)

Example #21

0

Show file

File: house_price_predict_v1.py Project: fzhurd/fzwork

def main():
    """Load the training data and show a pair plot coloured by SalePrice.

    Prints the first five rows, fills missing numeric values with the
    column means and pair-plots three columns with 'SalePrice' as hue.
    """
    # Local import: the ``seaborn.plt`` alias used previously no longer
    # exists in current seaborn releases.
    import matplotlib.pyplot as plt

    train=load_data('../input/train.csv')
    # print() with a single argument behaves the same on Python 2 and 3.
    print(train.head(5))

    # fillna() returns a new frame; without the assignment the filled data
    # was discarded. Means are taken over the numeric columns only.
    train = train.fillna(train.mean(numeric_only=True))

    sns.set()
    # hue must name a column of the plotted frame, so 'SalePrice' is part of
    # the selection instead of being passed as a separate DataFrame.
    sns.pairplot(train[["MSSubClass", "MSZoning", "LotFrontage", "SalePrice"]], hue="SalePrice")
    plt.show()

Example #22

0

Show file

File: vis.py Project: aweinstein/hrv_altitude

def pair_plot(metrics):
    """Pair-plot the SDNN, RMSSD, peak_HF and power_LFHF columns of
    ``metrics``, coloured by the 'height' column."""
    sns.pairplot(
        metrics,
        vars=['SDNN', 'RMSSD', 'peak_HF', 'power_LFHF'],
        hue='height',
    )

Example #23

0

Show file

File: model_inference.py Project: keeganhines/EvoPhys

	def correlation_plot(self, logarithmic=True):
		"""Plot pairwise parameter correlations as a scatter plot matrix.

		With ``logarithmic`` (the default) the log10 of all posterior sample
		columns except the last is plotted, with rows containing missing
		values dropped: density contours below the diagonal, scatter points
		above it and density curves on it. Otherwise the raw samples go
		into a plain pair plot. Raises Exception when sampling has not
		finished.
		"""
		if not self.sampling_finished:
			raise Exception("Must run .sample() before any output results can be viewed.")

		if logarithmic:
			log_samples = np.log10(self.posterior_samples.iloc[:,:-1]).dropna()
			grid = sns.PairGrid(log_samples, diag_sharey=False)
			grid.map_lower(sns.kdeplot, cmap="Blues_d")
			grid.map_upper(plt.scatter, alpha=.1)
			grid.map_diag(sns.kdeplot, lw=3)
			return

		sns.pairplot(self.posterior_samples)

Example #24

0

Show file

File: corpus.py Project: Honlan/ai4law

	def distribution(self):
		"""Plot the distributions of frequency, degree of aggregation and
		degree of freedom.

		Collects the 'freq', 'doa' and 'dof' entries of every value in
		``self.result`` into a DataFrame and shows its pair plot.
		"""
		import matplotlib
		# Select the backend before pyplot is imported; on older matplotlib
		# versions a later call has no effect.
		matplotlib.use('TkAgg')
		import matplotlib.pyplot as plt
		import pandas as pd
		import seaborn as sns
		sns.set(style="white", color_codes=True)

		rows = [[value['freq'], value['doa'], value['dof']]
			for value in self.result.values()]
		df = pd.DataFrame(data=rows, columns=['frequency', 'doa', 'dof'])
		sns.pairplot(df)
		plt.show()

Example #25

0

Show file

File: viz.py Project: houstondatavis/data-jam-april-2016

def plot_by_neighborhood(data, geojson_file):
    """Map and pair-plot per-neighborhood request statistics.

    ``data`` is grouped by 'NEIGHBORHOOD'; the group means and sizes are
    joined onto the shapes read from ``geojson_file`` (indexed by 'alias').
    Each of the three derived columns is drawn on its own axes, followed by
    a pair plot of the three columns.
    """
    by_neighborhood = data.groupby('NEIGHBORHOOD')

    shapes = gp.read_file(geojson_file).dropna()
    geonbr = shapes.set_index('alias').join(by_neighborhood.mean())
    geonbr['# of Requests'] = by_neighborhood.size()
    geonbr['Income ($k)'] = geonbr['Median_HHI'] / 1000.

    cols = ['# of Requests', 'time/SLA ratio', 'Income ($k)']
    _, axes = axes_grid(len(cols))
    for ax, col in zip(axes.flat, cols):
        geonbr.plot(column=col, axes=ax)
        ax.set_title(col)

    seaborn.pairplot(geonbr[cols])

Example #26

0

Show file

File: classify_ghosts_v4h.py Project: fzhurd/fzwork

def main():
    """Engineer features for the train/test data, classify and save the result.

    Reads both CSV files, draws a pair plot of the raw training features by
    'type', adds three interaction features to both frames, drops the 'id'
    and 'color' columns, one-hot encodes what remains and hands everything
    to ``run_classifier`` (with 'rf') and ``save_result``.
    """
    df_train = pd.read_csv('../input/train.csv')
    df_test = pd.read_csv('../input/test.csv')

    sns.set()
    raw_features = ["bone_length", "rotting_flesh", "hair_length", "has_soul", "type"]
    sns.pairplot(df_train[raw_features], hue="type")

    # Interaction features, added identically to the training and test frame.
    for frame in (df_train, df_test):
        frame['hair_soul'] = frame['hair_length'] * frame['has_soul']
        frame['hair_bone'] = frame['hair_length'] * frame['bone_length']
        frame['hair_soul_bone'] = frame['hair_length'] * frame['has_soul'] * frame['bone_length']

    # Keep the test ids for the result file before the column is dropped.
    test_id = df_test['id']
    for column in ('id', 'color'):
        df_train.drop([column], axis=1, inplace=True)
        df_test.drop([column], axis=1, inplace=True)

    y_train = df_train['type']
    X_train = pd.get_dummies(df_train.drop('type', axis=1))
    df_test_data = pd.get_dummies(df_test)

    test_results = run_classifier(X_train, y_train, df_test_data, 'rf')
    save_result(test_id, test_results, 'results_logistic_regression.csv')

Example #27

0

Show file

File: beam_source.py Project: b-r-oleary/acme

 def histogram(self,x=None, y=None, l=None, t=None, **kwargs):
     """
     Shortcut for several histogram-style plots of the table returned by
     ``self.to_dataframe`` for beamline location ``l`` or time ``t``.

     - neither x nor y: pair plot of the whole table, with the x tick
       labels rotated by 90 degrees
     - only x: 1d distribution plot of that parameter
     - x and y: joint plot of the two parameters
     - only y: nothing is drawn

     Extra keyword arguments are forwarded to the seaborn call.
     """
     table = self.to_dataframe(l=l, t=t, latex=True)

     if x is None:
         if y is None:
             grid = sns.pairplot(table, **kwargs)
             for ax in grid.axes.flat:
                 plt.setp(ax.xaxis.get_majorticklabels(), rotation=90)
         return

     x = self._reformat_label(x)
     if y is None:
         sns.distplot(table[x], **kwargs)
         plt.xlabel(x)
         return

     y = self._reformat_label(y)
     sns.jointplot(x=x, y=y, data=table, **kwargs)

Example #28

0

Show file

def plot_shapes(summary, save=False, condition='Condition', context='notebook'):
    """Pair-plot the scanned area/volume per step and the mean shape columns.

    Points are coloured by the ``condition`` column. The figure is saved
    under a name built from all condition labels when ``save`` is true and
    is shown otherwise.
    """
    shape_columns = ['Scan. Area/Step', 'Scan. Vol./Step',
                     'Mean Surface Area (µm2)', 'Mean Volume (µm3)',
                     'Mean Sphericity']
    shape_columns.append(condition)

    sns.set(style='white', context=context)
    sns.pairplot(summary[shape_columns], hue=condition, diag_kind='kde')
    plt.tight_layout()

    if not save:
        plt.show()
        return

    # File name: all condition labels (with '= ' stripped) joined by '-'.
    labels = [value.replace('= ', '')
              for value in summary[condition].unique()]
    plt.savefig('Shapes_' + '-'.join(labels), dpi=300)

Example #29

0

Show file

File: visualization.py Project: maxdatamax/Default-Credit-Card-Prediction

def visualize_hist_pairplot(X,y,selected_feature1,selected_feature2,features,diag_kind):
	"""
	Save a pair plot of two selected features, coloured by the "Y" column.

	Keyword arguments:
	X -- The feature vectors
	y -- The target vector, appended to X as the last column
	selected_feature1 -- First feature to plot
	selected_feature2 -- Second feature to plot
	features -- Column names for the joined X/y data
	diag_kind -- Kind of plot on the diagonal, passed on to seaborn
	"""
	# Features and target side by side in a single data frame.
	df = pd.DataFrame(data=np.column_stack((X, y)), columns=features)

	palette = sea.hls_palette()
	class_colors = {0: palette[2], 1: palette[0]}
	splot = sea.pairplot(
		df,
		hue="Y",
		palette=class_colors,
		vars=[selected_feature1, selected_feature2],
		diag_kind=diag_kind,
	)
	splot.fig.suptitle('Pairwise relationship: ' + selected_feature1 + " vs " + selected_feature2)
	splot.set(xticklabels=[])

	# Hand the output directory and target path to save_fig.
	output_dir = "img"
	target = '{}/{}_{}_hist_pairplot.png'.format(output_dir, selected_feature1, selected_feature2)
	save_fig(output_dir, target)

Example #30

0

Show file

File: linear_regression.py Project: samanmunikar/Predictive-Analysis

# See the distribution of the data: numeric columns first ...
sns.distplot(data['charges'],ax= ax[0,0])
sns.distplot(data['age'],ax=ax[0,1])
sns.distplot(data['bmi'],ax= ax[1,0])
sns.distplot(data['children'],ax= ax[1,1])

# ... then the counts of the categorical columns.
sns.countplot(data['sex'],ax=ax[2,0])
sns.countplot(data['smoker'],ax= ax[2,1])
sns.countplot(data['region'],ax= ax[3,0])

# Visualizing skewness
sns.pairplot(data)

# Let's look at smokers vs non-smokers on age vs charges:
sns.lmplot(x="age", y="charges", hue="smoker", data=data, palette = 'muted', height = 7)
# plt.show() takes no figure/module argument; the seaborn module that was
# passed before was interpreted as the ``block`` flag.
plt.show()

# Let's look at correlation:
corr = data.corr()

sns.heatmap(corr, cmap = 'Wistia', annot= True)
plt.show()

############################################01_04_ConvertCategoricalDataintoNumbers##############################################
#option0: pandas factorizing: maps each category to a different integer = label encoder

Example #31

0

Show file

File: chapter05.py Project: cmarsa/pydshb_notes

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# chapter05.py

#%%
# Load the iris example data set and preview its first rows.
import seaborn as sns
iris = sns.load_dataset('iris')
iris.head()

#%%
# Pair plot of all columns, coloured by species.
sns.set()
sns.pairplot(iris, hue='species', height=1.5)

#%%
# Feature matrix: every column except 'species'.
X_iris = iris.drop('species', axis=1)
X_iris.shape

#%%
# Target: the 'species' column.
y_iris = iris['species']
y_iris.shape

#%%
# Supervised learning example: Simple linear regression
import matplotlib.pyplot as plt
import numpy as np

# 50 points scattered around the line y = 2x - 1; the fixed seed makes the
# sample reproducible.
plt.figure()
rng = np.random.RandomState(42)
x = 10 * rng.rand(50)
y = 2 * x - 1 + rng.randn(50)
plt.scatter(x, y)

Example #32

0

Show file

File: LinearRegression.py Project: Vineeta12345/Simple-Linear-Regression

@author: Vineeta
"""
import tkinter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Load the sales data and inspect it; outside an interactive session the
# three bare expressions below have no visible effect.
data=pd.read_csv("sales_data.csv",encoding='latin1')
data.shape
data.describe()
data.isnull().sum()
# Drop every column that contains a missing value.
data = data.dropna(axis=1)
print(data)
# One scatter panel per predictor, each against SALES.
sns.pairplot(data,x_vars=['QUANTITYORDERED','PRICEEACH','MSRP'], y_vars='SALES',height=4,
            aspect=1,kind='scatter')
plt.show()
# NOTE(review): pandas and tkinter are already imported at the top of the
# file; the repeated imports below are redundant.
import pandas as pd
from sklearn import linear_model
# Fit an ordinary least-squares model of SALES on the three predictors.
X = data[['QUANTITYORDERED','PRICEEACH','MSRP']]
Y = data['SALES']
regr = linear_model.LinearRegression()
regr.fit(X, Y)
# Predict SALES for a single hand-picked observation.
QUANTITYORDERED = 56
PRICEEACH = 93.2
MSRP=150
print ('Predicted SalesPrice: \n', regr.predict([[QUANTITYORDERED,PRICEEACH,MSRP]]))
import tkinter as tk 
# Create the Tk root window with a 500x300 canvas.
root = tkinter.Tk()
canvas1 = tk.Canvas(root, width = 500, height = 300)
canvas1.pack()

Example #33

0

Show file

File: visualization.py Project: hombre66/Bayesian_Analysis

    DATA = load(fn)
    COST_MODEL, TRACE = DATA['model'], DATA['trace']

########################
# Model visualization
########################

# Plot cost model KDE (ln x scale): density of the log of Y against the
# density of the log of Y_, saved as a PNG.
FIG2, _ = plt.subplots(1, 1, figsize=(13, 6))
plt.title(f'KDE Workstaion Cost {MODEL_PREFIX} Observation versus Model',
          fontsize=16)
kdeplot(log(Y), label='Observation')
kdeplot(log(Y_), label='Model')
plt.xlabel('Wrks Cost Ln()', fontsize=16)
plt.ylabel('Density', fontsize=16)
FIG2.savefig(f'Cost_model_KDE_{MODEL_PREFIX}_{F_BASENAME}.png')

# Plot TIERS visualization relationships: one pair plot per tier, built from
# the log of the ATT, MEASURE and 'model_cost' columns plus the untransformed
# tier column, which serves as hue.
for tier in TIERS:
    cols = list(ATT) + [MEASURE, 'model_cost']
    ppp = log(PP[cols]).copy()
    ppp = concat([ppp, PP[tier]], axis=1)
    tvrf_name = f'Visualizing relationships-{MODEL_PREFIX}-{tier}-{F_BASENAME}.png'
    pairplot(ppp, hue=tier, height=3, kind='scatter').savefig(tvrf_name)

# Pair plot without hue.
# NOTE(review): this reuses ``ppp`` from the last loop iteration, which still
# contains that tier's column, and raises NameError when TIERS is empty -
# confirm this is intended.
pairplot(ppp, height=3, kind='scatter', diag_kind='kde').savefig(
    f'Visualizing relationships-{MODEL_PREFIX}-{F_BASENAME}.png')

SUMMARY = df_summary(TRACE)
print(SUMMARY)

Example #34

0

Show file

File: 7107029022_925_RAhomework.py Project: jennychiou/Class_RA

# In[3]:


# Descriptive analysis: summary statistics of the four columns of interest.
s = data[['temperature','pressure', 'windspeed','electricity_consumption']]
s.describe()


# In[4]:


# Scatter plots: pair plot of the same four columns.
sns.set(style='whitegrid', context='notebook')
cols = ['temperature', 'pressure', 'windspeed', 'electricity_consumption']
sns.pairplot(data[cols], size=2.5);
plt.tight_layout()
plt.show()


# In[5]:


# Classify by electricity consumption.
def get_consumption_category(wt):
    """Return the consumption band label for ``wt``.

    Each band includes its lower bound, so values of exactly 200 and 400 get
    a label instead of falling through every branch. Values that match no
    branch below (600 and above, or NaN) yield None.
    """
    if wt < 200:
        return "<200kWh"
    elif wt < 400:
        return "200kWh~400kWh"
    elif wt < 600:
        return "400kWh~600kWh"

Example #35

0

Show file

    })

# Fixed colour per school, so the hue colours do not depend on the order in
# which the schools appear in the data.
school_palette = {
    "Most cited analytical": "tab:red",
    "Anti-teorethical": "tab:blue",
    "Master": "tab:green",
    "Cavell": "tab:orange",
}

#%%
# One row of three panels: each ratio column on the x axis against the
# author on the y axis, coloured by school; no diagonal plots.
g = sns.pairplot(
    df,
    x_vars=["parentheses_ratio", "dot_parentheses_ratio", "r2"],
    y_vars=["author"],
    hue="school",
    height=5,
    aspect=0.6,
    diag_kind=None,
    markers=["H", "s", "o", "D"],
    plot_kws={"s": 50},
    palette=school_palette,
)
# Replace the automatically created legend with a framed one in a fixed
# position and label order.
# NOTE(review): ``_legend`` is a private attribute of the grid object -
# confirm it exists in the installed seaborn version.
g._legend.remove()
g.add_legend(
    bbox_to_anchor=(0.37, 0.24),
    frameon=True,
    label_order=[
        "Cavell", "Master", "Anti-teorethical", "Most cited analytical"
    ],
)
g._legend.set_title(None)
# Clear the y-axis label of the first panel.
g.axes.flatten()[0].set_ylabel("")

Example #36

0

Show file

File: Data_visualization.py Project: fchsieh/NCTU-CS-assignments

### Import and Global vars ###

# Commonly used module
import numpy as np
import pandas as pd

# For data visualization
import matplotlib.pyplot as plt
import seaborn as sns

iris_data = pd.read_csv("./preprocessed_iris.csv")

# In[2]:

# Pair plot of the iris data, coloured by class, with density curves on the
# diagonal; saved as a PNG.
g = sns.pairplot(iris_data, hue="class", diag_kind="kde")
g.savefig('iris_output.png')

# In[3]:

# Read only the six listed columns of the Google Play data.
google_data = pd.read_csv("./preprocessed_googleplaystore.csv",
                          usecols=[
                              "Category", "Rating", "Reviews", "Installs",
                              "Price", "Last Updated"
                          ])
# Map every distinct category to an integer code, in order of appearance.
cat_list = list(google_data["Category"].unique())
replace_list = list(range(0, len(cat_list)))
mymap = dict(zip(cat_list, replace_list))

# applymap visits every cell of the frame, so a cell in any column whose
# value equals a category name is replaced by that category's code.
google_data = google_data.applymap(lambda s: mymap.get(s) if s in mymap else s)

Example #37

0

Show file

File: 02_1_iris2D.py Project: garretaserra/smartCitiesScript

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Load iris dataset
# Raw string: the backslashes are path separators, not escape sequences
# ("\d" and "\i" are invalid escapes and trigger a warning on newer Python
# versions). The resulting path is unchanged.
iris = pd.read_csv(r"..\datasets\iris.csv")
iris['variety'] = iris['variety'].astype('category')

# EDA: column overview, class sizes and summary statistics, then a pair plot
# and a petal length/width scatter (no regression line), both by variety.
print(iris.info())
print(iris.groupby('variety').size())
print(iris.describe(include='all'))
sns.pairplot(iris, hue="variety")
sns.lmplot(x='petal.length',
           y='petal.width',
           data=iris,
           hue="variety",
           fit_reg=False)

Example #38

0

Show file

File: deep_leaning_sklearn_diabetes.py Project: Josafa01/machine_learning

# Notebook-style inspection of ``base``: the bare expressions below only
# display something when run cell by cell.
base.shape

# True if any cell is missing.
base.isnull().values.any()

base

base.info()

base.describe()

# Number of rows per 'Outcome' value.
sns.countplot(x = 'Outcome', data = base);

# Histogram of every column.
base.hist(figsize=(20,12));

# Pair plot of the eight listed columns, coloured by 'Outcome'.
sns.pairplot(base, hue = 'Outcome', 
             vars = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']);

# Annotated heat map of the column correlations.
sns.heatmap(base.corr(), annot = True);

# Columns at positions 0-7 form X ...
X = base.iloc[:, 0:8].values

X

# ... and the column at position 8 forms y.
y = base.iloc[:, 8].values

y

# Standardize X (zero mean, unit variance per column).
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)

Example #39

0

Show file

File: navodit_jain_311298_code_3.py Project: ninja3697/ZS-Data-Science-Challange-2019-Rank-54-Solution

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
"""Loading the dataset"""

# Read the CSV, using its first column as the index.
raw_data = pd.read_csv('drive/My Drive/data.csv', index_col=0)
raw_data.head()
"""**Exploratory Data Analysis**"""

# Count the missing values in each column
raw_data.isna().sum()
"""Bivariate Analysis"""

# Scatter-plot matrix coloured by the 'is_goal' column
sns.pairplot(data=raw_data, hue='is_goal')
plt.show()
"""Missing values treatment"""

raw_data['team_id'].value_counts()

# Remove, in place, the columns which do not contribute to predictions
raw_data.drop(
    columns=[
        'match_event_id',
        'team_name',
        'date_of_game',
        'lat/lng',
        'team_id',
    ],
    inplace=True,
)

raw_data.shape

# Filling missing shot_id_numbers

Example #40

0

Show file

File: auto_efficency_regression.py Project: wupengbo125/penter

# Handle the categorical data: the Origin column holds the categories 1, 2, 3,
# which stand for the place of origin: USA, Europe, Japan.
# Pop that column out of the dataset.
origin = dataset.pop('Origin')
# Write one new 0.0/1.0 indicator column per origin value.
dataset['USA'] = (origin == 1) * 1.0
dataset['Europe'] = (origin == 2) * 1.0
dataset['Japan'] = (origin == 3) * 1.0
dataset.tail()

# Split into a training set (80 % sample, fixed seed) and a test set (the rest).
train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(train_dataset.index)

# %% Summary statistics
sns.pairplot(train_dataset[["Cylinders", "Displacement", "Weight", "MPG"]],
             diag_kind="kde")
# %%
# Inspect the summary statistics of the training inputs X.
# MPG is removed from the statistics, then the table is transposed so that
# each remaining column becomes a row.
train_stats = train_dataset.describe()
train_stats.pop("MPG")
train_stats = train_stats.transpose()
train_stats

# Move the MPG (fuel efficiency) column out as the ground-truth label Y.
train_labels = train_dataset.pop('MPG')
test_labels = test_dataset.pop('MPG')


# Standardise the data
def norm(x):
    """Return *x* standardised with the training-set statistics.

    Subtracts ``train_stats['mean']`` and divides the result by
    ``train_stats['std']``; ``train_stats`` is read from module scope.
    """
    centered = x - train_stats['mean']
    return centered / train_stats['std']

Example #41

0

Show file

# Raw observations, the fitted OLS values, and the two dashed red curves
# iv_u / iv_l.
ax.plot(x, y, 'o', label="data", markersize=1)
ax.plot(x, slm.fittedvalues, 'b--.', label="OLS")
ax.plot(x, iv_u, 'r--')
ax.plot(x, iv_l, 'r--')
ax.set_xlim(0, 50000)
ax.legend(loc='best')

# We do the same thing for the other variables, this time using seaborn.

# In[17]:

import seaborn as sns

# One regression panel per predictor, each plotted against 'SalePrice'.
sns.pairplot(
    data=csv_data,
    x_vars=['LotArea', 'GrLivArea', 'YearBuilt', 'FullBath', '2ndFlrSF'],
    y_vars='SalePrice',
    kind='reg',
    height=7,
    aspect=0.7,
)

# This is the same last few steps for other variables. The difference between SST and SSE is the improvement in prediction from the regression model, compared to the mean model. Dividing that difference by SST gives R-squared. It is the proportional improvement in prediction from the regression model, compared to the mean model. It indicates the goodness of fit of the model. R-squared has the useful property that its scale is intuitive: it ranges from zero to one, with zero indicating that the proposed model does not improve prediction over the mean model, and one indicating perfect prediction. Improvement in the regression model results in proportional increases in R-squared. One pitfall of R-squared is that it can only increase as predictors are added to the regression model. This increase is artificial when predictors are not actually improving the model’s fit. To remedy this, a related statistic, Adjusted R-squared, incorporates the model’s degrees of freedom. Adjusted R-squared will decrease as predictors are added if the increase in model fit does not make up for the loss of degrees of freedom. Likewise, it will increase as predictors are added if the increase in model fit is worthwhile. Adjusted R-squared should always be used with models with more than one predictor variable. It is interpreted as the proportion of total variance that is explained by the model. There are situations in which a high R-squared is not necessary or relevant. When the interest is in the relationship between variables, not in prediction, the R-square is less important. An example is a study on how religiosity affects health outcomes. A good result is a reliable relationship between religiosity and health. No one would expect that religion explains a high percentage of the variation in health, as health is affected by many other factors. 
# Even if the model accounts for other variables known to affect health, such as income and age, an R-squared in the range of 0.10 to 0.15 is reasonable.

# ## Multi-Variable Linear Regression
#
# In this part we try to fit the regression line using 6 variables. Our candidates are 'LotArea', 'GrLivArea', 'LotFrontage', '2ndFlrSF', 'YearBuilt', 'FullBath'. These are selected using their correlation with the 'SalePrice' variable. Variables with higher correlation would probably be more suitable to use for the regression problem. The following code will plot the regression line for each of these variables against 'SalePrice'.
#

# In[27]:

# using statistic model for variables
x_tot = csv_data[[

Example #42

0

Show file

File: MineralVisualFinal.py Project: Freeverc/MineralVisualApp

    def draw_func(self):
        """Draw the chart selected by ``self.figure_type`` and show the canvas.

        A message is shown through ``self.show_message`` and nothing is drawn
        when ``self.all_data`` or ``self.info_data`` has no rows, when
        ``self.region_linked`` is false, or when ``self.figure_able`` is
        false. Otherwise ``self.clear_func()`` and ``self.cur_slice()`` are
        called, the chart is drawn from ``self.cur_data``, the table view is
        hidden and replaced by ``self.canvas`` in ``self.figure_layout``, and
        ``self.figure_state`` is set to 2.
        """
        # Guard clauses: each failed precondition shows a (Chinese) message.
        if len(self.all_data.index) == 0:
            self.show_message('请导入采样数据')
        elif len(self.info_data.index) == 0:
            self.show_message('请导入采样信息')
        elif not self.region_linked:
            self.show_message("请点击链接")
        elif not self.figure_able:
            self.show_message("数据包含非数值类型，不可画图！")
        else:
            self.clear_func()
            self.cur_slice()
            # Use a font that can render Chinese characters
            plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
            # plt.rcParams['font.sans-serif']=['SimHei'] # set the default font; SimHei is a Hei (sans-serif) typeface
            # Make the minus sign render correctly
            plt.rcParams['axes.unicode_minus'] = False
            plt.title(self.figure_type)
            # Branch on the chart type; the first one is principal component
            # analysis.
            if self.figure_type == "主成分分析":
                # Column 0 holds the region label of every row.
                region_data = self.cur_data.iloc[:, 0].values.tolist()
                print(region_data)
                regions = list(set(region_data))
                print(regions)
                # NOTE(review): region_color is printed but never passed to a
                # plotting call below.
                region_color = [(int(regions.index(i) * 255 / len(regions))) for i in region_data]
                # region_color = [regions.index[i] for i in region_data]
                print(region_color)
                # Remaining columns are the values; centre each column on zero.
                data = self.cur_data.iloc[:, 1:].values
                data = data - np.mean(data, axis=0)
                print("data",data.shape)
                cov_mat = np.cov(data, rowvar=0)
                print("cov:", cov_mat.shape)

                # cov_mat is wrapped in np.mat, so `*` below is presumably a
                # matrix product rather than element-wise - TODO confirm.
                eig_vals, eig_vects = np.linalg.eig(np.mat(cov_mat))
                low_data_mat = data * eig_vects
                print("low:", low_data_mat.shape)
                # NOTE(review): eig_val_indice is computed but not used; the
                # columns plotted below are the first two in the order
                # np.linalg.eig returned them, which is not guaranteed to be
                # sorted by eigenvalue - confirm this is intended.
                eig_val_indice = np.argsort(eig_vals)

                top = 2
                n_eig_val_indice = range(top)
                print("n_eig_val_indice", n_eig_val_indice)
                n_eig_vects = eig_vects[:, n_eig_val_indice]
                print("n_eig:",n_eig_vects.shape)
                # NOTE(review): recon_mat is only printed; `data` is already
                # centred here, so the mean added back is the mean of the
                # centred data.
                recon_mat = (low_data_mat * eig_vects) + np.mean(data, axis=0)
                print("rec:", recon_mat.shape)
                # The first two projected columns become the scatter axes.
                x = np.array(low_data_mat)[:, 0]
                y = np.array(low_data_mat)[:, 1]
                # z = np.array(low_data_mat)[:, 2]
                # One scatter call per region so each region gets a legend entry.
                for region in regions:
                    index = [i for i, data in enumerate(region_data) if data == region]
                    plt.scatter(x[index], y[index])
                plt.legend(regions)
            # Parallel-coordinates plot
            elif self.figure_type == '平行坐标图':
                parallel_coordinates(self.cur_data, self.region_method)
            # Andrews curves with a fixed four-colour list
            elif self.figure_type == "Andrews图":
                colors = ['b', 'g', 'r', 'orange']
                andrews_curves(self.cur_data, self.region_method, color=colors)
            # RadViz plot
            elif self.figure_type == 'Radiv图':
                radviz(self.cur_data, self.region_method)
            # Scatter-plot matrix
            elif self.figure_type == '矩阵散点图':
                print("绘制矩阵散点图")
                sns.pairplot(data=self.cur_data, hue=self.region_method)
                # pairplot creates its own figure, so the canvas is rebuilt
                # around the current figure; this is the only branch that
                # reassigns self.ax and self.canvas.
                f = plt.gcf()
                self.ax = f
                self.canvas = FigureCanvas(f)
            # Chernoff faces
            elif self.figure_type == 'Chernoff脸谱图':
                # The data is written to disk and an external script is run;
                # NOTE(review): presumably ./PyToR.py produces face_info.csv
                # and face.png, which are read below - verify.
                self.cur_data.to_excel('cur_data.xlsx')
                print("data out")
                # goto_r()
                os.system("python ./PyToR.py")
                face_info = pd.read_csv('face_info.csv')
                # f_str = face_info.to_string()

                font = {'weight': 'normal',
                         'size': 11,
                         }

                # Header line, then one text row per face_info record: its
                # first field followed by " : ", and its second field.
                plt.text(500, 0 , "脸谱图条目                 数据列", fontdict=font)
                for index, row in face_info.iterrows():
                    f_str = row[0] + " : "
                    plt.text(500, 20 + 20 * index, f_str, fontdict=font)
                    f_str = row[1]
                    plt.text(650, 30 + 20 * index, f_str, fontdict=font)
                plt.imshow(Image.open('face.png'))
                # Unfilled white-edged rectangle drawn at (500, 20).
                plt.gca().add_patch(plt.Rectangle(xy=(500, 20), width=100, height=300,
                                                  edgecolor=[1, 1, 1],
                                                  fill=False,
                                                  linewidth=2))
                # print("file is named: face.jpg")
                # info=pd.read_csv('face_info.csv',encoding='GBK')
                # print("effect of variables:\n{}".format(info))

            # Swap the table view for the canvas and redraw it.
            self.table_view.setVisible(False)
            self.canvas.setVisible(True)
            self.figure_layout.removeWidget(self.table_view)
            self.figure_layout.addWidget(self.canvas)
            self.canvas.draw()
            self.figure_state = 2

Example #43

0

Show file

# Reviews plotted for each value of the "Rated 4.4 or more" column.
g = sns.catplot(x="Rated 4.4 or more", y="Reviews", data=df)
g.savefig('rated4.4ormore-reviews.png')

# The indicator column is dropped before the correlations are computed.
df = df.drop('Rated 4.4 or more', axis=1)
corr = df.corr()

# Generate a mask for the upper triangle.
# The builtin ``bool`` is used because the ``np.bool`` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
g = sns.heatmap(corr,
                cmap=cmap,
                mask=mask,
                annot=True,
                vmax=.3,
                center=0,
                square=True,
                linewidths=.5,
                cbar_kws={"shrink": .5})
g.figure.savefig('heatmap.png')

# Scatter-plot matrix of the remaining columns.
g = sns.pairplot(df)
g.savefig('pairwise.png')

Example #44

0

Show file

    mask[np.triu_indices_from(mask)] = True

    with sns.axes_style("white"):
        sns.heatmap(data=corr, mask=mask, annot=True, vmin=-1, vmax=1)


# In[78]:

plot_heatmap(df)

# In[79]:

cols = ['temp', 'atemp', 'windspeed', 'humidity']

# Regression panels off the diagonal, shaded KDE curves on the diagonal.
pp = sns.pairplot(
    data=df[cols],
    kind="reg",
    diag_kind="kde",
    diag_kws={"shade": True},
)

# Adjust the subplot spacing and add a figure-level title.
fig = pp.fig
fig.subplots_adjust(wspace=0.3, top=0.93)
fig.suptitle(
    'Correlação das variáveis numéricas',
    fontweight='bold',
    fontsize=14,
)

# In[80]:

# Horizontal box plots of the five numeric columns on one figure.
sns.boxplot(
    orient='h',
    data=df[['temp', 'atemp', 'humidity', 'windspeed', 'count']],
)
fig = plt.gcf()
fig.set_size_inches(12, 6)
fig.suptitle('Análise de Outliers', fontweight='bold', fontsize=14)

Example #45

0

Show file

def print_ica_plot(comp, scaled_data):
    """Fit FastICA with *comp* components and pair-plot the transformed data.

    The output of ``fit_transform(scaled_data)`` is wrapped in a DataFrame
    and handed to ``sns.pairplot``. Nothing is returned.
    """
    transformed = FastICA(n_components=comp).fit_transform(scaled_data)
    sns.pairplot(pd.DataFrame(transformed))

Example #46

0

Show file

File: algorithm_liner_regression.py Project: qiaowenfanggithub/smartbi_check

 def visualization(self):
     """
     Describe ``self.table_data`` and build the plots requested in ``self.config``.

     API request parameters
         "tableName": "advertising",  # str, database table name
         "X": ["TV", "radio", "newspaper"],  # list, independent variables; several variable names when the table orientation is h, the categorical-variable field when it is v
         "Y": ["sales"],  # list, dependent variable, used when the table orientation is v
         "show_options": ["y_count", "pairs", "corr", "y_corr"], # display options
         "x_count": [], # list, independent variables whose frequency histogram is shown
         "box": [], # list, independent variables whose box plot is shown
     :return: ``{"res": [...], "code": "200", "msg": "ok!"}`` on success, or
         ``{"data": "", "code": "500", "msg": ...}`` carrying the exception
         args when anything raises.
     """
     try:
         res = []

         def add_plot(title):
             # Append one entry pairing *title* with the string form of
             # whatever self.plot_and_output_base64_png(plt) returns.
             res.append({
                 "title": title,
                 "base64": "{}".format(self.plot_and_output_base64_png(plt))
             })

         # Everything below works on a float copy of the table.
         self.table_data = self.table_data.astype("float")
         summary = self.table_data.describe()
         summary_table = {
             "data": summary.values.tolist(),
             "title": "描述性统计分析",
             "col": summary.columns.tolist(),
             "row": summary.index.tolist()
         }
         res.append(transform_table_data_to_html(summary_table))

         # Histogram per requested independent variable.
         histogram_columns = self.config.get("x_count")
         if histogram_columns and histogram_columns[0]:
             for column in histogram_columns:
                 sns.distplot(self.table_data[column], kde=False)
                 # y-axis label
                 plt.ylabel("frequency")
                 add_plot("{} - 频率分布".format(column))

         show_options = self.config["show_options"]

         # Histogram of the dependent variable.
         if "y_count" in show_options:
             target = self.config["Y"][0]
             sns.distplot(self.table_data[target], kde=False)
             # x-axis and y-axis labels
             plt.xlabel("section")
             plt.ylabel("frequency")
             add_plot("{} - 频率分布".format(target))

         # Box plot per requested independent variable.
         box_columns = self.config.get("box")
         if box_columns and box_columns[0]:
             for column in box_columns:
                 sns.boxplot(self.table_data[column], palette="Set2", orient="v")
                 add_plot("{} - 箱型图".format(column))

         # Scatter-plot matrix of every column.
         if "pairs" in show_options:
             sns.pairplot(self.table_data)
             add_plot("变量两两关系图")

         # Annotated correlation heat map.
         if "corr" in show_options:
             corr = self.table_data.corr()
             sns.heatmap(corr,
                         annot=True,
                         cmap="YlGnBu",
                         linewidths=0.2,
                         xticklabels=corr.columns,
                         yticklabels=corr.columns)
             add_plot("相关系数图")

         # Bar chart of each column's correlation with the dependent
         # variable, largest first.
         if "y_corr" in show_options:
             target_corr = self.table_data.corr()[self.config["Y"][0]]
             target_corr.sort_values(ascending=False).plot(kind='bar')
             add_plot("因变量和各自变量的相关系数图")

         return {"res": res, "code": "200", "msg": "ok!"}
     except Exception as e:
         return {"data": "", "code": "500", "msg": "{}".format(e.args)}

Example #47

0

Show file

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

dataset = pd.read_csv('studentperformance.csv')
# Replace the header with eight short column names.
dataset.columns = [
    'gender', 'race', 'ped', 'lunch', 'test', 'math', 'reading', 'writing'
]

dataset.info()
dataset.describe()

# Scatter matrices from pandas and from seaborn.
pd.plotting.scatter_matrix(dataset)
sns.pairplot(dataset)

# x and y are passed by keyword in every barplot call below: seaborn >= 0.12
# accepts only ``data`` positionally, so the original positional form raises
# TypeError there. The keyword form also works on older seaborn releases.

# Number of rows per gender.
sns.barplot(x=dataset['gender'].value_counts().index,
            y=dataset['gender'].value_counts(),
            hue=['female', 'male'])

# Each score by 'race', split by gender.
sns.barplot(x=dataset['race'], y=dataset['math'], hue=dataset['gender'])

sns.barplot(x=dataset['race'], y=dataset['reading'], hue=dataset['gender'])

sns.barplot(x=dataset['race'], y=dataset['writing'], hue=dataset['gender'])

# Each score by 'ped', split by gender.
sns.barplot(x=dataset['ped'], y=dataset['math'], hue=dataset['gender'])

sns.barplot(x=dataset['ped'], y=dataset['reading'], hue=dataset['gender'])

sns.barplot(x=dataset['ped'], y=dataset['writing'], hue=dataset['gender'])

Example #48

0

Show file

File: trees_dataset.py Project: vgangadhar/scikit-learn-mooc

target_column = "Species"

# %% [markdown]
# Let's look at the dataset in more detail.

# %%
penguins.head()

# %% [markdown]
# Since we have only a few samples, we can use a scatter plot to observe how
# the samples are distributed.

# %%
import seaborn as sns

# Scatter-plot matrix coloured by species, then resized to 9 x 6.5 inches.
pairplot_figure = sns.pairplot(penguins, hue="Species")
pairplot_figure.fig.set_size_inches(9, 6.5)

# %% [markdown]
# First let's check the feature distributions by looking at the diagonal plots
# of the pairplot. We can deduce the following intuitions:
#
# * The Adelie species can be differentiated from the Gentoo and Chinstrap
#   species depending on the culmen length;
# * The Gentoo species can be differentiated from the Adelie and Chinstrap
#   species depending on the culmen depth.
#
# ## Regression dataset
#
# In a regression setting, the target is a continuous variable instead of
# categories. Here, we use two features of the dataset to make such a problem:

Example #49

0

Show file

File: PhotoDataAnalysis_main.py Project: prettypop/PhotoData

def main():
    """Load EXIF data for the configured photo directories and chart it.

    Settings are read from ``PhotoDataAnalysis.ini`` in the current working
    directory (one ``Key=Value`` per line; blank lines and lines starting
    with ``#`` are ignored). EXIF rows are collected with ``GetExifData``,
    loaded into a DataFrame and written, together with every bar-chart pivot
    table, to an Excel workbook in a new ``./Data/<timestamp>/`` directory.
    The chart families enabled in the INI file (bar, scatter, hexbin, pie,
    seaborn) are saved as PNG files in the same directory.

    Boolean INI values are compared case-insensitively against ``"true"``.

    Raises:
        ValueError: if a non-comment INI line has no ``=`` or a numeric
            setting is not an integer.
        OSError: if the INI file cannot be read or the output directory
            cannot be created.
    """

    def _is_true(text):
        # One rule for every boolean key: case-insensitive "true".
        return text.lower() == "true"

    # NOTE(review): this instance is not used again in this function; the
    # call is kept in case constructing it has side effects.
    photoData = PhotoData.Photo()

    lists = []
    header = []
    cols = []

    # Set initial data
    photoDirectories = []
    ignoreCameraModel = []
    fillEmptyLensModel = {
        "DSC-RX100M3": "No Lens Data",
    }  # sample
    PlotBarRotate = {}

    # Set plot initial data
    plotFixSizeX = 12
    plotFixSizeY = 8
    plotGrid = False
    plotSubPlots = False
    plotFontSize = 10
    plotRotate = 0

    plotBar = False
    plotScatter = False
    plotHexbin = False
    plotPie = False
    # Defaulted so that an INI file without a "PlotSeaborn" key cannot leave
    # the flag unbound when it is tested further down.
    plotSeaborn = False

    # The context manager closes the INI file even if parsing raises.
    with open('PhotoDataAnalysis.ini', 'r') as ini_file:
        for line in ini_file:
            line = line.strip()
            if line == "" or line[0:1] == "#":
                continue

            item, param = line.split("=", 1)
            if item == "PhotoDirectory":
                photoDirectories.append(param)
            elif item == "IgnoreCameraModel":
                ignoreCameraModel.append(param)
            elif item == "FillEmptyLensModel":
                param1, param2 = param.split(":", 1)
                fillEmptyLensModel[param1] = param2.strip()
            elif item == "PlotFigSizeX":
                plotFixSizeX = int(param)
            elif item == "PlotFigSizeY":
                plotFixSizeY = int(param)
            elif item == "PlotGrid":
                plotGrid = _is_true(param)
            elif item == "PlotSubPlots":
                plotSubPlots = _is_true(param)
            elif item == "PlotsFontSize":
                plotFontSize = param  # default value
            elif item == "PlotBarRotate":
                param1, param2 = param.split(":")
                PlotBarRotate[param1] = int(param2)
                # Stored as int, like the per-chart entries above.
                if param1 == "Default":
                    plotRotate = int(param2)
            elif item == "PlotBar":
                plotBar = _is_true(param)
            elif item == "PlotScatter":
                # The original else-branch assigned to a misspelled
                # "PlotScatter" name and never reset this flag.
                plotScatter = _is_true(param)
            elif item == "PlotHexbin":
                plotHexbin = _is_true(param)
            elif item == "PlotPie":
                plotPie = _is_true(param)
            elif item == "PlotSeaborn":
                plotSeaborn = _is_true(param)

    print("#")
    print("#")
    print("#")
    print("# Load and analyze photo data")
    print("#")
    print("#")
    print("#")

    print("> Load photo data")
    for photoDirectory in photoDirectories:
        lists, header = GetExifData(lists=lists,
                                    ignoreCameraModel=ignoreCameraModel,
                                    fileFullPath=photoDirectory)
        if header != []:
            cols = header

    # Timestamp such as "20240131 235959"; it names the output directory,
    # the workbook and the raw-data sheet.
    now = datetime.datetime.now()
    now_dt = now.strftime("%Y%m%d %H%M%S")

    # Directory where the output data is saved
    os.makedirs("./Data", exist_ok=True)

    dataDir = "./Data/" + now_dt
    os.mkdir(dataDir)

    #
    # Debug aid for when this error appears:
    #   AssertionError: 45 columns passed, passed data had 43 columns
    #
    #for x in lists:
    #    print(str(len(x)) + ":" + x[0] )

    # - - -
    # Set PANDAS
    # - - -
    df = pd.DataFrame(data=lists, columns=cols)
    df = df.applymap(illegal_char_remover)

    # Constant column that the pivot tables below count.
    df["Count"] = 1

    # Set Make to "Apple" for every model whose name starts with "iPhone"
    # (covers rows where Make is blank)
    df["Make"] = df.apply(lambda x: "Apple"
                          if x.Model[0:6] == "iPhone" else x.Make,
                          axis=1)

    # Replace an empty LensModel: first with "Unknown", then with the
    # per-camera value from fillEmptyLensModel when one is configured
    df["LensModel_org"] = df.apply(lambda x: x.LensModel, axis=1)  # Backup
    df["LensModel"] = df.apply(lambda x: "Unknown"
                               if x.LensModel == "" else x.LensModel,
                               axis=1)
    df["LensModel"] = df.apply(lambda x: fillEmptyLensModel[x.Model]
                               if x.Model in fillEmptyLensModel and x.LensModel
                               == "Unknown" else x.LensModel,
                               axis=1)

    # Fill blank numeric EXIF values with 0
    df["ISOSpeedRatings"] = df.apply(
        lambda x: 0 if x.ISOSpeedRatings == "" else x.ISOSpeedRatings, axis=1)
    df["FNumber_cust"] = df.apply(lambda x: 0
                                  if x.FNumber_cust == "" else x.FNumber_cust,
                                  axis=1)
    df["ExposureTime_calc"] = df.apply(
        lambda x: 0 if x.ExposureTime_calc == "" else x.ExposureTime_calc,
        axis=1)
    df["ExposureTime_cust"] = df.apply(
        lambda x: 0 if x.ExposureTime_cust == "" else x.ExposureTime_cust,
        axis=1)
    df["FocalLengthIn35mmFilm"] = df.apply(
        lambda x: 0
        if x.FocalLengthIn35mmFilm == "" else x.FocalLengthIn35mmFilm,
        axis=1)

    df["LightSource_cust"] = df.apply(
        lambda x: "Auto" if x.LightSource_cust == 0 else x.LightSource_cust,
        axis=1)

    # Add year, month, year-month, hour and week columns derived from
    # DateTimeOriginal
    df["Year"] = df.apply(lambda x: GetYear(x.DateTimeOriginal), axis=1)
    df["Month"] = df.apply(lambda x: GetMonth(x.DateTimeOriginal), axis=1)
    df["Year_Month"] = df.apply(lambda x: GetYearMonth(x.DateTimeOriginal),
                                axis=1)
    df["Hour"] = df.apply(lambda x: GetHour(x.DateTimeOriginal), axis=1)
    df["Week"] = df.apply(lambda x: GetWeek(x.DateTimeOriginal), axis=1)

    # List Camera Model
    print("[Camera Model List]")
    pvt_cm = pd.pivot_table(df,
                            values="Count",
                            index=["Make", "Model"],
                            aggfunc=lambda x: len(x))
    print(pvt_cm)

    # - - -
    # Create charts
    # - - -
    print("[Create Charts]")

    # Bar charts: [pivot index, pivot columns]; "" means no column split
    plots_bar = [
        ["FNumber_cust", ["Make", "Model"]],
        ["FNumber_cust", ["Make", "Model", "LensModel"]],
        ["FocalLengthIn35mmFilm", ["Make", "Model"]],
        ["FocalLengthIn35mmFilm", ["Make", "Model", "LensModel"]],
        ["FocalLength_cust", ["Make", "Model"]],
        ["FocalLength_cust", ["Make", "Model", "LensModel"]],
        ["ShutterSpeed_calc", ["Make", "Model"]],
        ["ShutterSpeed_calc", ["Make", "Model", "LensModel"]],
        ["ISOSpeedRatings", ["Make", "Model"]],
        ["ISOSpeedRatings", ["Make", "Model", "LensModel"]],
        ["Orientation_cust", ["Make", "Model"]],
        ["Orientation_cust", ["Make", "Model", "LensModel"]],
        ["LightSource_cust", ["Make", "Model"]],
        ["LightSource_cust", ["Make", "Model", "LensModel"]],
        ["MeteringMode_cust", ["Make", "Model"]],
        ["MeteringMode_cust", ["Make", "Model", "LensModel"]],
        ["ApertureValue_cust", ["Make", "Model"]],
        ["ApertureValue_cust", ["Make", "Model", "LensModel"]],
        ["BrightnessValue_cust", ["Make", "Model"]],
        ["BrightnessValue_cust", ["Make", "Model", "LensModel"]],
        ["ExposureBiasValue_cust", ["Make", "Model"]],
        ["ExposureBiasValue_cust", ["Make", "Model", "LensModel"]],
        ["MaxApertureValue_cust", ["Make", "Model"]],
        ["MaxApertureValue_cust", ["Make", "Model", "LensModel"]],
        ["Sharpness_cust", ["Make", "Model"]],
        ["Sharpness_cust", ["Make", "Model", "LensModel"]],
        ["SceneCaptureType_cust", ["Make", "Model"]],
        ["SceneCaptureType_cust", ["Make", "Model", "LensModel"]],
        ["Make", ""],
        ["Model", ""],
        ["LensModel", ""],
        ["Year", ["Make", "Model"]],
        ["Year", ["Make", "Model", "LensModel"]],
        ["Year_Month", ["Make", "Model"]],
        ["Year_Month", ["Make", "Model", "LensModel"]],
        ["Month", ["Make", "Model"]],
        ["Month", ["Make", "Model", "LensModel"]],
        ["Hour", ["Make", "Model"]],
        ["Hour", ["Make", "Model", "LensModel"]],
        ["Week", ["Make", "Model"]],
        ["Week", ["Make", "Model", "LensModel"]],
        ["Hour", "ISOSpeedRatings"],
        ["Hour", "FNumber_cust"],
        ["Hour", "ShutterSpeed_cust"],
        ["FNumber_cust", "ISOSpeedRatings"],

        #["ExposureTime_calc", ""],
        # [["FNumber_cust", "ISOSpeedRatings"], ""],   ### this pattern raises an error
    ]

    # Scatter / hexbin / joint plots: [x column, y column]
    plots_scatter = [
        ["ExposureTime_calc", "FNumber_cust"],
        ["Hour", "FNumber_cust"],
        ["Hour", "ISOSpeedRatings"],
        ["ISOSpeedRatings", "FNumber_cust"],
        ["FocalLengthIn35mmFilm", "FNumber_cust"],
        ["FocalLength_cust", "FNumber_cust"],
        ["ApertureValue_cust", "FNumber_cust"],
        ["ShutterSpeed_calc", "FNumber_cust"],
        ["ShutterSpeed_calc", "ISOSpeedRatings"],
        ["ShutterSpeed_calc", "ExposureTime_calc"],
    ]

    # Pie charts: one per column
    plots_pie = [
        "Make",
        "Model",
        "LensModel",
        "FocalLengthIn35mmFilm",
        "FocalLength_cust",
        "FNumber_cust",
        "ISOSpeedRatings",
        "Year",
        "Month",
        "Week",
    ]

    # Write the raw data (and later the pivot tables) to Excel
    saveExcelFile = dataDir + "/Photo Data " + now_dt + ".xlsx"
    writer = pd.ExcelWriter(saveExcelFile)
    df.to_excel(writer, sheet_name=now_dt)

    # The scatter and hexbin calls pass ``legend=lgd``. It used to be bound
    # only inside the bar-chart loop, so disabling bar charts raised a
    # NameError. True is the value the last bar-chart entry leaves behind.
    lgd = True

    # - - - - - - - - - -
    # Create bar charts
    # - - - - - - - - - -
    if plotBar:
        for idx, clm in plots_bar:

            if clm != "":
                ds = pd.pivot_table(df,
                                    values="Count",
                                    index=idx,
                                    columns=clm,
                                    aggfunc=lambda x: len(x))

                # Several split columns are shown as "A & B & C".
                if isinstance(clm, list):
                    clm2 = " & ".join(clm)
                else:
                    clm2 = clm

                idx = ModifyName(idx)  # rename only after the pivot is built
                fn = idx + " by " + ModifyName(clm2)
                pTitle = "x: " + ModifyName(idx) + " | y: " + ModifyName(clm2)
                lgd = True
            else:
                ds = pd.pivot_table(df,
                                    values="Count",
                                    index=idx,
                                    aggfunc=lambda x: len(x))

                idx = ModifyName(idx)  # rename only after the pivot is built
                fn = idx
                pTitle = idx
                lgd = False

            # Shorten long names
            fn = ReduceName(fn)

            # Write to EXCEL File
            ds.to_excel(writer, sheet_name=fn.replace("&", "_"))

            # Draw plot
            if idx in PlotBarRotate:
                rotate = PlotBarRotate[idx]  # rotation configured for this chart
            else:
                rotate = plotRotate

            ds.columns.name = ""
            ds.index.name = ""
            ds.plot(kind="bar",
                    title=pTitle,
                    grid=plotGrid,
                    legend=lgd,
                    subplots=plotSubPlots,
                    fontsize=plotFontSize,
                    rot=rotate,
                    figsize=(plotFixSizeX, plotFixSizeY),
                    stacked=True)

            saveFile = dataDir + "/Pd_" + fn + ".png"
            print("> Plot:" + saveFile)
            plt.savefig(saveFile)
            plt.close()

    # - - - - - - - - - -
    # Create scatter / hexbin charts
    # - - - - - - - - - -
    for val_x, val_y in plots_scatter:
        pTx = ModifyName(val_x)
        pTy = ModifyName(val_y)
        pTitle = str(pTx) + " vs " + str(pTy)

        # Shorten long names
        fn = ReduceName(pTitle)

        if plotScatter:
            # Scatter Chart
            df.plot(
                kind='scatter',
                x=val_x,
                y=val_y,
                linewidth="2",
                c="blue",
                edgecolors="blue",
                title=pTitle,
                grid=plotGrid,
                legend=lgd,
                subplots=plotSubPlots,
                fontsize=plotFontSize,
                #rot=plotRotate,
                figsize=(plotFixSizeX, plotFixSizeY),
                stacked=True)
            plt.xlabel(pTx)
            plt.ylabel(pTy)
            saveFile = dataDir + "/Ps_" + fn + ".png"
            print("> Plot:" + saveFile)
            plt.savefig(saveFile)
            plt.close()

        if plotHexbin:
            # Hexbin Chart
            df.plot(
                kind='hexbin',
                x=val_x,
                y=val_y,
                gridsize=30,
                marginals=False,
                cmap=cm.PuBu,
                title=pTitle,
                grid=plotGrid,
                legend=lgd,
                subplots=plotSubPlots,
                fontsize=plotFontSize,
                #rot=plotRotate,
                figsize=(plotFixSizeX, plotFixSizeY),
                stacked=True)

            plt.xlabel(pTx)
            plt.ylabel(pTy)
            saveFile = dataDir + "/Ph_" + fn + ".png"
            print("> Plot:" + saveFile)
            plt.savefig(saveFile)
            plt.close()

    # - - - - - - - - - -
    # Plots pie chart
    # - - - - - - - - - -
    if plotPie:
        for idx in plots_pie:
            try:
                pTitle = ModifyName(idx)
                ds = pd.pivot_table(df,
                                    values="Count",
                                    index=idx,
                                    aggfunc=lambda x: len(x))
                ds.plot(kind="pie",
                        y="Count",
                        subplots=True,
                        title=pTitle,
                        autopct='%.1f',
                        figsize=(plotFixSizeX, plotFixSizeY),
                        counterclock=False,
                        startangle=90,
                        pctdistance=0.8)
                plt.ylabel("")
                fn = ModifyName(idx)

                # Shorten long names
                fn = ReduceName(fn)

                saveFile = dataDir + "/Pp_" + str(fn) + ".png"
                print("> Plot:" + saveFile)
                plt.axis('equal')
                plt.savefig(saveFile)
                plt.close()

            except AssertionError as err:
                # A failing pie chart is reported and the loop continues.
                print("*EXCEPTION:", err)

    # - - - - - - - - - -
    # Seaborn Chart
    # - - - - - - - - - -
    if plotSeaborn:
        # Seaborn PairPlot #1
        df_select = df.loc[:, [
            "FocalLength_cust", "FNumber_cust", "ShutterSpeed_calc",
            "ISOSpeedRatings", "ApertureValue_cust", "ExposureBiasValue_cust",
            "LensModel"
        ]]
        sns.pairplot(df_select, hue="LensModel")
        saveFile = dataDir + "/Sp_Pairplot1.png"
        plt.savefig(saveFile)
        plt.close()

        # Seaborn PairPlot #2
        df_select = df.loc[:, [
            "Hour", "FocalLength_cust", "FNumber_cust", "ShutterSpeed_calc",
            "ISOSpeedRatings", "LensModel"
        ]]
        sns.pairplot(df_select, hue="LensModel")
        saveFile = dataDir + "/Sp_Pairplot2.png"
        plt.savefig(saveFile)
        plt.close()

        # Seaborn JointPlot (using the scatter column pairs)
        for val_x, val_y in plots_scatter:
            pTx = ModifyName(val_x)
            pTy = ModifyName(val_y)
            pTitle = str(pTx) + " vs " + str(pTy)

            sb_kind = "hex"  # reg, kde, hex
            # x, y and data go by keyword: seaborn >= 0.12 no longer accepts
            # them positionally, and the keyword form also works on older
            # releases.
            sns.jointplot(x=val_x, y=val_y, data=df, kind=sb_kind)

            # Shorten long names
            fn = ReduceName(pTitle)
            saveFile = dataDir + "/Sj_" + str(fn) + ".png"
            print("> Plot:" + saveFile)
            plt.savefig(saveFile)
            plt.close()

        # Seaborn HeatMap (disabled)
        # sns.heatmap(df.corr())

    # Finally write the workbook to disk. ``close()`` is used because
    # ``ExcelWriter.save()`` was removed in pandas 2.0; ``close()`` saves the
    # file on older and newer versions alike.
    writer.close()
    print("> Saved EXCEL file: " + saveExcelFile)

Example #50

0

Show file

# Age of each car, measured against the fixed reference year 2020.
df['Current_Year'] = 2020

df['Age_of_Car'] = df['Current_Year'] - df['Year']

df.head()

# The name and the two year columns are not needed once the age is known.
df = df.drop(columns=['Car_Name', 'Year', 'Current_Year'])

df.head()

# Indicator columns for the categorical columns, first level dropped.
df = pd.get_dummies(data=df, drop_first=True)

df.head()

sns.pairplot(data=df)

# Annotated correlation heat map on a 12 x 6 inch figure.
corr = df.corr()
top_features = corr.index
plt.figure(figsize=(12, 6))
sns.heatmap(data=corr, cmap='RdYlGn', annot=True)

# Every column after the first goes to x; the first column goes to y.
x = df.iloc[:, 1:]

y = df.iloc[:, 0]

x.head()

y.head()

# Feature Importance

Example #51

0

Show file

File: PlayStoreAppsRatingPrediction.py Project: bleonahasanaj/playstore_apps_rating_predict

                  color='lightgreen')
plt.show(bar)

# In[24]:

# Look at how the attributes depend on one another.
# NOTE(review): x, y, z, t and price are kept exactly as before because later
# cells presumably still reference them - confirm before removing.
x = data_file['Rating'].dropna()
y = data_file['Size'].dropna()
z = data_file['Installs'][data_file.Installs != 0].dropna()
t = data_file['Type'].dropna()
price = data_file['Price']

# Build the plotting frame from rows that pass *all* filters at once.
# Filtering each column separately and zipping the results pairs values that
# come from different apps as soon as any column has a missing or zero entry.
pair_columns = ['Rating', 'Size', 'Installs', 'Reviews', 'Type', 'Price']
required = ['Rating', 'Size', 'Installs', 'Reviews', 'Type']
keep_rows = (
    data_file[required].notna().all(axis=1)
    & (data_file['Installs'] != 0)
    & (data_file['Reviews'] != 0)
)
pair_data = data_file.loc[keep_rows, pair_columns].copy()
# Log scales, as in the original plot: natural log for installs, log10 for
# reviews. Zero counts were excluded above so the logs are finite.
pair_data['Installs'] = np.log(pair_data['Installs'])
pair_data['Reviews'] = np.log10(pair_data['Reviews'])

p = sns.pairplot(pair_data, hue='Type', palette="Set2")

# In[25]:

# Look at how the values of the Rating attribute are distributed.
data_file.hist(column='Rating')
# NOTE(review): 10841 is presumably the number of rows in the dataset, used
# as a fixed upper limit for the y axis - confirm.
plt.ylim(0, 10841)
plt.title("Shpërndarja e Rating")
plt.xlabel("Vlera e Rating")
plt.ylabel("Nr. i aplikacioneve")

# In[26]:

# Encoding of the App attribute

Example #52

0

Show file

File: plots.py Project: roxarcucci/VGPosp

def pairplots(trainA):
    """Display a seaborn pair plot of ``trainA`` with KDE curves on the diagonal.

    Switches seaborn to the "ticks" style with color codes enabled, draws the
    pair plot and shows the resulting figure via ``plt.show()``.
    """
    sns.set(color_codes=True, style="ticks")
    sns.pairplot(data=trainA, diag_kind='kde')
    plt.show()

Example #53

0

Show file

File: reftep_replica_results.py Project: robbisg/mvpa_itab_wu

    for c in columns:

        if 'mep' in c:
            idx = int(c[-1]) - 1
            data = np.log(mat['AmpsMclean'][()][idx])
        #elif 'amplitude' in c:
        #    data = np.log(mat[c][()][idx])
        else:
            data = mat[c][()][idx]
        vector.append(data)

    vector = np.vstack(vector)
    df = pd.DataFrame(vector.T, columns=columns)

    sns.pairplot(df, diag_kws=diag_kws, plot_kws=plot_kws)
    mat.close()

##############################################################################
task = 'phastimate'
threshold_key = 'phases32'

full_dataset = list()

for i in range(9):
    sub = "sub-%03d" % (i + 1)
    filename = os.path.join(
        path, sub,
        sub + "_space-sensor_window-500_atlas-subject_band-mu_%s.mat" % (task))
    mat = h5py.File(filename, 'r')
    vector = []

Example #54

0

Show file

File: script1199.py Project: darkblue-b/kaggleScape

house = house.drop(['id', 'date'], axis=1)

# **Pairplot Visualisation**
#
# Let's create some Seaborn pairplots for the features ('sqft_lot','sqft_above','price','sqft_living','bedrooms') to get a feel for how the various features are distributed vis-a-vis the price as well as the number of bedrooms

# In[ ]:

#sns.pairplot(house[['sqft_lot','sqft_above','price','sqft_living','bedrooms']], hue='bedrooms', palette='afmhot',size=1.4)

# In[ ]:

# Enlarge the fonts only while this figure is being built.
with sns.plotting_context("notebook", font_scale=2.5):
    # `height` is the current name of the facet-size argument; the old `size`
    # spelling is deprecated and only triggers a warning.
    g = sns.pairplot(
        house[['sqft_lot', 'sqft_above', 'price', 'sqft_living', 'bedrooms']],
        hue='bedrooms',
        palette='tab20',
        height=6)
# Hide the x tick labels on every facet.
g.set(xticklabels=[])

# From the pairplots, we seem to get the classical linear distribution of the data points, for example with price against sqft_living. This bodes well as in the latter analysis, we will implement some linear models which we will use in our Feature ranking. Let's look at the correlation heatmap:

# In[ ]:

# Names of the columns that contain strings (words). A column is classified
# by inspecting the value stored under index label 1.
# DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
str_list = [
    colname
    for colname, colvalue in house.items()
    if isinstance(colvalue[1], str)
]
# Get to the numeric columns by inversion
num_list = house.columns.difference(str_list)
# Create Dataframe containing only numerical features

Example #55

0

Show file

File: mainRo.py Project: SalgueroRoci/Regression-Practice

from sklearn.model_selection import train_test_split, cross_val_score

# Removes scientific notation
np.set_printoptions(suppress=True)

# Loading data
data = pd.read_csv("Dataset.csv")
# Feature columns exactly as spelled in the CSV header (note the leading
# space in ' Pr') and the name of the target column.
x_title = ['Tm', ' Pr', 'Th', 'Sv']
y_title = 'Idx'
x_original = data[x_title]
y_original = data[y_title]
# Hold out half of the rows for testing; random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_original, y_original, test_size=0.5, random_state=1)


# Plotting the Data scatter Linear Regression=======================================
# `height` is the current name of the facet-size argument; the old `size`
# spelling is deprecated.
seaborn.pairplot(data, x_vars=x_title, y_vars=y_title, height=7, aspect=1)
matplot.show()

# Plot the original data together with a one-feature linear regression fit.
x_attributes = ["Tm", " Pr", "Th", "Sv"]
x_labels = ['Tempurature', 'Pressure', 'Thermal Conductivity', 'Sound Velocity']

for count, x_attr in enumerate(x_attributes):
    # scikit-learn expects 2-D feature arrays. pandas Series no longer has a
    # reshape() method, so reshape the underlying NumPy array instead.
    train_feature = x_train[x_attr].values.reshape(-1, 1)
    test_feature = x_test[x_attr].values.reshape(-1, 1)

    matplot.scatter(x_original[x_attr], y_original)
    liLSM = LinearRegression()
    liLSM.fit(train_feature, y_train)
    y_predict = liLSM.predict(test_feature)
    matplot.plot(test_feature, y_predict, 'r')
    matplot.legend(['Predicted line','Observed data'])
    matplot.xlabel(x_labels[count])
    matplot.ylabel('Chem Index')

Example #56

0

Show file

File: StockMarketAnalysis.py Project: thiagobandeira60/Stock_Market_Analysis

# %%
# Now let's compare google to itself

# x, y and data are passed by keyword: seaborn 0.12+ accepts only `data`
# positionally, so the old positional form raises a TypeError there.
sns.jointplot(x='GOOG', y='GOOG', data=tech_returns, kind='scatter',
              color='seagreen')
# %%
# That's a perfect linear relationship, and that makes sense, since we are comparing google to google.
# %%
# Now let's check if there are relationships between different tech stocks

sns.jointplot(x='GOOG', y='MSFT', data=tech_returns, kind='scatter',
              color='seagreen')
# %%
# Now let's do some plots that will make it easy to compare the tech stocks on our list

tech_returns.head()
# %%
sns.pairplot(tech_returns.dropna())
# %%
sns.pairplot(tech_returns.dropna(), kind="reg")
# %%
sns.pairplot(tech_returns.dropna(), kind="reg", diag_kind='kde')
# %%
# Just so we can have an idea on how to interpret these graphs:

from IPython.display import SVG
SVG(url='http://upload.wikimedia.org/wikipedia/commons/d/d4/Correlation_examples2.svg')
# %%
# The above visualizations show an interesting correlation between Google and Amazon daily returns
# We can dig a little deeper and use a PairGrid to see a more detailed and controlled plot between those two.
# %%
returns_fig = sns.PairGrid(tech_returns.dropna())
returns_fig.map_upper(plt.scatter, color='purple')

Example #57

0

Show file

File: daily3.py Project: adityagupta9612/pyanalytics

import matplotlib.pyplot as plt
#https://matplotlib.org/
# Row count per gender, first as a bare expression (only displayed in an
# interactive session) and then drawn as a bar chart.
# df1 is defined earlier in the file, outside this excerpt.
df1.groupby('gender').size()
df1.groupby('gender').size().plot(kind='bar')

# Histogram of the 'marks' column.
plt.hist(df1['marks'])

#https://seaborn.pydata.org/index.html
import seaborn as sns
# sns.set(style="ticks", color_codes=True)
# Load seaborn's bundled iris example dataset and peek at both ends of it.
iris = sns.load_dataset("iris")
iris.head()
iris.tail()
df1.groupby('gender').size()
# Row count per species as a bar chart, then pairwise plots of the iris columns.
iris.groupby('species').size().plot(kind='bar')
sns.pairplot(iris)

#%%
#Load Inbuilt Datasets
import statsmodels.api as sm
#https://vincentarelbundock.github.io/Rdatasets/datasets.html
# Fetch 'mtcars' from the R 'datasets' package; the DataFrame is exposed on
# the returned object's .data attribute.
mtcars = sm.datasets.get_rdataset(dataname='mtcars', package='datasets')
mtcars.data.head()
mtcars.data.tail()
mtcars.data.columns

#%%
#Load from Excel/ CSV and export to
# Keep a direct handle on the mtcars DataFrame for the steps that follow.
data = mtcars.data
data.head(6)
type(data)

Example #58

0

Show file

File: assignment.py Project: Bharanitharan13/Predict-customer-will-deposit-or-not

    print('='*45)

# Crosstab of every categorical feature against the target 'y', as bar charts
for c in category:
    table = pd.crosstab(data[c], data['y'])
    table.plot(kind='bar')

# Correlation heatmap for the numeric variables.
# Restrict to numeric/bool columns explicitly: pandas 2.0+ no longer drops
# non-numeric columns silently in DataFrame.corr() and raises on strings.
corelation = data.select_dtypes(include=['number', 'bool']).corr()
# plt.subplots returns (figure, axes); unpack it so `ax` really is the Axes
# and draw the heatmap on it.
fig, ax = plt.subplots(figsize=(9, 7))
sns.heatmap(corelation, annot=True, ax=ax)


### Multivariate analysis

sns.pairplot(data, hue='y', palette='coolwarm')

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

def categorical_variable(dataframe):
    """Label-encode every object-dtype column of ``dataframe`` in place.

    Each column whose dtype compares equal to ``'object'`` is replaced by the
    integer codes produced by the module-level encoder ``le``. Other columns
    are left untouched. The same (mutated) frame is returned.
    """
    # Snapshot the column labels first so the dtype check is made against the
    # frame as it was on entry.
    object_columns = []
    for label in dataframe.columns:
        if dataframe.dtypes[label] == 'object':
            object_columns.append(label)

    for label in object_columns:
        encoded = le.fit_transform(dataframe[label])
        dataframe[label] = encoded
    return dataframe

categorical_variable(transformed_data)
transformed_data.columns

# feature selecition
from sklearn.model_selection import train_test_split

Example #59

0

Show file

            df_company[df_company['kmean'] == 0][4],
            s=100,
            c='red',
            label='Cluster 1')
# Scatter the remaining k-means clusters (column 2 against column 4), one
# colour per cluster. Each cluster gets its own legend label; previously all
# of them were labelled 'Cluster 1'.
for cluster_id, colour, legend_label in ((1, 'blue', 'Cluster 2'),
                                         (2, 'green', 'Cluster 3')):
    members = df_company[df_company['kmean'] == cluster_id]
    plt.scatter(members[2],
                members[4],
                s=100,
                c=colour,
                label=legend_label)

# Pairwise plots of all columns, coloured by cluster assignment.
sns.pairplot(df_company, hue='kmean')

# ############################ Hierarchical Clustering

#l1 = [df['EduDegree'],df['HasChild'],df['GeoLivArea']]
#df['hc_split'] = pd.concat(, axis=1 )

# Build a combined key by concatenating the string forms of the three
# categorical columns.
df['hc_split'] = df['EduDegree'].map(str) + df['HasChild'].map(
    str) + df['GeoLivArea'].map(str)
# Pass the column as x= by keyword: seaborn 0.12+ treats the first positional
# argument as `data`, so the positional form clashes with data=df.
sns.countplot(x='hc_split', data=df)

# The source columns are now folded into 'hc_split'; 'CustId' is dropped too.
df.drop(['EduDegree', 'HasChild', 'GeoLivArea'], inplace=True, axis=1)
df.drop(['CustId'], inplace=True, axis=1)

# Copy of the frame without the combined key.
df_hc = df.drop(['hc_split'], axis=1)

Example #60

-1

Show file

File: classify_ghosts_v4h2.py Project: fzhurd/fzwork

def main():
    """Load the train/test CSVs, add interaction features, classify and save.

    Reads ``../input/train.csv`` and ``../input/test.csv``, draws a pair plot
    of four numeric columns coloured by ``type``, adds three product features
    to both frames, drops ``id`` and ``color``, one-hot encodes what is left
    with ``pd.get_dummies`` and passes the result to ``run_classifier``. The
    returned predictions are handed to ``save_result`` with the test ids.
    """

    df_train=pd.read_csv('../input/train.csv')
    df_test=pd.read_csv('../input/test.csv')

    # Pair plot of the numeric columns, coloured by the target 'type'.
    sns.set()
    sns.pairplot(df_train[["bone_length", "rotting_flesh", "hair_length", "has_soul", "type"]], hue="type")
    # sns.plt.show()

    # Interaction features: products of hair_length with has_soul and/or
    # bone_length, added to the training frame ...
    df_train['hair_soul'] = df_train['hair_length'] * df_train['has_soul']
    df_train['hair_bone'] = df_train['hair_length'] * df_train['bone_length']
    df_train['hair_soul_bone'] = df_train['hair_length'] * df_train['has_soul'] *df_train['bone_length']


    # ... and identically to the test frame.
    df_test['hair_soul'] = df_test['hair_length'] * df_test['has_soul']
    df_test['hair_bone'] = df_test['hair_length'] * df_test['bone_length']
    df_test['hair_soul_bone'] = df_test['hair_length'] * df_test['has_soul'] * df_test['bone_length']

    # Keep the test ids before the column is dropped; they are passed to
    # save_result below.
    test_id = df_test['id']

    df_train.drop(['id'], axis=1, inplace=True)
    df_test.drop(['id'], axis=1, inplace=True)

    df_train.drop(['color'], axis=1, inplace=True)
    df_test.drop(['color'], axis=1, inplace=True)

    # Separate the features from the target column 'type'.
    df_train_data = df_train.drop('type', axis=1)
    df_train_results=df_train['type']

    # One-hot encode any remaining non-numeric columns.
    df_train_data = pd.get_dummies(df_train_data)
    df_test_data = pd.get_dummies(df_test)

    # NOTE(review): 'rf' presumably selects a random forest, yet the output
    # file name mentions logistic regression - confirm which is intended.
    test_results=run_classifier(df_train_data,df_train_results,df_test_data, 'rf')
    save_result(test_id, test_results,'results_logistic_regression.csv')