Example #1
0
def plot_uturns(summary, critical_rad=2.9, save=False, condition='Condition',
    context='notebook'):
    """Report and pair-plot the tracks that turn by more than critical_rad.

    The turn column is the first column of ``summary`` whose name starts
    with 'Max. Turn Over'; the step count is the first all-digit word of
    that column name.  A per-condition count of the selected tracks is
    printed, then a seaborn pairplot of them is drawn and either saved as
    a PNG (``save=True``) or shown.
    """
    turn_column = next(name for name in summary.columns
        if name.startswith('Max. Turn Over'))
    plotted_columns = ['Skew Lines Distance', 'Mean Velocity',
        'Arrest Coefficient', condition, turn_column]

    is_uturn = summary[turn_column] > critical_rad
    uturns = summary[is_uturn]

    digit_words = (token for token in turn_column.split() if token.isdigit())
    skip_steps = int(next(digit_words))
    header = '\nPlotting turns with more than {} rad over {} steps'
    print(header.format(critical_rad, skip_steps))
    report = '  {} tracks in {} with {} U-Turns ({:2.2f} %).'
    for cond, cond_uturns in uturns.groupby(condition):
        # Tracks of this condition in the full summary, not only the U-turns
        n_tracks = len(summary[summary[condition] == cond])
        n_turns = len(cond_uturns)
        print(report.format(n_tracks, cond, n_turns, n_turns/n_tracks*100))

    sns.set(style='white', context=context)
    sns.pairplot(uturns[plotted_columns], hue=condition, diag_kind='kde')
    plt.tight_layout()

    if not save:
        plt.show()
        return

    # File name: all condition labels (with '= ' stripped) joined by '-'
    labels = [cond.replace('= ', '') for cond in summary[condition].unique()]
    suffix = '_{:1.1f}over{}steps.png'.format(critical_rad, skip_steps)
    plt.savefig('U-Turns_' + '-'.join(labels) + suffix, dpi=300)
Example #2
0
def useDataAnalysis(df_train_origin):
    """Draw exploratory plots of the training frame and print a correlation table.

    Plots the per-group mean of 'count' for wind speed, humidity and
    temperature, a temperature/humidity scatter, a 2x3 grid of 'count'
    scatters, a seaborn pairplot and the correlation matrix, then shows
    all open figures.

    :param df_train_origin: DataFrame providing the columns referenced
        below ('count', 'temp', 'atemp', 'humidity', 'windspeed', 'weather',
        'day', 'month', 'hour').
    """
    # Mean count per wind speed, humidity and temperature value
    for column in ('windspeed', 'humidity', 'temp'):
        df_train_origin.groupby(column).mean().plot(y='count', marker='o')

    # Temperature against humidity
    df_train_origin.plot(x='temp', y='humidity', kind='scatter')

    # Scatter 'count' against each feature on a shared y axis
    fig, axs = plt.subplots(2, 3, sharey=True, figsize=(16, 8))
    panels = [('temp', 'magenta'), ('atemp', 'cyan'), ('humidity', 'red'),
              ('windspeed', 'yellow'), ('month', 'blue'), ('hour', 'green')]
    for ax, (feature, color) in zip(axs.flat, panels):
        df_train_origin.plot(kind='scatter', x=feature, y='count', ax=ax, color=color)

    sns.pairplot(df_train_origin[["temp", "month", "humidity", "count"]], hue="count")

    # Pairwise correlations of the selected columns
    corr = df_train_origin[['temp', 'weather', 'windspeed', 'day', 'month', 'hour', 'count']].corr()
    print(corr)

    # Colour intensity encodes the correlation. matshow opens its own
    # figure, so no extra plt.figure() call is needed (it only left an
    # empty window behind).
    plt.matshow(corr)
    plt.colorbar()
    plt.show()
            def pairplot():
                """Add one pairplot page per PDB-ID colouring (wild type, then mutant) to output_pdf."""
                sns.set_style('white', {'axes.grid': True, 'axes.edgecolor':'0'})
                sns.set_context('paper', font_scale=1.5, rc={'lines.linewidth': 1})
                plotted_vars = ['Experimental DDG', 'Predicted DDG', 'Absolute Error DDG', 'Mutant Complex REU', 'RMSD']
                # (hue column, note attached to the PDF page), in output order
                pages = (
                    ('WT PDBID',
                     'This pairplot compares the various numerical variables contained within the mutant_df dataframe. The following variables are compared in a pairwise fashion where hue is WT PDBID: Experimental DDG, Predicted DDG, Absolute Error DDG, Mutant Complex Rosetta Energy, and RMSD'),
                    ('Mutant PDBID',
                     'This pairplot compares the various numerical variables contained within the mutant_df dataframe. The following variables are compared in a pairwise fashion where hue is Mutant PDBID: Experimental DDG, Predicted DDG, Absolute Error DDG, Mutant Complex Rosetta Energy, and RMSD'),
                )
                for hue_column, note in pages:
                    grid = sns.pairplot(df_subset,
                                        vars=list(plotted_vars),
                                        size=3,
                                        hue=hue_column,
                                        hue_order=sorted(list(mutant_df[hue_column].unique())),
                                        kind='scatter',
                                        diag_kind='hist')
                    lgd = grid.fig.legend(handles=df_subset[hue_column], labels=df_subset[hue_column], bbox_to_anchor=(1.05, 0.5))
                    output_pdf.attach_note(note)
                    title = grid.fig.suptitle('PairPlot for %s' % description, fontsize=24, y=1.05)
                    # Title and legend sit outside the axes, so they are passed as extra artists
                    output_pdf.savefig(grid.fig, pad_inches=1, bbox_extra_artists=[title, lgd], bbox_inches='tight')
    def Plot_data_df(self,df_2,label,strac,pair=True):
        """Plot every column of df_2 as a scatter and a distribution, plus an optional pairplot.

        :param df_2: DataFrame whose columns are plotted one row of subplots each.
        :param label: per-row values used to colour the scatter points and as
            the pairplot hue.
        :param strac: titles for the columns, indexed by column position; also
            passed as ``vars`` to the pairplot.
        :param pair: when true, additionally draw a seaborn pairplot.
        """
        import matplotlib.pylab as plt
        import seaborn as sns
        import pandas as pd

        x = list(range(df_2.shape[0]))
        maxn = df_2.shape[1]
        fig = plt.figure(1,figsize=(10,maxn+10))
        sns.set(style='darkgrid',color_codes=True)
        for i in range(maxn):
            # Positional access; the old DataFrame.ix accessor no longer exists
            y = df_2.iloc[:, i]
            scatter_axes = fig.add_subplot(maxn, 2, 2*i+1)
            scatter_axes.scatter(x, y, c=label, s=10, alpha=0.7)
            plt.title(strac[i])

            fig.add_subplot(maxn, 2, 2*i + 2)
            sns.distplot(y, axlabel=False)
            fig.tight_layout()
            plt.title(strac[i])

        # NOTE(review): the inner join aligns on the index, so rows are dropped
        # unless df_2 has a default 0..n-1 index -- confirm with callers.
        df_label = pd.DataFrame({'label': list(label)})
        df_2 = pd.concat([df_2, df_label], axis=1, join='inner')
        if pair:
            sns.pairplot(df_2, vars=strac, hue='label')
        plt.show()
    def feature_analysis(self):
        """
        Save two label-coloured pairplots: all features, then a selected subset.

        The voice or EEG column set, titles and output paths are picked
        from ``self.is_voicedata``.
        """
        if self.is_voicedata:
            hue_column = "label"
            selected_columns = ["skew","kurt", "meanfun", "meanfreq", "IQR", "label"]
            title_all = "Feature analysis for voice dataset - all features"
            title_selected = "Feature analysis for voice dataset - selected features"
            path_all = "../Figures/voice_exploration.png"
            path_selected = "../Figures/voice_exploration_selected.png"
        else:
            hue_column = "Self-defined label"
            selected_columns = ["Delta","Theta","Alpha 1", "Beta 2", "Gamma1", "Self-defined label"]
            title_all = "Feature analysis for EEG dataset - all features"
            title_selected = "Feature analysis for EEG dataset - selected features"
            path_all = "../Figures/EEG_exploration.png"
            path_selected = "../Figures/EEG_exploration_selected.png"

        seaborn.set_context("poster")
        # First pass plots every column (columns is None), second pass the subset
        passes = ((None, title_all, path_all),
                  (selected_columns, title_selected, path_selected))
        for columns, title, path in passes:
            plt.figure(figsize = (10,8))
            frame = self.data if columns is None else self.data[columns]
            grid = seaborn.pairplot(frame, hue = hue_column)
            plt.suptitle(title)
            grid.savefig(path, bbox_inches="tight")
Example #6
0
 def scatter_corrplot_parameters(self,params):
     """
     Print the data frame built from params and draw its pairwise scatter matrix.
     """
     frame = self.pandas_data_frame(params)
     print(frame)
     sns.pairplot(frame, size=10, dropna=True)
def scatterplot():
    '''Draw a species-coloured seaborn pairplot of the iris data set and pass the
    file name 'multiScatterplot.png' to the printout helper.'''
    import seaborn as sns

    iris = sns.load_dataset("iris")
    sns.pairplot(iris, size=2.5, hue="species")
    C2_8_mystyle.printout_plain('multiScatterplot.png')
def visualize_data():
    '''
    Show a pairwise scatter matrix of the module-level df restricted to cols.
    '''
    sns.set(context='notebook', style='whitegrid')
    selected = df[cols]
    sns.pairplot(selected, size=2.5)
    plt.show()
 def pair(self,vars=None,save=False):
     """Draw a seaborn pairplot of the stored data, then save or show it.

     :param vars: column selection used to index ``self._data``; ``None``
         (the default) plots every column.
     :param save: when true call ``self._save()``, otherwise show the figure.
     """
     # The default vars=None used to index the data with None and fail;
     # fall back to the whole table instead.
     # NOTE(review): assumes self._data is a pandas DataFrame -- confirm.
     data = self._data if vars is None else self._data[vars]
     sns.pairplot(data)
     if save:
         self._save()
     else:
         plt.show()
     plt.close()
Example #10
0
def plot_2D_by_sns(subplot, data, target, target_names):
    """Draw a seaborn pairplot of data coloured by class name.

    :param subplot: unused; kept for interface compatibility.
    :param data: feature table, converted to a pandas DataFrame.
    :param target: per-row class indices.
    :param target_names: class names, indexed by the values in ``target``.
    """
    sns.set()
    # Convert to pandas format
    pdata = pd.DataFrame(data)
    if 'species' not in pdata.columns:
        # The hue column was never created, so hue='species' could not be
        # resolved for plain arrays; derive it from target/target_names.
        # NOTE(review): assumes target holds indices into target_names -- confirm.
        pdata = pdata.assign(species=[target_names[label] for label in target])
    # The three markers require exactly three classes in 'species'
    sns.pairplot(data=pdata, hue='species', markers=["o", "s", "D"])
def statistical_analysis(df):
    """Check correlation of features to spread.

    Draws an annotated heatmap of the correlation matrix of ``df``, prints
    the absolute correlations with 'result_spread' in ascending order and
    draws a pairplot of the main difference features.
    """
    # Correlation matrix, computed once and reused below
    corrmat = df.corr()
    plt.subplots(figsize=(12, 9))
    sns.heatmap(corrmat, cbar=True, annot=True, square=True, fmt='.2f')
    plt.yticks(rotation=0)
    plt.xticks(rotation=90)

    corrvec = corrmat['result_spread'].abs()
    print(corrvec.sort_values())

    # Scatterplot matrix
    sns.set()
    cols = ['result_spread', 'rush_attempt_diff', 'turn_diff', 'yards_diff',
            'third_diff', 'sack_diff', 'sack_ydiff', 'p_attempt_diff']
    sns.pairplot(df[cols], size=2.5)

    # The per-column normality_check(...) calls are disabled here.
    # Rush attempt shows light tails but otherwise these main features
    # appear normally distributed.
def pairplot(df,hue_name=None):
    """Draw a seaborn pairplot of df, optionally coloured by hue_name.

    :param df: DataFrame to plot; it is not modified.
    :param hue_name: values for a temporary 'target' column used as hue
        (scalar or one value per row); ``None`` plots without hue.
    """
    if hue_name is None:
        sns.pairplot(df)
        return
    # assign() works on a copy, so the caller's frame no longer gains a
    # 'target' column as a side effect of plotting
    sns.pairplot(df.assign(target=hue_name), hue='target')
Example #13
0
def do_pairplots(counts, base_dir, sample):
    """
    Write three pairplot PDFs into base_dir: one combined plot coloured by
    uniqueness, then one plot each for the unique and not-unique rows.
    """
    markers = ["o", "s"]
    r, total_gems, assigned_gems, assigned_gems_by_para = assign_gems(counts)
    df = pd.DataFrame.from_dict(r)
    unique_gems = find_unique_gems(assigned_gems_by_para)
    num_unique = len(unique_gems)
    num_not_unique = len(df) - num_unique

    # Membership is evaluated once and reused for both labelings
    is_unique = [gem in unique_gems for gem in df["GemId"]]
    unique_label = "{:,} unique".format(num_unique)
    not_unique_label = "{:,} not unique".format(num_not_unique)
    df["Unique mappings"] = [unique_label if flag else not_unique_label
                             for flag in is_unique]

    combined = sns.pairplot(df, hue="Unique mappings", markers=markers, plot_kws=dict(s=10))
    combined.fig.text(0.87, 0.6, "{:,} Total Gems".format(len(total_gems)))
    combined_path = os.path.join(base_dir, "{}_combined_plot.pdf".format(sample))
    combined.savefig(combined_path, format="pdf")

    # Re-label to plain unique / not unique and plot each group on its own
    df["Unique mappings"] = ["Unique" if flag else "Not Unique" for flag in is_unique]
    for i, (subset, marker) in enumerate(zip(["Unique", "Not Unique"], markers)):
        group = df[df["Unique mappings"] == subset]
        color = sns.color_palette()[i]
        cmap = sns.light_palette(color, as_cmap=True)
        grid = sns.pairplot(group, markers=marker, plot_kws=dict(color=color, s=10))
        grid.map_lower(sns.kdeplot, cmap=cmap, n_levels=50)
        tag = subset.replace(" ", "_").lower()
        grid.savefig(os.path.join(base_dir, "{}_{}_combined_plot.pdf".format(sample, tag)), format="pdf")
    plt.close('all')
def Pairplot(feature_mat,weight=None):
    '''Plot and show a pairplot for the given feature matrix.

    feature_mat -- DataFrame to plot.
    weight -- optional hue for the plot; when given, every column except
              the last one is plotted.
    '''
    # seaborn no longer re-exports pyplot as sns.plt
    import matplotlib.pyplot as plt

    # "is None": comparing an array-like weight with == is ambiguous
    if weight is None:
        sns.pairplot(feature_mat)
    else:
        # hue is passed by keyword; newer seaborn does not accept it positionally
        sns.pairplot(feature_mat,
                     hue=weight,
                     palette=sns.color_palette("GnBu_d", n_colors=len(feature_mat)),
                     vars=feature_mat.columns.values[:-1])
    plt.show()
def scatterplot():
    """Draw a poster-sized, species-coloured pairplot of the iris data set and
    pass the file name 'multiScatterplot.png' to the printout helper."""
    import seaborn as sns

    sns.set()
    sns.set_context('poster')

    iris = sns.load_dataset("iris")
    sns.pairplot(iris, size=2.5, hue="species")
    mystyle.printout_plain('multiScatterplot.png')
Example #16
0
def demo01():
    """Print the head of the iris data set and show its species-coloured pairplot."""
    import matplotlib.pyplot as plt
    import seaborn as sns

    sns.set()
    flowers = sns.load_dataset('iris')
    preview = flowers.head()
    print(preview)
    sns.pairplot(flowers, size=1.5, hue='species')
    plt.show()
def feature_correlation(x,filepath=None, visualize=False):
    """
    Draw a seaborn pairplot of x, optionally saving and/or showing it.

    :param x: data passed to ``seaborn.pairplot``
    :param filepath: when not None, the figure is saved to this path
    :param visualize: when true, the figure is shown
    """
    seaborn.pairplot(x)
    # Save before showing: once show() returns the figure may already be
    # closed, and a later savefig would write an empty image.
    if filepath is not None:
        plt.savefig(filepath)
    if visualize:
        # seaborn no longer re-exports pyplot as seaborn.plt
        plt.show()
Example #18
0
def main():
    """Explore the housing data and fit a single-feature regression on RM -> MEDV.

    Downloads the data, shows a pairplot and a correlation heatmap of five
    columns, fits ``LinearRegressionGD`` on the standardized RM/MEDV
    columns, plots the cost per epoch and the fit, and prints the price
    predicted for five rooms.
    """
    df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data',
            header = None,
            sep = r'\s+')  # raw string: '\s' is not a valid string escape
    df.columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM',
            'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B',
            'LSTAT', 'MEDV']
    print(df.head())

    # Select a subset of the features and plot the correlation between features
    cols = ['LSTAT', 'INDUS', 'NOX', 'RM', 'MEDV']
    sns.pairplot(df[cols], size=2.5)
    plt.title('Correlations between 5 features')
    plt.show()

    # Plot a heatmap of the same subset of features
    cm = np.corrcoef(df[cols].values.T)
    sns.set(font_scale=2.5)
    sns.heatmap(cm,
            cbar = True,
            annot = True,
            square = True,
            fmt = '.2f',
            annot_kws = {'size': 15},
            yticklabels = cols,
            xticklabels = cols)
    plt.show()

    X = df[['RM']].values
    y = df['MEDV'].values

    sc_x = StandardScaler()
    sc_y = StandardScaler()

    X_std = sc_x.fit_transform(X)
    # StandardScaler expects 2-D input: scale y as a column, then flatten
    # back to the 1-D target the regressor is fitted on.
    y_std = sc_y.fit_transform(y[:, np.newaxis]).flatten()

    lr = LinearRegressionGD()
    lr.fit(X_std, y_std)

    plt.plot(range(1, lr.n_iter + 1), lr.cost_)
    plt.ylabel('SSE')
    plt.xlabel('Epoch')
    plt.show()

    lin_regplot(X_std, y_std, lr)
    plt.xlabel('Average number of rooms [RM] (standardized)')
    plt.ylabel('Price in $1000\'s [MEDV] (standardized)')
    plt.show()

    # Example prediction for a house with 5 rooms (one sample, one feature)
    num_rooms_std = sc_x.transform(np.array([[5.0]]))
    price_std = lr.predict(num_rooms_std)
    price = sc_y.inverse_transform(np.asarray(price_std).reshape(-1, 1))
    print("Price in $1000's: %.3f" % price[0, 0])
Example #19
0
def seaborn_plot(df,plot_type='pairplot',columns=False):
    """Show a seaborn overview plot of df.

    :param df: DataFrame to plot.
    :param plot_type: 'pairplot' for a scatter matrix or 'corr_plot' for a
        correlation plot; any other value shows no plot content.
    :param columns: optional column selection; a falsy value plots all columns.
    """
    # seaborn no longer re-exports pyplot as sns.plt
    import matplotlib.pyplot as plt

    sns.set()
    mpl.rc("figure", figsize=(16, 8.65))
    plotting_df = df[columns] if columns else df
    if plot_type == 'pairplot':
        sns.pairplot(plotting_df)
    elif plot_type == 'corr_plot':
        # sns.corrplot no longer exists in seaborn; an annotated heatmap of
        # the correlation matrix is the replacement.
        sns.heatmap(plotting_df.corr(), annot=True)
    plt.show()
Example #20
0
def pairplot(df, group="group"):
    """Save a pairplot of age, weight, heartrate and height coloured by group.

    The image is written below FIG_PATH as 'pairplot.png' for the default
    group and as 'pairplot_<group>.png' otherwise.
    """
    features = ['age', 'weight', 'heartrate', 'height']
    without_id = df.drop('id', axis=1)
    sns.pairplot(data=without_id,
                 vars=features,
                 hue=group,
                 diag_kind='kde',
                 size=5,
                 diag_kws={'shade': True, 'linewidth': 2},
                 plot_kws={'s': 50})
    if group == "group":
        filename = 'pairplot.png'
    else:
        filename = 'pairplot_%s.png' % group
    plt.savefig(os.path.join(FIG_PATH, filename), dpi=100)
Example #21
0
def main():
    """Load the training data, fill missing numeric values and show a pairplot.

    The plot covers MSSubClass, MSZoning and LotFrontage, coloured by
    SalePrice.
    """
    # seaborn no longer re-exports pyplot as sns.plt
    import matplotlib.pyplot as plt

    train = load_data('../input/train.csv')
    print(train.head(5))

    # fillna returns a new frame, so the result has to be kept; only numeric
    # columns have a mean to fill with.
    train = train.fillna(train.mean(numeric_only=True))

    sns.set()
    # hue must name a column of the plotted frame, so SalePrice is selected
    # together with the features instead of being passed as a DataFrame.
    sns.pairplot(train[["MSSubClass", "MSZoning", "LotFrontage", "SalePrice"]],
                 hue="SalePrice")
    plt.show()
Example #22
0
def pair_plot(metrics):
    """Draw a pairplot of four selected metric columns, coloured by 'height'."""
    # Left out of the selection (commented out in the original list):
    # meanRR, meanHR, peak_VLF, peak_LF, power_VLF, power_LF, power_HF,
    # pcpower_VLF, pcpower_LF, pcpower_HF, nupower_LF, nupower_HF
    selected = ('SDNN', 'RMSSD', 'peak_HF', 'power_LFHF')
    sns.pairplot(metrics, vars=list(selected), hue='height')
Example #23
0
	def correlation_plot(self, logarithmic=True):
		"""Plot pairwise parameter correlations as a scatterplot matrix.

		With ``logarithmic`` the log10 of all but the last column of the
		posterior samples is plotted (rows with NaN dropped); otherwise the
		raw samples are passed to ``sns.pairplot``.  Raises ``Exception``
		when sampling has not finished.
		"""
		if not self.sampling_finished:
			raise Exception("Must run .sample() before any output results can be viewed.")

		if logarithmic:
			log_samples = np.log10(self.posterior_samples.iloc[:,:-1])
			grid = sns.PairGrid(log_samples.dropna(), diag_sharey=False)
			grid.map_lower(sns.kdeplot, cmap="Blues_d")
			grid.map_upper(plt.scatter, alpha=.1)
			grid.map_diag(sns.kdeplot, lw=3)
			return

		sns.pairplot(self.posterior_samples)
Example #24
0
	def distribution(self):
		"""Show a pairplot of the 'freq', 'doa' and 'dof' values stored in self.result.

		The columns are labelled 'frequency', 'doa' and 'dof'.
		"""
		import pandas as pd
		import matplotlib
		# Select the backend before pyplot is imported so that it reliably
		# takes effect.
		matplotlib.use('TkAgg')
		import matplotlib.pyplot as plt
		import seaborn as sns
		sns.set(style="white", color_codes=True)

		# One row per entry of self.result; the keys themselves are not needed
		rows = [[value['freq'], value['doa'], value['dof']]
			for value in self.result.values()]
		df = pd.DataFrame(data=rows, columns=['frequency', 'doa', 'dof'])
		sns.pairplot(df)
		plt.show()
Example #25
0
def plot_by_neighborhood(data, geojson_file):
    """Map three per-neighborhood statistics and draw their pairplot.

    ``data`` is grouped by 'NEIGHBORHOOD'; the group means and sizes are
    joined onto the shapes read from ``geojson_file`` (indexed by 'alias').
    """
    by_neighborhood = data.groupby('NEIGHBORHOOD')

    shapes = gp.read_file(geojson_file).dropna()
    geonbr = shapes.set_index('alias').join(by_neighborhood.mean())
    geonbr['# of Requests'] = by_neighborhood.size()
    geonbr['Income ($k)'] = geonbr['Median_HHI'] / 1000.

    cols = ['# of Requests', 'time/SLA ratio', 'Income ($k)']
    _, axes = axes_grid(len(cols))
    # One map per statistic
    for ax, col in zip(axes.flat, cols):
        geonbr.plot(column=col, axes=ax)
        ax.set_title(col)

    seaborn.pairplot(geonbr[cols])
Example #26
0
def main():
    """Build interaction features, run the 'rf' classifier and save its predictions.

    Reads the train and test CSV files, draws a type-coloured pairplot of
    the training features, adds three product features to both frames,
    drops 'id' and 'color', one-hot encodes the rest and hands the result
    to ``run_classifier`` / ``save_result``.
    """
    df_train = pd.read_csv('../input/train.csv')
    df_test = pd.read_csv('../input/test.csv')

    sns.set()
    preview_columns = ["bone_length", "rotting_flesh", "hair_length", "has_soul", "type"]
    sns.pairplot(df_train[preview_columns], hue="type")

    # Interaction features, built identically on both frames
    for frame in (df_train, df_test):
        frame['hair_soul'] = frame['hair_length'] * frame['has_soul']
        frame['hair_bone'] = frame['hair_length'] * frame['bone_length']
        frame['hair_soul_bone'] = frame['hair_length'] * frame['has_soul'] * frame['bone_length']

    # Keep the ids for the result file before the column is dropped
    test_id = df_test['id']
    for column in ('id', 'color'):
        df_train.drop([column], axis=1, inplace=True)
        df_test.drop([column], axis=1, inplace=True)

    y_train = df_train['type']
    X_train = pd.get_dummies(df_train.drop('type', axis=1))
    df_test_data = pd.get_dummies(df_test)

    test_results = run_classifier(X_train, y_train, df_test_data, 'rf')
    # NOTE(review): the file name says logistic regression although the 'rf'
    # classifier is run -- confirm whether the name is intended.
    save_result(test_id, test_results, 'results_logistic_regression.csv')
Example #27
0
 def histogram(self,x=None, y=None, l=None, t=None, **kwargs):
     """
     Shortcut for several histogram-style plots of the table obtained
     from ``to_dataframe(l=l, t=t, latex=True)``.
     - neither x nor y given: a pairplot of every pair of columns, with
       the x tick labels rotated by 90 degrees
     - only x given: a 1d distribution plot of that column
     - x and y given: a joint plot of the two columns
     - only y given: nothing is plotted
     Extra keyword arguments are forwarded to the seaborn call.
     """
     table = self.to_dataframe(l=l, t=t, latex=True)

     if x is None:
         if y is None:
             grid = sns.pairplot(table, **kwargs)
             for axis in grid.axes.flat:
                 plt.setp(axis.xaxis.get_majorticklabels(), rotation=90)
         return

     x = self._reformat_label(x)
     if y is None:
         sns.distplot(table[x], **kwargs)
         plt.xlabel(x)
         return

     y = self._reformat_label(y)
     sns.jointplot(x=x, y=y, data=table, **kwargs)
Example #28
0
def plot_shapes(summary, save=False, condition='Condition', context='notebook'):
    """Pair-plot the scanned area/volume and mean shape columns per condition.

    The figure is saved as 'Shapes_<conditions>' when ``save`` is true and
    shown otherwise.
    """
    shape_columns = ['Scan. Area/Step', 'Scan. Vol./Step',
        'Mean Surface Area (µm2)', 'Mean Volume (µm3)', 'Mean Sphericity']
    shape_columns.append(condition)

    sns.set(style='white', context=context)
    sns.pairplot(summary[shape_columns], hue=condition, diag_kind='kde')
    plt.tight_layout()

    if not save:
        plt.show()
        return

    # File name: all condition labels (with '= ' stripped) joined by '-'
    labels = [cond.replace('= ', '') for cond in summary[condition].unique()]
    plt.savefig('Shapes_' + '-'.join(labels), dpi=300)
def visualize_hist_pairplot(X,y,selected_feature1,selected_feature2,features,diag_kind):
    """
    Plot and save the pairwise relationship of two features, coloured by class.

    Keyword arguments:
    X -- The feature vectors
    y -- The target vector, appended to X as the last column
    selected_feature1 -- First feature to plot
    selected_feature2 -- Second feature to plot
    features -- Column names for X followed by the target; the target
                column has to be called "Y" because it is used as hue
    diag_kind -- Type of plot on the diagonal, passed on to seaborn
    """
    # Features and target side by side in one frame
    frame = pd.DataFrame(data=np.column_stack((X, y)), columns=features)

    colours = sea.hls_palette()
    class_colours = {0: colours[2], 1: colours[0]}
    pair = [selected_feature1, selected_feature2]
    grid = sea.pairplot(frame, hue="Y", palette=class_colours, vars=pair, diag_kind=diag_kind)
    grid.fig.suptitle('Pairwise relationship: '+selected_feature1+" vs "+selected_feature2)
    grid.set(xticklabels=[])

    # Save the figure below the image directory
    output_dir = "img"
    target = '{}/{}_{}_hist_pairplot.png'.format(output_dir,selected_feature1,selected_feature2)
    save_fig(output_dir, target)
# See the distribution of the data: numeric columns as distribution plots,
# categorical columns as count plots (ax is the subplot grid created earlier)
sns.distplot(data['charges'],ax= ax[0,0])
sns.distplot(data['age'],ax=ax[0,1])
sns.distplot(data['bmi'],ax= ax[1,0])
sns.distplot(data['children'],ax= ax[1,1])


# Pass the column as x explicitly; newer seaborn treats the first positional
# argument as the data set, not as the x variable
sns.countplot(x=data['sex'],ax=ax[2,0])
sns.countplot(x=data['smoker'],ax= ax[2,1])
sns.countplot(x=data['region'],ax= ax[3,0])



# Visualizing skewness
sns.pairplot(data)

# Let's look at smokers vs non-smokers on age vs charges:

sns.lmplot(x="age", y="charges", hue="smoker", data=data, palette = 'muted', height = 7)
# plt.show() takes no figure/module argument; the only parameter is "block"
plt.show()

# Let's look at correlation (numeric columns only; sex, smoker and region
# are categorical and cannot be correlated directly):

corr = data.select_dtypes(include='number').corr()

sns.heatmap(corr, cmap = 'Wistia', annot= True)
plt.show()

############################################01_04_ConvertCategoricalDataintoNumbers##############################################
#option0: pandas factorizing: maps each category to a different integer = label encoder 
Example #31
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# chapter05.py

#%%
# Load the iris data set that ships with seaborn and peek at the first rows.
import seaborn as sns
iris = sns.load_dataset('iris')
iris.head()

#%%
# Pairwise scatter matrix of all columns, coloured by species.
sns.set()
sns.pairplot(iris, hue='species', height=1.5)

#%%
# Feature matrix: every column except the species label.
X_iris = iris.drop('species', axis=1)
X_iris.shape

#%%
# Target vector: the species label.
y_iris = iris['species']
y_iris.shape

#%%
# Supervised learning example: Simple linear regression
import matplotlib.pyplot as plt
import numpy as np

# 50 points around the line y = 2x - 1 with standard-normal noise;
# the fixed seed makes the sample reproducible.
plt.figure()
rng = np.random.RandomState(42)
x = 10 * rng.rand(50)
y = 2 * x - 1 + rng.randn(50)
plt.scatter(x, y)
@author: Vineeta
"""
import tkinter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Load the sales data and inspect it (the bare expressions only display
# something in an interactive session).
data=pd.read_csv("sales_data.csv",encoding='latin1')
data.shape
data.describe()
data.isnull().sum()
# Drop every column that contains at least one missing value.
data = data.dropna(axis=1)
print(data)
# Scatter SALES against each of the three candidate predictors.
sns.pairplot(data,x_vars=['QUANTITYORDERED','PRICEEACH','MSRP'], y_vars='SALES',height=4,
            aspect=1,kind='scatter')
plt.show()
import pandas as pd
from sklearn import linear_model
# Fit a linear regression of SALES on the three predictors.
X = data[['QUANTITYORDERED','PRICEEACH','MSRP']]
Y = data['SALES']
regr = linear_model.LinearRegression()
regr.fit(X, Y)
# Example input for a single prediction.
QUANTITYORDERED = 56
PRICEEACH = 93.2
MSRP=150
print ('Predicted SalesPrice: \n', regr.predict([[QUANTITYORDERED,PRICEEACH,MSRP]]))
import tkinter as tk 
# Create the Tk root window with a 500x300 canvas.
root = tkinter.Tk()
canvas1 = tk.Canvas(root, width = 500, height = 300)
canvas1.pack()
    DATA = load(fn)
    COST_MODEL, TRACE = DATA['model'], DATA['trace']

########################
# Model visualization
########################

# Plot the cost model KDE (ln x scale): density of the log of the observed
# values Y against the log of the model values Y_.
FIG2, _ = plt.subplots(1, 1, figsize=(13, 6))
plt.title(f'KDE Workstaion Cost {MODEL_PREFIX} Observation versus Model',
          fontsize=16)
kdeplot(log(Y), label='Observation')
kdeplot(log(Y_), label='Model')
plt.xlabel('Wrks Cost Ln()', fontsize=16)
plt.ylabel('Density', fontsize=16)
FIG2.savefig(f'Cost_model_KDE_{MODEL_PREFIX}_{F_BASENAME}.png')

# Plot the relationships per tier: one pairplot per entry of TIERS, using
# the log of the attribute, measure and model-cost columns with the
# (untransformed) tier column as hue.
for tier in TIERS:
    cols = list(ATT) + [MEASURE, 'model_cost']
    ppp = log(PP[cols]).copy()
    ppp = concat([ppp, PP[tier]], axis=1)
    tvrf_name = f'Visualizing relationships-{MODEL_PREFIX}-{tier}-{F_BASENAME}.png'
    pairplot(ppp, hue=tier, height=3, kind='scatter').savefig(tvrf_name)

# Overall pairplot without hue.
# NOTE(review): ppp is the frame left over from the last loop iteration, so
# it still contains that tier's column -- confirm this is intended.
pairplot(ppp, height=3, kind='scatter', diag_kind='kde').savefig(
    f'Visualizing relationships-{MODEL_PREFIX}-{F_BASENAME}.png')

# Print the summary table computed from the trace.
SUMMARY = df_summary(TRACE)
print(SUMMARY)
# In[3]:


# Descriptive analysis of the three weather columns and the consumption
s = data[['temperature','pressure', 'windspeed','electricity_consumption']]
s.describe()


# In[4]:


# Scatter plot matrix of the same four columns
sns.set(style='whitegrid', context='notebook')
cols = ['temperature', 'pressure', 'windspeed', 'electricity_consumption']
sns.pairplot(data[cols], size=2.5);
plt.tight_layout()
plt.show()


# In[5]:


# Classify by electricity consumption
def get_consumption_category(wt):
    """Return the consumption band label for a consumption value in kWh.

    The bands are half-open: ``wt < 200``, ``200 <= wt < 400`` and
    ``400 <= wt < 600``.  Values of 600 and above have no band here and
    yield ``None``.
    """
    if wt < 200:
        return "<200kWh"
    # Lower bounds are inclusive so that exactly 200 and exactly 400 fall
    # into a band instead of slipping through every branch.
    elif 200 <= wt < 400:
        return "200kWh~400kWh"
    elif 400 <= wt < 600:
        return "400kWh~600kWh"
    return None
Example #35
0
    })

# Fixed colour per school, so each school keeps its colour across plots.
school_palette = {
    "Most cited analytical": "tab:red",
    "Anti-teorethical": "tab:blue",
    "Master": "tab:green",
    "Cavell": "tab:orange",
}

#%%
# One row of three panels: author on the y axis against each ratio / r2
# column, coloured and marked by school.
g = sns.pairplot(
    df,
    x_vars=["parentheses_ratio", "dot_parentheses_ratio", "r2"],
    y_vars=["author"],
    hue="school",
    height=5,
    aspect=0.6,
    diag_kind=None,
    markers=["H", "s", "o", "D"],
    plot_kws={"s": 50},
    palette=school_palette,
)
# Replace the automatic legend with a framed one in a fixed position and order.
g._legend.remove()
g.add_legend(
    bbox_to_anchor=(0.37, 0.24),
    frameon=True,
    label_order=[
        "Cavell", "Master", "Anti-teorethical", "Most cited analytical"
    ],
)
# Drop the legend title and the y label of the first panel.
g._legend.set_title(None)
g.axes.flatten()[0].set_ylabel("")
### Import and Global vars ###

# Commonly used module
import numpy as np
import pandas as pd

# For data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessed iris data; the pairplot below uses its "class" column as hue.
iris_data = pd.read_csv("./preprocessed_iris.csv")

# In[2]:

# Pairplot with KDE diagonals, written to iris_output.png.
g = sns.pairplot(iris_data, hue="class", diag_kind="kde")
g.savefig('iris_output.png')

# In[3]:

# Load only the six columns used from the Google Play data.
google_data = pd.read_csv("./preprocessed_googleplaystore.csv",
                          usecols=[
                              "Category", "Rating", "Reviews", "Installs",
                              "Price", "Last Updated"
                          ])
# Map each distinct category to an integer code in order of first appearance.
cat_list = list(google_data["Category"].unique())
replace_list = list(range(0, len(cat_list)))
mymap = dict(zip(cat_list, replace_list))

# The mapping is applied to every cell of the frame, so a value in any
# column that equals a category name is replaced as well.
google_data = google_data.applymap(lambda s: mymap.get(s) if s in mymap else s)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Load iris dataset
# Raw string: "\d" and "\i" are not valid escape sequences, so the plain
# literal only worked by accident (and warns on current Python versions).
# The resulting path is unchanged.
iris = pd.read_csv(r"..\datasets\iris.csv")
iris['variety'] = iris['variety'].astype('category')

# EDA: column overview, class sizes and summary statistics
print(iris.info())
print(iris.groupby('variety').size())
print(iris.describe(include='all'))
# Pairwise scatter matrix, then petal length against petal width without
# regression lines, both coloured by variety
sns.pairplot(iris, hue="variety")
sns.lmplot(x='petal.length',
           y='petal.width',
           data=iris,
           hue="variety",
           fit_reg=False)
# Notebook-style inspection of the data frame `base`; the bare expressions
# only display something in an interactive session.
base.shape

base.isnull().values.any()

base

base.info()

base.describe()

# Class balance of the Outcome column.
sns.countplot(x = 'Outcome', data = base);

# Histogram of every column.
base.hist(figsize=(20,12));

# Pairwise scatter matrix of the eight feature columns, coloured by Outcome.
sns.pairplot(base, hue = 'Outcome', 
             vars = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']);

# Annotated correlation heatmap.
sns.heatmap(base.corr(), annot = True);

# Features: the first eight columns by position.
X = base.iloc[:, 0:8].values

X

# Target: the ninth column by position.
y = base.iloc[:, 8].values

y

# Standardize the features.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
"""Loading the dataset"""

# The first CSV column is used as the index.
raw_data = pd.read_csv('drive/My Drive/data.csv', index_col=0)
raw_data.head()
"""**Exploratory Data Analysis**"""

# Checking for missing values
raw_data.isnull().sum()
"""Bivariate Analysis"""

# Pairwise scatter matrix of all columns, coloured by the is_goal column.
sns.pairplot(raw_data, hue='is_goal')
plt.show()
"""Missing values treatment"""

raw_data.team_id.value_counts()

# drop columns which do not contribute to predictions
raw_data.drop('match_event_id', inplace=True, axis=1)
raw_data.drop('team_name', inplace=True, axis=1)
raw_data.drop('date_of_game', inplace=True, axis=1)
raw_data.drop('lat/lng', inplace=True, axis=1)
raw_data.drop('team_id', inplace=True, axis=1)

raw_data.shape

# Filling missing shot_id_numbers
# Handle the categorical data: the Origin column holds the categories 1, 2, 3,
# which stand for the place of origin: USA, Europe, Japan.
# Pop this column out of the data set
origin = dataset.pop('Origin')
# Write one indicator column (1.0 / 0.0) per origin
dataset['USA'] = (origin == 1) * 1.0
dataset['Europe'] = (origin == 2) * 1.0
dataset['Japan'] = (origin == 3) * 1.0
dataset.tail()

# Split into training and test sets: a reproducible 80 % sample for
# training, the remaining rows for testing
train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(train_dataset.index)

# %% Statistics
sns.pairplot(train_dataset[["Cylinders", "Displacement", "Weight", "MPG"]],
             diag_kind="kde")
# %%
# Inspect the statistics of the training inputs X (the MPG column is
# removed from the statistics because it is the label)
train_stats = train_dataset.describe()
train_stats.pop("MPG")
train_stats = train_stats.transpose()
train_stats

# Move the MPG (fuel efficiency) column out of the features as the true label Y
train_labels = train_dataset.pop('MPG')
test_labels = test_dataset.pop('MPG')

# Standardize the data
def norm(x):
    """Return x centred by train_stats['mean'] and scaled by train_stats['std']."""
    centre = train_stats['mean']
    spread = train_stats['std']
    return (x - centre) / spread
Example #41
0
# Data points, the fitted OLS values (blue) and the two bound curves
# iv_u / iv_l (red, dashed) on one axes.
ax.plot(x, y, 'o', markersize=1, label="data")
ax.plot(x, slm.fittedvalues, 'b--.', label="OLS")
ax.plot(x, iv_u, 'r--')
ax.plot(x, iv_l, 'r--')
ax.set_xlim([0, 50000])
ax.legend(loc='best')

# We do the same thing for several variables in another way, using seaborn.

# In[17]:

# One regression panel of SalePrice against each of the five predictors.
import seaborn as sns
sns.pairplot(
    csv_data,
    x_vars=['LotArea', 'GrLivArea', 'YearBuilt', 'FullBath', '2ndFlrSF'],
    y_vars='SalePrice',
    height=7,
    aspect=0.7,
    kind='reg')

# This is the same last few steps for other variables. The difference between SST and SSE is the improvement in prediction from the regression model, compared to the mean model. Dividing that difference by SST gives R-squared. It is the proportional improvement in prediction from the regression model, compared to the mean model. It indicates the goodness of fit of the model. R-squared has the useful property that its scale is intuitive: it ranges from zero to one, with zero indicating that the proposed model does not improve prediction over the mean model, and one indicating perfect prediction. Improvement in the regression model results in proportional increases in R-squared. One pitfall of R-squared is that it can only increase as predictors are added to the regression model. This increase is artificial when predictors are not actually improving the model’s fit. To remedy this, a related statistic, Adjusted R-squared, incorporates the model’s degrees of freedom. Adjusted R-squared will decrease as predictors are added if the increase in model fit does not make up for the loss of degrees of freedom. Likewise, it will increase as predictors are added if the increase in model fit is worthwhile. Adjusted R-squared should always be used with models with more than one predictor variable. It is interpreted as the proportion of total variance that is explained by the model. There are situations in which a high R-squared is not necessary or relevant. When the interest is in the relationship between variables, not in prediction, the R-square is less important. An example is a study on how religiosity affects health outcomes. A good result is a reliable relationship between religiosity and health. No one would expect that religion explains a high percentage of the variation in health, as health is affected by many other factors. 
# Even if the model accounts for other variables known to affect health, such as income and age, an R-squared in the range of 0.10 to 0.15 is reasonable.

# ## Multi-Variable Linear Regression
#
# In this part we try to fit the regression line using 6 variables. Our candidates are 'LotArea', 'GrLivArea', 'LotFrontage', '2ndFlrSF', 'YearBuilt', 'FullBath'. These are selected using their correlation with the 'SalePrice' variable. Variables with a higher correlation would probably be more suitable for the regression problem. The following code will plot the regression line for each of these variables against 'SalePrice'.
#

# In[27]:

# using statistic model for variables
x_tot = csv_data[[
    def draw_func(self):
        """Draw the figure selected by ``self.figure_type`` and show it on the canvas.

        A message is shown through ``self.show_message`` and nothing is drawn
        when ``self.all_data`` or ``self.info_data`` has no rows, when
        ``self.region_linked`` is false or when ``self.figure_able`` is false.

        Otherwise ``self.clear_func()`` and ``self.cur_slice()`` are called,
        the figure matching ``self.figure_type`` is plotted from
        ``self.cur_data`` with pyplot, the table view is swapped for
        ``self.canvas`` in ``self.figure_layout`` and ``self.figure_state`` is
        set to 2.
        """
        if len(self.all_data.index) == 0:
            self.show_message('请导入采样数据')
            return
        if len(self.info_data.index) == 0:
            self.show_message('请导入采样信息')
            return
        if not self.region_linked:
            self.show_message("请点击链接")
            return
        if not self.figure_able:
            self.show_message("数据包含非数值类型,不可画图!")
            return

        self.clear_func()
        self.cur_slice()
        # Use a font that is able to render Chinese characters.
        plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
        # Keep the minus sign displayable with that font.
        plt.rcParams['axes.unicode_minus'] = False
        plt.title(self.figure_type)
        if self.figure_type == "主成分分析":
            # The first column is used as the region label of each row, the
            # remaining columns are the features fed to the PCA.
            region_data = self.cur_data.iloc[:, 0].values.tolist()
            # Unique labels in order of first appearance, so that the legend
            # order is the same on every run.
            regions = []
            for label in region_data:
                if label not in regions:
                    regions.append(label)

            features = np.asarray(self.cur_data.iloc[:, 1:].values,
                                  dtype=float)
            centered = features - np.mean(features, axis=0)
            cov_mat = np.cov(centered, rowvar=0)

            # The covariance matrix is symmetric, so eigh applies and yields
            # real eigenpairs. The components must be ordered by eigenvalue
            # explicitly: the two plotted axes are the directions of largest
            # variance, not simply the first two columns returned.
            eig_vals, eig_vects = np.linalg.eigh(cov_mat)
            top = 2
            leading = np.argsort(eig_vals)[::-1][:top]
            projected = np.dot(centered, eig_vects[:, leading])

            x = projected[:, 0]
            y = projected[:, 1]
            for region in regions:
                index = [i for i, label in enumerate(region_data)
                         if label == region]
                plt.scatter(x[index], y[index])
            plt.legend(regions)
        elif self.figure_type == '平行坐标图':
            parallel_coordinates(self.cur_data, self.region_method)
        elif self.figure_type == "Andrews图":
            colors = ['b', 'g', 'r', 'orange']
            andrews_curves(self.cur_data, self.region_method, color=colors)
        elif self.figure_type == 'Radiv图':
            radviz(self.cur_data, self.region_method)
        elif self.figure_type == '矩阵散点图':
            print("绘制矩阵散点图")
            sns.pairplot(data=self.cur_data, hue=self.region_method)
            # pairplot creates its own figure, so the canvas is rebuilt
            # around that figure.
            f = plt.gcf()
            self.ax = f
            self.canvas = FigureCanvas(f)
        elif self.figure_type == 'Chernoff脸谱图':
            self.cur_data.to_excel('cur_data.xlsx')
            print("data out")
            # NOTE(review): PyToR.py presumably reads cur_data.xlsx and
            # writes face.png and face_info.csv, which are read below --
            # confirm against that script.
            os.system("python ./PyToR.py")
            face_info = pd.read_csv('face_info.csv')

            font = {'weight': 'normal',
                    'size': 11,
                    }

            plt.text(500, 0 , "脸谱图条目                 数据列", fontdict=font)
            for index, row in face_info.iterrows():
                # Positional access through .iloc: the row is labelled by
                # the CSV column names, not by integers.
                plt.text(500, 20 + 20 * index, row.iloc[0] + " : ",
                         fontdict=font)
                plt.text(650, 30 + 20 * index, row.iloc[1], fontdict=font)
            plt.imshow(Image.open('face.png'))
            plt.gca().add_patch(plt.Rectangle(xy=(500, 20), width=100, height=300,
                                              edgecolor=[1, 1, 1],
                                              fill=False,
                                              linewidth=2))

        # Replace the table view with the drawing canvas.
        self.table_view.setVisible(False)
        self.canvas.setVisible(True)
        self.figure_layout.removeWidget(self.table_view)
        self.figure_layout.addWidget(self.canvas)
        self.canvas.draw()
        self.figure_state = 2
Example #43
0
g = sns.catplot(x="Rated 4.4 or more", y="Reviews", data=df)
g.savefig('rated4.4ormore-reviews.png')

df = df.drop('Rated 4.4 or more', axis=1)
corr = df.corr()

# Generate a mask for the upper triangle. The builtin ``bool`` is used
# because the ``np.bool`` alias was deprecated in NumPy 1.20 and removed in
# NumPy 1.24.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
g = sns.heatmap(corr,
                cmap=cmap,
                mask=mask,
                annot=True,
                vmax=.3,
                center=0,
                square=True,
                linewidths=.5,
                cbar_kws={"shrink": .5})
g.figure.savefig('heatmap.png')

g = sns.pairplot(df)
g.savefig('pairwise.png')
Example #44
0
    mask[np.triu_indices_from(mask)] = True

    with sns.axes_style("white"):
        sns.heatmap(data=corr, mask=mask, annot=True, vmin=-1, vmax=1)


# In[78]:

plot_heatmap(df)

# In[79]:

cols = ['temp', 'atemp', 'windspeed', 'humidity']

# Pairwise regression plots with shaded KDE curves on the diagonal.
pp = sns.pairplot(df[cols],
                  kind="reg",
                  diag_kind="kde",
                  diag_kws={'shade': True})

fig = pp.fig
fig.subplots_adjust(wspace=0.3, top=0.93)
fig.suptitle('Correlação das variáveis numéricas',
             fontweight='bold',
             fontsize=14)

# In[80]:

# Horizontal box plots of the numeric columns, drawn on a 12x6 inch figure.
outlier_columns = ['temp', 'atemp', 'humidity', 'windspeed', 'count']
sns.boxplot(data=df[outlier_columns], orient='h')
fig = plt.gcf()
fig.set_size_inches(12, 6)
fig.suptitle('Análise de Outliers', fontweight='bold', fontsize=14)
Example #45
0
def print_ica_plot(comp, scaled_data):
    """Fit FastICA with *comp* components on *scaled_data* and pair-plot the result."""
    components = FastICA(n_components=comp).fit_transform(scaled_data)
    sns.pairplot(pd.DataFrame(components))

 def visualization(self):
     """
     Build the descriptive statistics table and the requested charts.

     Request parameters (as read from ``self.config``)
         "tableName": "advertising",  # str, database table name
         "X": ["TV", "radio", "newspaper"],  # list, independent variables; several variable names when the table orientation is h, the categorical field when it is v
         "Y": ["sales"],  # list, dependent variable, used when the table orientation is v
         "show_options": ["y_count", "pairs", "corr", "y_corr"], # display options
         "x_count": [], # list, independent variables whose frequency histogram is shown
         "box": [], # list, independent variables whose box plot is shown
     :return: ``{"res": [...], "code": "200", "msg": "ok!"}`` on success, or
         ``{"data": "", "code": "500", "msg": ...}`` when any exception is raised
     """
     try:
         res = []

         def add_figure(title):
             # Queue the current pyplot figure, encoded by
             # plot_and_output_base64_png, under the given title.
             res.append({
                 "title": title,
                 "base64": "{}".format(self.plot_and_output_base64_png(plt))
             })

         self.table_data = self.table_data.astype("float")
         summary = self.table_data.describe()
         summary_table = {
             "data": summary.values.tolist(),
             "title": "描述性统计分析",
             "col": summary.columns.tolist(),
             "row": summary.index.tolist()
         }
         res.append(transform_table_data_to_html(summary_table))

         histogram_columns = self.config.get("x_count")
         if histogram_columns and histogram_columns[0]:
             for x in histogram_columns:
                 sns.distplot(self.table_data[x], kde=False)
                 # Label the vertical axis
                 plt.ylabel("frequency")
                 add_figure("{} - 频率分布".format(x))

         show_options = self.config["show_options"]
         if "y_count" in show_options:
             sns.distplot(self.table_data[self.config["Y"][0]], kde=False)
             # Label the horizontal axis
             plt.xlabel("section")
             # Label the vertical axis
             plt.ylabel("frequency")
             add_figure("{} - 频率分布".format(self.config["Y"][0]))

         box_columns = self.config.get("box")
         if box_columns and box_columns[0]:
             for x in box_columns:
                 sns.boxplot(self.table_data[x], palette="Set2", orient="v")
                 add_figure("{} - 箱型图".format(x))

         if "pairs" in show_options:
             sns.pairplot(self.table_data)
             add_figure("变量两两关系图")

         if "corr" in show_options:
             corr = self.table_data.corr()
             sns.heatmap(corr,
                         xticklabels=corr.columns,
                         yticklabels=corr.columns,
                         linewidths=0.2,
                         cmap="YlGnBu",
                         annot=True)
             add_figure("相关系数图")

         if "y_corr" in show_options:
             y_correlations = self.table_data.corr()[self.config["Y"][0]]
             y_correlations.sort_values(ascending=False).plot(kind='bar')
             add_figure("因变量和各自变量的相关系数图")

         return {"res": res, "code": "200", "msg": "ok!"}
     except Exception as e:
         return {"data": "", "code": "500", "msg": "{}".format(e.args)}
Example #47
0
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

dataset = pd.read_csv('studentperformance.csv')
dataset.columns = [
    'gender', 'race', 'ped', 'lunch', 'test', 'math', 'reading', 'writing'
]

dataset.info()
dataset.describe()

pd.plotting.scatter_matrix(dataset)
sns.pairplot(dataset)

# Number of students per gender. The hue labels are taken from the counted
# index itself: value_counts() orders by frequency, so a hard-coded
# ['female', 'male'] list would mislabel the bars whenever 'male' is the
# more frequent value.
gender_counts = dataset['gender'].value_counts()
sns.barplot(x=gender_counts.index,
            y=gender_counts,
            hue=list(gender_counts.index))

# Each score per group, split by gender. x and y are passed by keyword
# because seaborn >= 0.12 no longer accepts them positionally.
for category in ('race', 'ped'):
    for score in ('math', 'reading', 'writing'):
        sns.barplot(x=dataset[category],
                    y=dataset[score],
                    hue=dataset['gender'])
target_column = "Species"

# %% [markdown]
# Let's look at the dataset in more detail.

# %%
penguins.head()

# %% [markdown]
# Since we have only a few samples, we can use a scatter plot to observe the
# distribution of the samples.

# %%
import seaborn as sns

# Pair plot of all features, coloured by the target column ("Species").
pairplot_figure = sns.pairplot(data=penguins, hue=target_column)
pairplot_figure.fig.set_size_inches((9, 6.5))

# %% [markdown]
# First let's check the feature distributions by looking at the diagonal plots
# of the pairplot. We can deduce the following intuitions:
#
# * The Adelie species can be differentiated from the Gentoo and Chinstrap
#   species depending on the culmen length;
# * The Gentoo species can be differentiated from the Adelie and Chinstrap
#   species depending on the culmen depth.
#
# ## Regression dataset
#
# In a regression setting, the target is a continuous variable instead of
# categories. Here, we use two features of the dataset to make such a problem:
def main():
    """Load photo EXIF data, export it to Excel and save the configured charts.

    Settings are read from ``PhotoDataAnalysis.ini`` in the working directory
    (``key=value`` lines; blank lines, lines starting with ``#`` and lines
    without ``=`` are ignored). The rows returned by ``GetExifData`` for every
    ``PhotoDirectory`` are cleaned up in a DataFrame and written, together
    with one pivot sheet per bar chart, to an Excel workbook in a time-stamped
    folder below ``./Data``. Bar, scatter, hexbin, pie and seaborn charts are
    saved as PNG files in the same folder when the matching ``Plot*`` switch
    is enabled.
    """
    # NOTE(review): unused below; kept in case the constructor has side
    # effects -- confirm and remove.
    photoData = PhotoData.Photo()

    lists = []
    header = []
    cols = []

    # Set initial data
    photoDirectories = []
    ignoreCameraModel = []
    fillEmptyLensModel = {
        "DSC-RX100M3": "No Lens Data",
    }  # sample
    PlotBarRotate = {}

    # Set plot initial data
    plotFixSizeX = 12
    plotFixSizeY = 8
    plotGrid = False
    plotSubPlots = False
    plotFontSize = 10
    plotRotate = 0

    # Every chart family is off unless the ini file enables it.
    plotBar = False
    plotScatter = False
    plotHexbin = False
    plotPie = False
    plotSeaborn = False

    def as_bool(text):
        # "True" in any letter case enables a switch; anything else disables.
        return text.lower() == "true"

    with open('PhotoDataAnalysis.ini', 'r') as ini_file:
        for line in ini_file:
            line = line.strip()
            if line == "" or line[0:1] == "#":
                continue
            item, separator, param = line.partition("=")
            if not separator:
                # Not a key=value line; nothing to configure.
                continue

            if item == "PhotoDirectory":
                photoDirectories.append(param)
            elif item == "IgnoreCameraModel":
                ignoreCameraModel.append(param)
            elif item == "FillEmptyLensModel":
                param1, param2 = param.split(":", 1)
                fillEmptyLensModel[param1] = param2.strip()
            elif item == "PlotFigSizeX":
                plotFixSizeX = int(param)
            elif item == "PlotFigSizeY":
                plotFixSizeY = int(param)
            elif item == "PlotGrid":
                plotGrid = as_bool(param)
            elif item == "PlotSubPlots":
                plotSubPlots = as_bool(param)
            elif item == "PlotsFontSize":
                # Overrides the default font size; stored as a number.
                plotFontSize = float(param)
            elif item == "PlotBarRotate":
                param1, param2 = param.split(":")
                PlotBarRotate[param1] = int(param2)
                if param1 == "Default":
                    plotRotate = PlotBarRotate[param1]
            elif item == "PlotBar":
                plotBar = as_bool(param)
            elif item == "PlotScatter":
                plotScatter = as_bool(param)
            elif item == "PlotHexbin":
                plotHexbin = as_bool(param)
            elif item == "PlotPie":
                plotPie = as_bool(param)
            elif item == "PlotSeaborn":
                plotSeaborn = as_bool(param)

    for banner_line in ("#", "#", "#", "# Load and analyze photo data",
                        "#", "#", "#"):
        print(banner_line)

    print("> Load photo data")
    for photoDirectory in photoDirectories:
        lists, header = GetExifData(lists=lists,
                                    ignoreCameraModel=ignoreCameraModel,
                                    fileFullPath=photoDirectory)
        if header != []:
            cols = header

    now = datetime.datetime.now()
    now_dt = now.strftime("%Y%m%d %H%M%S")

    # Directory the data is saved to (./Data is created on demand)
    dataDir = "./Data/" + now_dt
    os.makedirs(dataDir, exist_ok=True)

    # To investigate "AssertionError: 45 columns passed, passed data had 43
    # columns", print the length and first field of every row in `lists`.

    # - - -
    # Set PANDAS
    # - - -
    df = pd.DataFrame(data=lists, columns=cols)
    df = df.applymap(illegal_char_remover)

    df["Count"] = 1

    # Set Make for iPhone models (their Make may be blank)
    df["Make"] = df.apply(lambda x: "Apple"
                          if x.Model[0:6] == "iPhone" else x.Make,
                          axis=1)

    # Fill LensModel where it is empty
    df["LensModel_org"] = df["LensModel"].copy()  # Backup
    df["LensModel"] = df.apply(lambda x: "Unknown"
                               if x.LensModel == "" else x.LensModel,
                               axis=1)
    df["LensModel"] = df.apply(lambda x: fillEmptyLensModel[x.Model]
                               if x.Model in fillEmptyLensModel and x.LensModel
                               == "Unknown" else x.LensModel,
                               axis=1)

    # Fill blank numeric EXIF values with 0
    for blank_col in ("ISOSpeedRatings", "FNumber_cust", "ExposureTime_calc",
                      "ExposureTime_cust", "FocalLengthIn35mmFilm"):
        df[blank_col] = df.apply(
            lambda x, col=blank_col: 0 if x[col] == "" else x[col], axis=1)

    df["LightSource_cust"] = df.apply(
        lambda x: "Auto" if x.LightSource_cust == 0 else x.LightSource_cust,
        axis=1)

    # Add year, month, hour and weekday columns derived from DateTimeOriginal
    for date_col, extract in (("Year", GetYear), ("Month", GetMonth),
                              ("Year_Month", GetYearMonth), ("Hour", GetHour),
                              ("Week", GetWeek)):
        df[date_col] = df.apply(
            lambda x, extract=extract: extract(x.DateTimeOriginal), axis=1)

    # List Camera Model
    print("[Camera Model List]")
    pvt_cm = pd.pivot_table(df,
                            values="Count",
                            index=["Make", "Model"],
                            aggfunc=len)
    print(pvt_cm)

    # - - -
    # Create charts
    # - - -
    print("[Create Charts]")

    def by_camera_and_lens(columns):
        # For every column: one chart grouped by camera, one by camera + lens.
        return [[column, list(grouping)]
                for column in columns
                for grouping in (("Make", "Model"),
                                 ("Make", "Model", "LensModel"))]

    # Sort of bar chart: [index column, grouping column(s) or "" for none].
    # A list as the index column is not supported (it raises an error).
    plots_bar = (
        by_camera_and_lens([
            "FNumber_cust",
            "FocalLengthIn35mmFilm",
            "FocalLength_cust",
            "ShutterSpeed_calc",
            "ISOSpeedRatings",
            "Orientation_cust",
            "LightSource_cust",
            "MeteringMode_cust",
            "ApertureValue_cust",
            "BrightnessValue_cust",
            "ExposureBiasValue_cust",
            "MaxApertureValue_cust",
            "Sharpness_cust",
            "SceneCaptureType_cust",
        ]) + [
            ["Make", ""],
            ["Model", ""],
            ["LensModel", ""],
        ] + by_camera_and_lens([
            "Year",
            "Year_Month",
            "Month",
            "Hour",
            "Week",
        ]) + [
            ["Hour", "ISOSpeedRatings"],
            ["Hour", "FNumber_cust"],
            ["Hour", "ShutterSpeed_cust"],
            ["FNumber_cust", "ISOSpeedRatings"],
        ])

    # Sort of Scatter chart
    plots_scatter = [
        ["ExposureTime_calc", "FNumber_cust"],
        ["Hour", "FNumber_cust"],
        ["Hour", "ISOSpeedRatings"],
        ["ISOSpeedRatings", "FNumber_cust"],
        ["FocalLengthIn35mmFilm", "FNumber_cust"],
        ["FocalLength_cust", "FNumber_cust"],
        ["ApertureValue_cust", "FNumber_cust"],
        ["ShutterSpeed_calc", "FNumber_cust"],
        ["ShutterSpeed_calc", "ISOSpeedRatings"],
        ["ShutterSpeed_calc", "ExposureTime_calc"],
    ]

    # Sort of Pie chart
    plots_pie = [
        "Make",
        "Model",
        "LensModel",
        "FocalLengthIn35mmFilm",
        "FocalLength_cust",
        "FNumber_cust",
        "ISOSpeedRatings",
        "Year",
        "Month",
        "Week",
    ]

    # Write the data (and later the pivot tables) to an Excel workbook
    saveExcelFile = dataDir + "/Photo Data " + now_dt + ".xlsx"
    writer = pd.ExcelWriter(saveExcelFile)
    df.to_excel(writer, sheet_name=now_dt)

    # - - - - - - - - - -
    # Create the bar charts
    # - - - - - - - - - -
    if plotBar:
        for idx, clm in plots_bar:

            if clm != "":
                ds = pd.pivot_table(df,
                                    values="Count",
                                    index=idx,
                                    columns=clm,
                                    aggfunc=len)

                clm2 = " & ".join(clm) if isinstance(clm, list) else clm

                idx = ModifyName(idx)  # rename only after the pivot is built
                fn = idx + " by " + ModifyName(clm2)
                pTitle = "x: " + ModifyName(idx) + " | y: " + ModifyName(clm2)
                lgd = True
            else:
                ds = pd.pivot_table(df,
                                    values="Count",
                                    index=idx,
                                    aggfunc=len)

                idx = ModifyName(idx)  # rename only after the pivot is built
                fn = idx
                pTitle = idx
                lgd = False

            # Shorten long names
            fn = ReduceName(fn)

            # Write to EXCEL File
            ds.to_excel(writer, sheet_name=fn.replace("&", "_"))

            # Draw plot; a per-column rotation overrides the default one
            rotate = PlotBarRotate.get(idx, plotRotate)

            ds.columns.name = ""
            ds.index.name = ""
            ds.plot(kind="bar",
                    title=pTitle,
                    grid=plotGrid,
                    legend=lgd,
                    subplots=plotSubPlots,
                    fontsize=plotFontSize,
                    rot=rotate,
                    figsize=(plotFixSizeX, plotFixSizeY),
                    stacked=True)

            saveFile = dataDir + "/Pd_" + fn + ".png"
            print("> Plot:" + saveFile)
            plt.savefig(saveFile)
            plt.close()

    # - - - - - - - - - -
    # Create the scatter/hexbin charts
    # - - - - - - - - - -
    for val_x, val_y in plots_scatter:
        pTx = ModifyName(val_x)
        pTy = ModifyName(val_y)
        pTitle = str(pTx) + " vs " + str(pTy)

        # Shorten long names
        fn = ReduceName(pTitle)

        if plotScatter:
            # Scatter Chart. legend=True is the value the last bar chart
            # entry used to leave behind; it is now passed explicitly so the
            # chart no longer depends on the bar charts being enabled.
            df.plot(
                kind='scatter',
                x=val_x,
                y=val_y,
                linewidth=2,
                c="blue",
                edgecolors="blue",
                title=pTitle,
                grid=plotGrid,
                legend=True,
                subplots=plotSubPlots,
                fontsize=plotFontSize,
                figsize=(plotFixSizeX, plotFixSizeY),
                stacked=True)
            plt.xlabel(pTx)
            plt.ylabel(pTy)
            saveFile = dataDir + "/Ps_" + fn + ".png"
            print("> Plot:" + saveFile)
            plt.savefig(saveFile)
            plt.close()

        if plotHexbin:
            # Hexbin Chart
            df.plot(
                kind='hexbin',
                x=val_x,
                y=val_y,
                gridsize=30,
                marginals=False,
                cmap=cm.PuBu,
                title=pTitle,
                grid=plotGrid,
                legend=True,
                subplots=plotSubPlots,
                fontsize=plotFontSize,
                figsize=(plotFixSizeX, plotFixSizeY),
                stacked=True)

            plt.xlabel(pTx)
            plt.ylabel(pTy)
            saveFile = dataDir + "/Ph_" + fn + ".png"
            print("> Plot:" + saveFile)
            plt.savefig(saveFile)
            plt.close()

    # - - - - - - - - - -
    # Plots pie chart
    # - - - - - - - - - -
    if plotPie:
        for idx in plots_pie:
            try:
                pTitle = ModifyName(idx)
                ds = pd.pivot_table(df,
                                    values="Count",
                                    index=idx,
                                    aggfunc=len)
                ds.plot(kind="pie",
                        y="Count",
                        subplots=True,
                        title=pTitle,
                        autopct='%.1f',
                        figsize=(plotFixSizeX, plotFixSizeY),
                        counterclock=False,
                        startangle=90,
                        pctdistance=0.8)
                plt.ylabel("")

                # Shorten long names
                fn = ReduceName(ModifyName(idx))

                saveFile = dataDir + "/Pp_" + str(fn) + ".png"
                print("> Plot:" + saveFile)
                plt.axis('equal')
                plt.savefig(saveFile)
                plt.close()

            except AssertionError as err:
                print("*EXCEPTION:", err)

    # - - - - - - - - - -
    # Seaborn Chart
    # - - - - - - - - - -
    if plotSeaborn:
        # Seaborn PairPlot #1
        df_select = df.loc[:, [
            "FocalLength_cust", "FNumber_cust", "ShutterSpeed_calc",
            "ISOSpeedRatings", "ApertureValue_cust", "ExposureBiasValue_cust",
            "LensModel"
        ]]
        sns.pairplot(df_select, hue="LensModel")
        saveFile = dataDir + "/Sp_Pairplot1.png"
        plt.savefig(saveFile)
        plt.close()

        # Seaborn PairPlot #2
        df_select = df.loc[:, [
            "Hour", "FocalLength_cust", "FNumber_cust", "ShutterSpeed_calc",
            "ISOSpeedRatings", "LensModel"
        ]]
        sns.pairplot(df_select, hue="LensModel")
        saveFile = dataDir + "/Sp_Pairplot2.png"
        plt.savefig(saveFile)
        plt.close()

        # Seaborn JointPlot (using the scatter chart column pairs)
        for val_x, val_y in plots_scatter:
            pTx = ModifyName(val_x)
            pTy = ModifyName(val_y)
            pTitle = str(pTx) + " vs " + str(pTy)

            sb_kind = "hex"  # reg, kde, hex
            # Keyword arguments: recent seaborn releases reject positional
            # x / y / data.
            sns.jointplot(x=val_x, y=val_y, data=df, kind=sb_kind)

            # Shorten long names
            fn = ReduceName(pTitle)
            saveFile = dataDir + "/Sj_" + str(fn) + ".png"
            print("> Plot:" + saveFile)
            plt.savefig(saveFile)
            plt.close()

        # Seaborn HeatMap is disabled: sns.heatmap(df.corr())

    # Finally save the workbook. close() writes the file; ExcelWriter.save()
    # was removed in pandas 2.0.
    writer.close()
    print("> Saved EXCEL file: " + saveExcelFile)
Example #50
0
# Age of each car relative to the reference year 2020.
df['Current_Year'] = 2020

df['Age_of_Car'] = df['Current_Year'].sub(df['Year'])

df.head()

# The helper columns are no longer needed once the age is known.
df = df.drop(columns=['Car_Name', 'Year', 'Current_Year'])

df.head()

# One-hot encode the categorical columns, dropping the first level of each.
df = pd.get_dummies(df, drop_first=True)

df.head()

sns.pairplot(df)

corr = df.corr()
top_features = corr.index
plt.figure(figsize=(12, 6))
sns.heatmap(corr, cmap='RdYlGn', annot=True)

# Every column but the first goes into x; the first column goes into y.
x = df.iloc[:, 1:]

y = df.iloc[:, 0]

x.head()

y.head()

#Feature Im[ortance
                  color='lightgreen')
plt.show(bar)

# In[24]:

#Shohim varshmërinë e atributeve me njëra tjetrën
x = data_file['Rating'].dropna()
y = data_file['Size'].dropna()
z = data_file['Installs'][data_file.Installs != 0].dropna()
p = data_file['Reviews'][data_file.Reviews != 0].dropna()
t = data_file['Type'].dropna()
price = data_file['Price']

p = sns.pairplot(pd.DataFrame(
    list(zip(x, y, np.log(z), np.log10(p), t, price)),
    columns=['Rating', 'Size', 'Installs', 'Reviews', 'Type', 'Price']),
                 hue='Type',
                 palette="Set2")

# In[25]:

# Histogram of the values of the Rating attribute.
data_file.hist(column='Rating')
plt.title("Shpërndarja e Rating")
plt.xlabel("Vlera e Rating")
plt.ylabel("Nr. i aplikacioneve")
# Fixed upper limit for the count axis.
plt.ylim(0, 10841)

# In[26]:

# Encoding of the App attribute
Example #52
0
def pairplots(trainA):
    """Draw a seaborn pair plot of ``trainA`` with KDE diagonals and show it.

    The seaborn theme is switched to the "ticks" style (with color codes
    enabled) before plotting. Nothing is returned.
    """
    sns.set(color_codes=True, style="ticks")
    sns.pairplot(data=trainA, diag_kind='kde')
    plt.show()
    # Collect one entry per name in `columns`, read from the open `mat` file,
    # then pair-plot them. (The enclosing function header is outside this
    # view.)
    for c in columns:

        if 'mep' in c:
            # The last character of the column name is a 1-based number that
            # selects the entry of 'AmpsMclean'; the values are log-transformed.
            idx = int(c[-1]) - 1
            data = np.log(mat['AmpsMclean'][()][idx])
        #elif 'amplitude' in c:
        #    data = np.log(mat[c][()][idx])
        else:
            # NOTE(review): `idx` is only assigned in the 'mep' branch above,
            # so this reuses the value from an earlier iteration (or from code
            # outside this view) — confirm that is intended.
            data = mat[c][()][idx]
        vector.append(data)

    # Stack the collected entries row-wise and transpose, so each name in
    # `columns` becomes one DataFrame column.
    vector = np.vstack(vector)
    df = pd.DataFrame(vector.T, columns=columns)

    sns.pairplot(df, diag_kws=diag_kws, plot_kws=plot_kws)
    # Release the file handle once plotting is done.
    mat.close()

##############################################################################
# `task` is interpolated into every subject's file name below.
task = 'phastimate'
threshold_key = 'phases32'

# NOTE(review): presumably filled once per subject further down — the rest of
# the loop body is outside this view.
full_dataset = list()

# Subjects are numbered 1..9, zero-padded to three digits (e.g. "sub-001").
for i in range(9):
    sub = "sub-%03d" % (i + 1)
    filename = os.path.join(
        path, sub,
        sub + "_space-sensor_window-500_atlas-subject_band-mu_%s.mat" % (task))
    # Open the subject's file read-only through h5py.
    mat = h5py.File(filename, 'r')
    vector = []
Example #54
0
# The id and date columns are dropped before plotting.
house = house.drop(['id', 'date'], axis=1)

# **Pairplot Visualisation**
#
# Let's create some Seaborn pairplots for the features ('sqft_lot','sqft_above','price','sqft_living','bedrooms') to get a feel for how the various features are distributed vis-a-vis the price as well as the number of bedrooms

# In[ ]:

#sns.pairplot(house[['sqft_lot','sqft_above','price','sqft_living','bedrooms']], hue='bedrooms', palette='afmhot',size=1.4)

# In[ ]:

# seaborn renamed pairplot's `size` argument to `height`; current releases
# reject `size`, so the per-facet height is passed under its new name.
with sns.plotting_context("notebook", font_scale=2.5):
    g = sns.pairplot(
        house[['sqft_lot', 'sqft_above', 'price', 'sqft_living', 'bedrooms']],
        hue='bedrooms',
        palette='tab20',
        height=6)
# Hide the x tick labels on every facet of the grid.
g.set(xticklabels=[])

# From the pairplots, we seem to get the classical linear distribution of the data points, for example with price against sqft_living. This bodes well as in the latter analysis, we will implement some linear models which we will use in our Feature ranking. Let's look at the correlation heatmap:

# In[ ]:

str_list = []  # empty list to contain columns with strings (words)
# DataFrame.iteritems() was removed in pandas 2.0; items() yields the same
# (column name, Series) pairs and exists in older releases as well.
for colname, colvalue in house.items():
    # Sample the value stored under index label 1 to decide whether the
    # column holds strings.
    if isinstance(colvalue[1], str):
        str_list.append(colname)
# Get to the numeric columns by inversion
num_list = house.columns.difference(str_list)
# Create Dataframe containing only numerical features
from sklearn.model_selection import train_test_split, cross_val_score

# Removes scientific notation
np.set_printoptions(suppress=True)

# Loading data
data = pd.read_csv("Dataset.csv")
# NOTE(review): ' Pr' carries a leading space — presumably it matches the CSV
# header exactly; confirm before normalising it.
x_title = ['Tm', ' Pr', 'Th', 'Sv']
y_title = 'Idx'
x_original = data[x_title]
y_original = data[y_title]
# Half of the rows are held out for testing; the seed keeps the split stable.
x_train, x_test, y_train, y_test = train_test_split(
    x_original, y_original, test_size=0.5, random_state=1)


# Scatter plots of every feature against the index.
# seaborn renamed pairplot's `size` argument to `height`; current releases
# reject `size`.
seaborn.pairplot(data, x_vars=x_title, y_vars=y_title, height=7, aspect=1)
matplot.show()

# Plot the original data for each feature together with a fitted regression line
x_attributes = ["Tm", " Pr", "Th", "Sv"]
x_labels = ['Tempurature', 'Pressure', 'Thermal Conductivity', 'Sound Velocity']

for count, x_attr in enumerate(x_attributes):
    # pandas.Series has no reshape(); go through the underlying NumPy array to
    # get the 2-D (n_samples, 1) column that the estimator is fitted on.
    train_column = x_train[x_attr].values.reshape(-1, 1)
    test_column = x_test[x_attr].values.reshape(-1, 1)

    matplot.scatter(x_original[x_attr], y_original)
    liLSM = LinearRegression()
    liLSM.fit(train_column, y_train)
    y_predict = liLSM.predict(test_column)
    matplot.plot(test_column, y_predict, 'r')
    matplot.legend(['Predicted line','Observed data'])
    matplot.xlabel(x_labels[count])
    matplot.ylabel('Chem Index')
# %%
# Now let's compare google to itself

# x, y and data are passed by keyword: recent seaborn releases no longer
# accept them positionally, while the keyword form works on older ones too.
sns.jointplot(x='GOOG', y='GOOG', data=tech_returns, kind='scatter', color='seagreen')
# %%
# That's a perfect linear relationship, and that makes sense, since we are comparing google to google.
# %%
# Now let's check if there are relationships between different tech stocks

sns.jointplot(x='GOOG', y='MSFT', data=tech_returns, kind='scatter', color='seagreen')
# %%
# Now let's do some plots that will make it easy to compare the tech stocks on our list

tech_returns.head()
# %%
# Plain scatter pair plot of all columns (rows with missing values removed).
sns.pairplot(tech_returns.dropna())
# %%
# Same grid with regression fits on the off-diagonal panels.
sns.pairplot(tech_returns.dropna(), kind="reg")
# %%
# Regression fits plus kernel density estimates on the diagonal.
sns.pairplot(tech_returns.dropna(), kind="reg", diag_kind='kde')
# %%
# Just so we can have an idea of how to interpret these graphs:

from IPython.display import SVG
SVG(url='http://upload.wikimedia.org/wikipedia/commons/d/d4/Correlation_examples2.svg')
# %%
# The above visualizations show an interesting correlation between Google and Amazon daily returns.
# We can dig a little deeper and use a PairGrid to get a more detailed and controlled plot between those two.
# %%
returns_fig = sns.PairGrid(tech_returns.dropna())
# Scatter plots, drawn in purple, on the upper triangle of the grid.
returns_fig.map_upper(plt.scatter, color='purple')
Example #57
0
import matplotlib.pyplot as plt
# https://matplotlib.org/
# Number of rows per gender, first as a table and then as a bar chart.
df1.groupby('gender').size()
df1.groupby('gender').size().plot.bar()

# Histogram of the marks column.
plt.hist(df1['marks'])

# https://seaborn.pydata.org/index.html
import seaborn as sns
# sns.set(style="ticks", color_codes=True)
iris = sns.load_dataset("iris")
iris.head()
iris.tail()
df1.groupby('gender').size()
# Number of rows per species as a bar chart, then a pair plot of the frame.
iris.groupby('species').size().plot.bar()
sns.pairplot(data=iris)

#%%
# Load inbuilt datasets
import statsmodels.api as sm
# https://vincentarelbundock.github.io/Rdatasets/datasets.html
mtcars = sm.datasets.get_rdataset('mtcars', package='datasets')
mtcars.data.head()
mtcars.data.tail()
mtcars.data.columns

#%%
# Load from Excel/CSV and export to
data = mtcars.data
data.head(n=6)
type(data)
    print('='*45)

# Crosstab of each categorical column against the 'y' column, as a bar chart
for c in category :
    table = pd.crosstab(data[c],data['y'])
    table.plot(kind='bar')

# Correlation heatmap for the numeric variables.
# The frame is restricted to numeric/boolean columns explicitly: since
# pandas 2.0, DataFrame.corr() no longer drops non-numeric columns silently
# and raises on them instead.
corelation = data.select_dtypes(include=['number', 'bool']).corr()
# plt.subplots returns a (figure, axes) pair; unpack it and draw on that axes.
fig, ax = plt.subplots(figsize=(9, 7))
sns.heatmap(corelation, annot=True, ax=ax)


### multivariate analysis

sns.pairplot(data,hue='y',palette='coolwarm')

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

def categorical_variable(dataframe):
    """Label-encode every object-dtype column of ``dataframe`` in place.

    The module-level ``le`` encoder is refitted on each such column and the
    column is overwritten with the transformed values. The same (mutated)
    frame is returned.
    """
    is_object = (dataframe.dtypes == 'object').values
    object_columns = list(dataframe.columns[is_object])
    for column in object_columns:
        dataframe[column] = le.fit_transform(dataframe[column])
    return dataframe

# Encode the object columns of transformed_data; the frame is modified in
# place, so the return value is not needed here.
categorical_variable(transformed_data)
transformed_data.columns

# feature selection
from sklearn.model_selection import train_test_split 
Example #59
0
            df_company[df_company['kmean'] == 0][4],
            s=100,
            c='red',
            label='Cluster 1')
# Plot the remaining k-means clusters (kmean == 1 and kmean == 2).
# Every cluster was labelled 'Cluster 1'; each one now gets its own legend
# entry, continuing from the 'Cluster 1' label used for kmean == 0.
for cluster_id, colour in ((1, 'blue'), (2, 'green')):
    members = df_company[df_company['kmean'] == cluster_id]
    plt.scatter(members[2],
                members[4],
                s=100,
                c=colour,
                label='Cluster {}'.format(cluster_id + 1))

sns.pairplot(df_company, hue='kmean')

# ############################ Hierarchical Clustering

#l1 = [df['EduDegree'],df['HasChild'],df['GeoLivArea']]
#df['hc_split'] = pd.concat(, axis=1 )

# Concatenate the three categorical attributes into one string key per row.
df['hc_split'] = (df['EduDegree'].map(str) + df['HasChild'].map(str)
                  + df['GeoLivArea'].map(str))
# The column is passed as x= by keyword: in current seaborn the first
# positional parameter of countplot is `data`, so a positional column name
# collides with the data= keyword.
sns.countplot(x='hc_split', data=df)

# The source attributes and the customer id are removed from df in place.
df.drop(['EduDegree', 'HasChild', 'GeoLivArea', 'CustId'],
        inplace=True, axis=1)

# Copy of df without the combined key column.
df_hc = df.drop(['hc_split'], axis=1)
Example #60
-1
def main():
    """Load the train/test CSVs, add interaction features and run a classifier.

    Reads ``../input/train.csv`` and ``../input/test.csv``, draws a pair plot
    of selected training columns, adds three product features to both frames,
    drops the ``id`` and ``color`` columns, and passes the prepared data to
    ``run_classifier`` with the ``'rf'`` option. The returned results are
    handed to ``save_result`` together with the test ids.
    """
    df_train = pd.read_csv('../input/train.csv')
    df_test = pd.read_csv('../input/test.csv')

    sns.set()
    plot_columns = ["bone_length", "rotting_flesh", "hair_length", "has_soul", "type"]
    sns.pairplot(df_train[plot_columns], hue="type")
    # sns.plt.show()

    # Add the same three product features to the training and the test frame.
    for frame in (df_train, df_test):
        frame['hair_soul'] = frame['hair_length'] * frame['has_soul']
        frame['hair_bone'] = frame['hair_length'] * frame['bone_length']
        frame['hair_soul_bone'] = frame['hair_length'] * frame['has_soul'] * frame['bone_length']

    # Keep the test ids before the id column is dropped below.
    test_id = df_test['id']

    for column in ('id', 'color'):
        df_train.drop([column], axis=1, inplace=True)
        df_test.drop([column], axis=1, inplace=True)

    # Separate the 'type' column from the remaining training columns.
    df_train_results = df_train['type']
    df_train_data = pd.get_dummies(df_train.drop('type', axis=1))
    df_test_data = pd.get_dummies(df_test)

    test_results = run_classifier(df_train_data, df_train_results, df_test_data, 'rf')
    save_result(test_id, test_results, 'results_logistic_regression.csv')