def loadSDSSdata(folder='/Users/sammy/Google Drive/MachineLearning/AstroSDSS/',
                 filename="qso10000.csv", plot=False):
    """Load SDSS QSO data and split it into training and testing sets.

    The redshift range is rather broad, from about 0.3 to 6.

    Args:
        folder: Path prefix that is concatenated directly with ``filename``
            (so it must end with a path separator).
        filename: Name of the CSV file to read.
        plot: When true, save a scatter matrix of the first 2000 objects,
            coloured by redshift, to ``Sample.pdf``.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` from ``train_test_split``
        called with ``random_state=42``.
    """
    filename = folder + filename
    qsos = pd.read_csv(
        filename, index_col=0,
        usecols=["objid", "dered_r", "spec_z", "u_g_color",
                 "g_r_color", "r_i_color", "i_z_color", "diff_u",
                 "diff_g1", "diff_i", "diff_z"])
    qsos = qsos[(qsos["dered_r"] > -9999)
                & (qsos["g_r_color"] > -10)
                & (qsos["g_r_color"] < 10)]
    qso_redshifts = qsos["spec_z"]
    qso_features = qsos.drop("spec_z", axis=1)

    if plot:
        n_shown = 2000
        # Truncate the colour scale at z=2.5 just to keep some contrast.
        norm = mpl.colors.Normalize(vmin=min(qso_redshifts.values), vmax=2.5)
        mapper = cm.ScalarMappable(norm=norm, cmap=cm.jet_r)
        # The colour array has to cover exactly the rows that are plotted,
        # so it is sliced the same way as the features.
        pd.plotting.scatter_matrix(
            qso_features[0:n_shown], alpha=0.2, figsize=[15, 15],
            color=mapper.to_rgba(qso_redshifts.values[0:n_shown]))
        plt.savefig('Sample.pdf')
        plt.close()

    X_train, X_test, y_train, y_test = train_test_split(
        qso_features.values, qso_redshifts.values, random_state=42)
    # %-formatting prints the same text under Python 2 and Python 3.
    print("feature vector shape= %s" % (qso_features.values.shape,))
    print('Training sample shape= %s' % (X_train.shape,))
    print('Testing sample shape= %s' % (X_test.shape,))
    return X_train, X_test, y_train, y_test
def feature_m(df_all):
    """Build the feature matrix and the target vector from ``df_all``.

    Boolean-like columns are converted with ``binarize_boolean_series`` and
    the numeric columns are cast to their final dtypes.  A scatter matrix of
    the prepared frame is drawn as a side effect.

    Args:
        df_all: Data frame containing at least the columns selected below.

    Returns:
        ``(X, y)`` where ``y`` is the ``year`` column as an array and ``X``
        is an array of the remaining selected columns.
    """
    # Explicit copy: the assignments below must not be chained onto a slice
    # of df_all (avoids SettingWithCopyWarning).
    df_X = df_all[['upgraded_HD', 'upgraded_cpu', 'upgraded_memory',
                   'apple_care', 'year', 'px', 'cpu_speed', 'image_url_ct',
                   'memory', 'HD_size']].copy()

    for flag in ('apple_care', 'upgraded_HD', 'upgraded_memory',
                 'upgraded_cpu'):
        df_X[flag] = binarize_boolean_series(df_X[flag])

    for column, dtype in (('year', int), ('px', int), ('cpu_speed', float),
                          ('HD_size', float), ('memory', int)):
        df_X[column] = df_X[column].astype(dtype)

    pd.plotting.scatter_matrix(df_X, figsize=(15, 15))

    y = df_X.pop('year').values
    X = np.array(df_X)
    return X, y
def test_scatter_plot_legacy(self):
    """Both legacy ``scatter_matrix`` entry points must emit FutureWarning."""
    frame = pd.DataFrame(randn(100, 2))
    for legacy_scatter_matrix in (plotting.scatter_matrix, pd.scatter_matrix):
        with tm.assert_produces_warning(FutureWarning):
            legacy_scatter_matrix(frame)
def _scatterMatrixAct(self):
    """Show a scatter matrix of the numeric data frame in a new figure.

    Does nothing when ``getNumberDataFrame`` returns ``None``.
    """
    df = self.getNumberDataFrame()
    if df is None:
        return

    DyMatplotlib.newFig()
    # pd.scatter_matrix was removed from pandas; pd.plotting is its home.
    pd.plotting.scatter_matrix(df)
    plt.gcf().show()
def show_scatter(data, col):
    """Show a scatter matrix of the data.

    Args:
        data: Data frame to plot.
        col: Column selection passed to ``data[col]``; when falsy the whole
            frame is plotted.
    """
    frame = data[col] if col else data
    pd.plotting.scatter_matrix(frame, figsize=(10, 10))
def slide_13():
    """Plot changes in the log of selected macro-economic series.

    Reads ``MACRODATAPATH``, takes the log-difference of the ``cpi``, ``m1``,
    ``tbilrate`` and ``unemp`` columns, prints the last five rows, then draws
    an ``m1`` vs ``unemp`` scatter plot and a scatter matrix of all four.
    """
    macro = pd.read_csv(MACRODATAPATH)
    data = macro[['cpi', 'm1', 'tbilrate', 'unemp']]
    trans_data = np.log(data).diff().dropna()
    print(trans_data[-5:])

    plt.scatter(trans_data['m1'], trans_data['unemp'])
    plt.title('Changes in log %s vs. log %s' % ('m1', 'unemp'))

    pd.plotting.scatter_matrix(trans_data, diagonal='kde', color='k',
                               alpha=0.3)
def scatter_matrix_topp(sorted_frame, selected_axes, percentile=1):
    """Plot a scatter matrix of the leading ``percentile`` percent of rows.

    The selected columns are transformed with ``log(x + 1)`` before plotting.

    Arguments:
    - `sorted_frame`: data frame whose leading rows are plotted
      (presumably already sorted by the caller -- confirm at call sites)
    - `selected_axes`: the axes (columns) to include in the scatterplot matrix
    - `percentile`: percentage of the rows of `sorted_frame` to keep
    """
    # A slice bound must be an integer; "/" yields a float on Python 3.
    n_rows = int(percentile * len(sorted_frame) // 100)
    pd.plotting.scatter_matrix(
        np.log(sorted_frame[selected_axes] + 1)[:n_rows]
    )
def scoreCorrelations(preds):
    """Draw one peptide-by-allele score scatter matrix per predictor.

    Args:
        preds: Mapping whose values expose ``data`` (a data frame with
            ``peptide`` and ``allele`` columns) and ``scorekey`` (the name of
            the score column).

    Returns:
        List of the figures created, one per entry of ``preds``.
    """
    figs = []
    for p in preds:
        pred = preds[p]
        x = pred.data.pivot_table(index='peptide', columns='allele',
                                  values=pred.scorekey)
        # The size is set on the figure itself: scatter_matrix only applies
        # its own figsize when it creates the figure, not when given an axes.
        f = plt.figure(figsize=(12, 12))
        ax = f.add_subplot(111)
        pd.plotting.scatter_matrix(x, alpha=0.2, diagonal='hist', ax=ax)
        figs.append(f)
    return figs
def plot_scatter_matrix(title, tr, fig=None):
    """Draw a titled scatter matrix of selected entries of ``tr``.

    Args:
        title: Title placed above the matrix.
        tr: Mapping with keys ``'c'`` and ``'pbeta'`` (1-D sequences) and
            ``'gmm'`` and ``'gmm_p'`` (indexed as ``[:, k]``).
        fig: Optional existing figure to draw into.  Its current contents are
            cleared by pandas (with a warning) to make room for the matrix.

    Returns:
        The figure that contains the scatter matrix.
    """
    df = pandas.DataFrame({
        'cat': pandas.Series(tr['c']),
        'gmm_0': pandas.Series(tr['gmm'][:, 0]),
        'gmm_1': pandas.Series(tr['gmm'][:, 1]),
        'p': pandas.Series(tr['gmm_p'][:, 0]),
        'pbeta': pandas.Series(tr['pbeta']),
    })

    if fig is None:
        # Let pandas create the figure and hand that one back; previously an
        # unrelated, empty Figure was returned.
        axes = pandas.plotting.scatter_matrix(df)
        fig = axes[0, 0].get_figure()
    else:
        pandas.plotting.scatter_matrix(df, ax=fig.gca())

    # A figure-level title: plt.title() would only label the last subplot.
    fig.suptitle(title)
    return fig
def createMatrix(self, event):
    """Ask the user for data columns and show their scatter matrix.

    Nothing is plotted when the dialog is dismissed without OK.
    """
    # TODO Fix ugly gridlines. sns.setStyle('nogrid') failed
    dlg = GraphDialog(self.parent, "Matrix Plot Input", ("Select Data",),
                      size=(500, 300), groups=False)
    try:
        if dlg.ShowModal() != wx.ID_OK:
            return
        ds = [d[0] for d in dlg.GetName()]
    finally:
        # Release the dialog on every path, including cancel.
        dlg.Destroy()

    df = self.parent.data[ds]
    pd.plotting.scatter_matrix(df, grid=False)
    plt.show()
def performScaling(self):
    """Log-transform and standardise the customer data.

    Sets ``self.log_data``, ``self.log_samples`` and ``self.data_log_std``,
    writes ``customers_log.csv`` (only if it does not exist yet) and
    ``customers_log_std.csv``, draws a scatter matrix of the log data and
    prints the log-transformed samples.
    """
    self.log_data = pd.DataFrame(np.log(self.data),
                                 columns=self.data.columns)
    self.log_samples = pd.DataFrame(np.log(self.samples),
                                    columns=self.samples.columns)

    fname = "customers_log.csv"
    if not os.path.isfile(fname):
        self.log_data.to_csv(fname)

    scaler = preprocessing.StandardScaler()
    self.data_log_std = pd.DataFrame(scaler.fit_transform(self.log_data),
                                     columns=self.log_data.columns)
    self.data_log_std.to_csv("customers_log_std.csv")

    pd.plotting.scatter_matrix(self.log_data, alpha=0.3, figsize=(14, 8),
                               diagonal='kde')
    print(self.log_samples)
    return
def plot_feature_scatter(df_feat, df_files, write_dst=''):
    '''Plot scatter matrix for all features.

    Optionally save an Exercise-labeled version of the plot for inspection.

    Args:
        df_feat: Data frame of features to plot.
        df_files: Data frame providing the ``Exercise`` label column.
        write_dst: When non-empty, path the labeled pair grid is saved to.

    Returns:
        The array of axes created by the scatter matrix.
    '''
    # visualize features in the test set
    ax = pd.plotting.scatter_matrix(df_feat, alpha=0.2, figsize=(15, 15),
                                    diagonal='kde')

    # remove axis ticks
    for subplot in ax.ravel():
        subplot.set_yticks([])
        subplot.set_xticks([])

    if write_dst:
        # also create and save a version of this plot with points colored by
        # exercise label
        df_labeled = df_feat.join(df_files.Exercise)
        g = sns.PairGrid(df_labeled, hue="Exercise")
        g.map_upper(plt.scatter, alpha=0.2)
        g.map_diag(plt.hist)
        # A KDE on the lower triangle was tried but could not be computed.
        g.add_legend()
        g.savefig(write_dst)
        plt.close()  # don't create the plot here

    return ax
def make_scatter_plots(features_of_interest, df):
    '''Make a bivariate scatter matrix plot for the given features.

    The features of interest are typically the individual features of the
    greatest importance in our supervised learning classification model.

    INPUTS:
        features_of_interest = list of strings
        df = pandas data frame containing song feature data; needs the
             'Label' and 'B- Var(c.t.)' columns
    '''
    # Mask selecting the songs used in our model.
    good_mask = np.load('good_mask.npy')
    df = df[good_mask]

    # Remove outliers in the 'B- Var(c.t.)' feature to better see plots.
    contains_outliers = 'B- Var(c.t.)'
    z_score = (np.abs(df[contains_outliers] - df[contains_outliers].mean())
               / df[contains_outliers].std())
    df = df[z_score <= 2.3]

    df_trunc = df[features_of_interest]

    # Label data points by color.
    color_dict = {'tec': 'b', 'hip': 'r', 'cla': 'g', 'roc': 'k', 'pop': 'c'}
    color_set = np.array([color_dict[name] for name in df['Label']])

    pd.plotting.scatter_matrix(df_trunc, color=color_set)
    # Blank the labels of the current axes; passing [] rendered the text "[]".
    plt.xlabel('')
    plt.ylabel('')
def exploratory_viz(loansData):
    """Save a FICO score histogram and a scatter matrix of ``loansData``.

    Files written: ``../figs/fico_score_hist.png`` and
    ``../figs/loan_scatter_matrix.png``.
    """
    plt.figure()
    loansData['FICO.Score'].hist()
    plt.savefig('../figs/fico_score_hist.png')

    pd.plotting.scatter_matrix(loansData, alpha=0.05, figsize=(14, 14))
    plt.savefig('../figs/loan_scatter_matrix.png')
def plot_data(loansData):
    """Show a 20-bin FICO score histogram, then a scatter matrix."""
    plt.figure()
    loansData['FICO.Score'].hist(bins=20)
    plt.show()

    pd.plotting.scatter_matrix(loansData, alpha=0.05, figsize=(10, 10),
                               diagonal='hist')
    plt.show()
def scale_features(property_data, samples):
    """Log-scale the ``Price`` column of the data and of the samples.

    The inputs are left untouched; the scaling is applied to copies, so
    calling the function twice on the same frames does not log-scale twice.

    Args:
        property_data: Data frame with a ``Price`` column.
        samples: Data frame of sample rows with a ``Price`` column.

    Returns:
        ``(log_data, log_samples)`` with ``Price`` replaced by its natural
        logarithm.
    """
    # Scale the data using the natural logarithm
    log_data = property_data.copy()
    log_data['Price'] = np.log(property_data['Price'])

    # Scale the sample data using the natural logarithm
    log_samples = samples.copy()
    log_samples['Price'] = np.log(samples['Price'])

    print("\nSamples after scaling:")
    display(log_samples)

    # Produce a scatter matrix for each pair of newly-transformed features
    pd.plotting.scatter_matrix(log_data, alpha=0.3, figsize=(14, 8),
                               diagonal='kde')
    plt.show()
    return log_data, log_samples
def visualize(data):
    """Draw pair plots and a correlation heatmap for ``data``."""
    # Imported locally, as in the original, so that the plotting libraries
    # are only needed when this function is called.
    import seaborn as sns
    import matplotlib.pyplot as plt  # noqa: F401  (kept from the original)

    # scatter matrix in Seaborn
    sns.pairplot(data)

    # scatter matrix in Pandas
    pd.plotting.scatter_matrix(data, figsize=(12, 10))

    # Use a correlation matrix to visualize the correlation between all
    # numerical variables, displayed in Seaborn using a heatmap.
    correlations = data.corr()
    sns.heatmap(correlations)
def openFile(filename):
    """Read a CSV file and show a scatter matrix of columns A, C, D and B."""
    df_genes = pd.read_csv(filename)
    df_genes2 = df_genes[['A', 'C', 'D', 'B']]
    pd.plotting.scatter_matrix(df_genes2)
    plt.show()
def colored_scatter_matrix(data, colors, title, save=None):
    """Scatter matrix with parametrized colors (e.g. classes).

    Args:
        data: Data frame to plot.
        colors: Per-point colors forwarded to the scatter calls as ``c``.
        title: Title shown above the matrix.
        save: When truthy, path the figure is saved to; otherwise the figure
            is shown maximized.
    """
    print('Plot scatter matrix...')
    fig, ax = plt.subplots(figsize=(12.0, 7.5))
    # No figsize here: pandas draws into the figure that owns ``ax`` and
    # ignores its own figsize in that case.
    pd.plotting.scatter_matrix(data, diagonal='kde', ax=ax, c=colors,
                               cmap=None)
    # pandas clears the figure before adding its grid of subplots, so a
    # title set on ``ax`` would be lost; use a figure-level title instead.
    fig.suptitle(title)

    if save:
        fig.savefig(save)
    else:
        mng = plt.get_current_fig_manager()
        mng.window.showMaximized()
        plt.show()
def get_iris_dataset():
    """Walk through the iris data set: inspect, split, plot and classify.

    Prints a description of the data set, shows a scatter matrix of the
    training split and reports the predictions and test accuracy of a
    1-nearest-neighbour classifier.  Returns nothing.
    """
    iris_dataset = load_iris()

    # 1. The format of the dataset
    print("Keys of iris_dataset: \n{}".format(iris_dataset.keys()))
    print("Target names: {}".format(iris_dataset['target_names']))
    print("Feature names: \n{}".format(iris_dataset['feature_names']))
    # data -> numpy.ndarray; rows are the samples, columns are the features
    print("Type of data: {}".format(iris_dataset['data'].shape))  # (150,4)
    print("Type of target: {}".format(iris_dataset['target'].shape))  # (150,)

    # 2. Split the dataset into training set and testing set: y = f(X)
    X_train, X_test, y_train, y_test = train_test_split(
        iris_dataset['data'], iris_dataset['target'],
        test_size=0.2, random_state=0)
    print("X_train shape: {}".format(X_train.shape))
    print("y_train shape: {}".format(y_train.shape))
    print("X_test shape: {}".format(X_test.shape))
    print("y_test shape: {}".format(y_test.shape))

    # 3. Inspect the data - visualize it.
    # Convert the NumPy array into a pandas DataFrame.
    iris_dataframe = pd.DataFrame(X_train,
                                  columns=iris_dataset.feature_names)
    pd.plotting.scatter_matrix(
        iris_dataframe, c=y_train, figsize=(15, 15), marker='o',
        hist_kwds={'bins': 20}, s=60, alpha=.8, cmap=mglearn.cm3)
    plt.show()

    # The model
    from sklearn.neighbors import KNeighborsClassifier
    knn = KNeighborsClassifier(n_neighbors=1)
    # build the model on the training set
    knn.fit(X_train, y_train)

    # the prediction
    X_new = np.array([[5, 2.9, 1, 0.2]])
    prediction = knn.predict(X_new)
    print("Prediction: {}".format(prediction))
    print("Predicted target name: {}".format(
        iris_dataset['target_names'][prediction]))

    y_pred = knn.predict(X_test)
    print("Test set predictions:\n {}".format(y_pred))
    print("Test set score: {:.2f}".format(np.mean(y_pred == y_test)))
def data_analysis_and_correlation(df_education, df_gdp): """ Analysis and Correlation education data with gdp. """ print "[Data Analysis and Correlation of Education to GDP data] ==> Begin" common_countries = list(set(df_education['Country'].tolist()) & set(df_gdp['Country'].tolist())) gdp = [] total_school_time = [] men_school_time = [] women_school_time = [] for cntry in common_countries: df1 = df_education[df_education['Country'] == cntry] df2 = df_gdp[df_gdp['Country'] == cntry] if df2['GDP_'+ df1['Year'].iloc[0]].iloc[0] != '': total_school_time.append(int(df1['Total_School_Time'].iloc[0])) men_school_time.append(int(df1['Men_School_Time'].iloc[0])) women_school_time.append(int(df1['Women_School_Time'].iloc[0])) gdp.append(math.log(df2['GDP_'+ df1['Year'].iloc[0]].iloc[0])) df_edu_to_gdp = pd.DataFrame({'Total': total_school_time, 'Men': men_school_time, \ 'Women': women_school_time, 'GDP': gdp}) print df_edu_to_gdp.corr(), "\n" gdp_np_array = np.array(df_edu_to_gdp.GDP.tolist()) for col in ['Women', 'Men', 'Total']: r_val, p_val = sp(gdp_np_array, np.array(df_edu_to_gdp[col].tolist())) print "Correlation of GDP against {}:".format(col) print "Pearsons correlation coefficient: {}".format(r_val) print "2-tailed p-values: {}\n".format(p_val) # Scatter matrix plot with histogram of data plots in the diagonal pd.scatter_matrix(df_edu_to_gdp, alpha=0.05, figsize=(10, 10), diagonal='hist') plt.savefig('figures/education_to_gdp/data_education_gdp_analysis.png') plt.clf() # # ==> Conclusion / Summary # GDP Men Total Women # GDP 1.000000 0.495794 0.479050 0.497923 # Men 0.495794 1.000000 0.971663 0.942572 # Total 0.479050 0.971663 1.000000 0.977217 # Women 0.497923 0.942572 0.977217 1.000000 # print """
def scattermatrix(tables):
    """Draw a scatter matrix of the ``MEDIAN`` column of several tables.

    Args:
        tables: Sequence of items where ``item[0]`` is the label used as
            column name and ``item[1]`` is a data frame with a ``MEDIAN``
            column.

    Returns:
        The figure containing the scatter matrix.
    """
    index = common_index(tables)
    data = pd.DataFrame(index=index)
    for i in tables:
        # .ix was removed from pandas; .loc is the label-based equivalent.
        data[i[0]] = i[1].loc[index, 'MEDIAN']

    axs = pd.plotting.scatter_matrix(data, alpha=0.2, figsize=(8, 8),
                                     diagonal='none', marker='.')
    # Style and return the figure pandas actually drew into; a separately
    # created figure would stay empty.
    fig = axs[0, 0].get_figure()
    fig.set_facecolor('white')
    fig.set_frameon(False)

    for ax in axs[:, 0]:
        # A boolean is required: the string 'off' is truthy.
        ax.grid(False, axis='both')
        ax.set_ylabel(wrap(ax.get_ylabel()), rotation=0, va='center',
                      labelpad=30)
        ax.set_yticks([])

    for ax in axs[-1, :]:
        ax.grid(False, axis='both')
        ax.set_xlabel(wrap(ax.get_xlabel()), rotation=90)
        ax.set_xticks([])

    return fig
def _doplot(self, data, ax, kind, subplots, kwargs):
    """Do core plotting.

    Args:
        data: Data frame to plot.
        ax: Axes to draw into.
        kind: Name of the plot type; unrecognised kinds fall through to
            ``data.plot``.
        subplots: ``0`` disables the subplot layout.
        kwargs: Dict of plot options forwarded to the plotting call; it is
            modified in place (``'subplots'`` may be overwritten).

    Returns:
        Whatever the selected plotting call returns, or ``None`` when a bar
        plot is refused because there are more than 400 rows.
    """
    cols = data.columns
    rows = int(round(np.sqrt(len(data.columns)), 0))
    if len(data.columns) == 1:
        kwargs['subplots'] = 0
    if kind == 'pie':
        kwargs['subplots'] = True
    if subplots == 0:
        layout = None
    else:
        layout = (rows, -1)

    if kind == 'bar':
        if len(data) > 50:
            ax.get_xaxis().set_visible(False)
        if len(data) > 400:
            print('too many bars to plot')
            return

    if kind == 'scatter':
        axs = self.scatter(data, ax, **kwargs)
        if kwargs['sharey'] == 1:
            lims = self.fig.axes[0].get_ylim()
            for a in self.fig.axes:
                a.set_ylim(lims)
    elif kind == 'boxplot':
        # boxplot won't accept the full kwargs, so pass the supported ones.
        axs = data.boxplot(ax=ax, rot=kwargs['rot'], grid=kwargs['grid'])
        if kwargs['logy'] == 1:
            ax.set_yscale('log')
    elif kind == 'histogram':
        # Fails early if 'bins' is missing or not integer-like.
        bins = int(kwargs['bins'])
        axs = data.plot(kind='hist', layout=layout, ax=ax, **kwargs)
    elif kind == 'heatmap':
        axs = self.heatmap(data, ax, kwargs)
    elif kind == 'bootstrap':
        axs = plotting.bootstrap_plot(data)
    elif kind == 'scatter_matrix':
        # pd.scatter_matrix was removed from pandas.
        axs = pd.plotting.scatter_matrix(data, ax=ax, **kwargs)
    elif kind == 'hexbin':
        x = cols[0]
        y = cols[1]
        axs = data.plot(x, y, ax=ax, kind='hexbin', gridsize=20, **kwargs)
    else:
        axs = data.plot(ax=ax, layout=layout, **kwargs)
    return axs
def realiseData():
    """Read the CSV at ``csvPath`` and show a scatter matrix of it."""
    data = pd.read_csv(csvPath)
    pd.plotting.scatter_matrix(data)
    plt.show()
from sklearn.cluster import DBSCAN
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# DBSCAN notes:
#   core object: a point whose density reaches the threshold (minPts)
#   neighbourhood threshold (r)
#   nicknamed the "pyramid-scheme" algorithm

colors = np.array(['red', 'green', 'blue', 'yellow'])

# Read the data
beer = pd.read_csv('./data/data.txt', sep=' ')
X = beer[["calories", "sodium", "alcohol", "cost"]]

# dbscan
db = DBSCAN(eps=10, min_samples=2).fit(X)
beer['cluster_db'] = db.labels_
beer.groupby('cluster_db').mean()

# pd.scatter_matrix was removed from pandas; use pd.plotting instead.
pd.plotting.scatter_matrix(X, c=colors[beer.cluster_db], figsize=(10, 10),
                           s=100)
plt.show()
df.groupby('species').agg(np.mean)
df.groupby('species').agg([np.min, np.max])
df.groupby('species').describe()

# explore data by sorting, looking for differences between species
# (sort_index(by=...) was removed from pandas; sort_values replaces it)
df.sort_values(by='sepal_length').values
df.sort_values(by='sepal_width').values
df.sort_values(by='petal_length').values
df.sort_values(by='petal_width').values

# explore data visually, looking for differences between species
df.petal_width.hist(by=species, sharex=True)
df.boxplot(column='petal_width', by='species')
df.boxplot(by='species')
df.plot(x='petal_length', y='petal_width', kind='scatter', c=iris.target)
pd.plotting.scatter_matrix(df, c=iris.target)

## PART 2: Write a function to predict the species for each observation

# create a dictionary so we can reference columns by name
col_ix = {col: index for index, col in enumerate(df.columns)}


# define function that takes in a row of data and returns a predicted species
def classify_iris(data):
    """Predict the species of one row from petal length and petal width."""
    if data[col_ix['petal_length']] < 3:
        return 'setosa'
    elif data[col_ix['petal_width']] < 1.8:
        return 'versicolor'
    else:
        return 'virginica'
scaled_df.describe() # In[21]: # Correlation matrix scaled_df.corr() # In[22]: # Correlation plots pd.scatter_matrix(scaled_df, figsize=(22,22)) plt.show() # In[23]: # Correlation heatmap sns.set(rc={'figure.figsize':(80,10)}) corr = scaled_df.corr() ax = sns.heatmap( corr, vmin=-1, vmax=1, center=0, cmap=sns.diverging_palette(20, 220, n=200), square=True
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from sklearn import linear_model
import matplotlib.pyplot as plt

wine = pd.read_csv('/Users/Shared/py/winequality-red.csv', sep=';')

# Fit a linear regression of quality on all other columns.
clf = linear_model.LinearRegression()
X = wine.drop(['quality'], axis=1)
Y = wine['quality']
clf.fit(X, Y)

print(clf.coef_)
print(clf.intercept_)
print(
    pd.DataFrame({
        "Name": X.columns,
        "Coefficients": clf.coef_
    }).sort_values(by='Coefficients'))

plt.matshow(wine.corr())
# pd.scatter_matrix was removed from pandas; use pd.plotting instead.
pd.plotting.scatter_matrix(wine)
# NOTE(review): X holds several columns while Y is a single one, so this
# call looks like it passes differently sized inputs -- confirm the intent.
plt.scatter(X, Y)
lookup_fruit_name = dict( zip(fruits.fruit_label.unique(), fruits.fruit_name.unique())) lookup_fruit_name #split the data in test and traing with the target variable fruit_label,random_state like seed in R X = fruits[['height', 'width', 'mass', 'color_score']] y = fruits['fruit_label'] X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) #visualize data as pairs scatterplot of al independent variable relation with target from matplotlib import cm cmap = cm.get_cmap('gnuplot') scatter = pd.scatter_matrix(X_train, c=y_train, marker='o', s=40, hist_kwds={'bins': 15}, figsize=(9, 9), cmap=cmap) #visualize in 3d # plotting a 3D scatter plot from mpl_toolkits.mplot3d import Axes3D fig = plt.figure() ax = fig.add_subplot(111, projection='3d') ax.scatter(X_train['width'], X_train['height'], X_train['color_score'], c=y_train, marker='o',
# Several equivalent ways of replacing spaces in the column names.
ufo_cols = ufo.columns.tolist()
ufo_cols = [names.replace(' ', '_') for names in ufo.columns.tolist()]
ufo_cols2 = [names.replace(' ', '_') for names in ufo.columns]
ufo.columns = ufo.columns.str.replace(' ', '_')
ufo.columns = ufo_cols

# Bracket notation is used to create the new column
# (the attribute form "ufo.Location = ..." was left disabled).
ufo['Location'] = ufo.City + ', ' + ufo.State

users = pd.read_table('u.user', sep='|', index_col='user_id')
users.groupby('occupation').count()
users.occupation.value_counts()
users.groupby('occupation').age.mean()
users.groupby('occupation').age.agg(['min', 'max'])
users.groupby(['occupation', 'gender']).age.mean()
users.groupby(['occupation', 'gender']).age.agg(['mean', 'count'])

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (10, 8)

# DataFrame.sort was removed from pandas; sort_values replaces it.
drinks[['beer', 'wine']].sort_values('beer').values
drinks.plot(kind='scatter', x='beer', y='wine', alpha=.3)
plt.xlabel('Beer')
plt.ylabel('Wine')
pd.plotting.scatter_matrix(drinks[['beer', 'spirit', 'wine']],
                           figsize=(10, 8))

plt.style.use('ggplot')
drinks.continent.value_counts().plot(kind='bar')
drinks.groupby('continent').mean().plot(kind='bar', figsize=(10, 8))
drinks.groupby('continent').mean().drop('liters', axis=1).plot(kind='bar')
drinks.groupby('continent').mean().drop('liters', axis=1).plot(kind='bar', stacked=True)
def scatterplot(data, title=None, color=None):
    """Show a scatter matrix of ``data`` with KDE plots on the diagonal.

    Args:
        data: Data frame to plot.
        title: Optional figure-level title.
        color: Optional color forwarded to the scatter calls.
    """
    pd.plotting.scatter_matrix(data, alpha=0.3, diagonal='kde', color=color)
    if title is not None:
        plt.suptitle(title)
    plt.show()
# Plot the data (similar to before) plt.plot(x_prime, y_hat, 'r', linewidth=2, alpha=0.9) """ COMMON PROBLEMS - Multicollinearity """ # Now let's run a multiple linear regression # The temp variable is no longer significant. Why? Multicollinearity est_m = smf.ols(formula='cnt ~ atemp + temp + workingday + windspeed', data=bike_dat).fit() est_m.summary() # Scatter plot (observe the (unsurprising) correlation between atemp and temp) cols = ['cnt', 'atemp', 'windspeed', 'weathersit', 'temp', 'workingday', 'hum'] pd.scatter_matrix(bike_dat[cols]) # Correlation coefficient matrix corr_matrix = np.corrcoef(bike_dat[cols].T) sm.graphics.plot_corr(corr_matrix, xnames=cols) # Let's say we wanted to include an interaction term # We would do this by including the ':' between interacting variables est_m = smf.ols(formula='cnt ~ temp + windspeed + temp:windspeed + workingday', data=bike_dat).fit() est_m.summary() # An alternate way of specifying interaction terms # a*b is equivalent to a + b + a:b est_m = smf.ols(formula='cnt ~ temp*windspeed + workingday',
wiki_data = wiki_data.set_index('Date')
wiki_data.index = wiki_data.index.map(parse)
wiki_data['changes'] = wiki_data['changes'].astype(int)

death_data = pd.read_csv('CausesOfDeath_France_2001-2008.csv')
death_data['Value'] = death_data['Value'].str.replace(' ', '')
# Every remaining non-digit character is replaced by '0' before converting.
non_digit = re.compile(r'[^0-9]')
death_data['Value'] = death_data['Value'].apply(
    lambda x: int(non_digit.sub('0', x)))
death_data = death_data[['ICD10', 'Value', 'SEX', 'TIME']]

# The five causes with the largest total value
# (Series.order was removed from pandas; sort_values replaces it).
causes = (death_data.groupby('ICD10')['Value'].sum()
          .sort_values(ascending=False)[0:5].index.values)
filtered = death_data[death_data['ICD10'].isin(causes)]
filtered_agg = filtered.groupby(['ICD10', 'TIME']).sum()

# One pivot, several views of it (keyword arguments are required by
# current pandas).
by_time = filtered_agg.reset_index().pivot(index='TIME', columns='ICD10',
                                           values='Value')
by_time.plot()
by_time.plot(kind="bar")
by_time.plot(kind="barh")
by_time.plot(kind="barh", stacked=True)

cars = pd.read_csv('cars.csv', sep=';', index_col=0).drop('STRING')
for column in ('MPG', 'Cylinders', 'Weight', 'Acceleration', 'Horsepower'):
    cars[column] = cars[column].astype(float)
pd.plotting.scatter_matrix(cars, diagonal='kde', color='k', alpha=0.3)
from pyspark.sql import SparkSession
import pandas as pd

spark = SparkSession.builder.appName('ml-bank').getOrCreate()
df = spark.read.csv('bank.csv', header=True, inferSchema=True)
df.printSchema()

pd.DataFrame(df.take(5), columns=df.columns).transpose()

numeric_features = [t[0] for t in df.dtypes if t[1] == 'int']
print(df.select(numeric_features).describe().toPandas().transpose())

numeric_data = df.select(numeric_features).toPandas()
# pd.scatter_matrix was removed from pandas; use pd.plotting instead.
axs = pd.plotting.scatter_matrix(numeric_data, figsize=(8, 8))

# Tidy the outer labels: horizontal on the left column, vertical on the
# bottom row, and no ticks on either.
for v in axs[:, 0]:
    v.yaxis.label.set_rotation(0)
    v.yaxis.label.set_ha('right')
    v.set_yticks(())
for h in axs[-1, :]:
    h.xaxis.label.set_rotation(90)
    h.set_xticks(())

df = df.select('age', 'job', 'marital', 'education', 'default', 'balance',
               'housing', 'loan', 'contact', 'duration', 'campaign', 'pdays',
               'previous', 'poutcome', 'deposit')
cols = df.columns
# printSchema() prints by itself and returns None; wrapping it in print()
# only added a stray "None" line.
df.printSchema()
dataset['quality'].unique()  # 3-9
dataset.head()
dataset.tail()

# To find the statistical summary
dataset.describe()

# Univariate Analysis
dataset.hist()

# Multivariate Analysis
# (pandas.tools.plotting and pd.scatter_matrix were removed from pandas)
from pandas.plotting import scatter_matrix
scatter_matrix(dataset)

# Group the dependent variable and independent variables
array = dataset.values
X = array[:, 0:11]
Y = array[:, 11]

# Splitting the dataset into training set and test set
# (sklearn.cross_validation was replaced by sklearn.model_selection)
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.20, random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
iris.petal_width.hist(by=iris.species, sharex=True) iris.boxplot(column='petal_width', by='species') iris.boxplot(by='species') # map species to a numeric value so that plots can be colored by category iris['species_num'] = iris.species.map({ 'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2 }) iris.plot(kind='scatter', x='petal_length', y='petal_width', c='species_num', colormap='Blues') pd.scatter_matrix(iris, c=iris.species_num) ## TASK 4 # If petal length is less than 3, predict setosa. # Else if petal width is less than 1.8, predict versicolor. # Otherwise predict virginica. ## BONUS # define function that accepts a row of data and returns a predicted species def classify_iris(row): if row[2] < 3: # petal_length return 0 # setosa elif row[3] < 1.8: # petal_width
centers = beer.groupby("cluster3").mean().reset_index()
print(centers)

# Visualise the clustering result (k=3)
# (scatter_matrix is no longer importable from the top-level pandas module)
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import numpy as np

plt.rcParams['font.size'] = 14
colors = np.array(['red', 'green', 'blue', 'yellow'])

plt.scatter(beer["calories"], beer["alcohol"], c=colors[beer["cluster3"]])
plt.scatter(centers.calories, centers.alcohol, linewidths=3, marker='+',
            s=300, c='black')
plt.xlabel("Calories")
plt.ylabel("Alcohol")
plt.show()

scatter_matrix(beer[["calories", "sodium", "alcohol", "cost"]], s=100,
               alpha=1, c=colors[beer["cluster3"]], figsize=(10, 10))
plt.suptitle("With 3 centroids initialized")
plt.show()
print("X_test shape: {}".format(X_test.shape))
print("y_test shape: {}".format(y_test.shape))

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import mglearn  # pip install mglearn

# create dataframe from data in X_train
# label the columns using the strings in iris_dataset.feature_names
iris_dataframe = pd.DataFrame(X_train, columns=iris_dataset.feature_names)
# create a scatter matrix from the dataframe, color by y_train
# (pd.scatter_matrix was removed from pandas; use pd.plotting instead)
grr = pd.plotting.scatter_matrix(
    iris_dataframe, c=y_train, figsize=(15, 15), marker='o',
    hist_kwds={'bins': 20}, s=60, alpha=.8, cmap=mglearn.cm3)

# Building Your First Model: k-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)

# Making Predictions
X_new = np.array([[5, 2.9, 1, 0.2]])
print("X_new.shape: {}".format(X_new.shape))
sv = df.groupby(['Survived', 'Pclass', 'Sex'])['Name'].count()
sv.unstack().plot.bar()
plt.savefig('bars_gruppen.png')


# 5. Pair plot
def make_col(x):
    """Colour by survival: red for 0, blue otherwise."""
    if x == 0:
        return (1, 0, 0)  # red
    return (0, 0, 1)  # blue


col = df['Survived'].apply(make_col)
# pd.scatter_matrix was removed from pandas; use pd.plotting instead.
pd.plotting.scatter_matrix(df, c=col, figsize=(15, 15))
plt.savefig('paarplot.png')

# 7. Data preparation
del df['Cabin']
del df['Name']
df = df.dropna()
X = df[['Pclass', 'Age']].values
y = df['Survived'].values

# 8. Build the model
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=42)
m = KNeighborsClassifier(n_neighbors=1)
ax1.scatter(hollywood_movies["Profitability"],
            hollywood_movies["Audience Rating"])
ax1.set_xlabel("Profitability")
ax1.set_ylabel("Audience Rating")
ax1.set_title("Hollywood Movies, 2017-2011")

ax2.scatter(hollywood_movies["Audience Rating"],
            hollywood_movies["Profitability"])
ax2.set_xlabel("Audience Rating")
ax2.set_ylabel("Profitability")
ax2.set_title("Hollywood Movies, 2017-2011")
plt.show()

## 3. Scatter matrix - profitability and critic ratings ##

normal_movies = hollywood_movies[hollywood_movies["Film"] != "Paranormal Activity"]
filtered_movies = normal_movies[["Profitability", "Audience Rating"]]
# pd.scatter_matrix was removed from pandas; use pd.plotting instead.
pd.plotting.scatter_matrix(filtered_movies, figsize=(6, 6))
plt.show()

## 4. Box plot - audience and critic ratings ##

normal_movies.boxplot(column=["Critic Rating", "Audience Rating"])

## 5. Box plot - critic vs audience ratings per year ##

normal_movies = normal_movies.sort_values("Year")
fig = plt.figure(figsize=(8, 4))
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2)
# Only rows with a known genre are plotted.
with_genre = normal_movies[pd.notnull(normal_movies["Genre"])]
sns.boxplot(data=with_genre, x="Year", y="Critic Rating", ax=ax1)
sns.boxplot(data=with_genre, x="Year", y="Audience Rating", ax=ax2)
'640', '', '', '', '660', '', '', '', '680', '', '', '', '700', '720', '', '', '', '740', '', '', '', '760', '', '', '', '780', '', '', '', '800', '', '', '', '820', '', '', '', '840' ]) q0 = p.set_xlabel('FICO Score') q1 = p.set_ylabel('Interest Rate %') q2 = p.set_title('Lending Rate Plot') #Create a new data frame with selected columns for analysing data loansmin = loansdata.filter([ 'Interest.Rate', 'FICO.Score', 'Loan.Length', 'Monthly.Income', 'Amount.Requested' ], axis=1) a = pd.scatter_matrix(loansmin, alpha=0.05, figsize=(10, 10), diagonal='hist') # a = pd.scatter_matrix(loansmin,alpha=0.05,figsize=(10, 10), diagonal='kde') # a = pd.scatter_matrix(loansmin,alpha=0.05,figsize=(8, 8), diagonal='kde') # a = pd.scatter_matrix(loansmin,alpha=0.05,figsize=(12, 12), diagonal='kde') interest_rate = loansmin['Interest.Rate'] loan_amount = loansmin['Amount.Requested'] fico_score = loansmin['FICO.Score'] y = np.matrix(interest_rate).transpose() x1 = np.matrix(fico_score).transpose() x2 = np.matrix(loan_amount).transpose() x = np.column_stack([x1, x2]) X = sm.add_constant(x)
# # plt.xticks(np.arange(len(frame)), values) # plt.legend((nonsurv_bar[0], surv_bar[0]),('Did not survive', 'Survived'), framealpha = 0.8) # ## Common attributes for plot formatting #plt.xlabel(key) #plt.ylabel('Number of Passengers') #plt.title('Passenger Survival Statistics With \'%s\' Feature'%(key)) #plt.show() # Then look at correlations # This will also be quite problem-specific since mixture of variables are tricky # In principle I'd like to see some joint stats pd.scatter_matrix(data_trn, alpha=0.3, figsize=(5,6), diagonal='kde'); # In case of mixed data this really doesn't give you a good sense of relationships # I guess you might split into continuous and categorical, but still how about the relationship between continuous and categorical? # Note: L-shaped pairs of variables: if you sum or take the product you get stuff that is more constant or maybe linear, maybe it tells you something # You have all kind of 'garbage' continuous with categorical or binary and # all combos of those # Maybe you can try to see a pair and the class clr = ['r', 'b', 'y', 'm', 'c', 'k'] col_i = 'SibSp' col_j = 'Parch' # Adding some random noise to distinguish the dots Z = DataFrame(np.random.rand(nTrn,2), index=data_trn.index) dxy = 0.45 for j in range(len(set(y_trn))): ix = y_trn==j
drinks.plot(kind='scatter', x='beer_servings', y='wine_servings', alpha=0.3)

# same scatterplot, except point color varies by 'spirit_servings'
# note: must use 'c=drinks.spirit_servings' prior to pandas 0.15.0
drinks.plot(kind='scatter', x='beer_servings', y='wine_servings',
            c='spirit_servings', colormap='Blues')

# same scatterplot, except all European countries are colored red
colors = np.where(drinks.continent == 'EU', 'r', 'b')
drinks.plot(x='beer_servings', y='wine_servings', kind='scatter', c=colors)

# scatterplot matrix of all numerical columns
# (pd.scatter_matrix was removed from pandas; use pd.plotting instead)
pd.plotting.scatter_matrix(drinks)

'''
Advanced Filtering (of rows) and Selecting (of columns)
'''

# loc: filter rows by LABEL, and select columns by LABEL
users.loc[1]                           # row with label 1
users.loc[1:3]                         # rows with labels 1 through 3
users.loc[1:3, 'age':'occupation']     # rows 1-3, columns 'age' through 'occupation'
users.loc[:, 'age':'occupation']       # all rows, columns 'age' through 'occupation'
users.loc[[1, 3], ['age', 'gender']]   # rows 1 and 3, columns 'age' and 'gender'

# iloc: filter rows by POSITION, and select columns by POSITION
# performance in Paris pres[pres.dep=="PARIS"] ''' VISUALIZATION ''' pres.ump.plot(kind='hist', bins=20) pres.ps.plot(kind='hist', bins=20) pres.fn.plot(kind='hist', bins=20) pres[['ump', 'ps']].sort('ump').values pres.plot(kind='scatter', x='ps', y='ump') # fits hypothesis: higher UMP votes, lower PS votes pres.plot(kind='scatter', x='ump', y='fn') # line not as evident; but votes may have been interchangeable # demonstration of vote distribution relationships between binomes pd.scatter_matrix(pres[['ump', 'ps', 'fn']], figsize=(10, 8)) pres[['ump', 'ps', 'fn']].plot(kind='hist', stacked=True) # testing hypothesis of voters "so far on the left they come out on the (far) right" pd.scatter_matrix(pres[['fn', 'ug1', 'ug2']], figsize=(10, 8)) # ^^ it works! pd.scatter_matrix(pres[['fn', 'ug2', 'ug3']], figsize=(10, 8)) ''' Data source: http://data.gouv.fr Data desc:
print("acc_train = {}, acc_test ={}".format(acc_train, acc_test)) print("Confusion Matrix:\n{}\n\n {} \n".format(CML, CM)) print("f1_train = {}, f1_test ={}".format(f1_train, f1_test)) print("fbeta_train = {}, fbeta_test ={}".format(fb_train, fb_test)) print("ROC_AUC_train = {}, ROC_AUC_test ={}".format( roc_auc_train, roc_auc_test)) ############################################# Initial Visual Tests ##################################################### ########## ScatterMatrixPlot ########## if False: #Transformed features pd.scatter_matrix(biochemistry_data, alpha=0.3, figsize=(16, 8), diagonal='kde') plt.show() if False: from sklearn.cluster import KMeans from sklearn.metrics import silhouette_score from sklearn.decomposition import PCA ndims = 2 dim_labels = [] for i in range(1, ndims + 1): dim_labels.append("Dimension {}".format(i))
fileName = r'../dataSet/Auto.csv'
df = pd.read_csv(fileName)
# With errors='coerce', values that cannot be parsed are set to NaN.
df_numeric = df.apply(pd.to_numeric, errors='coerce')

# Keep only the rows in which every predictor is available.
predictors = ['cylinders', 'displacement', 'horsepower', 'weight',
              'acceleration', 'year', 'origin']
mask = df_numeric[predictors].notnull().all(axis=1).values
X_raw = df_numeric[predictors][mask]
y = df_numeric['mpg'][mask]
X = sm.add_constant(X_raw)
est = sm.OLS(y, X).fit()

print('Exercise 9 Answer:')
print('(a) see figure 1')
# pd.scatter_matrix was removed from pandas; use pd.plotting instead.
pd.plotting.scatter_matrix(df, alpha=0.5)
print('(b) ')
correlations = np.corrcoef(df_numeric.loc[:, 'mpg':'origin'][mask], rowvar=0)
print('(c)')
print(est.summary())
print('(c) i. The null-hypersis of all the regression coefficients are zero can be reject by large F-statistic with very small P-value.')
print('(c) ii. From P-value of each predictor, all predictor has statistically significant relationship to the response except cylinders, horsepower and acceleration.')
print('(c) iii. The coefficient of year show positive relationship. And increase of 1 year gain 0.7508 increase of mpg. It\'s means cars become more fuel efficient by year.')
print('(d) see figure 2.')
plt.figure(2)
# R plot for lm object will generate 6 plots: residuals against fitted values,
# sqrt(|residuals|) against fitted values, Normal Q-Q plot, Cook's distances
# versus row labels, residuals against leverages, and Cook's distances against
# leverage. By default, the first 3 and 5 are provided.
# we plot default by python
# residuals vs fitted values
# Task 2
#
# Get an overview of the values in
# the columns *Art* and *Status*.
print("\nArten von Schiffen:")
print(df['Art'].value_counts())
print("\nStatus von Schiffen:")
print(df['Status'].value_counts())

# Task 3
#
# Look for possible correlations.
# (pd.scatter_matrix was removed from pandas; use pd.plotting instead)
pd.plotting.scatter_matrix(df)
plt.savefig('matrix.png')

# Task 4
#
# Plot length against height as a scatter plot.
df.plot.scatter('Länge', 'Höhe')
plt.savefig('scatter.png')

# Task 5
#
# One of the entries contains a **data error**.
print("\nEintrag mit Datenfehler:")
# .ix was removed from pandas; .loc is the label-based equivalent.
print(df.loc['HMS Hood'].transpose())
import scipy
import numpy
import matplotlib
# `import matplotlib` alone does not load the pyplot submodule used below.
import matplotlib.pyplot
import pandas
import sklearn

# Load dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
dataset = pandas.read_csv(url, names=names)

# head
print(dataset.head(20))

# descriptions
print(dataset.describe())

# box-and-whisker plot per column
dataset.plot(kind='box', subplots=True, layout=(2, 2), sharex=False, sharey=False)
matplotlib.pyplot.show()

# histogram per column
dataset.hist()
matplotlib.pyplot.show()

# scatter plot matrix
# pandas.scatter_matrix was removed from the top-level namespace.
pandas.plotting.scatter_matrix(dataset)
matplotlib.pyplot.show()
# NOTE(review): the next two statements look like the tail of a loop body whose
# header lies outside this excerpt -- confirm their indentation when merging.
# r2_score expects (y_true, y_pred); the original passed the arguments reversed,
# which yields a different (wrong) score because R^2 is not symmetric.
score = r2_score(y_test, pred)
scores.append(score)

# calculate mean of all 1000 scores
score = np.mean(scores)
# Single-argument print() with format() behaves the same on Python 2 and 3.
print("\nR^2 score for predicting Milk is:  {}".format(score))

# OBSERVATION
# A low r^2 value indicates that it cannot be predicted with too much accuracy
# using all the features we have. However, since the value is positive, there
# must be some features which can predict its value to a higher accuracy and
# hence it fits the data. So we should keep this feature for identifying
# customer habits.

##################################################################################################################

''' VISUALIZATION OF FEATURE DATA'''
# visualize data with the diagonal showing the data distribution
# pd.scatter_matrix was removed from the top-level namespace; use pd.plotting.
pd.plotting.scatter_matrix(data, alpha=0.3, figsize=(14, 8), diagonal='kde')
#plt.show()

'''FEATURE SCALING USING LOG'''
# Scale the data using the natural logarithm
log_data = np.log(data)

# Scale the sample data using the natural logarithm
log_samples = np.log(samples)

# Produce a scatter matrix for each pair of newly-transformed features
pd.plotting.scatter_matrix(log_data, alpha=0.3, figsize=(14, 8), diagonal='kde')
plt.show()

print("\nScaled sampled data:\n")
print(log_samples)
# Scatter plots
# Both path pieces are raw strings: '\m' in a normal literal is an invalid
# escape sequence that newer Python versions warn about.
macro = pd.read_csv(r'C:\Users\z.chen7\Downloads\Python\pyhton_for_data_science'
                    r'\macrodata.txt')
macro.head()

data = macro[['cpi', 'm1', 'tbilrate', 'unemp']]
data.head()

# Period-over-period change of the log of each series.
trans_data = np.log(data).diff().dropna()

plt.scatter(trans_data['m1'], trans_data['unemp'])
plt.title('Changes in log %s vs. log %s' % ('m1', 'unemp'))

# pd.scatter_matrix was removed from the top-level namespace; use pd.plotting.
pd.plotting.scatter_matrix(trans_data, diagonal='kde', color='k', alpha=0.3)

# Plotting map
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Same raw-string treatment as above ('\c' is an invalid escape as well).
data = pd.read_csv(r'C:\Users\z.chen7\Downloads\Python\pyhton_for_data_science'
                   r'\ch08_Haiti.csv')
data.info()
data.head()
data.shape
data.columns
data[['INCIDENT DATE', 'LATITUDE', 'LONGITUDE']][:10]
colors = Bok_GmGFs['VV-VH'] plt.scatter(Bok_GmGFs['gap_fraction'], Bok_GmGFs['VH-VVnorm'], c=colors, alpha=0.3, cmap='viridis') plt.ylabel("Normalized VH-VV Backscatter (Gamma0 dB)") plt.xlabel("Canopy Gap Fraction") plt.colorbar(); plt.savefig("Correlation_VH-VVNormVsGapFraction.tiff", dpi=300) plt.savefig("Correlation_VH-VVNormVsGapFraction.pdf", dpi=300) #plt.legend() Bok_GmGFs2 = Bok_GmGFs.drop('Year', 1) Bok_GmGFs2 = Bok_GmGFs2.drop(Bok_GmGFs2.columns[[0, 1]], axis=1) Bok_GmGFs2 = pd.DataFrame(Bok_GmGFs2) pd.scatter_matrix(Bok_GmGFs2, alpha=0.2, figsize=(10, 10), diagonal='kde') plt.show plt.savefig("Scatter_Gamma0_Bands_GapFraction.tiff", dpi=300) plt.savefig("Scatter_Gamma0_Bands_GapFraction.pdf", dpi=300) pp = sns.pairplot(data = Bok_GmGFs, y_vars =['gap_fraction'], x_vars = ['VH-VVnorm','VVVHratio','VV-VH']) plt.savefig("GapFraction_PairPlot_meanGamma0GFstd.tiff", dpi=300) plt.savefig("GapFraction_PairPlot_meanGamma0GFstd.pdf", dpi=300) Bok_GmGFs.describe() # ger summary statistics of each variable in Bok_GmGFs ''' PlotID', 'SARdate', 'VHgamma0', 'VVgamma0', 'VHdb', 'VVdb', 'VV-VH',
# pd.scatter_matrix was removed from the top-level namespace; use pd.plotting.
pd.plotting.scatter_matrix(dataset)

# Plotting Graph
plt.scatter(dataset['total_rooms'], dataset['total_bedrooms'])
plt.show()
# A stray argument-less `plt.scatter()` used to follow here; it raises
# TypeError (x and y are required), so it has been dropped.

# Straight line
x = np.arange(-10, 10, 0.01)
y = 0.7 * x + 5
plt.plot(x, y)
plt.show()

# Parabola
y1 = 0.7 * x**2 + x + 8
plt.plot(x, y1)
plt.show()

# Logistic sigmoid
sig_y = 1 / (1 + np.exp(-x))
plt.plot(x, sig_y)
plt.show()

a = np.random.randn(10)
b = np.random.randn(5, 5)

pd.plotting.scatter_matrix(dataset.loc[:, :])
pd.show_versions(as_json=False)

# Correlation heat map with the coefficients annotated in each cell.
corr_mat = dataset.corr()
sns.heatmap(corr_mat, annot=True)

np.arange(23, 55, 2)
np.linspace(0, 100, 6)
# boxplot of beer servings by continent (shows five-number summary and outliers)
drinks.boxplot(column="beer_servings", by="continent")

# scatterplot of beer servings versus wine servings
drinks.plot(kind="scatter", x="beer_servings", y="wine_servings", alpha=0.3)

# same scatterplot, except point color varies by 'spirit_servings'
# note: must use 'c=drinks.spirit_servings' prior to pandas 0.15.0
drinks.plot(kind="scatter", x="beer_servings", y="wine_servings",
            c="spirit_servings", colormap="Blues")

# same scatterplot, except all European countries are colored red
colors = np.where(drinks.continent == "EU", "r", "b")
drinks.plot(x="beer_servings", y="wine_servings", kind="scatter", c=colors)

# scatterplot matrix of all numerical columns
# (pd.scatter_matrix was removed from the top-level namespace; use pd.plotting)
pd.plotting.scatter_matrix(drinks)

""" Advanced Filtering (of rows) and Selecting (of columns) """

# loc: filter rows by LABEL, and select columns by LABEL
users.loc[1]                            # row with label 1
users.loc[1:3]                          # rows with labels 1 through 3
users.loc[1:3, "age":"occupation"]      # rows 1-3, columns 'age' through 'occupation'
users.loc[:, "age":"occupation"]        # all rows, columns 'age' through 'occupation'
users.loc[[1, 3], ["age", "gender"]]    # rows 1 and 3, columns 'age' and 'gender'

# iloc: filter rows by POSITION, and select columns by POSITION
users.iloc[0]                           # row with 0th position (first row)
ForwardU = Forward1.loc[Forward1.Status == 'UFA', :]
ForwardR = Forward1.loc[Forward1.Status == 'RFA', :]


def _plot_correlations(frame):
    """Show the correlation matrix image of ``frame``, then its scatter matrix."""
    labels = frame.columns
    positions = range(len(labels))
    plt.matshow(frame.corr())
    plt.xticks(positions, labels, fontsize=10, color='blue', rotation='vertical')
    plt.yticks(positions, labels, fontsize=10, color='blue')
    plt.colorbar()
    plt.show()
    # pd.scatter_matrix was removed from the top-level namespace; use pd.plotting.
    pd.plotting.scatter_matrix(frame, alpha=0.4, figsize=(7, 7), s=20,
                               marker='.', edgecolors='blue')
    plt.show()


# Correlation
# (the heading above was a bare identifier in the original, which raises NameError)

# goalies
# correlation across category
Gcor = G1617.loc[:, ['Ovrl', 'SV%', 'Supp', 'ReMin', 'HighSV%', 'PP SV%',
                     'FA', 'SO SV%', 'Cap Hit', 'Ginj']]
Gcor.corr()
_plot_correlations(Gcor)

# correlation from one select category
Gcor2 = G1617.loc[:, ['GP', 'W', 'L', 'SA', 'SV', 'GA', 'SV%']]
Gcor2.cov()
_plot_correlations(Gcor2)

# players
''' PLOTS ''' ''' Creates a df with only the numerical columns for a scatter matrix RESULT: Nearly all of the independent variables follow some sort of power law distribution ''' Numerical_df = Master_df[['Num_Adv_Event','Num_Serious', 'Num_Other','Num_Life_Threat','Num_Hosp', 'Num_Congen_Anom','Num_Disable','Num_Deaths', 'Num_Male','Num_Female','AE_Per_Year','Adj_Num_AE', 'Adj_Per_Year']] pd.scatter_matrix(Numerical_df, diagonal='kde') ''' Correlation matrix RESULT: Num_Adv_Event is highly correlated (>0.60 with every other column except for Num_Congen_Anom, Num_Disable and Num_Deaths ''' Corr_matrix = Master_df.corr() Corr_matrix.to_csv('C:\Users\jonbryan90\Desktop\Corr_Matrix') ''' Density plots by Innovation_Cat for the promising variabes (Num_Adv_Event, Num_Congen, Num_Disabe, Num_Deaths) ''' Master_df.groupby('Innovation_Cat').Num_Adv_Event.plot(kind='kde', linewidth=2.5,
'id', 'RR', 'C_S', 'U_U_C', 'A_D_R_R', 'a_d_i_r', 'a_d_a_r_r', 'a_u_d_a_r_r', 'mb_s', 'mb_e', 'mb_sub', 'mb_esec', 'mb_inp', 'mb_insec', 'mb_uneng', 'mb_idles' ] ax.set_xticklabels(labels, fontsize=10) ax.set_yticklabels(labels, fontsize=6) ax.matshow(corr) plt.xticks(range(len(corr.columns)), corr.columns) plt.yticks(range(len(corr.columns)), corr.columns) plot_corr(input, 15) from pandas import scatter_matrix scatter_matrix(input, diagonal='kde') san = input.corr() corr = pd.DataFrame(san) #plotting categorical variables san = input.day san.value_counts().plot(kind='bar') #looking for unique domains: len(set(input.from_domain_hash)) #sendex approach # anova test for weekly data from statsmodels.formula.api import ols
# Clean Data: Remove null value rows
loansData.dropna(inplace=True)

# Convert the textual columns to numbers: strip the '%' / 'months' suffixes and
# take the lower bound of the FICO range.
loansData['Interest.Rate'] = loansData['Interest.Rate'].map(lambda x: float(x.rstrip('%')))
loansData['Loan.Length'] = loansData['Loan.Length'].map(lambda x: int(x.rstrip('months')))
loansData['FICO.Score'] = loansData['FICO.Range'].map(lambda x: int(x.split('-')[0]))

# Create Histogram of FICO scores
plt.figure()
a = loansData['FICO.Score'].hist()
plt.savefig("Bar_Plot_FICO_Score.png")

# Create Scatter Matrix of loan data
# pd.scatter_matrix was removed from the top-level namespace; use pd.plotting.
# No plt.figure() beforehand: scatter_matrix opens its own figure, so the extra
# call only left an empty figure behind.
a = pd.plotting.scatter_matrix(loansData, alpha=0.05, figsize=(10, 10), diagonal='hist')
plt.savefig("Scatter_Matrix_Loan_Data.png")

# Create Scatter Plot of loan data (FICO vs Interest Rate)
# DataFrame.plot.scatter also opens its own figure.
a = loansData.plot.scatter(x='FICO.Score', y='Interest.Rate')
plt.savefig("Scatter_Plot_Loan_Data.png")

# The dependent variable
y = np.matrix(loansData['Interest.Rate']).transpose()

# The independent variables shaped as columns
x1 = np.matrix(loansData['FICO.Score']).transpose()
x2 = np.matrix(loansData['Amount.Requested']).transpose()
x = np.column_stack([x1, x2])
9. class = Class variable (0 or 1) ''' names = [ 'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class' ] dataframe = pd.read_csv(url, names=names) print type(dataframe) # df_head = dataframe.head() # print df_head # df_shape = dataframe.shape # print df_shape # df_dtypes = dataframe.dtypes # print df_dtypes # df_describe = dataframe.describe() # print df_describe # df_correlation = dataframe.corr() # print df_correlation plt.figure() # dataframe.plot.hist(by='age') # dataframe['age'].plot.hist() # dataframe.plot.box(by='age') # dataframe.plot(kind='box') pd.scatter_matrix(dataframe) plt.show()
# Calculate the median sale price by zip code as a proxy for zip code
avg_by_zip = df.groupby(['ZIP CODE'])['SALE PRICE'].median().reset_index()
avg_by_zip.columns = ['ZIP CODE', 'avg_sale_by_zip']
df = pd.merge(df, avg_by_zip, on='ZIP CODE', how='outer')


# Transform sale price using log normal function to normalize data
def log(x):
    """Return the natural logarithm of ``x``."""
    return math.log(x)


df['log_sale'] = df['SALE PRICE'].apply(log)
df['log_avg_sale'] = df['avg_sale_by_zip'].apply(log)
df['gsf_log'] = df['GROSS SQUARE FEET'].apply(log)

# Investigate potential relationships via scatter matrix
# (pd.scatter_matrix was removed from the top-level namespace; use pd.plotting)
a = pd.plotting.scatter_matrix(df, figsize=(10, 10), diagonal='hist')

# Split into train and test data sets
labels = df['log_sale']
df_clean = df[['TOTAL UNITS', 'avg_sale_by_zip', 'GROSS SQUARE FEET']]
X_train, X_test, y_train, y_test = train_test_split(
    df_clean, labels, test_size=0.2, random_state=0)

# Prep independent and dependent variables for regression
y = np.matrix(y_train).transpose()

# Fit the OLS model
X = sm.add_constant(X_train)
# Fit on X (with the intercept column). The original passed X_train, silently
# discarding the constant that had just been added.
model = sm.OLS(y, X)
fitted = model.fit()
from sklearn.pipeline import Pipeline  # imputing within a pipeline
from sklearn.svm import SVC  # support vector classification

plt.style.use('ggplot')

# Load the iris data set and look at its structure. The bare expressions only
# display a value when run interactively.
iris = datasets.load_iris()
type(iris)
print(iris.keys())
type(iris.data), type(iris.target)
iris.data.shape
iris.target_names

X = iris.data
y = iris.target
df = pd.DataFrame(X, columns=iris.feature_names)
print(df.head())

# Scatter matrix coloured by target class.
# pd.scatter_matrix was removed from the top-level namespace; use pd.plotting.
_ = pd.plotting.scatter_matrix(df, c=y, figsize=[8, 8], s=150, marker='D')

# Fit k-NN on the full data set and predict on the same samples.
knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(X, y)
y_pred = knn.predict(X)
new_prediction = knn.predict(X)
print("Prediction: {}".format(new_prediction))

# Refit on a stratified 70/30 split and evaluate on the held-out part.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=21, stratify=y)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
confusion_matrix(y_test, y_pred)
classification_report(y_test, y_pred)