def plot_data(dataset):
    """Draw and show ``dataset.hist()``, then ``scatter_matrix(dataset)``.

    Each drawing is followed by its own ``plt.show()`` call.
    """
    drawers = (dataset.hist, lambda: scatter_matrix(dataset))
    for draw in drawers:
        draw()
        plt.show()
def showGraph(dataset):
    """Show five exploratory plots of ``dataset``, one ``pyplot.show()`` each.

    The plots appear in this order: histograms, density plots, box plots,
    scatter matrix, and a heat map of ``dataset.corr()``.  The heat map is
    labelled with the module-level ``names`` sequence.
    """
    # Options shared by the density and box plot grids
    grid = dict(subplots=True, layout=(4, 4), sharex=False)

    # Histograms
    dataset.hist(sharex=False, sharey=False, xlabelsize=1, ylabelsize=1)
    pyplot.show()

    # Density plots
    dataset.plot(kind='density', fontsize=1, **grid)
    pyplot.show()

    # Box plots
    dataset.plot(kind='box', sharey=False, fontsize=8, **grid)
    pyplot.show()

    # Scatter matrix
    scatter_matrix(dataset)
    pyplot.show()

    # Correlation matrix heat map
    figure = pyplot.figure()
    axes = figure.add_subplot(111)
    heatmap = axes.matshow(dataset.corr(), vmin=-1, vmax=1, interpolation='none')
    figure.colorbar(heatmap)
    # NOTE(review): 14 tick positions are hard-coded; presumably this matches
    # len(names) and the number of correlated columns - confirm.
    tick_positions = np.arange(0, 14, 1)
    axes.set_xticks(tick_positions)
    axes.set_yticks(tick_positions)
    axes.set_xticklabels(names)
    axes.set_yticklabels(names)
    pyplot.show()
def plot_data(self):
    """Show histograms of ``self.dataset`` and then its scatter matrix."""
    # Histograms first
    self.dataset.hist()
    plt.show()

    # Pairwise scatter matrix second
    scatter_matrix(frame=self.dataset)
    plt.show()
def visualizeData(inputDF):
    """Save four exploratory plots of ``inputDF`` as PNG files and print correlations.

    Files written (relative to the current working directory):
    ``plot-scatter.png``, ``plot-correlations.png``, ``plot-medianIncome.png``
    and ``plot-correlations-02.png``.  The ``median_house_value`` column of
    the correlation matrix is printed in descending order twice: once for
    ``inputDF`` and once for a copy with three ratio columns added.

    Args:
        inputDF: frame providing the columns referenced below
            (``longitude``, ``latitude``, ``population``, ...).

    Returns:
        None
    """
    target = "median_house_value"
    banner = '\ncorrMatrix["median_house_value"].sort_values(ascending=False)'

    def save_as(path):
        # Write the current pyplot figure using the settings shared by all plots.
        plt.savefig(path, bbox_inches='tight', pad_inches=0.2)

    # 1) Longitude/latitude scatter: marker size from population, colour from target
    inputDF.plot(
        kind='scatter',
        x='longitude',
        y='latitude',
        label='population',
        s=inputDF["population"] / 100,
        c='median_house_value',
        cmap=plt.get_cmap("jet"),
        colorbar=True,
        alpha=0.1,
    )
    save_as('plot-scatter.png')

    # 2) Scatter matrix of selected raw columns, plus correlations with the target
    raw_corr = inputDF.corr()
    raw_columns = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
    scatter_matrix(frame=inputDF[raw_columns], figsize=(12, 8))
    save_as('plot-correlations.png')
    print(banner)
    print(raw_corr[target].sort_values(ascending=False))

    # 3) Median income against the target
    inputDF.plot(
        kind='scatter',
        x="median_income",
        y="median_house_value",
        alpha=0.1,
        figsize=(12, 8),
    )
    save_as('plot-medianIncome.png')

    # 4) Same analysis after adding ratio columns to a copy of the input
    enriched = inputDF.copy()
    enriched["roomsPerHousehold"] = enriched["total_rooms"] / enriched["households"]
    enriched["populationPerHousehold"] = enriched["population"] / enriched["households"]
    enriched["bedroomsPerRoom"] = enriched["total_bedrooms"] / enriched["total_rooms"]

    enriched_corr = enriched.corr()
    print(banner)
    print(enriched_corr[target].sort_values(ascending=False))

    ratio_columns = ["median_house_value", "median_income", "roomsPerHousehold", "bedroomsPerRoom"]
    scatter_matrix(frame=enriched[ratio_columns], figsize=(12, 8))
    save_as('plot-correlations-02.png')

    return None
def showGraph(dataset):
    """Show box plots, histograms and a scatter matrix of ``dataset`` in turn."""
    # Box plots, one subplot per column on a 2x2 grid
    dataset.plot.box(subplots=True, layout=(2, 2), sharex=False, sharey=False)
    pyplot.show()

    # Histograms
    dataset.hist()
    pyplot.show()

    # Scatter matrix
    scatter_matrix(frame=dataset)
    pyplot.show()
def hist_scatter(unique_track):
    """Plot a histogram per selected column, then their scatter matrix and correlations.

    For every name in the fixed column list a new figure with a count
    histogram is created.  Afterwards a pairwise scatter matrix of those
    columns is drawn, their correlation matrix is written to
    ``correlation.csv`` (current working directory) and printed.

    Args:
        unique_track: table indexable by a column name and by a list of
            column names; the subset must provide ``corr()``.
    """
    vars_of_interest = ['acousticness', 'danceability', 'energy',
                        'instrumentalness', 'liveness', 'loudness',
                        'speechiness', 'tempo', 'time_signature', 'valence']

    # One figure per variable; the histogram shows counts
    for var in vars_of_interest:
        plt.figure()
        plt.hist(unique_track[var])
        plt.ylabel('Counts')
        plt.xlabel(var)
        plt.title('Histogram of ' + var)

    # Pairwise scatter plot.  scatter_matrix opens its own figure when no
    # ``ax`` is passed, so calling plt.figure() first would only leave a
    # blank extra figure behind.
    selected = unique_track[vars_of_interest]
    scatter_matrix(selected)

    # Compute the correlation matrix once and reuse it for file and console
    correlations = selected.corr()
    correlations.to_csv('correlation.csv')
    print('\nCorrelation Matrix')
    print(correlations)
# Explore a copy so the training set itself is not modified
housing = strat_train_set.copy()

# Visualization of features: marker size from population, colour from house value
housing.plot.scatter(
    x="longitude",
    y="latitude",
    s=housing["population"] / 100,
    c="median_house_value",
    alpha=0.4,
    label="population",
    figsize=(10, 7),
    cmap=plt.get_cmap("jet"),
    colorbar=True,
)

# Correlations among features (the sorted result is not stored or printed)
corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)
scatter_matrix(
    housing[["median_house_value", "median_income", "total_rooms"]],
    figsize=(12, 8),
)

# Combinations of attributes
# NOTE(review): "bedrooms_per_household" is computed as bedrooms / rooms and
# "population_per_threshold" looks like a typo for "..._per_household";
# names are kept as-is because they become labels in corr_matrix.
housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_per_household"] = (
    housing["total_bedrooms"] / housing["total_rooms"]
)
housing["population_per_threshold"] = (
    housing["population"] / housing["households"]
)
corr_matrix = housing.corr()

# Separate predictors and labels, starting again from the training set
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

# Data cleaning, option 1: drop the rows with missing values.
# The result is not assigned, so ``housing`` is left unchanged.
housing.dropna(subset=["total_bedrooms"])
# Data cleaning, option 2: drop the whole attribute (result also not assigned)
housing.drop("total_bedrooms", axis=1)
# Data cleaning, option 3: fill with some value; here the median is computed
median = housing["total_bedrooms"].median()
def scatter_plot(data):
    """Draw a scatter matrix of ``data`` with small, rotated axis labels.

    Args:
        data: frame passed to ``scatter_matrix``.

    Returns:
        The array of axes returned by ``scatter_matrix`` (figure size 20x20).
    """
    # Plot the argument; the previous code ignored it and read a global
    # named ``dataset`` instead.
    axes_grid = scatter_matrix(data, figsize=(20, 20))

    # Re-apply each existing label text with a smaller font and a rotation
    for ax in axes_grid.ravel():
        ax.set_xlabel(ax.get_xlabel(), fontsize=7, rotation=45)
        ax.set_ylabel(ax.get_ylabel(), fontsize=7, rotation=90)
    return axes_grid
# Scatter plot with a logarithmic x axis
df.plot(kind='scatter', x='CRIM', y='PRICE', logx=True)
plt.title('Scatter plot of Price vs. log(Crime)')
plt.show()

""" Scatter Plot Matrix (산점도 행렬) """
# Section: scatter plot matrix

# Import libraries
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt

df = pd.read_csv('iris.csv')

# Scatter plot matrix with histograms on the diagonal
scatter_matrix(frame=df, alpha=0.5)
plt.show()

# Scatter plot matrix with kernel density estimates on the diagonal
scatter_matrix(frame=df, alpha=0.5, diagonal='kde')
plt.show()

""" Heatmap (히트맵) """
# Section: heat map using the pandas hexbin plot
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('iris.csv')

df.plot(kind='hexbin', x='sepal-length', y='sepal-width', gridsize=25)
plt.show()
# Check Correlation Correlation = data.corr() pd.DataFrame(Correlation) correlation_Y = pd.DataFrame(Correlation["Survived"]) correlation_Y.sort_values(by="Survived", ascending=False) print(correlation_Y) # data Visualization # histogram data.hist() plt.figure(figsize=(10.8, 7.6)) plt.show() # Multimodal Data Visualizations scatter_matrix(data) plt.figure(figsize=(21.6, 15.2)) plt.show() # correlation matrix # matshow: Plot a matrix or array as an image fig = plt.figure(figsize=(21.6, 15.2)) ax = fig.add_subplot(111) cax = ax.matshow(data.corr(), vmin=-1, vmax=1, interpolation="none") fig.colorbar(cax) ticks = np.arange(0, 20, 1) ax.set_xticks(ticks) ax.set_yticks(ticks) # set names names = [ "Survived", "Pclass", "Age", "SibSp", "Parch", "Fare", "Female", "Male",
from __future__ import print_function # https://stackoverflow.com/questions/29433824/unable-to-import-matplotlib-pyplot-as-plt-in-virtualenv import matplotlib matplotlib.use('TkAgg') from pandas import read_csv import matplotlib.pyplot as plt from pandas.plotting import scatter_matrix url = "https://goo.gl/vhm1eU" names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] data = read_csv(url, names=names) # the distribution data description = data.describe() print(description) # the dimensions data print('The dimensions of data: ', data.shape) scatter_matrix(data) plt.show()
from pandas import to_datetime
from pandas import DataFrame
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score

# Location of the input file
DIR = '../Logsn/ind_and_selBcol/v140/'
FILE = 'HPmth023.csv'
filename = '{0}{1}'.format(DIR, FILE)

# Columns to load
names = [
    'BatteryStateOfCharge_Percent', 'BatteryVoltage_V', 'A_mean', 'min',
    'Wh_sum', 'DV', 'fD_all', 'fD_sel', 'cyc', 'TemperatureEnvironment_C',
    't_1', 'SOH',
]
dataset = read_csv(filename, usecols=names)
data = dataset.values

# A single split into two parts; no further split of the second part
test_size = 0.4
train_size = None

# IMPORTANT: shuffle=False keeps the time-series order
X_train, X_test = train_test_split(
    data,
    test_size=test_size,
    train_size=train_size,
    shuffle=False,
)

# Convert the first part back to a DataFrame and plot its scatter matrix
dfX = DataFrame(X_train)
scatter_matrix(frame=dfX)
pyplot.show()
# Data type of each attribute
print("printing Datatypes")
print(dataset.dtypes)

# Summary statistics
print("printing Desctiption of ")
print(dataset.describe())

# Pairwise correlations
print("Printing Data Correlation")
print(dataset.corr())

# Histograms.  ``hist`` must be called: the bare attribute access
# ``dataset.hist`` used before was a no-op and drew nothing.
dataset.hist()
scatter_matrix(dataset)
# NOTE(review): plt.subplot acts on the current figure, which at this point
# is presumably the one scatter_matrix just created - confirm this is intended.
allData = plt.subplot(441)
allData.set_title('All Data Together')
#plt.show(4,4,0)

setosaData = pd.read_csv('iris_setosa.csv')
setosaData.hist()
scatter_matrix(setosaData)
setosa = plt.subplot(442)
setosa.set_title('Setosa Data')
#plt.show(4,4,1)

versicolorData = pd.read_csv('iris_versicolor.csv')
versicolorData.hist()
scatter_matrix(versicolorData)
versiColor = plt.subplot(443)
# One density plot per column, each with its own x axis
training_data.plot.density(subplots=True, sharex=False, figsize=(10, 10))
plt.show()


# In[ ]:


# Pairwise correlations (displayed only when run as a notebook cell)
training_data.corr()


# In[ ]:


from pandas.plotting import scatter_matrix

scatter_matrix(frame=training_data, figsize=(10, 10))
plt.show()


# # Working with Rows

# In[ ]:


# Print two fields of every test row
for idx, row in test_data.iterrows():
    print(row['Name'], row['Pclass'])


# # Making Some Predictions

# In[ ]:
def make_scatter_matrix(self):
    """Save a scatter matrix (KDE diagonal) of the input columns as a PNG.

    Plots ``self.load.data[self.load.inputs]`` and writes the figure to
    ``Data/Visual/<self.load>_scatter_matrix.png``, then closes it.
    """
    # scatter_matrix opens its own figure when no ``ax`` is passed.  The old
    # plt.subplots() call created a second, empty figure and only that one
    # was closed, so the plotted figure stayed open after every call.
    scatter_matrix(self.load.data[self.load.inputs], diagonal='kde')
    fig = plt.gcf()  # the figure scatter_matrix just made current
    fig.savefig(f"Data/Visual/{self.load}_scatter_matrix.png", bbox_inches='tight')
    plt.close(fig)
dataset.hist() plt.show() # 绘制密度图--是一种表现与数据值对应的边界或域对象的图形表示方法,一般用于呈现连续变量 dataset.plot(kind ='density',subplots = True,sharex= False) plt.show() # 绘制箱线图--盒须图,是一种非常好的用于显示数据分布状况的手段。中位数,上四分位数,下四分位数,上边缘,下边缘,边缘之外的异常值 dataset.plot(kind = 'box',subplots = True,sharex = False) plt.show() ''' # 相关矩阵图 correlations = dataset.corr(method='pearson') print(correlations) fig = plt.figure() ax = fig.add_subplot(111) cax = ax.matshow(correlations, vmin=-1, vmax=1) fig.colorbar(cax) ticks = np.arange(len(dataset.columns)) ax.set_xticks(ticks) #设置x轴或者y轴只显示哪些刻度 ax.set_yticks(ticks) ax.set_xticklabels(dataset.columns, rotation=90) #设置刻度标签,rotation,fontsize ax.set_yticklabels(dataset.columns) plt.show() #散点矩阵图 from pandas.plotting import scatter_matrix scatter_matrix(dataset) # plt.show()
# writer=pd.ExcelWriter("C:\\Users\\harish647\\Desktop\\iris.xlsx",engine='xlsxwriter')
# df1.to_excel(writer,sheet_name='Sheet1')
# df2.to_excel(writer,sheet_name='Sheet2')
# #print(df1.boxplot(by='sepal_length',column=['sepal_width'],grid=True))
# #df1.hist()#histogram plot
# #plt.show()#univarient plot
# tips=sns.load_dataset('iris')
# print(tips.head())
# sns.set_style("whitegrid")
# #sns.boxplot(x='sepal_length',y='sepal_width',hue='species',data=tips,palette='deep')#boxplot
# sns.despine()
# sns.set_context('poster',font_scale=2)#setFont
# sns.lmplot(x='sepal_length',y='sepal_width',size=2,data=tips)#regression plot

# Multivariate plots
# Box-and-whisker plot per column on a 2x2 grid (mainly a univariate view)
data.plot.box(subplots=True, layout=(2, 2), sharex=False, sharey=False)
# Scatter matrix: how each column varies with the others
scatter_matrix(frame=data)
plt.show()
#writer.save()
# wb=openpyxl.load_workbook("C:\\Users\\harish647\\Desktop\\iris.xlsx")
# print(wb.sheetnames())
#print(df)
from sklearn.datasets import load_iris
import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

# Load the data set and print the whole object followed by its parts
iris = load_iris()
print(iris)
print(iris.data)
print(iris.feature_names)
print(iris.target)
print(iris.target_names)

# Features and target
X = iris.data
y = iris.target

# Table with one column per feature plus the target as 'class'
df = pd.DataFrame(data=X, columns=iris.feature_names)
df['class'] = y
print(df)
print(df.describe())

# Pairwise scatter matrix
scatter_matrix(frame=df)
plt.show()
s=strat_train_copy['population'] / 100, label='population', figsize=(10, 7), c='median_house_value', cmap=plt.get_cmap("jet"), colorbar=True) plt.legend() plt.show() #looking the correlations between attributes corr_matrix = strat_train_copy.corr() print(corr_matrix['median_house_value'].sort_values(ascending=False)) attributes = [ 'median_house_value', 'median_income', 'total_rooms', 'housing_median_age' ] scatter_matrix(strat_train_copy[attributes], figsize=(12, 8)) plt.show() #creating new attributes strat_train_copy['rooms_per_household'] = strat_train_copy[ 'total_rooms'] / strat_train_copy['households'] strat_train_copy['bedrooms_per_room'] = strat_train_copy[ 'total_bedrooms'] / strat_train_copy['total_rooms'] strat_train_copy['population_per_household'] = strat_train_copy[ 'population'] / strat_train_copy['households'] corr_matrix = strat_train_copy.corr() print(corr_matrix['median_house_value'].sort_values(ascending=False)) housing = strat_train.drop("median_house_value", axis=1) housing_labels = strat_train["median_house_value"].copy()
def scat(**kwds):
    """Call ``plotting.scatter_matrix`` on the module-level ``df``.

    All keyword arguments are forwarded unchanged; the call's result is returned.
    """
    axes = plotting.scatter_matrix(df, **kwds)
    return axes
axis=1).plot(kind='box', subplots=True, layout=(2, 2), sharex=False, sharey=False, figsize=(9, 9), title='Box Plot for each input variable') plt.savefig('fruits_boxplot') plt.show() fruits.drop('fruit_label', axis=1).hist(bins=30, figsize=(9, 9)) pl.suptitle("Histogram for each numeric input variable") plt.savefig('fruits_hist') plt.show() scatter_matrix(fruits.drop('fruit_label', axis=1), figsize=(10, 5)) plt.show() clf = DecisionTreeClassifier().fit(X_train, y_train) print('Accuracy of Decision Tree classifier on training set: {:.2f}'.format( clf.score(X_train, y_train))) print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format( clf.score(X_test, y_test))) clf2 = DecisionTreeClassifier(max_depth=3).fit(X_train, y_train) print('Accuracy of Decision Tree classifier on training set: {:.2f}'.format( clf2.score(X_train, y_train))) print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format( clf2.score(X_test, y_test)))
# Position within blocks of 50 rows, used as the pivot index below
df["ind"] = pd.Series(df.index).apply(lambda i: i % 50)
# Keyword arguments are required here: DataFrame.pivot no longer accepts
# positional arguments in pandas 2.0 and later.
df.pivot(index="ind", columns="species")[col].plot(kind="box")
plt.show()
plt.close()

# Plain scatter plot of two columns
df.plot(kind="scatter", x="sepal length (cm)", y="sepal width (cm)")
plt.title("길이 대 너비")
plt.show()
plt.close()

# One scatter series per species on shared axes, each with its own
# colour, marker and marker size
colors = ["r", "g", "b"]
markers = [".", "*", "^"]
fig, ax = plt.subplots(1, 1)
for i, spec in enumerate(df["species"].unique()):
    ddf = df[df["species"] == spec]
    ddf.plot(kind="scatter",
             x="sepal width (cm)",
             y="sepal length (cm)",
             alpha=0.5,
             s=10 * (i + 1),
             ax=ax,
             color=colors[i],
             marker=markers[i],
             label=spec)
plt.legend()
plt.show()

# Pairwise scatter matrix
scatter_matrix(df)
plt.show()
plt.close()
# Scatter matrix plot for multivariate data
from matplotlib import pyplot as plt
from pandas import read_csv
from pandas.plotting import scatter_matrix
import warnings

# Silence every warning for the rest of the script
warnings.filterwarnings(action="ignore")

# Column names assigned to the CSV
hNames = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
dataframe = read_csv("indians-diabetes.data.csv", names=hNames)

scatter_matrix(frame=dataframe)
plt.show()
# Shape, column info and summary statistics, each followed by a blank line
print("Dimensões da base:", df.shape)
print()
print(df.info())
print()
print(df.describe())
print()

# Histograms
df.hist(figsize=[10, 10])
plt.show()

# Scatter matrix of the attributes, one colour per value of 'classe'
paleta_cores = {0: 'green', 1: 'red'}
cores = [paleta_cores[rotulo] for rotulo in df['classe']]
scatter_matrix(frame=df[atributos], figsize=[11, 11], c=cores)
plt.show()

#%%

# *************************
# ***  Pre-processing   ***
# *************************

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

minMaxScaler = MinMaxScaler(feature_range=(0, 1))
standardScaler = StandardScaler()  # mean 0 and standard deviation 1

# Rescaled and standardized versions of X
Xescalonado = minMaxScaler.fit_transform(X)
Xpadronizado = standardScaler.fit_transform(X)
# Bar chart: siblings/spouses count (bar height) per passenger name
x = frame2['No of Siblings or Spouses on Board']
y = frame2['Name']
fig, ax = plt.subplots()
ax.bar(y, x)
plt.show()

# Bar chart of age per passenger class, female passengers only
women_frame = frame2[frame2['Sex'] == 'Female']
y = women_frame['Age']
x = women_frame['Passenger Class']
plt.bar(x, y)
plt.xticks(x)
plt.show()

# Same chart for male passengers
men_frame = frame2[frame2['Sex'] == 'Male']
y = men_frame['Age']
x = men_frame['Passenger Class']
plt.bar(x, y)
plt.xticks(x)
plt.show()

# Scatter plot with age on the horizontal axis and class on the vertical axis
y = frame2['Age']
x = frame2['Passenger Class']
plt.scatter(y, x)
plt.show()

# Scatter matrix of four columns read from a second file
frame3 = pd.read_csv(
    'airquality.csv',
    sep=',',
    usecols=["Ozone", "Solar.R", "Wind", "Temp"],
)
scatter_matrix(frame=frame3)
plt.show()
extended_pheno_data_males[extended_pheno_data_males['Diagnosis'] == 'TD'][[ 'FIQ', 'VIQ' ]].mean() #Or #for general descriptives extended_pheno_data_males.describe() #Groupby for better implementation #cleaner aesthitics #groupby spits/is an object ASD_TD_pheno_datamales = extended_pheno_data_males.groupby('Diagnosis') ASD_TD_mean = ASD_TD_pheno_datamales.mean() ASD_TD_max = ASD_TD_pheno_datamales.max() ''' #Plot some of them (?) plotting.scatter_matrix(extended_pheno_data_males[['FIQ', 'Parcel_64','Parcel_148']]) plt.show() #shows plt.close() #terminates figure? plotting.scatter_matrix(extended_pheno_data_males[['FIQ', 'Parcel_1','Parcel_2', 'Parcel_3', 'Parcel_4', 'Parcel_5', 'Parcel_6', 'Parcel_7', 'Parcel_8', 'Parcel_9', 'Parcel_10', 'Parcel_11', 'Parcel_12', 'Parcel_13', 'Parcel_14', 'Parcel_15']]) plt.show() #looking for bimodal plots as if there are 2 populations ''' #STATS - R like formulas #Regression #Simple model = ols("Parcel_48 ~ FIQ", extended_pheno_data_males).fit() print(model.summary()) model = ols("Parcel_48 ~ Diagnosis + 1", extended_pheno_data_males).fit(
y = np.random.randint(0, 50, 1000)
# Correlation coefficients of x and y (the result is not stored)
np.corrcoef(x, y)

# In[4]: Correlation Matrix
import pandas as pd

df = pd.DataFrame({'a': np.random.randint(0, 50, 1000)})
# positively correlated with 'a'
df['b'] = df['a'] + np.random.normal(0, 10, 1000)
# negatively correlated with 'a'
df['c'] = 100 - df['a'] + np.random.normal(0, 5, 1000)
# not correlated with 'a'
df['d'] = np.random.randint(0, 50, 1000)

from pandas.plotting import scatter_matrix

df.corr()
scatter_matrix(frame=df, figsize=(6, 6))
plt.show()

# In[5]
# http://hamelg.blogspot.com/2015/11/python-for-data-analysis-part-25-chi.html
import numpy as np
import pandas as pd
import scipy.stats as stats

np.random.seed(10)

# Sample data randomly at fixed probabilities
voter_race = np.random.choice(
    a=["C1", "C2", "C3", "C4", "C5"],
    p=[0.05, 0.15, 0.25, 0.05, 0.5],
    size=1000,
)
X=data[choix] # Isolation des variables d'entrées Humidity3pm et RainToday y=data['RainTomorrow'] # Isolation de la variable de sortie RainTomorrow X=X.values y=y.values scaler = StandardScaler().fit(X) # normalisation des valeurs ( moyenne à 0 et écart type de 1) X[:] = scaler.transform(X) # remplacement des valeurs du dataframe par les valeurs normalisées en conservant le type DataFrame for k in data: print("Calcul du coefficient de corrélation de la colonne ",k ," avec RainTomorrow") print(data['RainTomorrow'].corr(data[k])) # calcul des coefficients de corrélation pour chaques colonnes avec la variable de sortie "RainTomorrow" # On sélectionne les colonnes RISK_MM RainToday et Humidity3pm params=['Humidity3pm','RainToday','RainTomorrow'] # création de la liste des variables intéressantes scatter_matrix(data[params], alpha=0.2, figsize=(12,10),diagonal='kde') #Trace la matrice des graphiques plt.show() X_train, X_test, y_train, y_test = train_test_split(X,y ,test_size=0.2) # Création des jeux de données (20% de la taille du jeu initial) et d'apprentissages (80% de la taille du jeu initial) from sklearn.linear_model import LogisticRegression logisticRegr = LogisticRegression() # logisticRegr.fit(X_train, y_train) #Entrainement du modèle sur le jeu d'apprentissage ( Calcul des coefficients ) y_pred = logisticRegr.predict(X_test) # Prédictions sur les données d'entrées du jeu de test for k in range (19):
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import numpy as np

# 1. Import and parse the dataset.
data = pd.read_csv('diabetes.csv')
print(data.head())

# 2. Print out summary stats.
print(data.describe())

# 3. Plot the scatter matrix: per-column histograms on the diagonal and
#    pairwise scatter plots elsewhere.
scatter_matrix(frame=data, alpha=0.2, figsize=(10, 10))
plt.show()

# 4. Split the data into training, test, and holdout data sets.
#    Each row goes to the training side with probability 0.8 ...
mask = np.random.rand(len(data)) < 0.8
training = data[mask]
test = data[~mask]

#    ... and the training side is split the same way into training/holdout.
mask = np.random.rand(len(training)) < 0.8
holdout = training[~mask]
training = training[mask]

# Save these data sets
training.to_csv('training.csv', index=False)
test.to_csv('test.csv', index=False)
from pandas.plotting import andrews_curves
plt.figure(6)
andrews_curves(dataset, 'class')
# plt.show()

from pandas.plotting import parallel_coordinates
plt.figure(7)
parallel_coordinates(dataset, 'class')
# plt.show()

# Scatter-plot matrix: helps to spot structured relationships between
# variables; each scatter plot shows how strongly two variables are related.
# Points lying along the diagonal indicate a high correlation.
from pandas.plotting import scatter_matrix
# No plt.figure(8) here: scatter_matrix opens its own figure when no ``ax``
# is passed, so the explicit call only left an empty figure 8 behind.
scatter_matrix(dataset, alpha=0.2, figsize=(6, 6), diagonal='kde')
plt.show()

# Part 3. Linear regression on the iris data
# This part applies linear regression to the iris feature data to model the
# linear relationships among petal length, petal width, sepal length and
# sepal width.
from sklearn.datasets import load_iris
hua = load_iris()
# Take the first two feature columns.
# NOTE(review): the original comment said "petal length and width", but
# columns 0 and 1 of load_iris() are presumably sepal length and sepal
# width - confirm against hua.feature_names.
import numpy as np
# List-based column selection returns a new (n_samples, 1) array, the same
# shape and values the list-comprehension + reshape produced.
x = hua.data[:, [0]]
y = hua.data[:, [1]]
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import seaborn as sb
from sklearn.naive_bayes import GaussianNB
import os

# All relative paths below are resolved against this directory
os.chdir("C:\\Directory")

# Load the data and drop the unnamed first column
df = pd.read_csv("Annexure_1_result.csv")
df.drop(columns=['Unnamed: 0'], inplace=True)

# Quick look at the data (describe() and head() results are not stored)
df.info()
df.describe()
scatter_matrix(frame=df, figsize=(15, 10))
plt.show()
df.head()

# Remove the columns that are not used for prediction
unused_columns = [
    "Text_id", "Sampled_date", "T_site", "T_plant", "Sampling_point",
    "Condition",
]
predict_df = df.drop(unused_columns, axis=1)

# Features and target
X = predict_df.drop(["Fault"], axis=1)
y = predict_df["Fault"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

# (name, estimator) pairs; starts with logistic regression
model = [('LR', LogisticRegression())]
# Drop the highly correlated features from our training data one_hot_df6 = one_hot_df6.drop(to_drop, axis=1) #Check columns after drop print('\r\n*********After: Dropping Highly Correlated Fields**************************************') one_hot_df6.info(verbose=False) onehots_stats = one_hot_df6.describe() #join one hot with df mergedf = pd.merge(one_hot_df6, df5, left_index=True, right_index=True) #scatter plot of all the numerics from pandas.plotting import scatter_matrix ax = scatter_matrix(df5,figsize=(10, 10)) df_grouped = df5.groupby(by=['vendor_name']) print (df_grouped.describe()) # this python magics will allow plot to be embedded into the notebook import matplotlib.pyplot as plt import warnings warnings.simplefilter('ignore', DeprecationWarning) %matplotlib inline # lets look at the boxplots separately vars_to_plot_separate = [['state_bottle_cost', 'state_bottle_retail'], ['sale_dollars'],
# Partition the data into two classes
y_train_1 = y_train == 1  # apple in True class, others in False class
y_test_1 = y_test == 1  # apple in True class, others in False class
y_train = 2 - y_train_1  # apple = 1; others = 2
y_test = 2 - y_test_1

seeData = True
if seeData:
    import matplotlib.pyplot as plt
    from matplotlib import cm
    from mpl_toolkits.mplot3d import axes3d  # must keep
    from pandas.plotting import scatter_matrix

    # Plotting a scatter matrix.
    # plt.get_cmap is used because matplotlib.cm.get_cmap was removed in
    # Matplotlib 3.9; it returns the same colormap on older versions too.
    cmap = plt.get_cmap('gnuplot')
    scatter = scatter_matrix(X_train,
                             c=y_train,
                             marker='o',
                             s=40,
                             hist_kwds={'bins': 15},
                             figsize=(9, 9),
                             cmap=cmap)

    # Plotting a 3D scatter plot
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(X_train['width'],
               X_train['height'],
               X_train['color_score'],
               c=y_train,
               marker='o',
               s=100)
colorbar=True ) plt.legend() #%% # Lets check correlation coef's for Median House Values corr_matrix = housing.corr() corr_matrix['median_house_value'].sort_values(ascending=False) #%% from pandas.plotting import scatter_matrix attributes = ['median_house_value', 'median_income', 'total_rooms', 'housing_median_age'] scatter_matrix(housing[attributes], figsize=(12,8)) #%% # Focus on median_income and median_house_vale housing.plot(kind='scatter', x='median_income', y='median_house_value', alpha=0.1) #%% housing['rooms_per_household'] = housing['total_rooms']/ housing['households'] housing['bedrooms_per_room'] = housing['total_bedrooms']/housing['total_rooms'] housing['population_per_household'] = housing['population']/housing['households'] corr_matrix = housing.corr() corr_matrix['median_house_value'].sort_values(ascending=False) #%%
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

# Analysing the data
corr_matrix = data.corr()
from pandas.plotting import scatter_matrix

attributes = [
    'X3 distance to the nearest MRT station',
    'X4 number of convenience stores',
    'X5 latitude',
    'X6 longitude',
    'Y house price of unit area',
]
scatter_matrix(frame=data[attributes])
# From the scatter matrix, we can see that there doesn't exist a strong
# correlation between any 2 variables

# Selecting a suitable model for the data: multiple linear regression
from sklearn.linear_model import LinearRegression

regressor1 = LinearRegression()
regressor1.fit(X_train, y_train)

# Evaluating the selected model on the test split
from sklearn.metrics import r2_score

r2 = r2_score(y_test, regressor1.predict(X_test))
def plot(input_ts='-',
         columns=None,
         start_date=None,
         end_date=None,
         clean=False,
         skiprows=None,
         index_type='datetime',
         names=None,
         ofilename='plot.png',
         type='time',
         xtitle='',
         ytitle='',
         title='',
         figsize='10,6.0',
         legend=None,
         legend_names=None,
         subplots=False,
         sharex=True,
         sharey=False,
         colors='auto',
         linestyles='auto',
         markerstyles=' ',
         style='auto',
         logx=False,
         logy=False,
         xaxis='arithmetic',
         yaxis='arithmetic',
         xlim=None,
         ylim=None,
         secondary_y=False,
         mark_right=True,
         scatter_matrix_diagonal='kde',
         bootstrap_size=50,
         bootstrap_samples=500,
         norm_xaxis=False,
         norm_yaxis=False,
         lognorm_xaxis=False,
         lognorm_yaxis=False,
         xy_match_line='',
         grid=False,
         label_rotation=None,
         label_skip=1,
         force_freq=None,
         drawstyle='default',
         por=False,
         invert_xaxis=False,
         invert_yaxis=False,
         round_index=None,
         plotting_position='weibull',
         source_units=None,
         target_units=None,
         lag_plot_lag=1):
    r"""Plot data.

    Reads a time-series table through ``tsutils``, draws the plot selected
    by ``type`` with matplotlib (Agg backend) and either saves it to
    ``ofilename`` or, when ``ofilename`` is None, returns the ``pyplot``
    module so the caller can continue working with the current figure.

    Supported ``type`` values: 'time', 'taylor', 'target', 'xy',
    'double_mass', 'norm_xaxis', 'norm_yaxis', 'lognorm_xaxis',
    'lognorm_yaxis', 'weibull_xaxis', 'weibull_yaxis', 'kde',
    'probability_density', 'kde_time', 'boxplot', 'scatter_matrix',
    'lag_plot', 'autocorrelation', 'bootstrap', 'heatmap', 'bar',
    'bar_stacked', 'barh', 'barh_stacked' and 'histogram'.

    Notes on selected parameters:

    * ``legend``: '', 'True', None and True all switch the legend on.
    * ``style``: when not 'auto', one matplotlib style string per series;
      it overrides ``colors``, ``markerstyles`` and ``linestyles``.
    * ``logx``, ``logy``, ``norm_xaxis``, ``norm_yaxis``,
      ``lognorm_xaxis``, ``lognorm_yaxis``: deprecated, a warning is
      issued when any of them is True.
    * ``scatter_matrix_diagonal``: 'probability_density' (and the historic
      misspelling 'probablity_density') are aliases for 'kde'.

    Returns:
        The ``matplotlib.pyplot`` module if ``ofilename`` is None,
        otherwise None after the figure has been written to ``ofilename``.

    Raises:
        ValueError: single-series plot types given several series,
            duplicate or mismatched ``legend_names``, wrong number of
            ``style`` strings, non-daily data for 'heatmap', or an
            unsupported ``type``.
        AttributeError: 'xy'/'double_mass' with an odd number of columns.
    """
    # Need to work around some old option defaults with the implementation of
    # mando.  A real boolean True must also enable the legend.
    legend = bool(legend == '' or
                  legend == 'True' or
                  legend is None or
                  legend is True)

    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    from matplotlib.ticker import FixedLocator
    # numpy is imported directly: the ``pd.np`` alias is gone from newer
    # pandas releases.
    import numpy as np

    tsd = tsutils.common_kwds(tsutils.read_iso_ts(input_ts,
                                                  skiprows=skiprows,
                                                  names=names,
                                                  index_type=index_type),
                              start_date=start_date,
                              end_date=end_date,
                              pick=columns,
                              round_index=round_index,
                              dropna='all',
                              source_units=source_units,
                              target_units=target_units,
                              clean=clean)

    if type in ['bootstrap', 'heatmap', 'autocorrelation', 'lag_plot']:
        if len(tsd.columns) != 1:
            raise ValueError("""
*
*   The '{1}' plot can only work with 1 time-series in the DataFrame.
*   The DataFrame that you supplied has {0} time-series.
*
""".format(len(tsd.columns), type))

    if por is True:
        tsd = tsutils.common_kwds(tsutils.read_iso_ts(tsd),
                                  start_date=start_date,
                                  end_date=end_date,
                                  round_index=round_index,
                                  dropna='no')

    # This is to help pretty print the frequency
    try:
        try:
            pltfreq = str(tsd.index.freq, 'utf-8').lower()
        except TypeError:
            pltfreq = str(tsd.index.freq).lower()
        if pltfreq.split(' ')[0][1:] == '1':
            beginstr = 3
        else:
            beginstr = 1
        if pltfreq == 'none':
            short_freq = ''
        else:
            # short freq string (day) OR (2 day)
            short_freq = '({0})'.format(pltfreq[beginstr:-1])
    except AttributeError:
        short_freq = ''

    if legend_names:
        lnames = tsutils.make_list(legend_names)
        if len(lnames) != len(set(lnames)):
            raise ValueError("""
*
*   Each name in legend_names must be unique.
*
""")
        if len(tsd.columns) == len(lnames):
            renamedict = dict(list(zip(tsd.columns, lnames)))
        elif type == 'xy' and len(tsd.columns) // 2 == len(lnames):
            # x,y1,y2,... naming: first name goes to the first y column,
            # the remaining names to the following y columns.
            renamedict = dict(list(zip(tsd.columns[2::2], lnames[1:])))
            renamedict[tsd.columns[1]] = lnames[0]
        else:
            raise ValueError("""
*
*   For 'legend_names' you must have the same number of comma
*   separated names as columns in the input data.  The input
*   data has {0} where the number of 'legend_names' is {1}.
*
*   If 'xy' type you need to have legend names as x,y1,y2,y3,...
*
""".format(len(tsd.columns), len(lnames)))
        tsd.rename(columns=renamedict, inplace=True)
    else:
        lnames = tsd.columns

    if colors == 'auto':
        colors = color_list
    else:
        colors = tsutils.make_list(colors)

    if linestyles == 'auto':
        linestyles = line_list
    else:
        linestyles = tsutils.make_list(linestyles)

    if markerstyles == 'auto':
        markerstyles = marker_list
    else:
        markerstyles = tsutils.make_list(markerstyles)
        if markerstyles is None:
            markerstyles = ' '

    if style != 'auto':
        nstyle = tsutils.make_list(style)
        if len(nstyle) != len(tsd.columns):
            raise ValueError("""
*
*   You have to have the same number of style strings as time-series to plot.
*   You supplied '{0}' for style which has {1} style strings,
*   but you have {2} time-series.
*
""".format(style, len(nstyle), len(tsd.columns)))
        # Split every style string into colour / marker / line style.
        colors = []
        markerstyles = []
        linestyles = []
        for st in nstyle:
            colors.append(st[0])
            if len(st) == 1:
                markerstyles.append(' ')
                linestyles.append('-')
                continue
            if st[1] in marker_list:
                markerstyles.append(st[1])
                try:
                    linestyles.append(st[2:])
                except IndexError:
                    linestyles.append(' ')
            else:
                markerstyles.append(' ')
                linestyles.append(st[1:])
    if linestyles is None:
        linestyles = [' ']
    else:
        # NOTE(review): as written this maps ' ' to ' ' and changes nothing;
        # confirm the intended blank-style normalisation against upstream.
        linestyles = [' ' if i == ' ' else i for i in linestyles]
    markerstyles = [' ' if i is None else i for i in markerstyles]

    icolors = itertools.cycle(colors)
    imarkerstyles = itertools.cycle(markerstyles)
    ilinestyles = itertools.cycle(linestyles)

    # One combined style string per series.
    style = ['{0}{1}{2}'.format(next(icolors),
                                next(imarkerstyles),
                                next(ilinestyles))
             for _ in range(len(tsd.columns))]

    # reset to beginning of iterator
    icolors = itertools.cycle(colors)
    imarkerstyles = itertools.cycle(markerstyles)
    ilinestyles = itertools.cycle(linestyles)

    if (logx is True or
            logy is True or
            norm_xaxis is True or
            norm_yaxis is True or
            lognorm_xaxis is True or
            lognorm_yaxis is True):
        warnings.warn("""
*
*   The --logx, --logy, --norm_xaxis, --norm_yaxis, --lognorm_xaxis, and
*   --lognorm_yaxis options are deprecated.
*
*   For --logx use --xaxis="log"
*   For --logy use --yaxis="log"
*   For --norm_xaxis use --type="norm_xaxis"
*   For --norm_yaxis use --type="norm_yaxis"
*   For --lognorm_xaxis use --type="lognorm_xaxis"
*   For --lognorm_yaxis use --type="lognorm_yaxis"
*
""")

    if xaxis == 'log':
        logx = True
    if yaxis == 'log':
        logy = True

    if type in ['norm_xaxis', 'lognorm_xaxis', 'weibull_xaxis']:
        xaxis = 'normal'
        if logx is True:
            logx = False
            # Report the setting that is being dropped ('log'); xaxis has
            # already been overwritten with 'normal' above.
            warnings.warn("""
*
*   The --type={1} cannot also have the xaxis set to {0}.
*   The {0} setting for xaxis is ignored.
*
""".format('log', type))

    if type in ['norm_yaxis', 'lognorm_yaxis', 'weibull_yaxis']:
        yaxis = 'normal'
        if logy is True:
            logy = False
            warnings.warn("""
*
*   The --type={1} cannot also have the yaxis set to {0}.
*   The {0} setting for yaxis is ignored.
*
""".format('log', type))

    xlim = _know_your_limits(xlim, axis=xaxis)
    ylim = _know_your_limits(ylim, axis=yaxis)

    figsize = tsutils.make_list(figsize)

    if not isinstance(tsd.index, pd.DatetimeIndex):
        # Expose a non-datetime index as the first data column.
        tsd.insert(0, tsd.index.name, tsd.index)

    if type in ['xy', 'double_mass']:
        if tsd.shape[1] % 2 != 0:
            raise AttributeError("""
*
*   The 'xy' and 'double_mass' types must have an even number of columns
*   arranged as x,y pairs.  You supplied {0} columns.
*
""".format(tsd.shape[1]))
        colcnt = tsd.shape[1] // 2
    elif type in ['norm_xaxis', 'norm_yaxis',
                  'lognorm_xaxis', 'lognorm_yaxis',
                  'weibull_xaxis', 'weibull_yaxis']:
        colcnt = tsd.shape[1]

    if type in ['xy', 'double_mass',
                'norm_xaxis', 'norm_yaxis',
                'lognorm_xaxis', 'lognorm_yaxis',
                'weibull_xaxis', 'weibull_yaxis',
                'heatmap']:
        _, ax = plt.subplots(figsize=figsize)
        # Pick the axes plotting method from the (logx, logy) pair.
        plotdict = {(False, True): ax.semilogy,
                    (True, False): ax.semilogx,
                    (True, True): ax.loglog,
                    (False, False): ax.plot}

    if type == 'time':
        ax = tsd.plot(legend=legend, subplots=subplots, sharex=sharex,
                      sharey=sharey, style=None, logx=logx, logy=logy,
                      xlim=xlim, ylim=ylim, secondary_y=secondary_y,
                      mark_right=mark_right, figsize=figsize,
                      drawstyle=drawstyle)
        for index, line in enumerate(ax.lines):
            plt.setp(line, color=style[index][0])
            plt.setp(line, marker=style[index][1])
            plt.setp(line, linestyle=style[index][2:])
        xtitle = xtitle or 'Time'
        if legend is True:
            plt.legend(loc='best')
    elif type in ['taylor']:
        from ..skill_metrics import centered_rms_dev
        from ..skill_metrics import taylor_diagram
        # The first column is the reference every other column is scored
        # against.
        ref = tsd.iloc[:, 0]
        std = [np.std(ref)]
        ccoef = [1.0]
        crmsd = [0.0]
        for col in range(1, len(tsd.columns)):
            std.append(np.std(tsd.iloc[:, col]))
            ccoef.append(np.corrcoef(tsd.iloc[:, col], ref)[0][1])
            crmsd.append(centered_rms_dev(tsd.iloc[:, col].values,
                                          ref.values))
        taylor_diagram(np.array(std), np.array(crmsd), np.array(ccoef))
    elif type in ['target']:
        from ..skill_metrics import centered_rms_dev
        from ..skill_metrics import rmsd
        from ..skill_metrics import bias
        from ..skill_metrics import target_diagram
        biases = []
        rmsds = []
        crmsds = []
        ref = tsd.iloc[:, 0].values
        for col in range(1, len(tsd.columns)):
            biases.append(bias(tsd.iloc[:, col].values, ref))
            crmsds.append(centered_rms_dev(tsd.iloc[:, col].values, ref))
            rmsds.append(rmsd(tsd.iloc[:, col].values, ref))
        target_diagram(np.array(biases), np.array(crmsds), np.array(rmsds))
    elif type in ['xy', 'double_mass']:
        # PANDAS was not doing the right thing with xy plots
        # if you wanted lines between markers.
        # Fell back to using raw matplotlib.
        # Boy I do not like matplotlib.
        for colindex in range(colcnt):
            ndf = tsd.iloc[:, colindex*2:colindex*2 + 2]
            if type == 'double_mass':
                ndf = ndf.dropna().cumsum()
            oxdata = np.array(ndf.iloc[:, 0])
            oydata = np.array(ndf.iloc[:, 1])

            plotdict[(logx, logy)](oxdata, oydata,
                                   linestyle=next(ilinestyles),
                                   color=next(icolors),
                                   marker=next(imarkerstyles),
                                   label=lnames[colindex],
                                   drawstyle=drawstyle)

        ax.set_xlim(xlim)
        ax.set_ylim(ylim)

        if legend is True:
            ax.legend(loc='best')

        if type == 'double_mass':
            xtitle = xtitle or 'Cumulative {0}'.format(tsd.columns[0])
            ytitle = ytitle or 'Cumulative {0}'.format(tsd.columns[1])
    elif type in ['norm_xaxis', 'norm_yaxis',
                  'lognorm_xaxis', 'lognorm_yaxis',
                  'weibull_xaxis', 'weibull_yaxis']:
        ppf = tsutils.set_ppf(type.split('_')[0])
        ys = tsd.iloc[:, :]

        for colindex in range(colcnt):
            # Values sorted in descending order against plotting positions.
            oydata = np.array(ys.iloc[:, colindex].dropna())
            oydata = np.sort(oydata)[::-1]
            n = len(oydata)
            norm_axis = ax.xaxis
            oxdata = ppf(tsutils.set_plotting_position(n, plotting_position))

            if type in ['norm_yaxis', 'lognorm_yaxis', 'weibull_yaxis']:
                oxdata, oydata = oydata, oxdata
                norm_axis = ax.yaxis

            plotdict[(logx, logy)](oxdata, oydata,
                                   linestyle=next(ilinestyles),
                                   color=next(icolors),
                                   marker=next(imarkerstyles),
                                   label=lnames[colindex],
                                   drawstyle=drawstyle)

        # Make it pretty
        xtmaj = np.array([0.01, 0.1, 0.5, 0.9, 0.99])
        xtmaj_str = ['1', '10', '50', '90', '99']
        xtmin = np.concatenate([np.linspace(0.001, 0.01, 10),
                                np.linspace(0.01, 0.1, 10),
                                np.linspace(0.1, 0.9, 9),
                                np.linspace(0.9, 0.99, 10),
                                np.linspace(0.99, 0.999, 10)])
        xtmaj = ppf(xtmaj)
        xtmin = ppf(xtmin)

        norm_axis.set_major_locator(FixedLocator(xtmaj))
        norm_axis.set_minor_locator(FixedLocator(xtmin))

        if type in ['norm_xaxis', 'lognorm_xaxis', 'weibull_xaxis']:
            ax.set_xticklabels(xtmaj_str)
            ax.set_ylim(ylim)
            ax.set_xlim(ppf(xlim))

        elif type in ['norm_yaxis', 'lognorm_yaxis', 'weibull_yaxis']:
            ax.set_yticklabels(xtmaj_str)
            ax.set_xlim(xlim)
            ax.set_ylim(ppf(ylim))

        if type in ['norm_xaxis', 'norm_yaxis']:
            xtitle = xtitle or 'Normal Distribution'
            ytitle = ytitle or tsd.columns[0]
        elif type in ['lognorm_xaxis', 'lognorm_yaxis']:
            xtitle = xtitle or 'Log Normal Distribution'
            ytitle = ytitle or tsd.columns[0]
        elif type in ['weibull_xaxis', 'weibull_yaxis']:
            xtitle = xtitle or 'Weibull Distribution'
            ytitle = ytitle or tsd.columns[0]

        if type in ['norm_yaxis', 'lognorm_yaxis', 'weibull_yaxis']:
            xtitle, ytitle = ytitle, xtitle

        if legend is True:
            ax.legend(loc='best')
    elif type in ['kde', 'probability_density']:
        ax = tsd.plot(kind='kde', legend=legend, subplots=subplots,
                      sharex=sharex, sharey=sharey, style=None, logx=logx,
                      logy=logy, xlim=xlim, ylim=ylim,
                      secondary_y=secondary_y,
                      figsize=figsize)
        for index, line in enumerate(ax.lines):
            plt.setp(line, color=style[index][0])
            plt.setp(line, marker=style[index][1])
            plt.setp(line, linestyle=style[index][2:])
        ytitle = ytitle or 'Density'
        if legend is True:
            plt.legend(loc='best')
    elif type == 'kde_time':
        # Public import location of gaussian_kde.
        from scipy.stats import gaussian_kde
        # Narrow density panel on the left, time series on the right,
        # sharing the y axis.
        _, (ax0, ax1) = plt.subplots(nrows=1,
                                     ncols=2,
                                     sharey=True,
                                     figsize=figsize,
                                     gridspec_kw={'width_ratios': [1, 4]})
        tsd.plot(legend=legend, subplots=subplots, sharex=sharex,
                 sharey=sharey, style=None, logx=logx, logy=logy,
                 xlim=xlim, ylim=ylim, secondary_y=secondary_y,
                 mark_right=mark_right, figsize=figsize,
                 drawstyle=drawstyle, ax=ax1)
        for index, line in enumerate(ax1.lines):
            plt.setp(line, color=style[index][0])
            plt.setp(line, marker=style[index][1])
            plt.setp(line, linestyle=style[index][2:])
        xtitle = xtitle or 'Time'
        ylimits = ax1.get_ylim()
        ny = np.linspace(ylimits[0], ylimits[1], 1000)
        for col in range(len(tsd.columns)):
            xvals = tsd.iloc[:, col].dropna().values
            pdf = gaussian_kde(xvals)
            ax0.plot(pdf(ny),
                     ny,
                     linestyle=style[col][2:],
                     color=style[col][0],
                     marker=style[col][1],
                     label=tsd.columns[col],
                     drawstyle=drawstyle)
        ax0.set(xlabel='Probability Density', ylabel=ytitle)
    elif type == 'boxplot':
        tsd.boxplot(figsize=figsize)
    elif type == 'scatter_matrix':
        from pandas.plotting import scatter_matrix
        # Accept the correctly spelt alias as well as the historic typo.
        if scatter_matrix_diagonal in ('probability_density',
                                       'probablity_density'):
            scatter_matrix_diagonal = 'kde'
        scatter_matrix(tsd,
                       diagonal=scatter_matrix_diagonal,
                       figsize=figsize)
    elif type == 'lag_plot':
        from pandas.plotting import lag_plot
        lag_plot(tsd, lag=lag_plot_lag)
        xtitle = xtitle or 'y(t)'
        ytitle = ytitle or 'y(t+{0})'.format(short_freq or 1)
    elif type == 'autocorrelation':
        from pandas.plotting import autocorrelation_plot
        autocorrelation_plot(tsd)
        xtitle = xtitle or 'Time Lag {0}'.format(short_freq)
    elif type == 'bootstrap':
        from pandas.plotting import bootstrap_plot
        bootstrap_plot(tsd,
                       size=bootstrap_size,
                       samples=bootstrap_samples,
                       color='gray')
    elif type == 'heatmap':
        # Find beginning and end years
        byear = tsd.index[0].year
        eyear = tsd.index[-1].year
        tsd = tsutils.asbestfreq(tsd)
        if tsd.index.freqstr != 'D':
            raise ValueError("""
*
*   The "heatmap" plot type can only work with daily time series.
*
""")
        dr = pd.date_range('{0}-01-01'.format(byear),
                           '{0}-12-31'.format(eyear),
                           freq='D')
        ntsd = tsd.reindex(index=dr)
        # Group by calendar year straight from the index; this avoids the
        # removed pd.TimeGrouper and any frequency-alias differences.
        groups = ntsd.iloc[:, 0].groupby(ntsd.index.year)
        years = pd.DataFrame()
        for year, group in groups:
            ngroup = group.values
            if len(group.values) == 365:
                # Pad non-leap years to 366 values so all columns align.
                ngroup = np.append(group.values, [np.nan])
            years[year] = ngroup
        years = years.T
        plt.imshow(years, interpolation=None, aspect='auto')
        plt.colorbar()
        yticks = list(range(byear, eyear + 1))
        skip = len(yticks)//20 + 1
        plt.yticks(range(0, len(yticks), skip), yticks[::skip])
        mnths = [0, 30, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334]
        mnths_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul',
                        'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
        plt.xticks(mnths, mnths_labels)
        grid = False
    elif (type == 'bar' or
          type == 'bar_stacked' or
          type == 'barh' or
          type == 'barh_stacked'):
        stacked = False
        if type[-7:] == 'stacked':
            stacked = True
        kind = 'bar'
        if type[:4] == 'barh':
            kind = 'barh'
        ax = tsd.plot(kind=kind, legend=legend, stacked=stacked,
                      style=style, logx=logx, logy=logy, xlim=xlim,
                      ylim=ylim, figsize=figsize)
        for index, line in enumerate(ax.lines):
            plt.setp(line, color=style[index][0])
            plt.setp(line, marker=style[index][1])
            plt.setp(line, linestyle=style[index][2:])
        freq = tsutils.asbestfreq(tsd, force_freq=force_freq).index.freqstr
        if freq is not None:
            # Number of leading characters of the tick text kept for each
            # frequency (4/7/10/13 for A/M/D/H); None keeps the full label.
            if 'A' in freq:
                endchar = 4
            elif 'M' in freq:
                endchar = 7
            elif 'D' in freq:
                endchar = 10
            elif 'H' in freq:
                endchar = 13
            else:
                endchar = None
            nticklabels = []
            if kind == 'bar':
                taxis = ax.xaxis
            else:
                taxis = ax.yaxis
            for index, i in enumerate(taxis.get_majorticklabels()):
                # Only every label_skip-th tick keeps its (truncated) text.
                if index % label_skip:
                    nticklabels.append(' ')
                else:
                    nticklabels.append(i.get_text()[:endchar])
            taxis.set_ticklabels(nticklabels)
            plt.setp(taxis.get_majorticklabels(), rotation=label_rotation)
        if legend is True:
            plt.legend(loc='best')
    elif type == 'histogram':
        tsd.hist(figsize=figsize)
    else:
        raise ValueError("""
*
*   Plot 'type' {0} is not supported.
*
""".format(type))

    if xy_match_line:
        if isinstance(xy_match_line, str):
            xymsty = xy_match_line
        else:
            xymsty = 'g--'
        # Draw the 1:1 line across the current extent, then restore the
        # limits the plot had before the line was added.
        nxlim = ax.get_xlim()
        nylim = ax.get_ylim()
        maxt = max(nxlim[1], nylim[1])
        mint = min(nxlim[0], nylim[0])
        ax.plot([mint, maxt], [mint, maxt], xymsty, zorder=1)
        ax.set_ylim(nylim)
        ax.set_xlim(nxlim)

    plt.xlabel(xtitle)
    plt.ylabel(ytitle)

    if invert_xaxis is True:
        plt.gca().invert_xaxis()
    if invert_yaxis is True:
        plt.gca().invert_yaxis()

    plt.grid(grid)

    plt.title(title)
    plt.tight_layout()
    if ofilename is None:
        return plt
    plt.savefig(ofilename)
# test that it is running correctly
# print(dataset.shape)

######## 3 #########
# add some user text
st.info("Here is a description of your dataset")
# get description
# to say: st.write chooses automatically how to display data
# (text, dataset, or whatever)
st.write(dataset.describe())

st.info("Here is a plot to see distributions and correlations")
# draw the scatter matrix and keep the axes so the figure can be handed
# to streamlit explicitly
scatter_axes = scatter_matrix(dataset, diagonal="hist")
st.set_option('deprecation.showPyplotGlobalUse', False)
# display the plot: pass the figure instead of relying on the global
# pyplot figure, which st.pyplot() without arguments is deprecated for
st.pyplot(scatter_axes[0, 0].get_figure())

######## 4 #########
# train the model
# get x and y
# x = dataset.iloc[:,[ 0, 1, 2, 3]].values
x = dataset.loc[:, ["Temperature"]]
# x = dataset.iloc[:,[ 0]].values
# y = dataset.iloc[:, -1].values
def class_wise_scatter(data_frame):
    """Show a scatter matrix of ``data_frame`` with KDE curves on the diagonal.

    The matrix is drawn semi-transparent (alpha 0.5) on a 6x6 inch figure
    and displayed with ``plt.show()``.  Nothing is returned.
    """
    matrix_options = {'alpha': 0.5, 'figsize': (6, 6), 'diagonal': 'kde'}
    scatter_matrix(data_frame, **matrix_options)
    plt.show()
# %%
import pandas as pd
from pandas.plotting import scatter_matrix

# Load the data set into a DataFrame
df = pd.read_csv('auto-mpg.csv')

# Show which columns are available
print('Column names are: ', df.columns.tolist())
scatter_matrix(df, alpha=0.4, figsize=(7, 7))

# Target (y) is the mpg column; pop() also removes it from df
y = df.pop('mpg')

# Feature matrix X: displacement, cylinders, weight, acceleration and
# model year
X = df.loc[:, ['displacement', 'cylinders', 'weight', 'acceleration',
               'model year']]

#%%
# Import the necessary helper from scikit-learn
from sklearn.model_selection import train_test_split

# Split the data: one third held out for testing, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

# Import the model class
from sklearn.linear_model import LinearRegression
# Summary statistics of every column
print(dataset.describe())

# Number of rows per class
print(dataset.groupby('class').size())

# Box-and-whisker plot per column on a 2x2 grid with independent axes
dataset.plot(kind='box', subplots=True, layout=(2, 2),
             sharex=False, sharey=False)
plt.show()

# Histograms
dataset.hist()
plt.show()

# Scatter plot matrix
scatter_matrix(dataset)
plt.show()

# Split out a validation set: the first four columns are the inputs and
# the fifth column is the target
array = dataset.values
X, Y = array[:, :4], array[:, 4]
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = (
    model_selection.train_test_split(
        X, Y, test_size=validation_size, random_state=seed))

# Test options and evaluation metric (seed stays at 7)
seed = 7
scoring = 'accuracy'
import pickle

dataset = pd.read_csv('data_banknote.csv')
print(dataset.shape)

# scatter_matrix(dataset,color=colors)
# plotting.show()

# Colour lookup indexed by the Class value: 0 -> red, 1 -> green
colors = ['red', 'green']

# Scatter matrix of all columns, each point coloured by its Class value
scatter_matrix(
    dataset,
    figsize=[20, 20],
    marker='.',
    c=dataset['Class'].apply(lambda label: colors[label]),
)
plotting.show()

# Correlation matrix drawn as a heat map
corrmat = dataset.corr()
fig = plt.figure(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True)
plt.show()

# Create the validation data set: the first four columns are the inputs
# and the fifth column is the target
array = dataset.values
X, y = array[:, 0:4], array[:, 4]
validation_size = 0.20
seed = 7
def main():
    """Run the housing walkthrough: load, split, explore, prepare, fit.

    Loads the housing table, builds a stratified train/test split on a
    derived income category, draws exploratory plots, prepares the training
    features with scikit-learn pipelines, and fits a linear regression and
    a decision tree.  Predictions for the first five training rows and the
    training-set errors (RMSE and MAE for the linear model, RMSE for the
    tree) are printed.  Nothing is returned.

    Side effects: assigns the module globals ``rooms_ix``, ``bedrooms_ix``,
    ``population_ix`` and ``household_ix`` (column positions used by the
    feature-engineering step).
    """
    global rooms_ix, bedrooms_ix, population_ix, household_ix

    from sklearn.compose import ColumnTransformer
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_absolute_error
    from sklearn.metrics import mean_squared_error
    from sklearn.preprocessing import FunctionTransformer
    from sklearn.preprocessing import LabelBinarizer
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.tree import DecisionTreeRegressor
    try:
        from sklearn.impute import SimpleImputer  # Scikit-Learn 0.20+
    except ImportError:
        from sklearn.preprocessing import Imputer as SimpleImputer

    ### Fetching data
    # fetch_housing_data()
    housing = load_housing_data()

    ### Train and test set from random sampling (not used further below)
    train_set, test_set = train_test_func(housing)

    ### To limit the number of income categories, divide by 1.5
    housing['income_cat'] = np.ceil(housing['median_income'] / 1.5)
    ### Merge every category of 5 or more into category 5.  Assigning the
    ### result back (instead of a chained inplace call) also updates the
    ### frame when pandas copy-on-write is active.
    housing['income_cat'] = housing['income_cat'].where(
        housing['income_cat'] < 5, 5.0)

    ### Create train and test set from data using stratified sampling
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    for train_index, test_index in split.split(housing, housing['income_cat']):
        strat_train_set = housing.loc[train_index]
        strat_test_set = housing.loc[test_index]

    # The helper column is only needed for stratification.
    for set_ in (strat_train_set, strat_test_set):
        set_.drop("income_cat", axis=1, inplace=True)

    ### Discover and visualize the data to gain insights
    # Correlate numeric columns only: `housing` still contains the text
    # column 'ocean_proximity', which makes corr() raise on pandas >= 2.0.
    corr_matrix = housing.select_dtypes(include='number').corr()
    # print(corr_matrix["median_house_value"].sort_values(ascending = False))

    attribs = [
        "median_house_value", "median_income", "total_rooms",
        "housing_median_age"
    ]
    scatter_matrix(housing[attribs], figsize=(12, 8))
    # save_fig("scatter_matrix_plot")

    housing.plot(kind="scatter",
                 x="median_income",
                 y="median_house_value",
                 alpha=0.1)
    plt.axis([0, 16, 0, 550000])
    # save_fig("income_vs_house_value_scatterplot")

    housing["rooms_per_household"] = (
        housing["total_rooms"] / housing["households"])
    housing["bedrooms_per_room"] = (
        housing["total_bedrooms"] / housing["total_rooms"])
    housing["population_per_household"] = (
        housing["population"] / housing["households"])

    corr_matrix = housing.select_dtypes(include='number').corr()
    # print(corr_matrix["median_house_value"].sort_values(ascending = False))

    ### Prepare data for machine learning algo
    # Separate the predictors from the labels of the training set.
    housing = strat_train_set.drop("median_house_value", axis=1)
    housing_labels = strat_train_set["median_house_value"].copy()

    sample_incomplete_rows = housing[housing.isnull().any(axis=1)].head()
    # print(sample_incomplete_rows)

    ### Impute the missing values with the column median
    imputer = SimpleImputer(strategy="median")

    ### Removing categorical data
    housing_num = housing.drop('ocean_proximity', axis=1)
    imputer.fit(housing_num)
    # print (imputer.statistics_)

    ### Transform the training set
    X = imputer.transform(housing_num)
    housing_tr = pd.DataFrame(X,
                              columns=housing_num.columns,
                              index=housing.index)

    housing_cat = housing[['ocean_proximity']]

    # LabelEncoder is not used: per the original note, with integer codes
    # "nearest value will assume that it is related".
    encoder_LB = LabelBinarizer()
    housing_cat_LB_1hot = encoder_LB.fit_transform(housing_cat)
    # print(housing_cat_LB_1hot)

    # Column positions read by add_extra_features (stored as module globals).
    rooms_ix, bedrooms_ix, population_ix, household_ix = [
        list(housing.columns).index(col)
        for col in ("total_rooms", "total_bedrooms", "population",
                    "households")
    ]

    def add_extra_features(X, add_bedrooms_per_room=True):
        """Append the per-household (and optionally per-room) ratio columns."""
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

    attr_adder = FunctionTransformer(add_extra_features,
                                     validate=False,
                                     kw_args={"add_bedrooms_per_room": False})
    housing_extra_attribs = attr_adder.fit_transform(housing.values)

    housing_extra_attribs = pd.DataFrame(
        housing_extra_attribs,
        columns=list(housing.columns) +
        ["rooms_per_household", "population_per_household"],
        index=housing.index)
    # print(housing_extra_attribs.head(10))

    # Numeric pipeline: impute, add ratio features, standardise.
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder',
         FunctionTransformer(add_extra_features, validate=False)),
        ('std_scaler', StandardScaler()),
    ])

    housing_num_tr = num_pipeline.fit_transform(housing_num)
    # print(housing_num_tr)

    # Full pipeline: numeric columns through num_pipeline, the categorical
    # column through one-hot encoding.
    num_attribs = list(housing_num)
    cat_attribs = ["ocean_proximity"]

    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

    housing_prepared = full_pipeline.fit_transform(housing)
    # print (housing_prepared.shape)
    # print (housing_labels.shape)

    ### Linear regression
    lin_reg = LinearRegression()
    lin_reg.fit(housing_prepared, housing_labels)

    # Compare predictions with the labels on the first five training rows.
    some_data = housing.iloc[:5]
    some_labels = housing_labels.iloc[:5]
    some_data_prepared = full_pipeline.transform(some_data)

    print('prediction:', lin_reg.predict(some_data_prepared))
    print('Actual:', list(some_labels))

    # Errors below are measured on the training data itself.
    housing_predictions = lin_reg.predict(housing_prepared)
    lin_mse = mean_squared_error(housing_labels, housing_predictions)
    lin_rmse = np.sqrt(lin_mse)
    print(lin_rmse)

    lin_mae = mean_absolute_error(housing_labels, housing_predictions)
    print(lin_mae)

    ### Decision tree
    tree_reg = DecisionTreeRegressor(random_state=42)
    tree_reg.fit(housing_prepared, housing_labels)

    housing_predictions = tree_reg.predict(housing_prepared)
    tree_mse = mean_squared_error(housing_labels, housing_predictions)
    tree_rmse = np.sqrt(tree_mse)
    print(tree_rmse)
## Ax4
gl.plot(days_keys,ACCDIST, ax = ax4, labels = ["","","ACCDIST"],AxesStyle = "Normal", alpha = alpha_stem,
        color = "k", legend = ["ACCDIST"], fill = 0)

# Set final properties and save figure
gl.subplots_adjust(left=.09, bottom=.10, right=.90, top=.95, wspace=.05, hspace=0.05)

gl.set_fontSizes(ax = [ax1,ax2,ax3,ax4], title = 20, xlabel = 20, ylabel = 20,
                 legend = 15, xticks = 12, yticks = 12)

gl.savefig(folder_images + image_name,
           dpi = 100, sizeInches = [20, 7])

# %%
if(plotting_variables):
    from pandas.plotting import scatter_matrix

    # Standardise the regression target (zero mean, unit standard deviation).
    data_df["Target_reg"] = (data_df["Target_reg"] - np.mean(data_df["Target_reg"]))/np.std(data_df["Target_reg"])

    ## Variables, first group
    scatter_matrix(data_df[["Target_reg","day_1","week_1","Target_1","Daily_gap_1","HMA_1"]])
#    scatter_matrix(data_df_train[["Target_reg","day_1","week_1","Range_HL_1","Target_1",
#                         "Daily_gap_1","HMA_1","RSI_1","MACD_1","ACCDIST_1"]])
    # Resize and save BEFORE showing: once plt.show() has returned the
    # figure may be gone, and plt.gcf() would then save a new empty one.
    plt.gcf().set_size_inches( 10, 10 )
    plt.savefig(folder_images +'variables_1.png',
                dpi = 100)
    plt.show()

    ## Variables, second group
    scatter_matrix(data_df[["Target_reg","week_1","Target_1","Target_2","Target_3","RSI_1","MACD_1","ACCDIST_1"]])
#    scatter_matrix(data_df_train[["Target_reg","day_1","week_1","Range_HL_1","Target_1",
#                         "Daily_gap_1","HMA_1","RSI_1","MACD_1","ACCDIST_1"]])
    plt.gcf().set_size_inches( 10, 10 )
    plt.savefig(folder_images +'variables_2.png',
                dpi = 100)
    plt.show()
    ## Variables