# Script: load the APS train and test sets and report how many positive
# samples they contain in total.
__author__ = 'Aaron Yang'
__email__ = '*****@*****.**'
__date__ = '10/4/2019 10:10 PM'

import pandas as pd

from ay_hw_4._global import ROOT_PATH, APS_TRAIN, APS_TEST, APS_FULL_COLUMNS
from ay_hw_4.util_data import load_data
from ay_hw_4.util_stratistic import count_neg_and_pos


def count_aps_positives():
    """Load both APS splits and print the combined count of positive samples."""
    pd.set_option('display.max_columns', 100)

    # Both files are read with exactly the same loader options; only the
    # path differs, so the options are collected once.
    loader_options = {
        "skip_first_row": 21,
        "y_column_index": 0,
        "assignedColumnNames": APS_FULL_COLUMNS,
        "missingSymbol": 'na',
        "needImpute": True,
        "dropOrNot": False,
    }
    _, train_labels = load_data(ROOT_PATH + APS_TRAIN, **loader_options)
    _, test_labels = load_data(ROOT_PATH + APS_TEST, **loader_options)

    # The helper's two return values are unpacked as (pos, neg), the same
    # order the original script used; only the positive counts are reported.
    pos_in_train, _neg_in_train = count_neg_and_pos(train_labels)
    pos_in_test, _neg_in_test = count_neg_and_pos(test_labels)

    print("the number of pos data is : ", pos_in_train + pos_in_test)


if __name__ == "__main__":
    count_aps_positives()
# Script: load the crime dataset, split it into train/test parts by a fixed
# training size, and print the shape of every resulting piece.
__author__ = 'Aaron Yang'
__email__ = '*****@*****.**'
__date__ = '10/2/2019 9:10 AM'

from ay_hw_4._global import ROOT_PATH, CRIME
from ay_hw_4.util_data import load_data, train_test_split_by_size


def split_crime_data():
    """Load the crime data, split it, and print the shapes before and after."""
    features, target = load_data(ROOT_PATH + CRIME, y_column_index=-1)
    print("X Row Data Shape: ", features.shape)
    print("y Row Data Shape: ", target.shape)

    train_x, test_x, train_y, test_y = train_test_split_by_size(
        features, target, train_size=1495, random_state=2333)

    # Report the four pieces in the same order as the original script.
    labelled_parts = (
        ("X_Train Data Shape: ", train_x),
        ("y_Train Data Shape: ", train_y),
        ("X_test Data Shape: ", test_x),
        ("y_test Data Shape: ", test_y),
    )
    for label, part in labelled_parts:
        print(label, part.shape)


if __name__ == "__main__":
    split_crime_data()
# Script: draw a heatmap of the pairwise correlations between the APS
# training label (converted to a binary number) and the features.
__author__ = 'Aaron Yang'
__email__ = '*****@*****.**'
__date__ = '10/4/2019 5:30 PM'

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from ay_hw_4._global import ROOT_PATH, APS_TRAIN, APS_FULL_COLUMNS
from ay_hw_4.util_data import load_data, to_binary_numeric


def plot_aps_correlation():
    """Load the APS training set and show its correlation matrix as a heatmap."""
    pd.set_option('display.max_columns', 100)

    features, labels = load_data(
        ROOT_PATH + APS_TRAIN,
        skip_first_row=21,
        y_column_index=0,
        assignedColumnNames=APS_FULL_COLUMNS,
        missingSymbol='na',
        needImpute=True,
        dropOrNot=True,
    )
    numeric_labels = to_binary_numeric(labels, classNeg="neg")

    # Label column goes first so it is the first row/column of the matrix.
    combined = pd.concat([numeric_labels, features], axis=1)
    corr_matrix = combined.corr()

    plt.figure(figsize=(20, 15))
    blues = sns.color_palette("Blues")
    sns.heatmap(corr_matrix, vmin=-1, vmax=1, cmap=blues)
    plt.show()

    # Original author's note (translated): turning dropOrNot on raises an
    # error, because 10 columns of the data contain NaN.
    # NOTE(review): the call above already passes dropOrNot=True -- confirm
    # which setting this note actually refers to.


if __name__ == "__main__":
    plot_aps_correlation()
# Script: compute the coefficient of variation (CV = std / mean) of every
# column of the crime dataset and print a preview of the results.
__author__ = 'Aaron Yang'
__email__ = '*****@*****.**'
__date__ = '10/2/2019 10:36 PM'

import numpy as np
import pandas as pd

from ay_hw_4._global import ROOT_PATH, CRIME
from ay_hw_4.util_data import load_data

# Number of CV values shown; matches the "(first 20 rows)" label in the
# printed message.
_PREVIEW_ROWS = 20


def _coefficient_of_variation(values):
    """Return ``np.std(values) / np.mean(values)`` for a 1-D array of values.

    Uses NumPy's default (population, ddof=0) standard deviation, exactly as
    the original inline formula did.
    """
    return np.std(values) / np.mean(values)


def print_crime_cv():
    """Load the crime data and print the CV of its first 20 columns.

    The features and the target column are concatenated, so the target's CV
    is part of the computed array as well.
    """
    X_data, y_data = load_data(ROOT_PATH + CRIME,
                               skip_first_column=5,
                               y_column_index=-1,
                               needImpute=True)
    data = pd.concat([X_data, y_data], axis=1)

    # axis=0 hands each column to the formula as a 1-D slice.
    cvResult = np.apply_along_axis(_coefficient_of_variation,
                                   axis=0,
                                   arr=data.to_numpy())

    # Fix: the message announces "(first 20 rows)" but the original printed
    # the entire array; slice so the output agrees with its own label. The
    # reported total still counts every column.
    print("The total {} features CV are: (first 20 rows)\n {}".format(
        len(cvResult), cvResult[:_PREVIEW_ROWS]))


if __name__ == "__main__":
    print_crime_cv()
# Script: find the columns of the crime dataset that contain missing values
# (marked with '?') and print summary statistics for them.
__author__ = 'Aaron Yang'
__email__ = '*****@*****.**'
__date__ = '10/2/2019 6:39 PM'

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

from ay_hw_4._global import ROOT_PATH, CRIME
from ay_hw_4.util_data import load_data


def describe_crime_missing():
    """Print how many feature columns have missing values and describe them."""
    pd.set_option('display.max_columns', 100)

    features, target = load_data(ROOT_PATH + CRIME,
                                 y_column_index=-1,
                                 skip_first_column=5)
    print("X_data Row Data Shape: ", features.shape)
    print("y Row Data Shape: ", target.shape)

    # '?' is the dataset's missing-value marker; turn it into NaN so pandas
    # can detect it.
    features = features.replace('?', np.nan)
    has_missing = features.isnull().any()
    incomplete_columns = features.columns[has_missing]

    summary_line = (
        "In the beginning, there are total {} columns has missing value in the dataset "
        .format(incomplete_columns.shape[0]))
    print(summary_line)
    print(
        "------------------------------------------------------------------------------"
    )
    print(features[incomplete_columns].describe())
    # Original author's conclusion from the summary above: only one column
    # (index=25) can be imputed.


if __name__ == "__main__":
    describe_crime_missing()