def selectKBest(features, labels, k, testSize=10):
    """Select the k best features via a chi-squared test and return their names.

    sklearn's SelectKBest returns a bare ndarray without column labels, so the
    selected columns are recovered by comparing the first ``testSize`` rows of
    the transformed array against each original column.

    Args:
        features: DataFrame of candidate feature columns (chi2 requires
            non-negative values).
        labels: target values aligned with ``features``.
        k: number of top-scoring features to keep.
        testSize: number of leading rows used for the column-matching
            comparison (default 10).

    Returns:
        list[str]: names of the selected columns, in selection-array order.
    """
    testBaseDataframe = features.head(testSize)
    columnNames = features.columns.values.tolist()
    selectedColumns = []
    # Reduce the feature matrix to the k highest-scoring columns.
    dataframe = SelectKBest(chi2, k=k).fit_transform(features, labels)
    resultDataFrame = dataframe[0:testSize]
    for colIndex in range(0, resultDataFrame.shape[1]):
        for columnName in columnNames:
            # .to_numpy() replaces DataFrame.as_matrix(), which was deprecated
            # and removed in pandas 1.0.
            if np.array_equal(
                    resultDataFrame[:, colIndex],
                    testBaseDataframe[columnName].to_numpy().astype(float)):
                selectedColumns.append(columnName)
                # Presumably drops the column in place so it cannot match
                # again — TODO confirm against DataPreprocessing.dropColumns.
                DataPreprocessing.dropColumns(testBaseDataframe, [columnName])
                columnNames.remove(columnName)
                # Stop scanning this selected column: the original kept
                # iterating the list it had just mutated, which skips the
                # element after the removed one and risks a double match.
                break
    return selectedColumns
"user_loan_ident"
]
target_key = "user_own_overdue"
#print(df['user_income_range'].head(20))
#print(df['user_last_consume'].head(20))
# NOTE(review): the return value of convert_objects() is discarded, so this
# line has no effect; the method was also deprecated and later removed from
# pandas — confirm whether a numeric conversion was actually intended here.
df.convert_objects(convert_numeric=True)
#df['user_last_consume'].str.replace('','')
#df.info()
# Hold out 20% of the rows for validation; fixed seed keeps the split
# reproducible across runs.
validation_size = 0.20
seed = 7
X = df[attributes]
Y = df[target_key]
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X, Y, test_size=validation_size, random_state=seed)
X_train_datapreprocessing = DataPreprocessing(X_train, attributes, target_key)
# Free-text/identity attributes reduced to binary indicators, then cast to int.
binary_transform_attrs = [
    'user_live_address', 'user_rela_name', 'user_relation', 'user_rela_phone',
    'user_high_edu', 'user_company_name'
]
X_train = X_train_datapreprocessing.transform_to_binary(
    binary_transform_attrs)
X_train = X_train_datapreprocessing.transform_dtype(binary_transform_attrs,
                                                    d_type=[int],
                                                    uniform_type=True)
# Map province/city names to numeric area codes via lookup data under
# resource_dir — presumably CSV mapping tables; verify against
# china_area_number_mapping.
area_attrs = ['user_live_province', 'user_live_city']
resource_dir = '../resources'
X_train = X_train_datapreprocessing.china_area_number_mapping(
    area_attrs, resource_dir)
# This call's argument list continues past this chunk of the file.
X_train = X_train_datapreprocessing.transform_dtype(area_attrs,
                                                    d_type=[int],
model.train_model()


def evaluate_model(model, noise):
    """Build the model's data pipeline and network, then score it on noisy input.

    Returns whatever the classifier's own evaluate_model() produces
    (presumably an accuracy/metric value — confirm against the classifier
    classes).
    """
    model.prepare_data_generator()
    model.create_model()
    return model.evaluate_model(noise)


# Noise intensities to sweep for each corruption type.
noise_levels = [0.05, 0.1, 0.2]
# One result list per architecture; each entry maps a noise descriptor to the
# evaluation result for that setting.
results = {"resnet": [], "xception": [], "inceptionv3": [], "vgg19": [],
           "inceptionresnetv2": []}
# Testing for noisy images
# 'sp' and 'rand' are noise-type codes passed to prepare_image_path_df —
# presumably salt-and-pepper and random noise; confirm against that method.
for func in ['sp', 'rand']:
    for level in noise_levels:
        # Fresh preprocessing per noise setting so the image paths reflect the
        # requested corruption.
        preproc = DataPreprocessing(data_path, test_ratio)
        preproc.prepare_image_path_df(func, level)
        # One classifier per architecture, all sharing the same split/epochs.
        vgg19_model = VGG19Classifier(preproc.train_df, preproc.test_df,
                                      preproc.y_test, epochs, batch_size)
        inceptionv3_model = InceptionV3Classifier(preproc.train_df,
                                                  preproc.test_df,
                                                  preproc.y_test, epochs,
                                                  batch_size)
        resnet50_model = ResNet50Classifier(preproc.train_df, preproc.test_df,
                                            preproc.y_test, epochs, batch_size)
        xception_model = XceptionClassifier(preproc.train_df, preproc.test_df,
                                            preproc.y_test, epochs, batch_size)
        inceptionresnetv2_model = InceptionResNetV2Classifier(preproc.train_df,
                                                              preproc.test_df,
                                                              preproc.y_test,
                                                              epochs,
                                                              batch_size)
        noise = func + str(level)
        noise_attr = func + '-level-' + str(level)
        # NOTE(review): no append for "inceptionresnetv2" is visible in this
        # chunk — presumably it follows past this point in the file.
        results["resnet"].append({noise_attr: evaluate_model(resnet50_model, noise)})
        results["xception"].append({noise_attr: evaluate_model(xception_model, noise)})
        results["inceptionv3"].append({noise_attr: evaluate_model(inceptionv3_model, noise)})
        results["vgg19"].append({noise_attr: evaluate_model(vgg19_model, noise)})
#print(metrics.accuracy_score(y_validation, rf_pred_probs)) #print(metrics.precision_score(y_validation, rf_pred_probs)) #print(metrics.f1_score(y_validation, rf_pred_probs)) #print(metrics.classification_report(y_validation, rf_pred_probs)) ############################################################################################################## # Way two, cross-validation, using KFold spliting the source data set into train and test, repeat k times, the default evaluation train_df = pandas.read_csv(train_fullpath, sep=',', na_values='NA', low_memory=False) #for item in train_df.columns.values: # pandas.to_numeric(train_df[item]) X_train = train_df[attributes] y_train = train_df[target_key] train_datapreprocessing = DataPreprocessing( pandas.concat([X_train, y_train], axis=1), attributes, target_key) #train_datapreprocessing.data_summary() binary_transform_attrs = [ 'user_live_address', 'user_rela_name', 'user_relation', 'user_rela_phone', 'user_high_edu', 'user_company_name' ] X_train = train_datapreprocessing.transform_x_to_binary( binary_transform_attrs) X_train = train_datapreprocessing.transform_x_dtype(binary_transform_attrs, d_type=[int], uniform_type=True) area_attrs = ['user_live_province', 'user_live_city'] resource_dir = '../resources' X_train = train_datapreprocessing.china_area_number_mapping( area_attrs, resource_dir) X_train = train_datapreprocessing.transform_x_dtype(area_attrs,
if __name__ == "__main__":
    # Kaggle "Give Me Some Credit"-style dataset: one row per borrower with a
    # binary delinquency target.
    file_fullpath = '/home/login01/Workspaces/python/dataset/cs.csv'
    df = pandas.read_csv(file_fullpath,
                         sep=',',
                         na_values='NA',
                         low_memory=False)
    attributes = [
        "RevolvingUtilizationOfUnsecuredLines", "age",
        "NumberOfTime30-59DaysPastDueNotWorse", "DebtRatio", "MonthlyIncome",
        "NumberOfOpenCreditLinesAndLoans", "NumberOfTimes90DaysLate",
        "NumberRealEstateLoansOrLines",
        "NumberOfTime60-89DaysPastDueNotWorse", "NumberOfDependents"
    ]
    target_key = "SeriousDlqin2yrs"
    mypreprocessing = DataPreprocessing(df, attributes, target_key)
    # Discretize age into 5 bins — the exact binning strategy is defined by
    # DataPreprocessing.single_attr_binning; confirm there.
    age_bins = mypreprocessing.single_attr_binning('age', bin_num=5)
    print(type(age_bins))
    print(pandas.value_counts(age_bins))
    # Reuse the bins computed above: the original called single_attr_binning a
    # second time with identical arguments, recomputing the same result.
    groups = df.groupby(age_bins)
    print(type(groups))
    print(list(groups))