def run(self, default_column='default payment next month'):
    woe_calc = WOE()
    uuid_t = uuid.uuid4().hex
    cr = CreditRisk(self.url, uuid_t)
    # download_file() returns the raw dataset as a DataFrame
    df_o = cr.download_file()
    df, classes = cr.read_file(df_o, uuid_t)
    X, y = cr.test_train_matrix(df)
    X_train, X_test, y_train, y_test = cr.feature_select(
        X, y, self.test_size, False)

    # train a gradient-boosted tree classifier
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    num_round = 5
    evallist = [(dtest, 'eval'), (dtrain, 'train')]
    param = {
        'objective': 'binary:logistic',
        'silent': 1,
        'eval_metric': ['error', 'logloss']
    }
    bst = xgb.train(param, dtrain, num_round, evallist)
    uuid_t = cr.save_model(bst)

    # probability estimates for train and test sets
    y_train_pred = bst.predict(dtrain)
    y_test_pred = bst.predict(dtest)

    list_attributes = list(df_o)
    if default_column is not None:
        default_col_index = list_attributes.index(default_column)
    c_nparray = df_o.values
    score = []
    woe_dict = {}
    list_attributes.pop()  # drop the target column
    for l in list_attributes:
        woe_dict[l] = woe_calc.woe_single_x_score(
            c_nparray[:, list_attributes.index(l)],
            c_nparray[:, default_col_index].astype(bool))

    for index, row in df_o.iterrows():
        woe_val = 0
        # Score points are just another way to express the scorecard, so the
        # scaling does not affect its predictive power. Assume 600 points
        # correspond to odds (bads to goods) of 1:50 and each additional 20
        # points doubles the odds (620 -> 1:100, 640 -> 1:200, etc.).
        startScore = 600
        pdo = 20
        factor = pdo / math.log(2)
        offset = round(startScore - (factor * math.log(50)))
        for l in list_attributes:
            cell_value = df_o.at[index, l]
            woe_val = woe_val + woe_dict[l][cell_value]
        score.append(round(offset + woe_val))
        print('Rows:: ', index, 'Score:: ', score[index])

    return cr.scores(score, y_train, y_train_pred, y_test, y_test_pred,
                     classes, uuid_t)
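# A minimal, self-contained sketch of the points-to-odds scaling described in the
# scorecard comment above (600 points at 1:50 bad:good odds, 20 points to double
# the odds). The helper name `scale_score` is illustrative only, not part of the
# project; it simply spells out the factor/offset arithmetic used in the loop.
import math

def scale_score(log_odds, start_score=600, start_odds=50, pdo=20):
    """Map log-odds to scorecard points under the assumed scaling."""
    factor = pdo / math.log(2)                             # ~28.85 points per unit of log-odds
    offset = start_score - factor * math.log(start_odds)   # ~487 points
    return round(offset + factor * log_odds)

# 600 points should reproduce odds of 1:50, and +20 points should double them:
assert scale_score(math.log(50)) == 600
assert scale_score(math.log(100)) == 620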
def run(self, default_column='default payment next month'):
    woe_calc = WOE()
    uuid_t = uuid.uuid4().hex
    cr = CreditRisk(self.url, uuid_t)
    # download_file() returns the raw dataset as a DataFrame
    df_o = cr.download_file()
    df, classes = cr.read_file(df_o, uuid_t)
    X, y = cr.test_train_matrix(df)
    X_train, X_test, y_train, y_test = cr.feature_select(
        X, y, self.test_size, False)

    # train a random forest classifier
    rf = RandomForestClassifier(n_estimators=500, min_samples_leaf=5)
    rf.fit(X_train, y_train)
    uuid_t = cr.save_model(rf)

    # probability estimates for train and test sets
    y_train_pred = rf.predict_proba(X_train)[:, 1]
    y_test_pred = rf.predict_proba(X_test)[:, 1]

    list_attributes = list(df_o)
    if default_column is not None:
        default_col_index = list_attributes.index(default_column)
    c_nparray = df_o.values
    score = []
    woe_dict = {}
    list_attributes.pop()  # drop the target column
    for l in list_attributes:
        woe_dict[l] = woe_calc.woe_single_x_score(
            c_nparray[:, list_attributes.index(l)],
            c_nparray[:, default_col_index].astype(bool))

    for index, row in df_o.iterrows():
        woe_val = 0
        # Score points are just another way to express the scorecard, so the
        # scaling does not affect its predictive power. Assume 600 points
        # correspond to odds (bads to goods) of 1:50 and each additional 20
        # points doubles the odds (620 -> 1:100, 640 -> 1:200, etc.).
        startScore = 600
        pdo = 20
        factor = pdo / math.log(2)
        offset = round(startScore - (factor * math.log(50)))
        for l in list_attributes:
            cell_value = df_o.at[index, l]
            woe_val = woe_val + \
                woe_dict[l][cell_value] * math.log(2) / (1 - math.log(2))
        score.append(round(offset + woe_val))
        print('Rows:: ', index, 'Score:: ', score[index])

    return cr.scores(score, y_train, y_train_pred, y_test, y_test_pred,
                     classes, uuid_t)
def run(self, default_column='default payment next month'):
    woe_calc = WOE()
    uuid_t = uuid.uuid4().hex
    cr = CreditRisk(self.url, uuid_t)
    # download_file() returns the raw dataset as a DataFrame
    df_o = cr.download_file()
    df, classes = cr.read_file(df_o, uuid_t)
    X, y = cr.test_train_matrix(df)
    X_train, X_test, y_train, y_test = cr.feature_select(
        X, y, self.test_size, False)

    # train a logistic regression model
    regr = linear_model.LogisticRegression()
    regr.fit(X_train, y_train)
    uuid_t = cr.save_model(regr)

    # probability estimates for train and test sets
    y_train_pred = regr.predict_proba(X_train)[:, 1]
    y_test_pred = regr.predict_proba(X_test)[:, 1]

    list_attributes = list(df_o)
    # if default_column is not None:
    default_col_index = list_attributes.index(default_column)
    c_nparray = df_o.values
    score = []
    woe_dict = {}
    list_attributes.pop()  # drop the target column
    for l in list_attributes:
        woe_dict[l] = woe_calc.woe_single_x_score(
            c_nparray[:, list_attributes.index(l)],
            c_nparray[:, default_col_index].astype(bool))

    for index, row in df_o.iterrows():
        woe_val = 0
        # Score points are just another way to express the scorecard, so the
        # scaling does not affect its predictive power. Assume 600 points
        # correspond to odds (bads to goods) of 1:50 and each additional 20
        # points doubles the odds (620 -> 1:100, 640 -> 1:200, etc.).
        startScore = 600
        pdo = 20
        factor = pdo / math.log(2)
        offset = round(startScore - (factor * math.log(50)))
        for l in list_attributes:
            cell_value = df_o.at[index, l]
            col_index = list_attributes.index(l)
            # each feature contributes its WOE weighted by the model coefficient,
            # plus an equal share of the intercept
            woe_val = woe_val + (
                woe_dict[l][cell_value] * regr.coef_[0][col_index]) + (
                regr.intercept_[0] / len(list_attributes))
        score.append(round(offset + woe_val))

    return cr.scores(score, y_train, y_train_pred, y_test, y_test_pred,
                     classes, uuid_t)
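# Hedged illustration (made-up numbers) of the per-feature allocation used in the
# logistic-regression scoring loop above: each feature contributes
# WOE_i * coef_i + intercept / n_features, so the contributions add back up to the
# full linear predictor coef . woe + intercept.
woes = [0.42, -0.10, 0.05]     # WOE of the row's value in each feature (made up)
coefs = [1.3, 0.7, -0.2]       # fitted logistic-regression coefficients (made up)
intercept = -1.5
n = len(woes)

contributions = [w * c + intercept / n for w, c in zip(woes, coefs)]
linear_predictor = sum(contributions)
assert abs(linear_predictor
           - (sum(w * c for w, c in zip(woes, coefs)) + intercept)) < 1e-9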
def run(self, type=None, columnName=None, default_column=None):
    cr = CreditRisk(self.url, "")
    df_train = cr.download_file()
    # df_train = pd.read_excel(filename, header=1)
    list_attributes = list(df_train)
    if default_column is not None:
        default_col_index = list_attributes.index(default_column)

    if type is None:
        # profile every feature: categorical counts + WOE, numerical summary stats
        # (this branch assumes default_column was provided, it is used for WOE below)
        cjson_list = []
        njson_list = []
        woe_calc = WOE()
        for l in list_attributes:
            x_vals = []
            y_vals = []
            x_hist = []
            y_hist = []
            if l in self.category_list:
                df_train[l] = df_train[l].astype('category')
                d = df_train.groupby([l], as_index=False).size()
                d_frame = d.to_frame()
                c_nparray = df_train.values
                for index, row in d_frame.iterrows():
                    x_vals.append(index)
                    y_vals.append(row[0])
                if len(x_vals) > 200:
                    sample_list = sorted(random.sample(
                        range(0, len(x_vals)), 200))
                else:
                    sample_list = range(0, len(x_vals))
                categorical_json = {
                    'feature': l,
                    'x_vals': [str(x_vals[i]) for i in sample_list],
                    'y_vals': [str(y_vals[i]) for i in sample_list],
                    'woe': woe_calc.woe_single_x(
                        c_nparray[:, list_attributes.index(l)],
                        c_nparray[:, default_col_index].astype(bool))
                }
                cjson_list.append(categorical_json)
            elif l in self.numeric_list:
                df_train[l] = pd.to_numeric(df_train[l], errors='coerce')
                h = np.histogram(df_train[l], bins='auto', density=False)  # raw counts
                for x in h[1].tolist():
                    x_hist.append(format(float(x), '.2f'))
                for y in h[0].tolist():
                    y_hist.append(format(float(y), '.0f'))
                min_val = str(df_train[l].dropna().min())
                max_val = str(df_train[l].dropna().max())
                mean_val = str(df_train[l].dropna().mean())
                median_val = str(df_train[l].dropna().median())
                mode_val = str(df_train[l].dropna().mode()[0])
                tot_null = str(df_train[l].isnull().sum())
                numerical_json = {
                    'feature': l,
                    'x_hist': x_hist,
                    'y_hist': y_hist,
                    'min_val': min_val,
                    'max_val': max_val,
                    'mean_val': mean_val,
                    'median_val': median_val,
                    'mode_val': mode_val,
                    'tot_null': tot_null
                }
                njson_list.append(numerical_json)
        json_final = {'categorical': cjson_list, 'numerical': njson_list}
        return json.dumps(json_final)
    else:
        # profile a single column
        l = columnName
        if type == 0:
            x_vals = []
            y_vals = []
            woe_calc = WOE()
            df_train[l] = df_train[l].astype('category')
            d = df_train.groupby([l], as_index=False).size()
            d_frame = d.to_frame()
            c_nparray = df_train.values
            for index, row in d_frame.iterrows():
                x_vals.append(index)
                y_vals.append(row[0])
            if len(x_vals) > 200:
                sample_list = sorted(random.sample(
                    range(0, len(x_vals)), 200))
            else:
                sample_list = range(0, len(x_vals))
            return json.dumps({
                'feature': l,
                'x_vals': [str(x_vals[i]) for i in sample_list],
                'y_vals': [str(y_vals[i]) for i in sample_list],
                # NOTE: the target column index is hard-coded here
                'woe': woe_calc.woe_single_x(
                    c_nparray[:, list_attributes.index(l)],
                    c_nparray[:, 24].astype(bool))
            })
        else:
            x_hist = []
            y_hist = []
            # df_train[l] = pd.to_numeric(df_train[l], errors='coerce')
            # h = np.histogram(df_train[l], bins='auto', density=False)
            # for x in h[1].tolist():
            #     x_hist.append(format(float(x), '.2f'))
            # for y in h[0].tolist():
            #     y_hist.append(format(float(y), '.0f'))
            min_val = str(df_train[l].dropna().min())
            max_val = str(df_train[l].dropna().max())
            mean_val = str(df_train[l].dropna().mean())
            median_val = str(df_train[l].dropna().median())
            mode_val = str(df_train[l].dropna().mode()[0])
            tot_null = str(df_train[l].isnull().sum())
            return json.dumps({
                'feature': l,
                'x_hist': df_train[l].values.tolist(),
                'y_hist': y_hist,
                'min_val': min_val,
                'max_val': max_val,
                'mean_val': mean_val,
                'median_val': median_val,
                'mode_val': mode_val,
                'tot_null': tot_null
            })
def run(self, col_name, feature_type=-1,
        default_col="DEFAULT PAYMENT NEXT MONTH"):
    filename = self.url.rsplit('/', 1)[-1]
    uuid = os.path.splitext(os.path.basename(filename))[0]
    # print("UV.url:", self.url)
    # print("UV.uuid:", uuid)
    CR = CreditRisk(self.url, uuid)
    Processing = FeatureProcessing()

    # use the locally cached dataset if it exists, otherwise download and cache it
    cd = os.path.dirname(os.path.abspath(__file__))
    filename = cd + "/user_aie_datasets/" + uuid + "-aie"
    filepath = Path(filename)
    if filepath.is_file():
        df_user = pd.read_csv(filename, encoding='utf-8')
    else:
        df_user = CR.download_file()
        filename = cd + "/user_datasets/" + uuid
        df_user.to_csv(filename, encoding='utf-8', index=False)

    df_user = Processing.sort_df_by_feature_names(df_user)
    df, classes = CR.write_file(df_user, default_col, uuid)
    # df_binned = Processing.bin_numerical_features(df)  # uncomment to get df where numerical features are binned
    df.columns = df.columns.str.upper()
    features = list(df.columns.values)
    # print(features)
    num_features = len(features) - 1
    if default_col is not None:
        default_col_index = features.index(default_col)
    if feature_type == -1:
        features_dict = Processing.categorize_features_numerical(df)
        feature_type = features_dict.get(col_name, 0)
        # print(feature_type)

    if feature_type == 0:
        # categorical feature: value counts plus WOE per category
        x_vals = []
        y_vals = []
        woe = WOE()
        df[col_name] = df[col_name].astype('category')
        df_grouped = df.groupby([col_name], as_index=False).size()
        df_ = df_grouped.to_frame()
        category_nparray = df.values
        for idx, row in df_.iterrows():
            x_vals.append(idx)
            y_vals.append(row[0])
        if len(x_vals) > 200:
            sample_list = sorted(
                random.sample(range(0, len(x_vals)), 200))
        else:
            sample_list = range(0, len(x_vals))
        return json.dumps({
            'feature': col_name,
            'x_vals': [str(x_vals[i]) for i in sample_list],
            'y_vals': [str(y_vals[i]) for i in sample_list],
            # 'woe': woe.woe_single_x(category_nparray[:, features.index(col_name)],
            #                         category_nparray[:, num_features].astype(bool)),
            'woe': woe.woe_single_x_score(
                category_nparray[:, features.index(col_name)],
                category_nparray[:, features.index(default_col)].astype(bool)),
            'feature_type': feature_type
        })
    elif feature_type == 1:
        # numerical feature: summary statistics, correlations, outliers and IV
        woe = WOE()
        category_nparray = df.values
        x_hist = []
        y_hist = []
        min_val = str(df[col_name].dropna().min())
        max_val = str(df[col_name].dropna().max())
        mean_val = str(df[col_name].dropna().mean())
        median_val = str(df[col_name].dropna().median())
        mode_val = str(df[col_name].dropna().mode()[0])
        tot_null = str(df[col_name].isnull().sum())
        std_val = str(df[col_name].dropna().std())
        var_val = str(df[col_name].dropna().var())
        high_corrs = str(Processing.get_corr_coeffs(df, col_name))
        count_outliers = str(Processing.get_outliers_count(df, col_name))
        percentage_missing = str(
            Processing.get_missing_percent(df, col_name))
        # _, iv = woe.woe_single_x(category_nparray[:, features.index(col_name)],
        #                          category_nparray[:, num_features].astype(bool))
        _, iv = woe.woe_single_continuous_feature(
            df[[col_name, default_col]], col_name, default_col)
        # _, iv = woe.woe_single_x(category_nparray[:, features.index(col_name)],
        #                          category_nparray[:, features.index(default_col)].astype(bool))
        # print(col_name, " numerical iv:", iv)
        return json.dumps({
            'feature': col_name,
            'x_hist': df[col_name].values.tolist(),
            'y_hist': y_hist,
            'min_val': min_val,
            'max_val': max_val,
            'mean_val': mean_val,
            'median_val': median_val,
            'mode_val': mode_val,
            'tot_null': tot_null,
            'std_val': std_val,
            'var_val': var_val,
            'high_corrs': high_corrs,
            'count_outliers': count_outliers,
            'percentage_missing': percentage_missing,
            'iv': iv,
            'feature_type': feature_type
        })
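# `Processing.get_outliers_count` is not shown in these snippets. A generic
# IQR-based stand-in (count values beyond 1.5 * IQR from the quartiles) might look
# like the sketch below; it is an assumption about the behaviour, not the
# project's implementation.
import pandas as pd

def get_outliers_count_sketch(df, col_name):
    s = df[col_name].dropna()
    q1, q3 = s.quantile(0.25), s.quantile(0.75)
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    return int(((s < lower) | (s > upper)).sum())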
def run(self, default_column='DEFAULT PAYMENT NEXT MONTH', corr_threshold=1.0):
    # download file from url
    filename = self.url.rsplit('/', 1)[-1]
    uuid = os.path.splitext(os.path.basename(filename))[0]
    print("RandomForest.self.url:", self.url)
    CR = CreditRisk(self.url, uuid)
    Processing = FeatureProcessing()

    # check if file exists locally, if not download
    df = CR.read_file()
    df = Processing.sort_df_by_feature_names(df)
    df, classes = CR.write_file(df, default_column, uuid)

    # drop ignored features
    if self.ignore_features is not None:
        for feature in self.ignore_features:
            df = df.drop(feature, axis=1)
    # df = CR.bin_numerical_features(df)  # uncomment if needed

    # remove highly correlated features
    features_to_drop = Processing.list_highly_corr_features(
        df, corr_threshold)
    # print(features_to_drop)
    df = Processing.remove_features(df, features_to_drop)

    # partition
    features = list(df)
    # print(features)
    X, y = CR.test_train_matrix(df, default_column)
    X_train, X_test, y_train, y_test = CR.split_train_test(
        X, y, self.test_size, False, len(features))

    # create, fit model
    # TODO need to do grid search to find optimal hyperparams
    clf = RandomForestClassifier(n_estimators=1000, min_samples_leaf=2)
    clf.fit(X_train, y_train)
    print("random forest accuracy: " +
          str(CR.get_model_accuracy(clf, X_test, y_test)))
    # print("log loss: ", self.log_loss(clf, X_train, y_train))

    # TODO may not need this: get list of most important features
    '''importance = []
    for feature in zip(features, clf.feature_importances_):
        importance.append(feature)
    importance = sorted(importance, key=itemgetter(1), reverse=True)  # descending order
    print(importance)'''

    # save model
    # uuid_t = uuid.uuid4().hex
    uuid_t = CR.save_model(clf)

    # get probability estimates
    y_train_pred = clf.predict_proba(X_train)[:, 1]
    y_test_pred = clf.predict_proba(X_test)[:, 1]

    # get target col index
    if default_column is not None:
        default_col_index = features.index(default_column)

    # uncomment to see kfolds score
    # kfolds_cv_score = self.kfold_cv(CR, X.values, y.values)
    # print(kfolds_cv_score)

    c_nparray = df.values
    score = []
    woe_dict = {}
    features.pop()  # drop the target column

    # TODO make credit scores from probability of default
    woe = WOE()
    for l in features:
        woe_dict[l] = woe.woe_single_x_score(
            c_nparray[:, features.index(l)],
            c_nparray[:, default_col_index].astype(bool))

    for index, row in df.iterrows():
        woe_val = 0
        # Score points are just another way to express the scorecard, so the
        # scaling does not affect its predictive power. Assume 600 points
        # correspond to odds (bads to goods) of 1:50 and each additional 20
        # points doubles the odds (620 -> 1:100, 640 -> 1:200, etc.).
        startScore = 600
        pdo = 20
        factor = pdo / math.log(2)
        offset = round(startScore - (factor * math.log(50)))
        for l in features:
            cell_value = df.at[index, l]
            woe_val = woe_val + \
                woe_dict[l][cell_value] * math.log(2) / (1 - math.log(2))
        score.append(round(offset + woe_val))
        # print('Rows:: ', index, 'Score:: ', score[index])

    return CR.scores(score, y_train, y_train_pred, y_test, y_test_pred,
                     classes, uuid_t)
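# `Processing.list_highly_corr_features` is not shown here either. A generic pandas
# version of the same idea (drop one column from every pair whose absolute Pearson
# correlation reaches the threshold) could look like this sketch; it is an
# assumption, not the project's implementation. Note that the default
# corr_threshold=1.0 only removes perfectly correlated columns.
import pandas as pd

def list_highly_corr_features_sketch(df, threshold):
    """Return columns to drop so that no remaining pair has |corr| >= threshold."""
    corr = df.corr().abs()
    to_drop = set()
    cols = list(corr.columns)
    for i, a in enumerate(cols):
        for b in cols[i + 1:]:
            if a not in to_drop and b not in to_drop and corr.at[a, b] >= threshold:
                to_drop.add(b)  # arbitrarily keep the first column of the pair
    return sorted(to_drop)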
def fit(self, X, y=None):
    """
    1. Discretize each feature into nbin intervals using entropy-based binning.
    2. Compute each feature's WOE and IV value.
    3. Save the transformed discrete features and WOE features.
    4. Save each feature's IV and its discretization intervals.
    :param X: (N, M)
    :param y: (N, 1)
    :return:
    """
    # dataset = pd.read_table(self.fn_raw_train, sep=',', header=0)
    # labels = dataset.pop(self.labelname)
    X.reset_index(drop=True, inplace=True)
    y.reset_index(drop=True, inplace=True)
    user_info = X.loc[:, self.sample_columns]
    logging('user_info:{}'.format(user_info.columns))
    dataset = X.drop(user_info.columns, axis=1).fillna(0).astype(np.int32)
    labels = y

    logging('Begin to discretize features', dataset.shape)
    start = time.clock()
    woe = WOE()
    woe_arr = []
    to_drop = []
    for column in dataset.columns:
        desc = dataset[column].describe(percentiles=[0.98])
        minv, maxv = max(-1, desc['min']), round(desc['98%'])
        features = dataset[column].clip(minv, maxv)
        n_uniques = features.nunique()
        if n_uniques < 2 or desc['std'] < 0.05:
            to_drop.append(column)
        else:
            if n_uniques < 1000:
                feature_values = features.apply(lambda x: int(x))
                seg_ents_keys_sorted = self._segment(feature_values, labels)
            else:
                # Log-transform before discretizing: stretch the low-frequency range,
                # compress the high-frequency range, compensate via the base,
                # saturate extremes and smooth oscillation in the low range.
                feature_values = features.apply(
                    lambda x: int(log(x - minv + 0.1 ** 8, 1.01)))
                seg_ents_keys_sorted = [
                    (round(1.01 ** seg[0]) + minv, round(1.01 ** seg[1]) + minv)
                    for seg in self._segment(feature_values, labels)]
            seg_index = features.apply(categorizing, args=(seg_ents_keys_sorted,))
            woe_dict, iv = woe.woe_single_x(seg_index, labels)
            logging('{}({}, {}), iv: {}, intervals: {}'.format(
                column, minv, maxv, round(iv, 4), seg_ents_keys_sorted))
            assert len(seg_ents_keys_sorted) == len(woe_dict), \
                '{} ---- {}'.format(seg_ents_keys_sorted, woe_dict)
            if iv <= 0.02:
                to_drop.append(column)
            else:
                woe_arr.append(woe_dict)
                self.iv_dict[column], self.woes_dict[column] = iv, woe_dict
                dataset.loc[:, column], self.intervals_dict[column] = \
                    seg_index, seg_ents_keys_sorted
    if to_drop:
        dataset.drop(to_drop, axis=1, inplace=True)
    logging('End to discretize features', dataset.shape)
    self.selected_columns = dataset.columns

    # process discrete features
    # temp_dataset = woe.woe_replace(dataset, np.array(woe_arr))
    # woe_dataset = pd.DataFrame(X, columns=self.selected_columns)

    # store the discrete features
    dis_dataset = pd.concat([user_info, dataset], axis=1)
    dis_dataset.insert(dis_dataset.shape[1], labels.name, labels)
    dis_dataset.to_csv(self.fn_dis_train, index=False)
    self._create_dis_hql(self.tablename)
    logging('End to store dis features', dis_dataset.shape)

    # process WOE features
    temp_dataset = woe.woe_replace(dataset, np.array(woe_arr))
    woe_dataset = pd.DataFrame(temp_dataset, columns=self.selected_columns)

    # store the WOE features
    woe_dataset = pd.concat([user_info, woe_dataset], axis=1)
    woe_dataset.insert(woe_dataset.shape[1], labels.name, labels)
    woe_dataset.to_csv(self.fn_woe_train, index=False)
    self._create_woe_hql(self.tablename)
    logging('End to store woe features', woe_dataset.shape)

    cPickle.dump(self.woes_dict, open(self.fn_woes_dict, 'wb'))
    cPickle.dump(self.intervals_dict, open(self.fn_intervals_dict, 'wb'))
    with open(self.fn_ivs_dict, 'w') as fp:
        json.dump(self.iv_dict, fp, encoding='utf-8')

    if self.feature_selection is True:
        # drop highly correlated features, keeping the one with the larger IV
        self.dis_rm_columns = pearson_ccs(dis_dataset, self.iv_dict, ratio=self.ratio)
        dis_rm_dataset = dis_dataset.drop(self.dis_rm_columns, axis=1)
        # dis_rm_dataset.insert(dis_rm_dataset.shape[1], labels.name, labels)
        # dis_rm_dataset = pd.concat([user_info, dis_rm_dataset], axis=1)
        dis_rm_dataset.to_csv(self.fn_dis_rm_train, index=False)
        logging('dis feature after feature selection', dis_rm_dataset.shape)

        self.woe_rm_columns = pearson_ccs(woe_dataset, self.iv_dict, self.ratio)
        woe_rm_dataset = woe_dataset.drop(self.woe_rm_columns, axis=1)
        # woe_rm_dataset.insert(woe_rm_dataset.shape[1], labels.name, labels)
        # woe_rm_dataset = pd.concat([user_info, woe_rm_dataset], axis=1)
        woe_rm_dataset.to_csv(self.fn_woe_rm_train, index=False)
        logging('woe feature after feature selection', woe_rm_dataset.shape)

    logging('Discretizing features finished.', dataset.shape,
            'Time elapsed: %.2f s' % (time.clock() - start))
    return self
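# `woe.woe_single_x(seg_index, labels)` above is assumed to return a per-bin WOE
# dictionary plus the information value. A minimal pandas sketch of those two
# quantities is given below; it uses one common convention (WOE = ln(%bad / %good)
# with a small smoothing constant) and is an assumption, not the project's exact
# formula, whose sign convention may differ.
import numpy as np
import pandas as pd

def woe_iv_sketch(bins, labels, eps=1e-6):
    """Per-bin WOE = ln(%bad / %good); IV = sum((%bad - %good) * WOE)."""
    frame = pd.DataFrame({'bin': bins, 'bad': labels.astype(bool)})
    grouped = frame.groupby('bin')['bad']
    bad_counts = grouped.sum().astype(float)
    good_counts = grouped.count().astype(float) - bad_counts
    bad_dist = (bad_counts + eps) / (bad_counts.sum() + eps)
    good_dist = (good_counts + eps) / (good_counts.sum() + eps)
    woe = np.log(bad_dist / good_dist)
    iv = float(((bad_dist - good_dist) * woe).sum())
    return woe.to_dict(), iv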
def run(self, default_column='DEFAULT PAYMENT NEXT MONTH', corr_threshold=1.0):
    # download file from url
    filename = self.url.rsplit('/', 1)[-1]
    uuid = os.path.splitext(os.path.basename(filename))[0]
    print("LogisticRegression.self.url:", self.url)
    CR = CreditRisk(self.url, uuid)
    Processing = FeatureProcessing()

    # check if file exists locally, if not download
    df = CR.read_file()
    df = Processing.sort_df_by_feature_names(df)
    df, classes = CR.write_file(df, default_column, uuid)

    # drop ignored features
    if self.ignore_features is not None:
        for feature in self.ignore_features:
            df = df.drop(feature, axis=1)
    # df = CR.bin_numerical_features(df)  # uncomment if needed

    # remove highly correlated features
    features_to_drop = Processing.list_highly_corr_features(
        df, corr_threshold)
    # print(features_to_drop)
    df = Processing.remove_features(df, features_to_drop)

    # partition
    features = list(df)
    # print(features)
    X, y = CR.test_train_matrix(df, default_column)
    X_train, X_test, y_train, y_test = CR.split_train_test(
        X, y, self.test_size, False, len(features))

    # create, fit model
    clf = linear_model.LogisticRegression()
    clf.fit(X_train, y_train)

    # save model
    # uuid_t = uuid.uuid4().hex
    uuid_t = CR.save_model(clf)

    # get probability estimates
    y_train_pred = clf.predict_proba(X_train)[:, 1]
    y_test_pred = clf.predict_proba(X_test)[:, 1]

    # get target col index
    if default_column is not None:
        default_col_index = features.index(default_column)

    c_nparray = df.values
    score = []
    woe_dict = {}
    features.pop()  # drop the target column

    woe = WOE()
    for l in features:
        woe_dict[l] = woe.woe_single_x_score(
            c_nparray[:, features.index(l)],
            c_nparray[:, default_col_index].astype(bool))

    for index, row in df.iterrows():
        woe_val = 0
        # Score points are just another way to express the scorecard, so the
        # scaling does not affect its predictive power. Assume 600 points
        # correspond to odds (bads to goods) of 1:50 and each additional 20
        # points doubles the odds (620 -> 1:100, 640 -> 1:200, etc.).
        startScore = 600
        pdo = 20
        factor = pdo / math.log(2)
        offset = round(startScore - (factor * math.log(50)))
        for l in features:
            cell_value = df.at[index, l]
            col_index = features.index(l)
            # print(l, ":", col_index)
            # WOE weighted by the model coefficient, plus an equal share of the intercept
            woe_val = woe_val + (woe_dict[l][cell_value] * clf.coef_[0][col_index]) + (
                clf.intercept_[0] / len(features))
        score.append(round(offset + woe_val))

    print("logistic regression accuracy: " +
          str(CR.get_model_accuracy(clf, X_test, y_test)))
    return CR.scores(score, y_train, y_train_pred, y_test, y_test_pred,
                     classes, uuid_t)