def filter_linear_regression(dataframe, target, threshold=0.001):
    """
    Filter features with low weight when estimated with a linear regression
    (LassoCV) model.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Dataframe to process (modified in place: categorical columns are
        label-encoded, missing values imputed, weak features dropped).
    target : string
        Feature name used as the prediction result.
    threshold : float
        Between 0 and 1. Features whose absolute coefficient falls below
        this value are dropped.

    Return
    ------
    The processed dataframe.
    """
    from sklearn.linear_model import LassoCV
    from sklearn import preprocessing

    # Label-encode every object-typed column, leaving the target untouched.
    cat_columns = dataframe.select_dtypes('object').columns.tolist()
    if target in cat_columns:
        cat_columns.remove(target)
    for cat_col in cat_columns:
        encoder = preprocessing.LabelEncoder()
        as_text = list(dataframe[cat_col].values.astype('str'))
        encoder.fit(as_text)
        dataframe[cat_col] = encoder.transform(as_text)

    # Mean-impute every column so the model sees no NaNs.
    import fast_impute
    for name in dataframe.columns:
        dataframe, acc = fast_impute.impute_mean(dataframe, name)

    features = dataframe.drop([target], axis=1)
    labels = dataframe[target]
    feature_names = features.columns.tolist()
    scaled = preprocessing.scale(features)

    # Cross-validated Lasso over a fixed alpha grid.
    model = LassoCV(cv=5, alphas=[0.01, 0.03, 0.05, 0.1, 0.3, 0.5, 1],
                    n_jobs=-1)
    model.fit(scaled, labels)

    # Rank features by |coef| and collect the ones below the threshold.
    weights = pd.DataFrame()
    weights["feature"] = feature_names
    weights["coef"] = model.coef_.T
    weights["abs_coef"] = np.abs(model.coef_.T)
    weights.sort_values('abs_coef', inplace=True, ascending=False)
    dropped = weights.loc[weights['abs_coef'] < threshold, 'feature']
    score = model.score(scaled, labels)
    dataframe.drop(dropped, axis=1, inplace=True)

    trace('filter_linear_regression')
    trace('category features')
    trace(cat_columns)
    trace('coefficence list')
    trace(weights)
    trace('score')
    trace(score)
    trace('regularization alpha_')
    alpha = model.alpha_
    trace(str(alpha))
    trace('drop features')
    trace(dropped)
    return dataframe
def filter_logistic_regression(dataframe, target, threshold=0.001):
    """
    Filter features with low weight computed with a logistic regression
    (LogisticRegressionCV, L1 penalty) model.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Dataframe to process (modified in place: categorical columns are
        label-encoded, missing values imputed, weak features dropped).
    target : string
        Feature name used as the prediction result.
    threshold : float
        Between 0 and 1. Features whose absolute coefficient falls below
        this value are dropped.

    Return
    ------
    The processed dataframe.
    """
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn import preprocessing

    # Label-encode every object-typed column, leaving the target untouched.
    categorical_feats = dataframe.select_dtypes('object').columns.tolist()
    if target in categorical_feats:
        categorical_feats.remove(target)
    for col in categorical_feats:
        lb = preprocessing.LabelEncoder()
        values = list(dataframe[col].values.astype('str'))
        lb.fit(values)
        dataframe[col] = lb.transform(values)

    # Mean-impute every column so the model sees no NaNs.
    import fast_impute
    for feature in dataframe.columns:
        dataframe, acc = fast_impute.impute_mean(dataframe, feature)

    X = dataframe.drop([target], axis=1)
    Y = dataframe[target]
    column_names = X.columns
    X = preprocessing.scale(X)

    # L1-regularized CV logistic regression; saga supports the l1 penalty.
    lr = LogisticRegressionCV(cv=5, penalty='l1', solver='saga', n_jobs=-1,
                              max_iter=1000, Cs=[0.03, 0.05, 0.1, 0.3])
    lr.fit(X, Y)
    score = lr.score(X, Y)

    # BUG FIX: lr.coef_ is 2-D with shape (n_classes, n_features) -- one row
    # per class for a multi-class target, a single row for a binary target.
    # The old code assigned the raw 2-D transpose to a DataFrame column,
    # which raises for multi-class runs (and even the (n_features, 1) binary
    # transpose raises in recent pandas).  Collapse to one signed
    # coefficient per feature: the entry with the largest magnitude
    # across classes (identical values for the binary case).
    coef_matrix = np.atleast_2d(lr.coef_)
    strongest = np.abs(coef_matrix).argmax(axis=0)
    coef = coef_matrix[strongest, np.arange(coef_matrix.shape[1])]

    coef_df = pd.DataFrame()
    coef_df["feature"] = column_names
    coef_df["coef"] = coef
    coef_df["abs_coef"] = np.abs(coef)
    coef_df.sort_values('abs_coef', inplace=True, ascending=False)
    less_coef_features = coef_df.loc[coef_df['abs_coef'] < threshold,
                                     'feature']
    dataframe.drop(less_coef_features, axis=1, inplace=True)

    trace('filter_logistic_regression')
    trace('category features')
    trace(categorical_feats)
    trace('coefficence list')
    trace(coef_df)
    trace('score')
    trace(score)
    trace('regularization C_')
    c = lr.C_
    trace(str(c))
    trace('drop features')
    trace(less_coef_features)
    return dataframe
def int_module_linear_regression(params, dataframe, target,
                                 test_dataframe=None, n_folds=5):
    """
    Internal linear regression model with cross validation, supporting both
    classification prediction and regression prediction. It's aimed for easy
    usage and reuse for all kinds of situation.

    Parameters
    ----------
    params : dictionary
        Parameter set with dictionary format for linear regression model.
        NOTE: for a classification run, ``params['penalty']`` must be a dict
        holding 'penalty' and 'solver' keys; this function mutates the
        caller's dict in place (flattens 'penalty', coerces 'max_iter' to
        int).
    dataframe : pandas.Dataframe
        Dataframe to process.
    target : string
        Feature name, target identifies some column which is used for
        prediction analyze.
    test_dataframe : pandas.Dataframe, optional
        Dataframe to predict.
    n_folds : integer, optional
        Cross validation times when run linear regression model with given
        dataframe. It's often 5 or 10.

    Output
    ------
    Score when run linear regression model with given param and dataframe.
    For regression prediction, score as R2; for binary classification, score
    as ROC; for multi-classification, score as accuracy.
    Test result from prediction if specify test_dataframe.
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import roc_auc_score
    from sklearn.preprocessing import MinMaxScaler
    PREDICT_NAME = 'predict'
    df = dataframe
    # df = df.reset_index(drop=True)
    # Impute missing values for every column before training.
    for f_ in df.columns:
        df, _ = fast_impute.impute_mean(df, f_, target=target, intern=True)
    train_df = df.drop([target], axis=1)
    train_df, _ = one_hot_encoder(train_df, True)
    train_target = df[target]
    # `valid` accumulates out-of-fold predictions next to the true target.
    valid = df[[target]]
    valid[PREDICT_NAME] = 0
    # Scale features into [0, 1]; the original index is preserved so
    # out-of-fold predictions can be realigned later.
    min_max_scaler = MinMaxScaler()
    train_df = pd.DataFrame(min_max_scaler.fit_transform(train_df),
                            index=train_df.index)
    # Decide whether this is binary / multi-class classification or regression.
    predict_classifier_bin, predict_classifier_nominal = _check_classifier(
        df, target)
    predict_df = pd.DataFrame({'result': []})
    folds = KFold(n_splits=n_folds, shuffle=True, random_state=1001)
    if predict_classifier_bin == True or predict_classifier_nominal == True:
        # Flatten the nested 'penalty' dict into flat solver/penalty entries
        # (mutates the caller's params).
        params['solver'] = params['penalty'].get('solver')
        params['penalty'] = params['penalty']['penalty']
        for parameter_name in ['max_iter']:
            params[parameter_name] = int(params[parameter_name])
    lr = None
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df,
                                                    train_target)):
        train_x, train_y = train_df.iloc[train_idx], train_target.iloc[
            train_idx]
        valid_x, valid_y = train_df.iloc[valid_idx], train_target.iloc[
            valid_idx]
        if predict_classifier_bin == True or predict_classifier_nominal == True:
            lr = LogisticRegression(penalty=params['penalty'],
                                    solver=params['solver'],
                                    tol=params['tol'],
                                    C=params['C'],
                                    class_weight=params['class_weight'],
                                    random_state=params['random_state'],
                                    max_iter=params['max_iter'],
                                    n_jobs=params['n_jobs'])
            lr.fit(train_x, train_y)
            if predict_classifier_bin:
                # Binary: keep positive-class probability for ROC scoring.
                predict_result = lr.predict_proba(valid_x)[:, 1]
            else:
                # Multi-class: keep hard class labels for accuracy scoring.
                predict_result = lr.predict(valid_x)
            predict_temp_df = pd.DataFrame({'result': predict_result},
                                           index=valid_x.index)
            predict_df = pd.concat([predict_df, predict_temp_df])
        else:
            lr = LinearRegression(fit_intercept=params['fit_intercept'],
                                  normalize=params['normalize'],
                                  n_jobs=params['n_jobs'])
            lr.fit(train_x, train_y)
            predict_result = lr.predict(valid_x)
            predict_temp_df = pd.DataFrame({'result': predict_result},
                                           index=valid_x.index)
            predict_df = pd.concat([predict_df, predict_temp_df])
    debug(
        '++++++++++++++++++++Linear+++++++++++++++++++++++++++++++++++++++++++'
    )
    # Restore original row order so predictions align with `valid`.
    predict_df.sort_index(axis=0, inplace=True)
    valid[PREDICT_NAME] = predict_df['result']
    score = 0
    if predict_classifier_bin == True:
        score = roc_auc_score(valid[target], valid[PREDICT_NAME])
    elif predict_classifier_nominal:
        # Accuracy: fraction of exact label matches.
        valid['compare'] = valid.apply(lambda x: x[target] == x[PREDICT_NAME],
                                       axis=1)
        score = np.sum(valid['compare']) / len(valid[target])
    else:
        # R^2 computed as the squared Pearson correlation.
        score = np.square(
            np.corrcoef(valid[target], valid[PREDICT_NAME])[0, 1])
    trace('linear regression: ' + target + ', score: ' + str(score))
    if test_dataframe is None:
        return score
    else:
        # NOTE(review): test_dataframe is predicted with the last-fold model
        # and is NOT passed through one_hot_encoder / MinMaxScaler like the
        # training data -- looks like the caller must pre-process it
        # identically; confirm against call sites.
        test_prediction = lr.predict(test_dataframe)
        return test_prediction, score
def int_module_knn(params, dataframe, target, test_dataframe=None, n_folds=5):
    """
    Internal KNN model with cross validation, supporting both classification
    prediction and regression prediction. It's aimed for easy usage and reuse
    for all kinds of situation.

    Parameters
    ----------
    params : dictionary
        Parameter set with dictionary format for KNN model.
        NOTE: mutated in place ('n_neighbors', 'leaf_size', 'p' are coerced
        to int).
    dataframe : pandas.Dataframe
        Dataframe to process.
    target : string
        Feature name, target identifies some column which is used for
        prediction analyze.
    test_dataframe : pandas.Dataframe, optional
        Dataframe to predict.
    n_folds : integer, optional
        Cross validation times when run KNN model with given dataframe. It's
        often 5 or 10.

    Output
    ------
    Score when run KNN model with given param and dataframe.
    For regression prediction, score as R2; for binary classification, score
    as ROC; for multi-classification, score as accuracy.
    Test result from prediction if specify test_dataframe.
    """
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.metrics import roc_auc_score
    from sklearn.preprocessing import MinMaxScaler
    PREDICT_NAME = 'predict'
    df = dataframe
    # Impute missing values for every column before training.
    for f_ in df.columns:
        df, _ = fast_impute.impute_mean(df, f_, target=target, intern=True)
    # Make sure parameters that need to be integers are integers
    for parameter_name in ['n_neighbors', 'leaf_size', 'p']:
        params[parameter_name] = int(params[parameter_name])
    train_df = df.drop([target], axis=1)
    train_df, _ = one_hot_encoder(train_df, True)
    # KNN is distance-based, so restrict to features ranked important
    # (importance > 0.001) by explore_importance_features.
    train2 = train_df.dropna(axis=0)
    train2 = pd.concat([train2, df[target]], axis=1)
    df_importance = explore_importance_features(train2, target)
    feature_importance = df_importance.loc[df_importance['importance'] > 0.001,
                                           'feature']
    feature_list = feature_importance.values
    debug(feature_list)
    train_df = train_df[feature_list]
    # Scale the selected features into [0, 1]; index preserved for
    # realigning out-of-fold predictions.
    min_max_scaler = MinMaxScaler()
    train_df = pd.DataFrame(min_max_scaler.fit_transform(train_df),
                            index=train_df.index)
    train_target = df[target]
    # `valid` accumulates out-of-fold predictions next to the true target.
    valid = df[[target]]
    valid[PREDICT_NAME] = 0
    # Decide whether this is binary / multi-class classification or regression.
    predict_classifier_bin, predict_classifier_nominal = _check_classifier(
        df, target)
    predict_df = pd.DataFrame({'result': []})
    folds = KFold(n_splits=n_folds, shuffle=True, random_state=1001)
    knn = None
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df,
                                                    train_target)):
        train_x, train_y = train_df.iloc[train_idx], train_target.iloc[
            train_idx]
        valid_x, valid_y = train_df.iloc[valid_idx], train_target.iloc[
            valid_idx]
        if predict_classifier_bin == True or predict_classifier_nominal == True:
            knn = KNeighborsClassifier(n_neighbors=params['n_neighbors'],
                                       weights=params['weights'],
                                       algorithm=params['algorithm'],
                                       leaf_size=params['leaf_size'],
                                       p=params['p'],
                                       n_jobs=params['n_jobs'])
            knn.fit(train_x, train_y)
            # valid.ix[valid_idx,[PREDICT_NAME]] = knn.predict_proba(valid_x)[:, 1]
            if predict_classifier_bin:
                # Binary: keep positive-class probability for ROC scoring.
                predict_result = knn.predict_proba(valid_x)[:, 1]
            else:
                # Multi-class: keep hard class labels for accuracy scoring.
                predict_result = knn.predict(valid_x)
            predict_temp_df = pd.DataFrame({'result': predict_result},
                                           index=valid_x.index)
            predict_df = pd.concat([predict_df, predict_temp_df])
        else:
            knn = KNeighborsRegressor(n_neighbors=params['n_neighbors'],
                                      weights=params['weights'],
                                      algorithm=params['algorithm'],
                                      leaf_size=params['leaf_size'],
                                      p=params['p'],
                                      n_jobs=params['n_jobs'])
            knn.fit(train_x, train_y)
            predict_result = knn.predict(valid_x)
            predict_temp_df = pd.DataFrame({'result': predict_result},
                                           index=valid_x.index)
            predict_df = pd.concat([predict_df, predict_temp_df])
    debug(
        '++++++++++++++++++++ KNN +++++++++++++++++++++++++++++++++++++++++'
    )
    # Restore original row order so predictions align with `valid`.
    predict_df.sort_index(axis=0, inplace=True)
    valid[PREDICT_NAME] = predict_df['result']
    score = 0
    if predict_classifier_bin == True:
        score = roc_auc_score(valid[target], valid[PREDICT_NAME])
    elif predict_classifier_nominal:
        # Accuracy: fraction of exact label matches.
        valid['compare'] = valid.apply(lambda x: x[target] == x[PREDICT_NAME],
                                       axis=1)
        score = np.sum(valid['compare']) / len(valid[target])
    else:
        # R^2 computed as the squared Pearson correlation.
        score = np.square(
            np.corrcoef(valid[target], valid[PREDICT_NAME])[0, 1])
    trace('knn: ' + target + ', score: ' + str(score))
    if test_dataframe is None:
        return score
    else:
        # NOTE(review): test_dataframe is subset to feature_list but is NOT
        # one-hot encoded or min-max scaled like the training data -- looks
        # like the caller must pre-process it identically; confirm against
        # call sites.
        test_prediction = knn.predict(test_dataframe[feature_list])
        return test_prediction, score
def int_module_random_forest(params, dataframe, target, test_dataframe=None,
                             n_folds=5):
    """
    Internal random forest model with cross validation, supporting both
    classification prediction and regression prediction. It's aimed for easy
    usage and reuse for all kinds of situation.

    Parameters
    ----------
    params : dictionary
        Parameter set with dictionary format for random forest model.
        NOTE: mutated in place ('n_estimators', 'max_depth',
        'min_samples_leaf' are coerced to int).
    dataframe : pandas.Dataframe
        Dataframe to process.
    target : string
        Feature name, target identifies some column which is used for
        prediction analyze.
    test_dataframe : pandas.Dataframe, optional
        Dataframe to predict.
    n_folds : integer, optional
        Cross validation times when run random forest model with given
        dataframe. It's often 5 or 10.

    Output
    ------
    Score when run random forest model with given param and dataframe.
    For regression prediction, score as R2; for binary classification, score
    as ROC; for multi-classification, score as accuracy.
    Test result from prediction if specify test_dataframe.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import roc_auc_score
    PREDICT_NAME = 'predict'
    df = dataframe
    # Make sure parameters that need to be integers are integers.
    for parameter_name in ['n_estimators', 'max_depth', 'min_samples_leaf']:
        params[parameter_name] = int(params[parameter_name])
    # Impute missing values for every column before training.
    for f_ in df.columns:
        df, _ = fast_impute.impute_mean(df, f_, target=target, intern=True)
    train_df = df.drop([target], axis=1)
    train_df, _ = one_hot_encoder(train_df, True)
    train_target = df[target]
    # `valid` accumulates out-of-fold predictions next to the true target.
    valid = df[[target]]
    valid[PREDICT_NAME] = 0
    # Decide whether this is binary / multi-class classification or regression.
    predict_classifier_bin, predict_classifier_nominal = _check_classifier(
        df, target)
    predict_df = pd.DataFrame({'result': []})
    folds = KFold(n_splits=n_folds, shuffle=True, random_state=1001)
    rf = None
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df,
                                                    train_target)):
        train_x, train_y = train_df.iloc[train_idx], train_target.iloc[
            train_idx]
        valid_x, valid_y = train_df.iloc[valid_idx], train_target.iloc[
            valid_idx]
        if predict_classifier_bin == True or predict_classifier_nominal == True:
            rf = RandomForestClassifier(class_weight=params['class_weight'],
                                        n_estimators=params['n_estimators'],
                                        criterion=params['criterion'],
                                        max_depth=params['max_depth'],
                                        min_samples_leaf=params['min_samples_leaf'],
                                        max_features=params['max_features'],
                                        bootstrap=params['bootstrap'],
                                        oob_score=params['oob_score'],
                                        n_jobs=params['n_jobs'],
                                        random_state=params['random_state'])
            rf.fit(train_x, train_y)
            if predict_classifier_bin:
                # Binary: keep positive-class probability for ROC scoring.
                predict_result = rf.predict_proba(valid_x)[:, 1]
            else:
                # Multi-class: keep hard class labels for accuracy scoring.
                predict_result = rf.predict(valid_x)
            predict_temp_df = pd.DataFrame({'result': predict_result},
                                           index=valid_x.index)
            predict_df = pd.concat([predict_df, predict_temp_df])
        else:
            rf = RandomForestRegressor(
                n_estimators=params['n_estimators'],
                criterion=params['criterion'],
                max_depth=params['max_depth'],
                min_samples_leaf=params['min_samples_leaf'],
                max_features=params['max_features'],
                bootstrap=params['bootstrap'],
                oob_score=params['oob_score'],
                n_jobs=params['n_jobs'],
                random_state=params['random_state'])
            rf.fit(train_x, train_y)
            predict_result = rf.predict(valid_x)
            predict_temp_df = pd.DataFrame({'result': predict_result},
                                           index=valid_x.index)
            predict_df = pd.concat([predict_df, predict_temp_df])
    debug(
        '++++++++++++++++++++random forest+++++++++++++++++++++++++++++++++++++++++++'
    )
    # Restore original row order so predictions align with `valid`.
    predict_df.sort_index(axis=0, inplace=True)
    valid[PREDICT_NAME] = predict_df['result']
    score = 0
    if predict_classifier_bin == True:
        score = roc_auc_score(valid[target], valid[PREDICT_NAME])
    elif predict_classifier_nominal:
        # Accuracy: fraction of exact label matches.
        valid['compare'] = valid.apply(lambda x: x[target] == x[PREDICT_NAME],
                                       axis=1)
        score = np.sum(valid['compare']) / len(valid[target])
    else:
        # R^2 computed as the squared Pearson correlation.
        score = np.square(
            np.corrcoef(valid[target], valid[PREDICT_NAME])[0, 1])
    trace('random forest: ' + target + ', score: ' + str(score))
    if test_dataframe is None:
        return score
    else:
        # NOTE(review): test_dataframe is predicted with the last-fold model
        # and is NOT one-hot encoded like the training data -- looks like the
        # caller must pre-process it identically; confirm against call sites.
        test_prediction = rf.predict(test_dataframe)
        return test_prediction, score