def filter_linear_regression(dataframe, target, threshold=0.001):
    """
    Filter features with low weight when estimate with linear regression model.
    
    Parameters
    ----------
    dataframe : pandas.Dataframe
        dataframe to process
    target : string        
        Feature name, as predict result.
    threshold : float
        Between 0 and 1. Filter features with weight below the value.
    
    Return
    -------
    dataframe after process
    """
    from sklearn.linear_model import LassoCV
    from sklearn import preprocessing

    categorical_feats = dataframe.select_dtypes('object').columns.tolist()
    if target in categorical_feats:
        categorical_feats.remove(target)

    # Label-encode categorical features so the linear model can consume them.
    for col in categorical_feats:
        lb = preprocessing.LabelEncoder()
        dataframe[col] = lb.fit_transform(dataframe[col].astype(str))

    # Impute missing values with the column mean before fitting.
    import fast_impute
    for feature in dataframe.columns:
        dataframe, _ = fast_impute.impute_mean(dataframe, feature)

    x_train = dataframe.drop([target], axis=1)
    y_train = dataframe[target]
    column_names = x_train.columns.tolist()
    x_train = preprocessing.scale(x_train)

    # Cross-validated Lasso over a small alpha grid; n_jobs=-1 uses all cores.
    lr = LassoCV(cv=5, alphas=[0.01, 0.03, 0.05, 0.1, 0.3, 0.5, 1], n_jobs=-1)
    lr.fit(x_train, y_train)
    coef_df = pd.DataFrame()
    coef_df["feature"] = column_names
    coef_df["coef"] = lr.coef_
    coef_df["abs_coef"] = np.abs(lr.coef_)
    coef_df.sort_values('abs_coef', inplace=True, ascending=False)
    less_coef_features = coef_df.loc[coef_df['abs_coef'] < threshold,
                                     'feature']
    score = lr.score(x_train, y_train)
    dataframe.drop(less_coef_features, axis=1, inplace=True)
    trace('filter_linear_regression')
    trace('category features')
    trace(categorical_feats)
    trace('coefficient list')
    trace(coef_df)
    trace('score')
    trace(score)
    trace('regularization alpha_')
    alpha = lr.alpha_
    trace(str(alpha))
    trace('drop features')
    trace(less_coef_features)
    return dataframe
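
# Example usage (a sketch; `df` and the numeric target column 'price' are
# hypothetical names, assuming a dataframe with a numeric prediction target):
#
#     df = filter_linear_regression(df, 'price', threshold=0.001)
#
# Features whose absolute Lasso coefficient falls below the threshold are
# dropped and the reduced dataframe is returned.
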
def filter_logistic_regression(dataframe, target, threshold=0.001):
    """
    Filter features with low weight from compution with logistic regression model.
    
    Parameters
    ----------
    dataframe : pandas.Dataframe
        dataframe to process
    target : string        
        Feature name, as predict result.
    threshold : float
        Between 0 and 1. Filter features with weight below the value.
    
    Return
    -------
    dataframe after process
    """
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn import preprocessing

    categorical_feats = dataframe.select_dtypes('object').columns.tolist()
    if target in categorical_feats:
        categorical_feats.remove(target)

    # Label-encode categorical features so the model can consume them.
    for col in categorical_feats:
        lb = preprocessing.LabelEncoder()
        dataframe[col] = lb.fit_transform(dataframe[col].astype(str))

    # Impute missing values with the column mean before fitting.
    import fast_impute
    for feature in dataframe.columns:
        dataframe, _ = fast_impute.impute_mean(dataframe, feature)

    X = dataframe.drop([target], axis=1)
    Y = dataframe[target]
    column_names = X.columns
    X = preprocessing.scale(X)

    # saga supports the l1 penalty; Cs is the inverse-regularization grid.
    lr = LogisticRegressionCV(cv=5, penalty='l1', solver='saga', n_jobs=-1,
                              max_iter=1000, Cs=[0.03, 0.05, 0.1, 0.3])
    lr.fit(X, Y)
    score = lr.score(X, Y)
    coef_df = pd.DataFrame()
    coef_df["feature"] = column_names
    coef_df["coef"] = lr.coef_.T
    coef_df["abs_coef"] = np.abs(lr.coef_.T)
    coef_df.sort_values('abs_coef', inplace=True, ascending=False)
    less_coef_features = coef_df.loc[coef_df['abs_coef'] < threshold,
                                     'feature']
    dataframe.drop(less_coef_features, axis=1, inplace=True)
    trace('filter_logistic_regression')
    trace('category features')
    trace(categorical_feats)
    trace('coefficient list')
    trace(coef_df)
    trace('score')
    trace(score)
    trace('regularization C_')
    c = lr.C_
    trace(str(c))
    trace('drop features')
    trace(less_coef_features)
    return dataframe
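
# Example usage (a sketch; `df` and the binary target column 'churn' are
# hypothetical names):
#
#     df = filter_logistic_regression(df, 'churn', threshold=0.001)
#
# Coefficients come from an L1-penalized logistic regression, so irrelevant
# features tend to be driven to (near) zero and get dropped.
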
def int_module_linear_regression(params, dataframe, target,
                                 test_dataframe=None, n_folds=5):
    """
    Internal linear model with cross validation, supporting both
    classification and regression prediction. It is aimed at easy usage and
    reuse across all kinds of situations.

    Parameters
    ----------
    params : dictionary
        Parameter set, in dictionary format, for the linear model.
    dataframe : pandas.DataFrame
        Dataframe to process.
    target : string
        Name of the column used as the prediction target.
    test_dataframe : pandas.DataFrame, optional
        Dataframe to predict.
    n_folds : integer, optional
        Number of cross-validation folds; commonly 5 or 10.

    Returns
    -------
    Score of the model for the given params and dataframe: R2 for regression,
    ROC AUC for binary classification, accuracy for multi-class
    classification. If test_dataframe is given, the prediction on it is
    returned as well.
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import roc_auc_score
    from sklearn.preprocessing import MinMaxScaler
    PREDICT_NAME = 'predict'

    df = dataframe
    # Impute missing values column by column before modelling.
    for f_ in df.columns:
        df, _ = fast_impute.impute_mean(df, f_, target=target, intern=True)

    train_df = df.drop([target], axis=1)
    train_df, _ = one_hot_encoder(train_df, True)
    train_target = df[target]
    valid = df[[target]].copy()  # copy to avoid SettingWithCopyWarning
    valid[PREDICT_NAME] = 0

    min_max_scaler = MinMaxScaler()
    train_df = pd.DataFrame(min_max_scaler.fit_transform(train_df),
                            index=train_df.index)

    predict_classifier_bin, predict_classifier_nominal = _check_classifier(
        df, target)
    predict_df = pd.DataFrame({'result': []})
    folds = KFold(n_splits=n_folds, shuffle=True, random_state=1001)

    if predict_classifier_bin or predict_classifier_nominal:
        # params['penalty'] arrives as a nested dict that pairs a penalty
        # with a compatible solver, e.g. {'penalty': 'l1', 'solver': 'saga'}.
        params['solver'] = params['penalty'].get('solver')
        params['penalty'] = params['penalty']['penalty']
        # Parameters that must be integers may arrive as floats.
        for parameter_name in ['max_iter']:
            params[parameter_name] = int(params[parameter_name])

    lr = None
    for n_fold, (train_idx,
                 valid_idx) in enumerate(folds.split(train_df, train_target)):
        train_x, train_y = train_df.iloc[train_idx], train_target.iloc[
            train_idx]
        valid_x, valid_y = train_df.iloc[valid_idx], train_target.iloc[
            valid_idx]

        if predict_classifier_bin or predict_classifier_nominal:
            lr = LogisticRegression(penalty=params['penalty'],
                                    solver=params['solver'],
                                    tol=params['tol'],
                                    C=params['C'],
                                    class_weight=params['class_weight'],
                                    random_state=params['random_state'],
                                    max_iter=params['max_iter'],
                                    n_jobs=params['n_jobs'])

            lr.fit(train_x, train_y)
            if predict_classifier_bin:
                predict_result = lr.predict_proba(valid_x)[:, 1]
            else:
                predict_result = lr.predict(valid_x)
            predict_temp_df = pd.DataFrame({'result': predict_result},
                                           index=valid_x.index)
            predict_df = pd.concat([predict_df, predict_temp_df])

        else:
            # NOTE: the `normalize` argument was removed in scikit-learn 1.2;
            # on newer versions scale the inputs beforehand instead.
            lr = LinearRegression(fit_intercept=params['fit_intercept'],
                                  normalize=params['normalize'],
                                  n_jobs=params['n_jobs'])
            lr.fit(train_x, train_y)
            predict_result = lr.predict(valid_x)
            predict_temp_df = pd.DataFrame({'result': predict_result},
                                           index=valid_x.index)
            predict_df = pd.concat([predict_df, predict_temp_df])

        debug(
            '++++++++++++++++++++Linear+++++++++++++++++++++++++++++++++++++++++++'
        )

    predict_df.sort_index(axis=0, inplace=True)
    valid[PREDICT_NAME] = predict_df['result']
    score = 0

    if predict_classifier_bin:
        score = roc_auc_score(valid[target], valid[PREDICT_NAME])
    elif predict_classifier_nominal:
        valid['compare'] = valid.apply(lambda x: x[target] == x[PREDICT_NAME],
                                       axis=1)
        score = np.sum(valid['compare']) / len(valid[target])
    else:
        # R2 as the squared Pearson correlation between target and prediction.
        score = np.square(
            np.corrcoef(valid[target], valid[PREDICT_NAME])[0, 1])

    trace('linear regression: ' + target + ', score: ' + str(score))
    if test_dataframe is None:
        return score
    else:
        # NOTE: prediction uses the model fitted on the last CV fold; the
        # caller is expected to have encoded and scaled test_dataframe in the
        # same way as the training data.
        test_prediction = lr.predict(test_dataframe)
        return test_prediction, score
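
# Example usage (a sketch; the dataframe, target name, and parameter values
# are hypothetical -- the keys mirror what the function reads from `params`):
#
#     params = {
#         'penalty': {'penalty': 'l1', 'solver': 'saga'},
#         'tol': 1e-4, 'C': 0.1, 'class_weight': 'balanced',
#         'random_state': 1001, 'max_iter': 1000, 'n_jobs': -1,
#         # regression targets instead read these keys:
#         'fit_intercept': True, 'normalize': False,
#     }
#     score = int_module_linear_regression(params, df, 'churn', n_folds=5)
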
def int_module_knn(params, dataframe, target, test_dataframe=None, n_folds=5):
    """
    Internal KNN model with cross validation, supporting both classification 
    prediction and regression prediction. It's aimed for easy usage and reuse 
    for all kinds of situation.
    
    Parameters
    ----------
    params : dictionary
        Parameter set with dictionary format for KNN model.
    dataframe : pandas.Dataframe
        Dataframe to process.
    target : string
        Feature name, target identifies some column which is used for 
        prediction analyze.
    test_dataframe : pandas.Dataframe, optional
        Dataframe to predict.
    n_folds : integer, optional
        Cross validation times when run KNN model with given dataframe. It's 
        often 5 or 10.
    Output
    -------
    Score when run KNN model with given param and dataframe. For regression 
    prediction, score as R2; for binary classification, score as ROC; for 
    multi-classification, score as accuracy.
    Test result from prediction if specify test_dataframe.  
    """
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.metrics import roc_auc_score
    from sklearn.preprocessing import MinMaxScaler
    PREDICT_NAME = 'predict'

    df = dataframe
    # Impute missing values column by column before modelling.
    for f_ in df.columns:
        df, _ = fast_impute.impute_mean(df, f_, target=target, intern=True)
    # Make sure parameters that need to be integers are integers
    for parameter_name in ['n_neighbors', 'leaf_size', 'p']:
        params[parameter_name] = int(params[parameter_name])

    train_df = df.drop([target], axis=1)
    train_df, _ = one_hot_encoder(train_df, True)

    # Pre-select features by importance so KNN distances are not dominated by
    # uninformative columns.
    train2 = train_df.dropna(axis=0)
    train2 = pd.concat([train2, df[target]], axis=1)

    df_importance = explore_importance_features(train2, target)
    feature_importance = df_importance.loc[df_importance['importance'] > 0.001,
                                           'feature']
    feature_list = feature_importance.values
    debug(feature_list)

    train_df = train_df[feature_list]
    min_max_scaler = MinMaxScaler()
    train_df = pd.DataFrame(min_max_scaler.fit_transform(train_df),
                            index=train_df.index)

    train_target = df[target]
    valid = df[[target]].copy()  # copy to avoid SettingWithCopyWarning
    valid[PREDICT_NAME] = 0

    predict_classifier_bin, predict_classifier_nominal = _check_classifier(
        df, target)
    predict_df = pd.DataFrame({'result': []})
    folds = KFold(n_splits=n_folds, shuffle=True, random_state=1001)

    knn = None
    for n_fold, (train_idx,
                 valid_idx) in enumerate(folds.split(train_df, train_target)):
        train_x, train_y = train_df.iloc[train_idx], train_target.iloc[
            train_idx]
        valid_x, valid_y = train_df.iloc[valid_idx], train_target.iloc[
            valid_idx]

        if predict_classifier_bin or predict_classifier_nominal:
            knn = KNeighborsClassifier(n_neighbors=params['n_neighbors'],
                                       weights=params['weights'],
                                       algorithm=params['algorithm'],
                                       leaf_size=params['leaf_size'],
                                       p=params['p'],
                                       n_jobs=params['n_jobs'])

            knn.fit(train_x, train_y)
            if predict_classifier_bin:
                predict_result = knn.predict_proba(valid_x)[:, 1]
            else:
                predict_result = knn.predict(valid_x)
            predict_temp_df = pd.DataFrame({'result': predict_result},
                                           index=valid_x.index)
            predict_df = pd.concat([predict_df, predict_temp_df])

        else:
            knn = KNeighborsRegressor(n_neighbors=params['n_neighbors'],
                                      weights=params['weights'],
                                      algorithm=params['algorithm'],
                                      leaf_size=params['leaf_size'],
                                      p=params['p'],
                                      n_jobs=params['n_jobs'])

            knn.fit(train_x, train_y)
            predict_result = knn.predict(valid_x)
            predict_temp_df = pd.DataFrame({'result': predict_result},
                                           index=valid_x.index)
            predict_df = pd.concat([predict_df, predict_temp_df])

        debug(
            '++++++++++++++++++++ KNN +++++++++++++++++++++++++++++++++++++++++'
        )

    predict_df.sort_index(axis=0, inplace=True)
    valid[PREDICT_NAME] = predict_df['result']
    score = 0

    if predict_classifier_bin:
        score = roc_auc_score(valid[target], valid[PREDICT_NAME])
    elif predict_classifier_nominal:
        valid['compare'] = valid.apply(lambda x: x[target] == x[PREDICT_NAME],
                                       axis=1)
        score = np.sum(valid['compare']) / len(valid[target])
    else:
        # R2 as the squared Pearson correlation between target and prediction.
        score = np.square(
            np.corrcoef(valid[target], valid[PREDICT_NAME])[0, 1])

    trace('knn: ' + target + ', score: ' + str(score))
    if test_dataframe is None:
        return score
    else:
        # NOTE: prediction uses the model fitted on the last CV fold; the
        # caller is expected to have encoded and scaled test_dataframe in the
        # same way as the training data.
        test_prediction = knn.predict(test_dataframe[feature_list])
        return test_prediction, score
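
# Example usage (a sketch; parameter values are hypothetical -- the keys
# mirror what the function reads from `params`):
#
#     params = {'n_neighbors': 15, 'weights': 'distance', 'algorithm': 'auto',
#               'leaf_size': 30, 'p': 2, 'n_jobs': -1}
#     score = int_module_knn(params, df, 'price', n_folds=5)
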
def int_module_random_forest(params, dataframe, target, test_dataframe=None,
                             n_folds=5):
    """
    Internal random forest model with cross validation, supporting both
    classification and regression prediction. It is aimed at easy usage and
    reuse across all kinds of situations.

    Parameters
    ----------
    params : dictionary
        Parameter set, in dictionary format, for the random forest model.
    dataframe : pandas.DataFrame
        Dataframe to process.
    target : string
        Name of the column used as the prediction target.
    test_dataframe : pandas.DataFrame, optional
        Dataframe to predict.
    n_folds : integer, optional
        Number of cross-validation folds; commonly 5 or 10.

    Returns
    -------
    Score of the model for the given params and dataframe: R2 for regression,
    ROC AUC for binary classification, accuracy for multi-class
    classification. If test_dataframe is given, the prediction on it is
    returned as well.
    """

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import roc_auc_score
    PREDICT_NAME = 'predict'

    df = dataframe

    # Parameters that must be integers may arrive as floats.
    for parameter_name in ['n_estimators', 'max_depth', 'min_samples_leaf']:
        params[parameter_name] = int(params[parameter_name])
    # Impute missing values column by column before modelling.
    for f_ in df.columns:
        df, _ = fast_impute.impute_mean(df, f_, target=target, intern=True)

    train_df = df.drop([target], axis=1)
    train_df, _ = one_hot_encoder(train_df, True)
    train_target = df[target]
    valid = df[[target]].copy()  # copy to avoid SettingWithCopyWarning
    valid[PREDICT_NAME] = 0

    predict_classifier_bin, predict_classifier_nominal = _check_classifier(
        df, target)
    predict_df = pd.DataFrame({'result': []})
    folds = KFold(n_splits=n_folds, shuffle=True, random_state=1001)

    rf = None
    for n_fold, (train_idx,
                 valid_idx) in enumerate(folds.split(train_df, train_target)):
        train_x, train_y = train_df.iloc[train_idx], train_target.iloc[
            train_idx]
        valid_x, valid_y = train_df.iloc[valid_idx], train_target.iloc[
            valid_idx]

        if predict_classifier_bin or predict_classifier_nominal:
            rf = RandomForestClassifier(class_weight=params['class_weight'],
                                        n_estimators=params['n_estimators'],
                                        criterion=params['criterion'],
                                        max_depth=params['max_depth'],
                                        min_samples_leaf=params['min_samples_leaf'],
                                        max_features=params['max_features'],
                                        bootstrap=params['bootstrap'],
                                        oob_score=params['oob_score'],
                                        n_jobs=params['n_jobs'],
                                        random_state=params['random_state'])

            rf.fit(train_x, train_y)
            if predict_classifier_bin:
                predict_result = rf.predict_proba(valid_x)[:, 1]
            else:
                predict_result = rf.predict(valid_x)
            predict_temp_df = pd.DataFrame({'result': predict_result},
                                           index=valid_x.index)
            predict_df = pd.concat([predict_df, predict_temp_df])

        else:
            rf = RandomForestRegressor(n_estimators=params['n_estimators'],
                                       criterion=params['criterion'],
                                       max_depth=params['max_depth'],
                                       min_samples_leaf=params['min_samples_leaf'],
                                       max_features=params['max_features'],
                                       bootstrap=params['bootstrap'],
                                       oob_score=params['oob_score'],
                                       n_jobs=params['n_jobs'],
                                       random_state=params['random_state'])
            rf.fit(train_x, train_y)
            predict_result = rf.predict(valid_x)
            predict_temp_df = pd.DataFrame({'result': predict_result},
                                           index=valid_x.index)
            predict_df = pd.concat([predict_df, predict_temp_df])

        debug(
            '++++++++++++++++++++random forest+++++++++++++++++++++++++++++++++++++++++++'
        )

    predict_df.sort_index(axis=0, inplace=True)
    valid[PREDICT_NAME] = predict_df['result']
    score = 0
    if predict_classifier_bin:
        score = roc_auc_score(valid[target], valid[PREDICT_NAME])
    elif predict_classifier_nominal:
        valid['compare'] = valid.apply(lambda x: x[target] == x[PREDICT_NAME],
                                       axis=1)
        score = np.sum(valid['compare']) / len(valid[target])
    else:
        # R2 as the squared Pearson correlation between target and prediction.
        score = np.square(
            np.corrcoef(valid[target], valid[PREDICT_NAME])[0, 1])

    trace('random forest: ' + target + ', score: ' + str(score))
    if test_dataframe is None:
        return score
    else:
        # NOTE: prediction uses the model fitted on the last CV fold; the
        # caller is expected to have one-hot encoded test_dataframe in the
        # same way as the training data.
        test_prediction = rf.predict(test_dataframe)
        return test_prediction, score
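
# Example usage (a sketch; parameter values are hypothetical -- the keys
# mirror what the function reads from `params`):
#
#     params = {'class_weight': 'balanced', 'n_estimators': 200,
#               'criterion': 'gini', 'max_depth': 8, 'min_samples_leaf': 20,
#               'max_features': 'sqrt', 'bootstrap': True, 'oob_score': False,
#               'n_jobs': -1, 'random_state': 1001}
#     test_pred, score = int_module_random_forest(params, train_df, 'churn',
#                                                 test_dataframe=test_df)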