コード例 #1
0
def get_feat_importance_logreg(mdl_logreg, x, y):
    """
    Compute feature importances for a logistic regression model.

    The importances are obtained through a ``FeatureImportances``
    visualizer, the same way it would be done for a random forest.

    Parameters
    ----------
    mdl_logreg : sklearn classifier
        Classifier handed to the visualizer.
    x : pandas data frame
        Features.
    y : pandas data frame
        Label.

    Returns
    -------
    feat_importance : array-like
        ``feature_importances_`` of the fitted visualizer, in reversed order.
    visualizer : FeatureImportances
        The fitted visualizer; its axes have been removed.

    """
    viz = FeatureImportances(mdl_logreg, title='Logistic regression')
    viz.fit(x, y)
    # Only the numbers are of interest here, so drop the axes that were drawn.
    viz.ax.remove()
    reversed_importances = viz.feature_importances_[::-1]
    return reversed_importances, viz
コード例 #2
0
def create_feature_importance_chart(regressor, X_train, y_train):
    """Create feature importance chart.

    Tip:
        Check Sklearn-Neptune integration
        `documentation <https://docs-beta.neptune.ai/essentials/integrations/machine-learning-frameworks/sklearn>`_
        for the full example.

    Args:
        regressor (:obj:`regressor`):
            | Fitted sklearn regressor object
        X_train (:obj:`ndarray`):
            | Training data matrix
        y_train (:obj:`ndarray`):
            | The regression target for training

    Returns:
        ``neptune.types.File`` object that you can assign to run's ``base_namespace``,
        or ``None`` when the chart could not be created (the error is printed).

    Raises:
        AssertionError: If ``regressor`` is not recognised by ``is_regressor``.

    Examples:
        .. code:: python3

            import neptune.new.integrations.sklearn as npt_utils

            rfr = RandomForestRegressor()
            rfr.fit(X_train, y_train)

            run = neptune.init(project='my_workspace/my_project')
            run['visuals/feature_importance'] = npt_utils.create_feature_importance_chart(rfr, X_train, y_train)
    """
    assert is_regressor(regressor), 'regressor should be sklearn regressor.'

    chart = None
    fig = None

    try:
        fig, ax = plt.subplots()
        visualizer = FeatureImportances(regressor, is_fitted=True, ax=ax)
        visualizer.fit(X_train, y_train)
        visualizer.finalize()

        chart = neptune.types.File.as_image(fig)
    except Exception as e:
        # Best effort by design: a failed chart must not break the caller.
        print('Did not log feature importance chart. Error: {}'.format(e))
    finally:
        # Close the figure on the failure path as well; otherwise every failed
        # call would leave an open figure registered with pyplot.
        if fig is not None:
            plt.close(fig)

    return chart
コード例 #3
0
def log_feature_importance_chart(regressor, X_train, y_train, experiment=None):
    """Log feature importance chart.

    Make sure you created an experiment by using ``neptune.create_experiment()`` before you use this method.

    Tip:
        Check `Neptune documentation <https://docs.neptune.ai/integrations/scikit_learn.html>`_ for the full example.

    Args:
        regressor (:obj:`regressor`):
            | Fitted sklearn regressor object
        X_train (:obj:`ndarray`):
            | Training data matrix
        y_train (:obj:`ndarray`):
            | The regression target for training
        experiment (:obj:`neptune.experiments.Experiment`, optional, default is ``None``):
            | Neptune ``Experiment`` object to control to which experiment you log the data.
            | If ``None``, log to currently active, and most recent experiment.

    Returns:
        ``None``. If the chart cannot be created or logged, the error is printed
        and nothing is logged.

    Raises:
        AssertionError: If ``regressor`` is not recognised by ``is_regressor``.

    Examples:
        .. code:: python3

            rfr = RandomForestRegressor()
            rfr.fit(X_train, y_train)

            neptune.init('my_workspace/my_project')
            neptune.create_experiment()

            log_feature_importance_chart(rfr, X_train, y_train)
    """
    assert is_regressor(regressor), 'regressor should be sklearn regressor.'
    exp = _validate_experiment(experiment)

    fig = None
    try:
        fig, ax = plt.subplots()
        visualizer = FeatureImportances(regressor, is_fitted=True, ax=ax)
        visualizer.fit(X_train, y_train)
        visualizer.finalize()
        exp.log_image('charts_sklearn', fig, image_name='Feature Importance')
    except Exception as e:
        # Best effort by design: a failed chart must not break the caller.
        print('Did not log feature importance chart. Error: {}'.format(e))
    finally:
        # Close the figure on the failure path as well; otherwise every failed
        # call would leave an open figure registered with pyplot.
        if fig is not None:
            plt.close(fig)
コード例 #4
0
def show_FeatureImportances(
    est: BaseEstimator,
    conf_mat_labels: List,
    X: DataFrame,
    y: Series,
    fig_size: Tuple = (8, 8),
    savefig: Path = Path().cwd() / "reports" / "figures" / "feats_imps.png",
    save_pref: bool = False,
) -> None:
    """Fit and display a stacked feature-importance plot, optionally saving it.

    Args:
        est: Estimator handed to the ``FeatureImportances`` visualizer.
        conf_mat_labels: Passed to the visualizer as its ``labels`` argument.
        X: Feature data used to fit the visualizer.
        y: Target data used to fit the visualizer.
        fig_size: Size of the matplotlib figure that is created.
        savefig: Destination of the saved image. The default is built from
            the working directory at the time this function is defined.
        save_pref: When true, the figure is written to ``savefig`` unless a
            file already exists at that path.
    """
    figure, axes = plt.subplots(figsize=fig_size)
    visualizer_options = dict(
        stack=True,
        labels=conf_mat_labels,
        relative=False,
        ax=axes,
    )
    visualizer = FeatureImportances(est, **visualizer_options)
    visualizer.fit(X, y)
    visualizer.show()

    # Nothing to write unless saving was requested and the target is not
    # already present on disk (an existing file is never overwritten).
    if not save_pref or savefig.is_file():
        return
    figure.savefig(savefig, bbox_inches="tight", dpi=300)
コード例 #5
0
    logistic_score = balanced_accuracy_score(y_test, y_pred_Logisticregression.round(), adjusted=False)
    #logistic_score_acc=accuracy_score(y_test, y_pred_Logisticregression)
    scores.append(logistic_score)
    print(logistic_score)
    #print(logistic_score_acc)


    
    
    #from sklearn.metrics import accuracy_score
    #logistic_score = balanced_accuracy_score(test_scores_encoded, y_pred_Logisticregression.round(), adjusted=False)
    #logistic_score_acc=accuracy_score(test_scores_encoded, y_pred_Logisticregression.round(), normalize=False)
    #print(logistic_score)
    #print(logistic_score_acc)
# Collect the accumulated scores into a two-column table, one row per column
# of y_location_trains (assumes `scores` holds exactly one entry per column).
data_score = pd.DataFrame(columns=['Commodity', 'score'])
data_score['Commodity'] = y_location_trains.columns
data_score['score'] = scores
print(data_score)
# NOTE(review): absolute, user-specific output path -- not portable to other machines.
data_score.to_csv('/Users/monalisa/Downloads/mmai823-project-master/out/linear_scores.csv')


print(X_train.columns)
# Feature importance
#viz_feat = FeatureImportances(rfc, labels=X_train.columns, relative=False)
from matplotlib import pyplot as plt
# IPython/Jupyter magic: only valid inside a notebook, a syntax error in a plain .py file.
%matplotlib inline
# Plot the feature importances of `logistic`, labelled with the training column names.
viz_features = FeatureImportances(logistic, labels=X_train.columns)
viz_features.fit(X_train, y_train)
viz_features.show()
# NOTE(review): tight_layout() runs after show(); confirm it still has an effect here.
plt.tight_layout()
コード例 #6
0
)  #Citation - xgboost.readthedocs.io xgboost API reference documentation
# Recursive feature elimination around `estimator`, keeping 40 features.
selector = RFE(
    estimator, verbose=0, n_features_to_select=40
)  #Citation - scikit-learn.org sklearn API reference documentation

selector.fit(X, y)

# Select the best features: map the selected column indices back to column names
selected = [X.columns[i] for i in selector.get_support(indices=True)
            ]  #Citation - scikit-learn.org sklearn API reference documentation

# Bare expression: displays the list in a notebook, has no effect in a script.
selected

# From here on X contains only the selected columns.
X = X[selected]

plt.figure(figsize=(11, 9))
ax = plt.gca(
)  #Citation - from matplotlib.org matplotlib API reference documentation

# Title case the feature for better display and create the visualizer
model = RandomForestClassifier(
    n_jobs=-1
)  #Citation - scikit-learn.org sklearn API reference documentation
labels = list(map(lambda s: s.title(), X.columns))
viz = FeatureImportances(
    model, labels=labels,
    relative=True)  #Citation - scikit-yb.org yellowbrick documentation

# Fit and show the feature importances
viz.fit(X, y)
# NOTE(review): the axes are handed to show() instead of the FeatureImportances
# constructor -- confirm that show() actually honours an `ax` keyword.
viz.show(ax=ax)
コード例 #7
0
def importances():
    """Fit a random-forest ``FeatureImportances`` visualizer and hand it to ``savefig``.

    The data comes from ``load_occupancy()``, the axes from ``newfig()``, and
    the fitted visualizer is passed to ``savefig`` under the name
    "feature_importances".
    """
    features, target = load_occupancy()
    forest = RandomForestClassifier()
    axes = newfig()
    visualizer = FeatureImportances(forest, ax=axes)
    visualizer.fit(features, target)
    savefig(visualizer, "feature_importances")
コード例 #8
0
    display(Image(data=graph.pipe(format='png')))
    
# Task 8: feature importance and evaluation metrics

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from yellowbrick.model_selection import FeatureImportances

plt.rcParams['figure.figsize'] = (12, 8)
plt.style.use("ggplot")

# Random forest: 100 trees of depth <= 5.
# Fixes relative to the earlier version of this cell:
#   * class name was misspelled (RandomForestClassifer -> NameError);
#   * `warn_start` is not a parameter (the real one is `warm_start`);
#   * `bootstrap` must be the boolean True, not the string 'True';
#   * `max_feature='auto'` used a non-existent keyword; the intended
#     `max_features` setting matches the classifier default, so it is omitted;
#   * arguments that only restated defaults (and `min_impurity_split`, which
#     newer scikit-learn releases no longer accept) are dropped.
rf = RandomForestClassifier(bootstrap=True, criterion='gini', max_depth=5,
                            n_estimators=100, n_jobs=-1, oob_score=False,
                            random_state=1, verbose=0, warm_start=False)
viz = FeatureImportances(rf)
viz.fit(X_train, y_train)
viz.show();  # trailing ';' keeps a notebook from echoing the return value


# Single decision tree of depth <= 3.
# `DecisionForestClassifer` does not exist; the keyword set (splitter,
# presort, ...) is that of DecisionTreeClassifier. `splitters` is corrected to
# `splitter`; `max_feature='None'` (wrong keyword, string instead of None) and
# the removed `presort` / `min_impurity_split` arguments are dropped because
# the defaults already express them.
dt = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=0,
                            splitter='best')

viz = FeatureImportances(dt)
viz.fit(X_train, y_train)
viz.show();
コード例 #9
0
def train_model(context: MLClientCtx,
                dataset: DataItem,
                model_pkg_class: str,
                label_column: str = "label",
                train_validation_size: float = 0.75,
                sample: float = 1.0,
                models_dest: str = "models",
                test_set_key: str = "test_set",
                plots_dest: str = "plots",
                dask_key: str = "dask_key",
                dask_persist: bool = False,
                scheduler_key: str = '',
                file_ext: str = "parquet",
                random_state: int = 42) -> None:
    """
    Train a sklearn classifier with Dask

    Reads the dataset as a dask dataframe, keeps the numeric columns, encodes
    the label, splits and scales the data, fits the configured classifier
    under the joblib "dask" backend, logs evaluation plots and results, and
    finally logs the model, scaler, label encoder and the held-out test set.

    :param context:                 Function context.
    :param dataset:                 Raw data file.
    :param model_pkg_class:         Model to train, e.g, "sklearn.ensemble.RandomForestClassifier",
                                    or json model config.
    :param label_column:            (label) Ground-truth y labels.
    :param train_validation_size:   (0.75) Train validation set proportion out of the full dataset.
    :param sample:                  (1.0) Fraction of the dataset to sample (passed as `frac`); rows are
                                    randomized by the sampling.
    :param models_dest:             (models) Models subfolder on artifact path.
    :param test_set_key:            (test_set) Mlrun db key of held out data in artifact store.
    :param plots_dest:              (plots) Plot subfolder on artifact path.
    :param dask_key:                (dask key) Key of dataframe in dask client "datasets" attribute.
    :param dask_persist:            (False) Should the data be persisted (through the `client.persist`)
    :param scheduler_key:           (scheduler) Dask scheduler configuration, json also logged as an artifact.
    :param file_ext:                (parquet) format for test_set_key hold out data
    :param random_state:            (42) sklearn seed

    :raises Exception:              If the numeric columns contain any NA value.
    """

    # Connect to the given scheduler when one is configured, otherwise create a
    # default dask Client.
    # NOTE(review): `client`, `dask_key` and `dask_persist` are not referenced
    # again in this function body.
    if scheduler_key:
        client = Client(scheduler_key)

    else:
        client = Client()

    context.logger.info("Read Data")
    df = dataset.as_df(df_module=dd)

    context.logger.info("Prep Data")
    # Keep numeric columns only.
    # NOTE(review): a non-numeric label column would be dropped here as well.
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    df = df.select_dtypes(include=numerics)

    # Missing values are not imputed; the run is aborted instead.
    if df.isna().any().any().compute() == True:
        raise Exception('NAs valus found')

    # Column names (label included); reused for the plot labels and for the
    # logged test set.
    df_header = df.columns

    df = df.sample(frac=sample).reset_index(drop=True)
    encoder = LabelEncoder()
    encoder = encoder.fit(df[label_column])
    X = df.drop(label_column, axis=1).to_dask_array(lengths=True)
    y = encoder.transform(df[label_column])

    classes = df[label_column].drop_duplicates()  # no unique values in dask
    classes = [str(i) for i in classes]

    context.logger.info("Split and Train")
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, train_size=train_validation_size, random_state=random_state)

    # The scaler is fitted on the training split only and then applied to both splits.
    scaler = StandardScaler()
    scaler = scaler.fit(X_train)
    X_train_transformed = scaler.transform(X_train)
    X_test_transformed = scaler.transform(X_test)

    # Model configuration; the "META", "CLASS" and "FIT" sections are used below.
    model_config = gen_sklearn_model(model_pkg_class,
                                     context.parameters.items())

    model_config["FIT"].update({"X": X_train_transformed, "y": y_train})

    ClassifierClass = create_class(model_config["META"]["class"])

    model = ClassifierClass(**model_config["CLASS"])

    # Fit with joblib's "dask" backend active.
    with joblib.parallel_backend("dask"):

        model = model.fit(**model_config["FIT"])

    artifact_path = context.artifact_subpath(models_dest)

    # NOTE(review): plots_path is computed but not used in this function body.
    plots_path = context.artifact_subpath(models_dest, plots_dest)

    context.logger.info("Evaluate")
    extra_data_dict = {}
    for report in (ROCAUC, ClassificationReport, ConfusionMatrix):

        report_name = str(report.__name__)
        # Reset pyplot state so each report is drawn on a fresh figure.
        plt.cla()
        plt.clf()
        plt.close()

        viz = report(model, classes=classes, per_class=True, is_fitted=True)
        viz.fit(X_train_transformed,
                y_train)  # Fit the training data to the visualizer
        viz.score(X_test_transformed,
                  y_test.compute())  # Evaluate the model on the test data

        plot = context.log_artifact(PlotArtifact(report_name,
                                                 body=viz.fig,
                                                 title=report_name),
                                    db_key=False)
        # NOTE(review): the key is str(report), i.e. the string form of the
        # class object, not report_name -- confirm that this is intended.
        extra_data_dict[str(report)] = plot

        if report_name == 'ROCAUC':
            context.log_results({
                "micro": viz.roc_auc.get("micro"),
                "macro": viz.roc_auc.get("macro")
            })

        elif report_name == 'ClassificationReport':
            # One result per (score name, class) pair, keyed "<score>-<class>".
            for score_name in viz.scores_:
                for score_class in viz.scores_[score_name]:

                    context.log_results({
                        score_name + "-" + score_class:
                        viz.scores_[score_name].get(score_class)
                    })

    # Feature importances, labelled with every column name except the label column.
    viz = FeatureImportances(model,
                             classes=classes,
                             per_class=True,
                             is_fitted=True,
                             labels=df_header.delete(
                                 df_header.get_loc(label_column)))
    viz.fit(X_train_transformed, y_train)
    # NOTE(review): unlike in the loop above, y_test is passed without .compute().
    viz.score(X_test_transformed, y_test)

    plot = context.log_artifact(PlotArtifact("FeatureImportances",
                                             body=viz.fig,
                                             title="FeatureImportances"),
                                db_key=False)
    extra_data_dict[str("FeatureImportances")] = plot

    plt.cla()
    plt.clf()
    plt.close()

    context.logger.info("Log artifacts")
    # Same values as computed before the evaluation step.
    artifact_path = context.artifact_subpath(models_dest)

    plots_path = context.artifact_subpath(models_dest, plots_dest)

    context.set_label('class', model_pkg_class)

    # Log the serialized model together with the plots and the results collected so far.
    context.log_model("model",
                      body=dumps(model),
                      artifact_path=artifact_path,
                      model_file="model.pkl",
                      extra_data=extra_data_dict,
                      metrics=context.results,
                      labels={"class": model_pkg_class})

    # The fitted scaler and label encoder are logged next to the model.
    context.log_artifact("standard_scaler",
                         body=dumps(scaler),
                         artifact_path=artifact_path,
                         model_file="scaler.gz",
                         label="standard_scaler")

    context.log_artifact("label_encoder",
                         body=dumps(encoder),
                         artifact_path=artifact_path,
                         model_file="encoder.gz",
                         label="label_encoder")

    # Held-out set: the unscaled test features with the encoded labels appended
    # as the last column.
    # NOTE(review): the columns are named from df_header, which presumably
    # requires the label column to be the last column of the dataset -- confirm.
    df_to_save = delayed(np.column_stack)((X_test, y_test)).compute()
    context.log_dataset(
        test_set_key,
        df=pd.DataFrame(df_to_save,
                        columns=df_header),  # improve log dataset ability
        format=file_ext,
        index=False,
        labels={"data-type": "held-out"},
        artifact_path=context.artifact_subpath('data'))

    context.logger.info("Done!")
コード例 #10
0
ファイル: main.py プロジェクト: nixonjin/BigDataTraining
    # Discrimination threshold
    visualizer = DiscriminationThreshold(model)
    visualizer.fit(X_train, y_train)
    visualizer.show()

    # Learning curve
    visualizer = LearningCurve(model, scoring='f1_weighted')
    visualizer.fit(X_train, y_train)
    visualizer.show()

    # Cross-validation scores
    visualizer = CVScores(model, cv=5, scoring='f1_weighted')
    visualizer.fit(X_train, y_train)
    visualizer.show()

    # Feature importances
    visualizer = FeatureImportances(model)
    visualizer.fit(X_train, y_train)
    visualizer.show()

    # Recursive feature elimination with cross-validation
    visualizer = RFECV(model, cv=5, scoring='f1_weighted')
    visualizer.fit(X_train, y_train)
    visualizer.show()

    # Validation curve over max_depth (the original note called this "feature selection")
    visualizer = ValidationCurve(model,
                                 param_name="max_depth",
                                 param_range=np.arange(1, 11),
                                 cv=5,
                                 scoring="f1_weighted")
    visualizer.fit(X_train, y_train)
コード例 #11
0
# Error and fit metrics for the random forest, each rounded to three decimals.
print(
    f"Mean Square Error for Random Forest = {round(mse_random_forest, 3)}",
    f"Root Mean Square Error for Random Forest = {round(rmse_random_forest, 3)}",
    f"R^2(coefficient of determination) on training set = {round(score_random_forest_train, 3)}",
    f"R^2(coefficient of determination) on testing set = {round(score_random_forest_test, 3)}",
    sep="\n",
)

# Classification report and confusion matrix for the test predictions,
# each under its own heading line.
print("Classification Report",
      classification_report(y_test, pred_random_forest),
      sep="\n")
print("Confusion Matrix:",
      confusion_matrix(y_test, pred_random_forest),
      sep="\n")

# Feature importances of the random forest on a 16x10 figure.
plt.figure(figsize=(16, 10))
viz = FeatureImportances(random_forest)
viz.fit(x_train, y_train)
viz.show()


# Plot learning curve
def plot_learning_curve(estimator,
                        title,
                        x,
                        y,
                        ylim=None,
                        cv=None,
                        n_jobs=-1,
                        train_sizes=np.linspace(.1, 1.0, 5)):

    plt.figure()
コード例 #12
0
# One-hot encode the top ingredients: add one 0/1 column per entry of
# top_ingredients (per the original note, those with a frequency count above 50),
# set to 1 when the ingredient occurs in the row's "ingredients" value.
for ingredient in list(top_ingredients):
    data[ingredient] = data["ingredients"].apply(lambda x: 1
                                                 if ingredient in x else 0)

# Change cuisine type labels to number labels
le = LabelEncoder()
data['cuisine_nums'] = le.fit_transform(data['cuisine'])

# features: everything except the raw ingredients, the id and both label columns
X = data.drop(['ingredients', 'id', 'cuisine', 'cuisine_nums'], axis=1)
# target: the numerically encoded cuisine
y = data['cuisine_nums']

##### eliminate lowest importance features ######
# A random forest wrapped in a FeatureImportances visualizer provides the ranking.
viz = FeatureImportances(
    RandomForestClassifier(n_estimators=100, random_state=9, n_jobs=-1))
viz.fit(X, y)
ranked_features = viz.features_
# NOTE(review): feature_importance_values is not used in the visible code.
feature_importance_values = viz.feature_importances_
## Number of features to keep from the 978 features (good features)
n = 600
# Keep the last n entries of the ranking; presumably features_ is ordered from
# least to most important, so these are the n most important -- confirm.
good_features = ranked_features[-n:]
X = X[good_features]

# Candidate classifiers
lr = LogisticRegression(solver='liblinear', multi_class='ovr', n_jobs=-1)
lda = LinearDiscriminantAnalysis()
rf = RandomForestClassifier(n_estimators=100, random_state=9, n_jobs=-1)
svc = LinearSVC(random_state=1, C=0.4, penalty="l2", dual=False)
bnb = BernoulliNB()
sgd = SGDClassifier(learning_rate='optimal',
                    random_state=1,
コード例 #13
0
# Accuracy of the logistic-regression predictions on the test split
from sklearn.metrics import accuracy_score
logreg_acc_score = accuracy_score(y_test, logreg_pred)
print(logreg_acc_score)

#%%
# F1 score of the same predictions
from sklearn.metrics import f1_score
logreg_f1_score = f1_score(y_test, logreg_pred)
print(logreg_f1_score)

#%%
# Yellowbrick's confusion_matrix quick method (not the sklearn.metrics function
# of the same name): it takes the model plus the train and test splits.
from yellowbrick.classifier import confusion_matrix
confusion_matrix(logreg, X_train, y_train, X_test, y_test, classes=class_labels)

#%%
from yellowbrick.model_selection import FeatureImportances
viz = FeatureImportances(logreg, labels=X_train.columns, relative=False)
# NOTE(review): labels come from X_train but the visualizer is fitted on X, y --
# confirm that fitting on the full data rather than the training split is intended.
viz.fit(X, y)
viz.show()

#%%
# Importances indexed by feature name; show the 10 smallest values.
fimp = pd.Series(viz.feature_importances_, index = viz.features_)
fimp.sort_values(ascending=True).head(10).index
#%%
# Same result as the previous cell: sort_values() sorts in ascending order by default.
fimp.sort_values().head(10).index

#%%
# Recursive feature elimination down to a single feature, one feature per step;
# the resulting ranking_ is kept in `ranking`.
from sklearn.feature_selection import RFE
rfe = RFE(estimator=logreg, n_features_to_select=1, step=1)
rfe.fit(X, y)
ranking = rfe.ranking_
#%%
コード例 #14
0
# Three feature/target variants of the popularity regression problem:
#   1: full data, `features`
#   2: data_without_old, `features`
#   3: data_without_old, `features_2`
X_1 = data[features]
y_1 = data.popularity

X_2 = data_without_old[features]
y_2 = data_without_old.popularity

X_3 = data_without_old[features_2]
y_3 = data_without_old.popularity

# 90/10 train/test split with a fixed seed for each variant
train_X_1, test_X_1, train_y_1, test_y_1 = train_test_split(X_1, y_1, test_size=0.1, random_state=0)
train_X_2, test_X_2, train_y_2, test_y_2 = train_test_split(X_2, y_2, test_size=0.1, random_state=0)
train_X_3, test_X_3, train_y_3, test_y_3 = train_test_split(X_3, y_3, test_size=0.1, random_state=0)


# Fitting only the wrapped regressor is not enough: the importances are
# computed and drawn by the visualizer's own fit(), so a visualizer that was
# never fitted has nothing to show. Each visualizer is therefore fitted after
# its model, and shown before the next one is fitted, because visualizers
# created without an explicit `ax` draw on the current matplotlib axes.
rfr_model_1 = RandomForestRegressor()
feature_importance_1 = FeatureImportances(rfr_model_1)
rfr_model_1.fit(train_X_1, train_y_1)
feature_importance_1.fit(train_X_1, train_y_1)
feature_importance_1.show()

rfr_model_2 = RandomForestRegressor()
feature_importance_2 = FeatureImportances(rfr_model_2)
rfr_model_2.fit(train_X_2, train_y_2)
feature_importance_2.fit(train_X_2, train_y_2)
feature_importance_2.show()

rfr_model_3 = RandomForestRegressor()
feature_importance_3 = FeatureImportances(rfr_model_3)
rfr_model_3.fit(train_X_3, train_y_3)
feature_importance_3.fit(train_X_3, train_y_3)
feature_importance_3.show()
コード例 #15
0
    loss, acc = model.evaluate(X_val, y_val, verbose=2)
    print("trained model, accuracy: {:5.2f}%".format(100 * acc))
    y_pred = model.predict(X_val, verbose=1)
    y_pred_bool = np.argmax(y_pred, axis=1)
    y_val_bool = np.argmax(y_val, axis=1)
    print(classification_report(y_val_bool, y_pred_bool))

# Menu option '4': time the fit of a random-forest FeatureImportances
# visualizer on the training data, then display the plot.
if choice == '4':
    # Length of the first training row (presumably the number of features).
    features = len(X_train[0])
    print(features)
    # `strf` is defined outside this excerpt.
    print(strf)
    # NOTE(review): `categories` is not used within this branch.
    categories = 3

    model = RandomForestClassifier()
    # Visualizing Feature Importance
    viz = FeatureImportances(model)
    start_time = time.time()
    print("start")
    viz.fit(X_train, y_train)
    # Elapsed wall-clock time of the fit, rounded to whole seconds.
    print("--- %s seconds ---" % round(time.time() - start_time))
    viz.show()

if choice == '3':
    features = len(X_train[0])
    print(features)
    categories = 3

    model = Sequential()
    model.add(
        Dense(3,
              input_dim=features,