Example #1
import pandas as pd
import joblib
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split
from imblearn.combine import SMOTEENN

# Split the target from the features
# (`features` is assumed to already hold the preprocessed feature DataFrame)
target = features['sent_to_analysis']
features.drop('sent_to_analysis', axis=1, inplace=True)

# Compute feature importance scores with univariate selection (SelectKBest)
feature_importance = SelectKBest(k=len(features.columns)).fit(features, target)

feature_importance = pd.concat(
    [pd.DataFrame(features.columns),
     pd.DataFrame(feature_importance.scores_)],
    axis=1)
feature_importance.columns = ['feature', 'score']
# Normalize the scores so they sum to 1
feature_importance['score'] = (
    feature_importance['score'] / feature_importance['score'].sum())
feature_importance.reset_index(inplace=True, drop=True)
joblib.dump(feature_importance, "model_files/feature_importance.pkl")

# Train/test split and save the resulting splits
X_train, X_test, y_train, y_test = train_test_split(features,
                                                    target,
                                                    test_size=0.3,
                                                    random_state=0)

# X_test is already a DataFrame with the original feature columns
joblib.dump(X_test, "model_files/X_test.pkl")
y_train.to_pickle("model_files/y_train.pkl")
y_test.to_pickle("model_files/y_test.pkl")

# Handle imbalanced classes with SMOTEENN (SMOTE over-sampling + ENN cleaning)
smote = SMOTEENN(random_state=0)
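
A minimal sketch of the resampling step that would typically follow, assuming only the training split is resampled so the held-out test set stays untouched (the X_train_res/y_train_res names are illustrative):

# Resample the training data only; the test split stays as-is for evaluation
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)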