def test_multi_output_predict_proba():
    sgd_linear_clf = SGDClassifier(random_state=1, max_iter=5, tol=1e-3)
    param = {'loss': ('hinge', 'log', 'modified_huber')}

    # inner function for custom scoring
    def custom_scorer(estimator, X, y):
        if hasattr(estimator, "predict_proba"):
            return 1.0
        else:
            return 0.0
    grid_clf = GridSearchCV(sgd_linear_clf, param_grid=param,
                            scoring=custom_scorer, cv=3, error_score=np.nan)
    multi_target_linear = MultiOutputClassifier(grid_clf)
    multi_target_linear.fit(X, y)

    multi_target_linear.predict_proba(X)

    # SGDClassifier defaults to loss='hinge' which is not a probabilistic
    # loss function; therefore it does not expose a predict_proba method
    sgd_linear_clf = SGDClassifier(random_state=1, max_iter=5, tol=1e-3)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)
    multi_target_linear.fit(X, y)
    err_msg = "The base estimator should implement predict_proba method"
    with pytest.raises(ValueError, match=err_msg):
        multi_target_linear.predict_proba(X)
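Whether the wrapper exposes predict_proba here depends entirely on the loss of the underlying SGDClassifier: 'log' (now 'log_loss') and 'modified_huber' provide probability estimates, while 'hinge' does not. A minimal standalone sketch of that behaviour on synthetic data (the names X_toy/Y_toy are illustrative and a recent scikit-learn release is assumed):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier
from sklearn.multioutput import MultiOutputClassifier

# hypothetical toy data: two binary targets so the wrapper has two outputs
X_toy, y_toy = make_classification(n_samples=50, n_features=5, random_state=0)
Y_toy = np.column_stack([y_toy, 1 - y_toy])

# 'modified_huber' is a probabilistic loss, so predict_proba is delegated
proba_clf = MultiOutputClassifier(
    SGDClassifier(loss="modified_huber", random_state=0)).fit(X_toy, Y_toy)
print(len(proba_clf.predict_proba(X_toy)))   # one array per output -> 2

# 'hinge' is a margin loss, so no probability estimates are exposed
hinge_clf = MultiOutputClassifier(
    SGDClassifier(loss="hinge", random_state=0)).fit(X_toy, Y_toy)
print(hasattr(hinge_clf, "predict_proba"))   # False on recent scikit-learn releases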
Example #2
def test_multi_output_predict_proba():
    sgd_linear_clf = SGDClassifier(random_state=1, max_iter=5)
    param = {'loss': ('hinge', 'log', 'modified_huber')}

    # inner function for custom scoring
    def custom_scorer(estimator, X, y):
        if hasattr(estimator, "predict_proba"):
            return 1.0
        else:
            return 0.0

    grid_clf = GridSearchCV(sgd_linear_clf,
                            param_grid=param,
                            scoring=custom_scorer,
                            cv=3)
    multi_target_linear = MultiOutputClassifier(grid_clf)
    multi_target_linear.fit(X, y)

    multi_target_linear.predict_proba(X)

    # SGDClassifier defaults to loss='hinge' which is not a probabilistic
    # loss function; therefore it does not expose a predict_proba method
    sgd_linear_clf = SGDClassifier(random_state=1, max_iter=5)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)
    multi_target_linear.fit(X, y)
    err_msg = "The base estimator should implement predict_proba method"
    with pytest.raises(AttributeError, match=err_msg):
        multi_target_linear.predict_proba(X)
def test_multi_output_predict_proba():
    sgd_linear_clf = SGDClassifier(random_state=1, max_iter=5, loss="log_loss")
    param = {"loss": ("hinge", "log", "modified_huber")}

    # inner function for custom scoring
    def custom_scorer(estimator, X, y):
        if hasattr(estimator, "predict_proba"):
            return 1.0
        else:
            return 0.0

    grid_clf = GridSearchCV(sgd_linear_clf,
                            param_grid=param,
                            scoring=custom_scorer,
                            cv=3)
    multi_target_linear = MultiOutputClassifier(grid_clf)
    multi_target_linear.fit(X, y)

    multi_target_linear.predict_proba(X)

    # SGDClassifier defaults to loss='hinge' which is not a probabilistic
    # loss function; therefore it does not expose a predict_proba method
    sgd_linear_clf = SGDClassifier(random_state=1, max_iter=5)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)
    multi_target_linear.fit(X, y)
    err_msg = "probability estimates are not available for loss='hinge'"
    with pytest.raises(AttributeError, match=err_msg):
        multi_target_linear.predict_proba(X)
Example #4
    def test_multi_output_classifier(self):
        X, y = make_multilabel_classification(n_classes=3, random_state=0)
        X = X.astype(numpy.float32)
        clf = MultiOutputClassifier(LogisticRegression()).fit(X, y)
        onx = to_onnx(clf,
                      X[:1],
                      target_opset=TARGET_OPSET,
                      options={'zipmap': False})
        self.assertNotIn("ZipMap", str(onx))

        sess = InferenceSession(onx.SerializeToString())
        res = sess.run(None, {'X': X})
        exp_lab = clf.predict(X)
        exp_prb = clf.predict_proba(X)
        assert_almost_equal(exp_lab, res[0])
        self.assertEqual(len(exp_prb), len(res[1]))
        for e, g in zip(exp_prb, res[1]):
            assert_almost_equal(e, g, decimal=5)

        # check option nocl=True
        onx = to_onnx(clf,
                      X[:1],
                      target_opset=TARGET_OPSET,
                      options={id(clf): {
                                   'nocl': True,
                                   'zipmap': False
                               }})
        self.assertNotIn("ZipMap", str(onx))

        sess = InferenceSession(onx.SerializeToString())
        res = sess.run(None, {'X': X})
        exp_lab = clf.predict(X)
        exp_prb = clf.predict_proba(X)
        assert_almost_equal(exp_lab, res[0])
        self.assertEqual(len(exp_prb), len(res[1]))
        for e, g in zip(exp_prb, res[1]):
            assert_almost_equal(e, g, decimal=5)

        # check option nocl=False
        onx = to_onnx(clf,
                      X[:1],
                      target_opset=TARGET_OPSET,
                      options={id(clf): {
                                   'nocl': False,
                                   'zipmap': False
                               }})
        self.assertNotIn("ZipMap", str(onx))

        sess = InferenceSession(onx.SerializeToString())
        res = sess.run(None, {'X': X})
        exp_lab = clf.predict(X)
        exp_prb = clf.predict_proba(X)
        assert_almost_equal(exp_lab, res[0])
        self.assertEqual(len(exp_prb), len(res[1]))
        for e, g in zip(exp_prb, res[1]):
            assert_almost_equal(e, g, decimal=5)
def test_multiclass_multioutput_estimator_predict_proba():
    seed = 542

    # make test deterministic
    rng = np.random.RandomState(seed)

    # random features
    X = rng.normal(size=(5, 5))

    # random labels
    y1 = np.array(['b', 'a', 'a', 'b', 'a']).reshape(5, 1)  # 2 classes
    y2 = np.array(['d', 'e', 'f', 'e', 'd']).reshape(5, 1)  # 3 classes

    Y = np.concatenate([y1, y2], axis=1)

    clf = MultiOutputClassifier(LogisticRegression(
        multi_class='ovr', solver='liblinear', random_state=seed))

    clf.fit(X, Y)

    y_result = clf.predict_proba(X)
    y_actual = [np.array([[0.23481764, 0.76518236],
                          [0.67196072, 0.32803928],
                          [0.54681448, 0.45318552],
                          [0.34883923, 0.65116077],
                          [0.73687069, 0.26312931]]),
                np.array([[0.5171785, 0.23878628, 0.24403522],
                          [0.22141451, 0.64102704, 0.13755846],
                          [0.16751315, 0.18256843, 0.64991843],
                          [0.27357372, 0.55201592, 0.17441036],
                          [0.65745193, 0.26062899, 0.08191907]])]

    for i in range(len(y_actual)):
        assert_almost_equal(y_result[i], y_actual[i])
Example #6
def train_and_predict(X, y, train_ratio=0.2, n_trials=10, random_state=None):
    micro, macro, c, std, f1, f1_std = [], [], [], [], [], []
    for i in range(n_trials):
        np.random.seed(random_state)
        X_train, y_train, X_test, y_test = iterative_train_test_split(
            X, y, test_size=1 - train_ratio)
        clf = MultiOutputClassifier(
            LogisticRegressionCV(max_iter=1e4, class_weight='balanced'))
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            clf.fit(X_train, y_train.A)
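            # predict_proba returns one (n_samples, 2) array per label; stacking them
            # and slicing [:, :, 1] keeps the positive-class probability, transposed
            # to (n_samples, n_labels) for roc_auc_score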
            y_pred = np.array(clf.predict_proba(X_test))[:, :, 1].T
            mi = roc_auc_score(y_test.A, y_pred, average="micro")
            ma = roc_auc_score(y_test.A, y_pred, average="macro")
            y_pred = clf.predict(X_test)
            f = f1_score(y_test.A, y_pred, average="micro")
        std.append(mi)
        f1.append(f)
        f1_std.append(f)
        micro.append(mi)
        macro.append(ma)
        c.append(
            np.mean([estimator.C_.mean() for estimator in clf.estimators_]))
    return np.mean(micro), np.mean(macro), np.mean(c), np.std(std), np.mean(
        f1), np.std(f1_std)
def test_multiclass_multioutput_estimator_predict_proba():
    seed = 542

    # make test deterministic
    rng = np.random.RandomState(seed)

    # random features
    X = rng.normal(size=(5, 5))

    # random labels
    y1 = np.array(['b', 'a', 'a', 'b', 'a']).reshape(5, 1)  # 2 classes
    y2 = np.array(['d', 'e', 'f', 'e', 'd']).reshape(5, 1)  # 3 classes

    Y = np.concatenate([y1, y2], axis=1)

    clf = MultiOutputClassifier(LogisticRegression(random_state=seed))

    clf.fit(X, Y)

    y_result = clf.predict_proba(X)
    y_actual = [np.array([[0.23481764, 0.76518236],
                          [0.67196072, 0.32803928],
                          [0.54681448, 0.45318552],
                          [0.34883923, 0.65116077],
                          [0.73687069, 0.26312931]]),
                np.array([[0.5171785, 0.23878628, 0.24403522],
                          [0.22141451, 0.64102704, 0.13755846],
                          [0.16751315, 0.18256843, 0.64991843],
                          [0.27357372, 0.55201592, 0.17441036],
                          [0.65745193, 0.26062899, 0.08191907]])]

    for i in range(len(y_actual)):
        assert_almost_equal(y_result[i], y_actual[i])
def test_multi_output_classification():
    # test if multi_target initializes correctly with base estimator and fit
    # assert predictions work as expected for predict, predict_proba and score

    forest = RandomForestClassifier(n_estimators=10, random_state=1)
    multi_target_forest = MultiOutputClassifier(forest)

    # train the multi_target_forest and also get the predictions.
    multi_target_forest.fit(X, y)

    predictions = multi_target_forest.predict(X)
    assert_equal((n_samples, n_outputs), predictions.shape)

    predict_proba = multi_target_forest.predict_proba(X)

    assert len(predict_proba) == n_outputs
    for class_probabilities in predict_proba:
        assert_equal((n_samples, n_classes), class_probabilities.shape)

    assert_array_equal(np.argmax(np.dstack(predict_proba), axis=1),
                       predictions)

    # train the forest with each column and assert that predictions are equal
    for i in range(3):
        forest_ = clone(forest)  # create a clone with the same state
        forest_.fit(X, y[:, i])
        assert_equal(list(forest_.predict(X)), list(predictions[:, i]))
        assert_array_equal(list(forest_.predict_proba(X)),
                           list(predict_proba[i]))
Example #9
class Classifier():
    def __init__(self):
        #Multi label classifier
        forest = RandomForestClassifier(n_estimators=100, random_state=1)
        self.clf = MultiOutputClassifier(forest, n_jobs=-1)

    def fit(self, X, y):

        self.clf.fit(X, y)

    def predict(self, X):
        y_pred = np.array(self.clf.predict(X))
        return y_pred

    def predict_proba(self, X):
        """
        Compute the probabilities for each label.
        Important: this class needs to return a 2D array with 2 columns per label, so 109*2 columns. """
        proba = self.clf.predict_proba(X)
        # proba is a list of size 109, one per label; each element is an array of size n_samples * 2,
        # except sometimes when it is n_samples * 1, so a little work is needed to reshape the array
        y_proba = proba[0]
        for x in proba[1:]:
            if x.shape[1] == 2:
                y_proba = np.hstack((y_proba, x))
            else:
                y_proba = np.hstack((y_proba, x, np.zeros_like(x)))

        return y_proba
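The reshaping above can be traced on toy arrays; this is a minimal NumPy-only sketch with hypothetical per-label outputs (the 109-label count and the degenerate single-column case come from the class above):

import numpy as np

# hypothetical per-label outputs for 3 samples: two labels with the usual
# (n_samples, 2) shape and one degenerate label with a single column
proba = [np.array([[0.9, 0.1], [0.2, 0.8], [0.5, 0.5]]),
         np.array([[0.7, 0.3], [0.4, 0.6], [0.1, 0.9]]),
         np.array([[1.0], [1.0], [1.0]])]

wide = proba[0]
for x in proba[1:]:
    if x.shape[1] == 2:
        wide = np.hstack((wide, x))
    else:  # pad the missing class with zeros so every label keeps 2 columns
        wide = np.hstack((wide, x, np.zeros_like(x)))

print(wide.shape)  # (3, 6): 2 columns per label, as the docstring requires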
Example #10
class MultilabelClassifier_SVM(Classifier):
    kernel = None
    model = None

    def __init__(self, kernel='linear'):
        self.kernel = kernel

    def train(self, Train_X_Tfidf, Train_Y, Test_X_Tfidf=None, Test_Y=None):
        # Fit the training dataset on the classifier
        self.model = MultiOutputClassifier(
            SVC(C=1.0,
                kernel=self.kernel,
                degree=3,
                gamma='auto',
                probability=True))
        self.model.fit(Train_X_Tfidf, Train_Y)

        return self.model

    def predict(self, df):
        # Raw list of category probabilities (shape: n_categories x n_samples x 2)
        rawList = self.model.predict_proba(df)

        # Convert to NumPy array
        # Extract relevant output
        # Transpose and round probabilities to create an indicator matrix
        predictionMatrix = np.round_(np.array(rawList)[:, :, 1]).T

        # Extract relevant probability output
        # Average confidences across all categories for all samples
        probabilityMatrix = np.amax(np.array(rawList), axis=2)
        confidenceList = np.average(probabilityMatrix.T, axis=1)

        return predictionMatrix, confidenceList
Example #12
def train_and_predict(train_data, test_data, train_labels, test_labels):
    clf = MultiOutputClassifier(
        LogisticRegressionCV(max_iter=1e4, class_weight='balanced'))
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        clf.fit(train_data, train_labels.A)
        y_pred = np.array(clf.predict_proba(test_data))[:, :, 1].T
        mi = roc_auc_score(test_labels.A, y_pred, average="micro")
        ma = roc_auc_score(test_labels.A, y_pred, average="macro")
        c = np.mean([estimator.C_.mean() for estimator in clf.estimators_])
    return mi, ma, c
Example #13
def test_multi_output_delegate_predict_proba():
    """Check the behavior for the delegation of predict_proba to the underlying
    estimator"""

    # A base estimator with `predict_proba` should expose the method even before fit
    moc = MultiOutputClassifier(LogisticRegression())
    assert hasattr(moc, "predict_proba")
    moc.fit(X, y)
    assert hasattr(moc, "predict_proba")

    # A base estimator without `predict_proba` should raise an AttributeError
    moc = MultiOutputClassifier(LinearSVC())
    assert not hasattr(moc, "predict_proba")
    msg = "'LinearSVC' object has no attribute 'predict_proba'"
    with pytest.raises(AttributeError, match=msg):
        moc.predict_proba(X)
    moc.fit(X, y)
    assert not hasattr(moc, "predict_proba")
    with pytest.raises(AttributeError, match=msg):
        moc.predict_proba(X)
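For reference, what a fitted MultiOutputClassifier delegates to is a list with one probability array per output, whose width follows that output's number of classes. A small self-contained sketch on synthetic data (names are illustrative, a recent scikit-learn release is assumed):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

rng = np.random.RandomState(0)
X_demo = rng.normal(size=(6, 3))

# first output has 2 classes, second has 3
y1 = np.array(['n', 'y', 'n', 'y', 'n', 'y']).reshape(6, 1)
y2 = np.array(['a', 'b', 'c', 'a', 'b', 'c']).reshape(6, 1)
Y_demo = np.concatenate([y1, y2], axis=1)

moc_demo = MultiOutputClassifier(LogisticRegression()).fit(X_demo, Y_demo)
proba = moc_demo.predict_proba(X_demo)
print(len(proba))                 # 2 -> one array per output
print([p.shape for p in proba])   # [(6, 2), (6, 3)]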
class LinearRegression(Model):
    class SafeOneClassLogisticRegression(SafeOneClassMixin,
                                         LogisticRegression):
        pass

    def fit(self, X, y):
        self.model = MultiOutputClassifier(
            self.SafeOneClassLogisticRegression()).fit(X, y)

    def predict(self, X):
        return self.model.predict_proba(X)
Example #15
def main():

    ### read training and testing data
    (Y_data, X_data, tag_list) = read_data(train_path, True)
    (_, X_test, _) = read_data(test_path, False)
    all_corpus = X_data + X_test
    print('Find %d articles.' % (len(all_corpus)))

    ### tokenizer for all data
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_corpus)
    save_tokenizer(tokenizer, "bog_tokenizer_pickle")
    word_index = tokenizer.word_index

    ### convert word sequences to index sequence
    print('Convert to index sequences.')
    #    train_sequences = tokenizer.texts_to_sequences(X_data)
    train_sequences = tokenizer.texts_to_matrix(X_data, "count")
    test_sequences = tokenizer.texts_to_matrix(X_test, "count")
    print(test_sequences.shape)

    ###
    train_tag = to_multi_categorical(Y_data, tag_list)

    ### split data into training set and validation set
    (X_train, Y_train), (X_val, Y_val) = split_data(train_sequences, train_tag,
                                                    split_ratio)

    forest = RandomForestClassifier(n_estimators=10, random_state=1)
    multi_target_forest = MultiOutputClassifier(forest, n_jobs=-1)
    print("fitting ...")
    multi_target_forest.fit(X_train, Y_train)

    # predict_proba returns a list of (n_samples, 2) arrays, one per tag;
    # keep the positive-class probability as an (n_samples, n_tags) matrix
    pred_val = np.array(multi_target_forest.predict_proba(X_val))[:, :, 1].T
    print(pred_val.shape)
    pred_val = (pred_val > thresh).astype('int')
    print(pred_val.shape)
    #    print( np_f1score(Y_val, pred_val) )

    Y_pred = multi_target_forest.predict(test_sequences)
    Y_pred_thresh = (Y_pred > thresh).astype('int')

    with open(output_path, 'w') as output:
        print('\"id\",\"tags\"', file=output)
        for index, labels in enumerate(Y_pred_thresh):
            labels = [
                tag_list[i] for i, value in enumerate(labels) if value == 1
            ]
            if len(labels) == 0:
                labels.append(tag_list[np.argmax(Y_pred[index])])
            labels_original = ' '.join(labels)
            print('\"%d\",\"%s\"' % (index, labels_original), file=output)
Example #16
class MultilabelClassifier_SVM(Classifier):
    kernel = None       # SVM kernel type
    model = None        # SVC object


    '''
    @brief      Class constructor
    @param      kernel      SVM kernel type
    @return     None
    '''
    def __init__(self, kernel='linear'):
        # Set SVM kernel type
        self.kernel = kernel


    '''
    @brief      Trains the model using given X and Y matrices
    @param      Train_X_Tfidf   Scikit-learn compatible matrix of TF-IDF embeddings for each topic text
    @param      Train_Y         Binary indicator matrix for the Y labels (tags) of the topics
    @return     self.model
    '''
    def train(self, Train_X_Tfidf, Train_Y, Test_X_Tfidf=None, Test_Y=None):
        # Fit the training dataset on the classifier
        self.model = MultiOutputClassifier(SVC(C=1.0, kernel=self.kernel, degree=3, gamma='auto', probability=True))
        self.model.fit(Train_X_Tfidf, Train_Y)
        
        return self.model


    '''
    @brief      Predicts an indicator matrix and confidence level for each topic
    @param      df                  Pandas dataframe of topic text
    @return     predictionMatrix    NumPy indicator matrix for the predicted tags
    @return     confidenceList      NumPy array of prediction confidence scores for each topic
    '''    
    def predict(self, df):
        # Raw list of category probabilities (shape: n_categories x n_samples x 2)
        rawList = self.model.predict_proba(df)
        
        # Convert to NumPy array
        # Extract relevant output
        # Transpose and round probabilities to create an indicator matrix
        predictionMatrix = np.round_(np.array(rawList)[:, :, 1]).T
        
        # Extract relevant probability output
        # Average confidences across all categories for all samples
        probabilityMatrix = np.amax(np.array(rawList), axis=2)
        confidenceList = np.average(probabilityMatrix.T, axis=1)
        
        return predictionMatrix, confidenceList
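The array manipulation in predict can be checked on a hypothetical raw list with 2 categories and 3 samples; a minimal NumPy-only sketch of just that step:

import numpy as np

# hypothetical predict_proba output: one (n_samples, 2) array per category
rawList = [np.array([[0.2, 0.8], [0.9, 0.1], [0.4, 0.6]]),
           np.array([[0.7, 0.3], [0.3, 0.7], [0.55, 0.45]])]

raw = np.array(rawList)                      # (n_categories, n_samples, 2)
predictionMatrix = np.round(raw[:, :, 1]).T  # (n_samples, n_categories) indicator
confidenceList = np.average(np.amax(raw, axis=2).T, axis=1)

print(predictionMatrix)   # per-sample 0/1 indicator over the 2 categories
print(confidenceList)     # mean of per-category max probabilities, ~[0.75 0.8 0.575]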
class MulticlassLearner(AbstractLearner):

    def __init__(self):
        super().__init__()
        clf = LogisticRegression(C=1, solver='lbfgs', multi_class='multinomial')
        # clf = RandomForestClassifier(n_estimators=10)
        # clf = RidgeClassifier(alpha=0.3)
        # clf = MLPClassifier()
        self.clf = MultiOutputClassifier(clf, n_jobs=-1)

    def train(self, store: BasicStore):
        # x = self.data_manager.train
        # y = self.data_manager.train_labels
        self.clf.fit(*store.train_XYs)
        return self.clf

    def predict_proba(self, X):
        return self.clf.predict_proba(X)
Example #18
def KNN(X_train, x_test, y_train, y_test):
    knn = KNeighborsClassifier(algorithm='auto',
                               metric='minkowski',
                               metric_params=None,
                               n_jobs=-1,
                               n_neighbors=147,
                               p=2,
                               weights='distance')
    print("poopf")
    knn.fit(X_train, y_train)
    classifier = MultiOutputClassifier(knn, n_jobs=-1)
    classifier.fit(X_train, y_train)
    y_predict = classifier.predict_proba(x_test)
    # one (n_samples, 2) array per label; keep the positive-class probability
    output = np.zeros((1967, 147))  # 2597
    for x in range(1967):
        for y in range(147):
            output[x][y] = y_predict[y][x][1]
    # print(output)
    # np.savetxt("sub.csv", output, delimiter=",")
    print(classifier.score(x_test, y_test))  # score expects features, not probabilities
Example #19
loaddir = "feature_extraction/data/features/"

#read data
df = pd.read_csv(loaddir + 'features.csv', index_col=0)


def create_target(text):
    text = text.strip('][')
    text = text.split(', ')
    return [int(i) for i in text]


df["target"] = df["target"].apply(create_target)

df_train, df_test = train_test_split(df, test_size=0.1, random_state=0)

train_x = df_train.drop(["target"], axis=1)
train_y = df_train["target"]

test_x = df_test.drop(["target"], axis=1)
test_y = df_test["target"]

model = xgb.XGBClassifier()
print(train_y)
print(train_x.values)
# train_y is a Series of per-row label lists; convert it to a 2D label matrix
clf = MultiOutputClassifier(model).fit(train_x, train_y.tolist())

pred = clf.predict_proba(test_x)
print(test_y)
print(pred)
Example #20
estimators = MultiOutputClassifier(estimator=XGBClassifier(
    penalty="l2", objective="binary:logistic", random_state=42))

X_train, X_eval, y_train, y_eval = train_test_split(features_df,
                                                    labels_df,
                                                    test_size=0.33,
                                                    shuffle=True,
                                                    stratify=labels_df,
                                                    random_state=RANDOM_SEED)

# Train model
estimators.fit(features_df, labels_df)

# Predict on evaluation set
# This competition wants probabilities, not labels
preds = estimators.predict_proba(X_eval)
preds
k = preds[0]
y_preds = pd.DataFrame(
    {
        "h1n1_vaccine": preds[0][:, 1],
        "seasonal_vaccine": preds[1][:, 1],
    },
    index=y_eval.index)
print("y_preds.shape:", y_preds.shape)
y_preds.head()


def plot_roc(y_true, y_score, label_name, ax):
    fpr, tpr, thresholds = roc_curve(y_true, y_score)
    ax.plot(fpr, tpr)
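A possible usage sketch for plot_roc on the two outputs built above (assuming matplotlib is imported as plt and that y_eval / y_preds exist as constructed earlier):

fig, axes = plt.subplots(1, 2, figsize=(10, 4))
plot_roc(y_eval["h1n1_vaccine"], y_preds["h1n1_vaccine"], "h1n1_vaccine", axes[0])
plot_roc(y_eval["seasonal_vaccine"], y_preds["seasonal_vaccine"], "seasonal_vaccine", axes[1])
plt.show()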
################ confusion matrix for method 2




from sklearn.datasets import make_classification
from sklearn.multioutput import MultiOutputClassifier
X_train, X_test, y_train, y_test = train_test_split(F, h, test_size=0.30)
multioutput = MultiOutputClassifier(
    xgb.XGBClassifier(objective='reg:logistic')).fit(X_train, y_train)
y2_true, y2_pred = y_test, multioutput.predict(X_test)
print('Results on the test set:')
print(classification_report(y2_true, y2_pred))
cm = multilabel_confusion_matrix(y_test, y2_pred)
sns.heatmap(cm, center=True)
y1_probas = multioutput.predict_proba(X_test)
skplt.metrics.plot_roc(y_test, y1_probas)
plt.show()




#############################################################################################




##################################Random forest
Example #22
                    for i, line in enumerate(f):
                        if i == 0:
                            n2vDim = int(line.split()[1])
                            X = np.zeros((Aexp.shape[0], n2vDim))
                        else:
                            fields = line.split()
                            assert len(fields) == n2vDim + 1
                            X[int(fields[0])] = np.array(
                                fields[1:]).astype(float)
                Xtrain = X[train]
                Xtest = X[test]
                ii_tr = np.where(np.max(np.abs(Xtrain), axis=1) > 0.)[0]
                ii_ts = np.where(np.max(np.abs(Xtest), axis=1) > 0.)[0]
                clf = MultiOutputClassifier(LogisticRegression()).fit(
                    Xtrain[ii_tr], Ytrain[ii_tr])
                y = clf.predict_proba(Xtest[ii_ts])

            elif classifier == 'n2v_knn' and not os.path.exists(
                    experimentPath + 'fold' + str(fold_nr) + '/n2v/0_knn.pkl'):
                with open('../data/' + species + '/networks/tmp0.emb') as f:
                    for i, line in enumerate(f):
                        if i == 0:
                            n2vDim = int(line.split()[1])
                            X = np.zeros((Aexp.shape[0], n2vDim))
                        else:
                            fields = line.split()
                            assert len(fields) == n2vDim + 1
                            X[int(fields[0])] = np.array(
                                fields[1:]).astype(float)
                Xtrain = X[train]
                Xtest = X[test]
Example #23
    ]]
    X_test_yn, y_test_yn = merged_test[features], merged_test[[
        'survival_yn', 'amount_yn'
    ]]

    multi_rf_clf = MultiOutputClassifier(
        RandomForestClassifier(n_estimators=100,
                               random_state=0,
                               verbose=3,
                               n_jobs=-1)).fit(X_train_yn, y_train_yn)
    true_label_yn = return_true_label(y_test_yn, merged_test)
    pred_label_yn = return_multi_pred_label(multi_rf_clf, true_label_yn,
                                            X_test_yn)

    sur_pred_res = pd.concat([
        pd.DataFrame(multi_rf_clf.predict_proba(X_test_yn)[0]),
        pred_label_yn[1][['pred_survival_time', 'survival_time']]
    ], axis=1)
    sur_pred_res.columns = [
        'survival_yn_prob_0', 'survival_yn_prob_1', 'pred_survival_yn',
        'survival_yn'
    ]
    ams_pred_res = pd.concat([
        pd.DataFrame(multi_rf_clf.predict_proba(X_test_yn)[1]),
        pred_label_yn[1][['pred_amount_spent', 'amount_spent']]
    ], axis=1)
    ams_pred_res.columns = [
        'amount_yn_prob_0', 'amount_yn_prob_1', 'pred_amount_yn', 'amount_yn'
    ]

    #joblib.dump(multi_rf_clf, 'clf_sur0_ams1.pkl')
Example #24
def run_regression(train_embeds, train_labels, test_embeds, test_labels, args):
    start_time = time.time()

    if args.label == 'single':
        log = args.classifier
    elif args.label == 'multi':
        log = MultiOutputClassifier(args.classifier, n_jobs=-1)
        # log = RandomForestClassifier(n_jobs = -1, random_state=seed)
        # log = MLPClassifier(random_state=seed)
    else:
        assert False

    log.fit(train_embeds, train_labels)
    test_pred = log.predict(test_embeds)
    train_pred = log.predict(train_embeds)
    test_score = log.predict_proba(test_embeds)
    train_score = log.predict_proba(train_embeds)

    n2v_scores = {}
    n2v_scores['runtime'] = time.time() - start_time

    if args.label == 'single':
        print("Single-label")

        n2v_scores['test_f1'] = f1_score(test_labels,
                                         test_pred,
                                         average=args.average)
        n2v_scores['test_precision'] = precision_score(test_labels,
                                                       test_pred,
                                                       average=args.average)
        n2v_scores['test_recall'] = recall_score(test_labels,
                                                 test_pred,
                                                 average=args.average)
        n2v_scores['test_accuracy'] = accuracy_score(test_labels, test_pred)

        n2v_scores['train_f1'] = f1_score(train_labels,
                                          train_pred,
                                          average=args.average)
        n2v_scores['train_precision'] = precision_score(train_labels,
                                                        train_pred,
                                                        average=args.average)
        n2v_scores['train_recall'] = recall_score(train_labels,
                                                  train_pred,
                                                  average=args.average)
        n2v_scores['train_accuracy'] = accuracy_score(train_labels, train_pred)

        lb = LabelBinarizer()
        lb.fit(test_labels)
        lb.fit(train_labels)
        n2v_scores['test_auc'] = roc_auc_score(lb.transform(test_labels),
                                               test_score,
                                               average=args.average)
        n2v_scores['test_ap'] = average_precision_score(
            lb.transform(test_labels), test_score, average=args.average)
        n2v_scores['train_auc'] = roc_auc_score(lb.transform(train_labels),
                                                train_score,
                                                average=args.average)
        n2v_scores['train_ap'] = average_precision_score(
            lb.transform(train_labels), train_score, average=args.average)
    elif args.label == 'multi':
        print("Multi-label", test_labels.shape[1])
        assert test_labels.shape[1] == train_labels.shape[1]

        n2v_scores['test_f1'] = []
        n2v_scores['train_f1'] = []
        n2v_scores['test_precision'] = []
        n2v_scores['train_precision'] = []
        n2v_scores['test_recall'] = []
        n2v_scores['train_recall'] = []
        for i in range(test_labels.shape[1]):
            n2v_scores['test_f1'].append(
                f1_score(test_labels[:, i],
                         test_pred[:, i],
                         average=args.average))
            n2v_scores['test_precision'].append(
                precision_score(test_labels[:, i],
                                test_pred[:, i],
                                average=args.average))
            n2v_scores['test_recall'].append(
                recall_score(test_labels[:, i],
                             test_pred[:, i],
                             average=args.average))
            n2v_scores['train_f1'].append(
                f1_score(train_labels[:, i],
                         train_pred[:, i],
                         average=args.average))
            n2v_scores['train_precision'].append(
                precision_score(train_labels[:, i],
                                train_pred[:, i],
                                average=args.average))
            n2v_scores['train_recall'].append(
                recall_score(train_labels[:, i],
                             train_pred[:, i],
                             average=args.average))
        n2v_scores['test_f1'] = np.mean(n2v_scores['test_f1'])
        n2v_scores['test_precision'] = np.mean(n2v_scores['test_precision'])
        n2v_scores['test_recall'] = np.mean(n2v_scores['test_recall'])
        n2v_scores['train_f1'] = np.mean(n2v_scores['train_f1'])
        n2v_scores['train_precision'] = np.mean(n2v_scores['train_precision'])
        n2v_scores['train_recall'] = np.mean(n2v_scores['train_recall'])

        n2v_scores['test_accuracy'] = accuracy_score(test_labels, test_pred)
        n2v_scores['train_accuracy'] = accuracy_score(train_labels, train_pred)

        # https://github.com/scikit-learn/scikit-learn/issues/2451
        # n2v_scores['test_lrap'] = label_ranking_average_precision_score(test_labels, test_score)
        # n2v_scores['train_lrap'] = label_ranking_average_precision_score(train_labels, train_score)
        # n2v_scores['test_auc'] = roc_auc_score(test_labels, test_score, average=args.average)
        # n2v_scores['test_ap'] = average_precision_score(test_labels, test_score, average=args.average)
        # n2v_scores['train_auc'] = roc_auc_score(train_labels, train_score, average=args.average)
        # n2v_scores['train_ap'] = average_precision_score(train_labels, train_score, average=args.average)s
    else:
        assert False

    print(n2v_scores)
    # print("Test F1-score", n2v_scores['test_f1'])
    # print("Train F1-score", n2v_scores['train_f1'])
    # print("Runtime (s)", n2v_scores['runtime'])

    # from sklearn.dummy import DummyClassifier
    # dummy = DummyClassifier()
    # dummy.fit(train_embeds, train_labels)
    # print("Random baseline")
    # print(f1_score(test_labels, dummy.predict(test_embeds), average=average))
    # for i in range(test_labels.shape[1]):
    # print("Random baseline F1 score", f1_score(test_labels[:,i], dummy.predict(test_embeds)[:,i], average="micro")
    return n2v_scores
Example #25
    random_state=1,
    return_train_score=True)

fit_model(random_forest_Bayes_optimized_classifier, X_train, y_train, X_test)
print(random_forest_Bayes_optimized_classifier.best_estimator_)

#Show Confusion Matrix
random_forest_optim = MultiOutputClassifier(
    RandomForestClassifier(n_estimators=2000,
                           max_depth=20,
                           min_samples_split=20,
                           min_samples_leaf=4,
                           max_features='auto'))
classifier = random_forest_optim.fit(X_train, y_train)
cm = multilabel_confusion_matrix(y_test, random_forest_optim.predict(X_test))
print(cm)

## Retrain best model on full dataset and fit to test_set_features
random_forest_optim.fit(scaled_training_features, training_set_labels)
preds = random_forest_optim.predict_proba(scaled_test_features)

## Format for submittal on DrivenData
#Code copied from DrivenData to ensure correct format for submittal

# Save predictions to submission data frame
submission_format["h1n1_vaccine"] = preds[0][:, 1]
submission_format["seasonal_vaccine"] = preds[1][:, 1]

print(submission_format.head())
submission_format.to_csv('my_submission.csv', index=True)
X = np.vstack(list(map(lambda x: x['data'], sample_files.values())))
# X = normalize(X)

# Build a label list that corresponds to the feature set
y = []
for value in sample_files.values():
    y += [value['labels']] * len(value['data'])
y = np.array(mlb.transform(y))

# Use a multi-label classifier wrapping an ExtraTreesClassifier
clf = MultiOutputClassifier(ExtraTreesClassifier(max_depth=5))
clf.fit(X, y)

print(f'Mean accuracy: {clf.score(X, y)}')

num_folds = 10
cv_score = cross_val_score(clf, X, y, cv=num_folds)
print(f'{num_folds}-fold cross-validation: {cv_score}')

# Perform real-time tests for each input file
for key, value in sample_files.items():
    print("\nPerforming real-time classification of "
          f"{', '.join(value['labels'])}")
    start_time = timeit.default_timer()
    features = Serializer("data/" + key).classify_realtime(
        clf, wait_for_min_counts=False, classification_interval_counts=1000)
    total_time = timeit.default_timer() - start_time
    print(f'Classified in {total_time} seconds')
    print_prediction(clf.predict(features))
    print_proba(clf.predict_proba(features))
Example #27
def main():

    # Script argument parsing
    parser = argparse.ArgumentParser(
        description=
        'Homework 03 - Machine learning a.a. 2018/19 - Predict missing values',
        epilog=' coded by: Emanuele Palombo')

    parser.add_argument('dataset_name',
                        metavar='DATASET',
                        type=str,
                        nargs='?',
                        default=__default_ts_name,
                        help='{} (default {}) - dataset name'.format(
                            list(__ts_opts.keys()), __default_ts_name))

    parser.add_argument(
        '--test-size',
        '-t',
        dest='test_size',
        action='store',
        metavar='TEST_SIZE',
        type=float,
        default=__default_test_size,
        help='[0-1] (default {}) - splitting size of TestSet'.format(
            __default_test_size))

    parser.add_argument(
        '--question-marks-ts',
        '-q',
        dest='qm_repeted_ts',
        action='store',
        type=int,
        default=__default_question_mark_count_repeated,
        help=
        '{{0,1,2...}} (default {}) - (this value * {} * samples) added to TrainingSet'
        .format(__default_question_mark_count_repeated,
                __default_question_mark_count))

    parser.add_argument(
        '--no-split',
        '-s',
        dest='no_split',
        action='store_true',
        default=__default_no_split,
        help='(default {}) - keep whole DataSet for training'.format(
            __default_no_split))

    parser.add_argument('--img-tag',
                        '-i',
                        dest='img_tag',
                        action='store',
                        type=str,
                        default='',
                        help='string - add arbitrary string to saved images')

    parser.add_argument(
        '--verbose',
        '-v',
        dest='verbosity',
        action='count',
        default=__default_training_verbosity,
        help='add more verbosity to output (repeat it to increase)')

    args = parser.parse_args()

    if args.dataset_name not in __ts_opts:
        print('ERROR: Choose correct DataSet!\n')
        parser.print_help()
        exit(1)

    trainingset_selected_name = args.dataset_name
    test_size = args.test_size
    qm_repeted_ts = args.qm_repeted_ts
    dataset_no_split = args.no_split
    training_verbosity = args.verbosity
    img_tag = args.img_tag
    running_id = id_generator()

    ts_selected_opts = __ts_opts[trainingset_selected_name]
    # End script argument parsing

    print('\nDataSet selected: ' + ts_selected_opts['url'])

    # read dataset to pandas dataframe
    dataset = pd.read_csv(ts_selected_opts['url'],
                          names=ts_selected_opts['columns'])

    if training_verbosity >= 1:
        print('\nFirst five rows of DataSet:\n')
        print(dataset.head())
        print('\nDataSet Length: {}'.format(len(dataset)))

    # DataSet Manipulation
    # remove rows with question marks (this avoids having '?' in the output)
    dataset = dataset[~(dataset.astype(str) == '?').any(axis=1)]

    # strip out (remove) the "real output" (y)
    dataset = dataset.iloc[ts_selected_opts['x_slice'][0],
                           ts_selected_opts['x_slice'][1]]

    # Different approach to value conversion
    # convert all column to int (str => int)
    # dataset = dataset.apply(lambda x: pd.factorize(x)[0] + 1)
    # convert all columns to int
    dataset = dataset.astype(int)

    # dataSet Information
    features_count = len(dataset.columns)
    features_values = ds_features_values(dataset)

    # copy input features to output (columns * 2)
    for column in dataset.columns:
        dataset['y_' + column] = dataset[column]

    # Split DataSet
    training_set, test_set = train_test_split(
        dataset,
        test_size=test_size,
        random_state=__default_train_test_split_random_state)

    # check feature values between TrainingSet and TestSet
    # it's important to avoid extra values in the TestSet (i.e. a log_loss error from a predict_proba size mismatch)
    if not check_labels_split(features_count, training_set, test_set):
        exit(1)

    # Concat (append rows of) TrainingSet and TestSet
    # in this case the model can see all samples (including queries without '?')
    if dataset_no_split:
        training_set = pd.concat([training_set, test_set], axis=0)

        print('\nTraining over the whole DataSet')
    else:
        print('\nSplit DataSet in TrainingSet and TestSet (test size: {})'.
              format(test_size))

    # add (append) question mark
    # append qm_count rows, with 1 to qm_count '?'
    qm_count = int(ts_selected_opts['question_mark_count'])
    for i in range(qm_repeted_ts):
        for value_count in range(1, qm_count + 1):
            training_set = ds_mod_with_value(training_set, value_count,
                                             features_count, True)

            if training_verbosity >= 1:
                print(
                    '{} Added {} question mark (?) to TrainingSet for each sample'
                    .format(i, value_count))

    # Shuffle TrainingSet
    training_set = training_set.sample(frac=1)

    if training_verbosity >= 1:
        print('\nManipulated TrainingSet:\n')
        print(training_set.head())
        print('\nTrainingSet Length: {}'.format(len(training_set)))

    # TrainingSet: input X (features) and output y ("mirrored" features)
    x_train = training_set.iloc[:, 0:features_count]
    y_train = training_set.iloc[:, features_count:]

    # TestSet: input X (features) and output y ("mirrored" features)
    x_test = test_set.iloc[:, 0:features_count]
    y_test = test_set.iloc[:, features_count:]

    if training_verbosity >= 2:
        print('\nInput train:\n {}'.format(x_train.head()))
        print('\nOutput train:\n {}'.format(y_train.head()))
        print('\nInput test:\n {}'.format(x_test.head()))
        print('\nOutput test:\n {}'.format(y_test.head()))

    x_train = x_train.values
    y_train = y_train.values
    y_test = y_test.values

    # one-hot encoding (characteristic vector)
    # passing features_values without None forces OneHotEncoder to transform None into a zero vector
    one_hot_encoder = OneHotEncoder(categories=features_values,
                                    handle_unknown='ignore')
    one_hot_encoder.fit(x_train)
    x_train_encoded = one_hot_encoder.transform(x_train).toarray()

    if training_verbosity >= 2:
        print('\nOneHotEncoding...\nexample: {} => {}'.format(
            x_train[0], x_train_encoded[0]))

    # store all results/metrics for each model/classifier
    results = {}

    for classifier_name in __deafult_model_classifier:

        filename = 'model_{}_{}.sav'.format(trainingset_selected_name,
                                            classifier_name)

        if os.path.isfile(filename):
            # load module already trained
            multi_output_classifier = joblib.load(filename)

            print(
                '\n### Model {} loaded by file: {}\nImportant: remove the file to re-train the model!'
                .format(classifier_name, filename))
        else:
            n_jobs = None
            model_verbosity = True if training_verbosity >= 3 else False

            if classifier_name == 'MLP':
                classifier = MLPClassifier(hidden_layer_sizes=ts_selected_opts[
                    'mlp_hidden_layers_sizes'],
                                           max_iter=1000,
                                           verbose=model_verbosity)
            elif classifier_name == 'KNN':
                n_jobs = None
                classifier = KNeighborsClassifier(
                    n_neighbors=ts_selected_opts['knn_k'])
            elif classifier_name == 'SVM':
                classifier = SVC(gamma='scale',
                                 decision_function_shape='ovo',
                                 probability=True,
                                 verbose=model_verbosity)
            elif classifier_name == 'RandomForest':
                classifier = RandomForestClassifier(
                    n_estimators=ts_selected_opts['random_forest_estimator'],
                    verbose=model_verbosity)

            print('\n### Init and training the model: {}'.format(
                classifier_name))

            # init MultiOutput for classifier
            multi_output_classifier = MultiOutputClassifier(classifier,
                                                            n_jobs=n_jobs)
            multi_output_classifier.fit(x_train_encoded, y_train)

            # save the model to disk
            joblib.dump(multi_output_classifier, filename)

        results[classifier_name] = collections.defaultdict(list)
        metris_result = results[classifier_name]

        # create input test (query) with different number of '?'
        for query_count_question_mark in range(
                ts_selected_opts['question_mark_count'] + 1):

            print('\n## Add {} questions mark to input test (query)'.format(
                query_count_question_mark))

            # modify (in place) input test with question marks
            x_test_with_qm = ds_mod_with_value(
                x_test.copy(),
                value_count=query_count_question_mark,
                append=False)

            if training_verbosity >= 2:
                print('\nInput test (query):\n {}'.format(
                    pd.DataFrame(data=x_test_with_qm).head()))

            # encode the input test
            x_test_encoded = one_hot_encoder.transform(
                x_test_with_qm).toarray()

            # compute output prediction and probability
            y_pred = multi_output_classifier.predict(x_test_encoded)
            y_pred_proba = multi_output_classifier.predict_proba(
                x_test_encoded)
            # exact-match accuracy over the whole output
            score = multi_output_classifier.score(x_test_encoded, y_test)
            # the Hamming loss corresponds to the Hamming distance between y_test and y_pred
            hamming_loss = np.sum(np.not_equal(y_test, y_pred)) / float(
                y_test.size)
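            # e.g. with y_test = [[1, 2], [3, 4]] and y_pred = [[1, 2], [3, 5]],
            # one of the four entries differs, so the Hamming loss is 0.25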

            # compute y_test and y_pred as if the output were only the query question marks
            y_test_reduced, y_pred_reduced = reduce_y_to_qm(
                x_test_with_qm, y_test, y_pred)

            # write y_pred_proba to file (csv)
            write_pred_proba(
                y_pred_proba,
                '{}{}-{}-q{}-{}{}.csv'.format(__default_csv_path,
                                              trainingset_selected_name,
                                              classifier_name,
                                              query_count_question_mark,
                                              running_id, img_tag))

            print('\nMetrics:')
            print(' {:<30} | {:^10} | {:>10}'.format('features', 'accuracy',
                                                     'log loss'))
            print('-' * (30 + 10 + 10 + 7))

            log_loss_avg = 0
            # for each output column => compute accuracy and log_loss
            for feature_index in range(y_test.shape[1]):
                y_test_column = y_test[:, feature_index]
                y_pred_column = y_pred[:, feature_index]

                accuracy = accuracy_score(y_test_column, y_pred_column)
                # note: check_labels_split() was implemented to avoid an error here
                log_loss_value = log_loss(
                    y_test_column,
                    y_pred_proba[feature_index],
                    labels=features_values[feature_index])

                print(' {:<30} | {:^10.4f} | {:>10.4f}'.format(
                    test_set.columns[feature_index], accuracy, log_loss_value))

                log_loss_avg += log_loss_value

                metris_result['accuracy_' +
                              str(feature_index)].append(accuracy)
                metris_result['log_loss_' +
                              str(feature_index)].append(log_loss_value)

            print('\nVirtual reduced output:')
            # for each output reduced (only question marks) => compute accuracy
            for index in range(query_count_question_mark):
                accuracy = accuracy_score(y_test_reduced[:, index],
                                          y_pred_reduced[:, index])
                print(' accuracy {}:   {:>10.4f}'.format(index, accuracy))

                metris_result['accuracy_reduced_' +
                              str(index)].append(accuracy)

            print('\nAll output:')
            print(' accuracy:     {:>10.4f}'.format(score))
            print(' log_loss avg: {:>10.4f}'.format(log_loss_avg /
                                                    y_test.shape[1]))
            print(' hamming loss: {:>10.4f}'.format(hamming_loss))

            metris_result['accuracy'].append(score)
            metris_result['log_loss_avg'].append(log_loss_avg /
                                                 y_test.shape[1])
            metris_result['hamming_loss'].append(hamming_loss)

        # GRAPH PLOT per model/classifier
        plot_line_graph(range(ts_selected_opts['question_mark_count'] + 1), [
            results[classifier_name]['accuracy'],
            results[classifier_name]['log_loss_avg'],
            results[classifier_name]['hamming_loss']
        ],
                        labels=['accuracy', 'log loss avg', 'hamming loss'],
                        fmt=['bo-', 'ro-', 'yo-'],
                        title=classifier_name,
                        xlabel='Number of Question Marks in the query',
                        ymax=1)

        if __default_save_img:
            plt.savefig('{}{}-{}-{}{}.png'.format(__default_imgs_path,
                                                  trainingset_selected_name,
                                                  classifier_name, running_id,
                                                  img_tag),
                        dpi=200)

        # create list of list of accuracy x feature
        accuracy_lst = [
            'accuracy_' + str(index) for index in range(features_count)
        ]
        accuracy_lst = [
            results[classifier_name][accuracy_key]
            for accuracy_key in accuracy_lst
        ]

        plot_line_graph(range(ts_selected_opts['question_mark_count'] + 1),
                        [results[classifier_name]['accuracy']] + accuracy_lst,
                        fmt=['bo-'] + ['g.--'] * len(accuracy_lst),
                        title=classifier_name +
                        ': whole accuracy and those by features',
                        xlabel='Number of Question Marks in the query',
                        ylabel='accuracy',
                        ymax=1)

        if __default_save_img:
            plt.savefig('{}{}-{}-accuracy-{}{}.png'.format(
                __default_imgs_path, trainingset_selected_name,
                classifier_name, running_id, img_tag),
                        dpi=200)

        # create list of list of accuracy_reduced x feature (adding 0 in front when needed)
        accuracy_reduced_lst = [
            'accuracy_reduced_' + str(index)
            for index in range(ts_selected_opts['question_mark_count'])
        ]
        accuracy_reduced_lst = [
            results[classifier_name][accuracy_reduced]
            for accuracy_reduced in accuracy_reduced_lst
        ]
        accuracy_reduced_lst = [[None] *
                                (ts_selected_opts['question_mark_count'] -
                                 len(accuracy_reduced) + 1) + accuracy_reduced
                                for accuracy_reduced in accuracy_reduced_lst]

        plot_line_graph(
            range(ts_selected_opts['question_mark_count'] + 1),
            [results[classifier_name]['accuracy']] + accuracy_reduced_lst,
            fmt=['bo-'] + ['m.--'] * len(accuracy_reduced_lst),
            title=classifier_name +
            ': whole accuracy and the virtual accuracies by features',
            xlabel='Number of Question Marks in the query',
            ylabel='accuracy',
            ymax=1)

        if __default_save_img:
            plt.savefig('{}{}-{}-accuracy-reduced-{}{}.png'.format(
                __default_imgs_path, trainingset_selected_name,
                classifier_name, running_id, img_tag),
                        dpi=200)

        # create list of list of log_loss x feature
        log_loss_lst = [
            'log_loss_' + str(index) for index in range(features_count)
        ]
        log_loss_lst = [
            results[classifier_name][log_loss_key]
            for log_loss_key in log_loss_lst
        ]

        plot_line_graph(
            range(ts_selected_opts['question_mark_count'] + 1),
            [results[classifier_name]['log_loss_avg']] + log_loss_lst,
            fmt=['ro-'] + ['c.--'] * len(log_loss_lst),
            title=classifier_name + ': average log loss and those by features',
            xlabel='Number of Question Marks in the query',
            ylabel='log loss')

        if __default_save_img:
            plt.savefig('{}{}-{}-log-loss-{}{}.png'.format(
                __default_imgs_path, trainingset_selected_name,
                classifier_name, running_id, img_tag),
                        dpi=200)

    metrics_by_classifier = [
        results[classifier][metric]
        for classifier in __deafult_model_classifier
        for metric in ['accuracy', 'log_loss_avg', 'hamming_loss']
    ]
    label_by_classifier = [
        classifier + ' ' + metric for classifier in __deafult_model_classifier
        for metric in ['accuracy', 'log_loss_avg', 'hamming_loss']
    ]
    fmt_lst = [
        style.replace('0', character)
        for character in ['o', '^', 'v', '<', '>', '.', ',', '+', 'x']
        for style in ['b0-', 'r0-', 'y0-']
    ]

    # GRAPH PLOT comparing model/classifier
    plot_line_graph(range(ts_selected_opts['question_mark_count'] + 1),
                    metrics_by_classifier,
                    labels=label_by_classifier,
                    fmt=fmt_lst,
                    title='Compare all models',
                    xlabel='Number of Question Marks in the query',
                    ylabel='',
                    ymax=1)

    if __default_save_img:
        plt.savefig('{}{}-comparing-{}{}.png'.format(
            __default_imgs_path, trainingset_selected_name, running_id,
            img_tag),
                    dpi=200)

    if not __default_save_img:
        plt.show()
Example #28
class Team:
    def __init__(self, team_name, play_by_play_df):
        self.team = team_name
        self.team_df = play_by_play_df[play_by_play_df['posteam'] == self.team]
        self._generate_lists()

        self.valid_play_dict = {
            'Pass': 0,
            'Run': 1,
            'Punt': 2,
            'Field Goal': 3
        }
        self.valid_play_inv_dict = {
            0: 'Pass',
            1: 'Run',
            2: 'Punt',
            3: 'Field Goal'
        }

        self.X = []
        self.Y = []

    def train_classifier(self, debug_classifier=False):
        self._organize_training_data()
        self._generate_random_forest(debug_classifier)

    def _generate_random_forest(self, debug_classifier):
        self.forest = RandomForestClassifier(n_estimators=100, random_state=1)
        self.multi_target_forest = MultiOutputClassifier(self.forest,
                                                         n_jobs=-1)
        X_train, X_test, Y_train, Y_test = train_test_split(self.X,
                                                            self.Y,
                                                            test_size=0.1,
                                                            random_state=0)
        self.multi_target_forest.fit(X_train, Y_train)

        forests = self.multi_target_forest.estimators_
        forest0_feat = forests[0].feature_importances_.tolist()
        forest1_feat = forests[1].feature_importances_.tolist()
        forest2_feat = forests[2].feature_importances_.tolist()
        forest3_feat = forests[3].feature_importances_.tolist()

        feature_df = pd.DataFrame(
            data={
                'Features': [x for x in range(5)],
                'Forest0': forest0_feat,
                'Forest1': forest1_feat,
                'Forest2': forest2_feat,
                'Forest3': forest3_feat
            })

        if debug_classifier == True:
            print('Training Score: ',
                  self.multi_target_forest.score(X_train, Y_train))
            print('Test Score: ',
                  self.multi_target_forest.score(X_test, Y_test))

            fig1 = plt.figure()

            ax = fig1.add_subplot(111)

            width = 0.1

            feature_df.Forest0.plot(kind='bar',
                                    color='red',
                                    ax=ax,
                                    width=width,
                                    position=-1)
            feature_df.Forest1.plot(kind='bar',
                                    color='green',
                                    ax=ax,
                                    width=width,
                                    position=0)
            feature_df.Forest2.plot(kind='bar',
                                    color='blue',
                                    ax=ax,
                                    width=width,
                                    position=1)
            feature_df.Forest3.plot(kind='bar',
                                    color='yellow',
                                    ax=ax,
                                    width=width,
                                    position=2)

            ax.set_xticklabels([
                'Yards to First', 'Down', 'Quarter', 'Yardline', 'Score Diff'
            ],
                               rotation=0)
            ax.set_xlabel('Features')
            ax.set_ylabel('Feature Importance')
            ax.set_title('Random Forest - Feature Analysis')

            plt.xlim(-0.5, 4.5)
            plt.legend(['Pass', 'Run', 'Punt', 'Field Goal'])
            plt.show()

    def test_classifier(self, yards_to_go, down, quarter, yard_line,
                        score_diff):

        # predict_proba expects a 2D array and returns one
        # (n_samples, n_classes) array per play-type output; take the
        # positive-class probability of the single sample for each output
        input_array = np.array(
            [[yards_to_go, down, quarter, yard_line, score_diff]])
        prediction = [
            proba[0][1]
            for proba in self.multi_target_forest.predict_proba(input_array)
        ]
        return np.argmax(prediction)

    def _generate_lists(self):

        self.play_type = self.team_df['PlayType'].values.tolist()
        self.game_ID = self.team_df['GameID'].values.tolist()
        self.drive = self.team_df['Drive'].values.tolist()
        self.quarter = self.team_df['qtr'].values.tolist()
        self.down = self.team_df['down'].values.tolist()
        self.time = self.team_df['time'].values.tolist()
        self.pos_team = self.team_df['posteam'].values.tolist()
        self.def_team = self.team_df['DefensiveTeam'].values.tolist()
        self.pass_length = self.team_df['PassLength'].values.tolist()
        self.pass_location = self.team_df['PassLocation'].values.tolist()
        self.pass_attempt = self.team_df['PassAttempt'].values.tolist()
        self.air_yards = self.team_df['AirYards'].values.tolist()
        self.rush_attempt = self.team_df['RushAttempt'].values.tolist()
        self.run_location = self.team_df['RunLocation'].values.tolist()
        self.run_gap = self.team_df['RunGap'].values.tolist()
        self.fieldgoal_distance = self.team_df[
            'FieldGoalDistance'].values.tolist()
        self.pos_team_score = self.team_df['PosTeamScore'].values.tolist()
        self.def_team_score = self.team_df['DefTeamScore'].values.tolist()
        self.yrdline100 = self.team_df['yrdline100'].values.tolist()
        self.yrds_to_go = self.team_df['ydstogo'].values.tolist()

    def _organize_training_data(self):

        score_diff_list = np.array(self.pos_team_score) - np.array(
            self.def_team_score)
        zipped_data = zip(self.quarter, self.down, self.yrdline100,
                          self.yrds_to_go, score_diff_list, self.play_type)

        for quarter, down, yrdln, yrds_to_go, score_diff, play_type in zipped_data:

            input_list = [yrds_to_go, down, quarter, yrdln, score_diff]
            if not np.any(np.isnan(
                    input_list)) and play_type in self.valid_play_dict:

                output_list = [0 for _ in range(4)]
                output_list[self.valid_play_dict[play_type]] = 1

                self.X.append(input_list)
                self.Y.append(output_list)

        self.X = np.array(self.X)
        self.Y = np.array(self.Y)

    def generate_success_probabilities(self, opponent, yr, debug_probs=False):
        ##############################
        # Extract Team Specific Data #
        ##############################
        self.opponent = opponent

        valid_dates = [
            str(yr) + '-' + '09',
            str(yr) + '-' + '10',
            str(yr) + '-' + '11',
            str(yr) + '-' + '12',
            str(yr + 1) + '-' + '01'
        ]

        coach_yr_09_df = self.team_df[self.team_df['\ufeffDate'].str.contains(
            valid_dates[0])]
        coach_yr_10_df = self.team_df[self.team_df['\ufeffDate'].str.contains(
            valid_dates[1])]
        coach_yr_11_df = self.team_df[self.team_df['\ufeffDate'].str.contains(
            valid_dates[2])]
        coach_yr_12_df = self.team_df[self.team_df['\ufeffDate'].str.contains(
            valid_dates[3])]
        coach_yr_01_df = self.team_df[self.team_df['\ufeffDate'].str.contains(
            valid_dates[4])]

        coach_yr_df = pd.concat([
            coach_yr_09_df, coach_yr_10_df, coach_yr_11_df, coach_yr_12_df,
            coach_yr_01_df
        ])

        team_prob_df = coach_yr_df[coach_yr_df['DefensiveTeam'] ==
                                   self.opponent]

        loc_pass_outcome = team_prob_df['PassOutcome'].values.tolist()
        loc_yrds_gained = team_prob_df['Yards.Gained'].values.tolist()
        loc_play_type = team_prob_df['PlayType'].values.tolist()
        loc_interception = team_prob_df['InterceptionThrown'].values.tolist()

        loc_play_type_fumble = coach_yr_df['PlayType'].values.tolist()
        loc_fumble = coach_yr_df['Fumble'].values.tolist()
        loc_drive = coach_yr_df['Drive'].values.tolist()
        loc_gameID = coach_yr_df['GameID'].values.tolist()

        loc_fg_success = coach_yr_df['FieldGoalResult']
        loc_fg_distance = coach_yr_df['yrdline100']
        loc_fg_play_type = coach_yr_df['PlayType']

        loc_punt_spot = coach_yr_df['yrdline100']
        loc_punt_return = coach_yr_df['Return_spot']

        loc_time_elapsed = coach_yr_df['Elapsed_Play_Time']

        ########################
        # Initialize Variables #
        ########################
        self.elapsed_time = {
            'punt': [],
            'run': [],
            'pass_good': [],
            'pass_nogood': [],
            'fg': []
        }

        self.total_passes = 0
        self.total_completions = 0
        self.pass_list = []
        self.rush_list = []

        self.pass_or_sack = 0
        self.num_sacks = 0
        self.sack_dist = []

        self.total_interceptions = 0

        field_goal_attempts = {0: 0, 10: 0, 20: 0, 30: 0, 40: 0, 50: 0, 60: 0}
        field_goal_successes = {0: 0, 10: 0, 20: 0, 30: 0, 40: 0, 50: 0, 60: 0}
        self.field_goal_pct = {}

        total_runs = 0
        total_run_fumbles = 0
        total_pass = 0
        total_pass_fumbles = 0

        self.punt_dist = []
        punt_touchback = {
            90: 0,
            80: 0,
            70: 0,
            60: 0,
            50: 0,
            40: 0,
            30: 0,
            20: 0
        }
        punt_kickrange = {
            90: 0,
            80: 0,
            70: 0,
            60: 0,
            50: 0,
            40: 0,
            30: 0,
            20: 0
        }
        punt_total = 0

        #####################
        # Punt Calculations #
        #####################
        for punt_spot, return_spot, time in zip(loc_punt_spot, loc_punt_return,
                                                loc_time_elapsed):
            if not np.isnan(punt_spot) and not np.isnan(return_spot):
                punt_total += 1
                punt_range = np.floor(punt_spot / 10) * 10
                punt_kickrange[punt_range] += 1
                if return_spot == 80:
                    punt_touchback[punt_range] += 1
                else:
                    self.punt_dist.append(return_spot - (100 - punt_spot))
                if not np.isnan(time):
                    self.elapsed_time['punt'].append(time)
        self.punt_alpha, self.punt_loc, self.punt_beta = stats.gamma.fit(
            self.punt_dist)
        punt_x = np.arange(-10, 80, 1)
        g3 = gamma.pdf(x=punt_x,
                       a=self.punt_alpha,
                       loc=self.punt_loc,
                       scale=self.punt_beta)

        self.punt_touchback_pct = {}
        for key, value in punt_kickrange.items():
            if value != 0:
                self.punt_touchback_pct[key] = punt_touchback[key] / value

        ###########################
        # Field Goal Calculations #
        ###########################
        for fg_success, fg_distance, fg_play_type, time in zip(
                loc_fg_success, loc_fg_distance, loc_fg_play_type,
                loc_time_elapsed):

            if fg_play_type == 'Field Goal':
                marker = np.floor(fg_distance / 10) * 10
                if not np.isnan(marker):
                    if not np.isnan(time):
                        self.elapsed_time['fg'].append(time)
                    field_goal_attempts[marker] += 1
                    if fg_success == 'Good':
                        field_goal_successes[marker] += 1

        for key, value in field_goal_attempts.items():
            if value > 0:
                self.field_goal_pct[key] = field_goal_successes[key] / value
            else:
                self.field_goal_pct[key] = 0

        #######################
        # Fumble Calculations #
        #######################
        for i, fumble in enumerate(loc_fumble):
            current_game = loc_gameID[i]
            current_drive = loc_drive[i]
            if loc_play_type_fumble[i] == 'Pass':
                total_pass += 1
                if fumble == 1:
                    if (i + 1 < len(loc_gameID)
                            and loc_gameID[i + 1] == current_game):
                        if loc_drive[i + 1] == current_drive or loc_drive[
                                i + 1] == current_drive + 1:
                            pass
                        else:
                            total_pass_fumbles += 1
            elif loc_play_type_fumble[i] == 'Run':
                total_runs += 1
                if fumble == 1:
                    if (i + 1 < len(loc_gameID)
                            and loc_gameID[i + 1] == current_game):
                        if loc_drive[i + 1] == current_drive or loc_drive[
                                i + 1] == current_drive + 1:
                            pass
                        else:
                            total_run_fumbles += 1

        self.pass_fumble_pct = total_pass_fumbles / total_pass
        self.run_fumble_pct = total_run_fumbles / total_runs

        #############################
        # Pass and Run Calculations #
        #############################
        for pass_outcome, yrds_gained, play_type, interception, time in zip(
                loc_pass_outcome, loc_yrds_gained, loc_play_type,
                loc_interception, loc_time_elapsed):

            if play_type == 'Pass' or play_type == 'Sack':
                self.pass_or_sack += 1
                if play_type == 'Sack':
                    self.num_sacks += 1
                    self.sack_dist.append(yrds_gained)

            if play_type == 'Pass':
                self.total_passes += 1
                if pass_outcome == "Complete":
                    self.total_completions += 1
                    self.pass_list.append(yrds_gained)
                    if not np.isnan(time):
                        self.elapsed_time['pass_good'].append(time)
                else:
                    if not np.isnan(time):
                        self.elapsed_time['pass_nogood'].append(time)
                if interception == 1:
                    self.total_interceptions += 1

            elif play_type == 'Run':
                if not np.isnan(time):
                    self.elapsed_time['run'].append(time)
                self.rush_list.append(yrds_gained)

        self.time_kde = {}

        self.time_kde['pass_good'] = stats.gaussian_kde(
            self.elapsed_time['pass_good'], bw_method=.2)
        self.time_kde['pass_nogood'] = stats.gaussian_kde(
            self.elapsed_time['pass_nogood'], bw_method=.2)
        self.time_kde['punt'] = stats.gaussian_kde(self.elapsed_time['punt'],
                                                   bw_method=.2)
        self.time_kde['run'] = stats.gaussian_kde(self.elapsed_time['run'],
                                                  bw_method=.2)
        self.time_kde['fg'] = stats.gaussian_kde(self.elapsed_time['fg'],
                                                 bw_method=.2)

        self.pass_complete_pct = self.total_completions / self.total_passes

        self.pass_alpha, self.pass_loc, self.pass_beta = stats.gamma.fit(
            self.pass_list)
        self.run_alpha, self.run_loc, self.run_beta = stats.gamma.fit(
            self.rush_list)

        self.sack_pct = self.num_sacks / self.pass_or_sack
        self.sack_yrds_mean = np.mean(self.sack_dist)
        self.sack_yrds_std = np.std(self.sack_dist)
        self.interception_pct = self.total_interceptions / self.total_passes

        #############
        # Debugging #
        #############
        if debug_probs:
            pass_x = np.arange(0, 40, .1)
            g1 = gamma.pdf(x=pass_x,
                           a=self.pass_alpha,
                           loc=self.pass_loc,
                           scale=self.pass_beta)

            run_x = np.arange(-10, 20, .1)
            g2 = gamma.pdf(x=run_x,
                           a=self.run_alpha,
                           loc=self.run_loc,
                           scale=self.run_beta)

            fig2 = plt.figure()

            ax1 = fig2.add_subplot(2, 1, 1)
            ax1.plot(pass_x, g1)
            ax1.hist(self.pass_list, bins=20, density=True)
            ax1.set_xlabel('Pass Yards')
            ax1.set_ylabel('Probability')

            ax2 = fig2.add_subplot(2, 1, 2)
            ax2.plot(run_x, g2)
            ax2.hist(self.rush_list, bins=20, density=True)
            ax2.set_xlabel('Rush Yards')
            ax2.set_ylabel('Probability')
            fig2.show()

            fig3 = plt.figure()

            ax3 = fig3.add_subplot(1, 1, 1)
            ax3.plot(punt_x, g3)
            ax3.hist(self.punt_dist, bins=20, density=True)
            fig3.show()

            fig6 = plt.figure()

            ax6 = fig6.add_subplot(1, 1, 1)
            print('TIMES', self.elapsed_time)
            for key, value in self.elapsed_time.items():
                ax6.hist(value, histtype='step', label=key)
            ax6.legend()
            fig6.show()
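A short driver may help tie the Team class above together: the multi-output random forest is trained on the team's historical play calls and then queried for a single game situation. This is a sketch only; the CSV path and the team/opponent abbreviations are placeholders, and the column names are assumed to match the ones read in _generate_lists and generate_success_probabilities.

import pandas as pd
from scipy.stats import gamma

# hypothetical nflscrapR-style play-by-play export (placeholder path)
play_by_play_df = pd.read_csv('play_by_play_2015.csv')

patriots = Team('NE', play_by_play_df)
patriots.train_classifier(debug_classifier=False)

# 3rd and 4 from the opponent 38-yard line in the 4th quarter, down by 2
play_index = patriots.test_classifier(yards_to_go=4, down=3, quarter=4,
                                      yard_line=38, score_diff=-2)
print('Predicted call:', patriots.valid_play_inv_dict[play_index])

# fit the opponent-specific distributions and sample a few pass gains
patriots.generate_success_probabilities('DEN', 2015)
print(gamma.rvs(patriots.pass_alpha, loc=patriots.pass_loc,
                scale=patriots.pass_beta, size=5))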
Beispiel #29
class MultiLabeller(semisupervisor.SemiSupervisor):
    """
    A widget for assigning more than one label to each data point.

    This class is designed to label data for (semi-)supervised learning
    algorithms. It allows you to assign more than one label to each data
    point; in the future, it will also allow you to re-train an algorithm.

    Parameters
    ----------
    connection_string: str
        A SQLAlchemy-compatible database connection string. This is where the
        data for this widget will be stored, and where it will be retrieved
        from for labelling.
    features : list, np.ndarray, pd.Series, pd.DataFrame, optional
        An array or sequence of data in which each element (if 1D) or each row
        (if 2D) represents one data point for which you'd like to generate
        labels.
    labels : list, np.ndarray, pd.Series, pd.DataFrame, optional
        If you already have some labels, but would like to re-label some, then
        you can pass these in as labels.
    options : tuple, list
        The options presented for labelling.
    classifier : sklearn.base.ClassifierMixin, optional
        An object that implements the standard sklearn fit/predict methods. If
        provided, a button for retraining the model is shown, and the model
        performance under k-fold cross-validation can be read as you go along.
    display_func : callable, optional
        A function that will be used to display the data. This function should
        take in two arguments, first the data to display, and second the number
        of data points to display (set to 1 for this class).
    eval_method : callable, optional
        A function that accepts the classifier, features, and labels as input
        and returns a dictionary of values that contain the key 'test_score'.
        The default is sklearn.model_selection.cross_validate, with cv=3. Use
        functools.partial to create a function with its parameters fixed.
    reorder : str, callable, optional
        One of the reordering algorithms specified in
        :py:mod:`superintendent.prioritisation`. This describes a function that
        receives input in the shape of n_samples, n_labels and calculates the
        priority in terms of information value in labelling a data point.
    shuffle_prop : float
        The proportion of points that are shuffled when the data points are
        re-ordered (see the reorder keyword argument). This controls the
        "exploration vs exploitation" trade-off: the higher the value, the
        more you explore the feature space randomly; the lower, the more you
        exploit your model's current weak points.
    keyboard_shortcuts : bool, optional
        Whether to enable ipyevent-mediated keyboard capture so that you can
        use the keyboard rather than the mouse to submit data.

    """

    def __init__(self, *args, **kwargs):
        """
        A class for labelling your data.

        This class is designed to label data for (semi-)supervised learning
        algorithms. It allows you to label data, periodically re-train your
        algorithm and assess its performance, and determine which data points
        to label next based on your model's predictions.

        """
        reorder = kwargs.pop("reorder", None)

        super().__init__(*args, **kwargs)

        if self.event_manager is not None:
            self.event_manager.on_dom_event(
                self.input_widget._on_key_down, remove=True
            )

        if (
            not isinstance(self.classifier, MultiOutputClassifier)
            and self.classifier is not None
        ):
            self.classifier = MultiOutputClassifier(self.classifier, n_jobs=-1)

        if reorder is not None and isinstance(reorder, str):
            if reorder not in prioritisation.functions:
                raise NotImplementedError(
                    "Unknown reordering function '{}'.".format(reorder)
                )
            self.reorder = prioritisation.functions[reorder]
        elif reorder is not None and callable(reorder):
            self.reorder = reorder
        elif reorder is None:
            self.reorder = None
        else:
            raise ValueError(
                "The reorder argument needs to be either a function or the "
                "name of a function listed in superintendent.prioritisation."
            )

        self.input_widget = controls.MulticlassSubmitter(
            hint_function=kwargs.get("hint_function"),
            hints=kwargs.get("hints"),
            options=kwargs.get("options", ()),
            max_buttons=kwargs.get("max_buttons", 12),
        )
        self.input_widget.on_submission(self._apply_annotation)
        if self.event_manager is not None:
            self.event_manager.on_dom_event(self.input_widget._on_key_down)
        self._compose()

    def retrain(self, *args):
        """Retrain the classifier you passed when creating this widget.

        This calls the fit method of your class with the data that you've
        labelled. It will also score the classifier and display the
        performance.
        """
        if self.classifier is None:
            raise ValueError("No classifier to retrain.")

        if len(self.queue.list_labels()) < 1:
            self.model_performance.value = (
                "Score: Not enough labels to retrain."
            )
            return

        _, labelled_X, labelled_y = self.queue.list_completed()

        preprocessor = MultiLabelBinarizer()
        labelled_y = preprocessor.fit_transform(labelled_y)

        self._render_processing(message="Retraining... ")

        try:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                self.performance = self.eval_method(
                    self.classifier, labelled_X, labelled_y
                )
                self.model_performance.value = "Score: {:.2f}".format(
                    self.performance["test_score"].mean()
                )

        except ValueError:  # pragma: no cover
            self.performance = "Could not evaluate"
            self.model_performance.value = "Score: {}".format(self.performance)

        self.classifier.fit(labelled_X, labelled_y)

        if self.reorder is not None:
            ids, unlabelled_X = self.queue.list_uncompleted()

            probabilities = self.classifier.predict_proba(unlabelled_X)

            # if len(preprocessor.classes_) > 1:
            #     probabilities = sum(probabilities) / len(probabilities)

            reordering = list(
                self.reorder(probabilities, shuffle_prop=self.shuffle_prop)
            )

            new_order = OrderedDict(
                [(id_, index) for id_, index in zip(ids, list(reordering))]
            )

            self.queue.reorder(new_order)

        self.queue.undo()
        self._annotation_loop.send({"source": "__skip__"})
Beispiel #30
class ModelTrainer:
    """
    Train a machine learning model based on the input yaml config
    """
    RAND_SEED = 42
    input_cmds = ('fit', 'evaluate', 'predict', 'experiment')
    supported_types = ('regression', 'classification', 'clustering')
    results_path = configs.get('results_path')  # path to the results folder
    default_model_path = configs.get(
        'default_model_path')  # path to the pre-fitted model
    description_file = configs.get(
        'description_file')  # path to the description.json file
    evaluation_file = configs.get(
        'evaluation_file')  # path to the evaluation.json file
    prediction_file = configs.get(
        'prediction_file')  # path to the predictions.csv
    default_dataset_props = configs.get(
        'dataset_props'
    )  # dataset props that can be changed from the yaml file
    default_model_props = configs.get(
        'model_props')  # model props that can be changed from the yaml file
    model = None

    def __init__(self, *args, **kwargs) -> None:

        self.data_path: str = kwargs.get('data_path', None)
        self.logfile = kwargs.get('logfile', None)
        self.command = kwargs.get('cmd', None)
        self.results_path = kwargs.get('results_path',
                                       None)  # path to the results folder
        self._x_columns = None
        # results_path as specified input
        if self.results_path is None:
            self.results_path = ModelTrainer.results_path  # path to the results folder
        else:
            self.default_model_path = os.path.join(self.results_path,
                                                   configs.get('model_file'))
            self.description_file = os.path.join(
                self.results_path,
                'description.json')  # path to the description.json file
            self.evaluation_file = os.path.join(
                self.results_path,
                'evaluation.json')  # path to the evaluation.json file
            self.prediction_file = os.path.join(
                self.results_path,
                'prediction.csv')  # path to the prediction.csv file

        logger.info(f"Entered kwargs: {kwargs}")

        if not self.command or self.command not in self.input_cmds:
            raise Exception(f"You must enter a valid command.\n"
                            f"available commands: {self.input_cmds}")

        if self.command == "fit":
            self.yml_path = kwargs.get('yaml_path', None)
            file_ext = self.yml_path.split('.')[-1]
            logger.info(f"You passed the configurations as a {file_ext} file.")

            self.yaml_configs = read_yaml(
                self.yml_path) if file_ext == 'yaml' else read_json(
                    self.yml_path)
            logger.info(f"your chosen configuration: {self.yaml_configs}")

            # dataset options given by the user
            self.dataset_props: dict = self.yaml_configs.get(
                'dataset', self.default_dataset_props)
            # model options given by the user
            self.model_props: dict = self.yaml_configs.get(
                'model', self.default_model_props)
            # list of target(s) to predict
            self.target: list = self.yaml_configs.get('target', None)
            # list of obs_id(s) to identify observation
            self.observation_id: list = self.yaml_configs.get(
                'observation_id', None)

            self.model_type: str = self.model_props.get('type', None)
            logger.info(f"dataset_props: {self.dataset_props} \n"
                        f"model_props: {self.model_props} \n "
                        f"target: {self.target} \n")

            # handle random numbers generation
            random_num_options = self.dataset_props.get('random_numbers', None)
            if random_num_options:
                generate_reproducible = random_num_options.get(
                    'generate_reproducible', None)
                if generate_reproducible:
                    logger.info(
                        "You provided the generate reproducible results option."
                    )
                    seed = random_num_options.get('seed', self.RAND_SEED)
                    np.random.seed(seed)
                    logger.info(
                        f"Setting a seed = {seed} to generate same random numbers on each experiment.."
                    )

        # if entered command is evaluate or predict, then the pre-fitted model needs to be loaded and used
        else:
            self.model_path = kwargs.get('model_path', self.default_model_path)
            logger.info(f"path of the pre-fitted model => {self.model_path}")
            # load description file to read stored training parameters
            with open(self.description_file, 'r') as f:
                dic = json.load(f)
                self.target: list = dic.get(
                    "target")  # target to predict as a list
                self.model_type: str = dic.get(
                    "type"
                )  # type of the model -> regression or classification
                self.dataset_props: dict = dic.get(
                    'dataset_props')  # dataset props entered while fitting
        getattr(self, self.command)()

    def _create_model(self, **kwargs):
        """
        fetch a model depending on the type and algorithm provided by the user
        @return: the instantiated model and the arguments used to create it
        """
        model_type: str = self.model_props.get('type')
        model_algorithm: str = self.model_props.get('algorithm')
        use_cv = self.model_props.get('use_cv_estimator', None)

        model_args = None
        if not model_type or not model_algorithm:
            raise Exception(f"model_type and algorithm cannot be None")
        algorithms: dict = models_dict.get(
            model_type)  # extract all algorithms as a dictionary
        model = algorithms.get(
            model_algorithm)  # extract model class depending on the algorithm
        logger.info(
            f"Solving a {model_type} problem using ===> {model_algorithm}")
        if not model:
            raise Exception("Model not found in the algorithms list")
        else:
            model_props_args = self.model_props.get('arguments', None)
            if model_props_args and isinstance(model_props_args, dict):
                model_args = model_props_args
            elif not model_props_args or model_props_args.lower() == "default":
                model_args = None

            if use_cv:
                model_class = model.get('cv_class', None)
                if model_class:
                    logger.info(
                        f"cross validation estimator detected. "
                        f"Switch to the CV version of the {model_algorithm} algorithm"
                    )
                else:
                    logger.info(
                        f"No CV class found for the {model_algorithm} algorithm"
                    )
            else:
                model_class = model.get('class')
            logger.info(f"model arguments: \n"
                        f"{self.model_props.get('arguments')}")
            model = model_class(**kwargs) if not model_args else model_class(
                **model_args)
            return model, model_args

    def _save_model(self, model):
        """
        save the model to a binary file
        @param model: model to save
        @return: bool
        """
        try:
            if not os.path.exists(self.results_path):
                logger.info(
                    f"creating model_results folder to save results...\n"
                    f"path of the results folder: {self.results_path}")
                os.mkdir(self.results_path)
            else:
                logger.info(f"Folder {self.results_path} already exists")
                logger.warning(
                    f"data in the {self.results_path} folder will be overridden. If you don't "
                    f"want this, then move the current {self.results_path} to another path"
                )

        except OSError:
            logger.exception(
                f"Creating the directory {self.results_path} failed ")
        else:
            logger.info(
                f"Successfully created the directory in {self.results_path} ")
            pickle.dump(model, open(self.default_model_path, 'wb'))
            return True

    def _load_model(self, f: str = ''):
        """
        load a saved model from file
        @param f: path to model
        @return: loaded model
        """
        try:
            if not f:
                logger.info(f"result path: {self.results_path} ")
                logger.info(f"loading model form {self.default_model_path} ")
                model = pickle.load(open(self.default_model_path, 'rb'))
            else:
                logger.info(f"loading from {f}")
                model = pickle.load(open(f, 'rb'))
            return model
        except FileNotFoundError:
            logger.error(f"File not found in {self.default_model_path} ")

    def _prepare_clustering_data(self):
        """
        preprocess data for the clustering algorithm
        """
        return self._process_data(target='fit_cluster')

    def _prepare_predict_data(self):
        """
        preprocess prediction data so that it matches the format used when training the model
        """
        return self._process_data(target='predict')

    def _prepare_fit_data(self):
        return self._process_data(target='fit')

    def _prepare_eval_data(self):
        return self._process_data(target='evaluate')

    def _process_data(self, target='fit'):
        """
        read and return data as x and y
        @return: list of separate x and y
        """
        assert isinstance(self.target,
                          list), "provide target(s) as a list in the yaml file"
        if self.model_type != "clustering":
            assert len(
                self.target) > 0, "please provide at least a target to predict"

        try:
            read_data_options = self.dataset_props.get('read_data_options',
                                                       None)
            dataset = pd.read_csv(
                self.data_path) if not read_data_options else pd.read_csv(
                    self.data_path, **read_data_options)
            logger.info(f"dataset shape: {dataset.shape}")
            attributes = list(dataset.columns)
            logger.info(f"dataset attributes: {attributes}")

            # handle missing values in the dataset
            preprocess_props = self.dataset_props.get('preprocess', None)
            if preprocess_props:
                # handle encoding
                encoding = preprocess_props.get('encoding')
                if encoding:
                    encoding_type = encoding.get('type', None)
                    column = encoding.get('column', None)
                    if column in attributes:
                        dataset, classes_map = encode(
                            df=dataset,
                            encoding_type=encoding_type.lower(),
                            column=column)
                        if classes_map:
                            self.dataset_props[
                                'label_encoding_classes'] = classes_map
                            logger.info(
                                f"adding classes_map to dataset props: \n{classes_map}"
                            )
                        logger.info(
                            f"shape of the dataset after encoding => {dataset.shape}"
                        )

                # preprocessing strategy: mean, median, mode etc..
                strategy = preprocess_props.get('missing_values')
                if strategy:
                    dataset = handle_missing_values(dataset, strategy=strategy)
                    logger.info(
                        f"shape of the dataset after handling missing values => {dataset.shape}"
                    )

            if target == 'predict' or target == 'fit_cluster':
                x = _reshape(dataset.to_numpy())
                if not preprocess_props:
                    return x
                scaling_props = preprocess_props.get('scale', None)
                if not scaling_props:
                    return x
                else:
                    scaling_method = scaling_props.get('method', None)
                    return normalize(x, method=scaling_method)

            if any(col not in attributes for col in self.target):
                raise Exception(
                    "chosen target(s) to predict must exist in the dataset")

            y = pd.concat(
                [dataset.pop(x) for x in self.target],
                axis=1)  # remove target variable(s) from dataset & concat them
            x = _reshape(dataset.to_numpy())
            y = _reshape(y.to_numpy())
            logger.info(f"y shape: {y.shape} and x shape: {x.shape}")
            self._x_columns = dataset.columns.to_list()
            logger.info(f"X columns: {self._x_columns}")

            # handle data scaling
            if preprocess_props:
                scaling_props = preprocess_props.get('scale', None)
                if scaling_props:
                    scaling_method = scaling_props.get('method', None)
                    scaling_target = scaling_props.get('target', None)
                    if scaling_target == 'all':
                        x = normalize(x, method=scaling_method)
                        y = normalize(y, method=scaling_method)
                    elif scaling_target == 'inputs':
                        x = normalize(x, method=scaling_method)
                    elif scaling_target == 'outputs':
                        y = normalize(y, method=scaling_method)

            if target == 'evaluate':
                return x, y

            split_options = self.dataset_props.get('split', None)
            if not split_options:
                return x, y, None, None
            test_size = split_options.get('test_size')
            shuffle = split_options.get('shuffle')
            stratify = split_options.get('stratify')
            x_train, x_test, y_train, y_test = train_test_split(
                x,
                y,
                test_size=test_size,
                shuffle=shuffle,
                stratify=None
                if not stratify or stratify.lower() == "default" else stratify)

            return x_train, y_train, x_test, y_test

        except Exception as e:
            logger.exception(
                f"error occured while preparing the data: {e.args}")

    def get_evaluation(self, model, x_test, y_true, y_pred, y_score, **kwargs):
        try:
            res = evaluate_model(model_type=self.model_type,
                                 model=model,
                                 x_test=x_test,
                                 y_pred=y_pred,
                                 y_true=y_true,
                                 y_score=y_score,
                                 get_score_only=False,
                                 **kwargs)
        except Exception as e:
            logger.debug(e)
            res = evaluate_model(model_type=self.model_type,
                                 model=model,
                                 x_test=x_test,
                                 y_pred=y_pred,
                                 y_true=y_true,
                                 y_score=y_score,
                                 get_score_only=True,
                                 **kwargs)
        return res

    def fit(self, **kwargs):
        """fit a model

        Raises:
            Exception: [description]
        """
        x_train = None
        y_train = None
        x_test = None
        y_test = None

        cv_results = None
        eval_results = None
        cv_params = None
        hp_search_results = {}

        if self.model_type == 'clustering':
            x_train = self._prepare_clustering_data()
        else:
            x_train, y_train, x_test, y_test = self._prepare_fit_data()
        self.model, model_args = self._create_model(**kwargs)
        logger.info(
            f"executing a {self.model.__class__.__name__} algorithm...")

        # convert to multioutput if there is more than one target to predict:
        if self.model_type != 'clustering' and len(self.target) > 1:
            logger.info(
                f"predicting multiple targets detected. Hence, the model will be automatically "
                f"converted to a multioutput model")
            self.model = MultiOutputClassifier(self.model) \
                if self.model_type == 'classification' else MultiOutputRegressor(self.model)

        if self.model_type != 'clustering':
            cv_params = self.model_props.get('cross_validate', None)
            if not cv_params:
                logger.info(f"cross validation is not provided")
            else:
                # perform cross validation
                logger.info("performing cross validation ...")
                cv_results = cross_validate(estimator=self.model,
                                            X=x_train,
                                            y=y_train,
                                            **cv_params)

            hyperparams_props = self.model_props.get('hyperparameter_search',
                                                     None)
            if hyperparams_props:

                # perform hyperparameter search
                method = hyperparams_props.get('method', None)
                grid_params = hyperparams_props.get('parameter_grid', None)
                hp_args = hyperparams_props.get('arguments', None)
                logger.info(
                    f"Performing hyperparameter search using -> {method}")
                logger.info(
                    f"Grid parameters entered by the user: {grid_params}")
                logger.info(f"Additional hyperparameter arguments: {hp_args}")
                best_estimator, best_score, best_params = hyperparameter_search(
                    model=self.model,
                    method=method,
                    params=grid_params,
                    x_train=x_train,
                    y_train=y_train,
                    **hp_args)
                hp_search_results['best_params'] = best_params
                hp_search_results['best_score'] = best_score
                self.model = best_estimator

            self.model.fit(x_train, y_train)

        else:  # if the model type is clustering
            self.model.fit(x_train)

        saved = self._save_model(self.model)
        if saved:
            logger.info(
                f"model saved successfully and can be found in the {self.results_path} folder"
            )

        if self.model_type == 'clustering':
            eval_results = self.model.score(x_train)
        else:
            if x_test is None:
                logger.info(
                    f"no split options was provided. training score will be calculated"
                )
                eval_results = self.model.score(x_train, y_train)

            else:
                logger.info(
                    f"split option detected. The performance will be automatically evaluated "
                    f"using the test data portion")
                y_pred = self.model.predict(x_test)
                y_score = self.model.predict_proba(
                    x_test) if self.model_type == 'classification' else None
                eval_results = self.get_evaluation(model=self.model,
                                                   x_test=x_test,
                                                   y_true=y_test,
                                                   y_pred=y_pred,
                                                   y_score=y_score,
                                                   **kwargs)

        fit_description = {
            "model": self.model.__class__.__name__,
            "arguments": model_args if model_args else "default",
            "type": self.model_props['type'],
            "algorithm": self.model_props['algorithm'],
            "dataset_props": self.dataset_props,
            "model_props": self.model_props,
            "data_path": self.data_path,
            "train_data_shape": x_train.shape,
            "test_data_shape": None if x_test is None else x_test.shape,
            "train_data_size": x_train.shape[0],
            "test_data_size": None if x_test is None else x_test.shape[0],
            "results_path": str(self.results_path),
            "model_path": str(self.default_model_path),
            "target": None if self.model_type == 'clustering' else self.target,
            "results_on_test_data": eval_results,
            "hyperparameter_search_results": hp_search_results
        }
        if self.model_type == 'clustering':
            clustering_res = {
                "cluster_centers": self.model.cluster_centers_,
                "cluster_labels": self.model.labels_
            }
            fit_description['clustering_results'] = clustering_res

        if cv_params:
            cv_res = {
                "fit_time": cv_results['fit_time'].tolist(),
                "score_time": cv_results['score_time'].tolist(),
                "test_score": cv_results['test_score'].tolist()
            }
            fit_description['cross_validation_params'] = cv_params
            fit_description['cross_validation_results'] = cv_res

        try:
            logger.info(f"saving fit description to {self.description_file}")
            with open(self.description_file, 'w', encoding='utf-8') as f:
                json.dump(fit_description, f, ensure_ascii=False, indent=4)
        except Exception as e:
            logger.exception(
                f"Error while storing the fit description file: {e}")

    def evaluate(self, **kwargs):
        """
        evaluate a pre-fitted model and save the results to an evaluation.json file
        @return: None
        """
        x_val = None
        y_true = None
        eval_results = None

        try:
            model = self._load_model()
            if self.model_type != 'clustering':
                x_val, y_true = self._prepare_eval_data()
                y_pred = model.predict(x_val)
                y_score = model.predict_proba(
                    x_val) if self.model_type == 'classification' else None
                eval_results = self.get_evaluation(model=model,
                                                   x_test=x_val,
                                                   y_true=y_true,
                                                   y_pred=y_pred,
                                                   y_score=y_score,
                                                   **kwargs)
            else:
                x_val = self._prepare_clustering_data()
                y_pred = model.predict(x_val)
                eval_results = model.score(x_val, y_pred)

            logger.info(f"saving fit description to {self.evaluation_file}")
            with open(self.evaluation_file, 'w', encoding='utf-8') as f:
                json.dump(eval_results, f, ensure_ascii=False, indent=4)

        except Exception as e:
            logger.exception(f"error occured during evaluation: {e}")

    def predict(self):
        """
        use a pre-fitted model to make predictions and save them as csv
        @return: None
        """
        try:
            model = self._load_model(f=self.model_path)
            x_val = self._prepare_predict_data(
            )  # the same is used for clustering
            y_pred = model.predict(x_val)
            y_pred = _reshape(model.predict_proba(x_val)[:, 1]) if (
                type_of_target(y_pred) == 'binary'
                and self.model_type == 'classification') else _reshape(y_pred)
            logger.info(
                f"predictions shape: {y_pred.shape} | shape len: {len(y_pred.shape)}"
            )
            logger.info(f"predict on targets: {self.target}")
            df_pred = pd.DataFrame.from_dict({
                self.target[i]: y_pred[:,
                                       i] if len(y_pred.shape) > 1 else y_pred
                for i in range(len(self.target))
            })

            logger.info(f"saving the predictions to {self.prediction_file}")
            df_pred.to_csv(self.prediction_file)

        except Exception as e:
            logger.exception(f"Error while preparing predictions: {e}")

    @staticmethod
    def create_init_config_file(model_type=None,
                                model_name=None,
                                target=None,
                                *args,
                                **kwargs):
        path = configs.get('init_file_path', None)
        if not path:
            raise Exception("You need to provide a path for the init file")

        dataset_props = ModelTrainer.default_dataset_props
        model_props = ModelTrainer.default_model_props
        if model_type:
            logger.info(f"user selected model type = {model_type}")
            model_props['type'] = model_type
        if model_name:
            logger.info(f"user selected algorithm = {model_name}")
            model_props['algorithm'] = model_name

        logger.info(f"initalizing a default ModelTrainer.yaml in {path}")
        default_data = {
            "dataset":
            dataset_props,
            "model":
            model_props,
            "target": ['provide your target(s) here']
            if not target else [tg for tg in target.split()]
        }
        created = create_yaml(default_data, path)
        if created:
            logger.info(
                f"a default Model.yaml is created for you in {path}. "
                f"you just need to overwrite the values to meet your expectations"
            )
        else:
            logger.warning(
                f"something went wrong while initializing a default file")