Esempio n. 1
0
def align_left(df1, df2, log=False):
    """
    Restricts two dataframes to the columns they have in common.

    :param df1:
        First dataframe
    :param df2:
        Second dataframe
    :param log:
        Flag for log on the console

    :return:
        The two dataframes, each reduced to the shared columns
    """
    if log:
        section_timer = Timer(log=f"removing columns of the second dataframe that are not in the first")

    # an inner join on the column axis keeps only the labels present in both
    aligned_first, aligned_second = df1.align(df2, join="inner", axis=1)

    if log:
        section_timer.end_timer(log=f"done, with final shapes of {aligned_first.shape} and {aligned_second.shape}")

    return aligned_first, aligned_second
Esempio n. 2
0
def parse_CSV_to_df(file_path, lines_cap=None, reduce_size=False, log=False):
    """
    Reads a .csv file into a pandas dataframe.

    :param file_path:
        Path of the .csv file to read
    :param lines_cap:
        Maximum number of data rows to read
        If None, the whole file is read
    :param reduce_size:
        If True, the parsed dataframe goes through infer_objects() and
        reduce_dataframe_size() before being returned
    :param log:
        Flag for log on the console

    :return df:
        The parsed dataframe
    """
    if log:
        # file size expressed in units of 2**20 bytes
        size_in_mb = os.path.getsize(file_path) * 2 ** (-20)
        section_timer = Timer(log=f"parsing file {file_path} ({size_in_mb:.1f}Mb)")

    # the first line of the file is the header; the index is never taken from a column
    parsed = pd.read_csv(file_path,
                         header=0,
                         index_col=False,
                         nrows=lines_cap,
                         encoding="utf_8")

    df = reduce_dataframe_size(parsed.infer_objects()) if reduce_size else parsed

    if log:
        section_timer.end_timer(log=f"parsed a dataframe of {df.shape[0]} rows and {df.shape[1]} features")

    return df
Esempio n. 3
0
def correct_nan_values(df, log=False):
    """
    Normalises placeholder strings and two-valued numerical columns.

    The strings "XAP" and "XNA" are converted to NaN in the whole dataframe.
    Then, for every non-object column whose name does not contain "sk_id",
    if the column holds exactly two distinct non-missing values and one of
    them is 0, the other value is rewritten to 1.

    Missing values are ignored when counting distinct values and are never
    overwritten. A column whose two values are both non-zero is left as it
    is, since mapping both to 1 would turn it into a constant.

    :param df:
        Input dataframe (not modified; a new dataframe is returned)
    :param log:
        Flag for log on the console

    :return:
        The corrected dataframe
    """
    if log: section_timer = Timer(log=f"searching for anomalies")

    # converts some particular strings into nans
    df = df.replace(to_replace=["XAP", "XNA"], value=np.nan)

    for col in df.columns:
        # we operate only on non-object columns that are not identifiers
        if df[col].dtype == "object" or "sk_id" in col.lower().strip():
            continue

        # NaN is not a value: it must not count towards the two categories
        distinct_values = df[col].dropna().unique().tolist()
        if len(distinct_values) != 2:
            continue

        non_zero_values = [value for value in distinct_values if value != 0]
        # binarise only when the column is really "0 versus something"
        if len(non_zero_values) == 1:
            df[col] = df[col].replace(non_zero_values, 1)

    if log: section_timer.end_timer(log=f"done")

    return df
def random_forest(X_train=None,
                  X_validate=None,
                  y_train=None,
                  y_validate=None,
                  tuning=False,
                  log=False):
    """
    Builds an unfitted Random Forest classifier, optionally tuned.

    With tuning enabled, every combination of n_estimators, max_samples and
    max_features in the grid below is fitted on the training set and scored
    with ROC AUC computed on the hard predictions for the validation set.
    The best combination configures the classifier that is returned.

    :param X_train:
        Training features (only used when tuning)
    :param X_validate:
        Validation features (only used when tuning)
    :param y_train:
        Training labels (only used when tuning)
    :param y_validate:
        Validation labels (only used when tuning)
    :param tuning:
        If True, runs the grid search; otherwise a default configuration
        is returned
    :param log:
        Flag for log on the console

    :return:
        An unfitted RandomForestClassifier
    """
    # default classifier
    if not tuning:
        return RandomForestClassifier(n_estimators=300,
                                      criterion="gini",
                                      bootstrap=True,
                                      max_samples=0.2,
                                      n_jobs=4)

    if log: section_timer = Timer(log=f"tuning Random Forest classifier")

    # -inf guarantees that the first candidate is always recorded, so the
    # best parameters can never be left at None
    best_score = float("-inf")
    n_estimators_best, max_samples_best, max_features_best = None, None, None

    # testing several parameters
    for n_estimators in [1, 100, 500, 1000]:
        for max_samples in [0.1, 0.25]:
            for max_features in ["sqrt", "log2"]:
                candidate = RandomForestClassifier(
                    n_estimators=n_estimators,
                    criterion="gini",
                    max_features=max_features,
                    class_weight=None,
                    bootstrap=True,
                    warm_start=True,
                    max_samples=max_samples,
                    n_jobs=4).fit(X_train, y_train)
                score = roc_auc_score(y_validate,
                                      candidate.predict(X_validate))
                if score > best_score:
                    best_score = score
                    n_estimators_best = n_estimators
                    max_samples_best = max_samples
                    max_features_best = max_features

    # choosing best parameters
    classifier = RandomForestClassifier(n_estimators=n_estimators_best,
                                        criterion="gini",
                                        max_features=max_features_best,
                                        class_weight=None,
                                        bootstrap=True,
                                        warm_start=True,
                                        max_samples=max_samples_best,
                                        n_jobs=4)
    if log:
        section_timer.end_timer(
            log=f"done with a max score of {best_score}")

    return classifier
def feature_selection(df, y_train, corr_threshold=0.8, log=False):
    """
    Ranks the columns by ExtraTrees feature importance and keeps the top ones.

    :param df:
        Dataframe holding the features
    :param y_train:
        Labels used to fit the ExtraTreesClassifier
    :param corr_threshold:
        Fraction of the columns to keep (the count is truncated to an int)
    :param log:
        Flag for log on the console

    :return:
        Names of the kept columns, most important first
    """
    if log:
        section_timer = Timer(log=f"finding the features")

    features_to_keep = int(len(df.columns) * corr_threshold)

    forest = ExtraTreesClassifier(n_estimators=100).fit(df.to_numpy(), y_train)
    importances = pd.Series(forest.feature_importances_)

    # positions of the columns, ordered by decreasing importance
    ranked_positions = importances.sort_values(ascending=False).index.tolist()
    columns = [
        df.columns[position]
        for position in ranked_positions[:features_to_keep]
    ]

    if log:
        section_timer.end_timer(
            log=f"selected {len(columns)} (out of {len(df.columns)}) features")

    return columns
Esempio n. 6
0
def remove_useless_columns(df, log=False):
    """
    Drops the columns whose name matches a fixed list of patterns.

    A column is dropped when its name starts with a match of one of:
    AMT_REQ_CREDIT_BUREAU_(HOUR|WEEK|DAY|QRT), anything followed by _AVG or
    _MODE, or WEEKDAY_APPR_PROCESS_START.

    :param df:
        Input dataframe (not modified)
    :param log:
        Flag for log on the console

    :return:
        The dataframe without the matching columns; the input object itself
        when no column matches
    """
    if log: section_timer = Timer(log=f"searching for totally useless columns")

    useless_pattern = re.compile(
        r"(AMT_REQ_CREDIT_BUREAU_(HOUR|WEEK|DAY|QRT))|(.*_(AVG|MODE))|(WEEKDAY_APPR_PROCESS_START)"
    )

    # unique() keeps a duplicated label from being requested twice
    useless_columns = [
        col for col in df.columns.unique()
        if useless_pattern.match(col) is not None
    ]

    # a single drop avoids copying the dataframe once per removed column
    if useless_columns:
        df = df.drop(columns=useless_columns)

    if log:
        section_timer.end_timer(
            log=
            f"removed {len(useless_columns)} columns for a final shape of {df.shape}"
        )
    return df
Esempio n. 7
0
def undersample(df, complex=False, log=False):
    """
    Balances the two TARGET classes by random undersampling.

    Both the rows with TARGET == 0 and the rows with TARGET == 1 are sampled
    down to the size of the less frequent class, as reported by
    evaluation.count_values.

    :param df:
        Input dataframe with a TARGET column; a "_merge" column, if present,
        is dropped first
    :param complex:
        Not used
    :param log:
        Flag for log on the console

    :return:
        Dataframe with the sampled class-0 rows followed by the sampled
        class-1 rows
    """
    if log:
        section_timer = Timer(log=f"undersampling")

    if "_merge" in df.columns:
        df = df.drop("_merge", axis=1)

    count = evaluation.count_values(df, "TARGET")
    # size of the rarer class
    minority_size = count[0] if count[0] < count[1] else count[1]

    negatives = df[df["TARGET"] == 0].sample(minority_size)
    positives = df[df["TARGET"] == 1].sample(minority_size)
    df_new = pd.concat([negatives, positives])

    if log:
        section_timer.end_timer(
            log=f"done with a final shape of {df_new.shape}")
    return df_new
Esempio n. 8
0
def write_df_to_file(df, file_path, index=False, reduce_size=True, header=True, log=False):
    """
    Writes a dataframe to a comma-separated file.

    :param df:
        Pandas' dataframe being written to a file
    :param file_path:
        Path of the file being created from the dataframe
    :param index:
        If True, the row index is written as well
    :param reduce_size:
        If True, the dataframe goes through reduce_dataframe_size() before
        being written
    :param header:
        If True, the column names are written as first line
    :param log:
        Flag for log on the console
    """
    if log:
        section_timer = Timer(log=f"writing file {file_path}")

    output = reduce_dataframe_size(df) if reduce_size else df
    output.to_csv(file_path, sep=",", header=header, index=index)

    if log:
        section_timer.end_timer(
            log=f"written a dataframe of {output.shape[0]} rows and {output.shape[1]} features")
Esempio n. 9
0
def smote(df, log=False):
    """
    Oversamples the dataframe with SMOTE so that the TARGET classes balance.

    The two TARGET values are first recoded to 0 (smaller value) and 1
    (larger value). Only the TARGET column is recoded: feature columns that
    happen to contain the same values are left untouched.

    :param df:
        Input dataframe with a TARGET column holding exactly two distinct
        values; a "_merge" column, if present, is dropped first
    :param log:
        Flag for log on the console

    :return:
        The resampled dataframe, with TARGET as last column

    :raises ValueError:
        If TARGET does not hold exactly two distinct values
    """
    if log:
        section_timer = Timer(log=f"oversampling using SMOTE")
    if "_merge" in df.columns:
        df = df.drop("_merge", axis=1)

    target_values = sorted(pd.unique(df["TARGET"]).tolist())
    # unpacking fails with ValueError unless there are exactly two classes
    false_number, true_number = target_values

    # recode the labels only, never the features
    labels = df["TARGET"].map({false_number: 0, true_number: 1})
    features = df.drop(columns="TARGET")

    # NOTE(review): n_jobs may not be accepted by recent imbalanced-learn
    # releases - confirm against the installed version
    resampled, resampled_labels = SMOTE(n_jobs=4).fit_resample(features, labels)
    resampled["TARGET"] = resampled_labels

    if log:
        section_timer.end_timer(log=f"for a total shape of {resampled.shape}")

    return resampled
def logistic_regression(X_train=None,
                        X_validate=None,
                        y_train=None,
                        y_validate=None,
                        tuning=False,
                        log=False):
    """
    Builds an unfitted Logistic Regression classifier, optionally tuned.

    With tuning enabled, each solver in the list below is fitted on the
    training set (max_iter=500) and scored with ROC AUC computed on the hard
    predictions for the validation set. The best solver configures the
    classifier that is returned (max_iter=1000).

    :param X_train:
        Training features (only used when tuning)
    :param X_validate:
        Validation features (only used when tuning)
    :param y_train:
        Training labels (only used when tuning)
    :param y_validate:
        Validation labels (only used when tuning)
    :param tuning:
        If True, runs the solver search; otherwise a default configuration
        is returned
    :param log:
        Flag for log on the console

    :return:
        An unfitted LogisticRegression
    """
    # default classifier
    if not tuning:
        return LogisticRegression(dual=False,
                                  max_iter=1000,
                                  n_jobs=4,
                                  C=1)

    if log:
        section_timer = Timer(log=f"tuning Logistic Regression classifier")

    # -inf guarantees that the first solver is always recorded, so the best
    # solver can never be left at None
    best_score, solver_best = float("-inf"), None

    # testing several parameters
    for solver in ["liblinear", "lbfgs", "newton-cg", "saga"]:
        candidate = LogisticRegression(solver=solver,
                                       dual=False,
                                       warm_start=True,
                                       max_iter=500,
                                       n_jobs=4,
                                       C=1).fit(X_train, y_train)
        score = roc_auc_score(y_validate, candidate.predict(X_validate))
        if score > best_score:
            best_score, solver_best = score, solver

    # choosing best parameters
    classifier = LogisticRegression(solver=solver_best,
                                    dual=False,
                                    warm_start=True,
                                    max_iter=1000,
                                    n_jobs=4,
                                    C=1)
    if log:
        section_timer.end_timer(
            log=f"done with a max score of {best_score}")

    return classifier
Esempio n. 11
0
def remove_rows(df, threshold, log=False):
    """
    Removes rows with more than a percentage of NaN values

    A row is kept only when its fraction of NaN values is at most the
    threshold, i.e. when it has at least ceil(columns * (1 - threshold))
    non-NaN values.

    :param df:
        Input dataframe (not modified)
    :param threshold:
        Maximum fraction (between 0 and 1) of NaN values for rows
    :param log:
        Flag for log on the console
    :return:
        The dataframe without the rows exceeding the threshold
    """
    import math

    if log: section_timer = Timer(log=f"removing rows with more than {threshold * 100}% of NaNs")

    # The required count must be rounded up: rounding down would keep rows
    # that exceed the threshold. The inner round() absorbs float noise such
    # as 10 * (1 - 0.9) == 0.9999999999999998.
    non_nan_values = math.ceil(round(df.shape[1] * (1 - threshold), 9))
    df_clean = df.dropna(thresh=non_nan_values, axis=0)

    if log: section_timer.end_timer(log=f"removed {df.shape[0] - df_clean.shape[0]} rows")
    return df_clean
Esempio n. 12
0
def show_unique_values(df, file_path=None, log=False):
    if log:
        section_timer = Timer(log=f"searching for unique values")

    unique_values = {}
    for col in df.columns:
        # if it's a discrete column
        if df[col].dtype == "object":
            print(col)
            unique_values[col] = set(df[col].tolist())
        # pene
    pprint(unique_values)

    if file_path != None:
        with open(file_path, "w") as fp:
            pprint(unique_values, stream=fp)

    if log:
        section_timer.end_timer(log=f"done")
Esempio n. 13
0
def impute_missing_values(df, mode="simple", columns=None, reduce_size=False, log=False):
    """
    Fills the missing values of the selected columns, in place.

    :param df:
        Input dataframe; the imputed values are written back into it
    :param mode:
        Imputation strategy (case and surrounding blanks are ignored):
        "simple 0" fills with the constant 0, "simple mean", "simple median"
        and "simple most common" use the matching SimpleImputer strategy,
        "iterative" uses an IterativeImputer. The default "simple" is
        treated as "simple mean" - NOTE(review): the default used to be
        rejected as unrecognized; confirm that mean is the intended meaning.
    :param columns:
        Columns to impute
        If None, all the columns are imputed; if empty, df is returned as is
    :param reduce_size:
        If True, the result goes through parsing.reduce_dataframe_size()
    :param log:
        Flag for log on the console

    :return:
        The dataframe with imputed values

    :raises ValueError:
        If mode is not one of the supported modes
    """
    if columns is None:
        columns_to_impute = list(df.columns)
    elif len(columns) == 0:
        return df
    else:
        columns_to_impute = columns

    # factories, so that only the requested imputer is ever built
    imputer_factories = {
        "simple": lambda: SimpleImputer(strategy="mean", copy=False),
        "simple 0": lambda: SimpleImputer(strategy="constant", fill_value=0),
        "simple median": lambda: SimpleImputer(strategy="median", copy=False),
        "simple mean": lambda: SimpleImputer(strategy="mean", copy=False),
        "simple most common": lambda: SimpleImputer(strategy="most_frequent", copy=False),
        "iterative": lambda: IterativeImputer(max_iter=3, n_nearest_features=5),
    }

    # validate before doing any work
    normalized_mode = mode.lower().strip()
    if normalized_mode not in imputer_factories:
        raise ValueError(
            f'Unrecognized mode {mode.strip()}.\nOnly supported modes are "simple", "simple 0", "simple mean", "simple median", "simple most common", "iterative"')

    if log: section_timer = Timer(log=f"imputing missing values")

    X = df[columns_to_impute].to_numpy()
    X_pred = imputer_factories[normalized_mode]().fit_transform(X)

    df[columns_to_impute] = X_pred

    if reduce_size:
        df = parsing.reduce_dataframe_size(df, log=False)

    if log:
        section_timer.end_timer(log=f"done")

    return df
def multilayer_perceptron(X_train=None,
                          X_validate=None,
                          y_train=None,
                          y_validate=None,
                          tuning=False,
                          log=False):
    """
    Builds an unfitted Multilayer Perceptron classifier, optionally tuned.

    With tuning enabled, every combination of activation and learning rate
    in the grid below is fitted on the training set and scored with the F1
    score (default positive label) on the validation set. The best
    combination configures the classifier that is returned.

    :param X_train:
        Training features (only used when tuning)
    :param X_validate:
        Validation features (only used when tuning)
    :param y_train:
        Training labels (only used when tuning)
    :param y_validate:
        Validation labels (only used when tuning)
    :param tuning:
        If True, runs the grid search; otherwise a default configuration
        is returned
    :param log:
        Flag for log on the console

    :return:
        An unfitted MLPClassifier
    """
    # default classifier
    if not tuning:
        return MLPClassifier(solver="adam",
                             activation="relu",
                             learning_rate="constant",
                             max_iter=200)

    if log:
        section_timer = Timer(
            log=f"tuning Multilayer Perceptron classifier")

    # -inf guarantees that the first candidate is always recorded: with 0 as
    # starting point, a search where every F1 score is 0 left the best
    # parameters at None
    best_score = float("-inf")
    activation_best, learning_rate_best = None, None

    # testing several parameters
    for activation in ["logistic", "relu"]:
        for learning_rate in ["constant", "adaptive"]:
            candidate = MLPClassifier(activation=activation,
                                      learning_rate=learning_rate,
                                      solver="adam",
                                      max_iter=200).fit(X_train, y_train)
            score = f1_score(y_validate, candidate.predict(X_validate))
            if score > best_score:
                best_score = score
                activation_best, learning_rate_best = activation, learning_rate

    # choosing best parameters
    classifier = MLPClassifier(activation=activation_best,
                               learning_rate=learning_rate_best,
                               solver="adam",
                               max_iter=200)
    if log:
        section_timer.end_timer(
            log=f"done with a max score of {best_score}")

    return classifier
Esempio n. 15
0
def remove_columns(df, threshold, log=False):
    """
    Removes columns with more than a percentage of NaN values

    A column is kept only when its fraction of NaN values is at most the
    threshold, i.e. when it has at least ceil(rows * (1 - threshold))
    non-NaN values.

    :param df:
        Input dataframe (not modified)
    :param threshold:
        Maximum fraction (between 0 and 1) of NaN values for columns
    :param log:
        Flag for log on the console
    :return:
        A tuple (cleaned dataframe, list of the dropped column names in
        their original order)
    """
    import math

    if log: section_timer = Timer(log=f"removing columns with more than {threshold * 100}% of nans")

    # The required count must be rounded up: rounding down would keep
    # columns that exceed the threshold. The inner round() absorbs float
    # noise such as 10 * (1 - 0.9) == 0.9999999999999998.
    non_nan_values = math.ceil(round(df.shape[0] * (1 - threshold), 9))
    df_clean = df.dropna(thresh=non_nan_values, axis=1)

    # dict.fromkeys de-duplicates while keeping the column order
    kept_columns = set(df_clean.columns)
    dropped_cols = list(dict.fromkeys(
        col for col in df.columns if col not in kept_columns))

    if log: section_timer.end_timer(log=f"removed {len(dropped_cols)} columns")
    return df_clean, dropped_cols
Esempio n. 16
0
def frequency_encoding(df, just_one_hot=False, log=False):
    """
    One-hot encodes the dataframe and, optionally, weights the new columns
    by their frequency.

    :param df: dataframe to be encoded
    :param just_one_hot: if true, the one-hot columns are returned unweighted
    :param log: true if we want to set the timer on
    :return: encoded dataframe
    """
    if log:
        section_timer = Timer(
            log=f"frequency encoding a dataframe with shape {df.shape}")

    columns_before = set(df.columns)

    # one-hot encoding
    encoded = pd.get_dummies(df)

    if not just_one_hot:
        # every column created by the encoding is scaled by the share of
        # rows in which it is set
        row_count = encoded.shape[0]
        for dummy in set(encoded.columns).difference(columns_before):
            frequency = encoded[dummy].sum() / row_count
            encoded[dummy] = encoded[dummy] * frequency

    if log:
        section_timer.end_timer(log=f"done")
    return encoded
def knn(X_train=None,
        X_validate=None,
        y_train=None,
        y_validate=None,
        tuning=False,
        log=False):
    """
    Builds an unfitted K-Nearest Neighbors classifier, optionally tuned.

    With tuning enabled, each neighbor count in the list below is fitted on
    the training set and scored with the F1 score (default positive label)
    on the validation set. The best count configures the classifier that is
    returned.

    :param X_train:
        Training features (only used when tuning)
    :param X_validate:
        Validation features (only used when tuning)
    :param y_train:
        Training labels (only used when tuning)
    :param y_validate:
        Validation labels (only used when tuning)
    :param tuning:
        If True, runs the search; otherwise a default configuration is
        returned
    :param log:
        Flag for log on the console

    :return:
        An unfitted KNeighborsClassifier
    """
    # default classifier
    if not tuning:
        return KNeighborsClassifier(n_neighbors=2,
                                    weights="distance",
                                    p=2,
                                    n_jobs=4)

    if log: section_timer = Timer(log=f"tuning KNN classifier")

    # -inf guarantees that the first candidate is always recorded: with 0 as
    # starting point, a search where every F1 score is 0 left the neighbor
    # count at None
    best_score, n_neighbors_best = float("-inf"), None

    # testing several parameters
    for neighbors in [2, 8]:
        candidate = KNeighborsClassifier(n_neighbors=neighbors,
                                         weights="distance",
                                         p=2,
                                         n_jobs=4).fit(X_train, y_train)
        score = f1_score(y_validate, candidate.predict(X_validate))
        if score > best_score:
            best_score, n_neighbors_best = score, neighbors

    # choosing best parameters
    classifier = KNeighborsClassifier(n_neighbors=n_neighbors_best,
                                      weights="distance",
                                      p=2,
                                      n_jobs=4)
    if log:
        section_timer.end_timer(
            log=f"done with a max score of {best_score}")

    return classifier
def lda(X_train=None,
        X_validate=None,
        y_train=None,
        y_validate=None,
        tuning=False,
        log=False):
    """
    Builds an unfitted Linear Discriminant Analysis classifier, optionally
    tuned.

    With tuning enabled, the "svd" solver (without shrinkage) and the
    "eigen" and "lsqr" solvers (without and with automatic shrinkage) are
    fitted on the training set and scored with the F1 score (default
    positive label) on the validation set. The best configuration is used
    for the classifier that is returned.

    :param X_train:
        Training features (only used when tuning)
    :param X_validate:
        Validation features (only used when tuning)
    :param y_train:
        Training labels (only used when tuning)
    :param y_validate:
        Validation labels (only used when tuning)
    :param tuning:
        If True, runs the search; otherwise a default configuration is
        returned
    :param log:
        Flag for log on the console

    :return:
        An unfitted LinearDiscriminantAnalysis
    """
    # default classifier
    if not tuning:
        return LinearDiscriminantAnalysis()

    if log:
        section_timer = Timer(
            log=f"tuning Linear Discriminant Analysis classifier")

    # "svd" is scored too, but only without shrinkage; it stays the fallback
    # configuration as before
    candidates = [("svd", None)]
    for solver in ["eigen", "lsqr"]:
        for shrinkage in [None, "auto"]:
            candidates.append((solver, shrinkage))

    best_score, solver_best, shrinkage_best = float("-inf"), "svd", None

    # testing several parameters
    for solver, shrinkage in candidates:
        candidate = LinearDiscriminantAnalysis(
            solver=solver,
            shrinkage=shrinkage).fit(X_train, y_train)
        score = f1_score(y_validate, candidate.predict(X_validate))
        if score > best_score:
            best_score, solver_best, shrinkage_best = score, solver, shrinkage

    # choosing best parameters
    classifier = LinearDiscriminantAnalysis(solver=solver_best,
                                            shrinkage=shrinkage_best)
    if log:
        section_timer.end_timer(
            log=f"done with a max score of {best_score}")

    return classifier
Esempio n. 19
0
def merge_dfs(dfs, data_path, groupby_mode="mean", just_one_hot=False, do_imputing=False, log=False):  # to join all the other dataframes.
    """
    Left-joins the bureau and previous-application tables onto each dataframe.

    The two side tables are produced by __joining_minor_csvs and joined on
    the SK_ID_CURR column. Each merged dataframe gets a fresh row index.

    :param dfs:
        list of dataframes to enrich; its elements are replaced in place by
        the merged dataframes
    :param data_path:
        location where there are all the data
    :param groupby_mode:
        aggregation mode forwarded to __joining_minor_csvs
    :param just_one_hot:
        forwarded to __joining_minor_csvs
    :param do_imputing:
        forwarded to __joining_minor_csvs
    :param log:
        flag for the time
    :return:
        list of the merged dataframes, each passed through
        reduce_dataframe_size
    """
    if log: section_timer = Timer(log=f"merging the dataframes")

    # the caller's groupby_mode is forwarded instead of a hard-coded "mean"
    bureau, prev_application = __joining_minor_csvs(data_path, just_one_hot=just_one_hot, do_imputing=do_imputing, groupby_mode=groupby_mode)

    for i, merged in enumerate(dfs):
        for side_table in (bureau, prev_application):
            # "on" and "left_index" are alternative ways of naming the join
            # key and cannot be combined, so only "on" is passed
            merged = pd.merge(left=merged, right=side_table,
                              how='left', on="SK_ID_CURR")
        dfs[i] = merged

    if log: section_timer.end_timer(log=f"join completed")

    return [reduce_dataframe_size(df) for df in dfs]
def pca_transform(dfs_to_transform,
                  corr_threshold=0.7,
                  PCA_n_components=None,
                  old_cols=None,
                  log=False):
    """
    Scales, expands and projects a group of dataframes with a shared PCA.

    The dataframes are stripped of their TARGET column, stacked, passed
    through scale() and add_poly_features(), projected with a whitened PCA,
    split back into the original row blocks and reduced to the components
    selected by feature_selection() on the first block. TARGET is finally
    re-inserted as second column of the blocks that had one.

    :param dfs_to_transform:
        List of at least two dataframes; the first one is used, together
        with its TARGET values, for both feature selections. The list
        itself is not modified.
    :param corr_threshold:
        Fraction forwarded to feature_selection() when choosing the columns
        that receive polynomial features
    :param PCA_n_components:
        Forwarded to PCA(n_components=...)
    :param old_cols:
        Columns eligible for polynomial features; it is passed to set(), so
        None raises TypeError
    :param log:
        Flag for log on the console

    :return:
        List of transformed dataframes, in the same order as the input
    """
    if log: section_timer = Timer(log=f"computing PCA")

    # private copy of the list: the caller's list keeps its original frames
    frames = list(dfs_to_transform)

    # one label slot per frame, whatever the number of frames
    ys_to_transform = [None] * len(frames)
    for i, frame in enumerate(frames):
        if "TARGET" in frame.columns:
            ys_to_transform[i] = frame.loc[:, "TARGET"].to_numpy().tolist()
            frames[i] = frame.drop(columns=["TARGET"])

    # stacking the frames; indexes[i] is the first row of frame i in the stack
    # (the column layout is seeded from the second frame, as before)
    concatenated_dfs, indexes = pd.DataFrame(columns=frames[1].columns), []
    for frame in frames:
        indexes.append(concatenated_dfs.shape[0])
        concatenated_dfs = pd.concat([concatenated_dfs, frame],
                                     sort=False).reset_index(drop=True)

    if log: print(f"\t...scaling dataframes...")

    # scaling
    concatenated_dfs = scale(concatenated_dfs)

    if log: print(f"\t...adding polynomial features...")

    # polynomial features, only on selected columns that are also in old_cols
    cols_to_poly = set(
        feature_selection(frames[0],
                          y_train=ys_to_transform[0],
                          corr_threshold=corr_threshold)) & set(old_cols)
    concatenated_dfs = add_poly_features(concatenated_dfs,
                                         features_list=cols_to_poly)

    if log: print(f"\t...finding principal components...")

    # PCA
    pca = PCA(n_components=PCA_n_components, whiten=True, svd_solver="auto")
    concatenated_dfs = pd.DataFrame(data=pca.fit_transform(concatenated_dfs))

    # reproducing dfs: each block runs from its start to the next block's
    # start, the last one to the end of the stack
    block_ends = indexes[1:] + [None]
    transformed = [
        concatenated_dfs.iloc[start:end, :]
        for start, end in zip(indexes, block_ends)
    ]

    # NOTE(review): this second selection uses a fixed 0.7 rather than
    # corr_threshold - confirm whether that is intended
    selected_components = list(
        feature_selection(transformed[0],
                          ys_to_transform[0],
                          corr_threshold=0.7,
                          log=False))
    transformed = [block[selected_components] for block in transformed]

    # putting the labels back where they were available
    for block, labels in zip(transformed, ys_to_transform):
        if labels is not None:
            block.insert(1, "TARGET", labels)

    if log:
        section_timer.end_timer(
            log=f"found {len(selected_components)} components")
    return transformed
def predict(X_train,
            X_test,
            y_train,
            X_validate=None,
            y_validate=None,
            mode="ensemble",
            tuning=False,
            probabilities=True,
            k_fold_splits=3,
            log=False):
    # ensemble
    # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html
    if mode.lower().strip() in ["ensemble", "voting"]:
        classifier_name = "Ensemble"
        if log:
            section_timer = Timer(
                log=f"predicting using {classifier_name} classifier")
        classifiers = [
            ("Random forest",
             random_forest(X_train=X_train,
                           X_validate=X_validate,
                           y_train=y_train,
                           y_validate=y_validate,
                           tuning=tuning,
                           log=log)),
            #("Naive Bayes", naive_bayes(X_train=X_train, X_validate=X_validate, y_train=y_train, y_validate=y_validate, tuning=tuning, log=log)),
            ("Logistic Regression",
             logistic_regression(X_train=X_train,
                                 X_validate=X_validate,
                                 y_train=y_train,
                                 y_validate=y_validate,
                                 tuning=tuning,
                                 log=log)),
            ("MLP",
             multilayer_perceptron(X_train=X_train,
                                   X_validate=X_validate,
                                   y_train=y_train,
                                   y_validate=y_validate,
                                   tuning=tuning,
                                   log=log)),
            ("KNN",
             knn(X_train=X_train,
                 X_validate=X_validate,
                 y_train=y_train,
                 y_validate=y_validate,
                 tuning=tuning,
                 log=log)),
            #("SVM", svm(X_train=X_train, X_validate=X_validate, y_train=y_train, y_validate=y_validate, tuning=tuning, log=log)),
            ("AdaBoost",
             adaboost(X_train=X_train,
                      X_validate=X_validate,
                      y_train=y_train,
                      y_validate=y_validate,
                      tuning=tuning,
                      log=log))
        ]

        classifier = VotingClassifier(estimators=classifiers, voting='soft')

    # random forest classifier
    # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
    elif mode.lower().strip() in ["random forest", "rf", "forest"]:
        classifier_name = "Random Forest"
        if log:
            section_timer = Timer(
                log=f"predicting using {classifier_name} classifier")
        classifier = random_forest(X_train=X_train,
                                   X_validate=X_validate,
                                   y_train=y_train,
                                   y_validate=y_validate,
                                   tuning=tuning,
                                   log=log)

    # naive bayes
    # https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html
    elif mode.lower().strip() in ["bayes", "naive bayes", "nb"]:
        classifier_name = "Naive Bayes"
        if log:
            section_timer = Timer(
                log=f"predicting using {classifier_name} classifier")
        classifier = naive_bayes(X_train=X_train,
                                 X_validate=X_validate,
                                 y_train=y_train,
                                 y_validate=y_validate,
                                 tuning=tuning,
                                 log=log)

    # logistic regression
    # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
    elif mode.lower().strip() in [
            "logistic", "logistic regression", "regression"
    ]:
        classifier_name = "Logistic Regression"
        if log:
            section_timer = Timer(
                log=f"predicting using {classifier_name} classifier")
        classifier = logistic_regression(X_train=X_train,
                                         X_validate=X_validate,
                                         y_train=y_train,
                                         y_validate=y_validate,
                                         tuning=tuning,
                                         log=log)

    # multilayer perceptron
    # https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html
    elif mode.lower().strip() in [
            "mlp", "multilayer perceptron", "perceptron"
    ]:
        classifier_name = "Multilayer Perceptron"
        if log:
            section_timer = Timer(
                log=f"predicting using {classifier_name} classifier")
        classifier = multilayer_perceptron(X_train=X_train,
                                           X_validate=X_validate,
                                           y_train=y_train,
                                           y_validate=y_validate,
                                           tuning=tuning,
                                           log=log)

    # K nearest neighbors classifier
    # https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
    elif mode.lower().strip() in ["knn", "nearest neighbors"]:
        classifier_name = "K-Nearest Neighbors"
        if log:
            section_timer = Timer(
                log=f"predicting using {classifier_name} classifier")
        classifier = knn(X_train=X_train,
                         X_validate=X_validate,
                         y_train=y_train,
                         y_validate=y_validate,
                         tuning=tuning,
                         log=log)

    # Support Vector Machine
    # https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC
    elif mode.lower().strip() in ["svm"]:
        classifier_name = "SVM"
        if log:
            section_timer = Timer(
                log=f"predicting using {classifier_name} classifier")
        classifier = svm(X_train=X_train,
                         X_validate=X_validate,
                         y_train=y_train,
                         y_validate=y_validate,
                         tuning=tuning,
                         log=log)

    # AdaBoost
    # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
    elif mode.lower().strip() in ["adaboost", "ada boost", "ada"]:
        classifier_name = "AdaBoost"
        if log:
            section_timer = Timer(
                log=f"predicting using {classifier_name} classifier")
        classifier = adaboost(X_train=X_train,
                              X_validate=X_validate,
                              y_train=y_train,
                              y_validate=y_validate,
                              tuning=tuning,
                              log=log)

    elif mode.lower().strip() in ["lgb", "lgbt"]:
        classifier_name = "lgb"

        if log:
            section_timer = Timer(
                log=f"predicting using {classifier_name} classifier")
        features = X_train
        test_features = X_test
        best_preds, best_score = np.zeros(X_test.shape[0]), 0
        if tuning:
            # parameters are:
            # learning_rate, max_bin, num_leaves, min_data_in_leaf, max_depth, lambdal1, lambdal2
            parameters_combinations = list(
                itertools.product([0.05, 0.1], [100, 250, 500], [8, 512, 2048],
                                  [100, 500, 1000, 2500], [-1, 5, 10],
                                  [0, 0.25, 0.5], [0, 0.25, 0.5]))
            best_combination = parameters_combinations[0]

            for i, combination in enumerate(parameters_combinations):
                if log:
                    print(
                        f"\n\t...trying combination {i + 1} of {len(parameters_combinations)}, with a current best score of {best_score} and combination {best_combination}...\n"
                    )
                try:
                    learning_rate, max_bin, num_leaves, min_data_in_leaf, max_depth, lambdal1, lambdal2 = combination
                    model = lgb.LGBMClassifier(
                        n_estimators=500,
                        objective='binary',
                        n_jobs=-1,
                        verbose=-1,
                        class_weight='balanced',
                        device="cpu",
                        learning_rate=learning_rate,
                        reg_alpha=0.1,
                        reg_lambda=0.1,
                        min_data_in_leaf=min_data_in_leaf,
                        bagging_fraction=0.25,
                        bagging_freq=5,
                        max_bin=max_bin,
                        num_leaves=num_leaves,
                        max_depth=max_depth,
                        lambdal1=lambdal1,
                        lambdal2=lambdal2)

                    test_predictions = np.zeros(X_test.shape[0])

                    for train_indices, valid_indices in KFold(
                            n_splits=k_fold_splits, shuffle=True,
                            random_state=42).split(features):
                        train_features, train_labels = features[
                            train_indices], y_train[train_indices]
                        valid_features, valid_labels = features[
                            valid_indices], y_train[valid_indices]

                        # training
                        model = model.fit(train_features,
                                          train_labels,
                                          eval_metric='auc',
                                          eval_set=[
                                              (valid_features, valid_labels),
                                              (train_features, train_labels)
                                          ],
                                          eval_names=['test', 'train'],
                                          categorical_feature='auto',
                                          early_stopping_rounds=500,
                                          verbose=-1)
                        best_iteration = model.best_iteration_
                        print(model.best_score_)
                        train_score, test_score = model.best_score_["train"][
                            "auc"] - model.best_score_["train"][
                                "binary_logloss"], model.best_score_["test"][
                                    "auc"] - -model.best_score_["test"][
                                        "binary_logloss"]

                        # if we are over/underfitting, current parameters are bad
                        if train_score - test_score > 0.05 or train_score - test_score < -0.05:
                            break

                        # prediction
                        test_predictions += model.predict_proba(
                            test_features,
                            num_iteration=best_iteration)[:, 1] / k_fold_splits

                        # updates parameters
                        if test_score > best_score:
                            best_combination, best_preds, best_score = combination, test_predictions, test_score
                except:
                    continue
            if log:
                section_timer.end_timer(
                    log=
                    f"found best combination {best_combination} and best score {best_score}"
                )
        else:
            learning_rate, max_bin, num_leaves, min_data_in_leaf, max_depth, lambdal1, lambdal2 = (
                0.05, 100, 8, 100, -1, 0, 0)
            '''
            model = lgb.LGBMClassifier(n_estimators=500, objective='binary', n_jobs=-1, verbose=-1,
                                            class_weight='balanced', device="cpu",
                                            learning_rate=learning_rate,
                                            reg_alpha=0.1, reg_lambda=0.1, min_data_in_leaf=min_data_in_leaf,
                                            bagging_fraction=0.25, bagging_freq=5, 
                                            max_bin=max_bin, num_leaves=num_leaves, max_depth=max_depth,
                                            lambdal1=lambdal1, lambdal2=lambdal2)
            '''
            model = lgb.LGBMClassifier(n_estimators=1000,
                                       objective='binary',
                                       n_jobs=-1,
                                       class_weight='balanced',
                                       learning_rate=0.05,
                                       reg_alpha=0.3,
                                       reg_lambda=0.2,
                                       max_bin=50)

            test_predictions = np.zeros(X_test.shape[0])

            i = 0

            for train_indices, valid_indices in KFold(
                    n_splits=k_fold_splits, shuffle=True).split(features):
                i += 1
                print("\n----------------> ", i)

                train_features, train_labels = features[
                    train_indices], y_train[train_indices]
                valid_features, valid_labels = features[
                    valid_indices], y_train[valid_indices]

                # training
                model = model.fit(train_features,
                                  train_labels,
                                  eval_metric='auc',
                                  eval_set=[(valid_features, valid_labels),
                                            (train_features, train_labels)],
                                  eval_names=['test', 'train'],
                                  categorical_feature='auto',
                                  early_stopping_rounds=500,
                                  verbose=-1)
                best_iteration = model.best_iteration_
                best_score = max(best_score, model.best_score_["test"]["auc"])

                # prediction
                test_predictions += model.predict_proba(
                    test_features, num_iteration=best_iteration)[:, 1]

            if log: section_timer.end_timer(log=f"best score: {best_score}")

        return test_predictions / k_fold_splits, None

    elif mode.lower().strip() in ["lda", "linear discriminant"]:
        classifier_name = "Linear Discriminant Analysis"
        if log:
            section_timer = Timer(
                log=f"predicting using {classifier_name} classifier")
        classifier = lda(X_train=X_train,
                         X_validate=X_validate,
                         y_train=y_train,
                         y_validate=y_validate,
                         tuning=tuning,
                         log=log)

    elif mode.lower().strip() in ["gb", "gradient boosting"]:
        classifier_name = "Gradient Boosting"
        if log:
            section_timer = Timer(
                log=f"predicting using {classifier_name} classifier")
        classifier = GradientBoostingClassifier()

    else:
        raise Exception(
            f'Unrecognized mode f{mode.strip()}.\nOnly supported modes are "ensemble", "bayes", "logistic", "rf", "mlp", "knn", "lda"'
        )

    # prediction
    if probabilities and classifier_name != "SVM":
        y_pred = classifier.fit(X_train, y_train).predict_proba(X_test)[:, 1]
    else:
        y_pred = classifier.fit(X_train, y_train).predict(X_test)
    if classifier_name not in ["SVM"]:
        proba = classifier.predict_proba(X_test)
    else:
        proba = None

    if log: section_timer.end_timer(log=f"done")
    return y_pred, proba
Esempio n. 22
0
    # Parse the test CSV and keep only its SK_ID_CURR column; it becomes the
    # id column of the submission frame built below.
    test_ids = parsing.parse_CSV_to_df(file_path=file_path_test,
                                       log=False)["SK_ID_CURR"]
    # Features are every column of df_train except TARGET, converted with
    # to_numpy(); the TARGET labels are taken as-is (no to_numpy() call).
    X_train, y_train = df_train.drop(
        columns=["TARGET"]).to_numpy(), df_train["TARGET"]
    X_test = df_test.to_numpy()
    # NOTE(review): `features` is not read again in the visible lines.
    features = list(df_test.columns)

    # The training data is passed a second time as the validation set
    # (X_validate / y_validate), so no separate hold-out split is supplied.
    # NOTE(review): `proba` is not used in the visible lines.
    y_test_pred, proba = classification.predict(
        X_train=X_train,
        X_test=X_test,
        X_validate=X_train,
        y_train=y_train,
        y_validate=y_train,
        mode=classifier,
        tuning=hyperparameters_tuning,
        probabilities=predict_probabilities,
        k_fold_splits=k_fold_splits,
        log=log)

    # Two-column submission frame: test ids next to the predictions.
    df_submission = pd.DataFrame(columns=["SK_ID_CURR", "TARGET"])
    df_submission["SK_ID_CURR"], df_submission[
        "TARGET"] = test_ids, y_test_pred
    # The same frame is written twice: once to the configured
    # submission_path and once to the fixed path "../submission.csv".
    parsing.write_df_to_file(df=df_submission,
                             file_path=submission_path,
                             log=log)
    parsing.write_df_to_file(df=df_submission,
                             file_path="../submission.csv",
                             log=log)

    # total_timer is created outside the visible lines.
    # NOTE(review): presumably only defined when `log` is truthy — confirm.
    if log: total_timer.end_timer(log=f"everything done")