def align_left(df1, df2, log=False):
    """
    Restricts two dataframes to the columns they have in common.

    :param df1: First dataframe
    :param df2: Second dataframe
    :param log: Flag for log on the console
    :return: Both dataframes after an inner alignment on the column axis
    """
    timer = None
    if log:
        timer = Timer(log=f"removing columns of the second dataframe that are not in the first")

    # join="inner" on axis=1 keeps only the column labels found in both frames
    aligned_first, aligned_second = df1.align(df2, join="inner", axis=1)

    if log:
        timer.end_timer(
            log=f"done, with final shapes of {aligned_first.shape} and {aligned_second.shape}")
    return aligned_first, aligned_second
def parse_CSV_to_df(file_path, lines_cap=None, reduce_size=False, log=False):
    """
    Loads a .csv file into a pandas dataframe.

    :param file_path: Path of the .csv file to read
    :param lines_cap: Maximum number of rows to read
                      If None, reads the whole file
    :param reduce_size: If True, the parsed dataframe goes through
                        infer_objects() and reduce_dataframe_size()
    :param log: Flag for log on the console
    :return df: The parsed dataframe
    """
    timer = None
    if log:
        # file size expressed in units of 2 ** 20 bytes
        size_in_mb = os.path.getsize(file_path) * 2 ** (-20)
        timer = Timer(log=f"parsing file {file_path} ({size_in_mb:.1f}Mb)")

    df = pd.read_csv(file_path,
                     header=0,
                     index_col=False,
                     nrows=lines_cap,
                     encoding="utf_8")
    if reduce_size:
        df = reduce_dataframe_size(df.infer_objects())

    if log:
        timer.end_timer(
            log=f"parsed a dataframe of {df.shape[0]} rows and {df.shape[1]} features")
    return df
def correct_nan_values(df, log=False):
    """
    Normalizes placeholder strings and binarizes two-valued numerical columns.

    The strings "XAP" and "XNA" are turned into NaN over the whole dataframe.
    Afterwards, every non-object column whose name does not contain "sk_id"
    and that holds exactly two distinct values gets every value other than 0
    replaced by 1.

    :param df: Input dataframe; it is not modified, a new one is returned
    :param log: Flag for log on the console
    :return: The corrected dataframe
    """
    if log:
        section_timer = Timer(log=f"searching for anomalies")

    # converts some particular strings into nans
    df = df.replace(to_replace=["XAP", "XNA"], value=np.nan)

    for col in df.columns:
        # only numerical columns that are not identifiers are processed
        if df[col].dtype == "object" or "sk_id" in col.lower().strip():
            continue

        unique_values = set(df[col].unique())
        # NOTE(review): unique() also reports NaN, so a column made of a
        # single value plus NaNs counts as two-valued and has its NaNs
        # replaced by 1 - confirm that this is intended.
        if len(unique_values) == 2:
            # Series.replace documents list-likes (not sets) as to_replace
            df[col] = df[col].replace(list(unique_values - {0}), 1)

    if log:
        section_timer.end_timer(log=f"done")
    return df
def random_forest(X_train=None, X_validate=None, y_train=None, y_validate=None, tuning=False, log=False):
    """
    Builds an unfitted Random Forest classifier, optionally after a grid search.

    When tuning is enabled, every combination of n_estimators, max_samples
    and max_features is fitted on the training set and scored with
    roc_auc_score on the validation predictions; the returned classifier is
    a fresh, unfitted one configured with the best combination.

    :param X_train: Training features (only used when tuning)
    :param X_validate: Validation features (only used when tuning)
    :param y_train: Training labels (only used when tuning)
    :param y_validate: Validation labels (only used when tuning)
    :param tuning: If True, runs the grid search; otherwise returns the default classifier
    :param log: Flag for log on the console
    :return: An unfitted RandomForestClassifier
    """
    # default classifier
    if not tuning:
        return RandomForestClassifier(n_estimators=300,
                                      criterion="gini",
                                      bootstrap=True,
                                      max_samples=0.2,
                                      n_jobs=4)

    if log:
        section_timer = Timer(log=f"tuning Random Forest classifier")

    # starting from -inf guarantees that the first combination is recorded,
    # so the "best" parameters can never stay None
    bestScore = float("-inf")
    n_estimators_best, max_samples_best, max_features_best = None, None, None

    # NOTE(review): the AUC is computed on hard predict() labels rather than
    # on predict_proba scores - kept as in the original selection criterion.
    for n_estimators in [1, 100, 500, 1000]:
        for max_samples in [0.1, 0.25]:
            for max_features in ["sqrt", "log2"]:
                candidate = RandomForestClassifier(
                    n_estimators=n_estimators,
                    criterion="gini",
                    max_features=max_features,
                    class_weight=None,
                    bootstrap=True,
                    warm_start=True,
                    max_samples=max_samples,
                    n_jobs=4).fit(X_train, y_train)
                score = roc_auc_score(y_validate, candidate.predict(X_validate))
                if score > bestScore:
                    bestScore = score
                    n_estimators_best = n_estimators
                    max_samples_best = max_samples
                    max_features_best = max_features

    # choosing best parameters (max_samples now comes from the best
    # combination instead of the last value of the loop)
    classifier = RandomForestClassifier(n_estimators=n_estimators_best,
                                        criterion="gini",
                                        max_features=max_features_best,
                                        class_weight=None,
                                        bootstrap=True,
                                        warm_start=True,
                                        max_samples=max_samples_best,
                                        n_jobs=4)
    if log:
        section_timer.end_timer(log=f"done with a max score of {bestScore}")
    return classifier
def feature_selection(df, y_train, corr_threshold=0.8, log=False):
    """
    Ranks the columns by ExtraTrees feature importance and keeps the top share.

    :param df: Dataframe holding the candidate features
    :param y_train: Labels used to fit the ExtraTreesClassifier
    :param corr_threshold: Fraction of the columns to keep; the number of
                           kept columns is int(len(df.columns) * corr_threshold)
    :param log: Flag for log on the console
    :return: List with the names of the selected columns, ordered by
             decreasing importance
    """
    if log:
        section_timer = Timer(log=f"finding the features")

    features_to_keep = int(len(df.columns) * corr_threshold)
    importances = ExtraTreesClassifier(n_estimators=100).fit(
        df.to_numpy(), y_train).feature_importances_

    # positions of the columns, sorted once by decreasing importance
    ranking = pd.Series(importances).sort_values(ascending=False).index.tolist()
    columns = [df.columns[position] for position in ranking[:features_to_keep]]

    if log:
        section_timer.end_timer(
            log=f"selected {len(columns)} (out of {len(df.columns)}) features")
    return columns
def remove_useless_columns(df, log=False):
    """
    Drops the columns whose name matches a fixed pattern.

    A column is dropped when its name, matched from the start, begins with
    AMT_REQ_CREDIT_BUREAU_ followed by HOUR, WEEK, DAY or QRT, contains
    "_AVG" or "_MODE", or begins with WEEKDAY_APPR_PROCESS_START.

    :param df: Input dataframe; it is not modified
    :param log: Flag for log on the console
    :return: The dataframe without the matching columns (the input object
             itself when nothing matches)
    """
    if log:
        section_timer = Timer(log=f"searching for totally useless columns")

    original_columns = set(df.columns)

    # compiled once instead of once per column
    useless_pattern = re.compile(
        r"(AMT_REQ_CREDIT_BUREAU_(HOUR|WEEK|DAY|QRT))|(.*_(AVG|MODE))|(WEEKDAY_APPR_PROCESS_START)")
    useless_columns = [
        col for col in df.columns if useless_pattern.match(col) is not None
    ]

    # a single drop avoids copying the dataframe once per removed column
    if useless_columns:
        df = df.drop(columns=useless_columns)

    if log:
        section_timer.end_timer(
            log=f"removed {len(original_columns - set(df.columns))} columns for a final shape of {df.shape}")
    return df
def undersample(df, complex=False, log=False):
    """
    Balances the two TARGET classes by random undersampling.

    Both classes are sampled down to the size of the less frequent one, as
    reported by evaluation.count_values.

    :param df: Dataframe with a "TARGET" column; a "_merge" column, when
               present, is dropped first
    :param complex: Not used by this function
    :param log: Flag for log on the console
    :return: Dataframe with the sampled rows of class 0 followed by the
             sampled rows of class 1
    """
    timer = None
    if log:
        timer = Timer(log=f"undersampling")

    if "_merge" in df.columns:
        df = df.drop("_merge", axis=1)

    count = evaluation.count_values(df, "TARGET")
    minority_label = 0 if count[0] < count[1] else 1
    sample_size = count[minority_label]

    # class 0 is sampled before class 1, as in the concatenation order
    negatives = df[df["TARGET"] == 0].sample(sample_size)
    positives = df[df["TARGET"] == 1].sample(sample_size)
    df_new = pd.concat([negatives, positives])

    if log:
        timer.end_timer(log=f"done with a final shape of {df_new.shape}")
    return df_new
def write_df_to_file(df, file_path, index=False, reduce_size=True, header=True, log=False):
    """
    Saves a dataframe as a comma-separated file.

    :param df: Pandas' dataframe being written to a file
    :param file_path: Path of the file being created from the dataframe
    :param index: Forwarded to DataFrame.to_csv (write the row labels)
    :param reduce_size: If True, the dataframe goes through
                        reduce_dataframe_size() before being written
    :param header: Forwarded to DataFrame.to_csv (write the column names)
    :param log: Flag for log on the console
    """
    timer = None
    if log:
        timer = Timer(log=f"writing file {file_path}")

    if reduce_size:
        df = reduce_dataframe_size(df)
    df.to_csv(path_or_buf=file_path, sep=",", header=header, index=index)

    if log:
        timer.end_timer(
            log=f"written a dataframe of {df.shape[0]} rows and {df.shape[1]} features")
def smote(df, log=False):
    """
    Oversamples the dataframe with SMOTE.

    The two distinct TARGET values are mapped to 0 (smaller value) and
    1 (larger value) before resampling. Only the TARGET column is remapped:
    feature values that happen to equal one of the labels are left alone.

    :param df: Dataframe with a "TARGET" column holding exactly two
               distinct values; a "_merge" column, when present, is dropped
    :param log: Flag for log on the console
    :return: The resampled features with the resampled "TARGET" column
    :raises ValueError: If TARGET does not hold exactly two distinct values
                        (raised by the unpacking of the sorted labels)
    """
    if log:
        section_timer = Timer(log=f"oversampling using SMOTE")

    if "_merge" in df.columns:
        df = df.drop("_merge", axis=1)

    false_number, true_number = sorted(pd.unique(df["TARGET"]).tolist())
    # remap the labels on the target column only, never on the features
    target = df["TARGET"].map({false_number: 0, true_number: 1})

    features, target = SMOTE(n_jobs=4).fit_resample(
        df.drop(columns="TARGET"), target)
    df = features
    df["TARGET"] = target

    if log:
        section_timer.end_timer(log=f"for a total shape of {df.shape}")
    return df
def logistic_regression(X_train=None, X_validate=None, y_train=None, y_validate=None, tuning=False, log=False):
    """
    Builds an unfitted Logistic Regression classifier, optionally tuning the solver.

    When tuning is enabled, each solver is fitted on the training set and
    scored with roc_auc_score on the validation predictions; the returned
    classifier is a fresh, unfitted one using the best solver.

    :param X_train: Training features (only used when tuning)
    :param X_validate: Validation features (only used when tuning)
    :param y_train: Training labels (only used when tuning)
    :param y_validate: Validation labels (only used when tuning)
    :param tuning: If True, searches for the best solver; otherwise returns the default classifier
    :param log: Flag for log on the console
    :return: An unfitted LogisticRegression
    """
    # default classifier
    if not tuning:
        return LogisticRegression(dual=False, max_iter=1000, n_jobs=4, C=1)

    if log:
        section_timer = Timer(log=f"tuning Logistic Regression classifier")

    # starting from -inf guarantees that a solver is always selected
    bestScore, solver_best = float("-inf"), None
    for solver in ["liblinear", "lbfgs", "newton-cg", "saga"]:
        candidate = LogisticRegression(solver=solver,
                                       dual=False,
                                       warm_start=True,
                                       max_iter=500,
                                       n_jobs=4,
                                       C=1).fit(X_train, y_train)
        score = roc_auc_score(y_validate, candidate.predict(X_validate))
        if score > bestScore:
            bestScore, solver_best = score, solver

    # choosing best parameters
    classifier = LogisticRegression(solver=solver_best,
                                    dual=False,
                                    warm_start=True,
                                    max_iter=1000,
                                    n_jobs=4,
                                    C=1)
    if log:
        section_timer.end_timer(log=f"done with a max score of {bestScore}")
    return classifier
def remove_rows(df, threshold, log=False):
    """
    Removes rows with more than a percentage of NaN values

    A row is removed exactly when its fraction of NaN values is strictly
    greater than the threshold; a row sitting on the threshold is kept.

    :param df: Input dataframe; it is not modified
    :param threshold: Maximum fraction (between 0 and 1) of NaN values for rows
    :param log: Flag for log on the console
    :return: A dataframe holding only the rows within the threshold
    """
    if log:
        section_timer = Timer(log=f"removing rows with more than {threshold * 100}% of NaNs")

    # comparing the fraction directly avoids the truncation of
    # int(n_columns * (1 - threshold)), which let rows above the threshold
    # through whenever that product was not a whole number
    nan_fraction = df.isna().mean(axis=1)
    keep_mask = ~(nan_fraction > threshold)
    df_clean = df.loc[keep_mask.to_numpy()]

    if log:
        section_timer.end_timer(log=f"removed {df.shape[0] - df_clean.shape[0]} rows")
    return df_clean
def show_unique_values(df, file_path=None, log=False):
    """
    Prints the distinct values of every object-typed column.

    The name of each object column is printed as it is visited, then the
    dictionary {column: set of values} is pretty-printed to the console and,
    when a path is given, to that file as well.

    :param df: Input dataframe
    :param file_path: Optional path of a text file receiving the same
                      pretty-printed dictionary
    :param log: Flag for log on the console
    """
    if log:
        section_timer = Timer(log=f"searching for unique values")

    unique_values = {}
    for col in df.columns:
        # if it's a discrete column
        if df[col].dtype == "object":
            print(col)
            unique_values[col] = set(df[col].tolist())

    pprint(unique_values)
    if file_path is not None:
        # explicit encoding so non-ASCII values do not depend on the locale
        with open(file_path, "w", encoding="utf-8") as fp:
            pprint(unique_values, stream=fp)

    if log:
        section_timer.end_timer(log=f"done")
def impute_missing_values(df, mode="simple", columns=None, reduce_size=False, log=False):
    """
    Imputes the missing values of the selected columns, in place.

    Supported modes (case and surrounding blanks are ignored): "simple 0",
    "simple median", "simple mean", "simple most common" and "iterative".
    NOTE: the default value "simple" is not one of them, so callers have to
    pass a mode explicitly whenever there are columns to impute.

    :param df: Input dataframe; the imputed columns are written back into it
    :param mode: Imputation strategy, see the list above
    :param columns: Columns to impute; None selects every column, an empty
                    list returns the dataframe untouched
    :param reduce_size: If True, the result goes through
                        parsing.reduce_dataframe_size()
    :param log: Flag for log on the console
    :return: The dataframe with the imputed columns
    :raises ValueError: If the mode is not recognized
    """
    if columns is None:
        columns_to_impute = list(df.columns)
    elif len(columns) == 0:
        return df
    else:
        columns_to_impute = columns

    # factories keep the imputer construction lazy: only the requested one
    # is instantiated
    imputer_factories = {
        "simple 0": lambda: SimpleImputer(strategy="constant", fill_value=0),
        "simple median": lambda: SimpleImputer(strategy="median", copy=False),
        "simple mean": lambda: SimpleImputer(strategy="mean", copy=False),
        "simple most common": lambda: SimpleImputer(strategy="most_frequent", copy=False),
        "iterative": lambda: IterativeImputer(max_iter=3, n_nearest_features=5),
    }
    mode_key = mode.lower().strip()
    # the mode is validated before any work (or timer) is started
    if mode_key not in imputer_factories:
        raise ValueError(
            f'Unrecognized mode {mode.strip()}.\nOnly supported modes are "simple 0", "simple mean", "simple median", "simple most common", "iterative"')

    if log:
        section_timer = Timer(log=f"imputing missing values")

    imputer = imputer_factories[mode_key]()
    X = df[columns_to_impute].to_numpy()
    df[columns_to_impute] = imputer.fit_transform(X)

    if reduce_size:
        df = parsing.reduce_dataframe_size(df, log=False)

    if log:
        section_timer.end_timer(log=f"done")
    return df
def multilayer_perceptron(X_train=None, X_validate=None, y_train=None, y_validate=None, tuning=False, log=False):
    """
    Builds an unfitted Multilayer Perceptron classifier, optionally tuned.

    When tuning is enabled, every (activation, learning rate) pair is fitted
    on the training set and scored with f1_score on the validation
    predictions; the returned classifier is a fresh, unfitted one using the
    best pair.

    :param X_train: Training features (only used when tuning)
    :param X_validate: Validation features (only used when tuning)
    :param y_train: Training labels (only used when tuning)
    :param y_validate: Validation labels (only used when tuning)
    :param tuning: If True, runs the search; otherwise returns the default classifier
    :param log: Flag for log on the console
    :return: An unfitted MLPClassifier
    """
    # default classifier
    if not tuning:
        return MLPClassifier(solver="adam",
                             activation="relu",
                             learning_rate="constant",
                             max_iter=200)

    if log:
        section_timer = Timer(log=f"tuning Multilayer Perceptron classifier")

    # an F1 of 0 is possible for every candidate (no positive predicted);
    # starting from -inf still selects a valid configuration in that case
    bestScore, activation_best, learning_rate_best = float("-inf"), None, None
    for activation in ["logistic", "relu"]:
        for learningRate in ["constant", "adaptive"]:
            candidate = MLPClassifier(activation=activation,
                                      learning_rate=learningRate,
                                      solver="adam",
                                      max_iter=200).fit(X_train, y_train)
            score = f1_score(y_validate, candidate.predict(X_validate))
            if score > bestScore:
                bestScore, activation_best, learning_rate_best = score, activation, learningRate

    # choosing best parameters
    classifier = MLPClassifier(activation=activation_best,
                               learning_rate=learning_rate_best,
                               solver="adam",
                               max_iter=200)
    if log:
        section_timer.end_timer(log=f"done with a max score of {bestScore}")
    return classifier
def remove_columns(df, threshold, log=False):
    """
    Removes columns with more than a percentage of NaN values

    A column is removed exactly when its fraction of NaN values is strictly
    greater than the threshold; a column sitting on the threshold is kept.

    :param df: Input dataframe; it is not modified
    :param threshold: Maximum fraction (between 0 and 1) of NaN values for columns
    :param log: Flag for log on the console
    :return: Tuple (dataframe without the removed columns, list with the
             names of the removed columns)
    """
    if log:
        section_timer = Timer(log=f"removing columns with more than {threshold * 100}% of nans")

    # comparing the fraction directly avoids the truncation of
    # int(n_rows * (1 - threshold)), which let columns above the threshold
    # through whenever that product was not a whole number
    nan_fraction = df.isna().mean(axis=0)
    keep_mask = (~(nan_fraction > threshold)).to_numpy()
    df_clean = df.loc[:, keep_mask]
    dropped_cols = [col for col, keep in zip(df.columns, keep_mask) if not keep]

    if log:
        section_timer.end_timer(log=f"removed {len(set(df.columns)) - df_clean.shape[1]} columns")
    return df_clean, dropped_cols
def frequency_encoding(df, just_one_hot=False, log=False):
    """
    one-hot encodes the dataframe and, unless disabled, weighs every dummy
    column by the frequency of its category.

    :param df: dataframe to be encoded
    :param just_one_hot: true to stop right after the one-hot encoding
    :param log: true if we want to set the timer on
    :return: encoded dataframe
    """
    timer = None
    if log:
        timer = Timer(log=f"frequency encoding a dataframe with shape {df.shape}")

    # columns created by get_dummies are the ones missing from this set
    columns_before = set(df.columns)
    encoded = pd.get_dummies(df)

    if not just_one_hot:
        dummy_columns = list(set(encoded.columns) - columns_before)
        for dummy in dummy_columns:
            # share of rows belonging to the category of this dummy column
            frequency = encoded[dummy].sum() / encoded.shape[0]
            encoded[dummy] = encoded[dummy] * frequency

    if log:
        timer.end_timer(log=f"done")
    return encoded
def knn(X_train=None, X_validate=None, y_train=None, y_validate=None, tuning=False, log=False):
    """
    Builds an unfitted K-Nearest Neighbors classifier, optionally tuned.

    When tuning is enabled, each candidate number of neighbors is fitted on
    the training set and scored with f1_score on the validation predictions;
    the returned classifier is a fresh, unfitted one using the best value.

    :param X_train: Training features (only used when tuning)
    :param X_validate: Validation features (only used when tuning)
    :param y_train: Training labels (only used when tuning)
    :param y_validate: Validation labels (only used when tuning)
    :param tuning: If True, runs the search; otherwise returns the default classifier
    :param log: Flag for log on the console
    :return: An unfitted KNeighborsClassifier
    """
    # default classifier
    if not tuning:
        return KNeighborsClassifier(n_neighbors=2,
                                    weights="distance",
                                    p=2,
                                    n_jobs=4)

    if log:
        section_timer = Timer(log=f"tuning KNN classifier")

    # an F1 of 0 is possible for every candidate (no positive predicted);
    # starting from -inf still selects a valid number of neighbors
    bestScore, n_neighbors_best = float("-inf"), None
    for neighbors in [2, 8]:
        candidate = KNeighborsClassifier(n_neighbors=neighbors,
                                         weights="distance",
                                         p=2,
                                         n_jobs=4).fit(X_train, y_train)
        score = f1_score(y_validate, candidate.predict(X_validate))
        if score > bestScore:
            bestScore, n_neighbors_best = score, neighbors

    # choosing best parameters
    classifier = KNeighborsClassifier(n_neighbors=n_neighbors_best,
                                      weights="distance",
                                      p=2,
                                      n_jobs=4)
    if log:
        section_timer.end_timer(log=f"done with a max score of {bestScore}")
    return classifier
def lda(X_train=None, X_validate=None, y_train=None, y_validate=None, tuning=False, log=False):
    """
    Builds an unfitted Linear Discriminant Analysis classifier, optionally tuned.

    When tuning is enabled, every solver (and, for the solvers that accept
    it, every shrinkage option) is fitted on the training set and scored
    with f1_score on the validation predictions; the returned classifier is
    a fresh, unfitted one using the best configuration.

    :param X_train: Training features (only used when tuning)
    :param X_validate: Validation features (only used when tuning)
    :param y_train: Training labels (only used when tuning)
    :param y_validate: Validation labels (only used when tuning)
    :param tuning: If True, runs the search; otherwise returns the default classifier
    :param log: Flag for log on the console
    :return: An unfitted LinearDiscriminantAnalysis
    """
    # default classifier
    if not tuning:
        return LinearDiscriminantAnalysis()

    if log:
        section_timer = Timer(log=f"tuning Linear Discriminant Analysis classifier")

    bestScore, solver_best, shrinkage_best = float("-inf"), "svd", None
    for solver in ["svd", "eigen", "lsqr"]:
        # "svd" is now scored as well (it used to be skipped altogether);
        # it does not support shrinkage, so only None is tried for it
        shrinkage_options = [None] if solver == "svd" else [None, "auto"]
        for shrinkage in shrinkage_options:
            candidate = LinearDiscriminantAnalysis(
                solver=solver, shrinkage=shrinkage).fit(X_train, y_train)
            score = f1_score(y_validate, candidate.predict(X_validate))
            if score > bestScore:
                bestScore, solver_best, shrinkage_best = score, solver, shrinkage

    # choosing best parameters
    classifier = LinearDiscriminantAnalysis(solver=solver_best,
                                            shrinkage=shrinkage_best)
    if log:
        section_timer.end_timer(log=f"done with a max score of {bestScore}")
    return classifier
def merge_dfs(dfs, data_path, groupby_mode="mean", just_one_hot=False, do_imputing=False, log=False):
    """
    Left-joins the two auxiliary dataframes built by __joining_minor_csvs
    onto every dataframe of the list, using "SK_ID_CURR" as key.

    :param dfs: list of dataframes with a "SK_ID_CURR" column; its items are
                replaced in place by the merged dataframes
    :param data_path: location of the data, forwarded to __joining_minor_csvs
    :param groupby_mode: forwarded to __joining_minor_csvs
    :param just_one_hot: forwarded to __joining_minor_csvs
    :param do_imputing: forwarded to __joining_minor_csvs
    :param log: flag for the time
    :return: list with the merged dataframes, each one passed through
             reduce_dataframe_size
    """
    if log:
        section_timer = Timer(log=f"merging the dataframes")

    # groupby_mode is forwarded instead of being hard-coded to "mean"
    bureau, prev_application = __joining_minor_csvs(data_path,
                                                    just_one_hot=just_one_hot,
                                                    do_imputing=do_imputing,
                                                    groupby_mode=groupby_mode)

    for i, df in enumerate(dfs):
        for minor_df in (bureau, prev_application):
            # "on" cannot be combined with left_index=True: pandas rejects
            # that combination, so the join is done on the key column only
            df = pd.merge(left=df, right=minor_df, how='left', on="SK_ID_CURR")
        dfs[i] = df

    if log:
        section_timer.end_timer(log=f"join completed")
    return [reduce_dataframe_size(df) for df in dfs]
def pca_transform(dfs_to_transform, corr_threshold=0.7, PCA_n_components=None, old_cols=None, log=False):
    """
    Scales the dataframes together, adds polynomial features, projects them
    with PCA and keeps the components picked by feature_selection.

    The dataframes are stacked, transformed as a single block and split
    back; a "TARGET" column, when present, is set aside beforehand and
    re-inserted (at position 1) at the end.

    :param dfs_to_transform: list of at least two dataframes; the first one
                             must carry the labels used by feature_selection.
                             Items holding a "TARGET" column are replaced in
                             the caller's list by their copy without it
    :param corr_threshold: forwarded to feature_selection when choosing the
                           columns receiving polynomial features
    :param PCA_n_components: n_components of the PCA
    :param old_cols: column names that may receive polynomial features; it
                     must be an iterable, since it is passed to set()
    :param log: flag for log on the console
    :return: list with the transformed dataframes, in the input order
    """
    if log:
        section_timer = Timer(log=f"computing PCA")

    # one slot per dataframe (it used to be hard-coded to three entries)
    ys_to_transform = [None] * len(dfs_to_transform)
    for i, df_to_transform in enumerate(dfs_to_transform):
        if "TARGET" in df_to_transform.columns:
            ys_to_transform[i] = df_to_transform.loc[:, "TARGET"].to_numpy().tolist()
            dfs_to_transform[i] = df_to_transform.drop(columns=["TARGET"])

    # stacks every dataframe, remembering the row where each one starts
    concatenated_dfs, indexes = pd.DataFrame(
        columns=dfs_to_transform[1].columns), []
    for df in dfs_to_transform:
        indexes.append(concatenated_dfs.shape[0])
        concatenated_dfs = pd.concat([concatenated_dfs, df],
                                     sort=False).reset_index(drop=True)

    if log:
        print(f"\t...scaling dataframes...")
    # scaling
    concatenated_dfs = scale(concatenated_dfs)

    if log:
        print(f"\t...adding polynomial features...")
    # polynomial features
    cols_to_poly = set(
        feature_selection(dfs_to_transform[0],
                          y_train=ys_to_transform[0],
                          corr_threshold=corr_threshold)) & set(old_cols)
    concatenated_dfs = add_poly_features(concatenated_dfs,
                                         features_list=cols_to_poly)

    if log:
        print(f"\t...finding principal components...")
    # PCA
    pca = PCA(n_components=PCA_n_components, whiten=True, svd_solver="auto")
    concatenated_dfs = pd.DataFrame(data=pca.fit_transform(concatenated_dfs))

    # reproducing dfs: each one spans from its start row to the next start
    boundaries = indexes + [None]
    dfs_to_transform = [
        concatenated_dfs.iloc[start:end, :]
        for start, end in zip(boundaries[:-1], boundaries[1:])
    ]

    # NOTE(review): this second selection uses a hard-coded 0.7 rather than
    # corr_threshold - confirm whether the parameter should apply here too.
    selected_components = list(
        feature_selection(dfs_to_transform[0],
                          ys_to_transform[0],
                          corr_threshold=0.7,
                          log=False))
    # copies, so that inserting TARGET never writes on a slice of the
    # stacked dataframe
    dfs_to_transform = [df[selected_components].copy() for df in dfs_to_transform]

    for df_to_transform, y_to_transform in zip(dfs_to_transform, ys_to_transform):
        if y_to_transform is not None:
            df_to_transform.insert(1, "TARGET", y_to_transform)

    if log:
        section_timer.end_timer(
            log=f"found {len(selected_components)} components")
    return dfs_to_transform
def _lgb_tuned_predictions(features, test_features, y_train, k_fold_splits, log):
    """Grid-searches LightGBM parameters; returns (best predictions, best combination, best score)."""
    # parameters are:
    # learning_rate, max_bin, num_leaves, min_data_in_leaf, max_depth, lambdal1, lambdal2
    parameters_combinations = list(
        itertools.product([0.05, 0.1], [100, 250, 500], [8, 512, 2048],
                          [100, 500, 1000, 2500], [-1, 5, 10],
                          [0, 0.25, 0.5], [0, 0.25, 0.5]))
    best_combination = parameters_combinations[0]
    best_preds, best_score = np.zeros(test_features.shape[0]), 0

    for i, combination in enumerate(parameters_combinations):
        if log:
            print(
                f"\n\t...trying combination {i + 1} of {len(parameters_combinations)}, with a current best score of {best_score} and combination {best_combination}...\n"
            )
        try:
            learning_rate, max_bin, num_leaves, min_data_in_leaf, max_depth, lambdal1, lambdal2 = combination
            # NOTE(review): "lambdal1"/"lambdal2" do not look like LightGBM
            # parameter names (lambda_l1/lambda_l2, aliases of the fixed
            # reg_alpha/reg_lambda) - confirm that they have any effect.
            model = lgb.LGBMClassifier(n_estimators=500,
                                       objective='binary',
                                       n_jobs=-1,
                                       verbose=-1,
                                       class_weight='balanced',
                                       device="cpu",
                                       learning_rate=learning_rate,
                                       reg_alpha=0.1,
                                       reg_lambda=0.1,
                                       min_data_in_leaf=min_data_in_leaf,
                                       bagging_fraction=0.25,
                                       bagging_freq=5,
                                       max_bin=max_bin,
                                       num_leaves=num_leaves,
                                       max_depth=max_depth,
                                       lambdal1=lambdal1,
                                       lambdal2=lambdal2)
            test_predictions = np.zeros(test_features.shape[0])

            for train_indices, valid_indices in KFold(
                    n_splits=k_fold_splits, shuffle=True,
                    random_state=42).split(features):
                train_features, train_labels = features[train_indices], y_train[train_indices]
                valid_features, valid_labels = features[valid_indices], y_train[valid_indices]

                # training
                model = model.fit(train_features,
                                  train_labels,
                                  eval_metric='auc',
                                  eval_set=[(valid_features, valid_labels),
                                            (train_features, train_labels)],
                                  eval_names=['test', 'train'],
                                  categorical_feature='auto',
                                  early_stopping_rounds=500,
                                  verbose=-1)
                best_iteration = model.best_iteration_
                print(model.best_score_)

                # both scores are "auc minus logloss"; the test one used to
                # be computed with a doubled minus sign
                train_score = model.best_score_["train"]["auc"] - model.best_score_["train"]["binary_logloss"]
                test_score = model.best_score_["test"]["auc"] - model.best_score_["test"]["binary_logloss"]

                # if we are over/underfitting, current parameters are bad
                if abs(train_score - test_score) > 0.05:
                    break

                # prediction (already averaged over the folds)
                test_predictions += model.predict_proba(
                    test_features,
                    num_iteration=best_iteration)[:, 1] / k_fold_splits
            else:
                # updates parameters, only for combinations that passed the
                # check on every fold; the score is the one of the last fold
                if test_score > best_score:
                    best_combination, best_preds, best_score = combination, test_predictions, test_score
        except Exception as error:
            # best effort: a combination that cannot be trained is skipped
            if log:
                print(f"\t...combination {combination} skipped: {error}...")
            continue

    return best_preds, best_combination, best_score


def _lgb_default_predictions(features, test_features, y_train, k_fold_splits):
    """Trains the default LightGBM model on every fold; returns (averaged predictions, best test auc)."""
    model = lgb.LGBMClassifier(n_estimators=1000,
                               objective='binary',
                               n_jobs=-1,
                               class_weight='balanced',
                               learning_rate=0.05,
                               reg_alpha=0.3,
                               reg_lambda=0.2,
                               max_bin=50)
    test_predictions = np.zeros(test_features.shape[0])
    best_score = 0

    folds = KFold(n_splits=k_fold_splits, shuffle=True).split(features)
    for fold, (train_indices, valid_indices) in enumerate(folds, start=1):
        print("\n----------------> ", fold)
        train_features, train_labels = features[train_indices], y_train[train_indices]
        valid_features, valid_labels = features[valid_indices], y_train[valid_indices]

        # training
        model = model.fit(train_features,
                          train_labels,
                          eval_metric='auc',
                          eval_set=[(valid_features, valid_labels),
                                    (train_features, train_labels)],
                          eval_names=['test', 'train'],
                          categorical_feature='auto',
                          early_stopping_rounds=500,
                          verbose=-1)
        best_score = max(best_score, model.best_score_["test"]["auc"])

        # prediction
        test_predictions += model.predict_proba(
            test_features, num_iteration=model.best_iteration_)[:, 1]

    return test_predictions / k_fold_splits, best_score


def predict(X_train, X_test, y_train, X_validate=None, y_validate=None, mode="ensemble", tuning=False, probabilities=True, k_fold_splits=3, log=False):
    """
    Trains the classifier selected by mode and predicts the test set.

    Accepted modes (case and surrounding blanks are ignored):
    "ensemble"/"voting", "random forest"/"rf"/"forest",
    "bayes"/"naive bayes"/"nb", "logistic"/"logistic regression"/"regression",
    "mlp"/"multilayer perceptron"/"perceptron", "knn"/"nearest neighbors",
    "svm", "adaboost"/"ada boost"/"ada", "lgb"/"lgbt",
    "lda"/"linear discriminant" and "gb"/"gradient boosting".

    :param X_train: Training features
    :param X_test: Features to predict
    :param y_train: Training labels
    :param X_validate: Validation features, forwarded to the classifier builders
    :param y_validate: Validation labels, forwarded to the classifier builders
    :param mode: Classifier to use, see the list above
    :param tuning: If True, hyperparameters are tuned before the final training
    :param probabilities: If True, y_pred holds the probability of the second
                          class instead of the predicted label (never for the SVM)
    :param k_fold_splits: Number of folds used by the LightGBM mode
    :param log: Flag for log on the console
    :return: Tuple (y_pred, proba). In the LightGBM mode y_pred holds the
             fold-averaged probabilities and proba is None; otherwise proba
             is the full predict_proba output (None for the SVM)
    :raises ValueError: If the mode is not recognized
    """
    mode_key = mode.lower().strip()

    def start_timer(name):
        # the timer only exists when logging is enabled
        return Timer(log=f"predicting using {name} classifier") if log else None

    def build(factory):
        # all the classifier builders are called with the same keyword arguments
        return factory(X_train=X_train,
                       X_validate=X_validate,
                       y_train=y_train,
                       y_validate=y_validate,
                       tuning=tuning,
                       log=log)

    # ensemble
    # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html
    if mode_key in ("ensemble", "voting"):
        classifier_name = "Ensemble"
        section_timer = start_timer(classifier_name)
        # Naive Bayes and SVM were commented out of the ensemble in the original code
        classifiers = [
            ("Random forest", build(random_forest)),
            ("Logistic Regression", build(logistic_regression)),
            ("MLP", build(multilayer_perceptron)),
            ("KNN", build(knn)),
            ("AdaBoost", build(adaboost)),
        ]
        classifier = VotingClassifier(estimators=classifiers, voting='soft')

    # random forest classifier
    # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
    elif mode_key in ("random forest", "rf", "forest"):
        classifier_name = "Random Forest"
        section_timer = start_timer(classifier_name)
        classifier = build(random_forest)

    # naive bayes
    # https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html
    elif mode_key in ("bayes", "naive bayes", "nb"):
        classifier_name = "Naive Bayes"
        section_timer = start_timer(classifier_name)
        classifier = build(naive_bayes)

    # logistic regression
    # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
    elif mode_key in ("logistic", "logistic regression", "regression"):
        classifier_name = "Logistic Regression"
        section_timer = start_timer(classifier_name)
        classifier = build(logistic_regression)

    # multilayer perceptron
    # https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html
    elif mode_key in ("mlp", "multilayer perceptron", "perceptron"):
        classifier_name = "Multilayer Perceptron"
        section_timer = start_timer(classifier_name)
        classifier = build(multilayer_perceptron)

    # K nearest neighbors classifier
    # https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
    elif mode_key in ("knn", "nearest neighbors"):
        classifier_name = "K-Nearest Neighbors"
        section_timer = start_timer(classifier_name)
        classifier = build(knn)

    # Support Vector Machine
    # https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC
    elif mode_key in ("svm",):
        classifier_name = "SVM"
        section_timer = start_timer(classifier_name)
        classifier = build(svm)

    # AdaBoost
    # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
    elif mode_key in ("adaboost", "ada boost", "ada"):
        classifier_name = "AdaBoost"
        section_timer = start_timer(classifier_name)
        classifier = build(adaboost)

    # LightGBM: trained and evaluated with K-fold, returns on its own
    elif mode_key in ("lgb", "lgbt"):
        classifier_name = "lgb"
        section_timer = start_timer(classifier_name)
        if tuning:
            # the predictions of the best combination are returned (it used
            # to be the ones of the last combination that was tried)
            y_pred, best_combination, best_score = _lgb_tuned_predictions(
                X_train, X_test, y_train, k_fold_splits, log)
            if log:
                section_timer.end_timer(
                    log=f"found best combination {best_combination} and best score {best_score}")
        else:
            y_pred, best_score = _lgb_default_predictions(
                X_train, X_test, y_train, k_fold_splits)
            if log:
                section_timer.end_timer(log=f"best score: {best_score}")
        return y_pred, None

    elif mode_key in ("lda", "linear discriminant"):
        classifier_name = "Linear Discriminant Analysis"
        section_timer = start_timer(classifier_name)
        classifier = build(lda)

    elif mode_key in ("gb", "gradient boosting"):
        classifier_name = "Gradient Boosting"
        section_timer = start_timer(classifier_name)
        classifier = GradientBoostingClassifier()

    else:
        raise ValueError(
            f'Unrecognized mode {mode.strip()}.\nOnly supported modes are "ensemble", "rf", "bayes", "logistic", "mlp", "knn", "svm", "adaboost", "lgb", "lda", "gb"'
        )

    # prediction: the model is fitted once and predict_proba is computed once
    fitted = classifier.fit(X_train, y_train)
    is_svm = classifier_name == "SVM"
    proba = None if is_svm else fitted.predict_proba(X_test)
    if probabilities and not is_svm:
        y_pred = proba[:, 1]
    else:
        y_pred = fitted.predict(X_test)

    if log:
        section_timer.end_timer(log=f"done")
    return y_pred, proba
test_ids = parsing.parse_CSV_to_df(file_path=file_path_test, log=False)["SK_ID_CURR"] X_train, y_train = df_train.drop( columns=["TARGET"]).to_numpy(), df_train["TARGET"] X_test = df_test.to_numpy() features = list(df_test.columns) y_test_pred, proba = classification.predict( X_train=X_train, X_test=X_test, X_validate=X_train, y_train=y_train, y_validate=y_train, mode=classifier, tuning=hyperparameters_tuning, probabilities=predict_probabilities, k_fold_splits=k_fold_splits, log=log) df_submission = pd.DataFrame(columns=["SK_ID_CURR", "TARGET"]) df_submission["SK_ID_CURR"], df_submission[ "TARGET"] = test_ids, y_test_pred parsing.write_df_to_file(df=df_submission, file_path=submission_path, log=log) parsing.write_df_to_file(df=df_submission, file_path="../submission.csv", log=log) if log: total_timer.end_timer(log=f"everything done")