)

LENGTH_ENCODE = True
if LENGTH_ENCODE:
    len_encode = ["URL"]
    for col in len_encode:
        X[f"{col}_len"] = X[col].apply(len)  # replace the raw string with its length
        X = X.drop(col, axis=1)

CATEGORIZE = False
if CATEGORIZE:
    X[obj_cols] = X[obj_cols].astype("category")

DATE_ENCODE = False
if DATE_ENCODE:
    X = encode_dates(X, "date")

# check the target distribution before modeling
sns.displot(y)
plt.title("Distribution")
plt.show()

SEED = 0
SAMPLE_SIZE = 10000
Xt, Xv, yt, yv = train_test_split(X, y, random_state=SEED)  # split into train and validation sets
dt = lgb.Dataset(Xt, yt, free_raw_data=False)  # keep raw data so the Dataset can be reused

np.random.seed(SEED)
# note: np.random.choice samples with replacement by default;
# pass replace=False to avoid duplicate rows in the subsample
sample_idx = np.random.choice(Xt.index, size=SAMPLE_SIZE)
Xs, ys = Xt.loc[sample_idx], yt.loc[sample_idx]
ds = lgb.Dataset(Xs, ys)
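# The encode_dates helper called throughout is not shown in this excerpt.
# A minimal sketch of what it is assumed to do: expand one datetime column
# into numeric part columns and drop the original. The parts chosen here are
# illustrative, not necessarily the author's exact implementation.
import pandas as pd


def encode_dates(df: pd.DataFrame, col: str) -> pd.DataFrame:
    df = df.copy()
    dates = pd.to_datetime(df[col])
    for part in ("year", "month", "day", "dayofweek"):
        df[f"{col}_{part}"] = getattr(dates.dt, part)
    return df.drop(col, axis=1)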
X = df.drop(
    [
        "Termd",
        "EmploymentStatus",
        "DateofTermination",
        "LastPerformanceReview_Date",
        "EmpStatusID",
        "TermReason",
    ],
    axis=1,
)  # drop the target-related columns to avoid leakage
X.info()

date_cols = X.select_dtypes("datetime")
for col in date_cols:
    X = encode_dates(X, col)

encode_columns = ["Employee_Name", "Position", "ManagerName"]
enc = SimilarityEncoder(similarity="ngram", categories="k-means", n_prototypes=4)
for col in encode_columns:
    # encode each high-cardinality string column as 4 similarity scores
    transformed_values = enc.fit_transform(X[col].values.reshape(-1, 1))
    transformed_values = pd.DataFrame(transformed_values, index=X.index)
    transformed_values.columns = [f"{col}_{num}" for num in transformed_values]
    X = pd.concat([X, transformed_values], axis=1)
    X = X.drop(col, axis=1)

obj_cols = X.select_dtypes("object").columns
X[obj_cols] = X[obj_cols].astype("category")

SEED = 0
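# A standalone sketch of what the similarity encoding above produces, assuming
# SimilarityEncoder comes from dirty_cat (the import is not shown in this
# excerpt): with categories="k-means", each string column is replaced by
# n_prototypes numeric n-gram-similarity scores, one per prototype.
import numpy as np
from dirty_cat import SimilarityEncoder

names = np.array(
    ["Brown, Mia", "Brown, Mila", "LaRotonda, William", "Steans, Tyrone"]
).reshape(-1, 1)
demo_enc = SimilarityEncoder(similarity="ngram", categories="k-means", n_prototypes=2)
print(demo_enc.fit_transform(names).shape)  # (4, 2): one column per prototype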
def get_male_count(x):
    # count genders listed in the comma-separated borrower_genders string
    count = 0
    for gender in x.split(", "):
        if gender == "male":
            count += 1
    return count


def get_female_count(x):
    count = 0
    for gender in x.split(", "):
        if gender == "female":
            count += 1
    return count


X["male_count"] = X["borrower_genders"].apply(get_male_count)
X["female_count"] = X["borrower_genders"].apply(get_female_count)

X = encode_dates(X, "posted_time")
X = encode_dates(X, "disbursed_time")
X = encode_dates(X, "funded_time")
X = encode_dates(X, "date")

SEED = 0
SAMPLE_SIZE = 5000
Xt, Xv, yt, yv = train_test_split(X, y, random_state=SEED)  # split into train and validation sets
dt = lgb.Dataset(Xt, yt, free_raw_data=False)

np.random.seed(SEED)
sample_idx = np.random.choice(Xt.index, size=SAMPLE_SIZE)
Xs, ys = Xt.loc[sample_idx], yt.loc[sample_idx]
ds = lgb.Dataset(Xs, ys)
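# A minimal sketch of fitting on the 5k subsample, assuming a regression
# objective (the target and metric are not shown in this excerpt) and that all
# remaining columns are numeric or categorical:
params = {"objective": "regression", "seed": SEED, "verbosity": -1}
booster = lgb.train(params, ds, num_boost_round=10)  # quick fit on the sample
preds = booster.predict(Xv)  # sanity-check predictions on the validation frame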
LENGTH_ENCODE = False
if LENGTH_ENCODE:
    len_encode = ["URL"]
    for col in len_encode:
        X[f"{col}_len"] = X[col].apply(len)  # replace the raw string with its length
        X = X.drop(col, axis=1)

CATEGORIZE = True
if CATEGORIZE:
    X[obj_cols] = X[obj_cols].astype("category")
    # assumes a DataFrame-preserving encoder such as category_encoders.OrdinalEncoder;
    # sklearn's OrdinalEncoder would return a bare array and break Xt.index below
    enc = OrdinalEncoder()
    X = enc.fit_transform(X)

DATE_ENCODE = False
if DATE_ENCODE:
    X = encode_dates(X, "date")

# check the target distribution before modeling
sns.displot(y)
plt.title("Distribution")
plt.show()

SEED = 0
SAMPLE_SIZE = 10000
Xt, Xv, yt, yv = train_test_split(X, y, random_state=SEED)  # split into train and validation sets
dt = xgb.DMatrix(Xt, yt)

np.random.seed(SEED)
sample_idx = np.random.choice(Xt.index, size=SAMPLE_SIZE)
Xs, ys = Xt.loc[sample_idx], yt.loc[sample_idx]
ds = xgb.DMatrix(Xs, ys)
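# A minimal sketch of fitting on the subsampled DMatrix, assuming a regression
# objective (not shown in this excerpt), with the full training matrix as the
# eval set for a quick sanity check:
params = {"objective": "reg:squarederror", "seed": SEED}
booster = xgb.train(params, ds, num_boost_round=10, evals=[(dt, "train")])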
for weightclass in unique_plus_classes:
    # open-ended classes like "90+": strip the trailing "+" and parse as float
    try:
        weightclass = float(weightclass[:-1])
    except ValueError:
        weightclass = 0
    float_unique_plus_classes.append(weightclass)

X["WeightClassKg"] = (
    X["WeightClassKg"]
    .replace(unique_plus_classes, float_unique_plus_classes)
    .astype(float)
)

X = encode_dates(X, "Date")

obj_cols = X.select_dtypes("object").columns
nunique = X[obj_cols].nunique()
prop_unique = (nunique / len(df)).sort_values(
    ascending=False
)  # in order of most unique to least

unique = pd.concat([prop_unique, nunique], axis=1)
unique.columns = ["proportion", "nunique"]
unique

X = similarity_encode(
    X,