Esempio n. 1
0
    )

# Optional feature engineering: swap each listed text column for its length.
LENGTH_ENCODE = True
if LENGTH_ENCODE:
    len_encode = ["URL"]
    for column in len_encode:
        X[f"{column}_len"] = X[column].map(len)
        X = X.drop(columns=column)

# Optionally cast every object-dtype column to pandas "category" dtype
# (disabled for this run).
CATEGORIZE = False
if CATEGORIZE:
    X = X.astype({column: "category" for column in obj_cols})

# Optionally expand the raw "date" column into date-part features
# (disabled for this run).
DATE_ENCODE = False
if DATE_ENCODE:
    X = encode_dates(X, "date")

# Visualise the target distribution before modelling.
fig = sns.displot(y)
plt.title("Distribution")
plt.show()

# Reproducibility constants and a smaller subsample for fast iteration.
SEED = 0
SAMPLE_SIZE = 10000

# split into train and validation set
Xt, Xv, yt, yv = train_test_split(X, y, random_state=SEED)
dt = lgb.Dataset(Xt, yt, free_raw_data=False)

# NOTE: np.random.choice samples WITH replacement by default, so the
# subsample may contain duplicate training rows.
np.random.seed(SEED)
sample_idx = np.random.choice(Xt.index, size=SAMPLE_SIZE)
Xs = Xt.loc[sample_idx]
ys = yt.loc[sample_idx]
ds = lgb.Dataset(Xs, ys)
Esempio n. 2
0
# Build the feature matrix by dropping termination/outcome columns that
# would leak the target.
_leak_cols = [
    "Termd",
    "EmploymentStatus",
    "DateofTermination",
    "LastPerformanceReview_Date",
    "EmpStatusID",
    "TermReason",
]
X = df.drop(columns=_leak_cols)

# Inspect dtypes, then expand every datetime column into date features.
X.info()
date_cols = X.select_dtypes("datetime")
for date_col in date_cols:
    X = encode_dates(X, date_col)

# Similarity-encode the high-cardinality text columns: each original column
# is replaced by its n-gram similarity to 4 k-means prototypes.
encode_columns = ["Employee_Name", "Position", "ManagerName"]
enc = SimilarityEncoder(similarity="ngram", categories="k-means", n_prototypes=4)
for col in encode_columns:
    sims = pd.DataFrame(
        enc.fit_transform(X[col].values.reshape(-1, 1)),
        index=X.index,
    )
    sims.columns = [f"{col}_{i}" for i in sims]
    X = pd.concat([X, sims], axis=1).drop(columns=col)

# Convert every remaining object-dtype column to pandas "category".
obj_cols = X.select_dtypes("object").columns
X = X.astype({col: "category" for col in obj_cols})


# Fixed seed for reproducibility of downstream randomness.
SEED = 0
Esempio n. 3
0
            count += 1
    return count


def get_female_count(x):
    """Return how many entries in the comma-separated string *x* equal "female".

    Args:
        x: A string such as "female, male, female" (entries joined by ", ").

    Returns:
        int: The number of exact "female" entries.
    """
    # list.count replaces the original manual loop-and-increment; splitting
    # on ", " mirrors the source data's separator exactly.
    return x.split(", ").count("female")


# Derive per-row gender counts from the raw borrower_genders string, then
# expand each timestamp column into date features.
X["male_count"] = X["borrower_genders"].map(get_male_count)
X["female_count"] = X["borrower_genders"].map(get_female_count)

for ts_col in ("posted_time", "disbursed_time", "funded_time", "date"):
    X = encode_dates(X, ts_col)

# Seed and subsample size for a reproducible fast-iteration dataset.
SEED = 0
SAMPLE_SIZE = 5000

# split into train and validation set
Xt, Xv, yt, yv = train_test_split(X, y, random_state=SEED)
dt = lgb.Dataset(Xt, yt, free_raw_data=False)

# NOTE: the default np.random.choice samples WITH replacement, so rows can
# repeat in the subsample.
np.random.seed(SEED)
sample_idx = np.random.choice(Xt.index, size=SAMPLE_SIZE)
Xs = Xt.loc[sample_idx]
ys = yt.loc[sample_idx]
ds = lgb.Dataset(Xs, ys)
Esempio n. 4
0
# Length-encoding is switched off for this dataset; when enabled, each listed
# text column is swapped for its character length.
LENGTH_ENCODE = False
if LENGTH_ENCODE:
    len_encode = ["URL"]
    for column in len_encode:
        X[f"{column}_len"] = X[column].map(len)
        X = X.drop(columns=column)

# Cast object columns to "category", then ordinal-encode the whole frame.
CATEGORIZE = True
if CATEGORIZE:
    X[obj_cols] = X[obj_cols].astype("category")
    enc = OrdinalEncoder()
    # NOTE(review): if this is sklearn's OrdinalEncoder, fit_transform returns
    # an ndarray and the later `Xt.index` / `.loc` calls would fail; this
    # presumably is category_encoders' DataFrame-returning encoder — confirm
    # which one is imported at the top of the file.
    X = enc.fit_transform(X)

# Date expansion is disabled here; the raw "date" column stays untouched.
DATE_ENCODE = False
if DATE_ENCODE:
    X = encode_dates(X, "date")

# Quick look at how the target is distributed.
fig = sns.displot(y)
plt.title("Distribution")
plt.show()

# Reproducible split plus a 10k-row subsample wrapped in XGBoost matrices.
SEED = 0
SAMPLE_SIZE = 10000

# split into train and validation set
Xt, Xv, yt, yv = train_test_split(X, y, random_state=SEED)
dt = xgb.DMatrix(Xt, yt)

# NOTE: np.random.choice defaults to sampling WITH replacement, so the
# subsample may contain duplicate rows.
np.random.seed(SEED)
sample_idx = np.random.choice(Xt.index, size=SAMPLE_SIZE)
Xs = Xt.loc[sample_idx]
ys = yt.loc[sample_idx]
ds = xgb.DMatrix(Xs, ys)
Esempio n. 5
0
# Convert the "+"-suffixed weight classes (e.g. "140+") to numbers by
# stripping the last character; anything unparseable maps to 0.
for raw_class in unique_plus_classes:
    try:
        numeric = float(raw_class[:-1])
    except ValueError:
        numeric = 0
    float_unique_plus_classes.append(numeric)

# Replace the raw class labels with their numeric counterparts and force
# the column to float.
X["WeightClassKg"] = (
    X["WeightClassKg"]
    .replace(unique_plus_classes, float_unique_plus_classes)
    .astype(float)
)
X = encode_dates(X, 'Date')

# Profile the categorical columns: absolute and relative cardinality, ordered
# from most unique to least, to spot ID-like columns.
obj_cols = X.select_dtypes("object").columns
nunique = X[obj_cols].nunique()
# Reuse the nunique series instead of recomputing it (the original called
# X[obj_cols].nunique() twice).
# NOTE(review): proportions divide by len(df), not len(X) — fine only if X
# still has df's row count; confirm upstream filtering.
prop_unique = (nunique / len(df)).sort_values(
    ascending=False
)  # in order of most unique to least
unique = pd.concat([prop_unique, nunique], axis=1)
unique.columns = [
    "proportion",
    "nunique",
]
unique

X = similarity_encode(
    X,