Example #1
import pandas as pd
from helpers import similarity_encode

obj_cols = X.select_dtypes("object").columns
nunique = X[obj_cols].nunique()
prop_unique = (X[obj_cols].nunique() / len(df)).sort_values(
    ascending=False)  # in order of most unique to least
unique = pd.concat([prop_unique, nunique], axis=1)
unique.columns = [
    "proportion",
    "nunique",
]
print(unique)

ENCODE = True
if ENCODE:
    X = similarity_encode(
        X,
        encode_columns=[
            "Subtitle",
        ],
        n_prototypes=4,
        preran=False,
        drop_original=True,
    )

LENGTH_ENCODE = True
if LENGTH_ENCODE:
    len_encode = ["URL"]
    for col in len_encode:
        X[f"{col}_len"] = X[col].apply(len)
        X = X.drop(col, axis=1)

CATEGORIZE = False
if CATEGORIZE:
    X[obj_cols] = X[obj_cols].astype("category")
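
The `similarity_encode` helper comes from the local `helpers` module and its body is not shown in these examples. As a rough illustration only (not the author's implementation), a similarity encoder of this kind typically scores each string against a few frequent "prototype" values using character n-gram similarity; a minimal sketch with scikit-learn, using hypothetical names, could look like this:

# Illustrative sketch of an n-gram similarity encoder; the real
# helpers.similarity_encode (with its preran/train flags) is not shown here.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def ngram_similarity_encode(X, column, n_prototypes=4, drop_original=True):
    values = X[column].fillna("").astype(str)
    # Use the most frequent values of the column as prototypes.
    prototypes = list(values.value_counts().index[:n_prototypes])
    vec = TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    sims = cosine_similarity(vec.fit_transform(values), vec.transform(prototypes))
    for i in range(len(prototypes)):
        X[f"{column}_sim{i}"] = sims[:, i]  # similarity to prototype i
    if drop_original:
        X = X.drop(column, axis=1)
    return X

# e.g. X = ngram_similarity_encode(X, "Subtitle", n_prototypes=4)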
Example #2
import lightgbm as lgb
from helpers import similarity_encode
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt

df = pd.read_csv(
    Path("data") / "forestfires.csv",  # Path (imported above) keeps the path portable
    parse_dates=[],
    index_col=[],
)
X, y = similarity_encode(df, encode=False, categorize=True, preran=False)
X = X.drop("rain", axis=1)
d = lgb.Dataset(X, y, silent=True)

# rmse: 98.18188205858038
NUM_BOOST_ROUND = 455
params = {
    "objective": "rmse",
    "metric": "rmse",
    "verbose": -1,
    "n_jobs": 6,
    "learning_rate": 0.004090619790710353,
    "feature_pre_filter": False,
    "lambda_l1": 6.99239231800302e-08,
    "lambda_l2": 9.330959145992983,
    "num_leaves": 9,
    "feature_fraction": 0.8999999999999999,
    "bagging_fraction": 1.0,
    "bagging_freq": 0,
    "min_child_samples": 20,
Example #3
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from helpers import similarity_encode

obj_cols = X.select_dtypes("object").columns
nunique = X[obj_cols].nunique()
prop_unique = (X[obj_cols].nunique() / len(df)).sort_values(
    ascending=False)  # in order of most unique to least
unique = pd.concat([prop_unique, nunique], axis=1)
unique.columns = [
    "proportion",
    "nunique",
]
unique

ENCODE = True  # toggle similarity encoding, as in Example #1
if ENCODE:
    X = similarity_encode(
        X,
        encode_columns=[],
        n_prototypes=5,
        train=True,
        drop_original=False,
    )

X[obj_cols] = X[obj_cols].astype("category")

sns.kdeplot(y)
plt.title("KDE distribution")
plt.show()

SEED = 0
SAMPLE_SIZE = 10000

Xt, Xv, yt, yv = train_test_split(
    X, y, random_state=SEED)  # split into train and validation set
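
The excerpt ends at the split. A typical next step (not shown in the example) is to train on `Xt`/`yt` and validate on `Xv`/`yv`; with LightGBM's native API and early stopping that could look roughly like the sketch below, where the parameter values are placeholders rather than the author's tuned settings:

# Illustrative continuation; LightGBM handles the pandas "category" dtypes
# set above natively. Requires LightGBM >= 3.3 for the early_stopping callback.
import lightgbm as lgb

dtrain = lgb.Dataset(Xt, yt)
dvalid = lgb.Dataset(Xv, yv, reference=dtrain)
booster = lgb.train(
    {"objective": "rmse", "metric": "rmse", "verbose": -1},  # placeholder params
    dtrain,
    num_boost_round=1000,
    valid_sets=[dvalid],
    callbacks=[lgb.early_stopping(stopping_rounds=50)],
)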
Example #4
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from helpers import similarity_encode

obj_cols = X.select_dtypes("object").columns
nunique = X[obj_cols].nunique()
prop_unique = (X[obj_cols].nunique() / len(df)).sort_values(
    ascending=False
)  # in order of most unique to least
unique = pd.concat([prop_unique, nunique], axis=1)
unique.columns = [
    "proportion",
    "nunique",
]
unique

X = similarity_encode(
    X,
    encode_columns=["Name", "MeetName", "Division", "Federation"],
    n_prototypes=5,
    train=True,
    drop_original=False,
)

X[obj_cols] = X[obj_cols].astype("category")

sns.kdeplot(y)
plt.title("KDE distribution")
plt.show()

SEED = 0
SAMPLE_SIZE = 10000

Xt, Xv, yt, yv = train_test_split(
    X, y, random_state=SEED)  # split into train and validation set
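
As in Example #3, the excerpt is cut off right after the split, so the model fit and evaluation are not visible. Once a model has been trained on the training part, scoring the held-out split might look like this (the `booster` below is hypothetical; it does not appear in the excerpt):

# Hypothetical evaluation step; `booster` stands in for a trained LightGBM
# model that is not part of the visible code.
import numpy as np
from sklearn.metrics import mean_squared_error

pred = booster.predict(Xv)
rmse = np.sqrt(mean_squared_error(yv, pred))
print(f"validation rmse: {rmse:.3f}")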