Example no. 1
        'workex': 'category',
        'specialisation': 'category',
        'status': 'category'
    }).drop(columns=['sl_no']).dropna().assign(outlier=lambda x: (np.abs(
        zscore(x.select_dtypes('number'))) < 3).all(axis=1)).dropna().drop(
            columns=['status', 'outlier']))

y = df.pop('salary')

# df.head()
# df.shape

#%% Preprocessor functions
ohe = ce.OneHotEncoder(
    drop_invariant=True,
    return_df=True,
    use_cat_names=True,
    handle_missing='return_nan')  # Remember to .replace(np.nan, 0) on the output

tge = ce.TargetEncoder(
    drop_invariant=True,
    return_df=True,
    handle_missing='value',
    # min_samples_leaf=3,
    # smoothing=0.4,
)

num_cols = ['ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p']
cat_cols = [
    'gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex', 'specialisation'
]
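
# A minimal wiring sketch (added; it assumes the df, y, ohe, tge, num_cols and
# cat_cols objects defined above, plus pandas as pd and numpy as np):
X_cat = ohe.fit_transform(df[cat_cols]).replace(np.nan, 0)  # NaNs come from handle_missing='return_nan'
X = pd.concat([df[num_cols], X_cat], axis=1)
# tge is the supervised alternative; it additionally needs the target:
# X_cat = tge.fit_transform(df[cat_cols], y)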
Example no. 2
    'sick.arff', 'solar.flare1.arff', 'solar.flare2.arff', 'soybean.arff',
    'spectrometer.arff', 'sponge.arff', 'tic-tac-toe.arff', 'trains.arff',
    'vote.arff', 'vowel.arff'
]

# We initialize each encoder explicitly here, because that leaves us free to
# configure each one with whatever settings we want.
encoders = [  #category_encoders.BackwardDifferenceEncoder(),
    category_encoders.BaseNEncoder(),
    category_encoders.BinaryEncoder(),
    category_encoders.HashingEncoder(),
    # category_encoders.HelmertEncoder(),
    category_encoders.JamesSteinEncoder(),
    category_encoders.LeaveOneOutEncoder(),
    category_encoders.MEstimateEncoder(),
    category_encoders.OneHotEncoder(),
    category_encoders.OrdinalEncoder(),
    # category_encoders.PolynomialEncoder(),
    # category_encoders.SumEncoder(),
    category_encoders.TargetEncoder(),
    category_encoders.WOEEncoder()
]

# Initialization
if os.path.isfile('./output/result.csv'):
    os.remove('./output/result.csv')

# Loop over datasets, then over encoders, and finally, over the models
for dataset_name in datasets:
    X, y, fold_count = arff_loader.load(dataset_name)
    non_numeric = list(X.select_dtypes(exclude=[np.number]).columns.values)
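    # Hypothetical continuation (added sketch, not the original code): encode
    # the non-numeric columns and append one result row per encoder;
    # evaluate() is an illustrative helper, not a real API.
    for encoder in encoders:
        X_encoded = encoder.fit_transform(X[non_numeric], y)
        score = evaluate(X_encoded, y, fold_count)  # hypothetical helper
        pd.DataFrame([[dataset_name, type(encoder).__name__, score]]).to_csv(
            './output/result.csv', mode='a', header=False, index=False)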
Example no. 3

df['Desc'] = df['Desc'].apply(remove_symbol)
print(df.head(10))

train, test = train_test_split(df, test_size=0.3, random_state=42)

# Load the stop words
file_stopw = open("stop_word.pck", "rb")
stop_word = pickle.load(file_stopw)

# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(df['Desc'])
vocab_size = len(t.word_index) + 1

# prepare class encoder
# NOTE: impute_missing is the pre-2.0 category_encoders argument; newer
# releases replace it with handle_missing.
le = ce.OneHotEncoder(return_df=False, impute_missing=False, handle_unknown="ignore")
labels = le.fit(list(df['Code']))  # fit() returns the fitted encoder itself
print(labels)

print(le.category_mapping)
print(len(le.category_mapping))


# integer encode the documents
encoded_train = t.texts_to_sequences(train['Desc'])

max_length = 256
padded_train = pad_sequences(encoded_train, maxlen=max_length, padding='post')
print(padded_train)

train_labels = train['Code']
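
# A short added sketch: with the encoder fitted above (return_df=False), the
# class labels can be turned into the dense one-hot matrix Keras expects:
train_label_vectors = le.transform(list(train_labels))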
Example no. 4

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from keras import regularizers
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam, SGD
from sklearn.utils import shuffle

import category_encoders as ce
oneHotEncoder = ce.OneHotEncoder(cols=[0, 1, 2])

np.random.seed(133)

ENCODERS_FIT = False

AGE_LIMIT = 1991


def evaluate_predictions(y_pred, y_real, buffer):

    correct_count = 0
    n = len(y_pred)

    for i in range(n):
Example no. 5
#Remove duplicates to get unique events/sensors
unique_events = list(dict.fromkeys(all_events))
unique_sensors = list(dict.fromkeys(all_sensors))

# Use natural (human) sorting on events/sensors
unique_events.sort(key=natural_keys)
unique_sensors.sort(key=natural_keys)
all_sensors.sort(key=natural_keys)
all_events.sort(key=natural_keys)

unique_sensors_dataframe = pd.DataFrame(data=unique_sensors, columns=['sensor'])
all_sensors_dataframe = pd.DataFrame(data=all_sensors, columns=['sensor'])
unique_events_dataframe = pd.DataFrame(data=unique_events, columns=['event'])
all_events_dataframe = pd.DataFrame(data=all_events, columns=['event'])

sensor_encoder = category_encoders.OneHotEncoder(cols=['sensor'])
event_encoder = category_encoders.OneHotEncoder(cols=['event'])
sensor_encoder.fit(unique_sensors_dataframe)
event_encoder.fit(unique_events_dataframe)

sensors_classes = sensor_encoder.transform(unique_sensors_dataframe)
encoded_sensors = sensor_encoder.transform(all_sensors_dataframe)

event_classes = event_encoder.transform(unique_events_dataframe)
encoded_events = event_encoder.transform(all_events_dataframe)

sensor_results = combine_df_columns(encoded_sensors)
event_results = combine_df_columns(encoded_events)
event_results2 = dict()

Example no. 6
    return X


train_race, val_race = train_test_split(trainval, random_state=42)
train = wrangle(train_race)
val = wrangle(val_race)
X_train = train.drop(columns=target)
X_val = val.drop(columns=target)
y_train = train[target]
y_val = val[target]

numeric_features = train_features.select_dtypes(
    include='number').columns.tolist()
cardinality = train_features.select_dtypes(exclude='number').nunique()
categorical_features = cardinality[cardinality <= 2000].index.tolist()

features = numeric_features + categorical_features

# model
model = make_pipeline(
    ce.OneHotEncoder(use_cat_names=True, handle_unknown='ignore'),
    SimpleImputer(strategy='median'),
    RandomForestClassifier(random_state=0, n_jobs=-1))

# Fit on train, score on val
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

print(model)
dump(model, 'model.pkl')
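
# A short added sketch: report validation accuracy for the fitted pipeline.
from sklearn.metrics import accuracy_score
print('Validation Accuracy:', accuracy_score(y_val, y_pred))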
Example no. 7

x = []  # assumed fresh list; the snippet is truncated just before this loop
for i in X['Per Share Net profit before tax']:
    if i < 0.170370:
        x.append('low')
    elif i < 0.179709:
        x.append('low-medium')
    elif i < 0.193493:
        x.append('high-medium')
    else:
        x.append('high')
X['Per Share Net profit before tax bin'] = pd.Series(x)
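
# An equivalent, more idiomatic alternative (added sketch; assumes numpy as np):
# pd.cut applies the same left-inclusive thresholds in one call.
X['Per Share Net profit before tax bin'] = pd.cut(
    X['Per Share Net profit before tax'],
    bins=[-np.inf, 0.170370, 0.179709, 0.193493, np.inf],
    labels=['low', 'low-medium', 'high-medium', 'high'],
    right=False)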

# pip install category_encoders  (run in a shell, or prefix with ! in a notebook)

# one hot encode
# NOTE: each encoder block below expects the original binned columns, so run
# them on a fresh copy of X rather than back-to-back on the same frame.
import category_encoders as ce
encoder = ce.OneHotEncoder(
    cols=['Per Share Net profit before tax bin',
          'Retained Earnings to Total Assets bin'],
    handle_unknown='return_nan',
    return_df=True,
    use_cat_names=True)
X = encoder.fit_transform(X)
X.head()


# hash encode
# Create object for hash encoder
encoder = ce.HashingEncoder(
    cols=['Per Share Net profit before tax bin',
          'Retained Earnings to Total Assets bin'],
    n_components=4)
# Fit and Transform Data
X = encoder.fit_transform(X)
X.head()


# Create object for binary encoding
encoder = ce.BinaryEncoder(
    cols=['Per Share Net profit before tax bin',
          'Retained Earnings to Total Assets bin'],
    return_df=True)
X = encoder.fit_transform(X)
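
# A small added check on output width: with 4 bin levels, BinaryEncoder needs
# only 3 columns (binary codes for ordinals 1..4), while one-hot needs 4.
demo = pd.DataFrame({'bin': ['low', 'low-medium', 'high-medium', 'high']})
print(ce.BinaryEncoder(cols=['bin']).fit_transform(demo).shape[1])  # -> 3
print(ce.OneHotEncoder(cols=['bin']).fit_transform(demo).shape[1])  # -> 4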
Example no. 8
import category_encoders as ce
from sklearn import ensemble
import xgboost as xgb
NOM_ENCODER = {
    'OneHotEncoder' : ce.OneHotEncoder(cols=['bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4', 'nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4'])
}

MODELS = {
    'RandomForest': ensemble.RandomForestClassifier(n_estimators=200, n_jobs=1,
                                                    verbose=2),
    'xgBoost': xgb.XGBClassifier(max_depth=15, learning_rate=0.03,
                                 n_estimators=400, verbosity=1,
                                 objective='binary:logistic')
}
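
# A hypothetical driver loop (added sketch): X_train and y_train are
# illustrative names, not part of the original snippet.
# encoded = NOM_ENCODER['OneHotEncoder'].fit_transform(X_train)
# for name, model in MODELS.items():
#     model.fit(encoded, y_train)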
Example no. 9
"""START: Import encoders"""
import category_encoders as ce
import sys
sys.path.append('../encoders/')
from ceng import CENGEncoder
from pattern_preserving import SimplePPEncoder, AgingPPEncoder, GeneticPPEncoder
from entity_embedding import EntityEmbeddingEncoder
from cesamo import CESAMOEncoder

Encoders = {
    'Ordinal': ce.OrdinalEncoder(),
    'Polynomial': ce.PolynomialEncoder(),
    'OneHot': ce.OneHotEncoder(),
    'BackwardDifference': ce.BackwardDifferenceEncoder(),
    'Helmert': ce.HelmertEncoder(),
    'EntityEmbedding': EntityEmbeddingEncoder(),
    'TargetEnc': ce.TargetEncoder(),
    'WOE': ce.WOEEncoder(),
    'CENG': CENGEncoder(verbose=0),
    'GeneticPP': GeneticPPEncoder(estimator_name='LinearRegression', num_predictors=2),
    'AgingPP':
Example no. 10
def norm_data(X_train,
              X_test,
              y_train,
              y_test,
              real=None,
              categ=None,
              all=True):
    '''Preprocessing features'''
    #  -------------   Split data on real and categ   -----------------
    X_train_categ = np.hstack((X_train[:, :2], X_train[:, 81:82]))
    X_test_categ = np.hstack((X_test[:, :2], X_test[:, 81:82]))

    X_train_real = np.hstack((X_train[:, 2:81], X_train[:, 82:]))
    X_test_real = np.hstack((X_test[:, 2:81], X_test[:, 82:]))

    #  -------  Check flag that we want to use all data for encoding --------
    if all:  # NOTE: this flag shadows Python's built-in all()
        X_all_categ = np.append(X_train_categ, X_test_categ, axis=0)
        #print (X.shape, X_train_categ.shape, X_test_categ.shape)
        y_all = np.append(y_train, y_test, axis=0)
        #print (y_all.shape, y_train.shape, y_test.shape)
    else:
        X_all_categ = X_train_categ
        y_all = y_train

    #  -------  Norm of real data on mean and deviation --------
    if real == 'standard':
        ss = StandardScaler()
        X_train_real_res = ss.fit_transform(X_train_real)
        X_test_real_res = ss.transform(X_test_real)
    elif real == 'normal':
        min_max_scaler = preprocessing.MinMaxScaler()
        X_train_real_res = min_max_scaler.fit_transform(X_train_real)
        X_test_real_res = min_max_scaler.transform(X_test_real)
    else:
        X_train_real_res = X_train_real
        X_test_real_res = X_test_real

    #  -------  Encoding of categorical features  -----------
    encoder_classes = {
        'target': ce.TargetEncoder,
        'onehot': ce.OneHotEncoder,
        'helmert': ce.HelmertEncoder,
        'hash': ce.HashingEncoder,
    }
    if categ in encoder_classes:
        encoder = encoder_classes[categ](cols=[0, 1, 2], return_df=False)
        encoder.fit(X_all_categ, y_all)

        X_train_categ_res = encoder.transform(X_train_categ)
        X_test_categ_res = encoder.transform(X_test_categ)
    else:
        X_train_categ_res = X_train_categ
        X_test_categ_res = X_test_categ

    #  ------------     Join data together  ---------------
    X_train_ready = np.hstack((X_train_categ_res, X_train_real_res))
    X_test_ready = np.hstack((X_test_categ_res, X_test_real_res))

    return X_train_ready, X_test_ready
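
# A minimal usage sketch (added): toy arrays with the column layout the slicing
# above implies (columns 0-1 and 81 categorical, the rest real-valued).
rng = np.random.default_rng(0)
X_demo = rng.integers(0, 3, size=(100, 90)).astype(float)
X_tr, X_te = norm_data(X_demo[:80], X_demo[80:],
                       rng.integers(0, 2, 80), rng.integers(0, 2, 20),
                       real='standard', categ='target', all=False)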
Example no. 11
train_processed.reset_index(drop=True, inplace=True)
target = train_processed["rent"]
target_log = np.log1p(target)
train_processed.drop(["id", "rent"], axis=1, inplace=True)
test_processed.drop("id", axis=1, inplace=True)

####################
## get feature
####################
# DataFrames used to train the model (kept separate to suit the category encoders)
train_use = pd.DataFrame()
test_use = pd.DataFrame()

### location ###
ce_onehot = ce.OneHotEncoder(cols=["district"], handle_missing="value")
train_use["district"] = train_processed["district"]
test_use["district"] = test_processed["district"]
train_use = ce_onehot.fit_transform(train_use)
test_use = ce_onehot.transform(test_use)

### access ###
train_use["min_to_nearest_sta"] = train_processed["access_min"].apply(
    lambda x: min(x) if x else np.nan)
test_use["min_to_nearest_sta"] = test_processed["access_min"].apply(
    lambda x: min(x) if x else np.nan)

train_use["num_sta"] = train_processed["access_sta"].apply(lambda x: len(x))
test_use["num_sta"] = test_processed["access_sta"].apply(lambda x: len(x))

# train lines
Example no. 12
def one_hot(data, column_names):
    # Encode the given string columns numerically via one-hot encoding.
    # NOTE: cols= must be passed by keyword: the first positional parameter
    # of OneHotEncoder is verbose, not cols.
    encoder = ce.OneHotEncoder(cols=column_names)
    data_transformed = encoder.fit_transform(data)
    return data_transformed
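
# A toy usage sketch (added; assumes pandas as pd alongside the ce import used
# above). Without use_cat_names, the encoder emits color_1/color_2 indicator
# columns and leaves the numeric size column untouched.
df_demo = pd.DataFrame({'color': ['red', 'green', 'red'], 'size': [1, 2, 3]})
print(one_hot(df_demo, ['color']))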
Example no. 13
# A - Label encoding
z = pd.DataFrame()
z['state'] = df['State']
le = LabelEncoder()
z['le_state'] = le.fit_transform(np.ravel(z))

print("label encode ~ state")
print('')

# data quality issue
print('Max categorical value for state is %s.\n' % z['le_state'].max())

# B - One-hot encoding
y = pd.DataFrame()
y['state'] = df['State']
oh = ce.OneHotEncoder(cols=['state'])
x = oh.fit_transform(y)
w = pd.concat([y, x], axis=1, ignore_index=False)

print("hot one encode ~ state")
print('')

# C - Binary encoding
v = pd.DataFrame()
v['state'] = df['State']
be = ce.BinaryEncoder(cols=['state'])
u = be.fit_transform(v)
t = pd.concat([v, u], axis=1, ignore_index=False)

print("binary encode ~ state")
print('')
Example no. 14

# dirty columns include: kids, courses
none_i = re.compile(r'none', flags=re.IGNORECASE)
# df.kids = none_i.sub(r'none\i', df.kids)
df['kids'].replace(none_i, 0, inplace=True)
df.kids = df['kids'].str.extract(r'^(\d+)', expand=False)

print(f'kids are {df.kids.unique()}')
print(f'gender are {df.gender.unique()}')

print(f'industry are {df.industry.unique()}')
print(f'military are {df.Military.unique()}')
# print(f'courses are {df.NumCourses.unique()}')

onehotencoder = ce.OneHotEncoder(
    cols=["gender", "InUS", "ethnicity", "Usstate", "marrital", "employment",
          "industry"],
    handle_unknown='impute')  # pre-2.0 API; newer releases use 'value'
df = onehotencoder.fit_transform(df)

col_non_num = [c for c in df.columns if df[c].dtype == 'object']

print('no error2')

df.drop(columns=col_non_num, inplace=True)
print(df.shape)
print(df.dtypes)
print(df.head(10))
# fill with mode, mean, or median
df_mode, df_mean, df_median = df.mode().iloc[0], df.mean(), df.median()
Example no. 15
def predict(user_data):
    m_path = Path(__file__).parent
    path = m_path.joinpath('dataset/clean_data.csv')
    df = pd.read_csv(str(path))
    
    df = df.loc[df['Current contraceptive method'] != 'Not using']
    df['Current contraceptive method'] = df['Current contraceptive method'].replace('Calendar or rhythm method/Periodic abstinence', 'Periodic abstinence', regex=True)
    df['Current contraceptive method'] = df['Current contraceptive method'].replace('Implants/Norplant', 'Implants', regex=True)
    df['Current contraceptive method'] = df['Current contraceptive method'].replace('Mucus/Billing/Ovulation', 'Ovulation', regex=True)

    columns = ["Respondent's current age",
                'Age of respondent at 1st birth',
                'Age at first menstrual period',
                'Recent sexual activity',
                'Region',
                'Type of place of residence',
                'Current marital status',
                'Births in last five years',
                'Births in last three years',
                'Births in past year',
                'Currently pregnant',
                'Total number all pregnacies',
                'Decision maker for using contraception',
                'Decision maker for not using contraception',
                'Preferred future method',
                'Smokes cigarettes',
                'Smokes pipe full of tobacco',
                'Chews tobacco',
                'Snuffs by nose',
                'Smokes kreteks',
                'Smokes cigars, cheroots or cigarillos',
                'Smokes water pipe',
                'Snuff by mouth',
                'Chews betel quid with tobacco',
                "Husband's desire for children",
                'Exposure',
                'Unmet need',
                'Unmet need (definition 2)',
                'Unmet need for contraception (definition 3)'
                ]
    X = df[columns]
    y = df['Current contraceptive method']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

    X_encoder = ce.OneHotEncoder(cols=[
        'Recent sexual activity',
        'Region',
        'Type of place of residence',
        'Current marital status',
        'Currently pregnant',
        'Decision maker for using contraception',
        'Decision maker for not using contraception',
        'Preferred future method',
        'Smokes cigarettes',
        'Smokes pipe full of tobacco',
        'Chews tobacco',
        'Snuffs by nose',
        'Smokes kreteks',
        'Smokes cigars, cheroots or cigarillos',
        'Smokes water pipe',
        'Snuff by mouth',
        'Chews betel quid with tobacco',
        "Husband's desire for children",
        'Exposure',
        'Unmet need',
        'Unmet need (definition 2)',
        'Unmet need for contraception (definition 3)'
    ])

    # X_train = X_encoder.fit_transform(X_train)
    # X_test = X_encoder.transform(X_test)
    rf_classifier = RandomForestClassifier(n_estimators=100)
    # rf_classifier.fit(X_train, y_train)

    # Preprocess, Use Model, and Train
    model = Pipeline([("preprocessing",X_encoder),("model",rf_classifier)]).fit(X_train, y_train)
    user_encode = model.predict(user_data)

    # Retrieve and return text
    result_text = user_encode[0]
    return result_text
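# Design note (added): wrapping the encoder and classifier in a single Pipeline
# fits the OneHotEncoder on X_train only and re-applies it at predict time,
# which is what makes the commented-out manual fit_transform/transform steps
# above unnecessary. A call then looks like (hypothetical values dict):
# predict(pd.DataFrame([dict(zip(columns, values))]))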
Example no. 16

# train test split the data (80/20)
train, val = train_test_split(df,
                              train_size=0.80,
                              test_size=0.20,
                              stratify=df[target],
                              random_state=42)

#%%
X_train = train[features]
y_train = train[target]
X_val = val[features]
y_val = val[target]

#%%
# fit a pipeline (Decision Tree)
pipelineTree = make_pipeline(ce.OneHotEncoder(use_cat_names=True),
                             SimpleImputer(strategy='mean'), StandardScaler(),
                             DecisionTreeClassifier(max_depth=3))

pipelineTree.fit(X_train, y_train)

#%%
# validation accuracy (Decision Tree)
y_pred_tree = pipelineTree.predict(X_val)
print('Validation Accuracy', accuracy_score(y_val, y_pred_tree))

#%%
y_pred_tree

#%%
# fit a pipeline (Random Forest)
Example no. 17
def main(dataSetName, X, y):

    scores = []
    raw_scores_ds = {}

    # Loading logistic regression classifier
    clf = linear_model.LogisticRegression()

    # try every encoding method available
    #encoders = ce.__all__
    encoders = [
        "BackwardDifferenceEncoder", "BinaryEncoder", "HashingEncoder",
        "HelmertEncoder", "OneHotEncoder", "OrdinalEncoder", "SumEncoder",
        "PolynomialEncoder", "BaseNEncoder", "LeaveOneOutEncoder"
    ]
    print(encoders)

    for encoder_name in encoders:
        print(encoder_name)
        # Look the encoder class up by name; every encoder listed above accepts `cols`.
        encoder = getattr(ce, encoder_name)(cols=columnsToEncode)
        print(encoder)
        start_time = time.time()
        score, stds, raw_scores, dim = score_models(clf, X, y, encoder,
                                                    encoder_name, dataSetName)
        scores.append([
            encoder_name, dataSetName[0], dim, score, stds,
            time.time() - start_time
        ])
        raw_scores_ds[encoder_name] = raw_scores
        gc.collect()

    results = pd.DataFrame(scores,
                           columns=[
                               'Encoding', 'Dataset', 'Dimensionality',
                               'Avg. Score', 'Score StDev', 'Elapsed Time'
                           ])

    #print(raw_scores_ds)
    #raw = pd.DataFrame.from_dict(raw_scores_ds)
    #print(raw)
    #ax = raw.plot(kind='box', return_type='axes')
    #plt.title('Scores for Encodings on %s Dataset' % (name, ))
    #plt.ylabel('Score (higher better)')
    #for tick in ax.get_xticklabels():
    #tick.set_rotation(90)
    #plt.grid()
    #plt.tight_layout()
    #plt.show()

    #return results, raw
    return results
Example no. 18
def __init__(self, **params):
    super().__init__(**params)
    self.transformer = ce.OneHotEncoder(**self.transformer_params)
Example no. 19
#data pre-processing
df_bank, cat_cols_bank = bank_data_prep(bank_data)
#df_adult, cat_cols_adult=adult_data_prep(adult_data)

#%%calculate the memory usage of the prepared data frame
BYTES_TO_MB = 0.000001

print(round(df_bank.memory_usage(deep=True).sum() * BYTES_TO_MB, 3))

#round(df_adult.memory_usage(deep=True).sum()* BYTES_TO_MB, 3)

#adult_data.info(memory_usage='deep')
#%% different encodings
# one-hot encoding
start_time = time.time()
one_hot_encoder = ce.OneHotEncoder(cols=cat_cols_bank)
one_hot_transformed = one_hot_encoder.fit_transform(df_bank)
print('computation time of one-hot :', time.time() - start_time)
print(
    'Memory usage after encoding: ',
    round(one_hot_transformed.memory_usage(deep=True).sum() * BYTES_TO_MB, 3))

# label encode
start_time = time.time()
label_encoder = ce.OrdinalEncoder(cols=cat_cols_bank)
label_transformed = label_encoder.fit_transform(df_bank)
print('computation time of label:', time.time() - start_time)
print('Memory usage after encoding: ',
      round(label_transformed.memory_usage(deep=True).sum() * BYTES_TO_MB, 3))

# hash encoding with the md5 hash function
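# A plausible continuation (added sketch), following the pattern above;
# hash_method defaults to 'md5' in category_encoders' HashingEncoder:
start_time = time.time()
hash_encoder = ce.HashingEncoder(cols=cat_cols_bank, hash_method='md5')
hash_transformed = hash_encoder.fit_transform(df_bank)
print('computation time of hashing:', time.time() - start_time)
print('Memory usage after encoding: ',
      round(hash_transformed.memory_usage(deep=True).sum() * BYTES_TO_MB, 3))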
Example no. 20
encoder = OneHotEncoder()
housing_cat_1hot = encoder.fit_transform(
    housing_category_encoded.reshape(-1, 1))
housing_cat_1hot

# In[31]:

encoder_test = OneHotEncoder()
housing_test_cat_1hot = encoder_test.fit_transform(
    housing_test_category_encoded.reshape(-1, 1))
housing_test_cat_1hot

# In[32]:

encoder = ce.OneHotEncoder()
housing_cat_reshaped = housing_category.values.reshape(-1, 1)
encoder.fit(housing_cat_reshaped)
X_cleaned = encoder.transform(housing_cat_reshaped)
cat_data = X_cleaned.to_numpy()  # .as_matrix() was removed in pandas 1.0
print(X_cleaned[0:5])
print(type(cat_data))
print(cat_data.shape)

# In[33]:

# NOTE: fitting a second encoder on the test split can yield columns that do
# not line up with the training encoding; reusing the train-fitted encoder is safer.
encoder_test = ce.OneHotEncoder()
housing_test_cat_reshaped = housing_test_category.values.reshape(-1, 1)
encoder_test.fit(housing_test_cat_reshaped)
X_test_cleaned = encoder_test.transform(housing_test_cat_reshaped)
cat_test_data = X_test_cleaned.to_numpy()  # .as_matrix() was removed in pandas 1.0
Example no. 21
def main():
    train_df = pd.read_csv(TRAIN_DATA_PATH)
    test_df = pd.read_csv(TEST_DATA_PATH)

    train_df["usage"] = "train"
    test_df["usage"] = "test"
    test_df["left"] = 100

    df = pd.concat([train_df, test_df], axis=0)
    usage = df.loc[:, "usage"]
    label = df.loc[:, "left"]
    df = df.drop(["usage", "left"], axis=1)

    categorical_columns = [c for c in df.columns if df[c].dtype == 'object']
    ce_ohe = ce.OneHotEncoder(cols=categorical_columns,
                              handle_unknown='impute')  # pre-2.0 API; newer releases use 'value'
    encoded_df = ce_ohe.fit_transform(df)
    encoded_df = pd.concat([encoded_df, usage, label], axis=1)

    train = encoded_df[encoded_df["usage"] == "train"].drop(
        "usage", axis=1).reset_index(drop=True)
    test = encoded_df[encoded_df["usage"] == "test"].drop(
        "usage", axis=1).reset_index(drop=True)

    train_x = train.drop(["left", "index"], axis=1)
    train_y = train.loc[:, "left"]
    index = test.loc[:, "index"]
    test_x = test.drop(["left", "index"], axis=1)

    f = partial(objective, train_x, train_y)  # bind the data arguments of the objective function
    study = optuna.create_study(
        direction='maximize')  # use Optuna to optimize the number of features to select

    study.optimize(f, n_trials=10)  # set the number of trials
    print('params:', study.best_params)  # print the parameters that were found
    best_feature_count = study.best_params['n_features_to_select']
    train_x, train_y = get_important_features(train_x, train_y,
                                              best_feature_count)

    n_splits = 10
    best_params = get_best_params(train_x, train_y)

    submission = np.zeros((len(test_x), 1))
    acc_scores = {}

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
    for i, (tr_idx, val_idx) in enumerate(skf.split(train_x, train_y)):
        tr_x = train_x.iloc[tr_idx].reset_index(drop=True)
        tr_y = train_y.iloc[tr_idx].reset_index(drop=True)
        val_x = train_x.iloc[val_idx].reset_index(drop=True)
        val_y = train_y.iloc[val_idx].reset_index(drop=True)

        tr_dataset = lgb.Dataset(tr_x, tr_y)
        val_dataset = lgb.Dataset(val_x, val_y, reference=tr_dataset)
        model = get_model(tr_dataset, val_dataset, best_params)

        y_pred = model.predict(test_x)
        preds = pd.DataFrame(y_pred)
        submission += preds

    submission_df = pd.DataFrame(submission / n_splits)

    submission_df = pd.concat([index, submission_df], axis=1)
    print("#################################")
    print(submission_df)
    print("#################################")

    submission_df.to_csv(SAVE_DATA_PATH, header=False, index=False)
Example no. 22

    df = df.drop(columns=i)
df = df.fillna(df.mode().iloc[0])
print(df.describe())
print(df.dtypes)
print(df.head())

# Split dataset into training and testing sets
X = df.drop(columns='Result')
Y = df['Result'].copy()
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=20)

# Convert categorical data to numerical
ohe = ce.OneHotEncoder(use_cat_names=True)
X_train_ohe = ohe.fit_transform(X_train)
X_test_ohe = ohe.transform(X_test)

# Training MLP
clf = MLPClassifier(solver='lbfgs',
                    activation='logistic',
                    learning_rate='constant',
                    max_iter=600,
                    hidden_layer_sizes=(10, 10))
clf.fit(X_train_ohe, Y_train)
prediction = clf.predict(X_test_ohe)

# Evaluate MLP
print(clf.get_params())
print('The accuracy on test set is: ', clf.score(X_test_ohe, Y_test))
Example no. 23

import category_encoders as ce
#encode remaining variables with one hot encoding
encoder = ce.OneHotEncoder(cols=['workclass', 'education', 'marital-status', 'occupation', 'relationship',
                                 'race', 'gender', 'native-country'])
print(encoder)
x_train = encoder.fit_transform(x_train)
x_test = encoder.transform(x_test)
print(x_train.shape)
print(x_test.shape)

# 11. Feature Scaling
cols = x_train.columns
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
x_train = pd.DataFrame(x_train, columns=cols)
x_test = pd.DataFrame(x_test, columns=cols)
Example no. 24
# remove ONLY the symbols (no stemming and no stop-word removal)
# df1['Desc'] = df1['Desc'].apply(remove_symbol)
# test = df1

# Load the stop words
file_stopw = open("support/stop_word.pck", "rb")
stop_word = pickle.load(file_stopw)

# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(df['Desc'])

vocab_size = len(t.word_index) + 1

# prepare class encoder
le = ce.OneHotEncoder(return_df=False, handle_unknown="ignore")
labels = le.fit(list(df['Code']))
print(labels)

# integer encode the documents
encoded_train = t.texts_to_sequences(train['Desc'])

max_length = 64
padded_train = pad_sequences(encoded_train, maxlen=max_length, padding='post')
print(padded_train)

# test_ids = df1['id']
test_ids = test['Code']

train_labels = train['Code']
# print(train_labels)
Example no. 25
import pyreadr
import pandas as pd
from sklearn.model_selection import train_test_split
import category_encoders as ce

# feature eng
df = pd.read_csv(
    'Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv'
)
df.drop(['history_segment', "conversion", "spend"], axis=1, inplace=True)
column_names = df.columns
cat_cols = ['zip_code', 'channel']
ce_one_hot = ce.OneHotEncoder(cols=cat_cols, use_cat_names=True)
data_ohe = ce_one_hot.fit_transform(df)
data_ohe.segment = data_ohe.segment.map({
    'Womens E-Mail': 1,
    'Mens E-Mail': 1,
    'No E-Mail': 0
})
data = data_ohe.copy()
train = data_ohe.drop('visit', axis=1)
column_names = list(train.columns)
train_np = train.to_numpy().astype(float)
treatment_col = column_names.index('segment')
y = data_ohe.visit.to_numpy().astype(float)

X_train, X_valid, Y_train, Y_valid = train_test_split(train_np,
                                                      y,
                                                      test_size=0.2,
                                                      stratify=y,
                                                      random_state=42)
Example no. 26
def get_feature_encoders(data_df, features, categorical_columns):
    encoder = ce.OneHotEncoder(use_cat_names=True)
    encoded_feature_names = encoder.fit_transform(data_df).columns.tolist()
    return encoder, encoded_feature_names
    '''encoders = dict()
Example no. 27
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Sun Jan  7 23:17:31 2018

@author: tgadfort
"""

#conda install -c conda-forge category_encoders
#https://github.com/scikit-learn-contrib/categorical-encoding

import category_encoders as ce

encoder = ce.BackwardDifferenceEncoder(cols=[...])
encoder = ce.BinaryEncoder(cols=[...])
encoder = ce.HashingEncoder(cols=[...])
encoder = ce.HelmertEncoder(cols=[...])
encoder = ce.OneHotEncoder(cols=[...])
encoder = ce.OrdinalEncoder(cols=[...])
encoder = ce.SumEncoder(cols=[...])
encoder = ce.PolynomialEncoder(cols=[...])
encoder = ce.BaseNEncoder(cols=[...])
encoder = ce.LeaveOneOutEncoder(cols=[...])
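
# A minimal usage sketch (added): every catalogue entry above follows the same
# fit_transform pattern; the toy frame here is illustrative only.
import pandas as pd
df_demo = pd.DataFrame({'state': ['CA', 'NY', 'CA'], 'city': ['LA', 'NYC', 'SF']})
print(ce.OneHotEncoder(cols=['state', 'city']).fit_transform(df_demo))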
Example no. 28

            if row[j] == tmp[i]:
                flag = True
        if not flag:
            tmp.append(row[j])
    row[j] = tmp.index(row[j])
    print(row[j])

x,t = [],[]

df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

list_cols = ['week','soldout','name','remarks','event','payday','weather']

ce_ohe = ce.OneHotEncoder(cols=list_cols, handle_unknown='impute')  # pre-2.0 API; newer releases use 'value'
df_train_ce_onehot = ce_ohe.fit_transform(df_train)
df_test_ce_onehot = ce_ohe.transform(df_test)  # transform, not fit_transform: reuse the train-fitted mapping

train_len = len(df_train_ce_onehot)

df_train_ce_onehot['precipitation'] = df_train_ce_onehot['precipitation'].str.replace('--','0')
df_test_ce_onehot['precipitation'] = df_test_ce_onehot['precipitation'].str.replace('--','0')
del df_train_ce_onehot['datetime']
del df_test_ce_onehot['datetime']

df_train_ce_onehot = pd.merge(df_train_ce_onehot,df_test_ce_onehot,how='outer')
df_test_ce_onehot = pd.merge(df_train_ce_onehot,df_test_ce_onehot,how='outer')

train_t = df_train_ce_onehot.loc[:,['y']]
train_t = train_t.drop(range(train_len,len(train_t)))
Example no. 29
import warnings
warnings.filterwarnings("ignore")

"""START: Import encoders"""
import category_encoders as ce
import sys
sys.path.append('../encoders/')
from ceng import CENGEncoder
from cesamo import CESAMOEncoder
from entity_embedding import EntityEmbeddingEncoder
from pattern_preserving import SimplePPEncoder, AgingPPEncoder, GeneticPPEncoder

Encoders = {'Ordinal': ce.OrdinalEncoder(),
            'Polynomial': ce.PolynomialEncoder(),
            'OneHot': ce.OneHotEncoder(),
            'BackwardDifference': ce.BackwardDifferenceEncoder(),
            'Helmert': ce.HelmertEncoder(),
            'EntityEmbedding': EntityEmbeddingEncoder(),
            'TargetEnc': ce.TargetEncoder(),
            'WOE': ce.WOEEncoder(),
            'CENG': CENGEncoder(verbose = 0),
            'GeneticPP': GeneticPPEncoder(),
            'AgingPP': AgingPPEncoder(),
            'SimplePP': SimplePPEncoder(),
            'CESAMOEncoder': CESAMOEncoder()}
"""END: Import encoders"""


"""START: Import models"""
try: 
Example no. 30

    axis=1,
    inplace=False)

stringFeatures = worldcupAllFeatures[[
    'Team1', 'Team2', 'Team1_Continent', 'Team2_Continent', 'Phase'
]].copy()

numericFeaturePipeline = Pipeline([
    ('selector', DataFrameSelector(list(numericFeatures))),
    ('imputer', Imputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

stringFeaturePipeline = Pipeline([
    ('selector', DataFrameSelector(list(stringFeatures))),
    ('cat_encoder', cs.OneHotEncoder(drop_invariant=True)),
])

full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", numericFeaturePipeline),
    ("cat_pipeline", stringFeaturePipeline),
])

preprocessedFeature = pd.DataFrame(
    data=full_pipeline.fit_transform(worldcupAllFeatures),
    index=np.arange(1, 65))

# Split the data into training/testing sets
worldcupFeatureTrainingData, testData, worldcupTargetTrainingData, testTarget = \
    train_test_split(preprocessedFeature, scoreAsTarget, test_size=0.2, random_state=1)