コード例 #1
0
        'workex': 'category',
        'specialisation': 'category',
        'status': 'category'
    }).drop(columns=['sl_no']).dropna().assign(outlier=lambda x: (np.abs(
        zscore(x.select_dtypes('number'))) < 3).all(axis=1)).dropna().drop(
            columns=['status', 'outlier']))

y = df.pop('salary')

# df.head()
# df.shape

#%% Preprocessor functions
ohe = ce.OneHotEncoder(
    drop_invariant=True,
    return_df=True,
    use_cat_names=True,
    handle_missing='return_nan')  # Remember replace(np.nan, 0)

tge = ce.TargetEncoder(
    drop_invariant=True,
    return_df=True,
    handle_missing='value',
    # min_samples_leaf=3,
    # smoothing=0.4,
)

num_cols = ['ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p']
cat_cols = [
    'gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex', 'specialisation'
]
コード例 #2
0
    'sick.arff', 'solar.flare1.arff', 'solar.flare2.arff', 'soybean.arff',
    'spectrometer.arff', 'sponge.arff', 'tic-tac-toe.arff', 'trains.arff',
    'vote.arff', 'vowel.arff'
]

# We painstakingly initialize each encoder here because that gives us the freedom to initialize the
# encoders with any setting we want.
encoders = [  #category_encoders.BackwardDifferenceEncoder(),
    category_encoders.BaseNEncoder(),
    category_encoders.BinaryEncoder(),
    category_encoders.HashingEncoder(),
    # category_encoders.HelmertEncoder(),
    category_encoders.JamesSteinEncoder(),
    category_encoders.LeaveOneOutEncoder(),
    category_encoders.MEstimateEncoder(),
    category_encoders.OneHotEncoder(),
    category_encoders.OrdinalEncoder(),
    # category_encoders.PolynomialEncoder(),
    # category_encoders.SumEncoder(),
    category_encoders.TargetEncoder(),
    category_encoders.WOEEncoder()
]

# Initialization
if os.path.isfile('./output/result.csv'):
    os.remove('./output/result.csv')

# Loop over datasets, then over encoders, and finally, over the models
for dataset_name in datasets:
    X, y, fold_count = arff_loader.load(dataset_name)
    non_numeric = list(X.select_dtypes(exclude=[np.number]).columns.values)
df['Desc'] = df['Desc'].apply(remove_symbol)
print(df.head(10))

train, test = train_test_split(df, test_size=0.3, random_state=42)

# Acquisizione delle stop word
file_stopw = open("stop_word.pck", "rb")
stop_word = pickle.load(file_stopw)

# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(df['Desc'])
vocab_size = len(t.word_index) + 1

#prepare class encoder
le = ce.OneHotEncoder(return_df=False, impute_missing=False, handle_unknown="ignore")
labels = le.fit(list(df['Code']))
print(labels)

print(le.category_mapping)
print(len(le.category_mapping))


# integer encode the documents
encoded_train = t.texts_to_sequences(train['Desc'])

max_length = 256
padded_train = pad_sequences(encoded_train, maxlen=max_length, padding='post')
print(padded_train)

train_labels = train['Code']
コード例 #4
0
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from keras import regularizers
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam, SGD
from sklearn.utils import shuffle
from keras import regularizers

import category_encoders as ce
oneHotEncoder = ce.OneHotEncoder(cols=[0, 1, 2])

np.random.seed(133)

ENCODERS_FIT = False

AGE_LIMIT = 1991


def evaluate_predictions(y_pred, y_real, buffer):

    correct_count = 0
    n = len(y_pred)

    for i in range(n):
コード例 #5
0
#Remove duplicates to get unique events/sensors
unique_events = list(dict.fromkeys(all_events))
unique_sensors = list(dict.fromkeys(all_sensors))

#Use humansorting on events/sensors
unique_events.sort(key=natural_keys)
unique_sensors.sort(key=natural_keys)
all_sensors.sort(key=natural_keys)
all_events.sort(key=natural_keys)

unique_sensors_dataframe = pd.DataFrame(data=unique_sensors, columns=['sensor'])
all_sensors_dataframe = pd.DataFrame(data=all_sensors, columns=['sensor'])
unique_events_dataframe = pd.DataFrame(data=unique_events, columns=['event'])
all_events_dataframe = pd.DataFrame(data=all_events, columns=['event'])

sensor_encoder = category_encoders.OneHotEncoder(cols=['sensor'])
event_encoder = category_encoders.OneHotEncoder(cols=['event'])
sensor_encoder.fit(unique_sensors_dataframe)
event_encoder.fit(unique_events_dataframe)

sensors_classes = sensor_encoder.transform(unique_sensors_dataframe)
encoded_sensors = sensor_encoder.transform(all_sensors_dataframe)

event_classes = event_encoder.transform(unique_events_dataframe)
encoded_events = event_encoder.transform(all_events_dataframe)

sensor_results = combine_df_columns(encoded_sensors)
event_results = combine_df_columns(encoded_events)
event_results2 = dict()

コード例 #6
0
ファイル: model.py プロジェクト: AnikaZN/LotR
    return X


train_race, val_race = train_test_split(trainval, random_state=42)
train = wrangle(train_race)
val = wrangle(val_race)
X_train = train.drop(columns=target)
X_val = val.drop(columns=target)
y_train = train[target]
y_val = val[target]

numeric_features = train_features.select_dtypes(
    include='number').columns.tolist()
cardinality = train_features.select_dtypes(exclude='number').nunique()
categorical_features = cardinality[cardinality <= 2000].index.tolist()

features = numeric_features + categorical_features

# model
model = make_pipeline(
    ce.OneHotEncoder(use_cat_names=True, handle_unknown='ignore'),
    SimpleImputer(strategy='median'),
    RandomForestClassifier(random_state=0, n_jobs=-1))

# Fit on train, score on val
model.fit(X_train, y_train)
model.predict(X_val)

print(model)
dump(model, 'model.pkl')
コード例 #7
0
for i in X['Per Share Net profit before tax']:
    if i < 0.17037:
        x.append('low')
    elif i >= 0.170370 and i < 0.179709:
        x.append('low-medium')
    elif i >= 0.179709 and i < 0.193493:
        x.append('high-medium')
    else:
        x.append('high')
X['Per Share Net profit before tax bin']=pd.Series(x)

pip install category_encoders

#one hot encode
import category_encoders as ce
encoder=ce.OneHotEncoder(cols=['Per Share Net profit before tax bin','Retained Earnings to Total Assets bin'],handle_unknown='return_nan',return_df=True,use_cat_names=True)
X = encoder.fit_transform(X)
X.head()


#hash encode
#Create object for hash encoder
encoder=ce.HashingEncoder(cols=['Per Share Net profit before tax bin','Retained Earnings to Total Assets bin'],n_components=4)
#Fit and Transform Data
X=encoder.fit_transform(X)
X.head()


#Create object for binary encoding
encoder= ce.BinaryEncoder(cols=['Per Share Net profit before tax bin','Retained Earnings to Total Assets bin'],return_df=True)
X=encoder.fit_transform(X) 
コード例 #8
0
import category_encoders as ce
from sklearn import ensemble
import xgboost as xgb
# Registry mapping an encoder name to a pre-configured instance; the listed
# cols cover the binary (bin_*) and nominal (nom_*) categorical features.
NOM_ENCODER = {
    'OneHotEncoder' : ce.OneHotEncoder(cols=['bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4', 'nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4'])
}

# Candidate classifiers keyed by display name; both are binary classifiers.
MODELS = {
'RandomForest': ensemble.RandomForestClassifier(n_estimators=200, n_jobs=1, verbose= 2),
'xgBoost': xgb.XGBClassifier(max_depth=15, learning_rate = 0.03, n_estimators=400, verbosity=1, objective='binary:logistic')
}
コード例 #9
0
"""START: Import encoders"""
import category_encoders as ce
import sys
sys.path.append('../encoders/')
from ceng import CENGEncoder
from pattern_preserving import SimplePPEncoder, AgingPPEncoder, GeneticPPEncoder
from entity_embedding import EntityEmbeddingEncoder
from cesamo import CESAMOEncoder

Encoders = {
    'Ordinal':
    ce.OrdinalEncoder(),
    'Polynomial':
    ce.PolynomialEncoder(),
    'OneHot':
    ce.OneHotEncoder(),
    'BackwardDifference':
    ce.BackwardDifferenceEncoder(),
    'Helmert':
    ce.HelmertEncoder(),
    'EntityEmbedding':
    EntityEmbeddingEncoder(),
    'TargetEnc':
    ce.TargetEncoder(),
    'WOE':
    ce.WOEEncoder(),
    'CENG':
    CENGEncoder(verbose=0),
    'GeneticPP':
    GeneticPPEncoder(estimator_name='LinearRegression', num_predictors=2),
    'AgingPP':
コード例 #10
0
def norm_data(X_train,
              X_test,
              y_train,
              y_test,
              real=None,
              categ=None,
              all=True):
    '''Scale the real-valued features and encode the categorical ones.

    Columns 0, 1 and 81 of the input matrices are treated as categorical;
    every other column is treated as real-valued.

    Parameters
    ----------
    X_train, X_test : np.ndarray
        Feature matrices with at least 83 columns.
    y_train, y_test : np.ndarray
        Targets aligned with X_train / X_test (used to fit supervised
        encoders such as TargetEncoder).
    real : {'standart', 'normal', None}
        'standart' -> StandardScaler, 'normal' -> MinMaxScaler, anything
        else leaves the real columns untouched.  (The misspelled key
        'standart' is kept for backward compatibility with callers.)
    categ : {'target', 'onehot', 'helmert', 'hash', None}
        Which category_encoders encoder to apply to the categorical
        columns; anything else leaves them untouched.
    all : bool
        When True the encoder is fitted on train + test pooled together,
        otherwise on the training part only.  (The parameter name shadows
        the builtin but is kept for backward compatibility.)

    Returns
    -------
    (X_train_ready, X_test_ready) : tuple of np.ndarray
        Categorical columns first, followed by the real columns.
    '''
    #  -------------   Split data on real and categ   -----------------
    X_train_categ = np.hstack((X_train[:, :2], X_train[:, 81:82]))
    X_test_categ = np.hstack((X_test[:, :2], X_test[:, 81:82]))

    X_train_real = np.hstack((X_train[:, 2:81], X_train[:, 82:]))
    X_test_real = np.hstack((X_test[:, 2:81], X_test[:, 82:]))

    #  -------  Check flag that we want to use all data for encoding --------
    if all == True:
        X_all_categ = np.append(X_train_categ, X_test_categ, axis=0)
        y_all = np.append(y_train, y_test, axis=0)
    else:
        X_all_categ = X_train_categ
        y_all = y_train

    #  -------  Norm of real data on mean and deviation --------
    if real == 'standart':
        ss = StandardScaler()
        X_train_real_res = ss.fit_transform(X_train_real)
        X_test_real_res = ss.transform(X_test_real)
    elif real == 'normal':
        min_max_scaler = preprocessing.MinMaxScaler()
        X_train_real_res = min_max_scaler.fit_transform(X_train_real)
        X_test_real_res = min_max_scaler.transform(X_test_real)
    else:
        X_train_real_res = X_train_real
        X_test_real_res = X_test_real

    #  -------  Encoding of categorical features  -----------
    # One lookup table instead of four copy-pasted elif branches.  The
    # encoder class is resolved lazily by name so `ce` is only touched
    # when an encoding is actually requested.
    categ_encoders = {
        'target': 'TargetEncoder',
        'onehot': 'OneHotEncoder',
        'helmert': 'HelmertEncoder',
        'hash': 'HashingEncoder',
    }
    if categ in categ_encoders:
        encoder = getattr(ce, categ_encoders[categ])(cols=[0, 1, 2],
                                                     return_df=False)
        encoder.fit(X_all_categ, y_all)

        X_train_categ_res = encoder.transform(X_train_categ)
        X_test_categ_res = encoder.transform(X_test_categ)
    else:
        X_train_categ_res = X_train_categ
        X_test_categ_res = X_test_categ

    #  ------------     Join data together  ---------------
    X_train_ready = np.hstack((X_train_categ_res, X_train_real_res))
    X_test_ready = np.hstack((X_test_categ_res, X_test_real_res))

    return X_train_ready, X_test_ready
コード例 #11
0
train_processed.reset_index(drop=True, inplace=True)
target = train_processed["rent"]
target_log = np.log1p(target)
train_processed.drop(["id", "rent"], axis=1, inplace=True)
test_processed.drop("id", axis=1, inplace=True)

####################
## get feature
####################
# モデル学習用データフレーム(category encoderの都合で分ける)
train_use = pd.DataFrame()
test_use = pd.DataFrame()

### location ###
ce_ordinal = ce.OneHotEncoder(cols=["district"], handle_missing="value")
train_use["district"] = train_processed["district"]
test_use["district"] = test_processed["district"]
train_use = ce_ordinal.fit_transform(train_use)
test_use = ce_ordinal.transform(test_use)

### access ###
train_use["min_to_nearest_sta"] = train_processed["access_min"].apply(
    lambda x: min(x) if x else np.nan)
test_use["min_to_nearest_sta"] = test_processed["access_min"].apply(
    lambda x: min(x) if x else np.nan)

train_use["num_sta"] = train_processed["access_sta"].apply(lambda x: len(x))
test_use["num_sta"] = test_processed["access_sta"].apply(lambda x: len(x))

# 路線
コード例 #12
0
ファイル: item_based.py プロジェクト: egilabert/marketplace
 def one_hot(data, column_names):
     """One-hot encode the given columns of *data*.

     Parameters
     ----------
     data : pd.DataFrame
         Input frame containing the categorical columns.
     column_names : list
         Names of the columns to one-hot encode.

     Returns
     -------
     pd.DataFrame
         Copy of *data* with the listed columns one-hot encoded.
     """
     # BUG FIX: the original passed column_names positionally, which binds
     # it to OneHotEncoder's first parameter (verbose), not to cols= — so
     # the requested columns were silently ignored and the encoder picked
     # columns automatically.  Pass it as the cols keyword instead.
     encoder = ce.OneHotEncoder(cols=column_names)
     data_transformed = encoder.fit_transform(data)
     return (data_transformed)
コード例 #13
0
# A - Label encoding
z = pd.DataFrame()
z['state'] = df['State']
le = LabelEncoder()
z['le_state'] = le.fit_transform(np.ravel(z))

print("label encode ~ state")
print('')

# data quality issue
print('Max categorical value for state is %s.\n' % z['le_state'].max())

# B - Hot One encoding
y = pd.DataFrame()
y['state'] = df['State']
oh = ce.OneHotEncoder(cols=['state'])
x = oh.fit_transform(y)
w = pd.concat([y, x], axis=1, ignore_index=False)

print("hot one encode ~ state")
print('')

# C - Binary encoding
v = pd.DataFrame()
v['state'] = df['State']
be = ce.BinaryEncoder(cols=['state'])
u = be.fit_transform(v)
t = pd.concat([v, u], axis=1, ignore_index=False)

print("binary encode ~ state")
print('')
コード例 #14
0
#dirty columns include: kids, courses
none_i = re.compile(r'none', flags=re.IGNORECASE)
# df.kids = none_i.sub(r'none\i', df.kids)
df['kids'].replace(none_i, 0, inplace=True)
df.kids = df['kids'].str.extract(r'^(\d+)', expand=False)

print(f'kids are {df.kids.unique()}')
print(f'gender are {df.gender.unique()}')

print(f'industry are {df.industry.unique()}')
print(f'military are {df.Military.unique()}')
# print(f'courses are {df.NumCourses.unique()}')

onehotecoder = ce.OneHotEncoder(cols=[
    "gender", "InUS", "ethnicity", "Usstate", "marrital", "employment",
    "industry"
],
                                handle_unknown='impute')
df = onehotecoder.fit_transform(df)

col_non_num = [c for c in df.columns if df[c].dtype == 'object']

print('no error2')

df.drop(columns=col_non_num, inplace=True)
print(df.shape)
print(df.dtypes)
print(df.head(10))
# fill with mode, mean, or median
df_mode, df_mean, df_median = df.mode().iloc[0], df.mean(), df.median()
コード例 #15
0
def predict(user_data):
    """Predict a contraceptive-method recommendation for *user_data*.

    Loads the cleaned survey dataset that ships next to this module,
    trains a one-hot-encoder + random-forest pipeline on it, and returns
    the predicted class label (a string) for the supplied user record.

    NOTE(review): the model is re-trained from the CSV on every call —
    consider caching the fitted pipeline if this runs in a hot path.
    """
    m_path = Path(__file__).parent
    path = m_path.joinpath('dataset/clean_data.csv')
    df = pd.read_csv(str(path))

    # Keep only respondents that actually use some method, and shorten a
    # few verbose method labels so the class names are manageable.
    df = df.loc[df['Current contraceptive method'] != 'Not using']
    df['Current contraceptive method'] = df['Current contraceptive method'].replace('Calendar or rhythm method/Periodic abstinence', 'Periodic abstinence', regex=True)
    df['Current contraceptive method'] = df['Current contraceptive method'].replace('Implants/Norplant', 'Implants', regex=True)
    df['Current contraceptive method'] = df['Current contraceptive method'].replace('Mucus/Billing/Ovulation', 'Ovulation', regex=True)

    # Survey fields used as predictors (mix of numeric and categorical).
    columns = ["Respondent's current age",
                'Age of respondent at 1st birth',
                'Age at first menstrual period',
                'Recent sexual activity',
                'Region',
                'Type of place of residence',
                'Current marital status',
                'Births in last five years',
                'Births in last three years',
                'Births in past year',
                'Currently pregnant',
                'Total number all pregnacies',
                'Decision maker for using contraception',
                'Decision maker for not using contraception',
                'Preferred future method',
                'Smokes cigarettes',
                'Smokes pipe full of tobacco',
                'Chews tobacco',
                'Snuffs by nose',
                'Smokes kreteks',
                'Smokes cigars, cheroots or cigarillos',
                'Smokes water pipe',
                'Snuff by mouth',
                'Chews betel quid with tobacco',
                "Husband's desire for children",
                'Exposure',
                'Unmet need',
                'Unmet need (definition 2)',
                'Unmet need for contraception (definition 3)'
                ]
    X = df[columns]
    y = df['Current contraceptive method']

    # 70/30 split; X_test/y_test are currently unused (see commented-out
    # evaluation code below) but kept for parity with the notebook origin.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

    # One-hot encode the categorical survey answers; the purely numeric
    # columns are left untouched by the encoder.
    X_encoder = ce.OneHotEncoder(cols=[
        'Recent sexual activity',
        'Region',
        'Type of place of residence',
        'Current marital status',
        'Currently pregnant',
        'Decision maker for using contraception',
        'Decision maker for not using contraception',
        'Preferred future method',
        'Smokes cigarettes',
        'Smokes pipe full of tobacco',
        'Chews tobacco',
        'Snuffs by nose',
        'Smokes kreteks',
        'Smokes cigars, cheroots or cigarillos',
        'Smokes water pipe',
        'Snuff by mouth',
        'Chews betel quid with tobacco',
        "Husband's desire for children",
        'Exposure',
        'Unmet need',
        'Unmet need (definition 2)',
        'Unmet need for contraception (definition 3)'
    ])

    # X_train = X_encoder.fit_transform(X_train)
    # X_test = X_encoder.transform(X_test)
    rf_classifier = RandomForestClassifier(n_estimators=100)
    # rf_classifier.fit(X_train, y_train)

    # Preprocess, Use Model, and Train
    # The pipeline applies the encoder before the forest, so *user_data*
    # must arrive with the same raw columns as X_train.
    model = Pipeline([("preprocessing",X_encoder),("model",rf_classifier)]).fit(X_train, y_train)
    user_encode = model.predict(user_data)

    # Retrieve and return text
    result_text = user_encode[0]
    return result_text
コード例 #16
0
# train test split the data (80/20)
train, val = train_test_split(df,
                              train_size=0.80,
                              test_size=.20,
                              stratify=df[target],
                              random_state=42)

#%%
X_train = train[features]
y_train = train[target]
X_val = val[features]
y_val = val[target]

#%%
# fit a pipeline (Decision Tree)
pipelineTree = make_pipeline(ce.OneHotEncoder(use_cat_names=True),
                             SimpleImputer(strategy='mean'), StandardScaler(),
                             DecisionTreeClassifier(max_depth=3))

pipelineTree.fit(X_train, y_train)

#%%
# validation accuracy (Decision Tree)
y_pred_tree = pipelineTree.predict(X_val)
print('Validation Accuracy', accuracy_score(y_val, y_pred_tree))

#%%
y_pred_tree

#%%
# fit a pipeline (Random Forest)
コード例 #17
0
def main(dataSetName, X, y):
    """Benchmark every supported categorical encoding on one dataset.

    Applies each encoder in turn, scores a logistic-regression classifier
    via score_models(), and collects score statistics, the resulting
    dimensionality and the elapsed time per encoder.

    Parameters
    ----------
    dataSetName : sequence
        Dataset identifier; element 0 is used as the display name.
    X, y : array-like
        Features and target, passed straight through to score_models().

    Returns
    -------
    pd.DataFrame
        One row per encoder with score statistics and elapsed time.
    """

    scores = []
    raw_scores_ds = {}

    # Loading logistic regression classifier
    clf = linear_model.LogisticRegression()

    # Encoder class names to benchmark (all live in category_encoders).
    encoders = [
        "BackwardDifferenceEncoder", "BinaryEncoder", "HashingEncoder",
        "HelmertEncoder", "OneHotEncoder", "OrdinalEncoder", "SumEncoder",
        "PolynomialEncoder", "BaseNEncoder", "LeaveOneOutEncoder"
    ]
    print(encoders)

    for encoder_name in encoders:
        print(encoder_name)
        # Every branch of the original if-chain built the identical call
        # ce.<EncoderName>(cols=columnsToEncode); resolve the class by
        # name instead of repeating ten near-identical branches.
        # NOTE(review): columnsToEncode is a module-level name defined
        # elsewhere in this file — confirm it is set before main() runs.
        encoder = getattr(ce, encoder_name)(cols=columnsToEncode)
        print(encoder)
        start_time = time.time()
        score, stds, raw_scores, dim = score_models(clf, X, y, encoder,
                                                    encoder_name, dataSetName)
        scores.append([
            encoder_name, dataSetName[0], dim, score, stds,
            time.time() - start_time
        ])
        raw_scores_ds[encoder_name] = raw_scores
        gc.collect()  # free per-encoder intermediates before the next run

    results = pd.DataFrame(scores,
                           columns=[
                               'Encoding', 'Dataset', 'Dimensionality',
                               'Avg. Score', 'Score StDev', 'Elapsed Time'
                           ])

    return results
コード例 #18
0
ファイル: preprocessing.py プロジェクト: syaffa/automlk
 def __init__(self, **params):
     """Initialise the transformer wrapper.

     Forwards **params to the parent class, then builds the underlying
     category_encoders OneHotEncoder from self.transformer_params —
     presumably populated by the parent __init__; confirm upstream.
     """
     super().__init__(**params)
     self.transformer = ce.OneHotEncoder(**self.transformer_params)
コード例 #19
0
#data pre-processing
df_bank, cat_cols_bank = bank_data_prep(bank_data)
#df_adult, cat_cols_adult=adult_data_prep(adult_data)

#%%calculate the memory usage of the prepared data frame
BYTES_TO_MB = 0.000001

print(round(df_bank.memory_usage(deep=True).sum() * BYTES_TO_MB, 3))

#round(df_adult.memory_usage(deep=True).sum()* BYTES_TO_MB, 3)

#adult_data.info(memory_usage='deep')
#%% different embedding
# one-hot encoding
start_time = time.time()
one_hot_encoder = ce.OneHotEncoder(cols=cat_cols_bank)
one_hot_transformed = one_hot_encoder.fit_transform(df_bank)
print('computation time of one-hot :', time.time() - start_time)
print(
    'Memory usage after encoding: ',
    round(one_hot_transformed.memory_usage(deep=True).sum() * BYTES_TO_MB, 3))

# label encode
start_time = time.time()
label_encoder = ce.OrdinalEncoder(cols=cat_cols_bank)
label_transformed = label_encoder.fit_transform(df_bank)
print('computation time of label:', time.time() - start_time)
print('Memory usage after encoding: ',
      round(label_transformed.memory_usage(deep=True).sum() * BYTES_TO_MB, 3))

#hash encoding  with md5 hash function
コード例 #20
0
encoder = OneHotEncoder()
housing_cat_1hot = encoder.fit_transform(
    housing_category_encoded.reshape(-1, 1))
housing_cat_1hot

# In[31]:

encoder_test = OneHotEncoder()
housing_test_cat_1hot = encoder_test.fit_transform(
    housing_test_category_encoded.reshape(-1, 1))
housing_test_cat_1hot

# In[32]:

encoder = ce.OneHotEncoder()
housing_cat_reshaped = housing_category.values.reshape(-1, 1)
encoder.fit(housing_cat_reshaped)
X_cleaned = encoder.transform(housing_cat_reshaped)
cat_data = X_cleaned.as_matrix()
print(X_cleaned[0:5])
print(type(cat_data))
print(cat_data.shape)

# In[33]:

encoder_test = ce.OneHotEncoder()
housing_test_cat_reshaped = housing_test_category.values.reshape(-1, 1)
encoder_test.fit(housing_test_cat_reshaped)
X_test_cleaned = encoder_test.transform(housing_test_cat_reshaped)
cat_test_data = X_test_cleaned.as_matrix()
コード例 #21
0
def main():
    """Train a LightGBM model with Optuna-tuned feature selection and
    write per-row predictions for the test set to SAVE_DATA_PATH.

    Pipeline: concat train+test -> one-hot encode -> split back ->
    tune feature count with Optuna -> 10-fold stratified CV ensemble ->
    average fold predictions -> save CSV.
    """
    train_df = pd.read_csv(TRAIN_DATA_PATH)
    test_df = pd.read_csv(TEST_DATA_PATH)

    # Tag rows so the frames can be split apart again after encoding;
    # the test set gets a dummy label (100) so columns line up.
    train_df["usage"] = "train"
    test_df["usage"] = "test"
    test_df["left"] = 100

    df = pd.concat([train_df, test_df], axis=0)
    usage = df.loc[:, "usage"]
    label = df.loc[:, "left"]
    df = df.drop(["usage", "left"], axis=1)

    # One-hot encode every object-dtype column.
    # NOTE(review): handle_unknown='impute' is deprecated in newer
    # category_encoders releases — confirm the pinned version accepts it.
    categorical_columns = [c for c in df.columns if df[c].dtype == 'object']
    ce_ohe = ce.OneHotEncoder(cols=categorical_columns,
                              handle_unknown='impute')
    encorded_df = ce_ohe.fit_transform(df)
    encorded_df = pd.concat([encorded_df, usage, label], axis=1)

    # Split the encoded frame back into train / test parts.
    train = encorded_df[encorded_df["usage"] == "train"].drop(
        "usage", axis=1).reset_index(drop=True)
    test = encorded_df[encorded_df["usage"] == "test"].drop(
        "usage", axis=1).reset_index(drop=True)

    # NOTE(review): an "index" column is assumed to exist in the input
    # CSVs (it is used as the submission row id) — confirm upstream.
    train_x = train.drop(["left", "index"], axis=1)
    train_y = train.loc[:, "left"]
    index = test.loc[:, "index"]
    test_x = test.drop(["left", "index"], axis=1)

    f = partial(objective, train_x, train_y)  # bind the data into the objective
    study = optuna.create_study(
        direction='maximize')  # Optuna tunes the number of features to select

    study.optimize(f, n_trials=10)  # run 10 tuning trials
    print('params:', study.best_params)  # report the best parameters found
    best_feature_count = study.best_params['n_features_to_select']
    train_x, train_y = get_important_features(train_x, train_y,
                                              best_feature_count)

    n_splits = 10
    best_params = get_best_params(train_x, train_y)

    # Accumulate fold predictions, then average them below.
    submission = np.zeros((len(test_x), 1))
    acc_scores = {}

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
    for i, (tr_idx, val_idx) in enumerate(skf.split(train_x, train_y)):
        tr_x = train_x.iloc[tr_idx].reset_index(drop=True)
        tr_y = train_y.iloc[tr_idx].reset_index(drop=True)
        val_x = train_x.iloc[val_idx].reset_index(drop=True)
        val_y = train_y.iloc[val_idx].reset_index(drop=True)

        tr_dataset = lgb.Dataset(tr_x, tr_y)
        val_dataset = lgb.Dataset(val_x, val_y, reference=tr_dataset)
        model = get_model(tr_dataset, val_dataset, best_params)

        # Predict the test set with this fold's model and accumulate.
        y_pred = model.predict(test_x)
        preds = pd.DataFrame(y_pred)
        submission += preds

    # Average over folds to get the ensemble prediction.
    submission_df = pd.DataFrame(submission / n_splits)

    submission_df = pd.concat([index, submission_df], axis=1)
    print("#################################")
    print(submission_df)
    print("#################################")

    submission_df.to_csv(SAVE_DATA_PATH, header=False, index=False)
コード例 #22
0
    df = df.drop(columns=i)
df = df.fillna(df.mode().iloc[0])
print(df.describe())
print(df.dtypes)
print(df.head())

# Split dataset into training and testing sets
X = df.drop(columns='Result')
Y = df['Result'].copy()
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=20)

# Convert categorical data to numerical
ohe = ce.OneHotEncoder(use_cat_names=True)
X_train_ohe = ohe.fit_transform(X_train)
X_test_ohe = ohe.transform(X_test)

# Training MLP
clf = MLPClassifier(solver='lbfgs',
                    activation='logistic',
                    learning_rate='constant',
                    max_iter=600,
                    hidden_layer_sizes=(10, 10))
clf.fit(X_train_ohe, Y_train)
prediction = clf.predict(X_test_ohe)

# Evaluate MLP
print(clf.get_params())
print('The accuracy on test set is: ', clf.score(X_test_ohe, Y_test))
コード例 #23
0
# import category encoders

# # import category_encoders as ce
# # encode remaining variables with one-hot encoding

# # encoder = ce.OneHotEncoder(cols=['workclass', 'education', 'marital_status', 'occupation', 'relationship',
# #                                  'race', 'sex', 'native_country'])

# # X_train = encoder.fit_transform(X_train)

# # X_test = encoder.transform(X_test)
# X_train.head()

import category_encoders as ce
#encode remaining variables with one hot encoding
encoder = ce.OneHotEncoder(cols=['workclass', 'education', 'marital-status', 'occupation', 'relationship',
                                 'race', 'gender', 'native-country'])
print(encoder)
x_train = encoder.fit_transform(x_train)
x_test = encoder.transform(x_test)
print(x_train.shape)
print(x_test.shape)

# 11. Feature Scaling
    # Table of Contents
cols = x_train.columns
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
x_train = pd.DataFrame(x_train, columns=[cols])
x_test = pd.DataFrame(x_test, columns=[cols])
コード例 #24
0
ファイル: LSTM.py プロジェクト: turmovasiring/CODIESP-10
# rimozione SOLO dei simboli (nessuno stemming e nessuna rimozione delle stopword)
# df1['Desc'] = df1['Desc'].apply(remove_symbol)
# test = df1

# Acquisizione delle stop word
file_stopw = open("support/stop_word.pck", "rb")
stop_word = pickle.load(file_stopw)

# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(df['Desc'])

vocab_size = len(t.word_index) + 1

# prepare class encoder
le = ce.OneHotEncoder(return_df=False, handle_unknown="ignore")
labels = le.fit(list(df['Code']))
print(labels)

# integer encode the documents
encoded_train = t.texts_to_sequences(train['Desc'])

max_length = 64
padded_train = pad_sequences(encoded_train, maxlen=max_length, padding='post')
print(padded_train)

# test_ids = df1['id']
test_ids = test['Code']

train_labels = train['Code']
# print(train_labels)
コード例 #25
0
import pyreadr
import pandas as pd
from sklearn.model_selection import train_test_split
import category_encoders as ce

# feature eng
df = pd.read_csv(
    'Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv'
)
# Drop leakage-prone / redundant columns before encoding.
df.drop(['history_segment', "conversion", "spend"], axis=1, inplace=True)
column_names = df.columns
cat_cols = ['zip_code', 'channel']
ce_one_hot = ce.OneHotEncoder(cols=cat_cols, use_cat_names=True)
data_ohe = ce_one_hot.fit_transform(df)
# Collapse the campaign segment into a binary treatment flag:
# any e-mail (mens or womens) -> 1, no e-mail -> 0.
data_ohe.segment = data_ohe.segment.map({
    'Womens E-Mail': 1,
    'Mens E-Mail': 1,
    'No E-Mail': 0
})
data = data_ohe.copy()
train = data_ohe.drop('visit', axis=1)
column_names = list(train.columns)
train_np = train.to_numpy().astype(float)
# Remember which column carries the treatment flag after encoding.
treatment_col = column_names.index('segment')
# Outcome: site visit (binary).
y = data_ohe.visit.to_numpy().astype(float)

# Stratified 80/20 split on the outcome.
X_train, X_valid, Y_train, Y_valid = train_test_split(train_np,
                                                      y,
                                                      test_size=0.2,
                                                      stratify=y,
                                                      random_state=42)
コード例 #26
0
def get_feature_encoders(data_df, features, categorical_columns):
    """Fit a one-hot encoder on *data_df* and return it together with the
    list of encoded column names.

    NOTE(review): the `features` and `categorical_columns` parameters are
    currently unused — the encoder is fitted on every column of data_df.
    Confirm whether encoding should be restricted to categorical_columns.
    """
    encoder = ce.OneHotEncoder(use_cat_names=True)
    encoded_feature_names = encoder.fit_transform(data_df).columns.tolist()
    return encoder, encoded_feature_names
    '''encoders = dict()
コード例 #27
0
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Sun Jan  7 23:17:31 2018

@author: tgadfort
"""

#conda install -c conda-forge category_encoders
#https://github.com/scikit-learn-contrib/categorical-encoding

import category_encoders as ce

# Catalogue of the encoders shipped with category_encoders.  This is a
# usage demo, not a working pipeline: each assignment below overwrites
# `encoder`, and cols=[...] is a placeholder (a literal Ellipsis) to be
# replaced with real column names.
encoder = ce.BackwardDifferenceEncoder(cols=[...])
encoder = ce.BinaryEncoder(cols=[...])
encoder = ce.HashingEncoder(cols=[...])
encoder = ce.HelmertEncoder(cols=[...])
encoder = ce.OneHotEncoder(cols=[...])
encoder = ce.OrdinalEncoder(cols=[...])
encoder = ce.SumEncoder(cols=[...])
encoder = ce.PolynomialEncoder(cols=[...])
encoder = ce.BaseNEncoder(cols=[...])
encoder = ce.LeaveOneOutEncoder(cols=[...])
コード例 #28
0
            if(row[j] == tmp[i]):

                flag = True
        if(flag == False):
            tmp.append(row[j])
    row[j] = tmp.index(row[j])
    print(row[j])

x,t = [],[]

df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

list_cols = ['week','soldout','name','remarks','event','payday','weather']

ce_ohe = ce.OneHotEncoder(cols=list_cols,handle_unknown='impute')
df_train_ce_onehot = ce_ohe.fit_transform(df_train)
df_test_ce_onehot = ce_ohe.fit_transform(df_test)

train_len = len(df_train_ce_onehot)

df_train_ce_onehot['precipitation'] = df_train_ce_onehot['precipitation'].str.replace('--','0')
df_test_ce_onehot['precipitation'] = df_test_ce_onehot['precipitation'].str.replace('--','0')
del df_train_ce_onehot['datetime']
del df_test_ce_onehot['datetime']

df_train_ce_onehot = pd.merge(df_train_ce_onehot,df_test_ce_onehot,how='outer')
df_test_ce_onehot = pd.merge(df_train_ce_onehot,df_test_ce_onehot,how='outer')

train_t = df_train_ce_onehot.loc[:,['y']]
train_t = train_t.drop(range(train_len,len(train_t)))
コード例 #29
0
import warnings
warnings.filterwarnings("ignore")

"""START: Import encoders"""
import category_encoders as ce
import sys
sys.path.append('../encoders/')
from ceng import CENGEncoder
from cesamo import CESAMOEncoder
from entity_embedding import EntityEmbeddingEncoder
from pattern_preserving import SimplePPEncoder, AgingPPEncoder, GeneticPPEncoder

Encoders = {'Ordinal': ce.OrdinalEncoder(),
            'Polynomial': ce.PolynomialEncoder(),
            'OneHot': ce.OneHotEncoder(),
            'BackwardDifference': ce.BackwardDifferenceEncoder(),
            'Helmert': ce.HelmertEncoder(),
            'EntityEmbedding': EntityEmbeddingEncoder(),
            'TargetEnc': ce.TargetEncoder(),
            'WOE': ce.WOEEncoder(),
            'CENG': CENGEncoder(verbose = 0),
            'GeneticPP': GeneticPPEncoder(),
            'AgingPP': AgingPPEncoder(),
            'SimplePP': SimplePPEncoder(),
            'CESAMOEncoder': CESAMOEncoder()}
"""END: Import encoders"""


"""START: Import models"""
try: 
コード例 #30
0
    axis=1,
    inplace=False)

stringFeatures = worldcupAllFeatures[[
    'Team1', 'Team2', 'Team1_Continent', 'Team2_Continent', 'Phase'
]].copy()

numericFeaturePipeline = Pipeline([
    ('selector', DataFrameSelector(list(numericFeatures))),
    ('imputer', Imputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

stringFeaturePipeline = Pipeline([
    ('selector', DataFrameSelector(list(stringFeatures))),
    ('cat_encoder', cs.OneHotEncoder(drop_invariant=True)),
])

full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", numericFeaturePipeline),
    ("cat_pipeline", stringFeaturePipeline),
])

preprocessedFeature = pd.DataFrame(
    data=full_pipeline.fit_transform(worldcupAllFeatures),
    index=np.arange(1, 65))

# Split the data into training/testing sets
worldcupFeatureTrainingData, testData, worldcupTargetTrainingData, testTarget = \
    train_test_split(preprocessedFeature, scoreAsTarget, test_size=0.2, random_state=1)