Example #1
    def test_update_unique_vals(self):
        one_hot_encoder = OneHotEncoder(
            categorical_columns=["Pclass", "Sex", "Embarked"])
        one_hot_encoder._update_unique_vals(self.data)

        self.assertEqual(one_hot_encoder.unique_vals["Embarked"],
                         set(['Q', np.nan, 'S', 'C']))
        self.assertEqual(one_hot_encoder.unique_vals["Sex"],
                         set(['male', 'female']))
        self.assertEqual(one_hot_encoder.unique_vals["Pclass"], set([1, 2, 3]))
Example #2
    def test_transform(self):
        one_hot_encoder = OneHotEncoder(
            categorical_columns=["Pclass", "Sex", "Embarked"])
        one_hot_encoder.fit(self.data)

        transformed_data = np.array(
            [[0.0, 0.0, 1.0, 0.0, 1.0, 22.0, 7.25, 0.0, 0.0, 0.0, 1.0],
             [1.0, 0.0, 0.0, 1.0, 0.0, 38.0, 71.2833, 0.0, 1.0, 0.0, 0.0],
             [0.0, 0.0, 1.0, 1.0, 0.0, 26.0, 7.925, 0.0, 0.0, 0.0, 1.0],
             [1.0, 0.0, 0.0, 1.0, 0.0, 35.0, 53.1, 0.0, 0.0, 0.0, 1.0],
             [0.0, 0.0, 1.0, 0.0, 1.0, 35.0, 8.05, 0.0, 0.0, 0.0, 1.0]])

        np_test.assert_array_equal(one_hot_encoder.transform(self.data.head()),
                                   transformed_data)
Example #3
    def test_fit(self):
        one_hot_encoder1 = OneHotEncoder(
            categorical_columns=["Pclass", "Sex", "Embarked"])
        one_hot_encoder2 = OneHotEncoder(
            categorical_columns=["Pclass", "Sex", "Embarked"])

        one_hot_encoder1.fit(self.data)
        one_hot_encoder2._update_unique_vals(self.data)
        one_hot_encoder2._fit_encoders()

        self.assertEqual(one_hot_encoder1.categorical_columns,
                         one_hot_encoder2.categorical_columns)
        self.assertEqual(one_hot_encoder1.unique_vals,
                         one_hot_encoder2.unique_vals)
        self.assertEqual(one_hot_encoder1.encoders, one_hot_encoder2.encoders)
Example #4
    def test_fit_transform(self):
        one_hot_encoder1 = OneHotEncoder(
            categorical_columns=["Pclass", "Sex", "Embarked"])
        one_hot_encoder2 = OneHotEncoder(
            categorical_columns=["Pclass", "Sex", "Embarked"])

        one_hot_encoder2.fit(self.data.head())

        np_test.assert_array_equal(
            one_hot_encoder1.fit_transform(self.data.head()),
            one_hot_encoder2.transform(self.data.head()))
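
This test only fixes the contract: `fit_transform(data)` must produce the same array as `fit(data)` followed by `transform(data)`. A plausible implementation is the obvious two-liner (a sketch, not the library's source; the `dtype` parameter mirrors the one used in Example #7):

    def fit_transform(self, data, dtype="np"):
        # hypothetical body consistent with the test above
        self.fit(data)
        return self.transform(data, dtype=dtype)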
Example #5
    def test_class_init(self):
        one_hot_encoder = OneHotEncoder(
            categorical_columns=["Pclass", "Sex", "Embarked"])

        self.assertEqual(one_hot_encoder.categorical_columns,
                         ["Pclass", "Sex", "Embarked"])
        self.assertEqual(one_hot_encoder.unique_vals, defaultdict(set))
        self.assertEqual(one_hot_encoder.encoders, {
            "Pclass": Encoder(),
            "Sex": Encoder(),
            "Embarked": Encoder()
        })
Example #6
    def data_enc(self):
        fp_lb_enc = self.datapath + 'lb_enc'  # label encoding
        fp_oh_enc = self.datapath + "oh_enc"  # one-hot encoding
        df_train_f = pd.read_csv(self.fp_train_f,
                                 index_col=None,
                                 chunksize=500000,
                                 iterator=True)
        df_test_f = pd.read_csv(self.fp_test_f,
                                index_col=None,
                                chunksize=500000,
                                iterator=True)

        lb_enc = {}
        # for col in self.cols:
        #     self.cols_index[col] = np.append(self.cols_index[col], 'other')  # append the new 'other' level
        # for col in self.cols:
        #     lb_enc[col] = LabelEncoder()
        # %% one-hot and label encode
        print('starting one-hot and label encoding...')
        oh_enc = OneHotEncoder(self.cols)
        for chunk in df_train_f:
            oh_enc.fit(chunk)  # dummyPy's one-hot fit accumulates; it does not reset
            # for col in self.cols:
            #     lb_enc[col].fit(chunk[col])  # sklearn's fit resets the encoder each call
        for chunk in df_test_f:
            oh_enc.fit(chunk)
            # for col in self.cols:
            #     lb_enc[col].fit(chunk[col])
        # pickle.dump(lb_enc, open(fp_lb_enc, 'wb'))
        pickle.dump(oh_enc, open(fp_oh_enc, 'wb'))
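
Because `fit` only accumulates state, the pickled encoder can be reloaded later and applied chunk by chunk; a minimal usage sketch ('test.csv' is an illustrative path, `fp_oh_enc` as above):

oh_enc = pickle.load(open(fp_oh_enc, 'rb'))
for chunk in pd.read_csv('test.csv', index_col=None,
                         chunksize=500000, iterator=True):
    X = oh_enc.transform(chunk)  # one-hot encoded block for this chunk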
Example #7
    def test_transform_coo(self):
        one_hot_encoder = OneHotEncoder(
            categorical_columns=["Pclass", "Sex", "Embarked"])
        one_hot_encoder.fit(self.data)
        coo_matrix_1 = one_hot_encoder.transform(self.data.head(), dtype="coo")
        coo_matrix_2 = coo_matrix(
            one_hot_encoder.transform(self.data.head(), dtype="np"))
        np_test.assert_array_equal(coo_matrix_1.toarray(),
                                   coo_matrix_2.toarray())
Example #8
    def test_fit_encoders(self):
        one_hot_encoder = OneHotEncoder(
            categorical_columns=["Pclass", "Sex", "Embarked"])
        one_hot_encoder._update_unique_vals(self.data)
        one_hot_encoder._fit_encoders()

        embarked_encoder = Encoder()
        embarked_encoder.fit(set(['Q', np.nan, 'S', 'C']))
        self.assertEqual(one_hot_encoder.encoders["Embarked"],
                         embarked_encoder)

        sex_encoder = Encoder()
        sex_encoder.fit(set(['male', 'female']))
        self.assertEqual(one_hot_encoder.encoders["Sex"], sex_encoder)

        pclass_encoder = Encoder()
        pclass_encoder.fit(set([1, 2, 3]))
        self.assertEqual(one_hot_encoder.encoders["Pclass"], pclass_encoder)
Example #9

# ...as we do it later in the iteration of model training on chunks
## 1.label encoding
lb_enc = {}
for col in cols:
    col_index[col] = np.append(col_index[col], 'other')

for col in cols:
    lb_enc[col] = LabelEncoder()
    lb_enc[col].fit(col_index[col])
    
## store the label encoder
pickle.dump(lb_enc, open(fp_lb_enc, 'wb'))

## 2.one-hot encoding
oh_enc = OneHotEncoder(cols)

df_train_f = pd.read_csv(fp_train_f, index_col=None, chunksize=500000, iterator=True)
df_test_f  = pd.read_csv(fp_test_f, index_col=None, chunksize=500000, iterator=True)

for chunk in df_train_f:
    oh_enc.fit(chunk)
for chunk in df_test_f:
    oh_enc.fit(chunk)
    
## store the one-hot encoder
pickle.dump(oh_enc, open(fp_oh_enc, 'wb'))

#----- construct of original train set (sub-sampling randomly) -----#
n = sum(1 for line in open(fp_train_f)) - 1  # total size of train data (about 46M)
s = 2000000 # desired train set size (2M)
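
Given `n` and `s`, the usual way to sub-sample without loading the whole file is to skip a random set of row indices at read time; a sketch of that standard pattern (`df_train` is an illustrative name):

import random

# keep the header (row 0) and a random sample of s of the n data rows
skip_rows = sorted(random.sample(range(1, n + 1), n - s))
df_train = pd.read_csv(fp_train_f, skiprows=skip_rows)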
Example #10
k = 100
col_index = {}
for col in cols:
    col_index[col] = cols_counts[col][0:k - 1].index
    #print(col, col_index[col])

## label-encode the categorical variables
lb_enc = {}
for col in cols:
    # values beyond the top 100 most frequent are set to 'other'
    col_index[col] = np.append(col_index[col], 'other')

for col in cols:
    lb_enc[col] = LabelEncoder()
    lb_enc[col].fit(col_index[col])

## store the label encoders
pickle.dump(lb_enc, open(label_encoder_file, 'wb'))
print(label_encoder_file + ' saved')
## one-hot encoding
oh_enc = OneHotEncoder(cols)

for chunk in df_train_org:
    oh_enc.fit(chunk)
for chunk in df_test_org:
    oh_enc.fit(chunk)

## store the one-hot encoder
pickle.dump(oh_enc, open(onehot_encoder_file, 'wb'))
print(onehot_encoder_file + ' saved')
Example #11
log_loss_gbdt = log_loss(y_valid, y_pred_gbdt)
print('log loss of GBDT on valid set: %.5f' % log_loss_gbdt)

## store the pre-trained gbdt_model
pickle.dump(gbdt_model, open(fp_gbdt_model, 'wb'))

del X_train_gbdt
del y_train_gbdt
gc.collect()

gbdt_model = pickle.load(open(fp_gbdt_model, 'rb'))
#----- data for LR (one-hot encoding of GBDT output) -----#
id_cols = []
for i in range(1, gbdt_model.get_params()['n_estimators'] + 1):
    id_cols.append('tree' + str(i))
oh_enc = OneHotEncoder(id_cols)


def chunker(seq, size):
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))


## oh_enc fit the train_set
df_train_id = pd.DataFrame(gbdt_model.apply(X_train_org)[:, :, 0],
                           columns=id_cols,
                           dtype=np.int8)

for chunk in chunker(df_train_id, 50000):
    oh_enc.fit(chunk)

del df_train_id
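
Fitting on leaf indices is the first half of the classic GBDT+LR stack; the fitted encoder then turns those indices into sparse one-hot features for a logistic regression. A sketch of that next step, assuming the leaf-index frame is rebuilt (it was deleted above), that `transform` accepts the `dtype="coo"` option shown in Example #7, and that `y_train_org` holds the matching labels:

from scipy.sparse import vstack
from sklearn.linear_model import LogisticRegression

df_train_id = pd.DataFrame(gbdt_model.apply(X_train_org)[:, :, 0],
                           columns=id_cols, dtype=np.int8)
parts = [oh_enc.transform(chunk, dtype="coo")  # sparse one-hot per chunk
         for chunk in chunker(df_train_id, 50000)]
X_lr = vstack(parts).tocsr()

lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_lr, y_train_org)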
Example #12
def train_model(white_list, isSortedTopKfeatures=False):
    '''
    Train the ds_model
    '''

    # In[4]:

    # Load Train and Test CSV

    headerNames = [
        "id", "name", "first", "last", "compas_screening_date", "sex", "dob",
        "age", "age_cat", "race", "juv_fel_count", "decile_score",
        "juv_misd_count", "juv_other_count", "priors_count",
        "days_b_screening_arrest", "c_jail_in", "c_jail_out", "c_case_number",
        "c_offense_date", "c_arrest_date", "c_days_from_compas",
        "c_charge_degree", "c_charge_desc", "is_recid", "num_r_cases",
        "r_case_number", "r_charge_degree", "r_days_from_arrest",
        "r_offense_date", "r_charge_desc", "r_jail_in", "r_jail_out",
        "is_violent_recid", "num_vr_cases", "vr_case_number",
        "vr_charge_degree", "vr_offense_date", "vr_charge_desc",
        "v_type_of_assessment", "v_decile_score", "v_score_text",
        "v_screening_date", "type_of_assessment", "decile_score", "score_text",
        "screening_date"
    ]
    prefix = "./data/"

    # ID cannot be used for prediction
    # hence setting index_col = 0 takes care of removing ID field from dataset in both train and test dataframes.
    datadf = pd.read_csv(prefix + "compas-scores.csv",
                         header=None,
                         delim_whitespace=False,
                         names=headerNames,
                         index_col=0,
                         skiprows=1)

    # In[5]:

    ## Drop columns that are not useful for prediction
    cols_to_drop = [
        'id', 'name', 'first', 'last', 'c_case_number', 'r_case_number',
        'vr_case_number', 'decile_score.1', 'c_charge_desc', 'r_charge_desc',
        'vr_charge_desc', 'num_r_cases', 'num_vr_cases', 'v_score_text',
        'dob', 'vr_charge_degree', 'c_charge_degree', 'r_charge_degree',
        'v_decile_score', 'c_jail_out', 'r_jail_out',
        'days_b_screening_arrest', 'c_days_from_compas', 'r_days_from_arrest'
    ]
    for col in cols_to_drop:
        if col in datadf:
            datadf = datadf.drop(col, axis=1)

    # In[6]:

    print(datadf.shape)

    # In[7]:
    '''
    sns.heatmap(datadf.corr(),annot=True,cmap='RdYlGn',linewidths=0.2) #data.corr()-->correlation matrix
    fig=plt.gcf()
    fig.set_size_inches(20,16)
    #plt.show()
    fig.savefig('Correlation_before.png')
    

    # In[8]:


    fig=plt.gcf()
    datadf.hist(figsize=(18, 16), alpha=0.5, bins=50)
    plt.show()
    fig.savefig('histograms1.png')
    '''

    # In[9]:

    datadf.head(10)

    # In[10]:

    ## fill NaN for categorical

    #datadf['v_score_text'].fillna(datadf['v_score_text'].value_counts().index[0], inplace=True)
    #datadf['vr_charge_degree'].fillna(datadf['vr_charge_degree'].value_counts().index[0], inplace=True)
    #datadf['c_charge_desc'].fillna(datadf['c_charge_desc'].value_counts().index[0], inplace=True)
    #datadf['r_charge_desc'].fillna(datadf['r_charge_desc'].value_counts().index[0], inplace=True)
    #datadf['vr_charge_desc'].fillna(datadf['vr_charge_desc'].value_counts().index[0], inplace=True)

    # In[11]:
    '''
    datadf['vr_charge_degree'] = datadf['vr_charge_degree'].str.replace('[^a-zA-Z]',' ')

    datadf['v_score_text'] = datadf['v_score_text'].str.replace('[^a-zA-Z]',' ')
    '''

    # In[12]:

    if 'age' in datadf:
        datadf = datadf.drop('age', axis=1)

    # In[13]:

    encoder = OneHotEncoder([
        "sex", "race", "v_type_of_assessment", "age_cat", "type_of_assessment"
    ])  # ,"v_score_text","c_charge_desc","r_charge_desc", "vr_charge_desc",
    encoder.fit(datadf)
    encoder.transform(datadf).shape
    encoder.transform(datadf).head(10)

    # In[14]:

    datadf = encoder.transform(datadf)

    print("DF shape >>>>>>>>>>>>>>>> ", datadf.shape)
    print("DF columsn >>>>>>>>>>>>>>>> ", datadf.columns)

    # In[16]:
    '''# Set of Unique Values 
    print(traindf['sex'].unique())
    print(traindf['age_cat'].unique())
    print(traindf['race'].unique())
    print(traindf['score_text'].unique())
    print(traindf['r_charge_desc'].unique())
    print(traindf['c_charge_desc'].unique())
    print(traindf['c_charge_degree'].unique())
    print(traindf['r_charge_degree'].unique())
    print(traindf['r_charge_desc'].unique())
    print(traindf['vr_charge_desc'].unique())
    print(traindf['v_type_of_assessment'].unique())
    print(traindf['v_score_text'].unique())
    print(traindf['score_text'].unique())



    traindf.columns
    '''

    # In[17]:

    # Test Data stats
    datadf.describe()

    # In[18]:

    # age for different date fields
    #datadf['dob'] = pd.to_datetime(datadf['dob'], dayfirst=True)
    datadf['compas_screening_date'] = pd.to_datetime(
        datadf['compas_screening_date'], dayfirst=True)

    datadf['c_offense_date'] = pd.to_datetime(datadf['c_offense_date'],
                                              dayfirst=True)
    datadf['c_arrest_date'] = pd.to_datetime(datadf['c_arrest_date'],
                                             dayfirst=True)
    datadf['r_offense_date'] = pd.to_datetime(datadf['r_offense_date'],
                                              dayfirst=True)

    datadf['vr_offense_date'] = pd.to_datetime(datadf['vr_offense_date'],
                                               dayfirst=True)
    datadf['v_screening_date'] = pd.to_datetime(datadf['v_screening_date'],
                                                dayfirst=True)
    datadf['screening_date'] = pd.to_datetime(datadf['screening_date'],
                                              dayfirst=True)

    datadf['c_jail_in'] = pd.to_datetime(datadf['c_jail_in'], dayfirst=True)
    #datadf['c_jail_out'] = pd.to_datetime(datadf['c_jail_out'], dayfirst=True)

    datadf['r_jail_in'] = pd.to_datetime(datadf['r_jail_in'], dayfirst=True)
    #datadf['r_jail_out'] = pd.to_datetime(datadf['r_jail_out'], dayfirst=True)

    ## ages
    #datadf['Age_in_days'] = (datadf['compas_screening_date']-datadf['dob'])/timedelta(days=1)
    datadf['c_offense_age_in_days'] = (datadf['compas_screening_date'] -
                                       datadf['c_offense_date']) / timedelta(
                                           days=1)
    datadf['c_arrest_age_in_days'] = (datadf['compas_screening_date'] -
                                      datadf['c_arrest_date']) / timedelta(
                                          days=1)

    datadf['r_offense_age_in_days'] = (datadf['compas_screening_date'] -
                                       datadf['r_offense_date']) / timedelta(
                                           days=1)
    datadf['vr_offense_age_in_days'] = (datadf['compas_screening_date'] -
                                        datadf['vr_offense_date']) / timedelta(
                                            days=1)
    datadf['v_screening_age_in_days'] = (
        datadf['compas_screening_date'] -
        datadf['v_screening_date']) / timedelta(days=1)
    datadf['screening_age_in_days'] = (datadf['compas_screening_date'] -
                                       datadf['screening_date']) / timedelta(
                                           days=1)

    datadf['c_jail_in_age_in_days'] = (datadf['compas_screening_date'] -
                                       datadf['c_jail_in']) / timedelta(days=1)
    #datadf['c_jail_out_age_in_days'] = (datadf['compas_screening_date']-datadf['c_jail_out'])/timedelta(days=1)

    datadf['r_jail_in_age_in_days'] = (datadf['compas_screening_date'] -
                                       datadf['r_jail_in']) / timedelta(days=1)
    #datadf['r_jail_out_age_in_days'] = (datadf['compas_screening_date']-datadf['r_jail_out'])/timedelta(days=1)

    print("white_list ", white_list)
    if len(white_list) > 0:
        white_list.append('decile_score')
        datadf.drop(datadf.columns.difference(white_list), axis=1, inplace=True)

    print("datadf ", datadf.columns)
    ## drop all remaining date columns
    date_cols = [
        'dob', 'compas_screening_date', 'c_offense_date', 'c_arrest_date',
        'r_offense_date', 'vr_offense_date', 'screening_date',
        'v_screening_date', 'c_jail_in', 'c_jail_out', 'r_jail_in',
        'r_jail_out'
    ]
    for col in date_cols:
        if col in datadf:
            datadf = datadf.drop(col, axis=1)

    # prediction column - textual (decile_score is the numeric equivalent)
    if 'score_text' in datadf:
        datadf = datadf.drop('score_text', axis=1)

    # In[19]:

    # stats of categorical features
    datadf.describe(include=['O'])

    # In[20]:

    print(datadf.shape)
    datadf.head(10)

    # In[21]:

    # for starters, fill every remaining NaN value with 0
    # (column means would be an alternative, e.g.:)
    #datadf['r_jail_in_age_in_days'].fillna(datadf['r_jail_in_age_in_days'].dropna().mean(), inplace=True)
    datadf[:] = datadf[:].fillna(0)
    '''
    datadf['r_jail_in_age_in_days'].fillna(0) 
    datadf['r_jail_out_age_in_days'].fillna(0) 
    datadf['c_jail_in_age_in_days'].fillna(0) 
    datadf['c_jail_out_age_in_days'].fillna(0) 

    datadf['vr_offense_age_in_days'].fillna(0) 
    datadf['r_offense_age_in_days'].fillna(0) 
    datadf['c_arrest_age_in_days'].fillna(0) 
    datadf['c_offense_age_in_days'].fillna(0) 
    '''

    # In[22]:

    datadf.to_csv('datadf_dt.csv', index=False)

    # In[23]:

    # check if any null values are still present
    print(datadf.columns[datadf.isnull().any()].tolist())

    # In[24]:

    #sample data for a quick run ## TODO remove next line
    traindf, testdf = train_test_split(datadf, random_state=42, test_size=0.3)

    print(traindf.shape)
    print(testdf.shape)

    # In[25]:

    from sklearn import preprocessing
    print(traindf.columns)
    print(traindf.columns[traindf.isnull().any()].tolist())

    # In[26]:

    ## Prediction ds_model  -TRAIN DF
    print(traindf.columns)
    train_features = traindf.loc[:, traindf.columns != 'decile_score']

    print(train_features.columns.values)

    # extract the label (decile_score) from the training set
    train_label = traindf.loc[:, traindf.columns == 'decile_score']
    print(train_label.columns)

    ## Prediction ds_model - TEST DF
    print(traindf.columns)
    test_features = testdf.loc[:, testdf.columns != 'decile_score']

    print(test_features.columns)
    print(test_features.head(10))

    # extract the label (decile_score) from the test set
    test_label = testdf.loc[:, testdf.columns == 'decile_score']
    print(test_label.columns)

    # In[27]:


    #Train the model with best parameters of RF
    # best params for RF using randomizedCV
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.decomposition import PCA

    ds_model = make_pipeline(
        StandardScaler(with_std=True),
        OneVsRestClassifier(
            DecisionTreeClassifier(
                random_state=42, min_samples_split=9)))  # n_es = 200, lr 0.001
    '''
    ds_model = make_pipeline(StandardScaler(with_std=True, with_mean=True), 
           MLPClassifier(activation='relu', alpha=10.0, batch_size='auto', beta_1=0.9,
           beta_2=0.999, early_stopping=False, epsilon=1e-08,
           hidden_layer_sizes=(7, 7), learning_rate='adaptive',
           learning_rate_init=0.001, max_iter=500, momentum=0.9,
           nesterovs_momentum=True, power_t=0.5, random_state=42, shuffle=True,
           solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
           warm_start=False))

    ds_model = make_pipeline(StandardScaler(with_std=True), 
                             OneVsRestClassifier(
                                 ExtraTreesClassifier(n_estimators=98,min_samples_split=10
                                 ,max_leaf_nodes=8,max_features='log2',max_depth=3,criterion='entropy')
                                                    ))
    '''

    ds_model.fit(train_features, train_label)
    train_pred = ds_model.predict(train_features)

    train_acc = metrics.accuracy_score(train_label, train_pred)
    print(train_acc)  # Training Accuracy Score
    print(np.sqrt(mean_squared_error(train_label,
                                     train_pred)))  # Training RMSE
    #print(roc_auc_score(train_label, train_pred)) # AUC-ROC values

    # In[29]:

    # In[30]:

    #test_pred = ds_model.predict_proba(testdf) #test features are all in testdata
    test_pred = ds_model.predict(
        test_features)  #test features are all in testdata

    print(metrics.accuracy_score(train_label,
                                 train_pred))  # Training Accuracy Score
    print(np.sqrt(mean_squared_error(train_label,
                                     train_pred)))  # Training RMSE

    test_pred_prob = ds_model.predict_proba(
        test_features)  #test features are all in testdata
    print("ds_model.classes_ :: ", ds_model.classes_)
    print(
        "****************************************************************************************"
    )
    print("Predicted Output  >>>>>>>>> ", test_pred_prob)  # Predicted Values
    print(
        "****************************************************************************************"
    )
    print("test_pred[:,1] >> ", test_pred_prob[:, 1][0])

    print(metrics.accuracy_score(test_label,
                                 test_pred))  # Testing Accuracy Score
    print(np.sqrt(mean_squared_error(test_label, test_pred)))  # Testing RMSE

    top_k_features = getSortedTopKfeatures(train_features, train_label)
    #top_k_features.append('decile_score')
    #return {"pred_accu" : train_acc}
    all_feat = train_features.columns.tolist()
    all_feat = list(set(all_feat) - set(top_k_features))
    ans = {
        "pred_accu": train_acc,
        "topk": top_k_features,
        "all_feat": all_feat
    }
    return ans
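
A minimal usage sketch for `train_model` (the white-list feature names are illustrative picks from `headerNames`):

result = train_model(white_list=['priors_count', 'juv_fel_count'])
print(result['pred_accu'])  # training accuracy
print(result['topk'])       # features ranked by getSortedTopKfeatures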
Example #13

log_loss_gbdt = log_loss(y_valid, y_pred_gbdt)
print('log loss of GBDT on valid set: %.5f' % log_loss_gbdt)

## store the pre-trained gbdt_model
pickle.dump(gbdt_model, open(fp_gbdt_model, 'wb'))

del X_train_gbdt
del y_train_gbdt
gc.collect()

gbdt_model = pickle.load(open(fp_gbdt_model, 'rb'))
#----- data for LR (one-hot encoding of GBDT output) -----#
id_cols = []
for i in range(1, gbdt_model.get_params()['n_estimators']+1):
    id_cols.append('tree'+str(i))
oh_enc = OneHotEncoder(id_cols)

def chunker(seq, size):
    return (seq[pos: pos + size] for pos in range(0, len(seq), size))

## oh_enc fit the train_set
df_train_id = pd.DataFrame(gbdt_model.apply(X_train_org)[:, :, 0], columns=id_cols, dtype=np.int8)

for chunk in chunker(df_train_id, 50000):
    oh_enc.fit(chunk)
    
del df_train_id

del X_train_org
del y_train_org
gc.collect()
Example #14
#before running this code, make sure you have about 20 GB of free disk space
#it will probably take about 1.5 hours
#importing necessary libraries

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from dummyPy import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

#creating classes
labelencoder_c1 = LabelEncoder()
labelencoder_c2 = LabelEncoder()
labelencoder_c3 = LabelEncoder()
ohe = OneHotEncoder(['category_1', 'category_2', 'category_3'])
sc = StandardScaler()

#fitting the label encoder
d1 = pd.read_csv('train2.tsv', sep = '\t')
d2 = pd.read_csv('test_stg2.tsv', sep = '\t')
category_list = pd.DataFrame(pd.concat([d1['category_name'], d2['category_name']]))
category_list['category_name'] = category_list['category_name'].astype('str')
category_list['category_name'] = category_list['category_name'].replace('nan', 'no_category1/no_category2/no_category3')
categories = category_list['category_name'].str.split('/', n=3, expand=True)
category_list['category_1'] = categories[0]
category_list['category_2'] = categories[1]
category_list['category_3'] = categories[2]
labelencoder_c1.fit(category_list['category_1'])
labelencoder_c2.fit(category_list['category_2'])
labelencoder_c3.fit(category_list['category_3'])
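
With the label encoders fitted on the union of train and test categories, each level can be mapped to its integer code, after which the dummyPy one-hot encoder can be fitted; a short sketch continuing the snippet (exactly how these columns are combined downstream is an assumption):

# map each category level to its integer code
category_list['category_1'] = labelencoder_c1.transform(category_list['category_1'])
category_list['category_2'] = labelencoder_c2.transform(category_list['category_2'])
category_list['category_3'] = labelencoder_c3.transform(category_list['category_3'])

# fit the one-hot encoder on the three category columns
ohe.fit(category_list)
X_cat = ohe.transform(category_list)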