Example #1
target = 'crash'
# Drop rows with missing values for the target ('crash')
drops = []
for i in range(df1.shape[0]):
    if pd.isnull(df1['crash'][i]):
        drops.append(i)
df1 = df1.drop(drops)
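
The explicit row loop works, but pandas can do the same drop in one vectorized call. A minimal equivalent sketch (the reset_index is my addition, to keep positional indexing safe for later loops):

# one-line equivalent of the loop above; reset_index keeps a clean 0..n index
df1 = df1.dropna(subset=['crash']).reset_index(drop=True)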

# In[23]:

encoding = 'one-hot'
scale = None  # Interval scaling:  Use 'std', 'robust' or None
# drop=False - do not drop last category - used for Decision Trees
rie = ReplaceImputeEncode(data_map=attribute_map,
                          nominal_encoding=encoding,
                          interval_scale=scale,
                          drop=False,
                          display=True)

# In[24]:

df1.drop('crash', axis=1, inplace=True)

# In[25]:

encoded_df = rie.fit_transform(df1)

# In[26]:

#varlist = [target, 'T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', 'T8', 'T9','points']
X = encoded_df.drop(['T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7'], axis=1)
Example #2
attribute_map = {
    'job': [2, (1, 2, 3, 4), [0, 0]],
    'housing': [2, (1, 2, 3), [0, 0]],
    'foreign': [1, (1, 2), [0, 0]],
    'marital': [2, (1, 2, 3, 4), [0, 0]],
    'resident': [2, (1, 2, 3, 4), [0, 0]],
    'savings': [2, (1, 2, 3, 4, 5), [0, 0]],
    'other': [2, (1, 2, 3), [0, 0]],
    #     'purpose':[1,('0','1','2','3','4','5','6','7','8','9','X'),[0,0]],
    'property': [2, (1, 2, 3, 4), [0, 0]],
    'checking': [2, (1, 2, 3, 4), [0, 0]],
    'telephon': [1, (1, 2), [0, 0]]
}
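
For readers new to this class: each data_map entry appears to follow the pattern [attribute_type, allowed_levels_or_range, counters]. Judging from the examples in this listing, type 0 marks an interval attribute with a (min, max) range, 1 a binary attribute, and 2 a nominal attribute with its tuple of valid levels; the trailing [0, 0] pair looks like a placeholder pair of counters the class fills in. A hypothetical annotated entry (the attribute names here are invented for illustration):

attribute_map_example = {
    'age':     [0, (18, 90), [0, 0]],      # interval: valid range 18-90
    'default': [1, ('N', 'Y'), [0, 0]],    # binary: exactly two valid levels
    'region':  [2, (1, 2, 3, 4), [0, 0]],  # nominal: four valid levels
}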

rie = ReplaceImputeEncode(data_map=attribute_map,
                          drop=False,
                          nominal_encoding='one-hot',
                          display=True,
                          interval_scale='std')
encoded_df = rie.fit_transform(df)

# In[4]:

X = encoded_df.drop('good_bad', axis=1)
Y = encoded_df['good_bad']
np_y = np.ravel(Y)

features = X.columns
classes = ['Good', 'Bad']
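
The features and classes lists are typically fed to a tree visualization; a hedged sketch of the likely next step using scikit-learn's plot_tree (the depth setting here is illustrative, not from the source):

from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt

dtc = DecisionTreeClassifier(max_depth=5, random_state=12345)  # illustrative settings
dtc.fit(X, np_y)
plt.figure(figsize=(16, 8))
plot_tree(dtc, feature_names=list(features), class_names=classes, filled=True)
plt.show()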

# In[9]:

attribute_map = {
    'history': [2, (0, 1, 2, 3, 4), [0, 0]],
    'existcr': [2, (1, 2, 3, 4), [0, 0]],
    'installp': [2, (1, 2, 3, 4), [0, 0]],
    'job': [2, (1, 2, 3, 4), [0, 0]],
    'housing': [2, (1, 2, 3), [0, 0]],
    'foreign': [1, (1, 2), [0, 0]],
    'marital': [2, (1, 2, 3, 4), [0, 0]],
    'resident': [2, (1, 2, 3, 4), [0, 0]],
    'savings': [2, (1, 2, 3, 4, 5), [0, 0]],
    'other': [2, (1, 2, 3), [0, 0]],
    'property': [2, (1, 2, 3, 4), [0, 0]],
    'checking': [2, (1, 2, 3, 4), [0, 0]],
    'telephon': [2, (1, 2), [0, 0]]
}

rie = ReplaceImputeEncode(data_map=attribute_map, display=True)
encoded_df = rie.fit_transform(df)

# In[5]:

from collections import Counter
Counter(encoded_df['employed0'])
len(encoded_df.columns)  # 46 columns

Counter(encoded_df['good_bad'])

# In[6]:

X = encoded_df.drop('good_bad', axis=1)
y = encoded_df['good_bad']
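
The Counter calls above check class balance; if the classes are skewed, a stratified split preserves that ratio in both partitions. A minimal sketch (the 70/30 split mirrors the other examples here; stratify is my addition):

from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = \
    train_test_split(X, y, test_size=0.3, random_state=7, stratify=y)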
Example #4
attribute_map = {
    'MonthlyCharges': [0, (18.25, 188.75), [0, 0]],
    'TotalCharges': [0, (0, 8700), [0, 0]],  # stored as strings in the raw data, which hides the true max value
    'Churn': [1, ('Yes', 'No'), [0, 0]],
}

#Define the target
target = ['Churn']

#Logistic Regression
max_f1 = 0
score_list = ['accuracy', 'recall', 'precision', 'f1']

#Encode for logistic regression
rie_l = ReplaceImputeEncode(data_map=attribute_map,
                            nominal_encoding='one-hot',
                            interval_scale='std',
                            drop=True,
                            display=True)
encoded_df_l = rie_l.fit_transform(df)
X_l = encoded_df_l.drop(target, axis=1)
y_l = encoded_df_l[target]
np_y_l = np.ravel(y_l)  #convert dataframe column to flat array

#Do feature selection with an ExtraTrees (randomized-trees) classifier to
#determine which predictors to include in the logistic regression
features = ExtraTreesClassifier(n_estimators=500)
features.fit(X_l, np_y_l)
print(features.feature_importances_)
#Only the interval predictors are important
#Try two logistic models: one with all predictors, one with only the top 3
#predictors
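
The comparison the comment describes might look like the following sketch; the top-3 selection by feature_importances_, the CV settings, and max_iter are assumptions, not code from the source:

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
import numpy as np

# Model 1: all predictors
lgr_all = LogisticRegression(max_iter=1000)
scores_all = cross_validate(lgr_all, X_l, np_y_l, scoring=score_list, cv=10)

# Model 2: only the 3 most important predictors
top3 = X_l.columns[np.argsort(features.feature_importances_)[-3:]]
lgr_top = LogisticRegression(max_iter=1000)
scores_top = cross_validate(lgr_top, X_l[top3], np_y_l, scoring=score_list, cv=10)

for name, s in [('all predictors', scores_all), ('top 3 predictors', scores_top)]:
    print(name, 'mean f1: %.4f' % s['test_f1'].mean())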
Example #5
        sentiment_score[i] = sentiment_score[i] / n_sw
df_senscore = pd.DataFrame(sentiment_score, columns=['sentiment score'])
df = df.join(df_senscore)

# classify topic based on the probability

df['topic'] = 0
for ix, row in df.iterrows():
    # take the name of the topic column with the highest probability
    df.loc[ix, 'topic'] = row[['T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7']].idxmax()
# save the data output of NLP
df.to_csv('after_NLP_data.csv', index=False)
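
The iterrows loop above can also be vectorized; a one-line equivalent sketch over the same T1-T7 columns:

# vectorized equivalent of the loop above
df['topic'] = df[['T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7']].idxmax(axis=1)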

# encode the data: one-hot nominal encoding, no interval scaling
rie = ReplaceImputeEncode(data_map=attribute_map, nominal_encoding='one-hot',
                          interval_scale=None, drop=False, display=True)

df_tree = rie.fit_transform(df)
y = df_tree['crash']
X = df_tree.drop('crash', axis=1)

# find the best tree depth
depth_list = [3, 5, 6, 7, 8, 10, 12, 15, 20, 25]
score_list = ['accuracy', 'recall', 'precision', 'f1']
for d in depth_list:
    print("\nMaximum Tree Depth: ", d)
    dtc = DecisionTreeClassifier(max_depth=d, min_samples_leaf=5,
                                 min_samples_split=5, random_state=12345)
    dtc = dtc.fit(X, y)
    scores = cross_validate(dtc, X, y, scoring=score_list,
                            return_train_score=False, cv=10)
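    # continuation sketch (not from the source): the example is cut off here,
    # so this reports the mean CV score per metric in the spirit of the loop's print
    for s in score_list:
        print("%-10s mean=%.4f  std=%.4f"
              % (s, scores['test_' + s].mean(), scores['test_' + s].std()))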
Example #6
    'Feb_PayPercent': [0, (0, 1), [0, 0]],
    'Jan_PayPercent': [0, (0, 1), [0, 0]]
}

# In[14]:

df = df.drop(['Customer'], axis=1)  # drop the ID column; assign so the drop takes effect

# In[6]:

np.sum(df['Marital_Status'] == 0)
df.dtypes

# In[15]:

rie = ReplaceImputeEncode(data_map=attribute_map, nominal_encoding='one-hot', display=True)
encoded_df = rie.fit_transform(df)

varlist = ['Default']
X = encoded_df.drop('Default', axis=1)
y = encoded_df['Default']

lgr = LogisticRegression()

#Select the best attributes using RFE - 25 attributes chosen
rfe = RFE(lgr, n_features_to_select=25)
rfe = rfe.fit(X, y)


print(rfe.support_)
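
rfe.support_ is a boolean mask over the columns; mapping it back to names shows which 25 attributes survived. A short sketch:

selected = X.columns[rfe.support_]
print("Selected attributes:", list(selected))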

    'foreign': [1, (1, 2), [0, 0]],
    'good_bad': [1, ('bad', 'good'), [0, 0]],
    'history': [2, (0, 1, 2, 3, 4), [0, 0]],
    'installp': [2, (1, 2, 3, 4), [0, 0]],
    'job': [2, (1, 2, 3, 4), [0, 0]],
    'marital': [2, (1, 2, 3, 4), [0, 0]],
    'other': [2, (1, 2, 3), [0, 0]],
    'property': [2, (1, 2, 3, 4), [0, 0]],
    # 'purpose': [1, (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 'X'), [0, 0]],
    'resident': [2, (1, 2, 3, 4), [0, 0]],
    'savings': [2, (1, 2, 3, 4, 5), [0, 0]],
    'telephon': [1, (1, 2), [0, 0]]
}
Step 3: Replace-Impute-Encode
Next, use the class ReplaceImputeEncode() to replace outliers with missing values, impute missing values, and then scale interval data and encode categorical data.

The ReplaceImputeEncode() class allows you to specify None for scaling and/or encoding. It also lets you select 'one-hot' or 'SAS' encoding for categorical variables. In most other software this is automatic, but in Python we need to set up our own scaling and encoding.

The complete API is documented in the class itself. First you instantiate the class, then you call fit_transform() to process your dataframe.

In [3]:
encoding = 'SAS' # Categorical encoding:  Use 'SAS', 'one-hot' or None
scale    = None  # Interval scaling:  Use 'std', 'robust' or None
scaling  = 'No'  # Text description for interval scaling

rie = ReplaceImputeEncode(data_map=attribute_map, nominal_encoding=encoding,
                          interval_scale=scale, display=True)

#features_map = rie.draft_features_map(df)
encoded_df = rie.fit_transform(df)
With display=True, fit_transform() prints a "Data Preprocessing" banner followed by the contents of the features dictionary (output omitted here).
attribute_map_clus = {
        'Score'   :[0,(80,100),[0,0]],
        'Year'    :[0,(1985,2016),[0,0]],
        'Region'  :[2,('California Other', 'Central Coast','Central Valley', \
                     'Clear Lake','High Valley', 'Lake County',\
                     'Mendocino County','Mendocino Ridge',\
                     'Mendocino/Lake Counties', 'Napa','Napa-Sonoma',\
                     'North Coast','Red Hills Lake County','Redwood Valley',\
                     'Sierra Foothills','Sonoma','South Coast'),[0,0]],
        'Cluster' :[2,(0,1,2,3,4,5,6,7,8),[0,0]],
        'Price'   :[0,(0,625),[0,0]]
}
varlist = ['Price']

rie_clus = ReplaceImputeEncode(data_map=attribute_map_clus,
                               nominal_encoding='one-hot',
                               interval_scale=None, drop=True, display=False)
encoded_df_clus = rie_clus.fit_transform(clus)

X_clus = encoded_df_clus.drop(varlist, axis=1)
y_clus = encoded_df_clus[varlist]
X_train, X_valid, y_train, y_valid = \
    train_test_split(X_clus, y_clus, test_size=0.3, random_state=7)

np_y_train = np.ravel(y_train)
np_y_valid = np.ravel(y_valid)


reg = LinearRegression()
reg.fit(X_train,np_y_train)
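
A quick check of fit quality on the held-out 30% might look like this sketch (the metric choice is mine, not from the source):

from sklearn.metrics import mean_squared_error, r2_score

pred = reg.predict(X_valid)
print('validation R^2:', r2_score(np_y_valid, pred))
print('validation MSE:', mean_squared_error(np_y_valid, pred))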
Example #9
attribute_map = {
    'model': [2, ('COBALT', 'G5', 'HHR', 'ION', 'SKY', 'SOLSTICE'), [0, 0]],
    'crashed': [1, ('N', 'Y'), [0, 0]],
    'abs': [1, ('N', 'Y'), [0, 0]],
    'mileage': [0, (0, 200000), [0, 0]],
    '0': [0, (0, 1), [0, 0]],
    '1': [0, (0, 1), [0, 0]],
    '2': [0, (0, 1), [0, 0]],
    '3': [0, (0, 1), [0, 0]],
    '4': [0, (0, 1), [0, 0]],
    '5': [0, (0, 1), [0, 0]],
    '6': [0, (0, 1), [0, 0]],
    '7': [0, (0, 1), [0, 0]],
}
varlist = ['crashed']
rie = ReplaceImputeEncode(data_map=attribute_map,
                          nominal_encoding='one-hot',
                          interval_scale=None, drop=True, display=False)
encoded_df = rie.fit_transform(reg_df)
X = encoded_df.drop(varlist, axis=1)
y = encoded_df[varlist]
np_y = np.ravel(y)

#10-fold cross-validation to find the optimum regularization value
max_f1 = 0
C_list = [.1, 1, 10, 100]
score_list = ['accuracy', 'recall', 'precision', 'f1']
for c in C_list:
    print("\nRegularization Parameter: ", c)
    lgr = LogisticRegression(C=c, tol=1e-8, max_iter=1000)
    lgr.fit(X, np_y)
    scores = cross_validate(lgr, X, np_y, scoring=score_list, cv=10)
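    # continuation sketch (not from the source): the example is cut off here,
    # so this tracks the best C by mean F1, which is what max_f1 suggests
    mean_f1 = scores['test_f1'].mean()
    if mean_f1 > max_f1:
        max_f1 = mean_f1
        best_C = c
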
    'savings': [2, (1, 2, 3, 4, 5), [0, 0]],
    'telephon': [1, (1, 2), [0, 0]]
}

sas_map = {
    'duration': [0, (0, 100), [0, 0]],
    'checking': [2, (1, 2, 3, 4), [0, 0]],
    'coapp': [2, (1, 2, 3), [0, 0]],
    'history': [2, (0, 1, 2, 3, 4), [0, 0]],
    'good_bad': [1, ('good', 'bad'), [0, 0]],
    'savings': [2, (1, 2, 3, 4, 5), [0, 0]],
    'installp': [2, (1, 2, 3, 4), [0, 0]],
    'marital': [2, (1, 2, 3, 4), [0, 0]]
}
#Replace, impute, and encode using SAS encoding
rep_imp_enc = ReplaceImputeEncode(data_map=sas_map, nominal_encoding='SAS',
                                  display=True)
encoded_df = rep_imp_enc.fit_transform(df)

# Regression requires numpy arrays containing all numeric values
y = np.asarray(encoded_df['good_bad'])
# Drop the target column; axis=1 indicates the drop is for a column.
X = np.asarray(encoded_df.drop('good_bad', axis=1))

#Fit a logistic regression model; hold out 30% of the data for validation
X_train, X_validate, y_train, y_validate = \
            train_test_split(X, y, test_size=0.3, random_state=7)
logistic = LogisticRegression()
logistic.fit(X, y)

log_tts = LogisticRegression()
log_tts.fit(X_train, y_train)
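
Comparing the two fits on the validation partition would close the loop; a hedged sketch (accuracy as the metric is my choice, not from the source):

from sklearn.metrics import accuracy_score

# model fit on all data, scored on the held-out rows (optimistic: it saw them)
print('full-data fit :', accuracy_score(y_validate, logistic.predict(X_validate)))
# model fit only on the training partition - the honest estimate
print('train/test fit:', accuracy_score(y_validate, log_tts.predict(X_validate)))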