Example #1
target = 'crash'
# Drop rows with missing values for the target ('crash')
drops = []
for i in range(df1.shape[0]):
    if pd.isnull(df1['crash'][i]):
        drops.append(i)
df1 = df1.drop(drops)
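
The explicit row loop works, but pandas can do the same drop in one vectorized call. A minimal equivalent sketch (the reset_index is my addition, to keep positional indexing safe for later loops):

# one-line equivalent of the loop above; reset_index keeps a clean 0..n index
df1 = df1.dropna(subset=['crash']).reset_index(drop=True)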

# In[23]:

encoding = 'one-hot'
scale = None  # Interval scaling:  Use 'std', 'robust' or None
# drop=False - do not drop last category - used for Decision Trees
rie = ReplaceImputeEncode(data_map=attribute_map,
                          nominal_encoding=encoding,
                          interval_scale=scale,
                          drop=False,
                          display=True)

# In[24]:

df1.drop('crash', axis=1, inplace=True)

# In[25]:

encoded_df = rie.fit_transform(df1)

# In[26]:

#varlist = [target, 'T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', 'T8', 'T9','points']
X = encoded_df.drop(['T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7'], axis=1)
Example #2
attribute_map = {
    'job': [2, (1, 2, 3, 4), [0, 0]],
    'housing': [2, (1, 2, 3), [0, 0]],
    'foreign': [1, (1, 2), [0, 0]],
    'marital': [2, (1, 2, 3, 4), [0, 0]],
    'resident': [2, (1, 2, 3, 4), [0, 0]],
    'savings': [2, (1, 2, 3, 4, 5), [0, 0]],
    'other': [2, (1, 2, 3), [0, 0]],
    #     'purpose':[1,('0','1','2','3','4','5','6','7','8','9','X'),[0,0]],
    'property': [2, (1, 2, 3, 4), [0, 0]],
    'checking': [2, (1, 2, 3, 4), [0, 0]],
    'telephon': [1, (1, 2), [0, 0]]
}
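
For readers new to this class: each data_map entry appears to follow the pattern [attribute_type, allowed_levels_or_range, counters]. Judging from the examples in this listing, type 0 marks an interval attribute with a (min, max) range, 1 a binary attribute, and 2 a nominal attribute with its tuple of valid levels; the trailing [0, 0] pair looks like a placeholder pair of counters the class fills in. A hypothetical annotated entry (the attribute names here are invented for illustration):

attribute_map_example = {
    'age':     [0, (18, 90), [0, 0]],      # interval: valid range 18-90
    'default': [1, ('N', 'Y'), [0, 0]],    # binary: exactly two valid levels
    'region':  [2, (1, 2, 3, 4), [0, 0]],  # nominal: four valid levels
}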

rie = ReplaceImputeEncode(data_map=attribute_map,
                          drop=False,
                          nominal_encoding='one-hot',
                          display=True,
                          interval_scale='std')
encoded_df = rie.fit_transform(df)

# In[4]:

X = encoded_df.drop('good_bad', axis=1)
Y = encoded_df['good_bad']
np_y = np.ravel(Y)

features = X.columns
classes = ['Good', 'Bad']
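
The features and classes lists are typically fed to a tree visualization; a hedged sketch of the likely next step using scikit-learn's plot_tree (the depth setting here is illustrative, not from the source):

from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt

dtc = DecisionTreeClassifier(max_depth=5, random_state=12345)  # illustrative settings
dtc.fit(X, np_y)
plt.figure(figsize=(16, 8))
plot_tree(dtc, feature_names=list(features), class_names=classes, filled=True)
plt.show()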

# In[9]:

attribute_map = {
    'history': [2, (0, 1, 2, 3, 4), [0, 0]],
    'existcr': [2, (1, 2, 3, 4), [0, 0]],
    'installp': [2, (1, 2, 3, 4), [0, 0]],
    'job': [2, (1, 2, 3, 4), [0, 0]],
    'housing': [2, (1, 2, 3), [0, 0]],
    'foreign': [1, (1, 2), [0, 0]],
    'marital': [2, (1, 2, 3, 4), [0, 0]],
    'resident': [2, (1, 2, 3, 4), [0, 0]],
    'savings': [2, (1, 2, 3, 4, 5), [0, 0]],
    'other': [2, (1, 2, 3), [0, 0]],
    'property': [2, (1, 2, 3, 4), [0, 0]],
    'checking': [2, (1, 2, 3, 4), [0, 0]],
    'telephon': [2, (1, 2), [0, 0]]
}

rie = ReplaceImputeEncode(data_map=attribute_map, display=True)
encoded_df = rie.fit_transform(df)

# In[5]:

from collections import Counter
Counter(encoded_df['employed0'])
len(encoded_df.columns)  # 46 columns

Counter(encoded_df['good_bad'])

# In[6]:

X = encoded_df.drop('good_bad', axis=1)
y = encoded_df['good_bad']
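
The Counter calls above check class balance; if the classes are skewed, a stratified split preserves that ratio in both partitions. A minimal sketch (the 70/30 split mirrors the other examples here; stratify is my addition):

from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = \
    train_test_split(X, y, test_size=0.3, random_state=7, stratify=y)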
Example #4
attribute_map = {
    'MonthlyCharges': [0, (18.25, 188.75), [0, 0]],
    'TotalCharges': [0, (0, 8700), [0, 0]],  # stored as strings in the raw data, which hides the true max value
    'Churn': [1, ('Yes', 'No'), [0, 0]],
}

#Define the target
target = ['Churn']

#Logistic Regression
max_f1 = 0
score_list = ['accuracy', 'recall', 'precision', 'f1']

#Encode for logistic regression
rie_l = ReplaceImputeEncode(data_map=attribute_map,
                            nominal_encoding='one-hot',
                            interval_scale='std',
                            drop=True,
                            display=True)
encoded_df_l = rie_l.fit_transform(df)
X_l = encoded_df_l.drop(target, axis=1)
y_l = encoded_df_l[target]
np_y_l = np.ravel(y_l)  #convert dataframe column to flat array

#Do feature selection with an ExtraTrees (randomized-trees) classifier to
#determine which predictors to include in the logistic regression
features = ExtraTreesClassifier(n_estimators=500)
features.fit(X_l, np_y_l)
print(features.feature_importances_)
#Only the interval predictors are important
#Try two logistic models: one with all predictors, one with only the top 3
#predictors
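
The comparison the comment describes might look like the following sketch; the top-3 selection by feature_importances_, the CV settings, and max_iter are assumptions, not code from the source:

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
import numpy as np

# Model 1: all predictors
lgr_all = LogisticRegression(max_iter=1000)
scores_all = cross_validate(lgr_all, X_l, np_y_l, scoring=score_list, cv=10)

# Model 2: only the 3 most important predictors
top3 = X_l.columns[np.argsort(features.feature_importances_)[-3:]]
lgr_top = LogisticRegression(max_iter=1000)
scores_top = cross_validate(lgr_top, X_l[top3], np_y_l, scoring=score_list, cv=10)

for name, s in [('all predictors', scores_all), ('top 3 predictors', scores_top)]:
    print(name, 'mean f1: %.4f' % s['test_f1'].mean())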
Example #5
        sentiment_score[i] = sentiment_score[i] / n_sw
df_senscore = pd.DataFrame(sentiment_score, columns=['sentiment score'])
df = df.join(df_senscore)

# classify topic based on the probability

df['topic'] = 0
for ix, row in df.iterrows():
    # take the name of the topic column with the highest probability
    df.loc[ix, 'topic'] = row[['T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7']].idxmax()
# save the data output of NLP
df.to_csv('after_NLP_data.csv', index=False)
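
The iterrows loop above can also be vectorized; a one-line equivalent sketch over the same T1-T7 columns:

# vectorized equivalent of the loop above
df['topic'] = df[['T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7']].idxmax(axis=1)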

# encode the data: one-hot nominal encoding, no interval scaling
rie = ReplaceImputeEncode(data_map=attribute_map, nominal_encoding='one-hot',
                          interval_scale=None, drop=False, display=True)

df_tree = rie.fit_transform(df)
y = df_tree['crash']
X = df_tree.drop('crash', axis=1)

# find the best tree depth
depth_list = [3, 5, 6, 7, 8, 10, 12, 15, 20, 25]
score_list = ['accuracy', 'recall', 'precision', 'f1']
for d in depth_list:
    print("\nMaximum Tree Depth: ", d)
    dtc = DecisionTreeClassifier(max_depth=d, min_samples_leaf=5,
                                 min_samples_split=5, random_state=12345)
    dtc = dtc.fit(X, y)
    scores = cross_validate(dtc, X, y, scoring=score_list,
                            return_train_score=False, cv=10)
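    # continuation sketch (not from the source): the example is cut off here,
    # so this reports the mean CV score per metric in the spirit of the loop's print
    for s in score_list:
        print("%-10s mean=%.4f  std=%.4f"
              % (s, scores['test_' + s].mean(), scores['test_' + s].std()))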
Example #6
    'Feb_PayPercent': [0, (0, 1), [0, 0]],
    'Jan_PayPercent': [0, (0, 1), [0, 0]]
}

# In[14]:

df = df.drop(['Customer'], axis=1)  # drop the ID column; assign so the drop takes effect

# In[6]:

np.sum(df['Marital_Status'] == 0)
df.dtypes

# In[15]:

rie = ReplaceImputeEncode(data_map=attribute_map, nominal_encoding='one-hot', display=True)
encoded_df = rie.fit_transform(df)

varlist = ['Default']
X = encoded_df.drop('Default', axis=1)
y = encoded_df['Default']

lgr = LogisticRegression()

#Select the best attributes using RFE - 25 attributes chosen
rfe = RFE(lgr, n_features_to_select=25)
rfe = rfe.fit(X, y)


print(rfe.support_)
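
rfe.support_ is a boolean mask over the columns; mapping it back to names shows which 25 attributes survived. A short sketch:

selected = X.columns[rfe.support_]
print("Selected attributes:", list(selected))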

    'foreign': [1, (1, 2), [0, 0]],
    'good_bad': [1, ('bad', 'good'), [0, 0]],
    'history': [2, (0, 1, 2, 3, 4), [0, 0]],
    'installp': [2, (1, 2, 3, 4), [0, 0]],
    'job': [2, (1, 2, 3, 4), [0, 0]],
    'marital': [2, (1, 2, 3, 4), [0, 0]],
    'other': [2, (1, 2, 3), [0, 0]],
    'property': [2, (1, 2, 3, 4), [0, 0]],
    # 'purpose': [1, (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 'X'), [0, 0]],
    'resident': [2, (1, 2, 3, 4), [0, 0]],
    'savings': [2, (1, 2, 3, 4, 5), [0, 0]],
    'telephon': [1, (1, 2), [0, 0]]
}
Step 3: Replace-Impute-Encode
Next, use the class ReplaceImputeEncode() to replace outliers with missing values, impute missing values, and then scale interval data and encode categorical data.

The ReplaceImputeEncode() class allows you to specify None for scaling and/or encoding. It also lets you select 'one-hot' or 'SAS' encoding for categorical variables. In most other software this is automatic, but in Python we need to set up our own scaling and encoding.

The complete API is documented in the class itself. First you instantiate the class, then you call fit_transform() to process your dataframe.

In [3]:
encoding = 'SAS' # Categorical encoding:  Use 'SAS', 'one-hot' or None
scale    = None  # Interval scaling:  Use 'std', 'robust' or None
scaling  = 'No'  # Text description for interval scaling

rie = ReplaceImputeEncode(data_map=attribute_map, nominal_encoding=encoding,
                          interval_scale=scale, display=True)

#features_map = rie.draft_features_map(df)
encoded_df = rie.fit_transform(df)
With display=True, fit_transform() prints a "Data Preprocessing" banner followed by the contents of the features dictionary (output omitted here).
attribute_map_clus = {
        'Score'   :[0,(80,100),[0,0]],
        'Year'    :[0,(1985,2016),[0,0]],
        'Region'  :[2,('California Other', 'Central Coast','Central Valley', \
                     'Clear Lake','High Valley', 'Lake County',\
                     'Mendocino County','Mendocino Ridge',\
                     'Mendocino/Lake Counties', 'Napa','Napa-Sonoma',\
                     'North Coast','Red Hills Lake County','Redwood Valley',\
                     'Sierra Foothills','Sonoma','South Coast'),[0,0]],
        'Cluster' :[2,(0,1,2,3,4,5,6,7,8),[0,0]],
        'Price'   :[0,(0,625),[0,0]]
}
varlist = ['Price']

rie_clus = ReplaceImputeEncode(data_map=attribute_map_clus,
                               nominal_encoding='one-hot',
                               interval_scale=None, drop=True, display=False)
encoded_df_clus = rie_clus.fit_transform(clus)

X_clus = encoded_df_clus.drop(varlist, axis=1)
y_clus = encoded_df_clus[varlist]
X_train, X_valid, y_train, y_valid = \
    train_test_split(X_clus, y_clus, test_size=0.3, random_state=7)

np_y_train = np.ravel(y_train)
np_y_valid = np.ravel(y_valid)


reg = LinearRegression()
reg.fit(X_train,np_y_train)
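
A quick check of fit quality on the held-out 30% might look like this sketch (the metric choice is mine, not from the source):

from sklearn.metrics import mean_squared_error, r2_score

pred = reg.predict(X_valid)
print('validation R^2:', r2_score(np_y_valid, pred))
print('validation MSE:', mean_squared_error(np_y_valid, pred))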
Example #9
attribute_map = {
    'model': [2, ('COBALT', 'G5', 'HHR', 'ION', 'SKY', 'SOLSTICE'), [0, 0]],
    'crashed': [1, ('N', 'Y'), [0, 0]],
    'abs': [1, ('N', 'Y'), [0, 0]],
    'mileage': [0, (0, 200000), [0, 0]],
    '0': [0, (0, 1), [0, 0]],
    '1': [0, (0, 1), [0, 0]],
    '2': [0, (0, 1), [0, 0]],
    '3': [0, (0, 1), [0, 0]],
    '4': [0, (0, 1), [0, 0]],
    '5': [0, (0, 1), [0, 0]],
    '6': [0, (0, 1), [0, 0]],
    '7': [0, (0, 1), [0, 0]],
}
varlist = ['crashed']
rie = ReplaceImputeEncode(data_map=attribute_map,
                          nominal_encoding='one-hot',
                          interval_scale=None, drop=True, display=False)
encoded_df = rie.fit_transform(reg_df)
X = encoded_df.drop(varlist, axis=1)
y = encoded_df[varlist]
np_y = np.ravel(y)

#10-fold cross-validation to find the optimum regularization value
max_f1 = 0
C_list = [.1, 1, 10, 100]
score_list = ['accuracy', 'recall', 'precision', 'f1']
for c in C_list:
    print("\nRegularization Parameter: ", c)
    lgr = LogisticRegression(C=c, tol=1e-8, max_iter=1000)
    lgr.fit(X, np_y)
    scores = cross_validate(lgr, X, np_y, scoring=score_list, cv=10)
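    # continuation sketch (not from the source): the example is cut off here,
    # so this tracks the best C by mean F1, which is what max_f1 suggests
    mean_f1 = scores['test_f1'].mean()
    if mean_f1 > max_f1:
        max_f1 = mean_f1
        best_C = c
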
    'savings': [2, (1, 2, 3, 4, 5), [0, 0]],
    'telephon': [1, (1, 2), [0, 0]]
}

sas_map = {
    'duration': [0, (0, 100), [0, 0]],
    'checking': [2, (1, 2, 3, 4), [0, 0]],
    'coapp': [2, (1, 2, 3), [0, 0]],
    'history': [2, (0, 1, 2, 3, 4), [0, 0]],
    'good_bad': [1, ('good', 'bad'), [0, 0]],
    'savings': [2, (1, 2, 3, 4, 5), [0, 0]],
    'installp': [2, (1, 2, 3, 4), [0, 0]],
    'marital': [2, (1, 2, 3, 4), [0, 0]]
}
#Replace, impute, and encode using SAS encoding
rep_imp_enc = ReplaceImputeEncode(data_map=sas_map, nominal_encoding='SAS',
                                  display=True)
encoded_df = rep_imp_enc.fit_transform(df)

# Regression requires numpy arrays containing all numeric values
y = np.asarray(encoded_df['good_bad'])
# Drop the target column; axis=1 indicates the drop is for a column.
X = np.asarray(encoded_df.drop('good_bad', axis=1))

#Fit a logistic regression model; hold out 30% of the data for validation
X_train, X_validate, y_train, y_validate = \
            train_test_split(X, y, test_size=0.3, random_state=7)
logistic = LogisticRegression()
logistic.fit(X, y)

log_tts = LogisticRegression()
log_tts.fit(X_train, y_train)
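
Comparing the two fits on the validation partition would close the loop; a hedged sketch (accuracy as the metric is my choice, not from the source):

from sklearn.metrics import accuracy_score

# model fit on all data, scored on the held-out rows (optimistic: it saw them)
print('full-data fit :', accuracy_score(y_validate, logistic.predict(X_validate)))
# model fit only on the training partition - the honest estimate
print('train/test fit:', accuracy_score(y_validate, log_tts.predict(X_validate)))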