Example 1
## imports assumed by this excerpt (Imputer was part of scikit-learn < 0.22)
import pandas
from sklearn.preprocessing import Imputer as imp

## seeing which explanatory feature rows got removed.  Looks like none.
response_seriesfore.index[~response_seriesfore.index.isin(explanatory_dffore.index)]

### now, let's separate the numeric explanatory data from the string data
string_featuresfore = explanatory_dffore.loc[:, explanatory_dffore.dtypes == 'object']
numeric_featuresfore = explanatory_dffore.loc[:, explanatory_dffore.dtypes != 'object']

# fill string NaNs with 'Nothing' (features that are all NaNs will then show up as all 'Nothing' when we start binning or look for features with no variation)
string_featuresfore = string_featuresfore.fillna('Nothing')
# cleaning up string features
string_featuresfore = cleanup_data(string_featuresfore)
# binarizing string features 
encoded_datafore = get_binary_values(string_featuresfore)
## imputing features
imputer_object = imp(missing_values='NaN', strategy='median', axis=0)
imputer_object.fit(numeric_featuresfore)
numeric_featuresfore = pandas.DataFrame(imputer_object.transform(numeric_featuresfore),
                                        columns=numeric_featuresfore.columns)

## pulling together numeric and encoded data.
explanatory_dffore = pandas.concat([numeric_featuresfore, encoded_datafore], axis=1)
explanatory_dffore.head()


# now, let's find features with no variance
no_variationfore = find_zero_var(explanatory_dffore)
explanatory_dffore.drop(no_variationfore['toDelete'], axis=1, inplace=True)

# deleting perfectly correlated features
no_correlationfore = find_perfect_corr(explanatory_dffore)
explanatory_dffore.drop(no_correlationfore['toRemove'], axis=1, inplace=True)
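
This excerpt relies on project-specific helpers (cleanup_data, get_binary_values, find_zero_var, find_perfect_corr) that are not shown. As a rough, hypothetical sketch of what the last three might do (names and return keys inferred from how they are called above, not taken from the source):

import numpy
import pandas

def get_binary_values(df):
    # hypothetical sketch: one-hot encode every string column
    return pandas.get_dummies(df)

def find_zero_var(df):
    # hypothetical sketch: columns whose values never vary carry no information
    to_delete = [col for col in df.columns if df[col].nunique() <= 1]
    to_keep = [col for col in df.columns if col not in to_delete]
    return {'toKeep': to_keep, 'toDelete': to_delete}

def find_perfect_corr(df):
    # hypothetical sketch: for each perfectly correlated pair, mark one column for removal
    corr = df.corr().abs()
    upper = corr.where(numpy.triu(numpy.ones(corr.shape), k=1).astype(bool))
    to_remove = [col for col in upper.columns if (upper[col] > 0.999).any()]
    return {'toRemove': to_remove}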
Example 3
    def mval(self):
        # class-method fragment: self.x is assumed to be a NumPy feature array,
        # and imp is sklearn's legacy Imputer (scikit-learn < 0.22)
        imp1 = imp(missing_values="NaN", strategy="mean", axis=0)
        imp1 = imp1.fit(self.x[:, 1:3])
        return imp1.transform(self.x[:, 1:3])
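
# forward-fill gaps in the Open/High/Low/Close columns before building the train and test frames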
data.Open.fillna(method='ffill', inplace=True)
data.High.fillna(method='ffill', inplace=True)
data.Low.fillna(method='ffill', inplace=True)
data.Close.fillna(method='ffill', inplace=True)

train = data.iloc[:, 1:7]
test = data.iloc[:, 7:8]

print(train.head())
print(test.head())

from sklearn.preprocessing import Imputer as imp

imputer = imp(missing_values='NaN', strategy='median', axis=0)  # new name so the imported class isn't shadowed

train.iloc[:, 1:6] = imputer.fit_transform(train.iloc[:, 1:6])

train.isnull().sum()

test.fillna(test.mean(), inplace=True)

#%%


'''Loading dataset'''
import numpy as np
import pandas as pd
dataset = pd.read_csv("cattdat.csv")
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, -1].values
print(X)
#%%


'''Dealing with missing data'''
from sklearn.preprocessing import Imputer as imp
imputer = imp(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])
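
Note that sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and removed in 0.22. On a current install, a minimal equivalent of the step above (a sketch, not part of the original source) would be:

import numpy as np
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])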
#%%


'''Encoding Y'''
from sklearn.preprocessing import LabelEncoder as le
labelencoder_x = le()
X[:, 3] = labelencoder_x.fit_transform(X[:, 3])
#%%


'''Dummy variable'''
from sklearn.preprocessing import OneHotEncoder as hot
ohe = hot(categorical_features=[3])
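
The snippet stops before the encoder is applied. With this legacy categorical_features API the usual next step would be something like (an assumption, not shown in the source):

X = ohe.fit_transform(X).toarray()  # hypothetical completion; requires all columns to be numeric

On scikit-learn 0.22+ the categorical_features argument no longer exists; a rough modern equivalent uses ColumnTransformer:

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer([('onehot', OneHotEncoder(), [3])], remainder='passthrough')
X = ct.fit_transform(X)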