Ejemplo n.º 1
0
def transform(categorical_columns, numerical_columns, data):
    """Encode and scale ``data``, returning the result as a labelled DataFrame.

    The categorical columns go through ``ohe()`` and the numerical columns
    through ``ss()``, combined (in that order) by a ``ColumnTransformer``.

    Args:
        categorical_columns: Column selection handed to the ``ohe()`` step.
        numerical_columns: Column selection handed to the ``ss()`` step; its
            entries are also appended to the returned name list.
        data: Table accepted by ``ColumnTransformer.fit_transform``.

    Returns:
        A ``(frame, col_names)`` tuple. ``frame`` holds one column per
        transformed feature (previously exactly 18 columns were hard-coded);
        ``col_names`` is the list of names used to label those columns.
    """
    cat = ('categorical', ohe(), categorical_columns)
    num = ('numeric', ss(), numerical_columns)
    col_trans = ColumnTransformer([cat, num])
    df_trans_scaled = col_trans.fit_transform(data)

    col_names = get_column_names_from_ColumnTransformer(col_trans)
    col_names.extend(numerical_columns)

    # A sparse result cannot be sliced into plain 1-D columns, so densify it
    # before building the frame.
    if hasattr(df_trans_scaled, 'toarray'):
        df_trans_scaled = df_trans_scaled.toarray()

    # Label every transformed column by position instead of assuming that
    # there are exactly 18 of them.
    df_trans_scaled = pd.DataFrame({
        col_names[idx]: df_trans_scaled[:, idx]
        for idx in range(df_trans_scaled.shape[1])
    })
    return df_trans_scaled, col_names
Ejemplo n.º 2
0
 def fit(self, data):
     """Set up the categorical encoder and numeric scaler from ``data``.

     ``data`` is wrapped in a DataFrame when it is not one already. Columns
     whose dtype equals ``'object'`` are treated as categorical, all others
     as numeric. Neither estimator is fitted through its own ``fit``; the
     fitted attributes are written directly from summary statistics.
     """
     frame = data if isinstance(data, pd.DataFrame) else pd.DataFrame(data)
     self.p = frame.shape[1]

     # Positional indices of the categorical and the numeric columns.
     is_object = frame.dtypes == 'object'
     self.cidx = np.where(is_object)[0]
     self.nidx = np.where(~is_object)[0]

     # One-hot encoder: categories are the observed values of each column in
     # value_counts() order, and the first category is marked as dropped.
     encoder = ohe(sparse=False,
                   dtype=int,
                   handle_unknown='ignore',
                   drop=self.drop)
     self.cenc = encoder
     encoder.categories_ = [
         list(frame.iloc[:, pos].value_counts().index) for pos in self.cidx
     ]
     encoder.drop_idx_ = np.repeat(0, len(encoder.categories_))

     # Total feature size: (levels - 1) per categorical column + numeric columns.
     self.p2 = sum(len(levels) - 1
                   for levels in encoder.categories_) + len(self.nidx)

     # Scaler: centre and scale come from the column-wise mean() and std().
     numeric = frame.iloc[:, self.nidx]
     scaler = ss()
     self.nenc = scaler
     scaler.mean_ = numeric.mean().values
     scaler.scale_ = numeric.std().values
     scaler.n_features_in_ = self.nidx.shape[0]

     # Output names: encoded categorical names first, then numeric names.
     cat_names = frame.columns[self.cidx].astype(str)
     num_names = frame.columns[self.nidx].to_list()
     self.cn = list(encoder.get_feature_names(cat_names)) + num_names

     self.lst_enc = [encoder, scaler]
     self.lst_cidx = [self.cidx, self.nidx]
     # Flags telling which of the two encoders has any column to work on.
     self.lst_iter = [len(idx) > 0 for idx in self.lst_cidx]
Ejemplo n.º 3
0
def createOneHotEncoding(y_labels):
        """Fit and return a one-hot encoder for a vector of class labels.

        ``y_labels`` is reshaped into a single column of ``y_labels.shape[0]``
        rows before fitting.

        NOTE(review): the encoder is created with ``46`` while the earlier
        note on this function said the labels lie in 0-35 -- confirm which
        class count is intended.
        """
        from sklearn.preprocessing import OneHotEncoder as ohe

        encoder = ohe(46)
        n_samples = y_labels.shape[0]
        labels_column = y_labels.reshape(n_samples, 1)
        encoder.fit(labels_column)
        return encoder
Ejemplo n.º 4
0
 def __init__(self,
              url,
              names,
              label_tag,
              drop_tags=None,
              encode_tags=None,
              normalizer=Normalizer(),
              normal_tags=None,
              test_size=0.2):
     """Record the configuration; every argument is stored unchanged.

     ``self.data`` starts out as ``None`` and ``self.enc`` is a fresh
     ``ohe(categories='auto')`` instance.

     NOTE(review): the default ``normalizer`` is created once, when the
     function is defined, so every instance built without an explicit
     ``normalizer`` shares the same object -- confirm that is intended.
     """
     # Data source and column naming.
     self.url, self.names = url, names
     self.label_tag = label_tag

     # Column groups for the preprocessing steps.
     self.drop_tags, self.encode_tags = drop_tags, encode_tags
     self.normalizer, self.normal_tags = normalizer, normal_tags

     self.test_size = test_size

     # State that does not come from the arguments.
     self.data = None
     self.enc = ohe(categories='auto')
Ejemplo n.º 5
0
def pivot_data(df, cols_and_vals):
	'''
	One-hot encode ("pivot") the selected columns of a dataframe.

	df - a pandas dataframe
	cols_and_vals - a list of (str, int) pairs: column name and number of values to pivot
	returns pivoted array and dict {column_name:pivoted_values}
	'''
	encoded_cols = []
	pivoted_vals = {}

	# Encode every requested column and remember the values it was mapped to.
	for pair in cols_and_vals:
		col, n_vals = pair[0], pair[1]
		encoded_col, new_vals = encode(df[col], n_vals)
		encoded_cols.append(encoded_col)
		pivoted_vals[col] = new_vals

	encoded_array = make_array(encoded_cols)

	# The legacy keywords categorical_features='all' and n_values='auto' were
	# only ever the defaults and are rejected by current scikit-learn, so the
	# encoder is built with its defaults instead.
	oneHot = ohe()
	pivoted_array = oneHot.fit_transform(encoded_array)

	return pivoted_array, pivoted_vals
Ejemplo n.º 6
0
# Import the dataset.
# The path is a raw string: '\D' is not a valid escape sequence, so the plain
# literal triggered an invalid-escape warning. The resulting path is unchanged.
dataset = pd.read_csv(r'data\Data.csv')

# Features are every column but the last; the target is column 3.
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 3].values

# Replace missing data in columns 1-2 of X using the mean of the whole column.
# NOTE(review): `missing_values='NaN'` / `axis=0` is the legacy imputer
# signature -- confirm the pinned scikit-learn version still accepts it.
imputer = im(missing_values='NaN', strategy='mean',
                            axis=0)
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

# Encode the categorical data in column 0 as integer labels.
labelencode_X = le()
X[:, 0] = labelencode_X.fit_transform(X[:, 0])
# Dummy (one-hot) encode column 0.
# NOTE(review): `categorical_features` is a legacy OneHotEncoder argument --
# confirm the pinned scikit-learn version still accepts it.
ohotencode = ohe(categorical_features=[0])
X = ohotencode.fit_transform(X).toarray()

# Encode the target as integer labels.
labelencode_Y = le()
y = labelencode_Y.fit_transform(y)

# Split the data into train and test sets (20% test, fixed seed).
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2,
                                       random_state=0)

# Feature scaling: fit on the training set only, then apply to both.
standardscale_X = ss()
X_train = standardscale_X.fit_transform(X_train)
X_test = standardscale_X.transform(X_test)
Ejemplo n.º 7
0
from sklearn.compose import ColumnTransformer as ct
from sklearn.model_selection import train_test_split as tts

# Load the file and separate the feature columns from the target column.
dataset = pd.read_csv('Data.csv')
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 3].values

# Fill missing values in every feature column after the first with the
# column mean.
mean_imputer = sip(missing_values=np.nan, strategy='mean')
x[:, 1:] = mean_imputer.fit_transform(x[:, 1:])

# Show the raw dataset for a visual check.
print("dataset:\n", dataset)

# One-hot encode column 0 of x; the remaining columns pass through unchanged.
country_encoder = ct([('Country', ohe(), [0])], remainder='passthrough')
x = country_encoder.fit_transform(x)

# Encode y as integer labels.
label_encoder = le()
y = label_encoder.fit_transform(y)

# Disabled snippet (kept as a string literal) that counts missing values.
"""total = dataset.isnull().sum().sort_values(ascending=False)
percent = (dataset.isnull().sum()/dataset.isnull().count()).sort_values(ascending=False)*100
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)
print("total :\n",missing_data)
"""

# Hold out 20% of the rows for testing, with a fixed seed.
x_train, x_test, y_train, y_test = tts(x,
                                       y,
                                       test_size=0.2,
                                       random_state=0)
Ejemplo n.º 8
0
    def fit(self, x):  # Fit the encoder/scaler
        """Learn one encoder per column type from the DataFrame ``x``.

        Every column is labelled ``'float'``, ``'str'`` or ``'lst'``:

        * ``'float'``: the first row's value and the column dtype are both
          ``int64``/``float64``;
        * ``'lst'``: a non-float column in which any value contains ``'|'``;
        * ``'str'``: every other column.

        Up to three encoders are then prepared and stored in ``self.all_enc``:

        * ``'cenc'`` for ``'str'`` columns: an ``ohe`` whose ``categories_``
          are written directly from ``np.unique`` (``fit`` is not called);
        * ``'nenc'`` for ``'float'`` columns: when ``self.quantize`` is set,
          columns with more than 31 unique values are binned with ``KD`` and
          the others one-hot encoded; otherwise an ``ss`` scaler whose
          ``mean_``/``scale_`` are written directly;
        * ``'tenc'`` for ``'lst'`` columns: one ``cv`` per column, using
          ``tok_fun`` as tokenizer.

        Finally ``self.tt`` lists the encoders in use and
        ``self.cn_transform`` the concatenated output feature names.

        Reads ``self.sparse``, ``self.dtype``, ``self.quantize`` and
        ``self.nbins``, which must be set beforehand.
        """
        self.n = x.shape[0]
        self.p = x.shape[1]
        # dt1: Python type name of the first row's value in each column.
        # dt2: pandas dtype of each column, as a string.
        dt1 = pd.Series([type(x.iloc[0][kk]).__name__ for kk in range(self.p)])
        dt2 = x.dtypes.astype(str).reset_index(drop=True)
        # A column counts as 'float' only when both views say int64/float64.
        self.dt = pd.Series(
            np.where(
                dt1.isin(['int64', 'float64'])
                & dt2.isin(['int64', 'float64']), 'float', 'str'))
        if not all(self.dt.values == 'float'):
            # Among the non-float columns, those with a '|' in any value
            # become 'lst'; the rest keep their current label.
            self.dt[~(self.dt.values == 'float')] = \
                np.where(x.loc[:, ~(self.dt.values == 'float')].apply(lambda x: x.str.contains('\\|', na=False).any()),
                 'lst',self.dt[~(self.dt.values == 'float')])
        self.cn = np.array(x.columns)
        stopifnot(all(self.dt.isin(['float', 'lst', 'str'])))
        # Positional column indices for each of the three types.
        self.cidx = np.where(self.dt == 'str')[0]
        self.nidx = np.where(self.dt == 'float')[0]
        self.tidx = np.where(self.dt == 'lst')[0]
        # Sanity check: the three index sets together cover every column.
        stopifnot(
            all(
                np.sort(reduce(np.union1d, [self.cidx, self.nidx, self.tidx]))
                == np.arange(self.p)))
        # Which encoders are active; entries are switched off below when the
        # matching column type is absent.
        self.iter = {'cenc': True, 'nenc': True, 'tenc': True}
        self.all_enc = {}

        #############################################################
        # --- Encoder (i): Categorical/ordinal integer features --- #

        if len(self.cidx) > 0:
            self.cenc = ohe(sparse=self.sparse,
                            dtype=self.dtype,
                            handle_unknown='ignore',
                            drop=None)
            # Fitted state is written by hand rather than through fit().
            self.cenc.categories_ = [
                np.unique(x.iloc[:, kk]) for kk in self.cidx
            ]
            # Most frequent value of each categorical column and its position
            # inside that column's category array.
            self.cmode = [x.iloc[:, kk].mode()[0] for kk in self.cidx]
            cmode_idx = np.array([
                np.where(vec == mm)[0][0]
                for vec, mm in zip(self.cenc.categories_, self.cmode)
            ])
            # Cumulative category counts; the last entry is the total number
            # of one-hot columns.
            cum_idx = np.append([0],
                                np.cumsum(
                                    [len(z) for z in self.cenc.categories_]))
            # No category is dropped: the drop list is empty.
            self.cenc.drop_idx = []
            self.cenc.drop_idx_ = None
            self.cenc.p = cum_idx.max() - len(
                self.cenc.drop_idx
            )  # How many features after dropping most common
            self.cenc.cn = list(
                np.delete(self.cenc.get_feature_names(self.cn[self.cidx]),
                          self.cenc.drop_idx))
            self.all_enc['cenc'] = self.cenc
        else:
            self.iter['cenc'] = False

        ###############################################
        # --- Encoder (ii): Continuous numerical ---- #

        if len(self.nidx) > 0:
            if self.quantize:
                # Number of distinct values in each numeric column.
                u_nidx = np.array(
                    [len(x.iloc[:, kk].unique()) for kk in self.nidx])
                self.nidx1 = self.nidx[u_nidx > 31]  # quantize
                self.nidx2 = self.nidx[u_nidx <= 31]  # one-hot-encode
                self.nenc = {'enc': {}, 'cn': {}}
                if len(self.nidx1) > 0:
                    self.nenc1 = KD(n_bins=self.nbins, strategy='quantile')
                    if not self.sparse:
                        self.nenc1.encode = 'onehot-dense'
                    self.nenc1.fit(x.iloc[:, self.nidx1])
                    # Names of the form "<column>_q<k>", with k running from 1
                    # to the number of bin edges minus one.
                    self.nenc1.cn = ljoin([
                        cn + '_q' + pd.Series(qq).astype(str)
                        for cn, qq in zip(self.cn[self.nidx1], [
                            np.arange(len(z) - 1) + 1
                            for z in self.nenc1.bin_edges_
                        ])
                    ])
                    self.nenc['enc']['nenc1'] = self.nenc1
                    self.nenc['cn']['nenc1'] = self.nenc1.cn
                if len(self.nidx2) > 0:
                    self.nenc2 = ohe(sparse=self.sparse,
                                     handle_unknown='ignore',
                                     drop=None)
                    self.nenc2.fit(x.iloc[:, self.nidx2])
                    self.nenc2.cn = self.nenc2.get_feature_names(
                        self.cn[self.nidx2])
                    self.nenc['enc']['nenc2'] = self.nenc2
                    self.nenc['cn']['nenc2'] = self.nenc2.cn
                # Flatten the per-encoder name lists into one list.
                self.nenc['cn'] = ljoin(list(self.nenc['cn'].values()))
                self.all_enc['nenc'] = self.nenc
            else:
                # Plain standardisation; the fitted attributes are written by
                # hand from the column-wise mean() and std().
                self.nenc = ss(copy=False)
                self.nenc.mean_ = x.iloc[:, self.nidx].mean(axis=0).values
                self.nenc.scale_ = x.iloc[:, self.nidx].std(axis=0).values
                self.nenc.n_features_in_ = self.nidx.shape[0]
                self.nenc.p = self.nidx.shape[0]
                self.nenc.cn = list(self.cn[self.nidx])
                self.all_enc['nenc'] = self.nenc
        else:
            self.iter['nenc'] = False

        ################################################
        # --- Encoder (iii): Tokenize text blocks ---- #

        if len(self.tidx) > 0:
            # One binary, case-preserving vectorizer per 'lst' column, keyed
            # by column name.
            self.tenc = dict(
                zip(self.cn[self.tidx], [
                    cv(tokenizer=lambda x: tok_fun(x),
                       lowercase=False,
                       token_pattern=None,
                       binary=True) for z in range(self.tidx.shape[0])
                ]))
            self.tenc = {'cv': self.tenc}
            # Columns are cast to unicode strings before fitting.
            for kk, jj in enumerate(self.cn[self.tidx]):
                self.tenc['cv'][jj].fit(x.loc[:, jj].astype('U'))
            # Total vocabulary size across all text columns.
            self.tenc['p'] = sum(
                [len(z.vocabulary_) for z in self.tenc['cv'].values()])
            # Names of the form "<column>_<token>".
            self.tenc['cn'] = ljoin([
                l + '_' + pd.Series(list(z.vocabulary_.keys())) for z, l in
                zip(self.tenc['cv'].values(), self.tenc['cv'].keys())
            ])
            self.all_enc['tenc'] = self.tenc
        else:
            self.iter['tenc'] = False

        # Store all in dictionary to iteration over self.iter
        self.enc_transform = {
            'cenc': self.cenc_transform,
            'nenc': self.nenc_transform,
            'tenc': self.tenc_transform
        }
        # Get the valid categories
        self.tt = np.array(list(self.iter.keys()))[np.where(
            list(self.iter.values()))[0]]
        # Get full feature names
        cn = []
        for ee in self.tt:
            # Encoders are either objects carrying a `cn` attribute or dicts
            # carrying a 'cn' key.
            if hasattr(self.all_enc[ee], 'cn'):
                cn.append(self.all_enc[ee].cn)
            else:
                cn.append(self.all_enc[ee]['cn'])
        cn = ljoin(cn)
        self.cn_transform = cn
Ejemplo n.º 9
0
@author: Gus Yudha
"""
""" Import Dataset nasabah bank """
import pandas as pd
dataset = pd.read_csv('bank_customers.csv')
X = dataset.iloc[:, 3:
                 13].values  # Select the relevant features (CreditScore through EstimatedSalary)
y = dataset.iloc[:, 13].values  # Select the target (Exited)
""" Data preprocessing """
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder as ohe
from sklearn.compose import ColumnTransformer as ct
le = LabelEncoder()
X[:, 1] = le.fit_transform(X[:, 1])  # Convert country names to integers
X[:, 2] = le.fit_transform(X[:, 2])  # Convert gender to integers
# One-hot encode column 1 and pass the remaining columns through unchanged.
Setarakan = ct([('Pilah Jadi 3', ohe(), [1])], remainder="passthrough")
X = Setarakan.fit_transform(X[:, 0:])  # One-hot encode the country category
X = X[:, 1:]  # Drop one dummy-variable column
""" Pilah Data latihan dengan Data Ujian """
# Split the data into a training set and a test set (20% test, fixed seed).
from sklearn.model_selection import train_test_split as tts
Soal_latihan, Soal_ujian, Jawaban_latihan, Jawaban_ujian = tts(X,
                                                               y,
                                                               test_size=0.2,
                                                               random_state=0)
""" Standarisasi Soal latihan dan Soal Ujian """
# Standardize the training and test features; the scaler is fitted on the
# training set only.
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
Soal_latihan = ss.fit_transform(Soal_latihan)  # Standardize the training set
Soal_ujian = ss.transform(Soal_ujian)  # Standardize the test set
""" Inisialisasi Arsitektur ANN (11-6-6-1) """
from keras.models import Sequential
    k = targ * (1 - y)
    grad = -np.ravel(np.dot(k.T, X).T)
    #grad =  -np.sum(np.sum(targ*(1-y),axis=1).reshape([n_obs,1])*X,axis=0)
    return grad


def class_efficiency(t_act, t_pred):
    """Cross-tabulate actual against predicted class labels.

    The function used to ignore both arguments and read the module-level
    ``training``, ``pred_cat`` and ``n_obs`` instead; it now works on the
    values it is given.

    Args:
        t_act: Actual labels, one per observation (1-D or a single column).
        t_pred: Predicted labels, same number of observations as ``t_act``.

    Returns:
        A ``pandas.DataFrame`` contingency table with the actual labels as
        rows (``t_act``) and the predicted labels as columns (``t_pred``).
    """
    cols = ['t_act', 't_pred']
    # Force both inputs into single columns so they can be placed side by side.
    actual = np.asarray(t_act).reshape(-1, 1)
    predicted = np.asarray(t_pred).reshape(-1, 1)
    df = pd.DataFrame(np.concatenate([actual, predicted], axis=1),
                      columns=cols)
    return pd.crosstab(df.t_act, df.t_pred)


# One-hot encode the labels held in training[1] (encoder configured with
# handle_unknown='ignore') and keep the result as a dense array.
ohe1 = ohe(handle_unknown='ignore').fit(training[1])
targ = ohe1.transform(training[1]).toarray()

# Work on a copy of the scores; `dim` is the length of its transpose, taken
# before scaling.
X = scores_trunc.copy()
dim = len(X.T)
mms1 = mms()
X = mms1.fit_transform(X)

# Scaled features on the left, one-hot targets on the right.
train = np.concatenate((X, targ), axis=1)

n_cats, n_obs = len(targ.T), len(train)
#dim = n_obs-n_cats

df_train = pd.DataFrame(train)
# NOTE(review): this range ends at n_obs (the row count); if it is meant to
# list the target columns of `train`, `dim + n_cats` may be intended -- confirm.
old_col = np.arange(dim, n_obs).tolist()
Ejemplo n.º 11
0
 def catf(self, t):
     """Return ``self.x`` with feature ``t`` one-hot encoded, as a dense array."""
     encoder = ohe(categorical_features=[t])
     encoded = encoder.fit_transform(self.x)
     return encoded.toarray()
Ejemplo n.º 12
0
from sklearn.preprocessing import OneHotEncoder as ohe
from sklearn.preprocessing import MultiLabelBinarizer as mlb
"""
OneHotEncoder(n_values=’auto’, 
categorical_features=’all’,  dtype=<class ‘numpy.float64’>,
sparse=True,  handle_unknown=’error’)
"""
# lianjia_df = pd.DataFrame({'Elevator':[1,2],'Renovation':[4,5]},dtype=np.float32,copy=True)
# print(lianjia_df.values)
# print(lianjia_df['Elevator'])
# l = pd.get_dummies(lianjia_df['Elevator'])  # one-hot encoding method
# print(l)

# Sample data: a 3x5 integer array and a float range.
# NOTE(review): neither `x` nor `y` is used by the active code below.
x = np.random.uniform(1, 10, [3, 5]).astype(np.int32)
y = np.arange(1, 10, 0.5)

# print(x)
# # print(y)

# encoder = ohe(sparse=False)  # choose whether the result is sparse
# encoder.fit(x)
# print(encoder.active_features_)
# print(encoder.feature_indices_)
# print(encoder.n_values_)
# print(encoder.transform([[1,2,3,4,5]]))

# Fit the encoder on four three-feature samples, then encode one new sample
# and print the result.
encoder = ohe(sparse=False)  # choose whether the result is sparse; the alternative is calling .toarray() on the transform output
encoder.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
arra = encoder.transform([[0, 1, 3]])
print(arra)
# Features: every column of `ed` except the first.
X = ed.iloc[:, 1:]
X.shape  # original note: 19100 x 12

# 9.1 Separate the numerical column names from the categorical ones.
num_columns = X.select_dtypes(include=['float64', 'int64']).columns
num_columns

cat_columns = X.select_dtypes(include=['object']).columns
cat_columns

# 10. Build the transformation objects.
# 10.1 (name, transformer, columns) tuple for the categorical columns.
cat = ("cattrans", ohe(), cat_columns)
# 10.2 (name, transformer, columns) tuple for the numeric columns.
num = ("numtrans", ss(), num_columns)
# 10.3 Column transformer: numeric step first, categorical step second.
colTrans = ct([num, cat])

# 10.4 Fit on X and transform it in one go.
X_trans = colTrans.fit_transform(X)
X_trans.shape  # original note: 19100 x 19

## 11.0 Label encoding
#  11.1 Map the labels "continue" -> 1 and "drop" -> 0.
label_map = {"continue": 1, "drop": 0}
y = y.map(label_map)
y.head()
Ejemplo n.º 14
0
data = pd.read_csv("Churn_Modelling.csv")
# Features are columns 3 to 12; the target is column 13.
x = data.iloc[:, 3:13].values
y = data.iloc[:, 13].values

# Label encoding
from sklearn.preprocessing import LabelEncoder as le
from sklearn.preprocessing import OneHotEncoder as ohe

# One label encoder per categorical column: column 1 (country) and
# column 2 (gender).
le_x_1 = le()
x[:, 1] = le_x_1.fit_transform(x[:, 1])
le_x_2 = le()
x[:, 2] = le_x_2.fit_transform(x[:, 2])

# One-hot encode column 1; the column index has to be given explicitly.
ohec = ohe(categorical_features=[1])
x = ohec.fit_transform(x).toarray()

# Drop the first column to avoid the dummy variable trap: for three classes
# two dummy columns are enough, the third is implied.
x = x[:, 1:]

# Data splitting
from sklearn.model_selection import train_test_split as tts
x_train, x_test, y_train, y_test = tts(x,
                                       y,
                                       test_size=0.2,
                                       random_state=0)

# Feature scaling: standardise using statistics fitted on the training set.
from sklearn.preprocessing import StandardScaler as sc
sc_x = sc()
x_train = sc_x.fit_transform(x_train)
x_test = sc_x.transform(x_test)
Ejemplo n.º 15
0
# Run the IPython `html` cell magic with an embedded Tableau visualisation
# (a placeholder <div> plus the script that loads the Tableau viz API).
# Relies on get_ipython(), which the code here does not define.
get_ipython().run_cell_magic(
    u'html', u'',
    u"<div class='tableauPlaceholder' id='viz1535718122614' style='position: relative'><noscript><a href='#'><img alt='Story 2 ' src='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;Ti&#47;Titanic2_32&#47;Story2&#47;1_rss.png' style='border: none' /></a></noscript><object class='tableauViz'  style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='embed_code_version' value='3' /> <param name='site_root' value='' /><param name='name' value='Titanic2_32&#47;Story2' /><param name='tabs' value='no' /><param name='toolbar' value='yes' /><param name='static_image' value='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;Ti&#47;Titanic2_32&#47;Story2&#47;1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /><param name='filter' value='publish=yes' /></object></div>                <script type='text/javascript'>                    var divElement = document.getElementById('viz1535718122614');                    var vizElement = divElement.getElementsByTagName('object')[0];                    vizElement.style.width='1016px';vizElement.style.height='991px';                    var scriptElement = document.createElement('script');                    scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js';                    vizElement.parentNode.insertBefore(scriptElement, vizElement);                </script>"
)

# **Converting categorical data to numeric form**

# In[ ]:

from sklearn.preprocessing import LabelEncoder as le
from sklearn.preprocessing import OneHotEncoder as ohe
# Every object-typed column of `train` is label-encoded and then passed
# through a one-hot encoder, in `train` first and then in `test`.
# NOTE(review): each frame gets its own freshly fitted encoders, and the
# one-hot result is assigned to a single column -- confirm both are intended.
for c in train.columns:
    if train[c].dtype == 'object':
        for frame in (train, test):
            z1 = le().fit_transform(frame[c].astype(str))
            frame[c] = ohe(sparse=False).fit_transform(
                z1.reshape(len(z1), 1))

# 'Age' and 'Fare' get the same treatment regardless of their dtype.
for numeric_col in ('Age', 'Fare'):
    for frame in (train, test):
        z1 = le().fit_transform(frame[numeric_col].astype(str))
        frame[numeric_col] = ohe(sparse=False).fit_transform(
            z1.reshape(len(z1), 1))

# **Dropping Some unnecessary Features**
# cabin has more than 70% of the data missing

# In[ ]:
Ejemplo n.º 16
0
    maximum = np.max(x,axis=0)
    rang = maximum-minimum
    z = (x-minimum)/rang
    return z


# Load the CSV as an array of strings, skipping the header row. The context
# manager closes the file handle, which was previously left open. The builtin
# `str` replaces the removed `np.str` alias (same type).
with open("trilogyData.csv") as raw_data:
    data = np.loadtxt(raw_data, delimiter=",", skiprows=1, dtype=str)

# Column of ones, later prepended as the first column of the design matrix.
x0 = np.ones((len(data), 1))
x = data[:, 1:72]
y = data[:, 72]
y = y.astype(float)


# NOTE(review): this rebinds the name `ohe` from the encoder class to an
# instance, so the class is no longer reachable through `ohe` afterwards.
ohe = ohe(categories='auto')
# One-hot encode data columns 1 and 2; the builtin `float` replaces the
# removed `np.float` alias (same type).
state = ohe.fit_transform(data[:, 1].reshape((len(data), 1))).toarray().astype(float)
grade = ohe.fit_transform(data[:, 2].reshape((len(data), 1))).toarray().astype(float)

# Normalise data columns 2-5 after casting them to int.
# NOTE(review): column 2 is both one-hot encoded above and normalised here --
# confirm that is intended.
cols = data[:, [2, 3, 4, 5]]
norm = normalizeData(cols.astype(int))

# Remove the first six columns of x and put the encoded and normalised
# columns in front of what is left.
x = np.delete(x, [0, 1, 2, 3, 4, 5], axis=1)
arr = np.concatenate((state, grade, norm, x), axis=1)
arr = arr.astype(float)
arr = np.concatenate((x0, arr), axis=1)


x_train, x_test, y_train, y_test = model_selection.train_test_split(arr,
                                                                    y,train_size=0.7,
                                                                    test_size=0.3, 
Ejemplo n.º 17
0
from sklearn.linear_model import LinearRegression
import statsmodels.regression.linear_model as lm

# Load the file and separate the feature columns from the target column.
dataset = pd.read_csv('50_Startups.csv')
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 4].values
"""
#fill nan values by mean
x[: , 1: ]= sip(missing_values=np.nan,strategy='mean').fit_transform(x[: , 1: ])
"""
# Show the raw dataset for a visual check.
print("dataset:\n", dataset)

# One-hot encode column 3 of x, pass the other columns through, and cast the
# result to int.
column_encoder = ct([('Country', ohe(), [3])], remainder='passthrough')
x = column_encoder.fit_transform(x).astype('int')
"""
#encode y to zeroes and ones
y=le().fit_transform(y)
"""

# Drop the first column to avoid the dummy variable trap.
x = x[:, 1:]

# Hold out 20% of the rows for testing, with a fixed seed.
x_train, x_test, y_train, y_test = tts(x,
                                       y,
                                       test_size=0.2,
                                       random_state=0)

print("x:\n", x)

print("x_train before scaling:\n", x_train)