Example #1
    def preparation_data(self, df):
        # Standardize the numeric columns
        kolom_numerik = ['Umur', 'NilaiBelanjaSetahun']

        # Statistics before standardization
        print('Statistics before standardization\n')
        print(df[kolom_numerik].describe().round(1))

        # Standardization
        df_std = StandardScaler().fit_transform(df[kolom_numerik])

        # Build a DataFrame from the standardized values
        df_std = pd.DataFrame(data=df_std,
                              index=df.index,
                              columns=df[kolom_numerik].columns)

        # Show sample rows and summary statistics
        print('Sample of standardized data\n')
        print(df_std.head())

        print('Summary statistics after standardization\n')
        print(df_std.describe().round(0))

        # Convert the categorical data
        # Names of the categorical columns
        kolom_kategorikal = ['Jenis Kelamin', 'Profesi', 'Tipe Residen']

        # Make a copy of the data frame
        df_encode = df[kolom_kategorikal].copy()

        # Apply LabelEncoder to every categorical column
        for col in kolom_kategorikal:
            df_encode[col] = LabelEncoder().fit_transform(df_encode[col])

        # Show the encoded data
        print(df_encode.head())

        # Merge the data frames
        df_model = df_encode.merge(df_std,
                                   left_index=True,
                                   right_index=True,
                                   how='left')
        print(df_model.head())
Example #2
# Statistics before standardization
print('Statistics before standardization\n')
print(df[kolom_numerik].describe().round(1))

# Standardization
df_std = StandardScaler().fit_transform(df[kolom_numerik])

# Build a DataFrame from the standardized values
df_std = pd.DataFrame(data=df_std,
                      index=df.index,
                      columns=df[kolom_numerik].columns)

# Show sample rows and summary statistics
print('Sample of standardized data\n')
print(df_std.head())

print('Summary statistics after standardization\n')
print(df_std.describe().round(0))

# # [Converting Categorical Data with LabelEncoder](https://academy.dqlab.id/main/livecode/293/562/2809)

# In[10]:

from sklearn.preprocessing import LabelEncoder

# Names of the categorical columns
kolom_kategorikal = ['Jenis Kelamin', 'Profesi', 'Tipe Residen']

# Make a copy of the data frame
df_encode = df[kolom_kategorikal].copy()
# Coerce each raw value in the day-feature columns to float;
# entries that cannot be parsed fall back to 0.
for nf in day_features:
    rows = []
    for element in df_price[nf]:
        try:
            element = float(element)
        except ValueError:
            element = 0
        rows.append(element)
    df_price[nf] = rows
df_price = df_price.loc[:, day_features].replace(np.nan,0).values
# Standardizing the features
df_price = StandardScaler().fit_transform(df_price)
# restore the column names
df_price = pd.DataFrame(df_price, columns = day_features)

dataset = df_price.values
print("price feature sample: ")
print(df_price.head())


# PAA transformation
# PAA transform (and inverse transform) of the data
n_paa_segments = 3
paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
paa_list = []
for item in df_price.values:
    item = item.reshape((1,5,1))
    paa_price_inv = paa.inverse_transform(paa.fit_transform(item))
    paa_list.append(paa_price_inv)
paa_array = np.array(paa_list)

paa_data = paa_array.reshape(1904, 5)
paa_df = pd.DataFrame(paa_data, columns = day_features)
Example #4
df_ss = StandardScaler().fit_transform(df)

# In[15]:

#Create a dataframe
df_ss = pd.DataFrame(df_ss)
df_ss.columns = [
    'Sports', 'SUV', 'Wagon', 'Minivan', 'Pickup', 'AWD', 'RWD', 'Retail',
    'Dealer', 'Engine', 'Cylinders', 'Horsepower', 'CityMPG', 'HighwayMPG',
    'Weight', 'Wheelbase', 'Length', 'Width'
]

# In[16]:

#Check first few rows
df_ss.head()

# In[17]:

df_ss.describe()

# In[18]:

#Reduce the features down to 3 principal components
comp = 3
pca, X_pca = do_pca(comp, df_ss)

# In[19]:

X_pca
Example #5
            labels[Pn] = C
        elif labels[Pn] == 0:
            labels[Pn] = C
            PnNeighborPts = regionQuery(D, Pn, eps)
            if len(PnNeighborPts) >= MinPts:
                NeighborPts = NeighborPts + PnNeighborPts
        i += 1


def regionQuery(D, P, eps):

    neighbors = []
    for Pn in range(0, len(D)):
        if numpy.linalg.norm(D[P] - D[Pn]) < eps:
            neighbors.append(Pn)

    return neighbors


import numpy
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
X = pd.read_csv("clustering.csv")
X = StandardScaler().fit_transform(X)

MyDBSCAN(X, 0.3, 5)

headers = ["x", "y"]
X = pd.read_csv("clustering.csv", names=headers)
X.head()

plt.scatter(X.x, X.y)
plt.show()
Example #6
- PCA is a dimensionality reduction technique that discards less-informative 'noise' directions.
- PCA is sensitive to the variance and scale of each feature, so standardizing the data first helps it perform better.
- However, since the correlations between the features in the training dataset turned out to be weak, PCA may not add much value here (a quick check of that claim is sketched right after this note).
"""

from sklearn.preprocessing import StandardScaler

# standardize the training features to zero mean and unit variance
standardized_train = StandardScaler().fit_transform(train.set_index(['ID_code','target']))

standardized_train

standardized_train = pd.DataFrame(standardized_train, columns=train.set_index(['ID_code','target']).columns)
standardized_train = train[['ID_code','target']].join(standardized_train)

standardized_train.head(10)

from sklearn.decomposition import PCA
k=80
pca = PCA(n_components=k, random_state=42, whiten=True)
pca.fit(standardized_train.set_index(['ID_code','target']))

sum(pca.explained_variance_ratio_)

plt.figure(figsize=(26,9))
plt.plot(pca.explained_variance_ratio_)
plt.xticks(range(k))
plt.xlabel("Number of Features")
plt.ylabel("Proportion of variance explained by additional feature")

"""Normally, if there is a elbow looking point in the graph above, the x value(number of features) of that point is usually the ideal number of components for PCA.
classifier = RandomForestClassifier(max_depth=2, random_state=0)
classifier.fit(x_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(x_test)
#Performance Evaluation
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

###### Predicting Customer Lifetime Value using Regression
df2 = pd.get_dummies(df)
df2.head()
x = df2.drop(columns='customer lifetime value')
x.head()
y = df2['customer lifetime value']
y.head()

# OLS Regression
x_1 = sm.add_constant(x)
model = sm.OLS(y, x_1).fit()
model.pvalues
model.summary()  # R^2 is 0.167

# use backward elimination
cols = list(x.columns)
pmax = 1
while (len(cols) > 0):
    p = []
    x_1 = x[cols]
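    # The loop is cut off in this snippet; a typical p-value-based completion
    # (a sketch, not necessarily the original author's code) continues roughly:
    x_1 = sm.add_constant(x_1)
    model = sm.OLS(y, x_1).fit()
    p = pd.Series(model.pvalues.values[1:], index=cols)
    pmax = max(p)
    feature_with_p_max = p.idxmax()
    if pmax > 0.05:
        cols.remove(feature_with_p_max)  # drop the least significant feature
    else:
        break
# cols now holds the features kept by backward elimination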
import seaborn as sb
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

df = pd.read_csv('KNN_Project.csv')
df = pd.DataFrame(df)
print(df.head())
sb.pairplot(df, hue='TARGET CLASS')
plt.show()
scalar = StandardScaler()
scalar.fit(df.drop('TARGET CLASS', axis=1))
scalar = scalar.transform(df.drop('TARGET CLASS', axis=1))
scalar = pd.DataFrame(scalar, columns=df.columns[:-1])
print(scalar.head())
X = scalar
y = df['TARGET CLASS']
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=101)
arr = []
for i in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred = knn.predict(X_test)
    arr.append(np.mean(pred != y_test))
plt.figure(figsize=(12, 5))
plt.plot(range(1, 40),
         arr,
Example #9
# <h4>Feature scaling</h4>
# <p>As we move from basic to more advanced models, we will start with logistic regression. Since logistic
# regression is sensitive to feature scale, we scale our features first (a quick fit on the scaled data is sketched below).</p>

# In[24]:

##Scaling features since we will also use scale-sensitive models such as logistic regression
df_train_custom = df_train.drop(["target", "ID_code"], axis=1)
df_train_custom = StandardScaler().fit_transform(df_train_custom)
df_train_custom = pd.DataFrame(df_train_custom)
df_train_custom = df_train[['ID_code', 'target']].join(df_train_custom)

# In[25]:

df_train_custom.head()
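
# The note above motivates scaling for logistic regression, but the fit itself is
# not shown in this snippet. A minimal sketch on the scaled frame built above
# (hypothetical variable names, not the original code):
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_scaled = df_train_custom.drop(['ID_code', 'target'], axis=1)
y_target = df_train_custom['target']
X_tr, X_val, y_tr, y_val = train_test_split(X_scaled, y_target,
                                            test_size=0.2, random_state=42)
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_tr, y_tr)
print("validation accuracy:", logreg.score(X_val, y_val))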

# <h4>Feature Importance</h4>

# In[26]:

##Getting feature importances for insight into the features
##Running a random forest with GridSearchCV for hyperparameter tuning
##(the fit and importance extraction are sketched after the grid definition below)
parameters = {'min_samples_leaf': [20, 25]}
forest = RandomForestClassifier(max_depth=15, n_estimators=15)
grid = GridSearchCV(forest,
                    parameters,
                    cv=3,
                    n_jobs=-1,
                    verbose=2,
                    scoring=make_scorer(roc_auc_score))
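
# The snippet ends before the grid search is run; the typical next steps
# (a sketch under the setup above; X_fi / y_fi are hypothetical names) would be:
X_fi = df_train_custom.drop(['ID_code', 'target'], axis=1)
y_fi = df_train_custom['target']
grid.fit(X_fi, y_fi)
importances = pd.Series(grid.best_estimator_.feature_importances_,
                        index=X_fi.columns).sort_values(ascending=False)
print(importances.head(10))  # ten most important features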
# In[177]:


fraud_data.TransactionAmt.max()


# In[183]:


# Standardization / Normalization
from sklearn.preprocessing import StandardScaler
scaled_features = StandardScaler().fit_transform(X)
scaled_features = pd.DataFrame(data = scaled_features)
scaled_features.columns = X.columns
scaled_features.head()


# In[187]:


#Splitting the data 
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.3, random_state = 42)

# X_train: independent feature data for training the model
# Y_train: dependent feature data for training the model
# X_test: independent feature data for testing the model; it will be used to predict the target values
# Y_test: original target values of X_test; we will compare these with the predicted values

# test_size = 0.3: 30% of the data goes to the test set and 70% to the training set
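
# A quick sanity check (not in the original) that the split matches the description:
print("train fraction:", round(len(X_train) / len(X), 2))  # ~0.7
print("test fraction:", round(len(X_test) / len(X), 2))    # ~0.3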