import sys
sys.path.insert(0, '/home/freddie/git/DataScience/dataPreProcess/')
from data_process import DataPreparation
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
import statsmodels.formula.api as sm
import numpy
from sklearn.preprocessing import Imputer, LabelEncoder, OneHotEncoder, StandardScaler
import matplotlib.pyplot as plot
from matplotlib.colors import ListedColormap

# Decision-tree classification of the Social_Network_Ads data set
# (age / salary features, binary "purchased" target).
data_file="""/home/freddie/git/DataScience/data/Machine Learning A-Z \
Template Folder/Part 3 - Classification/Section 14 -\
 Logistic Regression/Social_Network_Ads.csv"""

prep = DataPreparation(data_file)
indep_vars, dep_vars, data_frame = prep.prepare_data_frame([2,4], [4,5])
# NOTE(review): features are scaled here AND again with StandardScaler
# below — confirm this first pass inside DataPreparation is intentional.
indep_vars = prep.feature_scaling(indep_vars, None)
sc_indep = StandardScaler()
indep_train, indep_test, dep_train, dep_test = prep.partition_training_test()

# Fit the scaler on the training split only, then apply the same
# transformation to the test split (avoids train/test information leakage).
scale_X_train = sc_indep.fit_transform(indep_train)
scale_X_test = sc_indep.transform(indep_test)

classifier = DecisionTreeClassifier(criterion = 'entropy')
classifier.fit(scale_X_train, dep_train[:,-1])

# BUG FIX: the classifier was fitted on the *scaled* features, so it must
# also predict on the scaled test set — predicting on the raw indep_test
# silently produced garbage predictions.
y_pred = classifier.predict(scale_X_test)
cm = confusion_matrix(dep_test, y_pred)

#plot
# Beispiel #2 (scrape-artifact separator, commented out so the file parses)
# 0
"""
Created on Wed Jan  3 20:02:07 2018

@author: freddie
"""

import sys
sys.path.insert(0, '/home/freddie/git/DataScience/dataPreProcess/')
from data_process import DataPreparation
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
import statsmodels.formula.api as sm
import numpy
import matplotlib.pyplot as plot

data_file = """/home/freddie/git/DataScience/data/\
Machine Learning A-Z Template Folder/Part 2 -\
 Regression/Section 8 - Decision Tree Regression/Position_Salaries.csv"""

prep = DataPreparation(data_file)
indep_vars, dep_vars, data_frame = prep.prepare_data_frame([1, 2], [2, 3])
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(indep_vars, dep_vars)

y_pred = regressor.predict(6.5)

indep_grid = numpy.arange(min(indep_vars), max(indep_vars), 0.1)
indep_grid = indep_grid.reshape((len(indep_grid), 1))

plot.plot(indep_grid, regressor.predict(indep_grid), color='blue')
plot.scatter(indep_vars, dep_vars, color='red')
import sys
sys.path.insert(0, '/home/freddie/git/DataScience/dataPreProcess/')
from data_process import DataPreparation
import scipy.cluster.hierarchy as hierarchy
from sklearn.cluster import AgglomerativeClustering
import matplotlib.pyplot as plot
import pandas
from apyori import apriori

# Apriori association-rule mining on the market-basket data set.
data_file = """/home/freddie/git/DataScience/data/Machine Learning A-Z Template Folder/Part 5 - Association Rule Learning/Section 28 - Apriori/Market_Basket_Optimisation.csv"""
prep = DataPreparation(data_file)
# The CSV has no header row; each line is one transaction, up to 20 items.
data_frame = pandas.read_csv(data_file, header=None)

# BUG FIX: rows shorter than the widest row are NaN-padded by pandas, and
# str(nan) == 'nan' previously entered every short transaction as a bogus
# "product", distorting the support/confidence/lift statistics.  Filter the
# NaNs out and iterate the real frame dimensions instead of hard-coding
# 7501 rows x 20 columns.
transactions = [
    [str(item) for item in data_frame.iloc[i] if pandas.notna(item)]
    for i in range(len(data_frame))
]

# NOTE(review): apyori silently ignores unrecognised keyword arguments —
# confirm min_length actually has an effect (apyori documents max_length).
rules = apriori(transactions,
                min_support=0.003,
                min_confidence=0.2,
                min_lift=3,
                min_length=2)

results = list(rules)
# -*- coding: utf-8 -*-

# Hierarchical (agglomerative) clustering of the mall-customers data set.
import sys
sys.path.insert(0, '/home/freddie/git/DataScience/dataPreProcess/')
from data_process import DataPreparation
import scipy.cluster.hierarchy as hierarchy
from sklearn.cluster import AgglomerativeClustering
import matplotlib.pyplot as plot

data_file = """/home/freddie/git/DataScience/data/Machine Learning A-Z Template Folder/Part 4 - Clustering/Section 24 - K-Means Clustering/Mall_Customers.csv"""

prep = DataPreparation(data_file)
# NOTE(review): indep_vars/dep_vars are never used below — X is re-extracted
# directly from data_frame; confirm this call is only needed for its side
# effect of loading data_frame.
indep_vars, dep_vars, data_frame = prep.prepare_data_frame([2, 4], [4, 5])
# Cluster on columns 3 and 4 of the frame — presumably annual income and
# spending score; verify against the CSV header.
X = data_frame.iloc[:, [3, 4]].values

# Ward-linkage dendrogram, typically inspected to pick the cluster count
# (n_clusters=5 below matches the course's choice).
dendrogram = hierarchy.dendrogram(hierarchy.linkage(X, method='ward'))
hc = AgglomerativeClustering(n_clusters=5,
                             affinity='euclidean',
                             linkage='ward')
# y_hc: one integer cluster label (0..4) per row of X.
y_hc = hc.fit_predict(X)

#visualise
# One scatter call per cluster label, 100-pt markers, one colour each.
plot.scatter(X[y_hc == 0, 0], X[y_hc == 0, 1], s=100, c='red', label='Schnoep')
plot.scatter(X[y_hc == 1, 0],
             X[y_hc == 1, 1],
             s=100,
             c='blue',
             label='moderate')
plot.scatter(X[y_hc == 2, 0],
             X[y_hc == 2, 1],
             s=100,
# Beispiel #5 (scrape-artifact separator, commented out so the file parses)
# 0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Dec 27 11:39:32 2017

@author: freddie
"""
import sys
sys.path.insert(0, '/home/freddie/git/DataScience/dataPreProcess/')
from data_process import DataPreparation
from sklearn.linear_model import LinearRegression
import statsmodels.formula.api as sm
import numpy

# Multiple linear regression on the 50-startups data set.
data_file = """/home/freddie/git/DataScience/data/Machine Learning A-Z Template Folder/Part 2 - Regression/Section 5 - Multiple Linear Regression/50_Startups.csv"""
prep = DataPreparation(data_file)
indep_vars, dep_vars, data_frame = prep.prepare_data_frame([0,4], [4,5])

# Encode the categorical column, then drop one dummy column to sidestep the
# dummy-variable trap.  The current library versions handle this on their
# own, but we remove a dummy explicitly anyway.
indep_vars, dep_vars = prep.add_dummy_variables(encode_dep=False, columns=[3])
indep_vars = indep_vars[:,1:]

indep_train, indep_test, dep_train, dep_test = prep.partition_training_test()
# No feature scaling needed — the linear-regression library does that for us.

regressor = LinearRegression().fit(indep_train, dep_train)
dep_pred = regressor.predict(indep_test)

# Residuals on the held-out split.
error = dep_pred - dep_test