return accuracy_score(y, y_pred) def predict(self, X): _X = self.pca_model.transform(X) return self.base_clf.predict(_X) def predict_proba(self, X): _X = self.pca_model.transform(X) return self.base_clf.predict_proba(_X) if __name__ == '__main__': from sklearn import svm import pprint import random copper.project.path = '../../../data-mining/data-science-london/' train = copper.load('train') test = copper.load('test') clf = svm.SVC(kernel='rbf', gamma=0.02, C=10, probability=True) pca_clf = PCA_wrapper(clf, n_components=13) ml = copper.MachineLearning() ml.train = train ml.add_clf(clf, 'svm') ml.add_clf(pca_clf, 'pca') ml.fit() bag = MaxProbaBag() bag.add_clf(ml.clfs) # print(ml.predict_proba(test).head(3)) print(bag.predict_proba(test))
# Script: load the 'cleaned' dataset from the parent project directory,
# print it, and pass it to copper.r.impute.
import copper

# Left disabled in the original script:
# copper.r.install_packages()

copper.project.path = '..'
ds = copper.load('cleaned')

# Call form instead of the Python 2 print statement: a single-argument
# print(...) behaves the same on Python 2 and also parses on Python 3.
print(ds)

copper.r.impute(ds)
# Script: load the 'loans' dataset from the parent project directory,
# call fix_names() on it and fill missing values with method='mean'.
# The plotting and OLS experiments further down are kept disabled.
import copper
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm

copper.project.path = '..'

loans = copper.load('loans')
loans.fix_names()
loans.fillna(method='mean')

# --- Disabled: dataset inspection ---
# print(loans.frame)
# print (loans.metadata)
# loans.role['InterestRate'] = loans.TARGET
# print (loans.frame.skew())
# print (loans.corr())

# --- Disabled: histograms ---
# loans.histogram('Employment.Length')
# plt.draw()
# plt.figure()
# loans.histogram('MonthlyIncome')
# plt.show()

# --- Disabled: OLS models of InterestRate (alternative formulas) ---
# mod = sm.ols(formula='InterestRate ~ FICORange + LoanLength', data=loans.frame)
# mod = sm.ols(formula='InterestRate ~ FICORange + LoanLength + C(LoanPurpose)', data=loans.frame)
# mod = sm.ols(formula='InterestRate ~ C(LoanPurpose)', data=loans.frame)
# res = mod.fit()
# print (res.summary())
# print (res.pvalues)
# Script: load the train/test datasets and show a scatter plot of
# columns 'x39' and 'x40' of the training set.
import copper
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

copper.project.path = '../'

train = copper.load('train.dataset')
# Alternative training inputs, left disabled:
# train = copper.load('train_mean.dataset')
# train = copper.load('train_mean_log.dataset')
# train = copper.load('train_imp.dataset')
test = copper.load('test.dataset')

# Disabled inspection steps:
# print len(test), len(train)
# print train.corr('depend')
# train.histogram('depend')

# Marker size and transparency passed through to train.scatter.
scatter_style = dict(s=100, alpha=0.2)
train.scatter('x39', 'x40', **scatter_style)
# train.frame[train.frame.columns[36:40]].boxplot()
plt.show()

# Disabled: scatter matrix of the last five columns.
# from pandas.tools.plotting import scatter_matrix
# plot1 = scatter_matrix(train.frame[train.frame.columns[-5:]], alpha=0.2, figsize=(8, 8))
# plt.show()
# plt.savefig('fig6.pdf')

# Disabled: radviz plot against 'depend'.
# from pandas.tools.plotting import radviz
# radviz(train.frame[['depend', 'x1', 'x2', 'x3', 'x4']], 'depend')
# plt.show()
# Script: register a WiseRF forest and a depth-5 decision tree with
# copper.MachineLearning on the '*_imp' datasets, fit them, and print
# the result of ml.accuracy().
import copper
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Classifier imports moved up from the middle of the script so that all
# dependencies are declared in one place.
from PyWiseRF import WiseRF
from sklearn import tree

copper.project.path = '../'

# Alternative training inputs, left disabled:
# train = copper.load('train.dataset')
# train = copper.load('train_mean.dataset')
# train = copper.load('train_mean_log.dataset')
train = copper.load('train_imp.dataset')

# Alternative test inputs, left disabled:
# test = copper.load('test.dataset')
# test = copper.load('test_mean.dataset')
test = copper.load('test_imp.dataset')
# print test.inputs

ml = copper.MachineLearning()
ml.set_train(train)
ml.set_test(test)
# NOTE(review): presumably a misclassification cost matrix - confirm the
# row/column meaning against copper.MachineLearning.
ml.costs = [[0, 1], [5, 0]]

rf = WiseRF(n_estimators=50, n_jobs=2)
ml.add_clf(rf, 'RF')

tree_clf = tree.DecisionTreeClassifier(max_depth=5)
ml.add_clf(tree_clf, 'Tree')

ml.fit()

# Call form instead of the Python 2 print statement: a single-argument
# print(...) behaves the same on Python 2 and also parses on Python 3.
print(ml.accuracy())