models.append(('AdaBoostClassifier', AdaBoostClassifier(random_state=45))) models.append( ('ExtraTreesClassifier', ExtraTreesClassifier(random_state=seed))) models.append(('GradientBoostingClassifier', GradientBoostingClassifier(random_state=seed))) rows = len(feature_list) * len(models) result = pd.DataFrame( columns=['model', 'features', 'train_score', 'test_score']) # result = pd.DataFrame( # np.zeros((rows, 4)), # columns=['model', 'features', 'train_score', 'test_score']) y = dataset.load('target', 'train') clf = VotingClassifier(models) for num_feature, cat_feature in feature_list: print('{} + {}:'.format(num_feature, cat_feature), end='') num_var = dataset.load(num_feature, 'train') cat_var = dataset.load(cat_feature, 'train') X = pd.concat([num_var, cat_var], axis=1) cv_result = cross_validate(clf, X, y, cv=kfold) train_score = cv_result['train_score'].mean() test_score = cv_result['test_score'].mean() print('train score:{:.4f}, test score:{:.4f}'.format(
'''Scale continuous variables to the [0, 1] range with min-max scaling.

NOTE(review): the original docstring said "log transform" (对数转换), but the
code performs min-max scaling; the description is corrected here.

The scaler is fitted on the training split only and then applied to both
train and test, so no test-set statistics leak into the transform.
'''
from util import dataset
from sklearn.preprocessing import MinMaxScaler

print('Loading data......')
train = dataset.load('numeric', 'train').astype(float)
test = dataset.load('numeric', 'test').astype(float)
num_col = dataset.load('numeric', 'feature')

# Fit one scaler per column on the train split and reuse it for test so both
# splits share the same (train-derived) min/max.
for col in num_col:
    scaler = MinMaxScaler()
    scaler.fit(train[col].values.reshape(-1, 1))
    train[col] = scaler.transform(train[col].values.reshape(-1, 1))
    test[col] = scaler.transform(test[col].values.reshape(-1, 1))

print(train.head())
print('=' * 20)
print(test.head())
print('=' * 20)

print('Saving data......')
dataset(numeric_maxmin=train).save('train')
dataset(numeric_maxmin=test).save('test')
print('Done!')
'''Create dummy (one-hot) variables for the categorical features.

Bug fix: the original encoded train and test independently, so a category
present in only one split produced mismatched dummy columns between the
saved train and test frames.  Encoding the concatenated frame and then
splitting it back guarantees both splits share exactly the same columns.
'''
from util import dataset
import pandas as pd

print('Loading data......')
train = dataset.load('categorical', 'train')
test = dataset.load('categorical', 'test')
cat_col = dataset.load('categorical', 'feature')

# Encode on the combined frame so train/test get identical dummy columns,
# then split back by the original row count.
n_train = len(train)
combined = pd.concat([train, test], axis=0)
for col in cat_col:
    dummies = pd.get_dummies(combined[col], prefix=col)
    combined = pd.concat([combined, dummies], axis=1)
    combined.drop([col], axis=1, inplace=True)
train = combined.iloc[:n_train]
test = combined.iloc[n_train:]

print('Saving data......')
dataset(categorical_dummy=train).save('train')
dataset(categorical_dummy=test).save('test')
print('Done!')
# Model-evaluation setup: load the engineered features and prepare a
# cross-validation scheme.  NOTE(review): `dataset`, `pd`, and
# `LogisticRegression` are referenced below but not imported in this visible
# chunk — presumably imported earlier in the file; confirm.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from matplotlib import pyplot as plt

print('Loading Data...')
# Target vector and the engineered feature matrix for the train split.
y = dataset.load('target', 'train')
X = dataset.load('new_feature', 'train')
# X.drop(['source'], axis=1, inplace=True)
# num_var = dataset.load('numeric_maxmin', 'train')
# cat_var = dataset.load('categorical_dummy', 'train')
# X = pd.concat([num_var, cat_var], axis=1)
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.3, random_state=45)
# Fixed seeds keep the CV folds and the classifier reproducible.
lr = LogisticRegression(random_state=45)
kfold = StratifiedKFold(n_splits=10, random_state=45)
# Baseline ROC-AUC (0.5 = random classifier) and starting feature count,
# presumably updated by a selection loop later in the file — confirm.
roc_base = 0.5
base_n = 1
''' 分类变量创建虚拟变量 ''' from util import dataset import pandas as pd from numericprocess import NumProcess print('Loading data......') cat_var = dataset.load('categorical', 'feature') num_var = dataset.load('numeric', 'feature') train = pd.concat( [dataset.load('numeric', 'train'), dataset.load('categorical', 'train')], axis=1) test = pd.concat( [dataset.load('numeric', 'test'), dataset.load('categorical', 'test')], axis=1) train['source'] = 'train' test['source'] = 'test' df = pd.concat([train, test], axis=0) for x in range(len(cat_var)): for y in range(len(cat_var)): if x < y: name = '{}&{}'.format(cat_var[x], cat_var[y]) df[name] = df[cat_var[x]].astype(str) + '&' + df[
import util.cluster as cluster import matplotlib.pyplot as plt from index.external import * import csv import sys import numpy as np from sklearn.cluster import KMeans from sklearn.cluster import AgglomerativeClustering import util.dataset as dataset def argmax(scores): return max(enumerate(scores), key=lambda x: x[1])[0] + 1 X, classes = dataset.load(sys.argv[1]) print 'Samples count: ', len(X), ', class count', len(set(classes)) max_cluster_count = int(sys.argv[2]) clusterizer_type = sys.argv[3] rand_scores = [] jaccard_scores = [] fm_scores = [] f1_scores = [] file = open('res.txt', 'w') for i in range(1, max_cluster_count + 1): #print '----------', i, '----------' clusterizer = None if clusterizer_type == 'kmeans': clusterizer = KMeans(n_clusters=i).fit(np.array(X)) elif clusterizer_type == 'ward':
'''Quick sanity check: load the custom labels of the train split and show them.'''
import numpy as np
from util import dataset

custom_labels = dataset.load('custom_label', 'train')
print(custom_labels)
'''Drop redundant variables from the train and test splits.

DRY fix: the original duplicated the same 12-column drop list verbatim for
train and test, inviting the two lists to drift apart; it is now defined
once and applied to both frames.
'''
from util import dataset
import pandas as pd
import numpy as np

# Feature-name lists by variable kind (categorical / numeric / ordinal).
cat_col = dataset.load('categorical', 'feature')
num_col = dataset.load('numeric', 'feature')
ord_col = dataset.load('order', 'feature')

# Load the data.
print('LOADING......')
train = dataset.load('train', 'all')
test = dataset.load('test', 'all')
y = dataset.load('target', 'train')

# Columns judged redundant/uninformative; dropped from both splits.
DROP_COLS = [
    'TotalWorkingYears', 'YearsAtCompany', 'YearsInCurrentRole',
    'YearsWithCurrManager', 'JobRole', 'StockOptionLevel', 'Gender',
    'DistanceFromHome', 'Education', 'PerformanceRating',
    'RelationshipSatisfaction', 'TrainingTimesLastYear',
]
train.drop(DROP_COLS, axis=1, inplace=True)
test.drop(DROP_COLS, axis=1, inplace=True)
''' from util import dataset import pandas as pd import numpy as np import chimerge def meger(x, inver_list): for i in range(len(inver_list)): if x <= inver_list[i]: return i + 1 return i + 2 print('Loading data......') train = dataset.load('numeric', 'train') test = dataset.load('numeric', 'test') num_col = dataset.load('numeric', 'feature') target = dataset.load('target', 'train') df = pd.concat([train, target], axis=1) for col in num_col: _, interval_list = chimerge.ChiMerge(df, col, 'Attrition') train[col] = train[col].map(lambda x: meger(x, interval_list)) test[col] = test[col].map(lambda x: meger(x, interval_list)) print(train.head()) print('=' * 20) print(test.head())
ExtraTreesClassifier(random_state=seed, n_estimators=20))) models.append(('GradientBoostingClassifier', GradientBoostingClassifier(random_state=seed, n_estimators=100, max_depth=3))) rows = len(feature_list) * len(models) result = pd.DataFrame( columns=['model', 'features', 'train_score', 'test_score']) # result = pd.DataFrame( # np.zeros((rows, 4)), # columns=['model', 'features', 'train_score', 'test_score']) y = dataset.load('target', 'train') for num_feature, cat_feature in feature_list: print('{} + {}:'.format(num_feature, cat_feature)) num_var = dataset.load(num_feature, 'train') cat_var = dataset.load(cat_feature, 'train') X = pd.concat([num_var, cat_var], axis=1) test_num = dataset.load(num_feature, 'test') test_cat = dataset.load(cat_feature, 'test') X_test = pd.concat([test_num, test_cat], axis=1) for name, clf in models: print(name + ':', end='') cv_result = cross_validate(estimator=clf, X=X, y=y, cv=kfold)