Ejemplo n.º 1
0
# NOTE(review): this snippet is truncated at both ends — `models`, `seed`,
# `feature_list`, `kfold`, `pd`, `dataset`, `cross_validate`, the classifier
# imports and `VotingClassifier` are defined above the visible chunk, and the
# final print() call below is cut off mid-expression.
# NOTE(review): AdaBoost uses a hard-coded random_state=45 while the other
# models use `seed` — confirm whether that inconsistency is intentional.
models.append(('AdaBoostClassifier', AdaBoostClassifier(random_state=45)))
models.append(
    ('ExtraTreesClassifier', ExtraTreesClassifier(random_state=seed)))
models.append(('GradientBoostingClassifier',
               GradientBoostingClassifier(random_state=seed)))

# One result row per (feature-set, model) combination.
rows = len(feature_list) * len(models)

result = pd.DataFrame(
    columns=['model', 'features', 'train_score', 'test_score'])

# result = pd.DataFrame(
#     np.zeros((rows, 4)),
#     columns=['model', 'features', 'train_score', 'test_score'])

y = dataset.load('target', 'train')

# Soft ensemble of every model appended above.
clf = VotingClassifier(models)

# Cross-validate the voting ensemble on each numeric/categorical feature pair.
for num_feature, cat_feature in feature_list:
    print('{} + {}:'.format(num_feature, cat_feature), end='')
    num_var = dataset.load(num_feature, 'train')
    cat_var = dataset.load(cat_feature, 'train')
    X = pd.concat([num_var, cat_var], axis=1)

    # NOTE(review): indexing cv_result['train_score'] requires
    # return_train_score=True on newer scikit-learn — confirm version.
    cv_result = cross_validate(clf, X, y, cv=kfold)

    train_score = cv_result['train_score'].mean()
    test_score = cv_result['test_score'].mean()

    print('train score:{:.4f}, test score:{:.4f}'.format(
Ejemplo n.º 2
0
'''
Scale the numeric features to [0, 1] with min-max normalisation and save
them back as the ``numeric_maxmin`` feature set.

NOTE(review): the original docstring said "log transform of continuous
variables", which did not match the code (it performs min-max scaling);
the docstring has been corrected.
'''
from util import dataset
from sklearn.preprocessing import MinMaxScaler

print('Loading data......')
# Cast to float so the scaler receives numeric input.
train = dataset.load('numeric', 'train').astype(float)
test = dataset.load('numeric', 'test').astype(float)
num_col = dataset.load('numeric', 'feature')

scaler = MinMaxScaler()
for col in num_col:
    # Fit on the training column only, then apply the same scaling to the
    # test column, so test-set statistics never leak into the transform.
    train[col] = scaler.fit_transform(train[col].values.reshape(-1, 1))
    test[col] = scaler.transform(test[col].values.reshape(-1, 1))

print(train.head())
print('=' * 20)
print(test.head())
print('=' * 20)

print('Saving data......')
dataset(numeric_maxmin=train).save('train')
dataset(numeric_maxmin=test).save('test')

print('Done!')
Ejemplo n.º 3
0
'''
Expand every categorical variable into one-hot dummy columns and save the
result as the ``categorical_dummy`` feature set.
'''
from util import dataset
import pandas as pd

print('Loading data......')
train = dataset.load('categorical', 'train')
test = dataset.load('categorical', 'test')
cat_col = dataset.load('categorical', 'feature')


def _expand_dummies(frame, col):
    # Append the one-hot columns for `col`, then drop the original column.
    encoded = pd.get_dummies(frame[col], prefix=col)
    return pd.concat([frame, encoded], axis=1).drop([col], axis=1)


for col in cat_col:
    train = _expand_dummies(train, col)
    test = _expand_dummies(test, col)

print('Saving data......')
dataset(categorical_dummy=train).save('train')
dataset(categorical_dummy=test).save('test')

print('Done!')
Ejemplo n.º 4
0
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.feature_selection import RFE

from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

from matplotlib import pyplot as plt

# NOTE(review): this chunk is truncated — `dataset`, `pd` and
# `LogisticRegression` are used below but not imported in the visible part,
# and the script continues past the last line shown.
print('Loading Data...')

y = dataset.load('target', 'train')

X = dataset.load('new_feature', 'train')
# X.drop(['source'], axis=1, inplace=True)
# num_var = dataset.load('numeric_maxmin', 'train')
# cat_var = dataset.load('categorical_dummy', 'train')

# X = pd.concat([num_var, cat_var], axis=1)

# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.3, random_state=45)

lr = LogisticRegression(random_state=45)
# NOTE(review): passing random_state without shuffle=True raises a
# ValueError on scikit-learn >= 0.24 — confirm the target sklearn version.
kfold = StratifiedKFold(n_splits=10, random_state=45)
# Baseline ROC-AUC (random classifier) and starting feature count,
# presumably for the RFE loop that follows below this chunk — verify.
roc_base = 0.5
base_n = 1
Ejemplo n.º 5
0
'''
Build pairwise interaction features between categorical variables.

NOTE(review): the original docstring said "create dummy variables for
categorical variables", which does not match the code below — the loop
builds string-concatenation interaction columns instead.
'''
from util import dataset
import pandas as pd
# NOTE(review): NumProcess is not used in the visible chunk — possibly used
# in the truncated remainder of the script.
from numericprocess import NumProcess

print('Loading data......')

cat_var = dataset.load('categorical', 'feature')
num_var = dataset.load('numeric', 'feature')

train = pd.concat(
    [dataset.load('numeric', 'train'),
     dataset.load('categorical', 'train')],
    axis=1)
test = pd.concat(
    [dataset.load('numeric', 'test'),
     dataset.load('categorical', 'test')],
    axis=1)

# Tag each row so train/test can be separated again after joint processing.
train['source'] = 'train'
test['source'] = 'test'

df = pd.concat([train, test], axis=0)

# For every unordered pair of categorical columns, create a combined
# "colA&colB" feature by concatenating the two values as strings.
# NOTE(review): the chunk is truncated mid-expression on the last line.
for x in range(len(cat_var)):
    for y in range(len(cat_var)):
        if x < y:
            name = '{}&{}'.format(cat_var[x], cat_var[y])
            df[name] = df[cat_var[x]].astype(str) + '&' + df[
import util.cluster as cluster
import matplotlib.pyplot as plt
from index.external import *
import csv
import sys
import numpy as np
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
import util.dataset as dataset


def argmax(scores):
    """Return the 1-based position of the largest score (first one on ties)."""
    best_index = max(range(len(scores)), key=lambda i: scores[i])
    return best_index + 1


# NOTE(review): Python 2 syntax (print statements); the chunk is also
# truncated — the 'ward' branch body and the rest of the loop are cut off.
# Load samples and their ground-truth class labels from the dataset named
# on the command line.
X, classes = dataset.load(sys.argv[1])
print 'Samples count: ', len(X), ', class count', len(set(classes))

max_cluster_count = int(sys.argv[2])
clusterizer_type = sys.argv[3]
# External-index scores collected per cluster count (filled further below,
# presumably — the loop body is truncated).
rand_scores = []
jaccard_scores = []
fm_scores = []
f1_scores = []
# NOTE(review): `file` shadows the builtin and is never closed in the
# visible chunk.
file = open('res.txt', 'w')
# Try every cluster count from 1 to the requested maximum.
for i in range(1, max_cluster_count + 1):
    #print '----------', i, '----------'
    clusterizer = None
    if clusterizer_type == 'kmeans':
        clusterizer = KMeans(n_clusters=i).fit(np.array(X))
    elif clusterizer_type == 'ward':
Ejemplo n.º 7
0
# NOTE: numpy is unused in this snippet but kept in case the surrounding
# project relies on the import.
import numpy as np
from util import dataset

# Quick sanity check: load and display the custom training labels.
custom_labels = dataset.load('custom_label', 'train')
print(custom_labels)
Ejemplo n.º 8
0
'''
Drop low-value variables from both the train and the test sets.
'''
from util import dataset
import pandas as pd
import numpy as np

# Feature-group column lists.
cat_col = dataset.load('categorical', 'feature')
num_col = dataset.load('numeric', 'feature')
ord_col = dataset.load('order', 'feature')

# Load data.
print('LOADING......')
train = dataset.load('train', 'all')
test = dataset.load('test', 'all')
y = dataset.load('target', 'train')

# Single shared list keeps the train/test drops in sync (the original
# duplicated this literal for each frame, inviting drift).
DROP_COLS = [
    'TotalWorkingYears', 'YearsAtCompany', 'YearsInCurrentRole',
    'YearsWithCurrManager', 'JobRole', 'StockOptionLevel', 'Gender',
    'DistanceFromHome', 'Education', 'PerformanceRating',
    'RelationshipSatisfaction', 'TrainingTimesLastYear'
]
train.drop(DROP_COLS, axis=1, inplace=True)
test.drop(DROP_COLS, axis=1, inplace=True)
Ejemplo n.º 9
0
'''
Discretize numeric variables into bins with ChiMerge.

NOTE(review): the original file had an unmatched triple quote here, which
turned the rest of the module into a string literal; the docstring has been
completed so the module parses.
'''
from util import dataset
import pandas as pd
import numpy as np
import chimerge


def meger(x, inver_list):
    """Return the 1-based bin index for ``x`` given ascending upper bounds.

    ``inver_list`` holds interval upper bounds in ascending order; a value
    above every bound — or an empty bound list — falls into the last bin.
    """
    for i, bound in enumerate(inver_list):
        if x <= bound:
            return i + 1
    # The original `return i + 2` raised NameError when inver_list was
    # empty; len(inver_list) + 1 yields the same value and also covers
    # the empty case.
    return len(inver_list) + 1


print('Loading data......')
train = dataset.load('numeric', 'train')
test = dataset.load('numeric', 'test')
num_col = dataset.load('numeric', 'feature')

target = dataset.load('target', 'train')

# ChiMerge needs the feature together with its label column.
df = pd.concat([train, target], axis=1)

# Compute ChiMerge split points per numeric column on the training data,
# then map both train and test values to their 1-based bin index.
# NOTE(review): 'Attrition' is presumably the target column name inside
# `target` — verify against the dataset module.
for col in num_col:
    _, interval_list = chimerge.ChiMerge(df, col, 'Attrition')
    train[col] = train[col].map(lambda x: meger(x, interval_list))
    test[col] = test[col].map(lambda x: meger(x, interval_list))

print(train.head())
print('=' * 20)
print(test.head())
Ejemplo n.º 10
0
               ExtraTreesClassifier(random_state=seed, n_estimators=20)))
# NOTE(review): this chunk is truncated at both ends — the models.append(...)
# call above starts before the visible part, and the inner loop below is cut
# off after the cross_validate call.
models.append(('GradientBoostingClassifier',
               GradientBoostingClassifier(random_state=seed,
                                          n_estimators=100,
                                          max_depth=3)))

# One result row per (feature-set, model) combination.
rows = len(feature_list) * len(models)

result = pd.DataFrame(
    columns=['model', 'features', 'train_score', 'test_score'])

# result = pd.DataFrame(
#     np.zeros((rows, 4)),
#     columns=['model', 'features', 'train_score', 'test_score'])

y = dataset.load('target', 'train')

# Cross-validate every model on every numeric/categorical feature pairing.
for num_feature, cat_feature in feature_list:
    print('{} + {}:'.format(num_feature, cat_feature))
    num_var = dataset.load(num_feature, 'train')
    cat_var = dataset.load(cat_feature, 'train')
    X = pd.concat([num_var, cat_var], axis=1)

    # Test-set features assembled here but not used in the visible chunk —
    # presumably consumed after the truncation point.
    test_num = dataset.load(num_feature, 'test')
    test_cat = dataset.load(cat_feature, 'test')

    X_test = pd.concat([test_num, test_cat], axis=1)

    for name, clf in models:
        print(name + ':', end='')
        cv_result = cross_validate(estimator=clf, X=X, y=y, cv=kfold)