コード例 #1
0
def set_missing_ages(p_df):
    """Fill the missing ``Age`` values of ``p_df`` with regression predictions.

    A degree-1 polynomial regression is fitted through ``AbuML`` on the rows
    whose ``Age`` is known, using ``Parch``, ``SibSp``, ``Pclass`` and the
    standardized ``Fare`` as features. Its predictions are then written into
    the rows whose ``Age`` is null.

    :param p_df: DataFrame with ``Age``, ``Fare``, ``Parch``, ``SibSp`` and
        ``Pclass`` columns; its ``Age`` column is updated in place.
    :return: the same ``p_df`` object, with missing ages filled in.
    """
    missing_age = p_df.Age.isnull()
    if not missing_age.any():
        # Nothing to impute: there would be no rows to predict on.
        return p_df

    # Copy the selection so that adding/removing columns below works on an
    # independent frame instead of a slice of the caller's DataFrame.
    age_df = p_df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']].copy()
    # Standardize Fare and replace the raw column with the scaled one
    scaler = preprocessing.StandardScaler()
    age_df['Fare_scaled'] = scaler.fit_transform(age_df['Fare'].values.reshape(
        -1, 1))
    del age_df['Fare']
    # Split into rows with a known age (training) and rows to predict.
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is equivalent.
    known_age = age_df[~missing_age].values
    unknown_age = age_df[missing_age].values
    # Column 0 is Age (the target); the remaining columns are the features
    y_inner = known_age[:, 0]
    x_inner = known_age[:, 1:]
    # NOTE(review): the third argument here is a boolean mask, not a feature
    # DataFrame -- kept as in the original; confirm this is what AbuML expects.
    rfr_inner = AbuML(x_inner, y_inner, age_df.Age.notnull())
    rfr_inner.estimator.polynomial_regression(degree=1)
    reg_inner = rfr_inner.fit()
    predicted_ages = reg_inner.predict(unknown_age[:, 1:])
    p_df.loc[missing_age, 'Age'] = predicted_ages
    return p_df
コード例 #2
0
ファイル: c10.py プロジェクト: 3774257/abu
def sample_105_0():
    """
    Section 10.5 demo: AbuML.

    Sets the module-level ``g_with_date_week_noise`` flag to True, unpacks
    the eight values returned by ``sample_1031_1()``, builds an ``AbuML``
    from the training matrix, the classification labels and the feature
    DataFrame, then prints the cross-validation accuracy and the
    feature-selection result of a random forest classifier.

    :return: None
    """
    global g_with_date_week_noise
    g_with_date_week_noise = True

    # Only three of the eight values are used below; the others are unpacked
    # so the call still requires exactly eight returned items.
    (features, _y_regress, labels, feature_df,
     _test_x, _test_y_regress, _test_labels, _test_feature_df) = sample_1031_1()

    from abupy import AbuML
    # Build AbuML from the x / y matrices plus the feature DataFrame
    classifier = AbuML(features, labels, feature_df)
    # Use a random forest as the classifier
    classifier.estimator.random_forest_classifier()

    # Accuracy from cross validation
    accuracy = classifier.cross_val_accuracy_score()
    print('ml.cross_val_accuracy_score():\n', accuracy)
    # Feature selection
    selection = classifier.feature_selection()
    print('ml.feature_selection():\n', selection)
コード例 #3
0
def sample_105_0():
    """
    Section 10.5 demo: AbuML.

    Sets the module-level ``g_with_date_week_noise`` flag to True, unpacks
    the eight values returned by ``sample_1031_1()``, builds an ``AbuML``
    from the training matrix, the classification labels and the feature
    DataFrame, then prints the cross-validation accuracy and the
    feature-selection result of a random forest classifier.

    :return: None
    """
    global g_with_date_week_noise
    g_with_date_week_noise = True

    # Only three of the eight values are used below; the others are unpacked
    # so the call still requires exactly eight returned items.
    (features, _y_regress, labels, feature_df,
     _test_x, _test_y_regress, _test_labels, _test_feature_df) = sample_1031_1()

    from abupy import AbuML
    # Build AbuML from the x / y matrices plus the feature DataFrame
    classifier = AbuML(features, labels, feature_df)
    # Use a random forest as the classifier
    classifier.estimator.random_forest_classifier()

    # Accuracy from cross validation
    accuracy = classifier.cross_val_accuracy_score()
    print('ml.cross_val_accuracy_score():\n', accuracy)
    # Feature selection
    selection = classifier.feature_selection()
    print('ml.feature_selection():\n', selection)
コード例 #4
0
def train_val(data):
    """Run the whole preprocessing and cross-validation pipeline on ``data``.

    One-hot encodes ``Cabin``, ``Embarked``, ``Sex`` and ``Pclass``,
    standardizes ``Age``, ``Fare``, ``SibSp`` and ``Parch``, selects the
    training columns and cross-validates a logistic classifier built through
    ``AbuML``.

    :param data: DataFrame holding the columns read below (``Survived``,
        ``Pclass``, ``Name``, ``Sex``, ``Age``, ``SibSp``, ``Parch``,
        ``Ticket``, ``Fare``, ``Cabin``, ``Embarked``). It is not modified.
    :return: the value returned by ``cross_val_accuracy_score()``; the
        original discarded it and returned None.
    """
    # One-hot encode the categorical features
    categorical = ['Cabin', 'Embarked', 'Sex', 'Pclass']
    dummies = [pd.get_dummies(data[name], prefix=name) for name in categorical]
    df = pd.concat([data] + dummies, axis=1)
    df.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)
    # Standardize the numeric features; the scaler is re-fitted per column
    scaler = preprocessing.StandardScaler()
    for name in ('Age', 'Fare', 'SibSp', 'Parch'):
        column = df[name].astype(float).values.reshape(-1, 1)
        df[name + '_scaled'] = scaler.fit_transform(column)
    # Select the training columns
    train_df = df.filter(regex='Survived|Age_.*|SibSp_.*|Parch_.*|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is equivalent
    train_np = train_df.values
    # Assumes the first selected column is the label (Survived) -- the
    # remaining columns are the features.
    y = train_np[:, 0]
    x = train_np[:, 1:]
    titanic = AbuML(x, y, train_df)
    titanic.estimator.logistic_classifier()
    return titanic.cross_val_accuracy_score()
コード例 #5
0
def new(df):
    """Build an ``AbuML`` instance from the training columns of ``df``.

    :param df: DataFrame from which the columns matching the regex below are
        selected; the first selected column is used as the label and the
        rest as the feature matrix.
    :return: the newly constructed ``AbuML`` object.
    """
    # Select the columns used as training features
    train_df = df.filter(
        regex=
        'Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*'
    )

    # DataFrame.as_matrix() was removed in pandas 1.0; .values is equivalent
    train_np = train_df.values
    y = train_np[:, 0]
    x = train_np[:, 1:]
    return AbuML(x, y, train_df)
コード例 #6
0
 def __init__(self):
     """Create the AbuML test instance, load the training CSV and set
     ``df`` to None."""
     self.titanic = AbuML.create_test_more_fiter()
     train_csv_path = "./data/titanic/train.csv"
     self.data_train = pd.read_csv(train_csv_path)
     self.df = None
コード例 #7
0
df.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'],
        axis=1,
        inplace=True)

# Select the columns used as training features
train_df = df.filter(
    regex=
    'Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*'
)
train_df.head(1)

from abupy import AbuML
# DataFrame.as_matrix() was removed in pandas 1.0; .values is equivalent
train_np = train_df.values
# First selected column is the label, the rest are the features
y = train_np[:, 0]
x = train_np[:, 1:]
titanic = AbuML(x, y, train_df)

titanic.estimator.logistic_classifier()
titanic.cross_val_accuracy_score()

# Build non-linear features
df['Child'] = (data_train['Age'] <= 10).astype(int)
df['Age*Age'] = data_train['Age'] * data_train['Age']
# The scaler needs a 2-D (n_samples, 1) array, same as for Age*Class below
df['Age*Age_scaled'] = scaler.fit_transform(df['Age*Age'].values.reshape(
    -1, 1))

df['Age*Class'] = data_train['Age'] * data_train['Pclass']
df['Age*Class_scaled'] = scaler.fit_transform(df['Age*Class'].values.reshape(
    -1, 1))

# filter加入新增的特征
train_df = df.filter(
コード例 #8
0

x = np.array([1, 2, 3, 4, 5])
# Floating-point results are compared with a tolerance rather than ``==``
assert np.isclose(np.mean(x), np.sum(x) / float(x.size))

# Standard deviation is the root of the mean squared deviation
assert np.isclose(np.std(x), np.sqrt(np.mean((x - np.mean(x))**2)))

f1 = np.array([0.2, 0.5, 1.1]).reshape(-1, 1)
f2 = np.array([-100.0, 56.0, -77.0]).reshape(-1, 1)

# Standardize by hand: subtract the mean, divide by the standard deviation
f1_scaled = (f1 - np.mean(f1)) / np.std(f1)
f2_scaled = (f2 - np.mean(f2)) / np.std(f2)

import sklearn.preprocessing as preprocessing

# The same scaler object is re-fitted for each feature
scaler = preprocessing.StandardScaler()
f1_sk_scaled = scaler.fit_transform(f1)
f2_sk_scaled = scaler.fit_transform(f2)

# The hand-computed values must agree with StandardScaler's output
assert np.allclose(f1_sk_scaled, f1_scaled) and np.allclose(
    f2_sk_scaled, f2_scaled)

from abupy import AbuML

iris = AbuML.create_test_fiter()

iris.estimator.logistic_classifier(multi_class='multinomial', solver='lbfgs')

iris.cross_val_accuracy_score()
コード例 #9
0
# Standardize the numeric columns; the scaler is re-fitted for each column
df['Fare_scaled'] = scaler.fit_transform(df['Fare'].values.reshape(-1, 1))
df['SibSp_scaled'] = scaler.fit_transform(
    df['SibSp'].astype(float).values.reshape(-1, 1))
df['Parch_scaled'] = scaler.fit_transform(
    df['Parch'].astype(float).values.reshape(-1, 1))

# Select the training columns
train_df = df.filter(
    regex=
    'Survived|Age_.*|SibSp_.*|Parch_.*|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*'
)
# DataFrame.as_matrix() was removed in pandas 1.0; .values is equivalent
train_np = train_df.values
# First selected column is the label, the rest are the features
y = train_np[:, 0]
x = train_np[:, 1:]

titanic = AbuML(x, y, train_df)

titanic.estimator.logistic_classifier()
titanic.cross_val_accuracy_score()

from abupy import ABuMLGrid
# Switch to a decision tree
titanic.estimator.decision_tree_classifier(criterion='entropy')
# Grid search over max_depth values 3..9 for the decision tree
best_score_, best_params_ = ABuMLGrid.grid_search_init_kwargs(
    titanic.estimator.clf,
    titanic.x,
    titanic.y,
    param_name='max_depth',
    param_range=range(3, 10),
    show=True)
コード例 #10
0
df.head(1)

# Check for missing values
df.info()

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Standardize the data
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
# Apply the statistics learned from the training split; re-fitting on the
# test split would scale the two sets differently.
x_test = scaler.transform(x_test)

# Train the model
# Feature names plus the target name, used as the DataFrame's column labels.
# Assumes x has one column per entry of scikit_boston.feature_names -- confirm.
columns = np.append(scikit_boston.feature_names, ['MEDV'])
df = pd.DataFrame(data=np.c_[x_train, y_train], columns=columns)
boston = AbuML(x_train, y_train, df)
boston.estimator.polynomial_regression(degree=1)
reg = boston.fit()

# Predict on the test set
y_pred = reg.predict(x_test)

from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

# Degree-2 (squared) expansion
boston.estimator.polynomial_regression(degree=2)

reg = boston.fit()

y_pred = reg.predict(x_test)
コード例 #11
0
# -*- coding: utf-8 -*-
from abupy import AbuML

# Titanic survival prediction
titanic = AbuML.create_test_more_fiter()

titanic.plot_confusion_matrices()

from abupy import ABuMLExecute
from sklearn import metrics

titanic_y_pred = ABuMLExecute.run_cv_estimator(titanic.get_fiter(),
                                               titanic.x,
                                               titanic.y,
                                               n_folds=10)
confusion_matrix = metrics.confusion_matrix(titanic.y, titanic_y_pred)
# Rows are indexed by the true label, columns by the predicted label
TP = confusion_matrix[1, 1]
TN = confusion_matrix[0, 0]
FP = confusion_matrix[0, 1]
FN = confusion_matrix[1, 0]
# str.format gives the same space-separated output on Python 2 and 3
# (the original used the Python 2 only print statement)
print('{} {} {} {}'.format(TP, TN, FP, FN))

# Floating-point results are compared with a tolerance rather than ``==``
assert abs(
    metrics.accuracy_score(titanic.y, titanic_y_pred) -
    (TP + TN) / float(TP + TN + FP + FN)) < 1e-9

# Precision of the "survived" class
tit_precision = TP / float(TP + FP)
# Recall of the "survived" class
tit_recall = TP / float(TP + FN)

assert abs(
    metrics.precision_score(titanic.y, titanic_y_pred) - tit_precision) < 1e-9
コード例 #12
0
ph3 = dummies_pclass.head(3)
print('ph3=', ph3)

# One-hot encode the remaining categorical columns
dummies_embarked = pd.get_dummies(data_train['Embarked'], prefix='Embarked')
dummies_sex = pd.get_dummies(data_train['Sex'], prefix='Sex')

df = pd.concat([df, dummies_embarked, dummies_sex, dummies_pclass], axis=1)

# noinspection PyUnresolvedReferences
df.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'],
        axis=1,
        inplace=True)

# Select the columns used as training features
train_df = df.filter(
    regex=
    'Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*'
)
print(train_df.head(1))

# DataFrame.as_matrix() was removed in pandas 1.0; .values is equivalent
train_np = train_df.values
# First selected column is the label, the rest are the features
y = train_np[:, 0]
x = train_np[:, 1:]

from abupy import AbuML
titanic = AbuML(x, y, train_df)

titanic.estimator.logistic_classifier()
s = titanic.cross_val_accuracy_score()
print(s)
コード例 #13
0
# coding: utf-8

import pandas as pd  # pandas: Python's data-handling library

from abupy import AbuML

# Titanic survival prediction
titanic = AbuML.create_test_more_fiter()
# NOTE(review): the original built a second object with ``AbuML()`` and no
# arguments, leaving ``titanic`` unused. The estimator is now switched on the
# instance created above -- confirm this was the intent.
titanic.estimator.polynomial_regression()
コード例 #14
0
# Standardize the numeric columns; the scaler is re-fitted for each column
scaler = preprocessing.StandardScaler()
df['Age_scaled'] = scaler.fit_transform(df['Age'].values.reshape(-1, 1))
df['Fare_scaled'] = scaler.fit_transform(df['Fare'].values.reshape(-1, 1))
df['SibSp_scaled'] = scaler.fit_transform(
    df['SibSp'].astype(float).values.reshape(-1, 1))
df['Parch_scaled'] = scaler.fit_transform(
    df['Parch'].astype(float).values.reshape(-1, 1))
# Select the training columns
train_df = df.filter(
    regex=
    'Survived|Age_.*|SibSp_.*|Parch_.*|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*'
)
# DataFrame.as_matrix() was removed in pandas 1.0; .values is equivalent
train_np = train_df.values
# First selected column is the label, the rest are the features
y = train_np[:, 0]
x = train_np[:, 1:]
titanic = AbuML(x, y, train_df)

from abupy import ABuMLGrid

# # Decision tree
# titanic.estimator.decision_tree_classifier()
# # Grid search for the best decision tree depth
# best_score_, best_params_ = ABuMLGrid.grid_search_init_kwargs(titanic.estimator.clf, titanic.x, titanic.y,
#                                                         param_name='max_depth',param_range=range(3, 10), show=True)
#
# titanic.estimator.decision_tree_classifier(**best_params_)
# titanic.cross_val_accuracy_score()

# Random forest
titanic.estimator.random_forest_classifier()
コード例 #15
0
    print('RFE selection')
    print(
        pd.DataFrame(
            {
                'support': selector.support_,
                'ranking': selector.ranking_
            },
            index=fairy_tale_feature.columns[1:]))


feature_selection(estimator, train_x, train_y_classification)

# 3.3
from abupy import AbuML
# Build AbuML from the X / Y matrices and the feature DataFrame
ml = AbuML(train_x, train_y_classification, fairy_tale_feature)
# Use a random forest as the classifier
_ = ml.estimator.random_forest_classifier()

# Accuracy from cross validation
ml.cross_val_accuracy_score()
# Feature selection
ml.feature_selection()

# NOTE(review): the lines below need the name ``abupy`` itself in scope; only
# ``from abupy import AbuML`` is visible here -- confirm ``import abupy``
# exists earlier in the file.
abupy.env.g_enable_ml_feature = True
abupy.env.g_enable_train_test_split = True

# Initial capital: 2,000,000 yuan
read_cash = 2000000
# Base buy amount per trade set to 15 per ten thousand (0.0015)
abupy.beta.atr.g_atr_pos_base = 0.0015
コード例 #16
0
# Select the columns used as training features.
# The alternation bar between Fare_.* and Cabin_.* must not be escaped:
# "\|" matches a literal pipe, so neither group of columns was selected.
train_df = df.filter(
    regex=
    'Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*'
)

print(train_df.head(1))

# Feed the model and check the score
from abupy import AbuML

# DataFrame.as_matrix() was removed in pandas 1.0; .values is equivalent
train_np = train_df.values
# First selected column is the label, the rest are the features
y = train_np[:, 0]
x = train_np[:, 1:]
titanic = AbuML(x, y, train_df)

titanic.estimator.logistic_classifier()
titanic.cross_val_accuracy_score()

# Logistic classification is a linear model: a linear model simply adds up
# the contribution of each feature to the classification result.
# Non-linear feature expressions fall into two categories:
# (1) those expressing the non-linearity of a numeric feature itself
# (2) those expressing a non-linear interaction between features, where the
#     interaction helps the classification result

# The first category applies only to numeric features and can be built in
# several ways, such as polynomial expansion and discretization. Polynomial
# expansion uses higher powers of the original value as features;
# discretization splits a continuous value into intervals and uses membership
# of each interval as a feature. Higher powers make the value's expression
# richer and more descriptive, while discretization lets the model fit and
# approximate the true relationship.

# Split into intervals
df['Child'] = (data_train['Age'] <= 10).astype(int)