Example #1
def kmeans_roughly_average_categories_part1(
        data, num_keep, columns, original_data,
        init_n_clusters):  # data: only the feature columns used for factor analysis
    if type(columns) != dict:
        raise Exception('columns type error: must be a dict')

    # 1. Factor analysis:
    fa_score = factor_analysis(data, num_keep)  # factor scores (DataFrame)
    ft.df_change_colname(fa_score, columns)

    # 1) Initial clustering: see how the factor scores cluster, which reflects the true distribution of the data
    kmeans = KMeans(n_clusters=init_n_clusters)  # MiniBatchKMeans() can be used for mini-batch processing
    # kmeans = cluster.KMeans(n_clusters=3, init='random', n_init=1)
    result = kmeans.fit(fa_score)  # result: k-means fitted on the factor-analysis scores
    original_data = original_data.join(pd.DataFrame(result.labels_))
    original_data = original_data.rename(columns={0: "clustor"})
    # pie chart of cluster sizes
    original_data.clustor.value_counts().plot(kind='pie')
    plt.show()

    # 2) Test the skewness of the factor scores
    skew, kurt, var_x_ln = ft.skew_distribution_test(fa_score)
    if len(var_x_ln) > 0:
        from sklearn import preprocessing

        # Tukey normal-score transform (for the clustering model): maps the scores onto a normal distribution.
        # PowerTransformer's Yeo-Johnson and Box-Cox (which requires positive values) are not suitable here.
        quantile_transformer = preprocessing.QuantileTransformer(
            output_distribution='normal', random_state=0)
        fa_score = quantile_transformer.fit_transform(fa_score)
        fa_score = pd.DataFrame(fa_score)
        fa_score = fa_score.rename(columns=columns)

        skew, kurt, var_x_ln = ft.skew_distribution_test(fa_score)

    return fa_score
    def __init__(self, df_path, target_tag, pipe_x, hidden_dim, epochs, lr,
                 *args, **kwargs):
        self.all_sequences, groups = get_sequences(df_path)
        train, test = GroupKFold(n_splits=5).split(
            X=self.all_sequences, groups=groups).__next__()
        self.train = train.astype(dtype=np.int32)
        self.test = test.astype(dtype=np.int32)
        ft.TagAdder.target_tag = target_tag
        super().__init__(
            module=LSTM_SA,
            module__hidden_dim=hidden_dim,
            module__transform_pipe=pipe_x,
            max_epochs=epochs,
            batch_size=-1,
            lr=lr,
            optimizer=torch.optim.SGD,
            criterion=torch.nn.CrossEntropyLoss,
            train_split=lambda *_:
            ((self.train, np.zeros(len(self.train), dtype=np.int32)),
             (self.test, np.zeros(len(self.test), dtype=np.int32))),
            *args,
            **kwargs)
        self.df_path = df_path
        self.pipe_y = Pipeline(
            steps=[('tag', ft.TagSelector(target_tag=target_tag)),
                   ('ir', ft.IndexRemover())])
Example #3
def hierarchical_clustering(data, num_keep, columns,
                            labels):  # data: only the feature columns used for factor analysis
    if type(columns) != dict:
        raise Exception('columns type error: must be a dict')
    if type(labels) != list and type(labels) != np.ndarray:
        raise Exception('labels type error: must be a list or np.ndarray')

    # 1. Factor analysis:
    fa_score = factor_analysis(data, num_keep)  # factor scores (DataFrame)
    ft.df_change_colname(fa_score, columns)

    # 2. Hierarchical clustering: AGNES (agglomerative)
    import scipy.cluster.hierarchy as sch
    from pylab import mpl
    mpl.rcParams['font.sans-serif'] = ['SimHei']  # default font (SimHei) so that Chinese labels render correctly
    mpl.rcParams['axes.unicode_minus'] = False  # keep minus signs from rendering as boxes in saved figures
    import matplotlib.pyplot as plt

    # 1.2 Pass the coordinate matrix directly to linkage
    Z = sch.linkage(fa_score, metric='euclidean', method='ward')

    # Render the hierarchical clustering as a dendrogram and save it as a PNG
    P = sch.dendrogram(Z, labels=labels)  # labels accepts a list or numpy.ndarray
    plt.savefig('plot_dendrogram1.png')  # save before plt.show(), otherwise the saved figure is blank
    plt.show()

    cluster = sch.fcluster(Z, t=1)
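
    # A hedged continuation sketch (not part of the original function): cut the same linkage
    # matrix Z into a fixed number of flat clusters and attach the labels to the factor scores;
    # the cluster count of 4 is only an illustration value.
    flat_labels = sch.fcluster(Z, t=4, criterion='maxclust')  # one flat-cluster label per row of fa_score
    fa_score_with_cluster = fa_score.copy()
    fa_score_with_cluster['cluster'] = flat_labels
    print(fa_score_with_cluster['cluster'].value_counts())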
Example #4
def tag_scorer(X, df, pipe, params, cv, target_tags, data_prefix, pipe_prefix,
               params_prefix, adders, outdir):
    prefix = data_prefix + '_' + pipe_prefix + '_' + params_prefix
    ft.load_adders(data_prefix, adders)
    row_count = df.shape[0]
    priors = {
        tag: np.count_nonzero(df[tag].to_numpy()) / row_count
        for tag in target_tags
    }
    print('Running %s GridSearch for tags: ' % prefix, target_tags)
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=cv)
    for tag in target_tags:
        ft.TagAdder.target_tag = tag
        search = GridSearchCV(pipe,
                              params,
                              cv=skf,
                              scoring=['f1', 'recall', 'precision'],
                              refit='f1')
        search.fit(X, df[tag])
        print("Tag: %s, Prior: %0.3f, Best parameter (CV score=%0.3f):" %
              (tag, priors[tag], search.best_score_))
        print('Tag: ', tag, search.best_params_)
        results = deepcopy(search.cv_results_)

        results['prior'] = [priors[tag]] * len(results['params'])
        results['improvement_oldbad'] = [
            results['mean_test_f1'][i] / results['prior'][i]
            for i in range(len(results['params']))
        ]
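        # f1_naive below is the F1 score of a classifier that always predicts the positive class:
        # its precision equals the prior and its recall is 1, so F1 = 2*prior / (1 + prior).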
        results['f1_naive'] = [
            results['prior'][i] * 2.0 / (1.0 + results['prior'][i])
            for i in range(len(results['params']))
        ]
        results['f1_imprv'] = [
            results['mean_test_f1'][i] / results['f1_naive'][i]
            for i in range(len(results['params']))
        ]
        results['precision*recall'] = [
            results['mean_test_precision'][i] * results['mean_test_recall'][i]
            for i in range(len(results['params']))
        ]
        results['pr_imprv'] = [
            results['precision*recall'][i] / results['prior'][i]
            for i in range(len(results['params']))
        ]

        pd.DataFrame(results).to_csv(outdir + prefix + '_' + tag + '.csv')
        print('DONE: %s ' % (outdir + prefix + '_' + tag + '.csv'))
Example #5
def iv_visualization(bins_of_col):
    ivlist = []  # IV of each variable
    index = []  # x-axis tick labels
    collist = []
    for i, col in enumerate(bins_of_col):
        print("x" + str(i + 1), col, bins_of_col[col][1])
        ivlist.append(bins_of_col[col][1])
        index.append("x" + str(i + 1))
        collist.append(col)

    fig1 = plt.figure(1, figsize=(8, 5))
    ax1 = fig1.add_subplot(1, 1, 1)
    x = np.arange(len(index)) + 1
    ax1.bar(
        x, ivlist, width=.4
    )  # draw the bar chart; alternatives: ax1.bar(range(len(index)), ivlist, width=0.4)  # ax1.bar(x, ivlist, width=.04)
    ax1.set_xticks(x)
    ax1.set_xticklabels(index, rotation=0, fontsize=15)
    ax1.set_ylabel('IV', fontsize=16)  # IV (Information Value)
    # add numeric value labels above each bar
    for a, b in zip(x, ivlist):
        plt.text(a,
                 b + 0.01,
                 '%.4f' % b,
                 ha='center',
                 va='bottom',
                 fontsize=12)
    plt.show()

    a = np.array(index)
    b = np.array(collist)
    c = np.array(ivlist)
    d = np.vstack([a, b, c])
    df_ = pd.DataFrame(d).T
    ft.df_change_colname(df_, {0: "x_axis", 1: "feature", 2: "iv"})
    df_ = ft.data_sort(df_, ["iv"], [False])
    return df_
Example #6
def lstm_sa_run(data_prefix, target_tag, pipe, hidden_dim):
    ft.load_adders(data_prefix)
    df_path = ft.exp_path + 'data/' + data_prefix + '.csv'
    pipes_dir = ft.exp_path + 'pipes/'
    results_dir = ft.exp_path + 'results/'
    results_file = results_dir + '_'.join(
        ['lstmsa', data_prefix, pipe,
         str(hidden_dim), target_tag]) + '.csv'
    with open(pipes_dir + pipe + '_pipe.txt', 'r') as pipe_file:
        pipe_x = eval(pipe_file.read())
    reporter = st.LSTM_SA_REPORTER(results_path=results_file)
    stopper = skorch.callbacks.EarlyStopping(monitor='v.f1',
                                             patience=50,
                                             threshold=0.0001,
                                             threshold_mode='rel',
                                             lower_is_better=False)
    trainer = st.LSTM_SA_TRAINER(df_path=df_path,
                                 target_tag=target_tag,
                                 pipe_x=pipe_x,
                                 lr=0.01,
                                 hidden_dim=hidden_dim,
                                 epochs=1000,
                                 callbacks=[reporter, stopper])
    trainer.fit()
Example #7
def woe_mapping(data,
                y_name,
                bins_of_col,
                woeall,
                is_validation=True,
                save_path=None,
                encoding="UTF-8"):
    model_woe = pd.DataFrame(index=data.index)

    # Bin each raw feature with the intervals stored in bins_of_col, then map the WOE values onto the binned results with map()
    for col in bins_of_col:
        # Note: before calling pd.cut, make sure the bin edges start with -np.inf and end with np.inf
        model_woe[col] = pd.cut(data[col],
                                bins_of_col[col][0]).map(woeall[col])

    # Append the label column (only the train and validation/test sets carry the label Y; the real submission set does not)
    if is_validation:
        model_woe[y_name] = data[y_name]  # this is now the modeling data

    # save the modeling data
    if save_path is not None:
        ft.writeFile_outData(model_woe, save_path, encoding)

    return model_woe
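
# A minimal usage sketch of woe_mapping (the names below are illustrative, not from the original project):
# bins_of_col maps each feature to (bin_edges, iv) with edges padded by -np.inf/np.inf, and woeall maps
# each feature to a Series of WOE values indexed by the pd.cut interval categories.
import numpy as np
import pandas as pd

demo = pd.DataFrame({'age': [22, 37, 55, 71], 'SeriousDlqin2yrs': [0, 1, 0, 1]})
demo_bins = {'age': ([-np.inf, 30, 60, np.inf], 0.12)}  # (bin edges, IV) -- the IV value is made up
demo_woe = {'age': pd.Series([-0.4, 0.1, 0.6],
                             index=pd.cut(demo['age'], demo_bins['age'][0]).cat.categories)}
woe_encoded = woe_mapping(demo, 'SeriousDlqin2yrs', demo_bins, demo_woe, is_validation=True)
print(woe_encoded)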
Example #8
os.chdir(
    r"E:\soft\Anaconda\Anaconda_Python3.6_code\data_analysis\101_Sklearn\7_SVM"
)

# In[]:
weather = pd.read_csv(r"weatherAUS5000.csv", index_col=0)  # use the first column as the index

X = weather.iloc[:, :-1]
Y = weather.iloc[:, -1]

# In[]:
import FeatureTools as ft

print(X.isnull().mean())  # fraction of missing values per column: isnull().sum() / X.shape[0]
print(ft.missing_values_table(X))

print(set(Y), Y.isnull().sum())

# In[]:
# Ori_Xtrain, Ori_Xtest, Ori_Ytrain, Ori_Ytest = train_test_split(X,Y,test_size=0.3,random_state=420)  # random split
Ori_Xtrain, Ori_Xtest, Ori_Ytrain, Ori_Ytest = ft.data_segmentation_skf(
    X, Y, test_size=0.3)

Xtrain = Ori_Xtrain.copy()
Xtest = Ori_Xtest.copy()
Ytrain = Ori_Ytrain.copy()
Ytest = Ori_Ytest.copy()

# restore the index after the split
for i in [Xtrain, Xtest, Ytrain, Ytest]:
    i.index = range(i.shape[0])
Example #9
import numpy as np
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler,MinMaxScaler
# import xgboost as xgb
import re
from sklearn.metrics import roc_auc_score
from sklearn.metrics import auc

import matplotlib.pyplot as plt
import seaborn as sns
#import pandas_profiling
color = sns.color_palette()
sns.set_style('darkgrid')

from math import isnan
import FeatureTools as ft
ft.set_file_path(r"D:\视频教程\8、项目\项目列表\比赛\和鲸\携程酒店浏览客户流失概率预测")
import Tools_customize as tc
import Binning_tools as bt

# In[]:
train_data = ft.readFile_inputData('userlostprob.txt', parse_dates = ['d','arrival'], sep='\t')
# In[]:
train_data.describe()
# In[]:
# Missing-value overview:
mis_val_table_ren_columns = ft.missing_values_table(train_data)
# In[]:
# Drop the column historyvisit_7ordernum, whose missing rate is 88%
_, train_data = ft.missing_values_table(train_data, percent=88)
# In[]:
# Days the hotel is booked in advance: intuitively, the later the booking, the less likely the customer is to churn
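# A hedged sketch of that idea (the churn-label column name 'label' is an assumption about this dataset):
# compute how many days in advance the hotel was booked, then look at the churn rate per bucket.
import pandas as pd
train_data['advance_days'] = (train_data['arrival'] - train_data['d']).dt.days
advance_bins = pd.cut(train_data['advance_days'], [-1, 0, 1, 3, 7, 30, 365])
print(train_data.groupby(advance_bins)['label'].mean())  # churn rate per advance-booking bucket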
Example #10
# -*- coding: utf-8 -*-
"""
Created on Sun Jan  5 09:48:31 2020

@author: dell
"""

import numpy as np
import pandas as pd
import os 
import operator
import timeit
import matplotlib.pyplot as plt

import FeatureTools as ft
ft.set_file_path(r"E:\soft\Anaconda\Anaconda_Python3.6_code\data_analysis\103_Recommend\BAT\data\Adult")
import Tools_customize as tc
import Data_Samping as ds

# In[]:
names = ["age", "workclass", "fnlwgt", "education_level", "education-num", "marital-status",
         "occupation", "relationship", "race", "sex", "capital-gain", 
         "capital-loss", "hours-per-week", "native-country", "income"]
dtype = {
            "age":np.int32,
            "education-num":np.int32,
            "capital-gain":np.int32,
            "capital-loss":np.int32,
            "hours-per-week":np.int32
         }
from sklearn.metrics import mean_squared_error as MSE, r2_score
from sklearn.metrics import auc

import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

# import pandas_profiling
color = sns.color_palette()
sns.set_style('darkgrid')

from math import isnan
import FeatureTools as ft

ft.set_file_path(
    r"E:\soft\Anaconda\Anaconda_Python3.6_code\data_analysis\100_Data_analysis_competition\3_TianChi\1_Used_car_transaction_price_prediction\data"
)
import Tools_customize as tc
import Binning_tools as bt
import RocLib as rlb

# In[]:
# Generate some simple sample data; test_pre_i is the prediction of the i-th model
test_pre1 = [1.2, 3.2, 2.1, 6.2]
test_pre2 = [0.9, 3.1, 2.0, 5.9]
test_pre3 = [1.1, 2.9, 2.2, 6.0]

# y_test_true is the ground-truth values
y_test_true = [1, 3, 2, 6]
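
# A minimal sketch of what these toy vectors are typically used for (an assumption about intent,
# not original code): average the three model predictions and compare the MAE of the blend with
# each single model.
import numpy as np
from sklearn.metrics import mean_absolute_error

pred_mean = np.mean([test_pre1, test_pre2, test_pre3], axis=0)
for name, pred in [('model1', test_pre1), ('model2', test_pre2),
                   ('model3', test_pre3), ('mean blend', pred_mean)]:
    print(name, 'MAE:', mean_absolute_error(y_test_true, pred))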

Example #12
exp.plot('Pred', 'resid',
         kind='scatter')  # Pred = β*Income; as the prediction grows, the residuals fan out into a trumpet shape (heteroscedasticity)
ana1.summary()
# In[]:
Xtrain = exp[['Age', 'Income', 'dist_home_val']]
Ytrain = exp[['avg_exp']]

reg = LR().fit(Xtrain, Ytrain)
yhat = reg.predict(Xtrain)  # predictions on the training set
print(reg.score(Xtrain, Ytrain))

predict = pd.DataFrame(yhat, columns=['Pred'])
print(Ytrain.dtypes, predict.dtypes)

y = Ytrain.copy()
ft.recovery_index([y])
# resid = pd.DataFrame((y['avg_exp'] - predict["Pred"]), columns=['resid'])
resid = pd.DataFrame(y['avg_exp'].sub(predict["Pred"]), columns=['resid'])

resid_1 = pd.concat([predict, resid], axis=1)
resid_1.plot('Pred', 'resid', kind='scatter')

print(ft.r2_score_customize(Ytrain, yhat, 1))
print(ft.r2_score_customize(Ytrain, yhat, 2))
print(ft.adj_r2_customize(Ytrain, yhat, Xtrain.shape[1], 2))

# In[15]:
# Fit the linear regression with the ols class and obtain predictions and residuals
# With heteroscedasticity, textbooks suggest weighted least squares, but in practice the most common fix is to take the log of the response y
# R-squared is 0.454
ana1 = ols('avg_exp ~ Income', exp).fit()
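# A hedged sketch of the log-of-y fix mentioned above (the column name avg_exp_ln is illustrative,
# not from the original script): refit the model on log(avg_exp) and compare the R-squared values.
import numpy as np
exp['avg_exp_ln'] = np.log(exp['avg_exp'])
ana_ln = ols('avg_exp_ln ~ Income', exp).fit()
print(ana1.rsquared, ana_ln.rsquared)  # the log model usually stabilizes the residual variance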
Example #13
warnings.filterwarnings('ignore')
import itertools
import matplotlib.gridspec as gridspec

from math import isnan
import FeatureTools as ft
# ft.set_file_path(r"E:\code\python_workSpace\idea_space\meituan_hotel\analysis")
import Tools_customize as tc
import Binning_tools as bt
from sqlalchemy import create_engine

# In[]:
# i.meituan.com
# 1. Read the table
sql = "select * from bar_view"
bar_data = ft.read_from_mysql(sql, "root", "123456", "127.0.0.1", "3306",
                              "meituan_bar")

# In[]:
mis_val_table_ren_columns = ft.missing_values_table(bar_data)

# In[]:
bar_data['bar_phone'] = bar_data['bar_phone'].map(lambda x: np.nan
                                                  if not x else x)
bar_data['bar_address'] = bar_data['bar_address'].map(lambda x: np.nan
                                                      if not x else x)
bar_data['tuan'] = bar_data['tuan'].map(lambda x: np.nan if not x else x)
bar_data['juan'] = bar_data['juan'].map(lambda x: np.nan if not x else x)
bar_data['wai'] = bar_data['wai'].map(lambda x: np.nan if not x else x)

bar_data['package_id'] = bar_data['package_id'].map(lambda x: np.nan
                                                    if not x else x)
Example #14
def train_test_phase(train_data, test_data, tags, pipe_prefix, adders,
                     tag_params, tag_classifiers, filename):
    pipes_dir = ft.exp_path + 'pipes/'
    train_df = pd.read_csv(ft.exp_path + 'data/' + train_data + '.csv',
                           encoding="utf-8",
                           keep_default_na=False)
    test_df = pd.read_csv(ft.exp_path + 'data/' + test_data + '.csv',
                          encoding="utf-8",
                          keep_default_na=False)
    train_tags = sorted(
        set(train_df.columns.values.tolist()) - set([
            'Unnamed: 0', 'node_id', 'tree_id', 'timestamp', 'author', 'text',
            'parent'
        ]))
    test_tags = sorted(
        set(test_df.columns.values.tolist()) - set([
            'Unnamed: 0', 'node_id', 'tree_id', 'timestamp', 'author', 'text',
            'parent'
        ]))
    if train_tags != test_tags:
        print(
            'Error! Train & test tags are not identical. Incompatible datasets.'
        )
        return
    if tags is None:
        tags = test_tags
    with open(pipes_dir + pipe_prefix + '.txt', 'r') as pipe_file:
        transform_pipe = re.sub(r",?\s*\('cls', ft.ClsAdder\(\)\)", '',
                                pipe_file.read())
    # TRAIN PHASE #
    X_trn = np.arange(len(train_df.index)).reshape((len(train_df.index), 1))
    train_pipe = eval(transform_pipe)
    ft.load_adders(train_data, adders)
    trained_classifiers = {}
    for tag, params in tag_params.items():
        print('Running TRAIN PHASE for: %s' % tag)
        ft.TagAdder.target_tag = tag
        train_pipe.set_params(**params)
        cls = ft.ClsAdder(tag_classifiers[tag])
        transformed_X = train_pipe.fit_transform(X_trn)
        cls.fit(transformed_X, train_df[tag])
        trained_classifiers[tag] = cls

    # TEST PHASE #
    X_tst = np.arange(len(test_df.index)).reshape((len(test_df.index), 1))
    test_pipe = eval(transform_pipe)
    ft.load_adders(test_data, adders)
    scores = defaultdict(dict)
    for tag, cls in trained_classifiers.items():
        print('Running TEST PHASE for: %s' % tag)
        ft.TagAdder.target_tag = tag
        test_pipe.set_params(**tag_params[tag])
        transformed_X = test_pipe.fit_transform(X_tst)
        y_predicted = cls.predict(transformed_X)
        y_true = test_df[tag]
        scores['prior'][tag] = np.mean(y_true)
        scores['f1_naive'][tag] = metrics.f1_score(y_true, [1] * len(y_true))
        f1 = metrics.f1_score(y_true, y_predicted)
        scores['f1_imprv'][tag] = f1 / scores['f1_naive'][tag]
        scores['f1'][tag] = f1
        scores['precision'][tag] = metrics.precision_score(y_true, y_predicted)
        scores['recall'][tag] = metrics.recall_score(y_true, y_predicted)
        scores['params'][tag] = str(tag_params[tag])
        scores['cls'][tag] = tag_classifiers[tag]
        scores['pipe'][tag] = transform_pipe

    priors = [scores['prior'][tag] for tag in tags]
    avg_keys = ['prior', 'f1_naive', 'f1_imprv', 'f1', 'precision', 'recall']
    weights_variations = {
        '0': np.ones(len(priors)),
        '1': np.array(priors),
        '2': np.sqrt(priors),
        '3': np.cbrt(priors)
    }
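    # average each score across tags under several weighting schemes:
    # '0' uniform, '1' prior-weighted, '2' sqrt(prior), '3' cbrt(prior)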
    for key in avg_keys:
        for w_type, weights in weights_variations.items():
            vals = [scores[key][tag] for tag in tags]
            scores[key]['average' + w_type] = np.average(vals, weights=weights)
    pd.DataFrame(scores).to_csv(filename)
    print('DONE:', filename)
# -*- coding: utf-8 -*-
"""
Created on Thu Jan  2 13:46:07 2020

@author: dell
"""

import numpy as np
import pandas as pd
import os
import operator
import timeit

import FeatureTools as ft
ft.set_file_path(
    r"E:\soft\Anaconda\Anaconda_Python3.6_code\data_analysis\103_Recommend\BAT\data"
)
import Tools_customize as tc
import Data_Samping as ds

# In[]:
#graph_data = ft.get_graph_from_data("ml-1m/ratings.txt")
graph_data = ft.get_graph_from_data("log.txt", ",")


# In[]:
def personal_rank(graph, root, alpha, iter_num, recom_num=10):
    '''
    Args:
        graph: user item graph
        root: the fixed user for which to recom
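    '''
    # The function body is truncated in this excerpt; below is a hedged sketch of the usual
    # PersonalRank iteration on a bipartite user-item graph (dict of dicts), not the original code.
    rank = {node: 0.0 for node in graph}
    rank[root] = 1.0
    for _ in range(iter_num):
        tmp = {node: 0.0 for node in graph}
        for node, neighbors in graph.items():
            for neighbor in neighbors:
                tmp[neighbor] += alpha * rank[node] / len(neighbors)
        tmp[root] += 1 - alpha  # the restart probability mass returns to the root user
        rank = tmp
    # keep the highest-ranked nodes the root has not interacted with yet
    # (in practice you would also filter to item nodes only)
    candidates = {node: score for node, score in rank.items()
                  if node not in graph[root] and node != root}
    return sorted(candidates.items(), key=lambda kv: kv[1], reverse=True)[:recom_num]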
Example #16
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.svm import SVR
from sklearn.decomposition import PCA, FastICA, FactorAnalysis, SparsePCA

import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error

from math import isnan
import FeatureTools as ft

ft.set_file_path(
    r"E:\soft\Anaconda\Anaconda_Python3.6_code\data_analysis\100_Data_analysis_competition\3_TianChi\1_Used_car_transaction_price_prediction\data")
import Tools_customize as tc
import Binning_tools as bt

# In[]:
train_data_6 = ft.readFile_inputData('train_data_6.csv', index_col=0)
test_data_6 = ft.readFile_inputData('test_data_6.csv', index_col=0)

# In[]:
feature_names = train_data_6.columns.tolist()
feature_names.remove('price')
X_data = train_data_6[feature_names]
Y_data = train_data_6['price']
X_test = test_data_6[feature_names]

Example #17
class_2 = 50  # class 2 has only 50 samples
centers = [[0.0, 0.0], [2.0, 2.0]]  # centers of the two classes
clusters_std = [1.5, 0.5]  # standard deviations of the two classes; the class with more samples is usually more spread out
X, y = make_blobs(n_samples=[class_1, class_2],
                  centers=centers,
                  cluster_std=clusters_std,
                  random_state=0,
                  shuffle=False)

# In[]:
X = pd.DataFrame(X, columns=["X1", "X2"])
y = pd.DataFrame(y, columns=["y"])
data = pd.concat([X, y], axis=1)

# In[]:
ft.Sample_imbalance(data, "y")

# In[]:
Xtrain, Xtest, Ytrain, Ytest = TTS(X, y, test_size=0.3, random_state=420)
# In[]:
ft.sample_category(Ytest, Ytrain)

# In[]:
# Modeling with the sklearn API
clf = XGBC().fit(Xtrain, Ytrain)
ypred = clf.predict(Xtest)
ypred_proba = clf.predict_proba(Xtest)
# In[]:
print(clf.score(Xtest, Ytest))  # default evaluation metric: accuracy
print(cm(Ytest, ypred, labels=[1, 0]))  # put the minority class first
print(recall(Ytest, ypred))
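
# A small follow-up sketch (not in the original excerpt): for an imbalanced problem, AUC on the
# predicted probabilities is usually more informative than accuracy; ypred_proba[:, 1] is the
# probability of the positive (minority) class.
from sklearn.metrics import roc_auc_score
print(roc_auc_score(Ytest['y'], ypred_proba[:, 1]))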
Example #18
# -*- coding: utf-8 -*-
"""
Created on Fri Jan  3 21:45:06 2020

@author: dell
"""

import numpy as np
import pandas as pd
import os 
import operator
import timeit
from collections import OrderedDict

import FeatureTools as ft
ft.set_file_path(r"E:\soft\Anaconda\Anaconda_Python3.6_code\data_analysis\103_Recommend\BAT\data\ml-1m")
import Tools_customize as tc
import Data_Samping as ds

# In[]:
def get_ave_score(input_file, split_char="::", title_num=None, encoding="UTF-8"):
    if not os.path.exists(input_file):
        return {}
    record_dict = {}
    score_dict = {}
    fp = open(input_file, encoding=encoding)
    line_num = 0
    for line in fp:
        if (title_num is not None) and (line_num <= title_num):
            line_num += 1
            continue
Example #19
warnings.filterwarnings('ignore')
import itertools
import matplotlib.gridspec as gridspec

from math import isnan
import FeatureTools as ft
# ft.set_file_path(r"E:\code\python_workSpace\idea_space\meituan_hotel\analysis")
import Tools_customize as tc
import Binning_tools as bt
from sqlalchemy import create_engine

# In[]:
# i.meituan.com
# 1. Read the table
sql = "select * from hotel_orders_view"
hotel_orders_data = ft.read_from_mysql(sql, "wx_fyinn", "CBSFWPFkzwtL8382",
                                       "47.108.49.9", "3306", "wx_fyinn")
# In[]:
hotel_orders_data2 = hotel_orders_data.copy()

# In[]:
# strip the trailing '平方米' (square meters) suffix from the room size and convert to float
hotel_orders_data2['o_rt_hotel_room_size'] = hotel_orders_data2[
    'o_rt_hotel_room_size'].map(lambda x: np.float16(x[0:x.find('平方米')])
                                if '平方米' in x else np.float16(x))

# In[]:
print(hotel_orders_data2.iloc[0]['o_pay_time'],
      type(hotel_orders_data2.iloc[0]['o_pay_time']))
print(hotel_orders_data2.iloc[3]['o_pay_time'],
      type(hotel_orders_data2.iloc[3]['o_pay_time']),
      hotel_orders_data2.iloc[3]['o_pay_time'] is pd.NaT)
print(hotel_orders_data2['o_pay_time'].dtypes)
Example #20
fig, ax = plt.subplots(1, figsize=(15, 8))
ax.plot(range(1, 201), cvresult1.iloc[:, 2], c="red", label="train,original")
ax.plot(range(1, 201), cvresult1.iloc[:, 0], c="orange", label="test,original")
ax.set_ylim(top=5)  # cap the displayed y-axis maximum at 5
ax.grid()
ax.legend(fontsize="xx-large")
plt.show()
'''
The curves show that the model is now overfitting, so we decide to prune. Our goal is to bring the training and test
results as close together as possible; if the test score cannot rise, lowering the training score is also a good choice (make the model less specific to the training data and improve generalization).
'''
# In[]:
ft.learning_curve_xgboost(X,
                          y,
                          param1,
                          num_round=num_round,
                          metric="rmse",
                          n_fold=5,
                          set_ylim_top=5)  # rmse by default

# In[]:
# 2. Hyperparameter tuning with learning curves (key point: the tuning procedure)
'''
Tuning requirements (a small selection sketch follows this block):
1. The test-set metric (MSE) must drop relative to the default-hyperparameter model (or at least stay level);
2. The training-set metric (MSE) is allowed to rise relative to the default-hyperparameter model;
3. Among models that satisfy 1 and 2, pick the one whose training and test MSE are closest (smallest generalization gap).
'''
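# A hedged sketch of the selection rule above (all names and numbers are illustrative):
# given the default model's (train_mse0, test_mse0) and several candidates' (train_mse, test_mse),
# keep candidates whose test MSE does not get worse, then pick the smallest train/test gap.
def pick_candidate(default_scores, candidates):
    train_mse0, test_mse0 = default_scores  # train_mse0 may be exceeded (rule 2), so it is not checked
    ok = [(name, tr, te) for name, (tr, te) in candidates.items() if te <= test_mse0]
    if not ok:
        return None  # nothing matches or beats the default on the test set
    return min(ok, key=lambda item: abs(item[1] - item[2]))  # smallest generalization gap

print(pick_candidate((10.2, 15.8), {'a': (11.0, 15.1), 'b': (12.5, 15.4), 'c': (9.8, 16.3)}))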
# Default hyperparameters:
param1 = {
    'silent': True  # default is False: print the training log
Example #21
def main(argv):
    if len(argv) == 0:
        # print 'Usage: my_program command --option <argument>'
        print(__doc__)
    else:
        args = docopt(__doc__, argv=argv)
        if args['aggregate_labels']:
            with tempfile.TemporaryDirectory() as tmpdir:
                scrapping.scrap(args['<bundles_dir>'], tmpdir)
                notcut_trees = args['<out_dir>'] + '/' + args[
                    '<prefix>'] + '_notcut_trees.txt'
                notcut_priors = args['<out_dir>'] + '/' + args[
                    '<prefix>'] + '_notcut_priors.csv'
                cut_trees = args['<out_dir>'] + '/' + args[
                    '<prefix>'] + '_cut_trees.txt'
                split_trees = args['<out_dir>'] + '/' + args[
                    '<prefix>'] + '_split_trees.txt'
                withdummy_trees = args['<out_dir>'] + '/' + args[
                    '<prefix>'] + '_withdummy_trees.txt'
                cut_priors = args['<out_dir>'] + '/' + args[
                    '<prefix>'] + '_cut_priors.csv'
                split_priors = args['<out_dir>'] + '/' + args[
                    '<prefix>'] + '_split_priors.csv'
                out_df = args['<out_dir>'] + '/' + args['<prefix>'] + '_df.csv'
                lt.aggregate_labels_from_bundles_to_trees(
                    args['<all_trees.txt>'], tmpdir, notcut_trees)
                tt.remove_duplicate_nodes(notcut_trees, notcut_trees)
                print('Duplicate nodes were removed.')
                tt.translate_list_of_trees(notcut_trees, notcut_trees)
                print('Trees were translated.')
                lt.remove_2nd_tags(notcut_trees)
                print('2nd tags were removed.')
                lt.print_label_priors(notcut_trees, notcut_priors)
                lt.cut_non_labeled_branches(notcut_trees, cut_trees)
                print('Non-labeled branches were cut.')
                lt.apply_split_labels_to_trees(cut_trees, split_trees)
                lt.print_label_priors(split_trees, split_priors)
                print('Split labels were applied.')
                lt.apply_start_split_end_labels_to_trees(
                    cut_trees, withdummy_trees)
                print('Start-Split-End labels were applied.')
                lt.print_label_priors(cut_trees, cut_priors)
                lt.create_data_csv(split_trees, out_df)
                print('Labels aggregated successfully.')
        if args['rename_labels']:
            lt.rename_labels(args['<in_file>'], args['<out_file>'])
        if args['rename_tags_in_trees']:
            lt.rename_tags_in_trees(args['<trees.txt>'],
                                    args['<out_trees.txt>'])
        if args['rename_tags_in_df']:
            lt.rename_tags_in_df(args['<df.csv>'], args['<out_df.csv>'])
        if args['create_bundles']:
            tt.create_bundles(args['<all_trees.txt>'], args['<tree_ids>'],
                              args['<out_dir>'])
        if args['rework_labels']:
            lt.rework_labels(args['<labeled_trees.txt>'],
                             args['<rework_settings.txt>'],
                             args['<out_trees.txt>'])
        if args['create_df']:
            lt.create_data_csv(args['<labeled_trees.txt>'],
                               args['<out_df.csv>'], args['--rework_settings'],
                               args['--ignore_deleted'])
        if args['print']:
            if args['dis_branch']:
                dst.print_branch(trees_path=args['<trees.txt>'],
                                 probas_path=args['<probas.dispr>'],
                                 branch_atlas_id=args['<branch_atlas_id>'],
                                 out_file=args['<out_file.csv>'])

            if args['dis_tags_npmi']:
                dst.print_dis_tags_npmi(trees_path=args['<trees.txt>'],
                                        probas_path=args['<probas.dispr>'],
                                        out_file=args['<out.csv>'],
                                        just_count=args['--just_count'],
                                        just_pmi=args['--just_pmi'])
            if args['tags_npmi']:
                lt.print_tags_npmi_table(args['<labeled_trees.txt>'],
                                         args['<out.csv>'],
                                         args['--just_count'], args['--log'])
            if args['label_priors']:
                lt.print_label_priors(args['<labeled_trees.txt>'],
                                      args['<out.csv>'], args['--per_tree'])

            if args['label_details']:
                lt.print_label_details(args['<labeled_trees.txt>'],
                                       args['<out.csv>'])

            if args['label_cooc_lists']:
                lt.print_label_cooc_lists(args['<labeled_trees.txt>'],
                                          args['<out.csv>'])

            if args['label_passes']:
                lt.print_label_passes(args['<labeled_trees.txt>'],
                                      args['<out_dir>'])

            if args['label_ngrams']:
                lt.print_label_ngrams(
                    args['<labeled_trees.txt>'], args['<out_dir>'],
                    [int(n) for n in args['<n1,n2,n3>'].split(',')])

            if args['forward_backward_transitions']:
                lt.print_forward_backward_transitions(
                    args['<labeled_trees.txt>'], args['<out_dir>'],
                    [int(n) for n in args['<n1,n2,n3>'].split(',')])

            if args['label_ngram_lists']:
                lt.print_label_ngram_lists(
                    args['<labeled_trees.txt>'], args['<out_file>'],
                    [int(n) for n in args['<n1,n2,n3>'].split(',')],
                    args['<min_count>'])

            if args['trees_statistics']:
                tt.create_list_of_trees_statistics(args['<trees.txt>'],
                                                   args['<stats.csv>'])

            if args['label_stats']:
                trees_path = args['<trees.txt>']
                out_dir = args['<stats_dir>']
                priors_csv = out_dir + '/priors.csv'
                npmi = out_dir + '/npmi.csv'
                correlation_log = out_dir + '/corr_log.txt'
                pmi = out_dir + '/pmi.csv'
                matthews = out_dir + '/matthews_correlation.csv'
                together_counts = out_dir + '/together_counts.csv'
                general_stats = out_dir + '/general_stats.csv'
                lt.print_label_ngrams(trees_path, out_dir, [2, 3, 4, 5, 6, 7])
                lt.print_label_priors(trees_path, priors_csv)
                lt.print_tags_npmi_table(trees_path,
                                         npmi,
                                         log_file=correlation_log)
                lt.print_tags_npmi_table(trees_path, pmi, just_pmi=True)
                lt.print_tags_npmi_table(trees_path,
                                         together_counts,
                                         just_count=True)
                tt.create_list_of_trees_statistics(trees_path, general_stats)
                lt.print_tags_matthews(trees_path, matthews)

        if args['grid_search']:
            cv = None
            if args['--cv']:
                cv = int(args['--cv'])
            target_tags = None
            if args['--target_tags']:
                target_tags = args['--target_tags'].split(',')
            split_tags = None
            if args['--split_tags']:
                split_tags = [int(i) for i in args['--split_tags'].split(',')]
            adders = args['--adders']
            ct.grid_search(args['<data_prefix>'],
                           args['<pipe_prefix>'],
                           args['<params_prefix>'],
                           adders=adders,
                           cv=cv,
                           split_tags=split_tags,
                           target_tags=target_tags)

        if args['train_test']:
            target_tags = None
            if args['--target_tags']:
                target_tags = args['--target_tags'].split(',')
            adders = args['--adders']
            cls = args['--cls']
            ct.train_test(args['<train>'],
                          args['<test>'],
                          args['<pipe>'],
                          args['<params>'],
                          args['<mode>'],
                          classifier=cls,
                          target_tags=target_tags,
                          adders=adders)

        if args['aggregate_train_test']:
            target_tags = None
            if args['--target_tags']:
                target_tags = args['--target_tags'].split(',')
            adders = args['--adders']
            cls = args['--cls']
            ct.aggregate_traintest(args['<train>'],
                                   args['<test>'],
                                   args['<pipe>'],
                                   args['<params>'],
                                   args['<mode>'],
                                   classifier=cls,
                                   target_tags=target_tags,
                                   adders=adders)

        if args['lstmsa']:
            ct.lstm_sa(data_prefix=args['<data_prefix>'],
                       target_tags=args['--target_tags'],
                       pipes=args['<pipes>'],
                       hidden_dims=args['<hidden_dims>'])

        if args['aggregate_scores']:
            ct.aggregate_scores(args['<data_prefix>'], args['<pipe_prefix>'])

        if args['aggregate_scores_lstmsa']:
            ct.aggregate_scores_lstmsa(args['<data_prefix>'], args['--tags'])

        if args['create_doc2vec_train_file']:
            ft.create_d2vtrain_lines(args['<trees.txt>'],
                                     args['<out_file.txt>'])

        if args['train_doc2vec']:
            ft.train_doc2vec(args['<train_file.txt>'], args['<out_model.txt>'],
                             int(args['<epochs>']), int(args['<dim>']))

        if args['prefit_adders']:
            ft.prefit_adders(args['<data_prefix>'], args['--adders'])

        if args['prepare_dissent']:
            dst.run_trees(args['<trees.txt>'], args['<out_path>'])

        if args['merge_disprobas']:
            dst.merge_probas(probas_dir=args['<probas_dir>'],
                             out_path=args['<out_file.dispr>'])
Latitude: latitude of the block
Longitude: longitude of the block
'''
housevalue.feature_names  # feature names

X.columns = housevalue.feature_names

# In[]:
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,
                                                y,
                                                test_size=0.3,
                                                random_state=420)

# Re-index after the split
# Because y is an array, its index is automatically reset.
ft.recovery_index([Xtrain, Xtest])

Ytrain = pd.DataFrame(Ytrain, columns=["Y"])
Ytest = pd.DataFrame(Ytest, columns=["Y"])

# In[]:
# Data distribution
# Histograms
for fe in Xtrain.columns:
    f, axes = plt.subplots(1, 2, figsize=(23, 8))
    ft.con_data_distribution(Xtrain, fe, axes)
# In[]:
f, axes = plt.subplots(1, 2, figsize=(23, 8))
ft.con_data_distribution(Ytrain, "Y", axes)

# In[]:
    num_bins_temp = get_num_bins(model_data, col, 'SeriousDlqin2yrs', hand_bins[col][0])
    iv_temp = get_iv(get_woe(num_bins_temp))
    hand_bins[col].append(iv_temp)

# merge the hand-binned data
bins_of_col.update(hand_bins)
'''
# In[]:
bins_of_col = bt.automatic_hand_binning_all(model_data, 'SeriousDlqin2yrs',
                                            auto_col_bins, hand_bins)

# In[]:
# 1.7.8 Exploratory analysis: using the data after upsampling and chi-square binning
# 1.7.8.1 Univariate analysis:
# Age
'''
model_data['cut'] = pd.cut(model_data.age, bins_of_col['age'][0])
age_cut_grouped_good = model_data[model_data["SeriousDlqin2yrs"] == 0].groupby('cut')["SeriousDlqin2yrs"].count()
ft.seriers_change_colname(age_cut_grouped_good, "good")
age_cut_grouped_bad = model_data[model_data["SeriousDlqin2yrs"] == 1].groupby('cut')["SeriousDlqin2yrs"].count()
ft.seriers_change_colname(age_cut_grouped_bad, "bad")
#df1 = pd.merge(pd.DataFrame(age_cut_grouped_good), pd.DataFrame(age_cut_grouped_bad), left_index=True, right_index=True)
df1 = pd.concat([age_cut_grouped_good, age_cut_grouped_bad], axis=1)
df1.insert(2,"badgrade", df1["bad"] / (df1["good"] + df1["bad"]))
ax1 = df1[["good","bad"]].plot.bar()
ax1.set_xticklabels(df1.index,rotation=15)
ax1.set_ylabel("Num")
ax1.set_title("bar of age")
# In[]:
ax11=df1["badgrade"].plot()
ax11.set_xticklabels(df1.index,rotation=50)
Example #24
from sklearn.svm import SVR
from sklearn.decomposition import PCA, FastICA, FactorAnalysis, SparsePCA
from sklearn.metrics import mean_squared_error as MSE, r2_score, mean_absolute_error as MAE

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor as XGBR
from lightgbm import LGBMRegressor as LGBMR
import xgboost as xgb
import joblib

from math import isnan
import FeatureTools as ft

ft.set_file_path(
    r"E:\soft\Anaconda\Anaconda_Python3.6_code\data_analysis\100_Data_analysis_competition\3_TianChi\1_Used_car_transaction_price_prediction\data"
)
import Tools_customize as tc
import Binning_tools as bt
import StackingModels as sm

# In[]:
train_data_6 = ft.readFile_inputData('train_data_6.csv', index_col=0)
test_data_6 = ft.readFile_inputData('test_data_6.csv', index_col=0)
# In[]:
train_data_6 = train_data_6.fillna(-1)
test_data_6 = test_data_6.fillna(-1)
# In[]:
temp_train_miss = ft.missing_values_table(train_data_6)
temp_test_miss = ft.missing_values_table(test_data_6)
# -*- coding: utf-8 -*-
"""
Created on Thu Jan  2 14:41:53 2020

@author: dell
"""

import math
import pandas as pd
import random
import numpy as np
import pickle

import FeatureTools as ft
ft.set_file_path(
    r"E:\soft\Anaconda\Anaconda_Python3.6_code\data_analysis\103_Recommend\BAT\data\ml-1m"
)
import Tools_customize as tc
import Data_Samping as ds


# In[]:
class LFM:
    def __init__(self, user_ids, item_ids):
        self.class_count = 5
        self.iter_count = 5
        self.lr = 0.02
        self.lambd = 0.01
        self._init_data(user_ids, item_ids)
        self._init_model()
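
    # The two _init_* helpers are not shown in this excerpt; a hedged sketch of what they
    # typically look like in an LFM (latent factor model) with this configuration:
    def _init_data(self, user_ids, item_ids):
        self.user_ids = set(user_ids)
        self.item_ids = set(item_ids)

    def _init_model(self):
        # one random latent vector of length class_count per user and per item
        array_p = np.random.randn(len(self.user_ids), self.class_count)
        array_q = np.random.randn(len(self.item_ids), self.class_count)
        self.p = pd.DataFrame(array_p, columns=range(self.class_count), index=list(self.user_ids))
        self.q = pd.DataFrame(array_q, columns=range(self.class_count), index=list(self.item_ids))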
fas = pd.DataFrame(fa.comps["rot"])  # rotated factor loadings, presented column-wise by default
data = pd.DataFrame(data)
score = pd.DataFrame(np.dot(data, fas))

# Step 3: analyze the data based on the factor scores
# In[14]:
fa_scores = score.rename(columns={0: "ATM_POS", 1: "TBM", 2: "CSC"})
fa_scores.head()
# In[]:
import FeatureTools as ft
import matplotlib.pyplot as plt

# In[]:
f, axes = plt.subplots(1, 2, figsize=(23, 8))
ft.con_data_distribution(fa_scores, 'ATM_POS', axes, fit_type=1, box_scale=1.5)  # right-skewed
# In[]:
f, axes = plt.subplots(1, 2, figsize=(23, 8))
ft.con_data_distribution(fa_scores, 'TBM', axes, fit_type=1, box_scale=1.5)
# In[]:
f, axes = plt.subplots(1, 2, figsize=(23, 8))
ft.con_data_distribution(fa_scores, 'CSC', axes, fit_type=1, box_scale=1.5)

# In[]:
# Step 4: k-means clustering on the factor scores
# 4.1 First k-means approach: no normal transformation of the variables -- used to find outliers
# - 1. Check the skewness of the variables
# In[15]:
var = ["ATM_POS", "TBM", "CSC"]
skew_var = {}
for i in var:
    skew_var[i] = abs(fa_scores[i].skew())  # absolute skewness of each factor score
Example #27
# The native xgboost library defaults to silent=False and prints the training progress; silent=True suppresses the log and only returns the result.
reg = XGBR(n_estimators=10, silent=False)
# The sklearn wrapper of xgboost defaults to silent=True (no training log); set it to False manually if you want the log printed.
CVS(reg, Xtrain, Ytrain, cv=5,
    scoring='neg_mean_squared_error').mean()  #-92.67865836936579

# In[]:
# A. Ensemble-framework hyperparameters:
# Learning curves:

# Order for the learning curves: sample-size curve first (if overfitting, train and test scores are far apart) → hyperparameter curves (a realistic goal is to lower the training score and thus reduce overfitting) → sample-size curve again to re-check overfitting

# 1. Based on sample size (cross-validated learning-curve function)
# 1) Linear regression baseline:
cv = ShuffleSplit(n_splits=5, test_size=.2, random_state=0)
ft.plot_learning_curve(LinearR(), "LinearR", Xtrain, Ytrain, ax=None, cv=cv)
plt.show()
# In[]:
# 2) sklearn's XGBRegressor:
# cv = KFold(n_splits=5, shuffle=True, random_state=42)  # cross-validation mode
cv = ShuffleSplit(n_splits=5, test_size=.2, random_state=0)

ft.plot_learning_curve(XGBR(n_estimators=100, random_state=420, silent=True),
                       "XGB",
                       Xtrain,
                       Ytrain,
                       ax=None,
                       cv=cv)
plt.show()
'''
Sample-size thresholds: [ 28  91 155 219 283]
Example #28
from sklearn.metrics import roc_auc_score
from sklearn.metrics import auc

import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

# import pandas_profiling
color = sns.color_palette()
sns.set_style('darkgrid')

from math import isnan
import FeatureTools as ft

ft.set_file_path(
    r"E:\soft\Anaconda\Anaconda_Python3.6_code\data_analysis\100_Data_analysis_competition\3_TianChi\1_Used_car_transaction_price_prediction\data"
)
import Tools_customize as tc
import Binning_tools as bt

# In[]:
# 1. Read the tables
train_data = ft.readFile_inputData('used_car_train_20200313.csv',
                                   parse_dates=['regDate', 'creatDate'],
                                   sep=' ')
test_data = ft.readFile_inputData('used_car_testA_20200313.csv',
                                  parse_dates=['regDate', 'creatDate'],
                                  sep=' ')
# In[]:
# Backup
# ft.writeFile_outData(train_data, "used_car_train_Backup.csv")
import re
from sklearn.metrics import roc_auc_score
from sklearn.metrics import auc

import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

# import pandas_profiling
color = sns.color_palette()
sns.set_style('darkgrid')

from math import isnan
import FeatureTools as ft

ft.set_file_path(
    r"E:\soft\Anaconda\Anaconda_Python3.6_code\data_analysis\100_Data_analysis_competition\3_TianChi\1_Used_car_transaction_price_prediction\data")
import Tools_customize as tc
import Binning_tools as bt

# In[]:
# 1. Read the tables
train_data = ft.readFile_inputData('used_car_train_20200313.csv', parse_dates=['regDate', 'creatDate'], sep=' ')
test_data = ft.readFile_inputData('used_car_testA_20200313.csv', parse_dates=['regDate', 'creatDate'], sep=' ')
# In[]:
print(train_data.shape, test_data.shape)  # (150000, 31) (50000, 30)
print(train_data[train_data['price'] <= 0].shape)

# In[]
# 1. Missing values
train_miss = ft.missing_values_table(train_data)
test_miss = ft.missing_values_table(test_data)
import xgboost as xgb
import re
from sklearn.metrics import roc_auc_score, mean_absolute_error,  make_scorer
from sklearn.metrics import mean_squared_error as MSE, r2_score, mean_absolute_error as MAE
from sklearn.metrics import auc

import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
#import pandas_profiling
color = sns.color_palette()
sns.set_style('darkgrid')

from math import isnan
import FeatureTools as ft
ft.set_file_path(r"E:\soft\Anaconda\Anaconda_Python3.6_code\data_analysis\100_Data_analysis_competition\3_TianChi\1_Used_car_transaction_price_prediction\data")
import Tools_customize as tc
import Binning_tools as bt

# In[]:
train_data_5 = ft.readFile_inputData('train_data_5.csv', index_col=0)  # price is greater than 0
test_data_5 = ft.readFile_inputData('test_data_5.csv', index_col=0)
# In[]:
temp_data_miss =  ft.missing_values_table(train_data_5)

# In[]:
categorical_features = ['name', 'model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'notRepairedDamage', 'city', 'kilometer', 'power_cut_bin', 'diff_day_cut_bin']
temp_col = ['kilometer', 'power_cut_bin', 'diff_day_cut_bin']
categorical_astype_str_col = ft.set_diff(categorical_features, temp_col)[1]  # set difference: 27 - 7 = 20
# In[]:
# 1. Feature type conversion
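# A hedged sketch of the conversion implied by the comment above (the script is truncated here):
# cast the selected categorical columns to str so that later encoders treat them as categories.
for col in categorical_astype_str_col:
    train_data_5[col] = train_data_5[col].astype(str)
    test_data_5[col] = test_data_5[col].astype(str)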