Example #1
def compute_score_for_mytest(each_df):
    each_df['chart_cos'] = np.cos(each_df['CHARTTIME'] / 1440)
    each_df['chart_sin'] = np.sin(each_df['CHARTTIME'] / 1440)
    each_df['timediff'] = each_df['CHARTTIME'] - each_df['CHARTTIME_last']
    each_df.set_index(['name', 'index'], drop=True, inplace=True)

    each_df_fillna = pd.concat(
        Parallel(n_jobs=-1)(delayed(fillna)(each_group)
                            for name, each_group in each_df.groupby('name')))

    for each_column in use_columns:
        # load the model
        each_test_x = each_df[each_df[each_column].notna()]
        each_max_min = each_df.groupby('name')[each_column].agg([max, min])

        if each_column in ['HCT', 'HGB']:
            each_model = joblib.load(os.path.join('lgb统计', each_column))
            each_test_x = each_df.drop(each_column, axis=1)
        else:
            each_model = joblib.load(os.path.join('lgb统计加fillna', each_column))
            each_test_x = each_df_fillna.drop(each_column, axis=1)

        ### build the mytest split
        mytest_index = Parallel(n_jobs=1)(
            delayed(split_my_val2)(group)
            for name, group in each_test_x.groupby('name'))
        each_test_x = each_test_x.loc[mytest_index]
        each_test_y = each_df.loc[mytest_index][each_column]

        each_test_x[each_column] = each_model.predict(each_test_x)
        # restore the original scale here before computing the score

        if each_column in ['PBUN', 'PGLU', 'WBC', 'PLT']:
            each_test_x[each_column] = inv_boxcox1p(each_test_x[each_column],
                                                    0)
        elif each_column in ['PCRE']:
            each_test_x[each_column] = inv_boxcox1p(each_test_x[each_column],
                                                    -1.5)

        mse = np.sum(
            np.square((each_test_y - each_test_x[each_column]) /
                      (each_max_min['max'] - each_max_min['min'])))
        score = np.sqrt(mse / each_test_x.shape[0])
        print(each_column, score)
        return score
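The score above is a range-normalized RMSE: each residual is divided by that patient's (max - min) for the column before squaring and averaging. A minimal standalone sketch of the same metric, with hypothetical argument names, assuming four aligned pandas Series or NumPy arrays:

import numpy as np

def range_normalized_rmse(y_true, y_pred, var_max, var_min):
    # residuals scaled by each patient's observed range for the column
    scaled = (y_true - y_pred) / (var_max - var_min)
    return np.sqrt(np.mean(np.square(scaled)))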
Example #2
def cal_loss(y_pre, d_train):
    global X_train2, X_val2, select_col
    ## need to check whether this batch is the training set or the validation set to pick var_max and var_min
    y_label = d_train.get_label()
    if select_col in ['PBUN', 'PGLU', 'WBC', 'PLT']:
        y_pre = inv_boxcox1p(y_pre, 0)
        y_label = inv_boxcox1p(y_label, 0)
    elif select_col in ['PCRE']:
        y_pre = inv_boxcox1p(y_pre, -1.5)
        y_label = inv_boxcox1p(y_label, -1.5)

    # LightGBM custom objective: return per-sample gradient and Hessian
    if len(y_pre) == X_train2.shape[0]:
        grad = (y_pre - y_label) / (X_train2['var_max'] - X_train2['var_min'])
    elif len(y_pre) == X_val2.shape[0]:
        grad = (y_pre - y_label) / (X_val2['var_max'] - X_val2['var_min'])
    hess = np.power(np.abs(grad), 0.5)
    return grad, hess
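cal_loss has the (preds, train_data) -> (grad, hess) shape that LightGBM expects from a custom objective. A hedged sketch of how it could be wired into training, assuming the pre-4.0 lgb.train API (fobj/feval keyword arguments) used elsewhere in these examples and hypothetical lgb_train / lgb_eval datasets:

import lightgbm as lgb

# X_train2, X_val2 and select_col must be set globally before training starts
gbm = lgb.train(params,
                lgb_train,
                valid_sets=[lgb_train, lgb_eval],
                fobj=cal_loss,   # custom objective returning (grad, hess)
                feval=cal_val,   # custom evaluation metric (see Example #4)
                num_boost_round=3000)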
Example #3
def test_for_predict(each_dir, each_name):
    # generate features
    each_df = read_data_and_extract_features(each_dir, each_name)
    to_each_df = pd.DataFrame()
    each_df['chart_cos'] = np.cos(each_df['CHARTTIME'] / 1440)
    each_df['chart_sin'] = np.sin(each_df['CHARTTIME'] / 1440)
    each_df['timediff'] = each_df['CHARTTIME'] - each_df['CHARTTIME_last']
    each_df.set_index(['name', 'index'], drop=True, inplace=True)
    each_df_fillna = pd.concat(
        Parallel(n_jobs=-1)(delayed(fillna)(each_group)
                            for name, each_group in each_df.groupby('name')))

    for each_column in use_columns:
        # record the rows to predict (where this column is missing)
        y_index = each_df[each_column].isna()
        each_test_x = each_df.drop(each_column, axis=1)

        # if each_column in ['HCT', 'HGB']:
        #     each_model = joblib.load(os.path.join('lgb统计', each_column))
        #     each_test_x = each_df.drop(each_column, axis=1)
        # else:
        each_model = joblib.load(os.path.join('lgb统计加fillna', each_column))
        each_test_x = each_df_fillna.drop(each_column, axis=1)

        each_test_x = each_test_x[y_index]
        each_test_not_x = each_df.drop(each_test_x.index)

        each_test_x[each_column] = each_model.predict(each_test_x)
        # restore the original scale here

        if each_column in ['PBUN', 'PGLU', 'WBC', 'PLT']:
            each_test_x[each_column] = inv_boxcox1p(each_test_x[each_column],
                                                    0)
        elif each_column in ['PCRE']:
            each_test_x[each_column] = inv_boxcox1p(each_test_x[each_column],
                                                    -1.5)
        each_test_x = pd.concat([each_test_not_x, each_test_x])
        to_each_df[each_column] = each_test_x[each_column]
    if not os.path.exists('fillna_test_data'):
        os.mkdir('fillna_test_data')
    to_each_df['CHARTTIME'] = each_df['CHARTTIME']
    to_each_df.sort_values(['index'], inplace=True)
    to_each_df.to_csv(os.path.join('fillna_test_data', each_name), index=None)
Example #4
def cal_val(y_pre, d_train):
    global X_train2, X_val2, select_col
    ## both the training set and the validation set use each patient's max and min computed on the training set
    y_label = d_train.get_label().values

    if select_col in ['PBUN', 'PGLU', 'WBC', 'PLT']:
        y_pre = inv_boxcox1p(y_pre, 0)
        y_label = inv_boxcox1p(y_label, 0)
    elif select_col in ['PCRE']:
        y_pre = inv_boxcox1p(y_pre, -1.5)
        y_label = inv_boxcox1p(y_label, -1.5)
    if len(y_pre) == X_val2.shape[0]:
        mse = np.sum(
            np.square((y_label - y_pre) / (X_val2['max'] - X_val2['min'])))
        score = np.sqrt(mse / len(y_pre))
    elif len(y_pre) == X_train2.shape[0]:
        mse = np.sum(
            np.square((y_label - y_pre) / (X_train2['max'] - X_train2['min'])))
        score = np.sqrt(mse / len(y_pre))
    return 't1', score, False  # (eval_name, eval_result, is_higher_better)
Example #5
def test_inv_boxcox():
    x = np.array([0., 1., 2.])
    lam = np.array([0., 1., 2.])
    y = boxcox(x, lam)
    x2 = inv_boxcox(y, lam)
    assert_almost_equal(x, x2)

    x = np.array([0., 1., 2.])
    lam = np.array([0., 1., 2.])
    y = boxcox1p(x, lam)
    x2 = inv_boxcox1p(y, lam)
    assert_almost_equal(x, x2)
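For reference, boxcox1p(x, lam) is the Box-Cox transform of 1 + x and inv_boxcox1p is its exact inverse. The naive functions below are only an illustrative sketch of the underlying formulas (scipy's implementations are the numerically careful ones); they are checked against scipy for a few lambdas:

import numpy as np
from scipy.special import boxcox1p, inv_boxcox1p

def naive_boxcox1p(x, lam):
    # Box-Cox transform of 1 + x: ((1 + x)**lam - 1) / lam, or log(1 + x) when lam == 0
    x = np.asarray(x, dtype=float)
    if lam == 0:
        return np.log1p(x)
    return (np.power(1.0 + x, lam) - 1.0) / lam

def naive_inv_boxcox1p(y, lam):
    # exact inverse: (lam * y + 1)**(1 / lam) - 1, or exp(y) - 1 when lam == 0
    y = np.asarray(y, dtype=float)
    if lam == 0:
        return np.expm1(y)
    return np.power(lam * y + 1.0, 1.0 / lam) - 1.0

x = np.array([0.0, 1.0, 2.0])
for lam in (0.0, 0.15, 0.5, -1.5):
    assert np.allclose(naive_boxcox1p(x, lam), boxcox1p(x, lam))
    assert np.allclose(naive_inv_boxcox1p(boxcox1p(x, lam), lam), x)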
Example #7
def predict_for_column(each_df, each_df_fillna, each_column):
    # keep the test-set index (rows where this column is missing)
    y_index = each_df[each_column].isna()

    # if each_column in ['HCT', 'HGB']:
    #     each_model = joblib.load(os.path.join('lgb统计', each_column))
    #     each_test_x = each_df.drop(each_column, axis=1)
    # else:
    each_model = joblib.load(os.path.join('lgb统计加fillna', each_column))
    each_test_x = each_df_fillna.drop(each_column, axis=1)

    each_test_x = each_test_x[y_index]
    each_test_not_x = each_df.drop(each_test_x.index)

    each_test_x[each_column] = each_model.predict(each_test_x)
    # restore the original scale here

    if each_column in ['PBUN', 'PGLU', 'WBC', 'PLT']:
        each_test_x[each_column] = inv_boxcox1p(each_test_x[each_column], 0)
    elif each_column in ['PCRE']:
        each_test_x[each_column] = inv_boxcox1p(each_test_x[each_column], -1.5)
    each_test_x = pd.concat([each_test_not_x, each_test_x])
    return each_test_x[each_column]
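A hedged usage sketch: predict_for_column mirrors the per-column loop of Example #3, so a filled frame can be assembled column by column (each_df, each_df_fillna and use_columns are assumed to be prepared as in Example #3):

import pandas as pd

to_each_df = pd.DataFrame()
for each_column in use_columns:
    to_each_df[each_column] = predict_for_column(each_df, each_df_fillna, each_column)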
Example #8
def generate(x, y, filename):
    """Generate fixture data and write to file.

    # Arguments

    * `x`: domain
    * `y`: power parameter (lambda)
    * `filename::str`: filename of the output file

    # Examples

    ```python
    python> x = np.linspace(-10.0, 10.0, 2001)
    python> y = np.linspace(-5.0, 5.0, 1001)
    python> generate(x, y, './data.json')
    ```
    """
    z = inv_boxcox1p(x, y)
    data = dict(x=x.tolist(), y=y.tolist(), expected=z.tolist())

    filepath = path.join(DIR, filename)

    with open(filepath, 'w') as out:
        json.dump(data, out)
Example #9
    df_test = df_test_middle.drop(columns='daysOnMarket')
    df_test_label = df_test_middle['daysOnMarket']

    # mark every object-dtype column as categorical
    column_description1 = {
        key: 'categorical'
        for key in data.columns if data[key].dtype == 'object'
    }
    column_description2 = {
        'daysOnMarket': 'output',
        'buildingTypeId': 'categorical'
    }

    print(column_description1)
    column_descriptions = dict(column_description1, **column_description2)

    ml_predictor = Predictor(type_of_estimator='regressor',
                             column_descriptions=column_descriptions)

    ml_predictor.train(df_train, model_names='XGBRegressor')

    # ml_predictor.score(df_test)
    x = ml_predictor.predict(df_test)
    print(
        mean_absolute_error(inv_boxcox1p(df_test_label, 0.15),
                            inv_boxcox1p(x, 0.15)))
    print(mean_absolute_error(df_test_label, x))
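The inv_boxcox1p(·, 0.15) calls above imply that daysOnMarket was transformed with boxcox1p at lambda = 0.15 before training; a minimal sketch of that forward step (the lambda value is assumed from the inverse calls and is not shown in this excerpt):

from scipy.special import boxcox1p

df_train['daysOnMarket'] = boxcox1p(df_train['daysOnMarket'], 0.15)
df_test_middle['daysOnMarket'] = boxcox1p(df_test_middle['daysOnMarket'], 0.15)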
Example #10
def test_inv_boxcox1p_underflow():
    x = 1e-15
    lam = 1e-306
    y = inv_boxcox1p(x, lam)
    assert_allclose(y, x, rtol=1e-14)
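The underflow test exercises a tiny lambda: with lam = 1e-306 the textbook formula (lam*x + 1)**(1/lam) - 1 collapses to 0.0, because 1 + lam*x rounds to exactly 1.0 in double precision, whereas scipy evaluates the lam -> 0 limit accurately. A small illustration:

import numpy as np
from scipy.special import inv_boxcox1p

x, lam = 1e-15, 1e-306
naive = np.power(lam * x + 1.0, 1.0 / lam) - 1.0
print(naive)                 # 0.0: the information in x is lost
print(inv_boxcox1p(x, lam))  # ~1e-15, matching x as the assertion above requires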
Example #11
test.isnull().sum()
test["runtime"] = test["runtime"].fillna(test["runtime"].mean())
test["status"] = test["status"].fillna(test["status"].mode()[0])
test["release_dayofweek"] = test["release_dayofweek"].fillna(
    test["release_dayofweek"].mode()[0])
test["release_quarter"] = test["release_quarter"].fillna(
    test["release_quarter"].mode()[0])
sns.heatmap(test.isnull())

X_test = test.drop(["id"], axis=1)

X_train.columns
X_test.columns

###### Building model
import xgboost
from sklearn.metrics import accuracy_score
predictor = xgboost.XGBRegressor()
predictor.fit(X_train, y_train)
pred_train = predictor.predict(X_train)
pred_test = predictor.predict(X_test)

pred_test_original = inv_boxcox1p(pred_test, 0.2)

PP = pd.concat([test.id], axis=1)
PP["revenue"] = pred_test_original
PP.head()

PP.to_csv("TMDB1stTry.csv", index=False)
Example #12
cat_pred_train = model_cat.predict(train1.values)
cat_pred_train[cat_pred_train < 0] = 0
print("Mean square logarithmic error of cat model on whole train = {:.4f}".
      format(msle(y_train, cat_pred_train)))
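msle is not defined in this excerpt; it is presumably a mean squared logarithmic error helper, for example sklearn's:

from sklearn.metrics import mean_squared_log_error as msle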

# In[45]:

c = np.array([0.333334, 0.333333, 0.333333])

print("The sum of the entries of c is {}".format(c.sum()))

train_pred = xgb_pred_train * c[0] + lgb_pred_train * c[
    1] + cat_pred_train * c[2]
print("Mean square logarithmic error of chosen model on whole train = {:.4f}".
      format(msle(y_train, train_pred)))

# In[47]:

lgb_pred = model_lgb.predict(test)
xgb_pred = model_xgb.predict(test.values)
cat_pred = model_cat.predict(test)

# In[48]:

#write the blended predictions to the submission table
pred = inv_boxcox1p((xgb_pred * c[0] + lgb_pred * c[1] + cat_pred * c[2]), 0.2)

sub = pd.DataFrame({"id": np.arange(test.shape[0]) + 3001, "revenue": pred})
sub.to_csv("C:/Users/jynkris/Desktop/sample_submission.csv", index=False)
Example #14
def saveSubmission(iDs, preds):
    sub = pd.DataFrame()
    sub['Id'] = inv_boxcox1p(iDs, lda).apply(lambda x: round(x))
    sub['SalePrice'] = inv_boxcox1p(preds, lda)
    sub.to_csv('submission_stacked.csv', index=False)
Example #15
from scipy import stats, special
import pandas as pd
import numpy as np

data = pd.read_csv('data.csv')
y = data.target

lam_range = np.linspace(-2, 5, 100)  # default num=50
llf = np.zeros(lam_range.shape, dtype=float)

# lambda estimate:
for i, lam in enumerate(lam_range):
    llf[i] = stats.boxcox_llf(lam, y)  # y must be > 0

# find the index of the maximum log-likelihood (llf) and pick that lambda
lam_best = lam_range[llf.argmax()]

# apply the Box-Cox (boxcox1p) transform to the target variable
y_boxcox = special.boxcox1p(y, lam_best)

# apply the inverse Box-Cox transform to recover the original scale
y_invboxcox = special.inv_boxcox1p(y_boxcox, lam_best)
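As an aside, scipy can perform the same lambda search directly: stats.boxcox_normmax(..., method='mle') maximizes the same log-likelihood, and since boxcox1p(y, lam) equals boxcox(y + 1, lam), fitting on y + 1 keeps the estimate consistent with the boxcox1p transform used above. A hedged equivalent of the loop, reusing the imports at the top of this example:

# maximum-likelihood lambda for the boxcox1p transform (fit on y + 1)
lam_best_mle = stats.boxcox_normmax(y + 1, method='mle')
y_boxcox = special.boxcox1p(y, lam_best_mle)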
Example #16
def predict():
    '''
    For rendering results on HTML GUI
    '''
    
    # example values; immediately overwritten by the submitted form values below
    int_features = [21132,1231,4121,1214,123,412,12414,512,115,155]
    int_features = [x for x in request.form.values()]

    from scipy.special import boxcox1p
    lam = 0.15

    # cast each submitted value to int64 and apply the same boxcox1p transform
    for i in range(10):
        int_features[i] = np.int64(int_features[i])
        int_features[i] = boxcox1p(int_features[i], lam)
    
    from scipy.special import inv_boxcox1p
        
    columns = ['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition', 'TotalSF']
    values = [7.990963041593332, 1.8203341036428238, 9.125735246126716, 35.391370879389704,0.7304631471189666, 0.7304631471189666, 1.5409627556327752, 1.5409627556327752, 0.0005003172240540867, 1.4013368444274956, 0.0431549546952863, 2.9929161754443663, int_features[5], 1.1948967456559554, 0.24556737057378078, 1.4155057023841355, int_features[1], 2.1604465664352563, 14.13734536216387, int_features[7], 0.8932636402684723, 0.7538300024576454, 2.7582173064497746, 2.844430978246579, 1.071057748915512, 3.268986443449214, 1.3624422931139548, 1.7328990595033906, 0.8834994399659468, 1.2470924388496651, 1.4569085852270438, 1.2062523036053971, 1.317433970123405, 7.084892889342716, 1.9657590475261213, 1.0517430422748202, 9.391255388699905, 11.89254623970312, 0.7446911405592987, 0.7563474000353817, 0.6829330108338099, 1.6815016774004494, int_features[4], 4.917140353780343, 0.15794652317139848, int_features[2], 0.30751236837651724, 0.04166142948306093, int_features[6], 0.2774860242213367, 1.4742074483529004, 0.7517241594265581, 1.2746341259746379,int_features[8] , 2.1887574652310366, 0.42455642997342163, 1.3031568396814526, 1.140331159374495, 14.151202776364704, 0.7722995531965653, int_features[3],int_features[9], 1.774185368855956, 1.7865149945239338, 1.1111640610733957, 3.726756270282964, 3.253621458595589, 1.0382729368834795, 0.13072845037668754, 0.6225990043864975, 0.050872777658061384, 1.1920461692064572, 1.131509564697584, 1.1922835973693042, 0.40283910651921423, 2.230572790696048, 14.195035682494952, 2.478799472550466, 1.7025947724500126, int_features[0]]
    xdd=dict(zip(columns, values))

    df_s = pd.DataFrame(columns=columns)
    
    df_s = df_s.append(xdd, ignore_index=True)
    prediction = model.predict(df_s)

    
    print(prediction)
    
    prediction = inv_boxcox1p(prediction, 0.15)
    # addition to look realistic: the inverse transform is applied a second time
    prediction = inv_boxcox1p(prediction, 0.15)
    
    print(prediction)
    output = round(prediction[0], 2)

    return render_template('index.html', prediction_text='House price should be $ {}'.format(output))
Example #17
def train_val_test(each_df, each_column, save_dir):
    global X_train2, X_val2, select_col
    select_col = each_column

    #     use_columns_temp=use_columns.copy()
    #     use_columns_temp.remove(each_column)
    #     combined_df=pd.DataFrame()
    #     for each_i,i in enumerate(use_columns_temp):
    #         combined_df[i+'**2']=each_df[i]**2
    #         combined_df[i+'**0.5']=each_df[i]**0.5
    #         # addition, subtraction and multiplication only need one pass; division needs both orders
    #         for each_j in range(each_i+1,len(use_columns)):
    #             j=use_columns[each_j]
    #             combined_df[i+'+'+j]=each_df[i]+each_df[j]
    #             combined_df[i+'-'+j]=each_df[i]-each_df[j]
    #             combined_df[i+'*'+j]=each_df[i]*each_df[j]
    #     for i in use_columns:
    #         # addition, subtraction and multiplication only need one pass; division needs both orders
    #         for j in use_columns:
    #             if i!=j:
    #                 combined_df[i+'/'+j]=each_df[i]/each_df[j]

    #     each_df=pd.concat([each_df,combined_df],axis=1)

    each_df.reset_index(drop=True, inplace=True)
    each_df.sort_values(['name', 'index'], inplace=True)
    each_df['chart_cos'] = np.cos(each_df['CHARTTIME'] / 1440)
    each_df['chart_sin'] = np.sin(each_df['CHARTTIME'] / 1440)
    each_df['timediff'] = each_df['CHARTTIME'] - each_df['CHARTTIME_last']

    if each_column in ['PBUN', 'PGLU', 'WBC', 'PLT']:
        each_df[each_column] = boxcox1p(each_df[each_column], 0)
    elif each_column in ['PCRE']:
        each_df[each_column] = boxcox1p(each_df[each_column], -1.5)

    each_df_copy = each_df.drop([each_column], axis=1)

    each_max_min = naidx_testinfo[naidx_testinfo['column_name'] == each_column]

    # fillna
    # drop outliers before filling missing values
    if not each_column in ['HCT', 'HGB']:
        each_df_copy = pd.concat(
            Parallel(n_jobs=-1)(
                delayed(fillna)(each_group)
                for name, each_group in each_df_copy.groupby('name')))

    each_df_copy.set_index(['name', 'index'], drop=True, inplace=True)

    each_max_min.set_index(['name', 'index'], drop=True, inplace=True)
    each_df.set_index(['name', 'index'], drop=True, inplace=True)

    each_test = pd.merge(each_df_copy,
                         each_max_min,
                         left_index=True,
                         right_index=True)

    each_train_x = each_df_copy.loc[each_df_copy.index.drop(each_test.index)]

    each_train_y = each_df.loc[each_train_x.index, each_column]

    each_test_x = each_test[each_train_x.columns]

    each_train_y = each_train_y.dropna()

    each_train_x = each_train_x.loc[each_train_y.index]

    X_train, X_val, y_train, y_val = train_test_split(each_train_x,
                                                      each_train_y,
                                                      test_size=0.3,
                                                      random_state=2019)

    #     val_index=Parallel(n_jobs=1)(delayed(split_my_val2)(group)for name,group in each_train_x.groupby('name'))
    #     X_val=each_train_x.loc[val_index]
    #     y_val=each_train_y.loc[val_index]
    #     X_train=each_train_x.drop(val_index)
    #     y_train=each_train_y.loc[X_train.index]

    train_max_min = each_train_y.groupby('name').agg([max, min])
    # restore the original scale
    if each_column in ['PBUN', 'PGLU', 'WBC', 'PLT']:
        train_max_min['max'] = inv_boxcox1p(train_max_min['max'], 0)
        train_max_min['min'] = inv_boxcox1p(train_max_min['min'], 0)
    elif each_column in ['PCRE']:
        train_max_min['max'] = inv_boxcox1p(train_max_min['max'], -1.5)
        train_max_min['min'] = inv_boxcox1p(train_max_min['min'], -1.5)
    X_train2 = pd.merge(X_train,
                        train_max_min,
                        left_index=True,
                        right_index=True)
    X_val2 = pd.merge(X_val, train_max_min, left_index=True, right_index=True)

    # build LightGBM Dataset objects
    lgb_train = lgb.Dataset(X_train, y_train)  # saving to the LightGBM binary format makes later loading faster
    lgb_eval = lgb.Dataset(X_val, y_val)  # validation data

    params = {
        'learning_rate': 0.05,
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': ['mse'],
        'colsample_bytree': 0.9,
        'subsample': 0.9,
        'num_leaves': 30,  # number of leaves per tree
        'min_data': 50,  # minimum number of samples per leaf
        'max_depth': -1,  # tree depth (-1 = no limit)
        'lambda_l2': 0.001,  # L2 regularization
        'lambda_l1': 0.01,  # L1 regularization
        'num_threads': 12,
        'verbose': -1,
        'tree_learner': 'voting',
        'seed': 2019
    }

    # training (cv and train)
    gbm = lgb.train(params,
                    lgb_train,
                    valid_sets=[lgb_train, lgb_eval],
                    feval=cal_val,
                    num_boost_round=3000,
                    early_stopping_rounds=100,
                    verbose_eval=100)

    each_test['pred'] = gbm.predict(each_test[each_train_x.columns])

    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    joblib.dump(gbm, os.path.join(save_dir, each_column))
    # restore the original scale here before computing the score

    if each_column in ['PBUN', 'PGLU', 'WBC', 'PLT']:
        each_test['pred'] = inv_boxcox1p(each_test['pred'], 0)
    elif each_column in ['PCRE']:
        each_test['pred'] = inv_boxcox1p(each_test['pred'], -1.5)

    mse = np.sum(
        np.square((each_test['var_value'] - each_test['pred']) /
                  (each_test['var_max'] - each_test['var_min'])))
    score = np.sqrt(mse / each_test.shape[0])
    return score
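A hedged usage sketch tying this back to Examples #1, #3 and #7: one model is trained per target column and saved into the 'lgb统计加fillna' directory that those functions later load from (each_df and use_columns are assumed to be prepared as above; each_df is copied because train_val_test re-indexes it in place):

scores = {}
for each_column in use_columns:
    scores[each_column] = train_val_test(each_df.copy(), each_column, 'lgb统计加fillna')
    print(each_column, scores[each_column])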