Example #1
def main():
    # Note: these are LightGBM-style parameter names (cf. the commented-out
    # train_lgb_cla call below), even though the dict is passed to train_xgb.
    params = {
        'num_leaves': 256,
        'min_child_samples': 79,
        'objective': 'binary',
        'max_depth': 13,
        'learning_rate': 0.03,
        "boosting_type": "gbdt",
        "subsample_freq": 3,
        "subsample": 0.9,
        # "bagging_seed": 11,
        # "eval_metric": 'auc',
        # "verbosity": -1,
        'reg_alpha': 0.3,
        'reg_lambda': 0.3,
        'colsample_bytree': 0.9,
        #'categorical_feature': cat_cols
    }
    # df_train_glo, df_test_glo = read_all_data('small')
    df_train_glo, df_test_glo = read_all_data('all')
    X, y, X_test = data_preprocess(df_train_glo, df_test_glo)
    # ypred = train_lgb_cla(X, y, X_test, params)
    # write(ypred, '1006_1')
    ypred = train_xgb(X, y, X_test, params)
    write(ypred, '1006_2')
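The parameter dict above uses LightGBM naming, so it lines up with the commented-out train_lgb_cla call rather than with XGBoost. Below is a minimal sketch of how such a helper might consume it, assuming a recent lightgbm package and a held-out validation split; train_lgb_cla itself is not shown in the excerpt, so everything here is an assumption:

import lightgbm as lgb
from sklearn.model_selection import train_test_split

def train_lgb_cla(X, y, X_test, params):
    # Hold out 20% for early stopping; the real helper may split differently.
    X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2,
                                                random_state=42)
    model = lgb.LGBMClassifier(n_estimators=2000, **params)
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], eval_metric='auc',
              callbacks=[lgb.early_stopping(100)])
    return model.predict_proba(X_test)[:, 1]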
Example #2
def main():
    time_now = time.time()
    # df_train_glo, df_test_glo = read_all_data('small')
    df_train_glo, df_test_glo = read_all_data('all')
    global get_dummies_fea
    must_delete = delete_null_feature(0.4)

    best_loss = float('inf')  # running best cross-validation loss
    best_ratio = 0.8
    best_mask = 'delete_row'
    delete_cols = delete_both_feature(best_ratio) + must_delete
    df_train, df_test = data_preprocess(df_train_glo, df_test_glo, delete_cols,
                                        best_mask)
    pred = train(df_train, df_test)
    write(pred, '1004_2')
    for ratio in [0.8, 0.5, 0.3, 0.2]:
        # for ratio in []:
        delete_cols = delete_both_feature(ratio) + must_delete
        for drop_mask in ['fillna_mode', 'delete_row']:
            print('\n\ndelete_null_ratio =', ratio, 'drop_mask =', drop_mask, '-' * 100)
            df_train, df_test = data_preprocess(df_train_glo, df_test_glo,
                                                delete_cols, drop_mask)
            for model in [xgboost.XGBRegressor()]:
                loss, mse = kFold_cross(df_train, model)  # 0.97
                if best_loss > loss:
                    best_loss = loss
                    best_mask = drop_mask
                    best_ratio = ratio
                print(time.time() - time_now, '\n\n')

    delete_cols = delete_both_feature(best_ratio) + must_delete
    df_train, df_test = data_preprocess(df_train_glo, df_test_glo, delete_cols,
                                        best_mask)
    pred = train(df_train, df_test)
    write(pred, '1004_3')
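kFold_cross is not shown in the excerpt. A hedged sketch of what it could look like, assuming a target column named 'isFraud' as in the other examples and mean squared error as the loss (the real helper may use a different target or metric):

import numpy as np
from sklearn.model_selection import cross_val_score

def kFold_cross(df_train, model, n_splits=5):
    # Sketch only: target column and scoring are assumptions.
    y = df_train['isFraud']
    X = df_train.drop(['isFraud'], axis=1)
    scores = cross_val_score(model, X, y, cv=n_splits,
                             scoring='neg_mean_squared_error')
    mse = -scores.mean()
    return mse, mse  # the caller unpacks (loss, mse)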
Example #3
def plot1():
    data_size = sys.argv[1]  # all or small
    df_train, df_test = read_all_data(data_size)

    plt.figure(figsize=(15, 5))
    plt.scatter(df_train.TransactionDT, df_train.D15)
    plt.title('Original D15')
    plt.xlabel('Time')
    plt.ylabel('D15')
    plt.show()
Example #4
def main():
    data_size = sys.argv[1]  # all or small
    df_train, df_test = read_all_data(data_size)
    print(df_train.shape, df_test.shape)

    row = 3
    col = 5
    for i in range(1, 16):
        plt.subplot(row, col, i)
        plt.scatter(df_train['TransactionDT'], df_train['D' + str(i)])
        plt.title('D' + str(i))

    plt.savefig('./picture/Transaction_D_index.jpg')
    plt.show()
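The same 3×5 grid can be written with matplotlib's object-oriented API, which also makes the figure size explicit (a sketch equivalent to the loop above; the figsize and marker size are choices, not from the original):

fig, axes = plt.subplots(3, 5, figsize=(20, 10))
for i, ax in enumerate(axes.flat, start=1):
    ax.scatter(df_train['TransactionDT'], df_train['D' + str(i)], s=2)
    ax.set_title('D' + str(i))
fig.savefig('./picture/Transaction_D_index.jpg')
plt.show()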
Example #5
def main():
    time_now = time.time()
    # df_train_glo, df_test_glo = read_all_data('small')
    df_train_glo, df_test_glo = read_all_data('all')
    global get_dummies_fea
    must_delete = delete_null_feature(0.4)

    best_ratio = 0.8
    best_mask = 'delete_row'
    delete_cols = delete_both_feature(best_ratio) + must_delete
    df_train, df_test = data_preprocess(df_train_glo, df_test_glo, delete_cols, best_mask)
    print(df_train.columns.values)
    print(df_test.columns.values)
    y = df_train['isFraud']
    X = df_train.drop(['isFraud', 'TransactionDT'], axis=1)
    X_test = df_test.drop(['TransactionDT'], axis=1)
    y_pred = train_lgb(X, y, X_test)
    write(y_pred, '1008_1')
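Neither write nor train_lgb appears in the excerpt. A hypothetical sketch of the write helper, assuming the standard IEEE-CIS sample-submission layout; the file paths and column names are guesses:

import pandas as pd

def write(y_pred, name):
    # Hypothetical: align predictions with the sample submission's IDs.
    sub = pd.read_csv('./data/sample_submission.csv')
    sub['isFraud'] = y_pred
    sub.to_csv('./output/%s.csv' % name, index=False)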
Example #6
def main():
    begin_time = time.time()
    df_train, df_test = read_all_data(int(sys.argv[1]))
    y_train = df_train['isFraud'].copy()
    print(df_train.shape, df_test.shape)
    # Encode the M1..M9 flags: 'T' -> 1, 'F' -> -1, missing/other -> 0.
    for i in range(1, 10):
        col = 'M' + str(i)
        print(col + '.most_common =', Counter(df_train[col].tolist()).most_common(10))
        df_train[col] = df_train[col].apply(
            lambda x: 1 if str(x) == 'T' else -1 if str(x) == 'F' else 0).astype(int)
        df_test[col] = df_test[col].apply(
            lambda x: 1 if str(x) == 'T' else -1 if str(x) == 'F' else 0).astype(int)
        print(col + '.most_common =', Counter(df_train[col].tolist()).most_common(10))

    df_train, df_test = data_normalize(df_train, df_test)
    df_train, df_test = add_datetime_feature(df_train, df_test)
    df_train, df_test = encode(df_train, df_test)
    cols = remove_cols(df_train)
    df_train, df_test = get_uid(df_train, df_test)
    df_train, df_test = encode2(df_train, df_test, cols)
    for col in ['ProductCD', 'card6', 'P_emaildomain', 'R_emaildomain', 'id_12', 'id_15', 'id_16', 'id_28', 'id_29', 'id_31', 'id_35', \
            'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo']:
        le = LabelEncoder()
        le.fit(df_train[col].tolist() + df_test[col].tolist())
        df_train[col] = le.transform(df_train[col].tolist())
        df_test[col] = le.transform(df_test[col].tolist())

    df_train[cols].info()  # DataFrame.info() prints its own summary
    df_train[['TransactionAmt']].info()
    for col in cols:
        if col == 'isFraud': continue
        if col not in df_train.columns.values or col not in df_test.columns.values:
            print('\nerror', col, '\n')
            continue  # skip missing columns rather than raising a KeyError
        # Fill NaNs with the mean computed over both train and test.
        mean = np.mean(df_train[~df_train[col].isna()][col].tolist() +
                       df_test[~df_test[col].isna()][col].tolist())
        df_train[col] = df_train[col].fillna(mean)
        df_test[col] = df_test[col].fillna(mean)
    oof, preds = BUILD96(df_train, df_test, y_train, cols)
    BUILD96_output(preds)

    print('elapsed time =', time.time() - begin_time)
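The T/F encoding in the loop above can also be expressed with a dict and pandas' map, which avoids the per-element lambda; assuming the M columns hold 'T', 'F', or NaN, the behavior is identical:

m_map = {'T': 1, 'F': -1}
for i in range(1, 10):
    col = 'M' + str(i)
    df_train[col] = df_train[col].map(m_map).fillna(0).astype(int)
    df_test[col] = df_test[col].map(m_map).fillna(0).astype(int)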
Example #7
# encoding:utf-8
# FileName: main
# Author:   xiaoyi | 小一
# email:    [email protected]
# Date:     2020/2/22 21:05
# Description: Analyzing the epidemic data | Has the inflection point arrived?
import os

from plot_data import plot_map, plot_line_chart
from preprocess import summary_data
from read_data import read_latest_data, read_all_data

if __name__ == '__main__':
    # Read the data
    df_data = read_all_data('province')
    # Aggregate the nationwide figures for each day
    df_result_all = summary_data(df_data, 'all')
    df_result_excep_HB = summary_data(df_data, 'excep_HB')
    df_result_HB = summary_data(df_data, 'HB')

    line_chart_title = [
        'Cumulative confirmed cases (by:『知秋小梦』)', 'New confirmed cases (by:『知秋小梦』)',
        'Cumulative recoveries (by:『知秋小梦』)', 'Cumulative deaths (by:『知秋小梦』)',
        'Recovery rate (by:『知秋小梦』)', 'Death rate (by:『知秋小梦』)'
    ]

    # Draw the line charts
    plot_line_chart('Nationwide', line_chart_title, df_result_all)
    plot_line_chart('Nationwide (excluding Hubei)', line_chart_title, df_result_excep_HB)
    plot_line_chart('Hubei Province', line_chart_title, df_result_HB)

    # Get the epidemic data for the latest date
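The excerpt stops at the comment above. Given the otherwise unused read_latest_data and plot_map imports, the missing tail presumably resembled the following sketch (the function names come from the imports; the argument and wiring are guesses, not the original code):

    df_latest = read_latest_data('province')  # argument is a guess
    plot_map(df_latest)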
Example #8
def json_data(selectedDate):
    # Filter to the selected date and serialize it for the GeoJSONDataSource.
    df_date = merged[merged["Date"] == str(selectedDate)]
    return df_date.to_json()

def update_date(attr, old, new):
    yr = date_slider.value
    new_data = json_data(yr)
    geosource.geojson = new_data
    p.title.text = 'Covid 19 deaths, %s' % yr

def update_color(attr, old, new):
    color_mapper.high = color_slider.value

merged = read_all_data()
geosource = GeoJSONDataSource(geojson=json_data(str(date(2020, 4, 25))))
palette = brewer['YlGnBu'][8]
palette = palette[::-1]  # reverse so darker shades correspond to more deaths
color_cap_deaths = round(np.nanmax(merged["Deaths"]) + 500, -3)
color_mapper = LinearColorMapper(palette=palette, low=0, high=color_cap_deaths,
                                 nan_color='#d9d9d9')
hover = HoverTool(tooltips=[('Country/region', '@country'), ('deaths', '@Deaths')])
color_bar = ColorBar(color_mapper=color_mapper, label_standoff=8, width=900,
                     height=20, border_line_color=None, location=(0, 0),
                     orientation='horizontal')
p = figure(title='Covid 19 deaths', plot_height=900, plot_width=1600,
           tools=[BoxZoomTool(), ResetTool(), hover])
p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = None
p.patches('xs', 'ys', source=geosource,
          fill_color={'field': 'Deaths', 'transform': color_mapper},
          line_color='black', line_width=0.25, fill_alpha=1)
p.add_layout(color_bar, 'below')

date_slider = DateSlider(title="Date Range: ", start=date(2020, 1, 31), end=date(2020, 4, 25), value=date(2020, 4, 25), step=1)
date_slider.on_change('value', update_date)
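update_color references a color_slider that is never defined in the excerpt, and nothing attaches the figure to a document. A minimal sketch of the missing wiring, assuming the app is launched with `bokeh serve` (the slider bounds and layout are guesses):

from bokeh.io import curdoc
from bokeh.layouts import column
from bokeh.models import Slider

color_slider = Slider(title='Color scale max', start=1000,
                      end=int(color_cap_deaths), value=int(color_cap_deaths),
                      step=1000)
color_slider.on_change('value', update_color)
curdoc().add_root(column(p, date_slider, color_slider))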
Example #9
def main():
    df_train_glo, df_test_glo = read_all_data('small')
    # df_train_glo, df_test_glo = read_all_data('all')
    X, y, X_test = data_preprocess(df_train_glo, df_test_glo)
    ypred = train_xgb(X, y, X_test)
    write(ypred, '1006_3')
Example #10
def main():
    df_train_glo, df_test_glo = read_all_data('small')
    # df_train_glo, df_test_glo = read_all_data('all')
    data_preprocess(df_train_glo, df_test_glo)
Example #11
import threading
import time

from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import numpy as np
from collections import defaultdict

from read_data import read_all_data
from extract_word import extract_feature
from exture_feature_vec import extract_feature_of_a_seq

all_family_data = read_all_data()
# Total number of sequences across all families.
sum_num = sum(len(data) for data in all_family_data.values())
print(time.ctime(), ":", "total number of samples:", sum_num)
features = extract_feature(all_family_data)
dict_feature = {}
for feature in features:
    if len(feature) in dict_feature.keys():
        dict_feature[len(feature)] += 1
    else:
        dict_feature[len(feature)] = 1
print(sorted(dict_feature.items(), key=lambda d: d[0], reverse=False))
print(features, len(features))

family_names = {}
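The feature-length histogram built in the loop above is exactly what collections.Counter computes; an equivalent, more compact form:

from collections import Counter

dict_feature = Counter(len(feature) for feature in features)
print(sorted(dict_feature.items()))  # sorted ascending by length, as before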
Example #12
        'TEST_VALIDATION', 'TOP_K'
    ]
    if all_para[2] == 'LCFN': para_name += ['FREQUENCY_USER', 'FREQUENCY_ITEM']
    if all_para[2] == 'LightLCFN':
        para_name += [
            'FREQUENCY_USER', 'FREQUENCY_ITEM', 'FREQUENCY', 'KEEP_PORB',
            'SAMPLE_RATE', 'GRAPH_CONV', 'PREDICTION', 'LOSS_FUNCTION',
            'GENERALIZATION', 'OPTIMIZATION', 'IF_TRASFORMATION', 'ACTIVATION',
            'POOLING'
        ]
    if all_para[2] == 'SGNN': para_name += ['PROP_DIM', 'PROP_EMB', 'IF_NORM']
    # if testing the model, we need to read in the test set
    if tuning_method == 'test': all_para[11] = para[11] = 'Test'

    ## read data
    data = read_all_data(all_para)
    para[10] = data[-1]

    ## tuning the model
    os.environ["CUDA_VISIBLE_DEVICES"] = all_para[0]
    if tuning_method == 'tuning':
        tuning(path_excel_dir, para_name, para, data, lr_coarse, lamda_coarse,
               min_num_coarse, max_num_coarse, min_num_fine, max_num_fine)
    if tuning_method == 'fine_tuning':
        fine_tuning(path_excel_dir, para_name, para, data, lr_fine, lamda_fine,
                    min_num_fine, max_num_fine)
    if tuning_method == 'cross_tuning':
        cross_tuning(path_excel_dir, para_name, para, data, lr_fine,
                     lamda_fine, min_num_fine, max_num_fine)
    if tuning_method == 'coarse_tuning':
        coarse_tuning(path_excel_dir, para_name, para, data, lr_coarse,