Example #1
def report_main():
    import os
    import shutil
    data = load_data()
    dirname = "Report"
    path = os.getcwd()
    make_dir(dirname, path)
    print("Tolong tunggu sebentar...")

    wb_recap = load_sheet('format/Format_3.xlsx')
    wb_solo = load_sheet('format/Format_3.xlsx')
    rekapfolder = "Process"
    rekapfile = "Rekap Evaluasi"
    organisasi = "BPMU"
    tahun = "2018/2019"
    wb = load_sheet(rekapfolder + '/' + rekapfile + '.xlsx', True)
    init = 0
    for cat in data:
        make_dir(cat, path + '/' + dirname)
        for y in data[cat]:
            for nim in y:
                wb.active = wb[nim]
                wb_recap.active = init
                wb_recap.copy_worksheet(wb_recap.active)
                wb_recap.active.title = nim[:9]
                # fillidentity(wb_recap,data[cat][0][nim][0]['nama'],nim,organisasi,data[cat][0][nim][0]['jabatan'],tahun)
                # fillscore(wb_recap,wb[nim])

                # fillidentity(wb_solo,data[cat][0][nim][0]['nama'],nim,organisasi,data[cat][0][nim][0]['jabatan'],tahun)
                # fillscore(wb_solo,wb[nim])
                wb_solo.save(path + '/' + dirname + "/" + cat + '/' + nim +
                             ".xlsx")
                init += 1
    wb_recap.save(path + '/' + dirname + "/Recap Laporan Evaluasi.xlsx")
    return
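Note: Examples #1, #4 and #8 call helpers (make_dir, load_sheet) that are not shown on this page. The sketch below is only an assumption of what they might look like, inferred from the call sites (openpyxl workbooks, directories created under a base path):

# Hypothetical helpers inferred from how they are used above; not the original code.
import os
import openpyxl

def make_dir(dirname, path):
    # create <path>/<dirname>, ignoring the case where it already exists
    os.makedirs(os.path.join(path, dirname), exist_ok=True)

def load_sheet(filename, data_only=False):
    # the optional second argument is assumed to map to openpyxl's data_only flag
    return openpyxl.load_workbook(filename, data_only=bool(data_only))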
Example #2
def get_data():
    x_i = []
    for row in load_data():
        x_i.append(row)
        if len(x_i) > 400:
            yield np.array(x_i[:400]).reshape(-1, 400,
                                              5), np.array(x_i[400]).reshape(
                                                  -1, 5)
            x_i.pop(-1)
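Note on Example #2: only the newest row is popped after each yield, so the first 400 rows stay fixed and every batch pairs that same window with the next incoming row as the target. A minimal consumption sketch, assuming numpy is available and load_data() yields rows of 5 numeric values:

import numpy as np  # the generator above assumes this import at module level

for X_window, y_next in get_data():
    # X_window has shape (1, 400, 5); y_next has shape (1, 5)
    print(X_window.shape, y_next.shape)
    break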
Example #3
def main(num_epochs=NUM_EPOCHS):

    #l_in = lasagne.layers.InputLayer((BATCH_SIZE,64,1,8,512),x,'input_layer')
    l_in = lasagne.layers.InputLayer((BATCH_SIZE, sli, 1, sli_l, 512))

    l_forward_1 = lasagne.layers.LSTMLayer(
        l_in,
        N_HIDDEN,
        grad_clipping=GRAD_CLIP,
        nonlinearity=lasagne.nonlinearities.tanh)

    l_forward_slice = lasagne.layers.SliceLayer(l_forward_1, -1, 1)

    l_out = lasagne.layers.DenseLayer(
        l_forward_slice,
        num_units=vocab_size,
        W=lasagne.init.GlorotUniform(),
        nonlinearity=lasagne.nonlinearities.softmax)

    target_values = T.ivector('target_output')

    network_output = lasagne.layers.get_output(l_out)

    cost = T.nnet.categorical_crossentropy(network_output,
                                           target_values).mean()

    all_params = lasagne.layers.get_all_params(l_out, trainable=True)

    updates = lasagne.updates.adagrad(cost, all_params, LEARNING_RATE)

    train = theano.function([l_in.input_var, target_values],
                            cost,
                            updates=updates,
                            allow_input_downcast=True)
    compute_cost = theano.function([l_in.input_var, target_values],
                                   cost,
                                   allow_input_downcast=True)

    get_out = theano.function([l_in.input_var],
                              lasagne.layers.get_output(l_out),
                              allow_input_downcast=True)

    probs = theano.function([l_in.input_var],
                            network_output,
                            allow_input_downcast=True)
    for n in range(1000):  # range() for Python 3 (original used xrange)
        inp_t, inp_v, output_t, output_v = load_data()
        x, x_v, y, y_v = gen_data()
        avg_cost = 0
        avg_cost += train(x, y)
        val_output = get_out(x_v)
        val_predictions = np.argmax(val_output, axis=1)
        #print(val_predictions)
        #print(y_v)
        accuracy = np.mean(val_predictions == y_v)
        print(accuracy)
        print(avg_cost)
Example #4
def make_recap_sheet():
    data = load_data()
    sheet = load_sheet('format/Format_2.xlsx')
    active_sheet_idx = 0
    for cat in data:
        for _ in data[cat]:
            for nim in _:
                sheet.copy_worksheet(sheet.active)
                active_sheet_idx += 1
                sheet.active = active_sheet_idx
                sheet.active.title = nim
                sheet.active["B1"].value = data[cat][0][nim][0]['nama']
                sheet.active["B2"].value = nim
                sheet.active["B3"].value = data[cat][0][nim][0]['jabatan']
                sheet.active = 0
    return sheet
Example #5
def main(cfg):

    # parse config
    DATA_FOLDER = path.Path(cfg["DATA"]["DatasetPath"])
    MODEL_PATH = path.Path(cfg["MODEL"]["FilePath"])
    # do something with data
    #X = pd.read_csv(f'{DATA_FOLDER}/{cfg["DATA"]["UsersFile"]}')

    x, y = load_data('train', cfg)
    train_x, val_x = split(x)
    train_y, val_y = split(y)

    model = SimpleModel()

    model.fit(train_x, train_y, val_x, val_y)

    joblib.dump(model, MODEL_PATH)
    logging.info("model was trained")
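Example #5 also relies on load_data, split, and SimpleModel, which do not appear on this page. A plausible stand-in for split (a simple 80/20 holdout), shown purely as an assumption:

# hypothetical helper; the real split() used by Example #5 is not shown here
def split(values, train_frac=0.8):
    cut = int(len(values) * train_frac)
    return values[:cut], values[cut:]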
Example #6
def main(num_epochs=NUM_EPOCHS):

    #l_in = lasagne.layers.InputLayer((BATCH_SIZE,64,1,8,512),x,'input_layer')
    l_in = lasagne.layers.InputLayer((BATCH_SIZE,sli,1,sli_l,512))

    l_forward_1 = lasagne.layers.LSTMLayer(
        l_in, N_HIDDEN, grad_clipping=GRAD_CLIP,
        nonlinearity=lasagne.nonlinearities.tanh)


    l_forward_slice = lasagne.layers.SliceLayer(l_forward_1, -1, 1)


    l_out = lasagne.layers.DenseLayer(l_forward_slice,
                                      num_units=vocab_size,
                                      W=lasagne.init.GlorotUniform(),
                                      nonlinearity=lasagne.nonlinearities.softmax)

    target_values = T.ivector('target_output')

    network_output = lasagne.layers.get_output(l_out)

    cost = T.nnet.categorical_crossentropy(network_output,target_values).mean()

    all_params = lasagne.layers.get_all_params(l_out,trainable=True)

    updates = lasagne.updates.adagrad(cost, all_params, LEARNING_RATE)


    train = theano.function([l_in.input_var, target_values], cost,
                            updates=updates, allow_input_downcast=True)
    compute_cost = theano.function([l_in.input_var, target_values], cost,
                                   allow_input_downcast=True)

    get_out = theano.function([l_in.input_var],
                              lasagne.layers.get_output(l_out),
                              allow_input_downcast=True)

    probs = theano.function([l_in.input_var], network_output,
                            allow_input_downcast=True)
    for n in range(1000):  # range() for Python 3 (original used xrange)
        inp_t, inp_v, output_t, output_v = load_data()
        x, x_v, y, y_v = gen_data()
        avg_cost = 0
        avg_cost += train(x, y)
        val_output = get_out(x_v)
        val_predictions = np.argmax(val_output, axis=1)
        #print(val_predictions)
        #print(y_v)
        accuracy = np.mean(val_predictions == y_v)
        print(accuracy)
        print(avg_cost)
Example #7
def main(cfg):

    # parse config
    DATA_FOLDER = path.Path(cfg["DATA"]["DatasetPath"])
    USER_ID = cfg["COLUMNS"]["USER_ID"]
    PREDICTION = cfg["COLUMNS"]["PREDICTION"]
    MODEL_PATH = path.Path(cfg["MODEL"]["FilePath"])
    SUBMISSION_FILE = path.Path(cfg["SUBMISSION"]["FilePath"])
    # do something with data
    #X = pd.read_csv(f'{DATA_FOLDER}/{cfg["DATA"]["UsersFile"]}')

    x, client_ids = load_data('test', cfg)

    model = joblib.load(MODEL_PATH)

    preds = model.predict(x)
    preds = np.round(preds.flatten()).astype(int)

    sub = pd.DataFrame.from_dict({
        'client_id': client_ids.tolist(),
        'target': preds.tolist()
    })
    sub.to_csv(SUBMISSION_FILE, index=False)
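Example #7 (like #5) indexes cfg as a nested mapping, e.g. cfg["DATA"]["DatasetPath"]. One way such a config could be produced, assuming a YAML file whose sections mirror those lookups (an assumption, not the original loader):

# hypothetical config loading; section and key names mirror the lookups above
import yaml

with open("config.yaml") as fh:   # assumed file name
    cfg = yaml.safe_load(fh)      # nested dict: cfg["DATA"]["DatasetPath"], ...

main(cfg)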
Example #8
def main_process():
    import os
    import shutil
    data = load_data()
    dirname = "Process"
    path = os.getcwd()
    make_dir(dirname,path)
    print("Tolong tunggu sebentar...")
    try:
        wb_recap = make_recap_sheet()
        for cat in data:
            for y in data[cat]:
                for nim in y:
                    wb = load_sheet("Data Evaluasi/"+nim+".xlsx", 1)
                    for row in range(5,wb.active.max_row,7):
                        if wb.active["C"+str(row)].value != nim and wb.active["C"+str(row)].value != None:
                            wb_recap.active = wb_recap[wb.active["C"+str(row)].value]
                            init = 3
                            while wb_recap.active["E"+str(init)].value != None:
                                init+=1
                            wb_recap.active["E"+str(init)].value = data[cat][0][nim][0]['nama']
                            wb_recap.active["F"+str(init)].value = nim
                            wb_recap.active["G"+str(init)].value = data[cat][0][nim][0]['jabatan']
                            wb_recap.active["H"+str(init)].value = wb.active["F"+str(row+5)].value
                            wb_recap.active["I"+str(init)].value = wb.active["G"+str(row+5)].value
                            wb_recap.active["J"+str(init)].value = wb.active["H"+str(row+5)].value
                            wb_recap.active["K"+str(init)].value = wb.active["K"+str(row+5)].value
                            wb_recap.active["L"+str(init)].value = wb.active["N"+str(row+5)].value
                            wb_recap.active["M"+str(init)].value = wb.active["S"+str(row+5)].value
                            wb_recap.active["N"+str(init)].value = wb.active["W"+str(row+5)].value
                            wb_recap.active["O"+str(init)].value = wb.active["AA"+str(row+5)].value
    except Exception as e:
        print(e)
    wb_recap.active = 0
    wb_recap[wb_recap.active.title].sheet_state = "hidden"
    wb_recap.save(path+'/'+dirname+"/Rekap Evaluasi.xlsx")
    return
Example #9
#x = T.tensor4()

N_HIDDEN = 100

LEARNING_RATE = .001

GRAD_CLIP = 100

NUM_EPOCHS = 20

BATCH_SIZE = 200

vocab_size = 9

inp_t, inp_v, output_t, output_v = load_data()
sli_l = 8
sli = 64


#y = T.ivector()
def gen_data():

    xx = np.zeros((BATCH_SIZE, 512, 512))
    rng_state = np.random.get_state()
    np.random.shuffle(inp_t)
    np.random.set_state(rng_state)
    np.random.shuffle(output_t)
    y = output_t[0:BATCH_SIZE]
    xx = inp_t[0:BATCH_SIZE, :, :]
    y_v = output_v
    # return assumed from the call site `x, x_v, y, y_v = gen_data()` in Examples #3/#6
    return xx, inp_v, y, y_v
Example #10
def run_temp_model(outcome, path_to_data, path_to_result_folder, n_samples=1000):
    if not os.path.exists(path_to_result_folder):
        os.makedirs(path_to_result_folder)

    # # get temp stuff
    # df = pd.read_csv(path_to_data)
    # annual_temps = []
    # daily_temps = []
    # for annual_temp in np.arange(df.meanTempDegree.min(), df.meanTempDegree.max() + 1, 1):
    #     at_dt_temps = np.arange(df.loc[df.meanTempDegree == annual_temp, 'dailyTempCat'].min(),
    #                             df.loc[df.meanTempDegree == annual_temp, 'dailyTempCat'].max() + 0.1, 0.1)
    #     annual_temps += [np.repeat(annual_temp, at_dt_temps.size)]
    #     daily_temps += [at_dt_temps]
    # annual_temps = np.hstack(annual_temps)
    # daily_temps = np.hstack(daily_temps)
    # del df

    # load data
    # -------------------------------------------------------------------------
    tdata = process.load_data(path_to_data, outcome)
    tdata = actions.mtslice.adjust_mean(tdata)
    with open(path_to_result_folder + "/" + outcome + "_tdata.pkl", 'wb') as fwrite:
        pickle.dump(tdata, fwrite, -1)
    tdata_agg = actions.mtslice.aggregate_mtslice(tdata)
    tdata_agg = actions.mtslice.adjust_agg_std(tdata_agg)
    with open(path_to_result_folder + "/" + outcome + "_tdata_agg.pkl", 'wb') as fwrite:
        pickle.dump(tdata_agg, fwrite, -1)

    # fit the mean surface
    # -------------------------------------------------------------------------
    linear_no_mono = ('inj' in outcome)
    surface_result = actions.surface.fit_surface(tdata_agg,
                                                 linear_no_mono=linear_no_mono)
    with open(path_to_result_folder + "/" + outcome + "_surface_result.pkl", 'wb') as fwrite:
        pickle.dump(surface_result, fwrite, -1)

    # fit the study structure in the residual
    # -------------------------------------------------------------------------
    trend_result, tdata_residual = actions.mtslice.fit_trend(tdata,
                                                             surface_result,
                                                             inlier_pct=0.95)
    with open(path_to_result_folder + "/" + outcome + "_trend_result.pkl", 'wb') as fwrite:
        pickle.dump(trend_result, fwrite, -1)
    with open(path_to_result_folder + "/" + outcome + "_tdata_residual.pkl", 'wb') as fwrite:
        pickle.dump(tdata_residual, fwrite, -1)

    # predict surface with UI
    # -----------------------------------------------------------------------------
    annual_temps, daily_temps = utils.create_grid_points_alt(np.unique(tdata_agg.mean_temp), 0.1, tdata)
    curve_samples = process.sample_surface(
        mt=annual_temps, dt=daily_temps, num_samples=n_samples,
        surface_result=surface_result, trend_result=trend_result,
        include_re=True
    )
    curve_samples_df = pd.DataFrame(
        np.vstack([annual_temps, daily_temps, curve_samples]).T,
        columns=['annual_temperature', 'daily_temperature'] + [f'draw_{i}' for i in range(n_samples)]
    )
    curve_samples_df.to_csv(
        path_to_result_folder + "/" + outcome + "_curve_samples.csv",
        index=False
    )

    evidence_score = score.scorelator(curve_samples_df, trend_result, 
        tdata, outcome, path_to_result_folder)
    evidence_score.to_csv(
        path_to_result_folder + "/" + outcome + "_score.csv",
        index=False
    )

    del curve_samples, curve_samples_df

    # plot the result
    # -------------------------------------------------------------------------
    # 3D surface and the level plot
    actions.surface.plot_surface(tdata_agg, surface_result)
    plt.savefig(path_to_result_folder + "/" + outcome + "_surface.pdf",
                bbox_inches="tight")
    # plot uncertainty for each mean temp (can be subset of this)
    plt.figure(figsize=(8, 6))
    for mt in trend_result.mean_temp:
        fig, ax = plt.subplots(1, 1, figsize=(8, 5))
        viz.plot_slice_uncertainty(
                mt,
                tdata,
                surface_result,
                trend_result,
                ylim=[-1.0, 1.0], ax=ax)
        ax.set_xlabel("daily temperature")
        ax.set_title(outcome + " at mean temperature %i" %mt)
        fig.savefig(path_to_result_folder + "/" + outcome + "_slice_%i.pdf" % mt,
                    bbox_inches="tight")
        plt.close(fig)
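Example #10 chains data loading, surface fitting, trend fitting, sampling, scoring, and plotting into one call. A hypothetical invocation with placeholder arguments (the outcome label and paths are not from the original project):

# placeholder arguments, for illustration only
run_temp_model(outcome="heat",                       # assumed outcome label
               path_to_data="data/temperature.csv",  # assumed input CSV
               path_to_result_folder="results",
               n_samples=1000)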
Example #11
    t1 = time.time()
    print(wd_model.get_available_gpus())  # returns e.g. ['/device:GPU:0', '/device:GPU:1']

    # LOAD DATA
    print('*-' * 40, 'LOAD DATA')
    making_data_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/max_order_xt/'
    link_data_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/max_170_link_sqe_for_order/'
    cross_data_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/for_0714_cross_sqe_for_order/'
    head_link_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/max_head_link_data_clear/'
    win_order_data_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/win_order_xw/'
    pre_arrival_sqe_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/sqe_arrival_for_link/'
    data_for_driver_xw = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/data_for_driver_xw/'
    downstream_status_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/downstream_status_for_order/'
    data, mk_cols_list, link_cols_list, cross_cols_list = process.load_data(
        making_data_dir, link_data_dir, cross_data_dir, head_link_dir,
        win_order_data_dir, pre_arrival_sqe_dir, data_for_driver_xw,
        downstream_status_dir)

    # PROCESSING DATA
    print('*-' * 40, 'PROCESSING DATA')
    train_data, val_data = process.processing_data(data, mk_cols_list,
                                                   link_cols_list,
                                                   cross_cols_list, WIDE_COLS)
    del data
    gc.collect()
    # print(train_data.columns.tolist())

    # PROCESSING INPUTS
    print('*-' * 40, 'PROCESSING INPUTS')
    # SAVE LIST
    a = np.array(mk_cols_list)
Example #12
def main():

    random.seed(240480)

    if use_preprocessed_data:
        print('load preprocessed data')
        df_train = pd.read_csv('data/train_processed.csv')
        df_test = pd.read_csv('data/test_processed.csv')
    else:
        df_train, df_test = load_data()

    print('configure data for training')
    id_test = df_test['id']
    y_train = df_train['relevance'].values
    X_train = df_train[:]
    X_test = df_test[:]

    print('construct model')

    # TF-IDF vectorize - converts docs to tf-idf feature matrix.
    tfidf = TfidfVectorizer(ngram_range=(1, 1), stop_words='english')

    # truncated singular value decomposition - dimensionality reduction.
    tsvd = TruncatedSVD(n_components=10, random_state=240480)

    # random forest
    rfr = RandomForestRegressor(n_estimators=500, n_jobs=-1,
                                random_state=240480, verbose=1)

    # TODO: get these features to include some cosine similarity measure between search term and other fields!
    # think we need to first fit a TfidfVectorizer to each of title, description, brand
    # and then insert into pipeline to generate 3x features of search term against the respective vocabs
    # potentially just include similarity scores as features.  or maybe RF will handle this on its own...

    # pipeline:
    # 1. build feature unions [cust_txt_col (to extract column) -> tfidf -> tsvd]
    # 2. pass to random forest.
    clf = Pipeline([
        ('union', FeatureUnion(
            transformer_list=[
                ('cst',  cust_regression_vals()),
                ('txt1', Pipeline([('s1', cust_txt_col(key='search_term')), ('tfidf1', tfidf), ('tsvd1', tsvd)])),
                ('txt2', Pipeline([('s2', cust_txt_col(key='product_title')), ('tfidf2', tfidf), ('tsvd2', tsvd)])),
                ('txt3', Pipeline([('s3', cust_txt_col(key='product_description')), ('tfidf3', tfidf), ('tsvd3', tsvd)])),
                ('txt4', Pipeline([('s4', cust_txt_col(key='brand')), ('tfidf4', tfidf), ('tsvd4', tsvd)]))
            ],
            transformer_weights={
                'cst': 1.0,
                'txt1': 0.5,
                'txt2': 0.25,
                'txt3': 0.0,
                'txt4': 0.5
            },
            n_jobs=-1
        )),
        ('rfr', rfr)])

    print('run grid search')
    # TODO: search over relative weightings of transformer features?
    param_grid = {'rfr__max_features': [10], 'rfr__max_depth': [20]}
    RMSE = make_scorer(fmean_squared_error, greater_is_better=False)
    model = grid_search.GridSearchCV(estimator=clf, param_grid=param_grid, cv=2, scoring=RMSE)
    model.fit(X_train, y_train)

    print("Best parameters found by grid search:")
    print(model.best_params_)
    print("Best CV score:")
    print(model.best_score_)

    print('run predictions')
    y_pred = model.predict(X_test)

    print('save submission file')
    pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv('submission.csv', index=False)
Example #13
def main():

    random.seed(240480)

    if use_preprocessed_data:
        print('load preprocessed data')
        df_train = pd.read_csv('data/train_processed.csv')
        df_test = pd.read_csv('data/test_processed.csv')
    else:
        df_train, df_test = load_data()

    print('configure data for training')
    id_test = df_test['id']
    y_train = df_train['relevance'].values
    X_train = df_train[:]
    X_test = df_test[:]

    print('construct model')

    # TF-IDF vectorize - converts docs to tf-idf feature matrix.
    tfidf = TfidfVectorizer(ngram_range=(1, 1), stop_words='english')

    # truncated singular value decomposition - dimensionality reduction.
    tsvd = TruncatedSVD(n_components=10, random_state=240480)

    # random forest
    rfr = RandomForestRegressor(n_estimators=500,
                                n_jobs=-1,
                                random_state=240480,
                                verbose=1)

    # TODO: get these features to include some cosine similarity measure between search term and other fields!
    # think we need to first fit a TfidfVectorizer to each of title, description, brand
    # and then insert into pipeline to generate 3x features of search term against the respective vocabs
    # potentially just include similarity scores as features.  or maybe RF will handle this on its own...

    # pipeline:
    # 1. build feature unions [cust_txt_col (to extract column) -> tfidf -> tsvd]
    # 2. pass to random forest.
    clf = Pipeline([('union',
                     FeatureUnion(transformer_list=[
                         ('cst', cust_regression_vals()),
                         ('txt1',
                          Pipeline([('s1', cust_txt_col(key='search_term')),
                                    ('tfidf1', tfidf), ('tsvd1', tsvd)])),
                         ('txt2',
                          Pipeline([('s2', cust_txt_col(key='product_title')),
                                    ('tfidf2', tfidf), ('tsvd2', tsvd)])),
                         ('txt3',
                          Pipeline([('s3',
                                     cust_txt_col(key='product_description')),
                                    ('tfidf3', tfidf), ('tsvd3', tsvd)])),
                         ('txt4',
                          Pipeline([('s4', cust_txt_col(key='brand')),
                                    ('tfidf4', tfidf), ('tsvd4', tsvd)]))
                     ],
                                  transformer_weights={
                                      'cst': 1.0,
                                      'txt1': 0.5,
                                      'txt2': 0.25,
                                      'txt3': 0.0,
                                      'txt4': 0.5
                                  },
                                  n_jobs=-1)), ('rfr', rfr)])

    print('run grid search')
    # TODO: search over relative weightings of transformer features?
    param_grid = {'rfr__max_features': [10], 'rfr__max_depth': [20]}
    RMSE = make_scorer(fmean_squared_error, greater_is_better=False)
    model = grid_search.GridSearchCV(estimator=clf,
                                     param_grid=param_grid,
                                     cv=2,
                                     scoring=RMSE)
    model.fit(X_train, y_train)

    print("Best parameters found by grid search:")
    print(model.best_params_)
    print("Best CV score:")
    print(model.best_score_)

    print('run predictions')
    y_pred = model.predict(X_test)

    print('save submission file')
    pd.DataFrame({
        "id": id_test,
        "relevance": y_pred
    }).to_csv('submission.csv', index=False)
Example #14
#x = T.tensor4()


N_HIDDEN = 100

LEARNING_RATE = .001

GRAD_CLIP = 100

NUM_EPOCHS = 20

BATCH_SIZE = 200

vocab_size = 9

inp_t,inp_v,output_t,output_v = load_data()
sli_l = 8
sli = 64

#y = T.ivector()
def gen_data():

    xx = np.zeros((BATCH_SIZE,512,512))
    rng_state = np.random.get_state()
    np.random.shuffle(inp_t)
    np.random.set_state(rng_state)
    np.random.shuffle(output_t)
    y = output_t[0:BATCH_SIZE]
    xx = inp_t[0:BATCH_SIZE, :, :]
    y_v = output_v
    # return assumed from the call site `x, x_v, y, y_v = gen_data()` in Examples #3/#6
    return xx, inp_v, y, y_v
Example #15
import process as p
from sklearn import datasets, svm, metrics, utils
from sklearn.ensemble import RandomForestClassifier


dataset = p.load_data("./pot.csv","./targets.csv")

print("hello there :)")

#clf = svm.SVC()
clf = RandomForestClassifier(max_depth=5, random_state=0)

print("100% of the data is {}.".format(len(dataset.data)))

# Get 4/5
split_index = len(dataset.data)//5*4
print("80% of the data is {}.".format(split_index))

train_data = dataset.data[:split_index]
test_data = dataset.data[split_index:]

train_target = dataset.target[:split_index]
test_target = dataset.target[split_index:]

print(train_target.shape)
clf.fit(train_data, train_target)

out = clf.predict(test_data[0:])

print(out)
Example #16
import numpy as np
from process import load_data

unique_base_classes = set(load_data('FC100_train.pickle')['labels'])
np.random.seed(seed=42)

base_test_image_indices = {}
for cl in sorted(unique_base_classes):
    base_test_image_indices[cl] = sorted(
        np.random.choice(a=list(range(1, 601)), size=100, replace=False))
    # print(cl, base_test_image_indices[cl])

import pickle
with open('base_test_indices.pickle', 'wb') as file:
    pickle.dump(obj=base_test_image_indices, file=file)
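For completeness, the sampled indices can be read back from the pickle written above (this read-back step is not part of the original snippet):

# assumes the file written by the snippet above exists
import pickle

with open('base_test_indices.pickle', 'rb') as file:
    base_test_image_indices = pickle.load(file)
print(len(base_test_image_indices), "base classes sampled")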
Example #17
    head_data_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/max_head_link_data_clear/'
    win_order_data_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/win_order_xw/'
    #pre_arrival_data_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/final_pre_arrival_data/'
    arrival_data_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/max_link_sqe_for_order_arrival/'
    zsl_arrival_data_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/zsl_arrival/'
    arrival_sqe_data_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/max_170_lk_arrival_sqe_for_order/'
    #h_s_for_link_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/max_hightmp_slice_for_link_eb/'
    pre_arrival_sqe_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/sqe_arrival_for_link/'
    zsl_link_data_dir = '/home/didi2021/didi2021/giscup_2021/final_train_data_0703/zsl_train_link/'
    data, mk_cols_list, link_cols_list, cross_cols_list = process.load_data(
        making_data_dir,
        link_data_dir,
        cross_data_dir,
        link_data_other_dir,
        head_data_dir,
        win_order_data_dir,
        pre_arrival_sqe_dir,
        zsl_link_data_dir,
        #pre_arrival_data_dir,
        #h_s_for_link_dir,
        arrival_data_dir,
        zsl_arrival_data_dir,
        arrival_sqe_data_dir)

    #fd = dcn_model.FeatureDictionary(data, numeric_cols=NUMERIC_COLS, ignore_cols=IGNORE_COLS,
    #                                 cate_cols=CATEGORICAL_COLS)
    # PROCESSING DATA
    data['date_time'] = data['date_time'].astype(int)
    print("type(data['date_time']):", data['date_time'].dtype)
    data = data[data['date_time'] != 20200901]
    print('Here train_test_split..................')
    # all_train_data, _ = train_test_split(all_train_data, test_size=0.9, random_state=42)