Example No. 1
def run_model1v1(store_data_file, store_weather_file, test_data_file):
    """
    This is an update to model1 in which each optimization includes only one
    unknown day, ignoring the similarity constraint between unknown days. On the
    validation set it does not seem to offer any improvement over model1.
    """
    print "start here"

    # write header to test result
    with open('test_result.csv', 'w') as f:
        f.write('id,units\n')

    # load data
    store_data, store_weather, test = load_data2(store_data_file, \
          store_weather_file, test_data_file)

    # compute max sale for each item at each store as denominator
    store_data_max = store_data.groupby(level=1).max()

    # develop training and validation set
    train, valid = develop_valid_set2(store_data, store_weather, valid_size=70)

    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set(train, valid, test, store_weather)

    # run prediction on testing data of each category
    for n, trn, vld, tst in target_set:
        print "%d, train(%d), valid(%d), test(%d)" % (n, len(trn), len(vld),
                                                      len(tst))

        # normalize training, validation and testing data sets
        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)

        # compute feature matrix
        _, m = l_sim(nm_trn, nm_vld, nm_tst, store_weather)

        v_init = None

        # init Y_hat
        Y_hat = np.zeros(
            (len(nm_trn) + len(nm_vld) + len(nm_tst), nm_trn.values.shape[1]))

        # predicting validation data
        helper_model2_1(nm_vld, len(nm_trn), nm_trn, m, Y_hat, store_data_max)

        # predicting testing data
        helper_model2_1(nm_tst,
                        len(nm_trn) + len(nm_vld), nm_trn, m, Y_hat,
                        store_data_max)

        # evaluate error in training and validation set
        e1, e2 = eval_model(trn, vld, Y_hat)
        print "error at %d is: train(%f), valid(%f)" % (n, e1, e2)

        # write results to test result
        write_submission(trn, vld, tst, Y_hat, 'test_result.csv')
Example No. 2
def run_model1(store_data_file, store_weather_file, test_data_file, model_param, only_validate=False):
    """
    The model uses squared error to measure the difference between Y_hat and
    Y, and uses similarity to regularize Y_hat, i.e. whether one day's sales can
    be reconstructed from similar days' sales. The performance of the model on
    this task is not particularly good.
    """
    print "start here"

    # write header to test result
    with open('test_result.csv', 'w') as f:
        f.write('id,units\n')

    # load data
    store_data, store_weather, test = load_data2(store_data_file, \
          store_weather_file, test_data_file)

    # compute max item sales for each store as denominator
    store_data_max = store_data.groupby(level=1).max()

    # develop training and validation set
    train, valid = develop_valid_set2(store_data, store_weather, valid_size=100)

    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set2(train, valid, test, store_weather)

    # run prediction on testing data of each category
    for n, trn, vld, tst in target_set:
        print "%d, train(%d), valid(%d), test(%d)" % (n, len(trn), len(vld), len(tst))

        # normalize training, validation and testing data sets
        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)

        v_init = None
        Y_hat2 = None

        for i in range(1):
            # run prediction on all validation and testing data set
            Y_hat = build_model1(nm_trn, nm_vld, nm_tst, store_weather, \
                                 valid_init=v_init, alpha_train=model_param)

            # kept in case the model has a stacking effect
            # v_init = Y_hat[len(trn):]

            # denormalize the sale
            Y_hat2 = denormalize_store_data(trn, vld, tst, Y_hat, store_data_max)

            # evaluate error in training and validation set
            e1, e2 = eval_model(trn, vld, Y_hat2)
            print "error at %d is: train(%f), valid(%f)" % (i, e1, e2)

        # write results to test result
        write_submission(trn, vld, tst, Y_hat2, 'test_result.csv', 'valid_result')
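
The docstring above describes the objective only informally. As a rough illustration, here is a minimal sketch of a cost of that shape, assuming it combines a squared-error fit on the known (training) days with a penalty for days whose sales cannot be reconstructed from similar days; the project's actual build_model1 may weight and structure the terms differently, and the helper name and the role of alpha_unknown below are assumptions.

import numpy as np

def similarity_regularized_cost(Y_hat, Y, L, ntrain, alpha_train, alpha_unknown):
    # Y_hat, Y: (n_days, n_items) sales matrices; L: (n_days, n_days) row-similarity weights
    # fit: squared error on the known (training) rows, weighted by alpha_train
    # recon: how poorly each row is reconstructed from its similar rows, weighted by alpha_unknown
    Y_hat = Y_hat.reshape(Y.shape)
    fit = alpha_train * np.sum((Y_hat[:ntrain] - Y[:ntrain]) ** 2)
    recon = alpha_unknown * np.sum((Y_hat - L.dot(Y_hat)) ** 2)
    return fit + recon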
Example No. 3
def run_model1v1(store_data_file, store_weather_file, test_data_file):
    """
    This is an update to model1 in which each optimization includes only one
    unknown day, ignoring the similarity constraint between unknown days. On the
    validation set it does not seem to offer any improvement over model1.
    """
    print "start here"

    # write header to test result
    with open('test_result.csv', 'w') as f:
        f.write('id,units\n')

    # load data
    store_data, store_weather, test = load_data2(store_data_file, \
          store_weather_file, test_data_file)

    # compute max sale for each item at each store as denominator
    store_data_max = store_data.groupby(level=1).max()

    # develop training and validation set
    train, valid = develop_valid_set2(store_data, store_weather, valid_size=70)

    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set(train, valid, test, store_weather)

    # run prediction on testing data of each category
    for n, trn, vld, tst in target_set:
        print "%d, train(%d), valid(%d), test(%d)" % (n, len(trn), len(vld), len(tst))

        # normalize training, validation and testing data sets
        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)

        # compute feature matrix
        _, m = l_sim(nm_trn, nm_vld, nm_tst, store_weather)

        v_init = None

        # init Y_hat
        Y_hat = np.zeros((len(nm_trn)+len(nm_vld)+len(nm_tst), nm_trn.values.shape[1]))

        # predicting validation data
        helper_model2_1(nm_vld, len(nm_trn), nm_trn, m, Y_hat, store_data_max)

        # predicting testing data
        helper_model2_1(nm_tst, len(nm_trn)+len(nm_vld), nm_trn, m, Y_hat, store_data_max)

        # evaluate error in training and validation set
        e1, e2 = eval_model(trn, vld, Y_hat)
        print "error at %d is: train(%f), valid(%f)" % (n, e1, e2)

        # write results to test result
        write_submission(trn, vld, tst, Y_hat, 'test_result.csv')
Example No. 4
def run_model5(store_data_file, store_weather_file, test_data_file, \
                 model_param=1, validate_only=False, eval_err=None):
    print "---------------------start here---------------------"
    test_result_file = 'test_result.csv'

    # write header to test result
    with open(test_result_file, 'w') as f:
        f.write('id,units\n')

    # load data
    store_data, store_weather, test = load_data2(store_data_file, \
          store_weather_file, test_data_file)

    # compute max item sales for each store as denominator
    store_data_max = store_data.groupby(level=1).max()

    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set3(store_data, test, store_weather, store_data_max, valid_pct=0)

    # run prediction on testing data of each category
    for col, trn, vld, tst in target_set:
        print "%s, train(%d), valid(%d), test(%d), model_param(%f)" % (col, len(trn), len(vld), len(tst), model_param)
        if len(tst) == 0: continue

        # normalize training, validation and testing data sets
        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)

        Y_hat = build_model5(nm_trn, nm_vld, nm_tst, store_weather, column=col, alpha_train=model_param)

        # denormalize the sale
        Y_hat2 = denormalize_store_data(trn, vld, tst, Y_hat, store_data_max, column=col)

        # evaluate error in training and validation set
        e1, e2 = eval_model(trn, vld, Y_hat2, column=col)
        print "error at item(%s) is: train(%f), valid(%f)" % (col, e1, e2)
        if eval_err is not None:
            eval_err.add_result(e1, len(trn), e2, len(vld))

        # write results to test result
        if not validate_only:
            write_submission(trn, vld, tst, Y_hat2, test_result_file, 'valid_result', column=col)

    # write out zero estimation
    if not validate_only:
        write_submission_zero(test, store_data_max, test_result_file)

    if eval_err is not None:
        e1, e2 = eval_err.get_result()
        logging.info("model5(p=%f) error is: train(%f), valid(%f)" % (model_param, e1, e2))
        print "model5(p=%f) error is: train(%f), valid(%f)" % (model_param, e1, e2)
Example No. 5
def main():
    # DATA PREPARATION
    dataset = pd.read_csv('dataset.csv', encoding="ISO-8859-1")
    X_train, X_val, X_test, y_train, y_val, y_test, vect = prepare_dataset(
        dataset)

    # MODELS TRAINING
    print("\n--------------------------------------------------------")
    print("------------------- MODELS  TRAINING -------------------")
    print("--------------------------------------------------------\n")

    # Random Forest
    modelRF = train_RF(X_train, y_train)
    eval_model(modelRF, X_val, y_val)
    # SVM
    modelSVC = train_SVC(X_train, y_train)
    eval_model(modelSVC, X_val, y_val)
    # NN
    modelNN = train_NN(X_train, y_train)
    eval_model(modelNN, X_val, y_val)

    # MODELS TESTING
    print("\n--------------------------------------------------------")
    print("-------------- MODELS  TESTING (accuracy) --------------")
    print("--------------------------------------------------------\n")
    print("RANDOM FOREST:     ", test_model(modelRF, X_test, y_test))
    print("SVC:               ", test_model(modelSVC, X_test, y_test))
    print("NEURAL NETWORK:    ", test_model(modelNN, X_test, y_test))

    # RUN THE DEMO
    run_demo(vect, modelNN)
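
The helpers called above (prepare_dataset, train_RF, train_SVC, train_NN, eval_model, test_model, run_demo) are defined elsewhere in the project. As a hedged, sklearn-based sketch of what two of them might look like (the real implementations may differ):

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

def train_RF(X_train, y_train):
    # fit a random forest on the vectorized training data
    model = RandomForestClassifier(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    return model

def test_model(model, X_test, y_test):
    # plain accuracy, which is what main() prints for each model
    return accuracy_score(y_test, model.predict(X_test))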
Example No. 6
def run_model_eval_mp(trn, vld, tst, Y_hat2, col, cat, \
                  test_result_file, \
                  validate_only=False, eval_err=None):
    # evaluate error in training and validation set
    e1, e2 = eval_model(trn, vld, Y_hat2, column=col)
    print "error at item(%s.%d) is: train(%f), valid(%f)" % (col, cat, e1, e2)
    if eval_err is not None:
        eval_err.add_result(e1, len(trn), e2, len(vld))
        
    # write results to test result
    if not validate_only:
        write_submission(trn, vld, tst, Y_hat2, test_result_file, \
                         'valid_result', column=col)   
Example No. 7
def run_model_eval_mp(trn, vld, tst, Y_hat2, col, cat, \
                  test_result_file, \
                  validate_only=False, eval_err=None):
    # evaluate error in training and validation set
    e1, e2 = eval_model(trn, vld, Y_hat2, column=col)
    print "error at item(%s.%d) is: train(%f), valid(%f)" % (col, cat, e1, e2)
    if eval_err is not None:
        eval_err.add_result(e1, len(trn), e2, len(vld))

    # write results to test result
    if not validate_only:
        write_submission(trn, vld, tst, Y_hat2, test_result_file, \
                         'valid_result', column=col)
Example No. 8
def run_model4v1(store_data_file, store_weather_file, test_data_file, \
                 model_param=1, validate_only=False, eval_err=None):
    """
    ridge regression with log error term
    """
    print "---------------------start here---------------------"
    test_result_file = 'test_result.csv'

    with open(test_result_file, 'w') as f:
        f.write('id,units\n')

    store_data, store_weather, test = load_data2(store_data_file, \
          store_weather_file, test_data_file)

    store_data_max = store_data.groupby(level=1).max()

    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set3(store_data, test, store_weather, store_data_max, columns=set(['1']))

    for col, trn, vld, tst in target_set:
        print "item(%s), train(%d), valid(%d), test(%d), model_param(%f)" % (col, len(trn), len(vld), len(tst), model_param)
        if len(tst) == 0: continue

        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)

        Y_hat, fmat_weight = build_model_log_ridge(nm_trn, nm_vld, nm_tst, store_weather, col, alpha=model_param)

        Y_hat2 = denormalize_store_data(trn, vld, tst, Y_hat[:, np.newaxis], store_data_max, column=col)

        # evaluate error in training and validation set
        e1, e2 = eval_model(trn, vld, Y_hat2, column=col)
        print "error at item(%s) is: train(%f), valid(%f)" % (col, e1, e2)
        if eval_err is not None:
            eval_err.add_result(e1, len(trn), e2, len(vld))

        # write results to test result
        if not validate_only:
            write_submission(trn, vld, tst, Y_hat2, test_result_file, 'valid_result', column=col)

    # write out zero estimation
    if not validate_only:
        write_submission_zero(test, store_data_max, test_result_file)

    if eval_err is not None:
        e1, e2 = eval_err.get_result()
        logging.info("model4v1(p=%f) error is: train(%f), valid(%f)" % (model_param, e1, e2))
        print "model4v1(p=%f) error is: train(%f), valid(%f)" % (model_param, e1, e2)
Example No. 9
def run_model3(store_data_file, store_weather_file, test_data_file, model_param=1, validate_only=False):
    print "start here"
    test_result_file = 'test_result.csv'

    # write header to test result
    with open(test_result_file, 'w') as f:
        f.write('id,units\n')

    # load data
    store_data, store_weather, test = load_data2(store_data_file, \
          store_weather_file, test_data_file)

    # compute max item sales for each store as denominator
    store_data_max = store_data.groupby(level=1).max()

    # develop training and validation set
    train, valid = develop_valid_set2(store_data, store_weather, valid_size=100)

    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set3(train, valid, test, store_weather, store_data_max)

    # run prediction on testing data of each category
    for col, trn, vld, tst in target_set:
        print "%s, train(%d), valid(%d), test(%d)" % (col, len(trn), len(vld), len(tst))

        # normalize training, validation and testing data sets
        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)

        Y_hat = build_model3(nm_trn, nm_vld, nm_tst, store_weather, column=col, alpha_train=model_param)

        # denormalize the sale
        Y_hat2 = denormalize_store_data(trn, vld, tst, Y_hat, store_data_max, column=col)

        # evaluate error in training and validation set
        e1, e2 = eval_model(trn, vld, Y_hat2, column=col)
        print "error is: train(%f), valid(%f)" % (e1, e2)

        # write results to test result
        write_submission(trn, vld, tst, Y_hat2, test_result_file, 'valid_result', column=col)

    # write out zero estimation
    if not validate_only:
        write_submission_zero(test, store_data_max, test_result_file)
Example No. 10
def run_model2(store_data_file, store_weather_file, test_data_file):
    print "start here"

    # write header to test result
    with open('test_result.csv', 'w') as f:
        f.write('id,units\n')

    # load data
    store_data, store_weather, test = load_data2(store_data_file, \
          store_weather_file, test_data_file)

    # compute max item sales for each store as denominator
    store_data_max = store_data.groupby(level=1).max()

    # develop training and validation set
    train, valid = develop_valid_set2(store_data, store_weather, valid_size=0)

    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set(train, valid, test, store_weather)

    # run prediction on testing data of each category
    for n, trn, vld, tst in target_set:
        print "%d, train(%d), valid(%d), test(%d)" % (n, len(trn), len(vld),
                                                      len(tst))

        # normalize training, validation and testing data sets
        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)

        Y_hat, theta = build_model2(nm_trn, nm_vld, nm_tst, store_weather)

        # denormalize the sale
        Y_hat2 = denormalize_store_data(trn, vld, tst, Y_hat, store_data_max)

        # evaluate error in training and validation set
        e1, e2 = eval_model(trn, vld, Y_hat2)
        print "error is: train(%f), valid(%f)" % (e1, e2)

        # write results to test result
        write_submission(trn, vld, tst, Y_hat2, 'test_result.csv')
Example No. 11
def run_model2(store_data_file, store_weather_file, test_data_file):
    print "start here"

    # write header to test result
    with open('test_result.csv', 'w') as f:
        f.write('id,units\n')

    # load data
    store_data, store_weather, test = load_data2(store_data_file, \
          store_weather_file, test_data_file)

    # compute max item sales for each store as denominator
    store_data_max = store_data.groupby(level=1).max()

    # develop training and validation set
    train, valid = develop_valid_set2(store_data, store_weather, valid_size=0)

    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set(train, valid, test, store_weather)

    # run prediction on testing data of each category
    for n, trn, vld, tst in target_set:
        print "%d, train(%d), valid(%d), test(%d)" % (n, len(trn), len(vld), len(tst))

        # normalize training, validation and testing data sets
        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)

        Y_hat, theta = build_model2(nm_trn, nm_vld, nm_tst, store_weather)

        # denormalize the sale
        Y_hat2 = denormalize_store_data(trn, vld, tst, Y_hat, store_data_max)

        # evaluate error in training and validation set
        e1, e2 = eval_model(trn, vld, Y_hat2)
        print "error is: train(%f), valid(%f)" % (e1, e2)

        # write results to test result
        write_submission(trn, vld, tst, Y_hat2, 'test_result.csv')
Example No. 12
def run_model5(store_data_file, store_weather_file, test_data_file, \
                 model_param=1, validate_only=False, eval_err=None):
    print "---------------------start here---------------------"
    test_result_file = 'test_result.csv'

    # write header to test result
    with open(test_result_file, 'w') as f:
        f.write('id,units\n')

    # load data
    store_data, store_weather, test = load_data2(store_data_file, \
          store_weather_file, test_data_file)

    # compute max item sales for each store as denominator
    store_data_max = store_data.groupby(level=1).max()

    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set3(store_data,
                                   test,
                                   store_weather,
                                   store_data_max,
                                   valid_pct=0)

    # run prediction on testing data of each category
    for col, trn, vld, tst in target_set:
        print "%s, train(%d), valid(%d), test(%d), model_param(%f)" % (
            col, len(trn), len(vld), len(tst), model_param)
        if len(tst) == 0: continue

        # normalize training, validation and testing data sets
        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)

        Y_hat = build_model5(nm_trn,
                             nm_vld,
                             nm_tst,
                             store_weather,
                             column=col,
                             alpha_train=model_param)

        # denormalize the sale
        Y_hat2 = denormalize_store_data(trn,
                                        vld,
                                        tst,
                                        Y_hat,
                                        store_data_max,
                                        column=col)

        # evaluate error in training and validation set
        e1, e2 = eval_model(trn, vld, Y_hat2, column=col)
        print "error at item(%s) is: train(%f), valid(%f)" % (col, e1, e2)
        if eval_err is not None:
            eval_err.add_result(e1, len(trn), e2, len(vld))

        # write results to test result
        if not validate_only:
            write_submission(trn,
                             vld,
                             tst,
                             Y_hat2,
                             test_result_file,
                             'valid_result',
                             column=col)

    # write out zero estimation
    if not validate_only:
        write_submission_zero(test, store_data_max, test_result_file)

    if eval_err is not None:
        e1, e2 = eval_err.get_result()
        logging.info("model5(p=%f) error is: train(%f), valid(%f)" %
                     (model_param, e1, e2))
        print "model5(p=%f) error is: train(%f), valid(%f)" % (model_param, e1,
                                                               e2)
Example No. 13
def run_model4v1(store_data_file, store_weather_file, test_data_file, \
                 model_param=1, validate_only=False, eval_err=None):
    """
    ridge regression with log error term
    """
    print "---------------------start here---------------------"
    test_result_file = 'test_result.csv'

    with open(test_result_file, 'w') as f:
        f.write('id,units\n')

    store_data, store_weather, test = load_data2(store_data_file, \
          store_weather_file, test_data_file)

    store_data_max = store_data.groupby(level=1).max()

    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set3(store_data,
                                   test,
                                   store_weather,
                                   store_data_max,
                                   columns=set(['1']))

    for col, trn, vld, tst in target_set:
        print "item(%s), train(%d), valid(%d), test(%d), model_param(%f)" % (
            col, len(trn), len(vld), len(tst), model_param)
        if len(tst) == 0: continue

        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)

        Y_hat, fmat_weight = build_model_log_ridge(nm_trn,
                                                   nm_vld,
                                                   nm_tst,
                                                   store_weather,
                                                   col,
                                                   alpha=model_param)

        Y_hat2 = denormalize_store_data(trn,
                                        vld,
                                        tst,
                                        Y_hat[:, np.newaxis],
                                        store_data_max,
                                        column=col)

        # evaluate error in training and validation set
        e1, e2 = eval_model(trn, vld, Y_hat2, column=col)
        print "error at item(%s) is: train(%f), valid(%f)" % (col, e1, e2)
        if eval_err is not None:
            eval_err.add_result(e1, len(trn), e2, len(vld))

        # write results to test result
        if not validate_only:
            write_submission(trn,
                             vld,
                             tst,
                             Y_hat2,
                             test_result_file,
                             'valid_result',
                             column=col)

    # write out zero estimation
    if not validate_only:
        write_submission_zero(test, store_data_max, test_result_file)

    if eval_err is not None:
        e1, e2 = eval_err.get_result()
        logging.info("model4v1(p=%f) error is: train(%f), valid(%f)" %
                     (model_param, e1, e2))
        print "model4v1(p=%f) error is: train(%f), valid(%f)" % (model_param,
                                                                 e1, e2)
Example No. 14
def run_model4(store_data_file, store_weather_file, test_data_file, \
               model_param=1, validate_only=False, eval_err=None):
    """
    ridge regression
    """
    print "---------------------start here---------------------"
    test_result_file = 'test_result.csv'

    with open(test_result_file, 'w') as f:
        f.write('id,units\n')

    store_data, store_weather, test = load_data2(store_data_file, \
          store_weather_file, test_data_file)

    store_data_max = store_data.groupby(level=1).max()

    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set4(store_data, test, store_weather,
                                   store_data_max)

    for col, trn, vld, tst, cat in target_set:
        print "item(%s), train(%d), valid(%d), test(%d), model_param(%0.2f), cat(%d)" % \
              (col, len(trn), len(vld), len(tst), model_param, cat)
        if len(tst) == 0: continue

        if cat == 0:
            Y_hat2 = np.zeros((len(trn) + len(vld) + len(tst), 1))
        else:
            nm_trn = normalize_store_data(trn, store_data_max)
            nm_vld = normalize_store_data(vld, store_data_max)
            nm_tst = normalize_store_data(tst, store_data_max)

            _, fmat = sim(nm_trn, nm_vld, nm_tst, store_weather)

            Y_hat = np.zeros((len(nm_trn) + len(nm_vld) + len(nm_tst), 1))
            X = fmat[:len(nm_trn)]

            Y = nm_trn[col].values[:, np.newaxis]
            clf = linear_model.Ridge(alpha=model_param)
            clf.fit(X, Y)
            Y_hat[:] = clf.predict(fmat)

            Y_hat2 = denormalize_store_data(trn,
                                            vld,
                                            tst,
                                            Y_hat,
                                            store_data_max,
                                            column=col)

        # evaluate error in training and validation set
        e1, e2 = eval_model(trn, vld, Y_hat2, column=col)
        print "error at item(%s) is: train(%f), valid(%f)" % (col, e1, e2)
        if eval_err is not None:
            eval_err.add_result(e1, len(trn), e2, len(vld))

        # write results to test result
        if not validate_only:
            write_submission(trn,
                             vld,
                             tst,
                             Y_hat2,
                             test_result_file,
                             'valid_result',
                             column=col)

    # write out zero estimation
    if not validate_only:
        write_submission_zero(test, store_data_max, test_result_file)

    if eval_err is not None:
        e1, e2 = eval_err.get_result()
        logging.info("model4(p=%f) error is: train(%f), valid(%f)" %
                     (model_param, e1, e2))
        print "model4(p=%f) error is: train(%f), valid(%f)" % (model_param, e1,
                                                               e2)
Example No. 15
def run_model3(store_data_file,
               store_weather_file,
               test_data_file,
               model_param=1,
               validate_only=False):
    print "start here"
    test_result_file = 'test_result.csv'

    # write header to test result
    with open(test_result_file, 'w') as f:
        f.write('id,units\n')

    # load data
    store_data, store_weather, test = load_data2(store_data_file, \
          store_weather_file, test_data_file)

    # compute max item sales for each store as denominator
    store_data_max = store_data.groupby(level=1).max()

    # develop training and validation set
    train, valid = develop_valid_set2(store_data,
                                      store_weather,
                                      valid_size=100)

    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set3(train, valid, test, store_weather,
                                   store_data_max)

    # run prediction on testing data of each category
    for col, trn, vld, tst in target_set:
        print "%s, train(%d), valid(%d), test(%d)" % (col, len(trn), len(vld),
                                                      len(tst))

        # normalize training, validation and testing data sets
        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)

        Y_hat = build_model3(nm_trn,
                             nm_vld,
                             nm_tst,
                             store_weather,
                             column=col,
                             alpha_train=model_param)

        # denormalize the sale
        Y_hat2 = denormalize_store_data(trn,
                                        vld,
                                        tst,
                                        Y_hat,
                                        store_data_max,
                                        column=col)

        # evaluate error in training and validation set
        e1, e2 = eval_model(trn, vld, Y_hat2, column=col)
        print "error is: train(%f), valid(%f)" % (e1, e2)

        # write results to test result
        write_submission(trn,
                         vld,
                         tst,
                         Y_hat2,
                         test_result_file,
                         'valid_result',
                         column=col)

    # write out zero estimation
    if not validate_only:
        write_submission_zero(test, store_data_max, test_result_file)
Example No. 16
def build_model2(train, valid, test, \
                 store_weather_data, \
                 valid_init=None, theta_init=None, \
                 alpha_train=1000, alpha_unknown=0.01,
                 eps=1e-5, max_iter=100):
    """
    The model uses logistic regression to model similarities between rows,
    iteratively updating Y_hat and the similarity parameter theta.
    """
    # compute similarity matrix without normalization
    _, fmat = sim(train, valid, test, store_weather_data, normalize=False)

    # map feature values into {-1, 1} so that zeros do not vanish in the inner product
    fmat[fmat <= 0] = -1

    # use similarity matrix as init
    L, _ = sim(train, valid, test, store_weather_data)

    # init theta
    if (theta_init is None):
        theta = np.random.rand(fmat.shape[1])*10
    else:
        theta = theta_init

    # count the total number of rows
    ntrain, m = train.values.shape

    nvalid = 0
    if valid is not None:
        nvalid, _ = valid.values.shape

    ntest = 0
    if test is not None:
        ntest, _ = test.values.shape

    n = ntrain + nvalid + ntest

    # init Y and Y_hat
    Y = np.zeros((n, m))
    Y[0:ntrain] = train.values
    if (valid_init is not None):
        Y[ntrain:ntrain + nvalid] = valid_init
    Y_hat = np.random.rand(n, m).flatten()

    # set up constraint on Y_hat that all are >=0
    Y_hat_bounds = [(0, None)] * len(Y_hat)

    err = 1000.0
    iter = 0
    first = True
    while True:  # Y_hat is kept in flattened form
        # compute similarity score
        if not first:
            l = l_logistic_sim(theta, fmat)
            for i in range(n):
                L[i] = l(i)

        first = False

        # compute init total cost
        fval = cost_fun(Y_hat, Y, L, ntrain, alpha_train, alpha_unknown)
        print 'init total cost=', fval

        # run optimization of Y_hat
        Y_hat, fval, _ = fmin_l_bfgs_b(cost_fun, Y_hat, g_cost_fun, \
                            args=(Y, L, ntrain, alpha_train, alpha_unknown), \
                            bounds=Y_hat_bounds, callback=None)
        print 'optimized total cost=', fval

        # compute init similarity cost
        print 'Y_hat shape', Y_hat.shape
        print 'very small Y_hat', Y_hat[Y_hat < 1e-4].shape
        fval = l_cost_fun2(theta, fmat, Y_hat, Y)
        print 'init similarity cost=', fval

        # run optimization of theta
        theta, fval, _ = fmin_l_bfgs_b(l_cost_fun2, theta, g_logistic_sim, \
                            args=(fmat, Y_hat, Y))
        print 'optimized similarity cost=', fval

        iter += 1

        # evaluate errors
        Y_hat = Y_hat.reshape(Y.shape)
        e1, e2 = eval_model(train, valid, Y_hat)
        Y_hat = Y_hat.flatten()
        print "error at %d is: train(%f), valid(%f)" % (iter, e1, e2)

        # stop if the maximum number of iterations is reached or the change is very small
        if abs(e1 - err) / err < eps or \
            iter >= max_iter:
            break
        else:
            err = e1

    return Y_hat.reshape(Y.shape), theta
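
l_logistic_sim is not shown in these examples. Based only on how it is used above (it takes theta and fmat and returns a callable l, with l(i) giving row i of the similarity matrix L), here is a hedged sketch of one plausible implementation, assuming fmat entries are +/-1 after the mapping at the top of the function; the real one may differ.

import numpy as np

def l_logistic_sim_sketch(theta, fmat):
    # fmat: (n_rows, n_features) with entries in {-1, +1}; theta: (n_features,) weights
    def l(i):
        # feature agreement between row i and every row: +1 where they match, -1 otherwise
        agreement = fmat * fmat[i]
        # logistic similarity of row i to every row (including itself)
        return 1.0 / (1.0 + np.exp(-agreement.dot(theta)))
    return l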
Example No. 17
def build_model2(train, valid, test, \
                 store_weather_data, \
                 valid_init=None, theta_init=None, \
                 alpha_train=1000, alpha_unknown=0.01,
                 eps=1e-5, max_iter=100):
    """
    The model uses logistic regression to model similarities between rows,
    iteratively updating Y_hat and the similarity parameter theta.
    """
    # compute similarity matrix without normalization
    _, fmat = sim(train, valid, test, store_weather_data, normalize=False)

    # map feature values into {-1, 1} so that zeros do not vanish in the inner product
    fmat[fmat <= 0] = -1

    # use similarity matrix as init
    L, _ = sim(train, valid, test, store_weather_data)

    # init theta
    if (theta_init is None):
        theta = np.random.rand(fmat.shape[1]) * 10
    else:
        theta = theta_init

    # count the total number of rows
    ntrain, m = train.values.shape

    nvalid = 0
    if valid is not None:
        nvalid, _ = valid.values.shape

    ntest = 0
    if test is not None:
        ntest, _ = test.values.shape

    n = ntrain + nvalid + ntest

    # init Y and Y_hat
    Y = np.zeros((n, m))
    Y[0:ntrain] = train.values
    if (valid_init is not None):
        Y[ntrain:ntrain + nvalid] = valid_init
    Y_hat = np.random.rand(n, m).flatten()

    # set up constraint on Y_hat that all are >=0
    Y_hat_bounds = [(0, None)] * len(Y_hat)

    err = 1000.0
    iter = 0
    first = True
    while True:  # Y_hat is kept in flattened form
        # compute similarity score
        if not first:
            l = l_logistic_sim(theta, fmat)
            for i in range(n):
                L[i] = l(i)

        first = False

        # compute init total cost
        fval = cost_fun(Y_hat, Y, L, ntrain, alpha_train, alpha_unknown)
        print 'init total cost=', fval

        # run optimization of Y_hat
        Y_hat, fval, _ = fmin_l_bfgs_b(cost_fun, Y_hat, g_cost_fun, \
                            args=(Y, L, ntrain, alpha_train, alpha_unknown), \
                            bounds=Y_hat_bounds, callback=None)
        print 'optimized total cost=', fval

        # compute init similarity cost
        print 'Y_hat shape', Y_hat.shape
        print 'very small Y_hat', Y_hat[Y_hat < 1e-4].shape
        fval = l_cost_fun2(theta, fmat, Y_hat, Y)
        print 'init similarity cost=', fval

        # run optimization of theta
        theta, fval, _ = fmin_l_bfgs_b(l_cost_fun2, theta, g_logistic_sim, \
                            args=(fmat, Y_hat, Y))
        print 'optimized similarity cost=', fval

        iter += 1

        # evaluate errors
        Y_hat = Y_hat.reshape(Y.shape)
        e1, e2 = eval_model(train, valid, Y_hat)
        Y_hat = Y_hat.flatten()
        print "error at %d is: train(%f), valid(%f)" % (iter, e1, e2)

        # stop if the maximum number of iterations is reached or the change is very small
        if abs(e1 - err) / err < eps or \
            iter >= max_iter:
            break
        else:
            err = e1

    return Y_hat.reshape(Y.shape), theta
Example No. 18
def run_model1(store_data_file,
               store_weather_file,
               test_data_file,
               model_param,
               only_validate=False):
    """
    The model uses squared error to measure the difference between Y_hat and
    Y, and uses similarity to regularize Y_hat, i.e. whether one day's sales can
    be reconstructed from similar days' sales. The performance of the model on
    this task is not particularly good.
    """
    print "start here"

    # write header to test result
    with open('test_result.csv', 'w') as f:
        f.write('id,units\n')

    # load data
    store_data, store_weather, test = load_data2(store_data_file, \
          store_weather_file, test_data_file)

    # compute max item sales for each store as denominator
    store_data_max = store_data.groupby(level=1).max()

    # develop training and validation set
    train, valid = develop_valid_set2(store_data,
                                      store_weather,
                                      valid_size=100)

    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set2(train, valid, test, store_weather)

    # run prediction on testing data of each category
    for n, trn, vld, tst in target_set:
        print "%d, train(%d), valid(%d), test(%d)" % (n, len(trn), len(vld),
                                                      len(tst))

        # normalize training, validation and testing data sets
        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)

        v_init = None
        Y_hat2 = None

        for i in range(1):
            # run prediction on all validation and testing data set
            Y_hat = build_model1(nm_trn, nm_vld, nm_tst, store_weather, \
                                 valid_init=v_init, alpha_train=model_param)

            # kept in case the model has a stacking effect
            # v_init = Y_hat[len(trn):]

            # denormalize the sale
            Y_hat2 = denormalize_store_data(trn, vld, tst, Y_hat,
                                            store_data_max)

            # evaluate error in training and validation set
            e1, e2 = eval_model(trn, vld, Y_hat2)
            print "error at %d is: train(%f), valid(%f)" % (i, e1, e2)

        # write results to test result
        write_submission(trn, vld, tst, Y_hat2, 'test_result.csv',
                         'valid_result')
Example No. 19
def run_model4(store_data_file, store_weather_file, test_data_file, \
               model_param=1, validate_only=False, eval_err=None):
    """
    ridge regression
    """
    print "---------------------start here---------------------"
    test_result_file = 'test_result.csv'

    with open(test_result_file, 'w') as f:
        f.write('id,units\n')

    store_data, store_weather, test = load_data2(store_data_file, \
          store_weather_file, test_data_file)

    store_data_max = store_data.groupby(level=1).max()

    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set4(store_data, test, store_weather, store_data_max)

    for col, trn, vld, tst, cat in target_set:
        print "item(%s), train(%d), valid(%d), test(%d), model_param(%0.2f), cat(%d)" % \
              (col, len(trn), len(vld), len(tst), model_param, cat)
        if len(tst) == 0: continue

        if cat == 0:
            Y_hat2 = np.zeros((len(trn) + len(vld) + len(tst), 1))
        else:
            nm_trn = normalize_store_data(trn, store_data_max)
            nm_vld = normalize_store_data(vld, store_data_max)
            nm_tst = normalize_store_data(tst, store_data_max)

            _, fmat = sim(nm_trn, nm_vld, nm_tst, store_weather)

            Y_hat = np.zeros((len(nm_trn) + len(nm_vld) + len(nm_tst), 1))
            X = fmat[:len(nm_trn)]

            Y = nm_trn[col].values[:, np.newaxis]
            clf = linear_model.Ridge(alpha=model_param)
            clf.fit(X, Y)
            Y_hat[:] = clf.predict(fmat)

            Y_hat2 = denormalize_store_data(trn, vld, tst, Y_hat, store_data_max, column=col)

        # evaluate error in training and validation set
        e1, e2 = eval_model(trn, vld, Y_hat2, column=col)
        print "error at item(%s) is: train(%f), valid(%f)" % (col, e1, e2)
        if eval_err is not None:
            eval_err.add_result(e1, len(trn), e2, len(vld))

        # write results to test result
        if not validate_only:
            write_submission(trn, vld, tst, Y_hat2, test_result_file, 'valid_result', column=col)

    # write out zero estimation
    if not validate_only:
        write_submission_zero(test, store_data_max, test_result_file)

    if eval_err is not None:
        e1, e2 = eval_err.get_result()
        logging.info("model4(p=%f) error is: train(%f), valid(%f)" % (model_param, e1, e2))
        print "model4(p=%f) error is: train(%f), valid(%f)" % (model_param, e1, e2)