def run_model1v1(store_data_file, store_weather_file, test_data_file):
    """
    Variant of model1 in which each optimization includes only one unknown
    day, which ignores the similarity constraint between unknown days.
    On the validation set it does not seem to offer any improvement over
    model1.

    Parameters:
        store_data_file, store_weather_file, test_data_file: passed
            unchanged to load_data2.

    Side effects:
        Rewrites 'test_result.csv' with an 'id,units' header and then calls
        write_submission once per target category. Progress and errors are
        printed to stdout. Returns None.
    """
    print("start here")

    # write header to test result (the with-block closes the file)
    with open('test_result.csv', 'w') as f:
        f.write('id,units\n')

    # load data
    store_data, store_weather, test = load_data2(
        store_data_file, store_weather_file, test_data_file)

    # compute max sale for each item at each store as denominator
    store_data_max = store_data.groupby(level=1).max()

    # develop training and validation set
    train, valid = develop_valid_set2(store_data, store_weather, valid_size=70)

    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set(train, valid, test, store_weather)

    # run prediction on testing data of each category
    for n, trn, vld, tst in target_set:
        print("%d, train(%d), valid(%d), test(%d)"
              % (n, len(trn), len(vld), len(tst)))

        # normalize training, validation and testing data set
        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)

        # compute feature matrix (first return value of l_sim is not needed)
        _, m = l_sim(nm_trn, nm_vld, nm_tst, store_weather)

        # init Y_hat: one row per train/valid/test record, one column per
        # column of the normalized training frame
        n_trn = len(nm_trn)
        n_vld = len(nm_vld)
        Y_hat = np.zeros((n_trn + n_vld + len(nm_tst),
                          nm_trn.values.shape[1]))

        # predicting validation data; the second argument is the row offset
        # of the validation block inside Y_hat
        # NOTE(review): presumably helper_model2_1 fills Y_hat in place and
        # denormalizes using store_data_max -- confirm against the helper.
        helper_model2_1(nm_vld, n_trn, nm_trn, m, Y_hat, store_data_max)

        # predicting testing data (offset: after train + valid rows)
        helper_model2_1(nm_tst, n_trn + n_vld, nm_trn, m, Y_hat,
                        store_data_max)

        # evaluate error in training and validation set
        e1, e2 = eval_model(trn, vld, Y_hat)
        print("error at %d is: train(%f), valid(%f)" % (n, e1, e2))

        # write results to test result
        write_submission(trn, vld, tst, Y_hat, 'test_result.csv')
def run_model1(store_data_file, store_weather_file, test_data_file,
               model_param, only_validate=False):
    """
    The model uses the square error to measure the difference between Y_hat
    and Y, and uses similarity to regularize Y_hat, that is, whether one
    day's sale can be reconstructed from similar days' sales.
    The performance of the model in this task is not particularly good.

    Parameters:
        store_data_file, store_weather_file, test_data_file: passed
            unchanged to load_data2.
        model_param: forwarded to build_model1 as alpha_train.
        only_validate: accepted for interface compatibility; not read by
            this function.

    Side effects:
        Rewrites 'test_result.csv' with an 'id,units' header and then calls
        write_submission once per target category. Progress and errors are
        printed to stdout. Returns None.
    """
    print("start here")

    # write header to test result (the with-block closes the file)
    with open('test_result.csv', 'w') as f:
        f.write('id,units\n')

    # load data
    store_data, store_weather, test = load_data2(
        store_data_file, store_weather_file, test_data_file)

    # compute max item sales for each store as denominator
    store_data_max = store_data.groupby(level=1).max()

    # develop training and validation set
    train, valid = develop_valid_set2(store_data, store_weather,
                                      valid_size=100)

    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set2(train, valid, test, store_weather)

    # run prediction on testing data of each category
    for n, trn, vld, tst in target_set:
        print("%d, train(%d), valid(%d), test(%d)"
              % (n, len(trn), len(vld), len(tst)))

        # normalize training, validation and testing data set
        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)

        v_init = None
        Y_hat2 = None
        # single pass; the loop is kept so that stacking iterations can be
        # re-enabled by feeding the previous prediction back as v_init
        for i in range(1):
            # run prediction on all validation and testing data set
            Y_hat = build_model1(nm_trn, nm_vld, nm_tst, store_weather,
                                 valid_init=v_init, alpha_train=model_param)
            # save the code in case the model has stacking effect
            #v_init=Y_hat[len(trn):]

            # denormalize the sale
            Y_hat2 = denormalize_store_data(trn, vld, tst, Y_hat,
                                            store_data_max)

            # evaluate error in training and validation set
            e1, e2 = eval_model(trn, vld, Y_hat2)
            print("error at %d is: train(%f), valid(%f)" % (i, e1, e2))

        # write results to test result
        write_submission(trn, vld, tst, Y_hat2, 'test_result.csv',
                         'valid_result')
def run_model1v1(store_data_file, store_weather_file, test_data_file):
    """
    Variant of model1 in which each optimization includes only one unknown
    day, which ignores the similarity constraint between unknown days.
    On the validation set it does not seem to offer any improvement over
    model1.

    Parameters:
        store_data_file, store_weather_file, test_data_file: passed
            unchanged to load_data2.

    Side effects:
        Rewrites 'test_result.csv' with an 'id,units' header and then calls
        write_submission once per target category. Progress and errors are
        printed to stdout. Returns None.
    """
    print("start here")

    # write header to test result (the with-block closes the file)
    with open('test_result.csv', 'w') as f:
        f.write('id,units\n')

    # load data
    store_data, store_weather, test = load_data2(
        store_data_file, store_weather_file, test_data_file)

    # compute max sale for each item at each store as denominator
    store_data_max = store_data.groupby(level=1).max()

    # develop training and validation set
    train, valid = develop_valid_set2(store_data, store_weather, valid_size=70)

    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set(train, valid, test, store_weather)

    # run prediction on testing data of each category
    for n, trn, vld, tst in target_set:
        print("%d, train(%d), valid(%d), test(%d)"
              % (n, len(trn), len(vld), len(tst)))

        # normalize training, validation and testing data set
        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)

        # compute feature matrix (first return value of l_sim is not needed)
        _, m = l_sim(nm_trn, nm_vld, nm_tst, store_weather)

        # init Y_hat: one row per train/valid/test record, one column per
        # column of the normalized training frame
        n_trn = len(nm_trn)
        n_vld = len(nm_vld)
        Y_hat = np.zeros((n_trn + n_vld + len(nm_tst),
                          nm_trn.values.shape[1]))

        # predicting validation data; the second argument is the row offset
        # of the validation block inside Y_hat
        # NOTE(review): presumably helper_model2_1 fills Y_hat in place and
        # denormalizes using store_data_max -- confirm against the helper.
        helper_model2_1(nm_vld, n_trn, nm_trn, m, Y_hat, store_data_max)

        # predicting testing data (offset: after train + valid rows)
        helper_model2_1(nm_tst, n_trn + n_vld, nm_trn, m, Y_hat,
                        store_data_max)

        # evaluate error in training and validation set
        e1, e2 = eval_model(trn, vld, Y_hat)
        print("error at %d is: train(%f), valid(%f)" % (n, e1, e2))

        # write results to test result
        write_submission(trn, vld, tst, Y_hat, 'test_result.csv')
def run_model_mp(model_fun, store_data_file, store_weather_file, test_data_file,
                 model_param=1, validate_only=False, eval_err=None,
                 columns=None):
    """
    Run model_fun for every target produced by build_target_set4, using up
    to four worker processes at a time.

    Parameters:
        model_fun: callable used as the Process target; it is invoked as
            model_fun(queue, col, trn, vld, tst, cat, store_weather,
            store_data_max, model_param).
        store_data_file, store_weather_file, test_data_file: passed
            unchanged to load_data2.
        model_param: forwarded to model_fun and shown in progress output.
        validate_only: forwarded to manage_workers; when true the final
            write_submission_zero call is skipped.
        eval_err: optional accumulator, forwarded to manage_workers; when
            given, its get_result() is logged and printed at the end.
        columns: forwarded to build_target_set4.

    Side effects:
        Rewrites 'test_result.csv' with an 'id,units' header, starts worker
        processes, prints progress to stdout and logs the final error.
        Returns None.
    """
    print("---------------------start here---------------------")
    worker_num = 4
    test_result_file = 'test_result.csv'

    # write header to test result (the with-block closes the file)
    with open(test_result_file, 'w') as f:
        f.write('id,units\n')

    store_data, store_weather, test = load_data2(
        store_data_file, store_weather_file, test_data_file)
    store_data_max = store_data.groupby(level=1).max()

    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set4(store_data, test, store_weather,
                                   store_data_max, columns=columns)

    queue = Queue()
    workers = []
    for col, trn, vld, tst, cat in target_set:
        print("item(%s), train(%d), valid(%d), test(%d), model_param(%0.2f), cat(%d)" %
              (col, len(trn), len(vld), len(tst), model_param, cat))

        # Nothing to predict for this target: skip it. The original code
        # returned here, which abandoned already-started workers and skipped
        # the final drain, zero-fill and error summary; the sequential
        # run_model4 skips such targets with `continue`.
        if len(tst) == 0:
            continue

        # Wait for a free slot before starting another worker.
        # NOTE(review): relies on manage_workers removing finished
        # processes from `workers` in place -- confirm against the helper.
        while len(workers) >= worker_num:
            manage_workers(workers, queue, worker_num,
                           test_result_file,
                           validate_only, eval_err)

        p = Process(target=model_fun,
                    args=(queue, col, trn, vld, tst, cat,
                          store_weather, store_data_max,
                          model_param))
        p.start()
        workers.append(p)

    # final call with a limit of 1 instead of worker_num, after all
    # targets have been dispatched
    manage_workers(workers, queue, 1,
                   test_result_file,
                   validate_only, eval_err)

    # write out zero estimation
    if not validate_only:
        write_submission_zero(test, store_data_max, test_result_file)

    if eval_err is not None:
        e1, e2 = eval_err.get_result()
        logging.info("model4(p=%f) error is: train(%f), valid(%f)",
                     model_param, e1, e2)
        print("model4(p=%f) error is: train(%f), valid(%f)"
              % (model_param, e1, e2))
def run_model5(store_data_file, store_weather_file, test_data_file,
               model_param=1, validate_only=False, eval_err=None):
    """
    Run build_model5 on every target produced by build_target_set3 (called
    with valid_pct=0) and write the predictions.

    Parameters:
        store_data_file, store_weather_file, test_data_file: passed
            unchanged to load_data2.
        model_param: forwarded to build_model5 as alpha_train.
        validate_only: when true, neither write_submission nor
            write_submission_zero is called.
        eval_err: optional accumulator; receives
            add_result(e1, len(trn), e2, len(vld)) per target and its
            get_result() is logged and printed at the end.

    Side effects:
        Rewrites 'test_result.csv' with an 'id,units' header, prints
        progress to stdout and logs the final error. Returns None.
    """
    print("---------------------start here---------------------")
    test_result_file = 'test_result.csv'

    # write header to test result (the with-block closes the file)
    with open(test_result_file, 'w') as f:
        f.write('id,units\n')

    # load data
    store_data, store_weather, test = load_data2(
        store_data_file, store_weather_file, test_data_file)

    # compute max item sales for each store as denominator
    store_data_max = store_data.groupby(level=1).max()

    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set3(store_data, test, store_weather,
                                   store_data_max, valid_pct=0)

    # run prediction on testing data of each category
    for col, trn, vld, tst in target_set:
        print("%s, train(%d), valid(%d), test(%d), model_param(%f)"
              % (col, len(trn), len(vld), len(tst), model_param))

        # nothing to predict for this target
        if len(tst) == 0:
            continue

        # normalize training, validation and testing data set
        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)

        Y_hat = build_model5(nm_trn, nm_vld, nm_tst, store_weather,
                             column=col, alpha_train=model_param)

        # denormalize the sale
        Y_hat2 = denormalize_store_data(trn, vld, tst, Y_hat,
                                        store_data_max, column=col)

        # evaluate error in training and validation set
        e1, e2 = eval_model(trn, vld, Y_hat2, column=col)
        print("error at item(%s) is: train(%f), valid(%f)" % (col, e1, e2))
        if eval_err is not None:
            eval_err.add_result(e1, len(trn), e2, len(vld))

        # write results to test result
        if not validate_only:
            write_submission(trn, vld, tst, Y_hat2, test_result_file,
                             'valid_result', column=col)

    # write out zero estimation
    if not validate_only:
        write_submission_zero(test, store_data_max, test_result_file)

    if eval_err is not None:
        e1, e2 = eval_err.get_result()
        logging.info("model5(p=%f) error is: train(%f), valid(%f)",
                     model_param, e1, e2)
        print("model5(p=%f) error is: train(%f), valid(%f)"
              % (model_param, e1, e2))
def run_model4v1(store_data_file, store_weather_file, test_data_file,
                 model_param=1, validate_only=False, eval_err=None):
    """
    ridge regression with log error term

    Runs build_model_log_ridge on every target produced by
    build_target_set3, which is called here with columns=set(['1']).

    Parameters:
        store_data_file, store_weather_file, test_data_file: passed
            unchanged to load_data2.
        model_param: forwarded to build_model_log_ridge as alpha.
        validate_only: when true, neither write_submission nor
            write_submission_zero is called.
        eval_err: optional accumulator; receives
            add_result(e1, len(trn), e2, len(vld)) per target and its
            get_result() is logged and printed at the end.

    Side effects:
        Rewrites 'test_result.csv' with an 'id,units' header, prints
        progress to stdout and logs the final error. Returns None.
    """
    print("---------------------start here---------------------")
    test_result_file = 'test_result.csv'

    # write header to test result (the with-block closes the file)
    with open(test_result_file, 'w') as f:
        f.write('id,units\n')

    store_data, store_weather, test = load_data2(
        store_data_file, store_weather_file, test_data_file)
    store_data_max = store_data.groupby(level=1).max()

    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set3(store_data, test, store_weather,
                                   store_data_max, columns=set(['1']))

    for col, trn, vld, tst in target_set:
        print("item(%s), train(%d), valid(%d), test(%d), model_param(%f)"
              % (col, len(trn), len(vld), len(tst), model_param))

        # nothing to predict for this target
        if len(tst) == 0:
            continue

        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)

        # the second return value is not used here
        Y_hat, _ = build_model_log_ridge(nm_trn, nm_vld, nm_tst,
                                         store_weather, col,
                                         alpha=model_param)

        # a trailing axis is added before denormalizing
        Y_hat2 = denormalize_store_data(trn, vld, tst,
                                        Y_hat[:, np.newaxis],
                                        store_data_max, column=col)

        # evaluate error in training and validation set
        e1, e2 = eval_model(trn, vld, Y_hat2, column=col)
        print("error at item(%s) is: train(%f), valid(%f)" % (col, e1, e2))
        if eval_err is not None:
            eval_err.add_result(e1, len(trn), e2, len(vld))

        # write results to test result
        if not validate_only:
            write_submission(trn, vld, tst, Y_hat2, test_result_file,
                             'valid_result', column=col)

    # write out zero estimation
    if not validate_only:
        write_submission_zero(test, store_data_max, test_result_file)

    if eval_err is not None:
        e1, e2 = eval_err.get_result()
        logging.info("model4v1(p=%f) error is: train(%f), valid(%f)",
                     model_param, e1, e2)
        print("model4v1(p=%f) error is: train(%f), valid(%f)"
              % (model_param, e1, e2))
def run_model_mp(model_fun, store_data_file, store_weather_file, test_data_file,
                 model_param=1, validate_only=False, eval_err=None,
                 columns=None):
    """
    Run model_fun for every target produced by build_target_set4, using up
    to four worker processes at a time.

    Parameters:
        model_fun: callable used as the Process target; it is invoked as
            model_fun(queue, col, trn, vld, tst, cat, store_weather,
            store_data_max, model_param).
        store_data_file, store_weather_file, test_data_file: passed
            unchanged to load_data2.
        model_param: forwarded to model_fun and shown in progress output.
        validate_only: forwarded to manage_workers; when true the final
            write_submission_zero call is skipped.
        eval_err: optional accumulator, forwarded to manage_workers; when
            given, its get_result() is logged and printed at the end.
        columns: forwarded to build_target_set4.

    Side effects:
        Rewrites 'test_result.csv' with an 'id,units' header, starts worker
        processes, prints progress to stdout and logs the final error.
        Returns None.
    """
    print("---------------------start here---------------------")
    worker_num = 4
    test_result_file = 'test_result.csv'

    # write header to test result (the with-block closes the file)
    with open(test_result_file, 'w') as f:
        f.write('id,units\n')

    store_data, store_weather, test = load_data2(
        store_data_file, store_weather_file, test_data_file)
    store_data_max = store_data.groupby(level=1).max()

    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set4(store_data, test, store_weather,
                                   store_data_max, columns=columns)

    queue = Queue()
    workers = []
    for col, trn, vld, tst, cat in target_set:
        print("item(%s), train(%d), valid(%d), test(%d), model_param(%0.2f), cat(%d)" %
              (col, len(trn), len(vld), len(tst), model_param, cat))

        # Nothing to predict for this target: skip it. The original code
        # returned here, which abandoned already-started workers and skipped
        # the final drain, zero-fill and error summary; the sequential
        # run_model4 skips such targets with `continue`.
        if len(tst) == 0:
            continue

        # Wait for a free slot before starting another worker.
        # NOTE(review): relies on manage_workers removing finished
        # processes from `workers` in place -- confirm against the helper.
        while len(workers) >= worker_num:
            manage_workers(workers, queue, worker_num,
                           test_result_file,
                           validate_only, eval_err)

        p = Process(target=model_fun,
                    args=(queue, col, trn, vld, tst, cat,
                          store_weather, store_data_max,
                          model_param))
        p.start()
        workers.append(p)

    # final call with a limit of 1 instead of worker_num, after all
    # targets have been dispatched
    manage_workers(workers, queue, 1,
                   test_result_file,
                   validate_only, eval_err)

    # write out zero estimation
    if not validate_only:
        write_submission_zero(test, store_data_max, test_result_file)

    if eval_err is not None:
        e1, e2 = eval_err.get_result()
        logging.info("model4(p=%f) error is: train(%f), valid(%f)",
                     model_param, e1, e2)
        print("model4(p=%f) error is: train(%f), valid(%f)"
              % (model_param, e1, e2))
def run_model3(store_data_file, store_weather_file, test_data_file,
               model_param=1, validate_only=False):
    """
    Run build_model3 on every target produced by build_target_set3 and
    write the predictions.

    Parameters:
        store_data_file, store_weather_file, test_data_file: passed
            unchanged to load_data2.
        model_param: forwarded to build_model3 as alpha_train.
        validate_only: when true, the final write_submission_zero call is
            skipped. The per-target write_submission call is NOT guarded
            by this flag.

    Side effects:
        Rewrites 'test_result.csv' with an 'id,units' header, calls
        write_submission once per target and prints progress to stdout.
        Returns None.
    """
    print("start here")
    test_result_file = 'test_result.csv'

    # write header to test result (the with-block closes the file)
    with open(test_result_file, 'w') as f:
        f.write('id,units\n')

    # load data
    store_data, store_weather, test = load_data2(
        store_data_file, store_weather_file, test_data_file)

    # compute max item sales for each store as denominator
    store_data_max = store_data.groupby(level=1).max()

    # develop training and validation set
    train, valid = develop_valid_set2(store_data, store_weather,
                                      valid_size=100)

    # categorize testing data with a relevant but much smaller training set
    # NOTE(review): the other callers in this file pass
    # (store_data, test, store_weather, store_data_max, ...) to
    # build_target_set3; confirm this argument list matches its signature.
    target_set = build_target_set3(train, valid, test, store_weather,
                                   store_data_max)

    # run prediction on testing data of each category
    for col, trn, vld, tst in target_set:
        print("%s, train(%d), valid(%d), test(%d)"
              % (col, len(trn), len(vld), len(tst)))

        # normalize training, validation and testing data set
        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)

        Y_hat = build_model3(nm_trn, nm_vld, nm_tst, store_weather,
                             column=col, alpha_train=model_param)

        # denormalize the sale
        Y_hat2 = denormalize_store_data(trn, vld, tst, Y_hat,
                                        store_data_max, column=col)

        # evaluate error in training and validation set
        e1, e2 = eval_model(trn, vld, Y_hat2, column=col)
        print("error is: train(%f), valid(%f)" % (e1, e2))

        # write results to test result
        # NOTE(review): run_model5 guards this call with
        # `if not validate_only`; here it always runs -- confirm intent.
        write_submission(trn, vld, tst, Y_hat2, test_result_file,
                         'valid_result', column=col)

    # write out zero estimation
    if not validate_only:
        write_submission_zero(test, store_data_max, test_result_file)
def run_model2(store_data_file, store_weather_file, test_data_file):
    """
    Run build_model2 on every target category produced by build_target_set
    and write the predictions. The validation set is built with
    valid_size=0.

    Parameters:
        store_data_file, store_weather_file, test_data_file: passed
            unchanged to load_data2.

    Side effects:
        Rewrites 'test_result.csv' with an 'id,units' header, calls
        write_submission once per target category and prints progress to
        stdout. Returns None.
    """
    print("start here")

    # write header to test result (the with-block closes the file)
    with open('test_result.csv', 'w') as f:
        f.write('id,units\n')

    # load data
    store_data, store_weather, test = load_data2(
        store_data_file, store_weather_file, test_data_file)

    # compute max item sales for each store as denominator
    store_data_max = store_data.groupby(level=1).max()

    # develop training and validation set
    train, valid = develop_valid_set2(store_data, store_weather, valid_size=0)

    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set(train, valid, test, store_weather)

    # run prediction on testing data of each category
    for n, trn, vld, tst in target_set:
        print("%d, train(%d), valid(%d), test(%d)"
              % (n, len(trn), len(vld), len(tst)))

        # normalize training, validation and testing data set
        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)

        # the second return value of build_model2 is not used here
        Y_hat, _ = build_model2(nm_trn, nm_vld, nm_tst, store_weather)

        # denormalize the sale
        Y_hat2 = denormalize_store_data(trn, vld, tst, Y_hat, store_data_max)

        # evaluate error in training and validation set
        e1, e2 = eval_model(trn, vld, Y_hat2)
        print("error is: train(%f), valid(%f)" % (e1, e2))

        # write results to test result
        write_submission(trn, vld, tst, Y_hat2, 'test_result.csv')
def run_model2(store_data_file, store_weather_file, test_data_file):
    """
    Run build_model2 on every target category produced by build_target_set
    and write the predictions. The validation set is built with
    valid_size=0.

    Parameters:
        store_data_file, store_weather_file, test_data_file: passed
            unchanged to load_data2.

    Side effects:
        Rewrites 'test_result.csv' with an 'id,units' header, calls
        write_submission once per target category and prints progress to
        stdout. Returns None.
    """
    print("start here")

    # write header to test result (the with-block closes the file)
    with open('test_result.csv', 'w') as f:
        f.write('id,units\n')

    # load data
    store_data, store_weather, test = load_data2(
        store_data_file, store_weather_file, test_data_file)

    # compute max item sales for each store as denominator
    store_data_max = store_data.groupby(level=1).max()

    # develop training and validation set
    train, valid = develop_valid_set2(store_data, store_weather, valid_size=0)

    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set(train, valid, test, store_weather)

    # run prediction on testing data of each category
    for n, trn, vld, tst in target_set:
        print("%d, train(%d), valid(%d), test(%d)"
              % (n, len(trn), len(vld), len(tst)))

        # normalize training, validation and testing data set
        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)

        # the second return value of build_model2 is not used here
        Y_hat, _ = build_model2(nm_trn, nm_vld, nm_tst, store_weather)

        # denormalize the sale
        Y_hat2 = denormalize_store_data(trn, vld, tst, Y_hat, store_data_max)

        # evaluate error in training and validation set
        e1, e2 = eval_model(trn, vld, Y_hat2)
        print("error is: train(%f), valid(%f)" % (e1, e2))

        # write results to test result
        write_submission(trn, vld, tst, Y_hat2, 'test_result.csv')
def run_model5(store_data_file, store_weather_file, test_data_file,
               model_param=1, validate_only=False, eval_err=None):
    """
    Run build_model5 on every target produced by build_target_set3 (called
    with valid_pct=0) and write the predictions.

    Parameters:
        store_data_file, store_weather_file, test_data_file: passed
            unchanged to load_data2.
        model_param: forwarded to build_model5 as alpha_train.
        validate_only: when true, neither write_submission nor
            write_submission_zero is called.
        eval_err: optional accumulator; receives
            add_result(e1, len(trn), e2, len(vld)) per target and its
            get_result() is logged and printed at the end.

    Side effects:
        Rewrites 'test_result.csv' with an 'id,units' header, prints
        progress to stdout and logs the final error. Returns None.
    """
    print("---------------------start here---------------------")
    test_result_file = 'test_result.csv'

    # write header to test result (the with-block closes the file)
    with open(test_result_file, 'w') as f:
        f.write('id,units\n')

    # load data
    store_data, store_weather, test = load_data2(
        store_data_file, store_weather_file, test_data_file)

    # compute max item sales for each store as denominator
    store_data_max = store_data.groupby(level=1).max()

    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set3(store_data, test, store_weather,
                                   store_data_max, valid_pct=0)

    # run prediction on testing data of each category
    for col, trn, vld, tst in target_set:
        print("%s, train(%d), valid(%d), test(%d), model_param(%f)"
              % (col, len(trn), len(vld), len(tst), model_param))

        # nothing to predict for this target
        if len(tst) == 0:
            continue

        # normalize training, validation and testing data set
        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)

        Y_hat = build_model5(nm_trn, nm_vld, nm_tst, store_weather,
                             column=col, alpha_train=model_param)

        # denormalize the sale
        Y_hat2 = denormalize_store_data(trn, vld, tst, Y_hat,
                                        store_data_max, column=col)

        # evaluate error in training and validation set
        e1, e2 = eval_model(trn, vld, Y_hat2, column=col)
        print("error at item(%s) is: train(%f), valid(%f)" % (col, e1, e2))
        if eval_err is not None:
            eval_err.add_result(e1, len(trn), e2, len(vld))

        # write results to test result
        if not validate_only:
            write_submission(trn, vld, tst, Y_hat2, test_result_file,
                             'valid_result', column=col)

    # write out zero estimation
    if not validate_only:
        write_submission_zero(test, store_data_max, test_result_file)

    if eval_err is not None:
        e1, e2 = eval_err.get_result()
        logging.info("model5(p=%f) error is: train(%f), valid(%f)",
                     model_param, e1, e2)
        print("model5(p=%f) error is: train(%f), valid(%f)"
              % (model_param, e1, e2))
def run_model4v1(store_data_file, store_weather_file, test_data_file,
                 model_param=1, validate_only=False, eval_err=None):
    """
    ridge regression with log error term

    Runs build_model_log_ridge on every target produced by
    build_target_set3, which is called here with columns=set(['1']).

    Parameters:
        store_data_file, store_weather_file, test_data_file: passed
            unchanged to load_data2.
        model_param: forwarded to build_model_log_ridge as alpha.
        validate_only: when true, neither write_submission nor
            write_submission_zero is called.
        eval_err: optional accumulator; receives
            add_result(e1, len(trn), e2, len(vld)) per target and its
            get_result() is logged and printed at the end.

    Side effects:
        Rewrites 'test_result.csv' with an 'id,units' header, prints
        progress to stdout and logs the final error. Returns None.
    """
    print("---------------------start here---------------------")
    test_result_file = 'test_result.csv'

    # write header to test result (the with-block closes the file)
    with open(test_result_file, 'w') as f:
        f.write('id,units\n')

    store_data, store_weather, test = load_data2(
        store_data_file, store_weather_file, test_data_file)
    store_data_max = store_data.groupby(level=1).max()

    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set3(store_data, test, store_weather,
                                   store_data_max, columns=set(['1']))

    for col, trn, vld, tst in target_set:
        print("item(%s), train(%d), valid(%d), test(%d), model_param(%f)"
              % (col, len(trn), len(vld), len(tst), model_param))

        # nothing to predict for this target
        if len(tst) == 0:
            continue

        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)

        # the second return value is not used here
        Y_hat, _ = build_model_log_ridge(nm_trn, nm_vld, nm_tst,
                                         store_weather, col,
                                         alpha=model_param)

        # a trailing axis is added before denormalizing
        Y_hat2 = denormalize_store_data(trn, vld, tst,
                                        Y_hat[:, np.newaxis],
                                        store_data_max, column=col)

        # evaluate error in training and validation set
        e1, e2 = eval_model(trn, vld, Y_hat2, column=col)
        print("error at item(%s) is: train(%f), valid(%f)" % (col, e1, e2))
        if eval_err is not None:
            eval_err.add_result(e1, len(trn), e2, len(vld))

        # write results to test result
        if not validate_only:
            write_submission(trn, vld, tst, Y_hat2, test_result_file,
                             'valid_result', column=col)

    # write out zero estimation
    if not validate_only:
        write_submission_zero(test, store_data_max, test_result_file)

    if eval_err is not None:
        e1, e2 = eval_err.get_result()
        logging.info("model4v1(p=%f) error is: train(%f), valid(%f)",
                     model_param, e1, e2)
        print("model4v1(p=%f) error is: train(%f), valid(%f)"
              % (model_param, e1, e2))
def run_model4(store_data_file, store_weather_file, test_data_file,
               model_param=1, validate_only=False, eval_err=None):
    """
    ridge regression

    For every target produced by build_target_set4, either predicts all
    zeros (when the target's category is 0) or fits linear_model.Ridge on
    the training rows of the feature matrix returned by sim and predicts
    for all rows.

    Parameters:
        store_data_file, store_weather_file, test_data_file: passed
            unchanged to load_data2.
        model_param: used as the Ridge alpha.
        validate_only: when true, neither write_submission nor
            write_submission_zero is called.
        eval_err: optional accumulator; receives
            add_result(e1, len(trn), e2, len(vld)) per target and its
            get_result() is logged and printed at the end.

    Side effects:
        Rewrites 'test_result.csv' with an 'id,units' header, prints
        progress to stdout and logs the final error. Returns None.
    """
    print("---------------------start here---------------------")
    test_result_file = 'test_result.csv'

    # write header to test result (the with-block closes the file)
    with open(test_result_file, 'w') as f:
        f.write('id,units\n')

    store_data, store_weather, test = load_data2(
        store_data_file, store_weather_file, test_data_file)
    store_data_max = store_data.groupby(level=1).max()

    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set4(store_data, test, store_weather,
                                   store_data_max)

    for col, trn, vld, tst, cat in target_set:
        print("item(%s), train(%d), valid(%d), test(%d), model_param(%0.2f), cat(%d)" %
              (col, len(trn), len(vld), len(tst), model_param, cat))

        # nothing to predict for this target
        if len(tst) == 0:
            continue

        if cat == 0:
            # category 0: predict zero for every train/valid/test row
            Y_hat2 = np.zeros((len(trn) + len(vld) + len(tst), 1))
        else:
            nm_trn = normalize_store_data(trn, store_data_max)
            nm_vld = normalize_store_data(vld, store_data_max)
            nm_tst = normalize_store_data(tst, store_data_max)

            # feature matrix for train + valid + test rows
            _, fmat = sim(nm_trn, nm_vld, nm_tst, store_weather)
            Y_hat = np.zeros((len(nm_trn) + len(nm_vld) + len(nm_tst), 1))

            # fit on the training rows only, then predict every row
            X = fmat[:len(nm_trn)]
            Y = nm_trn[col].values[:, np.newaxis]
            clf = linear_model.Ridge(alpha=model_param)
            clf.fit(X, Y)
            Y_hat[:] = clf.predict(fmat)

            Y_hat2 = denormalize_store_data(trn, vld, tst, Y_hat,
                                            store_data_max, column=col)

        # evaluate error in training and validation set
        e1, e2 = eval_model(trn, vld, Y_hat2, column=col)
        print("error at item(%s) is: train(%f), valid(%f)" % (col, e1, e2))
        if eval_err is not None:
            eval_err.add_result(e1, len(trn), e2, len(vld))

        # write results to test result
        if not validate_only:
            write_submission(trn, vld, tst, Y_hat2, test_result_file,
                             'valid_result', column=col)

    # write out zero estimation
    if not validate_only:
        write_submission_zero(test, store_data_max, test_result_file)

    if eval_err is not None:
        e1, e2 = eval_err.get_result()
        logging.info("model4(p=%f) error is: train(%f), valid(%f)",
                     model_param, e1, e2)
        print("model4(p=%f) error is: train(%f), valid(%f)"
              % (model_param, e1, e2))
def run_model3(store_data_file, store_weather_file, test_data_file,
               model_param=1, validate_only=False):
    """
    Run build_model3 on every target produced by build_target_set3 and
    write the predictions.

    Parameters:
        store_data_file, store_weather_file, test_data_file: passed
            unchanged to load_data2.
        model_param: forwarded to build_model3 as alpha_train.
        validate_only: when true, the final write_submission_zero call is
            skipped. The per-target write_submission call is NOT guarded
            by this flag.

    Side effects:
        Rewrites 'test_result.csv' with an 'id,units' header, calls
        write_submission once per target and prints progress to stdout.
        Returns None.
    """
    print("start here")
    test_result_file = 'test_result.csv'

    # write header to test result (the with-block closes the file)
    with open(test_result_file, 'w') as f:
        f.write('id,units\n')

    # load data
    store_data, store_weather, test = load_data2(
        store_data_file, store_weather_file, test_data_file)

    # compute max item sales for each store as denominator
    store_data_max = store_data.groupby(level=1).max()

    # develop training and validation set
    train, valid = develop_valid_set2(store_data, store_weather,
                                      valid_size=100)

    # categorize testing data with a relevant but much smaller training set
    # NOTE(review): the other callers in this file pass
    # (store_data, test, store_weather, store_data_max, ...) to
    # build_target_set3; confirm this argument list matches its signature.
    target_set = build_target_set3(train, valid, test, store_weather,
                                   store_data_max)

    # run prediction on testing data of each category
    for col, trn, vld, tst in target_set:
        print("%s, train(%d), valid(%d), test(%d)"
              % (col, len(trn), len(vld), len(tst)))

        # normalize training, validation and testing data set
        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)

        Y_hat = build_model3(nm_trn, nm_vld, nm_tst, store_weather,
                             column=col, alpha_train=model_param)

        # denormalize the sale
        Y_hat2 = denormalize_store_data(trn, vld, tst, Y_hat,
                                        store_data_max, column=col)

        # evaluate error in training and validation set
        e1, e2 = eval_model(trn, vld, Y_hat2, column=col)
        print("error is: train(%f), valid(%f)" % (e1, e2))

        # write results to test result
        # NOTE(review): run_model5 guards this call with
        # `if not validate_only`; here it always runs -- confirm intent.
        write_submission(trn, vld, tst, Y_hat2, test_result_file,
                         'valid_result', column=col)

    # write out zero estimation
    if not validate_only:
        write_submission_zero(test, store_data_max, test_result_file)
def run_model4(store_data_file, store_weather_file, test_data_file,
               model_param=1, validate_only=False, eval_err=None):
    """
    ridge regression

    For every target produced by build_target_set4, either predicts all
    zeros (when the target's category is 0) or fits linear_model.Ridge on
    the training rows of the feature matrix returned by sim and predicts
    for all rows.

    Parameters:
        store_data_file, store_weather_file, test_data_file: passed
            unchanged to load_data2.
        model_param: used as the Ridge alpha.
        validate_only: when true, neither write_submission nor
            write_submission_zero is called.
        eval_err: optional accumulator; receives
            add_result(e1, len(trn), e2, len(vld)) per target and its
            get_result() is logged and printed at the end.

    Side effects:
        Rewrites 'test_result.csv' with an 'id,units' header, prints
        progress to stdout and logs the final error. Returns None.
    """
    print("---------------------start here---------------------")
    test_result_file = 'test_result.csv'

    # write header to test result (the with-block closes the file)
    with open(test_result_file, 'w') as f:
        f.write('id,units\n')

    store_data, store_weather, test = load_data2(
        store_data_file, store_weather_file, test_data_file)
    store_data_max = store_data.groupby(level=1).max()

    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set4(store_data, test, store_weather,
                                   store_data_max)

    for col, trn, vld, tst, cat in target_set:
        print("item(%s), train(%d), valid(%d), test(%d), model_param(%0.2f), cat(%d)" %
              (col, len(trn), len(vld), len(tst), model_param, cat))

        # nothing to predict for this target
        if len(tst) == 0:
            continue

        if cat == 0:
            # category 0: predict zero for every train/valid/test row
            Y_hat2 = np.zeros((len(trn) + len(vld) + len(tst), 1))
        else:
            nm_trn = normalize_store_data(trn, store_data_max)
            nm_vld = normalize_store_data(vld, store_data_max)
            nm_tst = normalize_store_data(tst, store_data_max)

            # feature matrix for train + valid + test rows
            _, fmat = sim(nm_trn, nm_vld, nm_tst, store_weather)
            Y_hat = np.zeros((len(nm_trn) + len(nm_vld) + len(nm_tst), 1))

            # fit on the training rows only, then predict every row
            X = fmat[:len(nm_trn)]
            Y = nm_trn[col].values[:, np.newaxis]
            clf = linear_model.Ridge(alpha=model_param)
            clf.fit(X, Y)
            Y_hat[:] = clf.predict(fmat)

            Y_hat2 = denormalize_store_data(trn, vld, tst, Y_hat,
                                            store_data_max, column=col)

        # evaluate error in training and validation set
        e1, e2 = eval_model(trn, vld, Y_hat2, column=col)
        print("error at item(%s) is: train(%f), valid(%f)" % (col, e1, e2))
        if eval_err is not None:
            eval_err.add_result(e1, len(trn), e2, len(vld))

        # write results to test result
        if not validate_only:
            write_submission(trn, vld, tst, Y_hat2, test_result_file,
                             'valid_result', column=col)

    # write out zero estimation
    if not validate_only:
        write_submission_zero(test, store_data_max, test_result_file)

    if eval_err is not None:
        e1, e2 = eval_err.get_result()
        logging.info("model4(p=%f) error is: train(%f), valid(%f)",
                     model_param, e1, e2)
        print("model4(p=%f) error is: train(%f), valid(%f)"
              % (model_param, e1, e2))