def avg(training_file, submission_file, output_file): data = utilities.read_file(training_file) train_data, cv_data = preprocess.get_train_cv_data_by_chunk(data) targets_train, targets_cv = preprocess.get_train_cv_targets( train_data, cv_data) (chunk_avg, hour_avg_by_chunk, weekday_avg_by_chunk, hour_avg, weekday_avg) = feature_extraction.get_avg_maps(train_data) x_train_all, x_cv_all = feature_extraction.get_x_by_avg( train_data, cv_data, chunk_avg, hour_avg_by_chunk, weekday_avg_by_chunk, hour_avg, weekday_avg) clfs = regression.linear_regression(x_train_all, x_cv_all, targets_train, targets_cv) clfs = regression.random_forest(x_train_all, x_cv_all, targets_train, targets_cv) print 'Filling submission file...' sub_data = utilities.read_file(submission_file, True) for i in range(1, len(sub_data)): chunk_id = sub_data[i][1] hour = sub_data[i][3] weekday = '' all_features = feature_extraction.get_features(chunk_id, weekday, hour, chunk_avg, hour_avg_by_chunk, weekday_avg_by_chunk, hour_avg, weekday_avg) for j in range(5, len(sub_data[i])): if sub_data[i][j] == '0': feature = [] for f in all_features: feature.append(f[j - 5]) sub_data[i][j] = clfs[j - 5].predict([feature])[0] utilities.write_file(output_file, sub_data)
def avg(training_file, submission_file, output_file): data = utilities.read_file(training_file) train_data, cv_data = preprocess.get_train_cv_data_by_chunk(data) targets_train, targets_cv = preprocess.get_train_cv_targets( train_data, cv_data) (chunk_avg, hour_avg_by_chunk, weekday_avg_by_chunk, hour_avg, weekday_avg) = feature_extraction.get_avg_maps(train_data) x_train_all, x_cv_all = feature_extraction.get_x_by_avg( train_data, cv_data, chunk_avg, hour_avg_by_chunk, weekday_avg_by_chunk, hour_avg, weekday_avg) clfs = regression.linear_regression( x_train_all, x_cv_all, targets_train, targets_cv) clfs = regression.random_forest( x_train_all, x_cv_all, targets_train, targets_cv) print 'Filling submission file...' sub_data = utilities.read_file(submission_file, True) for i in range(1, len(sub_data)): chunk_id = sub_data[i][1] hour = sub_data[i][3] weekday = '' all_features = feature_extraction.get_features( chunk_id, weekday, hour, chunk_avg, hour_avg_by_chunk, weekday_avg_by_chunk, hour_avg, weekday_avg) for j in range(5, len(sub_data[i])): if sub_data[i][j] == '0': feature = [] for f in all_features: feature.append(f[j - 5]) sub_data[i][j] = clfs[j - 5].predict([feature])[0] utilities.write_file(output_file, sub_data)
def time_series(training_file, submission_file, output_file): data = utilities.read_file(training_file, True) first_line = data[0] data = data[1 : :] data = preprocess.fill_NAs(data) (chunk_avg, hour_avg_by_chunk, weekday_avg_by_chunk, hour_avg, weekday_avg) = feature_extraction.get_avg_maps(data) clf_map = regression.linear_regression_2(data) print 'Filling submission file...' chunk_map = utilities.get_chunk_map(data, 1) sub_data = utilities.read_file(submission_file, True) positions = [1, 2, 3, 4, 5, 10, 17, 24, 48, 72] for i in range(1, len(sub_data)): chunk_id = sub_data[i][1] hour = sub_data[i][3] pos = positions[(i - 1) % 10] for j in range(5, len(sub_data[i])): target = j - 5 if sub_data[i][j] == '0': if not chunk_id in chunk_map: sub_data[i][j] = hour_avg[hour][target] else: data_in_chunk = chunk_map[chunk_id] start = len(data_in_chunk) - 24 t = len(data_in_chunk[0]) - 39 + target features = [] prev_hour = 0 for k in range(start, len(data_in_chunk)): features.append(float(data_in_chunk[k][t])) if data_in_chunk[k][5] == hour: prev_hour = float(data_in_chunk[k][t]) features.append(prev_hour) # Binary hour features. for h in range(24): if h == int(hour): features.append(1) else: features.append(0) # Binary month features. month = int(sub_data[i][4]) for m in range(1, 13): if m == month: features.append(1) else: features.append(0) # Weather features. tmp_length = len(data_in_chunk) for k in range(6, 56): features.append(float(data_in_chunk[tmp_length - 1][k])) for k in range(6, 56): features.append(float(data_in_chunk[tmp_length - 2][k])) sub_data[i][j] = \ clf_map[(target, pos)].predict([features])[0] utilities.write_file(output_file, sub_data)
def time_series(training_file, submission_file, output_file): data = utilities.read_file(training_file, True) first_line = data[0] data = data[1::] data = preprocess.fill_NAs(data) (chunk_avg, hour_avg_by_chunk, weekday_avg_by_chunk, hour_avg, weekday_avg) = feature_extraction.get_avg_maps(data) clf_map = regression.linear_regression_2(data) print 'Filling submission file...' chunk_map = utilities.get_chunk_map(data, 1) sub_data = utilities.read_file(submission_file, True) positions = [1, 2, 3, 4, 5, 10, 17, 24, 48, 72] for i in range(1, len(sub_data)): chunk_id = sub_data[i][1] hour = sub_data[i][3] pos = positions[(i - 1) % 10] for j in range(5, len(sub_data[i])): target = j - 5 if sub_data[i][j] == '0': if not chunk_id in chunk_map: sub_data[i][j] = hour_avg[hour][target] else: data_in_chunk = chunk_map[chunk_id] start = len(data_in_chunk) - 24 t = len(data_in_chunk[0]) - 39 + target features = [] prev_hour = 0 for k in range(start, len(data_in_chunk)): features.append(float(data_in_chunk[k][t])) if data_in_chunk[k][5] == hour: prev_hour = float(data_in_chunk[k][t]) features.append(prev_hour) # Binary hour features. for h in range(24): if h == int(hour): features.append(1) else: features.append(0) # Binary month features. month = int(sub_data[i][4]) for m in range(1, 13): if m == month: features.append(1) else: features.append(0) # Weather features. tmp_length = len(data_in_chunk) for k in range(6, 56): features.append(float(data_in_chunk[tmp_length - 1][k])) for k in range(6, 56): features.append(float(data_in_chunk[tmp_length - 2][k])) sub_data[i][j] = \ clf_map[(target, pos)].predict([features])[0] utilities.write_file(output_file, sub_data)