Exemple #1
0
def avg(training_file, submission_file, output_file):
    data = utilities.read_file(training_file)

    train_data, cv_data = preprocess.get_train_cv_data_by_chunk(data)
    targets_train, targets_cv = preprocess.get_train_cv_targets(
        train_data, cv_data)

    (chunk_avg, hour_avg_by_chunk, weekday_avg_by_chunk, hour_avg,
     weekday_avg) = feature_extraction.get_avg_maps(train_data)

    x_train_all, x_cv_all = feature_extraction.get_x_by_avg(
        train_data, cv_data, chunk_avg, hour_avg_by_chunk,
        weekday_avg_by_chunk, hour_avg, weekday_avg)

    clfs = regression.linear_regression(x_train_all, x_cv_all, targets_train,
                                        targets_cv)
    clfs = regression.random_forest(x_train_all, x_cv_all, targets_train,
                                    targets_cv)

    print 'Filling submission file...'
    sub_data = utilities.read_file(submission_file, True)
    for i in range(1, len(sub_data)):
        chunk_id = sub_data[i][1]
        hour = sub_data[i][3]
        weekday = ''
        all_features = feature_extraction.get_features(chunk_id, weekday, hour,
                                                       chunk_avg,
                                                       hour_avg_by_chunk,
                                                       weekday_avg_by_chunk,
                                                       hour_avg, weekday_avg)

        for j in range(5, len(sub_data[i])):
            if sub_data[i][j] == '0':
                feature = []
                for f in all_features:
                    feature.append(f[j - 5])
                sub_data[i][j] = clfs[j - 5].predict([feature])[0]

    utilities.write_file(output_file, sub_data)
Exemple #2
0
def avg(training_file, submission_file, output_file):
    data = utilities.read_file(training_file)

    train_data, cv_data = preprocess.get_train_cv_data_by_chunk(data)
    targets_train, targets_cv = preprocess.get_train_cv_targets(
        train_data, cv_data)

    (chunk_avg, hour_avg_by_chunk, weekday_avg_by_chunk,
     hour_avg, weekday_avg) = feature_extraction.get_avg_maps(train_data)

    x_train_all, x_cv_all = feature_extraction.get_x_by_avg(
            train_data, cv_data, chunk_avg, hour_avg_by_chunk,
             weekday_avg_by_chunk, hour_avg, weekday_avg)

    clfs = regression.linear_regression(
        x_train_all, x_cv_all, targets_train, targets_cv)
    clfs = regression.random_forest(
        x_train_all, x_cv_all, targets_train, targets_cv)

    print 'Filling submission file...'
    sub_data = utilities.read_file(submission_file, True)
    for i in range(1, len(sub_data)):
        chunk_id = sub_data[i][1]
        hour = sub_data[i][3]
        weekday = ''
        all_features = feature_extraction.get_features(
            chunk_id, weekday, hour, chunk_avg, hour_avg_by_chunk,
            weekday_avg_by_chunk, hour_avg, weekday_avg)

        for j in range(5, len(sub_data[i])):
            if sub_data[i][j] == '0':
                feature = []
                for f in all_features:
                    feature.append(f[j - 5])
                sub_data[i][j] = clfs[j - 5].predict([feature])[0]

    utilities.write_file(output_file, sub_data)
Exemple #3
0
def time_series(training_file, submission_file, output_file):
    data = utilities.read_file(training_file, True)
    first_line = data[0]
    data = data[1 : :]
    data = preprocess.fill_NAs(data)

    (chunk_avg, hour_avg_by_chunk, weekday_avg_by_chunk,
     hour_avg, weekday_avg) = feature_extraction.get_avg_maps(data)

    clf_map = regression.linear_regression_2(data)

    print 'Filling submission file...'
    chunk_map = utilities.get_chunk_map(data, 1)
    sub_data = utilities.read_file(submission_file, True)

    positions = [1, 2, 3, 4, 5, 10, 17, 24, 48, 72]
    for i in range(1, len(sub_data)):
        chunk_id = sub_data[i][1]
        hour = sub_data[i][3]
        pos = positions[(i - 1) % 10]
        for j in range(5, len(sub_data[i])):
            target = j - 5
            if sub_data[i][j] == '0':
                if not chunk_id in chunk_map:
                    sub_data[i][j] = hour_avg[hour][target]
                else:
                    data_in_chunk = chunk_map[chunk_id]
                    start = len(data_in_chunk) - 24
                    t = len(data_in_chunk[0]) - 39 + target
                    features = []
                    prev_hour = 0
                    for k in range(start, len(data_in_chunk)):
                        features.append(float(data_in_chunk[k][t]))
                        if data_in_chunk[k][5] == hour:
                            prev_hour = float(data_in_chunk[k][t])

                    features.append(prev_hour)

                    # Binary hour features.
                    for h in range(24):
                        if h == int(hour):
                            features.append(1)
                        else:
                            features.append(0)

                    # Binary month features.
                    month = int(sub_data[i][4])
                    for m in range(1, 13):
                        if m == month:
                            features.append(1)
                        else:
                            features.append(0)

                    # Weather features.
                    tmp_length = len(data_in_chunk)
                    for k in range(6, 56):
                        features.append(float(data_in_chunk[tmp_length - 1][k]))
                    for k in range(6, 56):
                        features.append(float(data_in_chunk[tmp_length - 2][k]))

                    sub_data[i][j] = \
                        clf_map[(target, pos)].predict([features])[0]

    utilities.write_file(output_file, sub_data)
Exemple #4
0
def time_series(training_file, submission_file, output_file):
    data = utilities.read_file(training_file, True)
    first_line = data[0]
    data = data[1::]
    data = preprocess.fill_NAs(data)

    (chunk_avg, hour_avg_by_chunk, weekday_avg_by_chunk, hour_avg,
     weekday_avg) = feature_extraction.get_avg_maps(data)

    clf_map = regression.linear_regression_2(data)

    print 'Filling submission file...'
    chunk_map = utilities.get_chunk_map(data, 1)
    sub_data = utilities.read_file(submission_file, True)

    positions = [1, 2, 3, 4, 5, 10, 17, 24, 48, 72]
    for i in range(1, len(sub_data)):
        chunk_id = sub_data[i][1]
        hour = sub_data[i][3]
        pos = positions[(i - 1) % 10]
        for j in range(5, len(sub_data[i])):
            target = j - 5
            if sub_data[i][j] == '0':
                if not chunk_id in chunk_map:
                    sub_data[i][j] = hour_avg[hour][target]
                else:
                    data_in_chunk = chunk_map[chunk_id]
                    start = len(data_in_chunk) - 24
                    t = len(data_in_chunk[0]) - 39 + target
                    features = []
                    prev_hour = 0
                    for k in range(start, len(data_in_chunk)):
                        features.append(float(data_in_chunk[k][t]))
                        if data_in_chunk[k][5] == hour:
                            prev_hour = float(data_in_chunk[k][t])

                    features.append(prev_hour)

                    # Binary hour features.
                    for h in range(24):
                        if h == int(hour):
                            features.append(1)
                        else:
                            features.append(0)

                    # Binary month features.
                    month = int(sub_data[i][4])
                    for m in range(1, 13):
                        if m == month:
                            features.append(1)
                        else:
                            features.append(0)

                    # Weather features.
                    tmp_length = len(data_in_chunk)
                    for k in range(6, 56):
                        features.append(float(data_in_chunk[tmp_length -
                                                            1][k]))
                    for k in range(6, 56):
                        features.append(float(data_in_chunk[tmp_length -
                                                            2][k]))

                    sub_data[i][j] = \
                        clf_map[(target, pos)].predict([features])[0]

    utilities.write_file(output_file, sub_data)