Beispiel #1
0
    def test_weighted_avg(self):
        """Check ensemble.weighted_avg with and without explicit weights."""
        first, second, third = [1, 2, 3], [4, 5, 6], [7, 8, 9]
        frame = pd.DataFrame({'y1': first, 'y2': second, 'y3': third})

        # Without weights the result must match the plain row-wise mean
        # of the three prediction vectors.
        expected_plain = frame.mean(axis=1).values
        actual_plain = ensemble.weighted_avg([first, second, third])
        self.assertTrue(np.array_equal(expected_plain, actual_plain))

        # With weights 1:1:2 each element is (a + b + 2 * c) / 4.
        actual_weighted = ensemble.weighted_avg(
            [first, second, third], weights=[1, 1, 2])
        expected_weighted = np.array([19/4, 23/4, 27/4])
        self.assertTrue(np.array_equal(actual_weighted, expected_weighted))
Beispiel #2
0
def weighted_avg_from_files(
        fnames, outfile, weights=None,
        sample_submission_file=DEFAULT_SAMPLE_SUBMISSION_FILE,
        sample_submission_idx=DEFAULT_SAMPLE_SUBMISSION_IDX):
    """Compute weighted avg from submission files, and save results to a new file

    Parameters
    ----------
    fnames : Iterable of str's
        Submission file names of y_hats to be averaged

    outfile : str
        Output file name, including path

    weights : Iterable, optional
        Weights corresponding to y_hats in the same order.
        If weights is None or empty, an empty list is forwarded to
        `ensemble.weighted_avg`, i.e. the unweighted mean is requested.

    sample_submission_file : str
        Path to example submission file provided by Kaggle

    sample_submission_idx : str
        Index column name in `sample_submission_file`. The same column
        name is used as the index column when reading each file in
        `fnames`.

    Returns
    -------
    y_hat_avg : numpy.ndarray
        Weighted averages
    """
    # Use a None sentinel instead of a shared mutable default list; the
    # value handed to ensemble.weighted_avg is still an empty list.
    if weights is None:
        weights = []

    # `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed
    # in 2.0; squeezing the resulting frame along the column axis is the
    # documented replacement and yields a Series for single-column files.
    y_hats = [
        pd.read_csv(f, index_col=sample_submission_idx)
        .squeeze("columns")
        .values
        for f in fnames
    ]

    y_hat_avg = ensemble.weighted_avg(y_hats, weights=weights)
    save_submission(y_hat_avg, outfile,
                    sample_submission_file=sample_submission_file)
    return y_hat_avg