Example #1
0
def test_minimize():
    """Fit ConversionEstimator on a tiny 4-row frame and check convergence.

    Verifies that the fitted ``coef_`` / ``lag_coef_`` are ndarrays and
    that L-BFGS-B reports one of its two standard convergence messages.
    """
    df = pd.DataFrame(
        [[0., np.nan, random()], [random(), np.nan, random()],
         [0., random(), random()], [0., random(), random()]],
        columns=['click_time', 'conv_time', 'random_feature'])

    features = ['random_feature']
    end_time = 1.

    # BUG FIX: DataFrame.as_matrix() was deprecated in pandas 0.23 and
    # removed in 1.0; .values is the supported equivalent and what the
    # other tests in this file already use.
    y = df[['click_time', 'conv_time']].values
    X = csr_matrix(df[features])

    input_ = prepare_input(y, X, end_time=end_time)
    input_['Jacobian'] = np.array([random(), random(), random(), random()])

    clf = ConversionEstimator(end_time=end_time)
    clf.fit(X, y)

    assert isinstance(clf.coef_, np.ndarray) and isinstance(
        clf.lag_coef_, np.ndarray)
    assert clf.convergence_info['success']
    assert clf.convergence_info['message'] in {
        b'CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL',
        b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
    }
Example #2
0
def test_minimize_large(test_dummied_matrix):
    """Fit the estimator on the large fixture matrix and check convergence."""
    y, X = test_dummied_matrix

    # Observation window: 10% beyond the latest observed conversion lag.
    observed_lags = y[:, 1]
    end_time = 1.1 * observed_lags[~np.isnan(observed_lags)].max()

    input_ = prepare_input(y, X, end_time=end_time)
    input_['Jacobian'] = np.array([random() for _ in range(2 * X.shape[1])])

    clf = ConversionEstimator(end_time=end_time)
    clf.fit(X, y)

    assert isinstance(clf.coef_, np.ndarray)
    assert isinstance(clf.lag_coef_, np.ndarray)
    assert clf.convergence_info['success']
    assert clf.convergence_info['message'] in {
        b'CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL',
        b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
    }
Example #3
0
def get_model(serialized_model):
    """Rebuild an estimator and its (optional) encoder from a serialized dict.

    Returns a ``(estimator, encoder)`` tuple; ``encoder`` is ``None`` when
    the serialized model carries neither a one-hot keymap nor a hashing
    modulo.
    """
    estimator = ConversionEstimator(end_time=serialized_model['end_time'])
    estimator.coef_ = serialized_model['coef_']
    estimator.lag_coef_ = serialized_model['lag_coef_']

    encoder = None
    if serialized_model.get('keymap') is not None:
        # One-hot path: restore the fitted key mapping directly.
        encoder = OneHotEncoderCOO(
            features=serialized_model['features'],
            add_intercept=serialized_model['add_intercept'])
        encoder.keymap = serialized_model['keymap']
    elif serialized_model.get('modulo') is not None:
        # Hashing path: the hasher needs no fitted state beyond its modulo.
        encoder = FeatureHasher(
            serialized_model['modulo'],
            features=serialized_model['features'],
            add_intercept=serialized_model['add_intercept'])

    return estimator, encoder
Example #4
0
def get_log_loss(df, end_time):
    """Fit on right-censored observations and return the model's log loss.

    All columns except ``click_time`` / ``conv_time`` are treated as
    features; conversions after ``end_time`` are censored to NaN before
    fitting, while the loss is computed against the uncensored truth.
    """
    feature_cols = [c for c in df.columns
                    if c not in ('click_time', 'conv_time')]

    observed = df[['click_time', 'conv_time']].copy()
    observed['conv_time'] = observed['conv_time'].map(
        lambda t: t if (t <= end_time and not np.isnan(t)) else np.nan)
    y_obs = observed.values
    X = csr_matrix(df[feature_cols].values)

    estimator = ConversionEstimator(end_time)
    estimator.fit(X, y_obs)

    assert estimator.convergence_info.success

    logger.info('The convergence message is: {}'.format(
        estimator.convergence_info.message))

    # Ground truth keeps every conversion, even those past end_time.
    y_true = df['conv_time'].notnull()
    y_pred = estimator.predict(X)

    return estimator.log_loss(y_true, y_pred)
Example #5
0
    def get_fitted_estimator_and_encoder():
        """Build a tiny toy frame and fit both estimator and encoder on it."""
        df = pd.DataFrame(
            [[0., np.nan, random()],
             [random(), np.nan, random()],
             [0., random(), random()],
             [0., random(), random()]],
            columns=['click_time', 'conv_time', 'random_feature'])

        feature_cols = ['random_feature']
        end_time = 1.

        y = df[['click_time', 'conv_time']].values
        X = df[feature_cols].values
        X_enc = csr_matrix(X)

        input_ = prepare_input(y, X_enc, end_time=end_time)
        input_['Jacobian'] = np.array([random() for _ in range(4)])

        estimator = ConversionEstimator(end_time=end_time)
        estimator.fit(X_enc, y)

        encoder = OneHotEncoderCOO(features=feature_cols)
        encoder.fit(X)

        return estimator, encoder
Example #6
0
def time_pipeline(df, end_time):
    """Return the wall-clock seconds taken by the censor/encode/fit pipeline."""
    start = time()

    feature_cols = [c for c in df.columns
                    if c not in ('click_time', 'conv_time')]

    targets = df[['click_time', 'conv_time']].copy()
    targets['conv_time'] = targets['conv_time'].map(
        lambda t: t if (t <= end_time and not np.isnan(t)) else np.nan)
    y_obs = targets.values
    X = csr_matrix(df[feature_cols].values)

    estimator = ConversionEstimator(end_time)
    estimator.fit(X, y_obs)

    assert estimator.convergence_info.success

    logger.info('The convergence message is: {}'.format(
        estimator.convergence_info.message))

    return time() - start
Example #7
0
def main():
    conv_prob_list = [0.1, 0.2, 0.3, 0.4]
    events = 1000
    lambda_list = [1 / 8, 1 / 4, 1 / 2, 1., 2.]
    end_time = 1.

    for conv_prob_, scale_ in product(conv_prob_list, lambda_list):
        number_of_simulations = 100
        predicted_probability_list_1 = []
        predicted_probability_list_2 = []

        for i in range(number_of_simulations):
            conv_lags_1 = [
                exponential(scale=scale_) * int(conv_prob_ - random() > 0)
                for _ in range(events)
            ]
            conv_lags_2 = [
                exponential(scale=2. * scale_) *
                int(conv_prob_ / 2. - random() > 0) for _ in range(events)
            ]
            df = pd.DataFrame(
                [[0., t, 1, 0] for t in conv_lags_1] + [[0., t, 0, 1]
                                                        for t in conv_lags_2],
                columns=['click_time', 'conv_time', 'f_1', 'f_2'])

            df['conv_time'] = df['conv_time'].map(lambda t: t
                                                  if t > 0 else np.nan)
            mc_1 = df[(df['f_1'] == 1) & (df['conv_time'].notnull(
            ))].shape[0] / df[df['f_1'] == 1].shape[0]
            mc_2 = df[(df['f_2'] == 1) & (df['conv_time'].notnull(
            ))].shape[0] / df[df['f_2'] == 1].shape[0]

            # measured data
            df['conv_time'] = df['conv_time'].map(
                lambda t: t if t <= end_time and not np.isnan(t) else np.nan)

            y = df[['click_time', 'conv_time']].values
            X = csr_matrix(df[['f_1', 'f_2']].values)

            clf = ConversionEstimator(end_time=end_time)
            clf.fit(X, y)

            assert clf.convergence_info['success']
            assert clf.convergence_info['message'] in {
                b'CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL',
                b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
            }

            predicted_probability_1 = clf.predict(np.array([[1, 0]]))[0]
            predicted_probability_list_1.append(predicted_probability_1)

            predicted_probability_2 = clf.predict(np.array([[0, 1]]))[0]
            predicted_probability_list_2.append(predicted_probability_2)

        f_1 = pd.DataFrame(predicted_probability_list_1,
                           columns=['predicted conversion probability'])
        f_2 = pd.DataFrame(predicted_probability_list_2,
                           columns=['predicted conversion probability'])

        print('conv_probability: {}, lambda: {}, conversion rate: {}'.format(
            conv_prob_, 1 / scale_, mc_1))
        print(f_1.describe())
        l = 1  # NOQA
        while abs(f_1.mean()[0] - conv_prob_) >= l * f_1.std()[0]:
            l += 1  # NOQA
        print(" |mean_pred_conv_prob - conv_prob| < {} * std_pred_conv_prob".
              format(l))

        print('conv_probability: {}, lambda: {}, conversion rate: {}'.format(
            conv_prob_ / 2, 1 / (3 * scale_), mc_2))
        print(f_2.describe())
        l = 1  # NOQA
        while abs(f_2.mean()[0] - conv_prob_ / 2) >= l * f_2.std()[0]:
            l += 1  # NOQA
        print(" |mean_pred_conv_prob - conv_prob| < {} * std_pred_conv_prob".
              format(l))

        print('----------')