def test_minimize():
    """Fit ConversionEstimator on a tiny random frame and check convergence.

    Builds a 4-row frame with one random feature, fits the estimator, and
    asserts that the coefficients are ndarrays and that L-BFGS-B reports a
    successful convergence message.
    """
    df = pd.DataFrame(
        [[0., np.nan, random()],
         [random(), np.nan, random()],
         [0., random(), random()],
         [0., random(), random()]],
        columns=['click_time', 'conv_time', 'random_feature'])
    features = ['random_feature']
    end_time = 1.
    # BUG FIX: DataFrame.as_matrix() was removed in pandas 1.0; use .values,
    # which is what every other function in this file already uses.
    y = df[['click_time', 'conv_time']].values
    X = csr_matrix(df[features])
    input_ = prepare_input(y, X, end_time=end_time)
    input_['Jacobian'] = np.array([random(), random(), random(), random()])
    clf = ConversionEstimator(end_time=end_time)
    clf.fit(X, y)
    assert isinstance(clf.coef_, np.ndarray) and isinstance(
        clf.lag_coef_, np.ndarray)
    assert clf.convergence_info['success']
    assert clf.convergence_info['message'] in {
        b'CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL',
        b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
    }
def test_minimize_large(test_dummied_matrix):
    """Fit ConversionEstimator on the large fixture and check convergence.

    ``end_time`` is set 10% past the largest observed conversion time so no
    observed conversion is censored away.
    """
    y, X = test_dummied_matrix
    conv_times = y[:, 1]
    end_time = 1.1 * conv_times[~np.isnan(conv_times)].max()
    input_ = prepare_input(y, X, end_time=end_time)
    # Two coefficients (rate + lag) per feature column.
    input_['Jacobian'] = np.array([random() for _ in range(2 * X.shape[1])])
    estimator = ConversionEstimator(end_time=end_time)
    estimator.fit(X, y)
    assert isinstance(estimator.coef_, np.ndarray)
    assert isinstance(estimator.lag_coef_, np.ndarray)
    assert estimator.convergence_info['success']
    expected_messages = {
        b'CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL',
        b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
    }
    assert estimator.convergence_info['message'] in expected_messages
def get_model(serialized_model):
    """Rebuild an (estimator, encoder) pair from a serialized-model dict.

    The encoder is a OneHotEncoderCOO when a 'keymap' is present, a
    FeatureHasher when a 'modulo' is present, and None otherwise.
    """
    estimator = ConversionEstimator(end_time=serialized_model['end_time'])
    estimator.coef_ = serialized_model['coef_']
    estimator.lag_coef_ = serialized_model['lag_coef_']

    encoder = None
    if serialized_model.get('keymap') is not None:
        encoder = OneHotEncoderCOO(
            features=serialized_model['features'],
            add_intercept=serialized_model['add_intercept'])
        encoder.keymap = serialized_model['keymap']
    elif serialized_model.get('modulo') is not None:
        encoder = FeatureHasher(
            serialized_model['modulo'],
            features=serialized_model['features'],
            add_intercept=serialized_model['add_intercept'])
    return estimator, encoder
def get_log_loss(df, end_time):
    """Fit an estimator on right-censored data and return its log loss.

    Conversions observed after ``end_time`` are masked to NaN before fitting;
    the loss is then computed against the uncensored conversion indicator.
    """
    # Every column except the two time columns is a feature.
    features = [col for col in df.columns
                if col not in ('click_time', 'conv_time')]
    y_obs = df[['click_time', 'conv_time']].copy()
    # Censor: conversions after end_time (or missing) are treated as unseen.
    y_obs['conv_time'] = y_obs['conv_time'].map(
        lambda x: np.nan if (np.isnan(x) or x > end_time) else x)
    y_obs = y_obs.values
    X = csr_matrix(df[features].values)
    estimator = ConversionEstimator(end_time)
    estimator.fit(X, y_obs)
    assert estimator.convergence_info.success
    logger.info('The convergence message is: {}'.format(
        estimator.convergence_info.message))
    # Ground truth: whether a conversion ever happened (uncensored).
    y_true = df['conv_time'].notnull()
    y_pred = estimator.predict(X)
    return estimator.log_loss(y_true, y_pred)
def get_fitted_estimator_and_encoder():
    """Build and fit a small estimator/encoder pair on random data.

    Returns the fitted ConversionEstimator and a OneHotEncoderCOO fitted on
    the raw (unencoded) feature matrix.
    """
    df = pd.DataFrame(
        [[0., np.nan, random()],
         [random(), np.nan, random()],
         [0., random(), random()],
         [0., random(), random()]],
        columns=['click_time', 'conv_time', 'random_feature'])
    features = ['random_feature']
    end_time = 1.
    y = df[['click_time', 'conv_time']].values
    X = df[features].values
    X_enc = csr_matrix(X)
    input_ = prepare_input(y, X_enc, end_time=end_time)
    input_['Jacobian'] = np.array([random() for _ in range(4)])
    estimator = ConversionEstimator(end_time=end_time)
    estimator.fit(X_enc, y)
    encoder = OneHotEncoderCOO(features=['random_feature'])
    encoder.fit(X)
    return estimator, encoder
def time_pipeline(df, end_time):
    """Return the wall-clock seconds taken to prepare data and fit on *df*.

    Mirrors the fitting pipeline of ``get_log_loss`` (censoring conversions
    past ``end_time``) but only measures elapsed time.
    """
    start = time()
    # Every column except the two time columns is a feature.
    features = [col for col in df.columns
                if col not in ('click_time', 'conv_time')]
    y_obs = df[['click_time', 'conv_time']].copy()
    # Censor: conversions after end_time (or missing) are treated as unseen.
    y_obs['conv_time'] = y_obs['conv_time'].map(
        lambda x: np.nan if (np.isnan(x) or x > end_time) else x)
    y_obs = y_obs.values
    X = csr_matrix(df[features].values)
    estimator = ConversionEstimator(end_time)
    estimator.fit(X, y_obs)
    assert estimator.convergence_info.success
    logger.info('The convergence message is: {}'.format(
        estimator.convergence_info.message))
    return time() - start
def main():
    """Monte-Carlo sanity check of ConversionEstimator.

    For every (conversion probability, lag scale) pair, repeatedly simulates
    two mutually exclusive features — feature 2 converts half as often with
    twice the lag scale — fits the estimator on right-censored data, and
    reports how many empirical standard deviations separate the mean
    predicted conversion probability from the true one.
    """
    conv_prob_list = [0.1, 0.2, 0.3, 0.4]
    events = 1000
    lambda_list = [1 / 8, 1 / 4, 1 / 2, 1., 2.]
    end_time = 1.
    for conv_prob_, scale_ in product(conv_prob_list, lambda_list):
        number_of_simulations = 100
        predicted_probability_list_1 = []
        predicted_probability_list_2 = []
        for i in range(number_of_simulations):
            # Feature 1: converts with prob conv_prob_, lag ~ Exp(scale_).
            # A multiplier of 0 (no conversion) yields a lag of exactly 0.
            conv_lags_1 = [
                exponential(scale=scale_) * int(conv_prob_ - random() > 0)
                for _ in range(events)
            ]
            # Feature 2: half the conversion prob, double the lag scale.
            conv_lags_2 = [
                exponential(scale=2. * scale_) *
                int(conv_prob_ / 2. - random() > 0)
                for _ in range(events)
            ]
            df = pd.DataFrame(
                [[0., t, 1, 0] for t in conv_lags_1] +
                [[0., t, 0, 1] for t in conv_lags_2],
                columns=['click_time', 'conv_time', 'f_1', 'f_2'])
            # A lag of 0 encodes "no conversion".
            df['conv_time'] = df['conv_time'].map(
                lambda t: t if t > 0 else np.nan)
            # Empirical uncensored conversion rates; only the values from the
            # last simulation are reported below.
            mc_1 = df[(df['f_1'] == 1) &
                      (df['conv_time'].notnull())].shape[0] \
                / df[df['f_1'] == 1].shape[0]
            mc_2 = df[(df['f_2'] == 1) &
                      (df['conv_time'].notnull())].shape[0] \
                / df[df['f_2'] == 1].shape[0]
            # measured data: censor conversions past end_time
            df['conv_time'] = df['conv_time'].map(
                lambda t: t if t <= end_time and not np.isnan(t) else np.nan)
            y = df[['click_time', 'conv_time']].values
            X = csr_matrix(df[['f_1', 'f_2']].values)
            clf = ConversionEstimator(end_time=end_time)
            clf.fit(X, y)
            assert clf.convergence_info['success']
            assert clf.convergence_info['message'] in {
                b'CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL',
                b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
            }
            predicted_probability_list_1.append(
                clf.predict(np.array([[1, 0]]))[0])
            predicted_probability_list_2.append(
                clf.predict(np.array([[0, 1]]))[0])
        f_1 = pd.DataFrame(predicted_probability_list_1,
                           columns=['predicted conversion probability'])
        f_2 = pd.DataFrame(predicted_probability_list_2,
                           columns=['predicted conversion probability'])
        print('conv_probability: {}, lambda: {}, conversion rate: {}'.format(
            conv_prob_, 1 / scale_, mc_1))
        print(f_1.describe())
        # Smallest k with |mean - true prob| < k * std.
        n_std = 1
        while abs(f_1.mean()[0] - conv_prob_) >= n_std * f_1.std()[0]:
            n_std += 1
        print(" |mean_pred_conv_prob - conv_prob| < {} * std_pred_conv_prob".
              format(n_std))
        # BUG FIX: feature 2 is simulated with scale 2 * scale_, so its true
        # rate (lambda) is 1 / (2 * scale_), not 1 / (3 * scale_).
        print('conv_probability: {}, lambda: {}, conversion rate: {}'.format(
            conv_prob_ / 2, 1 / (2 * scale_), mc_2))
        print(f_2.describe())
        n_std = 1
        while abs(f_2.mean()[0] - conv_prob_ / 2) >= n_std * f_2.std()[0]:
            n_std += 1
        print(" |mean_pred_conv_prob - conv_prob| < {} * std_pred_conv_prob".
              format(n_std))
        print('----------')