def test_summarizer(monkeypatch, rand_data, pre_int_period, post_int_period):
    """Check how ``CausalImpact.summary`` delegates and validates ``digits``.

    The model-fitting, posterior-processing and summarising methods of
    ``CausalImpact`` as well as the ``summarizer`` module are replaced by
    mocks, so only the argument forwarding done by ``summary`` is observed.
    """
    patched_summarizer = mock.Mock()
    # Patch targets in the order they are installed.
    replacements = [
        ('causalimpact.main.CausalImpact._fit_model', mock.Mock()),
        ('causalimpact.main.CausalImpact._summarize_inferences', mock.Mock()),
        ('causalimpact.main.CausalImpact._process_posterior_inferences',
         mock.Mock()),
        ('causalimpact.main.summarizer', patched_summarizer),
    ]
    for target, replacement in replacements:
        monkeypatch.setattr(target, replacement)

    impact = CausalImpact(rand_data, pre_int_period, post_int_period,
                          model_args={'fit_method': 'vi'})
    # Plant recognisable values so the forwarded arguments can be asserted.
    impact.summary_data = 'summary_data'
    impact.p_value = 0.5
    impact.alpha = 0.05

    impact.summary()
    patched_summarizer.summary.assert_called_with(
        'summary_data', 0.5, 0.05, 'summary', 2)

    # A non-integer ``digits`` value must be rejected with this exact message.
    expected_message = (
        'Input value for digits must be integer. Received '
        '"<class \'str\'>" instead.'
    )
    with pytest.raises(ValueError) as excinfo:
        impact.summary(digits='1')
    assert str(excinfo.value) == expected_message
def test_summary_w_report_output(self, monkeypatch, inference_input,
                                 summary_report_filename):
    """Check the text produced by ``summary(output='report')``.

    ``textwrap.dedent`` is replaced by a mock whose side effect appends each
    whitespace-normalised message it receives to a temporary file. The
    accumulated file content is then compared with the whitespace-normalised
    content of ``summary_report_filename``.

    Every file is opened through a context manager and the temporary
    directory is removed afterwards, so the test leaks neither file handles
    nor directories.
    """
    import shutil  # local import: only needed to clean up the temp dir

    inferences_df = pd.DataFrame(inference_input)
    causal = CausalImpact()
    causal.params = {'alpha': 0.05, 'post_period': [2, 4]}
    causal.inferences = inferences_df

    with open(summary_report_filename) as report_file:
        expected = report_file.read()
    expected = re.sub(r'\s+', ' ', expected).strip()

    tmpdir = mkdtemp()
    try:
        tmp_file = os.path.join(tmpdir, 'summary_test')

        def dedent_side_effect(msg):
            # Normalise whitespace the same way as the expected text, then
            # append, because ``dedent`` may be called more than once.
            msg = re.sub(r'\s+', ' ', msg).strip()
            with open(tmp_file, 'a') as file_obj:
                file_obj.write(msg)
            return msg

        dedent_mock = mock.Mock(side_effect=dedent_side_effect)
        monkeypatch.setattr('textwrap.dedent', dedent_mock)

        causal.summary(output='report')

        with open(tmp_file, 'r') as result_file:
            result_str = result_file.read()
    finally:
        shutil.rmtree(tmpdir, ignore_errors=True)

    assert result_str == expected
def test_summary_wrong_argument_raises(self, inference_input):
    """``summary`` must raise ``ValueError`` for an unknown ``output`` value."""
    model = CausalImpact()
    model.params = {'alpha': 0.05, 'post_period': [2, 4]}
    model.inferences = pd.DataFrame(inference_input)

    with pytest.raises(ValueError):
        model.summary(output='wrong_argument')
def causal_impact_analysis(ori_data, when_fi_started):
    """Run CausalImpact on a series split at ``when_fi_started``.

    Args:
        ori_data: Sequence of points where ``point[0]`` is a timestamp in
            epoch milliseconds and ``point[1]`` is the observed value.
            NOTE(review): assumed to be ordered by timestamp -- confirm
            against the caller.
        when_fi_started: Timestamp, in the same unit as ``point[0]``, from
            which the post-intervention period starts.

    Returns:
        Tuple ``(summary, report, p, prob, relative_effect)``:
        ``summary`` and ``report`` are the texts returned by
        ``CausalImpact.summary()`` and ``summary(output='report')``;
        ``p`` is the posterior tail-area probability, ``prob`` the posterior
        probability of a causal effect (percent) and ``relative_effect`` the
        signed average relative effect (percent), all parsed from
        ``summary``.

    Raises:
        ValueError: If no point lies before or at/after ``when_fi_started``
            (one of the periods would be empty), or if the expected figures
            cannot be found in the summary text.
    """
    # Locate the first point at or after the intervention. ``enumerate``
    # gives the true position even when two points compare equal, and
    # ``None`` (not 0) marks "not found" so index 0 is not misread.
    post_period_index = None
    for index, point in enumerate(ori_data):
        if when_fi_started <= point[0]:
            post_period_index = index
            break
    if post_period_index is None:
        raise ValueError(
            'No data point at or after when_fi_started; '
            'the post-intervention period would be empty.')
    if post_period_index == 0:
        raise ValueError(
            'No data point before when_fi_started; '
            'the pre-intervention period would be empty.')

    data_frame = pd.DataFrame({
        "timestamp": pd.to_datetime([point[0] for point in ori_data],
                                    unit="ms"),
        "y": [point[1] for point in ori_data],
    })
    data_frame = data_frame.set_index("timestamp")

    pre_period = [
        pd.to_datetime(ori_data[0][0], unit="ms"),
        pd.to_datetime(ori_data[post_period_index - 1][0], unit="ms"),
    ]
    post_period = [
        pd.to_datetime(ori_data[post_period_index][0], unit="ms"),
        pd.to_datetime(ori_data[-1][0], unit="ms"),
    ]

    causal_impact = CausalImpact(data_frame, pre_period, post_period,
                                 prior_level_sd=0.1)
    summary = causal_impact.summary()
    report = causal_impact.summary(output='report')
    logging.info(summary)
    logging.info(report)

    # Relative effect on average in the posterior area. Group 1 is the
    # (signed) effect; group 2 is the parenthesised standard deviation.
    pattern_re = re.compile(
        r'Relative effect \(s\.d\.\)\s+(-?(?:0\.\d+|[1-9]\d*\.\d+))%\s+\((0\.\d+|[1-9]\d*\.\d+)%\)'
    )
    match = pattern_re.search(summary)
    if match is None:
        raise ValueError(
            'Could not find the relative effect in the CausalImpact summary.')
    relative_effect = float(match.group(1))

    # Group 1: posterior tail-area probability.
    # Group 2: posterior probability of a causal effect (percent).
    pattern_p_value = re.compile(
        r'Posterior tail-area probability p: (0\.\d+|[1-9]\d*\.\d+)\sPosterior prob. of a causal effect: (0\.\d+|[1-9]\d*\.\d+)%'
    )
    match = pattern_p_value.search(summary)
    if match is None:
        raise ValueError(
            'Could not find the posterior probabilities in the CausalImpact '
            'summary.')
    p = float(match.group(1))
    prob = float(match.group(2))

    return summary, report, p, prob, relative_effect
def main():
    """Load a Glowroot JSON export and run CausalImpact on its first series.

    The first entry of ``dataSeries`` is read as ``[timestamp_ms, value]``
    points, indexed by timestamp, analysed over fixed pre/post periods, and
    the summary is logged before the result is plotted.
    """
    with open("/path/to/glowroot/data.json", 'rt') as json_file:
        glowroot_data = json.load(json_file)

    points = glowroot_data["dataSeries"][0]["data"]
    timestamps = [point[0] for point in points]
    values = [point[1] for point in points]

    data_frame = pd.DataFrame({
        "timestamp": pd.to_datetime(timestamps, unit="ms"),
        "y": values,
    }).set_index("timestamp")
    logging.info(data_frame)

    # Fixed analysis windows, given as epoch milliseconds.
    pre_start, pre_end = 1573661277259, 1573661647328
    post_start, post_end = 1573661652328, 1573661932369
    pre_period = [
        pd.to_datetime(pre_start, unit="ms"),
        pd.to_datetime(pre_end, unit="ms"),
    ]
    post_period = [
        pd.to_datetime(post_start, unit="ms"),
        pd.to_datetime(post_end, unit="ms"),
    ]

    causal_impact = CausalImpact(data_frame, pre_period, post_period,
                                 prior_level_sd=0.1)
    logging.info(causal_impact.summary())
    causal_impact.plot()
def test_summary(self, inference_input):
    """Check the table written by ``summary(path=...)``.

    The expected summary table is dumped with ``DataFrame.to_csv`` and its
    text is compared with the file that ``summary`` writes to ``path``.

    Files are read through context managers and the temporary directory is
    removed afterwards, so the test leaks neither file handles nor
    directories.
    """
    import shutil  # local import: only needed to clean up the temp dir

    inferences_df = pd.DataFrame(inference_input)
    causal = CausalImpact()
    causal.params = {'alpha': 0.05, 'post_period': [2, 4]}
    causal.inferences = inferences_df

    expected = [
        [3, 7],
        [3, 7],
        [[3, 3], [7, 7]],
        [' ', ' '],
        [0, 0],
        [[0, 0], [0, 0]],
        [' ', ' '],
        ['-2.8%', '-2.8%'],
        [['0.0%', '-11.1%'], ['0.0%', '-11.1%']],
        [' ', ' '],
        ['0.0%', ' '],
        ['100.0%', ' '],
    ]
    expected = pd.DataFrame(
        expected,
        columns=['Average', 'Cumulative'],
        index=[
            'Actual', 'Predicted', '95% CI', ' ', 'Absolute Effect',
            '95% CI', ' ', 'Relative Effect', '95% CI', " ", "P-value",
            "Prob. of Causal Effect"
        ])

    tmpdir = mkdtemp()
    try:
        result_file = os.path.join(tmpdir, 'tmp_test_summary')
        expected_file = os.path.join(tmpdir, 'tmp_expected')

        expected.to_csv(expected_file)
        with open(expected_file) as handle:
            expected_str = handle.read()

        causal.summary(path=result_file)
        with open(result_file) as handle:
            result = handle.read()
    finally:
        shutil.rmtree(tmpdir, ignore_errors=True)

    assert result == expected_str
def causal_impact_analysis(ori_data, when_fi_started):
    """Run CausalImpact on a series split at ``when_fi_started``.

    Args:
        ori_data: Sequence of points where ``point[0]`` is a timestamp in
            epoch milliseconds and ``point[1]`` is the observed value.
            NOTE(review): assumed to be ordered by timestamp -- confirm
            against the caller.
        when_fi_started: Timestamp, in the same unit as ``point[0]``, from
            which the post-intervention period starts.

    Returns:
        Tuple ``(summary, report, p, prob)``: ``summary`` and ``report`` are
        the texts returned by ``CausalImpact.summary()`` and
        ``summary(output='report')``; ``p`` is the posterior tail-area
        probability and ``prob`` the posterior probability of a causal
        effect (percent), both parsed from ``summary``.

    Raises:
        ValueError: If no point lies before or at/after ``when_fi_started``
            (one of the periods would be empty), or if the probabilities
            cannot be found in the summary text.
    """
    # Locate the first point at or after the intervention. ``enumerate``
    # gives the true position even when two points compare equal, and
    # ``None`` (not 0) marks "not found" so index 0 is not misread.
    post_period_index = None
    for index, point in enumerate(ori_data):
        if when_fi_started <= point[0]:
            post_period_index = index
            break
    if post_period_index is None:
        raise ValueError(
            'No data point at or after when_fi_started; '
            'the post-intervention period would be empty.')
    if post_period_index == 0:
        raise ValueError(
            'No data point before when_fi_started; '
            'the pre-intervention period would be empty.')

    data_frame = pd.DataFrame({
        "timestamp": pd.to_datetime([point[0] for point in ori_data],
                                    unit="ms"),
        "y": [point[1] for point in ori_data],
    })
    data_frame = data_frame.set_index("timestamp")

    pre_period = [
        pd.to_datetime(ori_data[0][0], unit="ms"),
        pd.to_datetime(ori_data[post_period_index - 1][0], unit="ms"),
    ]
    post_period = [
        pd.to_datetime(ori_data[post_period_index][0], unit="ms"),
        pd.to_datetime(ori_data[-1][0], unit="ms"),
    ]

    causal_impact = CausalImpact(data_frame, pre_period, post_period,
                                 prior_level_sd=0.1)
    # Build each text once and reuse it for both parsing and the return.
    summary = causal_impact.summary()
    report = causal_impact.summary(output='report')

    # Group 1: posterior tail-area probability.
    # Group 2: posterior probability of a causal effect (percent).
    pattern = re.compile(
        r'Posterior tail-area probability p: (0\.\d+|[1-9]\d*\.\d+)\sPosterior prob. of a causal effect: (0\.\d+|[1-9]\d*\.\d+)%'
    )
    match = pattern.search(summary)
    if match is None:
        raise ValueError(
            'Could not find the posterior probabilities in the CausalImpact '
            'summary.')
    p = float(match.group(1))
    prob = float(match.group(2))

    return summary, report, p, prob
# Causal Impact
# 8. Using a custom model
# Ended up going with this package.
# pip install pycausalimpact
# https://github.com/dafiti/causalimpact/blob/master/examples/getting_started.ipynb
from causalimpact import CausalImpact

# x_test, x_train =
# y_test, y_train =

# Prediction period
# exog only needs to be supplied for the pre_period.
# NOTE(review): pre_period and post_period are identical below; they look
# like placeholders -- confirm the intended, non-overlapping date ranges.
pre_period = ['2019-12-01', '2019-12-31']
post_period = ['2019-12-01', '2019-12-31']

# prior_level_sd=None, nseasons=[{'period': 52}]
# Positional arguments must come before keyword arguments; the previous call
# (`data=nq, model=model, pre_period, post_period`) was a SyntaxError.
# `nq` and `model` are expected to be defined earlier in the script.
ci = CausalImpact(nq, pre_period, post_period, model=model)

# Visualization
ci.plot(figsize=(14, 8))

# Model summary
ci.summary()
axis=1)
# NOTE(review): the line above closes a call that starts before this chunk;
# it is kept verbatim.

# Rename the 'close' column to 'close_voo', for neatness.
data = data.rename(columns={
    'close': 'close_voo',
})
print(data)

# Check if the SP500 looks like a solid input for our synthetic control
# data.plot()
# plt.savefig('summary.svg')

# Define periods. The article came out ~2:30 EDT on May 9th, so treat the end
# of markets on the Friday before as the end of the pre-treatment period.
pre_period = [
    pd.Timestamp('2020-05-01 13:30:00+00:00'),
    pd.Timestamp('2020-05-08 20:00:00+00:00')
]
post_period = [
    pd.Timestamp('2020-05-11 13:30:00+00:00'),
    pd.Timestamp('2020-05-15 19:50:00+00:00')
]

# Feed it into CausalImpact: keep the three needed columns and index the
# frame by 'date'.
amc_data = data[['close_amc', 'close_voo', 'date']]
amc_data = amc_data.set_index('date')
ci = CausalImpact(amc_data, pre_period, post_period, prior_level_sd=None)
ci.plot()
# Print the 'report' output; 5 is the second positional argument of summary()
# (presumably the number of digits -- confirm against the library signature).
print(ci.summary('report', 5))
t2 = 32
# Rolling means of the close price over windows of t1 and t2 rows.
# NOTE(review): t1 is defined before this chunk, and the column names mention
# 252d/21d while t2 is 32 -- confirm the intended window sizes.
df['close_252d_rolling'] = df['close'].rolling(t1).mean()
df['close_21d_rolling'] = df['close'].rolling(t2).mean()

pre_period = [df.index[0], '2020/10/19 22:00']  # Define pre-event period
post_period = ['2020/10/20 01:00', df.index[-1]]  # Define post-event period

# Slice the frame at the same boundaries to print descriptive statistics.
pre_period_df = df[df.index <= '2020/10/19 22:00']
post_period_df = df[df.index >= '2020/10/20 01:00']
print('Pre-Event Statistics')
print(pre_period_df.describe())
print('Post-Event Statistics')
print(post_period_df.describe())

ci = CausalImpact(df['close'], pre_period, post_period)

##########################
# FROM HERE ON STILL NEEDS FIXING
##########################
ci.plot(figsize=(12, 6))
ci.plot(panels=['original', 'pointwise'], figsize=(12, 8))
print(ci.summary())
# NOTE(review): bare expression; its value is discarded when run as a script
# (presumably a notebook leftover).
ci.trained_model.params
print(ci.trained_model.summary())
_ = ci.trained_model.plot_diagnostics(figsize=(14, 6))
# NOTE(review): bare expression, same as above.
ci.trained_model.specification
df['close'].plot(figsize=(12, 4))
def _show_distribution_comparison(test_values, control_values):
    """Render density and box plots comparing test and control values."""
    print('---Step-1:Distribution Plot---')
    plt.figure()
    sns.distplot(test_values, hist=False, kde=True)
    sns.distplot(control_values, hist=False, kde=True)
    # Dashed vertical lines mark the mean of each group.
    plt.axvline(np.mean(test_values),
                color='b',
                linestyle='dashed',
                label='TEST',
                linewidth=5)
    plt.axvline(np.mean(control_values),
                color='orange',
                linestyle='dashed',
                label='CONTROL',
                linewidth=5)
    plt.legend(labels=['TEST', 'CONTROL'])
    st.subheader('Distribution Comparison(Density Plot)')
    st.pyplot()
    sns.boxplot(data=[test_values, control_values], showmeans=True)
    st.subheader('Distribution Comparison(Box Plot)')
    st.pyplot()


def _show_test_result(result):
    """Display a test result table and the inference for its first p-value."""
    st.dataframe(result)
    if result['p-value'].iloc[0] > 0.1:
        message = (
            'According to the null hypothesis, there is no difference '
            'between the means. The plot above shows the distribution of '
            'the difference of the means that we would expect under the '
            'null hypothesis.')
    else:
        message = (
            'According to the null hypothesis, there is siginificant '
            'difference between the means. The plot above shows the '
            'distribution of the difference of the means that we would '
            'expect under the null hypothesis.')
    st.markdown('''### Inference ''')
    st.write(message)


def main():
    """Render the Streamlit A/B-testing tool.

    Loads the sample CSV files, builds a synthetic pre/post series, and then
    renders one of the sidebar-selected pages: sample-size estimation or a
    statistical measurement (t-test, CausalImpact pre/post, CUPED, or
    Mann-Whitney U). The 'Analysis & Recommendation' menu entry has no page
    attached to it here.
    """
    st.title("""AB-Testing Tool """)
    html_temp = (
        ' <div style="background-color:orange;padding:10px"> '
        '<h2 style="color:black;text-align:center;">'
        'Online Marketing Campaigns</h2> </div> ')

    print('----Data Imports ------')
    df = pd.read_csv('./main/streamlit/data/fake_data.csv')
    cup_df = pd.read_csv('./main/streamlit/data/fake_data_cuped.csv')

    test_f = {'Control_Matrix': 'Control', 'Variant_BT': 'Test'}
    df['test_flag'] = df['Variant'].replace(test_f)
    cup_df['test_flag'] = cup_df['Variant'].replace(test_f)

    # Test/control splits of the plain data set. The CUPED splits are built
    # inside the CUPED branch; they used to be created here too and silently
    # overwrote ``control`` with rows from the CUPED data set.
    test = df[df['test_flag'] == 'Test']
    control = df[df['test_flag'] == 'Control']

    # Synthetic series for the pre/post analysis: y follows X and is shifted
    # by +5 from index 70 onwards.
    np.random.seed(12345)
    ar = np.r_[1, 0.9]
    ma = np.array([1])
    arma_process = ArmaProcess(ar, ma)
    X = 100 + arma_process.generate_sample(nsample=100)
    y = 1.2 * X + np.random.normal(size=100)
    y[70:] += 5
    pre_post_data = pd.DataFrame({'y': y, 'X': X}, columns=['y', 'X'])
    pre_period = [0, 69]
    post_period = [70, 99]

    print('======================================================')
    print('----------- Sample Size Estimation--------------------')
    print('======================================================')
    st.markdown(html_temp, unsafe_allow_html=True)

    detectable_change = [
        0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.10
    ]
    MENU = [
        'Sample-Size-Estimation', 'Stat Base Measurement',
        'Analysis & Recommendation'
    ]
    choice = st.sidebar.radio(''' Click here ''', MENU)

    if choice == 'Sample-Size-Estimation':
        mean_sales = st.sidebar.number_input('Base-Mean', 1)
        std_sales = st.sidebar.number_input('Base-StdDev', 1)
        # NOTE(review): alpha and power are collected from the sidebar but
        # are not passed to sample_size_calculator below -- confirm intent.
        alpha = st.sidebar.number_input('Alpha_Value', 0.05)
        power = st.sidebar.number_input('Power_Value', 0.8)

        k = min_detectable_data_prep(mean_sales, std_sales,
                                     detectable_change)
        k['require_sample_size'] = np.vectorize(sample_size_calculator)(
            k['mu_base'], k['mu_hat'], k['std_base'])
        st.subheader(
            'Sample Sizes for different scenario of Minimum Detectable '
            'Effect')
        st.write(
            ' Enter your data into the sidebar and choose what will be Base '
            'Mean of KPI & Base Std Deviation of KPI. Below table shows the '
            'different sample sizes for different MDE(Minimum detectable '
            'Effect) ')
        st.dataframe(k)
        k['effect_in_%'] = (k['detectable_effect'] * 100)
        sns.pointplot(
            x=k['effect_in_%'],
            y=k['require_sample_size'],
            color='blue',
        )
        st.pyplot()

    elif choice == 'Stat Base Measurement':
        METRIC = st.sidebar.selectbox('Choose the metric',
                                      ['Pvs_per_session'])
        METHOD = st.sidebar.selectbox('Choose the method', [
            'Post (Control) Vs Post (Test)', 'Pre (Test) Vs Post(Test)',
            'CUPED', 'Post (Control) Vs Post (Test) NonParametric'
        ])

        if METHOD == 'Post (Control) Vs Post (Test)':
            _show_distribution_comparison(test[METRIC], control[METRIC])
            print('--Step-2:T-Test for Mean Comparison--')
            st.subheader(
                'Mean comparison between Test & Control Distribution using Welsh T-Test'
            )
            r = t_distribution_ci(df,
                                  metric=METRIC,
                                  control='Control',
                                  test='Test',
                                  alpha=0.05)
            _show_test_result(r)

        elif METHOD == 'Pre (Test) Vs Post(Test)':
            ci = CausalImpact(pre_post_data, pre_period, post_period)
            report = ci.summary(output='report')
            print(ci.summary())
            print(report)
            pre_post_report = ci.summary_data
            pre_post_report['p_value'] = ci.p_value
            pre_post_report['siginificance'] = np.where(
                pre_post_report['p_value'] > 0.1, 'Not Significant',
                'Significant')
            st.subheader('Causal Inference Analysis')
            ci.plot()
            st.pyplot()
            st.subheader('Causal Inference statistical output')
            st.write(report)
            st.dataframe(pre_post_report)

        elif METHOD == 'CUPED':
            cup_df = CUPED(cup_df, KPI=METRIC)
            test_cuped = cup_df[cup_df['test_flag'] == 'Test']
            control_cuped = cup_df[cup_df['test_flag'] == 'Control']
            cup_r = t_distribution_ci(cup_df,
                                      metric='CUPED-adjusted_metric',
                                      control='Control',
                                      test='Test',
                                      alpha=0.05)
            st.subheader('Pre Vs Post Correlation to understand Variance')
            sns.jointplot(cup_df[METRIC],
                          cup_df[METRIC + '_pre_experiment'],
                          kind="reg",
                          stat_func=r2)
            st.pyplot()
            sns.distplot(test_cuped['CUPED-adjusted_metric'],
                         hist=False,
                         kde=True)
            sns.distplot(control_cuped['CUPED-adjusted_metric'],
                         hist=False,
                         kde=True)
            plt.axvline(np.mean(test_cuped['CUPED-adjusted_metric']),
                        color='b',
                        linestyle='dashed',
                        label='TEST',
                        linewidth=5)
            plt.axvline(np.mean(control_cuped['CUPED-adjusted_metric']),
                        color='orange',
                        linestyle='dashed',
                        label='CONTROL',
                        linewidth=5)
            plt.legend(labels=['TEST', 'CONTROL'])
            st.subheader(
                'CUPED-Distribution Comparison(Density Plot) after removing variance'
            )
            st.pyplot()
            st.subheader(
                'CUPED-Mean comparison between Test & Control Distribution using Welsh T-Test after removing variance'
            )
            st.dataframe(cup_r)

        elif METHOD == 'Post (Control) Vs Post (Test) NonParametric':
            # The control series used to be passed as ``[METRIC]`` (a list
            # holding the column name) instead of ``control[METRIC]``.
            _show_distribution_comparison(test[METRIC], control[METRIC])
            print('--Step-2:T-Test for Mean Comparison--')
            # NOTE(review): the heading mentions a Welsh T-Test although the
            # Mann-Whitney U test is what runs below -- confirm the label.
            st.subheader(
                'Mean comparison between Test & Control Distribution using Welsh T-Test'
            )
            df[METRIC] = df[METRIC].astype('float')
            r = mann_whitney_u_test(df,
                                    metric=METRIC,
                                    control='Control',
                                    test='Test',
                                    test_flag='test_flag',
                                    alpha=0.05)
            _show_test_result(r)
# pip install pycausalimpact import numpy as np import pandas as pd from statsmodels.tsa.arima_process import ArmaProcess from causalimpact import CausalImpact # Generate random sample np.random.seed(0) ar = np.r_[1, 0.9] ma = np.array([1]) arma_process = ArmaProcess(ar, ma) X = 50 + arma_process.generate_sample(nsample=1000) y = 1.6 * X + np.random.normal(size=1000) # There is a change starting from index 800 y[800:] += 10 data = pd.DataFrame({'y': y, 'X': X}, columns=['y', 'X']) pre_period = [0, 799] post_period = [800, 999] ci = CausalImpact(data, pre_period, post_period) print(ci.summary()) print(ci.summary(output='report')) ci.plot()