def get_confidence_interval_of_test(self, kpi='CVR', segment=None, segment_column='segment', date=None):
    """Return the confidence interval of the test as a dict.

    The margin follows the "difference between means" recipe from
    http://onlinestatbook.com/2/estimation/difference_means.html:
    a pooled standard deviation is turned into a standard error of the
    difference and scaled by the z value for the configured significance
    level.

    :param kpi: the KPI that should be used
    :type kpi: str
    :param segment: the segment that should be used
    :type segment: str
    :param segment_column: the column name that contains the segment information
    :type segment_column: str
    :param date: if date is given (format '%Y%m%d') then only rows whose
        date column is <= date are taken into account
    :type date: string ('%Y%m%d')
    :return: dict with the keys ``lower_limit`` and ``upper_limit``
    :rtype: dict
    :raises ValueError: if ``kpi`` is not one of the experiment KPIs
    """
    valid_kpis = self.get_expirement_kpis()
    if kpi not in valid_kpis:
        # str.format, not format_map: the template uses a positional field.
        raise ValueError(
            "Please use a valid KPI. this can be one of the followings: {}"
            .format(valid_kpis))

    # Optionally restrict the data to everything up to (and including) date.
    data = self.data
    if date is not None:
        data = data[data[self.date_column] <= date]

    df_summary = get_test_summary(
        data,
        kpi=kpi,
        segment=segment,
        segment_column=segment_column,
        variations_column=self.variations.get_column_name())

    variation_label = self.variations.variation_label
    control_label = self.variations.control_label

    M1 = df_summary['rate'][variation_label]
    M2 = df_summary['rate'][control_label]
    N1 = df_summary['total'][variation_label]
    N2 = df_summary['total'][control_label]

    z = get_z_val(
        sig_level=self.significance_level,
        two_tailed=self.alternative == 'two-sided')

    std1 = get_standard_deviation(M1)
    std2 = get_standard_deviation(M2)

    # Pooled standard deviation of both groups, then the standard error of
    # the difference between the two group means.
    Sm1_m2 = np.sqrt(
        ((N1 - 1) * std1 ** 2 + (N2 - 1) * std2 ** 2) / (N1 + N2 - 2))
    SE1_2 = Sm1_m2 * np.sqrt(1 / N1 + 1 / N2)

    # NOTE(review): the interval is centred on the *relative* uplift while
    # the margin is derived from the standard error of the rates -- confirm
    # that mixing these two scales is intended.
    uplift = self.get_relative_conversion_uplift(
        kpi=kpi, segment=segment, segment_column=segment_column, date=date)

    margin = z * SE1_2
    return {
        "lower_limit": uplift - margin,
        "upper_limit": uplift + margin
    }
def get_p_val(self, kpi='CVR', segment=None, segment_column='segment', date=None):
    """Calculate the z-score and p-value for a given dataset and KPI.

    The KPI counts and the totals of variation and control are taken from
    the test summary and passed to ``sm.stats.proportions_ztest`` together
    with the configured ``self.alternative``.

    :param kpi: the KPI that should be used
    :type kpi: str
    :param segment: the segment that should be used
    :type segment: str
    :param segment_column: the column name that contains the segment information
    :type segment_column: str
    :param date: if date is given (format '%Y%m%d') then only rows whose
        date column is <= date are taken into account
    :type date: string ('%Y%m%d')
    :return: dict with the keys ``z-score`` and ``p-value``
    :rtype: dict
    :raises ValueError: if ``kpi`` is not one of the experiment KPIs
    """
    valid_kpis = self.get_expirement_kpis()
    if kpi not in valid_kpis:
        # str.format, not format_map: the template uses a positional field.
        raise ValueError(
            "Please use a valid KPI. this can be one of the followings: {}"
            .format(valid_kpis))

    # Optionally restrict the data to everything up to (and including) date.
    data = self.data
    if date is not None:
        data = data[data[self.date_column] <= date]

    df_summary = get_test_summary(
        data,
        kpi=kpi,
        segment=segment,
        segment_column=segment_column,
        variations_column=self.variations.get_column_name())

    variation_label = self.variations.variation_label
    control_label = self.variations.control_label

    # Order matters: variation first, control second, in both lists.
    counts = [df_summary[kpi][variation_label], df_summary[kpi][control_label]]
    totals = [df_summary['total'][variation_label], df_summary['total'][control_label]]

    zscore, pval = sm.stats.proportions_ztest(
        counts, totals, alternative=self.alternative)
    return {"z-score": zscore, 'p-value': pval}
def get_standard_errors_of_test(self, kpi='CVR', segment=None, segment_column='segment', date=None):
    """Calculate the standard error for control and variation.

    For each group the rate and the total from the test summary are passed
    to ``get_standard_error``.

    :param kpi: the KPI that should be used
    :type kpi: str
    :param segment: the segment that should be used
    :type segment: str
    :param segment_column: the column name that contains the segment information
    :type segment_column: str
    :param date: if date is given (format '%Y%m%d') then only rows whose
        date column is <= date are taken into account
    :type date: string ('%Y%m%d')
    :return: dict with the keys ``control_standard_error`` and
        ``variation_standard_error``
    :rtype: dict
    :raises ValueError: if ``kpi`` is not one of the experiment KPIs
    """
    valid_kpis = self.get_expirement_kpis()
    if kpi not in valid_kpis:
        # str.format, not format_map: the template uses a positional field.
        raise ValueError(
            "Please use a valid KPI. this can be one of the followings: {}"
            .format(valid_kpis))

    # Optionally restrict the data to everything up to (and including) date.
    data = self.data
    if date is not None:
        data = data[data[self.date_column] <= date]

    df_summary = get_test_summary(
        data,
        kpi=kpi,
        segment=segment,
        segment_column=segment_column,
        variations_column=self.variations.get_column_name())

    control_label = self.variations.control_label
    variation_label = self.variations.variation_label

    # Each key is computed from its own group; previously the two labels
    # were swapped, so the control key carried the variation's error.
    return {
        "control_standard_error": get_standard_error(
            df_summary['rate'][control_label],
            df_summary['total'][control_label]),
        "variation_standard_error": get_standard_error(
            df_summary['rate'][variation_label],
            df_summary['total'][variation_label])
    }
def test_get_segments_sample_size():
    """The CVR summary of generated data exposes a value for the control group."""
    raw_data = generate_random_cvr_data(
        1000, 0.3, 0.5,
        days=10,
        control_label='control',
        variation_label='variation')

    summary = get_test_summary(raw_data, 'CVR')

    control_cvr = summary['CVR'].control
    assert control_cvr is not None
def test_get_test_summary_with_segment():
    """The CVR summary restricted to the 'new' segment has a non-NaN control value."""
    raw_data = generate_random_cvr_data(
        1000, 0.3, 0.5,
        days=10,
        control_label='control',
        variation_label='variation')

    summary = get_test_summary(raw_data, 'CVR', segment='new')

    control_cvr = summary['CVR'].control
    # NaN is the only value that is not equal to itself, so this
    # self-comparison fails exactly when the control value is NaN.
    assert control_cvr == control_cvr
def get_relative_conversion_uplift(self, kpi='CVR', segment=None, segment_column='segment', date=None):
    """Calculate the relative conversion uplift of variation over control.

    The uplift is ``(variation_rate - control_rate) / control_rate``, using
    the ``rate`` column of the test summary.

    :param kpi: the KPI that should be used
    :type kpi: str
    :param segment: the segment that should be used
    :type segment: str
    :param segment_column: the column name that contains the segment information
    :type segment_column: str
    :param date: if date is given (format '%Y%m%d') then only rows whose
        date column is <= date are taken into account
    :type date: string ('%Y%m%d')
    :return: the relative conversion uplift
    :rtype: float
    :raises ValueError: if ``kpi`` is not one of the experiment KPIs
    """
    valid_kpis = self.get_expirement_kpis()
    if kpi not in valid_kpis:
        # str.format, not format_map: the template uses a positional field.
        raise ValueError(
            "Please use a valid KPI. this can be one of the followings: {}"
            .format(valid_kpis))

    # Optionally restrict the data to everything up to (and including) date.
    data = self.data
    if date is not None:
        data = data[data[self.date_column] <= date]

    df_summary = get_test_summary(
        data,
        kpi=kpi,
        segment=segment,
        segment_column=segment_column,
        variations_column=self.variations.get_column_name())

    variation_rate = df_summary['rate'][self.variations.variation_label]
    control_rate = df_summary['rate'][self.variations.control_label]
    return (variation_rate - control_rate) / control_rate
def get_summary(self, kpi='CVR', segment=None, segment_column='segment', date=None):
    """Return sessions and conversions of variation and control for a KPI.

    ``sessions`` is read from the ``total`` column and ``conversions`` from
    the ``kpi`` column of the test summary; both are converted to float.

    :param kpi: the KPI that should be used
    :type kpi: str
    :param segment: the segment that should be used
    :type segment: str
    :param segment_column: the column name that contains the segment information
    :type segment_column: str
    :param date: if date is given (format '%Y%m%d') then only rows whose
        date column is <= date are taken into account
    :type date: string ('%Y%m%d')
    :return: dict with the keys ``variation`` and ``control``, each holding
        a dict with ``label``, ``sessions`` and ``conversions``
    :rtype: dict
    :raises ValueError: if ``kpi`` is not one of the experiment KPIs
    """
    valid_kpis = self.get_expirement_kpis()
    if kpi not in valid_kpis:
        # str.format, not format_map: the template uses a positional field.
        raise ValueError(
            "Please use a valid KPI. this can be one of the followings: {}"
            .format(valid_kpis))

    # Optionally restrict the data to everything up to (and including) date.
    data = self.data
    if date is not None:
        data = data[data[self.date_column] <= date]

    df_summary = get_test_summary(
        data,
        kpi=kpi,
        segment=segment,
        segment_column=segment_column,
        variations_column=self.variations.get_column_name())

    variation_label = self.variations.variation_label
    control_label = self.variations.control_label

    return {
        'variation': {
            "label": variation_label,
            "sessions": float(df_summary['total'][variation_label]),
            'conversions': float(df_summary[kpi][variation_label])
        },
        'control': {
            "label": control_label,
            "sessions": float(df_summary['total'][control_label]),
            'conversions': float(df_summary[kpi][control_label])
        }
    }