Example #1
0
    def get_confidence_interval_of_test(self,
                                        kpi='CVR',
                                        segment=None,
                                        segment_column='segment',
                                        date=None):
        """
        Return the confidence interval of the test as a dict.

        The pooled standard error follows the "difference between means"
        recipe from
        http://onlinestatbook.com/2/estimation/difference_means.html

        :param   kpi: the KPI that should be used
        :type    kpi: str
        :param   segment: the segment that should be used
        :type    segment: str
        :param   segment_column: the column name that contains the segment information
        :type    segment_column: str
        :param   date: if date is given (format '%Y%m%d') then the check will happen up to that date
        :type    date: string ('%Y%m%d')
        :return: dict with the keys ``lower_limit`` and ``upper_limit``
        :rtype:  dict
        :raises ValueError: if ``kpi`` is not one of the experiment KPIs
        """
        valid_kpis = self.get_expirement_kpis()
        if kpi not in valid_kpis:
            # str.format (not format_map): the placeholder is positional, and
            # format_map rejects positional fields with an unrelated ValueError.
            raise ValueError(
                "Please use a valid KPI. this can be one of the followings: {}"
                .format(valid_kpis))

        # Restrict the data to rows up to (and including) ``date`` if given.
        data = self.data
        if date is not None:
            data = data[data[self.date_column] <= date]

        df_summary = get_test_summary(
            data,
            kpi=kpi,
            segment=segment,
            segment_column=segment_column,
            variations_column=self.variations.get_column_name())

        variation = self.variations.variation_label
        control = self.variations.control_label

        rate_variation = df_summary['rate'][variation]
        rate_control = df_summary['rate'][control]
        n_variation = df_summary['total'][variation]
        n_control = df_summary['total'][control]

        z = get_z_val(sig_level=self.significance_level,
                      two_tailed=self.alternative == 'two-sided')

        std_variation = get_standard_deviation(rate_variation)
        std_control = get_standard_deviation(rate_control)

        # Pooled standard deviation of both groups, weighted by degrees of
        # freedom, then the standard error of the difference between the means.
        pooled_std = np.sqrt(
            ((n_variation - 1) * std_variation ** 2 +
             (n_control - 1) * std_control ** 2) /
            (n_variation + n_control - 2))
        standard_error = pooled_std * np.sqrt(1 / n_variation + 1 / n_control)

        # NOTE(review): the interval is centred on the *relative* uplift while
        # the standard error above is derived from the rates themselves --
        # confirm that mixing the two scales is intended.
        uplift = self.get_relative_conversion_uplift(
            kpi=kpi, segment=segment, segment_column=segment_column, date=date)

        margin = z * standard_error
        return {
            "lower_limit": uplift - margin,
            "upper_limit": uplift + margin
        }
Example #2
0
    def get_p_val(self,
                  kpi='CVR',
                  segment=None,
                  segment_column='segment',
                  date=None):
        """Calculate the z-score and p-value for a given dataset and KPI.

        The KPI counts and the totals of variation and control (in that
        order) are passed to ``sm.stats.proportions_ztest`` together with
        ``self.alternative``.

        :param   kpi: the KPI that should be used
        :type    kpi: str
        :param   segment: the segment that should be used
        :type    segment: str
        :param   segment_column: the column name that contains the segment information
        :type    segment_column: str
        :param   date: if date is given (format '%Y%m%d') then the check will happen up to that date
        :type    date: string ('%Y%m%d')
        :return: dict with the keys ``z-score`` and ``p-value``
        :rtype:  dict
        :raises ValueError: if ``kpi`` is not one of the experiment KPIs
        """
        valid_kpis = self.get_expirement_kpis()
        if kpi not in valid_kpis:
            # str.format (not format_map): the placeholder is positional, and
            # format_map rejects positional fields with an unrelated ValueError.
            raise ValueError(
                "Please use a valid KPI. this can be one of the followings: {}"
                .format(valid_kpis))

        # Restrict the data to rows up to (and including) ``date`` if given.
        data = self.data
        if date is not None:
            data = data[data[self.date_column] <= date]

        df_summary = get_test_summary(
            data,
            kpi=kpi,
            segment=segment,
            segment_column=segment_column,
            variations_column=self.variations.get_column_name())

        variation = self.variations.variation_label
        control = self.variations.control_label

        counts = [df_summary[kpi][variation], df_summary[kpi][control]]
        totals = [df_summary['total'][variation], df_summary['total'][control]]

        zscore, pval = sm.stats.proportions_ztest(
            counts, totals, alternative=self.alternative)

        return {"z-score": zscore, 'p-value': pval}
Example #3
0
    def get_standard_errors_of_test(self,
                                    kpi='CVR',
                                    segment=None,
                                    segment_column='segment',
                                    date=None):
        """
        Calculate the standard error for control and for variation.

        Each standard error is obtained by passing the group's ``rate`` and
        ``total`` from the test summary to ``get_standard_error``.

        :param   kpi: the KPI that should be used
        :type    kpi: str
        :param   segment: the segment that should be used
        :type    segment: str
        :param   segment_column: the column name that contains the segment information
        :type    segment_column: str
        :param   date: if date is given (format '%Y%m%d') then the check will happen up to that date
        :type    date: string ('%Y%m%d')
        :return: dict with the keys ``control_standard_error`` and
                 ``variation_standard_error``
        :rtype:  dict
        :raises ValueError: if ``kpi`` is not one of the experiment KPIs
        """
        valid_kpis = self.get_expirement_kpis()
        if kpi not in valid_kpis:
            # str.format (not format_map): the placeholder is positional, and
            # format_map rejects positional fields with an unrelated ValueError.
            raise ValueError(
                "Please use a valid KPI. this can be one of the followings: {}"
                .format(valid_kpis))

        # Restrict the data to rows up to (and including) ``date`` if given.
        data = self.data
        if date is not None:
            data = data[data[self.date_column] <= date]

        df_summary = get_test_summary(
            data,
            kpi=kpi,
            segment=segment,
            segment_column=segment_column,
            variations_column=self.variations.get_column_name())

        control = self.variations.control_label
        variation = self.variations.variation_label

        # Each key is computed from its own group's row; previously the two
        # labels were crossed, so every key reported the other group's error.
        return {
            "control_standard_error":
            get_standard_error(df_summary['rate'][control],
                               df_summary['total'][control]),
            "variation_standard_error":
            get_standard_error(df_summary['rate'][variation],
                               df_summary['total'][variation])
        }
Example #4
0
def test_get_segments_sample_size():
    """Check that the CVR summary of generated data exposes a control entry."""
    generated = generate_random_cvr_data(
        1000,
        0.3,
        0.5,
        days=10,
        control_label='control',
        variation_label='variation',
    )
    summary = get_test_summary(generated, 'CVR')
    control_value = summary['CVR'].control
    assert control_value is not None
Example #5
0
def test_get_test_summary_with_segment():
    """Check that the CVR summary for segment 'new' has a non-NaN control value."""
    generated = generate_random_cvr_data(
        1000,
        0.3,
        0.5,
        days=10,
        control_label='control',
        variation_label='variation',
    )
    summary = get_test_summary(generated, 'CVR', segment='new')
    control_value = summary['CVR'].control
    # NaN is the only value that compares unequal to itself, so this
    # self-comparison fails exactly when the control value is NaN.
    assert control_value == control_value
Example #6
0
    def get_relative_conversion_uplift(self,
                                       kpi='CVR',
                                       segment=None,
                                       segment_column='segment',
                                       date=None):
        """Calculate the relative conversion uplift of variation over control.

        The result is ``(variation_rate - control_rate) / control_rate`` using
        the ``rate`` column of the test summary.

        :param   kpi: the KPI that should be used
        :type    kpi: str
        :param   segment: the segment that should be used
        :type    segment: str
        :param   segment_column: the column name that contains the segment information
        :type    segment_column: str
        :param   date: if date is given (format '%Y%m%d') then the check will happen up to that date
        :type    date: string ('%Y%m%d')
        :return: the relative conversion uplift
        :rtype:  float
        :raises ValueError: if ``kpi`` is not one of the experiment KPIs
        """
        valid_kpis = self.get_expirement_kpis()
        if kpi not in valid_kpis:
            # str.format (not format_map): the placeholder is positional, and
            # format_map rejects positional fields with an unrelated ValueError.
            raise ValueError(
                "Please use a valid KPI. this can be one of the followings: {}"
                .format(valid_kpis))

        # Restrict the data to rows up to (and including) ``date`` if given.
        data = self.data
        if date is not None:
            data = data[data[self.date_column] <= date]

        df_summary = get_test_summary(
            data,
            kpi=kpi,
            segment=segment,
            segment_column=segment_column,
            variations_column=self.variations.get_column_name())

        rates = df_summary['rate']
        variation_rate = rates[self.variations.variation_label]
        control_rate = rates[self.variations.control_label]

        return (variation_rate - control_rate) / control_rate
Example #7
0
    def get_summary(self,
                    kpi='CVR',
                    segment=None,
                    segment_column='segment',
                    date=None):
        """Return label, sessions and conversions for variation and control.

        ``sessions`` is taken from the ``total`` column and ``conversions``
        from the ``kpi`` column of the test summary; both are cast to float.

        :param   kpi: the KPI that should be used
        :type    kpi: str
        :param   segment: the segment that should be used
        :type    segment: str
        :param   segment_column: the column name that contains the segment information
        :type    segment_column: str
        :param   date: if date is given (format '%Y%m%d') then the check will happen up to that date
        :type    date: string ('%Y%m%d')
        :return: dict with the keys ``variation`` and ``control``, each a dict
                 with the keys ``label``, ``sessions`` and ``conversions``
        :rtype:  dict
        :raises ValueError: if ``kpi`` is not one of the experiment KPIs
        """
        valid_kpis = self.get_expirement_kpis()
        if kpi not in valid_kpis:
            # str.format (not format_map): the placeholder is positional, and
            # format_map rejects positional fields with an unrelated ValueError.
            raise ValueError(
                "Please use a valid KPI. this can be one of the followings: {}"
                .format(valid_kpis))

        # Restrict the data to rows up to (and including) ``date`` if given.
        data = self.data
        if date is not None:
            data = data[data[self.date_column] <= date]

        df_summary = get_test_summary(
            data,
            kpi=kpi,
            segment=segment,
            segment_column=segment_column,
            variations_column=self.variations.get_column_name())

        variation = self.variations.variation_label
        control = self.variations.control_label

        return {
            'variation': {
                "label": variation,
                "sessions": float(df_summary['total'][variation]),
                'conversions': float(df_summary[kpi][variation])
            },
            'control': {
                "label": control,
                "sessions": float(df_summary['total'][control]),
                'conversions': float(df_summary[kpi][control])
            }
        }