Example #1
    def test_proptest(self):
        # equality of k-samples
        pt = smprop.proportions_chisquare(self.n_success,
                                          self.nobs,
                                          value=None)
        assert_almost_equal(pt[0], self.res_prop_test.statistic, decimal=13)
        assert_almost_equal(pt[1], self.res_prop_test.p_value, decimal=13)

        # several against value
        pt = smprop.proportions_chisquare(
            self.n_success,
            self.nobs,
            value=self.res_prop_test_val.null_value[0])
        assert_almost_equal(pt[0],
                            self.res_prop_test_val.statistic,
                            decimal=13)
        assert_almost_equal(pt[1], self.res_prop_test_val.p_value, decimal=13)

        # one proportion against value
        pt = smprop.proportions_chisquare(
            self.n_success[0],
            self.nobs[0],
            value=self.res_prop_test_1.null_value)
        assert_almost_equal(pt[0], self.res_prop_test_1.statistic, decimal=13)
        assert_almost_equal(pt[1], self.res_prop_test_1.p_value, decimal=13)
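
This test exercises all three return values; as a reminder, proportions_chisquare returns the chi-square statistic, the p-value, and (in recent statsmodels versions) a (table, expected) pair. A minimal hedged sketch with made-up counts:

import numpy as np
import statsmodels.stats.proportion as smprop

# Hypothetical k-sample counts: test equality of three proportions.
chi2, pvalue, (table, expected) = smprop.proportions_chisquare(
    np.array([18, 22, 28]), np.array([50, 50, 50]))
print(chi2, pvalue)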
Example #2
def test_proportion_ztests():
    # currently only consistency test with proportions chisquare
    # Note: alternative handling is generic

    res1 = smprop.proportions_ztest(15, 20., value=0.5, prop_var=0.5)
    res2 = smprop.proportions_chisquare(15, 20., value=0.5)
    assert_almost_equal(res1[1], res2[1], decimal=13)

    res1 = smprop.proportions_ztest(np.asarray([15, 10]),
                                    np.asarray([20., 20]),
                                    value=0,
                                    prop_var=None)
    res2 = smprop.proportions_chisquare(np.asarray([15, 10]),
                                        np.asarray([20., 20]))
    # test only p-value
    assert_almost_equal(res1[1], res2[1], decimal=13)

    # test with integers, issue #7603
    res1 = smprop.proportions_ztest(np.asarray([15, 10]),
                                    np.asarray([20, 50000]),
                                    value=0,
                                    prop_var=None)
    res2 = smprop.proportions_chisquare(np.asarray([15, 10]),
                                        np.asarray([20, 50000]))
    # test only p-value
    assert_almost_equal(res1[1], res2[1], decimal=13)
    assert_array_less(0, res2[-1][1])  # expected should be positive
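
The consistency asserted here is the standard identity between the pooled two-sample z statistic and the Pearson chi-square: chi2 equals z squared, so the p-values agree. A minimal hedged sketch:

import numpy as np
import statsmodels.stats.proportion as smprop

count, nobs = np.array([15, 10]), np.array([20., 20])
z, p_z = smprop.proportions_ztest(count, nobs, value=0, prop_var=None)
chi2, p_chi2, _ = smprop.proportions_chisquare(count, nobs)
assert np.allclose(z ** 2, chi2) and np.allclose(p_z, p_chi2)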
Example #3
def test_proportion_ztests():
    # currently only consistency test with proportions chisquare
    # Note: alternative handling is generic

    res1 = smprop.proportions_ztest(15, 20., value=0.5, prop_var=0.5)
    res2 = smprop.proportions_chisquare(15, 20., value=0.5)
    assert_almost_equal(res1[1], res2[1], decimal=13)

    res1 = smprop.proportions_ztest(np.asarray([15, 10]), np.asarray([20., 20]),
                                 value=0, prop_var=None)
    res2 = smprop.proportions_chisquare(np.asarray([15, 10]), np.asarray([20., 20]))
    # test only p-value
    assert_almost_equal(res1[1], res2[1], decimal=13)
Example #5
    def quote(self, client_callback):
        if len(self.quote_history) > 100:
            mid = np.mean(self.quote_history[-100:])
        else:
            mid = (self.curr_mid_bounds[0] + self.curr_mid_bounds[1]) / 2.
        bidaskspread = self.curr_mid_bounds[1] - self.curr_mid_bounds[0]
        self.curr_mid_bounds[1] = mid + bidaskspread / 2
        self.curr_mid_bounds[0] = mid - bidaskspread / 2
        skew = ((self.curr_mid_bounds[1] - self.curr_mid_bounds[0]) / 2.
                * np.tanh(-self.inventory / self.MAX_INVENTORY))
        client_mid = mid + skew
        order = client_callback(client_mid - self.BID_ASK_SPREAD / 2.,
                                client_mid + self.BID_ASK_SPREAD / 2.)
        self.quote_history.append(client_mid)
        self.broker_mid.append(mid)
        self.order_history.append(order)
        self.curr_mid_stats.add_order(order, client_mid)
        self.inventory += order
        self.cashflow_history.append(-client_mid * order
                                     + self.BID_ASK_SPREAD / 2. * np.abs(order))

        pbuy = float(self.curr_mid_stats.buy) / self.curr_mid_stats.total_orders
        psell = float(self.curr_mid_stats.sell) / self.curr_mid_stats.total_orders
        pval = proportions_chisquare(
            np.array([self.curr_mid_stats.buy, self.curr_mid_stats.sell]),
            self.curr_mid_stats.total_orders,
            np.array([(pbuy+psell)/2, (pbuy+psell)/2]))[1]
        if pval < self.significance_level:
            ave_mid = self.curr_mid_stats.ave_mid
            if self.curr_mid_stats.buy < self.curr_mid_stats.sell:
                self.curr_mid_bounds[1] += (ave_mid - self.curr_mid_bounds[0]) / 2.
                self.curr_mid_bounds[0] += (ave_mid - self.curr_mid_bounds[0]) / 2.
            else:
                self.curr_mid_bounds[0] -= (self.curr_mid_bounds[1] - ave_mid) / 2.
                self.curr_mid_bounds[1] -= (self.curr_mid_bounds[1] - ave_mid) / 2.
            self.curr_mid_stats = Broker.OrderStats()
            print('ave mid: %f, bound %f %f' % ((ave_mid,) + tuple(self.curr_mid_bounds)))
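
The significance gate above asks whether recent order flow is skewed toward buys or sells; a standalone hedged sketch of just that test (the counts are made up):

import numpy as np
from statsmodels.stats.proportion import proportions_chisquare

buys, sells, total = 62, 38, 100  # hypothetical buy/sell order counts
pbuy, psell = buys / total, sells / total
# Test both rates against their common mean, as the quote() method does.
_, pval, _ = proportions_chisquare(
    np.array([buys, sells]), total,
    np.array([(pbuy + psell) / 2, (pbuy + psell) / 2]))
if pval < 0.05:
    print('order flow is significantly skewed; recenter the mid bounds')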
Example #6
def test_ztest_ztost():
    # compare weightstats with separately tested proportion ztest ztost
    import statsmodels.stats.proportion as smprop

    x1 = [0, 1]
    w1 = [5, 15]

    res2 = smprop.proportions_ztest(15, 20., value=0.5)
    d1 = DescrStatsW(x1, w1)
    res1 = d1.ztest_mean(0.5)
    assert_allclose(res1, res2, rtol=0.03, atol=0.003)

    d2 = DescrStatsW(x1, np.array(w1)*21./20)
    res1 = d2.ztest_mean(0.5)
    assert_almost_equal(res1, res2, decimal=12)

    res1 = d2.ztost_mean(0.4, 0.6)
    res2 = smprop.proportions_ztost(15, 20., 0.4, 0.6)
    assert_almost_equal(res1[0], res2[0], decimal=12)

    x2 = [0, 1]
    w2 = [10, 10]
    # d2 = DescrStatsW(x1, np.array(w1)*21./20)
    d2 = DescrStatsW(x2, w2)
    res1 = ztest(d1.asrepeats(), d2.asrepeats())
    res2 = smprop.proportions_chisquare(np.asarray([15, 10]),
                                        np.asarray([20., 20]))
    # TODO: check whether this difference is expected; see test_proportion
    assert_allclose(res1[1], res2[1], rtol=0.03)

    res1a = CompareMeans(d1, d2).ztest_ind()
    assert_allclose(res1a[1], res2[1], rtol=0.03)
    assert_almost_equal(res1a, res1, decimal=12)
Example #7
    def _p_value(self, row):
        _, p_value, _ = proportions_chisquare(
            count=[row[self._numerator + SFX1], row[self._numerator + SFX2]],
            nobs=[row[self._denominator + SFX1],
                  row[self._denominator + SFX2]])
        return p_value
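
A hedged standalone sketch of the same row-wise computation (the column-suffix layout and the helper name are assumptions):

import pandas as pd
from statsmodels.stats.proportion import proportions_chisquare

SFX1, SFX2 = '_1', '_2'  # hypothetical column suffixes

def p_value(row, numerator='success', denominator='total'):
    _, p, _ = proportions_chisquare(
        count=[row[numerator + SFX1], row[numerator + SFX2]],
        nobs=[row[denominator + SFX1], row[denominator + SFX2]])
    return p

df = pd.DataFrame({'success_1': [120], 'total_1': [1000],
                   'success_2': [150], 'total_2': [1000]})
df['p_value'] = df.apply(p_value, axis=1)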
Example #9
    def simulate(self):
        """This simulation assumes that we are testing for an `effect` in a single
        experiment.

        Returns
        -------
        chi2 : float
            The chi2 statistic from chi2 test in StatsModels
        p_value : float [0, 1]
            The p-value from chi2 test in StatsModels
        effect_point_estimates : list of float
            The effect size point estimates observed in the treatment groups
        """
        observations = [
            np.random.binomial(1, self.natural_rate + absolute_effect,
                               int(self.sample_size *
                                   self.test_splits[i])).sum()
            for i, absolute_effect in enumerate(self.absolute_effects)
        ]

        # built-in round() does not accept an ndarray; use np.round instead
        effect_point_estimates = np.round(
            np.array(observations) / (self.test_splits * self.sample_size) -
            self.natural_rate, 4)
        chi2, p_value, _ = proportions_chisquare(
            observations, (self.test_splits * self.sample_size).astype(int))
        return chi2, p_value, effect_point_estimates
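
A hedged standalone version of the same simulation (all parameter values are assumptions), useful for checking the pieces outside the class:

import numpy as np
from statsmodels.stats.proportion import proportions_chisquare

rng = np.random.default_rng(0)
sample_size = 10_000
test_splits = np.array([0.5, 0.5])        # hypothetical traffic split
natural_rate = 0.10                       # hypothetical baseline rate
absolute_effects = np.array([0.0, 0.02])  # lift applied to each group

observations = [
    rng.binomial(1, natural_rate + eff, int(sample_size * split)).sum()
    for eff, split in zip(absolute_effects, test_splits)
]
chi2, p_value, _ = proportions_chisquare(
    observations, (test_splits * sample_size).astype(int))
print(chi2, p_value)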
Example #10
    def test_proptest(self):
        # equality of k-samples
        pt = smprop.proportions_chisquare(self.n_success, self.nobs, value=None)
        assert_almost_equal(pt[0], self.res_prop_test.statistic, decimal=13)
        assert_almost_equal(pt[1], self.res_prop_test.p_value, decimal=13)

        # several against value
        pt = smprop.proportions_chisquare(
            self.n_success, self.nobs,
            value=self.res_prop_test_val.null_value[0])
        assert_almost_equal(pt[0], self.res_prop_test_val.statistic, decimal=13)
        assert_almost_equal(pt[1], self.res_prop_test_val.p_value, decimal=13)

        # one proportion against value
        pt = smprop.proportions_chisquare(
            self.n_success[0], self.nobs[0],
            value=self.res_prop_test_1.null_value)
        assert_almost_equal(pt[0], self.res_prop_test_1.statistic, decimal=13)
        assert_almost_equal(pt[1], self.res_prop_test_1.p_value, decimal=13)
Example #11
def ztest(data: pd.DataFrame,
          factors: np.ndarray,
          levels: np.ndarray,
          y: str,
          name: str,
          alpha=0.05) -> dict:
    '''
    For each factor in factors, tests whether the proportion of successes in
    the dependent variable differs across the factor's levels, using a
    two-proportion z-test (two levels) or a chi-square test of proportions
    (three or more levels).

    Parameters
    ----------
    data (pandas.DataFrame): df containing the data of the experiment.
    factors (numpy.ndarray): the list of independent variables.
    levels (numpy.ndarray): the matrix of factor x level. Each row represents a
    factor and each element in a row represents a level.
    y (str): the name of the binary dependent variable.
    name (str): the version of the experiment.
    alpha (float): ignored.

    Returns
    -------
    result_dict (dict): keys are factors and values are the results of the
    statistical tests. The "Experiment" key is used as an index.
    '''
    result_dict = {'Experiment': name.replace('_', ' ').title()}
    filtered = data[data[y] == 1]

    for factor, factor_levels in zip(factors, levels):
        count = []
        nobs = []
        base_data = data
        base_filtered = filtered
        for i in range(1, len(factor_levels)):
            base_data = base_data[base_data[factor + '_' +
                                            factor_levels[i]] == 0]
            base_filtered = base_filtered[base_filtered[factor + '_' +
                                                        factor_levels[i]] == 0]
            count.append(
                len(filtered[filtered[factor + '_' + factor_levels[i]] == 1]))
            nobs.append(len(data[data[factor + '_' + factor_levels[i]] == 1]))
        count.append(len(base_filtered))
        nobs.append(len(base_data))

        if len(count) > 2:
            result = proportions_chisquare(np.array(count), np.array(nobs))
        else:
            result = proportions_ztest(np.array(count), np.array(nobs))

        p = p_to_string(result[1])
        result_dict[factor] = str(round(result[0], 2)) + ' ' + p
    return result_dict
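
A hedged usage sketch (the one-hot column layout and the stand-in for the undefined p_to_string helper are assumptions):

import numpy as np
import pandas as pd

def p_to_string(p):  # stand-in for the helper the snippet assumes
    return f"(p={p:.3f})"

# Toy one-hot design: factor "color" with levels blue/red (blue is the base level).
df = pd.DataFrame({
    "color_red": [1, 1, 0, 0, 1, 0, 0, 1],
    "converted": [1, 0, 0, 1, 1, 0, 1, 0],
})
factors = np.array(["color"])
levels = np.array([["blue", "red"]])
result = ztest(df, factors, levels, y="converted", name="toy_experiment")
print(result)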
Example #12
def get_statistical_results_number_of_review(trait, reviews, authors_higher,
                                             authors_lower):
    all_data = []
    all_reviews_for_group_higher = [[author, anime, grade]
                                    for author, anime, grade in reviews
                                    if author in authors_higher]
    all_reviews_for_group_lower = [[author, anime, grade]
                                   for author, anime, grade in reviews
                                   if author in authors_lower]
    number_of_all_reviews_higher = len(all_reviews_for_group_higher)
    number_of_all_reviews_lower = len(all_reviews_for_group_lower)
    for genre in all_genres:
        reviews_for_genre_higher = [
            [author, anime, grade]
            for author, anime, grade in all_reviews_for_group_higher
            if genre in anime_genres[str(anime)]
        ]
        reviews_for_genre_lower = [
            [author, anime, grade]
            for author, anime, grade in all_reviews_for_group_lower
            if genre in anime_genres[str(anime)]
        ]
        ratio_higher = (len(reviews_for_genre_higher) /
                        number_of_all_reviews_higher) * 100
        ratio_lower = (len(reviews_for_genre_lower) /
                       number_of_all_reviews_lower) * 100
        diff_ratio = ratio_higher - ratio_lower
        chisq, pvalue, table = proportion.proportions_chisquare(
            [len(reviews_for_genre_higher),
             len(reviews_for_genre_lower)],
            [number_of_all_reviews_higher, number_of_all_reviews_lower],
        )
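        # note: chisq / N is the phi-squared effect size, not statistical power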
        power = chisq / (number_of_all_reviews_higher +
                         number_of_all_reviews_lower)
        all_data.append([
            genre,
            number_of_all_reviews_higher,
            len(reviews_for_genre_higher),
            number_of_all_reviews_lower,
            len(reviews_for_genre_lower),
            ratio_higher,
            ratio_lower,
            diff_ratio,
            chisq,
            pvalue,
            power,
            trait,
        ])
    return all_data
Example #13
def calc_chisquared_pvalue(group_counts: Sequence[int],
                           converter_counts: Sequence[int]) -> np.float64:
    """Performs the Chi-squared statistical test of proportions.

  Args:
    group_counts: Sequence of total user counts in test and control groups. Two
      or more groups should be used.
    converter_counts: Sequence of number of converters in each group specified
      in the group_counts.

  Returns:
    p-value from the test.
  """
    assert len(group_counts) >= 2, 'Two or more goups should be used.'
    assert len(group_counts) == len(converter_counts), (
        'group_counts and converter_counts should have the same length.')

    _, p_val, _ = proportion.proportions_chisquare(count=converter_counts,
                                                   nobs=group_counts)
    return p_val
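
A quick hedged usage sketch with made-up counts:

# Hypothetical counts: 10,000 users per group, 120 vs. 150 converters.
p = calc_chisquared_pvalue(group_counts=[10000, 10000],
                           converter_counts=[120, 150])
print(f'p-value: {p:.4f}')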
Example #14
'''
Hypothesis Testing in Python
'''
# Binomial test
import numpy as np
from scipy.stats import binomtest  # binom_test was removed in SciPy 1.12

n_lcd = 893; x_lcd = 39; p_lcd = 0.04
binomtest(k=x_lcd, n=n_lcd, p=p_lcd)

# Proportion test
Male_Admit = [512, 353, 120, 138, 53, 22]
Male_Applied = [825, 560, 325, 417, 191, 373]
Female_Prop = np.array([0.824, 0.680, 0.341, 0.349, 0.239, 0.070])
from statsmodels.stats.proportion import proportions_chisquare
proportions_chisquare(count=Male_Admit, nobs=Male_Applied, value=Female_Prop)

# Normal test: one-sample z-test
def z_test(x, mu, sd, alternative):
    from scipy import stats
    xmean = x.mean()
    n = len(x)
    ztest = np.sqrt(n) * (xmean - mu) / sd
    if alternative == "less":
        pvalue = stats.norm.cdf(ztest)
    elif alternative == "greater":
        pvalue = 1 - stats.norm.cdf(ztest)
    elif alternative == "two.sided":
        # use |z| so the two-sided p-value stays in [0, 1] for negative z
        pvalue = 2 * (1 - stats.norm.cdf(abs(ztest)))
    print("The z-test value is:", ztest)
    print("The p-value is:", pvalue)
	
Example #15
from statsmodels.stats.proportion import proportion_effectsize, proportions_chisquare
from statsmodels.stats.power import NormalIndPower

## Question:
## Do A and B versions of the website give the website the same
## conversion rate? What is the significance level?
###################################################################################

## Assign some variables
visits_per_group = 298234/2
success_A = 8365
success_B = 8604
conversion_rate_A = success_A / visits_per_group
conversion_rate_B = success_B / visits_per_group

## Perform a proportional hypothesis test
## H0 (null hypothesis): conversion rates of version A and B are the same
## H1 (alternative hypothesis): conversion rates of version A and B are different
(chi2, chi2_p_value, expected) = proportions_chisquare(count=[success_A, success_B],
                                                  nobs=[visits_per_group, visits_per_group])
print("chi2: %f \t p_value: %f" % (chi2, chi2_p_value))

## Examine the output of the proportions_chisquare call above.
## If the target p-value is 0.05, what is your conclusion? Do you accept or reject H0?

## Note that this test only tells you whether A & B have different conversion rates, not
## which is larger. In this case, since A & B had the same number of visits, this is easy to 
## determine. However, if you only showed B to 10% of your visitors, you may want to use a
## one-sided test instead.

## Your team also wants to know the "power" of the above result: if H1 is true,
## what is the probability that we still accept H0 (a Type II error)? Power is one
## minus that probability, and can be obtained with NormalIndPower.solve_power
effect_size = proportion_effectsize(prop1=conversion_rate_A, prop2=conversion_rate_B)
proportion_test_power = NormalIndPower().solve_power(effect_size=effect_size,
                                                     nobs1=visits_per_group,
                                                     alpha=0.05)
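
A hedged follow-on sketch: the same power machinery can be inverted to ask how many visits per group a target power would require (the 0.8 target is an assumption):

# Solve for nobs1 by leaving it unspecified; effect_size comes from above.
needed_per_group = NormalIndPower().solve_power(effect_size=effect_size,
                                                alpha=0.05, power=0.8)
print('visits needed per group: %.0f' % needed_per_group)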
Example #16
from statsmodels.stats.proportion import proportion_effectsize, proportions_chisquare
from statsmodels.stats.power import NormalIndPower

## Do A and B versions of the website give the website the same
## conversion rate? What is the significance level?
###################################################################################

## Assign some variables
visits_per_group = 298234 / 2
success_A = 8365
success_B = 8604
conversion_rate_A = success_A / visits_per_group
conversion_rate_B = success_B / visits_per_group

## Perform a proportional hypothesis test
## H0 (null hypothesis): conversion rates of version A and B are the same
## H1 (alternative hypothesis): conversion rates of version A and B are different
(chi2, chi2_p_value,
 expected) = proportions_chisquare(count=[success_A, success_B],
                                   nobs=[visits_per_group, visits_per_group])
print("chi2: %f \t p_value: %f" % (chi2, chi2_p_value))

## Examine the output of the proportions_chisquare call above.
## If the target p-value is 0.05, what is your conclusion? Do you accept or reject H0?

## Note that this test only tells you whether A & B have different conversion rates, not
## which is larger. In this case, since A & B had the same number of visits, this is easy to
## determine. However, if you only showed B to 10% of your visitors, you may want to use a
## one-sided test instead.

## Your team also wants to know the "power" of the above result: if H1 is true,
## what is the probability that we still accept H0 (a Type II error)? Power is one
## minus that probability, and can be obtained with NormalIndPower.solve_power
effect_size = proportion_effectsize(prop1=conversion_rate_A,
                                    prop2=conversion_rate_B)
proportion_test_power = NormalIndPower().solve_power(effect_size=effect_size,
                                                     nobs1=visits_per_group,
                                                     alpha=0.05)
Example #17
    def run(self):
        subject = self.pipeline.subject
        task = self.pipeline.task

        events = self.get_passed_object(task + '_events')
        math_events = self.get_passed_object(task + '_math_events')
        intr_events = self.get_passed_object(task + '_intr_events')
        rec_events = self.get_passed_object(task + '_rec_events')
        all_events = self.get_passed_object(task + '_all_events')
        channels = self.get_passed_object('channels')
        tal_info = self.get_passed_object('tal_info')

        sessions = np.unique(events.session)

        self.pass_object('NUMBER_OF_SESSIONS', len(sessions))
        self.pass_object('NUMBER_OF_ELECTRODES', len(channels))

        session_data = []
        session_summary_array = []

        positions = np.unique(events.serialpos)
        first_recall_counter = np.zeros(positions.size, dtype=int)
        total_list_counter = 0

        irt_within_cat = []
        irt_between_cat = []

        cumulative_n_items_from_stim = 0
        cumulative_n_recalls_from_stim = 0
        cumulative_n_intr_from_stim = 0

        cumulative_n_items_from_nonstim = 0
        cumulative_n_recalls_from_nonstim = 0
        cumulative_n_intr_from_nonstim = 0

        for session in sessions:
            session_summary = SessionSummary()

            session_events = events[events.session == session]
            n_sess_events = len(session_events)

            session_rec_events = rec_events[rec_events.session == session]

            session_all_events = all_events[all_events.session == session]
            timestamps = session_all_events.mstime
            first_time_stamp = np.min(timestamps)
            last_time_stamp = np.max(timestamps)
            session_length = '%.2f' % ((last_time_stamp - first_time_stamp) / 60000.0)
            session_date = time.strftime('%d-%b-%Y', time.localtime(last_time_stamp/1000))

            session_data.append([session, session_date, session_length])

            session_name = 'Sess%02d' % session

            print('Session =', session_name)

            session_summary.number = session
            session_summary.name = session_name
            session_summary.length = session_length
            session_summary.date = session_date
            session_summary.n_words = len(session_events)
            session_summary.n_correct_words = np.sum(session_events.recalled)
            session_summary.pc_correct_words = 100*session_summary.n_correct_words / float(session_summary.n_words)

            positions = np.unique(session_events.serialpos)
            prob_recall = np.empty_like(positions, dtype=float)
            for i,pos in enumerate(positions):
                pos_events = session_events[session_events.serialpos == pos]
                prob_recall[i] = np.sum(pos_events.recalled) / float(len(pos_events))

            session_summary.prob_recall = prob_recall

            lists = np.unique(session_events.list)
            n_lists = len(lists)
            prob_first_recall = np.zeros(len(positions), dtype=float)
            session_irt_within_cat = []
            session_irt_between_cat = []
            session_summary.n_recalls_per_list = np.zeros(n_lists, dtype=int)
            session_summary.n_intr_per_list = np.zeros(n_lists, dtype=int)
            session_summary.n_stims_per_list = np.zeros(n_lists, dtype=int)
            session_summary.is_stim_list = np.zeros(n_lists, dtype=bool)
            items_per_list = np.zeros(n_lists, dtype=int)

            for lst in lists:
                list_events = session_all_events[session_all_events.list == lst]
                list_rec_events = session_rec_events[(session_rec_events.list == lst) & (session_rec_events.intrusion == 0)]
                list_intr_events = session_rec_events[(session_rec_events.list == lst) & (session_rec_events.intrusion >= 4)]
                list_word_events = list_events[list_events.type=='WORD']

                #session_summary.n_recalls_per_list[lst-1] = len(list_rec_events)
                #session_summary.n_intr_per_list[lst-1] = len(list_intr_events)
                session_summary.n_recalls_per_list[lst-1] = np.sum(list_word_events.recalled)
                session_summary.n_stims_per_list[lst-1] = np.sum(list_events.type=='STIM')
                session_summary.is_stim_list[lst-1] = session_events[session_events.list == lst][0].stimList
                for ie in list_intr_events:
                    session_summary.n_intr_per_list[ie.intrusion-1] += 1

                items_per_list[lst-1] = np.sum(list_events.type=='WORD')

                if list_rec_events.size > 0:
                    list_events = session_events[session_events.list == lst]
                    tmp = np.where(list_events.itemno == list_rec_events[0].itemno)[0]
                    if tmp.size > 0:
                        first_recall_idx = tmp[0]
                        prob_first_recall[first_recall_idx] += 1
                        first_recall_counter[first_recall_idx] += 1
                if task == 'RAM_CatFR3':
                    # list_rec_events = session_rec_events[session_rec_events.list == lst]
                    for i in range(1, len(list_rec_events)):
                        cur_ev = list_rec_events[i]
                        prev_ev = list_rec_events[i-1]
                        # if (cur_ev.intrusion == 0) and (prev_ev.intrusion == 0):
                        dt = cur_ev.mstime - prev_ev.mstime
                        if cur_ev.category == prev_ev.category:
                            session_irt_within_cat.append(dt)
                        else:
                            session_irt_between_cat.append(dt)

            prob_first_recall /= float(n_lists)
            total_list_counter += n_lists

            n_items_from_stim = np.sum(items_per_list[session_summary.is_stim_list])
            n_recalls_from_stim = np.sum(session_summary.n_recalls_per_list[session_summary.is_stim_list])
            n_intr_from_stim = np.sum(session_summary.n_intr_per_list[session_summary.is_stim_list])

            cumulative_n_items_from_stim += n_items_from_stim
            cumulative_n_recalls_from_stim += n_recalls_from_stim
            cumulative_n_intr_from_stim += n_intr_from_stim

            nonstim_list_mask = ~session_summary.is_stim_list
            nonstim_list_mask[0:3] = False
            n_items_from_nonstim = np.sum(items_per_list[nonstim_list_mask])
            n_recalls_from_nonstim = np.sum(session_summary.n_recalls_per_list[nonstim_list_mask])
            n_intr_from_nonstim = np.sum(session_summary.n_intr_per_list[nonstim_list_mask])

            cumulative_n_items_from_nonstim += n_items_from_nonstim
            cumulative_n_recalls_from_nonstim += n_recalls_from_nonstim
            cumulative_n_intr_from_nonstim += n_intr_from_nonstim

            session_summary.n_correct_stim = n_recalls_from_stim
            session_summary.n_total_stim = n_items_from_stim
            session_summary.pc_from_stim = 100 * n_recalls_from_stim / float(n_items_from_stim)

            session_summary.n_correct_nonstim = n_recalls_from_nonstim
            session_summary.n_total_nonstim = n_items_from_nonstim
            session_summary.pc_from_nonstim = 100 * n_recalls_from_nonstim / float(n_items_from_nonstim)

            session_summary.n_stim_intr = n_intr_from_stim
            session_summary.pc_from_stim_intr = 100 * n_intr_from_stim / float(n_items_from_stim)

            session_summary.n_nonstim_intr = n_intr_from_nonstim
            session_summary.pc_from_nonstim_intr = 100 * n_intr_from_nonstim / float(n_items_from_nonstim)

            session_summary.chisqr, session_summary.pvalue, _ = proportions_chisquare(
                [n_recalls_from_stim, n_recalls_from_nonstim],
                [n_items_from_stim, n_items_from_nonstim])
            session_summary.chisqr_intr, session_summary.pvalue_intr, _ = proportions_chisquare(
                [n_intr_from_stim, n_intr_from_nonstim],
                [n_items_from_stim, n_items_from_nonstim])

            session_summary.irt_within_cat = sum(session_irt_within_cat) / len(session_irt_within_cat) if session_irt_within_cat else 0.0
            session_summary.irt_between_cat = sum(session_irt_between_cat) / len(session_irt_between_cat) if session_irt_between_cat else 0.0

            irt_within_cat += session_irt_within_cat
            irt_between_cat += session_irt_between_cat

            session_summary.prob_first_recall = prob_first_recall

            if math_events is not None:
                session_math_events = math_events[math_events.session == session]
                session_summary.n_math = len(session_math_events)
                session_summary.n_correct_math = np.sum(session_math_events.iscorrect)
                session_summary.pc_correct_math = 100*session_summary.n_correct_math / float(session_summary.n_math)
                session_summary.math_per_list = session_summary.n_math / float(n_lists)

            session_intr_events = intr_events[intr_events.session == session]

            session_summary.n_pli = np.sum(session_intr_events.intrusion > 0)
            session_summary.pc_pli = 100*session_summary.n_pli / float(n_sess_events)
            session_summary.n_eli = np.sum(session_intr_events.intrusion == -1)
            session_summary.pc_eli = 100*session_summary.n_eli / float(n_sess_events)

            session_summary_array.append(session_summary)


        self.pass_object('SESSION_DATA', session_data)
        self.pass_object('session_summary_array', session_summary_array)

        cumulative_summary = SessionSummary()
        cumulative_summary.n_words = len(events)
        cumulative_summary.n_correct_words = np.sum(events.recalled)
        cumulative_summary.pc_correct_words = 100*cumulative_summary.n_correct_words / float(cumulative_summary.n_words)

        cumulative_summary.irt_within_cat = sum(irt_within_cat) / len(irt_within_cat) if irt_within_cat else 0.0
        cumulative_summary.irt_between_cat = sum(irt_between_cat) / len(irt_between_cat) if irt_between_cat else 0.0

        positions = np.unique(events.serialpos)
        prob_recall = np.empty_like(positions, dtype=float)
        for i,pos in enumerate(positions):
            pos_events = events[events.serialpos == pos]
            prob_recall[i] = np.sum(pos_events.recalled) / float(len(pos_events))
        cumulative_summary.prob_recall = prob_recall

        prob_first_recall = first_recall_counter / float(total_list_counter)
        cumulative_summary.prob_first_recall = prob_first_recall

        cumulative_summary.n_correct_stim = cumulative_n_recalls_from_stim
        cumulative_summary.n_total_stim = cumulative_n_items_from_stim
        cumulative_summary.pc_from_stim = 100 * cumulative_n_recalls_from_stim / float(cumulative_n_items_from_stim)

        cumulative_summary.n_correct_nonstim = cumulative_n_recalls_from_nonstim
        cumulative_summary.n_total_nonstim = cumulative_n_items_from_nonstim
        cumulative_summary.pc_from_nonstim = 100 * cumulative_n_recalls_from_nonstim / float(cumulative_n_items_from_nonstim)

        cumulative_summary.n_stim_intr = cumulative_n_intr_from_stim
        cumulative_summary.pc_from_stim_intr = 100 * cumulative_n_intr_from_stim / float(cumulative_n_items_from_stim)

        cumulative_summary.n_nonstim_intr = cumulative_n_intr_from_nonstim
        cumulative_summary.pc_from_nonstim_intr = 100 * cumulative_n_intr_from_nonstim / float(cumulative_n_items_from_nonstim)

        cumulative_summary.chisqr, cumulative_summary.pvalue, _ = proportions_chisquare(
            [cumulative_n_recalls_from_stim, cumulative_n_recalls_from_nonstim],
            [cumulative_n_items_from_stim, cumulative_n_items_from_nonstim])
        cumulative_summary.chisqr_intr, cumulative_summary.pvalue_intr, _ = proportions_chisquare(
            [cumulative_n_intr_from_stim, cumulative_n_intr_from_nonstim],
            [cumulative_n_items_from_stim, cumulative_n_items_from_nonstim])

        if math_events is not None:
            cumulative_summary.n_math = len(math_events)
            cumulative_summary.n_correct_math = np.sum(math_events.iscorrect)
            cumulative_summary.pc_correct_math = 100*cumulative_summary.n_correct_math / float(cumulative_summary.n_math)
            cumulative_summary.math_per_list = cumulative_summary.n_math / float(total_list_counter)

        cumulative_summary.n_pli = np.sum(intr_events.intrusion > 0)
        cumulative_summary.pc_pli = 100*cumulative_summary.n_pli / float(len(events))
        cumulative_summary.n_eli = np.sum(intr_events.intrusion == -1)
        cumulative_summary.pc_eli = 100*cumulative_summary.n_eli / float(len(events))

        self.pass_object('cumulative_summary', cumulative_summary)
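
The statistical core of this report step is a two-sample comparison of recall proportions on stim vs. non-stim lists; a hedged standalone sketch with made-up counts:

from statsmodels.stats.proportion import proportions_chisquare

# Hypothetical counts: recalls out of presented items on stim vs. non-stim lists.
n_recalls = [38, 25]
n_items = [120, 110]
chisqr, pvalue, _ = proportions_chisquare(n_recalls, n_items)
print('chi2=%.3f, p=%.4f' % (chisqr, pvalue))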
Example #18
    def run(self):
        subject = self.pipeline.subject
        task = self.pipeline.task

        events = self.get_passed_object(task + '_events')
        math_events = self.get_passed_object(task + '_math_events')
        intr_events = self.get_passed_object(task + '_intr_events')
        rec_events = self.get_passed_object(task + '_rec_events')
        all_events = self.get_passed_object(task + '_all_events')
        channels = self.get_passed_object('monopolar_channels')
        tal_info = self.get_passed_object('bipolar_pairs')

        sessions = np.unique(events.session)

        self.pass_object('NUMBER_OF_SESSIONS', len(sessions))
        self.pass_object('NUMBER_OF_ELECTRODES', len(channels))

        session_data = []
        session_summary_array = []

        positions = np.unique(events.serialpos)
        first_recall_counter = np.zeros(positions.size, dtype=int)
        total_list_counter = 0

        irt_within_cat = []
        irt_between_cat = []

        cumulative_n_items_from_stim = 0
        cumulative_n_recalls_from_stim = 0
        cumulative_n_intr_from_stim = 0

        cumulative_n_items_from_nonstim = 0
        cumulative_n_recalls_from_nonstim = 0
        cumulative_n_intr_from_nonstim = 0

        for session in sessions:
            session_summary = SessionSummary()

            session_events = events[events.session == session]
            n_sess_events = len(session_events)

            session_rec_events = rec_events[rec_events.session == session]

            session_all_events = all_events[all_events.session == session]
            timestamps = session_all_events.mstime
            first_time_stamp = np.min(timestamps)
            last_time_stamp = np.max(timestamps)
            session_length = '%.2f' % ((last_time_stamp - first_time_stamp) / 60000.0)
            session_date = time.strftime('%d-%b-%Y', time.localtime(last_time_stamp/1000))

            session_data.append([session, session_date, session_length])

            session_name = 'Sess%02d' % session

            print('Session =', session_name)

            session_summary.number = session
            session_summary.name = session_name
            session_summary.length = session_length
            session_summary.date = session_date
            session_summary.n_words = len(session_events)
            session_summary.n_correct_words = np.sum(session_events.recalled)
            session_summary.pc_correct_words = 100*session_summary.n_correct_words / float(session_summary.n_words)

            positions = np.unique(session_events.serialpos)
            prob_recall = np.empty_like(positions, dtype=float)
            for i,pos in enumerate(positions):
                pos_events = session_events[session_events.serialpos == pos]
                prob_recall[i] = np.sum(pos_events.recalled) / float(len(pos_events))

            session_summary.prob_recall = prob_recall

            lists = np.unique(session_events.list)
            n_lists = len(lists)
            prob_first_recall = np.zeros(len(positions), dtype=float)
            session_irt_within_cat = []
            session_irt_between_cat = []
            session_summary.n_recalls_per_list = np.zeros(n_lists, dtype=int)
            session_summary.n_intr_per_list = np.zeros(n_lists, dtype=int)
            session_summary.n_stims_per_list = np.zeros(n_lists, dtype=int)
            session_summary.is_stim_list = np.zeros(n_lists, dtype=bool)
            items_per_list = np.zeros(n_lists, dtype=int)

            for lst in lists:
                list_events = session_all_events[session_all_events.list == lst]
                list_rec_events = session_rec_events[(session_rec_events.list == lst) & (session_rec_events.intrusion == 0)]
                list_intr_events = session_rec_events[(session_rec_events.list == lst) & (session_rec_events.intrusion >= 4)]
                list_word_events = list_events[list_events.type=='WORD']

                #session_summary.n_recalls_per_list[lst-1] = len(list_rec_events)
                #session_summary.n_intr_per_list[lst-1] = len(list_intr_events)
                session_summary.n_recalls_per_list[lst-1] = np.sum(list_word_events.recalled)
                session_summary.n_stims_per_list[lst-1] = np.sum(list_events.type=='STIM')
                session_summary.is_stim_list[lst-1] = session_events[session_events.list == lst][0].stimList
                for ie in list_intr_events:
                    session_summary.n_intr_per_list[ie.intrusion-1] += 1

                items_per_list[lst-1] = np.sum(list_events.type=='WORD')

                if list_rec_events.size > 0:
                    list_events = session_events[session_events.list == lst]
                    tmp = np.where(list_events.itemno == list_rec_events[0].itemno)[0]
                    if tmp.size > 0:
                        first_recall_idx = tmp[0]
                        prob_first_recall[first_recall_idx] += 1
                        first_recall_counter[first_recall_idx] += 1
                if task == 'RAM_CatFR3':
                    # list_rec_events = session_rec_events[session_rec_events.list == lst]
                    for i in range(1, len(list_rec_events)):
                        cur_ev = list_rec_events[i]
                        prev_ev = list_rec_events[i-1]
                        # if (cur_ev.intrusion == 0) and (prev_ev.intrusion == 0):
                        dt = cur_ev.mstime - prev_ev.mstime
                        if cur_ev.category == prev_ev.category:
                            session_irt_within_cat.append(dt)
                        else:
                            session_irt_between_cat.append(dt)

            prob_first_recall /= float(n_lists)
            total_list_counter += n_lists

            n_items_from_stim = np.sum(items_per_list[session_summary.is_stim_list])
            n_recalls_from_stim = np.sum(session_summary.n_recalls_per_list[session_summary.is_stim_list])
            n_intr_from_stim = np.sum(session_summary.n_intr_per_list[session_summary.is_stim_list])

            cumulative_n_items_from_stim += n_items_from_stim
            cumulative_n_recalls_from_stim += n_recalls_from_stim
            cumulative_n_intr_from_stim += n_intr_from_stim

            nonstim_list_mask = ~session_summary.is_stim_list
            nonstim_list_mask[0:3] = False
            n_items_from_nonstim = np.sum(items_per_list[nonstim_list_mask])
            n_recalls_from_nonstim = np.sum(session_summary.n_recalls_per_list[nonstim_list_mask])
            n_intr_from_nonstim = np.sum(session_summary.n_intr_per_list[nonstim_list_mask])

            cumulative_n_items_from_nonstim += n_items_from_nonstim
            cumulative_n_recalls_from_nonstim += n_recalls_from_nonstim
            cumulative_n_intr_from_nonstim += n_intr_from_nonstim

            session_summary.n_correct_stim = n_recalls_from_stim
            session_summary.n_total_stim = n_items_from_stim
            session_summary.pc_from_stim = 100 * n_recalls_from_stim / float(n_items_from_stim)

            session_summary.n_correct_nonstim = n_recalls_from_nonstim
            session_summary.n_total_nonstim = n_items_from_nonstim
            session_summary.pc_from_nonstim = 100 * n_recalls_from_nonstim / float(n_items_from_nonstim)

            session_summary.n_stim_intr = n_intr_from_stim
            session_summary.pc_from_stim_intr = 100 * n_intr_from_stim / float(n_items_from_stim)

            session_summary.n_nonstim_intr = n_intr_from_nonstim
            session_summary.pc_from_nonstim_intr = 100 * n_intr_from_nonstim / float(n_items_from_nonstim)

            session_summary.chisqr, session_summary.pvalue, _ = proportions_chisquare(
                [n_recalls_from_stim, n_recalls_from_nonstim],
                [n_items_from_stim, n_items_from_nonstim])
            session_summary.chisqr_intr, session_summary.pvalue_intr, _ = proportions_chisquare(
                [n_intr_from_stim, n_intr_from_nonstim],
                [n_items_from_stim, n_items_from_nonstim])

            session_summary.irt_within_cat = sum(session_irt_within_cat) / len(session_irt_within_cat) if session_irt_within_cat else 0.0
            session_summary.irt_between_cat = sum(session_irt_between_cat) / len(session_irt_between_cat) if session_irt_between_cat else 0.0

            irt_within_cat += session_irt_within_cat
            irt_between_cat += session_irt_between_cat

            session_summary.prob_first_recall = prob_first_recall

            if math_events is not None:
                session_math_events = math_events[math_events.session == session]
                session_summary.n_math = len(session_math_events)
                session_summary.n_correct_math = np.sum(session_math_events.iscorrect)
                session_summary.pc_correct_math = 100*session_summary.n_correct_math / float(session_summary.n_math)
                session_summary.math_per_list = session_summary.n_math / float(n_lists)

            session_intr_events = intr_events[intr_events.session == session]

            session_summary.n_pli = np.sum(session_intr_events.intrusion > 0)
            session_summary.pc_pli = 100*session_summary.n_pli / float(n_sess_events)
            session_summary.n_eli = np.sum(session_intr_events.intrusion == -1)
            session_summary.pc_eli = 100*session_summary.n_eli / float(n_sess_events)

            session_summary_array.append(session_summary)


        self.pass_object('SESSION_DATA', session_data)
        self.pass_object('session_summary_array', session_summary_array)

        cumulative_summary = SessionSummary()
        cumulative_summary.n_words = len(events)
        cumulative_summary.n_correct_words = np.sum(events.recalled)
        cumulative_summary.pc_correct_words = 100*cumulative_summary.n_correct_words / float(cumulative_summary.n_words)

        cumulative_summary.irt_within_cat = sum(irt_within_cat) / len(irt_within_cat) if irt_within_cat else 0.0
        cumulative_summary.irt_between_cat = sum(irt_between_cat) / len(irt_between_cat) if irt_between_cat else 0.0

        positions = np.unique(events.serialpos)
        prob_recall = np.empty_like(positions, dtype=float)
        for i,pos in enumerate(positions):
            pos_events = events[events.serialpos == pos]
            prob_recall[i] = np.sum(pos_events.recalled) / float(len(pos_events))
        cumulative_summary.prob_recall = prob_recall

        prob_first_recall = first_recall_counter / float(total_list_counter)
        cumulative_summary.prob_first_recall = prob_first_recall

        cumulative_summary.n_correct_stim = cumulative_n_recalls_from_stim
        cumulative_summary.n_total_stim = cumulative_n_items_from_stim
        cumulative_summary.pc_from_stim = 100 * cumulative_n_recalls_from_stim / float(cumulative_n_items_from_stim)

        cumulative_summary.n_correct_nonstim = cumulative_n_recalls_from_nonstim
        cumulative_summary.n_total_nonstim = cumulative_n_items_from_nonstim
        cumulative_summary.pc_from_nonstim = 100 * cumulative_n_recalls_from_nonstim / float(cumulative_n_items_from_nonstim)

        cumulative_summary.n_stim_intr = cumulative_n_intr_from_stim
        cumulative_summary.pc_from_stim_intr = 100 * cumulative_n_intr_from_stim / float(cumulative_n_items_from_stim)

        cumulative_summary.n_nonstim_intr = cumulative_n_intr_from_nonstim
        cumulative_summary.pc_from_nonstim_intr = 100 * cumulative_n_intr_from_nonstim / float(cumulative_n_items_from_nonstim)

        cumulative_summary.chisqr, cumulative_summary.pvalue, _ = proportions_chisquare(
            [cumulative_n_recalls_from_stim, cumulative_n_recalls_from_nonstim],
            [cumulative_n_items_from_stim, cumulative_n_items_from_nonstim])
        cumulative_summary.chisqr_intr, cumulative_summary.pvalue_intr, _ = proportions_chisquare(
            [cumulative_n_intr_from_stim, cumulative_n_intr_from_nonstim],
            [cumulative_n_items_from_stim, cumulative_n_items_from_nonstim])

        if math_events is not None:
            cumulative_summary.n_math = len(math_events)
            cumulative_summary.n_correct_math = np.sum(math_events.iscorrect)
            cumulative_summary.pc_correct_math = 100*cumulative_summary.n_correct_math / float(cumulative_summary.n_math)
            cumulative_summary.math_per_list = cumulative_summary.n_math / float(total_list_counter)

        cumulative_summary.n_pli = np.sum(intr_events.intrusion > 0)
        cumulative_summary.pc_pli = 100*cumulative_summary.n_pli / float(len(events))
        cumulative_summary.n_eli = np.sum(intr_events.intrusion == -1)
        cumulative_summary.pc_eli = 100*cumulative_summary.n_eli / float(len(events))

        self.pass_object('cumulative_summary', cumulative_summary)
Example #19
import matplotlib.pyplot as plt
import numpy as np
from statsmodels.stats.proportion import (proportion_confint,
                                          proportions_chisquare,
                                          proportions_ztest)


def multinomial_variable_plot(df,
                              feature,
                              target,
                              ylabel='Mean target',
                              ci_method='wilson',
                              quiet=False):
    """
    Plots a multinomial variable (k=2 or more categories) where the y-axis is
    the mean target for each category. A significance test is performed of the
    null hypothesis that the target proportions in all categories are drawn
    from the same distribution.

    Parameters:
    ----------
    df : pd.DataFrame,
        Dataframe containing data to plot.

    feature : str,
        Name of the column in df to plot.
    
    target : {str, array-like},
        Must be name of column in df containing the binary target or numpy 
        array containing binary target for each instance in df.

    ylabel : str (default='Mean target'),
        Text to label the y-axis with.
        
    ci_method : (default=wilson),
        Passed to statsmodels.stats.proportion.proportion_confint, method used 
        to calculate 95 % confidence intervals.
    
    quiet : bool (default=False),
        Whether to print calculated statistics to screen.
        
    Returns:
    --------
    fig : matplotlib.figure.Figure,
        Figure object.
    
    ax : matplotlib.pyplot.axis,
        Axis plotted to.
    """
    if not isinstance(target, str):
        df['target'] = target
        target = 'target'

    # check target is binary
    if df[target].nunique() != 2:
        raise ValueError('Target must be binary.')

    agg_df = df.groupby(feature)[target].agg(['mean', 'count', 'sum'])
    
    # Calculate confidence intervals and p-value
    nocc = agg_df['sum']    # number of occurrences of the event
    nobs = agg_df['count']  # number of observations
    ci_lower, ci_upper = proportion_confint(nocc, nobs, method=ci_method)
    
    cardinality = df[feature].nunique()
    if cardinality==2:
        # binary variable, perform 2-sided z test
        test_statistic, p = proportions_ztest(nocc,
                                              nobs,
                                              alternative='two-sided')
        statistic_text = f'Two-sided z score: {test_statistic:.4f}'
    else:
        # multinomial variable, perform chi-squared test
        test_statistic, p, _ = proportions_chisquare(nocc, nobs) 
        statistic_text = f'Chi-squared : {test_statistic:.4f}'
        
    if not quiet:
        print('----'*5)
        print('Variable:', feature)
        print('----'*5)
        print(agg_df)
        print('')
        print(statistic_text)
        print(f'p-value: {p:.4f}')
        print('')
        print('ci_lower\n', ci_lower)
        print('')
        print('ci_upper\n', ci_upper)
    
    # create the plot
    fig, ax = plt.subplots()
    yerr = np.vstack([agg_df['mean']-ci_lower, ci_upper-agg_df['mean']])
    ax.bar(agg_df.index,
           agg_df['mean'],
           yerr=yerr,
           color='grey',
           capsize=2)
    
    # format the plot
    ax.set_title(f'{feature.capitalize()}, $p$: {p:.3f}', fontsize=10)
    ax.tick_params(which='both', labelsize=8)
    utils.remove_axis(ax)
    ax.set_ylabel(ylabel, fontsize=9)
    fig.set_size_inches(3.5,2.5)
    return fig, ax
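
A hedged usage sketch with toy data (note the function also assumes a project-local utils.remove_axis helper is importable):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame({
    'channel': rng.choice(['email', 'search', 'social'], size=500),
    'clicked': rng.binomial(1, 0.3, size=500),
})
fig, ax = multinomial_variable_plot(df, feature='channel', target='clicked')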