Esempio n. 1
0
    def calculate_significance(self):
        """Perform statistical tests to check for significant differences.

        The differences between the observed and expected spot distances are
        checked.

        We perform two statistical tests:

        1. The unpaired Wilcoxon rank sum test. We use unpaired because the two
           sets of distances are unrelated (:ref:`Dalgaard <ref-dalgaard>`). In
           other words, a distance n in 'observed' is unrelated to distance n
           in 'expected' (where n is an item number in the lists).

        2. The Chi-squared test for given probabilities
           (:ref:`Millar <ref-millar>`, :ref:`Dalgaard <ref-dalgaard>`). The
           probabilities for all spot distances have been pre-calculated. So
           the observed probabilities are compared with the pre-calculated
           probabilities.

           For the Chi-squared test the expected frequencies should not be
           less than 5 (:ref:`Buijs <ref-buijs>`). If we find an expected
           frequency that is less than 5, the result for this test is not
           saved.

        Based on the results of the tests we can decide which
        hypothesis we can assume to be true.

        Null hypothesis
            The species in question doesn't attract or repel itself.

        Alternative hypothesis
            The species in question attracts (mean observed < mean
            expected) or repels (mean observed > mean expected) itself.

        The decision is based on the p-value calculated by the test:

        P >= alpha level
            Assume that the null hypothesis is true.

        P < alpha level
            Assume that the alternative hypothesis is true.

        The default value for the alpha level is 0.05 (5%). In biology
        we usually assume that differences are significant if P has
        a value less than 5% (:ref:`Millar <ref-millar>`).

        A high number of positive spots on a plate will naturally lead
        to a high p-value (not significant). These plates will
        negatively affect the result of statistical test. To account
        for this, the tests are performed on groups of plates. Instead of
        doing one test on all plates, we group the plates based on the
        number of positive spots they contain.

        Both tests are performed on each group. Plates of group 1 and 25
        are not tested. We skip group 1 because it is not possible to
        calculate spot distances for plates with just one positive spot.
        Plates of group 25 are ignored because this will always result
        in a p-value of 1 as a result of equal observed and expected
        spot distances.

        Both tests are also performed on groups 2-24 taken together.

        Nothing is returned. The results are stored in ``self.statistics``
        under the keys 'wilcoxon_spots', 'wilcoxon_spots_repeats' (attributes
        only) and 'chi_squared_spots'.

        Design Part: 1.24
        """

        # Perform the tests for records that have a specific number of
        # positive spots. The tests are performed separately for each
        # number in the list. A negative number means all records with
        # positive spots up to that number, so -24 covers groups 2-24
        # taken together.
        spot_totals = list(range(2, 25)) + [-24]

        for n_spots in spot_totals:
            # Get both sets of distances from plates per total spot numbers.
            observed = self.db.get_distances_matching_spots_total(
                'spot_distances_observed', n_spots)
            expected = self.db.get_distances_matching_spots_total(
                'spot_distances_expected', n_spots)

            # Iterators cannot be used directly by RPy, so convert them to
            # lists first.
            observed = list(observed)
            expected = list(expected)

            # Get the number of plates found that match the current
            # number of positive spots.
            n_plates = self.db.matching_plates_total

            # Get the lengths.
            count_observed = len(observed)
            count_expected = len(expected)

            # The number of observed and expected spot distances must always
            # be the same.
            assert count_observed == count_expected, \
                "Number of observed and expected values are not equal."

            # A minimum of 2 observed distances is required for the
            # significance test. So skip this spots number if it's less.
            if count_observed < 2:
                continue

            # Calculate the means.
            mean_observed = setlyze.std.mean(observed)
            mean_expected = setlyze.std.mean(expected)

            # Perform the two sample Wilcoxon test.
            test_result = wilcox_test(observed, expected,
                alternative="two.sided", paired=False,
                conf_level=1 - self.alpha_level,
                conf_int=False)

            # Set some test attributes for the report. The attributes are
            # the same for every group, so they are only stored once.
            if not self.statistics['wilcoxon_spots_repeats']['attr']:
                self.statistics['wilcoxon_spots_repeats']['attr'] = {
                    'method': test_result['method'],
                    'alternative': test_result['alternative'],
                    'conf_level': 1 - self.alpha_level,
                    'paired': False,
                    'repeats': self.n_repeats,
                    'groups': 'spots',
                }
            if not self.statistics['wilcoxon_spots']['attr']:
                self.statistics['wilcoxon_spots']['attr'] = {
                    'method': test_result['method'],
                    'alternative': test_result['alternative'],
                    'conf_level': 1 - self.alpha_level,
                    'paired': False,
                    'groups': 'spots',
                }

            # Save the test result.
            self.statistics['wilcoxon_spots']['results'][n_spots] = {
                'n_plates': n_plates,
                'n_values': count_observed,
                'p_value': test_result['p.value'],
                'mean_observed': mean_observed,
                'mean_expected': mean_expected,
            }

            # Get the probability for each spot distance (used for the
            # Chi-squared test).
            spot_dist_to_prob = setlyze.config.cfg.get('spot-dist-to-prob-intra')

            # Get the frequencies for the observed distances (used for the
            # Chi-squared test).
            observed_freq = setlyze.std.distance_frequency(observed, 'intra')

            # Also perform the Chi-squared test.
            # NOTE(review): this relies on both dictionaries yielding their
            # values in the same spot distance order -- confirm.
            test_result = chisq_test(observed_freq.values(),
                p=spot_dist_to_prob.values())

            # If we find an expected frequency that is less than 5, do not
            # save the result. The check must skip to the next spots group;
            # a `continue` nested in a loop over the frequencies would only
            # advance that inner loop and the result would still be saved.
            if any(f < 5 for f in test_result['expected']):
                continue

            # Save the test result.
            if not self.statistics['chi_squared_spots']['attr']:
                self.statistics['chi_squared_spots']['attr'] = {
                    'method': test_result['method'],
                    'groups': 'spots',
                }
            self.statistics['chi_squared_spots']['results'][n_spots] = {
                'n_plates': n_plates,
                'n_values': count_observed,
                'chi_squared': test_result['statistic']['X-squared'],
                'p_value': test_result['p.value'],
                'df': test_result['parameter']['df'],
                'mean_observed': mean_observed,
                'mean_expected': mean_expected,
            }
Esempio n. 2
0
    def calculate_significance_chisq(self):
        """Test the observed positive spot numbers against the expected ones.

        The Chi-squared test for given probabilities
        (:ref:`Millar <ref-millar>`, :ref:`Dalgaard <ref-dalgaard>`) is run
        once over all user defined plate areas. The probabilities for the
        plate areas are obtained first; the test derives the expected
        positive spot numbers from them and compares these with the observed
        positive spot numbers.

        The expected frequencies for the Chi-squared test should not be less
        than 5 (:ref:`Buijs <ref-buijs>`). As soon as one expected frequency
        is below 5, the method returns without saving anything.

        The outcome decides which hypothesis we assume to be true.

        Null hypothesis
            The species in question does not have a preference or rejection
            for the plate area in question.

        Alternative hypothesis
            The species in question has a preference for the plate area in
            question (n observed > n expected) or has a rejection for
            the plate area in question (n observed < n expected).

        The decision follows from the p-value of the test:

        P >= alpha level
            Assume that the null hypothesis is true.

        P < alpha level
            Assume that the alternative hypothesis is true.

        Unlike the Wilcoxon test, this test does not reveal whether the
        species prefers or rejects a specific plate area. The Chi-squared
        test looks at the data of all plate areas together, so it only tells
        whether the data shows significant differences.

        On success the test attributes and results are stored under
        ``self.statistics['chi_squared_areas']`` and the expected values per
        plate area are stored in ``self.chisq_expected``.

        Design Part: 1.99
        """

        # Probabilities for the user defined plate areas.
        area_probs = self.get_area_probabilities()

        # Run the Chi-squared test on the observed counts.
        observed_counts = self.chisq_observed.values()
        outcome = chisq_test(observed_counts, p=area_probs.values())
        expected_counts = outcome['expected']

        # An expected frequency below 5 makes the test unreliable; in that
        # case nothing is saved.
        if any(count < 5 for count in expected_counts):
            return

        # Store the significance result.
        area_stats = self.statistics['chi_squared_areas']
        area_stats['attr'] = {'method': outcome['method']}
        area_stats['results'] = {
            'chi_squared': outcome['statistic']['X-squared'],
            'p_value': outcome['p.value'],
            'df': outcome['parameter']['df'],
        }

        # Store the expected value for each plate area, using the position
        # of the area in the observed dictionary as index.
        self.chisq_expected = {}
        self.chisq_expected.update(
            (area, expected_counts[position])
            for position, area in enumerate(self.chisq_observed)
        )
Esempio n. 3
0
    def calculate_significance(self):
        """Perform statistical tests to check for significant differences.

        The differences between the observed and expected spot distances are
        checked.

        We perform two statistical tests:

        1. The unpaired Wilcoxon rank sum test. We use unpaired because the two
           sets of distances are unrelated (:ref:`Dalgaard <ref-dalgaard>`). In
           other words, a distance n in 'observed' is unrelated to distance n
           in 'expected' (where n is an item number in the lists).

        2. The Chi-squared test for given probabilities
           (:ref:`Millar <ref-millar>`, :ref:`Dalgaard <ref-dalgaard>`). The
           probabilities for all spot distances have been pre-calculated. So
           the observed probabilities are compared with the pre-calculated
           probabilities.

           For the Chi-squared test the expected frequencies should not be
           less than 5 (:ref:`Buijs <ref-buijs>`). If we find an expected
           frequency that is less than 5, the result for this test is not
           saved.

        Based on the results of the tests we can decide which hypothesis
        we can assume to be true.

        Null hypothesis
            The species in question doesn't attract or repel itself.

        Alternative hypothesis
            The species in question attracts (mean observed < mean
            expected) or repels (mean observed > mean expected) itself.

        The decision is based on the p-value calculated by the test:

        P >= alpha level
            Assume that the null hypothesis is true.

        P < alpha level
            Assume that the alternative hypothesis is true.

        The default value for the alpha level is 0.05 (5%). In biology
        we usually assume that differences are significant if P has
        a value less than 5% (:ref:`Millar <ref-millar>`).

        A high number of positive spots on a plate will naturally lead
        to a high p-value (not significant). These plates will
        negatively affect the result of statistical test. To account
        for this, the tests are performed on groups of plates. Instead of
        doing one test on all plates, we group the plates based on the
        positive spots ratios.

        Because we match plates that contain both species selection, we
        can calculate a ratio of positive spots for each plate. So a
        plate with 3 positive spots for species A and 2 positive spots
        for species B, would result in a ratio of 3:2 (or 2:3). We consider
        a ratio of A:B to be the same as ratio B:A.

        We've grouped all possible ratios in 5 ratios groups. See
        :ref:`record grouping <record-grouping>` in the user manual for
        more details. Both tests are performed on each ratios group.

        Both tests are also performed on ratios groups 1-5 taken together.

        Nothing is returned. The results are stored in ``self.statistics``
        under the keys 'wilcoxon_ratios', 'wilcoxon_ratios_repeats'
        (attributes only) and 'chi_squared_ratios'.

        Design Part: 1.24
        """

        # Create an iterator returning the ratio groups.
        ratio_groups = self.generate_spot_ratio_groups()

        for n_group, ratio_group in enumerate(ratio_groups, start=1):
            # Ratios group 6 is actually all 5 groups taken together.
            # So change the group number to -5, meaning all groups up
            # to 5.
            if n_group == 6:
                n_group = -5

            # Get both sets of distances from plates matching the current
            # ratios group.
            observed = self.db.get_distances_matching_ratios(
                'spot_distances_observed', ratio_group)
            expected = self.db.get_distances_matching_ratios(
                'spot_distances_expected', ratio_group)

            # Iterators cannot be used directly by RPy, so convert them to
            # lists first.
            observed = list(observed)
            expected = list(expected)

            # Get the number of matching plates.
            n_plates = self.db.matching_plates_total

            # Get the lengths.
            count_observed = len(observed)
            count_expected = len(expected)

            # The number of observed and expected spot distances must always
            # be the same.
            assert count_observed == count_expected, \
                "Number of observed and expected values are not equal."

            # A minimum of 2 observed distances is required for the
            # significance test. So skip this ratio group if it's less.
            if count_observed < 2:
                continue

            # Calculate the means.
            mean_observed = setlyze.std.mean(observed)
            mean_expected = setlyze.std.mean(expected)

            # Perform the two sample Wilcoxon test.
            test_result = wilcox_test(observed, expected,
                alternative="two.sided", paired=False,
                conf_level=1 - self.alpha_level,
                conf_int=False)

            # Set some test attributes for the report. The attributes are
            # the same for every group, so they are only stored once.
            if not self.statistics['wilcoxon_ratios_repeats']['attr']:
                self.statistics['wilcoxon_ratios_repeats']['attr'] = {
                    'method': test_result['method'],
                    'alternative': test_result['alternative'],
                    'conf_level': 1 - self.alpha_level,
                    'paired': False,
                    'repeats': self.n_repeats,
                    'groups': 'ratios',
                }

            if not self.statistics['wilcoxon_ratios']['attr']:
                self.statistics['wilcoxon_ratios']['attr'] = {
                    'method': test_result['method'],
                    'alternative': test_result['alternative'],
                    'conf_level': 1 - self.alpha_level,
                    'paired': False,
                    'groups': 'ratios',
                }

            # Save the test result.
            self.statistics['wilcoxon_ratios']['results'][n_group] = {
                'n_plates': n_plates,
                'n_values': count_observed,
                'p_value': test_result['p.value'],
                'mean_observed': mean_observed,
                'mean_expected': mean_expected,
            }

            # Get the probability for each spot distance. Required for
            # the Chi-squared test.
            spot_dist_to_prob = setlyze.config.cfg.get('spot-dist-to-prob-inter')

            # Get the frequencies for the observed distances. These
            # are required for the Chi-squared test.
            observed_freq = setlyze.std.distance_frequency(observed, 'inter')

            # Also perform the Chi-squared test.
            # NOTE(review): this relies on both dictionaries yielding their
            # values in the same spot distance order -- confirm.
            test_result = chisq_test(observed_freq.values(),
                p=spot_dist_to_prob.values())

            # If we find an expected frequency that is less than 5, do not
            # save the result. The check must skip to the next ratios group;
            # a `continue` nested in a loop over the frequencies would only
            # advance that inner loop and the result would still be saved.
            if any(f < 5 for f in test_result['expected']):
                continue

            # Save the test result.
            if not self.statistics['chi_squared_ratios']['attr']:
                self.statistics['chi_squared_ratios']['attr'] = {
                    'method': test_result['method'],
                    'groups': 'ratios',
                }

            self.statistics['chi_squared_ratios']['results'][n_group] = {
                'n_plates': n_plates,
                'n_values': count_observed,
                'chi_squared': test_result['statistic']['X-squared'],
                'p_value': test_result['p.value'],
                'df': test_result['parameter']['df'],
                'mean_observed': mean_observed,
                'mean_expected': mean_expected,
            }