コード例 #1
0
ファイル: spot_preference.py プロジェクト: Ruutger/setlyze
    def wilcoxon_test_for_repeats(self):
        """Run one round of the repeated Wilcoxon rank sum test per area group.

        The same unpaired, two-sided Wilcoxon test is applied to the observed
        and expected plate area totals of every area group, but instead of
        storing detailed test output only counters are updated in
        ``self.statistics['wilcoxon_areas_repeats']['results']``:

        * ``n_significant`` is incremented when the p-value is significant;
        * ``n_preference`` is incremented as well when the observed mean is
          greater than the expected mean;
        * ``n_rejection`` is incremented otherwise.

        Repeating the test is necessary because the expected values are
        calculated randomly, so a single run is not enough to draw a solid
        conclusion.

        This method is meant to be called in a loop by
        :meth:`repeat_wilcoxon_test`.

        Design Part: 1.100
        """

        # Single-area groups are plain one-character strings (a parenthesised
        # string literal is not a tuple); combined groups are tuples.
        area_groups = ['A', 'B', 'C', 'D', ('A', 'B'), ('C', 'D'),
            ('A', 'B', 'C'), ('B', 'C', 'D')]

        for group in area_groups:
            # Human readable label, e.g. "A+B", used as the results key.
            label = "+".join(group)

            # RPy cannot consume iterators, so materialise the area totals
            # per plate as lists right away.
            observed = list(self.db.get_area_totals(
                'plate_area_totals_observed', group))
            expected = list(self.db.get_area_totals(
                'plate_area_totals_expected', group))

            n_observed = len(observed)
            n_expected = len(expected)

            # Both sets must always have the same number of values.
            assert n_observed == n_expected, \
                "Number of observed and expected values are not equal."

            # The significance test needs at least two values; skip this
            # area group otherwise.
            if n_observed < 2:
                continue

            # Make sure the counters for this area group exist.
            results = self.statistics['wilcoxon_areas_repeats']['results']
            if label not in results:
                results[label] = {
                    'n_values': n_observed,
                    'n_sp_observed': sum(observed),
                    'n_significant': 0,
                    'n_preference': 0,
                    'n_rejection': 0
                }

            observed_mean = setlyze.std.mean(observed)
            expected_mean = setlyze.std.mean(expected)

            # Two sample (unpaired) Wilcoxon test.
            outcome = wilcox_test(observed, expected,
                alternative = "two.sided", paired = False,
                conf_level = 1 - self.alpha_level,
                conf_int = False)

            # When all values are 0 the p-value is NaN, in which case
            # `is_significant` raises ValueError; such a round is not counted.
            try:
                significant = setlyze.std.is_significant(outcome['p.value'], self.alpha_level)
            except ValueError:
                continue

            if not significant:
                continue

            counters = results[label]
            counters['n_significant'] += 1

            # A significant result is either preference (observed mean above
            # expected mean) or rejection (everything else).
            if observed_mean > expected_mean:
                counters['n_preference'] += 1
            else:
                counters['n_rejection'] += 1
コード例 #2
0
ファイル: spot_preference.py プロジェクト: Ruutger/setlyze
    def calculate_significance_wilcoxon(self):
        """Test observed versus expected positive spot numbers per area group.

        For every plate area group the observed and expected plate area
        totals are compared with the unpaired Wilcoxon rank sum test. The
        test is unpaired because the two sets of positive spots numbers are
        unrelated (:ref:`Dalgaard <ref-dalgaard>`).

        The user defined plate areas are not used here; the default plate
        areas A, B, C and D are combined into the following groups:

            1. Plate area A
            2. Plate area B
            3. Plate area C
            4. Plate area D
            5. Plate area A+B
            6. Plate area C+D
            7. Plate area A+B+C
            8. Plate area B+C+D

        Hypotheses for each test:

        Null hypothesis
            The species in question does not have a preference or rejection
            for the plate area in question.

        Alternative hypothesis
            The species in question has a preference for the plate area in
            question (mean observed > mean expected) or has a rejection for
            the plate area in question (mean observed < mean expected).

        Decision rule, based on the p-value of the test:

        P >= alpha level
            Assume that the null hypothesis is true.

        P < alpha level
            Assume that the alternative hypothesis is true.

        The results of all groups together should allow a conclusion about
        the plate area preference of the species. For example, a species with
        a strong preference for the corners of a SETL-plate is expected to
        give a low p-value for group 1 (preference), and also low p-values
        for groups 3, 4, 6 and 8 because of rejection. If group 2 would not
        be significant, then group 7 wouldn't be either, because areas A and
        C neutralize each other.

        Results are stored per group in
        ``self.statistics['wilcoxon_areas']['results']``; the test attributes
        are stored once in the ``attr`` entries of both ``wilcoxon_areas``
        and ``wilcoxon_areas_repeats``.

        Design Part: 1.98
        """

        # Single-area groups are plain one-character strings (a parenthesised
        # string literal is not a tuple); combined groups are tuples.
        area_groups = ['A', 'B', 'C', 'D', ('A', 'B'), ('C', 'D'),
            ('A', 'B', 'C'), ('B', 'C', 'D')]

        for group in area_groups:
            # RPy cannot consume iterators, so materialise the area totals
            # per plate as lists right away.
            observed = list(self.db.get_area_totals(
                'plate_area_totals_observed', group))
            expected = list(self.db.get_area_totals(
                'plate_area_totals_expected', group))

            # Number of species encounters for the current area group.
            encounters_observed = sum(observed)
            encounters_expected = sum(expected)

            n_observed = len(observed)
            n_expected = len(expected)

            # Both sets must always have the same number of values.
            assert n_observed == n_expected, \
                "Number of observed and expected values are not equal."

            # The significance test needs at least two values; skip this
            # area group otherwise.
            if n_observed < 2:
                continue

            observed_mean = setlyze.std.mean(observed)
            expected_mean = setlyze.std.mean(expected)

            # Human readable label, e.g. "A+B", used as the results key.
            label = "+".join(group)

            # Two sample (unpaired) Wilcoxon test.
            outcome = wilcox_test(observed, expected,
                alternative = "two.sided", paired = False,
                conf_level = 1 - self.alpha_level,
                conf_int = False)

            # Record the test attributes once, for the repeated tests ...
            repeats_stats = self.statistics['wilcoxon_areas_repeats']
            if not repeats_stats['attr']:
                repeats_stats['attr'] = {
                    'method': outcome['method'],
                    'alternative': outcome['alternative'],
                    'conf_level': 1 - self.alpha_level,
                    'paired': False,
                    'groups': "areas",
                    'repeats': self.n_repeats,
                }

            # ... and for this single test.
            areas_stats = self.statistics['wilcoxon_areas']
            if not areas_stats['attr']:
                areas_stats['attr'] = {
                    'method': outcome['method'],
                    'alternative': outcome['alternative'],
                    'conf_level': 1 - self.alpha_level,
                    'paired': False,
                    'groups': "areas",
                }

            # Detailed result for this area group.
            self.statistics['wilcoxon_areas']['results'][label] = {
                'n_values': n_observed,
                'n_sp_observed': encounters_observed,
                'n_sp_expected': encounters_expected,
                'p_value': outcome['p.value'],
                'mean_observed': observed_mean,
                'mean_expected': expected_mean,
            }
コード例 #3
0
ファイル: attraction_intra.py プロジェクト: Ruutger/setlyze
    def calculate_significance(self):
        """Perform statistical tests to check for significant differences.

        The differences between the observed and expected spot distances are
        checked.

        We perform two statistical tests:

        1. The unpaired Wilcoxon rank sum test. We use unpaired because the two
           sets of distances are unrelated (:ref:`Dalgaard <ref-dalgaard>`). In
           other words, a distance n in 'observed' is unrelated to distance n
           in 'expected' (where n is an item number in the lists).

        2. The Chi-squared test for given probabilities
           (:ref:`Millar <ref-millar>`, :ref:`Dalgaard <ref-dalgaard>`). The
           probabilities for all spot distances have been pre-calculated. So
           the observed probabilities are compared with the pre-calculated
           probabilities.

           For the Chi-squared test the expected frequencies should not be
           less than 5 (:ref:`Buijs <ref-buijs>`). If we find an expected
           frequency that is less than 5, the result for this test is not
           saved.

        Based on the results of the tests we can decide which
        hypothesis we can assume to be true.

        Null hypothesis
            The species in question doesn't attract or repel itself.

        Alternative hypothesis
            The species in question attracts (mean observed < mean
            expected) or repels (mean observed > mean expected) itself.

        The decision is based on the p-value calculated by the test:

        P >= alpha level
            Assume that the null hypothesis is true.

        P < alpha level
            Assume that the alternative hypothesis is true.

        The default value for the alpha level is 0.05 (5%). In biology
        we usually assume that differences are significant if P has
        a value less than 5% (:ref:`Millar <ref-millar>`).

        A high number of positive spots on a plate will naturally lead
        to a high p-value (not significant). These plates will
        negatively affect the result of the statistical test. To account
        for this, the tests are performed on groups of plates. Instead of
        doing one test on all plates, we group the plates based on the
        number of positive spots they contain.

        Both tests are performed on each group. Plates of group 1 and 25
        are not tested. We skip group 1 because it is not possible to
        calculate spot distances for plates with just one positive spot.
        Plates of group 25 are ignored because this will always result
        in a p-value of 1 as a result of equal observed and expected
        spot distances.

        Both tests are also performed on groups 2-24 taken together.

        The Wilcoxon results are stored in
        ``self.statistics['wilcoxon_spots']`` and the Chi-squared results in
        ``self.statistics['chi_squared_spots']``, both keyed by the number of
        positive spots. The ``attr`` entry of
        ``self.statistics['wilcoxon_spots_repeats']`` is filled in as well.

        Design Part: 1.24
        """

        # Perform the tests for records that have a specific number of
        # positive spots. The tests are performed separately for each
        # number in the list. Numbers starting with "-" means all records
        # with positive spots up to that number.
        spot_totals = [2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,
            23,24,-24]

        for n_spots in spot_totals:
            # Get both sets of distances from plates per total spot numbers.
            observed = self.db.get_distances_matching_spots_total(
                'spot_distances_observed', n_spots)
            expected = self.db.get_distances_matching_spots_total(
                'spot_distances_expected', n_spots)

            # Iterators cannot be used directly by RPy, so convert them to
            # lists first.
            observed = list(observed)
            expected = list(expected)

            # Get the number of plates found that match the current
            # number of positive spots.
            n_plates = self.db.matching_plates_total

            # Get the lengths.
            count_observed = len(observed)
            count_expected = len(expected)

            # The number of observed and expected spot distances must always
            # be the same.
            assert count_observed == count_expected, \
                "Number of observed and expected values are not equal."

            # A minimum of 2 observed distances is required for the
            # significance test. So skip this spots number if it's less.
            if count_observed < 2:
                continue

            # Calculate the means.
            mean_observed = setlyze.std.mean(observed)
            mean_expected = setlyze.std.mean(expected)

            # Perform the two sample Wilcoxon test.
            test_result = wilcox_test(observed, expected,
                alternative = "two.sided", paired = False,
                conf_level = 1 - self.alpha_level,
                conf_int = False)

            # Set some test attributes for the report.
            if not self.statistics['wilcoxon_spots_repeats']['attr']:
                self.statistics['wilcoxon_spots_repeats']['attr'] = {
                    'method': test_result['method'],
                    'alternative': test_result['alternative'],
                    'conf_level': 1 - self.alpha_level,
                    'paired': False,
                    'repeats': self.n_repeats,
                    'groups': 'spots',
                }
            if not self.statistics['wilcoxon_spots']['attr']:
                self.statistics['wilcoxon_spots']['attr'] = {
                    'method': test_result['method'],
                    'alternative': test_result['alternative'],
                    'conf_level': 1 - self.alpha_level,
                    'paired': False,
                    'groups': 'spots',
                }

            # Save the test result.
            self.statistics['wilcoxon_spots']['results'][n_spots] = {
                'n_plates': n_plates,
                'n_values': count_observed,
                'p_value': test_result['p.value'],
                'mean_observed': mean_observed,
                'mean_expected': mean_expected,
            }

            # Get the probability for each spot distance (used for the
            # Chi-squared test).
            spot_dist_to_prob = setlyze.config.cfg.get('spot-dist-to-prob-intra')

            # Get the frequencies for the observed distances (used for the
            # Chi-squared test).
            observed_freq = setlyze.std.distance_frequency(observed, 'intra')

            # Also perform the Chi-squared test.
            # NOTE(review): this relies on both mappings yielding their values
            # in the same spot distance order - confirm.
            test_result = chisq_test(observed_freq.values(),
                p = spot_dist_to_prob.values())

            # If we find an expected frequency that is less than 5, do not save
            # the Chi-squared result. The check must skip the iteration of the
            # outer loop; a `continue` inside a loop over the expected
            # frequencies would only advance that inner loop and the result
            # would be saved regardless.
            if any(f < 5 for f in test_result['expected']):
                continue

            # Save the test result.
            if not self.statistics['chi_squared_spots']['attr']:
                self.statistics['chi_squared_spots']['attr'] = {
                    'method': test_result['method'],
                    'groups': 'spots',
                }
            self.statistics['chi_squared_spots']['results'][n_spots] = {
                'n_plates': n_plates,
                'n_values': count_observed,
                'chi_squared': test_result['statistic']['X-squared'],
                'p_value': test_result['p.value'],
                'df': test_result['parameter']['df'],
                'mean_observed': mean_observed,
                'mean_expected': mean_expected,
            }
コード例 #4
0
ファイル: attraction_intra.py プロジェクト: Ruutger/setlyze
    def wilcoxon_test_for_repeats(self):
        """Run one round of the repeated Wilcoxon rank sum test per spots number.

        The same unpaired, two-sided Wilcoxon test as in
        :meth:`calculate_significance` is applied to the observed and expected
        spot distances for every number of positive spots, but instead of
        storing detailed test output only counters are updated in
        ``self.statistics['wilcoxon_spots_repeats']['results']``:

        * ``n_significant`` is incremented when the p-value is significant;
        * ``n_attraction`` is incremented as well when the observed mean is
          smaller than the expected mean;
        * ``n_repulsion`` is incremented otherwise.

        Repeating the test is necessary because the expected values are
        calculated randomly, so a single run is not enough to draw a solid
        conclusion.

        This method is meant to be called in a loop by
        :meth:`repeat_wilcoxon_test`.

        Design Part: 1.102
        """

        # Numbers of positive spots 2..24, followed by -24, which stands for
        # all records with positive spots up to 24 taken together.
        spot_totals = list(range(2, 25)) + [-24]

        for n_spots in spot_totals:
            # RPy cannot consume iterators, so materialise both sets of
            # distances as lists right away.
            observed = list(self.db.get_distances_matching_spots_total(
                'spot_distances_observed', n_spots))
            expected = list(self.db.get_distances_matching_spots_total(
                'spot_distances_expected', n_spots))

            n_observed = len(observed)
            n_expected = len(expected)

            # Both sets must always have the same number of distances.
            assert n_observed == n_expected, \
                "Number of observed and expected values are not equal."

            # The significance test needs at least two distances; skip this
            # spots number otherwise.
            if n_observed < 2:
                continue

            observed_mean = setlyze.std.mean(observed)
            expected_mean = setlyze.std.mean(expected)

            # Make sure the counters for this spots number exist.
            results = self.statistics['wilcoxon_spots_repeats']['results']
            if n_spots not in results:
                results[n_spots] = {
                    'n_plates': self.db.matching_plates_total,
                    'n_values': n_observed,
                    'n_significant': 0,
                    'n_attraction': 0,
                    'n_repulsion': 0
                }

            # Two sample (unpaired) Wilcoxon test.
            outcome = wilcox_test(observed, expected,
                alternative = "two.sided", paired = False,
                conf_level = 1 - self.alpha_level,
                conf_int = False)

            # When all values are 0 the p-value is NaN, in which case
            # `is_significant` raises ValueError; such a round counts as not
            # significant.
            try:
                significant = setlyze.std.is_significant(outcome['p.value'], self.alpha_level)
            except ValueError:
                continue

            if not significant:
                continue

            counters = results[n_spots]
            counters['n_significant'] += 1

            # A significant result is either attraction (observed mean below
            # expected mean) or repulsion (everything else).
            if observed_mean < expected_mean:
                counters['n_attraction'] += 1
            else:
                counters['n_repulsion'] += 1
コード例 #5
0
ファイル: attraction_inter.py プロジェクト: Ruutger/setlyze
    def calculate_significance(self):
        """Perform statistical tests to check for significant differences.

        The differences between the observed and expected spot distances are
        checked.

        We perform two statistical tests:

        1. The unpaired Wilcoxon rank sum test. We use unpaired because the two
           sets of distances are unrelated (:ref:`Dalgaard <ref-dalgaard>`). In
           other words, a distance n in 'observed' is unrelated to distance n
           in 'expected' (where n is an item number in the lists).

        2. The Chi-squared test for given probabilities
           (:ref:`Millar <ref-millar>`, :ref:`Dalgaard <ref-dalgaard>`). The
           probabilities for all spot distances have been pre-calculated. So
           the observed probabilities are compared with the pre-calculated
           probabilities.

           For the Chi-squared test the expected frequencies should not be
           less than 5 (:ref:`Buijs <ref-buijs>`). If we find an expected
           frequency that is less than 5, the result for this test is not
           saved.

        Based on the results of the tests we can decide which hypothesis
        we can assume to be true.

        Null hypothesis
            The two species selections in question don't attract or repel
            each other.

        Alternative hypothesis
            The two species selections in question attract (mean observed <
            mean expected) or repel (mean observed > mean expected) each
            other.

        The decision is based on the p-value calculated by the test:

        P >= alpha level
            Assume that the null hypothesis is true.

        P < alpha level
            Assume that the alternative hypothesis is true.

        The default value for the alpha level is 0.05 (5%). In biology
        we usually assume that differences are significant if P has
        a value less than 5% (:ref:`Millar <ref-millar>`).

        A high number of positive spots on a plate will naturally lead
        to a high p-value (not significant). These plates will
        negatively affect the result of the statistical test. To account
        for this, the tests are performed on groups of plates. Instead of
        doing one test on all plates, we group the plates based on the
        positive spots ratios.

        Because we match plates that contain both species selections, we
        can calculate a ratio of positive spots for each plate. So a
        plate with 3 positive spots for species A and 2 positive spots
        for species B, would result in a ratio of 3:2 (or 2:3). We consider
        a ratio of A:B to be the same as ratio B:A.

        We've grouped all possible ratios in 5 ratios groups. See
        :ref:`record grouping <record-grouping>` in the user manual for
        more details. Both tests are performed on each ratios group.

        Both tests are also performed on ratios groups 1-5 taken together.

        The Wilcoxon results are stored in
        ``self.statistics['wilcoxon_ratios']`` and the Chi-squared results in
        ``self.statistics['chi_squared_ratios']``, both keyed by the ratios
        group number. The ``attr`` entry of
        ``self.statistics['wilcoxon_ratios_repeats']`` is filled in as well.

        Design Part: 1.24
        """

        # Create an iterator returning the ratio groups.
        ratio_groups = self.generate_spot_ratio_groups()

        for n_group, ratio_group in enumerate(ratio_groups, start=1):
            # Ratios group 6 is actually all 5 groups taken together.
            # So change the group number to -5, meaning all groups up
            # to 5.
            if n_group == 6:
                n_group = -5

            # Get both sets of distances from plates matching the ratios
            # group.
            observed = self.db.get_distances_matching_ratios(
                'spot_distances_observed', ratio_group)
            expected = self.db.get_distances_matching_ratios(
                'spot_distances_expected', ratio_group)

            # Iterators cannot be used directly by RPy, so convert them to
            # lists first.
            observed = list(observed)
            expected = list(expected)

            # Get the number of matching plates.
            n_plates = self.db.matching_plates_total

            # Get the lengths.
            count_observed = len(observed)
            count_expected = len(expected)

            # The number of observed and expected spot distances must always
            # be the same.
            assert count_observed == count_expected, \
                "Number of observed and expected values are not equal."

            # A minimum of 2 observed distances is required for the
            # significance test. So skip this ratio group if it's less.
            if count_observed < 2:
                continue

            # Calculate the means.
            mean_observed = setlyze.std.mean(observed)
            mean_expected = setlyze.std.mean(expected)

            # Perform two sample Wilcoxon tests.
            test_result = wilcox_test(observed, expected,
                alternative = "two.sided", paired = False,
                conf_level = 1 - self.alpha_level,
                conf_int = False)

            # Set some test attributes for the report.
            if not self.statistics['wilcoxon_ratios_repeats']['attr']:
                self.statistics['wilcoxon_ratios_repeats']['attr'] = {
                    'method': test_result['method'],
                    'alternative': test_result['alternative'],
                    'conf_level': 1 - self.alpha_level,
                    'paired': False,
                    'repeats': self.n_repeats,
                    'groups': 'ratios',
                }

            if not self.statistics['wilcoxon_ratios']['attr']:
                self.statistics['wilcoxon_ratios']['attr'] = {
                    'method': test_result['method'],
                    'alternative': test_result['alternative'],
                    'conf_level': 1 - self.alpha_level,
                    'paired': False,
                    'groups': 'ratios',
                }

            # Save the Wilcoxon test result.
            self.statistics['wilcoxon_ratios']['results'][n_group] = {
                'n_plates': n_plates,
                'n_values': count_observed,
                'p_value': test_result['p.value'],
                'mean_observed': mean_observed,
                'mean_expected': mean_expected,
            }

            # Get the probability for each spot distance. Required for
            # the Chi-squared test.
            spot_dist_to_prob = setlyze.config.cfg.get('spot-dist-to-prob-inter')

            # Get the frequencies for the observed distances. These
            # are required for the Chi-squared test.
            observed_freq = setlyze.std.distance_frequency(observed, 'inter')

            # Also perform Chi-squared test.
            # NOTE(review): this relies on both mappings yielding their values
            # in the same spot distance order - confirm.
            test_result = chisq_test(observed_freq.values(),
                p = spot_dist_to_prob.values())

            # If we find an expected frequency that is less than 5, do not save
            # the Chi-squared result. The check must skip the iteration of the
            # outer loop; a `continue` inside a loop over the expected
            # frequencies would only advance that inner loop and the result
            # would be saved regardless.
            if any(f < 5 for f in test_result['expected']):
                continue

            # Save the Chi-squared test result.
            if not self.statistics['chi_squared_ratios']['attr']:
                self.statistics['chi_squared_ratios']['attr'] = {
                    'method': test_result['method'],
                    'groups': 'ratios',
                }

            self.statistics['chi_squared_ratios']['results'][n_group] = {
                'n_plates': n_plates,
                'n_values': count_observed,
                'chi_squared': test_result['statistic']['X-squared'],
                'p_value': test_result['p.value'],
                'df': test_result['parameter']['df'],
                'mean_observed': mean_observed,
                'mean_expected': mean_expected,
            }