def test_get_chromosome_1_loop_through_size_20_s3(self):
        """Page through chromosome 1 restricted to study 's3', 20 rows per query.

        Expects two pages of 20 rows, one page of 10 rows and then an empty
        page (with an index marker of 0), i.e. 50 unique variants overall.
        """
        page_size = 20
        offset = 0
        pages_seen = 1
        collected = utils.create_dictionary_of_empty_dsets(TO_QUERY_DSETS)

        while True:
            datasets, index_marker = self.searcher.search_chromosome(
                chromosome=1, start=offset, size=page_size, study='s3')
            collected = utils.extend_dsets_with_subset(collected, datasets)
            if len(datasets[REFERENCE_DSET]) == 0:
                # an empty page ends the traversal
                break

            assert_studies_from_list(datasets, ['s3'])
            expected_rows = 20 if pages_seen <= 2 else 10
            assert_datasets_have_size(datasets, TO_QUERY_DSETS, expected_rows)

            # resume the next query from where this one stopped
            offset = offset + index_marker
            pages_seen += 1

        assert pages_seen == 4
        assert index_marker == 0
        # 50 unique variants
        snps = collected[SNP_DSET]
        assert len(set(snps)) == len(snps) == 50
    def test_get_chromosome_1_loop_through_size_20_lower_pval(self):
        """Page through chromosome 1 under a p-value interval, 20 rows per query.

        Every non-empty page must hold exactly 20 rows and pass the
        ['s1', 's3'] study check; four such pages are expected before the
        empty one, giving 80 unique variants.
        """
        offset = 0
        page_size = 20
        pval_interval = FloatInterval().set_tuple(0.00001, 0.00001)

        pages_seen = 1
        collected = utils.create_dictionary_of_empty_dsets(TO_QUERY_DSETS)
        while True:
            datasets, index_marker = self.searcher.search_chromosome(
                chromosome=1,
                start=offset,
                size=page_size,
                pval_interval=pval_interval)
            collected = utils.extend_dsets_with_subset(collected, datasets)
            if len(datasets[REFERENCE_DSET]) == 0:
                break

            assert_studies_in_list(datasets, ['s1', 's3'])
            assert_datasets_have_size(datasets, TO_QUERY_DSETS, 20)

            # start changes on each loop!
            offset = offset + index_marker
            pages_seen += 1

        assert pages_seen == 5
        assert index_marker == 0
        # 80 unique variants
        snps = collected[SNP_DSET]
        assert len(set(snps)) == len(snps) == 80
    def test_get_chr_1_second_range_loop_20_upper_pval(self):
        """Query chromosome 1 with both a base-pair and a p-value interval.

        The first page is expected to hold 20 rows and the follow-up page to
        be empty, so a single non-empty page yields all 20 unique variants.
        """
        offset = 0
        page_size = 20

        # index 25-40 for first non-empty block: 48500000
        # index 40-50 for second non-empty block: 49200000
        bp_interval = IntInterval().set_string_tuple("1200001:49200000")
        # index 20-35
        pval_interval = FloatInterval().set_tuple(0.1, 0.1)

        def fetch_page(begin):
            # single place that spells out the (otherwise repeated) query
            return self.searcher.search_chromosome(
                chromosome=1, start=begin, size=page_size,
                bp_interval=bp_interval, pval_interval=pval_interval)

        pages_seen = 1
        collected = utils.create_dictionary_of_empty_dsets(TO_QUERY_DSETS)
        datasets, index_marker = fetch_page(offset)
        collected = utils.extend_dsets_with_subset(collected, datasets)
        while len(datasets[REFERENCE_DSET]) > 0:
            assert_studies_in_list(datasets, ['s1', 's3'])
            assert_datasets_have_size(datasets, TO_QUERY_DSETS, 20)

            offset = offset + index_marker
            datasets, index_marker = fetch_page(offset)
            # any page fetched after a non-empty one must be empty
            assert_datasets_have_size(datasets, TO_QUERY_DSETS, 0)
            collected = utils.extend_dsets_with_subset(collected, datasets)
            pages_seen += 1

        assert pages_seen == 2
        snps = collected[SNP_DSET]
        assert len(set(snps)) == len(snps) == 20
    def test_get_snp_loop_through_filter_lower_pval(self):
        """Fetch SNP 'rs138808727' in two size-2 queries under a p-value interval.

        The first query is expected to return 2 rows and an index marker of 6;
        the second query, started at that marker, is expected to return 1 row,
        for 3 rows in total.
        """
        offset = 0
        page_size = 2
        pval_interval = FloatInterval().set_tuple(0.01, 0.01)

        def fetch_page(begin):
            return self.searcher.search_snp(
                snp='rs138808727',
                start=begin,
                size=page_size,
                pval_interval=pval_interval)

        collected = utils.create_dictionary_of_empty_dsets(TO_QUERY_DSETS)

        # first page
        datasets, index_marker = fetch_page(offset)
        collected = utils.extend_dsets_with_subset(collected, datasets)
        assert_datasets_have_size(datasets, TO_QUERY_DSETS, 2)
        assert_studies_from_list(datasets, ['s1', 's3'])
        assert index_marker == 6

        # second page, resumed from the returned marker
        offset = offset + index_marker
        datasets, index_marker = fetch_page(offset)
        assert_datasets_have_size(datasets, TO_QUERY_DSETS, 1)
        assert_studies_from_list(datasets, ['s5'])
        collected = utils.extend_dsets_with_subset(collected, datasets)

        assert len(collected[REFERENCE_DSET]) == 3
    def test_loop_through_w_restrinction_and_always_get_size_20_results(self):
        """Page through all associations under a p-value interval, 20 per query.

        Pages 1-2 are expected to contain only 's2' (20 rows each), page 3 to
        contain 10 rows of 's2' and 10 of 's3', and later pages only 's3'
        (20 rows each); 100 unique variants are expected overall.
        """
        offset = 0
        page_size = 20
        page_no = 1

        # s2 and s3 p-value limits
        pval_interval = FloatInterval().set_tuple(0.0002, 0.06)
        collected = utils.create_dictionary_of_empty_dsets(TO_QUERY_DSETS)
        while True:
            print("start", offset)
            datasets, next_index = self.searcher.search_all_assocs(
                start=offset, size=page_size, pval_interval=pval_interval)
            print("next index", next_index)
            print(datasets[STUDY_DSET])
            collected = utils.extend_dsets_with_subset(collected, datasets)
            if len(datasets[REFERENCE_DSET]) == 0:
                break

            if page_no == 3:
                # the page that straddles both studies
                assert_studies_from_list(datasets, ['s2', 's3'])
                assert_number_of_times_study_is_in_datasets(datasets, 's2', 10)
                assert_number_of_times_study_is_in_datasets(datasets, 's3', 10)
            else:
                only_study = 's2' if page_no < 3 else 's3'
                assert_studies_from_list(datasets, [only_study])
                assert_datasets_have_size(datasets, TO_QUERY_DSETS, 20)

            page_no += 1
            offset = offset + next_index

        snps = collected[SNP_DSET]
        assert len(set(snps)) == len(snps) == 100
Exemple #6
0
    def test_loop_through_t2_size_42(self):
        """Page through trait 't2' with 42 rows per query.

        Expected pages: 42 rows of 's3'; then 8 of 's3' plus 34 of 's4';
        then 16 of 's4'; then an empty page. All collected SNPs must be unique.
        """
        offset = 0
        page_size = 42
        page_no = 1
        collected = utils.create_dictionary_of_empty_dsets(TO_QUERY_DSETS)

        datasets, index_marker = self.searcher.search_trait(
            trait='t2', start=offset, size=page_size)
        collected = utils.extend_dsets_with_subset(collected, datasets)

        while len(datasets[REFERENCE_DSET]) > 0:
            if page_no <= 1:
                assert_number_of_times_study_is_in_datasets(datasets, 's3', 42)
                assert_studies_from_list(datasets, ['s3'])
            elif page_no == 2:
                assert_number_of_times_study_is_in_datasets(datasets, 's3', 8)
                assert_number_of_times_study_is_in_datasets(datasets, 's4', 34)
                assert_studies_from_list(datasets, ['s3', 's4'])
            else:
                assert_number_of_times_study_is_in_datasets(datasets, 's4', 16)
                assert_studies_from_list(datasets, ['s4'])

            page_no += 1
            offset = offset + index_marker

            # fetch the next page from the position the last query reached
            datasets, index_marker = self.searcher.search_trait(
                trait='t2', start=offset, size=page_size)
            collected = utils.extend_dsets_with_subset(collected, datasets)

        assert page_no == 4
        snps = collected[SNP_DSET]
        assert len(set(snps)) == len(snps)
Exemple #7
0
    def test_loop_through_t2_size_5_w_restriction_to_s4(self):
        """Page through trait 't2' under a p-value interval, 5 rows per query.

        Every non-empty page must contain exactly 5 rows of 's4' and nothing
        else; ten such pages are expected, and the first one must report an
        index marker of 55.
        """
        offset = 0
        page_size = 5
        page_no = 1
        pval_interval = FloatInterval().set_tuple(0.06, 0.3)
        collected = utils.create_dictionary_of_empty_dsets(TO_QUERY_DSETS)

        def fetch_page(begin):
            return self.searcher.search_trait(
                trait='t2',
                start=begin,
                size=page_size,
                pval_interval=pval_interval)

        datasets, index_marker = fetch_page(offset)
        collected = utils.extend_dsets_with_subset(collected, datasets)

        while len(datasets[REFERENCE_DSET]) > 0:
            # already on the first loop I want to have reached s4
            if page_no == 1:
                # all of s3 + the first 5 elements of s4
                assert index_marker == 55
            assert_number_of_times_study_is_in_datasets(datasets, 's4', 5)
            assert_studies_from_list(datasets, ['s4'])

            page_no += 1
            offset = offset + index_marker

            datasets, index_marker = fetch_page(offset)
            collected = utils.extend_dsets_with_subset(collected, datasets)

        assert page_no == 11
        snps = collected[SNP_DSET]
        assert len(set(snps)) == len(snps)
Exemple #8
0
    def test_get_all_loop_through_size_20(self):
        """Page through all associations, 20 rows per query, without filters.

        A page is expected to hold 20 rows unless the position reached after
        it is at or beyond 240, in which case 10 rows are expected. All
        collected SNPs must be unique.
        """
        offset = 0
        page_size = 20

        collected = utils.create_dictionary_of_empty_dsets(TO_QUERY_DSETS)
        while True:
            datasets, index_marker = self.searcher.search_all_assocs(
                start=offset, size=page_size)
            collected = utils.extend_dsets_with_subset(collected, datasets)
            if len(datasets[REFERENCE_DSET]) == 0:
                break

            next_offset = offset + index_marker
            expected_rows = 10 if next_offset >= 240 else 20
            assert_datasets_have_size(datasets, TO_QUERY_DSETS, expected_rows)
            offset = next_offset

        snps = collected[SNP_DSET]
        assert len(set(snps)) == len(snps)
    def test_get_snp_loop_through_size_1(self):
        """Fetch SNP 'rs138808727' one row per query until an empty page.

        Every non-empty page must hold exactly 1 row; 5 rows are expected in
        total.
        """
        offset = 0
        page_size = 1
        collected = utils.create_dictionary_of_empty_dsets(TO_QUERY_DSETS)

        while True:
            datasets, index_marker = self.searcher.search_snp(
                snp='rs138808727', start=offset, size=page_size)
            collected = utils.extend_dsets_with_subset(collected, datasets)
            if len(datasets[REFERENCE_DSET]) == 0:
                break

            assert_datasets_have_size(datasets, TO_QUERY_DSETS, 1)
            # resume from the position reported by the last query
            offset = offset + index_marker

        assert len(collected[REFERENCE_DSET]) == 5
Exemple #10
0
def general_search(search_obj, max_size, arguments, restriction_dictionary=None):
    """
    Query repeatedly through search_obj.service, filtering each batch, until either the group has been traversed
    or enough filtered elements have been gathered.

    :param search_obj: the search state. Its 'size' is read, its 'start', 'index_marker' and 'datasets' attributes are
    read and updated, and its 'service' performs the actual query, applies the restrictions and is closed at the end
    :param max_size: the max size of the datasets that we are traversing/querying
    :param arguments: the arguments to be passed to the query; its 'size' and 'start' entries are overwritten in place
    on every loop
    :param restriction_dictionary: a dictionary of restriction objects (see sumstats.utils.restrictions)
    that will be applied to the datasets returned by the query. None (the default) means no restrictions
    :return: a tuple (datasets, index_marker) where 'datasets' is a dictionary with the names of the datasets and
    the data to be returned (the result of the query after applying restrictions) and index_marker is an integer indicating
    up to where the query went in the dataset so that the next query can calculate its next start based on the index_marker.
    The index marker is needed as we are applying filtering (restrictions) to the data and the start/end size used in a query might
    not be the real indicators of up-till where we have been in the dataset.
    """
    iteration_size = search_obj.size
    search_id = str(search_obj.__class__.__name__) + str(arguments) + str(restriction_dictionary)
    logger.info("Searching with search id %s starting...", search_id)
    logger.debug("Search %s - max size is %s", search_id, max_size)

    # The default of None cannot be unpacked with ** below; treat it as "no restrictions".
    restrictions = {} if restriction_dictionary is None else restriction_dictionary

    while True:
        logger.debug("Search %s - loop with start %s and size %s", search_id, str(search_obj.start), str(iteration_size))
        arguments['size'] = iteration_size
        arguments['start'] = search_obj.start

        # call the query function
        search_obj.service.query(**arguments)

        result_before_filtering = search_obj.service.get_result()
        logger.debug("Search %s - result size before filtering is %s...", search_id,
                     str(len(result_before_filtering[REFERENCE_DSET])))

        if _traversed(start=search_obj.start, result=result_before_filtering, max_size=max_size):
            logger.debug("Search %s - traverse of group complete...", search_id)
            break

        # the index marker is computed from the unfiltered result, so it has to be updated before filtering
        search_obj.index_marker = _increase_search_index(index_marker=search_obj.index_marker, start=search_obj.start,
                                                         iteration_size=iteration_size, max_size=max_size,
                                                         result=result_before_filtering)

        # after search index is increased, we can apply restrictions
        search_obj.service.apply_restrictions(**restrictions)

        result_after_filtering = search_obj.service.get_result()
        logger.debug("Search %s - result size after filtering is %s...", search_id,
                     str(len(result_after_filtering[REFERENCE_DSET])))

        search_obj.datasets = dataset_utils.extend_dsets_with_subset(search_obj.datasets, result_after_filtering)
        search_obj.start = search_obj.start + iteration_size
        iteration_size = _next_iteration_size(size=search_obj.size, datasets=search_obj.datasets)

        if _search_complete(size=search_obj.size, datasets=search_obj.datasets):
            logger.debug("Search %s - search complete, gathered plethora of elements needed...", search_id)
            break

    logger.debug("Search %s - search completed. Returning index marker %s", search_id, str(search_obj.index_marker))
    search_obj.service.close_file()
    return search_obj.datasets, search_obj.index_marker
    def test_get_chr_1_second_range_loop_5(self):
        """Page through chromosome 1 within a base-pair interval, 5 rows per query.

        Every non-empty page must hold exactly 5 rows and pass the
        ['s1', 's3'] study check; ten such pages are expected and all
        collected SNPs must be unique.
        """
        offset = 0
        page_size = 5

        bp_interval = IntInterval().set_string_tuple("1200001:49200000")

        pages_seen = 1
        collected = utils.create_dictionary_of_empty_dsets(TO_QUERY_DSETS)
        while True:
            datasets, index_marker = self.searcher.search_chromosome(
                chromosome=1, start=offset, size=page_size,
                bp_interval=bp_interval)
            collected = utils.extend_dsets_with_subset(collected, datasets)
            if len(datasets[REFERENCE_DSET]) == 0:
                break

            assert_studies_in_list(datasets, ['s1', 's3'])
            assert_datasets_have_size(datasets, TO_QUERY_DSETS, 5)
            offset = offset + index_marker
            pages_seen += 1

        assert pages_seen == 11
        snps = collected[SNP_DSET]
        assert len(set(snps)) == len(snps)
    def test_get_dsets_group(self):
        """Collect the datasets of every sub-group of one block of group "/2".

        Takes the first block group found under "/2" for the interval
        48500000-48500000, reads the datasets of each of its sub-groups and
        checks that the study dataset holds 3 distinct values while every
        other dataset holds exactly 1.
        """
        chr_group_2 = gu.Group(self.f.get("/2"))

        bp_interval = IntInterval().set_tuple(48500000, 48500000)
        block = bk.Block(bp_interval)
        block_groups = block.get_block_groups_from_parent(chr_group_2)

        # only the first block group is inspected
        block_group = next(block_groups)

        block_sub_groups = block_group.get_all_subgroups()
        d = du.create_dictionary_of_empty_dsets(TO_QUERY_DSETS)

        for block_sub_group in block_sub_groups:
            datasets = query.get_dsets_from_group(block_sub_group, self.start,
                                                  self.size)
            assert len(datasets) == len(TO_STORE_DSETS)
            d = du.extend_dsets_with_subset(d, datasets)

        for dset_name, dset in d.items():
            # Compare names by value: an identity check ('is') only works when
            # both sides happen to be the very same object.
            if dset_name == STUDY_DSET:
                assert len(set(dset)) == 3
            else:
                assert len(set(dset)) == 1
Exemple #13
0
 def _extend_datasets(self, result):
     """Replace self.datasets with the outcome of extending it by ``result``."""
     merged = utils.extend_dsets_with_subset(self.datasets, result)
     self.datasets = merged