def test_divide_into_chunks(self):
    """Check that a list is chunked correctly."""
    self.assertEqual(
        divide_into_chunks([1, 2, 7, 3, 22, 8, 5, 22, 333], 3),
        [[1, 2, 7], [3, 22, 8], [5, 22, 333]])
    self.assertEqual(
        divide_into_chunks([1, 2, 7, 3, 22, 8, 5, 22, 333], 4),
        [[1, 2, 7], [3, 22, 8], [5, 22, 333]])
    self.assertEqual(
        divide_into_chunks([1, 2, 7, 3, 22, 8, 5, 22, 333], 5),
        [[1, 2], [7, 3], [22, 8], [5, 22], [333]])
    self.assertEqual(divide_into_chunks([], 5), [[]])
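
For reference, a minimal sketch of a divide_into_chunks that satisfies the assertions above (the project's actual implementation in its chunks module may differ): split the list into roughly number_chunks slices of equal size, and return [[]] for an empty input.

import math

def divide_into_chunks(the_list, number_chunks):
    """Sketch only: split the_list into about number_chunks equal-sized chunks."""
    length = len(the_list)
    if length == 0:
        return [[]]
    if length <= number_chunks:
        # never produce more chunks than there are items
        number_chunks = length
    chunk_size = int(math.ceil(length / number_chunks))
    return [the_list[i:i + chunk_size]
            for i in range(0, length, chunk_size)]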
Example #2
    def get_trait_data(self, sample_list=None):
        if sample_list:
            self.samplelist = sample_list
        else:
            self.samplelist = self.group.samplelist

        if self.group.parlist is not None and self.group.f1list is not None:
            if (self.group.parlist + self.group.f1list) in self.samplelist:
                self.samplelist += self.group.parlist + self.group.f1list

        query = """
            SELECT Strain.Name, Strain.Id FROM Strain, Species
            WHERE Strain.Name IN {}
            and Strain.SpeciesId=Species.Id
            and Species.name = '{}'
            """.format(create_in_clause(self.samplelist), *mescape(self.group.species))
        logger.sql(query)
        results = dict(g.db.execute(query).fetchall())
        sample_ids = [results[item] for item in self.samplelist]

        # MySQL limits the number of tables that can be used in a join to 61,
        # so we break the sample ids into smaller chunks
        # Postgres doesn't have that limit, so we can get rid of this after we transition
        chunk_size = 50
        number_chunks = int(math.ceil(len(sample_ids) / chunk_size))
        trait_sample_data = []
        for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks):
            if self.type == "Publish":
                dataset_type = "Phenotype"
            else:
                dataset_type = self.type
            # one "T<sample_id>.value" column per sample in this chunk
            temp = ['T%s.value' % item for item in sample_ids_step]
            if self.type == "Publish":
                query = "SELECT {}XRef.Id,".format(escape(self.type))
            else:
                query = "SELECT {}.Name,".format(escape(dataset_type))
            # column 0 holds the trait identifier; sample values start at index 1
            data_start_pos = 1
            query += ', '.join(temp)
            query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(dataset_type,
                                                                     self.type,
                                                                     self.type))

            for item in sample_ids_step:
                query += """
                        left join {}Data as T{} on T{}.Id = {}XRef.DataId
                        and T{}.StrainId={}\n
                        """.format(*mescape(self.type, item, item, self.type, item, item))

            if self.type == "Publish":
                query += """
                        WHERE {}XRef.InbredSetId = {}Freeze.InbredSetId
                        and {}Freeze.Name = '{}'
                        and {}.Id = {}XRef.{}Id
                        order by {}.Id
                        """.format(*mescape(self.type, self.type, self.type, self.name,
                                    dataset_type, self.type, dataset_type, dataset_type))
            else:
                query += """
                        WHERE {}XRef.{}FreezeId = {}Freeze.Id
                        and {}Freeze.Name = '{}'
                        and {}.Id = {}XRef.{}Id
                        order by {}.Id
                        """.format(*mescape(self.type, self.type, self.type, self.type,
                                   self.name, dataset_type, self.type, self.type, dataset_type))

            #logger.debug("trait data query: ", query)

            logger.sql(query)
            results = g.db.execute(query).fetchall()
            #logger.debug("query results:", results)
            trait_sample_data.append(results)

        trait_count = len(trait_sample_data[0])
        self.trait_data = collections.defaultdict(list)

        # put all of the separate data together into a dictionary where the keys are
        # trait names and values are lists of sample values
        for trait_counter in range(trait_count):
            trait_name = trait_sample_data[0][trait_counter][0]
            for chunk_counter in range(int(number_chunks)):
                self.trait_data[trait_name] += (
                    trait_sample_data[chunk_counter][trait_counter][data_start_pos:])
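
The query construction above relies on a few small helpers (escape, mescape, create_in_clause) defined elsewhere in the project. A rough, self-contained sketch of what they plausibly do is below; the names and behaviour are assumptions for illustration, and the real helpers presumably use MySQLdb-style escaping or parameterised queries instead.

def escape(value):
    # Sketch only: neutralise backslashes and quotes before string interpolation.
    return str(value).replace('\\', '\\\\').replace("'", "\\'")

def mescape(*items):
    """Escape several values at once, for use as str.format(*mescape(...))."""
    return [escape(item) for item in items]

def create_in_clause(items):
    """Build a SQL IN clause, e.g. ( 'BXD1', 'BXD2' ), from a list of names."""
    return '( {} )'.format(', '.join("'{}'".format(x) for x in mescape(*items)))

For example, create_in_clause(['BXD1', 'BXD2']) yields "( 'BXD1', 'BXD2' )", which slots directly into the Strain.Name IN {} clause of the first query.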