Example #1
import datetime
import gzip
import pickle
import uuid
import zlib

import numpy as np
from pprint import pformat as pf

# Redis (a connected client instance), Bench, LMM, human_association, and the
# chunks helper are assumed to come from the surrounding GeneNetwork package.


def run_human(pheno_vector,
              covariate_matrix,
              plink_input_file,
              kinship_matrix,
              refit=False,
              loading_progress=None):

    # Mask out samples whose phenotype is missing (NaN)
    v = np.isnan(pheno_vector)
    keep = ~v  # logical negation; `True - v` fails on modern numpy
    keep = keep.reshape((len(keep),))

    identifier = str(uuid.uuid4())
    
    print("pheno_vector: ", pf(pheno_vector))
    print("kinship_matrix: ", pf(kinship_matrix))
    print("kinship_matrix.shape: ", pf(kinship_matrix.shape))

    lmm_vars = pickle.dumps(dict(
        pheno_vector = pheno_vector,
        covariate_matrix = covariate_matrix,
        kinship_matrix = kinship_matrix
    ))
    Redis.hset(identifier, "lmm_vars", lmm_vars)
    Redis.expire(identifier, 60*60)
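    # A worker can later rebuild these inputs from the cache, e.g.
    # (a sketch, assuming the same Redis connection):
    #     cached = pickle.loads(Redis.hget(identifier, "lmm_vars"))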

    if v.sum():
        pheno_vector = pheno_vector[keep]
        #print("pheno_vector shape is now: ", pf(pheno_vector.shape))
        covariate_matrix = covariate_matrix[keep,:]
        #print("kinship_matrix shape is: ", pf(kinship_matrix.shape))
        #print("len(keep) is: ", pf(keep.shape))
        kinship_matrix = kinship_matrix[keep,:][:,keep]

    n = kinship_matrix.shape[0]
    #print("n is:", n)
    lmm_ob = LMM(pheno_vector,
                 kinship_matrix,
                 covariate_matrix)
    lmm_ob.fit()
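    # Fit the null model once; the per-SNP tests below reuse the fitted
    # lmm_ob rather than refitting for every marker (unless refit=True).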


    # Buffers for pvalues and t-stats
    p_values = []
    t_stats = []

    #print("input_file: ", plink_input_file)

    with Bench("Opening and loading pickle file"):
        with gzip.open(plink_input_file, "rb") as input_file:
            data = pickle.load(input_file)
            
    plink_input = data['plink_input']

    #plink_input.getSNPIterator()
    with Bench("Calculating numSNPs"):
        total_snps = data['numSNPs']

    with Bench("snp iterator loop"):
        count = 0

        with Bench("Create list of inputs"):
            inputs = list(plink_input)
            
        print("len(genotypes): ", len(inputs))

        with Bench("Divide into chunks"):
            results = chunks.divide_into_chunks(inputs, 64)

        result_store = []

        key = "plink_inputs"
        
        # Todo: Delete below line when done testing
        Redis.delete(key)
        
        timestamp = datetime.datetime.utcnow().isoformat()

        #print("Starting adding loop")
        for part, result in enumerate(results):
            #data = pickle.dumps(result, pickle.HIGHEST_PROTOCOL)
            holder = pickle.dumps(dict(
                identifier = identifier,
                part = part,
                timestamp = timestamp,
                result = result
            ), pickle.HIGHEST_PROTOCOL)
            
            #print("Adding:", part)
            Redis.rpush(key, zlib.compress(holder))
        #print("End adding loop")
        #print("***** Added to {} queue *****".format(key))
        # Iterate over the materialized list; if plink_input were a true
        # iterator it would already be exhausted by list(plink_input) above
        for snp, this_id in inputs:
            count += 1

            percent_complete = (float(count) / total_snps) * 100
            if loading_progress:  # guard: defaults to None
                loading_progress.store("percent_complete", percent_complete)

            #with Bench("actual association"):
            ps, ts = human_association(snp,
                                       n,
                                       keep,
                                       lmm_ob,
                                       pheno_vector,
                                       covariate_matrix,
                                       kinship_matrix,
                                       refit)

            #with Bench("after association"):
            p_values.append(ps)
            t_stats.append(ts)
        
    return p_values, t_stats
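
A minimal usage sketch of run_human, assuming a reachable Redis instance and
that the gzipped pickle holds the 'plink_input' and 'numSNPs' keys the
function expects; the path and array shapes below are illustrative only.

import numpy as np

n_samples = 100
pheno = np.random.randn(n_samples)      # phenotype vector
covariates = np.ones((n_samples, 1))    # intercept-only design matrix
kinship = np.eye(n_samples)             # placeholder kinship matrix

p_values, t_stats = run_human(pheno,
                              covariates,
                              "plink_input.pickle.gz",  # hypothetical path
                              kinship)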
Example #2

    def get_trait_data(self, sample_list=None):
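        # A method of a GeneNetwork dataset class: it assumes module-level
        # helpers (escape, mescape, create_in_clause, chunks), the Flask
        # database handle g.db, and the math/collections modules.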
        if sample_list:
            self.samplelist = sample_list
        else:
            self.samplelist = self.group.samplelist
            
        # Append parents and F1s that are missing from the sample list; the
        # original test `(parlist + f1list) in samplelist` compared the whole
        # concatenated list against single elements and could never be true
        for extra_sample in self.group.parlist + self.group.f1list:
            if extra_sample not in self.samplelist:
                self.samplelist.append(extra_sample)
        
        query = """
            SELECT Strain.Name, Strain.Id FROM Strain, Species
            WHERE Strain.Name IN {}
            and Strain.SpeciesId=Species.Id
            and Species.name = '{}'
            """.format(create_in_clause(self.samplelist), *mescape(self.group.species))
        results = dict(g.db.execute(query).fetchall())
        sample_ids = [results[item] for item in self.samplelist]

        # MySQL limits the number of tables that can be used in a join to 61,
        # so we break the sample ids into smaller chunks
        # Postgres doesn't have that limit, so we can get rid of this after we transition
        chunk_size = 50
        # Divide as floats before math.ceil; integer division would silently
        # floor first, under-counting chunks and producing oversized joins
        number_chunks = int(math.ceil(len(sample_ids) / float(chunk_size)))
        trait_sample_data = []
        for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks):
            if self.type == "Publish":
                dataset_type = "Phenotype"
            else:
                dataset_type = self.type
            temp = ['T%s.value' % item for item in sample_ids_step]
            if self.type == "Publish":
                query = "SELECT {}XRef.Id,".format(escape(self.type))
            else:
                query = "SELECT {}.Name,".format(escape(dataset_type))
            data_start_pos = 1
            query += ', '.join(temp)
            query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(dataset_type,
                                                                     self.type,
                                                                     self.type))

            for item in sample_ids_step:
                query += """
                        left join {}Data as T{} on T{}.Id = {}XRef.DataId
                        and T{}.StrainId={}\n
                        """.format(*mescape(self.type, item, item, self.type, item, item))
                        
            if self.type == "Publish":
                query += """
                        WHERE {}XRef.PublicationId = {}Freeze.Id
                        and {}Freeze.Name = '{}'
                        and {}.Id = {}XRef.{}Id
                        order by {}.Id
                        """.format(*mescape(self.type, self.type, self.type, self.name, 
                                    dataset_type, self.type, dataset_type, dataset_type))
            else:
                query += """
                        WHERE {}XRef.{}FreezeId = {}Freeze.Id
                        and {}Freeze.Name = '{}'
                        and {}.Id = {}XRef.{}Id
                        order by {}.Id
                        """.format(*mescape(self.type, self.type, self.type, self.type,
                                   self.name, dataset_type, self.type, self.type, dataset_type))
            results = g.db.execute(query).fetchall()
            trait_sample_data.append(results)

        trait_count = len(trait_sample_data[0])
        self.trait_data = collections.defaultdict(list)
        
        # put all of the separate data together into a dictionary where the keys are
        # trait names and values are lists of sample values
        for trait_counter in range(trait_count):
            trait_name = trait_sample_data[0][trait_counter][0]
            for chunk_counter in range(int(number_chunks)):
                self.trait_data[trait_name] += (
                    trait_sample_data[chunk_counter][trait_counter][data_start_pos:])
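
A minimal sketch of the chunking arithmetic used above, assuming
chunks.divide_into_chunks(items, n) splits a list into n roughly equal parts:

import math

sample_ids = list(range(173))  # illustrative ids only
chunk_size = 50
# Float division before ceil: integer division would give 3 chunks here,
# each larger than the 50-sample target imposed by MySQL's 61-join limit
number_chunks = int(math.ceil(len(sample_ids) / float(chunk_size)))
assert number_chunks == 4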