Python standardize Exemples, fastlmm.util.preprocess.standardize Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : PerformSelectionDistributable.py Projet : xiaofeng007/FaST-LMM

def build_kernel_blocked(snpreader, snp_idx=None, blocksize=10000,alt_snpreader=None,allowlowrank=False):
    """Build an (unnormalized) N x N kernel by summing per-block SNP outer products.

    SNPs are read ``blocksize`` at a time, passed through ``up.standardize``
    and ``snps.dot(snps.T)`` is added to a running sum, so only one block of
    SNPs is held at a time.

    Parameters
    ----------
    snpreader : reader used when ``alt_snpreader`` is not supplied.
    snp_idx : optional index into the reader's ``rs`` attribute selecting the
        SNPs to use; ``None`` means "use all SNPs of the reader".
    blocksize : number of SNPs requested per block.
    alt_snpreader : if given, this reader is used for everything instead of
        ``snpreader``.
    allowlowrank : when False, an exception is raised if the reader has fewer
        SNPs than individuals.

    Returns
    -------
    K : N x N array holding the sum of the per-block products. It is NOT
        divided by the number of SNPs; normalization is left to the caller.

    Raises
    ------
    Exception
        If ``allowlowrank`` is False and ``alt_snpreader.snp_count < N``.
    """
    if alt_snpreader is None:
        alt_snpreader = snpreader

    # N = number of individuals: the subset in use if one is set, else all.
    if hasattr(alt_snpreader,"ind_used") and alt_snpreader.ind_used is not None:
        N = len(alt_snpreader.ind_used)
    else:
        N = len(alt_snpreader.original_iids)

    t0 = time.time()

    K = sp.zeros([N,N])
    num_snps = alt_snpreader.snp_count

    # Identity test on purpose: `!= None` on a NumPy index array is evaluated
    # element-wise and cannot be used as an `if` condition.
    use_subset = snp_idx is not None

    if use_subset:
        snp_names = alt_snpreader.rs[snp_idx]
        current_size = len(snp_names)
        logging.info("reading %i SNPs in blocks of %i and adding up kernels" % (len(snp_idx), blocksize))
    else:
        current_size = num_snps
        logging.info("constructing K from all %i SNPs (for %i individuals)" % (num_snps, N))

    ts = time.time()

    if (not allowlowrank) and alt_snpreader.snp_count<N: raise Exception("need to adjust code to handle low rank")

    for start in xrange(0, current_size, blocksize):
        if use_subset:
            tmp_set = SnpAndSetName('someset', snp_names[start:start+blocksize])
        else:
            tmp_set = PositionRange(start, blocksize)

        snps = alt_snpreader.read(tmp_set)['snps']
        snps = up.standardize(snps)

        K += snps.dot(snps.T)

        # Progress counter: the last block may be shorter than `blocksize`,
        # so cap at the number of SNPs actually requested.
        ct = min(start + blocksize, current_size)
        logging.info("read %s SNPs in %.2f seconds" % (ct, time.time()-ts))

    # K is deliberately returned as the raw sum (no scaling, no jitter on the
    # diagonal).
    t1 = time.time()
    logging.info("%.2f seconds elapsed" % (t1-t0))

    return K

Exemple #2

0

Afficher le fichier

Fichier : PerformSelectionDistributable.py Projet : gaow/FaST-LMM

def build_kernel_blocked(snpreader, snp_idx=None, blocksize=10000,alt_snpreader=None,allowlowrank=False):
    """Accumulate an N x N kernel from blocks of standardized SNPs.

    Each block of at most ``blocksize`` SNPs is read, standardized with
    ``up.standardize`` and its product ``snps.dot(snps.T)`` is added to the
    result, which keeps memory bounded by one block plus the kernel.

    Parameters
    ----------
    snpreader : reader used when ``alt_snpreader`` is None.
    snp_idx : optional index into the reader's ``rs`` attribute choosing the
        SNPs to include; ``None`` selects every SNP of the reader.
    blocksize : number of SNPs requested per read.
    alt_snpreader : optional reader that replaces ``snpreader`` entirely.
    allowlowrank : if False, refuse to run when the reader has fewer SNPs
        than individuals.

    Returns
    -------
    K : N x N array with the un-scaled sum of block products (the caller is
        responsible for dividing by the SNP count if desired).

    Raises
    ------
    Exception
        If ``allowlowrank`` is False and ``alt_snpreader.snp_count < N``.
    """
    if alt_snpreader is None:
        alt_snpreader = snpreader

    # Number of individuals: prefer the active subset when the reader has one.
    if hasattr(alt_snpreader,"ind_used") and alt_snpreader.ind_used is not None:
        N = len(alt_snpreader.ind_used)
    else:
        N = len(alt_snpreader.original_iids)

    t0 = time.time()

    K = sp.zeros([N,N])
    num_snps = alt_snpreader.snp_count

    # `is not None` rather than `!= None`: the latter is element-wise for
    # NumPy arrays and therefore not a valid truth test.
    use_subset = snp_idx is not None

    if use_subset:
        snp_names = alt_snpreader.rs[snp_idx]
        current_size = len(snp_names)
        logging.info("reading %i SNPs in blocks of %i and adding up kernels" % (len(snp_idx), blocksize))
    else:
        current_size = num_snps
        logging.info("constructing K from all %i SNPs (for %i individuals)" % (num_snps, N))

    ts = time.time()

    if (not allowlowrank) and alt_snpreader.snp_count<N: raise Exception("need to adjust code to handle low rank")

    for start in range(0, current_size, blocksize):
        if use_subset:
            tmp_set = SnpAndSetName('someset', snp_names[start:start+blocksize])
        else:
            tmp_set = PositionRange(start, blocksize)

        snps = alt_snpreader.read(tmp_set)['snps']
        snps = up.standardize(snps)

        K += snps.dot(snps.T)

        # Report the number of SNPs covered so far; capped because the final
        # block can be shorter than `blocksize`.
        ct = min(start + blocksize, current_size)
        logging.info("read %s SNPs in %.2f seconds" % (ct, time.time()-ts))

    # No normalization and no diagonal jitter are applied here.
    t1 = time.time()
    logging.info("%.2f seconds elapsed" % (t1-t0))

    return K

Exemple #3

0

Afficher le fichier

Fichier : tests_util.py Projet : bdepardo/FaST-LMM

def set_snps0(SNPs0,sample_size,i_exclude=None, forcefullrank=False,blocksize=10000):
    '''
    Return the normalized null-model SNP representation as ``(G0, K0)``.

    Exactly one of the two is non-None (both are None when SNPs0 is None):
      * full rank: K0 is a kernel built block-wise via
        ``psd.build_kernel_blocked`` and divided by the number of SNPs;
      * low rank:  G0 is the standardized SNP matrix divided by
        sqrt(number of SNPs), with all SNPs loaded into memory.

    The un-normalized result is cached in SNPs0 under "K" (full rank) or
    "data" (low rank), so SNPs0 is modified on the first call and the cached
    entry is reused afterwards.

    Parameters:
    SNPs0         : dict with "num_snps" and either a cached "K" / "data"
                    entry or "snp_set" and "reader" entries; may be None
    sample_size   : number of individuals; the full-rank path is taken when
                    len(SNPs0["snp_set"]) exceeds it
    i_exclude     : optional mask of SNPs to leave out (used with ``~`` and
                    ``.sum()``, so presumably a boolean NumPy array)
    forcefullrank : take the full-rank path regardless of the SNP count
    blocksize     : block size handed to psd.build_kernel_blocked
    '''
    if SNPs0 is None:
        return None, None
    # `in` instead of dict.has_key(): has_key no longer exists on Python 3
    # dicts, while `in` behaves the same on Python 2 and 3.
    if "K" in SNPs0:
        K0 = SNPs0["K"]
        G0 = None
    elif "data" in SNPs0:
        K0 = None
        G0 = SNPs0["data"]["snps"]
    else:
        #full rank
        if len(SNPs0["snp_set"]) > sample_size or forcefullrank:
            SNPs0["K"] = psd.build_kernel_blocked(snpreader=SNPs0["reader"], snp_idx=SNPs0["snp_set"].to_index, blocksize=blocksize,allowlowrank=forcefullrank)
            K0 = SNPs0["K"]
            G0 = None
        else:
            #low rank
            K0 = None
            SNPs0["data"] = SNPs0["snp_set"].read()
            SNPs0["data"]["snps"] = up.standardize(SNPs0["data"]["snps"])
            G0 = SNPs0["data"]["snps"]

    #lrt_up should never do exclusion, because set_snps0 should only get called once, in run_once, without exclusion.
    #So exclusion is only for score test and lrt.
    if i_exclude is not None:
        if K0 is not None:
            #Full rank: subtract the kernel contributed by the excluded SNPs.
            #In principle, low rank updates could make this faster for score when the number of
            #excluded SNPs is small; the current approach is cubic in N in the outer loop only once.
            K_up = psd.build_kernel_blocked(snpreader=SNPs0["reader"], snp_idx=np.array(SNPs0["snp_set"].to_index)[i_exclude], blocksize=blocksize,allowlowrank=forcefullrank)
            K0 = K0 - K_up
        elif G0 is not None:
            #Low rank: simply drop the excluded columns.
            G0 = G0[:,~i_exclude]
        num_snps = SNPs0["num_snps"] - i_exclude.sum()
    else:
        num_snps = SNPs0["num_snps"]

    #normalize by the number of SNPs that remain
    if K0 is not None:
        K0 = K0 / num_snps
    elif G0 is not None:
        #dividing G by sqrt(num_snps) scales G.dot(G.T) by 1/num_snps
        G0 = G0 / np.sqrt( num_snps )
    return G0, K0

Exemple #4

0

Afficher le fichier

Fichier : tests_util.py Projet : lucasmiranda42/FaST-LMM

def set_snps0(SNPs0,sample_size,i_exclude=None, forcefullrank=False,blocksize=10000):
    '''
    Produce the normalized pair ``(G0, K0)`` describing the null-model SNPs.

    Full rank: the kernel is assembled block by block and returned as K0.
    Low rank:  every SNP is loaded into memory and returned as G0.
    The raw (un-normalized) result is stored back into SNPs0 under "K" or
    "data" and reused when the function is called again.
    '''
    if SNPs0 is None:
        return None, None

    reuse_kernel = "K" in SNPs0
    reuse_data = (not reuse_kernel) and "data" in SNPs0

    G0 = None
    K0 = None
    if reuse_kernel:
        K0 = SNPs0["K"]
    elif reuse_data:
        G0 = SNPs0["data"]["snps"]
    elif len(SNPs0["snp_set"]) > sample_size or forcefullrank:
        # full rank: build the kernel once and remember it
        kernel = psd.build_kernel_blocked(
            snpreader=SNPs0["reader"],
            snp_idx=SNPs0["snp_set"].to_index,
            blocksize=blocksize,
            allowlowrank=forcefullrank,
        )
        SNPs0["K"] = kernel
        K0 = SNPs0["K"]
    else:
        # low rank: read all SNPs, standardize them and remember them
        SNPs0["data"] = SNPs0["snp_set"].read()
        SNPs0["data"]["snps"] = up.standardize(SNPs0["data"]["snps"])
        G0 = SNPs0["data"]["snps"]

    # Exclusion is only expected for the score test and lrt; lrt_up calls
    # this function once (in run_once) without any exclusion.
    if i_exclude is None:
        num_snps = SNPs0["num_snps"]
    else:
        if K0 is not None:
            # Remove the excluded SNPs by subtracting their kernel. Low rank
            # updates could be cheaper when few SNPs are excluded; as written
            # the cost is cubic in N once in the outer loop.
            excluded_idx = np.array(SNPs0["snp_set"].to_index)[i_exclude]
            K_up = psd.build_kernel_blocked(
                snpreader=SNPs0["reader"],
                snp_idx=excluded_idx,
                blocksize=blocksize,
                allowlowrank=forcefullrank,
            )
            K0 = K0 - K_up
        elif G0 is not None:
            keep = ~i_exclude
            G0 = G0[:, keep]
        num_snps = SNPs0["num_snps"] - i_exclude.sum()

    # Scale so that the kernel (explicit or implied by G0) is divided by the
    # number of SNPs that remain.
    if K0 is not None:
        K0 = K0 / num_snps
    elif G0 is not None:
        scale = np.sqrt(num_snps)
        G0 = G0 / scale
    return G0, K0

Exemple #5

0

Afficher le fichier

def TESTBEFOREUSING_score_testfilesFromDir(phenofile,
                                           base0,
                                           pedfilesalt,
                                           covarfile=None,
                                           outfile=None,
                                           ipheno=0,
                                           mindist=-1.0,
                                           idist=2,
                                           filetype='PED'):
    '''
    given a list of basefilenames that define alternative models and a basefilename of the null model
    test all alternative models with a score test
    --------------------------------------------------------------------------
    Input:
    phenofile   : filename of the phenotype file
    base0       : basefilename of the .ped and .map files containing the
                  null-model SNPs
    pedfilesalt : [Nalt] list of basefilenames of the Nalt .ped and .map files
                  containing the alternative-model SNPs
    covarfile   : filename of the covariates file (default None, in this case
                  only a bias is used)
    outfile     : filename of the output file (default None). Writing is not
                  implemented yet: a reminder is printed and nothing is
                  written to disk.
    ipheno      : 0-based index of the phenotype to be analyzed (default 0)
    mindist     : minimum distance for SNPs to be included in null model
                  (default -1.0: no exclusion in this case)
    idist       : index in pos array that the exclusion is based on.
                  (1=genetic distance, 2=basepair distance)
    filetype    : plink filetype of the null-model input (default 'PED')
                    'PED'   : PED file format
                    'BED'   : BED file format
    --------------------------------------------------------------------------
    Output dictionary:
    'filenames'           : [Nalt] array of basefilenames
    'squaredform',
    'expectationsqform',
    'varsqform'           : [Nalt] arrays with the three values returned by
                            scoreNoK (no null-model SNPs)
    'P'                   : [Nalt] array, first value returned by pv_mom for
                            the three arrays above
    'squaredform2K',
    'expectationsqform2K',
    'varsqform2K'         : [Nalt] arrays with the three values returned by
                            scoretest(...).score (null model built from G0)
    'P2K'                 : [Nalt] array, first value returned by pv_mom for
                            the three "2K" arrays
    'nexclude'            : [Nalt] array of numbers of SNPs excluded
    --------------------------------------------------------------------------
    Raises:
    ValueError if filetype is neither 'PED' nor 'BED'
    --------------------------------------------------------------------------
    '''
    pheno = pstpheno.loadPhen(filename=phenofile, missing='-9', pheno=None)
    if covarfile is None:
        X = SP.ones((pheno['vals'].shape[0], 1))
    else:
        covar = pstpheno.loadPhen(filename=covarfile, missing='-9', pheno=None)
        X = SP.hstack((SP.ones((pheno['vals'].shape[0], 1)), covar['vals']))
    if filetype == 'PED':
        SNPs0 = plink.readPED(basefilename=base0,
                              delimiter=' ',
                              missing='0',
                              standardize=True,
                              pheno=None)
    elif filetype == 'BED':
        SNPs0 = plink.readBED(basefilename=base0)
        SNPs0['snps'] = util.standardize(SNPs0['snps'])
    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError("unknown filetype %r (expected 'PED' or 'BED')" %
                         (filetype, ))

    y = pheno['vals'][:, ipheno]
    G0 = SNPs0['snps'] / SP.sqrt(SNPs0['snps'].shape[1])

    #build the null model (reused for every alternative without exclusions)
    test2K = scoretest(Y=y[:, SP.newaxis], X=X, K=None, G=G0)

    n_alt = len(pedfilesalt)
    squaredform = SP.zeros(n_alt)
    expectationsqform = SP.zeros(n_alt)
    varsqform = SP.zeros(n_alt)
    squaredform2K = SP.zeros(n_alt)
    expectationsqform2K = SP.zeros(n_alt)
    varsqform2K = SP.zeros(n_alt)
    nexclude = SP.zeros(n_alt)
    Pv = SP.zeros(n_alt)
    Pv2K = SP.zeros(n_alt)

    for i, base1 in enumerate(pedfilesalt):  #iterate over all ped files
        # NOTE(review): alternative files are always read as PED, even when
        # filetype == 'BED' -- confirm this is intended.
        SNPs1 = plink.readPED(basefilename=base1,
                              delimiter=' ',
                              missing='0',
                              standardize=True,
                              pheno=None)
        if mindist >= 0:
            i_exclude = excludeinds(SNPs0['pos'],
                                    SNPs1['pos'],
                                    mindist=mindist,
                                    idist=idist)
            nexclude[i] = i_exclude.sum()
        else:
            nexclude[i] = 0
        G1 = SNPs1['snps'] / SP.sqrt(SNPs1['snps'].shape[1])

        if nexclude[i] > 0:
            # some null-model SNPs are excluded: rebuild the null model
            # without them (i_exclude is only defined when mindist >= 0,
            # which is the only way nexclude[i] can be positive)
            test2Ke = scoretest(Y=y[:, SP.newaxis],
                                X=X,
                                K=None,
                                G=G0[:, ~i_exclude])
            squaredform2K[i], expectationsqform2K[i], varsqform2K[
                i] = test2Ke.score(G=G1)
        else:
            squaredform2K[i], expectationsqform2K[i], varsqform2K[
                i] = test2K.score(G=G1)
        squaredform[i], expectationsqform[i], varsqform[i] = scoreNoK(
            y, X=X, G=G1, sigma2=None)

        #perform moment matching (only the P-value is kept)
        Pv2K[i], _, _ = pv_mom(squaredform2K[i], expectationsqform2K[i],
                               varsqform2K[i])
        Pv[i], _, _ = pv_mom(squaredform[i], expectationsqform[i],
                             varsqform[i])

    ret = {
        'filenames': SP.array(pedfilesalt, dtype='str'),
        'squaredform': squaredform,
        'expectationsqform': expectationsqform,
        'varsqform': varsqform,
        'P': Pv,
        'squaredform2K': squaredform2K,
        'expectationsqform2K': expectationsqform2K,
        'varsqform2K': varsqform2K,
        'nexclude': nexclude,
        'P2K': Pv2K
    }
    if outfile is not None:
        #TODO: write `ret` to outfile.
        # Parenthesized form prints the same text on Python 2 and Python 3.
        print('implement me!')
    return ret

Exemple #6

0

Afficher le fichier

Fichier : _pipelines.py Projet : xiaofeng007/FaST-LMM

def TESTBEFOREUSING_score_testfilesFromDir(phenofile, base0, pedfilesalt, covarfile = None, outfile = None, ipheno=0, mindist = -1.0, idist=2 ,filetype='PED'):
    '''
    given a list of basefilenames that define alternative models and a basefilename of the null model
    test all alternative models with a score test
    --------------------------------------------------------------------------
    Input:
    phenofile   : filename of the phenotype file
    base0       : basefilename of the .ped and .map files containing the
                  null-model SNPs
    pedfilesalt : [Nalt] list of basefilenames of the Nalt .ped and .map files
                  containing the alternative-model SNPs
    covarfile   : filename of the covariates file (default None, in this case
                  only a bias is used)
    outfile     : filename of the output file (default None). Writing is not
                  implemented yet: a reminder is printed and nothing is
                  written to disk.
    ipheno      : 0-based index of the phenotype to be analyzed (default 0)
    mindist     : minimum distance for SNPs to be included in null model
                  (default -1.0: no exclusion in this case)
    idist       : index in pos array that the exclusion is based on.
                  (1=genetic distance, 2=basepair distance)
    filetype    : plink filetype of the null-model input (default 'PED')
                    'PED'   : PED file format
                    'BED'   : BED file format
    --------------------------------------------------------------------------
    Output dictionary:
    'filenames'           : [Nalt] array of basefilenames
    'squaredform',
    'expectationsqform',
    'varsqform'           : [Nalt] arrays with the three values returned by
                            scoreNoK (no null-model SNPs)
    'P'                   : [Nalt] array, first value returned by pv_mom for
                            the three arrays above
    'squaredform2K',
    'expectationsqform2K',
    'varsqform2K'         : [Nalt] arrays with the three values returned by
                            scoretest(...).score (null model built from G0)
    'P2K'                 : [Nalt] array, first value returned by pv_mom for
                            the three "2K" arrays
    'nexclude'            : [Nalt] array of numbers of SNPs excluded
    --------------------------------------------------------------------------
    Raises:
    ValueError if filetype is neither 'PED' nor 'BED'
    --------------------------------------------------------------------------
    '''
    pheno = plink.loadPhen(filename = phenofile, missing ='-9', pheno = None)
    if covarfile is None:
        X = SP.ones((pheno['vals'].shape[0],1))
    else:
        covar = plink.loadPhen(filename = covarfile, missing ='-9', pheno = None)
        X = SP.hstack((SP.ones((pheno['vals'].shape[0],1)),covar['vals']))
    if filetype =='PED':
        SNPs0 = plink.readPED(basefilename = base0, delimiter = ' ',missing = '0',standardize = True, pheno = None)
    elif filetype =='BED':
        SNPs0 = plink.readBED(basefilename = base0)
        SNPs0['snps'] = util.standardize(SNPs0['snps'])
    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError("unknown filetype %r (expected 'PED' or 'BED')" % (filetype,))

    y = pheno['vals'][:,ipheno]
    G0 = SNPs0['snps']/SP.sqrt(SNPs0['snps'].shape[1])

    #build the null model (reused for every alternative without exclusions)
    test2K = scoretest(Y=y[:,SP.newaxis],X=X,K=None,G=G0)

    n_alt = len(pedfilesalt)
    squaredform = SP.zeros(n_alt)
    expectationsqform = SP.zeros(n_alt)
    varsqform = SP.zeros(n_alt)
    squaredform2K = SP.zeros(n_alt)
    expectationsqform2K = SP.zeros(n_alt)
    varsqform2K = SP.zeros(n_alt)
    nexclude = SP.zeros(n_alt)
    Pv = SP.zeros(n_alt)
    Pv2K = SP.zeros(n_alt)

    for i, base1 in enumerate(pedfilesalt):#iterate over all ped files
        # NOTE(review): alternative files are always read as PED, even when
        # filetype == 'BED' -- confirm this is intended.
        SNPs1 = plink.readPED(basefilename = base1, delimiter = ' ',missing = '0',standardize = True, pheno = None)
        if mindist>=0:
            i_exclude =  excludeinds(SNPs0['pos'], SNPs1['pos'], mindist = mindist,idist = idist)
            nexclude[i] = i_exclude.sum()
        else:
            nexclude[i]=0
        G1 = SNPs1['snps']/SP.sqrt(SNPs1['snps'].shape[1])

        if nexclude[i]>0:
            # some null-model SNPs are excluded: rebuild the null model without
            # them (i_exclude is only defined when mindist >= 0, which is the
            # only way nexclude[i] can be positive)
            test2Ke = scoretest(Y=y[:,SP.newaxis],X=X,K=None,G=G0[:,~i_exclude])
            squaredform2K[i], expectationsqform2K[i], varsqform2K[i] = test2Ke.score( G = G1 )
        else:
            squaredform2K[i], expectationsqform2K[i], varsqform2K[i] = test2K.score( G = G1 )
        squaredform[i], expectationsqform[i], varsqform[i] = scoreNoK( y, X = X, G = G1, sigma2=None)

        #perform moment matching (only the P-value is kept)
        Pv2K[i],_,_=pv_mom(squaredform2K[i],expectationsqform2K[i],varsqform2K[i])
        Pv[i],_,_=pv_mom(squaredform[i],expectationsqform[i],varsqform[i])

    ret = {
           'filenames': SP.array(pedfilesalt,dtype = 'str'),
           'squaredform':squaredform,
           'expectationsqform':expectationsqform,
           'varsqform':varsqform,
           'P':Pv,
           'squaredform2K':squaredform2K,
           'expectationsqform2K':expectationsqform2K,
           'varsqform2K':varsqform2K,
           'nexclude':nexclude,
           'P2K':Pv2K
           }
    if outfile is not None:
        #TODO: write `ret` to outfile.
        # Parenthesized form prints the same text on Python 2 and Python 3.
        print('implement me!')
    return ret