Exemple #1
0
def get_fam_means(ids,ped,gts,gts_ids,remove_proband = True, return_famsizes = False):
    """
    Compute the mean genotype (or PGS) of each individual's sibship.

    Used in get_gts_matrix. Only the ids that belong to genotyped sibships of size 2 or more
    (as determined by find_individuals_with_sibs) are kept, and the returned gtarray is indexed by
    that subset of ids.

    If remove_proband=True, the index individual's own genotype/PGS is subtracted from the family
    sum (and one is subtracted from the family size) before the mean is taken.

    If return_famsizes=True, a list [gtarray of means, per-family counts, per-family sums] is
    returned instead of the gtarray alone.
    """
    ids, ids_fams, gts_fams = find_individuals_with_sibs(ids, ped, gts_ids)
    fams = np.unique(ids_fams)
    fams_dict = make_id_dict(fams)
    n_fams = fams.shape[0]
    n_cols = gts.shape[1]
    # Per-family genotype totals and number of genotyped members
    fam_sums = np.zeros((n_fams, n_cols), dtype=gts.dtype)
    fam_counts = np.zeros((n_fams), dtype=int)
    for fam_row, fam_label in enumerate(fams):
        members = np.where(gts_fams == fam_label)[0]
        fam_sums[fam_row, :] = np.sum(gts[members, :], axis=0)
        fam_counts[fam_row] = members.shape[0]
    # Row of each genotyped individual, only needed to remove the proband
    if remove_proband:
        gts_id_dict = make_id_dict(gts_ids)
    # Spread the family totals back out to one row per individual
    G_sib = np.zeros((ids.shape[0], n_cols), dtype=np.float32)
    for row, (ind_id, fam_label) in enumerate(zip(ids, ids_fams)):
        fam_row = fams_dict[fam_label]
        G_sib[row, :] = fam_sums[fam_row, :]
        denom = fam_counts[fam_row]
        if remove_proband:
            G_sib[row, :] = G_sib[row, :] - gts[gts_id_dict[ind_id], :]
            denom = denom - 1
        G_sib[row, :] = G_sib[row, :] / float(denom)
    if not return_famsizes:
        return gtarray(G_sib, ids)
    return [gtarray(G_sib, ids), fam_counts, fam_sums]
Exemple #2
0
def find_par_gts(pheno_ids, ped, gts_id_dict, imp_fams=None):
    """
    Locate each individual's own, paternal and maternal genotypes among the observed/imputed data.

    Used in get_gts_matrix. Returns three arrays with one row/entry per element of pheno_ids:

    'par_status' (n x 2, father then mother): 0 if the parent is observed, 1 if imputed,
        -1 if neither.
    'gt_indices' (n x 3, individual, father, mother): index into the observed genotype array
        (or, for an imputed parent, into the imputed array); -1 where nothing was found.
    'fam_labels': family ID of the individual taken from the first pedigree column.
    """
    n_ind = pheno_ids.shape[0]
    # Start from "nothing found" everywhere
    par_status = np.zeros((n_ind, 2), dtype=int)
    par_status.fill(-1)
    gt_indices = np.zeros((n_ind, 3), dtype=int)
    gt_indices.fill(-1)
    # Row of each individual in the pedigree (keyed on the second pedigree column)
    ped_dict = make_id_dict(ped, 1)
    # Row of each family in the imputed data, when imputed data is available
    if imp_fams is not None:
        fam_dict = make_id_dict(imp_fams)
    fam_labels = np.zeros((n_ind), dtype=ped.dtype)
    for row, ind_id in enumerate(pheno_ids):
        # Own genotype
        if ind_id in gts_id_dict:
            gt_indices[row, 0] = gts_id_dict[ind_id]
        # Everything below needs the pedigree entry
        if ind_id not in ped_dict:
            continue
        ped_row = ped[ped_dict[ind_id], :]
        fam_id = ped_row[0]
        father_id = ped_row[2]
        mother_id = ped_row[3]
        fam_labels[row] = fam_id
        # Observed parents
        if father_id in gts_id_dict:
            gt_indices[row, 1] = gts_id_dict[father_id]
            par_status[row, 0] = 0
        if mother_id in gts_id_dict:
            gt_indices[row, 2] = gts_id_dict[mother_id]
            par_status[row, 1] = 0
        # Fall back on imputed parents for whichever parent is not observed
        if imp_fams is None or fam_id not in fam_dict:
            continue
        imp_index = fam_dict[fam_id]
        # Pedigree columns 4 and 5 hold the string 'False' when the father/mother is to be imputed
        if ped_row[4] == 'False' and par_status[row, 0] != 0:
            gt_indices[row, 1] = imp_index
            par_status[row, 0] = 1
        if ped_row[5] == 'False' and par_status[row, 1] != 0:
            gt_indices[row, 2] = imp_index
            par_status[row, 1] = 1
    return par_status, gt_indices, fam_labels
Exemple #3
0
def get_map_positions(mapfile, gts, min_map_prop=0.5):
    """
    Read a genetic map file and return the genetic position of every SNP in gts.

    The map file must be space-delimited with a header line containing the columns
    'pposition' (physical position) and 'gposition' (genetic position). SNPs in gts
    (matched on gts.pos) that are absent from the map get a linearly interpolated position.

    Args:
        mapfile : path to the map file
        gts : genotype array object exposing .shape and .pos
        min_map_prop : minimum proportion of SNPs in gts that must be present in the map

    Returns:
        numpy array of genetic positions, one per SNP in gts

    Raises:
        ValueError: if the required columns are missing, the genetic positions contain NaN or
            negative values, have no variation, are not sorted, exceed 5000, or fewer than
            min_map_prop of the SNPs are found in the map.
    """
    # Only the header is read here; the context manager closes the file even if reading fails
    with open(mapfile, 'r') as map_file:
        header_line = map_file.readline()
    # Drop the trailing newline before splitting into column names
    map_header = np.array(header_line.split('\n')[0].split(' '))
    if not ('pposition' in map_header and 'gposition' in map_header):
        raise (ValueError(
            'Map file must contain columns pposition and gposition'))
    bp_pos = np.loadtxt(mapfile,
                        usecols=np.where(map_header == 'pposition')[0][0],
                        dtype=int,
                        skiprows=1)
    pos_dict = make_id_dict(bp_pos)
    cm_pos = np.loadtxt(mapfile,
                        usecols=np.where(map_header == 'gposition')[0][0],
                        dtype=float,
                        skiprows=1)
    # Check for NAs
    if np.sum(np.isnan(cm_pos)) > 0:
        raise (ValueError('Map cannot have NAs'))
    if np.min(cm_pos) < 0:
        raise (ValueError('Map file cannot have negative values'))
    if np.var(cm_pos) == 0:
        raise (ValueError('Map file has no variation'))
    # Check ordering
    if not np.array_equal(cm_pos, np.sort(cm_pos)):
        raise (ValueError(
            'Map not monotonic. Please make sure input is ordered correctly'
        ))
    # Check scale
    if np.max(cm_pos) > 5000:
        raise (ValueError('Maximum value of map too large'))
    # Find positions of SNPs in map file (named snp_map to avoid shadowing the builtin map)
    snp_map = np.zeros((gts.shape[1]), dtype=float)
    snp_map[:] = np.nan
    in_map = np.array([x in pos_dict for x in gts.pos])
    # Check that a sufficient proportion of SNPs is in the map
    prop_in_map = np.mean(in_map)
    if prop_in_map < min_map_prop:
        raise (ValueError('Only ' + str(round(100 * prop_in_map)) +
                          '% of SNPs have genetic positions in ' +
                          mapfile + '. Need at least ' +
                          str(round(100 * min_map_prop)) + '%'))
    print('Found genetic map positions for ' +
          str(round(100 * prop_in_map)) + '% of SNPs in ' + mapfile)
    # Fill in map values
    snp_map[in_map] = cm_pos[[pos_dict[x] for x in gts.pos[in_map]]]
    # Linearly interpolate map
    if prop_in_map < 1:
        print(
            'Linearly interpolating genetic map for SNPs not in input map')
        snp_map = np.interp(gts.pos, gts.pos[in_map], snp_map[in_map])
    return snp_map
Exemple #4
0
def get_indices_given_ped(ped, gts_ids, imp_fams=None, ids=None, sib=False, verbose=False):
    """
    Select the individuals whose own and parental genotypes are all available (observed or imputed).

    Used in get_gts_matrix_given_ped. When sib=True the candidates are instead all genotyped
    individuals that have at least one genotyped sibling.

    Returns a tuple (ids, observed_indices, imp_indices, parcount):
        ids : the retained individual ids
        observed_indices : sorted unique indices, in the observed genotypes, of the retained
            individuals and their observed parents
        imp_indices : sorted unique indices of the imputed parental genotypes that are needed
        parcount : number of observed parents (0, 1 or 2) for each retained individual

    Raises ValueError if no individual has complete genotype information.
    """
    # Index of every genotyped individual
    gts_id_dict = make_id_dict(gts_ids)
    # Default to every genotyped individual
    if ids is None:
        ids = gts_ids
    if sib:
        # Search the full genotyped sample in case some genotyped sibs are not in ids
        ids = find_individuals_with_sibs(gts_ids, ped, gts_ids, return_ids_only=True)
        if verbose:
            print('Found ' + str(ids.shape[0]) + ' individuals with genotyped siblings')
    ### Find parental status
    if verbose:
        print('Checking for observed/imputed parental genotypes')
    par_status, gt_indices, fam_labels = find_par_gts(ids, ped, gts_id_dict, imp_fams=imp_fams)
    # An individual is usable when own, paternal and maternal indices were all found (none is -1)
    usable = np.min(gt_indices, axis=1) >= 0
    N = np.sum(usable)
    if N == 0:
        raise ValueError(
            'No individuals with phenotype observations and complete observed/imputed genotype observations')
    gt_indices = gt_indices[usable, :]
    par_status = par_status[usable, :]
    ids = ids[usable]
    # Number of observed parents per individual
    parcount = np.sum(par_status==0,axis=1)
    if verbose:
        print(str(N) + ' individuals with phenotype observations and complete observed/imputed genotype observations')
        print(str(np.sum(parcount==0))+' individuals with imputed but no observed parental genotypes')
        print(str(np.sum(parcount==1))+' individuals with one observed and one imputed parent')
        print(str(np.sum(parcount==2))+' individuals with both parents observed')
    father_status = par_status[:, 0]
    mother_status = par_status[:, 1]
    # Individuals plus whichever parents are observed
    observed_pool = np.hstack((gt_indices[:, 0],
                               gt_indices[father_status == 0, 1],
                               gt_indices[mother_status == 0, 2]))
    observed_indices = np.sort(np.unique(observed_pool))
    # Whichever parents are imputed
    imputed_pool = np.hstack((gt_indices[father_status == 1, 1],
                              gt_indices[mother_status == 1, 2]))
    imp_indices = np.sort(np.unique(imputed_pool))
    return ids, observed_indices, imp_indices, parcount
Exemple #5
0
 def __init__(self,chrom,sid,pos, A1, A2, freqs, direct, direct_SE, avg_NTC, avg_NTC_SE, population, population_SE, r_direct_avg_NTC, r_direct_pop, ldscores = None, map=None):
     """
     Store per-SNP summary statistics for one chromosome.

     chrom is a single value converted with int() and repeated for every SNP. sid, A1 and A2 are
     stored as string arrays and pos as an integer array. All numeric statistics are stored as
     float masked arrays with NaN entries masked. ldscores and map are optional per-SNP arrays,
     stored the same way when given and as None otherwise.

     Raises ValueError if the per-SNP inputs do not all have the same length, or if ldscores/map
     is given with a length different from sid.
     """
     def _nan_masked(values):
         # Float masked array in which every NaN entry is masked
         arr = ma.array(values, dtype=float)
         arr.mask = np.isnan(arr)
         return arr

     # Every required per-SNP input, standard errors included, must have one entry per SNP
     per_snp_inputs = (sid, pos, A1, A2, freqs, direct, direct_SE, avg_NTC, avg_NTC_SE,
                       population, population_SE, r_direct_avg_NTC, r_direct_pop)
     if len({len(x) for x in per_snp_inputs}) > 1:
         raise(ValueError('All inputs to sumstats class must have same size'))
     self.chrom = np.zeros(sid.shape,dtype=int)
     self.chrom[:] = int(chrom)
     self.sid = np.array(sid,dtype=str)
     self.sid_dict = make_id_dict(self.sid)
     self.pos = np.array(pos,dtype=int)
     self.A1 = np.array(A1,dtype=str)
     self.A2 = np.array(A2,dtype=str)
     self.freqs = _nan_masked(freqs)
     self.direct = _nan_masked(direct)
     self.direct_SE = _nan_masked(direct_SE)
     self.avg_NTC = _nan_masked(avg_NTC)
     self.avg_NTC_SE = _nan_masked(avg_NTC_SE)
     self.population = _nan_masked(population)
     self.population_SE = _nan_masked(population_SE)
     self.r_direct_avg_NTC = _nan_masked(r_direct_avg_NTC)
     self.r_direct_pop = _nan_masked(r_direct_pop)
     if ldscores is not None:
         if not ldscores.shape[0] == sid.shape[0]:
             raise(ValueError('LD scores must have same size as other sumstats'))
         self.ldscores = _nan_masked(ldscores)
     else:
         self.ldscores = None
     if map is not None:
         if not map.shape[0] == sid.shape[0]:
             raise(ValueError('Map must have same size as other sumstats'))
         self.map = _nan_masked(map)
     else:
         self.map = None
Exemple #6
0
 def filter(self,filter_pass):
     """
     Keep only the SNPs selected by filter_pass, in place.

     filter_pass is used directly as a NumPy index on every per-SNP attribute; the SNP id lookup
     (sid_dict) is rebuilt from the filtered ids. ldscores and map are filtered only when set.
     """
     self.chrom = self.chrom[filter_pass]
     self.sid = self.sid[filter_pass]
     # Positions changed, so the id -> index lookup has to be rebuilt
     self.sid_dict = make_id_dict(self.sid)
     required = ('pos', 'A1', 'A2', 'freqs',
                 'direct', 'direct_SE',
                 'avg_NTC', 'avg_NTC_SE',
                 'population', 'population_SE',
                 'r_direct_avg_NTC', 'r_direct_pop')
     for attr in required:
         setattr(self, attr, getattr(self, attr)[filter_pass])
     # Optional per-SNP arrays
     for attr in ('ldscores', 'map'):
         current = getattr(self, attr)
         if current is not None:
             setattr(self, attr, current[filter_pass])
Exemple #7
0
 def concatenate(self,s2):
     """
     Append the SNPs of another sumstats object, s2, to this one in place.

     All required per-SNP attributes are joined (self first, then s2) and sid_dict is rebuilt.
     The optional ldscores and map arrays are joined only when both objects have them; if either
     object lacks one, the attribute is set to None so that it can never be left with fewer
     entries than sid.
     """
     self.chrom = np.hstack((self.chrom,s2.chrom))
     self.sid = np.hstack((self.sid, s2.sid))
     self.sid_dict = make_id_dict(self.sid)
     self.pos = np.hstack((self.pos, s2.pos))
     self.A1 = np.hstack((self.A1, s2.A1))
     self.A2 = np.hstack((self.A2, s2.A2))
     self.freqs = ma.concatenate([self.freqs, s2.freqs])
     self.direct = ma.concatenate([self.direct, s2.direct])
     self.direct_SE = ma.concatenate([self.direct_SE, s2.direct_SE])
     self.avg_NTC = ma.concatenate([self.avg_NTC, s2.avg_NTC])
     self.avg_NTC_SE = ma.concatenate([self.avg_NTC_SE, s2.avg_NTC_SE])
     self.population = ma.concatenate([self.population, s2.population])
     self.population_SE = ma.concatenate([self.population_SE, s2.population_SE])
     self.r_direct_avg_NTC = ma.concatenate([self.r_direct_avg_NTC, s2.r_direct_avg_NTC])
     self.r_direct_pop = ma.concatenate([self.r_direct_pop, s2.r_direct_pop])
     if self.ldscores is not None and s2.ldscores is not None:
         self.ldscores = ma.concatenate([self.ldscores, s2.ldscores])
     else:
         # Keeping the old, shorter array would desynchronise it from sid
         self.ldscores = None
     if self.map is not None and s2.map is not None:
         self.map = ma.concatenate([self.map, s2.map])
     else:
         # Same reasoning as for ldscores
         self.map = None
Exemple #8
0
 def filter_ids(self,keep_ids, verbose=False):
     """
     Keep only individuals with ids given by keep_ids

     The retained individuals are ordered as they appear in keep_ids; ids in keep_ids that are
     not present in this object are ignored. The genotypes, ids, id lookup and shape are updated
     in place, together with the per-individual fams and par_status arrays when they are set.

     Raises ValueError if none of keep_ids is present.
     """
     in_ids = np.array([x in self.id_dict for x in keep_ids])
     n_filtered = np.sum(in_ids)
     if n_filtered==0:
         raise(ValueError('No individuals would be left after filtering'))
     if verbose:
         print('After filtering, '+str(n_filtered)+' individuals remain')
     indices = np.array([self.id_dict[x] for x in keep_ids[in_ids]])
     # Individuals are on the first axis whatever the number of dimensions
     self.gts = self.gts[indices]
     self.ids = self.ids[indices]
     self.id_dict = make_id_dict(self.ids)
     self.shape = self.gts.shape
     if self.fams is not None:
         self.fams = self.fams[indices]
     # par_status has one row per individual, so it must follow the same subsetting;
     # getattr guards against objects that never had the attribute set
     par_status = getattr(self, 'par_status', None)
     if par_status is not None:
         self.par_status = par_status[indices]
Exemple #9
0
 def filter(self, filter_pass):
     """
     Keep only the SNPs selected by filter_pass, in place.

     filter_pass indexes the last axis of the genotype array (for 2- and 3-dimensional arrays)
     and every per-SNP attribute that is set (freqs, sid, pos, alleles, chrom, map, error_probs).
     The shape is refreshed and sid_dict is rebuilt when SNP ids are present.
     """
     def _subset(attr):
         # Filter the named attribute when it is set; report whether it was set
         current = getattr(self, attr)
         if current is None:
             return False
         setattr(self, attr, current[filter_pass])
         return True

     _subset('freqs')
     if self.ndim == 2:
         self.gts = self.gts[:, filter_pass]
     elif self.ndim == 3:
         self.gts = self.gts[:, :, filter_pass]
     self.shape = self.gts.shape
     if _subset('sid'):
         # Positions changed, so the id -> index lookup has to be rebuilt
         self.sid_dict = make_id_dict(self.sid)
     for attr in ('pos', 'alleles', 'chrom', 'map', 'error_probs'):
         _subset(attr)
Exemple #10
0
def match_phenotype(G,y,pheno_ids):
    """Reorder a phenotype vector so that it lines up with the individuals of a genotype array.

    Phenotype entries whose ID is absent from the genotype array are dropped; the remaining
    entries are returned in the order of ``G.ids``. Every ID in ``G.ids`` is looked up among the
    phenotype IDs, so each one is expected to have a phenotype observation.

    Args:
        G : :class:`gtarray`
            genotype array to match phenotype to
        y : :class:`~numpy:numpy.array`
            vector of phenotype values
        pheno_ids: :class:`~numpy:numpy.array`
            vector of individual IDs corresponding to phenotype vector, y

    Returns:
       y : :class:`~numpy:numpy.array`
            vector of phenotype values matched by individual IDs to the genotype array

    """
    # Phenotyped individuals that also have genotypes
    has_genotype = np.array([pid in G.id_dict for pid in pheno_ids])
    matched_y = y[has_genotype]
    matched_ids = pheno_ids[has_genotype]
    # Position of each retained phenotype, then pick them out in genotype order
    position_of = make_id_dict(matched_ids)
    genotype_order = [position_of[gid] for gid in G.ids]
    return matched_y[genotype_order]
Exemple #11
0
def find_individuals_with_sibs(ids, ped, gts_ids, return_ids_only=False):
    """
    Used in get_gts_matrix and get_fam_means to find the individuals in ids that have genotyped siblings.

    A family (first pedigree column) counts as a sibship when at least two of the genotyped
    individuals in gts_ids are assigned to it by the pedigree.

    Returns the subset of ids that are in the pedigree and belong to such a sibship. Unless
    return_ids_only=True, also returns the family label of each retained id (ids_fams) and the
    family label of every genotyped individual (gts_fams, left at the zero value of the
    pedigree dtype for individuals not in the pedigree).
    """
    # Find genotyped sibships of size > 1
    ped_dict = make_id_dict(ped, 1)
    # dtype=bool keeps the masks usable as indices even when the input is empty
    gts_in_ped = np.array([x in ped_dict for x in gts_ids], dtype=bool)
    # Family labels come from the pedigree, so they are stored with the pedigree dtype;
    # a fixed-width string array sized for the individual ids would silently truncate them
    gts_fams = np.zeros((gts_ids.shape[0]), dtype=ped.dtype)
    gts_fams[gts_in_ped] = np.array(
        [ped[ped_dict[x], 0] for x in gts_ids[gts_in_ped]], dtype=ped.dtype)
    fams, counts = np.unique(gts_fams[gts_in_ped], return_counts=True)
    sibships = set(fams[counts > 1])
    # Find individuals with genotyped siblings
    ids_in_ped = np.array([x in ped_dict for x in ids], dtype=bool)
    ids = ids[ids_in_ped]
    ids_fams = np.array([ped[ped_dict[x], 0] for x in ids], dtype=ped.dtype)
    ids_with_sibs = np.array([x in sibships for x in ids_fams], dtype=bool)
    ids = ids[ids_with_sibs]
    ids_fams = ids_fams[ids_with_sibs]
    if return_ids_only:
        return ids
    return ids, ids_fams, gts_fams
Exemple #12
0
def infer_ibd_chr(sibpairs, error_prob, error_probs, outprefix, bedfile=None, bgenfile=None, chrom=None, min_length=0.01, mapfile=None, ibdmatrix=False, ld_out=False, min_maf=0.01, max_missing=5, max_error=0.01):
    """
    Infer IBD between sibling pairs for one chromosome and write the results to disk.

    Genotypes are read from exactly one of bedfile or bgenfile, SNPs are filtered on MAF,
    missingness and genotyping error probability, genetic map positions are obtained, LD-based
    weights are computed, and IBD is inferred and smoothed. The IBD segments are always written;
    the IBD matrix and the LD scores are written only on request.

    Args:
        sibpairs : two-column array of sibling IDs, one pair per row
        error_prob : genotyping error probability used for every SNP without its own estimate
        error_probs : object exposing sid_dict and error_ests with per-SNP estimates, or None
        outprefix : prefix handed to outfile_name to build the output paths
        bedfile, bgenfile : path to the genotype file; exactly one must be given
        chrom : chromosome label; read from the genotype file when None
        min_length : passed to smooth_ibd
        mapfile : genetic map file read with get_map_positions; when None the map is taken from
            the .bim file (bed input) or from decode_map_from_pos
        ibdmatrix : also write the per-SNP IBD matrix
        ld_out : also write the LD scores
        min_maf, max_missing : passed to gts.filter_maf and gts.filter_missingness
        max_error : SNPs with error probability not below this value are removed

    Raises:
        ValueError: if neither or both genotype files are given, the genotype file holds more
            than one chromosome, no sibling pair is fully genotyped, or the map in the .bim file
            is invalid.
    """
    if bedfile is None and bgenfile is None:
        raise(ValueError('Must provide either bed file or bgenfile'))
    if bedfile is not None and bgenfile is not None:
        raise(ValueError('Provide either bed file or bgen file. Not both.'))
    if bedfile is not None:
        ## Read bed
        print('Reading genotypes from ' + bedfile)
        # Swap only a trailing '.bed' for '.bim'; splitting on the first '.bed' would cut the
        # path short whenever '.bed' also occurs earlier in it
        if bedfile.endswith('.bed'):
            bimfile = bedfile[:-len('.bed')] + '.bim'
        else:
            bimfile = bedfile.split('.bed')[0] + '.bim'
        # Determine chromosome
        if chrom is None:
            chrom = np.loadtxt(bimfile, usecols=0, dtype=str)
            chrom = np.unique(chrom)
            if chrom.shape[0] > 1:
                raise (ValueError('More than 1 chromosome in input bedfile'))
            else:
                chrom = chrom[0]
        print('Inferring IBD for chromosome ' + str(chrom))
        # Read sibling genotypes from bed file
        gts = read_sibs_from_bed(bedfile, sibpairs)
    elif bgenfile is not None:
        ## Read bgen
        print('Reading genotypes from ' + bgenfile)
        # Determine chromosome
        if chrom is None:
            bgen = open_bgen(bgenfile,verbose=False)
            chrom = bgen.chromosomes
            chrom = np.unique(chrom)
            if chrom.shape[0] > 1:
                raise (ValueError('More than 1 chromosome in input bgenfile'))
            else:
                chrom = chrom[0]
                # An empty chromosome label is replaced by 0
                if chrom=='':
                    chrom = 0
        print('Inferring IBD for chromosome ' + str(chrom))
        # Read sibling genotypes from bgen file
        gts = read_sibs_from_bgen(bgenfile, sibpairs)
    # Calculate allele frequencies
    print('Calculating allele frequencies')
    gts.compute_freqs()
    # Check which sibling pairs have genotypes
    sibpair_indices = np.zeros((sibpairs.shape), dtype=bool)
    sibpair_indices[:, 0] = np.array([x in gts.id_dict for x in sibpairs[:, 0]])
    sibpair_indices[:, 1] = np.array([x in gts.id_dict for x in sibpairs[:, 1]])
    sibpairs = sibpairs[np.sum(sibpair_indices, axis=1) == 2, :]
    if sibpairs.shape[0] == 0:
        raise (ValueError('No genotyped sibling pairs found'))
    print(str(np.sum(sibpairs.shape[0])) + ' sibpairs have genotypes')
    # Find indices of sibpairs
    sibpair_indices = np.zeros((sibpairs.shape), dtype=int)
    sibpair_indices[:, 0] = np.array([gts.id_dict[x] for x in sibpairs[:, 0]])
    sibpair_indices[:, 1] = np.array([gts.id_dict[x] for x in sibpairs[:, 1]])
    # Filtering on MAF, missingness, and genotyping error
    # Find error probabilities: start from the default and overwrite where an estimate exists
    p_error = np.zeros((gts.sid.shape[0]))
    p_error[:] = error_prob
    if error_probs is not None:
        in_error_probs = np.array([x in error_probs.sid_dict for x in gts.sid])
        error_index = np.array([error_probs.sid_dict[x] for x in gts.sid[in_error_probs]])
        p_error[in_error_probs] = error_probs.error_ests[error_index]
    gts.error_probs = p_error
    # Filter
    print('Before filtering on MAF, missingness, and genotyping error, there were ' + str(gts.shape[1]) + ' SNPs')
    gts.filter_maf(min_maf)
    gts.filter_missingness(max_missing)
    gts.filter(gts.error_probs < max_error)
    print('After filtering, there are ' + str(gts.shape[1]) + ' SNPs')
    # Read map file
    if mapfile is None and bedfile is not None:
        print('Separate genetic map not provided, so attempting to read map from ' + bimfile)
        # Named bim_map to avoid shadowing the builtin map
        bim_map = np.loadtxt(bimfile, usecols=2)
        map_snp_dict = make_id_dict(np.loadtxt(bimfile, usecols=1, dtype=str))
        # A constant map column means the bim file carries no map information
        if np.var(bim_map) == 0:
            print('Map information not found in bim file.')
            # NOTE(review): this message says GRCh38 while the bgen branch below says Hg19 for
            # the same decode_map_from_pos call - confirm which build the default map uses
            print('Using default map (decode sex averaged map on GRCh38 coordinates)')
            gts.map = decode_map_from_pos(chrom, gts.pos)
            pc_mapped = str(round(100*(1-np.mean(np.isnan(gts.map))),2))
            print('Found map positions for '+str(pc_mapped)+'% of SNPs')
            gts.filter(~np.isnan(gts.map))
        else:
            # Check for NAs
            if np.sum(np.isnan(bim_map)) > 0:
                raise (ValueError('Map cannot have NAs'))
            if np.min(bim_map) < 0:
                raise (ValueError('Map file cannot have negative values'))
            # Check ordering
            ordered_map = np.sort(bim_map)
            if np.array_equal(bim_map, ordered_map):
                pass
            else:
                raise (ValueError('Map not monotonic. Please make sure input is ordered correctly'))
            # Check scale
            if np.max(bim_map) > 5000:
                raise (ValueError('Maximum value of map too large'))
            gts.filter(np.array([x in map_snp_dict for x in gts.sid]))
            gts.map = bim_map[[map_snp_dict[x] for x in gts.sid]]
    elif mapfile is None and bgenfile is not None:
        print('Map file not provided.')
        print('Using default map (decode sex averaged map on Hg19 coordinates)')
        gts.map = decode_map_from_pos(chrom, gts.pos)
        pc_mapped = 100*(1-np.mean(np.isnan(gts.map)))
        if pc_mapped < 50:
            print('Warning: map positions not found for the majority of SNPs. Consider providing a genetic map using --map')
        print('Found map positions for '+str(round(pc_mapped,2))+'% of SNPs')
        gts.filter(~np.isnan(gts.map))
    else:
        print('Reading map from ' + str(mapfile))
        gts.map = get_map_positions(mapfile, gts)
    print('Read map')
    # Weights are the inverse LD scores
    print('Computing LD weights')
    # np.float64 is used because the np.float_ alias was removed in NumPy 2.0
    ld = compute_ld_scores(np.array(gts.gts, dtype=np.float64), gts.map, max_dist=1)
    gts.weights = np.power(ld, -1)
    # IBD
    print('Inferring IBD')
    ibd = infer_ibd(sibpair_indices, np.array(gts.gts,dtype=np.float64), gts.freqs, gts.map, gts.weights, gts.error_probs)
    ibd, allsegs = smooth_ibd(ibd, gts.map, gts.sid, gts.pos, min_length)
    ## Write output
    # Write segments
    segs_outfile = outfile_name(outprefix,'.ibd.segments.gz', chrom)
    print('Writing segments to ' + segs_outfile)
    write_segs(sibpairs, allsegs, chrom, segs_outfile)
    # Write matrix: a header row of SNP ids above one row per sibling pair
    if ibdmatrix:
        outfile = outfile_name(outprefix,'.ibdmatrix.gz', chrom)
        print('Writing matrix output to ' + str(outfile))
        # np.vstack replaces np.row_stack, which is deprecated in NumPy 2.0
        ibd = np.vstack(
            (np.column_stack((np.array(['sib1', 'sib2']).reshape((1, 2)), gts.sid.reshape(1, gts.shape[1]))),
             np.column_stack((sibpairs, ibd))))
        np.savetxt(outfile, ibd, fmt='%s')
    if ld_out:
        ld_outfile = outfile_name(outprefix,'.l2.ldscore.gz', chrom)
        print('Writing LD-scores to '+ld_outfile)
        ld_table = np.vstack((np.array(['CHR', 'SNP', 'BP', 'L2']).reshape((1,4)),np.vstack((np.array([chrom for x in gts.sid]), gts.sid, gts.pos, ld)).T))
        np.savetxt(ld_outfile, ld_table, fmt='%s')
Exemple #13
0
    def __init__(self, garray, ids, sid=None, alleles=None, pos=None, chrom=None, map=None, error_probs=None, fams=None, par_status=None):
        """
        Wrap a genotype array together with its individual and SNP annotations.

        garray must be a numpy ndarray (NaN entries are masked) or a masked array (used as is),
        with individuals on the first axis. sid, when given, must match the size of the second or
        third axis; snp_index records which one. alleles, pos, chrom, map and error_probs are
        optional per-SNP arrays that require sid. fams is an optional 1-D per-individual array and
        par_status an optional per-individual array with two columns. Optional attributes that are
        not supplied are stored as None.

        Raises ValueError if garray is not a numpy array, if any size does not match, or if a
        per-SNP annotation is given without sid.
        """
        # MaskedArray is tested first because it is itself an ndarray subclass
        if isinstance(garray, np.ma.MaskedArray):
            self.gts = garray
        elif isinstance(garray, np.ndarray):
            self.gts = ma.array(garray,mask=np.isnan(garray))
        else:
            raise ValueError('Genotypes must be a numpy ndarray')
        self.shape = garray.shape
        self.ndim = garray.ndim
        self.dtype = garray.dtype
        self.freqs = None
        if garray.shape[0] == ids.shape[0]:
            self.ids = ids
            self.id_dict = make_id_dict(ids)
        else:
            raise ValueError('Shape of genotypes and ids does not match')
        if sid is not None:
            # The ndim guards turn a size mismatch into the ValueError below rather than an
            # IndexError from reading an axis the array does not have
            if garray.ndim > 1 and sid.shape[0] == garray.shape[1]:
                self.snp_index = 1
            elif garray.ndim > 2 and sid.shape[0] == garray.shape[2]:
                self.snp_index = 2
            else:
                raise ValueError('Shape of SNP ids (sid) does not match shape of genotype array')
            self.sid = sid
            self.sid_dict = make_id_dict(sid)
        else:
            # Always define the SNP attributes so that later "self.sid is not None" checks work
            self.snp_index = None
            self.sid = None
            self.sid_dict = None

        def _per_snp(values, size_error):
            # Validate an optional per-SNP annotation against sid and return it (or None)
            if values is None:
                return None
            if self.sid is None:
                raise(ValueError('Must provide SNP ids'))
            if values.shape[0] != self.sid.shape[0]:
                raise ValueError(size_error)
            return values

        self.alleles = _per_snp(alleles, 'Size of alleles does not match size of genotypes')
        self.pos = _per_snp(pos, 'Size of position vector does not match size of genotypes')
        self.chrom = _per_snp(chrom, 'Size of chrom does not match number of SNPs')
        self.map = _per_snp(map, 'Size of map does not match number of SNPs')
        self.error_probs = _per_snp(error_probs, 'Size of error_probs does not match number of SNPs')
        if fams is not None:
            if fams.shape[0] == ids.shape[0] and fams.ndim==1:
                self.fams = fams
            else:
                raise ValueError('Fams not of same length as IDs')
        else:
            self.fams = None

        if par_status is not None:
            # ndim is checked first so a 1-D array is rejected with the ValueError below
            if par_status.ndim == 2 and par_status.shape[0] == ids.shape[0] and par_status.shape[1] == 2:
                self.par_status = par_status
            else:
                raise ValueError('Incompatible par status array')
        else:
            self.par_status = None

        self.mean_normalised = False

        # True when at least one genotype is masked
        self.has_NAs = bool(np.sum(self.gts.mask) > 0)

        self.info = None