Example #1
1
def getContigStats(vcf,window_size,outfile,subpops):
    """Write windowed summary statistics for the variants in a VCF to a TSV file.

    The VCF is loaded with scikit-allel, windows are obtained from
    getSubWinBounds / getSnpIndicesInSubWins, and one tab-separated row per
    window is written to `outfile`. Windows containing fewer than 10 SNPs get
    a row whose statistics are all "NA".

    Parameters:
        vcf: path of the VCF file (handed to allel.read_vcf).
        window_size: window length; also used to derive the number of windows.
        outfile: path of the output file; it is overwritten.
        subpops: mapping from population name to sample indices. The keys
            'rufus' and 'sasin' are required.
    """
    #load data
    callset=allel.read_vcf(vcf)
    snps=allel.GenotypeArray(callset['calldata/GT'])
    positions=callset['variants/POS']
    # every row is labelled with the chromosome of the first record
    chrom=callset['variants/CHROM'][0]

    #get window bounds
    max_position=max(positions)
    window_bounds=getSubWinBounds(window_size,max_position)
    window_bound_indices=getSnpIndicesInSubWins(window_bounds,positions)
    # NOTE(review): this may visit fewer windows than getSubWinBounds returns;
    # kept as in the original - confirm against that helper.
    nwindows=max_position//window_size - 1

    # The output path is opened exactly once. Previously the `outfile` name was
    # rebound to the file object, so later open(str(outfile),'a') calls opened
    # files named after the object's repr instead of the requested path.
    with open(str(outfile),'w') as out:
        out.write('chrom\tchromStart\tchromEnd\tnumSites\tfst\ttajD1\ttajD2\tthetaW1\tthetaW2\tdxy_bw\tpi\tdfd\n')

        #loop over windows and print summary stats to file
        for i in range(nwindows):
            snp_indices=window_bound_indices[i]
            row=[chrom,str(window_bounds[i][0]),str(window_bounds[i][1])]

            if len(snp_indices)<10: #too few snps in the window for stable estimates
                row+=[str(0)]+["NA"]*8
            else:
                window_snp_positions=positions[snp_indices]
                window_snps=snps.subset(snp_indices)
                window_ac_all=window_snps.count_alleles()
                window_ac_subpop=window_snps.count_alleles_subpops(subpops=subpops)
                ac_rufus=window_ac_subpop['rufus']
                ac_sasin=window_ac_subpop['sasin']

                #summary stats
                a,b,c=allel.stats.fst.weir_cockerham_fst(window_snps,[subpops['rufus'],subpops['sasin']])
                fst=np.sum(a) / (np.sum(a) + np.sum(b) + np.sum(c))
                tajD1=allel.stats.diversity.tajima_d(ac_rufus)
                tajD2=allel.stats.diversity.tajima_d(ac_sasin)
                thetaW1=allel.stats.diversity.watterson_theta(window_snp_positions,ac_rufus)
                thetaW2=allel.stats.diversity.watterson_theta(window_snp_positions,ac_sasin)
                dxy_bw=allel.stats.diversity.sequence_divergence(window_snp_positions,ac_rufus,ac_sasin)
                pi=allel.stats.diversity.sequence_diversity(window_snp_positions,window_ac_all)
                dfd=allel.stats.diversity.windowed_df(window_snp_positions,ac_rufus,ac_sasin,size=window_size)[0][0]

                stats=(fst,tajD1,tajD2,thetaW1,thetaW2,dxy_bw,pi,dfd)
                row.append(str(window_snps.shape[0]))
                row.extend(str(round(value,6)) for value in stats)

            out.write('\t'.join(row)+'\n')
Example #2
0
def population_statistics(synthetic_population_code, synthetic_genotypes, reference_genotypes, synthetic_positions, reference_positions, reference_samples, classification_map, window_size=2e5):
    """Plot windowed diversity and Tajima's D for a synthetic vs. reference population.

    Reference samples whose 'population' entry in `classification_map` equals
    `synthetic_population_code` are selected, allele counts are computed for
    both genotype sets, and two figures ('<code>.pi.png' and
    '<code>.tajima_d.png') are saved under FIGURES_DIR.
    """
    window_size = int(window_size)
    window_label = 'Windows ({}kb)'.format(window_size // 1000)
    synthetic_label = 'Synthetic {}'.format(synthetic_population_code)
    reference_label = '{}'.format(synthetic_population_code)

    def save_comparison(title, y_label, synthetic_values, reference_values, file_name):
        # One figure: both series plotted against their 1-based window number.
        plt.title(title)
        plt.plot(np.arange(1, len(synthetic_values) + 1), synthetic_values, label=synthetic_label)
        plt.plot(np.arange(1, len(reference_values) + 1), reference_values, label=reference_label)
        plt.xlabel(window_label)
        plt.ylabel(y_label)
        plt.legend()
        plt.savefig(os.path.join(FIGURES_DIR, file_name))
        plt.close(plt.gcf())

    # Keep only the reference samples that belong to the synthetic population.
    labels = np.array([classification_map.loc[sample]['population'] for sample in reference_samples])
    matching_reference = reference_genotypes[:, labels == synthetic_population_code]

    synthetic_ac = allel.GenotypeArray(synthetic_genotypes).count_alleles()
    reference_ac = allel.GenotypeArray(matching_reference).count_alleles()

    synthetic_pi = allel.windowed_diversity(synthetic_positions, synthetic_ac, size=window_size)[0]
    reference_pi = allel.windowed_diversity(reference_positions, reference_ac, size=window_size)[0]
    save_comparison('Nucleotide Diversity Sliding Window Analysis',
                    'Nucleotide Diversity (π)',
                    synthetic_pi, reference_pi,
                    '{}.pi.png'.format(synthetic_population_code))

    synthetic_D = allel.windowed_tajima_d(synthetic_positions, synthetic_ac, size=window_size)[0]
    reference_D = allel.windowed_tajima_d(reference_positions, reference_ac, size=window_size)[0]
    save_comparison("Tajima's D Sliding Window Analysis",
                    "Tajima's D",
                    synthetic_D, reference_D,
                    '{}.tajima_d.png'.format(synthetic_population_code))
Example #3
0
File: core.py Project: niemasd/pixy
def read_and_filter_genotypes(args, chromosome, window_pos_1, window_pos_2,
                              sites_list_chunk):
    """Read one window of genotypes from args.vcf and keep biallelic SNPs and invariant sites.

    Returns a tuple (callset_is_none, gt_array, pos_array). When the VCF has
    no records in the region, or every row is removed by filtering, the tuple
    is (True, None, None).
    """
    # Region string in "chrom:start-end" form for allel.read_vcf.
    region = chromosome + ":" + str(window_pos_1) + "-" + str(window_pos_2)

    callset = allel.read_vcf(args.vcf,
                             region=region,
                             fields=[
                                 'CHROM', 'POS', 'calldata/GT',
                                 'variants/is_snp', 'variants/numalt'
                             ])

    # read_vcf returned nothing for this range; callers use the flag to
    # account for completely missing sites.
    if callset is None:
        return True, None, None

    genotypes = allel.GenotypeArray(
        allel.GenotypeDaskArray(callset['calldata/GT']))
    positions = allel.SortedIndex(callset['variants/POS'])

    # Rows to keep: (is_snp and exactly one ALT) or (no ALT at all).
    num_alt = callset['variants/numalt'][:]
    biallelic_snp = np.logical_and(callset['variants/is_snp'][:] == 1,
                                   num_alt == 1)
    keep = np.logical_or(biallelic_snp, num_alt == 0)

    # Drop the complementary rows from the genotypes; index positions directly.
    genotypes = allel.GenotypeArray(
        np.delete(genotypes, np.where(np.invert(keep)), axis=0))
    positions = positions[keep]

    # Optional restriction to an explicit list of target sites.
    if sites_list_chunk is not None:
        genotypes = mask_non_target_sites(genotypes, positions,
                                          sites_list_chunk)

    # Everything was filtered away: report the window as empty.
    if len(genotypes) == 0:
        return True, None, None

    return False, genotypes, positions
Example #4
0
def extractGenosAndPositionsForArm(vcfFile, chroms,
                                   currChr, sampleIndicesToKeep):
    """Collect genotypes and positions of the records located on `currChr`.

    Returns (genotypes, positions, position -> row index map, biallelic flags).
    When no record or no position matches `currChr`, the result is
    (empty array, [], {}, empty array).
    """
    armRows = [row for row, chrom in enumerate(chroms) if chrom == currChr]
    rawgenos = np.take(vcfFile["calldata/GT"], armRows, axis=0)

    if len(rawgenos) > 0:
        genos = allel.GenotypeArray(rawgenos).subset(sel1=sampleIndicesToKeep)

        # Haploid calls are merged pairwise into diploid individuals.
        if isHaploidVcfGenoArray(genos):
            sys.stderr.write(
                "Detected haploid input for %s. "
                "Converting into diploid individuals "
                "(combining haplotypes in order).\n" % (currChr))
            genos = diploidizeGenotypeArray(genos)
            sys.stderr.write("Done diploidizing %s\n" % (currChr))

        positions = np.extract(chroms == currChr, vcfFile["variants/POS"])
        if len(positions) > 0:
            nSites = len(positions)
            genos = allel.GenotypeArray(genos.subset(sel0=range(nSites)))

            # Later duplicates of a position overwrite earlier ones, which the
            # assertion below would then reject.
            positions2SnpIndices = {
                position: index for index, position in enumerate(positions)
            }

            assert nSites == len(positions2SnpIndices) and nSites == len(genos)
            isBiallelic = genos.count_alleles().is_biallelic()
            return genos, positions, positions2SnpIndices, isBiallelic

    return np.array([]), [], {}, np.array([])
Example #5
0
def main(vcffile, popa, popb, popc, popd, freqw, freqx, freqy, freqz, outfile,
         outfreqfile):
    """
    Find the sites counted by the U_A,B,C,D(w,x,y,z) statistic.

    A is the non-introgressed population, B the introgressed (recipient)
    population, C introgression source population 1 and D introgression
    source population 2. U_A,B,C,D(w,x,y,z) is the number of SNP sites whose
    allele frequency is at most w in A, at least x in B, at least y in C and
    at most z in D.
    See: Signatures of Archaic Adaptive Introgression in Present-Day Human
    Populations.

    Parameters:
        vcffile: path of the VCF file read for every population.
        popa, popb, popc, popd: text files listing one sample ID per line.
        freqw, freqx, freqy, freqz: the frequency thresholds w, x, y, z.
        outfile: output path for the chrom/pos of the sites that pass.
        outfreqfile: output path for the per-site frequencies in A, B, C, D
            of every site retained by the C-frequency filter.
    """

    def read_sample_ids(path):
        # Close the handle deterministically instead of leaking it.
        with open(path) as handle:
            return [line.strip() for line in handle]

    def selected_frequencies(samples, selection):
        # Allele frequencies of `samples`, restricted to the alleles chosen in C.
        callset = allel.read_vcf(vcffile, samples=samples,
                                 fields=['calldata/GT'])
        counts = allel.GenotypeArray(callset['calldata/GT']).count_alleles()
        return counts.to_frequencies()[selection]

    popA = read_sample_ids(popa)
    popB = read_sample_ids(popb)
    popC = read_sample_ids(popc)
    popD = read_sample_ids(popd)

    callset_C = allel.read_vcf(
        vcffile,
        samples=popC,
        fields=['samples', 'variants/CHROM', 'variants/POS', 'calldata/GT'])
    af_C = allel.GenotypeArray(
        callset_C['calldata/GT']).count_alleles().to_frequencies()
    # Selects the allele(s) whose frequency in C reaches y; this also decides
    # which sites are kept.
    # NOTE(review): if two alleles of one site both reach y, the flattened
    # frequency arrays get longer than `chroms`/`positions` - confirm that
    # freqy is always chosen so this cannot happen.
    allel_selection = af_C >= freqy
    site_selection = np.sum(allel_selection, axis=1) > 0
    positions = callset_C['variants/POS'][site_selection]
    chroms = callset_C['variants/CHROM'][site_selection]
    af_C = af_C[allel_selection]
    del callset_C  # release the genotype data before loading the next population

    af_A = selected_frequencies(popA, allel_selection)
    af_B = selected_frequencies(popB, allel_selection)
    af_D = selected_frequencies(popD, allel_selection)

    Usites_selection = (af_A <= freqw) & (af_B >= freqx) & (af_C >= freqy) & (
        af_D <= freqz)
    U_chroms = chroms[Usites_selection]
    U_pos = positions[Usites_selection]

    # The loop variables are named so they do not shadow the `positions`
    # array: the original reused `pos`, which broke the second loop.
    with open(outfile, 'w') as f:
        for chrom, site_pos in zip(U_chroms, U_pos):
            f.write(f'{chrom}\t{site_pos}\n')
    with open(outfreqfile, 'w') as f:
        f.write('chrom\tpos\tfreqA\tfreqB\tfreqC\tfreqD\n')
        for chrom, site_pos, freqA, freqB, freqC, freqD in zip(
                chroms, positions, af_A, af_B, af_C, af_D):
            f.write(
                f'{chrom}\t{site_pos}\t{freqA:.3f}\t{freqB:.3f}\t{freqC:.3f}\t{freqD:.3f}\n'
            )
Example #6
0
def load_genotypes():
    """Load genotypes and sample names from args.zarr or, failing that, args.vcf.

    Returns a (GenotypeArray, samples) tuple. The zarr store takes precedence
    when both options are set.
    """
    zarr_path = args.zarr
    if zarr_path is not None:
        print("reading zarr")
        store = zarr.open_group(zarr_path, mode='r')
        # [:] pulls the stored arrays into memory
        genotypes = allel.GenotypeArray(store['calldata/GT'][:])
        samples = store['samples'][:]
    elif args.vcf is not None:
        print("reading VCF")
        parsed = allel.read_vcf(args.vcf, log=sys.stderr)
        genotypes = allel.GenotypeArray(parsed['calldata/GT'])
        samples = parsed['samples']
    return genotypes, samples
Example #7
0
def get_genotype_array_concat(callsets,
                              genotype_array_type=config.GENOTYPE_ARRAY_DASK):
    """Return one genotype array holding the rows of all `callsets`, in order.

    A single callset is delegated to get_genotype_array. Otherwise the
    genotype data of every callset is concatenated along axis 0 and wrapped
    in the scikit-allel class selected by `genotype_array_type`.

    Raises:
        ValueError: if `genotype_array_type` is not one of the config options.
    """
    if len(callsets) == 1:
        # Nothing to concatenate.
        return get_genotype_array(callset=callsets[0],
                                  genotype_array_type=genotype_array_type)

    as_dask = genotype_array_type == config.GENOTYPE_ARRAY_DASK

    pieces = []
    for callset in callsets:
        data = get_callset_genotype_data(callset)
        if as_dask:
            # Wrap the underlying array in a dask array with the same chunks.
            data = da.from_array(data, chunks=data.chunks)
        pieces.append(data)

    if as_dask:
        return allel.GenotypeDaskArray(da.concatenate(pieces, axis=0))
    if genotype_array_type == config.GENOTYPE_ARRAY_CHUNKED:
        return allel.GenotypeChunkedArray(np.concatenate(pieces, axis=0))
    if genotype_array_type == config.GENOTYPE_ARRAY_NORMAL:
        return allel.GenotypeArray(np.concatenate(pieces, axis=0))

    raise ValueError(
        'Error: Invalid option specified for genotype_array_type.')
Example #8
0
def vcf2hmmibd(args):
    """Convert a VCF into a tab-separated genotype table written to args.outfile.

    For every sample and site the called allele is the index with the largest
    allele depth (AD); calls whose summed positive depth is below
    args.mindepth are set to -1. Chromosome names are mapped through the
    table read from args.translationfile.
    """
    print(f'Reading VCF file: {args.infile}')
    wanted_fields = [
        'samples', 'variants', 'calldata/DP', 'calldata/GT', 'calldata/AD'
    ]
    vcf = allel.read_vcf(args.infile, fields=wanted_fields)

    sample_names = vcf['samples']

    # chromosome names -> translated values, paired with the positions
    translation = read_chrom_translation(args.translationfile)
    chrom_codes = [translation[name] for name in vcf['variants/CHROM']]
    site_columns = np.column_stack((chrom_codes, vcf['variants/POS']))

    # allele with the highest depth becomes the call
    depths = allel.GenotypeArray(vcf['calldata/AD'])
    calls = np.argmax(depths, axis=2)

    # only positive depths contribute to the total; low totals become missing
    depth_totals = depths.sum(axis=2, where=(depths > 0))
    calls[depth_totals < args.mindepth] = -1

    header = ['chrom', 'pos'] + list(sample_names)
    table = pd.DataFrame(np.hstack((site_columns, calls)), columns=header)
    table.to_csv(args.outfile, sep='\t', index=False)
    print(f'Input file for hmmIBD written at: {args.outfile}')
def readData(dir):
    """Add one key per variant of every VCF in `dir` to a per-file bloom filter.

    For each file a filter is created with tbf.copy_template(outdir + name +
    ".bloom"). Each variant contributes the key "CHROM|POS|<code>", where
    <code> is the bitwise OR of the two alleles of the first sample (alleles
    outside 0, 1, 2 count as 0). A file that fails to parse is reported on
    stdout and skipped; its filter is still closed.

    Parameters:
        dir: directory containing the VCF files.
    """
    accepted_alleles = (0, 1, 2)

    for f in os.listdir(dir):
        bf = tbf.copy_template(outdir + f + ".bloom")
        try:
            # os.path.join also works when `dir` has no trailing separator.
            vcf_path = os.path.join(dir, f)
            callset = allel.read_vcf(
                vcf_path,
                fields=['variants/CHROM', 'variants/POS', 'calldata/GT'],
                types={'calldata/GT': 'i2'},
                fills={'calldata/GT': 2})
            chrom = callset['variants/CHROM']
            pos = callset['variants/POS']
            gt = allel.GenotypeArray(callset['calldata/GT'])
            for i in range(len(chrom)):
                position = '|'.join([chrom[i], str(pos[i])])
                first_call = allel.GenotypeVector(gt[i])[0]
                # The original test `a ==0|a ==1|a ==2` parsed as the chain
                # a == (0|a) == (1|a) == 2, which is never true, so both keys
                # were always 0. A membership test expresses the intent.
                key1 = first_call[0] if first_call[0] in accepted_alleles else 0
                print("key1:" + str(key1))
                key2 = first_call[1] if first_call[1] in accepted_alleles else 0
                combined = key1 | key2
                bf.add('|'.join([position, str(combined)]))
        except Exception as err:
            # Best effort: report the offending file and carry on.
            print(f)
            print(err)
        finally:
            bf.close()
Example #10
0
def obtain_ancestry_panel(local_callset, sample_list, max_read_count,
                          gq_threshold):
    """Count reference and alternate alleles per site over the panel samples.

    A call contributes only when its DP is below `max_read_count` and its GQ
    is at least `gq_threshold`. Returns a two-column array: reference counts,
    then alternate counts.
    """
    columns = obtain_indices(local_callset['samples'], sample_list)

    depth = local_callset['calldata/DP'][:, columns]
    quality = local_callset['calldata/GQ'][:, columns]

    # Element-wise product of the two boolean masks acts as a logical AND.
    usable = (depth < max_read_count) * (quality >= gq_threshold)

    genotypes = allel.GenotypeArray(local_callset['calldata/GT'][:, columns])

    alt_totals = (genotypes.to_n_alt()[:] * usable).sum(1)
    ref_totals = (genotypes.to_n_ref()[:] * usable).sum(1)

    return np.column_stack((ref_totals, alt_totals))
Example #11
0
def circos(directory, outfn, vcffile):
    """Write the FST and heterozygosity data files for a circos plot.

    Reads the SNP-density table `directory + outfn + ".dat"` and the VCF
    `directory + vcffile + ".vcf"`, then calls fp.makeDATFile once with the
    FST values and once with the per-site heterozygote counts.
    """
    ## SNP density table: tab-separated, no header row
    density_path = directory + outfn + ".dat"
    snpdensity = pd.read_csv(density_path, sep='\t', header=None)

    ## VCF named by the user (either old vcf or new vcf)
    vcf_path = directory + vcffile + ".vcf"
    callset = allel.read_vcf(vcf_path)
    pos, chrm = callset['variants/POS'], callset['variants/CHROM']
    gt = allel.GenotypeArray(callset['calldata/GT'])

    ## FST track
    fst_values = fst(gt, directory, outfn, callset['samples'])
    fp.makeDATFile(pos, gt, chrm, fst_values, snpdensity, directory, outfn,
                   'fst')

    ## heterozygosity track: number of heterozygous calls at each position
    het_values = gt.count_het(axis=1)
    fp.makeDATFile(pos, gt, chrm, het_values, snpdensity, directory, outfn,
                   'het')
Example #12
0
def countPatternDFOIL(callset, sample_ix, outgroup):
    """Build site-pattern arrays for every combination of four samples plus the outgroup.

    `sample_ix` holds four groups of sample indices; one sample is drawn from
    each group for every combination. The result maps the 1-based combination
    number to (positions, packed hom-alt pattern per site).
    """
    print("counting patterns in file...")
    genotypes = allel.GenotypeArray(callset['calldata/GT'])
    positions = allel.SortedIndex(callset['variants/POS'])

    # Keep biallelic sites where the outgroup call is homozygous.
    usable = genotypes[:, outgroup].is_hom() & genotypes.count_alleles().is_biallelic()
    genotypes = genotypes.compress(usable, axis=0)
    positions = positions[usable]

    first, second, third, fourth = sample_ix
    combinations = list(product(first, second, third, fourth))
    print("total number of combinations: {}".format(len(combinations)))

    windict = {}
    for number, (i, j, k, m) in enumerate(combinations, 1):
        print("permutation number {}".format(number))
        subset = genotypes.take([i, j, k, m, outgroup], axis=1)
        # Only sites where all five samples are homozygous are used.
        all_hom = subset.is_hom().all(axis=1)
        subset = subset.compress(all_hom, axis=0)
        patterns = np.packbits(subset.is_hom_alt(), axis=1)
        windict[number] = (positions[all_hom], patterns)
    return (windict)
def main(vcffile, groupfile, outfile):
    """Write a table of first-ALT allele frequencies per sample group.

    `groupfile` lists "sampleID groupID" pairs, one per line. The output is a
    tab-separated file with chr, pos, REF, ALT and one frequency column per
    group.
    """
    group2inds = defaultdict(list)
    with open(groupfile) as handle:
        for record in handle:
            sample_id, group_id = record.strip().split()
            group2inds[group_id].append(sample_id)

    # Only the first ALT is read; further ALTs are ignored but the site stays.
    # For multi-allelic sites, splitting the ALTs onto separate VCF rows is
    # recommended.
    sites = allel.read_vcf(
        vcffile,
        fields=[
            'variants/CHROM', 'variants/POS', 'variants/REF', 'variants/ALT'
        ],
        numbers={'ALT': 1})
    df = pd.DataFrame({
        'chr': sites['variants/CHROM'],
        'pos': sites['variants/POS'],
        'REF': sites['variants/REF'],
        'ALT': sites['variants/ALT']
    })

    for group, samples in group2inds.items():
        print(group)
        print(samples)
        group_calls = allel.read_vcf(vcffile,
                                     samples=samples,
                                     fields=['samples', 'calldata/GT'])
        genotypes = allel.GenotypeArray(group_calls['calldata/GT'])
        af = genotypes.count_alleles().to_frequencies()
        # When every ALT frequency is 0 there is only the REF column.
        has_alt_column = af.shape[1] > 1
        df[group] = af[:, 1] if has_alt_column else .0

    df.to_csv(outfile,
              sep='\t',
              index=False,
              float_format='%.3f',
              na_rep='nan')
Example #14
0
def read_vcf_founderliab(path):
    """
    Read the genotypes of a whole VCF and return them as an ALT-allele count
    matrix, transposed so that the first axis is the sample axis.
    """
    callset = allel.read_vcf(path, fields=['calldata/GT'])
    genotypes = allel.GenotypeArray(callset['calldata/GT'])
    return genotypes.to_n_alt().T
Example #15
0
def read_vcf_allel(file_vcf):
    '''
    Read a VCF with scikit-allel and summarise its variants in a pandas df.

    Returns (geno, summary, samples): geno holds ALT-allele counts with one
    row per sample, summary is a DataFrame with the columns CHROM, POS, ID,
    REF, ALT, QUAL, FILTER. An empty VCF yields ({}, {}, {}).
    '''
    print(file_vcf)
    vcf_ori = allel.read_vcf(file_vcf)

    if not vcf_ori:
        print('empty vcf.')
        return {}, {}, {}

    print(vcf_ori.keys())

    geno = vcf_ori['calldata/GT']
    ref_alleles = vcf_ori['variants/REF']
    alt_alleles = vcf_ori['variants/ALT']
    row_numbers = range(geno.shape[0])

    # rows that carry a second ALT allele
    multi_alt_rows = [row for row in row_numbers if alt_alleles[row][1]]

    # rows whose REF and first ALT are both a single character
    single_base_rows = [
        row for row in row_numbers
        if len(ref_alleles[row]) == 1 and len(alt_alleles[row][0]) == 1
    ]

    ## eliminate +1 segregating mutations: allele codes above 1 become 0.
    ## The row is a view, so the genotype array is changed in place.
    for row in multi_alt_rows:
        calls = geno[row]
        calls[calls > 1] = 0
        geno[row] = calls

    geno = allel.GenotypeArray(geno).to_n_alt().T
    n_variants = geno.shape[1]

    ## setup summary
    first_alts = [alt_alleles[row][0] for row in range(n_variants)]
    filter_flags = [
        'PASS' if int(vcf_ori['variants/FILTER_PASS'][row]) else '.'
        for row in range(n_variants)
    ]

    summary = np.array([
        vcf_ori['variants/CHROM'],
        vcf_ori['variants/POS'],
        vcf_ori['variants/ID'],
        ref_alleles,
        first_alts,
        vcf_ori['variants/QUAL'],
        filter_flags,
    ]).T

    # Restrict both outputs to the single-base rows, when there are any.
    if len(single_base_rows):
        print('mutliple ref loci: {}'.format(n_variants - len(single_base_rows)))
        geno = geno[:, single_base_rows]
        summary = summary[single_base_rows, :]

    summary = pd.DataFrame(
        summary,
        columns=['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER'])
    return geno, summary, vcf_ori['samples']
Example #16
0
    def get_geno(self, m=0, n=0, z=None):
        """Return ALT-allele counts for a slice (or all) of the genotype data.

        Missing calls are replaced by the mean of the non-missing counts in
        the same row.

        :param m: the beginning row #
        :param n: the ending row #
        :param z: the selected column #, should be a list or None as default
        :return: float64 array of ALT-allele counts with missing cells filled
        """
        # Without a positive m + n the whole array is used and z is ignored.
        if not (m + n > 0):
            selection = self._gt[...]
        elif z is None:
            selection = self._gt[m:n]
        else:
            selection = self._gt[m:n, z]

        # Missing calls are encoded as -1 so they can be masked below.
        counts = allel.GenotypeArray(selection).to_n_alt(fill=-1).astype('float64')
        masked_counts = ma.masked_less(counts, 0)
        row_means = masked_counts.mean(axis=1)
        np.copyto(masked_counts, row_means[..., None], where=masked_counts.mask)
        return masked_counts.data
Example #17
0
    def data_generator(self, z=None):
        """Yield the genotype data batch by batch as ALT-allele counts.

        Each batch covers `self._batch_size` consecutive rows of `self._gt`;
        `self._num_batches` batches are produced. Missing calls are counted
        as 0.

        :param z: the selected column #, should be a list or None as default
        :return: generator of float64 arrays of ALT-allele counts
        """
        batch_size = self._batch_size
        for batch_number in range(self._num_batches):
            start = batch_number * batch_size
            stop = start + batch_size
            # The original carried a "last batch" branch that compared an int
            # with a list, so it never ran, and it only repeated this slice.
            # NOTE(review): a short final batch relies on the slice clamping
            # at the end of self._gt - confirm for the backing store in use.
            if z is not None:
                rows = self._gt[start:stop, z]
            else:
                rows = self._gt[start:stop]
            batch_geno = allel.GenotypeArray(rows)
            yield batch_geno.to_n_alt(fill=0).astype('float64')  # missing is '0'
Example #18
0
    def sci_variant_bldr(self):
        """Build a table of variant columns joined with per-sample ALT-allele counts.

        The '.vcf' files in `self.path` are used. A single file is read
        directly. Several files are bgzip-compressed and tabix-indexed when
        there are fewer '.vcf.gz' files than '.vcf' files, merged with
        `bcftools merge --force-samples` into `self.path + 'INPUT.vcf'`, and
        the merged file is read instead.

        `self.path` is concatenated with file names as-is, so it is expected
        to end with a path separator.

        :return: DataFrame of the variant columns (without QUAL, FILTER_PASS,
            ID) joined with one ALT-allele-count column per sample
        :raises FileNotFoundError: if `self.path` contains no '.vcf' file
        :raises subprocess.CalledProcessError: if bgzip, tabix or bcftools fails
        """
        import allel
        import subprocess
        import collections
        import glob
        import pandas as pd
        import os

        read_fields = [
            'samples', 'calldata/GT',
            'variants/ALT', 'variants/REF',
            'variants/CHROM', 'variants/POS',
            'variants/svlen'
        ]
        dropped_columns = ['QUAL', 'FILTER_PASS', 'ID']

        vcf_names = [_ for _ in os.listdir(self.path) if _.endswith('.vcf')]
        if not vcf_names:
            raise FileNotFoundError(
                "no .vcf file found in {}".format(self.path))

        if len(vcf_names) > 1:
            print("Multiple VCFs detected. Files will be merged")
            # NOTE(review): an INPUT.vcf left behind by an earlier run also
            # ends in '.vcf' and would be treated as an input here.
            gz_names = [
                _ for _ in os.listdir(self.path) if _.endswith('.vcf.gz')
            ]
            # The original compared the '.vcf' count with itself, so the
            # compression step could never run.
            if len(gz_names) < len(vcf_names):
                print("VCFs not compressed - compressing")
                for name in vcf_names:
                    vcf = self.path + name
                    # bgzip -c writes to stdout; capture it in <name>.gz.
                    # (A literal ">" argument is not a redirection here.)
                    with open(vcf + ".gz", "wb") as compressed:
                        subprocess.run(['bgzip', "-c", vcf],
                                       stdout=compressed,
                                       check=True)
                    # bcftools merge needs indexed inputs
                    subprocess.run(['tabix', '-p', 'vcf', vcf + ".gz"],
                                   check=True)
            vcf_path = self.path + 'INPUT.vcf'
            # Expand the glob in Python and pass an argument list, so no
            # shell has to interpret the path.
            gz_files = sorted(glob.glob(self.path + "*.gz"))
            subprocess.run(
                ['bcftools', 'merge', '--force-samples', *gz_files,
                 '-o', vcf_path],
                check=True)
        else:
            vcf_path = self.path + vcf_names[0]

        vcfdata = allel.read_vcf(vcf_path, fields=read_fields)
        vcfdf = allel.vcf_to_dataframe(vcf_path,
                                       exclude_fields=dropped_columns)

        # de-duplicate sample names while keeping their order
        sample_set = list(collections.OrderedDict.fromkeys(vcfdata['samples']))
        gt = allel.GenotypeArray(
            vcfdata['calldata/GT']).to_n_alt()  # drop additional information
        gt_data = pd.DataFrame(gt, columns=sample_set)

        data = pd.concat([vcfdf, gt_data], axis=1, join='inner')
        return data
def generate_encoded_genotypes(path='',
                               geno_file='',
                               subset=False,
                               subset_geno_file=''):
    '''
    Generates genotype input for external_dataset.py

    Every genotype is encoded as 2 (homozygous for allele 0), 1 (two
    different alleles), 0 (homozygous for allele 1) or 'NA' (missing call or
    any other homozygous genotype).

    Args:
        path: directory holding the pickled callset; snps.txt is written here
        geno_file: file name of the pickled callset
        subset: if True, load subset_geno_file instead of geno_file
            (a subset of 2000 snps for 250 subjects generated using
            vcf_subset_generator.py)
        subset_geno_file: file name of the pickled subset callset
    Returns:
        None; writes the tab-separated file snps.txt with one row per
        subject and one column per variant ID
    '''

    def encode(call):
        # A negative allele code is a missing call.
        first, second = call[0], call[1]
        if first < 0 or second < 0:
            return 'NA'
        if first != second:
            return 1
        if first == 0:
            return 2
        if first == 1:
            return 0
        return 'NA'

    filepath = os.path.join(path, subset_geno_file if subset else geno_file)
    # NOTE(review): pickle.load can execute arbitrary code; only use this on
    # callset files from a trusted source.
    with open(filepath, 'rb') as f:
        callset = pickle.load(f)

    #create allel.model.GenotypeArray
    gt = allel.GenotypeArray(callset['calldata/GT'])

    #trim repeated name in samples: keep the part before the first '_'
    samples_trimmed = [
        name.split('_')[0] for name in callset['samples'].tolist()
    ]

    #one column per SNP, one entry per subject (encoded as 0, 1, 2 or 'NA').
    #The original appended outside the subject loop and assigned the column
    #outside the SNP loop, so only a single value was ever stored.
    columns = {'subjects': samples_trimmed}
    ids = callset['variants/ID'].tolist()
    for snp_id, genotype in zip(ids, gt):
        columns[snp_id] = [encode(call) for call in genotype]
    df = pd.DataFrame(columns)

    #save 'snps.txt' file
    df.to_csv(os.path.join(path, 'snps.txt'), index=None, sep='\t')
Example #20
0
def load_genotypes():
    """Load genotypes and sample names from the input selected on the command line.

    The first option that is set wins, in this order: args.zarr (zarr
    store), args.vcf (VCF file), args.matrix (tab-separated table with a
    'sampleID' column followed by one ALT-allele-count column per site).

    Returns:
        (genotypes, samples): an allel.GenotypeArray and the sample names.

    Raises:
        ValueError: if none of the inputs is set, or the matrix holds a value
            other than 0, 1 or 2.
    """
    if args.zarr is not None:
        print("reading zarr")
        callset = zarr.open_group(args.zarr, mode='r')
        genotypes = allel.GenotypeArray(callset['calldata/GT'][:])
        samples = callset['samples'][:]
    elif args.vcf is not None:
        print("reading VCF")
        vcf = allel.read_vcf(args.vcf, log=sys.stderr)
        genotypes = allel.GenotypeArray(vcf['calldata/GT'])
        samples = vcf['samples']
    elif args.matrix is not None:
        gmat = pd.read_csv(args.matrix, sep="\t")
        samples = np.array(gmat['sampleID'])
        dosages = np.array(gmat.drop(labels="sampleID", axis=1), dtype="int8")
        # The original loop silently skipped other values, leaving rows of
        # unequal length that failed later inside vstack.
        if dosages.size and (dosages.min() < 0 or dosages.max() > 2):
            raise ValueError(
                "genotype matrix values must be 0, 1 or 2")
        # Two haplotype rows per sample: count 1 -> (1, 0), count 2 -> (1, 1).
        # Filled in one step instead of growing the matrix row by row.
        hmat = np.empty((2 * dosages.shape[0], dosages.shape[1]),
                        dtype="int8")
        hmat[0::2] = dosages >= 1
        hmat[1::2] = dosages == 2
        genotypes = allel.HaplotypeArray(
            np.transpose(hmat)).to_genotypes(ploidy=2)
    else:
        raise ValueError(
            "no genotype input given: set one of zarr, vcf or matrix")
    return genotypes, samples
Example #21
0
def filterGT(callset, outgroup):
    """Keep only the sites that are segregating among the outgroup samples.

    Returns (genotypes, positions) restricted to those sites.
    """
    genotypes = allel.GenotypeArray(callset['calldata/GT'])
    positions = allel.SortedIndex(callset['variants/POS'])
    # allele counts over the outgroup columns only, for alleles 0 and 1
    outgroup_counts = genotypes[:, outgroup].count_alleles(max_allele=1)
    segregating = outgroup_counts.is_segregating()
    return (genotypes.compress(segregating, axis=0), positions[segregating])
Example #22
0
    def sample_genotype_array(self, sample, part):
        """Return the genotypes of one sample as a Series indexed by position.

        The per-sample VCF is created with bcftools when self._check_local
        reports that it is not available yet.
        """
        vcf_path = self._sample_genotype_path(sample, part)

        if not self._check_local(vcf_path):
            # biallelic SNPs of this sample only, written compressed
            subprocess.call(
                f'bcftools view -s {sample} -v snps -m2 -M2 -Oz -o {vcf_path} {self._chr_path} {self._query(part)}',
                shell=True,
                stdout=subprocess.PIPE)

        callset = allel.read_vcf(vcf_path, fields=['GT', 'POS'])
        # column 0 is the first sample of the file
        calls = allel.GenotypeArray(callset['calldata/GT'])[:, 0]
        return pd.Series(calls, index=callset['variants/POS'])
Example #23
0
File: pca.py Project: tle003/ipyrad
    def _load_calldata(self):
        """Read samples and genotypes from self.data and store the filtered genotypes.

        Sets self.samples_vcforder to the sample names and self.genotypes to
        the rows whose highest observed allele is 1 and whose counts for
        alleles 0 and 1 are both greater than 1.
        """
        callset = allel.read_vcf(self.data, fields=["samples", "GT"])
        self.samples_vcforder = callset["samples"]

        genotypes = allel.GenotypeArray(callset['calldata/GT'])
        counts = genotypes.count_alleles()

        ## drop multi-allelic snps ...
        highest_is_first_alt = counts.max_allele() == 1
        ## ... and biallelic singletons
        both_seen_twice = counts[:, :2].min(axis=1) > 1

        self.genotypes = genotypes.compress(
            highest_is_first_alt & both_seen_twice, axis=0)
Example #24
0
def get_genotype_array(callset,
                       genotype_array_type=config.GENOTYPE_ARRAY_DASK):
    """Wrap the callset's genotype data in the requested scikit-allel class.

    ``genotype_array_type`` is compared against the ``config.GENOTYPE_ARRAY_*``
    constants (NORMAL, DASK, CHUNKED). Any other value yields ``None``.
    """
    genotype_data = get_callset_genotype_data(callset)

    # Pick the wrapper class first, then build the array once at the end.
    if genotype_array_type == config.GENOTYPE_ARRAY_NORMAL:
        wrapper = allel.GenotypeArray
    elif genotype_array_type == config.GENOTYPE_ARRAY_DASK:
        wrapper = allel.GenotypeDaskArray
    elif genotype_array_type == config.GENOTYPE_ARRAY_CHUNKED:
        wrapper = allel.GenotypeChunkedArray
    else:
        return None

    return wrapper(genotype_data)
Example #25
0
def geno2fst( args ):
    """Write per-site Hudson FST of every group against all other samples.

    Genotype lines are read through ``tabparser.GenotypeLineParser`` using
    its diploid translator. For each site one tab-separated row is written
    to ``args.outfile`` with the columns CHROM, POS, REGION, the MAX / MEAN /
    MEDIAN of the per-group FST values, MAF, and then one FST column per
    group (groups in sorted key order).

    FST values outside ``[0, 1]`` (including NaN from a zero denominator)
    are written as 0.
    """

    lineparser = tabparser.GenotypeLineParser( args )
    lineparser.set_translator(lineparser.diploid_translator)

    cout('Grouping:')
    groups = lineparser.parse_grouping()
    for k in groups:
        cout(' %12s %3d' % (k, len(groups[k])))

    group_keys = sorted(groups.keys())
    cout(group_keys)

    # output to file
    cout('Writing outfile...')
    # The context manager guarantees the file is flushed and closed, also
    # when parsing or a statistic raises part-way through.
    with open(args.outfile, 'w') as outfile:

        outfile.write('CHROM\tPOS\tREGION\tMAX\tMEAN\tMEDIAN\tMAF\t%s\n' % '\t'.join(group_keys) )

        for idx, (posinfo, genolist) in enumerate(lineparser.parse(), start=1):

            genoarray = allel.GenotypeArray( [genolist]  )

            # calculate MAF
            ac = genoarray.count_alleles()
            num = np.min(ac)
            denom = np.sum(ac)
            # num == denom covers both a single counted allele and a site
            # with no calls at all (0 == 0), avoiding a division by zero.
            if num == denom:
                maf = 0
            else:
                maf = num / denom

            # calculate FST per group against other samples

            fst_sites = []
            for g in group_keys:
                ac_g = genoarray.count_alleles(subpop = groups[g])
                ac_ng = genoarray.count_alleles(subpop = list( lineparser.sample_idx - set(groups[g])))
                num, den = allel.stats.hudson_fst(ac_g, ac_ng)
                fst = num[0]/den[0]
                # NaN fails this comparison as well, so it is reset to 0 too.
                if not (0.0 <= fst <= 1.0):
                    fst = 0
                fst_sites.append( fst )

            if idx % 100 == 0:
                cerr('I: writing position no %d' % idx)

            outfile.write('%s\t%s\t%s\t%5.4f\t%5.4f\t%5.4f\t%5.4f\t%s\n' %
                            (posinfo[0], posinfo[1], posinfo[4], np.max(fst_sites), np.mean(fst_sites), np.median(fst_sites), maf,
                                '\t'.join( '%5.4f' % x for x in fst_sites)))
Example #26
0
def geno2dhe(args):
    """Write per-site heterozygosity statistics for the whole set and per group.

    Genotypes are read through ``tabparser.GenotypeLineParser`` with its
    haploid translator. For every site the function computes

    * ``He``  - 1 minus the sum of squared allele frequencies over all samples,
    * per-group ``He`` - the same quantity restricted to each group,
    * ``dHe`` - ``He`` minus the group-size-weighted mean of the per-group He,
    * ``FST`` - ``dHe / He``,

    and writes one tab-separated row per site to ``args.outfile``.
    """

    lineparser = tabparser.GenotypeLineParser(args)
    lineparser.set_translator(lineparser.haploid_translator)
    lineparser.parse_grouping()

    cout('Grouping:')
    groups = lineparser.groups
    for name, members in groups.items():
        cout(' %12s %3d' % (name, len(members)))

    group_keys = sorted(groups.keys())
    cout(group_keys)

    # Load the complete genotype table, then drop the raw list to free memory.
    cerr('I: reading genotype file')
    raw_genotypes = lineparser.parse_all()
    cerr('I: generating genotype array')
    genoarray = allel.GenotypeArray(raw_genotypes)
    del raw_genotypes

    cerr('I: calculating He')
    overall_freqs = genoarray.count_alleles().to_frequencies()
    He = 1 - np.sum(overall_freqs**2, axis=1)

    He_groups = {}
    pHe = None

    # Accumulate the size-weighted sum of per-group He, in group order.
    for g, members in groups.items():
        group_freqs = genoarray.count_alleles(subpop=members).to_frequencies()
        group_he = 1 - np.sum(group_freqs**2, axis=1)
        He_groups[g] = group_he
        weighted = group_he * len(members)
        pHe = weighted if pHe is None else pHe + weighted

    total_members = sum(len(x) for x in groups.values())
    dHe = He - pHe / total_members
    FST = dHe / He

    cerr('I: writing output file')
    with open(args.outfile, 'wt') as outfile:
        header = 'CHROM\tPOS\tREGION\tFST\tdHe\tHe\t%s\n' % '\t'.join(group_keys)
        outfile.write(header)

        for i, site_he in enumerate(He):
            posinfo = lineparser.position[i]
            per_group = '\t'.join('%5.4f' % He_groups[g][i] for g in group_keys)
            row = (posinfo[0], posinfo[1], posinfo[4], FST[i], dHe[i],
                   site_he, per_group)
            outfile.write('%s\t%s\t%s\t%5.4f\t%5.4f\t%5.4f\t%s\n' % row)
Example #27
0
 def allelify(self):
     """
     Wrap the stored genotype and allele-count arrays in scikit-allel classes.

     ``self.genotypes`` and ``self.allele_counts`` are each replaced by a new
     dict with the same keys, whose values are ``allel.GenotypeArray`` and
     ``allel.AlleleCountsArray`` objects respectively.
     """
     # Build each replacement dict completely before rebinding the attribute.
     wrapped_genotypes = {}
     for name, array in self.genotypes.items():
         wrapped_genotypes[name] = allel.GenotypeArray(array)
     self.genotypes = wrapped_genotypes

     wrapped_counts = {}
     for name, array in self.allele_counts.items():
         wrapped_counts[name] = allel.AlleleCountsArray(array)
     self.allele_counts = wrapped_counts
Example #28
0
def load_vcf_wrapper(path, seqid, samples):
    """Load positions and genotypes for one region of a VCF.

    Reads ``path`` with ``allel.read_vcf``, passing ``seqid`` as the region
    and ``samples`` as the sample selection, and returns
    ``(positions, genotypes)`` as an ``allel.SortedIndex`` and an
    ``allel.GenotypeArray``.
    """
    wanted_fields = ['variants/POS', 'calldata/GT', 'samples']
    vcf_data = allel.read_vcf(
        path,
        region=seqid,
        fields=wanted_fields,
        tabix="tabix",
        samples=samples,
    )

    positions = allel.SortedIndex(vcf_data["variants/POS"])
    genotypes = allel.GenotypeArray(vcf_data['calldata/GT'])
    return positions, genotypes
Example #29
0
def load_genotypes():
    """Load genotypes from zarr or VCF and return them as haplotypes.

    The source is chosen from the module-level ``args``: ``args.zarr`` when
    it is not None, otherwise ``args.vcf``.

    Returns
    -------
    tuple
        ``(haps, samples)`` where ``haps`` is an ``allel.HaplotypeArray``
        holding the first haplotype of every sample followed by the second
        haplotype of every sample, and ``samples`` is an array of matching
        column labels (``<name>_h0`` ..., then ``<name>_h1`` ...).
    """
    if args.zarr is not None:
        print("reading zarr")
        callset = zarr.open_group(args.zarr, mode='r')
        gt = callset['calldata/GT'][:]
        samples = callset['samples'][:]
    else:
        print("reading VCF")
        callset = allel.read_vcf(args.vcf, log=sys.stderr)
        gt = callset['calldata/GT']
        samples = callset['samples']

    # Built for both input types: previously only the VCF branch defined
    # `haps`, so the zarr branch failed at the return statement.
    genotypes = allel.GenotypeArray(gt)
    hap0 = genotypes[:, :, 0]
    hap1 = genotypes[:, :, 1]
    haps = allel.HaplotypeArray(
        np.concatenate((hap0, hap1), axis=1)
    )  #note order is all hap0 in order of samples, then all hap1 in order of samples.

    # Sample names may be stored as bytes (presumably possible for zarr
    # stores; verify); decode so the suffix can be appended either way.
    names = [x.decode() if isinstance(x, bytes) else str(x) for x in samples]
    s0 = [x + "_h0" for x in names]
    s1 = [x + "_h1" for x in names]
    samples = np.concatenate((s0, s1), axis=0)
    return haps, samples
Example #30
0
def diploidizeGenotypeArray(genos):
    """Merge consecutive pairs of samples into diploid genotypes.

    For every variant, samples ``j`` and ``j + 1`` (``j`` even) are combined
    into one genotype whose two alleles are the first allele
    (``[..., 0]``) of each of the two samples.

    Parameters
    ----------
    genos
        3-dimensional array-like with shape (variants, samples, alleles).

    Returns
    -------
    allel.GenotypeArray
        Array of shape (variants, samples // 2, 2).

    Notes
    -----
    With an odd number of samples a warning is written to stderr and the
    last sample is dropped.
    """
    numSnps, numSamples, numAlleles = genos.shape
    if numSamples % 2 != 0:
        sys.stderr.write(
            "Diploidizing an odd-numbered sample. The last genome will be truncated.\n")
        numSamples -= 1

    # Work on a plain ndarray view so slicing is not affected by any wrapper
    # class indexing rules.
    values = np.asarray(genos)
    # Strided slices select the first allele of the even and odd samples;
    # stacking on a new last axis pairs them up. Unlike the nested Python
    # loops this keeps a 3-D shape even when there are no variants or no
    # complete sample pair.
    firstAlleles = values[:, 0:numSamples:2, 0]
    secondAlleles = values[:, 1:numSamples:2, 0]
    newGenos = np.stack((firstAlleles, secondAlleles), axis=-1)
    return allel.GenotypeArray(newGenos)