Beispiel #1
1
def getContigStats(vcf,window_size,outfile,subpops,min_snps=10):
    """Write windowed summary statistics for one contig to a tab-separated file.

    The VCF is read with scikit-allel, split into windows using the
    project helpers ``getSubWinBounds`` / ``getSnpIndicesInSubWins`` and one
    row per window is written to ``outfile`` with the columns
    chrom, chromStart, chromEnd, numSites, fst, tajD1, tajD2, thetaW1,
    thetaW2, dxy_bw, pi, dfd.

    Parameters
    ----------
    vcf : path of the VCF file to read.
    window_size : window length passed to the windowing helpers.
    outfile : path of the output file (it is overwritten).
    subpops : mapping with the keys ``'rufus'`` and ``'sasin'`` giving the
        sample indices of each sub-population.
    min_snps : windows with fewer SNPs than this get ``NA`` for every
        statistic (default 10, the previously hard-coded threshold).
    """
    #load data
    callset=allel.read_vcf(vcf)
    snps=allel.GenotypeArray(callset['calldata/GT'])
    positions=callset['variants/POS']
    chrom=callset['variants/CHROM'][0]

    #get window bounds
    last_position=max(positions)
    window_bounds=getSubWinBounds(window_size,last_position)
    window_bound_indices=getSnpIndicesInSubWins(window_bounds,positions)
    nwindows=last_position//window_size - 1

    # The output path is kept in `outfile` and the handle gets its own name.
    # Previously `outfile` was rebound to the file object, so the later
    # open(str(outfile),'a') calls wrote to a file named after the object's
    # repr instead of the requested path.
    with open(str(outfile),'w') as out:
        out.write('chrom\tchromStart\tchromEnd\tnumSites\tfst\ttajD1\ttajD2\tthetaW1\tthetaW2\tdxy_bw\tpi\tdfd\n')

        #loop over windows and print summary stats to file
        for i in range(nwindows):
            snp_indices=window_bound_indices[i]
            row=[chrom,str(window_bounds[i][0]),str(window_bounds[i][1])]

            if len(snp_indices)<min_snps: #too few snps in the window
                row+=[str(0)]+["NA"]*8
            else:
                window_snp_positions=positions[snp_indices]
                window_snps=snps.subset(snp_indices)
                window_ac_all=window_snps.count_alleles()
                window_ac_subpop=window_snps.count_alleles_subpops(subpops=subpops)
                ac_rufus=window_ac_subpop['rufus']
                ac_sasin=window_ac_subpop['sasin']

                #summary stats
                a,b,c=allel.stats.fst.weir_cockerham_fst(window_snps,[subpops['rufus'],subpops['sasin']])
                fst=np.sum(a) / (np.sum(a) + np.sum(b) + np.sum(c))
                tajD1=allel.stats.diversity.tajima_d(ac_rufus)
                tajD2=allel.stats.diversity.tajima_d(ac_sasin)
                thetaW1=allel.stats.diversity.watterson_theta(window_snp_positions,ac_rufus)
                thetaW2=allel.stats.diversity.watterson_theta(window_snp_positions,ac_sasin)
                dxy_bw=allel.stats.diversity.sequence_divergence(window_snp_positions,ac_rufus,ac_sasin)
                pi=allel.stats.diversity.sequence_diversity(window_snp_positions,window_ac_all)
                dfd=allel.stats.diversity.windowed_df(window_snp_positions,ac_rufus,ac_sasin,size=window_size)[0][0]

                stats=(fst,tajD1,tajD2,thetaW1,thetaW2,dxy_bw,pi,dfd)
                row+=[str(window_snps.shape[0])]+[str(round(value,6)) for value in stats]

            #write a vector of summary stats to file
            out.write('\t'.join(row)+'\n')
Beispiel #2
0
def read_vcf_founderliab(path):
    """
    Read the genotype calls of a whole VCF and return them as a dosage matrix.

    Only the ``calldata/GT`` field is loaded. The result of
    ``GenotypeArray.to_n_alt()`` is transposed before it is returned.
    """
    callset = allel.read_vcf(path, fields=['calldata/GT'])
    genotypes = allel.GenotypeArray(callset['calldata/GT'])
    dosage_by_variant = genotypes.to_n_alt()
    return dosage_by_variant.T
Beispiel #3
0
def read_vcf_allel(file_vcf):
    """
    Read a VCF with scikit-allel and summarise its variants.

    Returns a tuple ``(geno, summary, samples)``: ``geno`` is the transposed
    ``to_n_alt()`` matrix, ``summary`` is a pandas DataFrame with the columns
    CHROM, POS, ID, REF, ALT, QUAL, FILTER and ``samples`` is the sample
    array of the VCF. When ``allel.read_vcf`` returns something falsy, three
    empty dicts are returned instead.
    """
    print(file_vcf)
    callset = allel.read_vcf(file_vcf)

    if not callset:
        print('empty vcf.')
        return {}, {}, {}

    print(callset.keys())

    # raw genotype calls
    calls = callset['calldata/GT']
    n_rows = calls.shape[0]
    alt_field = callset['variants/ALT']
    ref_field = callset['variants/REF']

    # rows whose second ALT slot is filled
    multi_alt_rows = [row for row in range(n_rows) if alt_field[row][1]]

    # rows where both REF and the first ALT are one character long
    single_base_rows = [
        row for row in range(n_rows)
        if len(ref_field[row]) == 1 and len(alt_field[row][0]) == 1
    ]

    # on multi-ALT rows, every allele index above 1 is reset to 0
    for row in multi_alt_rows:
        row_calls = calls[row]
        row_calls[row_calls > 1] = 0
        calls[row] = row_calls

    geno = allel.GenotypeArray(calls).to_n_alt().T
    n_sites = geno.shape[1]

    # per-variant summary table
    column_names = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER']
    filter_labels = ['.', 'PASS']
    filter_pass = callset['variants/FILTER_PASS']

    first_alts = [alt_field[site][0] for site in range(n_sites)]
    pass_column = [filter_labels[int(filter_pass[site])] for site in range(n_sites)]

    summary = np.array([
        callset['variants/CHROM'],
        callset['variants/POS'],
        callset['variants/ID'],
        ref_field,
        first_alts,
        callset['variants/QUAL'],
        pass_column,
    ]).T

    if single_base_rows:
        print('mutliple ref loci: {}'.format(n_sites - len(single_base_rows)))
        geno = geno[:, single_base_rows]
        summary = summary[single_base_rows, :]

    summary = pd.DataFrame(summary, columns=column_names)
    return geno, summary, callset['samples']
Beispiel #4
0
 def __get_variants_from_vcf(cls, vcf: str) -> Optional[Dict[str, Any]]:
     """Read the fields listed in ``cls.FIELD_NAMES`` from ``vcf``.

     The ANN field is expanded through ``allel.ANNTransformer``.

     Returns whatever ``allel.read_vcf`` returns; this is ``None`` precisely
     when the filtered vcf file has no variants.

     Raises:
         FileNotFoundError: if reading the file fails with an ``IOError``.
     """
     try:
         return allel.read_vcf(vcf,
                               fields=cls.FIELD_NAMES,
                               transformers=allel.ANNTransformer())
     except IOError as err:
         # str.format (rather than "+") keeps the intended error even when
         # `vcf` is a path-like object; the cause is chained explicitly.
         raise FileNotFoundError(
             "File {} not found or cannot be opened.".format(vcf)) from err
Beispiel #5
0
def get_ann_from_output_snpeff(temp_out_name):
    """Load the ANN annotations of a VCF into one DataFrame.

    Only the ANN field is read, expanded by ``allel.ANNTransformer`` with
    ``num_ann_max`` entries per variant. The ``ANN_Allele`` columns come
    first, followed by the ``ANN_Annotation`` columns; the column labels are
    renumbered from 0.
    """
    callset = allel.read_vcf(
        temp_out_name,
        fields='ANN',
        transformers=allel.ANNTransformer(),
        numbers={'ANN': num_ann_max},
    )

    wanted_keys = ('variants/ANN_Allele', 'variants/ANN_Annotation')
    frames = [pd.DataFrame(data=callset[key]) for key in wanted_keys]

    combined = pd.concat(frames, axis=1)
    combined.columns = range(0, combined.shape[1])
    return combined
Beispiel #6
0
def main(vcffile, pop1, pop2, binwidth, stepsize, outprefix):
    """
    Compute Fst between pop1 and pop2
    using the method of Hudson (1992) elaborated by Bhatia et al. (2013).

    ``pop1`` and ``pop2`` are paths of text files with one sample ID per
    line. Two gzip-compressed TSV files are written:
    ``{outprefix}_persite.tsv.gz`` (per-site Fst) and
    ``{outprefix}_meanFst.tsv.gz`` (ratio-of-sums Fst in sliding bins of
    ``binwidth`` positions advanced by ``stepsize``).
    """
    # Read the sample lists with context managers so the files are closed,
    # and keep them as sets for O(1) membership tests below.
    with open(pop1) as handle:
        pop1 = {line.strip() for line in handle}
    with open(pop2) as handle:
        pop2 = {line.strip() for line in handle}
    callset = allel.read_vcf(vcffile)
    allsamples = callset['samples']
    genotypes = allel.GenotypeChunkedArray(callset['calldata/GT'])
    # NOTE(review): the mask is one element longer than the number of
    # variants; kept as-is because getAC is not visible here -- confirm.
    variant_selection = np.full((genotypes.shape[0] + 1), True)  # select every site in the vcf
    sample_selection = [x in pop1 for x in allsamples]
    ac1 = getAC(genotypes, variant_selection, sample_selection)
    sample_selection = [x in pop2 for x in allsamples]
    ac2 = getAC(genotypes, variant_selection, sample_selection)
    num, den = allel.hudson_fst(ac1, ac2)
    fst = num / den
    meanFst = np.sum(num) / np.sum(den)
    print('meanFst: %s' % meanFst)
    chrom = callset['variants/CHROM']
    pos = callset['variants/POS']
    df = pd.DataFrame({'chrom': chrom, 'pos': pos, 'hudson_Fst': fst})
    df.to_csv(f'{outprefix}_persite.tsv.gz',
              sep='\t',
              index=False,
              na_rep='nan',
              compression='gzip')
    df['num'] = num
    df['den'] = den
    # sliding bins
    bdf = []
    for offset in range(0, binwidth, stepsize):
        df['bin_index'] = ((df['pos'].values - 1) - offset) // binwidth
        for group_name, gdf in df.groupby(by=['chrom', 'bin_index']):
            chrom, bin_index = group_name
            start = bin_index * binwidth + offset + 1
            if start < 0:  # skip the leading windows that are shorter than binwidth
                continue
            end = start + binwidth - 1
            n_snp = gdf.shape[0]
            sum_num = gdf['num'].sum()
            sum_den = gdf['den'].sum()
            # ratio of sums; undefined when the denominator is not positive
            if sum_den > 0:
                meanFst = sum_num / sum_den
            else:
                meanFst = np.nan
            bdf.append([chrom, start, end, n_snp, meanFst])
    bdf = pd.DataFrame(bdf,
                       columns=['chrom', 'start', 'end', 'n_snp',
                                'meanFst']).sort_values(by=['chrom', 'start'])
    bdf.to_csv(f'{outprefix}_meanFst.tsv.gz',
               index=False,
               compression='gzip',
               sep='\t',
               float_format='%.3f')
Beispiel #7
0
def import_data(callset_path):
    """Read the VCF at ``callset_path``, loading only the fields listed below."""
    wanted_fields = [
        'samples',
        'calldata/GT',
        'variants/CHROM',
        'variants/FILTER',
        'variants/POS',
        'variants/REF',
        'variants/ALT',
    ]
    return allel.read_vcf(callset_path, fields=wanted_fields)
Beispiel #8
0
    def _load_calldata(self):
        """Load samples and genotypes from ``self.data``.

        Stores the sample names in ``self.samples_vcforder`` and the filtered
        genotype array in ``self.genotypes``.
        """
        vcf_contents = allel.read_vcf(self.data, fields=["samples", "GT"])
        self.samples_vcforder = vcf_contents["samples"]

        all_genotypes = allel.GenotypeArray(vcf_contents['calldata/GT'])

        # Keep a variant only if its highest observed allele index is 1 and
        # both of the first two alleles were counted more than once
        # (drops multi-allelic snps and biallelic singletons).
        allele_counts = all_genotypes.count_alleles()
        highest_is_one = allele_counts.max_allele() == 1
        both_seen_twice = allele_counts[:, :2].min(axis=1) > 1
        keep = highest_is_one & both_seen_twice

        self.genotypes = all_genotypes.compress(keep, axis=0)
Beispiel #9
0
def vcf2ped( args ):
    """ create a ped and map file based on vcf and metafile, suitable for isoRelate

    Reads ``args.infile`` and writes ``args.outprefix + '.ped'`` and
    ``args.outprefix + '.map'``.
    """

    # open group file
    group_parser = grpparser.GroupParser( args )

    # open VCF file
    cerr('[I: reading VCF...]')
    t_begin = time.monotonic()
    vcfset = allel.read_vcf(args.infile,
                fields = ['samples', 'variants/CHROM', 'variants/POS', 'calldata/GT'])
    cerr('[I: read %s site, %s samples in %d secs]' % (len(vcfset['variants/CHROM']),
         len(vcfset['samples']), time.monotonic() - t_begin))

    # assign groups
    samples = vcfset['samples']
    group_parser.assign_groups(samples)
    groups = group_parser.group_keys

    # write to PED: six leading columns, then two allele codes per site
    with open(args.outprefix + '.ped', 'w') as ped_file:
        for sample_idx, sample_name in enumerate(samples):
            ped_file.write('%s\t%s\t0\t0\t1\t0\t' % (groups[sample_idx], sample_name))
            codes = []
            for first, second in vcfset['calldata/GT'][:, sample_idx]:
                if first != second:
                    # any call with two different allele indices
                    codes.extend((1, 2))
                elif first == -1:
                    # both alleles are -1
                    codes.extend((0, 0))
                elif first == 1:
                    codes.extend((2, 2))
                else:
                    # matching alleles with index 0, or any other index
                    codes.extend((1, 1))
            ped_file.write('\t'.join(map(str, codes)))
            ped_file.write('\n')

    # write to MAP: distance is measured from the previous site on the same
    # chromosome (from 0 for the first site) and scaled by 1e-6
    with open(args.outprefix + '.map', 'w') as map_file:
        previous_pos = 0
        current_chrom = None
        for chrom, pos in zip(vcfset['variants/CHROM'], vcfset['variants/POS']):
            if chrom != current_chrom:
                current_chrom = chrom
                previous_pos = 0
            dist = (pos - previous_pos) * 1e-6
            previous_pos = pos
            map_file.write('%s\t%s:%d\t%8.6f\t%d\n' % (chrom, chrom, pos, dist, pos))
Beispiel #10
0
    def sample_genotype_array(self, sample, part):
        """Return the genotypes of one individual as a Series indexed by position.

        If the per-sample file is not available locally (per
        ``self._check_local``), it is first produced by running
        ``bcftools view`` on ``self._chr_path``.
        """
        vcf_path = self._sample_genotype_path(sample, part)

        if not self._check_local(vcf_path):
            # NOTE(review): the command is built by string interpolation and
            # run through the shell, and its exit status is ignored -- make
            # sure sample/part never come from untrusted input.
            bcftools_cmd = (
                f'bcftools view -s {sample} -v snps -m2 -M2 -Oz '
                f'-o {vcf_path} {self._chr_path} {self._query(part)}'
            )
            subprocess.call(bcftools_cmd, shell=True, stdout=subprocess.PIPE)

        callset = allel.read_vcf(vcf_path, fields=['GT', 'POS'])
        # column 0 is the first sample in the file that was read
        first_sample_calls = allel.GenotypeArray(callset['calldata/GT'])[:, 0]
        return pd.Series(first_sample_calls, index=callset['variants/POS'])
Beispiel #11
0
def vcf2npy(vcffile, samples):
    """Convert the genotype calls of the selected samples to a haplotype matrix.

    Each sample contributes two consecutive rows (allele 0, then allele 1);
    columns follow the variants. The matrix is returned as ``np.uint8``.
    """
    calls = allel.read_vcf(vcffile, samples=samples)['calldata/GT']
    first_allele = calls[:, :, 0]
    second_allele = calls[:, :, 1]

    n_variants, n_samples = first_allele.shape
    interleaved = np.empty((2 * n_samples, n_variants))
    interleaved[0::2] = first_allele.T   # even rows: first allele
    interleaved[1::2] = second_allele.T  # odd rows: second allele

    return interleaved.astype(np.uint8)
Beispiel #12
0
def load_vcf_wrapper(path, seqid, samples):
    """Load positions and genotypes of ``samples`` for region ``seqid``.

    Returns ``(positions, genotypes)`` as an ``allel.SortedIndex`` and an
    ``allel.GenotypeArray``.
    """
    read_options = dict(
        region=seqid,
        fields=['variants/POS', 'calldata/GT', 'samples'],
        tabix="tabix",
        samples=samples,
    )
    vcf_data = allel.read_vcf(path, **read_options)

    positions = allel.SortedIndex(vcf_data["variants/POS"])
    genotypes = allel.GenotypeArray(vcf_data['calldata/GT'])
    return positions, genotypes
def main() -> None:
    """Compare two VCF files through their joint site frequency spectrum.

    Command line: ``<vcf1> <vcf2> [<population1> <population2>]``. Both
    files are filtered with ``biallelic_variant_filter``, restricted to the
    positions they share, and passed to ``joint_site_frequency_spectrum``
    together with the two population labels.
    """
    file_path1 = sys.argv[1]
    file_path2 = sys.argv[2]

    if len(sys.argv) == 5:
        population1 = sys.argv[3]
        population2 = sys.argv[4]
    else:
        # Without explicit labels these names used to stay unbound and the
        # run ended in a NameError after all the loading work; fall back to
        # the file paths as labels instead.
        population1 = file_path1
        population2 = file_path2

    callset1 = allel.read_vcf(file_path1, fields=VCF_FIELDS)
    callset2 = allel.read_vcf(file_path2, fields=VCF_FIELDS)

    genotypes1, positions1, _, _ = biallelic_variant_filter(callset1)
    genotypes2, positions2, _, _ = biallelic_variant_filter(callset2)

    # keep only the positions present in both call sets
    shared_position_indices1, shared_position_indices2 = joint_position_indices(positions1, positions2)
    positions1 = positions1[shared_position_indices1]
    positions2 = positions2[shared_position_indices2]
    genotypes1 = genotypes1[shared_position_indices1]
    genotypes2 = genotypes2[shared_position_indices2]

    # negative genotype values are replaced by 0
    genotypes1[genotypes1 < 0] = 0
    genotypes2[genotypes2 < 0] = 0

    #
    # allele_counts1 = genotypes1.reshape(genotypes1.shape[0], -1).sum(1)
    # allele_counts2 = genotypes2.reshape(genotypes2.shape[0], -1).sum(1)
    # sfs1 = allel.sfs(allele_counts1, np.product(genotypes1.shape[1:]))
    # sfs2 = allel.sfs(allele_counts2, np.product(genotypes2.shape[1:]))
    # plt.title('real vs synthetic PGP site frequency spectrum')
    # ax = plt.gca()
    # ax = allel.plot_sfs(sfs1, ax=ax, label=population1, plot_kwargs=dict([('c','b')]))
    # ax = allel.plot_sfs(sfs2, ax=ax, label=population2, plot_kwargs=dict([('c','g')]))
    # ax.legend()
    # plt.savefig(os.path.join(FIGURES_DIR, 'synthetic_PGP.PGP.sfs.png'))
    # plt.clf()
    #

    # sfs1 = site_frequency_spectrum(genotypes1, population1)
    # sfs2 = site_frequency_spectrum(genotypes2, population2)
    joint_site_frequency_spectrum(genotypes1, genotypes2, population1, population2)
Beispiel #14
0
def mutect2(inputs, normal_name) -> dict:
    """Collect the normal sample's genotype at SNP sites across several VCFs.

    Returns a nested dict ``{chrom: {pos: {"gt": <two-letter genotype>,
    "num_pass": <number of files in which the site passed the filters>}}}``.
    Sites flagged ``artifact_in_normal`` are skipped. Two counters are
    printed at the end.
    """
    chrom_pos_gt = dict()
    n_disagreements = 0
    n_not_ref = 0
    field_keys = (
        "variants/CHROM",
        "variants/POS",
        "variants/REF",
        "variants/ALT",
        "calldata/GT",
        "variants/FILTER_PASS",
        "variants/FILTER_artifact_in_normal",
    )
    for vcf_path in inputs:
        callset = allel.read_vcf(vcf_path, fields='*')
        normal_col = np.argwhere(callset["samples"] == normal_name)[0][0]
        snp_mask = callset["variants/is_snp"]
        # iterate over the SNP rows of every field in lockstep
        records = zip(*(callset[key][snp_mask] for key in field_keys))
        for chrom, pos, ref, alt, gt, is_pass, is_artifact in records:
            if is_artifact:
                continue
            chrom = str(chrom)
            pos = int(pos)
            num_pass = int(is_pass)
            # index 0 -> REF base, index 1 -> first ALT base
            ref_alt = ref + alt[0]
            first_allele = gt[normal_col][0]
            second_allele = gt[normal_col][1]
            normal = ref_alt[first_allele] + ref_alt[second_allele]
            if first_allele != 0 or second_allele != 0:
                n_not_ref += 1

            sites = chrom_pos_gt.setdefault(chrom, {})
            if pos not in sites:
                sites[pos] = {"gt": normal, "num_pass": num_pass}
                continue

            entry = sites[pos]
            entry["num_pass"] += num_pass
            stored = entry["gt"]
            # a stored homozygous call is replaced by a new heterozygous one
            if stored[0] == stored[1] and normal[0] != normal[1]:
                n_disagreements += 1
                entry["gt"] = normal
    print(f"Disagreement on normal: {n_disagreements} times.")
    print(f"Not ref: {n_not_ref} times.")
    return chrom_pos_gt
Beispiel #15
0
def fit_em_smm(
    variants_vcf: str,
    n_iterations: int,
    K: int,
    seed: int,
    logsum_approx: bool,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Initialise the EM parameters from a VCF and run ``_em_loop``.

    Args:
        variants_vcf: path of the VCF file holding the variants.
        n_iterations: number of EM iterations, forwarded to ``_em_loop``.
        K: number of groups.
        seed: seed of the random generator used for the initial values.
        logsum_approx: forwarded unchanged to ``_em_loop``.

    Returns:
        Whatever ``_em_loop`` returns.
    """
    variants = (
        allel.vcf_to_dataframe(variants_vcf, fields=[
            'POS', 'REF', 'ALT'
        ]).drop(['ALT_2', 'ALT_3'], axis=1)  # ALT_2, ALT_3 are always empty
    )
    genotypes = allel.read_vcf(variants_vcf, fields=['calldata/GT'])
    genotypes = genotypes['calldata/GT']
    # scikit-allel reads missing values as -1
    genotypes = np.where(genotypes == -1, 0, genotypes)
    # stack the first alleles of all samples on top of the second alleles,
    # then transpose so that every row is one haplotype
    haplo_1 = genotypes[:, :, 0]
    haplo_2 = genotypes[:, :, 1]
    haplos = np.hstack((haplo_1, haplo_2)).T

    n_variants_pos = (
        variants  # find number of variants by position
        .groupby('POS'
                 )  # add 1 to account for fact that we always have a reference
        .count()['REF'].values) + 1
    max_n_variants = np.sort(n_variants_pos)[-1]
    n_loci = len(variants['POS'].unique())
    n_samples = haplos.shape[0]
    haplos = _encode_haplotypes(variants['POS'].values, haplos, n_samples,
                                n_loci)

    # em initialization
    rng = np.random.default_rng(seed)
    group_e_ini = rng.random(size=(n_samples, K))
    group_e = group_e_ini / np.sum(group_e_ini, axis=1, keepdims=True)
    # One entry per group so the vector sums to 1 for any K; the former
    # hard-coded length of 6 was a probability vector only when K == 6.
    group_probs = np.full(K, 1 / K)
    variant_ini = rng.random(size=(K, n_loci, max_n_variants))
    variant_probs = variant_ini / np.sum(variant_ini, axis=2, keepdims=True)
    # TODO: add step filtering this to correct number of variants

    return _em_loop(
        n_iterations,
        K,
        n_samples,
        n_loci,
        n_variants_pos,
        group_e,
        group_probs,
        variant_probs,
        haplos,
        logsum_approx,
    )
Beispiel #16
0
def load_genotypes():
    """Load genotypes and sample names from the input named in ``args``.

    ``args.zarr`` takes precedence over ``args.vcf``.

    Returns:
        ``(genotypes, samples)`` where ``genotypes`` is an
        ``allel.GenotypeArray``.

    Raises:
        ValueError: if neither ``args.zarr`` nor ``args.vcf`` is set
            (this used to surface as an UnboundLocalError at ``return``).
    """
    if args.zarr is not None:
        print("reading zarr")
        callset = zarr.open_group(args.zarr, mode='r')
        gt = callset['calldata/GT']
        # gt[:] loads the whole genotype array into memory
        genotypes = allel.GenotypeArray(gt[:])
        samples = callset['samples'][:]
    elif args.vcf is not None:
        print("reading VCF")
        vcf = allel.read_vcf(args.vcf, log=sys.stderr)
        genotypes = allel.GenotypeArray(vcf['calldata/GT'])
        samples = vcf['samples']
    else:
        raise ValueError(
            "no genotype input given: set either args.zarr or args.vcf")
    return genotypes, samples
Beispiel #17
0
def generate_vcf_classes(vcfs):
    """Parse every VCF in ``vcfs`` and return the non-empty results.

    Each returned dict is the output of ``allel.read_vcf(path, fields="*")``
    with three changes: ``samples`` is upper-cased, ``FILE`` holds the path
    the dict was read from and ``header`` holds the result of
    ``allel.read_vcf_headers`` for that path. Files for which ``read_vcf``
    returns nothing are skipped.
    """
    print("Parsing VCFs")
    parsed_vcf_bodies = []
    for vcf_path in vcfs:
        body = allel.read_vcf(vcf_path, fields="*")
        if not body:
            continue
        # Annotate each body with its own path. Filtering the empty bodies
        # first and zipping with `vcfs` afterwards shifted FILE/header onto
        # the wrong body whenever one file was empty.
        body.update(samples=numpy.char.upper(body['samples'].tolist()))
        body.update(FILE=vcf_path)
        body.update(header=allel.read_vcf_headers(vcf_path))
        parsed_vcf_bodies.append(body)
    return parsed_vcf_bodies
Beispiel #18
0
def vcf2npy(vcfpath):
    """Convert a .vcf file to a numpy haplotype matrix of dtype ``np.uint8``.

    Rows alternate between the first and the second allele of each sample;
    columns follow the variants. Adapted from
    https://github.com/bcm-uga/Loter/blob/master/python-package/Local_Ancestry_Example.ipynb
    """
    calls = allel.read_vcf(vcfpath)['calldata/GT']
    allele_a = calls[:, :, 0]
    allele_b = calls[:, :, 1]

    n_sites, n_individuals = allele_a.shape
    haplotype_rows = np.empty((2 * n_individuals, n_sites))
    haplotype_rows[0::2] = allele_a.T
    haplotype_rows[1::2] = allele_b.T

    return haplotype_rows.astype(np.uint8)
Beispiel #19
0
def vcf2npy(vcffile, samples):
    """Read the selected samples of a VCF into a haplotype matrix.

    Returns ``(matrix, samples, chrom, pos)``: the ``np.uint8`` matrix has
    two consecutive rows per sample (first allele, second allele) and one
    column per variant; the other three items are the ``samples``,
    ``variants/CHROM`` and ``variants/POS`` arrays of the call set.
    """
    wanted_fields = ['samples', 'variants/CHROM', 'variants/POS', 'calldata/GT']
    callset = allel.read_vcf(vcffile, samples=samples, fields=wanted_fields)

    calls = callset['calldata/GT']
    strand_one, strand_two = calls[:, :, 0], calls[:, :, 1]

    n_variants, n_kept = strand_one.shape
    stacked = np.empty((2 * n_kept, n_variants))
    stacked[::2] = strand_one.T
    stacked[1::2] = strand_two.T

    return (
        stacked.astype(np.uint8),
        callset['samples'],
        callset['variants/CHROM'],
        callset['variants/POS'],
    )
Beispiel #20
0
 def test_few_chrom_small_region_num(self):
     """A region length of 500000 on the few-chromosome VCF yields 16 output files."""
     self.data_holder.filename = VCF_DATA_FEW_CHR
     self.region_len = 500000
     chromosomes = create_bed_files_and_extract_chromosomes(
         data_holder=self.data_holder,
         output_dir=self.output_dir,
         region_len=self.region_len,
     )

     # the helper must report as many chromosomes as the VCF contains
     vcf_data = allel.read_vcf(self.data_holder.filename)
     distinct_chroms = set(vcf_data['variants/CHROM'])
     self.assertEqual(len(distinct_chroms), len(chromosomes))

     # number of entries written to the output directory
     written_entries = listdir(self.output_dir)
     self.assertEqual(len(written_entries), 16)
Beispiel #21
0
def loadvcf(vcfFile):
    """Load a VCF with scikit-allel, using an HDF5 copy when one exists.

    The HDF5 path is the VCF path with a trailing ``vcf`` replaced by
    ``h5`` (``h5`` is simply appended when the path does not end in
    ``vcf``). If that file exists it is opened read-only and the
    ``h5py.File`` is returned. Otherwise the VCF is parsed with
    ``allel.read_vcf``, the HDF5 copy is created for the next call, and the
    parsed call set is returned.
    """
    print("loading vcf file...")
    print("using scitkit allele version:", allel.__version__)
    # Replace only the trailing "vcf". str.strip("vcf") removes any of the
    # characters v/c/f from BOTH ends of the path, which mangled names such
    # as "variants.vcf" into "ariants.h5".
    if vcfFile.endswith("vcf"):
        h5 = vcfFile[:-len("vcf")] + "h5"
    else:
        h5 = vcfFile + "h5"
    if os.path.isfile(h5):
        callset = h5py.File(h5, 'r')
    else:
        callset = allel.read_vcf(vcfFile)
        print("creating h5 for faster loading")
        allel.vcf_to_hdf5(vcfFile, h5)
    return(callset)
Beispiel #22
0
def loadvcf(vcfFile):
    """Load a VCF with scikit-allel, using an HDF5 copy when one exists.

    The HDF5 path is the VCF path with a trailing ``vcf`` replaced by
    ``h5`` (``h5`` is simply appended when the path does not end in
    ``vcf``). If that file exists it is opened read-only and the
    ``h5py.File`` is returned. Otherwise the VCF is parsed with
    ``allel.read_vcf``, the HDF5 copy is created for the next call, and the
    parsed call set is returned.
    """
    print("loading vcf file...")
    print("using scitkit allele version:", allel.__version__)
    # Replace only the trailing "vcf". str.strip("vcf") removes any of the
    # characters v/c/f from BOTH ends of the path, which mangled names such
    # as "variants.vcf" into "ariants.h5".
    if vcfFile.endswith("vcf"):
        h5 = vcfFile[:-len("vcf")] + "h5"
    else:
        h5 = vcfFile + "h5"
    if os.path.isfile(h5):
        callset = h5py.File(h5, 'r')
    else:
        callset = allel.read_vcf(vcfFile)
        print("creating h5 for faster loading")
        allel.vcf_to_hdf5(vcfFile, h5)
    return (callset)
Beispiel #23
0
def simulate(reference_file_bcf, sample_file, idx_to_pop_map, genetic_map_file,
             generations, num_out, sim_output_path, chm, use_phase_shift):
    """Run the rfmix ``simulate`` binary once per generation and save npy outputs.

    A founders file is written first via ``write_founders_bcf``. For every
    entry of ``generations`` the simulation is run into
    ``<sim_output_path>/gen_<gen>/`` and two arrays are saved there:
    ``mat_vcf_2d.npy`` (the reshaped genotype calls of the simulated VCF)
    and ``mat_map.npy`` (the sample map translated through
    ``idx_to_pop_map``).
    """

    # path to RFMix/simulate binary
    rfmix_sim_path = "./Admixture/simulate"
    # NOTE: If running from a different directory than XGMIX/, this needs to
    # be updated.

    # assume everyone in the sample map is founder
    print("Creating founders.bcf.gz for {}".format(sample_file))
    write_founders_bcf(reference_file_bcf, sample_file, sim_output_path)

    for gen in generations:

        print("-"*80)
        print("Simulation for generation {} from {}".format(gen,sample_file))

        # generation simulation output path
        gen_path = join_paths(sim_output_path, "gen_"+str(gen))
        out_basename = gen_path+'/admix'

        # Simulating the individuals via rfmix simulation
        print("simulating ...")
        rfmix_sim_cmd = rfmix_sim_path + " -f %s/founders.bcf.gz -m %s/founders.map -g %s -o %s --growth-rate=1.5 --maximum-size=2000 --n-output=%d -c %s -G %d -p %f --random-seed=%d %s"
        dephase_flag = "--dephase" if use_phase_shift else ""
        rfmix_sim = rfmix_sim_cmd % (sim_output_path, sim_output_path, genetic_map_file, out_basename, num_out, chm, gen, 0.0, 123, dephase_flag)
        try:
            run_shell_cmd(rfmix_sim, verb=True)
        except Exception:
            # Retry with a "chr"-prefixed chromosome name. Catching
            # Exception (not a bare except) lets Ctrl-C / SystemExit stop
            # the run instead of triggering the retry.
            print("something went wrong using rfmix/simulate ...", end=" ")
            print("trying -c chr"+chm+" insted of -c "+chm+" ...")
            rfmix_sim = rfmix_sim_cmd % (sim_output_path, sim_output_path, genetic_map_file, out_basename, num_out, "chr"+chm, gen, 0.0, 123, dephase_flag)
            run_shell_cmd(rfmix_sim, verb=True)

        # reading .vcf output of simulation and converting to npy matricies
        print('reading .vcf output of simulation and converting to npy matricies ...')
        vcf_data = allel.read_vcf(out_basename+".query.vcf")
        chm_len, nout, _ = vcf_data["calldata/GT"].shape
        # flatten the last two axes, then transpose so variants are columns
        mat_vcf_2d = vcf_data["calldata/GT"].reshape(chm_len,nout*2).T
        np.save(gen_path+"/mat_vcf_2d.npy", mat_vcf_2d)

        # reading .map output of simulation and converting to npy matricies
        print('reading .map output of simulation and converting to npy matricies ...')
        map_path = out_basename + ".result"
        matrix = sample_map_to_matrix(map_path)

        # Finally map them to original labels (which can be further mapped to coordinates) and saving
        matrix = np.vectorize(idx_to_pop_map.get)(matrix)
        np.save(gen_path+"/mat_map.npy",matrix)

    print("-"*80)
    print("Finishing up ...")
Beispiel #24
0
def import_data(callset_path):
    """Read the VCF at ``callset_path`` with ploidy-dependent ``numbers``.

    The ``numbers`` argument of ``allel.read_vcf`` comes from
    ``get_numbers_dict(args.ploidy)``; only the fields listed below are
    loaded.
    """
    field_numbers = get_numbers_dict(args.ploidy)
    wanted_fields = [
        'samples',
        'calldata/GT',
        'variants/CHROM',
        'variants/FILTER',
        'variants/POS',
        'variants/REF',
        'variants/ALT',
    ]
    return allel.read_vcf(callset_path,
                          numbers=field_numbers,
                          fields=wanted_fields)
Beispiel #25
0
    def load_data(self, genome_file, panel_file, population_file):
        """Load the three input files and derive the population arrays.

        ``genome_file`` is read as a VCF; ``panel_file`` and
        ``population_file`` are read as tab-separated tables. Sets
        ``self.genome_file``, ``self.panel_file``, ``self.population_file``,
        ``self.panel_pop``, ``self.df_code`` and ``self.df_desc``.
        """
        self.genome_file = allel.read_vcf(genome_file)
        self.panel_file = pd.read_csv(panel_file, sep='\t')
        self.population_file = pd.read_csv(population_file, sep="\t")

        # basic processing: trim whitespace around the population labels
        panel = self.panel_file
        panel['pop'] = panel['pop'].str.strip()
        self.panel_pop = panel['pop'].values

        populations = self.population_file
        self.df_code = populations["Population Code"].dropna().values
        # the last non-null description is dropped
        descriptions = populations["Population Description"].dropna().values
        self.df_desc = descriptions[:-1]
Beispiel #26
0
def readVcf(inFile, logDebug):
    """Read the first sample of a VCF and derive per-SNP genotype weights.

    When ``logDebug`` is false, ``sys.stderr`` is redirected to an
    in-memory buffer while the file is parsed and restored afterwards.

    Returns:
        ``(DPmean, snpCHR, snpPOS, snpGT, snpWEI)``. ``DPmean`` is the mean
        of ``calldata/DP`` over the kept SNPs, or the string ``"NA"`` when
        the VCF has no DP field. ``snpWEI`` is ``exp(PL / -10)`` when
        ``calldata/PL`` is present, otherwise a 0/1 matrix with three
        columns built from ``parseGT``.
    """
    log.info("reading the VCF file")
    ## We read only one sample from the VCF file
    if logDebug:
        vcf = allel.read_vcf(inFile, samples=[0], fields='*')
    else:
        import sys
        try:
            from StringIO import StringIO  # Python 2
        except ImportError:
            from io import StringIO  # Python 3: the StringIO module is gone
        saved_stderr = sys.stderr
        sys.stderr = StringIO()
        try:
            vcf = allel.read_vcf(inFile, samples=[0], fields='*')
        finally:
            # always restore stderr, even if parsing fails
            sys.stderr = saved_stderr
    (snpCHR, snpsREQ) = parseChrName(vcf['variants/CHROM'])
    try:
        gt_strings = allel.GenotypeArray(vcf['calldata/GT']).to_gt()
        snpGT = gt_strings[snpsREQ, 0]
    except AttributeError:
        snpmatch.die("input VCF file doesnt have required GT field")
    # drop the SNPs whose genotype string is './.'
    snpsREQ = snpsREQ[np.where(snpGT != './.')[0]]
    snpGT = gt_strings[snpsREQ, 0]
    if 'calldata/PL' in vcf:
        snpWEI = np.copy(vcf['calldata/PL'][snpsREQ, 0]).astype('float')
        snpWEI = snpWEI / (-10)
        snpWEI = np.exp(snpWEI)
    else:
        snpBinary = parseGT(snpGT)
        snpWEI = np.ones((len(snpsREQ), 3))  ## for homozygous and heterozygous calls
        # leave a 1 only in the column matching the parsed genotype code:
        # code 0 -> column 0, code 1 -> column 2, code 2 -> column 1
        snpWEI[np.where(snpBinary != 0), 0] = 0
        snpWEI[np.where(snpBinary != 1), 2] = 0
        snpWEI[np.where(snpBinary != 2), 1] = 0
    snpCHR = snpCHR[snpsREQ]
    if 'calldata/DP' in vcf:
        DPmean = np.mean(vcf['calldata/DP'][snpsREQ, 0])
    else:
        DPmean = "NA"
    snpPOS = np.array(vcf['variants/POS'][snpsREQ])
    return (DPmean, snpCHR, snpPOS, snpGT, snpWEI)
Beispiel #27
0
def read_haplotype_matrix_from_vcf(filepath):
    """
    Read a VCF file into a numpy haplotype matrix.

    The two alleles of each sample become two consecutive rows; columns
    follow the variants and the dtype of the genotype calls is kept.
    Everything is assumed to be phased.
    """
    calls = allel.read_vcf(filepath)['calldata/GT']
    first_strand = calls[:, :, 0].T
    second_strand = calls[:, :, 1].T

    n_individuals, n_sites = first_strand.shape
    haps = np.empty((n_individuals * 2, n_sites), dtype=first_strand.dtype)
    haps[0::2] = first_strand
    haps[1::2] = second_strand

    return haps
def process_vcf(vcf_file):
    """Flatten the genotype calls of a VCF into a float matrix with labels.

    Returns ``(gt_matrix, rs_IDs, ind_IDs, positions)``: ``gt_matrix`` is a
    float32 array with one row per variant in which negative calls are
    replaced by NaN, ``rs_IDs`` holds the variant IDs as integers with
    their first two characters removed, ``ind_IDs`` holds every sample name
    twice with the suffixes ``_A`` and ``_B``, and ``positions`` is the POS
    column as a list.
    """
    callset = allel.read_vcf(vcf_file)
    calls = callset['calldata/GT']
    n_variants, n_samples, ploidy = calls.shape

    # one row per variant, alleles of all samples side by side
    gt_matrix = calls.reshape(n_variants, n_samples * ploidy).astype(np.float32)
    gt_matrix[gt_matrix < 0] = np.nan

    rs_IDs = [int(variant_id[2:]) for variant_id in callset['variants/ID']]
    ind_IDs = np.array([
        sample + suffix
        for sample in callset['samples']
        for suffix in ('_A', '_B')
    ])
    positions = callset['variants/POS'].tolist()
    return gt_matrix, rs_IDs, ind_IDs, positions
Beispiel #29
0
def load_vcf_wrapper(path, seqid, samples, samples_path):
    """Load positions and genotypes of ``samples`` for region ``seqid``.

    ``samples_path`` is only used in the assertion message raised when the
    call set contains no ``samples`` entry. Returns ``(positions,
    genotypes)`` as an ``allel.SortedIndex`` and an ``allel.GenotypeArray``.
    """
    read_options = dict(
        region=seqid,
        fields=['variants/POS', 'calldata/GT', 'samples'],
        tabix="tabix",
        samples=samples,
    )
    vcf_data = allel.read_vcf(path, **read_options)

    assert "samples" in vcf_data.keys(), (
        "None of the samples provided in {0!r} are found in {1!r}".format(
            samples_path, path))

    positions = allel.SortedIndex(vcf_data["variants/POS"])
    genotypes = allel.GenotypeArray(vcf_data['calldata/GT'])
    return positions, genotypes
Beispiel #30
0
def load_vcf_wrapper(path, seqid, samples, samples_path):
    """Read one region of a VCF for the given samples.

    Asserts that the call set has a ``samples`` entry (the message names
    ``samples_path`` and ``path``), then returns the positions as an
    ``allel.SortedIndex`` and the calls as an ``allel.GenotypeArray``.
    """
    wanted_fields = ['variants/POS', 'calldata/GT', 'samples']
    region_calls = allel.read_vcf(path,
                                  region=seqid,
                                  fields=wanted_fields,
                                  tabix="tabix",
                                  samples=samples)

    has_samples = "samples" in region_calls.keys()
    assert has_samples, \
        "None of the samples provided in {0!r} are found in {1!r}".format(
            samples_path, path)

    sorted_positions = allel.SortedIndex(region_calls["variants/POS"])
    genotype_array = allel.GenotypeArray(region_calls['calldata/GT'])

    return sorted_positions, genotype_array
Beispiel #31
0
def main(vcffile: str = typer.Argument(..., help="总vcf文件"),
         groupfile: str = typer.Argument(..., help='样本分群信息,两列,一列vcf中的样本ID,一列对应的群体ID。不需要包含vcf中的全部个体。'),
         outfile: str = typer.Argument(..., help='输出文件名(.ac.tsv.gz)'),
         region: str = typer.Option(None, help='只使用指定区域chrom:start-end或者chrom')):
    """Convert a VCF file to the TreeMix input format; the conversion can be limited to a region and to the samples listed in the group file."""
    group2samples = load_group(groupfile)

    # Read the VCF once and count alleles per group on that group's own
    # samples. Previously the file was re-read for every group without any
    # sample restriction, so every group column held the same all-sample
    # counts.
    callset = allel.read_vcf(vcffile, fields=['calldata/GT', 'samples'], numbers={'ALT': 1}, region=region)
    genotypes = allel.GenotypeArray(callset['calldata/GT'])
    sample_index = {name: idx for idx, name in enumerate(callset['samples'])}

    df = {}
    for group, samples in group2samples.items():
        samples_missed = set(samples) - set(sample_index)
        if samples_missed:
            print(f'{len(samples_missed)} / {len(samples)} samples missed in {group}:')
            print(','.join(samples_missed))
        # column indices of this group's samples (duplicates removed)
        members = [sample_index[name] for name in dict.fromkeys(samples) if name in sample_index]
        # max_allele=1 always yields exactly the two columns written below
        allele_counts = genotypes.count_alleles(max_allele=1, subpop=members)
        df[group] = pd.DataFrame(allele_counts).apply(lambda x: ','.join([str(x[0]), str(x[1])]), axis=1).values
    df = pd.DataFrame(df)
    df.to_csv(outfile, sep=' ', index=False, compression='gzip')
Beispiel #32
0
def load_vcf2array(vcffile, region: None, chrom2sites: None, samples,
                   outsamples: None):
    """Load biallelic genotype calls of a region as a three-dimensional array.

    Args:
        vcffile: path of the VCF file.
        region: region string handed to ``allel.read_vcf``; may be ``None``
            as long as ``chrom2sites`` is not used.
        chrom2sites: optional mapping from chromosome name to the positions
            to keep (presumably a set or list per chromosome -- verify
            against the caller).
        samples: samples to read from the VCF.
        outsamples: optional outgroup samples; at sites where their 0
            alleles outnumber their non-zero alleles, the codes 0 and 1 are
            swapped for every sample.

    Returns:
        ``(gt_array, samples, pos_array)``.

    Raises:
        ValueError: if ``chrom2sites`` is given without a ``region``.
    """
    callset = allel.read_vcf(vcffile,
                             region=region,
                             samples=samples,
                             fields=['samples', 'calldata/GT', 'variants/POS'])
    gt_array = callset['calldata/GT']  # three-dimensional array
    samples_all = callset['samples']
    pos_array = callset['variants/POS']

    # keep only the sites listed in the sitefile
    if chrom2sites:
        # The chromosome name is only needed for this lookup, so it is
        # derived here; region=None no longer fails when no site filter is
        # requested.
        if region is None:
            raise ValueError('region is required when chrom2sites is given')
        chrom = region.split(':')[0]
        wanted_sites = chrom2sites[chrom]
        selection_requiredsites = np.array(
            [x in wanted_sites for x in pos_array], dtype=bool)
        gt_array = gt_array[selection_requiredsites, :, :]
        pos_array = pos_array[selection_requiredsites]
        print(
            f'{np.sum(selection_requiredsites)} remained according to the sitefile.'
        )

    # keep only biallelic sites: multi-allelic heterozygotes cannot be told
    # apart later, e.g. 0/2 and 1/1 both sum to 2
    selection_biallelic = np.max(np.max(gt_array, axis=2), axis=1) < 2
    gt_array = gt_array[selection_biallelic, :, :]
    pos_array = pos_array[selection_biallelic]
    n_sites, n_samples, n_hap = gt_array.shape
    print(f'{n_sites} biallelic sites were remained.')

    # recode the alleles according to their frequency in the outgroup samples
    if outsamples:
        selection_outsamples = select_samples(samples_all, outsamples)
        print(f'{np.sum(selection_outsamples)} outgroup samples in vcf file.')
        gt_array_out = gt_array[:, selection_outsamples, :].reshape(
            n_sites,
            np.sum(selection_outsamples) * n_hap)
        # sites where ref (0) is more frequent than non-ref (> 0); missing
        # calls (-1) are counted on neither side
        selection_swtich = np.sum(gt_array_out == 0, axis=1) > np.sum(
            gt_array_out > 0, axis=1)
        print(f'Swtich REF and ALT in {np.sum(selection_swtich)} sites.')
        if gt_array.size:  # min()/max() raise on an empty array
            assert gt_array.min() >= -1
            assert gt_array.max() <= 1
        # swap 0 and 1 through the temporary code 9; -1 stays untouched
        gt_swtich = gt_array[selection_swtich, :, :]
        gt_swtich[gt_swtich == 1] = 9
        gt_swtich[gt_swtich == 0] = 1
        gt_swtich[gt_swtich == 9] = 0
        gt_array[selection_swtich, :, :] = gt_swtich
    return gt_array, samples_all, pos_array
# import scikit-allel
import allel
import sys

# Command-line arguments: input VCF path, then output text path
in_vcf, out_file = sys.argv[1], sys.argv[2]

# Parse the VCF into a dict of arrays
callset = allel.read_vcf(in_vcf)

# The next three expressions only have a visible effect in an interactive
# session: the available keys, the reference alleles, the raw genotype calls.
sorted(callset.keys())
callset['variants/REF']
callset['calldata/GT']

# Wrap the genotype calls in a GenotypeArray
gt = allel.GenotypeArray(callset['calldata/GT'])

# Write the string form of the genotype array to the output file
with open(out_file, 'w') as fh_out:
    print(gt, end='', file=fh_out)