Esempio n. 1
0
def load_text_format_data(mapfn, pop_a_fn, pop_b_fn):
    """Load a SNP map file plus two per-population genotype text files.

    Parameters
    ----------
    mapfn : path to a headerless, 6-column map file (ID, CHROM, GDist, POS,
        REF, ALT). Tab-delimited is tried first, then space-delimited.
    pop_a_fn, pop_b_fn : paths to whitespace-delimited integer matrices, one
        row per variant, read with ``np.loadtxt`` and reshaped so that
        consecutive column pairs form one call.

    Returns
    -------
    tuple of (genotypes A, genotypes B, sorted positions, GDist column).

    Raises
    ------
    AssertionError if the position index contains NaN values.
    """
    columns = ["ID", "CHROM", "GDist", "POS", "REF", "ALT"]

    tbl = pd.read_csv(mapfn, sep="\t", header=None, engine="c")

    try:
        # Assigning six names fails with ValueError when the tab split did
        # not yield six columns, i.e. the file uses another delimiter.
        tbl.columns = columns
    except ValueError:
        logger.info("File not tab delimited as expected- trying with spaces")
        tbl = pd.read_csv(mapfn,
                          sep=" ",
                          header=None,
                          engine="c",
                          names=columns)

    try:
        vartbl = allel.VariantChunkedTable(tbl.to_records(), index="POS")
    except ValueError:
        # Building the POS index failed; retry once on a sorted copy.
        tbl = tbl.sort_values(["CHROM", "POS"])
        logger.warning(
            "Possible SNPs file is not sorted. Attempting to sort. This is likely to be inefficient"
        )
        vartbl = allel.VariantChunkedTable(tbl.to_records(), index="POS")

    d1 = np.loadtxt(pop_a_fn, dtype="int8")
    geno1 = allel.GenotypeChunkedArray(d1.reshape((d1.shape[0], -1, 2)))

    d2 = np.loadtxt(pop_b_fn, dtype="int8")
    geno2 = allel.GenotypeChunkedArray(d2.reshape((d2.shape[0], -1, 2)))

    # Build the position index once and return the validated object instead
    # of constructing a second, identical SortedIndex.
    pos = allel.SortedIndex(vartbl.POS[:])
    assert np.isnan(pos).sum() == 0, "nans values are not supported"

    return geno1, geno2, pos, vartbl.GDist[:]
Esempio n. 2
0
def load_text_format_data(mapfn, pop_a_fn, pop_b_fn):
    """Read a space-delimited SNP map and two genotype matrices.

    The map file is parsed into six named columns (ID, CHROM, GDist, POS,
    REF, ALT). Each genotype file is loaded as int8 and reshaped to
    (rows, -1, 2) before being wrapped in a chunked genotype array.

    Returns (genotypes A, genotypes B, sorted positions, GDist column).
    """
    map_columns = ["ID", "CHROM", "GDist", "POS", "REF", "ALT"]
    snp_map = pd.read_csv(mapfn, sep=" ", names=map_columns)
    variants = allel.VariantChunkedTable(snp_map.to_records(), index="POS")

    def _read_genotypes(path):
        # One row per variant; consecutive column pairs form one call.
        calls = np.loadtxt(path, dtype="int8")
        n_rows = calls.shape[0]
        return allel.GenotypeChunkedArray(calls.reshape((n_rows, -1, 2)))

    genotypes_a = _read_genotypes(pop_a_fn)
    genotypes_b = _read_genotypes(pop_b_fn)

    positions = allel.SortedIndex(variants.POS[:])
    genetic_dist = variants.GDist[:]
    return genotypes_a, genotypes_b, positions, genetic_dist
Esempio n. 3
0
    def work(self):
        """Write a TSV of pseudo-haplotypes at unlinked sites.

        Reads the genotype callset at ``self.input()['syn'].path``, keeps the
        sites that ``allel.locate_unlinked`` reports as unlinked at threshold
        ``self.max_linkage``, converts them to haplotypes, and writes one row
        per haplotype (sample name followed by allele codes) to
        ``self.output().path`` via a temporary file.
        """
        import numpy as np
        import allel
        import h5py
        import pandas as pd
        from luigi.file import atomic_file

        # Opens the SynSNPS file, which contains only biallelic synonymous sites.
        # The context manager guarantees the HDF5 handle is released even if
        # one of the computations below raises.
        with h5py.File(self.input()['syn'].path, mode='r') as callset:
            genotypes = allel.GenotypeChunkedArray(
                callset['calldata']['genotype'])
            samples = np.array([x.decode() for x in callset['samples']])

            # Selects site with r**2 linkage < max_linkage
            n_ref = genotypes.to_n_ref(fill=-9)
            unlinked = allel.locate_unlinked(n_ref,
                                             threshold=self.max_linkage)[:]

            # Create pseudohaplotypes (0=ref, 1=alt, -1=missing).
            # ``genotypes[:]`` loads the data into memory, so nothing below
            # the ``with`` block touches the file any more.
            hap_matrix = genotypes[:][unlinked].to_haplotypes()

        # Double up the sample names: each sample contributes two haplotypes.
        samples_dup = np.repeat(samples, 2).reshape(-1, 1)

        hap_df = pd.DataFrame(np.hstack((samples_dup, hap_matrix.T)))

        # Atomic write TSV file output
        af = atomic_file(self.output().path)
        hap_df.to_csv(af.tmp_path, sep='\t', index=False)
        af.move_to_final_destination()
def genotype_array_from_vcf(filename):
    """
    Read the 'calldata/GT' field of a VCF file into a chunked genotype array.

    @Params: filename: relative path to genotype data file
    """
    callset = allel.read_vcf(filename)
    genotypes = allel.GenotypeChunkedArray(callset['calldata/GT'])
    logging.info(f"Loaded Genotype Data from file {filename}")
    return genotypes
Esempio n. 5
0
def plth12(chromlist):
    """Plot Garud's H12 along each chromosome and save one PDF per entry.

    For every ``c`` in ``chromlist`` the file ``PNG.phased.X.recode.<c>.h5``
    is read, H12 is computed in non-overlapping windows of ``window_size``
    variants, and the per-window value is drawn against the position of every
    variant in that window. The figure is saved as ``PNG.<c>.H12.pdf``.

    ``window_size`` is not a parameter: it is read from the enclosing module
    scope, exactly as in the original code.
    """
    for c in chromlist:
        # callset = h5py.File("PNG.phased.autosomal.recode.{}.h5".format(c), mode='r')
        with h5py.File("PNG.phased.X.recode.{}.h5".format(c),
                       mode='r') as callset:
            g = allel.GenotypeChunkedArray(callset["calldata/GT"])
            h = g.to_haplotypes()
            pos = allel.SortedIndex(callset["variants/POS"][:])
            # H12 is the second of the four statistics returned.
            h12 = allel.moving_garud_h(h, window_size)[1]  # set window size

        # Expand the per-window statistic to one value per variant. Windows
        # are defined by variant count, so each value covers exactly
        # ``window_size`` consecutive variants. Repeating by count avoids the
        # out-of-range ``pos[end]`` lookup that occurred whenever the number
        # of variants was an exact multiple of the window size.
        h12_pos = [value for value in h12 for _ in range(window_size)]

        # Variants after the last complete window inherit the last value.
        n_missing = len(pos) - len(h12_pos)
        if n_missing > 0:
            h12_pos.extend([h12[-1]] * n_missing)

        plt.plot(pos, h12_pos)
        plt.xlabel("{} genomic position".format(c))
        plt.ylabel("H12")
        plt.savefig("PNG.{}.H12.pdf".format(c))
        plt.clf()
Esempio n. 6
0
def get_genotype_array_concat(callsets,
                              genotype_array_type=config.GENOTYPE_ARRAY_DASK):
    """Return one genotype array covering all ``callsets``.

    A single callset is delegated to ``get_genotype_array``. Otherwise the
    genotype data of every callset is fetched and concatenated along axis 0,
    then wrapped in the scikit-allel class selected by
    ``genotype_array_type``.

    Raises ValueError for an unrecognised ``genotype_array_type`` (only
    reached after the data of every callset has been fetched).
    """
    if len(callsets) == 1:
        # Nothing to concatenate.
        return get_genotype_array(callset=callsets[0],
                                  genotype_array_type=genotype_array_type)

    wants_dask = genotype_array_type == config.GENOTYPE_ARRAY_DASK

    pieces = []
    for single in callsets:
        data = get_callset_genotype_data(single)
        if wants_dask:
            # Wrap the underlying array in a dask array with matching chunks.
            data = da.from_array(data, chunks=data.chunks)
        pieces.append(data)

    if wants_dask:
        return allel.GenotypeDaskArray(da.concatenate(pieces, axis=0))
    if genotype_array_type == config.GENOTYPE_ARRAY_CHUNKED:
        return allel.GenotypeChunkedArray(np.concatenate(pieces, axis=0))
    if genotype_array_type == config.GENOTYPE_ARRAY_NORMAL:
        return allel.GenotypeArray(np.concatenate(pieces, axis=0))

    raise ValueError(
        'Error: Invalid option specified for genotype_array_type.')
Esempio n. 7
0
def load_zarr_data(zarr_fn, chrom, s1, s2, gdistkey=None):
    """Load genotypes for two sample sets from one chromosome of a zarr store.

    Parameters
    ----------
    zarr_fn : path of the zarr group to open read-only.
    chrom : key of the chromosome sub-group inside the store.
    s1, s2 : passed to ``get_sample_ids`` to obtain the sample names of each
        set; every name must be present in the store's ``samples`` array.
    gdistkey : optional name of a dataset under ``variants`` holding genetic
        distances. When ``None`` no distances are loaded.

    Returns
    -------
    tuple of (genotypes of set 1, genotypes of set 2, sorted positions,
    genetic distances or ``None``).

    Raises
    ------
    ValueError if a requested sample name is not in the store.
    """
    import zarr

    samples1 = get_sample_ids(s1)
    samples2 = get_sample_ids(s2)

    zfh = zarr.open_group(zarr_fn, mode="r")[chrom]

    samples_x = zfh["samples"][:]
    sample_name = [sid.decode() for sid in samples_x.tolist()]

    idx1 = np.array([sample_name.index(sid) for sid in samples1])
    idx2 = np.array([sample_name.index(sid) for sid in samples2])

    g = allel.GenotypeChunkedArray(zfh["calldata"]["genotype"])

    pos = allel.SortedIndex(zfh["variants"]["POS"][:])

    if gdistkey is not None:
        # Read from the zarr group opened above. The original referenced an
        # undefined ``h5fh`` here and raised NameError whenever a key was
        # supplied.
        gdist = zfh["variants"][gdistkey][:]
    else:
        gdist = None

    return g.take(idx1, axis=1), g.take(idx2, axis=1), pos, gdist
Esempio n. 8
0
def load_hdf5_data(hdf5_fn, chrom, s1, s2):
    """Load genotypes for two sample lists from an HDF5 callset.

    ``s1`` and ``s2`` are iterables of sample names that must all occur in
    the file's ``samples`` dataset. ``chrom`` is accepted but not used.

    Returns (genotypes of s1, genotypes of s2, sorted positions).
    """
    h5 = h5py.File(hdf5_fn, mode='r')
    names = [raw.decode() for raw in h5['samples'][:].tolist()]

    def _columns(wanted):
        # Column index of each requested sample; ValueError if one is missing.
        return np.array([names.index(sid) for sid in wanted])

    cols1 = _columns(s1)
    cols2 = _columns(s2)

    genotypes = allel.GenotypeChunkedArray(h5["calldata/GT"])
    positions = allel.SortedIndex(h5["variants/POS"][:])

    subset1 = genotypes.take(cols1, axis=1)
    subset2 = genotypes.take(cols2, axis=1)
    return subset1, subset2, positions
Esempio n. 9
0
def main(vcffile, pop1, pop2, binwidth, stepsize, outprefix):
    """
    Compute Fst between pop1 and pop2
    using the method of Hudson (1992) elaborated by Bhatia et al. (2013).

    Parameters
    ----------
    vcffile : path of the VCF file to read.
    pop1, pop2 : paths of text files listing one sample name per line.
    binwidth : width of each window, in base pairs.
    stepsize : offset between successive window sets, in base pairs.
    outprefix : prefix of the two output files.

    Writes ``<outprefix>_persite.tsv.gz`` (per-site Fst) and
    ``<outprefix>_meanFst.tsv.gz`` (ratio-of-sums Fst per window), and
    prints the genome-wide mean Fst.
    """
    # Read the sample lists with context managers so the files are closed,
    # and keep them as sets for O(1) membership tests below.
    with open(pop1) as handle:
        pop1_ids = {line.strip() for line in handle}
    with open(pop2) as handle:
        pop2_ids = {line.strip() for line in handle}

    callset = allel.read_vcf(vcffile)
    allsamples = callset['samples']
    genotypes = allel.GenotypeChunkedArray(callset['calldata/GT'])
    # Select every site in the VCF.
    # NOTE(review): the mask is one element longer than the number of
    # variants; kept as-is because getAC is defined elsewhere - confirm.
    variant_selection = np.full((genotypes.shape[0] + 1), True)
    sample_selection = [x in pop1_ids for x in allsamples]
    ac1 = getAC(genotypes, variant_selection, sample_selection)
    sample_selection = [x in pop2_ids for x in allsamples]
    ac2 = getAC(genotypes, variant_selection, sample_selection)

    num, den = allel.hudson_fst(ac1, ac2)
    fst = num / den
    meanFst = np.sum(num) / np.sum(den)
    print('meanFst: %s' % meanFst)

    chrom = callset['variants/CHROM']
    pos = callset['variants/POS']
    df = pd.DataFrame({'chrom': chrom, 'pos': pos, 'hudson_Fst': fst})
    df.to_csv(f'{outprefix}_persite.tsv.gz',
              sep='\t',
              index=False,
              na_rep='nan',
              compression='gzip')
    df['num'] = num
    df['den'] = den

    # sliding bins: one full tiling of the genome per offset
    rows = []
    for offset in range(0, binwidth, stepsize):
        df['bin_index'] = ((df['pos'].values - 1) - offset) // binwidth
        for group_name, gdf in df.groupby(by=['chrom', 'bin_index']):
            bin_chrom, bin_index = group_name
            start = bin_index * binwidth + offset + 1
            if start < 0:
                # Leading windows that start before the chromosome are
                # shorter than binwidth; skip them.
                continue
            end = start + binwidth - 1
            n_snp = gdf.shape[0]
            sum_num = gdf['num'].sum()
            sum_den = gdf['den'].sum()
            # Ratio of sums; undefined when the denominator is not positive.
            # A separate name keeps the genome-wide ``meanFst`` intact.
            bin_fst = sum_num / sum_den if sum_den > 0 else np.nan
            rows.append([bin_chrom, start, end, n_snp, bin_fst])

    bdf = pd.DataFrame(rows,
                       columns=['chrom', 'start', 'end', 'n_snp',
                                'meanFst']).sort_values(by=['chrom', 'start'])
    bdf.to_csv(f'{outprefix}_meanFst.tsv.gz',
               index=False,
               compression='gzip',
               sep='\t',
               float_format='%.3f')
0
def get_genotype_array(callset,
                       genotype_array_type=config.GENOTYPE_ARRAY_DASK):
    """Wrap a callset's genotype data in the requested scikit-allel class.

    Returns a ``GenotypeArray``, ``GenotypeDaskArray`` or
    ``GenotypeChunkedArray`` according to ``genotype_array_type``, or
    ``None`` when the type is not one of the three known constants.
    """
    raw_genotypes = get_callset_genotype_data(callset)

    # Checked in the same order as the original if/elif chain.
    wrappers = (
        (config.GENOTYPE_ARRAY_NORMAL, allel.GenotypeArray),
        (config.GENOTYPE_ARRAY_DASK, allel.GenotypeDaskArray),
        (config.GENOTYPE_ARRAY_CHUNKED, allel.GenotypeChunkedArray),
    )
    for array_kind, wrapper in wrappers:
        if genotype_array_type == array_kind:
            return wrapper(raw_genotypes)

    return None
Esempio n. 11
0
    def run(self, n_samples=1000):
        """Run PCA on a random subset of variants, then plot, write and log.

        ``n_samples`` variants are drawn without replacement from
        ``self.genome_file['calldata/GT']``, converted to alternate-allele
        counts and projected onto two principal components. The result is
        handed to ``self.plot`` and ``self.write``; when both return a truthy
        value a ``<timestamp>.log`` file describing the run is written.

        Raises ValueError (from ``np.random.choice``) when ``n_samples``
        exceeds the number of variants.
        """
        start = time.time()
        self.timestamp = int(start)

        gt = allel.GenotypeChunkedArray(self.genome_file['calldata/GT'])
        gn = gt.to_n_alt()
        vidx = np.random.choice(gn.shape[0], n_samples, replace=False)
        vidx.sort()
        gnr = gn.take(vidx, axis=0)[:]

        pca = PCA(n_components=2)
        gnu_pca = pca.fit_transform(gnr.T)

        reducer = umap.UMAP()
        #gnu_umap = reducer.fit_transform(gnr.T)

        gnu_pca_df = pd.DataFrame(gnu_pca, self.panel_pop)
        gnu_pca_df["pop"] = gnu_pca_df.index

        # NOTE(review): the UMAP fit above is disabled, so this frame holds
        # the PCA coordinates again - confirm this is intended.
        gnu_umap_df = pd.DataFrame(gnu_pca, self.panel_pop)
        gnu_umap_df["pop"] = gnu_umap_df.index

        plot_signal = self.plot(gnu_pca_df, gnu_umap_df)
        write_signal = self.write(gnu_pca_df, gnu_umap_df)

        end = time.time()

        if (plot_signal and write_signal):
            # The context manager closes the log even if a write fails.
            with open(str(self.timestamp) + ".log", "w+") as log_file:
                log_file.write("Log File Start \n")
                log_file.write("\nRun time : " + str(end - start))

                log_file.write("\nRandom Seed : " + str(self.seed))

                log_file.write("\nPackage Versions : \n")
                log_file.write("\nScikit Allel Version : " +
                               str(allel.__version__))
                log_file.write("\nSklearn Version : " +
                               str(sklearn.__version__))
                log_file.write("\nNumpy Version : " + str(np.__version__))
                #log_file.write("Matplotlib Version : " + str(matplotlib.__version__))
                log_file.write("\nPandas Version : " + str(pd.__version__))
                log_file.write("\nUMAP Version : " + str(umap.__version__))

                log_file.write("\nPCA Parameters : \n")
                log_file.write(str(pca))
                log_file.write("\nUMAP Parameters : \n")
                log_file.write(str(reducer))
Esempio n. 12
0
def pltPi(chromlist):
    """Plot windowed nucleotide diversity (pi) for each chromosome.

    For every ``c`` in ``chromlist`` the file
    ``PNG.phased.autosomal.recode.<c>.h5`` is read, diversity is computed
    with ``allel.windowed_diversity`` in windows of size 10, and pi is
    plotted against the window midpoints. The figure is saved as
    ``PNG.<c>.Pi.pdf``.

    The original body plotted an undefined ``h12_pos`` (NameError) and reused
    the H12 axis label and output filename; both were copy-paste leftovers.
    """
    for c in chromlist:
        with h5py.File("PNG.phased.autosomal.recode.{}.h5".format(c),
                       mode='r') as callset:
            # callset = h5py.File("PNG.phased.X.recode.{}.h5".format(c), mode='r')
            g = allel.GenotypeChunkedArray(callset["calldata/GT"])
            pos = allel.SortedIndex(callset["variants/POS"][:])
            acc = g.count_alleles()
            # windowed_diversity returns (pi, windows, n_bases, counts);
            # ``windows`` holds one (start, stop) row per window.
            pi_windowed, windows, _n_bases, _counts = allel.windowed_diversity(
                pos, acc, size=10)

        window_mid = windows.mean(axis=1)
        plt.plot(window_mid, pi_windowed)
        plt.xlabel("{} genomic position".format(c))
        plt.ylabel("Pi")
        plt.savefig("PNG.{}.Pi.pdf".format(c))
        plt.clf()
Esempio n. 13
0
def write_hap_array(pop, chrom, p1, p2, name, samples, inaccessible=False):
    """Write the segregating haplotypes of one population and region as TSV.

    Genotypes and positions for ``chrom`` are read from one of two hard-coded
    zarr locations (selected by ``inaccessible``), restricted to samples whose
    ``samples.population`` equals ``pop`` and to positions in ``[p1, p2]``,
    and reduced to segregating sites. The table, indexed by position, is
    written to ``../data/<pop>/<chrom>/sweep_hapl_<name>``. Currently used
    for iSAFE.
    """
    # Choose the pair of zarr locations; both are opened read-only below.
    if inaccessible is False:
        gt_path = f"/home/sanj/ag1000g/data/ag1000g.phase2.ar1.pass/{chrom}/calldata/GT/"
        pos_path = f"/home/sanj/ag1000g/data/ag1000g.phase2.ar1.pass/{chrom}/variants/POS"
    else:
        gt_path = f"/media/sanj/Sanj_HDD/Ag1000g/ag1000g.phase2.ar1/{chrom}/calldata/GT/"
        pos_path = f"/media/sanj/Sanj_HDD/Ag1000g/ag1000g.phase2.ar1/{chrom}/variants/POS"

    Ag_store = zarr.open_array(gt_path, mode='r')
    positions = zarr.open_array(pos_path, mode='r')[:]

    print("--------------------------------------------------")
    print(f"Zarrs loaded: {pop}, Chromosome {chrom}")

    # Restrict to the requested population and convert to haplotypes.
    all_genotypes = allel.GenotypeChunkedArray(Ag_store)
    in_population = samples.population == pop

    print("Constructing HaplotypeArray")
    haplotypes = all_genotypes.compress(in_population, axis=1).to_haplotypes()

    # Row indices of the variants located inside [p1, p2].
    in_window = (positions >= p1) & (positions <= p2)
    region_rows = np.where(in_window)[0]
    region_haps = haplotypes.take(region_rows, axis=0)

    # Keep only segregating sites, and apply the same filter to positions.
    segregating = region_haps.count_alleles().is_segregating()
    region_haps = region_haps.compress(segregating, axis=0)
    kept_positions = positions.take(region_rows[segregating])

    table = pd.DataFrame(data=region_haps)
    table.index = kept_positions
    table.to_csv(f'../data/{pop}/{chrom}/sweep_hapl_{name}', index=True, sep="\t")
    print(f"Writing Haplotype array for {name} region for iSAFE algorithm")
Esempio n. 14
0
def main(h5file, samplesfile, outgroupfile, outprefix, blen):
    """
    use samples in samplesfile to calculate pair-wise outgroup-f3
    use samples in outgroupfile as outgroup
    h5file generate from vcffile from scikit-allel(1.1.10)
    import allel; allel.vcf_to_hdf5('in.vcf.gz', 'out.h5')

    Writes two symmetric sample-by-sample tables: ``<outprefix>.f3.tsv``
    (f3 estimates) and ``<outprefix>.z.tsv`` (Z scores). Cells that are never
    computed, including the diagonal, stay ``None``.
    """
    print(__doc__)
    print('scikit-allel', allel.__version__)

    # Samples to analyse, one name per line; the file is closed on exit.
    with open(samplesfile) as handle:
        samples = [line.strip() for line in handle]

    n_samples = len(samples)
    # First row/column index of each sample name: the same result as
    # ``samples.index`` but without a linear scan per pair.
    first_index = {}
    for idx, sample in enumerate(samples):
        first_index.setdefault(sample, idx)

    f3_ay = np.full((n_samples, n_samples), None)
    z_ay = np.full((n_samples, n_samples), None)

    # Keep the HDF5 file open for everything that may read from it lazily,
    # and close it before the results are written.
    with h5py.File(h5file, mode='r') as callset:
        allsamples = list(callset['samples'])  # every sample in the VCF
        calldata = callset['calldata']
        genotypes = allel.GenotypeChunkedArray(calldata['GT'])
        # Select every site in the VCF.
        # NOTE(review): the mask is one element longer than the number of
        # variants; kept as-is because the helpers are defined elsewhere.
        variant_selection = np.full((genotypes.shape[0] + 1), True)
        ac_outgroup = cal_outgroup_ac(genotypes, outgroupfile, allsamples,
                                      variant_selection)
        ac_dict = cal_all_ac(genotypes, samples, allsamples,
                             variant_selection)

        print('begin to cal outgroup f3')
        # Number of unordered pairs, without materialising them.
        n_comb = n_samples * (n_samples - 1) // 2
        print(f'total combinations is {n_comb}')

        pairs = combinations(samples, 2)
        for n_iter, (sample1, sample2) in enumerate(pairs, start=1):
            x = first_index[sample1]
            y = first_index[sample2]
            print(f'{n_iter}/{n_comb}')
            f3, se, z, vb, vj = allel.average_patterson_f3(ac_dict[sample1],
                                                           ac_dict[sample2],
                                                           ac_outgroup, blen)
            # The statistic is symmetric in the two samples.
            f3_ay[x, y] = f3
            f3_ay[y, x] = f3
            z_ay[x, y] = z
            z_ay[y, x] = z

    pd.DataFrame(f3_ay, columns=samples,
                 index=samples).to_csv(f'{outprefix}.f3.tsv', sep='\t')
    pd.DataFrame(z_ay, columns=samples,
                 index=samples).to_csv(f'{outprefix}.z.tsv', sep='\t')
Esempio n. 15
0
def sim_load_h5_to_PCA(h5_path):
    '''
    Load genotypes from an HDF5 file, filter and LD-prune the variants,
    fit a 2-component PCA and save the coordinates.
    input: path of the h5 file
    output: PCA coordinates (also written to data_s//tgp_pca2.txt)
    '''
    # Reference: http://alimanfoo.github.io/2015/09/28/fast-pca.html
    h5 = h5py.File(h5_path, mode='r')
    genotypes = allel.GenotypeChunkedArray(h5['calldata/GT'])
    allele_counts = genotypes.count_alleles()[:]

    # Drop multiallelic SNPs and singletons (the latter carry no information
    # for PCA).
    at_most_one_alt = allele_counts.max_allele() == 1
    not_singleton = allele_counts[:, :2].min(axis=1) > 1
    keep = at_most_one_alt & not_singleton

    # 2-D matrix holding the number of non-reference alleles per call.
    alt_counts = genotypes.compress(keep, axis=0).to_n_alt()

    # LD pruning: SNPs are features and tend to be correlated. Three rounds;
    # further rounds remove almost nothing. This step is slow.
    for round_no in range(1, 4):
        unlinked = allel.locate_unlinked(alt_counts,
                                         size=500,
                                         step=200,
                                         threshold=.1)
        n_kept = np.count_nonzero(unlinked)
        n_dropped = alt_counts.shape[0] - n_kept
        print('iteration', round_no, 'retaining', n_kept, 'removing',
              n_dropped, 'variants')
        alt_counts = alt_counts.compress(unlinked, axis=0)

    # PCA
    n_components = 2
    coords, _model = allel.pca(alt_counts,
                               n_components=n_components,
                               scaler='patterson')
    out_path = 'data_s//tgp_pca' + str(n_components) + '.txt'
    np.savetxt(out_path, coords, delimiter=',')
    return coords
Esempio n. 16
0
def prepData(directory, outfn, newVCF, samples, bs):
    """Open the HDF5 callset for ``outfn`` and return transformed genotypes.

    When ``newVCF == True`` a VCF is first generated from ``samples`` via
    ``makeVCF``; otherwise an existing one is assumed. The callset is read
    from ``<directory>/analysis-vcf2hdf5/<outfn>.snps.hdf5``. ``bs`` is only
    relevant to the conversion step, which is currently disabled.

    Returns (transformed genotypes, open h5py callset).
    """
    # Either reuse an existing VCF or build a new one from the sample list.
    if newVCF == True:
        makeVCF(directory, samples, outfn)

    # Path consumed by the disabled VCF -> HDF5 conversion; kept so the
    # string concatenation behaves exactly as before.
    vcf_path = directory + outfn + ".vcf"

    hdf5_path = directory + '/analysis-vcf2hdf5/' + outfn + ".snps.hdf5"
    h5_callset = h5py.File(hdf5_path, mode='r')

    # Genotype data, then the project-specific transformation.
    genotypes = allel.GenotypeChunkedArray(h5_callset['genos'])
    transformed = transform(genotypes)

    return transformed, h5_callset
Esempio n. 17
0
def main(callset, samplesA, samplesB, window_size):
    """Compute Hudson's Fst between two sample sets in fixed-size windows.

    Windows start at position 1 and advance by ``window_size`` up to the last
    variant position. Sample names not present in the callset are dropped
    before ``check_samples`` is called on each set.

    Returns a DataFrame indexed by window start with columns ``start``,
    ``stop``, ``nvar`` and ``fst``. Windows without variants get ``nvar`` 0
    and keep a missing ``fst``.
    """
    known_samples = callset["samples"][:].astype("U8").tolist()

    def _present_indices(wanted):
        # Column index of every requested sample that exists in the callset.
        return [known_samples.index(s) for s in wanted if s in known_samples]

    sa = _present_indices(samplesA)
    sb = _present_indices(samplesB)

    check_samples(sa, samplesA, "A")
    check_samples(sb, samplesB, "B")

    positions = allel.SortedIndex(callset["variants/POS"])
    window_starts = np.arange(1, positions[-1], window_size, dtype=int)

    df = pd.DataFrame(columns=["start", "stop", "nvar"], index=window_starts)
    df["fst"] = pd.Series(index=window_starts, dtype=float)
    df["start"] = window_starts
    df["stop"] = window_starts + window_size

    gt = allel.GenotypeChunkedArray(callset["calldata/GT"])

    for window_start in window_starts:
        window_stop = window_start + window_size - 1
        try:
            loc = positions.locate_range(window_start, window_stop)
        except KeyError:
            # No variants fall inside this window.
            df.at[window_start, "nvar"] = 0
            continue

        window_gt = gt[loc]
        counts_a = window_gt.count_alleles(subpop=sa)
        counts_b = window_gt.count_alleles(subpop=sb)

        num, den = allel.stats.hudson_fst(counts_a, counts_b)
        # Ratio of sums over the variants of the window.
        df.at[window_start, "fst"] = (np.sum(num) / np.sum(den))
        df.at[window_start, "nvar"] = num.size

    return df
Esempio n. 18
0
def pca(directory, outfn, column, newVCF=False, samples=None, bs=20000):
    """
    main function to run pca visualization

    Reads ``<directory><outfn>.vcf``, transforms the genotypes, fits a
    10-component PCA with Patterson scaling and passes the result to
    ``fig_pca``, labelling samples by the metadata column ``column``.

    ``newVCF`` and ``bs`` are accepted for interface compatibility but are
    not used while the ``prepData`` path is disabled.
    """
    # The leftover ``import pdb`` debugging import was removed.

    #gn, callset = prepData(directory, outfn, newVCF, samples, bs)
    callset = allel.read_vcf(directory + outfn + ".vcf")

    g = allel.GenotypeChunkedArray(callset['calldata/GT'])
    gn = transform(g)

    ## get metadata
    df = fp.retrieveMetaData(samples, directory, outfn)

    coords1, model1 = allel.pca(gn, n_components=10, scaler='patterson')

    fig_pca(directory,
            outfn,
            coords1,
            model1,
            'Conventional PCA.',
            sample_population=df[column])
Esempio n. 19
0
def import_data(filepath,
                chrom_name,
                names=['POS', 'REF', 'ALT', 'DP', 'MQ', 'QD', 'num_alleles']):
    '''Take the path to a well-formed h5py file and return a chunked
    variant table and a chunked genotype array for one chromosome.

    filepath: path of the HDF5 file, opened read-only and left open because
        the returned objects wrap datasets of that file.
    chrom_name: key of the chromosome group inside the file.
    names: variant fields to expose in the table.

    Raises ValueError when the two objects differ in number of rows.
    '''

    ##to-do: check that h5py file is well-formed

    callset_handle = filepath
    callset = h5py.File(callset_handle, mode='r')

    # Hand the library its own copy so the shared default list can never be
    # modified through the table.
    field_names = list(names) if names is not None else None

    variants = allel.VariantChunkedTable(callset[chrom_name]['variants'],
                                         names=field_names,
                                         index='POS')

    genotypes = allel.GenotypeChunkedArray(
        callset[chrom_name]['calldata']['genotype'])

    if genotypes.shape[0] != variants.shape[0]:
        # Implicit concatenation instead of a backslash continuation, which
        # embedded the next line's indentation in the message.
        raise ValueError("Genotypes and variant table must contain the "
                         "same number of positions")

    return variants, genotypes
'''

# Command-line handling: 14 positional arguments, or 16 when an optional
# [segmentStart segmentEnd] pair is supplied after chrLen.
if len(sys.argv) not in (15, 17):
    sys.exit(
        "usage:\npython2 empirical_convert_to_FVs.py chrArmFileName chrArm chrLen [segmentStart segmentEnd] subWinSize numSubWins unmaskedFracCutoff pMisPol partialStatAndDafFileName maskFileName ancestralArmFaFileName sampleToPopFileName targetPop statFileName fvecFileName\n"
    )
if len(sys.argv) == 17:
    (chrArmFileName, chrArm, chrLen, segmentStart, segmentEnd, subWinSize,
     numSubWins, unmaskedFracCutoff, pMisPol, partialStatAndDafFileName,
     maskFileName, ancestralArmFaFileName, sampleToPopFileName, targetPop,
     statFileName, fvecFileName) = sys.argv[1:]
else:
    (chrArmFileName, chrArm, chrLen, subWinSize, numSubWins,
     unmaskedFracCutoff, pMisPol, partialStatAndDafFileName, maskFileName,
     ancestralArmFaFileName, sampleToPopFileName, targetPop, statFileName,
     fvecFileName) = sys.argv[1:]
    # No segment requested: the whole arm is processed.
    segmentStart = None

# Open the callset and pull out the per-arm datasets.
chrArmFile = h5py.File(chrArmFileName, "r")
genos = allel.GenotypeChunkedArray(chrArmFile[chrArm]["calldata"]["genotype"])
positions = allel.SortedIndex(chrArmFile["/%s/variants/POS" % (chrArm)][:])
refAlleles = chrArmFile[chrArm]['variants']['REF']
altAlleles = chrArmFile[chrArm]['variants']['ALT']
samples = chrArmFile[chrArm]["samples"]
chrLen = int(chrLen)
assert chrLen > 0
if segmentStart is not None:
    segmentStart, segmentEnd = int(segmentStart), int(segmentEnd)
    assert segmentStart > 0 and segmentEnd >= segmentStart
    # Indices of the SNPs whose position lies inside the requested segment.
    snpIndicesToKeep = [
        index for index, position in enumerate(positions)
        if segmentStart <= position <= segmentEnd
    ]
    genos = allel.GenotypeArray(genos.subset(sel0=snpIndicesToKeep))
    positions = [positions[x] for x in snpIndicesToKeep]
Esempio n. 21
0
# Script section: build a per-contig table of windowed nucleotide diversity
# for the samples whose metadata population is 'UGS'.
# Relies on `release_dir`, `callset` and `accessibility` being defined
# elsewhere in the file.

# load the prerolled genomeplot instance
f = genomeplot.anophelesgambiae.load()

# load metadata
meta = pd.read_table(os.path.join(release_dir, "samples/samples.meta.txt"),
                     index_col=0)

# identify Ugandan samples
# NOTE(review): these index labels are passed to `take` below, which
# presumably expects integer column positions - confirm the metadata index
# matches the callset's sample order.
ugs_samples = meta.query("population == 'UGS'").index

# contig name -> DataFrame of windowed diversity results
diversity = {}

# loop through contigs to generate diversity frame
for seq in f.contigs:
    # genotypes restricted to the selected samples (axis 1)
    gt = allel.GenotypeChunkedArray(callset[seq]["calldata/genotype"]).take(
        ugs_samples, axis=1)
    pos = allel.SortedIndex(callset[seq]["variants/POS"])
    accessible = accessibility[seq]["is_accessible"]
    ac = gt.count_alleles()

    # diversity in windows of size 100000, given the accessibility mask
    pi, windows, bases, counts = allel.stats.windowed_diversity(
        pos, ac, size=100000, is_accessible=accessible)

    # one row per window; `windows` columns 0 and 1 become start and stop
    diversity[seq] = pd.DataFrame.from_dict({
        "pi": pi,
        "start": windows.T[0],
        "stop": windows.T[1],
        "nbases": bases,
        "counts": counts
    })
Esempio n. 22
0
def selective_sweep(chroms,
                    pop,
                    samples,
                    haplo=True,
                    plot=False,
                    inaccessible=False):
    """ Function to calculate H12 statistic across chromosome for given population. Currently not standardised or normalised.

    chroms: iterable of chromosome names to process.
    pop: population label matched against ``samples.population``.
    samples: sample metadata exposing a ``population`` column.
    haplo: when True the haplotype array is included in the result.
    plot: when True a PNG is saved per chromosome under ``../data/``.
    inaccessible: selects which of the two hard-coded zarr stores is read.

    Returns ``(pop_haplo, h12, rounded median positions, positions)`` or,
    when ``haplo`` is not True, the same tuple without ``pop_haplo``. Only
    the values of the LAST chromosome in ``chroms`` are returned; an empty
    ``chroms`` raises UnboundLocalError.
    """

    for chrom in chroms:

        ############ Read zarrs #############
        if inaccessible is False:
            gt_path = f"/home/sanj/ag1000g/data/ag1000g.phase2.ar1.pass/{chrom}/calldata/GT/"
            pos_path = f"/home/sanj/ag1000g/data/ag1000g.phase2.ar1.pass/{chrom}/variants/POS"
        else:
            gt_path = f"/media/sanj/Sanj_HDD/Ag1000g/ag1000g.phase2.ar1/{chrom}/calldata/GT/"
            pos_path = f"/media/sanj/Sanj_HDD/Ag1000g/ag1000g.phase2.ar1/{chrom}/variants/POS"

        Ag_store = zarr.open_array(gt_path, mode='r')
        positions = zarr.open_array(pos_path, mode='r')[:]

        print("--------------------------------------------------")
        print(f"Zarrs loaded: {pop}, Chromosome {chrom}")

        ############ Load intro gen.array and compute statistics ###########
        ag_geno = allel.GenotypeChunkedArray(Ag_store)
        pop_bool = samples.population == pop

        print("Constructing HaplotypeArray")
        pop_geno = ag_geno.compress(pop_bool, axis=1)
        pop_haplo = pop_geno.to_haplotypes()

        print("Computing statistics")
        # H statistics and median position in windows of 1000 variants.
        h1, h12, h123, h2_h1 = allel.moving_garud_h(pop_haplo, size=1000)
        median_pos = allel.moving_statistic(positions, np.median, size=1000)

        print(f"mean {chrom} h12", np.mean(h12))

        if plot is True:

            print("Producing figure")
            sns.set_palette("muted")
            xtick = np.arange(0, median_pos.max(), 1000000)
            plt.figure(figsize=(30, 10))
            # Keyword arguments: recent seaborn releases reject positional
            # x/y vectors.
            sns.lineplot(
                x=median_pos,
                y=h12).set_title(f'{pop} {chrom} H12 in 1000 snp windows')
            plt.xticks(xtick)
            plt.savefig(f"../data/{pop}/{chrom}/{pop}_{chrom}_H12_scatter.png",
                        dpi=800)
            # Actually call close(); the bare attribute access was a no-op
            # and left one open figure per chromosome.
            plt.close()

    if haplo is True:
        return (pop_haplo, h12, np.around(median_pos), positions)
    else:
        return (h12, np.around(median_pos), positions)
Esempio n. 23
0
def get_haplos(pops,
               chrom,
               p1,
               p2,
               samples,
               inaccessible=False,
               geno=False,
               biallelic=False,
               zarrpath=None):
    """ Returns a haplotype array or genotype array for the region and populations requested

    pops: population labels tested with ``samples.population.isin``.
    chrom: chromosome name, used only for the hard-coded default stores.
    p1, p2: inclusive bounds of the region.
    samples: sample metadata exposing a ``population`` column.
    inaccessible: selects which hard-coded store is read when ``zarrpath``
        is not given.
    geno: return the genotype array instead of the haplotype array.
    biallelic: keep only sites for which ``is_biallelic_01`` is true.
    zarrpath: root of a zarr store containing ``calldata/GT`` and
        ``variants``; when omitted (``None``, ``False`` or empty) the
        hard-coded stores are used.

    Returns ``(array, population mask, region mask, positions)``.
    """

    print(
        '---------------------- retrieving haplotypes -----------------------')

    # Open Zarrs, genotype and variant data

    # The default ``zarrpath=None`` used to fail the ``is False`` test and
    # fall through to opening "None/calldata/GT/". Any falsy value now
    # selects the hard-coded stores.
    if not zarrpath:
        if inaccessible is False:
            ############ Read zarrs #############
            Ag_array = zarr.open_array(
                f"/home/sanj/ag1000g/data/ag1000g.phase2.ar1.pass/{chrom}/calldata/GT/",
                mode='r')
            Ag_store = zarr.open_group(
                f"/home/sanj/ag1000g/data/ag1000g.phase2.ar1.pass/{chrom}/variants/",
                mode='r')
        else:
            Ag_array = zarr.open_array(
                f"/media/sanj/Sanj_HDD/Ag1000g/ag1000g.phase2.ar1/{chrom}/calldata/GT/",
                mode='r')
            Ag_store = zarr.open_group(
                f"/media/sanj/Sanj_HDD/Ag1000g/ag1000g.phase2.ar1/{chrom}/variants/",
                mode='r')
    else:
        # An explicit store is used as given; ``inaccessible`` made no
        # difference here (both original branches were identical).
        Ag_array = zarr.open_array(f'{zarrpath}/calldata/GT/', mode='r')
        Ag_store = zarr.open_group(f'{zarrpath}/variants/', mode='r')

    variants = allel.VariantChunkedTable(
        Ag_store, names=['POS', 'REF', 'ALT', 'DP', 'MQ',
                         'QD'], index='POS')[:]

    positions = allel.SortedIndex(variants['POS'])
    positions = positions.intersect_range(p1, p2)
    # focus on haplotype region
    sweep_region = (variants['POS'] >= p1) & (variants['POS'] <= p2)

    ag_geno = allel.GenotypeChunkedArray(Ag_array)
    print('Zarr arrays opened')
    ag_geno = ag_geno.compress(sweep_region, axis=0)

    print(
        f'------------------------------- {pops} ------------------------------------'
    )
    # Restrict genotypeArray to population and make HapArray
    pop_bool = samples.population.isin(pops)
    pop_geno = ag_geno.compress(pop_bool, axis=1)
    pop_haplo = pop_geno.to_haplotypes()
    print("HaplotypeArray constructed")

    if biallelic is True:
        ac = pop_geno.count_alleles()
        bi_al = ac.is_biallelic_01()
        pop_haplo = pop_haplo.compress(bi_al, axis=0)
        # Filter the genotypes too, so that with ``geno=True`` the returned
        # array still lines up with the filtered positions.
        pop_geno = pop_geno.compress(bi_al, axis=0)
        positions = positions[bi_al]

    if geno is True:
        return (pop_geno, pop_bool, sweep_region, positions)
    else:
        return (pop_haplo, pop_bool, sweep_region, positions)
Esempio n. 24
0
def multiple_alignment(pops, chrom, p1, p2, samples, hap_only=False):
    """Write a multiple sequence alignment FASTA and a metadata table for a region.

    For every population in ``pops`` the genotypes with ``p1 <= POS <= p2`` on
    ``chrom`` are converted to haplotypes, and each haplotype is rendered as
    one sequence with one entry per variant: the first ALT for allele 1, the
    second ALT for allele 2 and REF for every other call (allele 0, but also
    anything else such as missing calls). Useful for constructing
    phylogenetic trees (in IQTREE, e.g.). Sites are NOT restricted to
    bi-allelic variants.

    Side effects: writes ``haplotypes/{chrom}/{chrom}_{p1}_{p2}.fasta`` and
    the tab-separated ``haplotypes/{chrom}/{chrom}_{p1}_{p2}.metadata``.
    The caller's ``samples`` frame is left untouched.

    Parameters
    ----------
    pops : iterable
        Population labels compared against ``samples.population``.
    chrom : str
        Chromosome name, interpolated into the zarr paths and output names.
    p1, p2 : int
        Inclusive bounds of the region.
    samples : pandas.DataFrame
        Must provide the columns ``ox_code``, ``population``, ``country``,
        ``region`` and ``m_s``. Presumably its rows are in the same order as
        the sample axis of the genotype array -- confirm against the caller.
    hap_only : bool
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    tuple
        ``(multi_fastas, all_haps)``: the per-haplotype ``hap``/``seq`` frame
        (``hap`` without the leading ``>``) and the same frame joined
        column-wise with the sample metadata repeated once per haplotype.
    """

    print(
        '---------------------- multiple sequence alignment -----------------------'
    )

    # Open Zarrs, genotype and variant data
    Ag_array = zarr.open_array(
        f"/home/sanj/ag1000g/data/ag1000g.phase2.ar1.pass/{chrom}/calldata/GT/",
        mode='r')
    Ag_store = zarr.open_group(
        f"/home/sanj/ag1000g/data/ag1000g.phase2.ar1.pass/{chrom}/variants/",
        mode='r')

    variants = allel.VariantChunkedTable(
        Ag_store, names=['POS', 'REF', 'ALT', 'DP', 'MQ', 'QD'], index='POS')
    # focus on haplotype region (POS is loaded once and reused for both bounds)
    all_pos = variants['POS'][:]
    sweep_region = (all_pos >= p1) & (all_pos <= p2)
    variants_in_region = variants.compress(sweep_region, axis=0)
    ag_geno = allel.GenotypeChunkedArray(Ag_array)
    print('Zarr arrays opened')
    ag_geno = ag_geno.compress(sweep_region, axis=0)

    # Load REF/ALT for the region into memory once, as column vectors that
    # broadcast against a (variants x haplotypes) matrix, instead of indexing
    # the chunked table once per variant per haplotype.
    # Assumes ALT has at least two columns, as the per-variant [1] lookup
    # this replaces did.
    ref_bases = np.asarray(variants_in_region['REF'][:]).astype(str)
    alt_bases = np.asarray(variants_in_region['ALT'][:]).astype(str)
    ref_col = ref_bases[:, np.newaxis]
    first_alt_col = alt_bases[:, 0][:, np.newaxis]
    second_alt_col = alt_bases[:, 1][:, np.newaxis]

    # clean metadata; assign() works on a copy so the caller's frame is not
    # given a new 'species' column behind its back
    species_map = {'M': 'coluzzii', 'S': 'gambiae'}
    samples = samples.assign(species=samples['m_s'].map(species_map))
    samples = samples[[
        'ox_code', 'population', 'country', 'species', 'region'
    ]]

    # per-population pieces, concatenated after the loop
    fasta_frames = []
    sample_frames = []
    haplotypes_done = 0
    for pop in pops:
        print(
            f'------------------------------- {pop} ------------------------------------'
        )
        # Restrict genotypeArray to population and make HapArray
        pop_bool = samples.population == pop
        pop_geno = ag_geno.compress(pop_bool, axis=1)
        pop_haplo = pop_geno.to_haplotypes()
        print("HaplotypeArray constructed")

        print(f"Extracting variants and writing to Pandas Dataframe")
        hap_calls = np.asarray(pop_haplo[:])
        # Allele 1 -> first ALT, allele 2 -> second ALT, anything else -> REF.
        # The nested where() makes the three cases mutually exclusive; the
        # previous `if / if / else` chain replaced every allele-1 base by REF.
        base_matrix = np.where(
            hap_calls == 1, first_alt_col,
            np.where(hap_calls == 2, second_alt_col, ref_col))
        sequences = [
            ''.join(base_matrix[:, n]) for n in range(base_matrix.shape[1])
        ]

        # One label per haplotype, in haplotype order: each sample contributes
        # two consecutive haplotypes, so its label is repeated twice. Labels
        # are attached positionally, not through pandas index alignment.
        # NOTE(review): both haplotypes of a sample share the same label;
        # confirm downstream tools accept duplicate FASTA names.
        pop_samples = samples[pop_bool]
        sample_labels = ('>' + pop_samples['population'].astype(str) + '_' +
                         pop_samples['ox_code'].astype(str)).values
        hap_labels = np.repeat(sample_labels, 2)

        fasta_frames.append(
            pd.DataFrame({
                "hap": hap_labels,
                "seq": sequences
            },
                         columns=["hap", "seq"]))
        sample_frames.append(pop_samples)
        haplotypes_done += len(sequences)
        print(haplotypes_done, "Haplotypes complete")

    # Join the dfs of different pops
    if fasta_frames:
        multi_fastas = pd.concat(fasta_frames, ignore_index=True)
        all_samples = pd.concat(sample_frames)
    else:
        multi_fastas = pd.DataFrame(columns=["hap", "seq"])
        all_samples = samples.iloc[0:0]

    # Write the FASTA directly: a header line followed by its sequence line.
    # This avoids relying on a CSV writer accepting "\n" as field separator.
    with open(f"haplotypes/{chrom}/{chrom}_{p1}_{p2}.fasta",
              "w") as fasta_file:
        fasta_file.writelines(
            f"{hap}\n{seq}\n"
            for hap, seq in zip(multi_fastas['hap'], multi_fastas['seq']))
    print('Multiple alignment FASTA written')

    #remove > and join with metadata for each pop, useful for plotting phylo trees
    multi_fastas['hap'] = multi_fastas['hap'].str.strip('>')
    all_haps = pd.DataFrame(np.repeat(all_samples.values, 2, axis=0))
    all_haps.columns = all_samples.columns
    all_haps = pd.concat([multi_fastas.reset_index(drop=True), all_haps],
                         axis=1)

    all_haps.to_csv(f"haplotypes/{chrom}/{chrom}_{p1}_{p2}.metadata",
                    sep="\t",
                    index=False,
                    header=True)

    return (multi_fastas, all_haps)
Esempio n. 25
0
            b'#CHROM', b'POS', b'ID', b'REF', b'ALT', b'QUAL', b'FILTER',
            b'INFO', b'FORMAT'
        ]

        # rememeber to act on all 1st level keys!
        # does not support multiple chromosomes currently!
        # Actually should probably add to filter script...
        assert len(h5_handle.keys()) <= 1
        for k in h5_handle.keys():

            fh_samples = [str(s) for s in callset_fn['3R']["samples"][:]]
            samples = list(compress(fh_samples, pop_selection))
            missing_rates = np.zeros(len(samples))
            ok_samples = np.ones(len(samples), dtype="bool")

            gt = allel.GenotypeChunkedArray(h5_handle[k][:])

            if not args.keepmissing:

                missing_gt = gt.is_missing()

                for i, s in enumerate(samples):

                    consecutive_miss = get_consecutive_true(missing_gt[:, i])
                    miss_rate_i = consecutive_miss / float(missing_gt.shape[0])

                    print("Missing rate of", s, ':',
                          "{:.8f}".format(miss_rate_i),
                          "({0}/{1})".format(i + 1, len(samples)))
                    missing_rates[i] = miss_rate_i
# Summary of the sample table: number of rows, the distinct values found in
# the `oc_popc` column, and the number of rows per "population" group.
print("* Samples     = ", oc_samples.shape[0])
print("* Populations = ", set(oc_samples[oc_popc]))
print(oc_samples.groupby(("population")).size())

# Phased variants and genotypes:
# declare objects with variant data
# NOTE(review): no `mode` is passed, so the open mode is whatever the
# installed h5py uses by default -- confirm read-only access is intended.
# The handle stays open for the rest of the script.
oc_hapcall   = h5py.File(oc_hapcall_fn)
# variants of genotypes
print("Variants phased...")
oc_hapcall_var = oc_hapcall[chrom]["variants"]
# Only POS/REF/ALT are exposed from the variants group; POS is the index.
oc_hapvars = allel.VariantChunkedTable(oc_hapcall_var,names=["POS","REF","ALT"],index="POS")
print(oc_hapvars.shape)
# genotype data
print("Genotypes phased...")
oc_hapcall_hap = oc_hapcall[chrom]["calldata"]["genotype"]
oc_haploty     = allel.GenotypeChunkedArray(oc_hapcall_hap)
# Keep only the entries selected by `oc_samples_bool` via `sel1`
# (presumably the sample axis -- confirm against the scikit-allel docs).
oc_haploty     = oc_haploty.subset(sel1=oc_samples_bool)
print(oc_haploty.shape)


# Effects:
# Variant annotation fields are read from the "variants" group of a zarr
# store (opened with zarr's default mode); POS is the index.
oc_effcall     = zarr.open(oc_effcall_fn)
oc_effvars     = allel.VariantChunkedTable(oc_effcall["variants"],names=[
    "POS","REF","ALT","ANN_HGVS_p","ANN_HGVS_c",
    "ANN_Annotation","ANN_AA_pos","ANN_CDS_pos",
    "ANN_Feature_ID","ANN_Gene_ID","ANN_Gene_Name"
],index="POS")


# Is effect among phased variants?
# Boolean mask over the effect table: True where that POS value also occurs
# among the phased variants' POS values.
is_eff_in_phased = np.isin(oc_effvars["POS"], oc_hapvars["POS"])
Esempio n. 27
0
def get_alternates(pops,
                   chrom,
                   p1,
                   p2,
                   samples,
                   haps=None,
                   t=0.3,
                   missense=True,
                   inaccessible=False):
    """Return, per population, the variants whose ALT frequency exceeds ``t``.

    Variants with ``p1 < POS < p2`` (both bounds exclusive) on ``chrom`` are
    considered. A variant is kept for a population when the frequency of its
    first or second alternate allele is above ``t``; it is then annotated
    with the SnpEff ``Annotation``, ``HGVS_p`` and ``Gene_Name`` fields.

    Parameters
    ----------
    pops : iterable
        Population labels compared against ``samples.population``.
    chrom : str
        Chromosome name, used in the zarr paths and as HDF5 group key.
    p1, p2 : int
        Exclusive bounds of the region.
    samples : pandas.DataFrame
        Sample metadata with a ``population`` column.
    haps : optional
        Pre-built haplotypes. When given they are used for EVERY population
        in ``pops`` instead of being derived from the genotype store; they
        are presumably already restricted to the region -- confirm against
        the caller.
    t : float
        Frequency threshold (strictly greater than).
    missense : bool
        When ``True`` only rows annotated ``missense_variant`` are kept.
    inaccessible : bool
        ``False`` selects the "pass" callset and its SnpEff file; any other
        value selects the unfiltered callset and its SnpEff file.

    Returns
    -------
    dict
        Population label -> DataFrame indexed by ``pos`` with the columns
        ``freqs``, ``annotation``, ``aa`` and ``ID``.
    """

    # The two data sets differ only in their locations, so pick the paths
    # first and open everything once below.
    if inaccessible is False:
        zarr_root = f"/home/sanj/ag1000g/data/ag1000g.phase2.ar1.pass/{chrom}"
        callset_fn = '../../data/snp_eff/ag1000g.phase2.ar1.snpeff.AgamP4.2.pass.h5'
    else:
        zarr_root = f"/media/sanj/Sanj_HDD/Ag1000g/ag1000g.phase2.ar1/{chrom}"
        callset_fn = '../../data/all_snp_eff/ag1000g.phase2.ar1.snpeff.AgamP4.2.h5'

    ############ Read zarrs #############
    Ag_store = zarr.open_array(f"{zarr_root}/calldata/GT/", mode='r')
    positions = zarr.open_array(f"{zarr_root}/variants/POS", mode='r')[:]

    pos = (positions > p1) & (positions < p2)
    # identical for every population, so computed once outside the loop
    region_positions = positions[pos]

    # Read the annotations for the region into memory and close the HDF5
    # file straight away instead of leaving the handle open.
    with h5py.File(callset_fn, mode='r') as callset:
        snp_eff = callset[chrom]['variants']['ANN'][pos]

    ag_geno = None
    if haps is None:
        ag_geno = allel.GenotypeChunkedArray(Ag_store)
        ag_geno = ag_geno.compress(pos, axis=0)

    snps_in_region = dict()

    for pop in pops:

        if haps is None:
            pop_bool = samples.population == pop
            print(
                f"Constructing HaplotypeArray for {pop} {chrom} between {p1} and {p2}"
            )
            pop_geno = ag_geno.compress(pop_bool, axis=1)
            # Bind to a per-population local: rebinding `haps` here would
            # make every later population reuse the first one's haplotypes.
            pop_haps = pop_geno.to_haplotypes()
        else:
            pop_haps = haps

        ac = pop_haps.count_alleles()
        freq = ac.to_frequencies()[:]
        print("Calculating allele frequencies")

        # True where the first or the second ALT allele exceeds the
        # threshold. Slicing 1:3 instead of indexing columns 1 and 2 also
        # works when fewer than three allele columns were counted.
        alts = (freq[:, 1:3] > t).any(axis=1)

        snps = region_positions[alts]
        freq = freq[alts]
        snp_eff_alts = pd.DataFrame(snp_eff[alts])

        # one row per kept variant: its position and its per-allele frequencies
        df = pd.DataFrame([snps, freq]).T
        df.columns = ['pos', 'freqs']
        df['annotation'] = snp_eff_alts.Annotation.str.decode('utf8')
        df['aa'] = snp_eff_alts.HGVS_p.str.decode('utf8')
        df['ID'] = snp_eff_alts.Gene_Name.str.decode('utf8')
        df = df.set_index('pos')

        if missense is True:
            df = df[df.annotation == 'missense_variant']

        snps_in_region[pop] = df

    return (snps_in_region)
# In[5]:

# declare objects with variant data
# (the zarr store is opened with zarr's default mode)
p2_callset = zarr.open(p2_callset_fn)
# variants of genotypes
print("Variants...")
p2_callset_var = p2_callset[chrom]["variants"]
# Only POS/REF/ALT are exposed from the variants group; POS is the index.
p2_genvars = allel.VariantChunkedTable(p2_callset_var,
                                       names=["POS", "REF", "ALT"],
                                       index="POS")
print(p2_genvars.shape)
# genotype data
print("Genotypes...")
p2_callset_gen = p2_callset[chrom]["calldata"]["genotype"]
p2_genotyp = allel.GenotypeChunkedArray(p2_callset_gen)
# Keep only the entries selected by `p2_samples_bool` via `sel1`
# (presumably the sample axis -- confirm against the scikit-allel docs).
p2_genotyp = p2_genotyp.subset(sel1=p2_samples_bool)
print(p2_genotyp.shape)

# #### Outgroups
#
# Loads one outgroup, removes indels (duplicated variant positions) and subsets phase2 to include only the variants present in this outgroup. Then loads the outgroup genotypes and subsets them to remove indels and match phase2. Then loads the second outgroup and performs the same task. Thus, at each iteration, fewer and fewer variants remain (hopefully not too many are lost; the worst offenders are `chri` and `epir`).

# In[6]:

# Start from the phase2 genotypes and variant table under the `oc_*` names.
oc_genotyp = p2_genotyp
oc_genvars = p2_genvars

# One pass per outgroup species; each pass reports how many variant rows of
# `oc_genotyp` are left at that point.
for outn, outi in enumerate(ou_species):

    print("# p2 genotypes remaining: %i" % oc_genotyp.shape[0])
Esempio n. 29
0
import numpy as np
import h5py
import seaborn as sns
import pandas as pd
# Chromosome/segment identifiers; each one is substituted into the HDF5 file
# name opened in the loop below.
# NOTE(review): `allel` and `plt` are used below but are not among the
# imports visible above -- confirm they are imported elsewhere.
chromlist = [
    "Wb_Chr1_0", "Wb_Chr1_1", "Wb_Chr2_0", "Wb_Chr2_1", "Wb_Chr2_2",
    "Wb_Chr2_3", "Wb_Chr3_0", "Wb_Chr3_1", "Wb_Chr4_0", "Wb_Chr4_1",
    "Wb_Chr4_2"
]
seldict = {}
for c in chromlist:
    # Opened read-only; the handle is not closed within the visible loop body.
    callset = h5py.File("PNG.phased.autosomal.recode.{}.h5".format(c),
                        mode='r')
    samples = callset['samples'][:]
    # sample ids are decoded from bytes to str
    sample_name = [sid.decode() for sid in samples.tolist()]
    g = allel.GenotypeChunkedArray(callset["calldata/GT"])
    h = g.to_haplotypes()
    pos = allel.SortedIndex(callset["variants/POS"][:])
    # column 1 of the allele counts, i.e. the count of allele 1 per variant
    acc = h.count_alleles()[:, 1]
    # ihs
    ihs = allel.ihs(h, pos, include_edges=True)
    # Element [0] of the result is what gets plotted (presumably the
    # standardized scores -- confirm against the scikit-allel docs).
    ihs_std = allel.standardize_by_allele_count(ihs, acc)
    plt.plot(pos, -np.log10(ihs_std[0]))
    # despite its name, `nan` is True where ihs is NOT NaN
    nan = ~np.isnan(ihs)
    ihs_real = ihs[nan]
    pos_ihs = pos[nan]
    # nsl
    nsl = allel.nsl(h)
    nsl_std = allel.standardize_by_allele_count(nsl, acc)
    plt.plot(pos, -np.log10(nsl_std[0]))
    # NOTE(review): this mask is recomputed from `ihs`, not from `nsl` --
    # confirm this is intended for the nsl section.
    nan = ~np.isnan(ihs)
Esempio n. 30
0
#list(csh['variants'].keys())

# In[21]:

## apply filter
var_pass = var_tb.compress(var_tb_fltr)

# ## Genotype from HDF5

# In[22]:

list(csh['calldata'].keys())

# In[23]:

gth = allel.GenotypeChunkedArray(csh['calldata/GT'])
gth

# In[24]:

list(csh['samples'])

# In[25]:

import pandas as pd

samples = pd.DataFrame({
    'sample': [b'AC3812', b'AC3813', b'AC3814', b'AC3815'],
    'cell_type': ['TAP', 'TAP', 'TLX3', 'TLX3']
})
TLX = samples['cell_type'].isin(['TLX3'])