Ejemplo n.º 1
0
def load_text_format_data(mapfn, pop_a_fn, pop_b_fn):
    """Load a SNP map file and two genotype text files.

    Parameters
    ----------
    mapfn : str
        Headerless map file with the six columns ID, CHROM, GDist, POS,
        REF, ALT. Tab-delimited parsing is tried first; if the column
        count does not match, the file is re-read as space-delimited.
    pop_a_fn, pop_b_fn : str
        Files readable by ``np.loadtxt`` as int8 matrices. Each row is
        reshaped to ``(-1, 2)``, i.e. two columns per sample.

    Returns
    -------
    tuple
        ``(geno1, geno2, pos, gdist)``: the two genotype chunked arrays,
        a ``SortedIndex`` over the POS column and the GDist column values.

    Raises
    ------
    AssertionError
        If the position index contains NaN values.
    """
    columns = ["ID", "CHROM", "GDist", "POS", "REF", "ALT"]

    tbl = pd.read_csv(mapfn, sep="\t", header=None, engine="c")

    try:
        # Raises ValueError when the tab-split column count is not six.
        tbl.columns = columns
    except ValueError:
        logger.info("File not tab delimited as expected- trying with spaces")
        tbl = pd.read_csv(mapfn,
                          sep=" ",
                          header=None,
                          engine="c",
                          names=columns)

    try:
        vartbl = allel.VariantChunkedTable(tbl.to_records(), index="POS")
    except ValueError:
        # NOTE(review): only the variant table is re-ordered here; the
        # genotype rows loaded below keep their file order - confirm that
        # callers never rely on this fallback with unsorted genotype files.
        tbl = tbl.sort_values(["CHROM", "POS"])
        logger.warning(
            "Possible SNPs file is not sorted. Attempting to sort. This is likely to be inefficient"
        )
        vartbl = allel.VariantChunkedTable(tbl.to_records(), index="POS")

    d1 = np.loadtxt(pop_a_fn, dtype="int8")
    geno1 = allel.GenotypeChunkedArray(d1.reshape((d1.shape[0], -1, 2)))

    d2 = np.loadtxt(pop_b_fn, dtype="int8")
    geno2 = allel.GenotypeChunkedArray(d2.reshape((d2.shape[0], -1, 2)))

    pos = allel.SortedIndex(vartbl.POS[:])
    assert np.isnan(pos).sum() == 0, "nans values are not supported"

    # Return the index that was just validated instead of building it twice.
    return geno1, geno2, pos, vartbl.GDist[:]
Ejemplo n.º 2
0
def load_arrays_noncoding_and_centromeres(local_path,
                                          _set,
                                          chrom,
                                          coding_reg_df,
                                          sitefilter='gamb_colu',
                                          filter_centro=True):
    """
    Read a genotype array and restrict it to noncoding, site-filter-passing
    variants, optionally also dropping the region near the centromere.

    Parameters
    ----------
    local_path : root directory holding the ``snp_genotypes`` and
        ``site_filters`` zarr hierarchies.
    _set : sample-set name used in the genotype path.
    chrom : chromosome arm; with ``filter_centro=True`` it must be one of
        '2L', '2R', '3L', '3R', 'X'.
    coding_reg_df : object exposing ``start`` and ``end`` sequences of
        coding-region bounds.
    sitefilter : name of the site filter directory, e.g. arabiensis (arab)
        or gambiae/coluzzii (gamb_colu).
    filter_centro : the positional cut-off is applied only when this is
        exactly ``True``.

    Returns
    -------
    tuple
        ``(geno, positions)`` restricted to the retained variants.

    Raises
    ------
    ValueError
        If ``filter_centro`` is True and ``chrom`` has no cut-off defined.
    """
    Ag_array = zarr.open_array(
        f"{local_path}/snp_genotypes/all/{_set}/{chrom}/calldata/GT/",
        mode='r')
    filters = zarr.open(
        f"{local_path}/site_filters/dt_20200416/{sitefilter}/{chrom}/variants/filter_pass",
        mode="r")
    positions = zarr.open_array(
        f"{local_path}/snp_genotypes/all/sites/{chrom}/variants/POS/",
        mode='r')

    # Read the pass mask once and reuse it for positions and genotypes.
    pass_mask = filters[:]
    positions = positions[:][pass_mask]
    geno = allel.GenotypeDaskArray(Ag_array)
    geno = geno[pass_mask]

    if filter_centro is True:
        # Cut-offs chosen by eye based on ag1000g phase1 paper fig1.
        # Positions strictly above (or strictly below) the cut-off are KEPT.
        keep_above = {'2L': 3000000, '3L': 2000000}
        keep_below = {'2R': 57000000, '3R': 50000000, 'X': 21000000}

        if chrom in keep_above:
            centromere = (positions > keep_above[chrom])
        elif chrom in keep_below:
            centromere = (positions < keep_below[chrom])
        else:
            raise ValueError(
                f"no centromere cut-off defined for chromosome {chrom!r}")

        positions = allel.SortedIndex(positions[centromere])
        geno = geno.compress(centromere, axis=0)
    else:
        positions = allel.SortedIndex(positions)

    # Boolean array marking positions that fall inside a coding region.
    coding = positions.locate_ranges(coding_reg_df.start,
                                     coding_reg_df.end,
                                     strict=False)

    # We want noncoding variants, hence the inverted mask.
    geno = geno.compress(~coding, axis=0)
    positions = positions[~coding]

    return (geno, positions)
Ejemplo n.º 3
0
def plotvars(chrm, callset, window_size=100000, title=None, saved=True):
    """Plot variant density in fixed-size windows along one chromosome.

    Parameters
    ----------
    chrm : chromosome name; an object with ``decode`` (e.g. bytes) is
        decoded as UTF-8 first.
    callset : mapping providing 'variants/CHROM' and 'variants/POS'.
    window_size : window width in bp.
    title : plot title; the chromosome name is used when falsy.
    saved : when truthy, the figure is written to ``<chrm>.vars.pdf``.
    """
    try:
        chrm = chrm.decode("utf-8")
    except AttributeError:
        # No decode method: use the name as given.
        pass

    on_chrom = np.where(callset['variants/CHROM'][:] == chrm)
    varpos = allel.SortedIndex(callset['variants/POS'][:][on_chrom])

    # Window edges; the window midpoints serve as x coordinates.
    edges = np.arange(0, varpos.max(), window_size)
    midpoints = (edges[1:] + edges[:-1]) / 2

    # Variants per window, scaled to a per-bp density.
    counts, _ = np.histogram(varpos, bins=edges)
    density = counts / window_size

    fig, ax = plt.subplots(figsize=(12, 3))
    sns.despine(ax=ax, offset=10)
    ax.plot(midpoints, density)
    ax.set_xlabel('Chromosome position (bp)')
    ax.set_ylabel('Variant density (bp$^{-1}$)')
    ax.set_title(title if title else chrm)
    if saved:
        fig.savefig("{}.vars.pdf".format(chrm), bbox_inches='tight')
Ejemplo n.º 4
0
Archivo: core.py Proyecto: niemasd/pixy
def read_and_filter_genotypes(args, chromosome, window_pos_1, window_pos_2,
                              sites_list_chunk):
    """Read one window of the VCF and keep biallelic SNPs and invariant sites.

    Parameters
    ----------
    args : object whose ``vcf`` attribute is the VCF path.
    chromosome, window_pos_1, window_pos_2 : define the region string
        ``chromosome:window_pos_1-window_pos_2`` passed to the VCF reader.
    sites_list_chunk : optional target-site list; when not None the
        genotypes are passed through ``mask_non_target_sites``.

    Returns
    -------
    tuple
        ``(callset_is_none, gt_array, pos_array)``. The flag is True and
        both arrays are None when the VCF has no sites for the region or
        when no rows survive filtering.
    """
    window_region = (chromosome + ":" + str(window_pos_1) + "-" +
                     str(window_pos_2))

    callset = allel.read_vcf(args.vcf,
                             region=window_region,
                             fields=[
                                 'CHROM', 'POS', 'calldata/GT',
                                 'variants/is_snp', 'variants/numalt'
                             ])

    # No sites at all for this range: report the empty callset so that
    # downstream code can account for completely missing sites.
    if callset is None:
        return True, None, None

    gt_array = allel.GenotypeArray(
        allel.GenotypeDaskArray(callset['calldata/GT']))
    pos_array = allel.SortedIndex(callset['variants/POS'])

    # Keep a row when it is a biallelic SNP or has no alternate allele.
    biallelic_snp = np.logical_and(callset['variants/is_snp'][:] == 1,
                                   callset['variants/numalt'][:] == 1)
    invariant = callset['variants/numalt'][:] == 0
    snp_invar_mask = np.logical_or(biallelic_snp, invariant)

    # Drop every other row from the genotypes, then re-wrap the result.
    rejected_rows = np.where(np.invert(snp_invar_mask))
    gt_array = allel.GenotypeArray(np.delete(gt_array, rejected_rows, axis=0))

    pos_array = pos_array[snp_invar_mask]

    if sites_list_chunk is not None:
        gt_array = mask_non_target_sites(gt_array, pos_array,
                                         sites_list_chunk)

    # Every site may have been removed by the filters above.
    if len(gt_array) == 0:
        return True, None, None

    return False, gt_array, pos_array
Ejemplo n.º 5
0
def misspos(chrm,
            callset,
            pc,
            samples,
            window_size=10000,
            title=None,
            saved=False):
    """Plot the mean per-site missing count in windows along a chromosome.

    Parameters
    ----------
    chrm : chromosome name compared against 'variants/CHROM'.
    callset : mapping providing 'variants/CHROM' and 'variants/POS'.
    pc : per-variant values (indexable with ``[:]``) averaged per window;
        presumably missing-call counts - confirm with the caller.
    samples : divisor applied to the window means before plotting.
    window_size : window width in bp.
    title, saved : forwarded unchanged to ``ap.plotmiss``.
    """
    chrom_mask = np.where(callset['variants/CHROM'][:] == chrm)
    varpos = allel.SortedIndex(callset['variants/POS'][:][chrom_mask])

    # x holds the window START coordinates (not midpoints).
    x = np.arange(0, varpos.max(), window_size)
    miss_site = pc[:][chrom_mask]

    yy = []
    for win_start, win_stop in zip(x[:-1], x[1:]):
        left = bisect.bisect_left(varpos, win_start)
        # bisect_left already returns the exclusive slice end for the
        # half-open window [win_start, win_stop); subtracting one here
        # used to drop the last variant of every window.
        right = bisect.bisect_left(varpos, win_stop)
        # A window without variants yields NaN from np.mean.
        yy.append(np.mean(miss_site[left:right]))

    # The final window start has no closing edge in x; it is reported as 0,
    # which is what the former blanket exception handler produced.
    if len(x):
        yy.append(0)

    y = np.array(yy)
    ap.plotmiss(x, y / samples, title, chrm, saved)
Ejemplo n.º 6
0
def msp2sf2(tree_sequence, npops):
    """Write per-population allele-count and rate tables from a tree sequence.

    For each population index ``i`` in ``range(npops)`` two tab-separated
    files are written to the working directory:

    * ``{i}.Neutral.sf2in`` with columns position, x, n, folded: one row
      per variant whose count of allele 1 in the population is positive.
    * ``{i}.Neutral.sf2inrecomb`` with columns position, rate: one row for
      each of the same variants; rate is ``position / 850000.0``, or 0 for
      the variant at index 0.

    Parameters
    ----------
    tree_sequence : object providing ``get_samples``, ``get_num_mutations``,
        ``get_sample_size`` and ``variants``.
    npops : number of populations to export.

    Returns
    -------
    None
    """
    pix = [tree_sequence.get_samples(pop) for pop in range(npops)]

    muts = tree_sequence.get_num_mutations()
    sample_size = tree_sequence.get_sample_size()
    V = np.zeros((muts, sample_size), dtype=np.int8)
    positions = []
    # Single pass over the variants fills both genotypes and positions.
    for variant in tree_sequence.variants():
        V[variant.index] = variant.genotypes
        positions.append(int(variant.position))

    # Build the haplotype array once, after the matrix is complete.
    gt = allel.HaplotypeArray(V)
    pos = allel.SortedIndex(positions)

    for i, p in enumerate(pix):
        # Count of allele 1 per variant within this population.
        ac = gt[:, p].count_alleles()[:, 1]
        with open("{}.Neutral.sf2inrecomb".format(i), 'w') as d, \
                open("{}.Neutral.sf2in".format(i), 'w') as f:
            d.write("position\trate\n")
            f.write("position\tx\tn\tfolded\n")
            for r, dac in enumerate(ac):
                if dac > 0:
                    f.write("{}\t{}\t{}\t0\n".format(pos[r], dac, len(p)))
                    if r != 0:
                        d.write("{}\t{}\n".format(pos[r], pos[r] / 850000.0))
                    else:
                        d.write("{}\t{}\n".format(pos[r], 0))
    return None
Ejemplo n.º 7
0
def countPatternDFOIL(callset, sample_ix, outgroup):
    """Collect packed hom-alt patterns for every four-sample combination.

    Parameters
    ----------
    callset : mapping providing 'calldata/GT' and 'variants/POS'.
    sample_ix : four iterables of sample indexes, one per group.
    outgroup : sample index (or indexes) of the outgroup.

    Returns
    -------
    dict
        Maps the 1-based combination number to ``(positions, patterns)``,
        where ``patterns`` is the bit-packed hom-alt mask of the sites at
        which all selected samples are homozygous.
    """
    print("counting patterns in file...")
    gt = allel.GenotypeArray(callset['calldata/GT'])
    pos = allel.SortedIndex(callset['variants/POS'])

    # Keep biallelic sites where the outgroup is homozygous
    # (this removes outgroup ./. and 0/1 calls).
    site_ok = gt[:, outgroup].is_hom() & gt.count_alleles().is_biallelic()
    gt = gt.compress(site_ok, axis=0)
    pos = pos[site_ok]

    g1, g2, g3, g4 = sample_ix
    combos = list(product(g1, g2, g3, g4))
    print("total number of combinations: {}".format(len(combos)))

    windict = {}
    for permute, (i, j, k, m) in enumerate(combos, start=1):
        print("permutation number {}".format(permute))
        gt_sub = gt.take([i, j, k, m, outgroup], axis=1)
        all_hom = gt_sub.is_hom().all(axis=1)
        hom_alt = gt_sub.compress(all_hom, axis=0).is_hom_alt()
        windict[permute] = (pos[all_hom], np.packbits(hom_alt, axis=1))
    return windict
Ejemplo n.º 8
0
def load_zarr_data(zarr_fn, chrom, s1, s2, gdistkey=None):
    """Load genotypes for two sample sets from one chromosome of a zarr store.

    Parameters
    ----------
    zarr_fn : path of the zarr group, opened read-only.
    chrom : key of the chromosome sub-group.
    s1, s2 : sample-set descriptors resolved through ``get_sample_ids``.
    gdistkey : optional key under ``variants`` holding genetic distances.

    Returns
    -------
    tuple
        ``(g1, g2, pos, gdist)``: genotype arrays restricted to each sample
        set, a ``SortedIndex`` of positions, and the genetic distances
        (None when ``gdistkey`` is None).

    Raises
    ------
    ValueError
        If a requested sample id is not present in the store.
    """
    import zarr

    samples1 = get_sample_ids(s1)
    samples2 = get_sample_ids(s2)

    zfh = zarr.open_group(zarr_fn, mode="r")[chrom]

    samples_x = zfh["samples"][:]
    sample_name = [sid.decode() for sid in samples_x.tolist()]

    idx1 = np.array([sample_name.index(sid) for sid in samples1])
    idx2 = np.array([sample_name.index(sid) for sid in samples2])

    g = allel.GenotypeChunkedArray(zfh["calldata"]["genotype"])

    pos = allel.SortedIndex(zfh["variants"]["POS"][:])

    if gdistkey is not None:
        # Read from the zarr group opened above; the previous code referred
        # to an undefined HDF5 handle and failed with NameError.
        gdist = zfh["variants"][gdistkey][:]
    else:
        gdist = None

    return g.take(idx1, axis=1), g.take(idx2, axis=1), pos, gdist
Ejemplo n.º 9
0
def plth12(chromlist):
    """Plot Garud's H12 along each chromosome and save one PDF per entry.

    For every ``c`` in ``chromlist`` the file ``PNG.phased.X.recode.{c}.h5``
    is read and ``PNG.{c}.H12.pdf`` is written. Each variant is assigned the
    H12 value of the window it falls in, so the plotted series has one value
    per variant position.

    NOTE(review): ``window_size`` is not a parameter; it is taken from the
    enclosing module scope - confirm it is defined there.
    NOTE: autosomal inputs follow the naming
    ``PNG.phased.autosomal.recode.{c}.h5`` (previously a commented-out line).
    """
    for c in chromlist:
        # Close the HDF5 file as soon as the statistics are in memory.
        with h5py.File("PNG.phased.X.recode.{}.h5".format(c),
                       mode='r') as callset:
            g = allel.GenotypeChunkedArray(callset["calldata/GT"])
            h = g.to_haplotypes()
            pos = allel.SortedIndex(callset["variants/POS"][:])
            # Second element of the Garud tuple is H12.
            h12 = allel.moving_garud_h(h, window_size)[1]

        n_sites = len(pos)
        h12_pos = []
        p = 0
        for i, value in enumerate(h12):
            # Index of the first variant of the NEXT window, clamped so
            # that a site count that is an exact multiple of window_size
            # does not index past the end.
            end = min((i + 1) * window_size, n_sites - 1)
            stop = pos[end]
            while pos[p] < stop:
                h12_pos.append(value)
                p += 1
        # Variants beyond the last full window take the last H12 value.
        while len(h12_pos) < n_sites:
            h12_pos.append(h12[-1])

        plt.plot(pos, h12_pos)
        plt.xlabel("{} genomic position".format(c))
        plt.ylabel("H12")
        plt.savefig("PNG.{}.H12.pdf".format(c))
        plt.clf()
Ejemplo n.º 10
0
def calculate_overlap(chrom_overlap_regions, window, modern_haplotype_id,
                      informative_site_positions):
    """Measure how much of a window is covered by one sample's regions.

    Parameters
    ----------
    chrom_overlap_regions : pandas.DataFrame
        Regions with at least the columns 'sample', 'start' and 'end'
        (may be completely empty).
    window : object with ``start`` and ``end`` attributes.
    modern_haplotype_id : value matched against the 'sample' column.
    informative_site_positions : sorted positions of informative sites.

    Returns
    -------
    tuple
        ``(overlapping_bp, n_sites)``: the summed ``min(end) - max(start)``
        length of the sample's regions intersecting the window, and the
        number of informative sites lying inside those regions. ``(0, 0)``
        when the sample has no regions.
    """
    if chrom_overlap_regions.empty:
        return (0, 0)

    sample_regions = chrom_overlap_regions[
        chrom_overlap_regions['sample'] == modern_haplotype_id]
    if sample_regions.empty:
        return (0, 0)

    logging.debug("Overlap regions in chrom:\n%s", chrom_overlap_regions)
    logging.debug("Window: %s", window.start)

    # Build the mask from the frame it is applied to, so pandas does not
    # have to re-align a mask computed on the unfiltered frame.
    intersects_window = ((sample_regions['start'] <= window.end)
                         & (sample_regions['end'] >= window.start))
    overlapping_regions = sample_regions[intersects_window]
    logging.debug("Overlapping regions for window:\n%s", overlapping_regions)

    overlapping_bp = 0
    for _, region in overlapping_regions.iterrows():
        logging.debug(region)
        # Clip the region to the window before measuring it.
        overlapping_bp += (min(region['end'], window.end) -
                           max(region['start'], window.start))

    logging.debug("Informative site positions: %s",
                  informative_site_positions)
    informative_site_index = allel.SortedIndex(informative_site_positions)
    overlapping_informative_sites = informative_site_index.intersect_ranges(
        starts=overlapping_regions['start'],
        stops=overlapping_regions['end'])

    return (overlapping_bp, len(overlapping_informative_sites))
Ejemplo n.º 11
0
def ld_prune(gn, pos, size=500, step=200, threshold=.1, n_iter=5):
    """Repeatedly remove variants flagged as linked.

    Parameters
    ----------
    gn : array-like
        Per-variant data with variants on axis 0; must support
        ``compress`` and ``shape``.
    pos : array-like
        Variant positions aligned with the rows of ``gn``.
    size : int, optional
        Window size passed to ``allel.locate_unlinked``. Default 500.
    step : int, optional
        Window step passed to ``allel.locate_unlinked``. Default 200.
    threshold : float, optional
        Threshold passed to ``allel.locate_unlinked``. Default .1.
    n_iter : int, optional
        Number of pruning rounds. Default 5.

    Returns
    -------
    pos : allel.SortedIndex
        Positions of the retained variants.
    gn : array-like
        ``gn`` restricted to the retained variants.

    """
    for round_no in range(1, n_iter + 1):
        keep = allel.locate_unlinked(gn, size=size, step=step, threshold=threshold)
        n_kept = np.count_nonzero(keep)
        n_dropped = gn.shape[0] - n_kept
        print(f"iteration {round_no} retaining {n_kept} removing {n_dropped} variants")
        gn, pos = gn.compress(keep, axis=0), pos[keep]

    return allel.SortedIndex(pos), gn
Ejemplo n.º 12
0
def calculate_switch_distances(windows, switch_array, marker_pos, hz_pos,
                               rohz, gaps):
    """Summarise markers, switch errors and hets per window.

    Within each window the markers are split into runs at the gap
    midpoints, so that neither distance nor marker counts span a gap.

    Parameters
    ----------
    windows : 2-D array of ``(start, stop)`` rows.
    switch_array : input for ``ph.switch.derive_position_switch_array``.
    marker_pos : sorted marker positions; must all occur in ``hz_pos``.
    hz_pos : sorted heterozygous-site positions.
    rohz : forwarded to ``calc_marker_dist``.
    gaps : 2-D array whose row means are used as gap midpoints.

    Returns
    -------
    numpy.ndarray
        Four stacked rows: marker distance, marker count, error count and
        het count, one column per window.
    """
    marker_pos = allel.SortedIndex(marker_pos)
    gap_mp = np.mean(gaps, axis=1)

    assert np.in1d(marker_pos, hz_pos).all(), "all markers are subset of hets"

    n_windows = windows.shape[0]
    marker_count = np.zeros(n_windows, dtype="int")
    marker_dist = np.zeros(n_windows, dtype="float")
    error_count = np.zeros(n_windows, dtype="int")
    hz_count = np.zeros(n_windows, dtype="int")

    pos_sw = ph.switch.derive_position_switch_array(switch_array)
    pos_errors = np.take(marker_pos, pos_sw[:-1].cumsum())

    for i, (start, stop) in enumerate(windows):

        try:
            ix = marker_pos.locate_range(start, stop)
        except KeyError:
            # No markers in this window: all four outputs stay at their
            # initial zero.
            continue

        win_markers = marker_pos[ix]
        n_markers = win_markers.size

        # Insertion points of the gap midpoints among this window's markers;
        # only those strictly inside the window split it.
        gap_ix = np.searchsorted(win_markers, gap_mp)
        inside = (gap_ix < n_markers) & (gap_ix > 0)
        gap_pos = np.unique(np.compress(inside, gap_ix))

        # Run boundaries: 0, each interior gap, and the marker count.
        cuts = np.concatenate([[0], gap_pos, [n_markers]])
        assert cuts.size >= 2

        for p, q in zip(cuts[:-1], cuts[1:]):

            first, last = win_markers[p], win_markers[q-1]

            # Number of hets between the first and last marker of the run.
            counthets = (np.searchsorted(hz_pos, last) -
                         np.searchsorted(hz_pos, first))

            error_count[i] += np.sum(
                evaluate_markers(win_markers[p:q], pos_errors))
            marker_dist[i] += calc_marker_dist(win_markers[p:q], rohz)
            # A single marker is not informative, hence the minus one.
            marker_count[i] += (q - p - 1)
            hz_count[i] += counthets

    return np.vstack([marker_dist, marker_count, error_count, hz_count])
Ejemplo n.º 13
0
def load_hdf5_data(hdf5_fn, chrom, s1, s2):
    """Load genotypes for two sample-id lists from an HDF5 callset.

    ``chrom`` is accepted but not used by this function. The HDF5 file is
    left open because the returned genotype arrays wrap its dataset.

    Returns ``(g1, g2, pos)``: the genotype columns for ``s1`` and ``s2``
    and a ``SortedIndex`` of 'variants/POS'.
    """
    callset = h5py.File(hdf5_fn, mode='r')
    sample_name = [raw.decode() for raw in callset['samples'][:].tolist()]

    def column_indexes(sample_ids):
        # Position of each requested id in the callset's sample order.
        return np.array([sample_name.index(sid) for sid in sample_ids])

    idx1 = column_indexes(s1)
    idx2 = column_indexes(s2)
    genotypes = allel.GenotypeChunkedArray(callset["calldata/GT"])
    pos = allel.SortedIndex(callset["variants/POS"][:])
    return genotypes.take(idx1, axis=1), genotypes.take(idx2, axis=1), pos
Ejemplo n.º 14
0
def __main__():
    """Count filtered European variants per region for one chromosome.

    Reads ``--chr`` from the command line, loads the region BED file and
    the European sample list, keeps biallelic variants whose two allele
    counts in Europeans both exceed 1 (no invariants, no singletons),
    and writes the per-region variant counts to
    ``data/1000G_eur_chr{chr}_region_variant_counts.tsv``.
    """
    parser = arg.ArgumentParser()
    parser.add_argument('--chr', dest='chrom')
    args = parser.parse_args()

    # read in extra data
    bed = pd.read_csv(
        '/psych/ripke/vasa/reference_data/ldetect-data/EUR/fourier_ls-chr{}.bed'
        .format(args.chrom),
        # raw string: '\s' is not a valid escape in a normal literal
        sep=r'\s+')
    eur_samples = pd.read_csv(
        '/psych/ripke/1000Genomes_reference/1KG_Oct14/1000GP_Phase3_sr_0517d/integrated_call_samples_v3.20130502.ALL.panel.fam.EUR',
        sep='\t',
        names=['fid', 'iid', 'mid', 'pid', 'sex', 'pheno'],
        header=None)

    # read in genotype data
    zarr_path = '/psych/ripke/vasa/reference_data/1000G/loc.ALL.chr{}.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.zarr'.format(
        args.chrom)
    callset = zarr.open_group(zarr_path, mode='r')
    pos = ska.SortedIndex(callset['variants/POS'])

    callset_samples = list(callset['samples'][:])
    eur_samples['callset_index'] = [
        callset_samples.index(s) for s in eur_samples['iid']
    ]

    gt = callset['calldata/GT']
    gt_da = ska.GenotypeDaskArray(gt)

    print('Subsetting to europeans')
    eur_da = gt_da.take(eur_samples['callset_index'].values, axis=1)
    eur_ac = eur_da.count_alleles()

    print('Filtering european singletons and invariants')
    flt = (eur_ac.max_allele() == 1) & (eur_ac[:, :2].min(axis=1) > 1)
    flt_mask = flt.compute()
    flt_da = eur_da.compress(flt_mask, axis=0).compute()

    # update variant index
    pos = pos[flt_mask]

    print('Counting region window sizes: ')
    bed['num_variants'] = np.nan
    for i, region in bed.iterrows():
        print('\t{} of {}'.format(i, bed.shape[0]))
        try:
            loc_region = pos.locate_range(region['start'], region['stop'])
        except KeyError:
            # locate_range raises KeyError when no position falls inside
            # the region; record an empty region instead of aborting.
            n_variants = 0
        else:
            n_variants = flt_da[loc_region, :, :].n_variants
        bed.loc[i, ['num_variants']] = n_variants

    bed.to_csv('data/1000G_eur_chr{}_region_variant_counts.tsv'.format(
        args.chrom),
               sep='\t')
Ejemplo n.º 15
0
    def jsfs(self, fold=False):
        """Return joint-SFS statistics for every pair of populations.

        For each pair drawn from ``self.stats["pop_config"]`` the
        haplotypes of both populations are passed to ``afs.jsfs_stats``
        together with the size of the first population, the positions and
        ``fold``; all results are concatenated into one flat list.
        """
        haplotypes = allel.HaplotypeArray(self.haparr.T)
        positions = allel.SortedIndex(self.pos)
        stats_ls = []
        for pop_a, pop_b in combinations(self.stats["pop_config"], 2):
            pair_haps = haplotypes.take(pop_a + pop_b, axis=1)
            stats_ls.extend(
                afs.jsfs_stats(len(pop_a), pair_haps, positions, fold))

        return stats_ls
Ejemplo n.º 16
0
    def sfs(self, fold=False):
        """Return SFS statistics for each population, as one flat list.

        NOTE: the ``fold`` argument is overwritten with
        ``self.stats["sfs_fold"]`` before use, so the value passed by the
        caller has no effect.
        """
        fold = self.stats["sfs_fold"]
        haplotypes = allel.HaplotypeArray(self.haparr.T)
        positions = allel.SortedIndex(self.pos)
        stats_ls = []
        for pop in self.stats["pop_config"]:
            pop_haps = haplotypes.take(pop, axis=1)
            stats_ls.extend(afs.asfs_stats(pop_haps, positions, fold))

        return stats_ls
Ejemplo n.º 17
0
def filterGT(callset, outgroup):
    """Keep only the sites that are segregating in the outgroup.

    Allele counts are taken over the ``outgroup`` columns of
    'calldata/GT' with ``max_allele=1``.

    Returns ``(gt, pos)``: the genotype array and the position index,
    both restricted to the retained sites.
    """
    genotypes = allel.GenotypeArray(callset['calldata/GT'])
    positions = allel.SortedIndex(callset['variants/POS'])
    outgroup_counts = genotypes[:, outgroup].count_alleles(max_allele=1)
    segregating = outgroup_counts.is_segregating()
    return (genotypes.compress(segregating, axis=0), positions[segregating])
Ejemplo n.º 18
0
def load_vcf_wrapper(path, seqid, samples):
    """Read positions and genotypes for one region of a VCF.

    The VCF at ``path`` is read for region ``seqid`` via the ``tabix``
    executable, restricted to ``samples``.

    Returns ``(p, g)``: a ``SortedIndex`` of 'variants/POS' and a
    ``GenotypeArray`` of 'calldata/GT'.
    """
    wanted_fields = ['variants/POS', 'calldata/GT', 'samples']
    callset = allel.read_vcf(path,
                             region=seqid,
                             fields=wanted_fields,
                             tabix="tabix",
                             samples=samples)

    positions = allel.SortedIndex(callset["variants/POS"])
    genotypes = allel.GenotypeArray(callset['calldata/GT'])
    return positions, genotypes
Ejemplo n.º 19
0
    def tajd(self):
        """Return Tajima's D results per population, as one flat list.

        For each population in ``self.stats["pop_config"]`` the two values
        returned by ``popstats.tajimaD`` are appended in order.
        """
        haplotypes = allel.HaplotypeArray(self.haparr.T)
        positions = allel.SortedIndex(self.pos)
        win_size = self.stats["win_size1"]
        length_bp = self.stats["length_bp"]
        stats_ls = []
        for pop in self.stats["pop_config"]:
            pop_haps = haplotypes.take(pop, axis=1)
            value, spread = popstats.tajimaD(positions, pop_haps, win_size,
                                             length_bp)
            stats_ls += [value, spread]

        return stats_ls
Ejemplo n.º 20
0
 def delta_tajD(self):
     """Return ``pwpopstats.d_tajD`` results for every population pair.

     Pairs are drawn from ``self.stats["pop_config"]``; the results of all
     pairs are concatenated into one flat list.
     """
     haplotypes = allel.HaplotypeArray(self.haparr.T)
     positions = allel.SortedIndex(self.pos)
     win_size = self.stats["win_size1"]
     length_bp = self.stats["length_bp"]
     quants = self.stats["pw_quants"]
     stats_ls = []
     for pop_a, pop_b in combinations(self.stats["pop_config"], 2):
         pair_haps = haplotypes.take(pop_a + pop_b, axis=1)
         stats_ls.extend(
             pwpopstats.d_tajD(len(pop_a), positions, pair_haps, win_size,
                               length_bp, quants))
     return stats_ls
Ejemplo n.º 21
0
 def ddRank12(self):
     """Return ``pwpopstats.ddRank1_2`` results for every population pair.

     Pairs are drawn from ``self.stats["pop_config"]``; each call yields a
     list ``[dd1, dd2]`` and all of them are concatenated into one list.
     """
     haplotypes = allel.HaplotypeArray(self.haparr.T)
     positions = allel.SortedIndex(self.pos)
     quants = self.stats["pw_quants"]
     win_size = self.stats["win_size2"]
     length_bp = self.stats["length_bp"]
     stats_ls = []
     for pop_a, pop_b in combinations(self.stats["pop_config"], 2):
         pair_haps = haplotypes.take(pop_a + pop_b, axis=1)
         stats_ls.extend(
             pwpopstats.ddRank1_2(len(pop_a), positions, pair_haps,
                                  win_size, length_bp, quants))
     return stats_ls
Ejemplo n.º 22
0
def getSNPHistogram(callset, winSize):
    """Count variants in consecutive windows of ``winSize`` bp.

    Window edges run from 0 up to (not including) the largest position in
    'variants/POS', in steps of ``winSize``.

    Returns ``[x, y]``: the window midpoints and the raw variant count
    per window (not divided by the window size).
    """
    positions = allel.SortedIndex(callset['variants/POS'])
    edges = np.arange(0, positions.max(), winSize)
    midpoints = (edges[1:] + edges[:-1]) / 2
    counts, _ = np.histogram(positions, bins=edges)
    return [midpoints, counts]
Ejemplo n.º 23
0
def whatsnpisit(locs,
                chrom,
                inaccessible=False,
                missense=True,
                provide_region=False):
    """Return the snpEff ANN entries for the SNPs at the given positions.

    Parameters
    ----------
    locs : sorted positions to look up on ``chrom``.
    chrom : chromosome key used in both the zarr and the HDF5 paths.
    inaccessible : when exactly ``False`` the "pass" callset is used;
        otherwise the unfiltered callset is used.
    missense, provide_region : accepted but not used by this function.

    Returns
    -------
    The rows of the 'ANN' annotation for the positions of ``chrom`` that
    also occur in ``locs``.
    """
    if inaccessible is False:
        zarr_root = "/home/sanj/ag1000g/data/ag1000g.phase2.ar1.pass"
        callset_fn = '/home/sanj/ag1000g/data/snp_eff/ag1000g.phase2.ar1.snpeff.AgamP4.2.pass.h5'
    else:
        zarr_root = "/media/sanj/Sanj_HDD/Ag1000g/ag1000g.phase2.ar1"
        callset_fn = '/home/sanj/ag1000g/data/all_snp_eff/ag1000g.phase2.ar1.snpeff.AgamP4.2.h5'

    positions = allel.SortedIndex(
        zarr.open_array(f"{zarr_root}/{chrom}/variants/POS", mode='r')[:])

    # The annotation is copied into memory, so the file can be closed
    # right away instead of being left open.
    with h5py.File(callset_fn, mode='r') as callset:
        snp_eff = callset[chrom]['variants']['ANN'][:]

    # Only the mask over this callset's positions is needed.
    positions_bool, _ = positions.locate_intersection(locs)
    return snp_eff[positions_bool]
Ejemplo n.º 24
0
def get_callables_sites(callset, chrom):
    '''
    Flag the variant positions that lie inside callable regions.

    Input:
        - callset   : mapping holding the array '<chrom>/variants/POS'
        - chrom     : chromosome key; also passed to ``get_callable``
    Output:
        - boolean array with one entry per position, True where the
          position falls within a ``(start, stop)`` row returned by
          ``get_callable``
    '''
    callable_regions = get_callable(chrom)
    positions = allel.SortedIndex(callset['{}/variants/POS'.format(chrom)])
    region_starts = callable_regions[:, 0]
    region_stops = callable_regions[:, 1]
    return positions.locate_ranges(starts=region_starts,
                                   stops=region_stops,
                                   strict=False)
Ejemplo n.º 25
0
 def FST(self):
     """Return ``pwpopstats.fst`` results for every population pair.

     When the result of a pair cannot be iterated (TypeError on extend),
     one NaN per entry of ``self.stats["pw_quants"]`` is recorded instead.
     """
     haplotypes = allel.HaplotypeArray(self.haparr.T)
     positions = allel.SortedIndex(self.pos)
     quants = self.stats["pw_quants"]
     stats_ls = []
     for pop_a, pop_b in combinations(self.stats["pop_config"], 2):
         pair_haps = haplotypes.take(pop_a + pop_b, axis=1)
         pair_fst = pwpopstats.fst(len(pop_a), positions, pair_haps, quants)
         try:
             stats_ls.extend(pair_fst)
         except TypeError:
             # Non-iterable result: pad with NaN placeholders.
             stats_ls.extend([np.nan] * len(quants))
     return stats_ls
def locate_intersection(positions_a, lengths_a, positions_b, lengths_b):
    """Locate shared positions between two arrays, sequence by sequence.

    ``positions_a`` and ``positions_b`` are concatenations of per-sequence
    position runs whose lengths are given by ``lengths_a`` and
    ``lengths_b``. Runs are paired up in order and intersected pairwise.

    Returns ``(loc_a, loc_b)``: boolean arrays shaped like the inputs,
    True where a position is shared with the matching run of the other
    array.
    """
    log("Computing position overlap")
    loc_a = np.zeros(positions_a.shape, dtype=bool)
    loc_b = np.zeros(positions_b.shape, dtype=bool)

    start_a = start_b = 0
    for n_a, n_b in zip(lengths_a, lengths_b):
        span_a = slice(start_a, start_a + n_a)
        span_b = slice(start_b, start_b + n_b)

        run_a = allel.SortedIndex(positions_a[span_a])
        run_b = allel.SortedIndex(positions_b[span_b])
        loc_a[span_a], loc_b[span_b] = run_a.locate_intersection(run_b)

        start_a += n_a
        start_b += n_b

    return loc_a, loc_b
Ejemplo n.º 27
0
def load_text_format_data(mapfn, pop_a_fn, pop_b_fn):
    """Load a space-delimited SNP map and two genotype text files.

    The map file is read with the column names ID, CHROM, GDist, POS,
    REF, ALT. Each genotype file is read as an int8 matrix and reshaped
    so that every pair of columns forms one sample.

    Returns ``(geno1, geno2, pos, gdist)``.
    """
    column_names = ["ID", "CHROM", "GDist", "POS", "REF", "ALT"]
    tbl = pd.read_csv(mapfn, sep=" ", names=column_names)
    vartbl = allel.VariantChunkedTable(tbl.to_records(), index="POS")

    def read_genotypes(path):
        # One row per variant, two columns per sample.
        raw = np.loadtxt(path, dtype="int8")
        return allel.GenotypeChunkedArray(raw.reshape((raw.shape[0], -1, 2)))

    geno1 = read_genotypes(pop_a_fn)
    geno2 = read_genotypes(pop_b_fn)

    return geno1, geno2, allel.SortedIndex(vartbl.POS[:]), vartbl.GDist[:]
Ejemplo n.º 28
0
 def dmin(self):
     """Summarise ``pwpopstats.dmin`` for every population pair.

     For each pair the summary is the NaN-ignoring mean when the first
     entry of ``self.stats["pw_quants"]`` is negative, otherwise the
     NaN-ignoring quantiles at ``pw_quants``. All summaries are
     concatenated into one flat list.
     """
     haplotypes = allel.HaplotypeArray(self.haparr.T)
     positions = allel.SortedIndex(self.pos)
     quants = self.stats["pw_quants"]
     win_size = self.stats["win_size2"]
     length_bp = self.stats["length_bp"]
     stats_ls = []
     for pop_a, pop_b in combinations(self.stats["pop_config"], 2):
         pair_haps = haplotypes.take(pop_a + pop_b, axis=1)
         values = pwpopstats.dmin(len(pop_a), positions, pair_haps,
                                  win_size, length_bp)
         summary = ([np.nanmean(values)]
                    if quants[0] < 0 else np.nanquantile(values, quants))
         stats_ls.extend(summary)
     return stats_ls
Ejemplo n.º 29
0
def load_vcf_wrapper(path, seqid, samples, samples_path):
    """Read positions and genotypes for one region of a VCF.

    The VCF at ``path`` is read for region ``seqid`` via the ``tabix``
    executable, restricted to ``samples``. ``samples_path`` is only used
    in the error message.

    Returns ``(p, g)``: a ``SortedIndex`` of 'variants/POS' and a
    ``GenotypeArray`` of 'calldata/GT'.

    Raises AssertionError when the result has no 'samples' entry.
    """
    wanted_fields = ['variants/POS', 'calldata/GT', 'samples']
    callset = allel.read_vcf(path,
                             region=seqid,
                             fields=wanted_fields,
                             tabix="tabix",
                             samples=samples)

    assert "samples" in callset.keys(), (
        "None of the samples provided in {0!r} are found in {1!r}".format(
            samples_path, path))

    positions = allel.SortedIndex(callset["variants/POS"])
    genotypes = allel.GenotypeArray(callset['calldata/GT'])
    return positions, genotypes
Ejemplo n.º 30
0
def countPattern(callset, sample_ix, outgroup):
    """Count site patterns for every three-sample combination.

    Parameters
    ----------
    callset : mapping providing 'calldata/GT' and 'variants/POS'.
    sample_ix : three lists of sample indexes, one per group,
        e.g. ``[[1,2,3,4,5],[6,7,8,9],[12,14,15,16]]``.
    outgroup : sample index (or indexes) of the outgroup.

    Returns
    -------
    tuple
        ``(t1t2dict, windict)``. ``t1t2dict`` holds the lists "t1" and
        "t2" returned by ``calct1t2`` per combination; ``windict`` maps
        the 1-based combination number to ``(positions, patterns)``.
    """
    print("counting patterns in file...")
    gt = allel.GenotypeArray(callset['calldata/GT'])
    pos = allel.SortedIndex(callset['variants/POS'])

    # Keep biallelic sites where the outgroup is homozygous
    # (this removes outgroup ./. and 0/1 calls).
    site_ok = gt[:, outgroup].is_hom() & gt.count_alleles().is_biallelic()
    gt = gt.compress(site_ok, axis=0)
    pos = pos[site_ok]

    t1t2dict = defaultdict(list)
    windict = {}
    g1, g2, g3 = sample_ix
    combos = list(product(g1, g2, g3))
    print("total number of combinations: {}".format(len(combos)))

    for permute, (i, j, k) in enumerate(combos, start=1):
        print("permutation number {}".format(permute))
        gt_sub = gt.take([i, j, k, outgroup], axis=1)
        all_hom = gt_sub.is_hom().all(axis=1)
        gt_sub = gt_sub.compress(all_hom, axis=0)
        pos_sub = pos[all_hom]
        pattern_array = np.packbits(gt_sub.is_hom_alt(), axis=1)

        # Occurrences of each packed pattern byte.
        codes, occurrences = np.unique(pattern_array, return_counts=True)
        d = dict(zip(codes, occurrences))

        def tally(code_a, code_b):
            # A pattern and its complement are counted together.
            return d.get(code_a, 0) + d.get(code_b, 0)

        AAAA = tally(0, 240)  # FFFF TTTT 240 and 0
        BAAA = tally(112, 128)  # FTTT + TFFF 112 and 128
        ABAA = tally(176, 64)  # TFTT + FTFF 176 and 64
        AABA = tally(208, 32)  # TTFT + FFTF 208 and 32
        BBAA = tally(48, 192)  # FFTT + TTFF 48 and 192
        ABBA = tally(144, 96)  # TFFT + FTTF 144 and 96
        BABA = tally(80, 160)  # FTFT + TFTF 80 and 160
        BBBA = tally(224, 16)  # FFFT + TTTF 224 and 16

        t1, t2 = calct1t2(AAAA, BAAA, ABAA, AABA, BBAA, ABBA, BABA, BBBA)
        t1t2dict["t1"].append(t1)
        t1t2dict["t2"].append(t2)

        windict[permute] = (pos_sub, pattern_array)
    return (t1t2dict, windict)