def print_pi(self, tree_sequence, indices, populations):
        """Write per-population diversity and pairwise Hudson Fst to the 'pi' writer.

        Does nothing when ``self.pi_needed()`` is false.  Otherwise a header
        line and one data line are written to ``self.writers['pi']``.  The
        data line holds, for AF, EU and AS, the summed mean pairwise
        difference divided by the sequence length, followed by
        ``num.sum() / den.sum()`` from ``allel.hudson_fst`` for the pairs
        AF-EU, AF-AS and EU-AS.

        Args:
            tree_sequence: object providing ``get_sequence_length()`` and
                ``genotype_matrix()``.
            indices: one population index per column of the genotype matrix.
            populations: mapping with keys 'AF', 'EU' and 'AS' giving the
                index value that identifies each population in ``indices``.
        """
        if not self.pi_needed():
            return

        writer = self.writers['pi']

        # Fixed column order, so the output does not depend on dict ordering.
        pops = 'AF EU AS'.split()
        pop_pairs = (('AF', 'EU'), ('AF', 'AS'), ('EU', 'AS'))
        indices = np.array(indices)

        writer.write('\t'.join(pops) + '\t')
        writer.write('AF-EU\tAF-AS\tEU-AS\n')

        length = tree_sequence.get_sequence_length()
        haplotypes = tree_sequence.genotype_matrix()

        # Count alleles once per population; both the diversity and the Fst
        # loops below reuse these instead of recounting for every pair.
        counts = {
            pop: allel.HaplotypeArray(
                haplotypes[:, indices == populations[pop]]).count_alleles()
            for pop in pops
        }

        for pop in pops:
            mpd = allel.mean_pairwise_difference(counts[pop])
            writer.write(f'{mpd.sum()/length:.5}\t')

        for first, second in pop_pairs:
            num, den = allel.hudson_fst(counts[first], counts[second])
            writer.write(f'{num.sum() / den.sum():.5}\t')
        # NOTE: the data line ends with a tab before the newline while the
        # header does not; kept as-is so existing consumers see the same bytes.
        writer.write('\n')
Beispiel #2
0
def ts_to_dadi_sfs(ts_path,
                   out_path,
                   out_path_nonvariant,
                   sample_size=20,
                   mask_file=None):
    """Convert a two-population tree sequence into joint SFS files for dadi.

    The tree sequence at ``ts_path`` is loaded, sites falling inside masked
    intervals are dropped, and the remaining haplotypes are split into two
    populations.  The joint site frequency spectrum is written twice: once
    as computed, and once with the ``[0, 0]`` entry set to the number of
    non-variant positions.

    Args:
        ts_path: path of the tree sequence file.  The part of the file name
            before the first ``.`` is used as the chromosome name when
            selecting rows of the mask table.
        out_path: destination of the joint SFS.
        out_path_nonvariant: destination of the joint SFS whose ``[0, 0]``
            entry is ``total_length - num_sites``.
        sample_size: number of leading haplotype columns assigned to the
            first population; all remaining columns form the second.
        mask_file: optional tab-separated file without header whose column
            0 is the chromosome name and columns 1 and 2 are interval
            bounds.  Sites inside an interval are excluded and the summed
            interval length is subtracted from the sequence length.
    """
    ts = tskit.load(ts_path)
    haps = ts.genotype_matrix()
    total_length = ts.sequence_length

    # One flag per row of the genotype matrix.  Sizing this by the number of
    # mutations (as before) breaks whenever a site carries several mutations.
    masked = np.zeros(haps.shape[0], dtype=bool)
    if mask_file:
        mask_table = pd.read_csv(mask_file, sep="\t", header=None)
        chrom = ts_path.split("/")[-1].split(".")[0]
        sub = mask_table[mask_table[0] == chrom]
        mask_ints = pd.IntervalIndex.from_arrays(sub[1], sub[2])
        snp_locs = [int(x.site.position) for x in ts.variants()]
        # IntervalIndex.contains is element-wise over the intervals in
        # current pandas; np.any collapses it to "inside any interval" and
        # also accepts the plain bool returned by old pandas versions.
        masked = np.array([np.any(mask_ints.contains(x)) for x in snp_locs],
                          dtype=bool)
        total_length -= np.sum(mask_ints.length)

    haps_pops_joint = np.array(haps[~masked, :])

    # Break up the haplotypes into separate populations based on sample_size
    haps_pop0_joint = haps_pops_joint[:, :sample_size]
    haps_pop1_joint = haps_pops_joint[:, sample_size:]

    genotypes_pop0_joint = allel.HaplotypeArray(haps_pop0_joint).to_genotypes(
        ploidy=2)
    allele_counts_pop0_joint = genotypes_pop0_joint.count_alleles()
    genotypes_pop1_joint = allel.HaplotypeArray(haps_pop1_joint).to_genotypes(
        ploidy=2)
    allele_counts_pop1_joint = genotypes_pop1_joint.count_alleles()

    sfs_joint = allel.joint_sfs(allele_counts_pop0_joint[:, 1],
                                allele_counts_pop1_joint[:, 1])
    num_sites = sfs_joint.sum()
    sfs_joint = dadi.Spectrum(sfs_joint)
    sfs_joint.to_file(out_path)
    # The [0, 0] entry holds the number of non-variant positions.
    sfs_joint[0, 0] = total_length - num_sites
    sfs_joint.to_file(out_path_nonvariant)
Beispiel #3
0
    def ts_to_stairway(self, ts_path, num_bootstraps=1, mask_file=None):
        """
        Converts the specified tskit tree sequences to text files used by
        stairway plot.

        For every path in ``ts_path`` the tree sequence is loaded, sites that
        fall inside masked intervals are dropped, an SFS comparison plot
        (unmasked vs. masked) is written to ``<path>.sfs.pdf`` and the
        derived allele counts are accumulated.  Entry 0 of the accumulator
        holds the observed counts; entries ``1..num_bootstraps`` hold counts
        resampled over sites with replacement.

        Args:
            ts_path: iterable of tree sequence file paths.  The part of each
                file name before the first ``.`` is used as the chromosome
                name when selecting rows of the mask table.
            num_bootstraps: number of bootstrap replicates to generate.
            mask_file: optional tab-separated file without header whose
                column 0 is the chromosome name and columns 1 and 2 are
                interval bounds.

        Returns:
            List of the files written, ``self.workdir / "sfs_<k>.txt"``, one
            per accumulator entry.
        """
        derived_counts_all = [[] for _ in range(num_bootstraps + 1)]
        total_length = 0
        num_samples = 0

        # The mask table does not depend on the tree sequence: read it once.
        mask_table = None
        if mask_file:
            mask_table = pd.read_csv(mask_file, sep="\t", header=None)

        for ts_p in ts_path:
            ts = tskit.load(ts_p)
            total_length += ts.sequence_length
            num_samples = ts.num_samples
            haps = ts.genotype_matrix()

            # One flag per row of the genotype matrix.  Sizing this by the
            # number of mutations (as before) breaks whenever a site carries
            # several mutations.
            masked = np.zeros(haps.shape[0], dtype=bool)
            if mask_table is not None:
                chrom = ts_p.split("/")[-1].split(".")[0]
                sub = mask_table[mask_table[0] == chrom]
                mask_ints = pd.IntervalIndex.from_arrays(sub[1], sub[2])
                snp_locs = [int(x.site.position) for x in ts.variants()]
                # IntervalIndex.contains is element-wise over the intervals
                # in current pandas; np.any collapses it to "inside any
                # interval" and also accepts the bool of old pandas versions.
                masked = np.array(
                    [np.any(mask_ints.contains(x)) for x in snp_locs],
                    dtype=bool)
                total_length -= np.sum(mask_ints.length)
            retain = np.logical_not(masked)

            # SFS before and after masking, both without the 0 bin
            SFSs = [
                allel.sfs(allel.HaplotypeArray(haps).count_alleles()[:, 1])[1:]
            ]
            allele_counts = allel.HaplotypeArray(haps[retain, :]).count_alleles()
            SFSs.append(allel.sfs(allele_counts[:, 1])[1:])
            sfs_path = ts_p + ".sfs.pdf"
            plots.plot_sfs(SFSs, sfs_path)

            # Bootstrap allele counts
            derived_counts_all[0].extend(allele_counts[:, 1])
            nsites = np.shape(allele_counts)[0]
            for j in range(1, num_bootstraps + 1):
                bootset = np.random.choice(np.arange(nsites), nsites, replace=True)
                bootac = allele_counts[bootset, :]
                derived_counts_all[j].extend(bootac[:, 1])

        # Get the SFS minus the 0 bin and write output
        stairway_files = []
        for index, derived_counts in enumerate(derived_counts_all):
            sfs = allel.sfs(derived_counts)[1:]
            filename = self.workdir / "sfs_{}.txt".format(index)
            write_stairway_sfs(total_length, num_samples, sfs, filename)
            stairway_files.append(filename)

        return stairway_files
Beispiel #4
0
def msp2sf2(tree_sequence, npops):
    """Write per-population input files from a tree sequence.

    For every population index ``i`` in ``range(npops)`` two tab-separated
    files are created in the current working directory:

    * ``<i>.Neutral.sf2in`` with columns position, x (derived allele count),
      n (number of samples in the population) and folded (always 0);
    * ``<i>.Neutral.sf2inrecomb`` with columns position and rate, where the
      rate is 0 for the first variant and ``position / 850000.0`` otherwise.

    Only variants with a derived allele count above zero in the population
    are written.

    Args:
        tree_sequence: object providing ``get_samples``,
            ``get_num_mutations``, ``get_sample_size`` and ``variants``.
        npops: number of populations to export.

    Returns:
        None.
    """
    pix = [tree_sequence.get_samples(pop) for pop in range(npops)]
    # get derived allele counts from allel
    muts = tree_sequence.get_num_mutations()
    sample_size = tree_sequence.get_sample_size()
    V = np.zeros((muts, sample_size), dtype=np.int8)
    positions = []
    # Single pass over the variants collects genotypes and positions.
    for variant in tree_sequence.variants():
        V[variant.index] = variant.genotypes
        positions.append(int(variant.position))
    # Wrapped once after the loop, so it is also defined when the tree
    # sequence has no variants at all.
    gt = allel.HaplotypeArray(V)
    pos = allel.SortedIndex(positions)

    for i, p in enumerate(pix):
        ac = gt[:, p].count_alleles()[:, 1]
        # Both files are closed even if writing fails part-way.
        with open("{}.Neutral.sf2inrecomb".format(i), 'w') as d, \
                open("{}.Neutral.sf2in".format(i), 'w') as f:
            d.write("position\trate\n")
            f.write("position\tx\tn\tfolded\n")
            for r, dac in enumerate(ac):
                if dac > 0:
                    f.write("{}\t{}\t{}\t0\n".format(pos[r], dac, len(p)))
                    if r != 0:
                        d.write("{}\t{}\n".format(pos[r], pos[r] / 850000.0))
                    else:
                        d.write("{}\t{}\n".format(pos[r], 0))
    return None
Beispiel #5
0
 def test_masked_windowed_diversity(self):
     """Masked size-10 windows must match every other unmasked size-5 window.

     The accessibility mask alternates five accessible and five inaccessible
     positions, so each size-10 window only sees its first half.
     """
     # four haplotypes, i.e. 6 pairwise comparisons per variant
     haplotypes = allel.HaplotypeArray([
         [0, 0, 0, 0],
         [0, 0, 0, 1],
         [0, 0, 1, 1],
         [0, 1, 1, 1],
         [1, 1, 1, 1],
         [0, 0, 1, 2],
         [0, 1, 1, 2],
         [0, 1, -1, -1],
         [-1, -1, -1, -1],
     ])
     allele_counts = haplotypes.count_alleles()
     positions = SortedIndex([2, 4, 7, 14, 15, 18, 19, 25, 27])
     # True x5, False x5, repeated three times -> 30 positions
     accessible = np.tile(np.repeat(np.array([True, False]), 5), 3)

     # reference: plain size-5 windows, keeping every other one
     small_windows, _, _, _ = allel.windowed_diversity(
         positions, allele_counts, size=5, start=1, stop=31)
     expect = small_windows[::2]

     # under test: size-10 windows whose second half is masked out
     actual, _, _, _ = allel.windowed_diversity(
         positions, allele_counts, size=10, start=1, stop=31,
         is_accessible=accessible)

     assert_array_almost_equal(expect, actual)
Beispiel #6
0
def simulate(out_path,
             species,
             model,
             genetic_map,
             seed,
             chrmStr,
             sample_size=20,
             population=0,
             ld_thresh=1.0,
             max_workers=1):
    """Simulate one chromosome and write the tree sequence plus an SFS plot.

    The tree sequence is dumped to ``out_path`` and an SFS plot is written
    to ``out_path + ".sfs.pdf"``.  When ``ld_thresh`` is below 1.0 the result
    of ``unlinked(ts, ld_thresh, max_workers)`` is pickled to
    ``out_path + ".r2Mask.p"`` and a second SFS, restricted to the rows it
    selects, is added to the plot.

    Args:
        out_path: destination of the tree sequence; prefix of the other
            output files.
        species: object exposing ``genome.chromosomes``.
        model: object whose ``asdict()`` supplies extra keyword arguments
            for ``msp.simulate``.
        genetic_map: object whose ``name`` selects the recombination map.
        seed: random seed passed to the simulator.
        chrmStr: key of the chromosome to simulate.
        sample_size: number of samples to draw.
        population: population id given to every sample.
        ld_thresh: threshold passed to ``unlinked``; 1.0 disables masking.
        max_workers: worker count passed to ``unlinked``.
    """
    mask_path = out_path + ".r2Mask.p"
    sfs_path = out_path + ".sfs.pdf"
    chrom = species.genome.chromosomes[chrmStr]
    samples = [msp.Sample(population=population, time=0)] * sample_size
    print("Simulating...")
    ts = msp.simulate(samples=samples,
                      recombination_map=chrom.recombination_map(
                          genetic_map.name),
                      mutation_rate=chrom.default_mutation_rate,
                      random_seed=seed,
                      **model.asdict())
    ts.dump(out_path)
    haps = allel.HaplotypeArray(ts.genotype_matrix())
    # SFS without the 0 bin, computed over all sites
    SFSs = [allel.sfs(haps.count_alleles()[:, 1])[1:]]
    print("Simulation finished!")
    if ld_thresh < 1.0:
        ul = unlinked(ts, ld_thresh, max_workers)
        # The context manager guarantees the mask is flushed and the handle
        # closed; the bare open() used before leaked it.
        with open(mask_path, "wb") as mask_file:
            pickle.dump(ul, mask_file)
        SFSs.append(allel.sfs(haps[ul, :].count_alleles()[:, 1])[1:])
    plot_sfs(SFSs, sfs_path)
Beispiel #7
0
def _flank_haplotypes(haparray, haplotype_pos, start, stop, n_samples):
    """Return (positions, HaplotypeArray) for variants located in [start, stop]."""
    loc = haplotype_pos.locate_range(start, stop)
    haps_3d = haparray[loc, :n_samples, :]
    # Collapse the (sample, allele) axes into a single haplotype axis.  The
    # width comes from the slice itself instead of a hard-coded 2 * n_samples.
    n_haplotypes = haps_3d.shape[1] * haps_3d.shape[2]
    haps = allel.HaplotypeArray(haps_3d.reshape(haps_3d.shape[0], n_haplotypes))
    return haplotype_pos[loc], haps


def extract_haplotype_array(haparray, haplotype_pos, core_left, core_right,
                            flank, n_samples=1142):
    """Extract the haplotypes flanking a core region on both sides.

    Args:
        haparray: 3-D array indexed as (variant, sample, allele).
        haplotype_pos: position index of ``haparray`` providing
            ``locate_range``.
        core_left: left boundary of the core region.
        core_right: right boundary of the core region.
        flank: width of the flank taken on each side of the core.
        n_samples: number of leading samples to keep.  Defaults to 1142,
            the value that used to be hard-coded.

    Returns:
        Tuple ``(pos_left, haps_left, pos_right, haps_right)`` where the
        left flank spans ``core_left - flank .. core_left`` and the right
        flank spans ``core_right .. core_right + flank``.
    """
    # Get the haplotypes on the right of the group.
    pos_right, haps_right = _flank_haplotypes(
        haparray, haplotype_pos, core_right, core_right + flank, n_samples)
    # Get the haplotypes on the left of the group.
    pos_left, haps_left = _flank_haplotypes(
        haparray, haplotype_pos, core_left - flank, core_left, n_samples)

    return (pos_left, haps_left, pos_right, haps_right)
Beispiel #8
0
 def __init__(self, input_path: str, genotypes: allel.GenotypeArray,
              variants: allel.VariantTable, chrom: str, sample_list: list,
              parent_sample: str):
     """Store the inputs and allocate placeholder haplotype arrays.

     The zygosity starts as ``Zygosity.UNDEFINED``.  Both haplotype
     attributes are created from ``np.empty`` buffers of shape
     (n_variants, n_samples) with dtype 'i1', so their contents are
     uninitialised until they are replaced.
     """
     # Inputs, stored unchanged
     self.__input_path: str = input_path
     self.__genotypes: allel.GenotypeArray = genotypes
     self.__variants: allel.VariantTable = variants
     self.__chrom: str = chrom
     self.__sample_list: list = sample_list
     self.__parent_sample: str = parent_sample

     # Derived state
     self.__zygosity: Zygosity = Zygosity.UNDEFINED
     placeholder_shape = (self.__genotypes.n_variants,
                          self.__genotypes.n_samples)
     self.__parent_haplotypes: allel.HaplotypeArray = allel.HaplotypeArray(
         np.empty(placeholder_shape, dtype='i1'))
     self.__parent_n_progeny_haplotypes: allel.HaplotypeArray = (
         allel.HaplotypeArray(np.empty(placeholder_shape, dtype='i1')))
Beispiel #9
0
    def ts_to_stairway(self, ts_path, num_bootstraps=1):
        """
        Converts the specified tskit tree sequences to text files used by
        stairway plot.

        For every path in ``ts_path`` the tree sequence is loaded and its
        derived allele counts are accumulated.  If a pickled mask exists at
        ``<path>.unlinkedMask.p`` only the rows it selects are counted.
        Entry 0 of the accumulator holds the observed counts; entries
        ``1..num_bootstraps`` hold counts resampled over sites with
        replacement.

        Args:
            ts_path: iterable of tree sequence file paths.
            num_bootstraps: number of bootstrap replicates to generate.

        Returns:
            List of the files written, ``self.workdir / "sfs_<k>.txt"``, one
            per accumulator entry.
        """
        derived_counts_all = [[] for _ in range(num_bootstraps + 1)]
        total_length = 0
        num_samples = 0
        for ts_p in ts_path:
            ts = tskit.load(ts_p)
            total_length += ts.sequence_length
            num_samples = ts.num_samples
            haps = ts.genotype_matrix()

            # Mask high-ld sites when a mask file sits next to the input
            mask_path = ts_p + ".unlinkedMask.p"
            if os.path.exists(mask_path):
                # NOTE: unpickling runs arbitrary code; the mask file must
                # come from a trusted source.  The context manager closes
                # the handle, which the bare open() used before did not.
                with open(mask_path, "rb") as mask_file:
                    ul = pickle.load(mask_file)
                haps = haps[ul, :]
            allele_counts = allel.HaplotypeArray(haps).count_alleles()

            # Bootstrap allele counts
            derived_counts_all[0].extend(allele_counts[:, 1])
            nsites = np.shape(allele_counts)[0]
            for j in range(1, num_bootstraps + 1):
                bootset = np.random.choice(np.arange(nsites),
                                           nsites,
                                           replace=True)
                bootac = allele_counts[bootset, :]
                derived_counts_all[j].extend(bootac[:, 1])

        # Get the SFS minus the 0 bin and write output
        stairway_files = []
        for index, derived_counts in enumerate(derived_counts_all):
            sfs = allel.sfs(derived_counts)[1:]
            filename = self.workdir / "sfs_{}.txt".format(index)
            write_stairway_sfs(total_length, num_samples, sfs, filename)
            stairway_files.append(filename)

        return stairway_files
Beispiel #10
0
    def jsfs(self, fold=False):
        """Gather ``afs.jsfs_stats`` output for every pair of populations.

        Populations come from ``self.stats["pop_config"]``; each entry is a
        list of haplotype column indices.  For a pair the two index lists
        are concatenated, the matching columns are taken from the transposed
        ``self.haparr`` and handed to ``afs.jsfs_stats`` together with the
        size of the first population, the positions and ``fold``.

        Returns:
            Flat list with the values returned for each pair, in pair order.
        """
        haplotypes = allel.HaplotypeArray(self.haparr.T)
        positions = allel.SortedIndex(self.pos)
        collected = []
        for first_pop, second_pop in combinations(self.stats["pop_config"], 2):
            pair_columns = haplotypes.take(first_pop + second_pop, axis=1)
            pair_stats = afs.jsfs_stats(len(first_pop), pair_columns,
                                        positions, fold)
            collected.extend(pair_stats)

        return collected
Beispiel #11
0
    def calc_heterozygous_haplotypes(
            self) -> None:
        """ Calculates the parent and progeny haplotypes from 'self.genotypes'.
            BOTH alleles of every progeny sample are used, on the assumption
            that the genotypes are heterozygous.

            Nothing is returned; the results are stored on the instance:
            parent haplotypes in self.__parent_haplotypes, and parent plus
            progeny haplotypes in self.__parent_n_progeny_haplotypes.  The
            zygosity is then updated through self.__set_heterozygous().
        """
        # Parent genotype is indexed in position 0
        genotypes_parent = self.genotypes[:, 0]
        # Convert to haplotype array
        haplotypes_parent = genotypes_parent.to_haplotypes()
        # Pull out both alleles (haplotypes) of ALL samples; the parent
        # (sample 0) is still included here and is skipped by the loop below.
        left_alleles = allel.HaplotypeArray(self.genotypes[:, :, 0])
        right_alleles = allel.HaplotypeArray(self.genotypes[:, :, 1])

        # Initially, copy parent alleles
        haplotypes_parent_n_progeny = allel.HaplotypeArray(haplotypes_parent,
                                                           copy=True)
        wanted_variants = np.repeat(
            True, self.genotypes.n_variants)  # Select every variant
        wanted_sample = np.repeat(
            False, self.genotypes.n_samples)  # No sample selected initially
        # Start at 1, we already copied the parents haplotypes in 'haplotypes_parent_n_progeny'
        for sample_index in range(
                1, self.genotypes.n_samples
        ):  # Skip haplotype 0 (parent), it is already inserted
            # Select exactly one sample, extract its two allele columns,
            # then clear the flag again for the next iteration.
            wanted_sample[sample_index] = True
            subset_left_alleles = left_alleles.subset(wanted_variants,
                                                      wanted_sample)[:]
            subset_right_alleles = right_alleles.subset(
                wanted_variants, wanted_sample)[:]
            wanted_sample[sample_index] = False
            # Append left then right allele, so each progeny sample adds two
            # adjacent columns.
            haplotypes_parent_n_progeny = haplotypes_parent_n_progeny.concatenate(
                subset_left_alleles, axis=1)
            haplotypes_parent_n_progeny = haplotypes_parent_n_progeny.concatenate(
                subset_right_alleles, axis=1)

        self.__parent_haplotypes = haplotypes_parent
        self.__parent_n_progeny_haplotypes = haplotypes_parent_n_progeny
        self.__set_heterozygous()
Beispiel #12
0
    def sfs(self, fold=False):
        """Gather ``afs.asfs_stats`` output for every population.

        NOTE: the ``fold`` argument is ignored; the value passed on is
        always ``self.stats["sfs_fold"]``.

        Populations come from ``self.stats["pop_config"]``; each entry
        selects haplotype columns of the transposed ``self.haparr``.

        Returns:
            Flat list with the values returned for each population, in
            population order.
        """
        fold = self.stats["sfs_fold"]
        haplotypes = allel.HaplotypeArray(self.haparr.T)
        positions = allel.SortedIndex(self.pos)
        collected = []
        for pop_columns in self.stats["pop_config"]:
            pop_haplotypes = haplotypes.take(pop_columns, axis=1)
            pop_stats = afs.asfs_stats(pop_haplotypes, positions, fold)
            collected.extend(pop_stats)

        return collected
Beispiel #13
0
def Fst_IBD(trees):
    """Compute Weir-Cockerham Fst between subpopulations of a SLiM tree file.

    Mutations are overlaid at rate 1e-8, haplotypes are paired into diploid
    genotypes, and consecutive blocks of 100 individuals are treated as
    subpopulations.  The genotype shape, Tajima's D and every tested pair
    are printed along the way.

    Args:
        trees: path of the tree sequence file; the part of the file name
            before the first ``_`` is used as the replicate label.

    Returns:
        List of ``[replicate, distance, fst]`` string triples.  The first
        row has distance "-1" and holds the Fst over ten randomly chosen
        subpopulations; the following rows hold the Fst between one randomly
        chosen subpopulation and those 14, 28, ... indices further on, with
        the distance expressed in steps of 14.
    """
    tree_seq = pyslim.load(trees)
    mutated_tree = msprime.mutate(tree_seq, 1e-8)

    # Haplotype matrix -> diploid genotypes by pairing adjacent chromosomes
    genotype_array = allel.HaplotypeArray(
        mutated_tree.genotype_matrix()).to_genotypes(ploidy=2)
    print(genotype_array.shape)

    # Windowed diversity; the value is not used further down
    window_edges = [0, *(m * 1e6 for m in range(1, 11)), 10e6 + 1]
    pi = mutated_tree.diversity(windows=window_edges)

    # Tajima's D
    allele_counts = genotype_array.count_alleles()
    tajima = allel.tajima_d(allele_counts)
    print(tajima)

    row = np.random.choice(13)
    pairs = [[row, row + (14 * step)] for step in range(14)]

    subpopulations = [
        list(range(first, first + 100))
        for first in range(0, genotype_array.shape[1], 100)
    ]
    subpop_table = np.array(subpopulations)

    random_subpops = subpop_table[np.random.choice(len(subpopulations),
                                                   10,
                                                   replace=False)]
    overall_fst = allel.average_weir_cockerham_fst(genotype_array,
                                                   blen=10000,
                                                   subpops=random_subpops)

    rep = trees.split("/")[-1].split("_")[0]
    output = [[str(rep), "-1", str(overall_fst[0])]]

    for pair in pairs:
        print(pair)
        dist = (pair[1] - pair[0]) / 14
        if dist != 0:
            pair_fst = allel.average_weir_cockerham_fst(
                genotype_array, blen=1000, subpops=subpop_table[pair])
            output.append([str(rep), str(int(dist)), str(pair_fst[0])])

    return (output)
Beispiel #14
0
def ac_from_ts(ts, n_pops, N):
    """Count alleles per subpopulation of a tree sequence.

    Haplotypes are paired into diploid genotypes; subpopulation ``k`` is
    made of the individuals with indices ``k * N`` to ``(k + 1) * N - 1``.

    Args:
        ts: tree sequence providing ``genotype_matrix()`` and ``sites()``.
        n_pops: number of subpopulations.
        N: number of diploid individuals per subpopulation.

    Returns:
        Tuple ``(acs, pos)``: the list of allele counts, one entry per
        subpopulation, and an array with the position of every site.
    """
    diploids = allel.HaplotypeArray(
        ts.genotype_matrix()).to_genotypes(ploidy=2)
    acs = [
        diploids.count_alleles(
            subpop=list(np.arange(k * N, (k + 1) * N)))
        for k in range(n_pops)
    ]
    pos = np.array([site.position for site in ts.sites()])
    return (acs, pos)
Beispiel #15
0
    def tajd(self):
        """Gather the two values of ``popstats.tajimaD`` for every population.

        Populations come from ``self.stats["pop_config"]``; each entry
        selects haplotype columns of the transposed ``self.haparr``.  The
        window size is ``self.stats["win_size1"]`` and the sequence length
        ``self.stats["length_bp"]``.

        Returns:
            Flat list holding, per population, the first and then the second
            value returned by ``popstats.tajimaD``.
        """
        haplotypes = allel.HaplotypeArray(self.haparr.T)
        positions = allel.SortedIndex(self.pos)
        window = self.stats["win_size1"]
        seq_length = self.stats["length_bp"]
        collected = []
        for pop_columns in self.stats["pop_config"]:
            pop_haplotypes = haplotypes.take(pop_columns, axis=1)
            first, second = popstats.tajimaD(positions, pop_haplotypes,
                                             window, seq_length)
            collected.append(first)
            collected.append(second)

        return collected
Beispiel #16
0
def genotypes(tree_seq):
    """Return the sampled genotypes as a scikit-allel genotype array.

    The nodes returned by ``get_sampled_nodes(tree_seq)`` are concatenated
    into one flat list of samples; consecutive pairs of those haplotypes
    form one diploid genotype.

    Args:
        tree_seq: tree sequence providing ``num_sites`` and ``variants``.

    Returns:
        The haplotypes of the sampled nodes converted with
        ``to_genotypes(ploidy=2)``.
    """
    samples = get_sampled_nodes(tree_seq)
    samples = np.concatenate(samples).flatten()
    # variants() yields one entry per site, so the buffer must have one row
    # per site.  Sizing it by the number of mutations left uninitialised
    # rows (more mutations than sites) or overflowed (fewer).
    haplotype_array = np.empty((tree_seq.num_sites, len(samples)),
                               dtype=np.int8)
    for j, variant in enumerate(tree_seq.variants(
            samples=samples)):  # output order corresponds to samples
        haplotype_array[j, :] = variant.genotypes
    haplotype_array = allel.HaplotypeArray(haplotype_array)
    allel_genotypes = haplotype_array.to_genotypes(ploidy=2)
    return allel_genotypes
Beispiel #17
0
def msprime_to_dadi_simulation(path, seed, org, chrom, sample_size=20):
    """Simulate two populations with msprime and save their joint SFS for dadi.

    Args:
        path: destination of the joint SFS file.
        seed: random seed passed to ``msprime.simulate``.
        org: string of the form ``<species>_<model>``; everything before the
            last underscore names an attribute of ``stdpopsim`` and the rest
            names the model class inside it.
        chrom: key of the chromosome to simulate.
        sample_size: number of samples drawn from each of populations 0
            and 1.
    """
    species_name, _, model_name = org.rpartition('_')
    chrom = getattr(stdpopsim, species_name).genome.chromosomes[chrom]
    model = getattr(getattr(stdpopsim, species_name), model_name)()

    pop0_samples = [msprime.Sample(population=0, time=0)] * sample_size
    pop1_samples = [msprime.Sample(population=1, time=0)] * sample_size
    tree_seq = msprime.simulate(
        samples=pop0_samples + pop1_samples,
        recombination_map=chrom.recombination_map(),
        mutation_rate=chrom.default_mutation_rate,
        random_seed=seed,
        **model.asdict())
    all_haplotypes = np.array(tree_seq.genotype_matrix())

    # The first sample_size columns belong to population 0, the rest to
    # population 1.
    per_pop_counts = []
    for pop_haplotypes in (all_haplotypes[:, :sample_size],
                           all_haplotypes[:, sample_size:]):
        pop_genotypes = allel.HaplotypeArray(pop_haplotypes).to_genotypes(
            ploidy=2)
        per_pop_counts.append(pop_genotypes.count_alleles())
    counts_pop0, counts_pop1 = per_pop_counts

    spectrum = dadi.Spectrum(
        allel.joint_sfs(counts_pop0[:, 1], counts_pop1[:, 1]))
    spectrum.to_file(path)
Beispiel #18
0
 def delta_tajD(self):
     """Gather ``pwpopstats.d_tajD`` output for every pair of populations.

     Populations come from ``self.stats["pop_config"]``; for a pair the two
     index lists are concatenated and the matching haplotype columns of the
     transposed ``self.haparr`` are passed on together with the size of the
     first population, the positions, ``win_size1``, ``length_bp`` and
     ``pw_quants``.

     Returns:
         Flat list with the values returned for each pair, in pair order.
     """
     haplotypes = allel.HaplotypeArray(self.haparr.T)
     positions = allel.SortedIndex(self.pos)
     window = self.stats["win_size1"]
     seq_length = self.stats["length_bp"]
     quantiles = self.stats["pw_quants"]
     collected = []
     for first_pop, second_pop in combinations(self.stats["pop_config"], 2):
         pair_columns = haplotypes.take(first_pop + second_pop, axis=1)
         pair_stats = pwpopstats.d_tajD(len(first_pop), positions,
                                        pair_columns, window, seq_length,
                                        quantiles)
         collected.extend(pair_stats)
     return collected
Beispiel #19
0
 def ddRank12(self):
     """Gather ``pwpopstats.ddRank1_2`` output for every pair of populations.

     Populations come from ``self.stats["pop_config"]``; for a pair the two
     index lists are concatenated and the matching haplotype columns of the
     transposed ``self.haparr`` are passed on together with the size of the
     first population, the positions, ``win_size2``, ``length_bp`` and
     ``pw_quants``.

     Returns:
         Flat list with the values returned for each pair, in pair order.
     """
     haplotypes = allel.HaplotypeArray(self.haparr.T)
     positions = allel.SortedIndex(self.pos)
     quantiles = self.stats["pw_quants"]
     window = self.stats["win_size2"]
     seq_length = self.stats["length_bp"]
     collected = []
     for first_pop, second_pop in combinations(self.stats["pop_config"], 2):
         pair_columns = haplotypes.take(first_pop + second_pop, axis=1)
         # per the original note, two values come back as a list [dd1, dd2]
         pair_stats = pwpopstats.ddRank1_2(len(first_pop), positions,
                                           pair_columns, window, seq_length,
                                           quantiles)
         collected.extend(pair_stats)
     return collected
Beispiel #20
0
 def FST(self):
     """Gather ``pwpopstats.fst`` output for every pair of populations.

     Populations come from ``self.stats["pop_config"]``; for a pair the two
     index lists are concatenated and the matching haplotype columns of the
     transposed ``self.haparr`` are passed on together with the size of the
     first population, the positions and ``pw_quants``.  When extending the
     result list with the returned value raises ``TypeError``, one NaN per
     quantile is stored instead.

     Returns:
         Flat list with the values collected for each pair, in pair order.
     """
     haplotypes = allel.HaplotypeArray(self.haparr.T)
     positions = allel.SortedIndex(self.pos)
     quantiles = self.stats["pw_quants"]
     collected = []
     for first_pop, second_pop in combinations(self.stats["pop_config"], 2):
         pair_columns = haplotypes.take(first_pop + second_pop, axis=1)
         pair_stats = pwpopstats.fst(len(first_pop), positions, pair_columns,
                                     quantiles)
         try:
             collected.extend(pair_stats)
         except TypeError:
             # the result could not be iterated: fill with NaN placeholders
             collected.extend([np.nan] * len(quantiles))
     return collected
Beispiel #21
0
 def dmin(self):
     """Summarise ``pwpopstats.dmin`` output for every pair of populations.

     Populations come from ``self.stats["pop_config"]``; for a pair the two
     index lists are concatenated and the matching haplotype columns of the
     transposed ``self.haparr`` are passed on together with the size of the
     first population, the positions, ``win_size2`` and ``length_bp``.  The
     returned values are reduced to their NaN-ignoring mean when the first
     entry of ``pw_quants`` is negative, and to the NaN-ignoring quantiles
     at ``pw_quants`` otherwise.

     Returns:
         Flat list with the summary values of each pair, in pair order.
     """
     haplotypes = allel.HaplotypeArray(self.haparr.T)
     positions = allel.SortedIndex(self.pos)
     quantiles = self.stats["pw_quants"]
     window = self.stats["win_size2"]
     seq_length = self.stats["length_bp"]
     collected = []
     for first_pop, second_pop in combinations(self.stats["pop_config"], 2):
         pair_columns = haplotypes.take(first_pop + second_pop, axis=1)
         pair_values = pwpopstats.dmin(len(first_pop), positions,
                                       pair_columns, window, seq_length)
         summary = ([np.nanmean(pair_values)] if quantiles[0] < 0 else
                    np.nanquantile(pair_values, quantiles))
         collected.extend(summary)
     return collected
def pop_sample_ac(geno_mat, n_pops=8, pop_size=10):
    """Stack per-population allele counts into one array.

    Population ``k`` consists of haplotype columns ``k * pop_size`` to
    ``(k + 1) * pop_size - 1`` of ``geno_mat``.

    Args:
        geno_mat: haplotype matrix accepted by ``allel.HaplotypeArray``,
            with variants in rows and haplotypes in columns.
        n_pops: number of consecutive populations.  Defaults to 8, the
            number that used to be hard-coded.
        pop_size: number of haplotype columns per population.  Defaults to
            10, the size that used to be hard-coded.

    Returns:
        Array whose first axis indexes the population, each frame holding
        that population's allele counts for all variants.
    """
    haplo_arr = allel.HaplotypeArray(geno_mat)
    # Without an explicit max_allele every slice sizes its allele axis from
    # its own data, so np.stack fails as soon as one population lacks an
    # allele that another one carries.  Use the maximum over all the columns
    # that are actually counted.
    max_allele = haplo_arr[:, :n_pops * pop_size].max()
    per_pop_counts = [
        haplo_arr[:, k * pop_size:(k + 1) * pop_size].count_alleles(
            max_allele=max_allele)
        for k in range(n_pops)
    ]
    # stack arrays with frames = population allele counts for all SNPs
    ac_All = np.stack(per_pop_counts, axis=0)
    return ac_All
Beispiel #23
0
def geno2genediv( args ):
    """Write per-region haplotype diversity and FST to ``args.outfile``.

    For every region yielded by the genotype parser the encoded haplotypes
    are treated as alleles of a single locus.  The total expected
    heterozygosity He, the sample-size weighted mean of the per-group
    heterozygosities, their difference dHe and FST = dHe / He are written,
    followed by the mean, median, max and min of the per-group values and
    the per-group values themselves.

    Args:
        args: parsed command line options; passed to
            ``tabparser.GenotypeLineParser`` and providing ``outfile``.
    """
    lineparser = tabparser.GenotypeLineParser( args )
    lineparser.set_translator(lineparser.diploid_translator)

    # set group
    groups = lineparser.parse_grouping()

    cout('Grouping:')
    group_keys = sorted(groups.keys())
    for k in group_keys:
        cout(' %12s %3d' % (k, len(groups[k])))

    # total number of grouped samples; does not change between regions
    n_grouped = sum( len(x) for x in groups.values() )

    # the context manager closes (and flushes) the output file even when a
    # region fails; the bare open() used before never closed it
    with open(args.outfile, 'wt') as outfile:
        outfile.write('CHROM\tPOS\tREGION\tN_SNP\tN_HAPLO\tFST\tdHe\tHe\tMEAN\tMEDIAN\tMAX\tMIN\t%s\n' %
                      '\t'.join( group_keys ))

        for idx, region in enumerate(lineparser.parse_genes()):
            haplotypes = set( region.haplotypes())
            enc_haplos = region.encode_haplotypes()
            haploarray = allel.HaplotypeArray( [enc_haplos] )

            cerr( 'I: calculating %d - %s' % (idx, region.name))

            # calculate total He first
            He = 1 - np.sum( haploarray.count_alleles().to_frequencies()**2 )

            # calculate He per population, He_p
            values = []
            pHe = 0
            for g in group_keys:
                he_p = 1 - np.sum(
                    haploarray.count_alleles(subpop=groups[g]).to_frequencies()**2 )
                pHe += he_p * len(groups[g])
                values.append(he_p)

            dHe = He - pHe / n_grouped
            # He is 0 for a region with a single haplotype, in which case
            # this division does not yield a finite FST
            FST = dHe/He

            params = ( FST, dHe, He, np.mean(values), np.median(values), np.max(values), np.min(values))
            outfile.write('%s\t%s\t%s\t%d\t%d\t%s\t%s\n' % (
                    region.P[0][0], region.P[0][1], region.name, len(region.P), len(haplotypes),
                    '\t'.join( '%5.4f' % x for x in params),
                    '\t'.join( '%5.4f' % x for x in values)))
Beispiel #24
0
def geno2genediv(args):
    """Write per-region, per-group Hudson statistics to ``args.outfile``.

    For every region yielded by the genotype parser the encoded haplotypes
    are treated as alleles of a single locus.  Each group is compared with
    all samples outside it using ``allel.stats.hudson_fst`` and the
    denominator of that statistic is recorded.  A line holds the mean,
    median, max and min over groups followed by the per-group values.

    Args:
        args: parsed command line options; passed to
            ``tabparser.GenotypeLineParser`` and providing ``outfile``.
    """
    lineparser = tabparser.GenotypeLineParser(args)
    lineparser.set_translator(lineparser.diploid_translator)

    # set group
    groups = lineparser.parse_grouping()

    cout('Grouping:')
    group_keys = sorted(groups.keys())
    for k in group_keys:
        cout(' %12s %3d' % (k, len(groups[k])))

    # the context manager closes (and flushes) the output file even when a
    # region fails; the bare open() used before never closed it
    with open(args.outfile, 'wt') as outfile:
        outfile.write(
            'CHROM\tPOS\tREGION\tN_SNP\tN_HAPLO\tMEAN\tMEDIAN\tMAX\tMIN\t%s\n' %
            '\t'.join(group_keys))

        for idx, region in enumerate(lineparser.parse_genes()):
            haplotypes = set(region.haplotypes())
            enc_haplos = region.encode_haplotypes()
            # haplotype codes are expected to be dense, starting at 0
            assert len(haplotypes) == max(enc_haplos) + 1
            haploarray = allel.HaplotypeArray([enc_haplos])

            cerr('I: calculating %d - %s' % (idx, region.name))

            value = []
            for g in group_keys:
                ac_g = haploarray.count_alleles(subpop=groups[g])
                # every sample that is not part of the current group
                ac_ng = haploarray.count_alleles(
                    subpop=list(lineparser.sample_idx - set(groups[g])))
                num, den = allel.stats.hudson_fst(ac_g, ac_ng)
                # NOTE(review): only the denominator is kept, not num / den;
                # confirm this is the intended statistic.
                value.append(den)

            params = (np.mean(value), np.median(value), np.max(value),
                      np.min(value))
            outfile.write('%s\t%s\t%s\t%d\t%d\t%s\t%s\n' %
                          (region.P[0][0], region.P[0][1], region.name,
                           len(region.P), len(haplotypes), '\t'.join(
                               '%5.4f' % x
                               for x in params), '\t'.join('%5.4f' % x
                                                           for x in value)))
Beispiel #25
0
def load_genotypes():
    """Load genotypes and sample names from the source given in ``args``.

    The first of ``args.zarr``, ``args.vcf`` and ``args.matrix`` that is not
    None is used.  The matrix input is a tab-separated table with a
    ``sampleID`` column and one column per variant holding the count (0, 1
    or 2) of the alternate allele; it is expanded into two haplotypes per
    sample.

    Returns:
        Tuple ``(genotypes, samples)``.

    Raises:
        ValueError: if no input source is set, or if the matrix holds a
            value other than 0, 1 or 2.
    """
    if args.zarr is not None:
        print("reading zarr")
        callset = zarr.open_group(args.zarr, mode='r')
        gt = callset['calldata/GT']
        genotypes = allel.GenotypeArray(gt[:])
        samples = callset['samples'][:]
    elif args.vcf is not None:
        print("reading VCF")
        vcf = allel.read_vcf(args.vcf, log=sys.stderr)
        genotypes = allel.GenotypeArray(vcf['calldata/GT'])
        samples = vcf['samples']
    elif args.matrix is not None:
        gmat = pd.read_csv(args.matrix, sep="\t")
        samples = np.array(gmat['sampleID'])
        gmat = gmat.drop(labels="sampleID", axis=1)
        gmat = np.array(gmat, dtype="int8")
        # Any other value used to be skipped silently, which shifted the
        # remaining columns of that sample; reject it instead.
        if not np.isin(gmat, (0, 1, 2)).all():
            raise ValueError(
                "genotype matrix may only contain the values 0, 1 and 2")
        # Two haplotype rows per sample: the first carries the alternate
        # allele for counts 1 and 2, the second only for count 2.
        hmat = np.empty((2 * gmat.shape[0], gmat.shape[1]), dtype=int)
        hmat[0::2] = gmat >= 1
        hmat[1::2] = gmat == 2
        genotypes = allel.HaplotypeArray(
            np.transpose(hmat)).to_genotypes(ploidy=2)
    else:
        raise ValueError(
            "no genotype input given: set one of zarr, vcf or matrix")
    return genotypes, samples
Beispiel #26
0
    def ts_to_stairway(self, ts_path, num_bootstraps=1):
        """
        Converts the specified tskit tree sequences to text files used by
        stairway plot.

        Derived allele counts are accumulated over all paths in ``ts_path``.
        Entry 0 of the accumulator holds the observed counts; entries
        ``1..num_bootstraps`` hold counts resampled over sites with
        replacement.  Each entry is turned into an SFS without the 0 bin
        and written to ``self.workdir / "sfs_<k>.txt"``.

        Returns the list of files written.
        """
        replicates = [[] for _ in range(num_bootstraps + 1)]
        total_length = 0
        num_samples = 0

        for ts_file in ts_path:
            ts = tskit.load(ts_file)
            total_length += ts.sequence_length
            num_samples = ts.num_samples

            # haplotypes -> diploid genotypes -> allele counts per site
            diploids = allel.HaplotypeArray(
                ts.genotype_matrix()).to_genotypes(ploidy=2)
            allele_counts = diploids.count_alleles()
            replicates[0].extend(allele_counts[:, 1])

            # resample the sites once per bootstrap replicate
            for replicate in replicates[1:]:
                nsites = np.shape(allele_counts)[0]
                resampled_rows = np.random.choice(np.arange(0, nsites, 1),
                                                  nsites,
                                                  replace=True)
                resampled_counts = allele_counts[resampled_rows, :]
                replicate.extend(resampled_counts[:, 1])

        stairway_files = []
        for index, derived_counts in enumerate(replicates):
            spectrum = allel.sfs(derived_counts)[1:]
            filename = self.workdir / "sfs_{}.txt".format(index)
            write_stairway_sfs(total_length, num_samples, spectrum, filename)
            stairway_files.append(filename)

        return stairway_files
Beispiel #27
0
    def calc_homozygous_haplotypes(self):
        """ Calculates the parent and progeny haplotypes from 'self.genotypes'.
            Only the first allele of every progeny sample is used, on the
            assumption that the genotypes are homozygous.

            Nothing is returned; the results are stored on the instance:
            parent haplotypes in self.__parent_haplotypes, and parent plus
            progeny haplotypes in self.__parent_n_progeny_haplotypes.  The
            zygosity is then updated through self.__set_homozygous().
        """
        # Sample 0 is the parent; view its genotype as haplotypes
        parent_haps = self.genotypes[:, 0].to_haplotypes()
        # First allele of all remaining samples, which are treated as progeny
        progeny_first_alleles = allel.HaplotypeArray(
            self.genotypes[:, 1:, 0])
        # Parent columns first, progeny columns after them
        combined_haps = parent_haps.concatenate(progeny_first_alleles,
                                                axis=1)

        self.__parent_haplotypes = parent_haps
        self.__parent_n_progeny_haplotypes = combined_haps
        self.__set_homozygous()
Beispiel #28
0
def extract_haplotype_array(haparray, genarray, haplotype_pos, genotype_pos,
                            male_indices, core_left, core_right, flank,
                            n_hap_samples=1058):
    """Extract missing-free haplotypes from the flanks on both sides of a core.

    For the right flank ``[core_right, core_right + flank]`` and the left
    flank ``[core_left - flank, core_left]`` the function stacks, column-wise,
    the haplotypes of the first ``n_hap_samples`` samples of ``haparray`` with
    allele 0 of the ``male_indices`` samples of ``genarray``, restricted to
    the positions present in the haplotype data, and then drops every site
    that contains a ``-1`` (missing) value.

    Args:
        haparray: 3-D call array indexed as (site, sample, allele) whose
            sites correspond to ``haplotype_pos``.
        genarray: 3-D call array indexed as (site, sample, allele) whose
            sites correspond to ``genotype_pos``.
        haplotype_pos: Position index for ``haparray``; must provide
            ``locate_range`` (presumably an ``allel.SortedIndex`` - confirm
            against the caller).
        genotype_pos: Position index for ``genarray``; must provide
            ``locate_range`` and yield an object with ``locate_keys`` when
            sliced.
        male_indices: Sample indices into ``genarray`` whose allele 0 is
            used as a haplotype.
        core_left: Left boundary of the core region.
        core_right: Right boundary of the core region.
        flank: Width of the flanking window taken on each side.
        n_hap_samples: Number of leading samples taken from ``haparray``.
            Defaults to 1058, the value that used to be hard-coded. Two
            alleles per sample are assumed when flattening.

    Returns:
        tuple: ``(pos_left, genhaps_left, pos_right, genhaps_right)`` -
        positions and ``allel.HaplotypeArray`` of the retained sites for the
        left and right flank respectively.
    """
    pos_right, genhaps_right = _extract_flank_haplotypes(
        haparray, genarray, haplotype_pos, genotype_pos, male_indices,
        core_right, core_right + flank, n_hap_samples)
    pos_left, genhaps_left = _extract_flank_haplotypes(
        haparray, genarray, haplotype_pos, genotype_pos, male_indices,
        core_left - flank, core_left, n_hap_samples)

    return (pos_left, genhaps_left, pos_right, genhaps_right)


def _extract_flank_haplotypes(haparray, genarray, haplotype_pos, genotype_pos,
                              male_indices, start, stop, n_hap_samples):
    """Return (positions, haplotypes) of missing-free sites in [start, stop]."""
    # Haplotype table: take the leading samples and flatten the
    # (sample, allele) axes into one haplotype axis.
    hap_loc = haplotype_pos.locate_range(start, stop)
    haps_3d = haparray[hap_loc, :n_hap_samples, :]
    haps = allel.HaplotypeArray(
        haps_3d.reshape(haps_3d.shape[0], n_hap_samples * 2))
    pos = haplotype_pos[hap_loc]

    # Genotype table: slicing the range first and only then filtering to the
    # positions that also occur in the haplotype data is much quicker than
    # filtering the full table directly.
    gen_loc = genotype_pos.locate_range(start, stop)
    gen_3d = genarray[gen_loc, :, :]
    gen_in_hap = genotype_pos[gen_loc].locate_keys(pos)
    gen_in_hap_3d = gen_3d[gen_in_hap, :, :]
    # Only allele 0 of the selected samples is used as their haplotype.
    gen = allel.HaplotypeArray(gen_in_hap_3d[:, male_indices, 0])

    # Stack both sources column-wise (site rows must line up).
    combined = allel.HaplotypeArray(np.concatenate((haps, gen), 1))

    # Keep only sites without any missing (-1) call; these originate from the
    # genotype table. Vectorised row-wise test instead of a per-row lambda.
    complete = np.all(np.asarray(combined) != -1, axis=1)
    return pos[complete], combined[complete, :]
Beispiel #29
0
def load_genotypes():
    """Load genotype calls and return them as a haplotype array with labels.

    Reads ``calldata/GT`` and ``samples`` from the zarr group at ``args.zarr``
    when that option is set, otherwise from the VCF at ``args.vcf``.

    Both input paths now produce the same kind of result. Previously the zarr
    path never built ``haps`` and failed with ``UnboundLocalError`` on return.

    Returns:
        tuple: ``(haps, samples)`` where ``haps`` is an
        ``allel.HaplotypeArray`` whose columns are allele 0 of every sample
        (in sample order) followed by allele 1 of every sample, and
        ``samples`` is an array of matching column labels
        (``<sample>_h0`` ... followed by ``<sample>_h1`` ...).
    """
    if args.zarr is not None:
        print("reading zarr")
        callset = zarr.open_group(args.zarr, mode='r')
        # [:] materialises the stored arrays in memory.
        gt = callset['calldata/GT'][:]
        samples = callset['samples'][:]
    else:
        print("reading VCF")
        vcf = allel.read_vcf(args.vcf, log=sys.stderr)
        gt = vcf['calldata/GT']
        samples = vcf['samples']

    genotypes = allel.GenotypeArray(gt)
    hap0 = genotypes[:, :, 0]
    hap1 = genotypes[:, :, 1]
    # Note the column order: all hap0 in order of samples, then all hap1 in
    # order of samples.
    haps = allel.HaplotypeArray(np.concatenate((hap0, hap1), axis=1))

    # Sample names may come back as bytes (possible for some zarr stores);
    # normalise to str so the suffixes can be appended.
    names = [
        x.decode() if isinstance(x, bytes) else str(x) for x in samples
    ]
    s0 = [x + "_h0" for x in names]
    s1 = [x + "_h1" for x in names]
    samples = np.concatenate((s0, s1), axis=0)
    return haps, samples
Beispiel #30
0
def summary_stats(tree_sequence_file):
    """Compute diversity and mean Weir-Cockerham Fst for a tree sequence file.

    The tree sequence is loaded with pyslim, mutations are overlaid with
    ``msprime.mutate`` at rate 1e-7, and the resulting haplotype matrix is
    converted to diploid genotypes. Samples are split into consecutive groups
    of 100 genotype columns; 10 of those groups are drawn at random (without
    replacement) and used as subpopulations for the Fst estimate.

    Args:
        tree_sequence_file: Path passed to ``pyslim.load``.

    Returns:
        tuple: ``(pi, fst)`` - the diversity of the first window
        ``[0, 1e6)`` and the first element of the result of
        ``allel.average_weir_cockerham_fst``.
    """
    tree_seq = pyslim.load(tree_sequence_file)
    with_mutations = msprime.mutate(tree_seq, 1e-7)

    # Genotype matrix -> scikit-allel haplotypes -> diploid genotypes.
    haplotypes = allel.HaplotypeArray(with_mutations.genotype_matrix())
    genotypes = haplotypes.to_genotypes(ploidy=2)

    # Diversity in two windows; only the first one is reported.
    # NOTE(review): the window edges presumably mirror the simulated genome
    # layout (1 Mb plus a 3 kb segment) - confirm against the simulation.
    diversity = with_mutations.diversity(windows=[0, 1e6, 1e6 + 3000])

    # Consecutive blocks of 100 genotype columns, one block per group.
    n_columns = genotypes.shape[1]
    groups = [
        list(range(start, start + 100))
        for start in range(0, n_columns, 100)
    ]

    # Randomly pick 10 distinct groups to act as subpopulations.
    group_table = np.array(groups)
    chosen = np.random.choice(len(groups), 10, replace=False)
    subpops = group_table[chosen]

    fst_result = allel.average_weir_cockerham_fst(
        genotypes, blen=10000, subpops=subpops)

    return (diversity[0], fst_result[0])