def test_ihs_data():
    """Check ``ihs`` against a hand-computed score on the hap1/hap2 fixture.

    The haplotype matrix is ``hap1`` and ``hap2`` stacked column-wise, and
    variant positions are simply 1..n_variants.  For every combination of
    ``use_threads`` and ``include_edges`` the score at index 9 must match
    ``log(5.5 / 1.5)``.

    NOTE(review): 5.5 and 1.5 are presumably the hand-computed integrated
    haplotype homozygosity values for the two alleles at that variant --
    confirm against the fixture definition.
    """
    h = np.hstack([hap1, hap2])
    pos = np.arange(1, h.shape[0] + 1)
    expect = np.log(5.5 / 1.5)

    for use_threads in True, False:
        for include_edges in True, False:
            score = ihs(h, pos, include_edges=include_edges,
                        use_threads=use_threads)
            actual = score[9]
            # Compare with a tight relative tolerance instead of exact
            # equality: both sides are floating-point results, so a
            # last-bit rounding difference must not fail the test.
            assert np.isclose(actual, expect, rtol=1e-9, atol=0.0), (
                "ihs score mismatch (use_threads=%r, include_edges=%r): "
                "expected %r, got %r"
                % (use_threads, include_edges, expect, actual)
            )
def test_ihs():
    """Smoke-test ``ihs`` on random haplotypes.

    For every combination of ``use_threads``, ``min_ehh`` and
    ``include_edges`` the result must be a float64 NumPy array with one
    entry per variant.  Passing a position (or map position) array whose
    length does not match the haplotype array must raise ``ValueError``.
    """
    num_sites = 1000
    num_haps = 20
    haps = np.random.randint(0, 2, size=(num_sites, num_haps)).astype('i1')
    positions = np.arange(0, num_sites * 10, 10)

    # Flattened parameter grid, in the same order as three nested loops.
    option_grid = [
        (threads, ehh_floor, edges)
        for threads in (True, False)
        for ehh_floor in (0, 0.05, 0.5)
        for edges in (True, False)
    ]
    for threads, ehh_floor, edges in option_grid:
        result = ihs(haps, positions, min_ehh=ehh_floor,
                     include_edges=edges, use_threads=threads)
        assert isinstance(result, np.ndarray)
        assert result.shape == (num_sites,)
        assert result.dtype == np.dtype('f8')

    # One element short of the number of variants.
    truncated = positions[1:]

    with pytest.raises(ValueError):
        ihs(haps, truncated)

    with pytest.raises(ValueError):
        ihs(haps, positions, map_pos=truncated)
def ihs(haplotype, pos_vec, window=None):
    """Compute the standardized integrated haplotype score (iHS).

    The raw score comes from ``allel.ihs`` (``min_maf=0.01``,
    ``include_edges=True``) and is standardized with
    ``allel.standardize_by_allele_count`` using the second column of
    ``haplotype.count_alleles()`` as the allele counts.

    Parameters
    ----------
    haplotype
        Haplotype array; must provide ``count_alleles()``.
    pos_vec
        Variant positions passed to ``allel.ihs`` and, when windowing,
        to ``pandas.cut``.
    window : int, optional
        Number of bins to cut ``pos_vec`` into.  When falsy (the default)
        no binning is done.

    Returns
    -------
    The standardized scores as returned by
    ``allel.standardize_by_allele_count`` when ``window`` is falsy;
    otherwise the per-bin mean of those scores, grouped by bin labels
    ``1..window``.
    """
    raw_scores = allel.ihs(haplotype, pos_vec, min_maf=0.01,
                           include_edges=True)
    alt_counts = haplotype.count_alleles().T[1]
    standardized, _bins = allel.standardize_by_allele_count(
        raw_scores, alt_counts, diagnostics=False)

    if not window:
        return standardized

    frame = pd.DataFrame(standardized, columns=["iHS"])
    bin_labels = range(1, window + 1)
    frame["pos_cat"] = pd.cut(pos_vec, window, labels=bin_labels)
    return frame.groupby("pos_cat").iHS.mean()
import seaborn as sns
import pandas as pd

# Contig names; each one selects a per-contig HDF5 callset file below.
chromlist = [
    "Wb_Chr1_0",
    "Wb_Chr1_1",
    "Wb_Chr2_0",
    "Wb_Chr2_1",
    "Wb_Chr2_2",
    "Wb_Chr2_3",
    "Wb_Chr3_0",
    "Wb_Chr3_1",
    "Wb_Chr4_0",
    "Wb_Chr4_1",
    "Wb_Chr4_2",
]

# contig name -> (first element of the standardized iHS result,
#                 first element of the standardized nSL result)
seldict = {}
for c in chromlist:
    # NOTE(review): the file handle is intentionally left open; `g`/`h` wrap
    # datasets from this file and may still be used after the loop.
    callset = h5py.File("PNG.phased.autosomal.recode.{}.h5".format(c), mode='r')
    samples = callset['samples'][:]
    sample_name = [sid.decode() for sid in samples.tolist()]
    g = allel.GenotypeChunkedArray(callset["calldata/GT"])
    h = g.to_haplotypes()
    pos = allel.SortedIndex(callset["variants/POS"][:])
    # Second column of the allele counts, used to standardize both scores.
    acc = h.count_alleles()[:, 1]

    # ihs
    ihs = allel.ihs(h, pos, include_edges=True)
    ihs_std = allel.standardize_by_allele_count(ihs, acc)
    # NOTE(review): np.log10 yields NaN for negative inputs; confirm that
    # -log10 of the standardized score is the intended transform.
    plt.plot(pos, -np.log10(ihs_std[0]))
    # `nan` is a mask of the entries that are NOT NaN.
    nan = ~np.isnan(ihs)
    ihs_real = ihs[nan]
    pos_ihs = pos[nan]

    # nsl
    nsl = allel.nsl(h)
    nsl_std = allel.standardize_by_allele_count(nsl, acc)
    plt.plot(pos, -np.log10(nsl_std[0]))
    # Mask and filter on the nSL scores themselves (previously this reused
    # the iHS array, so nsl_real/pos_nsl just duplicated the iHS values).
    nan = ~np.isnan(nsl)
    nsl_real = nsl[nan]
    pos_nsl = pos[nan]

    seldict[c] = (ihs_std[0], nsl_std[0])

## ehh is site dependent
#ehh = allel.ehh_decay(h)
"Wb_Chr1_0", "Wb_Chr1_1", "Wb_Chr2_0", "Wb_Chr2_1", "Wb_Chr2_2", "Wb_Chr2_3", "Wb_Chr3_0", "Wb_Chr3_1", "Wb_Chr4_0", "Wb_Chr4_1", "Wb_Chr4_2" ] seldict = {} for c in chromlist: callset = h5py.File("PNG.phased.autosomal.recode.{}.h5".format(c), mode='r') samples = callset['samples'][:] sample_name = [sid.decode() for sid in samples.tolist()] g = allel.GenotypeChunkedArray(callset["calldata/GT"]) h = g.to_haplotypes() pos = allel.SortedIndex(callset["variants/POS"][:]) acc = h.count_alleles()[:, 1] # ihs ihs = allel.ihs(h, pos, include_edges=True) ihs_std = allel.standardize_by_allele_count(ihs, acc) plt.plot(pos, -np.log10(ihs_std[0])) nan = ~np.isnan(ihs) ihs_real = ihs[nan] pos_ihs = pos[nan] # nsl nsl = allel.nsl(h) nsl_std = allel.standardize_by_allele_count(nsl, acc) plt.plot(pos, -np.log10(nsl_std[0])) nan = ~np.isnan(ihs) nsl_real = ihs[nan] pos_nsl = pos[nan] seldict[c] = (ihs_std[0], nsl_std[0]) ## ehh is site dependent site dependent #ehh = allel.ehh_decay(h)