Ejemplo n.º 1
0
def test_Fst__windowed(sample_size, n_cohorts, chunks):
    """Check windowed ``Fst`` against tskit (Nei) and scikit-allel (Hudson).

    A tree sequence is simulated, converted to a dataset, split into
    ``n_cohorts`` cohorts and windowed into blocks of 25 variants. The
    "Nei" estimator is compared with ``ts.Fst`` evaluated on matching
    windows, and the "Hudson" estimator with ``allel.moving_hudson_fst``.
    """
    window_size = 25
    # We can get values close to zero, and the default value of atol isn't
    # appropriate for this.
    tolerance = 1e-8

    tree_seq = simulate_ts(sample_size, length=200)
    dataset = ts_to_dataset(tree_seq, chunks)  # type: ignore[no-untyped-call]
    dataset, cohort_samples = add_cohorts(  # type: ignore[no-untyped-call]
        dataset, tree_seq, n_cohorts)
    dataset = window(dataset, size=window_size)
    cohort_pairs = list(itertools.combinations(range(n_cohorts), 2))

    # --- tskit comparison (Nei estimator) ---
    observed_nei = Fst(dataset, estimator="Nei")["stat_Fst"].values

    # Derive window breakpoints from the variant positions so that every
    # tskit window holds a fixed number of variants.
    site_positions = tree_seq.tables.sites.position
    inner_breaks = site_positions[::window_size][1:]
    breakpoints = np.concatenate(
        ([0], inner_breaks, [tree_seq.sequence_length]))
    expected_nei = np.full(
        [len(breakpoints) - 1, n_cohorts, n_cohorts], np.nan)
    for a, b in cohort_pairs:
        expected_nei[:, a, b] = tree_seq.Fst(
            [cohort_samples[a], cohort_samples[b]],
            windows=breakpoints,
            span_normalise=False)
        # Mirror the value so the expected matrix is symmetric.
        expected_nei[:, b, a] = expected_nei[:, a, b]

    np.testing.assert_allclose(observed_nei, expected_nei, atol=tolerance)

    # --- scikit-allel comparison (Hudson estimator) ---
    hudson_ds = Fst(dataset, estimator="Hudson")
    for a, b in cohort_pairs:
        observed_hudson = hudson_ds["stat_Fst"].sel(
            cohorts_0=f"co_{a}", cohorts_1=f"co_{b}").values

        counts_a = hudson_ds.cohort_allele_count.values[:, a, :]
        counts_b = hudson_ds.cohort_allele_count.values[:, b, :]
        expected_hudson = allel.moving_hudson_fst(
            counts_a, counts_b, size=window_size)

        # scikit-allel has final window missing, so drop ours before comparing.
        np.testing.assert_allclose(
            observed_hudson[:-1], expected_hudson, atol=tolerance)
Ejemplo n.º 2
0
        chrom=chrom,
        samples=metadata,
        numbers=numbers,
        ploidy=ploidy,
        qualflt=qualflt,
        missingfltprop=missingprop)

    #### Fst in windows ####
    for sus, res in comparisons:
        name = sus + "_" + res
        cohortText = f"{sus} v {res}"
        print(f"Calculating Fst values in sliding windows for {name}\n")

        for wname, size, step in zip(windownames, windowsizes, windowsteps):
            FstArray = allel.moving_hudson_fst(acsubpops[sus],
                                               acsubpops[res],
                                               size=size,
                                               step=step)
            midpoint = allel.moving_statistic(pos,
                                              np.median,
                                              size=size,
                                              step=step)

            cohortNoSpaceText = name + "." + wname
            rnaseqpop.plotWindowed(
                statName="Fst",
                cohortText=cohortText,
                cohortNoSpaceText=cohortNoSpaceText,
                values=FstArray,
                midpoints=midpoint,
                colour='dodgerblue',
                prefix="results/variantAnalysis/selection/fst",
Ejemplo n.º 3
0
def main(args):
    """Merge two SNP profiles, call SNVs on the sum, and write per-gene Fst.

    Args:
        args: Parsed options. Reads ``args.input`` (the first two entries
            are the profiles to load), ``args.output`` (stored as the merged
            profile's ``filename`` and used as the prefix of the output
            ``.Fst.tsv`` file) and ``args.gene_file`` (passed to
            ``create_gene_index``).

    Exits through ``sys.exit`` when the second profile contains a scaffold
    that is absent from the first one.
    """

    ## Step 0: get null model for SNP calling
    # os.path.join keeps the path relative when dirname(__file__) is empty,
    # instead of producing a path rooted at "/".
    null_loc = os.path.join(os.path.dirname(__file__), 'helper_files',
                            'combined_null1000000.txt')
    null_model = generate_snp_model(null_loc)

    ## Step 1: build new counts table from all objects
    s_final = SNPprofile()
    s_final.filename = args.output

    s1 = SNPprofile()
    print("loading " + args.input[0])
    s1.load(args.input[0])

    s_final.scaffold_list = s1.scaffold_list
    s_final.counts_table = copy.deepcopy(s1.counts_table)

    s2 = SNPprofile()
    print("loading " + args.input[1])
    s2.load(args.input[1])

    # Build the lookup set once so each membership test is O(1).
    known_scaffolds = set(s_final.scaffold_list)
    for scaf in s2.scaffold_list:
        if scaf not in known_scaffolds:
            sys.exit(
                "Error: scaffold " + scaf + " in " + args.input[1] +
                " not found in initial file. Your inStrain objects were probably not run on the same FASTA."
            )

    # NOTE(review): counts are merged by index into counts_table, which
    # assumes both profiles list their scaffolds in the same order; the check
    # above only verifies membership -- confirm against SNPprofile.
    for scaf_idx, scaf_counts in enumerate(s2.counts_table):
        s_final.counts_table[scaf_idx] += scaf_counts

    # Step 2: call all SNPs for new object
    allele_counts1 = {}
    allele_counts2 = {}
    snp_table = defaultdict(list)

    for scaf_idx, scaf_counts in enumerate(
            tqdm(s_final.counts_table, desc='Calling new SNVs...')):
        for pos, counts in enumerate(scaf_counts):  # 0 based positions!!
            snp = call_snv_site(counts,
                                min_cov=5,
                                min_freq=0.05,
                                model=null_model)

            # A falsy result means there was no coverage at this position;
            # -1 means the position is covered but is not a SNP.
            if snp and snp != -1:
                scaf_name = s_final.scaffold_list[scaf_idx]
                # NOTE(review): the first value returned by
                # major_minor_allele is stored as varBase and the second as
                # conBase, exactly as before; confirm this matches the
                # helper's return order.
                first_allele, second_allele = major_minor_allele(counts)

                snp_table['scaffold'].append(scaf_name)
                snp_table['position'].append(pos)
                snp_table['varBase'].append(first_allele)
                snp_table['conBase'].append(second_allele)

                # Per-profile counts at this site, keyed "scaffold:position".
                site_key = scaf_name + ":" + str(pos)
                allele_counts1[site_key] = s1.counts_table[scaf_idx][pos]
                allele_counts2[site_key] = s2.counts_table[scaf_idx][pos]

    # Step 3: build the SNP table and compute Fst per gene.
    # Explicit columns keep the table usable when no SNP was called.
    SNPTable = pd.DataFrame(
        snp_table, columns=['scaffold', 'position', 'varBase', 'conBase'])

    fst_columns = ['gene', 'snp_num', 'fst', 'pi_1', 'pi_2', 'cov_1', 'cov_2']
    FstTable = defaultdict(list)
    for gene in tqdm(create_gene_index(args.gene_file),
                     desc="calculating fst"):
        snps = SNPTable[(SNPTable.scaffold == gene['scaf'])
                        & (SNPTable.position >= gene['start']) &
                        (SNPTable.position <= gene['end'])]
        snp_list = [
            scaf_name + ":" + str(pos)
            for scaf_name, pos in zip(snps['scaffold'], snps['position'])
        ]

        # only continue if there are at least 3 snps in this gene
        if len(snp_list) < 3:
            continue

        allele_counts_1 = [allele_counts1[snp] for snp in snp_list]
        allele_counts_2 = [allele_counts2[snp] for snp in snp_list]

        allel1 = allel.AlleleCountsArray(allele_counts_1)
        allel2 = allel.AlleleCountsArray(allele_counts_2)
        # A single window spanning every SNP of the gene gives one Fst value.
        fst_h = allel.moving_hudson_fst(allel1, allel2,
                                        size=len(snp_list))[0]
        gene_length = 1 + gene['end'] - gene['start']
        nd_1 = np.sum(allel.mean_pairwise_difference(allel1)) / gene_length
        nd_2 = np.sum(allel.mean_pairwise_difference(allel2)) / gene_length

        FstTable['gene'].append(gene['name'])
        FstTable['snp_num'].append(len(snp_list))
        FstTable['fst'].append(fst_h)
        FstTable['pi_1'].append(nd_1)
        FstTable['pi_2'].append(nd_2)
        FstTable['cov_1'].append(np.mean(np.sum(allele_counts_1, axis=1)))
        FstTable['cov_2'].append(np.mean(np.sum(allele_counts_2, axis=1)))

    # Explicit columns so an empty result still has a 'fst' column and a
    # header-only TSV is written instead of raising KeyError.
    FstTable = pd.DataFrame(FstTable, columns=fst_columns)
    print(np.mean(FstTable['fst']) if len(FstTable) else float('nan'))
    FstTable.to_csv(args.output + '.Fst.tsv', index=False, sep='\t')