Example #1
0
def generate_homhet_cohort(num_cases, num_conts, num_snps, ps, ps_stratif, h_sq, het=False,
                           prev=0.01, pi=0.5):
    """Generate a case/control cohort, optionally with heterogeneous cases.

    Effect sizes are produced by ``generate_snp_props`` and the cohort by
    ``generate_cohort``, using the liability threshold ``norm.ppf(1 - prev)``.
    When ``het`` is true the case set is rebuilt as a mixture: the first
    ``int(num_cases * pi)`` simulated cases, followed by rows taken from a
    freshly generated and shuffled set of stratified controls.

    Args:
        num_cases: Number of cases requested from ``generate_cohort``.
        num_conts: Number of controls requested from ``generate_cohort`` (and
            from ``generate_stratified_controls`` when ``het`` is true).
        num_snps: Number of SNPs.
        ps: Allele frequencies handed to ``generate_snp_props``; they are
            needed to generate the betas even for homogeneous cases.
        ps_stratif: Stratified allele frequencies, forwarded to
            ``generate_cohort`` and ``generate_stratified_controls``.
        h_sq: Heritability value forwarded to ``generate_snp_props`` and
            ``generate_cohort``.
        het: If true, replace part of the case set with control-like rows.
        prev: Prevalence used to derive the liability threshold
            (default 0.01, the value previously hard-coded).
        pi: Fraction of the heterogeneous case set kept from the simulated
            cases (default 0.5, the value previously hard-coded).

    Returns:
        Tuple ``(cases, conts)``. ``conts`` is whatever ``generate_cohort``
        returned; ``cases`` is either its case output or, when ``het`` is
        true, the concatenated mixture described above.
    """
    thresh = norm.ppf(1 - prev, loc=0, scale=1)

    ps, betas = generate_snp_props(num_snps, ps, h_sq)
    cases, conts = generate_cohort(num_cases=num_cases,
                                   num_conts=num_conts,
                                   num_snps=num_snps,  # freqs are sampled randomly when generating stratification
                                   betas=betas,
                                   ps=ps,
                                   ps_stratif=ps_stratif,
                                   h_sq=h_sq,
                                   thresh=thresh)
    if het:
        conts_temp = generate_stratified_controls(num_conts, num_snps, ps_stratif)
        np.random.shuffle(conts_temp)
        # Take the control-like share as the remainder so the two parts always
        # add up to num_cases; int(n*pi) + int(n*(1-pi)) loses a row whenever
        # both products are fractional (e.g. odd num_cases with pi=0.5).
        num_true_cases = int(num_cases * pi)
        num_control_like = num_cases - num_true_cases
        # If conts_temp has fewer rows than requested, the slice simply
        # returns the rows that are available.
        cases = np.concatenate((cases[:num_true_cases],
                                conts_temp[:num_control_like]), axis=0)
    return cases, conts
Example #2
0
def run_liability():
    """Plot cached subtype-liability results, or simulate and cache them.

    If ``subtype_frac_liab.p`` exists, its contents
    ``(h_sq_frac_range, results, hetsc_exp)`` are loaded and handed to
    ``plot_results``. Otherwise the simulation is run: for every fraction in
    ``h_sq_frac_range`` and every trial, half of the cases are generated with
    the first set of betas at heritability ``h_sq`` and the other half with a
    second set of betas at heritability ``h_sq * h_sq_frac``; the
    heterogeneity score of the combined cases against the controls is
    recorded. The results are pickled to the same file (no plot is produced
    in that run).
    """
    FILE_PATH = "subtype_frac_liab.p"
    if os.path.exists(FILE_PATH):
        # The cache is a pickle written by an earlier run of this function;
        # do not point FILE_PATH at files from untrusted sources.
        with open(FILE_PATH, "rb") as cache_file:
            h_sq_frac_range, results, hetsc_exp = pickle.load(cache_file)
        plot_results(h_sq_frac_range,
                     results,
                     hetsc_exp,
                     outname="subtype_scale_liability.eps",
                     xaxis_label='Fraction of subtype variance explained')
        return

    num_cases = 30000
    num_conts = 30000
    num_snps = 100
    fixed_ps = np.array([0.2] * num_snps)
    num_trials = 20
    h_sq = 0.05
    h_sq_frac_range = [0, 0.2, 0.4, 0.6, 0.8, 1.0]

    sub_betas_list, _ = generate_snps_splits(num_sub_phenos=2,
                                             frac_shared_effects=0,
                                             num_snps=num_snps,
                                             ps=fixed_ps,
                                             h_sq=h_sq)

    # subtype 1 properties
    _, betas = generate_snp_props(num_snps, fixed_ps, h_sq)
    prev = 0.01
    thresh = norm.ppf(1 - prev, loc=0, scale=1)

    # expected score if all cases followed subtype 1
    hetsc_exp = heterogeneity_expected_corr(ncases=num_cases,
                                            nconts=num_conts,  # was num_cases; same value here
                                            effects=betas,
                                            thresh=thresh,
                                            freqs=fixed_ps,
                                            heritability=h_sq,
                                            verbose=False)
    print(hetsc_exp)

    half_cases = num_cases // 2
    results = []
    for h_sq_frac in h_sq_frac_range:
        scores = []
        # generate subtype 2 properties at the scaled heritability
        h_sq_sub2 = h_sq * h_sq_frac
        sub_betas_list2, _ = generate_snps_splits(num_sub_phenos=2,
                                                  frac_shared_effects=0,
                                                  num_snps=num_snps,
                                                  ps=fixed_ps,
                                                  h_sq=h_sq_sub2)
        for nt in range(num_trials):
            print("h_sq_frac: %s, trial: %s" % (h_sq_frac, nt))
            cases_sub1, conts = generate_cohort(
                num_cases=half_cases,
                num_conts=num_conts,
                freqs=fixed_ps,
                betas=sub_betas_list[0],
                h_sq=h_sq,
                thresh=thresh)
            # Only the cases of the second cohort are used; its controls are
            # discarded.
            cases_sub2, _ = generate_cohort(
                num_cases=half_cases,
                num_conts=num_conts,
                freqs=fixed_ps,
                betas=sub_betas_list2[1],
                h_sq=h_sq_sub2,
                thresh=thresh)
            cases = np.concatenate((cases_sub1, cases_sub2), axis=0)
            scores.append(heterogeneity(cases, conts))
        results.append(scores)

    with open(FILE_PATH, "wb") as cache_file:
        pickle.dump((h_sq_frac_range, results, hetsc_exp), cache_file)
Example #3
0
def run_heritability():
    """Plot cached heterogeneity-vs-heritability results, or simulate them.

    If ``simulate_results_varexp.p`` exists, the per-trial scores stored in
    it are summarised (mean and standard deviation per heritability value)
    and drawn as error bars: homogeneous cases in red, heterogeneous cases in
    green, controls-vs-controls in black, plus the expected score as a blue
    line. The figure is saved to ``simulate_basic_heritability.eps`` and
    shown.

    Otherwise the simulation is run for every value in ``h_sqs`` with
    ``num_trials`` trials each, and the raw per-trial scores are pickled to
    that file (no plot is produced in that run).
    """
    FILE_PATH = "simulate_results_varexp.p"
    fig, ax = plt.subplots(1, 1)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)

    if os.path.exists(FILE_PATH):
        # The cache is a pickle written by an earlier run of this function;
        # do not point FILE_PATH at files from untrusted sources.
        with open(FILE_PATH, "rb") as cache_file:
            pckl = pickle.load(cache_file)

        hetsc_means_hom = [np.mean(x) for x in pckl["hetscs_homs"]]
        hetsc_stds_hom = [np.std(x) for x in pckl["hetscs_homs"]]
        hetsc_means_het = [np.mean(x) for x in pckl["hetscs_hets"]]
        hetsc_stds_het = [np.std(x) for x in pckl["hetscs_hets"]]
        hetsc_means_cont = [np.mean(x) for x in pckl["hetscs_conts"]]
        hetsc_stds_cont = [np.std(x) for x in pckl["hetscs_conts"]]
        hetsc_exps = pckl["hetsc_exps"]
        h_sqs = pckl["h_sqs"]
        plt.errorbar(h_sqs, hetsc_means_hom, yerr=hetsc_stds_hom, color='red', capsize=5)
        plt.errorbar(h_sqs, hetsc_means_het, yerr=hetsc_stds_het, color='green', capsize=5)
        plt.errorbar(h_sqs, hetsc_means_cont, yerr=hetsc_stds_cont, color='black', capsize=5)
        plt.plot(h_sqs, hetsc_exps, color='blue')
        plt.xlabel("Variance explained by modeled SNPs", fontsize=14)
        plt.ylabel("Heterogeneity Score", fontsize=14)

        # label CLiP score: a double-headed arrow between the expected curve
        # and the heterogeneous-case mean, both interpolated at xloc
        xloc = 0.8 * (h_sqs[-1] - h_sqs[0])
        yloc1 = np.interp(xloc, h_sqs, hetsc_exps)
        yloc2 = np.interp(xloc, h_sqs, hetsc_means_het)
        # The annotation text is passed positionally: the keyword was renamed
        # from `s` to `text` in Matplotlib 3.3, so `s=` fails on newer
        # releases while the positional form works on all of them.
        ax.annotate('', xy=(xloc, yloc1), xycoords='data',
                    xytext=(xloc, yloc2), textcoords='data',
                    arrowprops=dict(arrowstyle="<->", color='gray'))
        ax.annotate('CLiP Score', xy=(xloc * 1.02, yloc1 + (yloc2 - yloc1) * 0.75),
                    xycoords='data', fontsize=14.0, textcoords='data',
                    ha='left', color='gray')

        plt.xticks(fontsize=11)
        plt.yticks(fontsize=11)

        plt.savefig("simulate_basic_heritability.eps", format="eps", dpi=1000)
        plt.show()

    else:
        hetscs_homs = []
        hetscs_hets = []
        hetscs_conts = []
        hetsc_exps = []
        h_sqs = [0.001, 0.01, 0.02, 0.03, 0.05, 0.075, 0.1]
        num_trials = 20

        # Settings that do not depend on the heritability value.
        num_cases = 30000
        num_conts = 30000
        num_snps = 100
        fixed_ps = np.array([0.2] * num_snps)
        prev = 0.01
        thresh = norm.ppf(1 - prev, loc=0, scale=1)

        for h_sq in h_sqs:
            hetscs_hom = []
            hetscs_het = []
            hetscs_cont = []

            # expected score, homogeneous cases
            ps, betas = generate_snp_props(num_snps, fixed_ps, h_sq)
            hetsc_exp = heterogeneity_expected_corr(ncases=num_cases,
                                                    nconts=num_conts,
                                                    effects=betas,
                                                    thresh=thresh,
                                                    freqs=ps,
                                                    heritability=h_sq, verbose=False)
            hetsc_exps.append(hetsc_exp)

            for i in range(num_trials):
                print("h_sq: %s, trial: %s" % (h_sq, i))

                # homogeneous cases, controls
                cases, conts = generate_homhet_cohort(num_cases, num_conts, num_snps, ps, h_sq)
                hetscs_hom.append(heterogeneity(cases, conts))

                # heterogeneous cases, controls
                cases, conts = generate_homhet_cohort(num_cases, num_conts, num_snps, ps, h_sq, het=True)
                hetscs_het.append(heterogeneity(cases, conts))

                # controls, controls
                cases, conts = generate_controls(num_cases, num_conts, num_snps, ps, h_sq)
                hetscs_cont.append(heterogeneity(cases, conts))

            hetscs_homs.append(hetscs_hom)
            hetscs_hets.append(hetscs_het)
            hetscs_conts.append(hetscs_cont)

        # Raw per-trial scores are stored; means/stds are computed at plot time.
        with open(FILE_PATH, "wb") as cache_file:
            pickle.dump({"hetscs_homs": hetscs_homs,
                         "hetscs_hets": hetscs_hets,
                         "hetscs_conts": hetscs_conts,
                         "hetsc_exps": hetsc_exps,
                         "h_sqs": h_sqs}, cache_file)
        """
Example #4
0
def run_sample_size():
    """Plot cached heterogeneity-vs-sample-size results, or simulate them.

    If ``simulate_results_samplesize.p`` exists, the per-trial scores stored
    in it are summarised (mean and standard deviation per sample size) and
    drawn as error bars: homogeneous cases in red, heterogeneous cases in
    green, controls-vs-controls in black, plus the expected score as a blue
    line. The homogeneous means and standard deviations are also printed.
    The figure is saved to ``simulate_basic_num_inds.eps`` and shown.

    Otherwise the simulation is run for every value in ``sample_sizes``
    (used as both the number of cases and of controls) with ``num_trials``
    trials each, and the raw per-trial scores are pickled to that file (no
    plot is produced in that run).
    """
    FILE_PATH = "simulate_results_samplesize.p"
    fig, ax = plt.subplots(1, 1)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)

    if os.path.exists(FILE_PATH):
        # The cache is a pickle written by an earlier run of this function;
        # do not point FILE_PATH at files from untrusted sources.
        with open(FILE_PATH, "rb") as cache_file:
            pckl = pickle.load(cache_file)

        hetsc_means_hom = [np.mean(x) for x in pckl["hetscs_homs"]]
        hetsc_stds_hom = [np.std(x) for x in pckl["hetscs_homs"]]
        hetsc_means_het = [np.mean(x) for x in pckl["hetscs_hets"]]
        hetsc_stds_het = [np.std(x) for x in pckl["hetscs_hets"]]
        hetsc_means_cont = [np.mean(x) for x in pckl["hetscs_conts"]]
        hetsc_stds_cont = [np.std(x) for x in pckl["hetscs_conts"]]
        hetsc_exps = pckl["hetsc_exps"]
        sample_size = pckl["sample_size"]
        plt.errorbar(sample_size, hetsc_means_hom, yerr=hetsc_stds_hom, color='red', capsize=5)
        plt.errorbar(sample_size, hetsc_means_het, yerr=hetsc_stds_het, color='green', capsize=5)
        plt.errorbar(sample_size, hetsc_means_cont, yerr=hetsc_stds_cont, color='black', capsize=5)

        # print the homogeneous-case values behind the plot
        for size, mean_hom, std_hom in zip(sample_size, hetsc_means_hom, hetsc_stds_hom):
            print("sample size: %s, score: %s, std dev: %s" % (size, mean_hom, std_hom))

        plt.plot(sample_size, hetsc_exps, color='blue')
        plt.xlabel("Number of simulated cases and controls", fontsize=14)
        plt.ylabel("Heterogeneity Score", fontsize=14)

        # label CLiP score: a double-headed arrow between the expected curve
        # and the heterogeneous-case mean, both interpolated at xloc
        xloc = 0.8 * (sample_size[-1] - sample_size[0])
        yloc1 = np.interp(xloc, sample_size, hetsc_exps)
        yloc2 = np.interp(xloc, sample_size, hetsc_means_het)
        # The annotation text is passed positionally: the keyword was renamed
        # from `s` to `text` in Matplotlib 3.3, so `s=` fails on newer
        # releases while the positional form works on all of them.
        ax.annotate('', xy=(xloc, yloc1), xycoords='data',
                    xytext=(xloc, yloc2), textcoords='data',
                    arrowprops=dict(arrowstyle="<->", color='gray'))
        ax.annotate('CLiP Score', xy=(xloc * 1.02, yloc1 + (yloc2 - yloc1) * 0.75),
                    xycoords='data', fontsize=14.0, textcoords='data',
                    ha='left', color='gray')

        plt.xticks(fontsize=11)
        plt.yticks(fontsize=11)

        plt.savefig("simulate_basic_num_inds.eps", format="eps", dpi=1000)
        plt.show()

    else:
        hetscs_homs = []
        hetscs_hets = []
        hetscs_conts = []
        hetsc_exps = []
        sample_sizes = [1000, 5000, 10000, 20000, 30000, 50000]
        num_trials = 20

        # Settings that do not depend on the sample size.
        num_snps = 100
        h_sq = 0.034
        fixed_ps = np.array([0.2] * num_snps)
        prev = 0.01
        thresh = norm.ppf(1 - prev, loc=0, scale=1)

        for sample_size in sample_sizes:
            num_cases = sample_size
            num_conts = sample_size
            hetscs_hom = []
            hetscs_het = []
            hetscs_cont = []

            # expected score, homogeneous cases
            ps, betas = generate_snp_props(num_snps, fixed_ps, h_sq)
            hetsc_exp = heterogeneity_expected_corr(ncases=num_cases,
                                                    nconts=num_conts,
                                                    effects=betas,
                                                    thresh=thresh,
                                                    freqs=ps,
                                                    heritability=h_sq, verbose=False)
            hetsc_exps.append(hetsc_exp)

            for i in range(num_trials):
                print("sample_size: %s, trial: %s" % (sample_size, i))

                # homogeneous cases, controls
                cases, conts = generate_homhet_cohort(num_cases, num_conts, num_snps, fixed_ps, h_sq)
                hetscs_hom.append(heterogeneity(cases, conts))

                # heterogeneous cases, controls
                cases, conts = generate_homhet_cohort(num_cases, num_conts, num_snps, fixed_ps, h_sq, het=True)
                hetscs_het.append(heterogeneity(cases, conts))

                # controls, controls
                cases, conts = generate_controls(num_cases, num_conts, num_snps, fixed_ps, h_sq)
                hetscs_cont.append(heterogeneity(cases, conts))

            hetscs_homs.append(hetscs_hom)
            hetscs_hets.append(hetscs_het)
            hetscs_conts.append(hetscs_cont)

        # Raw per-trial scores are stored; means/stds are computed at plot time.
        with open(FILE_PATH, "wb") as cache_file:
            pickle.dump({"hetscs_homs": hetscs_homs,
                         "hetscs_hets": hetscs_hets,
                         "hetscs_conts": hetscs_conts,
                         "hetsc_exps": hetsc_exps,
                         "sample_size": sample_sizes}, cache_file)
        """
Example #5
0
def run_heritability():
    """Simulate heterogeneity scores under population stratification, then plot.

    For each value in ``fsts`` an allele-frequency offset
    ``0.5 * sqrt(fst)`` is derived and two stratified populations are built
    around a base frequency of 0.5: one with the first half of the SNPs
    shifted down and the second half shifted up, the other mirrored. If the
    output file ``population_stratif_out_<fst>.p`` does not exist yet, the
    simulation is run over ``h_sqs`` with ``num_trials`` trials each and the
    per-trial scores are pickled to it. Existing files are left untouched.
    Finally ``plot_results(fsts)`` is called.
    """
    fsts = np.array([0.001, 0.005, 0.01, 0.05, 0.1])
    pop_allele_diffs = 0.5 * np.sqrt(fsts)

    for fst, allele_diff in zip(fsts, pop_allele_diffs):
        FILE_PATH = "population_stratif_out_%s.p" % (fst)
        if os.path.exists(FILE_PATH):
            # Results for this Fst were already simulated.
            continue

        num_snps = 100
        half_snps = num_snps // 2
        fixed_ps_val = 0.5
        fixed_ps = np.array([fixed_ps_val] * num_snps)
        # generate stratified populations
        low_freq = fixed_ps_val - allele_diff
        high_freq = fixed_ps_val + allele_diff
        ps_stratif = [[low_freq] * half_snps + [high_freq] * half_snps,
                      [high_freq] * half_snps + [low_freq] * half_snps]

        hetscs_homs = []
        hetscs_hets = []
        hetscs_conts = []
        hetsc_exps = []
        # NOTE(review): the original comment described this as the allele
        # difference between homogeneous cases and controls, but the value is
        # computed from the heterogeneous cohort below -- confirm which one
        # is intended.
        mean_allele_diffs = []
        h_sqs = [0.001, 0.01, 0.02, 0.03, 0.05, 0.075, 0.1]
        num_trials = 30

        # Settings that do not depend on the heritability value.
        num_cases = 30000
        num_conts = 30000
        prev = 0.01
        thresh = norm.ppf(1 - prev, loc=0, scale=1)

        for h_sq in h_sqs:
            hetscs_hom = []
            hetscs_het = []
            hetscs_cont = []
            mean_allele_diff = []

            # expected score, homogeneous cases
            ps, betas = generate_snp_props(num_snps, fixed_ps, h_sq)
            hetsc_exp = heterogeneity_expected_corr(ncases=num_cases,
                                                    nconts=num_conts,
                                                    effects=betas,
                                                    thresh=thresh,
                                                    freqs=ps,
                                                    heritability=h_sq, verbose=False)
            hetsc_exps.append(hetsc_exp)

            for i in range(num_trials):
                print("allele_diff: %s, h_sq: %s, trial: %s" % (allele_diff, h_sq, i))

                # homogeneous cases, controls
                cases, conts = generate_homhet_cohort(num_cases, num_conts, num_snps, ps, ps_stratif, h_sq)
                hetscs_hom.append(heterogeneity(cases, conts))

                # heterogeneous cases, controls
                cases, conts = generate_homhet_cohort(num_cases, num_conts, num_snps, ps, ps_stratif, h_sq, het=True)
                hetscs_het.append(heterogeneity(cases, conts))
                mean_allele_diff.append(np.mean(cases) - np.mean(conts))

                # controls, controls
                cases, conts = generate_controls(num_cases, num_conts, num_snps, ps_stratif)
                hetscs_cont.append(heterogeneity(cases, conts))

            hetscs_homs.append(hetscs_hom)
            hetscs_hets.append(hetscs_het)
            hetscs_conts.append(hetscs_cont)
            mean_allele_diffs.append(mean_allele_diff)

        with open(FILE_PATH, "wb") as out_file:
            pickle.dump({"hetscs_homs": hetscs_homs,
                         "hetscs_hets": hetscs_hets,
                         "hetscs_conts": hetscs_conts,
                         "hetsc_exps": hetsc_exps,
                         "mean_allele_diffs": mean_allele_diffs,
                         "h_sqs": h_sqs}, out_file)
    plot_results(fsts)