Example #1
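All of the snippets below rely on the same third-party packages and the standard library; the project-specific helpers (generate_cohort, generate_snp_props, heterogeneity, heterogeneity_expected_corr, and related functions) are assumed to come from the surrounding CLiP simulation module, which is not shown here. A minimal import header under that assumption:

# Shared imports assumed by the examples below (a sketch; the simulation
# helpers themselves are defined elsewhere in the project).
import os
import pickle

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
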
def test_n_splits(num_sub_phenos,
                  frac_shared_effects,
                  num_snps=100,
                  num_cases=5000,
                  num_conts=5000):
    num_snps_shared = int(round(num_snps * frac_shared_effects))
    # ensure equal number of non-shared SNPs across sub-phenotypes
    assert (num_snps - num_snps_shared) % num_sub_phenos == 0
    num_snps_exclsv = (num_snps - num_snps_shared) // num_sub_phenos
    h_sq = 0.034
    ps = np.array([0.2] * num_snps)

    cases = np.zeros((0, num_snps))
    conts = np.zeros((0, num_snps))
    for i in range(num_sub_phenos):
        num_sub_cases = int(num_cases / num_sub_phenos)
        num_sub_conts = num_sub_cases
        # SNP j has a nonzero effect on sub-phenotype i if it is shared or
        # falls in sub-phenotype i's exclusive block of SNPs
        lo = num_snps_shared + i * num_snps_exclsv
        hi = lo + num_snps_exclsv
        sub_betas = np.array([int(j < num_snps_shared or lo <= j < hi)
                              for j in range(num_snps)])
        print(num_snps_shared)
        print(num_snps_exclsv)
        print(sub_betas)

        # set variance explained over the subset
        beta_val = np.sqrt(
            h_sq / np.sum(np.multiply(2 * np.multiply(ps, 1 - ps), sub_betas)))
        sub_betas = beta_val * sub_betas
        print(i, num_sub_cases)
        print(sub_betas)
        print("subset variance explained:",
              np.dot(np.square(sub_betas), 2 * np.multiply(ps, 1 - ps)))

        # generate sub-cohort
        prev = 0.01
        thresh = norm.ppf(1 - prev, loc=0, scale=1)
        sub_cases, sub_conts = generate_cohort(num_cases=num_sub_cases,
                                               num_conts=num_sub_conts,
                                               freqs=ps,
                                               betas=sub_betas,
                                               h_sq=h_sq,
                                               thresh=thresh)
        cases = np.concatenate((cases, sub_cases), axis=0)
        conts = np.concatenate((conts, sub_conts), axis=0)
    score = heterogeneity(cases, conts)
    return score
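
A hypothetical call sketch for test_n_splits, assuming the simulation helpers above are in scope; the argument values are illustrative and must leave the non-shared SNPs evenly divisible across sub-phenotypes:

# Illustrative only: 2 sub-phenotypes, half of the 100 SNPs shared.
score = test_n_splits(num_sub_phenos=2, frac_shared_effects=0.5)
print("heterogeneity score:", score)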
Example #2
def run_liability():
    FILE_PATH = "subtype_frac_liab.p"
    if os.path.exists(FILE_PATH):
        with open(FILE_PATH, "rb") as f:
            h_sq_frac_range, results, hetsc_exp = pickle.load(f)
        plot_results(h_sq_frac_range,
                     results,
                     hetsc_exp,
                     outname="subtype_scale_liability.eps",
                     xaxis_label='Fraction of subtype variance explained')
    else:
        num_cases = 30000
        num_conts = 30000
        num_snps = 100
        fixed_ps = np.array([0.2] * num_snps)
        # num_trials = 3
        num_trials = 20
        h_sq = 0.05
        h_sq_frac_range = [0, 0.2, 0.4, 0.6, 0.8, 1.0]

        sub_betas_list, _ = generate_snps_splits(num_sub_phenos=2,
                                                 frac_shared_effects=0,
                                                 num_snps=num_snps,
                                                 ps=fixed_ps,
                                                 h_sq=h_sq)

        # subtype 1 properties
        ps, betas = generate_snp_props(num_snps, fixed_ps, h_sq)
        prev = 0.01
        thresh = norm.ppf(1 - prev, loc=0, scale=1)

        # expected score if all cases followed subtype 1
        hetsc_exp = heterogeneity_expected_corr(ncases=num_cases,
                                                nconts=num_conts,
                                                effects=betas,
                                                thresh=thresh,
                                                freqs=fixed_ps,
                                                heritability=h_sq,
                                                verbose=False)
        print(hetsc_exp)

        results = []
        for h_sq_frac in h_sq_frac_range:
            scores = []
            # generate subtype 2 properties
            # _, betas_sub2 = generate_snp_props(num_snps, fixed_ps, h_sq*h_sq_frac)
            sub_betas_list2, _ = generate_snps_splits(num_sub_phenos=2,
                                                      frac_shared_effects=0,
                                                      num_snps=num_snps,
                                                      ps=fixed_ps,
                                                      h_sq=h_sq * h_sq_frac)
            for nt in range(num_trials):
                print("h_sq_frac: %s, trial: %s" % (h_sq_frac, nt))
                cases_sub1, conts = generate_cohort(
                    num_cases=int(num_cases / 2),
                    num_conts=num_conts,
                    freqs=fixed_ps,  # freqs=ps,
                    betas=sub_betas_list[0],  # betas=betas,
                    h_sq=h_sq,
                    thresh=thresh)
                cases_sub2, _ = generate_cohort(
                    num_cases=int(num_cases / 2),
                    num_conts=num_conts,
                    freqs=fixed_ps,  # freqs=ps,
                    betas=sub_betas_list2[1],  # betas=betas_sub2,
                    h_sq=h_sq * h_sq_frac,
                    thresh=thresh)
                cases = np.concatenate((cases_sub1, cases_sub2), axis=0)
                score = heterogeneity(cases, conts)
                scores.append(score)
            results.append(scores)

        with open(FILE_PATH, "wb") as f:
            pickle.dump((h_sq_frac_range, results, hetsc_exp), f)
Example #3
def run_logistic():
    FILE_PATH = "subtype_frac_logit.p"
    if os.path.exists(FILE_PATH):
        with open(FILE_PATH, "rb") as f:
            OR_val_frac_range, results, hetsc_exp = pickle.load(f)
        plot_results(OR_val_frac_range,
                     results,
                     hetsc_exp,
                     outname="subtype_scale_logit.eps",
                     xaxis_label='Fraction of subtype odds ratio magnitude')
    else:
        num_cases = 30000
        num_conts = 30000
        num_snps = 100
        fixed_ps = np.array([0.2] * num_snps)
        # num_trials = 3
        num_trials = 20
        OR_val = 1.1
        OR_val_frac_range = [0, 0.2, 0.4, 0.6, 0.8, 1.0]
        prev = 0.01

        # subtype 1 properties
        # ORs = np.array([OR_val]*num_snps)
        ORs = np.array([OR_val] * int(num_snps / 2) +
                       [1.0] * int(num_snps / 2))

        # expected score if all cases followed subtype 1
        hetsc_exp = heterogeneity_expected_corr_logit(ncases=num_cases,
                                                      nconts=num_conts,
                                                      ORs=ORs,
                                                      freqs=fixed_ps,
                                                      prev=prev,
                                                      verbose=False)
        print(hetsc_exp)

        results = []
        for OR_val_frac in OR_val_frac_range:
            scores = []
            # generate subtype 2 properties
            ORs_frac_val = 1.0 + (OR_val - 1) * OR_val_frac
            # ORs_frac = np.array([ORs_frac_val]*num_snps)
            ORs2 = np.array([1.0] * int(num_snps / 2) +
                            [ORs_frac_val] * int(num_snps / 2))
            for nt in range(num_trials):
                print("OR_val_frac: %s, trial: %s" % (OR_val_frac, nt))
                cases_sub1, conts = generate_cohort_logistic(
                    num_cases=int(num_cases / 2),
                    num_conts=num_conts,
                    freqs=fixed_ps,
                    ORs=ORs,
                    prev=prev)
                cases_sub2, _ = generate_cohort_logistic(
                    num_cases=int(num_cases / 2),
                    num_conts=num_conts,
                    freqs=fixed_ps,
                    ORs=ORs2,
                    prev=prev)
                cases = np.concatenate((cases_sub1, cases_sub2), axis=0)
                score = heterogeneity(cases, conts)
                scores.append(score)
            results.append(scores)

        with open(FILE_PATH, "wb") as f:
            pickle.dump((OR_val_frac_range, results, hetsc_exp), f)
Example #4
def run_heritability():
    FILE_PATH = "simulate_results_varexp.p"
    fig, ax = plt.subplots(1,1)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)

    if os.path.exists(FILE_PATH):
        with open(FILE_PATH, "rb") as f:
            pckl = pickle.load(f)
        # hetsc_means_hom = pckl["hetsc_means_hom"]
        # hetsc_stds_hom = pckl["hetsc_stds_hom"]
        # hetsc_means_het = pckl["hetsc_means_het"]
        # hetsc_stds_het = pckl["hetsc_stds_het"]
        # hetsc_means_cont = pckl["hetsc_means_cont"]
        # hetsc_stds_cont = pckl["hetsc_stds_cont"]
        hetsc_means_hom = [np.mean(x) for x in pckl["hetscs_homs"]]
        hetsc_stds_hom = [np.std(x) for x in pckl["hetscs_homs"]]
        hetsc_means_het = [np.mean(x) for x in pckl["hetscs_hets"]]
        hetsc_stds_het = [np.std(x) for x in pckl["hetscs_hets"]]
        hetsc_means_cont = [np.mean(x) for x in pckl["hetscs_conts"]]
        hetsc_stds_cont = [np.std(x) for x in pckl["hetscs_conts"]]
        hetsc_exps = pckl["hetsc_exps"]
        h_sqs = pckl["h_sqs"]
        plt.errorbar(h_sqs, hetsc_means_hom, yerr=hetsc_stds_hom, color='red', capsize=5)
        plt.errorbar(h_sqs, hetsc_means_het, yerr=hetsc_stds_het, color='green', capsize=5)
        plt.errorbar(h_sqs, hetsc_means_cont, yerr=hetsc_stds_cont, color='black', capsize=5)
        plt.plot(h_sqs, hetsc_exps, color='blue')
        plt.xlabel("Variance explained by modeled SNPs", fontsize=14)
        plt.ylabel("Heterogeneity Score", fontsize=14)

        # label CLiP score
        xloc = 0.8 * (h_sqs[-1] - h_sqs[0])
        yloc1 = np.interp(xloc, h_sqs, hetsc_exps)
        yloc2 = np.interp(xloc, h_sqs, hetsc_means_het)
        ax.annotate('', xy=(xloc, yloc1), xycoords='data',
                    xytext=(xloc, yloc2), textcoords='data',
                    arrowprops=dict(arrowstyle="<->", color='gray'))
        ax.annotate('CLiP Score',
                    xy=(xloc * 1.02, yloc1 + (yloc2 - yloc1) * 0.75),
                    xycoords='data', fontsize=14.0,
                    ha='left', color='gray')

        plt.xticks(fontsize=11)
        plt.yticks(fontsize=11)

        plt.savefig("simulate_basic_heritability.eps", format="eps", dpi=1000)
        plt.show()

    else:
        # hetsc_means_hom = []
        # hetsc_stds_hom = []
        # hetsc_means_het = []
        # hetsc_stds_het = []
        # hetsc_means_cont = []
        # hetsc_stds_cont = []
        hetscs_homs = []
        hetscs_hets = []
        hetscs_conts = []
        hetsc_exps = []
        h_sqs = [0.001, 0.01, 0.02, 0.03, 0.05, 0.075, 0.1]
        num_trials = 20

        for h_sq in h_sqs:
            num_cases = 30000
            num_conts = 30000
            num_snps = 100
            fixed_ps = np.array([0.2]*num_snps)
            hetscs_hom = []
            hetscs_het = []
            hetscs_cont = []

            # expected score, homogeneous cases
            ps, betas = generate_snp_props(num_snps, fixed_ps, h_sq)
            prev = 0.01
            thresh = norm.ppf(1-prev, loc=0, scale=1)
            hetsc_exp = heterogeneity_expected_corr(
                ncases=num_cases,
                nconts=num_conts,
                effects=betas,
                thresh=thresh,
                freqs=ps,
                heritability=h_sq,
                verbose=False)
            hetsc_exps.append(hetsc_exp)

            for i in range(num_trials):
                print("h_sq: %s, trial: %s" % (h_sq, i))

                # homogeneous cases, controls
                cases,conts = generate_homhet_cohort(num_cases, num_conts, num_snps, ps, h_sq)
                score = heterogeneity(cases,conts)
                hetscs_hom.append(score)

                # heterogeneous cases, controls
                cases,conts = generate_homhet_cohort(num_cases, num_conts, num_snps, ps, h_sq, het=True)
                score = heterogeneity(cases,conts)
                hetscs_het.append(score)

                # controls, controls
                cases,conts = generate_controls(num_cases, num_conts, num_snps, ps, h_sq)
                score = heterogeneity(cases,conts)
                hetscs_cont.append(score)


            # hetsc_means_hom.append(np.mean(hetscs_hom))
            # hetsc_stds_hom.append(np.std(hetscs_hom))
            # hetsc_means_het.append(np.mean(hetscs_het))
            # hetsc_stds_het.append(np.std(hetscs_het))
            # hetsc_means_cont.append(np.mean(hetscs_cont))
            # hetsc_stds_cont.append(np.std(hetscs_cont))
            hetscs_homs.append(hetscs_hom)
            hetscs_hets.append(hetscs_het)
            hetscs_conts.append(hetscs_cont)
        pickle.dump({"hetscs_homs":hetscs_homs,
                     "hetscs_hets":hetscs_hets,
                     "hetscs_conts":hetscs_conts,
                     "hetsc_exps":hetsc_exps,
                     "h_sqs":h_sqs}, open(FILE_PATH, "wb"))
        """
Example #5
def run_sample_size():
    FILE_PATH = "simulate_results_samplesize.p"
    fig, ax = plt.subplots(1,1)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)

    if os.path.exists(FILE_PATH):
        with open(FILE_PATH, "rb") as f:
            pckl = pickle.load(f)
        # hetsc_means_hom = pckl["hetsc_means_hom"]
        # hetsc_stds_hom = pckl["hetsc_stds_hom"]
        # hetsc_means_het = pckl["hetsc_means_het"]
        # hetsc_stds_het = pckl["hetsc_stds_het"]
        # hetsc_means_cont = pckl["hetsc_means_cont"]
        # hetsc_stds_cont = pckl["hetsc_stds_cont"]

        hetsc_means_hom = [np.mean(x) for x in pckl["hetscs_homs"]]
        hetsc_stds_hom = [np.std(x) for x in pckl["hetscs_homs"]]
        hetsc_means_het = [np.mean(x) for x in pckl["hetscs_hets"]]
        hetsc_stds_het = [np.std(x) for x in pckl["hetscs_hets"]]
        hetsc_means_cont = [np.mean(x) for x in pckl["hetscs_conts"]]
        hetsc_stds_cont = [np.std(x) for x in pckl["hetscs_conts"]]
        hetsc_exps = pckl["hetsc_exps"]
        sample_size = pckl["sample_size"]
        plt.errorbar(sample_size, hetsc_means_hom, yerr=hetsc_stds_hom, color='red', capsize=5)
        plt.errorbar(sample_size, hetsc_means_het, yerr=hetsc_stds_het, color='green', capsize=5)
        plt.errorbar(sample_size, hetsc_means_cont, yerr=hetsc_stds_cont, color='black', capsize=5)

        # plot values
        for i in range(len(sample_size)):
            print("sample size: %s, score: %s, std dev: %s" % (sample_size[i], hetsc_means_hom[i], hetsc_stds_hom[i]))

        plt.plot(sample_size, hetsc_exps, color='blue')
        plt.xlabel("Number of simulated cases and controls", fontsize=14)
        plt.ylabel("Heterogeneity Score", fontsize=14)

        # label CLiP score
        xloc = 0.8 * (sample_size[-1] - sample_size[0])
        yloc1 = np.interp(xloc, sample_size, hetsc_exps)
        yloc2 = np.interp(xloc, sample_size, hetsc_means_het)
        ax.annotate('', xy=(xloc, yloc1), xycoords='data',
                    xytext=(xloc, yloc2), textcoords='data',
                    arrowprops=dict(arrowstyle="<->", color='gray'))
        ax.annotate('CLiP Score',
                    xy=(xloc * 1.02, yloc1 + (yloc2 - yloc1) * 0.75),
                    xycoords='data', fontsize=14.0,
                    ha='left', color='gray')

        plt.xticks(fontsize=11)
        plt.yticks(fontsize=11)

        plt.savefig("simulate_basic_num_inds.eps", format="eps", dpi=1000)
        plt.show()

    else:
        # hetsc_means_hom = []
        # hetsc_stds_hom = []
        # hetsc_means_het = []
        # hetsc_stds_het = []
        # hetsc_means_cont = []
        # hetsc_stds_cont = []
        hetscs_homs = []
        hetscs_hets = []
        hetscs_conts = []
        hetsc_exps = []
        sample_sizes = [1000, 5000, 10000, 20000, 30000, 50000]
        # sample_sizes = [5000, 10000]

        # sample_sizes = [1000, 5000, 10000]
        num_trials = 20
        for sample_size in sample_sizes:
            num_cases = sample_size
            num_conts = sample_size
            num_snps = 100
            h_sq = 0.034
            fixed_ps = np.array([0.2]*num_snps)
            hetscs_hom = []
            hetscs_het = []
            hetscs_cont = []

            # expected score, homogeneous cases
            ps, betas = generate_snp_props(num_snps, fixed_ps, h_sq)
            prev = 0.01
            thresh = norm.ppf(1-prev, loc=0, scale=1)
            hetsc_exp = heterogeneity_expected_corr(
                ncases=num_cases,
                nconts=num_conts,
                effects=betas,
                thresh=thresh,
                freqs=ps,
                heritability=h_sq,
                verbose=False)
            hetsc_exps.append(hetsc_exp)

            for i in range(num_trials):
                print("sample_size: %s, trial: %s" % (sample_size, i))

                # homogeneous cases, controls
                cases,conts = generate_homhet_cohort(num_cases, num_conts, num_snps, fixed_ps, h_sq)
                score = heterogeneity(cases,conts)
                hetscs_hom.append(score)

                # heterogeneous cases, controls
                cases,conts = generate_homhet_cohort(num_cases, num_conts, num_snps, fixed_ps, h_sq, het=True)
                score = heterogeneity(cases,conts)
                hetscs_het.append(score)

                # controls, controls
                cases,conts = generate_controls(num_cases, num_conts, num_snps, fixed_ps, h_sq)
                score = heterogeneity(cases,conts)
                hetscs_cont.append(score)

            # hetsc_means_hom.append(np.mean(hetscs_hom))
            # hetsc_stds_hom.append(np.std(hetscs_hom))
            # hetsc_means_het.append(np.mean(hetscs_het))
            # hetsc_stds_het.append(np.std(hetscs_het))
            # hetsc_means_cont.append(np.mean(hetscs_cont))
            # hetsc_stds_cont.append(np.std(hetscs_cont))
            hetscs_homs.append(hetscs_hom)
            hetscs_hets.append(hetscs_het)
            hetscs_conts.append(hetscs_cont)
        pickle.dump({"hetscs_homs":hetscs_homs,
                     "hetscs_hets":hetscs_hets,
                     "hetscs_conts":hetscs_conts,
                     "hetsc_exps":hetsc_exps,
                     "sample_size":sample_sizes}, open(FILE_PATH, "wb"))
        """
Example #6
        # frac_shared_effects_list = [0, .25, .5, .75, 1]
        frac_shared_effects_list = [0, .5, 1]

        split_results = {}
        for i in num_subpheno_list:
            split_results[i] = {}
            for j in frac_shared_effects_list:
                split_results[i][j] = []
        for nsubph in num_subpheno_list:
            for fsheff in frac_shared_effects_list:
                for nt in range(num_trials):
                    print(nsubph, fsheff, nt)
                    h_sq = 0.05
                    sub_betas_list, ps = generate_snps_splits(
                        num_sub_phenos=nsubph,
                        frac_shared_effects=fsheff,
                        num_snps=num_snps,
                        ps=ps,
                        h_sq=h_sq)

                    cases, conts = generate_cohort_splits(
                        num_sub_phenos=nsubph,
                        sub_betas_list=sub_betas_list,
                        ps=ps,
                        num_cases=num_cases,
                        num_conts=num_cases,
                        h_sq=h_sq)
                    score = heterogeneity(cases, conts)
                    split_results[nsubph][fsheff].append(score)
        with open("split_results.p", "wb") as f:
            pickle.dump(split_results, f)
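
After this fragment runs, split_results is a nested dict keyed by sub-phenotype count and then by shared-effect fraction, each entry holding one score per trial. A short post-processing sketch under that assumption:

import pickle
import numpy as np

with open("split_results.p", "rb") as f:
    split_results = pickle.load(f)
for nsubph, by_frac in split_results.items():
    for fsheff, scores in by_frac.items():
        print(nsubph, fsheff, np.mean(scores), np.std(scores))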
Example #7
        # pickle file for saving correlation tests
        corr_matrices = []

        # for display table and FDR calculation
        table_vals = []

        # for plotting score vs num_cases
        plot_vals = []
        score_std = 1
        for fl in files:
            cht_name, FILE_PATH, SAMPLE_PATH = fl
            # try:
            cases, conts, cases_abr, conts_abr, snpsdel, ors, frqs = extract(
                SNP_PATH, FILE_PATH, SAMPLE_PATH)

            hetsc = heterogeneity(cases_abr, conts_abr)

            # convert odds ratios to liability threshold ratios
            betas = convertORs(ors, prev)
            h_sq = np.sum(
                np.multiply(np.square(betas), 2 * np.multiply(frqs, 1 - frqs)))
            # print("h_sq:", h_sq)
            expected_score = heterogeneity_expected_corr(ncases=cases.shape[0],
                                                         nconts=conts.shape[0],
                                                         effects=betas,
                                                         thresh=thresh,
                                                         freqs=frqs,
                                                         heritability=h_sq,
                                                         verbose=False)
            if snpsdel == 0:  # store for combined set
                super_betas = betas
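
The h_sq computed in this fragment is the liability-scale variance explained, the sum of beta^2 * 2p(1-p) over SNPs; a toy check with made-up uniform effects and frequencies:

import numpy as np

betas = np.array([0.04] * 100)  # illustrative liability-scale effects
frqs = np.array([0.2] * 100)    # illustrative risk-allele frequencies
h_sq = np.sum(np.square(betas) * 2 * frqs * (1 - frqs))
print(h_sq)                     # 100 * 0.0016 * 0.32 = 0.0512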
Example #8
def run_heritability():
    # FILE_PATH = "population_stratif_out.p"
    # pop_allele_diffs = [0.0, 0.05, 0.1, 0.15, 0.2]
    fsts = np.array([0.001, 0.005, 0.01, 0.05, 0.1])
    pop_allele_diffs = 0.5*np.sqrt(fsts)

    # for allele_diff in pop_allele_diffs:
    #     FILE_PATH = "population_stratif_out_%s.p" % (allele_diff)
    for fst, allele_diff in zip(fsts, pop_allele_diffs):
        FILE_PATH = "population_stratif_out_%s.p" % (fst)
        if not os.path.exists(FILE_PATH):
            num_snps = 100
            fixed_ps_val = 0.5
            fixed_ps = np.array([fixed_ps_val]*num_snps)
            # generate stratified populations
            ps_stratif = [[fixed_ps_val - allele_diff]*int(num_snps/2) + [fixed_ps_val + allele_diff]*int(num_snps/2),
                          [fixed_ps_val + allele_diff]*int(num_snps/2) + [fixed_ps_val - allele_diff]*int(num_snps/2)]

            hetscs_homs = []
            hetscs_hets = []
            hetscs_conts = []
            hetsc_exps = []
            mean_allele_diffs = [] # allele difference between homogeneous cases and controls
            h_sqs = [0.001, 0.01, 0.02, 0.03, 0.05, 0.075, 0.1]
            # h_sqs = [0.001, 0.025, 0.05, 0.075, 0.1]
            num_trials = 30

            for h_sq in h_sqs:
                num_cases = 30000
                num_conts = 30000

                hetscs_hom = []
                hetscs_het = []
                hetscs_cont = []
                mean_allele_diff = []

                # expected score, homogeneous cases
                ps, betas = generate_snp_props(num_snps, fixed_ps, h_sq)
                prev = 0.01
                thresh = norm.ppf(1-prev, loc=0, scale=1)
                hetsc_exp = heterogeneity_expected_corr(
                    ncases=num_cases,
                    nconts=num_conts,
                    effects=betas,
                    thresh=thresh,
                    freqs=ps,
                    heritability=h_sq,
                    verbose=False)
                hetsc_exps.append(hetsc_exp)

                for i in range(num_trials):
                    print("allele_diff: %s, h_sq: %s, trial: %s" % (allele_diff, h_sq, i))

                    # homogeneous cases, controls
                    cases,conts = generate_homhet_cohort(num_cases, num_conts, num_snps, ps, ps_stratif, h_sq)
                    score = heterogeneity(cases,conts)
                    hetscs_hom.append(score)

                    # heterogeneous cases, controls
                    cases,conts = generate_homhet_cohort(num_cases, num_conts, num_snps, ps, ps_stratif, h_sq, het=True)
                    score = heterogeneity(cases,conts)
                    hetscs_het.append(score)
                    mean_allele_diff.append(np.mean(cases) - np.mean(conts))

                    # controls, controls
                    # cases,conts = generate_controls(num_cases, num_conts, num_snps, ps, h_sq)
                    cases, conts = generate_controls(num_cases, num_conts, num_snps, ps_stratif)
                    score = heterogeneity(cases,conts)
                    hetscs_cont.append(score)

                hetscs_homs.append(hetscs_hom)
                hetscs_hets.append(hetscs_het)
                hetscs_conts.append(hetscs_cont)
                mean_allele_diffs.append(mean_allele_diff)
            pickle.dump({"hetscs_homs":hetscs_homs,
                         "hetscs_hets":hetscs_hets,
                         "hetscs_conts":hetscs_conts,
                         "hetsc_exps":hetsc_exps,
                         "mean_allele_diffs":mean_allele_diffs,
                         "h_sqs":h_sqs}, open(FILE_PATH, "wb"))
    plot_results(fsts)
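
The allele-frequency offsets above are consistent with a two-population model in which the subpopulation frequencies are 0.5 ± d: under Wright's Fst for two equally sized subpopulations, Fst = d^2 / (0.5 * 0.5), so d = 0.5 * sqrt(Fst). A quick verification sketch:

import numpy as np

fsts = np.array([0.001, 0.005, 0.01, 0.05, 0.1])
d = 0.5 * np.sqrt(fsts)
p1, p2 = 0.5 - d, 0.5 + d
p_bar = (p1 + p2) / 2                       # = 0.5 everywhere
fst_check = ((p1 - p_bar)**2 + (p2 - p_bar)**2) / 2 / (p_bar * (1 - p_bar))
print(np.allclose(fst_check, fsts))         # True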