Code example #1
File: test_lists.py Project: pola-rs/polars
def test_list_ternary_concat() -> None:
    df = pl.DataFrame(
        {
            "list1": [["123", "456"], None],
            "list2": [["789"], ["zzz"]],
        }
    )

    assert df.with_column(
        pl.when(pl.col("list1").is_null())
        .then(pl.col("list1").arr.concat(pl.col("list2")))
        .otherwise(pl.col("list2"))
        .alias("result")
    ).to_dict(False) == {
        "list1": [["123", "456"], None],
        "list2": [["789"], ["zzz"]],
        "result": [["789"], None],
    }

    assert df.with_column(
        pl.when(pl.col("list1").is_null())
        .then(pl.col("list2"))
        .otherwise(pl.col("list1").arr.concat(pl.col("list2")))
        .alias("result")
    ).to_dict(False) == {
        "list1": [["123", "456"], None],
        "list2": [["789"], ["zzz"]],
        "result": [["123", "456", "789"], ["zzz"]],
    }
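
Note on the first assertion: when `list1` is null, `arr.concat` propagates the null through the `then` branch, which is why `result` is `None` for that row rather than `list2`. A minimal sketch of that propagation on its own, assuming the same older polars API used throughout these examples:

import polars as pl

df = pl.DataFrame({"a": [["x"], None], "b": [["y"], ["z"]]})
# concatenating a null list with a non-null list yields null, not ["z"]
print(df.with_column(pl.col("a").arr.concat(pl.col("b")).alias("ab")))
# ab: [["x", "y"], null]
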
Code example #2
def load_dir(pheno, region, dir_):
    with open(f'{dir_}/converged.txt') as converged:
        assert converged.read().strip() == 'TRUE'

    alphas = pl.scan_csv(f'{dir_}/alpha.tab', sep='\t',
                         has_header=False).collect().to_numpy().T
    susie_pips = 1 - np.prod(1 - alphas, axis=1)

    df = pl.scan_csv(f'{dir_}/colnames.txt',
                     has_header=False,
                     with_column_names=lambda _: ['var_name']).with_column(
                         pl.lit(1).alias('row_number')).with_columns([
                             pl.col('row_number').cumsum(),
                             pl.lit(None, int).alias('cs_num'),
                             pl.lit(region).alias('region'),
                             pl.lit(pheno).alias('phenotype'),
                             pl.Series(susie_pips).alias('susie_pip'),
                             pl.lit(None, float).alias('susie_cs_pip')
                         ])

    for cs_num in range(1, 51):
        cs_fname = f'{dir_}/cs{cs_num}.txt'
        if not os.path.exists(cs_fname):
            continue
        with open(cs_fname) as cs:
            var_nums = [int(var_num) for var_num in next(cs).strip().split()]
            next(cs)
            min_ld = float(next(cs).split()[0])
            if min_ld < min_ld_thresh:
                continue
            df = df.with_columns([
                pl.when(pl.col('row_number').is_in(var_nums)).then(
                    pl.when(~pl.col('cs_num').is_null()).then(-1).otherwise(
                        cs_num)).otherwise(pl.col('cs_num')).alias('cs_num'),
                pl.when(pl.col('row_number').is_in(var_nums)).then(
                    pl.Series(alphas[:, cs_num - 1])).otherwise(
                        pl.col('susie_cs_pip')).alias('susie_cs_pip')
            ])

    df = df.with_column(
        pl.when(pl.col('cs_num') != -1).then(
            pl.col('susie_cs_pip')).otherwise(-1).alias('susie_cs_pip'))
    df = df.filter(
        pl.col('var_name').str.contains('^STR') & ~pl.col('cs_num').is_null()
        & (pl.col('susie_pip') > 0.05)).drop('row_number')

    return df
Code example #3
def test_type_coercion_when_then_otherwise_2806() -> None:
    out = (pl.DataFrame({
        "names": ["foo", "spam", "spam"],
        "nrs": [1, 2, 3]
    }).select([
        pl.when(pl.col("names") == "spam").then(pl.col("nrs") * 2).otherwise(
            pl.lit("other")).alias("new_col"),
    ]).to_series())
    expected = pl.Series("new_col", ["other", "4", "6"])
    assert out.to_list() == expected.to_list()

    # test it remains float32
    assert (pl.Series(
        "a", [1.0, 2.0, 3.0], dtype=pl.Float32).to_frame().select(
            pl.when(pl.col("a") > 2.0).then(
                pl.col("a")).otherwise(0.0))).to_series().dtype == pl.Float32
Code example #4
def calculate_friction_number(column_names: List[str]) -> "pl.Expr":
    if "fs" in column_names and "qc" in column_names:
        return (col("fs") /
                when(col("qc") == 0.0).then(None).otherwise(col("qc")) *
                100.0).alias("friction_number")
    else:
        return lit(0.0).alias("friction_number")
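
A usage sketch for the expression above, with hypothetical cone penetration test readings (`fs` = sleeve friction, `qc` = cone resistance), assuming `calculate_friction_number` is in scope. Mapping `qc == 0` to null turns the division into a null result instead of a divide-by-zero:

import polars as pl
from polars import col, lit, when  # the bare names the function above relies on

df = pl.DataFrame({"fs": [10.0, 20.0], "qc": [0.0, 400.0]})
print(df.select(calculate_friction_number(df.columns)))
# row 0: qc == 0 was mapped to null, so friction_number is null
# row 1: 20 / 400 * 100 = 5.0
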
Code example #5
def full_genome_polars_df(df):
    df = df.with_column(pl.col('pos').alias('plot_pos'))
    for chrom in range(2, 23):
        df = df.with_column(
            pl.when(pl.col('chr') >= chrom).then(
                pl.col('plot_pos') + int(chr_lens[chrom - 2])).otherwise(
                    pl.col('plot_pos')).alias('plot_pos'))
    return df
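
This shifts each chromosome's positions by the cumulative lengths of the preceding chromosomes so all loci share one genome-wide x axis; `chr_lens` is a module-level sequence of chromosome lengths not shown in the excerpt. A minimal sketch with made-up lengths for three chromosomes:

import polars as pl

chr_lens = [100, 200]  # hypothetical lengths of chr1 and chr2

df = pl.DataFrame({"chr": [1, 2, 3], "pos": [10, 10, 10]})
df = df.with_column(pl.col("pos").alias("plot_pos"))
for chrom in range(2, 4):  # range(2, 23) in the original, which assumes 22 chromosomes
    df = df.with_column(
        pl.when(pl.col("chr") >= chrom)
        .then(pl.col("plot_pos") + int(chr_lens[chrom - 2]))
        .otherwise(pl.col("plot_pos"))
        .alias("plot_pos"))
# plot_pos: [10, 110, 310] -- chr2 shifted by len(chr1), chr3 by len(chr1) + len(chr2)
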
Code example #6
def test_set_null() -> None:
    df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    out = (df.lazy().with_column(
        when(col("a") > 1).then(
            lit(None)).otherwise(100).alias("foo")).collect())
    s = out["foo"]
    assert s[0] == 100
    assert s[1] is None
    assert s[2] is None
Code example #7
File: test_lists.py Project: pola-rs/polars
def test_list_fill_list() -> None:
    assert pl.DataFrame({"a": [[1, 2, 3], []]}).select(
        [
            pl.when(pl.col("a").arr.lengths() == 0)
            .then([5])
            .otherwise(pl.col("a"))
            .alias("filled")
        ]
    ).to_dict(False) == {"filled": [[1, 2, 3], [5]]}
Code example #8
File: test_lists.py Project: pola-rs/polars
def test_list_fill_null() -> None:
    df = pl.DataFrame({"C": [["a", "b", "c"], [], [], ["d", "e"]]})
    assert df.with_columns(
        [
            pl.when(pl.col("C").arr.lengths() == 0)
            .then(None)
            .otherwise(pl.col("C"))
            .alias("C")
        ]
    ).to_series().to_list() == [["a", "b", "c"], None, None, ["d", "e"]]
Code example #9
def test_when_then_flatten() -> None:
    df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 4, 5]})

    assert df[
        when(col("foo") > 1)
        .then(col("bar"))
        .when(col("bar") < 3)
        .then(10)
        .otherwise(30)
    ]["bar"] == [30, 4, 5]
Code example #10
def test_when_then_edge_cases_3994() -> None:
    df = pl.DataFrame(data={"id": [1, 1], "type": [2, 2]})

    # this tests if lazy correctly assigns the list schema to the column aggregation
    assert (df.lazy().groupby(["id"]).agg(pl.col("type")).with_column(
        pl.when(pl.col("type").arr.lengths() == 0).then(
            pl.lit(None)).otherwise(
                pl.col("type")).keep_name()).collect()).to_dict(False) == {
                    "id": [1],
                    "type": [[2, 2]]
                }

    # this tests ternary with an empty argument
    assert (df.filter(pl.col("id") == 42).groupby([
        "id"
    ]).agg(pl.col("type")).with_column(
        pl.when(pl.col("type").arr.lengths == 0).then(pl.lit(None)).otherwise(
            pl.col("type")).keep_name())).to_dict(False) == {
                "id": [],
                "type": []
            }
Code example #11
def test_lazy() -> None:
    df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    _ = df.lazy().with_column(lit(1).alias("foo")).select(
        [col("a"), col("foo")])

    # test if it executes
    _ = (df.lazy().with_column(
        when(col("a").gt(lit(2))).then(lit(10)).otherwise(
            lit(1)).alias("new")).collect())

    # test if pl.list is available, this is `to_list` re-exported as list
    df.groupby("a").agg(pl.list("b"))
Code example #12
def test_list_arr_empty() -> None:
    df = pl.DataFrame({"cars": [[1, 2, 3], [2, 3], [4], []]})

    out = df.select([
        pl.col("cars").arr.first().alias("cars_first"),
        pl.when(pl.col("cars").arr.first() == 2).then(1).when(
            pl.col("cars").arr.contains(2)).then(2).otherwise(3).alias(
                "cars_literal"),
    ])
    expected = pl.DataFrame({
        "cars_first": [1, 2, 4, None],
        "cars_literal": [2, 1, 3, 3]
    })
    assert out.frame_equal(expected)
Code example #13
File: test_categorical.py Project: pola-rs/polars
def test_comp_categorical_lit_dtype() -> None:
    df = pl.DataFrame(
        data={
            "column": ["a", "b", "e"],
            "values": [1, 5, 9]
        },
        columns=[("column", pl.Categorical), ("more", pl.Int32)],
    )

    assert df.with_column(
        pl.when(pl.col("column") == "e").then("d").otherwise(
            pl.col("column")).alias("column")).dtypes == [
                pl.Categorical, pl.Int32
            ]
Code example #14
def replace_column_void(lf: pl.LazyFrame, column_void) -> pl.LazyFrame:
    if column_void is None:
        return lf

    # TODO: what to do with multiple columnvoids?
    if isinstance(column_void, list):
        column_void = column_void[0]

    return (
        # Get all values matching column_void and change them to null
        lf.select(
            pl.when(pl.all() == pl.lit(column_void)).then(
                pl.lit(None)).otherwise(pl.all()).keep_name())
        # Interpolate all null values
        .select(pl.all().interpolate())
        # Remove the rows with null values
        .drop_nulls())
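
A sketch of `replace_column_void` on a single-column LazyFrame, using a hypothetical sentinel of -999: the sentinel becomes null, interpolation fills it from its neighbors, and any remaining nulls are dropped:

import polars as pl

lf = pl.DataFrame({"depth": [1.0, -999.0, 3.0]}).lazy()
print(replace_column_void(lf, -999.0).collect())
# depth: [1.0, 2.0, 3.0] -- the sentinel was nulled and then interpolated
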
Code example #15
def get_str_loci(phenotype, my_str_fname, thresh):
    p_col = f'p_{phenotype}'
    csv = pl.scan_csv(
        my_str_fname,
        sep='\t',
        dtypes={'alleles': str, 'locus_filtered': str}
    ).filter(
        pl.col(p_col) <= thresh
    ).with_column(
        pl.when(pl.col(p_col) <= 1e-300)
          .then(0)
          .otherwise(pl.col(p_col))
          .alias(p_col)
    ).collect().to_dict(as_series=False)

    return sortedcontainers.SortedSet(
        iterable = zip(csv[p_col], csv['chrom'], csv['pos'], itertools.repeat('STR'))
    )
Code example #16
def correct_depth_with_inclination(columns):
    """
    Return the expression needed to correct depth
    """
    if "corrected_depth" in columns:
        return col("corrected_depth").abs().alias("depth")
    elif "inclination" in columns:
        pt = "penetration_length"

        # every different in depth needs to be corrected with the angle
        correction_factor = np.cos(
            np.radians(col("inclination").cast(pl.Float32).fill_null(0)))

        corrected_depth = (correction_factor * col(pt).diff()).cumsum()
        return (pl.when(corrected_depth.is_null()).then(
            col(pt)).otherwise(corrected_depth).alias("depth"))
    else:
        return col("penetration_length").alias("depth")
Code example #17
File: tests.py Project: ritchie46/pygef
def test_sum_to_one(self):
    cols = [
        "gravel_component",
        "sand_component",
        "clay_component",
        "loam_component",
        "peat_component",
        "silt_component",
    ]
    s = self.bore.df.select([
        pl.fold(
            pl.lit(0),
            lambda a, b: a + b,
            [
                pl.when(pl.col(a) < 0).then(1 / len(cols)).otherwise(
                    pl.col(a)) for a in cols
            ],
        ).alias("sum")
    ])
    self.assertTrue(np.all(np.isclose(s, 1)))
Code example #18
def ic_to_gamma(water_level):
    """
    Return the expression needed to compute "gamma_predict"
    """
    below_water = (1.0 - col("depth")) < water_level
    ti = col("type_index")
    return (
        pl.when(ti > 3.22)
        .then(11.0)
        .when((ti <= 3.22) & (ti > 2.76))
        .then(16.0)
        .when((ti <= 2.76) & ~(below_water))
        .then(18.0)
        # parentheses are required: Python's & binds tighter than <=
        .when((ti <= 2.40) & below_water)
        .then(19.0)
        .when((ti <= 1.80) & below_water)
        .then(20.0)
        .otherwise(1.0)
        .alias("gamma_predict")
    )
Code example #19
def ic_to_soil_type():
    """
    Assign the soil type to the corresponding Ic.
    """
    ti = col("type_index")
    return (
        pl.when(ti > 3.22)
        .then("Peat")
        .when((ti <= 3.22) & (ti > 2.67))
        .then("Clays")
        .when((ti <= 2.67) & (ti > 2.4))
        .then("Clayey silt to silty clay")
        .when((ti <= 2.4) & (ti > 1.8))
        .then("Silty sand to sandy silt")
        .when((ti <= 1.8) & (ti > 1.25))
        .then("Sands: clean sand to silty")
        .when(ti <= 1.25)
        .then("Gravelly sands")
        .otherwise("")
        .alias("soil_type")
    )
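
A usage sketch with hypothetical `type_index` values, assuming the function above is in scope along with its bare `col` import:

import polars as pl
from polars import col

df = pl.DataFrame({"type_index": [3.5, 2.0, 1.0]})
print(df.with_column(ic_to_soil_type()))
# soil_type: ["Peat", "Silty sand to sandy silt", "Gravelly sands"]
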
Code example #20
def get_snp_loci(plink_imputed_snp_fname, thresh):
    csv = pl.scan_csv(
        plink_imputed_snp_fname,
        sep='\t',
        null_values='NA'
    ).filter(
        pl.col('P') <= thresh
    ).with_column(
        pl.when(pl.col('P') <= 1e-300)
          .then(0)
          .otherwise(pl.col('P'))
          .alias('P')
    ).filter(
        pl.col('ERRCODE') != 'CONST_OMITTED_ALLELE'
    ).collect()

    assert np.all((csv['ERRCODE'] == '.').to_numpy())

    dict_csv = csv.to_dict(as_series = False)
    return sortedcontainers.SortedSet(
        iterable = zip(dict_csv['P'], dict_csv['#CHROM'], dict_csv['POS'], itertools.repeat('SNP'), dict_csv['REF'], dict_csv['ALT'])
    )
Code example #21
        sep='\t',
        dtypes={
            col: (float if 'cs' not in col else int)
            for col in concordance_cols
            if 'finemap' in col or 'susie' in col or 'p_val' in col
        }) for phenotype in phenotypes.phenotypes_in_use
    if not os.path.exists(
        f'{ukb}/post_finemapping/intermediate_results/finemapping_putatively_causal_concordance_{phenotype}.tab.empty'
    )
]).filter('is_STR').with_column(pl.col('pos').alias('snpstr_pos'))

finemapping_results = finemapping_results.join(
    concordance_results, how='left',
    on=['phenotype', 'chrom', 'snpstr_pos']).with_columns([
        pl.when(pl.col('susie_alpha').is_null()).then(None).when(
            pl.col('susie_cs') >= 0).then(
                pl.col('susie_alpha')).otherwise(0).alias('susie_CP'),
        pl.when(pl.col('susie_alpha_hardcall').is_null()).then(None).when(
            pl.col('susie_cs_hardcall') >= 0).then(
                pl.col('susie_alpha_hardcall')).otherwise(0).alias(
                    'susie_CP_best_guess_genotypes'),
        pl.when(pl.col('susie_alpha_ratio').is_null()).then(None).when(
            pl.col('susie_cs_ratio') >= 0).then(
                pl.col('susie_alpha_ratio')).otherwise(0).alias(
                    'susie_CP_prior_snps_over_strs'),
        pl.col('finemap_pip').alias('finemap_CP'),
        pl.col('finemap_pip_p_thresh').alias('finemap_CP_pval_thresh_5e-4'),
        pl.col('finemap_pip_mac').alias('finemap_CP_mac_thresh_100'),
        pl.col('finemap_pip_prior_std_derived').alias(
            'finemap_CP_prior_effect_size_0.05%'),
        pl.col('finemap_pip_total_prob').alias('finemap_CP_prior_4_signals'),
Code example #22
from .dataset import df
import polars as pl
from polars import col

df = df[[pl.when(col("random") > 0.5).then(0).otherwise(col("random")) * pl.sum("nrs")]]
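
Because the `.dataset` module is not shown, here is a self-contained sketch of the same pattern with hypothetical `random` and `nrs` columns: values above 0.5 are clamped to 0, then everything is scaled by `pl.sum("nrs")`, which broadcasts as a scalar:

import polars as pl
from polars import col

df = pl.DataFrame({"random": [0.2, 0.9], "nrs": [1, 2]})
out = df[[pl.when(col("random") > 0.5).then(0).otherwise(col("random")) * pl.sum("nrs")]]
# values: [0.2 * 3, 0 * 3] == [0.6, 0.0]
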
Code example #23
import os

import bokeh.io
import bokeh.models
import bokeh.plotting
import numpy as np
import polars as pl

ukb = os.environ['UKB']

df = pl.read_csv(
    f'{ukb}/post_finemapping/results/singly_finemapped_strs_for_paper.tab',
    sep='\t'
).with_column(
    pl.when(
        pl.col('association_p_value') < 1e-300
    ).then(
        pl.lit(300)
    ).otherwise(
        -pl.col('association_p_value').log10()
    ).alias('association_p_value')
)

# Min abs corr across all CSes
fig = bokeh.plotting.figure(
    width=1200,
    height=1200,
    title='Fine-mapped loci by association p-value',
    x_axis_label='-log10(p-value)',
    y_axis_label='Number of fine-mapped loci',
)
fig.axis.axis_label_text_font_size = '30px'
fig.title.text_font_size = '30px'
Code example #24
def map_expr(name: str) -> pl.Expr:
    # note: `ignore_nulls` is a free variable captured from the enclosing scope
    return (pl.when(ignore_nulls or pl.col(name).null_count() == 0)
            .then(pl.struct([
                pl.sum(name).alias("sum"),
                (pl.count() - pl.col(name).null_count()).alias("count"),
            ]))
            .otherwise(None)).alias("out")
Code example #25
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('outfile')
    parser.add_argument('hits_table')
    parser.add_argument('qtl_STRs_table')

    args = parser.parse_args()

    df = pl.read_csv(
        args.hits_table,
        sep='\t',
    ).filter(pl.col('association_p_value') <= 1e-10)

    closest_gene_merge = annotation_utils.get_merged_annotations(
        df.with_column(pl.col('start_pos').alias('pos')).to_pandas(),
        f'{ukb}/side_analyses/str_annotations/closest_gene',
        distance=True
    )
    closest_gene = [None]*df.shape[0]

    nrows = df.shape[0]
    for idx in range(nrows):
        chrom = df['chrom'][idx]
        start_pos = df['start_pos'][idx]
        end_pos = df['end_pos'][idx]
        for line in closest_gene_merge[
            (closest_gene_merge['chrom'] == chrom) & (closest_gene_merge['STR_pos'] == start_pos)
        ].itertuples():
            if closest_gene[idx] is not None:
                closest_gene[idx] += ','
            else:
                closest_gene[idx] = ''
            closest_gene[idx] += str(line.annotation_distance) + ":"
            closest_gene[idx] += annotation_utils.get_relation(
                start_pos, end_pos, line.annotation_pos, line.annotation_end_pos, line.annotation_strand
            ) + ":"

            closest_gene[idx] += annotation_utils.get_gff_kvp(line.annotation_info, 'gene_name') + ":"
            closest_gene[idx] += annotation_utils.get_gff_kvp(line.annotation_info, 'gene_type')

    df = df.with_column(pl.Series(closest_gene).alias('closest_gene'))

    pheno_blocks = {
        'red blood cell' : [
            'haematocrit',
            'haemoglobin_concentration',
            'red_blood_cell_count',
            'mean_corpuscular_volume',
            'mean_corpuscular_haemoglobin',
            'mean_sphered_cell_volume',
            'red_blood_cell_distribution_width',
            'mean_corpuscular_haemoglobin_concentration',
        ],
        'platelet' : [
            'platelet_count',
            'platelet_crit',
            'mean_platelet_volume',
            'platelet_distribution_width',
        ],
        'white blood cell' : [
            'eosinophil_count',
            'eosinophil_percent',
            'neutrophil_count',
            'neutrophil_percent',
            'lymphocyte_count',
            'lymphocyte_percent',
            'white_blood_cell_count',
        ],
        'renal' : [
            'cystatin_c',
            'creatinine',
            'urate',
            'urea',
        ],
        'liver' : [
            'gamma_glutamyltransferase',
            'alkaline_phosphatase',
            'total_bilirubin',
            'alanine_aminotransferase',
            'total_protein',
            'albumin',
            'aspartate_aminotransferase',
        ],
        'endocrine' : [
            'glycated_haemoglobin',
            'igf_1',
            'shbg',
            'calcium',
            'glucose',
            'phosphate',
        ],
        'lipid' : [
            'apolipoprotein_a',
            'hdl_cholesterol',
            'apolipoprotein_b',
            'ldl_cholesterol_direct',
            'cholesterol',
            'triglycerides',
        ],
        '' : ['c_reactive_protein']
    }
    pheno_order = sum(pheno_blocks.values(), [])
    shaded_phenos = sum([pheno_blocks[key] for key in list(pheno_blocks.keys())[1::2]], [])

    pheno_indices = np.where(df['phenotype'].to_numpy()[:, None] == np.array(pheno_order)[None, :])[1]
    for phenotype in pheno_order:
        if phenotype not in phenotypes.phenotypes_in_use:
            print(phenotype)
            assert False

    df = df.with_column(
        pl.Series(pheno_indices).alias('pheno_indices')
    ).with_row_count().with_column(
        (pl.col('pheno_indices') + pl.col('row_nr')/1e5 + pl.when(pl.col('finemapping') != 'confidently').then(100000).otherwise(0)).min().over(['chrom', 'start_pos']).alias('min_assoc_pheno_index')
    ).with_columns([
        pl.when(
            pl.col('association_p_value') == 0
        ).then(300).otherwise(
            -pl.col('association_p_value').log10()
        ).alias('p_val'),
        (pl.col('finemapping') == 'confidently').sum().over(['chrom', 'start_pos']).alias('n_confident_assocs'),
        pl.col('white_brit_allele_frequencies').apply(
            lambda dosage_dict_str: sum(float(part.split(' ')[-1]) >= 1 for part in dosage_dict_str.split('%')[:-1])
        ).alias('n_common_alleles')
    ])

    mapi = df['min_assoc_pheno_index'].to_numpy().copy()
    old_mapi = mapi.copy()
    for i in range(len(pheno_order)):
        if i not in np.floor(old_mapi):
            mapi[old_mapi > i] -= 1
    x_coords = np.floor(mapi).astype(int)
    for i in np.unique(mapi):
        num_dups = np.unique(mapi[(mapi < i) & (np.floor(i) == np.floor(mapi))]).shape[0]
        if num_dups > 0:
            x_coords[np.floor(mapi) > np.floor(i)] += 1
            x_coords[mapi == i] += num_dups
    for i in range(max(x_coords) + 1):
        if i not in x_coords:
            print(i)
            exit()

    for pair in [(21, 23), (23, 24), (25, 28), (26, 30), (27, 31), (25, 27), (35, 39), (36, 39), (37, 38), (40, 42), (40, 41), (43, 45), (56, 57), (59, 60), (48, 53), (49, 55), (50, 54), (49, 51), (68, 71), (84, 85),(85, 87), (87, 88)]:
        x_coords[x_coords == pair[0]] = 1000000
        x_coords[x_coords == pair[1]] = pair[0]
        x_coords[x_coords == 1000000] = pair[1]

    strs = df[
        pl.col('chrom').cast(str) + ':' + pl.col('start_pos').cast(str) + '_' + pl.when(
            pl.col('relation_to_gene').str.contains('protein_coding.*protein_coding')
        ).then(
            pl.col('relation_to_gene').str.extract("([^:]*):protein_coding.*:([^:]*):protein_coding", 1) + ' & ' +
            pl.col('relation_to_gene').str.extract("([^:]*):protein_coding.*:([^:]*):protein_coding", 2)
        ).when(
            pl.col('relation_to_gene').str.contains('protein_coding')
        ).then(
            pl.col('relation_to_gene').str.extract("([^:]*):protein_coding", 1)
        ).when(
            pl.col('relation_to_gene').str.contains('intergenic')
        ).then(
            pl.col('chrom').cast(str)+':'+pl.col('start_pos').cast(str)+' (' + pl.col('closest_gene').str.split_exact(',', 1).struct.field('field_0').str.split_exact(':', 2).struct.field('field_2') + ')'
        ).when(
            pl.col('relation_to_gene').str.contains('multigene')
        ).then(
            pl.col('relation_to_gene').str.extract(r"multigene;[^:]*:([^:]*):", 1) + '/' +
            pl.col('relation_to_gene').str.extract(r"multigene;[^;]*;[^:]*:([^:]*):", 1)
        ).otherwise(
            pl.col('relation_to_gene').str.extract(r"\w[^{:]*:([^:]*):", 1)
        )
    ].to_numpy()[np.argsort(x_coords)]
    _, idxs = np.unique(strs, return_index=True)
    unique_stable_strs = strs[np.sort(idxs)].flatten()
    unique_stable_strs = list(np.char.partition(np.array(unique_stable_strs, dtype=str), '_')[:, 2])
    # two genes each containing two unique hits
    unique_stable_strs[unique_stable_strs.index('CCDC26')] += ' #1'
    unique_stable_strs[unique_stable_strs.index('CCDC26')] += ' #2'
    unique_stable_strs[unique_stable_strs.index('TFDP2')] += ' #1'
    unique_stable_strs[unique_stable_strs.index('TFDP2')] += ' #2'

    plot_width = 2500
    results_plot = bokeh.plotting.figure(
        width=plot_width,
        height=1400,
        x_axis_label='hits (containing gene, or position and nearest gene if intergenic)',
        y_axis_label='phenotypes',
        #y_range=bokeh.models.FactorRange(*[(group, pheno) for group in pheno_blocks for pheno in pheno_blocks[group]][::-1]),
        y_range=[pheno.replace('_', ' ') for pheno in pheno_order[::-1]],
        x_range=unique_stable_strs,#(0,94),
        toolbar_location=None,
        outline_line_color='black'
    )
    results_plot.axis.axis_label_text_font_size = '26px'
    results_plot.xaxis.major_label_orientation = 1.3
    cds = bokeh.models.ColumnDataSource(dict(
        x=[len(x_coords)/2]*len(shaded_phenos), y=[pheno.replace('_', ' ') for pheno in shaded_phenos], width=[len(x_coords)]*len(shaded_phenos), height=[1]*len(shaded_phenos), color=['grey']*len(shaded_phenos), alpha=['0.15']*len(shaded_phenos), line_color=[None]*len(shaded_phenos)
    ))
    results_plot.rect(
        x='x', y='y', width='width', height='height', color='color', alpha='alpha', line_color='line_color', source=cds
    )
    half_xs=np.arange(1, np.max(x_coords) + 1, 2)
    cds = bokeh.models.ColumnDataSource(dict(
        x=half_xs+0.5, y=[len(pheno_order)/2]*half_xs.shape[0], width=[1]*half_xs.shape[0], height=[len(pheno_order)+len(pheno_blocks)+20]*half_xs.shape[0], color=['grey']*half_xs.shape[0], alpha=['0.15']*half_xs.shape[0], line_color=[None]*half_xs.shape[0]
    ))
    results_plot.rect(
        x='x', y='y', width='width', height='height', color='color', alpha='alpha', line_color='line_color', source=cds
    )
    results_plot.xgrid.ticker = []

    '''
    with open(args.e_splice_STRs_table) as table:
        e_splice_lines = table.readlines()

    for loc, gene_rels in zip(
        df[pl.col('chrom').cast(str)+'_'+pl.col('start_pos').cast(str)].to_numpy().flatten(),
        df['relation_to_gene'].to_numpy().flatten()
    ):
        gene_rel_list = gene_rels.split(';')
        color = 'blue'
        for gene_rel in gene_rel_list:
            if gene_rel == 'intergenic':
                break
            if gene_rel == 'multigene':
                continue
            for line in e_splice_lines:
                if re.search(loc + '.*' + gene_rel.split(':')[1], line):
                    color = 'red'

        str_colors.append(color)
    str_colors = np.array(str_colors)
    '''

    def pheno_to_ycoords(phenos):
        return [pheno.replace('_', ' ') for pheno in phenos]
        #return [(group, pheno) for pheno in phenos for group in pheno_blocks if pheno in pheno_blocks[group]]

    fill_colors = np.array(['           ']*len(x_coords))
    fill_colors[:] = 'grey'
    fill_colors[df['finemapping'].to_numpy() == 'confidently'] = 'black'
    fill_colors[df['finemapping'].to_numpy() == 'not'] = 'white'

    up_triangle = df['direction_of_association'].to_numpy() == '+'
    results_plot.triangle(
        (x_coords + 0.5)[up_triangle],
        pheno_to_ycoords(df['phenotype'].to_numpy()[up_triangle]),
        size=np.sqrt(df['p_val'].to_numpy())[up_triangle]*2,
        #fill_alpha=df.select(pl.when(pl.col('finemapping') == 'confidently').then(1).otherwise(0)).to_numpy().flatten()[undotted_triangle],
        color=fill_colors[up_triangle],
        line_color='black'
    )
    results_plot.inverted_triangle(
        (x_coords + 0.5)[~up_triangle],
        pheno_to_ycoords(df['phenotype'].to_numpy()[~up_triangle]),
        size=np.sqrt(df['p_val'].to_numpy())[~up_triangle]*2,
        #fill_alpha=df.select(pl.when(pl.col('finemapping') == 'confidently').then(1).otherwise(0)).to_numpy().flatten()[undotted_triangle],
        fill_color=fill_colors[~up_triangle],
        line_color='black'
    )

    '''
    undotted_triangle = (df['direction_of_association'].to_numpy() == '+') & undotted_loci
    results_plot.triangle(
        (x_coords + 0.5)[undotted_triangle],
        pheno_to_ycoords(df['phenotype'].to_numpy()[undotted_triangle]),
        #[(group, pheno) for pheno in df['phenotype'].to_numpy()[undotted_triangle] for group in pheno_blocks if pheno in pheno_blocks[group]],
        #df['phenotype'].to_numpy()[undotted_triangle],
        size=np.sqrt(df['p_val'].to_numpy())[undotted_triangle]*2,
        fill_alpha=df.select(pl.when(pl.col('finemapping') == 'confidently').then(1).otherwise(0)).to_numpy().flatten()[undotted_triangle],
        color='black'#str_colors[undotted_triangle],
    )
    undotted_inverted_triangle = (df['direction_of_association'].to_numpy() == '-') & undotted_loci
    results_plot.inverted_triangle(
        (x_coords + 0.5)[undotted_inverted_triangle],
        pheno_to_ycoords(df['phenotype'].to_numpy()[undotted_inverted_triangle]),
        #[(group, pheno) for pheno in df['phenotype'].to_numpy()[undotted_inverted_triangle] for group in pheno_blocks if pheno in pheno_blocks[group]],
        size=np.sqrt(df['p_val'].to_numpy())[undotted_inverted_triangle]*2,
        fill_alpha=df.select(pl.when(pl.col('finemapping') == 'confidently').then(1).otherwise(0)).to_numpy().flatten()[undotted_inverted_triangle],
        color='black'#str_colors[undotted_inverted_triangle],
    )

    dotted_triangle = (df['direction_of_association'].to_numpy() == '+') & ~undotted_loci
    results_plot.triangle_dot(
    #results_plot.triangle(
        (x_coords + 0.5)[dotted_triangle],
        pheno_to_ycoords(df['phenotype'].to_numpy()[dotted_triangle]),
        #[(group, pheno) for pheno in df['phenotype'].to_numpy()[dotted_triangle] for group in pheno_blocks if pheno in pheno_blocks[group]],
        #df['phenotype'].to_numpy()[dotted_triangle],
        size=np.sqrt(df['p_val'].to_numpy())[dotted_triangle]*2,
        fill_alpha=[0]*np.sum(dotted_triangle),
        color='black',#str_colors[dotted_triangle],
    )
    dotted_inverted_triangle = (df['direction_of_association'].to_numpy() == '-') & ~undotted_loci
    results_plot.triangle_dot(
    #results_plot.triangle(
        (x_coords + 0.5)[dotted_inverted_triangle],
        pheno_to_ycoords(df['phenotype'].to_numpy()[dotted_inverted_triangle]),
        #[(group, pheno) for pheno in df['phenotype'].to_numpy()[dotted_inverted_triangle] for group in pheno_blocks if pheno in pheno_blocks[group]],
        size=np.sqrt(df['p_val'].to_numpy())[dotted_inverted_triangle]*2,
        fill_alpha=[0]*np.sum(dotted_inverted_triangle),
        color='black',#str_colors[dotted_inverted_triangle],
        angle=np.pi
    )
    '''

    other_ethnicities = ['Black',  'South Asian', 'Chinese', 'Irish', 'White Other']

    def get_topper(height, factors, color):
        topper = bokeh.plotting.figure(
            width=plot_width,
            height=height,
            y_range=bokeh.models.FactorRange(*factors),
            x_range=results_plot.x_range,
            toolbar_location=None,
            outline_line_color='black'
        )
        topper.xgrid.ticker = []
        topper.xaxis.ticker = []
        if color:
            cds = bokeh.models.ColumnDataSource(dict(
                x=half_xs+0.5, y=[7.5/2]*half_xs.shape[0], width=[1]*half_xs.shape[0], height=[7.5]*half_xs.shape[0], color=['grey']*half_xs.shape[0], alpha=['0.15']*half_xs.shape[0], line_color=[None]*half_xs.shape[0]
            ))
            topper.rect(
                x='x', y='y', width='width', height='height', color='color', alpha='alpha', line_color='line_color', source=cds
            )
        return topper
    
    qtl_topper = get_topper(
        60, ['expression QTL', 'splice or isoform QTL'], True
    )

    qtl_STRs = pl.read_csv(args.qtl_STRs_table, sep='\t')

    eqtl_STR_locs = qtl_STRs.filter(~pl.col('p_vals_expression').is_null())['chrom_pos']
    eqtl_STRs = df[
        ('chr' + pl.col('chrom').cast(str) + '_' + pl.col('start_pos').cast(str)).is_in(eqtl_STR_locs)
    ].to_numpy().flatten()
    qtl_topper.circle(
        (x_coords + 0.5)[eqtl_STRs],
        ['expression QTL']*np.sum(eqtl_STRs),
        color='black',
    )

    splice_iso_STR_locs = qtl_STRs.filter(~pl.col('p_vals_splice').is_null() | ~pl.col('p_vals_isoform').is_null())['chrom_pos']
    splice_iso_STRs = df[
        ('chr' + pl.col('chrom').cast(str) + '_' + pl.col('start_pos').cast(str)).is_in(splice_iso_STR_locs)
    ].to_numpy().flatten()
    qtl_topper.circle(
        (x_coords + 0.5)[splice_iso_STRs],
        ['splice or isoform QTL']*np.sum(splice_iso_STRs),
        color='black',
    )


    replication_topper = get_topper(
        150, [f'{ethnicity} replication'.replace('Other', 'other') for ethnicity in other_ethnicities], True
    )

    for ethnicity_num, ethnicity in enumerate(other_ethnicities):
        replicates = df.select((
            (pl.col('finemapping') == 'confidently') &
            (pl.col('other_ethnicity_effect_directions').str.split_exact(",", ethnicity_num+1).struct.field(f'field_{ethnicity_num}').str.strip() == pl.col('direction_of_association')) &
            (pl.col('other_ethnicity_association_p_values').str.split_exact(",", ethnicity_num+1).struct.field(f'field_{ethnicity_num}').str.strip().cast(float)*pl.col('n_confident_assocs') <= 0.05)
        ).alias('out'))['out'].to_numpy()
        replication_topper.circle(
            (x_coords + 0.5)[replicates],
            [f'{ethnicity} replication'.replace('Other', 'other')]*np.sum(replicates),
            color='black',
        )

    repeat_unit_topper = get_topper(90, ['polyA', 'polyAC', 'polyCCG'], True)
    for repeat_unit in 'A', 'AC', 'CCG':
        selection = df['repeat_unit'].to_numpy() == repeat_unit
        repeat_unit_topper.circle(
            (x_coords + 0.5)[selection],
            [f'poly{repeat_unit}'] * np.sum(selection),
            color='black',
        )


    indices = []
    for index, coord in enumerate(x_coords):
        if index == list(x_coords).index(coord):
            indices.append(index)
    assert len(indices) == max(x_coords) + 1
    unique_stable_strs = list(np.char.partition(np.array(unique_stable_strs, dtype=str), '_')[:, 2])

    multiallelic_topper = get_topper(30, ['number of common alleles'], False)

    max_common_alleles = np.max(df['n_common_alleles'].to_numpy())
    cds = bokeh.models.ColumnDataSource(dict(
        x=x_coords[indices]+0.5,
        y=['number of common alleles']*(int(np.max(x_coords)) + 1),
        width=[1]*(int(np.max(x_coords)) + 1),
        height=[1]*(int(np.max(x_coords)) + 1),
        color=['black']*(int(np.max(x_coords)) + 1),
        alpha=df['n_common_alleles'].to_numpy()[indices]/max_common_alleles,
        line_color=[None]*(int(np.max(x_coords)) + 1)
    ))
    multiallelic_topper.rect(
        x='x', y='y', width='width', height='height', color='color', alpha='alpha', line_color='line_color', source=cds
    )

    multiallelic_scale = bokeh.plotting.figure(
        width=120,
        height=40*max_common_alleles,
        y_range=[0.5, max_common_alleles+0.5],
        y_axis_label='# common alleles',
        toolbar_location=None,
        outline_line_color='black'
    )
    multiallelic_scale.axis.axis_label_text_font_size = '26px'
    multiallelic_scale.xaxis.ticker = []
    multiallelic_scale.ygrid.ticker = []
    multiallelic_scale.yaxis.ticker = np.arange(1, max_common_alleles + 1)

    cds = bokeh.models.ColumnDataSource(dict(
        x=[0]*max_common_alleles,
        y=np.arange(1, max_common_alleles+1),
        width=[1]*max_common_alleles,
        height=[1]*max_common_alleles,
        color=['black']*max_common_alleles,
        alpha=np.arange(1, max_common_alleles+1)/max_common_alleles,
        line_color=[None]*max_common_alleles
    ))
    multiallelic_scale.rect(
        x='x', y='y', width='width', height='height',
        color='color', alpha='alpha', line_color='line_color',
        source=cds
    )

    scale_ps = [10, 20, 40, 80, 160, 300]
    str_scale_ps = [str(p) for p in scale_ps]
    p_val_scale = bokeh.plotting.figure(
        width=120,
        height=30*len(scale_ps) + 90,
        y_range=bokeh.models.FactorRange(*str_scale_ps),
        y_axis_label='-log10 p-value',
        toolbar_location=None,
        outline_line_color='black'
    )
    p_val_scale.axis.axis_label_text_font_size = '26px'
    p_val_scale.xaxis.ticker = []
    p_val_scale.grid.ticker = []
    p_val_scale.triangle(
        [0]*len(scale_ps),
        str_scale_ps,
        size=np.sqrt(scale_ps)*2,
        color='black'
    )

    bokeh.io.export_png(
        bokeh.layouts.row(
            bokeh.layouts.column(multiallelic_topper, repeat_unit_topper, replication_topper, qtl_topper, results_plot),
            bokeh.layouts.column(multiallelic_scale, p_val_scale)
        ),
        filename=args.outfile
    )
Code example #26
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('phenotypes', nargs='+')
    phenotypes = parser.parse_args().phenotypes

    all_dfs = []
    susie_cs_min_abs_corrs = []
    finemap_cs_coverages = []
    unconverged_regions = []
    #underexplored_regions = []
    unfinished_regions = []

    for phenotype in phenotypes:

        pheno_dfs = []
        str_assocs = pl.scan_csv(
            f'{ukb}/association/results/{phenotype}/my_str/results.tab',
            sep='\t',
        ).select([
            pl.lit(phenotype).alias('phenotype'), 'chrom', 'pos',
            pl.col(f'p_{phenotype}').alias('p_val'),
            pl.lit(True).alias('is_STR'),
            pl.lit(None).cast(int).alias('reflen'),
            pl.lit(None).cast(int).alias('altlen')
        ])

        snp_assocs = pl.scan_csv(
            f'{ukb}/association/results/{phenotype}/plink_snp/results.tab',
            sep='\t',
            null_values='NA',
        ).select([
            pl.col('#CHROM').alias('chrom'),
            pl.col('POS').alias('pos'),
            pl.col('REF').str.lengths().cast(int).alias('reflen'),
            pl.col('ALT').str.lengths().cast(int).alias('altlen'),
            pl.col('P').alias('p_val'),
        ]).groupby(['chrom', 'pos', 'reflen', 'altlen']).agg([
            pl.col('p_val').min().alias('p_val'),
        ]).with_columns([
            pl.lit(phenotype).alias('phenotype'),
            pl.lit(False).alias('is_STR')
        ]).select([
            'phenotype', 'chrom', 'pos', 'p_val', 'is_STR', 'reflen', 'altlen'
        ])

        assocs = pl.concat([str_assocs, snp_assocs
                            ]).filter(pl.col('p_val') <= p_val_thresh)

        regions_df = pl.read_csv(f'{ukb}/signals/regions/{phenotype}.tab',
                                 sep='\t')
        for chrom, start, end, any_strs in zip(regions_df['chrom'],
                                               regions_df['start'],
                                               regions_df['end'],
                                               regions_df['any_strs']):
            if not any_strs:
                continue
            converged_fname = f'{ukb}/finemapping/susie_results/{phenotype}/{chrom}_{start}_{end}/converged.txt'
            if not os.path.exists(converged_fname):
                unfinished_regions.append((phenotype, chrom, start, end))
                continue
            with open(converged_fname) as converged_file:
                if next(converged_file).strip() != 'TRUE':
                    unconverged_regions.append((phenotype, chrom, start, end))
                    continue
            print(f'Loading {phenotype} region {chrom}:{start}-{end}',
                  flush=True)
            with open(
                    f'{ukb}/finemapping/susie_results/{phenotype}/{chrom}_{start}_{end}/colnames.txt'
            ) as var_file:
                susie_vars = [line.strip() for line in var_file]
            alphas = pl.scan_csv(
                f'{ukb}/finemapping/susie_results/{phenotype}/{chrom}_{start}_{end}/alpha.tab',
                sep='\t',
                has_header=False).collect().to_numpy().T
            n_alphas = alphas.shape[1]
            susie_pips = 1 - np.prod(1 - alphas, axis=1)
            assert susie_pips.shape[0] == len(susie_vars)
            susie_idx = np.arange(len(susie_vars)) + 1
            susie_df = pl.DataFrame({
                'varname': susie_vars,
                'susie_pip': susie_pips,
                'susie_alpha': np.zeros(len(susie_vars)),
                'susie_cs': [-1] * len(susie_vars),
                'susie_idx': susie_idx,
                **{f'alpha_{i}': alphas[:, i]
                   for i in range(n_alphas)}
            }).lazy()
            finemap_df = pl.scan_csv(
                f'{ukb}/finemapping/finemap_results/{phenotype}/{chrom}_{start}_{end}/finemap_output.snp',
                sep=' ').select([
                    pl.col('rsid').alias('varname'),
                    pl.col('prob').alias('finemap_pip')
                ])

            df = susie_df.join(finemap_df, how='inner', on=[
                'varname'
            ]).with_columns([
                pl.col('varname').str.extract('^[^_]*_([^_]*)',
                                              1).cast(int).alias('pos'),
                pl.col('varname').str.extract(
                    '^[^_]*_[^_]*_([^_]*)_.*',
                    1).str.lengths().cast(int).alias('reflen'),
                pl.col('varname').str.extract(
                    '^[^_]*_[^_]*_[^_]*_([^_]*)',
                    1).str.lengths().cast(int).alias('altlen'),
                pl.col('varname').str.contains('^STR').alias('is_STR'),
                pl.lit(f'{phenotype}_{chrom}_{start}_{end}').alias('region'),
                pl.lit(chrom).alias('chrom').cast(int),
                pl.lit(phenotype).alias('phenotype')
            ]).sort('susie_idx')

            real_cs_count = 0
            for cs_fname in glob.glob(
                    f'{ukb}/finemapping/susie_results/{phenotype}/{chrom}_{start}_{end}/cs*.txt'
            ):
                cs_id = int(cs_fname.split('cs')[-1].split('.')[0])
                with open(cs_fname) as cs_file:
                    # susie uses 1 based indexing, python uses 0
                    # make sure cs idxs are in increasing order
                    cs_susie_idx = np.array(
                        [int(idx) for idx in next(cs_file).strip().split()])
                    assert np.all(cs_susie_idx[1:] - cs_susie_idx[:-1] > 0)
                    cs_susie_idx = pl.Series('cs_susie_idx', cs_susie_idx)
                    next(cs_file)  # skip cs credibility
                    min_abs_corr, _, _ = [
                        float(idx) for idx in next(cs_file).strip().split()
                    ]
                susie_cs_min_abs_corrs.append(min_abs_corr)
                finemap_cs_coverages.append(
                    df.filter(pl.col('susie_idx').is_in(cs_susie_idx)).select(
                        pl.col('finemap_pip').sum()).collect())
                df = df.with_column(
                    pl.when(pl.col('susie_idx').is_in(cs_susie_idx)).then(
                        pl.when(
                            pl.col(f'alpha_{cs_id-1}') > pl.col('susie_alpha')
                        ).then(pl.col(f'alpha_{cs_id-1}')).otherwise(
                            pl.col('susie_alpha'))).otherwise(
                                pl.col('susie_alpha')).alias('susie_alpha'))
                if min_abs_corr < corr_cutoff:
                    continue
                real_cs_count += 1
                # could worry about variants being in multiple CSes
                df = df.with_column(
                    pl.when(pl.col('susie_idx').is_in(cs_susie_idx)).then(
                        cs_id).otherwise(pl.col('susie_cs')).alias('susie_cs'))
            pheno_dfs.append(df)
            '''
            if real_cs_count >= 10:
                underexplored_regions.append((phenotype, chrom, start, end))
            '''
        pheno_dfs = [
            df.select(pl.col('*').exclude('^alpha.*$')) for df in pheno_dfs
        ]
        pheno_df = pl.concat(pheno_dfs).join(
            assocs,
            how='left',
            on=['phenotype', 'chrom', 'is_STR', 'pos', 'reflen',
                'altlen']).collect()
        all_dfs.append(pheno_df)

    del df, susie_df, finemap_df, assocs, pheno_dfs, pheno_df
    susie_cs_min_abs_corrs = np.array(susie_cs_min_abs_corrs)
    finemap_cs_coverages = np.array(finemap_cs_coverages)

    total_df = pl.concat(all_dfs)
    #total_assocs = pl.concat(all_assocs).filter(pl.col('p_val') <= p_val_thresh)
    '''
    start_time = time.time()
    print('Gathering data ... ', flush=True)
    total_df = total_df.join(
        total_assocs,
        how='left',
        on=['phenotype', 'chrom', 'is_STR', 'pos', 'reflen', 'altlen']
    ).collect()
    print(f'Done. Time: {time.time() - start_time:.2}')
    '''

    total_df.filter(
        ~pl.col('p_val').is_null() & (pl.col('p_val') <= p_val_thresh)).to_csv(
            f'{ukb}/post_finemapping/intermediate_results/gathered_data.tab',
            sep='\t')

    print(
        'Any vars with null Ps?',
        total_df.select(pl.col('p_val').is_null().alias('null?')).select(
            pl.any('null?').alias('any_nulls'))['any_nulls'][0])
    print(
        'n regions',
        total_df.select(
            pl.col('region').unique().count().alias('region_count'))
        ['region_count'][0])

    cses_per_region = total_df.filter(
        pl.col('susie_cs') >= 0).filter(~pl.col('p_val').is_null()).groupby([
            'susie_cs', 'region'
        ]).agg(
            pl.col('p_val').min().alias('min_p'),
        ).filter(pl.col('min_p') <= p_val_thresh).groupby('region').agg(
            pl.col('region').count().alias('n_cses')).to_dict(False)['n_cses']
    print(
        f'avg cses (total PIP >= .9, min_p_val of CS members <= {p_val_thresh}) per region {np.mean(cses_per_region)}, ({np.std(cses_per_region)})'
    )

    for filter_, text in ((pl.lit(True), ''), (pl.col('is_STR'), ' STR'),
                          (~pl.col('is_STR'), ' SNP')):
        susie_hits_per_region = total_df.filter(filter_).with_column(
            ((pl.col('susie_cs') >= 0) & (pl.col('susie_pip') >= pip_threshold)
             & (pl.col('p_val') <= p_val_thresh)
             ).alias('susie_hit')).groupby('region').agg(
                 pl.col('susie_hit').sum().alias('n_susie_hits')).to_dict(
                     False)['n_susie_hits']
        print(
            f'avg susie{text} hits (var is in a CS, PIP >= {pip_threshold}, p_val <= {p_val_thresh}) per region {np.mean(susie_hits_per_region)}, ({np.std(susie_hits_per_region)})'
        )

        finemap_hits_per_region = total_df.filter(filter_).with_column(
            ((pl.col('finemap_pip') >= pip_threshold) &
             (pl.col('p_val') <= p_val_thresh)
             ).alias('finemap_hit')).groupby('region').agg(
                 pl.col('finemap_hit').sum().alias('n_finemap_hits')).select(
                     'n_finemap_hits').to_numpy()
        print(
            f'avg finemap{text} hits (PIP >= {pip_threshold}, p_val <= {p_val_thresh}) per region {np.mean(finemap_hits_per_region)}, ({np.std(finemap_hits_per_region)})'
        )

        print('Exporting FINEMAP vs SuSiE PIP plots', flush=True)
        comparison_thresh = 0.3
        title = f'{text} with p-val <= {p_val_thresh} where at least one of SuSiE or FINEMAP PIP >= {comparison_thresh}'
        if text == '':
            title = 'Vars ' + title
        fig = bokeh.plotting.figure(
            width=1200,
            height=1200,
            title=title,
            x_axis_label='FINEMAP PIPs',
            y_axis_label='SuSiE PIPs',
        )
        fig.title.text_font_size = '30px'
        fig.axis.axis_label_text_font_size = '26px'
        fig.axis.major_label_text_font_size = '20px'

        fig.background_fill_color = None
        fig.border_fill_color = None
        fig.ygrid.grid_line_color = None
        fig.xgrid.grid_line_color = None
        fig.toolbar.logo = None
        fig.toolbar_location = None
        print(total_df.filter(filter_))
        print(total_df.filter(filter_ & (pl.col('p_val') <= p_val_thresh)))
        pips = total_df.filter(filter_ & (pl.col('p_val') <= p_val_thresh)
                               & ((pl.col('finemap_pip') >= comparison_thresh)
                                  | ((pl.col('susie_pip') >= comparison_thresh)
                                     & (pl.col('susie_cs') >= 0)))).select(
                                         ['susie_pip', 'finemap_pip'])
        print(pips)

        bin_size = .05
        bins = bokeh.util.hex.hexbin(
            pips['finemap_pip'].to_numpy().reshape(-1),
            pips['susie_pip'].to_numpy().reshape(-1),
            size=bin_size)

        palette = [
            linear_int_interpolate((134, 204, 195), (9, 41, 46), i / 254)
            for i in range(-1, 255)
        ]
        cmap = bokeh.transform.log_cmap('counts',
                                        palette=palette,
                                        low=1,
                                        high=max(bins.counts),
                                        low_color=(255, 255, 255))
        color_mapper = bokeh.models.LogColorMapper(palette=palette,
                                                   low=1,
                                                   high=max(bins.counts))

        fig.hex_tile(q='q',
                     r='r',
                     size=bin_size,
                     line_color=None,
                     source=bins,
                     fill_color=cmap)
        color_bar = bokeh.models.ColorBar(color_mapper=color_mapper,
                                          width=70,
                                          major_label_text_font_size='20px')
        fig.add_layout(color_bar, 'right')
        ext = text.replace(' ', '_')
        bokeh.io.export_png(
            fig,
            filename=
            f'{ukb}/export_scripts/results/finemap_pip_vs_susie_pip{ext}.png')
        bokeh.io.export_svg(
            fig,
            filename=
            f'{ukb}/export_scripts/results/finemap_pip_vs_susie_pip{ext}.svg')

    print(f'unconverged regions: {unconverged_regions}')
    print(f'unfinished regions: {unfinished_regions}')
    #print(f'underexplored regions: {underexplored_regions}')

    fig = bokeh.plotting.figure(
        width=1200,
        height=1200,
        title='SuSiE credible set min absolute correlations',
        x_axis_label='min absolute correlation',
        y_axis_label='# credible sets',
    )
    fig.axis.axis_label_text_font_size = '30px'
    fig.background_fill_color = None
    fig.border_fill_color = None
    fig.grid.grid_line_color = None
    fig.toolbar_location = None
    step = 0.01
    left_edges = np.arange(0, 1 + step, step)
    ys = [
        np.sum((left_edge <= susie_cs_min_abs_corrs)
               & (susie_cs_min_abs_corrs < left_edge + step))
        for left_edge in left_edges
    ]
    fig.quad(top=ys, bottom=0, left=left_edges, right=left_edges + step)

    print('Exporting cs plots', flush=True)
    bokeh.io.export_png(
        fig, filename=f'{ukb}/export_scripts/results/cs_min_abs_corrs.png')
    bokeh.io.export_svg(
        fig, filename=f'{ukb}/export_scripts/results/cs_min_abs_corrs.svg')

    fig = bokeh.plotting.figure(
        width=1200,
        height=1200,
        title=
        f'Number of SuSie CSes min absolute corr >= {corr_cutoff} per region',
        x_axis_label='# cses in the region',
        y_axis_label='# regions',
    )
    fig.axis.axis_label_text_font_size = '30px'
    fig.background_fill_color = None
    fig.border_fill_color = None
    fig.grid.grid_line_color = None
    fig.toolbar_location = None
    left_edges = np.arange(0, max(cses_per_region) + 1)
    ys = [
        np.sum((left_edge <= cses_per_region)
               & (cses_per_region < left_edge + 1)) for left_edge in left_edges
    ]
    fig.quad(top=ys, bottom=0, left=left_edges, right=left_edges + 1)

    print('Exporting cs per region plots', flush=True)
    bokeh.io.export_png(
        fig, filename=f'{ukb}/export_scripts/results/cses_per_region.png')
    bokeh.io.export_svg(
        fig, filename=f'{ukb}/export_scripts/results/cses_per_region.svg')

    fig = bokeh.plotting.figure(
        width=1200,
        height=1200,
        title=f'Number of FINEMAP vars with PIP >= {pip_threshold} per region',
        x_axis_label='# hits in the region',
        y_axis_label='# regions',
    )
    fig.axis.axis_label_text_font_size = '30px'
    fig.background_fill_color = None
    fig.border_fill_color = None
    fig.grid.grid_line_color = None
    fig.toolbar_location = None
    left_edges = np.arange(0, max(finemap_hits_per_region) + 1)
    ys = [
        np.sum((left_edge <= finemap_hits_per_region)
               & (finemap_hits_per_region < left_edge + 1))
        for left_edge in left_edges
    ]
    fig.quad(top=ys, bottom=0, left=left_edges, right=left_edges + 1)

    print('Exporting finemap hits per region plots', flush=True)
    bokeh.io.export_png(
        fig,
        filename=f'{ukb}/export_scripts/results/finemap_hits_per_region.png')
    bokeh.io.export_svg(
        fig,
        filename=f'{ukb}/export_scripts/results/finemap_hits_per_region.svg')

    fig = bokeh.plotting.figure(
        width=1200,
        height=1200,
        title=
        f'FINEMAP total PIPs for SuSiE CSes with min_abs_corr >= {corr_cutoff}',
        x_axis_label='FINEMAP PIPs',
        y_axis_label='# credible sets',
    )
    fig.background_fill_color = None
    fig.border_fill_color = None
    fig.ygrid.grid_line_color = None
    fig.xgrid.grid_line_color = None
    fig.toolbar.logo = None
    fig.toolbar_location = None
    include = susie_cs_min_abs_corrs >= corr_cutoff
    max_total_pip = max(1, np.max(finemap_cs_coverages[include]))
    step = 0.01
    left_edges = np.arange(0, max_total_pip + step, step)
    ys = [
        np.sum((left_edge <= finemap_cs_coverages[include])
               & (finemap_cs_coverages[include] < left_edge + step))
        for left_edge in left_edges
    ]
    fig.quad(top=ys, bottom=0, left=left_edges, right=left_edges + step)

    print('Exporting FINEMAP CS PIP plots', flush=True)
    bokeh.io.export_png(
        fig,
        filename=f'{ukb}/export_scripts/results/susie_cs_finemap_total_pips.png'
    )
    bokeh.io.export_svg(
        fig,
        filename=f'{ukb}/export_scripts/results/susie_cs_finemap_total_pips.svg'
    )

    total_cses = np.sum(include)
    total_cses_large_finemap_pip = np.sum(
        finemap_cs_coverages[include] >= pip_threshold)
    print(
        f'SuSiE CSes with min_abs_corr >= {corr_cutoff} with FINEMAP total PIP >= {pip_threshold}: {total_cses_large_finemap_pip} ({total_cses_large_finemap_pip/total_cses:%})'
    )

    susie_pip_threshold_for_finemap = .3
    n_replicates_from_finemap = total_df.filter(
        (pl.col('susie_cs') >= 0)
        & (pl.col('susie_pip') >= susie_pip_threshold_for_finemap)
        & (pl.col('finemap_pip') >= pip_threshold)).shape[0]
    n_finemap_total = total_df.filter(
        pl.col('finemap_pip') >= pip_threshold).shape[0]
    print(
        f'FINEMAP hits with PIP >= {pip_threshold} in a SuSiE CS with abs corr >= {corr_cutoff} and SuSiE PIP >= {susie_pip_threshold_for_finemap}: {n_replicates_from_finemap} ({n_replicates_from_finemap/n_finemap_total:%})'
    )

    for (curr_df, text) in [(total_df, 'all hits no filter'),
                            (total_df.filter(pl.col('p_val') <= 1e-10),
                             'all hits p<=1e-10')]:
        print(text)
        var_thresh1 = .8
        var_thresh2 = .3
        for susie_thresh in (var_thresh1, var_thresh2):
            for finemap_thresh in (var_thresh1, var_thresh2):
                count = curr_df.filter(
                    (pl.col('susie_cs') >= 0)
                    & (pl.col('susie_pip') >= susie_thresh)
                    & (pl.col('finemap_pip') >= finemap_thresh)).shape[0]
                print(
                    f'Vars in a SuSiE CS with SuSIE PIP >= {susie_thresh} and with FINEMAP PIP >= {finemap_thresh}: {count}'
                )

        for susie_thresh in (var_thresh1, var_thresh2):
            count = curr_df.filter(
                (pl.col('susie_cs') >= 0)
                & (pl.col('susie_pip') >= susie_thresh)
                & (pl.col('finemap_pip') < var_thresh2)).shape[0]
            print(
                f'Vars in a SuSiE CS with SuSIE PIP >= {susie_thresh} with FINEMAP PIP < {var_thresh2}: {count}'
            )
        for finemap_thresh in (var_thresh1, var_thresh2):
            count = curr_df.filter(
                (pl.col('finemap_pip') >= finemap_thresh)
                & ((pl.col('susie_cs') < 0)
                   | (pl.col('susie_pip') < var_thresh2))).shape[0]
            print(
                f'Vars with FINEMAP PIP >= {finemap_thresh} either not in a SuSiE CS or having SuSiE PIP <= {var_thresh2}: {count}'
            )

    # Not going to report susie alphas v pips - just know that they're similar if we look
    # at vars in good credible sets and not otherwise
    '''
Code example #27
import polars as pl
from .dataset import df

q = df.lazy().with_column(
    pl.when(pl.col("range") >= 5).then(pl.col("left")).otherwise(
        pl.col("right")).alias("foo_or_bar"))

df = q.collect()
Code example #28
joined = pl.read_csv(
    f'{ukb}/export_scripts/results/causal_STR_candidates_for_publication.tab',
    sep='\t').select((
        'chr' + pl.col('chrom').cast(str) + '_' + pl.col('start_pos').cast(str)
    ).alias('chr_pos')).distinct().join(
        results, how='inner', left_on='chr_pos',
        right_on='hg19_START').join(filtered, how='left', on='chr_pos').filter(
            pl.col('FILTER').is_null()).drop([
                'START',
                'FILTER',
                'CHROM_START', 'CHROM', 'POS'
            ]).filter((pl.col('splice_p_vals').str.lengths() > 0) | (
                pl.col('expression_p_vals').str.lengths() > 0)).with_columns([
                    pl.when(pl.col('splice_p_vals').str.lengths() == 0).then(
                        None).otherwise(
                            pl.col('splice_p_vals')).alias('splice_p_vals'),
                    pl.when(pl.col('splice_p_vals').str.lengths() == 0).then(
                        None).otherwise(
                            pl.col('splice_associations (tissue:gene:exonID)')
                        ).alias('splice_associations (tissue:gene:exonID)'),
                    pl.when(pl.col('splice_p_vals').str.lengths() == 0).then(
                        None).otherwise(
                            pl.col('splice_n_tests')).alias('splice_n_tests'),
                    pl.when(pl.col('expression_p_vals').str.lengths()
                            == 0).then(None).otherwise(
                                pl.col('expression_p_vals')).alias(
                                    'expression_p_vals'),
                    pl.when(pl.col('expression_p_vals').str.lengths() == 0).
                    then(None).otherwise(
                        pl.col('expression_associations (tissue:gene)')).alias(
Code example #29
    pl.col('FILTER').is_null()
).drop(['START', 'FILTER', 'CHROM_START', 'CHROM', 'POS']).filter(
'''
'''
total_qtl_str = total_qtl_str.join(
    trait_assocs,
    on='chrom_pos'
)
'''
total_qtl_str = total_qtl_str.filter(
    (pl.col('p_vals_expression').str.lengths() > 0)
    | (pl.col('p_vals_splice').str.lengths() > 0)
    | (pl.col('p_vals_isoform').str.lengths() > 0)
).with_columns([
    pl.when(
        pl.col('p_vals_expression').str.lengths() == 0).then(None).otherwise(
            pl.col('p_vals_expression')).alias('p_vals_expression'),
    pl.when(
        pl.col('p_vals_expression').str.lengths() == 0).then(None).otherwise(
            pl.col('associations (tissue:target)_expression')).alias(
                'associations (tissue:target)_expression'),
    pl.when(
        pl.col('p_vals_expression').str.lengths() == 0).then(None).otherwise(
            pl.col('n_tests_expression')).alias('n_tests_expression'),
    pl.when(pl.col('p_vals_splice').str.lengths() == 0).then(None).otherwise(
        pl.col('p_vals_splice')).alias('p_vals_splice'),
    pl.when(pl.col('p_vals_splice').str.lengths() == 0).then(None).otherwise(
        pl.col('associations (tissue:target)_splice')).alias(
            'associations (tissue:target)_splice'),
    pl.when(pl.col('p_vals_splice').str.lengths() == 0).then(None).otherwise(
        pl.col('n_tests_splice')).alias('n_tests_splice'),
Code example #30
    print('Loading eSTRs ... ')
    eSTRs = pl.read_csv(f'{ukb}/misc_data/eSTR/eSTRs.csv', sep=',').rename({
        'score':
        'eSTR_CAVIAR_score'
    }).with_column(pl.col('chrom').str.slice(3).cast(int)).groupby(
        ['chrom', 'str.start']).agg(pl.col('eSTR_CAVIAR_score').max())

    all_STRs = all_STRs.join(
        eSTRs,
        how='left',
        left_on=['chrom', 'pos'],
        right_on=['chrom', 'str.start'],
    ).with_columns([
        (~pl.col('eSTR_CAVIAR_score').is_null()).alias('eSTR'),
        pl.when(pl.col('eSTR_CAVIAR_score').is_null()).then(False).otherwise(
            pl.col('eSTR_CAVIAR_score') >= .3).alias('FM_eSTR')
    ])

    print('Getting promoters ... ', flush=True, end='')
    genes = pl.read_csv(
        f'{ukb}/misc_data/gencode/gencode.v38lift37.annotation.without_chr.sorted.gene.gff3',
        sep='\t',
        has_header=False,
        columns=[0, 3, 4, 6, 8],
        dtypes={
            'column_1': str
        }).select([
            pl.col('column_1').alias('chrom'),
            pl.col('column_4').alias('start_pos'),
            pl.col('column_5').alias('end_pos'),
            pl.col('column_7').alias('strand'),