Code Example #1
File: test_lazy_csv.py Project: pola-rs/polars
def test_row_count(foods_csv: str) -> None:
    df = pl.read_csv(foods_csv, row_count_name="row_count")
    assert df["row_count"].to_list() == list(range(27))

    df = (pl.scan_csv(foods_csv, row_count_name="row_count").filter(
        pl.col("category") == pl.lit("vegetables")).collect())

    assert df["row_count"].to_list() == [0, 6, 11, 13, 14, 20, 25]

    df = (pl.scan_csv(foods_csv, row_count_name="row_count").with_row_count(
        "foo",
        10).filter(pl.col("category") == pl.lit("vegetables")).collect())

    assert df["foo"].to_list() == [10, 16, 21, 23, 24, 30, 35]
Code Example #2
def load_dir(pheno, region, dir_):
    with open(f'{dir_}/converged.txt') as converged:
        assert converged.read().strip() == 'TRUE'

    alphas = pl.scan_csv(f'{dir_}/alpha.tab', sep='\t',
                         has_header=False).collect().to_numpy().T
    susie_pips = 1 - np.prod(1 - alphas, axis=1)

    df = pl.scan_csv(f'{dir_}/colnames.txt',
                     has_header=False,
                     with_column_names=lambda _: ['var_name']).with_column(
                         pl.lit(1).alias('row_number')).with_columns([
                             pl.col('row_number').cumsum(),
                             pl.lit(None, int).alias('cs_num'),
                             pl.lit(region).alias('region'),
                             pl.lit(pheno).alias('phenotype'),
                             pl.Series(susie_pips).alias('susie_pip'),
                             pl.lit(None, float).alias('susie_cs_pip')
                         ])

    for cs_num in range(50):
        cs_num += 1
        cs_fname = f'{dir_}/cs{cs_num}.txt'
        if not os.path.exists(cs_fname):
            continue
        with open(cs_fname) as cs:
            var_nums = [int(var_num) for var_num in next(cs).strip().split()]
            next(cs)
            min_ld = float(next(cs).split()[0])
            if min_ld < min_ld_thresh:
                continue
            df = df.with_columns([
                pl.when(pl.col('row_number').is_in(var_nums)).then(
                    pl.when(~pl.col('cs_num').is_null()).then(-1).otherwise(
                        cs_num)).otherwise(pl.col('cs_num')).alias('cs_num'),
                pl.when(pl.col('row_number').is_in(var_nums)).then(
                    pl.Series(alphas[:, cs_num - 1])).otherwise(
                        pl.col('susie_cs_pip')).alias('susie_cs_pip')
            ])

    df = df.with_column(
        pl.when(pl.col('cs_num') != -1).then(
            pl.col('susie_cs_pip')).otherwise(-1).alias('susie_cs_pip'))
    df = df.filter(
        pl.col('var_name').str.contains('^STR') & ~pl.col('cs_num').is_null()
        & (pl.col('susie_pip') > 0.05)).drop('row_number')

    return df
Code Example #3
File: test_lazy_csv.py Project: pola-rs/polars
def test_scan_csv_schema_overwrite_and_dtypes_overwrite(
        foods_csv: str) -> None:
    assert (pl.scan_csv(
        foods_csv,
        dtypes={
            "calories_foo": pl.Utf8,
            "fats_g_foo": pl.Float32
        },
        with_column_names=lambda names: [f"{a}_foo" for a in names],
    ).collect().dtypes) == [pl.Utf8, pl.Utf8, pl.Float32, pl.Int64]
Code Example #4
File: test_lazy_csv.py Project: pola-rs/polars
def test_invalid_utf8() -> None:
    np.random.seed(1)
    bts = bytes(np.random.randint(0, 255, 200))
    file = path.join(path.dirname(__file__), "nonutf8.csv")

    with open(file, "wb") as f:
        f.write(bts)

    a = pl.read_csv(file, has_headers=False, encoding="utf8-lossy")
    b = pl.scan_csv(file, has_headers=False, encoding="utf8-lossy").collect()
    assert a.frame_equal(b, null_equal=True)
Code Example #5
def test_csv_schema_offset(foods_csv: str) -> None:
    csv = """metadata
line
foo,bar
1,2
3,4
5,6
""".encode()
    df = pl.read_csv(csv, skip_rows=2)
    assert df.columns == ["foo", "bar"]
    assert df.shape == (3, 2)
    df = pl.read_csv(csv, skip_rows=2, skip_rows_after_header=2)
    assert df.columns == ["foo", "bar"]
    assert df.shape == (1, 2)

    df = pl.scan_csv(foods_csv, skip_rows=4).collect()
    assert df.columns == ["fruit", "60", "0", "11"]
    assert df.shape == (23, 4)

    df = pl.scan_csv(foods_csv, skip_rows_after_header=10).collect()
    assert df.columns == ["category", "calories", "fats_g", "sugars_g"]
    assert df.shape == (17, 4)
Code Example #6
def get_str_loci(phenotype, my_str_fname, thresh):
    p_col = f'p_{phenotype}'
    csv = pl.scan_csv(
        my_str_fname,
        sep='\t',
        dtypes={'alleles': str, 'locus_filtered': str}
    ).filter(
        pl.col(p_col) <= thresh
    ).with_column(
        pl.when(pl.col(p_col) <= 1e-300)
          .then(0)
          .otherwise(pl.col(p_col))
          .alias(p_col)
    ).collect().to_dict(as_series=False)

    return sortedcontainers.SortedSet(
        iterable = zip(csv[p_col], csv['chrom'], csv['pos'], itertools.repeat('STR'))
    )
Code Example #7
def get_snp_loci(plink_imputed_snp_fname, thresh):
    csv = pl.scan_csv(
        plink_imputed_snp_fname,
        sep='\t',
        null_values='NA'
    ).filter(
        pl.col('P') <= thresh
    ).with_column(
        pl.when(pl.col('P') <= 1e-300)
          .then(0)
          .otherwise(pl.col('P'))
          .alias('P')
    ).filter(
        pl.col('ERRCODE') != 'CONST_OMITTED_ALLELE'
    ).collect()

    assert np.all((csv['ERRCODE'] == '.').to_numpy())

    dict_csv = csv.to_dict(as_series = False)
    return sortedcontainers.SortedSet(
        iterable = zip(dict_csv['P'], dict_csv['#CHROM'], dict_csv['POS'], itertools.repeat('SNP'), dict_csv['REF'], dict_csv['ALT'])
    )
Code Example #8
File: snippet2.py Project: stjordanis/polars-book
import polars as pl

from ..paths import DATA_DIR

q = pl.scan_csv(f"{DATA_DIR}/reddit.csv").filter(
    (pl.col("comment_karma") > 0)
    & (pl.col("link_karma") > 0)
    & (pl.col("name").str_contains(r"^a")))

df = q.fetch(int(1e7))
Code Example #9
import argparse

import bokeh.plotting
import numpy as np
import polars as pl
import scipy.stats

parser = argparse.ArgumentParser()
parser.add_argument('outdir')
parser.add_argument('chrom_files', nargs = '+',
                    help='4 cols: pos, chance of length confusion, avg abs length confusion, normalized avg abs length confusion')
args = parser.parse_args()
outdir = args.outdir
chrom_fnames = args.chrom_files

loci = pl.concat([
    pl.scan_csv(
        chrom_fname,
        sep='\t'
    ) for chrom_fname in chrom_fnames
]).drop('pos').collect()

for col in loci.columns:
    print(f'Plotting column {col} ...', flush=True)
    max_val = loci.select(pl.col(col).max()).to_numpy()
    min_val = loci.select(pl.col(col).min()).to_numpy()
    n_steps = 1000
    step_size = (max_val - min_val)/n_steps
    xs = np.arange(min_val, max_val + step_size, step_size)
    ys = scipy.stats.gaussian_kde(loci[col].to_numpy())(xs)

    if col.startswith('chance'):
        unit = '%'
    elif col.startswith('avg'):
Code Example #10
import polars as pl
from polars.lazy import *
import time

reddit = pl.scan_csv("data/reddit.csv")
runestar = pl.scan_csv("data/runescape.csv", has_headers=False).with_column(
    col("column_1").alias("name")
)

reddit = (
    reddit.filter(col("comment_karma") > 0)
    .filter(col("link_karma") > 0)
    .filter(col("name").str_contains(r"^a"))  # filter name that start with an "a"
)

joined = reddit.join(runestar, on="name", how="inner").select(
    ["name", "comment_karma", "link_karma"]
)

t0 = time.time()

joined.show_graph(True)

df = joined.fetch(int(1e7))

print(time.time() - t0)
print(df)
Code Example #11
import polars as pl
from polars.lazy import *
import time

reddit = pl.scan_csv("data/reddit.csv")

# doesn't really matter due to predicate optimizations
optimal = True

# reddit = reddit.filter(
#     (col("comment_karma") > 0) &
#     (col("link_karma") > 0) &
#     (col("name").str_contains(r"^a"))
# )
reddit = (
    reddit.filter(col("comment_karma") > 0)
    .filter(col("link_karma") > 0)
    .filter(col("name").str_contains(r"^a"))  # filter name that start with an "a"
)

# if optimal:
# this is exactly the same result as below as the query optimizer will combine predicates.
#     reddit = reddit.filter(
#         (col("comment_karma") > 0) &
#         (col("link_karma") > 0) &
#         (col("name").str_contains(r"^a"))
# )
# else:
#     reddit = (
#         reddit
#         .filter(col("comment_karma") > 0)
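The comments above make the point that writing one combined filter or several chained .filter() calls doesn't matter: the query optimizer merges consecutive filters and pushes the combined predicate down to the CSV scan. A minimal sketch for verifying this, assuming the same data path and the older polars 0.x lazy API used in this snippet (where describe_optimized_plan() is available):

chained = (
    pl.scan_csv("data/reddit.csv")
    .filter(col("comment_karma") > 0)
    .filter(col("link_karma") > 0)
)
combined = pl.scan_csv("data/reddit.csv").filter(
    (col("comment_karma") > 0) & (col("link_karma") > 0)
)
# after optimization both plans should show a single combined predicate at the scan
print(chained.describe_optimized_plan())
print(combined.describe_optimized_plan())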
Code Example #12
pheno_datas_d = json.loads(args.pheno_datas_json)
assert set(assoc_results_d.keys()) == set(pheno_datas_d.keys()) == set(ethnicities)

figs = []
for ethnicity in ethnicities:
    if not args.binary:
        stat_name = 'mean'
    else:
        stat_name = 'fraction'

    result = pl.scan_csv(
        assoc_results_d[ethnicity],
        sep='\t',
        dtypes={'locus_filtered': str}
    ).filter(
        (pl.col('chrom') == args.chrom) & (pl.col('pos') == args.pos)
    ).collect().select([ # have to collect first due to some sort of bug
        'motif',
        '0.05_significance_CI',
        '5e-8_significance_CI',
        f'{stat_name}_{args.phenotype}_per_single_dosage',
    ])
    assert result.shape[0] == 1
        
    pheno_data = np.load(pheno_datas_d[ethnicity])

    bgen_samples = []
    with open(f'{ukb}/microarray/ukb46122_hap_chr1_v2_s487314.sample') as samplefile:
        for num, line in enumerate(samplefile):
            if num <= 1:
                # skip first two lines
                continue
Code Example #13
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('outtable')
    parser.add_argument('outreadme')
    parser.add_argument('pos_to_snpstr_pos')
    parser.add_argument('intable')
    parser.add_argument('inreadme')
    parser.add_argument('spot_test_fname_json_dict_fname')

    args = parser.parse_args()

    with open(args.spot_test_fname_json_dict_fname) as json_file:
        spot_test_fname_json_dict = next(json_file)

    with open(args.outreadme, 'w+') as readme:
        with open(args.inreadme) as inreadme:
            readme.write(inreadme.read())
        readme.write(
            'other_ethnic_association_ps - association p-values for the other '
            'ethnicities in the order ' +
            ','.join(other_ethnicities) + '\n'
        )
        readme.write(
            'other_ethnic_effect_directions - direction of association (+/-) '
            'for the other ethnicities in the order ' +
            ','.join(other_ethnicities) +
            " (NaN if that ethnicity's p > 0.05)\n"
        )
        for ethnicity in other_ethnicities:
            readme.write(
                f'{ethnicity}_population_allele_frequencies - frequencies of each allele '
                "(by dosage) among the ethnicity's tested population\n"
            )

    hits = pl.scan_csv(
        args.intable,
        sep='\t',
        # hack added arguments here that will be ignored when reading putatively_causal but not when reading exonic_finemapped
        dtype={'alleles': str}
    )
    cols = hits.columns

    # hack to only clean in one of the two cases this function is running
    if 'white_brit_allele_frequencies' in cols:
        hits = hits.with_column(
            pl.col('white_brit_allele_frequencies').str.replace_all('"', "'")
        )

    hits = hits.join(
        pl.scan_csv(args.pos_to_snpstr_pos, sep='\t'),
        how='left',
        left_on=['chrom', 'start_pos'],
        right_on=['chrom', 'pos']
    )

    spot_tests_fnames = {
        tuple(key.split('__')): fname
        for key, fname in
        json.loads(spot_test_fname_json_dict).items()
    }

    spot_tests = {}
    for outer_ethnicity in other_ethnicities:
        spot_tests[outer_ethnicity] = pl.concat([
            (pl.scan_csv(
                    spot_test_fname,
                    sep='\t',
                    dtype={'alleles': str},
                    null_values=['nan'],
                    with_column_names=lambda cols: list(fix_cols(cols, phenotype))
                ).select([
                    pl.lit(phenotype).alias('phenotype'),
                    'chrom',
                    'pos',
                    pl.col('p_phenotype').cast(float).alias(f'{ethnicity}_p'),
                    pl.when(pl.col('p_phenotype') >= 0.05).then(np.nan).when(pl.col('coeff_phenotype') > 0).then(pl.lit('+')).otherwise(pl.lit('-')).alias(f'{ethnicity}_effect_direction'),
                    pl.col('subset_total_per_allele_dosages').apply(reformat_dosage_dict_str).alias(f'{ethnicity}_population_allele_frequencies')
                ]))
            for (phenotype, _, _, ethnicity), spot_test_fname
            in spot_tests_fnames.items()
            if ethnicity == outer_ethnicity
        ])

    for ethnicity in other_ethnicities:
        hits = hits.join(
            spot_tests[ethnicity],
            how='left',
            left_on=['phenotype', 'chrom', 'snpstr_pos'],
            right_on=['phenotype', 'chrom', 'pos']
        )

    hits = hits.with_columns([
        pl.sum([pl.col(f'{ethnicity}_p').cast(str) + pl.lit(', ') for ethnicity in other_ethnicities])
             .str.replace(', $', '').alias('other_ethnic_association_ps'),
        pl.sum([pl.col(f'{ethnicity}_effect_direction').cast(str) + pl.lit(', ') for ethnicity in other_ethnicities])
             .str.replace(', $', '').alias('other_ethnic_effect_directions')
    ])

    hits = hits.select([
        *cols,
        'other_ethnic_association_ps',
        'other_ethnic_effect_directions',
        *[f'{ethnicity}_population_allele_frequencies' for ethnicity in other_ethnicities]
    ]).collect()
    assert hits.shape[0] == pl.read_csv(
        args.intable,
        sep='\t',
        # same hack as above
        dtype = {'alleles': str}
    ).shape[0]

    hits.to_csv(args.outtable, sep='\t',)
Code Example #14
File: test_lazy_csv.py Project: pola-rs/polars
def test_scan_csv() -> None:
    df = pl.scan_csv(Path(__file__).parent.parent / "files" / "small.csv")
    assert df.collect().shape == (4, 3)
Code Example #15
            info_arr[i][info_arr[i] == None] = 'Missing'
            if not first:
                findings += ':'
            else:
                first = False
            findings += info_arr[i][argsort][:count]
        findingss.append(', '.join(findings))
        pss.append(', '.join(str(x) for x in sort[:count]))
    return (pss, findingss, n_tests)


spliceSTR = pl.scan_csv(f'{workdir}/yang_spliceSTRs.tab',
                        sep='\t').distinct().groupby('hg19_START').agg([
                            pl.col('p_values').list(),
                            pl.col('Tissue').list(),
                            pl.col('gene_name').list(),
                            pl.col('str-exon').str.split_exact(
                                '-',
                                1).struct.field('field_1').list().alias('exon')
                        ]).collect()

pss, findingss, n_tests = fdr_cols(
    spliceSTR['p_values'],
    [spliceSTR['Tissue'], spliceSTR['gene_name'], spliceSTR['exon']])

new_splice = pl.DataFrame({
    'hg19_START':
    spliceSTR['hg19_START'],
    'splice_p_vals':
    pd.Series(pss),
    'splice_associations (tissue:gene:exonID)':
Code Example #16
File: snippet.py Project: pola-rs/polars-book
import polars as pl

from ..paths import DATA_DIR

reddit = (pl.scan_csv(f"{DATA_DIR}/reddit.csv").filter(
    pl.col("comment_karma") > 0).filter(pl.col("link_karma") > 0).filter(
        pl.col("name").str.contains(r"^a")))

runescape = pl.scan_csv("data/runescape.csv", has_headers=False).select(
    pl.col("column_1").alias("name"))

dataset = reddit.join(runescape, on="name", how="inner").select(
    ["name", "comment_karma", "link_karma"])

df1 = dataset.fetch(int(1e7))
df2 = dataset.fetch(int(1e7),
                    predicate_pushdown=True,
                    projection_pushdown=True)
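fetch() here is a debugging convenience: it runs the same query plan as collect(), but caps how many rows each scan reads, so df1 and df2 are bounded previews rather than full results (the pushdown flags toggle which optimizations are applied when running the plan). A minimal sketch of the full-run counterpart, assuming the same files and polars version as above:

# run the optimized plan over the entire files instead of a row-limited preview
df_full = dataset.collect()
# read at most ~100 rows per scan for quick iteration while developing the query
df_small = dataset.fetch(100)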
Code Example #17
def load_plink_results(phenotype,
                       binary,
                       unconditional_results_fname,
                       conditional_results_fname=None):
    # TODO remove conditional snps
    # Load plink SNP results
    print(f"Loading plink SNP results for {phenotype} ... ",
          end='',
          flush=True)
    start_time = time.time()

    if binary:
        binary_colnames = {
            'A1_CASE_CT': 'alt_case_count',
            'A1_CTRL_CT': 'alt_control_count',
            'FIRTH?': 'firth?'
        }
    else:
        binary_colnames = {}
    start_time = time.time()
    unconditional_results = pl.scan_csv(
        unconditional_results_fname, sep='\t',
        null_values='NA').filter(pl.col('P') < 5e-5).rename({
            '#CHROM': 'chr',
            'POS': 'pos',
            'ID': 'id',
            'REF': 'ref',
            'ALT': 'alt',
            'P': 'p_val',
            'ERRCODE': 'error',
            # these last three only occur in logistic regression
            **binary_colnames
        }).select([
            pl.col(col) for col in [
                'chr', 'pos', 'id', 'ref', 'alt', 'p_val', 'error',
                *binary_colnames.values()
            ]
        ]).collect().to_pandas()

    if not conditional_results_fname:
        results = unconditional_results
    else:
        results = pl.scan_csv(
            conditional_results_fname, sep='\t', null_values='NA').rename({
                '#CHROM': 'chr',
                'POS': 'pos',
                'ID': 'id',
                'REF': 'ref',
                'ALT': 'alt',
                'P': 'p_val',
                'ERRCODE': 'error',
                # these last three only occur in logistic regression
                **binary_colnames
            }).select([
                pl.col(col) for col in [
                    'chr', 'pos', 'id', 'ref', 'alt', 'p_val', 'error',
                    *binary_colnames.values()
                ]
            ]).collect().to_pandas()

        unconditional_results['p_val'] = np.maximum(
            unconditional_results['p_val'], 1 / 10**max_p_val)
        unconditional_results['p_val'] = -np.log10(
            unconditional_results['p_val'])
        unconditional_results.rename(columns={'p_val': 'unconditional_p'},
                                     inplace=True)
        unconditional_results = unconditional_results[[
            'chr', 'pos', 'unconditional_p'
        ]]

        results = results.merge(
            unconditional_results, on=['chr', 'pos'], how='inner'
        )  # subsets to only those which passed the p-val threshold in the unconditional run

    if binary == 'logistic':
        results.rename(columns={'firth?': 'firth'}, inplace=True)

    results = utils.df_to_recarray(results)

    results = results[results['error'] != 'CONST_OMITTED_ALLELE']
    if binary == 'logistic':
        # in theory could keep unfinished error codes and just note them,
        # but easier to ignore
        results = results[(results['error'] != 'FIRTH_CONVERGE_FAIL')
                          & (results['error'] != 'UNFINISHED')]
    results['p_val'] = np.maximum(results['p_val'], 1 / 10**max_p_val)
    results['p_val'] = -np.log10(results['p_val'])

    # we've already filtered all the spots that had errors in the unconditional run
    # having a VIF_TOO_HIGH or CORR_TOO_HIGH only in the conditional run just means that
    # SNP is extremely correlated with the conditioning variants, which means
    # its p-value should be very small, so this isn't an issue.
    if not conditional_results_fname:
        if not np.all(results['error'] == '.'):
            print(np.unique(results['error']))
            assert False
    else:
        assert np.all((results['error'] == '.')
                      | (results['error'] == 'VIF_TOO_HIGH')
                      | (results['error'] == 'CORR_TOO_HIGH'))
    # rename for readability
    results['error'][results['error'] == '.'] = 'none'
    results['p_val'][results['error'] == 'VIF_TOO_HIGH'] = 0

    print(f"done ({time.time() - start_time:.2e}s)", flush=True)
    return results
Code Example #18
def load_my_str_results(phenotype,
                        binary,
                        unconditional_results_fname,
                        conditional_results_fname=None):
    print(f"Loading my STR results for {phenotype} ... ", end='', flush=True)
    start_time = time.time()
    with open(unconditional_results_fname) as tsv:
        header = tsv.readline().strip()
    unconditional_results = pl.scan_csv(
        unconditional_results_fname,
        sep='\t',
        skip_rows=1,
        has_header=False,
        with_column_names=lambda _: fix_cols(header),
        dtypes={
            'alleles': str,
            'locus_filtered': str
        }).filter(pl.col(f'p_{phenotype}') < 5e-5).collect().to_pandas()

    if not conditional_results_fname:
        results = unconditional_results
    else:
        results = pd.read_csv(conditional_results_fname,
                              header=0,
                              delimiter='\t',
                              encoding='UTF-8',
                              dtype=utils.get_dtypes(conditional_results_fname,
                                                     {'locus_filtered': str}))

        unconditional_results[f'p_{phenotype}'] = np.maximum(
            unconditional_results[f'p_{phenotype}'], 1 / 10**max_p_val)
        unconditional_results[f'p_{phenotype}'] = -np.log10(
            unconditional_results[f'p_{phenotype}'])
        unconditional_results.rename(
            columns={f'p_{phenotype}': 'unconditional_p'}, inplace=True)
        unconditional_results = unconditional_results[[
            'chrom', 'pos', 'unconditional_p'
        ]]

        results = results.merge(
            unconditional_results, on=['chrom', 'pos'], how='inner'
        )  # subsets to only those which passed the p-val threshold in the unconditional run

    if binary == 'logistic':
        results.rename(columns={'firth?': 'firth'}, inplace=True)

    rename_dict = {}
    for idx, name in my_results_rename.items():
        rename_dict[results.columns[idx]] = name
    rename_dict.update(my_str_results_rename)
    for colname in ('total_per_allele_dosages', 'total_hardcall_alleles',
                    'subset_total_per_allele_dosages',
                    'subset_total_hardcall_alleles',
                    'subset_allele_dosage_r2'):
        # convert allele lens from strings to floats, in addition round allele lens and values, but not NaN values
        new_col = np.array(
            list(
                map(
                    lambda dict_str: {
                        round(float(allele_len), 2): (round(val, 2)
                                                      if val != 'NaN' else val)
                        for allele_len, val in ast.literal_eval(dict_str).
                        items()
                    }, results[colname])))
        # convert allele_lens to ints if they are close enough
        new_col = np.array(
            list(
                map(
                    lambda d: str({(int(key) if key == int(key) else key): val
                                   for key, val in d.items()}), new_col)))
        results[colname] = new_col
    results.rename(columns=rename_dict, inplace=True)
    results = utils.df_to_recarray(results)
    results['p_val'] = np.maximum(results['p_val'], 1 / 10**max_p_val)
    results['p_val'] = -np.log10(results['p_val'])
    if conditional_results_fname:
        for STR in get_conditioned_strs(conditional_results_fname):
            results = results[results['pos'] != STR]
    print(f"done ({time.time() - start_time:.2e}s)", flush=True)
    return results
Code Example #19
File: aggregate.py Project: blackrez/polars-book
import polars as pl
from polars.lazy import *

reddit = pl.scan_csv("data/reddit.csv").select(
    [pl.sum("comment_karma"), pl.min("link_karma")])

if __name__ == "__main__":
    df = reddit.fetch()
    with open("book/src/outputs/how_can_i_aggregate.txt", "w") as f:
        f.write(str(df))
Code Example #20
ukb = os.environ['UKB']

parser = argparse.ArgumentParser()
parser.add_argument('--load', action='store_true', default=False)
parser.add_argument('--calc', action='store_true', default=False)
args = parser.parse_args()

if args.load:
    # pos (start), snpstr_pos (hipstr)
    all_STRs = pl.read_csv(f'{ukb}/snpstr/flank_trimmed_vcf/vars.tab',
                           sep='\t')
    # pos (hipstr)
    snpstr_strs = pl.scan_csv(
        f'{ukb}/snpstr/str_loci.txt',
        sep='\t',
        has_header=False,
        with_column_names=lambda _: ['chrom', 'pos'],
    )

    all_STRs = all_STRs.lazy().join(
        snpstr_strs,
        left_on=['chrom', 'snpstr_pos'],
        right_on=['chrom', 'pos'],
        how='inner',
        suffix='_other').select([
            'chrom', 'pos', 'end_pos', 'snpstr_pos'
        ]).with_column(pl.col('snpstr_pos').alias('SNPSTR_start_pos')).drop(
            'snpstr_pos').distinct(subset=['chrom', 'pos']).collect()
    assert ~np.any(np.isnan(all_STRs['chrom'].to_numpy()))
    assert ~np.any(np.isnan(all_STRs['pos'].to_numpy()))
    assert ~np.any(np.isnan(all_STRs['end_pos'].to_numpy()))
Code Example #21
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('phenotypes', nargs='+')
    phenotypes = parser.parse_args().phenotypes

    all_dfs = []
    susie_cs_min_abs_corrs = []
    finemap_cs_coverages = []
    unconverged_regions = []
    #underexplored_regions = []
    unfinished_regions = []

    for phenotype in phenotypes:

        pheno_dfs = []
        str_assocs = pl.scan_csv(
            f'{ukb}/association/results/{phenotype}/my_str/results.tab',
            sep='\t',
        ).select([
            pl.lit(phenotype).alias('phenotype'), 'chrom', 'pos',
            pl.col(f'p_{phenotype}').alias('p_val'),
            pl.lit(True).alias('is_STR'),
            pl.lit(None).cast(int).alias('reflen'),
            pl.lit(None).cast(int).alias('altlen')
        ])

        snp_assocs = pl.scan_csv(
            f'{ukb}/association/results/{phenotype}/plink_snp/results.tab',
            sep='\t',
            null_values='NA',
        ).select([
            pl.col('#CHROM').alias('chrom'),
            pl.col('POS').alias('pos'),
            pl.col('REF').str.lengths().cast(int).alias('reflen'),
            pl.col('ALT').str.lengths().cast(int).alias('altlen'),
            pl.col('P').alias('p_val'),
        ]).groupby(['chrom', 'pos', 'reflen', 'altlen']).agg([
            pl.col('p_val').min().alias('p_val'),
        ]).with_columns([
            pl.lit(phenotype).alias('phenotype'),
            pl.lit(False).alias('is_STR')
        ]).select([
            'phenotype', 'chrom', 'pos', 'p_val', 'is_STR', 'reflen', 'altlen'
        ])

        assocs = pl.concat([str_assocs, snp_assocs
                            ]).filter(pl.col('p_val') <= p_val_thresh)

        regions_df = pl.read_csv(f'{ukb}/signals/regions/{phenotype}.tab',
                                 sep='\t')
        for chrom, start, end, any_strs in zip(regions_df['chrom'],
                                               regions_df['start'],
                                               regions_df['end'],
                                               regions_df['any_strs']):
            if not any_strs:
                continue
            converged_fname = f'{ukb}/finemapping/susie_results/{phenotype}/{chrom}_{start}_{end}/converged.txt'
            if not os.path.exists(converged_fname):
                unfinished_regions.append((phenotype, chrom, start, end))
                continue
            with open(converged_fname) as converged_file:
                if not next(converged_file).strip() == 'TRUE':
                    unconverged_regions.append((phenotype, chrom, start, end))
                    continue
            print(f'Loading {phenotype} region {chrom}:{start}-{end}',
                  flush=True)
            with open(
                    f'{ukb}/finemapping/susie_results/{phenotype}/{chrom}_{start}_{end}/colnames.txt'
            ) as var_file:
                susie_vars = [line.strip() for line in var_file]
            alphas = pl.scan_csv(
                f'{ukb}/finemapping/susie_results/{phenotype}/{chrom}_{start}_{end}/alpha.tab',
                sep='\t',
                has_header=False).collect().to_numpy().T
            n_alphas = alphas.shape[1]
            susie_pips = 1 - np.prod(1 - alphas, axis=1)
            assert susie_pips.shape[0] == len(susie_vars)
            susie_idx = np.arange(len(susie_vars)) + 1
            susie_df = pl.DataFrame({
                'varname': susie_vars,
                'susie_pip': susie_pips,
                'susie_alpha': np.zeros(len(susie_vars)),
                'susie_cs': [-1] * len(susie_vars),
                'susie_idx': susie_idx,
                **{f'alpha_{i}': alphas[:, i]
                   for i in range(n_alphas)}
            }).lazy()
            finemap_df = pl.scan_csv(
                f'{ukb}/finemapping/finemap_results/{phenotype}/{chrom}_{start}_{end}/finemap_output.snp',
                sep=' ').select([
                    pl.col('rsid').alias('varname'),
                    pl.col('prob').alias('finemap_pip')
                ])

            df = susie_df.join(finemap_df, how='inner', on=[
                'varname'
            ]).with_columns([
                pl.col('varname').str.extract('^[^_]*_([^_]*)',
                                              1).cast(int).alias('pos'),
                pl.col('varname').str.extract(
                    '^[^_]*_[^_]*_([^_]*)_.*',
                    1).str.lengths().cast(int).alias('reflen'),
                pl.col('varname').str.extract(
                    '^[^_]*_[^_]*_[^_]*_([^_]*)',
                    1).str.lengths().cast(int).alias('altlen'),
                pl.col('varname').str.contains('^STR').alias('is_STR'),
                pl.lit(f'{phenotype}_{chrom}_{start}_{end}').alias('region'),
                pl.lit(chrom).alias('chrom').cast(int),
                pl.lit(phenotype).alias('phenotype')
            ]).sort('susie_idx')

            real_cs_count = 0
            for cs_fname in glob.glob(
                    f'{ukb}/finemapping/susie_results/{phenotype}/{chrom}_{start}_{end}/cs*.txt'
            ):
                cs_id = int(cs_fname.split('cs')[-1].split('.')[0])
                with open(cs_fname) as cs_file:
                    # susie uses 1 based indexing, python uses 0
                    # make sure cs idxs are in increasing order
                    cs_susie_idx = np.array(
                        [int(idx) for idx in next(cs_file).strip().split()])
                    assert np.all(cs_susie_idx[1:] - cs_susie_idx[:-1] > 0)
                    cs_susie_idx = pl.Series('cs_susie_idx', cs_susie_idx)
                    next(cs_file)  # skip cs credibility
                    min_abs_corr, _, _ = [
                        float(idx) for idx in next(cs_file).strip().split()
                    ]
                susie_cs_min_abs_corrs.append(min_abs_corr)
                finemap_cs_coverages.append(
                    df.filter(pl.col('susie_idx').is_in(cs_susie_idx)).select(
                        pl.col('finemap_pip').sum()).collect())
                df = df.with_column(
                    pl.when(pl.col('susie_idx').is_in(cs_susie_idx)).then(
                        pl.when(
                            pl.col(f'alpha_{cs_id-1}') > pl.col('susie_alpha')
                        ).then(pl.col(f'alpha_{cs_id-1}')).otherwise(
                            pl.col('susie_alpha'))).otherwise(
                                pl.col('susie_alpha')).alias('susie_alpha'))
                if min_abs_corr < corr_cutoff:
                    continue
                real_cs_count += 1
                # could worry about variants being in multiple CSes
                df = df.with_column(
                    pl.when(pl.col('susie_idx').is_in(cs_susie_idx)).then(
                        cs_id).otherwise(pl.col('susie_cs')).alias('susie_cs'))
            pheno_dfs.append(df)
            '''
            if real_cs_count >= 10:
                underexplored_regions.append((phenotype, chrom, start, end))
            '''
        pheno_dfs = [
            df.select(pl.col('*').exclude('^alpha.*$')) for df in pheno_dfs
        ]
        pheno_df = pl.concat(pheno_dfs).join(
            assocs,
            how='left',
            on=['phenotype', 'chrom', 'is_STR', 'pos', 'reflen',
                'altlen']).collect()
        all_dfs.append(pheno_df)

    del df, susie_df, finemap_df, assocs, pheno_dfs, pheno_df
    susie_cs_min_abs_corrs = np.array(susie_cs_min_abs_corrs)
    finemap_cs_coverages = np.array(finemap_cs_coverages)

    total_df = pl.concat(all_dfs)
    #total_assocs = pl.concat(all_assocs).filter(pl.col('p_val') <= p_val_thresh)
    '''
    start_time = time.time()
    print('Gathering data ... ', flush=True)
    total_df = total_df.join(
        total_assocs,
        how='left',
        on=['phenotype', 'chrom', 'is_STR', 'pos', 'reflen', 'altlen']
    ).collect()
    print(f'Done. Time: {time.time() - start_time:.2}')
    '''

    total_df.filter(
        ~pl.col('p_val').is_null() & (pl.col('p_val') <= p_val_thresh)).to_csv(
            f'{ukb}/post_finemapping/intermediate_results/gathered_data.tab',
            sep='\t')

    print(
        'Any vars with null Ps?',
        total_df.select(pl.col('p_val').is_null().alias('null?')).select(
            pl.any('null?').alias('any_nulls'))['any_nulls'][0])
    print(
        'n regions',
        total_df.select(
            pl.col('region').unique().count().alias('region_count'))
        ['region_count'][0])

    cses_per_region = total_df.filter(
        pl.col('susie_cs') >= 0).filter(~pl.col('p_val').is_null()).groupby([
            'susie_cs', 'region'
        ]).agg(
            pl.col('p_val').min().alias('min_p'),
        ).filter(pl.col('min_p') <= p_val_thresh).groupby('region').agg(
            pl.col('region').count().alias('n_cses')).to_dict(False)['n_cses']
    print(
        f'avg cses (total PIP >= .9, min_p_val of CS members <= {p_val_thresh}) per region {np.mean(cses_per_region)}, ({np.std(cses_per_region)})'
    )

    for filter_, text in ((pl.lit(True), ''), (pl.col('is_STR'), ' STR'),
                          (~pl.col('is_STR'), ' SNP')):
        susie_hits_per_region = total_df.filter(filter_).with_column(
            ((pl.col('susie_cs') >= 0) & (pl.col('susie_pip') >= pip_threshold)
             & (pl.col('p_val') <= p_val_thresh)
             ).alias('susie_hit')).groupby('region').agg(
                 pl.col('susie_hit').sum().alias('n_susie_hits')).to_dict(
                     False)['n_susie_hits']
        print(
            f'avg susie{text} hits (var is in a CS, PIP >= {pip_threshold}, p_val <= {p_val_thresh}) per region {np.mean(susie_hits_per_region)}, ({np.std(susie_hits_per_region)})'
        )

        finemap_hits_per_region = total_df.filter(filter_).with_column(
            ((pl.col('finemap_pip') >= pip_threshold) &
             (pl.col('p_val') <= p_val_thresh)
             ).alias('finemap_hit')).groupby('region').agg(
                 pl.col('finemap_hit').sum().alias('n_finemap_hits')).select(
                     'n_finemap_hits').to_numpy()
        print(
            f'avg finemap{text} hits (PIP >= {pip_threshold}, p_val <= {p_val_thresh}) per region {np.mean(finemap_hits_per_region)}, ({np.std(finemap_hits_per_region)})'
        )

        print('Exporting FINEMAP vs SuSiE PIP plots', flush=True)
        comparison_thresh = 0.3
        title = f'{text} with p-val <= {p_val_thresh} where at least one of SuSiE or FINEMAP PIP >= {comparison_thresh}'
        if text == '':
            title = 'Vars ' + title
        fig = bokeh.plotting.figure(
            width=1200,
            height=1200,
            title=title,
            x_axis_label='FINEMAP PIPs',
            y_axis_label='SuSiE PIPs',
        )
        fig.title.text_font_size = '30px'
        fig.axis.axis_label_text_font_size = '26px'
        fig.axis.major_label_text_font_size = '20px'

        fig.background_fill_color = None
        fig.border_fill_color = None
        fig.ygrid.grid_line_color = None
        fig.xgrid.grid_line_color = None
        fig.toolbar.logo = None
        fig.toolbar_location = None
        print(total_df.filter(filter_))
        print(total_df.filter(filter_ & (pl.col('p_val') <= p_val_thresh)))
        pips = total_df.filter(filter_ & (pl.col('p_val') <= p_val_thresh)
                               & ((pl.col('finemap_pip') >= comparison_thresh)
                                  | ((pl.col('susie_pip') >= comparison_thresh)
                                     & (pl.col('susie_cs') >= 0)))).select(
                                         ['susie_pip', 'finemap_pip'])
        print(pips)

        bin_size = .05
        bins = bokeh.util.hex.hexbin(
            pips['finemap_pip'].to_numpy().reshape(-1),
            pips['susie_pip'].to_numpy().reshape(-1),
            size=bin_size)

        palette = [
            linear_int_interpolate((134, 204, 195), (9, 41, 46), i / 254)
            for i in range(-1, 255)
        ]
        cmap = bokeh.transform.log_cmap('counts',
                                        palette=palette,
                                        low=1,
                                        high=max(bins.counts),
                                        low_color=(255, 255, 255))
        color_mapper = bokeh.models.LogColorMapper(palette=palette,
                                                   low=1,
                                                   high=max(bins.counts))

        fig.hex_tile(q='q',
                     r='r',
                     size=bin_size,
                     line_color=None,
                     source=bins,
                     fill_color=cmap)
        color_bar = bokeh.models.ColorBar(color_mapper=color_mapper,
                                          width=70,
                                          major_label_text_font_size='20px')
        fig.add_layout(color_bar, 'right')
        ext = text.replace(' ', '_')
        bokeh.io.export_png(
            fig,
            filename=
            f'{ukb}/export_scripts/results/finemap_pip_vs_susie_pip{ext}.png')
        bokeh.io.export_svg(
            fig,
            filename=
            f'{ukb}/export_scripts/results/finemap_pip_vs_susie_pip{ext}.svg')

    print(f'unconverged regions: {unconverged_regions}')
    print(f'unfinished regions: {unfinished_regions}')
    #print(f'underexplored regions: {underexplored_regions}')

    fig = bokeh.plotting.figure(
        width=1200,
        height=1200,
        title='SuSiE credible set min absolute correlations',
        x_axis_label='min absolute correlation',
        y_axis_label='# credible sets',
    )
    fig.axis.axis_label_text_font_size = '30px'
    fig.background_fill_color = None
    fig.border_fill_color = None
    fig.grid.grid_line_color = None
    fig.toolbar_location = None
    step = 0.01
    left_edges = np.arange(0, 1 + step, step)
    ys = [
        np.sum((left_edge <= susie_cs_min_abs_corrs)
               & (susie_cs_min_abs_corrs < left_edge + step))
        for left_edge in left_edges
    ]
    fig.quad(top=ys, bottom=0, left=left_edges, right=left_edges + step)

    print('Exporting cs plots', flush=True)
    bokeh.io.export_png(
        fig, filename=f'{ukb}/export_scripts/results/cs_min_abs_corrs.png')
    bokeh.io.export_svg(
        fig, filename=f'{ukb}/export_scripts/results/cs_min_abs_corrs.svg')

    fig = bokeh.plotting.figure(
        width=1200,
        height=1200,
        title=
        f'Number of SuSiE CSes with min absolute corr >= {corr_cutoff} per region',
        x_axis_label='# cses in the region',
        y_axis_label='# regions',
    )
    fig.axis.axis_label_text_font_size = '30px'
    fig.background_fill_color = None
    fig.border_fill_color = None
    fig.grid.grid_line_color = None
    fig.toolbar_location = None
    left_edges = np.arange(0, max(cses_per_region) + 1)
    ys = [
        np.sum((left_edge <= cses_per_region)
               & (cses_per_region < left_edge + 1)) for left_edge in left_edges
    ]
    fig.quad(top=ys, bottom=0, left=left_edges, right=left_edges + 1)

    print('Exporting cs per region plots', flush=True)
    bokeh.io.export_png(
        fig, filename=f'{ukb}/export_scripts/results/cses_per_region.png')
    bokeh.io.export_svg(
        fig, filename=f'{ukb}/export_scripts/results/cses_per_region.svg')

    fig = bokeh.plotting.figure(
        width=1200,
        height=1200,
        title=f'Number of FINEMAP vars with PIP >= {pip_threshold} per region',
        x_axis_label='# hits in the region',
        y_axis_label='# regions',
    )
    fig.axis.axis_label_text_font_size = '30px'
    fig.background_fill_color = None
    fig.border_fill_color = None
    fig.grid.grid_line_color = None
    fig.toolbar_location = None
    left_edges = np.arange(0, max(finemap_hits_per_region) + 1)
    ys = [
        np.sum((left_edge <= finemap_hits_per_region)
               & (finemap_hits_per_region < left_edge + 1))
        for left_edge in left_edges
    ]
    fig.quad(top=ys, bottom=0, left=left_edges, right=left_edges + 1)

    print('Exporting finemap hits per region plots', flush=True)
    bokeh.io.export_png(
        fig,
        filename=f'{ukb}/export_scripts/results/finemap_hits_per_region.png')
    bokeh.io.export_svg(
        fig,
        filename=f'{ukb}/export_scripts/results/finemap_hits_per_region.svg')

    fig = bokeh.plotting.figure(
        width=1200,
        height=1200,
        title=
        f'FINEMAP total PIPs for SuSiE CSes with min_abs_corr >= {corr_cutoff}',
        x_axis_label='FINEMAP PIPs',
        y_axis_label='# credible sets',
    )
    fig.background_fill_color = None
    fig.border_fill_color = None
    fig.ygrid.grid_line_color = None
    fig.xgrid.grid_line_color = None
    fig.toolbar.logo = None
    fig.toolbar_location = None
    include = susie_cs_min_abs_corrs >= corr_cutoff
    max_total_pip = max(1, np.max(finemap_cs_coverages[include]))
    step = 0.01
    left_edges = np.arange(0, max_total_pip + step, step)
    ys = [
        np.sum((left_edge <= finemap_cs_coverages[include])
               & (finemap_cs_coverages[include] < left_edge + step))
        for left_edge in left_edges
    ]
    fig.quad(top=ys, bottom=0, left=left_edges, right=left_edges + step)

    print('Exporting FINEMAP CS PIP plots', flush=True)
    bokeh.io.export_png(
        fig,
        filename=f'{ukb}/export_scripts/results/susie_cs_finemap_total_pips.png'
    )
    bokeh.io.export_svg(
        fig,
        filename=f'{ukb}/export_scripts/results/susie_cs_finemap_total_pips.svg'
    )

    total_cses = np.sum(include)
    total_cses_large_finemap_pip = np.sum(
        finemap_cs_coverages[include] >= pip_threshold)
    print(
        f'SuSiE CSes with min_abs_corr >= {corr_cutoff} with FINEMAP total PIP >= {pip_threshold}: {total_cses_large_finemap_pip} ({total_cses_large_finemap_pip/total_cses:%})'
    )

    susie_pip_threshold_for_finemap = .3
    n_replicates_from_finemap = total_df.filter(
        (pl.col('susie_cs') >= 0)
        & (pl.col('susie_pip') >= susie_pip_threshold_for_finemap)
        & (pl.col('finemap_pip') >= pip_threshold)).shape[0]
    n_finemap_total = total_df.filter(
        pl.col('finemap_pip') >= pip_threshold).shape[0]
    print(
        f'FINEMAP hits with PIP >= {pip_threshold} in a SuSiE CS with abs corr >= {corr_cutoff} and SuSiE PIP >= {susie_pip_threshold_for_finemap}: {n_replicates_from_finemap} ({n_replicates_from_finemap/n_finemap_total:%})'
    )

    for (curr_df, text) in [(total_df, 'all hits no filter'),
                            (total_df.filter(pl.col('p_val') <= 1e-10),
                             'all hits p<=1e-10')]:
        print(text)
        var_thresh1 = .8
        var_thresh2 = .3
        for susie_thresh in (var_thresh1, var_thresh2):
            for finemap_thresh in (var_thresh1, var_thresh2):
                count = curr_df.filter(
                    (pl.col('susie_cs') >= 0)
                    & (pl.col('susie_pip') >= susie_thresh)
                    & (pl.col('finemap_pip') >= finemap_thresh)).shape[0]
                print(
                    f'Vars in a SuSiE CS with SuSIE PIP >= {susie_thresh} and with FINEMAP PIP >= {finemap_thresh}: {count}'
                )

        for susie_thresh in (var_thresh1, var_thresh2):
            count = curr_df.filter(
                (pl.col('susie_cs') >= 0)
                & (pl.col('susie_pip') >= susie_thresh)
                & (pl.col('finemap_pip') < var_thresh2)).shape[0]
            print(
                f'Vars in a SuSiE CS with SuSIE PIP >= {susie_thresh} with FINEMAP PIP < {var_thresh2}: {count}'
            )
        for finemap_thresh in (var_thresh1, var_thresh2):
            count = curr_df.filter(
                (pl.col('finemap_pip') >= finemap_thresh)
                & ((pl.col('susie_cs') < 0)
                   | (pl.col('susie_pip') < var_thresh2))).shape[0]
            print(
                f'Vars with FINEMAP PIP >= {finemap_thresh} either not in a SuSiE CS or having SuSiE PIP <= {var_thresh2}: {count}'
            )

    # Not going to report susie alphas v pips - just know that they're similar if we look
    # at vars in good credible sets and not otherwise
    '''
Code Example #22
def different_vars(susie_vars_to_compare_fname, phenotype, chrom, start_pos, end_pos):
    filter_set_fname = f'{ukb}/finemapping/str_imp_snp_overlaps/chr{chrom}_to_filter.tab'

    p_cutoff = 5e-4

    # first choose STRs and SNPs only with p <= p_cutoff to lessen memory burden
    #print('Loading strs and snps var list ... ', flush=True)
    strs_to_include = set()
    snps_to_include = set()

    strs_to_include = pl.scan_csv(
        f'{ukb}/association/results/{phenotype}/my_str/results.tab',
        sep='\t'
    ).filter(
        (pl.col('chrom') == chrom) &
        (pl.col('pos') >= start_pos) &
        (pl.col('pos') <= end_pos) &
        (pl.col(f'p_{phenotype}') <= p_cutoff)
    ).select(pl.col('pos')).collect().to_numpy().flatten()

    assert len(strs_to_include) != 0

    snps_to_filter = set()
    snps_to_filter = pl.scan_csv(
        filter_set_fname,
        sep='\t'
    ).select([
        pl.col('snp_pos'),
        pl.col('snp_ref'),
        pl.col('snp_alt'),
        pl.lit('1').alias('join_marker')
    ])

    snps_to_include = pl.scan_csv(
        f'{ukb}/association/results/{phenotype}/plink_snp/results.tab',
        sep='\t',
        null_values='NA'
    ).filter(
        (pl.col('#CHROM') == chrom) &
        (pl.col('POS') >= start_pos) &
        (pl.col('POS') <= end_pos) &
        (pl.col('P') <= p_cutoff)
    ).join(
        snps_to_filter,
        how = 'left',
        left_on = ['POS', 'REF', 'ALT'],
        right_on = ['snp_pos', 'snp_ref', 'snp_alt']
    ).filter(
        pl.col('join_marker').is_null()
    ).select([
        pl.col('POS'),
        pl.col('REF'),
        pl.col('ALT')
    ]).collect().pipe(
        lambda df: list(zip(*df.to_dict().values()))
    )
    # returns a list of tuples

    snp_sort_tuples = set((pos, 'SNP', ref, alt) for (pos, ref, alt) in snps_to_include)
    str_sort_tuples = set((pos, 'STR') for pos in strs_to_include)
    vars_ = snp_sort_tuples.union(str_sort_tuples)
    var_names = {
        f'STR_{tuple[0]}' if tuple[1] == 'STR' else f'SNP_{tuple[0]}_{tuple[2]}_{tuple[3]}'
        for tuple in vars_
    }

    with open(susie_vars_to_compare_fname) as susie_vars_to_compare_file:
        susie_vars = {line.strip() for line in susie_vars_to_compare_file.readlines() if line.strip()}

    if susie_vars == var_names:
        return None
    else:
        assert all(x in var_names for x in susie_vars)
        return list(x for x in var_names if x not in susie_vars)
Code Example #23
             },
             **{
                 f'{ethnicity}_se': float
                 for ethnicity in other_ethnicities
             }
         })).filter('is_STR')
 fname = f'{ukb}/association/results/{phenotype}/my_str/results.tab'
 with open(fname) as tsv:
     header = tsv.readline().strip()
 assoc_df = pl.scan_csv(
     fname,
     sep='\t',
     skip_rows=1,
     has_header=False,
     with_column_names=lambda _: header.
     replace('0.05_significance_CI', 'foo', 1).replace(
         '5e-8_significance_CI', 'bar', 1).split(
             '\t')  # these duplicate column names won't be used anyway
 ).select([
     'chrom', 'pos',
     pl.col('subset_total_per_allele_dosages').alias(
         'white_brit_allele_dosages')
 ])
 df = df.lazy().join(assoc_df, how='left', on=['chrom', 'pos'])
 for ethnicity in other_ethnicities:
     fname = f'{ukb}/association/results_finemapped_only/{ethnicity}/{phenotype}/my_str/results.tab'
     with open(fname) as tsv:
         header = tsv.readline().strip()
     assoc_df = pl.scan_csv(
         fname,
         sep='\t',
         skip_rows=1,
Code Example #24
def choose_vars(readme_fname, outcols_fname, phenotype, chrom, start_pos,
                end_pos, p_cutoff, mac, use_PACSIN2):
    if use_PACSIN2:
        assert int(chrom) == 22

    filter_set_fname = f'{ukb}/finemapping/str_imp_snp_overlaps/chr{chrom}_to_filter.tab'

    if mac:
        mac_threshold = int(mac[0])
        snp_mac_fname = mac[1]
        str_mac_fname = mac[2]
        snps_exclude_mac = pl.scan_csv(
            snp_mac_fname,
            sep='\t').filter(pl.col('ALT_CTS') < mac_threshold).select([
                '#POS', 'REF', 'ALT'
            ]).collect().pipe(lambda df: list(zip(*df.to_dict().values())))

        # need to make that look like a list of strings to polars b/c buggy, so add a single nonsense to it
        snps_exclude_mac.append('asdf')

        strs_exclude_mac = pl.scan_csv(
            str_mac_fname,
            sep='\t').filter(pl.col('mac') < mac_threshold).select(
                'pos').collect()['pos'].to_list()

    today = datetime.datetime.now().strftime("%Y_%m_%d")
    with open(readme_fname, 'w') as readme:
        readme.write(
            f'Run date: {today}\n'
            f'Choosing variants for which association tests were not skipped and with p <= {p_cutoff}. '
            'SNPs in the filter set are also skipped. '
            f'(Filter set at {filter_set_fname})\n')

    # first choose STRs and SNPs only with p <= p_cutoff to lessen memory burden
    print('Choosing which strs and snps to include ... ', flush=True)
    strs_to_include = set()
    snps_to_include = set()

    strs_to_include = pl.scan_csv(
        f'{ukb}/association/results/{phenotype}/my_str/results.tab',
        sep='\t').filter((pl.col('chrom') == chrom)
                         & (pl.col('pos') >= start_pos)
                         & (pl.col('pos') <= end_pos)
                         & (pl.col(f'p_{phenotype}') <= p_cutoff)).select(
                             pl.col('pos')).collect().to_numpy().flatten()

    if mac:
        strs_to_include = strs_to_include[
            ~np.isin(strs_to_include, strs_exclude_mac)]

    snps_to_filter = set()
    snps_to_filter = pl.scan_csv(filter_set_fname, sep='\t').select([
        pl.col('snp_pos'),
        pl.col('snp_ref'),
        pl.col('snp_alt'),
        pl.lit('1').alias('join_marker')
    ])

    snps_to_include = pl.scan_csv(
        f'{ukb}/association/results/{phenotype}/plink_snp/results.tab',
        sep='\t',
        null_values='NA').filter(
            (pl.col('#CHROM') == chrom) & (pl.col('POS') >= start_pos)
            & (pl.col('POS') <= end_pos) & (pl.col('P') <= p_cutoff)).join(
                snps_to_filter,
                how='left',
                left_on=['POS', 'REF', 'ALT'],
                right_on=[
                    'snp_pos', 'snp_ref', 'snp_alt'
                ]).filter(pl.col('join_marker').is_null()).select([
                    'POS', 'REF', 'ALT'
                ]).collect().pipe(lambda df: list(zip(*df.to_dict().values())))
    # returns a list of tuples

    if mac:
        snps_to_include = [
            snps_to_include[idx]
            for idx in np.where(~np.isin(snps_to_include, snps_exclude_mac))[0]
        ]

    snp_sort_tuples = set(
        (pos, 'SNP', ref, alt) for (pos, ref, alt) in snps_to_include)
    str_sort_tuples = set((pos, 'STR') for pos in strs_to_include)

    vars_ = snp_sort_tuples.union(str_sort_tuples)
    if use_PACSIN2:
        vars_.remove((43385872, 'STR'))
        vars_.add((43385866, 'PACSIN2_STR'))
        vars_.add((43385875, 'PACSIN2_STR'))
        vars_.add((43385893, 'PACSIN2_STR'))
    sorted_vars = sorted(vars_)
    sorted_var_names = [
        f'STR_{tuple[0]}'
        if tuple[1] == 'STR' else f'SNP_{tuple[0]}_{tuple[2]}_{tuple[3]}'
        if tuple[1] == 'SNP' else f'PACSIN2_STR_{tuple[0]}'
        if tuple[1] == 'PACSIN2_STR' else None  # break the sort
        for tuple in sorted_vars
    ]
    assert len(set(sorted_var_names)) == len(
        sorted_var_names)  # make sure is unique

    print(f'# STRs: {len(strs_to_include)} # SNPs: {len(snps_to_include)}',
          flush=True)

    with open(outcols_fname, 'w') as colfile:
        for var_name in sorted_var_names:
            colfile.write(var_name + '\n')
Code Example #25
File: test_lazy_csv.py Project: pola-rs/polars
def test_scan_empty_csv() -> None:
    with pytest.raises(Exception) as excinfo:
        pl.scan_csv(Path(__file__).parent.parent / "files" /
                    "empty.csv").collect()
    assert str(excinfo.value) == "empty csv"
Code Example #26
#!/usr/bin/env python3

import os

import polars as pl

import phenotypes

ukb = os.environ['UKB']

dfs = []
for phenotype in phenotypes.phenotypes_in_use:
    dfs.append(
        pl.scan_csv(f'{ukb}/signals/regions/{phenotype}.tab',
                    sep='\t').with_column(
                        pl.lit(phenotype).alias('phenotype')))

pl.concat(dfs).collect().with_column(
    (((pl.col('phenotype') == 'total_bilirubin') & (pl.col('chrom') == 12) &
      (pl.col('start') == 19976272) & (pl.col('end') == 22524428)) |
     ((pl.col('phenotype') == 'urate') & (pl.col('chrom') == 4) &
      (pl.col('start') == 8165642) & (pl.col('end') == 11717761)) |
     ((pl.col('phenotype') == 'alkaline_phosphatase') &
      (pl.col('chrom') == 1) & (pl.col('start') == 19430673) &
      (pl.col('end') == 24309348))
     ).alias('filtered_due_to_computation_burden')).select([
         'phenotype', 'chrom', 'start', 'end',
         'filtered_due_to_computation_burden'
     ]).to_csv(
         f'{ukb}/export_scripts/results/supp_table_2_finemapping_regions.tab',
         sep='\t')
Code Example #27
def generate_figure(assoc_results_fname, pheno_data_fname, chrom, pos,
                    phenotype, dosage_fraction_threshold, unit, binary,
                    publication):

    assert bool(unit) or binary

    assert 0 <= dosage_fraction_threshold <= 1

    if not binary:
        y_axis_label = 'Mean ' + phenotype.replace('_', ' ') + f' ({unit})'
    else:
        y_axis_label = 'Fraction ' + phenotype.replace('_', ' ') + ' cases'

    figure = bokeh.plotting.figure(
        width=600,
        height=600,
        y_axis_label=y_axis_label,
        x_axis_label='Sum of allele lengths (repeat copies)')
    figure.grid.grid_line_color = None
    figure.background_fill_color = None
    figure.border_fill_color = None
    figure.toolbar_location = None
    figure.title.text_font_size = '18px'
    figure.axis.axis_label_text_font_size = '18px'
    figure.axis.major_label_text_font_size = '14px'

    if not binary:
        stat_name = 'mean'
    else:
        stat_name = 'fraction'

    def fix_header(header):
        def fix_header_helper(_):
            part1 = header.rpartition('0.05_significance_CI')
            fix1 = part1[0] + 'foo' + part1[2]
            part2 = fix1.rpartition('5e-8_significance_CI')
            fix2 = part2[0] + 'bar' + part2[2]
            return fix2.split('\t')

        return fix_header_helper

    with open(assoc_results_fname) as tsv:
        header = tsv.readline().strip()
    result = pl.scan_csv(
        assoc_results_fname,
        sep='\t',
        dtypes={
            'locus_filtered': str
        },
        skip_rows=1,
        has_header=False,
        with_column_names=fix_header(header)).filter(
            (pl.col('chrom') == chrom)
            & (pl.col('pos') == pos)).collect().select(
                [  # have to collect first due to some sort of bug
                    'motif', '0.05_significance_CI', '5e-8_significance_CI',
                    f'{stat_name}_{phenotype}_per_single_dosage',
                    'total_subset_dosage_per_summed_gt'
                ])
    assert result.shape[0] == 1

    pheno_data = np.load(pheno_data_fname)

    bgen_samples = sample_utils.get_all_samples()
    assert len(bgen_samples) == 487409
    samples_array = np.array(bgen_samples, dtype=float).reshape(-1, 1)

    merged_arr = utils.merge_arrays(samples_array, pheno_data)
    unfiltered_subset = ~np.isnan(merged_arr[:, 1])
    n_samples = np.sum(unfiltered_subset)

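    # 'total_subset_dosage_per_summed_gt' is stored as a stringified dict; parse
    # it with ast.literal_eval and normalize the per-genotype dosage totals into
    # fractions of the overall dosage.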
    subset_summed_dosage_fractions = {
        float(allele): val
        for allele, val in ast.literal_eval(
            result['total_subset_dosage_per_summed_gt'].to_numpy()[0]).items()
    }
    total_dosage = np.sum(list(subset_summed_dosage_fractions.values()))
    subset_summed_dosage_fractions = {
        key: val / total_dosage
        for key, val in subset_summed_dosage_fractions.items()
    }

    alleles = list(subset_summed_dosage_fractions.keys())
    alleles_copy = alleles.copy()
    for allele in alleles_copy:
        if subset_summed_dosage_fractions[allele] < dosage_fraction_threshold:
            alleles.remove(allele)
    alleles = sorted(alleles)

    mean_per_dosage = {
        float(allele): val
        for allele, val in ast.literal_eval(
            result[f'{stat_name}_{phenotype}_per_single_dosage'].to_numpy()
            [0]).items()
    }
    ci5e_2 = {
        float(allele): val
        for allele, val in ast.literal_eval(
            result['0.05_significance_CI'].to_numpy()[0]).items()
    }
    ci5e_8 = {
        float(allele): val
        for allele, val in ast.literal_eval(
            result['5e-8_significance_CI'].to_numpy()[0]).items()
    }
    y_min = min(ci5e_8[allele][0] for allele in alleles)
    y_max = max(ci5e_8[allele][1] for allele in alleles)

    figure.varea(alleles, [ci5e_2[allele][1] for allele in alleles],
                 [ci5e_8[allele][1] for allele in alleles],
                 color="red",
                 alpha=0.2,
                 legend_label='1 - 5e-8 Confidence Interval')
    figure.varea(alleles, [ci5e_2[allele][0] for allele in alleles],
                 [ci5e_2[allele][1] for allele in alleles],
                 color="red",
                 alpha=0.4,
                 legend_label='0.95 Confidence Interval')
    figure.varea(alleles, [ci5e_8[allele][0] for allele in alleles],
                 [ci5e_2[allele][0] for allele in alleles],
                 color="red",
                 alpha=0.2)
    figure.line(alleles, [mean_per_dosage[allele] for allele in alleles],
                line_width=2,
                color="black")
    figure.circle(alleles, [mean_per_dosage[allele] for allele in alleles],
                  color="black",
                  size=6,
                  legend_label='mean')
    figure.legend.label_text_font_size = '10px'

    figure.y_range = bokeh.models.Range1d(y_min - 0.05 * (y_max - y_min),
                                          y_max + 0.05 * (y_max - y_min))

    figure.add_layout(
        bokeh.models.Title(text=f'STR {chrom}:{pos}',
                           align="center",
                           text_font_size='18px'), "above")
    figure.add_layout(
        bokeh.models.Title(text=phenotype.replace('_', ' ').capitalize() +
                           " vs genotype",
                           align="center",
                           text_font_size='18px'), "above")

    if not publication:
        figure.add_layout(
            bokeh.models.Title(
                text="Phenotype values are unadjusted for covariates",
                align="center"), "below")
        figure.add_layout(
            bokeh.models.Title(
                text=
                "People contribute to each genotype based on their prob. of having that genotype",
                align="center"), "below")
        figure.add_layout(
            bokeh.models.Title(text="Only considers tested individuals",
                               align="center"), "below")
        figure.add_layout(
            bokeh.models.Title(
                text=
                f"Genotypes with dosages less than {100*dosage_fraction_threshold}% of the population are omitted",
                align="center"), "below")

    return figure
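
A hypothetical usage sketch for generate_figure above; the file paths, locus, and
parameter values here are placeholders invented for illustration, not taken from
the source:

import bokeh.io

fig = generate_figure(
    assoc_results_fname='my_str_association_results.tab',  # placeholder path
    pheno_data_fname='height_phenotype.npy',                # placeholder path
    chrom=1,
    pos=123456,                      # placeholder locus
    phenotype='height',
    dosage_fraction_threshold=0.001,
    unit='cm',
    binary=False,
    publication=False,
)
bokeh.io.export_png(fig, filename='str_1_123456_height.png')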
Code example #28
File: snippet1.py Project: pola-rs/polars-book
import polars as pl

from ..paths import DATA_DIR

q = (
    pl.scan_csv(f"{DATA_DIR}/reddit.csv").filter(
        pl.col("comment_karma") > 0).filter(pl.col("link_karma") > 0).filter(
            pl.col("name").str.contains(
                r"^a"))  # filter name that start with an "a"
)

df1 = q.fetch(int(1e7))
df2 = q.fetch(int(1e7), predicate_pushdown=True)
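
A small follow-up sketch, not part of the original snippet: on polars releases of
this vintage the lazy query plan can be printed before fetching, which makes the
pushed-down predicates visible. describe_plan/describe_optimized_plan are assumed
to behave as in releases contemporaneous with this snippet.

# Sketch only: compare the naive plan with the optimized (pushed-down) plan.
print(q.describe_plan())
print(q.describe_optimized_plan())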
Code example #29
File: snippet.py Project: stjordanis/polars-book
import polars as pl

q = (pl.scan_csv("data/reddit.csv").groupby("comment_karma").agg(
    [pl.col("name").n_unique().alias("unique_names"),
     pl.max("link_karma")]).sort(by_columns="unique_names", reverse=True))

df = q.fetch()
Code example #30
def main():
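    # Note: this excerpt relies on module-level context not shown here:
    # polars as pl, numpy as np, the bokeh submodules, and the helpers
    # size, palette, count_alleles, heat_map, weighted_heat_map and
    # linear_int_interpolate defined elsewhere in the source file.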
    df = pl.scan_csv('post_finemapping/intermediate_results/gathered_data.tab',
                     sep='\t').filter((pl.col('susie_pip') >= 0.3)
                                      | (pl.col('finemap_pip') >= 0.3))
    df = df.with_column(
        (pl.col('susie_pip') -
         pl.col('finemap_pip')).alias('susie_f_pip_diff')).with_column(
             pl.col('susie_f_pip_diff').abs().alias('abs_pip_diff'))
    locus_summary_df = pl.concat([
        pl.scan_csv(
            f'export_scripts/intermediate_results/chr{chrom}_loci_summary.tab',
            sep='\t') for chrom in range(1, 23)
    ]).select(['chr', 'pos', 'multiallelicness', 'allele_dist'])
    allele_threshes = (0.0004, 0.002, 0.01, 0.05)
    #allele_threshes = [0.01]
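    # Interpretation (not stated explicitly in the source): each threshold is fed
    # to count_alleles() below, presumably counting alleles in 'allele_dist' whose
    # frequency exceeds it; the figure titles later call this the
    # "STR allele penetrance threshold".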
    df = df.join(
        locus_summary_df,
        how='left',
        #left_on=['chrom', 'snpstr_pos'],
        left_on=['chrom', 'pos'],
        right_on=['chr', 'pos']).collect()

    snp_df = df.filter(~pl.col('is_STR'))
    str_df = df.filter(pl.col('is_STR'))
    assert not str_df.select(
        pl.col('multiallelicness').is_null().any()).to_numpy()[0]

    str_df = str_df.with_columns([
        pl.apply('allele_dist', count_alleles(thresh),
                 pl.UInt32).alias(f'alleles_{thresh}')
        for thresh in allele_threshes
    ])
    confusions = pl.concat([
        pl.scan_csv(f'side_analyses/length_confusion/chr{i}.tab',
                    sep='\t').with_column(pl.lit(i).alias('chrom').cast(int))
        for i in range(1, 23)
    ]).collect()
    merged_df = str_df.join(confusions, how='left', on=['chrom', 'pos'])

    step = 0.05
    fig = bokeh.plotting.figure(title='PIP histogram',
                                width=size,
                                height=size,
                                x_axis_label='PIP',
                                y_axis_label='density',
                                tools='',
                                toolbar_location=None)
    xs = np.arange(0, 1 + step, step)
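    # Each fig.line call below plots per-bin densities from
    # np.histogram(..., bins=xs, density=True) against the left bin edges
    # (xs[:-1]); the commented-out gaussian_kde lines are an alternative
    # smoothed-density rendering left in by the author.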
    fig.line(
        x=xs[:-1],
        #y=scipy.stats.gaussian_kde(arr)(xs),
        y=np.histogram(str_df['susie_pip'], bins=xs, density=True)[0],
        color='red',
        legend_label='SuSiE STRs')
    fig.line(
        x=xs[:-1],
        #y=scipy.stats.gaussian_kde(arr)(xs),
        y=np.histogram(str_df['finemap_pip'], bins=xs, density=True)[0],
        color='blue',
        legend_label='FINEMAP STRs')
    fig.line(
        x=xs[:-1],
        #y=scipy.stats.gaussian_kde(arr)(xs),
        y=np.histogram(snp_df['susie_pip'], bins=xs, density=True)[0],
        color='green',
        legend_label='SuSiE SNPs')
    fig.line(
        x=xs[:-1],
        #y=scipy.stats.gaussian_kde(arr)(xs),
        y=np.histogram(snp_df['finemap_pip'], bins=xs, density=True)[0],
        color='purple',
        legend_label='FINEMAP SNPs')
    bokeh.io.export_png(fig,
                        filename='post_finemapping/results/pip_histogram.png')

    fig = bokeh.plotting.figure(title='STR PIP scatterplot',
                                width=size,
                                height=size,
                                x_axis_label='FINEMAP PIP',
                                y_axis_label='SuSiE PIP',
                                tools='',
                                toolbar_location=None)
    # x = FINEMAP PIP, y = SuSiE PIP, matching the axis labels above
    fig.circle(str_df['finemap_pip'], str_df['susie_pip'])
    bokeh.io.export_png(
        fig, filename='post_finemapping/results/str_comp_pip_scatter.png')

    fig = bokeh.plotting.figure(title='STR PIP heatmap',
                                width=size,
                                height=size,
                                x_axis_label='FINEMAP PIP',
                                y_axis_label='SuSiE PIP',
                                match_aspect=True,
                                tools='',
                                toolbar_location=None)
    heat_map(fig, str_df['finemap_pip'], str_df['susie_pip'],
             'post_finemapping/results/str_comp_pip_heatmap.png')

    fig = bokeh.plotting.figure(title='STR PIPs',
                                width=size,
                                height=size,
                                x_axis_label='FINEMAP PIP',
                                y_axis_label='SuSiE PIP',
                                match_aspect=True,
                                tools='',
                                toolbar_location=None)
    weighted_heat_map(
        fig, merged_df['finemap_pip'], merged_df['susie_pip'],
        merged_df['chance_of_length_confusion'],
        'average chance of misgenotyping per sample at any such locus',
        'post_finemapping/results/str_comp_pip_chance_map.png')

    fig = bokeh.plotting.figure(title='STR PIPs',
                                width=size,
                                height=size,
                                x_axis_label='FINEMAP PIP',
                                y_axis_label='SuSiE PIP',
                                match_aspect=True,
                                tools='',
                                toolbar_location=None)
    weighted_heat_map(
        fig, merged_df['finemap_pip'], merged_df['susie_pip'],
        merged_df['normalized_avg_abs_length_confusion'],
        'average number of standard deviations of misgenotyping per sample at any such locus',
        'post_finemapping/results/str_comp_pip_sd_map.png')

    fig = bokeh.plotting.figure(title='SNP PIP scatterplot',
                                width=size,
                                height=size,
                                x_axis_label='FINEMAP PIP',
                                y_axis_label='SuSiE PIP',
                                tools='',
                                toolbar_location=None)
    # x = FINEMAP PIP, y = SuSiE PIP, matching the axis labels above
    fig.circle(snp_df['finemap_pip'], snp_df['susie_pip'])
    bokeh.io.export_png(
        fig, filename='post_finemapping/results/snp_comp_pip_scatter.png')

    fig = bokeh.plotting.figure(title='SNP PIP heatmap',
                                width=size,
                                height=size,
                                x_axis_label='FINEMAP PIP',
                                y_axis_label='SuSiE PIP',
                                match_aspect=True,
                                tools='',
                                toolbar_location=None)
    heat_map(fig, snp_df['finemap_pip'], snp_df['susie_pip'],
             'post_finemapping/results/snp_comp_pip_heatmap.png')

    color_mapper = bokeh.models.LinearColorMapper(palette=palette,
                                                  low=0,
                                                  high=1)
    color_bar = bokeh.models.ColorBar(color_mapper=color_mapper, width=30)
    cmap = bokeh.transform.linear_cmap('foo', palette=palette, low=0, high=1)

    fig = bokeh.plotting.figure(title='STR PIP scatterplot',
                                width=size,
                                height=size,
                                x_axis_label='FINEMAP PIP',
                                y_axis_label='SuSiE PIP',
                                tools='',
                                match_aspect=True,
                                toolbar_location=None)
    cb_title = bokeh.models.Title(
        text='chance a genotype call at this locus is wrong', align='center')
    fig.add_layout(color_bar, 'right')
    fig.add_layout(cb_title, 'right')
    cds = bokeh.models.ColumnDataSource(
        dict(x=merged_df['finemap_pip'],
             y=merged_df['susie_pip'],
             color=[
                 linear_int_interpolate((134, 204, 195), (9, 41, 46), val)
                 for val in merged_df['chance_of_length_confusion']
             ]))
    fig.circle(x='x', y='y', color='color', source=cds)
    bokeh.io.export_png(
        fig,
        filename='post_finemapping/results/colored_str_comp_pip_scatter.png')

    step = 0.05
    for thresh in allele_threshes:
        for pip_thresh in (0.3, 0.8):
            for xs, x_label, out_loc, title, col in [
                (
                    np.arange(-1, 1 + step, step),
                    'SuSiE PIP - FINEMAP PIP',
                    f'post_finemapping/results/pip_diff_density_allele_thresh_{thresh}_pip_thresh_{pip_thresh}.png',
                    f'PIP diff, STR allele penetrance threshold = {thresh:.4}',
                    'susie_f_pip_diff',
                ),
                (np.arange(0, 1 + step, step), 'absolute PIP difference',
                 f'post_finemapping/results/pip_abs_diff_density_allele_thresh_{thresh}_pip_thresh_{pip_thresh}.png',
                 f'absolute PIP diff, STR allele penetrance threshold = {thresh:.4}',
                 'abs_pip_diff')
            ]:
                filter_exp = (pl.col('susie_pip') >= pip_thresh) | (
                    pl.col('finemap_pip') >= pip_thresh)
                fig = bokeh.plotting.figure(title=title,
                                            width=size,
                                            height=size,
                                            x_axis_label=x_label,
                                            y_axis_label='density',
                                            tools='',
                                            toolbar_location=None)
                fig.line(
                    x=xs[:-1],
                    y=np.histogram(snp_df.filter(filter_exp)[col].to_numpy(),
                                   bins=xs,
                                   density=True)[0],
                    #y=scipy.stats.gaussian_kde(snp_df['susie_f_pip_diff'].to_numpy())(xs),
                    color='black',
                    legend_label=f'SNPs (n={snp_df.shape[0]})')
                for count, color in ((2, 'brown'), (3, 'red'), (4, 'orange')):
                    arr = str_df.filter(filter_exp).filter(
                        pl.col(f'alleles_{thresh}') == count)[col].to_numpy()
                    fig.line(
                        x=xs[:-1],
                        #y=scipy.stats.gaussian_kde(arr)(xs),
                        y=np.histogram(arr, bins=xs, density=True)[0],
                        color=color,
                        legend_label=f'{count}-allele STRs (n={arr.shape[0]})')
                arr = str_df.filter(filter_exp).filter(
                    pl.col(f'alleles_{thresh}') >= 5)[col].to_numpy()
                fig.line(
                    x=xs[:-1],
                    #y=scipy.stats.gaussian_kde(arr)(xs),
                    y=np.histogram(arr, bins=xs, density=True)[0],
                    color='gold',
                    legend_label=
                    f'STRs with at least 5 alleles (n={arr.shape[0]})')
                fig.add_layout(
                    bokeh.models.Title(
                        text=
                        f'Variants with PIP at least {pip_thresh} for SuSiE or FINEMAP'
                    ), 'below')
                bokeh.io.export_png(fig, filename=out_loc)

    fig = bokeh.plotting.figure(title='STR PIP diff',
                                width=size,
                                height=size,
                                x_axis_label='multiallelicness',
                                y_axis_label='SuSiE PIP - FINEMAP PIP',
                                tools='',
                                toolbar_location=None)
    heat_map(fig,
             str_df['multiallelicness'],
             str_df['susie_f_pip_diff'],
             'post_finemapping/results/str_pip_diff_heatmap.png',
             y_min=-1)
    fig = bokeh.plotting.figure(title='STR PIP abs diff',
                                width=size,
                                height=size,
                                x_axis_label='multiallelicness',
                                y_axis_label='absolute PIP difference',
                                tools='',
                                toolbar_location=None)
    heat_map(fig, str_df['multiallelicness'], str_df['abs_pip_diff'],
             'post_finemapping/results/str_pip_abs_diff_heatmap.png')

    fig = bokeh.plotting.figure(title='PIP abs diff',
                                width=size,
                                height=size,
                                x_axis_label='multiallelicness',
                                y_axis_label='absolute PIP difference',
                                tools='',
                                toolbar_location=None)