Example #1
def read_list_data(input_file_path: str) -> List[str]:
    """
    Reads a file input into a python list (each line will be an element).
    Supports Google storage paths and .gz compression.

    :param input_file_path: File path
    :return: List of lines
    """
    if input_file_path.startswith("gs://"):
        hl.hadoop_copy(input_file_path, "file:///" + input_file_path.split("/")[-1])
        f = (
            gzip.open("/" + os.path.basename(input_file_path))
            if input_file_path.endswith("gz")
            else open("/" + os.path.basename(input_file_path))
        )
    else:
        f = (
            gzip.open(input_file_path, "rt")  # "rt" so lines are str, not bytes
            if input_file_path.endswith("gz")
            else open(input_file_path)
        )
    output = []
    for line in f:
        output.append(line.strip())
    f.close()
    return output
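A minimal usage sketch (the bucket path below is hypothetical; assumes gzip, os, hl, and List are imported and Hail is initialized):

# hypothetical path; a gs:// input is staged locally before being read
sample_ids = read_list_data('gs://my-bucket/sample_ids.txt.gz')
print(f'{len(sample_ids)} lines read')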
Example #2
def get_codings():
    """
    Read codings data from Duncan's repo and load it into a Hail Table

    :return: Hail table with codings
    :rtype: Table
    """
    root = f'{tempfile.gettempdir()}/PHESANT'
    if subprocess.check_call(
        ['git', 'clone', 'https://github.com/astheeggeggs/PHESANT.git', root]):
        raise Exception('Could not clone repo')
    hts = []
    coding_dir = f'{root}/WAS/codings'
    for coding_file in os.listdir(coding_dir):
        hl.hadoop_copy(f'file://{coding_dir}/{coding_file}',
                       f'{coding_dir}/{coding_file}')
        ht = hl.import_table(f'{coding_dir}/{coding_file}')
        if 'node_id' not in ht.row:
            ht = ht.annotate(node_id=hl.null(hl.tstr),
                             parent_id=hl.null(hl.tstr),
                             selectable=hl.null(hl.tstr))
        ht = ht.annotate(
            coding_id=hl.int(coding_file.split('.')[0].replace('coding', '')))
        hts.append(ht)
    full_ht = hts[0].union(*hts[1:]).key_by('coding_id', 'coding')
    return full_ht.repartition(10)
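The hl.null annotations above are there because Table.union requires identical row schemas, so coding files that lack the hierarchical columns are padded with missing values first. A toy sketch of the same idea (field names here are illustrative, not from the source):

t1 = hl.utils.range_table(2).annotate(extra=hl.null(hl.tstr))  # table missing real values
t2 = hl.utils.range_table(2).annotate(extra=hl.str('x'))       # table with the column populated
unioned = t1.union(t2)  # succeeds because both tables now share the schema (idx, extra)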
Example #3
def get_all_codings():
    """
    Download all coding data files from UKB website
    """
    import requests
    coding_prefix = '/tmp/coding'
    all_codings = requests.post(url='http://biobank.ndph.ox.ac.uk/showcase/scdown.cgi', data={'fmt': 'txt', 'id': 2})
    all_codings = all_codings.text.strip().split('\n')[1:]
    hts = []
    for coding_list in all_codings:
        coding = coding_list.split('\t')[0]
        r = requests.post(url='http://biobank.ndph.ox.ac.uk/showcase/codown.cgi', data={'id': coding})
        req_data = r.text
        if r.status_code != 200 or not req_data or req_data.startswith('<!DOCTYPE HTML>'):
            print(f'Issue with {coding}: {r.text}')
            continue
        with open(f'{coding_prefix}{coding}.tsv', 'w') as f:
            f.write(req_data)
        hl.hadoop_copy(f'file://{coding_prefix}{coding}.tsv', f'{coding_prefix}{coding}.tsv')
        ht = hl.import_table(f'{coding_prefix}{coding}.tsv')
        if 'node_id' not in ht.row:
            ht = ht.annotate(node_id=hl.null(hl.tstr), parent_id=hl.null(hl.tstr), selectable=hl.null(hl.tstr))
        ht = ht.annotate(coding_id=hl.int(coding))
        hts.append(ht)
    full_ht = hts[0].union(*hts[1:]).key_by('coding_id', 'coding')
    return full_ht.repartition(10)
Example #4
def read_list_data(input_file: str) -> List[str]:
    if input_file.startswith('gs://'):
        hl.hadoop_copy(input_file, 'file:///' + input_file.split("/")[-1])
        f = gzip.open("/" + os.path.basename(input_file)) if input_file.endswith('gz') else open("/" + os.path.basename(input_file))
    else:
        f = gzip.open(input_file, 'rt') if input_file.endswith('gz') else open(input_file)
    output = []
    for line in f:
        output.append(line.strip())
    f.close()
    return output
Example #5
def pre_process_data_dictionary(pheno_description_raw_path, pheno_description_path):
    """
    Convert Data_Dictionary_Showcase.csv to tsv to enable load into hail

    :param str pheno_description_raw_path: Input file
    :param str pheno_description_path: Parsed tsv file
    """
    local_pheno_description_path = '/tmp/Data_Dictionary_Showcase.csv'
    local_pheno_description_out_path = '/tmp/Data_Dictionary_Showcase.tsv'
    hl.hadoop_copy(pheno_description_raw_path, f'file://{local_pheno_description_path}')
    with open(local_pheno_description_path) as f, open(local_pheno_description_out_path, 'w') as g:
        reader = csv.reader(f)
        for line in reader:
            g.write('\t'.join(line) + '\n')
    hl.hadoop_copy(f'file://{local_pheno_description_out_path}', pheno_description_path)
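Note that joining fields with '\t' would silently corrupt any row whose field contains a tab or newline. A more defensive sketch of the conversion loop, under the same assumptions (local /tmp staging, hl.hadoop_copy for transfer), delegates quoting to csv.writer:

import csv

with open(local_pheno_description_path) as f, open(local_pheno_description_out_path, 'w', newline='') as g:
    reader = csv.reader(f)
    writer = csv.writer(g, delimiter='\t')  # quoting handles embedded tabs/newlines
    for line in reader:
        writer.writerow(line)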
Example #6
def get_1kg(output_dir, overwrite: bool = False):
    """Download subset of the `1000 Genomes <http://www.internationalgenome.org/>`__
    dataset and sample annotations.

    Notes
    -----
    The download is about 15M.

    Parameters
    ----------
    output_dir
        Directory in which to write data.
    overwrite
        If ``True``, overwrite any existing files/directories at `output_dir`.
    """
    jhc = Env.hc()._jhc

    _mkdir(jhc, output_dir)

    matrix_table_path = os.path.join(output_dir, '1kg.mt')
    annotations_path = os.path.join(output_dir, '1kg_annotations.txt')

    if (overwrite or not Env.jutils().dirExists(jhc, matrix_table_path)
            or not Env.jutils().fileExists(jhc, annotations_path)):
        init_temp_dir()
        tmp_vcf = os.path.join(tmp_dir, '1kg.vcf.bgz')
        source = resources['1kg_matrix_table']
        info(f'downloading 1KG VCF ...\n  Source: {source}')
        urlretrieve(source, tmp_vcf)
        cluster_readable_vcf = Env.jutils().copyToTmp(jhc,
                                                      local_path_uri(tmp_vcf),
                                                      'vcf')
        info('importing VCF and writing to matrix table...')
        hl.import_vcf(cluster_readable_vcf,
                      min_partitions=16).write(matrix_table_path,
                                               overwrite=True)

        tmp_annot = os.path.join(tmp_dir, '1kg_annotations.txt')
        source = resources['1kg_annotations']
        info(f'downloading 1KG annotations ...\n' f'  Source: {source}')
        urlretrieve(source, tmp_annot)
        hl.hadoop_copy(local_path_uri(tmp_annot), annotations_path)
        info('Done!')
    else:
        info('1KG files found')
Example #7
def get_1kg(output_dir, overwrite: bool = False):
    """Download subset of the `1000 Genomes <http://www.internationalgenome.org/>`__
    dataset and sample annotations.

    Notes
    -----
    The download is about 15M.

    Parameters
    ----------
    output_dir
        Directory in which to write data.
    overwrite
        If ``True``, overwrite any existing files/directories at `output_dir`.
    """
    jhc = Env.hc()._jhc

    _mkdir(jhc, output_dir)

    matrix_table_path = os.path.join(output_dir, '1kg.mt')
    annotations_path = os.path.join(output_dir, '1kg_annotations.txt')

    if (overwrite
            or not Env.jutils().dirExists(jhc, matrix_table_path)
            or not Env.jutils().fileExists(jhc, annotations_path)):
        init_temp_dir()
        tmp_vcf = os.path.join(tmp_dir, '1kg.vcf.bgz')
        source = resources['1kg_matrix_table']
        info(f'downloading 1KG VCF ...\n'
             f'  Source: {source}')
        urlretrieve(source, tmp_vcf)
        cluster_readable_vcf = Env.jutils().copyToTmp(jhc, local_path_uri(tmp_vcf), 'vcf')
        info('importing VCF and writing to matrix table...')
        hl.import_vcf(cluster_readable_vcf, min_partitions=16).write(matrix_table_path, overwrite=True)

        tmp_annot = os.path.join(tmp_dir, '1kg_annotations.txt')
        source = resources['1kg_annotations']
        info(f'downloading 1KG annotations ...\n'
             f'  Source: {source}')
        urlretrieve(source, tmp_annot)
        hl.hadoop_copy(local_path_uri(tmp_annot), annotations_path)
        info('Done!')
    else:
        info('1KG files found')
Example #8
def main(args):
    ss1, p1 = import_key(args.ss1, args.ss1_chr_pos_ref_alt_p)
    ss2, p2 = import_key(args.ss2, args.ss2_chr_pos_ref_alt_p)
    ss1 = ss1.annotate(ss2=ss2[ss1.key])
    x = (-hl.log10(ss1[p1])).collect()
    y = (-hl.log10(ss1.ss2[p2])).collect()

    fig, ax = plt.subplots()
    plt.xlabel(args.ss1_name)
    plt.ylabel(args.ss2_name)
    plt.title(args.trait)
    ax.scatter(x, y)
    lims = [
        np.min([ax.get_xlim(), ax.get_ylim()]),  # min of both axes
        np.max([ax.get_xlim(), ax.get_ylim()]),  # max of both axes
    ]
    ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
    out_base = args.out.split('/')[-1]
    fig.savefig('/tmp/' + out_base)
    hl.hadoop_copy('file:///tmp/' + out_base, args.out)
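Examples #8, #12, and #14 all repeat the same pattern: save the figure under local /tmp, then copy it out with an explicit file:// source URI. A hedged helper capturing that pattern (the function name is ours, not from any of the sources):

def save_fig_remote(fig, out_path: str):
    # write locally first, then copy to the (possibly remote) destination
    local = '/tmp/' + out_path.split('/')[-1]
    fig.savefig(local)
    hl.hadoop_copy('file://' + local, out_path)  # 'file://' + '/tmp/...' yields file:///tmp/...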
Example #9
def get_hgdp(output_dir, overwrite: bool = False):
    """Download subset of the `Human Genome Diversity Panel
    <https://www.internationalgenome.org/data-portal/data-collection/hgdp/>`__
    dataset and sample annotations.

    Notes
    -----
    The download is about 30MB.

    Parameters
    ----------
    output_dir
        Directory in which to write data.
    overwrite
        If ``True``, overwrite any existing files/directories at `output_dir`.
    """
    fs = Env.fs()

    if not _dir_exists(fs, output_dir):
        fs.mkdir(output_dir)

    matrix_table_path = os.path.join(output_dir, 'HGDP.mt')
    vcf_path = os.path.join(output_dir, 'HGDP.vcf.bgz')
    sample_annotations_path = os.path.join(output_dir, 'HGDP_annotations.txt')
    gene_annotations_path = os.path.join(output_dir, 'ensembl_gene_annotations.txt')

    if (overwrite
            or not _dir_exists(fs, matrix_table_path)
            or not _file_exists(fs, sample_annotations_path)
            or not _file_exists(fs, vcf_path)
            or not _file_exists(fs, gene_annotations_path)):
        init_temp_dir()
        tmp_vcf = os.path.join(tmp_dir, 'HGDP.vcf.bgz')
        source = resources['HGDP_matrix_table']
        info(f'downloading HGDP VCF ...\n'
             f'  Source: {source}')
        sync_retry_transient_errors(urlretrieve, source, tmp_vcf)
        cluster_readable_vcf = _copy_to_tmp(fs, local_path_uri(tmp_vcf), extension='vcf.bgz')
        info('importing VCF and writing to matrix table...')
        hl.import_vcf(cluster_readable_vcf, min_partitions=16, reference_genome='GRCh38').write(matrix_table_path, overwrite=True)

        tmp_sample_annot = os.path.join(tmp_dir, 'HGDP_annotations.txt')
        source = resources['HGDP_annotations']
        info(f'downloading HGDP annotations ...\n'
             f'  Source: {source}')
        sync_retry_transient_errors(urlretrieve, source, tmp_sample_annot)

        tmp_gene_annot = os.path.join(tmp_dir, 'ensembl_gene_annotations.txt')
        source = resources['HGDP_ensembl_gene_annotations']
        info(f'downloading Ensembl gene annotations ...\n'
             f'  Source: {source}')
        sync_retry_transient_errors(urlretrieve, source, tmp_gene_annot)

        hl.hadoop_copy(local_path_uri(tmp_sample_annot), sample_annotations_path)
        hl.hadoop_copy(local_path_uri(tmp_gene_annot), gene_annotations_path)
        hl.hadoop_copy(local_path_uri(tmp_vcf), vcf_path)
        info('Done!')
    else:
        info('HGDP files found')
Example #10
def plot_svg_to_gcs(plot, filename):
    export_svgs(plot, filename)
    hl.hadoop_copy(filename, f'{OUTPUT_BUCKET}/{filename}')
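Unlike the other examples, the source path here is passed without a file:// scheme, so it resolves against the default filesystem. A usage sketch, assuming bokeh is installed with a working webdriver for export_svgs and OUTPUT_BUCKET is a gs:// prefix defined elsewhere:

from bokeh.plotting import figure

p = figure(title='demo')
p.line([1, 2, 3], [4, 6, 5])
p.output_backend = 'svg'  # export_svgs only renders plots that use the SVG backend
plot_svg_to_gcs(p, 'demo.svg')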
Example #11
def preimp_qc(input_type: str = None, dirname: str = None, basename: str = None, pre_geno_thresh: Union[int, float] = 0.95,
              mind_thresh: Union[int, float] = 0.98, fhet_aut: Union[int, float] = 0.2, fstat_x: Union[int, float] = 0.5,
              fstat_y: Union[int, float] = 0.5, geno_thresh: Union[int, float] = 0.98,
              cr_diff_thresh: Union[int, float] = 0.02, maf_thresh: Union[int, float] = 0.01,
              hwe_th_con_thresh: Union[int, float] = 1e-6, hwe_th_cas_thresh: Union[int, float] = 1e-10,
              hwe_th_all_thresh: Union[int, float] = 1e-06, annotations_file: str = None, report: bool = True,
              export_type: str = 'hail', out_dir: str = None, reference: str = 'GRCh38'):
    print('\nRunning QC')

    global mt, row_filters, filters, data_type, lambda_gc_pos, lambda_gc_pre, n_sig_var_pre, n_sig_var_pos, man_table_results, remove_fields

    # create temp directory for storing temp files
    if not os.path.exists('gwaspy_tmp'):
        os.makedirs('gwaspy_tmp')

    # LaTeX needs the full path to files
    gwaspy_dir = os.getcwd() + '/gwaspy_tmp'

    output_directory = out_dir if out_dir else dirname

    hl.init(default_reference=reference)

    # read input
    mt = read_infile(input_type=input_type, dirname=dirname, basename=basename, annotations=annotations_file)

    if 'is_case' in mt.col:
        gwas_pre, n_sig_var_pre = manhattan(qqtitle='Pre-QC QQ Plot', mantitle='Pre-QC Manhattan Plot').filter(mt)
        qqplt_pre, lambda_gc_pre, manplt_pre = manhattan(qqtitle='Pre-QC QQ Plot',
                                                         mantitle='Pre-QC Manhattan Plot').plot(gwas_pre)
        qqplt_pre.savefig('gwaspy_tmp/gwaspy_qq_pre.png', dpi=300)
        manplt_pre.savefig('gwaspy_tmp/gwaspy_man_pre.png', dpi=300)

    mt = mt.annotate_rows(exclude_row=False)
    mt = mt.annotate_cols(exclude_col=False)

    mt, pre_qc_counts = summary_stats(mt)

    if 'is_case' in mt.col:
        if (pre_qc_counts['is_case_counts']['case'] > 0) and (pre_qc_counts['is_case_counts']['control'] == 0):
            data_type = 'Case-only'
        elif (pre_qc_counts['is_case_counts']['control'] > 0) and (pre_qc_counts['is_case_counts']['case'] == 0):
            data_type = 'Control-only'
        elif (pre_qc_counts['is_case_counts']['case'] > 0) and (pre_qc_counts['is_case_counts']['control'] > 0):
            data_type = 'Case-Control'
        else:
            data_type = 'Trio'
    else:
        data_type = 'no-pheno'

    chroms = mt.aggregate_rows(hl.agg.collect_as_set(mt.locus.contig))
    # ('chrX' or 'chrY' or 'chrMT') always evaluates to just 'chrX', so test each contig name explicitly
    if any(c in chroms for c in ('chrX', 'chrY', 'chrMT')):
        chromx, chromy, chrommt = 'chrX', 'chrY', 'chrMT'
    else:
        chromx, chromy, chrommt = 'X', 'Y', 'MT'

    # we need to compute call rate for chr1-23 and chrY separately since females have no chrY
    mt = mt.annotate_entries(
        geno_y_excluded=(hl.case()
                         .when(mt.locus.contig == chromy, False)
                         .default(True)
                         ),
        geno_y_only=(hl.case()
                     .when(mt.locus.contig == chromy, ~mt.is_female)
                     .default(False)
                     )
    )

    mt = pre_geno(pre_geno_cr=pre_geno_thresh).filter(mt)
    mt = id_call_rate(mind=mind_thresh, pre_row_filter='pre_geno').filter(mt)

    mt = fhet_autosomes(pre_row_filter='pre_geno', fhet_thresh=fhet_aut).filter(mt)
    mt = fhet_sex(pre_row_filter='pre_geno', fstat_x=fstat_x, fstat_y=fstat_y).filter(mt)
    mt = fhet_sex_warnings(pre_row_filter='pre_geno', pre_col_filter='sex_violations').filter(mt)

    mt = mt.annotate_cols(**{
        'id_pass': hl.struct(
            filters=(hl.agg.any(mt['mind'].filters) | hl.agg.any(mt['fstat'].filters) |
                     hl.agg.any(mt['sex_violations'].filters))
        )})

    mt = geno(pre_row_filter='pre_geno', pre_col_filter='id_pass', geno_thresh=geno_thresh, data_type=data_type).filter(mt)
    mt = invariant(pre_col_filter='id_pass').filter(mt)

    # for HWE, markers in: (1) autosomes - include males+females; (2) chrX - include ONLY females; (3) exclude chrY
    mt = mt.annotate_entries(
        hwe_aut=(hl.case()
                 .when(mt.locus.contig == chromx, False)
                 .when(mt.locus.contig == chromy, False)
                 .when(mt.locus.contig == chrommt, False)
                 .default(True)
                 ),
        hwe_sex=(hl.case()
                 .when(mt.locus.contig == chromx, mt.is_female)
                 .default(False)
                 )
    )

    if 'is_case' in mt.col:
        mt = call_rate_diff(pre_row_filter='geno', pre_col_filter='id_pass', initial_row_filter='pre_geno',
                            cr_thresh=cr_diff_thresh).filter(mt)

        # check if data is case-/control-only, case-control, or trio
        # (a) Case-Only
        if data_type == 'Case-only':
            print("\n" + data_type)
            mt = hwe_cas(pre_col_filter='id_pass', pre_row_filter='geno', hwe_th_ca=1e-6).filter(mt)
            row_filters = ['pre_geno', 'geno', 'cr_diff', 'monomorphic_var', 'hwe_cas']
            filters = ['pre_geno', 'mind', 'fstat', 'sex_violations', 'sex_warnings', 'geno', 'cr_diff',
                       'monomorphic_var', 'hwe_cas']
            remove_fields = ['cr', 'diff', 'hwe_cas_aut', 'hwe_cas_sex']
        # (b) Control-Only
        elif data_type == 'Control-only':
            print("\n" + data_type)
            mt = hwe_con(pre_col_filter='id_pass', pre_row_filter='geno', hwe_th_co=1e-6).filter(mt)
            row_filters = ['pre_geno', 'geno', 'cr_diff', 'monomorphic_var', 'hwe_con']
            filters = ['pre_geno', 'mind', 'fstat', 'sex_violations', 'sex_warnings', 'geno', 'cr_diff',
                       'monomorphic_var', 'hwe_con']
            remove_fields = ['cr', 'diff', 'hwe_con_aut', 'hwe_con_sex']
        elif data_type == 'Case-Control':
            print("\n" + data_type)
            mt = hwe_cas(pre_col_filter='id_pass', pre_row_filter='geno', hwe_th_ca=hwe_th_cas_thresh).filter(mt)
            mt = hwe_con(pre_col_filter='id_pass', pre_row_filter='geno', hwe_th_co=hwe_th_con_thresh).filter(mt)
            row_filters = ['pre_geno', 'geno', 'cr_diff', 'monomorphic_var', 'hwe_con', 'hwe_cas']
            filters = ['pre_geno', 'mind', 'fstat', 'sex_violations', 'sex_warnings', 'geno', 'cr_diff',
                       'monomorphic_var', 'hwe_con', 'hwe_cas']
            remove_fields = ['cr', 'diff', 'hwe_cas_aut', 'hwe_cas_sex', 'hwe_con_aut', 'hwe_con_sex']
        else:
            # trio data
            print(data_type)
    else:
        print('Running HWE filters on the whole dataset without splitting by phenotype status')
        mt = hwe_all(pre_col_filter='id_pass', pre_row_filter='geno', hwe_th_all=1e-08).filter(mt)
        row_filters = ['pre_geno', 'geno', 'monomorphic_var', 'hwe_all']
        filters = ['pre_geno', 'mind', 'fstat', 'sex_violations', 'sex_warnings', 'geno',
                   'monomorphic_var', 'hwe_all']
        remove_fields = ['hwe_all_aut', 'hwe_all_sex']

    results = {}
    column_filters = ['mind', 'fstat', 'sex_violations', 'sex_warnings']

    mt.select_entries().select_rows(*row_filters).select_cols(*column_filters).write(f'{output_directory}/temp.mt',
                                                                                     overwrite=True)
    mt_temp = hl.read_matrix_table(f'{output_directory}/temp.mt')
    column_aggregations = mt_temp.aggregate_cols(
        [hl.agg.counter(mt_temp[filter].filters) for filter in column_filters])

    row_aggregations = mt_temp.aggregate_rows(
        [hl.agg.counter(mt_temp[filter].filters) for filter in row_filters])

    for filt, cont in zip(column_filters, column_aggregations):
        results[filt] = dict(cont)  # aggregate returns a frozendict; convert it back to a dict

    for filt, cont in zip(row_filters, row_aggregations):
        results[filt] = dict(cont)  # aggregate returns a frozendict; convert it back to a dict

    for i in filters:
        # some filters will have zero snps/ids filtered, and there won't be a True key, so add it
        if True not in results[i]:
            results[i][True] = 0
        print(i, ': ', results[i])

    if report:
        fstat_fig = fhet_sex(pre_row_filter='pre_geno', fstat_x=fstat_x, fstat_y=fstat_y, figsize=(15, 20)).plot(mt)
        fstat_fig.savefig('gwaspy_tmp/gwaspy_fstat_fig.png', dpi=300)

        id_cr_plot = id_call_rate(mind=mind_thresh, pre_row_filter='pre_geno', data_type=data_type).plot(mt)
        var_cr_plot = geno(pre_row_filter='pre_geno', pre_col_filter='id_pass', geno_thresh=geno_thresh,
                           data_type=data_type).plot(mt)

        if 'is_case' in mt.col:
            if data_type == 'Case-only':
                id_cr_plot[0].savefig('gwaspy_tmp/gwaspy_id_cas_pre.png', dpi=300)
                var_cr_plot[0].savefig('gwaspy_tmp/gwaspy_var_cas_pre.png', dpi=300)

            if data_type == 'Control-only':
                id_cr_plot[0].savefig('gwaspy_tmp/gwaspy_id_con_pre.png', dpi=300)
                var_cr_plot[0].savefig('gwaspy_tmp/gwaspy_var_con_pre.png', dpi=300)

            if data_type == 'Case-Control':
                id_cr_plot[0].savefig('gwaspy_tmp/gwaspy_id_con_pre.png', dpi=300)
                id_cr_plot[1].savefig('gwaspy_tmp/gwaspy_id_cas_pre.png', dpi=300)
                var_cr_plot[0].savefig('gwaspy_tmp/gwaspy_var_con_pre.png', dpi=300)
                var_cr_plot[1].savefig('gwaspy_tmp/gwaspy_var_cas_pre.png', dpi=300)

        else:
            id_cr_plot[0].savefig('gwaspy_tmp/gwaspy_id_cas_con_pre.png', dpi=300)
            var_cr_plot[0].savefig('gwaspy_tmp/gwaspy_var_cas_con_pre.png', dpi=300)

    # FILTER OUT ALL SNPs and IDs THAT FAIL QC
    column_filters = ['mind', 'fstat', 'sex_violations']
    for row in row_filters:
        mt = mt.filter_rows(mt[row].filters, keep=False)
    for col in column_filters:
        mt = mt.filter_cols(mt[col].filters, keep=False)

    mt_filtered, pos_qc_counts = summary_stats(mt)

    # drop entry fields we added as they will cause errors when exporting to VCF
    # e.g. Error summary: HailException: Invalid type for format field 'geno_y_excluded'. Found 'bool'.
    drop_fields = filters + ['geno_y_excluded', 'geno_y_only', 'pre_geno_noy', 'pre_geno_y', 'hwe_aut', 'hwe_sex',
                             'exclude_col', 'exclude_row', 'variant_qc', 'aaf', 'geno_noy', 'geno_y', 'sex_ambiguous',
                             'id_pass'] + remove_fields
    mt_filtered = mt_filtered.drop(*drop_fields)

    if 'is_case' in mt.col:
        gwas_pos, n_sig_var_pos = manhattan(qqtitle='Post-QC QQ Plot', mantitle='Post-QC Manhattan Plot').filter(mt)
        qqplt_pos, lambda_gc_pos, manplt_pos = manhattan(qqtitle='Post-QC QQ Plot',
                                                         mantitle='Post-QC Manhattan Plot').plot(gwas_pos)

        ncas_pre = pre_qc_counts['is_case_counts']['case']
        ncas_pos = pos_qc_counts['is_case_counts']['case']
        ncon_pre = pre_qc_counts['is_case_counts']['control']
        ncon_pos = pos_qc_counts['is_case_counts']['control']
        lambda_thous_pre = 1 + (lambda_gc_pre-1)*(1/ncas_pre+1/ncon_pre)/(1/1000+1/1000)
        lambda_thous_pos = 1 + (lambda_gc_pos-1)*(1/ncas_pos+1/ncon_pos)/(1/1000+1/1000)

        qqplt_pos.savefig('gwaspy_tmp/gwaspy_qq_pos.png', dpi=300)
        manplt_pos.savefig('gwaspy_tmp/gwaspy_man_pos.png', dpi=300)

        man_table_results = [n_sig_var_pre, n_sig_var_pos, lambda_gc_pre, lambda_gc_pos, round(lambda_thous_pre, 3),
                             round(lambda_thous_pos, 3)]

    # report
    if report:
        print('\nWriting report')
        doc = MyDocument(basename=basename)
        if 'is_case' in mt.col:
            doc.flags_table(pre_qc_counts=pre_qc_counts, pos_qc_counts=pos_qc_counts, results=results,
                            lambda_gc=lambda_gc_pos, sig_vars=n_sig_var_pos)
        else:
            doc.flags_table(pre_qc_counts=pre_qc_counts, pos_qc_counts=pos_qc_counts, results=results)
        doc.general_info(pre_qc_conts=pre_qc_counts, post_qc_conts=pos_qc_counts,
                         count_results=results, pre_filter=pre_geno_thresh, id_cr=mind_thresh, fhet_thresh=fhet_aut,
                         var_cr=geno_thresh, miss_diff=cr_diff_thresh, hwe_con=hwe_th_con_thresh,
                         hwe_cas=hwe_th_cas_thresh, hwe_all=hwe_th_all_thresh, data_type=data_type)
        if 'is_case' in mt.col:
            doc.manhattan_sec(qq_pre_path=f'{gwaspy_dir}/gwaspy_qq_pre.png', qq_pos_path=f'{gwaspy_dir}/gwaspy_qq_pos.png',
                              man_pre_path=f'{gwaspy_dir}/gwaspy_man_pre.png',
                              man_pos_path=f'{gwaspy_dir}/gwaspy_man_pos.png',
                              table_results=man_table_results)

        if data_type == 'Case-only':
            doc.individual_char(id_con_pre_path='nothing here', id_cas_pre_path=f'{gwaspy_dir}/gwaspy_id_cas_pre.png',
                                fstat_fig_path=f'{gwaspy_dir}/gwaspy_fstat_fig.png', id_all_path='nothing here',
                                data_type=data_type)
            doc.snp_char(var_con_pre_path='nothing here', var_cas_pre_path=f'{gwaspy_dir}/gwaspy_var_cas_pre.png',
                         var_all_path='nothing here', data_type=data_type)
        if data_type == 'Control-only':
            doc.individual_char(id_con_pre_path=f'{gwaspy_dir}/gwaspy_id_con_pre.png', id_cas_pre_path='nothing here',
                                fstat_fig_path=f'{gwaspy_dir}/gwaspy_fstat_fig.png', id_all_path='nothing here',
                                data_type=data_type)
            doc.snp_char(var_con_pre_path=f'{gwaspy_dir}/gwaspy_var_con_pre.png', var_cas_pre_path='nothing here',
                         var_all_path='nothing here', data_type=data_type)
        if data_type == 'Case-Control':
            doc.individual_char(id_con_pre_path=f'{gwaspy_dir}/gwaspy_id_con_pre.png',
                                id_cas_pre_path=f'{gwaspy_dir}/gwaspy_id_cas_pre.png',
                                fstat_fig_path=f'{gwaspy_dir}/gwaspy_fstat_fig.png', id_all_path='nothing here',
                                data_type=data_type)
            doc.snp_char(var_con_pre_path=f'{gwaspy_dir}/gwaspy_var_con_pre.png',
                         var_cas_pre_path=f'{gwaspy_dir}/gwaspy_var_cas_pre.png',
                         var_all_path='nothing here', data_type=data_type)
        if data_type == 'no-pheno':
            doc.individual_char(id_con_pre_path='nothing here', id_cas_pre_path='nothing here',
                                fstat_fig_path=f'{gwaspy_dir}/gwaspy_fstat_fig.png',
                                id_all_path=f'{gwaspy_dir}/gwaspy_id_cas_con_pre.png', data_type=data_type)
            doc.snp_char(var_con_pre_path='nothing here', var_cas_pre_path='nothing here',
                         var_all_path=f'{gwaspy_dir}/gwaspy_var_cas_con_pre.png', data_type=data_type)
        doc.generate_pdf(f'{gwaspy_dir}/{basename}.preimp_qc.report', clean=True, clean_tex=True)

    print('\nExporting qced file')
    if export_type:
        from gwaspy.utils.export_file import export_qced_file
        export_qced_file(mt=mt_filtered, out_dir=output_directory, basename=basename, export_type=export_type)

    if output_directory.startswith('gs://'):
        hl.hadoop_copy(f'file://{gwaspy_dir}/{basename}.preimp_qc.report.pdf',
                       f'{output_directory}GWASpy/Preimp_QC/{basename}.preimp_qc.report.pdf')
    else:
        shutil.copyfile(f'{gwaspy_dir}/{basename}.preimp_qc.report.pdf',
                        f'{output_directory}{basename}.preimp_qc.report.pdf')

    # clean-up
    print('\nCleaning up')
    shutil.rmtree('gwaspy_tmp')

    print("\nDone running QC!")
Example #12
print('\nSUCCESSFULLY CONVERTED GRM TO NUMPY!\n')

# Compute eigenvalue decomposition
print('\nCOMPUTE EIGENVALUES...\n')
time_start = time.time()
#eigenvals = scipy.linalg.eigvalsh(np_grm)
eigenvals = np.linalg.eigvalsh(np_grm)
time_end = time.time()
eigenval_time = time_end - time_start
print(eigenval_time)
print('\nSUCCESSFULLY COMPUTED EIGENVALUES!\n')

# Save eigenvalues
np.save("eigenvals_"+str(pct)+"pct_"+str(nsnps)+"_x_"+str(nindv)+".npy", eigenvals)
np.save("/tmp/eigenvals_"+str(nsnps)+"_x_"+str(nindv)+".npy", eigenvals)
hl.hadoop_copy("file:///tmp/eigenvals_"+str(nsnps)+"_x_"+str(nindv)+".npy", "gs://ukb-gt/"+str(pct)+"pct/eigenvals_"+str(nsnps)+"_x_"+str(nindv)+"meta0.npy")
print('\nSAVED EIGENVALUES!\n')

# Plot
lmda = nindv / nsnps
lmdap = (1 + np.sqrt(lmda))**2
lmdam = (1 - np.sqrt(lmda))**2
x = np.arange(lmdam, lmdap, 0.001)
y = (1/(2*math.pi)) * np.sqrt((lmdap-x)*(x-lmdam)) / (lmda*x)

plt.clf()
plt.hist(eigenvals[1:], bins=1000, density=True)
plt.plot(x, y, '-b', label='Marchenko-Pastur Distribution')
plt.title('Eigenvalues for '+str(nindv)+' Individuals and '+str(nsnps)+' SNPs')
plt.xlabel('Eigenvalues')
plt.ylabel('Density')
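The snippet cuts off before the histogram is written out; a sketch following the same save-then-copy pattern used for the eigenvalues above (the .png filename is ours):

plt.savefig('/tmp/eigenvals_hist.png', dpi=300)
hl.hadoop_copy('file:///tmp/eigenvals_hist.png',
               'gs://ukb-gt/'+str(pct)+'pct/eigenvals_hist_'+str(nsnps)+'_x_'+str(nindv)+'.png')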
Example #13
def main(args):
    #  set up tracking variables corresponding to each dataset
    sumstats = []
    p = []
    ss = args.ss.split(',')
    # guard against args.clumps being unset so clumps[sumstat] below doesn't raise NameError
    # (assumes import_key tolerates a None clump path)
    clumps = args.clumps.split(',') if args.clumps else [None] * len(ss)
    ss_names = args.ss_names.split(',')
    chr_pos_ref_alt_p = args.chr_pos_ref_alt_p.split(';')

    #  read in each set of sumstats
    for sumstat in range(len(ss)):
        ss_data, p_data = import_key(ss[sumstat], chr_pos_ref_alt_p[sumstat],
                                     clumps[sumstat])
        sumstats.append(ss_data)
        p.append(p_data)

    #  join across datasets, filter to target region
    ss_joined = sumstats[0]
    for sumstat in range(1, len(ss)):
        annot_val = 'ss' + str(sumstat)
        ss_joined = ss_joined.annotate(
            **{annot_val: sumstats[sumstat][ss_joined.key]})
    ss_joined_filt = ss_joined.filter(
        hl.parse_locus_interval(args.region).contains(ss_joined.locus))
    ss_to_plot = ss_joined_filt.to_pandas()

    # set up figure and plot
    print('Arial')
    sns.set(font='Arial')
    sns.set_style('white')
    sns.despine()
    fig = plt.figure(figsize=(8, 8))
    fig.subplots_adjust(hspace=0.5)
    plt.tight_layout()
    spacing = 0.04
    fig.text(0.5, spacing, 'Position', ha='center')
    fig.text(spacing, 0.5, '-log10(p)', va='center', rotation='vertical')
    fig.text(0.5, 1 - spacing, args.trait, ha='center')
    # tips = sns.load_dataset("tips")

    for sumstat in range(len(ss)):
        ax = fig.add_subplot(len(ss), 1, sumstat + 1)  #, sharex='col')
        if sumstat > 0:
            current_p = 'ss' + str(sumstat) + '.' + p[sumstat]
            pos = 'ss' + str(sumstat) + '.' + chr_pos_ref_alt_p[sumstat].split(
                ',')[1]
        else:
            current_p = p[sumstat]
            pos = chr_pos_ref_alt_p[sumstat].split(',')[1]
        ss_current_plot = ss_to_plot.filter(items=[pos, current_p, 'clump'],
                                            axis=1)
        ss_current_plot.dropna(inplace=True)
        sns.scatterplot(ss_current_plot[pos],
                        -np.log10(ss_current_plot[current_p]),
                        hue=ss_current_plot.clump,
                        linewidth=0,
                        ax=ax)
        plt.xlabel("")
        plt.ylabel("")
        plt.title(ss_names[sumstat])

    # write plot out
    # gs.tight_layout(fig)
    out_base = args.out.split('/')[-1]
    fig.savefig('/tmp/' + out_base)
    hl.hadoop_copy('file:///tmp/' + out_base, args.out)
Example #14
0
def run_pca_normal(dirname: str = None,
                   basename: str = None,
                   input_type: str = None,
                   reference: str = 'GRCh38',
                   maf: float = 0.05,
                   hwe: float = 1e-3,
                   call_rate: float = 0.98,
                   ld_cor: float = 0.2,
                   ld_window: int = 250000,
                   n_pcs: int = 20,
                   relatedness_method: str = 'pc_relate',
                   relatedness_thresh: float = 0.98,
                   out_dir: str = None):

    print('\nReading mt')
    if reference.lower() == 'grch37':
        lifted_over = f'{dirname}{basename}.liftover.grch38.mt'
        if not hl.hadoop_exists(lifted_over):
            from gwaspy.utils.reference_liftover import liftover_to_grch38
            mt = liftover_to_grch38(dirname=dirname,
                                    basename=basename,
                                    input_type=input_type)
        else:
            print(f'\nFound lifted-over over file: {lifted_over}')
            mt = hl.read_matrix_table(lifted_over)
    else:
        from gwaspy.utils.read_file import read_infile
        mt = read_infile(input_type=input_type,
                         dirname=dirname,
                         basename=basename)

    print('\nFiltering mt')
    mt = pca_filter_mt(in_mt=mt,
                       maf=maf,
                       hwe=hwe,
                       call_rate=call_rate,
                       ld_cor=ld_cor,
                       ld_window=ld_window)

    mt = relatedness_check(in_mt=mt,
                           method=relatedness_method,
                           outdir=out_dir,
                           kin_estimate=relatedness_thresh)

    pca_snps = mt.count_rows()
    if pca_snps > 1000000:
        import warnings
        warnings.warn(
            f'Too many SNPs to be used in PCA: {pca_snps}. This will make PCA run longer'
        )

    print('\nRunning PCA')
    eigenvalues, pcs, _ = hl.hwe_normalized_pca(mt.GT, k=n_pcs)

    pcs_ht = pcs.transmute(
        **{f'PC{i}': pcs.scores[i - 1]
           for i in range(1, n_pcs + 1)})

    # add phenotype and sex to the output, using information from the mt
    # first check if the is_case and is_female fields exist in the mt
    all_column_field_names = list(mt.col)
    # sex status is required, but phenotype status is not
    if 'is_case' in all_column_field_names:
        ann_cols = ['is_case', 'is_female']
    else:
        ann_cols = ['is_female']

    annotations_ht = mt.cols().select(*ann_cols)

    if 'is_case' in all_column_field_names:
        pcs_ht = pcs_ht.annotate(is_case=annotations_ht[pcs_ht.s].is_case)
    pcs_ht = pcs_ht.annotate(is_female=annotations_ht[pcs_ht.s].is_female)

    print('\nSaving PC scores file')
    out_scores_file = f'{out_dir}GWASpy/PCA/pca_normal/{basename}.pca.normal.scores.tsv'
    pcs_ht.export(out_scores_file)

    print('\nGenerating PCA plots')
    pcs_scores = pd.read_table(out_scores_file, header=0, sep='\t')

    if 'is_case' in all_column_field_names:
        pcs_scores[['is_case']] = pcs_scores[['is_case']].replace([True, False, None],
                                                                  ['case', 'control', 'unknown'])
    pcs_scores[['is_female']] = pcs_scores[['is_female']].replace([True, False, None],
                                                                  ['female', 'male', 'unknown'])

    figs_dict = {}
    for col in ann_cols:
        for i in range(1, n_pcs, 2):
            xpc = f'PC{i}'
            ypc = f'PC{i + 1}'

            figs_dict["fig{}{}".format(col,
                                       i)] = plot_pca(pcs_scores, xpc, ypc,
                                                      col)

    pdf = PdfPages('/tmp/pca.no.ref.plots.pdf')
    for figname, figure in figs_dict.items():
        pdf.savefig(figure)
    pdf.close()
    hl.hadoop_copy(
        'file:///tmp/pca.no.ref.plots.pdf',
        f'{out_dir}GWASpy/PCA/pca_normal/{basename}.pca.no.ref.plots.pdf')
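PdfPages also works as a context manager, which guarantees the PDF is flushed and closed before hl.hadoop_copy reads it; an equivalent sketch of the final block:

from matplotlib.backends.backend_pdf import PdfPages

with PdfPages('/tmp/pca.no.ref.plots.pdf') as pdf:
    for figure in figs_dict.values():
        pdf.savefig(figure)
hl.hadoop_copy(
    'file:///tmp/pca.no.ref.plots.pdf',
    f'{out_dir}GWASpy/PCA/pca_normal/{basename}.pca.no.ref.plots.pdf')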
Example #15
0
    # gwas = hl.linear_regression_rows(y=[[mt_filtered.sample_qc_and_phenotype.wbc_gwas_normalised, ...], [family2...]],
    print("Linear regression CHECKPOINT")
    # TIM NOTE: checkpoint here to prevent multiple execution (write to a file, read that file)
    gwas = gwas.checkpoint(f"{BUCKET}/gwas/{CHROMOSOME}-gwasfbc-checkpoint",
                           overwrite=True)
    # gwas = gwas.checkpoint(f"{tmp_dir}/gwas_wbc_chr19_checkpoint.mt")
    print("Linear regression output table")
    gwas.export(f"{BUCKET}/gwas/gwas-{CHROMOSOME}-export.tsv.bgz", header=True)

    print("Plotting")

    for i in range(0, 36):
        print(f"Plotting {i}:{covariates[i]}")
        p = hl.plot.manhattan(
            gwas.p_value[i],
            title=f"Interval WGS GWAS Manhattan Plot: {covariates[i]}")
        output_file(f"{i}.WGS-manhattan-{covariates[i]}.html")
        save(p)
        hl.hadoop_copy(f"{i}.WGS-manhattan-{covariates[i]}.html",
                       f"{BUCKET}/gwas/plots/")

    # p = hl.plot.qq(gwas.p_value[i], title=f"Interval WGS GWAS QQ Plot: {covariates[i]}")
    # export_png(p,f"{BUCKET}/output-tables/wgs-qq-{covariates[i]}.html")
    # output_file(f"{i}.WGS-qq-{covariates[i]}.html")
    # save(p)
    # hl.hadoop_copy(f"{i}.WGS-qq-{covariates[i]}.html", f"{BUCKET}/gwas/plots/")

    mt = mt.checkpoint(
        f"{BUCKET}/matrixtables/{CHROMOSOME}/{CHROMOSOME}-sampleqc-variantqc-filtered-FINAL.mt",
        overwrite=True)
Example #16
0
def run_pca_project(
        ref_dirname: str = 'gs://hgdp-1kg/hgdp_tgp/datasets_for_others/lindo/ds_without_outliers/',
        ref_basename: str = 'unrelated',
        ref_info: str = 'gs://hgdp-1kg/hgdp_tgp/gwaspy_pca_ref/hgdp_1kg_sample_info.unrelateds.pca_outliers_removed.with_project.tsv',
        data_dirname: str = None,
        data_basename: str = None,
        out_dir: str = None,
        input_type: str = None,
        reference: str = 'GRCh38',
        npcs: int = 20,
        maf: float = 0.05,
        hwe: float = 1e-3,
        call_rate: float = 0.98,
        ld_cor: float = 0.2,
        ld_window: int = 250000,
        relatedness_method: str = 'pc_relate',
        relatedness_thresh: float = 0.98,
        prob_threshold: float = 0.8):
    """
    Project samples into predefined PCA space
    :param ref_dirname: directory name where reference data is
    :param ref_basename: base filename for reference data
    :param ref_info: reference sample information
    :param data_dirname: directory name where the data to project is
    :param data_basename: base filename for the data to project
    :param out_dir: directory and filename prefix for where to put PCA projection output
    :param input_type: input file(s) type: hail, plink, or vcf
    :param reference: reference build
    :param npcs: number of principal components to be used in PCA
    :param maf: minor allele frequency threshold
    :param hwe: Hardy-Weinberg filter threshold
    :param call_rate: variant call rate filter threshold
    :param ld_cor: LD correlation (r2) threshold for pruning
    :param ld_window: LD pruning window size
    :param relatedness_method: method to use for relatedness filtering
    :param relatedness_thresh: threshold to use for filtering out related individuals
    :param prob_threshold: probability threshold to use for classifying samples
    :return: a pandas DataFrame with data PCA scores projected on the same PCA space as the Human Genome
        Diversity Project + 1000 Genomes reference
    """
    print('\nReading data mt')
    if reference.lower() == 'grch37':
        lifted_over = f'{data_dirname}{data_basename}.liftover.grch38.mt'
        if not hl.hadoop_exists(lifted_over):
            from gwaspy.utils.reference_liftover import liftover_to_grch38
            mt = liftover_to_grch38(dirname=data_dirname, basename=data_basename, input_type=input_type)
        else:
            print(f'\nFound lifted-over over file: {lifted_over}')
            mt = hl.read_matrix_table(lifted_over)
    else:
        from gwaspy.utils.read_file import read_infile
        mt = read_infile(input_type=input_type, dirname=data_dirname, basename=data_basename)

    print('\nFiltering data mt')
    mt = pca_filter_mt(in_mt=mt, maf=maf, hwe=hwe, call_rate=call_rate, ld_cor=ld_cor, ld_window=ld_window)

    mt = relatedness_check(in_mt=mt, method=relatedness_method, outdir=out_dir, kin_estimate=relatedness_thresh)

    # Intersect data with reference
    intersect_ref(ref_dirname=ref_dirname, ref_basename=ref_basename, data_mt=mt, data_basename=data_basename,
                  out_dir=out_dir)

    ref_in_data = hl.read_matrix_table(f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/1kg_hgdp_intersect_{data_basename}.mt')

    print('\nComputing reference PCs')
    run_ref_pca(mt=ref_in_data, npcs=npcs, out_dir=out_dir, data_basename=data_basename)

    # project data
    pca_loadings = hl.read_table(f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/1kg_hgdp_loadings.ht')
    project_mt = hl.read_matrix_table(f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/{data_basename}_intersect_1kg_hgdp.mt')

    ht_projections = pc_project(mt=project_mt, loadings_ht=pca_loadings)
    ht_projections = ht_projections.transmute(**{f'PC{i}': ht_projections.scores[i - 1] for i in range(1, npcs+1)})
    ht_projections.export(f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/{data_basename}.project.pca.scores.tsv')

    ref_scores = f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/1kg_hgdp.project.pca.scores.txt.bgz'
    data_scores = f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/{data_basename}.project.pca.scores.tsv'
    data_ref = merge_data_with_ref(ref_scores=ref_scores, ref_info=ref_info, data_scores=data_scores)

    from gwaspy.pca.assign_pop_labels import assign_population_pcs
    pcs_df, clf = assign_population_pcs(pop_pc_pd=data_ref, num_pcs=npcs, min_prob=prob_threshold)

    data_pops = pcs_df.loc[pcs_df['SuperPop'].isnull()]
    print(data_pops['pop'].value_counts())
    cols = ['s', 'pop'] + [f'prob_{i}' for i in ["AFR", "AMR", "CSA", "EAS", "EUR", "MID", "OCE"]] + \
           [f'PC{i}' for i in range(1, npcs + 1)]
    data_pops_df = data_pops[cols]

    data_pops_df.to_csv(f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/pca_sup_pops_{prob_threshold}_probs.project.pca.txt',
                        sep='\t', index=False)

    print("\nGenerating PCA plots")
    data_scores_prob = f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/pca_sup_pops_{prob_threshold}_probs.project.pca.txt'

    figs_dict = {}
    # plotting more than 10 PCA plots in HTML generates wobbly, large files
    for i in range(1, 10, 2):
        xpc = f'PC{i}'
        ypc = f'PC{i + 1}'
        figs_dict["fig{}{}".format(xpc, ypc)] = plot_pca_ref(data_scores=data_scores_prob,
                                                             ref_scores=ref_scores,
                                                             ref_info=ref_info,
                                                             x_pc=xpc, y_pc=ypc)
    with open('/tmp/pca.project.plots.html', 'a') as f:
        for figname, figure in figs_dict.items():
            f.write(figure.to_html(include_plotlyjs='cdn'))

    hl.hadoop_copy('file:///tmp/pca.project.plots.html',
                   f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/{data_basename}.pca.project.plots.html')
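A hedged invocation sketch (the data paths are hypothetical; the reference defaults shown in the signature are real gs:// paths baked into the function, and out_dir is expected to end with a slash):

run_pca_project(data_dirname='gs://my-bucket/genotypes/',
                data_basename='mystudy',
                out_dir='gs://my-bucket/outputs/',
                input_type='plink')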