def read_list_data(input_file_path: str) -> List[str]:
    """
    Reads a file input into a python list (each line will be an element).
    Supports Google storage paths and .gz compression.

    :param input_file_path: File path
    :return: List of lines
    """
    if input_file_path.startswith("gs://"):
        hl.hadoop_copy(input_file_path, "file:///" + input_file_path.split("/")[-1])
        local_path = "/" + os.path.basename(input_file_path)
    else:
        local_path = input_file_path
    # open in text mode ('rt') so gzipped files also yield str lines, matching List[str]
    f = gzip.open(local_path, "rt") if local_path.endswith("gz") else open(local_path)
    output = []
    for line in f:
        output.append(line.strip())
    f.close()
    return output
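
# A minimal usage sketch for read_list_data; the bucket path below is hypothetical.
# A gs:// input is first localized with hl.hadoop_copy and read from the driver's
# local filesystem; a plain local path is opened directly, with .gz handled transparently.
sample_ids = read_list_data('gs://my-bucket/sample_ids.txt.gz')  # hypothetical path
print(f'Read {len(sample_ids)} lines')
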
def get_codings():
    """
    Read codings data from Duncan's repo and load into hail Table

    :return: Hail table with codings
    :rtype: Table
    """
    root = f'{tempfile.gettempdir()}/PHESANT'
    # subprocess.check_call returns 0 on success and raises CalledProcessError on
    # failure, so wrap it instead of testing its return value
    try:
        subprocess.check_call(['git', 'clone', 'https://github.com/astheeggeggs/PHESANT.git', root])
    except subprocess.CalledProcessError:
        raise Exception('Could not clone repo')
    hts = []
    coding_dir = f'{root}/WAS/codings'
    for coding_file in os.listdir(coding_dir):
        hl.hadoop_copy(f'file://{coding_dir}/{coding_file}', f'{coding_dir}/{coding_file}')
        ht = hl.import_table(f'{coding_dir}/{coding_file}')
        if 'node_id' not in ht.row:
            ht = ht.annotate(node_id=hl.null(hl.tstr),
                             parent_id=hl.null(hl.tstr),
                             selectable=hl.null(hl.tstr))
        ht = ht.annotate(coding_id=hl.int(coding_file.split('.')[0].replace('coding', '')))
        hts.append(ht)
    full_ht = hts[0].union(*hts[1:]).key_by('coding_id', 'coding')
    return full_ht.repartition(10)
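
# Usage sketch: the returned table is keyed by ('coding_id', 'coding'), so it can be
# joined onto any table carrying those two fields. 'pheno_ht' and the 'meaning' column
# (the usual UKB coding-file column) are assumptions here, not part of the original.
codings_ht = get_codings()
pheno_ht = pheno_ht.annotate(meaning=codings_ht[pheno_ht.coding_id, pheno_ht.coding].meaning)
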
def get_all_codings():
    """
    Download all coding data files from UKB website
    """
    import requests
    coding_prefix = '/tmp/coding'
    all_codings = requests.post(url='http://biobank.ndph.ox.ac.uk/showcase/scdown.cgi',
                                data={'fmt': 'txt', 'id': 2})
    all_codings = all_codings.text.strip().split('\n')[1:]
    hts = []
    for coding_list in all_codings:
        coding = coding_list.split('\t')[0]
        r = requests.post(url='http://biobank.ndph.ox.ac.uk/showcase/codown.cgi',
                          data={'id': coding})
        req_data = r.text
        if r.status_code != 200 or not req_data or req_data.startswith('<!DOCTYPE HTML>'):
            print(f'Issue with {coding}: {r.text}')
            continue
        with open(f'{coding_prefix}{coding}.tsv', 'w') as f:
            f.write(req_data)
        hl.hadoop_copy(f'file://{coding_prefix}{coding}.tsv', f'{coding_prefix}{coding}.tsv')
        ht = hl.import_table(f'{coding_prefix}{coding}.tsv')
        if 'node_id' not in ht.row:
            ht = ht.annotate(node_id=hl.null(hl.tstr),
                             parent_id=hl.null(hl.tstr),
                             selectable=hl.null(hl.tstr))
        ht = ht.annotate(coding_id=hl.int(coding))
        hts.append(ht)
    full_ht = hts[0].union(*hts[1:]).key_by('coding_id', 'coding')
    return full_ht.repartition(10)
def pre_process_data_dictionary(pheno_description_raw_path, pheno_description_path):
    """
    Convert Data_Dictionary_Showcase.csv to tsv to enable load into hail

    :param str pheno_description_raw_path: Input file
    :param str pheno_description_path: Parsed tsv file
    """
    local_pheno_description_path = '/tmp/Data_Dictionary_Showcase.csv'
    local_pheno_description_out_path = '/tmp/Data_Dictionary_Showcase.tsv'
    hl.hadoop_copy(pheno_description_raw_path, f'file://{local_pheno_description_path}')
    with open(local_pheno_description_path) as f, open(local_pheno_description_out_path, 'w') as g:
        reader = csv.reader(f)
        for line in reader:
            g.write('\t'.join(line) + '\n')
    hl.hadoop_copy(f'file://{local_pheno_description_out_path}', pheno_description_path)
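
# The csv.reader round-trip matters because the showcase CSV contains quoted fields with
# embedded commas; a naive line.replace(',', '\t') would split them. A toy illustration
# (the row below is made up in the showcase format, not real data):
import csv, io
row = next(csv.reader(io.StringIO('100261,"Acute reaction, prior"')))
assert row == ['100261', 'Acute reaction, prior']
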
def get_1kg(output_dir, overwrite: bool = False):
    """Download subset of the `1000 Genomes <http://www.internationalgenome.org/>`__
    dataset and sample annotations.

    Notes
    -----
    The download is about 15M.

    Parameters
    ----------
    output_dir
        Directory in which to write data.
    overwrite
        If ``True``, overwrite any existing files/directories at `output_dir`.
    """
    jhc = Env.hc()._jhc
    _mkdir(jhc, output_dir)

    matrix_table_path = os.path.join(output_dir, '1kg.mt')
    annotations_path = os.path.join(output_dir, '1kg_annotations.txt')
    if (overwrite
            or not Env.jutils().dirExists(jhc, matrix_table_path)
            or not Env.jutils().fileExists(jhc, annotations_path)):
        init_temp_dir()
        tmp_vcf = os.path.join(tmp_dir, '1kg.vcf.bgz')
        source = resources['1kg_matrix_table']
        info(f'downloading 1KG VCF ...\n'
             f'  Source: {source}')
        urlretrieve(source, tmp_vcf)
        cluster_readable_vcf = Env.jutils().copyToTmp(jhc, local_path_uri(tmp_vcf), 'vcf')
        info('importing VCF and writing to matrix table...')
        hl.import_vcf(cluster_readable_vcf, min_partitions=16).write(matrix_table_path, overwrite=True)

        tmp_annot = os.path.join(tmp_dir, '1kg_annotations.txt')
        source = resources['1kg_annotations']
        info(f'downloading 1KG annotations ...\n'
             f'  Source: {source}')
        urlretrieve(source, tmp_annot)
        hl.hadoop_copy(local_path_uri(tmp_annot), annotations_path)
        info('Done!')
    else:
        info('1KG files found')
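
# Typical follow-up, sketched after the Hail GWAS tutorial (column names such as
# 'Sample' are from the tutorial data and assumed here): read the written matrix
# table and key the annotations by sample for joining.
get_1kg('data/')
mt = hl.read_matrix_table('data/1kg.mt')
annotations = hl.import_table('data/1kg_annotations.txt', impute=True).key_by('Sample')
mt = mt.annotate_cols(pheno=annotations[mt.s])
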
def main(args):
    ss1, p1 = import_key(args.ss1, args.ss1_chr_pos_ref_alt_p)
    ss2, p2 = import_key(args.ss2, args.ss2_chr_pos_ref_alt_p)
    ss1 = ss1.annotate(ss2=ss2[ss1.key])
    x = (-hl.log10(ss1[p1])).collect()
    y = (-hl.log10(ss1.ss2[p2])).collect()

    fig, ax = plt.subplots()
    plt.xlabel(args.ss1_name)
    plt.ylabel(args.ss2_name)
    plt.title(args.trait)
    ax.scatter(x, y)
    lims = [
        np.min([ax.get_xlim(), ax.get_ylim()]),  # min of both axes
        np.max([ax.get_xlim(), ax.get_ylim()]),  # max of both axes
    ]
    ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)

    out_base = args.out.split('/')[-1]
    fig.savefig('/tmp/' + out_base)
    hl.hadoop_copy('file:///tmp/' + out_base, args.out)
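
# A plausible argparse wiring for main() above; the flag names simply mirror the
# attributes main() reads (argparse maps dashes to underscores) and are otherwise
# assumptions, not part of the original script.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--ss1', required=True, help='path to first sumstats file')
    parser.add_argument('--ss2', required=True, help='path to second sumstats file')
    parser.add_argument('--ss1-chr-pos-ref-alt-p', required=True)
    parser.add_argument('--ss2-chr-pos-ref-alt-p', required=True)
    parser.add_argument('--ss1-name', default='ss1')
    parser.add_argument('--ss2-name', default='ss2')
    parser.add_argument('--trait', default='')
    parser.add_argument('--out', required=True, help='output path, e.g. a gs:// PNG path')
    main(parser.parse_args())
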
def get_hgdp(output_dir, overwrite: bool = False):
    """Download subset of the `Human Genome Diversity Panel
    <https://www.internationalgenome.org/data-portal/data-collection/hgdp/>`__
    dataset and sample annotations.

    Notes
    -----
    The download is about 30MB.

    Parameters
    ----------
    output_dir
        Directory in which to write data.
    overwrite
        If ``True``, overwrite any existing files/directories at `output_dir`.
    """
    fs = Env.fs()
    if not _dir_exists(fs, output_dir):
        fs.mkdir(output_dir)

    matrix_table_path = os.path.join(output_dir, 'HGDP.mt')
    vcf_path = os.path.join(output_dir, 'HGDP.vcf.bgz')
    sample_annotations_path = os.path.join(output_dir, 'HGDP_annotations.txt')
    gene_annotations_path = os.path.join(output_dir, 'ensembl_gene_annotations.txt')
    if (overwrite
            or not _dir_exists(fs, matrix_table_path)
            or not _file_exists(fs, sample_annotations_path)
            or not _file_exists(fs, vcf_path)
            or not _file_exists(fs, gene_annotations_path)):
        init_temp_dir()
        tmp_vcf = os.path.join(tmp_dir, 'HGDP.vcf.bgz')
        source = resources['HGDP_matrix_table']
        info(f'downloading HGDP VCF ...\n'
             f'  Source: {source}')
        sync_retry_transient_errors(urlretrieve, source, tmp_vcf)
        cluster_readable_vcf = _copy_to_tmp(fs, local_path_uri(tmp_vcf), extension='vcf.bgz')
        info('importing VCF and writing to matrix table...')
        hl.import_vcf(cluster_readable_vcf, min_partitions=16, reference_genome='GRCh38').write(
            matrix_table_path, overwrite=True)

        tmp_sample_annot = os.path.join(tmp_dir, 'HGDP_annotations.txt')
        source = resources['HGDP_annotations']
        info(f'downloading HGDP annotations ...\n'
             f'  Source: {source}')
        sync_retry_transient_errors(urlretrieve, source, tmp_sample_annot)

        tmp_gene_annot = os.path.join(tmp_dir, 'ensembl_gene_annotations.txt')
        source = resources['HGDP_ensembl_gene_annotations']
        info(f'downloading Ensembl gene annotations ...\n'
             f'  Source: {source}')
        sync_retry_transient_errors(urlretrieve, source, tmp_gene_annot)

        hl.hadoop_copy(local_path_uri(tmp_sample_annot), sample_annotations_path)
        hl.hadoop_copy(local_path_uri(tmp_gene_annot), gene_annotations_path)
        hl.hadoop_copy(local_path_uri(tmp_vcf), vcf_path)
        info('Done!')
    else:
        info('HGDP files found')
def plot_svg_to_gcs(plot, filename):
    export_svgs(plot, filename)
    hl.hadoop_copy(filename, f'{OUTPUT_BUCKET}/{filename}')
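
# Usage sketch: export_svgs here is bokeh.io.export_svgs, which requires selenium plus
# a browser webdriver on the machine, and the plot's output backend set to 'svg'.
# OUTPUT_BUCKET is assumed to be a module-level gs:// prefix defined elsewhere.
from bokeh.plotting import figure
p = figure(title='demo')
p.line([1, 2, 3], [4, 5, 6])
p.output_backend = 'svg'
plot_svg_to_gcs(p, 'demo.svg')
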
def preimp_qc(input_type: str = None, dirname: str = None, basename: str = None,
              pre_geno_thresh: Union[int, float] = 0.95, mind_thresh: Union[int, float] = 0.98,
              fhet_aut: Union[int, float] = 0.2, fstat_x: Union[int, float] = 0.5,
              fstat_y: Union[int, float] = 0.5, geno_thresh: Union[int, float] = 0.98,
              cr_diff_thresh: Union[int, float] = 0.02, maf_thresh: Union[int, float] = 0.01,
              hwe_th_con_thresh: Union[int, float] = 1e-6, hwe_th_cas_thresh: Union[int, float] = 1e-10,
              hwe_th_all_thresh: Union[int, float] = 1e-06, annotations_file: str = None,
              report: bool = True, export_type: str = 'hail', out_dir: str = None,
              reference: str = 'GRCh38'):
    print('\nRunning QC')

    global mt, row_filters, filters, data_type, lambda_gc_pos, lambda_gc_pre, n_sig_var_pre, \
        n_sig_var_pos, man_table_results, remove_fields

    # create temp directory for storing temp files
    if not os.path.exists('gwaspy_tmp'):
        os.makedirs('gwaspy_tmp')
    gwaspy_dir = os.getcwd() + '/gwaspy_tmp'  # LaTeX needs the full path to files

    output_directory = out_dir if out_dir else dirname

    hl.init(default_reference=reference)

    # read input
    mt = read_infile(input_type=input_type, dirname=dirname, basename=basename,
                     annotations=annotations_file)

    if 'is_case' in mt.col:
        gwas_pre, n_sig_var_pre = manhattan(qqtitle='Pre-QC QQ Plot',
                                            mantitle='Pre-QC Manhattan Plot').filter(mt)
        qqplt_pre, lambda_gc_pre, manplt_pre = manhattan(qqtitle='Pre-QC QQ Plot',
                                                         mantitle='Pre-QC Manhattan Plot').plot(gwas_pre)
        qqplt_pre.savefig('gwaspy_tmp/gwaspy_qq_pre.png', dpi=300)
        manplt_pre.savefig('gwaspy_tmp/gwaspy_man_pre.png', dpi=300)

    mt = mt.annotate_rows(exclude_row=False)
    mt = mt.annotate_cols(exclude_col=False)

    mt, pre_qc_counts = summary_stats(mt)

    if 'is_case' in mt.col:
        if (pre_qc_counts['is_case_counts']['case'] > 0) and (pre_qc_counts['is_case_counts']['control'] == 0):
            data_type = 'Case-only'
        elif (pre_qc_counts['is_case_counts']['control'] > 0) and (pre_qc_counts['is_case_counts']['case'] == 0):
            data_type = 'Control-only'
        elif (pre_qc_counts['is_case_counts']['case'] > 0) and (pre_qc_counts['is_case_counts']['control'] > 0):
            data_type = 'Case-Control'
        else:
            data_type = 'Trio'
    else:
        data_type = 'no-pheno'

    # use chr-prefixed contig names if any are present (GRCh38-style), else plain names
    chroms = mt.aggregate_rows(hl.agg.collect_as_set(mt.locus.contig))
    if 'chrX' in chroms or 'chrY' in chroms or 'chrMT' in chroms:
        chromx, chromy, chrommt = 'chrX', 'chrY', 'chrMT'
    else:
        chromx, chromy, chrommt = 'X', 'Y', 'MT'

    # we need to compute call rate for chr1-23 and chrY separately since females have no chrY
    mt = mt.annotate_entries(
        geno_y_excluded=(hl.case()
                         .when(mt.locus.contig == chromy, False)
                         .default(True)),
        geno_y_only=(hl.case()
                     .when(mt.locus.contig == chromy, ~mt.is_female)  # chrY call rate uses males only
                     .default(False)))

    mt = pre_geno(pre_geno_cr=pre_geno_thresh).filter(mt)
    mt = id_call_rate(mind=mind_thresh, pre_row_filter='pre_geno').filter(mt)
    mt = fhet_autosomes(pre_row_filter='pre_geno', fhet_thresh=fhet_aut).filter(mt)
    mt = fhet_sex(pre_row_filter='pre_geno', fstat_x=fstat_x, fstat_y=fstat_y).filter(mt)
    mt = fhet_sex_warnings(pre_row_filter='pre_geno', pre_col_filter='sex_violations').filter(mt)

    mt = mt.annotate_cols(**{
        'id_pass': hl.struct(
            filters=(hl.agg.any(mt['mind'].filters) |
                     hl.agg.any(mt['fstat'].filters) |
                     hl.agg.any(mt['sex_violations'].filters)))})

    mt = geno(pre_row_filter='pre_geno', pre_col_filter='id_pass', geno_thresh=geno_thresh,
              data_type=data_type).filter(mt)
    mt = invariant(pre_col_filter='id_pass').filter(mt)

    # for HWE, markers in: (1) autosomes - include males+females; (2) chrX - include ONLY females;
    # (3) exclude chrY
    mt = mt.annotate_entries(
        hwe_aut=(hl.case()
                 .when(mt.locus.contig == chromx, False)
                 .when(mt.locus.contig == chromy, False)
                 .when(mt.locus.contig == chrommt, False)
                 .default(True)),
        hwe_sex=(hl.case()
                 .when(mt.locus.contig == chromx, mt.is_female)
                 .default(False)))

    if 'is_case' in mt.col:
        mt = call_rate_diff(pre_row_filter='geno', pre_col_filter='id_pass',
                            initial_row_filter='pre_geno', cr_thresh=cr_diff_thresh).filter(mt)

        # check if data is case-/control-only, case-control, or trio
        # (a) Case-Only
        if data_type == 'Case-only':
            print("\n" + data_type)
            mt = hwe_cas(pre_col_filter='id_pass', pre_row_filter='geno', hwe_th_ca=1e-6).filter(mt)
            row_filters = ['pre_geno', 'geno', 'cr_diff', 'monomorphic_var', 'hwe_cas']
            filters = ['pre_geno', 'mind', 'fstat', 'sex_violations', 'sex_warnings', 'geno',
                       'cr_diff', 'monomorphic_var', 'hwe_cas']
            remove_fields = ['cr', 'diff', 'hwe_cas_aut', 'hwe_cas_sex']
        # (b) Control-Only
        elif data_type == 'Control-only':
            print("\n" + data_type)
            mt = hwe_con(pre_col_filter='id_pass', pre_row_filter='geno', hwe_th_co=1e-6).filter(mt)
            row_filters = ['pre_geno', 'geno', 'cr_diff', 'monomorphic_var', 'hwe_con']
            filters = ['pre_geno', 'mind', 'fstat', 'sex_violations', 'sex_warnings', 'geno',
                       'cr_diff', 'monomorphic_var', 'hwe_con']
            remove_fields = ['cr', 'diff', 'hwe_con_aut', 'hwe_con_sex']
        # (c) Case-Control
        elif data_type == 'Case-Control':
            print("\n" + data_type)
            mt = hwe_cas(pre_col_filter='id_pass', pre_row_filter='geno',
                         hwe_th_ca=hwe_th_cas_thresh).filter(mt)
            mt = hwe_con(pre_col_filter='id_pass', pre_row_filter='geno',
                         hwe_th_co=hwe_th_con_thresh).filter(mt)
            row_filters = ['pre_geno', 'geno', 'cr_diff', 'monomorphic_var', 'hwe_con', 'hwe_cas']
            filters = ['pre_geno', 'mind', 'fstat', 'sex_violations', 'sex_warnings', 'geno',
                       'cr_diff', 'monomorphic_var', 'hwe_con', 'hwe_cas']
            remove_fields = ['cr', 'diff', 'hwe_cas_aut', 'hwe_cas_sex', 'hwe_con_aut', 'hwe_con_sex']
        else:
            # trio data
            print(data_type)
    else:
        print('Running HWE filters on whole dataset without splitting by phenotype status')
        # NB: uses a hard-coded threshold here rather than the hwe_th_all_thresh argument
        mt = hwe_all(pre_col_filter='id_pass', pre_row_filter='geno', hwe_th_all=1e-08).filter(mt)
        row_filters = ['pre_geno', 'geno', 'monomorphic_var', 'hwe_all']
        filters = ['pre_geno', 'mind', 'fstat', 'sex_violations', 'sex_warnings', 'geno',
                   'monomorphic_var', 'hwe_all']
        remove_fields = ['hwe_all_aut', 'hwe_all_sex']

    results = {}
    column_filters = ['mind', 'fstat', 'sex_violations', 'sex_warnings']

    mt.select_entries().select_rows(*row_filters).select_cols(*column_filters).write(
        f'{output_directory}/temp.mt', overwrite=True)
    mt_temp = hl.read_matrix_table(f'{output_directory}/temp.mt')
    column_aggregations = mt_temp.aggregate_cols(
        [hl.agg.counter(mt_temp[filt].filters) for filt in column_filters])
    row_aggregations = mt_temp.aggregate_rows(
        [hl.agg.counter(mt_temp[filt].filters) for filt in row_filters])

    # aggregate returns a frozendict; convert that back to a dict
    for filt, cont in zip(column_filters, column_aggregations):
        results[filt] = dict(cont)
    for filt, cont in zip(row_filters, row_aggregations):
        results[filt] = dict(cont)

    for i in filters:
        # some filters will have zero snps/ids filtered, so there won't be a True key; add it
        if True not in results[i]:
            results[i][True] = 0
        print(i, ':', results[i])

    if report:
        fstat_fig = fhet_sex(pre_row_filter='pre_geno', fstat_x=fstat_x, fstat_y=fstat_y,
                             figsize=(15, 20)).plot(mt)
        fstat_fig.savefig('gwaspy_tmp/gwaspy_fstat_fig.png', dpi=300)
        id_cr_plot = id_call_rate(mind=mind_thresh, pre_row_filter='pre_geno',
                                  data_type=data_type).plot(mt)
        var_cr_plot = geno(pre_row_filter='pre_geno', pre_col_filter='id_pass',
                           geno_thresh=geno_thresh, data_type=data_type).plot(mt)

        if 'is_case' in mt.col:
            if data_type == 'Case-only':
                id_cr_plot[0].savefig('gwaspy_tmp/gwaspy_id_cas_pre.png', dpi=300)
                var_cr_plot[0].savefig('gwaspy_tmp/gwaspy_var_cas_pre.png', dpi=300)
            if data_type == 'Control-only':
                id_cr_plot[0].savefig('gwaspy_tmp/gwaspy_id_con_pre.png', dpi=300)
                var_cr_plot[0].savefig('gwaspy_tmp/gwaspy_var_con_pre.png', dpi=300)
            if data_type == 'Case-Control':
                id_cr_plot[0].savefig('gwaspy_tmp/gwaspy_id_con_pre.png', dpi=300)
                id_cr_plot[1].savefig('gwaspy_tmp/gwaspy_id_cas_pre.png', dpi=300)
                var_cr_plot[0].savefig('gwaspy_tmp/gwaspy_var_con_pre.png', dpi=300)
                var_cr_plot[1].savefig('gwaspy_tmp/gwaspy_var_cas_pre.png', dpi=300)
        else:
            id_cr_plot[0].savefig('gwaspy_tmp/gwaspy_id_cas_con_pre.png', dpi=300)
            var_cr_plot[0].savefig('gwaspy_tmp/gwaspy_var_cas_con_pre.png', dpi=300)

    # FILTER OUT ALL SNPs and IDs THAT FAIL QC
    column_filters = ['mind', 'fstat', 'sex_violations']
    for row in row_filters:
        mt = mt.filter_rows(mt[row].filters == True, keep=False)
    for col in column_filters:
        mt = mt.filter_cols(mt[col].filters == True, keep=False)

    mt_filtered, pos_qc_counts = summary_stats(mt)

    # drop entry fields we added as they will cause errors when exporting to VCF
    # e.g. Error summary: HailException: Invalid type for format field 'geno_y_excluded'. Found 'bool'.
    drop_fields = filters + ['geno_y_excluded', 'geno_y_only', 'pre_geno_noy', 'pre_geno_y',
                             'hwe_aut', 'hwe_sex', 'exclude_col', 'exclude_row', 'variant_qc',
                             'aaf', 'geno_noy', 'geno_y', 'sex_ambiguous', 'id_pass'] + remove_fields
    mt_filtered = mt_filtered.drop(*drop_fields)

    if 'is_case' in mt.col:
        gwas_pos, n_sig_var_pos = manhattan(qqtitle='Post-QC QQ Plot',
                                            mantitle='Post-QC Manhattan Plot').filter(mt)
        qqplt_pos, lambda_gc_pos, manplt_pos = manhattan(qqtitle='Post-QC QQ Plot',
                                                         mantitle='Post-QC Manhattan Plot').plot(gwas_pos)

        ncas_pre = pre_qc_counts['is_case_counts']['case']
        ncas_pos = pos_qc_counts['is_case_counts']['case']
        ncon_pre = pre_qc_counts['is_case_counts']['control']
        ncon_pos = pos_qc_counts['is_case_counts']['control']

        # rescale lambda GC to an equivalent study of 1000 cases and 1000 controls
        lambda_thous_pre = 1 + (lambda_gc_pre - 1) * (1 / ncas_pre + 1 / ncon_pre) / (1 / 1000 + 1 / 1000)
        lambda_thous_pos = 1 + (lambda_gc_pos - 1) * (1 / ncas_pos + 1 / ncon_pos) / (1 / 1000 + 1 / 1000)

        qqplt_pos.savefig('gwaspy_tmp/gwaspy_qq_pos.png', dpi=300)
        manplt_pos.savefig('gwaspy_tmp/gwaspy_man_pos.png', dpi=300)

        man_table_results = [n_sig_var_pre, n_sig_var_pos, lambda_gc_pre, lambda_gc_pos,
                             round(lambda_thous_pre, 3), round(lambda_thous_pos, 3)]

    # report
    if report:
        print('\nWriting report')
        doc = MyDocument(basename=basename)
        if 'is_case' in mt.col:
            doc.flags_table(pre_qc_counts=pre_qc_counts, pos_qc_counts=pos_qc_counts,
                            results=results, lambda_gc=lambda_gc_pos, sig_vars=n_sig_var_pos)
        else:
            doc.flags_table(pre_qc_counts=pre_qc_counts, pos_qc_counts=pos_qc_counts, results=results)
        doc.general_info(pre_qc_conts=pre_qc_counts, post_qc_conts=pos_qc_counts,
                         count_results=results, pre_filter=pre_geno_thresh, id_cr=mind_thresh,
                         fhet_thresh=fhet_aut, var_cr=geno_thresh, miss_diff=cr_diff_thresh,
                         hwe_con=hwe_th_con_thresh, hwe_cas=hwe_th_cas_thresh,
                         hwe_all=hwe_th_all_thresh, data_type=data_type)
        if 'is_case' in mt.col:
            doc.manhattan_sec(qq_pre_path=f'{gwaspy_dir}/gwaspy_qq_pre.png',
                              qq_pos_path=f'{gwaspy_dir}/gwaspy_qq_pos.png',
                              man_pre_path=f'{gwaspy_dir}/gwaspy_man_pre.png',
                              man_pos_path=f'{gwaspy_dir}/gwaspy_man_pos.png',
                              table_results=man_table_results)
        if data_type == 'Case-only':
            doc.individual_char(id_con_pre_path='nothing here',
                                id_cas_pre_path=f'{gwaspy_dir}/gwaspy_id_cas_pre.png',
                                fstat_fig_path=f'{gwaspy_dir}/gwaspy_fstat_fig.png',
                                id_all_path='nothing here', data_type=data_type)
            doc.snp_char(var_con_pre_path='nothing here',
                         var_cas_pre_path=f'{gwaspy_dir}/gwaspy_var_cas_pre.png',
                         var_all_path='nothing here', data_type=data_type)
        if data_type == 'Control-only':
            doc.individual_char(id_con_pre_path=f'{gwaspy_dir}/gwaspy_id_con_pre.png',
                                id_cas_pre_path='nothing here',
                                fstat_fig_path=f'{gwaspy_dir}/gwaspy_fstat_fig.png',
                                id_all_path='nothing here', data_type=data_type)
            doc.snp_char(var_con_pre_path=f'{gwaspy_dir}/gwaspy_var_con_pre.png',
                         var_cas_pre_path='nothing here',
                         var_all_path='nothing here', data_type=data_type)
        if data_type == 'Case-Control':
            doc.individual_char(id_con_pre_path=f'{gwaspy_dir}/gwaspy_id_con_pre.png',
                                id_cas_pre_path=f'{gwaspy_dir}/gwaspy_id_cas_pre.png',
                                fstat_fig_path=f'{gwaspy_dir}/gwaspy_fstat_fig.png',
                                id_all_path='nothing here', data_type=data_type)
            doc.snp_char(var_con_pre_path=f'{gwaspy_dir}/gwaspy_var_con_pre.png',
                         var_cas_pre_path=f'{gwaspy_dir}/gwaspy_var_cas_pre.png',
                         var_all_path='nothing here', data_type=data_type)
        if data_type == 'no-pheno':
            doc.individual_char(id_con_pre_path='nothing here', id_cas_pre_path='nothing here',
                                fstat_fig_path=f'{gwaspy_dir}/gwaspy_fstat_fig.png',
                                id_all_path=f'{gwaspy_dir}/gwaspy_id_cas_con_pre.png',
                                data_type=data_type)
            doc.snp_char(var_con_pre_path='nothing here', var_cas_pre_path='nothing here',
                         var_all_path=f'{gwaspy_dir}/gwaspy_var_cas_con_pre.png',
                         data_type=data_type)
        doc.generate_pdf(f'{gwaspy_dir}/{basename}.preimp_qc.report', clean=True, clean_tex=True)

    print('\nExporting qced file')
    if export_type:
        from gwaspy.utils.export_file import export_qced_file
        export_qced_file(mt=mt_filtered, out_dir=output_directory, basename=basename,
                         export_type=export_type)

    if output_directory.startswith('gs://'):
        hl.hadoop_copy(f'file://{gwaspy_dir}/{basename}.preimp_qc.report.pdf',
                       f'{output_directory}GWASpy/Preimp_QC/{basename}.preimp_qc.report.pdf')
    else:
        shutil.copyfile(f'{gwaspy_dir}/{basename}.preimp_qc.report.pdf',
                        f'{output_directory}{basename}.preimp_qc.report.pdf')

    # clean-up
    print('\nCleaning up')
    shutil.rmtree('gwaspy_tmp')

    print("\nDone running QC!")
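
# A usage sketch for preimp_qc, assuming hypothetical PLINK input files
# mydata.{bed,bim,fam} in gs://my-bucket/; all thresholds left at their defaults.
preimp_qc(input_type='plink',
          dirname='gs://my-bucket/',
          basename='mydata',
          out_dir='gs://my-bucket/qc/',
          export_type='plink')
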
print('\nSUCCESSFULLY CONVERTED GRM TO NUMPY!\n')

# Compute eigenvalue decomposition
print('\nCOMPUTE EIGENVALUES...\n')
time_start = time.time()
# eigenvals = scipy.linalg.eigvalsh(np_grm)
eigenvals = np.linalg.eigvalsh(np_grm)
time_end = time.time()
eigenval_time = time_end - time_start
print(eigenval_time)
print('\nSUCCESSFULLY COMPUTED EIGENVALUES!\n')

# Save eigenvalues: once locally, and once to /tmp for copying to GCS
np.save("eigenvals_" + str(pct) + "pct_" + str(nsnps) + "_x_" + str(nindv) + ".npy", eigenvals)
np.save("/tmp/eigenvals_" + str(nsnps) + "_x_" + str(nindv) + ".npy", eigenvals)
hl.hadoop_copy("file:///tmp/eigenvals_" + str(nsnps) + "_x_" + str(nindv) + ".npy",
               "gs://ukb-gt/" + str(pct) + "pct/eigenvals_" + str(nsnps) + "_x_" + str(nindv) + "meta0.npy")
print('\nSAVED EIGENVALUES!\n')

# Plot the empirical spectrum against the Marchenko-Pastur density
lmda = nindv / nsnps
lmdap = (1 + np.sqrt(lmda)) ** 2
lmdam = (1 - np.sqrt(lmda)) ** 2
x = np.arange(lmdam, lmdap, 0.001)
y = (1 / (2 * math.pi)) * np.sqrt((lmdap - x) * (x - lmdam)) / (lmda * x)
plt.clf()
plt.hist(eigenvals[1:], bins=1000, density=True)
plt.plot(x, y, '-b', label='Marchenko-Pastur Distribution')
plt.title('Eigenvalues for ' + str(nindv) + ' Individuals and ' + str(nsnps) + ' SNPs')
plt.xlabel('Eigenvalues')
plt.ylabel('Density')
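
# The fragment stops before writing the figure out; a sketch completing it in the same
# local-save-then-hadoop_copy style as the rest of the script (the output filenames
# below are assumptions, not from the original):
plt.legend()
plt.savefig('/tmp/eigenvals_' + str(nsnps) + '_x_' + str(nindv) + '.png', dpi=300)
hl.hadoop_copy('file:///tmp/eigenvals_' + str(nsnps) + '_x_' + str(nindv) + '.png',
               'gs://ukb-gt/' + str(pct) + 'pct/eigenvals_' + str(nsnps) + '_x_' + str(nindv) + '.png')
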
def main(args):
    # set up tracking variables corresponding to each dataset
    sumstats = []
    p = []
    ss = args.ss.split(',')
    if args.clumps:
        clumps = args.clumps.split(',')
    else:
        clumps = [None] * len(ss)  # keeps the indexing below valid when no clump files are given
    ss_names = args.ss_names.split(',')
    chr_pos_ref_alt_p = args.chr_pos_ref_alt_p.split(';')

    # read in each set of sumstats
    for sumstat in range(len(ss)):
        ss_data, p_data = import_key(ss[sumstat], chr_pos_ref_alt_p[sumstat], clumps[sumstat])
        sumstats.append(ss_data)
        p.append(p_data)

    # join across datasets, filter to target region
    ss_joined = sumstats[0]
    for sumstat in range(1, len(ss)):
        annot_val = 'ss' + str(sumstat)
        ss_joined = ss_joined.annotate(**{annot_val: sumstats[sumstat][ss_joined.key]})
    ss_joined_filt = ss_joined.filter(hl.parse_locus_interval(args.region).contains(ss_joined.locus))
    ss_to_plot = ss_joined_filt.to_pandas()

    # set up figure and plot
    sns.set(font='Arial')
    sns.set_style('white')
    sns.despine()
    fig = plt.figure(figsize=(8, 8))
    fig.subplots_adjust(hspace=0.5)
    plt.tight_layout()
    spacing = 0.04
    fig.text(0.5, spacing, 'Position', ha='center')
    fig.text(spacing, 0.5, '-log10(p)', va='center', rotation='vertical')
    fig.text(0.5, 1 - spacing, args.trait, ha='center')

    for sumstat in range(len(ss)):
        ax = fig.add_subplot(len(ss), 1, sumstat + 1)
        # to_pandas flattens joined struct fields to dot-separated column names
        if sumstat > 0:
            current_p = 'ss' + str(sumstat) + '.' + p[sumstat]
            pos = 'ss' + str(sumstat) + '.' + chr_pos_ref_alt_p[sumstat].split(',')[1]
        else:
            current_p = p[sumstat]
            pos = chr_pos_ref_alt_p[sumstat].split(',')[1]
        ss_current_plot = ss_to_plot.filter(items=[pos, current_p, 'clump'], axis=1)
        ss_current_plot.dropna(inplace=True)
        sns.scatterplot(x=ss_current_plot[pos], y=-np.log10(ss_current_plot[current_p]),
                        hue=ss_current_plot.clump, linewidth=0, ax=ax)
        plt.xlabel("")
        plt.ylabel("")
        plt.title(ss_names[sumstat])

    # write plot out
    out_base = args.out.split('/')[-1]
    fig.savefig('/tmp/' + out_base)
    hl.hadoop_copy('file:///tmp/' + out_base, args.out)
def run_pca_normal(dirname: str = None, basename: str = None, input_type: str = None,
                   reference: str = 'GRCh38', maf: float = 0.05, hwe: float = 1e-3,
                   call_rate: float = 0.98, ld_cor: float = 0.2, ld_window: int = 250000,
                   n_pcs: int = 20, relatedness_method: str = 'pc_relate',
                   relatedness_thresh: float = 0.98, out_dir: str = None):
    print('\nReading mt')
    if reference.lower() == 'grch37':
        lifted_over = f'{dirname}{basename}.liftover.grch38.mt'
        if not hl.hadoop_exists(lifted_over):
            from gwaspy.utils.reference_liftover import liftover_to_grch38
            mt = liftover_to_grch38(dirname=dirname, basename=basename, input_type=input_type)
        else:
            print(f'\nFound lifted-over file: {lifted_over}')
            mt = hl.read_matrix_table(lifted_over)
    else:
        from gwaspy.utils.read_file import read_infile
        mt = read_infile(input_type=input_type, dirname=dirname, basename=basename)

    print('\nFiltering mt')
    mt = pca_filter_mt(in_mt=mt, maf=maf, hwe=hwe, call_rate=call_rate,
                       ld_cor=ld_cor, ld_window=ld_window)
    mt = relatedness_check(in_mt=mt, method=relatedness_method, outdir=out_dir,
                           kin_estimate=relatedness_thresh)

    pca_snps = mt.count_rows()
    if pca_snps > 1000000:
        import warnings
        warnings.warn(f'Too many SNPs to be used in PCA: {pca_snps}. This will make PCA run longer')

    print('\nRunning PCA')
    eigenvalues, pcs, _ = hl.hwe_normalized_pca(mt.GT, k=n_pcs)
    pcs_ht = pcs.transmute(**{f'PC{i}': pcs.scores[i - 1] for i in range(1, n_pcs + 1)})

    # add phenotype and sex to the output, using information from the mt;
    # first check if the is_case and is_female fields exist in the mt
    all_column_field_names = list(mt.col)
    # sex status is a MUST but not phenotype status
    if 'is_case' in all_column_field_names:
        ann_cols = ['is_case', 'is_female']
    else:
        ann_cols = ['is_female']
    annotations_ht = mt.cols().select(*ann_cols)

    if 'is_case' in all_column_field_names:
        pcs_ht = pcs_ht.annotate(is_case=annotations_ht[pcs_ht.s].is_case)
    pcs_ht = pcs_ht.annotate(is_female=annotations_ht[pcs_ht.s].is_female)

    print('\nSaving PC scores file')
    out_scores_file = f'{out_dir}GWASpy/PCA/pca_normal/{basename}.pca.normal.scores.tsv'
    pcs_ht.export(out_scores_file)

    print('\nGenerating PCA plots')
    pcs_scores = pd.read_table(out_scores_file, header=0, sep='\t')
    if 'is_case' in all_column_field_names:
        pcs_scores[['is_case']] = pcs_scores[['is_case']].replace(
            [True, False, None], ['case', 'control', 'unknown'])
    pcs_scores[['is_female']] = pcs_scores[['is_female']].replace(
        [True, False, None], ['female', 'male', 'unknown'])

    figs_dict = {}
    for col in ann_cols:
        for i in range(1, n_pcs, 2):
            xpc = f'PC{i}'
            ypc = f'PC{i + 1}'
            figs_dict["fig{}{}".format(col, i)] = plot_pca(pcs_scores, xpc, ypc, col)

    pdf = PdfPages('/tmp/pca.no.ref.plots.pdf')
    for figname, figure in figs_dict.items():
        pdf.savefig(figure)
    pdf.close()
    hl.hadoop_copy('file:///tmp/pca.no.ref.plots.pdf',
                   f'{out_dir}GWASpy/PCA/pca_normal/{basename}.pca.no.ref.plots.pdf')
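
# Usage sketch (hypothetical paths; note the trailing slashes matter, since dirname
# and out_dir are concatenated with basenames directly rather than os.path.join'd):
run_pca_normal(dirname='gs://my-bucket/',
               basename='mydata',
               input_type='hail',
               reference='GRCh38',
               out_dir='gs://my-bucket/')
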
# gwas = hl.linear_regression_rows(y=[[mt_filtered.sample_qc_and_phenotype.wbc_gwas_normalised, ...], [family2...]],
print("Linear regression CHECKPOINT")
# TIM NOTE: checkpoint here to prevent multiple execution (write to a file, read that file)
gwas = gwas.checkpoint(f"{BUCKET}/gwas/{CHROMOSOME}-gwasfbc-checkpoint", overwrite=True)
# gwas = gwas.checkpoint(f"{tmp_dir}/gwas_wbc_chr19_checkpoint.mt")

print("Linear regression output table")
gwas.export(f"{BUCKET}/gwas/gwas-{CHROMOSOME}-export.tsv.bgz", header=True)

print("Plotting")
for i in range(0, 36):
    print(f"Plotting {i}:{covariates[i]}")
    p = hl.plot.manhattan(gwas.p_value[i],
                          title=f"Interval WGS GWAS Manhattan Plot: {covariates[i]}")
    output_file(f"{i}.WGS-manhattan-{covariates[i]}.html")
    save(p)
    hl.hadoop_copy(f"{i}.WGS-manhattan-{covariates[i]}.html", f"{BUCKET}/gwas/plots/")
    # p = hl.plot.qq(gwas.p_value[i], title=f"Interval WGS GWAS QQ Plot: {covariates[i]}")
    # export_png(p, f"{BUCKET}/output-tables/wgs-qq-{covariates[i]}.html")
    # output_file(f"{i}.WGS-qq-{covariates[i]}.html")
    # save(p)
    # hl.hadoop_copy(f"{i}.WGS-qq-{covariates[i]}.html", f"{BUCKET}/gwas/plots/")

mt = mt.checkpoint(f"{BUCKET}/matrixtables/{CHROMOSOME}/{CHROMOSOME}-sampleqc-variantqc-filtered-FINAL.mt",
                   overwrite=True)
def run_pca_project(
        ref_dirname: str = 'gs://hgdp-1kg/hgdp_tgp/datasets_for_others/lindo/ds_without_outliers/',
        ref_basename: str = 'unrelated',
        ref_info: str = 'gs://hgdp-1kg/hgdp_tgp/gwaspy_pca_ref/hgdp_1kg_sample_info.unrelateds.pca_outliers_removed.with_project.tsv',
        data_dirname: str = None,
        data_basename: str = None,
        out_dir: str = None,
        input_type: str = None,
        reference: str = 'GRCh38',
        npcs: int = 20,
        maf: float = 0.05,
        hwe: float = 1e-3,
        call_rate: float = 0.98,
        ld_cor: float = 0.2,
        ld_window: int = 250000,
        relatedness_method: str = 'pc_relate',
        relatedness_thresh: float = 0.98,
        prob_threshold: float = 0.8):
    """
    Project samples into predefined PCA space

    :param ref_dirname: directory name where reference data is
    :param ref_basename: base filename for reference data
    :param ref_info: reference sample information
    :param data_dirname: directory name where the data to project is
    :param data_basename: base filename for the data to project
    :param out_dir: directory and filename prefix for where to put PCA projection output
    :param input_type: input file(s) type: hail, plink, or vcf
    :param reference: reference build
    :param npcs: number of principal components to be used in PCA
    :param maf: minor allele frequency threshold
    :param hwe: Hardy-Weinberg filter threshold
    :param call_rate: variant call rate filter threshold
    :param ld_cor: LD correlation threshold for pruning
    :param ld_window: LD pruning window size
    :param relatedness_method: method to use for relatedness filtering
    :param relatedness_thresh: threshold to use for filtering out related individuals
    :param prob_threshold: probability threshold to use for classifying samples
    :return: a pandas Dataframe with data PCA scores projected on the same PCA space using the
        Human Genome Diversity Project (HGDP) and 1000 Genomes reference
    """
    print('\nReading data mt')
    if reference.lower() == 'grch37':
        lifted_over = f'{data_dirname}{data_basename}.liftover.grch38.mt'
        if not hl.hadoop_exists(lifted_over):
            from gwaspy.utils.reference_liftover import liftover_to_grch38
            mt = liftover_to_grch38(dirname=data_dirname, basename=data_basename,
                                    input_type=input_type)
        else:
            print(f'\nFound lifted-over file: {lifted_over}')
            mt = hl.read_matrix_table(lifted_over)
    else:
        from gwaspy.utils.read_file import read_infile
        mt = read_infile(input_type=input_type, dirname=data_dirname, basename=data_basename)

    print('\nFiltering data mt')
    mt = pca_filter_mt(in_mt=mt, maf=maf, hwe=hwe, call_rate=call_rate,
                       ld_cor=ld_cor, ld_window=ld_window)
    mt = relatedness_check(in_mt=mt, method=relatedness_method, outdir=out_dir,
                           kin_estimate=relatedness_thresh)

    # Intersect data with reference
    intersect_ref(ref_dirname=ref_dirname, ref_basename=ref_basename, data_mt=mt,
                  data_basename=data_basename, out_dir=out_dir)
    ref_in_data = hl.read_matrix_table(
        f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/1kg_hgdp_intersect_{data_basename}.mt')

    print('\nComputing reference PCs')
    run_ref_pca(mt=ref_in_data, npcs=npcs, out_dir=out_dir, data_basename=data_basename)

    # project data
    pca_loadings = hl.read_table(
        f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/1kg_hgdp_loadings.ht')
    project_mt = hl.read_matrix_table(
        f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/{data_basename}_intersect_1kg_hgdp.mt')

    ht_projections = pc_project(mt=project_mt, loadings_ht=pca_loadings)
    ht_projections = ht_projections.transmute(
        **{f'PC{i}': ht_projections.scores[i - 1] for i in range(1, npcs + 1)})
    ht_projections.export(
        f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/{data_basename}.project.pca.scores.tsv')

    ref_scores = f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/1kg_hgdp.project.pca.scores.txt.bgz'
    data_scores = f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/{data_basename}.project.pca.scores.tsv'
    data_ref = merge_data_with_ref(ref_scores=ref_scores, ref_info=ref_info, data_scores=data_scores)

    from gwaspy.pca.assign_pop_labels import assign_population_pcs
    pcs_df, clf = assign_population_pcs(pop_pc_pd=data_ref, num_pcs=npcs, min_prob=prob_threshold)

    data_pops = pcs_df.loc[pcs_df['SuperPop'].isnull()]
    print(data_pops['pop'].value_counts())  # show how many samples were assigned to each population
    cols = (['s', 'pop']
            + [f'prob_{i}' for i in ["AFR", "AMR", "CSA", "EAS", "EUR", "MID", "OCE"]]
            + [f'PC{i}' for i in range(1, npcs + 1)])
    data_pops_df = data_pops[cols]
    data_pops_df.to_csv(
        f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/pca_sup_pops_{prob_threshold}_probs.project.pca.txt',
        sep='\t', index=False)

    print("\nGenerating PCA plots")
    data_scores_prob = f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/pca_sup_pops_{prob_threshold}_probs.project.pca.txt'

    figs_dict = {}
    # plotting more than 10 PCA plots in HTML generates wobbly, large files
    for i in range(1, 10, 2):
        xpc = f'PC{i}'
        ypc = f'PC{i + 1}'
        figs_dict["fig{}{}".format(xpc, ypc)] = plot_pca_ref(data_scores=data_scores_prob,
                                                             ref_scores=ref_scores,
                                                             ref_info=ref_info,
                                                             x_pc=xpc, y_pc=ypc)
    # open in 'w' mode so reruns don't append duplicate plots to a leftover file
    with open('/tmp/pca.project.plots.html', 'w') as f:
        for figname, figure in figs_dict.items():
            f.write(figure.to_html(include_plotlyjs='cdn'))
    hl.hadoop_copy('file:///tmp/pca.project.plots.html',
                   f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/{data_basename}.pca.project.plots.html')
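
# Usage sketch for run_pca_project (hypothetical paths; the reference defaults point at
# the public HGDP+1kG resources hard-coded in the signature above):
run_pca_project(data_dirname='gs://my-bucket/',
                data_basename='mydata',
                input_type='hail',
                out_dir='gs://my-bucket/',
                prob_threshold=0.8)
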