def export_ldscore(ht, pop):
    hm3_snps = hl.read_table(get_hm3_snplist_path(pop))

    ht = ht.select(CHR=ht.locus.contig,
                   SNP=hl.variant_str(ht.locus, ht.alleles),
                   RSID=ht.rsid,
                   BP=ht.locus.position,
                   L2=ht.ld_score,
                   MAF=0.5 - hl.abs(0.5 - ht.AF))
    count = ht.aggregate(hl.struct(M=hl.agg.count(),
                                   M_5_50=hl.agg.sum(ht.MAF > 0.05)))
    ht = ht.filter(hl.is_defined(hm3_snps[ht.locus, ht.alleles]))
    ht = ht.key_by().drop('locus', 'alleles', 'MAF')

    with hadoop_open(get_ld_score_flat_file_path(pop, extension='M'), 'w') as f:
        f.write(f'{count.M}\n')
    with hadoop_open(get_ld_score_flat_file_path(pop, extension='M_5_50'), 'w') as f:
        f.write(f'{count.M_5_50}\n')

    # LD score with variant ids
    ht.drop('RSID').export(get_ld_score_flat_file_path(pop))
    # with rsids
    ht.transmute(SNP=ht.RSID).export(get_ld_score_flat_file_path(pop, rsid=True))
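
# Illustrative driver (not part of the original module): `ld_score_ht_path` is
# an assumed helper mapping a population code to the path of its LD score Hail
# Table, and the population codes passed in `pops` are examples only.
def export_all_ldscores(pops, ld_score_ht_path):
    for pop in pops:
        ht = hl.read_table(ld_score_ht_path(pop))
        export_ldscore(ht, pop)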
def load_bin(ns, path):
    m = np.zeros((ns, ns))
    with utils.hadoop_open(path, 'rb') as f:
        for i in range(ns):
            for j in range(i + 1):
                b = f.read(4)
                self.assertEqual(len(b), 4)
                m[i, j] = unpack('<f', bytearray(b))[0]
        left = f.read()
        self.assertEqual(len(left), 0)
    return m
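
# The binary layout read by load_bin above is the lower triangle of the matrix,
# written row by row as consecutive little-endian float32 values. For reference,
# a file in that layout could be produced with a sketch like this (`write_bin`
# is illustrative and not part of the original tests):
def write_bin(m, path):
    from struct import pack
    with utils.hadoop_open(path, 'wb') as f:
        for i in range(m.shape[0]):
            for j in range(i + 1):
                f.write(pack('<f', float(m[i, j])))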
def test_hadoop_is_file(self, prefix: Optional[str] = None):
    if prefix is None:
        prefix = self.remote_tmpdir
    a_file = f'{prefix}/test_hadoop_is_file.txt'
    with hadoop_open(a_file, 'w') as f:
        f.write("HELLO WORLD")
    self.assertTrue(hl.hadoop_is_file(a_file))
    self.assertFalse(hl.hadoop_is_file(f'{prefix}/'))
    self.assertFalse(hl.hadoop_is_file(f'{prefix}/invalid-path'))
def test_hadoop_exists(self, prefix: Optional[str] = None):
    if prefix is None:
        prefix = self.remote_tmpdir
    with hadoop_open(f'{prefix}/test_exists.txt', 'w') as f:
        f.write("HELLO WORLD")
    r_exists = f'{prefix}/test_exists.txt'
    r_not_exists = f'{prefix}/not_exists.txt'
    self.assertTrue(hl.hadoop_exists(r_exists))
    self.assertFalse(hl.hadoop_exists(r_not_exists))
def test_hadoop_exists(self, bucket=None):
    if bucket is None:
        bucket = self.remote_bucket
    with hadoop_open(f'{bucket}/test_exists.txt', 'w') as f:
        f.write("HELLO WORLD")
    r_exists = f'{bucket}/test_exists.txt'
    r_not_exists = f'{bucket}/not_exists.txt'
    self.assertTrue(hl.hadoop_exists(r_exists))
    self.assertFalse(hl.hadoop_exists(r_not_exists))
def test_hadoop_is_file(self, bucket=None):
    if bucket is None:
        bucket = self.remote_bucket
    a_file = f'{bucket}/test_hadoop_is_file.txt'
    with hadoop_open(a_file, 'w') as f:
        f.write("HELLO WORLD")
    self.assertTrue(hl.hadoop_is_file(a_file))
    self.assertFalse(hl.hadoop_is_file(f'{bucket}/'))
    self.assertFalse(hl.hadoop_is_file(f'{bucket}/invalid-path'))
def load_grm(ns, nv, path):
    m = np.zeros((ns, ns))
    with utils.hadoop_open(path) as f:
        i = 0
        for l in f:
            row = l.strip().split('\t')
            self.assertEqual(int(row[2]), nv)
            m[int(row[0]) - 1, int(row[1]) - 1] = float(row[3])
            i += 1
        self.assertEqual(i, ns * (ns + 1) / 2)
    return m
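
# load_grm above parses a tab-delimited text GRM: one line per lower-triangular
# entry, containing a 1-based row index, a 1-based column index, the variant
# count, and the GRM value. A matching file could be produced with a sketch
# like this (`write_grm` is illustrative and not part of the original tests):
def write_grm(m, nv, path):
    with utils.hadoop_open(path, 'w') as f:
        for i in range(m.shape[0]):
            for j in range(i + 1):
                f.write(f'{i + 1}\t{j + 1}\t{nv}\t{m[i, j]}\n')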
def test_hadoop_methods(self, bucket=None):
    if bucket is None:
        bucket = self.remote_bucket

    data = ['foo', 'bar', 'baz']
    data.extend(map(str, range(100)))

    with hadoop_open(f'{bucket}/test_out.txt', 'w') as f:
        for d in data:
            f.write(d)
            f.write('\n')

    with hadoop_open(f'{bucket}/test_out.txt') as f:
        data2 = [line.strip() for line in f]

    self.assertEqual(data, data2)

    with hadoop_open(f'{bucket}/test_out.txt.gz', 'w') as f:
        for d in data:
            f.write(d)
            f.write('\n')

    with hadoop_open(f'{bucket}/test_out.txt.gz') as f:
        data3 = [line.strip() for line in f]

    self.assertEqual(data, data3)

    hadoop_copy(f'{bucket}/test_out.txt.gz', f'{bucket}/test_out.copy.txt.gz')

    with hadoop_open(f'{bucket}/test_out.copy.txt.gz') as f:
        data4 = [line.strip() for line in f]

    self.assertEqual(data, data4)

    local_fs = HadoopFS()
    with local_fs.open(resource('randomBytes'), buffer_size=100) as f:
        with hadoop_open(f'{bucket}/randomBytesOut', 'w', buffer_size=2**18) as out:
            b = f.read()
            out.write(b)

    with hadoop_open(f'{bucket}/randomBytesOut', buffer_size=2**18) as f:
        b2 = f.read()

    self.assertEqual(b, b2)
def test_hadoop_methods(self, prefix: Optional[str] = None):
    if prefix is None:
        prefix = self.remote_tmpdir

    data = ['foo', 'bar', 'baz']
    data.extend(map(str, range(100)))

    with hadoop_open(f'{prefix}/test_out.txt', 'w') as f:
        for d in data:
            f.write(d)
            f.write('\n')

    with hadoop_open(f'{prefix}/test_out.txt') as f:
        data2 = [line.strip() for line in f]

    self.assertEqual(data, data2)

    with hadoop_open(f'{prefix}/test_out.txt.gz', 'w') as f:
        for d in data:
            f.write(d)
            f.write('\n')

    with hadoop_open(f'{prefix}/test_out.txt.gz') as f:
        data3 = [line.strip() for line in f]

    self.assertEqual(data, data3)

    hadoop_copy(f'{prefix}/test_out.txt.gz', f'{prefix}/test_out.copy.txt.gz')

    with hadoop_open(f'{prefix}/test_out.copy.txt.gz') as f:
        data4 = [line.strip() for line in f]

    self.assertEqual(data, data4)

    local_fs = LocalFS()
    with local_fs.open(os.path.join(self.local_dir, 'randomBytes'), 'rb', buffer_size=100) as f:
        with hadoop_open(f'{prefix}/randomBytesOut', 'wb', buffer_size=2**18) as out:
            b = f.read()
            out.write(b)

    with hadoop_open(f'{prefix}/randomBytesOut', 'rb', buffer_size=2**18) as f:
        b2 = f.read()

    self.assertEqual(b, b2)
def test_hadoop_stat(self, prefix: Optional[str] = None):
    if prefix is None:
        prefix = self.remote_tmpdir

    data = ['foo', 'bar', 'baz']
    data.extend(map(str, range(100)))

    with hadoop_open(f'{prefix}/test_hadoop_stat.txt.gz', 'w') as f:
        for d in data:
            f.write(d)
            f.write('\n')

    stat1 = hl.hadoop_stat(f'{prefix}')
    self.assertEqual(stat1['is_dir'], True)

    hadoop_copy(f'{prefix}/test_hadoop_stat.txt.gz',
                f'{prefix}/test_hadoop_stat.copy.txt.gz')

    stat2 = hl.hadoop_stat(f'{prefix}/test_hadoop_stat.copy.txt.gz')
    # The gzip format permits metadata which makes the compressed file's size
    # unpredictable. In practice, Hadoop creates a 175 byte file and
    # gzip.GzipFile creates a 202 byte file. The 27 extra bytes appear to
    # include at least the filename (20 bytes) and a modification timestamp.
    assert stat2['size_bytes'] == 175 or stat2['size_bytes'] == 202
    self.assertEqual(stat2['is_dir'], False)
    self.assertTrue('path' in stat2)
def main(args):
    hl.init(log='/assign_phecodes.log')

    # Read in the Phecode (v1.2b1) <-> ICD 9/10 codes mapping
    with hadoop_open('gs://ukb-diverse-pops/phecode/UKB_Phecode_v1.2b1_ICD_Mapping.txt', 'r') as f:
        df = pd.read_csv(f, delimiter='\t', dtype=str)
    list_of_icd_codes_to_include = [row.icd_codes.split(',') for _, row in df.iterrows()]
    list_of_phecodes_to_exclude = [row.exclude_phecodes.split(',') for _, row in df.iterrows()]
    df['icd_codes'] = list_of_icd_codes_to_include
    df['exclude_phecodes'] = list_of_phecodes_to_exclude

    # Convert it to a Hail Table
    phecode_ht = hl.Table.from_pandas(df)
    phecode_ht = phecode_ht.key_by('icd_codes')
    phecode_ht = phecode_ht.checkpoint(
        'gs://ukb-diverse-pops/phecode/UKB_Phecode_v1.2b1_ICD_Mapping.ht',
        overwrite=args.overwrite)

    # Retrieve UKB ICD MatrixTable and combine codes based on Phecode definitions
    icd_all = hl.read_matrix_table(get_ukb_pheno_mt_path('icd_all'))
    mt = combine_phenotypes(icd_all,
                            icd_all.icd_code,
                            icd_all.any_codes,
                            list_of_icd_codes_to_include,
                            new_col_name='icd_codes',
                            new_entry_name='include_to_cases')
    mt = mt.annotate_cols(
        phecode=phecode_ht[mt.icd_codes].phecode,
        phecode_sex=phecode_ht[mt.icd_codes].sex,
        phecode_description=phecode_ht[mt.icd_codes].description,
        phecode_group=phecode_ht[mt.icd_codes].group,
        exclude_phecodes=phecode_ht[mt.icd_codes].exclude_phecodes)

    # Annotate sex for sex-specific phenotypes
    ukb_pheno_ht = hl.read_table(get_ukb_pheno_ht_path())
    mt = mt.annotate_rows(isFemale=ukb_pheno_ht[mt.userId].sex == 0)
    mt = checkpoint_tmp(mt)

    # Compute phecodes excluded from controls
    mt = mt.key_cols_by()
    exclude_mt = combine_phenotypes(mt,
                                    mt.phecode,
                                    mt.include_to_cases,
                                    list_of_phecodes_to_exclude,
                                    new_entry_name='exclude_from_controls')
    exclude_mt = checkpoint_tmp(exclude_mt)

    # Annotate exclusion
    mt = mt.key_cols_by('exclude_phecodes')
    mt = mt.annotate_entries(
        exclude_sex=(hl.switch(mt.phecode_sex)
                     .when("males", mt.isFemale)
                     .when("females", ~mt.isFemale)
                     .default(False)),
        exclude_from_controls=hl.coalesce(
            exclude_mt[mt.userId, mt.exclude_phecodes].exclude_from_controls,
            False))

    # Compute final case/control status.
    # `case_control` becomes missing (NA) if a sample 1) is excluded because of
    # sex, or 2) is not a case and is excluded from controls.
    mt = mt.annotate_entries(case_control=hl.if_else(
        mt.exclude_sex | (~mt.include_to_cases & mt.exclude_from_controls),
        hl.null(hl.tbool),
        mt.include_to_cases))
    mt = mt.key_cols_by('phecode')
    mt.describe()

    mt.write(get_ukb_pheno_mt_path('phecode'), overwrite=args.overwrite)
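
# A plausible command-line entry point for main (illustrative sketch; the
# original script may define additional flags beyond --overwrite):
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--overwrite', action='store_true',
                        help='overwrite existing tables and checkpoints')
    main(parser.parse_args())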