Example #1
def export_ldscore(ht, pop):
    hm3_snps = hl.read_table(get_hm3_snplist_path(pop))

    ht = ht.select(CHR=ht.locus.contig,
                   SNP=hl.variant_str(ht.locus, ht.alleles),
                   RSID=ht.rsid,
                   BP=ht.locus.position,
                   L2=ht.ld_score,
                   MAF=0.5 - hl.abs(0.5 - ht.AF))
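    # M = total variant count; M_5_50 = number of variants with MAF > 5%;
    # both counts are written to the M and M_5_50 flat files below.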
    count = ht.aggregate(
        hl.struct(M=hl.agg.count(), M_5_50=hl.agg.sum(ht.MAF > 0.05)))
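    # Keep only variants present in the population's HapMap 3 SNP list.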
    ht = ht.filter(hl.is_defined(hm3_snps[ht.locus, ht.alleles]))
    ht = ht.key_by().drop('locus', 'alleles', 'MAF')

    with hadoop_open(get_ld_score_flat_file_path(pop, extension='M'),
                     'w') as f:
        f.write(f'{count.M}\n')
    with hadoop_open(get_ld_score_flat_file_path(pop, extension='M_5_50'),
                     'w') as f:
        f.write(f'{count.M_5_50}\n')

    # Export LD scores keyed by variant ID
    ht.drop('RSID').export(get_ld_score_flat_file_path(pop))
    # Export LD scores keyed by rsID
    ht.transmute(SNP=ht.RSID).export(
        get_ld_score_flat_file_path(pop, rsid=True))
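
As a rough illustration (not part of the original example), one of the flat files written above could be read back with the same hadoop_open call; get_ld_score_flat_file_path and the pop label come from Example #1, while the 'EUR' value is only an assumed placeholder:

pop = 'EUR'  # assumed population label, for illustration only
with hadoop_open(get_ld_score_flat_file_path(pop, extension='M_5_50')) as f:
    m_5_50 = int(f.read().strip())
print(f'{pop}: {m_5_50} variants with MAF > 5%')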
Example #2
    def load_bin(ns, path):
        # Read a lower-triangular matrix (diagonal included) stored as consecutive
        # little-endian 32-bit floats; `self` is captured from the enclosing test case.
        m = np.zeros((ns, ns))
        with utils.hadoop_open(path, 'rb') as f:
            for i in range(ns):
                for j in range(i + 1):
                    b = f.read(4)
                    self.assertEqual(len(b), 4)
                    m[i, j] = unpack('<f', bytearray(b))[0]
            # No bytes may remain once all ns * (ns + 1) / 2 entries have been read.
            left = f.read()
            self.assertEqual(len(left), 0)
        return m
Example #3
    def test_hadoop_is_file(self, prefix: Optional[str] = None):
        if prefix is None:
            prefix = self.remote_tmpdir

        a_file = f'{prefix}/test_hadoop_is_file.txt'
        with hadoop_open(a_file, 'w') as f:
            f.write("HELLO WORLD")

        self.assertTrue(hl.hadoop_is_file(a_file))
        self.assertFalse(hl.hadoop_is_file(f'{prefix}/'))
        self.assertFalse(hl.hadoop_is_file(f'{prefix}/invalid-path'))
Example #4
    def test_hadoop_exists(self, prefix: Optional[str] = None):
        if prefix is None:
            prefix = self.remote_tmpdir

        with hadoop_open(f'{prefix}/test_exists.txt', 'w') as f:
            f.write("HELLO WORLD")

        r_exists = f'{prefix}/test_exists.txt'
        r_not_exists = f'{prefix}/not_exists.txt'
        self.assertTrue(hl.hadoop_exists(r_exists))
        self.assertFalse(hl.hadoop_exists(r_not_exists))
Example #5
    def test_hadoop_exists(self, bucket=None):
        if bucket is None:
            bucket = self.remote_bucket

        with hadoop_open(f'{bucket}/test_exists.txt', 'w') as f:
            f.write("HELLO WORLD")

        r_exists = f'{bucket}/test_exists.txt'
        r_not_exists = f'{bucket}/not_exists.txt'
        self.assertTrue(hl.hadoop_exists(r_exists))
        self.assertFalse(hl.hadoop_exists(r_not_exists))
Example #6
    def test_hadoop_is_file(self, bucket=None):
        if bucket is None:
            bucket = self.remote_bucket

        a_file = f'{bucket}/test_hadoop_is_file.txt'
        with hadoop_open(a_file, 'w') as f:
            f.write("HELLO WORLD")

        self.assertTrue(hl.hadoop_is_file(a_file))
        self.assertFalse(hl.hadoop_is_file(f'{bucket}/'))
        self.assertFalse(hl.hadoop_is_file(f'{bucket}/invalid-path'))
Example #7
        def load_grm(ns, nv, path):
            m = np.zeros((ns, ns))
            with utils.hadoop_open(path) as f:
                i = 0
                for line in f:
                    # Each line holds the 1-based sample indices, the variant count, and the GRM entry.
                    row = line.strip().split('\t')
                    self.assertEqual(int(row[2]), nv)
                    m[int(row[0]) - 1, int(row[1]) - 1] = float(row[3])
                    i += 1

                self.assertEqual(i, ns * (ns + 1) / 2)
            return m
Example #8
    def test_hadoop_methods(self, bucket=None):
        if bucket is None:
            bucket = self.remote_bucket

        data = ['foo', 'bar', 'baz']
        data.extend(map(str, range(100)))

        with hadoop_open(f'{bucket}/test_out.txt', 'w') as f:
            for d in data:
                f.write(d)
                f.write('\n')

        with hadoop_open(f'{bucket}/test_out.txt') as f:
            data2 = [line.strip() for line in f]

        self.assertEqual(data, data2)

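        # Writing to a path ending in .gz round-trips through gzip compression transparently.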
        with hadoop_open(f'{bucket}/test_out.txt.gz', 'w') as f:
            for d in data:
                f.write(d)
                f.write('\n')

        with hadoop_open(f'{bucket}/test_out.txt.gz') as f:
            data3 = [line.strip() for line in f]

        self.assertEqual(data, data3)

        hadoop_copy(f'{bucket}/test_out.txt.gz',
                    f'{bucket}/test_out.copy.txt.gz')

        with hadoop_open(f'{bucket}/test_out.copy.txt.gz') as f:
            data4 = [line.strip() for line in f]

        self.assertEqual(data, data4)

        local_fs = HadoopFS()
        with local_fs.open(resource('randomBytes'), buffer_size=100) as f:
            with hadoop_open(f'{bucket}/randomBytesOut',
                             'w',
                             buffer_size=2**18) as out:
                b = f.read()
                out.write(b)

        with hadoop_open(f'{bucket}/randomBytesOut', buffer_size=2**18) as f:
            b2 = f.read()

        self.assertEqual(b, b2)
Example #9
    def test_hadoop_methods(self, prefix: Optional[str] = None):
        if prefix is None:
            prefix = self.remote_tmpdir

        data = ['foo', 'bar', 'baz']
        data.extend(map(str, range(100)))

        with hadoop_open(f'{prefix}/test_out.txt', 'w') as f:
            for d in data:
                f.write(d)
                f.write('\n')

        with hadoop_open(f'{prefix}/test_out.txt') as f:
            data2 = [line.strip() for line in f]

        self.assertEqual(data, data2)

        with hadoop_open(f'{prefix}/test_out.txt.gz', 'w') as f:
            for d in data:
                f.write(d)
                f.write('\n')

        with hadoop_open(f'{prefix}/test_out.txt.gz') as f:
            data3 = [line.strip() for line in f]

        self.assertEqual(data, data3)

        hadoop_copy(f'{prefix}/test_out.txt.gz',
                    f'{prefix}/test_out.copy.txt.gz')

        with hadoop_open(f'{prefix}/test_out.copy.txt.gz') as f:
            data4 = [line.strip() for line in f]

        self.assertEqual(data, data4)

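        # Binary round-trip through hadoop_open with explicit read/write buffer sizes.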
        local_fs = LocalFS()
        with local_fs.open(os.path.join(self.local_dir, 'randomBytes'), 'rb', buffer_size=100) as f:
            with hadoop_open(f'{prefix}/randomBytesOut', 'wb', buffer_size=2**18) as out:
                b = f.read()
                out.write(b)

        with hadoop_open(f'{prefix}/randomBytesOut', 'rb', buffer_size=2**18) as f:
            b2 = f.read()

        self.assertEqual(b, b2)
Example #10
    def test_hadoop_stat(self, prefix: Optional[str] = None):
        if prefix is None:
            prefix = self.remote_tmpdir

        data = ['foo', 'bar', 'baz']
        data.extend(map(str, range(100)))
        with hadoop_open(f'{prefix}/test_hadoop_stat.txt.gz', 'w') as f:
            for d in data:
                f.write(d)
                f.write('\n')

        stat1 = hl.hadoop_stat(f'{prefix}')
        self.assertEqual(stat1['is_dir'], True)

        hadoop_copy(f'{prefix}/test_hadoop_stat.txt.gz',
                    f'{prefix}/test_hadoop_stat.copy.txt.gz')

        stat2 = hl.hadoop_stat(f'{prefix}/test_hadoop_stat.copy.txt.gz')
        # The gzip format permits metadata which makes the compressed file's size unpredictable. In
        # practice, Hadoop creates a 175 byte file and gzip.GzipFile creates a 202 byte file. The 27
        # extra bytes appear to include at least the filename (20 bytes) and a modification timestamp.
        assert stat2['size_bytes'] in (175, 202)
        self.assertEqual(stat2['is_dir'], False)
        self.assertTrue('path' in stat2)
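Example #11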
def main(args):
    hl.init(log='/assign_phecodes.log')

    # Read in the Phecode (v1.2b1) <-> ICD 9/10 codes mapping
    with hadoop_open(
            'gs://ukb-diverse-pops/phecode/UKB_Phecode_v1.2b1_ICD_Mapping.txt',
            'r') as f:
        df = pd.read_csv(f, delimiter='\t', dtype=str)
    list_of_icd_codes_to_include = [
        row.icd_codes.split(',') for _, row in df.iterrows()
    ]
    list_of_phecodes_to_exclude = [
        row.exclude_phecodes.split(',') for _, row in df.iterrows()
    ]
    df['icd_codes'] = list_of_icd_codes_to_include
    df['exclude_phecodes'] = list_of_phecodes_to_exclude
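    # icd_codes and exclude_phecodes now hold lists rather than comma-separated strings.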

    # Convert it to a Hail Table
    phecode_ht = hl.Table.from_pandas(df)
    phecode_ht = phecode_ht.key_by('icd_codes')
    phecode_ht = phecode_ht.checkpoint(
        'gs://ukb-diverse-pops/phecode/UKB_Phecode_v1.2b1_ICD_Mapping.ht',
        overwrite=args.overwrite)

    # Retrieve UKB ICD MatrixTable and combine codes based on Phecode definitions
    icd_all = hl.read_matrix_table(get_ukb_pheno_mt_path('icd_all'))
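    # Group individual ICD code columns by each phecode's ICD code list,
    # producing an include_to_cases entry per (sample, phecode).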
    mt = combine_phenotypes(icd_all,
                            icd_all.icd_code,
                            icd_all.any_codes,
                            list_of_icd_codes_to_include,
                            new_col_name='icd_codes',
                            new_entry_name='include_to_cases')
    mt = mt.annotate_cols(
        phecode=phecode_ht[mt.icd_codes].phecode,
        phecode_sex=phecode_ht[mt.icd_codes].sex,
        phecode_description=phecode_ht[mt.icd_codes].description,
        phecode_group=phecode_ht[mt.icd_codes].group,
        exclude_phecodes=phecode_ht[mt.icd_codes].exclude_phecodes)

    # Annotate sex for sex-specific phenotypes
    ukb_pheno_ht = hl.read_table(get_ukb_pheno_ht_path())
    mt = mt.annotate_rows(isFemale=ukb_pheno_ht[mt.userId].sex == 0)
    mt = checkpoint_tmp(mt)

    # Compute which samples carry phecodes that exclude them from controls
    mt = mt.key_cols_by()
    exclude_mt = combine_phenotypes(mt,
                                    mt.phecode,
                                    mt.include_to_cases,
                                    list_of_phecodes_to_exclude,
                                    new_entry_name='exclude_from_controls')
    exclude_mt = checkpoint_tmp(exclude_mt)

    # Annotate exclusion
    mt = mt.key_cols_by('exclude_phecodes')
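    # Keying columns by exclude_phecodes allows the (userId, exclude_phecodes) lookup into exclude_mt below.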
    mt = mt.annotate_entries(
        exclude_sex=(hl.switch(mt.phecode_sex).when("males", mt.isFemale).when(
            "females", ~mt.isFemale).default(False)),
        exclude_from_controls=hl.coalesce(
            exclude_mt[mt.userId, mt.exclude_phecodes].exclude_from_controls,
            False))

    # Compute final case/control status
    # `case_control` becomes missing (NA) if a sample 1) is excluded because of sex, or 2) is not a case and is excluded from controls.
    mt = mt.annotate_entries(case_control=hl.if_else(
        mt.exclude_sex | (~mt.include_to_cases & mt.exclude_from_controls),
        hl.null(hl.tbool), mt.include_to_cases))

    mt = mt.key_cols_by('phecode')
    mt.describe()

    mt.write(get_ukb_pheno_mt_path('phecode'), overwrite=args.overwrite)
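
The listing stops before the script's entry point; a minimal sketch of how main might be invoked, assuming only the --overwrite flag that the code above reads from args (the argument parser itself is not part of the original):

import argparse

if __name__ == '__main__':
    # Hypothetical entry point; only --overwrite is implied by args.overwrite above.
    parser = argparse.ArgumentParser()
    parser.add_argument('--overwrite', action='store_true',
                        help='Overwrite existing checkpoints and output tables.')
    main(parser.parse_args())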