def test_tdt(self):
    # Run Hail's TDT on the multi-allelic-split test VCF with the test pedigree.
    pedigree = hl.Pedigree.read(resource('tdt.fam'))
    split_mt = hl.split_multi_hts(hl.import_vcf(resource('tdt.vcf'), min_partitions=4))
    tdt_tab = hl.transmission_disequilibrium_test(split_mt, pedigree)

    # Load precomputed truth data and key it the same way as the TDT output.
    truth = hl.import_table(
        resource('tdt_results.tsv'),
        types={'POSITION': hl.tint32, 'T': hl.tint32, 'U': hl.tint32,
               'Chi2': hl.tfloat64, 'Pval': hl.tfloat64})
    truth = (truth
             .transmute(locus=hl.locus(truth.CHROM, truth.POSITION),
                        alleles=[truth.REF, truth.ALT])
             .key_by('locus', 'alleles'))

    if tdt_tab.count() != truth.count():
        self.fail('Result has {} rows but should have {} rows'.format(
            tdt_tab.count(), truth.count()))

    # Drop NaN p-values on both sides, join, then keep only disagreeing rows.
    bad = (tdt_tab.filter(hl.is_nan(tdt_tab.p_value), keep=False)
           .join(truth.filter(hl.is_nan(truth.Pval), keep=False), how='outer'))
    bad = bad.filter(~(
        (bad.t == bad.T) &
        (bad.u == bad.U) &
        (hl.abs(bad.chi2 - bad.Chi2) < 0.001) &
        (hl.abs(bad.p_value - bad.Pval) < 0.001)))

    if bad.count() != 0:
        bad.order_by(hl.asc(bad.v)).show()
        self.fail('Found rows in violation of the predicate (see show output)')
def test_tdt(self):
    """Check transmission_disequilibrium_test against precomputed truth data.

    Compares the t/u counts, chi-squared statistic, and p-value per variant
    against tdt_results.tsv within an absolute tolerance of 0.001.
    """
    pedigree = hl.Pedigree.read(resource('tdt.fam'))
    tdt_tab = (hl.transmission_disequilibrium_test(
        hl.split_multi_hts(hl.import_vcf(resource('tdt.vcf'), min_partitions=4)),
        pedigree))

    truth = hl.import_table(
        resource('tdt_results.tsv'),
        types={'POSITION': hl.tint32, 'T': hl.tint32, 'U': hl.tint32,
               'Chi2': hl.tfloat64, 'Pval': hl.tfloat64})
    truth = (truth
             .transmute(locus=hl.locus(truth.CHROM, truth.POSITION),
                        alleles=[truth.REF, truth.ALT])
             .key_by('locus', 'alleles'))

    if tdt_tab.count() != truth.count():
        self.fail('Result has {} rows but should have {} rows'.format(
            tdt_tab.count(), truth.count()))

    # Drop NaN p-values on both sides (chi-sq is undefined there), join,
    # then keep only rows where result and truth disagree beyond tolerance.
    bad = (tdt_tab.filter(hl.is_nan(tdt_tab.p_value), keep=False)
           .join(truth.filter(hl.is_nan(truth.Pval), keep=False), how='outer'))
    bad = bad.filter(~(
        (bad.t == bad.T) &
        (bad.u == bad.U) &
        (hl.abs(bad.chi_sq - bad.Chi2) < 0.001) &
        (hl.abs(bad.p_value - bad.Pval) < 0.001)))

    if bad.count() != 0:
        # NOTE(review): 'v' may not be a field of the locus/alleles-keyed
        # join result — confirm this failure path actually renders.
        bad.order_by(hl.asc(bad.v)).show()
        self.fail('Found rows in violation of the predicate (see show output)')
def test_order_by(self):
    # order_by should accept a bare field name as well as asc()/desc() wrappers.
    ht = hl.utils.range_table(10)
    expected = list(range(10))
    self.assertEqual(ht.order_by('idx').idx.collect(), expected)
    self.assertEqual(ht.order_by(hl.asc('idx')).idx.collect(), expected)
    self.assertEqual(ht.order_by(hl.desc('idx')).idx.collect(), expected[::-1])
def test_order_by(self):
    # Table-driven check: each sort key should yield the matching ordering.
    table = hl.utils.range_table(10)
    ascending = list(range(10))
    descending = ascending[::-1]
    cases = (
        ('idx', ascending),
        (hl.asc('idx'), ascending),
        (hl.desc('idx'), descending),
    )
    for sort_key, want in cases:
        self.assertEqual(table.order_by(sort_key).idx.collect(), want)
def prepare_base_level_pext(base_level_pext_path):
    """Collapse per-base pext scores into runs of contiguous bases with equal values.

    Reads a Hail table of base-level pext scores, exports it to a local TSV,
    streams the TSV merging adjacent bases whose tissue values are identical
    into regions, copies the region TSV to HDFS, re-imports it as a Hail
    table, and returns it grouped by gene with one collected list of regions
    per gene.

    :param base_level_pext_path: path to the base-level pext Hail table.
    :return: Hail table keyed by gene_id with a `regions` array field.
    """
    # Local scratch directory for the intermediate TSV files.
    tmp_dir = os.path.expanduser("~")

    #
    # Step 1: rename fields, extract chrom/pos from locus, convert missing values to 0, export to TSV
    #
    ds = hl.read_table(base_level_pext_path)

    ds = ds.select(
        gene_id=ds.ensg,
        chrom=ds.locus.contig,
        pos=ds.locus.position,
        # Replace NaNs and missing values with 0s
        mean=hl.if_else(
            hl.is_missing(ds.mean_proportion) | hl.is_nan(ds.mean_proportion),
            hl.float(0),
            ds.mean_proportion),
        **{
            # Same missing/NaN -> 0 cleanup for every tissue column, renamed
            # according to TISSUE_NAME_MAP.
            renamed: hl.if_else(
                hl.is_missing(ds[original]) | hl.is_nan(ds[original]),
                hl.float(0),
                ds[original])
            for original, renamed in TISSUE_NAME_MAP.items()
        })

    # Sort so the streaming merge in Step 2 sees each gene's bases in
    # ascending position order.
    ds = ds.order_by(ds.gene_id, hl.asc(ds.pos)).drop("locus")

    # "file://" prefix writes to the local filesystem rather than HDFS.
    ds.export("file://" + os.path.join(tmp_dir, "bases.tsv"))

    #
    # Step 2: Collect base-level data into regions
    #
    with open(os.path.join(tmp_dir, "regions.tsv"), "w") as output_file:
        writer = csv.writer(output_file, delimiter="\t")
        writer.writerow(["gene_id", "chrom", "start", "stop", "mean"] + TISSUE_FIELDS)

        # Write one finished region as a TSV row.
        def output_region(region):
            writer.writerow([
                region.gene,
                region.chrom,
                region.start,
                region.stop,
                region.tissues["mean"]
            ] + [region.tissues[t] for t in TISSUE_FIELDS])

        rows = read_bases_tsv(os.path.join(tmp_dir, "bases.tsv"))
        # Seed the first open region from the first base.
        first_row = next(rows)
        current_region = Region(
            gene=first_row.gene,
            chrom=first_row.chrom,
            start=first_row.pos,
            stop=None,
            tissues=first_row.tissues)
        last_pos = first_row.pos

        for row in tqdm(rows):
            # Close the open region when the gene or chromosome changes, a
            # positional gap appears, or any tissue value differs.
            if (row.gene != current_region.gene
                    or row.chrom != current_region.chrom
                    or row.pos > (last_pos + 1)
                    or any(row.tissues[t] != current_region.tissues[t] for t in row.tissues)):
                output_region(current_region._replace(stop=last_pos))
                current_region = Region(
                    gene=row.gene,
                    chrom=row.chrom,
                    start=row.pos,
                    stop=None,
                    tissues=row.tissues)
            last_pos = row.pos

        # Flush the final open region.
        output_region(current_region._replace(stop=last_pos))

    # Copy regions file to HDFS
    # (presumably so hl.import_table in Step 3 can read it — confirm)
    subprocess.run(
        [
            "hdfs",
            "dfs",
            "-cp",
            "file://" + os.path.join(tmp_dir, "regions.tsv"),
            os.path.join("/tmp/regions.tsv")
        ],
        check=True,
    )

    #
    # Step 3: Convert regions to a Hail table.
    #
    types = {t: hl.tfloat for t in TISSUE_FIELDS}
    types["gene_id"] = hl.tstr
    types["chrom"] = hl.tstr
    types["start"] = hl.tint
    types["stop"] = hl.tint
    types["mean"] = hl.tfloat

    ds = hl.import_table("/tmp/regions.tsv", min_partitions=100, missing="", types=types)

    # Nest the tissue columns under a single `tissues` struct field.
    ds = ds.select(
        "gene_id",
        "chrom",
        "start",
        "stop",
        "mean",
        tissues=hl.struct(**{t: ds[t] for t in TISSUE_FIELDS}))

    # One row per gene, with all of its regions collected into an array.
    ds = ds.group_by("gene_id").aggregate(
        regions=hl.agg.collect(ds.row_value.drop("gene_id")))

    return ds
ds = ds.select( gene_id=ds.ensg, chrom=ds.locus.contig, pos=ds.locus.position, # Replace NaNs and missing values with 0s mean=hl.cond( hl.is_missing(ds.mean_proportion) | hl.is_nan(ds.mean_proportion), hl.float(0), ds.mean_proportion), **{ renamed: hl.cond( hl.is_missing(ds[original]) | hl.is_nan(ds[original]), hl.float(0), ds[original]) for original, renamed in tissue_map.items() }) ds = ds.order_by(ds.gene_id, hl.asc(ds.pos)).drop("locus") ds.export("bases.tsv") # # Step 2: Collect base-level data into regions # Row = namedtuple("Row", ["gene", "chrom", "pos", "tissues"]) Region = namedtuple("Region", ["gene", "chrom", "start", "stop", "tissues"]) def read_bases_tsv(filename): with open(filename) as f: reader = csv.reader(f, delimiter="\t") header_row = next(reader) tissue_names = header_row[3:] for row in reader: