Example #1
0
    def test_tdt(self):
        """Check transmission_disequilibrium_test output against a precomputed truth table."""
        fam = hl.Pedigree.read(resource('tdt.fam'))
        dataset = hl.split_multi_hts(hl.import_vcf(resource('tdt.vcf'), min_partitions=4))
        result = hl.transmission_disequilibrium_test(dataset, fam)

        expected = hl.import_table(
            resource('tdt_results.tsv'),
            types={'POSITION': hl.tint32, 'T': hl.tint32, 'U': hl.tint32,
                   'Chi2': hl.tfloat64, 'Pval': hl.tfloat64})
        expected = expected.transmute(
            locus=hl.locus(expected.CHROM, expected.POSITION),
            alleles=[expected.REF, expected.ALT])
        expected = expected.key_by('locus', 'alleles')

        n_result = result.count()
        n_expected = expected.count()
        if n_result != n_expected:
            self.fail('Result has {} rows but should have {} rows'.format(n_result, n_expected))

        # Outer-join observed and expected rows, excluding NaN p-values on both sides.
        joined = (result.filter(hl.is_nan(result.p_value), keep=False)
                  .join(expected.filter(hl.is_nan(expected.Pval), keep=False), how='outer'))

        # Keep only the rows where observed and expected disagree beyond tolerance.
        mismatched = joined.filter(~(
            (joined.t == joined.T)
            & (joined.u == joined.U)
            & (hl.abs(joined.chi2 - joined.Chi2) < 0.001)
            & (hl.abs(joined.p_value - joined.Pval) < 0.001)))

        if mismatched.count() != 0:
            mismatched.order_by(hl.asc(mismatched.v)).show()
            self.fail('Found rows in violation of the predicate (see show output)')
Example #2
0
    def test_tdt(self):
        """Compare transmission_disequilibrium_test results to precomputed truth data."""
        ped = hl.Pedigree.read(resource('tdt.fam'))
        mt = hl.split_multi_hts(hl.import_vcf(resource('tdt.vcf'), min_partitions=4))
        observed = hl.transmission_disequilibrium_test(mt, ped)

        expected = hl.import_table(
            resource('tdt_results.tsv'),
            types={'POSITION': hl.tint32, 'T': hl.tint32, 'U': hl.tint32,
                   'Chi2': hl.tfloat64, 'Pval': hl.tfloat64})
        expected = (expected
                    .transmute(locus=hl.locus(expected.CHROM, expected.POSITION),
                               alleles=[expected.REF, expected.ALT])
                    .key_by('locus', 'alleles'))

        if observed.count() != expected.count():
            self.fail('Result has {} rows but should have {} rows'.format(observed.count(), expected.count()))

        # Outer-join the two tables, dropping rows with NaN p-values on either side.
        combined = (observed.filter(hl.is_nan(observed.p_value), keep=False)
                    .join(expected.filter(hl.is_nan(expected.Pval), keep=False), how='outer'))
        combined.describe()

        # Retain only rows that fail to match within tolerance.
        combined = combined.filter(~(
            (combined.t == combined.T)
            & (combined.u == combined.U)
            & (hl.abs(combined.chi_sq - combined.Chi2) < 0.001)
            & (hl.abs(combined.p_value - combined.Pval) < 0.001)))

        if combined.count() != 0:
            combined.order_by(hl.asc(combined.v)).show()
            self.fail('Found rows in violation of the predicate (see show output)')
Example #3
0
 def test_order_by(self):
     """order_by on the index field sorts ascending or descending as requested."""
     table = hl.utils.range_table(10)
     expected = list(range(10))
     # A bare field name sorts ascending.
     self.assertEqual(table.order_by('idx').idx.collect(), expected)
     # hl.asc is the explicit ascending form.
     self.assertEqual(table.order_by(hl.asc('idx')).idx.collect(), expected)
     # hl.desc reverses the order.
     self.assertEqual(table.order_by(hl.desc('idx')).idx.collect(), expected[::-1])
Example #4
0
 def test_order_by(self):
     """Verify ascending and descending sorts of a range table's index field."""
     ht = hl.utils.range_table(10)
     forward = list(range(10))
     backward = list(reversed(forward))
     self.assertEqual(ht.order_by('idx').idx.collect(), forward)
     self.assertEqual(ht.order_by(hl.asc('idx')).idx.collect(), forward)
     self.assertEqual(ht.order_by(hl.desc('idx')).idx.collect(), backward)
Example #5
0
def prepare_base_level_pext(base_level_pext_path):
    """Collapse base-level pext scores into per-gene runs of contiguous,
    identically-scored bases and return them as a Hail table keyed by gene.

    Pipeline: (1) flatten the base-level Hail table and export it as a local
    TSV, (2) stream the TSV and merge adjacent bases with identical tissue
    values into regions, (3) re-import the regions and aggregate them per
    gene.

    :param base_level_pext_path: path to the base-level pext Hail table.
    :return: Hail table with one row per ``gene_id`` and a ``regions`` array.
    """
    # Local scratch directory for the intermediate TSV files.
    tmp_dir = os.path.expanduser("~")

    #
    # Step 1: rename fields, extract chrom/pos from locus, convert missing values to 0, export to TSV
    #
    ds = hl.read_table(base_level_pext_path)

    ds = ds.select(
        gene_id=ds.ensg,
        chrom=ds.locus.contig,
        pos=ds.locus.position,
        # Replace NaNs and missing values with 0s
        mean=hl.if_else(
            hl.is_missing(ds.mean_proportion) | hl.is_nan(ds.mean_proportion),
            hl.float(0), ds.mean_proportion),
        **{
            renamed: hl.if_else(
                hl.is_missing(ds[original]) | hl.is_nan(ds[original]),
                hl.float(0), ds[original])
            for original, renamed in TISSUE_NAME_MAP.items()
        })

    # Sort so each gene's bases are contiguous and position-ordered; the
    # region-merging pass in Step 2 depends on this ordering.
    ds = ds.order_by(ds.gene_id, hl.asc(ds.pos)).drop("locus")
    ds.export("file://" + os.path.join(tmp_dir, "bases.tsv"))

    #
    # Step 2: Collect base-level data into regions
    #
    with open(os.path.join(tmp_dir, "regions.tsv"), "w") as output_file:
        writer = csv.writer(output_file, delimiter="\t")
        writer.writerow(["gene_id", "chrom", "start", "stop", "mean"] +
                        TISSUE_FIELDS)

        def output_region(region):
            # Write one merged region as a TSV row: coordinates, then the
            # mean, then the per-tissue values in TISSUE_FIELDS order.
            writer.writerow([
                region.gene, region.chrom, region.start, region.stop,
                region.tissues["mean"]
            ] + [region.tissues[t] for t in TISSUE_FIELDS])

        rows = read_bases_tsv(os.path.join(tmp_dir, "bases.tsv"))
        # Seed the first open region from the first base; stop stays None
        # until the region is closed via _replace.
        first_row = next(rows)
        current_region = Region(gene=first_row.gene,
                                chrom=first_row.chrom,
                                start=first_row.pos,
                                stop=None,
                                tissues=first_row.tissues)
        last_pos = first_row.pos

        for row in tqdm(rows):
            # Close the open region and start a new one whenever the gene or
            # chromosome changes, the positions are not contiguous, or any
            # tissue value differs from the open region's values.
            if (row.gene != current_region.gene
                    or row.chrom != current_region.chrom or row.pos >
                (last_pos + 1)
                    or any(row.tissues[t] != current_region.tissues[t]
                           for t in row.tissues)):
                output_region(current_region._replace(stop=last_pos))
                current_region = Region(gene=row.gene,
                                        chrom=row.chrom,
                                        start=row.pos,
                                        stop=None,
                                        tissues=row.tissues)

            last_pos = row.pos

        # Flush the final open region.
        output_region(current_region._replace(stop=last_pos))

    # Copy regions file to HDFS
    subprocess.run(
        [
            "hdfs", "dfs", "-cp",
            "file://" + os.path.join(tmp_dir, "regions.tsv"),
            os.path.join("/tmp/regions.tsv")
        ],
        check=True,
    )

    #
    # Step 3: Convert regions to a Hail table.
    #
    # Explicit column types for the re-import; every tissue column is a float.
    types = {t: hl.tfloat for t in TISSUE_FIELDS}
    types["gene_id"] = hl.tstr
    types["chrom"] = hl.tstr
    types["start"] = hl.tint
    types["stop"] = hl.tint
    types["mean"] = hl.tfloat

    ds = hl.import_table("/tmp/regions.tsv",
                         min_partitions=100,
                         missing="",
                         types=types)

    # Nest the per-tissue columns under a single `tissues` struct.
    ds = ds.select("gene_id",
                   "chrom",
                   "start",
                   "stop",
                   "mean",
                   tissues=hl.struct(**{t: ds[t]
                                        for t in TISSUE_FIELDS}))

    # One row per gene, with all of its regions collected into an array.
    ds = ds.group_by("gene_id").aggregate(
        regions=hl.agg.collect(ds.row_value.drop("gene_id")))

    return ds
#
# Step 1 (script version): rename fields, extract chrom/pos from the locus,
# replace missing/NaN proportions with 0, then export base-level data to TSV.
#
ds = ds.select(
    gene_id=ds.ensg,
    chrom=ds.locus.contig,
    pos=ds.locus.position,
    # Replace NaNs and missing values with 0s.
    # hl.if_else is the replacement for the deprecated hl.cond, matching the
    # usage elsewhere in this file.
    mean=hl.if_else(
        hl.is_missing(ds.mean_proportion) | hl.is_nan(ds.mean_proportion),
        hl.float(0), ds.mean_proportion),
    **{
        renamed: hl.if_else(
            hl.is_missing(ds[original]) | hl.is_nan(ds[original]), hl.float(0),
            ds[original])
        for original, renamed in tissue_map.items()
    })

# Sort so each gene's bases are contiguous and position-ordered, drop the
# locus (already flattened into chrom/pos), and write the TSV.
ds = ds.order_by(ds.gene_id, hl.asc(ds.pos)).drop("locus")
ds.export("bases.tsv")

#
# Step 2: Collect base-level data into regions
#
# Lightweight records: one base-level TSV row, and one collapsed run of
# contiguous bases sharing identical tissue values.
Row = namedtuple("Row", ("gene", "chrom", "pos", "tissues"))
Region = namedtuple("Region", ("gene", "chrom", "start", "stop", "tissues"))


def read_bases_tsv(filename):
    with open(filename) as f:
        reader = csv.reader(f, delimiter="\t")
        header_row = next(reader)
        tissue_names = header_row[3:]
        for row in reader: