Example No. 1
def create_full_results_file(prune, overwrite=False):
    r'''
    Concatenates PRS-phenotype regression results into a single table.
    '''
    reg_path_regex = prs_dir + f'prs_phen_reg.*.*.n_remove_{int(n_remove_per_sex)}.seed_*.{"" if prune else "not_"}pruned*.tsv'
    ls = hl.hadoop_ls(reg_path_regex)
    reg_paths = sorted([f['path'] for f in ls])
    df_list = []
    for reg_path in reg_paths:
        with hl.hadoop_open(reg_path) as f:
            df_list.append(pd.read_csv(f, sep='\t'))
    df = pd.concat(df_list, sort=False)
    df.insert(1, 'phen_desc',
              df.phen.astype(str).apply(lambda x: phen_dict[x][0])
              )  # add phenotype description to dataframe

    all_reg_results_path = prs_dir + f'prs_phen_reg.all_phens.n_remove_{int(n_remove_per_sex)}.{"" if prune else "not_"}pruned.tsv'
    if hl.hadoop_is_file(all_reg_results_path) and not overwrite:
        print('\n... Full PRS-phen regression results already written! ...')
        print(all_reg_results_path)
    else:
        print('\n... Writing PRS-phen regression results ...')
        print(all_reg_results_path)
        with hl.hadoop_open(all_reg_results_path, 'w') as f:
            df.to_csv(f, sep='\t', index=False)
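A minimal usage sketch, assuming the module-level names used above (prs_dir, n_remove_per_sex, phen_dict) are defined and Hail has been initialised:

# Illustrative call: combine results from the pruned PRS run, overwriting any existing table.
create_full_results_file(prune=True, overwrite=True)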
Example No. 2
def run_assign_population_pcs(
        pop_pc_table: hl.Table,
        outfile: str,
        picklefile: str,
        pcs: List[int],
        fit: RandomForestClassifier = None,
        seed: int = 42) -> Tuple[hl.Table, RandomForestClassifier]:
    """
    :param Table pop_pc_table: Table containing population PCs ('PC<n>') as well as a column 'known_pop' with population labels
    :param str outfile: filepath to tsv with input samples and imputed population labels
    :param str picklefile: filepath to which the pickled random forest model is written
    :param list of int pcs: 1-based list of PCs to train the model on
    :param RandomForestClassifier fit: fit from a previously trained random forest model (i.e., the output from a previous RandomForestClassifier() call)
    :param int seed: Random seed
    :return: Table containing sample IDs and imputed population labels, trained random forest model
    :rtype: Table, RandomForestClassifier
    """
    data = pop_pc_table.to_pandas()
    data = expand_pd_array_col(data, 'scores', max(pcs), 'PC')
    new_data, pop_clf = assign_population_pcs(
        data, pc_cols=['PC{}'.format(pc) for pc in pcs], fit=fit, seed=seed)

    if not fit:
        # Pickle RF
        with hl.hadoop_open(picklefile, 'wb') as out:
            pickle.dump(pop_clf, out)

    with hl.hadoop_open(outfile, 'w') as out:
        new_data.to_csv(out, sep="\t", na_rep="NA", index=False)
    return hl.import_table(outfile, impute=True).key_by('data_type',
                                                        's'), pop_clf
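A usage sketch with hypothetical paths, assuming pop_pc_table carries a 'scores' array field, a 'known_pop' column, and the 'data_type' and 's' fields used for the final key:

# Hypothetical inputs; PC1-PC6 are used to train/apply the random forest.
pop_pc_ht = hl.read_table('gs://my-bucket/pop_pcs.ht')
pop_ht, pop_rf = run_assign_population_pcs(
    pop_pc_table=pop_pc_ht,
    outfile='gs://my-bucket/assigned_pops.tsv',
    picklefile='gs://my-bucket/pop_rf_model.pkl',
    pcs=list(range(1, 7)),
)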
Example No. 3
    def write(self, path, overwrite):  # pylint: disable=unused-argument
        with hl.hadoop_open(self.path, "r") as input_file:
            with hl.hadoop_open(path, "w") as output_file:
                reader = csv.reader(input_file, delimiter="\t")
                writer = csv.writer(output_file, delimiter="\t", quotechar="'")
                for row in reader:
                    writer.writerow(row)
Example No. 4
def write_functional_pedigree(input_pedigree: str, vcf_samples: list,
                              output_dir: str,
                              output_name: str) -> Tuple[dict, dict, dict]:
    """
    Write a functional pedigree (pedigree with samples not in the VCF removed) and create dictionary of seqr projects, family IDs, and given sex.

    :param input_pedigree: Path to the input pedigree
    :param vcf_samples: List of samples found in the VCF
    :param output_dir: Path to directory where the functional pedigree is written
    :param output_name: Output file name prefix for the functional pedigree
    :return: Dictionaries mapping each sample to its seqr project, family ID, and given sex
    """
    seqr_projects = defaultdict(str)
    family_ids = defaultdict(str)
    given_sex = defaultdict(str)

    out_new_ped = hl.hadoop_open(
        f"{output_dir}/{output_name}_functioning_pedigree.ped", "w")
    out_new_ped.write(
        "Family_ID\tIndividual_ID\tPaternal_ID\tMaternal_ID\tSex\n")

    with hl.hadoop_open(input_pedigree, "r") as infile:
        next(infile)
        for line in infile:
            line = line.rstrip("\n")
            items = line.split("\t")
            (
                Project_GUID,
                Family_ID,
                Individual_ID,
                Paternal_ID,
                Maternal_ID,
                Sex,
            ) = items[0:6]

            if Individual_ID not in vcf_samples:
                Individual_ID = "."
            if Paternal_ID not in vcf_samples:
                Paternal_ID = "."
            if Maternal_ID not in vcf_samples:
                Maternal_ID = "."

            # Only output line from pedigree if the proband is not missing
            if Individual_ID != ".":
                seqr_projects[Individual_ID] = Project_GUID
                family_ids[Individual_ID] = Family_ID
                given_sex[Individual_ID] = Sex

                out_new_ped.write(
                    f"{Family_ID}\t{Individual_ID}\t{Paternal_ID}\t{Maternal_ID}\t{Sex}\n"
                )

    out_new_ped.close()

    return seqr_projects, family_ids, given_sex
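A usage sketch with hypothetical paths; the VCF sample list would typically come from the callset's column keys:

# Hypothetical inputs: restrict the pedigree to samples present in the VCF.
vcf_samples = hl.read_matrix_table('gs://my-bucket/callset.mt').s.collect()
seqr_projects, family_ids, given_sex = write_functional_pedigree(
    input_pedigree='gs://my-bucket/full_pedigree.ped',
    vcf_samples=vcf_samples,
    output_dir='gs://my-bucket/pedigree_out',
    output_name='my_callset',
)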
Example No. 5
def query():  # pylint: disable=too-many-locals
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    mt = hl.read_matrix_table(FILTERED_VARIANTS)
    nrows = mt.count_rows()
    print(f'mt.count_rows() = {nrows}')

    # Plot the allele frequency
    fig = figure(
        title='Variant AF',
        x_axis_label='Allele Frequency',
        y_axis_label='Frequency (%)',
    )
    variant_af = mt.variant_qc.AF[1].collect()
    af_count, edges = np.histogram(variant_af,
                                   bins=100,
                                   weights=np.ones(len(variant_af)) /
                                   len(variant_af))
    variant_af_count = pd.DataFrame({
        'variant_af_count': af_count,
        'left': edges[:-1],
        'right': edges[1:]
    })
    fig.quad(
        bottom=0,
        top=variant_af_count['variant_af_count'],
        left=variant_af_count['left'],
        right=variant_af_count['right'],
        fill_color='blue',
        line_color='black',
    )
    # Add in the cumulative distribution
    cumulative_af = np.cumsum(af_count)
    fig.line(
        x=variant_af_count['right'],
        y=cumulative_af,
        color='gray',
        line_width=1,
        legend='Cum dist',
    )
    fig.legend.location = 'top_left'
    fig_filename = output_path('variant_selection_histogram.png', 'web')
    with hl.hadoop_open(fig_filename, 'wb') as f:
        get_screenshot_as_png(fig).save(f, format='PNG')
    html = file_html(fig, CDN, 'my plot')
    fig_filename_html = output_path('variant_selection_histogram.html', 'web')
    with hl.hadoop_open(fig_filename_html, 'w') as f:
        f.write(html)
Example No. 6
def combine_gvcfs(
        gvcf_input_list: str,
        out_path: str = 'gs://african-seq-data/gambian-genomes/COMBINED_GVCFS/',
        samples: str = 'gs://african-seq-data/gambian-genomes/sample_names.txt',
        gvcf_header: str = 'gs://african-seq-data/gambian-genomes/merged-gvcf/SC_GMJOL5309875.alt_bwamem_GRCh38DH.20151208.JOLA.gambian_lowcov/SC_GMJOL5309875.alt_bwamem_GRCh38DH.20151208.JOLA.gambian_lowcov.g.vcf.gz',
        out_mt_name: str = 'gambian_genomes_merged_gvcfs',
        temp_bucket: str = 'gs://african-seq-data',
        reference: str = 'GRCh38',
        use_genome_default_intervals: bool = True,
        overwrite: bool = True,
        key_by_locus_and_alleles: bool = True):
    """
    Combine single-sample GVCFs into a multi-sample matrix table

    :param gvcf_input_list: text file listing the GVCFs to combine, one path per line
    :param out_path: path to where multi-sample MT will be written
    :param samples: text file with sample names as they appear in each GVCF to be merged. One sample name per line
    :param gvcf_header: GVCF file whose header is going to be used as default
    :param out_mt_name: name to use for output MT
    :param temp_bucket: bucket for storing intermediate files
    :param reference: reference genome to use
    :param use_genome_default_intervals: import GVCFs with uniform partition intervals of default size for whole-genome data
    :param overwrite: overwrite MT if it exists
    :param key_by_locus_and_alleles: key by both locus and alleles in the final output
    """

    inputs = []
    with hl.hadoop_open(gvcf_input_list, 'r') as f:
        for line in f:
            inputs.append(line.strip())

    samples_list = []
    with hl.hadoop_open(samples, 'r') as f:
        for line in f:
            samples_list.append(line.strip())

    output_file = f'{out_path}{out_mt_name}.mt'  # output destination

    hl.experimental.run_combiner(
        inputs,
        out_file=output_file,
        tmp_path=temp_bucket,
        header=gvcf_header,
        sample_names=samples_list,
        reference_genome=reference,
        use_genome_default_intervals=use_genome_default_intervals,
        overwrite=overwrite,
        key_by_locus_and_alleles=key_by_locus_and_alleles)
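A usage sketch; only the GVCF list is passed here and every other argument falls back to the defaults above (the list path is hypothetical):

# Hypothetical file: one GVCF path per line, ordered to match the sample names file.
combine_gvcfs(gvcf_input_list='gs://my-bucket/gvcf_paths.txt')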
Example No. 7
def main(args):
    full_vcf = hl.read_matrix_table(args.allreads_prefix + '.mt')

    # liftover chains
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')
    rg37.add_liftover(
        'gs://hail-common/references/grch37_to_grch38.over.chain.gz', rg38)

    chips = hl.hadoop_open(args.chip_loci)
    chip_dict = {}
    for chip in chips:
        chip = chip.strip().split()
        chip_pos = hl.import_table(chip[1],
                                   filter=r'\[Controls\]',
                                   skip_blank_lines=True)
        chip_pos = chip_pos.filter(
            hl.array(list(map(str, range(1, 23))) + ['X', 'Y']).contains(
                chip_pos.chr))
        chip_pos = chip_pos.key_by(
            locus=hl.locus(chip_pos.chr, hl.int(chip_pos.pos)))

        #  liftover chip position info
        chip_pos = chip_pos.annotate(
            new_locus=hl.liftover(chip_pos.locus, 'GRCh38'))
        chip_pos = chip_pos.filter(hl.is_defined(chip_pos.new_locus))
        chip_pos = chip_pos.key_by(locus=chip_pos.new_locus)

        # filter full vcf to sites in genotype data
        geno_vcf = full_vcf.filter_rows(hl.is_defined(
            chip_pos[full_vcf.locus]))
        hl.export_vcf(
            geno_vcf,
            'gs://neurogap/high_coverage/NeuroGap_30x_' + chip[0] + '.vcf.bgz')
Example No. 8
def run_population_pca(mt: hl.MatrixTable, build: int, num_pcs=6) -> hl.Table:
    """
    Projects samples onto pre-computed gnomAD and rare disease sample principal components using PCA loadings.
    A random forest classifier assigns gnomAD and rare disease sample population labels.

    :param mt: QC MatrixTable
    :param build: 37 or 38 for write path
    :param num_pcs: Number of PCs to use in model
    :return: Table annotated with assigned RDG and gnomAD population and PCs
    :rtype: Table
    """
    loadings = hl.read_table(rdg_gnomad_pop_pca_loadings_path(build))
    model_path = rdg_gnomad_rf_model_path()
    mt = mt.select_entries("GT")
    scores = pc_project(mt, loadings)
    scores = scores.annotate(
        scores=scores.scores[:num_pcs], known_pop="Unknown"
    ).key_by("s")

    logger.info("Unpacking RF model")
    fit = None
    with hl.hadoop_open(model_path, "rb") as f:
        fit = pickle.load(f)

    pop_pca_ht, ignore = assign_population_pcs(
        scores, pc_cols=scores.scores, output_col="qc_pop", fit=fit
    )
    pop_pca_ht = pop_pca_ht.key_by("s")
    pop_pcs = {f"pop_PC{i+1}": scores.scores[i] for i in range(num_pcs)}
    scores = scores.annotate(**pop_pcs).drop("scores", "known_pop")
    pop_pca_ht = pop_pca_ht.annotate(**scores[pop_pca_ht.key])
    return pop_pca_ht
Example No. 9
def read_sample_ids(sample_ids_path,
                    start_with_sample_i,
                    n_samples_to_process,
                    n_sample_ids_to_print=10):
    """Read sample ids file.

    Args:
        sample_ids_path (str): sample ids path
        n_sample_ids_to_print (int): log no more than this many sample ids to stdout.
    Return:
        list: sample id strings
    """
    sample_ids = []
    with hl.hadoop_open(sample_ids_path) if sample_ids_path.startswith(
            "gs://") else open(sample_ids_path, "rt") as f:
        for i, line in enumerate(f):
            if i < start_with_sample_i:
                continue
            elif i >= start_with_sample_i + n_samples_to_process:
                break

            sample_id = line.rstrip("\n")
            sample_ids.append(sample_id)

            if i <= n_sample_ids_to_print:
                logging.info(sample_id)
                if i == n_sample_ids_to_print and n_sample_ids_to_print > 0:
                    logging.info("...")

    logging.info(f"Parsed {len(sample_ids)} sample ids from {sample_ids_path}")

    return sample_ids
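A usage sketch with a hypothetical sample-ids file (one id per line):

# Read ids 100-199 from the list; the first few are logged to stdout.
sample_ids = read_sample_ids(
    'gs://my-bucket/sample_ids.txt',
    start_with_sample_i=100,
    n_samples_to_process=100,
)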
Example No. 10
def main(df_x_path, df_y_path, output_path, python_image):
    backend = hb.ServiceBackend()
    b = hb.Batch(name='rf-loo', default_python_image=python_image)

    with hl.hadoop_open(df_y_path) as f:
        local_df_y = pd.read_table(f, header=0, index_col=0)

    df_x_input = b.read_input(df_x_path)
    df_y_input = b.read_input(df_y_path)

    results = []

    for window in local_df_y.index.to_list():
        checkpoint = checkpoint_path(window)
        if hl.hadoop_exists(checkpoint):
            result = b.read_input(checkpoint)
            results.append(result)
            continue

        j = b.new_python_job()

        result = j.call(random_forest, df_x_input, df_y_input, window)
        tsv_result = j.call(as_tsv, result)
        tsv_result = tsv_result.as_str()

        b.write_output(tsv_result, checkpoint)
        results.append(tsv_result)

    output = hb.concatenate(b, results)
    b.write_output(output, output_path)

    b.run(wait=False)
    backend.close()
Example No. 11
def main(args):

    print("main")
    ht = hl.read_table(
        f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_table_for_RF_by_variant_type.ht'
    )

    run_hash = str(uuid.uuid4())[:8]
    rf_runs = get_rf_runs(f'{tmp_dir}/ddd-elgh-ukbb/')
    while run_hash in rf_runs:
        run_hash = str(uuid.uuid4())[:8]
    ht_result, rf_model = train_rf(ht, args)
    print("Writing out ht_training data")
    ht_result = ht_result.checkpoint(
        f'{tmp_dir}/ddd-elgh-ukbb/Sanger_RF_training_data.ht', overwrite=True)
    rf_runs[run_hash] = get_run_data(
        vqsr_training=False,
        transmitted_singletons=True,
        test_intervals=args.test_intervals,
        adj=False,
        features_importance=hl.eval(ht_result.features_importance),
        test_results=hl.eval(ht_result.test_results),
    )

    with hl.hadoop_open(f'{plot_dir}/ddd-elgh-ukbb/variant_qc/rf_runs.json',
                        "w") as f:
        json.dump(rf_runs, f)

    logger.info("Saving RF model")
    save_model(rf_model, f'{tmp_dir}/ddd-elgh-ukbb/rf_model.model')
Example No. 12
def query():
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)

    # Perform kinship test with pc_relate
    pc_rel_path = output_path('pc_relate_kinship_estimate.ht')
    pc_rel = hl.pc_relate(mt.GT, 0.01, k=10, statistics='kin')
    pc_rel.write(pc_rel_path, overwrite=True)
    pairs = pc_rel.filter(pc_rel['kin'] >= 0.125)
    related_samples_to_remove = hl.maximal_independent_set(
        pairs.i, pairs.j, False)
    n_related_samples = related_samples_to_remove.count()
    print(f'related_samples_to_remove.count() = {n_related_samples}')

    # save as html
    html = pd.DataFrame({
        'removed_individual':
        related_samples_to_remove.node.s.collect()
    }).to_html()
    plot_filename_html = output_path('removed_samples.html', 'web')
    with hl.hadoop_open(plot_filename_html, 'w') as f:
        f.write(html)
Example No. 13
def print_ref_block_stats(path: str):
    import numpy as np

    def _print_block_stats(stats: hl.Struct):
        def get_quantile(cum_prop, quantile):
            return [i for i, x in enumerate(cum_prop) if x > quantile][0]

        for strat, ref_block_stats in [('all', stats.ref_block_stats),
                                       ('adj', stats.adj_ref_block_stats)]:
            n_blocks = np.sum(
                ref_block_stats.hist.bin_freq) + ref_block_stats.hist.n_larger
            cum_blocks = np.cumsum(ref_block_stats.hist.bin_freq)
            cum_prop = [x / n_blocks for x in cum_blocks]

            print(f"Stats for {strat}")
            print(f"Number of blocks: {n_blocks}")
            print(f"Largest block size: {ref_block_stats.stats.max}")
            print(f"95% block size: {get_quantile(cum_prop, 0.95)}")
            print(f"99% block size: {get_quantile(cum_prop, 0.99)}")
            print(f"99.9% block size: {get_quantile(cum_prop, 0.999)}")
            print(f"99.95% block size: {get_quantile(cum_prop, 0.9995)}")
            print(
                f"Fraction of blocks below 10k: {1-(ref_block_stats.hist.n_larger/n_blocks)}"
            )

    if path.startswith('gs://'):
        with hl.hadoop_open(path, 'rb') as f:
            _print_block_stats(pickle.load(f))
    else:
        with open(path, 'rb') as f:
            _print_block_stats(pickle.load(f))
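A usage sketch; the pickle is expected to hold a struct with ref_block_stats and adj_ref_block_stats fields as unpacked above (the path is hypothetical):

# Works for both gs:// and local paths.
print_ref_block_stats('gs://my-bucket/ref_block_stats.pickle')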
Example No. 14
def get_rows_data(rows_files):
    file_sizes = []
    partition_bounds = []
    parts_file = [x['path'] for x in rows_files if x['path'].endswith('parts')]
    if parts_file:
        parts = hl.hadoop_ls(parts_file[0])
        for i, x in enumerate(parts):
            index = x['path'].split(f'{parts_file[0]}/part-')[1].split('-')[0]
            if i < len(parts) - 1:
                test_index = parts[i + 1]['path'].split(
                    f'{parts_file[0]}/part-')[1].split('-')[0]
                if test_index == index:
                    continue
            file_sizes.append(x['size_bytes'])
    metadata_file = [
        x['path'] for x in rows_files if x['path'].endswith('metadata.json.gz')
    ]
    if metadata_file:
        with hl.hadoop_open(metadata_file[0], 'rb') as f:
            rows_meta = json.loads(f.read())
            try:
                partition_bounds = [(x['start']['locus']['contig'],
                                     x['start']['locus']['position'],
                                     x['end']['locus']['contig'],
                                     x['end']['locus']['position'])
                                    for x in rows_meta['jRangeBounds']]
            except KeyError:
                pass
    return partition_bounds, file_sizes
Example No. 15
    def test_export(self):
        t = hl.utils.range_table(1).annotate(foo = 3)
        tmp_file = new_temp_file()
        t.export(tmp_file)

        with hl.hadoop_open(tmp_file, 'r') as f_in:
            assert f_in.read() == 'idx\tfoo\n0\t3\n'
Example No. 16
    def test_export_delim(self):
        t = hl.utils.range_table(1).annotate(foo = 3)
        tmp_file = new_temp_file()
        t.export(tmp_file, delimiter=',')

        with hl.hadoop_open(tmp_file, 'r') as f_in:
            assert f_in.read() == 'idx,foo\n0,3\n'
Example No. 17
def query():
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    mt = mt.filter_cols(
        (mt.hgdp_1kg_metadata.population_inference.pop == 'nfe')
        | (mt.s.contains('TOB'))
    )
    # Remove related samples (at the 2nd degree or closer)
    king = hl.king(mt.GT)
    king_path = output_path('king_kinship_estimate_NFE.ht')
    king.write(king_path)
    ht = king.entries()
    related_samples = ht.filter((ht.s_1 != ht.s) & (ht.phi > 0.125), keep=True)
    struct = hl.struct(i=related_samples.s_1, j=related_samples.s)
    struct = struct.annotate(phi=related_samples.phi)
    related_samples_to_remove = hl.maximal_independent_set(
        struct.i, struct.j, False  # pylint: disable=E1101
    )
    n_related_samples = related_samples_to_remove.count()
    print(f'related_samples_to_remove.count() = {n_related_samples}')
    # save as html
    html = pd.DataFrame(
        {'related_individual': related_samples_to_remove.node.collect()}
    ).to_html()
    plot_filename_html = output_path('related_samples.html', 'web')
    with hl.hadoop_open(plot_filename_html, 'w') as f:
        f.write(html)
Example No. 18
def get_rows_data(rows_files):  # noqa: D103
    file_sizes = []
    partition_bounds = []
    parts_file = [x["path"] for x in rows_files if x["path"].endswith("parts")]
    if parts_file:
        parts = hl.hadoop_ls(parts_file[0])
        for i, x in enumerate(parts):
            index = x["path"].split(f"{parts_file[0]}/part-")[1].split("-")[0]
            if i < len(parts) - 1:
                test_index = (parts[i + 1]["path"].split(
                    f"{parts_file[0]}/part-")[1].split("-")[0])
                if test_index == index:
                    continue
            file_sizes.append(x["size_bytes"])
    metadata_file = [
        x["path"] for x in rows_files if x["path"].endswith("metadata.json.gz")
    ]
    if metadata_file:
        with hl.hadoop_open(metadata_file[0], "rb") as f:
            rows_meta = json.loads(f.read())
            try:
                partition_bounds = [(
                    x["start"]["locus"]["contig"],
                    x["start"]["locus"]["position"],
                    x["end"]["locus"]["contig"],
                    x["end"]["locus"]["position"],
                ) for x in rows_meta["jRangeBounds"]]
            except KeyError:
                pass
    return partition_bounds, file_sizes
Example No. 19
def get_cases_and_controls_from_log(log_format):
    """
    'gs://path/to/result_chr{chrom}_000000001.variant.log'
    """
    cases = controls = -1
    for chrom in range(10, 23):
        try:
            with hl.hadoop_open(log_format.format(chrom=chrom)) as f:
                for line in f:
                    line = line.strip()
                    if line.startswith('Analyzing'):
                        fields = line.split()
                        if len(fields) == 6:
                            try:
                                cases = int(fields[1])
                                controls = int(fields[4])
                                break
                            except ValueError:
                                logger.warning(
                                    f'Could not load number of cases or controls from {line}.'
                                )
                    elif line.endswith('samples were used in fitting the NULL glmm model and are found in sample file') or \
                            line.endswith('samples have been used to fit the glmm null model'):
                        # This is ahead of the case/control count line ("Analyzing ...") above so this should be ok
                        fields = line.split()
                        try:
                            cases = int(fields[0])
                        except ValueError:
                            logger.warning(
                                f'Could not load number of cases or controls from {line}.'
                            )
            return cases, controls
        except:
            pass
    return cases, controls
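A usage sketch, assuming per-chromosome logs named after the template shown in the docstring (path is hypothetical):

cases, controls = get_cases_and_controls_from_log(
    'gs://my-bucket/result_chr{chrom}_000000001.variant.log')
print(f'cases={cases}, controls={controls}')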
Example No. 20
def infer_ped(related_data: GnomADRelatedData) -> None:
    """

    Infers trios based on `pc_relate` kinship output.
    Writes a CSV containing one row per trio.
    If there are duplicate samples, each combination of duplicate samples will be present in the output.

    :param GnomADRelatedData related_data: Input data for inference
    :return: Nothing
    :rtype: None
    """

    logger.info(f"Inferring pedigree for {related_data.data_type}")
    sex = {row.s: row.is_female for row in related_data.meta_pd.itertuples()}
    dups_to_remove = {s for d in related_data.dups for s in list(d)[1:]}

    raw_ped = infer_families(related_data.kin_ht, sex, dups_to_remove)
    logger.info(
        f"Found {len(raw_ped.complete_trios())} complete trios in {related_data.data_type}."
    )

    # Create dataframe with all combinations of dups
    dup_trios_pd = get_dup_trios(raw_ped, related_data.sample_to_dups)
    logger.info(
        f"Found {len(dup_trios_pd)} trios combinations with dups in {related_data.data_type}."
    )
    with hl.hadoop_open(dup_pedigree_tsv_path(related_data.data_type),
                        'w') as out:
        dup_trios_pd.to_csv(out, sep="\t", index=False)
Example No. 21
def apply_mito_artifact_filter(
    mt: hl.MatrixTable,
    artifact_prone_sites_path: str,
) -> hl.MatrixTable:
    """Add back in artifact_prone_site filter

    :param hl.MatrixTable mt: MatrixTable to use an input
    :param str artifact_prone_sites_path: path to BED file of artifact_prone_sites to flag in the filters column

    :return: MatrixTable with artifact_prone_sites filter
    :rtype: hl.MatrixTable

    """

    # apply "artifact_prone_site" filter to any SNP or deletion that spans a known problematic site
    mt = mt.annotate_rows(
        position_range=hl.range(mt.locus.position, mt.locus.position +
                                hl.len(mt.alleles[0])))

    artifact_sites = []
    with hl.hadoop_open(artifact_prone_sites_path) as f:
        for line in f:
            pos = line.split()[2]
            artifact_sites.append(int(pos))
    sites = hl.literal(set(artifact_sites))

    mt = mt.annotate_rows(filters=hl.if_else(
        hl.len(hl.set(mt.position_range).intersection(sites)) > 0,
        {"artifact_prone_site"},
        {"PASS"},
    ))

    mt = mt.drop("position_range")

    return mt
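A usage sketch with hypothetical paths, assuming a mitochondrial MatrixTable and a BED file whose third column holds the artifact-prone positions:

mt = hl.read_matrix_table('gs://my-bucket/mito_callset.mt')
mt = apply_mito_artifact_filter(mt, 'gs://my-bucket/artifact_prone_sites.bed')
# Rows overlapping a flagged site now carry the "artifact_prone_site" filter.
n_flagged = mt.filter_rows(mt.filters.contains('artifact_prone_site')).count_rows()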
Example No. 22
    def load_id_file(path):
        ids = []
        with hl.hadoop_open(path) as f:
            for l in f:
                r = l.strip().split('\t')
                self.assertEqual(len(r), 2)
                ids.append(r[1])
        return ids
Example No. 23
def main(args):

    print("main")
    ht = hl.read_table(
        f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_table_for_RF_by_variant_type.ht'
    )

    if args.train_rf:

        run_hash = str(uuid.uuid4())[:8]
        rf_runs = get_rf_runs(f'{tmp_dir}/ddd-elgh-ukbb/')
        while run_hash in rf_runs:
            run_hash = str(uuid.uuid4())[:8]
        ht_result, rf_model = train_rf(ht, args)
        print("Writing out ht_training data")
        ht_result = ht_result.checkpoint(get_rf(data="training",
                                                run_hash=run_hash).path,
                                         overwrite=True)
        # f'{tmp_dir}/ddd-elgh-ukbb/Sanger_RF_training_data.ht', overwrite=True)
        rf_runs[run_hash] = get_run_data(
            vqsr_training=False,
            transmitted_singletons=True,
            test_intervals=args.test_intervals,
            adj=False,
            features_importance=hl.eval(ht_result.features_importance),
            test_results=hl.eval(ht_result.test_results),
        )

        with hl.hadoop_open(
                f'{plot_dir}/ddd-elgh-ukbb/variant_qc/rf_runs.json', "w") as f:
            json.dump(rf_runs, f)

        logger.info("Saving RF model")
        save_model(rf_model,
                   get_rf(data="model", run_hash=run_hash),
                   overwrite=True)
        # f'{tmp_dir}/ddd-elgh-ukbb/rf_model.model')
    else:
        run_hash = args.run_hash

    if args.apply_rf:

        logger.info(f"Applying RF model {run_hash}...")
        rf_model = load_model(get_rf(data="model", run_hash=run_hash))
        ht = get_rf(data="training", run_hash=run_hash).ht()
        features = hl.eval(ht.features)
        ht = apply_rf_model(ht, rf_model, features, label=LABEL_COL)

        logger.info("Finished applying RF model")
        ht = ht.annotate_globals(rf_hash=run_hash)
        ht = ht.checkpoint(
            get_rf("rf_result", run_hash=run_hash).path,
            overwrite=True,
        )

        ht_summary = ht.group_by("tp", "fp", TRAIN_COL, LABEL_COL,
                                 PREDICTION_COL).aggregate(n=hl.agg.count())
        ht_summary.show(n=20)
Example No. 24
def check_vcf_existence(participant_data: str, vcf_col: str, sample_map: str,
                        output_bucket: str) -> Dict[str, str]:
    """For each participant specified in sample_map, checks that the vcf file exists, and if so, add the sample and vcf path to a dictionary

    :param str participant_data: participant data (downloaded data tab from terra)
    :param str vcf_col: name of column that contains vcf output
    :param str sample_map: path to file of samples to subset (tab-delimited participant_id and sample)
    :param str output_bucket: path to bucket to which results should be written

    :return: dictionary of samples for which the vcf existence was confirmed (sample as key, path to vcf as value)
    :rtype: Dict[str, str]
    """

    # create file that will contain the samples with confirmed vcfs and their paths
    out_vcf = hl.hadoop_open(f"{output_bucket}/vcfs_to_combine.list", "w")

    # create participants_of_interest dictionary which will contain samples to which the results should be subset
    participants_of_interest = {}
    confirmed_vcfs = {}
    with hl.hadoop_open(sample_map, "r") as f:
        next(f)
        for line in f:
            line = line.rstrip()
            items = line.split("\t")
            participant, sample = items[0:2]
            participants_of_interest[participant] = 0

    # load in data from terra
    participant_info = hl.import_table(participant_data)
    df = participant_info.to_pandas()

    # check if the sample is in participants_of_interest, check that the vcf exists, and if yes to both, add to confirmed_vcfs dictionary
    for _, row in df.iterrows():
        participant_id = row["entity:participant_id"]
        sample = row["s"]
        vcf = row[vcf_col]

        if participant_id in participants_of_interest and vcf != "":
            if hl.hadoop_is_file(vcf):
                out_vcf.write(f"{sample}\t{vcf}\n")
                confirmed_vcfs[sample] = vcf

    out_vcf.close()

    return confirmed_vcfs
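A usage sketch with hypothetical inputs; participant_data is expected to be a Terra data-tab export containing 'entity:participant_id', 's', and the named VCF column:

confirmed_vcfs = check_vcf_existence(
    participant_data='gs://my-bucket/terra_participants.tsv',
    vcf_col='final_vcf',
    sample_map='gs://my-bucket/sample_map.tsv',
    output_bucket='gs://my-bucket/subset_run',
)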
Example No. 25
    def load_rel(ns, path):
        rel = np.zeros((ns, ns))
        with hl.hadoop_open(path) as f:
            for i, l in enumerate(f):
                for j, n in enumerate(map(float, l.strip().split('\t'))):
                    rel[i, j] = n
                self.assertEqual(j, i)
            self.assertEqual(i, ns - 1)
        return rel
Example No. 26
    def run(self):
        mt = self.import_mt()
        row_table = SeqrVariantsAndGenotypesSchema.elasticsearch_row(mt)
        self.export_table_to_elasticsearch(row_table, self._mt_num_shards(mt))

        with hl.hadoop_open(self.completed_marker_path, "w") as f:
            f.write(".")

        self.cleanup()
Example No. 27
def plot_correlation_matrices(chr_list):
    """
    Plot combined correlation matrices for genotype-correlation and 
    sumstats-correlation matrices
    """
    for ch in chr_list:
        ss_ch = BlockMatrix.read('gs://nbaya/sumstats_corr/' + variant_set +
                                 '_ss_correlation_chr{}.bm/'.format(ch))
        gt_ch = BlockMatrix.read('gs://nbaya/sumstats_corr/' + variant_set +
                                 '_gt_correlation_chr{}.bm/'.format(ch))
        M_max = int(1e4)  # max number of variants to be taken from the block matrices (suggested: 2e4)
        M = ss_ch.shape[0]  # dimension of block matrix
        # index of which disjoint window we are looking at in the block matrix
        for idx in range(int(M / M_max) + 1):
            M0 = M_max * idx  # start variant index for block matrix filtering
            M1 = min(M_max * (idx + 1), M)  # stop variant index for block matrix filtering
            ss_np = ss_ch[M0:M1, M0:M1].to_numpy()
            gt_np = gt_ch[M0:M1, M0:M1].to_numpy()
            print('\nStarting variant window: [' + str(M0) + ',' + str(M1) +
                  ']')
            w = int(5e3)  # window width of variants for correlation matrix (suggested: 2e3)
            for i in range(int((M1 - M0 - 1) / w) + 1):
                w0 = w * i  # start variant index for window of correlation matrix
                w1 = min(w * (i + 1), M1 - M0)  # stop variant index for window of correlation matrix
                full = (ss_np[w0:w1, w0:w1] + gt_np[w0:w1, w0:w1].T)
                np.fill_diagonal(full, 1)
                fig, ax = plt.subplots()
                ax.imshow(full, cmap='bwr')
                ax.plot([0, w], [0, w], 'k--', alpha=0.5, lw=2)
                plt.xlim([0, w])
                plt.ylim([w, 0])
                ax.text(w * 0.83, w * 0.1, "SS", fontsize=60, alpha=0.5)
                ax.text(w * 0.02, w * 0.97, "GT", fontsize=60, alpha=0.5)
                plt.title('chr' + str(ch) + ' ' + variant_set + ' variants (' +
                          str(M0 + w0) + '-' + str(M0 + w1) + ')')
                fig = plt.gcf()
                fig.set_size_inches(10, 10)
                path = ('gs://nbaya/sumstats_corr/plots/chr' + str(ch) + '_' +
                        variant_set + '_' + str(M0 + w0).zfill(len(str(M))) +
                        '-' + str(M0 + w1).zfill(len(str(M))) + '.png')
                with hl.hadoop_open(path, 'wb') as f:
                    fig.savefig(f, dpi=600)
                plt.close()
            print('\nFinished variant window: [' + str(M0) + ',' + str(M1) +
                  ']')
Example No. 28
def get_sample_names_from_list_of_files(input_files, output_fname):
    sample_names_dict = hl.grep('#CHROM',
                                input_files,
                                max_count=100000,
                                show=False)
    sample_names = []
    for fname, lines in sample_names_dict.items():
        sample_names.append('\t'.join(
            [lines[0].strip().split('\t')[-1], fname]))
    with hl.hadoop_open(output_fname, 'w') as f:
        f.write('\n'.join(sample_names))
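A usage sketch with hypothetical single-sample VCF paths; the last column of each file's #CHROM header line is taken as the sample name:

get_sample_names_from_list_of_files(
    ['gs://my-bucket/sampleA.g.vcf.gz', 'gs://my-bucket/sampleB.g.vcf.gz'],
    'gs://my-bucket/sample_name_map.tsv',
)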
Example No. 29
def parse_sample_mapping(sample_map_path: str) -> Tuple[List[str], List[str]]:
    sample_names: List[str] = list()
    sample_paths: List[str] = list()

    with hl.hadoop_open(sample_map_path) as f:
        for line in f:
            [name, path] = line.strip().split('\t')
            sample_names.append(name)
            sample_paths.append(path)

    return sample_names, sample_paths
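A usage sketch; the sample map is assumed to be a two-column, tab-delimited file of sample name and GVCF path:

sample_names, sample_paths = parse_sample_mapping('gs://my-bucket/sample_map.tsv')
# The two lists are index-aligned, e.g. for run_combiner's input paths and sample_names.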
Example No. 30
def get_inverse_normalize_status(null_glmm_log):
    status = 'Unknown'
    with hl.hadoop_open(null_glmm_log) as f:
        for line in f:
            if line.startswith('$invNormalize'):
                try:
                    status = f.readline().strip().split()[1]
                except:
                    logger.warning(
                        f'Could not load inv_norm status from {line} in {null_glmm_log}.'
                    )
    return status.capitalize()
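A usage sketch, assuming a hypothetical null-GLMM fitting log; the token on the line after '$invNormalize' is returned, capitalised:

inv_norm = get_inverse_normalize_status('gs://my-bucket/null_glmm_chr1.log')
print(inv_norm)  # e.g. 'True', 'False', or 'Unknown'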
Example No. 31
    def read(cls, path):
        """Load reference genome from a JSON file.

        Notes
        -----

        The JSON file must have the following format:

        .. code-block:: text

            {"name": "my_reference_genome",
             "contigs": [{"name": "1", "length": 10000000},
                         {"name": "2", "length": 20000000},
                         {"name": "X", "length": 19856300},
                         {"name": "Y", "length": 78140000},
                         {"name": "MT", "length": 532}],
             "xContigs": ["X"],
             "yContigs": ["Y"],
             "mtContigs": ["MT"],
             "par": [{"start": {"contig": "X","position": 60001},"end": {"contig": "X","position": 2699521}},
                     {"start": {"contig": "Y","position": 10001},"end": {"contig": "Y","position": 2649521}}]
            }


        `name` must be unique and not overlap with Hail's pre-instantiated
        references: ``'GRCh37'``, ``'GRCh38'``, ``'GRCm38'``, and ``'default'``.
        The contig names in `xContigs`, `yContigs`, and `mtContigs` must be
        present in `contigs`. The intervals listed in `par` must have contigs in
        either `xContigs` or `yContigs` and must have positions between 0 and
        the contig length given in `contigs`.

        Parameters
        ----------
        path : :obj:`str`
            Path to JSON file.

        Returns
        -------
        :class:`.ReferenceGenome`
        """
        with hl.hadoop_open(path) as f:
            return ReferenceGenome._from_config(json.load(f))
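A usage sketch following the JSON format described in the docstring (paths are hypothetical):

my_rg = hl.ReferenceGenome.read('gs://my-bucket/my_reference_genome.json')
mt = hl.import_vcf('gs://my-bucket/data.vcf.bgz', reference_genome=my_rg)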
Example No. 32
def load_dataset(name,
                 version,
                 reference_genome,
                 config_file='gs://hail-datasets/datasets.json'):
    """Load a genetic dataset from Hail's repository.

    Example
    -------

    >>> # Load 1000 Genomes MatrixTable with GRCh38 coordinates
    >>> mt_1kg = hl.experimental.load_dataset(name='1000_genomes',   # doctest: +SKIP
    ...                                       version='phase3',
    ...                                       reference_genome='GRCh38')

    Parameters
    ----------
    name : :obj:`str`
        Name of the dataset to load.
    version : :obj:`str`
        Version of the named dataset to load
        (see available versions in documentation).
    reference_genome : `GRCh37` or `GRCh38`
        Reference genome build.

    Returns
    -------
    :class:`.Table` or :class:`.MatrixTable`"""

    with hl.hadoop_open(config_file, 'r') as f:
        datasets = json.load(f)

    names = set([dataset['name'] for dataset in datasets])
    if name not in names:
        raise ValueError('{} is not a dataset available in the repository.'.format(repr(name)))

    versions = set([dataset['version'] for dataset in datasets if dataset['name']==name])
    if version not in versions:
        raise ValueError("""Version {0} not available for dataset {1}.
                            Available versions: {{{2}}}.""".format(repr(version), 
                                                                   repr(name),
                                                                   repr('","'.join(versions))))

    reference_genomes = set([dataset['reference_genome'] for dataset in datasets if dataset['name']==name])
    if reference_genome not in reference_genomes:
        raise ValueError("""Reference genome build {0} not available for dataset {1}.
                            Available reference genome builds: {{'{2}'}}.""".format(repr(reference_genome),
                                                                                    repr(name), 
                                                                                    '\',\''.join((reference_genomes))))

    path = [dataset['path'] for dataset in datasets if all([dataset['name']==name,
                                                            dataset['version']==version,
                                                            dataset['reference_genome']==reference_genome])][0].strip('/')

    if path.endswith('.ht'):
        dataset = hl.read_table(path)
    else:
        if not path.endswith('.mt'):
            raise ValueError('Invalid path {}: can only load datasets with .ht or .mt extensions.'.format(repr(path)))
        dataset = hl.read_matrix_table(path)

    return dataset