Example #1
import json

import elasticsearch
import hail as hl
from elasticsearch import helpers


def export_table_to_elasticsearch(table,
                                  host,
                                  index_name,
                                  block_size=5000,
                                  id_field=None,
                                  mapping=None,
                                  num_shards=10,
                                  port=9200,
                                  verbose=True):
    es_client = elasticsearch.Elasticsearch(host, port=port)

    if not mapping:
        mapping = elasticsearch_mapping_for_table(table)

    # Delete the index before creating it
    if es_client.indices.exists(index=index_name):
        es_client.indices.delete(index=index_name)

    mapping["_meta"] = dict(hl.eval(table.globals))

    # https://www.elastic.co/guide/en/elasticsearch/reference/current/index-modules.html#index-modules-settings
    request_body = {
        "mappings": mapping,
        "settings": {
            "index.codec": "best_compression",
            "index.mapping.total_fields.limit": 10000,
            "index.number_of_replicas": 0,
            "index.number_of_shards": num_shards,
            "index.refresh_interval": -1,
        },
    }

    es_client.indices.create(index=index_name, body=request_body)

    temp_file = "table-tmp.json.txt"
    table = table.key_by()
    table.select(json=hl.json(table.row_value)).export(temp_file, header=False)

    buffer = []
    with open(temp_file) as f:
        for line in f:
            data = json.loads(line)
            buffer.append(data)

            if len(buffer) >= block_size:
                helpers.bulk(es_client,
                             build_bulk_request(buffer, index_name, id_field))
                buffer = []

    if buffer:
        helpers.bulk(es_client, build_bulk_request(buffer, index_name,
                                                   id_field))
        buffer = []

    es_client.indices.forcemerge(index=index_name)
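Example #1 depends on two project helpers that are not shown, elasticsearch_mapping_for_table and build_bulk_request. A minimal sketch of what the latter could look like, assuming each buffered document becomes one bulk "index" action and that id_field, when given, names the field to use as the document _id; the helper name and behavior here are assumptions, not part of the example above.

def build_bulk_request(documents, index_name, id_field=None):
    # Yield one bulk "index" action per buffered document, in the dict form
    # accepted by elasticsearch.helpers.bulk.
    for doc in documents:
        action = {
            "_index": index_name,
            "_source": doc,
        }
        # If an id field was requested, use its value as the document id.
        if id_field is not None:
            action["_id"] = doc[id_field]
        yield action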
Example #2
    def test(self):
        schema = hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tint32, d=hl.tint32, e=hl.tstr,
                            f=hl.tarray(hl.tint32),
                            g=hl.tarray(
                                hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr)),
                            h=hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tstr),
                            i=hl.tbool,
                            j=hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr))

        rows = [{'a': 4, 'b': 1, 'c': 3, 'd': 5,
                 'e': "hello", 'f': [1, 2, 3],
                 'g': [hl.Struct(x=1, y=5, z='banana')],
                 'h': hl.Struct(a=5, b=3, c='winter'),
                 'i': True,
                 'j': hl.Struct(x=3, y=2, z='summer')}]

        kt = hl.Table.parallelize(rows, schema)

        result = convert_struct_to_dict(kt.annotate(
            chisq=hl.chisq(kt.a, kt.b, kt.c, kt.d),
            ctt=hl.ctt(kt.a, kt.b, kt.c, kt.d, 5),
            dict=hl.dict(hl.zip([kt.a, kt.b], [kt.c, kt.d])),
            dpois=hl.dpois(4, kt.a),
            drop=kt.h.drop('b', 'c'),
            exp=hl.exp(kt.c),
            fet=hl.fisher_exact_test(kt.a, kt.b, kt.c, kt.d),
            hwe=hl.hardy_weinberg_p(1, 2, 1),
            index=hl.index(kt.g, 'z'),
            is_defined=hl.is_defined(kt.i),
            is_missing=hl.is_missing(kt.i),
            is_nan=hl.is_nan(hl.float64(kt.a)),
            json=hl.json(kt.g),
            log=hl.log(kt.a, kt.b),
            log10=hl.log10(kt.c),
            or_else=hl.or_else(kt.a, 5),
            or_missing=hl.or_missing(kt.i, kt.j),
            pchisqtail=hl.pchisqtail(kt.a, kt.b),
            pcoin=hl.rand_bool(0.5),
            pnorm=hl.pnorm(0.2),
            pow=2.0 ** kt.b,
            ppois=hl.ppois(kt.a, kt.b),
            qchisqtail=hl.qchisqtail(kt.a, kt.b),
            range=hl.range(0, 5, kt.b),
            rnorm=hl.rand_norm(0.0, kt.b),
            rpois=hl.rand_pois(kt.a),
            runif=hl.rand_unif(kt.b, kt.a),
            select=kt.h.select('c', 'b'),
            sqrt=hl.sqrt(kt.a),
            to_str=[hl.str(5), hl.str(kt.a), hl.str(kt.g)],
            where=hl.cond(kt.i, 5, 10)
        ).take(1)[0])
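The test wraps its result in convert_struct_to_dict, which is not defined here. A plausible sketch, assuming the helper simply recurses through the result and turns every hl.Struct into a plain Python dict (hl.Struct behaves like a Mapping, so items() is available); the exact helper used by the test suite may differ.

import hail as hl


def convert_struct_to_dict(value):
    # Recursively convert hl.Struct values into plain dicts so the result can
    # be compared against ordinary Python data structures.
    if isinstance(value, hl.Struct):
        return {k: convert_struct_to_dict(v) for k, v in value.items()}
    if isinstance(value, dict):
        return {k: convert_struct_to_dict(v) for k, v in value.items()}
    if isinstance(value, (list, tuple)):
        return type(value)(convert_struct_to_dict(v) for v in value)
    return value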
Example #3
def main(args):
    hl.init(default_reference='GRCh38', log='/variant_histograms.log')

    ht = hl.read_table(release_ht_path())
    # NOTE: histogram aggregations are done on the entire callset (not just PASS variants), on raw data

    # hist_dict aliases ANNOTATIONS_HISTS, so this MQ override is picked up by get_annotations_hists below
    hist_dict = ANNOTATIONS_HISTS
    # Boundaries changed for v3, but it could be a good idea to settle on a standard
    hist_dict['MQ'] = (20, 60, 40)
    hist_ranges_expr = get_annotations_hists(ht, ANNOTATIONS_HISTS)

    # NOTE: run the following code in a first pass to determine bounds for metrics
    # Evaluate minimum and maximum values for each metric of interest
    # This doesn't need to be run unless the defaults do not result in nice-looking histograms.
    if args.first_pass:
        minmax_dict = {}
        for metric in hist_ranges_expr.keys():
            minmax_dict[metric] = hl.struct(min=hl.agg.min(ht[metric]),
                                            max=hl.if_else(
                                                hl.agg.max(ht[metric]) < 1e10,
                                                hl.agg.max(ht[metric]), 1e10))
        minmax = ht.aggregate(hl.struct(**minmax_dict))
        print(minmax)
    else:
        # Aggregate hists over hand-tooled ranges
        hists = ht.aggregate(
            hl.array([
                hist_expr.annotate(metric=hist_metric)
                for hist_metric, hist_expr in hist_ranges_expr.items()
            ]).extend(
                hl.array(
                    hl.agg.group_by(
                        create_frequency_bins_expr(AC=ht.freq[1].AC, AF=ht.freq[1].AF),
                        hl.agg.hist(hl.log10(ht.info.QUALapprox), 1, 10, 36),
                    )
                ).map(lambda x: x[1].annotate(metric=x[0]))
            ),
            _localize=False,
        )

        with hl.hadoop_open(qual_hists_json_path(CURRENT_RELEASE), 'w') as f:
            f.write(hl.eval(hl.json(hists)))
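Examples #3 and #4 both call get_annotations_hists to turn the (start, end, n_bins) bounds in ANNOTATIONS_HISTS into histogram aggregators. A rough sketch of that idea, assuming each entry maps a row field of ht to its histogram bounds and following Example #3's ht[metric] lookup (Example #4 reads the metrics from ht.info instead, and the real gnomAD helper handles more cases than shown here).

import hail as hl


def get_annotations_hists(ht, annotations_hists, log10_annotations=()):
    # Build one hl.agg.hist aggregator per metric using that metric's
    # (start, end, n_bins) bounds; metrics listed in log10_annotations are
    # histogrammed on a log10 scale.
    hists = {}
    for metric, (start, end, n_bins) in annotations_hists.items():
        expr = ht[metric]
        if metric in log10_annotations:
            expr = hl.log10(expr)
        hists[metric] = hl.agg.hist(expr, start, end, n_bins)
    return hists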
Example #4
def main(args):
    hl.init(default_reference="GRCh38", log="/variant_histograms.log")

    logger.info("Loading ANNOTATIONS_HISTS dictionary...")
    if not file_exists(annotation_hists_path()):
        raise DataException(
            "Annotation hists JSON file not found. Need to create this JSON before running script!"
        )

    with hl.hadoop_open(annotation_hists_path()) as a:
        ANNOTATIONS_HISTS = json.loads(a.read())

    # NOTE: histogram aggregations on these metrics are done on the entire callset (not just PASS variants), on raw data
    ht = hl.read_table(release_ht_path(public=False))
    ht = ht.select(freq=ht.freq, info=ht.info.select(*ANNOTATIONS_HISTS))

    inbreeding_bin_ranges = ANNOTATIONS_HISTS["InbreedingCoeff"]

    # Remove InbreedingCoeff from ANNOTATIONS_HISTS. It requires different ranges by allele frequency and needs to be
    # handled differently. It is stored as a dictionary in annotation_hists_path
    del ANNOTATIONS_HISTS["InbreedingCoeff"]

    logger.info("Getting info annotation histograms...")
    hist_ranges_expr = get_annotations_hists(ht, ANNOTATIONS_HISTS, LOG10_ANNOTATIONS)

    # Evaluate minimum and maximum values for each metric of interest to help determine the bounds of the hists
    # NOTE: Run this first, then update values in annotation_hists_path JSON as necessary
    if args.determine_bounds:
        logger.info(
            "Evaluating minimum and maximum values for each metric of interest. Maximum values capped at 1e10."
        )
        minmax_dict = {}
        for metric in ANNOTATIONS_HISTS:
            minmax_dict[metric] = hl.struct(
                min=hl.agg.min(ht.info[metric]),
                max=hl.if_else(
                    hl.agg.max(ht.info[metric]) < 1e10,
                    hl.agg.max(ht.info[metric]),
                    1e10,
                ),
            )
        minmax = ht.aggregate(hl.struct(**minmax_dict))
        logger.info(f"Metrics bounds: {minmax}")
    else:
        logger.info(
            "Aggregating hists over ranges defined in the annotation_hists_path JSON file. --determine_bounds can "
            "be used to help define these ranges..."
        )
        hists = ht.aggregate(
            hl.array(
                [
                    hist_expr.annotate(metric=hist_metric)
                    for hist_metric, hist_expr in hist_ranges_expr.items()
                ]
            )
            .extend(
                hl.array(
                    hl.agg.group_by(
                        create_frequency_bins_expr(AC=ht.freq[1].AC, AF=ht.freq[1].AF),
                        hl.agg.hist(
                            hl.log10(ht.info.QUALapprox),
                            *ANNOTATIONS_HISTS["QUALapprox"],
                        ),
                    )
                ).map(lambda x: x[1].annotate(metric="QUALapprox-" + x[0]))
            )
            .extend(
                hl.array(
                    hl.agg.group_by(
                        create_frequency_bins_expr(AC=ht.freq[1].AC, AF=ht.freq[1].AF),
                        hl.agg.hist(
                            hl.log10(ht.info.AS_QUALapprox),
                            *ANNOTATIONS_HISTS["AS_QUALapprox"],
                        ),
                    )
                ).map(lambda x: x[1].annotate(metric="AS_QUALapprox-" + x[0]))
            ),
            _localize=False,
        )

        # Define hist ranges and bins separately for each allele frequency group, since the groups need different ranges
        ht = ht.annotate(af_bin=create_frequency_bins_expr_inbreeding(AF=ht.freq[1].AF))
        inbreeding_hists = [
            ht.aggregate(
                hl.agg.filter(
                    ht.af_bin == x,
                    hl.agg.hist(ht.info.InbreedingCoeff, *inbreeding_bin_ranges[x]),
                )
            ).annotate(metric="InbreedingCoeff" + "-" + x)
            for x in inbreeding_bin_ranges
        ]

        hists = hl.eval(hl.json(hists))
        inbreeding_hists = hl.eval(hl.json(inbreeding_hists))

        # Note: The following removes the closing ']' from the JSON array stored in hists and the opening '[' from the
        # JSON array stored in inbreeding_hists, then joins them so they can be written out as a single JSON array
        hists = hists[:-1] + "," + inbreeding_hists[1:]

        logger.info("Writing output")
        with hl.hadoop_open(qual_hists_json_path(), "w") as f:
            f.write(hists)
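Both histogram scripts group the QUALapprox histograms by create_frequency_bins_expr(AC=..., AF=...), which is defined elsewhere. A simplified sketch of such a binning expression, using hl.case to return a label per allele-frequency bin; the bin boundaries and label strings below are illustrative only, not the ones used by gnomAD.

import hail as hl


def create_frequency_bins_expr(AC, AF):
    # Return a string label describing which allele-frequency bin a variant
    # falls into; the boundaries and labels here are only illustrative.
    return (
        hl.case()
        .when(AC == 1, "binned_singleton")
        .when(AC == 2, "binned_doubleton")
        .when(AF < 1e-4, "binned_0.00001_0.0001")
        .when(AF < 1e-3, "binned_0.0001_0.001")
        .when(AF < 1e-2, "binned_0.001_0.01")
        .when(AF < 1e-1, "binned_0.01_0.1")
        .default("binned_0.1_1")
    )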
Example #5
def write_data_files(table_path, output_directory, genes=None):
    if output_directory.startswith("gs://"):
        raise Exception("Cannot write output to Google Storage")

    ds = hl.read_table(table_path)

    os.makedirs(output_directory, exist_ok=True)

    with open(f"{output_directory}/metadata.json", "w") as output_file:
        output_file.write(hl.eval(hl.json(ds.globals.meta)))

    gene_search_terms = ds.select(data=hl.json(hl.tuple([ds.gene_id, ds.search_terms])))
    gene_search_terms.key_by().select("data").export(f"{output_directory}/gene_search_terms.json.txt", header=False)
    os.remove(f"{output_directory}/.gene_search_terms.json.txt.crc")

    ds = ds.drop("previous_symbols", "alias_symbols", "search_terms")

    os.makedirs(f"{output_directory}/results", exist_ok=True)
    for dataset in ds.globals.meta.datasets.dtype.fields:
        reference_genome = "GRCh38" if dataset == "bipex" else "GRCh37"
        gene_results = ds.filter(hl.is_defined(ds.gene_results[dataset]))
        gene_results = gene_results.select(
            result=hl.tuple(
                [
                    gene_results.gene_id,
                    gene_results.symbol,
                    gene_results.name,
                    gene_results[reference_genome].chrom,
                    (gene_results[reference_genome].start + gene_results[reference_genome].stop) // 2,
                    gene_results.gene_results[dataset].group_results,
                ]
            )
        )
        gene_results = gene_results.collect()

        gene_results = [r.result for r in gene_results]

        with open(f"{output_directory}/results/{dataset.lower()}.json", "w") as output_file:
            output_file.write(json.dumps({"results": gene_results}, cls=ResultEncoder))

    if genes:
        ds = ds.filter(hl.set(genes).contains(ds.gene_id))

    temp_file_name = "temp.tsv"
    n_rows = ds.count()
    ds.select(data=hl.json(ds.row)).export(f"{output_directory}/{temp_file_name}", header=False)

    csv.field_size_limit(sys.maxsize)
    os.makedirs(f"{output_directory}/genes", exist_ok=True)

    with multiprocessing.get_context("spawn").Pool() as pool:
        with open(f"{output_directory}/{temp_file_name}") as data_file:

            reader = csv.reader(data_file, delimiter="\t")
            for gene_id, gene_grch37, gene_grch38, all_variants in tqdm(pool.imap(split_data, reader), total=n_rows):
                num = int(gene_id.lstrip("ENSGR"))
                gene_dir = f"{output_directory}/genes/{str(num % 1000).zfill(3)}"
                os.makedirs(gene_dir, exist_ok=True)

                if gene_grch37:
                    with open(f"{gene_dir}/{gene_id}_GRCh37.json", "w") as out_file:
                        out_file.write(gene_grch37)

                if gene_grch38:
                    with open(f"{gene_dir}/{gene_id}_GRCh38.json", "w") as out_file:
                        out_file.write(gene_grch38)

                for dataset, dataset_variants in all_variants.items():
                    if dataset_variants:
                        with open(f"{gene_dir}/{gene_id}_{dataset.lower()}_variants.json", "w") as out_file:
                            out_file.write(dataset_variants)

    os.remove(f"{output_directory}/{temp_file_name}")
    os.remove(f"{output_directory}/.{temp_file_name}.crc")