def route_index_to_temp_es_cluster(yes, args):
    """Apply shard allocation filtering rules for the given index to elasticsearch data nodes with *loading* in their name:

    If yes is True, route new documents in the given index only to nodes named "*loading*".
    Otherwise, move any shards in this index off of nodes named "*loading*"

    Args:
        yes (bool): whether to route shards in the given index to the "*loading*" nodes, or move shards off of these nodes.
        args: args from ArgumentParser - used to compute the index name and get elasticsearch host and port.
    """
    if yes:
        require_name = "es-data-loading*"
        exclude_name = ""
    else:
        require_name = ""
        exclude_name = "es-data-loading*"

    body = {
        "index.routing.allocation.require._name": require_name,
        "index.routing.allocation.exclude._name": exclude_name
    }

    logger.info("==> Setting {}* settings = {}".format(args.index, body))

    index_arg = "{}*".format(args.index)
    client = ElasticsearchClient(args.host, args.port)
    client.es.indices.put_settings(index=index_arg, body=body)

    if not yes:
        wait_for_loading_shards_transfer(client, index=index_arg)
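
# For reference, a minimal sketch of the direct REST call that the put_settings() call above issues
# via ElasticsearchClient. This is not part of the original pipeline: it assumes the `requests`
# package, a cluster reachable over plain HTTP, and placeholder host/port/index values.
def _put_loading_allocation_settings_sketch(host="localhost", port="9200", index_prefix="my_dataset"):
    import requests

    allocation_settings = {
        # Only allocate shards of the matching indices to nodes named "es-data-loading*".
        "index.routing.allocation.require._name": "es-data-loading*",
        # Clear any previous exclusion rule.
        "index.routing.allocation.exclude._name": "",
    }
    response = requests.put(
        "http://{}:{}/{}*/_settings".format(host, port, index_prefix),
        json=allocation_settings,
    )
    response.raise_for_status()
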
Example #2
def update_all_datasets(hc, args):
    client = ElasticsearchClient(args.host, port=args.port)
    indices = client.es.cat.indices(h="index", s="index").strip().split("\n")
    for i, index_name in enumerate(indices):
        _meta = client.get_index_meta(index_name)

        logger.info("==> updating index {} out of {}: {}".format(
            i + 1, len(indices), index_name))
        if _meta and "sourceFilePath" in _meta:
            try:
                update_dataset(hc, index_name, args)
            except Exception as e:
                logger.error("ERROR while updating %s - %s: %s", index_name,
                             _meta["sourceFilePath"], e)
        else:
            logger.info(
                "==> skipping {} because index _meta['sourceFilePath'] isn't set: {}"
                .format(index_name, _meta))
def update_operations_log(args):
    if args.dont_update_operations_log:
        return

    logger.info("==> update operations log")
    client = ElasticsearchClient(args.host, args.port)
    client.save_index_operation_metadata(
        args.input_dataset,
        args.index,
        args.genome_version,
        fam_file=args.fam_file,
        remap_sample_ids=args.remap_sample_ids,
        subset_samples=args.subset_samples,
        skip_vep=args.skip_vep,
        project_id=args.project_guid,
        dataset_type=args.dataset_type,
        sample_type=args.sample_type,
        command=" ".join(sys.argv),
        directory=args.directory,
        username=args.username,
        operation="create_index",
        status="success",
    )
Example #4
def update_dataset(hc, index_name, args):

    elasticsearch_client = ElasticsearchClient(args.host, args.port)
    _meta = elasticsearch_client.get_index_meta(index_name)
    if not args.dataset_path and (not _meta or "sourceFilePath" not in _meta):
        logger.error(
            "Couldn't update reference data in {} because it doesn't have a recorded sourceFilePath. Please use "
            "--index-name, --dataset-path, and --genome-version to update this index."
            .format(index_name))
        return

    dataset_path = args.dataset_path or _meta["sourceFilePath"]
    genome_version = args.genome_version or _meta.get("genomeVersion")

    if genome_version is None:
        match = re.search("__grch([0-9]+)__", index_name, re.IGNORECASE)
        if not match:
            logger.error(
                "Couldn't update reference data in {} because the genome version wasn't found in _meta ({}) or in the index name."
                .format(index_name, _meta))
            return
        genome_version = match.group(1)

    vds = read_in_dataset(hc, dataset_path)
    vds = vds.drop_samples()
    vds = compute_minimal_schema(vds)
    vds = vds.annotate_global_expr(
        'global.genomeVersion = "{}"'.format(genome_version))

    # add reference data to vds
    filter_expr = []
    if args.update_primate_ai:
        vds = add_primate_ai_to_vds(hc,
                                    vds,
                                    genome_version,
                                    root="va.primate_ai")
        filter_expr.append("isDefined(va.primate_ai.score)")

    if args.update_splice_ai:
        vds = add_splice_ai_to_vds(hc,
                                   vds,
                                   genome_version,
                                   root="va.splice_ai")
        filter_expr.append("isDefined(va.splice_ai.delta_score)")

    if args.update_clinvar:
        #vds = reset_clinvar_fields_in_vds(hc, vds, genome_version, root="va.clinvar", subset=filter_interval)
        vds = add_clinvar_to_vds(hc, vds, genome_version, root="va.clinvar")
        filter_expr.append("isDefined(va.clinvar.allele_id)")

    if args.update_hgmd:
        #vds = reset_hgmd_fields_in_vds(hc, vds, genome_version, root="va.hgmd", subset=filter_interval)
        vds = add_hgmd_to_vds(hc, vds, genome_version, root="va.hgmd")
        filter_expr.append("isDefined(va.hgmd.accession)")

    # filter down to variants that have reference data

    vds = vds.filter_variants_expr(" || ".join(filter_expr), keep=True)

    print("\n\n==> schema: ")
    pprint(vds.variant_schema)

    _, variant_count = vds.count()
    logger.info(
        "\n==> exporting {} variants to elasticsearch:".format(variant_count))
    elasticsearch_client.export_vds_to_elasticsearch(
        vds,
        index_name=index_name,
        index_type_name="variant",
        block_size=args.block_size,
        elasticsearch_write_operation=ELASTICSEARCH_UPDATE,
        elasticsearch_mapping_id="docId",
        is_split_vds=True,
        verbose=False,
        delete_index_before_exporting=False,
        ignore_elasticsearch_write_errors=False,
        export_globals_to_index_meta=True,
    )
                     missing="NA",
                     quote='"',
                     types={
                         'gene_name': TString(),
                         'description': TString(),
                         'gene_id': TString(),
                         'xcase_lof': TInt(),
                         'xctrl_lof': TInt(),
                         'pval_lof': TDouble(),
                         'xcase_mpc': TInt(),
                         'xctrl_mpc': TInt(),
                         'pval_mpc': TDouble(),
                         'xcase_infrIndel': TInt(),
                         'xctrl_infrIndel': TInt(),
                         'pval_infrIndel': TDouble(),
                         'pval_meta': TDouble(),
                         'analysis_group': TString(),
                     })

es = ElasticsearchClient(args.host, args.port)

es.export_kt_to_elasticsearch(
    kt,
    index_name="epi_exome_gene_results_181107",
    index_type_name="result",
    block_size=args.block_size,
    num_shards=args.num_shards,
    delete_index_before_exporting=True,
    verbose=True,
)
Example #6
p = argparse.ArgumentParser()  # NOTE: the parser setup was truncated in the source and is reconstructed here
p.add_argument(
    "--num-temp-loading-nodes",  # hypothetical flag name, inferred from args.num_temp_loading_nodes used below
    type=int,
    help="For use with --num-temp-loading-nodes. Number of temp loading nodes to create.",
    default=3)
p.add_argument("--host",
               help="Elastisearch host",
               default=os.environ.get("ELASTICSEARCH_SERVICE_HOSTNAME",
                                      "localhost"))
p.add_argument("--port", help="Elastisearch port", default="9200")
p.add_argument(
    "--k8s-cluster-name",
    help="Specifies the kubernetes cluster name that hosts elasticsearch.",
    required=True)
args = p.parse_args()

client = ElasticsearchClient(args.host, args.port)
wait_for_loading_shards_transfer(client, num_attempts=1)

settings = _get_es_node_settings(args.k8s_cluster_name,
                                 args.num_temp_loading_nodes)
_set_k8s_context(settings)

_process_kubernetes_configs(
    "delete",
    settings=settings,
    config_paths=[
        "./kubernetes/elasticsearch-sharded/es-data-stateless-local-ssd.yaml",
    ])
_wait_for_data_nodes_state("delete", settings)

run("echo Y | gcloud container node-pools delete --cluster {} loading-cluster".
for group in analysis_groups:
    group_results = variant_results.filter('analysis_group == "%s"' % group).drop("analysis_group")
    group_results = group_results.annotate(
        "%s = { %s }" % (group, ", ".join(["%s: %s" % (col, col) for col in result_columns]))
    ).select(["v", group])
    variants = variants.join(group_results.key_by("v"))

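# Collect the per-group structs into a single "groups" struct field, then drop the individual
# per-group fields.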
variants = variants.annotate("groups = { %s }" % ", ".join(["%s:%s" % (group, group) for group in analysis_groups]))
variants = variants.drop(list(analysis_groups))

variants = variants.annotate("v = Variant(v)")
variants = variants.annotate("variant_id = %s" % get_expr_for_variant_id())
variants = variants.annotate("chrom = %s" % get_expr_for_contig())
variants = variants.annotate("pos = %s" % get_expr_for_start_pos())
variants = variants.annotate("xpos = %s" % get_expr_for_xpos())
variants = variants.drop(["v"])

pprint.pprint(variants.schema)

es = ElasticsearchClient(args.host, args.port)

es.export_kt_to_elasticsearch(
    variants,
    index_name=args.index,
    index_type_name="variant",
    block_size=args.block_size,
    num_shards=args.num_shards,
    delete_index_before_exporting=True,
    verbose=True,
)
def export_to_elasticsearch(
    vds,
    args,
    operation=ELASTICSEARCH_INDEX,
    delete_index_before_exporting=False,
    export_genotypes=True,
    disable_doc_values_for_fields=(),
    disable_index_for_fields=(),
    run_after_index_exists=None,
    force_merge=False,
):
    """Utility method for exporting the given vds to an elasticsearch index."""

    start_with_sample_group = args.start_with_sample_group if args.start_with_step == 0 else 0

    if not export_genotypes:
        genotype_fields_to_export = []
        genotype_field_to_elasticsearch_type_map = {}
    elif args.dataset_type == "VARIANTS":
        genotype_fields_to_export = VARIANT_GENOTYPE_FIELDS_TO_EXPORT
        genotype_field_to_elasticsearch_type_map = VARIANT_GENOTYPE_FIELD_TO_ELASTICSEARCH_TYPE_MAP
    elif args.dataset_type == "SV":
        genotype_fields_to_export = SV_GENOTYPE_FIELDS_TO_EXPORT
        genotype_field_to_elasticsearch_type_map = SV_GENOTYPE_FIELD_TO_ELASTICSEARCH_TYPE_MAP
    else:
        raise ValueError("Unexpected args.dataset_type: %s" %
                         args.dataset_type)

    vds = vds.persist()

    sample_groups = compute_sample_groups(vds, args)
    client = ElasticsearchClient(args.host, args.port)
    for i, sample_group in enumerate(sample_groups):

        if i < start_with_sample_group:
            continue

        #if delete_index_before_exporting and i < 4:
        #    continue

        if len(sample_groups) > 1:
            vds_sample_subset = vds.filter_samples_list(sample_group,
                                                        keep=True)
            current_index_name = "%s_%s" % (args.index, i)
        else:
            vds_sample_subset = vds
            current_index_name = args.index

        logger.info("==> exporting %s samples into %s" %
                    (len(sample_group), current_index_name))
        logger.info(
            "Samples: %s .. %s" %
            (", ".join(sample_group[:3]), ", ".join(sample_group[-3:])))

        logger.info("==> export to elasticsearch - vds schema:\n" +
                    pformat(vds.variant_schema))

        timestamp1 = time.time()

        client.export_vds_to_elasticsearch(
            vds_sample_subset,
            genotype_fields_to_export=genotype_fields_to_export,
            genotype_field_to_elasticsearch_type_map=
            genotype_field_to_elasticsearch_type_map,
            export_genotypes_as_nested_field=bool(
                args.use_nested_objects_for_genotypes),
            export_genotypes_as_child_docs=bool(
                args.use_child_docs_for_genotypes),
            discard_missing_genotypes=bool(args.discard_missing_genotypes),
            index_name=current_index_name,
            index_type_name="variant",
            block_size=args.es_block_size,
            num_shards=args.num_shards,
            delete_index_before_exporting=delete_index_before_exporting,
            elasticsearch_write_operation=operation,
            elasticsearch_mapping_id="docId",
            disable_doc_values_for_fields=disable_doc_values_for_fields,
            disable_index_for_fields=disable_index_for_fields,
            is_split_vds=True,
            run_after_index_exists=run_after_index_exists,
            verbose=False,
            force_merge=force_merge,
        )

        timestamp2 = time.time()
        logger.info("==> finished exporting - time: %s seconds" %
                    (timestamp2 - timestamp1))
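
# A minimal usage sketch of export_to_elasticsearch() (hypothetical values; in the real pipeline
# `vds` and `args` come from the loading script's own Hail context and ArgumentParser):
#
#   export_to_elasticsearch(
#       vds,
#       args,
#       operation=ELASTICSEARCH_INDEX,
#       delete_index_before_exporting=True,
#       export_genotypes=True,
#       disable_doc_values_for_fields=("sortedTranscriptConsequences",),
#       disable_index_for_fields=("sortedTranscriptConsequences",),
#   )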
import os
os.system("pip install elasticsearch")

import argparse
from hail_scripts.v01.utils.elasticsearch_client import ElasticsearchClient

p = argparse.ArgumentParser()
p.add_argument(
    "-H",
    "--host",
    help="elasticsearch client host. The default address works if "
    "`kubectl proxy` is running in the background.",
    default=
    "http://localhost:8001/api/v1/namespaces/default/services/elasticsearch:9200/proxy"
)
p.add_argument("-p",
               "--port",
               help="elasticsearch client port.",
               default="30001")

args = p.parse_args()

# to get the ip address, run  `kubectl describe pod elasticsearch-1019229749-vhghc`
ELASTICSEARCH_HOST = args.host
ELASTICSEARCH_PORT = args.port

es = ElasticsearchClient(ELASTICSEARCH_HOST, port=ELASTICSEARCH_PORT)
es.print_elasticsearch_stats()
               type=int)
p.add_argument("--vep-block-size",
               help="Block size to use for VEP",
               default=200,
               type=int)
p.add_argument("--es-block-size",
               help="Block size to use when exporting to elasticsearch",
               default=200,
               type=int)
p.add_argument(
    "--subset",
    help="Specify an interval (eg. X:12345-54321 to load a subset of clinvar")
args = p.parse_args()

client = ElasticsearchClient(
    host=args.host,
    port=args.port,
)

if args.index_name:
    index_name = args.index_name.lower()
else:
    index_name = "clinvar_grch{}".format(args.genome_version)

hc = create_hail_context()

# download vcf
vds = download_and_import_latest_clinvar_vcf(hc,
                                             args.genome_version,
                                             subset=args.subset)

# run VEP
    COVERAGE_PATHS = EXOME_COVERAGE_CSV_PATHS[-1]

kt_coverage = hc.import_table(COVERAGE_PATHS, types=types)
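# Rename the raw coverage table columns: "#chrom" becomes "chrom", and the numeric depth-threshold
# headers ("1", "5", ..., "100") become descriptive field names ("over1" ... "over100").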
kt_coverage = kt_coverage.rename({
    '#chrom': 'chrom',
    '1': 'over1',
    '5': 'over5',
    '10': 'over10',
    '15': 'over15',
    '20': 'over20',
    '25': 'over25',
    '30': 'over30',
    '50': 'over50',
    '100': 'over100',
})
print(kt_coverage.schema)
print("======== Export exome coverage to elasticsearch ======")

es = ElasticsearchClient(
    host=args.host,
    port=args.port,
)

es.export_kt_to_elasticsearch(kt_coverage,
                              index_name=args.index,
                              index_type_name=args.index_type,
                              num_shards=args.num_shards,
                              block_size=args.block_size,
                              delete_index_before_exporting=True,
                              verbose=True)
Example #12
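# Convert each snake_case transcript annotation name to camelCase (e.g. "gene_symbol" -> "geneSymbol")
# and copy its value out of the mainTranscript struct into a top-level field.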
for field_name in transcript_annotations_to_keep:
    new_field_name = field_name.split("_")[0] + "".join(
        map(lambda word: word.capitalize(),
            field_name.split("_")[1:]))
    combined_kt = combined_kt.annotate(
        "%(new_field_name)s = mainTranscript.%(field_name)s" % locals())

combined_kt = combined_kt.drop(["mainTranscript"])

pprint(combined_kt.schema)

DISABLE_INDEX_AND_DOC_VALUES_FOR_FIELDS = ("sortedTranscriptConsequences", )

print("======== Export to elasticsearch ======")
es = ElasticsearchClient(
    host=args.host,
    port=args.port,
)

es.export_kt_to_elasticsearch(
    combined_kt,
    index_name=args.index,
    index_type_name=args.index_type,
    block_size=args.block_size,
    num_shards=args.num_shards,
    delete_index_before_exporting=True,
    disable_doc_values_for_fields=DISABLE_INDEX_AND_DOC_VALUES_FOR_FIELDS,
    disable_index_for_fields=DISABLE_INDEX_AND_DOC_VALUES_FOR_FIELDS,
    verbose=True,
)
kt = hc.read_table(gene_results_url)

kt = kt.rename({
    'ensembl_gene_id': 'gene_id',
    'Xcase_lof': 'xcase_lof',
    'Xctrl_lof': 'xctrl_lof',
    'Pval_lof': 'pval_lof',
    'Xcase_mpc': 'xcase_mpc',
    'Xctrl_mpc': 'xctrl_mpc',
    'Pval_mpc': 'pval_mpc',
    'Pval_meta': 'pval_meta',
})

kt = kt.annotate("analysis_group = \"all\"")

es = ElasticsearchClient(
    host=args.host,
    port=args.port,
)

es.export_kt_to_elasticsearch(
    kt,
    index_name="schizophrenia_gene_results_171213",
    index_type_name="result",
    block_size=args.block_size,
    num_shards=args.num_shards,
    delete_index_before_exporting=True,
    verbose=True,
)