def route_index_to_temp_es_cluster(yes, args):
    """Apply shard allocation filtering rules for the given index to elasticsearch data nodes with
    *loading* in their name: If yes is True, route new documents in the given index only to nodes
    named "*loading*". Otherwise, move any shards in this index off of nodes named "*loading*".

    Args:
        yes (bool): whether to route shards in the given index to the "*loading*" nodes, or move
            shards off of these nodes.
        args: args from ArgumentParser - used to compute the index name and get elasticsearch host and port.
    """
    if yes:
        require_name = "es-data-loading*"
        exclude_name = ""
    else:
        require_name = ""
        exclude_name = "es-data-loading*"

    body = {
        "index.routing.allocation.require._name": require_name,
        "index.routing.allocation.exclude._name": exclude_name,
    }

    logger.info("==> Setting {}* settings = {}".format(args.index, body))

    index_arg = "{}*".format(args.index)
    client = ElasticsearchClient(args.host, args.port)
    client.es.indices.put_settings(index=index_arg, body=body)

    if not yes:
        wait_for_loading_shards_transfer(client, index=index_arg)
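# A minimal standalone sketch (assuming elasticsearch-py is installed and a cluster is reachable at
# host:port) of the same index-level shard allocation filtering that route_index_to_temp_es_cluster()
# applies through ElasticsearchClient. The helper name is hypothetical: requiring the
# "es-data-loading*" node name pins new shards to the temp loading nodes; swapping the
# require/exclude values drains them back off.
from elasticsearch import Elasticsearch

def _example_pin_index_to_loading_nodes(index_pattern, host="localhost", port="9200"):
    es = Elasticsearch(["http://{}:{}".format(host, port)])
    es.indices.put_settings(index=index_pattern, body={
        "index.routing.allocation.require._name": "es-data-loading*",
        "index.routing.allocation.exclude._name": "",
    })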
def update_all_datasets(hc, args):
    client = ElasticsearchClient(args.host, port=args.port)
    indices = client.es.cat.indices(h="index", s="index").strip().split("\n")
    for i, index_name in enumerate(indices):
        _meta = client.get_index_meta(index_name)

        logger.info("==> updating index {} out of {}: {}".format(
            i + 1, len(indices), index_name))
        if _meta and "sourceFilePath" in _meta:
            try:
                update_dataset(hc, index_name, args)
            except Exception as e:
                logger.error("ERROR while updating %s - %s: %s", index_name, _meta["sourceFilePath"], e)
        else:
            logger.info(
                "==> skipping {} because index _meta['sourceFilePath'] isn't set: {}".format(
                    index_name, _meta))
def update_operations_log(args):
    if args.dont_update_operations_log:
        return

    logger.info("==> update operations log")
    client = ElasticsearchClient(args.host, args.port)
    client.save_index_operation_metadata(
        args.input_dataset,
        args.index,
        args.genome_version,
        fam_file=args.fam_file,
        remap_sample_ids=args.remap_sample_ids,
        subset_samples=args.subset_samples,
        skip_vep=args.skip_vep,
        project_id=args.project_guid,
        dataset_type=args.dataset_type,
        sample_type=args.sample_type,
        command=" ".join(sys.argv),
        directory=args.directory,
        username=args.username,
        operation="create_index",
        status="success",
    )
def update_dataset(hc, index_name, args):
    elasticsearch_client = ElasticsearchClient(args.host, args.port)
    _meta = elasticsearch_client.get_index_meta(index_name)

    if not args.dataset_path and (not _meta or "sourceFilePath" not in _meta):
        logger.error(
            "Couldn't update reference data in {} because it doesn't have a recorded sourceFilePath. Please use "
            "--index-name, --dataset-path, and --genome-version to update this index.".format(index_name))
        return

    dataset_path = args.dataset_path or _meta["sourceFilePath"]
    genome_version = args.genome_version or _meta.get("genomeVersion")

    if genome_version is None:
        match = re.search("__grch([0-9]+)__", index_name, re.IGNORECASE)
        if not match:
            logger.error(
                "Couldn't update reference data in {} because the genome version wasn't found in _meta ({}) "
                "or in the index name.".format(index_name, _meta))
            return
        genome_version = match.group(1)

    vds = read_in_dataset(hc, dataset_path)
    vds = vds.drop_samples()
    vds = compute_minimal_schema(vds)
    vds = vds.annotate_global_expr('global.genomeVersion = "{}"'.format(genome_version))

    # add reference data to vds
    filter_expr = []
    if args.update_primate_ai:
        vds = add_primate_ai_to_vds(hc, vds, genome_version, root="va.primate_ai")
        filter_expr.append("isDefined(va.primate_ai.score)")

    if args.update_splice_ai:
        vds = add_splice_ai_to_vds(hc, vds, genome_version, root="va.splice_ai")
        filter_expr.append("isDefined(va.splice_ai.delta_score)")

    if args.update_clinvar:
        #vds = reset_clinvar_fields_in_vds(hc, vds, genome_version, root="va.clinvar", subset=filter_interval)
        vds = add_clinvar_to_vds(hc, vds, genome_version, root="va.clinvar")
        filter_expr.append("isDefined(va.clinvar.allele_id)")

    if args.update_hgmd:
        #vds = reset_hgmd_fields_in_vds(hc, vds, genome_version, root="va.hgmd", subset=filter_interval)
        vds = add_hgmd_to_vds(hc, vds, genome_version, root="va.hgmd")
        filter_expr.append("isDefined(va.hgmd.accession)")

    # filter down to variants that have reference data
    vds = vds.filter_variants_expr(" || ".join(filter_expr), keep=True)

    print("\n\n==> schema: ")
    pprint(vds.variant_schema)

    _, variant_count = vds.count()
    logger.info("\n==> exporting {} variants to elasticsearch:".format(variant_count))

    elasticsearch_client.export_vds_to_elasticsearch(
        vds,
        index_name=index_name,
        index_type_name="variant",
        block_size=args.block_size,
        elasticsearch_write_operation=ELASTICSEARCH_UPDATE,
        elasticsearch_mapping_id="docId",
        is_split_vds=True,
        verbose=False,
        delete_index_before_exporting=False,
        ignore_elasticsearch_write_errors=False,
        export_globals_to_index_meta=True,
    )
missing="NA", quote='"', types={ 'gene_name': TString(), 'description': TString(), 'gene_id': TString(), 'xcase_lof': TInt(), 'xctrl_lof': TInt(), 'pval_lof': TDouble(), 'xcase_mpc': TInt(), 'xctrl_mpc': TInt(), 'pval_mpc': TDouble(), 'xcase_infrIndel': TInt(), 'xctrl_infrIndel': TInt(), 'pval_infrIndel': TDouble(), 'pval_meta': TDouble(), 'analysis_group': TString(), }) es = ElasticsearchClient(args.host, args.port) es.export_kt_to_elasticsearch( kt, index_name="epi_exome_gene_results_181107", index_type_name="result", block_size=args.block_size, num_shards=args.num_shards, delete_index_before_exporting=True, verbose=True, )
    type=int,
    help="For use with --num-temp-loading-nodes. Number of temp loading nodes to create.",
    default=3)
p.add_argument("--host", help="Elasticsearch host",
               default=os.environ.get("ELASTICSEARCH_SERVICE_HOSTNAME", "localhost"))
p.add_argument("--port", help="Elasticsearch port", default="9200")
p.add_argument(
    "--k8s-cluster-name",
    help="Specifies the kubernetes cluster name that hosts elasticsearch.",
    required=True)
args = p.parse_args()

client = ElasticsearchClient(args.host, args.port)
wait_for_loading_shards_transfer(client, num_attempts=1)

settings = _get_es_node_settings(args.k8s_cluster_name, args.num_temp_loading_nodes)
_set_k8s_context(settings)

_process_kubernetes_configs(
    "delete",
    settings=settings,
    config_paths=[
        "./kubernetes/elasticsearch-sharded/es-data-stateless-local-ssd.yaml",
    ])
_wait_for_data_nodes_state("delete", settings)

run("echo Y | gcloud container node-pools delete --cluster {} loading-cluster".
for group in analysis_groups:
    group_results = variant_results.filter('analysis_group == "%s"' % group).drop("analysis_group")
    group_results = group_results.annotate(
        "%s = { %s }" % (group, ", ".join(["%s: %s" % (col, col) for col in result_columns]))
    ).select(["v", group])
    variants = variants.join(group_results.key_by("v"))

variants = variants.annotate(
    "groups = { %s }" % ", ".join(["%s:%s" % (group, group) for group in analysis_groups]))
variants = variants.drop(list(analysis_groups))

variants = variants.annotate("v = Variant(v)")
variants = variants.annotate("variant_id = %s" % get_expr_for_variant_id())
variants = variants.annotate("chrom = %s" % get_expr_for_contig())
variants = variants.annotate("pos = %s" % get_expr_for_start_pos())
variants = variants.annotate("xpos = %s" % get_expr_for_xpos())
variants = variants.drop(["v"])

pprint.pprint(variants.schema)

es = ElasticsearchClient(args.host, args.port)
es.export_kt_to_elasticsearch(
    variants,
    index_name=args.index,
    index_type_name="variant",
    block_size=args.block_size,
    num_shards=args.num_shards,
    delete_index_before_exporting=True,
    verbose=True,
)
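# For illustration only, with hypothetical analysis_groups = ["EUR", "EAS"] and
# result_columns = ["ac_case", "ac_ctrl"], the loop above builds Hail expression strings like:
#   'EUR = { ac_case: ac_case, ac_ctrl: ac_ctrl }'
#   'EAS = { ac_case: ac_case, ac_ctrl: ac_ctrl }'
# and the follow-up annotate nests them into a single struct per exported variant document:
#   'groups = { EUR:EUR, EAS:EAS }'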
def export_to_elasticsearch(
        vds,
        args,
        operation=ELASTICSEARCH_INDEX,
        delete_index_before_exporting=False,
        export_genotypes=True,
        disable_doc_values_for_fields=(),
        disable_index_for_fields=(),
        run_after_index_exists=None,
        force_merge=False,
):
    """Utility method for exporting the given vds to an elasticsearch index."""

    start_with_sample_group = args.start_with_sample_group if args.start_with_step == 0 else 0

    if not export_genotypes:
        genotype_fields_to_export = []
        genotype_field_to_elasticsearch_type_map = {}
    elif args.dataset_type == "VARIANTS":
        genotype_fields_to_export = VARIANT_GENOTYPE_FIELDS_TO_EXPORT
        genotype_field_to_elasticsearch_type_map = VARIANT_GENOTYPE_FIELD_TO_ELASTICSEARCH_TYPE_MAP
    elif args.dataset_type == "SV":
        genotype_fields_to_export = SV_GENOTYPE_FIELDS_TO_EXPORT
        genotype_field_to_elasticsearch_type_map = SV_GENOTYPE_FIELD_TO_ELASTICSEARCH_TYPE_MAP
    else:
        raise ValueError("Unexpected args.dataset_type: %s" % args.dataset_type)

    vds = vds.persist()

    sample_groups = compute_sample_groups(vds, args)
    client = ElasticsearchClient(args.host, args.port)
    for i, sample_group in enumerate(sample_groups):
        if i < start_with_sample_group:
            continue

        #if delete_index_before_exporting and i < 4:
        #    continue

        if len(sample_groups) > 1:
            vds_sample_subset = vds.filter_samples_list(sample_group, keep=True)
            current_index_name = "%s_%s" % (args.index, i)
        else:
            vds_sample_subset = vds
            current_index_name = args.index

        logger.info("==> exporting %s samples into %s" % (len(sample_group), current_index_name))
        logger.info("Samples: %s .. %s" % (", ".join(sample_group[:3]), ", ".join(sample_group[-3:])))
        logger.info("==> export to elasticsearch - vds schema:\n" + pformat(vds.variant_schema))

        timestamp1 = time.time()
        client.export_vds_to_elasticsearch(
            vds_sample_subset,
            genotype_fields_to_export=genotype_fields_to_export,
            genotype_field_to_elasticsearch_type_map=genotype_field_to_elasticsearch_type_map,
            export_genotypes_as_nested_field=bool(args.use_nested_objects_for_genotypes),
            export_genotypes_as_child_docs=bool(args.use_child_docs_for_genotypes),
            discard_missing_genotypes=bool(args.discard_missing_genotypes),
            index_name=current_index_name,
            index_type_name="variant",
            block_size=args.es_block_size,
            num_shards=args.num_shards,
            delete_index_before_exporting=delete_index_before_exporting,
            elasticsearch_write_operation=operation,
            elasticsearch_mapping_id="docId",
            disable_doc_values_for_fields=disable_doc_values_for_fields,
            disable_index_for_fields=disable_index_for_fields,
            is_split_vds=True,
            run_after_index_exists=run_after_index_exists,
            verbose=False,
            force_merge=force_merge,
        )

        timestamp2 = time.time()
        logger.info("==> finished exporting - time: %s seconds" % (timestamp2 - timestamp1))
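# compute_sample_groups() is defined elsewhere in the pipeline. As a rough, hypothetical sketch of
# the idea only (the real grouping logic and its parameters may differ): chunk the VDS sample ids
# into fixed-size groups so each group can be exported into its own "<args.index>_<i>" index.
def _example_compute_sample_groups(vds, samples_per_group=100):
    sample_ids = list(vds.sample_ids)  # a Hail 0.1 VariantDataset exposes its sample ids as a list
    return [
        sample_ids[i:i + samples_per_group]
        for i in range(0, len(sample_ids), samples_per_group)
    ]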
import os
os.system("pip install elasticsearch")

import argparse
from hail_scripts.v01.utils.elasticsearch_client import ElasticsearchClient

p = argparse.ArgumentParser()
p.add_argument(
    "-H", "--host",
    help="elasticsearch client host. The default address works if "
         "`kubectl proxy` is running in the background.",
    default="http://localhost:8001/api/v1/namespaces/default/services/elasticsearch:9200/proxy")
p.add_argument("-p", "--port", help="elasticsearch client port.", default="30001")
args = p.parse_args()

# to get the ip address, run `kubectl describe pod elasticsearch-1019229749-vhghc`
ELASTICSEARCH_HOST = args.host
ELASTICSEARCH_PORT = args.port

es = ElasticsearchClient(ELASTICSEARCH_HOST, port=ELASTICSEARCH_PORT)
es.print_elasticsearch_stats()
    type=int)
p.add_argument("--vep-block-size", help="Block size to use for VEP", default=200, type=int)
p.add_argument("--es-block-size", help="Block size to use when exporting to elasticsearch",
               default=200, type=int)
p.add_argument(
    "--subset",
    help="Specify an interval (e.g. X:12345-54321) to load a subset of clinvar")
args = p.parse_args()

client = ElasticsearchClient(
    host=args.host,
    port=args.port,
)

if args.index_name:
    index_name = args.index_name.lower()
else:
    index_name = "clinvar_grch{}".format(args.genome_version)

hc = create_hail_context()

# download vcf
vds = download_and_import_latest_clinvar_vcf(hc, args.genome_version, subset=args.subset)

# run VEP
COVERAGE_PATHS = EXOME_COVERAGE_CSV_PATHS[-1]

kt_coverage = hc.import_table(COVERAGE_PATHS, types=types)
kt_coverage = kt_coverage.rename({
    '#chrom': 'chrom',
    '1': 'over1',
    '5': 'over5',
    '10': 'over10',
    '15': 'over15',
    '20': 'over20',
    '25': 'over25',
    '30': 'over30',
    '50': 'over50',
    '100': 'over100',
})
print(kt_coverage.schema)

print("======== Export exome coverage to elasticsearch ======")
es = ElasticsearchClient(
    host=args.host,
    port=args.port,
)
es.export_kt_to_elasticsearch(
    kt_coverage,
    index_name=args.index,
    index_type_name=args.index_type,
    num_shards=args.num_shards,
    block_size=args.block_size,
    delete_index_before_exporting=True,
    verbose=True)
for field_name in transcript_annotations_to_keep:
    new_field_name = field_name.split("_")[0] + "".join(
        map(lambda word: word.capitalize(), field_name.split("_")[1:]))
    combined_kt = combined_kt.annotate(
        "%(new_field_name)s = mainTranscript.%(field_name)s" % locals())

combined_kt = combined_kt.drop(["mainTranscript"])

pprint(combined_kt.schema)

DISABLE_INDEX_AND_DOC_VALUES_FOR_FIELDS = ("sortedTranscriptConsequences", )

print("======== Export to elasticsearch ======")
es = ElasticsearchClient(
    host=args.host,
    port=args.port,
)
es.export_kt_to_elasticsearch(
    combined_kt,
    index_name=args.index,
    index_type_name=args.index_type,
    block_size=args.block_size,
    num_shards=args.num_shards,
    delete_index_before_exporting=True,
    disable_doc_values_for_fields=DISABLE_INDEX_AND_DOC_VALUES_FOR_FIELDS,
    disable_index_for_fields=DISABLE_INDEX_AND_DOC_VALUES_FOR_FIELDS,
    verbose=True,
)
kt = hc.read_table(gene_results_url)
kt = kt.rename({
    'ensembl_gene_id': 'gene_id',
    'Xcase_lof': 'xcase_lof',
    'Xctrl_lof': 'xctrl_lof',
    'Pval_lof': 'pval_lof',
    'Xcase_mpc': 'xcase_mpc',
    'Xctrl_mpc': 'xctrl_mpc',
    'Pval_mpc': 'pval_mpc',
    'Pval_meta': 'pval_meta',
})
kt = kt.annotate("analysis_group = \"all\"")

es = ElasticsearchClient(
    host=args.host,
    port=args.port,
)
es.export_kt_to_elasticsearch(
    kt,
    index_name="schizophrenia_gene_results_171213",
    index_type_name="result",
    block_size=args.block_size,
    num_shards=args.num_shards,
    delete_index_before_exporting=True,
    verbose=True,
)