Exemple #1
0
def update_all_datasets(hc, args):
    """Update reference data in every Elasticsearch index that records its source file.

    Lists all index names on the cluster, and for each index whose ``_meta``
    contains a ``sourceFilePath`` entry calls ``update_dataset``. Indices
    without that entry are skipped with a log message. A failure while
    updating one index is logged and does not stop the remaining updates.

    Args:
        hc: passed through unchanged to ``update_dataset`` (presumably the
            Hail context -- confirm against the caller).
        args: parsed command-line arguments; ``args.host`` and ``args.port``
            are read here, and the whole object is forwarded to
            ``update_dataset``.
    """
    client = ElasticsearchClient(args.host, port=args.port)
    index_listing = client.es.cat.indices(h="index", s="index")
    # An empty listing would otherwise turn into [""] and trigger a metadata
    # lookup for an empty index name, so blank lines are dropped.
    indices = [
        line.strip() for line in index_listing.splitlines() if line.strip()
    ]

    for i, index_name in enumerate(indices, start=1):
        _meta = client.get_index_meta(index_name)

        logger.info("==> updating index {} out of {}: {}".format(
            i, len(indices), index_name))

        if not _meta or "sourceFilePath" not in _meta:
            logger.info(
                "==> skipping {} because index _meta['sourceFilePath'] isn't set: {}"
                .format(index_name, _meta))
            continue

        try:
            update_dataset(hc, index_name, args)
        except Exception as e:
            # Deliberately broad: one failing index must not abort the
            # updates of the indices that follow it.
            logger.error("ERROR while updating %s - %s: %s", index_name,
                         _meta["sourceFilePath"], e)
Exemple #2
0
def update_dataset(hc, index_name, args):
    """Re-annotate one index's dataset with reference data and export the updates.

    Reads the dataset from ``args.dataset_path`` (or, if that is unset, from
    the ``sourceFilePath`` recorded in the index ``_meta``), adds the
    reference-data annotations selected by the ``args.update_*`` flags, keeps
    only the variants that received at least one of those annotations, and
    writes them to the existing Elasticsearch index as updates.

    The genome version is taken from ``args.genome_version``, then from
    ``_meta["genomeVersion"]``, then from a ``__grch<digits>__`` token in the
    index name. If the dataset path or the genome version cannot be
    determined, or no ``args.update_*`` flag is set, an error is logged and
    the function returns without exporting anything.

    Args:
        hc: passed to ``read_in_dataset`` and the ``add_*_to_vds`` helpers
            (presumably the Hail context -- confirm against the caller).
        index_name: name of the Elasticsearch index to update.
        args: parsed command-line arguments. Attributes read here: ``host``,
            ``port``, ``dataset_path``, ``genome_version``,
            ``update_primate_ai``, ``update_splice_ai``, ``update_clinvar``,
            ``update_hgmd`` and ``block_size``.
    """
    elasticsearch_client = ElasticsearchClient(args.host, args.port)
    _meta = elasticsearch_client.get_index_meta(index_name)
    if not args.dataset_path and (not _meta or "sourceFilePath" not in _meta):
        logger.error(
            "Couldn't update reference data in {} because it doesn't have a recorded sourceFilePath. Please use "
            "--index-name, --dataset-path, and --genome-version to update this index."
            .format(index_name))
        return

    dataset_path = args.dataset_path or _meta["sourceFilePath"]
    # _meta may be empty/None when --dataset-path was given explicitly, so it
    # must not be dereferenced unconditionally.
    genome_version = args.genome_version or (_meta or {}).get("genomeVersion")

    if genome_version is None:
        # Last resort: index names may embed the build, e.g. "...__grch38__...".
        match = re.search(r"__grch([0-9]+)__", index_name, re.IGNORECASE)
        if not match:
            logger.error(
                "Couldn't update reference data in {} because the genome version wasn't found in _meta ({}) or in the index name."
                .format(index_name, _meta))
            return
        genome_version = match.group(1)

    # Without any update flag the variant filter below would be an empty
    # expression; bail out before paying for the dataset load.
    if not (args.update_primate_ai or args.update_splice_ai
            or args.update_clinvar or args.update_hgmd):
        logger.error(
            "Couldn't update reference data in {} because none of args.update_primate_ai, "
            "args.update_splice_ai, args.update_clinvar or args.update_hgmd is set."
            .format(index_name))
        return

    vds = read_in_dataset(hc, dataset_path)
    vds = vds.drop_samples()
    vds = compute_minimal_schema(vds)
    vds = vds.annotate_global_expr(
        'global.genomeVersion = "{}"'.format(genome_version))

    # add reference data to vds, remembering one "is annotated" test per source
    filter_expr = []
    if args.update_primate_ai:
        vds = add_primate_ai_to_vds(hc,
                                    vds,
                                    genome_version,
                                    root="va.primate_ai")
        filter_expr.append("isDefined(va.primate_ai.score)")

    if args.update_splice_ai:
        vds = add_splice_ai_to_vds(hc,
                                   vds,
                                   genome_version,
                                   root="va.splice_ai")
        filter_expr.append("isDefined(va.splice_ai.delta_score)")

    if args.update_clinvar:
        vds = add_clinvar_to_vds(hc, vds, genome_version, root="va.clinvar")
        filter_expr.append("isDefined(va.clinvar.allele_id)")

    if args.update_hgmd:
        vds = add_hgmd_to_vds(hc, vds, genome_version, root="va.hgmd")
        filter_expr.append("isDefined(va.hgmd.accession)")

    # filter down to variants that have reference data from at least one source
    vds = vds.filter_variants_expr(" || ".join(filter_expr), keep=True)

    print("\n\n==> schema: ")
    pprint(vds.variant_schema)

    _, variant_count = vds.count()
    logger.info(
        "\n==> exporting {} variants to elasticsearch:".format(variant_count))
    elasticsearch_client.export_vds_to_elasticsearch(
        vds,
        index_name=index_name,
        index_type_name="variant",
        block_size=args.block_size,
        elasticsearch_write_operation=ELASTICSEARCH_UPDATE,
        elasticsearch_mapping_id="docId",
        is_split_vds=True,
        verbose=False,
        delete_index_before_exporting=False,
        ignore_elasticsearch_write_errors=False,
        export_globals_to_index_meta=True,
    )