def submit_load_dataset_to_es_job_v02(dataproc_cluster_name,
                                      start_with_step=0,
                                      stop_after_step=None,
                                      other_load_dataset_to_es_args=(),
                                      es_host='localhost',
                                      es_port='9200'):
    def abs_path(rel_path):
        return os.path.join(CUR_DIR, rel_path)

    # Must use absolute path because this script changes the directory all over the place :(
    pyfiles = ','.join([abs_path(f) for f in ['lib', '../hail_scripts']])
    files = abs_path('configs/luigi.cfg')
    executable = abs_path('seqr_loading.py')

    if stop_after_step == 1:
        task = 'SeqrVCFToMTTask'
    else:
        task = 'SeqrMTToESTask --es-host %(es_host)s --es-port %(es_port)s' % locals()

    # submit job
    run(" ".join(
        map(str, [
            "hailctl dataproc", "submit", "%(dataproc_cluster_name)s",
            "%(executable)s", "--pyfiles %(pyfiles)s", "--files %(files)s",
            "%(task)s --local-scheduler"
        ])) % locals())
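For context, a minimal sketch (not from the original source) of how the `% locals()` templating above renders the final `hailctl dataproc submit` command; the cluster name and file paths are made-up placeholders standing in for `abs_path(...)` results.

template = ("hailctl dataproc submit %(dataproc_cluster_name)s %(executable)s "
            "--pyfiles %(pyfiles)s --files %(files)s %(task)s --local-scheduler")
print(template % {
    "dataproc_cluster_name": "seqr-loading-cluster",              # hypothetical
    "executable": "/repo/seqr_loading.py",                        # hypothetical abs_path(...)
    "pyfiles": "/repo/lib,/repo/../hail_scripts",                 # hypothetical abs_path(...)
    "files": "/repo/configs/luigi.cfg",                           # hypothetical abs_path(...)
    "task": "SeqrMTToESTask --es-host localhost --es-port 9200",
})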
Example n. 2
def _create_dataproc_cluster(dataproc_cluster_name,
                             genome_version,
                             num_workers=2,
                             num_preemptible_workers=12):
    run("python ./gcloud_dataproc/v01/create_cluster_GRCh%(genome_version)s.py %(dataproc_cluster_name)s %(num_workers)s %(num_preemptible_workers)s"
        % locals(),
        errors_to_ignore=["Already exists"])
Example n. 3
def _create_dataproc_cluster_v02(dataproc_cluster_name,
                                 genome_version,
                                 num_workers=2,
                                 num_preemptible_workers=12):
    run(f"hailctl dataproc start %(dataproc_cluster_name)s --num-workers %(num_workers)s --pkgs luigi,google-api-python-client "
        "--num-secondary-workers %(num_preemptible_workers)s --max-idle 30m --vep GRCh%(genome_version)s"
        % locals(),
        errors_to_ignore=["Already exists"])
def download_and_import_latest_clinvar_vcf(hail_context,
                                           genome_version,
                                           subset=None):
    """Downloads the latest clinvar VCF from the NCBI FTP server, copies it to HDFS and returns the hdfs file path
    as well the clinvar release date that's specified in the VCF header.

    Args:
        genome_version (str): "37" or "38"
        subset (str): subset by interval (eg. "X:12345-54321") - useful for testing
    Returns:
        2-tuple: (clinvar_vcf_hdfs_path, clinvar_release_date)
    """

    if genome_version not in ["37", "38"]:
        raise ValueError("Invalid genome_version: " + str(genome_version))

    # download vcf
    clinvar_url = CLINVAR_FTP_PATH.format(genome_version=genome_version)
    local_tmp_file_path = "/tmp/clinvar_grch{}.vcf.gz".format(genome_version)
    clinvar_vcf_hdfs_path = "/tmp/" + os.path.basename(local_tmp_file_path)

    print("\n==> downloading {}".format(clinvar_url))

    run("wget {} -O {}".format(clinvar_url, local_tmp_file_path))

    run("hdfs dfs -copyFromLocal -f file://{} {}".format(
        local_tmp_file_path, clinvar_vcf_hdfs_path))

    clinvar_release_date = _parse_clinvar_release_date(local_tmp_file_path)

    # import vcf
    vds = hail_context.import_vcf(
        clinvar_vcf_hdfs_path,
        force_bgz=True,
        min_partitions=10000,
        drop_samples=True)  #.filter_intervals(hail.Interval.parse("1-MT"))

    if subset:
        vds = vds.filter_intervals(hail.Interval.parse(subset))

    vds = vds.repartition(
        10000)  # because the min_partitions arg doesn't work in some cases
    vds = vds.annotate_global_expr(
        'global.sourceFilePath = "{}"'.format(clinvar_url))
    vds = vds.annotate_global_expr(
        'global.version = "{}"'.format(clinvar_release_date))

    # handle multi-allelics
    vds = vds.split_multi()

    # for some reason, this additional filter is necessary to avoid
    #  IllegalArgumentException: requirement failed: called altAllele on a non-biallelic variant
    vds = vds.filter_variants_expr("v.isBiallelic()", keep=True)

    print("\n==> downloaded clinvar vcf: ")
    pprint(vds.globals._attrs)

    return vds
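A minimal usage sketch, assuming Hail 0.1 is installed and the code runs on a cluster with HDFS access; the subset interval is chosen purely for illustration.

from pprint import pprint

import hail

hc = hail.HailContext()

# import only a small clinvar subset for GRCh37 (interval chosen purely for illustration)
clinvar_vds = download_and_import_latest_clinvar_vcf(hc, "37", subset="X:12345-54321")

# the source URL and clinvar release date end up in the global annotations
pprint(clinvar_vds.globals._attrs)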
Example n. 5
def _create_temp_es_loading_nodes(settings):
    # make sure k8s cluster exists
    #run(" ".join([
    #    "gcloud container clusters create %(k8s_cluster_name)s",
    #    "--machine-type %(CLUSTER_MACHINE_TYPE)s",
    #    "--num-nodes 1",   # "--scopes https://www.googleapis.com/auth/devstorage.read_write"
    #]) % locals(), errors_to_ignore=["Already exists"])

    _set_k8s_context(settings)

    # add loading nodes
    run(" ".join([
        "gcloud container node-pools create loading-cluster ",
        "--cluster %(CLUSTER_NAME)s",
        "--machine-type %(CLUSTER_MACHINE_TYPE)s",
        "--num-nodes %(ES_DATA_NUM_PODS)s",
        "--local-ssd-count 1",
    ]) % settings,
        errors_to_ignore=["Already exists"])

    # deploy elasticsearch
    _process_kubernetes_configs(
        "create",
        settings=settings,
        config_paths=[
            "./kubernetes/elasticsearch-sharded/es-data-stateless-local-ssd.yaml",
        ])

    _wait_for_data_nodes_state("create", settings)

    # get ip address of loading nodes
    elasticsearch_ip_address = run(
        "kubectl get endpoints elasticsearch -o jsonpath='{.subsets[0].addresses[0].ip}'"
    )

    logger.info("elasticsearch loading cluster IP address: {}".format(
        elasticsearch_ip_address))
    if not elasticsearch_ip_address or not re.match(
            r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", elasticsearch_ip_address):
        logger.error("Invalid elasticsearch IP address: '{}'".format(
            elasticsearch_ip_address))

    # add firewall rule to allow ingress
    firewall_rule_name = _compute_firewall_rule_name(settings["CLUSTER_NAME"])
    source_range = "%s.%s.0.0/16" % tuple(
        elasticsearch_ip_address.split(".")[0:2])
    for action in ["create", "update"]:
        run((
            "gcloud compute firewall-rules %(action)s %(firewall_rule_name)s "
            "--description='Allow any machine in the project-default network to connect to elasticsearch loading cluster ports 9200, 9300'"
            "--network=default "
            "--allow=tcp:9200,tcp:9300 "
            "--source-ranges=%(source_range)s ") % locals(),
            errors_to_ignore=["already exists"])

    return elasticsearch_ip_address
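A small, self-contained illustration (the IP address is made up) of how the firewall `source_range` above is derived: only the first two octets of the loading nodes' address are kept, so the rule covers the whole /16.

elasticsearch_ip_address = "10.128.3.17"  # made-up example address
source_range = "%s.%s.0.0/16" % tuple(elasticsearch_ip_address.split(".")[0:2])
assert source_range == "10.128.0.0/16"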
Example n. 6
def _process_kubernetes_configs(action, config_paths, settings):
    for config_path in config_paths:
        # configure deployment dir
        output_dir = "/tmp/deployments/%(TIMESTAMP)s_%(CLUSTER_NAME)s" % settings
        process_jinja_template(".", config_path, settings, output_dir)

        config_path = os.path.join(output_dir, config_path)
        if action == "delete":
            run("kubectl delete -f %(config_path)s" % locals(), errors_to_ignore=["not found"])
        elif action == "create":
            run("kubectl apply -f %(config_path)s" % locals(), errors_to_ignore=["already exists", "already allocated"])
Example n. 7
def _enable_cluster_routing_rebalance(enable, dataproc_cluster_name, host, port):
    logger.info("==> %s cluster.routing.rebalance", "enable" if enable else "disable")

    run(" ".join(map(str, [
        "./gcloud_dataproc/submit.py",
        "--hail-version 0.1",
        "--cluster", dataproc_cluster_name,
        "hail_scripts/elasticsearch_ops/cluster_routing_rebalance.py",
        "--host", host,
        "--port", port,
        "--enable" if enable else "--disable",
    ])))
Example n. 8
def submit_load_dataset_to_es_job(
        dataproc_cluster_name,
        start_with_step=0,
        stop_after_step=None,
        other_load_dataset_to_es_args=()):

    # submit job
    run(" ".join(map(str, [
        "python3 ./gcloud_dataproc/submit.py",
        "--hail-version 0.1",
        "--cluster %(dataproc_cluster_name)s",
        "hail_scripts/v01/load_dataset_to_es.py",
        "--stop-after-step %(stop_after_step)s " if stop_after_step is not None else "",
        "--start-with-step %(start_with_step)s ",
        ] + list(other_load_dataset_to_es_args))) % locals())
Example n. 9
def _create_persistent_es_nodes(settings):
    # make sure cluster exists - create cluster with 1 node
    run(
        " ".join([
            "gcloud container clusters create %(CLUSTER_NAME)s",
            "--machine-type %(CLUSTER_MACHINE_TYPE)s",
            "--num-nodes 1",  # "--scopes https://www.googleapis.com/auth/devstorage.read_write"
        ]) % settings,
        errors_to_ignore=["Already exists"])

    _set_k8s_context(settings)

    # create additional nodes
    run(" ".join([
        "gcloud container node-pools create es-persistent-nodes",
        "--cluster %(CLUSTER_NAME)s",
        "--machine-type %(CLUSTER_MACHINE_TYPE)s",
        "--num-nodes " + str(int(settings.get("ES_DATA_NUM_PODS", 1)) - 1),
    ]) % settings,
        errors_to_ignore=["Already exists"])

    # deploy elasticsearch
    _process_kubernetes_configs(
        "create",
        settings=settings,
        config_paths=[
            #"./gcloud_dataproc/utils/elasticsearch_cluster/es-configmap.yaml",
            "./kubernetes/elasticsearch-sharded/es-namespace.yaml",
            "./kubernetes/elasticsearch-sharded/es-discovery-svc.yaml",
            "./kubernetes/elasticsearch-sharded/es-master.yaml",
            "./kubernetes/elasticsearch-sharded/es-svc.yaml",
            "./kubernetes/elasticsearch-sharded/es-kibana.yaml",
        ])

    wait_until_pod_is_running("es-kibana")

    _process_kubernetes_configs(
        "create",
        settings=settings,
        config_paths=[
            "./kubernetes/elasticsearch-sharded/es-client.yaml",
            "./kubernetes/elasticsearch-sharded/es-data-stateful.yaml",
            "./kubernetes/elasticsearch-sharded/es-data-svc.yaml",
        ])

    _wait_for_data_nodes_state("create", settings, data_node_name="es-data")
def get_gcloud_file_stats(gs_path):
    if gs_path.endswith(".vds"):
        gs_path += "/metadata.json.gz"  # set path to a file inside the .vds directory because gsutil stat works only on files.

    gsutil_stat_output = run("gsutil stat %(gs_path)s" % locals(),
                             print_command=False,
                             verbose=False,
                             ignore_all_errors=True)
    """
    Example gsutil stat output:

    Creation time:          Fri, 09 Jun 2017 09:36:23 GMT
    Update time:            Fri, 09 Jun 2017 09:36:23 GMT
    Storage class:          REGIONAL
    Content-Length:         363620675
    Content-Type:           text/x-vcard
    Hash (crc32c):          SWOktA==
    Hash (md5):             fEdIumyOFR7HvULeAwXCwQ==
    ETag:                   CMae+J67sNQCEAE=
    Generation:             1497000983793478
    Metageneration:         1
    """

    if not gsutil_stat_output:
        return None

    EMPTY_MATCH_OBJ = re.match("()", "")
    DATE_FORMAT = '%a, %d %b %Y %H:%M:%S %Z'

    creation_time = (re.search("Creation.time:[\s]+(.+)", gsutil_stat_output,
                               re.IGNORECASE) or EMPTY_MATCH_OBJ).group(1)
    update_time = (re.search("Update.time:[\s]+(.+)", gsutil_stat_output,
                             re.IGNORECASE) or EMPTY_MATCH_OBJ).group(1)
    file_size = (re.search("Content-Length:[\s]+(.+)", gsutil_stat_output,
                           re.IGNORECASE) or EMPTY_MATCH_OBJ).group(1)
    file_md5 = (re.search("Hash (md5):[\s]+(.+)", gsutil_stat_output,
                          re.IGNORECASE) or EMPTY_MATCH_OBJ).group(1)

    ctime = time.mktime(time.strptime(creation_time, DATE_FORMAT))
    mtime = time.mktime(time.strptime(update_time, DATE_FORMAT))
    return FileStats(ctime=ctime, mtime=mtime, size=file_size, md5=file_md5)
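A small sketch of the `or EMPTY_MATCH_OBJ` fallback used above: when a field is missing from the `gsutil stat` output, `group(1)` comes from the empty match and yields `""` instead of raising; the sample line is taken from the example output shown in the function.

import re

EMPTY_MATCH_OBJ = re.match("()", "")
sample_output = "Creation time:          Fri, 09 Jun 2017 09:36:23 GMT"

creation_time = (re.search(r"Creation.time:[\s]+(.+)", sample_output,
                           re.IGNORECASE) or EMPTY_MATCH_OBJ).group(1)
file_size = (re.search(r"Content-Length:[\s]+(.+)", sample_output,
                       re.IGNORECASE) or EMPTY_MATCH_OBJ).group(1)

assert creation_time == "Fri, 09 Jun 2017 09:36:23 GMT"
assert file_size == ""  # missing field falls back to an empty string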
Example n. 11
def _set_k8s_context(settings):
    run("gcloud container clusters get-credentials %(CLUSTER_NAME)s" %
        settings)
    run("kubectl config set-context $(kubectl config current-context) --namespace=%(NAMESPACE)s"
        % settings)
Example n. 12
                                      "localhost"))
p.add_argument("--port", help="Elastisearch port", default="9200")
p.add_argument(
    "--k8s-cluster-name",
    help="Specifies the kubernetes cluster name that hosts elasticsearch.",
    required=True)
args = p.parse_args()

client = ElasticsearchClient(args.host, args.port)
wait_for_loading_shards_transfer(client, num_attempts=1)

settings = _get_es_node_settings(args.k8s_cluster_name,
                                 args.num_temp_loading_nodes)
_set_k8s_context(settings)

_process_kubernetes_configs(
    "delete",
    settings=settings,
    config_paths=[
        "./kubernetes/elasticsearch-sharded/es-data-stateless-local-ssd.yaml",
    ])
_wait_for_data_nodes_state("delete", settings)

run("echo Y | gcloud container node-pools delete --cluster {} loading-cluster".
    format(args.k8s_cluster_name))

# delete firewall rule
firewall_rule_name = _compute_firewall_rule_name(args.k8s_cluster_name)
run("echo Y | gcloud compute firewall-rules delete {}s".format(
    firewall_rule_name))
               type=int,
               help="Number of es nodes to create.",
               default=3)
p.add_argument(
    "--k8s-cluster-name",
    help="Specifies the kubernetes cluster name that hosts elasticsearch.",
    required=True)
args = p.parse_args()

settings = _get_es_node_settings(args.k8s_cluster_name, args.num_nodes)
load_settings([], settings)

# make sure cluster exists - create cluster with 1 node
run(" ".join([
    "gcloud container clusters create %(CLUSTER_NAME)s",
    "--machine-type %(CLUSTER_MACHINE_TYPE)s",
    "--num-nodes 1",  # "--scopes https://www.googleapis.com/auth/devstorage.read_write"
]) % settings)

_set_k8s_context(settings)

# create additional nodes
run(" ".join([
    "gcloud container node-pools create es-persistent-nodes",
    "--cluster %(CLUSTER_NAME)s",
    "--machine-type %(CLUSTER_MACHINE_TYPE)s",
    "--num-nodes " + str(int(settings.get("ES_DATA_NUM_PODS", 1)) - 1),
]) % settings)

# deploy elasticsearch
_process_kubernetes_configs(
import os
import sys
from kubernetes.shell_utils import simple_run as run

if len(sys.argv) < 2:
    sys.exit("Must provide OMIM download key as command line arg (https://www.omim.org/downloads/)")

omim_download_key = sys.argv[1]

DOWNLOAD_PATH = "https://data.omim.org/downloads/%(omim_download_key)s/genemap2.txt" % locals()
GCLOUD_BUCKET_PATH = "gs://seqr-reference-data/omim"


filename = os.path.basename(DOWNLOAD_PATH)

run("wget -O {filename} {DOWNLOAD_PATH}".format(**locals()))

run("""/bin/bash -c "cat <(grep '^# Chromosome.*Genomic' {filename}) <(grep -v '^#' {filename}) > {filename}.temp" """.format(**locals()))
run("mv {filename}.temp {filename}".format(**locals()))

run("gsutil -m cp {filename} {GCLOUD_BUCKET_PATH}/{filename}".format(**locals()))

run(" ".join([
    "python gcloud_dataproc/v01/run_script.py",
    "--cluster omim",
    "hail_scripts/v01/convert_tsv_to_key_table.py",
    "--key-by 'Ensembl Gene ID'",
    "{GCLOUD_BUCKET_PATH}/{filename}"
]).format(**locals()))
Example n. 15
#!/usr/bin/env python

from kubernetes.shell_utils import simple_run as run

for vcf_path in [
        "gs://seqr-reference-data/GRCh37/TopMed/bravo-dbsnp-all.removed_chr_prefix.liftunder_GRCh37.vcf.gz",
        "gs://seqr-reference-data/GRCh38/TopMed/bravo-dbsnp-all.vcf.gz"
]:
    run(" ".join([
        "python gcloud_dataproc/v01/run_script.py",
        "--cluster topmed",
        "hail_scripts/v01/convert_vcf_to_vds.py",
        "--sites-only",
        "{vcf_path}",
    ]).format(**locals()))
Example n. 16
def main():
    os.chdir(os.path.join(os.path.dirname(__file__), ".."))

    # get command-line args
    args, unparsed_args = init_command_line_args()

    # forward unparsed and other args to the load_dataset_to_es.py script
    load_dataset_to_es_args = unparsed_args

    load_dataset_to_es_args.extend([
        "--host",
        args.host,
        "--port",
        args.port,
        "--genome-version",
        args.genome_version,
        "--project-guid",
        args.project_guid,
        "--use-temp-loading-nodes" if args.use_temp_loading_nodes else "",
        args.input_dataset,
    ])

    # download .fam file?
    is_fam_file_specified = "--fam-file" in unparsed_args
    is_subset_samples_file_specified = "--subset-samples" in unparsed_args

    if args.download_fam_file and (not is_fam_file_specified
                                   or not is_subset_samples_file_specified):
        input_dataset_directory = os.path.dirname(args.input_dataset) or "."

        # prompt for seqr username and password
        seqr_username = args.seqr_username or input("seqr username: ")
        seqr_password = getpass.getpass("seqr password: ")  # assumes `import getpass` at module level

        # download the .fam file (and, if requested, a subset-samples file) from seqr using these
        # credentials; this step sets fam_file_path and subset_samples_file_path, which are used below

        # upload the .fam file to the input dataset directory
        if not is_fam_file_specified:
            fam_file_gcloud_path = os.path.join(
                input_dataset_directory, os.path.basename(fam_file_path))
            run("gsutil cp %(fam_file_path)s %(fam_file_gcloud_path)s" %
                locals())
            load_dataset_to_es_args.extend(
                ["--fam-file", fam_file_gcloud_path])

        # upload subset-samples to vcf_directory
        if not is_subset_samples_file_specified:
            subset_samples_file_gcloud_path = os.path.join(
                input_dataset_directory,
                os.path.basename(subset_samples_file_path))
            run("gsutil cp %(subset_samples_file_path)s %(subset_samples_file_gcloud_path)s"
                % locals())
            load_dataset_to_es_args.extend(
                ["--subset-samples", subset_samples_file_gcloud_path])

    # run pipeline with or without using a temp elasticsearch cluster for loading
    if args.use_temp_loading_nodes and (args.stop_after_step is None
                                        or args.stop_after_step > 1):
        # make sure kubectl is installed
        run("kubectl version --client")

        # run vep and compute derived annotations before create temp elasticsearch loading nodes
        if args.start_with_step <= 1:
            # make sure cluster exists
            _create_dataproc_cluster_v02(
                args.cluster_name,
                args.genome_version,
                num_workers=args.num_workers,
                num_preemptible_workers=args.num_preemptible_workers)
            submit_load_dataset_to_es_job_v02(
                args.cluster_name,
                start_with_step=args.start_with_step,
                stop_after_step=1,
                other_load_dataset_to_es_args=load_dataset_to_es_args,
                use_seqr_loading_optimized_pipeline=args.
                use_seqr_loading_optimized_pipeline)

        # create temp es nodes
        settings = _get_es_node_settings(args.k8s_cluster_name,
                                         args.num_temp_loading_nodes)

        ip_address = _create_es_nodes(settings)

        # _enable_cluster_routing_rebalance(False, args.cluster_name, ip_address, args.port)

        # make sure cluster exists
        _create_dataproc_cluster_v02(
            args.cluster_name,
            args.genome_version,
            num_workers=args.num_workers,
            num_preemptible_workers=args.num_preemptible_workers)

        # continue pipeline starting with loading steps, stream data to the new elasticsearch instance at ip_address
        submit_load_dataset_to_es_job_v02(
            args.cluster_name,
            start_with_step=max(
                2, args.start_with_step),  # start with step 2 or later
            stop_after_step=args.stop_after_step,
            other_load_dataset_to_es_args=load_dataset_to_es_args +
            ["--host %(ip_address)s" % locals()],
            es_host=ip_address,
            use_seqr_loading_optimized_pipeline=args.
            use_seqr_loading_optimized_pipeline)

        # _enable_cluster_routing_rebalance(True, args.cluster_name, ip_address, args.port)

    else:
        # make sure cluster exists
        _create_dataproc_cluster_v02(
            args.cluster_name,
            args.genome_version,
            num_workers=args.num_workers,
            num_preemptible_workers=args.num_preemptible_workers)

        submit_load_dataset_to_es_job_v02(
            args.cluster_name,
            start_with_step=args.start_with_step,
            stop_after_step=args.stop_after_step,
            other_load_dataset_to_es_args=load_dataset_to_es_args,
            use_seqr_loading_optimized_pipeline=args.
            use_seqr_loading_optimized_pipeline)
#!/usr/bin/env python

from kubernetes.shell_utils import simple_run as run

run(" ".join([
    "python gcloud_dataproc/v01/run_script.py",
    "--cluster gnomad-coverage",
    "download_and_create_reference_datasets/v01/hail_scripts/write_gnomad_coverage_vds.py",
]))
Example n. 18
#!/usr/bin/env python

import os
from kubernetes.shell_utils import simple_run as run

DOWNLOAD_PATH = "ftp://ftp.broadinstitute.org/pub/ExAC_release/release0.3.1/functional_gene_constraint/fordist_cleaned_exac_r03_march16_z_pli_rec_null_data.txt"
GCLOUD_BUCKET_PATH = "gs://seqr-reference-data/gene_constraint"

filename = os.path.basename(DOWNLOAD_PATH)

run("wget -O {filename} {DOWNLOAD_PATH}".format(**locals()))

run("""/bin/bash -c "cat {filename} | sed 's/\(ENST[0-9]*\)\.[0-9]/\\1/' > {filename}.temp" """
    .format(**locals()))
run("mv {filename}.temp {filename}".format(**locals()))
run("gsutil -m cp {filename} {GCLOUD_BUCKET_PATH}/{filename}".format(
    **locals()))

run(" ".join([
    "python gcloud_dataproc/v01/run_script.py",
    "--cluster gene-constraint",
    "hail_scripts/v01/convert_tsv_to_key_table.py",
    "--key-by 'transcript'",
    "{GCLOUD_BUCKET_PATH}/{filename}",
]).format(**locals()))
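A tiny illustration (in Python rather than sed, purely for clarity) of the transcript-ID cleanup performed above before the table is keyed by transcript.

import re

# ENST00000263100.4 -> ENST00000263100, so the column can be used as a join key
assert re.sub(r"(ENST[0-9]*)\.[0-9]", r"\1", "ENST00000263100.4\t0.99") == "ENST00000263100\t0.99"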
Example n. 19
#!/usr/bin/env python

import argparse
from kubernetes.shell_utils import simple_run as run

genome_versions = ['37', '38']

p = argparse.ArgumentParser()
args, unparsed_args = p.parse_known_args()

script_args = " ".join(['"%s"' % arg for arg in unparsed_args])

run(" ".join([
    "python gcloud_dataproc/v01/run_script.py",
    "--cluster gnomad",
    "download_and_create_reference_datasets/v01/hail_scripts/write_gnomad_vds.py",
    "{script_args}",
]).format(**locals()))
#!/usr/bin/env python

from kubernetes.shell_utils import simple_run as run

for dbnsfp_gene_table_path in [
    "gs://seqr-reference-data/GRCh37/dbNSFP/v2.9.3/dbNSFP2.9_gene",
    "gs://seqr-reference-data/GRCh38/dbNSFP/v3.5/dbNSFP3.5_gene"
]:
    run(" ".join([
        "python gcloud_dataproc/v01/run_script.py",
        "--cluster dbnsfp",
        "hail_scripts/v01/convert_tsv_to_key_table.py",
        "{dbnsfp_gene_table_path}"
    ]).format(**locals()))
#!/usr/bin/env python3

from kubernetes.shell_utils import simple_run as run

for genome_version, vcf_path in [
    ("37", "gs://seqr-reference-data/GRCh37/MPC/fordist_constraint_official_mpc_values.vcf.gz"),
    ("38", "gs://seqr-reference-data/GRCh38/MPC/fordist_constraint_official_mpc_values.liftover.GRCh38.vcf.gz"),
]:
    run(("python3 gcloud_dataproc/v02/run_script.py "
        "--cluster create-ht-mpc "
        "hail_scripts/v02/convert_vcf_to_hail.py "
        "--output-sites-only-ht "
        f"--genome-version {genome_version} "
        f"{vcf_path}"))
#!/usr/bin/env python3

from kubernetes.shell_utils import simple_run as run

run(("python3 gcloud_dataproc/v02/run_script.py "
     "--cluster create-ht-cadd "
     "download_and_create_reference_datasets/v02/hail_scripts/write_cadd_ht.py"
     ))
Example n. 23
#!/usr/bin/env python3

import argparse
from kubernetes.shell_utils import simple_run as run

parser = argparse.ArgumentParser()
parser.add_argument('-b',
                    '--build',
                    help='Reference build, 37 or 38',
                    choices=["37", "38"],
                    required=True)
args = parser.parse_args()

run((
    "python3 gcloud_dataproc/v02/run_script.py "
    "--cluster create-ht-combined-reference-data "
    "download_and_create_reference_datasets/v02/hail_scripts/write_combined_reference_data_ht.py "
    f"--build {args.build}"))
#!/usr/bin/env python

from kubernetes.shell_utils import simple_run as run

run(" ".join([
    "python gcloud_dataproc/v01/run_script.py",
    "--cluster cadd",
    "download_and_create_reference_datasets/v01/hail_scripts/write_cadd_vds.py",
]))
import argparse
import os
import random
import sys

from kubernetes.shell_utils import simple_run as run

unique_id = random.randint(10**5, 10**6 - 1)
random_cluster_name = "without-vep-%s" % unique_id

p = argparse.ArgumentParser()
p.add_argument("-c", "--cluster", default=random_cluster_name)
p.add_argument("script")

args, unparsed_args = p.parse_known_args()

cluster_name = args.cluster
script = args.script
script_args = " ".join(['"%s"' % arg for arg in unparsed_args])

os.chdir(os.path.join(os.path.dirname(__file__), "../.."))

run("./gcloud_dataproc/v02/create_cluster_without_VEP.py %(cluster_name)s 2 12"
    % locals())

if "-h" in sys.argv or "--help" in sys.argv:
    run("python %(script)s -h" % locals())
    sys.exit(0)

run(("time ./gcloud_dataproc/submit.py "
     "--hail-version 0.2 "
     "--cluster %(cluster_name)s "
     "%(script)s %(script_args)s") % locals())
Example n. 26
#!/usr/bin/env python

"""
This script creates snapshots of disks bound to PersistentVolumeClaims in the "current" kubernetes cluster.
"""

import argparse
import time
from kubernetes.shell_utils import run

p = argparse.ArgumentParser()
p.add_argument("--zone", help="gcloud zone", default="us-central1-b")
args = p.parse_args()

output = run("kubectl get pvc -o jsonpath='{.items[*].spec.volumeName}'")
disk_names = output.split()

timestamp = time.strftime("%Y%m%d-%H%M%S")
snapshot_names = ["snap-%s--%s" % (timestamp, disk_name) for disk_name in disk_names]

disk_names = " ".join(disk_names)
snapshot_names = ",".join(snapshot_names)
zone = args.zone

run("gcloud compute disks snapshot %(disk_names)s --snapshot-names %(snapshot_names)s --zone=%(zone)s" % locals())
import argparse
import os
import random
import sys

from kubernetes.shell_utils import simple_run as run

unique_id = random.randint(10**5, 10**6 - 1)
random_cluster_name = "vep-grch37-%s" % unique_id

p = argparse.ArgumentParser()
p.add_argument("-c", "--cluster", default=random_cluster_name)
p.add_argument("script")

args, unparsed_args = p.parse_known_args()

cluster_name = args.cluster
script = args.script
script_args = " ".join(['"%s"' % arg for arg in unparsed_args])

os.chdir(os.path.join(os.path.dirname(__file__), "../.."))

run("python gcloud_dataproc/v01/create_cluster_GRCh37.py %(cluster_name)s 2 12"
    % locals())

if "-h" in sys.argv or "--help" in sys.argv:
    run("python %(script)s -h" % locals())
    sys.exit(0)

run(("time ./gcloud_dataproc/submit.py "
     "--hail-version 0.1 "
     "--cluster %(cluster_name)s "
     "%(script)s %(script_args)s") % locals())
Example n. 28
#!/usr/bin/env python3

from kubernetes.shell_utils import simple_run as run

run((
    "python3 gcloud_dataproc/v02/run_script.py "
    "--cluster create-gnomad-38-hts "
    "download_and_create_reference_datasets/v02/hail_scripts/write_gnomad_38_hts.py"
))
def _make_disks(settings, es_disk_snapshots=None):
    """Create persistent disks from snapshots

    Args:
        es_disk_snapshots (list): optional list of snapshot names
    """

    # create disks from snapshots
    created_disks = []
    if es_disk_snapshots:
        for i, snapshot_name in enumerate(es_disk_snapshots):
            disk_name = "es-data-%s--%d" % (
                settings["CLUSTER_NAME"], i
            )  # time.strftime("%y%m%d-%H%M%S")  - make the timestamp year-month-day so a bunch of disks don't get created accidentally

            run(" ".join([
                "gcloud compute disks create " + disk_name,
                "--type pd-ssd",
                "--source-snapshot " + snapshot_name,
            ]) % settings,
                errors_to_ignore=["lready exists"])

            disk_size = settings[
                "ELASTICSEARCH_DISK_SIZE"]  # TODO GET SNAPSHOT DISK SIZE from gcloud compute disks describe ...

            created_disks.append((disk_name, disk_size))
    else:
        for i in range(settings["ES_NUM_PERSISTENT_NODES"]):
            disk_name = "es-data-%s--%d" % (settings["CLUSTER_NAME"], i)

            run(" ".join([
                "gcloud compute disks create " + disk_name,
                "--type pd-ssd",
                "--size %(ELASTICSEARCH_DISK_SIZE)s",
            ]) % settings,
                errors_to_ignore=["lready exists"])

            created_disks.append(
                (disk_name, settings["ELASTICSEARCH_DISK_SIZE"]))

    # create PersistentVolume objects for disk
    namespace = settings["NAMESPACE"]
    for i, (existing_disk_name,
            elasticsearch_disk_size) in enumerate(created_disks):

        with tempfile.NamedTemporaryFile("w") as f:
            f.write("""apiVersion: v1
kind: PersistentVolume
metadata:
  name: %(existing_disk_name)s
  namespace: %(namespace)s
spec:
  capacity:
    storage: %(elasticsearch_disk_size)s
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Retain
  storageClassName: ssd-storage-class
  gcePersistentDisk:
    fsType: ext4
    pdName: %(existing_disk_name)s
""" % locals())

            f.flush()
            file_path = f.name
            run("kubectl create -f %(file_path)s" % locals(),
                print_command=True,
                errors_to_ignore=["already exists"])
Example n. 30
#!/usr/bin/env python

import argparse
from kubernetes.shell_utils import simple_run as run

genome_versions = ['37', '38']

p = argparse.ArgumentParser()
p.add_argument("-g",
               "--genome-version",
               help="Genome build: 37 or 38",
               choices=genome_versions)
args, unparsed_args = p.parse_known_args()

script_args = " ".join(['"%s"' % arg for arg in unparsed_args])

cluster_name = 'create-all-reference-data-vds'

if args.genome_version:
    cluster_name += "-grch" + args.genome_version
    genome_versions = [args.genome_version]

for genome_version in genome_versions:
    run(" ".join([
        "python gcloud_dataproc/v01/run_script.py",
        "--cluster {cluster_name}",
        "download_and_create_reference_datasets/v01/hail_scripts/combine_all_variant_level_reference_data.py",
        "--genome-version {genome_version} {script_args}",
    ]).format(**locals()))