Exemple #1
0
def _init_dataproc_cluster(settings, assembly="37"):
    """Launch the Dataproc cluster named "seqr-backend-export-cluster".

    Fills the zone and project into a `gcloud dataproc clusters create`
    command, hands it to `_run_shell_command`, and waits on the returned
    handle.

    Args:
        settings (dict): global deployment settings; the 'GCLOUD_ZONE' and
            'GCLOUD_PROJECT' keys are read (KeyError if either is missing).
        assembly (string): "37" or "38". Not referenced by the command that
            is currently issued.
    """

    # TODO come up with a way to run hail locally

    substitutions = {
        'GCLOUD_ZONE': settings['GCLOUD_ZONE'],
        'GCLOUD_PROJECT': settings['GCLOUD_PROJECT'],
    }

    # gs://hail-common/vep/vep/GRCh%(assembly)s/vep85-GRCh%(assembly)s-init.sh
    # NOTE: the trailing backslashes are line continuations inside a non-raw
    # Python string, so the whole gcloud invocation ends up on a single line.
    create_cluster_template = """
    gcloud dataproc clusters create seqr-backend-export-cluster  \
        --zone %(GCLOUD_ZONE)s \
        --master-machine-type n1-standard-8 \
        --master-boot-disk-size 100 \
        --num-workers 2 \
        --worker-machine-type n1-standard-8 \
        --worker-boot-disk-size 100 \
        --num-preemptible-workers 2 \
        --image-version 1.1 \
        --project %(GCLOUD_PROJECT)s \
        --initialization-actions "gs://hail-common/hail-init.sh"
    """

    process = _run_shell_command(create_cluster_template % substitutions)
    process.wait()
Exemple #2
0
def _submit_to_hail(settings, script_path, node_name, vds_path):
    """Submit a pyspark job to the "seqr-backend-export-cluster" Dataproc cluster.

    The job is submitted with `gcloud dataproc jobs submit pyspark`, shipping
    the hail jar and python zip from gs://seqr-hail/hail/, and this function
    waits on the handle returned by `_run_shell_command`.

    Args:
        settings (dict): global deployment settings. Not referenced here; the
            gcloud project is hard-coded to "seqr-project" in the command.
        script_path (string): path of the pyspark script to submit
        node_name (string): passed to the script as its first argument
        vds_path (string): passed to the script as its second argument
    """
    substitutions = {
        'script_path': script_path,
        'node_name': node_name,
        'vds_path': vds_path,
    }

    # NOTE: the trailing backslashes are line continuations inside a non-raw
    # Python string, so the shell sees one single-line gcloud invocation.
    submit_template = """
    gcloud --project seqr-project dataproc jobs submit pyspark %(script_path)s \
           --cluster seqr-backend-export-cluster \
           --files=gs://seqr-hail/hail/hail-all-spark.jar \
           --py-files=gs://seqr-hail/hail/hail-python.zip \
           --properties=spark.driver.extraClassPath=./hail-all-spark.jar,spark.executor.extraClassPath=./hail-all-spark.jar \
           -- %(node_name)s %(vds_path)s
    """

    process = _run_shell_command(submit_template % substitutions)
    process.wait()
Exemple #3
0
def load_project_cassandra(
        deployment_label,
        project_id="1kg",
        assembly="37",
        vds_path="gs://seqr-hail/annotated/Cohen.1kpart.vds"):
    """Export VDS to cassandra

    Steps, in order: create the Dataproc cluster, drop and re-create the
    `seqr` keyspace and `seqr.seqr` table through cqlsh in the cassandra pod,
    submit scripts/loading/export_to_cass.py to hail, then print the row count
    of `seqr.seqr`.

    Args:
        deployment_label (string): "local", "gcloud-dev", or "gcloud-prod"
        project_id (string): project id (currently not referenced by this function)
        assembly (string): reference genome version - either "37" or "38"
        vds_path (string): path of annotated VDS
    """

    check_kubernetes_context(deployment_label)

    settings = retrieve_settings(deployment_label)

    _init_dataproc_cluster(settings, assembly=assembly)

    pod_name = lookup_json_path("pods",
                                labels={'name': 'cassandra'},
                                json_path=".items[0].metadata.name")

    def run_cqlsh(statements):
        """Pipe the given CQL statements into cqlsh inside the cassandra pod."""
        # A here-document terminator is only recognised when it sits alone at
        # the very start of a line. The command is therefore assembled
        # explicitly rather than written as an indented triple-quoted literal,
        # which would leave "    EOF" unrecognised and feed it to cqlsh as input.
        cql_script = "\n".join(statements)
        command = "kubectl exec -i %s -- cqlsh <<EOF\n%s\nEOF\n" % (
            pod_name, cql_script)
        _run_shell_command(command).wait()

    # start from an empty keyspace/table (this discards any existing seqr data)
    run_cqlsh([
        "DROP KEYSPACE IF EXISTS seqr;",
        "CREATE KEYSPACE seqr WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'}  AND durable_writes = true;",
        "",
        "CREATE TABLE seqr.seqr (chrom text, start int, ref text, alt text, dataset_5fid text, PRIMARY KEY (chrom, start, ref, alt, dataset_5fid));",
    ])

    script_path = "scripts/loading/export_to_cass.py"
    node_name = lookup_json_path("pods",
                                 labels={'name': 'cassandra'},
                                 json_path=".items[0].spec.nodeName")

    _submit_to_hail(settings, script_path, node_name, vds_path)

    # report how many rows were loaded
    run_cqlsh(["select count(*) from seqr.seqr;"])
Exemple #4
0
def load_project_solr(deployment_label,
                      project_id="1kg",
                      assembly="37",
                      vds_path="gs://seqr-hail/annotated/Cohen.1kpart.vds"):
    """Export VDS to solr

    Steps, in order: create the Dataproc cluster, delete and re-create the
    `seqr_noref` solr collection in the solr pod, submit
    scripts/loading/export_to_solr.py to hail, then query the collection
    with curl from inside the pod.

    Args:
        deployment_label (string): "local", "gcloud-dev", or "gcloud-prod"
        project_id (string): project id (currently not referenced by this function)
        assembly (string): reference genome version - either "37" or "38"
        vds_path (string): path of annotated VDS
    """

    check_kubernetes_context(deployment_label)

    deployment_settings = retrieve_settings(deployment_label)
    _init_dataproc_cluster(deployment_settings, assembly=assembly)

    pod_name = lookup_json_path(
        "pods",
        labels={'name': 'solr'},
        json_path=".items[0].metadata.name")

    # Reset the collection. "|| true" keeps the shell exit status at zero even
    # when the solr command itself fails.
    for solr_action in ("delete", "create_collection"):
        reset_command = (
            "kubectl exec %s -- su -c '/usr/local/solr-6.4.2/bin/solr %s -c seqr_noref' solr || true"
            % (pod_name, solr_action))
        _run_shell_command(reset_command).wait()

    solr_node = lookup_json_path(
        "pods",
        labels={'name': 'solr'},
        json_path=".items[0].spec.nodeName")

    _submit_to_hail(deployment_settings,
                    "scripts/loading/export_to_solr.py",
                    solr_node,
                    vds_path)

    # query everything in the collection
    query_template = "kubectl exec -i %(pod_name)s -- /bin/bash -c \"curl 'http://localhost:30002/solr/seqr_noref/select?indent=on&q=*:*&wt=json'\""
    _run_shell_command(query_template % {'pod_name': pod_name}).wait()
Exemple #5
0
def load_allele_frequencies(deployment_label, assembly="37"):
    """Load ExAC and 1kg allele frequency datasets. These are larger and take longer to load than other reference data

    Downloads both VCFs into /data/reference_data/ inside the 'seqr' pod and
    then runs `manage.py load_reference` there.

    Args:
        deployment_label (string): passed to check_kubernetes_context
        assembly (string): reference genome version - either "37" or "38"
            (currently not referenced by this function)

    Raises:
        ValueError: if no 'seqr' pod name could be looked up
    """

    check_kubernetes_context(deployment_label)

    pod_name = _get_pod_name('seqr')
    if not pod_name:
        raise ValueError(
            "No 'seqr' pods found. Is the kubectl environment configured in this terminal? and has this type of pod been deployed?"
        )

    # every step runs inside the pod through the same "kubectl exec" prefix
    exec_prefix = "kubectl exec %s -- " % (pod_name, )
    in_pod_commands = (
        "wget -N http://seqr.broadinstitute.org/static/bundle/ExAC.r0.3.sites.vep.popmax.clinvar.vcf.gz -P /data/reference_data/",
        "wget -N http://seqr.broadinstitute.org/static/bundle/ALL.wgs.phase3_shapeit2_mvncall_integrated_v5a.20130502.sites.decomposed.with_popmax.vcf.gz -P /data/reference_data/",
        "python2.7 -u manage.py load_reference",
    )

    # run sequentially, waiting on each command before starting the next
    for in_pod_command in in_pod_commands:
        _run_shell_command(exec_prefix + in_pod_command).wait()
Exemple #6
0
def deploy(deployment_label,
           component=None,
           output_dir=None,
           other_settings=None):
    """Render configs into a fresh output directory and run the deployment scripts.

    Args:
        deployment_label (string): one of the DEPLOYMENT_LABELS  (eg. "local", or "gcloud")
        component (string): optionally specifies one of the components from the DEPLOYABLE_COMPONENTS lists (eg. "postgres" or "phenotips").
            If this is set to None, all DEPLOYABLE_COMPONENTS will be deployed in sequence.
        output_dir (string): path of directory where to put deployment logs and rendered config files.
            Defaults to "deployments/<timestamp>_<deployment_label>".
        other_settings (dict): a dictionary of other key-value pairs for use during deployment.
            None (the default) means no extra settings.

    Side effects:
        Adds a file handler to the module logger, changes the process working
        directory to output_dir, and runs each selected deployment script.
    """

    check_kubernetes_context(deployment_label)

    timestamp = time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime())
    output_dir = output_dir or "deployments/%s_%s" % (timestamp,
                                                      deployment_label)

    # configure logging output
    log_dir = os.path.join(output_dir, "logs")
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir)
    log_file_path = os.path.join(log_dir, "deploy.log")
    # FileHandler owns the file object, so it gets closed on logging shutdown
    # instead of leaking a bare open() handle.
    file_handler = logging.FileHandler(log_file_path, mode="w")
    file_handler.setLevel(logging.INFO)
    logger.addHandler(file_handler)
    logger.info("Starting log file: %s", log_file_path)

    # parse config files
    settings = retrieve_settings(deployment_label)
    if other_settings:
        settings.update(other_settings)

    # Make every setting also available under its upper-cased key. Iterate over
    # a snapshot because new keys may be inserted during the loop.
    for key, value in list(settings.items()):
        key = key.upper()
        settings[key] = value
        logger.info("%s = %s", key, value)

    # copy configs, templates and scripts to output directory
    output_base_dir = os.path.join(output_dir, 'configs')
    input_base_dir = os.path.join(BASE_DIR, 'templates')
    template_paths = (glob.glob("templates/*/*.*") +
                      glob.glob("templates/*/*/*.*"))
    for file_path in template_paths:
        file_path = file_path.replace('templates/', '')
        render(template_processor, input_base_dir, file_path, settings,
               output_base_dir)

    for file_path in glob.glob("scripts/*.sh"):
        render(script_processor, BASE_DIR, file_path, settings, output_dir)

    # shutil.copy needs the destination directory to exist; otherwise it would
    # create a regular file called "configs".
    if not os.path.isdir(output_base_dir):
        os.makedirs(output_base_dir)

    for file_path in glob.glob("scripts/*.py"):
        shutil.copy(file_path, output_base_dir)

    for file_path in glob.glob("config/*.yaml"):
        shutil.copy(file_path, output_base_dir)

    # copy docker directory to output directory
    docker_src_dir = os.path.join(BASE_DIR, "../docker/")
    docker_dest_dir = os.path.join(output_dir, "docker")
    logger.info("Copying %s to %s", docker_src_dir, docker_dest_dir)
    shutil.copytree(docker_src_dir, docker_dest_dir)

    # copy secrets directory
    secrets_subdir = "secrets/%s" % (deployment_label, )
    secrets_src_dir = os.path.join(BASE_DIR, secrets_subdir)
    secrets_dest_dir = os.path.join(output_dir, secrets_subdir)
    logger.info("Copying %s to %s", secrets_src_dir, secrets_dest_dir)
    shutil.copytree(secrets_src_dir, secrets_dest_dir)

    # deploy
    if component:
        # always keep the 'init' scripts; match the component name with either
        # dashes or underscores
        component_names = (component, component.replace('-', '_'))
        deployment_scripts = [
            s for s in DEPLOYMENT_SCRIPTS
            if 'init' in s or any(name in s for name in component_names)
        ]
    elif deployment_label == "gcloud-dev":
        deployment_scripts = DEPLOYMENT_SCRIPTS
    else:
        # every other label skips the solr / cassandra / database_api scripts
        skipped_keywords = ("solr", "cassandra", "database_api")
        deployment_scripts = [
            s for s in DEPLOYMENT_SCRIPTS
            if not any(k in s for k in skipped_keywords)
        ]

    os.chdir(output_dir)
    logger.info("Switched to %s", output_dir)

    for path in deployment_scripts:
        logger.info("=========================")
        _run_shell_command(path, verbose=True).wait()
Exemple #7
0
def load_project(deployment_label,
                 project_id="1kg",
                 assembly="37",
                 vcf=None,
                 ped=None):
    """Load example project

    Downloads the VCF and PED files into the 'seqr' pod with wget, then runs
    the series of `manage.py` commands that create the project, add its
    individuals and VCF, set it up in phenotips, and load it.

    Args:
        deployment_label (string): passed to check_kubernetes_context
        project_id (string): project id
        assembly (string): reference genome version - either "37" or "38"
            (currently not referenced by this function)
        vcf (string): VCF path
        ped (string): PED path

    Raises:
        ValueError: if no 'seqr' pod name could be looked up, or if
            project_id, vcf or ped is empty
    """

    check_kubernetes_context(deployment_label)

    pod_name = _get_pod_name('seqr')
    if not pod_name:
        raise ValueError(
            "No 'seqr' pods found. Is the kubectl environment configured in this terminal? and has this type of pod been deployed?"
        )

    # required arguments, checked in the same order they are reported
    required_args = (
        ("project_id", project_id),
        ("vcf", vcf),
        ("ped", ped),
    )
    for arg_name, arg_value in required_args:
        if not arg_value:
            raise ValueError("%s not specified" % arg_name)

    substitutions = {
        'pod_name': pod_name,
        'project_id': project_id,
        'vcf': vcf,
        'ped': ped,
        # wget -N saves each file under its base name, which is how the
        # manage.py commands below refer to it
        'vcf_filename': os.path.basename(vcf),
        'ped_filename': os.path.basename(ped),
    }

    command_templates = (
        # download the input files into the pod
        "kubectl exec %(pod_name)s -- wget -N %(vcf)s",
        "kubectl exec %(pod_name)s -- wget -N %(ped)s",
        # create the project and its individuals
        "kubectl exec %(pod_name)s -- python2.7 -u manage.py add_project '%(project_id)s' '%(project_id)s'",
        "kubectl exec %(pod_name)s -- python2.7 -u manage.py add_individuals_to_project '%(project_id)s' --ped '%(ped_filename)s'",
        # attach the VCF and set up phenotips, pedigree images and tags
        "kubectl exec %(pod_name)s -- python2.7 -u manage.py add_vcf_to_project --clear '%(project_id)s' '%(vcf_filename)s'",
        "kubectl exec %(pod_name)s -- python2.7 -u manage.py add_project_to_phenotips '%(project_id)s' '%(project_id)s'",
        "kubectl exec %(pod_name)s -- python2.7 -u manage.py add_individuals_to_phenotips '%(project_id)s' --ped '%(ped_filename)s'",
        "kubectl exec %(pod_name)s -- python2.7 -u manage.py generate_pedigree_images -f '%(project_id)s'",
        "kubectl exec %(pod_name)s -- python2.7 -u manage.py add_default_tags '%(project_id)s'",
        # load the project data
        "kubectl exec %(pod_name)s -- python2.7 -u manage.py load_project '%(project_id)s'",
        "kubectl exec %(pod_name)s -- python2.7 -u manage.py load_project_datastore '%(project_id)s'",
    )

    # run sequentially, waiting on each command before starting the next
    for command_template in command_templates:
        _run_shell_command(command_template % substitutions).wait()
Exemple #8
0
def load_reference_data(deployment_label, assembly="37"):
    """Load reference data

    Inside the 'seqr' pod: creates /data/reference_data/, downloads and
    unpacks the seqr resource bundle there, runs the `manage.py` loaders
    (resources, gencode, human phenotype ontology, omim) and finally runs
    /usr/local/bin/restart_server.sh.

    Args:
        deployment_label (string): passed to check_kubernetes_context
        assembly (string): reference genome version - either "37" or "38"
            (currently not referenced by this function)

    Raises:
        ValueError: if no 'seqr' pod name could be looked up
    """

    check_kubernetes_context(deployment_label)

    pod_name = _get_pod_name('seqr')
    if not pod_name:
        raise ValueError(
            "No 'seqr' pods found. Is the kubectl environment configured in this terminal? and has this type of pod been deployed?"
        )

    exec_prefix = "kubectl exec %s -- " % (pod_name, )
    in_pod_commands = (
        # fetch and unpack the resource bundle
        "mkdir -p /data/reference_data/",
        "wget -N https://storage.googleapis.com/seqr-public/reference-data/seqr-resource-bundle.tar.gz -P /data/reference_data/",
        "tar -xzf /data/reference_data/seqr-resource-bundle.tar.gz --directory /data/reference_data/",
        # load it into seqr
        "python2.7 -u manage.py load_resources",
        "python2.7 -u manage.py update_gencode",
        "python2.7 -u manage.py update_human_phenotype_ontology",
        "python2.7 -u manage.py update_omim",
        # restart the server once all loaders have run
        "/usr/local/bin/restart_server.sh",
    )

    # Every step, including the initial mkdir, is waited on before the next
    # one starts, so the steps cannot overlap.
    for in_pod_command in in_pod_commands:
        _run_shell_command(exec_prefix + in_pod_command).wait()
Exemple #9
0
def deploy(deployment_label,
           force,
           component=None,
           output_dir=None,
           other_settings=None):
    """Check the kubectl context, render configs into an output directory and run the deployment scripts.

    Args:
        deployment_label (string): one of the DEPLOYMENT_LABELS  (eg. "local", or "gcloud")
        force (bool): whether to redo some parts of the deployment from scratch.
            Exported to the deployment scripts as the FORCE environment variable
            ("true" or empty).
        component (string): optionally specifies one of the components from the DEPLOYABLE_COMPONENTS lists (eg. "postgres" or "phenotips").
            If this is set to None, all DEPLOYABLE_COMPONENTS will be deployed in sequence.
        output_dir (string): path of directory where to put deployment logs and rendered config files.
            Defaults to "deployments/<timestamp>_<deployment_label>".
        other_settings (dict): a dictionary of other key-value pairs for use during deployment.
            None (the default) means no extra settings.

    Raises:
        ValueError: if the kubectl context could be read and deployment_label
            is neither "local" nor "gcloud".
        SystemExit: if the kubectl context does not match the deployment label,
            or if the context lookup failed and the user declined to continue.

    Side effects:
        Adds a file handler to the module logger, sets os.environ['FORCE'],
        changes the process working directory to output_dir, and runs each
        selected deployment script.
    """

    # make sure the environment is configured to use a local kube-solo cluster, and not gcloud or something else
    cmd = 'kubectl config current-context'
    try:
        kubectl_current_context = subprocess.check_output(cmd,
                                                          shell=True).strip()
    except subprocess.CalledProcessError as e:
        logger.error(
            'Error while running "kubectl config current-context": %s', e)
        # only an explicit "Y"/"y" continues; anything else aborts
        answer = raw_input("Continue? [Y/n] ")
        if answer not in ('Y', 'y'):
            sys.exit('Exiting...')
    else:
        context_info = {
            'cmd': cmd,
            'kubectl_current_context': kubectl_current_context,
            'deployment_label': deployment_label,
        }
        if deployment_label == "local":
            if kubectl_current_context != 'kube-solo':
                logger.error(
                    "'%(cmd)s' returned '%(kubectl_current_context)s'. For %(deployment_label)s deployment, this is "
                    "expected to equal 'kube-solo'. Please configure your shell environment "
                    "to point to a local kube-solo cluster by installing "
                    "kube-solo from https://github.com/TheNewNormal/kube-solo-osx, starting the kube-solo VM, "
                    "and then clicking on 'Preset OS Shell' in the kube-solo menu to launch a pre-configured shell."
                    % context_info)
                sys.exit(-1)

        elif deployment_label == "gcloud":
            if not kubectl_current_context.startswith('gke_'):
                logger.error(
                    "'%(cmd)s' returned '%(kubectl_current_context)s'. For %(deployment_label)s deployment, this is "
                    "expected to start with 'gke_'. Please configure your shell environment "
                    "to point to a gcloud cluster" % context_info)
                sys.exit(-1)
        else:
            raise ValueError("Unexpected value for deployment_label: %s" %
                             deployment_label)

    timestamp = time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime())
    output_dir = output_dir or "deployments/%s_%s" % (timestamp,
                                                      deployment_label)

    # configure logging output
    log_dir = os.path.join(output_dir, "logs")
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir)
    log_file_path = os.path.join(log_dir, "deploy.log")
    # FileHandler owns the file object, so it gets closed on logging shutdown
    # instead of leaking a bare open() handle.
    file_handler = logging.FileHandler(log_file_path, mode="w")
    file_handler.setLevel(logging.INFO)
    logger.addHandler(file_handler)
    logger.info("Starting log file: %s", log_file_path)

    # parse config files
    settings = collections.OrderedDict()

    settings['STARTED_VIA_SEQRCTL'] = True
    settings['HOME'] = os.path.expanduser("~")
    settings['SEQR_REPO_PATH'] = os.path.abspath(
        os.path.join(os.path.dirname(__file__), '../../..'))

    # shared settings first, then the label-specific file
    load_settings([
        os.path.join(BASE_DIR, "config/shared-settings.yaml"),
        os.path.join(BASE_DIR,
                     "config/%s-settings.yaml" % (deployment_label, ))
    ], settings)

    if other_settings:
        settings.update(other_settings)

    # Make every setting also available under its upper-cased key. Iterate over
    # a snapshot because new keys may be inserted during the loop.
    for key, value in list(settings.items()):
        key = key.upper()
        settings[key] = value
        logger.info("%s = %s", key, value)

    # copy configs, templates and scripts to output directory
    output_base_dir = os.path.join(output_dir, 'configs')
    input_base_dir = os.path.join(BASE_DIR, 'templates')
    template_paths = (glob.glob("templates/*/*.*") +
                      glob.glob("templates/*/*/*.*"))
    for file_path in template_paths:
        file_path = file_path.replace('templates/', '')
        render(template_processor, input_base_dir, file_path, settings,
               output_base_dir)

    for file_path in glob.glob("scripts/*.sh"):
        render(script_processor, BASE_DIR, file_path, settings, output_dir)

    # shutil.copy needs the destination directory to exist; otherwise it would
    # create a regular file called "configs".
    if not os.path.isdir(output_base_dir):
        os.makedirs(output_base_dir)

    for file_path in glob.glob("config/*.yaml"):
        shutil.copy(file_path, output_base_dir)

    # copy docker directory to output directory
    docker_src_dir = os.path.join(BASE_DIR, "../docker/")
    docker_dest_dir = os.path.join(output_dir, "docker")
    logger.info("Copying %s to %s", docker_src_dir, docker_dest_dir)
    shutil.copytree(docker_src_dir, docker_dest_dir)

    # copy secrets directory
    secrets_subdir = "secrets/%s" % (deployment_label, )
    secrets_src_dir = os.path.join(BASE_DIR, secrets_subdir)
    secrets_dest_dir = os.path.join(output_dir, secrets_subdir)
    logger.info("Copying %s to %s", secrets_src_dir, secrets_dest_dir)
    shutil.copytree(secrets_src_dir, secrets_dest_dir)

    # deploy
    os.environ['FORCE'] = "true" if force else ''

    if component:
        # always keep the 'init' scripts alongside the requested component's
        deployment_scripts = [
            s for s in DEPLOYMENT_SCRIPTS if 'init' in s or component in s
        ]
    else:
        deployment_scripts = DEPLOYMENT_SCRIPTS

    os.chdir(output_dir)
    logger.info("Switched to %s", output_dir)

    for path in deployment_scripts:
        logger.info("=========================")
        _run_shell_command(path, verbose=True).wait()