def _init_dataproc_cluster(settings, assembly="37"):
    """Create a Dataproc cluster.

    Args:
        settings (dict): global deployment settings
        assembly (string): "37" or "38"
    """
    # TODO come up with a way to run hail locally

    GCLOUD_ZONE = settings['GCLOUD_ZONE']
    GCLOUD_PROJECT = settings['GCLOUD_PROJECT']

    # gs://hail-common/vep/vep/GRCh%(assembly)s/vep85-GRCh%(assembly)s-init.sh
    _run_shell_command("""
    gcloud dataproc clusters create seqr-backend-export-cluster \
        --zone %(GCLOUD_ZONE)s \
        --master-machine-type n1-standard-8 \
        --master-boot-disk-size 100 \
        --num-workers 2 \
        --worker-machine-type n1-standard-8 \
        --worker-boot-disk-size 100 \
        --num-preemptible-workers 2 \
        --image-version 1.1 \
        --project %(GCLOUD_PROJECT)s \
        --initialization-actions "gs://hail-common/hail-init.sh"
    """ % locals()).wait()
def _submit_to_hail(settings, script_path, node_name, vds_path):
    """Submit a pyspark script to the Dataproc cluster.

    Args:
        settings (dict): global deployment settings
        script_path (string): path of the pyspark script to submit
        node_name (string): kubernetes node name passed through to the script
        vds_path (string): VDS path passed through to the script
    """
    _run_shell_command("""
    gcloud --project seqr-project dataproc jobs submit pyspark %(script_path)s \
        --cluster seqr-backend-export-cluster \
        --files=gs://seqr-hail/hail/hail-all-spark.jar \
        --py-files=gs://seqr-hail/hail/hail-python.zip \
        --properties=spark.driver.extraClassPath=./hail-all-spark.jar,spark.executor.extraClassPath=./hail-all-spark.jar \
        -- %(node_name)s %(vds_path)s
    """ % locals()).wait()
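# `_run_shell_command` is used throughout this module but defined elsewhere in
# this repo. A minimal sketch of the assumed interface (hypothetical, not the
# actual helper): launch the command in a shell and return the Popen object so
# callers can chain .wait().
import subprocess

def _run_shell_command_sketch(command, verbose=False):
    """Run `command` in a shell; return the subprocess.Popen object."""
    if verbose:
        logger.info("Running: %s" % command)  # assumes the module-level logger
    return subprocess.Popen(command, shell=True)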
def load_project_cassandra(
        deployment_label,
        project_id="1kg",
        assembly="37",
        vds_path="gs://seqr-hail/annotated/Cohen.1kpart.vds"):
    """Export VDS to cassandra

    Args:
        deployment_label (string): "local", "gcloud-dev", or "gcloud-prod"
        project_id (string): project id
        assembly (string): reference genome version - either "37" or "38"
        vds_path (string): path of annotated VDS
    """
    check_kubernetes_context(deployment_label)

    settings = retrieve_settings(deployment_label)

    _init_dataproc_cluster(settings, assembly=assembly)

    pod_name = lookup_json_path("pods", labels={'name': 'cassandra'}, json_path=".items[0].metadata.name")

    _run_shell_command("""
kubectl exec -i %(pod_name)s -- cqlsh <<EOF
DROP KEYSPACE IF EXISTS seqr;
CREATE KEYSPACE seqr WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'} AND durable_writes = true;
CREATE TABLE seqr.seqr (chrom text, start int, ref text, alt text, dataset_5fid text, PRIMARY KEY (chrom, start, ref, alt, dataset_5fid));
EOF
""" % locals()).wait()

    script_path = "scripts/loading/export_to_cass.py"
    node_name = lookup_json_path("pods", labels={'name': 'cassandra'}, json_path=".items[0].spec.nodeName")

    _submit_to_hail(settings, script_path, node_name, vds_path)

    _run_shell_command("""
kubectl exec -i %(pod_name)s -- cqlsh <<EOF
select count(*) from seqr.seqr;
EOF
""" % locals()).wait()
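# `lookup_json_path` is defined elsewhere in this repo. A minimal sketch of the
# assumed behavior (hypothetical implementation): shell out to `kubectl get`
# with a label selector and a jsonpath expression, and return the matched
# value as a string.
import subprocess

def lookup_json_path_sketch(resource_type, labels, json_path):
    """Return the value at `json_path` for the first matching resource."""
    label_selector = ",".join("%s=%s" % (key, value) for key, value in labels.items())
    cmd = "kubectl get %s -l %s -o jsonpath={%s}" % (resource_type, label_selector, json_path)
    return subprocess.check_output(cmd, shell=True).strip()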
def load_project_solr(deployment_label, project_id="1kg", assembly="37", vds_path="gs://seqr-hail/annotated/Cohen.1kpart.vds"):
    """Export VDS to solr

    Args:
        deployment_label (string): "local", "gcloud-dev", or "gcloud-prod"
        project_id (string): project id
        assembly (string): reference genome version - either "37" or "38"
        vds_path (string): path of annotated VDS
    """
    check_kubernetes_context(deployment_label)

    settings = retrieve_settings(deployment_label)

    _init_dataproc_cluster(settings, assembly=assembly)

    pod_name = lookup_json_path("pods", labels={'name': 'solr'}, json_path=".items[0].metadata.name")

    _run_shell_command(
        "kubectl exec %(pod_name)s -- su -c '/usr/local/solr-6.4.2/bin/solr delete -c seqr_noref' solr || true" % locals()).wait()
    _run_shell_command(
        "kubectl exec %(pod_name)s -- su -c '/usr/local/solr-6.4.2/bin/solr create_collection -c seqr_noref' solr || true" % locals()).wait()

    script_path = "scripts/loading/export_to_solr.py"
    node_name = lookup_json_path("pods", labels={'name': 'solr'}, json_path=".items[0].spec.nodeName")

    _submit_to_hail(settings, script_path, node_name, vds_path)

    _run_shell_command(
        "kubectl exec -i %(pod_name)s -- /bin/bash -c \"curl 'http://localhost:30002/solr/seqr_noref/select?indent=on&q=*:*&wt=json'\"" % locals()).wait()
def load_allele_frequencies(deployment_label, assembly="37"):
    """Load ExAC and 1kg allele frequency datasets. These are larger and take
    longer to load than the other reference data.

    Args:
        deployment_label (string): "local", "gcloud-dev", or "gcloud-prod"
        assembly (string): reference genome version - either "37" or "38"
    """
    check_kubernetes_context(deployment_label)

    pod_name = _get_pod_name('seqr')
    if not pod_name:
        raise ValueError(
            "No 'seqr' pods found. Is the kubectl environment configured in "
            "this terminal, and has this type of pod been deployed?")

    _run_shell_command(
        "kubectl exec %(pod_name)s -- wget -N http://seqr.broadinstitute.org/static/bundle/ExAC.r0.3.sites.vep.popmax.clinvar.vcf.gz -P /data/reference_data/" % locals()).wait()
    _run_shell_command(
        "kubectl exec %(pod_name)s -- wget -N http://seqr.broadinstitute.org/static/bundle/ALL.wgs.phase3_shapeit2_mvncall_integrated_v5a.20130502.sites.decomposed.with_popmax.vcf.gz -P /data/reference_data/" % locals()).wait()

    _run_shell_command(
        "kubectl exec %(pod_name)s -- python2.7 -u manage.py load_reference" % locals()).wait()
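# `_get_pod_name` is defined elsewhere in this repo. A minimal sketch of the
# assumed behavior (hypothetical implementation): resolve the name of the
# first pod carrying the given `name` label, returning None when nothing
# matches instead of raising.
import subprocess

def _get_pod_name_sketch(component_label):
    """Return the name of the first pod labeled name=<component_label>, or None."""
    cmd = ("kubectl get pods -l name=%s "
           "-o jsonpath={.items[0].metadata.name}" % component_label)
    try:
        return subprocess.check_output(cmd, shell=True).strip() or None
    except subprocess.CalledProcessError:
        return None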
def deploy(deployment_label, component=None, output_dir=None, other_settings={}):
    """Deploy seqr components to the kubernetes environment.

    Args:
        deployment_label (string): one of the DEPLOYMENT_LABELS (eg. "local", or "gcloud")
        component (string): optionally specifies one of the components from the DEPLOYABLE_COMPONENTS
            lists (eg. "postgres" or "phenotips"). If this is set to None, all DEPLOYABLE_COMPONENTS
            will be deployed in sequence.
        output_dir (string): path of directory where to put deployment logs and rendered config files
        other_settings (dict): a dictionary of other key-value pairs for use during deployment
    """
    check_kubernetes_context(deployment_label)

    timestamp = time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime())
    output_dir = output_dir or "deployments/%(timestamp)s_%(deployment_label)s" % locals()

    # configure logging output
    log_dir = os.path.join(output_dir, "logs")
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir)
    log_file_path = os.path.join(log_dir, "deploy.log")
    sh = logging.StreamHandler(open(log_file_path, "w"))
    sh.setLevel(logging.INFO)
    logger.addHandler(sh)
    logger.info("Starting log file: %(log_file_path)s" % locals())

    # parse config files
    settings = retrieve_settings(deployment_label)
    settings.update(other_settings)

    for key, value in settings.items():
        key = key.upper()
        settings[key] = value
        logger.info("%s = %s" % (key, value))

    # copy configs, templates and scripts to output directory
    output_base_dir = os.path.join(output_dir, 'configs')

    for file_path in glob.glob("templates/*/*.*") + glob.glob("templates/*/*/*.*"):
        file_path = file_path.replace('templates/', '')
        input_base_dir = os.path.join(BASE_DIR, 'templates')
        render(template_processor, input_base_dir, file_path, settings, output_base_dir)

    for file_path in glob.glob(os.path.join("scripts/*.sh")):
        render(script_processor, BASE_DIR, file_path, settings, output_dir)

    for file_path in glob.glob(os.path.join("scripts/*.py")):
        shutil.copy(file_path, output_base_dir)

    for file_path in glob.glob(os.path.join("config/*.yaml")):
        shutil.copy(file_path, output_base_dir)

    # copy docker directory to output directory
    docker_src_dir = os.path.join(BASE_DIR, "../docker/")
    docker_dest_dir = os.path.join(output_dir, "docker")
    logger.info("Copying %(docker_src_dir)s to %(docker_dest_dir)s" % locals())
    shutil.copytree(docker_src_dir, docker_dest_dir)

    # copy secrets directory
    secrets_src_dir = os.path.join(BASE_DIR, "secrets/%(deployment_label)s" % locals())
    secrets_dest_dir = os.path.join(output_dir, "secrets/%(deployment_label)s" % locals())
    logger.info("Copying %(secrets_src_dir)s to %(secrets_dest_dir)s" % locals())
    shutil.copytree(secrets_src_dir, secrets_dest_dir)

    # deploy
    if component:
        deployment_scripts = [
            s for s in DEPLOYMENT_SCRIPTS
            if 'init' in s or component in s or component.replace('-', '_') in s
        ]
    else:
        if deployment_label == "gcloud-dev":
            deployment_scripts = DEPLOYMENT_SCRIPTS
        else:
            deployment_scripts = [
                s for s in DEPLOYMENT_SCRIPTS
                if not any(k in s for k in ("solr", "cassandra", "database_api"))
            ]

    os.chdir(output_dir)
    logger.info("Switched to %(output_dir)s" % locals())

    for path in deployment_scripts:
        logger.info("=========================")
        _run_shell_command(path, verbose=True).wait()
def load_project(deployment_label, project_id="1kg", assembly="37", vcf=None, ped=None):
    """Load example project

    Args:
        deployment_label (string): "local", "gcloud-dev", or "gcloud-prod"
        project_id (string): project id
        assembly (string): reference genome version - either "37" or "38"
        vcf (string): VCF path
        ped (string): PED path
    """
    check_kubernetes_context(deployment_label)

    pod_name = _get_pod_name('seqr')
    if not pod_name:
        raise ValueError(
            "No 'seqr' pods found. Is the kubectl environment configured in "
            "this terminal, and has this type of pod been deployed?")

    if not project_id:
        raise ValueError("project_id not specified")
    if not vcf:
        raise ValueError("vcf not specified")
    if not ped:
        raise ValueError("ped not specified")

    vcf_filename = os.path.basename(vcf)
    ped_filename = os.path.basename(ped)

    _run_shell_command("kubectl exec %(pod_name)s -- wget -N %(vcf)s" % locals()).wait()
    _run_shell_command("kubectl exec %(pod_name)s -- wget -N %(ped)s" % locals()).wait()

    _run_shell_command(
        "kubectl exec %(pod_name)s -- python2.7 -u manage.py add_project '%(project_id)s' '%(project_id)s'" % locals()).wait()
    _run_shell_command(
        "kubectl exec %(pod_name)s -- python2.7 -u manage.py add_individuals_to_project '%(project_id)s' --ped '%(ped_filename)s'" % locals()).wait()
    _run_shell_command(
        "kubectl exec %(pod_name)s -- python2.7 -u manage.py add_vcf_to_project --clear '%(project_id)s' '%(vcf_filename)s'" % locals()).wait()
    _run_shell_command(
        "kubectl exec %(pod_name)s -- python2.7 -u manage.py add_project_to_phenotips '%(project_id)s' '%(project_id)s'" % locals()).wait()
    _run_shell_command(
        "kubectl exec %(pod_name)s -- python2.7 -u manage.py add_individuals_to_phenotips '%(project_id)s' --ped '%(ped_filename)s'" % locals()).wait()
    _run_shell_command(
        "kubectl exec %(pod_name)s -- python2.7 -u manage.py generate_pedigree_images -f '%(project_id)s'" % locals()).wait()
    _run_shell_command(
        "kubectl exec %(pod_name)s -- python2.7 -u manage.py add_default_tags '%(project_id)s'" % locals()).wait()
    _run_shell_command(
        "kubectl exec %(pod_name)s -- python2.7 -u manage.py load_project '%(project_id)s'" % locals()).wait()
    _run_shell_command(
        "kubectl exec %(pod_name)s -- python2.7 -u manage.py load_project_datastore '%(project_id)s'" % locals()).wait()
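# Example invocation (hypothetical URLs, shown for illustration only): load
# the bundled 1kg demo project into a local deployment.
#
#     load_project(
#         "local",
#         project_id="1kg",
#         vcf="https://example.com/1kg.vcf.gz",  # hypothetical
#         ped="https://example.com/1kg.ped",     # hypothetical
#     )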
def load_reference_data(deployment_label, assembly="37"):
    """Load reference data

    Args:
        deployment_label (string): "local", "gcloud-dev", or "gcloud-prod"
        assembly (string): reference genome version - either "37" or "38"
    """
    check_kubernetes_context(deployment_label)

    pod_name = _get_pod_name('seqr')
    if not pod_name:
        raise ValueError(
            "No 'seqr' pods found. Is the kubectl environment configured in "
            "this terminal, and has this type of pod been deployed?")

    _run_shell_command(
        "kubectl exec %(pod_name)s -- mkdir -p /data/reference_data/" % locals()).wait()
    _run_shell_command(
        "kubectl exec %(pod_name)s -- wget -N https://storage.googleapis.com/seqr-public/reference-data/seqr-resource-bundle.tar.gz -P /data/reference_data/" % locals()).wait()
    _run_shell_command(
        "kubectl exec %(pod_name)s -- tar -xzf /data/reference_data/seqr-resource-bundle.tar.gz --directory /data/reference_data/" % locals()).wait()

    _run_shell_command(
        "kubectl exec %(pod_name)s -- python2.7 -u manage.py load_resources" % locals()).wait()
    _run_shell_command(
        "kubectl exec %(pod_name)s -- python2.7 -u manage.py update_gencode" % locals()).wait()
    _run_shell_command(
        "kubectl exec %(pod_name)s -- python2.7 -u manage.py update_human_phenotype_ontology" % locals()).wait()
    _run_shell_command(
        "kubectl exec %(pod_name)s -- python2.7 -u manage.py update_omim" % locals()).wait()

    _run_shell_command(
        "kubectl exec %(pod_name)s -- /usr/local/bin/restart_server.sh" % locals()).wait()
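# `check_kubernetes_context` is defined elsewhere in this repo. A minimal
# sketch based on the inline check in deploy() below (hypothetical
# implementation): verify that the current kubectl context matches the
# deployment target ('kube-solo' for local, a 'gke_...' context for gcloud).
import subprocess
import sys

def check_kubernetes_context_sketch(deployment_label):
    """Exit if the current kubectl context doesn't match `deployment_label`."""
    context = subprocess.check_output("kubectl config current-context", shell=True).strip()
    if deployment_label == "local" and context != "kube-solo":
        sys.exit("kubectl context is '%s', expected 'kube-solo' for local deployment" % context)
    if deployment_label.startswith("gcloud") and not context.startswith("gke_"):
        sys.exit("kubectl context is '%s', expected a 'gke_...' context for gcloud deployment" % context)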
def deploy(deployment_label, force, component=None, output_dir=None, other_settings={}):
    """Deploy seqr components to the kubernetes environment.

    Args:
        deployment_label (string): one of the DEPLOYMENT_LABELS (eg. "local", or "gcloud")
        force (bool): whether to redo some parts of the deployment from scratch
        component (string): optionally specifies one of the components from the DEPLOYABLE_COMPONENTS
            lists (eg. "postgres" or "phenotips"). If this is set to None, all DEPLOYABLE_COMPONENTS
            will be deployed in sequence.
        output_dir (string): path of directory where to put deployment logs and rendered config files
        other_settings (dict): a dictionary of other key-value pairs for use during deployment
    """
    # make sure the kubectl environment matches the deployment target: a local
    # kube-solo cluster for "local", a gke cluster for "gcloud"
    try:
        cmd = 'kubectl config current-context'
        kubectl_current_context = subprocess.check_output(cmd, shell=True).strip()
    except subprocess.CalledProcessError as e:
        logger.error('Error while running "kubectl config current-context": %s', e)
        i = raw_input("Continue? [Y/n] ")
        if i != 'Y' and i != 'y':
            sys.exit('Exiting...')
    else:
        if deployment_label == "local":
            if kubectl_current_context != 'kube-solo':
                logger.error(
                    "'%(cmd)s' returned '%(kubectl_current_context)s'. For %(deployment_label)s deployment, this is "
                    "expected to equal 'kube-solo'. Please configure your shell environment "
                    "to point to a local kube-solo cluster by installing "
                    "kube-solo from https://github.com/TheNewNormal/kube-solo-osx, starting the kube-solo VM, "
                    "and then clicking on 'Preset OS Shell' in the kube-solo menu to launch a pre-configured shell."
                    % locals())
                sys.exit(-1)
        elif deployment_label == "gcloud":
            if not kubectl_current_context.startswith('gke_'):
                logger.error(
                    "'%(cmd)s' returned '%(kubectl_current_context)s'. For %(deployment_label)s deployment, this is "
                    "expected to start with 'gke_'. Please configure your shell environment "
                    "to point to a gcloud cluster." % locals())
                sys.exit(-1)
        else:
            raise ValueError("Unexpected value for deployment_label: %s" % deployment_label)

    timestamp = time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime())
    output_dir = output_dir or "deployments/%(timestamp)s_%(deployment_label)s" % locals()

    # configure logging output
    log_dir = os.path.join(output_dir, "logs")
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir)
    log_file_path = os.path.join(log_dir, "deploy.log")
    sh = logging.StreamHandler(open(log_file_path, "w"))
    sh.setLevel(logging.INFO)
    logger.addHandler(sh)
    logger.info("Starting log file: %(log_file_path)s" % locals())

    # parse config files
    settings = collections.OrderedDict()
    settings['STARTED_VIA_SEQRCTL'] = True
    settings['HOME'] = os.path.expanduser("~")
    settings['SEQR_REPO_PATH'] = os.path.abspath(
        os.path.join(os.path.dirname(__file__), '../../..'))

    load_settings([
        os.path.join(BASE_DIR, "config/shared-settings.yaml"),
        os.path.join(BASE_DIR, "config/%(deployment_label)s-settings.yaml" % locals())
    ], settings)

    settings.update(other_settings)

    for key, value in settings.items():
        key = key.upper()
        settings[key] = value
        logger.info("%s = %s" % (key, value))

    # copy configs, templates and scripts to output directory
    output_base_dir = os.path.join(output_dir, 'configs')

    for file_path in glob.glob("templates/*/*.*") + glob.glob("templates/*/*/*.*"):
        file_path = file_path.replace('templates/', '')
        input_base_dir = os.path.join(BASE_DIR, 'templates')
        render(template_processor, input_base_dir, file_path, settings, output_base_dir)

    for file_path in glob.glob(os.path.join("scripts/*.sh")):
        render(script_processor, BASE_DIR, file_path, settings, output_dir)

    for file_path in glob.glob(os.path.join("config/*.yaml")):
        shutil.copy(file_path, output_base_dir)

    # copy docker directory to output directory
    docker_src_dir = os.path.join(BASE_DIR, "../docker/")
    docker_dest_dir = os.path.join(output_dir, "docker")
    logger.info("Copying %(docker_src_dir)s to %(docker_dest_dir)s" % locals())
    shutil.copytree(docker_src_dir, docker_dest_dir)

    # copy secrets directory
    secrets_src_dir = os.path.join(BASE_DIR, "secrets/%(deployment_label)s" % locals())
    secrets_dest_dir = os.path.join(output_dir, "secrets/%(deployment_label)s" % locals())
    logger.info("Copying %(secrets_src_dir)s to %(secrets_dest_dir)s" % locals())
    shutil.copytree(secrets_src_dir, secrets_dest_dir)

    # deploy
    os.environ['FORCE'] = "true" if force else ''

    if component:
        deployment_scripts = [s for s in DEPLOYMENT_SCRIPTS if 'init' in s or component in s]
    else:
        deployment_scripts = DEPLOYMENT_SCRIPTS

    os.chdir(output_dir)
    logger.info("Switched to %(output_dir)s" % locals())

    for path in deployment_scripts:
        logger.info("=========================")
        _run_shell_command(path, verbose=True).wait()
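# Example invocations (assumed usage; the DEPLOYMENT_LABELS and component
# names come from this repo's config):
#
#     deploy("local", force=False)                         # deploy everything locally
#     deploy("gcloud", force=True, component="phenotips")  # redeploy one component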