def load_file(file_name, number_of_attributes, timestamp_index, bid_index, ask_index, separator,
              date_format_regular_expression):
    conf = SparkConf().setMaster("local").setAppName("My App")
    SparkContext._ensure_initialized()
    sc = SparkContext(conf=conf)
    textRDD = sc.textFile(file_name)

    quotations = textRDD.flatMap(lambda x: x.split(separator)).zipWithIndex() \
        .filter(lambda q: q[1] % number_of_attributes == timestamp_index
                or q[1] % number_of_attributes == bid_index
                or q[1] % number_of_attributes == ask_index)

    quotation_timestamps = quotations.filter(lambda q: q[1] % number_of_attributes == timestamp_index) \
        .map(lambda timestamp: date_to_timestamp(timestamp[0], date_format_regular_expression)) \
        .collect()
    bid_quotes = quotations.filter(lambda q: q[1] % number_of_attributes == bid_index) \
        .map(lambda timestamp: float(timestamp[0])) \
        .collect()
    ask_quotes = quotations.filter(lambda q: q[1] % number_of_attributes == ask_index) \
        .map(lambda timestamp: float(timestamp[0])) \
        .collect()

    return [quotation_timestamps, bid_quotes, ask_quotes]
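A minimal usage sketch for the loader above. The file name, column layout, separator, and date format are illustrative assumptions (the parsing of the timestamp string depends on the external date_to_timestamp helper), not values taken from the snippet itself.

# Hypothetical call: a ';'-separated quote file with 4 fields per record
# (timestamp, bid, ask, volume). All argument values below are assumptions.
timestamps, bids, asks = load_file(
    file_name='quotes/USDJPY.csv',
    number_of_attributes=4,
    timestamp_index=0,
    bid_index=1,
    ask_index=2,
    separator=';',
    date_format_regular_expression='%Y%m%d %H:%M:%S')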
def __init__(self, sc=None, app_name="Hail", master=None, local='local[*]',
             log='hail.log', quiet=False, append=False,
             parquet_compression='snappy', min_block_size=1,
             branching_factor=50, tmp_dir='/tmp'):
    if Env._hc:
        raise FatalError('Hail Context has already been created, restart session '
                         'or stop Hail context to change configuration.')

    SparkContext._ensure_initialized()

    self._gateway = SparkContext._gateway
    self._jvm = SparkContext._jvm

    # hail package
    self._hail = getattr(self._jvm, 'is').hail

    Env._jvm = self._jvm
    Env._gateway = self._gateway

    jsc = sc._jsc.sc() if sc else None

    self._jhc = scala_object(self._hail, 'HailContext').apply(
        jsc, app_name, joption(master), local, log, quiet, append,
        parquet_compression, min_block_size, branching_factor, tmp_dir)

    self._jsc = self._jhc.sc()
    self.sc = sc if sc else SparkContext(gateway=self._gateway, jsc=self._jvm.JavaSparkContext(self._jsc))
    self._jsql_context = self._jhc.sqlContext()
    self._sql_context = SQLContext(self.sc, self._jsql_context)

    # do this at the end in case something errors, so we don't raise the above error without a real HC
    Env._hc = self
def __init__(self, sc=None, appName="Hail", master=None, local='local[*]',
             log='hail.log', quiet=False, append=False,
             parquet_compression='snappy', min_block_size=1,
             branching_factor=50, tmp_dir='/tmp'):
    if Env._hc:
        raise FatalError('Hail Context has already been created, restart session '
                         'or stop Hail context to change configuration.')

    from pyspark import SparkContext
    SparkContext._ensure_initialized()

    self._gateway = SparkContext._gateway
    self._jvm = SparkContext._jvm

    # hail package
    self._hail = getattr(self._jvm, 'is').hail

    Env._jvm = self._jvm
    Env._gateway = self._gateway

    jsc = sc._jsc.sc() if sc else None

    self._jhc = scala_object(self._hail, 'HailContext').apply(
        jsc, appName, joption(master), local, log, quiet, append,
        parquet_compression, min_block_size, branching_factor, tmp_dir)

    self._jsc = self._jhc.sc()
    self.sc = sc if sc else SparkContext(gateway=self._gateway, jsc=self._jvm.JavaSparkContext(self._jsc))
    self._jsql_context = self._jhc.sqlContext()
    self._sql_context = SQLContext(self.sc, self._jsql_context)

    # do this at the end in case something errors, so we don't raise the above error without a real HC
    Env._hc = self
def _ensure_initialized(cls):
    SparkContext._ensure_initialized()
    gw = SparkContext._gateway

    java_import(gw.jvm, "org.apache.spark.streaming.*")
    java_import(gw.jvm, "org.apache.spark.streaming.api.*")
    java_import(gw.jvm, "org.apache.spark.streaming.api.java.*")
    java_import(gw.jvm, "org.apache.spark.streaming.api.python.*")

    # start callback server
    # getattr will fallback to JVM, so we cannot test by hasattr()
    if "_callback_server" not in gw.__dict__ or gw._callback_server is None:
        gw.callback_server_parameters.eager_load = True
        gw.callback_server_parameters.daemonize = True
        gw.callback_server_parameters.daemonize_connections = True
        gw.callback_server_parameters.port = 0
        gw.start_callback_server(gw.callback_server_parameters)
        cbport = gw._callback_server.server_socket.getsockname()[1]
        gw._callback_server.port = cbport
        # gateway with real port
        gw._python_proxy_port = gw._callback_server.port
        # get the GatewayServer object in JVM by ID
        jgws = JavaObject("GATEWAY_SERVER", gw._gateway_client)
        # update the port of CallbackClient with real port
        jgws.resetCallbackClient(jgws.getCallbackClient().getAddress(), gw._python_proxy_port)

    # register serializer for TransformFunction
    # it happens before creating SparkContext when loading from checkpointing
    cls._transformerSerializer = TransformFunctionSerializer(
        SparkContext._active_spark_context, CloudPickleSerializer(), gw)
def __init__(self, sc=None, appName="Hail", master=None, local='local[*]',
             log='hail.log', quiet=False, append=False,
             parquet_compression='uncompressed', block_size=1,
             branching_factor=50, tmp_dir='/tmp'):
    from pyspark import SparkContext
    SparkContext._ensure_initialized()

    self._gateway = SparkContext._gateway
    self._jvm = SparkContext._jvm

    Env._jvm = self._jvm
    Env._gateway = self._gateway

    # hail package
    self._hail = getattr(self._jvm, 'is').hail

    driver = scala_package_object(self._hail.driver)

    if not sc:
        self._jsc = driver.configureAndCreateSparkContext(
            appName, joption(master), local, parquet_compression, block_size)
        self.sc = SparkContext(gateway=self._gateway, jsc=self._jvm.JavaSparkContext(self._jsc))
    else:
        self.sc = sc
        # sc._jsc is a JavaSparkContext
        self._jsc = sc._jsc.sc()

    driver.configureHail(branching_factor, tmp_dir)
    driver.configureLogging(log, quiet, append)

    self._jsql_context = driver.createSQLContext(self._jsc)
    self._sql_context = SQLContext(self.sc, self._jsql_context)
def __init__(self, sc=None, app_name="Hail", master=None, local='local[*]',
             log='hail.log', quiet=False, append=False,
             min_block_size=1, branching_factor=50, tmp_dir=None,
             default_reference="GRCh37"):
    if Env._hc:
        raise FatalError('Hail Context has already been created, restart session '
                         'or stop Hail context to change configuration.')

    SparkContext._ensure_initialized()

    self._gateway = SparkContext._gateway
    self._jvm = SparkContext._jvm

    # hail package
    self._hail = getattr(self._jvm, 'is').hail

    Env._jvm = self._jvm
    Env._gateway = self._gateway

    jsc = sc._jsc.sc() if sc else None

    tmp_dir = get_env_or_default(tmp_dir, 'TMPDIR', '/tmp')

    # we always pass 'quiet' to the JVM because stderr output needs
    # to be routed through Python separately.
    self._jhc = self._hail.HailContext.apply(
        jsc, app_name, joption(master), local, log, True, append,
        min_block_size, branching_factor, tmp_dir)

    self._jsc = self._jhc.sc()
    self.sc = sc if sc else SparkContext(gateway=self._gateway, jsc=self._jvm.JavaSparkContext(self._jsc))
    self._jsql_context = self._jhc.sqlContext()
    self._sql_context = SQLContext(self.sc, self._jsql_context)
    self._counter = 1

    super(HailContext, self).__init__()

    # do this at the end in case something errors, so we don't raise the above error without a real HC
    Env._hc = self

    self._default_ref = None
    Env.hail().variant.GenomeReference.setDefaultReference(self._jhc, default_reference)

    sys.stderr.write('Running on Apache Spark version {}\n'.format(self.sc.version))
    if self._jsc.uiWebUrl().isDefined():
        sys.stderr.write('SparkUI available at {}\n'.format(self._jsc.uiWebUrl().get()))

    if not quiet:
        connect_logger('localhost', 12888)

        sys.stderr.write(
            'Welcome to\n'
            '     __  __     <>__\n'
            '    / /_/ /__  __/ /\n'
            '   / __  / _ `/ / /\n'
            '  /_/ /_/\\_,_/_/_/   version {}\n'.format(self.version))

        if self.version.startswith('devel'):
            sys.stderr.write('WARNING: This is an unstable development build.\n')
def parse_file(input_file_name, output_file_name, number_of_attributes, timestamp_index, price_index,
               separator, quotation_size, quotation_step, date_format_regular_expression):
    conf = SparkConf().setMaster("local").setAppName("My App")
    SparkContext._ensure_initialized()
    sc = SparkContext(conf=conf)
    textRDD = sc.textFile(input_file_name)

    quotations = textRDD.flatMap(lambda x: x.split(separator)).zipWithIndex() \
        .filter(lambda q: q[1] % number_of_attributes == timestamp_index
                or q[1] % number_of_attributes == price_index)

    quotation_timestamps = quotations.filter(lambda q: q[1] % number_of_attributes == timestamp_index) \
        .map(lambda timestamp: date_to_timestamp(timestamp[0], date_format_regular_expression)) \
        .collect()
    quotation_prices = quotations.filter(lambda q: q[1] % number_of_attributes == price_index) \
        .map(lambda timestamp: float(timestamp[0])) \
        .collect()

    for i in range(0, len(quotation_timestamps), quotation_step):
        timestamps_batch = quotation_timestamps[i:(i + quotation_size)]
        timestamps_batch[:] = [x / 10000000000 for x in timestamps_batch]
        prices_batch = quotation_prices[i:(i + quotation_size)]

        fig, ax = plt.subplots(nrows=1, ncols=1)
        plt.axis('off')
        ax.plot(timestamps_batch, prices_batch)
        fig.savefig(output_file_name + 'USDJPY15-' + str(int(i / 10)) + '.png')
def _ensure_initialized(cls):
    SparkContext._ensure_initialized()
    gw = SparkContext._gateway

    java_import(gw.jvm, "org.apache.spark.streaming.*")
    java_import(gw.jvm, "org.apache.spark.streaming.api.*")
    java_import(gw.jvm, "org.apache.spark.streaming.api.java.*")
    java_import(gw.jvm, "org.apache.spark.streaming.api.python.*")

    # start callback server
    # getattr will fallback to JVM, so we cannot test by hasattr()
    if "_callback_server" not in gw.__dict__ or gw._callback_server is None:
        gw.callback_server_parameters.eager_load = True
        gw.callback_server_parameters.daemonize = True
        gw.callback_server_parameters.daemonize_connections = True
        gw.callback_server_parameters.port = 0
        gw.start_callback_server(gw.callback_server_parameters)
        cbport = gw._callback_server.server_socket.getsockname()[1]
        gw._callback_server.port = cbport
        # gateway with real port
        gw._python_proxy_port = gw._callback_server.port
        # get the GatewayServer object in JVM by ID
        jgws = JavaObject("GATEWAY_SERVER", gw._gateway_client)
        # update the port of CallbackClient with real port
        gw.jvm.PythonDStream.updatePythonGatewayPort(jgws, gw._python_proxy_port)
        _py4j_cleaner = Py4jCallbackConnectionCleaner(gw)
        _py4j_cleaner.start()

    # register serializer for TransformFunction
    # it happens before creating SparkContext when loading from checkpointing
    if cls._transformerSerializer is None:
        transformer_serializer = TransformFunctionSerializer()
        transformer_serializer.init(SparkContext._active_spark_context, CloudPickleSerializer(), gw)
        # SPARK-12511 streaming driver with checkpointing unable to finalize leading to OOM
        # There is an issue that Py4J's PythonProxyHandler.finalize blocks forever.
        # (https://github.com/bartdag/py4j/pull/184)
        #
        # Py4j will create a PythonProxyHandler in Java for "transformer_serializer" when
        # calling "registerSerializer". If we call "registerSerializer" twice, the second
        # PythonProxyHandler will override the first one, then the first one will be GCed and
        # trigger "PythonProxyHandler.finalize". To avoid that, we should not call
        # "registerSerializer" more than once, so that "PythonProxyHandler" in Java side won't
        # be GCed.
        #
        # TODO Once Py4J fixes this issue, we should upgrade Py4j to the latest version.
        transformer_serializer.gateway.jvm.PythonDStream.registerSerializer(transformer_serializer)
        cls._transformerSerializer = transformer_serializer
    else:
        cls._transformerSerializer.init(SparkContext._active_spark_context, CloudPickleSerializer(), gw)
def __init__(self, appName="PyHail", master=None, local='local[*]', log='hail.log', quiet=False, append=False, parquet_compression='uncompressed', block_size=1, branching_factor=50, tmp_dir='/tmp'): from pyspark import SparkContext SparkContext._ensure_initialized() self.gateway = SparkContext._gateway self.jvm = SparkContext._jvm self.jsc = scala_package_object(self.jvm.org.broadinstitute.hail.driver).configureAndCreateSparkContext( appName, joption(self.jvm, master), local, log, quiet, append, parquet_compression, block_size, branching_factor, tmp_dir) self.sc = SparkContext(gateway=self.gateway, jsc=self.jvm.JavaSparkContext(self.jsc)) self.jsql_context = scala_package_object(self.jvm.org.broadinstitute.hail.driver).createSQLContext(self.jsc) self.sql_context = SQLContext(self.sc, self.jsql_context)
def init(**kwargs):
    jars = []

    vs_jar_path = vs.find_jar()
    assert os.path.exists(vs_jar_path), "%s does not exist" % vs_jar_path
    sys.stderr.write("using variant-spark jar at '%s'\n" % vs_jar_path)
    jars.append(vs_jar_path)

    if pkg_resources.resource_exists(hl.__name__, "hail-all-spark.jar"):
        hail_jar_path = pkg_resources.resource_filename(hl.__name__, "hail-all-spark.jar")
        assert os.path.exists(hail_jar_path), "%s does not exist" % hail_jar_path
        sys.stderr.write("using hail jar at '%s'\n" % hail_jar_path)
        jars.append(hail_jar_path)

    conf = SparkConf()
    conf.set('spark.jars', ",".join(jars))
    conf.set('spark.driver.extraClassPath', hail_jar_path)
    conf.set('spark.executor.extraClassPath', './hail-all-spark.jar')

    SparkContext._ensure_initialized(conf=conf)
    hl.init(**kwargs)
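A hedged call-site sketch for the wrapper above; the module path varspark.hail is an assumption, and the keyword arguments are simply forwarded to hl.init().

import varspark.hail as vshl   # hypothetical module path for the init() shown above

# Pre-loads the variant-spark and Hail jars onto the Spark classpath, then
# delegates to hl.init(); argument values are illustrative.
vshl.init(master='local[2]', quiet=True)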
def __init__(self, sc=None, appName="Hail", master=None, local='local[*]',
             log='hail.log', quiet=False, append=False,
             parquet_compression='snappy', min_block_size=1,
             branching_factor=50, tmp_dir='/tmp'):
    from pyspark import SparkContext
    SparkContext._ensure_initialized()

    self._gateway = SparkContext._gateway
    self._jvm = SparkContext._jvm

    Env._jvm = self._jvm
    Env._gateway = self._gateway
    Env._hc = self

    # hail package
    self._hail = getattr(self._jvm, 'is').hail

    jsc = sc._jsc.sc() if sc else None

    self._jhc = scala_object(self._hail, 'HailContext').apply(
        jsc, appName, joption(master), local, log, quiet, append,
        parquet_compression, min_block_size, branching_factor, tmp_dir)

    self._jsc = self._jhc.sc()
    self.sc = sc if sc else SparkContext(gateway=self._gateway, jsc=self._jvm.JavaSparkContext(self._jsc))
    self._jsql_context = self._jhc.sqlContext()
    self._sql_context = SQLContext(self.sc, self._jsql_context)
def __init__(self, idempotent, sc, spark_conf, app_name, master, local, log,
             quiet, append, min_block_size, branching_factor, tmp_dir,
             optimizer_iterations):
    if pkg_resources.resource_exists(__name__, "hail-all-spark.jar"):
        hail_jar_path = pkg_resources.resource_filename(__name__, "hail-all-spark.jar")
        assert os.path.exists(hail_jar_path), f'{hail_jar_path} does not exist'
        conf = SparkConf()

        base_conf = spark_conf or {}
        for k, v in base_conf.items():
            conf.set(k, v)

        jars = [hail_jar_path]

        if os.environ.get('HAIL_SPARK_MONITOR'):
            import sparkmonitor
            jars.append(os.path.join(os.path.dirname(sparkmonitor.__file__), 'listener.jar'))
            conf.set("spark.extraListeners", "sparkmonitor.listener.JupyterSparkMonitorListener")

        conf.set('spark.jars', ','.join(jars))
        conf.set('spark.driver.extraClassPath', ','.join(jars))
        conf.set('spark.executor.extraClassPath', './hail-all-spark.jar')
        if sc is None:
            SparkContext._ensure_initialized(conf=conf)
        else:
            import warnings
            warnings.warn(
                'pip-installed Hail requires additional configuration options in Spark referring\n'
                '  to the path to the Hail Python module directory HAIL_DIR,\n'
                '  e.g. /path/to/python/site-packages/hail:\n'
                '    spark.jars=HAIL_DIR/hail-all-spark.jar\n'
                '    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar\n'
                '    spark.executor.extraClassPath=./hail-all-spark.jar')
    else:
        SparkContext._ensure_initialized()

    self._gateway = SparkContext._gateway
    self._jvm = SparkContext._jvm

    Env._jvm = self._jvm
    Env._gateway = self._gateway

    # hail package
    hail = getattr(self._jvm, 'is').hail

    jsc = sc._jsc.sc() if sc else None

    if idempotent:
        self._jbackend = hail.backend.spark.SparkBackend.getOrCreate(
            jsc, app_name, master, local, True, min_block_size)
        self._jhc = hail.HailContext.getOrCreate(
            self._jbackend, log, True, append, branching_factor, tmp_dir, optimizer_iterations)
    else:
        self._jbackend = hail.backend.spark.SparkBackend.apply(
            jsc, app_name, master, local, True, min_block_size)
        self._jhc = hail.HailContext.apply(
            self._jbackend, log, True, append, branching_factor, tmp_dir, optimizer_iterations)

    self._jsc = self._jhc.sc()
    self.sc = sc if sc else SparkContext(gateway=self._gateway, jsc=self._jvm.JavaSparkContext(self._jsc))
    self._jspark_session = self._jbackend.sparkSession()
    self._spark_session = SparkSession(self.sc, self._jspark_session)

    from hail.context import version

    py_version = version()
    jar_version = self._jhc.version()
    if jar_version != py_version:
        raise RuntimeError(f"Hail version mismatch between JAR and Python library\n"
                           f"  JAR:    {jar_version}\n"
                           f"  Python: {py_version}")

    self._fs = None

    if not quiet:
        sys.stderr.write('Running on Apache Spark version {}\n'.format(self.sc.version))
        if self._jsc.uiWebUrl().isDefined():
            sys.stderr.write('SparkUI available at {}\n'.format(self._jsc.uiWebUrl().get()))

        connect_logger('localhost', 12888)

        self._jbackend.startProgressBar()
def __init__(self, sc=None, app_name="Hail", master=None, local='local[*]',
             log=None, quiet=False, append=False,
             min_block_size=1, branching_factor=50, tmp_dir=None,
             default_reference="GRCh37", idempotent=False,
             global_seed=6348563392232659379, _backend=None):

    if Env._hc:
        if idempotent:
            return
        else:
            raise FatalError('Hail has already been initialized, restart session '
                             'or stop Hail to change configuration.')

    if pkg_resources.resource_exists(__name__, "hail-all-spark.jar"):
        hail_jar_path = pkg_resources.resource_filename(__name__, "hail-all-spark.jar")
        assert os.path.exists(hail_jar_path), f'{hail_jar_path} does not exist'
        sys.stderr.write(f'using hail jar at {hail_jar_path}\n')
        conf = SparkConf()
        conf.set('spark.driver.extraClassPath', hail_jar_path)
        conf.set('spark.executor.extraClassPath', hail_jar_path)
        SparkContext._ensure_initialized(conf=conf)
    else:
        SparkContext._ensure_initialized()

    self._gateway = SparkContext._gateway
    self._jvm = SparkContext._jvm

    # hail package
    self._hail = getattr(self._jvm, 'is').hail

    self._warn_cols_order = True
    self._warn_entries_order = True

    Env._jvm = self._jvm
    Env._gateway = self._gateway

    jsc = sc._jsc.sc() if sc else None

    if _backend is None:
        _backend = SparkBackend()
    self._backend = _backend

    tmp_dir = get_env_or_default(tmp_dir, 'TMPDIR', '/tmp')

    version = read_version_info()
    hail.__version__ = version

    if log is None:
        log = hail.utils.timestamp_path(os.path.join(os.getcwd(), 'hail'),
                                        suffix=f'-{version}.log')
    self._log = log

    # we always pass 'quiet' to the JVM because stderr output needs
    # to be routed through Python separately.
    if idempotent:
        self._jhc = self._hail.HailContext.getOrCreate(
            jsc, app_name, joption(master), local, log, True, append,
            min_block_size, branching_factor, tmp_dir)
    else:
        self._jhc = self._hail.HailContext.apply(
            jsc, app_name, joption(master), local, log, True, append,
            min_block_size, branching_factor, tmp_dir)

    self._jsc = self._jhc.sc()
    self.sc = sc if sc else SparkContext(gateway=self._gateway, jsc=self._jvm.JavaSparkContext(self._jsc))
    self._jsql_context = self._jhc.sqlContext()
    self._sql_context = SQLContext(self.sc, jsqlContext=self._jsql_context)

    super(HailContext, self).__init__()

    # do this at the end in case something errors, so we don't raise the above error without a real HC
    Env._hc = self

    self._default_ref = None
    Env.hail().variant.ReferenceGenome.setDefaultReference(self._jhc, default_reference)

    jar_version = self._jhc.version()
    if jar_version != version:
        raise RuntimeError(f"Hail version mismatch between JAR and Python library\n"
                           f"  JAR:    {jar_version}\n"
                           f"  Python: {version}")

    if not quiet:
        sys.stderr.write('Running on Apache Spark version {}\n'.format(self.sc.version))
        if self._jsc.uiWebUrl().isDefined():
            sys.stderr.write('SparkUI available at {}\n'.format(self._jsc.uiWebUrl().get()))

        connect_logger('localhost', 12888)

        self._hail.HailContext.startProgressBar(self._jsc)

        sys.stderr.write(
            'Welcome to\n'
            '     __  __     <>__\n'
            '    / /_/ /__  __/ /\n'
            '   / __  / _ `/ / /\n'
            '  /_/ /_/\\_,_/_/_/   version {}\n'.format(version))

        if version.startswith('devel'):
            sys.stderr.write('NOTE: This is a beta version. Interfaces may change\n'
                             '  during the beta period. We recommend pulling\n'
                             '  the latest changes weekly.\n')
        sys.stderr.write(f'LOGGING: writing to {log}\n')

    install_exception_handler()
    Env.set_seed(global_seed)
def __init__(self, sc=None, app_name="Hail", master=None, local='local[*]',
             log='hail.log', quiet=False, append=False,
             min_block_size=1, branching_factor=50, tmp_dir=None,
             default_reference="GRCh37", idempotent=False,
             global_seed=6348563392232659379):

    if Env._hc:
        if idempotent:
            return
        else:
            raise FatalError('Hail has already been initialized, restart session '
                             'or stop Hail to change configuration.')

    SparkContext._ensure_initialized()

    self._gateway = SparkContext._gateway
    self._jvm = SparkContext._jvm

    # hail package
    self._hail = getattr(self._jvm, 'is').hail

    Env._jvm = self._jvm
    Env._gateway = self._gateway

    jsc = sc._jsc.sc() if sc else None

    tmp_dir = get_env_or_default(tmp_dir, 'TMPDIR', '/tmp')

    # we always pass 'quiet' to the JVM because stderr output needs
    # to be routed through Python separately.
    if idempotent:
        self._jhc = self._hail.HailContext.getOrCreate(
            jsc, app_name, joption(master), local, log, True, append,
            min_block_size, branching_factor, tmp_dir)
    else:
        self._jhc = self._hail.HailContext.apply(
            jsc, app_name, joption(master), local, log, True, append,
            min_block_size, branching_factor, tmp_dir)

    self._jsc = self._jhc.sc()
    self.sc = sc if sc else SparkContext(gateway=self._gateway, jsc=self._jvm.JavaSparkContext(self._jsc))
    self._jsql_context = self._jhc.sqlContext()
    self._sql_context = SQLContext(self.sc, jsqlContext=self._jsql_context)

    super(HailContext, self).__init__()

    # do this at the end in case something errors, so we don't raise the above error without a real HC
    Env._hc = self

    self._default_ref = None
    Env.hail().variant.ReferenceGenome.setDefaultReference(self._jhc, default_reference)

    version = self._jhc.version()
    hail.__version__ = version

    if not quiet:
        sys.stderr.write('Running on Apache Spark version {}\n'.format(self.sc.version))
        if self._jsc.uiWebUrl().isDefined():
            sys.stderr.write('SparkUI available at {}\n'.format(self._jsc.uiWebUrl().get()))

        connect_logger('localhost', 12888)

        self._hail.HailContext.startProgressBar(self._jsc)

        sys.stderr.write(
            'Welcome to\n'
            '     __  __     <>__\n'
            '    / /_/ /__  __/ /\n'
            '   / __  / _ `/ / /\n'
            '  /_/ /_/\\_,_/_/_/   version {}\n'.format(version))

        if version.startswith('devel'):
            sys.stderr.write('NOTE: This is a beta version. Interfaces may change\n'
                             '  during the beta period. We recommend pulling\n'
                             '  the latest changes weekly.\n')

    install_exception_handler()
    Env.set_seed(global_seed)
ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:274
>>> f.map(lambda x : x.split(','))
PythonRDD[11] at RDD at PythonRDD.scala:53
>>> f.map(lambda x : x.split(',')).collect()
[['car', 'bike', 'car', 'car', 'bike', 'bus', 'bus', 'truck', 'bus', 'truck', 'truck', 'bus', 'car', 'bike', 'bike', 'bike', 'bike', 'truck', 'bus']]
>>> words.filter(lambda x : 'i' in x).collect()
['hi', 'hi', 'hi']
>>> arr = sc.parallelize([2,4,6,8,10])
>>> arr.reduce(add)
Traceback (most recent call last):
  File "<pyshell#38>", line 1, in <module>
    arr.reduce(add)
NameError: name 'add' is not defined
>>> from operator import add
>>> add
<built-in function add>
>>> arr.reduce(add)
30
>>> from pyspark import SparkContext
>>> sc_1 = SparkContext("local", "App_1")
Traceback (most recent call last):
  File "<pyshell#43>", line 1, in <module>
    sc_1 = SparkContext("local", "App_1")
  File "C:\Python38\lib\site-packages\pyspark\context.py", line 144, in __init__
    SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
  File "C:\Python38\lib\site-packages\pyspark\context.py", line 342, in _ensure_initialized
    raise ValueError(
ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=pyspark-shell, master=local[*]) created by __init__ at <pyshell#4>:1
>>> words.getNumPartitions()
4
>>>
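The ValueError at the end of the transcript is raised by SparkContext._ensure_initialized when a second driver-side context is constructed. A minimal sketch of the usual workaround, reusing the active context instead of building a new one (the app name below is illustrative):

from pyspark import SparkConf, SparkContext

# getOrCreate returns the SparkContext already created by the pyspark shell
# instead of raising; the conf is only applied if no context exists yet.
sc_1 = SparkContext.getOrCreate(SparkConf().setAppName("App_1"))
print(sc_1.appName, sc_1.master)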
def __init__(self, sc=None, app_name="Hail", master=None, local='local[*]',
             log=None, quiet=False, append=False,
             min_block_size=1, branching_factor=50, tmp_dir=None,
             default_reference="GRCh37", idempotent=False,
             global_seed=6348563392232659379, spark_conf=None,
             optimizer_iterations=None, _backend=None):

    if Env._hc:
        if idempotent:
            return
        else:
            raise FatalError('Hail has already been initialized, restart session '
                             'or stop Hail to change configuration.')

    if pkg_resources.resource_exists(__name__, "hail-all-spark.jar"):
        hail_jar_path = pkg_resources.resource_filename(__name__, "hail-all-spark.jar")
        assert os.path.exists(hail_jar_path), f'{hail_jar_path} does not exist'
        conf = SparkConf()

        base_conf = spark_conf or {}
        for k, v in base_conf.items():
            conf.set(k, v)

        jars = [hail_jar_path]

        if os.environ.get('HAIL_SPARK_MONITOR'):
            import sparkmonitor
            jars.append(os.path.join(os.path.dirname(sparkmonitor.__file__), 'listener.jar'))
            conf.set("spark.extraListeners", "sparkmonitor.listener.JupyterSparkMonitorListener")

        conf.set('spark.jars', ','.join(jars))
        conf.set('spark.driver.extraClassPath', ','.join(jars))
        conf.set('spark.executor.extraClassPath', './hail-all-spark.jar')
        if sc is None:
            SparkContext._ensure_initialized(conf=conf)
        else:
            import warnings
            warnings.warn(
                'pip-installed Hail requires additional configuration options in Spark referring\n'
                '  to the path to the Hail Python module directory HAIL_DIR,\n'
                '  e.g. /path/to/python/site-packages/hail:\n'
                '    spark.jars=HAIL_DIR/hail-all-spark.jar\n'
                '    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar\n'
                '    spark.executor.extraClassPath=./hail-all-spark.jar')
    else:
        SparkContext._ensure_initialized()

    self._gateway = SparkContext._gateway
    self._jvm = SparkContext._jvm

    # hail package
    self._hail = getattr(self._jvm, 'is').hail

    self._warn_cols_order = True
    self._warn_entries_order = True

    Env._jvm = self._jvm
    Env._gateway = self._gateway

    jsc = sc._jsc.sc() if sc else None

    if _backend is None:
        if os.environ.get('HAIL_APISERVER_URL') is not None:
            _backend = ServiceBackend()
        else:
            _backend = SparkBackend()
    self._backend = _backend

    tmp_dir = get_env_or_default(tmp_dir, 'TMPDIR', '/tmp')

    optimizer_iterations = get_env_or_default(optimizer_iterations, 'HAIL_OPTIMIZER_ITERATIONS', 3)

    py_version = version()

    if log is None:
        log = hail.utils.timestamp_path(os.path.join(os.getcwd(), 'hail'),
                                        suffix=f'-{py_version}.log')
    self._log = log

    # we always pass 'quiet' to the JVM because stderr output needs
    # to be routed through Python separately.
    if idempotent:
        self._jhc = self._hail.HailContext.getOrCreate(
            jsc, app_name, joption(master), local, log, True, append,
            min_block_size, branching_factor, tmp_dir, optimizer_iterations)
    else:
        self._jhc = self._hail.HailContext.apply(
            jsc, app_name, joption(master), local, log, True, append,
            min_block_size, branching_factor, tmp_dir, optimizer_iterations)

    self._jsc = self._jhc.sc()
    self.sc = sc if sc else SparkContext(gateway=self._gateway, jsc=self._jvm.JavaSparkContext(self._jsc))
    self._jspark_session = self._jhc.sparkSession()
    self._spark_session = SparkSession(self.sc, self._jhc.sparkSession())

    super(HailContext, self).__init__()

    # do this at the end in case something errors, so we don't raise the above error without a real HC
    Env._hc = self

    ReferenceGenome._from_config(_backend.get_reference('GRCh37'), True)
    ReferenceGenome._from_config(_backend.get_reference('GRCh38'), True)
    ReferenceGenome._from_config(_backend.get_reference('GRCm38'), True)

    if default_reference in ReferenceGenome._references:
        self._default_ref = ReferenceGenome._references[default_reference]
    else:
        self._default_ref = ReferenceGenome.read(default_reference)

    jar_version = self._jhc.version()
    if jar_version != py_version:
        raise RuntimeError(f"Hail version mismatch between JAR and Python library\n"
                           f"  JAR:    {jar_version}\n"
                           f"  Python: {py_version}")

    if not quiet:
        sys.stderr.write('Running on Apache Spark version {}\n'.format(self.sc.version))
        if self._jsc.uiWebUrl().isDefined():
            sys.stderr.write('SparkUI available at {}\n'.format(self._jsc.uiWebUrl().get()))

        connect_logger('localhost', 12888)

        self._hail.HailContext.startProgressBar(self._jsc)

        sys.stderr.write(
            'Welcome to\n'
            '     __  __     <>__\n'
            '    / /_/ /__  __/ /\n'
            '   / __  / _ `/ / /\n'
            '  /_/ /_/\\_,_/_/_/   version {}\n'.format(py_version))

        if py_version.startswith('devel'):
            sys.stderr.write('NOTE: This is a beta version. Interfaces may change\n'
                             '  during the beta period. We recommend pulling\n'
                             '  the latest changes weekly.\n')
        sys.stderr.write(f'LOGGING: writing to {log}\n')

    install_exception_handler()
    Env.set_seed(global_seed)
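For reference, the HailContext constructors above are normally reached through Hail's top-level init() rather than called directly; a minimal sketch with illustrative option values (the dataset created by balding_nichols_model is only a convenient smoke test):

import hail as hl

# hl.init() builds the HailContext shown above and triggers
# SparkContext._ensure_initialized() under the hood.
hl.init(app_name='Hail', master='local[*]', default_reference='GRCh37', quiet=True)

mt = hl.balding_nichols_model(n_populations=3, n_samples=10, n_variants=100)
print(mt.count())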