def _write_mapred_site(self):
  self._jh_port = find_unused_port()
  self._jh_web_port = find_unused_port()
  self._mr_shuffle_port = find_unused_port()

  mapred_configs = {
    'mapred.job.tracker': '%s:%s' % (self._fqdn, self._rm_port,),
    'mapreduce.framework.name': 'yarn',
    'mapreduce.jobhistory.address': '%s:%s' % (self._fqdn, self._jh_port,),
    'mapreduce.jobhistory.webapp.address': '%s:%s' % (self._fqdn, self._jh_web_port,),
    'mapreduce.task.tmp.dir': self._tmppath('tasks'),
    'mapreduce.shuffle.port': self._mr_shuffle_port,
  }

  self._mapred_site = self._tmppath('conf/mapred-site.xml')
  write_config(mapred_configs, self._mapred_site)
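# Illustrative sketch (an assumption, not the Hue implementation):
# `find_unused_port` is imported from desktop.lib.python_util, and the usual
# technique behind such helpers is to bind a socket to port 0 so the kernel
# assigns a free port, read it back, and release the socket. Note the inherent
# race: another process can grab the port between close() and the daemon
# binding it, which is what the "there's a race" comments elsewhere in this
# module allude to.
import socket

def _find_unused_port_sketch():
  s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
  s.bind(('127.0.0.1', 0))  # Port 0 asks the kernel for any free port.
  port = s.getsockname()[1]
  s.close()  # Racy: nothing stops another process from taking the port now.
  return port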
def _write_core_site(self):
  self._namenode_port = find_unused_port()
  self._fs_default_name = 'hdfs://%s:%s' % (self._fqdn, self._namenode_port,)

  core_configs = {
    'fs.default.name': self._fs_default_name,
    'hadoop.security.authorization': 'true',
    'hadoop.security.authentication': 'simple',
    'hadoop.proxyuser.hue.hosts': '*',
    'hadoop.proxyuser.hue.groups': '*',
    'hadoop.proxyuser.oozie.hosts': '*',
    'hadoop.proxyuser.oozie.groups': '*',
    'hadoop.proxyuser.%s.hosts' % (getpass.getuser(),): '*',
    'hadoop.proxyuser.%s.groups' % (getpass.getuser(),): '*',
    'hadoop.tmp.dir': self._tmppath('hadoop_tmp_dir'),
    'fs.trash.interval': 10,
  }

  self._core_site = self._tmppath('conf/core-site.xml')
  write_config(core_configs, self._core_site)
def _write_hdfs_site(self):
  self._dfs_http_port = find_unused_port()
  self._dfs_http_address = '%s:%s' % (self._fqdn, self._dfs_http_port)

  hdfs_configs = {
    'dfs.webhdfs.enabled': 'true',
    'dfs.http.address': self._dfs_http_address,
    'dfs.namenode.safemode.extension': 1,
    'dfs.namenode.safemode.threshold-pct': 0,
    'dfs.datanode.address': '%s:0' % self._fqdn,
    # Work around webhdfs redirect bug -- bind to all interfaces.
    'dfs.datanode.http.address': '0.0.0.0:0',
    'dfs.datanode.ipc.address': '%s:0' % self._fqdn,
    'dfs.replication': 1,
    'dfs.safemode.min.datanodes': 1,
    'dfs.namenode.fs-limits.min-block-size': '1000',
    'dfs.permissions': 'true',
  }

  self._hdfs_site = self._tmppath('conf/hdfs-site.xml')
  write_config(hdfs_configs, self._hdfs_site)
def _write_yarn_site(self):
  self._rm_resource_port = find_unused_port()
  self._rm_port = find_unused_port()
  self._rm_scheduler_port = find_unused_port()
  self._rm_admin_port = find_unused_port()
  self._rm_webapp_port = find_unused_port()
  self._nm_port = find_unused_port()
  self._nm_webapp_port = find_unused_port()

  yarn_configs = {
    'yarn.resourcemanager.resource-tracker.address': '%s:%s' % (self._fqdn, self._rm_resource_port,),
    'yarn.resourcemanager.address': '%s:%s' % (self._fqdn, self._rm_port,),
    'yarn.resourcemanager.scheduler.address': '%s:%s' % (self._fqdn, self._rm_scheduler_port,),
    'yarn.resourcemanager.scheduler.class': 'org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler',
    'yarn.resourcemanager.admin.address': '%s:%s' % (self._fqdn, self._rm_admin_port,),
    'yarn.resourcemanager.webapp.address': '%s:%s' % (self._fqdn, self._rm_webapp_port,),

    'yarn.log-aggregation-enable': 'true',
    'yarn.dispatcher.exit-on-error': 'true',

    'yarn.nodemanager.local-dirs': self._local_dir,
    'yarn.nodemanager.log-dirs': self._logpath('yarn-logs'),
    'yarn.nodemanager.remote-app-log-dir': '/var/log/hadoop-yarn/apps',
    'yarn.nodemanager.localizer.address': '%s:%s' % (self._fqdn, self._nm_port,),
    'yarn.nodemanager.aux-services': 'mapreduce_shuffle',
    'yarn.nodemanager.aux-services.mapreduce.shuffle.class': 'org.apache.hadoop.mapred.ShuffleHandler',
    'yarn.nodemanager.webapp.address': '%s:%s' % (self._fqdn, self._nm_webapp_port,),

    'yarn.app.mapreduce.am.staging-dir': '/tmp/hadoop-yarn/staging',

    'yarn.application.classpath':
      '''$HADOOP_CONF_DIR,
        $HADOOP_COMMON_HOME/share/hadoop/common/*,$HADOOP_COMMON_HOME/share/hadoop/common/lib/*,
        $HADOOP_HDFS_HOME/share/hadoop/hdfs/*,$HADOOP_HDFS_HOME/share/hadoop/hdfs/lib/*,
        $HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*,$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*,
        $HADOOP_YARN_HOME/share/hadoop/yarn/*,$HADOOP_YARN_HOME/share/hadoop/yarn/lib/*''',
  }

  self._yarn_site = self._tmppath('conf/yarn-site.xml')
  write_config(yarn_configs, self._yarn_site)
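# Illustrative sketch (an assumption about `write_config`, which is defined
# elsewhere in this module and not shown in this excerpt): each *-site.xml
# written above is a standard Hadoop configuration file, one <property>
# element per key/value pair. Something like the following would produce it.
def _write_config_sketch(configs, path):
  """Write `configs` as a minimal Hadoop-style XML configuration file."""
  from xml.sax.saxutils import escape

  f = open(path, 'w')
  try:
    f.write('<?xml version="1.0"?>\n<configuration>\n')
    for name, value in sorted(configs.items()):
      f.write('  <property>\n'
              '    <name>%s</name>\n'
              '    <value>%s</value>\n'
              '  </property>\n' % (escape(str(name)), escape(str(value))))
    f.write('</configuration>\n')
  finally:
    f.close()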
def __init__(self):
  self.port = python_util.find_unused_port()
  self.pid = 0
import logging
import subprocess

from nose.tools import assert_true, assert_false

from django.core.urlresolvers import reverse
from django.contrib.auth.models import User

from desktop.lib.django_test_util import make_logged_in_client
from desktop.lib.paths import get_run_root
from desktop.lib.python_util import find_unused_port
from desktop.lib.security_util import get_localhost_name
from hadoop import pseudo_hdfs4

import beeswax.conf
from beeswax.server.dbms import get_query_server_config
from beeswax.server import dbms


HIVE_SERVER_TEST_PORT = find_unused_port()
_INITIALIZED = False
_SHARED_HIVE_SERVER_PROCESS = None
_SHARED_HIVE_SERVER = None
_SHARED_HIVE_SERVER_CLOSER = None

LOG = logging.getLogger(__name__)


def _start_server(cluster):
  args = [beeswax.conf.HIVE_SERVER_BIN.get()]

  env = cluster._mr2_env.copy()

  # Probe the Hadoop classpath by shelling out to `hadoop classpath`.
  hadoop_cp_proc = subprocess.Popen(
      args=[get_run_root('ext/hadoop/hadoop') + '/bin/hadoop', 'classpath'],
      env=env,
      cwd=cluster._tmpdir,
      stdout=subprocess.PIPE,
      stderr=subprocess.PIPE)
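  # _start_server is truncated in this excerpt. A plausible continuation
  # (hedged -- not shown in the original) would read the probe's output and
  # splice it into the Hive server's environment before launching it, e.g.:
  #
  #   hadoop_cp, _ = hadoop_cp_proc.communicate()
  #   env['HADOOP_CLASSPATH'] = hadoop_cp.strip()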
def start(self, extra_configs=None):
  """
  Start a cluster as a subprocess.
  """
  self.tmpdir = tempfile.mkdtemp()

  if not extra_configs:
    extra_configs = {}

  def tmppath(filename):
    """Creates paths in tmpdir."""
    return os.path.join(self.tmpdir, filename)

  LOGGER.info("Using temporary directory: %s" % self.tmpdir)

  in_conf_dir = tmppath("in-conf")
  os.mkdir(in_conf_dir)

  self.log_dir = tmppath("logs")
  os.mkdir(self.log_dir)

  f = file(os.path.join(in_conf_dir, "hadoop-metrics.properties"), "w")
  try:
    f.write("""
dfs.class=org.apache.hadoop.metrics.spi.NoEmitMetricsContext
mapred.class=org.apache.hadoop.metrics.spi.NoEmitMetricsContext
jvm.class=org.apache.hadoop.metrics.spi.NoEmitMetricsContext
rpc.class=org.apache.hadoop.metrics.spi.NoEmitMetricsContext
""")
  finally:
    f.close()

  if self.superuser not in TEST_USER_GROUP_MAPPING:
    TEST_USER_GROUP_MAPPING[self.superuser] = [self.superuser]

  _write_static_group_mapping(TEST_USER_GROUP_MAPPING, tmppath('ugm.properties'))

  core_configs = {
    'hadoop.proxyuser.%s.groups' % (self.superuser,): 'users,supergroup',
    'hadoop.proxyuser.%s.hosts' % (self.superuser,): 'localhost',
    'mapred.jobtracker.plugins': CLUSTER_JT_PLUGINS}

  extra_configs.update(STARTUP_CONFIGS)
  write_config(core_configs, tmppath('in-conf/core-site.xml'))

  write_config({'mapred.jobtracker.taskScheduler': CLUSTER_TASK_SCHEDULER,
                'mapred.queue.names': CLUSTER_QUEUE_NAMES},
               tmppath('in-conf/mapred-site.xml'))

  hadoop_policy_keys = ['client', 'client.datanode', 'datanode', 'inter.datanode',
                        'namenode', 'inter.tracker', 'job.submission', 'task.umbilical',
                        'refresh.policy', 'admin.operations']
  hadoop_policy_config = {}
  for policy in hadoop_policy_keys:
    hadoop_policy_config['security.' + policy + '.protocol.acl'] = '*'
  write_config(hadoop_policy_config, tmppath('in-conf/hadoop-policy.xml'))

  details_file = file(tmppath("details.json"), "w+")
  try:
    args = [
      os.path.join(hadoop.conf.HADOOP_MR1_HOME.get(), 'bin', 'hadoop'),
      "jar",
      hadoop.conf.HADOOP_TEST_JAR.get(),
      "minicluster",
      "-writeConfig", tmppath("config.xml"),
      "-writeDetails", tmppath("details.json"),
      "-datanodes", str(self.num_datanodes),
      "-tasktrackers", str(self.num_tasktrackers),
      "-useloopbackhosts",
      "-D", "hadoop.tmp.dir=%s" % self.tmpdir,
      "-D", "mapred.local.dir=%s/mapred/local" % self.tmpdir,
      "-D", "mapred.system.dir=/mapred/system",
      "-D", "mapred.temp.dir=/mapred/temp",
      "-D", "jobclient.completion.poll.interval=100",
      "-D", "jobclient.progress.monitor.poll.interval=100",
      "-D", "fs.checkpoint.period=1",
      # For a reason I don't fully understand, this must be 0.0.0.0 and not 'localhost'.
      "-D", "dfs.secondary.http.address=0.0.0.0:%d" % python_util.find_unused_port(),
      # We bind the NN's thrift interface to a port we find here. This is
      # suboptimal, since there's a race. Alas, if we don't do this here,
      # the datanodes fail to discover the namenode's thrift address, and
      # there's a race there.
      "-D", "dfs.thrift.address=localhost:%d" % python_util.find_unused_port(),
      "-D", "jobtracker.thrift.address=localhost:%d" % python_util.find_unused_port(),
      # Jobs realize they have finished faster with this timeout.
"-D", "jobclient.completion.poll.interval=50", "-D", "hadoop.security.authorization=true", "-D", "hadoop.policy.file=%s/hadoop-policy.xml" % in_conf_dir, ] for key,value in extra_configs.iteritems(): args.append("-D") args.append(key + "=" + value) env = {} env["HADOOP_CONF_DIR"] = in_conf_dir env["HADOOP_OPTS"] = "-Dtest.build.data=%s" % (self.tmpdir, ) env["HADOOP_CLASSPATH"] = ':'.join([ # -- BEGIN JAVA TRIVIA -- # Add the -test- jar to the classpath to work around a subtle issue # involving Java classloaders. In brief, hadoop's RunJar class creates # a child classloader with the test jar on it, but the core classes # are loaded by the system classloader. This is fine except that # some classes in the test jar extend package-protected classes in the # core jar. Even though the classes are in the same package name, they # are thus loaded by different classloaders and therefore an IllegalAccessError # prevents the MiniMRCluster from starting. Adding the test jar to the system # classpath prevents this error since then both the MiniMRCluster and the # core classes are loaded by the system classloader. hadoop.conf.HADOOP_TEST_JAR.get(), # -- END JAVA TRIVIA -- hadoop.conf.HADOOP_PLUGIN_CLASSPATH.get(), # Due to CDH-4537, we need to add test dependencies to run minicluster os.path.join(os.path.dirname(__file__), 'test_jars', '*'), ]) env["HADOOP_HEAPSIZE"] = "128" env["HADOOP_HOME"] = hadoop.conf.HADOOP_MR1_HOME.get() env["HADOOP_LOG_DIR"] = self.log_dir env["USER"] = self.superuser if "JAVA_HOME" in os.environ: env["JAVA_HOME"] = os.environ["JAVA_HOME"] # Wait for the debugger to attach if DEBUG_HADOOP: env["HADOOP_OPTS"] = env.get("HADOOP_OPTS", "") + " -Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=9999" if USE_STDERR: stderr=sys.stderr else: stderr=file(tmppath("stderr"), "w") LOGGER.debug("Starting minicluster: %s env: %s" % (repr(args), repr(env))) self.clusterproc = subprocess.Popen( args=args, stdout=file(tmppath("stdout"), "w"), stderr=stderr, env=env) details = {} start = time.time() # We consider the cluster started when the details file parses correct JSON. # MiniHadoopCluster currently writes the details file last, and this depends # on that. while not details: try: details_file.seek(0) details = simplejson.load(details_file) except ValueError: pass if self.clusterproc.poll() is not None or (not DEBUG_HADOOP and (time.time() - start) > MAX_CLUSTER_STARTUP_TIME): LOGGER.debug("stdout:" + file(tmppath("stdout")).read()) if not USE_STDERR: LOGGER.debug("stderr:" + file(tmppath("stderr")).read()) self.stop() raise Exception("Cluster process quit or is taking too long to start. Aborting.") finally: details_file.close() LOGGER.debug("Successfully started minicluster") # Place all the details as attributes on self. for k, v in details.iteritems(): setattr(self, k, v) # Parse the configuration using XPath and place into self.config. config = lxml.etree.parse(tmppath("config.xml")) self.config = dict( (property.find("./name").text, property.find("./value").text) for property in config.xpath("/configuration/property")) # Write out Hadoop-style configuration directory, # which can, in turn, be used for /bin/hadoop. 
  self.config_dir = tmppath("conf")
  os.mkdir(self.config_dir)

  hadoop.conf.HADOOP_CONF_DIR.set_for_testing(self.config_dir)

  write_config(self.config, tmppath("conf/core-site.xml"),
               ["fs.defaultFS", "jobclient.completion.poll.interval",
                "dfs.namenode.checkpoint.period", "dfs.namenode.checkpoint.dir",
                'hadoop.proxyuser.' + self.superuser + '.groups',
                'hadoop.proxyuser.' + self.superuser + '.hosts'])
  write_config(self.config, tmppath("conf/hdfs-site.xml"),
               ["fs.defaultFS", "dfs.namenode.http-address",
                "dfs.namenode.secondary.http-address"])
  # mapred.job.tracker isn't written out into self.config, so we fill
  # that one out more manually.
  write_config({'mapred.job.tracker': 'localhost:%d' % self.jobtracker_port},
               tmppath("conf/mapred-site.xml"))
  write_config(hadoop_policy_config, tmppath('conf/hadoop-policy.xml'))

  # Once the config is written out, we can start the 2NN.
  args = [hadoop.conf.HADOOP_BIN.get(),
          '--config', self.config_dir,
          'secondarynamenode']

  LOGGER.debug("Starting 2NN at: " + self.config['dfs.secondary.http.address'])
  LOGGER.debug("2NN command: %s env: %s" % (repr(args), repr(env)))

  self.secondary_proc = subprocess.Popen(
    args=args,
    stdout=file(tmppath("stdout.2nn"), "w"),
    stderr=file(tmppath("stderr.2nn"), "w"),
    env=env)

  # Poll the 2NN's HTTP address until it responds, aborting if the process
  # dies or startup takes too long.
  while True:
    try:
      response = urllib2.urlopen(urllib2.Request('http://' + self.config['dfs.secondary.http.address']))
    except urllib2.URLError:
      # If we should abort startup.
      if self.secondary_proc.poll() is not None or (not DEBUG_HADOOP and (time.time() - start) > MAX_CLUSTER_STARTUP_TIME):
        LOGGER.debug("stdout:" + file(tmppath("stdout")).read())
        if not USE_STDERR:
          LOGGER.debug("stderr:" + file(tmppath("stderr")).read())
        self.stop()
        raise Exception("2NN process quit or is taking too long to start. Aborting.")
      else:
        time.sleep(1)
        continue

    # We didn't get a URLError. The 2NN started successfully.
    response.close()
    break

  LOGGER.debug("Successfully started 2NN")
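# Hypothetical usage sketch (the class name and constructor arguments are
# illustrative assumptions, not taken from this excerpt): a test would
# typically bracket its assertions with start()/stop(), e.g.:
#
#   cluster = MiniHadoopCluster(num_datanodes=1, num_tasktrackers=1)
#   cluster.start()
#   try:
#     fs_uri = cluster.config['fs.defaultFS']  # populated from config.xml
#     ...  # run test assertions against HDFS / MapReduce
#   finally:
#     cluster.stop()  # terminates the subprocesses and cleans up tmpdir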