Example no. 1
    def _write_mapred_site(self):
        self._jh_port = find_unused_port()
        self._jh_web_port = find_unused_port()
        self._mr_shuffle_port = find_unused_port()

        mapred_configs = {
            'mapred.job.tracker':
            '%s:%s' % (
                self._fqdn,
                self._rm_port,
            ),
            'mapreduce.framework.name':
            'yarn',
            'mapreduce.jobhistory.address':
            '%s:%s' % (
                self._fqdn,
                self._jh_port,
            ),
            'mapreduce.jobhistory.webapp.address':
            '%s:%s' % (
                self._fqdn,
                self._jh_web_port,
            ),
            'mapreduce.task.tmp.dir':
            self._tmppath('tasks'),
            'mapreduce.shuffle.port':
            self._mr_shuffle_port,
        }
        self._mapred_site = self._tmppath('conf/mapred-site.xml')
        write_config(mapred_configs, self._tmppath('conf/mapred-site.xml'))
Example no. 2
  def _write_mapred_site(self):
    self._jh_port = find_unused_port()
    self._jh_web_port = find_unused_port()
    self._mr_shuffle_port = find_unused_port()

    mapred_configs = {
      'mapred.job.tracker': '%s:%s' % (self._fqdn, self._rm_port,),
      'mapreduce.framework.name': 'yarn',
      'mapreduce.jobhistory.address': '%s:%s' % (self._fqdn, self._jh_port,),
      'mapreduce.jobhistory.webapp.address': '%s:%s' % (self._fqdn, self._jh_web_port,),
      'mapreduce.task.tmp.dir': self._tmppath('tasks'),
      'mapreduce.shuffle.port': self._mr_shuffle_port,
    }
    self._mapred_site = self._tmppath('conf/mapred-site.xml')
    write_config(mapred_configs, self._tmppath('conf/mapred-site.xml'))
Example no. 3
    def _write_mapred_site(self):
        self._jh_port = find_unused_port()
        self._jh_web_port = find_unused_port()
        self._mr_shuffle_port = find_unused_port()

        mapred_configs = {
            "mapred.job.tracker": "%s:%s" % (self._fqdn, self._rm_port),
            "mapreduce.framework.name": "yarn",
            "mapreduce.jobhistory.address": "%s:%s" % (self._fqdn, self._jh_port),
            "mapreduce.jobhistory.webapp.address": "%s:%s" % (self._fqdn, self._jh_web_port),
            "mapreduce.task.tmp.dir": self._tmppath("tasks"),
            "mapreduce.shuffle.port": self._mr_shuffle_port,
        }
        self._mapred_site = self._tmppath("conf/mapred-site.xml")
        write_config(mapred_configs, self._tmppath("conf/mapred-site.xml"))
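
Each of the _write_*_site helpers in these examples hands its dict off to a write_config function that the snippets never show. As a rough sketch only, not the actual implementation: assuming it takes a dict of Hadoop properties, a destination path, and, as the minicluster example further down suggests, an optional list of property names to keep, it would emit a standard Hadoop *-site.xml file along these lines.

# Minimal sketch, not the real helper: serialize a dict of Hadoop properties
# as a *-site.xml configuration file. The optional `variables` list restricts
# the output to a subset of keys, mirroring the three-argument write_config
# calls in the minicluster example further down. XML escaping is omitted.
def write_config(config, path, variables=None):
  names = variables if variables is not None else sorted(config.keys())
  with open(path, 'w') as f:
    f.write('<?xml version="1.0"?>\n<configuration>\n')
    for name in names:
      f.write('  <property>\n')
      f.write('    <name>%s</name>\n' % name)
      f.write('    <value>%s</value>\n' % config[name])
      f.write('  </property>\n')
    f.write('</configuration>\n')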
Example no. 4
    def _write_yarn_site(self):
        self._rm_resource_port = find_unused_port()
        self._rm_port = find_unused_port()
        self._rm_scheduler_port = find_unused_port()
        self._rm_admin_port = find_unused_port()
        self._rm_webapp_port = find_unused_port()
        self._nm_port = find_unused_port()
        self._nm_webapp_port = find_unused_port()

        yarn_configs = {
            "yarn.resourcemanager.resource-tracker.address": "%s:%s" % (self._fqdn, self._rm_resource_port),
            "yarn.resourcemanager.address": "%s:%s" % (self._fqdn, self._rm_port),
            "yarn.resourcemanager.scheduler.address": "%s:%s" % (self._fqdn, self._rm_scheduler_port),
            "yarn.resourcemanager.scheduler.class": "org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler",
            "yarn.resourcemanager.admin.address": "%s:%s" % (self._fqdn, self._rm_admin_port),
            "yarn.resourcemanager.webapp.address": "%s:%s" % (self._fqdn, self._rm_webapp_port),
            "yarn.log-aggregation-enable": "true",
            "yarn.dispatcher.exit-on-error": "true",
            "yarn.nodemanager.local-dirs": self._local_dir,
            "yarn.nodemanager.log-dirs": self._logpath("yarn-logs"),
            "yarn.nodemanager.remote-app-log-dir": "/var/log/hadoop-yarn/apps",
            "yarn.nodemanager.localizer.address": "%s:%s" % (self._fqdn, self._nm_port),
            "yarn.nodemanager.aux-services": "mapreduce_shuffle",
            "yarn.nodemanager.aux-services.mapreduce.shuffle.class": "org.apache.hadoop.mapred.ShuffleHandler",
            "yarn.nodemanager.webapp.address": "%s:%s" % (self._fqdn, self._nm_webapp_port),
            "yarn.app.mapreduce.am.staging-dir": "/tmp/hadoop-yarn/staging",
            "yarn.application.classpath": """$HADOOP_CONF_DIR,
        $HADOOP_COMMON_HOME/share/hadoop/common/*,$HADOOP_COMMON_HOME/share/hadoop/common/lib/*,
        $HADOOP_HDFS_HOME/share/hadoop/hdfs/*,$HADOOP_HDFS_HOME/share/hadoop/hdfs/lib/*,
        $HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*,$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*,
        $HADOOP_YARN_HOME/share/hadoop/yarn/*,$HADOOP_YARN_HOME/share/hadoop/yarn/lib/*""",
        }
        self._yarn_site = self._tmppath("conf/yarn-site.xml")
        write_config(yarn_configs, self._tmppath("conf/yarn-site.xml"))
Example no. 5
  def _write_core_site(self):
    self._namenode_port = find_unused_port()
    self._fs_default_name = 'hdfs://%s:%s' % (self._fqdn, self._namenode_port,)

    core_configs = {
      'fs.default.name': self._fs_default_name,
      'hadoop.security.authorization': 'true',
      'hadoop.security.authentication': 'simple',
      'hadoop.proxyuser.hue.hosts': '*',
      'hadoop.proxyuser.hue.groups': '*',
      'hadoop.proxyuser.oozie.hosts': '*',
      'hadoop.proxyuser.oozie.groups': '*',
      'hadoop.proxyuser.%s.hosts' % (getpass.getuser(),): '*',
      'hadoop.proxyuser.%s.groups' % (getpass.getuser(),): '*',
      'hadoop.tmp.dir': self._tmppath('hadoop_tmp_dir'),
      'fs.trash.interval': 10
    }
    self._core_site = self._tmppath('conf/core-site.xml')
    write_config(core_configs, self._core_site)
Example no. 6
  def _write_hdfs_site(self):
    self._dfs_http_port = find_unused_port()
    self._dfs_http_address = '%s:%s' % (self._fqdn, self._dfs_http_port)

    hdfs_configs = {
      'dfs.webhdfs.enabled': 'true',
      'dfs.http.address': self._dfs_http_address,
      'dfs.namenode.safemode.extension': 1,
      'dfs.namenode.safemode.threshold-pct': 0,
      'dfs.datanode.address': '%s:0' % self._fqdn,
      'dfs.datanode.http.address': '0.0.0.0:0', # Work around webhdfs redirect bug -- bind to all interfaces
      'dfs.datanode.ipc.address': '%s:0' % self._fqdn,
      'dfs.replication': 1,
      'dfs.safemode.min.datanodes': 1,
      'dfs.namenode.fs-limits.min-block-size': '1000',
      'dfs.permissions': 'true'
    }
    self._hdfs_site = self._tmppath('conf/hdfs-site.xml')
    write_config(hdfs_configs, self._hdfs_site)
Example no. 9
    def _write_core_site(self):
        self._namenode_port = find_unused_port()
        self._fs_default_name = "hdfs://%s:%s" % (self._fqdn, self._namenode_port)

        core_configs = {
            "fs.default.name": self._fs_default_name,
            "hadoop.security.authorization": "true",
            "hadoop.security.authentication": "simple",
            "hadoop.proxyuser.hue.hosts": "*",
            "hadoop.proxyuser.hue.groups": "*",
            "hadoop.proxyuser.oozie.hosts": "*",
            "hadoop.proxyuser.oozie.groups": "*",
            "hadoop.proxyuser.%s.hosts" % (getpass.getuser(),): "*",
            "hadoop.proxyuser.%s.groups" % (getpass.getuser(),): "*",
            "hadoop.tmp.dir": self._tmppath("hadoop_tmp_dir"),
            "fs.trash.interval": 10,
        }
        self._core_site = self._tmppath("conf/core-site.xml")
        write_config(core_configs, self._core_site)
Example no. 10
    def _write_hdfs_site(self):
        self._dfs_http_port = find_unused_port()
        self._dfs_http_address = "%s:%s" % (self._fqdn, self._dfs_http_port)

        hdfs_configs = {
            "dfs.webhdfs.enabled": "true",
            "dfs.http.address": self._dfs_http_address,
            "dfs.namenode.safemode.extension": 1,
            "dfs.namenode.safemode.threshold-pct": 0,
            "dfs.datanode.address": "%s:0" % self._fqdn,
            "dfs.datanode.http.address": "0.0.0.0:0",  # Work around webhdfs redirect bug -- bind to all interfaces
            "dfs.datanode.ipc.address": "%s:0" % self._fqdn,
            "dfs.replication": 1,
            "dfs.safemode.min.datanodes": 1,
            "dfs.namenode.fs-limits.min-block-size": "1000",
            "dfs.permissions": "true",
        }
        self._hdfs_site = self._tmppath("conf/hdfs-site.xml")
        write_config(hdfs_configs, self._hdfs_site)
Example no. 11
  def _write_yarn_site(self):
    self._rm_resource_port = find_unused_port()
    self._rm_port = find_unused_port()
    self._rm_scheduler_port = find_unused_port()
    self._rm_admin_port = find_unused_port()
    self._rm_webapp_port = find_unused_port()
    self._nm_port = find_unused_port()
    self._nm_webapp_port = find_unused_port()

    yarn_configs = {
      'yarn.resourcemanager.resource-tracker.address': '%s:%s' % (self._fqdn, self._rm_resource_port,),
      'yarn.resourcemanager.address': '%s:%s' % (self._fqdn, self._rm_port,),
      'yarn.resourcemanager.scheduler.address': '%s:%s' % (self._fqdn, self._rm_scheduler_port,),
      'yarn.resourcemanager.scheduler.class': 'org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler',
      'yarn.resourcemanager.admin.address': '%s:%s' % (self._fqdn, self._rm_admin_port,),
      'yarn.resourcemanager.webapp.address': '%s:%s' % (self._fqdn, self._rm_webapp_port,),

      'yarn.log-aggregation-enable': 'true',
      'yarn.dispatcher.exit-on-error': 'true',

      'yarn.nodemanager.local-dirs': self._local_dir,
      'yarn.nodemanager.log-dirs': self._logpath('yarn-logs'),
      'yarn.nodemanager.remote-app-log-dir': '/var/log/hadoop-yarn/apps',
      'yarn.nodemanager.localizer.address' : '%s:%s' % (self._fqdn, self._nm_port,),
      'yarn.nodemanager.aux-services': 'mapreduce_shuffle',
      'yarn.nodemanager.aux-services.mapreduce.shuffle.class': 'org.apache.hadoop.mapred.ShuffleHandler',
      'yarn.nodemanager.webapp.address': '%s:%s' % (self._fqdn, self._nm_webapp_port,),

      'yarn.app.mapreduce.am.staging-dir': '/tmp/hadoop-yarn/staging',

      'yarn.application.classpath':
      '''$HADOOP_CONF_DIR,
        $HADOOP_COMMON_HOME/share/hadoop/common/*,$HADOOP_COMMON_HOME/share/hadoop/common/lib/*,
        $HADOOP_HDFS_HOME/share/hadoop/hdfs/*,$HADOOP_HDFS_HOME/share/hadoop/hdfs/lib/*,
        $HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*,$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*,
        $HADOOP_YARN_HOME/share/hadoop/yarn/*,$HADOOP_YARN_HOME/share/hadoop/yarn/lib/*''',
    }
    self._yarn_site = self._tmppath('conf/yarn-site.xml')
    write_config(yarn_configs, self._tmppath('conf/yarn-site.xml'))
Example no. 13
  def __init__(self):
    self.port = python_util.find_unused_port()
    self.pid = 0
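
find_unused_port itself never appears in these snippets, only its callers. A plausible sketch of what desktop.lib.python_util does (the real implementation may differ) is the usual bind-to-port-0 trick; it also makes the race mentioned in the minicluster comments further down easy to see, since the port is released before the service that wants it actually binds it.

import socket

# Plausible sketch only: ask the OS for an ephemeral port by binding to port 0,
# read back the assigned port, then release the socket. Another process can
# grab the port between close() and the eventual bind by the service, which is
# the race acknowledged in the dfs.thrift.address comment further down.
def find_unused_port():
  s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
  try:
    s.bind(('127.0.0.1', 0))
    return s.getsockname()[1]
  finally:
    s.close()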
Example no. 14
import logging
import subprocess

from nose.tools import assert_true, assert_false
from django.core.urlresolvers import reverse
from django.contrib.auth.models import User

from desktop.lib.django_test_util import make_logged_in_client
from desktop.lib.paths import get_run_root
from desktop.lib.python_util import find_unused_port
from desktop.lib.security_util import get_localhost_name
from hadoop import pseudo_hdfs4

import beeswax.conf

from beeswax.server.dbms import get_query_server_config
from beeswax.server import dbms

HIVE_SERVER_TEST_PORT = find_unused_port()
_INITIALIZED = False
_SHARED_HIVE_SERVER_PROCESS = None
_SHARED_HIVE_SERVER = None
_SHARED_HIVE_SERVER_CLOSER = None

LOG = logging.getLogger(__name__)


def _start_server(cluster):
    args = [beeswax.conf.HIVE_SERVER_BIN.get()]

    env = cluster._mr2_env.copy()

    hadoop_cp_proc = subprocess.Popen(
        args=[get_run_root('ext/hadoop/hadoop') + '/bin/hadoop', 'classpath'],
        env=env,
        cwd=cluster._tmpdir,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)
Example no. 15
  def start(self, extra_configs=None):
    """
    Start a cluster as a subprocess.
    """
    self.tmpdir = tempfile.mkdtemp()

    if not extra_configs:
      extra_configs = {}

    def tmppath(filename):
      """Creates paths in tmpdir."""
      return os.path.join(self.tmpdir, filename)

    LOGGER.info("Using temporary directory: %s" % self.tmpdir)

    in_conf_dir = tmppath("in-conf")
    os.mkdir(in_conf_dir)
    self.log_dir = tmppath("logs")
    os.mkdir(self.log_dir)
    f = file(os.path.join(in_conf_dir, "hadoop-metrics.properties"), "w")
    try:
      f.write("""
dfs.class=org.apache.hadoop.metrics.spi.NoEmitMetricsContext
mapred.class=org.apache.hadoop.metrics.spi.NoEmitMetricsContext
jvm.class=org.apache.hadoop.metrics.spi.NoEmitMetricsContext
rpc.class=org.apache.hadoop.metrics.spi.NoEmitMetricsContext
""")
    finally:
      f.close()

    if self.superuser not in TEST_USER_GROUP_MAPPING:
      TEST_USER_GROUP_MAPPING[self.superuser] = [self.superuser]

    _write_static_group_mapping(TEST_USER_GROUP_MAPPING,
      tmppath('ugm.properties'))

    core_configs = {
      'hadoop.proxyuser.%s.groups' % (self.superuser,): 'users,supergroup',
      'hadoop.proxyuser.%s.hosts' % (self.superuser,): 'localhost',
      'mapred.jobtracker.plugins': CLUSTER_JT_PLUGINS}

    extra_configs.update(STARTUP_CONFIGS)
    write_config(core_configs, tmppath('in-conf/core-site.xml'))

    write_config({'mapred.jobtracker.taskScheduler': CLUSTER_TASK_SCHEDULER,
                  'mapred.queue.names': CLUSTER_QUEUE_NAMES},
                 tmppath('in-conf/mapred-site.xml'))

    hadoop_policy_keys = ['client', 'client.datanode', 'datanode', 'inter.datanode', 'namenode', 'inter.tracker', 'job.submission', 'task.umbilical', 'refresh.policy', 'admin.operations']
    hadoop_policy_config = {}
    for policy in hadoop_policy_keys:
      hadoop_policy_config['security.' + policy + '.protocol.acl'] = '*'
    write_config(hadoop_policy_config, tmppath('in-conf/hadoop-policy.xml'))

    details_file = file(tmppath("details.json"), "w+")
    try:
      args = [ os.path.join(hadoop.conf.HADOOP_MR1_HOME.get(), 'bin', 'hadoop'),
        "jar",
        hadoop.conf.HADOOP_TEST_JAR.get(),
        "minicluster",
        "-writeConfig", tmppath("config.xml"), 
        "-writeDetails", tmppath("details.json"),
        "-datanodes", str(self.num_datanodes),
        "-tasktrackers", str(self.num_tasktrackers),
        "-useloopbackhosts",
        "-D", "hadoop.tmp.dir=%s" % self.tmpdir,
        "-D", "mapred.local.dir=%s/mapred/local" % self.tmpdir,
        "-D", "mapred.system.dir=/mapred/system",
        "-D", "mapred.temp.dir=/mapred/temp",
        "-D", "jobclient.completion.poll.interval=100",
        "-D", "jobclient.progress.monitor.poll.interval=100",
        "-D", "fs.checkpoint.period=1",
        # For a reason I don't fully understand, this must be 0.0.0.0 and not 'localhost'
        "-D", "dfs.secondary.http.address=0.0.0.0:%d" % python_util.find_unused_port(),
        # We bind the NN's thrift interface to a port we find here.
        # This is suboptimal, since there's a race.  Alas, if we don't
        # do this here, the datanodes fail to discover the namenode's thrift
        # address, and there's a race there
        "-D", "dfs.thrift.address=localhost:%d" % python_util.find_unused_port(),
        "-D", "jobtracker.thrift.address=localhost:%d" % python_util.find_unused_port(),
        # Jobs realize they have finished faster with this timeout.
        "-D", "jobclient.completion.poll.interval=50",
        "-D", "hadoop.security.authorization=true",
        "-D", "hadoop.policy.file=%s/hadoop-policy.xml" % in_conf_dir,
      ]

      for key,value in extra_configs.iteritems():
        args.append("-D")
        args.append(key + "=" + value)

      env = {}
      env["HADOOP_CONF_DIR"] = in_conf_dir
      env["HADOOP_OPTS"] = "-Dtest.build.data=%s" % (self.tmpdir, )
      env["HADOOP_CLASSPATH"] = ':'.join([
        # -- BEGIN JAVA TRIVIA --
        # Add the -test- jar to the classpath to work around a subtle issue
        # involving Java classloaders. In brief, hadoop's RunJar class creates
        # a child classloader with the test jar on it, but the core classes
        # are loaded by the system classloader. This is fine except that
        # some classes in the test jar extend package-protected classes in the
        # core jar. Even though the classes are in the same package name, they
        # are thus loaded by different classloaders and therefore an IllegalAccessError
        # prevents the MiniMRCluster from starting. Adding the test jar to the system
        # classpath prevents this error since then both the MiniMRCluster and the
        # core classes are loaded by the system classloader.
        hadoop.conf.HADOOP_TEST_JAR.get(),
        # -- END JAVA TRIVIA --
        hadoop.conf.HADOOP_PLUGIN_CLASSPATH.get(),
        # Due to CDH-4537, we need to add test dependencies to run minicluster
        os.path.join(os.path.dirname(__file__), 'test_jars', '*'),
      ])
      env["HADOOP_HEAPSIZE"] = "128"
      env["HADOOP_HOME"] = hadoop.conf.HADOOP_MR1_HOME.get()
      env["HADOOP_LOG_DIR"] = self.log_dir
      env["USER"] = self.superuser
      if "JAVA_HOME" in os.environ:
        env["JAVA_HOME"] = os.environ["JAVA_HOME"]
      # Wait for the debugger to attach
      if DEBUG_HADOOP:
        env["HADOOP_OPTS"] = env.get("HADOOP_OPTS", "") + " -Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=9999"

      if USE_STDERR:
        stderr=sys.stderr
      else:
        stderr=file(tmppath("stderr"), "w")
      LOGGER.debug("Starting minicluster: %s env: %s" % (repr(args), repr(env)))
      self.clusterproc = subprocess.Popen(
        args=args,
        stdout=file(tmppath("stdout"), "w"),
        stderr=stderr,
        env=env)

      details = {}
      start = time.time()
      # We consider the cluster started when the details file parses correct JSON.
      # MiniHadoopCluster currently writes the details file last, and this depends
      # on that.
      while not details:
        try:
          details_file.seek(0)
          details = simplejson.load(details_file)
        except ValueError:
          pass
        if self.clusterproc.poll() is not None or (not DEBUG_HADOOP and (time.time() - start) > MAX_CLUSTER_STARTUP_TIME):
          LOGGER.debug("stdout:" + file(tmppath("stdout")).read())
          if not USE_STDERR:
            LOGGER.debug("stderr:" + file(tmppath("stderr")).read())
          self.stop()
          raise Exception("Cluster process quit or is taking too long to start.  Aborting.")
    finally:
      details_file.close()

    LOGGER.debug("Successfully started minicluster")

    # Place all the details as attributes on self.
    for k, v in details.iteritems():
      setattr(self, k, v)

    # Parse the configuration using XPath and place into self.config.
    config = lxml.etree.parse(tmppath("config.xml"))
    self.config = dict( (property.find("./name").text, property.find("./value").text) 
      for property in config.xpath("/configuration/property"))

    # Write out Hadoop-style configuration directory, 
    # which can, in turn, be used for /bin/hadoop.
    self.config_dir = tmppath("conf")
    os.mkdir(self.config_dir)

    hadoop.conf.HADOOP_CONF_DIR.set_for_testing(self.config_dir)

    write_config(self.config, tmppath("conf/core-site.xml"), 
      ["fs.defaultFS", "jobclient.completion.poll.interval",
       "dfs.namenode.checkpoint.period", "dfs.namenode.checkpoint.dir",
       'hadoop.proxyuser.'+self.superuser+'.groups', 'hadoop.proxyuser.'+self.superuser+'.hosts'])
    write_config(self.config, tmppath("conf/hdfs-site.xml"), ["fs.defaultFS", "dfs.namenode.http-address", "dfs.namenode.secondary.http-address"])
    # mapred.job.tracker isn't written out into self.config, so we fill
    # that one out more manually.
    write_config({ 'mapred.job.tracker': 'localhost:%d' % self.jobtracker_port },
                 tmppath("conf/mapred-site.xml"))
    write_config(hadoop_policy_config, tmppath('conf/hadoop-policy.xml'))

    # Once the config is written out, we can start the 2NN.
    args = [hadoop.conf.HADOOP_BIN.get(),
      '--config', self.config_dir,
      'secondarynamenode']

    LOGGER.debug("Starting 2NN at: " +
      self.config['dfs.secondary.http.address'])
    LOGGER.debug("2NN command: %s env: %s" % (repr(args), repr(env)))

    self.secondary_proc = subprocess.Popen(
      args=args,
      stdout=file(tmppath("stdout.2nn"), "w"),
      stderr=file(tmppath("stderr.2nn"), "w"),
      env=env)

    while True:
      try:
        response = urllib2.urlopen(urllib2.Request('http://' +
          self.config['dfs.secondary.http.address']))
      except urllib2.URLError:
        # If we should abort startup.
        if self.secondary_proc.poll() is not None or (not DEBUG_HADOOP and (time.time() - start) > MAX_CLUSTER_STARTUP_TIME):
          LOGGER.debug("stdout:" + file(tmppath("stdout")).read())
          if not USE_STDERR:
            LOGGER.debug("stderr:" + file(tmppath("stderr")).read())
          self.stop()
          raise Exception("2nn process quit or is taking too long to start. Aborting.")
          break
        else:
          time.sleep(1)
          continue

      # We didn't get a URLError. 2NN started successfully.
      response.close()
      break

    LOGGER.debug("Successfully started 2NN")
Example no. 16
import logging
import subprocess

from django.contrib.auth.models import User


from desktop.lib.django_test_util import make_logged_in_client
from desktop.lib.paths import get_run_root
from desktop.lib.python_util import find_unused_port
from desktop.lib.security_util import get_localhost_name
from hadoop import pseudo_hdfs4

import beeswax.conf

from beeswax.server.dbms import get_query_server_config
from beeswax.server import dbms


HIVE_SERVER_TEST_PORT = find_unused_port()
_INITIALIZED = False
_SHARED_HIVE_SERVER_PROCESS = None
_SHARED_HIVE_SERVER = None
_SHARED_HIVE_SERVER_CLOSER = None


LOG = logging.getLogger(__name__)


def _start_server(cluster):
  args = [beeswax.conf.HIVE_SERVER_BIN.get()]

  env = cluster._mr2_env.copy()

  hadoop_cp_proc = subprocess.Popen(args=[get_run_root('ext/hadoop/hadoop') + '/bin/hadoop', 'classpath'], env=env, cwd=cluster._tmpdir, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
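
The snippet stops right after launching `hadoop classpath`; what _start_server does next is not shown here. A hedged guess at the usual follow-up, reusing the hadoop_cp_proc and env names from the snippet above, is to read the subprocess output and pass the classpath on to the Hive server environment.

  # Hedged sketch, not taken from the original source: collect the classpath
  # that the `hadoop classpath` subprocess printed and expose it to the Hive
  # server process via its environment.
  stdout, stderr = hadoop_cp_proc.communicate()
  if hadoop_cp_proc.returncode != 0:
    raise Exception('Failed to resolve the Hadoop classpath: %s' % stderr)
  env['HADOOP_CLASSPATH'] = stdout.strip()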