Example #1
def yarn_site_xml_defaults(workdir, node_info):
    '''
    Default entries for the yarn-site.xml config file.
    '''
    mem_dflts = memory_defaults(node_info)
    ncores = node_info['cores']
    max_alloc = round_mb(mem_dflts.ram_per_container *
                         mem_dflts.num_containers)
    min_alloc = round_mb(mem_dflts.ram_per_container)
    dflts = {
        'yarn.nodemanager.aux-services': 'mapreduce_shuffle',
        'yarn.scheduler.maximum-allocation-mb': max_alloc,
        'yarn.scheduler.minimum-allocation-mb': min_alloc,
        'yarn.nodemanager.resource.memory-mb': max_alloc,
        'yarn.nodemanager.vmem-check-enabled': 'false',
        'yarn.nodemanager.vmem-pmem-ratio': 2.1,
        'yarn.nodemanager.hostname': '$dataname',
        'yarn.nodemanager.webapp.address': '$hostaddress:8042',
        'yarn.resourcemanager.hostname': '$masterdataname',
        'yarn.resourcemanager.webapp.address': '$masterhostaddress:8088',
        'yarn.resourcemanager.webapp.https.address': '$masterhostaddress:8090',
        'yarn.resourcemanager.scheduler.class':
        'org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler',
        'yarn.scheduler.capacity.allocation.file': 'capacity-scheduler.xml',
        'yarn.scheduler.maximum-allocation-vcores': str(ncores),
        'yarn.scheduler.minimum-allocation-vcores': '1',
        'yarn.nodemanager.resource.cpu-vcores': str(ncores),
    }
    return dflts
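The helpers this example leans on, memory_defaults and round_mb, are not shown on this page. Below is a rough sketch of what they plausibly look like; the reservation and container-sizing heuristic is an assumption (loosely modelled on the Hortonworks sizing guide and tuned to reproduce the values asserted in the tests further down), so the real implementations may well differ:

from collections import namedtuple

MemDefaults = namedtuple('MemDefaults',
                         ['available_memory', 'num_containers', 'ram_per_container'])

def round_mb(num_bytes):
    # Round a byte count down to a whole number of GiB, expressed in MB
    # (behaviour inferred from the test_round_mb assertions below).
    gib = 1024 ** 3
    return (int(num_bytes) // gib) * 1024

def memory_defaults(node_info):
    # Hypothetical heuristic: reserve RAM for the OS and daemons, then pack
    # containers of at least 2GiB each, capped at two containers per core.
    gib = 1024 ** 3
    total = node_info['memory']['meminfo']['memtotal']
    reserved = min(total // 8, 8 * gib)
    available = total - reserved
    num_containers = max(1, min(2 * node_info['cores'], available // (2 * gib)))
    return MemDefaults(available, num_containers, available // num_containers)

On the 24-core, 64GiB node used in the tests this gives 56GiB of available memory split into 28 containers of 2GiB each, matching the 56G maximum and 2G minimum allocations asserted in test_yarn_site_xml_defaults.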
Example #2
def mapred_site_xml_defaults(workdir, node_info):
    '''
    Default entries for the mapred-site.xml config file.
    '''
    mem_dflts = memory_defaults(node_info)

    java_map_mem = format_memory(0.8 * mem_dflts.ram_per_container,
                                 round_val=True)
    java_reduce_mem = format_memory(0.8 * 2 * mem_dflts.ram_per_container,
                                    round_val=True)
    # In my tests, Yarn gets shirty if I try to run a job and these values are
    # set to more than 8g:
    map_memory = round_mb(mem_dflts.ram_per_container)
    reduce_memory = round_mb(2 * mem_dflts.ram_per_container)
    dflts = {
        'mapreduce.framework.name': 'yarn',
        'mapreduce.map.java.opts': '-Xmx%s' % java_map_mem,
        'mapreduce.map.memory.mb': map_memory,
        'mapreduce.reduce.java.opts': '-Xmx%s' % java_reduce_mem,
        'mapreduce.reduce.memory.mb': reduce_memory,
        # io.sort.mb can't be > 2047mb
        'mapreduce.task.io.sort.mb': min(int(0.4 * map_memory), 2047),
        'yarn.app.mapreduce.am.staging-dir': '$localworkdir/tmp/hadoop-yarn/staging',
    }
    return dflts
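format_memory is likewise not shown. Judging by its use in the -Xmx flags above, it renders a byte count as a JVM-style heap size string; a minimal sketch, assuming a lowercase 'm' suffix (the real unit handling may differ):

def format_memory(num_bytes, round_val=False):
    # Hypothetical: render a byte count as a JVM heap size string,
    # e.g. format_memory(0.8 * 2 * 1024**3, round_val=True) -> '1638m'
    mib = 1024 ** 2
    val = num_bytes / mib
    return '%dm' % val if round_val else '%sm' % val

With a 2GiB ram_per_container, as on the 24-core test node below, this puts mapreduce.map.memory.mb at 2048, mapreduce.reduce.memory.mb at 4096 and mapreduce.task.io.sort.mb at min(int(0.4 * 2048), 2047) = 819.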
Example #3
 def test_mapred_site_xml_defaults(self):
     node = dict(fqdn='hosty.domain.be', network='ib0', pid=1234,
             cores=24, totalcores=24, usablecores=range(24), num_nodes=1,
             memory=dict(meminfo=dict(memtotal=68719476736), ulimit='unlimited'))
     d = hca.mapred_site_xml_defaults('/', node)
     self.assertEqual(len(d), 7)
     # Capped at 8g
     self.assertEqual(d['mapreduce.map.memory.mb'], hcc.round_mb(hcc.parse_memory('2G')))
     self.assertEqual(d['mapreduce.reduce.memory.mb'], hcc.round_mb(hcc.parse_memory('4G')))
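The hcc helpers used in these assertions are also absent from the page. parse_memory presumably turns strings like '2G' into byte counts; a hedged sketch (the accepted suffixes are an assumption):

def parse_memory(txt):
    # Hypothetical: '2G' -> 2 * 1024**3, '512M' -> 512 * 1024**2, etc.
    units = {'K': 1024, 'M': 1024 ** 2, 'G': 1024 ** 3, 'T': 1024 ** 4}
    txt = txt.strip().upper()
    if txt and txt[-1] in units:
        return int(txt[:-1]) * units[txt[-1]]
    return int(txt)

Under this reading, hcc.round_mb(hcc.parse_memory('2G')) works out to 2048, which is what the map-memory assertion above expects.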
Example #4
 def test_mapred_site_xml_defaults(self):
     '''Test mapred defaults; note: only using 4 from 24 cores.'''
     node = dict(fqdn='hosty.domain.be', network='ib0', pid=1234,
             cores=4, totalcores=24, usablecores=[0, 1, 2, 3], num_nodes=1,
             memory=dict(meminfo=dict(memtotal=68719476736), ulimit='unlimited'))
     d = hca.mapred_site_xml_defaults('/', node)
     self.assertEqual(len(d), 9)
     self.assertEqual(d['hadoop.ln.cmd'], '/bin/ln')
     self.assertEqual(d['lustre.dir'], '$workdir')
     self.assertEqual(d['mapreduce.map.memory.mb'], hcc.round_mb(hcc.parse_memory('1G')))
     self.assertEqual(d['mapreduce.reduce.memory.mb'], hcc.round_mb(hcc.parse_memory('2G')))
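This variant asserts nine entries rather than seven because it appears to come from a Lustre-enabled build of mapred_site_xml_defaults, which adds the hadoop.ln.cmd and lustre.dir keys on top of the defaults shown in Example #2.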
Example #5
def spark_defaults(_, node_info):
    '''
    Generate spark defaults so spark uses all the resources that yarn is able to
    provide.

    Defaults are based on Cloudera's recommendations here:
    http://blog.cloudera.com/blog/2015/03/how-to-tune-your-apache-spark-jobs-part-2/

    We use 2 cores per executor based on discussion found here:
    http://stackoverflow.com/questions/24622108/apache-spark-the-number-of-cores-vs-the-number-of-executors
    '''
    memory_defaults = hcah.memory_defaults(node_info)
    num_nodes = node_info['num_nodes']
    cores_per_executor = min(2, node_info['cores'])
    # Integer division, so the executor count stays an int under Python 3
    instances_per_node = node_info['cores'] // cores_per_executor
    # -1 because we want one less executor instance on the node running the
    # application master. If we have only one node then we don't expect the
    # driver to be very busy, so we can give the executors more memory.
    instances = max((num_nodes * instances_per_node) - 1, 1)
    memory = hcac.round_mb(memory_defaults.available_memory / instances_per_node)
    dflts = {
        'spark.executor.cores': cores_per_executor,
        'spark.executor.instances': instances,
        'spark.executor.memory': str(memory) + 'M',
        'spark.local.dir': tempfile.gettempdir(),
    }
    return dflts
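To make the sizing arithmetic concrete, here is how spark_defaults works out on a single 24-core node, assuming 56GiB of memory available to YARN (the figure implied by the yarn test below):

# cores_per_executor = min(2, 24)            -> 2
# instances_per_node = 24 // 2               -> 12
# instances          = max(1 * 12 - 1, 1)    -> 11
# memory             = round_mb(56GiB / 12)  -> 4096, i.e. spark.executor.memory = '4096M'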
Example #6
 def test_yarn_site_xml_defaults(self):
     node = dict(fqdn='hosty.domain.be', network='ib0', pid=1234,
             cores=24, totalcores=24, usablecores=range(24), num_nodes=1,
             memory=dict(meminfo=dict(memtotal=68719476736), ulimit='unlimited'))
     d = hca.yarn_site_xml_defaults('/', node)
     self.assertEqual(len(d), 16)
     self.assertEqual(d['yarn.nodemanager.resource.memory-mb'], hcc.round_mb(hcc.parse_memory('56G')))
     self.assertEqual(d['yarn.resourcemanager.webapp.address'], '$masterhostaddress:8088')
     self.assertEqual(d['yarn.resourcemanager.webapp.https.address'], '$masterhostaddress:8090')
     self.assertEqual(d['yarn.nodemanager.hostname'], '$dataname')
     self.assertEqual(d['yarn.nodemanager.webapp.address'], '$hostaddress:8042')
     self.assertEqual(d['yarn.scheduler.minimum-allocation-mb'], hcc.round_mb(hcc.parse_memory('2G')))
     self.assertEqual(d['yarn.scheduler.maximum-allocation-mb'], hcc.round_mb(hcc.parse_memory('56G')))
     self.assertEqual(d['yarn.scheduler.maximum-allocation-vcores'], '24')
     self.assertEqual(d['yarn.scheduler.minimum-allocation-vcores'], '1')
     self.assertEqual(d['yarn.nodemanager.resource.cpu-vcores'], '24')
Example #7
 def test_round_mb(self):
     mb = 1024
     self.assertEqual(hcc.round_mb(512 * (1024**3)), 512 * mb)
     self.assertEqual(hcc.round_mb(64 * (1024**3)), 64 * mb)
     self.assertEqual(hcc.round_mb(32 * (1024**3)), 32 * mb)
     self.assertEqual(hcc.round_mb(32 * (1024**3) + 100*(1024**2)), 32 * mb)
     self.assertEqual(hcc.round_mb(32 * (1024**3) - 100*(1024**2)), 32 * mb - 1024)
     self.assertEqual(hcc.round_mb(16 * (1024**3)), 16 * mb)
     self.assertEqual(hcc.round_mb(8 * (1024**3)), 8 * mb)
     self.assertEqual(hcc.round_mb(4 * (1024**3)), 4 * mb)
     self.assertEqual(hcc.round_mb(3 * (1024**3)), 3 * mb)
     self.assertEqual(hcc.round_mb(2 * (1024**3)), 2 * mb)
     self.assertEqual(hcc.round_mb(1 * (1024**3)), 1 * mb)
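Taken together, these assertions pin down the rounding rule: the byte count is floored to a whole number of GiB and returned in MB, i.e. roughly

round_mb = lambda num_bytes: (int(num_bytes) // 1024 ** 3) * 1024

which is the behaviour assumed in the sketch after Example #1.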