Example #1
def test_host_terminates_when_stopped():
  flags.non_flag_components()
  job = mock()
  connection = mock()
  spot_instance_request = mock()
  h = Host(job, connection, spot_instance_request)
  h._instance = mock()
  h._instance.state = 'running'
  h.run = lambda cmd: 0
  h.start()
  h.stop()
  h.join()
  verify(h._instance).terminate()
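
# A minimal standalone sketch of the mockito-python pattern used above:
# mock() builds a stub and verify() asserts a method was called. The
# names below are hypothetical stand-ins, not part of the Host codebase.
from mockito import mock, verify

def test_verify_sketch():
  instance = mock()
  instance.terminate()          # the code under test would trigger this call
  verify(instance).terminate()  # passes because terminate() was invoked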
Example #2
def run(mapper, reducer, input_map=None, output_map=None, compression=None, filter=None):
  flags.non_flag_components()

  if not mapper:
    mapper = IdentityMapper
  if not reducer:
    reducer = IdentityReducer
  if not input_map:
    input_map = flags.input_map
  if not output_map:
    output_map = flags.output_map

  input_source = input_tasks(input_map, flags.parallelism, filter)

  skip_reducer = (reducer == SkipReducer)

  if output_map.startswith(store.MongoStore.TAG):
    # TODO(Douwe): don't special-case Mongo; let the Writer class decide
    output_names = [output_map] * flags.parallelism
  else:
    output_names = shard_names(output_map)
  out_shard_num = len(output_names)
  
  if is_master():
    # No specific task assigned, but running in parallel: act as the master.
    master = Master(flags.parallelism, input_source, out_shard_num, skip_reducer)
    master.run()
  else:
    if flags.perform_task:
      perform_task = True
      task_phase, task_ids = flags.perform_task.split(':', 1)
      task_ids = [int(id) for id in task_ids.split(':')]
    else:
      perform_task = False
      task_phase = None
      task_ids = range(input_source.num_tasks())
  
    if not perform_task:
      print 'Started map phase'
    
    if not perform_task or task_phase == 'map':
      mapper_instance = instantiate_mapper(mapper, task_ids, input_source.num_tasks())
      for task_id in task_ids:
        markerfn = tmpfile('MAP_%d_DONE' % task_id)
        if not os.path.isfile(markerfn):
          if skip_reducer:
            if output_map.startswith(store.MongoStore.TAG):
              shard_writer = get_writer_from_spec(output_names[0])
              mapper_instance.attach_writer(shard_writer)
              out_shards = [shard_writer]
            else:
              # We skip the reducer, so to make sure the mapper only writes
              # to its own shard, leave every other slot as None.
              if out_shard_num != input_source.num_tasks():
                raise StandardError('Need the same input and output sharding when skipping the reducer.')
              out_shards = [None] * out_shard_num
              shard_writer = get_writer_from_spec(output_names[task_id])
              mapper_instance.attach_writer(shard_writer)
              out_shards[task_id] = shard_writer
          else:
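            # Intermediate files are named 'inter-<map task>-<out shard>':
            # the first %04d is filled with this map task's id now, and the
            # escaped %%04d is filled with each output shard index below.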
            base_out = tmpfile('inter-%04d-%%04d' % task_id)
            out_shards = [store.MultiStore(base_out % out_idx, mode='w', compression=compression)
                          for out_idx in range(out_shard_num)]
          run_maptask(mapper_instance, input_source.iterate_key_values(task_id, flags.input_debug_key),
                      out_shards, markerfn, task_id)

    if (not perform_task or task_phase == 'reduce') and not skip_reducer:
      for task_id, ds_out_name in enumerate(output_names):
        if perform_task and task_id not in task_ids:
          continue
        markerfn = tmpfile('REDUCE_%d_DONE' % task_id)
        if not os.path.isfile(markerfn):
          ds_out = get_writer_from_spec(ds_out_name)
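          # Mirror of the map-phase naming: fix this reduce task's output
          # shard index now and fill in each map task's id when reading.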
          base_in = tmpfile('inter-%%04d-%04d' % task_id)
          in_shards = [store.MultiStore(base_in % in_task_id) for in_task_id in range(input_source.num_tasks())]
          keys = set()
          for in_shard in in_shards:
            keys.update(in_shard.keys())
          keys = sorted(keys)
          run_reducetask(keys, in_shards, ds_out, reducer, markerfn, task_id, len(output_names))

  if flags.remove_state:
    for fn in os.listdir(flags.work_dir):
      if fn.startswith(flags.StateFp()):
        os.remove(os.path.join(flags.work_dir, fn))

  print count_line()
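
# A hedged usage sketch for run() above. WordCountMapper, its map()
# interface, and the shard specs are assumptions for illustration only;
# passing reducer=None falls back to IdentityReducer as shown in run().
class WordCountMapper(object):
  def map(self, key, value):
    for word in value.split():
      yield word, 1

run(WordCountMapper, None, input_map='docs@8', output_map='counts@8')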
Example #3
def run_pipeline(mapper, input_map=None, output_map=None, previous_mapper=None):
  """Use this when you don't need the reducer and the input and output share the same keys."""
  flags.non_flag_components()
  if not input_map:
    input_map = flags.input_map
  if not output_map:
    output_map = flags.output_map
  input_names = input_shard_names(input_map)
  in_shard_num = len(input_names)
  output_names = shard_names(output_map)
  out_shard_num = len(output_names)

  if flags.use_previous:
    print 'using previous'
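    # Rotate the existing output shards aside as '.old' files; if a full
    # previous run is present, open it as a cache of prior results.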
    if '@' in output_map:
      prev_map = output_map.replace('@', '.old@')
    else:
      prev_map = output_map + '.old'
    prev_shard_names = shard_names(prev_map)
    exists = True
    for prev, out in zip(prev_shard_names, output_names):
      for ext in ('dst', 'idx'):
        fn_old = '%s.%s' % (prev, ext)
        fn_new = '%s.%s' % (out, ext)
        if os.path.isfile(fn_old):
          os.remove(fn_old)
        if os.path.isfile(fn_new):
          os.rename(fn_new, fn_old)
        else:
          exists = False
    if exists:
      previous = store.Store(prev_map)
    else:
      previous = {}
      prev_shard_names = [None] * out_shard_num
  else:
    prev_shard_names = [None] * out_shard_num
    previous = {}
  
  if flags.run_parallel and out_shard_num == in_shard_num:
    import multiprocessing
    pool = multiprocessing.Pool(2)
    work = zip(range(in_shard_num),
               [mapper] * in_shard_num,
               input_names,
               output_names,
               prev_shard_names)
    pool.map(RunPipelineTask, work)
    return
    
  input_map = store.Store(input_map)
  output_map = store.Store(output_map, mode='w')
  count = 0
  cached = 0
  total = len(input_map.keys())
  for key, value in input_map.items():
    if count % 100 == 0:
      perc = int(100 * float(count) / total)
      print perc, '%', key, cached
    count += 1
    if key in previous:
      if previous_mapper:
        output_map[key] = previous_mapper(key, value, previous[key])
      else:
        output_map[key] = previous[key]
      cached += 1
    else:
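      # The mapper is expected to yield a single (key, value) pair per
      # input record; stop after the first one.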
      for k, v in mapper(key, value):
        output_map[k] = v
        break
  input_map.close()
  output_map.close()
  if flags.use_previous and previous:
    previous.close()
    # Clean up the '.old' cache shards now that the new output is written.
    for shard in shard_names(prev_map):
      for ext in ('dst', 'idx'):
        fn_prev = '%s.%s' % (shard, ext)
        if os.path.isfile(fn_prev):
          os.remove(fn_prev)
  print 'alldone'
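
# Hypothetical usage of run_pipeline(): the mapper is called directly as
# mapper(key, value) and, per the loop above, only its first yielded pair
# is written. The names and shard specs below are placeholders.
def upcase_mapper(key, value):
  yield key, value.upper()

run_pipeline(upcase_mapper, input_map='docs@8', output_map='docs_upper@8')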