def test_host_terminates_when_stopped():
    """Stopping a running host must terminate its cloud instance."""
    flags.non_flag_components()
    job = mock()
    connection = mock()
    spot_instance_request = mock()
    h = Host(job, connection, spot_instance_request)
    h._instance = mock()
    h._instance.state = 'running'
    h.run = lambda cmd: 0  # pretend every remote command succeeds
    h.start()
    h.stop()
    h.join()
    verify(h._instance).terminate()
def run(mapper, reducer, input_map=None, output_map=None, compression=None, filter=None):
    """Run a map/reduce job: map the input shards to intermediate shards, then
    reduce those into the output shards. Mapper and reducer default to the
    identity classes; input/output specs default to the corresponding flags."""
    flags.non_flag_components()
    if not mapper:
        mapper = IdentityMapper
    if not reducer:
        reducer = IdentityReducer
    if not input_map:
        input_map = flags.input_map
    if not output_map:
        output_map = flags.output_map
    input_source = input_tasks(input_map, flags.parallelism, filter)
    skip_reducer = (reducer == SkipReducer)
    if output_map.startswith(store.MongoStore.TAG):
        # TODO(Douwe): don't special-case Mongo but let the Writer class decide
        output_names = [output_map] * flags.parallelism
    else:
        output_names = shard_names(output_map)
    out_shard_num = len(output_names)
    if is_master():
        # No specific task to run, but parallel. We are the master.
        master = Master(flags.parallelism, input_source, out_shard_num, skip_reducer)
        master.run()
    else:
        if flags.perform_task:
            perform_task = True
            task_phase, task_ids = flags.perform_task.split(':', 1)
            task_ids = [int(task_id) for task_id in task_ids.split(':')]
        else:
            perform_task = False
            task_phase = None
            task_ids = range(input_source.num_tasks())
        if not perform_task:
            print 'Started map phase'
        if not perform_task or task_phase == 'map':
            mapper_instance = instantiate_mapper(mapper, task_ids, input_source.num_tasks())
            for task_id in task_ids:
                # Marker files let an interrupted job resume without redoing tasks.
                markerfn = tmpfile('MAP_%d_DONE' % task_id)
                if not os.path.isfile(markerfn):
                    if skip_reducer:
                        if output_map.startswith(store.MongoStore.TAG):
                            shard_writer = get_writer_from_spec(output_names[0])
                            mapper_instance.attach_writer(shard_writer)
                            out_shards = [shard_writer]
                        else:
                            # We skip the reducer. To make sure the mapper behaves,
                            # i.e. only writes to the corresponding shard, supply Nones.
                            if out_shard_num != input_source.num_tasks():
                                raise StandardError('Need the same in and output sharding when skipping reducer.')
                            out_shards = [None for out_idx in range(out_shard_num)]
                            shard_writer = get_writer_from_spec(output_names[task_id])
                            mapper_instance.attach_writer(shard_writer)
                            out_shards[task_id] = shard_writer
                    else:
                        # One intermediate store per (map task, output shard) pair.
                        base_out = tmpfile('inter-%04d-%%04d' % task_id)
                        out_shards = [store.MultiStore(base_out % out_idx, mode='w', compression=compression)
                                      for out_idx in range(out_shard_num)]
                    run_maptask(mapper_instance,
                                input_source.iterate_key_values(task_id, flags.input_debug_key),
                                out_shards, markerfn, task_id)
        if (not perform_task or task_phase == 'reduce') and not skip_reducer:
            for task_id, ds_out_name in enumerate(output_names):
                if perform_task and task_id not in task_ids:
                    continue
                markerfn = tmpfile('REDUCE_%d_DONE' % task_id)
                if not os.path.isfile(markerfn):
                    ds_out = get_writer_from_spec(ds_out_name)
                    # Collect this shard's slice of every map task's intermediate output.
                    base_in = tmpfile('inter-%%04d-%04d' % task_id)
                    in_shards = [store.MultiStore(base_in % in_task_id)
                                 for in_task_id in range(input_source.num_tasks())]
                    keys = set()
                    for in_shard in in_shards:
                        keys.update(set(in_shard.keys()))
                    keys = sorted(keys)
                    run_reducetask(keys, in_shards, ds_out, reducer, markerfn, task_id, len(output_names))
    if flags.remove_state:
        for fn in os.listdir(flags.work_dir):
            if fn.startswith(flags.StateFp()):
                os.remove(os.path.join(flags.work_dir, fn))
    print count_line()
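# A minimal usage sketch (not part of the framework): wiring a word-count style
# job into run(). The names below are hypothetical, and the assumptions that
# mappers expose map(key, value) and reducers expose reduce(key, values) as
# generators, plus the 'name@shards' spec format, are inferred from the
# surrounding code rather than confirmed by it.
class _ExampleWordCountMapper(object):
    def map(self, key, value):
        # Emit a count of 1 per word; the shuffle groups them by word.
        for word in value.split():
            yield word, 1


class _ExampleSumReducer(object):
    def reduce(self, key, values):
        yield key, sum(values)


def _example_wordcount_job():
    # Illustrative only; not executed on import.
    run(_ExampleWordCountMapper, _ExampleSumReducer,
        input_map='docs@16', output_map='counts@16')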
def run_pipeline(mapper, input_map=None, output_map=None, previous_mapper=None):
    """Use this when you don't need the reducer and input and output have the
    same keys. If previous_mapper is given, it is called as
    previous_mapper(key, value, previous_value) for keys cached from the
    previous run; otherwise the previous value is reused as-is."""
    flags.non_flag_components()
    if not input_map:
        input_map = flags.input_map
    if not output_map:
        output_map = flags.output_map
    input_names = input_shard_names(input_map)
    in_shard_num = len(input_names)
    output_names = shard_names(output_map)
    out_shard_num = len(output_names)
    if flags.use_previous:
        print 'using previous'
        if '@' in output_map:
            prev_map = output_map.replace('@', '.old@')
        else:
            prev_map = output_map + '.old'
        prev_shard_names = shard_names(prev_map)
        # Rotate the previous output into the '.old' shards so cached values
        # can be reused below.
        exists = True
        for prev, out in zip(prev_shard_names, output_names):
            for ext in ('dst', 'idx'):
                fn_old = '%s.%s' % (prev, ext)
                fn_new = '%s.%s' % (out, ext)
                if os.path.isfile(fn_old):
                    os.remove(fn_old)
                if os.path.isfile(fn_new):
                    os.rename(fn_new, fn_old)
                else:
                    exists = False
        if exists:
            previous = store.Store(prev_map)
        else:
            previous = {}
            prev_shard_names = [None] * out_shard_num
    else:
        prev_shard_names = [None] * out_shard_num
        previous = {}
    if flags.run_parallel and out_shard_num == in_shard_num:
        import multiprocessing
        pool = multiprocessing.Pool(2)
        work = zip(range(in_shard_num), [mapper] * in_shard_num,
                   input_names, output_names, prev_shard_names)
        pool.map(RunPipelineTask, work)
        return
    input_map = store.Store(input_map)
    output_map = store.Store(output_map, mode='w')
    count = 0
    cached = 0
    total = len(input_map.keys())
    for key, value in input_map.items():
        perc = int(100 * float(count) / total)
        if count % 100 == 0:
            print perc, '%', key, cached
        count += 1
        if key in previous:
            if previous_mapper:
                output_map[key] = previous_mapper(key, value, previous[key])
            else:
                output_map[key] = previous[key]
            cached += 1
        else:
            for k, v in mapper(key, value):
                output_map[k] = v
                # Only the first pair the mapper yields is written.
                break
    input_map.close()
    output_map.close()
    if flags.use_previous and previous:
        previous.close()
        # Clean up the rotated '.old' shards now that the new output is complete.
        for shard in shard_names(prev_map):
            for ext in ('dst', 'idx'):
                fn_prev = '%s.%s' % (shard, ext)
                if os.path.isfile(fn_prev):
                    os.remove(fn_prev)
    print 'alldone'
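# A minimal usage sketch for run_pipeline (hypothetical mapper). It relies on
# the loop above calling mapper(key, value) as a generator and, per the
# docstring, on input and output sharing keys; the 'name@shards' specs are
# illustrative assumptions.
def _example_uppercase_mapper(key, value):
    # Emits exactly one pair per input key, leaving the key unchanged.
    yield key, value.upper()


def _example_pipeline_job():
    # Illustrative only; not executed on import.
    run_pipeline(_example_uppercase_mapper,
                 input_map='docs@16', output_map='docs_upper@16')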