Example #1
def start_distributed_task(job_type,
                           task_index,
                           evaluation_worker=False):
    # ps_hosts, worker_hosts, total_tasks, args and graph_manager are taken
    # from the enclosing scope.
    task_parameters = DistributedTaskParameters(
        framework_type="tensorflow",  # TODO: tensorflow shouldn't be hardcoded
        parameters_server_hosts=ps_hosts,
        worker_hosts=worker_hosts,
        job_type=job_type,
        task_index=task_index,
        evaluate_only=evaluation_worker,
        use_cpu=args.use_cpu,
        num_tasks=total_tasks,  # training tasks + 1 evaluation task
        num_training_tasks=args.num_workers,
        experiment_path=args.experiment_path,
        shared_memory_scratchpad=None,
        # each worker gets a different seed
        seed=args.seed + task_index if args.seed is not None else None)
    task_parameters.__dict__ = add_items_to_dict(
        task_parameters.__dict__, args.__dict__)
    # we assume that only the evaluation workers are rendering
    graph_manager.visualization_parameters.render = args.render and evaluation_worker
    start_graph(graph_manager, task_parameters)
    # p = Process(target=start_graph, args=(graph_manager, task_parameters))
    # p.start()
    return
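
Both variants rely on names defined in an enclosing scope (args, ps_hosts, worker_hosts, total_tasks, graph_manager), so they are not runnable on their own. Because this first variant calls start_graph directly and blocks, a caller would typically wrap each invocation in its own process, as the commented-out lines hint. The sketch below is a hypothetical driver, not taken from the original source; the "ps"/"worker" job-type strings and the index arithmetic are assumptions based on TensorFlow's conventional distributed job names.

    from multiprocessing import Process

    def launch_all_tasks(num_workers):
        # Hypothetical helper: assumes start_distributed_task (Example #1) and
        # ps_hosts are available in the surrounding scope.
        processes = []
        # one parameter-server task per entry in ps_hosts (assumption)
        for ps_index in range(len(ps_hosts)):
            processes.append(Process(target=start_distributed_task,
                                     args=("ps", ps_index)))
        # one training worker per index
        for task_index in range(num_workers):
            processes.append(Process(target=start_distributed_task,
                                     args=("worker", task_index)))
        # a single evaluation worker takes the next task index
        processes.append(Process(target=start_distributed_task,
                                 args=("worker", num_workers, True)))
        for p in processes:
            p.start()
        return processes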
Example #2
def start_distributed_task(
        job_type,
        task_index,
        evaluation_worker=False,
        shared_memory_scratchpad=shared_memory_scratchpad):
    task_parameters = DistributedTaskParameters(
        framework_type=args.framework,
        parameters_server_hosts=ps_hosts,
        worker_hosts=worker_hosts,
        job_type=job_type,
        task_index=task_index,
        # 0 value for evaluation worker as it should run infinitely
        evaluate_only=0 if evaluation_worker else None,
        use_cpu=args.use_cpu,
        num_tasks=total_tasks,  # training tasks + 1 evaluation task
        num_training_tasks=args.num_workers,
        experiment_path=args.experiment_path,
        shared_memory_scratchpad=shared_memory_scratchpad,
        # each worker gets a different seed
        seed=args.seed + task_index if args.seed is not None else None,
        checkpoint_save_secs=args.checkpoint_save_secs,
        # MonitoredTrainingSession only supports a dir
        checkpoint_restore_path=args.checkpoint_restore_dir,
        checkpoint_save_dir=args.checkpoint_save_dir,
        export_onnx_graph=args.export_onnx_graph,
        apply_stop_condition=args.apply_stop_condition)
    # we assume that only the evaluation workers are rendering
    graph_manager.visualization_parameters.render = args.render and evaluation_worker
    p = Process(target=start_graph,
                args=(graph_manager, task_parameters))
    # p.daemon = True
    p.start()
    return p
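
This second variant spawns and returns the multiprocessing.Process itself, so a caller only needs to collect the handles and wait on them. The lines below are a sketch under the same assumptions as above (the job-type strings and the surrounding variables are hypothetical, not from the original source):

    # Hypothetical driver for Example #2: assumes args and ps_hosts are in scope.
    processes = [start_distributed_task("ps", i) for i in range(len(ps_hosts))]
    processes += [start_distributed_task("worker", i) for i in range(args.num_workers)]
    # the evaluation worker reuses the next task index
    processes.append(start_distributed_task("worker", args.num_workers,
                                            evaluation_worker=True))
    for p in processes:
        p.join()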