def _serialize(self, file_path):
    """
    Serializes the Job to the provided file_path.

    Special-case sequential job definitions (jobs where all the tasks are to
    be executed sequentially and there are no parallel executions). In those
    cases, instead of serializing the tasks into the appropriate folders,
    just pickle the entire Job and write that instead. This way, when
    executing, run_job will be called if a Job is found, vs. run_task.
    """
    aggregate_file = _zipfile.ZipFile(file_path, 'w')
    base_path = _tempfile.mkdtemp()

    # TODO: serialize Job attributes (like .name), or else they won't be in
    # the deserialized object.

    # Identify if there is only one task per step, and special-case that.
    if all([len(step) == 1 for step in self._sequence]):
        # One task per step, so handle with serial execution.
        __LOGGER__.debug("Special casing sequential execution")
        step_idx = 0
        job_idx = 0
        step_dir = _os.path.join(base_path, str(step_idx))
        _os.mkdir(step_dir)
        job_file_path = _os.path.join(step_dir, str(job_idx))
        with open(job_file_path, 'w') as job_file:
            _cloudpickle.dump(self, job_file)
        relative_path = _os.path.join('steps', str(step_idx), str(job_idx))
        aggregate_file.write(job_file_path, relative_path)
    else:
        for step_idx, cur_step in enumerate(self._sequence):
            step_dir = _os.path.join(base_path, str(step_idx))
            _os.mkdir(step_dir)
            for task_idx, cur_task in enumerate(cur_step):
                task_file_path = _os.path.join(step_dir, str(task_idx))
                with open(task_file_path, 'w') as task_file:
                    _cloudpickle.dump(cur_task, task_file)
                relative_path = _os.path.join('steps', str(step_idx), str(task_idx))
                aggregate_file.write(task_file_path, relative_path)

    aggregate_file.close()

    # Delete the tempdir created above.
    _shutil.rmtree(base_path)
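
# --- Hedged sketch (not part of the original source): a minimal reader for the
# archive produced by _serialize above.  The function name _deserialize_archive
# is hypothetical; the real execution-side code (run_job / run_task, mentioned
# in the docstring) may read the archive differently.
import os
import pickle
import shutil
import tempfile
import zipfile


def _deserialize_archive(file_path, job_cls):
    """Read back the 'steps/<step_idx>/<task_idx>' entries written by
    _serialize.  job_cls is the Job class, passed in to keep this sketch
    import-free.  Returns the whole Job when the sequential special case was
    taken, otherwise a list of steps, each a list of unpickled tasks.
    """
    extract_dir = tempfile.mkdtemp()
    with zipfile.ZipFile(file_path, 'r') as aggregate_file:
        aggregate_file.extractall(extract_dir)

    steps = []
    steps_dir = os.path.join(extract_dir, 'steps')
    for step_name in sorted(os.listdir(steps_dir), key=int):
        step_dir = os.path.join(steps_dir, step_name)
        entries = []
        for entry_name in sorted(os.listdir(step_dir), key=int):
            # cloudpickle writes a standard pickle stream, so plain pickle can
            # read it back as long as the Job/Task classes are importable.
            with open(os.path.join(step_dir, entry_name), 'rb') as f:
                entries.append(pickle.load(f))
        steps.append(entries)

    shutil.rmtree(extract_dir)

    # _serialize's special case writes exactly one entry (steps/0/0) holding
    # the entire pickled Job; the caller would dispatch to run_job for a Job
    # and run_task for individual tasks.
    if len(steps) == 1 and len(steps[0]) == 1 and isinstance(steps[0][0], job_cls):
        return steps[0][0]
    return steps
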
def save(self, obj, typename=None):
    """
    Save the item to this session.

    Parameters
    ----------
    obj : object
        Object to save to this session.

    typename : str, optional
        Specify the type of this object (Task, Environment, Job,
        PredictiveService).
    """
    if isinstance(obj, str):
        if obj in self._objects:
            obj = self._objects[obj]
        else:
            raise Exception("Unable to find artifact to save.")

    if not Session._is_known_type(obj):
        raise Exception("Trying to save an unknown type")

    savedir = self.location

    if typename is None:
        if isinstance(obj, _job.Job):
            typename = 'Job'
        elif isinstance(obj, _environment.Environment):
            typename = 'Environment'
        elif isinstance(obj, _artifact.Task):
            typename = 'Task'
        elif isinstance(obj, _predictive_service.PredictiveService):
            typename = 'PredictiveService'
        else:
            __LOGGER__.error("Trying to save an unrecognized item of type: %s, saving failed." % type(obj))
            return

    # Overwrite the obj with a PredictiveServiceEndpoint to be pickled and saved.
    if isinstance(obj, _predictive_service.PredictiveService) and typename == 'PredictiveService':
        obj = _predictive_service_endpoint.PredictiveServiceEndpoint(
            obj.name, obj._s3_state_path, obj.aws_credentials)

    filename = self._get_filename_from_name(obj.name) + "." + typename
    try:
        with __builtin__.open(str(_os.path.join(savedir, filename)), "w") as f:
            _cloudpickle.dump(obj, f)
        if hasattr(obj, '_modified_since_last_saved'):
            obj._modified_since_last_saved = False
    except Exception as e:
        __LOGGER__.warning("Error saving %s: '%s'" % (typename, e))
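
# --- Hedged sketch (not part of the original source): a minimal counterpart to
# save() that rehydrates one artifact from the session directory.  The
# '<name>.<Typename>' file layout is taken from save() above; the function name
# and error handling are assumptions, and the real Session loader may differ.
import os
import pickle


def _load_artifact(savedir, filename):
    """Unpickle '<savedir>/<filename>', where filename ends in '.Task',
    '.Environment', '.Job', or '.PredictiveService'."""
    path = os.path.join(savedir, filename)
    # cloudpickle.dump writes a standard pickle stream, so pickle.load can
    # read it back provided the artifact classes are importable.
    with open(path, 'rb') as f:
        obj = pickle.load(f)
    # A freshly loaded artifact is in sync with what is on disk.
    if hasattr(obj, '_modified_since_last_saved'):
        obj._modified_since_last_saved = False
    return obj
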
def _save_imp(self, po_path, dependency_path, aws_credentials):
    '''Save the predictive object to a directory

    The files for a predictive object are laid out the following way:

        po_path/definition/meta -- serialized JSON file about the predictive
            object, including: description, dependencies, etc.
        po_path/definition/definition -- cloudpickle-serialized
            PredictiveObject
        dependency_path -- all dependent GraphLab objects, each in its own
            directory:
            dependency_path/uri1/ -- serialized GraphLab object with uri1
            dependency_path/uri2/ -- serialized GraphLab object with uri2
    '''
    fu.create_directory(po_path)

    describe = {
        'description': self.description,
        'dependencies': {},
        'schema_version': self.schema_version
    }

    for (uri, gl_obj) in self.dependencies.iteritems():
        # If it isn't already saved, save it.
        temp_path = None
        try:
            if not fu.is_path(gl_obj):
                obj_type = self._get_graphlab_object_type(gl_obj)
                temp_path = tempfile.mkdtemp()
                __logger__.info("Saving dependent GraphLab %s (%s) locally to '%s' " % (obj_type, uri, temp_path))
                gl_obj.save(temp_path)
                gl_obj = temp_path
            else:
                obj_type = get_graphlab_object_type(gl_obj)

            # Copy the saved object without loading it.
            save_path = os.path.join(dependency_path, uri)
            __logger__.info("Copying dependent GraphLab %s(%s) from '%s' to '%s' " % (obj_type, uri, gl_obj, save_path))
            if fu.is_s3_path(gl_obj) and fu.is_s3_path(save_path):
                fu.intra_s3_copy_model(gl_obj, save_path, aws_credentials)
            elif fu.is_local_path(gl_obj) and fu.is_s3_path(save_path):
                fu.s3_copy_model(gl_obj, save_path, aws_credentials)
            elif fu.is_local_path(gl_obj) and fu.is_local_path(save_path):
                # Useful for unit tests
                shutil.copytree(gl_obj, save_path)
            else:
                raise RuntimeError("Copy GraphLab object from S3 to local path is not supported. "
                                   "GraphLab object path: %s, save path: %s" % (gl_obj, save_path))
        finally:
            if temp_path:
                shutil.rmtree(temp_path)

        # Add to the global describe dictionary.
        describe['dependencies'][uri] = {
            'path': save_path,
            'type': obj_type
        }

    # Persist the global description.
    describe_path = self._get_describe_path(po_path)
    self._save_object(describe_path, describe)

    # Persist the definition of myself.
    definition_path = self._get_definition_path(po_path)
    try:
        with open(definition_path, 'wb') as f:
            _cloudpickle.dump(self, f)
    except Exception as e:
        __logger__.error('Unable to save object: %s' % (e.message))
        raise e
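
# --- Hedged sketch (not part of the original source): a loader that mirrors
# the 'definition/meta' and 'definition/definition' layout documented in
# _save_imp's docstring.  It assumes the meta file was written as JSON by
# _save_object (as that docstring states); the function name and return shape
# are assumptions, and the real loader may differ.
import json
import os
import pickle


def _load_definition(po_path):
    """Rehydrate a predictive object saved by _save_imp.

    Reads po_path/definition/meta (description, dependencies, schema_version)
    and po_path/definition/definition (the cloudpickled PredictiveObject).
    Dependent GraphLab objects stay on disk at the paths recorded in
    describe['dependencies'] and can be loaded separately.
    """
    with open(os.path.join(po_path, 'definition', 'meta')) as f:
        describe = json.load(f)
    # cloudpickle output is a standard pickle stream, so plain pickle reads it.
    with open(os.path.join(po_path, 'definition', 'definition'), 'rb') as f:
        predictive_object = pickle.load(f)
    return predictive_object, describe
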