def test_pickle_vocab_vectors(en_vocab):
    vectors_length = en_vocab.vectors_length
    assert vectors_length != 0

    apples = en_vocab['apples']
    oranges = en_vocab['oranges']
    hippos = en_vocab['hippos']

    assert apples.similarity(oranges) > apples.similarity(hippos)

    apples.vector = hippos.vector

    assert apples.similarity(oranges) < apples.similarity(hippos)

    file_ = io.BytesIO()
    cloudpickle.dump(en_vocab, file_)
    file_.seek(0)
    loaded = pickle.load(file_)

    apples = loaded['apples']
    oranges = loaded['oranges']
    hippos = loaded['hippos']
    assert apples.similarity(oranges) < apples.similarity(hippos)
def _calc_T_inv(self, name, x, lambdify=True):
    """ Return the inverse transform matrix, which converts from
    world coordinates into the robot's end-effector reference frame

    name string: name of the joint or link, or end-effector
    x list: the [x,y,z] position of interest in "name"'s reference frame
    lambdify boolean: if True returns a function to calculate
                      the transform. If False returns the Sympy matrix
    """

    # check to see if we have our transformation saved in file
    if os.path.isfile('%s/%s.T_inv' % (self.config_folder, name)):
        T_inv = cloudpickle.load(open('%s/%s.T_inv' %
                                      (self.config_folder, name), 'rb'))
    else:
        T = self._calc_T(name=name)
        rotation_inv = T[:3, :3].T
        translation_inv = -rotation_inv * T[:3, 3]
        T_inv = rotation_inv.row_join(translation_inv).col_join(
            sp.Matrix([[0, 0, 0, 1]]))

        # save to file
        cloudpickle.dump(T_inv, open('%s/%s.T_inv' %
                                     (self.config_folder, name), 'wb'))

    if lambdify is False:
        return T_inv
    return sp.lambdify(self.q + self.x, T_inv)
def test_module_locals_behavior(self):
    # Makes sure that a local function defined in another module is
    # correctly serialized. This notably checks that the globals are
    # accessible and that there is no issue with the builtins (see #211)
    pickled_func_path = os.path.join(self.tmpdir, 'local_func_g.pkl')

    child_process_script = '''
    import pickle
    import gc
    with open("{pickled_func_path}", 'rb') as f:
        func = pickle.load(f)

    assert func(range(10)) == 45
    '''

    child_process_script = child_process_script.format(
        pickled_func_path=_escape(pickled_func_path))

    try:
        from .testutils import make_local_function

        g = make_local_function()
        with open(pickled_func_path, 'wb') as f:
            cloudpickle.dump(g, f, protocol=self.protocol)

        assert_run_python_script(textwrap.dedent(child_process_script))
    finally:
        os.unlink(pickled_func_path)
def test_pickle_english(EN):
    file_ = io.BytesIO()
    cloudpickle.dump(EN, file_)
    file_.seek(0)
    loaded = pickle.load(file_)
def test_pickle_vocab(en_vocab):
    file_ = io.BytesIO()
    cloudpickle.dump(en_vocab, file_)
    file_.seek(0)
    loaded = pickle.load(file_)
def test_load_namespace(self):
    obj = 1, 2, 3, 4
    bio = BytesIO()
    cloudpickle.dump(obj, bio)
    bio.seek(0)
    returned_obj = cloudpickle.load(bio)
    self.assertEqual(obj, returned_obj)
def save(self, directory: str):
    shutil.copyfile(self.model_file, os.path.join(directory, 'dan.pt'))
    shell(f'rm -f {self.model_file}')
    with open(os.path.join(directory, 'dan.pkl'), 'wb') as f:
        cloudpickle.dump({
            'page_field': self.page_field,
            'combined_text_field': self.text_field,
            'unigram_text_field': self.unigram_field,
            'bigram_text_field': self.bigram_field,
            'trigram_text_field': self.trigram_field,
            'combined_ngrams': self.combined_ngrams,
            'unigrams': self.unigrams,
            'bigrams': self.bigrams,
            'trigrams': self.trigrams,
            'combined_max_vocab_size': self.combined_max_vocab_size,
            'unigram_max_vocab_size': self.unigram_max_vocab_size,
            'bigram_max_vocab_size': self.bigram_max_vocab_size,
            'trigram_max_vocab_size': self.trigram_max_vocab_size,
            'qanta_id_field': self.qanta_id_field,
            'n_classes': self.n_classes,
            'gradient_clip': self.gradient_clip,
            'n_hidden_units': self.n_hidden_units,
            'n_hidden_layers': self.n_hidden_layers,
            'nn_dropout': self.nn_dropout,
            'batch_size': self.batch_size,
            'use_wiki': self.use_wiki,
            'n_wiki_sentences': self.n_wiki_sentences,
            'wiki_title_replace_token': self.wiki_title_replace_token,
            'lowercase': self.lowercase,
            'pooling': self.pooling,
            'random_seed': self.random_seed,
            'config_num': self.config_num
        }, f)
def _calc_Tx(self, name, x, lambdify=True):
    """ Uses Sympy to transform x from the reference frame of a joint
    or link to the origin (world) coordinates.

    name string: name of the joint or link, or end-effector
    x list: the [x,y,z] position of interest in "name"'s reference frame
    lambdify boolean: if True returns a function to calculate
                      the transform. If False returns the Sympy matrix
    """

    # check to see if we have our transformation saved in file
    if os.path.isfile('%s/%s.T' % (self.config_folder, name)):
        Tx = cloudpickle.load(open('%s/%s.T' %
                                   (self.config_folder, name), 'rb'))
    else:
        T = self._calc_T(name=name)
        # transform x into world coordinates
        Tx = T * sp.Matrix(self.x + [1])

        # save to file
        cloudpickle.dump(Tx, open('%s/%s.T' %
                                  (self.config_folder, name), 'wb'))

    if lambdify is False:
        return Tx
    return sp.lambdify(self.q + self.x, Tx)
def _calc_Mq_g(self, lambdify=True):
    """ Uses Sympy to generate the force of gravity in
    joint space for the ur5

    lambdify boolean: if True returns a function to calculate
                      the Jacobian. If False returns the Sympy matrix
    """

    # check to see if we have our gravity term saved in file
    if os.path.isfile('%s/Mq_g' % self.config_folder):
        Mq_g = cloudpickle.load(open('%s/Mq_g' % self.config_folder, 'rb'))
    else:
        # get the Jacobians for each link's COM
        J = [self._calc_J('link%s' % ii, x=[0, 0, 0], lambdify=False)
             for ii in range(self.num_links)]

        # transform each inertia matrix into joint space and
        # sum together the effects of arm segments' inertia on each motor
        Mq_g = sp.zeros(self.num_joints, 1)
        for ii in range(self.num_joints):
            Mq_g += J[ii].T * self._M[ii] * self.gravity
        Mq_g = sp.Matrix(Mq_g)

        # save to file
        cloudpickle.dump(Mq_g, open('%s/Mq_g' % self.config_folder, 'wb'))

    if lambdify is False:
        return Mq_g
    return sp.lambdify(self.q + self.x, Mq_g)
def test_pickle(EN):
    file_ = io.BytesIO()
    cloudpickle.dump(EN.parser, file_)
    file_.seek(0)
    loaded = pickle.load(file_)
def pycloud_pickle(file_name, obj):
    # type: (Text, Any) -> None
    """Pickle an object to a file using cloudpickle."""
    import cloudpickle

    with io.open(file_name, 'wb') as f:
        cloudpickle.dump(obj, f)
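# A minimal read-side counterpart to pycloud_pickle() above. The function name and
# signature are assumptions for illustration; objects written with cloudpickle can
# be loaded back with the standard pickle module.
def pycloud_unpickle(file_name):
    # type: (Text) -> Any
    """Load an object pickled to a file with cloudpickle."""
    import pickle

    with open(file_name, 'rb') as f:
        return pickle.load(f)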
def _save_cache_to_file(self, cache, file_name):
    self._logger.debug(
        'Saving cache with %d entries to %s' % (len(cache), file_name))
    deterministic_cache = DeterministicCache(
        cache, self._cache_valid_for_turns)
    with open(file_name, 'wb') as io:
        pickle.dump(deterministic_cache, io)
    return True
def send_result(match, result):
    """ Send a match result to the webserver """
    filename = config.temp_dir + match.uuid.hex + "-match-result.txt"
    f = open(filename, 'wb')
    cloudpickle.dump(result, f)
    f.close()
    send_file_webserver_ready(filename, config.webserver_results_path)
    subprocess.call(["rm", filename])
def get_dataset(path_to_data='data/tokenized/'):
    # our input data is going to be .txt files in a folder that are formatted as follows:
    # each line is a new token (word) separated from a class label with a tab character.
    # our preprocessing includes converting to lowercase, splitting into characters, and repeating
    # the label for each character. Because punctuation counts as a word, we are doing special
    # rules with adding spaces around punctuation tokens to build a more accurate language model
    class StringProcessor:
        """
        This is a helper class (normally we would just do functions for preprocessing) to preprocess
        our text files (line by line) into the appropriate input and target data.
        The class is used because we needed to keep track of state when processing line by line.
        """
        def __init__(self):
            self.previous_label = ''
            self.space_before_punct = ['(', '``', '[', '{', '$', '#', '&']
            self.space_after_punct = ['&']
            self.previous_char = ''

        def process_line(self, line):
            chars, label = line.split('\t', 1)
            chars = chars.lower()
            label = label.rstrip()
            labels = [label] * len(chars)

            if (not chars[0] in string.punctuation or chars[0] in self.space_before_punct) and \
                    (not self.previous_char in self.space_before_punct or
                     self.previous_char in self.space_after_punct):
                chars = ' ' + chars
                if label == self.previous_label:
                    labels = [label] + labels
                else:
                    labels = ['O'] + labels

            self.previous_label = label
            self.previous_char = chars[-1]

            return chars, labels

        def get_inputs(self, line):
            return self.process_line(line)[0]

        def get_labels(self, line):
            return self.process_line(line)[1]

    # now that we defined our preprocessor, create a new TextDataset (works over files)
    # a TextDataset is an OpenDeep class that creates one-hot encodings of inputs and outputs automatically
    # and keeps them in vocab and entity_vocab dictionaries.
    processor = StringProcessor()
    dataset = TextDataset(path=path_to_data,
                          inputs_preprocess=lambda line: processor.get_inputs(line),
                          targets_preprocess=lambda line: processor.get_labels(line),
                          level="char", sequence_length=120)

    # save the computed dictionaries to use for converting inputs and outputs from running the model.
    with open('vocab.pkl', 'wb') as f:
        pickle.dump(dataset.vocab, f, protocol=pickle.HIGHEST_PROTOCOL)
    with open('entity_vocab.pkl', 'wb') as f:
        pickle.dump(dataset.label_vocab, f, protocol=pickle.HIGHEST_PROTOCOL)

    return dataset
def _write_partial_result(self, results, counter):
    filename = "pyqit-{}-{}-{}".format(
        int(time.mktime(self.time.timetuple())),
        self.id,
        counter)
    # cloudpickle produces bytes, so the file must be opened in binary mode
    with open(filename, "wb") as f:
        cloudpickle.dump(results, f)
    logging.info("Qit: Writing file {} ({} results)".format(
        filename, len(results)))
def save(self, directory: str) -> None:
    shutil.copyfile(self.model_file, os.path.join(directory, 'elmo.pt'))
    shell(f'rm -f {self.model_file}')
    with open(os.path.join(directory, 'elmo.pkl'), 'wb') as f:
        cloudpickle.dump({
            'class_to_i': self.class_to_i,
            'i_to_class': self.i_to_class,
            'config_num': self.config_num,
            'random_seed': self.random_seed,
            'dropout': self.dropout
        }, f)
def train(self, X, y, outpath=None, verbose=True):
    """ Train intent classifier for given training data

    :param X:
    :param y:
    :param outpath:
    :param verbose:
    :return:
    """
    def build(X, y=None):
        """ Inner build function that builds a single model.

        :param X:
        :param y:
        :return:
        """
        model = Pipeline([
            ('vectorizer', TfidfVectorizer(
                tokenizer=self.spacy_tokenizer,
                preprocessor=None,
                lowercase=False)),
            ('clf', SVC(C=1, kernel="linear",
                        probability=True, class_weight='balanced'))])

        from sklearn.model_selection import GridSearchCV

        items, counts = np.unique(y, return_counts=True)
        cv_splits = max(2, min(5, np.min(counts) // 5))

        Cs = [0.01, 0.25, 1, 2, 5, 10, 20, 100]
        param_grid = {'clf__C': Cs, 'clf__kernel': ["linear"]}
        grid_search = GridSearchCV(model,
                                   param_grid=param_grid,
                                   scoring='f1_weighted',
                                   cv=cv_splits,
                                   verbose=2,
                                   n_jobs=-1)
        grid_search.fit(X, y)

        return grid_search

    model = build(X, y)

    if outpath:
        with open(outpath, 'wb') as f:
            cloudpickle.dump(model, f)

        if verbose:
            print("Model written out to {}".format(outpath))

    return model
def persist(self, model_dir):
    # type: (Text) -> Dict[Text, Any]
    """Persist this model into the passed directory.

    Returns the metadata necessary to load the model again."""
    import cloudpickle

    classifier_file = os.path.join(model_dir, "ngram_featurizer.pkl")
    with io.open(classifier_file, 'wb') as f:
        cloudpickle.dump(self, f)

    return {
        "ngram_featurizer": "ngram_featurizer.pkl"
    }
def hpolib_wrapper(objfun, search, budget, result_on_terminate=0.0):
    # construct the typemap (dict mapping argument names to their types)
    # the typemap is used to reverse type erasure by serialization
    # we use the search space definition to acquire types
    hp2type = {'float': float,
               'switch': str}
    typemap = {k: hp2type[v.name] for k, v in search.items()}

    # pickles are binary data, so open the file in binary mode
    with open('/tmp/data.pkl', 'wb') as f:
        pickle.dump({"objfun": objfun, "typemap": typemap, "search": search}, f)
def wrapper(args):
    kwargs = {k: v for k, v in zip(search.keys(), args)}
    result = fun(**kwargs)
    # pickles are binary data, so read and write in binary mode
    try:
        with open('/tmp/results.pkl', 'rb') as f:
            data = pickle.load(f)
    except (IOError, EOFError):
        data = {"kwargs": [], "results": []}
    with open('/tmp/results.pkl', 'wb') as f:
        data["kwargs"].append(kwargs)
        data["results"].append(result)
        pickle.dump(data, f)
    return result
def _save_local(self, path):
    '''Save current query object to local path
    '''
    try:
        os.makedirs(path)
    except OSError as e:
        if e.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise

    with open(os.path.join(path, "pickle_archive"), "wb") as f:
        _cloudpickle.dump(self, f)
def send_compile_result(self, path, subm_id, game_id, submission):
    """ send results of compilation back to webserver """
    runfile = path + "run_command"
    if os.path.exists(runfile):
        # make sure they don't put malicious commands in here, by deleting the file if it exists
        subprocess.call(["chmod", "u+rw", runfile])
        subprocess.call(["rm", "-f", runfile])
    with open(runfile, 'wb') as fo:
        cloudpickle.dump(submission, fo)
    subprocess.call(["chmod", "u+r", path + "*"])
    zipfile = path + subm_id + "-compiled.zip"
    subprocess.call(["zip", "-r", zipfile, path, "-i", path + "*"])
    comms.send_file_datastore_ready(zipfile, config.datastore_submission_path)
    reportfile = path + subm_id + "-report.txt"
    self.save_report(submission, reportfile)
    comms.send_file_webserver_ready(reportfile, config.webserver_results_path)
    if submission.is_ready():
        self.send_matchmaker_compile_info(path, submission.username, game_id, subm_id)
def get_data(name, force=False, read=True):
    """
    remember that the stuff is here
    d._rightmove_data__request_object = _GetDataFromUrl()
    d._rightmove_data__url

    And that weblinks go into df.url. That is what we need to recurse into.
    """
    url = urls[name]
    filename = get_hash_pickle_name(name, url)
    if os.path.exists(filename) and not force:
        print("found {}".format(filename))
    else:
        rightmove_object = rightmove_data(url)
        pickle.dump(rightmove_object, open(filename, 'wb'))
    if read:
        print("reading {}".format(filename))
        return pickle.load(open(filename, 'rb'))
def save(self, path=None):
    """Save model to a pickle located at `path`"""
    if path is None:
        path = os.path.join(logger.get_dir(), "model.pkl")

    with tempfile.TemporaryDirectory() as td:
        save_state(os.path.join(td, "model"))
        arc_name = os.path.join(td, "packed.zip")
        with zipfile.ZipFile(arc_name, 'w') as zipf:
            for root, dirs, files in os.walk(td):
                for fname in files:
                    file_path = os.path.join(root, fname)
                    if file_path != arc_name:
                        zipf.write(file_path, os.path.relpath(file_path, td))
        with open(arc_name, "rb") as f:
            model_data = f.read()
    with open(path, "wb") as f:
        cloudpickle.dump((model_data, self._act_params), f)
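# Hedged sketch of the inverse of save() above: unpickle the (model_data,
# act_params) tuple, write the zip bytes back out, extract it, and restore the
# TensorFlow variables. load_state() is assumed to be the counterpart of the
# save_state() helper used in save(); adjust to the actual API.
def load(path):
    """Load a model written by save() above (sketch)."""
    import os
    import tempfile
    import zipfile
    import cloudpickle

    with open(path, "rb") as f:
        model_data, act_params = cloudpickle.load(f)
    with tempfile.TemporaryDirectory() as td:
        arc_path = os.path.join(td, "packed.zip")
        with open(arc_path, "wb") as f:
            f.write(model_data)
        with zipfile.ZipFile(arc_path) as zipf:
            zipf.extractall(td)
        load_state(os.path.join(td, "model"))  # assumed counterpart of save_state()
    return act_params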
def persist(cls, model_dir, **args):
    # type: (Text) -> Dict[Text, Any]
    """Persist this model into the passed directory as a pickle.

    Args:
        model_dir: model dir
        model_version (in args): version string used to prefix the file name

    Returns:
        The metadata necessary to load the model again.
    """
    import cloudpickle

    model_name = str(args["model_version"]) + "_" + cls.name + ".pickle"
    classifier_file = os.path.join(model_dir, model_name)
    with io.open(classifier_file, 'wb') as f:
        cloudpickle.dump(cls, f)
    return {
        "model_name" + "_" + cls.name: model_name
    }
def _calc_J(self, name, x, lambdify=True):
    """ Uses Sympy to generate the Jacobian for a joint or link

    name string: name of the joint or link, or end-effector
    lambdify boolean: if True returns a function to calculate
                      the Jacobian. If False returns the Sympy matrix
    """

    # check to see if we have our Jacobian saved in file
    if os.path.isfile('%s/%s.J' % (self.config_folder, name)):
        J = cloudpickle.load(open('%s/%s.J' %
                                  (self.config_folder, name), 'rb'))
    else:
        Tx = self._calc_Tx(name, x=x, lambdify=False)
        J = []
        # calculate derivative of (x,y,z) wrt to each joint
        for ii in range(self.num_joints):
            J.append([])
            J[ii].append(Tx[0].diff(self.q[ii]))  # dx/dq[ii]
            J[ii].append(Tx[1].diff(self.q[ii]))  # dy/dq[ii]
            J[ii].append(Tx[2].diff(self.q[ii]))  # dz/dq[ii]

        end_point = name.strip('link').strip('joint')
        if end_point != 'EE':
            end_point = min(int(end_point) + 1, self.num_joints)
            # add on the orientation information up to the last joint
            for ii in range(end_point):
                J[ii] = J[ii] + self.J_orientation[ii]
            # fill in the rest of the joints orientation info with 0
            for ii in range(end_point, self.num_joints):
                J[ii] = J[ii] + [0, 0, 0]

        # save to file
        cloudpickle.dump(J, open('%s/%s.J' %
                                 (self.config_folder, name), 'wb'))

    J = sp.Matrix(J).T  # correct the orientation of J
    if lambdify is False:
        return J
    return sp.lambdify(self.q + self.x, J)
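# The _calc_* methods above all follow the same pattern: build a Sympy expression
# once, cache it on disk with cloudpickle, and hand back a fast numeric function
# via sp.lambdify. A minimal self-contained sketch of that pattern, with a made-up
# file name and expression (not part of the robot config class):
def cached_lambdified(cache_file='example_expr.pkl'):
    import os
    import cloudpickle
    import sympy as sp

    q0, q1 = sp.symbols('q0 q1')
    # load the cached expression if it exists, otherwise build and cache it
    if os.path.isfile(cache_file):
        with open(cache_file, 'rb') as f:
            expr = cloudpickle.load(f)
    else:
        expr = sp.Matrix([sp.sin(q0) * sp.cos(q1), sp.cos(q0)])
        with open(cache_file, 'wb') as f:
            cloudpickle.dump(expr, f)
    # symbols with the same names compare equal, so lambdify works for both paths
    return sp.lambdify((q0, q1), expr)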
def save(self, path):
    """Save this workflow to disk

    Parameters
    ----------
    path: str
        The path to save the workflow to
    """
    # avoid a circular import getting the version
    from nvtabular import __version__ as nvt_version

    fs = fsspec.get_fs_token_paths(path)[0]
    fs.makedirs(path, exist_ok=True)

    # point all stat ops to store intermediate output (parquet etc) at the path
    # this lets us easily bundle
    for stat in _get_stat_ops([self.output_node]):
        stat.op.set_storage_path(path, copy=True)

    # generate a file of all versions used to generate this bundle
    lib = cudf if cudf else pd
    with fs.open(fs.sep.join([path, "metadata.json"]), "w") as o:
        json.dump(
            {
                "versions": {
                    "nvtabular": nvt_version,
                    lib.__name__: lib.__version__,
                    "python": sys.version,
                },
                "generated_timestamp": int(time.time()),
            },
            o,
        )

    # dump out the full workflow (graph/stats/operators etc) using cloudpickle
    with fs.open(fs.sep.join([path, "workflow.pkl"]), "wb") as o:
        cloudpickle.dump(self, o)
def save(
    self,
    path: tp.Union[str, pathlib.Path],
) -> None:
    """
    Saves the model to disk.

    It creates a directory that includes:

    - `{path}/model.pkl`: The `Model` object instance serialized with `pickle`,
        this allows you to re-instantiate the model later.

    This allows you to save the entirety of the states of a model
    in a directory structure which can be fully restored via
    `Model.load` if the model is already instantiated or `elegy.model.load`
    to load the model instance from its pickled version.

    ```python
    import elegy

    model.save('my_model')  # creates folder at 'my_model'
    del model  # deletes the existing model

    # returns a model identical to the previous one
    model = elegy.model.load('my_model')
    ```

    Arguments:
        path: path where model structure will be saved.
    """
    model = self.local()

    if isinstance(path, str):
        path = pathlib.Path(path)

    path.mkdir(parents=True, exist_ok=True)

    with open(path / "model.pkl", "wb") as f:
        cloudpickle.dump(model, f)
def save_itr_params(self, itr, params):
    """Save the parameters if at the right iteration.

    Args:
        itr (int): Number of iterations. Used as the index of snapshot.
        params (obj): Content of snapshot to be saved.

    Raises:
        ValueError: If snapshot_mode is not one of "all", "last" or "gap".
    """
    file_name = None

    if self._snapshot_mode == 'all':
        file_name = os.path.join(self._snapshot_dir, 'itr_%d.pkl' % itr)
    elif self._snapshot_mode == 'last':
        # override previous params
        file_name = os.path.join(self._snapshot_dir, 'params.pkl')
    elif self._snapshot_mode == 'gap':
        if itr % self._snapshot_gap == 0:
            file_name = os.path.join(self._snapshot_dir, 'itr_%d.pkl' % itr)
    elif self._snapshot_mode == 'gap_and_last':
        if itr % self._snapshot_gap == 0:
            file_name = os.path.join(self._snapshot_dir, 'itr_%d.pkl' % itr)
        file_name_last = os.path.join(self._snapshot_dir, 'params.pkl')
        with open(file_name_last, 'wb') as file:
            cloudpickle.dump(params, file)
    elif self._snapshot_mode == 'none':
        pass
    else:
        raise ValueError('Invalid snapshot mode {}'.format(
            self._snapshot_mode))

    if file_name:
        with open(file_name, 'wb') as file:
            cloudpickle.dump(params, file)
def save(task_path: Path, result=None, task=None):
    """
    Save a :class:`~pydra.engine.core.TaskBase` object and/or results.

    Parameters
    ----------
    task_path : :obj:`Path`
        Write directory
    result : :obj:`Result`
        Result to pickle and write
    task : :class:`~pydra.engine.core.TaskBase`
        Task to pickle and write
    """
    if task is None and result is None:
        raise ValueError("Nothing to be saved")
    task_path.mkdir(parents=True, exist_ok=True)
    if result:
        with (task_path / "_result.pklz").open("wb") as fp:
            cp.dump(result, fp)
    if task:
        with (task_path / "_task.pklz").open("wb") as fp:
            cp.dump(task, fp)
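# Hedged counterpart sketch for save() above: read back whatever was pickled under
# task_path. The helper name load_saved and its return convention are assumptions
# that simply mirror the file names written by save(); pydra ships its own loading
# helpers.
def load_saved(task_path):
    """Return (result, task) unpickled from task_path; missing files yield None."""
    import cloudpickle as cp

    result = task = None
    result_file = task_path / "_result.pklz"
    task_file = task_path / "_task.pklz"
    if result_file.exists():
        with result_file.open("rb") as fp:
            result = cp.load(fp)
    if task_file.exists():
        with task_file.open("rb") as fp:
            task = cp.load(fp)
    return result, task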
def save(cls, obj: Any, filename: str, **kwargs: Dict[str, Any]) -> None:
    """
    @param filename: str
    @param obj: Pickled
    @param kwargs: Dict[str, Any]
    @return: None
    """
    if cls.debug:
        print(f'PWD: {Path.cwd()}, Database root: {cls.root}')

    cls.root.mkdir(parents=True, exist_ok=True)
    filename = Path(f'{filename}.pickle')
    try:
        pobj = Pickled(obj=obj)
        pobj._save_data_hook(**kwargs)
        with open(cls.root / filename, 'wb') as f:
            pickle.dump(pobj, f)
    except Exception as e:
        print('The data type does not match the database')
        print('isinstance: ', isinstance(obj, SEq))
        print('type: ', type(obj))
        print(e)
def save(self, fpath: str = None) -> str:
    """
    Saves the Flow to a file by serializing it with cloudpickle.  This method is
    recommended if you wish to separate out the building of your Flow from its deployment.

    Args:
        - fpath (str, optional): the filepath where your Flow will be saved; defaults to
            `~/.prefect/flows/FLOW-NAME.prefect`

    Returns:
        - str: the full location the Flow was saved to
    """
    if fpath is None:
        path = "{home}/flows".format(home=prefect.context.config.home_dir)
        fpath = Path(os.path.expanduser(path)) / "{}.prefect".format(  # type: ignore
            slugify(self.name)
        )
        assert fpath is not None  # mypy assert
        fpath.parent.mkdir(exist_ok=True, parents=True)
    with open(str(fpath), "wb") as f:
        cloudpickle.dump(self, f)

    return str(fpath)
def save_game_state_on_exit(orchestrator_object: IGameOrchestrator) -> None:
    """
    This method is called whenever the game crashes or is stopped. It saves all the
    information of the played game by pickling every currently instantiated object to
    a file, using the cloudpickle library.

    :param IGameOrchestrator orchestrator_object: The orchestrator that controls all the game.
    :rtype: None.
    """
    now = datetime.now()
    current_time = now.strftime("%m-%d-%Y-%H:%M:%S")
    main_player = orchestrator_object.game.players[0]

    save_file_name = '{date}-saved-game-{name}-{job}.pckl'.format(
        date=current_time,
        name=main_player.name,
        job=main_player.job.get_name(),
    )

    save_file_path = '{working_directory}/saved_games/{file}'.format(
        working_directory=str(get_project_root()),
        file=save_file_name)

    f = open(save_file_path, 'wb')
    cloudpickle.dump(orchestrator_object, f)
    f.close()
def _save(self, data: pd.DataFrame) -> None:
    save_path = self._get_save_path()
    print(save_path)
    # mlflow.sklearn.save_model(sk_model=data,
    #                           path=save_path,
    #                           conda_env=self.conda_env,
    #                           **self._save_args)
    if not os.path.exists(".tmp"):
        os.makedirs(".tmp")
    model_path = os.path.join(".tmp", "model.pkl")
    with open(model_path, "wb+") as f:
        cloudpickle.dump(data, f)

    mlflow.pyfunc.save_model(
        save_path,
        python_model=ScikitWrapper(),
        artifacts={"model_path": model_path},
        code_path=["src/stitch_classify"],
        conda_env="src/environment.yml",
    )
def save_model(model: SerializableModel, path: str, no_zip=False):
    """
    Save model as a ZIP or a directory at path. Path may or may not contain the .zip extension

    Args:
        model (SerializableModel): Model to save
        path (str): Path to save the model at. May or may not have the .zip extension
        no_zip (bool): Do not create a zip
    """
    if not (hasattr(model, "serialize") and hasattr(model, "deserialize")):
        raise NotImplementedError(
            f"Serialize/Deserialize not implemented for model of class {model.__class__}")

    work_dir = None
    try:
        # First create a temporary directory to put all contents in
        work_dir = tempfile.mkdtemp()
        with open(f"{work_dir}/serialized.pkl", "wb") as f:
            cloudpickle.dump(model, f)

        # Now let the model save whatever it wants
        model.serialize(work_dir)

        if no_zip:
            shutil.rmtree(path, ignore_errors=True)
            shutil.copytree(work_dir, path)
            return

        # Package it up into a zip and clean up the directory
        if path.endswith(".zip"):
            path = path[:-len(".zip")]
        shutil.make_archive(path, "zip", work_dir)
    finally:
        if work_dir is not None:
            shutil.rmtree(work_dir)
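# Hedged sketch of the loading side implied by save_model() above: unpack the zip
# (or use the directory directly), unpickle serialized.pkl, then let the model
# restore its own extra state via deserialize(). The function name load_model and
# the exact call order are assumptions based only on the save format above.
def load_model(path: str):
    """Load a model saved by save_model() (sketch)."""
    import os
    import shutil
    import tempfile
    import zipfile
    import cloudpickle

    work_dir = None
    try:
        if os.path.isdir(path):
            model_dir = path
        else:
            work_dir = tempfile.mkdtemp()
            zip_path = path if path.endswith(".zip") else path + ".zip"
            with zipfile.ZipFile(zip_path, "r") as zf:
                zf.extractall(work_dir)
            model_dir = work_dir
        with open(os.path.join(model_dir, "serialized.pkl"), "rb") as f:
            model = cloudpickle.load(f)
        model.deserialize(model_dir)
        return model
    finally:
        if work_dir is not None:
            shutil.rmtree(work_dir)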
def find_out_feasible_states(env, log_dir, distance_threshold=0.1, brownian_variance=1,
                             animate=False):
    no_new_states = 0
    with env.set_kill_outside():
        load_dir = 'data_upload/state_collections/'
        old_all_feasible_starts = pickle.load(
            open(osp.join(load_dir, 'all_feasible_states.pkl'), 'rb'))
        out_feasible_starts = StateCollection(distance_threshold=distance_threshold)
        print('number of feasible starts: ', old_all_feasible_starts.size)
        for start in old_all_feasible_starts.state_list:
            obs = env.reset(init_state=start)
            if obs[16] > -0.5:
                # print("got one more up to ", out_feasible_starts.size)
                out_feasible_starts.append([start])
        print("number of out feasible starts:", out_feasible_starts.size)
        while no_new_states < 5:
            total_num_starts = out_feasible_starts.size
            starts = out_feasible_starts.sample(100)
            new_starts = generate_starts(env, starts=starts, horizon=1000, size=100000,
                                         variance=brownian_variance,
                                         animated=animate, speedup=10)
            out_feasible_starts.append(new_starts)
            num_new_starts = out_feasible_starts.size - total_num_starts
            logger.log("number of new states: " + str(num_new_starts))
            if num_new_starts < 10:
                no_new_states += 1
        with open(osp.join(log_dir, 'all_out_feasible_states.pkl'), 'wb') as f:
            cloudpickle.dump(out_feasible_starts, f, protocol=3)
def persist(self, path, persistor=None, create_unique_subfolder=True):
    entity_extractor_file, entity_extractor_config_file = None, None
    timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')

    if create_unique_subfolder:
        dir_name = os.path.join(path, "model_" + timestamp)
        os.mkdir(dir_name)
    else:
        dir_name = path

    data_file = os.path.join(dir_name, "training_data.json")
    classifier_file, ner_dir = None, None
    if self.intent_classifier:
        classifier_file = os.path.join(dir_name, "intent_classifier.pkl")
    if self.entity_extractor:
        ner_dir = os.path.join(dir_name, 'ner')
        if not os.path.exists(ner_dir):
            os.mkdir(ner_dir)
        entity_extractor_config_file = os.path.join(ner_dir, "config.json")
        entity_extractor_file = os.path.join(ner_dir, "model")

    write_training_metadata(dir_name, timestamp, data_file, self.name,
                            self.language_name, classifier_file, ner_dir)

    with open(data_file, 'w') as f:
        f.write(self.training_data.as_json(indent=2))
    if self.intent_classifier:
        with open(classifier_file, 'wb') as f:
            cloudpickle.dump(self.intent_classifier, f)
    if self.entity_extractor:
        with open(entity_extractor_config_file, 'w') as f:
            json.dump(self.entity_extractor.ner.cfg, f)
        self.entity_extractor.ner.model.dump(entity_extractor_file)
    if persistor is not None:
        persistor.send_tar_to_s3(dir_name)
def create_and_save_data_preparation(data_preparation_function: Callable, path: str) -> None:
    """Create, serialize and save a DataPreparation instance.

    Parameters
    ----------
    data_preparation_function : Callable
        A function to use as data preparation. You can use your own custom code for
        data preparation, but it must be wrapped in a single function.

        NOTE: If the data preparation includes any kind of fitting on the training dataset
        (e.g. Scikit Learn transformers), it must be performed outside the final data
        preparation function to save. Fit the transformer(s) outside the function and put
        only the transform method inside it. Furthermore, if the entire data preparation
        is performed with a single Scikit-Learn transformer, you can directly pass it
        (fitted) to this method.
    path : str
        Local path to save the data preparation to.

    Raises
    ------
    TypeError
        If data_preparation_function is not a function (Callable type)
    ClearboxWrapperException
        If data preparation path already exists.
    """
    if not isinstance(data_preparation_function, Callable):
        raise TypeError(
            "data_preparation_function should be a Callable, got '{}'".format(
                type(data_preparation_function)))

    if os.path.exists(path):
        raise ClearboxWrapperException(
            "Data preparation path '{}' already exists".format(path))

    data_preparation = DataPreparation(data_preparation_function)
    with open(path, "wb") as data_preparation_serialized_file:
        cloudpickle.dump(data_preparation, data_preparation_serialized_file)
def save(cfg, filename: str):
    """
    Args:
        cfg: an omegaconf config object
        filename: yaml file name to save the config file
    """
    logger = logging.getLogger(__name__)
    try:
        cfg = deepcopy(cfg)
    except Exception:
        pass
    else:
        # if it's deep-copyable, then...
        def _replace_type_by_name(x):
            if "_target_" in x and callable(x._target_):
                try:
                    x._target_ = _convert_target_to_string(x._target_)
                except AttributeError:
                    pass

        # not necessary, but makes yaml look nicer
        _visit_dict_config(cfg, _replace_type_by_name)

    try:
        with PathManager.open(filename, "w") as f:
            OmegaConf.save(cfg, f)
    except Exception:
        logger.exception("Unable to serialize the config to yaml. Error:")
        new_filename = filename + ".pkl"
        try:
            # retry by pickle
            with PathManager.open(new_filename, "wb") as f:
                cloudpickle.dump(cfg, f)
            logger.warning(f"Config saved using cloudpickle at {new_filename} ...")
        except Exception:
            pass
def _calc_R(self, name, lambdify=True):
    """Uses Sympy to generate the rotation matrix for a joint or link

    Parameters
    ----------
    name : string
        name of the joint, link, or end-effector
    lambdify : boolean, optional (Default: True)
        if True returns a function to calculate the matrix.
        If False returns the Sympy matrix
    """
    R = None
    R_func = None
    filename = name + "_R"

    # check to see if we have the rotation matrix saved in file
    R, R_func = self._load_from_file(filename, lambdify=True)

    if R is None and R_func is None:
        # if no saved file was loaded, generate function
        print("Generating rotation matrix function.")
        R = self._calc_T(name=name)[:3, :3]

        # save to file
        abr_control.utils.os_utils.makedirs(
            "%s/%s" % (self.config_folder, filename)
        )
        cloudpickle.dump(
            sp.Matrix(R),
            open("%s/%s/%s" % (self.config_folder, filename, filename), "wb"),
        )

    if R_func is None:
        R_func = self._generate_and_save_function(
            filename=filename, expression=R, parameters=self.q
        )
    return R_func
def persist(self, path, persistor=None, create_unique_subfolder=True):
    timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')

    if create_unique_subfolder:
        dir_name = os.path.join(path, "model_" + timestamp)
        os.mkdir(dir_name)
    else:
        dir_name = path

    data_file = os.path.join(dir_name, "training_data.json")
    classifier_file = os.path.join(dir_name, "intent_classifier.dat")
    entity_extractor_file = os.path.join(dir_name, "entity_extractor.dat")
    entity_synonyms_file = os.path.join(
        dir_name, "index.json") if self.training_data.entity_synonyms else None

    write_training_metadata(dir_name, timestamp, data_file, MITIE_SKLEARN_BACKEND_NAME,
                            'en', classifier_file, entity_extractor_file,
                            entity_synonyms_file, self.fe_file)

    with open(data_file, 'w') as f:
        f.write(self.training_data.as_json(indent=2))
    if self.training_data.entity_synonyms:
        with open(entity_synonyms_file, 'w') as f:
            json.dump(self.training_data.entity_synonyms, f)
    if self.intent_classifier:
        with open(classifier_file, 'wb') as f:
            cloudpickle.dump(self.intent_classifier, f)
    self.entity_extractor.save_to_disk(entity_extractor_file, pure_model=True)

    if persistor is not None:
        persistor.send_tar_to_s3(dir_name)
def pickle_model(fold_output_path, trained_workflow, model_name='model.pkl'):
    """Pickle and reload trained workflow.

    If workflow can't be pickled, print warning and return original workflow.

    Parameters
    ----------
    fold_output_path : str
        the path into which the model will be pickled
    trained_workflow : a rampwf.workflow
        the workflow to be pickled
    model_name : str (default='model.pkl')
        the file name of the pickled workflow

    Returns
    -------
    trained_workflow : a rampwf.workflow
        either the input workflow or the pickled and reloaded workflow
    """
    msg = "Warning: model can't be pickled."
    model_file = os.path.join(fold_output_path, model_name)
    try:
        with open(model_file, 'wb') as pickle_file:
            cloudpickle.dump(trained_workflow, pickle_file)
    except pickle.PicklingError as e:
        print_warning(msg)
        print_warning(e)
        return trained_workflow
    else:
        # check if dumped trained_workflow can be loaded
        try:
            with open(model_file, 'rb') as pickle_file:
                trained_workflow = cloudpickle.load(pickle_file)
        except Exception as e:
            print_warning(msg)
            print_warning(e)
    return trained_workflow
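# Illustrative call of pickle_model() above, with a made-up output path; because the
# function returns either the original or the pickled-and-reloaded workflow, callers
# can use the return value transparently in both cases.
#
#     trained_workflow = pickle_model('./fold_0', trained_workflow, model_name='model.pkl')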
def test_run_flow(monkeypatch):
    file_path = os.path.dirname(prefect.environments.execution.dask.k8s.__file__)
    environment = KubernetesJobEnvironment(path.join(file_path, "job.yaml"))

    flow_runner = MagicMock()
    monkeypatch.setattr(
        "prefect.engine.get_default_flow_runner_class",
        MagicMock(return_value=flow_runner),
    )

    with tempfile.TemporaryDirectory() as directory:
        with open(os.path.join(directory, "flow_env.prefect"), "w+") as env:
            flow = prefect.Flow("test")
            flow_path = os.path.join(directory, "flow_env.prefect")
            with open(flow_path, "wb") as f:
                cloudpickle.dump(flow, f)

        with set_temporary_config({"cloud.auth_token": "test"}):
            with prefect.context(
                flow_file_path=os.path.join(directory, "flow_env.prefect")
            ):
                environment.run_flow()

        assert flow_runner.call_args[1]["flow"].name == "test"
def _finalize(self, link_as=None, name_hint=None):
    log.debug(f'finalize hint={name_hint} link_as={link_as} {self._state}')
    if link_as and self._state == 'spun_down':
        self.hooks.save.in_reverse()
        temp_path = self.path
        self.path = temp.unique_dir(self._parent_path, hint=name_hint)
        log.debug(f'saving to temp {temp_path}')
        self._state = 'saving'
        self.expiration.depend_on_loaded_python_modules()
        self.log.finalize()
        with open(os.path.join(temp_path, 'machine.clpickle'), 'wb') as f:
            cloudpickle.dump(self, f)
        log.debug(f'moving {temp_path} to {self.path}')
        os.rename(temp_path, self.path)
        self._state = 'saved'
        link_this = self.path
    else:
        self.log.finalize()
        assert self._state in ('spun_down', 'loaded', 'dropped')
        log.info(f'discarding {self.path}')
        # TODO: track whether the step will be transient-last and reflink?
        with open(os.path.join(self.path, 'log.txt')) as f:
            self.log_contents = f.read()
        temp.remove(self.path)
        link_this = self._parent_path
        self._state = 'dropped'
    if (link_this and link_as and
            os.path.realpath(link_as) != os.path.realpath(link_this)):
        log.debug(f'linking {link_this} to {link_as}')
        if os.path.lexists(link_as):
            if os.path.exists(link_as) and not needs_a_rebuild(link_as):
                log.critical(f'Refusing to overwrite fresh {link_as}')
                raise RuntimeError(f'Not overriding fresh {link_as}')
            os.unlink(link_as)
        os.symlink(link_this, link_as)
        return link_as
def save(self, directory: str):
    shutil.copyfile(self.model_file, os.path.join(directory, "dan.pt"))
    shell(f"rm -f {self.model_file}")
    with open(os.path.join(directory, "dan.pkl"), "wb") as f:
        cloudpickle.dump(
            {
                "page_field": self.page_field,
                "combined_text_field": self.text_field,
                "unigram_text_field": self.unigram_field,
                "bigram_text_field": self.bigram_field,
                "trigram_text_field": self.trigram_field,
                "combined_ngrams": self.combined_ngrams,
                "unigrams": self.unigrams,
                "bigrams": self.bigrams,
                "trigrams": self.trigrams,
                "combined_max_vocab_size": self.combined_max_vocab_size,
                "unigram_max_vocab_size": self.unigram_max_vocab_size,
                "bigram_max_vocab_size": self.bigram_max_vocab_size,
                "trigram_max_vocab_size": self.trigram_max_vocab_size,
                "qanta_id_field": self.qanta_id_field,
                "n_classes": self.n_classes,
                "gradient_clip": self.gradient_clip,
                "n_hidden_units": self.n_hidden_units,
                "n_hidden_layers": self.n_hidden_layers,
                "nn_dropout": self.nn_dropout,
                "batch_size": self.batch_size,
                "use_wiki": self.use_wiki,
                "n_wiki_sentences": self.n_wiki_sentences,
                "wiki_title_replace_token": self.wiki_title_replace_token,
                "lowercase": self.lowercase,
                "pooling": self.pooling,
                "random_seed": self.random_seed,
                "config_num": self.config_num,
            },
            f,
        )
def map(self, parallel_task, args):
    uid = 'slurmc.' + str(uuid4())
    outd = os.path.join(self.tmp_dir, uid)
    os.makedirs(outd)
    batches = list(split_evenly(args, self.max_tasks))
    sfile = os.path.join(outd, 'exdata.cloudpickle')
    with open(sfile, 'wb') as f:
        cloudpickle.dump(
            {
                'f': parallel_task,
                'args': batches,
                'outd': outd
            }, f)

    self.template = parse_template(self.template, sfile=sfile)
    n_tasks = len(batches)
    ar = self.poll_loop(n_tasks, outd)
    for i in tqdm(ar, desc="(SLURM)", total=n_tasks):
        pass
    shutil.rmtree(outd)
def plot_pitch_slider(event_start_frame=LIV_GOALS[MATCH]['START'],
                      event_end_frame=LIV_GOALS[MATCH]['END'],
                      field_dimen=(106.0, 68.0),
                      data=None, frame=None):
    tracking_frames = range(event_start_frame, event_end_frame + 1)

    # make figure
    fig_dict = {
        "data": [],
        "layout": {},
        "frames": []
    }

    fig_dict['layout'] = generate_pitch_layout(tracking_frames)
    fig_dict['data'] = generate_data_for_frame(frame_num=event_start_frame)
    fig_dict['frames'] = generate_plotly_frames_for_event(tracking_frames)

    # fig = go.Figure(fig_dict)
    # fig.show()

    # https://anvil.works/forum/t/serialization-of-graph-objects/4134/2
    with open('../datahub/lastrow/{}_fig_dict_white.pickle'.format(LIV_GOALS[MATCH]['PLAY']),
              'wb') as handle:
        cloudpickle.dump(fig_dict, handle, protocol=cloudpickle.DEFAULT_PROTOCOL)
        handle.close()
def subrun(self):
    with unique_tempdir() as tmp_f:
        with open(os.path.join(tmp_f, "input"), "w") as f:
            cloudpickle.dump((self.func, self.args), f)
        server = '''
import sys
import cloudpickle
import StringIO
input_ = StringIO.StringIO(sys.stdin.read())
stdout = sys.stdout
sys.stdout = StringIO.StringIO()
input_.seek(0)
(func, args) = cloudpickle.load(input_);
res = func(args)
for line in res:
    stdout.write(line)
'''
        cmd = '''cd {cwd} && cat {input} | OMP_NUM_THREADS=1 nice -n +19 python -c "{server}" > {output} '''.format(
            server=server,
            input=os.path.join(tmp_f, "input"),
            output=os.path.join(tmp_f, "output"),
            cwd=os.path.dirname(os.path.abspath(__file__)))
        process = subprocess.Popen("/bin/bash -c '{}'".format(
            cmd.replace("'", "'\\''")), shell=True)
        while process.poll() is None:
            if self.terminate:
                process.kill()
            time.sleep(1)
        nothing, err = process.communicate()
        retcode = process.poll()
        if retcode and not self.terminate:
            raise ProcessError(retcode, None, err)
        if self.result_file is not None:
            shutil.copy(os.path.join(tmp_f, "output"), self.result_file)
def save_itr_params(itr, params, use_cloudpickle=True, pkl_prefix=''):
    if _snapshot_dir:
        if _snapshot_mode == 'all':
            file_name = osp.join(get_snapshot_dir(), pkl_prefix + 'itr_%d.pkl' % itr)
        elif _snapshot_mode == 'last':
            # override previous params
            file_name = osp.join(get_snapshot_dir(), pkl_prefix + 'params.pkl')
        elif _snapshot_mode == "gap":
            if itr == 0 or (itr + 1) % _snapshot_gap == 0:
                file_name = osp.join(get_snapshot_dir(), pkl_prefix + 'itr_%d.pkl' % itr)
            else:
                return
        elif _snapshot_mode == 'none':
            return
        else:
            raise NotImplementedError
        if use_cloudpickle:
            import cloudpickle
            with open(file_name, 'wb') as f:
                cloudpickle.dump(params, f, protocol=3)
        else:
            joblib.dump(params, file_name, compress=3)
def dump_trajectories(self, force=False):
    """Dumps trajectories in a new shard.

    Should be called at most once per epoch.

    Args:
        force: (bool) Whether to complete unfinished trajectories and create
            a new shard even if we have not reached the minimum size.
    """
    if self.trajectory_dump_dir is None:
        return
    gfile.makedirs(self.trajectory_dump_dir)

    trajectories = self.train_env.trajectories
    if force:
        trajectories.complete_all_trajectories()

    # complete_all_trajectories() also adds trajectories that were just reset.
    # We don't want them since they have just the initial observation and no
    # actions, so we filter them out.
    def has_any_action(trajectory):
        return (trajectory.time_steps and
                trajectory.time_steps[0].action is not None)
    self._trajectory_buffer.extend(
        filter(has_any_action, trajectories.completed_trajectories))

    trajectories.clear_completed_trajectories()
    ready = (len(self._trajectory_buffer) >=
             self._trajectory_dump_min_count_per_shard)
    if ready or force:
        shard_path = os.path.join(
            self.trajectory_dump_dir, "{}.pkl".format(self.epoch))
        with gfile.GFile(shard_path, "wb") as f:
            pickle.dump(self._trajectory_buffer, f)
        self._trajectory_buffer = []
def main():
    args = parse_args()
    setup_logging(args.logfile)
    log = get_logger()

    assert (0 <= args.hidden_fraction <= 1)
    np.random.seed(args.random_seed)
    tf.set_random_seed(args.random_seed)

    log.info('*' * 100)
    log.info('[Starting MC experiment]')
    log_dict(log.info, vars(args))

    log.info('[Loading target GIs]')
    with open(args.target_gis, 'rb') as f:
        tgt_gis = cpkl.load(f)

    log.info('[Loading source GIs]')
    with open(args.source_gis, 'rb') as f:
        src_gis = cpkl.load(f)

    log.info('[Loading sim scores]')
    with open(args.sim_scores, 'rb') as f:
        sim_scores_data = cpkl.load(f)
        sim_scores = sim_scores_data['values']
        sim_scores = sim_scores / np.max(sim_scores)  # Normalize

    # log.info('\t- %d scores', len(sim_scores))

    hp_param_space = xsmf_param_space(args)
    results, models, training_curves, trials = \
        run_xsmf_experiment(tgt_gis=tgt_gis,
                            src_gis=src_gis,
                            space=hp_param_space,
                            sim_scores=sim_scores,
                            val_hf=args.val_hidden_fraction,
                            test_hf=args.hidden_fraction,
                            n_repeats=args.n_repeats,
                            hp_iters=args.n_hyperopt_iters,
                            hp_seed=args.random_seed)

    # Save results and other information
    log_results(results['summary'])
    with open(args.results_output, 'w') as f:
        json.dump(results, f, indent=2)

    with open(args.training_curve_output, 'wb') as f:
        cpkl.dump(training_curves, f)

    # TODO: save models; the models cannot be pickled at the moment.
    # We will need to implement a from dict and a to dict method
    with open(args.models_output, 'wb') as f:
        cpkl.dump(trials, f)

    with open(args.trials_output, 'wb') as f:
        cpkl.dump(trials, f)
def _main():
    parser = argparse.ArgumentParser(description='Encodes images in a directory.',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(dest='images_dir', type=str, help='The images directory.')
    parser.add_argument(dest='model_path', type=str, help='The path to the Keras model.')
    parser.add_argument(dest='output_file', type=str, help='The output .pickle file.')
    parser.add_argument('--image-size', type=int, default=224, help='The image size.')
    parser.add_argument('--only-hor', action='store_true', default=False,
                        help='Only do horizontal flips.')
    parser.add_argument('--fill', action='store_true', default=False,
                        help='Zoom to fill letterbox if the image is small.')
    args = parser.parse_args()

    images = os.listdir(args.images_dir)
    full_paths = [os.path.join(args.images_dir, img) for img in images]

    embeddings = encode_images(
        model_path=args.model_path,
        images=full_paths,
        letterbox_size=args.image_size,
        verbose=True,
        onlyhor=args.only_hor,
        fill=args.fill
    )

    with open(args.output_file, 'wb') as of:
        cloudpickle.dump(embeddings, of)
def _save_model(sk_model, output_path, serialization_format, protocol=None):
    """
    :param sk_model: The scikit-learn model to serialize.
    :param output_path: The file path to which to write the serialized model.
    :param serialization_format: The format in which to serialize the model. This should
                                 be one of the following:
                                 ``mlflow.sklearn.SERIALIZATION_FORMAT_PICKLE`` or
                                 ``mlflow.sklearn.SERIALIZATION_FORMAT_CLOUDPICKLE``.
    :param protocol: The pickle protocol version. If ``None``, the default protocol
                     version from cloudpickle will be used.
    """
    with open(output_path, "wb") as out:
        if serialization_format == SERIALIZATION_FORMAT_PICKLE:
            pickle.dump(sk_model, out)
        elif serialization_format == SERIALIZATION_FORMAT_CLOUDPICKLE:
            import cloudpickle

            cloudpickle.dump(sk_model, out, protocol)
        else:
            raise MlflowException(
                message="Unrecognized serialization format: {serialization_format}".format(
                    serialization_format=serialization_format),
                error_code=INTERNAL_ERROR,
            )
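# Hedged counterpart sketch for _save_model() above. cloudpickle writes standard
# pickle byte streams, so a model saved in either serialization format can be read
# back with pickle.load; the name _load_model is an assumption for illustration,
# not mlflow's actual API.
def _load_model(input_path):
    """Deserialize a scikit-learn model written by _save_model() (sketch)."""
    import pickle

    with open(input_path, "rb") as f:
        return pickle.load(f)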
def subrun(self):
    with unique_tempdir() as tmp_f:
        with open(os.path.join(tmp_f, "input"), "w") as f:
            cloudpickle.dump((self.func, self.args), f)
        server = '''
import sys
import cloudpickle
import StringIO
input_ = StringIO.StringIO(sys.stdin.read())
stdout = sys.stdout
sys.stdout = StringIO.StringIO()
input_.seek(0)
(func, args) = cloudpickle.load(input_);
res = func(args)
for line in res:
    stdout.write(line)
'''
        cmd = '''cd {cwd} && cat {input} | OMP_NUM_THREADS=1 nice -n +19 python -c "{server}" > {output} '''.format(
            server=server,
            input=os.path.join(tmp_f, "input"),
            output=os.path.join(tmp_f, "output"),
            cwd=os.path.dirname(os.path.abspath(__file__))
        )
        process = subprocess.Popen("/bin/bash -c '{}'".format(cmd.replace("'", "'\\''")),
                                   shell=True)
        while process.poll() is None:
            if self.terminate:
                process.kill()
            time.sleep(1)
        nothing, err = process.communicate()
        retcode = process.poll()
        if retcode and not self.terminate:
            raise ProcessError(retcode, None, err)
        if self.result_file is not None:
            shutil.copy(os.path.join(tmp_f, "output"), self.result_file)
def get_env(version='v1.0-mini', env_path=None, save_pkl_path=None, render_bev=True, config={}):
    if env_path is not None:
        t = time.time()
        env = cloudpickle.load(open(env_path, 'rb'))
        print(f"env load time: {time.time()-t}")
    else:
        env_config = config
        env_config['config']['NuScenesAgent_config']['version'] = version
        env = class_from_path(env_config['type'])(env_config['config'])

    if not render_bev:
        env.config['render_type'] = []
        if 'pedestrian' in env.graphics.plot_list:
            env.graphics.plot_list.remove('pedestrian')
        if 'map_info' in env.graphics.plot_list:
            env.graphics.plot_list.remove('map_info')
        if 'cam' in env.graphics.plot_list:
            env.graphics.plot_list.remove('cam')

    if save_pkl_path is not None:
        cloudpickle.dump(env, open(save_pkl_path, 'wb'))

    return env
def add_flow(self, flow: "Flow") -> str:
    """
    Method for storing a new flow as bytes in the local filesytem.

    Args:
        - flow (Flow): a Prefect Flow to add

    Returns:
        - str: the location of the newly added flow in this Storage object

    Raises:
        - ValueError: if a flow with the same name is already contained in this storage
    """
    if flow.name in self:
        raise ValueError(
            'Name conflict: Flow with the name "{}" is already present in this storage.'
            .format(flow.name))

    flow_location = os.path.join(self.directory, "{}.prefect".format(slugify(flow.name)))
    with open(flow_location, "wb") as f:
        cloudpickle.dump(flow, f)

    self.flows[flow.name] = flow_location
    return flow_location
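# Hedged sketch of how a flow written by add_flow() above can be read back from its
# on-disk location. Prefect's Local storage provides its own flow accessor, so this
# standalone helper is only an illustration of the stored format: one cloudpickle
# byte stream per flow.
def read_flow(flow_location):
    import cloudpickle

    with open(flow_location, "rb") as f:
        return cloudpickle.load(f)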
def save(self, directory: str):
    shutil.copyfile(self.model_file, os.path.join(directory, 'rnn.pt'))
    shell(f'rm -f {self.model_file}')
    with open(os.path.join(directory, 'rnn.pkl'), 'wb') as f:
        cloudpickle.dump({
            'page_field': self.page_field,
            'text_field': self.text_field,
            'qanta_id_field': self.qanta_id_field,
            'n_classes': self.n_classes,
            'gradient_clip': self.gradient_clip,
            'n_hidden_units': self.n_hidden_units,
            'n_hidden_layers': self.n_hidden_layers,
            'lr': self.lr,
            'nn_dropout': self.nn_dropout,
            'sm_dropout': self.sm_dropout,
            'batch_size': self.batch_size,
            'use_wiki': self.use_wiki,
            'n_wiki_sentences': self.n_wiki_sentences,
            'wiki_title_replace_token': self.wiki_title_replace_token,
            'lowercase': self.lowercase,
            'random_seed': self.random_seed,
            'config_num': self.config_num
        }, f)
def save_table(data_frame, file_path, metadata_ext='.pklmetadata'):
    """
    Saves a DataFrame to disk along with its metadata in a pickle format.

    This function saves a DataFrame to disk along with its metadata from
    the catalog.

    Specifically, this function saves the DataFrame in the given
    file path, and saves the metadata in the same directory (as the
    file path) but with a different extension. This extension can be
    optionally given by the user (defaults to '.pklmetadata').

    Args:
        data_frame (DataFrame): The DataFrame that should be saved.

        file_path (string): The file path where the DataFrame must be stored.

        metadata_ext (string): The metadata extension that should be used while
            storing the metadata information. The default value is
            '.pklmetadata'.

    Returns:
        A Boolean value of True is returned if the DataFrame is successfully
        saved.

    Raises:
        AssertionError: If `data_frame` is not of type pandas DataFrame.
        AssertionError: If `file_path` is not of type string.
        AssertionError: If `metadata_ext` is not of type string.
        AssertionError: If a file cannot be written in the given `file_path`.

    Examples:

        >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]})
        >>> em.save_table(A, './A.pkl') # will store two files ./A.pkl and ./A.pklmetadata

        >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]})
        >>> em.save_table(A, './A.pkl', metadata_ext='.pklmeta') # will store two files ./A.pkl and ./A.pklmeta

    See Also:
        :meth:`~py_entitymatching.load_table`

    Note:
        This function is a bit different from to_csv_metadata, where the
        DataFrame is stored in a CSV file format. The CSV file format can be
        viewed using a text editor. But a DataFrame stored using 'save_table' is
        stored in a special format, which cannot be viewed with a text editor.
        The reason we have save_table is, for larger DataFrames it is
        efficient to pickle the DataFrame to disk than writing the DataFrame
        in CSV format.
    """
    # Validate the input parameters
    validate_object_type(data_frame, pd.DataFrame)
    validate_object_type(file_path, six.string_types, error_prefix='Input file path')
    validate_object_type(metadata_ext, six.string_types, error_prefix='Input Metadata ext')

    # Get the file_name (without extension) and the extension from the given
    # file path. For example if the file_path was /Users/foo/file.csv then
    # the file_name will be /Users/foo/file and the extension will be '.csv'
    file_name, _ = os.path.splitext(file_path)

    # The metadata file name is the same file name but with the extension
    # given by the user
    metadata_filename = file_name + metadata_ext

    # Check if the file exists in the file_path and whether we have
    # sufficient access privileges to write in that path
    can_write, file_exists = ps._check_file_path(file_path)

    if can_write:
        # If the file already exists then issue a warning and overwrite the
        # file
        if file_exists:
            logger.warning(
                'File already exists at %s; Overwriting it', file_path)
            # we open the file_path in binary mode, as we are writing in
            # binary format
            with open(file_path, 'wb') as file_handler:
                cloudpickle.dump(data_frame, file_handler)
        else:
            with open(file_path, 'wb') as file_handler:
                cloudpickle.dump(data_frame, file_handler)
    else:
        # Looks like we cannot write the file in the given path. Raise an
        # error in this case.
        logger.error('Cannot write in the file path %s; Exiting', file_path)
        raise AssertionError('Cannot write in the file path %s', file_path)

    # Once we are done with writing the DataFrame, we will write the metadata
    # now

    # Initialize a metadata dictionary to hold the metadata of DataFrame from
    # the catalog
    metadata_dict = collections.OrderedDict()

    # Check if the DataFrame information is present in the catalog and get all
    # the properties for the input data frame
    properties = {}
    if cm.is_dfinfo_present(data_frame) is True:
        properties = cm.get_all_properties(data_frame)

    # If the properties are present in the catalog, then write properties to
    # disk
    if len(properties) > 0:
        for property_name, property_value in six.iteritems(properties):
            if isinstance(property_value, six.string_types) is True:
                metadata_dict[property_name] = property_value

    # try to save metadata
    can_write, file_exists = ps._check_file_path(metadata_filename)
    if can_write:
        # If the file already exists, then issue a warning and overwrite the
        # file
        if file_exists:
            logger.warning(
                'Metadata file already exists at %s. Overwriting it',
                metadata_filename)
            # write metadata contents
            with open(metadata_filename, 'wb') as file_handler:
                cloudpickle.dump(metadata_dict, file_handler)
        else:
            # write metadata contents
            with open(metadata_filename, 'wb') as file_handler:
                cloudpickle.dump(metadata_dict, file_handler)
    else:
        logger.warning(
            'Cannot write metadata at the file path %s. Skip writing metadata '
            'file', metadata_filename)

    return True
def save_object(object_to_save, file_path):
    """
    Saves a Python object to disk.

    This function is intended to be used to save py_entitymatching objects such as
    rule-based blocker, feature vectors, etc. A user would like to store
    py_entitymatching objects to disk, when he/she wants to save the workflow and
    resume it later. This function provides a way to save the required objects to
    disk.

    This function takes in the object to save and the file path. It pickles the
    object and stores it in the file path specified.

    Args:
        object_to_save (Python object): The Python object to save. This can be
            a rule-based blocker, feature vectors, etc.

        file_path (string): The file path where the object must be saved.

    Returns:
        A Boolean value of True is returned, if the saving was successful.

    Raises:
        AssertionError: If `file_path` is not of type string.
        AssertionError: If a file cannot be written in the given `file_path`.

    Examples:

        >>> import pandas as pd
        >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]})
        >>> B = pd.DataFrame({'id' : [1, 2], 'colA':['c', 'd'], 'colB' : [30, 40]})
        >>> rb = em.RuleBasedBlocker()
        >>> block_f = em.get_features_for_blocking(A, B)
        >>> rule1 = ['colA_colA_lev_dist(ltuple, rtuple) > 3']
        >>> rb.add_rule(rule1)
        >>> em.save_object(rb, './rule_blocker.pkl')

    See Also:
        :meth:`~load_object`
    """
    # Validate input parameters
    validate_object_type(file_path, six.string_types, 'Input file path')

    # Check whether the file path is valid and if a file is already present
    # at that path.
    # noinspection PyProtectedMember
    can_write, file_exists = ps._check_file_path(file_path)

    # Check whether we can write
    if can_write:
        # If a file already exists in that location, issue a warning and
        # overwrite the file.
        if file_exists:
            logger.warning(
                'File already exists at %s; Overwriting it', file_path)
            # we open the file in 'wb' mode as we are writing a binary file.
            with open(file_path, 'wb') as file_handler:
                cloudpickle.dump(object_to_save, file_handler)
        else:
            with open(file_path, 'wb') as file_handler:
                cloudpickle.dump(object_to_save, file_handler)
    # If we cannot write, then raise an error.
    else:
        logger.error('Cannot write in the file path %s; Exiting', file_path)
        raise AssertionError('Cannot write in the file path %s', file_path)

    # Return True if everything was successful.
    return True
def main():
    args = argparser.parse_args()
    n_pts = int(args.points)

    print 'Loading data'
    with open(os.path.abspath(args.in_file), 'rb') as f:
        data = cloudpickle.load(f)

    print 'subtracting mean'
    time_arr = data['time_arr'] - np.mean(data['time_arr'])
    gyr_arr = data['gyr_arr'] - np.tile(
        np.mean(data['gyr_arr'], axis=1).reshape((3, 1)),
        (1, data['gyr_arr'].shape[1])
    )
    acc_arr = data['acc_arr'] - np.tile(
        np.mean(data['acc_arr'], axis=1).reshape((3, 1)),
        (1, data['acc_arr'].shape[1])
    )

    # M: number of axes
    # N: number of epochs
    if acc_arr.shape != gyr_arr.shape:
        raise Exception('different sizes')
    M, N = gyr_arr.shape

    # automate this?
    print 'Computing mean dt'
    t0 = np.mean(np.diff(time_arr))
    fs = np.float64(1.0) / t0

    n = np.power(2, np.arange(np.floor(np.log2(N / 2.))))
    end_log_inc = np.log10(n[-1])
    m = shared_from_array(
        np.unique(np.ceil(np.logspace(0, end_log_inc, n_pts))).astype(np.int64)
    )
    T = m * t0
    if (T < 0).any():
        print 'T < 0'
        set_trace()

    # setup input/output shared memory arrays
    theta_gyr = shared_from_array(np.cumsum(gyr_arr, axis=1))
    theta_acc = shared_from_array(np.cumsum(acc_arr, axis=1))
    sigma2_gyr = shared_from_array(np.zeros((M, len(m))))
    sigma2_acc = shared_from_array(np.zeros((M, len(m))))

    # shared memory/serialization workaround: define calculation functions here so
    # that the shared memory arrays are in scope
    def adev_at_tau(i):
        """worker function for parallelization. first part of the Allan deviation
        equation. There is potentially a way to do the Allan deviation calculation
        without any for loop whatsoever, but I haven't figured it out yet. It would
        require 2D array indexing in NumPy.
        """
        k = range(N - 2 * m[i])
        sigma2_gyr[:, i] = np.sum(
            np.power(
                theta_gyr[:, k + 2 * m[i]] - 2 * theta_gyr[:, k + m[i]] + theta_gyr[:, k],
                2
            ), axis=1)
        sigma2_acc[:, i] = np.sum(
            np.power(
                theta_acc[:, k + 2 * m[i]] - 2 * theta_acc[:, k + m[i]] + theta_acc[:, k],
                2
            ), axis=1)

    def adev_at_tau_wrapper(idxs):
        if idxs[0] == 0:
            for i in trange(len(idxs)):
                adev_at_tau(idxs[i])
        else:
            for i in idxs:
                adev_at_tau(i)

    print 'creating procs'
    idx_chunks = chunk(range(len(m)), int(args.cores))
    procs = [multiprocessing.Process(target=adev_at_tau_wrapper, args=(ichnk,))
             for ichnk in idx_chunks]
    print '# chunks: ', len(procs)
    for proc in procs:
        proc.start()
    for proc in procs:
        proc.join()

    div = np.tile(2 * np.multiply(np.power(T, 2), N - 2 * m), (M, 1))
    sigma2_gyr = np.divide(sigma2_gyr, div)
    sigma2_acc = np.divide(sigma2_acc, div)
    sigma_gyr = np.sqrt(sigma2_gyr)
    sigma_acc = np.sqrt(sigma2_acc)

    data_dir, in_name = os.path.split(os.path.abspath(args.in_file))
    set_name, ext = in_name.split(os.extsep)
    out_file_name = os.path.join(data_dir, set_name + '_adev' + os.extsep + ext)
    print 'saving to: ', out_file_name
    with open(out_file_name, 'wb') as f:
        cloudpickle.dump(
            {
                'T': T,
                'sigma2_gyr': sigma2_gyr,
                'sigma2_acc': sigma2_acc,
                'sigma_gyr': sigma_gyr,
                'sigma_acc': sigma_acc,
            },
            f, -1
        )