def save_model(model, model_filepath):
    """Saves the passed model to the given file path."""
    cp.dump(model, model_filepath, compression='lzma', set_default_extension=False)
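# A minimal counterpart for loading the model back, assuming `cp` is the
# compress_pickle module (e.g. `import compress_pickle as cp`). This helper is
# hypothetical and not part of the original snippet:
def load_model(model_filepath):
    """Loads a model previously written by save_model."""
    return cp.load(model_filepath, compression='lzma', set_default_extension=False)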
def _get_snp_classifications(self, genus):
    # First check to see if the cached version exists
    if genus == 'Pocillopora':
        snp_cache_dir = os.path.join(
            self.input_dir_18s, 'snp_classifications', f'poc_snp_class_df.p.bz')
    elif genus == 'Porites':
        snp_cache_dir = os.path.join(
            self.input_dir_18s, 'snp_classifications', f'por_snp_class_df.p.bz')
    if os.path.exists(snp_cache_dir):
        return compress_pickle.load(snp_cache_dir)
    else:
        # Need to create it from scratch
        if genus == 'Pocillopora':
            raw_snp_class_path = os.path.join(
                self.input_dir_18s, 'snp_classifications', f'POC_SNP_classifications.csv')
        elif genus == 'Porites':
            raw_snp_class_path = os.path.join(
                self.input_dir_18s, 'snp_classifications', f'POR_SNP_classifications.csv')
        snp_class_df = pd.read_csv(raw_snp_class_path, index_col=0)
        snp_class_df.index = self._convert_index_to_sample_ids(snp_class_df.index)
        snp_class_df.dropna(inplace=True)
        snp_class_df.columns = ['label']
        compress_pickle.dump(snp_class_df, snp_cache_dir)
        return snp_class_df
def parse_information_files(list_of_enzyme_commission_numbers, list_of_databases, cpus):
    output_path = "data/parsed_raw/parsed_dictionary_of_raw_data.gzip"
    if not os.path.exists(output_path):
        print("Creating information compressed file. Saved at {}".format(output_path))
        work_list = []
        for enzyme in list_of_enzyme_commission_numbers:
            for database in list_of_databases:
                file_path = "data/raw/{}_{}_raw.txt".format(enzyme, database)
                work_list.append((file_path, ))
        result_list = run_multiprocessing.run_mp(work_list, cpus, process_file)
        processed_results = split_results(result_list)
        dump(processed_results, output_path, compression="gzip",
             set_default_extension=False)
    else:
        print("Information file has been found at {}. Loading it.".format(output_path))
        processed_results = load(output_path, compression="gzip",
                                 set_default_extension=False)
        print("File loaded")
    return processed_results
def __init__(self, kernel, machine, cores=1):
    """Initialize cache simulation based predictor from kernel and machine object."""
    CachePredictor.__init__(self, kernel, machine, cores)
    if isinstance(kernel, KernelCode):
        # Make use of caching for symbolic LC representation:
        file_name = 'CSIM_analysis.pickle.lzma'
        file_path = kernel.get_intermediate_location(
            file_name, machine_and_compiler_dependent=False,
            other_dependencies=[str(cores)] + [str(t) for t in self.kernel.constants.items()])
        lock_mode, lock_fp = kernel.lock_intermediate(file_path)
        if lock_mode == fcntl.LOCK_SH:  # use cache
            cache = compress_pickle.load(file_path)
            lock_fp.close()  # release lock
            self.first_dim_factor = cache['first_dim_factor']
            self.stats = cache['stats']
        else:  # lock_mode == fcntl.LOCK_EX
            # needs update
            self.simulate()
            compress_pickle.dump(
                {'first_dim_factor': self.first_dim_factor, 'stats': self.stats},
                file_path)
            lock_fp.close()  # release lock
    else:
        # No caching support without filename for kernel code
        self.simulate()
def test_dump_load(dump_load):
    (
        message,
        path,
        compression,
        set_default_extension,
        optimize,
        expected_file,
        expected_fail,
    ) = dump_load
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", UserWarning)
        if expected_fail is None:
            dump(
                message,
                path,
                compression,
                set_default_extension=set_default_extension,
                optimize=optimize,
            )
            loaded_message = load(
                path, compression, set_default_extension=set_default_extension
            )
            assert loaded_message == message
        else:
            with pytest.raises(expected_fail):
                dump(
                    message,
                    path,
                    compression,
                    set_default_extension=set_default_extension,
                    optimize=optimize,
                )
            with pytest.raises(expected_fail):
                load(path, compression, set_default_extension=set_default_extension)
def __init__(self, kernel, machine, cores=1, symbolic=False):
    """Initialize layer condition based predictor from kernel and machine object."""
    CachePredictor.__init__(self, kernel, machine, cores=cores)
    if isinstance(kernel, KernelCode):
        # Make use of caching for symbolic LC representation:
        file_name = 'LC_analysis.pickle.lzma'
        file_path = kernel.get_intermediate_location(
            file_name, machine_and_compiler_dependent=False,
            other_dependencies=[str(cores)])
        lock_mode, lock_fp = kernel.lock_intermediate(file_path)
        if lock_mode == fcntl.LOCK_SH:  # use cache
            self.results = compress_pickle.load(file_path)
            lock_fp.close()  # release lock
        else:  # lock_mode == fcntl.LOCK_EX
            # needs update
            self.build_symbolic_LCs()
            compress_pickle.dump(self.results, file_path)
            lock_fp.close()  # release lock
    else:
        # No caching support without filename for kernel code
        self.build_symbolic_LCs()
    if not symbolic:
        self.desymbolize()
def remove_training_data():
    from compress_pickle import dump, load
    training_data_path = "/home/pepamengual/UEP/trained_model/UEP_trained_model_4"
    skempi_data_path = "/home/pepamengual/UEP/trained_model/substracted_4"
    substracted_model = {}
    training_data = load(training_data_path, compression="lzma",
                         set_default_extension=False)
    skempi_data = load(skempi_data_path, compression="lzma",
                       set_default_extension=False)
    for environment, amino_acid_dict in training_data.items():
        for amino_acid, counts in amino_acid_dict.items():
            if environment in skempi_data and amino_acid in skempi_data[environment]:
                substract = counts - skempi_data[environment][amino_acid]
                substracted_model.setdefault(environment, {}).setdefault(
                    amino_acid, substract)
            else:
                substracted_model.setdefault(environment, {}).setdefault(
                    amino_acid, counts)
    dump(substracted_model, "substracted_def_4", compression="lzma")
def test_dump_load_on_filestreams(simple_dump_and_remove):
    path, compression, message, optimize = simple_dump_and_remove
    read_mode = "rb"  # get_compression_read_mode(compression)
    write_mode = "wb"  # get_compression_write_mode(compression)
    with open(path, write_mode) as f:
        dump(message, f, compression=compression, optimize=optimize)
    with open(path, read_mode) as f:
        raw_content = f.read()
        f.seek(0)
        loaded_message = load(f, compression=compression)
    assert loaded_message == message
    os.remove(path)
    dump(
        message,
        path,
        compression=compression,
        set_default_extension=False,
        optimize=optimize,
    )
    with open(path, read_mode) as f:
        benchmark = f.read()
    # zipfile compression stores the data in a zip archive. The archive then
    # contains a file with the data. Said file's mtime will always be
    # different between the two dump calls, so we skip the following assertion
    if compression != "zipfile":
        assert raw_content == benchmark
def save_pickle_impl(self):
    '''
    save pvalues into pickle file
    overrides class StorePrime::save_pickle_impl()
    '''
    # compress_pickle.dump(self.pvalues, self.pfile, compression="lzma")
    compress_pickle.dump(self.pvalues, self.pfile)
def dump(obj, filename, default_compression=DEFAULT_PICKLE_COMPRESSION,
         directory=RESULT_DIRECTORY, use_compression=PICKLE_COMPRESSION,
         save_full_object=SAVE_FULL_OBJECT):
    filename = str(filename)
    if not os.path.isabs(filename):
        filename = os.path.join(directory, filename)
    try:
        os.makedirs(os.path.dirname(filename))
    except OSError:
        pass
    if dill is not None and save_full_object:
        string_buff = dill.dumps(obj, recurse=True)
        obj_to_save = {'dill': True, 'obj': string_buff}
    else:
        obj_to_save = obj
    if compress_pickle is not None and use_compression:
        if os.path.splitext(filename)[0] == filename:
            filename = '.'.join([filename, default_compression])
        compress_pickle.dump(obj_to_save, filename)
    else:
        if os.path.splitext(filename)[0] == filename:
            filename = '.'.join([filename, use_compression])
        with open(filename, 'wb') as f:
            pickle.dump(obj_to_save, f)
    return filename
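# A hedged sketch of a matching loader for the wrapper above (hypothetical;
# the original project may name or structure it differently). It assumes the
# same optional dill/compress_pickle imports and module-level constants, and
# unwraps the {'dill': True, 'obj': ...} envelope produced by dump():
def load(filename, directory=RESULT_DIRECTORY):
    filename = str(filename)
    if not os.path.isabs(filename):
        filename = os.path.join(directory, filename)
    if compress_pickle is not None:
        # compress_pickle infers the compression scheme from the file extension
        loaded = compress_pickle.load(filename)
    else:
        with open(filename, 'rb') as f:
            loaded = pickle.load(f)
    if isinstance(loaded, dict) and loaded.get('dill') and dill is not None:
        # dump() stored a dill-serialized payload; restore the original object
        return dill.loads(loaded['obj'])
    return loaded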
def _get_hard_and_rel_sub_dicts(self, sample_names):
    if os.path.isfile(os.path.join(self.cache_dir, 'hard_sub_sample_dict.p.bz')):
        if os.path.isfile(os.path.join(self.cache_dir, 'rel_sub_sample_dict.p.bz')):
            return compress_pickle.load(
                os.path.join(self.cache_dir, 'hard_sub_sample_dict.p.bz')
            ), compress_pickle.load(
                os.path.join(self.cache_dir, 'rel_sub_sample_dict.p.bz'))
    hard_sub_sample_dict = {}
    rel_sub_sample_dict = {}
    count = 0
    tot_samples = len(sample_names)
    for sample_name in sample_names:
        count += 1
        sys.stdout.write(f'\r{sample_name}: {count}/{tot_samples}')
        abund_list = self.absolute_consolidated_abundance_dict[sample_name]
        if sum(abund_list) < 10000:
            continue
        # Make a redundant list of the seqs
        non_z = []
        for i, abund in enumerate(abund_list):
            if abund > 0:
                non_z.append(i)
        redundant_list = []
        # prob_list = []
        tot = sum(abund_list)
        for i in non_z:
            seq = self.ordered_seq_names[i]
            abund = abund_list[i]
            # prob = abund/tot
            redundant_list.extend([seq for _ in range(abund)])
            # prob_list.extend([prob for _ in range(abund)])
        hard_sub_sample_list = np.random.choice(redundant_list, 10000, replace=False)
        hard_abunds_dict = dict(Counter(hard_sub_sample_list))
        hard_sub_sample_dict[sample_name] = hard_abunds_dict
        # For soft
        norm_abund_dict = {
            self.ordered_seq_names[i]: int((abund_list[i] / tot) * 100)
            for i in non_z if int((abund_list[i] / tot) * 10000) > 0
        }
        rel_sub_sample_dict[sample_name] = norm_abund_dict
    compress_pickle.dump(
        hard_sub_sample_dict,
        os.path.join(self.cache_dir, 'hard_sub_sample_dict.p.bz'))
    compress_pickle.dump(
        rel_sub_sample_dict,
        os.path.join(self.cache_dir, 'rel_sub_sample_dict.p.bz'))
    return hard_sub_sample_dict, rel_sub_sample_dict
def pickle_to_str(pkl_path: str = None, obj=None):
    if obj is None:
        s = base64.b64encode(open(pkl_path, "rb").read()).decode("ascii")
    else:
        dump(obj, "temp.pkl")
        s = base64.b64encode(open("temp.pkl", "rb").read()).decode("ascii")
    return s
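# Hypothetical counterpart (not in the original snippet) that turns the base64
# string produced by pickle_to_str back into an object, assuming `load` is
# imported from compress_pickle alongside `dump`:
def str_to_pickle(s: str, pkl_path: str = "temp.pkl"):
    with open(pkl_path, "wb") as f:
        f.write(base64.b64decode(s.encode("ascii")))
    return load(pkl_path)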
def test_dump_fails_on_unhandled_compression(wrong_compressions):
    with pytest.raises(ValueError):
        dump(
            1,
            "test_path.pkl",
            compression=wrong_compressions,
            set_default_extension=False,
        )
def save_fixed_frames_dict(frames_dict: Dict[str, Dict[int, List[numpy.ndarray]]],
                           save_path_without_extension: str):
    folder = os.path.dirname(save_path_without_extension)
    if not os.path.exists(folder):
        os.makedirs(folder)
    with open(save_path_without_extension + "_frames_dict.xz", "wb") as file:
        compress_pickle.dump(frames_dict, file, compression="lzma")
def create_training_dataset(english_tokens, german_tokens, max_samples=None,
                            sample_length=100, validation_split=None,
                            save_dataset=True, save_interval=1000, save_dir=None,
                            save_dir_validation=None, save_compression='zipfile',
                            tqdm=None):
    max_samples = min(max_samples or len(english_tokens), len(english_tokens),
                      len(german_tokens))
    # save_interval = min(save_interval, len(english_tokens), len(german_tokens))
    if tqdm is None:
        tqdm = get_tqdm()

    assert len(english_tokens) == len(german_tokens), \
        f'unexpected data mismatch for english={len(english_tokens)}, german={len(german_tokens)}'
    assert isinstance(english_tokens, list), \
        f'unexpected format received: received={type(english_tokens)}, expected=<list>'

    if save_dataset:
        os.makedirs(save_dir, exist_ok=True)
        if save_dir_validation is not None:
            os.makedirs(save_dir_validation, exist_ok=True)
            assert validation_split is not None, \
                'provide a validation split (fraction of the whole dataset) when passing `save_dir_validation`'
            validation_index = int((1 - validation_split) * max_samples)
        else:
            validation_index = max_samples

    for i in tqdm(range(0, max_samples, save_interval), desc='create_training_data'):
        logger.debug(f'creating dataset: i={i}, max_samples={max_samples}, step={save_interval}')
        en = english_tokens[i: i + save_interval]
        de = german_tokens[i: i + save_interval]
        for j in range(min(len(en), len(de))):
            row_sample_length = min(len(en[j]), len(de[j]), sample_length)
            en[j] = en[j][:row_sample_length]  # Truncating to same length as a first solution
            de[j] = de[j][:row_sample_length]  # Truncating to same length as a first solution
        assert len(en) == len(de), \
            f'unexpected data mismatch for english={len(en)}, german={len(de)}'
        dataset = (en, de)
        compress_pickle.dump(
            dataset,
            path=os.path.join(save_dir if i < validation_index else save_dir_validation,
                              f'train_{i}'),
            compression=save_compression)

    if not save_dataset:
        raise NotImplementedError('Down-prioritized due to typically too high memory requirements.')

    assert len(en) == len(de), \
        f'unexpected data mismatch for english={len(en)}, german={len(de)}'
    return en, de
def compile(
    self,
    compile_params: Dict[str, Any],
    progress_cb: Optional[Callable[[int], None]] = None,
) -> None:
    if self.compiled:
        return

    client_id = compile_params["client_id"]
    client_secret = compile_params["client_secret"]

    reddit = praw.Reddit(
        client_id=client_id,
        client_secret=client_secret,
        user_agent=constants.reddit_user_agent,
        check_for_updates=False,
        comment_kind="t1",
        message_kind="t4",
        redditor_kind="t2",
        submission_kind="t3",
        subreddit_kind="t5",
        trophy_kind="t6",
        oauth_url="https://oauth.reddit.com",
        reddit_url="https://www.reddit.com",
        short_url="https://redd.it",
        ratelimit_seconds=5,
        timeout=16,
    )
    api = PushshiftAPI(reddit)

    comments = []
    start_epoch = int(self.start_time.timestamp())
    end_epoch = int(self.end_time.timestamp())
    progress = 0
    for subreddit in self.subreddits:
        # Collect all comments matching the search parameters in a list
        for comment in api.search_comments(after=start_epoch,
                                           before=end_epoch,
                                           subreddit=subreddit):
            comments.append(comment)
            progress += 1
            if progress_cb is not None:
                progress_cb(progress)

    if not comments:
        raise ValueError(
            "No comments found. Double-check your search parameters and API credentials."
        )

    self.document_count = len(comments)
    with open(self.comments_pickle_path, "wb") as pickle_file:
        compress_pickle.dump(comments, pickle_file, compression="gzip")
    self.compiled = True
    self.write()
def wrapped_method(self, *args, **kwargs):
    path = path_format.format(**{**vars(self), **kwargs})
    if os.path.exists(path):
        value = load(path)
        return value
    result = f(self, *args, **kwargs)
    os.makedirs(os.path.split(path)[0], exist_ok=True)
    dump(result, path)
    return result
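# The closure above references `path_format` and `f`, so it presumably lives
# inside a caching decorator factory. Below is a minimal sketch of what that
# enclosing factory might look like; the name `cache_to` and the imports are
# assumptions, only `wrapped_method` itself comes from the original snippet:
import functools
import os
from compress_pickle import dump, load

def cache_to(path_format):
    """Cache a method's return value at a path built from instance attributes."""
    def decorator(f):
        @functools.wraps(f)
        def wrapped_method(self, *args, **kwargs):
            path = path_format.format(**{**vars(self), **kwargs})
            if os.path.exists(path):
                return load(path)
            result = f(self, *args, **kwargs)
            os.makedirs(os.path.split(path)[0], exist_ok=True)
            dump(result, path)
            return result
        return wrapped_method
    return decorator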
def saveState(vars, file, name='data'):
    import compress_pickle
    if not os.path.exists(f'save_data/{file}/'):
        os.makedirs(f'save_data/{file}/')
    pklfile = f'save_data/{file}/{name}.pkl'
    # with open(file+name+'.pkl', 'wb') as f:
    #     pickle.dump(vars, f)
    compress_pickle.dump(vars, pklfile + '.lz4')
def stop(self):
    self.is_running = False
    file_path = 'data/' + self.file_name + '_checked.gz'
    DataManager().judgeFileExistance(file_path)
    compress_pickle.dump(self.frames, file_path)
    for frame in self.frames:
        frame.force_array = None
    pickle.dump(self.frames, open('data/' + self.file_name + '.simple', 'wb'))
def preProcessSubLandscape(pop, landReps, fName, drive, nodesAggLst, nodeAggIx,
                           MF=(True, True), cmpr='bz2', SUM=True, AGG=True,
                           SPA=True, REP=True, SRP=True):
    """
    Preprocesses a subset of the landscape.

    Args:
        pop (list): Files list element aggregated by landscape subset
        landReps (dict): Landscape repetitions (spatial from
            monet.loadAndAggregateLandscapeDataRepetitions)
        fName (str): Filename (including path)
        drive (dict): Gene-drive dictionary
        nodesAggLst (list): List of lists containing the indices of the nodes
            to be aggregated together
        nodeAggIx (int): Current list to process (from the nodesAggLst)
        MF (bool tuple): Male and Female boolean selectors
        cmpr (str): Compression algorithm to be used by compress-pickle
        SUM (bool): Population summed and gene-aggregated into one node
        AGG (bool): Population gene-aggregated in their own nodes
        SPA (bool): Genetic landscape (gene-aggregated)
        REP (bool): Garbage gene-aggregated data
        SRP (bool): Summed into one garbage gene-aggregated data
    Returns:
        None
    """
    if SUM:
        sumData = monet.sumLandscapePopulationsFromFiles(pop, MF[0], MF[1])
        sumAgg = monet.aggregateGenotypesInNode(sumData, drive)
        pkl.dump(sumAgg, fName + '_sum', compression=cmpr)
    if AGG:
        aggData = monet.loadAndAggregateLandscapeData(pop, drive, MF[0], MF[1])
        pkl.dump(aggData, fName + '_agg', compression=cmpr)
    if SPA:
        geneSpaTemp = monet.getGenotypeArraysFromLandscape(aggData)
        pkl.dump(geneSpaTemp, fName + '_spa', compression=cmpr)
    if REP or SRP:
        fLandReps = monet.filterAggregateGarbageByIndex(
            landReps, nodesAggLst[nodeAggIx])
        pkl.dump(fLandReps, fName + '_rep', compression=cmpr)
    if SRP:
        fRepsSum = [sum(i) for i in fLandReps['landscapes']]
        fRepsDict = {
            'genotypes': fLandReps['genotypes'],
            'landscapes': fRepsSum
        }
        pkl.dump(fRepsDict, fName + '_srp', compression=cmpr)
    return None
def main():
    path_training_folders = "/home/pepamengual/UEPPi/ueppi_script/training/all_complexes/interactome_*"
    radius = 4
    number_of_processors = 27
    training_data = training_with_multiprocessing(radius, number_of_processors,
                                                  path_training_folders)
    dump(training_data, "single_contact_matrix", compression="lzma",
         set_default_extension=False)
def dump_predicted_set(self, x_set, y_set):
    x_set_enc = self.deep_autoencoder.predict_encoded(x_set)
    dump = {
        "categories": self.classes_vector,
        "x": x_set,
        "x_encoded": x_set_enc,
        "y": y_set
    }
    now_string = self._get_now_string()
    dump_filename = "%s_encoded.gz" % now_string
    compress_pickle.dump(dump, dump_filename)
def test_dump_compresses(simple_dump_and_remove):
    path, compression, message, optimize = simple_dump_and_remove
    kwargs = dict()
    if compression == "zipfile":
        kwargs = dict(zipfile_compression=zipfile.ZIP_DEFLATED)
    dump(message, path, compression=compression, set_default_extension=False, **kwargs)
    with open(path, "rb") as f:
        compressed_message = f.read()
    if compression in (None, "pickle"):
        assert len(compressed_message) > len(message)
    else:
        assert len(compressed_message) < len(message)
def record_board(is_end, file_name):
    board = Board()
    while is_end.qsize() == 0:
        frame = board.getFrame()
        frame.output()
        # print(board.getFrameTime())
    cv2.destroyAllWindows()
    board.stop()
    print('Board compressing.', time.perf_counter())
    compress_pickle.dump(board.frames, 'data/' + file_name + '.gz')
    print('Board released.', time.perf_counter())
def save_pickle(self, filename):
    '''
    Saves the current net model to a compressed file that can be reloaded or shared.
    '''
    pickle_tuple = (self.net_thickness, self.net_width, self.net_length,
                    self.slots_in, self.slots_out, self.angle_in,
                    self.angle_out, self.spi_in, self.tol, self.knots,
                    self.probe_pts, self.probe_samples)
    outfile = open(filename, 'wb')
    dump(pickle_tuple, outfile, compression='gzip')
    outfile.close()
def save_to_file(dataset, filepath, force_overwrite=False, compression='lzma'):
    if not isinstance(dataset, NISTDB19Dataset):
        raise RuntimeError(f"Object {type(dataset)} does not inherit from NISTDB19Dataset")
    if not os.path.exists(os.path.dirname(filepath)):
        raise RuntimeError(f"Folder {os.path.dirname(filepath)} does not exist")
    if not force_overwrite and os.path.exists(filepath):
        print(f"\n[WARNING]: Can't save to '{filepath}'.\n"
              f"File already exists. Pass 'force_overwrite=True' to overwrite.")
        return
    with open(filepath, 'wb') as dataset_file:
        compress_pickle.dump(dataset, dataset_file, compression=compression)
def main():
    path_static_data = sys.argv[1]   # Path of the atlas is the first arg
    type_moving_data = sys.argv[2]   # The type of img1 is the second arg
    path_moving_data = sys.argv[3]   # Path of img1 (moving data) is the third arg

    # Extract the basename of the moving image to use later for saving other images
    moving_data_basename = split(r"\.", basename(path_moving_data))[0]

    # Perform an initial linear registration using FLIRT and update the path
    # to the new transformed image
    path_moving_data, path_aff_mat = flirt(path_static_data, path_moving_data,
                                           type_moving_data, moving_data_basename)

    # Load the moving data and atlas
    moving_data, moving_affine = load_nifti(path_moving_data)
    static_data, static_affine = load_nifti(path_static_data)

    # Perform non-linear registration
    warped_moving, mapping = syn_registration(static_data, static_affine,
                                              moving_data, moving_affine)

    # Saving the registration results
    path_warped_moving = pjoin(Path(path_moving_data).parent,
                               moving_data_basename + "_nlinreg.nii.gz")
    nib.save(nib.Nifti1Image(warped_moving.astype(np.float32), static_affine),
             path_warped_moving)

    # Save the optimized mapping object for future use
    dump(mapping,
         pjoin(Path(path_moving_data).parent, moving_data_basename + "_map.gz"),
         compression="gzip", set_default_extension=True)

    # Apply the affine transformation and warp to all the other images
    for i in range(4, len(sys.argv)):
        img_basename = split(r"\.", basename(sys.argv[i]))[0]
        path_img_flrt = apply_mat(path_static_data, sys.argv[i], path_aff_mat,
                                  img_basename)
        img_data, img_affine = load_nifti(path_img_flrt)
        warped_img = apply_syn_registration(img_data, mapping)
        path_warped_img = pjoin(Path(sys.argv[i]).parent,
                                img_basename + "_nlinreg.nii.gz")
        nib.save(nib.Nifti1Image(warped_img.astype(np.float32), static_affine),
                 path_warped_img)
def _run_job(args):
    # Reset random seed
    np.random.seed()

    synthesizer, metadata, metrics, iteration, cache_dir, timeout, run_id = args

    name = synthesizer['name']
    dataset_name = metadata._metadata['name']
    LOGGER.info(
        'Evaluating %s on %s dataset %s with timeout %ss; iteration %s; %s',
        name, metadata.modality, dataset_name, timeout, iteration, used_memory())

    if timeout:
        output = _score_with_timeout(timeout, synthesizer, metadata, metrics, iteration)
    else:
        output = _score(synthesizer, metadata, metrics, iteration)

    scores = output.get('scores')
    if not scores:
        scores = pd.DataFrame({'score': [None]})
    else:
        scores = pd.DataFrame(scores)

    scores.insert(0, 'synthesizer', name)
    scores.insert(1, 'dataset', metadata._metadata['name'])
    scores.insert(2, 'modality', metadata.modality)
    scores.insert(3, 'iteration', iteration)
    scores['model_time'] = output.get('model_time')
    scores['run_id'] = run_id

    if 'error' in output:
        scores['error'] = output['error']

    if cache_dir:
        base_path = str(cache_dir / f'{name}_{dataset_name}_{iteration}_{run_id}')
        if scores is not None:
            scores.to_csv(base_path + '_scores.csv', index=False)
        if 'synthetic_data' in output:
            compress_pickle.dump(output['synthetic_data'], base_path + '.data.gz')
        if 'exception' in output:
            with open(base_path + '_error.txt', 'w') as error_file:
                error_file.write(output['exception'])

    return scores
def tune_custom_model_a_hyperparameters(
        episodes_folder: str,
        save_folder: str,
        potential_training_file_nbs: List[int],
        potential_validation_file_nbs: List[int],
        cpickled_trials_path: str = None):
    """cpickled_trials_path can be used to resume the tuning.

    By default it will be in the save_folder and have the file name Trials.xz
    (as we use lzma-compression with compress-pickle).
    """
    if cpickled_trials_path is None:
        cpickled_trials_path = os.path.join(save_folder, "Trials.xz")

    if not os.path.exists(cpickled_trials_path):
        trials = Trials()
        current_nb_runs = 0
    else:
        with open(cpickled_trials_path, 'rb') as file:
            trials = compress_pickle.load(file, compression="lzma")
        current_nb_runs = len(trials.trials)

    best_hyperparameters = None
    while current_nb_runs < TUNING_NB_RUNS:
        best_hyperparameters = fmin(
            tune_model_a,
            space=(
                hp.loguniform('learning_rate', math.log(10**-5), math.log(10**-3)),
                hp.loguniform('regularization_strength', math.log(10**-4), math.log(10**-2)),
                hp.uniformint('nb_frames_to_stack', 2, 25),
                hp.choice('episodes_folder', [episodes_folder]),  # not really a choice
                hp.choice('save_folder', [save_folder]),  # just a way to pass more parameters
                hp.choice('potential_training_file_nbs', [potential_training_file_nbs]),
                hp.choice('potential_validation_file_nbs', [potential_validation_file_nbs])),
            algo=tpe.suggest,
            max_evals=current_nb_runs + 1,  # just keep going (Note: messes with the progress bar)
            trials=trials)
        current_nb_runs += 1  # (after the += 1: == len(trials.trials))

        # Save after every tuning run
        with open(cpickled_trials_path, "wb") as file:
            compress_pickle.dump(trials, file, compression="lzma")

    print(best_hyperparameters)
    print(trials.best_trial["result"]["loss"])
def cache_write(object, file_name, only_on_professors_computer=False):
    if only_on_professors_computer and not is_this_my_computer():
        """ Probably for your own good :-). """
        return

    # file_name = cn_(file_name) if cache_prefix else file_name
    dn = os.path.dirname(file_name)
    if not os.path.exists(dn):
        os.mkdir(dn)
    print("Writing cache...", file_name)
    with open(file_name, 'wb') as f:
        compress_pickle.dump(object, f, compression="lzma")
    print("Done!")
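# A matching reader for the cache format written above (hypothetical helper,
# assuming the same lzma compression and that compress_pickle and os are
# imported at module level):
def cache_read(file_name):
    """Return the cached object, or None if no cache file exists."""
    if not os.path.exists(file_name):
        return None
    print("Reading cache...", file_name)
    with open(file_name, 'rb') as f:
        return compress_pickle.load(f, compression="lzma")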