def test_hashable():
    """Exercise the Hashable helpers on equal and different payloads."""
    # The bare Hashable is expected to raise when asked for its hash.
    with pytest.raises(NotImplementedError):
        Hashable().consistent_hash()

    first_two = MyHashable(2)
    second_two = MyHashable(2)
    three = MyHashable(3)

    # Objects built from the same value validate in both argument orders.
    assert validate_consistent_hash(first_two, second_two)
    assert validate_consistent_hash(second_two, first_two)

    # Objects built from different values must not validate.
    assert not validate_consistent_hash(first_two, three)
    assert not validate_consistent_hash(second_two, three)

    # Equal hashables nested inside a dictionary give equal dictionary hashes.
    wrapped_first = {"my_hashable": first_two}
    wrapped_second = {"my_hashable": second_two}
    assert sha256(wrapped_first) == sha256(wrapped_second)
def holdout_cache_path(cache_directory: str, holdouts_parameters: Dict) -> str:
    """Return path of the json cache file recording the created holdout.
        cache_directory: str, directory where to store the holdouts cache.
        holdouts_parameters: Dict, parameters whose hash names the cache file.
    """
    parameters_hash = sha256(holdouts_parameters)
    return f"{cache_directory}/cache/{parameters_hash}.json"
def holdout_pickle_path(cache_directory: str, holdouts_parameters: Dict) -> str:
    """Return path of the gzipped pickle for an holdout created with given parameters.
        cache_directory: str, directory where to store the holdouts cache.
        holdouts_parameters: Dict, hyper parameters used to create the holdouts.
    """
    parameters_hash = sha256(holdouts_parameters)
    return f"{cache_directory}/holdouts/{parameters_hash}.pickle.gz"
def test_compress_json():
    """Round-trip a random dictionary through every default extension.

    The dictionary is written and read back with both the plain and the
    `local_` variants of dump/load, comparing hashes after each round trip.
    """
    payload = random_string_dict(10, 10)
    expected_hash = sha256(payload)
    extensions = compress_json.compress_json._DEFAULT_EXTENSION_MAP.keys()

    # Plain dump/load.
    for extension in extensions:
        target = "random_dirs/test.json.{}".format(extension)
        compress_json.dump(payload, target)
        restored = compress_json.load(target)
        assert expected_hash == sha256(restored)
    shutil.rmtree("random_dirs")

    # local_dump/local_load; the directory removed afterwards lives under
    # `tests/` (presumably because the local variants resolve paths relative
    # to this test module -- confirm against compress_json).
    for extension in extensions:
        target = "random_dirs/test.json.{}".format(extension)
        compress_json.local_dump(payload, target)
        restored = compress_json.local_load(target)
        assert expected_hash == sha256(restored)
    shutil.rmtree("tests/random_dirs")
def consistent_hash(self) -> str:
    """Return the sha256 of the model parameters plus its model, library and task names."""
    description = dict(
        **self.parameters(),
        model_name=self.model_name(),
        library_name=self.library_name(),
        task_name=self.task_name(),
    )
    return sha256(description)
def getHashCode(data_pair):
    """
    Produce a unique hash code for each combination of
    (hyperparams, net_layers) pairs.

    data_pair: indexable whose item 0 holds the hyper-parameters and
        whose item 1 holds the network layers.
    """
    return sha256(dict(hyperparams=data_pair[0], net_layers=data_pair[1]))
def cached_experiment(**kwargs: Dict):
    """Run `experiment(**kwargs)`, caching its response as json on disk.

    The cache file lives in `tests/cached_experiments` and is named after
    the sha256 of the keyword arguments; when it already exists its content
    is returned and `experiment` is not called.
    """
    cache_directory = "tests/cached_experiments"
    os.makedirs(cache_directory, exist_ok=True)
    cache_file = f"{cache_directory}/{sha256(kwargs)}.json"

    if not os.path.exists(cache_file):
        # Cache miss: run the experiment and store what it returned.
        response = experiment(**kwargs)
        with open(cache_file, "w") as f:
            json.dump(response, f)
        return response

    with open(cache_file, "r") as f:
        return json.load(f)
def init_test(self, tcode, test):
    """Register a test description and allocate its result arrays.

    tcode: key under which the test is stored in `self.tests`
        (and in `self.par_result.tests` in parallel mode).
    test: dictionary describing the test; this method reads its
        "nexp", "nsample" and "code" entries.

    When a test with the same code and the same fingerprint (sha256 of
    `test`) is already stored, its results are kept. Otherwise a fresh
    entry is created with NaN-filled result arrays of shape
    `(test["nexp"], len(test["nsample"]))`.

    In parallel mode a per-worker view of the test is also created,
    restricted to the experiments returned by `self.par_get_range`.

    Returns `self`.
    """
    thash = sha256(test)

    new_test = True
    if tcode in self.tests:
        if self.tests[tcode]["hash"] == thash:
            new_test = False
        elif self.verbose:
            print(
                "Failed to update results for test {} (fingerprint mismatch).\n"
                "These results will be overwritten\n".format(test["code"]))

    if new_test:
        shape = (test["nexp"], len(test["nsample"]))
        testr = deepcopy(test)
        testr.update({
            "hash": thash,
            # One independent NaN array per tracked quantity.
            "time_proto": np.full(shape, np.nan),
            "time_est": np.full(shape, np.nan),
            "nmeas": np.full(shape, np.nan),
            "fidelity": np.full(shape, np.nan),
            "sm_flag": True
        })
        self.tests.update({tcode: testr})

    if self.is_par_mode() and tcode not in self.par_result.tests:
        stored = self.tests[tcode]
        par_test = deepcopy(test)
        par_range = self.par_get_range(par_test["nexp"])
        par_test.update({
            "parent": test,
            "start": par_range[0],
            "nexp": len(par_range),
            # Each quantity is sliced from its own array; "time_proto" was
            # previously sliced from the "fidelity" array by mistake.
            "time_proto": stored["time_proto"][par_range, :],
            "time_est": stored["time_est"][par_range, :],
            "nmeas": stored["nmeas"][par_range, :],
            "fidelity": stored["fidelity"][par_range, :],
            "sm_flag": stored["sm_flag"]
        })
        self.par_result.tests.update({tcode: par_test})

    return self
def results_path(results_directory: str, holdout_key: str, hyper_parameters: Dict) -> str:
    """Return default path of the json file storing the main results.
        results_directory: str, directory under which the `results` folder lives.
        holdout_key:str, key that identifies the holdout used for training.
        hyper_parameters: Dict, hyperparameters used to train the model.
    """
    key = sha256({
        "holdout_key": holdout_key,
        "hyper_parameters": hyper_parameters
    })
    return f"{results_directory}/results/{key}.json"
def work_in_progress_path(results_directory: str, holdout_key: str, hyper_parameters: str) -> str:
    """Return default path of the work in progress marker for a training run.
        results_directory: str, directory handed to `work_in_progress_directory`.
        holdout_key:str, key that identifies the holdout used for training.
        hyper_parameters: hyperparameters used to train the model.
    """
    wip_directory = work_in_progress_directory(results_directory)
    key = sha256({
        "hyper_parameters": hyper_parameters,
        "holdout_key": holdout_key
    })
    return f"{wip_directory}/{key}"
def history_path(results_directory: str, holdout_key: str, hyper_parameters: Dict) -> str:
    """Return path of the csv storing the metrics tracked during training history.
        results_directory: str, directory under which the `histories` folder lives.
        holdout_key:str, key that identifies the holdout used for training.
        hyper_parameters: Dict, hyperparameters used to train the model.
    """
    key = sha256({
        "hyper_parameters": hyper_parameters,
        "holdout_key": holdout_key
    })
    return f"{results_directory}/histories/{key}.csv"
def trained_model_path(results_directory: str, holdout_key: str, hyper_parameters: Dict) -> str:
    """Return default path of the h5 file for the model trained with given holdout key and parameters.
        results_directory: str, directory under which the `trained_models` folder lives.
        holdout_key:str, key that identifies the holdout used for training.
        hyper_parameters: Dict, hyperparameters used to train the model.
    """
    key = sha256({
        "hyper_parameters": hyper_parameters,
        "holdout_key": holdout_key
    })
    return f"{results_directory}/trained_models/{key}.h5"
def compare_metrics(candidate: Callable, reference: Callable, tests: int = 10) -> bool:
    """Test if candidate metric is identical within float error to reference metric.

    For each random pair of dictionaries the candidate is checked for
    symmetry, for a zero distance between a dictionary and itself, against
    a value stored on disk and, when given, against the reference metric.

    Parameters
    -------------------------------------------
    candidate: Callable,
        The metric to be tested.
    reference: Callable,
        The reference metric considered as ground truth.
        When None the comparison with the reference is skipped.
    tests: int = 10,
        Number of random dictionaries (both float and integer) to test.

    Raises
    -------------------------------------------
    AssertionError,
        If any of the checks fails.

    Returns
    --------------------------------------------
    Nothing is returned explicitly: the function has no return statement,
    despite the `bool` annotation. Failures surface as AssertionError.
    """
    for dict_generator in (random_float_dict, random_int_dict):
        # Re-seed for every generator so the generated pairs are reproducible
        # and match the reference files stored on disk.
        random.seed(46)
        for _ in range(tests):
            a = normalize_dict(deflate(dict_generator(2, 10)))
            b = normalize_dict(deflate(dict_generator(2, 10)))
            # Symmetry and zero self-distance.
            assert candidate(b, a) == pytest.approx(candidate(a, b))
            assert 0 == pytest.approx(candidate(a, a))
            assert 0 == pytest.approx(candidate(b, b))
            # One reference file per metric and per pair of dictionaries.
            path = "tests/references/{metric}/{sha}.json".format(
                metric=candidate.__name__,
                sha=sha256({
                    "a": a,
                    "b": b
                }))
            distance = candidate(a, b)
            # First run for this pair: record the current value as reference.
            if not os.path.exists(path):
                os.makedirs(os.path.dirname(path), exist_ok=True)
                with open(path, "w") as f:
                    json.dump({"distance": distance}, f)
            with open(path, "r") as f:
                assert pytest.approx(distance) == json.load(f)["distance"]
            if reference is not None:
                try:
                    assert pytest.approx(distance) == reference(
                        *dict_to_array(a, b))
                except AssertionError as e:
                    # Print both values before re-raising the failure.
                    print(
                        "Candidate {candidate} does not match {reference}: {candidate_value} != {reference_value}."
                        .format(
                            candidate=candidate.__name__,
                            reference=reference.__name__,
                            candidate_value=candidate(a, b),
                            reference_value=reference(*dict_to_array(a, b))))
                    raise e
def true_labels_path(results_directory: str, holdout_key: str, labels_type: str, hyper_parameters: str) -> str:
    """Return default path of the csv storing the true labels.
        results_directory: str, directory under which the `true_labels` folder lives.
        holdout_key:str, key that identifies the holdout used for training.
        labels_type:str, the labels_type of the data. Can either be "train", "test".
        hyper_parameters: hyperparameters used to train the model.
    """
    key = sha256({
        "holdout_key": holdout_key,
        "hyper_parameters": hyper_parameters
    })
    return f"{results_directory}/true_labels/{labels_type}/{key}.csv"
def test_dict_hash():
    """Check that a file named after the hash of `create_dict()` exists, then delete it."""
    marker_file = sha256(create_dict())
    assert os.path.exists(marker_file)
    os.remove(marker_file)
data = [] try: data = load_data(file) except Exception as message: log(message, 3, "red") exit(1) # run plugin plugin_sources = import_module(f"plugins.{name}").main(data) log(f"Got {len(plugin_sources)} sources", 2, "green") for source in plugin_sources: # make unique key for cache matching source["_cache"] = sha256({ **source, "plugin": name, "input": file }) # add source sources.append(source) log("Generating citations for sources") # load existing citations citations = [] try: citations = load_data(config["output"]) except Exception as message: log(message, 2, "yellow") # list of new citations to overwrite existing citations new_citations = []
def train_and_score(self, network, network_id=None):
    """
    Compiles the network and trains it on the training data with the
    enabled callbacks, then scores it.

    The network is compiled by self.compile_network(). In the case no
    val_data is supplied, a val_split section of train_data is used for
    this purpose instead. When no test_data is specified, the test score
    will be identical to the last validation loss.

    When caching is enabled, an already trained network may be skipped
    (with a probability that depends on self.train_chance) and its cached
    score reused; after training, the best (lowest) of the cached and the
    new score is kept.

    Arguments:
        network (dictionary): dictionary containing the network parameters.
        network_id: (int) Id of network in generation; if None, no logpath
            is created, blocking tensorboard, checkpoint and csvlogging as
            well as every file written below the logpath (cached score
            report, serialized model and weights).

    Returns:
        float: the test score multiplied by
        ``1 + self.penalty * network['io_config']['input_shape']``.
    """
    clear_session()

    # Without a network id there is no directory to write into. The path
    # must still be bound, otherwise the code below raises NameError.
    model_save_path = None
    if network_id is not None:
        model_save_path = os.path.join(
            self.log_dir, "gen_" + "{:03d}".format(self.current_generation),
            "id_" + "{:03d}".format(network_id))
        os.makedirs(model_save_path)

    if self.cache:
        network_hash = sha256(network)
        if network_hash in self.trained_networks:
            # skip training based on train_chance
            if random.uniform(0, 1) > self.train_chance:
                # grab score from cache
                cached = self.trained_networks[network_hash]
                test_score = cached['score']
                total_score = test_score * (
                    1 + self.penalty * network['io_config']['input_shape'])
                old_save_path = cached['save_path']
                if model_save_path is not None:
                    report_path = os.path.join(model_save_path,
                                               'cached_score.txt')
                    with open(report_path, 'w') as fp:
                        print(f'Test_score: {test_score}', file=fp)
                        print(f'Total_score: {total_score}', file=fp)
                        print(old_save_path, file=fp)
                        print(network, file=fp)
                        print(network_hash, file=fp)
                return total_score

    callbacks = []
    if self.cb_early_stop:
        early_stop = EarlyStopping(patience=self.early_stop_patience)
        callbacks.append(early_stop)

    if network_id is not None:
        if self.cb_tensorboard:
            tensorboard = TensorBoard(os.path.join(model_save_path,
                                                   "tensorboard"),
                                      write_graph=True,
                                      histogram_freq=5)
            callbacks.append(tensorboard)

        if self.cb_model_checkpoint:
            model_checkpoint = ModelCheckpoint(os.path.join(
                model_save_path, "checkpoints",
                "model.{epoch:02d}-{val_loss:.5f}.hdf5"),
                                               monitor='val_loss',
                                               verbose=0,
                                               save_best_only=True,
                                               save_weights_only=False,
                                               mode='auto',
                                               period=1)
            callbacks.append(model_checkpoint)

        if self.cb_csv_logger:
            csv_logger = CSVLogger(os.path.join(model_save_path,
                                                "train_log.csv"),
                                   separator=',',
                                   append=False)
            callbacks.append(csv_logger)

    callbacks = callbacks + self.user_callbacks

    model = self.compile_network(network)

    # create data based on in/outputs
    x_train = self.train_data[network['io_config']['inputs']]
    if self.y_train is not None:
        y_train = self.y_train
    else:
        y_train = self.train_data[network['io_config']['outputs']]

    if self.val_data is not None:
        x_val = self.val_data[network['io_config']['inputs']]
        # NOTE(review): the presence of y_train (not y_val) decides whether
        # the explicit validation targets are used -- confirm this is meant.
        if self.y_train is not None:
            y_val = self.y_val
        else:
            y_val = self.val_data[network['io_config']['outputs']]
        total_val_data = [x_val, y_val]
    else:
        total_val_data = None

    start_time = time.time()
    history = model.fit(x_train,
                        y_train,
                        verbose=self.train_verbose,
                        epochs=network['network_config']['epochs'],
                        batch_size=network['network_config']['batch_size'],
                        callbacks=callbacks,
                        validation_data=total_val_data,
                        validation_split=self.val_split)
    print('Training time: {}, training val_loss: {}'.format(
        time.time() - start_time, history.history['val_loss'][-1]),
          file=self.head_log)

    # if test data is specified, use it. Otherwise use last val_loss from training
    if self.test_data is not None:
        x_test = self.test_data[network['io_config']['inputs']]
        # NOTE(review): same y_train-based switch as for the validation data.
        if self.y_train is not None:
            y_test = self.y_test
        else:
            y_test = self.test_data[network['io_config']['outputs']]
        start_time = time.time()
        test_score = model.evaluate(x_test, y_test, verbose=0)[0]
        print('Test time: {}, testing loss: {}'.format(
            time.time() - start_time, test_score),
              file=self.head_log)
        print(network, file=self.head_log)
    else:
        test_score = history.history['val_loss'][-1]

    if model_save_path is not None:
        # serialize model to JSON
        json_save_path = os.path.join(model_save_path, "json_model")
        os.makedirs(json_save_path)
        with open(os.path.join(json_save_path, "model.json"),
                  "w") as json_file:
            json_file.write(model.to_json())
        # serialize weights to HDF5
        model.save_weights(os.path.join(json_save_path, "model_weigths.h5"))
    self.head_log.flush()

    # return best score instead of newest
    if self.cache:
        if network_hash in self.trained_networks:
            old_score = self.trained_networks[network_hash]['score']
            if old_score > test_score:
                # update cached score if new score is better (lower)
                self.trained_networks[network_hash]['score'] = test_score
                self.trained_networks[network_hash][
                    'save_path'] = model_save_path
            else:
                # get hashed score if better
                test_score = old_score
        else:
            # if the networks was not cached before
            self.trained_networks[network_hash] = {
                'score': test_score,
                'save_path': model_save_path,
            }

    # modify test_score to include penalty for number of inputs
    total_score = test_score * (
        1 + self.penalty * network['io_config']['input_shape'])
    return total_score
def __init__(
    self,
    name: str,
    version: str,
    directed: bool = False,
    load_nodes: bool = True,
    load_node_types: bool = True,
    keep_nodes_without_descriptions: bool = True,
    keep_nodes_without_categories: bool = True,
    keep_interwikipedia_nodes: bool = True,
    keep_external_nodes: bool = True,
    compute_node_description: bool = False,
    auto_enable_tradeoffs: bool = True,
    sort_tmp_dir: Optional[str] = None,
    verbose: int = 2,
    cache: bool = True,
    cache_path: Optional[str] = None,
    cache_sys_var: str = "GRAPH_CACHE_DIR",
    graph_kwargs: Dict = None
):
    """Create new automatically retrieved Wikipedia graph.

    The graph is always requested from the `wikipedia` repository, with
    preprocessing enabled and without edge weights.

    Parameters
    -------------------
    name: str
        The name of the graph to be retrieved and loaded.
    version: str
        The version of the graph to be retrieved.
    directed: bool = False
        Whether to load the graph as directed or undirected.
        By default false.
    load_nodes: bool = True
        Whether to load the nodes vocabulary or treat the nodes
        simply as a numeric range.
        This feature is only available when the preprocessing is enabled.
    load_node_types: bool = True
        Whether to load the node types if available or skip them entirely.
        This feature is only available when the preprocessing is enabled.
    keep_nodes_without_descriptions: bool = True
        Whether to keep the nodes lacking a description.
    keep_nodes_without_categories: bool = True
        Whether to keep the nodes lacking a category.
    keep_interwikipedia_nodes: bool = True
        Whether to keep nodes from external wikipedia websites.
    keep_external_nodes: bool = True
        Whether to keep nodes from external websites (non wikipedia ones).
    compute_node_description: bool = False
        Whether to compute the node descriptions.
        Note that this will significantly increase the size of the node lists!
    auto_enable_tradeoffs: bool = True
        Whether to enable the Ensmallen time-memory tradeoffs in small graphs
        automatically. By default True, that is, if a graph has less than
        50 million edges. In such use cases the memory expenditure is minimal.
    sort_tmp_dir: Optional[str] = None
        Which folder to use to store the temporary files needed to sort in
        parallel the edge list when building the optimal preprocessed file.
        This defaults to the same folder of the edge list when no value is
        provided.
    verbose: int = 2
        Whether to show loading bars.
    cache: bool = True
        Whether to use cache, i.e. download files only once
        and preprocess them only once.
    cache_path: Optional[str] = None
        Where to store the downloaded graphs.
        If no path is provided, first we check the system variable
        provided below is set, otherwise we use the directory `graphs`.
    cache_sys_var: str = "GRAPH_CACHE_DIR"
        The system variable with the default graph cache directory.
    graph_kwargs: Dict = None
        Eventual additional kwargs for loading the graph.

    Raises
    -------------------
    ValueError,
        If the given graph name is not available.
    ValueError,
        If the OS is Windows, we cannot process the file.
    """
    if is_windows():
        raise ValueError(
            "On Windows we do not support the processing of "
            "Wikipedia graphs."
        )

    # The Wikipedia-specific options are stored on the instance and are
    # also hashed into the seed handed to the parent constructor.
    self._keep_nodes_without_descriptions = keep_nodes_without_descriptions
    self._keep_nodes_without_categories = keep_nodes_without_categories
    self._keep_interwikipedia_nodes = keep_interwikipedia_nodes
    self._keep_external_nodes = keep_external_nodes
    self._compute_node_description = compute_node_description

    wikipedia_options_hash = sha256(dict(
        keep_nodes_without_descriptions=keep_nodes_without_descriptions,
        keep_nodes_without_categories=keep_nodes_without_categories,
        keep_interwikipedia_nodes=keep_interwikipedia_nodes,
        keep_external_nodes=keep_external_nodes,
        compute_node_description=compute_node_description
    ))

    super().__init__(
        name=name,
        version=version,
        repository="wikipedia",
        directed=directed,
        preprocess=True,
        load_nodes=load_nodes,
        load_node_types=load_node_types,
        load_edge_weights=False,
        auto_enable_tradeoffs=auto_enable_tradeoffs,
        sort_tmp_dir=sort_tmp_dir,
        verbose=verbose,
        cache=cache,
        cache_path=cache_path,
        cache_sys_var=cache_sys_var,
        graph_kwargs=graph_kwargs,
        hash_seed=wikipedia_options_hash
    )
def consistent_hash(self) -> str:
    """Return the sha256 of a dictionary holding the `_a` attribute under the key "a"."""
    payload = {"a": self._a}
    return sha256(payload)
def import_property(self, api_service: Resource, gsc_property: str,
                    request_date: date, dimensions: list, search_types: list,
                    previous_data: list, aggregation_type: str, database: str,
                    table_name: str, dataset_name: str = None):
    """Import the search analytics rows of one property for one date.

    Rows are queried page by page for every search type, handed to
    `self._cache_rows`, optionally enriched with the figures of previous
    periods via `self._add_previous_data`, then read back from the cache
    collection and passed to `self._import_rows`. The cache entries of this
    run are cleared at the end.

    api_service: service object exposing `searchanalytics().query(...)`.
    gsc_property: property identifier, passed as `siteUrl` to the API.
    request_date: the day to import (used as both start and end date).
    dimensions: dimensions requested from the API.
    search_types: search types to query, one paginated query loop each.
    previous_data: any of 'year', 'month', 'week', 'day'; selects which
        previous periods are queried as well.
    aggregation_type: sent as `aggregationType` when not empty.
    database: 'bigquery' or 'mongodb'; selects the existing-data check and
        is forwarded to `self._import_rows`.
    table_name, dataset_name: used to build the table reference.

    Raises `_DataAlreadyExistError` when the selected database already holds
    data for this property and date, and `_DataNotAvailableYet` when the
    last search type returns no rows on its first page and nothing has been
    cached under this run's hash.
    """
    table_reference = self.bigquery.table_reference(
        table_name, dataset_name)
    previous_dates = {}
    # The current timestamp is part of the hashed data, so the cache key
    # differs between invocations.
    cache_hash = sha256({
        'property': gsc_property,
        'dimensions': dimensions,
        'date': datetime.now().isoformat()
    })

    print(' - Property: "{:s}"'.format(gsc_property), end='')

    # Translate the requested previous periods into date ranges.
    for previous_data_item in previous_data:
        if 'year' == previous_data_item:
            previous_date = request_date - relativedelta(years=1)
            previous_dates['PreviousYear'] = {
                'startDate': previous_date,
                'endDate': previous_date,
            }

        if 'month' == previous_data_item:
            previous_date = request_date - relativedelta(months=1)
            # The previous month is covered from its first to its last day.
            previous_dates['PreviousMonth'] = {
                'startDate':
                previous_date.replace(day=1),
                'endDate':
                previous_date.replace(day=monthrange(
                    previous_date.year, previous_date.month)[1]),
            }

        if 'week' == previous_data_item:
            previous_date = request_date - relativedelta(weeks=1)
            previous_dates['PreviousWeek'] = {
                'startDate': previous_date,
                'endDate': previous_date,
            }

        if 'day' == previous_data_item:
            previous_date = request_date - relativedelta(days=1)
            previous_dates['PreviousDay'] = {
                'startDate': previous_date,
                'endDate': previous_date,
            }

    # Refuse to import the same property and date twice.
    if 'bigquery' == database and self._bigquery_check_has_existing_data(
            gsc_property, table_reference, request_date):
        raise _DataAlreadyExistError()
    elif 'mongodb' == database and self._mongodb_check_has_existing_data(
            gsc_property, request_date):
        raise _DataAlreadyExistError()

    print('\n + {:%Y-%m-%d} -> {:%Y-%m-%d}'.format(
        request_date, request_date),
          end='')

    timer_base = time()

    for search_type in search_types:
        iteration_count = 0

        # Page through the results, ROW_LIMIT rows at a time.
        while True:
            request = {
                'startDate': request_date.strftime('%Y-%m-%d'),
                'endDate': request_date.strftime('%Y-%m-%d'),
                'searchType': search_type,
                'dimensions': dimensions,
                'rowLimit': GoogleSearchConsole.ROW_LIMIT,
                'startRow': GoogleSearchConsole.ROW_LIMIT * iteration_count
            }

            if 0 < len(aggregation_type):
                request['aggregationType'] = aggregation_type

            response = api_service.searchanalytics().query(
                siteUrl=gsc_property, body=request).execute()

            if 'rows' not in response:
                # On the first page of the last search type, check whether
                # anything was cached under this run's hash.
                if 0 == iteration_count and (
                        len(search_types) -
                        1) == search_types.index(search_type):
                    cache_entry = self.mongodb.find_one(
                        GoogleSearchConsole.COLLECTION_NAME_CACHE,
                        {'hash': cache_hash}, True)

                    if cache_entry is None:
                        raise _DataNotAvailableYet()

                break

            self._cache_rows(cache_hash, gsc_property, response['rows'],
                             previous_dates, request_date, dimensions,
                             search_type)

            # A short page means there is nothing left to fetch.
            if len(response['rows']) < GoogleSearchConsole.ROW_LIMIT:
                break

            iteration_count = iteration_count + 1

    print(' - OK - {:s}'.format(
        str(timedelta(seconds=int(time() - timer_base)))))

    # Same pagination, once per requested previous period.
    for previous_data_column, previous_date in previous_dates.items():
        print(' + {:%Y-%m-%d} -> {:%Y-%m-%d}'.format(
            previous_date['startDate'], previous_date['endDate']),
              end='')

        timer_previous = time()

        for search_type in search_types:
            iteration_count = 0

            while True:
                request = {
                    'startDate':
                    previous_date['startDate'].strftime('%Y-%m-%d'),
                    'endDate':
                    previous_date['endDate'].strftime('%Y-%m-%d'),
                    'searchType': search_type,
                    'dimensions': dimensions,
                    'rowLimit': GoogleSearchConsole.ROW_LIMIT,
                    'startRow':
                    GoogleSearchConsole.ROW_LIMIT * iteration_count
                }

                if 0 < len(aggregation_type):
                    request['aggregationType'] = aggregation_type

                response = api_service.searchanalytics().query(
                    siteUrl=gsc_property, body=request).execute()

                if 'rows' not in response:
                    break

                self._add_previous_data(
                    cache_hash,
                    search_type,
                    previous_data_column,
                    response['rows'],
                    dimensions,
                )

                if len(response['rows']) < GoogleSearchConsole.ROW_LIMIT:
                    break

                iteration_count = iteration_count + 1

        print(' - OK - {:s}'.format(
            str(timedelta(seconds=int(time() - timer_previous)))))

    # Read the cached rows back in chunks and import them.
    offset = 0

    while True:
        rows = self.mongodb.find(GoogleSearchConsole.COLLECTION_NAME_CACHE,
                                 {'hash': cache_hash}, True,
                                 GoogleSearchConsole.ROW_LIMIT, offset)

        if 0 == len(rows):
            break

        offset += GoogleSearchConsole.ROW_LIMIT

        self._import_rows(database, rows, table_reference)

    # NOTE(review): the cache is only cleared when everything above
    # succeeds; an exception leaves this run's cached rows behind.
    self._clear_cache(cache_hash)
def _get_formatted_path(self, args, kwargs, formatter=None, function_info=None, extra_kwargs=None, inner_self=None) -> str:
    """Compute the path adding and computing the needed arguments.

    args, kwargs: the arguments of the decorated call, resolved to named
        parameters through `get_params`.
    formatter: the path template; defaults to `self.cache_path`. It may be
        a list, a tuple or a dict of templates, in which case a container
        of the same kind with the formatted paths is returned (so the
        result is not always a `str`, despite the annotation).
    function_info: dictionary of values describing the function, made
        available to the template; defaults to `self.function_info`.
    extra_kwargs: additional values made available to the template.
    inner_self: when not None, it is included in the data hashed into
        the `{_hash}` group.

    Groups ending with `()` are resolved here: the attribute chain is
    followed starting from the named format argument and the result is
    called when it is a function, method or builtin. Every other group is
    left to `str.format`.
    """
    formatter = formatter or self.cache_path

    # Composite formatters: format every template with the very same
    # context. function_info, extra_kwargs and inner_self must be forwarded,
    # otherwise nested templates lose the extra format keys and the `self`
    # component of `_hash`.
    if isinstance(formatter, list):
        return [
            self._get_formatted_path(
                args, kwargs, formatter=f, function_info=function_info,
                extra_kwargs=extra_kwargs, inner_self=inner_self)
            for f in formatter
        ]
    if isinstance(formatter, tuple):
        return tuple(
            self._get_formatted_path(
                args, kwargs, formatter=f, function_info=function_info,
                extra_kwargs=extra_kwargs, inner_self=inner_self)
            for f in formatter
        )
    if isinstance(formatter, dict):
        return {
            key: self._get_formatted_path(
                args, kwargs, formatter=v, function_info=function_info,
                extra_kwargs=extra_kwargs, inner_self=inner_self)
            for key, v in formatter.items()
        }

    extra_kwargs = extra_kwargs or {}
    function_info = function_info or self.function_info
    params = get_params(function_info, args, kwargs)
    groups = get_format_groups(formatter)
    groups_set = {match.str_match for match in groups}

    # The hash is only computed when the template asks for it.
    if "_hash" in groups_set:
        data = {"params": params, "function_info": function_info}

        if inner_self is not None:
            data["self"] = inner_self

        params["_hash"] = sha256(data, use_approximation=self.use_approximated_hash)

    self.logger.debug("Got parameters %s", params)

    format_args = {
        **params,
        **function_info,
        **extra_kwargs,
        "cache_dir": self.cache_dir,
    }

    new_formatter = ""
    old_formatter = formatter
    # Handle the composite paths
    while old_formatter:
        new_match, formatter_remainder = get_next_format_group(old_formatter)
        # there are no more matches just append the remainder
        if new_match is None:
            new_formatter += formatter_remainder
            break

        # check if we should call the value or not
        if new_match.str_match.endswith("()"):
            new_match.str_match = new_match.str_match[:-2]
            # Get the name of the base element and the attributes chain
            root, *attrs = new_match.str_match.split(".")
            # Get the params to use for the attributes chain
            root = format_args[root]
            # Follow the attributes chain
            for attr in attrs:
                root = getattr(root, attr)
            # Check if we have to call the function or not
            if inspect.isfunction(root) or inspect.ismethod(root) or inspect.isbuiltin(root):
                root = root()
            sub = str(root)
        else:
            # Plain groups are re-emitted untouched for `str.format` below.
            sub = "{" + new_match.str_match + "}"

        new_formatter += old_formatter[:new_match.start]
        new_formatter += sub
        old_formatter = formatter_remainder

    path = new_formatter.format(
        **format_args,
    )
    self.logger.debug("Calculated path %s", path)
    return path
def __init__(
    self,
    name: str,
    version: str,
    repository: str,
    directed: bool = False,
    preprocess: Union[bool, str] = "auto",
    load_nodes: bool = True,
    load_node_types: bool = True,
    load_edge_weights: bool = True,
    auto_enable_tradeoffs: bool = True,
    sort_tmp_dir: Optional[str] = None,
    verbose: int = 2,
    cache: bool = True,
    cache_path: Optional[str] = None,
    cache_sys_var: str = "GRAPH_CACHE_DIR",
    graph_kwargs: Dict = None,
    hash_seed: str = None,
    callbacks: List[Callable] = (),
    callbacks_arguments: List[Dict] = (),
):
    """Create new automatically retrieved graph.

    Parameters
    -------------------
    name: str
        The name of the graph to be retrieved and loaded.
    version: str
        The version of the graph to be retrieved.
    repository: str
        Name of the repository to load data from.
    directed: bool = False
        Whether to load the graph as directed or undirected.
        By default false.
    preprocess: Union[bool, str] = "auto"
        Whether to preprocess the node list and edge list
        to be loaded optimally in both time and memory.
        With "auto", preprocessing is enabled on Linux and macOS
        and disabled elsewhere.
    load_nodes: bool = True
        Whether to load the nodes vocabulary or treat the nodes
        simply as a numeric range.
        This feature is only available when the preprocessing is enabled.
    load_node_types: bool = True
        Whether to load the node types if available or skip them entirely.
        This feature is only available when the preprocessing is enabled.
    load_edge_weights: bool = True
        Whether to load the edge weights if available or skip them entirely.
        This feature is only available when the preprocessing is enabled.
    auto_enable_tradeoffs: bool = True
        Whether to enable the Ensmallen time-memory tradeoffs in small graphs
        automatically. By default True, that is, if a graph has less than
        50 million edges. In such use cases the memory expenditure is minimal.
    sort_tmp_dir: Optional[str] = None
        Which folder to use to store the temporary files needed to sort in
        parallel the edge list when building the optimal preprocessed file.
        This defaults to the same folder of the edge list when no value is
        provided.
    verbose: int = 2
        Whether to show loading bars.
    cache: bool = True
        Whether to use cache, i.e. download files only once
        and preprocess them only once.
    cache_path: Optional[str] = None
        Where to store the downloaded graphs.
        If no path is provided, first we check the system variable
        provided below is set, otherwise we use the directory `graphs`.
        The repository name is always appended to it.
    cache_sys_var: str = "GRAPH_CACHE_DIR"
        The system variable with the default graph cache directory.
    graph_kwargs: Dict = None
        Eventual additional kwargs for loading the graph.
    hash_seed: str = None
        Seed to use for the hash.
    callbacks: List[Callable] = ()
        Eventual callbacks to call after download files.
    callbacks_arguments: List[Dict] = ()
        Eventual arguments for callbacks.

    Raises
    -------------------
    ValueError,
        If the given graph name is not available.
    ValueError,
        If the preprocess flag is provided but the system is Windows,
        which does not provide the sort command.
    """
    try:
        validate_graph_version(name, repository, version)
        repository_graphs = compress_json.local_load(
            "{}.json.gz".format(repository)
        )
        self._graph = repository_graphs[name][version]
    except KeyError:
        message = (
            "Requested graph `{}` is not currently available.\n"
            "Open an issue on the Graph repository to ask "
            "for this graph to be added."
        ).format(name)
        raise ValueError(message)

    # "auto" means: preprocess wherever it is supported.
    if preprocess == "auto":
        preprocess = is_macos() or is_linux()

    if preprocess and is_windows():
        raise ValueError(
            "Currently preprocessing to optimal edge list is not supported "
            "on Windows because the sorting step is based upon the `sort` "
            "command, which is only available to our knowledge on Linux and "
            "macOS systems."
        )

    # If the cache path was not provided
    # we either check the system variable
    # and if it is not set we use `graphs`
    if cache_path is None:
        cache_path = os.getenv(cache_sys_var, "graphs")
    cache_path = os.path.join(cache_path, repository)

    # Identity of the requested graph.
    self._name = name
    self._repository = repository
    self._version = version

    # Loading options.
    self._directed = directed
    self._preprocess = preprocess
    self._load_nodes = load_nodes
    self._load_node_types = load_node_types
    self._load_edge_weights = load_edge_weights
    self._auto_enable_tradeoffs = auto_enable_tradeoffs
    self._sort_tmp_dir = sort_tmp_dir
    self._cache = cache
    self._verbose = verbose
    self._graph_kwargs = {} if graph_kwargs is None else graph_kwargs

    # Post-download hooks.
    self._callbacks = callbacks
    self._callbacks_arguments = callbacks_arguments

    # The hash covers the seed, the graph metadata and the loading kwargs.
    self._instance_hash = sha256({
        "hash_seed": hash_seed,
        **self._graph,
        **self._graph_kwargs,
    })

    self._cache_path = os.path.join(cache_path, name, version)
    self._downloader = BaseDownloader(
        auto_extract=True,
        cache=cache,
        target_directory=self._cache_path,
        verbose=self._verbose,
        process_number=1
    )
def test_dict_hash():
    """Check `dict_hash` is repeatable, then create an empty file named after the sha256."""
    sample = create_dict()
    first_hash = dict_hash(sample)
    second_hash = dict_hash(sample)
    assert first_hash == second_hash
    # Leave an empty file named after the dictionary sha256.
    marker_file = Path(sha256(sample))
    marker_file.touch()
def __init__(self, assembly, window_size, batch_size, buffer_size=None, max_gap_size=100, train_chromosomes=None, val_chromosomes=None, cache_dir=None, lazy_load=True, clear_cache=False, compile_on_start=True, n_type="uniform"):
    """Set up the generator state, the worker pool and the genome.

    assembly: genome assembly, forwarded to `Genome`.
    window_size: size of the windows; also part of the cache directory.
    batch_size: stored as `self.batch_size`.
    buffer_size: stored as `self.buffer_size`; defaults to the cpu count
        when not given (or falsy).
    max_gap_size: stored as `self.max_gap_size`.
    train_chromosomes: chromosomes used for training; when not given, all
        the chromosomes of the genome are used.
    val_chromosomes: chromosomes used for validation; defaults to [].
    cache_dir: cache root; falls back to the `CACHE_PATH` environment
        variable and then to `/tmp`.
    lazy_load: forwarded to `Genome`.
    clear_cache: when True, `self.clean_cache()` is called.
    compile_on_start: when True, `self.compile()` is called at the end.
    n_type: must be one of `self.n_types`.

    Raises ValueError when `n_type` is not one of `self.n_types`.
    """
    self.assembly, self.window_size = assembly, window_size
    self.max_gap_size, self.batch_size, self.val_chromosomes = max_gap_size, batch_size, val_chromosomes

    # Buffersize default None == cpu count for optimal performance:
    if not buffer_size:
        buffer_size = cpu_count()
    self.buffer_size = buffer_size

    # Validate the type of N. The message lists the allowed values; it
    # used to interpolate the rejected value in their place.
    if n_type not in self.n_types:
        raise ValueError(
            "n_type must be one of {}, got {!r}".format(self.n_types, n_type))
    self.n_type = n_type

    # Get the cache dir
    cache_dir = cache_dir or os.environ.get("CACHE_PATH", None) or "/tmp"
    self._cache_directory = "/".join(
        [cache_dir, assembly, str(window_size)])

    if clear_cache:
        self.clean_cache()

    # Generate a pool of processes to save the overhead
    self.workers = max(2, cpu_count())
    self.pool = Pool(self.workers)

    # Preprocess all the possible data
    self.genome = Genome(
        assembly=assembly,
        lazy_load=lazy_load,
        cache_directory=cache_dir,
    )

    if not val_chromosomes:
        self.val_chromosomes = []

    # If no chromosomes passed then use all the genome
    if not train_chromosomes:
        self.chromosomes = sorted(self.genome)
    else:
        self.chromosomes = train_chromosomes + self.val_chromosomes

    self.instance_hash = sha256({
        "assembly": self.assembly,
        "chromosomes": self.chromosomes,
        "window_size": self.window_size,
        "max_gap_size": self.max_gap_size,
        "n_type": n_type,
    })

    if compile_on_start:
        self.compile()
def test_with_different_keys_order():
    """The sha256 of a dictionary must not depend on the order of its keys."""
    forward = {'tune_best_model': True, 'target': 'def'}
    backward = {'target': 'def', 'tune_best_model': True}
    forward_hash = sha256(forward)
    backward_hash = sha256(backward)
    assert forward_hash == backward_hash