Example No. 1
def test_hashable():
    with pytest.raises(NotImplementedError):
        Hashable().consistent_hash()

    a = MyHashable(2)
    b = MyHashable(2)
    c = MyHashable(3)
    assert validate_consistent_hash(a, b)
    assert validate_consistent_hash(b, a)
    assert not validate_consistent_hash(a, c)
    assert not validate_consistent_hash(b, c)

    assert sha256({"my_hashable": a}) == sha256({"my_hashable": b})
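The MyHashable fixture used above is not shown in the snippet; below is a minimal, hypothetical sketch of what such a class might look like, assuming it subclasses dict_hash.Hashable and hashes only the value it stores (the class and attribute names are illustrative, not taken from the test suite).

from dict_hash import Hashable, sha256


class MyHashable(Hashable):
    """Hypothetical fixture whose hash depends only on its stored value."""

    def __init__(self, value: int):
        self._value = value

    def consistent_hash(self) -> str:
        # Delegate to sha256 over a plain dict of the relevant state,
        # so two instances wrapping equal values hash identically.
        return sha256({"value": self._value})

With such a class, validate_consistent_hash(a, b) succeeds exactly when the two instances wrap the same value, which is what the assertions above check.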
Example No. 2
def holdout_cache_path(cache_directory: str, holdouts_parameters: Dict) -> str:
    """Return path where to store the cache file, recording the created holdout.
        cache_directory: str, directory where to store the holdouts cache.
    """
    return "{cache_directory}/cache/{holdouts_parameters}.json".format(
        cache_directory=cache_directory,
        holdouts_parameters=sha256(holdouts_parameters))
Example No. 3
def holdout_pickle_path(cache_directory: str,
                        holdouts_parameters: Dict) -> str:
    """Return path where to pickle an holdout created with given parameters.
        cache_directory: str, directory where to store the holdouts cache.
        holdouts_parameters: Dict, hyper parameters used to create the holdouts.
    """
    return "{results_directory}/holdouts/{hash}.pickle.gz".format(
        results_directory=cache_directory, hash=sha256(holdouts_parameters))
Example No. 4
def test_compress_json():
    D = random_string_dict(10, 10)
    key = sha256(D)
    extensions = compress_json.compress_json._DEFAULT_EXTENSION_MAP.keys()
    for ext in extensions:
        path = f"random_dirs/test.json.{ext}"
        compress_json.dump(D, path)
        assert key == sha256(compress_json.load(path))

    shutil.rmtree("random_dirs")

    for ext in extensions:
        path = f"random_dirs/test.json.{ext}"
        compress_json.local_dump(D, path)
        assert key == sha256(compress_json.local_load(path))

    shutil.rmtree("tests/random_dirs")
Example No. 5
    def consistent_hash(self) -> str:
        """Returns consistent hash describing the model."""
        return sha256(
            dict(
                **self.parameters(),
                model_name=self.model_name(),
                library_name=self.library_name(),
                task_name=self.task_name(),
            ))
Example No. 6
def getHashCode(data_pair):
    """
    Produce un hash code unico para cada combinacion de
    pares (hyperparams, net_layers).
    """
    hyperparams = data_pair[0]
    net_layers = data_pair[1]

    return sha256({"hyperparams": hyperparams, "net_layers": net_layers})
Example No. 7
def cached_experiment(**kwargs: Dict):
    path = "tests/cached_experiments"
    os.makedirs(path, exist_ok=True)
    path = "{path}/{sha}.json".format(path=path, sha=sha256(kwargs))
    if os.path.exists(path):
        with open(path, "r") as f:
            return json.load(f)
    response = experiment(**kwargs)
    with open(path, "w") as f:
        json.dump(response, f)
    return response
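A hypothetical usage sketch of the caching pattern above: the keyword argument names are illustrative, and the final assertion assumes the experiment output survives a JSON round trip unchanged.

# First call runs experiment() and writes tests/cached_experiments/<sha>.json;
# the second call with identical keyword arguments is served from that file,
# because sha256(kwargs) resolves to the same path.
first = cached_experiment(learning_rate=0.01, epochs=10)
second = cached_experiment(learning_rate=0.01, epochs=10)
assert first == second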
Example No. 8
    def init_test(self, tcode, test):
        thash = sha256(test)
        new_test = True
        if tcode in self.tests:
            if self.tests[tcode]["hash"] == thash:
                new_test = False
            elif self.verbose:
                print(
                    "Failed to update results for test {} (fingerprint mismatch).\n"
                    "These results will be overwritten\n".format(test["code"]))
        if new_test:
            testr = deepcopy(test)
            testr.update({
                "hash": thash,
                "time_proto": np.full((test["nexp"], len(test["nsample"])), np.nan),
                "time_est": np.full((test["nexp"], len(test["nsample"])), np.nan),
                "nmeas": np.full((test["nexp"], len(test["nsample"])), np.nan),
                "fidelity": np.full((test["nexp"], len(test["nsample"])), np.nan),
                "sm_flag": True
            })
            self.tests.update({tcode: testr})

        if self.is_par_mode() and tcode not in self.par_result.tests:
            par_test = deepcopy(test)
            par_range = self.par_get_range(par_test["nexp"])
            par_test.update({
                "parent": test,
                "start": par_range[0],
                "nexp": len(par_range),
                "time_proto": self.tests[tcode]["time_proto"][par_range, :],
                "time_est": self.tests[tcode]["time_est"][par_range, :],
                "nmeas": self.tests[tcode]["nmeas"][par_range, :],
                "fidelity": self.tests[tcode]["fidelity"][par_range, :],
                "sm_flag": self.tests[tcode]["sm_flag"]
            })
            self.par_result.tests.update({tcode: par_test})

        return self
Example No. 9
def results_path(results_directory: str, holdout_key: str,
                 hyper_parameters: Dict) -> str:
    """Return default path for storing the main results csv.
        results_directory: str, directory where to store the prediction_labels.
        holdout_key:str, key that identifies the holdout used for training.
        hyper_parameters: Dict, hyperparameters used to train the model.
    """
    return "{results_directory}/results/{key}.json".format(
        results_directory=results_directory,
        key=sha256({
            "holdout_key": holdout_key,
            "hyper_parameters": hyper_parameters
        }))
Example No. 10
def work_in_progress_path(results_directory: str, holdout_key: str,
                          hyper_parameters: Dict) -> str:
    """Return default path for storing the main work in progress file.
        results_directory: str, directory where to store the work in progress files.
        holdout_key: str, key that identifies the holdout used for training.
        hyper_parameters: Dict, hyperparameters used to train the model.
    """
    return "{wip}/{key}".format(
        wip=work_in_progress_directory(results_directory),
        key=sha256({
            "hyper_parameters": hyper_parameters,
            "holdout_key": holdout_key
        }))
Example No. 11
def history_path(results_directory: str, holdout_key: str,
                 hyper_parameters: Dict) -> str:
    """Return path where to store metrics tracked during history.
        results_directory: str, directory where to store the prediction_labels.
        holdout_key:str, key that identifies the holdout used for training.
        hyper_parameters: Dict, hyperparameters used to train the model.
    """
    return "{results_directory}/histories/{key}.csv".format(
        results_directory=results_directory,
        key=sha256({
            "hyper_parameters": hyper_parameters,
            "holdout_key": holdout_key
        }))
Example No. 12
def trained_model_path(results_directory: str, holdout_key: str,
                       hyper_parameters: Dict) -> str:
    """Return default path for storing the model trained with given holdout key and given parameters.
        results_directory: str, directory where to store the trained models.
        holdout_key: str, key that identifies the holdout used for training.
        hyper_parameters: Dict, hyperparameters used to train the model.
    """
    return "{results_directory}/trained_models/{key}.h5".format(
        results_directory=results_directory,
        key=sha256({
            "hyper_parameters": hyper_parameters,
            "holdout_key": holdout_key
        }))
Example No. 13
def compare_metrics(candidate: Callable,
                    reference: Callable,
                    tests: int = 10) -> bool:
    """Test if candidate metric is identical within float error to reference metric.

    Parameters
    -------------------------------------------
    candidate: Callable,
        The metric to be tested.
    reference: Callable,
        The reference metric considered as ground truth.
    tests: int = 10,
        Number of random dictionaries (both float and integer) to test.

    Returns
    --------------------------------------------
    Boolean value with the test result.
    """
    for dict_generator in (random_float_dict, random_int_dict):
        random.seed(46)
        for _ in range(tests):
            a = normalize_dict(deflate(dict_generator(2, 10)))
            b = normalize_dict(deflate(dict_generator(2, 10)))
            assert candidate(b, a) == pytest.approx(candidate(a, b))
            assert 0 == pytest.approx(candidate(a, a))
            assert 0 == pytest.approx(candidate(b, b))
            path = "tests/references/{metric}/{sha}.json".format(
                metric=candidate.__name__, sha=sha256({
                    "a": a,
                    "b": b
                }))
            distance = candidate(a, b)
            if not os.path.exists(path):
                os.makedirs(os.path.dirname(path), exist_ok=True)
                with open(path, "w") as f:
                    json.dump({"distance": distance}, f)
            with open(path, "r") as f:
                assert pytest.approx(distance) == json.load(f)["distance"]
            if reference is not None:
                try:
                    assert pytest.approx(distance) == reference(
                        *dict_to_array(a, b))
                except AssertionError as e:
                    print(
                        "Candidate {candidate} does not match {reference}: {candidate_value} != {reference_value}."
                        .format(
                            candidate=candidate.__name__,
                            reference=reference.__name__,
                            candidate_value=candidate(a, b),
                            reference_value=reference(*dict_to_array(a, b))))
                    raise e
Example No. 14
def true_labels_path(results_directory: str, holdout_key: str,
                     labels_type: str, hyper_parameters: Dict) -> str:
    """Return default path for true labels.
        results_directory: str, directory where to store the true_labels.
        holdout_key: str, key that identifies the holdout used for training.
        labels_type: str, the type of the labels. Can either be "train" or "test".
        hyper_parameters: Dict, hyperparameters used to train the model.
    """
    return "{results_directory}/true_labels/{labels_type}/{key}.csv".format(
        results_directory=results_directory,
        labels_type=labels_type,
        key=sha256({
            "holdout_key": holdout_key,
            "hyper_parameters": hyper_parameters
        }))
Example No. 15
def test_dict_hash():
    path = sha256(create_dict())
    assert os.path.exists(path)
    os.remove(path)
Example No. 16
        data = []
        try:
            data = load_data(file)
        except Exception as message:
            log(message, 3, "red")
            exit(1)

        # run plugin
        plugin_sources = import_module(f"plugins.{name}").main(data)

        log(f"Got {len(plugin_sources)} sources", 2, "green")

        for source in plugin_sources:
            # make unique key for cache matching
            source["_cache"] = sha256({
                **source, "plugin": name,
                "input": file
            })
            # add source
            sources.append(source)

log("Generating citations for sources")

# load existing citations
citations = []
try:
    citations = load_data(config["output"])
except Exception as message:
    log(message, 2, "yellow")

# list of new citations to overwrite existing citations
new_citations = []
Example No. 17
    def train_and_score(self, network, network_id=None):
        """
        Compiles the network and trains it on the training data with the enabled callbacks.
        The network is compiled by self.compile_network() which in turn uses the supplied build_model function.
        In the case no val_data is supplied, a val_split section of train_data is used for this purpose instead. 
        When no test_data is specified, the test score will be identical to the last validation score.
        
        Arguments:
            network (dictionary): dictionary containing the network parameters. 
            network_id: (int) Id of network in generation; if None, no logpath is created, blocking tensorboard, checkpoint and csvlogging
        
        Returns:
            keras.callbacks.History: History returned from training.
            float: test score obtained as described above.            
        """

        clear_session()

        if network_id is not None:
            model_save_path = os.path.join(
                self.log_dir,
                "gen_" + "{:03d}".format(self.current_generation),
                "id_" + "{:03d}".format(network_id))

            os.makedirs(model_save_path)

        if self.cache:
            network_hash = sha256(network)
            if network_hash in self.trained_networks.keys():
                if random.uniform(
                        0, 1
                ) > self.train_chance:  # skip training based on train_chance
                    # grab score from cache
                    test_score = self.trained_networks[network_hash]['score']
                    total_score = test_score * (
                        1 + self.penalty * network['io_config']['input_shape'])
                    old_save_path = self.trained_networks[network_hash][
                        'save_path']

                    with open(
                            os.path.join(model_save_path, 'cached_score.txt'),
                            'w') as fp:
                        print(f'Test_score: {test_score}', file=fp)
                        print(f'Total_score: {total_score}', file=fp)
                        print(old_save_path, file=fp)
                        print(network, file=fp)
                        print(network_hash, file=fp)

                    return total_score

        callbacks = []
        if self.cb_early_stop:
            early_stop = EarlyStopping(patience=self.early_stop_patience)
            callbacks.append(early_stop)

        if network_id is not None:
            if self.cb_tensorboard:
                tensorboard = TensorBoard(os.path.join(model_save_path,
                                                       "tensorboard"),
                                          write_graph=True,
                                          histogram_freq=5)
                callbacks.append(tensorboard)
            if self.cb_model_checkpoint:
                model_checkpoint = ModelCheckpoint(os.path.join(
                    model_save_path, "checkpoints",
                    "model.{epoch:02d}-{val_loss:.5f}.hdf5"),
                                                   monitor='val_loss',
                                                   verbose=0,
                                                   save_best_only=True,
                                                   save_weights_only=False,
                                                   mode='auto',
                                                   period=1)
                callbacks.append(model_checkpoint)
            if self.cb_csv_logger:
                csv_logger = CSVLogger(os.path.join(model_save_path,
                                                    "train_log.csv"),
                                       separator=',',
                                       append=False)
                callbacks.append(csv_logger)

        callbacks = callbacks + self.user_callbacks

        model = self.compile_network(network)

        # create data based on in/outputs
        x_train = self.train_data[network['io_config']['inputs']]
        if self.y_train is not None:
            y_train = self.y_train
        else:
            y_train = self.train_data[network['io_config']['outputs']]

        if self.val_data is not None:
            x_val = self.val_data[network['io_config']['inputs']]
            if self.y_train is not None:
                y_val = self.y_val
            else:
                y_val = self.val_data[network['io_config']['outputs']]
                total_val_data = [x_val, y_val]
        else:
            total_val_data = None

        start_time = time.time()

        history = model.fit(x_train,
                            y_train,
                            verbose=self.train_verbose,
                            epochs=network['network_config']['epochs'],
                            batch_size=network['network_config']['batch_size'],
                            callbacks=callbacks,
                            validation_data=total_val_data,
                            validation_split=self.val_split)

        print('Training time: {}, training val_loss: {}'.format(
            time.time() - start_time, history.history['val_loss'][-1]),
              file=self.head_log)

        # if test data is specified, use it. Otherwise use last val_loss from training
        if self.test_data is not None:
            x_test = self.test_data[network['io_config']['inputs']]
            if self.y_train is not None:
                y_test = self.y_test
            else:
                y_test = self.test_data[network['io_config']['outputs']]

            start_time = time.time()
            test_score = model.evaluate(x_test, y_test, verbose=0)[0]
            print('Test time: {}, testing loss: {}'.format(
                time.time() - start_time, test_score),
                  file=self.head_log)
            print(network, file=self.head_log)
        else:
            test_score = history.history['val_loss'][-1]

        # serialize model to JSON
        model_json = model.to_json()
        json_save_path = os.path.join(model_save_path, "json_model")
        os.makedirs(json_save_path)
        with open(os.path.join(json_save_path, "model.json"),
                  "w") as json_file:
            json_file.write(model_json)

        # serialize weights to HDF5
        model.save_weights(os.path.join(json_save_path, "model_weigths.h5"))

        self.head_log.flush()

        # return best score instead of newest
        if self.cache:
            if network_hash in self.trained_networks.keys():
                old_score = self.trained_networks[network_hash]['score']
                if old_score > test_score:
                    # update cached score if new score is better (lower)
                    self.trained_networks[network_hash]['score'] = test_score
                    self.trained_networks[network_hash][
                        'save_path'] = model_save_path
                else:
                    # get hashed score if better
                    test_score = old_score
            else:
                # if the networks was not cached before
                self.trained_networks[network_hash] = {}
                self.trained_networks[network_hash]['score'] = test_score
                self.trained_networks[network_hash][
                    'save_path'] = model_save_path

        # modify test_score to include penalty for number of inputs
        total_score = test_score * (
            1 + self.penalty * network['io_config']['input_shape'])
        return total_score
Example No. 18
    def __init__(
        self,
        name: str,
        version: str,
        directed: bool = False,
        load_nodes: bool = True,
        load_node_types: bool = True,
        keep_nodes_without_descriptions: bool = True,
        keep_nodes_without_categories: bool = True,
        keep_interwikipedia_nodes: bool = True,
        keep_external_nodes: bool = True,
        compute_node_description: bool = False,
        auto_enable_tradeoffs: bool = True,
        sort_tmp_dir: Optional[str] = None,
        verbose: int = 2,
        cache: bool = True,
        cache_path: Optional[str] = None,
        cache_sys_var: str = "GRAPH_CACHE_DIR",
        graph_kwargs: Dict = None
    ):
        """Create new automatically retrieved graph.

        Parameters
        -------------------
        name: str
            The name of the graph to be retrieved and loaded.
        version: str
            The version of the graph to be retrieved.
        directed: bool = False
            Whether to load the graph as directed or undirected.
            By default false.
        load_nodes: bool = True
            Whether to load the nodes vocabulary or treat the nodes
            simply as a numeric range.
            This feature is only available when the preprocessing is enabled.
        load_node_types: bool = True
            Whether to load the node types if available or skip them entirely.
            This feature is only available when the preprocessing is enabled.
        keep_nodes_without_descriptions: bool = True
            Whether to keep the nodes lacking a description.
        keep_nodes_without_categories: bool = True
            Whether to keep the nodes lacking a category.
        keep_interwikipedia_nodes: bool = True
            Whether to keep nodes from external wikipedia websites.
        keep_external_nodes: bool = True
            Whether to keep nodes from external websites (non wikipedia ones).
        compute_node_description: bool = False
            Whether to compute the node descriptions.
            Note that this will significantly increase the size of the node lists!
        auto_enable_tradeoffs: bool = True
            Whether to enable the Ensmallen time-memory tradeoffs in small graphs
            automatically. By default True, that is, if a graph has less than
            50 million edges. In such use cases the memory expenditure is minimal.
        sort_tmp_dir: Optional[str] = None
            Which folder to use to store the temporary files needed to sort in 
            parallel the edge list when building the optimal preprocessed file.
            This defaults to the same folder of the edge list when no value is 
            provided.
        verbose: int = 2
            Whether to show loading bars.
        cache: bool = True
            Whether to use cache, i.e. download files only once
            and preprocess them only once.
        cache_path: Optional[str] = None
            Where to store the downloaded graphs.
            If no path is provided, we first check whether the system variable
            provided below is set; otherwise we use the directory `graphs`.
        cache_sys_var: str = "GRAPH_CACHE_DIR"
            The system variable with the default graph cache directory.
        graph_kwargs: Dict = None
            Eventual additional kwargs for loading the graph.

        Raises
        -------------------
        ValueError,
            If the given graph name is not available.
        ValueError,
            If the OS is Windows, on which we cannot process the file.
        """
        if is_windows():
            raise ValueError(
                "On Windows we do not support the processing of "
                "Wikipedia graphs."
            )
        self._keep_nodes_without_descriptions = keep_nodes_without_descriptions
        self._keep_nodes_without_categories = keep_nodes_without_categories
        self._keep_interwikipedia_nodes = keep_interwikipedia_nodes
        self._keep_external_nodes = keep_external_nodes
        self._compute_node_description = compute_node_description

        super().__init__(
            name=name,
            version=version,
            repository="wikipedia",
            directed=directed,
            preprocess=True,
            load_nodes=load_nodes,
            load_node_types=load_node_types,
            load_edge_weights=False,
            auto_enable_tradeoffs=auto_enable_tradeoffs,
            sort_tmp_dir=sort_tmp_dir,
            verbose=verbose,
            cache=cache,
            cache_path=cache_path,
            cache_sys_var=cache_sys_var,
            graph_kwargs=graph_kwargs,
            hash_seed=sha256(dict(
                keep_nodes_without_descriptions=keep_nodes_without_descriptions,
                keep_nodes_without_categories=keep_nodes_without_categories,
                keep_interwikipedia_nodes=keep_interwikipedia_nodes,
                keep_external_nodes=keep_external_nodes,
                compute_node_description=compute_node_description
            ))
        )
Example No. 19
    def consistent_hash(self) -> str:
        return sha256({"a": self._a})
Example No. 20
    def import_property(self,
                        api_service: Resource,
                        gsc_property: str,
                        request_date: date,
                        dimensions: list,
                        search_types: list,
                        previous_data: list,
                        aggregation_type: str,
                        database: str,
                        table_name: str,
                        dataset_name: str = None):
        table_reference = self.bigquery.table_reference(
            table_name, dataset_name)
        previous_dates = {}
        cache_hash = sha256({
            'property': gsc_property,
            'dimensions': dimensions,
            'date': datetime.now().isoformat()
        })

        print(' - Property: "{:s}"'.format(gsc_property), end='')

        for previous_data_item in previous_data:
            if 'year' == previous_data_item:
                previous_date = request_date - relativedelta(years=1)
                previous_dates['PreviousYear'] = {
                    'startDate': previous_date,
                    'endDate': previous_date,
                }
            if 'month' == previous_data_item:
                previous_date = request_date - relativedelta(months=1)
                previous_dates['PreviousMonth'] = {
                    'startDate': previous_date.replace(day=1),
                    'endDate': previous_date.replace(
                        day=monthrange(previous_date.year, previous_date.month)[1]),
                }
            if 'week' == previous_data_item:
                previous_date = request_date - relativedelta(weeks=1)
                previous_dates['PreviousWeek'] = {
                    'startDate': previous_date,
                    'endDate': previous_date,
                }
            if 'day' == previous_data_item:
                previous_date = request_date - relativedelta(days=1)
                previous_dates['PreviousDay'] = {
                    'startDate': previous_date,
                    'endDate': previous_date,
                }

        if 'bigquery' == database and self._bigquery_check_has_existing_data(
                gsc_property, table_reference, request_date):
            raise _DataAlreadyExistError()
        elif 'mongodb' == database and self._mongodb_check_has_existing_data(
                gsc_property, request_date):
            raise _DataAlreadyExistError()

        print('\n   + {:%Y-%m-%d} -> {:%Y-%m-%d}'.format(
            request_date, request_date),
              end='')

        timer_base = time()

        for search_type in search_types:
            iteration_count = 0

            while True:
                request = {
                    'startDate': request_date.strftime('%Y-%m-%d'),
                    'endDate': request_date.strftime('%Y-%m-%d'),
                    'searchType': search_type,
                    'dimensions': dimensions,
                    'rowLimit': GoogleSearchConsole.ROW_LIMIT,
                    'startRow': GoogleSearchConsole.ROW_LIMIT * iteration_count
                }

                if 0 < len(aggregation_type):
                    request['aggregationType'] = aggregation_type

                response = api_service.searchanalytics().query(
                    siteUrl=gsc_property, body=request).execute()

                if 'rows' not in response:
                    if 0 == iteration_count and (
                            len(search_types) -
                            1) == search_types.index(search_type):
                        cache_entry = self.mongodb.find_one(
                            GoogleSearchConsole.COLLECTION_NAME_CACHE,
                            {'hash': cache_hash}, True)

                        if cache_entry is None:
                            raise _DataNotAvailableYet()

                    break

                self._cache_rows(cache_hash, gsc_property, response['rows'],
                                 previous_dates, request_date, dimensions,
                                 search_type)

                if len(response['rows']) < GoogleSearchConsole.ROW_LIMIT:
                    break

                iteration_count = iteration_count + 1

        print(' - OK - {:s}'.format(
            str(timedelta(seconds=int(time() - timer_base)))))

        for previous_data_column, previous_date in previous_dates.items():
            print('   + {:%Y-%m-%d} -> {:%Y-%m-%d}'.format(
                previous_date['startDate'], previous_date['endDate']),
                  end='')

            timer_previous = time()

            for search_type in search_types:
                iteration_count = 0

                while True:
                    request = {
                        'startDate': previous_date['startDate'].strftime('%Y-%m-%d'),
                        'endDate': previous_date['endDate'].strftime('%Y-%m-%d'),
                        'searchType': search_type,
                        'dimensions': dimensions,
                        'rowLimit': GoogleSearchConsole.ROW_LIMIT,
                        'startRow': GoogleSearchConsole.ROW_LIMIT * iteration_count
                    }

                    if 0 < len(aggregation_type):
                        request['aggregationType'] = aggregation_type

                    response = api_service.searchanalytics().query(
                        siteUrl=gsc_property, body=request).execute()

                    if 'rows' not in response:
                        break

                    self._add_previous_data(
                        cache_hash,
                        search_type,
                        previous_data_column,
                        response['rows'],
                        dimensions,
                    )

                    if len(response['rows']) < GoogleSearchConsole.ROW_LIMIT:
                        break

                    iteration_count = iteration_count + 1

            print(' - OK - {:s}'.format(
                str(timedelta(seconds=int(time() - timer_previous)))))

        offset = 0

        while True:
            rows = self.mongodb.find(GoogleSearchConsole.COLLECTION_NAME_CACHE,
                                     {'hash': cache_hash}, True,
                                     GoogleSearchConsole.ROW_LIMIT, offset)

            if 0 == len(rows):
                break

            offset += GoogleSearchConsole.ROW_LIMIT

            self._import_rows(database, rows, table_reference)

        self._clear_cache(cache_hash)
Example No. 21
    def _get_formatted_path(self, args, kwargs, formatter=None, function_info=None, extra_kwargs=None, inner_self=None) -> str:
        """Compute the path adding and computing the needed arguments."""        
        formatter = formatter or self.cache_path

        if isinstance(formatter, list):
            return [
                self._get_formatted_path(args, kwargs, f)
                for f in formatter
            ]

        if isinstance(formatter, tuple):
            return tuple([
                self._get_formatted_path(args, kwargs, f)
                for f in formatter
            ])

        elif isinstance(formatter, dict):
            return {
                key: self._get_formatted_path(args, kwargs, v)
                for key, v in formatter.items()
            }

        extra_kwargs = extra_kwargs or {}

        function_info = function_info or self.function_info
        params = get_params(function_info, args, kwargs)
        groups = get_format_groups(formatter)
        groups_set = {match.str_match for match in groups}

        if "_hash" in groups_set:
            data = {"params": params, "function_info": function_info}

            if inner_self is not None: 
                data["self"] = inner_self

            params["_hash"] = sha256(data, use_approximation=self.use_approximated_hash)

        self.logger.debug("Got parameters %s", params)

        format_args = {
            **params,
            **function_info,
            **extra_kwargs,
            "cache_dir":self.cache_dir,
        }

        new_formatter = ""
        old_formatter = formatter
        # Handle the composite paths
        while len(old_formatter) != 0:
            new_match, formatter_remainder = get_next_format_group(old_formatter)

            # there are no more matches just append the remainder
            if new_match is None:
                new_formatter += formatter_remainder
                break
            
            # check if we should call the value or not
            if new_match.str_match.endswith("()"):
                new_match.str_match = new_match.str_match[:-2]
                # Get the name of the base element and the attributes chain
                root, *attrs = new_match.str_match.split(".")
                # Get the params to use for the attributes chain
                root = format_args[root]

                # Follow the attributes chain
                for attr in attrs:                    
                    root = getattr(root, attr)

                # Check if we have to call the function or not
                if inspect.isfunction(root) or inspect.ismethod(root) or inspect.isbuiltin(root):
                    root = root()

                sub = str(root)
            else:
                sub = "{" + new_match.str_match + "}"

            new_formatter += old_formatter[:new_match.start]
            new_formatter += sub
            old_formatter = formatter_remainder

        path = new_formatter.format(
            **format_args,
        )
        self.logger.debug("Calculated path %s", path)
        return path
Example No. 22
    def __init__(
        self,
        name: str,
        version: str,
        repository: str,
        directed: bool = False,
        preprocess: Union[bool, str] = "auto",
        load_nodes: bool = True,
        load_node_types: bool = True,
        load_edge_weights: bool = True,
        auto_enable_tradeoffs: bool = True,
        sort_tmp_dir: Optional[str] = None,
        verbose: int = 2,
        cache: bool = True,
        cache_path: Optional[str] = None,
        cache_sys_var: str = "GRAPH_CACHE_DIR",
        graph_kwargs: Dict = None,
        hash_seed: str = None,
        callbacks: List[Callable] = (),
        callbacks_arguments: List[Dict] = (),
    ):
        """Create new automatically retrieved graph.

        Parameters
        -------------------
        name: str
            The name of the graph to be retrieved and loaded.
        version: str
            The version of the graph to be retrieved.
        repository: str
            Name of the repository to load data from.
        directed: bool = False
            Whether to load the graph as directed or undirected.
            By default false.
        preprocess: Union[bool, str] = "auto"
            Whether to preprocess the node list and edge list
            to be loaded optimally in both time and memory.
            Will automatically preprocess in Linux and macOS
            and avoid doing this on Windows.
        load_nodes: bool = True
            Whether to load the nodes vocabulary or treat the nodes
            simply as a numeric range.
            This feature is only available when the preprocessing is enabled.
        load_node_types: bool = True
            Whether to load the node types if available or skip them entirely.
            This feature is only available when the preprocessing is enabled.
        load_edge_weights: bool = True
            Whether to load the edge weights if available or skip them entirely.
            This feature is only available when the preprocessing is enabled.
        auto_enable_tradeoffs: bool = True
            Whether to enable the Ensmallen time-memory tradeoffs in small graphs
            automatically. By default True, that is, if a graph has less than
            50 million edges. In such use cases the memory expenditure is minimal.
        sort_tmp_dir: Optional[str] = None
            Which folder to use to store the temporary files needed to sort in 
            parallel the edge list when building the optimal preprocessed file.
            This defaults to the same folder of the edge list when no value is 
            provided.
        verbose: int = 2
            Whether to show loading bars.
        cache: bool = True
            Whether to use cache, i.e. download files only once
            and preprocess them only once.
        cache_path: Optional[str] = None
            Where to store the downloaded graphs.
            If no path is provided, we first check whether the system variable
            provided below is set; otherwise we use the directory `graphs`.
        cache_sys_var: str = "GRAPH_CACHE_DIR"
            The system variable with the default graph cache directory.
        graph_kwargs: Dict = None
            Eventual additional kwargs for loading the graph.
        hash_seed: str = None
            Seed to use for the hash.
        callbacks: List[Callable] = ()
            Eventual callbacks to call after download files.
        callbacks_arguments: List[Dict] = ()
            Eventual arguments for callbacks.

        Raises
        -------------------
        ValueError,
            If the given graph name is not available.
        ValueError,
            If the preprocess flag is provided but the system
            is Windows, which does not provide the sort command.
        """
        try:
            validate_graph_version(name, repository, version)

            all_versions = compress_json.local_load(
                "{}.json.gz".format(repository)
            )[name]

            self._graph = all_versions[version]
        except KeyError:
            raise ValueError(
                (
                    "Requested graph `{}` is not currently available.\n"
                    "Open an issue on the Graph repository to ask "
                    "for this graph to be added."
                ).format(name)
            )

        if preprocess == "auto":
            preprocess = is_macos() or is_linux()

        if preprocess and is_windows():
            raise ValueError(
                "Currently preprocessing to optimal edge list is not supported "
                "on Windows because the sorting step is based upon the `sort` "
                "command, which is only available to our knowledge on Linux and "
                "macOS systems."
            )

        # If the cache path was not provided
        # we either check the system variable
        # and if it is not set we use `graphs`
        if cache_path is None:
            cache_path = os.getenv(cache_sys_var, "graphs")

        cache_path = os.path.join(cache_path, repository)

        self._directed = directed
        self._preprocess = preprocess
        self._load_nodes = load_nodes
        self._load_node_types = load_node_types
        self._load_edge_weights = load_edge_weights
        self._name = name
        self._repository = repository
        self._version = version
        self._auto_enable_tradeoffs = auto_enable_tradeoffs
        self._sort_tmp_dir = sort_tmp_dir
        self._cache = cache
        self._verbose = verbose
        self._callbacks = callbacks
        if graph_kwargs is None:
            graph_kwargs = {}
        self._graph_kwargs = graph_kwargs
        self._callbacks_arguments = callbacks_arguments
        self._instance_hash = sha256({
            "hash_seed": hash_seed,
            **self._graph,
            **self._graph_kwargs,
        })
        self._cache_path = os.path.join(
            cache_path,
            name,
            version
        )
        self._downloader = BaseDownloader(
            auto_extract=True,
            cache=cache,
            target_directory=self._cache_path,
            verbose=self._verbose,
            process_number=1
        )
Example No. 23
def test_dict_hash():
    d = create_dict()
    assert dict_hash(d) == dict_hash(d)
    Path(sha256(d)).touch()
Example No. 24
    def __init__(self,
                 assembly,
                 window_size,
                 batch_size,
                 buffer_size=None,
                 max_gap_size=100,
                 train_chromosomes=None,
                 val_chromosomes=None,
                 cache_dir=None,
                 lazy_load=True,
                 clear_cache=False,
                 compile_on_start=True,
                 n_type="uniform"):
        self.assembly, self.window_size = assembly, window_size
        self.max_gap_size, self.batch_size, self.val_chromosomes = max_gap_size, batch_size, val_chromosomes

        # A buffer_size default of None means cpu_count(), for optimal performance:
        if not buffer_size:
            buffer_size = cpu_count()
        self.buffer_size = buffer_size

        # Validate the type of N
        if n_type not in self.n_types:
            raise ValueError("n_type must be one of %s" % n_type)
        self.n_type = n_type

        # Get the cache dir
        cache_dir = cache_dir or os.environ.get("CACHE_PATH", None) or "/tmp"

        self._cache_directory = "/".join(
            [cache_dir, assembly, str(window_size)])

        if clear_cache:
            self.clean_cache()

        # Generate a pool of processes to save the overhead
        self.workers = max(2, cpu_count())
        self.pool = Pool(self.workers)

        # Preprocess all the possible data
        self.genome = Genome(
            assembly=assembly,
            lazy_load=lazy_load,
            cache_directory=cache_dir,
        )

        if not val_chromosomes:
            self.val_chromosomes = []

        # If no chromosomes passed then use all the genome
        if not train_chromosomes:
            self.chromosomes = sorted(list(self.genome))
        else:
            self.chromosomes = train_chromosomes + self.val_chromosomes

        self.instance_hash = sha256({
            "assembly": self.assembly,
            "chromosomes": self.chromosomes,
            "window_size": self.window_size,
            "max_gap_size": self.max_gap_size,
            "n_type": n_type,
        })

        if compile_on_start:
            self.compile()
Example No. 25
def test_with_different_keys_order():
    d1 = {'tune_best_model': True, 'target': 'def'}

    d2 = {'target': 'def', 'tune_best_model': True}

    assert sha256(d1) == sha256(d2)
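Example No. 25 only covers flat dictionaries. As a hedged extension, the sketch below spells out the behaviour one would expect for nested structures, assuming dict_hash recurses into nested dictionaries (so key order stays irrelevant) while preserving the order of list elements; this is an assumption, not something demonstrated by the examples above.

from dict_hash import sha256


def test_nested_structures():
    d1 = {"outer": {"a": 1, "b": [1, 2, 3]}}
    d2 = {"outer": {"b": [1, 2, 3], "a": 1}}
    d3 = {"outer": {"a": 1, "b": [3, 2, 1]}}

    # Nested key order should not change the hash...
    assert sha256(d1) == sha256(d2)
    # ...while reordering a list's elements should.
    assert sha256(d1) != sha256(d3)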