Python Trie.values Exemples, pygtrie.Trie.values Python Exemples

Exemple #1

0

Afficher le fichier

    def cal_left_right_entropy(self):
        left_right_entropy = {}
        for n in range(self.min_n, self.max_n + 1):
            ngrams_entropy = {}
            target_ngrams = self.ngrams_words[n]
            parent_words = self.ngrams_words[n + 1]
            left_neighbors = Trie()
            right_neighbors = Trie()

            for parent_word in tqdm(parent_words, desc='build neighbors'):
                right_neighbors[parent_word] = self.ngrams_freq[parent_word]
                left_neighbors[parent_word[1:] +
                               parent_word[0]] = self.ngrams_freq[parent_word]

            for target_ngram in tqdm(target_ngrams, desc='target ngram'):
                try:
                    right_neighbors_counts = (
                        right_neighbors.values(target_ngram))
                    right_entropy = self.cal_ngram_entropy(
                        right_neighbors_counts)
                except KeyError:
                    right_entropy = 0
                try:
                    left_neighbors_counts = (
                        left_neighbors.values(target_ngram))
                    left_entropy = self.cal_ngram_entropy(
                        left_neighbors_counts)
                except KeyError:
                    left_entropy = 0
                ngrams_entropy[target_ngram] = (left_entropy, right_entropy)

            left_right_entropy.update(ngrams_entropy)

        return left_right_entropy

Exemple #2

0

Afficher le fichier

def _calc_ngram_entropy(ngram_freq, ngram_keys, n):
    """
    基于ngram频率信息计算熵信息
    :param ngram_freq:
    :param ngram_keys:
    :param n:
    :return:
    """
    if isinstance(n, Iterable):  ## 一次性计算 len(N)>1 的 ngram
        entropy = {}
        for ni in n:
            entropy = {
                **entropy,
                **_calc_ngram_entropy(ngram_freq, ngram_keys, ni)
            }
        return entropy

    ngram_entropy = {}
    target_ngrams = ngram_keys[n]
    parent_candidates = ngram_keys[n + 1]

    if CPU_COUNT == 1:
        ## 对 n+1 gram 进行建Trie处理
        left_neighbors = Trie()
        right_neighbors = Trie()

        for parent_candidate in parent_candidates:
            right_neighbors[parent_candidate] = ngram_freq[parent_candidate]
            left_neighbors[parent_candidate[1:] +
                           parent_candidate[0]] = ngram_freq[parent_candidate]

        ## 计算
        for target_ngram in target_ngrams:
            try:  ## 一定情况下, 一个candidate ngram 没有左右neighbor
                right_neighbor_counts = (right_neighbors.values(target_ngram))
                right_entropy = _ngram_entropy_scorer(right_neighbor_counts)
            except KeyError:
                right_entropy = 0
            try:
                left_neighbor_counts = (left_neighbors.values(target_ngram))
                left_entropy = _ngram_entropy_scorer(left_neighbor_counts)
            except KeyError:
                left_entropy = 0
            ngram_entropy[target_ngram] = (left_entropy, right_entropy)
        return ngram_entropy
    else:
        ## TODO 多进程计算
        pass

Exemple #3

0

Afficher le fichier

Fichier : trie.py Projet : sandeepmistry/dvc

def build_outs_trie(stages):
    outs = Trie()

    for stage in filter(bool, stages):  # bug? not using it later
        for out in stage.outs:
            out_key = out.path_info.parts

            # Check for dup outs
            if out_key in outs:
                dup_stages = [stage, outs[out_key].stage]
                raise OutputDuplicationError(str(out), dup_stages)

            # Check for overlapping outs
            if outs.has_subtrie(out_key):
                parent = out
                overlapping = first(outs.values(prefix=out_key))
            else:
                parent = outs.shortest_prefix(out_key).value
                overlapping = out
            if parent and overlapping:
                msg = (
                    "Paths for outs:\n'{}'('{}')\n'{}'('{}')\n"
                    "overlap. To avoid unpredictable behaviour, "
                    "rerun command with non overlapping outs paths."
                ).format(
                    str(parent),
                    parent.stage.addressing,
                    str(overlapping),
                    overlapping.stage.addressing,
                )
                raise OverlappingOutputPathsError(parent, overlapping, msg)

            outs[out_key] = out

    return outs

Exemple #4

0

Afficher le fichier

def build_outs_trie(stages):
    outs = Trie()

    for stage in stages:
        for out in stage.outs:
            out_key = out.fs.path.parts(out.fs_path)

            # Check for dup outs
            if out_key in outs:
                dup_stages = [stage, outs[out_key].stage]
                raise OutputDuplicationError(str(out), dup_stages)

            # Check for overlapping outs
            if outs.has_subtrie(out_key):
                parent = out
                overlapping = first(outs.values(prefix=out_key))
            else:
                parent = outs.shortest_prefix(out_key).value
                overlapping = out
            if parent and overlapping:
                msg = (
                    "The output paths:\n'{}'('{}')\n'{}'('{}')\n"
                    "overlap and are thus in the same tracked directory.\n"
                    "To keep reproducibility, outputs should be in separate "
                    "tracked directories or tracked individually.").format(
                        str(parent),
                        parent.stage.addressing,
                        str(overlapping),
                        overlapping.stage.addressing,
                    )
                raise OverlappingOutputPathsError(parent, overlapping, msg)

            outs[out_key] = out

    return outs

Exemple #5

0

Afficher le fichier

def calcul_ngram_entropy(ngram_freq,
                        ngram_keys,
                        n):
    """
    Calcul entropy by ngram frequences
    """
    # Calcul ngram entropy
    if isinstance(n,collections.abc.Iterable): 
        entropy = {}
        for ni in n:
            entropy = {**entropy,**calcul_ngram_entropy(ngram_freq,ngram_keys,ni)}
        return entropy
      
    ngram_entropy = {}
    parent_candidates = ngram_keys[n+1]
    if n!=1:
        target_ngrams = ngram_keys[n]
    else:
        target_ngrams = [l for l in ngram_keys[n] if ToolWord().is_english_word(l[0])]       

    if hp.CPU_COUNT == 1:
        # Build trie for n+1 gram 
        left_neighbors = Trie()
        right_neighbors = Trie()

        for parent_candidate in parent_candidates:
            right_neighbors[parent_candidate] = ngram_freq[parent_candidate]
            left_neighbors[parent_candidate[1:]+(parent_candidate[0],)] = ngram_freq[parent_candidate]

        # Calcul entropy
        for target_ngram in target_ngrams:
            try:  
                right_neighbor_counts = (right_neighbors.values(target_ngram))
                right_entropy = entropy_of_list(right_neighbor_counts)
            except KeyError:
                right_entropy = 0
            try:
                left_neighbor_counts = (left_neighbors.values(target_ngram))
                left_entropy = entropy_of_list(left_neighbor_counts)
            except KeyError:
                left_entropy = 0
            ngram_entropy[target_ngram] = (left_entropy,right_entropy)
        return ngram_entropy
    else:
        # Multi process
        pass

Exemple #6

0

Afficher le fichier

    def _collect_graph(self, stages):
        """Generate a graph by using the given stages on the given directory

        The nodes of the graph are the stage's path relative to the root.

        Edges are created when the output of one stage is used as a
        dependency in other stage.

        The direction of the edges goes from the stage to its dependency:

        For example, running the following:

            $ dvc run -o A "echo A > A"
            $ dvc run -d A -o B "echo B > B"
            $ dvc run -d B -o C "echo C > C"

        Will create the following graph:

               ancestors <--
                           |
                C.dvc -> B.dvc -> A.dvc
                |          |
                |          --> descendants
                |
                ------- pipeline ------>
                           |
                           v
              (weakly connected components)

        Args:
            stages (list): used to build a graph, if None given, collect stages
                in the repository.

        Raises:
            OutputDuplicationError: two outputs with the same path
            StagePathAsOutputError: stage inside an output directory
            OverlappingOutputPathsError: output inside output directory
            CyclicGraphError: resulting graph has cycles
        """
        import networkx as nx
        from pygtrie import Trie

        from dvc.exceptions import (
            OutputDuplicationError,
            OverlappingOutputPathsError,
            StagePathAsOutputError,
        )

        G = nx.DiGraph()
        stages = stages or self.stages
        outs = Trie()  # Use trie to efficiently find overlapping outs and deps

        for stage in filter(bool, stages):  # bug? not using it later
            for out in stage.outs:
                out_key = out.path_info.parts

                # Check for dup outs
                if out_key in outs:
                    dup_stages = [stage, outs[out_key].stage]
                    raise OutputDuplicationError(str(out), dup_stages)

                # Check for overlapping outs
                if outs.has_subtrie(out_key):
                    parent = out
                    overlapping = first(outs.values(prefix=out_key))
                else:
                    parent = outs.shortest_prefix(out_key).value
                    overlapping = out
                if parent and overlapping:
                    msg = ("Paths for outs:\n'{}'('{}')\n'{}'('{}')\n"
                           "overlap. To avoid unpredictable behaviour, "
                           "rerun command with non overlapping outs paths."
                           ).format(
                               str(parent),
                               parent.stage.addressing,
                               str(overlapping),
                               overlapping.stage.addressing,
                           )
                    raise OverlappingOutputPathsError(parent, overlapping, msg)

                outs[out_key] = out

        for stage in stages:
            out = outs.shortest_prefix(PathInfo(stage.path).parts).value
            if out:
                raise StagePathAsOutputError(stage, str(out))

        # Building graph
        G.add_nodes_from(stages)
        for stage in stages:
            for dep in stage.deps:
                if dep.path_info is None:
                    continue

                dep_key = dep.path_info.parts
                overlapping = [n.value for n in outs.prefixes(dep_key)]
                if outs.has_subtrie(dep_key):
                    overlapping.extend(outs.values(prefix=dep_key))

                G.add_edges_from((stage, out.stage) for out in overlapping)
        check_acyclic(G)

        return G