def cal_left_right_entropy(self):
    """
    Compute the left and right neighbour entropy of every target n-gram.

    For each order ``n`` from ``self.min_n`` to ``self.max_n`` (inclusive),
    the (n+1)-grams in ``self.ngrams_words[n + 1]`` are stored in two tries
    with their ``self.ngrams_freq`` value: one keyed by the (n+1)-gram
    itself, one keyed by the (n+1)-gram with its first element moved to the
    end. Every n-gram in ``self.ngrams_words[n]`` is then used as a prefix
    into both tries and the collected values are scored with
    ``self.cal_ngram_entropy``.

    :return: dict mapping each n-gram to ``(left_entropy, right_entropy)``.
        A side is 0 when its lookup or scoring raises ``KeyError``.
    """

    def side_entropy(neighbors, prefix):
        # KeyError is treated as "no neighbour on this side".
        try:
            return self.cal_ngram_entropy(neighbors.values(prefix))
        except KeyError:
            return 0

    result = {}
    for order in range(self.min_n, self.max_n + 1):
        targets = self.ngrams_words[order]
        parents = self.ngrams_words[order + 1]

        followers = Trie()    # keyed by the (n+1)-gram as-is
        predecessors = Trie()  # keyed by the (n+1)-gram rotated left by one
        for parent in tqdm(parents, desc='build neighbors'):
            freq = self.ngrams_freq[parent]
            followers[parent] = freq
            predecessors[parent[1:] + parent[0]] = freq

        for target in tqdm(targets, desc='target ngram'):
            # Right side is evaluated first, as in the stored tuple's
            # second slot; the tuple itself is (left, right).
            right = side_entropy(followers, target)
            left = side_entropy(predecessors, target)
            result[target] = (left, right)
    return result
def _calc_ngram_entropy(ngram_freq, ngram_keys, n):
    """
    Compute left/right neighbour entropy for n-grams from frequency counts.

    :param ngram_freq: mapping from an n-gram to its frequency; it is indexed
        with every (n+1)-gram found in ``ngram_keys[n + 1]``
    :param ngram_keys: container indexed by order that yields the n-grams of
        that order; both ``n`` and ``n + 1`` are looked up
    :param n: a single order, or an iterable of orders whose per-order
        results are merged into one dict
    :return: dict mapping each n-gram of order ``n`` to
        ``(left_entropy, right_entropy)``; a side is 0 when its lookup or
        scoring raises ``KeyError``
    """
    if isinstance(n, Iterable):
        # Compute several orders in one call and merge the results in place
        # (avoids rebuilding the merged dict on every iteration).
        entropy = {}
        for ni in n:
            entropy.update(_calc_ngram_entropy(ngram_freq, ngram_keys, ni))
        return entropy

    ngram_entropy = {}
    target_ngrams = ngram_keys[n]
    parent_candidates = ngram_keys[n + 1]

    # NOTE: multi-process computation is not implemented. The single-process
    # path below is used unconditionally so that the function never silently
    # returns None (which previously happened whenever CPU_COUNT != 1).

    # Index the (n+1)-grams in two tries.
    left_neighbors = Trie()
    right_neighbors = Trie()
    for parent_candidate in parent_candidates:
        freq = ngram_freq[parent_candidate]
        right_neighbors[parent_candidate] = freq
        # Rotate the first element to the end so the n-gram becomes a prefix
        # and its left neighbour the last element. Slicing with [:1] keeps
        # the container type, so tuple n-grams work as well as strings.
        left_neighbors[parent_candidate[1:] + parent_candidate[:1]] = freq

    # Score every target n-gram on both sides.
    for target_ngram in target_ngrams:
        try:
            # In some cases a candidate n-gram has no neighbour on a side;
            # the resulting KeyError is mapped to an entropy of 0.
            right_neighbor_counts = right_neighbors.values(target_ngram)
            right_entropy = _ngram_entropy_scorer(right_neighbor_counts)
        except KeyError:
            right_entropy = 0
        try:
            left_neighbor_counts = left_neighbors.values(target_ngram)
            left_entropy = _ngram_entropy_scorer(left_neighbor_counts)
        except KeyError:
            left_entropy = 0
        ngram_entropy[target_ngram] = (left_entropy, right_entropy)
    return ngram_entropy
def build_outs_trie(stages):
    """
    Index the outputs of ``stages`` in a trie keyed by path parts.

    Falsy entries in ``stages`` are skipped. Each output is checked against
    the outputs indexed so far before it is inserted.

    :param stages: iterable of stages; each truthy stage exposes ``outs``
    :return: ``Trie`` mapping ``out.path_info.parts`` to the output
    :raises OutputDuplicationError: the same key is produced twice
    :raises OverlappingOutputPathsError: one output's key is a prefix of
        another output's key
    """
    trie = Trie()
    for stage in stages:
        if not stage:
            continue
        for out in stage.outs:
            key = out.path_info.parts

            # Exact same path already registered.
            if key in trie:
                dup_stages = [stage, trie[key].stage]
                raise OutputDuplicationError(str(out), dup_stages)

            # Either this output contains an earlier one, or an earlier
            # output (the shortest registered prefix, if any) contains it.
            if trie.has_subtrie(key):
                parent, overlapping = out, first(trie.values(prefix=key))
            else:
                parent, overlapping = trie.shortest_prefix(key).value, out

            if parent and overlapping:
                template = (
                    "Paths for outs:\n'{}'('{}')\n'{}'('{}')\n"
                    "overlap. To avoid unpredictable behaviour, "
                    "rerun command with non overlapping outs paths."
                )
                msg = template.format(
                    str(parent),
                    parent.stage.addressing,
                    str(overlapping),
                    overlapping.stage.addressing,
                )
                raise OverlappingOutputPathsError(parent, overlapping, msg)

            trie[key] = out
    return trie
def build_outs_trie(stages):
    """
    Index the outputs of ``stages`` in a trie keyed by path parts.

    Each output is checked against the outputs indexed so far before it is
    inserted.

    :param stages: iterable of stages, each exposing ``outs``
    :return: ``Trie`` mapping ``out.fs.path.parts(out.fs_path)`` to the output
    :raises OutputDuplicationError: the same key is produced twice
    :raises OverlappingOutputPathsError: one output's key is a prefix of
        another output's key
    """
    trie = Trie()
    for stage in stages:
        for out in stage.outs:
            key = out.fs.path.parts(out.fs_path)

            # Exact same path already registered.
            if key in trie:
                dup_stages = [stage, trie[key].stage]
                raise OutputDuplicationError(str(out), dup_stages)

            # Either this output contains an earlier one, or an earlier
            # output (the shortest registered prefix, if any) contains it.
            if trie.has_subtrie(key):
                parent, overlapping = out, first(trie.values(prefix=key))
            else:
                parent, overlapping = trie.shortest_prefix(key).value, out

            if parent and overlapping:
                template = (
                    "The output paths:\n'{}'('{}')\n'{}'('{}')\n"
                    "overlap and are thus in the same tracked directory.\n"
                    "To keep reproducibility, outputs should be in separate "
                    "tracked directories or tracked individually."
                )
                msg = template.format(
                    str(parent),
                    parent.stage.addressing,
                    str(overlapping),
                    overlapping.stage.addressing,
                )
                raise OverlappingOutputPathsError(parent, overlapping, msg)

            trie[key] = out
    return trie
def calcul_ngram_entropy(ngram_freq, ngram_keys, n):
    """
    Compute left/right neighbour entropy from n-gram frequencies.

    :param ngram_freq: mapping from an n-gram to its frequency; it is indexed
        with every (n+1)-gram found in ``ngram_keys[n + 1]``
    :param ngram_keys: container indexed by order that yields the n-grams of
        that order; both ``n`` and ``n + 1`` are looked up
    :param n: a single order, or an iterable of orders whose per-order
        results are merged into one dict
    :return: dict mapping each selected n-gram of order ``n`` to
        ``(left_entropy, right_entropy)``; a side is 0 when its lookup or
        scoring raises ``KeyError``
    """
    if isinstance(n, collections.abc.Iterable):
        # Merge per-order results in place instead of rebuilding the dict on
        # every iteration.
        entropy = {}
        for ni in n:
            entropy.update(calcul_ngram_entropy(ngram_freq, ngram_keys, ni))
        return entropy

    ngram_entropy = {}
    parent_candidates = ngram_keys[n + 1]
    if n != 1:
        target_ngrams = ngram_keys[n]
    else:
        # For unigrams, keep only those whose first element passes
        # ToolWord.is_english_word. The checker is built once rather than
        # once per candidate.
        word_tool = ToolWord()
        target_ngrams = [
            ngram for ngram in ngram_keys[n]
            if word_tool.is_english_word(ngram[0])
        ]

    # NOTE: multi-process computation is not implemented. The single-process
    # path below is used unconditionally so that the function never silently
    # returns None (which previously happened whenever hp.CPU_COUNT != 1).

    # Build tries for the (n+1)-grams.
    left_neighbors = Trie()
    right_neighbors = Trie()
    for parent_candidate in parent_candidates:
        freq = ngram_freq[parent_candidate]
        right_neighbors[parent_candidate] = freq
        # Rotate the first element to the end so the n-gram becomes a prefix
        # and its left neighbour the last element. [:1] keeps the container
        # type (identical to ``(parent_candidate[0],)`` for tuples).
        left_neighbors[parent_candidate[1:] + parent_candidate[:1]] = freq

    # Score every target n-gram on both sides.
    for target_ngram in target_ngrams:
        try:
            right_neighbor_counts = right_neighbors.values(target_ngram)
            right_entropy = entropy_of_list(right_neighbor_counts)
        except KeyError:
            # No neighbour found on this side.
            right_entropy = 0
        try:
            left_neighbor_counts = left_neighbors.values(target_ngram)
            left_entropy = entropy_of_list(left_neighbor_counts)
        except KeyError:
            left_entropy = 0
        ngram_entropy[target_ngram] = (left_entropy, right_entropy)
    return ngram_entropy
def _collect_graph(self, stages):
    """Build a dependency graph from the given stages.

    The nodes of the graph are the stage objects. An edge is added from a
    stage to every stage that owns an output overlapping one of its
    dependencies, i.e. edges point from a stage to what it depends on.

    For example, running the following:

        $ dvc run -o A "echo A > A"
        $ dvc run -d A -o B "echo B > B"
        $ dvc run -d B -o C "echo C > C"

    Will create the following graph:

           ancestors <--
                       |
            C.dvc -> B.dvc -> A.dvc
            |          |
            |          --> descendants
            |
            ------- pipeline ------>
                       |
                       v
          (weakly connected components)

    Args:
        stages (list): stages used to build the graph; when falsy,
            ``self.stages`` is used instead.

    Returns:
        The ``networkx.DiGraph`` that was built.

    Raises:
        OutputDuplicationError: two outputs with the same path
        StagePathAsOutputError: stage inside an output directory
        OverlappingOutputPathsError: output inside output directory
        CyclicGraphError: resulting graph has cycles (presumably raised
            by ``check_acyclic``)
    """
    import networkx as nx
    from pygtrie import Trie

    from dvc.exceptions import (
        OutputDuplicationError,
        OverlappingOutputPathsError,
        StagePathAsOutputError,
    )

    stages = stages or self.stages

    # A trie keyed by path parts makes prefix (overlap) queries cheap for
    # both outputs and dependencies.
    out_trie = Trie()

    # Only this first pass skips falsy stages; the later passes iterate
    # ``stages`` unfiltered, exactly as before.
    for stage in stages:
        if not stage:
            continue
        for out in stage.outs:
            key = out.path_info.parts

            # Same path registered twice.
            if key in out_trie:
                dup_stages = [stage, out_trie[key].stage]
                raise OutputDuplicationError(str(out), dup_stages)

            # Either this output contains an earlier one, or an earlier
            # output (the shortest registered prefix, if any) contains it.
            if out_trie.has_subtrie(key):
                parent, overlapping = out, first(out_trie.values(prefix=key))
            else:
                parent, overlapping = out_trie.shortest_prefix(key).value, out

            if parent and overlapping:
                template = (
                    "Paths for outs:\n'{}'('{}')\n'{}'('{}')\n"
                    "overlap. To avoid unpredictable behaviour, "
                    "rerun command with non overlapping outs paths."
                )
                msg = template.format(
                    str(parent),
                    parent.stage.addressing,
                    str(overlapping),
                    overlapping.stage.addressing,
                )
                raise OverlappingOutputPathsError(parent, overlapping, msg)

            out_trie[key] = out

    # A stage file must not live inside a tracked output.
    for stage in stages:
        stage_key = PathInfo(stage.path).parts
        enclosing_out = out_trie.shortest_prefix(stage_key).value
        if enclosing_out:
            raise StagePathAsOutputError(stage, str(enclosing_out))

    # Build the graph: one node per stage, one edge per dependency/output
    # overlap.
    graph = nx.DiGraph()
    graph.add_nodes_from(stages)
    for stage in stages:
        for dep in stage.deps:
            dep_path = dep.path_info
            if dep_path is None:
                continue

            dep_key = dep_path.parts
            # Outputs at or above the dependency path...
            producers = [step.value for step in out_trie.prefixes(dep_key)]
            # ...plus outputs at or below it.
            if out_trie.has_subtrie(dep_key):
                producers.extend(out_trie.values(prefix=dep_key))

            graph.add_edges_from((stage, out.stage) for out in producers)

    check_acyclic(graph)

    return graph