def __init__(self, fs, root_dir):
    from dvc.repo import Repo

    default_ignore_patterns = [
        ".hg/",
        ".git/",
        ".git",
        f"{Repo.DVC_DIR}/",
    ]

    self.fs = fs
    self.root_dir = root_dir
    self.ignores_trie_fs = Trie()
    self._ignores_trie_subrepos = Trie()

    key = self._get_key(root_dir)
    self.ignores_trie_fs[key] = DvcIgnorePatterns(
        default_ignore_patterns,
        root_dir,
        fs.sep,
    )
    self._ignores_trie_subrepos[key] = self.ignores_trie_fs[key]
    self._update(
        self.root_dir,
        self._ignores_trie_subrepos,
        dnames=None,
        ignore_subrepos=False,
    )
    self._update(
        self.root_dir,
        self.ignores_trie_fs,
        dnames=None,
        ignore_subrepos=True,
    )
def cal_left_right_entropy(self):
    left_right_entropy = {}
    for n in range(self.min_n, self.max_n + 1):
        ngrams_entropy = {}
        target_ngrams = self.ngrams_words[n]
        parent_words = self.ngrams_words[n + 1]
        left_neighbors = Trie()
        right_neighbors = Trie()
        for parent_word in tqdm(parent_words, desc='build neighbors'):
            right_neighbors[parent_word] = self.ngrams_freq[parent_word]
            left_neighbors[parent_word[1:] + parent_word[0]] = self.ngrams_freq[parent_word]
        for target_ngram in tqdm(target_ngrams, desc='target ngram'):
            try:
                right_neighbors_counts = right_neighbors.values(target_ngram)
                right_entropy = self.cal_ngram_entropy(right_neighbors_counts)
            except KeyError:
                right_entropy = 0
            try:
                left_neighbors_counts = left_neighbors.values(target_ngram)
                left_entropy = self.cal_ngram_entropy(left_neighbors_counts)
            except KeyError:
                left_entropy = 0
            ngrams_entropy[target_ngram] = (left_entropy, right_entropy)
        left_right_entropy.update(ngrams_entropy)
    return left_right_entropy
def _calc_ngram_entropy(ngram_freq, ngram_keys, n):
    """
    Compute entropy information from ngram frequency information.
    :param ngram_freq:
    :param ngram_keys:
    :param n:
    :return:
    """
    if isinstance(n, Iterable):
        # compute the ngram entropy for several values of n in one call
        entropy = {}
        for ni in n:
            entropy = {
                **entropy,
                **_calc_ngram_entropy(ngram_freq, ngram_keys, ni)
            }
        return entropy

    ngram_entropy = {}
    target_ngrams = ngram_keys[n]
    parent_candidates = ngram_keys[n + 1]

    if CPU_COUNT == 1:
        # build tries over the (n+1)-grams
        left_neighbors = Trie()
        right_neighbors = Trie()
        for parent_candidate in parent_candidates:
            right_neighbors[parent_candidate] = ngram_freq[parent_candidate]
            left_neighbors[parent_candidate[1:] + parent_candidate[0]] = ngram_freq[parent_candidate]

        # compute the entropies
        for target_ngram in target_ngrams:
            try:
                # in some cases a candidate ngram has no left/right neighbor
                right_neighbor_counts = right_neighbors.values(target_ngram)
                right_entropy = _ngram_entropy_scorer(right_neighbor_counts)
            except KeyError:
                right_entropy = 0
            try:
                left_neighbor_counts = left_neighbors.values(target_ngram)
                left_entropy = _ngram_entropy_scorer(left_neighbor_counts)
            except KeyError:
                left_entropy = 0
            ngram_entropy[target_ngram] = (left_entropy, right_entropy)
        return ngram_entropy
    else:
        # TODO: multiprocessing implementation
        pass
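# Illustrative sketch, not taken from the snippets above: the left/right tries
# store (n+1)-gram counts so that a single prefix query returns all right- or
# left-extensions of a target ngram. The ngrams and counts below are made up.
from math import log2
from pygtrie import CharTrie

def _entropy(counts):
    total = sum(counts)
    return -sum(c / total * log2(c / total) for c in counts)

parent_freq = {"abc": 3, "abd": 1, "xab": 2, "yab": 2}  # hypothetical (n+1)-gram counts

right_neighbors = CharTrie()  # keyed by the (n+1)-gram itself
left_neighbors = CharTrie()   # keyed by the rotated (n+1)-gram (first char moved to the end)
for gram, freq in parent_freq.items():
    right_neighbors[gram] = freq
    left_neighbors[gram[1:] + gram[0]] = freq

# values(prefix) gathers every count stored under the prefix and raises
# KeyError when the target ngram has no neighbor on that side.
print(_entropy(right_neighbors.values("ab")))  # right extensions: "abc", "abd"
print(_entropy(left_neighbors.values("ab")))   # left extensions: "xab", "yab"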
def __init__(self, tree: GitObject, rev: str):
    self.tree = tree
    self.rev = rev
    self.trie = Trie()
    self.trie[()] = tree
    self._build(tree, ())
def build_outs_trie(stages):
    outs = Trie()

    for stage in filter(bool, stages):  # skip falsy (None) stage entries
        for out in stage.outs:
            out_key = out.path_info.parts

            # Check for dup outs
            if out_key in outs:
                dup_stages = [stage, outs[out_key].stage]
                raise OutputDuplicationError(str(out), dup_stages)

            # Check for overlapping outs
            if outs.has_subtrie(out_key):
                parent = out
                overlapping = first(outs.values(prefix=out_key))
            else:
                parent = outs.shortest_prefix(out_key).value
                overlapping = out
            if parent and overlapping:
                msg = (
                    "Paths for outs:\n'{}'('{}')\n'{}'('{}')\n"
                    "overlap. To avoid unpredictable behaviour, "
                    "rerun command with non overlapping outs paths."
                ).format(
                    str(parent),
                    parent.stage.addressing,
                    str(overlapping),
                    overlapping.stage.addressing,
                )
                raise OverlappingOutputPathsError(parent, overlapping, msg)

            outs[out_key] = out

    return outs
def walk(self, top, topdown=True, onerror=None, **kwargs):
    from pygtrie import Trie

    assert topdown
    root = PathInfo(os.path.abspath(top))
    try:
        meta = self.metadata(root)
    except OutputNotFoundError:
        if onerror is not None:
            onerror(FileNotFoundError(top))
        return

    if not meta.isdir:
        if onerror is not None:
            onerror(NotADirectoryError(top))
        return

    trie = Trie()
    for out in meta.outs:
        trie[out.path_info.parts] = out

        if out.is_dir_checksum and root.isin_or_eq(out.path_info):
            self._add_dir(top, trie, out, **kwargs)

    yield from self._walk(root, trie, topdown=topdown, **kwargs)
def walk(self, top, topdown=True, **kwargs):
    from pygtrie import Trie

    assert topdown

    if not self.exists(top):
        raise FileNotFoundError

    if not self.isdir(top):
        raise NotADirectoryError

    root = PathInfo(os.path.abspath(top))
    outs = self._find_outs(top, recursive=True, strict=False)

    trie = Trie()
    for out in outs:
        trie[out.path_info.parts] = out

        if out.is_dir_checksum and (self.fetch or self.stream):
            # pull dir cache if needed
            dir_cache = out.get_dir_cache(**kwargs)

            # pull dir contents if needed
            if self.fetch:
                if out.changed_cache(filter_info=top):
                    used_cache = out.get_used_cache(filter_info=top)
                    self.repo.cloud.pull(used_cache, **kwargs)

            for entry in dir_cache:
                entry_relpath = entry[out.remote.PARAM_RELPATH]
                path_info = out.path_info / entry_relpath
                trie[path_info.parts] = None

    yield from self._walk(root, trie, topdown=topdown)
def walk(self, top, topdown=True, onerror=None, **kwargs):
    from pygtrie import Trie

    assert topdown
    root = os.path.abspath(top)
    try:
        info = self.info(root)
    except FileNotFoundError:
        if onerror is not None:
            onerror(FileNotFoundError(top))
        return

    if info["type"] != "directory":
        if onerror is not None:
            onerror(NotADirectoryError(top))
        return

    trie = Trie()
    for out in info["outs"]:
        trie[out.fs.path.parts(out.fs_path)] = out

        if out.is_dir_checksum and self.path.isin_or_eq(root, out.fs_path):
            self._add_dir(trie, out, **kwargs)

    yield from self._walk(root, trie, topdown=topdown, **kwargs)
def walk(self, top, topdown=True, onerror=None, **kwargs):
    from pygtrie import Trie

    assert topdown

    if not self.exists(top):
        if onerror is not None:
            onerror(FileNotFoundError(top))
        return

    if not self.isdir(top):
        if onerror is not None:
            onerror(NotADirectoryError(top))
        return

    root = PathInfo(os.path.abspath(top))
    outs = self._find_outs(top, recursive=True, strict=False)

    trie = Trie()
    for out in outs:
        trie[out.path_info.parts] = out

        if out.is_dir_checksum and root.isin_or_eq(out.path_info):
            self._add_dir(top, trie, out, **kwargs)

    yield from self._walk(root, trie, topdown=topdown, **kwargs)
def build_outs_trie(stages):
    outs = Trie()

    for stage in stages:
        for out in stage.outs:
            out_key = out.fs.path.parts(out.fs_path)

            # Check for dup outs
            if out_key in outs:
                dup_stages = [stage, outs[out_key].stage]
                raise OutputDuplicationError(str(out), dup_stages)

            # Check for overlapping outs
            if outs.has_subtrie(out_key):
                parent = out
                overlapping = first(outs.values(prefix=out_key))
            else:
                parent = outs.shortest_prefix(out_key).value
                overlapping = out
            if parent and overlapping:
                msg = (
                    "The output paths:\n'{}'('{}')\n'{}'('{}')\n"
                    "overlap and are thus in the same tracked directory.\n"
                    "To keep reproducibility, outputs should be in separate "
                    "tracked directories or tracked individually."
                ).format(
                    str(parent),
                    parent.stage.addressing,
                    str(overlapping),
                    overlapping.stage.addressing,
                )
                raise OverlappingOutputPathsError(parent, overlapping, msg)

            outs[out_key] = out

    return outs
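# Illustrative sketch, not part of the snippets above: how the duplicate and
# overlap checks behave on a toy trie. The keys and stage names are made up.
from pygtrie import Trie

outs = Trie()
outs[("data", "raw")] = "stage-a"              # hypothetical out registered first

new_key = ("data", "raw", "file.csv")          # candidate out nested inside it
if new_key in outs:
    print("duplicate out")
elif outs.has_subtrie(new_key):
    # some already-registered out lives *under* the candidate
    print("overlaps with", next(iter(outs.values(prefix=new_key))))
elif outs.shortest_prefix(new_key).value is not None:
    # the candidate lives *inside* an already-registered out
    print("overlaps with", outs.shortest_prefix(new_key).value)   # -> stage-a
else:
    outs[new_key] = "stage-b"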
def walk(self, top, topdown=True):
    from pygtrie import Trie

    assert topdown

    if not self.exists(top):
        raise FileNotFoundError

    if not self.isdir(top):
        raise NotADirectoryError

    root = PathInfo(os.path.abspath(top))
    outs = self._find_outs(top, recursive=True, strict=False)

    trie = Trie()
    for out in outs:
        trie[out.path_info.parts] = out

        if out.is_dir_checksum and (self.fetch or self.stream):
            # will pull dir cache if needed
            with self.repo.state:
                cache = out.collect_used_dir_cache()
            for _, names in cache.scheme_names(out.scheme):
                for name in names:
                    path_info = out.path_info.parent / name
                    trie[path_info.parts] = None

    yield from self._walk(root, trie, topdown=topdown)
def calcul_ngram_entropy(ngram_freq, ngram_keys, n):
    """
    Calculate entropy from ngram frequencies.
    """
    # Calculate the ngram entropy for several values of n in one call
    if isinstance(n, collections.abc.Iterable):
        entropy = {}
        for ni in n:
            entropy = {**entropy, **calcul_ngram_entropy(ngram_freq, ngram_keys, ni)}
        return entropy

    ngram_entropy = {}
    parent_candidates = ngram_keys[n + 1]
    if n != 1:
        target_ngrams = ngram_keys[n]
    else:
        target_ngrams = [l for l in ngram_keys[n] if ToolWord().is_english_word(l[0])]

    if hp.CPU_COUNT == 1:
        # Build tries over the (n+1)-grams
        left_neighbors = Trie()
        right_neighbors = Trie()
        for parent_candidate in parent_candidates:
            right_neighbors[parent_candidate] = ngram_freq[parent_candidate]
            left_neighbors[parent_candidate[1:] + (parent_candidate[0],)] = ngram_freq[parent_candidate]

        # Calculate the entropies
        for target_ngram in target_ngrams:
            try:
                right_neighbor_counts = right_neighbors.values(target_ngram)
                right_entropy = entropy_of_list(right_neighbor_counts)
            except KeyError:
                right_entropy = 0
            try:
                left_neighbor_counts = left_neighbors.values(target_ngram)
                left_entropy = entropy_of_list(left_neighbor_counts)
            except KeyError:
                left_entropy = 0
            ngram_entropy[target_ngram] = (left_entropy, right_entropy)
        return ngram_entropy
    else:
        # TODO: multiprocessing implementation
        pass
def build_trie(alphabet, vocab):
    from pygtrie import CharTrie as Trie

    trie = Trie()
    start_time = datetime.now()
    info('start building trie at {}'.format(start_time.strftime("%H:%M:%S")))
    for v in vocab:
        trie[v] = 1
    end_time = datetime.now()
    info('finish building trie at {} (delta {})'.format(
        end_time.strftime("%H:%M:%S"), end_time - start_time))
    return trie
def build_trie(alphabet, vocab):
    from pygtrie import CharTrie as Trie

    start_time = datetime.now()
    info('start building trie at {}'.format(start_time.strftime("%H:%M:%S")))
    trie = Trie()
    for i, v in enumerate(vocab, start=1):
        trie[v] = 1
        if i % 10000 == 0:
            info('inserted {} ...'.format(i))
    end_time = datetime.now()
    info('finish building trie at {} (delta {})'.format(
        end_time.strftime("%H:%M:%S"), end_time - start_time))
    return trie
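# Illustrative usage with a hypothetical vocabulary: both build_trie variants
# above only need membership and prefix lookups, so every entry maps to a dummy
# value of 1.
from pygtrie import CharTrie

trie = CharTrie()
for word in ["cat", "car", "cart"]:
    trie[word] = 1

print("car" in trie)                   # True: exact key
print(trie.has_subtrie("ca"))          # True: "ca" prefixes stored keys
print(list(trie.keys(prefix="car")))   # ['car', 'cart']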
def __init__(
    self,
    repo: Optional["Repo"] = None,
    subrepos=False,
    repo_factory: RepoFactory = None,
    **kwargs,
):
    super().__init__()

    from pygtrie import Trie

    if repo is None:
        repo, repo_factory = self._repo_from_fs_config(subrepos=subrepos, **kwargs)

    if not repo_factory:
        from dvc.repo import Repo

        self.repo_factory: RepoFactory = Repo
    else:
        self.repo_factory = repo_factory

    def _getcwd():
        relparts = ()
        if repo.fs.path.isin(repo.fs.path.getcwd(), repo.root_dir):
            relparts = repo.fs.path.relparts(repo.fs.path.getcwd(), repo.root_dir)
        return self.root_marker + self.sep.join(relparts)

    self.path = Path(self.sep, getcwd=_getcwd)
    self.repo = repo
    self.hash_jobs = repo.fs.hash_jobs
    self._traverse_subrepos = subrepos

    self._subrepos_trie = Trie()
    """Keeps track of each and every path with the corresponding repo."""

    key = self._get_key(self.repo.root_dir)
    self._subrepos_trie[key] = repo

    self._datafss = {}
    """Keep a datafs instance of each repo."""

    if hasattr(repo, "dvc_dir"):
        self._datafss[key] = DataFileSystem(repo=repo)
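# Illustrative sketch with made-up paths, not the actual lookup code of the
# class above: keying a trie by path parts lets the innermost matching repo be
# found for any path with a single longest_prefix query.
from pygtrie import Trie

subrepos = Trie()
subrepos[("home", "proj")] = "outer-repo"
subrepos[("home", "proj", "sub")] = "inner-repo"

key = ("home", "proj", "sub", "data", "file")
print(subrepos.longest_prefix(key).value)   # 'inner-repo'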
def walk(
    self, top, topdown=True, onerror=None, download_callback=None, **kwargs
):
    from pygtrie import Trie

    assert topdown

    if not self.exists(top):
        if onerror is not None:
            onerror(FileNotFoundError(top))
        return

    if not self.isdir(top):
        if onerror is not None:
            onerror(NotADirectoryError(top))
        return

    root = PathInfo(os.path.abspath(top))
    outs = self._find_outs(top, recursive=True, strict=False)

    trie = Trie()
    for out in outs:
        trie[out.path_info.parts] = out

        if out.is_dir_checksum and (self.fetch or self.stream):
            # pull dir cache if needed
            dir_cache = out.get_dir_cache(**kwargs)

            # pull dir contents if needed
            if self.fetch:
                if out.changed_cache(filter_info=top):
                    used_cache = out.get_used_cache(filter_info=top)
                    downloaded = self.repo.cloud.pull(used_cache, **kwargs)
                    if download_callback:
                        download_callback(downloaded)

            for entry in dir_cache:
                entry_relpath = entry[out.remote.tree.PARAM_RELPATH]
                if os.name == "nt":
                    entry_relpath = entry_relpath.replace("/", os.sep)
                path_info = out.path_info / entry_relpath
                trie[path_info.parts] = None

    yield from self._walk(root, trie, topdown=topdown)
def walk(self, top, topdown=True):
    from pygtrie import Trie

    assert topdown

    if not self.exists(top):
        raise FileNotFoundError

    if not self.isdir(top):
        raise NotADirectoryError

    root = PathInfo(os.path.abspath(top))
    outs = self._find_outs(top, recursive=True, strict=False)

    trie = Trie()
    for out in outs:
        trie[out.path_info.parts] = out

    yield from self._walk(root, trie, topdown=topdown)
def _collect_graph(self, stages):
    """Generate a graph by using the given stages on the given directory

    The nodes of the graph are the stage's path relative to the root.

    Edges are created when the output of one stage is used as a
    dependency in other stage.

    The direction of the edges goes from the stage to its dependency:

    For example, running the following:

        $ dvc run -o A "echo A > A"
        $ dvc run -d A -o B "echo B > B"
        $ dvc run -d B -o C "echo C > C"

    Will create the following graph:

           ancestors <--
                       |
            C.dvc -> B.dvc -> A.dvc
            |          |
            |          --> descendants
            |
            ------- pipeline ------>
                       |
                       v
          (weakly connected components)

    Args:
        stages (list): used to build a graph, if None given, collect stages
            in the repository.

    Raises:
        OutputDuplicationError: two outputs with the same path
        StagePathAsOutputError: stage inside an output directory
        OverlappingOutputPathsError: output inside output directory
        CyclicGraphError: resulting graph has cycles
    """
    import networkx as nx
    from pygtrie import Trie

    from dvc.exceptions import (
        OutputDuplicationError,
        OverlappingOutputPathsError,
        StagePathAsOutputError,
    )

    G = nx.DiGraph()
    stages = stages or self.stages
    outs = Trie()  # Use trie to efficiently find overlapping outs and deps

    for stage in filter(bool, stages):  # bug? the later loops still use the unfiltered ``stages``
        for out in stage.outs:
            out_key = out.path_info.parts

            # Check for dup outs
            if out_key in outs:
                dup_stages = [stage, outs[out_key].stage]
                raise OutputDuplicationError(str(out), dup_stages)

            # Check for overlapping outs
            if outs.has_subtrie(out_key):
                parent = out
                overlapping = first(outs.values(prefix=out_key))
            else:
                parent = outs.shortest_prefix(out_key).value
                overlapping = out
            if parent and overlapping:
                msg = (
                    "Paths for outs:\n'{}'('{}')\n'{}'('{}')\n"
                    "overlap. To avoid unpredictable behaviour, "
                    "rerun command with non overlapping outs paths."
                ).format(
                    str(parent),
                    parent.stage.addressing,
                    str(overlapping),
                    overlapping.stage.addressing,
                )
                raise OverlappingOutputPathsError(parent, overlapping, msg)

            outs[out_key] = out

    for stage in stages:
        out = outs.shortest_prefix(PathInfo(stage.path).parts).value
        if out:
            raise StagePathAsOutputError(stage, str(out))

    # Building graph
    G.add_nodes_from(stages)
    for stage in stages:
        for dep in stage.deps:
            if dep.path_info is None:
                continue

            dep_key = dep.path_info.parts
            overlapping = [n.value for n in outs.prefixes(dep_key)]
            if outs.has_subtrie(dep_key):
                overlapping.extend(outs.values(prefix=dep_key))

            G.add_edges_from((stage, out.stage) for out in overlapping)
    check_acyclic(G)

    return G
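# Illustrative sketch with toy keys, not the repo's own data: collecting every
# out that overlaps a dependency path combines prefixes() (outs *containing*
# the dep) with values(prefix=...) (outs *inside* the dep), as in the edge
# construction above.
from pygtrie import Trie

outs = Trie()
outs[("data",)] = "out-data"                 # hypothetical out: a tracked directory
outs[("data", "raw", "x.csv")] = "out-x"     # hypothetical out nested below the dep

dep_key = ("data", "raw")
overlapping = [step.value for step in outs.prefixes(dep_key)]   # ['out-data']
if outs.has_subtrie(dep_key):
    overlapping.extend(outs.values(prefix=dep_key))             # adds 'out-x'
print(overlapping)                                              # ['out-data', 'out-x']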
def test_size(trie_dict, size):
    dir_info = DirInfo()
    dir_info.trie = Trie(trie_dict)
    assert dir_info.size == size
def trie(self):
    from pygtrie import Trie

    return Trie(self._dict)
def test_nfiles(trie_dict, nfiles):
    dir_info = DirInfo()
    dir_info.trie = Trie(trie_dict)
    assert dir_info.nfiles == nfiles
def __init__(self, data, tokenizer=None, upper=True, lower=True):
    self.t = tokenizer if tokenizer else tokenizers.get('jieba')
    self.upper = upper
    self.lower = lower
    self.fw_trie = Trie()
    self.bw_trie = Trie()
def trie(self):
    return Trie(self._dict)
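# Illustrative sketch with hypothetical file entries: passing a dict with tuple
# keys to Trie() yields one trie node per path part, which is the layout the
# trie properties and the DirInfo tests in this section rely on.
from pygtrie import Trie

d = {
    ("dir", "a.txt"): {"md5": "aaa"},
    ("dir", "sub", "b.txt"): {"md5": "bbb"},
}
trie = Trie(d)

print(trie.has_subtrie(("dir",)))                # True: "dir" has children
print(list(trie.items(prefix=("dir", "sub"))))   # [(('dir', 'sub', 'b.txt'), {'md5': 'bbb'})]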
def __init__(self):
    self.trie = Trie()
def test_items_with_path(path_info, trie_dict, items):
    dir_info = DirInfo()
    dir_info.trie = Trie(trie_dict)
    assert list(dir_info.items(path_info)) == items
def __init__(self, price_tracker):
    self.price_tracker = price_tracker
    self.ngram_counter = Trie()
    self.templates = []
    self.template_id = 0
def test_merge(ancestor_dict, our_dict, their_dict, merged_dict):
    actual = _merge(Trie(ancestor_dict), Trie(our_dict), Trie(their_dict))
    expected = Trie(merged_dict)
    assert actual == expected
def __init__(self, user_dict: Union[str, Iterable] = None):
    self.t = posseg.POSTokenizer()
    self.t.initialize()
    self.trie = Trie()
    if user_dict:
        self.load_user_dict(user_dict)
def test_items(trie_dict, items):
    dir_info = DirInfo()
    dir_info.trie = Trie(trie_dict)
    assert list(dir_info.items()) == items
def test_list(lst, trie_dict):
    dir_info = DirInfo.from_list(lst)
    assert dir_info.trie == Trie(trie_dict)
    assert dir_info.to_list() == sorted(lst, key=itemgetter("relpath"))