def detect_path_overlap(paths):
    """
    Check for valid POSIX paths (i.e. ones that aren't duplicated and don't overlap).

    Overlapping paths are where one path terminates inside another (e.g. a/b and a/b/c).

    NOTE: The logic is copied from pulpcore.app.files.validate_file_paths().
    This function returns the first dupe or overlap it detects. We use a trie (or
    prefix tree) to keep track of which paths we've already seen.

    Args:
        paths (iterable of str): An iterable of strings each representing a relative path

    Returns:
        str: a path which overlaps or duplicates another
    """
    path_trie = StringTrie(separator="/")
    for path in paths:
        if path in path_trie:
            # path duplicates a path already in the trie
            return path

        if path_trie.has_subtrie(path):
            # overlap where path is 'a/b' and trie has 'a/b/c'
            return path

        prefixes = list(path_trie.prefixes(path))
        if prefixes:
            # overlap where path is 'a/b/c' and trie has 'a/b'
            return path

        # if there are no overlaps, add it to our trie and continue
        path_trie[path] = True
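# A minimal usage sketch (not part of the original source); it assumes
# `from pygtrie import StringTrie` is in scope for detect_path_overlap().
# "a/b/c" overlaps the previously seen "a/b", so that path is returned.
print(detect_path_overlap(["a/b", "x/y", "a/b/c"]))  # -> "a/b/c"
# With no duplicates or overlaps the loop falls through and returns None.
print(detect_path_overlap(["a/b", "a/c"]))  # -> None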
def find_entites(text: str, trie: StringTrie):
    tokens = text.split()
    start = 0
    count = 1  # start at 1, 0 is for the "NO_MATCH"
    entities = dict()
    for i in range(len(tokens)):
        key = "/".join(tokens[start : i + 1]).lower()
        if trie.has_subtrie(key):  # Not done yet
            if i == len(tokens) - 1:  # Reached the end of the string
                entities[count] = (get_entity(trie, key), start, i + 1)
        elif trie.has_key(key):  # noqa: W601  # Find a perfect match
            entities[count] = (trie[key], start, i + 1)
            count += 1
            start = i + 1
        elif start < i:  # Found partial prefix match before this token
            old_key = "/".join(tokens[start:i]).lower()
            entities[count] = (get_entity(trie, old_key), start, i)
            count += 1
            if trie.has_node(tokens[i].lower()):
                # Need to verify that the current token isn't in the Trie
                start = i
            else:
                start = i + 1
        else:  # No match
            start = i + 1
    return reduce_entities(entities)
def __init__(
    self,
    repo,
    subrepos=False,
    repo_factory: Callable[[str], "Repo"] = None,
    **kwargs
):
    super().__init__(repo, {"url": repo.root_dir})

    if not repo_factory:
        from dvc.repo import Repo

        self.repo_factory = Repo
    else:
        self.repo_factory = repo_factory

    self._main_repo = repo
    self.root_dir = repo.root_dir
    self._traverse_subrepos = subrepos

    self._subrepos_trie = StringTrie(separator=os.sep)
    """Keeps track of each and every path with the corresponding repo."""

    self._subrepos_trie[self.root_dir] = repo

    self._dvctrees = {}
    """Keep a dvctree instance of each repo."""

    self._dvctree_configs = kwargs

    if hasattr(repo, "dvc_dir"):
        self._dvctrees[repo.root_dir] = DvcTree(repo, **kwargs)
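# A standalone sketch (not DVC code) of what _subrepos_trie provides: map
# directories to the repo object that owns them and resolve by longest prefix.
# The string values below are placeholders for real Repo objects.
import os
from pygtrie import StringTrie

subrepos = StringTrie(separator=os.sep)
subrepos[os.path.join("work", "project")] = "main repo"
subrepos[os.path.join("work", "project", "sub")] = "subrepo"
prefix, repo = subrepos.longest_prefix(os.path.join("work", "project", "sub", "data"))
print(prefix, repo)  # -> work/project/sub subrepo (on POSIX)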
def __init__(self, tree, root_dir):
    from dvc.repo import Repo

    default_ignore_patterns = [".hg/", ".git/", "{}/".format(Repo.DVC_DIR)]

    self.tree = tree
    self.root_dir = root_dir
    self.ignores_trie_tree = StringTrie(separator=os.sep)
    self.ignores_trie_tree[root_dir] = DvcIgnorePatterns(
        default_ignore_patterns, root_dir
    )
    self._update(self.root_dir)
def validate_file_paths(paths):
    """
    Check for valid POSIX paths (i.e. ones that aren't duplicated and don't overlap).

    Overlapping paths are where one path terminates inside another (e.g. a/b and a/b/c).

    This function will raise an exception at the first dupe or overlap it detects. We use
    a trie (or prefix tree) to keep track of which paths we've already seen.

    Args:
        paths (iterable of str): An iterable of strings each representing a relative path

    Raises:
        ValueError: If any path overlaps another
    """
    overlap_error = _("The path for file '{path}' overlaps: {conflicts}")

    path_trie = StringTrie(separator="/")
    dups = []
    overlaps = []
    for path in paths:
        if path in path_trie:
            # path duplicates a path already in the trie
            dups.append(path)
        elif path_trie.has_subtrie(path):
            # overlap where path is 'a/b' and trie has 'a/b/c'
            conflicts = [item[0] for item in path_trie.items(prefix=path)]
            overlaps.append(
                overlap_error.format(path=path, conflicts=", ".join(conflicts))
            )
        else:
            prefixes = list(path_trie.prefixes(path))
            if prefixes:
                # overlap where path is 'a/b/c' and trie has 'a/b'
                conflicts = [prefix.key for prefix in prefixes]
                overlaps.append(
                    overlap_error.format(path=path, conflicts=", ".join(conflicts))
                )

        # if there are no overlaps, add it to our trie and continue
        path_trie[path] = True

    if dups or overlaps:
        dups_msg = ""
        overlaps_msg = ""
        if dups:
            dups_msg = _("Paths are duplicated: {paths}").format(paths=",".join(dups))
        if overlaps:
            overlaps_msg = "\n".join(overlaps)

        raise ValueError(
            _("Path errors found. {dups}\n{overlaps}").format(
                dups=dups_msg, overlaps=overlaps_msg
            )
        )
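# A hypothetical call (not from the original source); it assumes `_` is a
# no-op translation function (e.g. `_ = lambda s: s`) and StringTrie is imported.
try:
    validate_file_paths(["pkg/a", "pkg/a", "pkg/a/b"])
except ValueError as exc:
    print(exc)  # reports the duplicated "pkg/a" and the "pkg/a/b" overlap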
def __init__(
    self,
    sul: RERSConnectorV4 = None,
    separator=" ",
    storagepath=None,
    saveinterval=15,
):
    super().__init__(sul, storagepath, saveinterval)

    self.separator = separator
    self.cache = StringTrie(separator=separator)
    self.error_cache = StringTrie(separator=separator)
    self.invalid_cache = PrefixSet()

    # hookup rers cache
    self.sul.hookup_cache(self.cache, self.error_cache, self.invalid_cache)

    self.passthrough = False
def load_folder(self, folder):
    dic_csv = self.__map_dict.get(folder)
    if dic_csv is None:
        return
    logging.info(f"Read {dic_csv}")
    brands, models = self.read_brand(dic_csv)
    _brands: StringTrie = StringTrie()
    _brands_models: StringTrie = StringTrie()
    _models_brands: StringTrie = StringTrie()

    # Parse brands
    for S in brands:
        low = normalize_vn(normalize(S.lower()))
        ww = low.split()
        key = "/".join(ww)
        _brands["/".join(ww)] = S

    # All product lines and models
    ten_model_san_pham = [
        M for B, M in models if not (M is None or M is np.nan)
    ]
    m_ct = Counter(ten_model_san_pham)
    for B, M in models:
        if M is None or M is np.nan:
            continue
        b_low = normalize_vn(normalize(B.lower()))
        m_low = normalize_vn(normalize(M.lower()))
        ww = " ".join([b_low, m_low]).split()
        key = "/".join(ww)
        _brands_models[key] = M

        # Only use model names that are unique when mapping (line, model) to a brand:
        # at least 2 characters and not a number, e.g. Galaxy S20 -> Samsung
        # if M == "Galaxy":
        #     print(M)
        if len(M) < 2:
            continue
        if M.isdigit():
            continue
        if m_ct.get(M) > 1:
            # print("m_ct.get(K) :", M, m_ct.get(M))
            continue
        key = "/".join(m_low.split())
        _models_brands[key] = f"{B}:{M}"

    # self.__brands[folder] = _brands
    self.__brands_models[folder] = _brands_models
    self.__models_brands[folder] = _models_brands
def _get_trie(*list_replacement_items):
    replacement_items_ordereddict = OrderedDict()
    for replacement_items in list_replacement_items:
        replacement = replacement_items[0]
        items = replacement_items[1]
        replacement_items_ordereddict[replacement] = items

    trie = StringTrie(separator=' ')
    for key in replacement_items_ordereddict.keys():
        trie.update(
            StringTrie.fromkeys(
                replacement_items_ordereddict[key], value=key, separator=' '
            )
        )
    return trie
def time_test(length):
    array = np.random.random_integers(-2**31 - 1, 2**31, length)
    trie = StringTrie()
    for number in array:
        if not trie.has_key(bin(number)):
            trie[bin(number)] = 0
        trie[bin(number)] += 1

    time1 = time.time()
    for number in np.random.choice(array, 10000):
        trie[bin(number)]
    time2 = time.time()

    print("Access to array of length {} took {} ms".format(
        length, (time2 - time1) * 1000))
def grab_graftm_taxa(tax_ids_file):
    taxonomic_tree = StringTrie(separator='; ')
    with open(tax_ids_file) as tax_ids:
        header = tax_ids.readline().strip()
        if header != "tax_id,parent_id,rank,tax_name,root,kingdom,phylum,class,order,family,genus,species":
            logging.error("Unable to handle format of " + tax_ids_file + "!")
            sys.exit(21)
        line = tax_ids.readline().strip()
        while line:
            try:
                _, _, _, _, _, k_, p_, c_, o_, f_, g_, s_ = line.split(',')
            except IndexError:
                logging.error("Unexpected format of line in " + tax_ids_file + ":\n" + line)
                sys.exit(21)
            ranks = ["Root", k_, p_, c_, o_, f_, g_, s_]
            lineage_list = []
            # In case there are missing ranks... which is likely
            for rank in ranks:
                if rank:
                    # GraftM seems to append an 'e1' to taxa that are duplicated in the taxonomic lineage.
                    # For example: Bacteria; Aquificae; Aquificaee1; Aquificales
                    lineage_list.append(re.sub(r'e\d+$', '', rank))
                    # lineage_list.append(rank)
            lineage = re.sub('_', ' ', clean_lineage_string('; '.join(lineage_list)))

            i = 0
            ranks = len(lineage)
            while i < len(lineage):
                taxonomic_tree["; ".join(lineage.split("; ")[:ranks - i])] = True
                i += 1

            line = tax_ids.readline().strip()

    return taxonomic_tree
class DvcIgnorePatternsTrie(DvcIgnore):
    trie = None

    def __init__(self):
        if self.trie is None:
            self.trie = StringTrie(separator=os.sep)

    def __call__(self, root, dirs, files):
        ignore_pattern = self[root]
        if ignore_pattern:
            return ignore_pattern(root, dirs, files)
        return dirs, files

    def __setitem__(self, root, ignore_pattern):
        base_pattern = self[root]
        common_dirname, merged_pattern = merge_patterns(
            base_pattern.dirname,
            base_pattern.pattern_list,
            ignore_pattern.dirname,
            ignore_pattern.pattern_list,
        )
        self.trie[root] = DvcIgnorePatterns(merged_pattern, common_dirname)

    def __getitem__(self, root):
        ignore_pattern = self.trie.longest_prefix(root)
        if ignore_pattern:
            return ignore_pattern.value
        return DvcIgnorePatterns([], root)
def __init__(
    self,
    sul: object = None,
    separator: object = " ",
    storagepath: object = None,
    saveinterval: object = 15,
) -> object:
    super().__init__(sul, storagepath, saveinterval)

    self.cache = StringTrie(separator=separator)
    self.separator = separator
def __init__(self, tree, root_dir):
    from dvc.repo import Repo

    default_ignore_patterns = [".hg/", ".git/", "{}/".format(Repo.DVC_DIR)]

    self.tree = tree
    self.root_dir = root_dir
    self.ignores_trie_tree = StringTrie(separator=os.sep)
    self.ignores_trie_tree[root_dir] = DvcIgnorePatterns(
        default_ignore_patterns, root_dir
    )
    for root, dirs, _ in self.tree.walk(self.root_dir, use_dvcignore=False):
        self._update(root)
        self._update_sub_repo(root, dirs)
        dirs[:], _ = self(root, dirs, [])
def create_trie(names):
    trie = StringTrie(separator="/")
    for (name, qid) in names:
        q_name = eval(qid[0])[1]
        name = [x for x in name if x.lower() not in BLACK_LIST]
        for i in range(len(name)):
            trie["/".join(name[i:]).lower()] = (q_name, qid)
    return trie
def create_trie(names):
    trie = StringTrie(separator="/")
    for name in names:
        processed_name = [
            x for x in name["name"].split() if x.lower() not in BLACK_LIST
        ]
        for i in range(len(processed_name)):
            trie["/".join(processed_name[i:]).lower()] = (name["name"], name["ids"])
    return trie
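# Hypothetical usage (not from the original source); assumes BLACK_LIST is a
# set of lowercase stop-words such as {"the"}.
names = [{"name": "The New York Times", "ids": ["Q9684"]}]
trie = create_trie(names)
# Every suffix of the filtered name is indexed:
print(trie["new/york/times"])    # -> ("The New York Times", ["Q9684"])
print(trie.has_subtrie("york"))  # -> True, because "york/times" is also a key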
def load_taxonomic_trie(lineages: list) -> StringTrie:
    taxonomic_tree = StringTrie(separator='; ')

    for lineage in lineages:
        i = 0
        ranks = len(lineage)
        while i < len(lineage):
            taxonomic_tree["; ".join(lineage.split("; ")[:ranks - i])] = True
            i += 1

    return taxonomic_tree
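# A small illustrative call (not from the original source): every rank prefix
# of each "; "-separated lineage ends up as a key in the returned trie.
trie = load_taxonomic_trie(["Root; Bacteria; Proteobacteria"])
print("Root; Bacteria" in trie)            # -> True
print(trie.has_subtrie("Root; Bacteria"))  # -> True, deeper ranks exist below it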
class ProjectRegistry(type):
    """Registry for benchbuild projects."""

    projects = StringTrie()

    def __init__(cls, name, bases, attrs):
        """Register a project in the registry."""
        super(ProjectRegistry, cls).__init__(name, bases, attrs)

        if None not in {cls.NAME, cls.DOMAIN, cls.GROUP}:
            key = "{name}/{group}".format(name=cls.NAME, group=cls.GROUP)
            ProjectRegistry.projects[key] = cls
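# Hypothetical lookup sketch (not from the original source): keys are
# "NAME/GROUP" strings, so all registered groups of one project (here a
# made-up "bzip2") can be enumerated by prefix once such a project exists.
if ProjectRegistry.projects.has_subtrie("bzip2"):
    for key, project_cls in ProjectRegistry.projects.items(prefix="bzip2"):
        print(key, project_cls)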
def create_entity_dic_v2(df):
    def check_pros(R):
        it = R.to_dict()
        if it.get("location") == 1:
            return True
        if it.get("geo") == 1:
            return True
        for C in it:
            if C.startswith("is_") and it.get(C) == 1:
                return True
        return False

    df = df.loc[df["status"] == 1]
    filter: pd.DataFrame = df.loc[df.apply(lambda i: check_pros(i), axis=1)]
    pros_list = [C for C in list(filter.columns) if C.startswith("is_")]
    print("pros_list:", pros_list)
    pros_map = {V: K for K, V in enumerate(pros_list)}
    print("pros_map:", pros_map)

    def get_pros(R):
        r = [False for C in pros_map]
        for P in pros_map:
            r[pros_map.get(P)] = True if R.get(P) is not None and R.get(P) == 1 else False
        if R.get("location") == 1:
            r[pros_map.get("is_loc")] = True
        if R.get("geo") == 1:
            r[pros_map.get("is_geo")] = True
        return r

    dct_words, root = defaultdict(int), StringTrie()
    for R in filter.to_dict(orient="records"):
        ww = R.get("word").split()
        first_word = ww[0]
        if first_word not in dct_words:
            dct_words[first_word] = len(ww)
        else:
            max_len = dct_words.get(first_word)
            if max_len < len(ww):
                dct_words[first_word] = len(ww)
        root[R.get("word")] = get_pros(R)

    dictionary = {
        "root": root,
        "length": filter.id.count(),
        "start_sylls": dct_words,
        "pros_map": pros_map,
    }
    logging.info("Total defined names : %s" % filter.id.count())
    named_vocal_bin = path.join(conf.vocobulary_path, "entity_named.dic.bin")
    dump_to_file(dictionary, named_vocal_bin)
    logging.info("Save as : %s" % named_vocal_bin)
def __init__(self, file_path):
    self.file_path = Path(file_path)
    if not self.file_path.exists():
        raise FileNotFoundError(self.file_path)

    with self.open_zip() as archive:
        file_names = [
            file_.filename for file_ in archive.filelist if not file_.is_dir()
        ]

    self.file_trie = StringTrie.fromkeys(file_names)
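# A standalone sketch of the same idea (not from the original source): keys
# created with StringTrie.fromkeys default to a value of None, so use `in` /
# has_subtrie() rather than the truthiness of the stored value.
from pygtrie import StringTrie

file_trie = StringTrie.fromkeys(["docs/readme.txt", "src/main.py"])
print("docs/readme.txt" in file_trie)  # -> True
print(file_trie.has_subtrie("docs"))   # -> True, members exist under docs/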
def create_entity_dict(word_list):
    cur_dir = conf.vocobulary_path
    named_vocal_bin = path.join(cur_dir, "entity_named.dic.bin")
    result = [
        x for x in word_list
        if x.get("named") is not None and len(x.get("named")) > 0
    ]
    # result = result[-2:]
    logging.info("Total result exist named: %s" % len(result))

    dct = {i: set([]) for i in ENTITY.values()}
    letters = set([r.get("word")[0] for r in result if len(r.get("word")) > 1])
    letters = [i.lower() for i in letters if i.isalpha()]
    # last_modified = max([r.get("published_at") for r in result if "published_at" in r])

    for r in result:
        if r.get("named") is None or len(r.get("named")) < 1:
            continue
        wwl = normalize(r.get("word").lower())
        # if wwl == "anthony joshua":
        #     print(wwl)
        for v in r.get("named"):
            dct[v].add(wwl)

    counter, dct_words, root = 0, defaultdict(int), StringTrie()
    for feature in dct:
        for x in dct[feature]:
            try:
                if not x[0].isalpha():
                    continue
                ww = x.split()
                first_word = ww[0]
                if first_word not in dct_words:
                    dct_words[first_word] = len(ww)
                else:
                    max_len = dct_words.get(first_word)
                    if max_len < len(ww):
                        dct_words[first_word] = len(ww)
                bits = get_bits(dct, x)
                root[x] = bits
                counter += 1
            except Exception as e:
                logging.error(str(e), exc_info=True)
                exit()

    dictionary = {"root": root, "length": counter, "start_sylls": dct_words}
    logging.info("Total defined names : %s" % counter)
    dump_to_file(dictionary, named_vocal_bin)
    logging.info("Save as : %s" % named_vocal_bin)
    return True
def find_entites(text: str, trie: StringTrie, mask: str = MASK_TOKEN):
    tokens = text.split()
    tokens = fix_punct_tokens(tokens)
    start = 0
    count = 1  # start at 1, 0 is for the "NO_MATCH"
    entities = dict()
    out = []
    for i in range(len(tokens)):
        key = "/".join(tokens[start:i + 1]).lower()
        # name = " ".join(tokens[start: i + 1])
        if trie.has_subtrie(key):  # Not done yet
            if i == len(tokens) - 1:  # Reached the end of the string
                entities[count] = get_partial_match(trie, key)
                out.append(add_bold(get_entity(entities[count])))
        elif trie.has_key(key):  # noqa: W601  # Find a perfect match
            entities[count] = trie[key]
            out.append(add_bold(get_entity(entities[count])))
            count += 1
            start = i + 1
        elif start < i:  # Found partial prefix match before this token
            old_key = "/".join(tokens[start:i]).lower()
            # name = " ".join(tokens[start:i])
            entities[count] = get_partial_match(trie, old_key)
            out.append(add_bold(get_entity(entities[count])))
            count += 1
            if trie.has_node(tokens[i].lower()):
                # Need to verify that the current token isn't in the Trie
                start = i
            else:
                out.append(tokens[i])
                start = i + 1
        else:  # No match
            out.append(tokens[i])
            start = i + 1
    retokenized = "".join([
        " " + i if not i.startswith("'") and i not in PUNCT else i for i in out
    ]).strip()
    return retokenized, reduce_entities(entities)
class DvcIgnoreFilter:
    @staticmethod
    def _is_dvc_repo(root, directory):
        from dvc.repo import Repo

        return os.path.isdir(os.path.join(root, directory, Repo.DVC_DIR))

    def __init__(self, tree, root_dir):
        from dvc.repo import Repo

        default_ignore_patterns = [".hg/", ".git/", "{}/".format(Repo.DVC_DIR)]

        self.tree = tree
        self.root_dir = root_dir
        self.ignores_trie_tree = StringTrie(separator=os.sep)
        self.ignores_trie_tree[root_dir] = DvcIgnorePatterns(
            default_ignore_patterns, root_dir
        )
        for root, dirs, _ in self.tree.walk(self.root_dir, use_dvcignore=False):
            self._update(root)
            self._update_sub_repo(root, dirs)
            dirs[:], _ = self(root, dirs, [])

    def _update(self, dirname):
        ignore_file_path = os.path.join(dirname, DvcIgnore.DVCIGNORE_FILE)
        if self.tree.exists(ignore_file_path, use_dvcignore=False):
            new_pattern = DvcIgnorePatterns.from_files(ignore_file_path, self.tree)
            old_pattern = self._get_trie_pattern(dirname)
            if old_pattern:
                self.ignores_trie_tree[dirname] = DvcIgnorePatterns(
                    *merge_patterns(
                        old_pattern.pattern_list,
                        old_pattern.dirname,
                        new_pattern.pattern_list,
                        new_pattern.dirname,
                    )
                )
            else:
                self.ignores_trie_tree[dirname] = new_pattern

    def _update_sub_repo(self, root, dirs):
        for d in dirs:
            if self._is_dvc_repo(root, d):
                old_pattern = self._get_trie_pattern(root)
                if old_pattern:
                    self.ignores_trie_tree[root] = DvcIgnorePatterns(
                        *merge_patterns(
                            old_pattern.pattern_list,
                            old_pattern.dirname,
                            ["/{}/".format(d)],
                            root,
                        )
                    )
                else:
                    self.ignores_trie_tree[root] = DvcIgnorePatterns(
                        ["/{}/".format(d)], root
                    )

    def __call__(self, root, dirs, files):
        ignore_pattern = self._get_trie_pattern(root)
        if ignore_pattern:
            return ignore_pattern(root, dirs, files)
        else:
            return dirs, files

    def _get_trie_pattern(self, dirname):
        ignore_pattern = self.ignores_trie_tree.longest_prefix(dirname).value
        return ignore_pattern

    def _is_ignored(self, path, is_dir=False):
        if self._outside_repo(path):
            return True
        dirname, basename = os.path.split(os.path.normpath(path))
        ignore_pattern = self._get_trie_pattern(dirname)
        if ignore_pattern:
            return ignore_pattern.matches(dirname, basename, is_dir)
        else:
            return False

    def is_ignored_dir(self, path):
        path = os.path.abspath(path)
        if path == self.root_dir:
            return False

        return self._is_ignored(path, True)

    def is_ignored_file(self, path):
        return self._is_ignored(path, False)

    def _outside_repo(self, path):
        path = PathInfo(path)

        # paths outside of the repo should be ignored
        path = relpath(path, self.root_dir)
        if path.startswith("..") or (
            os.name == "nt"
            and not os.path.commonprefix([os.path.abspath(path), self.root_dir])
        ):
            return True
        return False
class DvcIgnoreFilter:
    @staticmethod
    def _is_dvc_repo(root, directory):
        from dvc.repo import Repo

        return os.path.isdir(os.path.join(root, directory, Repo.DVC_DIR))

    def __init__(self, tree, root_dir):
        from dvc.repo import Repo

        default_ignore_patterns = [".hg/", ".git/", "{}/".format(Repo.DVC_DIR)]

        self.tree = tree
        self.root_dir = root_dir
        self.ignores_trie_tree = StringTrie(separator=os.sep)
        self.ignores_trie_tree[root_dir] = DvcIgnorePatterns(
            default_ignore_patterns, root_dir
        )
        self._update(self.root_dir)

    def _update(self, dirname):
        old_pattern = self.ignores_trie_tree.longest_prefix(dirname).value
        matches = old_pattern.matches(dirname, DvcIgnore.DVCIGNORE_FILE, False)

        ignore_file_path = os.path.join(dirname, DvcIgnore.DVCIGNORE_FILE)
        if not matches and self.tree.exists(ignore_file_path, use_dvcignore=False):
            new_pattern = DvcIgnorePatterns.from_files(ignore_file_path, self.tree)
            if old_pattern:
                self.ignores_trie_tree[dirname] = DvcIgnorePatterns(
                    *merge_patterns(
                        old_pattern.pattern_list,
                        old_pattern.dirname,
                        new_pattern.pattern_list,
                        new_pattern.dirname,
                    )
                )
            else:
                self.ignores_trie_tree[dirname] = new_pattern
        elif old_pattern:
            self.ignores_trie_tree[dirname] = old_pattern

        # NOTE: using `walk` + `break` because tree doesn't have `listdir()`
        for root, dirs, _ in self.tree.walk(dirname, use_dvcignore=False):
            self._update_sub_repo(root, dirs)
            break

    def _update_sub_repo(self, root, dirs):
        for d in dirs:
            if self._is_dvc_repo(root, d):
                old_pattern = self.ignores_trie_tree.longest_prefix(root).value
                if old_pattern:
                    self.ignores_trie_tree[root] = DvcIgnorePatterns(
                        *merge_patterns(
                            old_pattern.pattern_list,
                            old_pattern.dirname,
                            ["/{}/".format(d)],
                            root,
                        )
                    )
                else:
                    self.ignores_trie_tree[root] = DvcIgnorePatterns(
                        ["/{}/".format(d)], root
                    )

    def __call__(self, root, dirs, files):
        ignore_pattern = self._get_trie_pattern(root)
        if ignore_pattern:
            return ignore_pattern(root, dirs, files)
        else:
            return dirs, files

    def _get_trie_pattern(self, dirname):
        ignore_pattern = self.ignores_trie_tree.get(dirname)
        if ignore_pattern:
            return ignore_pattern

        prefix = self.ignores_trie_tree.longest_prefix(dirname).key
        if not prefix:
            # outside of the repo
            return None

        dirs = list(
            takewhile(
                lambda path: path != prefix,
                (parent.fspath for parent in PathInfo(dirname).parents),
            )
        )
        dirs.reverse()
        dirs.append(dirname)

        for parent in dirs:
            self._update(parent)

        return self.ignores_trie_tree.get(dirname)

    def _is_ignored(self, path, is_dir=False):
        if self._outside_repo(path):
            return True
        dirname, basename = os.path.split(os.path.normpath(path))
        ignore_pattern = self._get_trie_pattern(dirname)
        if ignore_pattern:
            return ignore_pattern.matches(dirname, basename, is_dir)
        else:
            return False

    def is_ignored_dir(self, path):
        path = os.path.abspath(path)
        if path == self.root_dir:
            return False

        return self._is_ignored(path, True)

    def is_ignored_file(self, path):
        return self._is_ignored(path, False)

    def _outside_repo(self, path):
        path = PathInfo(path)

        # paths outside of the repo should be ignored
        path = relpath(path, self.root_dir)
        if path.startswith("..") or (
            os.name == "nt"
            and not os.path.commonprefix([os.path.abspath(path), self.root_dir])
        ):
            return True
        return False
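# A standalone illustration (not DVC code) of the trie lookup pattern used
# above: the value attached to the deepest ancestor directory wins.
import os
from pygtrie import StringTrie

ignores = StringTrie(separator=os.sep)
ignores["repo"] = ["default-patterns"]
ignores[os.path.join("repo", "data")] = ["default-patterns", "*.tmp"]
print(ignores.longest_prefix(os.path.join("repo", "data", "raw")).value)
# -> ['default-patterns', '*.tmp']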
from pygtrie import StringTrie  # type: ignore

dictionary = StringTrie()
with open("words.txt") as words:
    for word in words.read().splitlines():
        if word:
            dictionary[word] = word
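# A small hypothetical lookup (not from the original source; assumes words.txt
# holds one word per line). Note that StringTrie splits keys on "/", so each
# whole word is a single token here and lookups are exact-word lookups.
print("apple" in dictionary)    # True if "apple" was a line in words.txt
print(dictionary.get("apple"))  # the word itself, or None if absent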
def url_from_urn(upstream_urls: pygtrie.StringTrie, urn: str = None) -> str:
    _, url = upstream_urls.longest_prefix(urn)
    return url
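# Hypothetical usage (not from the original source; the ":" separator is an
# assumption): the trie maps URN prefixes to upstream URLs and the longest
# matching prefix wins.
import pygtrie

upstream_urls = pygtrie.StringTrie(separator=":")
upstream_urls["urn:example"] = "https://example.org"
upstream_urls["urn:example:docs"] = "https://docs.example.org"
print(url_from_urn(upstream_urls, "urn:example:docs:page-1"))  # -> https://docs.example.org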
def __init__(self):
    if self.trie is None:
        self.trie = StringTrie(separator=os.sep)
def deck_trie(self):
    return StringTrie(**self.decks_by_name(), separator=AnkiDeck.deck_name_separator)
class RepoTree(BaseTree):  # pylint:disable=abstract-method
    """DVC + git-tracked files tree.

    Args:
        repo: DVC or git repo.
        subrepos: traverse to subrepos (by default, it ignores subrepos)
        repo_factory: A function to initialize subrepo with, default is Repo.
        kwargs: Additional keyword arguments passed to the `DvcTree()`.
    """

    scheme = "local"
    PARAM_CHECKSUM = "md5"

    def __init__(
        self,
        repo,
        subrepos=False,
        repo_factory: Callable[[str], "Repo"] = None,
        **kwargs
    ):
        super().__init__(repo, {"url": repo.root_dir})

        if not repo_factory:
            from dvc.repo import Repo

            self.repo_factory = Repo
        else:
            self.repo_factory = repo_factory

        self._main_repo = repo
        self.root_dir = repo.root_dir
        self._traverse_subrepos = subrepos

        self._subrepos_trie = StringTrie(separator=os.sep)
        """Keeps track of each and every path with the corresponding repo."""

        self._subrepos_trie[self.root_dir] = repo

        self._dvctrees = {}
        """Keep a dvctree instance of each repo."""

        self._dvctree_configs = kwargs

        if hasattr(repo, "dvc_dir"):
            self._dvctrees[repo.root_dir] = DvcTree(repo, **kwargs)

    def _get_repo(self, path) -> Optional["Repo"]:
        """Returns repo that the path falls in, using prefix.

        If the path is already tracked/collected, it just returns the repo.

        Otherwise, it collects the repos that might be in the path's parents
        and then returns the appropriate one.
        """
        repo = self._subrepos_trie.get(path)
        if repo:
            return repo

        prefix, repo = self._subrepos_trie.longest_prefix(path)
        if not prefix:
            return None

        parents = (parent.fspath for parent in PathInfo(path).parents)
        dirs = [path] + list(takewhile(lambda p: p != prefix, parents))
        dirs.reverse()
        self._update(dirs, starting_repo=repo)
        return self._subrepos_trie.get(path)

    @wrap_with(threading.Lock())
    def _update(self, dirs, starting_repo):
        """Checks for subrepo in directories and updates them."""
        repo = starting_repo
        for d in dirs:
            if self._is_dvc_repo(d):
                repo = self.repo_factory(d)
                self._dvctrees[repo.root_dir] = DvcTree(repo, **self._dvctree_configs)
            self._subrepos_trie[d] = repo

    def _is_dvc_repo(self, dir_path):
        """Check if the directory is a dvc repo."""
        if not self._traverse_subrepos:
            return False

        from dvc.repo import Repo

        repo_path = os.path.join(dir_path, Repo.DVC_DIR)
        # dvcignore will ignore subrepos, therefore using `use_dvcignore=False`
        return self._main_repo.tree.isdir(repo_path, use_dvcignore=False)

    def _get_tree_pair(self, path) -> Tuple[Union["GitTree", "LocalTree"], DvcTree]:
        """
        Returns a pair of trees based on repo the path falls in, using prefix.
        """
        path = os.path.abspath(path)

        # fallback to the top-level repo if repo was not found
        # this can happen if the path is outside of the repo
        repo = self._get_repo(path) or self._main_repo

        dvc_tree = self._dvctrees.get(repo.root_dir)
        return repo.tree, dvc_tree

    @property
    def fetch(self):
        return "fetch" in self._dvctree_configs

    @property
    def stream(self):
        return "stream" in self._dvctree_configs

    def open(
        self, path, mode="r", encoding="utf-8", **kwargs
    ):  # pylint: disable=arguments-differ
        if "b" in mode:
            encoding = None

        tree, dvc_tree = self._get_tree_pair(path)
        if dvc_tree and dvc_tree.exists(path):
            return dvc_tree.open(path, mode=mode, encoding=encoding, **kwargs)
        return tree.open(path, mode=mode, encoding=encoding)

    def exists(self, path, use_dvcignore=True):  # pylint: disable=arguments-differ
        tree, dvc_tree = self._get_tree_pair(path)
        return tree.exists(path) or (dvc_tree and dvc_tree.exists(path))

    def isdir(self, path):  # pylint: disable=arguments-differ
        tree, dvc_tree = self._get_tree_pair(path)
        return tree.isdir(path) or (dvc_tree and dvc_tree.isdir(path))

    def isdvc(self, path, **kwargs):
        _, dvc_tree = self._get_tree_pair(path)
        return dvc_tree is not None and dvc_tree.isdvc(path, **kwargs)

    def isfile(self, path):  # pylint: disable=arguments-differ
        tree, dvc_tree = self._get_tree_pair(path)
        return tree.isfile(path) or (dvc_tree and dvc_tree.isfile(path))

    def isexec(self, path):
        tree, dvc_tree = self._get_tree_pair(path)
        if dvc_tree and dvc_tree.exists(path):
            return dvc_tree.isexec(path)
        return tree.isexec(path)

    def stat(self, path):
        tree, _ = self._get_tree_pair(path)
        return tree.stat(path)

    def _dvc_walk(self, walk):
        try:
            root, dirs, files = next(walk)
        except StopIteration:
            return
        yield root, dirs, files
        for _ in dirs:
            yield from self._dvc_walk(walk)

    def _subrepo_walk(self, dir_path, **kwargs):
        """Walk into a new repo.

        NOTE: subrepo will only be discovered when walking if
        ignore_subrepos is set to False.
        """
        tree, dvc_tree = self._get_tree_pair(dir_path)
        tree_walk = tree.walk(
            dir_path, topdown=True, ignore_subrepos=not self._traverse_subrepos
        )
        if dvc_tree:
            dvc_walk = dvc_tree.walk(dir_path, topdown=True, **kwargs)
        else:
            dvc_walk = None
        yield from self._walk(tree_walk, dvc_walk, **kwargs)

    def _walk(self, repo_walk, dvc_walk=None, dvcfiles=False):
        assert repo_walk
        try:
            _, dvc_dirs, dvc_fnames = next(dvc_walk) if dvc_walk else (None, [], [])
            repo_root, repo_dirs, repo_fnames = next(repo_walk)
        except StopIteration:
            return

        # separate subdirs into shared dirs, dvc-only dirs, repo-only dirs
        dvc_set = set(dvc_dirs)
        repo_set = set(repo_dirs)
        dvc_only = list(dvc_set - repo_set)
        repo_only = list(repo_set - dvc_set)
        shared = list(dvc_set & repo_set)
        dirs = shared + dvc_only + repo_only

        # merge file lists
        files = {
            fname
            for fname in dvc_fnames + repo_fnames
            if dvcfiles or not is_valid_filename(fname)
        }

        yield repo_root, dirs, list(files)

        # set dir order for next recursion level - shared dirs first so that
        # next() for both generators recurses into the same shared directory
        dvc_dirs[:] = [dirname for dirname in dirs if dirname in dvc_set]
        repo_dirs[:] = [dirname for dirname in dirs if dirname in repo_set]

        for dirname in dirs:
            dir_path = os.path.join(repo_root, dirname)
            if self._is_dvc_repo(dir_path):
                yield from self._subrepo_walk(dir_path, dvcfiles=dvcfiles)
            elif dirname in shared:
                yield from self._walk(repo_walk, dvc_walk, dvcfiles=dvcfiles)
            elif dirname in dvc_set:
                yield from self._dvc_walk(dvc_walk)
            elif dirname in repo_set:
                yield from self._walk(repo_walk, None, dvcfiles=dvcfiles)

    def walk(self, top, topdown=True, onerror=None, dvcfiles=False, **kwargs):
        # pylint: disable=arguments-differ
        """Walk and merge both DVC and repo trees.

        Args:
            top: path to walk from
            topdown: if True, tree will be walked from top down.
            onerror: if set, onerror function will be called if an error
                occurs (by default errors are ignored).
            dvcfiles: if True, dvcfiles will be included in the files list
                for walked directories.

        Any kwargs will be passed into methods used for fetching and/or
        streaming DVC outs from remotes.
        """
        assert topdown

        if not self.exists(top):
            if onerror is not None:
                onerror(FileNotFoundError(top))
            return

        if not self.isdir(top):
            if onerror is not None:
                onerror(NotADirectoryError(top))
            return

        tree, dvc_tree = self._get_tree_pair(top)
        dvc_exists = dvc_tree and dvc_tree.exists(top)
        repo_exists = tree.exists(top)
        if dvc_exists:
            dvc_walk = dvc_tree.walk(top, topdown=topdown, **kwargs)
            if repo_exists:
                repo_walk = tree.walk(
                    top,
                    topdown=topdown,
                    ignore_subrepos=not self._traverse_subrepos,
                )
                yield from self._walk(repo_walk, dvc_walk, dvcfiles=dvcfiles)
            else:
                yield from dvc_walk
        else:
            repo_walk = tree.walk(
                top,
                topdown=topdown,
                onerror=onerror,
                ignore_subrepos=not self._traverse_subrepos,
            )
            yield from self._walk(repo_walk, None, dvcfiles=dvcfiles)

    def walk_files(self, top, **kwargs):  # pylint: disable=arguments-differ
        for root, _, files in self.walk(top, **kwargs):
            for fname in files:
                yield PathInfo(root) / fname

    def get_file_hash(self, path_info):
        """Return file checksum for specified path.

        If path_info is a DVC out, the pre-computed checksum for the file
        will be used. If path_info is a git file, MD5 will be computed for
        the git object.
        """
        if not self.exists(path_info):
            raise FileNotFoundError
        _, dvc_tree = self._get_tree_pair(path_info)
        if dvc_tree and dvc_tree.exists(path_info):
            try:
                return dvc_tree.get_file_hash(path_info)
            except OutputNotFoundError:
                pass
        return self.PARAM_CHECKSUM, file_md5(path_info, self)[0]

    def copytree(self, top, dest):
        top = PathInfo(top)
        dest = PathInfo(dest)

        if not self.exists(top):
            raise FileNotFoundError

        if self.isfile(top):
            makedirs(dest.parent, exist_ok=True)
            with self.open(top, mode="rb") as fobj:
                copy_fobj_to_file(fobj, dest)
            return

        for root, _, files in self.walk(top):
            root = PathInfo(root)
            dest_dir = dest / root.relative_to(top)
            makedirs(dest_dir, exist_ok=True)
            for fname in files:
                src = root / fname
                with self.open(src, mode="rb") as fobj:
                    copy_fobj_to_file(fobj, dest_dir / fname)

    @property
    def hash_jobs(self):  # pylint: disable=invalid-overridden-method
        return self._main_repo.tree.hash_jobs

    def metadata(self, path):
        path_info = PathInfo(os.path.abspath(path))
        tree, dvc_tree = self._get_tree_pair(path_info)

        dvc_meta = None
        if dvc_tree:
            with suppress(OutputNotFoundError):
                dvc_meta = dvc_tree.metadata(path_info)

        stat_result = None
        with suppress(FileNotFoundError):
            stat_result = tree.stat(path_info)

        if not stat_result and not dvc_meta:
            raise FileNotFoundError

        meta = dvc_meta or Metadata(path_info=path_info)

        isdir = bool(stat_result) and stat.S_ISDIR(stat_result.st_mode)
        meta.isdir = meta.isdir or isdir

        if not dvc_meta:
            meta.is_exec = bool(stat_result) and is_exec(stat_result.st_mode)
        return meta
def init_trie_functions_and_info(
    separator: str,
) -> Tuple[StringTrie, StringTrie, dict]:
    if len(separator) != 1:
        raise ValueError("SeparatorError: the separator must be char and not string!")

    commands_trie, commands_info_trie = (
        StringTrie(separator=separator),
        StringTrie(separator=separator),
    )

    # Defining commands according to the amount of words in the command
    # 2 words command
    commands_trie["show log"] = __show_log
    commands_trie["set timer"] = __add_new_entry
    commands_trie["show db"] = __show_db_values
    # 3 words command
    commands_trie["show logs all"] = __show_logs_all
    commands_trie["show logs last"] = __show_logs
    commands_trie["show logs top"] = __show_logs
    commands_trie["show logs range"] = __show_logs
    commands_trie["show node status"] = __show_node_status
    # commands_trie["show system status"] = show_system_status
    commands_trie["set multicast ip"] = __add_new_entry
    commands_trie["log value add"] = __add_new_entry
    commands_trie["log value delete"] = __add_new_entry
    commands_trie["log value edit"] = __add_new_entry

    # Defines the explanations at each node and what values it receives
    # 1 word command
    commands_info_trie["show"] = "Prints data on CLI"
    commands_info_trie["set"] = "Sets a value in the database"
    commands_info_trie["log"] = "Displays actions on LOGS in RAFT algorithm"
    # 2 words command
    commands_info_trie["show node"] = "action on node parameters"
    # commands_info_trie["show system"] = "action on system parameters (all nodes in RAFT)"
    commands_info_trie["show log"] = "shows last log in node"
    commands_info_trie["show logs"] = "shows all logs in node"
    commands_info_trie["show db"] = "shows all logs in specific DB"
    commands_info_trie["set multicast"] = "action on multicast IP"
    commands_info_trie["set timer"] = "A number between 0.150 to 0.300"
    commands_info_trie["log value"] = "A set of actions functions like add, delete, edit ..."
    # 3 words command
    commands_info_trie["show node all"] = "Shows all logs in cluster"
    commands_info_trie["show logs last"] = "A positive number greater than 0"
    commands_info_trie["show logs top"] = "A positive number greater than 0"
    commands_info_trie["show logs range"] = "A range between two positive numbers"
    commands_info_trie["show node status"] = "Shows all node parameters in cluster"
    # commands_info_trie["show system status"] = "Shows all nodes parameters in system"
    commands_info_trie["set multicast ip"] = "IP A.B.C.D according to the protocol"
    commands_info_trie["log value add"] = "Adds a new log <key, value> to the cluster"
    commands_info_trie["log value delete"] = "Deletes log <key> from the cluster"
    commands_info_trie["log value edit"] = "Edits log <key> in the cluster"

    special_words_dict = {
        "range": (2, [int, int]),
        "add": (2, [str, int]),
        "ip": (1, [str]),
        "timer": (1, [float]),
        "delete": (1, [str]),
        "edit": (2, [str, int]),
        "top": (1, [int]),
        "last": (1, [int]),
        "db": (1, [str]),
    }

    return commands_trie, commands_info_trie, special_words_dict
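# A standalone sketch (not from the original source) of how such a command
# trie can dispatch input: the longest registered prefix selects the handler
# and the remaining words become its arguments.
from pygtrie import StringTrie

commands = StringTrie(separator=" ")
commands["show logs last"] = lambda n: print("last", n, "logs")  # stand-in handler
line = "show logs last 5"
step = commands.longest_prefix(line)
if step:
    args = line[len(step.key):].split()  # -> ["5"]
    step.value(*args)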
class LifecycleMetaEvent(MetaEvent):
    """Lifecycle meta event"""

    __event__ = "meta_event.lifecycle"
    meta_event_type: Literal["lifecycle"]
    sub_type: str


class HeartbeatMetaEvent(MetaEvent):
    """Heartbeat meta event"""

    __event__ = "meta_event.heartbeat"
    meta_event_type: Literal["heartbeat"]
    status: Status
    interval: int


_t = StringTrie(separator=".")

# define `model` first to avoid globals changing while `for`
model = None
for model in globals().values():
    if not inspect.isclass(model) or not issubclass(model, Event):
        continue
    _t["." + model.__event__] = model


def get_event_model(event_name) -> List[Type[Event]]:
    """
    :Description:

      Get the list of ``Event Model`` and ``FallBack Event Model`` classes
      corresponding to the event name