Exemple #1
0
    def detect_path_overlap(paths):
        """
        Find the first relative POSIX path that duplicates or overlaps an earlier one.

        Two paths overlap when one terminates inside the other (e.g. a/b and a/b/c).
        Every accepted path is stored in a trie (prefix tree), and each new path is
        checked against the trie before being added.

        NOTE: The logic is copied from pulpcore.app.files.validate_file_paths().

        Args:
            paths (iterable of str): An iterable of strings each representing a relative path

        Returns:
            str: the first path which overlaps or duplicates an earlier one, or None when
                no such path is found

        """
        seen = StringTrie(separator="/")
        for candidate in paths:
            conflict = (
                # candidate duplicates a path already in the trie
                candidate in seen
                # candidate is 'a/b' and the trie already has 'a/b/c'
                or seen.has_subtrie(candidate)
                # candidate is 'a/b/c' and the trie already has 'a/b'
                or any(True for _step in seen.prefixes(candidate))
            )
            if conflict:
                return candidate

            # no conflict: remember the path and keep going
            seen[candidate] = True
        return None
Exemple #2
0
def find_entites(text: str, trie: StringTrie):
    """Collect entity matches for the whitespace-separated tokens of ``text``.

    Consecutive tokens are lower-cased and joined with "/" to form trie keys.
    The current span grows while its key is still a strict prefix of some trie
    key, and is closed on an exact match or when the next token breaks it.

    Args:
        text: the text to scan.
        trie: trie whose keys are "/"-joined lower-cased token sequences.

    Returns:
        Whatever ``reduce_entities`` returns for the collected mapping of
        match id (starting at 1) to ``(entity, start, end)`` tuples.
    """
    words = text.split()
    last_index = len(words) - 1
    span_start = 0
    next_id = 1  # start at 1, 0 is for the "NO_MATCH"
    found = {}
    for pos, word in enumerate(words):
        span_end = pos + 1
        candidate = "/".join(words[span_start:span_end]).lower()

        if trie.has_subtrie(candidate):
            # Longer keys are still possible; only record at the end of the text.
            if pos == last_index:
                found[next_id] = (get_entity(trie, candidate), span_start, span_end)
            continue

        if trie.has_key(candidate):  # noqa: W601 # exact match
            found[next_id] = (trie[candidate], span_start, span_end)
            next_id += 1
            span_start = span_end
            continue

        if span_start < pos:
            # The tokens before this one formed a partial prefix match.
            previous = "/".join(words[span_start:pos]).lower()
            found[next_id] = (get_entity(trie, previous), span_start, pos)
            next_id += 1
            # Restart at this token only if it is itself a node of the trie.
            span_start = pos if trie.has_node(word.lower()) else span_end
            continue

        # No match at all for this token.
        span_start = span_end
    return reduce_entities(found)
Exemple #3
0
    def __init__(self,
                 repo,
                 subrepos=False,
                 repo_factory: Callable[[str], "Repo"] = None,
                 **kwargs):
        """Initialise the tree for ``repo``.

        Args:
            repo: the main repo; its ``root_dir`` becomes this tree's root.
            subrepos: stored as ``_traverse_subrepos``.
            repo_factory: callable stored as ``repo_factory``; when falsy,
                ``dvc.repo.Repo`` is used.
            **kwargs: stored as ``_dvctree_configs`` and forwarded to
                ``DvcTree`` for the main repo.
        """
        super().__init__(repo, {"url": repo.root_dir})

        if repo_factory:
            self.repo_factory = repo_factory
        else:
            from dvc.repo import Repo

            self.repo_factory = Repo

        self._main_repo = repo
        self.root_dir = repo.root_dir
        self._traverse_subrepos = subrepos

        # Keeps track of each and every path with the corresponding repo.
        self._subrepos_trie = StringTrie(separator=os.sep)
        self._subrepos_trie[self.root_dir] = repo

        # Keep a dvctree instance of each repo.
        self._dvctrees = {}
        self._dvctree_configs = kwargs

        # Only repos exposing ``dvc_dir`` get a DvcTree up front.
        if hasattr(repo, "dvc_dir"):
            self._dvctrees[repo.root_dir] = DvcTree(repo, **kwargs)
Exemple #4
0
    def __init__(self, tree, root_dir):
        """Seed the ignore trie with the default patterns and load ``root_dir``.

        Args:
            tree: tree object stored as ``self.tree``.
            root_dir: root directory the default patterns are registered under.
        """
        from dvc.repo import Repo

        self.tree = tree
        self.root_dir = root_dir

        # Default patterns: Mercurial, Git and DVC metadata directories.
        root_patterns = DvcIgnorePatterns(
            [".hg/", ".git/", f"{Repo.DVC_DIR}/"], root_dir)
        self.ignores_trie_tree = StringTrie(separator=os.sep)
        self.ignores_trie_tree[root_dir] = root_patterns
        self._update(self.root_dir)
Exemple #5
0
def validate_file_paths(paths):
    """
    Validate relative POSIX paths: none may be duplicated and none may overlap.

    Two paths overlap when one terminates inside the other (e.g. a/b and a/b/c).

    All paths are examined; every duplicate and overlap found is collected and reported in a
    single exception. A trie (prefix tree) keeps track of the paths already seen.

    Args:
        paths (iterable of str): An iterable of strings each representing a relative path

    Raises:
        ValueError: If any path duplicates or overlaps another
    """
    overlap_error = _("The path for file '{path}' overlaps: {conflicts}")

    seen = StringTrie(separator="/")
    duplicates = []
    overlap_messages = []
    for path in paths:
        if path in seen:
            # path duplicates a path already in the trie
            duplicates.append(path)
        else:
            if seen.has_subtrie(path):
                # overlap where path is 'a/b' and trie has 'a/b/c'
                conflicts = [key for key, _value in seen.items(prefix=path)]
                overlapping = True
            else:
                # overlap where path is 'a/b/c' and trie has 'a/b'
                conflicts = [step.key for step in seen.prefixes(path)]
                overlapping = bool(conflicts)
            if overlapping:
                overlap_messages.append(
                    overlap_error.format(path=path,
                                         conflicts=", ".join(conflicts)))

        # the path is recorded in every case so later paths are checked against it
        seen[path] = True

    if not (duplicates or overlap_messages):
        return

    if duplicates:
        dups_msg = _("Paths are duplicated: {paths}").format(
            paths=",".join(duplicates))
    else:
        dups_msg = ""
    overlaps_msg = "\n".join(overlap_messages)

    raise ValueError(
        _("Path errors found. {dups}\n{overlaps}").format(
            dups=dups_msg, overlaps=overlaps_msg))
Exemple #6
0
    def __init__(self, sul: RERSConnectorV4 = None, separator=" ", storagepath=None, saveinterval=15):
        """Create the caches and register them with the system under learning.

        Args:
            sul: connector forwarded to the base class; its ``hookup_cache``
                receives the three caches created here.
            separator: separator used by both tries.
            storagepath: forwarded to the base class.
            saveinterval: forwarded to the base class.
        """
        super().__init__(sul, storagepath, saveinterval)
        self.separator = separator

        cache = StringTrie(separator=separator)
        error_cache = StringTrie(separator=separator)
        invalid_cache = PrefixSet()
        self.cache = cache
        self.error_cache = error_cache
        self.invalid_cache = invalid_cache

        # Hook up the RERS cache.
        self.sul.hookup_cache(cache, error_cache, invalid_cache)

        self.passthrough = False
Exemple #7
0
    def load_folder(self, folder):
        """Build the brand/model lookup tries for ``folder``.

        Does nothing when ``folder`` has no entry in the dictionary map.
        Otherwise three tries keyed by "/"-joined normalised words are stored
        under ``folder``: brands, brand+model, and model -> "brand:model".

        Args:
            folder: key into the dictionary map and into the resulting tries.
        """
        dic_csv = self.__map_dict.get(folder)
        if dic_csv is None:
            return
        logging.info(f"Read {dic_csv}")
        brands, models = self.read_brand(dic_csv)

        def to_key(text):
            # Collapse whitespace-separated words into a "/"-joined trie key.
            return "/".join(text.split())

        # Parse brands
        brand_trie = StringTrie()
        for brand in brands:
            brand_trie[to_key(normalize_vn(normalize(brand.lower())))] = brand

        # All product lines / models: count how often each model name occurs.
        model_counts = Counter(
            model for _brand, model in models
            if not (model is None or model is np.nan)
        )

        brand_model_trie = StringTrie()
        model_brand_trie = StringTrie()
        for brand, model in models:
            if model is None or model is np.nan:
                continue
            brand_norm = normalize_vn(normalize(brand.lower()))
            model_norm = normalize_vn(normalize(model.lower()))
            brand_model_trie[to_key(" ".join([brand_norm, model_norm]))] = model

            # Only model names that are unique, at least 2 characters long and
            # not purely numeric are mapped back to their brand,
            # e.g. Galaxy S20 -> Samsung.
            if len(model) < 2 or model.isdigit() or model_counts[model] > 1:
                continue
            model_brand_trie[to_key(model_norm)] = f"{brand}:{model}"

        self.__brands[folder] = brand_trie
        self.__brands_models[folder] = brand_model_trie
        self.__models_brands[folder] = model_brand_trie
def _get_trie(*list_replacement_items):
    """Build a space-separated trie mapping each item to its replacement.

    Args:
        *list_replacement_items: sequences whose element 0 is the replacement
            and element 1 is an iterable of items to map to it. A repeated
            replacement keeps its original position but takes the last
            ``items`` given for it.

    Returns:
        StringTrie: trie in which every item is a key and the value is its
        replacement.
    """
    items_by_replacement = OrderedDict()
    for pair in list_replacement_items:
        items_by_replacement[pair[0]] = pair[1]

    trie = StringTrie(separator=' ')
    for replacement, items in items_by_replacement.items():
        partial = StringTrie.fromkeys(items, value=replacement, separator=' ')
        trie.update(partial)
    return trie
Exemple #9
0
def time_test(length):
    """Time 10000 random trie lookups over ``length`` random integers.

    A trie is filled with the binary string of each generated integer (the
    value is the number of times it occurred); then 10000 randomly chosen
    members are looked up and the elapsed time is printed in milliseconds.

    Args:
        length: number of random integers to generate.
    """
    # ``random_integers`` is deprecated. ``randint`` excludes its upper bound,
    # hence the +1 to keep the original inclusive range; the explicit 64-bit
    # dtype is needed because the bounds do not fit in 32 bits.
    array = np.random.randint(-2**31 - 1, 2**31 + 1, size=length,
                              dtype=np.int64)

    trie = StringTrie()

    for number in array:
        key = bin(number)
        trie[key] = trie.get(key, 0) + 1

    # perf_counter is monotonic and meant for measuring short durations.
    started = time.perf_counter()
    for number in np.random.choice(array, 10000):
        trie[bin(number)]
    elapsed = time.perf_counter() - started
    print("Access to array of length {} took {} ms".format(
        length, elapsed * 1000))
Exemple #10
0
def grab_graftm_taxa(tax_ids_file):
    """Load the lineages of a GraftM taxonomy table into a trie.

    The file must start with the expected 12-column header. For every data
    line the non-empty ranks (prefixed with "Root") form a "; "-separated
    lineage, and every leading prefix of that lineage is stored in the trie.
    Reading stops at the first blank line or at end of file.

    Args:
        tax_ids_file (str): path to the comma-separated taxonomy table.

    Returns:
        StringTrie: trie (separator "; ") with one ``True`` entry per lineage
        prefix.

    Exits the process with status 21 when the header or a line is malformed.
    """
    expected_header = "tax_id,parent_id,rank,tax_name,root,kingdom,phylum,class,order,family,genus,species"
    taxonomic_tree = StringTrie(separator='; ')
    with open(tax_ids_file) as tax_ids:
        header = tax_ids.readline().strip()
        if header != expected_header:
            logging.error("Unable to handle format of " + tax_ids_file + "!")
            sys.exit(21)
        for raw_line in tax_ids:
            line = raw_line.strip()
            if not line:
                # Kept from the original loop: a blank line ends the table.
                break
            try:
                _, _, _, _, _, k_, p_, c_, o_, f_, g_, s_ = line.split(',')
            except ValueError:
                # A wrong number of fields makes the unpacking raise
                # ValueError (not IndexError).
                logging.error("Unexpected format of line in " + tax_ids_file +
                              ":\n" + line)
                sys.exit(21)
            # In case there are missing ranks... which is likely.
            # GraftM seems to append an 'e1' to taxa that are duplicated in
            # the taxonomic lineage, for example:
            # Bacteria; Aquificae; Aquificaee1; Aquificales
            lineage_list = [
                re.sub(r'e\d+$', '', rank)
                for rank in ("Root", k_, p_, c_, o_, f_, g_, s_)
                if rank
            ]
            lineage = re.sub('_', ' ',
                             clean_lineage_string('; '.join(lineage_list)))
            if lineage:
                # Split once, then register every prefix, longest first.
                lineage_ranks = lineage.split("; ")
                for depth in range(len(lineage_ranks), 0, -1):
                    taxonomic_tree["; ".join(lineage_ranks[:depth])] = True
    return taxonomic_tree
Exemple #11
0
class DvcIgnorePatternsTrie(DvcIgnore):
    """Ignore patterns stored in a trie keyed by directory path.

    Lookups return the patterns registered for the longest matching path
    prefix; assignments merge the new patterns with those found by lookup.
    """

    trie = None

    def __init__(self):
        if self.trie is not None:
            return
        self.trie = StringTrie(separator=os.sep)

    def __call__(self, root, dirs, files):
        """Apply the patterns found for ``root``; pass through when falsy."""
        pattern = self[root]
        if not pattern:
            return dirs, files
        return pattern(root, dirs, files)

    def __setitem__(self, root, ignore_pattern):
        """Merge ``ignore_pattern`` into what applies at ``root`` and store it."""
        existing = self[root]
        merged = merge_patterns(
            existing.dirname,
            existing.pattern_list,
            ignore_pattern.dirname,
            ignore_pattern.pattern_list,
        )
        common_dirname, merged_pattern = merged
        self.trie[root] = DvcIgnorePatterns(merged_pattern, common_dirname)

    def __getitem__(self, root):
        """Return the patterns of the longest stored prefix of ``root``.

        Falls back to an empty ``DvcIgnorePatterns`` for ``root`` when no
        prefix is stored.
        """
        step = self.trie.longest_prefix(root)
        return step.value if step else DvcIgnorePatterns([], root)
Exemple #12
0
 def __init__(self,
              sul: object = None,
              separator: str = " ",
              storagepath: object = None,
              saveinterval: int = 15) -> None:
     """Initialise the base class and create the trie-backed cache.

     Args:
         sul: forwarded to the base class.
         separator: key separator used by the cache trie.
         storagepath: forwarded to the base class.
         saveinterval: forwarded to the base class.
     """
     super().__init__(sul, storagepath, saveinterval)
     self.cache = StringTrie(separator=separator)
     self.separator = separator
Exemple #13
0
    def __init__(self, tree, root_dir):
        """Seed the ignore trie with defaults, then walk the whole tree.

        Each visited directory has its ignore file and sub-repos registered,
        and the walk is pruned by applying the filter to ``dirs`` in place.

        Args:
            tree: tree object providing ``walk``.
            root_dir: directory the walk starts from.
        """
        from dvc.repo import Repo

        self.tree = tree
        self.root_dir = root_dir

        # Default patterns: Mercurial, Git and DVC metadata directories.
        root_patterns = DvcIgnorePatterns(
            [".hg/", ".git/", f"{Repo.DVC_DIR}/"], root_dir
        )
        self.ignores_trie_tree = StringTrie(separator=os.sep)
        self.ignores_trie_tree[root_dir] = root_patterns

        walker = self.tree.walk(self.root_dir, use_dvcignore=False)
        for root, dirs, _ in walker:
            self._update(root)
            self._update_sub_repo(root, dirs)
            # Assign in place so the walker sees the pruned directory list.
            dirs[:], _ = self(root, dirs, [])
Exemple #14
0
def create_trie(names):
    """Build a trie of every suffix of each name's filtered token list.

    Args:
        names: iterable of ``(name, qid)`` pairs. ``name`` is a sequence of
            tokens; ``qid[0]`` is a string that evaluates to a sequence whose
            element 1 is stored with the entry.

    Returns:
        StringTrie: keys are "/"-joined lower-cased token suffixes, values are
        ``(q_name, qid)`` tuples.
    """
    # ``separator`` is the StringTrie option; ``delimiter`` is not one, so it
    # was stored as a bogus trie entry ("delimiter" -> "/").
    trie = StringTrie(separator="/")
    for (name, qid) in names:
        # NOTE(security): eval() executes arbitrary code; this is only safe if
        # qid comes from a trusted source. Consider ast.literal_eval if the
        # payload is a plain literal.
        q_name = eval(qid[0])[1]
        name = [x for x in name if x.lower() not in BLACK_LIST]
        for i in range(len(name)):
            trie["/".join(name[i:]).lower()] = (q_name, qid)
    return trie
Exemple #15
0
def create_trie(names):
    """Build a trie of every suffix of each name's filtered word list.

    Args:
        names: iterable of mappings with a ``"name"`` string and an ``"ids"``
            value. The name is split on whitespace and words found in
            ``BLACK_LIST`` (case-insensitively) are dropped.

    Returns:
        StringTrie: keys are "/"-joined lower-cased word suffixes, values are
        ``(name["name"], name["ids"])`` tuples.
    """
    # ``separator`` is the StringTrie option; ``delimiter`` is not one, so it
    # was stored as a bogus trie entry ("delimiter" -> "/").
    trie = StringTrie(separator="/")
    for name in names:
        processed_name = [
            x for x in name["name"].split() if x.lower() not in BLACK_LIST
        ]
        for i in range(len(processed_name)):
            trie["/".join(processed_name[i:]).lower()] = (name["name"], name["ids"])
    return trie
def load_taxonomic_trie(lineages: list) -> StringTrie:
    """Store every leading prefix of each "; "-separated lineage in a trie.

    Args:
        lineages: lineage strings such as ``"Root; Bacteria; Aquificae"``.

    Returns:
        StringTrie: trie (separator "; ") with one ``True`` entry per lineage
        prefix. Empty lineage strings add nothing.
    """
    taxonomic_tree = StringTrie(separator='; ')

    for lineage in lineages:
        if not lineage:
            continue
        # Split once and walk the ranks; the previous loop ran once per
        # character of the string and re-split it every time.
        ranks = lineage.split("; ")
        for depth in range(len(ranks), 0, -1):
            taxonomic_tree["; ".join(ranks[:depth])] = True

    return taxonomic_tree
Exemple #17
0
class ProjectRegistry(type):
    """Registry for benchbuild projects."""

    projects = StringTrie()

    def __init__(cls, name, bases, attrs):
        """Register a project in the registry.

        A class is registered under "<NAME>/<GROUP>" only when none of its
        NAME, DOMAIN and GROUP attributes is None.
        """
        super().__init__(name, bases, attrs)

        identity = (cls.NAME, cls.DOMAIN, cls.GROUP)
        if None in identity:
            return
        ProjectRegistry.projects[f"{cls.NAME}/{cls.GROUP}"] = cls
Exemple #18
0
def create_entity_dic_v2(df):
    """Build the named-entity dictionary from ``df`` and dump it to disk.

    Rows with ``status == 1`` that have ``location``, ``geo`` or any ``is_*``
    column equal to 1 are kept. Each kept word is stored in a trie with a list
    of booleans, one per ``is_*`` column. The result is written to
    ``entity_named.dic.bin`` under ``conf.vocobulary_path``.

    Args:
        df: DataFrame with at least ``status``, ``word`` and ``id`` columns.
    """
    def check_pros(R):
        # True when the row carries at least one property flag set to 1.
        it = R.to_dict()
        if it.get("location") == 1 or it.get("geo") == 1:
            return True
        return any(C.startswith("is_") and it.get(C) == 1 for C in it)

    active = df.loc[df["status"] == 1]
    selected = active.loc[active.apply(check_pros, axis=1)]
    pros_list = [C for C in selected.columns if C.startswith("is_")]
    print("pros_list:", pros_list)
    pros_map = {name: index for index, name in enumerate(pros_list)}
    print("pros_map:", pros_map)

    def get_pros(R):
        # One boolean per "is_*" column, positioned according to pros_map.
        flags = [False] * len(pros_map)
        for name, index in pros_map.items():
            value = R.get(name)
            flags[index] = bool(value is not None and value == 1)
        # "location" and "geo" are folded into the "is_loc" / "is_geo" slots.
        if R.get("location") == 1:
            flags[pros_map.get("is_loc")] = True
        if R.get("geo") == 1:
            flags[pros_map.get("is_geo")] = True
        return flags

    dct_words = defaultdict(int)
    root = StringTrie()
    for record in selected.to_dict(orient="records"):
        word = record.get("word")
        parts = word.split()
        first_word = parts[0]
        # Track the longest word count seen for each first word.
        if first_word not in dct_words or dct_words.get(first_word) < len(parts):
            dct_words[first_word] = len(parts)
        root[word] = get_pros(record)

    dictionary = {
        "root": root,
        "length": selected.id.count(),
        "start_sylls": dct_words,
        "pros_map": pros_map,
    }
    logging.info("Total defined names : %s" % selected.id.count())
    named_vocal_bin = path.join(conf.vocobulary_path, "entity_named.dic.bin")
    dump_to_file(dictionary, named_vocal_bin)
    logging.info("Save as : %s" % named_vocal_bin)
Exemple #19
0
    def __init__(self, file_path):
        """Index the non-directory members of the archive at ``file_path``.

        Args:
            file_path: path to the archive.

        Raises:
            FileNotFoundError: if ``file_path`` does not exist.
        """
        archive_path = Path(file_path)
        self.file_path = archive_path
        if not archive_path.exists():
            raise FileNotFoundError(archive_path)

        file_names = []
        with self.open_zip() as archive:
            for entry in archive.filelist:
                if entry.is_dir():
                    continue
                file_names.append(entry.filename)

        self.file_trie = StringTrie.fromkeys(file_names)
Exemple #20
0
def create_entity_dict(word_list):
    """Build the named-entity dictionary from ``word_list`` and dump it.

    Records with a non-empty ``"named"`` list contribute their normalised,
    lower-cased ``"word"`` to each entity type listed there. Every word that
    starts with a letter is stored in a trie with the bits returned by
    ``get_bits``. The result is written to ``entity_named.dic.bin`` under
    ``conf.vocobulary_path``.

    Args:
        word_list: iterable of mappings with ``"word"`` and ``"named"`` keys.

    Returns:
        bool: always ``True`` on completion. Any error while filling the trie
        is logged and terminates the process.
    """
    named_vocal_bin = path.join(conf.vocobulary_path, "entity_named.dic.bin")
    result = [
        x for x in word_list
        if x.get("named") is not None and len(x.get("named")) > 0
    ]
    logging.info("Total result exist named: %s" % len(result))

    # Entity type -> set of normalised words tagged with that type.
    dct = {i: set() for i in ENTITY.values()}
    for r in result:
        wwl = normalize(r.get("word").lower())
        for v in r.get("named"):
            dct[v].add(wwl)

    counter, dct_words, root = 0, defaultdict(int), StringTrie()
    for feature in dct:
        for x in dct[feature]:
            try:
                if not x[0].isalpha():
                    continue
                ww = x.split()
                first_word = ww[0]
                # Track the longest word count seen for each first word.
                if first_word not in dct_words or dct_words.get(first_word) < len(ww):
                    dct_words[first_word] = len(ww)
                root[x] = get_bits(dct, x)
                counter += 1
            except Exception as e:
                logging.error(str(e), exc_info=True)
                # ``exit()`` is a ``site`` helper that is not always present;
                # raising SystemExit stops the process the same way.
                raise SystemExit

    dictionary = {"root": root, "length": counter, "start_sylls": dct_words}
    logging.info("Total defined names : %s" % counter)
    dump_to_file(dictionary, named_vocal_bin)
    logging.info("Save as : %s" % named_vocal_bin)
    return True
Exemple #21
0
def find_entites(text: str, trie: StringTrie, mask: str = MASK_TOKEN):
    """Find entities in ``text`` and rebuild the text with matches in bold.

    Consecutive tokens are lower-cased and joined with "/" to form trie keys.
    The current span grows while its key is still a strict prefix of some trie
    key, and is closed on an exact match or when the next token breaks it.
    Matched spans are replaced in the output by their bold entity text;
    unmatched tokens are copied through.

    Args:
        text: the text to scan.
        trie: trie whose keys are "/"-joined lower-cased token sequences.
        mask: accepted for interface compatibility; not used here.

    Returns:
        tuple: ``(retokenized_text, reduce_entities(entities))``.
    """
    words = fix_punct_tokens(text.split())
    last_index = len(words) - 1
    span_start = 0
    next_id = 1  # start at 1, 0 is for the "NO_MATCH"
    found = {}
    out = []
    for pos, word in enumerate(words):
        span_end = pos + 1
        candidate = "/".join(words[span_start:span_end]).lower()

        if trie.has_subtrie(candidate):
            # Longer keys are still possible; only record at the end of the text.
            if pos == last_index:
                match = get_partial_match(trie, candidate)
                found[next_id] = match
                out.append(add_bold(get_entity(match)))
            continue

        if trie.has_key(candidate):  # noqa: W601  # exact match
            match = trie[candidate]
            found[next_id] = match
            out.append(add_bold(get_entity(match)))
            next_id += 1
            span_start = span_end
            continue

        if span_start < pos:
            # The tokens before this one formed a partial prefix match.
            previous = "/".join(words[span_start:pos]).lower()
            match = get_partial_match(trie, previous)
            found[next_id] = match
            out.append(add_bold(get_entity(match)))
            next_id += 1
            # Restart at this token only if it is itself a node of the trie.
            if trie.has_node(word.lower()):
                span_start = pos
            else:
                out.append(word)
                span_start = span_end
            continue

        # No match at all for this token.
        out.append(word)
        span_start = span_end

    # Pieces starting with an apostrophe and punctuation attach to the
    # previous piece; everything else is preceded by a space.
    pieces = []
    for piece in out:
        attached = piece.startswith("'") or piece in PUNCT
        pieces.append(piece if attached else " " + piece)
    retokenized = "".join(pieces).strip()
    return retokenized, reduce_entities(found)
Exemple #22
0
class DvcIgnoreFilter:
    """Decide which paths are ignored, using patterns stored per directory.

    Patterns live in a trie keyed by directory path and are collected eagerly
    by walking the whole tree at construction time. Lookups use the patterns
    of the longest stored prefix of a directory.
    """

    @staticmethod
    def _is_dvc_repo(root, directory):
        """Return True if ``root/directory`` contains a DVC directory."""
        from dvc.repo import Repo

        dvc_dir = os.path.join(root, directory, Repo.DVC_DIR)
        return os.path.isdir(dvc_dir)

    def __init__(self, tree, root_dir):
        from dvc.repo import Repo

        self.tree = tree
        self.root_dir = root_dir

        # Default patterns: Mercurial, Git and DVC metadata directories.
        root_patterns = DvcIgnorePatterns(
            [".hg/", ".git/", f"{Repo.DVC_DIR}/"], root_dir
        )
        self.ignores_trie_tree = StringTrie(separator=os.sep)
        self.ignores_trie_tree[root_dir] = root_patterns

        walker = self.tree.walk(self.root_dir, use_dvcignore=False)
        for root, dirs, _ in walker:
            self._update(root)
            self._update_sub_repo(root, dirs)
            # Assign in place so the walker sees the pruned directory list.
            dirs[:], _ = self(root, dirs, [])

    def _update(self, dirname):
        """Merge the ignore file of ``dirname``, if any, into the trie."""
        ignore_file_path = os.path.join(dirname, DvcIgnore.DVCIGNORE_FILE)
        if not self.tree.exists(ignore_file_path, use_dvcignore=False):
            return

        new_pattern = DvcIgnorePatterns.from_files(
            ignore_file_path, self.tree
        )
        old_pattern = self._get_trie_pattern(dirname)
        if not old_pattern:
            self.ignores_trie_tree[dirname] = new_pattern
            return

        merged = merge_patterns(
            old_pattern.pattern_list,
            old_pattern.dirname,
            new_pattern.pattern_list,
            new_pattern.dirname,
        )
        self.ignores_trie_tree[dirname] = DvcIgnorePatterns(*merged)

    def _update_sub_repo(self, root, dirs):
        """Add an ignore rule at ``root`` for every child that is a DVC repo."""
        for d in dirs:
            if not self._is_dvc_repo(root, d):
                continue

            subrepo_rule = ["/{}/".format(d)]
            old_pattern = self._get_trie_pattern(root)
            if old_pattern:
                pattern = DvcIgnorePatterns(
                    *merge_patterns(
                        old_pattern.pattern_list,
                        old_pattern.dirname,
                        subrepo_rule,
                        root,
                    )
                )
            else:
                pattern = DvcIgnorePatterns(subrepo_rule, root)
            self.ignores_trie_tree[root] = pattern

    def __call__(self, root, dirs, files):
        """Filter ``dirs`` and ``files`` with the patterns applying to ``root``."""
        ignore_pattern = self._get_trie_pattern(root)
        if not ignore_pattern:
            return dirs, files
        return ignore_pattern(root, dirs, files)

    def _get_trie_pattern(self, dirname):
        """Return the value stored for the longest trie prefix of ``dirname``."""
        return self.ignores_trie_tree.longest_prefix(dirname).value

    def _is_ignored(self, path, is_dir=False):
        """Return True if ``path`` is outside the repo or matches a pattern."""
        if self._outside_repo(path):
            return True
        dirname, basename = os.path.split(os.path.normpath(path))
        ignore_pattern = self._get_trie_pattern(dirname)
        if not ignore_pattern:
            return False
        return ignore_pattern.matches(dirname, basename, is_dir)

    def is_ignored_dir(self, path):
        """Return True if directory ``path`` is ignored; the root never is."""
        path = os.path.abspath(path)
        if path == self.root_dir:
            return False

        return self._is_ignored(path, True)

    def is_ignored_file(self, path):
        """Return True if file ``path`` is ignored."""
        return self._is_ignored(path, False)

    def _outside_repo(self, path):
        """Return True if ``path`` lies outside ``root_dir``."""
        path = PathInfo(path)

        # paths outside of the repo should be ignored
        path = relpath(path, self.root_dir)
        if path.startswith(".."):
            return True
        if os.name != "nt":
            return False
        # On Windows, no common prefix with the root also counts as outside.
        return not os.path.commonprefix(
            [os.path.abspath(path), self.root_dir]
        )
Exemple #23
0
class DvcIgnoreFilter:
    """Decide which paths are ignored, using patterns stored per directory.

    Patterns live in a trie keyed by directory path. Only the root directory
    is loaded at construction time; other directories are loaded on demand by
    ``_get_trie_pattern``.
    """

    @staticmethod
    def _is_dvc_repo(root, directory):
        """Return True if ``root/directory`` contains a DVC directory."""
        from dvc.repo import Repo

        dvc_dir = os.path.join(root, directory, Repo.DVC_DIR)
        return os.path.isdir(dvc_dir)

    def __init__(self, tree, root_dir):
        from dvc.repo import Repo

        self.tree = tree
        self.root_dir = root_dir

        # Default patterns: Mercurial, Git and DVC metadata directories.
        root_patterns = DvcIgnorePatterns(
            [".hg/", ".git/", f"{Repo.DVC_DIR}/"], root_dir)
        self.ignores_trie_tree = StringTrie(separator=os.sep)
        self.ignores_trie_tree[root_dir] = root_patterns
        self._update(self.root_dir)

    def _update(self, dirname):
        """Register the patterns applying to ``dirname`` in the trie.

        The ignore file of ``dirname`` is merged in unless the inherited
        patterns ignore that file itself; otherwise the inherited patterns
        are stored as-is. Direct sub-repos are then registered.
        """
        old_pattern = self.ignores_trie_tree.longest_prefix(dirname).value
        matches = old_pattern.matches(dirname, DvcIgnore.DVCIGNORE_FILE, False)

        ignore_file_path = os.path.join(dirname, DvcIgnore.DVCIGNORE_FILE)
        has_usable_ignore_file = not matches and self.tree.exists(
            ignore_file_path, use_dvcignore=False)

        if has_usable_ignore_file:
            new_pattern = DvcIgnorePatterns.from_files(ignore_file_path,
                                                       self.tree)
            if old_pattern:
                merged = merge_patterns(
                    old_pattern.pattern_list,
                    old_pattern.dirname,
                    new_pattern.pattern_list,
                    new_pattern.dirname,
                )
                self.ignores_trie_tree[dirname] = DvcIgnorePatterns(*merged)
            else:
                self.ignores_trie_tree[dirname] = new_pattern
        elif old_pattern:
            self.ignores_trie_tree[dirname] = old_pattern

        # NOTE: using `walk` + `break` because tree doesn't have `listdir()`
        for root, dirs, _ in self.tree.walk(dirname, use_dvcignore=False):
            self._update_sub_repo(root, dirs)
            break

    def _update_sub_repo(self, root, dirs):
        """Add an ignore rule at ``root`` for every child that is a DVC repo."""
        for d in dirs:
            if not self._is_dvc_repo(root, d):
                continue

            subrepo_rule = ["/{}/".format(d)]
            old_pattern = self.ignores_trie_tree.longest_prefix(root).value
            if old_pattern:
                pattern = DvcIgnorePatterns(
                    *merge_patterns(
                        old_pattern.pattern_list,
                        old_pattern.dirname,
                        subrepo_rule,
                        root,
                    ))
            else:
                pattern = DvcIgnorePatterns(subrepo_rule, root)
            self.ignores_trie_tree[root] = pattern

    def __call__(self, root, dirs, files):
        """Filter ``dirs`` and ``files`` with the patterns applying to ``root``."""
        ignore_pattern = self._get_trie_pattern(root)
        if not ignore_pattern:
            return dirs, files
        return ignore_pattern(root, dirs, files)

    def _get_trie_pattern(self, dirname):
        """Return the patterns for ``dirname``, loading them if necessary.

        When nothing is stored for ``dirname`` yet, every directory between
        the longest stored prefix and ``dirname`` is updated, outermost
        first. Returns None when no stored prefix exists.
        """
        cached = self.ignores_trie_tree.get(dirname)
        if cached:
            return cached

        prefix = self.ignores_trie_tree.longest_prefix(dirname).key
        if not prefix:
            # outside of the repo
            return None

        # Collect dirname and its parents up to (excluding) the known prefix.
        pending = [dirname]
        for parent in PathInfo(dirname).parents:
            parent_path = parent.fspath
            if parent_path == prefix:
                break
            pending.append(parent_path)

        for directory in reversed(pending):
            self._update(directory)

        return self.ignores_trie_tree.get(dirname)

    def _is_ignored(self, path, is_dir=False):
        """Return True if ``path`` is outside the repo or matches a pattern."""
        if self._outside_repo(path):
            return True
        dirname, basename = os.path.split(os.path.normpath(path))
        ignore_pattern = self._get_trie_pattern(dirname)
        if not ignore_pattern:
            return False
        return ignore_pattern.matches(dirname, basename, is_dir)

    def is_ignored_dir(self, path):
        """Return True if directory ``path`` is ignored; the root never is."""
        path = os.path.abspath(path)
        if path == self.root_dir:
            return False

        return self._is_ignored(path, True)

    def is_ignored_file(self, path):
        """Return True if file ``path`` is ignored."""
        return self._is_ignored(path, False)

    def _outside_repo(self, path):
        """Return True if ``path`` lies outside ``root_dir``."""
        path = PathInfo(path)

        # paths outside of the repo should be ignored
        path = relpath(path, self.root_dir)
        if path.startswith(".."):
            return True
        if os.name != "nt":
            return False
        # On Windows, no common prefix with the root also counts as outside.
        return not os.path.commonprefix(
            [os.path.abspath(path), self.root_dir])
Exemple #24
0
from pygtrie import StringTrie  # type: ignore

# Trie mapping every non-empty line of words.txt to itself.
dictionary = StringTrie()

with open("words.txt") as words:
    for word in words.read().splitlines():
        if word:  # skip blank lines
            dictionary[word] = word
Exemple #25
0
def url_from_urn(upstream_urls: pygtrie.StringTrie, urn: str = None) -> str:
    """Return the value stored under the longest prefix of ``urn``.

    Args:
        upstream_urls: trie whose values are presumably upstream URLs keyed
            by URN prefix (verify against the caller).
        urn: the URN to look up.

    Returns:
        The second element of the ``longest_prefix`` result for ``urn``.
    """
    _matched_prefix, url = upstream_urls.longest_prefix(urn)
    return url
Exemple #26
0
 def __init__(self):
     """Create the path trie unless ``self.trie`` is already set."""
     if self.trie is not None:
         return
     self.trie = StringTrie(separator=os.sep)
Exemple #27
0
 def deck_trie(self):
     """Return a trie of the decks keyed by name.

     Keys are split on ``AnkiDeck.deck_name_separator``. The mapping from
     ``decks_by_name()`` is passed positionally rather than expanded with
     ``**``, so a deck named "separator" cannot clash with the constructor's
     own keyword argument.
     """
     return StringTrie(self.decks_by_name(),
                       separator=AnkiDeck.deck_name_separator)
Exemple #28
0
class RepoTree(BaseTree):  # pylint:disable=abstract-method
    """DVC + git-tracked files tree.

    Args:
        repo: DVC or git repo.
        subrepos: traverse to subrepos (by default, it ignores subrepos)
        repo_factory: A function to initialize subrepo with, default is Repo.
        kwargs: Additional keyword arguments passed to the `DvcTree()`.
    """

    scheme = "local"
    PARAM_CHECKSUM = "md5"

    def __init__(self,
                 repo,
                 subrepos=False,
                 repo_factory: Callable[[str], "Repo"] = None,
                 **kwargs):
        """Set up the tree for ``repo``.

        Args:
            repo: main repo; its ``root_dir`` becomes this tree's root.
            subrepos: stored as ``_traverse_subrepos``; while falsy,
                ``_is_dvc_repo`` never reports a subrepo.
            repo_factory: callable used by ``_update`` to build a repo for a
                directory detected as a subrepo. Falls back to
                ``dvc.repo.Repo`` when not given.
            kwargs: kept as ``_dvctree_configs`` and passed to every
                ``DvcTree`` this object creates.
        """
        super().__init__(repo, {"url": repo.root_dir})

        if repo_factory:
            self.repo_factory = repo_factory
        else:
            # Imported only when no factory was supplied.
            from dvc.repo import Repo

            self.repo_factory = Repo

        self._main_repo = repo
        self.root_dir = repo.root_dir
        self._traverse_subrepos = subrepos

        # Keeps track of each and every path with the corresponding repo.
        self._subrepos_trie = StringTrie(separator=os.sep)
        self._subrepos_trie[self.root_dir] = repo

        # Keeps a DvcTree instance per repo, keyed by the repo's root_dir.
        self._dvctrees = {}
        self._dvctree_configs = kwargs

        if hasattr(repo, "dvc_dir"):
            self._dvctrees[repo.root_dir] = DvcTree(repo, **kwargs)

    def _get_repo(self, path) -> Optional["Repo"]:
        """Return the repo that ``path`` belongs to.

        A path already recorded in the trie is answered directly. Otherwise
        the longest recorded prefix of ``path`` is located, every directory
        between that prefix and ``path`` (inclusive of ``path``) is handed
        to ``_update`` from the top down, and the trie is queried again.

        Returns:
            The repo recorded for ``path``, or ``None`` when no recorded
            path is a prefix of it.
        """
        known = self._subrepos_trie.get(path)
        if known:
            return known

        prefix, owner = self._subrepos_trie.longest_prefix(path)
        if not prefix:
            return None

        # Ancestors of `path` that lie below the recorded prefix, nearest
        # ancestor first.
        ancestors = takewhile(
            lambda candidate: candidate != prefix,
            (info.fspath for info in PathInfo(path).parents),
        )
        dirs = [path, *ancestors]
        # Top-most directory first, `path` itself last.
        dirs.reverse()
        self._update(dirs, starting_repo=owner)
        return self._subrepos_trie.get(path)

    @wrap_with(threading.Lock())
    def _update(self, dirs, starting_repo):
        """Record the owning repo for every directory in ``dirs``.

        Directories are processed in the given order. Each one is recorded
        under the repo currently in effect, which starts as
        ``starting_repo`` and switches to a freshly created repo (with its
        own ``DvcTree``) whenever ``_is_dvc_repo`` reports a subrepo.
        """
        current = starting_repo
        for directory in dirs:
            if self._is_dvc_repo(directory):
                current = self.repo_factory(directory)
                self._dvctrees[current.root_dir] = DvcTree(
                    current, **self._dvctree_configs)
            self._subrepos_trie[directory] = current

    def _is_dvc_repo(self, dir_path):
        """Tell whether ``dir_path`` contains a DVC control directory.

        Always ``False`` while subrepo traversal is disabled.
        """
        if self._traverse_subrepos:
            from dvc.repo import Repo

            marker = os.path.join(dir_path, Repo.DVC_DIR)
            # dvcignore will ignore subrepos, so it is bypassed for this check
            return self._main_repo.tree.isdir(marker, use_dvcignore=False)

        return False

    def _get_tree_pair(self,
                       path) -> Tuple[Union["GitTree", "LocalTree"], DvcTree]:
        """Return ``(tree, dvc_tree)`` for the repo that ``path`` falls in.

        ``path`` is made absolute before the lookup. ``dvc_tree`` is
        ``None`` when no ``DvcTree`` is registered for that repo.
        """
        abs_path = os.path.abspath(path)

        repo = self._get_repo(abs_path)
        if not repo:
            # No repo found (this can happen if the path is outside of the
            # repo): fall back to the top-level one.
            repo = self._main_repo

        dvc_tree = self._dvctrees.get(repo.root_dir)
        return repo.tree, dvc_tree

    @property
    def fetch(self):
        """Whether a truthy ``fetch`` option is among the DvcTree settings.

        The stored value is inspected rather than mere key presence, so a
        tree configured with an explicit ``fetch=False`` reports ``False``.
        """
        return bool(self._dvctree_configs.get("fetch"))

    @property
    def stream(self):
        """Whether a truthy ``stream`` option is among the DvcTree settings.

        The stored value is inspected rather than mere key presence, so a
        tree configured with an explicit ``stream=False`` reports ``False``.
        """
        return bool(self._dvctree_configs.get("stream"))

    def open(self, path, mode="r", encoding="utf-8", **kwargs):  # pylint: disable=arguments-differ
        """Open ``path`` from the DVC tree if it exists there, else the repo tree.

        Args:
            path: path to open.
            mode: open mode; when it contains ``"b"`` no encoding is passed.
            encoding: text encoding for non-binary modes.
            kwargs: forwarded to the DVC tree's ``open`` only.
        """
        encoding = None if "b" in mode else encoding

        tree, dvc_tree = self._get_tree_pair(path)
        in_dvc = dvc_tree and dvc_tree.exists(path)
        if not in_dvc:
            return tree.open(path, mode=mode, encoding=encoding)
        return dvc_tree.open(path, mode=mode, encoding=encoding, **kwargs)

    def exists(self, path, use_dvcignore=True):  # pylint: disable=arguments-differ
        """Tell whether ``path`` exists in the repo tree or in the DVC tree.

        ``use_dvcignore`` is accepted but not forwarded to either tree.
        The result is truthy/falsy rather than a strict ``bool``.
        """
        tree, dvc_tree = self._get_tree_pair(path)
        in_repo = tree.exists(path)
        if in_repo:
            return in_repo
        return dvc_tree and dvc_tree.exists(path)

    def isdir(self, path):  # pylint: disable=arguments-differ
        """Tell whether ``path`` is a directory in the repo tree or the DVC tree.

        The result is truthy/falsy rather than a strict ``bool``.
        """
        tree, dvc_tree = self._get_tree_pair(path)
        repo_answer = tree.isdir(path)
        if repo_answer:
            return repo_answer
        return dvc_tree and dvc_tree.isdir(path)

    def isdvc(self, path, **kwargs):
        """Ask the DVC tree for ``path`` whether it is a DVC path.

        Returns ``False`` when there is no DVC tree for the path's repo;
        ``kwargs`` are forwarded to the DVC tree's ``isdvc``.
        """
        _, dvc_tree = self._get_tree_pair(path)
        if dvc_tree is None:
            return False
        return dvc_tree.isdvc(path, **kwargs)

    def isfile(self, path):  # pylint: disable=arguments-differ
        """Tell whether ``path`` is a file in the repo tree or the DVC tree.

        The result is truthy/falsy rather than a strict ``bool``.
        """
        tree, dvc_tree = self._get_tree_pair(path)
        repo_answer = tree.isfile(path)
        if repo_answer:
            return repo_answer
        return dvc_tree and dvc_tree.isfile(path)

    def isexec(self, path):
        """Return the ``isexec`` answer of whichever tree holds ``path``.

        The DVC tree is asked when it exists and contains ``path``;
        otherwise the repo tree is asked.
        """
        tree, dvc_tree = self._get_tree_pair(path)
        source = dvc_tree if (dvc_tree and dvc_tree.exists(path)) else tree
        return source.isexec(path)

    def stat(self, path):
        """Return the repo tree's ``stat`` result for ``path``.

        The DVC tree is not consulted.
        """
        repo_tree, _dvc_tree = self._get_tree_pair(path)
        result = repo_tree.stat(path)
        return result

    def _dvc_walk(self, walk):
        """Relay entries of the DVC-only walker ``walk``.

        Yields the walker's next ``(root, dirs, files)`` entry and then
        recurses once for every directory listed in it. Stops quietly when
        the walker is exhausted.
        """
        exhausted = object()
        entry = next(walk, exhausted)
        if entry is exhausted:
            return

        root, dirs, files = entry
        yield root, dirs, files
        # `dirs` is read after the yield, so edits made by the consumer to
        # that list change how many recursive steps follow.
        for _ in dirs:
            yield from self._dvc_walk(walk)

    def _subrepo_walk(self, dir_path, **kwargs):
        """Start a merged walk rooted at the subrepo directory ``dir_path``.

        The tree pair is resolved afresh for ``dir_path``; a DVC walker is
        only created when that repo has a DVC tree. ``kwargs`` are passed to
        both the DVC walker and ``_walk``.

        NOTE: the repo walker is told to ignore subrepos unless
        ``_traverse_subrepos`` is set.
        """
        tree, dvc_tree = self._get_tree_pair(dir_path)
        tree_walk = tree.walk(
            dir_path,
            topdown=True,
            ignore_subrepos=not self._traverse_subrepos,
        )
        dvc_walk = (dvc_tree.walk(dir_path, topdown=True, **kwargs)
                    if dvc_tree else None)
        yield from self._walk(tree_walk, dvc_walk, **kwargs)

    def _walk(self, repo_walk, dvc_walk=None, dvcfiles=False):
        """Merge one level of the repo walker and the DVC walker, then recurse.

        Args:
            repo_walk: walker over the repo tree (must be truthy).
            dvc_walk: walker over the DVC tree, or ``None`` when there is
                nothing to merge from DVC at this level.
            dvcfiles: when false, file names for which ``is_valid_filename``
                returns true are left out of the yielded file list.

        Yields:
            ``(root, dirs, files)`` where ``root`` comes from the repo
            walker, ``dirs`` lists shared directories first, then DVC-only,
            then repo-only ones, and ``files`` is the de-duplicated union of
            both walkers' file names.
        """
        assert repo_walk
        try:
            # Advance both walkers by one entry; a missing DVC walker
            # contributes empty dir/file lists.
            _, dvc_dirs, dvc_fnames = (next(dvc_walk) if dvc_walk else
                                       (None, [], []))
            repo_root, repo_dirs, repo_fnames = next(repo_walk)
        except StopIteration:
            return

        # separate subdirs into shared dirs, dvc-only dirs, repo-only dirs
        dvc_set = set(dvc_dirs)
        repo_set = set(repo_dirs)
        dvc_only = list(dvc_set - repo_set)
        repo_only = list(repo_set - dvc_set)
        shared = list(dvc_set & repo_set)
        dirs = shared + dvc_only + repo_only

        # merge file lists
        files = {
            fname
            for fname in dvc_fnames + repo_fnames
            if dvcfiles or not is_valid_filename(fname)
        }

        yield repo_root, dirs, list(files)

        # set dir order for next recursion level - shared dirs first so that
        # next() for both generators recurses into the same shared directory
        # NOTE(review): this relies on both walkers honouring in-place edits
        # of the dirs lists they yielded - confirm for each tree type.
        dvc_dirs[:] = [dirname for dirname in dirs if dirname in dvc_set]
        repo_dirs[:] = [dirname for dirname in dirs if dirname in repo_set]

        for dirname in dirs:
            dir_path = os.path.join(repo_root, dirname)
            if self._is_dvc_repo(dir_path):
                # A subrepo gets its own pair of walkers.
                yield from self._subrepo_walk(dir_path, dvcfiles=dvcfiles)
            elif dirname in shared:
                yield from self._walk(repo_walk, dvc_walk, dvcfiles=dvcfiles)
            elif dirname in dvc_set:
                yield from self._dvc_walk(dvc_walk)
            elif dirname in repo_set:
                yield from self._walk(repo_walk, None, dvcfiles=dvcfiles)

    def walk(self, top, topdown=True, onerror=None, dvcfiles=False, **kwargs):  # pylint: disable=arguments-differ
        """Walk ``top``, merging what the repo tree and the DVC tree contain.

        Args:
            top: path to walk from.
            topdown: must be truthy; only top-down walking is supported.
            onerror: optional callable. It receives a ``FileNotFoundError``
                when ``top`` does not exist and a ``NotADirectoryError``
                when it is not a directory; it is also handed to the repo
                walker when ``top`` is not present in the DVC tree.
            dvcfiles: if True, dvcfiles will be included in the files list
                for walked directories.

        Any other kwargs are passed to the DVC tree's ``walk``.

        Yields:
            ``(root, dirs, files)`` entries.
        """
        assert topdown

        # The second check only runs when the first one passes.
        preconditions = (
            (self.exists, FileNotFoundError),
            (self.isdir, NotADirectoryError),
        )
        for holds, error_type in preconditions:
            if not holds(top):
                if onerror is not None:
                    onerror(error_type(top))
                return

        tree, dvc_tree = self._get_tree_pair(top)
        dvc_exists = dvc_tree and dvc_tree.exists(top)
        repo_exists = tree.exists(top)
        skip_subrepos = not self._traverse_subrepos

        if not dvc_exists:
            # Nothing in DVC at `top`: walk the repo tree alone.
            repo_walk = tree.walk(
                top,
                topdown=topdown,
                onerror=onerror,
                ignore_subrepos=skip_subrepos,
            )
            yield from self._walk(repo_walk, None, dvcfiles=dvcfiles)
            return

        dvc_walk = dvc_tree.walk(top, topdown=topdown, **kwargs)
        if not repo_exists:
            # `top` is known to DVC only.
            yield from dvc_walk
            return

        repo_walk = tree.walk(
            top,
            topdown=topdown,
            ignore_subrepos=skip_subrepos,
        )
        yield from self._walk(repo_walk, dvc_walk, dvcfiles=dvcfiles)

    def walk_files(self, top, **kwargs):  # pylint: disable=arguments-differ
        """Yield a ``PathInfo`` for every file that ``walk(top, **kwargs)`` reports."""
        for root, _dirs, fnames in self.walk(top, **kwargs):
            yield from (PathInfo(root) / fname for fname in fnames)

    def get_file_hash(self, path_info):
        """Return the checksum for ``path_info``.

        When the path exists in the DVC tree, that tree's hash is returned.
        If the DVC tree raises ``OutputNotFoundError``, or the path is not
        in the DVC tree at all, the result is the ``(PARAM_CHECKSUM, md5)``
        pair computed with ``file_md5``.

        Raises:
            FileNotFoundError: ``path_info`` exists in neither tree.
        """
        if not self.exists(path_info):
            raise FileNotFoundError

        _, dvc_tree = self._get_tree_pair(path_info)
        if dvc_tree and dvc_tree.exists(path_info):
            # Fall through to the md5 computation when DVC has no output
            # for this path.
            with suppress(OutputNotFoundError):
                return dvc_tree.get_file_hash(path_info)

        return self.PARAM_CHECKSUM, file_md5(path_info, self)[0]

    def copytree(self, top, dest):
        """Copy the file or directory ``top`` from this tree to ``dest``.

        A single file is copied to ``dest`` itself (its parent directory is
        created as needed). For a directory, every walked directory is
        recreated under ``dest`` and each of its files is copied across.

        Raises:
            FileNotFoundError: ``top`` does not exist in this tree.
        """
        src_root = PathInfo(top)
        dst_root = PathInfo(dest)

        if not self.exists(src_root):
            raise FileNotFoundError

        if self.isfile(src_root):
            makedirs(dst_root.parent, exist_ok=True)
            with self.open(src_root, mode="rb") as fobj:
                copy_fobj_to_file(fobj, dst_root)
            return

        for walked_root, _, fnames in self.walk(src_root):
            root_info = PathInfo(walked_root)
            target_dir = dst_root / root_info.relative_to(src_root)
            makedirs(target_dir, exist_ok=True)
            for fname in fnames:
                with self.open(root_info / fname, mode="rb") as fobj:
                    copy_fobj_to_file(fobj, target_dir / fname)

    @property
    def hash_jobs(self):  # pylint: disable=invalid-overridden-method
        """The ``hash_jobs`` value of the main repo's tree."""
        main_tree = self._main_repo.tree
        return main_tree.hash_jobs

    def metadata(self, path):
        """Build a metadata object for ``path`` from DVC and repo-tree data.

        The DVC tree's metadata is used when available; otherwise a fresh
        ``Metadata`` is created for the absolute path. ``isdir`` is set when
        either source says directory, and ``is_exec`` is filled from the
        repo tree's stat result only when DVC supplied no metadata.

        Raises:
            FileNotFoundError: neither the repo tree nor the DVC tree knows
                the path.
        """
        path_info = PathInfo(os.path.abspath(path))
        tree, dvc_tree = self._get_tree_pair(path_info)

        dvc_meta = None
        if dvc_tree:
            try:
                dvc_meta = dvc_tree.metadata(path_info)
            except OutputNotFoundError:
                pass

        try:
            stat_result = tree.stat(path_info)
        except FileNotFoundError:
            stat_result = None

        if not stat_result and not dvc_meta:
            raise FileNotFoundError

        meta = dvc_meta or Metadata(path_info=path_info)

        on_disk = bool(stat_result)
        stat_says_dir = on_disk and stat.S_ISDIR(stat_result.st_mode)
        meta.isdir = meta.isdir or stat_says_dir

        if not dvc_meta:
            meta.is_exec = on_disk and is_exec(stat_result.st_mode)
        return meta
Exemple #29
0
def init_trie_functions_and_info(
        separator: str) -> Tuple[StringTrie, StringTrie, dict]:
    """Build the CLI lookup tables.

    Args:
        separator: single character on which both tries split their keys.
            NOTE(review): every key below is written with spaces between
            words, so the keys presumably only form multi-level paths when
            ``separator`` is a space - confirm against the caller.

    Returns:
        A ``(commands_trie, commands_info_trie, special_words_dict)`` tuple:
        the first maps full command phrases to handler functions, the
        second maps command phrases and their prefixes to help text, and
        the third maps a command's last word to
        ``(argument count, [argument types])``.

    Raises:
        ValueError: ``separator`` is not exactly one character long.
    """
    if len(separator) != 1:
        raise ValueError(
            "SeparatorError: the separator must be char and not string!")

    commands_trie = StringTrie(separator=separator)
    commands_info_trie = StringTrie(separator=separator)

    # Handlers, grouped by the number of words in the command.
    handlers = (
        # 2 words command
        ("show log", __show_log),
        ("set timer", __add_new_entry),
        ("show db", __show_db_values),
        # 3 words command
        ("show logs all", __show_logs_all),
        ("show logs last", __show_logs),
        ("show logs top", __show_logs),
        ("show logs range", __show_logs),
        ("show node status", __show_node_status),
        # ("show system status", show_system_status),
        ("set multicast ip", __add_new_entry),
        ("log value add", __add_new_entry),
        ("log value delete", __add_new_entry),
        ("log value edit", __add_new_entry),
    )
    for phrase, handler in handlers:
        commands_trie[phrase] = handler

    # Explanation shown at each node and what values it receives.
    explanations = (
        # 1 word command
        ("show", "Prints data on CLI"),
        ("set", "Sets a value in the database"),
        ("log", "Displays actions on LOGS in RAFT algorithm"),
        # 2 words command
        ("show node", "action on node parameters"),
        # ("show system", "action on system parameters (all nodes in RAFT)"),
        ("show log", "shows last log in node"),
        ("show logs", "shows all logs in node"),
        ("show db", "shows all logs in specific DB"),
        ("set multicast", "action on multicast IP"),
        ("set timer", "A number between 0.150 to 0.300"),
        ("log value",
         "A set of actions functions like add, delete, edit ..."),
        # 3 words command
        # Registered under "show logs all" so the help text lines up with
        # the handler of that name; "show node all" has no handler.
        ("show logs all", "Shows all logs in cluster"),
        ("show logs last", "A positive number greater than 0"),
        ("show logs top", "A positive number greater than 0"),
        ("show logs range", "A range between two positive numbers"),
        ("show node status", "Shows all node parameters in cluster"),
        # ("show system status", "Shows all nodes parameters in system"),
        ("set multicast ip", "IP A.B.C.D according to the protocol"),
        ("log value add", "Adds a new log <key, value> to the cluster"),
        ("log value delete", "Deletes log <key> from the cluster"),
        ("log value edit", "Edits log <key> in the cluster"),
    )
    for phrase, explanation in explanations:
        commands_info_trie[phrase] = explanation

    # Last word of a command -> (number of arguments, their types).
    special_words_dict = {
        "range": (2, [int, int]),
        "add": (2, [str, int]),
        "ip": (1, [str]),
        "timer": (1, [float]),
        "delete": (1, [str]),
        "edit": (2, [str, int]),
        "top": (1, [int]),
        "last": (1, [int]),
        "db": (1, [str])
    }

    return commands_trie, commands_info_trie, special_words_dict
Exemple #30
0
class LifecycleMetaEvent(MetaEvent):
    """Lifecycle meta event."""
    # Dotted event name of this model.
    __event__ = "meta_event.lifecycle"
    meta_event_type: Literal["lifecycle"]
    sub_type: str


class HeartbeatMetaEvent(MetaEvent):
    """Heartbeat meta event."""
    # Dotted event name of this model.
    __event__ = "meta_event.heartbeat"
    meta_event_type: Literal["heartbeat"]
    status: Status
    interval: int


# Registry of event models, keyed by "." + the model's `__event__` name and
# split on ".".
_t = StringTrie(separator=".")

# Bind `model` before the loop: the loop variable is itself a module-level
# name, so creating it during the iteration would add a key to `globals()`
# while that dict is being iterated.
model = None
for model in globals().values():
    # Only `Event` and its subclasses are registered; every other module
    # global (including the initial `None`) is skipped.
    if not inspect.isclass(model) or not issubclass(model, Event):
        continue
    # NOTE(review): the leading "." presumably keeps the key non-empty for
    # a model whose `__event__` is an empty string - confirm.
    _t["." + model.__event__] = model


def get_event_model(event_name) -> List[Type[Event]]:
    """
    :说明:

      根据事件名获取对应 ``Event Model`` 及 ``FallBack Event Model`` 列表