コード例 #1
0
    def __init__(self, fs, root_dir):
        """Set up the ignore tries for ``root_dir`` on filesystem ``fs``.

        Two tries are kept: ``_ignores_trie_subrepos`` (populated with
        ``ignore_subrepos=False``) and ``ignores_trie_fs`` (populated with
        ``ignore_subrepos=True``). Both start out with the same
        ``DvcIgnorePatterns`` entry holding the built-in patterns.
        """
        from dvc.repo import Repo

        builtin_patterns = [".hg/", ".git/", ".git", f"{Repo.DVC_DIR}/"]

        self.fs = fs
        self.root_dir = root_dir
        self.ignores_trie_fs = Trie()
        self._ignores_trie_subrepos = Trie()

        root_key = self._get_key(root_dir)
        root_patterns = DvcIgnorePatterns(builtin_patterns, root_dir, fs.sep)
        self.ignores_trie_fs[root_key] = root_patterns
        # Both tries share the very same patterns object for the root.
        self._ignores_trie_subrepos[root_key] = self.ignores_trie_fs[root_key]

        # Subrepo trie is populated first, then the filesystem trie.
        for target_trie, skip_subrepos in (
            (self._ignores_trie_subrepos, False),
            (self.ignores_trie_fs, True),
        ):
            self._update(
                self.root_dir,
                target_trie,
                dnames=None,
                ignore_subrepos=skip_subrepos,
            )
コード例 #2
0
    def cal_left_right_entropy(self):
        """Compute ``(left_entropy, right_entropy)`` for every target n-gram.

        For each ``n`` in ``[self.min_n, self.max_n]`` the (n+1)-grams are
        stored in two tries: unchanged for right neighbors, and rotated
        (first element moved to the end) for left neighbors, so a target
        n-gram is looked up as a prefix in both.

        :return: dict mapping n-gram -> ``(left_entropy, right_entropy)``;
            a side whose lookup raises ``KeyError`` scores 0.
        """
        def side_entropy(neighbors, ngram):
            # A KeyError here means no stored key starts with this n-gram.
            try:
                return self.cal_ngram_entropy(neighbors.values(ngram))
            except KeyError:
                return 0

        result = {}
        for size in range(self.min_n, self.max_n + 1):
            targets = self.ngrams_words[size]
            parents = self.ngrams_words[size + 1]
            left_trie, right_trie = Trie(), Trie()

            for parent in tqdm(parents, desc='build neighbors'):
                freq_key = parent
                right_trie[parent] = self.ngrams_freq[freq_key]
                rotated = parent[1:] + parent[0]
                left_trie[rotated] = self.ngrams_freq[freq_key]

            for ngram in tqdm(targets, desc='target ngram'):
                right = side_entropy(right_trie, ngram)
                left = side_entropy(left_trie, ngram)
                result[ngram] = (left, right)

        return result
コード例 #3
0
def _calc_ngram_entropy(ngram_freq, ngram_keys, n):
    """
    Compute left/right neighbor entropy from n-gram frequency information.

    :param ngram_freq: mapping indexed by n-gram, giving its frequency
    :param ngram_keys: mapping indexed by n-gram length, giving the n-grams
        of that length (both ``n`` and ``n + 1`` are looked up)
    :param n: a single n-gram length, or an iterable of lengths
    :return: dict mapping n-gram -> ``(left_entropy, right_entropy)``
    """
    if isinstance(n, Iterable):  # several n-gram lengths requested at once
        entropy = {}
        for ni in n:
            # In-place merge; avoids re-copying the accumulated dict per length.
            entropy.update(_calc_ngram_entropy(ngram_freq, ngram_keys, ni))
        return entropy

    target_ngrams = ngram_keys[n]
    parent_candidates = ngram_keys[n + 1]

    # TODO: multi-process computation. Until that exists the serial path is
    # used unconditionally: the previous ``CPU_COUNT != 1`` branch returned
    # None, which broke the iterable-``n`` merge above.

    # Index the (n+1)-grams in two tries so a target n-gram is a key prefix.
    left_neighbors = Trie()
    right_neighbors = Trie()
    for parent_candidate in parent_candidates:
        freq = ngram_freq[parent_candidate]
        right_neighbors[parent_candidate] = freq
        # Rotate the first element to the end for the left-neighbor lookup.
        left_neighbors[parent_candidate[1:] + parent_candidate[0]] = freq

    def _side_entropy(neighbors, ngram):
        # A candidate n-gram may have no neighbor on a side: the prefix
        # lookup then raises KeyError and that side scores 0.
        try:
            return _ngram_entropy_scorer(neighbors.values(ngram))
        except KeyError:
            return 0

    ngram_entropy = {}
    for target_ngram in target_ngrams:
        right_entropy = _side_entropy(right_neighbors, target_ngram)
        left_entropy = _side_entropy(left_neighbors, target_ngram)
        ngram_entropy[target_ngram] = (left_entropy, right_entropy)
    return ngram_entropy
コード例 #4
0
ファイル: objects.py プロジェクト: vijay-pinjala/dvc
    def __init__(self, tree: GitObject, rev: str):
        """Store ``tree`` and ``rev`` and build the trie starting at ``tree``.

        ``tree`` itself is registered under the empty key ``()`` before
        ``self._build`` is called with that same key.
        """
        root_key = ()
        self.tree, self.rev = tree, rev
        self.trie = Trie()

        self.trie[root_key] = tree
        self._build(tree, root_key)
コード例 #5
0
ファイル: trie.py プロジェクト: sandeepmistry/dvc
def build_outs_trie(stages):
    """Build a trie of stage outputs keyed by ``out.path_info.parts``.

    Falsy entries in ``stages`` are skipped.

    Raises:
        OutputDuplicationError: an output key is already in the trie.
        OverlappingOutputPathsError: an output is a prefix of, or is
            prefixed by, an output already in the trie.
    """
    overlap_template = (
        "Paths for outs:\n'{}'('{}')\n'{}'('{}')\n"
        "overlap. To avoid unpredictable behaviour, "
        "rerun command with non overlapping outs paths."
    )
    trie = Trie()

    for stage in (candidate for candidate in stages if candidate):
        for out in stage.outs:
            key = out.path_info.parts

            # Exact same key already registered -> duplicate output.
            if key in trie:
                raise OutputDuplicationError(
                    str(out), [stage, trie[key].stage]
                )

            # Either the new out contains an existing one, or the reverse.
            if trie.has_subtrie(key):
                parent, overlapping = out, first(trie.values(prefix=key))
            else:
                parent, overlapping = trie.shortest_prefix(key).value, out

            if parent and overlapping:
                msg = overlap_template.format(
                    str(parent),
                    parent.stage.addressing,
                    str(overlapping),
                    overlapping.stage.addressing,
                )
                raise OverlappingOutputPathsError(parent, overlapping, msg)

            trie[key] = out

    return trie
コード例 #6
0
ファイル: dvc.py プロジェクト: VanaMartin/dvc
    def walk(self, top, topdown=True, onerror=None, **kwargs):
        """Walk the outputs found under ``top``, delegating to ``self._walk``.

        Only top-down traversal is supported. If ``top`` has no metadata or
        is not a directory, ``onerror`` (when given) receives a
        ``FileNotFoundError`` / ``NotADirectoryError`` and nothing is yielded.
        """
        from pygtrie import Trie

        assert topdown

        def report(error_type):
            # Errors are reported through the callback, never raised here.
            if onerror is not None:
                onerror(error_type(top))

        root = PathInfo(os.path.abspath(top))
        try:
            meta = self.metadata(root)
        except OutputNotFoundError:
            report(FileNotFoundError)
            return

        if not meta.isdir:
            report(NotADirectoryError)
            return

        outs_trie = Trie()
        for out in meta.outs:
            outs_trie[out.path_info.parts] = out

            expand = out.is_dir_checksum and root.isin_or_eq(out.path_info)
            if expand:
                self._add_dir(top, outs_trie, out, **kwargs)

        yield from self._walk(root, outs_trie, topdown=topdown, **kwargs)
コード例 #7
0
ファイル: tree.py プロジェクト: feedbackfruits/dvc
    def walk(self, top, topdown=True, **kwargs):
        """Walk the outputs found under ``top``, delegating to ``self._walk``.

        Only top-down traversal is supported. For directory outputs, when
        ``self.fetch`` or ``self.stream`` is set, every entry of the
        directory cache is added to the trie (with value ``None``); with
        ``self.fetch`` the changed cache is pulled first.

        Raises:
            FileNotFoundError: ``top`` does not exist.
            NotADirectoryError: ``top`` is not a directory.
        """
        from pygtrie import Trie

        assert topdown

        if not self.exists(top):
            raise FileNotFoundError

        if not self.isdir(top):
            raise NotADirectoryError

        root = PathInfo(os.path.abspath(top))
        tracked = self._find_outs(top, recursive=True, strict=False)

        outs_trie = Trie()

        for out in tracked:
            outs_trie[out.path_info.parts] = out

            if not out.is_dir_checksum or not (self.fetch or self.stream):
                continue

            # pull dir cache if needed
            dir_cache = out.get_dir_cache(**kwargs)

            # pull dir contents if needed
            if self.fetch and out.changed_cache(filter_info=top):
                used_cache = out.get_used_cache(filter_info=top)
                self.repo.cloud.pull(used_cache, **kwargs)

            for entry in dir_cache:
                relpath = entry[out.remote.PARAM_RELPATH]
                entry_info = out.path_info / relpath
                outs_trie[entry_info.parts] = None

        yield from self._walk(root, outs_trie, topdown=topdown)
コード例 #8
0
ファイル: dvc.py プロジェクト: nik123/dvc
    def walk(self, top, topdown=True, onerror=None, **kwargs):
        """Walk the outputs reported by ``self.info`` for ``top``.

        Only top-down traversal is supported. If ``self.info`` raises
        ``FileNotFoundError`` or the entry type is not ``"directory"``,
        ``onerror`` (when given) receives the matching exception and nothing
        is yielded.
        """
        from pygtrie import Trie

        assert topdown

        def notify(error_type):
            # Failures go to the optional callback instead of being raised.
            if onerror is not None:
                onerror(error_type(top))

        root = os.path.abspath(top)
        try:
            info = self.info(root)
        except FileNotFoundError:
            notify(FileNotFoundError)
            return

        if info["type"] != "directory":
            notify(NotADirectoryError)
            return

        outs_trie = Trie()
        for out in info["outs"]:
            key = out.fs.path.parts(out.fs_path)
            outs_trie[key] = out

            if out.is_dir_checksum and self.path.isin_or_eq(root, out.fs_path):
                self._add_dir(outs_trie, out, **kwargs)

        yield from self._walk(root, outs_trie, topdown=topdown, **kwargs)
コード例 #9
0
ファイル: tree.py プロジェクト: stefanvangastel/dvc
    def walk(self, top, topdown=True, onerror=None, **kwargs):
        """Walk the outputs found under ``top``, delegating to ``self._walk``.

        Only top-down traversal is supported. If ``top`` does not exist or is
        not a directory, ``onerror`` (when given) receives a
        ``FileNotFoundError`` / ``NotADirectoryError`` and nothing is yielded.
        """
        from pygtrie import Trie

        assert topdown

        # Preconditions, checked in order; the first failure ends the walk.
        preconditions = (
            (self.exists, FileNotFoundError),
            (self.isdir, NotADirectoryError),
        )
        for holds, error_type in preconditions:
            if not holds(top):
                if onerror is not None:
                    onerror(error_type(top))
                return

        root = PathInfo(os.path.abspath(top))
        tracked = self._find_outs(top, recursive=True, strict=False)

        outs_trie = Trie()

        for out in tracked:
            outs_trie[out.path_info.parts] = out

            if out.is_dir_checksum and root.isin_or_eq(out.path_info):
                self._add_dir(top, outs_trie, out, **kwargs)

        yield from self._walk(root, outs_trie, topdown=topdown, **kwargs)
コード例 #10
0
def build_outs_trie(stages):
    """Build a trie of stage outputs keyed by the parts of ``out.fs_path``.

    Raises:
        OutputDuplicationError: an output key is already in the trie.
        OverlappingOutputPathsError: an output is a prefix of, or is
            prefixed by, an output already in the trie.
    """
    overlap_template = (
        "The output paths:\n'{}'('{}')\n'{}'('{}')\n"
        "overlap and are thus in the same tracked directory.\n"
        "To keep reproducibility, outputs should be in separate "
        "tracked directories or tracked individually.")
    trie = Trie()

    for stage in stages:
        for out in stage.outs:
            key = out.fs.path.parts(out.fs_path)

            # Exact same key already registered -> duplicate output.
            if key in trie:
                raise OutputDuplicationError(
                    str(out), [stage, trie[key].stage])

            # Either the new out contains an existing one, or the reverse.
            if trie.has_subtrie(key):
                parent, overlapping = out, first(trie.values(prefix=key))
            else:
                parent, overlapping = trie.shortest_prefix(key).value, out

            if parent and overlapping:
                msg = overlap_template.format(
                    str(parent),
                    parent.stage.addressing,
                    str(overlapping),
                    overlapping.stage.addressing,
                )
                raise OverlappingOutputPathsError(parent, overlapping, msg)

            trie[key] = out

    return trie
コード例 #11
0
    def walk(self, top, topdown=True):
        """Walk the outputs found under ``top``, delegating to ``self._walk``.

        Only top-down traversal is supported. For directory outputs, when
        ``self.fetch`` or ``self.stream`` is set, every name from the used
        directory cache is added to the trie (with value ``None``).

        Raises:
            FileNotFoundError: ``top`` does not exist.
            NotADirectoryError: ``top`` is not a directory.
        """
        from pygtrie import Trie

        assert topdown

        if not self.exists(top):
            raise FileNotFoundError

        if not self.isdir(top):
            raise NotADirectoryError

        root = PathInfo(os.path.abspath(top))
        tracked = self._find_outs(top, recursive=True, strict=False)

        outs_trie = Trie()

        for out in tracked:
            outs_trie[out.path_info.parts] = out

            if not out.is_dir_checksum or not (self.fetch or self.stream):
                continue

            # will pull dir cache if needed
            with self.repo.state:
                used = out.collect_used_dir_cache()

            for _, names in used.scheme_names(out.scheme):
                for name in names:
                    entry_info = out.path_info.parent / name
                    outs_trie[entry_info.parts] = None

        yield from self._walk(root, outs_trie, topdown=topdown)
コード例 #12
0
def calcul_ngram_entropy(ngram_freq,
                        ngram_keys,
                        n):
    """
    Compute left/right neighbor entropy from n-gram frequencies.

    :param ngram_freq: mapping indexed by n-gram, giving its frequency
    :param ngram_keys: mapping indexed by n-gram length, giving the n-grams
        of that length (both ``n`` and ``n + 1`` are looked up)
    :param n: a single n-gram length, or an iterable of lengths
    :return: dict mapping n-gram -> ``(left_entropy, right_entropy)``
    """
    if isinstance(n, collections.abc.Iterable):
        entropy = {}
        for ni in n:
            # In-place merge; avoids re-copying the accumulated dict per length.
            entropy.update(calcul_ngram_entropy(ngram_freq, ngram_keys, ni))
        return entropy

    parent_candidates = ngram_keys[n + 1]
    if n != 1:
        target_ngrams = ngram_keys[n]
    else:
        # Unigrams are kept only if their first element passes
        # ``is_english_word``. Build the checker once, not once per unigram.
        tool_word = ToolWord()
        target_ngrams = [
            gram for gram in ngram_keys[n]
            if tool_word.is_english_word(gram[0])
        ]

    # Multi-process computation is not implemented. The serial path is used
    # unconditionally: the previous ``hp.CPU_COUNT != 1`` branch returned
    # None, which broke the iterable-``n`` merge above.

    # Index the (n+1)-grams in two tries so a target n-gram is a key prefix.
    left_neighbors = Trie()
    right_neighbors = Trie()
    for parent_candidate in parent_candidates:
        freq = ngram_freq[parent_candidate]
        right_neighbors[parent_candidate] = freq
        # Rotate the first element to the end for the left-neighbor lookup.
        rotated = parent_candidate[1:] + (parent_candidate[0],)
        left_neighbors[rotated] = freq

    def _side_entropy(neighbors, ngram):
        # No stored key with this prefix raises KeyError: that side scores 0.
        try:
            return entropy_of_list(neighbors.values(ngram))
        except KeyError:
            return 0

    ngram_entropy = {}
    for target_ngram in target_ngrams:
        right_entropy = _side_entropy(right_neighbors, target_ngram)
        left_entropy = _side_entropy(left_neighbors, target_ngram)
        ngram_entropy[target_ngram] = (left_entropy, right_entropy)
    return ngram_entropy
コード例 #13
0
 def build_trie(alphabet, vocab):
     """Return a ``CharTrie`` mapping every item of ``vocab`` to 1.

     Start and finish times are reported through ``info``. ``alphabet`` is
     accepted but not used here.
     """
     from pygtrie import CharTrie as Trie
     clock_format = "%H:%M:%S"
     vocab_trie = Trie()
     started = datetime.now()
     info('start building trie at {}'.format(started.strftime(clock_format)))
     for word in vocab:
         vocab_trie[word] = 1
     finished = datetime.now()
     elapsed = finished - started
     info('finish building trie at {} (delta {})'.format(
         finished.strftime(clock_format), elapsed))
     return vocab_trie
コード例 #14
0
 def build_trie(alphabet, vocab):
     """Return a ``CharTrie`` mapping every item of ``vocab`` to 1.

     Start and finish times, plus a progress line every 10000 insertions,
     are reported through ``info``. ``alphabet`` is accepted but not used
     here.
     """
     from pygtrie import CharTrie as Trie
     clock_format = "%H:%M:%S"
     started = datetime.now()
     info('start building trie at {}'.format(started.strftime(clock_format)))
     vocab_trie = Trie()
     for inserted, word in enumerate(vocab, start=1):
         vocab_trie[word] = 1
         if not inserted % 10000:
             info('inserted {} ...'.format(inserted))
     finished = datetime.now()
     elapsed = finished - started
     info('finish building trie at {} (delta {})'.format(
         finished.strftime(clock_format), elapsed))
     return vocab_trie
コード例 #15
0
ファイル: dvc.py プロジェクト: pmrowla/dvc
    def __init__(
        self,
        repo: Optional["Repo"] = None,
        subrepos=False,
        repo_factory: RepoFactory = None,
        **kwargs,
    ):
        """Initialise the filesystem around ``repo``.

        When ``repo`` is ``None``, both ``repo`` and ``repo_factory`` are
        obtained from ``self._repo_from_fs_config``. When no factory is
        available, ``dvc.repo.Repo`` is used.
        """
        super().__init__()

        from pygtrie import Trie

        if repo is None:
            repo, repo_factory = self._repo_from_fs_config(
                subrepos=subrepos, **kwargs
            )

        if repo_factory:
            self.repo_factory: RepoFactory = repo_factory
        else:
            from dvc.repo import Repo

            self.repo_factory = Repo

        def _getcwd():
            # Current directory relative to the repo root, or just the root
            # marker when the cwd is not inside the repo.
            fs_path = repo.fs.path
            if fs_path.isin(fs_path.getcwd(), repo.root_dir):
                relparts = fs_path.relparts(fs_path.getcwd(), repo.root_dir)
            else:
                relparts = ()
            return self.root_marker + self.sep.join(relparts)

        self.path = Path(self.sep, getcwd=_getcwd)
        self.repo = repo
        self.hash_jobs = repo.fs.hash_jobs
        self._traverse_subrepos = subrepos

        # Keeps track of each and every path with the corresponding repo.
        self._subrepos_trie = Trie()

        root_key = self._get_key(self.repo.root_dir)
        self._subrepos_trie[root_key] = repo

        # Keep a datafs instance of each repo.
        self._datafss = {}

        if hasattr(repo, "dvc_dir"):
            self._datafss[root_key] = DataFileSystem(repo=repo)
コード例 #16
0
ファイル: tree.py プロジェクト: sahilbhosale63/dvc
    def walk(
        self, top, topdown=True, onerror=None, download_callback=None, **kwargs
    ):
        """Walk the outputs found under ``top``, delegating to ``self._walk``.

        Only top-down traversal is supported. If ``top`` does not exist or is
        not a directory, ``onerror`` (when given) receives a
        ``FileNotFoundError`` / ``NotADirectoryError`` and nothing is yielded.
        With ``self.fetch`` set, changed directory caches are pulled and the
        pull result is passed to ``download_callback`` when one is given.
        """
        from pygtrie import Trie

        assert topdown

        if not self.exists(top):
            if onerror is not None:
                onerror(FileNotFoundError(top))
            return

        if not self.isdir(top):
            if onerror is not None:
                onerror(NotADirectoryError(top))
            return

        root = PathInfo(os.path.abspath(top))
        tracked = self._find_outs(top, recursive=True, strict=False)

        outs_trie = Trie()

        for out in tracked:
            outs_trie[out.path_info.parts] = out

            if not (out.is_dir_checksum and (self.fetch or self.stream)):
                continue

            # pull dir cache if needed
            dir_cache = out.get_dir_cache(**kwargs)

            # pull dir contents if needed
            if self.fetch and out.changed_cache(filter_info=top):
                used_cache = out.get_used_cache(filter_info=top)
                downloaded = self.repo.cloud.pull(used_cache, **kwargs)
                if download_callback:
                    download_callback(downloaded)

            for entry in dir_cache:
                relpath = entry[out.remote.tree.PARAM_RELPATH]
                if os.name == "nt":
                    # Cache entries use "/" separators; convert for Windows.
                    relpath = relpath.replace("/", os.sep)
                entry_info = out.path_info / relpath
                outs_trie[entry_info.parts] = None

        yield from self._walk(root, outs_trie, topdown=topdown)
コード例 #17
0
ファイル: tree.py プロジェクト: rogervaas/dvc
    def walk(self, top, topdown=True):
        """Walk the outputs found under ``top``, delegating to ``self._walk``.

        Only top-down traversal is supported.

        Raises:
            FileNotFoundError: ``top`` does not exist.
            NotADirectoryError: ``top`` is not a directory.
        """
        from pygtrie import Trie

        assert topdown

        if not self.exists(top):
            raise FileNotFoundError

        if not self.isdir(top):
            raise NotADirectoryError

        root = PathInfo(os.path.abspath(top))
        tracked = self._find_outs(top, recursive=True, strict=False)

        # Index every output by the parts of its path.
        outs_trie = Trie()
        for tracked_out in tracked:
            key = tracked_out.path_info.parts
            outs_trie[key] = tracked_out

        yield from self._walk(root, outs_trie, topdown=topdown)
コード例 #18
0
    def _collect_graph(self, stages):
        """Build the stage dependency graph for the given stages.

        Every stage becomes a node. An edge is added from a stage to each
        stage owning an output that overlaps one of its dependencies, so
        edges point from a stage to what it depends on.

        For example, running the following:

            $ dvc run -o A "echo A > A"
            $ dvc run -d A -o B "echo B > B"
            $ dvc run -d B -o C "echo C > C"

        Will create the following graph:

               ancestors <--
                           |
                C.dvc -> B.dvc -> A.dvc
                |          |
                |          --> descendants
                |
                ------- pipeline ------>
                           |
                           v
              (weakly connected components)

        Args:
            stages (list): stages to build the graph from; when falsy,
                ``self.stages`` is used instead.

        Raises:
            OutputDuplicationError: two outputs with the same path
            StagePathAsOutputError: stage inside an output directory
            OverlappingOutputPathsError: output inside output directory
            CyclicGraphError: resulting graph has cycles
        """
        import networkx as nx
        from pygtrie import Trie

        from dvc.exceptions import (
            OutputDuplicationError,
            OverlappingOutputPathsError,
            StagePathAsOutputError,
        )

        overlap_template = ("Paths for outs:\n'{}'('{}')\n'{}'('{}')\n"
                            "overlap. To avoid unpredictable behaviour, "
                            "rerun command with non overlapping outs paths."
                            )

        graph = nx.DiGraph()
        stages = stages or self.stages
        # A trie makes prefix lookups for overlapping outs and deps cheap.
        outs_trie = Trie()

        # Falsy stages are skipped here only; the loops below use ``stages``
        # unfiltered (kept as in the original, which questions this).
        for stage in (candidate for candidate in stages if candidate):
            for out in stage.outs:
                key = out.path_info.parts

                # Exact same key already registered -> duplicate output.
                if key in outs_trie:
                    raise OutputDuplicationError(
                        str(out), [stage, outs_trie[key].stage]
                    )

                # Either the new out contains an existing one, or the reverse.
                if outs_trie.has_subtrie(key):
                    parent = out
                    overlapping = first(outs_trie.values(prefix=key))
                else:
                    parent = outs_trie.shortest_prefix(key).value
                    overlapping = out

                if parent and overlapping:
                    msg = overlap_template.format(
                        str(parent),
                        parent.stage.addressing,
                        str(overlapping),
                        overlapping.stage.addressing,
                    )
                    raise OverlappingOutputPathsError(parent, overlapping, msg)

                outs_trie[key] = out

        # A stage file must not live inside any tracked output.
        for stage in stages:
            stage_key = PathInfo(stage.path).parts
            enclosing_out = outs_trie.shortest_prefix(stage_key).value
            if enclosing_out:
                raise StagePathAsOutputError(stage, str(enclosing_out))

        # Building graph
        graph.add_nodes_from(stages)
        for stage in stages:
            for dep in stage.deps:
                if dep.path_info is None:
                    continue

                dep_key = dep.path_info.parts
                # Outs that contain the dep, plus outs contained in the dep.
                related = [step.value for step in outs_trie.prefixes(dep_key)]
                if outs_trie.has_subtrie(dep_key):
                    related.extend(outs_trie.values(prefix=dep_key))

                graph.add_edges_from((stage, out.stage) for out in related)
        check_acyclic(graph)

        return graph
コード例 #19
0
def test_size(trie_dict, size):
    """``DirInfo.size`` equals ``size`` for a trie built from ``trie_dict``."""
    info = DirInfo()
    info.trie = Trie(trie_dict)
    actual = info.size
    assert actual == size
コード例 #20
0
    def trie(self):
        """Return a new ``pygtrie.Trie`` built from ``self._dict``."""
        import pygtrie

        return pygtrie.Trie(self._dict)
コード例 #21
0
def test_nfiles(trie_dict, nfiles):
    """``DirInfo.nfiles`` equals ``nfiles`` for a trie built from ``trie_dict``."""
    info = DirInfo()
    info.trie = Trie(trie_dict)
    actual = info.nfiles
    assert actual == nfiles
コード例 #22
0
ファイル: stopwords.py プロジェクト: linhx13/pyarc
 def __init__(self, data, tokenizer=None, upper=True, lower=True):
     """Store the tokenizer and case flags and create two empty tries.

     A falsy ``tokenizer`` is replaced by ``tokenizers.get('jieba')``.
     ``data`` is not used in the lines visible here.
     NOTE(review): ``fw_trie`` / ``bw_trie`` presumably mean forward and
     backward tries; confirm against the rest of the class.
     """
     self.t = tokenizer or tokenizers.get('jieba')
     self.upper, self.lower = upper, lower
     self.fw_trie, self.bw_trie = Trie(), Trie()
コード例 #23
0
ファイル: dir_info.py プロジェクト: pyanezs/dvc
 def trie(self):
     """Return a new ``Trie`` built from ``self._dict``."""
     return Trie(self._dict)
コード例 #24
0
ファイル: dir_info.py プロジェクト: zivzone/dvc
 def __init__(self):
     """Start with an empty ``Trie`` stored in ``self.trie``."""
     self.trie = Trie()
コード例 #25
0
def test_items_with_path(path_info, trie_dict, items):
    """``DirInfo.items(path_info)`` yields exactly ``items``."""
    info = DirInfo()
    info.trie = Trie(trie_dict)
    produced = list(info.items(path_info))
    assert produced == items
コード例 #26
0
ファイル: templates.py プロジェクト: lili-yu/negotiation
 def __init__(self, price_tracker):
     """Keep ``price_tracker`` and start with no templates.

     The n-gram counter is an empty ``Trie``, the template list is empty
     and the template id starts at 0.
     """
     self.price_tracker, self.ngram_counter = price_tracker, Trie()
     self.templates, self.template_id = [], 0
コード例 #27
0
def test_merge(ancestor_dict, our_dict, their_dict, merged_dict):
    """``_merge`` of the three input tries equals the trie of ``merged_dict``."""
    ancestor, ours, theirs = (
        Trie(source) for source in (ancestor_dict, our_dict, their_dict)
    )
    merged = _merge(ancestor, ours, theirs)
    assert merged == Trie(merged_dict)
コード例 #28
0
ファイル: tokenizers.py プロジェクト: linhx13/pyarc
 def __init__(self, user_dict: Union[str, Iterable] = None):
     """Create an initialised POS tokenizer and an empty trie.

     When ``user_dict`` is truthy it is passed to ``self.load_user_dict``.
     """
     pos_tokenizer = posseg.POSTokenizer()
     pos_tokenizer.initialize()
     self.t = pos_tokenizer
     self.trie = Trie()
     if not user_dict:
         return
     self.load_user_dict(user_dict)
コード例 #29
0
def test_items(trie_dict, items):
    """``DirInfo.items()`` yields exactly ``items``."""
    info = DirInfo()
    info.trie = Trie(trie_dict)
    produced = list(info.items())
    assert produced == items
コード例 #30
0
def test_list(lst, trie_dict):
    """``DirInfo.from_list`` builds the expected trie and round-trips sorted.

    ``to_list()`` must equal ``lst`` sorted by each entry's ``"relpath"``.
    """
    info = DirInfo.from_list(lst)
    built_trie = info.trie
    assert built_trie == Trie(trie_dict)
    listed = info.to_list()
    assert listed == sorted(lst, key=itemgetter("relpath"))