# Example 1
def find_entites(text: str, trie: StringTrie):
    """Scan the whitespace-separated tokens of *text* for entities in *trie*.

    Tokens are joined with "/" (lower-cased) to form trie keys. Each match is
    recorded as ``(entity, start_token, end_token_exclusive)`` under a running
    counter (starting at 1; 0 is reserved for "NO_MATCH"), and the collected
    matches are passed through ``reduce_entities`` before being returned.
    """
    tokens = text.split()
    entities = dict()
    match_id = 1  # 0 is reserved for the "NO_MATCH" sentinel
    span_begin = 0
    last = len(tokens) - 1
    for idx, token in enumerate(tokens):
        candidate = "/".join(tokens[span_begin:idx + 1]).lower()
        if trie.has_subtrie(candidate):
            # A longer match may still follow; only emit once input is exhausted.
            if idx == last:
                entities[match_id] = (get_entity(trie, candidate), span_begin, idx + 1)
        elif trie.has_key(candidate):  # noqa: W601
            # Exact entity match for the current span.
            entities[match_id] = (trie[candidate], span_begin, idx + 1)
            match_id += 1
            span_begin = idx + 1
        elif span_begin < idx:
            # The prefix matched up to the previous token: emit that partial match.
            previous = "/".join(tokens[span_begin:idx]).lower()
            entities[match_id] = (get_entity(trie, previous), span_begin, idx)
            match_id += 1
            # The current token might itself begin a new entity.
            span_begin = idx if trie.has_node(token.lower()) else idx + 1
        else:
            # No match at all: restart the scan after the current token.
            span_begin = idx + 1
    return reduce_entities(entities)
# Example 2
    def detect_path_overlap(paths):
        """
        Check an iterable of relative POSIX paths for duplicates and overlaps.

        Two paths overlap when one terminates inside the other (e.g. 'a/b'
        and 'a/b/c'). A trie (prefix tree) tracks every path seen so far, and
        scanning stops at the first conflict.

        NOTE: The logic is copied from pulpcore.app.files.validate_file_paths().

        Args:
            paths (iterable of str): An iterable of strings each representing a relative path

        Returns:
            str: the first path that duplicates or overlaps another

        """
        seen = StringTrie(separator="/")
        for candidate in paths:
            # Exact duplicate of a previously seen path.
            if candidate in seen:
                return candidate

            # Overlap where candidate is 'a/b' and the trie holds 'a/b/c'.
            if seen.has_subtrie(candidate):
                return candidate

            # Overlap where candidate is 'a/b/c' and the trie holds 'a/b'.
            if list(seen.prefixes(candidate)):
                return candidate

            # No conflict: remember this path and keep scanning.
            seen[candidate] = True
# Example 3
def validate_file_paths(paths):
    """
    Check an iterable of relative POSIX paths for duplicates and overlaps.

    Two paths overlap when one terminates inside the other (e.g. 'a/b' and
    'a/b/c'). A trie (prefix tree) tracks every path seen so far; unlike a
    first-error scan, this collects every duplicate and overlap and raises a
    single combined error at the end.

    Args:
        paths (iterable of str): An iterable of strings each representing a relative path

    Raises:
        ValueError: If any path overlaps another
    """
    overlap_error = _("The path for file '{path}' overlaps: {conflicts}")

    path_trie = StringTrie(separator="/")
    duplicates = []
    overlap_msgs = []
    for path in paths:
        if path in path_trie:
            # Exact duplicate of an earlier path.
            duplicates.append(path)
        elif path_trie.has_subtrie(path):
            # Overlap where path is 'a/b' and the trie holds 'a/b/c'.
            conflicting = [item[0] for item in path_trie.items(prefix=path)]
            overlap_msgs.append(
                overlap_error.format(path=path, conflicts=", ".join(conflicting))
            )
        else:
            matched_prefixes = list(path_trie.prefixes(path))
            if matched_prefixes:
                # Overlap where path is 'a/b/c' and the trie holds 'a/b'.
                conflicting = [step.key for step in matched_prefixes]
                overlap_msgs.append(
                    overlap_error.format(path=path, conflicts=", ".join(conflicting))
                )

        # NOTE: the path is recorded even when it conflicts, so every later
        # path is still checked against it and all errors surface in one pass.
        path_trie[path] = True

    if duplicates or overlap_msgs:
        dups_msg = ""
        overlaps_msg = ""
        if duplicates:
            dups_msg = _("Paths are duplicated: {paths}").format(
                paths=",".join(duplicates)
            )
        if overlap_msgs:
            overlaps_msg = "\n".join(overlap_msgs)

        raise ValueError(
            _("Path errors found. {dups}\n{overlaps}").format(
                dups=dups_msg, overlaps=overlaps_msg
            )
        )
# Example 4
def find_entites(text: str, trie: StringTrie, mask: str = MASK_TOKEN):
    """Scan *text* for entities stored in *trie* and rebuild a display string.

    Tokens (punctuation-fixed via ``fix_punct_tokens``) are joined with "/"
    (lower-cased) to form trie keys. Matched entities are bolded via
    ``add_bold`` in the output string; unmatched tokens pass through as-is.
    Matches are recorded under a running counter (starting at 1; 0 is
    reserved for "NO_MATCH") and reduced via ``reduce_entities``.

    Returns:
        tuple: (retokenized display string, reduced entities dict)
    """
    # NOTE(review): `mask` is accepted but never read here — presumably kept
    # for interface compatibility with callers; confirm before removing.
    tokens = fix_punct_tokens(text.split())
    entities = dict()
    rendered = []
    match_id = 1  # 0 is reserved for the "NO_MATCH" sentinel
    span_begin = 0
    last = len(tokens) - 1
    for idx, token in enumerate(tokens):
        candidate = "/".join(tokens[span_begin:idx + 1]).lower()
        if trie.has_subtrie(candidate):
            # A longer match may still follow; only emit once input is exhausted.
            if idx == last:
                entities[match_id] = get_partial_match(trie, candidate)
                rendered.append(add_bold(get_entity(entities[match_id])))
        elif trie.has_key(candidate):  # noqa: W601
            # Exact entity match for the current span.
            entities[match_id] = trie[candidate]
            rendered.append(add_bold(get_entity(entities[match_id])))
            match_id += 1
            span_begin = idx + 1
        elif span_begin < idx:
            # The prefix matched up to the previous token: emit that partial match.
            previous = "/".join(tokens[span_begin:idx]).lower()
            entities[match_id] = get_partial_match(trie, previous)
            rendered.append(add_bold(get_entity(entities[match_id])))
            match_id += 1
            # The current token might itself begin a new entity.
            if trie.has_node(token.lower()):
                span_begin = idx
            else:
                rendered.append(token)
                span_begin = idx + 1
        else:
            # No match at all: emit the token and restart after it.
            rendered.append(token)
            span_begin = idx + 1
    # Re-join with spaces, except before punctuation and apostrophe-led pieces.
    retokenized = "".join(
        piece if piece.startswith("'") or piece in PUNCT else " " + piece
        for piece in rendered
    ).strip()
    return retokenized, reduce_entities(entities)