Example #1
0
    def apply(self,
              source_sequence: Sequence,
              plugin: SequenceTransferPlugin = None) -> Sequence:
        """
        Project a sub-sequence of the source onto the target.

        :param source_sequence: any sub-sequence of the source; it is checked
            against the source with ``raise_if_not_in`` before being used
        :param plugin: optional SequenceTransferPlugin instance; when given,
            the whole call is delegated to ``plugin.apply(self, source_sequence)``
            and none of the logic below runs
        :return: transferred sub-sequence in the target
        """
        if plugin is not None:
            return plugin.apply(self, source_sequence)

        source_sequence.raise_if_not_in(self._source)

        is_empty = source_sequence.size == 0

        # An empty sequence sitting on the very start of the source maps to an
        # empty sequence at the start of the first indexed target entry.
        if is_empty and source_sequence.start == self._source.start:
            anchor = self._index[0].start
            return Sequence(anchor, anchor)

        # An empty sequence sitting on the very end of the source maps to an
        # empty sequence at the end of the last indexed target entry.
        if is_empty and source_sequence.stop == self._source.stop:
            anchor = self._index[self._source.size - 1].stop
            return Sequence(anchor, anchor)

        # General case: span from the target entry of the first source
        # position to the target entry of the last one (stop is exclusive).
        first_target = self._index[source_sequence.start]
        last_target = self._index[source_sequence.stop - 1]
        return Sequence.expand(first_target, last_target)
Example #2
0
    def _parallelize(self):
        """
        Build a copy of the match list in which every hole in the target is
        paired with an empty sequence on the source.

        A hole is a non-empty result of ``Sequence.between`` for two
        consecutive target sequences (or for the target start and the first
        match), plus whatever remains between the last match and the end of
        the target.

        :return: list of (source_sequence, target_sequence) tuples
        """
        filled = []

        # Before the first match, an empty sequence at the target start plays
        # the role of the "previous" target sequence.
        previous_target = Sequence(self._target.start, self._target.start)

        for source_sequence, target_sequence in self._matches:
            # Hole between the previous target and this one: attach it to an
            # empty source sequence located at the start of the current match.
            gap = Sequence.between(previous_target, target_sequence)
            if gap.size != 0:
                empty_source = Sequence(source_sequence.start,
                                        source_sequence.start)
                filled.append((empty_source, gap))

            # The existing match is kept as is.
            filled.append((source_sequence, target_sequence))
            previous_target = target_sequence

        # Hole after the last match: attach it to an empty sequence at the
        # end of the source. Nothing is added when there are no matches.
        if self._matches and previous_target.stop != self._target.stop:
            filled.append(
                (Sequence(self._source.stop, self._source.stop),
                 Sequence(previous_target.stop, self._target.stop)))

        return filled
Example #3
0
    def test_creation(self):
        """Check valid, reversed and empty bounds passed to the constructor."""
        regular = Sequence(1, 2)
        assert (regular.start, regular.stop) == (1, 2)

        # start greater than stop must be rejected
        with self.assertRaises(InvalidSequenceException):
            Sequence(2, 1)

        # start equal to stop is accepted
        empty = Sequence(1, 1)
        assert (empty.start, empty.stop) == (1, 1)
Example #4
0
def remove_html_entities(text: str) -> Tuple[str, SequenceTransfer]:
    """Unescape HTML entities and describe how ``text`` maps to the result.

    The output string comes from ``html.unescape``. The transfer is built by
    walking ``text``: every character, and every entity recognised by
    ``is_entity``, is paired with exactly one character of the output.

    :param text: string that may contain HTML entities
    :return: tuple of (unescaped text, transfer from ``text`` to that output)
    :raises Exception: when building the transfer raises
        ``SequenceNotInException``; the original error is attached as the cause
    """
    output = html.unescape(text)

    i = 0  # cursor in ``text``
    j = 0  # cursor in ``output``
    transfers = []

    try:
        while i < len(text):
            # By default one source character maps to one output character.
            stop = i + 1
            if text[i] == "&":
                # is_entity returns a flag and the index where the entity ends
                is_html, fin = is_entity(text, i)
                if is_html:
                    # The whole entity collapses to a single output character.
                    stop = fin
            transfers.append((Sequence(i, stop), Sequence(j, j + 1)))
            i = stop
            j += 1

        transfer = SequenceTransfer(Sequence(0, len(text)),
                                    Sequence(0, len(output)), transfers)
    except SequenceNotInException as err:
        # Chain the original error so the offending sequence stays visible.
        raise Exception("Source has html entity not ending with ;") from err
    return output, transfer
Example #5
0
    def compose(*args: "SequenceTransfer"):
        """
        Chain transfers into one transfer going from the source of the first
        one to the target of the last one.

        Each match source of the first transfer is pushed through ``apply`` of
        every transfer in order. Consecutive sources that end on an equal
        target are merged into a single match.

        NOTE(review): there is no ``self`` parameter, so this is presumably
        decorated as a static method outside this excerpt - confirm.

        :param args: transfers to chain, in application order
        :return: the composed SequenceTransfer
        :raises SequenceTransferException: when no transfer is given
        """
        transfers = list(args)
        if not transfers:
            # The guard only rejects an empty call; a single transfer is valid.
            raise SequenceTransferException(
                'compose requires at least one transfer')

        first_transfer = transfers[0]
        last_transfer = transfers[-1]

        matches = []

        for sequence, _ in first_transfer.matches:
            # Push the source sequence through the whole chain.
            projected = sequence
            for transfer in transfers:
                projected = transfer.apply(projected)

            if matches and matches[-1][1] == projected:
                # Same target as the previous match: widen the previous source
                # instead of adding a second match onto the same target.
                merged_source = Sequence.expand(matches[-1][0], sequence)
                matches[-1] = (merged_source, projected)
            else:
                matches.append((sequence, projected))

        return SequenceTransfer(first_transfer.source, last_transfer.target,
                                matches)
Example #6
0
    def test_basics(self):
        """Check size, len, iteration and equality of Sequence."""
        five_wide = Sequence(5, 10)
        assert five_wide.size == 5
        assert len(five_wide) == 5

        empty = Sequence(1, 1)
        assert empty.size == 0

        # Iterating yields one-wide sequences, in order, from the start.
        parent = Sequence(1, 4)
        for offset, unit in enumerate(parent):
            assert unit.start == parent.start + offset
            assert unit.stop == unit.start + 1

        # Two sequences with the same bounds compare equal.
        left = Sequence(1, 4)
        right = Sequence(1, 4)
        assert left == right
Example #7
0
def token_to_text_transfer(tokens: List[Token]):
    """Build the transfer from token positions to character positions.

    Token ``i`` maps to a character span as long as ``len(token)``. One
    extra character is skipped after every token except the last one
    (presumably a single separator between tokens - confirm with the caller).

    :param tokens: list of tokens; only ``len(token)`` is used
    :return: SequenceTransfer from token indexes to character offsets
    """
    pairs = []
    cursor = 0
    last_position = len(tokens) - 1

    for position, token in enumerate(tokens):
        width = len(token)
        token_span = Sequence(position, position + 1)
        text_span = Sequence(cursor, cursor + width)
        pairs.append((token_span, text_span))

        cursor += width
        if position != last_position:
            # Leave room for the one character between two tokens.
            cursor += 1

    source = Sequence(0, len(tokens))
    target = Sequence(0, cursor)
    return SequenceTransfer(source, target, pairs)
Example #8
0
    def apply(self, source_sequence: Sequence) -> Sequence:
        """
        Map a sub-sequence of the source to the matching part of the target.

        :param source_sequence: any sub-sequence of the source; it is checked
            against the source with ``raise_if_not_in`` first
        :return: transferred sub-sequence in the target
        """
        source_sequence.raise_if_not_in(self._source)

        if source_sequence.size == 0:
            # Empty sequences on either edge of the source have no element to
            # look up, so they are pinned to the matching edge of the index.
            if source_sequence.start == self._source.start:
                edge = self._index[0].start
                return Sequence(edge, edge)
            if source_sequence.stop == self._source.stop:
                edge = self._index[self._source.size - 1].stop
                return Sequence(edge, edge)

        # Everything else spans from the target entry of the first source
        # position to the target entry of the last one (stop is exclusive).
        head = self._index[source_sequence.start]
        tail = self._index[source_sequence.stop - 1]
        return Sequence.expand(head, tail)
def remove_accents(text: str) -> Tuple[str, SequenceTransfer]:
    """Strip combining marks from ``text`` and describe the mapping.

    Each character is NFD-normalised on its own and every code point of
    category ``Mn`` is dropped. What remains of the character, possibly
    nothing or several code points, becomes its span in the output.

    :param text: input string
    :return: tuple of (text without accents, transfer from ``text`` to it)
    """
    pieces = []
    transfers = []
    written = 0  # number of characters produced so far

    for i, char in enumerate(text):
        without_accent = ''.join(
            part for part in unicodedata.normalize("NFD", char)
            if unicodedata.category(part) != "Mn")

        # Every source character gets a match, even when nothing is left of it.
        transfers.append((Sequence(i, i + 1),
                          Sequence(written, written + len(without_accent))))
        pieces.append(without_accent)
        written += len(without_accent)

    output = ''.join(pieces)
    transfer = SequenceTransfer(Sequence(0, len(text)),
                                Sequence(0, len(output)), transfers)
    return output, transfer
Example #10
0
def lcs_transfer(x: str, y: str) -> SequenceTransfer:
    """Align ``x`` with ``y`` using a longest-common-subsequence table.

    A table of LCS lengths is filled first, then walked back from the bottom
    right corner. Every character of ``x`` ends up in exactly one match:
    either with one character of ``y`` or with an empty sequence in ``y``.

    :param x: source string
    :param y: target string
    :return: SequenceTransfer from ``x`` to ``y``
    """
    rows = len(x)
    cols = len(y)

    # table[i][j] is the LCS length of x[:i] and y[:j]; row 0 and column 0
    # stay at 0.
    table = [[0] * (cols + 1) for _ in range(rows + 1)]
    for i in range(1, rows + 1):
        for j in range(1, cols + 1):
            if x[i - 1] == y[j - 1]:
                table[i][j] = table[i - 1][j - 1] + 1
            else:
                table[i][j] = max(table[i - 1][j], table[i][j - 1])

    pairs = []
    i, j = rows, cols
    while i > 0 and j > 0:
        diagonal = table[i - 1][j - 1]
        left = table[i][j - 1]
        up = table[i - 1][j]
        best = max(diagonal, left, up)

        # Priority matters on ties: diagonal first, then left, then up.
        if diagonal == best:
            pairs.append((Sequence(i - 1, i), Sequence(j - 1, j)))
            i -= 1
            j -= 1
        elif left == best:
            # Character only in y: no match is recorded for it here.
            j -= 1
        else:
            # Character only in x: pair it with an empty sequence in y.
            pairs.append((Sequence(i - 1, i), Sequence(j, j)))
            i -= 1

    # Leftover prefix of x: each character maps to an empty sequence in y.
    while i > 0:
        pairs.append((Sequence(i - 1, i), Sequence(j, j)))
        i -= 1

    # Matches were collected back to front.
    pairs.reverse()

    return SequenceTransfer(Sequence(0, rows), Sequence(0, cols), pairs)
Example #11
0
def remove_whitespace(text: str) -> Tuple[str, SequenceTransfer]:
    """Drop every whitespace character and describe the mapping.

    Characters for which ``is_whitespace`` is true map to an empty sequence
    at the current output position; every other character maps to the
    output position it is written to.

    :param text: input string
    :return: tuple of (text without whitespace, transfer from ``text`` to it)
    """
    kept = []
    pairs = []

    for position, char in enumerate(text):
        written = len(kept)
        if is_whitespace(char):
            # Removed character: empty target at the current output position.
            pairs.append((Sequence(position), Sequence(written, written)))
            continue
        pairs.append((Sequence(position), Sequence(written)))
        kept.append(char)

    output = ''.join(kept)
    source = Sequence(0, len(text))
    target = Sequence(0, len(output))
    return output, SequenceTransfer(source, target, pairs)
    def test_subsequence(self):
        """Check integer and slice indexing of a Sequence."""
        s1 = Sequence(1, 4)

        # Integer keys give the one-wide sub-sequence at that offset.
        index_cases = (
            (0, (1, 2)),
            (1, (2, 3)),
            (2, (3, 4)),
        )
        for key, bounds in index_cases:
            piece = s1[key]
            assert piece.start == bounds[0]
            assert piece.stop == bounds[1]

        # Negative and out-of-range integer keys are rejected.
        for bad_key in (-1, 3):
            with self.assertRaises(KeyError):
                _ = s1[bad_key]

        # Slice keys, including a negative start and a negative stop.
        slice_cases = (
            (slice(0, 1), (1, 2)),
            (slice(1, 2), (2, 3)),
            (slice(2, 3), (3, 4)),
            (slice(0, 3), (1, 4)),
            (slice(-1, 3), (3, 4)),
            (slice(None, -1), (1, 3)),
        )
        for key, bounds in slice_cases:
            piece = s1[key]
            assert piece.start == bounds[0]
            assert piece.stop == bounds[1]
def remove_bert_separator(text: str) -> Tuple[str, SequenceTransfer]:
    """Remove separators flagged by ``is_separator`` and describe the mapping.

    When a ``#`` is recognised as a separator, three source characters are
    consumed and only the third one is kept (presumably a ``##x`` word-piece
    marker - confirm against ``is_separator``). Any other character is
    copied unchanged.

    :param text: input string
    :return: tuple of (cleaned text, transfer from ``text`` to it)
    """
    kept = []
    pairs = []

    src = 0
    while src < len(text):
        dst = len(kept)
        if text[src] == "#" and is_separator(text, src):
            # Three source characters collapse to the single one kept.
            width = 3
            kept_index = src + 2
        else:
            width = 1
            kept_index = src

        pairs.append((Sequence(src, src + width), Sequence(dst, dst + 1)))
        kept.append(text[kept_index])
        src += width

    output = "".join(kept)
    transfer = SequenceTransfer(Sequence(0, len(text)),
                                Sequence(0, len(output)), pairs)

    return output, transfer
    def test_remove_whitespace(self):
        """Check the output and the per-character mapping of remove_whitespace."""
        original = " te \n\r\txt "
        cleaned, transfer = remove_whitespace(original)

        for blank in (' ', '\n', '\r', '\t'):
            assert blank not in cleaned

        # Expected target for every source position, in order. Removed
        # characters map to an empty sequence.
        expected_targets = [
            Sequence(0, 0),
            Sequence(0),
            Sequence(1),
            Sequence(2, 2),
            Sequence(2, 2),
            Sequence(2, 2),
            Sequence(2, 2),
            Sequence(2, 3),
            Sequence(3, 4),
            Sequence(4, 4),
        ]
        for position, target in enumerate(expected_targets):
            assert transfer.apply(Sequence(position)) == target
Example #15
0
 def test_iter(self):
     """Every item yielded by a Sequence equals Sequence(item.start)."""
     whole = Sequence(5, 8)
     for piece in whole:
         rebuilt = Sequence(piece.start)
         assert piece == rebuilt
Example #16
0
def to_lower(text: str) -> Tuple[str, SequenceTransfer]:
    """Lowercase ``text`` and describe the mapping as one whole-string match.

    :param text: input string
    :return: tuple of (lowercased text, transfer from ``text`` to it)
    """
    lowered = text.lower()
    source = Sequence(0, len(text))
    # str.lower() can change the length of a string (U+0130 lowercases to
    # "i" followed by U+0307), so the target span is sized from the returned
    # string and not from the input.
    target = Sequence(0, len(lowered))
    transfer = SequenceTransfer(source, target, [(source, target)])
    return lowered, transfer