def apply(self, source_sequence: Sequence, plugin: SequenceTransferPlugin = None) -> Sequence:
    """
    Transfer a sub-sequence of the source into the target.

    :param source_sequence: any sub-sequence of the source
    :param plugin: SequenceTransferPlugin instance; when given, the whole
        computation is delegated to ``plugin.apply(self, source_sequence)``
    :return: transferred sub-sequence in the target
    """
    if plugin is not None:
        return plugin.apply(self, source_sequence)

    source_sequence.raise_if_not_in(self._source)

    # Empty sequences sitting on either boundary of the source map to an
    # empty sequence on the matching boundary of the target.
    if source_sequence.size == 0:
        if source_sequence.start == self._source.start:
            target_head = self._index[0].start
            return Sequence(target_head, target_head)
        if source_sequence.stop == self._source.stop:
            target_tail = self._index[self._source.size - 1].stop
            return Sequence(target_tail, target_tail)

    # General case: span from the image of the first element to the image
    # of the last element.
    first_image = self._index[source_sequence.start]
    last_image = self._index[source_sequence.stop - 1]
    return Sequence.expand(first_image, last_image)
def _parallelize(self):
    """
    Pair every hole of the target with an empty sequence on the source.

    Walks the existing matches in order; whenever there is a gap in the
    target between two consecutive matches (or before the first one, or
    after the last one), an extra match with an empty source sequence is
    inserted for it.

    :return: new list of (source_sequence, target_sequence) tuples
    """
    filled = []
    previous_target = None

    for source_sequence, target_sequence in self._matches:
        if previous_target is None:
            # Before the first match, the reference point is the very
            # beginning of the target.
            previous_target = Sequence(self._target.start, self._target.start)

        # Hole located just before the current target sequence
        gap = Sequence.between(previous_target, target_sequence)
        if gap.size != 0:
            empty_source = Sequence(source_sequence.start, source_sequence.start)
            filled.append((empty_source, gap))

        # The existing match is kept untouched
        filled.append((source_sequence, target_sequence))
        previous_target = target_sequence

    # Hole located after the last match, up to the end of the target
    if previous_target is not None and previous_target.stop != self._target.stop:
        filled.append((
            Sequence(self._source.stop, self._source.stop),
            Sequence(previous_target.stop, self._target.stop),
        ))

    return filled
def test_creation(self):
    """Sequence(start, stop) keeps its bounds and refuses start > stop."""
    regular = Sequence(1, 2)
    assert (regular.start, regular.stop) == (1, 2)

    # Reversed bounds are rejected at construction time.
    with self.assertRaises(InvalidSequenceException):
        Sequence(2, 1)

    # Equal bounds are accepted.
    empty = Sequence(1, 1)
    assert (empty.start, empty.stop) == (1, 1)
def remove_html_entities(text: str) -> Tuple[str, SequenceTransfer]:
    """
    Unescape the html entities of a text and describe the character mapping.

    The output text is produced by ``html.unescape``. The transfer is built
    by scanning the input: every entity recognised by ``is_entity`` is
    mapped as a whole onto one output character, any other character is
    mapped one-to-one.

    :param text: text possibly containing html entities
    :return: tuple (unescaped text, transfer from ``text`` to the output)
    :raises Exception: when building the transfer raises
        SequenceNotInException (reported as an entity not ending with ";")
    """
    output = html.unescape(text)
    i = 0
    j = 0
    transfers = []
    try:
        while i < len(text):
            # By default one source character is consumed...
            next_i = i + 1
            if text[i] == "&":
                is_html, fin = is_entity(text, i)
                if is_html:
                    # ...unless a whole entity starts here, in which case
                    # everything up to its end is consumed at once.
                    next_i = fin
            # Either way exactly one character is produced in the output.
            transfers.append((Sequence(i, next_i), Sequence(j, j + 1)))
            i = next_i
            j += 1
        transfer = SequenceTransfer(Sequence(0, len(text)), Sequence(0, len(output)), transfers)
    except SequenceNotInException as error:
        # Keep the original failure attached as the explicit cause.
        raise Exception("Source has html entity not ending with ;") from error
    return output, transfer
def compose(*args: "SequenceTransfer") -> "SequenceTransfer":
    """
    Chain several transfers into a single one.

    Every source sequence of the first transfer's matches is pushed through
    all the transfers in order. Consecutive source sequences that end up on
    the same target sequence are merged into one match.

    :param args: transfers to compose, applied from first to last
    :return: transfer going from the source of the first transfer to the
        target of the last one
    :raises SequenceTransferException: when no transfer is given
    """
    transfers = list(args)
    if not transfers:
        # A single transfer is accepted, so the message must not claim
        # that two are required.
        raise SequenceTransferException(
            'compose requires at least one transfer')

    first_transfer = transfers[0]
    last_transfer = transfers[-1]

    matches = []
    for sequence, _ in first_transfer.matches:
        tmp = sequence
        for f in transfers:
            tmp = f.apply(tmp)

        if matches and matches[-1][1] == tmp:
            # Same image as the previous match: widen the previous source
            # sequence instead of adding a duplicate target.
            matches[-1] = (Sequence.expand(matches[-1][0], sequence), tmp)
        else:
            matches.append((sequence, tmp))

    return SequenceTransfer(first_transfer.source, last_transfer.target, matches)
def test_basics(self):
    """Check size, len(), iteration and equality of Sequence."""
    five_wide = Sequence(5, 10)
    assert five_wide.size == 5
    assert len(five_wide) == 5

    empty = Sequence(1, 1)
    assert empty.size == 0

    # Iterating yields consecutive unit sequences starting at parent.start.
    parent = Sequence(1, 4)
    for offset, child in enumerate(parent):
        assert child.start == parent.start + offset
        assert child.stop == child.start + 1

    # Two sequences built with the same bounds compare equal.
    left = Sequence(1, 4)
    right = Sequence(1, 4)
    assert left == right
def token_to_text_transfer(tokens: List[Token]):
    """
    Build the transfer from token positions to character positions.

    Token ``i`` is mapped to a character span whose width is ``len(token)``;
    one extra character is skipped between two consecutive tokens (none
    after the last one).

    :param tokens: list of tokens, each supporting ``len()``
    :return: SequenceTransfer from token indices to character offsets
    """
    last_position = len(tokens) - 1
    pairs = []
    cursor = 0
    for position, token in enumerate(tokens):
        span_end = cursor + len(token)
        pairs.append((
            Sequence(position, position + 1),
            Sequence(cursor, span_end),
        ))
        # Leave room for one joining character, except after the last token.
        cursor = span_end if position == last_position else span_end + 1

    return SequenceTransfer(
        Sequence(0, len(tokens)),
        Sequence(0, cursor),
        pairs,
    )
def apply(self, source_sequence: Sequence) -> Sequence:
    """
    Transfer a sub-sequence of the source into the target.

    :param source_sequence: any sub-sequence of the source
    :return: transferred sub-sequence in the target
    """
    source_sequence.raise_if_not_in(self._source)

    is_empty = source_sequence.size == 0

    # Empty sequence at the very beginning of the source
    if is_empty and source_sequence.start == self._source.start:
        target_head = self._index[0].start
        return Sequence(target_head, target_head)

    # Empty sequence at the very end of the source
    if is_empty and source_sequence.stop == self._source.stop:
        target_tail = self._index[self._source.size - 1].stop
        return Sequence(target_tail, target_tail)

    # Otherwise span from the image of the first element to the image of
    # the last one.
    return Sequence.expand(
        self._index[source_sequence.start],
        self._index[source_sequence.stop - 1],
    )
def remove_accents(text: str) -> Tuple[str, SequenceTransfer]:
    """
    Remove the accents of a text and describe the character mapping.

    Each character is decomposed with NFD normalization and the code points
    of category "Mn" are dropped. Each source character is mapped to the
    (possibly empty) span it produces in the output.

    :param text: text to process
    :return: tuple (text without accents, transfer from ``text`` to it)
    """
    pieces = []
    transfers = []
    cursor = 0  # current length of the output
    for i, char in enumerate(text):
        without_accent = "".join(
            code_point
            for code_point in unicodedata.normalize("NFD", char)
            if unicodedata.category(code_point) != "Mn"
        )
        span_end = cursor + len(without_accent)
        transfers.append((Sequence(i, i + 1), Sequence(cursor, span_end)))
        pieces.append(without_accent)
        cursor = span_end

    # Joining once avoids rebuilding the output string on every character.
    output = "".join(pieces)
    transfer = SequenceTransfer(Sequence(0, len(text)), Sequence(0, len(output)), transfers)
    return output, transfer
def lcs_transfer(x: str, y: str) -> SequenceTransfer:
    """
    Build a transfer from ``x`` to ``y`` based on a longest common
    subsequence table.

    The table of LCS lengths is filled first, then walked backwards from
    its bottom-right corner. At each step the largest of the three
    neighbouring cells (diagonal, left, up) decides the move, the diagonal
    being preferred, then left, then up.

    :param x: source text
    :param y: target text
    :return: SequenceTransfer from the characters of ``x`` to those of ``y``
    """
    rows = len(x)
    cols = len(y)

    # table[i][j] is the LCS length of x[:i] and y[:j]; row 0 and column 0
    # stay at 0.
    table = [[0] * (cols + 1) for _ in range(rows + 1)]
    for i in range(1, rows + 1):
        for j in range(1, cols + 1):
            if x[i - 1] == y[j - 1]:
                table[i][j] = table[i - 1][j - 1] + 1
            else:
                table[i][j] = max(table[i - 1][j], table[i][j - 1])

    pairs = []
    i, j = rows, cols
    while i > 0 and j > 0:
        diagonal = table[i - 1][j - 1]
        left = table[i][j - 1]
        up = table[i - 1][j]
        best = max(diagonal, left, up)

        if diagonal == best:
            # Source character paired with the target character
            pairs.append((Sequence(i - 1, i), Sequence(j - 1, j)))
            i -= 1
            j -= 1
        elif left == best:
            # Target character skipped: no match is recorded for it
            j -= 1
        else:
            # Source character mapped to an empty target sequence
            pairs.append((Sequence(i - 1, i), Sequence(j, j)))
            i -= 1

    # Source characters left over once the target is exhausted
    while i > 0:
        pairs.append((Sequence(i - 1, i), Sequence(j, j)))
        i -= 1

    # Matches were collected from the end; put them back in reading order.
    pairs.reverse()
    return SequenceTransfer(Sequence(0, rows), Sequence(0, cols), pairs)
def remove_whitespace(text: str) -> Tuple[str, SequenceTransfer]:
    """
    Drop the whitespace characters of a text and describe the mapping.

    Characters for which ``is_whitespace`` is true are mapped to an empty
    sequence at the current output position; the others are copied to the
    output and mapped to the position they land on.

    :param text: text to process
    :return: tuple (text without whitespace, transfer from ``text`` to it)
    """
    kept = []
    pairs = []
    for position, char in enumerate(text):
        cursor = len(kept)  # length of the output built so far
        if is_whitespace(char):
            pairs.append((Sequence(position), Sequence(cursor, cursor)))
        else:
            pairs.append((Sequence(position), Sequence(cursor)))
            kept.append(char)

    output = ''.join(kept)
    transfer = SequenceTransfer(
        Sequence(0, len(text)),
        Sequence(0, len(output)),
        pairs,
    )
    return output, transfer
def test_subsequence(self):
    """Indexing and slicing a Sequence give sub-sequences with parent offsets."""
    parent = Sequence(1, 4)

    # Integer index -> unit sequence at that offset
    for offset, (start, stop) in enumerate([(1, 2), (2, 3), (3, 4)]):
        child = parent[offset]
        assert child.start == start
        assert child.stop == stop

    # Integer indices outside [0, size) raise KeyError, negative included
    for bad_index in (-1, 3):
        with self.assertRaises(KeyError):
            _ = parent[bad_index]

    # Slice -> (expected start, expected stop)
    slice_cases = [
        (slice(0, 1), 1, 2),
        (slice(1, 2), 2, 3),
        (slice(2, 3), 3, 4),
        (slice(0, 3), 1, 4),
        (slice(-1, 3), 3, 4),
        (slice(None, -1), 1, 3),
    ]
    for key, start, stop in slice_cases:
        child = parent[key]
        assert child.start == start
        assert child.stop == stop
def remove_bert_separator(text: str) -> Tuple[str, SequenceTransfer]:
    """
    Remove the separators of a text and describe the character mapping.

    When ``is_separator`` reports a separator at a "#" character, the three
    characters starting there are mapped to a single output character (the
    third one). Any other character is copied and mapped one-to-one.

    :param text: text to process
    :return: tuple (text without separators, transfer from ``text`` to it)
    """
    pieces = []
    pairs = []
    read_at = 0
    write_at = 0
    total = len(text)

    while read_at < total:
        if text[read_at] == "#" and is_separator(text, read_at):
            consumed = 3
            kept_offset = 2  # only the third character is kept
        else:
            consumed = 1
            kept_offset = 0

        pairs.append((
            Sequence(read_at, read_at + consumed),
            Sequence(write_at, write_at + 1),
        ))
        pieces.append(text[read_at + kept_offset])
        read_at += consumed
        write_at += 1

    output = "".join(pieces)
    transfer = SequenceTransfer(
        Sequence(0, len(text)),
        Sequence(0, len(output)),
        pairs,
    )
    return output, transfer
def test_remove_whitespace(self):
    """remove_whitespace drops blanks and maps each source character."""
    original = " te \n\r\txt "
    stripped, transfer = remove_whitespace(original)

    for blank in (' ', '\n', '\r', '\t'):
        assert blank not in stripped

    # Expected image of each single-character source sequence, by position.
    expected_images = [
        Sequence(0, 0),
        Sequence(0),
        Sequence(1),
        Sequence(2, 2),
        Sequence(2, 2),
        Sequence(2, 2),
        Sequence(2, 2),
        Sequence(2, 3),
        Sequence(3, 4),
        Sequence(4, 4),
    ]
    for position, expected in enumerate(expected_images):
        assert transfer.apply(Sequence(position)) == expected
def test_iter(self):
    """Every item yielded by a Sequence equals Sequence(item.start)."""
    parent = Sequence(5, 8)
    assert all(child == Sequence(child.start) for child in parent)
def to_lower(text: str) -> Tuple[str, SequenceTransfer]:
    """
    Lowercase a text and describe the character mapping.

    When lowercasing keeps the length of the text, the whole source is
    mapped to the whole target with a single match. ``str.lower`` may
    however produce more characters than it consumes (for instance U+0130
    becomes two code points); in that case each source character is mapped
    to the span its lowercase form occupies, so that the target covers the
    whole output.

    :param text: text to process
    :return: tuple (lowercased text, transfer from ``text`` to it)
    """
    lowered = text.lower()
    source = Sequence(0, len(text))

    if len(lowered) == len(text):
        return lowered, SequenceTransfer(source, source, [(source, source)])

    # Length changed: fall back to a per-character mapping.
    matches = []
    cursor = 0
    for i, char in enumerate(text):
        width = len(char.lower())
        matches.append((Sequence(i, i + 1), Sequence(cursor, cursor + width)))
        cursor += width

    return lowered, SequenceTransfer(source, Sequence(0, len(lowered)), matches)