def test__delimiter__not_found(self):
    """Check that get_line_piece returns the whole text when the delimiter is absent.

    The text contains no '~', so every requested position (-1, 0, 1, 2)
    is expected to yield the unmodified input.
    """
    source = 'Vodka&Balalayka&Big bear'
    absent_delimiter = '~'
    for position in (-1, 0, 1, 2):
        assert get_line_piece(source, absent_delimiter, position) == source
def test__no_delimiter(self):
    """Check that get_line_piece returns the whole text when the delimiter is None.

    Every requested position (-1, 0, 1, 2) is expected to yield the
    unmodified input.
    """
    source = 'Line about nine'
    no_delimiter = None
    for position in (-1, 0, 1, 2):
        assert get_line_piece(source, no_delimiter, position) == source
def process_line(self, line: str) -> Optional[str]:
    """Filter a line by the length of its selected piece.

    The piece is taken with
    ``get_line_piece(line, self.delimiter, self.delimited_position)``.
    Only that piece is measured; the value handed back is the full,
    untouched ``line``.

    Returns:
        ``None`` when the piece is shorter than ``self.threshold``,
        otherwise ``line``.
    """
    piece = get_line_piece(line, self.delimiter, self.delimited_position)
    # The length check applies to the piece, but the whole line is returned.
    return None if len(piece) < self.threshold else line
def process_line(self, line: str) -> Optional[str]:
    """Filter a line by the top prediction made for its selected piece.

    The piece is taken with
    ``get_line_piece(line, self.delimiter, self.delimited_position)`` and
    passed to ``self.ft.predict(..., k=1)``.
    NOTE(review): ``self.ft`` is presumably a fastText model returning
    ``(labels, scores)`` - confirm against where it is constructed.

    Returns:
        ``None`` when the first score is below ``self.threshold`` or when
        the first label, with ``'__label__'`` removed, differs from
        ``self.language_code``; otherwise the full, untouched ``line``.
    """
    piece = get_line_piece(line, self.delimiter, self.delimited_position)
    # TODO: `piece = piece.lower()` ?
    prediction = self.ft.predict(piece, k=1)

    # Second element holds the scores; reject low-confidence predictions first.
    top_score = prediction[1][0]
    if top_score < self.threshold:
        return None

    # First element holds the labels; drop the label marker before comparing.
    detected = prediction[0][0].replace('__label__', '')
    return None if detected != self.language_code else line
def test__delimiter__found(self):
    """Check that get_line_piece picks the requested piece when the delimiter occurs.

    With '~' as the delimiter, position -1 is expected to give the last
    piece and position 0 the first piece.
    """
    source = 'Column 1~Column #2~Description'
    separator = '~'
    expectations = (
        (-1, 'Description'),
        (0, 'Column 1'),
    )
    for position, expected in expectations:
        assert get_line_piece(source, separator, position) == expected