def split_over_length(word):
    """Return every prefix/remainder split of *word*, counted in graphemes.

    For each ``n`` from 1 up to the number of grapheme clusters in ``word``
    the result contains one tuple ``(first n clusters, remaining clusters)``.
    An empty word yields an empty list.
    """
    # Count the clusters once; the value is reused for the range bound and
    # as the end of every remainder slice.
    total = grapheme.length(word)
    return [
        (grapheme.slice(word, 0, n), grapheme.slice(word, n, total))
        for n in range(1, total + 1)
    ]
def geometric_split(word, prob):
    """Split *word* at randomly drawn grapheme positions.

    ``len(word)`` samples are drawn from a geometric distribution with
    success probability ``prob``.  Duplicate draws are discarded and one
    ``(head, tail)`` tuple is produced per distinct value, in the iteration
    order of the resulting set.
    """
    # NOTE(review): the number of samples is len(word) (code points) while the
    # cut positions are applied as grapheme offsets -- confirm whether
    # grapheme.length(word) was intended.  Kept as-is to preserve the draws.
    cut_points = set(np.random.geometric(prob, size=len(word)))

    # The grapheme count does not change between cuts, so compute it once.
    total = grapheme.length(word)
    return [
        (grapheme.slice(word, 0, cut), grapheme.slice(word, cut, total))
        for cut in cut_points
    ]
def __getitem__(self, key):
    """Index or slice this string by grapheme cluster.

    key -- an ``int`` index or a ``slice``; any other key type is delegated
           to the parent class.

    Integer and slice results are returned as instances of this class.
    Negative positions and non-unit steps are resolved against the full list
    of grapheme clusters; everything else goes through ``grapheme.slice``.
    """
    text = str(self)

    if isinstance(key, int):
        if key <= 0:
            # List indexing handles negative positions and raises IndexError
            # for index 0 on an empty string.
            return self.__class__(list(grapheme.graphemes(text))[key])
        # NOTE(review): a positive index past the end produces an empty
        # instance here instead of raising IndexError -- confirm intent.
        return self.__class__(grapheme.slice(text, key, key + 1))

    if isinstance(key, slice):
        negative_bound = (
            (key.start is not None and key.start < 0)
            or (key.stop is not None and key.stop < 0)
        )
        # An omitted step is None, which means "step 1".  The previous test
        # ``key.step != 1`` treated None as a custom step, so ordinary
        # slices never reached the grapheme.slice path below.
        custom_step = key.step not in (None, 1)
        if negative_bound or custom_step:
            clusters = list(grapheme.graphemes(text))
            return self.__class__("".join(clusters[key.start:key.stop:key.step]))
        return self.__class__(grapheme.slice(text, key.start, key.stop))

    return super().__getitem__(key)
def generate_splits(self, word):
    '''
    Enumerate every (stem, suffix) split of a word, by grapheme position

    Parameter:
        word = Word to be split
    Output:
        splits = List of (stem, suffix) tuples, one per cut position from 0
                 up to and including the grapheme length of the word; an
                 empty stem or suffix is replaced by '$'
    '''
    def or_marker(part):
        # An empty side of the split is represented by the '$' placeholder.
        return part if part else '$'

    return [
        (or_marker(grapheme.slice(word, 0, cut)), or_marker(grapheme.slice(word, cut)))
        for cut in range(grapheme.length(word) + 1)
    ]
def preview(html: str, min: int = 50, max: int = 158) -> str:  # NOQA: A002
    """
    Build a plain-text preview of a HTML blob, for use as a description tag.

    Text is taken one block at a time (as produced by ``text_blocks``) so that
    sentences are not cut where avoidable. Further blocks are appended, separated
    by a single space, while the preview is shorter than ``min`` graphemes. If the
    result is longer than ``max`` graphemes it is cut to ``max - 1`` graphemes and
    an ellipsis is appended. An input without any text blocks gives ``''``.

    :param str html: HTML text to generate a preview from
    :param int min: Minimum number of characters in the preview (default 50)
    :param int max: Maximum number of characters in the preview (default 158,
        recommended for Google)
    """
    # Grapheme counting is capped at one past the larger of the two bounds, so a
    # huge blob is never counted in full. `max` may be smaller than `min` when a
    # caller overrides only `max`.
    count_limit = (min if min >= max else max) + 1

    blocks = text_blocks(html)
    if not blocks:
        return ''

    text = compress_whitespace(blocks.pop(0))
    length = grapheme.length(text, count_limit)
    while blocks and length < min:
        text += ' ' + compress_whitespace(blocks.pop(0))
        length = grapheme.length(text, count_limit)

    if length <= max:
        return text
    return grapheme.slice(text, 0, max - 1) + '…'
def generate_pairs(lines):
    """Yield ``(emoji, name)`` pairs parsed from an iterable of text lines.

    Blank lines and lines starting with ``#`` are skipped. Every other line
    must contain a ``#``; the text after the first one is stripped, its first
    grapheme is yielded as the emoji and everything from the third grapheme
    onwards as the name.
    """
    for raw in lines:
        entry = raw.strip()
        if not entry or entry.startswith('#'):
            continue

        # Data lines are laid out as:
        #   CodePoints ; Status # Emoji EmojiName
        # so only the part behind the first '#' is of interest.
        _codepoints, trailer = entry.split('#', 1)
        trailer = trailer.strip()

        # Slicing by grapheme keeps a multi-code-point emoji in one piece.
        yield grapheme.slice(trailer, end=1), grapheme.slice(trailer, start=2)
def list_of_aligned_words(sym_lst):
    """Transpose a list of symbol strings into a list of aligned words.

    The i-th result string is the concatenation of the i-th grapheme of every
    item in ``sym_lst``. The number of results is the grapheme length of the
    first item. An empty ``sym_lst`` gives an empty list.
    """
    if not sym_lst:
        return []
    width = grapheme.length(sym_lst[0])
    return [
        "".join(grapheme.slice(sym, start=pos, end=pos + 1) for sym in sym_lst)
        for pos in range(width)
    ]
def generate_splits(self, no_of_splits, tokens):
    """Sample random (stem, suffix) splits for the first 100 tokens.

    For each of the first 100 entries of ``tokens``, ``no_of_splits`` cut
    positions are drawn from a geometric distribution (p=0.5) and the token
    is split there by grapheme position. An empty stem or suffix is replaced
    by '$'. The total count and the first five splits are printed before the
    list of splits is returned.
    """
    sampled = []
    for token in tokens[:100]:
        for _ in range(no_of_splits):
            # One geometric draw per requested split.
            cut = np.random.geometric(p=0.5)
            stem = grapheme.slice(token, 0, cut) or '$'
            suffix = grapheme.slice(token, cut) or '$'
            sampled.append((stem, suffix))

    print('Total data:', len(sampled))
    print('Data Sample \n', sampled[:5])
    return sampled
def test_mixed_text(self):
    """Check iteration, slicing and length on text mixing a two-code-point
    emoji sequence, ASCII letters and a trailing CR LF pair."""
    emoji = "\U0001F476\U0001F3FB"
    text = " \U0001F476\U0001F3FB ascii \u000D\u000A"
    expected = [" ", emoji, " ", *"ascii", " ", text[-2:]]

    self.assertEqual(list(grapheme.graphemes(text)), expected)
    self.assertEqual(list(grapheme.grapheme_lengths(text)), list(map(len, expected)))

    # (positional args, keyword args, expected slice)
    slice_cases = [
        ((0, 2), {}, " " + emoji),
        ((0, 3), {}, " " + emoji + " "),
        ((), {"end": 3}, " " + emoji + " "),
        ((1, 4), {}, emoji + " a"),
        ((2,), {}, text[3:]),
        ((2, 4), {}, " a"),
    ]
    for args, kwargs, wanted in slice_cases:
        self.assertEqual(grapheme.slice(text, *args, **kwargs), wanted)

    self.assertEqual(grapheme.length(text), 10)
    # (until, expected length); counting stops at ``until`` or at the end.
    for until, wanted in [(0, 0), (1, 1), (4, 4), (10, 10), (11, 10)]:
        self.assertEqual(grapheme.length(text, until=until), wanted)
def fold_lines_iter(
        lines_it: Iterable[str],
        width: int,
        max_removal: int = 14,
        separate_by_spaces: bool = False) -> Iterator[str]:
    """Yield each input line folded into pieces of roughly ``width`` columns.

    Every line is right-stripped first; a line that is empty after stripping
    is yielded as ``''``. Otherwise pieces are cut from the line one after
    another. Each piece starts at ``width // 2`` units and is grown until
    ``wcswidth`` of the piece reaches ``width`` or the end of the line is near.

    lines_it -- iterable of text lines
    width -- target display width of a piece; asserted to be at least 16
    max_removal -- upper bound on how many trailing characters may be dropped
        from a piece to end it on whitespace; asserted to be below ``width``
    separate_by_spaces -- when true, a piece that is not the last one and does
        not end in whitespace is shortened to end just after the nearest
        whitespace found within its last ``max_removal`` characters

    NOTE(review): ``slice`` is called with three arguments and its result is
    used as a string, so it is not the builtin -- presumably
    ``grapheme.slice``; ``wcswidth`` is presumably ``wcwidth.wcswidth``.
    Neither is defined in this block -- TODO confirm.
    """
    assert width >= 16
    assert max_removal < width
    w2 = width // 2  # initial piece length, grown below as needed
    for L in lines_it:
        L = L.rstrip()
        if not L:
            yield ''
            continue  # for L
        # NOTE(review): the bound is len(L) - 1, so a stripped line of exactly
        # one character never enters the loop below and yields nothing --
        # confirm this is intended.
        len_L_1 = len(L) - 1
        idx = 0
        while idx < len_L_1:
            c = w2
            s = slice(L, idx, idx + c)
            sl = wcswidth(s)
            # Grow the piece by half of the still-missing width (at least 1)
            # until it is wide enough or reaches the end bound.
            while idx + c < len_L_1 and sl < width:
                c += ((width - sl) // 2) or 1
                s = slice(L, idx, idx + c)
                sl = wcswidth(s)
            if separate_by_spaces and idx + c < len_L_1 and not s[-1].isspace():
                # Search backwards from the end of the piece for whitespace;
                # rc == 0 cannot match because s[-1] was just checked.
                max_rc = min(max_removal, len(s) - 1)
                for rc in range(0, max_rc):
                    if s[-1 - rc].isspace():
                        s = s[:-rc]  # drop the rc characters after the whitespace
                        break  # for rc
            assert s
            yield s
            # NOTE(review): idx advances by len(s) (code points) while it is
            # also passed to ``slice`` as an offset -- verify both use the
            # same unit.
            idx += len(s)
def list_of_aligned_words(mphon_lst):
    """Turn a list of morphophonemes into the list of aligned words

    mphon_lst -- list of same length morphophonemes, e.g.
    ["lll", "ooo", "vvv", "ieØ"]

    Returns the words formed by taking the 1st, 2nd, ... symbol (grapheme) of
    every morphophoneme in turn, e.g.
    ["lll", "ooo", "vvv", "ieØ"] --> ["lovi", "love", "lovØ"]

    The number of words is the grapheme length of the first morphophoneme;
    an empty list gives an empty list.
    """
    if not mphon_lst:
        return []

    def word_at(pos):
        # Concatenate the pos-th symbol of each morphophoneme.
        return "".join(
            grapheme.slice(mphon, start=pos, end=pos + 1) for mphon in mphon_lst
        )

    word_count = grapheme.length(mphon_lst[0])
    return [word_at(pos) for pos in range(word_count)]
def print_result(aligned_result, comments, weights, layout="horizontal"):
    """Prints the result of the alignment in one of three layouts

    aligned_result -- tuple of the overall weight and a list of aligned
    (zero-filled) words

    comments -- accepted for interface compatibility; not used here

    weights -- if true, the "horizontal" layout also prints the overall
    weight of this alignment after the morphophonemes

    layout -- one of "horizontal" (a sequence of morphophonemes on a single
    line), "vertical" (each zero-filled word on a line of its own, followed
    by an empty line) or "list" (all zero-filled words on a single line).
    Any other value prints nothing.

    When cfg.verbosity is 10 or more the raw aligned_result is printed first.
    """
    weight, aligned_words_lst = aligned_result
    if cfg.verbosity >= 10:
        print("aligned_result", aligned_result)

    if layout == "horizontal":
        # An empty alignment has no columns: print an empty line, as the
        # other layouts do, instead of failing on aligned_words_lst[0].
        lgth = grapheme.length(aligned_words_lst[0]) if aligned_words_lst else 0
        mphon_lst = []
        for i in range(lgth):
            # The i-th symbol of every aligned word forms one morphophoneme.
            lst = [grapheme.slice(aligned_word, start=i, end=i+1)
                   for aligned_word in aligned_words_lst]
            if len(set(lst)) == 1:
                mphon_str = lst[0]  # abbreviate if all identical
            else:
                mphon_str = "".join(lst)
            mphon_lst.append(mphon_str)
        mphonemic_str = " ".join(mphon_lst)
        if weights:
            print(mphonemic_str.ljust(40), weight)
        else:
            print(mphonemic_str)
    elif layout == "vertical":
        print("\n".join(aligned_words_lst))
        print()
    elif layout == "list":
        print(" ".join(aligned_words_lst))