def __getitem__(self, key):
    """Index or slice this string by grapheme clusters, not code points.

    An integer key addresses the key-th grapheme cluster; a slice
    addresses a range of clusters.  Any other key type is delegated to
    the superclass implementation.
    """
    if isinstance(key, int):
        if key < 0:
            # Negative index: grapheme.slice cannot count from the end,
            # so materialise the full cluster list and index into it.
            g_list = list(grapheme.graphemes(str(self)))
            return self.__class__(g_list[key])
        # Non-negative index: take exactly one cluster without
        # materialising the whole list.
        return self.__class__(grapheme.slice(str(self), key, key + 1))
    elif isinstance(key, slice):
        # BUG FIX: the original compared `key.step != 1`, but a plain
        # slice like s[1:3] has step None, so *every* ordinary slice
        # took the slow list path — and that path returned a plain str
        # instead of self.__class__.  Treat step None the same as 1 and
        # wrap the slow-path result for a consistent return type.
        if ((key.start is not None and key.start < 0)
                or (key.stop is not None and key.stop < 0)
                or (key.step is not None and key.step != 1)):
            g_list = list(grapheme.graphemes(str(self)))
            return self.__class__(
                "".join(g_list[key.start:key.stop:key.step]))
        return self.__class__(grapheme.slice(str(self), key.start, key.stop))
    else:
        return super().__getitem__(key)
def mphon_weight(mphon):
    """Computes a weight for a raw morphophoneme"""
    global vowels, consonants, mphon_separator, weight_cache
    # Serve repeated queries straight from the cache.
    if mphon in weight_cache:
        return weight_cache[mphon]
    # Split the raw morphophoneme into its member phonemes: either by the
    # configured separator or, when none is set, by grapheme clusters.
    if mphon_separator:
        phon_list = mphon.split(mphon_separator)
    else:
        phon_list = grapheme.graphemes(mphon)
    phon_set = set(phon_list)
    if cfg.verbosity >= 30:
        print("phon_set =", phon_set)
    if phon_set == {"Ø"}:
        # all-zero morphophonemes must be allowed
        weight = cfg.all_zero_weight
    elif len(phon_set) == 1:
        # A single repeated phoneme costs nothing.
        weight = 0.0
    elif phon_set <= consonants:
        weight = cons_set_weight(phon_set)
    elif phon_set <= vowels:
        weight = vowel_set_weight(phon_set)
    else:
        # Mixed vowel/consonant sets are effectively forbidden.
        weight = 1000000.0
    weight_cache[mphon] = weight
    if cfg.verbosity >= 35:
        print("mphon:", mphon, "weight:", weight)
    return weight
def create_from_source(name: str, source: Iterable[str],
                       morpheme_delimiter: str, end_of_morpheme_symbol: str,
                       padding_symbol: str,
                       blacklist_char: str) -> "Alphabet":
    """Build an Alphabet from the grapheme clusters found in *source*.

    Whitespace (Unicode category Z*), control characters (category C*),
    the morpheme delimiter, the end-of-morpheme symbol and the blacklist
    character are excluded.  Any surviving symbol that still contains a
    whitespace or unexpected control code point is reported via a warning.
    """
    excluded = {morpheme_delimiter, end_of_morpheme_symbol, blacklist_char}
    alphabet_set: Set[str] = {
        character
        for line in source
        for character in grapheme.graphemes(line.strip())
        if unicodedata.category(character)[0] not in ("Z", "C")
        and character not in excluded
    }
    # Sanity-check each accepted symbol code point by code point.
    for symbol in alphabet_set:
        for character in symbol:
            category: str = unicodedata.category(character)
            if category[0] == "Z":
                logging.warning(
                    f"WARNING - alphabet contains whitespace character:\t{Alphabet.unicode_info(symbol)}"
                )
            elif (category[0] == "C" and character != morpheme_delimiter
                  and character != end_of_morpheme_symbol):
                logging.warning(
                    f"WARNING - alphabet contains control character:\t{Alphabet.unicode_info(symbol)}"
                )
    return Alphabet(name=name,
                    symbols=alphabet_set,
                    end_of_morpheme_symbol=end_of_morpheme_symbol,
                    padding_symbol=padding_symbol)
def predict(self, word: str) -> typing.List[str]: """Predict phonemes for the given word""" # encoder graphemes = list(grapheme.graphemes(word)) enc = self._encode(graphemes) enc = _gru( enc, len(graphemes) + 1, self.enc_w_ih, self.enc_w_hh, self.enc_b_ih, self.enc_b_hh, h0=np.zeros((1, self.enc_w_hh.shape[-1]), np.float32), ) last_hidden = enc[:, -1, :] # decoder dec = np.take(self.dec_emb, [2], axis=0) # 2: <s> h = last_hidden preds = [] for _ in range(self.dec_maxlen): h = _grucell(dec, h, self.dec_w_ih, self.dec_w_hh, self.dec_b_ih, self.dec_b_hh) # (b, h) logits = np.matmul(h, self.fc_w.T) + self.fc_b pred = logits.argmax() if pred == self.eos_idx: break # </s> preds.append(pred) dec = np.take(self.dec_emb, [pred], axis=0) preds = [self.phonemes[idx] for idx in preds] return preds
def test_mixed_text(self):
    """Clusters, slicing and length on text mixing emoji, ASCII and CR LF."""
    input_str = " \U0001F476\U0001F3FB ascii \u000D\u000A"
    # Expected cluster list: the emoji+modifier and CR LF each form one.
    graphemes = [
        " ", "\U0001F476\U0001F3FB", " ", "a", "s", "c", "i", "i", " ",
        input_str[-2:]
    ]
    self.assertEqual(list(grapheme.graphemes(input_str)), graphemes)
    self.assertEqual(list(grapheme.grapheme_lengths(input_str)),
                     [len(g) for g in graphemes])
    self.assertEqual(grapheme.slice(input_str, 0, 2), " \U0001F476\U0001F3FB")
    self.assertEqual(grapheme.slice(input_str, 0, 3), " \U0001F476\U0001F3FB ")
    self.assertEqual(grapheme.slice(input_str, end=3), " \U0001F476\U0001F3FB ")
    self.assertEqual(grapheme.slice(input_str, 1, 4), "\U0001F476\U0001F3FB a")
    self.assertEqual(grapheme.slice(input_str, 2), input_str[3:])
    self.assertEqual(grapheme.slice(input_str, 2, 4), " a")
    self.assertEqual(grapheme.length(input_str), 10)
    # `until` caps the count; anything past the real length saturates.
    for until, expected in [(0, 0), (1, 1), (4, 4), (10, 10), (11, 10)]:
        self.assertEqual(grapheme.length(input_str, until=until), expected)
def text_to_chars(text):
    '''
    Takes as input Tibetan text, and creates a list of individual characters.
    '''
    # One grapheme cluster per "character", so stacked glyphs stay intact.
    return list(graphemes(text))
def helper():
    """Yield emoji-like grapheme clusters found in the enclosing *messages*.

    A cluster is yielded when it is a known emoji or spans more than one
    code point (e.g. an emoji with a skin-tone modifier).  Messages given
    as lists of fragments are joined into one string first.
    """
    for message in messages:
        if isinstance(message, list):
            message = ''.join(message)
        try:
            for grapheme_cluster in grapheme.graphemes(message):
                if (grapheme_cluster in emoji.UNICODE_EMOJI_ENGLISH
                        or len(grapheme_cluster) > 1):
                    yield grapheme_cluster
        except Exception:  # pylint: disable=broad-except
            # BUG FIX: was a bare `except:` which also swallows
            # SystemExit/KeyboardInterrupt.  Keep the deliberate
            # best-effort behaviour (skip unparsable messages) but only
            # for ordinary exceptions.
            pass
def string_to_fsa(grapheme_string):
    """Return a FSA which accepts the sequence of graphemes in the string"""
    clusters = list(grapheme.graphemes(grapheme_string))
    # Identity pairs: each grapheme maps to itself along a single path.
    pair_path = tuple((c, c) for c in clusters)
    if cfg.verbosity >= 10:
        print(clusters)
        print(pair_path)
    basic = hfst.HfstBasicTransducer()
    basic.disjunct(pair_path, 0)
    return (hfst.HfstTransducer(basic))
def text(update, context):
    """Handle an incoming text message: emoji-only texts get special treatment."""
    text_received = update.message.text
    # Hack to check if all graphemes are emojis
    cluster_count = len(list(grapheme.graphemes(text_received)))
    if emojis.count(text_received) == cluster_count:
        print("emojis:", text_received)
        update.message.reply_text(f'uwu {text_received}')
        SSEFuckery.sse_broadcast("emojis", text_received)
        return
    # Anything else is treated as scrolling marquee text.
    print("scrolly-text:", text_received)
    update.message.reply_text(f'auzi cica >{text_received}')
    SSEFuckery.sse_broadcast("scrolly-text", text_received)
def test_contains(self):
    """grapheme.contains matches on cluster boundaries, not raw substrings."""
    input_str = " \U0001F476\U0001F3FB ascii \u000D\u000A"
    # Partial clusters are NOT contained, even though they are substrings.
    self.assertFalse(grapheme.contains(input_str, " \U0001F476"))
    self.assertFalse(grapheme.contains(input_str, "\u000D"))
    self.assertFalse(grapheme.contains(input_str, "\U0001F3FB"))
    # The empty string is trivially contained.
    self.assertTrue(grapheme.contains(input_str, ""))
    clusters = list(grapheme.graphemes(input_str))
    for cluster in clusters:
        self.assertTrue(grapheme.contains(input_str, cluster))
    # Every adjacent pair of whole clusters is contained as well.
    for left, right in zip(clusters, clusters[1:]):
        self.assertTrue(grapheme.contains(input_str, left + right))
def mphon_is_valid(mphon):
    """Tests if a raw morphophoneme is all consonants or all vowels"""
    global vowels, consonants, mphon_separator
    # Break the morphophoneme into its phonemes: by the separator when
    # one is configured, otherwise by grapheme clusters.
    if mphon_separator:
        phon_set = set(mphon.split(mphon_separator))
    else:
        phon_set = set(grapheme.graphemes(mphon))
    # Valid iff the phonemes form a subset of one of the two classes.
    return phon_set <= vowels or phon_set <= consonants
def render_keys(self):
    """Send self.body as simulated key presses via pyautogui.

    A '^' grapheme starts a Ctrl chord: the following grapheme is pressed
    with Ctrl held.  Bodies that start with '#' are typed verbatim in one
    call.  Unless the body ends with '<<', a final Enter is pressed.
    """
    need_more = False  # True after seeing '^': next grapheme completes the chord
    previous = ""
    if self.body.strip().startswith("#"):
        # Comment-like bodies are typed as-is, quickly (5 ms per key).
        pa.typewrite(self.body, interval=0.005)
    else:
        for gr in grapheme.graphemes(self.body):
            if need_more and previous == "^":
                # Second half of a '^X' chord: press X while holding Ctrl.
                pa.keyDown('ctrl')
                pa.press(gr)
                pa.keyUp('ctrl')
                need_more = False
                previous = ""
                continue
            if gr in ["^"]:
                # Start of a Ctrl chord; remember it and wait for the key.
                need_more = True
                previous = gr
                continue
            pa.press(gr)
            # Random 1-49 ms delay to mimic human typing cadence.
            time.sleep(random.choice(range(1, 50)) / 1000.0)
    # NOTE(review): reconstructed indentation places this at function level
    # (applies to both branches) — confirm against the original layout.
    if not self.body.strip().endswith("<<"):
        pa.press("enter")
def reverse_string(string: str) -> str:
    """reverse string."""
    # Mirror-image translation table, applied after reversal so that
    # directional punctuation still "points" the right way.
    # NOTE(review): several keys appear twice (",", "?", parentheses and
    # brackets) — presumably the duplicates were originally fullwidth or
    # other alternate Unicode forms that got normalised somewhere in
    # transit; in a dict literal the later entry silently wins.  Verify
    # against the original file's encoding before deduplicating.
    trans = str.maketrans(
        {
            ",": "،",
            ",": "،",
            "?": "¿",
            "?": "¿",
            "(": ")",
            ")": "(",
            "(": ")",
            ")": "(",
            "《": "》",
            "》": "《",
            "«": "»",
            "»": "«",
            "/": "\\",
            "\\": "/",
            "“": "”",
            "”": "“",
            ">": "<",
            "<": ">",
            "〔": "〕",
            "〕": "〔",
            "[": "]",
            "]": "[",
            "{": "}",
            "}": "{",
            "「": "」",
            "」": "「",
            "【": "】",
            "】": "【",
            "[": "]",
            "]": "[",
        }
    )
    # Reverse by grapheme cluster (not code point) so emoji and combining
    # sequences stay intact, then apply the mirroring table.
    return "".join(reversed(list(grapheme.graphemes(string)))).translate(trans)
def _fix_graphemes(text):
    """
    Extract long graphemes sequences that can't be handled by pyte correctly
    because of the bug pyte#131.

    Graphemes are omited and replaced with placeholders, and returned as a list.

    Return: text_without_graphemes, graphemes
    """
    pieces = []
    graphemes = []
    for gra in grapheme.graphemes(text):
        if len(gra) > 1:
            # Multi-code-point cluster: substitute a placeholder and keep
            # the original so the caller can restore it later.
            graphemes.append(gra)
            pieces.append("!")
        else:
            pieces.append(gra)
    # IMPROVEMENT: build via list + join instead of repeated `output +=`,
    # which is quadratic in the worst case.
    return "".join(pieces), graphemes
def split_graphemes(text):
    """Return the grapheme clusters of *text* as a tuple."""
    # Import kept local, matching the original's lazy-import style.
    from grapheme import graphemes
    clusters = graphemes(text)
    return tuple(clusters)
def to_sort_list(q: str):
    """Normalize, filter and convert the clusters of *q*, sorted."""
    # NFKD: compatibility decomposed form, so comparisons operate on base
    # characters plus combining marks.
    decomposed = unicodedata.normalize('NFKD', q)
    clusters = grapheme.graphemes(decomposed)
    return sorted(convert(c) for c in clusters if filter(c))
def generate_message(msg, mapping):
    """Yield each grapheme of *msg* mapped and wrapped, or unchanged if unmapped."""
    for cluster in grapheme.graphemes(msg):
        try:
            mapped = mapping[cluster]
        except KeyError:
            # No mapping for this cluster: pass it through untouched.
            yield cluster
        else:
            yield '{l}{m}{r}'.format(m=mapped, l=l_wrap, r=r_wrap)
def prep_string_Arab(string):
    """Strip whitespace and wrap graphemes in zero-width joiners for shaping.

    Each inner grapheme is surrounded by U+200D (zero-width joiner) so it
    renders in its medial form; the first grapheme gets only a trailing
    joiner (initial form) and the last only a leading one (final form).
    """
    g = list(graphemes(re.sub(r"\s", "", string)))
    # Robustness: inputs with fewer than two graphemes previously raised
    # IndexError on g[1].
    if not g:
        return []
    if len(g) == 1:
        return [g[0]]
    # BUG FIX: the last list element used g[1] (the *second* grapheme)
    # instead of g[-1] (the last one).
    return ([g[0] + "\u200d"]
            + ["\u200d" + c + "\u200d" for c in g[1:-1]]
            + ["\u200d" + g[-1]])
def test_simple(self):
    """Plain ASCII: every code point is its own grapheme cluster."""
    word = "alvin"
    self.assertEqual(list(grapheme.graphemes(word)), list(word))
# STEP 3: # Compute the zero filled morphs out of the sequences of aligned symbols aligned_morphs = {} """index: (morpheme, morph), value: zero-filled morph """ for morpheme, aligned_sym_seq in alignments.items(): # e.g. "KOTA", ['kkkk', 'oooo', 'tdtd', 'aaØØ'] if args.verbosity >= 25: print("aligned_sym_seq:", aligned_sym_seq) if morpheme not in aligned_morphs: aligned_morphs[morpheme] = collections.OrderedDict() if aligned_sym_seq: aligned_vec_seq = [ tuple(grapheme.graphemes(aligned_sym)) for aligned_sym in aligned_sym_seq ] l = len(aligned_vec_seq[0]) zero_filled_morphs = [ "".join([x[i] for x in aligned_vec_seq]) for i in range(0, l) ] original_morphs = [x.replace("Ø", "") for x in zero_filled_morphs] ########## for origm, zerofm in zip(original_morphs, zero_filled_morphs): #if origm: # aligned_morphs[morpheme][origm] = zerofm aligned_morphs[morpheme][origm] = zerofm else: aligned_morphs[morpheme] = {"": ""} if args.verbosity >= 20:
def main(
    max_characters: int,
    max_morphemes: int,
    alphabet_file: str,
    end_of_morpheme_symbol: str,
    morpheme_delimiter: str,
    input_file: str,
    output_file: str,
    verbose: int,
    blacklist_char: str,
) -> None:
    """Convert morpheme-delimited words into tensor product representations.

    Reads a pickled Alphabet from *alphabet_file*, processes every word of
    *input_file* ("-" means stdin), and pickles a dict mapping each
    morpheme to its tensor into the gzipped *output_file*.  Morphemes that
    are empty, too long, or fail processing are skipped with a log entry.
    """
    import pickle
    if grapheme.length(end_of_morpheme_symbol) != 1:
        raise RuntimeError(
            "The end of morpheme symbol must consist of a single grapheme cluster "
            + "(see Unicode Standard Annex #29).")
    with open(alphabet_file, "rb") as f:
        alphabet: Alphabet = pickle.load(f)
    with (sys.stdin if input_file == "-" else open(input_file)) as input_source:
        with gzip.open(output_file, "wb") as output:
            characters_dimension: Dimension = Dimension(
                "characters", max_characters)
            # Kept for parity with the original; currently unused below.
            morphemes_dimension: Dimension = Dimension("morphemes",
                                                       max_morphemes)
            tpr: TensorProductRepresentation = TensorProductRepresentation(
                alphabet=alphabet, characters_dimension=characters_dimension)
            result: Dict[str, torch.Tensor] = {}
            skipped_morphemes: Set[str] = set()
            for number, line in enumerate(input_source):
                logging.debug(f"Processing line {number}\t{line.strip()}")
                for word in line.strip().split():
                    if blacklist_char in word:
                        logging.info(f"Skipping unanalyzed word {word}")
                    elif word not in result:
                        # Warn about grapheme clusters outside the alphabet.
                        for character in grapheme.graphemes(word):
                            if character not in alphabet and character != morpheme_delimiter and character != end_of_morpheme_symbol:
                                logging.warning(
                                    f"WARNING - not in alphabet:\t{Alphabet.unicode_info(character)}"
                                )
                        morphemes = word.split(morpheme_delimiter)
                        for morpheme in morphemes:
                            if len(morpheme) == 0:
                                logging.debug(
                                    f"Line {number} - skipping morpheme of length 0 in word {word}"
                                )
                            elif len(morpheme) == max_characters:
                                # A full-length morpheme leaves no room for
                                # the mandatory end-of-morpheme symbol.
                                logging.warning(
                                    f"Line {number} - skipping morpheme {morpheme} of {word} because its length {len(morpheme)} equals max length {max_characters}, and there is no space to insert the required end of morpheme symbol"
                                )
                            elif len(morpheme) > max_characters:
                                logging.warning(
                                    f"Line {number} - skipping morpheme {morpheme} of {word} because its length {len(morpheme)} exceeds max length {max_characters}"
                                )
                            else:
                                try:
                                    tensor: Tensor = tpr.process_morpheme(
                                        morpheme)
                                    result[morpheme] = tensor.data
                                except IndexError:
                                    logging.warning(
                                        f"Line {number} - unable to process morpheme {morpheme} (length {len(morpheme)}) of {word}"
                                    )
                                    skipped_morphemes.add(morpheme)
            logging.info(
                f"Writing binary file containing {len(result)} morphemes to disk at {output}..."
            )
            pickle.dump(result, output)
            # BUG FIX: logging.info() has no `file=` keyword (that is
            # print()'s signature); the original `file=sys.stderr` raised
            # TypeError at runtime.
            logging.info(f"...done writing binary file to disk at {output}")
            logging.info(
                f"Failed to process {len(skipped_morphemes)} morphemes:\n" +
                "\n".join(skipped_morphemes))
def test_default_grapheme_suit(input_string, expected_graphemes, description):
    """Each case: the cluster list and the reported length must match."""
    actual = list(grapheme.graphemes(input_string))
    assert actual == expected_graphemes
    assert grapheme.length(input_string) == len(expected_graphemes)
def test_cr_lf(self):
    """CR immediately followed by LF must stay one grapheme cluster."""
    crlf = "\u000D\u000A"
    self.assertEqual(list(grapheme.graphemes(crlf)), [crlf])
def test_emoji_with_modifier(self):
    """A base emoji plus skin-tone modifier collapses into one cluster."""
    baby_with_tone = "\U0001F476\U0001F3FB"
    self.assertEqual(list(grapheme.graphemes(baby_with_tone)),
                     [baby_with_tone])
def main(
    morpheme_delimiter: str,
    end_of_morpheme_symbol: str,
    padding_symbol: str,
    input_file,
    output_file,
    verbose: int,
    blacklist_char: str,
) -> None:
    """Emit a character-to-integer mapping for the alphabet of *input_file*.

    Integer values 0-2 are reserved (out-of-vocabulary, end-of-morpheme,
    padding); every remaining grapheme cluster found in the input is
    numbered from 3 upward in sorted order.  Raises RuntimeError if the
    corpus contains either reserved symbol.
    """
    if grapheme.length(end_of_morpheme_symbol) != 1:
        raise RuntimeError(
            "The end of morpheme symbol must consist of a single grapheme cluster "
            + "(see Unicode Standard Annex #29).")
    alphabet_set: Set[str] = set()
    logging.info(f"Reading alphabet from input file {input_file.name}...")
    for line in input_file:
        for character in grapheme.graphemes(line.strip()):
            category = unicodedata.category(character)
            # BUG FIX: the four debug messages below were missing the `f`
            # prefix (their {placeholders} were logged literally) and two
            # referenced an undefined name `symbol` instead of `character`.
            if category[0] == "Z":
                logging.debug(
                    f"Input contains whitespace character:\t{unicode_info(character)}. This character will not be included in the alphabet."
                )
            elif category[0] == "C":
                logging.debug(
                    f"Input contains control character:\t{unicode_info(character)}. This character will not be included in the alphabet."
                )
            elif character == morpheme_delimiter:
                logging.debug(
                    f"Not including morpheme delimeter {morpheme_delimiter} in the alphabet."
                )
            elif character == blacklist_char:
                logging.debug(
                    f"Not including character {blacklist_char} in the alphabet."
                )
            elif character == padding_symbol:
                raise RuntimeError(
                    f"Input contains reserved padding character {padding_symbol}, but this character must not occur in the corpus."
                )
            elif character == end_of_morpheme_symbol:
                raise RuntimeError(
                    f"Input contains reserved end of morpheme character {end_of_morpheme_symbol}, but this character must not occur in the corpus."
                )
            else:
                alphabet_set.add(character)
    # Zero is reserved for OOV
    output(
        output_file=output_file,
        int_value=0,
        character="",
        unicode_name="",
        description=
        "Integer value 0 is reserved to represent out-of-vocabulary characters in a tensor product representation"
    )
    # We reserve another character to represent the end of morpheme in a tensor product representation
    output(
        output_file=output_file,
        int_value=1,
        character=escaped_codepoints(end_of_morpheme_symbol),
        unicode_name=unicode_info(end_of_morpheme_symbol),
        description=
        "Integer value 1 is reserved to represent the end of a morpheme in a tensor product representation"
    )
    # We reserve another character to represent the padding after the end of morpheme in a tensor product representation
    output(
        output_file=output_file,
        int_value=2,
        character=escaped_codepoints(padding_symbol),
        unicode_name=unicode_info(padding_symbol),
        description=
        "Integer value 2 is reserved to represent padding beyond the end of a morpheme in a tensor product representation"
    )
    # Remaining actual characters
    for i, symbol in enumerate(sorted(alphabet_set), start=3):
        output(output_file=output_file,
               int_value=i,
               character=symbol,
               unicode_name=unicode_info(symbol),
               description="")
def emojificate(string):
    """Run convert() over each grapheme cluster of *string* and rejoin."""
    converted = [convert(cluster) for cluster in graphemes(string)]
    return "".join(converted)
def convert_emoji(string):
    """Concatenate convert() applied to every grapheme cluster of *string*."""
    return "".join(map(convert, graphemes(string)))
def prep_string(string):
    """Uppercase *string*, drop all whitespace, return its grapheme clusters."""
    no_space = re.sub(r"\s", "", string.upper())
    return list(graphemes(no_space))
# Read analysis lines from stdin.  Each line is "WORDS ! NUMBER", where
# the optional "! NUMBER" tail is a comment carried through to the output.
for line in sys.stdin:
    line = line.strip()
    lst = line.split("!", maxsplit=1)
    if len(lst) > 1:
        line = lst[0].strip()
        number = lst[1].strip() + " "
    else:
        number = ""
    words = line.split()
    comment = number + " ".join(words)
    # Align the words of this entry (zero-filling controlled by args.zeros).
    best = aligner(words, args.zeros, line)
    #best2 = [re.sub(r"^([a-zšžŋđüõåäöáâ`´])\1\1*$", r"\1", cc)
    #         for cc in best]
    if args.layout == "horizontal":
        mphonemic_best = []
        for cc in best:
            grapheme_list = list(grapheme.graphemes(cc))
            # Collapse e.g. "kkkk" to "k" when all aligned symbols agree.
            lab = grapheme_list[0] if len(set(grapheme_list)) == 1 else cc
            mphonemic_best.append(lab)
        print(" ".join(mphonemic_best).ljust(40), "!", comment)
    elif args.layout == "vertical":
        #print("best:", best)  ###
        #print("len(best):", [len(x) for x in best])  ###
        print("\n".join(list_of_aligned_words(best)))
        print()
    elif args.layout == "list":
        print(" ".join(list_of_aligned_words(best)))
def test_empty(self):
    """The empty string contains no grapheme clusters at all."""
    empty = ""
    self.assertEqual(list(grapheme.graphemes(empty)), [])