import random
import re
from math import ceil
from typing import List

import lorem
import numpy as np
from better_profanity import profanity
from nltk.tokenize.treebank import TreebankWordDetokenizer
from textblob import Word
from textblob.blob import Sentence
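# The `decision(p)` helper, the many `*_probability` knobs, the feature flags
# (add_random_garbage, randomly_lemmatize, ...), and the helper mutators
# (cummer, rawrer, knotter, split_compound_word, TweetWordTokenizer, ...)
# referenced below are defined elsewhere in this package. A minimal sketch of
# the assumed `decision` contract, a single Bernoulli trial (hypothetical,
# for readers of this excerpt):
#
#     def decision(probability: float) -> bool:
#         return random.random() < probability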
def utf_8_char_swaps(token: str) -> str:
    if decision(0.5):
        token = re.sub(r"ae", "æ", token)
        token = re.sub(r"AE", "Æ", token)
    if decision(0.3):
        token = re.sub(r"ea", "æ", token)
        token = re.sub(r"EA", "Æ", token)
    return token
def lazy_char_subbing(token: str) -> str:
    """Lazy chat-speak substitutions, e.g. you -> u, are -> r."""
    # TODO: better capital replacement
    # you -> u, yuu
    token = re.sub(
        "^y+(o+)?u+$",
        lambda match: "u" if decision(0.5) else f"y{'u' * random.randint(1, 4)}",
        token,
        flags=re.IGNORECASE,
    )
    # are -> r, arrr
    token = re.sub(
        "^a+(r+)?e+$",
        lambda match: "r" if decision(0.5) else f"a{'r' * random.randint(1, 4)}",
        token,
        flags=re.IGNORECASE,
    )
    # with -> wif
    token = re.sub(
        "^wi+th+$",
        lambda match: f"w{'i' * random.randint(1, 4)}{'f' * random.randint(1, 4)}",
        token,
        flags=re.IGNORECASE,
    )
    # what -> wat OR wut
    if decision(0.5):
        token = re.sub(
            "^wha+t$",
            lambda match: f"w{random.choice(['a', 'u']) * random.randint(1, 4)}t",
            token,
            flags=re.IGNORECASE,
        )
    # er -> ur (draw the count first: count=0 tells re.sub to replace *all*
    # occurrences, not none, so skip the substitution when it comes up 0)
    count = random.randint(0, 2)
    if count:
        token = re.sub(
            "(e+)r",
            lambda match: f"{'u' * (len(match.group(1)) + random.randint(0, 3))}r",
            token,
            flags=re.IGNORECASE,
            count=count,
        )
    # easy -> ez
    token = re.sub(
        "^ea+s+y+$",
        lambda match: f"e{'z' * random.randint(1, 3)}",
        token,
        flags=re.IGNORECASE,
    )
    # to, too -> 2
    token = re.sub("to+$", "2", token, flags=re.IGNORECASE)
    return token
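# Note: every rule in lazy_char_subbing except er -> ur and to -> 2 is
# anchored to the whole token; `to+$` is only end-anchored, so any token
# ending in "to"/"too" is rewritten as well (e.g. "potato" -> "pota2").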
def reeeer(token: str) -> str:
    if decision(REEE_probability):
        token = re.sub(
            r"([Rr])e*",
            lambda match: f"{match.group(1)}e" + "e" * random.choice(range(1, 15)),
            token,
        )
        if decision(REEE_allcaps_probability):
            token = token.upper()
    return token
def owoer(token: str) -> str:
    # TODO: owo usually goes to owoo, should suppress.
    token = re.sub(
        r"(ou)([^o])?",
        lambda match: f"ouo{match.group(2) or ''}",
        token,
        flags=re.IGNORECASE,
    )
    token = re.sub(
        r"(ow)([^o])?",
        lambda match: f"owo{match.group(2) or ''}",
        token,
        flags=re.IGNORECASE,
    )
    token = re.sub(
        r"(ov)([^o])?",
        lambda match: f"ovo{match.group(2) or ''}",
        token,
        flags=re.IGNORECASE,
    )
    token = re.sub(r"cor", "cowor", token)
    if (
        "owo" not in token.lower()
        and "ouo" not in token.lower()
        and decision(hard_owo_replace_probability)
    ):
        owo_str = "owo" if decision(owo_vs_ouo_bias) else "ouo"
        # draw the count first: count=0 tells re.sub to replace *all*
        count = random.choice(range(0, 2))
        if count:
            token = re.sub(
                r"(o+)",
                lambda match: (owo_str * len(match.group(1))).replace("oo", "o"),
                token,
                flags=re.IGNORECASE,
                count=count,
            )

    # TODO: UWU
    # juice -> juwuice
    if decision(juwuice_swap_probability):
        token = re.sub(
            r"u+(i?ce)",
            lambda match: f"uwu{match.group(1)}",
            token,
            flags=re.IGNORECASE,
        )
    if "uwu" not in token.lower() and decision(hard_uwu_replace_probability):
        count = random.choice(range(0, 2))
        if count:
            token = re.sub(r"u+", "uwu", token, flags=re.IGNORECASE, count=count)
    return token
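# The first three owoer substitutions are unconditional, so e.g.
# "power" -> "powoer", "love" -> "lovoe", and "corn" -> "coworn"
# (via the cor -> cowor rule) on every call.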
def fuckyer(token: str) -> str:
    extra_fun = ""
    y_choice_1 = ("y" if decision(0.5) else "i") * random.choice(range(1, 5))
    y_choice_2 = ("y" if decision(0.5) else "i") * random.choice(range(1, 5))
    if decision(0.5):
        extra_fun = f"w{'u' * random.choice(range(1, 5))}k{y_choice_2}"
    token = re.sub(
        r"([Ff])?uck(er|ing)?",
        lambda match: (
            f"{match.group(1) or ''}{'u' * random.choice(range(1, 5))}k"
            f"{y_choice_1}{match.group(2) or ''}"
            # only pad with a space when there is an extra word to append
            + (f" {extra_fun}" if extra_fun else "")
        ),
        token,
    )
    return token
def get_random_lorem_ipsum_sentance() -> str:
    """get lorem ipsum sentence"""
    lorem_sentence = lorem.sentence()
    if decision(lorem_ipsum_fuck_probability):
        lorem_sentence = fix_punctuation_spacing(
            TreebankWordDetokenizer().detokenize(
                recumpile_sentence(Sentence(lorem_sentence))
            )
        )
    return lorem_sentence
def get_random_rp_action_sentence() -> str:
    more_verbs = []
    more_verbs_probability = 1
    while True:
        if decision(more_verbs_probability):
            additional_verb = get_random_action_verb()
            if decision(0.5):  # TODO: config
                additional_verb = Word(additional_verb).lemmatize()
            additional_verb = recumpile_token(additional_verb)
            additional_verb = Word(additional_verb).pluralize()
            more_verbs.append(additional_verb)
        else:
            break
        more_verbs_probability -= more_verbs_probability_decay

    noun = get_random_rp_pronoun()
    if decision(0.5):  # TODO: config
        noun = Word(noun).lemmatize()
    # TODO: add boolean for enable
    noun = recumpile_token(noun)
    noun = Word(noun).pluralize()

    return to_rp_text(
        f"{' and '.join(more_verbs)}{' ' if more_verbs else ''}{noun}"
    )
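# more_verbs_probability starts at 1, so the action always contains at least
# one verb; each additional verb is drawn with a probability reduced by
# more_verbs_probability_decay, so the chain stays short on average.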
def recumpile_token(token: str) -> str:
    # TODO: determine mood classifier for token and add respective emoji
    if decision(split_compound_word_probability):
        tokens = split_compound_word(token)
    else:
        tokens = [token]

    # TODO: migrate fuck_token to maybe a generator?
    fucked_tokens = []
    for token in tokens:
        relevant_emoji = None
        if decision(add_text_relevant_emoji_probability):
            # TODO: add ability to get multiple?
            relevant_emoji = find_text_relevant_emoji(token)
        if relevant_emoji and decision(wrap_text_relevant_emoji_probability):
            fucked_tokens.append(relevant_emoji)

        if decision(0.1):
            token = remove_dupe_chars(token)
        if decision(lazy_char_subbing_probability):
            token = lazy_char_subbing(token)

        # TODO: this is a potential for unexpected behavior
        if decision(word_to_num_probability):
            token = word_to_num(token)
        if decision(num_to_word_probability):
            token = num_to_word(token)

        if decision(lr_to_w_swap_probability):
            token = lr_to_w_swap(token)

        # TODO: this might be too much idk
        if decision(invert_word_probability):
            token = word_inverter(token)

        if decision(upside_down_word_probability):
            token = word_upside_downer(token)
        elif decision(upside_down_word_probability):
            token = word_upside_downer_preserve_char_order(token)

        fucked_token = knotter(
            fuckyer(reeeer(rawrer(garbage(owoer(cummer(token))))))
        )

        if decision(add_extra_ed_probability):
            fucked_token = add_extra_ed(fucked_token)
        if decision(random_ending_y_probability):
            fucked_token = add_ending_y(fucked_token)

        # TODO: likely making fu@k into k
        # TODO: NOTE: indeed it is doing this fu@k
        # >>> list(TextBlob("fu@k").words)
        # ['fu', 'k']
        if add_random_plurals and decision(add_random_plurals_probability):
            fucked_token = Word(fucked_token).pluralize()
        if randomly_lemmatize and decision(randomly_lemmatize_probability):
            fucked_token = Word(fucked_token).lemmatize()
        if randomly_capitalize_word and decision(randomly_capitalize_word_probability):
            fucked_token = fucked_token.upper()
        if randomly_spongebob_word and decision(randomly_spongebob_word_probability):
            fucked_token = generate_spongebob_text(fucked_token)
        if randomly_overemphasis_punctuation and decision(
            randomly_overemphasis_punctuation_probability
        ):
            fucked_token = over_emphasise_punctuation(
                fucked_token, randomly_overemphasis_punctuation_max_fuck
            )
        if decision(common_misspellings_probability):
            fucked_token = common_mispellings(fucked_token)
        if randomly_swap_char and decision(randomly_swap_char_probability):
            fucked_token = random_swap_char(
                fucked_token, randomly_swap_char_swap_percent
            )
        if randomly_insert_char and decision(randomly_insert_char_probability):
            fucked_token = random_insert_char(
                fucked_token, randomly_insert_char_insert_percent
            )
        if decision(utf_8_char_swaps_probability):
            fucked_token = utf_8_char_swaps(fucked_token)
        if random_leet_speak and decision(random_leet_speak_probability):
            fucked_token = token_to_leet(fucked_token)
        if decision(common_misspellings_probability):
            fucked_token = common_mispellings(fucked_token)

        # TODO: likely also breaking the spacing between punctuation kittly 1!
        # TODO: `f****d` went to `DS` investigate
        # TODO: likely this is at fault
        if decision(homofiy_probability):
            fucked_token = homoify(fucked_token, homofiy_probability)

        fucked_tokens.append(fucked_token)

        if decision(add_x3_if_token_has_rawr_probability) and (
            "rawr" in fucked_token.lower()
        ):
            fucked_tokens.append("X3" if decision(0.5) else "x3")

        if decision(adding_ending_ksksk_andioop_probability) and (
            fucked_token.lower().endswith("ksk")
            or fucked_token.lower().endswith("sks")
            or "ksksk" in fucked_token.lower()
            or "sksks" in fucked_token.lower()
        ):
            for _ in range(random.randint(1, 2)):
                fucked_tokens.append(recumpile_token("andioop"))

        if decision(adding_ending_ksksk_save_the_turtles_probability) and (
            fucked_token.lower().endswith("ksk")
            or fucked_token.lower().endswith("sks")
            or "ksksk" in fucked_token.lower()
            or "sksks" in fucked_token.lower()
        ):
            fucked_tokens.append(recumpile_text("save the turtles!"))

        if decision(fucking_normies_addition) and "reee" in fucked_token.lower():
            fucked_tokens.append(recumpile_text("f*****g normies!"))

        if decision(get_rhymes_probability):
            for rhyme in get_runon_of_rhymes(token, max_runon=max_runon_rhymes):
                fucked_rhyme = recumpile_token(rhyme)
                fucked_tokens.append(fucked_rhyme)

        if relevant_emoji:
            fucked_tokens.append(relevant_emoji)

    for i, fucked_token in enumerate(fucked_tokens):
        if decision(space_gap_text_probability):
            # TODO: this modification may be better placed elsewhere
            fucked_token = space_gap_text(
                fucked_token,
                min_gap_size=space_gap_text_min_gap_size,
                max_gap_size=space_gap_text_max_gap_size,
            )
        # TODO: discord format options
        if decision(bold_text_probability):
            fucked_token = bold_text(fucked_token)
        elif decision(back_tick_text_probability):
            fucked_token = back_tick_text(fucked_token)
        fucked_tokens[i] = fucked_token

    return " ".join(fucked_tokens)
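# recumpile_token can return several space-separated tokens (a leading
# relevant emoji, rhyme run-ons, "andioop" chains, ...), so callers should
# treat its return value as a token group rather than a single word.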
def recumpile_sentence(sentence: Sentence) -> List[str]:
    new_tokens = []
    # TODO: determine mood classifier for sentence and add respective emoji
    sentiment_emoji = None
    if decision(0.89):
        sentiment_emoji = get_sentiment_emoji(sentence)

    for token in sentence.tokenize(TweetWordTokenizer()):
        # TODO: this is only for discord so we don't break tokenization;
        # pass pings and custom emotes through untouched
        if re.match(
            r"@everyone|@here|<:[^:\s]+:[0-9]+>|<a:[^:\s]+:[0-9]+>|<(?:@!?\d+|:[A-Za-z0-9]+:)\w+>",
            token,
        ):
            new_tokens.append(token)
            continue

        emoji = None
        alias_emoji = get_cheap_emoji_alias(token)

        # TODO: refactor into its own mutator
        if decision(0.9) and (
            re.match("among", token, flags=re.IGNORECASE)
            or re.match("amogus", token, flags=re.IGNORECASE)
            or re.match(r"su+s", token, flags=re.IGNORECASE)
        ):
            emoji = "ඞ"

        emoticon = get_emoticon(token)

        if alias_emoji:
            if decision(0.1) or (len(str(token)) == 1 and decision(0.9)):
                new_tokens.append(alias_emoji)
                continue
            elif decision(0.5):
                new_tokens.append(alias_emoji)

        if decision(0.5):
            emoji = get_emoji_from_data(token)
        if decision(0.3):
            emoji = get_gloveword_emoji(token)
        if emoji and decision(0.5):
            new_tokens.append(emoji)

        if decision(random_synonym_probability):
            token = replace_with_random_synonym(token)

        if decision(0.5) and profanity.contains_profanity(token):
            token = token.upper()

        if decision(censor_profanity_probability) and profanity.contains_profanity(
            token
        ):
            if decision(0.1):
                token = custom_censoring(token, 1)
            else:
                token = custom_censoring(token, censor_profanity_percent)
        elif decision(random_censor_probability):
            token = custom_censoring(token, random_censor_percent)

        add_husky = bool(re.match("musk", token, flags=re.IGNORECASE))

        # processing
        recumpiled_token = recumpile_token(token)

        # post processing
        new_tokens.append(recumpiled_token)

        if emoji and decision(0.8):
            new_tokens.append(emoji)
        if alias_emoji and decision(0.8):
            new_tokens.append(alias_emoji)
        if emoticon and decision(0.8):
            new_tokens.append(emoticon)

        if add_husky:
            new_tokens.append(recumpile_token("husky"))

        if add_random_garbage and decision(add_random_garbage_probability):
            new_tokens.append(recumpile_token(add_random_garbage_token()))

        if add_randomly_text_face_emoji and decision(
            add_randomly_text_face_emoji_probability
        ):
            new_tokens.append(get_random_text_face_emojis())

        if add_random_simple_text_emoji and decision(
            # TODO: use textblob to determine mood of text and insert faces
            # accordingly; likely need to do this after reconstruction of the
            # text blob and go through this sentence by sentence rather than
            # word by word.
            add_random_simple_text_emoji_probability
        ):
            new_tokens.append(get_random_simple_text_emojis())

        if add_random_rp_action and decision(
            add_random_rp_mid_sentence_action_probability
        ):
            new_tokens.append(get_random_rp_action_sentence())

    if add_random_rp_action and decision(
        add_random_rp_end_sentence_action_probability
    ):
        new_tokens.append(get_random_rp_action_sentence())

    if sentiment_emoji:
        new_tokens.append(sentiment_emoji)
        if decision(0.4):
            for _ in range(5):
                if decision(0.3):
                    new_tokens.append(sentiment_emoji)
                else:
                    break

    return new_tokens
def garbage(token: str) -> str:
    # inserting gay
    token = re.sub(
        r"([a-fh-zA-FH-Z])a+y+", lambda match: f"{match.group(1)}gay", token
    )
    # hello -> hewwo
    token = re.sub(r"([Hh])e+ll+o+?", lambda match: f"{match.group(1)}ewwo", token)
    # er -> ur
    if decision(0.4):
        token = re.sub(
            r"e+r+",
            lambda match: f"u{'r' * ceil(np.random.rayleigh(1.2))}",
            token,
            flags=re.IGNORECASE,
        )
    # ello -> ewwo
    if decision(0.4):
        token = re.sub(
            r"e+ll+o+?",
            lambda match: f"ew{'w' * ceil(np.random.rayleigh(1.2))}o",  # 2-6ish
            token,
            flags=re.IGNORECASE,
        )
    # cute -> koot
    token = re.sub(
        r"([Cc])u+te",
        lambda match: f"{match.group(1)}oo{'o' * random.randint(0, 5)}t",
        token,
    )
    # ove -> wuv
    if decision(0.7):
        token = re.sub(
            r"(o+)ve", lambda match: f"w{'u' * len(match.group(1))}v", token
        )
    # one -> wun
    if decision(0.7):
        token = re.sub(r"one", "wun", token, flags=re.IGNORECASE)
    # as -> ass, asss (keep the trailing character the pattern consumes)
    if decision(0.5):
        token = re.sub(
            r"([aA])([sS])($|[^s])",
            lambda match: f"{match.group(1)}{match.group(2) * random.randint(2, 3)}{match.group(3)}",
            token,
        )
    # TODO: refactor (me -> meh | me -> meow) together?
    # me -> meow
    if decision(me_2_meow_swap_probability):
        token = re.sub(
            r"^me+$",
            lambda match: f"m{'e' * random.randint(1, 3)}{'o' * random.randint(1, 3)}w",
            token,
            flags=re.IGNORECASE,
        )
    # me -> meh
    if decision(me_2_meh_swap_probability):
        token = re.sub(
            r"^me+$",
            lambda match: f"m{'e' * random.randint(1, 3)}h",
            token,
            flags=re.IGNORECASE,
        )
    # my -> mah, myah
    if decision(0.5):
        token = re.sub(
            r"^my+$",
            lambda match: f"m{'y' if decision(0.3) else ''}{'a' * random.randint(2, 3)}{'h' if decision(0.5) else ''}",
            token,
        )
    # ion -> shun
    if decision(0.5):
        token = re.sub(r"ion$", "shun", token)
    # .ome -> .um
    if decision(0.5):
        token = re.sub(r"([a-zA-Z])ome", lambda match: f"{match.group(1)}um", token)
    # the -> teh or da
    if decision(0.5):
        token = re.sub(r"^([Tt])he$", lambda match: f"{match.group(1)}eh", token)
    else:
        token = re.sub(
            r"^([Tt])he$",
            lambda match: f"{'D' if match.group(1) == 'T' else 'd'}a",
            token,
        )
    # ing -> inn, ininin
    if decision(0.5):
        token = re.sub(
            r"ing$",
            f"in{'n' * random.randint(0, 4) if decision(0.5) else 'in' * random.randint(0, 4)}",
            token,
            flags=re.IGNORECASE,
        )
    # ks -> ksksksk
    if decision(ksksk_enlargement_probability):
        token = re.sub(
            r"[kK][sS]|[sS][kK]",
            lambda match: f"{match.group(0) * random.randint(2, 6)}",
            token,
            flags=re.IGNORECASE,
        )
    # uck -> ucc, uccci
    if decision(uck_to_ucc_swap_probability):
        token = re.sub(
            r"u+c+k+",
            lambda match: f"u{'c' * random.randint(2, 6)}{'i' * random.randint(0, 3)}",
            token,
            flags=re.IGNORECASE,
        )
    # sub -> subby, subbi
    if decision(sub_to_subby_swap_probability):
        token = re.sub(
            r"s(u+)b",
            lambda match: f"s{match.group(1)}bb{('y' if decision(0.5) else 'i') * random.randint(1, 2)}",
            token,
            flags=re.IGNORECASE,
        )
    # no -> nu+, nyu+
    if decision(0.5):
        token = re.sub(
            "([nN])(o+)",
            lambda match: f"{match.group(1)}{'y' if decision(0.5) else ''}{'u' * (len(match.group(2)) * random.randint(1, 6))}",
            token,
            flags=re.IGNORECASE,
        )
    return token
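# Smoke test (a minimal sketch: assumes the package's config globals and
# helper mutators are loaded alongside the functions above, so running this
# file in isolation would raise NameError on those names).
if __name__ == "__main__":
    print(recumpile_token("hello"))
    detokenizer = TreebankWordDetokenizer()
    mangled = recumpile_sentence(Sentence("save the turtles, you cowards!"))
    print(detokenizer.detokenize(mangled))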