class BetweenLookGenerator(Generator):
    def __init__(self, bot1: Bot, bot2: Bot):
        self.bot1 = bot1
        self.bot2 = bot2
        # Grammar for dialogue attributions; {quote} and {verb} are left as
        # str.format placeholders and filled in per message later.
        self.grammar = Grammar({
            'origin': [
                '{quote} #name# {verb}.',
                # '{quote} {verb} #name#',
                '#name# {verb}, {quote}'
            ],
            'neutral': ['said'],
            'question': ['questioned', 'asked', 'inquired'],
            'exclaim': ['exclaimed', 'shouted'],
            'name': ['#They#', '#name#']
        })
        self.grammar.add_modifiers(base_english)

    def generate_text(self, **kwargs):
        length = kwargs.get('length', 50000)
        history: List[str] = [self.bot1.respond('')]
        w_count = word_count(history[0])
        quote = '"{}"'.format
        second = True
        # Alternate between the two bots until the conversation is long enough.
        while w_count < length:
            if second:
                sent = self.bot2.respond(history[-1])
                info = self.bot2.info()
            else:
                sent = self.bot1.respond(history[-1])
                info = self.bot1.info()
            second = not second
            history.append(sent)
            w_count += word_count(sent)
        text = ''
        # Render each message as quoted speech with a grammar-chosen attribution.
        for msg in history:
            msg = quote(msg)
            msg_type = 'neutral'
            if len(msg) >= 2:
                if msg[-2] == '?':
                    msg_type = 'question'
                elif msg[-2] == '!':
                    msg_type = 'exclaim'
            rule = self.grammar.flatten('\n{}#origin#'.format(info))
            rule = rule.format(quote=msg, verb='#{}#'.format(msg_type))
            rule = self.grammar.flatten(rule)
            text += rule[0].capitalize() + rule[1:]
        return text
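# Usage sketch (assumption, not from the repo): BetweenLookGenerator only needs
# objects exposing the respond()/info() interface used above, so a throwaway
# stand-in is enough here; real Bot implementations live elsewhere. The
# info() string shape (tracery actions binding #name#/#They#) is an assumption.
class _EchoBot:
    def __init__(self, tag: str):
        self.tag = tag

    def respond(self, prompt: str) -> str:
        return 'Did you say "{}"?'.format(prompt) if prompt else 'Hello!'

    def info(self) -> str:
        return self.tag  # e.g. '[name:Alice][They:She]'


dialogue = BetweenLookGenerator(_EchoBot('[name:Alice][They:She]'),
                                _EchoBot('[name:Bob][They:He]'))
print(dialogue.generate_text(length=60))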
class AcronymProcessor(Processor):
    splitting_pattern = re.compile(r'[A-Z][^A-Z]*')

    def __init__(self):
        self.api = Datamuse()
        self.grammar = Grammar({})
        self.grammar.add_modifiers(base_english)

    def process_text(self, input_text: str, **kwargs) -> str:
        topics = kwargs.get('topics', None)
        splitted = self.splitting_pattern.findall(input_text)
        dictionary: Dict[str, List[str]] = {}
        # Look up candidate words for each uppercase-initial chunk that does not
        # already have a rule in the grammar.
        for start in (x for x in splitted if x not in self.grammar.symbols):
            if topics is None:
                res = self.api.words(sp='{}*'.format(start), max=1000)
            else:
                res = self.api.words(sp='{}*'.format(start),
                                     topics=topics,
                                     max=1000)
            dictionary[start] = [obj['word'] for obj in res]
        for k, v in dictionary.items():
            self.grammar.push_rules(k, v)
        return self.grammar.flatten('#{}.capitalize#'.format(
            '.capitalize# #'.join(splitted)))
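# Usage sketch (assumption): 'NASA' and the 'space' topic are hypothetical
# inputs; every previously unseen initial triggers a Datamuse lookup, so this
# needs network access.
processor = AcronymProcessor()
print(processor.process_text('NASA', topics='space'))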
class AcronymGenerator(Generator):
    splitting_pattern = re.compile(r'[A-Z][^A-Z]*')

    def __init__(self, acronym: str):
        self.acronym = acronym
        api = Datamuse()
        dictionary: Dict[str, List[str]] = {}
        splitted = self.splitting_pattern.findall(acronym)
        self.length = len(splitted)
        for start in splitted:
            res = api.words(sp='{}*'.format(start))
            dictionary[start] = [obj['word'] for obj in res]
        self.grammar = Grammar(dictionary)
        self.grammar.add_modifiers(base_english)
        self.rule = '#{}.capitalize#'.format('.capitalize# #'.join(splitted))

    def generate_text(self, **kwargs) -> str:
        length = kwargs.get('length', 50000)
        return '\n'.join(
            self.grammar.flatten(self.rule)
            for _ in range(math.ceil(length / self.length)))

    def save_to_file(self, file_name: str, **kwargs):
        length = kwargs.get('length', 50000)
        text = self.generate_text(length=length)
        if file_name.endswith('.pdf'):
            doc = SimpleDocTemplate(file_name,
                                    pagesize=letter,
                                    rightMargin=72,
                                    leftMargin=72,
                                    topMargin=72,
                                    bottomMargin=18)
            styles = getSampleStyleSheet()
            styles.add(
                ParagraphStyle(name='Normal_CENTER',
                               parent=styles['Normal'],
                               alignment=TA_CENTER))
            doc.build([
                Paragraph('<font size="18">{}</font>'.format(self.acronym),
                          styles['Normal_CENTER']),
                Spacer(1, 12)
            ] + [
                Paragraph(p, styles['Normal_CENTER'])
                for p in text.split('\n')
            ])
        else:
            with open(file_name, 'w') as f:
                f.write(text)
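# Usage sketch (assumption): the acronym and file names are hypothetical; each
# letter triggers a Datamuse lookup, so construction needs network access.
gen = AcronymGenerator('BRB')
print(gen.generate_text(length=30))     # ~10 expansions, one per line
gen.save_to_file('brb.pdf', length=300)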
def intro_letter() -> str:
    g = Grammar(_INTRO_LETTER)
    while True:
        yield g.flatten('#main#')
def monster_names() -> str:
    g = Grammar(_MONSTER_NAME)
    while True:
        yield g.flatten('#main#')
class AutBioGenerator(Generator):
    raw_pattern = re.compile(r'^\(\((.+)\)\)$')
    state_pattern = re.compile(r', ([A-Z]{2}) [0-9]{5}$')

    def __init__(self, seed: Optional[int] = None):
        self.cleaner = CleaningProcessor()
        self.fake = Faker()
        self.fake.add_provider(address)
        self.fake.add_provider(profile)
        if seed:
            random.seed(seed)
            self.fake.seed_instance(seed + 1)
        dictionary: Dict[str, List[str]] = {
            'cred_romance':
            ['USA Today bestselling', 'RITA Award winning', '#cred_gen#'],
            'cred_sci-fi': [
                '#scifantasy_award# Award winning',
                '#scifi_award# Award winning', '#cred_gen#'
            ],
            'scifi_award': ['BSFA', 'Philip K. Dick'],
            'cred_fantasy': [
                '#scifantasy_award# Award winning',
                '#scifi_award# Award winning', '#cred_gen#'
            ],
            'fantasy_award':
            ['British Fantasy', 'World Fantasy', 'Gemmell', 'Mythopoeic'],
            'scifantasy_award': ['Hugo', 'Nebula', 'Locus', 'Aurealis'],
            'cred_gen':
            ['#top_award# Prize winning', 'New York Times bestselling'],
            'top_award': ['Pulitzer', 'Booker', 'Nobel'],
            'good_adj': [
                'thrilling', 'fascinating', 'revolutionary', 'breathtaking',
                'beautiful', 'seminal'
            ],
            'married_and_kids': ['with #Their# #spouse# and #children#'],
            'married': ['with #Their# #spouse#'],
            'kids': ['with #Their# #children#'],
            'children':
            ['#num_kids# kids', '#num_kids# #kid_type.s#', '#kid_type#'],
            'kid_type': ['daughter', 'son'],
            'num_kids': 4 * ['2'] + ['3']
        }

        # Modifier that strips a protective ((...)) wrapper if present,
        # otherwise returns the text unchanged.
        def raw(text, **params):
            try:
                return self.raw_pattern.match(text).group(1)
            except AttributeError:
                return text

        self.grammar = Grammar(dictionary)
        self.grammar.add_modifiers(base_english)
        self.grammar.add_modifiers({
            'raw': raw,
            'clean': lambda x: x >> self.cleaner
        })

    def generate_details(self, **kwargs) -> dict:
        details: Dict[str, Any] = {'genre': random.choice(['fantasy'])}
        details.update(self.fake.profile(sex=None))
        details['pronouns'] = get_pronouns(details['sex'])
        details['is_gay'] = random.random() < .0195
        details['is_married'] = random.random() < .43
        details['has_kids'] = random.random() < .74
        if details['is_married']:
            if details['is_gay']:
                details['spouse'] = {
                    'F': 'wife',
                    'M': 'husband'
                }[details['sex']]
            else:
                details['spouse'] = {
                    'F': 'husband',
                    'M': 'wife'
                }[details['sex']]
        # Redraw fake addresses until one ends in a two-letter state code and ZIP.
        addr = self.state_pattern.search(details['residence'])
        while not addr:
            details['residence'] = self.fake.address()
            addr = self.state_pattern.search(details['residence'])
        details['state'] = states.lookup(addr.group(1)).name
        details['signs'] = {
            'astrological': get_astrological_sign(details['birthdate']),
            'zodiac': get_zodiac_sign(details['birthdate'])
        }
        if random.random() < .3:
            details['inspired_by'] = {'sex': random.choice(['F', 'M'])}
            details['inspired_by']['relation'] = random.choice({
                'F': ['mother', 'sister', 'daughter'],
                'M': ['father', 'brother', 'son']
            }[details['inspired_by']['sex']])
        else:
            details['inspired_by'] = None
        return details

    def generate_text(self, **kwargs) -> str:
        details = kwargs.get('details', self.generate_details())
        text = """
{name} is a #cred_{genre}# #author.raw# of #good_adj# {genre} #books.raw# who lives in {state}
"""
        if details['is_married']:
            if details['has_kids']:
                text += ' #{pronouns}[spouse:{spouse}]married_and_kids#'
            else:
                text += ' #{pronouns}[spouse:{spouse}]married#'
        elif details['has_kids']:
            text += ' #{pronouns}kids#'
        text += '.'
        print(text)
        return self.grammar.flatten(text.format(**details)) >> self.cleaner
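# Usage sketch (assumption): the seed value is arbitrary; seeding makes both
# random and Faker deterministic, as the constructor above supports.
bio = AutBioGenerator(seed=42)
details = bio.generate_details()
print(details['name'], details['state'])
print(bio.generate_text(details=details))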
for action in tbs_plan:
    print(action)

# In[20]:

g = Grammar({
    'open':
    '#body_id.possessive# cranium is open and the brain is exposed',
    'closed':
    '#body_id.possessive# cranium is either intact, or closed and repaired',
    'brain_in':
    '#brain_id.possessive# brain is in #body_id.possessive# head',
    'empty':
    '#body_id.possessive# head does not contain a brain',
    'on_ice':
    '#body_id.possessive# brain is stored in an ice box',
    'open_skull':
    'open #body_id.possessive# skull carefully using a chainsaw',
    'close_skull':
    'replace the removed section of #body_id.possessive# skull with a sterilised metal plate and suture the skin',
    'remove_brain':
    'disconnect #brain_id.possessive# brain from peripheral nervous system connections and remove it from #body_id.possessive# skull',
    'replace_brain':
    'apply nerve growth factor to connection sites in #body_id.possessive# empty skull and #brain_id.possessive# brain, then place #brain_id.possessive# brain in #body_id.possessive# body'
})
g.add_modifiers(modifiers.base_english)

# In[21]:

for i, subject_id in enumerate(subject_ids):
    g.push_rules(subject_id, 'subject {0}'.format(chr(ord('A') + int(i))))
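# Sketch (assumption, not from the notebook): the rules above leave #body_id#
# and #brain_id# to be bound per plan step, e.g. by pushing them onto one of
# the subject symbols registered in the loop above; a 'possessive' modifier is
# assumed to have been added to the grammar elsewhere in the notebook.
g.push_rules('body_id', ['#{}#'.format(subject_ids[0])])
g.push_rules('brain_id', ['#{}#'.format(subject_ids[1])])
print(g.flatten('#remove_brain#'))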
class Bottimus(MarkovText):
    @staticmethod
    def load(filename: str):
        with open(filename, "rb") as f:
            result = pickle.load(f)
        return result

    def __init__(
            self,
            # Data
            generator_text: Union[str, List[str]] = None,
            responder_text: List[str] = None,
            command_text: List[str] = None,
            grammar: Union[Dict[str, str], Grammar] = None,
            # Models
            chain: Union[Dict[str], MarkovText] = None,
            phraser: Union[Dict[str], Phraser] = None,
            word_vectors: Union[Dict[str], KeyedVectors] = None,
            nn: Union[Dict[str], Model] = None,
            # Chatterbot
            commander: ChatBot = None,
            **kwargs: Dict[str, int],
    ):
        # Defaults
        kwargs.update({
            "word_vector_size": 256,
            "min_count": 5,
            "max_vocab_size": 40000000
        })
        self.nlp = spacy.load("en")
        corpus = list(map(self.word_split, responder_text))

        # Chain
        if (not chain) or isinstance(chain, dict):
            chain = chain or {}
            for Key, Value in {
                    "state_size": 2,
                    "retain_original": True
            }.items():
                chain.setdefault(Key, Value)
            MarkovText.__init__(
                self,
                None,
                state_size=chain["state_size"],
                parsed_sentences=corpus +
                list(self.generate_corpus(generator_text)),
                retain_original=chain["retain_original"],
            )
        else:
            MarkovText.__init__(
                self,
                None,
                state_size=chain.state_size,
                chain=chain,
                parsed_sentences=chain.parsed_sentences,
                retain_original=chain.retain_original,
            )
        corpus = [[word.split(self.separator)[0] for word in sentence]
                  for sentence in corpus]

        # Phraser
        if (not phraser) or isinstance(phraser, dict):
            phraser = phraser or {}
            for Key, Value in {"gram_size": 3, "scoring": "default"}.items():
                phraser.setdefault(Key, Value)
            for _ in range(phraser["gram_size"]):
                self.phraser = Phraser(
                    Phrases(
                        corpus,
                        min_count=kwargs["min_count"],
                        max_vocab_size=kwargs["max_vocab_size"],
                        scoring=phraser["scoring"],
                    ))
                corpus = self.phraser[corpus]
        else:
            self.phraser = phraser
            corpus = self.phraser[corpus]

        # Word Vectors
        if (not word_vectors) or isinstance(word_vectors, dict):
            word_vectors = word_vectors or {}
            for Key, Value in {
                    "embedding_model": "fasttext",
                    "window": 5,
                    "workers": 3,
            }.items():
                word_vectors.setdefault(Key, Value)
            self.word_vectors = {
                "fasttext": FastText,
                "word2vec": Word2Vec
            }[word_vectors["embedding_model"].lower()](
                corpus,
                size=kwargs["word_vector_size"],
                window=word_vectors["window"],
                min_count=1,  # kwargs["min_count"],
                workers=word_vectors["workers"],
                max_vocab_size=kwargs["max_vocab_size"],
            ).wv
        else:
            self.word_vectors = word_vectors

        # LSTM RNN
        if (not nn) or isinstance(nn, dict):
            nn = nn or {}
            for Key, Value in {
                    "cell_type": "LSTM",
                    # "num_layers": 3, Perhaps later
                    "max_words": 100,
                    "sentence_vector_size": 300,
                    "activation": "tanh",
                    "dropout_rate": .2,
                    "loss": "categorical_crossentropy",
                    "learning_rate": .0005,
                    "metrics": ["accuracy"],
            }.items():
                nn.setdefault(Key, Value)
            input_statement = Input(
                shape=(nn["max_words"], kwargs["word_vector_size"]),
                name="input_statement",
            )
            input_response = Input(
                shape=(nn["max_words"], kwargs["word_vector_size"]),
                name="input_response",
            )
            self.nn = Model(
                inputs=[input_statement, input_response],
                outputs=[
                    Dense(kwargs["max_vocab_size"], activation="softmax")(
                        Dense(kwargs["max_vocab_size"] / 2,
                              activation="relu")(concatenate(
                                  [
                                      Bidirectional({
                                          "LSTM": LSTM,
                                          "GRU": GRU
                                      }[nn["cell_type"]](
                                          units=nn["sentence_vector_size"],
                                          input_shape=(
                                              nn["max_words"],
                                              kwargs["word_vector_size"],
                                          ),
                                          activation=nn["activation"],
                                          dropout=nn["dropout_rate"],
                                          kernel_initializer=lecun_uniform(),
                                      ))(input_statement),
                                      Bidirectional({
                                          "LSTM": LSTM,
                                          "GRU": GRU
                                      }[nn["cell_type"]](
                                          units=nn["sentence_vector_size"],
                                          input_shape=(
                                              nn["max_words"],
                                              kwargs["word_vector_size"],
                                          ),
                                          activation=nn["activation"],
                                          dropout=nn["dropout_rate"],
                                          kernel_initializer=lecun_uniform(),
                                      ))(input_response),
                                  ],
                                  axis=1,
                              )))
                ],
            )
            self.nn.compile(
                loss=nn["loss"],
                optimizer=Adam(lr=nn["learning_rate"]),
                metrics=nn["metrics"],
            )
        else:
            self.nn = nn

        # Commander
        self.commander = commander or ChatBot(
            "Commander",
            preprocessors=[
                "chatterbot.preprocessors.clean_whitespace",
                "chatterbot.preprocessors.convert_to_ascii",
            ],
            trainer="chatterbot.trainers.ListTrainer",
            logic_adapters=[
                {
                    "import_path": "chatterbot.logic.BestMatch"
                },
                {
                    "import_path": "chatterbot.logic.LowConfidenceAdapter",
                    "threshold": 0.65,
                    "default_response": "FAIL",
                },
            ],
        )
        if command_text:
            self.commander.train(command_text)

        # Grammar
        if (not grammar) or isinstance(grammar, dict):
            grammar = grammar or {}
            for Key, Value in {}.items():
                grammar.setdefault(Key, Value)
            self.grammar = Grammar(grammar)
            self.grammar.add_modifiers(base_english)
        else:
            self.grammar = grammar

    def sentence_split(self, text: str) -> list:
        return sent_tokenize(text)

    separator: str = "::"

    def word_split(self, sentence: str, pos: bool = True) -> List[str]:
        if pos:
            tokens = []
            for token in self.nlp(
                    sentence, disable=["tagger", "parser", "ner", "textcat"]):
                orth = token.orth_
                if orth.isspace() or token.like_url or orth.startswith("#"):
                    continue
                elif orth.startswith("@"):
                    tokens.append("#username#")
                else:
                    tokens.append(self.separator.join((orth, token.pos_)))
        else:
            tokens = word_tokenize(sentence)
        return tokens

    def word_join(self, words: List[str], pos: bool = True) -> str:
        if pos:
            sentence = " ".join(
                word.split(self.separator)[0] for word in words)
        else:
            sentence = " ".join(words)
        return sentence

    rule_pattern = re.compile("#(\\w|\\.)+#")

    def make_sentence(self,
                      statement: str = None,
                      init_state=None,
                      **kwargs: dict) -> str:
        if statement:
            response = str(self.commander.get_response(statement))
            if response == "FAIL":
                return "NI YET"
        else:
            response = MarkovText.make_sentence(self,
                                                init_state=init_state,
                                                kwargs=kwargs)
        return self.grammar.flatten(response)

    def save(self, filename: str) -> None:
        with open(filename, "wb") as f:
            pickle.dump(self, f)
qg = Grammar({
    'animal': pycorpora.animals.common['animals'],
    'first_name_en': pycorpora.humans.firstNames['firstNames'],
    'last_name_en': pycorpora.humans.lastNames['lastNames'],
    'first_name_no':
    (pycorpora.humans.norwayFirstNamesBoys['firstnames_boys_norwegian'] +
     pycorpora.humans.norwayFirstNamesGirls['firstnames_girls_norwegian']),
    'last_name_no': pycorpora.humans.norwayLastNames['lastnames_norwegian'],
    'first_name_es': pycorpora.humans.spanishFirstNames['firstNames'],
    'last_name_es': pycorpora.humans.spanishLastNames['lastNames'],
    'any_title': pycorpora.humans.englishHonorifics['englishHonorifics'],
    'object': [
        x.strip() for x in pycorpora.objects.objects['objects']
        if x.strip()[-1] != 's'
    ],  # and len(x.split()) < 2
    'cluedo_suspect': pycorpora.games.cluedo['suspects']['Cluedo'],
    'cluedo_weapon': pycorpora.games.cluedo['weapons']['Cluedo'],
    'cluedo_room': pycorpora.games.cluedo['rooms'],
    'clue_suspect': pycorpora.games.cluedo['suspects']['Clue'],
    'clue_weapon': pycorpora.games.cluedo['weapons']['Clue'],
    'clue_room': pycorpora.games.cluedo['rooms'],
    'room': pycorpora.architecture.rooms['rooms'],
    'appliance': pycorpora.technology.appliances['appliances'],
    'strange_word': pycorpora.words.strange_words['words'],
    'name_suffix': pycorpora.humans.suffixes['suffixes'],
    'greek_god': pycorpora.mythology.greek_gods['greek_gods'],
    'greek_monster': pycorpora.mythology.greek_monsters['greek_monsters'],
    'greek_titan': pycorpora.mythology.greek_titans['greek_titans'],
    'celebrity': pycorpora.humans.celebrities['celebrities'],
    'street_core':
    ([x.split()[-1] for x in pycorpora.humans.celebrities['celebrities']] +
     [x.split()[-1] for x in pycorpora.humans.britishActors['britishActors']] +
     pycorpora.geography.english_towns_cities['towns'] +
     pycorpora.geography.english_towns_cities['cities'] +
     pycorpora.geography.countries['countries'] +
     [x['name'] for x in pycorpora.geography.oceans['oceans']] +
     [x['name'] for x in pycorpora.geography.rivers['rivers']]),
    'saint': [
        x['saint'] if any(x['saint'].startswith(t) for t in saint_titles)
        else 'Saint ' + x['saint']
        for x in pycorpora.religion.christian_saints
    ],
    # Repeated entries in the rules below weight the uniform random choice
    # towards the more common options.
    'pet': [
        '#animal.a.capitalize#', '#animal.a.capitalize#',
        '#animal.a.capitalize#', '#animal.a.capitalize#',
        '#animal.a.capitalize#', '#animal.a.capitalize#',
        '#animal.a.capitalize#', '#animal.a.capitalize#', '#celebrity#'
    ],
    'street_noun': [
        'street', 'road', 'street', 'road', 'street', 'road', 'street', 'road',
        'street', 'road', 'street', 'road', 'lane', 'avenue', 'close', 'way',
        'lane', 'avenue', 'close', 'way', 'boulevard', 'alley', 'drive',
        'crescent', 'court', 'hill', 'strand', 'end', 'prospect', 'gate'
    ],
    'street_adjective': ['old', 'new', 'west', 'east', 'north', 'south'],
    'small_cardinal': ['two', 'three', 'four'],
    'street': [
        '#street_core# #street_noun#', '#street_core# #street_noun#',
        '#street_core# #street_noun#', '#street_core# #street_noun#',
        '#street_core# #street_noun#', '#street_core# #street_noun#',
        '#street_adjective# #street_core# #street_noun#',
        '#street_adjective# #street_core# #street_noun#',
        '#street_adjective# #street_noun#', '#street_adjective# #street_noun#',
        '#small_cardinal# #street_core.s# #street_noun#',
        'the #street_adjective# #street_noun#', 'the #street_noun#',
        '#rare_street#'
    ],
    'rare_street': ['#street#', '#street#', '#street#', '#real_rare_street#'],
    'real_rare_street': [
        'whipmawhopma#street_noun#',
        'whip-ma-whop-ma-#street_noun#',
        # '#[street_core:#rude_word#]street#',
        '#[street_core:#strange_word#]street#'
    ],
    'greek_whatever': ['#greek_god#', '#greek_monster#', '#greek_titan#'],
    'cluedo': [
        '#cluedo_suspect#, in the #cluedo_room#, with the #cluedo_weapon#',
        '#clue_suspect#, in the #clue_room#, with the #clue_weapon#'
    ],
    'any_pronouns': [
        '{subject}/{object}/{dependentPossessive}/{independentPossessive}/{reflexive}'
        .format(**pronouns) for pronouns in
        pycorpora.humans.thirdPersonPronouns['thirdPersonPronouns']
    ],
    'simple_pronouns': [
        'he/him/his/his/himself', 'she/her/her/hers/herself',
        'they/them/their/theirs/themself'
    ],
    'pronouns': [
        '#simple_pronouns#', '#simple_pronouns#', '#simple_pronouns#',
        '#any_pronouns#'
    ],
    'simple_title': ['Mr', 'Mr', 'Mr', 'Mrs', 'Ms', 'Miss', 'Mx', 'Mx', 'Mx'],
    'title':
    ['#simple_title#', '#simple_title#', '#simple_title#', '#any_title#'],
    'first_name': [
        '#first_name_en#', '#first_name_en#', '#first_name_en#',
        '#first_name_no#', '#first_name_es#'
    ],
    'single_last_name': [
        '#last_name_en#', '#last_name_en#', '#last_name_en#', '#last_name_no#',
        '#last_name_es#'
    ],
    'last_name': [
        '#single_last_name#', '#single_last_name#',
        '#single_last_name#-#single_last_name#'
    ],
    'full_name_no_suffix':
    ['#first_name# #last_name#', '#first_name# #first_name# #last_name#'],
    'full_name': [
        '#full_name_no_suffix#', '#full_name_no_suffix#',
        '#full_name_no_suffix#', '#full_name_no_suffix# #name_suffix#'
    ],
    'title_last': '#title# #last_name#',
    'title_full_name': '#title# #full_name#',
    'first_name_noun': [
        'first name', 'given name', 'given name', 'given name', 'given name',
        'given name', 'personal name', 'personal name', 'personal name',
        'personal name', 'forename', 'Christian name'
    ],
    'last_name_noun': [
        'surname', 'surname', 'surname', 'surname', 'family name',
        'family name', 'family name', 'family name', 'family name', 'last name'
    ],
    'title_noun': ['honorific', 'title'],
    'low_ordinal_number': [
        'first', 'second', 'third', 'fourth', 'fifth', 'sixth', 'seventh',
        'eighth', 'ninth', 'tenth', 'eleventh', 'twelfth', 'thirteenth',
        'fourteenth', 'fifteenth'
    ],
    'numerated_object': [
        '#object.a#', 'two #object.s#', 'three #object.s#', 'four #object.s#',
        'five #object.s#', 'six #object.s#', 'seven #object.s#',
        'eight #object.s#', 'nine #object.s#'
    ],
    'object_collection_head':
    ['#numerated_object#', '#object_collection_head#, #numerated_object#'],
    'object_collection':
    ['#object_collection_head#, #numerated_object#, and #numerated_object#'],
    'receive_verb': ['receive', 'get'],
    'maybe_x': ['#x#', ''],
    'cheese_noun': [
        'cheese', 'cheese', 'cheese', 'cheese', 'curd',
        'fermented dairy product',
        'cheese, curd, or #[x:other ]maybe_x#fermented dairy product',
        'cheese or #[x:other ]maybe_x#fermented dairy product',
        'curd or #[x:other ]maybe_x#fermented dairy product', 'cheese or curd'
    ],
    'room_question_clause': [
        'were you born', 'was your first kiss', 'do you usually eat',
        'do you usually sleep',
        'do you keep your #[x:best ]maybe_x##appliance#', 'were you born',
        'was your first kiss', 'do you usually eat', 'do you usually sleep',
        'do you keep your #[x:best ]maybe_x##appliance#',
        'do you keep your life savings'
    ],
    'room_question': [
        'What kind of room #room_question_clause# in?',
        'In what kind of room #room_question_clause#?',
        'Where #room_question_clause#?'
    ],
    'room_answer': ['#room.a.capitalize#', 'The #room#'],
    'new_or_emerging': ['new', 'emerging', 'new or emerging'],
    'fabric_item': [
        'duvet cover', 'coat', 'skirt', 'pair of trousers', 'pair of pants',
        'bandana'
    ],
    'fabric_question': [
        'What is your favourite fabric?', 'What is your favourite fabric?',
        'What is your favourite fabric?',
        'What was your first #fabric_item# made of?',
        'What was your first #fabric_item# made out of?',
        'Of what fabric was your first #fabric_item# made?'
    ]
})
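# Usage sketch (assumption): the rules above rely on the .a/.s/.capitalize
# modifiers, so base_english is presumably registered (as it is for the other
# grammars in this repo) before flattening.
qg.add_modifiers(base_english)
print(qg.flatten('#title_full_name#'))
print(qg.flatten('#cluedo#'))
print(qg.flatten('#room_question# #room_answer#.'))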
def location_names() -> str:
    g = Grammar(_LOCATION_NAMES)
    while True:
        yield g.flatten('#main#')
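# Usage sketch (assumption): each of the generator functions above yields text
# forever, so callers are expected to take a slice or call next() on them.
from itertools import islice

for name in islice(monster_names(), 3):
    print(name)
print(next(location_names()))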
def grammar(self) -> Grammar:
    g = Grammar(self.raw_grammar)
    g.add_modifiers(self.context.make_modifires(g))
    return g
class POSifiedText(markovify.Text, Generator):
    separator = "<:>"
    clean_pattern = re.compile(r'[\n_]')
    tracery_pattern = re.compile(r'^#.+#$')

    def __init__(self, input_text: str, state_size: int = 2):
        nltk.download('brown')
        nltk.download('gutenberg')
        self.nlp = spacy.load('en_core_web_lg')
        self.synonyms: Dict[str, List[str]] = defaultdict(list)
        self.entities: Dict[str, List[str]] = defaultdict(list)
        input_text = pipe(
            input_text,
            # lambda x: x.replace('\n', ' '),
            lambda x: self.clean_pattern.sub(' ', x),
            normalize_hyphenated_words,
            normalize_quotation_marks,
            normalize_unicode,
            normalize_whitespace)
        markovify.Text.__init__(self,
                                input_text,
                                state_size,
                                retain_original=False)
        self.grammar = Grammar({**self.synonyms, **self.entities})
        self.grammar.add_modifiers(base_english)

    def sentence_join(self, sentences):
        return " ".join(sentences)

    def word_split(self, sentence):
        tokenized = []
        first = True
        entity = False
        entity_construct = {"tag": "", "type": "", "words": []}
        for word in self.nlp(sentence):
            default = True
            # Group consecutive tokens of a named entity into a single token
            # and record it under its entity type.
            if word.ent_iob_ == "B":
                entity = True
                entity_construct['tag'] = word.tag_
                entity_construct['type'] = word.ent_type_
                entity_construct['words'] = []
            elif entity and word.ent_iob_ == 'O':
                entity = False
                text = self.separator.join(
                    (" ".join(entity_construct['words']),
                     entity_construct['tag']))
                tokenized.append(text)
                self.entities[entity_construct['type']].append(text)
            if word.pos_ in {'NOUN', 'VERB'}:
                # syns = wn.synsets(word.orth_, self.pos_converter[word.pos_])
                modifiers = []
                # Look up Datamuse "means like" results for nouns and verbs,
                # keeping only true synonyms of the same part of speech;
                # results are cached in self.synonyms.
                if word.orth_ not in self.synonyms:
                    r = requests.get('https://api.datamuse.com/words',
                                     params={'ml': word.orth_})
                    syns = []
                    if len(r.json()) > 0:
                        syns = [
                            obj['word'] for obj in r.json()
                            if 'syn' in obj['tags']
                            and UNIVERSAL_TO_LETTER[word.pos_] in obj['tags']
                            and 'prop' not in obj['tags']
                        ]
                    if len(syns) > 0:
                        self.synonyms[word.orth_] = syns
                    else:
                        self.synonyms[word.orth_] = False
                if self.synonyms[word.orth_]:
                    default = False
                    # Emit a tracery symbol instead of the literal word so
                    # word_join can swap in a synonym later.
                    if (not (first or entity)
                            and tokenized[-1].lower() in {'a', 'an'}):
                        if tokenized[-1] != tokenized[-1].lower():
                            first = True
                        modifiers.append('.a')
                    if not first and word.orth_[0].isupper():
                        modifiers.append('.capitalize')
                    if entity:
                        text = '#{}{}#'.format(word.orth_, ''.join(modifiers))
                    else:
                        text = self.separator.join(
                            ('#{}{}#'.format(word.orth_, ''.join(modifiers)),
                             word.tag_))
            if default:
                if entity:
                    text = word.orth_
                else:
                    text = self.separator.join((word.orth_, word.tag_))
            if entity:
                entity_construct['words'].append(text)
            else:
                tokenized.append(text)
            first = False
        return tokenized

    def word_join(self, words):
        sentence = []
        for word in words:
            (word, _) = word.split(self.separator)
            sentence.append(self.grammar.flatten(word))
        return " ".join(sentence).replace('_', ' ')

    def generate_text(self, **kwargs) -> str:
        length = kwargs.get('length', 50000)
        text = ''
        w_count = 0
        while w_count < length:
            sent = self.make_sentence()
            text += sent
            w_count += word_count(sent)
        return text

    def save_to_file(self, file_name: str, length: int = 50000):
        text = self.generate_text(length=length)
        with open(file_name, 'w') as f:
            f.write(text)
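# Usage sketch (assumption): corpus.txt and novel.txt are hypothetical paths;
# building the model downloads NLTK corpora, loads en_core_web_lg, and calls
# the Datamuse API per noun/verb, so it is slow and network-dependent.
with open('corpus.txt') as f:
    model = POSifiedText(f.read(), state_size=2)
print(model.make_sentence())
model.save_to_file('novel.txt', length=500)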