Example #1
class BetweenLookGenerator(Generator):
    def __init__(self, bot1: Bot, bot2: Bot):
        self.bot1 = bot1
        self.bot2 = bot2

        self.grammar = Grammar({
            'origin': [
                '{quote} #name# {verb}.',
                # '{quote} {verb} #name#',
                '#name# {verb}, {quote}'
            ],
            'neutral': ['said'],
            'question': ['questioned', 'asked', 'inquired'],
            'exclaim': ['exclaimed', 'shouted'],
            'name': ['#They#', '#name#']
        })
        self.grammar.add_modifiers(base_english)

    def generate_text(self, **kwargs):
        length = kwargs.get('length', 50000)

        history: List[str] = [self.bot1.respond('')]
        infos: List[str] = [self.bot1.info()]
        w_count = word_count(history[0])

        quote = '"{}"'.format

        second = True
        while w_count < length:
            if second:
                sent = self.bot2.respond(history[-1])
                infos.append(self.bot2.info())
            else:
                sent = self.bot1.respond(history[-1])
                infos.append(self.bot1.info())

            second = not second

            history.append(sent)
            w_count += word_count(sent)

        text = ''

        # Pair each quote with the info of the bot that actually produced it
        for msg, info in zip(history, infos):
            msg = quote(msg)

            msg_type = 'neutral'
            if len(msg) >= 2:
                if msg[-2] == '?':
                    msg_type = 'question'

                elif msg[-2] == '!':
                    msg_type = 'exclaim'

            rule = self.grammar.flatten('\n{}#origin#'.format(info))
            rule = rule.format(quote=msg, verb='#{}#'.format(msg_type))
            rule = self.grammar.flatten(rule)

            text += rule[0].capitalize() + rule[1:]

        return text
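A minimal usage sketch for the generator above. EchoBot is a hypothetical stand-in for whatever implements the Bot interface (respond() and info()); the tracery actions returned by info() are an assumption about how each bot identifies itself to the grammar.

class EchoBot:
    """Hypothetical Bot: repeats the last message and names itself via tracery actions."""

    def __init__(self, name: str):
        self.name = name

    def respond(self, text: str) -> str:
        return text or 'Nice weather today, is it not?'

    def info(self) -> str:
        return '[name:{}][They:they]'.format(self.name)


dialogue = BetweenLookGenerator(EchoBot('Alice'), EchoBot('Bob'))
print(dialogue.generate_text(length=200))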
Example #2
class AcronymProcessor(Processor):
    splitting_pattern = re.compile(r'[A-Z][^A-Z]*')

    def __init__(self):
        self.api = Datamuse()

        self.grammar = Grammar({})
        self.grammar.add_modifiers(base_english)

    def process_text(self, input_text: str, **kwargs) -> str:
        topics = kwargs.get('topics', None)

        splitted = self.splitting_pattern.findall(input_text)

        dictionary: Dict[str, List[str]] = {}
        for start in (x for x in splitted if x not in self.grammar.symbols):
            if topics is None:
                res = self.api.words(sp='{}*'.format(start), max=1000)
            else:
                res = self.api.words(sp='{}*'.format(start),
                                     topics=topics,
                                     max=1000)

            dictionary[start] = [obj['word'] for obj in res]

        for k, v in dictionary.items():
            self.grammar.push_rules(k, v)

        return self.grammar.flatten('#{}.capitalize#'.format(
            '.capitalize# #'.join(splitted)))
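A usage sketch, assuming the Datamuse API is reachable; the acronym and the topics string are illustrative only.

processor = AcronymProcessor()

# Each capital-initial chunk of 'NASA' becomes a grammar rule filled with Datamuse words
print(processor.process_text('NASA', topics='space,rockets'))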
Example #3
class AcronymGenerator(Generator):
    splitting_pattern = re.compile(r'[A-Z][^A-Z]*')

    def __init__(self, acronym: str):
        self.acronym = acronym

        api = Datamuse()

        dictionary: Dict[str, List[str]] = {}

        splitted = self.splitting_pattern.findall(acronym)

        self.length = len(splitted)

        for start in splitted:
            res = api.words(sp='{}*'.format(start))

            dictionary[start] = [obj['word'] for obj in res]

        self.grammar = Grammar(dictionary)
        self.grammar.add_modifiers(base_english)

        self.rule = '#{}.capitalize#'.format('.capitalize# #'.join(splitted))

    def generate_text(self, **kwargs) -> str:
        length = kwargs.get('length', 50000)

        return '\n'.join((self.grammar.flatten(self.rule)
                          for _ in range(math.ceil(length / self.length))))

    def save_to_file(self, file_name: str, **kwargs):
        length = kwargs.get('length', 50000)

        text = self.generate_text(length=length)

        if file_name.endswith('.pdf'):
            doc = SimpleDocTemplate(file_name,
                                    pagesize=letter,
                                    rightMargin=72,
                                    leftMargin=72,
                                    topMargin=72,
                                    bottomMargin=18)

            styles = getSampleStyleSheet()
            styles.add(
                ParagraphStyle(name='Normal_CENTER',
                               parent=styles['Normal'],
                               alignment=TA_CENTER))

            doc.build([
                Paragraph('<font size="18">{}</font>'.format(self.acronym),
                          styles['Normal_CENTER']),
                Spacer(1, 12)
            ] + [
                Paragraph(p, styles['Normal_CENTER']) for p in text.split('\n')
            ])
        else:
            with open(file_name, 'w') as f:
                f.write(text)
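A usage sketch under the same assumptions (Datamuse reachable, reportlab installed for the PDF branch); the acronym and file name are illustrative.

gen = AcronymGenerator('NaNoGenMo')
print(gen.generate_text(length=40))            # a few acronym expansions
gen.save_to_file('nanogenmo.pdf', length=50000)  # one expansion per line, centered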
Example #4
    def __init__(self, seed: Optional[int] = None):
        self.cleaner = CleaningProcessor()

        self.fake = Faker()
        self.fake.add_provider(address)
        self.fake.add_provider(profile)

        if seed:
            random.seed(seed)
            self.fake.seed_instance(seed + 1)

        dictionary: Dict[str, List[str]] = {
            'cred_romance':
            ['USA Today bestselling', 'RITA Award winning', '#cred_gen#'],
            'cred_sci-fi': [
                '#scifantasy_award# Award winning',
                '#scifi_award# Award winning', '#cred_gen#'
            ],
            'scifi_award': ['BSFA', 'Philip K. Dick'],
            'cred_fantasy': [
                '#scifantasy_award# Award winning',
                '#scifi_award# Award winning', '#cred_gen#'
            ],
            'fantasy_award':
            ['British Fantasy', 'World Fantasy', 'Gemmell', 'Mythopoeic'],
            'scifantasy_award': ['Hugo', 'Nebula', 'Locus', 'Aurealis'],
            'cred_gen':
            ['#top_award# Prize winning', 'New York Times bestselling'],
            'top_award': ['Pulitzer', 'Booker', 'Nobel'],
            'good_adj': [
                'thrilling', 'fascinating', 'revolutionary', 'breathtaking',
                'beautiful', 'seminal'
            ],
            'married_and_kids': ['with #Their# #spouse# and #children#'],
            'married': ['with #Their# #spouse#'],
            'kids': ['with #Their# #children#'],
            'children':
            ['#num_kids# kids', '#num_kids# #kid_type.s#', '#kid_type#'],
            'kid_type': ['daughter', 'son'],
            'num_kids':
            4 * ['2'] + ['3']
        }

        def raw(text, **params):
            try:
                return self.raw_pattern.match(text).group(1)
            except AttributeError:
                return text

        self.grammar = Grammar(dictionary)
        self.grammar.add_modifiers(base_english)
        self.grammar.add_modifiers({
            'raw': raw,
            'clean': lambda x: x >> self.cleaner
        })
Example #5
    def __init__(self, bot1: Bot, bot2: Bot):
        self.bot1 = bot1
        self.bot2 = bot2

        self.grammar = Grammar({
            'origin': [
                '{quote} #name# {verb}.',
                # '{quote} {verb} #name#',
                '#name# {verb}, {quote}'
            ],
            'neutral': ['said'],
            'question': ['questioned', 'asked', 'inquired'],
            'exclaim': ['exclaimed', 'shouted'],
            'name': ['#They#', '#name#']
        })
        self.grammar.add_modifiers(base_english)
Example #6
    def __init__(self, acronym: str):
        self.acronym = acronym

        api = Datamuse()

        dictionary: Dict[str, List[str]] = {}

        splitted = self.splitting_pattern.findall(acronym)

        self.length = len(splitted)

        for start in splitted:
            res = api.words(sp='{}*'.format(start))

            dictionary[start] = [obj['word'] for obj in res]

        self.grammar = Grammar(dictionary)
        self.grammar.add_modifiers(base_english)

        self.rule = '#{}.capitalize#'.format('.capitalize# #'.join(splitted))
Example #7
    def __init__(self, input_text: str, state_size: int = 2):
        nltk.download('brown')
        nltk.download('gutenberg')
        self.nlp = spacy.load('en_core_web_lg')

        self.synonyms: Dict[str, List[str]] = defaultdict(list)
        self.entities: Dict[str, List[str]] = defaultdict(list)

        input_text = pipe(
            input_text,
            # lambda x: x.replace('\n', ' '),
            lambda x: self.clean_pattern.sub(' ', x),
            normalize_hyphenated_words,
            normalize_quotation_marks,
            normalize_unicode,
            normalize_whitespace)

        markovify.Text.__init__(self,
                                input_text,
                                state_size,
                                retain_original=False)

        self.grammar = Grammar({**self.synonyms, **self.entities})
        self.grammar.add_modifiers(base_english)
Example #8
def intro_letter() -> Iterator[str]:
    g = Grammar(_INTRO_LETTER)
    while True:
        yield g.flatten('#main#')
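intro_letter() is an infinite generator, so callers draw values from it rather than calling it once; a sketch, assuming _INTRO_LETTER is a grammar dict defined elsewhere.

from itertools import islice

for letter in islice(intro_letter(), 3):
    print(letter)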
Example #9
def monster_names() -> Iterator[str]:
    g = Grammar(_MONSTER_NAME)
    while True:
        yield g.flatten('#main#')
Example #10
class AutBioGenerator(Generator):
    raw_pattern = re.compile(r'^\(\((.+)\)\)$')
    state_pattern = re.compile(r', ([A-Z]{2}) [0-9]{5}$')

    def __init__(self, seed: Optional[int] = None):
        self.cleaner = CleaningProcessor()

        self.fake = Faker()
        self.fake.add_provider(address)
        self.fake.add_provider(profile)

        if seed:
            random.seed(seed)
            self.fake.seed_instance(seed + 1)

        dictionary: Dict[str, List[str]] = {
            'cred_romance':
            ['USA Today bestselling', 'RITA Award winning', '#cred_gen#'],
            'cred_sci-fi': [
                '#scifantasy_award# Award winning',
                '#scifi_award# Award winning', '#cred_gen#'
            ],
            'scifi_award': ['BSFA', 'Philip K. Dick'],
            'cred_fantasy': [
                '#scifantasy_award# Award winning',
                '#scifi_award# Award winning', '#cred_gen#'
            ],
            'fantasy_award':
            ['British Fantasy', 'World Fantasy', 'Gemmell', 'Mythopoeic'],
            'scifantasy_award': ['Hugo', 'Nebula', 'Locus', 'Aurealis'],
            'cred_gen':
            ['#top_award# Prize winning', 'New York Times bestselling'],
            'top_award': ['Pulitzer', 'Booker', 'Nobel'],
            'good_adj': [
                'thrilling', 'fascinating', 'revolutionary', 'breathtaking',
                'beautiful', 'seminal'
            ],
            'married_and_kids': ['with #Their# #spouse# and #children#'],
            'married': ['with #Their# #spouse#'],
            'kids': ['with #Their# #children#'],
            'children':
            ['#num_kids# kids', '#num_kids# #kid_type.s#', '#kid_type#'],
            'kid_type': ['daughter', 'son'],
            'num_kids':
            4 * ['2'] + ['3']
        }

        def raw(text, **params):
            try:
                return self.raw_pattern.match(text).group(1)
            except AttributeError:
                return text

        self.grammar = Grammar(dictionary)
        self.grammar.add_modifiers(base_english)
        self.grammar.add_modifiers({
            'raw': raw,
            'clean': lambda x: x >> self.cleaner
        })

    def generate_details(self, **kwargs) -> dict:
        details: Dict[str, Any] = {'genre': random.choice(['fantasy'])}

        details.update(self.fake.profile(sex=None))

        details['pronouns'] = get_pronouns(details['sex'])

        details['is_gay'] = random.random() < .0195
        details['is_married'] = random.random() < .43
        details['has_kids'] = random.random() < .74

        if details['is_married']:
            if details['is_gay']:
                details['spouse'] = {
                    'F': 'wife',
                    'M': 'husband'
                }[details['sex']]
            else:
                details['spouse'] = {
                    'F': 'husband',
                    'M': 'wife'
                }[details['sex']]

        addr = self.state_pattern.search(details['residence'])
        while not addr:
            details['residence'] = self.fake.address()

            addr = self.state_pattern.search(details['residence'])

        details['state'] = states.lookup(addr.group(1)).name

        details['signs'] = {
            'astrological': get_astrological_sign(details['birthdate']),
            'zodiac': get_zodiac_sign(details['birthdate'])
        }

        if random.random() < .3:
            details['inspired_by'] = {'sex': random.choice(['F', 'M'])}

            details['inspired_by']['relation'] = random.choice({
                'F': ['mother', 'sister', 'daughter'],
                'M': ['father', 'brother', 'son']
            }[details['inspired_by']['sex']])
        else:
            details['inspired_by'] = None

        return details

    def generate_text(self, **kwargs) -> str:
        details = kwargs.get('details', self.generate_details())

        text = """
            {name} is a #cred_{genre}# #author.raw# of #good_adj# {genre} #books.raw#
            who lives in {state}
        """

        if details['is_married']:
            if details['has_kids']:
                text += ' #{pronouns}[spouse:{spouse}]married_and_kids#'
            else:
                text += ' #{pronouns}[spouse:{spouse}]married#'

        elif details['has_kids']:
            text += ' #{pronouns}kids#'

        text += '.'

        print(text)
        return self.grammar.flatten(text.format(**details)) >> self.cleaner
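A usage sketch; it assumes Faker, the US states lookup, and the pronoun/sign helpers referenced above are importable, and the seed is illustrative.

bio_gen = AutBioGenerator(seed=42)
details = bio_gen.generate_details()
print(details['name'], details['state'], details['signs'])
print(bio_gen.generate_text(details=details))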
Example #11
    def __init__(self):
        self.api = Datamuse()

        self.grammar = Grammar({})
        self.grammar.add_modifiers(base_english)
Example #12
for action in tbs_plan:
    print(action)

# In[20]:

g = Grammar({
    'open':
    '#body_id.possessive# cranium is open and the brain is exposed',
    'closed':
    '#body_id.possessive# cranium is either intact, or closed and repaired',
    'brain_in':
    '#brain_id.possessive# brain is in #body_id.possessive# head',
    'empty':
    '#body_id.possessive# head does not contain a brain',
    'on_ice':
    '#body_id.possessive# brain is stored in an ice box',
    'open_skull':
    'open #body_id.possessive# skull carefully using a chainsaw',
    'close_skull':
    'replace the removed section of #body_id.possessive# skull with a sterilised metal plate and suture the skin',
    'remove_brain':
    'disconnect #brain_id.possessive# brain from peripheral nervous system connections and remove it from #body_id.possessive# skull',
    'replace_brain':
    'apply nerve growth factor to connection sites in #body_id.possessive# empty skull and #brain_id.possessive# brain, then place #brain_id.possessive# brain in #body_id.possessive# body'
})
g.add_modifiers(modifiers.base_english)

# In[21]:

for i, subject_id in enumerate(subject_ids):
    g.push_rules(subject_id, 'subject {0}'.format(chr(ord('A') + int(i))))
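A minimal rendering sketch for the grammar above. The explicit push_rules calls binding body_id and brain_id are assumptions about how the plan assigns those roles to subjects, and the .possessive modifier used by the rules is assumed to be registered alongside base_english.

# Hypothetical role bindings; real ids come from subject_ids above
g.push_rules('body_id', 'subject A')
g.push_rules('brain_id', 'subject B')

print(g.flatten('Step 1: #open_skull#.'))
print(g.flatten('Step 2: #remove_brain#.'))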
Example #13
    def __init__(
        self,
        # Data
        generator_text: Union[str, List[str]] = None,
        responder_text: List[str] = None,
        command_text: List[str] = None,
        grammar: Union[Dict[str, str], Grammar] = None,
        # Models
        chain: Union[Dict[str, Any], MarkovText] = None,
        phraser: Union[Dict[str, Any], Phraser] = None,
        word_vectors: Union[Dict[str, Any], KeyedVectors] = None,
        nn: Union[Dict[str, Any], Model] = None,
        # Chatterbot
        commander: ChatBot = None,
        **kwargs: Dict[str, int],
    ):
        # Defaults
        for key, value in {
                "word_vector_size": 256,
                "min_count": 5,
                "max_vocab_size": 40000000
        }.items():
            kwargs.setdefault(key, value)

        self.nlp = spacy.load("en")

        corpus = list(map(self.word_split, responder_text))

        # Chain
        if (not chain) or isinstance(chain, dict):
            chain = chain or {}
            for Key, Value in {
                    "state_size": 2,
                    "retain_original": True
            }.items():
                chain.setdefault(Key, Value)

            MarkovText.__init__(
                self,
                None,
                state_size=chain["state_size"],
                parsed_sentences=corpus +
                list(self.generate_corpus(generator_text)),
                retain_original=chain["retain_original"],
            )
        else:
            MarkovText.__init__(
                self,
                None,
                state_size=chain.state_size,
                chain=chain,
                parsed_sentences=chain.parsed_sentences,
                retain_original=chain.retain_original,
            )

        corpus = [[word.split(self.separator)[0] for word in sentence]
                  for sentence in corpus]

        # Phraser
        if (not phraser) or isinstance(phraser, dict):
            phraser = phraser or {}
            for Key, Value in {"gram_size": 3, "scoring": "default"}.items():
                phraser.setdefault(Key, Value)

            for _ in range(phraser["gram_size"]):
                self.phraser = Phraser(
                    Phrases(
                        corpus,
                        min_count=kwargs["min_count"],
                        max_vocab_size=kwargs["max_vocab_size"],
                        scoring=phraser["scoring"],
                    ))
                corpus = self.phraser[corpus]
        else:
            self.phraser = phraser
            corpus = self.phraser[corpus]

        # Word Vectors
        if (not word_vectors) or isinstance(word_vectors, dict):
            word_vectors = word_vectors or {}
            for Key, Value in {
                    "embedding_model": "fasttext",
                    "window": 5,
                    "workers": 3,
            }.items():
                word_vectors.setdefault(Key, Value)

            self.word_vectors = {
                "fasttext": FastText,
                "word2vec": Word2Vec
            }[word_vectors["embedding_model"].lower()](
                corpus,
                size=kwargs["word_vector_size"],
                window=word_vectors["window"],
                min_count=1,  # kwargs["min_count"],
                workers=word_vectors["workers"],
                max_vocab_size=kwargs["max_vocab_size"],
            ).wv
        else:
            self.word_vectors = word_vectors

        # LSTM RNN
        if (not nn) or isinstance(nn, dict):
            nn = nn or {}
            for Key, Value in {
                    "cell_type": "LSTM",
                    # "num_layers": 3, Perhaps later
                    "max_words": 100,
                    "sentence_vector_size": 300,
                    "activation": "tanh",
                    "dropout_rate": .2,
                    "loss": "categorical_crossentropy",
                    "learning_rate": .0005,
                    "metrics": ["accuracy"],
            }.items():
                nn.setdefault(Key, Value)

            input_statement = Input(
                shape=(nn["max_words"], kwargs["word_vector_size"]),
                name="input_statement",
            )
            input_response = Input(
                shape=(nn["max_words"], kwargs["word_vector_size"]),
                name="input_response",
            )

            self.nn = Model(
                inputs=[input_statement, input_response],
                outputs=[
                    Dense(kwargs["max_vocab_size"], activation="softmax")(
                        Dense(kwargs["max_vocab_size"] // 2,
                              activation="relu")(concatenate(
                                  [
                                      Bidirectional({
                                          "LSTM": LSTM,
                                          "GRU": GRU
                                      }[nn["cell_type"]](
                                          units=nn["sentence_vector_size"],
                                          input_shape=(
                                              nn["max_words"],
                                              kwargs["word_vector_size"],
                                          ),
                                          activation=nn["activation"],
                                          dropout=nn["dropout_rate"],
                                          kernel_initializer=lecun_uniform(),
                                      ))(input_statement),
                                      Bidirectional({
                                          "LSTM": LSTM,
                                          "GRU": GRU
                                      }[nn["cell_type"]](
                                          units=nn["sentence_vector_size"],
                                          input_shape=(
                                              nn["max_words"],
                                              kwargs["word_vector_size"],
                                          ),
                                          activation=nn["activation"],
                                          dropout=nn["dropout_rate"],
                                          kernel_initializer=lecun_uniform(),
                                      ))(input_response),
                                  ],
                                  axis=1,
                              )))
                ],
            )
            self.nn.compile(
                loss=nn["loss"],
                optimizer=Adam(lr=nn["learning_rate"]),
                metrics=nn["metrics"],
            )
        else:
            self.nn = nn

        # Commander
        self.commander = commander or ChatBot(
            "Commander",
            preprocessors=[
                "chatterbot.preprocessors.clean_whitespace",
                "chatterbot.preprocessors.convert_to_ascii",
            ],
            trainer="chatterbot.trainers.ListTrainer",
            logic_adapters=[
                {
                    "import_path": "chatterbot.logic.BestMatch"
                },
                {
                    "import_path": "chatterbot.logic.LowConfidenceAdapter",
                    "threshold": 0.65,
                    "default_response": "FAIL",
                },
            ],
        )
        if command_text:
            self.commander.train(command_text)

        # Grammar
        if (not grammar) or isinstance(grammar, dict):
            grammar = grammar or {}
            for Key, Value in {}.items():
                grammar.setdefault(Key, Value)

            self.grammar = Grammar(grammar)
            self.grammar.add_modifiers(base_english)
        else:
            self.grammar = grammar
Example #14
class Bottimus(MarkovText):
    @staticmethod
    def load(filename: str):
        with open(filename, "rb") as f:
            result = pickle.load(f)
        return result

    def __init__(
        self,
        # Data
        generator_text: Union[str, List[str]] = None,
        responder_text: List[str] = None,
        command_text: List[str] = None,
        grammar: Union[Dict[str, str], Grammar] = None,
        # Models
        chain: Union[Dict[str, Any], MarkovText] = None,
        phraser: Union[Dict[str, Any], Phraser] = None,
        word_vectors: Union[Dict[str, Any], KeyedVectors] = None,
        nn: Union[Dict[str, Any], Model] = None,
        # Chatterbot
        commander: ChatBot = None,
        **kwargs: Dict[str, int],
    ):
        # Defaults
        for key, value in {
                "word_vector_size": 256,
                "min_count": 5,
                "max_vocab_size": 40000000
        }.items():
            kwargs.setdefault(key, value)

        self.nlp = spacy.load("en")

        corpus = list(map(self.word_split, responder_text))

        # Chain
        if (not chain) or isinstance(chain, dict):
            chain = chain or {}
            for Key, Value in {
                    "state_size": 2,
                    "retain_original": True
            }.items():
                chain.setdefault(Key, Value)

            MarkovText.__init__(
                self,
                None,
                state_size=chain["state_size"],
                parsed_sentences=corpus +
                list(self.generate_corpus(generator_text)),
                retain_original=chain["retain_original"],
            )
        else:
            MarkovText.__init__(
                self,
                None,
                state_size=chain.state_size,
                chain=chain,
                parsed_sentences=chain.parsed_sentences,
                retain_original=chain.retain_original,
            )

        corpus = [[word.split(self.separator)[0] for word in sentence]
                  for sentence in corpus]

        # Phraser
        if (not phraser) or isinstance(phraser, dict):
            phraser = phraser or {}
            for Key, Value in {"gram_size": 3, "scoring": "default"}.items():
                phraser.setdefault(Key, Value)

            for _ in range(phraser["gram_size"]):
                self.phraser = Phraser(
                    Phrases(
                        corpus,
                        min_count=kwargs["min_count"],
                        max_vocab_size=kwargs["max_vocab_size"],
                        scoring=phraser["scoring"],
                    ))
                corpus = self.phraser[corpus]
        else:
            self.phraser = phraser
            corpus = self.phraser[corpus]

        # Word Vectors
        if (not word_vectors) or isinstance(word_vectors, dict):
            word_vectors = word_vectors or {}
            for Key, Value in {
                    "embedding_model": "fasttext",
                    "window": 5,
                    "workers": 3,
            }.items():
                word_vectors.setdefault(Key, Value)

            self.word_vectors = {
                "fasttext": FastText,
                "word2vec": Word2Vec
            }[word_vectors["embedding_model"].lower()](
                corpus,
                size=kwargs["word_vector_size"],
                window=word_vectors["window"],
                min_count=1,  # kwargs["min_count"],
                workers=word_vectors["workers"],
                max_vocab_size=kwargs["max_vocab_size"],
            ).wv
        else:
            self.word_vectors = word_vectors

        # LSTM RNN
        if (not nn) or isinstance(nn, dict):
            nn = nn or {}
            for Key, Value in {
                    "cell_type": "LSTM",
                    # "num_layers": 3, Perhaps later
                    "max_words": 100,
                    "sentence_vector_size": 300,
                    "activation": "tanh",
                    "dropout_rate": .2,
                    "loss": "categorical_crossentropy",
                    "learning_rate": .0005,
                    "metrics": ["accuracy"],
            }.items():
                nn.setdefault(Key, Value)

            input_statement = Input(
                shape=(nn["max_words"], kwargs["word_vector_size"]),
                name="input_statement",
            )
            input_response = Input(
                shape=(nn["max_words"], kwargs["word_vector_size"]),
                name="input_response",
            )

            self.nn = Model(
                inputs=[input_statement, input_response],
                outputs=[
                    Dense(kwargs["max_vocab_size"], activation="softmax")(
                        Dense(kwargs["max_vocab_size"] // 2,
                              activation="relu")(concatenate(
                                  [
                                      Bidirectional({
                                          "LSTM": LSTM,
                                          "GRU": GRU
                                      }[nn["cell_type"]](
                                          units=nn["sentence_vector_size"],
                                          input_shape=(
                                              nn["max_words"],
                                              kwargs["word_vector_size"],
                                          ),
                                          activation=nn["activation"],
                                          dropout=nn["dropout_rate"],
                                          kernel_initializer=lecun_uniform(),
                                      ))(input_statement),
                                      Bidirectional({
                                          "LSTM": LSTM,
                                          "GRU": GRU
                                      }[nn["cell_type"]](
                                          units=nn["sentence_vector_size"],
                                          input_shape=(
                                              nn["max_words"],
                                              kwargs["word_vector_size"],
                                          ),
                                          activation=nn["activation"],
                                          dropout=nn["dropout_rate"],
                                          kernel_initializer=lecun_uniform(),
                                      ))(input_response),
                                  ],
                                  axis=1,
                              )))
                ],
            )
            self.nn.compile(
                loss=nn["loss"],
                optimizer=Adam(lr=nn["learning_rate"]),
                metrics=nn["metrics"],
            )
        else:
            self.nn = nn

        # Commander
        self.commander = commander or ChatBot(
            "Commander",
            preprocessors=[
                "chatterbot.preprocessors.clean_whitespace",
                "chatterbot.preprocessors.convert_to_ascii",
            ],
            trainer="chatterbot.trainers.ListTrainer",
            logic_adapters=[
                {
                    "import_path": "chatterbot.logic.BestMatch"
                },
                {
                    "import_path": "chatterbot.logic.LowConfidenceAdapter",
                    "threshold": 0.65,
                    "default_response": "FAIL",
                },
            ],
        )
        if command_text:
            self.commander.train(command_text)

        # Grammar
        if (not grammar) or isinstance(grammar, dict):
            grammar = grammar or {}
            for Key, Value in {}.items():
                grammar.setdefault(Key, Value)

            self.grammar = Grammar(grammar)
            self.grammar.add_modifiers(base_english)
        else:
            self.grammar = grammar

    def sentence_split(self, text: str) -> list:
        return sent_tokenize(text)

    separator: str = "::"

    def word_split(self, sentence: str, pos: bool = True) -> List[str]:
        if pos:
            tokens = []
            for token in self.nlp(
                    sentence, disable=["tagger", "parser", "ner", "textcat"]):
                orth = token.orth_
                if orth.isspace() or token.like_url or orth.startswith("#"):
                    continue
                elif orth.startswith("@"):
                    tokens.append("#username#")
                else:
                    tokens.append(self.separator.join((orth, token.pos_)))
        else:
            tokens = word_tokenize(sentence)
        return tokens

    def word_join(self, words: List[str], pos: bool = True) -> str:
        if pos:
            sentence = " ".join(
                word.split(self.separator)[0] for word in words)
        else:
            sentence = " ".join(words)
        return sentence

    rule_pattern = re.compile("#(\\w|\\.)+#")

    def make_sentence(self,
                      statement: str = None,
                      init_state=None,
                      **kwargs: dict) -> str:
        if statement:
            response = str(self.commander.get_response(statement))
            if response == "FAIL":
                return "NI YET"
        else:
            response = MarkovText.make_sentence(self,
                                                init_state=init_state,
                                                kwargs=kwargs)
        return self.grammar.flatten(response)

    def save(self, filename: str) -> None:
        with open(filename, "wb") as f:
            pickle.dump(self, f)
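A construction sketch; the corpora here are tiny illustrative stand-ins for real training text, and the whole pipeline assumes the heavy dependencies above (spaCy, gensim, Keras, ChatterBot) are installed.

bot = Bottimus(
    generator_text='It was a dark and stormy night. The rain fell in torrents.',
    responder_text=['Hello there.', 'How are you today?', 'I am fine, thank you.'],
    command_text=['ping', 'pong'],
)
print(bot.make_sentence('ping'))   # routed through the commander
print(bot.make_sentence())         # falls back to the Markov chain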
Example #15
qg = Grammar({
    'animal':
    pycorpora.animals.common['animals'],
    'first_name_en':
    pycorpora.humans.firstNames['firstNames'],
    'last_name_en':
    pycorpora.humans.lastNames['lastNames'],
    'first_name_no':
    (pycorpora.humans.norwayFirstNamesBoys['firstnames_boys_norwegian'] +
     pycorpora.humans.norwayFirstNamesGirls['firstnames_girls_norwegian']),
    'last_name_no':
    pycorpora.humans.norwayLastNames['lastnames_norwegian'],
    'first_name_es':
    pycorpora.humans.spanishFirstNames['firstNames'],
    'last_name_es':
    pycorpora.humans.spanishLastNames['lastNames'],
    'any_title':
    pycorpora.humans.englishHonorifics['englishHonorifics'],
    'object': [
        x.strip() for x in pycorpora.objects.objects['objects']
        if x.strip()[-1] != 's'
    ],  # and len(x.split()) < 2
    'cluedo_suspect':
    pycorpora.games.cluedo['suspects']['Cluedo'],
    'cluedo_weapon':
    pycorpora.games.cluedo['weapons']['Cluedo'],
    'cluedo_room':
    pycorpora.games.cluedo['rooms'],
    'clue_suspect':
    pycorpora.games.cluedo['suspects']['Clue'],
    'clue_weapon':
    pycorpora.games.cluedo['weapons']['Clue'],
    'clue_room':
    pycorpora.games.cluedo['rooms'],
    'room':
    pycorpora.architecture.rooms['rooms'],
    'appliance':
    pycorpora.technology.appliances['appliances'],
    'strange_word':
    pycorpora.words.strange_words['words'],
    'name_suffix':
    pycorpora.humans.suffixes['suffixes'],
    'greek_god':
    pycorpora.mythology.greek_gods['greek_gods'],
    'greek_monster':
    pycorpora.mythology.greek_monsters['greek_monsters'],
    'greek_titan':
    pycorpora.mythology.greek_titans['greek_titans'],
    'celebrity':
    pycorpora.humans.celebrities['celebrities'],
    'street_core':
    ([x.split()[-1] for x in pycorpora.humans.celebrities['celebrities']] +
     [x.split()[-1] for x in pycorpora.humans.britishActors['britishActors']] +
     pycorpora.geography.english_towns_cities['towns'] +
     pycorpora.geography.english_towns_cities['cities'] +
     pycorpora.geography.countries['countries'] +
     [x['name'] for x in pycorpora.geography.oceans['oceans']] +
     [x['name'] for x in pycorpora.geography.rivers['rivers']]),
    'saint': [
        x['saint'] if any(x['saint'].startswith(t)
                          for t in saint_titles) else 'Saint ' + x['saint']
        for x in pycorpora.religion.christian_saints
    ],
    'pet': [
        '#animal.a.capitalize#', '#animal.a.capitalize#',
        '#animal.a.capitalize#', '#animal.a.capitalize#',
        '#animal.a.capitalize#', '#animal.a.capitalize#',
        '#animal.a.capitalize#', '#animal.a.capitalize#', '#celebrity#'
    ],
    'street_noun': [
        'street', 'road', 'street', 'road', 'street', 'road', 'street', 'road',
        'street', 'road', 'street', 'road', 'lane', 'avenue', 'close', 'way',
        'lane', 'avenue', 'close', 'way', 'boulevard', 'alley', 'drive',
        'crescent', 'court', 'hill', 'strand', 'end', 'prospect', 'gate'
    ],
    'street_adjective': ['old', 'new', 'west', 'east', 'north', 'south'],
    'small_cardinal': ['two', 'three', 'four'],
    'street': [
        '#street_core# #street_noun#', '#street_core# #street_noun#',
        '#street_core# #street_noun#', '#street_core# #street_noun#',
        '#street_core# #street_noun#', '#street_core# #street_noun#',
        '#street_adjective# #street_core# #street_noun#',
        '#street_adjective# #street_core# #street_noun#',
        '#street_adjective# #street_noun#', '#street_adjective# #street_noun#',
        '#small_cardinal# #street_core.s# #street_noun#',
        'the #street_adjective# #street_noun#', 'the #street_noun#',
        '#rare_street#'
    ],
    'rare_street': ['#street#', '#street#', '#street#', '#real_rare_street#'],
    'real_rare_street': [
        'whipmawhopma#street_noun#',
        'whip-ma-whop-ma-#street_noun#',
        #'#[street_core:#rude_word#]street#',
        '#[street_core:#strange_word#]street#'
    ],
    'greek_whatever': ['#greek_god#', '#greek_monster#', '#greek_titan#'],
    'cluedo': [
        '#cluedo_suspect#, in the #cluedo_room#, with the #cluedo_weapon#',
        '#clue_suspect#, in the #clue_room#, with the #clue_weapon#'
    ],
    'any_pronouns': [
        '{subject}/{object}/{dependentPossessive}/{independentPossessive}/{reflexive}'
        .format(**pronouns) for pronouns in
        pycorpora.humans.thirdPersonPronouns['thirdPersonPronouns']
    ],
    'simple_pronouns': [
        'he/him/his/his/himself', 'she/her/her/hers/herself',
        'they/them/their/theirs/themself'
    ],
    'pronouns': [
        '#simple_pronouns#', '#simple_pronouns#', '#simple_pronouns#',
        '#any_pronouns#'
    ],
    'simple_title': ['Mr', 'Mr', 'Mr', 'Mrs', 'Ms', 'Miss', 'Mx', 'Mx', 'Mx'],
    'title':
    ['#simple_title#', '#simple_title#', '#simple_title#', '#any_title#'],
    'first_name': [
        '#first_name_en#', '#first_name_en#', '#first_name_en#',
        '#first_name_no#', '#first_name_es#'
    ],
    'single_last_name': [
        '#last_name_en#', '#last_name_en#', '#last_name_en#', '#last_name_no#',
        '#last_name_es#'
    ],
    'last_name': [
        '#single_last_name#', '#single_last_name#',
        '#single_last_name#-#single_last_name#'
    ],
    'full_name_no_suffix':
    ['#first_name# #last_name#', '#first_name# #first_name# #last_name#'],
    'full_name': [
        '#full_name_no_suffix#', '#full_name_no_suffix#',
        '#full_name_no_suffix#', '#full_name_no_suffix# #name_suffix#'
    ],
    'title_last':
    '#title# #last_name#',
    'title_full_name':
    '#title# #full_name#',
    'first_name_noun': [
        'first name', 'given name', 'given name', 'given name', 'given name',
        'given name', 'personal name', 'personal name', 'personal name',
        'personal name', 'forename', 'Christian name'
    ],
    'last_name_noun': [
        'surname', 'surname', 'surname', 'surname', 'family name',
        'family name', 'family name', 'family name', 'family name', 'last name'
    ],
    'title_noun': ['honorific', 'title'],
    'low_ordinal_number': [
        'first', 'second', 'third', 'fourth', 'fifth', 'sixth', 'seventh',
        'eighth', 'ninth', 'tenth', 'eleventh', 'twelfth', 'thirteenth',
        'fourteenth', 'fifteenth'
    ],
    'numerated_object': [
        '#object.a#', 'two #object.s#', 'three #object.s#', 'four #object.s#',
        'five #object.s#', 'six #object.s#', 'seven #object.s#',
        'eight #object.s#', 'nine #object.s#'
    ],
    'object_collection_head':
    ['#numerated_object#', '#object_collection_head#, #numerated_object#'],
    'object_collection':
    ['#object_collection_head#, #numerated_object#, and #numerated_object#'],
    'receive_verb': ['receive', 'get'],
    'maybe_x': ['#x#', ''],
    'cheese_noun': [
        'cheese', 'cheese', 'cheese', 'cheese', 'curd',
        'fermented dairy product',
        'cheese, curd, or #[x:other ]maybe_x#fermented dairy product',
        'cheese or #[x:other ]maybe_x#fermented dairy product',
        'curd or #[x:other ]maybe_x#fermented dairy product', 'cheese or curd'
    ],
    'room_question_clause': [
        'were you born', 'was your first kiss', 'do you usually eat',
        'do you usually sleep',
        'do you keep your #[x:best ]maybe_x##appliance#', 'were you born',
        'was your first kiss', 'do you usually eat', 'do you usually sleep',
        'do you keep your #[x:best ]maybe_x##appliance#',
        'do you keep your life savings'
    ],
    'room_question': [
        'What kind of room #room_question_clause# in?',
        'In what kind of room #room_question_clause#?',
        'Where #room_question_clause#?'
    ],
    'room_answer': ['#room.a.capitalize#', 'The #room#'],
    'new_or_emerging': ['new', 'emerging', 'new or emerging'],
    'fabric_item': [
        'duvet cover', 'coat', 'skirt', 'pair of trousers', 'pair of pants',
        'bandana'
    ],
    'fabric_question': [
        'What is your favourite fabric?', 'What is your favourite fabric?',
        'What is your favourite fabric?',
        'What was your first #fabric_item# made of?',
        'What was your first #fabric_item# made out of?',
        'Of what fabric was your first #fabric_item# made?'
    ]
})
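A few illustrative flattenings of this grammar. This is a sketch and assumes base_english modifiers have been added to qg elsewhere, since several rules above rely on .a, .s, and .capitalize.

print(qg.flatten('#title_full_name# of #street.capitalize# did it: #cluedo#.'))
print(qg.flatten('#room_question# #room_answer#.'))
print(qg.flatten('#fabric_question#'))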
Example #16
def location_names() -> Iterator[str]:
    g = Grammar(_LOCATION_NAMES)
    while True:
        yield g.flatten('#main#')
Example #17
    def grammar(self) -> Grammar:
        g = Grammar(self.raw_grammar)
        g.add_modifiers(self.context.make_modifires(g))
        return g
Example #18
class POSifiedText(markovify.Text, Generator):
    separator = "<:>"
    clean_pattern = re.compile(r'[\n_]')
    tracery_pattern = re.compile(r'^#.+#$')

    def __init__(self, input_text: str, state_size: int = 2):
        nltk.download('brown')
        nltk.download('gutenberg')
        self.nlp = spacy.load('en_core_web_lg')

        self.synonyms: Dict[str, List[str]] = defaultdict(list)
        self.entities: Dict[str, List[str]] = defaultdict(list)

        input_text = pipe(
            input_text,
            # lambda x: x.replace('\n', ' '),
            lambda x: self.clean_pattern.sub(' ', x),
            normalize_hyphenated_words,
            normalize_quotation_marks,
            normalize_unicode,
            normalize_whitespace)

        markovify.Text.__init__(self,
                                input_text,
                                state_size,
                                retain_original=False)

        self.grammar = Grammar({**self.synonyms, **self.entities})
        self.grammar.add_modifiers(base_english)

    def sentence_join(self, sentences):
        return " ".join(sentences)

    def word_split(self, sentence):
        tokenized = []
        first = True
        entity = False

        entity_construct = {"tag": "", "type": "", "words": []}
        for word in self.nlp(sentence):
            default = True

            if word.ent_iob_ == "B":
                entity = True
                entity_construct['tag'] = word.tag_
                entity_construct['type'] = word.ent_type_
                entity_construct['words'] = []
            elif entity and word.ent_iob_ == 'O':
                entity = False
                text = self.separator.join(
                    (" ".join(entity_construct['words']),
                     entity_construct['tag']))

                tokenized.append(text)
                self.entities[entity_construct['type']].append(text)

            if word.pos_ in {'NOUN', 'VERB'}:
                # syns = wn.synsets(word.orth_, self.pos_converter[word.pos_])

                modifiers = []

                if word.orth_ not in self.synonyms:
                    r = requests.get('https://api.datamuse.com/words',
                                     params={'ml': word.orth_})

                    syns = []
                    if len(r.json()) > 0:
                        syns = [
                            obj['word'] for obj in r.json()
                            if 'syn' in obj['tags']
                            and UNIVERSAL_TO_LETTER[word.pos_] in obj['tags']
                            and 'prop' not in obj['tags']
                        ]

                    if len(syns) > 0:
                        self.synonyms[word.orth_] = syns
                    else:
                        self.synonyms[word.orth_] = False

                if self.synonyms[word.orth_]:
                    default = False

                    if (not (first or entity)
                            and tokenized[-1].lower() in {'a', 'an'}):
                        if tokenized[-1] != tokenized[-1].lower():
                            first = True
                        modifiers.append('.a')

                    if not first and word.orth_[0].isupper():
                        modifiers.append('.capitalize')

                    if entity:
                        text = '#{}{}#'.format(word.orth_, ''.join(modifiers))
                    else:
                        text = self.separator.join(
                            ('#{}{}#'.format(word.orth_,
                                             ''.join(modifiers)), word.tag_))

            if default:
                if entity:
                    text = word.orth_
                else:
                    text = self.separator.join((word.orth_, word.tag_))

            if entity:
                entity_construct['words'].append(text)
            else:
                tokenized.append(text)

            first = False

        return tokenized

    def word_join(self, words):
        sentence = []
        for word in words:
            (word, _) = word.split(self.separator)
            sentence.append(self.grammar.flatten(word))

        return " ".join(sentence).replace('_', ' ')

    def generate_text(self, **kwargs) -> str:
        length = kwargs.get('length', 50000)

        text = ''
        w_count = 0

        while w_count < length:
            sent = self.make_sentence()
            text += sent
            w_count += word_count(sent)

        return text

    def save_to_file(self, file_name: str, length: int = 50000):
        text = self.generate_text(length=length)

        with open(file_name, 'w') as f:
            f.write(text)
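A usage sketch; corpus.txt and novel.txt are hypothetical file names, and the spaCy en_core_web_lg model, the NLTK downloads in __init__, and network access for the Datamuse lookups are all assumed to be available.

with open('corpus.txt') as f:            # hypothetical source text
    model = POSifiedText(f.read(), state_size=2)

print(model.make_sentence())
model.save_to_file('novel.txt', length=1000)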