Example #1
def markov():
    form = MarkovChainTextMakerForm()
    form.debug_log()

    if form.validate_on_submit():
        logger.info(u'[flask] received valid form submission')

        data = {
            field.name: (clean.CleanInputString(field.data)
                         if isinstance(field.data, basestring) else field.data)
            for field in form
        }

        text_maker = text_makers.create_text_maker(
                input_text=data['input_text'],
                strategy=data['text_maker_strategy'],
                sentence_tokenizer=data['tokenizer_strategy'],
                joiner=data['joiner_strategy'],
                ngram_size=data['ngram_size'],
        )

        generated_text_title = text_maker.join(text_maker.make_sentences(count=1))
        generated_text_body = text_maker.join(text_maker.make_sentences(count=data['count_of_sentences_to_make']))

        generated_text_body = text_maker.proofread(generated_text_body)
        generated_text_title = text_maker.proofread(generated_text_title)

        for field in form:
            # make the fields 'sticky' by keeping values from last submission
            if not field.name.lower().startswith('csrf'):
                field.default = field.data

        return render_template(
                'index.html', form=form, generated_text=generated_text_body, generated_text_title=generated_text_title)

    return render_template('index.html', form=form)
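The excerpt does not show how this view is wired into the app; with Flask it would typically be registered with a route decorator along these lines (the app object, URL, and methods are assumptions, not shown in the source):

from flask import Flask

app = Flask(__name__)

@app.route('/', methods=['GET', 'POST'])  # hypothetical route; the source omits the registration
def markov():
    ...  # body as in Example #1 above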
Example #2
    def tokenize(self, text):
        """
        >>> WordTokenizerNLTK().tokenize("It's a beautiful day today...")
        [u"It's", u'a', u'beautiful', u'day', u'today', u'...']
        >>> from presswork.text.clean import CleanInputString
        >>> WordTokenizerNLTK().tokenize(CleanInputString("Hello there!!!"))
        [u'Hello', u'there', u'!', u'!', u'!']
        """
        text = clean.CleanInputString(text).unwrap()
        return WordList(self.strategy.tokenize(text))
Example #3
def create_text_maker(
    strategy=DEFAULT_TEXT_MAKER_NICKNAME,
    sentence_tokenizer=None,
    joiner=None,
    input_text=None,
    ngram_size=constants.DEFAULT_NGRAM_SIZE,
):
    """ Convenience factory to just "gimme a text maker" without knowing exact module layout. nicknames supported.

    rationale: I *do* want an easy way for callers to make these, but I want to keep the *classes* minimal -
    the constructors should have minimum necessary 'smarts'. so we stick the convenience-smarts here.

    :param strategy: specific nickname of class to use e.g. 'crude', 'pymc' 'markovify'
        can also just pass in an exact class. if not given, will use the default.
    :param sentence_tokenizer: (optional) an instance of sentence tokenizer - or a nickname such as
        'nltk', 'just_whitespace'. if not given, a TextMaker class will just use its default.
    :param joiner: (optional) an instance of joiner - or a nickname such as 'just_whitespace', 'moses'
    :param input_text: (optional) the input text to load into the TextMaker class.
        (if not given, can be loaded later load it later.)
    """
    text_maker_kwargs = {}

    ngram_size = int(ngram_size)

    if isinstance(strategy, basestring) or hasattr(strategy, 'lower'):
        ATextMakerClass = _get_text_maker_class(name_or_nickname=strategy)
    elif callable(strategy):
        ATextMakerClass = strategy
    else:
        raise ValueError(
            '{!r} does not appear to be a valid class or nickname'.format(strategy))

    if sentence_tokenizer:
        if isinstance(sentence_tokenizer, basestring) or hasattr(
                sentence_tokenizer, 'lower'):
            sentence_tokenizer = tokenizers.create_sentence_tokenizer(
                sentence_tokenizer)

        text_maker_kwargs["sentence_tokenizer"] = sentence_tokenizer

    if joiner:
        if isinstance(joiner, basestring) or hasattr(joiner, 'lower'):
            joiner = joiners.create_joiner(joiner)

        text_maker_kwargs["joiner"] = joiner

    text_maker = ATextMakerClass(ngram_size=ngram_size, **text_maker_kwargs)

    if input_text is not None:
        # CleanInputString 'memoizes' to avoid redundant cleaning - so it's no problem to call redundantly
        input_text = clean.CleanInputString(input_text)
        text_maker.input_text(input_text)

    return text_maker
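A minimal usage sketch of this factory, using nicknames quoted in the docstring; the module path presswork.text.text_makers is an assumption, inferred from the presswork.text.clean import in Example #2:

from presswork.text import text_makers  # assumed module path

text_maker = text_makers.create_text_maker(
    strategy='markovify',          # nickname, per the docstring
    sentence_tokenizer='nltk',     # optional; the TextMaker default is used if omitted
    joiner='just_whitespace',      # optional
    input_text=u"The quick brown fox jumps over the lazy dog. So it goes.",
    ngram_size=2,
)
print(text_maker.join(text_maker.make_sentences(count=2)))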
Example #4
    def input_text(self, input_text):
        """ build a fresh model from input text. (does not generate text - call make_sentences() to generate text.)

        * base class input_text() is public and handles pre/post hook that is same for all variants.
        * each subclass implements _input_tokenized(), private, implements the strategy. (may just adapt/forward)
        implements the *common* parts (not specific to subclasses)

        main effect is to change the state of the instance. the instance stores the strategy, the strategy
         stores the markov chain model it learns from the input text.

        :return: (optional) also returns the tokenized input text; this is mainly relevant for testing purposes
        """
        if self.is_locked:
            raise TextMakerIsLockedException(
                "locked! has input_text() already been called? (can only be called once)"
            )

        input_text = clean.CleanInputString(input_text)
        sentences_as_word_lists = self.sentence_tokenizer.tokenize(input_text)

        self._input_text(sentences_as_word_lists)
        self._lock()

        return sentences_as_word_lists
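A short sketch of the one-shot locking behavior the docstring describes: a second input_text() call raises TextMakerIsLockedException, so a fresh instance is needed. Construction goes through the Example #3 factory; the import path for the exception is an assumption:

from presswork.text.text_makers import TextMakerIsLockedException  # assumed import path

text_maker = text_makers.create_text_maker(strategy='crude')   # no input_text yet; load it later
text_maker.input_text(u"One fish. Two fish.")                  # builds the model, then locks
try:
    text_maker.input_text(u"Red fish. Blue fish.")             # rejected: already locked
except TextMakerIsLockedException:
    # can't re-train in place; build a new text maker instead
    text_maker = text_makers.create_text_maker(
        strategy='crude', input_text=u"Red fish. Blue fish.")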
Example #5
def main(ngram_size, strategy, tokenize, join, input_filename, input_encoding, output_encoding, count):
    logger = setup_logging()
    logger.debug("CLI invocation variable dump: {}".format(locals()))

    if input_filename == '-':
        if input_encoding == "raw":
            input_text = sys.stdin.read()
        else:
            reader_cls = codecs.getreader(input_encoding)  # StreamReader for the requested encoding
            sys.stdin = reader_cls(sys.stdin)
            input_text = sys.stdin.read()
    else:
        if input_encoding == "raw":
            with open(input_filename, 'r') as f:
                input_text = f.read()
        else:
            with codecs.open(input_filename, 'r', encoding=input_encoding) as f:
                input_text = f.read()

    logger.debug("CLI invocation variable dump again: {}".format(locals()))
    text_maker = text_makers.create_text_maker(
            strategy=strategy,
            sentence_tokenizer=tokenize,
            joiner=join,
            input_text=clean.CleanInputString(input_text),
            ngram_size=ngram_size)

    output_sentences = text_maker.make_sentences(count)
    output_text = text_maker.join(output_sentences)
    final_result = text_maker.proofread(output_text)

    writer_cls = codecs.getwriter(output_encoding)  # StreamWriter for the requested encoding
    sys.stdout = writer_cls(sys.stdout)

    sys.stdout.write(final_result)
    sys.stdout.write("\n")
Example #6
    def _tokenize_to_sentence_strings(self, text):
        # normalize the input (accepts unicode/str or CleanInputString), then split into sentence strings
        text = clean.CleanInputString(text).unwrap()
        return self.strategy.tokenize(text)
Example #7
    def _tokenize_to_sentence_strings(self, text):
        # same normalization, but delegate sentence splitting to markovify's splitter
        text = clean.CleanInputString(text).unwrap()
        return markovify.splitters.split_into_sentences(text)
Example #8
def _clean_text(text):
    return denoise_punctuation(clean.OutputProofreader().proofread(
        clean.CleanInputString(text).unwrap()))
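The same pipeline unrolled, purely to make the order of operations explicit (identical call order to the one-liner above):

def _clean_text_unrolled(text):
    text = clean.CleanInputString(text).unwrap()       # normalize the raw string
    text = clean.OutputProofreader().proofread(text)   # proofread the text
    return denoise_punctuation(text)                   # final punctuation cleanup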