Example #1
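# Assumed context: glob, os, and pprint are the standard-library imports this excerpt
# needs directly; log_it, buildMapping_withMixins, gen_text, chain_length,
# circe_corpora_path, circe_stats_path, and mixin_texts_dir are defined elsewhere
# in the script this excerpt comes from.
import glob
import os
import pprint
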
def write_story():
    corpora = {}

    log_it("INFO: about to start processing corpora.")

    for which_corpus in glob.glob(circe_corpora_path + '*txt'):
        log_it('  INFO: processing "%s".' % which_corpus, 2)
        starts, the_mapping = buildMapping_withMixins(chain_length, [which_corpus], glob.glob('%s/*txt' % mixin_texts_dir))
        corpus_name = os.path.basename(which_corpus)[:-4]
        corpora[corpus_name] = [starts, the_mapping]
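        # corpora now maps each speaker's corpus name to its [starts, the_mapping] pair.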

    log_it("DEBUGGING: Corpora are: \n" + pprint.pformat(corpora), 6)           # pprint.pformat() for the WHOLE DICTIONARY takes FOREVER

    the_chapter = []

    def get_speaker_text(speaker_name, num_sentences):
        if speaker_name in corpora:
            which_index = speaker_name
        elif speaker_name == 'STAGE':
            which_index = 'STAGE DIRECTIONS'
        else:
            which_index = 'MINOR CHARACTERS'
        starts, the_mapping = corpora[which_index]
        return gen_text(the_mapping, starts, markov_length=chain_length, sentences_desired=num_sentences, paragraph_break_probability=0)

    log_it("INFO: About to process stats file.")

    with open(circe_stats_path) as circe_stats_file:
        for the_encoded_paragraph in circe_stats_file:
            # Process each line, using it as a map of the corresponding paragraph in 'Circe'.
            # Structure of these lines is defined in /UlyssesRedux/code/utility_scripts/analyze-chapter-15.py.
            # But here's a quick reminder:
            # Two parts: a name of a speaker (or "STAGE" if it's a paragraph of stage directions), then a series of codes for "chunks" of the paragraph.
            # A "chunk" is a number of sentences. If the number is preceded by an opening parenthesis, it's an intraparagraph stage direction.
            # Parts of the line, and chunk descriptions, are separated by vertical bars (pipe characters), hence the .psv extension.
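            # For instance, a (hypothetical) line like "BLOOM|2|(1|3" would mean: BLOOM speaks
            # two sentences, then a one-sentence parenthesized stage direction, then three more sentences.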
            log_it('INFO: Processing coded line "%s".' % the_encoded_paragraph.strip(), 2)
            code_to_process = the_encoded_paragraph.split('|')
            speaker_name = code_to_process.pop(0)
            log_it('  speaker name is "%s".' % speaker_name, 2)
            if speaker_name != 'STAGE':                                     # A spoken paragraph begins with the speaker's name
                this_paragraph = '%s: ' % speaker_name
            else:                                                           # A stage-direction paragraph begins with an opening parenthesis
                this_paragraph = '('
            while code_to_process:
                chunk_descriptor = code_to_process.pop(0)
                log_it('    processing chunk "%s".' % chunk_descriptor.strip(), 2)
                if chunk_descriptor[0] == '(':
                    this_paragraph = this_paragraph + '(%s) ' % (get_speaker_text('STAGE', int(chunk_descriptor[1:])))
                else:
                    this_paragraph = this_paragraph + '%s ' % (get_speaker_text(speaker_name, int(chunk_descriptor)))
                log_it('      current paragraph length is now %d.' % len(this_paragraph), 3)
            if speaker_name == 'STAGE':
                this_paragraph = this_paragraph.strip() + ')'
            log_it('        done with this paragraph; total length is %d.' % len(this_paragraph), 2)
            the_chapter.append(this_paragraph)

    return '\n'.join(the_chapter)
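
All three examples in this file drive the same small pipeline: build a Markov mapping from one or more corpus files (optionally blended with mixin texts), then ask gen_text() for some number of sentences. Here is a minimal standalone sketch of that pipeline, assuming the sentence_generator module named in Example #3's comments and using hypothetical file paths:

import glob

from sentence_generator import buildMapping_withMixins, gen_text    # assumed module layout

chain_length = 2                                        # Markov chain order, as in the examples
corpus_files = ['corpora/some_speaker.txt']             # hypothetical corpus file
mixin_files = glob.glob('corpora/mixins/*txt')          # hypothetical mixin directory

# Build the chain from the corpus plus the mixin texts, then generate four
# sentences as a single paragraph (paragraph_break_probability=0).
starts, the_mapping = buildMapping_withMixins(chain_length, corpus_files, mixin_files)
print(gen_text(the_mapping, starts, markov_length=chain_length,
               sentences_desired=4, paragraph_break_probability=0))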
Example #2
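# Assumed context: glob and pformat are the only imports this excerpt needs directly;
# log_it, buildMapping_withMixins, gen_text, chain_length, sections_in_chapter,
# wandering_rocks_sections_path, wandering_rocks_stats_file, and mixin_texts_dir
# are defined elsewhere in the script this excerpt comes from.
import glob
from pprint import pformat
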
def write_story():
    output_text = []

    # First, set up table of filenames
    section_filenames = []
    for which_section in range(1, 1 + sections_in_chapter):
        section_filenames.append('%s/%02d.txt' % (wandering_rocks_sections_path, which_section))

    log_it("INFO: filenames table set up")
    log_it("  length is %d" % len(section_filenames), 2)
    log_it("\n    and the filenames table is:\n" + pformat(section_filenames))

    with open(wandering_rocks_stats_file) as stats_file:
        stats_file.readline()                   # Read and ignore the header line

        log_it("INFO: header read from stats file, about to parse stats file and start generating text")

        for which_section in range(1, 1 + sections_in_chapter):
            the_line = stats_file.readline()    # Read the next line from the stats file
            log_it("INFO: Parsing the line '%s'." % the_line.strip(), 2)
            sec, pars, sents, words = map(int, the_line.split(','))
            log_it("    sec: %d; pars: %d; sents: %d; words: %d" % (sec, pars, sents, words), 2)
            if sec != which_section:            # elementary sanity check
                raise IndexError("The stats file for Wandering Rocks is corrupt: section number %d encountered out of order." % sec)
            log_it("    generating based on sections %d, %d, %d." % (1 + (which_section + 17) % 19, which_section, 1 + which_section % 19), 2)
            log_it("      asking for %d sentences with paragraph break probability of %f." % (sents, pars/sents))

            # Each section is modeled on itself plus its two neighbors, wrapping
            # around the chapter's nineteen sections.
            which_rocks_sections = [
                                     section_filenames[(which_section + 17) % 19],
                                     section_filenames[which_section - 1],
                                     section_filenames[which_section % 19]
                                    ]
            starts, the_mapping = buildMapping_withMixins(chain_length, which_rocks_sections, glob.glob('%s/*txt' % mixin_texts_dir))

            output_text.append(gen_text(the_mapping, starts, markov_length=chain_length, sentences_desired=sents,
                    paragraph_break_probability=(pars/sents)))

    return '\n*   *   *\n'.join(output_text)
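
The modular arithmetic above picks each section's two neighbors, wrapping around at both ends of the nineteen-section chapter. A quick standalone check of the 0-based indices it produces into section_filenames:

for which_section in (1, 2, 18, 19):
    prev_i = (which_section + 17) % 19      # previous section, wrapping 1 -> 19
    this_i = which_section - 1              # the section itself
    next_i = which_section % 19             # next section, wrapping 19 -> 1
    print(which_section, '->', prev_i + 1, this_i + 1, next_i + 1)
# 1 -> 19 1 2
# 2 -> 1 2 3
# 18 -> 17 18 19
# 19 -> 18 19 1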
Example #3
import glob

import patrick_logger                 # From https://github.com/patrick-brian-mooney/personal-library
from patrick_logger import log_it
# word_list, buildMapping, buildMapping_withMixins, and gen_text are assumed to come
# from the project's sentence_generator module, which this excerpt does not show.

# First, set up constants
questions_chain_length = 1
answers_chain_length = 2
mixin_texts_dir = '%s17' % current_run_corpus_directory

patrick_logger.verbosity_level = 0
log_it("INFO: Imports successful, moving on", 2)

# Create the necessary sets of Markov chains once, at the beginning of the script's run

questions_starts, questions_mapping = buildMapping(word_list(ithaca_questions_path), markov_length=questions_chain_length)
answers_starts, answers_mapping = buildMapping_withMixins(answers_chain_length, [ithaca_answers_path], glob.glob('%s/*txt' % mixin_texts_dir))

log_it("INFO: built mappings from both question and answer files, moving on", 2)

# Unlike the 'Aeolus' script, this script makes no effort to stay within word-limit boundaries.
# You can see that in the next two routines, which just call sentence_generator.gen_text() directly.

def getQuestion(num_sents, num_words):      # num_words is accepted but ignored: no word-limit enforcement (see above)
    log_it("    getQuestion() called", 2)
    log_it("      num_sents: %d; num_words: %d" % (num_sents, num_words), 3)
    return gen_text(questions_mapping, questions_starts, markov_length=questions_chain_length, sentences_desired=num_sents, paragraph_break_probability=0)

def getAnswer(num_sents, num_words):        # num_words is likewise accepted but ignored
    log_it("    getAnswer() called", 2)
    log_it("      num_sents: %d; num_words: %d" % (num_sents, num_words), 3)
    return gen_text(answers_mapping, answers_starts, markov_length=answers_chain_length, sentences_desired=num_sents, paragraph_break_probability=0)
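
Since both mappings are built once at module load, producing a question-and-answer pair is just a pair of calls. A hypothetical usage sketch (the sentence counts are invented; the real script presumably takes them from a stats file):

# Hypothetical driver loop, not part of the original script.
qa_pairs = []
for num_sents in (1, 2, 3):
    question = getQuestion(num_sents, num_words=0)      # num_words is ignored (see above)
    answer = getAnswer(num_sents + 1, num_words=0)
    qa_pairs.append('%s\n\n%s' % (question, answer))
print('\n\n'.join(qa_pairs))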