def write_story():
    """Generate the text of the 'Circe' chapter.

    Builds one Markov mapping per speaker corpus found in circe_corpora_path,
    then walks the stats file (a pipe-separated map of the original chapter's
    paragraphs) and generates a matching paragraph for each coded line.

    Returns the full chapter as a single newline-joined string.

    Relies on module-level names: glob, os, pprint, log_it, chain_length,
    circe_corpora_path, circe_stats_path, mixin_texts_dir,
    buildMapping_withMixins, gen_text.
    """
    corpora = {}    # was {}.copy(): a dict literal already yields a fresh dict
    log_it("INFO: about to start processing corpora.")
    for which_corpus in glob.glob(circe_corpora_path + '*txt'):
        log_it(' INFO: processing "%s".' % which_corpus, 2)
        starts, the_mapping = buildMapping_withMixins(chain_length, [which_corpus],
                                                      glob.glob('%s/*txt' % mixin_texts_dir))
        corpus_name = os.path.basename(which_corpus)[:-4]    # strip the ".txt" extension
        corpora[corpus_name] = [starts, the_mapping]
    log_it("DEBUGGING: Corpora are: \n" + pprint.pformat(corpora), 6)    # pprint.pformat() for the WHOLE DICTIONARY takes FOREVER
    the_chapter = []    # was [][:]: a list literal already yields a fresh list

    def get_speaker_text(speaker_name, num_sentences):
        """Return num_sentences of generated text in the voice of speaker_name.

        Falls back to the 'STAGE DIRECTIONS' corpus for stage directions and
        to 'MINOR CHARACTERS' for any speaker without a corpus of their own.
        """
        if speaker_name in corpora:
            which_index = speaker_name
        elif speaker_name == 'STAGE':
            which_index = 'STAGE DIRECTIONS'
        else:
            which_index = 'MINOR CHARACTERS'
        starts, the_mapping = corpora[which_index]    # was tuple(...): a 2-element list unpacks directly
        return gen_text(the_mapping, starts, markov_length=chain_length,
                        sentences_desired=num_sentences, paragraph_break_probability=0)

    log_it("INFO: About to process stats file.")
    with open(circe_stats_path) as circe_stats_file:
        for the_encoded_paragraph in circe_stats_file:
            # Process each line, using it as a map of the corresponding paragraph in 'Circe'.
            # Structure of these lines is defined in /UlyssesRedux/code/utility_scripts/analyze-chapter-15.py.
            # But here's a quick reminder:
            # Two parts: a name of a speaker (or "STAGE" if it's a paragraph of stage directions), then a series of codes for "chunks" of the paragraph.
            # A "chunk" is a number of sentences. If the number is preceded by opening parens, it's an intraparagraph stage direction.
            # Parts of the line, and chunk descriptions, are separated by vertical bars (pipe characters), hence the .psv extension.
            log_it('INFO: Processing coded line "%s".' % the_encoded_paragraph.strip(), 2)
            code_to_process = the_encoded_paragraph.split('|')
            speaker_name = code_to_process.pop(0)
            log_it(' speaker name is "%s".' % speaker_name, 2)
            if speaker_name != 'STAGE':    # Unless the name is 'STAGE', add it to the beginning of this paragraph
                this_paragraph = '%s: ' % speaker_name
            else:    # In which case, begin with an opening parenthesis.
                this_paragraph = '('
            while len(code_to_process) > 0:
                chunk_descriptor = code_to_process.pop(0)
                log_it(' processing chunk "%s".' % chunk_descriptor.strip(), 2)
                if chunk_descriptor[0] == '(':    # intraparagraph stage direction: wrap it in parens
                    this_paragraph = this_paragraph + '(%s) ' % (get_speaker_text('STAGE', int(chunk_descriptor[1:])))
                else:
                    this_paragraph = this_paragraph + '%s ' % (get_speaker_text(speaker_name, int(chunk_descriptor)))
                log_it(' current paragraph length is now %d.' % len(this_paragraph), 3)
            if speaker_name == 'STAGE':    # close the parenthesis opened above
                this_paragraph = this_paragraph.strip() + ')'
            log_it(' done with this paragraph; total length is %d.' % len(this_paragraph), 2)
            the_chapter.append(this_paragraph)
    return '\n'.join(the_chapter)
def write_story():
    """Generate the text of the 'Wandering Rocks' chapter.

    Reads per-section statistics (section number, paragraph count, sentence
    count, word count) from the CSV stats file, then generates each of the
    chapter's sections from a Markov model built over that section plus its
    two wrap-around neighbors.

    Returns the sections joined by '\\n* * *\\n' separators.

    Raises:
        IndexError: if the stats file lists a section number out of order.

    Relies on module-level names: glob, pformat, log_it, chain_length,
    sections_in_chapter, wandering_rocks_sections_path,
    wandering_rocks_stats_file, mixin_texts_dir, buildMapping_withMixins,
    gen_text.
    """
    output_text = []    # was [][:]: a list literal already yields a fresh list
    # First, set up table of filenames
    section_filenames = []
    for which_section in range(1, 1 + sections_in_chapter):
        section_filenames.append('%s/%02d.txt' % (wandering_rocks_sections_path, which_section))
    log_it("INFO: filenames table set up")
    log_it(" length is %d" % len(section_filenames), 2)
    log_it("\n and the filenames table is:\n" + pformat(section_filenames))
    # was a bare open() that was never closed; the with-block guarantees the
    # stats file is closed even if parsing raises.
    with open(wandering_rocks_stats_file) as stats_file:
        the_line = stats_file.readline()    # Read and ignore the header line
        log_it("INFO: header read from stats file, about to parse stats file and start generating text")
        for which_section in range(1, 1 + sections_in_chapter):
            the_line = stats_file.readline()    # Read another line from the stats file
            log_it("INFO: Parsing the line '%s'." % the_line.split(), 2)
            sec, pars, sents, words = map(int, the_line.split(','))
            log_it(" sec: %d; pars: %d; sents: %d; words: %d" % (sec, pars, sents, words), 2)
            if sec != which_section:    # elementary sanity check
                raise IndexError("The stats file for Wandering Rocks is corrupt: section number %d encountered out of order." % sec)
            log_it(" generating based on sections %d, %d, %d." % (1 + (which_section + 17) % 19, which_section, (which_section + 1) % 19), 2)
            log_it(" asking for %d sentences with paragraph break probability of %f." % (sents, pars/sents))
            # The model for each section draws on the section itself plus its
            # previous and next sections, wrapping around mod 19 at the ends.
            which_rocks_sections = [section_filenames[1 + (which_section + 17) % 19 - 1],
                                    section_filenames[which_section - 1],
                                    section_filenames[(which_section + 1) % 19 - 1]]
            starts, the_mapping = buildMapping_withMixins(chain_length, which_rocks_sections,
                                                          glob.glob('%s/*txt' % mixin_texts_dir))
            output_text.append(gen_text(the_mapping, starts, markov_length=chain_length,
                                        sentences_desired=sents,
                                        paragraph_break_probability=(pars/sents)))
    return '\n* * *\n'.join(output_text)
import patrick_logger    # From https://github.com/patrick-brian-mooney/personal-library
from patrick_logger import log_it

# First, set up constants
questions_chain_length = 1
answers_chain_length = 2
mixin_texts_dir = '%s17' % current_run_corpus_directory
patrick_logger.verbosity_level = 0
log_it("INFO: Imports successful, moving on", 2)

# Create the necessary sets of Markov chains once, at the beginning of the script's run
questions_starts, questions_mapping = buildMapping(word_list(ithaca_questions_path),
                                                   markov_length=questions_chain_length)
answers_starts, answers_mapping = buildMapping_withMixins(answers_chain_length,
                                                          [ithaca_answers_path],
                                                          glob.glob('%s/*txt' % mixin_texts_dir))
log_it("INFO: built mappings from both question and answer files, moving on", 2)

# Unlike the 'Aeolus' script, this script makes no effort to enforce sticking within word-limit boundaries.
# You can see that in the next two routines, which just call sentence_generator.gen_text() directly.

def getQuestion(num_sents, num_words):
    """Generate num_sents sentences of question text from the questions model.

    num_words is logged for diagnostics but does not constrain the output
    (no word-limit enforcement here; see the note above about 'Aeolus').
    """
    log_it(" getQuestion() called", 2)
    log_it(" num_sents: %d; num_words: %d" % (num_sents, num_words), 3)
    return gen_text(questions_mapping, questions_starts,
                    markov_length=questions_chain_length,
                    sentences_desired=num_sents,
                    paragraph_break_probability=0)

def getAnswer(num_sents, num_words):
    """Generate num_sents sentences of answer text from the answers model.

    num_words is logged for diagnostics but does not constrain the output
    (no word-limit enforcement here; see the note above about 'Aeolus').
    """
    log_it(" getAnswer() called", 2)
    log_it(" num_sents: %d; num_words: %d" % (num_sents, num_words), 3)
    return gen_text(answers_mapping, answers_starts,
                    markov_length=answers_chain_length,
                    sentences_desired=num_sents,
                    paragraph_break_probability=0)