def tone_and_syllabify_verse(verse, special_tokens, tone_tagger, synalepha=True):
    """Accent-tag every word of ``verse`` and split the result into syllables.

    Blank verses produce an empty list, and a verse that is itself one of
    the special tokens is passed through untouched as a single-item list.
    Words that are special tokens are kept verbatim; every other word is
    run through ``tone_tagger.tone`` before syllabification.
    """
    # Guard: nothing to syllabify.
    if not verse.strip():
        return []
    # Guard: special-token "verses" are atomic.
    if verse in special_tokens.values():
        return [verse]

    toned_words = []
    for word in verse.split():
        if word.strip() in special_tokens.values():
            toned_words.append(word)          # keep markers untouched
        else:
            toned_words.append(tone_tagger.tone(word))
    toned_verse = ' '.join(toned_words)

    return syllabify_verse(toned_verse, special_tokens, synalepha=synalepha)
def get_mean_std_verse_length(toned_verses_syls, synalepha, special_tokens=None):
    """Return ``(mean, std)`` of verse lengths, measured in syllables.

    Special-token entries are excluded from each verse's syllable count,
    and verses that end up empty are skipped entirely.

    Parameters
    ----------
    toned_verses_syls : iterable of list of str
        One list of syllable strings per verse.
    synalepha : any
        Unused; kept only so existing call sites keep working.
    special_tokens : dict, optional
        Token-name -> token-string mapping. When omitted, falls back to a
        module-level ``special_tokens`` global (the original implicit
        behavior), or an empty mapping if none exists.

    Returns
    -------
    tuple of float
        Mean and population standard deviation; ``(0.0, 0.0)`` when no
        verse contributes any syllables (avoids numpy's nan + warning).
    """
    if special_tokens is None:
        # BUGFIX: the original read a free global named `special_tokens`;
        # keep that as a fallback but allow explicit injection.
        special_tokens = globals().get('special_tokens', {})
    lengths = []
    for verse_syls in toned_verses_syls:
        syllables = [
            s.strip() for s in verse_syls
            if s.strip() not in special_tokens.values()
        ]
        if syllables:
            lengths.append(len(syllables))
    if not lengths:
        # np.mean([]) / np.std([]) would return nan with a RuntimeWarning.
        return 0.0, 0.0
    lengths = np.array(lengths)
    return np.mean(lengths), np.std(lengths)
def is_hendecasyllable(syllables, special_tokens):
    """Return True when the verse scans as a hendecasyllable.

    The verse qualifies when, after stripping special tokens and
    prettifying each syllable, it has more than 9 syllables and its last
    stressed syllable sits at index 9 (i.e. the tenth syllable).
    """
    cleaned = [
        prettify_text(s, special_tokens).strip()
        for s in syllables
        if s not in special_tokens.values()
    ]
    # Too short to place a stress on the tenth syllable.
    if len(cleaned) <= 9:
        return False
    return get_last_tonedsyl_index(cleaned, special_tokens) == 9
def _apply_synalepha(syllables, special_tokens):
    """Merge vowel-adjacent syllables across word boundaries (synalepha)
    until the verse's last stressed syllable lands at index 9, when possible.

    Expects ``syllables`` to still contain ``WORD_SEP`` tokens; merging
    fuses <previous syllable> + <WORD_SEP> + <next syllable> into one entry.
    """
    syllables_cleaned = [
        prettify_text(s, special_tokens).strip() for s in syllables
        if s not in special_tokens.values()
    ]
    # A verse of 9 or fewer real syllables never needs shortening.
    if len(syllables_cleaned) <= 9:
        return syllables
    # SMARAGLIATA  (original author's marker — meaning unclear, kept as-is)
    # NOTE(review): vowel set includes H and apostrophe, presumably to treat
    # silent h / elided words as vowel-initial — confirm with the corpus.
    vowels = "ÁÀAaàáÉÈEeèéIÍÌiíìOoóòÚÙUuúùHh'"
    # --- Pass 1: count every word boundary eligible for synalepha. ---
    n_synalepha = 0
    i = 1
    while i < (len(syllables) - 1):
        if syllables[i] == special_tokens['WORD_SEP']:
            pre_syl = syllables[i - 1]
            next_syl = syllables[i + 1]
            if pre_syl[-1] in vowels and next_syl[0] in vowels:
                # Skip the merged-in syllable so overlapping candidates
                # are not double-counted.
                i += 1
                n_synalepha += 1
        i += 1
    # How many merges are needed to bring the last stress to index 9,
    # capped by how many are actually available.
    last_tonedrv_index = get_last_tonedsyl_index(syllables, special_tokens)
    n_synalepha_needed = last_tonedrv_index - 9
    n_synalepha_to_apply = min(n_synalepha_needed, n_synalepha)
    # --- Pass 2: rebuild the list, applying at most that many merges. ---
    result = [syllables[0]]
    i = 1
    n_synalepha_applied = 0
    while i < (len(syllables) - 1):
        if syllables[i] == special_tokens['WORD_SEP']:
            pre_syl = syllables[i - 1]
            next_syl = syllables[i + 1]
            if pre_syl[-1] in vowels and next_syl[0] in vowels:
                if n_synalepha_applied < n_synalepha_to_apply:
                    # Fuse previous syllable + separator + next syllable
                    # into a single entry (the separator text is kept).
                    result.append(result[-1] + syllables[i] + next_syl)
                    del result[-2]
                    n_synalepha_applied += 1
                    i += 1  # the next syllable was consumed by the merge
                else:
                    result.append(syllables[i])
            else:
                result.append(syllables[i])
        else:
            result.append(syllables[i])
        i += 1
    # The loop stops before the final token; append it unchanged.
    result.append(syllables[-1])
    return result
def get_last_tonedsyl_index(syllables, special_tokens):
    """Return the index of the last stressed syllable.

    The index refers to the cleaned list: special tokens (e.g. word
    separators) are removed and each remaining syllable is prettified and
    stripped before the scan. Returns None when no syllable is stressed.
    """
    cleaned = [
        prettify_text(s, special_tokens).strip()
        for s in syllables
        if s not in special_tokens.values()
    ]
    # Scan from the end; the first stressed syllable found is the answer.
    for idx in range(len(cleaned) - 1, -1, -1):
        if is_toned_syl(cleaned[idx]):
            return idx
    # No stressed syllable at all (original returned None implicitly).
    return None
def syllabify_verse(verse, special_tokens, synalepha=True):
    """Split ``verse`` into a flat list of syllables.

    Words that are special tokens stay intact; every other word is split
    by ``syllabify_word`` (which joins syllables with ``#``). When
    ``synalepha`` is true the result is post-processed by
    ``_apply_synalepha``.

    Example result:
    ['nèl', '<word_sep>', 'mèz', 'zo', '<word_sep>', 'dèl', ...,
     'vì', 'ta', '<end_of_verso>']
    """
    # Guard: empty verse.
    if not verse.strip():
        return []
    # Guard: a verse that is itself a special token is atomic.
    if verse in special_tokens.values():
        return [verse]

    syllables = []
    for word in verse.split():
        if word.strip() in special_tokens.values():
            syllables.append(word)
        else:
            syllables.extend(syllabify_word(word).split('#'))

    if synalepha:
        syllables = _apply_synalepha(syllables, special_tokens)
    return syllables
if __name__ == "__main__": # print(special_tokens) with open("divina_commedia_toned_cleaned.txt", "r") as f: divine_comedy = f.read() # divine_comedy = prettify_text(divine_comedy,special_tokens) divine_comedy_list = divine_comedy.split("\n") # removing special tokens lines divine_comedy_list = [ line for line in divine_comedy_list if line.strip() not in special_tokens.values() ] # tone_tagger = ToneTagger() count = 0 for line in divine_comedy_list[:]: syllables = syllabify_verse(line, special_tokens) if not is_hendecasyllable(syllables, special_tokens): count += 1 # print(syllables) syllables = [ syl for syl in syllables if syl != special_tokens['WORD_SEP'] ] syllables = [ syl for syl in syllables if syl != special_tokens['END_OF_VERSO'] ]
def generate_text(model_rhyme, model_verse, special_tokens, vocab_size_rhyme,
                  vocab_size_verse, syl2idx_rhyme, idx2syl_rhyme,
                  syl2idx_verse, idx2syl_verse, seq_length_rhyme,
                  seq_length_verse, start_seq_rhyme, start_seq_verse,
                  temperature=1.0):
    """Generate a canto with two stateful models, one verse at a time.

    Per verse: the *rhyme* model greedily (argmax) produces the verse's
    tail (rhyme) plus any structural special tokens; the *verse* model
    then samples the rest of the verse in reverse order, conditioned on
    the reversed rhyme. Generation stops at END_OF_CANTO.

    Returns a pair ``(text_with_tones, text_without_tones)`` as joined
    syllable strings.

    NOTE(review): `vocab_size_rhyme` / `vocab_size_verse` are never used
    in the body. `start_seq_rhyme` / `start_seq_verse` are mutated via
    `.append` until the first slice rebinds them — callers' lists may be
    partially modified; confirm that is acceptable.
    """
    seq_text_rhyme = start_seq_rhyme
    seq_text_verse = start_seq_verse
    generated_text_list = []
    # Both models are stateful RNNs; clear state before a new canto.
    model_rhyme.reset_states()
    model_verse.reset_states()
    end_of_canto = False
    while not end_of_canto:
        # and generated_text_list.count(special_tokens['END_OF_VERSO']) < 10:
        # and generated_text_list.count(special_tokens['END_OF_TERZINA']) < 45 \
        next_syl_rhyme = ''
        end_verse_list = []   # rhyme-model syllables (verse tail)
        structure_list = []   # structural special tokens (terzina markers etc.)
        # --- Stage 1: rhyme model emits the verse ending, greedily. ---
        while not end_of_canto and next_syl_rhyme != special_tokens[
                'END_OF_VERSO']:
            seq_text_rhyme = seq_text_rhyme[-seq_length_rhyme:]
            sequence_rhyme = [
                syl2idx_rhyme[syl]
                for syl in seq_text_rhyme[-seq_length_rhyme:]
            ]
            sequence_rhyme = tf.keras.preprocessing.sequence.pad_sequences(
                [sequence_rhyme], maxlen=seq_length_rhyme)
            x_rhyme = np.array(sequence_rhyme, dtype='int64')
            prediction_rhyme = model_rhyme.predict(x_rhyme, verbose=0)
            # Keep only the distribution for the last timestep.
            prediction_rhyme = tf.squeeze(prediction_rhyme, 0)[-1]
            # NOTE(review): temperature is applied here but argmax below is
            # scale-invariant, so it has no effect on this branch — confirm.
            prediction_rhyme = prediction_rhyme / temperature
            prediction_rhyme = prediction_rhyme.numpy()
            # index_rhyme = np.random.choice(len(prediction_rhyme), size=1, p=prediction_rhyme)[0]
            index_rhyme = np.argmax(prediction_rhyme)
            next_syl_rhyme = idx2syl_rhyme[index_rhyme]
            seq_text_rhyme.append(next_syl_rhyme)
            # Structural tokens and verse syllables are tracked separately.
            if next_syl_rhyme in special_tokens.values(
            ) and next_syl_rhyme != special_tokens['END_OF_VERSO']:
                structure_list.append(next_syl_rhyme)
            else:
                end_verse_list.append(next_syl_rhyme)
            if next_syl_rhyme == special_tokens['END_OF_CANTO']:
                end_of_canto = True
        generated_text_list += structure_list
        # The verse model consumes the rhyme reversed (it generates the
        # verse back-to-front).
        reverse_rhyme_list = end_verse_list[::-1]
        ## seq_text_verse += structure_list
        seq_text_verse += reverse_rhyme_list
        next_syl_verse = ''
        rest_revese_verse_list = []
        # --- Stage 2: verse model samples the rest of the verse. ---
        while not end_of_canto and next_syl_verse != special_tokens[
                'END_OF_VERSO']:
            seq_text_verse = seq_text_verse[-seq_length_verse:]
            sequence_verse = [
                syl2idx_verse[syl]
                for syl in seq_text_verse[-seq_length_verse:]
            ]
            sequence_verse = tf.keras.preprocessing.sequence.pad_sequences(
                [sequence_verse], maxlen=seq_length_verse)
            x_verse = np.array(sequence_verse, dtype='int64')
            prediction_verse = model_verse.predict(x_verse, verbose=0)
            prediction_verse = tf.squeeze(prediction_verse, 0)[-1]
            # NOTE(review): dividing probabilities (not logits) by a
            # temperature != 1.0 makes them no longer sum to 1, which
            # np.random.choice rejects — verify temperature handling.
            prediction_verse = prediction_verse / temperature
            prediction_verse = prediction_verse.numpy()
            index_verse = np.random.choice(len(prediction_verse),
                                           size=1,
                                           p=prediction_verse)[0]
            next_syl_verse = idx2syl_verse[index_verse]
            if next_syl_verse != special_tokens['END_OF_VERSO']:
                seq_text_verse.append(next_syl_verse)
                rest_revese_verse_list.append(next_syl_verse)
        # Un-reverse the sampled body and glue the rhyme tail back on.
        whole_verse_list = rest_revese_verse_list[::-1] + end_verse_list
        generated_text_list += whole_verse_list
        print(prettify_text(''.join(structure_list), special_tokens),
              end='',
              flush=True)
        # print(prettify_text(''.join(whole_verse_list), special_tokens), end='', flush=True)
        print(prettify_text(
            ''.join(remove_tone(whole_verse_list, special_tokens)),
            special_tokens),
              end='',
              flush=True)
    generated_text_no_tone_list = remove_tone(generated_text_list,
                                              special_tokens)
    return ''.join(generated_text_list), ''.join(generated_text_no_tone_list)