def __init__(self, max_files=1000000, file_lines=FILE_LINES, batch_size=32, shuffle=True):
    'Initialization'
    self.raw_dir = RAW_DIR
    self.cache_dir = CACHE_DIR
    # Collect up to max_files file names from the raw directory
    files = []
    for file in os.listdir(self.raw_dir):
        files.append(file)
        if len(files) >= max_files:
            break
    self.raw_files = [self.raw_dir + '/' + file for file in files]
    self.cache_files = [self.cache_dir + '/' + file + '.pk' for file in files]
    # One index per raw file (equivalent to np.arange)
    self.indexes = list(range(len(self.raw_files)))
    self.file_lines = file_lines
    self.batch_size = batch_size
    self.syll_mgr = syllables.syllables()
    self.n_classes = self.syll_mgr.get_size()
    self.shuffle = shuffle
    self.on_epoch_end()
    self.cmudict = cmudict.CMUDict()
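# A minimal sketch of the on_epoch_end() hook this constructor calls,
# assuming the usual keras.utils.Sequence pattern of reshuffling the
# indexes between epochs; the project's actual method may differ.
import numpy as np

def on_epoch_end(self):
    self.indexes = np.arange(len(self.raw_files))
    if self.shuffle:
        np.random.shuffle(self.indexes)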
def rhyme(sentence):
    target_syllables = syllables.sentence_syllables(sentence)
    tokens = nltk.word_tokenize(sentence)
    rhymes = word_rhyme_candidates(last_word(tokens))
    cs = []
    if len(tokens) == 1:
        return ", ".join(rhymes[:12])
    for r in rhymes:
        cs += candidate_sentences(r)
    # Pair each candidate with its total syllable count, longest first
    syllable_sentences = []
    for candidate in cs:
        ss = sum(syllables.syllables(word) for word in candidate)
        syllable_sentences.append((ss, " ".join(candidate)))
    syllable_sentences.sort(reverse=True)
    if len(syllable_sentences) == 0:
        if len(rhymes) > 0:
            return ", ".join(rhymes[:12])
        else:
            return "month, orange, Nantucket"
    # Keep the candidates whose syllable count is closest to the input's
    syllable_numbers = [n for n, _ in syllable_sentences]
    closest_number = min(syllable_numbers, key=lambda x: abs(x - target_syllables))
    closest_sentences = [s for n, s in syllable_sentences if n == closest_number]
    return random.choice(closest_sentences)
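# Hypothetical usage of rhyme(), assuming word_rhyme_candidates and
# candidate_sentences are backed by the loaded rhyme and corpus data;
# the output is whichever candidate line best matches the syllable count.
print(rhyme("There once was a man from Peru"))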
def findRhymingCore(source, num, numSyb=2):
    sourceNoun = syllables.getWord(source, numSyb, numSyb)
    rhymeNouns = rhyme.word_rhyme_candidates(sourceNoun)
    nounList = [sourceNoun]
    # Keep rhymes with the requested syllable count that also occur in the source text
    suggest = []
    for word in rhymeNouns:
        if syllables.syllables(word) == numSyb:
            if word in source:
                suggest.append(word)
    if len(suggest) < num:
        return []
    # Draw distinct rhymes at random, with an overall budget of maxCount attempts
    maxCount = 100
    count = 1
    for i in range(num - 1):
        testWord = random.choice(suggest)
        while testWord in nounList and count < maxCount:
            testWord = random.choice(suggest)
            count += 1
        if count >= maxCount:
            return []
        nounList.append(testWord)
    return nounList
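# Hypothetical usage of findRhymingCore(), assuming source is a body of
# text to draw rhymes from; 'corpus.txt' is a made-up path. Returns []
# when the text cannot supply enough distinct two-syllable rhymes.
words = findRhymingCore(open('corpus.txt').read(), 3, numSyb=2)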
def get_rhyme(sentence):
    pattern = sentence
    target_syllables = syllables.sentence_syllables(sentence)
    tokens = nltk.word_tokenize(sentence)
    rhymes = word_rhyme_candidates(last_word(tokens))
    candidate_sentence = []
    for r in rhymes:
        candidate_sentence += candidate_sentences(r)
    # Pair each candidate with its total syllable count, longest first
    syllable_sentences = []
    for candidate in candidate_sentence:
        sumOfSyllables = sum(syllables.syllables(word) for word in candidate)
        syllable_sentences.append((sumOfSyllables, " ".join(candidate)))
    syllable_sentences.sort(reverse=True)
    if len(syllable_sentences) == 0:
        if len(rhymes) > 0:
            # get synonyms of rhyme
            return ", ".join(rhymes[:12])
        else:
            return "Oho ho ho ho ho"
    # Keep candidates within one syllable of the closest count
    syllable_numbers = [n for n, sentence in syllable_sentences]
    close_number = min(syllable_numbers, key=lambda x: abs(x - target_syllables))
    close_sentences = [
        sentence for n, sentence in syllable_sentences
        if close_number - 1 <= n <= close_number
    ]
    close_sentences_list = list(set(close_sentences))
    closest_sentences = get_corpus_score(close_sentences_list)
    # story_score = get_sentiment_value(pattern, tension_dictionary)
    story_score = get_corpus_score(pattern)
    # Prefer candidates whose corpus score matches the input's
    rhyme_sentences = [
        sentence for score, sentence in closest_sentences if score == story_score
    ]
    if story_score != 0:
        rhyme_sentences.sort()
    if not rhyme_sentences:
        rhyme_sentences = close_sentences_list
    # Choose once so the printed sentence matches the returned one
    chosen = random.choice(rhyme_sentences)
    print("Possible rhyme sentences:")
    print(rhyme_sentences)
    print("Sentence taken:")
    print(chosen)
    return chosen
def term_match_score(term, lyrics, phrase_comsumption_rate, hparams):
    # Score the term against the lyrics
    # First, convert the term's doubled vowels to macrons (long-vowel marks)
    term = preprocess(term, macron='double_vowel_to_macron')
    lyrics_syl_gen = lyrics_syllable_genrator(lyrics)
    # Convert macrons back to vowels
    term_syls = [
        preprocess(syl, macron='macron_to_vowel') for syl in syllables(term)
    ]
    remaining_phrases = lyrics.split()
    old_pcr = phrase_comsumption_rate
    new_pcr = 1. - old_pcr
    # (score, rate) pairs, one per syllable
    score_rates = []
    for i, syl in enumerate(term_syls):
        try:
            _syl, new_pcr, remaining_phrases, _pos = next(lyrics_syl_gen)
            score_rates.append(syllable_match_score(syl, _syl, i, _pos, hparams))
        except StopIteration:
            # Penalize terms that run past the end of the lyrics
            return -1e+6, ' '.join(remaining_phrases), new_pcr
    remaining_lyrics = ' '.join(remaining_phrases)
    # Compute the score for the whole term
    term_score = 0.
    min_score, syl_rate = 1e+6, 1.
    if score_rates:
        # Sort by rate in descending order
        score_rates = sorted(score_rates, key=lambda x: x[1], reverse=True)
        for _score, _rate in score_rates:
            if _rate > 1.:
                syl_rate *= _rate
            else:
                syl_rate = _rate
            if _score < min_score:
                min_score = _score
        term_score += np.sum([s for s, r in score_rates]) * syl_rate
    else:
        min_score = 0.
    term_score = np.min([term_score, term_score * min_score])
    # Rate derived from the phrase consumption rates
    theta = hparams['theta_pcr']
    pcr_rate = ((float(np.max([old_pcr, new_pcr]) == 1.) * (1. - theta) + theta)
                * (float(old_pcr * new_pcr == 1.) * (1. - theta) + theta))
    length_rate = len(term_syls)  # (currently unused)
    # Final score
    score = term_score * pcr_rate
    return score, remaining_lyrics, new_pcr
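# Hypothetical invocation of term_match_score(); 'theta_pcr' is the only
# hyperparameter this excerpt reads directly, any other keys would be
# whatever syllable_match_score expects. The lyrics string is made up.
hparams = {'theta_pcr': 0.5}
score, remaining, pcr = term_match_score('かぜ', 'かぜ が ふく', 1.0, hparams)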
def __init__(self, dir=DIR, file_lines=FILE_LINES, num_lines=LINES, batch_size=32, shuffle=True):
    'Initialization'
    self.dir = dir
    self.list_IDs = []
    for file in os.listdir(dir):
        self.list_IDs.append(dir + '/' + file)
    # One index per file (equivalent to np.arange)
    self.indexes = list(range(len(self.list_IDs)))
    self.file_lines = file_lines
    self.num_lines = num_lines
    self.batch_size = batch_size
    self.syll_mgr = syllables.syllables()
    self.n_classes = self.syll_mgr.get_size()
    self.shuffle = shuffle
    self.on_epoch_end()
    self.cmudict = cmudict.CMUDict()
def lyrics_syllable_genrator(lyrics):
    # Split the lyrics into phrases (bunsetsu)
    lyrics_phrases = lyrics.split()
    for i, p in enumerate(lyrics_phrases):
        # Convert macrons back to vowels
        syls = [
            preprocess(syl, macron='macron_to_vowel') for syl in syllables(p)
        ]
        for j, syl in enumerate(syls):
            # Remaining syllables of the current phrase
            remaining_syllables = preprocess(''.join(syls[j + 1:]),
                                             macron='double_vowel_to_macron')
            # Remaining phrases
            if remaining_syllables:
                remaining_phrases = [remaining_syllables] + lyrics_phrases[i + 1:]
            else:
                remaining_phrases = lyrics_phrases[i + 1:]
            # Phrase consumption rate
            pcr = (j + 1) / len(syls)
            assert 0. <= pcr <= 1.
            # Position of the syllable within the phrase
            pos = j
            yield syl, pcr, remaining_phrases, pos
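# Hypothetical walk through the generator: print each syllable with how
# much of its phrase has been consumed, what is left of the lyrics, and
# the syllable's position within its phrase. The lyrics string is made up.
for syl, pcr, remaining, pos in lyrics_syllable_genrator('かぜ が ふく'):
    print(syl, pcr, remaining, pos)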
def syllables(self):
    # Lazily compute and cache the syllable split of the original text
    # (Python 2: decode the raw UTF-8 bytes before splitting)
    if not hasattr(self, '_syllables'):
        self._syllables = syllables.syllables(unicode(self.original, 'utf8'))
    return self._syllables
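# The same lazy caching can be written with functools.cached_property
# (Python 3.8+); a sketch under the assumption that self.original is
# already a decoded str there, not the repo's actual code.
from functools import cached_property

class Line:
    def __init__(self, original):
        self.original = original

    @cached_property
    def syllables(self):
        # Computed once on first access, then cached on the instance
        return syllables.syllables(self.original)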
    text_sylls.append(label)
    num_lines = len(text_lines)
    label_array = np.zeros((num_symbols, num_lines, num_syllables), dtype=np.int8)
    for i in range(0, num_lines):
        for j in range(num_symbols):
            label_array[j][i][syll_mgr.get_encoding(text_sylls[i][j])] = 1
    return (text_lines, label_array)

# syllables in descending order of occurrence - 6k in gutenberg.iambic_pentameter, 15k total
# clamp to most common 100 syllables while debugging - use NCE to get all syllables or an interesting number
# 98 + pause + wildcard

# iambic pentameter
num_symbols = 10
#syll_mgr = syllables.syllables(num_syllables)
syll_mgr = syllables.syllables()
num_syllables = syll_mgr.get_size()

(train_text, train_label) = get_data('prepped_data/gutenberg.iambic_pentameter.train', syll_mgr, num_symbols)
num_training = len(train_text)
#train_text = train_text[0:100]
#train_label = train_label[0:100]
(test_text, test_label) = get_data('prepped_data/gutenberg.iambic_pentameter.test', syll_mgr, num_symbols)
#test_text = test_text[0:100]
#test_label = test_label[0:100]
num_testing = len(test_text)
print(len(train_text))
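# Toy illustration of the one-hot label layout built above, with made-up
# sizes: 10 symbol positions, 2 lines, a 5-syllable vocabulary. Entry
# [j][i][k] is 1 when line i carries syllable id k at symbol position j.
import numpy as np
toy = np.zeros((10, 2, 5), dtype=np.int8)
toy[0][0][3] = 1          # line 0's first symbol is syllable id 3
print(toy.shape)          # (10, 2, 5)
print(toy[0][0])          # [0 0 0 1 0]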