def __init__(self,
             max_files=1000000,
             file_lines=FILE_LINES,
             batch_size=32,
             shuffle=True):
    """Initialization: index raw and cached corpus files, set up syllable classes."""
    self.raw_dir = RAW_DIR
    self.cache_dir = CACHE_DIR
    self.raw_files = []
    self.cache_files = []
    files = []
    for file in os.listdir(self.raw_dir):
        files.append(file)
        if len(files) >= max_files:
            break
    for file in files:
        self.raw_files.append(self.raw_dir + '/' + file)
        self.cache_files.append(self.cache_dir + '/' + file + '.pk')
    # np.arange would build this directly; list(range(...)) is the plain-Python form
    self.indexes = list(range(len(self.raw_files)))
    self.file_lines = file_lines
    self.batch_size = batch_size
    self.syll_mgr = syllables.syllables()
    self.n_classes = self.syll_mgr.get_size()
    self.shuffle = shuffle
    self.on_epoch_end()
    self.cmudict = cmudict.CMUDict()
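
A minimal usage sketch for the loader above; the class name RawFileGenerator and the module-level constants (RAW_DIR, CACHE_DIR, FILE_LINES) are assumptions, since only __init__ appears in the snippet:

# Hypothetical usage; RawFileGenerator is an assumed name for the class
# that owns this __init__, and RAW_DIR/CACHE_DIR must point at real dirs.
gen = RawFileGenerator(max_files=10000, batch_size=64)
print(gen.n_classes)       # syllable vocabulary size from syll_mgr
print(len(gen.raw_files))  # number of files indexed (capped at max_files)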
Example 3
def rhyme(sentence):
    target_syllables = syllables.sentence_syllables(sentence)
    tokens = nltk.word_tokenize(sentence)
    rhymes = word_rhyme_candidates(last_word(tokens))
    cs = []

    # For a single-word input, just list the rhyme candidates.
    if len(tokens) == 1:
        return ", ".join(rhymes[:12])

    # Generate candidate sentences ending in each rhyming word.
    for rhyme_word in rhymes:
        cs += candidate_sentences(rhyme_word)

    # Pair each candidate with its total syllable count.
    syllable_sentences = []
    for candidate in cs:
        ss = sum(syllables.syllables(word) for word in candidate)
        syllable_sentences.append((ss, " ".join(candidate)))

    syllable_sentences.sort(reverse=True)

    # No candidates: fall back to raw rhymes, or the classic unrhymables.
    if len(syllable_sentences) == 0:
        if len(rhymes) > 0:
            return ", ".join(rhymes[:12])
        else:
            return "month, orange, Nantucket"

    # Keep the candidates whose syllable count best matches the input.
    syllable_numbers = [n for n, _ in syllable_sentences]
    closest_number = min(syllable_numbers,
                         key=lambda x: abs(x - target_syllables))

    closest_sentences = [
        s for n, s in syllable_sentences if n == closest_number
    ]

    return random.choice(closest_sentences)
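
A hedged usage sketch, assuming the project-local helpers (word_rhyme_candidates, candidate_sentences, last_word) and the syllables module are importable, and that NLTK's punkt tokenizer data is installed:

# Hypothetical usage; all helpers used by rhyme() are project-local assumptions.
import random
import nltk
nltk.download('punkt', quiet=True)  # word_tokenize needs the punkt data

print(rhyme("orange"))         # single token: up to 12 raw rhyme candidates
print(rhyme("roses are red"))  # a random candidate sentence rhyming with "red"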
Example 4
def findRhymingCore(source, num, numSyb=2):
    # Pick a numSyb-syllable word from the source text as the rhyme core.
    sourceNoun = syllables.getWord(source, numSyb, numSyb)
    rhymeNouns = rhyme.word_rhyme_candidates(sourceNoun)

    nounList = [sourceNoun]

    # Keep rhyme candidates with the right syllable count that also occur
    # in the source (note: `word in source` is a substring test).
    suggest = []
    for word in rhymeNouns:
        if syllables.syllables(word) == numSyb and word in source:
            suggest.append(word)

    if len(suggest) < num:
        return []

    # Draw num-1 further distinct words, giving up after maxCount retries.
    maxCount = 100
    count = 1
    for i in range(num - 1):
        testWord = random.choice(suggest)
        while testWord in nounList and count < maxCount:
            testWord = random.choice(suggest)
            count += 1
        if count >= maxCount:
            return []
        nounList.append(testWord)

    return nounList
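
A sketch of calling findRhymingCore, assuming syllables.getWord and rhyme.word_rhyme_candidates behave as the code above expects; the input string is invented for illustration:

# Hypothetical usage; returns [] when fewer than `num` distinct
# numSyb-syllable rhymes occur in the source string.
core = findRhymingCore("the pocket held a locket and a rocket", num=3)
print(core)  # e.g. ['pocket', 'locket', 'rocket'] on a lucky draw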
Example 5
def get_rhyme(sentence):
    pattern = sentence
    target_syllables = syllables.sentence_syllables(sentence)
    tokens = nltk.word_tokenize(sentence)
    rhymes = word_rhyme_candidates(last_word(tokens))

    # Generate candidate sentences ending in each rhyming word.
    candidates = []
    for rhyme in rhymes:
        candidates += candidate_sentences(rhyme)

    # Pair each candidate with its total syllable count.
    syllable_sentences = []
    for candidate in candidates:
        sumOfSyllables = sum(syllables.syllables(word) for word in candidate)
        syllable_sentences.append((sumOfSyllables, " ".join(candidate)))
    syllable_sentences.sort(reverse=True)

    if len(syllable_sentences) == 0:
        if len(rhymes) > 0:
            # TODO: get synonyms of the rhyme instead of the raw candidates
            return ", ".join(rhymes[:12])
        else:
            return "Oho ho ho ho ho"

    # Keep candidates within one syllable of the best-matching count.
    syllable_numbers = [n for n, _ in syllable_sentences]
    close_number = min(syllable_numbers,
                       key=lambda x: abs(x - target_syllables))
    close_sentences = [
        s for n, s in syllable_sentences
        if close_number - 1 <= n <= close_number
    ]
    close_sentences_list = list(set(close_sentences))

    # Score candidates against the corpus and match the input's score.
    closest_sentences = get_corpus_score(close_sentences_list)
    # story_score = get_sentiment_value(pattern, tension_dictionary)
    story_score = get_corpus_score(pattern)
    rhyme_sentences = [
        s for score, s in closest_sentences if score == story_score
    ]
    if story_score != 0:
        rhyme_sentences.sort()

    if not rhyme_sentences:
        rhyme_sentences = close_sentences_list
    chosen = random.choice(rhyme_sentences)
    print("Possible rhyme sentences:")
    print(rhyme_sentences)
    print("Sentence taken:")
    print(chosen)
    return chosen
Example 6
def term_match_score(term, lyrics, phrase_consumption_rate, hparams):
    # Score a term against the lyrics.
    # First, convert the term's doubled vowels to macrons.
    term = preprocess(term, macron='double_vowel_to_macron')
    lyrics_syl_gen = lyrics_syllable_generator(lyrics)
    # Convert macrons back to vowels.
    term_syls = [
        preprocess(syl, macron='macron_to_vowel') for syl in syllables(term)
    ]
    remaining_phrases = lyrics.split()
    old_pcr = phrase_consumption_rate
    new_pcr = 1. - old_pcr
    # (score, rate) pairs, one per syllable
    score_rates = []
    for i, syl in enumerate(term_syls):
        try:
            _syl, new_pcr, remaining_phrases, _pos = next(lyrics_syl_gen)
            score_rates.append(
                syllable_match_score(syl, _syl, i, _pos, hparams))
        except StopIteration:
            # Penalize terms that run past the end of the lyrics.
            return -1e+6, ' '.join(remaining_phrases), new_pcr
    remaining_lyrics = ' '.join(remaining_phrases)
    # Compute the score for the term.
    term_score = 0.
    min_score, syl_rate = 1e+6, 1.
    if score_rates:
        # Sort by syl_rate in descending order.
        score_rates = sorted(score_rates, key=lambda x: x[1], reverse=True)
        for _score, _rate in score_rates:
            if _rate > 1.:
                syl_rate *= _rate
            else:
                syl_rate = _rate
            if _score < min_score:
                min_score = _score
        term_score += np.sum([s for s, r in score_rates]) * syl_rate
    else:
        min_score = 0.
    term_score = np.min([term_score, term_score * min_score])
    # Rate derived from the phrase consumption rate (pcr).
    theta = hparams['theta_pcr']
    pcr_rate = ((float(np.max([old_pcr, new_pcr]) == 1.) * (1. - theta) + theta) *
                (float(old_pcr * new_pcr == 1.) * (1. - theta) + theta))
    length_rate = len(term_syls)  # currently unused
    # Final score
    score = term_score * pcr_rate
    return score, remaining_lyrics, new_pcr
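
The pcr_rate expression above collapses to three values: each parenthesized factor is 1.0 when its boundary condition holds and theta otherwise, so the product is 1.0 (the term starts and ends on phrase boundaries), theta (one boundary), or theta squared (neither). A minimal standalone check of that logic, with an arbitrary theta:

# Standalone check of the pcr_rate logic above; the theta value is arbitrary.
theta = 0.5

def pcr_rate(old_pcr, new_pcr):
    return ((float(max(old_pcr, new_pcr) == 1.) * (1. - theta) + theta) *
            (float(old_pcr * new_pcr == 1.) * (1. - theta) + theta))

assert pcr_rate(1.0, 1.0) == 1.0         # both ends on a phrase boundary
assert pcr_rate(1.0, 0.5) == theta       # only one end on a boundary
assert pcr_rate(0.3, 0.5) == theta ** 2  # mid-phrase at both ends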
Example 7
def __init__(self,
             dir=DIR,
             file_lines=FILE_LINES,
             num_lines=LINES,
             batch_size=32,
             shuffle=True):
    """Initialization: index corpus files and set up syllable classes."""
    self.dir = dir
    self.list_IDs = []
    for file in os.listdir(dir):
        self.list_IDs.append(dir + '/' + file)
    self.indexes = list(range(len(self.list_IDs)))
    self.file_lines = file_lines
    self.num_lines = num_lines
    self.batch_size = batch_size
    self.syll_mgr = syllables.syllables()
    self.n_classes = self.syll_mgr.get_size()
    self.shuffle = shuffle
    self.on_epoch_end()
    self.cmudict = cmudict.CMUDict()
Example 8
def lyrics_syllable_generator(lyrics):
    # Split the lyrics into phrases (bunsetsu).
    lyrics_phrases = lyrics.split()
    for i, p in enumerate(lyrics_phrases):
        # Convert macrons back to vowels.
        syls = [
            preprocess(syl, macron='macron_to_vowel') for syl in syllables(p)
        ]
        for j, syl in enumerate(syls):
            # Remaining syllables in the current phrase.
            remaining_syllables = preprocess(''.join(syls[j + 1:]),
                                             macron='double_vowel_to_macron')
            # Remaining phrases.
            if remaining_syllables:
                remaining_phrases = [remaining_syllables] + lyrics_phrases[i + 1:]
            else:
                remaining_phrases = lyrics_phrases[i + 1:]
            # Phrase consumption rate.
            pcr = (j + 1) / len(syls)
            assert 0. <= pcr <= 1.
            # Position of the syllable within the phrase.
            pos = j
            yield syl, pcr, remaining_phrases, pos
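
A short driver for the generator, assuming preprocess and syllables handle romanized Japanese as in the code above; the lyric string is invented:

# Hypothetical usage; `preprocess` and `syllables` are project-local.
for syl, pcr, remaining, pos in lyrics_syllable_generator("sakura ga saku"):
    # One syllable at a time, with its phrase-consumption rate, the
    # still-unconsumed phrases, and its position within the phrase.
    print(syl, round(pcr, 2), remaining, pos)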
Example 9
def syllables(self):
    # Memoize: decode the UTF-8 bytes once and cache the syllable list.
    if not hasattr(self, '_syllables'):
        self._syllables = syllables.syllables(self.original.decode('utf8'))
    return self._syllables
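
The hasattr-based memoization above is what functools.cached_property (Python 3.8+) provides out of the box; a sketch of the equivalent, with a hypothetical host class since the original class is not shown:

from functools import cached_property
import syllables  # project-local module, assumed importable

class Line:  # hypothetical host class
    def __init__(self, original):
        self.original = original  # assumed UTF-8 bytes, as above

    @cached_property
    def syllables(self):
        # Computed on first access, then cached on the instance.
        return syllables.syllables(self.original.decode('utf8'))

Note that call sites change from obj.syllables() to plain attribute access, obj.syllables.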
Example 10
        text_sylls.append(label)
    num_lines = len(text_lines)
    # One-hot encode syllable j of line i into (num_symbols, num_lines, num_syllables).
    label_array = np.zeros((num_symbols, num_lines, num_syllables), dtype=np.int8)
    for i in range(num_lines):
        for j in range(num_symbols):
            label_array[j][i][syll_mgr.get_encoding(text_sylls[i][j])] = 1

    return (text_lines, label_array)
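
The triple-indexed loop above can also be written as one vectorized NumPy assignment; a behavior-preserving sketch, assuming text_sylls[i][j] and syll_mgr.get_encoding as used above:

# Vectorized one-hot fill equivalent to the nested loop above.
import numpy as np
enc = np.array([[syll_mgr.get_encoding(text_sylls[i][j])
                 for i in range(num_lines)]
                for j in range(num_symbols)])  # shape (num_symbols, num_lines)
label_array = np.zeros((num_symbols, num_lines, num_syllables), dtype=np.int8)
label_array[np.arange(num_symbols)[:, None],
            np.arange(num_lines)[None, :],
            enc] = 1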

# Syllables in descending order of occurrence: ~6k in gutenberg.iambic_pentameter, ~15k total.
# Clamp to the most common 100 syllables while debugging; use NCE to cover
# all syllables (or some other interesting subset).
# 98 + pause + wildcard
# iambic pentameter
num_symbols = 10
#syll_mgr = syllables.syllables(num_syllables)
syll_mgr = syllables.syllables()
num_syllables = syll_mgr.get_size()



(train_text, train_label) = get_data('prepped_data/gutenberg.iambic_pentameter.train', syll_mgr, num_symbols)
num_training = len(train_text)
#train_text = train_text[0:100]
#train_label = train_label[0:100]

(test_text, test_label) = get_data('prepped_data/gutenberg.iambic_pentameter.test', syll_mgr, num_symbols)
#test_text = test_text[0:100]
#test_label = test_label[0:100]

num_testing = len(test_text)
print(len(train_text))