def generate(funcutils, generator, keywords):

    assert NUM_OF_SENTENCES == len(keywords)
    pron_dict = PronDict()
    saver = tf.train.Saver()
    context = start_of_sentence()
    with tf.Session() as session:
        flag_trained = generator.initialize_session(session, saver)
        if not flag_trained:
            print("Please train the model first! (./train.py -g)")
            sys.exit(1)
        for keyword in keywords:
            # Why [keyword] * _BATCH_SIZE? Every row of keyword_data is identical.
            keyword_data, keyword_length = funcutils.fill_np_matrix(
                [keyword] * _BATCH_SIZE)
            context_data, context_length = funcutils.fill_np_matrix(
                [context] * _BATCH_SIZE)
            char = start_of_sentence()
            for _ in range(7):
                decoder_input, decoder_input_length = \
                    funcutils.fill_np_matrix([char])
                encoder_feed_dict = {
                    generator.keywords: keyword_data,
                    generator.length_keywords: keyword_length,
                    generator.context: context_data,
                    generator.context_length: context_length,
                    generator.sequence_decoder: decoder_input,
                    generator.length_decoder: decoder_input_length
                }
                if char != start_of_sentence():
                    encoder_feed_dict[generator.initial_decode_state] = state
                probs, state = session.run(
                    [generator.logits, generator.decoder_final_state],
                    feed_dict=encoder_feed_dict)
                prob_list = _gen_prob_list(funcutils, probs, context, pron_dict)
                id_probmax = np.argmax(prob_list, axis=0)
                char = funcutils.char_dict.id2char(id_probmax)
                # Alternative: sample a character from the cumulative
                # distribution instead of taking the argmax.
                # prob_sums = np.cumsum(prob_list)
                # rand_val = prob_sums[-1] * random()
                # for i, prob_sum in enumerate(prob_sums):
                #     if rand_val < prob_sum:
                #         char = funcutils.char_dict.int2char(i)
                #         break
                context += char
            context += end_of_sentence()
    return context[1:].split(end_of_sentence())
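
All of the snippets on this page call a padding helper (fill_np_matrix / _fill_np_matrix) that is never shown. Below is a minimal sketch of what such a helper could look like; the char2int lookup and the pad_id default are assumptions for illustration, not the project's actual implementation.

import numpy as np

def fill_np_matrix_sketch(texts, char2int, pad_id=0):
    # Hypothetical padding helper: turn a batch of character strings into a
    # fixed-width id matrix plus the true length of every row.
    max_len = max(len(text) for text in texts)
    matrix = np.full((len(texts), max_len), pad_id, dtype=np.int32)
    lengths = np.zeros(len(texts), dtype=np.int32)
    for row, text in enumerate(texts):
        for col, ch in enumerate(text):
            matrix[row, col] = char2int(ch)
        lengths[row] = len(text)
    return matrix, lengths
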
Example #2
    def _train_a_batch(self, keywords, contexts, sentences):
        keyword_data, keyword_length = self._fill_np_matrix(keywords)
        context_data, context_length = self._fill_np_matrix(contexts)
        decoder_input, decoder_input_length = self._fill_np_matrix(
            [start_of_sentence() + sentence[:-1] for sentence in sentences])
        targets = self._fill_targets(sentences)

        # sentences comes from data_utils --> (sentence, keyword, context), e.g.:
        #澄潭皎镜石崔巍$ 石   ^
        #万壑千岩暗绿苔$	暗	^澄潭皎镜石崔巍$

        with tf.GradientTape() as tape:
            encoder_output = self.encoder(keyword_data, context_data)
            probs, logits, decoder_output = self.decoder(
                encoder_output, decoder_input, decoder_input_length)
            loss = self.loss_func(targets, logits, probs)

            learning_rate = self.learning_rate_func(loss)
            # Note: a fresh Adam instance is created for every batch here, so
            # its moment estimates are not carried across batches.
            optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

            print(" loss =  %f, learning_rate = %f" % (loss, learning_rate))

        variables = self.encoder.trainable_variables + self.decoder.trainable_variables
        gradients = tape.gradient(loss, variables)
        optimizer.apply_gradients(zip(gradients, variables))
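
The loss_func used above is not part of this snippet. A minimal masked sequence cross-entropy sketch is shown below, assuming integer targets padded with id 0 and logits of shape [batch, time, vocab]; the real loss may additionally use probs.

import tensorflow as tf

def sequence_loss_sketch(targets, logits, pad_id=0):
    # Hypothetical masked cross-entropy: ignore padded positions and average
    # the per-character loss over the real characters only.
    mask = tf.cast(tf.not_equal(targets, pad_id), tf.float32)
    per_char = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=targets, logits=logits)
    return tf.reduce_sum(per_char * mask) / tf.reduce_sum(mask)
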
 def generate(self, keywords):
     if NUM_OF_SENTENCES < len(keywords):
         keywords = keywords[:NUM_OF_SENTENCES]
     pron_dict = PronDict()
     context = start_of_sentence()
     with tf.Session() as session:
         self._initialize_session(session)
         if not self.trained:
             print("Please train the model first! (./train.py -g)")
             sys.exit(1)
         for keyword in keywords:
             keyword_data, keyword_length = self._fill_np_matrix(
                 [keyword] * _BATCH_SIZE)
             context_data, context_length = self._fill_np_matrix(
                 [context] * _BATCH_SIZE)
             char = start_of_sentence()
             for _ in range(7):
                 decoder_input, decoder_input_length = \
                         self._fill_np_matrix([char])
                 encoder_feed_dict = {
                     self.keyword: keyword_data,
                     self.keyword_length: keyword_length,
                     self.context: context_data,
                     self.context_length: context_length,
                     self.decoder_inputs: decoder_input,
                     self.decoder_input_length: decoder_input_length
                 }
                 if char != start_of_sentence():
                     encoder_feed_dict[self.decoder_init_state] = state
                 probs, state = session.run(
                     [self.probs, self.decoder_final_state],
                     feed_dict=encoder_feed_dict)
                 prob_list = self._gen_prob_list(probs, context, pron_dict)
                 prob_sums = np.cumsum(prob_list)
                 rand_val = prob_sums[-1] * random()
                 for i, prob_sum in enumerate(prob_sums):
                     if rand_val < prob_sum:
                         char = self.char_dict.int2char(i)
                         break
                 context += char
             context += end_of_sentence()
     return context[1:].split(end_of_sentence())
    def generate(self, keywords):
        if not tf.train.get_checkpoint_state(save_dir):
            print("Please train the model first! (./train.py -g)")
            sys.exit(1)

        self.checkpoint.restore(self.manager.latest_checkpoint)
        print("Checkpoint is loaded successfully !")
        assert NUM_OF_SENTENCES == len(keywords)
        context = start_of_sentence()
        pron_dict = PronDict()
        for keyword in keywords:
            keyword_data, keyword_length = self._fill_np_matrix([keyword] *
                                                                _BATCH_SIZE)
            context_data, context_length = self._fill_np_matrix([context] *
                                                                _BATCH_SIZE)

            keyword_state, context_output, final_output, final_state, context_state = self.encoder(
                keyword_data, context_data)
            char = start_of_sentence()
            for _ in range(7):
                decoder_input, decoder_input_length = \
                    self._fill_np_matrix([char])
                if char != start_of_sentence():
                    keyword_state = final_state
                probs, final_state, _ = self.decoder(keyword_state,
                                                     context_output,
                                                     decoder_input,
                                                     decoder_input_length,
                                                     final_output, final_state,
                                                     context_state)
                prob_list = self._gen_prob_list(probs, context, pron_dict)
                prob_sums = np.cumsum(prob_list)
                rand_val = prob_sums[-1] * random()
                for i, prob_sum in enumerate(prob_sums):
                    if rand_val < prob_sum:
                        char = self.char_dict.int2char(i)
                        break
                context += char
            context += end_of_sentence()
        return context[1:].split(end_of_sentence())
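
A minimal driver for either generate variant might look like the sketch below; the Generator class name and the keyword strings are placeholders, since this page only shows individual methods.

# Hypothetical usage; Generator and the keyword list are placeholders.
if __name__ == '__main__':
    generator = Generator()
    for line in generator.generate(['春', '夜', '山', '月']):
        print(line)
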
Example #5
def gen_train_data():
    """获取每一句的keywords,拼起来写入文件"""
    print("Generating training data ...")
    segmenter = Segmenter()
    poems = Poems()
    ranked_words = RankedWords()

    gen_data = list()
    plan_data = list()

    counter_line = 0
    print('len(poems)==>', len(poems))
    for poem in poems:
        if len(poem) != 4:
            # Only consider quatrains (four sentences per poem).
            continue
        valid = True
        context = start_of_sentence()
        keywords = list()
        gen_lines = list()
        for sentence in poem:
            counter_line += 1
            keyword = ''
            if len(sentence) != 7:
                valid = False
                break
            filterwords = list(
                filter(lambda x: x in ranked_words,
                       segmenter.segment(sentence)))
            if filterwords:
                keyword = filterwords[0]
            for word in filterwords:
                if ranked_words.get_rank(word) < ranked_words.get_rank(
                        keyword):
                    keyword = word
            if keyword:
                gen_line = sentence + end_of_sentence() + \
                           '\t' + keyword + '\t' + context + '\n'
                keywords.append(keyword)
                gen_lines.append(gen_line)
                context += sentence + end_of_sentence()
        if valid:
            plan_data.append(' '.join(keywords))
            gen_data.extend(gen_lines)
    with open(plan_data_path, 'w') as fw:
        for data_iter in plan_data:
            fw.write(data_iter + '\n')
    with open(gen_data_path, 'w') as fw:
        for data_iter in gen_data:
            fw.write(data_iter)

    print('counter_line==>', counter_line)
    del segmenter, poems, ranked_words
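
Judging from the data-format comments in the training snippets (sentence + '$', then the keyword, then the '^'-prefixed context, separated by tabs), one gen_data line could be read back as in this sketch; the function name is a placeholder.

def parse_gen_line_sketch(line):
    # Hypothetical reader for one training line of the form
    #   "<sentence>$\t<keyword>\t<context>\n"
    sentence, keyword, context = line.rstrip('\n').split('\t')
    return sentence, keyword, context
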
Example #6
def gen_train_data():
    print("Generating training data ...")
    segmenter = Segmenter()
    poems = Poems()
    poems.shuffle()
    ranked_words = RankedWords()
    plan_data = []
    gen_data = []
    for poem in poems:
        if len(poem) != 4:
            continue  # Only consider quatrains.
        valid = True
        context = start_of_sentence()
        gen_lines = []
        keywords = []
        for sentence in poem:
            if len(sentence) != 7:
                # Only consider seven-character lines.
                valid = False
                break
            # Get the words of this sentence that also appear in the ranked
            # word list; all other words are ignored.
            words = list(
                filter(lambda seg: seg in ranked_words,
                       segmenter.segment(sentence)))
            if len(words) == 0:
                valid = False
                break
            keyword = words[0]

            # from all words in this sentence, get the word with highest text_rank score
            for word in words[1:]:
                if ranked_words.get_rank(word) < ranked_words.get_rank(
                        keyword):
                    keyword = word

            gen_line = sentence + end_of_sentence() + \
                       '\t' + keyword + '\t' + context + '\n'
            gen_lines.append(gen_line)
            keywords.append(keyword)
            context += sentence + end_of_sentence()
        if valid:
            # plan data: each line is four keywords from the 4 sentences
            plan_data.append('\t'.join(keywords) + '\n')
            gen_data.extend(gen_lines)
    with open(plan_data_path, 'w') as fout:
        for line in plan_data:
            fout.write(line)
    with open(gen_data_path, 'w') as fout:
        for line in gen_data:
            fout.write(line)
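
Each plan_data line written above holds the four keywords of one quatrain joined by tabs. A small loader sketch under that assumption (the function name is a placeholder):

def read_plan_data_sketch(path):
    # Hypothetical loader: one quatrain per line, four tab-separated keywords.
    with open(path) as fin:
        return [line.strip().split('\t') for line in fin if line.strip()]
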
Example #7
def gen_train_data():
    print("Generating training data ...")
    segmenter = Segmenter()
    poems = Poems()
    poems.shuffle()
    ranked_words = RankedWords()
    plan_data = []
    gen_data = []
    for poem in poems:
        # Only process quatrains made of four seven-character lines.
        if len(poem) != 4:
            continue
        valid = True
        context = start_of_sentence()
        gen_lines = []
        keywords = []
        for sentence in poem:
            if len(sentence) != 7:
                valid = False
                break
            words = list(
                filter(lambda seg: seg in ranked_words,
                       segmenter.segment(sentence)))
            if len(words) == 0:
                valid = False
                break
            keyword = words[0]
            for word in words[1:]:
                if ranked_words.get_rank(word) < ranked_words.get_rank(
                        keyword):
                    keyword = word
            gen_line = sentence + end_of_sentence() + \
                    '\t' + keyword + '\t' + context + '\n'
            gen_lines.append(gen_line)
            keywords.append(keyword)
            context += sentence + end_of_sentence()
        if valid:
            plan_data.append('\t'.join(keywords) + '\n')
            gen_data.extend(gen_lines)
    with open(plan_data_path, 'w') as fout:
        for line in plan_data:
            fout.write(line)
    with open(gen_data_path, 'w') as fout:
        for line in gen_data:
            fout.write(line)
 def _train_a_batch(self, session, epoch, keywords, contexts, sentences):
     keyword_data, keyword_length = self._fill_np_matrix(keywords)
     context_data, context_length = self._fill_np_matrix(contexts)
     decoder_inputs, decoder_input_length  = self._fill_np_matrix(
             [start_of_sentence() + sentence[:-1] \
                     for sentence in sentences])
     targets = self._fill_targets(sentences)
     feed_dict = {
         self.keyword: keyword_data,
         self.keyword_length: keyword_length,
         self.context: context_data,
         self.context_length: context_length,
         self.decoder_inputs: decoder_inputs,
         self.decoder_input_length: decoder_input_length,
         self.targets: targets
     }
     loss, learning_rate, _ = session.run(
         [self.loss, self.learning_rate, self.opt_step],
         feed_dict=feed_dict)
     print(" loss =  %f, learning_rate = %f" % (loss, learning_rate))
Example #9
 def _compute_prob_list(self, char, keyword_data, keyword_length, context_data,
                        context_length, current_context, state, session, pron_dict):
     decoder_input, decoder_input_length = \
         self._fill_np_matrix([char])
     encoder_feed_dict = {
         self.keyword: keyword_data,
         self.keyword_length: keyword_length,
         self.context: context_data,
         self.context_length: context_length,
         self.decoder_inputs: decoder_input,
         self.decoder_input_length: decoder_input_length
     }
     if char != start_of_sentence():
         encoder_feed_dict[self.decoder_init_state] = state
     probs, state = session.run(
         [self.probs, self.decoder_final_state],
         feed_dict=encoder_feed_dict)
     prob_list = self._gen_prob_list(probs, current_context, pron_dict)
     return prob_list, state
Example #10
 def _train_a_batch(self, session, epoch, keywords, contexts, sentences):
     # padding
     keyword_data, keyword_length = self._fill_np_matrix(keywords)
     context_data, context_length = self._fill_np_matrix(contexts)
     decoder_inputs, decoder_input_length  = self._fill_np_matrix(
             [start_of_sentence() + sentence[:-1] \
                     for sentence in sentences])
     targets = self._fill_targets(sentences)
     # Assign values to all of the placeholders.
     feed_dict = {
         self.keyword: keyword_data,
         self.keyword_length: keyword_length,
         self.context: context_data,
         self.context_length: context_length,
         self.decoder_inputs: decoder_inputs,
         self.decoder_input_length: decoder_input_length,
         self.targets: targets
     }
     loss, learning_rate, _ = session.run(
         [self.loss, self.learning_rate, self.opt_step],
         feed_dict=feed_dict)
     print(" loss =  %f, learning_rate = %f" % (loss, learning_rate))
     with open('save/loss.log', 'a+') as file:
         file.write("{}: {}\n".format(epoch, loss))
    def _train_a_batch(self, keywords, contexts, sentences):
        keyword_data, keyword_length = self._fill_np_matrix(keywords)
        context_data, context_length = self._fill_np_matrix(contexts)
        decoder_input, decoder_input_length = self._fill_np_matrix(
            [start_of_sentence() + sentence[:-1] for sentence in sentences])
        targets = self._fill_targets(sentences)

        # sentences comes from data_utils --> (sentence, keyword, context), e.g.:
        # 澄潭皎镜石崔巍$ 石   ^
        # 万壑千岩暗绿苔$	暗	^澄潭皎镜石崔巍$

        with tf.GradientTape() as tape:
            keyword_state, context_output, final_output, final_state, context_state = self.encoder(
                keyword_data, context_data)
            probs, final_state, logits = self.decoder(
                keyword_state, context_output, decoder_input,
                decoder_input_length, final_output, final_state, context_state)
            loss = self.loss_func(targets, logits)
            print(" loss =  %f" % loss)

        variables = self.encoder.trainable_variables + self.decoder.trainable_variables
        gradients = tape.gradient(loss, variables)
        self.optimizer.apply_gradients(zip(gradients, variables))
Example #12
    def generate(self, keywords):
        assert NUM_OF_SENTENCES + 1 == len(keywords)
        pron_dict = PronDict()
        context = start_of_sentence()
        with tf.Session() as session:
            self._initialize_session(session)
            if not self.trained:
                print("Please train the model first! (./train.py -g)")
                sys.exit(1)
            # Iterate over the keywords, i.e. over the four sentences.

            # Pop a random keyword as a hint for the first sentence, so that
            # the same keywords do not always produce the same poem.
            hint = keywords.pop(randrange(len(keywords)))

            first_line = True
            for keyword in keywords:
                if first_line:
                    context += hint
                    first_line = False

                keyword_data, keyword_length = self._fill_np_matrix(
                        [keyword] * _BATCH_SIZE)
                context_data, context_length = self._fill_np_matrix(
                        [context] * _BATCH_SIZE)
                char = start_of_sentence()

                word_count = 0
                state = ''
                while word_count < 7:
                    prob_list, state = self._compute_prob_list(
                        char, keyword_data, keyword_length, context_data,
                        context_length, context, state, session, pron_dict)
                    
                    # Randomly sample BEAM_SIZE candidate characters and keep
                    # the most probable one, so the same keywords can still
                    # produce different poems.
                    if word_count == 0:
                        prob_sums = np.cumsum(prob_list)
                        # Arrays holding the candidate first characters and
                        # their scores.
                        char_array = []
                        score_array = []
                        for i in range(BEAM_SIZE):
                            char_array.append('')
                            score_array.append(1)

                        for i in range(BEAM_SIZE):
                            rand_val = prob_sums[-1] * random()
                            for j, prob_sum in enumerate(prob_sums):
                                if rand_val < prob_sum:
                                    char_array[i] = self.char_dict.int2char(j)
                                    score_array[i] *= -math.log(prob_list[j])
                                    break
                        # Scores are negative log probabilities, so the smallest score is best.
                        min_value = 1000
                        min_index = 0
                        for k in range(len(score_array)):
                            if score_array[k] < min_value:
                                min_index = k
                                min_value = score_array[k]
                        char = char_array[min_index]
                        
                        # Deterministic alternative: always take the argmax,
                        # which generates the same poem for the same keywords.
                        '''
                        max_value = prob_list[0]
                        max_index = 0
                        for k in range(len(prob_list)):
                            if prob_list[k] > max_value:
                                max_index = k
                                max_value = prob_list[k]
                        char = self.char_dict.int2char(max_index)
                        '''
                        context += char
                        word_count += 1
                        # end of first word

                    else:
                        # perform beam search for two chars
                        char_array = []
                        second_char_array = []
                        score_array = []

                        for i in range(BEAM_SIZE):
                            char_array.append('')
                            second_char_array.append('')
                            score_array.append(1)
                        

                        # choose the BEAM_SIZE most possible choices
                        for i in range(BEAM_SIZE):
                            char_array[i], score, used_index = self._return_n_most_likely(prob_list,i+1)
                            score_array[i] *= score
                            # make sure that the same thing is not selected again
                            prob_list[used_index] = 0


                        # choose the most possible choice based on the current choice
                        for i in range(BEAM_SIZE):
                            current_context = context + char_array[i]
                            prob_list, state = self._compute_prob_list(char_array[i],keyword_data,keyword_length,\
                                context_data,context_length,current_context,state,session,pron_dict)
                            second_char_array[i], score, used_index = self._return_n_most_likely(prob_list,1)
                            # randomly sample second array and make sure it does not repeat
                            # random_sample = second_char_array[randrange(len(second_char_array))]
                            random_sample = second_char_array[i]
                            used_chars = set(ch for ch in context)

                            tmp = 2

                            while random_sample == char_array[i] or random_sample in used_chars:
                                second_char_array[i], score, used_index = self._return_n_most_likely(prob_list,tmp)
                                random_sample = second_char_array[i]
                                tmp += 1
                            score_array[i] *= score

                        # because we took the negative log the minimum score is the best
                        min_value = 1000
                        min_index = 0
                        for i in range(len(score_array)):
                            if score_array[i] < min_value:
                                min_index = i
                                min_value = score_array[i]
                        
                        # adjust so that we prevent using the same character again and again
                        used_chars = set(ch for ch in context)
                        first_char = char_array[min_index]
                        in_loop = 0
                        
                        while first_char in used_chars and in_loop < len(char_array):
                            score_array[min_index] = 1000
                            min_value = 1000
                            for i in range(len(score_array)):
                                # find the minimum in the remaining
                                if score_array[i] < min_value:
                                    min_index = i
                                    min_value = score_array[i]
                            first_char = char_array[min_index]
                            in_loop += 1

                        first_char = char_array[min_index]
                        second_char = second_char_array[min_index]

                        context += first_char
                        context += second_char
                        char = second_char
                        word_count += 2
                # append the <END> label
                context += end_of_sentence()
            # remove the extra hint
            context = context[0] + context[len(hint) + 1:]
        return context[1:].split(end_of_sentence())
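
The beam-like search above relies on _return_n_most_likely(prob_list, n), which is not shown. From the call sites it returns the n-th most likely character, its negative log probability, and its index; below is a minimal sketch under those assumptions (int2char stands in for self.char_dict.int2char).

import math
import numpy as np

def return_n_most_likely_sketch(prob_list, n, int2char):
    # Hypothetical helper: pick the n-th most likely character (1-based) and
    # return it together with its -log(prob) score and its index.
    order = np.argsort(prob_list)[::-1]
    index = int(order[n - 1])
    return int2char(index), -math.log(prob_list[index]), index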