def _fill_np_matrix(self, texts, keyword=True):
    max_time = max(map(len, texts))
    if keyword:
        # One keyword each, 64 keywords in total for 16 poems.
        matrix = np.zeros([_BATCH_SIZE, max_time, CHAR_VEC_DIM],
                          dtype=np.float32)
        # Pre-fill every slot with the <END> vector, then overwrite the
        # prefix of each row with the actual character vectors.
        for i in range(_BATCH_SIZE):
            for j in range(max_time):
                matrix[i, j, :] = self.planner.get_vect(end_of_sentence())
        for i, text in enumerate(texts):
            matrix[i, :len(text)] = self.planner.get_vects(text)
    else:
        matrix = np.zeros([_BATCH_SIZE, max_time, CHARPIN_VEC_DIM],
                          dtype=np.float32)
        for i in range(_BATCH_SIZE):
            for j in range(max_time):
                matrix[i, j, :] = self.char2vec.get_vect(end_of_sentence())
        for i, text in enumerate(texts):
            matrix[i, :len(text)] = self.char2vec.get_vects(text)
    # Unused batch slots get sequence length 0.
    seq_length = [len(texts[i]) if i < len(texts) else 0
                  for i in range(_BATCH_SIZE)]
    return matrix, seq_length
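
# The padding scheme above is easiest to see on toy data: every slot is
# pre-filled with the <END> vector, then the prefix of each row is overwritten
# with the real character vectors. A minimal, self-contained sketch; the
# 4-dim vectors, texts, and batch size below are invented for illustration:
import numpy as np

def _demo_fill_matrix():
    batch_size, vec_dim = 4, 4
    texts = ['ab', 'a']                              # hypothetical inputs
    end_vect = np.full(vec_dim, -1.0)                # stand-in <END> vector
    char_vects = {'a': np.ones(vec_dim), 'b': 2 * np.ones(vec_dim)}
    max_time = max(map(len, texts))
    matrix = np.tile(end_vect, (batch_size, max_time, 1)).astype(np.float32)
    for i, text in enumerate(texts):
        matrix[i, :len(text)] = [char_vects[ch] for ch in text]
    seq_length = [len(texts[i]) if i < len(texts) else 0
                  for i in range(batch_size)]
    return matrix, seq_length                        # (4, 2, 4), [2, 1, 0, 0]
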
def gen_train_data(): """获取每一句的keywords,拼起来写入文件""" print("Generating training data ...") segmenter = Segmenter() poems = Poems() ranked_words = RankedWords() gen_data = list() plan_data = list() valid = True counter_line = 0 print('len(poems)==>', len(poems)) for poem in poems: # print(len(poem)) if len(poem) != 4: # print(poem) valid = False continue context = start_of_sentence() keywords = list() for sentence in poem: counter_line += 1 keyword = '' if len(sentence) != 7: valid = False break filterwords = list( filter(lambda x: x in ranked_words, segmenter.segment(sentence))) if filterwords: keyword = filterwords[0] for word in filterwords: # print('word==>',word) if ranked_words.get_rank(word) < ranked_words.get_rank( keyword): keyword = word if keyword: gen_line = sentence + end_of_sentence() + \ '\t' + keyword + '\t' + context + '\n' keywords.append(keyword) gen_data.append(gen_line) context += sentence + end_of_sentence() plan_data.append(' '.join(keywords)) with open(plan_data_path, 'w') as fw: for data_iter in gen_data: fw.write(data_iter + '\n') with open(gen_data_path, 'w') as fw: for data_iter in gen_data: fw.write(data_iter) print('counter_line==>', counter_line) del segmenter, poems, ranked_words
def generate(funcutils, generator, keywords):
    assert NUM_OF_SENTENCES == len(keywords)
    pron_dict = PronDict()
    saver = tf.train.Saver()
    context = start_of_sentence()
    with tf.Session() as session:
        flag_trained = generator.initialize_session(session, saver)
        if not flag_trained:
            print("Please train the model first! (./train.py -g)")
            sys.exit(1)
        for keyword in keywords:
            # The keyword is tiled across the batch: every row of
            # keyword_data is identical, so any row of the output works.
            keyword_data, keyword_length = funcutils.fill_np_matrix(
                [keyword] * _BATCH_SIZE)
            context_data, context_length = funcutils.fill_np_matrix(
                [context] * _BATCH_SIZE)
            char = start_of_sentence()
            for _ in range(7):
                decoder_input, decoder_input_length = \
                    funcutils.fill_np_matrix([char])
                encoder_feed_dict = {
                    generator.keywords: keyword_data,
                    generator.length_keywords: keyword_length,
                    generator.context: context_data,
                    generator.context_length: context_length,
                    generator.sequence_decoder: decoder_input,
                    generator.length_decoder: decoder_input_length
                }
                # Feed the previous decoder state for every step after the first.
                if char != start_of_sentence():
                    encoder_feed_dict[generator.initial_decode_state] = state
                probs, state = session.run(
                    [generator.logits, generator.decoder_final_state],
                    feed_dict=encoder_feed_dict)
                prob_list = _gen_prob_list(funcutils, probs, context, pron_dict)
                # Greedy decoding: always pick the most probable character.
                id_probmax = np.argmax(prob_list, axis=0)
                char = funcutils.char_dict.id2char(id_probmax)
                context += char
            context += end_of_sentence()
    return context[1:].split(end_of_sentence())
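
# Greedy decoding, as used above, takes the argmax of the adjusted probability
# list at every step, so a given keyword and context always produce the same
# characters. A minimal sketch; the probabilities and id2char mapping are
# invented:
import numpy as np

def _demo_greedy_pick():
    prob_list = np.array([0.2, 0.5, 0.3])
    id2char = lambda i: 'abc'[i]                 # stand-in for char_dict.id2char
    return id2char(int(np.argmax(prob_list)))    # -> 'b', deterministically
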
def gen_train_data():
    print("Generating training data ...")
    segmenter = Segmenter()
    poems = Poems()
    poems.shuffle()
    ranked_words = RankedWords()
    plan_data = []
    gen_data = []
    for poem in poems:
        if len(poem) != 4:  # Only consider quatrains.
            continue
        valid = True
        context = start_of_sentence()
        gen_lines = []
        keywords = []
        for sentence in poem:
            if len(sentence) != 7:  # Only consider seven-character lines.
                valid = False
                break
            # Collect the words of this sentence that appear in the ranked
            # words list; ignore everything else.
            words = list(
                filter(lambda seg: seg in ranked_words,
                       segmenter.segment(sentence)))
            if len(words) == 0:
                valid = False
                break
            # Of all ranked words in this sentence, keep the one with the
            # highest TextRank score (i.e. the lowest rank value).
            keyword = words[0]
            for word in words[1:]:
                if ranked_words.get_rank(word) < ranked_words.get_rank(keyword):
                    keyword = word
            gen_line = sentence + end_of_sentence() + \
                    '\t' + keyword + '\t' + context + '\n'
            gen_lines.append(gen_line)
            keywords.append(keyword)
            context += sentence + end_of_sentence()
        if valid:
            # Plan data: each line holds the four keywords of one poem.
            plan_data.append('\t'.join(keywords) + '\n')
            gen_data.extend(gen_lines)
    with open(plan_data_path, 'w') as fout:
        for line in plan_data:
            fout.write(line)
    with open(gen_data_path, 'w') as fout:
        for line in gen_data:
            fout.write(line)
def generate(self, keywords):
    if NUM_OF_SENTENCES < len(keywords):
        keywords = keywords[:NUM_OF_SENTENCES]
    pron_dict = PronDict()
    context = start_of_sentence()
    with tf.Session() as session:
        self._initialize_session(session)
        if not self.trained:
            print("Please train the model first! (./train.py -g)")
            sys.exit(1)
        for keyword in keywords:
            keyword_data, keyword_length = self._fill_np_matrix(
                [keyword] * _BATCH_SIZE)
            context_data, context_length = self._fill_np_matrix(
                [context] * _BATCH_SIZE)
            char = start_of_sentence()
            for _ in range(7):
                decoder_input, decoder_input_length = \
                    self._fill_np_matrix([char])
                encoder_feed_dict = {
                    self.keyword: keyword_data,
                    self.keyword_length: keyword_length,
                    self.context: context_data,
                    self.context_length: context_length,
                    self.decoder_inputs: decoder_input,
                    self.decoder_input_length: decoder_input_length
                }
                # Feed the previous decoder state for every step after the first.
                if char != start_of_sentence():
                    encoder_feed_dict[self.decoder_init_state] = state
                probs, state = session.run(
                    [self.probs, self.decoder_final_state],
                    feed_dict=encoder_feed_dict)
                prob_list = self._gen_prob_list(probs, context, pron_dict)
                # Sample the next character from the cumulative distribution.
                prob_sums = np.cumsum(prob_list)
                rand_val = prob_sums[-1] * random()
                for i, prob_sum in enumerate(prob_sums):
                    if rand_val < prob_sum:
                        char = self.char_dict.int2char(i)
                        break
                context += char
            context += end_of_sentence()
    return context[1:].split(end_of_sentence())
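
# Unlike the greedy variant, the loop above draws the next character from the
# cumulative distribution, so repeated runs can produce different poems for
# the same keywords. The trick also works when prob_list is not normalized,
# because rand_val is scaled by the total mass. A minimal sketch with invented
# probabilities:
import numpy as np
from random import random

def _demo_sample_char():
    prob_list = [0.2, 0.5, 0.3]
    int2char = lambda i: 'abc'[i]     # stand-in for char_dict.int2char
    prob_sums = np.cumsum(prob_list)
    rand_val = prob_sums[-1] * random()
    for i, prob_sum in enumerate(prob_sums):
        if rand_val < prob_sum:
            return int2char(i)
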
def generate(self, keywords):
    if not tf.train.get_checkpoint_state(save_dir):
        print("Please train the model first! (./train.py -g)")
        sys.exit(1)
    self.checkpoint.restore(self.manager.latest_checkpoint)
    print("Checkpoint is loaded successfully !")
    assert NUM_OF_SENTENCES == len(keywords)
    context = start_of_sentence()
    pron_dict = PronDict()
    for keyword in keywords:
        keyword_data, keyword_length = self._fill_np_matrix(
            [keyword] * _BATCH_SIZE)
        context_data, context_length = self._fill_np_matrix(
            [context] * _BATCH_SIZE)
        keyword_state, context_output, final_output, final_state, \
            context_state = self.encoder(keyword_data, context_data)
        char = start_of_sentence()
        for _ in range(7):
            decoder_input, decoder_input_length = \
                self._fill_np_matrix([char])
            # After the first step, carry the decoder state forward.
            if char != start_of_sentence():
                keyword_state = final_state
            probs, final_state, _ = self.decoder(
                keyword_state, context_output, decoder_input,
                decoder_input_length, final_output, final_state,
                context_state)
            prob_list = self._gen_prob_list(probs, context, pron_dict)
            # Sample the next character from the cumulative distribution.
            prob_sums = np.cumsum(prob_list)
            rand_val = prob_sums[-1] * random()
            for i, prob_sum in enumerate(prob_sums):
                if rand_val < prob_sum:
                    char = self.char_dict.int2char(i)
                    break
            context += char
        context += end_of_sentence()
    return context[1:].split(end_of_sentence())
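
# The restore call above follows TF2's object-based checkpointing. A minimal
# sketch of how such a checkpoint/manager pair is typically wired up; the
# `encoder`, `decoder`, and `save_dir` names here are assumptions for
# illustration, not necessarily this repository's attributes:
import tensorflow as tf

def _demo_build_checkpoint(encoder, decoder, save_dir):
    checkpoint = tf.train.Checkpoint(encoder=encoder, decoder=decoder)
    manager = tf.train.CheckpointManager(checkpoint, save_dir, max_to_keep=3)
    if manager.latest_checkpoint:
        checkpoint.restore(manager.latest_checkpoint)
    return checkpoint, manager
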
def _fill_np_matrix(self, texts):
    max_time = max(map(len, texts))  # Length of the longest text.
    matrix = np.zeros([_BATCH_SIZE, max_time, CHAR_VEC_DIM],
                      dtype=np.float32)
    # Pre-fill with the <END> vector, then overwrite each row's prefix.
    for i in range(_BATCH_SIZE):
        for j in range(max_time):
            matrix[i, j, :] = self.char2vec.get_vect(end_of_sentence())
    for i, text in enumerate(texts):
        matrix[i, :len(text)] = self.char2vec.get_vects(text)
    # Unused batch slots get sequence length 0.
    seq_length = [len(texts[i]) if i < len(texts) else 0
                  for i in range(_BATCH_SIZE)]
    return matrix, seq_length
def fill_np_matrix(self, texts):
    maxlen = max(map(len, texts))
    matrix = np.zeros(shape=[_BATCH_SIZE, maxlen, _NUM_UNITS])
    # Pre-fill with the <END> vector, then overwrite each row's prefix.
    for i in range(_BATCH_SIZE):
        for j in range(maxlen):
            matrix[i, j, :] = self.char2vec.get_vect(end_of_sentence())
    for index, text in enumerate(texts):
        matrix[index, :len(text)] = self.char2vec.get_vects(text)
    # Unused batch slots get sequence length 0.
    lens_seq = [
        len(texts[index]) if index < len(texts) else 0
        for index in range(_BATCH_SIZE)
    ]
    return matrix, np.array(lens_seq)
def generate(self, keywords):
    # One extra keyword is passed in: it is popped off as a random hint below.
    assert NUM_OF_SENTENCES + 1 == len(keywords)
    pron_dict = PronDict()
    context = start_of_sentence()
    with tf.Session() as session:
        self._initialize_session(session)
        if not self.trained:
            print("Please train the model first! (./train.py -g)")
            sys.exit(1)
        # Iterate through the keywords, one per sentence. A random hint is
        # prepended to the first sentence's context so that the same keywords
        # do not always generate the same poem.
        hint = keywords.pop(randrange(len(keywords)))
        first_line = True
        for keyword in keywords:
            if first_line:
                context += hint
                first_line = False
            keyword_data, keyword_length = self._fill_np_matrix(
                [keyword] * _BATCH_SIZE)
            context_data, context_length = self._fill_np_matrix(
                [context] * _BATCH_SIZE)
            char = start_of_sentence()
            word_count = 0
            state = ''  # Placeholder before the first decoder step.
            while word_count < 7:
                prob_list, state = self._compute_prob_list(
                    char, keyword_data, keyword_length,
                    context_data, context_length, context,
                    state, session, pron_dict)
                if word_count == 0:
                    # First character: randomly sample BEAM_SIZE candidates
                    # and keep the one with the best score, so different runs
                    # generate different poems for the same keywords.
                    prob_sums = np.cumsum(prob_list)
                    char_array = ['' for _ in range(BEAM_SIZE)]
                    score_array = [1 for _ in range(BEAM_SIZE)]
                    for i in range(BEAM_SIZE):
                        rand_val = prob_sums[-1] * random()
                        for j, prob_sum in enumerate(prob_sums):
                            if rand_val < prob_sum:
                                char_array[i] = self.char_dict.int2char(j)
                                score_array[i] *= -math.log(prob_list[j])
                                break
                    # Scores are built from negative log probabilities, so the
                    # minimum score marks the most probable candidate.
                    min_value = 1000
                    min_index = 0
                    for k in range(len(score_array)):
                        if score_array[k] < min_value:
                            min_index = k
                            min_value = score_array[k]
                    char = char_array[min_index]
                    context += char
                    word_count += 1
                    # End of the first character.
                else:
                    # Beam search over pairs of characters.
                    char_array = ['' for _ in range(BEAM_SIZE)]
                    second_char_array = ['' for _ in range(BEAM_SIZE)]
                    score_array = [1 for _ in range(BEAM_SIZE)]
                    # Choose the BEAM_SIZE most probable first characters.
                    for i in range(BEAM_SIZE):
                        char_array[i], score, used_index = \
                            self._return_n_most_likely(prob_list, i + 1)
                        score_array[i] *= score
                        # Make sure the same character is not selected again.
                        prob_list[used_index] = 0
                    # For each candidate, choose the most probable follow-up.
                    for i in range(BEAM_SIZE):
                        current_context = context + char_array[i]
                        prob_list, state = self._compute_prob_list(
                            char_array[i], keyword_data, keyword_length,
                            context_data, context_length, current_context,
                            state, session, pron_dict)
                        second_char_array[i], score, used_index = \
                            self._return_n_most_likely(prob_list, 1)
                        # Re-pick the second character until it neither repeats
                        # the first one nor any character already in context.
                        random_sample = second_char_array[i]
                        used_chars = set(ch for ch in context)
                        tmp = 2
                        while (random_sample == char_array[i]
                               or random_sample in used_chars):
                            second_char_array[i], score, used_index = \
                                self._return_n_most_likely(prob_list, tmp)
                            random_sample = second_char_array[i]
                            tmp += 1
                        score_array[i] *= score
                    # Negative log probabilities again: minimum score is best.
                    min_value = 1000
                    min_index = 0
                    for i in range(len(score_array)):
                        if score_array[i] < min_value:
                            min_index = i
                            min_value = score_array[i]
                    # Skip candidates whose first character was already used,
                    # to avoid generating the same character again and again.
                    used_chars = set(ch for ch in context)
                    first_char = char_array[min_index]
                    in_loop = 0
                    while first_char in used_chars and in_loop < len(char_array):
                        score_array[min_index] = 1000
                        min_value = 1000
                        for i in range(len(score_array)):
                            # Find the minimum among the remaining candidates.
                            if score_array[i] < min_value:
                                min_index = i
                                min_value = score_array[i]
                        first_char = char_array[min_index]
                        in_loop += 1
                    first_char = char_array[min_index]
                    second_char = second_char_array[min_index]
                    context += first_char
                    context += second_char
                    char = second_char
                    word_count += 2
            # Append the <END> label.
            context += end_of_sentence()
        # Remove the extra hint from the front of the context.
        context = context[0] + context[len(hint) + 1:]
        return context[1:].split(end_of_sentence())
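
# The beam scoring above multiplies negative log probabilities per candidate
# pair and keeps the minimum, i.e. the most probable pair. A minimal sketch
# mirroring that arithmetic; the pairs and probabilities are invented:
import math

def _demo_beam_score():
    pairs = {('一', '片'): (0.4, 0.5), ('山', '水'): (0.1, 0.2)}
    scores = {pair: -math.log(p1) * -math.log(p2)
              for pair, (p1, p2) in pairs.items()}
    return min(scores, key=scores.get)   # -> ('一', '片'), the likelier pair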