Example No. 1
    def generate(self, input_file, output_file):
        """
        Generate a model.
        """
        deploy_model = self.model.get_deploy_function()
        with open(output_file, 'w') as fw:
            with codecs.open(input_file, 'r', config.globalCharSet()) as fo:
                for line in fo.readlines():
                    # line_word, line_zi = SegProcess(line.strip())
                    # line = line_zi.decode("gb18030")
                    line = line.strip()
                    print(line.encode(config.globalCharSet()))
                    fw.writelines('%s\n' % line.encode(config.globalCharSet()))
                    res, score = beam_search(line,
                                             self.cr,
                                             deploy_model,
                                             beam_size=200,
                                             search_scope=200)
                    print res
                    res = [
                        ' '.join(self.cr.transform_input_text(s)) for s in res
                    ]
                    for r, s in zip(res, score):
                        print('result: %s, score: %f.' % (r, s))
                        fw.writelines('result: %s, score: %f.\n' % (r, s))
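
All of these examples follow the same contract: beam_search returns a list of candidate token-id sequences plus a parallel list of scores, and self.cr.transform_input_text maps the ids back to tokens. The self-contained sketch below imitates that contract with hypothetical stubs (the stub functions and the tiny vocabulary are illustrative assumptions, not part of the project) just to show how the output is decoded and reported.

def beam_search_stub(line):
    # pretend the decoder produced two candidate id sequences with log-scores
    return [[3, 7, 2], [3, 9, 2]], [-1.2, -2.5]

def transform_input_text_stub(ids):
    vocab = {2: '<END>', 3: 'hello', 7: 'world', 9: 'there'}
    return [vocab[i] for i in ids]

res, score = beam_search_stub('hello')
res = [' '.join(transform_input_text_stub(s)) for s in res]
for r, s in zip(res, score):
    print('result: %s, score: %f.' % (r, s))
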
Example No. 2
 def generate(self, input_file, output_file):
     """
     Generate a model.
     """
     deploy_model = self.model.get_deploy_function()
     observe_model = self.model.get_observe_function()
     with open(output_file, 'w') as fw:
         with codecs.open(input_file, 'r', config.globalCharSet()) as fo:
             for line in fo.readlines() :
                 # line_word, line_zi = SegProcess(line.strip())
                 # line = line_zi.decode("gb18030")
                 line = line.strip()
                 print (line.encode(config.globalCharSet()))
                 fw.writelines('%s\n' % line.encode(config.globalCharSet()))
                 sentence, score = beam_search(line, self.cr, deploy_model, beam_size=50, search_scope=50)
                 print sentence
                 res = [' '.join(self.cr.transform_input_text(s)) for s in sentence]
                 for r, st, s in zip(res, sentence, score)[0:5] :
                     (question, question_mask) = self.cr.transform_input_data(line)
                     (tanswer, tanswer_mask) = ([[i] for i in st], [[i] for i in [1]*len(st)])
                     [alpha] = observe_model(question[:-1,:], question_mask[:-1,:], tanswer, tanswer_mask)
                     print ('result: %s, score: %f' % (r.encode(config.globalCharSet()), s))
                     for row in range(alpha.shape[0]) :
                         for col in range(alpha.shape[1]) :
                             print alpha[row, col, 0, 0],
                         print 
                     fw.writelines('result: %s, score: %f\n' % (r.encode(config.globalCharSet()), s))
                 fw.writelines('\n')
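
The nested loops above print the attention matrix element by element. If alpha is four-dimensional with the weights in its first two axes, as the indexing alpha[row, col, 0, 0] suggests (an assumption inferred from the example), the same table can be printed in one call with numpy:

import numpy as np

alpha = np.random.rand(3, 4, 1, 1)          # dummy attention weights
print(np.array2string(alpha[:, :, 0, 0], precision=4))
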
Example No. 3
 def generate(self, input_file, output_file):
     """
     Generate a model.
     """
     deploy_model = self.model.get_deploy_function()
     evaluate_model = self.reverse_model.get_evaluation_function()
     with open(output_file, 'w') as fw:
         with codecs.open(input_file, 'r', config.globalCharSet()) as fo:
             for line in fo.readlines() :
                 # line_word, line_zi = SegProcess(line.strip())
                 # line = line_zi.decode("gb18030")
                 line = line.strip()
                 print (line.encode(config.globalCharSet()))
                 fw.writelines('%s\n' % line.encode(config.globalCharSet()))
                 res, score = beam_search(line, self.cr, deploy_model,
                                           beam_size=10,
                                           search_scope=10,
                                           output_size=20)
                 print res
                 res = [' '.join(self.cr.transform_input_text(s[1:-1])) for s in res]
                 cbres = list()
                 for r, s in zip(res, score) :
                     (question, question_mask) = self.cr.transform_input_data(r)
                     (answer, answer_mask) = self.cr.transform_input_data(line)
                     answer = np.concatenate([question[-1:], answer], axis=0)
                     answer_mask = np.concatenate([question_mask[-1:], answer_mask], axis=0)
                     question = question[:-1,:]
                     question_mask = question_mask[:-1,:]
                     sae, _, _= evaluate_model(question, question_mask, answer, answer_mask)
                     cbres.append([r, s, sae])
                 for r, s, rs in sorted(cbres, key=lambda x: x[1]+x[2], reverse=True) :
                     print ('result: %s, score: %f, %f' % (r.encode(config.globalCharSet()), s, rs))
                     # fw.writelines('result: %s, score: %f, %f\n' % (r.encode(config.globalCharSet()), s, rs))
                 fw.writelines('\n')
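
Example No. 3 reranks the beam-search candidates with a reverse model: each candidate answer is fed back as the "question" and the original line as the "answer", and the candidates are then ordered by the sum of the forward and reverse scores (reverse=True, so higher is treated as better). A minimal sketch of that reranking step, with dummy scores standing in for the model outputs:

def rerank(candidates, forward_scores, reverse_scores):
    # order candidates by combined forward + reverse score, best first
    combined = zip(candidates, forward_scores, reverse_scores)
    return sorted(combined, key=lambda x: x[1] + x[2], reverse=True)

print(rerank(['answer a', 'answer b'], [0.3, 0.9], [0.5, 0.1]))
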
Example No. 4
 def generate_one_question(self, question, deploy_model, output_size=50):
     res, score = beam_search(question, self.cr, deploy_model,
                               beam_size=200,
                               search_scope=200,
                               output_size=output_size)
     print res
     res = [' '.join(self.cr.transform_input_text(s)) for s in res]
     return res, score
 def generate(self, input_file, output_file):
     """
     Generate a model with style modeling.
     """
     k = 10
     topic_distribution_function = self.model.get_topic_distribution_function()
     style_distribution_function = self.model.get_style_distribution_function()
     deploy_model = self.model.get_deploy_function()
     style_number = self.conf_dict['n_style']
     with codecs.open(input_file, 'r', config.globalCharSet()) as fo:
         with open(output_file, 'w') as fw:
             for line in fo.readlines() :
                 # line_word, line_zi = SegProcess(line.strip())
                 # line = line_zi.decode("gb18030")
                 line = line.strip()
                 print (line.encode(config.globalCharSet()))
                 fw.writelines('%s\n' % line.encode(config.globalCharSet()))
                 (question, question_mask) = self.cr.transform_input_data(line)
                 question = question[:-1]
                 question_mask = question_mask[:-1]
                 media_data, topic_distribution = \
                     topic_distribution_function(question, question_mask)
                 sorted_topics = \
                     sorted(enumerate(topic_distribution[0]), key=lambda x: x[1], reverse=True)
                     
                 all_prob = list()
                 all_res = list()
                 for topic, prob in sorted_topics[0:k] :
                     for style in range(style_number):
                         style_distribution =\
                             style_distribution_function(question, question_mask, 
                                                         numpy.array([topic], dtype='int64'))[0]
                         all_prob.append([topic, style, prob, style_distribution[0][style]])
                         # print 'style number: %d, score: %f' % (style, style_distribution[0][style])
                 for topic, style, tp, sp in sorted(all_prob, key=lambda x: x[2]*x[3], reverse=True)[0:k] :   
                     
                     def distribution_calculate(question, question_mask,
                                           answer, answer_mask):
                         topic_vector = \
                             numpy.concatenate([numpy.array([topic], dtype='int64')]*question.shape[1],
                                               axis=0)
                         return deploy_model(question, question_mask,
                                           answer, answer_mask,
                                           media_data, 
                                           topic_vector, style)
                             
                     res, score = beam_search(line, self.cr, distribution_calculate, 
                                              beam_size=5, search_scope=5)
                     # print res
                     for idx, r in enumerate(res) :
                         all_res.append([res[idx], score[idx]-math.log(tp*sp)])
                 all_res = sorted(all_res, key=lambda x: x[1], reverse=False)
                 print all_res
                 res = [(' '.join(self.cr.transform_input_text(s[0])), s[1]) for s in all_res[0:5]]
                 for r, s in res :
                     print ('result: %s, score: %f' % (r.encode(config.globalCharSet()), s))
                     fw.writelines('result: %s, score: %f\n' % (r.encode(config.globalCharSet()), s))
                 fw.writelines('\n')
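
The scoring in Example No. 4 folds the topic and style prior into the beam score in log space: score[idx] - math.log(tp * sp). Assuming the beam scores are negative log-likelihoods (lower is better, which matches the ascending sort above), an unlikely topic/style pair adds a larger penalty. A small worked example with made-up numbers:

import math

candidates = [('a', 2.1), ('b', 1.8)]             # (answer, beam score as NLL)
p_topic, p_style = 0.4, 0.25
rescored = [(r, s - math.log(p_topic * p_style)) for r, s in candidates]
print(sorted(rescored, key=lambda x: x[1]))        # lower combined score ranks first
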
Example No. 6
 def generate_one_question(self, question, media_function, deploy_function,
                            output_size=50, n_chosen_style=2):
     """
     Generate a model with style modeling.
     """
     style_number = self.conf_dict['n_style']
     
     style_score = [0] * style_number
     # a list comprehension gives each style its own list; [[]] * n would make every slot alias one shared list
     style_candidate_list = [[] for _ in range(style_number)]
     
     # line_word, line_zi = SegProcess(line.strip())
     # line = line_zi.decode("gb18030")
     print (question.encode(config.globalCharSet()))
     (question0, question_mask0) = self.cr.transform_input_data(question)
     question0 = question0[:-1]
     question_mask0 = question_mask0[:-1]
     media_data, style_distribution = media_function(question0, question_mask0)
     print style_distribution
     style_score = style_distribution[0]
     style_sorted_index = sorted(range(style_number), key=lambda x:-math.log(style_score[x]))[:n_chosen_style]
     
     res_list = []
     
     for style in range(style_number):
         def distribution_calculate(question, question_mask,
                               answer, answer_mask):
             return deploy_function(question, question_mask,
                               answer, answer_mask,
                               media_data, style)
         
         res, score = beam_search(question,
                                  self.cr,
                                  distribution_calculate,
                                  beam_size=200,
                                  search_scope=200,
                                  output_size=5)
         print res
         res = [' '.join(self.cr.transform_input_text(s)) for s in res[0:5]]
         for r, s in zip(res, score) :
             style_candidate_list[style].append((r, s - math.log(style_score[style])))
     
     for style_index in style_sorted_index:
         res_list += style_candidate_list[style_index]
     
     res_list = sorted(res_list, key=lambda x:x[1])
     answer_list = []
     answer_set = set()
     
     count = 0
     for answer, score in res_list:
         if count >= output_size:
             break
         if answer not in answer_set:
             answer_list.append((answer, score))
             count += 1
             answer_set.add(answer)
             
     return style_candidate_list, answer_list
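
A short aside on the initialization fixed above: multiplying a list literal ([[]] * n) copies the reference, so every slot aliases one shared list and each style's candidates would end up in all of them. A list comprehension creates independent lists:

aliased = [[]] * 3
aliased[0].append('x')
print(aliased)            # [['x'], ['x'], ['x']] -- all slots share one list

independent = [[] for _ in range(3)]
independent[0].append('x')
print(independent)        # [['x'], [], []]
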
Example No. 7
 def generate(self, input_file, output_file):
     """
     Generate a model with style modeling.
     """
     media_function = self.model.get_media_data_function()
     deploy_function = self.model.get_deploy_function()
     style_number = self.conf_dict['n_style']
     for style in range(style_number):
         with codecs.open(input_file, 'r', config.globalCharSet()) as fo:
                 with open(output_file + str(style), 'w') as fw:
                     for line in fo.readlines() :
                         # line_word, line_zi = SegProcess(line.strip())
                         # line = line_zi.decode("gb18030")
                         line = line.strip()
                         print (line.encode(config.globalCharSet()))
                         fw.writelines('%s\n' % line.encode(config.globalCharSet()))
                         (question, question_mask) = self.cr.transform_input_data(line)
                         question = question[:-1]
                         question_mask = question_mask[:-1]
                         media_data, style_distribution = media_function(question, question_mask)
                         print style_distribution
                         print 'style number : %d, score: %f' % (style, style_distribution[0][style])
                         fw.writelines('style number : %d, score: %f\n' % (style, style_distribution[0][style]))
                         
                         def distribution_calculate(question, question_mask,
                                               answer, answer_mask):
                             return deploy_function(question, question_mask,
                                               answer, answer_mask,
                                               media_data, style)
                         
                         res, score = beam_search(line, self.cr, distribution_calculate, beam_size=200, search_scope=200)
                         print res
                         res = [' '.join(self.cr.transform_input_text(s)) for s in res[0:5]]
                         for r, s in zip(res, score) :
                             print ('result: %s, score: %f' % (r.encode(config.globalCharSet()), s))
                             fw.writelines('result: %s, score: %f\n' % (r.encode(config.globalCharSet()), s))
                         fw.writelines('\n')
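
Examples No. 4, 6 and 7 all wrap the deploy function in a small closure (distribution_calculate) so that beam_search only has to supply the question/answer tensors while media_data and the chosen topic or style stay fixed. The same pattern can be written with functools.partial; the stub below is a hypothetical stand-in for the real deploy function, shown only to illustrate the currying:

from functools import partial

def deploy_function_stub(question, question_mask, answer, answer_mask, media_data, style):
    # stand-in for the model's deploy function
    return 0.0

def make_distribution_fn(media_data, style):
    # fix the extra arguments so the search callback only sees question/answer
    return partial(deploy_function_stub, media_data=media_data, style=style)

distribution_calculate = make_distribution_fn(media_data=None, style=0)
print(distribution_calculate('q', 'q_mask', 'a', 'a_mask'))
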
Example No. 8
    def generate_one_question_b_v(self, question, deploy_model):
        question_make_sense = isMakeSense(question)
        res, score = beam_search(question, self.cr, deploy_model,
                                              beam_size=100,
                                              search_scope=100,
                                              output_size=50)
        print res
        res = [' '.join(self.cr.transform_input_text(s)) for s in res]
        
        resorted_list = list()
        for r, s in zip(res, score):
            idf = 0.0
            tokens = r.split(u' ')
            for token in tokens[1:-1]:
                idf += get_idf(token)
#                         idf /= len(tokens)
#                         idf_revise = 1 / (1 +  np.exp(-2 / idf))
            idf_revise = 4 * np.tanh(4 * idf)
            resorted_list.append((r, s, s))
        if len(question) > 3:   
            resorted_list = sorted(resorted_list, key=lambda x:x[2] / len(question) ** 1)
        else:
            resorted_list = sorted(resorted_list, key=lambda x:x[2])
            
        candidates = list()
            
        if question_make_sense == 1:
            f = 0
            for r, _, _ in resorted_list[:5]:
                ori_sentence = r.replace(u'<END>', u'').replace(u' ', u'')
                if isMakeSense(ori_sentence) == 1:
                    f += 1
            if f <= 1:
                question_make_sense = 0
                
        for r, s1, s2 in resorted_list:
            ori_sentence = r.strip().replace(u'<END>', u'')
            ori_sentence = ori_sentence.replace(u' ', u'')
            answer_make_sense = isMakeSense(ori_sentence)
            r0 = r
            if isinstance(r, unicode) :
                r0 = r.encode(config.globalCharSet())
            print r0, s1, s2, answer_make_sense,
            
            if len(ori_sentence) <= 3 \
                and len(ori_sentence) < len(question) and ori_sentence in question:
                print 'continue1'
                continue
                    
            if answer_make_sense == -1 or u'系统' in ori_sentence or u'NUM' in ori_sentence:
                print 'continue2'
                continue
            
            if question_make_sense == 1 and answer_make_sense <= 0:
                print 'continue3'
                continue
            
#             r_token_count = len(ori_sentence.strip().split(u' '))
#             if question_word_count > 1 and r_token_count == 1:
#                 print 'continue4'
#                 continue
            candidates.append((r, s2))
        
        print 'variousen'
        
        variousen_scope = 15
        output_size = 5
        high_fruq_left = 4
        
        if len(candidates) == 0:
            return
        candidates, _ = zip(*candidates)
        
#                     v_index = variousen_strings(candidates[:variousen_scope], output_size)
#                     v_index = range(min(len(candidates), high_fruq_left)) + v_index
#             #                     print v_index
#                     func = lambda x, y:x if y in x else x + [y]
#                     v_index = reduce(func, [[], ] + v_index)
#                     toReturn = [candidates[i] for i in v_index[:output_size]]
        toReturn = candidates[:output_size]
        return toReturn    
Example No. 9
    def generate_b_v(self, input_file, output_file):
        """
        Generate a model with special optimizers.
        """
        deploy_model = self.model.get_deploy_function()
        with codecs.open(output_file, 'w', config.globalCharSet()) as fw:
            with codecs.open(input_file, 'r', config.globalCharSet()) as fo:
                for line in fo.readlines() :
                    # line_word, line_zi = SegProcess(line.strip())
                    # line = line_word.decode("gb18030")
                    # line = line_word
                    line = line.strip()
                    #question_make_sense = isMakeSense(line)
                    question_make_sense = 1
                    print (line.encode(config.globalCharSet()))
                    fw.writelines('%s\n' % line)
                    res, score = beam_search(line, self.cr, deploy_model, beam_size=1000, search_scope=1000)
                    print res
                    res = [' '.join(self.cr.transform_input_text(s)) for s in res]
                    resorted_list = list()
                    for r, s in zip(res, score):
                        idf = 0.0
                        tokens = r.split(u' ')
                        for token in tokens[1:-1]:
                            idf += get_idf(token)
            #                         idf /= len(tokens)
            #                         idf_revise = 1 / (1 +  np.exp(-2 / idf))
                        idf_revise = 4 * np.tanh(4 * idf)
                        resorted_list.append((r, s, s))
                    if len(line) > 3:
                        resorted_list = sorted(resorted_list, key=lambda x:x[2] / len(line) ** 1)
                    else:
                        resorted_list = sorted(resorted_list, key=lambda x:x[2])

                    candidates = list()

                    if question_make_sense == 1:
                        f = 0
                        for r, _, _ in resorted_list[:5]:
                            ori_sentence = r.replace(u'<END>', u'').replace(u' ', u'')
                            #if isMakeSense(ori_sentence) == 1:
                            if 1:
                                f += 1
                        if f <= 1:
                            question_make_sense = 0

                    for r, s1, s2 in resorted_list:
                        ori_sentence = r.strip().replace(u'<END>', u'')
                        ori_sentence = ori_sentence.replace(u' ', u'')
                        answer_make_sense = 1 #isMakeSense(ori_sentence)
                        r0 = r
                        if isinstance(r, unicode) :
                            r0 = r.encode(config.globalCharSet())
                        print r0, s1, s2, answer_make_sense,

                        if len(ori_sentence) <= 3 \
                            and len(ori_sentence) < len(line) and ori_sentence in line:
                            print 'continue1'
                            continue

                        if answer_make_sense == -1:
                            print 'continue2'
                            continue

                        if question_make_sense == 1 and answer_make_sense <= 0:
                            print 'continue3'
                            continue

                #             r_token_count = len(ori_sentence.strip().split(u' '))
                #             if question_word_count > 1 and r_token_count == 1:
                #                 print 'continue4'
                #                 continue
                        candidates.append((r, s2))

                    print 'variousen'

                    variousen_scope = 15
                    output_size = 5
                    high_fruq_left = 4
                    v_index = variousen_strings(candidates[:variousen_scope], output_size)
                    v_index = range(min(len(candidates), high_fruq_left)) + v_index
                #                     print v_index
                    func = lambda x, y:x if y in x else x + [y]
                    v_index = reduce(func, [[], ] + v_index)

                    toReturn = [candidates[i] for i in v_index[:output_size]]
                    for r, s in toReturn :
                        print ('result: %s, score: %f.' % (r.encode(config.globalCharSet()), s))
                        fw.writelines('result: %s, score: %f.\n' % (r, s))
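
The reduce call near the end of Example No. 9 is an order-preserving de-duplication of candidate indices: the high-frequency prefix and the indices returned by variousen_strings are merged while keeping only the first occurrence of each. A self-contained version (using functools.reduce so it also runs on Python 3; the snippet above relies on the Python 2 builtin):

from functools import reduce

v_index = [0, 1, 0, 2, 1, 3]
dedup = reduce(lambda x, y: x if y in x else x + [y], [[]] + v_index)
print(dedup)    # [0, 1, 2, 3]
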