Esempio n. 1
0
    def generate(self, input_file, output_file):
        """
        Run beam search on every line of ``input_file`` and log the results.

        Each stripped input line is echoed to stdout and to ``output_file``;
        it is then handed to ``beam_search`` (beam_size=200, search_scope=200)
        together with ``self.cr`` and the model's deploy function. Every
        returned candidate is converted to text with
        ``self.cr.transform_input_text`` and reported, with its score, on
        stdout and in ``output_file``.

        :param input_file: path of the file to read, decoded with
            ``config.globalCharSet()``; one query per line.
        :param output_file: path of the report file; it is overwritten.
        """
        charset = config.globalCharSet()
        deploy_model = self.model.get_deploy_function()
        with open(output_file, 'w') as fw:
            with codecs.open(input_file, 'r', charset) as fo:
                for line in fo:
                    line = line.strip()
                    encoded_line = line.encode(charset)
                    print(encoded_line)
                    fw.write('%s\n' % encoded_line)
                    res, score = beam_search(line,
                                             self.cr,
                                             deploy_model,
                                             beam_size=200,
                                             search_scope=200)
                    print(res)
                    res = [
                        ' '.join(self.cr.transform_input_text(s)) for s in res
                    ]
                    for r, s in zip(res, score):
                        # ``fw`` is a byte-mode file: a unicode result would be
                        # implicitly encoded as ASCII on write and fail on any
                        # non-ASCII text, so encode it like the echoed query.
                        if isinstance(r, unicode):
                            r = r.encode(charset)
                        report = 'result: %s, score: %f.' % (r, s)
                        print(report)
                        fw.write('%s\n' % report)
 def style_chaos(self, input_file, output_file):
     """
     Compute the "chaos" (entropy) of each question's style distribution.

     For every stripped line of ``input_file`` the question is encoded with
     ``self.cr.transform_input_data`` (last row dropped) and passed to the
     model's media-data function. The first row of the returned style
     distribution is reduced to ``sum(-p * log(p))`` and written, together
     with the distribution itself, as ``<line>\\t<chaos>\\t<distribution>``.

     :param input_file: path of the file to read, decoded with
         ``config.globalCharSet()``; one question per line.
     :param output_file: prefix of the report path; the report is written
         to ``output_file + 'chaos'`` and overwritten if it exists.
     """
     charset = config.globalCharSet()
     get_media_data_function = self.model.get_media_data_function()
     with codecs.open(input_file, 'r', charset) as fo:
         with open(output_file + 'chaos', 'w') as fw:
             for line in fo:
                 line = line.strip()
                 encoded_line = line.encode(charset)
                 print(encoded_line)
                 fw.write('%s\t' % encoded_line)
                 (question, question_mask) = self.cr.transform_input_data(line)
                 question = question[:-1]
                 question_mask = question_mask[:-1]
                 _, style_distribution = get_media_data_function(question, question_mask)

                 st = style_distribution.tolist()[0]
                 # A zero probability contributes nothing to the entropy
                 # (p * log(p) -> 0), whereas math.log(0) raises ValueError,
                 # so such entries are skipped.
                 chaos = sum(-p * math.log(p) for p in st if p > 0)

                 print('%s \t %s' % (chaos, str(st)))

                 output = '%f\t%s\n' % (chaos, str(st))
                 fw.write(output.encode(charset))
 def generate(self, input_file, output_file):
     """
     Beam-search candidates for each line and re-rank them with the reverse model.

     Each stripped input line is echoed to stdout and to ``output_file``.
     ``beam_search`` (beam_size=10, search_scope=10, output_size=20) yields
     candidates; each candidate, stripped of its first and last token, is
     turned into text and scored by ``self.reverse_model``'s evaluation
     function with the candidate as question and the input line as answer.
     Candidates are printed in descending order of
     ``beam score + reverse-model score``.

     Only the input line and a blank separator are written to
     ``output_file``; the ranked results go to stdout only.

     :param input_file: path of the file to read, decoded with
         ``config.globalCharSet()``; one query per line.
     :param output_file: path of the output file; it is overwritten.
     """
     charset = config.globalCharSet()
     deploy_model = self.model.get_deploy_function()
     evaluate_model = self.reverse_model.get_evaluation_function()
     with open(output_file, 'w') as fw:
         with codecs.open(input_file, 'r', charset) as fo:
             for line in fo:
                 line = line.strip()
                 encoded_line = line.encode(charset)
                 print(encoded_line)
                 fw.write('%s\n' % encoded_line)
                 res, score = beam_search(line, self.cr, deploy_model,
                                          beam_size=10,
                                          search_scope=10,
                                          output_size=20)
                 print(res)
                 res = [' '.join(self.cr.transform_input_text(s[1:-1])) for s in res]
                 # The encoding of the input line does not depend on the
                 # candidate, so compute it once instead of per candidate.
                 (line_data, line_mask) = self.cr.transform_input_data(line)
                 cbres = list()
                 for r, s in zip(res, score):
                     (question, question_mask) = self.cr.transform_input_data(r)
                     # The candidate's last row is moved to the front of the
                     # answer side before evaluation.
                     answer = np.concatenate([question[-1:], line_data], axis=0)
                     answer_mask = np.concatenate([question_mask[-1:], line_mask], axis=0)
                     sae, _, _ = evaluate_model(question[:-1, :], question_mask[:-1, :],
                                                answer, answer_mask)
                     cbres.append([r, s, sae])
                 for r, s, rs in sorted(cbres, key=lambda x: x[1] + x[2], reverse=True):
                     print('result: %s, score: %f, %f' % (r.encode(charset), s, rs))
                 fw.write('\n')
 def generate(self, input_file, output_file):
     """
     Beam-search answers for each line and dump the observe-function output.

     Each stripped input line is echoed to stdout and to ``output_file``.
     ``beam_search`` (beam_size=50, search_scope=50) yields candidates; for
     the first five, the model's observe function is called with the
     question (last row dropped) and the candidate as a column of token ids
     with an all-ones mask. The returned array is printed row by row
     (element ``[row, col, 0, 0]``), and the candidate text with its score is
     written to ``output_file``.

     :param input_file: path of the file to read, decoded with
         ``config.globalCharSet()``; one query per line.
     :param output_file: path of the report file; it is overwritten.
     """
     charset = config.globalCharSet()
     deploy_model = self.model.get_deploy_function()
     observe_model = self.model.get_observe_function()
     with open(output_file, 'w') as fw:
         with codecs.open(input_file, 'r', charset) as fo:
             for line in fo:
                 line = line.strip()
                 encoded_line = line.encode(charset)
                 print(encoded_line)
                 fw.write('%s\n' % encoded_line)
                 sentence, score = beam_search(line, self.cr, deploy_model,
                                               beam_size=50, search_scope=50)
                 print(sentence)
                 res = [' '.join(self.cr.transform_input_text(s)) for s in sentence]
                 # The question encoding is the same for every candidate:
                 # build it once per line.
                 (question, question_mask) = self.cr.transform_input_data(line)
                 question = question[:-1, :]
                 question_mask = question_mask[:-1, :]
                 # list(...) keeps the slice valid even where zip() is lazy.
                 for r, st, s in list(zip(res, sentence, score))[:5]:
                     tanswer = [[i] for i in st]
                     tanswer_mask = [[1] for _ in st]
                     [alpha] = observe_model(question, question_mask, tanswer, tanswer_mask)
                     report = 'result: %s, score: %f' % (r.encode(charset), s)
                     print(report)
                     for row in range(alpha.shape[0]):
                         print(' '.join(str(alpha[row, col, 0, 0])
                                        for col in range(alpha.shape[1])))
                     fw.write('%s\n' % report)
                 fw.write('\n')
 def generate(self, input_file, output_file):
     """
     Generate answers conditioned on the most probable (topic, style) pairs.

     For every stripped line of ``input_file``:

     1. the question is encoded (last row dropped) and the topic
        distribution is computed; the ``k`` = 10 most probable topics are kept;
     2. for each kept topic the style distribution is computed and every
        (topic, style) pair is scored with ``P(topic) * P(style)``;
     3. for the ``k`` best pairs, ``beam_search`` (beam_size=5,
        search_scope=5) is run with the deploy function bound to that
        topic/style, and ``-log(P(topic) * P(style))`` is added to each
        beam score;
     4. the five candidates with the lowest combined score are printed and
        written to ``output_file``.

     :param input_file: path of the file to read, decoded with
         ``config.globalCharSet()``; one query per line.
     :param output_file: path of the report file; it is overwritten.
     """
     k = 10
     charset = config.globalCharSet()
     topic_distribution_function = self.model.get_topic_distribution_function()
     style_distribution_function = self.model.get_style_distribution_function()
     deploy_model = self.model.get_deploy_function()
     style_number = self.conf_dict['n_style']
     with codecs.open(input_file, 'r', charset) as fo:
         with open(output_file, 'w') as fw:
             for line in fo:
                 line = line.strip()
                 encoded_line = line.encode(charset)
                 print(encoded_line)
                 fw.write('%s\n' % encoded_line)
                 (question, question_mask) = self.cr.transform_input_data(line)
                 question = question[:-1]
                 question_mask = question_mask[:-1]
                 media_data, topic_distribution = \
                     topic_distribution_function(question, question_mask)
                 sorted_topics = sorted(enumerate(topic_distribution[0]),
                                        key=lambda x: x[1], reverse=True)

                 all_prob = list()
                 for topic, prob in sorted_topics[0:k]:
                     # The style distribution depends on the topic only, so
                     # it is evaluated once per topic, not once per style.
                     style_distribution = style_distribution_function(
                         question, question_mask,
                         numpy.array([topic], dtype='int64'))[0]
                     for style in range(style_number):
                         all_prob.append([topic, style, prob, style_distribution[0][style]])
                 all_prob.sort(key=lambda x: x[2] * x[3], reverse=True)

                 all_res = list()
                 for topic, style, tp, sp in all_prob[0:k]:
                     joint_prob = tp * sp
                     if joint_prob <= 0:
                         # -log(0) is an infinite penalty and math.log(0)
                         # raises ValueError: such a pair can never win.
                         continue

                     # Called by beam_search within this iteration, so the
                     # closure sees the current topic/style/media_data.
                     def distribution_calculate(question, question_mask,
                                                answer, answer_mask):
                         # One topic id per column of the question batch.
                         topic_vector = numpy.concatenate(
                             [numpy.array([topic], dtype='int64')] * question.shape[1],
                             axis=0)
                         return deploy_model(question, question_mask,
                                             answer, answer_mask,
                                             media_data,
                                             topic_vector, style)

                     res, score = beam_search(line, self.cr, distribution_calculate,
                                              beam_size=5, search_scope=5)
                     penalty = -math.log(joint_prob)
                     for r, s in zip(res, score):
                         all_res.append([r, s + penalty])
                 all_res = sorted(all_res, key=lambda x: x[1], reverse=False)
                 print(all_res)
                 res = [(' '.join(self.cr.transform_input_text(s[0])), s[1])
                        for s in all_res[0:5]]
                 for r, s in res:
                     report = 'result: %s, score: %f' % (r.encode(charset), s)
                     print(report)
                     fw.write('%s\n' % report)
                 fw.write('\n')
Esempio n. 6
0
 def generate_emb(self, input_file, output_file):
     """
     Append the encoder vector of each line's first column to that line.

     Every stripped line of ``input_file`` is split on tabs; the first
     column is encoded with ``self.cr.transform_input_data`` and fed to the
     function returned by ``self.model.get_encoder_vector()``. The first row
     of the result is rendered as space-separated values and written as
     ``<line>\\t<values>`` to ``output_file``. Both files use
     ``config.globalCharSet()``.
     """
     encode_question = self.model.get_encoder_vector()
     with codecs.open(output_file, 'w', config.globalCharSet()) as writer:
         with codecs.open(input_file, 'r', config.globalCharSet()) as reader:
             for raw in reader.readlines():
                 record = raw.strip()
                 columns = record.split('\t')
                 token_ids, token_mask = self.cr.transform_input_data(columns[0])
                 vector = encode_question(token_ids, token_mask)
                 rendered = ' '.join([str(value) for value in vector[0]])
                 writer.write(record + '\t' + rendered + '\n')
Esempio n. 7
0
 def generate(self, input_file, output_file):
     """
     Print the deploy-function output for every line of ``input_file``.

     Each stripped line is echoed to stdout and to ``output_file``, encoded
     with ``self.cr.transform_input_data`` and passed to the model's deploy
     function. Element ``[0][0]`` of the result is printed as
     space-separated values; only a blank line follows the echoed input in
     ``output_file``.
     """
     run_model = self.model.get_deploy_function()
     with open(output_file, 'w') as sink:
         with codecs.open(input_file, 'r', config.globalCharSet()) as source:
             for raw in source.readlines():
                 text = raw.strip()
                 encoded_text = text.encode(config.globalCharSet())
                 print (encoded_text)
                 sink.write('%s\n' % encoded_text)
                 token_ids, token_mask = self.cr.transform_input_data(text)
                 outputs = run_model(token_ids, token_mask)
                 hidden_states = outputs[0][0]
                 rendered = ' '.join(str(value) for value in hidden_states)
                 print ('result: %s' % rendered)
                 sink.write('\n')
Esempio n. 8
0
 def generate_one_question(self, question, media_function, deploy_function,
                            output_size=50, n_chosen_style=2):
     """
     Generate answers for one question under every style and merge the best.

     The question is encoded (last row dropped) and ``media_function``
     returns the media data plus a style distribution. For each style,
     ``beam_search`` (beam_size=200, search_scope=200, output_size=5) is run
     with ``deploy_function`` bound to that style; up to five candidates are
     kept per style, each scored ``beam score - log(P(style))``.

     The candidates of the ``n_chosen_style`` most probable styles are then
     merged, sorted by ascending score and de-duplicated.

     :param question: question text (encoded for the echo on stdout).
     :param media_function: callable ``(question, mask)`` returning
         ``(media_data, style_distribution)``.
     :param deploy_function: callable forwarded to ``beam_search`` with the
         media data and the style index appended to its arguments.
     :param output_size: maximum number of merged answers returned.
     :param n_chosen_style: number of most probable styles that are merged.
     :returns: ``(style_candidate_list, answer_list)`` where
         ``style_candidate_list[i]`` holds the ``(text, score)`` candidates
         of style ``i`` and ``answer_list`` the merged, unique
         ``(text, score)`` pairs.
     """
     style_number = self.conf_dict['n_style']

     # One independent list per style. ``[[]] * style_number`` would repeat
     # a reference to a single list, so every style would share (and every
     # merge would duplicate) the same candidates.
     style_candidate_list = [[] for _ in range(style_number)]

     print(question.encode(config.globalCharSet()))
     (question0, question_mask0) = self.cr.transform_input_data(question)
     question0 = question0[:-1]
     question_mask0 = question_mask0[:-1]
     media_data, style_distribution = media_function(question0, question_mask0)
     print(style_distribution)
     style_score = style_distribution[0]
     # Most probable styles first. Ordering by -p is the same ordering as
     # by -log(p) but does not fail on a zero probability.
     style_sorted_index = sorted(range(style_number),
                                 key=lambda x: -style_score[x])[:n_chosen_style]

     for style in range(style_number):
         # Called by beam_search within this iteration, so the closure sees
         # the current ``style``.
         def distribution_calculate(question, question_mask,
                                    answer, answer_mask):
             return deploy_function(question, question_mask,
                                    answer, answer_mask,
                                    media_data, style)

         res, score = beam_search(question,
                                  self.cr,
                                  distribution_calculate,
                                  beam_size=200,
                                  search_scope=200,
                                  output_size=5)
         print(res)
         res = [' '.join(self.cr.transform_input_text(s)) for s in res[0:5]]
         if style_score[style] > 0:
             style_cost = -math.log(style_score[style])
         else:
             # A zero-probability style gets an infinite cost instead of a
             # ValueError from math.log(0).
             style_cost = float('inf')
         for r, s in zip(res, score):
             style_candidate_list[style].append((r, s + style_cost))

     res_list = []
     for style_index in style_sorted_index:
         res_list += style_candidate_list[style_index]
     res_list = sorted(res_list, key=lambda x: x[1])

     # Keep the best-scored occurrence of each distinct answer.
     answer_list = []
     answer_set = set()
     for answer, score in res_list:
         if len(answer_list) >= output_size:
             break
         if answer not in answer_set:
             answer_set.add(answer)
             answer_list.append((answer, score))

     return style_candidate_list, answer_list
Esempio n. 9
0
 def generate(self, input_file, output_file):
     """
     Answer every line of ``input_file`` via ``generate_one_question``.

     Each stripped line is echoed to stdout and to ``output_file``; the
     answers and scores returned by
     ``self.generate_one_question(line, deploy_model, output_size=5)`` are
     then reported pairwise, followed by a blank separator line.
     """
     deploy_model = self.model.get_deploy_function()
     with open(output_file, 'w') as sink:
         with codecs.open(input_file, 'r', config.globalCharSet()) as source:
             for raw in source.readlines():
                 query = raw.strip()
                 encoded_query = query.encode(config.globalCharSet())
                 print (encoded_query)
                 sink.write('%s\n' % encoded_query)

                 answers, scores = self.generate_one_question(query, deploy_model,
                                                              output_size=5)
                 for answer, value in zip(answers, scores):
                     report = 'result: %s, score: %f' % (
                         answer.encode(config.globalCharSet()), value)
                     print (report)
                     sink.write(report + '\n')
                 sink.write('\n')
Esempio n. 10
0
 def generate_b_v(self, input_file, output_file):
     """
     Answer every line of ``input_file`` via ``generate_one_question_b_v``.

     Each stripped line is echoed to stdout and written to ``output_file``,
     followed by one ``result: <answer>`` line per answer returned by
     ``self.generate_one_question_b_v``. Both files are opened with
     ``config.globalCharSet()``.

     :param input_file: path of the file to read; one query per line.
     :param output_file: path of the report file; it is overwritten.
     """
     charset = config.globalCharSet()
     deploy_model = self.model.get_deploy_function()
     with codecs.open(output_file, 'w', charset) as fw:
         with codecs.open(input_file, 'r', charset) as fo:
             for line in fo:
                 line = line.strip()
                 print(line.encode(charset))
                 fw.write('%s\n' % line)

                 # generate_one_question_b_v may return None when every
                 # candidate was filtered out; treat that as "no answers"
                 # instead of failing on iteration.
                 toReturn = self.generate_one_question_b_v(line, deploy_model) or ()

                 for r in toReturn:
                     report = 'result: %s' % (r,)
                     print(report)
                     fw.write('%s\n' % report)
Esempio n. 11
0
 def generate(self, input_file, output_file):
     """
     Answer every line of ``input_file`` via ``generate_one_question``.

     Each stripped line is echoed to stdout and to ``output_file``; the
     answers and scores returned by ``self.generate_one_question``
     (output_size=5, beam_size=100, search_scope=100) are reported one per
     line, followed by a blank separator line.

     :param input_file: path of the file to read, decoded with
         ``config.globalCharSet()``; one query per line.
     :param output_file: path of the report file; it is overwritten.
     """
     charset = config.globalCharSet()
     deploy_model = self.model.get_deploy_function()
     with open(output_file, 'w') as fw:
         with codecs.open(input_file, 'r', charset) as fo:
             for line in fo:
                 line = line.strip()
                 encoded_line = line.encode(charset)
                 print(encoded_line)
                 fw.write('%s\n' % encoded_line)

                 res, score = self.generate_one_question(line, deploy_model,
                                                         output_size=5,
                                                         beam_size=100,
                                                         search_scope=100)
                 for r, s in zip(res, score):
                     report = 'result: %s, score: %f' % (r.encode(charset), s)
                     print(report)
                     # Each result gets its own line; without the newline
                     # all results of a query run together in the file.
                     fw.write('%s\n' % report)
                 fw.write('\n')
Esempio n. 12
0
 def generate(self, input_file, output_file):
     """
     Generate answers for every input line, once per style.

     For each style index below ``self.conf_dict['n_style']`` the input file
     is read again and a separate report ``output_file + str(style)`` is
     written. Per line the report holds the echoed query, the probability
     the media function assigns to the current style, and up to five
     beam-search results (beam_size=200, search_scope=200) decoded with the
     deploy function bound to that style.
     """
     media_function = self.model.get_media_data_function()
     deploy_function = self.model.get_deploy_function()
     n_styles = self.conf_dict['n_style']
     for style in range(n_styles):
         with codecs.open(input_file, 'r', config.globalCharSet()) as source:
             with open(output_file + str(style), 'w') as sink:
                 for raw in source.readlines():
                     query = raw.strip()
                     encoded_query = query.encode(config.globalCharSet())
                     print (encoded_query)
                     sink.write('%s\n' % encoded_query)

                     token_ids, token_mask = self.cr.transform_input_data(query)
                     media_data, style_distribution = media_function(token_ids[:-1],
                                                                     token_mask[:-1])
                     print (style_distribution)
                     style_report = 'style number : %d, score: %f' % (
                         style, style_distribution[0][style])
                     print (style_report)
                     sink.write(style_report + '\n')

                     # Deploy function with the media data and style of the
                     # current line/iteration appended to its arguments.
                     def decode_step(question, question_mask,
                                     answer, answer_mask):
                         return deploy_function(question, question_mask,
                                                answer, answer_mask,
                                                media_data, style)

                     candidates, scores = beam_search(query, self.cr, decode_step,
                                                      beam_size=200, search_scope=200)
                     print (candidates)
                     texts = [' '.join(self.cr.transform_input_text(c))
                              for c in candidates[0:5]]
                     for text, value in zip(texts, scores):
                         report = 'result: %s, score: %f' % (
                             text.encode(config.globalCharSet()), value)
                         print (report)
                         sink.write(report + '\n')
                     sink.write('\n')
Esempio n. 13
0
 def generate_b_v_t_c(self, input_file, output_file):
     """
     Measure how often the classifier output equals the label column.

     Every stripped line of ``input_file`` is split on tabs. The first
     column is encoded with ``self.cr.transform_input_data`` and classified
     with the function returned by ``self.model.classification_deploy()``;
     element ``[0][0]`` of the result is compared with the third column
     parsed as a float.

     Prints ``output_file``, the number of matches, the number of lines and
     the match ratio (``0.0`` when the input has no lines). Nothing is
     written to ``output_file``; it is only echoed.

     :param input_file: path of the tab-separated file to read, decoded
         with ``config.globalCharSet()``.
     :param output_file: only printed, never opened.
     """
     get_cost = self.model.classification_deploy()
     total_num = 0
     true_num = 0
     with codecs.open(input_file, 'r', config.globalCharSet()) as fo:
         for line in fo:
             lines = line.strip().split('\t')
             (question,
              question_mask) = self.cr.transform_input_data(lines[0])
             qa_cost = get_cost(question, question_mask)
             total_num += 1
             # float() replaces the deprecated string.atof().
             if float(lines[2]) == qa_cost[0][0]:
                 true_num += 1
     print(output_file)
     print(true_num)
     print(total_num)
     # An empty input must not end in ZeroDivisionError; the explicit
     # float() keeps the ratio a true division.
     print(float(true_num) / total_num if total_num else 0.0)
Esempio n. 14
0
    def generate_one_question_b_v(self, question, deploy_model):
        question_make_sense = isMakeSense(question)
        res, score = beam_search(question, self.cr, deploy_model,
                                              beam_size=100,
                                              search_scope=100,
                                              output_size=50)
        print res
        res = [' '.join(self.cr.transform_input_text(s)) for s in res]
        
        resorted_list = list()
        for r, s in zip(res, score):
            idf = 0.0
            tokens = r.split(u' ')
            for token in tokens[1:-1]:
                idf += get_idf(token)
#                         idf /= len(tokens)
#                         idf_revise = 1 / (1 +  np.exp(-2 / idf))
            idf_revise = 4 * np.tanh(4 * idf)
            resorted_list.append((r, s, s))
        if len(question) > 3:   
            resorted_list = sorted(resorted_list, key=lambda x:x[2] / len(question) ** 1)
        else:
            resorted_list = sorted(resorted_list, key=lambda x:x[2])
            
        candidates = list()
            
        if question_make_sense == 1:
            f = 0
            for r, _, _ in resorted_list[:5]:
                ori_sentence = r.replace(u'<END>', u'').replace(u' ', u'')
                if isMakeSense(ori_sentence) == 1:
                    f += 1
            if f <= 1:
                question_make_sense = 0
                
        for r, s1, s2 in resorted_list:
            ori_sentence = r.strip().replace(u'<END>', u'')
            ori_sentence = ori_sentence.replace(u' ', u'')
            answer_make_sense = isMakeSense(ori_sentence)
            r0 = r
            if isinstance(r, unicode) :
                r0 = r.encode(config.globalCharSet())
            print r0, s1, s2, answer_make_sense,
            
            if len(ori_sentence) <= 3 \
                and len(ori_sentence) < len(question) and ori_sentence in question:
                print 'continue1'
                continue
                    
            if answer_make_sense == -1 or u'ϵͳ' in ori_sentence or u'NUM' in ori_sentence:
                print 'continue2'
                continue
            
            if question_make_sense == 1 and answer_make_sense <= 0:
                print 'continue3'
                continue
            
#             r_token_count = len(ori_sentence.strip().split(u' '))
#             if question_word_count > 1 and r_token_count == 1:
#                 print 'continue4'
#                 continue
            candidates.append((r, s2))
        
        print 'variousen'
        
        variousen_scope = 15
        output_size = 5
        high_fruq_left = 4
        
        if len(candidates) == 0:
            return
        candidates, _ = zip(*candidates)
        
#                     v_index = variousen_strings(candidates[:variousen_scope], output_size)
#                     v_index = range(min(len(candidates), high_fruq_left)) + v_index
#             #                     print v_index
#                     func = lambda x, y:x if y in x else x + [y]
#                     v_index = reduce(func, [[], ] + v_index)
#                     toReturn = [candidates[i] for i in v_index[:output_size]]
        toReturn = candidates[:output_size]
        return toReturn    
Esempio n. 15
0
    def generate_b_v_t_g(self, input_file, output_file):
        """
        Beam-search, filter and diversify answers for every input line.

        Prints ``output_file`` and the method name, then for each stripped
        line of ``input_file``:

        * if the line has exactly three tab-separated columns, it is rebuilt
          from the first and third column only;
        * the line is written to ``output_file`` and passed to
          ``beam_search_t`` (beam_size=200, search_scope=100);
        * each result is split into all-but-last elements (converted to
          text) and its last element, re-joined with a tab;
        * results are sorted by ascending score, very short answers that are
          contained in the line are dropped, and up to five answers chosen
          via ``variousen_strings`` are printed and written.

        Both files are opened with ``config.globalCharSet()``.
        """
        deploy_model = self.model.get_deploy_function()
        #get_cost= self.model.get_cost()
        print output_file
        print 'generate_b_v_t_g'
        with codecs.open(output_file, 'w', config.globalCharSet()) as fw:
            with codecs.open(input_file, 'r', config.globalCharSet()) as fo:
                for line in fo.readlines() :
                    # line_word, line_zi = SegProcess(line.strip())
                    # line = line_word.decode("gb18030")
                    # line = line_word
                    line = line.strip()
                    lines = line.strip().split('\t')
                    #(question, question_mask) = self.cr.transform_input_data(lines[0])
                    #(answer, answer_mask) = self.cr.transform_input_data(lines[1])
                    #qa_cost=get_cost(question, question_mask,answer,answer_mask,[[string.atoi(lines[2])]])
                    #fw.write(line+'\t'+str(qa_cost)+'\n')
                    
                    # The sense check is disabled: the question is always
                    # treated as sensible.
                    question_make_sense = 1#isMakeSense(line)
                    print (line.encode(config.globalCharSet()))
                    #fw.writelines('%s\n' % line)
                    # Three columns: keep only the first and the third one.
                    if(len(lines)==3):
                        line=''
                        line+=lines[0]
                        line+='\t'
                        line+=lines[2]
                    fw.writelines('%s\n' % line)
                    res, score = beam_search_t(line, self.cr, deploy_model, beam_size=200, search_scope=100)
                    print res
                    # Split every result into its leading elements (turned
                    # into text below) and its last element (appended after
                    # a tab).
                    # NOTE(review): the meaning of that last element is
                    # defined by beam_search_t, which is not visible here.
                    res1= [s[:-1] for s in res]
                    res2= [s[-1] for s in res]
                    res = [' '.join(self.cr.transform_input_text(s)) for s in res1]
                    for res_len in range(len(res)):
                        res[res_len]+='\t'
                        res[res_len]+=str(res2[res_len])

                    resorted_list = list()
                    for r, s in zip(res, score):
                        idf = 0.0
                        tokens = r.split(u' ')
                        for token in tokens[1:-1]:
                            idf += get_idf(token)
                        # idf /= len(tokens)
                        # idf_revise = 1 / (1 +  np.exp(-2 / idf))
                        # NOTE(review): idf_revise is computed but never
                        # used; the score ``s`` is stored twice instead.
                        idf_revise = 4 * np.tanh(4 * idf)
                        resorted_list.append((r, s, s))
                    # Ascending score; the division by the constant line
                    # length does not change the order.
                    if len(line) > 3:
                        resorted_list = sorted(resorted_list, key=lambda x:x[2] / len(line) ** 1)
                    else:
                        resorted_list = sorted(resorted_list, key=lambda x:x[2])

                    candidates = list()

                    if question_make_sense == 1:
                        # With the sense check replaced by ``if 1`` this just
                        # counts the (up to five) best results.
                        f = 0
                        for r, _, _ in resorted_list[:5]:
                            ori_sentence = r.replace(u'<END>', u'').replace(u' ', u'')
                            #if isMakeSense(ori_sentence) == 1:
                            if 1:
                                f += 1
                        if f <= 1:
                            question_make_sense = 0

                    for r, s1, s2 in resorted_list:
                        ori_sentence = r.strip().replace(u'<END>', u'')
                        ori_sentence = ori_sentence.replace(u' ', u'')
                        # Hard-coded to 1, so the 'continue2' and 'continue3'
                        # branches below cannot be taken.
                        answer_make_sense = 1#isMakeSense(ori_sentence)
                        r0 = r
                        if isinstance(r, unicode) :
                            r0 = r.encode(config.globalCharSet())
                        print r0, s1, s2, answer_make_sense

                        # Drop very short answers that merely repeat part of
                        # the input line.
                        if len(ori_sentence) <= 3 \
                            and len(ori_sentence) < len(line) and ori_sentence in line:
                            print 'continue1'
                            continue

                        if answer_make_sense == -1:
                            print 'continue2'
                            continue

                        if question_make_sense == 1 and answer_make_sense <= 0:
                            print 'continue3'
                            continue

                        # r_token_count = len(ori_sentence.strip().split(u' '))
                        # if question_word_count > 1 and r_token_count == 1:
                        #     print 'continue4'
                        #     continue
                        candidates.append((r, s2))

                    print 'variousen'

                    variousen_scope = 15
                    output_size = 5
                    high_fruq_left = 5

                    # NOTE(review): variousen_strings is assumed to return a
                    # list of indices into ``candidates`` - confirm against
                    # its definition.
                    v_index = variousen_strings(candidates[:variousen_scope], output_size)
                    # Put the indices of the best-scored candidates first,
                    # then the indices picked by variousen_strings.
                    v_index = range(min(len(candidates), high_fruq_left)) + v_index
                    # print v_index
                    # Fold that removes duplicate indices while keeping the
                    # order of first occurrence.
                    func = lambda x, y:x if y in x else x + [y]
                    v_index = reduce(func, [[], ] + v_index)

                    toReturn = [candidates[i] for i in v_index[:output_size]]
                    for r, s in toReturn :
                        print ('result: %s, score: %f.' % (r.encode(config.globalCharSet()), s))
                        fw.writelines('result: %s, score: %f.\n' % (r, s))
                    '''
Esempio n. 16
0
    def generate_b_v_t(self, input_file, output_file):
        """
        Generate a model with special optimizers.
        """
        answer_set=[]
        answer_dict={}
        for answer_smaple in answer_set:
            tmp=[]
            for i in range(len(answer_smaple)):
                tmp.append(answer_smaple[i])
                #print map(str,tmp)
                answer_dict[str(tmp)]=1
        deploy_model = self.model.get_deploy_function()
        print output_file
        start = time.clock()
        with codecs.open(output_file, 'w', config.globalCharSet()) as fw:
            with codecs.open(input_file, 'r', config.globalCharSet()) as fo:
                for line in fo.readlines() :
                    # line_word, line_zi = SegProcess(line.strip())
                    # line = line_word.decode("gb18030")
                    # line = line_word
                    line = line.strip()
                    lines=line.split('\t')
                    line1=lines[0]+'\t'+lines[2]+'\t'+lines[3]+'\n'
                    question_make_sense = 1#isMakeSense(line)
                    #print (line.encode(config.globalCharSet()))
                    #fw.writelines('%s\n' % line)
                    #line=lines[0]+'\t1\n'
                    res, score = beam_search_t(line1, self.cr, deploy_model,answer_dict, beam_size=10, search_scope=10)
                    #print score
                    if(len(res)<=0):
                        print 'not find'
                        continue
                    for i in range(1):
                        res1=[res[i][1:-1]]
                    #print res,score
                        res1 = [' '.join(self.cr.transform_input_text(s)) for s in res1]
                        try:
                            fw.write(line+'\t'+res1[0]+'\t'+str(score[i])+'\n')
                            #fw.write(line+'\n')
                        except:
                            print res
                    '''
                    resorted_list = list()
                    for r, s in zip(res, score):
                        idf = 0.0
                        tokens = r.split(u' ')
                        for token in tokens[1:-1]:
                            idf += get_idf(token)
                #                         idf /= len(tokens)
                #                         idf_revise = 1 / (1 +  np.exp(-2 / idf))
                        idf_revise = 4 * np.tanh(4 * idf)
                        resorted_list.append((r, s, s))
                    if len(line) > 3:
                        resorted_list = sorted(resorted_list, key=lambda x:x[2] / len(line) ** 1)
                    else:
                        resorted_list = sorted(resorted_list, key=lambda x:x[2])

                    candidates = list()

                    if question_make_sense == 1:
                        f = 0
                        for r, _, _ in resorted_list[:5]:
                            ori_sentence = r.replace(u'<END>', u'').replace(u' ', u'')
                            #if isMakeSense(ori_sentence) == 1:
                            if 1:
                                f += 1
                        if f <= 1:
                            question_make_sense = 0

                    for r, s1, s2 in resorted_list:
                        ori_sentence = r.strip().replace(u'<END>', u'')
                        ori_sentence = ori_sentence.replace(u' ', u'')
                        answer_make_sense = 1#isMakeSense(ori_sentence)
                        r0 = r
                        if isinstance(r, unicode) :
                            r0 = r.encode(config.globalCharSet())
                        print r0, s1, s2, answer_make_sense,

                        if len(ori_sentence) <= 3 \
                            and len(ori_sentence) < len(line) and ori_sentence in line:
                            print 'continue1'
                            continue

                        if answer_make_sense == -1:
                            print 'continue2'
                            continue

                        if question_make_sense == 1 and answer_make_sense <= 0:
                            print 'continue3'
                            continue

                #             r_token_count = len(ori_sentence.strip().split(u' '))
                #             if question_word_count > 1 and r_token_count == 1:
                #                 print 'continue4'
                #                 continue
                        candidates.append((r, s2))

                    print 'variousen'

                    variousen_scope = 15
                    output_size = 5
                    high_fruq_left = 4

                    v_index = variousen_strings(candidates[:variousen_scope], output_size)
                    v_index = range(min(len(candidates), high_fruq_left)) + v_index
            #                     print v_index
                    func = lambda x, y:x if y in x else x + [y]
                    v_index = reduce(func, [[], ] + v_index)

                    toReturn = [candidates[i] for i in v_index[:output_size]]
                    for r, s in toReturn :
                        print ('result: %s, score: %f.' % (r.encode(config.globalCharSet()), s))
                        fw.writelines('result: %s, score: %f.\n' % (r, s))

                    for r in res[0:5] :
                        #fw.writelines('result: %s, score: %f\n' % (r.encode(config.globalCharSet()), s))
                        fw.writelines('result: %s, score: %f\n' % (r, s))
                    fw.writelines('\n')
                    '''
        end = time.clock()
        print "read: %f s" % (end - start)
Esempio n. 17
0


if __name__ == '__main__':
    # Command-line entry point. Positional arguments:
    #   1     dataset folder       (resolved under ./data)
    #   2     dataset file         (resolved under ./data)
    #   3     dictionary file      (resolved under ./data)
    #   4     stopwords file       (resolved under ./data)
    #   5     word embedding file  (resolved under ./data)
    #   6-8   train / valid / test rates, parsed as floats
    #   9     algorithm name
    #   10    mode
    base_path = os.path.join(os.getcwd(), 'data')
    dataset_folder = os.path.join(base_path, sys.argv[1])
    dataset_file = os.path.join(base_path, sys.argv[2])
    dict_file = os.path.join(base_path, sys.argv[3])
    stopwords_file = os.path.join(base_path, sys.argv[4])
    word_embedding_file = os.path.join(base_path, sys.argv[5])
    # Use the built-in float() rather than string.atof(): atof has been
    # deprecated in favour of float() since Python 2.0 and does not exist
    # in Python 3, while float() parses the same input on both.
    train_rate = float(sys.argv[6])
    valid_rate = float(sys.argv[7])
    test_rate = float(sys.argv[8])
    algo_name = sys.argv[9]
    mode = sys.argv[10]
    # The character set comes from the project configuration, not argv.
    charset = config.globalCharSet()
    # Echo the resolved settings so a run can be checked from its log.
    print('dataset file: %s.' % (dataset_file))
    print('dict file: %s.' % (dict_file))
    print('stopwords file: %s.' % (stopwords_file))
    print('word embedding file: %s.' % (word_embedding_file))
    print('algorithms name: %s.' % (algo_name))
    print('mode: %s.' % (mode))
    print('charset: %s.' % (charset))
        
    if algo_name == 'SeqToSeq' :
        from deep.manage.model.seq_to_seq import RnnEncoderDecoder
        manager = RnnEncoderDecoder(dataset_folder, dataset_file, dict_file, stopwords_file, word_embedding_file,
                                    train_rate, valid_rate, test_rate, algo_name, charset, mode) 
    elif algo_name == 'ChoEncoderDecoder' :
        from deep.manage.model.cho_encoder_decoder import RnnEncoderDecoder
        manager = RnnEncoderDecoder(dataset_folder, dataset_file, dict_file, stopwords_file, word_embedding_file,
Esempio n. 18
0
    dict_file = sys.argv[3]
    stopwords_file = sys.argv[4]
    word_embedding_file = sys.argv[5]
    '''
    dataset_folder = os.path.join(base_path, sys.argv[1]) 
    dataset_file = os.path.join(base_path, sys.argv[2])
    dict_file = os.path.join(base_path, sys.argv[3])
    stopwords_file = os.path.join(base_path, sys.argv[4])
    word_embedding_file = os.path.join(base_path, sys.argv[5])
    '''
    train_rate = string.atof(sys.argv[6])
    valid_rate = string.atof(sys.argv[7])
    test_rate = string.atof(sys.argv[8])
    algo_name = sys.argv[9]
    mode = sys.argv[10]
    charset = config.globalCharSet()
    print('dataset file: %s.' % (dataset_file))
    print('dict file: %s.' % (dict_file))
    print('stopwords file: %s.' % (stopwords_file))
    print('word embedding file: %s.' % (word_embedding_file))
    print('algorithms name: %s.' % (algo_name))
    print('mode: %s.' % (mode))
    print('charset: %s.' % (charset))

    if algo_name == 'SeqToSeq':
        from deep.manage.model.seq_to_seq import RnnEncoderDecoder
        manager = RnnEncoderDecoder(dataset_folder, dataset_file, dict_file,
                                    stopwords_file, word_embedding_file,
                                    train_rate, valid_rate, test_rate,
                                    algo_name, charset, mode)
    elif algo_name == 'ChoEncoderDecoder':