Ejemplo n.º 1
0
    def generate_one_question_b_v(self, question, deploy_model):
        question_make_sense = isMakeSense(question)
        res, score = beam_search(question, self.cr, deploy_model,
                                              beam_size=100,
                                              search_scope=100,
                                              output_size=50)
        print res
        res = [' '.join(self.cr.transform_input_text(s)) for s in res]
        
        resorted_list = list()
        for r, s in zip(res, score):
            idf = 0.0
            tokens = r.split(u' ')
            for token in tokens[1:-1]:
                idf += get_idf(token)
#                         idf /= len(tokens)
#                         idf_revise = 1 / (1 +  np.exp(-2 / idf))
            idf_revise = 4 * np.tanh(4 * idf)
            resorted_list.append((r, s, s))
        if len(question) > 3:   
            resorted_list = sorted(resorted_list, key=lambda x:x[2] / len(question) ** 1)
        else:
            resorted_list = sorted(resorted_list, key=lambda x:x[2])
            
        candidates = list()
            
        if question_make_sense == 1:
            f = 0
            for r, _, _ in resorted_list[:5]:
                ori_sentence = r.replace(u'<END>', u'').replace(u' ', u'')
                if isMakeSense(ori_sentence) == 1:
                    f += 1
            if f <= 1:
                question_make_sense = 0
                
        for r, s1, s2 in resorted_list:
            ori_sentence = r.strip().replace(u'<END>', u'')
            ori_sentence = ori_sentence.replace(u' ', u'')
            answer_make_sense = isMakeSense(ori_sentence)
            r0 = r
            if isinstance(r, unicode) :
                r0 = r.encode(config.globalCharSet())
            print r0, s1, s2, answer_make_sense,
            
            if len(ori_sentence) <= 3 \
                and len(ori_sentence) < len(question) and ori_sentence in question:
                print 'continue1'
                continue
                    
            if answer_make_sense == -1 or u'ϵͳ' in ori_sentence or u'NUM' in ori_sentence:
                print 'continue2'
                continue
            
            if question_make_sense == 1 and answer_make_sense <= 0:
                print 'continue3'
                continue
            
#             r_token_count = len(ori_sentence.strip().split(u' '))
#             if question_word_count > 1 and r_token_count == 1:
#                 print 'continue4'
#                 continue
            candidates.append((r, s2))
        
        print 'variousen'
        
        variousen_scope = 15
        output_size = 5
        high_fruq_left = 4
        
        if len(candidates) == 0:
            return
        candidates, _ = zip(*candidates)
        
#                     v_index = variousen_strings(candidates[:variousen_scope], output_size)
#                     v_index = range(min(len(candidates), high_fruq_left)) + v_index
#             #                     print v_index
#                     func = lambda x, y:x if y in x else x + [y]
#                     v_index = reduce(func, [[], ] + v_index)
#                     toReturn = [candidates[i] for i in v_index[:output_size]]
        toReturn = candidates[:output_size]
        return toReturn    
Ejemplo n.º 2
0
    def generate_b_v_t_g(self, input_file, output_file):
        """
        Generate a model with special optimizers.
        """
        deploy_model = self.model.get_deploy_function()
        #get_cost= self.model.get_cost()
        print output_file
        print 'generate_b_v_t_g'
        with codecs.open(output_file, 'w', config.globalCharSet()) as fw:
            with codecs.open(input_file, 'r', config.globalCharSet()) as fo:
                for line in fo.readlines() :
                    # line_word, line_zi = SegProcess(line.strip())
                    # line = line_word.decode("gb18030")
                    # line = line_word
                    line = line.strip()
                    lines = line.strip().split('\t')
                    #(question, question_mask) = self.cr.transform_input_data(lines[0])
                    #(answer, answer_mask) = self.cr.transform_input_data(lines[1])
                    #qa_cost=get_cost(question, question_mask,answer,answer_mask,[[string.atoi(lines[2])]])
                    #fw.write(line+'\t'+str(qa_cost)+'\n')
                    
                    question_make_sense = 1#isMakeSense(line)
                    print (line.encode(config.globalCharSet()))
                    #fw.writelines('%s\n' % line)
                    if(len(lines)==3):
                        line=''
                        line+=lines[0]
                        line+='\t'
                        line+=lines[2]
                    fw.writelines('%s\n' % line)
                    res, score = beam_search_t(line, self.cr, deploy_model, beam_size=200, search_scope=100)
                    print res
                    res1= [s[:-1] for s in res]
                    res2= [s[-1] for s in res]
                    res = [' '.join(self.cr.transform_input_text(s)) for s in res1]
                    for res_len in range(len(res)):
                        res[res_len]+='\t'
                        res[res_len]+=str(res2[res_len])

                    resorted_list = list()
                    for r, s in zip(res, score):
                        idf = 0.0
                        tokens = r.split(u' ')
                        for token in tokens[1:-1]:
                            idf += get_idf(token)
                #                         idf /= len(tokens)
                #                         idf_revise = 1 / (1 +  np.exp(-2 / idf))
                        idf_revise = 4 * np.tanh(4 * idf)
                        resorted_list.append((r, s, s))
                    if len(line) > 3:
                        resorted_list = sorted(resorted_list, key=lambda x:x[2] / len(line) ** 1)
                    else:
                        resorted_list = sorted(resorted_list, key=lambda x:x[2])

                    candidates = list()

                    if question_make_sense == 1:
                        f = 0
                        for r, _, _ in resorted_list[:5]:
                            ori_sentence = r.replace(u'<END>', u'').replace(u' ', u'')
                            #if isMakeSense(ori_sentence) == 1:
                            if 1:
                                f += 1
                        if f <= 1:
                            question_make_sense = 0

                    for r, s1, s2 in resorted_list:
                        ori_sentence = r.strip().replace(u'<END>', u'')
                        ori_sentence = ori_sentence.replace(u' ', u'')
                        answer_make_sense = 1#isMakeSense(ori_sentence)
                        r0 = r
                        if isinstance(r, unicode) :
                            r0 = r.encode(config.globalCharSet())
                        print r0, s1, s2, answer_make_sense

                        if len(ori_sentence) <= 3 \
                            and len(ori_sentence) < len(line) and ori_sentence in line:
                            print 'continue1'
                            continue

                        if answer_make_sense == -1:
                            print 'continue2'
                            continue

                        if question_make_sense == 1 and answer_make_sense <= 0:
                            print 'continue3'
                            continue

                #             r_token_count = len(ori_sentence.strip().split(u' '))
                #             if question_word_count > 1 and r_token_count == 1:
                #                 print 'continue4'
                #                 continue
                        candidates.append((r, s2))

                    print 'variousen'

                    variousen_scope = 15
                    output_size = 5
                    high_fruq_left = 5

                    v_index = variousen_strings(candidates[:variousen_scope], output_size)
                    v_index = range(min(len(candidates), high_fruq_left)) + v_index
            #                     print v_index
                    func = lambda x, y:x if y in x else x + [y]
                    v_index = reduce(func, [[], ] + v_index)

                    toReturn = [candidates[i] for i in v_index[:output_size]]
                    for r, s in toReturn :
                        print ('result: %s, score: %f.' % (r.encode(config.globalCharSet()), s))
                        fw.writelines('result: %s, score: %f.\n' % (r, s))
                    '''