Beispiel #1
0
def JOIN(text, trans, langin, langout, change, un):
    i = 0
    j = 0            
    while i < len(trans):
        try:           
            # print 'orig: ', text[i+1], 'trans:', trans[i]
            if change:                                
                prob = Translation.objects.get(orig=trans[i], lang_orig=langout, trans=text[i+1], lang_trans=langin).probability
            else:
                prob = Translation.objects.get(orig=text[i+1], lang_orig=langin, trans=trans[i], lang_trans=langout).probability
            # print 'ok'
        except:
            prob = 0.0
            # print 'fail'
            
        if prob > 0.0:
            ngramm = make_string([text[i], text[i+1]])                        
            res_text = text[0:i] + [ngramm] + text[i+2:len(text)]            
            res_trans = trans[0:i] + [trans[i]] + trans[i+2:len(trans)]            
            un_new = uncertainty(res_text, langin, res_trans, langout, change)            
            if un_new < un:
                text = res_text
                trans = res_trans
                un = un_new     
                print 'HYPO: ', make_string(trans), ' PP: ', un                
                
        i += 1            
    return {'orig': text, 'trans': trans, 'un': un}
Beispiel #2
0
def JOIN(text, trans, langin, langout, change, un):
    i = 0
    j = 0
    while i < len(trans):
        try:
            # print 'orig: ', text[i+1], 'trans:', trans[i]
            if change:
                prob = Translation.objects.get(orig=trans[i],
                                               lang_orig=langout,
                                               trans=text[i + 1],
                                               lang_trans=langin).probability
            else:
                prob = Translation.objects.get(orig=text[i + 1],
                                               lang_orig=langin,
                                               trans=trans[i],
                                               lang_trans=langout).probability
            # print 'ok'
        except:
            prob = 0.0
            # print 'fail'

        if prob > 0.0:
            ngramm = make_string([text[i], text[i + 1]])
            res_text = text[0:i] + [ngramm] + text[i + 2:len(text)]
            res_trans = trans[0:i] + [trans[i]] + trans[i + 2:len(trans)]
            un_new = uncertainty(res_text, langin, res_trans, langout, change)
            if un_new < un:
                text = res_text
                trans = res_trans
                un = un_new
                print 'HYPO: ', make_string(trans), ' PP: ', un

        i += 1
    return {'orig': text, 'trans': trans, 'un': un}
Beispiel #3
0
def join_by_n(words, n):
    # Group `words` into n-gram strings, one per start position.  The index
    # advances by 1, so the windows overlap; the trailing "# n" comment
    # suggests a step of `n` (non-overlapping windows) may have been the
    # original intent -- TODO confirm against callers before changing.
    tmp = []            
    i = 0
    while i < len(words):
        if i+n < len(words):
            tmp.append(make_string(words[i:i+n]))
        else:
            # tail shorter than (or exactly) n words: take what is left
            tmp.append(make_string(words[i:len(words)]))
        i += 1  # n
    return tmp
Beispiel #4
0
def join_by_n(words, n):
    """Return one n-gram string per start position in `words`.

    Python slicing clamps at the end of the list, so the explicit bounds
    check of the original collapses into a single slice expression:
    words[start:start+n] equals words[start:len(words)] near the end.
    """
    result = []
    for start in range(len(words)):
        result.append(make_string(words[start:start + n]))
    return result
Beispiel #5
0
def JOIN_P(text, trans, langin, langout, change, pp):
    print 'JOIN'
    i = 0
    j = 0            
    while i < len(trans):
        try:           
            # print 'orig: ', text[i+1], 'trans:', trans[i]
            if change:                                
                prob = Translation.objects.get(orig=trans[i], lang_orig=langout, trans=text[i+1], lang_trans=langin).probability
            else:
                prob = Translation.objects.get(orig=text[i+1], lang_orig=langin, trans=trans[i], lang_trans=langout).probability
            # print 'ok'
        except:
            prob = 0.0
            # print 'fail'
            
        if prob > 0.0:
            ngramm = make_string([text[i], text[i+1]])                        
            res_text = text[0:i] + [ngramm] + text[i+2:len(text)]            
            res_trans = trans[0:i] + [trans[i]] + trans[i+2:len(trans)]          
            new_pp = perplexity(res_trans, langout, 5)             
            # print 'HYPO: ', make_string(res_trans), ' PP: ', new_pp
            if new_pp <= pp:
                text = res_text
                trans = res_trans
                pp = new_pp
                # print 'HYPO: ', make_string(trans), ' PP: ', new_pp
                
        i += 1            
    return {'orig': text, 'trans': trans, 'pp': pp}
Beispiel #6
0
def uncertainty(orig, langin, trans, langout, change):
    """Score a translation hypothesis (lower is better).

    Combines a 5-gram entropy estimate of the translation string with the
    base-2 log translation probabilities of each aligned (orig, trans) pair.
    Unknown n-grams or missing translation entries add a -99999 penalty.
    Returns the exponent directly when it is large (> 10) to avoid overflow
    in pow(); callers only compare these scores.
    """
    t = make_string(trans)
    words = split_n_gramm(t)
    sum_entropy = 0.0
    i = 0
    n = 5
    while i < len(words):
        try:
            if i + n < len(words):
                sum_entropy += log(
                    n_gramm_estimation(words[i:i + n], langout, n), 2)
            else:
                # tail shorter than n: estimate with its actual length
                sum_entropy += log(
                    n_gramm_estimation(words[i:len(words)], langout,
                                       len(words) - i), 2)
        except Exception:
            # unknown n-gram: heavy penalty instead of aborting
            sum_entropy += -99999
        # BUGFIX: advance outside the try -- the original only stepped `i` on
        # success, so a single failing n-gram looped forever.
        i += n

    sum_max_prob = 0.0
    e_log = sum_entropy
    for (i, n_gramm) in enumerate(trans):
        try:
            # Both lookups raise DoesNotExist when the n-gram is unknown;
            # the whole pair is then penalised below.
            t_gramm = Ngramm.objects.get(n_gramm=n_gramm, lang=langout)
            o_gramm = Ngramm.objects.get(n_gramm=orig[i], lang=langout)
            if change:
                sum_max_prob += log(
                    Translation.objects.get(orig=n_gramm,
                                            lang_orig=langout,
                                            trans=orig[i],
                                            lang_trans=langin).probability, 2)
            else:
                sum_max_prob += log(
                    Translation.objects.get(orig=orig[i],
                                            lang_orig=langin,
                                            trans=n_gramm,
                                            lang_trans=langout).probability, 2)
        except Exception:
            sum_max_prob += -99999

    power = -1 * (e_log + sum_max_prob / len(trans))
    # Large exponents would overflow pow(2, power); return the exponent
    # itself, which preserves the ordering used by callers.
    if power > 10:
        return power
    return pow(2, power)
Beispiel #7
0
def language_model(word, seq, lang, size):
    """Add-one smoothed conditional probability P(word | seq).

    phrase_freq: count of the full phrase `seq word`; seq_freq: total count
    of all order-`size` n-grams starting with `seq`; V: vocabulary size of
    order-`size` n-grams, used as the add-one smoothing denominator term.
    """
    phrase = make_string([seq, word])
    try:
        phrase_freq = Ngramm.objects.get(n_gramm=phrase, lang=lang).frequence
    except Exception:
        phrase_freq = 0

    try:
        # BUGFIX: aggregate() returns a dict like {'frequence__sum': value},
        # and the sum is None when no rows match -- extract the number so the
        # arithmetic below does not add a dict to an int.
        seq_freq = Ngramm.objects.filter(
            n_gramm__istartswith=seq, lang=lang,
            n=size).aggregate(Sum('frequence'))['frequence__sum'] or 0
    except Exception:
        seq_freq = 0
    V = Ngramm.objects.filter(n=size, lang=lang).count()

    p = float(1 + phrase_freq) / (seq_freq + V)
    return p
Beispiel #8
0
def cross_entropy(text, langout, size):
    """Mean base-2 log estimate of `text` split into order-`size` n-grams.

    The text is re-joined and re-split into n-grams; unknown n-grams
    contribute a -99999 penalty instead of raising.
    """
    text = make_string(text)
    # split into higher-order n-grams
    words = split_n_gramm(text)
    total = 0.0
    i = 0
    while i < len(words):
        # tail shorter than `size` is estimated with its actual length
        if i + size < len(words):
            chunk, order = words[i:i + size], size
        else:
            chunk, order = words[i:len(words)], len(words) - i
        try:
            total += log(n_gramm_estimation(chunk, langout, order), 2)
        except:
            total += -99999
        i += size

    return total / len(words)
Beispiel #9
0
def uncertainty(orig, langin, trans, langout, change):
    """Score a translation hypothesis (lower is better).

    Sums base-2 log estimates of the translation's 5-grams plus the log
    translation probabilities of each aligned (orig, trans) pair; unknown
    entries add a -99999 penalty.  Returns the exponent directly when it is
    large (> 10) to avoid overflow in pow(); callers only compare scores.
    """
    t = make_string(trans)
    words = split_n_gramm(t)
    sum_entropy = 0.0
    i = 0
    n = 5
    while i < len(words):
        try:
            if i + n < len(words):
                sum_entropy += log(n_gramm_estimation(words[i:i + n], langout, n), 2)
            else:
                # tail shorter than n: estimate with its actual length
                sum_entropy += log(n_gramm_estimation(words[i:len(words)], langout,
                                                      len(words) - i), 2)
        except Exception:
            # unknown n-gram: heavy penalty instead of aborting
            sum_entropy += -99999
        # BUGFIX: advance outside the try -- the original only stepped `i` on
        # success, so a single failing n-gram looped forever.
        i += n

    sum_max_prob = 0.0
    e_log = sum_entropy
    for (i, n_gramm) in enumerate(trans):
        try:
            # Both lookups raise when the n-gram is unknown; penalised below.
            t_gramm = Ngramm.objects.get(n_gramm=n_gramm, lang=langout)
            o_gramm = Ngramm.objects.get(n_gramm=orig[i], lang=langout)
            if change:
                sum_max_prob += log(Translation.objects.get(
                    orig=n_gramm, lang_orig=langout, trans=orig[i],
                    lang_trans=langin).probability, 2)
            else:
                sum_max_prob += log(Translation.objects.get(
                    orig=orig[i], lang_orig=langin, trans=n_gramm,
                    lang_trans=langout).probability, 2)
        except Exception:
            sum_max_prob += -99999

    power = -1 * (e_log + sum_max_prob / len(trans))
    # Large exponents would overflow pow(2, power); returning the exponent
    # preserves the ordering used by callers.
    if power > 10:
        return power
    return pow(2, power)
Beispiel #10
0
def language_model(word, seq, lang, size):
    """Add-one smoothed conditional probability P(word | seq).

    phrase_freq: count of the full phrase `seq word`; seq_freq: total count
    of all order-`size` n-grams starting with `seq`; V: vocabulary size of
    order-`size` n-grams, used as the add-one smoothing denominator term.
    """
    phrase = make_string([seq, word])
    try:
        phrase_freq = Ngramm.objects.get(n_gramm=phrase, lang=lang).frequency
    except Exception:
        phrase_freq = 0

    try:
        # BUGFIX: aggregate() returns a dict like {'frequency__sum': value},
        # and the sum is None when no rows match -- extract the number so the
        # arithmetic below does not add a dict to an int.
        seq_freq = Ngramm.objects.filter(
            n_gramm__istartswith=seq, lang=lang,
            n=size).aggregate(Sum('frequency'))['frequency__sum'] or 0
    except Exception:
        seq_freq = 0
    V = Ngramm.objects.filter(n=size, lang=lang).count()

    p = float(1 + phrase_freq) / (seq_freq + V)
    return p
Beispiel #11
0
def cross_entropy(text, langout, size):
    """Mean base-2 log estimate of `text` split into order-`size` n-grams.

    Unknown n-grams contribute a -99999 penalty instead of raising.
    """
    sum = 0.0  # NOTE(review): shadows the builtin `sum`
    text = make_string(text)
    words = split_n_gramm(text)
    # split into higher-order n-grams
    # words = join_by_n(text, size)
    i = 0
    while i < len(words):
        try:
            if i + size < len(words):
                sum += log(
                    n_gramm_estimation(words[i:i + size], langout, size), 2)
            else:
                # tail shorter than `size`: estimate with its actual length
                sum += log(
                    n_gramm_estimation(words[i:len(words)], langout,
                                       len(words) - i), 2)
        except:
            # unknown n-gram: heavy penalty instead of aborting
            sum += -99999
        i += size

    return sum / len(words)
Beispiel #12
0
def CHANGE(words, trans, langin, langout, change, un):
    """Replace single translation tokens with better-scoring alternatives.

    For each source word, candidates from CHANGE10 are tried in order; the
    first one that lowers the uncertainty `un` is kept and the scan moves on
    to the next position.  Returns {'orig', 'trans', 'un'}.
    """
    i = 0  # counts rejected hypotheses; not used for control flow
    j = 0
    n = len(words)    
    # get all combinations for change
    # while i <= n*n:
        # build hypothesis hypo
    for (j, word) in enumerate(words):
        # print j
        changes = CHANGE10(word, trans[j], langin, langout, change)['top_ten']   
        # CHANGE10's top_ten is a list of lists; flatten into one candidate list
        changes = [item for sublist in changes for item in sublist]            
        for c in changes:
            hypo = trans[0:j] + [c] + trans[j+1:len(trans)]              
            # new_pp = perplexity(trans, langout, 5)
            un_new = uncertainty(words, langin, hypo, langout, change)           
            if un_new < un: 
                trans = hypo
                un = un_new
                print 'HYPO: ', make_string(hypo), ' UNS: ', un_new  
                break                
            else:
                i += 1    
    return {'orig': words, 'trans': trans, 'un': un}
Beispiel #13
0
def CHANGE(words, trans, langin, langout, change, un):
    i = 0
    j = 0
    n = len(words)
    # получаем все комбинации для change
    # while i <= n*n:
    # создаем гипотезу hypo
    for (j, word) in enumerate(words):
        # print j
        changes = CHANGE10(word, trans[j], langin, langout, change)['top_ten']
        changes = [item for sublist in changes for item in sublist]
        for c in changes:
            hypo = trans[0:j] + [c] + trans[j + 1:len(trans)]
            # new_pp = perplexity(trans, langout, 5)
            un_new = uncertainty(words, langin, hypo, langout, change)
            if un_new < un:
                trans = hypo
                un = un_new
                print 'HYPO: ', make_string(hypo), ' UNS: ', un_new
                break
            else:
                i += 1
    return {'orig': words, 'trans': trans, 'un': un}
Beispiel #14
0
def JOIN_P(text, trans, langin, langout, change, pp):
    """Perplexity-driven variant of JOIN.

    Merges adjacent source words into n-grams and keeps a merge whenever the
    5-gram perplexity of the resulting translation does not increase.
    Returns {'orig', 'trans', 'pp'}.
    """
    print 'JOIN'
    i = 0
    j = 0  # unused
    while i < len(trans):
        try:
            # print 'orig: ', text[i+1], 'trans:', trans[i]
            # lookup direction depends on `change` (swapped language pair)
            if change:
                prob = Translation.objects.get(orig=trans[i],
                                               lang_orig=langout,
                                               trans=text[i + 1],
                                               lang_trans=langin).probability
            else:
                prob = Translation.objects.get(orig=text[i + 1],
                                               lang_orig=langin,
                                               trans=trans[i],
                                               lang_trans=langout).probability
            # print 'ok'
        except:
            # missing translation entry (or i+1 past the end): zero probability
            prob = 0.0
            # print 'fail'

        if prob > 0.0:
            # build the merged hypothesis: join two source words, drop one
            # translation token
            ngramm = make_string([text[i], text[i + 1]])
            res_text = text[0:i] + [ngramm] + text[i + 2:len(text)]
            res_trans = trans[0:i] + [trans[i]] + trans[i + 2:len(trans)]
            new_pp = perplexity(res_trans, langout, 5)
            # print 'HYPO: ', make_string(res_trans), ' PP: ', new_pp
            # accept the merge when perplexity does not get worse
            if new_pp <= pp:
                text = res_text
                trans = res_trans
                pp = new_pp
                # print 'HYPO: ', make_string(trans), ' PP: ', new_pp

        i += 1
    return {'orig': text, 'trans': trans, 'pp': pp}
Beispiel #15
0
def n_gramm_estimation(n_gramm, lang, size):
    """Estimate P(last word of `n_gramm` | its preceding words)."""
    last_word = make_string([n_gramm[size - 1]])
    prefix = make_string(n_gramm[0:size - 1])
    return language_model(last_word, prefix, lang, size)
Beispiel #16
0
def translating(msg, langin, langout, change):
    """Top-level translation loop: direct translation plus hill-climbing.

    Splits `msg` into sentences, produces a direct translation, then
    repeatedly applies improve()/improve_n() and keeps only hypotheses that
    lower the uncertainty score.  Returns the detokenized final translation.
    """
    print 'START TRANSLATION', datetime.datetime.now()
    t = time.time()
    text = split_by_sentences(msg)
    result = []
    words = []
    for s in text:
        words += encode_phrase(s)

    # compute the direct (word-by-word) translation
    trans = simple_translation(words, langin, langout, change)
    # compute the uncertainty score of the obtained translation
    un = uncertainty(words, langin, trans, langout, change)
    un_old = un
    print 'TRANS', make_final_string(trans), 'uns', un
    pp = perplexity(trans, langout, 5)

    i = 0  # consecutive rejections; the outer loop stops after 100 of them
    j = 0
    k = 1  # rounds in which all improve_n orders failed
    reject = True
    n = len(words)
    ok = True
    # get all combinations for change
    while i < 100:
        # build hypothesis hypo
        # print i
        if not reject:
            # last round improved the score: reset the rejection counters
            i = 0
            k = 1
        j = 0
        print k
        pair = improve(words, trans, langin, langout, change, pp)
        tmp_words = pair['orig']
        tmp_trans = pair['trans']
        pp_new = pair['pp']

        # try improve_n with orders 1..3 on top of the improved pair
        for j in range(1, 4):
            pair = improve_n(j, tmp_words, tmp_trans, langin, langout, change,
                             pp_new)
            tmp2_words = pair['orig']
            tmp2_trans = pair['trans']
            pp_new2 = pair['pp']

            un_new = uncertainty(tmp2_words, langin, tmp2_trans, langout,
                                 change)
            # accept only clear improvements (at least 0.001 better)
            if un - un_new >= 0.001:
                un = un_new
                words = tmp2_words
                trans = tmp2_trans
                pp = pp_new2
                print 'HYPO: ', make_string(trans), ' PP: ', pp
                reject = False
                break
            else:
                reject = True
                i += 1
        if j == 3:
            # all three orders failed this round; give up after 9 such rounds
            k += 1
            if k > 9:
                break

    print 'TRANS', make_final_string(trans), 'uns-old', un_old, 'uns', un_new
    print i
    print 'END TRANSLATION', datetime.datetime.now()
    t = time.time() - t
    print 'TIME: ', t
    # result += pair['trans']
    # somewhere around here detokenization takes place
    return make_final_string(trans)
Beispiel #17
0
def translating(msg, langin, langout, change):
    """Top-level translation loop: direct translation plus hill-climbing.

    Splits `msg` into sentences, produces a direct translation, then
    repeatedly applies improve()/improve_n() and keeps only hypotheses that
    lower the uncertainty score.  Returns the detokenized final translation.
    """
    print 'START TRANSLATION', datetime.datetime.now()
    t = time.time() 
    text = split_by_sentences(msg)
    result = []
    words = []
    for s in text:        
        words += encode_phrase(s)
                
    # compute the direct (word-by-word) translation
    trans = simple_translation(words, langin, langout, change)      
    # compute the uncertainty score of the obtained translation
    un = uncertainty(words, langin, trans, langout, change)              
    un_old = un
    print 'TRANS', make_final_string(trans), 'uns', un
    pp = perplexity(trans, langout, 5)
    
    i = 0  # consecutive rejections; the outer loop stops after 100 of them
    j = 0    
    k = 1  # rounds in which all improve_n orders failed
    reject = True
    n = len(words)    
    ok = True
    # get all combinations for change
    while i < 100:
        # build hypothesis hypo
        # print i
        if not reject:
            # last round improved the score: reset the rejection counters
            i = 0
            k = 1
        j = 0
        print k
        pair = improve(words, trans, langin, langout, change, pp)
        tmp_words = pair['orig']    
        tmp_trans = pair['trans']           
        pp_new = pair['pp']
        
        # try improve_n with orders 1..3 on top of the improved pair
        for j in range(1,4):
            pair = improve_n(j, tmp_words, tmp_trans, langin, langout, change, pp_new)
            tmp2_words = pair['orig']    
            tmp2_trans = pair['trans']                       
            pp_new2 = pair['pp']
                          
            un_new = uncertainty(tmp2_words, langin, tmp2_trans, langout, change)                      
            # accept only clear improvements (at least 0.001 better)
            if un - un_new >= 0.001:
                un = un_new    
                words = tmp2_words
                trans = tmp2_trans
                pp = pp_new2            
                print 'HYPO: ', make_string(trans), ' PP: ', pp                
                reject = False
                break
            else:             
                reject = True            
                i += 1
        if j == 3:
            # all three orders failed this round; give up after 9 such rounds
            k += 1
            if k > 9:
                break
            
    print 'TRANS', make_final_string(trans), 'uns-old', un_old, 'uns', un_new
    print i
    print 'END TRANSLATION', datetime.datetime.now()    
    t = time.time() - t 
    print 'TIME: ', t
    # result += pair['trans']
    # somewhere around here detokenization takes place
    return make_final_string(trans)
Beispiel #18
0
def spec_split(sp):
    """Return the two binary splits of token list `sp`.

    v1 joins everything but the last token; v2 joins everything but the
    first.  Result: [[joined-head, last], [first, joined-tail]].
    """
    head_join = [make_string(sp[:-1]), sp[-1]]
    tail_join = [sp[0], make_string(sp[1:])]
    return [head_join, tail_join]
Beispiel #19
0
def spec_split(sp):
    """Return the two binary splits of token list `sp`.

    v1 joins everything but the last token; v2 joins everything but the
    first.  Result: [[joined-head, last], [first, joined-tail]].
    """
    v1 = [make_string(sp[0:len(sp)-1]), sp[len(sp)-1]]
    v2 = [sp[0], make_string(sp[1:len(sp)])]
    return [v1, v2]
Beispiel #20
0
def n_gramm_estimation(n_gramm, lang, size):          
    # P(last word of `n_gramm` | its preceding words) via the language model.
    return language_model(make_string([n_gramm[size-1]]), make_string(n_gramm[0:size-1]), lang, size)