コード例 #1
0
ファイル: features.py プロジェクト: sortiz/bicleaner
def feature_dict_qmax(slwords,
                      tlwords,
                      dict_stot,
                      normalize_by_length,
                      treat_oovs,
                      dict_ttos,
                      limit=20,
                      fv=1):
    """Return exp of the summed log of the best score per long target word.

    Only the ``limit`` longest words of ``tlwords`` are scored.  For each of
    them the maximum is taken over ``dict_stot.get_prob_alpha(slword, tlword)``
    for the source words that match ``regex_alpha`` and are keys of
    ``dict_stot.d``, and ``dict_stot.get_prob_nonalpha(slword, tlword)`` for
    the source words that do not match ``regex_alpha``.  When there is no
    source candidate at all, ``dict_stot.smooth`` is used instead.

    Args:
        slwords: iterable of source-side words.
        tlwords: sized iterable of target-side words.
        dict_stot: object exposing ``d``, ``smooth``, ``get_prob_alpha`` and
            ``get_prob_nonalpha``.
        normalize_by_length: when true, divide the log score by a
            length-based divisor before exponentiating (see ``fv``).
        treat_oovs: when true, target words not contained in ``dict_ttos``
            are not looked up in ``dict_stot``.
        dict_ttos: container supporting ``in``; only consulted when
            ``treat_oovs`` is true.
        limit: maximum number of (longest) target words that are scored.
        fv: feature version.  The default ``1`` keeps the historical
            behaviour: target words missing from ``dict_ttos`` contribute
            nothing, and the divisor is ``max(len(tlwords), limit)``.
            With ``fv >= 2`` such words add ``log(0.0000001)`` and the
            divisor is ``max(1, min(len(tlwords), limit))``, i.e. the number
            of words actually scored, never zero.

    Returns:
        ``math.exp`` of the (optionally normalized) log score, as a float.

    Raises:
        ValueError: from ``math.log`` if a best score is not positive.
        ZeroDivisionError: only with ``fv < 2``, when normalizing an empty
            ``tlwords`` with ``limit`` equal to 0.
    """
    # Split the source words: alphabetic words known to the dictionary versus
    # everything that does not match regex_alpha.  Alphabetic words missing
    # from dict_stot.d are dropped.
    slwords_s_a = set()
    slwords_s_n = set()
    for slword in slwords:
        if regex_alpha.match(slword):
            if slword in dict_stot.d:
                slwords_s_a.add(slword)
        else:
            slwords_s_n.add(slword)

    # Longest target words first; sorted() is stable, so equally long words
    # keep their original relative order.
    scored_tlwords = sorted(tlwords, key=len, reverse=True)[0:limit]

    logresult = 0
    for tlword in scored_tlwords:
        if treat_oovs and tlword not in dict_ttos:
            # Out-of-vocabulary target word: penalized from fv 2 onwards,
            # silently ignored before (kept for backward compatibility).
            if fv >= 2:
                logresult += math.log(0.0000001)
            continue
        t = [
            dict_stot.get_prob_alpha(slword, tlword)
            for slword in slwords_s_a
        ]
        t.extend([
            dict_stot.get_prob_nonalpha(slword, tlword)
            for slword in slwords_s_n
        ])
        prob = max(t, default=dict_stot.smooth)
        logresult += math.log(prob)

    if normalize_by_length:
        if fv >= 2:
            # Divide by the number of scored words; the outer max prevents a
            # zero division when the target sentence is empty.
            divisor = max(1, min(len(tlwords), limit))
        else:
            # Historical divisor: at least `limit`, even for short sentences.
            divisor = max(len(tlwords), limit)
        logresult = float(logresult) / float(divisor)

    return math.exp(logresult)
コード例 #2
0
def feature_dict_qmax_nosmooth_nolimit_freq(slwords, tlwords, dict_stot,
                                            normalize_by_length, tlwordfreqs,
                                            fv):
    """Frequency-weighted variant of the best-score-per-target-word feature.

    Every target word is scored (no limit).  Its best score is the maximum of
    ``dict_stot.get_prob_alpha`` over the dictionary-known alphabetic source
    words and ``dict_stot.get_prob_nonalpha`` over the non-alphabetic source
    words plus the literal ``"NULL"``.  That maximum is raised to the power
    ``1 - freq / total`` where ``freq`` comes from
    ``tlwordfreqs.get_word_freq`` and ``total`` is the sum of those values
    over all target word occurrences, before its log is accumulated.

    Args:
        slwords: iterable of source-side words.
        tlwords: sized iterable of target-side words.
        dict_stot: object exposing ``d``, ``smooth``, ``get_prob_alpha`` and
            ``get_prob_nonalpha``.
        normalize_by_length: when true, divide the log score by the number
            of target words before exponentiating.
        tlwordfreqs: object exposing ``get_word_freq(word)``.
        fv: feature version; with ``fv >= 2`` the divisor is at least 1,
            otherwise an empty ``tlwords`` raises ``ZeroDivisionError``.

    Returns:
        ``math.exp`` of the (optionally normalized) log score.
    """
    alpha_known = set()
    non_alpha = set()
    for word in slwords:
        if not regex_alpha.match(word):
            non_alpha.add(word)
        elif word in dict_stot.d:
            alpha_known.add(word)
    non_alpha.add("NULL")

    # Longest target words first (stable for equal lengths).
    by_length = sorted(tlwords, key=len, reverse=True)

    # Raw frequency per distinct word, and the total over all occurrences.
    freq_of = {}
    freq_total = 0
    for tlword in by_length:
        word_freq = tlwordfreqs.get_word_freq(tlword)
        freq_total += word_freq
        freq_of[tlword] = word_freq

    # Exponent applied to each word's best score: 1 - relative frequency.
    exponent_of = {}
    for word, word_freq in freq_of.items():
        exponent_of[word] = 1 - (word_freq / freq_total)

    log_score = 0
    for tlword in by_length:
        candidates = []
        for slword in alpha_known:
            candidates.append(dict_stot.get_prob_alpha(slword, tlword))
        for slword in non_alpha:
            candidates.append(dict_stot.get_prob_nonalpha(slword, tlword))
        best = max(candidates, default=dict_stot.smooth)
        prob = best**exponent_of[tlword]
        log_score += math.log(prob)
        logging.debug("\t" + str(prob) + "\t" + str(log_score))

    if not normalize_by_length:
        return math.exp(log_score)

    if fv >= 2:
        # At least 1 so that an empty target sentence cannot divide by zero.
        divisor = max(1, len(tlwords))
    else:
        # Pre-fv-2 behaviour, kept as is: raises on an empty target sentence.
        divisor = len(tlwords)
    return math.exp(float(log_score) / float(divisor))
コード例 #3
0
def feature_dict_qmax_nosmooth(slwords,
                               tlwords,
                               dict_stot,
                               normalize_by_length,
                               fv,
                               limit=20):
    """Return exp of the summed log of the best positive score per target word.

    Only the ``limit`` longest words of ``tlwords`` are scored.  For each of
    them the maximum is taken over ``dict_stot.get_prob_alpha`` for the
    source words that match ``regex_alpha`` and are keys of ``dict_stot.d``,
    and ``dict_stot.get_prob_nonalpha`` for the remaining source words plus
    the literal ``"NULL"``.  A best score that is not greater than 0 is
    skipped instead of being passed to ``math.log``.

    Args:
        slwords: iterable of source-side words.
        tlwords: sized iterable of target-side words.
        dict_stot: object exposing ``d``, ``smooth``, ``get_prob_alpha`` and
            ``get_prob_nonalpha``.
        normalize_by_length: when true, divide the log score by a
            length-based divisor before exponentiating (see ``fv``).
        fv: feature version.  With ``fv >= 2`` the divisor is
            ``max(1, min(len(tlwords), limit))``; otherwise the older
            ``max(len(tlwords), limit)`` is used.
        limit: maximum number of (longest) target words that are scored.

    Returns:
        ``math.exp`` of the (optionally normalized) log score.

    Raises:
        ZeroDivisionError: only with ``fv < 2``, when normalizing an empty
            ``tlwords`` with ``limit`` equal to 0.
    """
    logresult = 0

    slwords_s_a = set()
    slwords_s_n = set()
    for slword in slwords:
        if regex_alpha.match(slword):
            if slword in dict_stot.d:
                slwords_s_a.add(slword)
        else:
            slwords_s_n.add(slword)

    # "NULL" is always a candidate, so the candidate list below is never
    # empty and the `default` of max() is only a safety net.
    slwords_s_n.add("NULL")

    # Longest target words first; sorted() is stable for equal lengths.
    tlwords2 = sorted(tlwords, key=len, reverse=True)

    for tlword in tlwords2[0:limit]:
        t = [
            dict_stot.get_prob_alpha(slword, tlword) for slword in slwords_s_a
        ]
        t.extend([
            dict_stot.get_prob_nonalpha(slword, tlword)
            for slword in slwords_s_n
        ])
        prob = max(t, default=dict_stot.smooth)
        if prob > 0.0:
            logresult += math.log(prob)
        logging.debug("\t%s\t%s", prob, logresult)

    # Debug-only per-word average.  The divisor is clamped to 1 because this
    # line runs unconditionally: dividing by len(tlwords) here used to raise
    # ZeroDivisionError for an empty target sentence before the guarded
    # normalization below was ever reached.
    debug_average = float(logresult) / float(max(1, len(tlwords)))
    logging.debug("%s\t%s\t%s", logresult, debug_average,
                  math.exp(debug_average))

    if normalize_by_length:
        if fv >= 2:
            logresult = float(logresult) / float(
                max(1, min(len(tlwords), limit))
            )  # the max is to prevent zero division when tl sentence is empty
        else:
            # old behavior (it was a bug)
            logresult = float(logresult) / float(max(len(tlwords), limit))

    return math.exp(logresult)
コード例 #4
0
def feature_dict_qmax_nosmooth_nolimit_cummulated_prob(slwords, tlwords,
                                                       dict_stot,
                                                       normalize_by_length,
                                                       fv):
    """Score target words by the summed (not maximal) alphabetic scores.

    For every target word, the values of ``dict_stot.get_prob_alpha`` over
    the source words that match ``regex_alpha`` and are keys of
    ``dict_stot.d`` are summed and divided by ``len(slwords)``; the log of
    that quotient is accumulated.  When there are no such source words, the
    non-alphabetic candidates (which always include ``"NULL"``) are queried
    through ``dict_stot.get_prob_nonalpha`` and, if any exist, ``log(1.0)``
    is accumulated instead.

    Args:
        slwords: sized iterable of source-side words.
        tlwords: sized iterable of target-side words.
        dict_stot: object exposing ``d``, ``get_prob_alpha`` and
            ``get_prob_nonalpha``.
        normalize_by_length: when true, divide the log score by the number
            of target words before exponentiating.
        fv: feature version; with ``fv >= 2`` the divisor is at least 1,
            otherwise an empty ``tlwords`` raises ``ZeroDivisionError``.

    Returns:
        ``math.exp`` of the (optionally normalized) log score.
    """
    alpha_known = set()
    non_alpha = set()
    for word in slwords:
        if not regex_alpha.match(word):
            non_alpha.add(word)
        elif word in dict_stot.d:
            alpha_known.add(word)
    non_alpha.add("NULL")

    # Longest target words first (stable for equal lengths).
    by_length = sorted(tlwords, key=len, reverse=True)

    log_score = 0
    for tlword in by_length:
        alpha_scores = [
            dict_stot.get_prob_alpha(slword, tlword) for slword in alpha_known
        ]
        if alpha_scores:
            # Mass of all alphabetic candidates, averaged over the full
            # source length (not just the candidates).
            log_score += math.log(sum(alpha_scores) / float(len(slwords)))
            continue

        # No alphabetic candidate: fall back to the non-alphabetic ones,
        # whose presence alone counts as a score of 1.0.
        fallback_scores = [
            dict_stot.get_prob_nonalpha(slword, tlword)
            for slword in non_alpha
        ]
        if fallback_scores:
            log_score += math.log(1.0)

    if not normalize_by_length:
        return math.exp(log_score)

    if fv >= 2:
        # At least 1 so that an empty target sentence cannot divide by zero.
        divisor = max(1, len(tlwords))
    else:
        # Pre-fv-2 behaviour, kept as is: raises on an empty target sentence.
        divisor = len(tlwords)
    return math.exp(float(log_score) / float(divisor))
コード例 #5
0
ファイル: features.py プロジェクト: Parkchanjun/bicleaner
def feature_dict_qmax(slwords, tlwords, dict_stot, normalize_by_length, treat_oovs, dict_ttos, fv, limit = 20):
    """Return exp of the summed log of the best score per long target word.

    Only the ``limit`` longest words of ``tlwords`` are scored.  For each of
    them the maximum is taken over ``dict_stot.get_prob_alpha`` for the
    source words that match ``regex_alpha`` and are keys of ``dict_stot.d``,
    and ``dict_stot.get_prob_nonalpha`` for the source words that do not
    match ``regex_alpha``; ``dict_stot.smooth`` is used when there is no
    source candidate.

    Args:
        slwords: iterable of source-side words.
        tlwords: sized iterable of target-side words.
        dict_stot: object exposing ``d``, ``smooth``, ``get_prob_alpha`` and
            ``get_prob_nonalpha``.
        normalize_by_length: when true, divide the log score by a
            length-based divisor before exponentiating.
        treat_oovs: when true, target words not contained in ``dict_ttos``
            are not looked up; they add ``log(0.0000001)`` for ``fv >= 2``
            and nothing for older feature versions.
        dict_ttos: container supporting ``in``; only consulted when
            ``treat_oovs`` is true.
        fv: feature version selecting the fixed (``>= 2``) or the older
            OOV handling and divisor.
        limit: maximum number of (longest) target words that are scored.

    Returns:
        ``math.exp`` of the (optionally normalized) log score.
    """
    alpha_known = set()
    non_alpha = set()
    for word in slwords:
        if not regex_alpha.match(word):
            non_alpha.add(word)
        elif word in dict_stot.d:
            alpha_known.add(word)

    # Longest target words first (stable for equal lengths), capped at limit.
    scored = sorted(tlwords, key=len, reverse=True)[:limit]
    oov_aware = bool(treat_oovs)

    log_score = 0
    for tlword in scored:
        if oov_aware and tlword not in dict_ttos:
            if fv >= 2:
                log_score += math.log(0.0000001)
            # Older feature versions ignore the word entirely (it was a bug).
            continue
        candidates = [dict_stot.get_prob_alpha(slword, tlword) for slword in alpha_known]
        candidates += [dict_stot.get_prob_nonalpha(slword, tlword) for slword in non_alpha]
        log_score += math.log(max(candidates, default=dict_stot.smooth))

    if normalize_by_length:
        if fv >= 2:
            # The outer max prevents zero division when tl sentence is empty.
            divisor = max(1, min(len(tlwords), limit))
        else:
            # Old behavior (it was a bug).
            divisor = max(len(tlwords), limit)
        log_score = float(log_score) / float(divisor)

    return math.exp(log_score)