Beispiel #1
0
def consolidate_grams(dictModel):

    lstSorted = sorted(dictModel.iteritems(),
                       key=operator.itemgetter(1),
                       reverse=True)

    p1 = 0
    while p1 < len(lstSorted):
        (t1, c1) = lstSorted[p1]
        g1 = count_grams(t1)
        if p1 % 10000 == 0:
            print "#",
        p2 = p1 + 1
        while p2 < len(lstSorted):
            removed = 0
            (t2, c2) = lstSorted[p2]
            g2 = count_grams(t2)
            if g1 < g2:
                if t2.startswith(t1 + '_') or t2.endswith('_' + t1):
                    if c1 >= c2:
                        removed = removed + 1
                        if dictModel.has_key(t1):
                            dictModel[t2] = dictModel[t1]
                            del dictModel[t1]
                        # end if
                    # end if
                # end if
            # end if
            if g1 > g2:
                if t1.startswith(t2 + '_') or t1.endswith('_' + t2):
                    if c1 >= c2:
                        removed = removed + 1
                        if dictModel.has_key(t2):
                            del dictModel[t2]
                        # end if
                    # end if
                # end if
            # end if
            if removed == g1:
                # to do: move on to next p1
                break
            if c2 * 1.2 >= c1:
                break
            p2 = p2 + 1
            if p2 % 10000 == 0:
                print "#",
        # end while
        p1 = p1 + 1
    # end while

    return dictModel
Beispiel #2
0
def dump_entries(dictModel, nNumber, grams, sFilter, bDescending):
    """Print "term count" lines from ``dictModel`` ordered by count.

    Parameters:
        dictModel   -- mapping of term -> count; None or empty prints nothing.
        nNumber     -- maximum number of lines to print; converted with
                       ``int()`` so numeric strings are accepted.  Zero
                       prints nothing; a negative value never reaches the
                       stop condition, so every matching entry is printed.
        grams       -- converted with ``int()``; when > 0 only terms for
                       which ``count_grams(term)`` equals it are printed.
        sFilter     -- when truthy, only terms containing it are printed.
        bDescending -- truthy: highest count first; falsy: lowest first.

    Only entries that are actually printed count against ``nNumber``.

    Returns None when there is nothing to do (no model or a zero limit),
    otherwise True.
    """
    if not dictModel:
        return None

    # Convert before testing so that "0" is treated like 0; comparing the
    # raw argument let the string through and the countdown never hit zero.
    top = int(nNumber)
    if top == 0:
        return None

    gram_filter = int(grams)
    entries = sorted(dictModel.items(),
                     key=operator.itemgetter(1),
                     reverse=bool(bDescending))

    for term, count in entries:
        if gram_filter > 0 and count_grams(term) != gram_filter:
            # gram size requested but this term has a different size
            continue
        if sFilter and sFilter not in term:
            # filter requested but not contained in this term
            continue

        # single-argument form prints the same text under Python 2 and 3
        print("%s %s" % (term, count))

        top -= 1
        if top == 0:
            break

    return True
Beispiel #3
0
def consolidate_grams_old(dictModel):
    """Earlier, full-scan variant of the n-gram consolidation pass.

    Author's stated intent: if xyz and xy and yz are in the model, take the
    highest item and drop the others, e.g. boost xyz and drop xy and yz.

    The entries are snapshotted into a list ordered by value, highest first,
    and every entry (term_a, count_a) is compared against every entry of
    that list (term_b, count_b):

    * term_a has fewer grams, term_b starts with ``term_a + '_'`` or ends
      with ``'_' + term_a``, and count_a >= count_b: term_b receives
      term_a's current value and term_a is removed (if still a key).
    * term_a has more grams, term_a starts with ``term_b + '_'`` or ends
      with ``'_' + term_b``, and count_a >= count_b: term_b is removed (if
      still a key).

    Each such match increments a per-term_a counter; the inner scan stops at
    the first non-matching candidate seen once that counter equals term_a's
    gram count.  Gram counts come from ``count_grams``.

    Returns the same ``dictModel`` object, modified in place.
    """
    # tbd: drop xx
    # to do: make sure second loop starts at c1
    # to do: go no more than some % past c

    def wraps(longer, shorter):
        # True when `shorter` is a leading or trailing '_'-separated part
        # of `longer`.
        return (longer.startswith(shorter + '_')
                or longer.endswith('_' + shorter))

    ranked = sorted(dictModel.items(),
                    key=operator.itemgetter(1),
                    reverse=True)

    for term_a, count_a in ranked:
        grams_a = count_grams(term_a)
        removed = 0

        for term_b, count_b in ranked:
            grams_b = count_grams(term_b)

            if (grams_a < grams_b and wraps(term_b, term_a)
                    and count_a >= count_b):
                if term_a in dictModel:
                    # the longer term inherits the shorter term's value
                    dictModel[term_b] = dictModel.pop(term_a)
                removed += 1
                continue

            if (grams_a > grams_b and wraps(term_a, term_b)
                    and count_a >= count_b):
                dictModel.pop(term_b, None)
                removed += 1
                continue

            # only reached for candidates that did not match above
            if removed == grams_a:
                break

    return dictModel
Beispiel #4
0
def normalize_classification_model(dictModel, bTop=False, list_stopwords=None):
    """Scale the term counts of a classification model by its average file size.

    ``dictModel`` must carry the meta entries ``"_words"`` and ``"_files"``;
    their quotient (words per file) is the divisor applied to every
    non-meta entry.  Entries whose key starts with "_" are treated as meta
    data and are left out of the result.

    Parameters:
        dictModel      -- mapping of term -> count plus the meta entries.
        bTop           -- when true, return only the 10 highest-valued terms
                          for each gram size reported by ``count_grams``.
        list_stopwords -- accepted for interface compatibility; not used.

    Returns a new dict of term -> normalized value, or None (after printing
    a warning) when a required meta entry is missing or when the divisor
    would be zero.
    """
    script_name = "models.normalize_classification_module"

    if "_words" not in dictModel:
        print("%s %s" % (script_name,
                         "warning, _words not found in classification model"))
        return None

    if "_files" not in dictModel:
        print("%s %s" % (script_name,
                         "warning, _files not found in classification model"))
        return None

    # to do: look at problem of xyz xy eg high ranking officials and ranking officials
    # to do: why is explicitly_said high value?

    file_count = float(dictModel["_files"])
    if file_count == 0.0:
        # would otherwise raise ZeroDivisionError when averaging
        print("%s %s" % (script_name,
                         "warning, _files is zero in classification model"))
        return None

    # average number of words per file
    fAverage = float(dictModel["_words"]) / file_count

    # meta entries (leading underscore) are skipped, not copied
    terms = [entry for entry in dictModel if entry[0:1] != "_"]
    if terms and fAverage == 0.0:
        # would otherwise raise ZeroDivisionError on the first term
        print("%s %s" % (script_name,
                         "warning, _words is zero in classification model"))
        return None

    dictNormalized = {}
    for entry in terms:
        dictNormalized[entry] = float(dictModel[entry]) / fAverage

    if not bTop:
        return dictNormalized

    # Walk the terms from highest to lowest normalized value and keep the
    # first `per_gram_limit` of every gram size.  The first term seen for a
    # gram size is kept as well; previously it only started the counter and
    # the best-ranked term of each size was lost.
    per_gram_limit = 10
    dictTop = {}
    gramCount = {}
    for term in sorted(dictNormalized,
                       key=dictNormalized.get,
                       reverse=True):
        grams = count_grams(term)
        taken = gramCount.get(grams, 0)
        if taken < per_gram_limit:
            dictTop[term] = dictNormalized[term]
            gramCount[grams] = taken + 1
    return dictTop
Beispiel #5
0
def classify_top(dictInput, dictModel, nGrams):
    """Score an input term model by its key overlap with a reference model.

    Every key of ``dictInput`` that is also a key of ``dictModel``
    contributes ``0.015 * len(key)``.  The score is the sum of those
    contributions multiplied by the number of matching keys, capped at 1.0.

    Parameters:
        dictInput -- mapping whose keys are the input terms.
        dictModel -- mapping whose keys are the reference terms.
        nGrams    -- accepted for interface compatibility; not used.

    A message is printed (and scoring continues) when either mapping still
    contains a "_words" key.

    Returns None when either mapping is empty or None, otherwise the list
    ``[score, explain]`` where ``explain`` maps each matching term to its
    contribution.
    """
    script_name = "classify_top"

    if not dictInput or not dictModel:
        return None

    if "_words" in dictInput:
        print("%s %s" % (
            script_name,
            "error: input model is not normalized, _words key found"))

    if "_words" in dictModel:
        print("%s %s" % (
            script_name,
            "error: reference model is not normalized, _words key found"))

    # Keys of a mapping are unique, so each matching term is recorded once.
    dictExplain = {}
    for kt in dictInput:
        if kt in dictModel:
            dictExplain[kt] = 0.015 * float(len(kt))

    # Summed from largest to smallest contribution to keep the floating
    # point result identical to the previous implementation.
    contributions = sorted(dictExplain.values(), reverse=True)
    fScore = sum(contributions, 0.0) * len(contributions)

    return [min(fScore, 1.0), dictExplain]
Beispiel #6
0
def classify(dictInput, dictModel, dictIDF, nGrams):
    """Score an input term model against a reference model and an IDF model.

    For every input term whose value is at most 1.0 and which is a key of
    ``dictModel``, a contribution is computed as

        input value / idf * log1p(avg_len) * avg_len ** 2

    where ``idf`` is the larger of the term's ``dictModel`` value and its
    ``dictIDF`` value (when present) and ``avg_len`` is the average length
    of the term's '_'-separated parts (the plain length when
    ``count_grams`` reports a single gram).  Only contributions above 1.0
    are recorded.

    The recorded contributions are ranked, the best 50 are turned into tier
    points, the total is scaled by a factor that depends on the number of
    input terms, divided by 50 (by 50 / 1.5 for inputs with fewer than 100
    terms) and capped at 1.0.

    Parameters:
        dictInput -- mapping of input term -> value.
        dictModel -- mapping of reference term -> value.
        dictIDF   -- mapping of term -> value used as an IDF floor.
        nGrams    -- accepted for interface compatibility; not used.

    A message is printed (and scoring continues) for each mapping that
    still contains a "_words" key.

    Returns None when any of the three mappings equals ``{}``, otherwise
    the list ``[score, explain]`` where ``explain`` maps each recorded term
    to its contribution.
    """
    script_name = "models.classify"

    for model in (dictInput, dictModel, dictIDF):
        if model == {}:
            return None

    if "_words" in dictInput:
        print script_name, "error: input model is not normalized, _words key found"

    if "_words" in dictModel:
        print script_name, "error: reference model is not normalized, _words key found"

    if "_words" in dictIDF:
        print script_name, "error: idf model is not normalized, _words key found"

    # this is applied directly to the final score, so use sparingly
    input_size = len(dictInput)
    if input_size < 2:
        size_adjust = 33.0
    elif input_size < 3:
        size_adjust = 20.0
    elif input_size < 10:
        size_adjust = 10.0
    elif input_size < 25:
        size_adjust = 6.0
    elif input_size < 100:
        size_adjust = 2.0
    elif input_size > 500:
        size_adjust = 0.7
    else:
        size_adjust = 1.0
    # tbd: this needs work (an input-size / model-size ratio was considered)

    # compute average frequencies for various lengths
    # NOTE(review): the result is not used anywhere below -- confirm whether
    # the call is still needed.
    dict_freqs = compute_average_frequency_by_length(dictModel)

    dictExplain = {}
    for gram in dictInput:
        input_value = float(dictInput[gram])
        if input_value > 1.0:
            continue

        if count_grams(gram) > 1:
            parts = gram.split('_')
            gram_average_length = (float(sum(len(part) for part in parts))
                                   / float(len(parts)))
        else:
            gram_average_length = len(gram)

        # is it in the classification model?
        if gram not in dictModel:
            continue
        idf = dictModel[gram]
        if gram in dictIDF and dictIDF[gram] > idf:
            idf = dictIDF[gram]

        # compute tf/idf, weighted towards longer parts
        length_weight = (math.log1p(gram_average_length)
                         * (gram_average_length ** 2))
        fContrib = float((input_value / float(idf)) * length_weight)

        # accept notable contributions only
        if fContrib > 1.0:
            dictExplain[gram] = dictExplain.get(gram, 0.0) + fContrib

    # aggregate the top xTop hits into the score
    # to do: rewrite so that we score as the model (dictInput) is built, and stop as soon as we have 10 hits
    xTop = 50
    ranked = sorted(dictExplain.items(),
                    key=operator.itemgetter(1),
                    reverse=True)

    # Every recorded contribution is above 1.0, so with the current
    # acceptance threshold only the first tier can apply.
    fScore1 = 0.0
    for _term, weight in ranked[:xTop]:
        if weight > 1.0:
            fScore1 += 1.0
        elif weight > 0.25:
            fScore1 += 0.75
        elif weight > 0.1:
            fScore1 += 0.5

    # adjust small models
    fScore1 *= size_adjust

    divisor = float(xTop)
    if input_size < 100:
        divisor = divisor / 1.5

    fScoreFinal = min(fScore1 / divisor, 1.0)

    return [fScoreFinal, dictExplain]