def update_word_counts(update, pos, word, lmk_class=None, lmk_ori_rels=None, lmk_color=None, rel=None):
    """Forward a word-count update to ``CWord.update_word_counts``.

    The relation object is flattened into three keyword values before the
    call: ``rel_type(rel)``, plus the ``best_distance_class`` and
    ``best_degree_class`` read from ``rel.measurement``.  When ``rel`` has no
    ``measurement`` attribute both of those are passed as ``None``.  Every
    other argument is handed through unchanged.
    """
    relation = rel_type(rel)

    # Relations without a measurement carry no distance/degree information.
    if hasattr(rel, 'measurement'):
        distance_class = rel.measurement.best_distance_class
        degree_class = rel.measurement.best_degree_class
    else:
        distance_class = None
        degree_class = None

    CWord.update_word_counts(update=update,
                             pos=pos,
                             word=word,
                             lmk_class=lmk_class,
                             lmk_ori_rels=lmk_ori_rels,
                             lmk_color=lmk_color,
                             rel=relation,
                             rel_dist_class=distance_class,
                             rel_deg_class=degree_class)
Exemple #2
0
def print_tree_entropy(tree, 
                       cptotalentropy=None,
                       cpcolumns=None,
                       cwtotalentropy=None,
                       cwcolumns=None,
                       printlength=None):

    if cptotalentropy is None:
        cptotalentropy,cpcolumns,cwtotalentropy,cwcolumns,printlength = get_total_entropy()

    lhs = tree.node
    if isinstance(tree[0], ParentedTree): rhs = ' '.join(n.node for n in tree)
    else: rhs = ' '.join(n for n in tree)

    print tree
    print '+++',lhs,'-->',rhs,'+++'

    if lhs in NONTERMINALS:
        cp_db = CProduction.get_production_counts(lhs=lhs,rhs=rhs)
        totalss = get_query_totalss(cp_db,cpcolumns)
        print_totalss_entropy(totalss,cptotalentropy,cpcolumns,printlength)

        for subtree in tree:
            print_tree_entropy(subtree, 
                               cptotalentropy, 
                               cpcolumns,
                               cwtotalentropy, 
                               cwcolumns,
                               printlength)
    else:
        cw_db = CWord.get_word_counts(pos=lhs,word=rhs)
        totalss = get_query_totalss(cw_db,cwcolumns)
        print_totalss_entropy(totalss,cwtotalentropy,cwcolumns,printlength)
def get_words(terminals, landmarks, rel=None):
    """Sample one word for every symbol in ``terminals``.

    ``terminals`` and ``landmarks`` are walked in lockstep.  For each symbol:

    * a symbol found in ``NONTERMINALS`` is passed through unchanged with
      probability 1 and entropy 0;
    * otherwise the word counts matching the symbol, the landmark and the
      relation are fetched from ``CWord``, summed per word, normalised and
      handed to ``categorical_sample``;
    * if no counts match, the symbol is passed through unchanged with
      probability 1 and entropy 0, exactly like an unexpanded nonterminal.

    The last case used to append the symbol to the caller's ``terminals``
    list instead, which mutated the argument and left the returned ``words``
    shorter than ``terminals``.

    Returns ``(words, p, H)`` where ``p`` is the product of the per-symbol
    probabilities and ``H`` the sum of the per-symbol entropies.
    """
    words = []
    probs = []
    entropy = []

    for n, lmk in zip(terminals, landmarks):
        # if we could not get an expansion for the LHS, we just pass down the unexpanded nonterminal symbol
        # it gets the probability of 1 and entropy of 0
        if n in NONTERMINALS:
            words.append(n)
            probs.append(1.0)
            entropy.append(0.0)
            continue

        lmk_class = (lmk.object_class if lmk else None)
        lmk_color = (lmk.color if lmk else None)
        rel_class = rel_type(rel)
        has_measurement = hasattr(rel, 'measurement')
        dist_class = (rel.measurement.best_distance_class if has_measurement else None)
        deg_class = (rel.measurement.best_degree_class if has_measurement else None)
        context = (n, lmk_class, lmk_color, rel_class, dist_class, deg_class)

        cp_db = CWord.get_word_counts(pos=n,
                                      lmk_class=lmk_class,
                                      lmk_ori_rels=get_lmk_ori_rels_str(lmk),
                                      lmk_color=lmk_color,
                                      rel=rel_class,
                                      rel_dist_class=dist_class,
                                      rel_deg_class=deg_class)

        if cp_db.count() <= 0:
            logger( 'Could not expand %s (lmk_class: %s, lmk_color: %s, rel: %s, dist_class: %s, deg_class: %s)' % context )
            # Keep the output aligned with the input and leave the caller's
            # list alone: the unexpanded symbol goes into the result.
            words.append(n)
            probs.append(1.0)
            entropy.append(0.0)
            continue

        logger( 'Expanded %s (lmk_class: %s, lmk_color: %s, rel: %s, dist_class: %s, deg_class: %s)' % context )

        # Several rows may carry the same word; merge them before sampling.
        ccounter = {}
        for cword in cp_db.all():
            ccounter[cword.word] = ccounter.get(cword.word, 0) + cword.count

        ckeys, ccounts = zip(*ccounter.items())

        ccounts = np.array(ccounts, dtype=float)
        ccounts /= ccounts.sum()

        w, w_prob, w_entropy = categorical_sample(ckeys, ccounts)
        words.append(w)
        probs.append(w_prob)
        entropy.append(w_entropy)

    p, H = np.prod(probs), np.sum(entropy)
    return words, p, H
Exemple #4
0
def build_meaning(tree,
                  parent=None,
                  parts=[],
                  cptotalentropy=None,
                  cpcolumns=None,
                  cwtotalentropy=None,
                  cwcolumns=None,
                  threshold=0.75):
    
    cptotalentropy,cpcolumns,cwtotalentropy,cwcolumns,printlength = get_total_entropy()

    lhs = tree.node
    if isinstance(tree[0], ParentedTree): rhs = ' '.join(n.node for n in tree)
    else: rhs = ' '.join(n for n in tree)

    print '+++',lhs,'-->',rhs,'+++'
    if lhs in NONTERMINALS:

        if not lhs == 'LOCATION-PHRASE':

            if lhs == 'RELATION':
                parts.append( ('relation',Counter()) )
            elif lhs == parent == 'LANDMARK-PHRASE':
                parts.append( ('parent-landmark',Counter()) )
            elif lhs == 'LANDMARK-PHRASE':
                parts.append( ('landmark',Counter()) )

            cp_db = CProduction.get_production_counts(lhs=lhs,rhs=rhs)
            totalss = get_query_totalss(cp_db,cpcolumns)

            for name,totals in zip(cpcolumns[:-1],totalss):
                ent = entropy_of_counts( totals.values() )
                totent = cptotalentropy[name.name]
                if ent < threshold*totent:
                    parts[-1][1][ "%s = %s" % (name.name, max(zip(*reversed(zip(*totals.items()))))[1]) ]+=1


        for subtree in tree:
            parts = build_meaning(subtree,
                                  lhs,
                                  parts,
                                  cptotalentropy, 
                                  cpcolumns,
                                  cwtotalentropy, 
                                  cwcolumns,
                                  threshold)
    else:

        cw_db = CWord.get_word_counts(pos=lhs,word=rhs)
        totalss = get_query_totalss(cw_db,cwcolumns)

        for name,totals in zip(cwcolumns[:-1],totalss):
            ent = entropy_of_counts( totals.values() )
            totent = cwtotalentropy[name.name]
            if ent < threshold*totent:
                parts[-1][1][ "%s = %s" % (name.name, max(zip(*reversed(zip(*totals.items()))))[1]) ]+=1

    return parts
Exemple #5
0
def update_word_counts(update,
                       pos,
                       word,
                       prev_word='<no prev word>',
                       lmk_class=None,
                       lmk_ori_rels=None,
                       lmk_color=None,
                       rel=None):
    """Forward a word-count update, including the previous word, to ``CWord``.

    ``rel`` is reduced to ``rel_type(rel)`` plus the ``best_distance_class``
    and ``best_degree_class`` of ``rel.measurement``; both measurement values
    are ``None`` when ``rel`` has no ``measurement`` attribute.  All other
    arguments are passed to ``CWord.update_word_counts`` as given.
    """
    fields = dict(update=update,
                  pos=pos,
                  word=word,
                  prev_word=prev_word,
                  lmk_class=lmk_class,
                  lmk_ori_rels=lmk_ori_rels,
                  lmk_color=lmk_color)
    fields['rel'] = rel_type(rel)

    # Distance and degree only exist for relations that carry a measurement.
    if hasattr(rel, 'measurement'):
        fields['rel_dist_class'] = rel.measurement.best_distance_class
        fields['rel_deg_class'] = rel.measurement.best_degree_class
    else:
        fields['rel_dist_class'] = None
        fields['rel_deg_class'] = None

    CWord.update_word_counts(**fields)
Exemple #6
0
 def db_mass():
     """Return ``CProduction.get_production_sum(None)`` plus ``CWord.get_word_sum(None)``."""
     production_mass = CProduction.get_production_sum(None)
     word_mass = CWord.get_word_sum(None)
     return production_mass + word_mass
def get_words(terminals, landmarks, rel=None, prevword=None):
    """Sample a word per symbol, mixing unigram and bigram word counts.

    ``terminals`` and ``landmarks`` are walked in lockstep.  A symbol found
    in ``NONTERMINALS`` is passed through with probability 1 and entropy 0
    (no alpha is recorded for it).  For any other symbol:

    1. the counts matching the symbol, landmark and relation are fetched
       from ``CWord`` and normalised into a unigram distribution;
    2. ``alpha`` is the count of that context restricted to the previous
       word, divided by the count of the context alone.  The previous word
       is the last entry of ``words`` so far, or ``prevword`` at the start;
    3. if ``alpha`` is non-zero the bigram distribution over the same words
       is blended in as ``alpha * bigram + (1 - alpha) * unigram``;
    4. the word is drawn with ``categorical_sample``.

    Returns ``(words, p, H, alphas)``: the chosen words, the product of
    their probabilities, the sum of their entropies and the list of alphas.

    NOTE(review): when no rows match the context, unpacking the empty
    unigram counts raises ``ValueError``; this function has no
    "could not expand" fallback.
    """
    words = []
    probs = []
    alphas = []
    entropy = []
    C = CWord.get_count

    def tally(rows):
        # Several rows may carry the same word; merge them into one total.
        totals = {}
        for row in rows:
            totals[row.word] = totals.get(row.word, 0) + row.count
        return totals

    for n,lmk in zip(terminals, landmarks):
        # if we could not get an expansion for the LHS, we just pass down the unexpanded nonterminal symbol
        # it gets the probability of 1 and entropy of 0
        if n in NONTERMINALS:
            words.append(n)
            probs.append(1.0)
            entropy.append(0.0)
            continue

        lmk_class = (lmk.object_class if lmk else None)
        lmk_color = (lmk.color if lmk else None)
        rel_class = rel_type(rel)
        dist_class = (rel.measurement.best_distance_class if hasattr(rel, 'measurement') else None)
        deg_class = (rel.measurement.best_degree_class if hasattr(rel, 'measurement') else None)

        meaning = dict(pos=n,
                       lmk_class=lmk_class,
                       lmk_ori_rels=get_lmk_ori_rels_str(lmk),
                       lmk_color=lmk_color,
                       rel=rel_class,
                       rel_dist_class=dist_class,
                       rel_deg_class=deg_class)

        uni_totals = tally(CWord.get_word_counts(**meaning))
        ckeys, ccounts_uni = zip(*uni_totals.items())
        ccounts_uni = np.array(ccounts_uni, dtype=float)
        ccounts_uni /= ccounts_uni.sum()

        prev_word = words[-1] if words else prevword
        # Force true division: under Python 2, integer counts would floor
        # the ratio to 0 or 1.
        alpha = float(C(prev_word=prev_word, **meaning)) / float(C(**meaning))
        alphas.append(alpha)

        if alpha:
            bi_totals = tally(CWord.get_word_counts(prev_word=prev_word, **meaning))
            # Align with the unigram vocabulary; unseen bigrams count as 0.
            ccounts_bi = np.array([bi_totals.get(k,0) for k in ckeys], dtype=float)
            ccounts_bi /= ccounts_bi.sum()

            cprob = (alpha * ccounts_bi) + ((1-alpha) * ccounts_uni)

        else:
            cprob = ccounts_uni

        w, w_prob, w_entropy = categorical_sample(ckeys, cprob)
        words.append(w)
        probs.append(w_prob)
        entropy.append(w_entropy)

    p, H = np.prod(probs), np.sum(entropy)
    return words, p, H, alphas
def _add_one_smoothed_stats(observed, rhs):
    """Return ``(P(rhs), entropy)`` of add-one smoothed counts.

    ``observed`` yields ``(key, count)`` pairs; counts sharing a key are
    summed and every distinct key gets one extra count.  ``rhs`` is given a
    count of 1 when it was never observed.
    """
    smoothed = {}
    for key, count in observed:
        if key in smoothed: smoothed[key] += count
        else: smoothed[key] = count + 1

    # we have never seen this RHS in this context before
    if rhs not in smoothed: smoothed[rhs] = 1

    counts = np.array(list(smoothed.values()), dtype=float)
    total = counts.sum()
    dist = counts / total
    return smoothed[rhs] / total, -np.sum(dist * np.log(dist))


def get_tree_probs(tree, lmk=None, rel=None):
    """Score every production and word of ``tree`` against the stored counts.

    The tree is walked depth-first.  For each node the context (landmark
    class, orientation relations, colour, relation type, distance and degree
    class) is derived from ``lmk`` and ``rel``:

    * below a ``RELATION`` node the landmark is ignored;
    * below a ``LANDMARK-PHRASE`` node the relation is ignored;
    * a ``LANDMARK-PHRASE`` whose parent is a ``LANDMARK-PHRASE`` switches
      to ``parent_landmark(lmk)``;
    * a ``LOCATION-PHRASE`` node is looked up with an all-``None`` context.

    Labels in ``NONTERMINALS`` are looked up in ``CProduction``, all other
    labels in ``CWord``.  When counts exist, the add-one smoothed
    probability of the observed right-hand side and the entropy of the
    smoothed distribution are recorded; when none exist a message is logged
    and nothing is recorded for that node.

    Returns ``(prob_chain, entropy_chain, lhs_rhs_parent_chain, term_prods)``:
    the recorded probabilities, the recorded entropies, one
    ``(lhs, rhs, parent, lmk, rel)`` tuple per nonterminal node and one
    ``(lhs, rhs, lmk, rel)`` tuple per remaining node.
    """
    lhs_rhs_parent_chain = []
    prob_chain = []
    entropy_chain = []
    term_prods = []

    lhs = tree.node

    if isinstance(tree[0], ParentedTree): rhs = ' '.join(n.node for n in tree)
    else: rhs = ' '.join(n for n in tree)

    parent = tree.parent.node if tree.parent else None

    if lhs == 'RELATION':
        # everything under a RELATION node should ignore the landmark
        lmk = None

    if lhs == 'LANDMARK-PHRASE':
        # everything under a LANDMARK-PHRASE node should ignore the relation
        rel = None

    if lhs == parent == 'LANDMARK-PHRASE':
        # we need to move to the parent landmark
        lmk = parent_landmark(lmk)

    if lhs == 'LOCATION-PHRASE':
        # the top-level phrase is looked up without any context
        lmk_class = lmk_ori_rels = lmk_color = None
        rel_class = dist_class = deg_class = None
    else:
        has_measurement = hasattr(rel, 'measurement')
        lmk_class = (lmk.object_class if lmk else None)
        lmk_ori_rels = get_lmk_ori_rels_str(lmk)
        lmk_color = (lmk.color if lmk else None)
        rel_class = rel_type(rel)
        dist_class = (rel.measurement.best_distance_class if has_measurement else None)
        deg_class = (rel.measurement.best_degree_class if has_measurement else None)

    if lhs in NONTERMINALS:
        cp_db = CProduction.get_production_counts(lhs=lhs,
                                                  parent=parent,
                                                  lmk_class=lmk_class,
                                                  lmk_ori_rels=lmk_ori_rels,
                                                  lmk_color=lmk_color,
                                                  rel=rel_class,
                                                  dist_class=dist_class,
                                                  deg_class=deg_class)

        if cp_db.count() <= 0:
            logger('Could not expand %s (parent: %s, lmk_class: %s, lmk_ori_rels: %s, lmk_color: %s, rel: %s, dist_class: %s, deg_class: %s)' % (lhs, parent, lmk_class, lmk_ori_rels, lmk_color, rel_class, dist_class, deg_class))
        else:
            # One fetch only; the previous extra fetch was discarded unused.
            cprod_prob, cprod_entropy = _add_one_smoothed_stats(
                ((cprod.rhs, cprod.count) for cprod in cp_db.all()), rhs)

            prob_chain.append( cprod_prob )
            entropy_chain.append( cprod_entropy )

        lhs_rhs_parent_chain.append( ( lhs, rhs, parent, lmk, rel ) )

        for subtree in tree:
            pc, ec, lrpc, tps = get_tree_probs(subtree, lmk, rel)
            prob_chain.extend( pc )
            entropy_chain.extend( ec )
            lhs_rhs_parent_chain.extend( lrpc )
            term_prods.extend( tps )

    else:
        cw_db = CWord.get_word_counts(pos=lhs,
                                      lmk_class=lmk_class,
                                      lmk_ori_rels=lmk_ori_rels,
                                      lmk_color=lmk_color,
                                      rel=rel_class,
                                      rel_dist_class=dist_class,
                                      rel_deg_class=deg_class)

        if cw_db.count() <= 0:
            # we don't know the probability or entropy values for the context we have never seen before
            # we just update the term_prods list
            logger('Could not expand %s (lmk_class: %s, lmk_ori_rels: %s, lmk_color: %s, rel: %s, dist_class: %s, deg_class: %s)' % (lhs, lmk_class, lmk_ori_rels, lmk_color, rel_class, dist_class, deg_class))
        else:
            w_prob, w_entropy = _add_one_smoothed_stats(
                ((cword.word, cword.count) for cword in cw_db.all()), rhs)

            prob_chain.append(w_prob)
            entropy_chain.append(w_entropy)

        term_prods.append( (lhs, rhs, lmk, rel) )

    return prob_chain, entropy_chain, lhs_rhs_parent_chain, term_prods
Exemple #9
0
def get_words(terminals, landmarks, rel=None, prevword=None):
    """Draw one word per symbol from a blend of unigram and bigram counts.

    Each ``(symbol, landmark)`` pair from ``terminals`` and ``landmarks`` is
    handled in order:

    * symbols in ``NONTERMINALS`` are copied to the output with probability
      1 and entropy 0, and contribute no alpha;
    * for other symbols the ``CWord`` counts for the context (symbol,
      landmark class, orientation relations and colour, relation type,
      distance and degree class) give a unigram distribution.  ``alpha`` is
      the context count restricted to the previous word divided by the
      unrestricted context count.  A non-zero ``alpha`` blends the bigram
      distribution in as ``alpha * bigram + (1 - alpha) * unigram`` before
      ``categorical_sample`` picks the word.

    The previous word is the last word produced so far, or ``prevword`` for
    the first symbol.

    Returns ``(words, p, H, alphas)`` with ``p`` the product of the word
    probabilities and ``H`` the sum of the word entropies.

    NOTE(review): a context with no matching rows raises ``ValueError`` when
    the empty unigram counts are unpacked; no fallback is implemented here.
    """
    words = []
    probs = []
    alphas = []
    entropy = []
    C = CWord.get_count

    for n, lmk in zip(terminals, landmarks):
        # if we could not get an expansion for the LHS, we just pass down the unexpanded nonterminal symbol
        # it gets the probability of 1 and entropy of 0
        if n in NONTERMINALS:
            words.append(n)
            probs.append(1.0)
            entropy.append(0.0)
            continue

        lmk_class = (lmk.object_class if lmk else None)
        lmk_color = (lmk.color if lmk else None)
        rel_class = rel_type(rel)
        dist_class = (rel.measurement.best_distance_class if hasattr(
            rel, 'measurement') else None)
        deg_class = (rel.measurement.best_degree_class if hasattr(
            rel, 'measurement') else None)

        meaning = dict(pos=n,
                       lmk_class=lmk_class,
                       lmk_ori_rels=get_lmk_ori_rels_str(lmk),
                       lmk_color=lmk_color,
                       rel=rel_class,
                       rel_dist_class=dist_class,
                       rel_deg_class=deg_class)

        # Unigram distribution: total count per word in this context.
        uni_counter = {}
        for c in CWord.get_word_counts(**meaning):
            uni_counter[c.word] = uni_counter.get(c.word, 0) + c.count
        ckeys, ccounts_uni = zip(*uni_counter.items())
        ccounts_uni = np.array(ccounts_uni, dtype=float)
        ccounts_uni /= ccounts_uni.sum()

        prev_word = words[-1] if words else prevword
        # Convert before dividing so integer counts cannot be floored to
        # 0 or 1 by Python 2 division.
        bigram_mass = float(C(prev_word=prev_word, **meaning))
        alpha = bigram_mass / float(C(**meaning))
        alphas.append(alpha)

        if alpha:
            # Bigram distribution over the same vocabulary as the unigrams.
            bi_counter = {}
            for c in CWord.get_word_counts(prev_word=prev_word, **meaning):
                bi_counter[c.word] = bi_counter.get(c.word, 0) + c.count
            ccounts_bi = np.array([bi_counter.get(k, 0) for k in ckeys],
                                  dtype=float)
            ccounts_bi /= ccounts_bi.sum()

            cprob = (alpha * ccounts_bi) + ((1 - alpha) * ccounts_uni)

        else:
            cprob = ccounts_uni

        w, w_prob, w_entropy = categorical_sample(ckeys, cprob)
        words.append(w)
        probs.append(w_prob)
        entropy.append(w_entropy)

    p, H = np.prod(probs), np.sum(entropy)
    return words, p, H, alphas
Exemple #10
0
                                .outerjoin(w1,Bigram.w1) \
                                .join(w2,Bigram.w2) \
                                .join(parent,w2.parent) \
                                .group_by(w1.word, w2.word, w2.pos, parent.lhs,
                                          parent.landmark, parent.landmark_class,
                                          parent.landmark_orientation_relations,
                                          parent.landmark_color, parent.relation,
                                          parent.relation_distance_class,
                                          parent.relation_degree_class)
    for row in qry:
        cw = CWord(word=row[1],
                   prev_word=row[0],
                   pos=row[2],
                   landmark=row[4],
                   landmark_class=row[5],
                   landmark_orientation_relations=row[6],
                   landmark_color=row[7],
                   relation=row[8],
                   relation_distance_class=row[9],
                   relation_degree_class=row[10],
                   count=row[11])

    # count productions with no parent
    parent = aliased(Production)
    qry = session.query(Production.lhs, Production.rhs,
                        Production.landmark, Production.landmark_class, Production.landmark_orientation_relations, Production.landmark_color,
                        Production.relation, Production.relation_distance_class,
                        Production.relation_degree_class, func.count(Production.id)).\
                  filter_by(parent=None).\
                  group_by(Production.lhs, Production.rhs,
                           Production.landmark, Production.landmark_class, Production.landmark_orientation_relations,