Beispiel #1
0
def _record_confident_features(parts, db, columns, totalentropy, threshold):
    """Add low-entropy column values from ``db`` to the most recent part.

    For every column except the last one in ``columns``, the entropy of the
    value counts found in ``db`` is compared with that column's overall
    entropy.  When it is below ``threshold`` times the overall entropy, the
    most frequent value is recorded as ``"<column> = <value>"`` in the
    Counter of ``parts[-1]``.
    """
    totalss = get_query_totalss(db, columns)

    for column, totals in zip(columns[:-1], totalss):
        ent = entropy_of_counts(totals.values())
        totent = totalentropy[column.name]
        if ent < threshold * totent:
            # Highest count wins; ties are broken by the larger value, which
            # is what comparing (count, value) tuples gives us.
            best = max((count, value) for value, count in totals.items())[1]
            # NOTE(review): parts[-1] is looked up only here, so an empty
            # ``parts`` raises IndexError only when a feature qualifies.
            # Presumably the grammar always opens a part first -- confirm.
            parts[-1][1]["%s = %s" % (column.name, best)] += 1


def build_meaning(tree,
                  parent=None,
                  parts=None,
                  cptotalentropy=None,
                  cpcolumns=None,
                  cwtotalentropy=None,
                  cwcolumns=None,
                  threshold=0.75):
    """Walk a parse tree and collect the features each phrase is confident of.

    Args:
        tree: parse (sub)tree exposing ``.node`` and iteration over children.
        parent: label of the enclosing node (``None`` at the root).
        parts: list of ``(kind, Counter)`` tuples accumulated so far.  A new
            list is created when omitted, so separate top-level calls no
            longer share state.
        cptotalentropy, cpcolumns: overall per-column entropies and column
            list for productions.
        cwtotalentropy, cwcolumns: the same for words.
        threshold: a feature is recorded when its entropy in context is below
            ``threshold`` times its overall entropy.

    Any of the four entropy/column arguments left as ``None`` is filled in
    from ``get_total_entropy()``; this happens once at the top-level call
    because the recursion passes the resolved values down.

    Returns:
        The ``parts`` list, where ``kind`` is ``'relation'``,
        ``'parent-landmark'`` or ``'landmark'``.
    """
    if parts is None:
        parts = []

    supplied = (cptotalentropy, cpcolumns, cwtotalentropy, cwcolumns)
    if any(value is None for value in supplied):
        cp_ent, cp_cols, cw_ent, cw_cols, _ = get_total_entropy()
        if cptotalentropy is None:
            cptotalentropy = cp_ent
        if cpcolumns is None:
            cpcolumns = cp_cols
        if cwtotalentropy is None:
            cwtotalentropy = cw_ent
        if cwcolumns is None:
            cwcolumns = cw_cols

    lhs = tree.node
    if isinstance(tree[0], ParentedTree):
        rhs = ' '.join(n.node for n in tree)
    else:
        rhs = ' '.join(n for n in tree)

    # Single-argument form prints the same text under Python 2 and 3.
    print('+++ %s --> %s +++' % (lhs, rhs))

    if lhs in NONTERMINALS:

        if lhs != 'LOCATION-PHRASE':

            # Open a new part for the phrase types we report on; a landmark
            # phrase nested directly in another one is the parent landmark.
            if lhs == 'RELATION':
                parts.append(('relation', Counter()))
            elif lhs == parent == 'LANDMARK-PHRASE':
                parts.append(('parent-landmark', Counter()))
            elif lhs == 'LANDMARK-PHRASE':
                parts.append(('landmark', Counter()))

            cp_db = CProduction.get_production_counts(lhs=lhs, rhs=rhs)
            _record_confident_features(parts, cp_db, cpcolumns,
                                       cptotalentropy, threshold)

        for subtree in tree:
            parts = build_meaning(subtree,
                                  lhs,
                                  parts,
                                  cptotalentropy,
                                  cpcolumns,
                                  cwtotalentropy,
                                  cwcolumns,
                                  threshold)
    else:
        # Anything not in NONTERMINALS is handled as a part-of-speech node
        # whose children form the word.
        cw_db = CWord.get_word_counts(pos=lhs, word=rhs)
        _record_confident_features(parts, cw_db, cwcolumns,
                                   cwtotalentropy, threshold)

    return parts
Beispiel #2
0
def print_totalss_entropy(totalss,totalentropy,columns,printlength):
    """Print a table of per-column entropies followed by two blank lines.

    Each row shows a column name, the entropy of that column's counts in
    ``totalss``, the column's entry in ``totalentropy``, and the column's
    values ordered from most to least frequent.  The last entry of
    ``columns`` is not reported.  ``printlength`` is the width of the
    name column.
    """
    heading = [rjust('column', printlength),
               rjust('context', 7),
               rjust('overall', 7),
               'best']
    print(' '.join(heading))

    subheading = [rjust('', printlength),
                  rjust('entropy', 7),
                  rjust('entropy', 7)]
    print(' '.join(subheading))

    for column, totals in zip(columns[:-1], totalss):
        label = rjust(column.name, printlength)
        in_context = rjust("%02.4f" % entropy_of_counts(totals.values()), 7)
        overall = rjust("%02.4f" % totalentropy[column.name], 7)

        # Order by count, then by value, both descending, and keep only the
        # values.  Indexing the transposed pairs raises IndexError when
        # ``totals`` is empty.
        ranked = sorted(((count, value) for value, count in totals.items()),
                        reverse=True)
        best = tuple(zip(*ranked))[1]

        print(' '.join([label, in_context, overall, str(best)]))

    print('')
    print('')
Beispiel #3
0
def get_total_entropy():
    """Compute the overall per-column entropies for productions and words.

    For both ``CProduction`` and ``CWord`` the first three table columns are
    skipped.  The entropy of the counts from the unfiltered query is stored
    under each remaining column's name, except for the last column.

    Returns:
        A tuple ``(cptotalentropy, cpcolumns, cwtotalentropy, cwcolumns,
        printlength)``.  ``printlength`` is the length of the longest
        reported column name across both tables.
    """
    def entropies_for(model):
        # The query is read before the column list, as the callers expect.
        query = model.query
        columns = list(model.__table__.columns)[3:]
        per_column_totals = get_query_totalss(query, columns)
        entropies = {
            column.name: entropy_of_counts(totals.values())
            for column, totals in zip(columns[:-1], per_column_totals)
        }
        return entropies, columns

    cptotalentropy, cpcolumns = entropies_for(CProduction)
    cwtotalentropy, cwcolumns = entropies_for(CWord)

    # Dropping the last entry of each list matches slicing the table
    # columns with [3:-1].
    reported = cpcolumns[:-1] + cwcolumns[:-1]
    printlength = max([len(column.name) for column in reported])

    return cptotalentropy, cpcolumns, cwtotalentropy, cwcolumns, printlength