def update_expansion_counts(update, lhs, rhs, parent=None, lmk_class=None, lmk_ori_rels=None, lmk_color=None, rel=None):
    CProduction.update_production_counts(update=update,
                                         lhs=lhs,
                                         rhs=rhs,
                                         parent=parent,
                                         lmk_class=lmk_class,
                                         lmk_ori_rels=lmk_ori_rels,
                                         lmk_color=lmk_color,
                                         rel=rel_type(rel),
                                         dist_class=(rel.measurement.best_distance_class if hasattr(rel, 'measurement') else None),
                                         deg_class=(rel.measurement.best_degree_class if hasattr(rel, 'measurement') else None))
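
# Illustrative usage sketch (not part of the original module): a training loop might
# reward or penalize every production in a sampled derivation with a signed update.
# `chain` is assumed to be the list of (lhs, rhs, parent, lmk) tuples returned by
# get_expansion() below, and `reward` a task-specific scalar; both names are
# hypothetical here.
def example_reinforce_chain(chain, reward, rel=None):
    for lhs, rhs, parent, lmk in chain:
        update_expansion_counts(update=reward,
                                lhs=lhs,
                                rhs=rhs,
                                parent=parent,
                                lmk_class=(lmk.object_class if lmk else None),
                                lmk_ori_rels=get_lmk_ori_rels_str(lmk),
                                lmk_color=(lmk.color if lmk else None),
                                rel=rel)
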
def print_tree_entropy(tree, cptotalentropy=None, cpcolumns=None, cwtotalentropy=None, cwcolumns=None, printlength=None):
    if cptotalentropy is None:
        cptotalentropy, cpcolumns, cwtotalentropy, cwcolumns, printlength = get_total_entropy()

    lhs = tree.node
    if isinstance(tree[0], ParentedTree):
        rhs = ' '.join(n.node for n in tree)
    else:
        rhs = ' '.join(n for n in tree)

    print tree
    print '+++', lhs, '-->', rhs, '+++'

    if lhs in NONTERMINALS:
        cp_db = CProduction.get_production_counts(lhs=lhs, rhs=rhs)
        totalss = get_query_totalss(cp_db, cpcolumns)
        print_totalss_entropy(totalss, cptotalentropy, cpcolumns, printlength)

        for subtree in tree:
            print_tree_entropy(subtree, cptotalentropy, cpcolumns, cwtotalentropy, cwcolumns, printlength)
    else:
        cw_db = CWord.get_word_counts(pos=lhs, word=rhs)
        totalss = get_query_totalss(cw_db, cwcolumns)
        print_totalss_entropy(totalss, cwtotalentropy, cwcolumns, printlength)
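
# Illustrative usage sketch (not part of the original module): build a toy parse by
# hand and inspect its entropies, assuming the NLTK 2.x ParentedTree API (.node)
# used throughout this module. The preterminal labels ('P', 'NN') and the words are
# invented; a real tree would come from the project's parser and its actual tag set.
def example_print_entropy():
    toy = ParentedTree('LOCATION-PHRASE',
                       [ParentedTree('RELATION', [ParentedTree('P', ['near'])]),
                        ParentedTree('LANDMARK-PHRASE', [ParentedTree('NN', ['table'])])])
    print_tree_entropy(toy)
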
def build_meaning(tree,
                  parent=None,
                  parts=None,
                  cptotalentropy=None,
                  cpcolumns=None,
                  cwtotalentropy=None,
                  cwcolumns=None,
                  threshold=0.75):
    # avoid the mutable-default-argument pitfall
    if parts is None:
        parts = []
    # compute the corpus-wide entropies once and reuse them in recursive calls
    if cptotalentropy is None:
        cptotalentropy, cpcolumns, cwtotalentropy, cwcolumns, printlength = get_total_entropy()

    lhs = tree.node
    if isinstance(tree[0], ParentedTree):
        rhs = ' '.join(n.node for n in tree)
    else:
        rhs = ' '.join(n for n in tree)
    print '+++', lhs, '-->', rhs, '+++'

    if lhs in NONTERMINALS:
        if not lhs == 'LOCATION-PHRASE':
            if lhs == 'RELATION':
                parts.append( ('relation', Counter()) )
            elif lhs == parent == 'LANDMARK-PHRASE':
                parts.append( ('parent-landmark', Counter()) )
            elif lhs == 'LANDMARK-PHRASE':
                parts.append( ('landmark', Counter()) )

            cp_db = CProduction.get_production_counts(lhs=lhs, rhs=rhs)
            totalss = get_query_totalss(cp_db, cpcolumns)
            for name, totals in zip(cpcolumns[:-1], totalss):
                ent = entropy_of_counts( totals.values() )
                totent = cptotalentropy[name.name]
                if ent < threshold * totent:
                    # vote for the value of this context column with the highest count
                    parts[-1][1][ "%s = %s" % (name.name, max(zip(*reversed(zip(*totals.items()))))[1]) ] += 1

        for subtree in tree:
            parts = build_meaning(subtree, lhs, parts, cptotalentropy, cpcolumns, cwtotalentropy, cwcolumns, threshold)
    else:
        cw_db = CWord.get_word_counts(pos=lhs, word=rhs)
        totalss = get_query_totalss(cw_db, cwcolumns)
        for name, totals in zip(cwcolumns[:-1], totalss):
            ent = entropy_of_counts( totals.values() )
            totent = cwtotalentropy[name.name]
            if ent < threshold * totent:
                parts[-1][1][ "%s = %s" % (name.name, max(zip(*reversed(zip(*totals.items()))))[1]) ] += 1

    return parts
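
# Illustrative usage sketch (not part of the original module): build_meaning walks a
# parsed description and, whenever a phrase's context entropy drops below
# threshold * corpus-wide entropy, votes for the most frequent value of that context
# column. A caller could summarize the votes like this; `tree` is assumed to be a
# ParentedTree from the project's parser.
def example_summarize_meaning(tree, threshold=0.75):
    parts = build_meaning(tree, threshold=threshold)
    for role, votes in parts:
        # e.g. role = 'landmark', votes = a Counter over "column = value" strings
        print role, votes.most_common(3)
    return parts
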
def db_mass():
    total = CProduction.get_production_sum(None)
    total += CWord.get_word_sum(None)
    return total
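
# Illustrative usage sketch (not part of the original module): db_mass() totals the
# counts stored in CProduction and CWord, which makes it a convenient sanity check
# that a training pass actually changed the database. `train_fn` is hypothetical.
def example_check_training_effect(train_fn):
    before = db_mass()
    train_fn()  # any routine that updates the CProduction / CWord counters
    after = db_mass()
    print 'total count mass went from', before, 'to', after
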
def get_expansion(lhs, parent=None, lmk=None, rel=None):
    lhs_rhs_parent_chain = []
    prob_chain = []
    entropy_chain = []
    terminals = []
    landmarks = []

    for n in lhs.split():
        if n in NONTERMINALS:
            if n == parent == 'LANDMARK-PHRASE':
                # we need to move to the parent landmark
                lmk = parent_landmark(lmk)

            lmk_class = (lmk.object_class if lmk else None)
            lmk_ori_rels = get_lmk_ori_rels_str(lmk)
            lmk_color = (lmk.color if lmk else None)
            rel_class = rel_type(rel)
            dist_class = (rel.measurement.best_distance_class if hasattr(rel, 'measurement') else None)
            deg_class = (rel.measurement.best_degree_class if hasattr(rel, 'measurement') else None)

            cp_db = CProduction.get_production_counts(lhs=n,
                                                      parent=parent,
                                                      lmk_class=lmk_class,
                                                      lmk_ori_rels=lmk_ori_rels,
                                                      lmk_color=lmk_color,
                                                      rel=rel_class,
                                                      dist_class=dist_class,
                                                      deg_class=deg_class)

            if cp_db.count() <= 0:
                logger('Could not expand %s (parent: %s, lmk_class: %s, lmk_ori_rels: %s, lmk_color: %s, rel: %s, dist_class: %s, deg_class: %s)' % (n, parent, lmk_class, lmk_ori_rels, lmk_color, rel_class, dist_class, deg_class))
                terminals.append( n )
                continue

            # aggregate counts per RHS across all matching context rows
            ccounter = {}
            for cprod in cp_db.all():
                if cprod.rhs in ccounter:
                    ccounter[cprod.rhs] += cprod.count
                else:
                    ccounter[cprod.rhs] = cprod.count

            ckeys, ccounts = zip(*ccounter.items())

            # print 'ckeys', ckeys
            # print 'ccounts', ccounts

            ccounts = np.array(ccounts, dtype=float)
            ccounts /= ccounts.sum()

            cprod, cprod_prob, cprod_entropy = categorical_sample(ckeys, ccounts)
            # print cprod, cprod_prob, cprod_entropy

            lhs_rhs_parent_chain.append( ( n, cprod, parent, lmk ) )
            prob_chain.append( cprod_prob )
            entropy_chain.append( cprod_entropy )

            lrpc, pc, ec, t, ls = get_expansion( lhs=cprod, parent=n, lmk=lmk, rel=rel )
            lhs_rhs_parent_chain.extend( lrpc )
            prob_chain.extend( pc )
            entropy_chain.extend( ec )
            terminals.extend( t )
            landmarks.extend( ls )
        else:
            terminals.append( n )
            landmarks.append( lmk )

    return lhs_rhs_parent_chain, prob_chain, entropy_chain, terminals, landmarks
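
# Illustrative usage sketch (not part of the original module): expand a start symbol
# into a concrete string of terminals for a given landmark/relation context.
# 'LOCATION-PHRASE' matches the top-level nonterminal used elsewhere in this module;
# `lmk` and `rel` are whatever landmark and relation objects the caller sampled.
def example_sample_expansion(lmk=None, rel=None):
    chain, probs, entropies, terminals, landmarks = get_expansion('LOCATION-PHRASE', lmk=lmk, rel=rel)
    print 'expansion:', ' '.join(terminals)
    # joint probability and summed entropy of the sampled derivation
    print 'probability:', np.prod(probs), 'entropy:', np.sum(entropies)
    return terminals, landmarks
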
def get_tree_probs(tree, lmk=None, rel=None):
    lhs_rhs_parent_chain = []
    prob_chain = []
    entropy_chain = []
    term_prods = []

    lhs = tree.node
    if isinstance(tree[0], ParentedTree):
        rhs = ' '.join(n.node for n in tree)
    else:
        rhs = ' '.join(n for n in tree)

    parent = tree.parent.node if tree.parent else None

    if lhs == 'RELATION':
        # everything under a RELATION node should ignore the landmark
        lmk = None
    if lhs == 'LANDMARK-PHRASE':
        # everything under a LANDMARK-PHRASE node should ignore the relation
        rel = None

    if lhs == parent == 'LANDMARK-PHRASE':
        # we need to move to the parent landmark
        lmk = parent_landmark(lmk)

    lmk_class = (lmk.object_class if lmk and lhs != 'LOCATION-PHRASE' else None)
    lmk_ori_rels = get_lmk_ori_rels_str(lmk) if lhs != 'LOCATION-PHRASE' else None
    lmk_color = (lmk.color if lmk and lhs != 'LOCATION-PHRASE' else None)
    rel_class = rel_type(rel) if lhs != 'LOCATION-PHRASE' else None
    dist_class = (rel.measurement.best_distance_class if hasattr(rel, 'measurement') and lhs != 'LOCATION-PHRASE' else None)
    deg_class = (rel.measurement.best_degree_class if hasattr(rel, 'measurement') and lhs != 'LOCATION-PHRASE' else None)

    if lhs in NONTERMINALS:
        cp_db = CProduction.get_production_counts(lhs=lhs,
                                                  parent=parent,
                                                  lmk_class=lmk_class,
                                                  lmk_ori_rels=lmk_ori_rels,
                                                  lmk_color=lmk_color,
                                                  rel=rel_class,
                                                  dist_class=dist_class,
                                                  deg_class=deg_class)

        if cp_db.count() <= 0:
            logger('Could not expand %s (parent: %s, lmk_class: %s, lmk_ori_rels: %s, lmk_color: %s, rel: %s, dist_class: %s, deg_class: %s)' % (lhs, parent, lmk_class, lmk_ori_rels, lmk_color, rel_class, dist_class, deg_class))
        else:
            ccounter = {}
            for cprod in cp_db.all():
                if cprod.rhs in ccounter:
                    ccounter[cprod.rhs] += cprod.count
                else:
                    ccounter[cprod.rhs] = cprod.count + 1

            # we have never seen this RHS in this context before
            if rhs not in ccounter:
                ccounter[rhs] = 1

            ckeys, ccounts = zip(*ccounter.items())

            # add 1 smoothing
            ccounts = np.array(ccounts, dtype=float)
            ccount_probs = ccounts / ccounts.sum()

            cprod_entropy = -np.sum( (ccount_probs * np.log(ccount_probs)) )
            cprod_prob = ccounter[rhs] / ccounts.sum()

            # logger('ckeys: %s' % str(ckeys))
            # logger('ccounts: %s' % str(ccounts))
            # logger('rhs: %s, cprod_prob: %s, cprod_entropy: %s' % (rhs, cprod_prob, cprod_entropy))

            prob_chain.append( cprod_prob )
            entropy_chain.append( cprod_entropy )
            lhs_rhs_parent_chain.append( ( lhs, rhs, parent, lmk, rel ) )

        for subtree in tree:
            pc, ec, lrpc, tps = get_tree_probs(subtree, lmk, rel)
            prob_chain.extend( pc )
            entropy_chain.extend( ec )
            lhs_rhs_parent_chain.extend( lrpc )
            term_prods.extend( tps )
    else:
        cw_db = CWord.get_word_counts(pos=lhs,
                                      lmk_class=lmk_class,
                                      lmk_ori_rels=lmk_ori_rels,
                                      lmk_color=lmk_color,
                                      rel=rel_class,
                                      rel_dist_class=dist_class,
                                      rel_deg_class=deg_class)

        if cw_db.count() <= 0:
            # we don't know the probability or entropy values for a context we have never seen before,
            # so we just update the term_prods list
            logger('Could not expand %s (lmk_class: %s, lmk_ori_rels: %s, lmk_color: %s, rel: %s, dist_class: %s, deg_class: %s)' % (lhs, lmk_class, lmk_ori_rels, lmk_color, rel_class, dist_class, deg_class))
        else:
            ccounter = {}
            for cword in cw_db.all():
                if cword.word in ccounter:
                    ccounter[cword.word] += cword.count
                else:
                    ccounter[cword.word] = cword.count + 1

            # we have never seen this RHS in this context before
            if rhs not in ccounter:
                ccounter[rhs] = 1

            ckeys, ccounts = zip(*ccounter.items())

            # logger('ckeys: %s' % str(ckeys))
            # logger('ccounts: %s' % str(ccounts))

            # add 1 smoothing
            ccounts = np.array(ccounts, dtype=float)
            ccount_probs = ccounts / ccounts.sum()
            w_prob = ccounter[rhs] / ccounts.sum()
            w_entropy = -np.sum( (ccount_probs * np.log(ccount_probs)) )

            prob_chain.append(w_prob)
            entropy_chain.append(w_entropy)

        term_prods.append( (lhs, rhs, lmk, rel) )

    return prob_chain, entropy_chain, lhs_rhs_parent_chain, term_prods
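
# Illustrative usage sketch (not part of the original module): score an observed
# sentence, already parsed into a ParentedTree, against the current counts, e.g. to
# compare candidate landmark/relation explanations of the same utterance.
def example_score_tree(tree, lmk=None, rel=None):
    probs, entropies, chain, term_prods = get_tree_probs(tree, lmk=lmk, rel=rel)
    # summing logs is more stable than multiplying many small probabilities
    logprob = np.sum(np.log(probs)) if probs else float('-inf')
    print 'log probability:', logprob, 'total entropy:', np.sum(entropies)
    return logprob
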
# count productions without a parent
parent = aliased(Production)
qry = session.query(Production.lhs,
                    Production.rhs,
                    Production.landmark,
                    Production.landmark_class,
                    Production.landmark_orientation_relations,
                    Production.landmark_color,
                    Production.relation,
                    Production.relation_distance_class,
                    Production.relation_degree_class,
                    func.count(Production.id)).\
      filter_by(parent=None).\
      group_by(Production.lhs,
               Production.rhs,
               Production.landmark,
               Production.landmark_class,
               Production.landmark_orientation_relations,
               Production.relation,
               Production.relation_distance_class,
               Production.relation_degree_class)

for row in qry:
    cp = CProduction(lhs=row[0],
                     rhs=row[1],
                     landmark=row[2],
                     landmark_class=row[3],
                     landmark_orientation_relations=row[4],
                     landmark_color=row[5],
                     relation=row[6],
                     relation_distance_class=row[7],
                     relation_degree_class=row[8],
                     count=row[9])

# count productions with parent
parent = aliased(Production)
qry = session.query(Production.lhs,
                    Production.rhs,
                    parent.lhs,
                    Production.landmark,
                    Production.landmark_class,
                    Production.landmark_orientation_relations,
                    Production.landmark_color,
                    Production.relation,
                    Production.relation_distance_class,
                    Production.relation_degree_class,
                    func.count(Production.id)).\
      join(parent, Production.parent).\
      group_by(Production.lhs,
               Production.rhs,
               parent.lhs,
               Production.landmark,
               Production.landmark_class,
               Production.landmark_orientation_relations,
               Production.relation,
               Production.relation_distance_class,
               Production.relation_degree_class)