def get_expansion(lhs, parent=None, lmk=None, rel=None, usebest=False, golden=False, depth=0, printing=True):
    """Recursively expand the symbols in ``lhs`` using stored production counts.

    ``lhs`` is split on whitespace. Symbols that are not in ``NONTERMINALS``
    are emitted as terminals. Each nonterminal is looked up through
    ``CProduction.get_production_counts`` with features read from ``lmk`` and
    ``rel``; one right-hand side is chosen from the normalised counts and is
    expanded in turn by a recursive call.

    Parameters:
        lhs: whitespace-separated string of grammar symbols to expand.
        parent: symbol whose expansion produced ``lhs`` (``None`` at the root).
        lmk: current landmark, or ``None``. It is replaced by
            ``parent_landmark(lmk)`` whenever a ``'LANDMARK-PHRASE'`` symbol
            is met directly under a ``'LANDMARK-PHRASE'`` parent.
        rel: relation object, or ``None``. Distance/degree classes are only
            read when it has a ``measurement`` attribute.
        usebest: choose with ``pick_best`` instead of ``categorical_sample``.
            The flag is forwarded to every recursive call.
        golden: forwarded to ``CProduction.get_production_counts``.
        depth: current recursion depth; calls with ``depth > 3`` return empty
            results without expanding anything.
        printing: when true, report each expansion attempt through ``logger``.

    Returns:
        A 5-tuple ``(lhs_rhs_parent_chain, prob_chain, entropy_chain,
        terminals, landmarks)``. The first three lists hold one entry per
        applied production (``(symbol, rhs, parent, lmk)``, its probability
        and its entropy). ``terminals`` and ``landmarks`` are index-aligned:
        ``landmarks[i]`` is the landmark that was current when
        ``terminals[i]`` was emitted.
    """
    lhs_rhs_parent_chain = []
    prob_chain = []
    entropy_chain = []
    terminals = []
    landmarks = []

    # Recursion guard: nothing is expanded (or emitted) past depth 3.
    if depth > 3:
        return lhs_rhs_parent_chain, prob_chain, entropy_chain, terminals, landmarks

    for n in lhs.split():
        if n not in NONTERMINALS:
            terminals.append(n)
            landmarks.append(lmk)
            continue

        if n == parent == 'LANDMARK-PHRASE':
            # we need to move to the parent landmark
            # (the new landmark stays in effect for the remaining symbols too)
            lmk = parent_landmark(lmk)

        lmk_class = (lmk.object_class if lmk else None)
        lmk_ori_rels = get_lmk_ori_rels_str(lmk)
        lmk_color = (lmk.color if lmk else None)
        rel_class = rel_type(rel)
        dist_class = (rel.measurement.best_distance_class if hasattr(rel, 'measurement') else None)
        deg_class = (rel.measurement.best_degree_class if hasattr(rel, 'measurement') else None)

        cp_db = CProduction.get_production_counts(lhs=n,
                                                  parent=parent,
                                                  lmk_class=lmk_class,
                                                  lmk_ori_rels=lmk_ori_rels,
                                                  lmk_color=lmk_color,
                                                  rel=rel_class,
                                                  dist_class=dist_class,
                                                  deg_class=deg_class,
                                                  golden=golden)

        if cp_db.count() <= 0:
            if printing:
                logger('Could not expand %s (parent: %s, lmk_class: %s, lmk_ori_rels: %s, lmk_color: %s, rel: %s, dist_class: %s, deg_class: %s)' % (n, parent, lmk_class, lmk_ori_rels, lmk_color, rel_class, dist_class, deg_class))
            # Pass the unexpanded symbol down as a terminal. The landmark must
            # be recorded as well, otherwise terminals and landmarks drift out
            # of step and consumers that zip them pair the wrong entries.
            terminals.append(n)
            landmarks.append(lmk)
            continue

        if printing:
            logger('Expanded %s (parent: %s, lmk_class: %s, lmk_ori_rels: %s, lmk_color: %s, rel: %s, dist_class: %s, deg_class: %s)' % (n, parent, lmk_class, lmk_ori_rels, lmk_color, rel_class, dist_class, deg_class))

        # Several rows may share a right-hand side; merge their counts.
        # The rows are fetched once and reused.
        ccounter = {}
        for cprod in cp_db.all():
            ccounter[cprod.rhs] = ccounter.get(cprod.rhs, 0) + cprod.count

        ckeys, ccounts = zip(*ccounter.items())

        # Normalise the merged counts into a probability distribution.
        ccounts = np.array(ccounts, dtype=float)
        ccounts /= ccounts.sum()

        if usebest:
            cprod, cprod_prob, cprod_entropy = pick_best(ckeys, ccounts)
        else:
            cprod, cprod_prob, cprod_entropy = categorical_sample(ckeys, ccounts)

        lhs_rhs_parent_chain.append((n, cprod, parent, lmk))
        prob_chain.append(cprod_prob)
        entropy_chain.append(cprod_entropy)

        # Expand the chosen right-hand side. usebest is forwarded so that a
        # greedy expansion stays greedy below the top level.
        lrpc, pc, ec, t, ls = get_expansion(lhs=cprod,
                                            parent=n,
                                            lmk=lmk,
                                            rel=rel,
                                            usebest=usebest,
                                            golden=golden,
                                            printing=printing,
                                            depth=depth + 1)
        lhs_rhs_parent_chain.extend(lrpc)
        prob_chain.extend(pc)
        entropy_chain.extend(ec)
        terminals.extend(t)
        landmarks.extend(ls)

    return lhs_rhs_parent_chain, prob_chain, entropy_chain, terminals, landmarks
def get_words(terminals, landmarks, rel=None, prevword=None, usebest=False, golden=False, printing=True):
    """Choose one word for each terminal symbol and score the resulting sequence.

    ``terminals`` and ``landmarks`` are walked in lockstep. Symbols that are
    still in ``NONTERMINALS`` (i.e. could not be expanded upstream) are passed
    through unchanged. For every other symbol a word distribution is built
    from ``CWord`` counts: the unigram distribution for the symbol's meaning,
    interpolated with the distribution conditioned on the previous word using
    the weight ``alpha = count(prev_word, meaning) / count(meaning)``.

    Parameters:
        terminals: sequence of symbols (part-of-speech tags or unexpanded
            nonterminals).
        landmarks: sequence of landmarks (or ``None``), paired with
            ``terminals`` by position.
        rel: relation object, or ``None``. Distance/degree classes are only
            read when it has a ``measurement`` attribute.
        prevword: word treated as preceding the first generated word.
        usebest: choose with ``pick_best`` instead of ``categorical_sample``.
        golden: forwarded to the ``CWord`` queries.
        printing: accepted for signature parity with ``get_expansion``; this
            function does not read it.

    Returns:
        ``(words, p, H, alphas)``: the chosen words, the product of their
        probabilities, the sum of their entropies, and the interpolation
        weights. ``alphas`` has one entry per word that was actually drawn
        from a distribution, so it has no entry for passed-through
        nonterminals.

    Raises:
        ValueError: if no word counts are stored for a symbol's meaning.
    """
    words = []
    probs = []
    alphas = []
    entropy = []

    for n, lmk in zip(terminals, landmarks):
        # if we could not get an expansion for the LHS, we just pass down the unexpanded nonterminal symbol
        # it gets the probability of 1 and entropy of 0
        if n in NONTERMINALS:
            words.append(n)
            probs.append(1.0)
            entropy.append(0.0)
            continue

        lmk_class = (lmk.object_class if lmk else None)
        lmk_color = (lmk.color if lmk else None)
        rel_class = rel_type(rel)
        dist_class = (rel.measurement.best_distance_class if hasattr(rel, 'measurement') else None)
        deg_class = (rel.measurement.best_degree_class if hasattr(rel, 'measurement') else None)

        meaning = dict(pos=n,
                       lmk_class=lmk_class,
                       lmk_ori_rels=get_lmk_ori_rels_str(lmk),
                       lmk_color=lmk_color,
                       rel=rel_class,
                       rel_dist_class=dist_class,
                       rel_deg_class=deg_class,
                       golden=golden)

        # Unigram counts; rows sharing a word are merged.
        ccounter = {}
        for c in CWord.get_word_counts(**meaning):
            ccounter[c.word] = ccounter.get(c.word, 0) + c.count

        # Without any rows the unpacking below fails with an opaque
        # ValueError, so the situation is reported first and the same
        # exception type is raised with a message that names the symbol.
        if not ccounter:
            message = 'Could not expand %s (lmk_class: %s, lmk_color: %s, rel: %s, dist_class: %s, deg_class: %s)' % (n, lmk_class, lmk_color, rel_class, dist_class, deg_class)
            logger(message)
            raise ValueError(message)

        ckeys, ccounts_uni = zip(*ccounter.items())
        ccounts_uni = np.array(ccounts_uni, dtype=float)
        ccounts_uni /= ccounts_uni.sum()

        # The previous word is the last emitted one (which may be a
        # passed-through symbol), or the caller-supplied prevword at the start.
        prev_word = words[-1] if words else prevword

        # NOTE(review): if get_count returns plain ints under Python 2 this is
        # an integer division -- confirm the count type.
        alpha = CWord.get_count(prev_word=prev_word, **meaning) / CWord.get_count(**meaning)
        alphas.append(alpha)

        if alpha:
            # Bigram counts, laid out in the same key order as the unigrams.
            bigram_counter = {}
            for c in CWord.get_word_counts(prev_word=prev_word, **meaning):
                bigram_counter[c.word] = bigram_counter.get(c.word, 0) + c.count

            ccounts_bi = np.array([bigram_counter.get(k, 0) for k in ckeys], dtype=float)
            ccounts_bi /= ccounts_bi.sum()

            cprob = (alpha * ccounts_bi) + ((1 - alpha) * ccounts_uni)
        else:
            # No evidence for this context: fall back to the unigram model.
            cprob = ccounts_uni

        if usebest:
            w, w_prob, w_entropy = pick_best(ckeys, cprob)
        else:
            w, w_prob, w_entropy = categorical_sample(ckeys, cprob)

        words.append(w)
        probs.append(w_prob)
        entropy.append(w_entropy)

    p, H = np.prod(probs), np.sum(entropy)
    return words, p, H, alphas