def delete_word(limit, terminals, words, lmk=None, rel=None):
    num_deleted = []
    for term, word in zip(terminals, words):
        # get word for POS
        num_deleted.append(Word.delete_words(limit, pos=term, word=word,
                                             lmk=lmk_id(lmk), rel=rel_type(rel)))
    return num_deleted
def remove_expansion(limit, lhs, rhs, parent=None, lmk=None, rel=None):
    return Production.delete_productions(limit, lhs=lhs, rhs=rhs, parent=parent,
                                         lmk=lmk_id(lmk), rel=rel_type(rel))
def get_words(expn, parent, lmk=None, rel=None):
    words = []
    probs = []
    entropy = []

    for n in expn.split():
        if n in NONTERMINALS:
            if n == parent == 'LANDMARK-PHRASE':
                # we need to move to the parent landmark
                lmk = parent_landmark(lmk)

            # we need to keep expanding
            expansion, exp_prob, exp_ent = get_expansion(n, parent, lmk, rel)
            w, w_prob, w_ent = get_words(expansion, n, lmk, rel)
            words.append(w)
            probs.append(exp_prob * w_prob)
            entropy.append(exp_ent + w_ent)
        else:
            # get word for POS
            w_db = Word.get_words(pos=n, lmk=lmk_id(lmk), rel=rel_type(rel))
            counter = collections.Counter(w_db)
            keys, counts = zip(*counter.items())
            counts = np.array(counts, dtype=float)  # float, so the division below is not integer division
            counts /= counts.sum()
            w, w_prob, w_entropy = categorical_sample(keys, counts)
            words.append(w.word)
            probs.append(w_prob)  # the sampled probability, mirroring the nonterminal branch
            entropy.append(w_entropy)

    p, H = np.prod(probs), np.sum(entropy)
    print 'expanding %s to %s (p: %f, H: %f)' % (expn, words, p, H)
    return words, p, H
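# NOTE: `categorical_sample` is used above but not defined in this section. A
# minimal sketch of what it is assumed to do, inferred from how its three
# return values are used (the sampled item, that item's probability, and the
# entropy of the whole distribution); the actual implementation may differ:
import numpy as np

def categorical_sample(keys, probs):
    probs = np.asarray(probs, dtype=float)  # assumed already normalized by the caller
    index = np.random.multinomial(1, probs).argmax()
    entropy = -np.sum(probs * np.log(probs + 1e-12))  # epsilon guards against log(0)
    return keys[index], probs[index], entropy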
def get_tree_prob(tree, lmk=None, rel=None):
    prob = 1.0

    if len(tree.productions()) == 1:
        # if this tree only has one production
        # it means that its child is a terminal (word)
        word = tree[0]
        pos = tree.node

        p = WordCPT.probability(word=word, pos=pos, lmk=lmk_id(lmk), rel=rel_type(rel))
        print p, pos, '->', word, m2s(lmk, rel)
        prob *= p
    else:
        lhs = tree.node
        rhs = ' '.join(n.node for n in tree)
        parent = tree.parent().node if tree.parent() else None

        if lhs == 'RELATION':
            # everything under a RELATION node should ignore the landmark
            lmk = None
        elif lhs == 'LANDMARK-PHRASE':
            # everything under a LANDMARK-PHRASE node should ignore the relation
            rel = None

        if parent == 'LANDMARK-PHRASE':
            # if the current node is a LANDMARK-PHRASE and the parent node
            # is also a LANDMARK-PHRASE then we should move to the parent
            # of the current landmark
            lmk = parent_landmark(lmk)

        if not parent:
            # LOCATION-PHRASE has no parent and is not related to lmk and rel
            p = ExpansionCPT.probability(rhs=rhs, lhs=lhs)
            print p, repr(lhs), '->', repr(rhs)
        else:
            p = ExpansionCPT.probability(rhs=rhs, lhs=lhs, parent=parent,
                                         lmk=lmk_id(lmk), rel=rel_type(rel))
            print p, repr(lhs), '->', repr(rhs), 'parent=%r' % parent, m2s(lmk, rel)
        prob *= p

        # call get_tree_prob recursively for each subtree
        for subtree in tree:
            prob *= get_tree_prob(subtree, lmk, rel)

    return prob
def update_word_counts(cls, update, pos, word, prev_word,
                       lmk=None, lmk_class=None, lmk_ori_rels=None, lmk_color=None,
                       rel=None, rel_dist_class=None, rel_deg_class=None):
    cp_db = cls.get_word_counts(pos, word, lmk, lmk_class, lmk_ori_rels, lmk_color,
                                rel, rel_dist_class, rel_deg_class, prev_word)

    if cp_db.count() <= 0:
        if update > 0:
            return CWord(word=word,
                         pos=pos,
                         prev_word=prev_word,
                         landmark=lmk_id(lmk),
                         landmark_class=lmk_class,
                         landmark_orientation_relations=lmk_ori_rels,
                         landmark_color=lmk_color,
                         relation=rel,
                         relation_distance_class=rel_dist_class,
                         relation_degree_class=rel_deg_class,
                         count=update)
    else:
        # for cword in cp_db.all():
        #     print 'Count for %s before: %f' % (cword.word, cword.count)
        #     cword.count *= (1.0 + update)
        #     print 'Count for %s after: %f' % (cword.word, cword.count)
        ccounter = {}
        for cword in cp_db.all():
            # print cword.word, cword.count
            if cword.word in ccounter:
                ccounter[cword.word] += cword.count
            else:
                ccounter[cword.word] = cword.count
        # print '----------------'
        ckeys, ccounts = zip(*ccounter.items())

        ccounts = np.array(ccounts, dtype=float)
        ccounts /= ccounts.sum()
        updates = ccounts * update
        ups = dict(zip(ckeys, updates))

        for cword in cp_db.all():
            if cword.count <= -ups[cword.word]:
                cword.count = 1
            else:
                cword.count += ups[cword.word]
            # print cword.word, cword.count

    session.commit()
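# A toy example (made-up numbers) of how the update above is redistributed:
# the matching rows hold counts 6.0 and 2.0 for two words, and update = -4.0;
# each word absorbs its proportional share of the update, with counts floored
# at 1 if they would otherwise go non-positive:
import numpy as np

counts = np.array([6.0, 2.0])
shares = counts / counts.sum()  # [0.75, 0.25]
print shares * -4.0             # [-3.0, -1.0]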
def get_expansion(lhs, parent=None, lmk=None, rel=None):
    p_db = Production.get_productions(lhs=lhs, parent=parent,
                                      lmk=lmk_id(lmk), rel=rel_type(rel))

    counter = collections.Counter(p_db)
    keys, counts = zip(*counter.items())
    counts = np.array(counts, dtype=float)  # float, so the division below is not integer division
    counts /= counts.sum()

    prod, prod_prob, prod_entropy = categorical_sample(keys, counts)
    print 'expanding:', prod, prod_prob, prod_entropy
    return prod.rhs, prod_prob, prod_entropy
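# For orientation, a hypothetical top-level call tying get_expansion and
# get_words together. 'LOCATION-PHRASE' is assumed to be the root nonterminal,
# since get_tree_prob treats it as the only parentless node:
expansion, e_prob, e_ent = get_expansion('LOCATION-PHRASE', lmk=lmk, rel=rel)
words, w_prob, w_ent = get_words(expansion, 'LOCATION-PHRASE', lmk=lmk, rel=rel)
print words, e_prob * w_prob, e_ent + w_ent  # words is a nested list of strings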
def save_tree(tree, loc, rel, lmk, parent=None):
    if len(tree.productions()) == 1:
        # if this tree only has one production
        # it means that its child is a terminal (word)
        word = Word()
        word.word = tree[0]
        word.pos = tree.node
        word.parent = parent
        word.location = loc
    else:
        prod = Production()
        prod.lhs = tree.node
        prod.rhs = ' '.join(n.node for n in tree)
        prod.parent = parent
        prod.location = loc

        # some productions are related to semantic representation
        if prod.lhs == 'RELATION':
            prod.relation = rel_type(rel)
            if hasattr(rel, 'measurement'):
                prod.relation_distance_class = rel.measurement.best_distance_class
                prod.relation_degree_class = rel.measurement.best_degree_class
        elif prod.lhs == 'LANDMARK-PHRASE':
            prod.landmark = lmk_id(lmk)
            prod.landmark_class = lmk.object_class
            prod.landmark_orientation_relations = get_lmk_ori_rels_str(lmk)
            prod.landmark_color = lmk.color
            # next landmark phrase will need the parent landmark
            lmk = parent_landmark(lmk)
        elif prod.lhs == 'LANDMARK':
            # LANDMARK has the same landmark as its parent LANDMARK-PHRASE
            prod.landmark = parent.landmark
            prod.landmark_class = parent.landmark_class
            prod.landmark_orientation_relations = parent.landmark_orientation_relations
            prod.landmark_color = parent.landmark_color

        # save subtrees, keeping track of parent
        for subtree in tree:
            save_tree(subtree, loc, rel, lmk, prod)
# convert variables to the right types
xloc = float(xloc)
yloc = float(yloc)
loc = (xloc, yloc)

parse = ParentedTree.parse(parse)
modparse = ParentedTree.parse(modparse)

# how many ancestors should the sampled landmark have?
num_ancestors = count_lmk_phrases(modparse) - 1

# sample `args.iterations` times for each sentence
for _ in xrange(args.iterations):
    lmk, rel = get_meaning(loc, num_ancestors)

    if args.verbose:
        print "utterance:", repr(sentence)
        print "location: %s" % repr(loc)
        print "landmark: %s (%s)" % (lmk, lmk_id(lmk))
        print "relation: %s" % rel_type(rel)
        print "parse:"
        print parse.pprint()
        print "modparse:"
        print modparse.pprint()
        print "-" * 70

    location = Location(x=xloc, y=yloc)
    save_tree(modparse, location, rel, lmk)
    Bigram.make_bigrams(location.words)
    Trigram.make_trigrams(location.words)
    session.commit()
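# NOTE: `count_lmk_phrases` is not defined in this section. A plausible sketch,
# assuming it simply counts LANDMARK-PHRASE nodes in the modified parse (each
# nested LANDMARK-PHRASE adds one ancestor of the sampled landmark):
def count_lmk_phrases(tree):
    return sum(1 for t in tree.subtrees() if t.node == 'LANDMARK-PHRASE')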
        print 'Failed to parse %d [%s] [%s] [%s]' % (i, sentence, parse, modparse)
        continue

    # sample `args.iterations` times for each sentence
    for _ in xrange(args.iterations):
        lmk, rel = get_meaning(loc, num_ancestors)
        lmk, _, _ = lmk
        rel, _, _ = rel
        assert not isinstance(lmk, tuple)
        assert not isinstance(rel, tuple)

        if args.verbose:
            print 'utterance:', repr(sentence)
            print 'location: %s' % repr(loc)
            print 'landmark: %s (%s)' % (lmk, lmk_id(lmk))
            print 'relation: %s' % rel_type(rel)
            print 'parse:'
            print parse.pprint()
            print 'modparse:'
            print modparse.pprint()
            print '-' * 70

        location = Location(x=xloc, y=yloc)
        save_tree(modparse, location, rel, lmk)
        Bigram.make_bigrams(location.words)
        Trigram.make_trigrams(location.words)

        if i % 200 == 0:
            session.commit()

if SentenceParse.query().count() == 0:
def update_word_counts(cls, update, pos, word, prev_word,
                       lmk=None, lmk_class=None, lmk_ori_rels=None, lmk_color=None,
                       rel=None, rel_dist_class=None, rel_deg_class=None,
                       golden=False, multiply=False):
    # logger( 'Really gonna multiply??? %s' % multiply, 'okgreen' )
    # if multiply:
    #     cp_db = cls.get_word_counts(pos=pos,
    #                                 lmk=lmk,
    #                                 lmk_class=lmk_class,
    #                                 lmk_ori_rels=lmk_ori_rels,
    #                                 lmk_color=lmk_color,
    #                                 rel=rel,
    #                                 rel_dist_class=rel_dist_class,
    #                                 rel_deg_class=rel_deg_class,
    #                                 prev_word=prev_word,
    #                                 golden=golden)
    #     if cp_db.count() <= 0:
    #         update *= 10
    #         # logger( 'Count was zero', 'okgreen' )
    #     else:
    #         ccounter = defaultdict(int)
    #         ccounter[word] = 0
    #         for cword in cp_db.all():
    #             ccounter[cword.word] += cword.count
    #         ckeys, ccounts = zip(*ccounter.items())
    #         ccounts = np.array(ccounts, dtype=float)
    #         total = ccounts.sum()
    #         update *= total

    cp_db = cls.get_word_counts(pos=pos,
                                word=word,
                                lmk=lmk,
                                lmk_class=lmk_class,
                                lmk_ori_rels=lmk_ori_rels,
                                lmk_color=lmk_color,
                                rel=rel,
                                rel_dist_class=rel_dist_class,
                                rel_deg_class=rel_deg_class,
                                prev_word=prev_word,
                                golden=golden)

    committed = False
    while not committed:
        try:
            num_results = cp_db.count()
            if num_results <= 0:
                if update <= 0:
                    return
                # logger( 'Updating by %f, %f' % (update, update), 'warning')
                count = update
                CWord(word=word,
                      pos=pos,
                      prev_word=prev_word,
                      landmark=lmk_id(lmk),
                      landmark_class=lmk_class,
                      landmark_orientation_relations=lmk_ori_rels,
                      landmark_color=lmk_color,
                      relation=rel,
                      relation_distance_class=rel_dist_class,
                      relation_degree_class=rel_deg_class,
                      count=count)
            # elif num_results == 1:
            #     cword = cp_db.one()
            #     if multiply:
            #         cword.count *= 1 + update
            #         if cword.count < 1: cword.count = 1
            #     else:
            #         if cword.count <= -update: cword.count = 1
            #         else: cword.count += update
            else:
                ccounter = {}
                for cword in cp_db.all():
                    # print cword.word, cword.count
                    if cword.word in ccounter:
                        ccounter[cword.word] += cword.count
                    else:
                        ccounter[cword.word] = cword.count
                # print '----------------'
                ckeys, ccounts = zip(*ccounter.items())

                ccounts = np.array(ccounts, dtype=float)
                ccounts /= ccounts.sum()
                updates = ccounts * update
                ups = dict(zip(ckeys, updates))

                if multiply:
                    for cword in cp_db.all():
                        # logger( 'Updating by %f, %f' % (update, ups[cword.word]), 'warning')
                        assert not np.isnan(ups[cword.word])
                        cword.count *= 1 + ups[cword.word]
                        if cword.count < 1:
                            cword.count = 1
                else:
                    for cword in cp_db.all():
                        # logger( 'Updating by %f, %f' % (update, ups[cword.word]), 'warning')
                        if cword.count <= -ups[cword.word]:
                            cword.count = 1
                        else:
                            cword.count += ups[cword.word]

            session().commit()
            committed = True
        except Exception as e:
            logger('Could not commit', 'warning')
            logger(e)
            session().rollback()
            continue
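# Hypothetical calls illustrating the two update modes; the argument values are
# illustrative only, and the method is assumed to live on CWord since it
# constructs CWord rows via cls. By default the update is spread additively
# over the matching rows in proportion to their counts; with multiply=True each
# count is instead scaled by one plus its share of the update:
CWord.update_word_counts(-2.0, 'NN', 'table', 'the', lmk_class='table', golden=True)
CWord.update_word_counts(0.5, 'NN', 'table', 'the', lmk_class='table',
                         golden=True, multiply=True)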