def delete_word(limit, terminals, words, lmk=None, rel=None):
    """Delete up to `limit` stored words for each aligned (POS, word) pair.

    Returns a list of per-pair deletion counts, in input order.
    """
    # one Word.delete_words call per (terminal, word) pair
    return [Word.delete_words(limit, pos=pos, word=w,
                              lmk=lmk_id(lmk), rel=rel_type(rel))
            for pos, w in zip(terminals, words)]
Exemple #2
0
def remove_expansion(limit, lhs, rhs, parent=None, lmk=None, rel=None):
    """Delete up to `limit` stored productions matching this expansion."""
    criteria = dict(lhs=lhs,
                    rhs=rhs,
                    parent=parent,
                    lmk=lmk_id(lmk),
                    rel=rel_type(rel))
    return Production.delete_productions(limit, **criteria)
def get_words(expn, parent, lmk=None, rel=None):
    words = []
    probs = []
    entropy = []

    for n in expn.split():
        if n in NONTERMINALS:
            if n == parent == 'LANDMARK-PHRASE':
                # we need to move to the parent landmark
                lmk = parent_landmark(lmk)
            # we need to keep expanding
            expansion, exp_prob, exp_ent = get_expansion(n, parent, lmk, rel)
            w, w_prob, w_ent = get_words(expansion, n, lmk, rel)
            words.append(w)
            probs.append(exp_prob * w_prob)
            entropy.append(exp_ent + w_ent)
        else:
            # get word for POS
            w_db = Word.get_words(pos=n, lmk=lmk_id(lmk), rel=rel_type(rel))
            counter = collections.Counter(w_db)
            keys, counts = zip(*counter.items())
            counts = np.array(counts)
            counts /= counts.sum()
            w, w_prob, w_entropy = categorical_sample(keys, counts)
            words.append(w.word)
            probs.append(w.prob)
            entropy.append(w_entropy)
    p, H = np.prod(probs), np.sum(entropy)
    print 'expanding %s to %s (p: %f, H: %f)' % (expn, words, p, H)
    return words, p, H
def get_tree_prob(tree, lmk=None, rel=None):
    """Compute the probability of a parse `tree` under the stored CPTs.

    Terminal productions contribute a WordCPT probability; internal nodes
    contribute an ExpansionCPT probability, and the function recurses over
    all subtrees, multiplying the results together.  The `lmk`/`rel`
    conditioning context is adjusted while descending: RELATION subtrees
    drop the landmark, LANDMARK-PHRASE subtrees drop the relation (and
    nested LANDMARK-PHRASEs move to the parent landmark).
    """
    prob = 1.0

    if len(tree.productions()) == 1:
        # if this tree only has one production
        # it means that its child is a terminal (word)
        word = tree[0]
        pos = tree.node

        p = WordCPT.probability(word=word, pos=pos,
                                    lmk=lmk_id(lmk), rel=rel_type(rel))
        print p, pos, '->', word, m2s(lmk,rel)
        prob *= p

    else:
        lhs = tree.node
        rhs = ' '.join(n.node for n in tree)
        parent = tree.parent().node if tree.parent() else None

        if lhs == 'RELATION':
            # everything under a RELATION node should ignore the landmark
            lmk = None
        elif lhs == 'LANDMARK-PHRASE':
            # everything under a LANDMARK-PHRASE node should ignore the relation
            rel = None

            if parent == 'LANDMARK-PHRASE':
                # if the current node is a LANDMARK-PHRASE and the parent node
                # is also a LANDMARK-PHRASE then we should move to the parent
                # of the current landmark
                lmk = parent_landmark(lmk)

        if not parent:
            # LOCATION-PHRASE has no parent and is not related to lmk and rel
            p = ExpansionCPT.probability(rhs=rhs, lhs=lhs)
            print p, repr(lhs), '->', repr(rhs)
        else:
            p = ExpansionCPT.probability(rhs=rhs, lhs=lhs, parent=parent,
                                             lmk=lmk_id(lmk), rel=rel_type(rel))
            print p, repr(lhs), '->', repr(rhs), 'parent=%r'%parent, m2s(lmk,rel)
        prob *= p

        # call get_tree_prob recursively for each subtree
        for subtree in tree:
            prob *= get_tree_prob(subtree, lmk, rel)

    return prob
Exemple #5
0
    def update_word_counts(cls,
                           update,
                           pos,
                           word,
                           prev_word,
                           lmk=None,
                           lmk_class=None,
                           lmk_ori_rels=None,
                           lmk_color=None,
                           rel=None,
                           rel_dist_class=None,
                           rel_deg_class=None):
        """Apply `update` to the counts of CWord rows matching this context.

        If no row matches, a new CWord row is created with count `update`
        (positive updates only -- there is nothing to decrement).  When
        rows exist, `update` is distributed across them in proportion to
        their current counts, clamping each count at a minimum of 1.
        """
        cp_db = cls.get_word_counts(pos, word, lmk, lmk_class, lmk_ori_rels,
                                    lmk_color, rel, rel_dist_class,
                                    rel_deg_class, prev_word)

        if cp_db.count() <= 0:
            # BUG FIX: the guard was inverted (`if update > 0: return`),
            # which dropped positive evidence and stored rows with
            # non-positive counts.  Only create a row for positive updates,
            # as in the other version of this method.
            if update <= 0: return
            CWord(word=word,
                  pos=pos,
                  prev_word=prev_word,
                  landmark=lmk_id(lmk),
                  landmark_class=lmk_class,
                  landmark_orientation_relations=lmk_ori_rels,
                  landmark_color=lmk_color,
                  relation=rel,
                  relation_distance_class=rel_dist_class,
                  relation_degree_class=rel_deg_class,
                  count=update)
        else:
            # pool the counts per distinct word (several rows may share one)
            ccounter = {}
            for cword in cp_db.all():
                if cword.word in ccounter: ccounter[cword.word] += cword.count
                else: ccounter[cword.word] = cword.count

            ckeys, ccounts = zip(*ccounter.items())

            # normalize to a distribution, then split `update` proportionally
            ccounts = np.array(ccounts, dtype=float)
            ccounts /= ccounts.sum()
            updates = ccounts * update
            ups = dict(zip(ckeys, updates))

            for cword in cp_db.all():
                # clamp at 1 so a row's count never goes to zero or below
                if cword.count <= -ups[cword.word]: cword.count = 1
                else: cword.count += ups[cword.word]

        session.commit()
Exemple #6
0
    def update_word_counts(cls,
                           update,
                           pos,
                           word,
                           prev_word,
                           lmk=None,
                           lmk_class=None,
                           lmk_ori_rels=None,
                           lmk_color=None,
                           rel=None,
                           rel_dist_class=None,
                           rel_deg_class=None):
        """Distribute `update` over the counts of matching CWord rows.

        Creates a fresh CWord row (count = `update`) when nothing matches
        yet and the update is positive; otherwise shares `update` among the
        matching rows proportionally to their current counts, never letting
        a count fall below 1.
        """
        cp_db = cls.get_word_counts(pos, word, lmk, lmk_class, lmk_ori_rels, lmk_color, rel, rel_dist_class, rel_deg_class, prev_word)

        if cp_db.count() <= 0:
            # BUG FIX: guard was inverted (`if update > 0: return`), which
            # discarded positive updates and created rows with non-positive
            # counts; negative updates on a nonexistent row are a no-op.
            if update <= 0: return
            CWord(word=word,
                  pos=pos,
                  prev_word=prev_word,
                  landmark=lmk_id(lmk),
                  landmark_class=lmk_class,
                  landmark_orientation_relations=lmk_ori_rels,
                  landmark_color=lmk_color,
                  relation=rel,
                  relation_distance_class=rel_dist_class,
                  relation_degree_class=rel_deg_class,
                  count=update)
        else:
            # aggregate current counts per distinct word
            ccounter = {}
            for cword in cp_db.all():
                if cword.word in ccounter: ccounter[cword.word] += cword.count
                else: ccounter[cword.word] = cword.count

            ckeys, ccounts = zip(*ccounter.items())

            # normalize and apportion `update` by the current distribution
            ccounts = np.array(ccounts, dtype=float)
            ccounts /= ccounts.sum()
            updates = ccounts * update
            ups = dict( zip(ckeys, updates) )

            for cword in cp_db.all():
                # counts are clamped at 1 rather than driven negative
                if cword.count <= -ups[cword.word]: cword.count = 1
                else: cword.count += ups[cword.word]

        session.commit()
Exemple #7
0
def delete_word(limit, terminals, words, lmk=None, rel=None):
    """Remove stored words for each aligned (POS, word) pair.

    Returns the number of deletions performed for each pair.
    """
    deleted_counts = []
    for pos_tag, token in zip(terminals, words):
        # delete this token under its POS tag, within the lmk/rel context
        n = Word.delete_words(limit,
                              pos=pos_tag,
                              word=token,
                              lmk=lmk_id(lmk),
                              rel=rel_type(rel))
        deleted_counts.append(n)
    return deleted_counts
def get_expansion(lhs, parent=None, lmk=None, rel=None):
    p_db = Production.get_productions(lhs=lhs, parent=parent,
                                      lmk=lmk_id(lmk), rel=rel_type(rel))

    counter = collections.Counter(p_db)
    keys, counts = zip(*counter.items())
    counts = np.array(counts)
    counts /= counts.sum()

    prod, prod_prob, prod_entropy = categorical_sample(keys, counts)
    print 'expanding:', prod, prod_prob, prod_entropy
    return prod.rhs, prod_prob, prod_entropy
Exemple #9
0
def save_tree(tree, loc, rel, lmk, parent=None):
    """Persist a parse tree as Word and Production records.

    Terminal subtrees become Word rows; internal nodes become Production
    rows annotated with relation/landmark information depending on their
    lhs, then the function recurses over the children with the new
    Production as their parent.
    """
    if len(tree.productions()) == 1:
        # a single production means the only child is a terminal (a word)
        leaf = Word()
        leaf.word = tree[0]
        leaf.pos = tree.node
        leaf.parent = parent
        leaf.location = loc
        return

    node = Production()
    node.lhs = tree.node
    node.rhs = ' '.join(child.node for child in tree)
    node.parent = parent
    node.location = loc

    # attach semantic annotations according to the lhs
    if node.lhs == 'RELATION':
        node.relation = rel_type(rel)
        if hasattr(rel, 'measurement'):
            node.relation_distance_class = rel.measurement.best_distance_class
            node.relation_degree_class = rel.measurement.best_degree_class

    elif node.lhs == 'LANDMARK-PHRASE':
        node.landmark = lmk_id(lmk)
        node.landmark_class = lmk.object_class
        node.landmark_orientation_relations = get_lmk_ori_rels_str(lmk)
        node.landmark_color = lmk.color
        # the next landmark phrase refers to the parent landmark
        lmk = parent_landmark(lmk)

    elif node.lhs == 'LANDMARK':
        # LANDMARK inherits its landmark info from the parent LANDMARK-PHRASE
        node.landmark = parent.landmark
        node.landmark_class = parent.landmark_class
        node.landmark_orientation_relations = parent.landmark_orientation_relations
        node.landmark_color = parent.landmark_color

    # recurse, threading the freshly created production as the parent
    for child in tree:
        save_tree(child, loc, rel, lmk, node)
Exemple #10
0
def save_tree(tree, loc, rel, lmk, parent=None):
    """Store `tree` in the database, one row per tree node.

    A terminal production is saved as a Word row; anything else is saved
    as a Production row (with relation/landmark annotations where the lhs
    calls for them) and its subtrees are saved recursively underneath it.
    """
    is_terminal = len(tree.productions()) == 1
    if is_terminal:
        # only child is a terminal symbol, i.e. an actual word
        word_row = Word()
        word_row.word = tree[0]
        word_row.pos = tree.node
        word_row.parent = parent
        word_row.location = loc
    else:
        prod_row = Production()
        prod_row.lhs = tree.node
        prod_row.rhs = ' '.join(sub.node for sub in tree)
        prod_row.parent = parent
        prod_row.location = loc

        lhs_label = prod_row.lhs
        if lhs_label == 'RELATION':
            # relations carry their type and, when measured, distance/degree
            prod_row.relation = rel_type(rel)
            if hasattr(rel, 'measurement'):
                prod_row.relation_distance_class = rel.measurement.best_distance_class
                prod_row.relation_degree_class = rel.measurement.best_degree_class
        elif lhs_label == 'LANDMARK-PHRASE':
            prod_row.landmark = lmk_id(lmk)
            prod_row.landmark_class = lmk.object_class
            prod_row.landmark_orientation_relations = get_lmk_ori_rels_str(lmk)
            prod_row.landmark_color = lmk.color
            # descend to the parent landmark for any nested landmark phrase
            lmk = parent_landmark(lmk)
        elif lhs_label == 'LANDMARK':
            # copy the landmark info from the enclosing LANDMARK-PHRASE row
            prod_row.landmark = parent.landmark
            prod_row.landmark_class = parent.landmark_class
            prod_row.landmark_orientation_relations = parent.landmark_orientation_relations
            prod_row.landmark_color = parent.landmark_color

        # save subtrees, keeping track of the parent production
        for sub in tree:
            save_tree(sub, loc, rel, lmk, prod_row)
def remove_expansion(limit, lhs, rhs, parent=None, lmk=None, rel=None):
    """Delete up to `limit` productions for this expansion and context."""
    return Production.delete_productions(
        limit,
        lhs=lhs,
        rhs=rhs,
        parent=parent,
        lmk=lmk_id(lmk),
        rel=rel_type(rel),
    )
Exemple #12
0
        # convert variables to the right types
        xloc = float(xloc)
        yloc = float(yloc)
        loc = (xloc, yloc)
        parse = ParentedTree.parse(parse)
        modparse = ParentedTree.parse(modparse)

        # how many ancestors should the sampled landmark have?
        num_ancestors = count_lmk_phrases(modparse) - 1

        # sample `args.iterations` times for each sentence
        for _ in xrange(args.iterations):
            lmk, rel = get_meaning(loc, num_ancestors)

            if args.verbose:
                print "utterance:", repr(sentence)
                print "location: %s" % repr(loc)
                print "landmark: %s (%s)" % (lmk, lmk_id(lmk))
                print "relation: %s" % rel_type(rel)
                print "parse:"
                print parse.pprint()
                print "modparse:"
                print modparse.pprint()
                print "-" * 70

            location = Location(x=xloc, y=yloc)
            save_tree(modparse, location, rel, lmk)
            Bigram.make_bigrams(location.words)
            Trigram.make_trigrams(location.words)
            session.commit()
Exemple #13
0
            print 'Failed to parse %d [%s] [%s] [%s]' % (i, sentence, parse, modparse)
            continue

        # sample `args.iterations` times for each sentence
        for _ in xrange(args.iterations):
            lmk, rel = get_meaning(loc, num_ancestors)
            lmk, _, _ = lmk
            rel, _, _ = rel

            assert(not isinstance(lmk, tuple))
            assert(not isinstance(rel, tuple))

            if args.verbose:
                print 'utterance:', repr(sentence)
                print 'location: %s' % repr(loc)
                print 'landmark: %s (%s)' % (lmk, lmk_id(lmk))
                print 'relation: %s' % rel_type(rel)
                print 'parse:'
                print parse.pprint()
                print 'modparse:'
                print modparse.pprint()
                print '-' * 70

            location = Location(x=xloc, y=yloc)
            save_tree(modparse, location, rel, lmk)
            Bigram.make_bigrams(location.words)
            Trigram.make_trigrams(location.words)

        if i % 200 == 0: session.commit()

    if SentenceParse.query().count() == 0:
Exemple #14
0
    def update_word_counts(cls,
                           update,
                           pos,
                           word,
                           prev_word,
                           lmk=None,
                           lmk_class=None,
                           lmk_ori_rels=None,
                           lmk_color=None,
                           rel=None,
                           rel_dist_class=None,
                           rel_deg_class=None,
                           golden=False,
                           multiply=False):
        """Apply `update` to the counts of CWord rows matching this context.

        If no row matches, a new CWord row is created with count `update`
        (positive updates only).  Otherwise `update` is shared across the
        matching rows in proportion to their current counts; with
        `multiply` true the shares are applied multiplicatively
        (count *= 1 + share) instead of additively.  Counts are clamped
        so they never drop below 1.  The whole read-modify-write is
        retried until the session commit succeeds.

        golden -- passed through to get_word_counts to select the golden
                  data set; presumably filters which rows are matched --
                  TODO confirm against get_word_counts.
        """

        # logger( 'Really gonna multiply??? %s' % multiply, 'okgreen' )
        # if multiply:
        #     cp_db = cls.get_word_counts(pos=pos,
        #                                 lmk=lmk,
        #                                 lmk_class=lmk_class,
        #                                 lmk_ori_rels=lmk_ori_rels,
        #                                 lmk_color=lmk_color,
        #                                 rel=rel,
        #                                 rel_dist_class=rel_dist_class,
        #                                 rel_deg_class=rel_deg_class, 
        #                                 prev_word=prev_word,
        #                                 golden=golden)
        #     if cp_db.count() <= 0:
        #         update *= 10
        #         # logger( 'Count was zero', 'okgreen' )
        #     else:
        #         ccounter = defaultdict(int)
        #         ccounter[word] = 0
        #         for cword in cp_db.all():
        #             ccounter[cword.word] += cword.count

        #         ckeys, ccounts = zip(*ccounter.items())
        #         ccounts = np.array(ccounts, dtype=float)
        #         total = ccounts.sum()
        #         update *= total

        # all rows matching this word-in-context
        cp_db = cls.get_word_counts(pos=pos, 
                                    word=word,
                                    lmk=lmk,
                                    lmk_class=lmk_class,
                                    lmk_ori_rels=lmk_ori_rels,
                                    lmk_color=lmk_color,
                                    rel=rel,
                                    rel_dist_class=rel_dist_class,
                                    rel_deg_class=rel_deg_class, 
                                    prev_word=prev_word,
                                    golden=golden)

        # retry the read-modify-write loop until the commit sticks
        committed = False
        while not committed:

            try:
                num_results = cp_db.count()
                if num_results <= 0:
                    # nothing to decrement if no row exists yet
                    if update <= 0: return
                    # logger( 'Updating by %f, %f' % (update, update), 'warning')
                    count = update
                    CWord(word=word,
                          pos=pos,
                          prev_word=prev_word,
                          landmark=lmk_id(lmk),
                          landmark_class=lmk_class,
                          landmark_orientation_relations=lmk_ori_rels,
                          landmark_color=lmk_color,
                          relation=rel,
                          relation_distance_class=rel_dist_class,
                          relation_degree_class=rel_deg_class,
                          count=count)

                # elif num_results == 1:

                #     cword = cp_db.one()
                #     if multiply:
                #         # logger( 'Updating by %f, %f' % (update, ups[cword.word]), 'warning')
                #         cword.count *= 1+update
                #         if cword.count < 1: cword.count = 1
                #     else:
                #         # logger( 'Updating by %f, %f' % (update, ups[cword.word]), 'warning')
                #         if cword.count <= -update: cword.count = 1
                #         else: cword.count += update

                else:

                    # pool current counts per distinct word
                    ccounter = {}
                    for cword in cp_db.all():
                        # print cword.word, cword.count
                        if cword.word in ccounter: ccounter[cword.word] += cword.count
                        else: ccounter[cword.word] = cword.count

                    # print '----------------'

                    ckeys, ccounts = zip(*ccounter.items())

                    # normalize to a distribution and split `update` proportionally
                    ccounts = np.array(ccounts, dtype=float)
                    ccounts /= ccounts.sum()
                    updates = ccounts * update
                    ups = dict( zip(ckeys, updates) )

                    if multiply:
                        for cword in cp_db.all():
                            # logger( 'Updating by %f, %f' % (update, ups[cword.word]), 'warning')
                            assert( not np.isnan( ups[cword.word] ) )
                            cword.count *= 1+ups[cword.word]
                            if cword.count < 1: cword.count = 1
                    else:
                        for cword in cp_db.all():
                            # logger( 'Updating by %f, %f' % (update, ups[cword.word]), 'warning')
                            # clamp at 1 so a count never goes to zero or below
                            if cword.count <= -ups[cword.word]: cword.count = 1
                            else: cword.count += ups[cword.word]

                session().commit()
                committed = True
            except Exception as e:
                # commit failed (e.g. concurrent writer); roll back and retry
                logger( 'Could not commit', 'warning' )
                logger( e )
                session().rollback()
                continue
Exemple #15
0
            print 'Failed to parse %d [%s] [%s] [%s]' % (i, sentence, parse, modparse)
            continue

        # sample `args.iterations` times for each sentence
        for _ in xrange(args.iterations):
            lmk, rel = get_meaning(loc, num_ancestors)
            lmk, _, _ = lmk
            rel, _, _ = rel

            assert(not isinstance(lmk, tuple))
            assert(not isinstance(rel, tuple))

            if args.verbose:
                print 'utterance:', repr(sentence)
                print 'location: %s' % repr(loc)
                print 'landmark: %s (%s)' % (lmk, lmk_id(lmk))
                print 'relation: %s' % rel_type(rel)
                print 'parse:'
                print parse.pprint()
                print 'modparse:'
                print modparse.pprint()
                print '-' * 70

            location = Location(x=xloc, y=yloc)
            save_tree(modparse, location, rel, lmk)
            Bigram.make_bigrams(location.words)
            Trigram.make_trigrams(location.words)

        if i % 200 == 0: session.commit()

    for sentence,(parse,modparse) in unique_sentences.items():