Beispiel #1
0
    # Keeping track of the dataset name
    config.dataset_name = "-".join(args.test_file.split("data/")[1].split("/")[0:3]) #todo: os agnostic

    # Set up output dir
    config.experiment_out_dir = os.path.join(
        config.experiment_out_dir, 'results', config.clustering_scheme, ts)
    output_dir = config.experiment_out_dir

    copy_source_to_dir(output_dir,config)

    # assert config.best_model is not None
    # assert config.partition_threshold is not None

    model = coref.models.load_model(config)

    g = Graphviz()

    for i, f in enumerate(config.test_files):

        print('[BEGIN TEST] %s\n' % f)
        pts = []
        counter = 0
        for pt in Ment.load_ments(f, model=model):
            pts.append(pt)

        # Shuffle the data points
        rand.shuffle(pts)

        print('[CLUSTERING...]')
        clustering_time_start = time.time()
Beispiel #2
0
    def insert(self, p, p_idx):
        """
        Incrementally add p to the tree.

        The first point simply becomes the root. For every later point the
        method, in order:

        1. searches the nearest-neighbor structure for the closest existing
           node (optionally excluding multi-point nodes),
        2. lets ``find_insert`` pick the node to split at,
        3. adds the new leaf to the nearest-neighbor structure,
        4. splits the chosen node down, creating a new internal node,
        5. refreshes every node from the new internal node up to the root,
        6. adds the new internal node to the nearest-neighbor structure,
        7. if ``self.perform_graft`` is set, walks from the new internal node
           towards the root and proposes a graft at every step.

        Timing and comparison counters (``insert_time``, ``insert_comps``,
        ``grafting_time``, ``grafting_comps``, ``num_computations``) are
        accumulated along the way, and progress/timing lines are printed to
        stdout.

        :param p - (MentObject,GroundTruth,Id)
        :param p_idx - index of the point; used here only to name the
            tree / NSW dump files written when ``config.write_every_tree``
            is set.
        :return the newly created leaf node wrapping ``p``.
        """

        p_ment = MentNode([p], aproj=p[0].attributes)
        p_ment.cluster_marker = True
        start_time = time.time()
        print('Inserting p (%s,%s,%s) into tree ' % (p_ment.id, p[1], p[2]))
        if self.root is None:
            # Empty tree: the new leaf is the whole tree.
            self.root = p_ment
            self.nn_structure.insert(p_ment)
        else:
            # Find k nearest neighbors

            time_start_placement = time.time()
            if self.config.add_to_mention:
                # Exclude from the search every descendant that holds more
                # than one point (presumably the internal nodes -- confirm
                # against the node class) and has an NSW node attached.
                offlimits = set([
                    d.nsw_node for d in self.root.descendants()
                    if d.point_counter > 1 if d.nsw_node
                ])
            else:
                offlimits = set()

            insert_start_time = time.time()
            knn_and_score, num_searched_approx = self.nn_structure.knn_and_score_offlimits(
                p_ment, offlimits, k=self.nn_k, r=self.nsw_r)
            insert_end_time = time.time()
            # NOTE(review): both slots of insert_comps / insert_time always
            # receive the same increment in this method; what distinguishes
            # slot 0 from slot 1 is not visible here.
            self.insert_comps[0] += num_searched_approx
            self.insert_comps[1] += num_searched_approx
            self.insert_time[0] += insert_end_time - insert_start_time
            self.insert_time[1] += insert_end_time - insert_start_time
            self.num_computations += num_searched_approx

            # Each result is indexed as (score, nsw_node); `.v` is the tree
            # node behind the NSW node. Only the first result is used.
            approximate_closest_node, approx_closest_score = knn_and_score[0][
                1].v, knn_and_score[0][0]

            # Find where to be added / rotate
            insert_node, new_ap, new_score, time_before_rotation, time_finish_placement = self.find_insert(
                approximate_closest_node, p_ment)

            time_before_insert = time.time()
            # Add yourself to the knn structures
            num_comp_insertions = self.nn_structure.insert(p_ment)
            time_after_insert = time.time()

            self.insert_comps[0] += num_comp_insertions
            self.insert_comps[1] += num_comp_insertions
            self.insert_time[0] += time_after_insert - time_before_insert
            self.insert_time[1] += time_after_insert - time_before_insert

            # Add the point
            new_internal_node = insert_node.split_down(p_ment, new_ap,
                                                       new_score)

            # Sanity checks: the new leaf and the split node must now share a
            # root, and their lowest common ancestor must be the node that
            # split_down just returned.
            assert p_ment.root() == insert_node.root(
            ), "p_ment.root() %s == insert_node.root() %s" % (
                p_ment.root(), insert_node.root())
            assert p_ment.lca(
                insert_node
            ) == new_internal_node, "p_ment.lca(insert_node) %s == new_internal_node %s" % (
                p_ment.lca(insert_node), new_internal_node)

            # Update throughout the tree.
            if new_internal_node.parent:
                new_internal_node.parent.update_aps(p[0].attributes,
                                                    self.model.sub_ent_model)

            # update all the entity scores
            before_update_time = time.time()
            curr = new_internal_node
            new_leaf_anc = p_ment._ancestors()
            num_updates_here = 0
            # Walk from the new internal node up to (and including) the root.
            while curr:
                self.update_for_new(curr, p_ment, new_leaf_anc, True)
                curr = curr.parent
                num_updates_here += 1
            after_update_time = time.time()
            self.insert_comps[0] += num_updates_here
            self.insert_comps[1] += num_updates_here
            self.insert_time[0] += after_update_time - before_update_time
            self.insert_time[1] += after_update_time - before_update_time

            # Add the newly created node to the NN structure
            time_before_insert = time.time()
            num_comp_insertions = self.nn_structure.insert(new_internal_node)
            time_after_insert = time.time()
            self.insert_comps[0] += num_comp_insertions
            self.insert_comps[1] += num_comp_insertions
            self.insert_time[0] += time_after_insert - time_before_insert
            self.insert_time[1] += time_after_insert - time_before_insert

            # The split may have created a new top node; re-resolve the root.
            self.root = self.root.root()
            time_before_graft = time.time()
            total_graft_comps = 0
            if self.perform_graft:
                graft_index = 0

                # Propose one graft per node on the path from the new
                # internal node up to (but not including) the root.
                curr = new_internal_node
                while curr.parent:
                    time_before_this_graft = time.time()

                    # curr's siblings, descendants, ancestors and curr itself
                    # are excluded from the graft search.
                    time_before_offlimits = time.time()
                    offlimits = set([
                        x.nsw_node
                        for x in (curr.siblings() + curr.descendants() +
                                  curr._ancestors() + [curr])
                    ])
                    time_after_offlimits = time.time()

                    time_before_graft_nn_search = time.time()
                    knn_and_score, num_searched_approx = self.nn_structure.knn_and_score_mention(
                        curr, offlimits, k=self.nn_k, r=self.nsw_r)
                    time_after_graft_nn_search = time.time()
                    self.num_computations += num_searched_approx
                    total_graft_comps += num_searched_approx

                    if len(knn_and_score) > 0:
                        approximate_closest_node, approx_closest_score = knn_and_score[
                            0][1].v, knn_and_score[0][0]

                        # A node n may be grafted to curr only if it is not
                        # deleted, is not the root, is not already curr's
                        # sibling, and neither of curr / n is an ancestor of
                        # the other (their lca is a third node).
                        def allowable_graft(n):
                            if n.deleted:
                                print('Deleted')
                                return False
                            if n.parent is None:
                                return False
                            if curr in n.siblings():
                                return False
                            lca = curr.lca(n)
                            if lca != curr and lca != n:
                                return True
                            else:
                                return False

                        # NOTE(review): the allowability check on the initial
                        # candidate is disabled (hard-coded True), so the
                        # `not allowed` branch below never runs;
                        # allowable_graft is only applied to candidates
                        # reached by walking up in the loop further down.
                        allowed = True
                        if not allowed:
                            pass
                        else:
                            # A graft is accepted only if the merge score
                            # beats both curr's parent's score and the
                            # candidate's parent's score.
                            you_like_them_better = approx_closest_score > curr.parent.my_score
                            they_like_you_better = approx_closest_score > approximate_closest_node.parent.my_score

                            approx_says_perform_graft = you_like_them_better and they_like_you_better
                            is_allowable = True
                            # if you like them better than your current sibling, but they don't like you better then you
                            # want to check the parent of them.
                            # The walk stops once the candidate's parent has
                            # no parent of its own, or as soon as a candidate
                            # is not allowable.
                            while you_like_them_better \
                                    and not they_like_you_better \
                                    and is_allowable \
                                    and approximate_closest_node.parent \
                                    and approximate_closest_node.parent.parent:
                                approximate_closest_node = approximate_closest_node.parent
                                is_allowable = allowable_graft(
                                    approximate_closest_node)
                                if is_allowable:
                                    # Re-score the would-be merge of curr with
                                    # the new (larger) candidate.
                                    best_pw, best_pw_n1, best_pw_n2 = self.best_pairwise(
                                        curr, approximate_closest_node)
                                    new_ap_graft = self.hallucinate_merge(
                                        curr, approximate_closest_node,
                                        best_pw.data.numpy()[0])
                                    approx_closest_score = self.model.e_score(
                                        new_ap_graft).data.numpy()[0]
                                    total_graft_comps += 1
                                    you_like_them_better = approx_closest_score > curr.parent.my_score
                                    they_like_you_better = approx_closest_score > approximate_closest_node.parent.my_score

                                    approx_says_perform_graft = you_like_them_better and they_like_you_better

                            # Perform Graft
                            if approx_says_perform_graft:
                                # Remember the candidate's current sibling
                                # before the graft moves the candidate; its
                                # ancestors are re-scored afterwards.
                                approximate_closest_node_sib = approximate_closest_node.siblings(
                                )[0]

                                # Write the tree before the graft
                                if self.config.write_every_tree:
                                    Graphviz.write_tree(
                                        os.path.join(
                                            self.config.canopy_out,
                                            'tree_%s_before_graft_%s.gv' %
                                            (p_idx, graft_index)), self.root,
                                        [approximate_closest_node.id, curr.id],
                                        [p_ment.id])
                                best_pw, best_pw_n1, best_pw_n2 = self.best_pairwise(
                                    curr, approximate_closest_node)
                                new_ap_graft = self.hallucinate_merge(
                                    curr, approximate_closest_node,
                                    best_pw.data.numpy()[0])
                                new_graft_internal = curr.graft_to_me(
                                    approximate_closest_node,
                                    new_aproj=new_ap_graft,
                                    new_my_score=None
                                )  # No pairwise score is passed for the new node; it is re-scored just below.

                                # Update nodes

                                # This updates the ancestors of the current node after the graft
                                # (starting at the new graft parent itself), and
                                # re-points self.root at whichever node ends
                                # up without a parent.

                                before_update_time = time.time()
                                curr_update = new_graft_internal
                                while curr_update:
                                    e_score = self.score_np(curr_update)
                                    total_graft_comps += 1
                                    curr_update.my_score = e_score
                                    curr_update.as_ment.attributes.aproj_local[
                                        'es'] = e_score
                                    if curr_update.parent is None:
                                        self.root = curr_update
                                    curr_update = curr_update.parent
                                after_update_time = time.time()

                                # This updates the ancestors of the node which was grafted to you:
                                # start from the parent of its former sibling.
                                sibling_of_grafted_node = approximate_closest_node_sib
                                curr_update = sibling_of_grafted_node.parent
                                while curr_update:
                                    e_score = self.score_np(curr_update)
                                    total_graft_comps += 1
                                    curr_update.my_score = e_score
                                    curr_update.as_ment.attributes.aproj_local[
                                        'es'] = e_score
                                    curr_update = curr_update.parent

                                # Only the first update pass (new graft parent
                                # to root) is covered by this timing.
                                print('#TimeForUpdateInGraft\t%s\t%s' %
                                      (after_update_time - before_update_time,
                                       after_update_time - start_time))
                                # Make the new graft parent searchable.
                                insert_comps = self.nn_structure.insert(
                                    new_graft_internal)
                                total_graft_comps += insert_comps
                                # Write the tree after the graft
                                if self.config.write_every_tree:
                                    Graphviz.write_tree(
                                        os.path.join(
                                            self.config.canopy_out,
                                            'tree_%s_post_graft_%s.gv' %
                                            (p_idx, graft_index)), self.root,
                                        [approximate_closest_node.id, curr.id],
                                        [p_ment.id])

                    # Move one level up, whether or not a graft happened.
                    graft_index += 1
                    curr = curr.parent
                    time_after_this_graft = time.time()
                    print("#TimeAfterThisGraftProposal\t%s\t%s" %
                          (time_after_this_graft - time_before_this_graft,
                           time_after_this_graft - start_time))
                    end_time = time.time()
                    # curr has just reached the root, so the loop is about to
                    # end: record the totals for this insert's grafting phase.
                    if curr.parent is None:
                        self.grafting_time[0] += end_time - time_before_graft
                        self.grafting_time[1] += end_time - time_before_graft
                        self.grafting_comps[0] += total_graft_comps
                        self.grafting_comps[1] += total_graft_comps
                        print("#TimeAfterAllGrafts\t%s\t%s" %
                              (end_time - time_before_graft,
                               end_time - start_time))
        end_time = time.time()
        print('Done Inserting p (%s,%s,%s) into tree in %s seconds  ' %
              (p_ment.id, p[1], p[2], end_time - start_time))
        # Record the ground-truth label of the inserted point.
        self.observed_classes.add(p[1])
        sys.stdout.flush()
        # Optional per-insert dumps of the tree and, for the 'nsw' structure,
        # of the NSW graph.
        if self.config.write_every_tree:
            if len(self.config.canopy_out) > 0:
                Graphviz.write_tree(
                    os.path.join(self.config.canopy_out, 'tree_%s.gv' % p_idx),
                    self.root, [], [p_ment.id])
                if self.config.nn_structure == 'nsw':
                    GraphvizNSW.write_nsw(
                        os.path.join(self.config.canopy_out,
                                     'nsw_%s.gv' % p_idx), self.nn_structure)
        return p_ment
Beispiel #3
0
            print('%s already exists' % output_dir)

    if config.model_name == 'ByCanopyBaseline':
        model = ByCanopyBaseline()
    elif config.model_name == 'ByNameBaseline':
        model = ByNameBaseline()
    elif config.model_name == 'ByNameBaselineStrict':
        model = ByNameBaselineStrict()
    elif config.model_name == 'ByFirstNameBaseline':
        model = ByFirstNameBaseline()
    elif config.model_name == 'ByFirstNameBaselineStrict':
        model = ByFirstNameBaselineStrict()
    else:
        raise Exception("Unknown Model: {}".format(config.model_name))

    g = Graphviz()

    for i, f in enumerate(config.test_files):
        print('[BEGIN TEST] %s\n' % f)
        pts = []
        counter = 0
        if config.out_by_canopy:
            canopy_out = os.path.join(output_dir, config.out_by_canopy[i])
        else:
            canopy_out = os.path.join(output_dir, 'canopy_%s' % str(i))
        os.makedirs(canopy_out)
        with open('%s/predicted.tsv' % canopy_out, 'w') as predf:
            with open('%s/gold.tsv' % canopy_out, 'w') as goldf:
                print('[CLUSTERING...]')
                clustering_time_start = time.time()
                for m, m.gt, m.id in Ment.load_ments(f):
Beispiel #4
0
    def _try_graft_fast(self, curr, knn_and_score, offlimits, gnode, p_idx,
                        graft_index, start_time):
        """Try to find a graft for curr, reusing an earlier nn search.

        The first entry of knn_and_score whose node is NOT in offlimits is
        taken as the initial graft candidate ("other"); no new search is run.
        (Presumably knn_and_score is ordered best-first -- confirm against
        the nn-structure.)

        Starting from (curr, other), repeat until a graft happens or the walk
        must stop:

        * compute the merge score of curr and other;
        * if other does not prefer the merge to its current parent, replace
          other by its parent;
        * else if curr does not prefer the merge to its current parent,
          replace curr by its parent;
        * else (both prefer the merge) graft other next to curr, refresh the
          nodes on both affected root paths, and return.

        The walk stops without grafting when either node reaches their lowest
        common ancestor, when curr is already a sibling of other, when either
        node exceeds ``self.max_node_graft_size`` points, or -- if
        ``config.aggressive_rejection_stop`` is set -- as soon as NEITHER
        side prefers the merge.

        This function also does a bunch of logging.

        Args:
            curr - the node to initiate grafting from.
            knn_and_score - the results from the nn search that added curr to
                the NSW, as (score, nsw_node) pairs.
            offlimits - the nodes in the NSW that cannot be grafted.
            gnode - newly created node with new point (only used to highlight
                it in the tree dumps).
            p_idx - the point index (int)
            graft_index - number of times grafted so far
            start_time - time we started insert

        Returns:
            A pair ``(new_graft_internal, knn_and_score_valid)``. The first
            element is the node returned by ``graft_to_me`` when a graft was
            performed and ``None`` otherwise; the second is knn_and_score
            with the offlimits entries removed.
        """
        if self.config.debug:
            print('#tryGraftFast trying to graft \t%s' % (curr.id))

        # Keep only the neighbors we are allowed to graft.
        knn_and_score_valid = [(score, node) for score, node in knn_and_score
                               if node not in offlimits]

        if self.config.debug:
            print('#tryGraftFast number of nns %s, num valid \t%s' %
                  (len(knn_and_score), len(knn_and_score_valid)))

        # If there aren't enough nodes to explore just do nothing.
        if not knn_and_score_valid:
            if self.config.debug:
                print('#tryGraftFast no nn found for \t%s' % (curr.id))
            return None, knn_and_score_valid

        # `.v` is the tree node behind the NSW node of the best valid result.
        other = knn_and_score_valid[0][1].v
        if self.config.debug:
            print('#tryGraftFast found nn\t%s\t%s' % (curr.id, other.id))

        our_lca = curr.lca(other)

        while curr != our_lca and other != our_lca and curr not in other.siblings(
        ):

            if self.max_node_graft_size is not None:
                if other.point_counter > self.max_node_graft_size or curr.point_counter > self.max_node_graft_size:
                    if self.config.debug:
                        print(
                            '#tryGraftFast Breaking because of sizes\t%s\t%s\t%s\t%s'
                            % (curr.id, other.id, curr.point_counter,
                               other.point_counter))
                    break

            if self.config.debug:
                print('#tryGraftFast trying new parent\t%s\t%s' %
                      (curr.id, other.id))
                sys.stdout.flush()
            # Trying to speed up grafting:
            #  - if you don't like me, then go to your parent
            #  - if you like me, but I don't like you, go to my parent
            #  - if we both like each other, then graft and do another search.
            #  - if either of us gets to our lca, then stop, we shouldn't graft

            # Check if graft score is better than both of the parents scores.
            other_score = self.model.quick_e_score(curr.e_model, other.e_model)
            i_like_you = other_score > curr.parent.lazy_my_score()
            you_like_me = other_score > other.parent.lazy_my_score()

            if self.config.debug:
                print('#i_like_you and you_like me\t%s\t%s\t%s\t%s\t%s' %
                      (i_like_you, you_like_me, other_score,
                       curr.parent.lazy_my_score(),
                       other.parent.lazy_my_score()))

            # Aggressive stop: give up only on MUTUAL rejection. (The test
            # used to check `not you_like_me` twice, which stopped on any
            # rejection by `other` and made the "go to your parent" step
            # unreachable in this mode.)
            if self.config.aggressive_rejection_stop and not i_like_you and not you_like_me:
                if self.config.debug:
                    print(
                        '#tryGraftFast aggressive stop you dont like me and i dont like you'
                    )
                break

            if not you_like_me:
                other = other.parent
            elif you_like_me and not i_like_you:
                curr = curr.parent
            else:
                assert you_like_me and i_like_you
                print('#doingGraft')
                # We're going to graft.
                # [LOGGING] Write the tree before the graft
                if self.config.write_every_tree:
                    Graphviz.write_tree(
                        os.path.join(
                            self.config.canopy_out,
                            'tree_%s_before_graft_%s.gv' %
                            (p_idx, graft_index)), self.root,
                        [other.id, curr.id], [gnode.id])

                # Do the graft. Remember other's grandparent first: it is the
                # starting point for refreshing other's old root path below.
                assert other.parent
                prev_gp = other.parent.parent
                new_graft_internal = curr.graft_to_me(
                    other, new_aproj=None,
                    new_my_score=None)  # We don't want a pw guy here.

                # Update from new_graft_internal to the root.
                before_update_time = time.time()
                curr_update = new_graft_internal
                while curr_update:
                    curr_update.update_from_children()
                    curr_update = curr_update.parent
                after_update_time = time.time()

                print('#TimeForUpdateInGraft\t%s\t%s' %
                      (after_update_time - before_update_time,
                       after_update_time - start_time))

                # Update from previous parent to root.
                if prev_gp:
                    before_update_time = time.time()
                    curr_update = prev_gp
                    while curr_update:
                        curr_update.update_from_children()
                        curr_update = curr_update.parent
                    after_update_time = time.time()
                    print('#TimeForUpdateInPrevGPGraft\t%s\t%s' %
                          (after_update_time - before_update_time,
                           after_update_time - start_time))

                # NOTE: the new graft parent is deliberately not inserted
                # into the nn-structure here.
                # TODO AK: do we need this?
                self.root = new_graft_internal.root()

                # Write some trees.
                if self.config.write_every_tree:
                    Graphviz.write_tree(
                        os.path.join(
                            self.config.canopy_out,
                            'tree_%s_post_graft_%s.gv' % (p_idx, graft_index)),
                        self.root, [other.id, curr.id], [gnode.id])

                return new_graft_internal, knn_and_score_valid
        return None, knn_and_score_valid  # No graft found.
Beispiel #5
0
    def insert(self, p, p_idx):
        """Incrementally add p to the tree.

        Based on my parameters, either apply rotations and/or grafting. Steps:
        1) Query the nn-structure for the nearest neighbours of p with
           ``knn_and_insert`` (presumably this call also adds p to the
           nn-structure, since no separate insert is done -- confirm).
        2) Find the node to insert next to (``find_insert``, which may
           rotate).
        3) Add p to the tree by splitting down that node.
        4) Update the nodes on the path from the new internal node's parent
           to the root.
        5) Internal nodes are NOT added to the nn-structure any more.
        6) Try grafting, starting from the new internal node, while the
           current node has a parent and is below ``max_node_graft_size``:
        6.1)   Construct offlimits (siblings, leaves and the node itself).
        6.2)   Ask ``_try_graft_fast`` / ``_try_graft`` for a graft.
        6.3)   If a graft was done (the returned node has a parent): count
               it, add the returned node's leaves (excluding the current
               node, via ``leaves_excluding``) to offlimits and try again
               from the same node.
        6.4)   Otherwise, while fewer than ``self.beam`` attempts were made,
               add the parent's other leaves to offlimits and move up to the
               parent.
        6.5)   Otherwise stop grafting.

        Timing and bookkeeping lines are printed to stdout throughout.

        Args:
            p - (np.array, str, str); p[0] is handed to ``self.model.new``,
                p[1] is recorded in ``self.observed_classes`` and p[2] is
                used as the mention id.
            p_idx - int index, used only in the names of written tree files.

        Returns:
            Nothing.
        """
        # Diagnostic leftover: only show the scoring function in debug mode.
        if self.config.debug:
            print(self.my_score_f)

        # TODO (AK): node construction should be cleaner.
        gnode = GNode([p],
                      self.model.new(p[0], ment_id=p[2]),
                      my_score_f=self.my_score_f,
                      grinch=self)
        start_time = time.time()
        print('Inserting p (%s,%s) into tree ' % (p[1], p[2]))
        graft_index = 0
        if self.root is None:
            # First point: it becomes the whole tree.
            self.root = gnode
            self.nn_structure.insert(gnode)
        else:
            # If add_to_mention is True, then internal nodes are offlimits.
            if self.config.add_to_mention:
                offlimits = {
                    d.nsw_node for d in self.root.descendants()
                    if d.point_counter > 1 and d.nsw_node
                }
            else:
                offlimits = set()

            # Find the k-nn to gnode.
            knn_and_score, num_searched_approx = \
                self.nn_structure.knn_and_insert(
                    gnode, offlimits, k=self.nn_k, r=self.nsw_r)
            self.num_computations += num_searched_approx

            print('#num computations local and total\t%s\t%s' %
                  (num_searched_approx, self.num_computations))

            # Best hit is first; each entry is (score, nsw_node).
            approx_closest = knn_and_score[0][1].v

            if self.config.debug:
                print('#approx_closest\t%s\t%s' %
                      (p[1], [x[1] for x in approx_closest.pts]))

            # 2) Find where to be added / rotate. The two trailing timing
            #    values returned by find_insert are not used here.
            insert_node, new_ap, new_score, _, _ = self.find_insert(
                approx_closest, gnode, debug=self.config.debug)

            # 3) Add gnode to the tree.
            new_internal_node = insert_node.split_down(gnode, new_ap,
                                                       new_score)

            # 4) Update on the path from new_internal node to the root.
            if new_internal_node.parent:
                new_internal_node.parent.update_aps(gnode.e_model, None)

            # 5) Internal nodes are no longer added to the NSW.
            self.root = self.root.root()

            # 6) Try Grafting.
            if self.perform_graft and (self.max_node_graft_size is None
                                       or new_internal_node.point_counter <
                                       self.max_node_graft_size):
                time_before_graft = time.time()
                curr = new_internal_node
                # 6.1) Find offlimits
                offlimits = {
                    x.nsw_node
                    for x in (curr.siblings() + curr.leaves() + [curr])
                }

                graft_attempt = 0
                while curr and curr.parent and (
                        self.max_node_graft_size is None
                        or curr.point_counter < self.max_node_graft_size):
                    time_before_this_graft = time.time()
                    # Fast graft reuses the knn result of the insert; it is
                    # used for the first attempt (fast_graft) or for every
                    # attempt (fast_grafts_only).
                    if (self.config.fast_graft and graft_attempt
                            == 0) or self.config.fast_grafts_only:
                        curr_new, knn_and_score = self._try_graft_fast(
                            curr, knn_and_score, offlimits, gnode, p_idx,
                            graft_index, start_time)
                    else:
                        curr_new = self._try_graft(curr, offlimits, gnode,
                                                   p_idx, graft_index,
                                                   start_time)

                    # Only take the time to update offlimits if necessary.
                    # NM: If curr is not none, then the only new things to add to
                    # offlimits are the leaves of the node which was grafted to be
                    # the sibling of "curr"
                    if curr_new and curr_new.parent:
                        if self.config.debug:
                            print('#successfulGraftAttempt\t%s' %
                                  graft_attempt)
                        graft_index += 1
                        graft_attempt += 1
                        # curr is deliberately left unchanged: the next
                        # iteration tries to graft from the same node again.
                        offlimits.update(
                            x.nsw_node
                            for x in curr_new.leaves_excluding([curr]))
                    elif graft_attempt < self.beam:
                        # No graft: widen offlimits and retry one level up.
                        graft_attempt += 1
                        if curr.parent:
                            offlimits.update(
                                x.nsw_node
                                for x in curr.parent.leaves_excluding([curr]))
                        curr = curr.parent
                    else:
                        # Beam exhausted: curr_new is None or has no parent
                        # here, so the loop condition ends the search.
                        graft_attempt += 1
                        curr = curr_new

                    time_after_this_graft = time.time()
                    print("#TimeAfterThisGraftProposal\t%s\t%s" %
                          (time_after_this_graft - time_before_this_graft,
                           time_after_this_graft - start_time))

                end_time = time.time()
                print("#TimeAfterAllGrafts\t%s\t%s" %
                      (end_time - time_before_graft, end_time - start_time))
        end_time = time.time()
        print('Done Inserting p (%s,%s) into tree in %s seconds  ' %
              (p[1], p[2], end_time - start_time))
        print('#numgrafts\t%s' % graft_index)

        # Clean up and log.
        self.observed_classes.add(p[1])
        sys.stdout.flush()
        if self.config.write_every_tree and self.config.canopy_out:
            Graphviz.write_tree(
                os.path.join(self.config.canopy_out, 'tree_%s.gv' % p_idx),
                self.root, [], [gnode.id])
            if self.config.nn_structure == 'nsw':
                GraphvizNSW.write_nsw(
                    os.path.join(self.config.canopy_out,
                                 'nsw_%s.gv' % p_idx), self.nn_structure)
Beispiel #6
0
    def _try_graft(self, curr, offlimits, gnode, p_idx, graft_index,
                   start_time):
        """Try to find a graft for curr.

        Look through the NSW leaves FIRST for the closest non-offlimits node
        ("other") for curr. Then repeatedly score the pair:
          - if the score is not better than other.parent's score, move other
            up to its parent;
          - else if it is not better than curr.parent's score, move curr up
            to its parent;
          - else graft other next to curr, update both affected paths up to
            the root, reset self.root and return the new internal node.
        The walk stops without grafting as soon as either side reaches the
        lowest common ancestor of the original pair, curr is already a
        sibling of other, or other reaches ``max_node_graft_size``.
        This function also does a bunch of logging.

        Args:
            curr - the node to initiate grafting from.
            offlimits - the nodes in the NSW that cannot be grafted.
            gnode - newly created node with new point (used for logging the
                written trees only).
            p_idx - the point index (int)
            graft_index - number of times grafted so far
            start_time - time we started insert

        Returns:
            The internal node returned by ``curr.graft_to_me(other, ...)``
            when a graft was performed, otherwise None.
        """

        def update_path_to_root(node, label):
            # Refresh every node from `node` up to the root and log timing.
            before_update_time = time.time()
            while node:
                node.update_from_children()
                node = node.parent
            after_update_time = time.time()
            print('%s\t%s\t%s' %
                  (label, after_update_time - before_update_time,
                   after_update_time - start_time))

        if self.config.debug:
            print('#tryGraft trying to graft \t%s' % (curr.id))
        # First do a search for the closest leaf in the NSW.
        knn_and_score, num_searched_approx = \
            self.nn_structure.knn(
                curr, offlimits, k=self.nn_k, r=self.nsw_r)

        self.num_computations += num_searched_approx

        # If there aren't enough nodes to explore just do nothing.
        if not knn_and_score:
            return None

        # Best hit is first; each entry is (score, nsw_node). The search
        # score is not used: the pair is re-scored inside the loop below.
        other = knn_and_score[0][1].v
        if self.config.debug:
            print('#tryGraft found nn\t%s\t%s' % (curr.id, other.id))

        our_lca = curr.lca(other)

        while (curr != our_lca and other != our_lca
               and curr not in other.siblings()
               and (self.max_node_graft_size is None
                    or other.point_counter < self.max_node_graft_size)):
            if self.config.debug:
                print('#tryGraft trying new parent\t%s\t%s' %
                      (curr.id, other.id))
                sys.stdout.flush()
            # Trying to speed up grafting:
            #  - if you don't like me, then go to your parent
            #  - if you like me, but I don't like you, go to my parent
            #  - if we both like each other, then graft and do another search.
            #  - if either of us gets to our lca, then stop, we shouldn't graft

            # Check if graft score is better than both of the parents scores.
            graft_score = self.model.quick_e_score(curr.e_model,
                                                   other.e_model)
            curr_parent_score = curr.parent.lazy_my_score()
            i_like_you = graft_score > curr_parent_score
            other_parent_score = other.parent.lazy_my_score()
            you_like_me = graft_score > other_parent_score

            if self.config.debug:
                print('#i_like_you and you_like me\t%s\t%s\t%s\t%s\t%s' %
                      (i_like_you, you_like_me, graft_score,
                       curr_parent_score, other_parent_score))

            if not you_like_me:
                other = other.parent
                continue
            if not i_like_you:
                curr = curr.parent
                continue

            # Both sides prefer the merge: we're going to graft.
            print('#doingGraft')
            # [LOGGING] Write the tree before the graft
            if self.config.write_every_tree:
                Graphviz.write_tree(
                    os.path.join(
                        self.config.canopy_out,
                        'tree_%s_before_graft_%s.gv' %
                        (p_idx, graft_index)), self.root,
                    [other.id, curr.id], [gnode.id])

            # Do the graft. Remember other's grandparent first, because its
            # path has to be refreshed after other is moved away.
            assert other.parent
            prev_gp = other.parent.parent
            new_graft_internal = curr.graft_to_me(
                other, new_aproj=None,
                new_my_score=None)  # We don't want a pw guy here.

            # Update from new_graft_internal to the root.
            update_path_to_root(new_graft_internal, '#TimeForUpdateInGraft')

            # Update from previous parent to root.
            if prev_gp:
                update_path_to_root(prev_gp, '#TimeForUpdateInPrevGPGraft')

            # The new graft internal is not added to the nn-struct.
            # TODO AK: do we need this?
            self.root = new_graft_internal.root()

            # Write some trees.
            if self.config.write_every_tree:
                Graphviz.write_tree(
                    os.path.join(
                        self.config.canopy_out,
                        'tree_%s_post_graft_%s.gv' % (p_idx, graft_index)),
                    self.root, [other.id, curr.id], [gnode.id])

            return new_graft_internal
        return None  # No graft found.