Ejemplo n.º 1
0
    def hallucinate_merge(self, other):
        """Return a new node describing the hypothetical merger of me and other.

        A fresh AttributeProjection is filled from both nodes' mention
        attributes, decorated with merge-level features, scored with the
        entity model, and wrapped in a new ACorefModel.

        Features written to the projection:
            aproj_sum['tea']           - ap['tes'] divided by the combined
                                         point count (only if 'tes' is present).
            aproj_local['my_pw']       - first value returned by
                                         self.best_pairwise(other).
            aproj_local['new_edges']   - self.num_pts * other.num_pts.
            aproj_local['child_e_max'] - larger of the two children's entity
                                         scores.
            aproj_local['child_e_min'] - smaller of the two children's entity
                                         scores.

        Args:
            other - the node to merge with; it is read through the same
                attributes as self (ment, num_pts).

        Returns:
            A new ACorefModel whose num_pts is the sum of both nodes' num_pts
            and whose ment.attributes.aproj_local['es'] holds the entity
            score of the merged projection.

        Raises:
            AssertionError - if a node has no stored 'es' score while holding
                more than one point, or if config.expit_e_score is set and a
                child score falls outside [0, 1].
        """
        ap = AttributeProjection()
        pw_score, _, _ = self.best_pairwise(other)
        ap.update(self.ment.attributes, self.torch_model.sub_ent_model)
        ap.update(other.ment.attributes, self.torch_model.sub_ent_model)

        num_ms = self.num_pts + other.num_pts
        if 'tes' in ap.aproj_sum:
            ap.aproj_sum['tea'] = ap['tes'] / num_ms

        ap.aproj_local['my_pw'] = pw_score
        ap.aproj_local['new_edges'] = self.num_pts * other.num_pts

        def _entity_score(node):
            """Return node's stored entity score, or 1.0 for a single point."""
            local_attrs = node.ment.attributes.aproj_local
            if node.num_pts > 1 and 'es' in local_attrs:
                score = local_attrs['es']
                # The squashing switch is always read from *this* node's
                # config, for both children.
                if self.config.expit_e_score:
                    score = expit(score)
                return score
            # Only a single-point node may lack a stored entity score.
            assert node.num_pts == 1
            return 1.0

        self_entity_score = _entity_score(self)
        other_entity_score = _entity_score(other)

        # Explicit comparison (rather than max()/min()) so that ties keep
        # self's score as the max, exactly as before.
        if self_entity_score >= other_entity_score:
            child_e_max, child_e_min = self_entity_score, other_entity_score
        else:
            child_e_max, child_e_min = other_entity_score, self_entity_score
        ap.aproj_local['child_e_max'] = child_e_max
        ap.aproj_local['child_e_min'] = child_e_min

        if self.config.expit_e_score:
            # Squashed scores must lie in the unit interval.
            assert ap.aproj_local['child_e_max'] <= 1.0
            assert ap.aproj_local['child_e_min'] <= 1.0
            assert ap.aproj_local['child_e_max'] >= 0.0
            assert ap.aproj_local['child_e_min'] >= 0.0

        new_score = self.torch_model.e_score(ap).data.numpy()[0]
        new_node = ACorefModel(self.config, self.torch_model, Ment(ap, None),
                               None, num_ms, self.pair_to_pw)
        new_node.ment.attributes.aproj_local['es'] = new_score
        return new_node
Ejemplo n.º 2
0
    def __init__(self, pts, aproj=None, point_counter=None, mid=None):
        """Set up a node holding a group of points.

        Args:
            pts - a list of data items (mention, label, id).
            aproj - an attribute projection; it is handed to Ment as both of
                its first two positional arguments.
            point_counter - number of points this node accounts for; when
                omitted (None), len(pts) is used instead.
            mid - forwarded to Ment as its `mid` keyword argument.
        """
        super().__init__()
        self.pts = pts
        self.as_ment = Ment(aproj, aproj, mid=mid)
        # len() is only taken when no explicit count was supplied.
        self.point_counter = (
            len(self.pts) if point_counter is None else point_counter
        )
        self.nsw_node = None
Ejemplo n.º 3
0
    # NOTE(review): this is the interior of a function whose header is not
    # visible here; `output_dir`, `config`, `rand`, `coref` and `Graphviz` come
    # from the enclosing scope / module.
    # Presumably snapshots the source next to the run output -- confirm against
    # the helper's definition.
    copy_source_to_dir(output_dir,config)

    # assert config.best_model is not None
    # assert config.partition_threshold is not None

    # Load the model described by the config.
    model = coref.models.load_model(config)

    g = Graphviz()

    # One pass per test file; `i` also selects the matching output directory.
    for i, f in enumerate(config.test_files):

        print('[BEGIN TEST] %s\n' % f)
        pts = []
        # Not used in the visible part of this loop.
        counter = 0
        # Materialize every item yielded by the loader into a list.
        for pt in Ment.load_ments(f, model=model):
            pts.append(pt)

        # Shuffle the data points
        rand.shuffle(pts)

        print('[CLUSTERING...]')
        clustering_time_start = time.time()

        # NOTE(review): no clustering call appears between the timer start and
        # the post-processing banner in this view -- code may have been elided.
        print('[BEGIN POSTPROCESSING...]')
        # Use the configured per-file directory name when provided, otherwise
        # fall back to 'canopy_<index>'.
        if config.out_by_canopy:
            canopy_out = os.path.join(output_dir, config.out_by_canopy[i])
        else:
            canopy_out = os.path.join(output_dir, 'canopy_%s' % str(i))
        # Without exist_ok, this raises FileExistsError if the directory is
        # already present.
        os.makedirs(canopy_out)
        # Record the output directory on the shared config object.
        config.canopy_out = canopy_out
Ejemplo n.º 4
0
    # NOTE(review): this is the interior of a function whose header is not
    # visible here; `output_dir`, `config`, `rand`, `coref` and `Graphviz` come
    # from the enclosing scope / module.
    # Load the torch model described by the config.
    torch_model = coref.models.load_model(config)

    # Node Model is what is stored at each node in the tree
    # and what is used to compute the scores

    # Seed node model: no mention (the two None arguments) and zero points.
    node_model = ACorefModel(config,torch_model,None,None,num_pts=0)

    g = Graphviz()

    # One pass per test file; `i` also selects the matching output directory.
    for i, f in enumerate(config.test_files):

        print('[BEGIN TEST] %s\n' % f)
        pts = []
        # Not used in the visible part of this loop.
        counter = 0
        # Materialize every item yielded by the loader into a list.
        for pt in Ment.load_ments(f, model=torch_model):
            pts.append(pt)

        # Shuffle the data points
        rand.shuffle(pts)

        print('[CLUSTERING...]')
        clustering_time_start = time.time()

        # NOTE(review): no clustering call appears between the timer start and
        # the post-processing banner in this view -- code may have been elided.
        print('[BEGIN POSTPROCESSING...]')
        # Use the configured per-file directory name when provided, otherwise
        # fall back to 'canopy_<index>'.
        if config.out_by_canopy:
            canopy_out = os.path.join(output_dir, config.out_by_canopy[i])
        else:
            canopy_out = os.path.join(output_dir, 'canopy_%s' % str(i))
        # Without exist_ok, this raises FileExistsError if the directory is
        # already present.
        os.makedirs(canopy_out)
        # Record the output directory on the shared config object.
        config.canopy_out = canopy_out
Ejemplo n.º 5
0
    # NOTE(review): this is the interior of a function whose header is not
    # visible here; `model`, `config`, `output_dir` and `eval_f1` come from the
    # enclosing scope / module.
    g = Graphviz()

    # One pass per test file; `i` also selects the matching output directory.
    for i, f in enumerate(config.test_files):
        print('[BEGIN TEST] %s\n' % f)
        # `pts` and `counter` are not used in the visible part of this loop.
        pts = []
        counter = 0
        # Use the configured per-file directory name when provided, otherwise
        # fall back to 'canopy_<index>'.
        if config.out_by_canopy:
            canopy_out = os.path.join(output_dir, config.out_by_canopy[i])
        else:
            canopy_out = os.path.join(output_dir, 'canopy_%s' % str(i))
        # Without exist_ok, this raises FileExistsError if the directory is
        # already present.
        os.makedirs(canopy_out)
        # Predictions and gold labels are written side by side as two-column
        # TSV files (mention id, cluster/label).
        with open('%s/predicted.tsv' % canopy_out, 'w') as predf:
            with open('%s/gold.tsv' % canopy_out, 'w') as goldf:
                print('[CLUSTERING...]')
                clustering_time_start = time.time()
                # Each loaded item is a triple; the loop target stores its 2nd
                # and 3rd elements as attributes `gt` and `id` on the 1st.
                # NOTE(review): the writes below use `m.mid`, not the `m.id`
                # assigned here -- confirm this is intended.
                for m, m.gt, m.id in Ment.load_ments(f):
                    # Fill in a missing/falsy 'canopy' attribute with this
                    # file's output directory name.
                    if not m.attributes['canopy']:
                        m.attributes.aproj['canopy'] = config.out_by_canopy[i] if config.out_by_canopy else 'canopy_%s' % str(i)
                    predf.write('%s\t%s\n' % (m.mid, model.mention_to_cluster(m)))
                    # Unlabeled mentions carry the literal string "None" and
                    # are left out of the gold file.
                    if m.gt != "None":
                        goldf.write('%s\t%s\n' % (m.mid, m.gt))

                end_time = time.time()
                print('[TIME] %ss' % (end_time - clustering_time_start))

        # Score the two files just written (both are closed at this point).
        print('[BEGIN POSTPROCESSING...]')
        pre, rec, f1 = eval_f1(config,'%s/predicted.tsv' % canopy_out,'%s/gold.tsv' % canopy_out,restrict_to_gold=True)
        print('[PREDICTED PW P/R/F1]\t%s\t%s\t%s' % (pre, rec, f1))
        print()
        # Persist precision/recall/F1 as a single tab-separated line.
        with open('%s/predicted.f1.tsv' % canopy_out, 'w') as f1f:
            f1f.write('%s\t%s\t%s\n' % (pre, rec, f1))
Ejemplo n.º 6
0
    def dev_eval_multiple(self, iter, diagnostics, output_path):
        """Checkpoint the model, evaluate it on the dev canopies, pick a threshold.

        Steps, in order:
          1. Save self.model to <output_path>/pw_iter_<iter>.torch (refusing
             to overwrite an existing file).
          2. For every canopy listed in config.dev_files, load its mentions,
             build a dendrogram with the configured clustering scheme, and
             accumulate TP/FP/GT counts for both the predicted clustering and
             the upper bound.
          3. Write micro-averaged scores to
             <output_path>/dev_scores_iter_<iter>.json and save the config.
          4. Sweep a subset of the node scores found in the dev trees as
             partition thresholds and keep the one with the best F1.

        Args:
            iter - training iteration number; used in the output file names.
            diagnostics - dict that receives one 'dev_hacsl_time_<canopy>'
                entry (inference seconds) per canopy; it is re-saved to
                <output_path>/diagnostics.json after each canopy.
            output_path - directory for the model, scores, and config files.

        Returns:
            (f1_ub, best_f, best_t): the micro upper-bound F1, the best F1
            found in the threshold sweep, and the threshold that achieved it.
            best_f and best_t are None when no node scores were collected.

        Raises:
            AssertionError - if the model file for this iteration already
                exists, or if a threshold partition does not cover every
                point of its tree.
        """
        sofl("")
        sofl('[BEGIN DEV EVAL]')

        # assume that dev files are in
        # data/<dataset_name>/eval/dev/canopy/<canopy>/ments.json
        dev_data_dir = os.path.join('data', self.config.dataset_name,
                                    'eval', 'dev', 'canopy')
        dev_start = datetime.datetime.now()

        # Get dev files.
        with open(self.config.dev_files, 'r') as fin:
            dev_canopies = [line.strip() for line in fin]

        micro_TP = 0
        micro_FP = 0
        micro_GT = 0

        micro_up_TP = 0
        micro_up_FP = 0
        micro_up_GT = 0
        dev_tree_roots = []
        all_scores = set()

        model_path = os.path.join(output_path,
                                  "pw_iter_{}.torch".format(iter))

        # Make sure you are not overwriting, if you are there is something
        # wrong with the iteration count
        assert not os.path.exists(model_path), \
            "Model exists and would be overwritten %s" % model_path

        sofl('[SAVING MODEL...')
        torch.save(self.model, model_path)

        for dev_canopy in dev_canopies:
            dev_file = os.path.join(dev_data_dir, dev_canopy, 'ments.json')
            ms = []
            for m in Ment.load_ments(dev_file, self.model):
                # m[1] is the label slot; optionally keep labeled items only.
                if not self.config.only_use_labeled_dev or (
                        m[1] is not None and m[1] != "None"):
                    ms.append(m)
            self.config.random.shuffle(ms)
            # Restrict
            canopy_size = min(self.config.dev_max_canopy_size, len(ms))
            sofl('Loaded canopy %s with %s mentions and restricted the size of the canopy to %s' % (dev_canopy,len(ms),canopy_size))
            ms = ms[:canopy_size]
            inf_start = datetime.datetime.now()
            sofl('[DEV TREE USING] %s' %self.config.clustering_scheme)
            dev_tree = new_clustering_scheme(self.config, ms, self.model)
            root = dev_tree.build_dendrogram()
            dev_tree_roots.append(root)
            inf_end = datetime.datetime.now()
            inf_time_seconds = (inf_end - inf_start).total_seconds()
            sofl('[INFERENCE IN %ss]' % inf_time_seconds)
            diagnostics['dev_hacsl_time_%s' % dev_canopy] = inf_time_seconds
            save_dict_to_json(diagnostics, os.path.join(output_path,
                                                        'diagnostics.json'))

            sofl('[SCORING TREE...]')
            score_start = datetime.datetime.now()
            pre_ub, rec_ub, f1_ub = root.f1_best()
            pre, rec, f1 = root.f1_cluster_marker()
            tp_ub, fp_ub, gt_ub = root.tp_fp_gt_best()
            tp, fp, gt = root.tp_fp_gt_cluster_marker()

            micro_TP += tp
            micro_FP += fp
            micro_GT += gt

            micro_up_TP += tp_ub
            micro_up_FP += fp_ub
            micro_up_GT += gt_ub

            sofl("")
            sofl('[(Python) UPPER BOUND P/R/F1 %s]:\t%s\t%s\t%s' % (
                dev_canopy, pre_ub, rec_ub, f1_ub))
            sofl('[PREDICTED P/R/F1 %s]\t%s\t%s\t%s' % (
                dev_canopy, pre, rec, f1))

            # Visit all nodes and collect scores. The scores go into a set,
            # so visiting order is irrelevant; popping from the end keeps
            # each step O(1) (list.pop(0) is O(n)).
            frontier = [root]
            while frontier:
                x = frontier.pop()
                if x.children:
                    frontier.append(x.children[0])
                    frontier.append(x.children[1])
                all_scores.add(x.my_score)
            score_end = datetime.datetime.now()
            score_time_seconds = (score_end - score_start).total_seconds()
            sofl('[SCORING IN %ss]' % score_time_seconds)

        dev_end = datetime.datetime.now()
        # Report plain seconds (not a timedelta), matching the '%ss' format
        # and the per-canopy timings above.
        dev_time_seconds = (dev_end - dev_start).total_seconds()
        sofl('[DEV TOTAL TIME IN %ss]' % dev_time_seconds)

        # Micro-averaged metrics; every ratio falls back to zero when its
        # denominator is empty.
        pre = micro_TP / (
            micro_TP + micro_FP) if micro_TP + micro_FP > 0.0 else 0.0
        rec = micro_TP / (micro_GT) if micro_GT > 0.0 else 0.0
        f1 = 2.0 * (pre * rec) / (pre + rec) if (pre + rec) > 0 else 0
        pre_ub = micro_up_TP / (micro_up_TP + micro_up_FP) if (
            micro_up_TP + micro_up_FP) > 0.0 else 0.0
        rec_ub = micro_up_TP / (micro_up_GT) if micro_up_GT > 0 else 0.0
        f1_ub = 2.0 * (pre_ub * rec_ub) / (pre_ub + rec_ub) if (
            pre_ub + rec_ub) > 0 else 0.0
        sofl('[(Python) UPPER BOUND P/R/F1 %s]:\t%s\t%s\t%s' % (
            'micro', pre_ub, rec_ub, f1_ub))
        sofl('[PREDICTED P/R/F1 %s]\t%s\t%s\t%s' % ('micro', pre, rec, f1))
        score_obj = {"inf_time": dev_time_seconds, "pre": pre, "rec": rec,
                     "f1": f1, "pre_ub": pre_ub, "rec_ub": rec_ub,
                     "f1_ub": f1_ub, "config": self.config.__dict__}
        save_dict_to_json(score_obj, os.path.join(
            output_path, "dev_scores_iter_{}.json".format(iter)))
        self.config.model_filename = model_path

        self.config.save_config(output_path,
                                filename='pwe_iter_%d.config' % iter)

        threshold_start = datetime.datetime.now()
        print('[FIND BEST OVERALL THRESHOLD]')
        sorted_tree_scores = sorted(all_scores)
        num_scores = len(sorted_tree_scores)
        num_to_try = num_scores * self.config.fraction_of_thresholds_to_try_dev
        if num_to_try > 0:
            # Clamp to 1: a fraction above 1.0 would otherwise truncate to a
            # zero step, which range() rejects.
            interval = max(1, int(num_scores / num_to_try))
        else:
            # No scores collected, or a zero fraction: avoid dividing by
            # zero and try at most the first threshold.
            interval = max(1, num_scores)
        best_f, best_t = None, None
        for i in range(0, num_scores, interval):
            t = sorted_tree_scores[i]
            tp = 0
            fp = 0
            total_gt = 0
            for root in dev_tree_roots:
                total_gt += root.compute_gt()
                predicted = root.partition_threshold(t)
                # The partition must account for every point in the tree.
                assert sum(e.point_counter for e in predicted) == root.point_counter
                for e in predicted:
                    tp += e.local_tp
                    fp += e.local_fp
            pre = tp / (tp + fp) if tp + fp > 0 else 0.0
            rec = tp / total_gt if total_gt > 0.0 else 0.0
            f1 = F1Node.f1(tp, fp, total_gt)

            sofl('t, pre, rec, f1')
            sofl('%s, %s, %s, %s' % (t, pre, rec, f1))
            # Strict comparison: on ties the lowest threshold wins.
            if best_f is None or best_f < f1:
                best_f = f1
                best_t = t

        sofl('[BEST THRESHOLD F1] %s %s' % (best_t,best_f))
        sofl('self.model.pw_output_layer.weight')
        sofl('%s' % self.model.pw_output_layer.weight)
        sofl('self.model.e_output_layer.weight')
        sofl('%s' % self.model.e_output_layer.weight)
        sofl('self.model.pw_output_layer.bias')
        sofl('%s' % self.model.pw_output_layer.bias)
        sofl('self.model.e_output_layer.bias')
        sofl('%s' % self.model.e_output_layer.bias)
        sofl('[END DEV EVAL]')
        threshold_end = datetime.datetime.now()
        threshold_time_seconds = (
            threshold_end - threshold_start).total_seconds()
        sofl('[THRESHOLD TIME IN %ss]' % threshold_time_seconds)
        sofl("")
        return f1_ub, best_f, best_t
Ejemplo n.º 7
0
    def train(self, batcher, outdir, dev_batcher=None):
        """Run pairwise training, then refine with entity-model training.

        The pairwise (PW) model is trained first while the entity sub-model
        is frozen. The best PW model is then reloaded from
        config.best_model, the PW parameters are frozen, the entity
        parameters are unfrozen, and the entity model is trained on each
        training canopy in turn through train_e.

        Args:
             batcher - batcher for pairwise training
             outdir - where to write the trained models, etc.
             dev_batcher - accepted but not used in the visible body.

        Returns:
            None
        """
        diagnostics = {}

        # config.train_files is either a path to a file listing one canopy
        # per line, or the sentinel 'none' / 'empty' (case-insensitive).
        train_canopies = []
        if self.config.train_files.lower() not in ('none', 'empty'):
            with open(self.config.train_files, 'r') as fin:
                train_canopies = [line.strip() for line in fin]

        print('train_canopies')
        print(train_canopies)

        random.shuffle(train_canopies)
        train_data_dir = os.path.join('data', self.config.dataset_name,
                                      'eval', 'train', 'canopy')
        iter_count = 0

        # Spread the refinement budget evenly over the canopies, giving each
        # at least one iteration; with no canopies there is nothing to do.
        num_canopies = len(train_canopies)
        if num_canopies > 0:
            iters_per_file = max(
                1, int(self.config.refine_itrs / num_canopies))
        else:
            iters_per_file = 0

        print("entity iter_per_file %s" % iters_per_file)

        # Turn off learning for the entity model features.
        for ent_param in self.model.sub_ent_model.parameters():
            ent_param.requires_grad = False

        # Train the pairwise model.
        print('[TRAINING PAIRWISE]')
        self.train_pw(batcher, outdir, None)
        sys.stdout.flush()

        # Load the best PW model.
        self.model = torch.load(self.config.best_model)

        # Turn off learning for the pairwise model
        for pw_param in self.model.pw_model.parameters():
            pw_param.requires_grad = False
        # Turn on learning for the entity model
        for ent_param in self.model.sub_ent_model.parameters():
            ent_param.requires_grad = True

        # Set the e_optimizer up (do it here because we've loaded the model).
        self.e_opt = torch.optim.Adam(self.model.sub_ent_model.parameters(),
                                      lr=self.config.e_lr,
                                      weight_decay=self.config.l2penalty)

        # Train the entity model.
        print('[TRAINING ENTITY]')
        best_f1 = 0.0

        for train_canopy in train_canopies:
            train_file = os.path.join(
                train_data_dir, train_canopy, 'ments.json')
            # m[1] is the label slot; optionally keep labeled items only.
            ms = [
                m for m in Ment.load_ments(train_file, self.model)
                if not self.config.only_use_labeled_dev
                or (m[1] is not None and m[1] != "None")
            ]
            # Carry the best F1 and iteration counter across canopies.
            best_f1, iter_count = self.train_e(
                ms, diagnostics,
                iter_start_count=iter_count,
                num_iterations=iters_per_file,
                prev_best_f1=best_f1)