Example #1
    def train(self,
              examples,
              decode_results,
              evaluator=CachedExactMatchEvaluator(),
              initial_performance=0.):
        """optimize the ranker on a dataset using grid search"""
        best_score = initial_performance
        best_param = np.zeros(self.feature_num)

        param_space = (np.array(p) for p in itertools.combinations(
            np.arange(0, 3.01, 0.01), self.feature_num))

        for param in param_space:
            score = self.compute_rerank_performance(examples,
                                                    decode_results,
                                                    fast_mode=True,
                                                    evaluator=evaluator,
                                                    param=param)
            if score > best_score:
                print('New param=%s, score=%.4f' % (param, score),
                      file=sys.stderr)
                best_param = param
                best_score = score

        self.parameter = best_param
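
The generator above enumerates weight vectors with `itertools.combinations`, so each candidate is a strictly increasing tuple drawn from the 0.01-step grid; repeated values and permutations of the same weights are never tried. The standalone sketch below (with an assumed `feature_num = 2`, not taken from the source) makes the shape of that grid concrete.

# Standalone illustration only, not part of the original class.
import itertools

import numpy as np

grid = np.arange(0, 3.01, 0.01)      # candidate weight values, step 0.01
feature_num = 2                      # assumed value for illustration
param_space = [np.array(p) for p in itertools.combinations(grid, feature_num)]

print('%d grid points -> %d candidate parameter vectors'
      % (len(grid), len(param_space)))
print(param_space[:3])               # strictly increasing tuples: (0.00, 0.01), (0.00, 0.02), ...
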
Example #2
    def train_multiprocess(self,
                           examples,
                           decode_results,
                           evaluator=CachedExactMatchEvaluator(),
                           initial_performance=0.,
                           num_workers=8):
        """optimize the ranker on a dataset using grid search"""
        best_score = initial_performance
        best_param = np.zeros(self.feature_num)

        self.initialize_rerank_features(examples, decode_results)

        print('generating parameter list', file=sys.stderr)
        param_space = list(
            itertools.combinations(np.arange(0, 2.03, 0.02), self.feature_num))
        print('generating parameter list done', file=sys.stderr)

        # stash shared state in module-level globals so the forked worker
        # processes can read it without re-sending the data for every task
        global _examples
        _examples = examples
        global _decode_results
        _decode_results = decode_results
        global _evaluator
        _evaluator = evaluator
        global _ranker
        _ranker = self

        def _norm(_param):
            # squared L2 norm; used to break score ties in favor of smaller weights
            return sum(p**2 for p in _param)

        with multiprocessing.Pool(processes=num_workers) as pool:
            # segment the parameter space
            segment_size = int(len(param_space) / num_workers / 5)
            param_space_segments = []
            ptr = 0
            while ptr < len(param_space):
                param_space_segments.append(param_space[ptr:ptr +
                                                        segment_size])
                ptr += segment_size
            print('generated %d parameter segments' %
                  len(param_space_segments),
                  file=sys.stderr)

            results = pool.imap_unordered(_rank_segment_worker,
                                          param_space_segments)

            for param, score in results:
                if score > best_score or score == best_score and _norm(
                        param) < _norm(best_param):
                    print('[Main] New param=%s, score=%.4f' % (param, score),
                          file=sys.stderr)
                    best_param = param
                    best_score = score

        self.parameter = best_param
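
`_rank_segment_worker` is a module-level function that this example does not show. The sketch below is one plausible shape for it, assuming it reads the globals assigned in `train_multiprocess` and returns the best `(param, score)` pair found in its segment, which is how the main loop above consumes the results; the actual implementation in the source may differ.

# Hypothetical sketch of the worker used above; not taken from the source.
def _rank_segment_worker(param_segment):
    best_score, best_param = float('-inf'), None
    for param in param_segment:
        # fast_mode keeps only the top hypothesis per example, which is all
        # that is needed to score a candidate parameter vector
        score = _ranker.compute_rerank_performance(_examples,
                                                   _decode_results,
                                                   fast_mode=True,
                                                   evaluator=_evaluator,
                                                   param=param)
        if score > best_score:
            best_score, best_param = score, param
    return best_param, best_score
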
Example #3
    def compute_rerank_performance(self,
                                   examples,
                                   decode_results,
                                   evaluator=CachedExactMatchEvaluator(),
                                   param=None,
                                   fast_mode=False,
                                   verbose=False,
                                   args=None):
        """re-rank each example's hypotheses under `param` and evaluate the result"""
        self.filter_hyps_and_initialize_features(examples, decode_results)

        if param is None:
            param = self.parameter

        sorted_decode_results = []
        for example, hyps in zip(examples, decode_results):
            if hyps:
                new_hyp_scores = [
                    self.get_rerank_score(hyp, param=param) for hyp in hyps
                ]
                best_hyp_idx = np.argmax(new_hyp_scores)
                best_hyp = hyps[best_hyp_idx]

                if fast_mode:
                    sorted_decode_results.append([best_hyp])
                else:
                    sorted_decode_results.append(
                        [hyps[i] for i in np.argsort(new_hyp_scores)[::-1]])
            else:
                sorted_decode_results.append([])

            # when verbose, report examples whose correct hypothesis was not ranked first
            if verbose:
                gold_standard_idx = [
                    i for i, hyp in enumerate(hyps) if hyp.is_correct
                ]
                if gold_standard_idx and gold_standard_idx[0] != best_hyp_idx:
                    gold_standard_idx = gold_standard_idx[0]
                    print('Utterance: %s' % ' '.join(example.src_sent),
                          file=sys.stderr)
                    print('Gold hyp id: %d' % gold_standard_idx,
                          file=sys.stderr)
                    for _i, hyp in enumerate(hyps):
                        print('Hyp %d: %s ||| score: %f ||| final score: %f' %
                              (_i, hyp.code, hyp.score,
                               self.get_rerank_score(hyp, param=param)),
                              file=sys.stderr)
                        print('\t%s' % hyp.rerank_feature_values,
                              file=sys.stderr)

        metric = evaluator.evaluate_dataset(examples,
                                            sorted_decode_results,
                                            fast_mode=fast_mode,
                                            args=args)

        return metric
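
Note the sorting convention used above: `np.argsort` returns indices in ascending score order, and the `[::-1]` slice reverses them so the best hypothesis comes first, while `fast_mode` keeps only that single best hypothesis. The tiny self-contained snippet below, with made-up scores, shows both cases.

# Self-contained illustration with made-up scores.
import numpy as np

new_hyp_scores = [0.2, 1.4, 0.7]
best_first = np.argsort(new_hyp_scores)[::-1]
print(best_first)                        # [1 2 0]: hypothesis 1 ranks first
print(int(np.argmax(new_hyp_scores)))    # 1: the single hypothesis kept in fast_mode
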
Example #4
    def train(self,
              examples,
              decode_results,
              evaluator=CachedExactMatchEvaluator(),
              initial_performance=0.):
        """fit the learned ranking model and report re-ranking accuracy"""
        self.initialize_rerank_features(examples, decode_results)

        train_x, train_y, group_train = self.get_feature_matrix(decode_results,
                                                                train=True)
        self.ranker.fit(train_x, train_y, group_train)

        train_acc = self.compute_rerank_performance(examples,
                                                    decode_results,
                                                    fast_mode=True,
                                                    evaluator=evaluator)
        print('Dev acc: %f' % train_acc, file=sys.stderr)
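
Unlike the grid-search variants in the earlier examples, this `train` delegates to a trainable ranking model fit on a flattened feature matrix. The sketch below is illustrative only and not taken from the source; it assumes the usual learning-to-rank convention in which `group_train` lists how many consecutive rows of `train_x` belong to each example.

# Illustrative layout of grouped ranking data; values are made up.
import numpy as np

train_x = np.array([[0.9, 1.2],    # example 1, hypothesis 1
                    [0.4, 0.7],    # example 1, hypothesis 2
                    [0.8, 0.1],    # example 2, hypothesis 1
                    [0.3, 0.5],    # example 2, hypothesis 2
                    [0.2, 0.6]])   # example 2, hypothesis 3
train_y = np.array([1, 0, 0, 1, 0])    # 1 marks the correct hypothesis
group_train = [2, 3]                   # hypotheses per example, in row order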