def train(self, examples, decode_results, evaluator=CachedExactMatchEvaluator(), initial_performance=0.):
    """optimize the ranker on a dataset using grid search"""
    best_score = initial_performance
    best_param = np.zeros(self.feature_num)

    # candidate weight vectors: all strictly increasing combinations of
    # `feature_num` values drawn from the grid {0.00, 0.01, ..., 3.00}
    param_space = (np.array(p) for p in itertools.combinations(np.arange(0, 3.01, 0.01), self.feature_num))

    for param in param_space:
        score = self.compute_rerank_performance(examples, decode_results,
                                                fast_mode=True, evaluator=evaluator, param=param)
        if score > best_score:
            print('New param=%s, score=%.4f' % (param, score), file=sys.stderr)
            best_param = param
            best_score = score

    self.parameter = best_param
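
# Note that the parameter generator above enumerates combinations, not a full
# Cartesian grid: itertools.combinations yields only strictly increasing weight
# vectors, so for feature_num == 2 over the 301-point grid {0.00, ..., 3.00} it
# visits C(301, 2) = 45150 candidates rather than 301 ** 2 = 90601. A standalone
# sanity check (a hypothetical helper, not part of the original class):
def _grid_search_space_size_demo():
    import itertools
    import numpy as np
    grid = np.arange(0, 3.01, 0.01)  # 301 points: 0.00 .. 3.00
    n_candidates = sum(1 for _ in itertools.combinations(grid, 2))
    assert n_candidates == 45150
    return n_candidates
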
def train_multiprocess(self, examples, decode_results, evaluator=CachedExactMatchEvaluator(), initial_performance=0., num_workers=8):
    """optimize the ranker on a dataset using grid search, parallelized over worker processes"""
    best_score = initial_performance
    best_param = np.zeros(self.feature_num)

    self.initialize_rerank_features(examples, decode_results)

    print('generating parameter list', file=sys.stderr)
    # coarser grid than the single-process version: step 0.02 over {0.00, ..., 2.02}
    param_space = [p for p in itertools.combinations(np.arange(0, 2.03, 0.02), self.feature_num)]
    print('generating parameter list done', file=sys.stderr)

    # workers read these module-level globals instead of receiving the
    # (potentially large) dataset through pickled task arguments
    global _examples
    _examples = examples
    global _decode_results
    _decode_results = decode_results
    global _evaluator
    _evaluator = evaluator
    global _ranker
    _ranker = self

    def _norm(_param):
        return sum(p ** 2 for p in _param)

    with multiprocessing.Pool(processes=num_workers) as pool:
        # segment the parameter space; max(1, ...) guards against a zero
        # segment size (and the resulting infinite loop) on small spaces
        segment_size = max(1, int(len(param_space) / num_workers / 5))
        param_space_segments = []
        ptr = 0
        while ptr < len(param_space):
            param_space_segments.append(param_space[ptr:ptr + segment_size])
            ptr += segment_size

        print('generated %d parameter segments' % len(param_space_segments), file=sys.stderr)

        results = pool.imap_unordered(_rank_segment_worker, param_space_segments)

        for param, score in results:
            # prefer strictly better scores; break ties toward the smaller-norm parameter
            if score > best_score or (score == best_score and _norm(param) < _norm(best_param)):
                print('[Main] New param=%s, score=%.4f' % (param, score), file=sys.stderr)
                best_param = param
                best_score = score

    self.parameter = best_param
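
# A minimal sketch of the module-level worker mapped over the segments above
# (a hypothetical reconstruction; the actual _rank_segment_worker is defined
# elsewhere in this module). It must live at module scope so multiprocessing
# can pickle a reference to it, and it reads the _examples / _decode_results /
# _evaluator / _ranker globals installed by train_multiprocess. Each worker
# scans its slice of the parameter space and returns the best pair it finds,
# which is what the main loop consumes via imap_unordered.
def _rank_segment_worker(param_segment):
    best_param, best_score = None, float('-inf')
    for param in param_segment:
        param = np.array(param)
        score = _ranker.compute_rerank_performance(_examples, _decode_results,
                                                   fast_mode=True, evaluator=_evaluator, param=param)
        if score > best_score:
            best_param, best_score = param, score
    return best_param, best_score
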
def compute_rerank_performance(self, examples, decode_results, evaluator=CachedExactMatchEvaluator(), param=None, fast_mode=False, verbose=False, args=None):
    self.filter_hyps_and_initialize_features(examples, decode_results)

    if param is None:
        param = self.parameter

    sorted_decode_results = []
    for example, hyps in zip(examples, decode_results):
        if hyps:
            new_hyp_scores = [self.get_rerank_score(hyp, param=param) for hyp in hyps]
            best_hyp_idx = np.argmax(new_hyp_scores)
            best_hyp = hyps[best_hyp_idx]

            if fast_mode:
                # fast_mode only needs the top-1 hypothesis per example
                sorted_decode_results.append([best_hyp])
            else:
                sorted_decode_results.append([hyps[i] for i in np.argsort(new_hyp_scores)[::-1]])
        else:
            sorted_decode_results.append([])

        if verbose:
            # report cases where a correct hypothesis exists but was not ranked first
            gold_standard_idx = [i for i, hyp in enumerate(hyps) if hyp.is_correct]
            if gold_standard_idx and gold_standard_idx[0] != best_hyp_idx:
                gold_standard_idx = gold_standard_idx[0]

                print('Utterance: %s' % ' '.join(example.src_sent), file=sys.stderr)
                print('Gold hyp id: %d' % gold_standard_idx, file=sys.stderr)
                for _i, hyp in enumerate(hyps):
                    print('Hyp %d: %s ||| score: %f ||| final score: %f' % (
                        _i, hyp.code, hyp.score, self.get_rerank_score(hyp, param=param)),
                        file=sys.stderr)
                    print('\t%s' % hyp.rerank_feature_values, file=sys.stderr)

    metric = evaluator.evaluate_dataset(examples, sorted_decode_results, fast_mode=fast_mode, args=args)

    return metric
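
# For reference, get_rerank_score in a linear ranker of this shape is
# typically a weighted combination of the hypothesis' rerank features,
# optionally added to the decoder's own model score. A hypothetical sketch
# only; the real implementation may differ:
def _get_rerank_score_sketch(hyp, param, feature_names):
    # hyp.rerank_feature_values maps feature name -> value (see the verbose
    # dump above); feature_names fixes an ordering aligned with `param`
    feat_vals = np.array([hyp.rerank_feature_values[name] for name in feature_names])
    return hyp.score + float(np.dot(param, feat_vals))
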
def train(self, examples, decode_results, evaluator=CachedExactMatchEvaluator(), initial_performance=0.):
    self.initialize_rerank_features(examples, decode_results)

    # one feature row per hypothesis; group_train records how many hypotheses
    # belong to each example, so the ranker knows which rows compete
    train_x, train_y, group_train = self.get_feature_matrix(decode_results, train=True)
    self.ranker.fit(train_x, train_y, group_train)

    # measured on the same dataset the ranker was just fit on; the 'Dev acc'
    # label reflects that this method is expected to be called with dev-set decodes
    train_acc = self.compute_rerank_performance(examples, decode_results, fast_mode=True, evaluator=evaluator)
    print('Dev acc: %f' % train_acc, file=sys.stderr)
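
# A minimal sketch of the learning-to-rank model this train() assumes
# (hypothetical; the real construction of self.ranker and get_feature_matrix
# lives elsewhere in the class). Anything exposing fit(X, y, group) works;
# xgboost's XGBRanker with a pairwise objective matches that interface.
def _build_pairwise_ranker_sketch():
    import xgboost as xgb
    # expected inputs at fit time: train_x with one feature row per
    # hypothesis, train_y marking correct hypotheses (e.g. 1 if
    # hyp.is_correct else 0), and group_train listing the hypothesis count
    # per example, with sum(group_train) == len(train_x)
    return xgb.XGBRanker(objective='rank:pairwise')
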