def evaluate_file(self, detok_hyp: Path, ref: Union[Path, List[str]], lowercase=True) -> float:
    """
    Score a detokenized hypothesis file against a single reference with corpus BLEU.

    The formatted score is logged and also written to a sibling file named
    ``<hyp name>.lc.sacrebleu`` (lowercased scoring) or ``<hyp name>.oc.sacrebleu``.

    :param detok_hyp: path of the detokenized hypothesis file
    :param ref: reference, either a path to read lines from or the lines themselves
    :param lowercase: forwarded to ``corpus_bleu``; also selects the ``.lc``/``.oc`` tag
    :return: ``bleu.score`` of the computed result
    """
    hyp_lines = list(IO.get_lines(detok_hyp))
    if isinstance(ref, Path):
        ref_lines = IO.get_lines(ref)
    else:
        ref_lines = ref
    # corpus_bleu takes multiple reference streams, but here we have only one
    bleu: BLEUScore = corpus_bleu(sys_stream=hyp_lines, ref_streams=[ref_lines],
                                  lowercase=lowercase)
    summary = bleu.format()

    case_tag = '.lc' if lowercase else '.oc'
    # append to the full file name so the hypothesis' own extension is retained
    score_file = detok_hyp.with_name(f'{detok_hyp.name}{case_tag}.sacrebleu')
    log.info(f'BLEU {detok_hyp} : {summary}')
    IO.write_lines(score_file, summary)
    return bleu.score
def evaluate_file(self, detok_hyp: Path, ref: Union[Path, List[str]], lowercase=True) -> float:
    """
    Score a detokenized hypothesis file against a single reference with corpus BLEU.

    A one-line summary (score, n-gram precisions, brevity penalty, length ratio and
    lengths) is logged and written to a sibling file named
    ``<hyp name>.lc.sacrebleu`` (lowercased scoring) or ``<hyp name>.oc.sacrebleu``.

    :param detok_hyp: path of the detokenized hypothesis file
    :param ref: reference, either a path to read lines from or the lines themselves
    :param lowercase: forwarded to ``corpus_bleu``; also selects the ``.lc``/``.oc`` tag
    :return: ``bleu.score`` of the computed result
    """
    detok_lines = IO.get_lines(detok_hyp)
    # takes multiple refs, but here we have only one
    ref_liness = [IO.get_lines(ref) if isinstance(ref, Path) else ref]
    bleu: BLEU = corpus_bleu(sys_stream=detok_lines, ref_streams=ref_liness,
                             lowercase=lowercase)

    # An empty reference has ref_len == 0; report a ratio of 0 instead of
    # failing with ZeroDivisionError while formatting the summary.
    ratio = bleu.sys_len / bleu.ref_len if bleu.ref_len else 0.0
    precisions = "/".join(f"{p:.1f}" for p in bleu.precisions)
    # this should be part of new sacrebleu release (i sent a PR ;)
    bleu_str = (f'BLEU = {bleu.score:.2f} {precisions}'
                f' (BP = {bleu.bp:.3f} ratio = {ratio:.3f}'
                f' hyp_len = {bleu.sys_len:d} ref_len={bleu.ref_len:d})')

    # Append to the complete file name. with_suffix() would *replace* the last
    # extension, so e.g. "x.detok" and "x.tok" would share one score file.
    case_tag = '.lc' if lowercase else '.oc'
    bleu_file = detok_hyp.with_name(detok_hyp.name + case_tag + '.sacrebleu')
    log.info(f'BLEU {detok_hyp} : {bleu_str}')
    IO.write_lines(bleu_file, bleu_str)
    return bleu.score
def moses_detokenize(self, inp: Path, out: Path, col=0, lang='en', post_op=None):
    """
    Detokenize one column of a file with ``MosesDetokenizer`` and write the result.

    :param inp: input file; each line of column ``col`` is split on whitespace into tokens
    :param out: output file receiving one detokenized line per input line
    :param col: column selector forwarded to ``IO.get_lines``
    :param lang: language passed to ``MosesDetokenizer``
    :param post_op: optional callable applied to every detokenized line before writing
    """

    def _to_tokens(line):
        return line.split()

    log.info(f"detok : {inp} --> {out}")
    token_rows = IO.get_lines(inp, col=col, line_mapper=_to_tokens)
    with MosesDetokenizer(lang=lang) as detok:
        # both stages are lazy, so the write must happen while the detokenizer is open
        plain_lines = map(detok, token_rows)
        if post_op:
            plain_lines = map(post_op, plain_lines)
        IO.write_lines(out, plain_lines)
def train(cls, model_type: str, vocab_size: int, model_path: str, files: List[str],
          no_split_toks: Optional[List[str]] = None, char_coverage: float = 0, dedup=True,
          spark=None):
    """
    Learn an nlcodec vocabulary model and return ``cls(model_path)`` for it.

    :param model_type: word, char, bpe
    :param vocab_size: vocabulary size
    :param model_path: where to store vocabulary model
    :param files: text for creating vocabulary
    :param no_split_toks: must be empty; not supported in nlcodec yet
    :param char_coverage: character coverage (0, 1]. value <= 0 => default coverage
    :param dedup: forwarded to ``term_freq.word_counts`` when ``spark`` is given
    :param spark: when truthy, term frequencies are extracted with
        ``term_freq.word_counts`` (passing this object) and cached in
        ``<model_path>.termfreqs``; the vocabulary is then learned from that file
    :return: ``cls(model_path)``
    """
    assert not no_split_toks, 'not supported in nlcodec yet'
    from nlcodec import learn_vocab, term_freq

    learn_args = {}
    if char_coverage > 0:
        learn_args['char_coverage'] = char_coverage

    if spark:
        # extract and store frequencies to this file
        freq_file = model_path + '.termfreqs'
        if not Path(freq_file).exists():
            log.info("Extracting term frequencies... ")
            corpus_paths = []
            for item in files:
                corpus_paths.append(item if isinstance(item, Path) else Path(item))
            word_freqs, char_freqs, total_lines = term_freq.word_counts(
                paths=corpus_paths, dedup=dedup, spark=spark)
            log.info(f"Lines = {total_lines:,}, Word Types: {len(word_freqs):,}"
                     f" Char Types:{len(char_freqs):,}")
            if model_type == 'char':
                chosen = char_freqs
            else:
                chosen = word_freqs
            log.info(f"Writing frequencies to {freq_file}")
            with IO.writer(freq_file) as sink:
                term_freq.write_stats(stats=chosen, out=sink, line_count=total_lines)
        # an existing stats file is reused as-is
        learn_args['term_freqs'] = True
        inp = IO.get_lines(freq_file, delim='\n')
    else:
        inp = IO.get_liness(*files)

    learn_vocab(inp=inp, level=model_type, model=model_path, vocab_size=vocab_size,
                **learn_args)
    return cls(model_path)
def decode_eval_file(self, decoder, src: Union[Path, List[str]], out_file: Path,
                     ref: Optional[Union[Path, List[str]]], lowercase: bool = True,
                     **dec_args) -> float:
    """
    Decode ``src`` into ``out_file`` (unless already done), detokenize, and score.

    Decoding is skipped when ``out_file`` exists, is non-empty, and has as many
    lines as ``src``.

    :param decoder: object whose ``decode_file(src, out, **dec_args)`` writes the output
    :param src: source sentences, as a path or as a list of lines
    :param out_file: where the decoder output is written
    :param ref: reference as a path or list of lines; when falsy no score is computed
    :param lowercase: forwarded to ``evaluate_file``
    :param dec_args: extra keyword arguments forwarded to ``decoder.decode_file``
    :return: the result of ``evaluate_file`` when ``ref`` is given, otherwise ``None``
    """
    already_decoded = False
    if out_file.exists() and out_file.stat().st_size > 0:
        n_out = line_count(out_file)
        n_src = len(src) if isinstance(src, list) else line_count(src)
        already_decoded = n_out == n_src

    if already_decoded:
        log.warning(f"{out_file} exists and has desired number of lines. Skipped...")
    else:
        if isinstance(src, Path):
            log.info(f"decoding {src.name}")
            src = list(IO.get_lines(src))
        if isinstance(ref, Path):
            ref = list(IO.get_lines(ref))
        with IO.writer(out_file) as sink:
            decoder.decode_file(src, sink, **dec_args)

    # detokenization and scoring run for both fresh and reused outputs
    detok_hyp = self.detokenize(out_file)
    if not ref:
        return None
    return self.evaluate_file(detok_hyp, ref, lowercase=lowercase)
def tune_decoder_params(self, exp: Experiment, tune_src: str, tune_ref: str, batch_size: int,
                        trials: int = 10, lowercase=True, beam_size=(1, 4, 8),
                        ensemble=(1, 5, 10), lp_alpha=(0.0, 0.4, 0.6),
                        suggested: List[Tuple[int, int, float]] = None, **fixed_args):
    """
    Search decoder settings (beam size, ensemble size, length-penalty alpha) on a tuning set.

    Scores are kept in ``<work_dir>/tune_step<step>/scores.json`` so an interrupted
    run resumes: settings already present there are not decoded again, and the file
    is rewritten even when the search fails part-way.

    :param exp: experiment; supplies the last saved model step and ``work_dir``
    :param tune_src: path of the tuning source file (must exist)
    :param tune_ref: path of the tuning reference file (must exist, same line count)
    :param batch_size: decoding batch size; divided by the beam size per trial
    :param trials: desired number of scored settings, counting the ones already stored;
        with ``0`` the search-space arguments are also dropped from the returned args
    :param lowercase: forwarded to ``decode_eval_file``
    :param beam_size: candidate beam sizes for random trials
    :param ensemble: candidate ensemble sizes for random trials
    :param lp_alpha: candidate length-penalty alphas for random trials
    :param suggested: settings to try first, as ``(beam, ensemble, lp_alpha)`` tuples
        or their literal string form such as ``"4, 5, 0.6"``
    :param fixed_args: extra decoder arguments, forwarded to every trial and merged
        into the returned arguments
    :return: ``(best, tune_args)`` where ``best`` maps ``beam_size``/``ensemble``/
        ``lp_alpha`` to the highest scoring setting and ``tune_args`` echoes the
        call's arguments minus the excluded ones
    """
    import ast  # function-scope import: only used to parse stringified tuples

    # Copy the call arguments into a private dict instead of mutating the frame's
    # own locals mapping (a write-through proxy on Python 3.13+, see PEP 667).
    arg_names, _, _, arg_values = inspect.getargvalues(inspect.currentframe())
    tune_args = {arg: arg_values[arg] for arg in arg_names}
    tune_args.update(fixed_args)
    ex_args = ['exp', 'self', 'fixed_args', 'batch_size', 'max_len']
    if trials == 0:
        ex_args += ['beam_size', 'ensemble', 'lp_alpha']
    for x in ex_args:
        # exclude some args; 'max_len' is present only when passed via fixed_args
        tune_args.pop(x, None)

    _, step = exp.get_last_saved_model()
    tune_dir = exp.work_dir / f'tune_step{step}'
    log.info(f"Tune dir = {tune_dir}")
    tune_dir.mkdir(parents=True, exist_ok=True)
    tune_src, tune_ref = Path(tune_src), Path(tune_ref)
    assert tune_src.exists()
    assert tune_ref.exists()
    tune_src, tune_ref = list(IO.get_lines(tune_src)), list(IO.get_lines(tune_ref))
    assert len(tune_src) == len(tune_ref)

    tune_log = tune_dir / 'scores.json'  # resume the tuning
    memory: Dict[Tuple, float] = {}
    if tune_log.exists():
        with tune_log.open() as log_in:  # close the handle deterministically
            data = json.load(log_in)
        # JSON keys cant be tuples, so they were stringified; literal_eval parses
        # them back without executing arbitrary code from the file
        memory = {ast.literal_eval(k): v for k, v in data.items()}

    beam_sizes, ensembles, lp_alphas = [], [], []
    if suggested:
        if isinstance(suggested[0], str):
            suggested = [ast.literal_eval(x.strip()) for x in suggested]
        suggested = [(x[0], x[1], round(x[2], 2)) for x in suggested]
        suggested_new = [x for x in suggested if x not in memory]
        beam_sizes += [x[0] for x in suggested_new]
        ensembles += [x[1] for x in suggested_new]
        lp_alphas += [x[2] for x in suggested_new]

    new_trials = trials - len(memory)
    if new_trials > 0:
        beam_sizes += [random.choice(beam_size) for _ in range(new_trials)]
        ensembles += [random.choice(ensemble) for _ in range(new_trials)]
        lp_alphas += [round(random.choice(lp_alpha), 2) for _ in range(new_trials)]

    # ensembling is somewhat costlier, so try minimize the model ensembling,
    # by grouping them together
    grouped_ens = defaultdict(list)
    for b, ens, lp in zip(beam_sizes, ensembles, lp_alphas):
        grouped_ens[ens].append((b, lp))

    try:
        for ens, args in grouped_ens.items():
            decoder = Decoder.new(exp, ensemble=ens)
            for b_s, lp_a in args:
                eff_batch_size = batch_size // b_s  # effective batch size
                name = f'tune_step{step}_beam{b_s}_ens{ens}_lp{lp_a:.2f}'
                log.info(name)
                out_file = tune_dir / f'{name}.out.tsv'
                score = self.decode_eval_file(decoder, tune_src, out_file, tune_ref,
                                              batch_size=eff_batch_size, beam_size=b_s,
                                              lp_alpha=lp_a, lowercase=lowercase,
                                              **fixed_args)
                memory[(b_s, ens, lp_a)] = score
        best_params = sorted(memory.items(), key=lambda x: x[1], reverse=True)[0][0]
        return dict(zip(['beam_size', 'ensemble', 'lp_alpha'], best_params)), tune_args
    finally:
        # persist whatever was scored, even on failure
        # JSON keys cant be tuples, so we stringify them
        data = {str(k): v for k, v in memory.items()}
        IO.write_lines(tune_log, json.dumps(data))