import collections as coll
from pathlib import Path
from typing import List, Optional, Union

import yaml
from tqdm import tqdm

# `log` and `IO` are assumed to come from the surrounding project:
# a module-level logger and a file-IO helper providing get_lines/get_liness/writer.
# The two train() variants below belong to different codec/field classes in the source.

@classmethod
def train(cls, model_type: str, vocab_size: int, model_path: Union[Path, str],
          files: List[str], tok_coverage: float = 0.9999, **kwargs):
    # Note: char_coverage is reused here as subword (token) coverage;
    # vocab_size is accepted but unused -- the cutoff is coverage-driven.
    hub_api = cls.load_hub_model(model_type)
    bpe = hub_api.bpe
    dicto = hub_api.task.dictionary

    # Count BPE token frequencies over all input lines
    freqs = coll.Counter()
    lines = IO.get_liness(*files)
    for line in tqdm(lines, mininterval=2, dynamic_ncols=True, unit='line'):
        freqs.update(bpe.encode(line).split())
    total_toks = sum(freqs.values())
    log.info(f"Found {len(freqs)} BPE types and {total_toks} tokens")

    # Keep the most frequent types until tok_coverage of the token mass is covered
    freqs = sorted(freqs.items(), reverse=True, key=lambda x: x[1])
    vocabulary, oovs = [], []
    cumulative = 0
    for t, f in freqs:
        if cumulative / total_toks <= tok_coverage:
            vocabulary.append((t, f))
            cumulative += f
        else:
            oovs.append((t, f))
    oovs_str = ' '.join(f'{t}:{f}' for t, f in oovs)
    log.info(f'Excluded {len(oovs)} types as OOVs.\n{oovs_str}')
    log.info(f'Included {len(vocabulary)} types as in-vocabulary; '
             f'coverage = {cumulative / total_toks:g}')

    # TODO: mapping should be a List[int] with a one-to-one map
    # Reserved types go first, at their fixed indices; then the selected vocabulary.
    # Each entry maps type -> [new index, old (hub dictionary) index].
    types, indices = [], {}
    for typ, new_idx in cls.reserved():
        assert len(types) == new_idx
        types.append(typ)
        old_idx = dicto.indices.get(typ, -1)
        indices[typ] = [new_idx, old_idx]
    for typ, freq in vocabulary:
        indices[typ] = [len(types), dicto.indices.get(typ, -1)]
        types.append(typ)

    data = {'model_id': model_type, 'mapping': indices}
    with IO.writer(model_path) as wrtr:
        yaml.dump(data, wrtr)
    return cls(model_path)
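
# --- Illustration (hypothetical, not part of any class here) -----------------
# A minimal, self-contained sketch of the coverage cutoff used above: keep the
# most frequent types until `tok_coverage` of the running token mass is covered,
# and treat the long tail as OOVs. `_coverage_split` and its arguments are made
# up for this example; only the logic mirrors train().
def _coverage_split(freqs: dict, tok_coverage: float = 0.9999):
    total = sum(freqs.values())
    vocab, oovs, cum = [], [], 0
    for typ, freq in sorted(freqs.items(), key=lambda x: x[1], reverse=True):
        if cum / total <= tok_coverage:   # still under the coverage budget
            vocab.append((typ, freq))
            cum += freq
        else:                             # tail: excluded as OOV
            oovs.append((typ, freq))
    return vocab, oovs

# e.g. _coverage_split({'the': 50, 'cat': 30, 'sat': 15, 'zyx': 5}, 0.9)
# keeps ('the', 50), ('cat', 30), ('sat', 15) and marks ('zyx', 5) as OOV:
# after the first three, cum/total = 95/100 > 0.9, so the tail is excluded.
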
@classmethod
def train(cls, model_type: str, vocab_size: int, model_path: str, files: List[str],
          no_split_toks: Optional[List[str]] = None, char_coverage: float = 0,
          dedup=True, spark=None):
    """
    :param model_type: word, char, or bpe
    :param vocab_size: vocabulary size
    :param model_path: where to store the vocabulary model
    :param files: text files for creating the vocabulary
    :param no_split_toks: tokens that must not be split (not supported yet)
    :param char_coverage: character coverage (0, 1]; a value <= 0 means default coverage
    :param dedup: drop duplicate lines before counting (spark path only)
    :param spark: optional SparkSession; if given, term frequencies are
        extracted distributedly and cached next to the model
    :return: a trained instance of this class
    """
    assert not no_split_toks, 'not supported in nlcodec yet'
    from nlcodec import learn_vocab, term_freq
    kwargs = dict(char_coverage=char_coverage) if char_coverage > 0 else {}
    if not spark:
        inp = IO.get_liness(*files)
    else:
        # extract term frequencies once and cache them in this file
        stats_file = model_path + '.termfreqs'
        if not Path(stats_file).exists():
            log.info("Extracting term frequencies... ")
            paths = [f if isinstance(f, Path) else Path(f) for f in files]
            wfs, chfs, n_lines = term_freq.word_counts(paths=paths, dedup=dedup,
                                                       spark=spark)
            log.info(f"Lines = {n_lines:,}, Word Types: {len(wfs):,}, "
                     f"Char Types: {len(chfs):,}")
            stats = chfs if model_type == 'char' else wfs
            log.info(f"Writing frequencies to {stats_file}")
            with IO.writer(stats_file) as out:
                term_freq.write_stats(stats=stats, out=out, line_count=n_lines)
        kwargs['term_freqs'] = True
        inp = IO.get_lines(stats_file, delim='\n')

    learn_vocab(inp=inp, level=model_type, model=model_path,
                vocab_size=vocab_size, **kwargs)
    return cls(model_path)
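
# --- Usage sketch (hypothetical) ---------------------------------------------
# How the nlcodec-backed train() is typically invoked; the class name
# `NLCodecField` and the file paths are placeholders, not names from this module.
#
#   field = NLCodecField.train(model_type='bpe', vocab_size=8000,
#                              model_path='vocab.model',
#                              files=['train.src', 'train.tgt'])
#
# With a SparkSession, term frequencies are computed distributedly and cached
# to '<model_path>.termfreqs', so re-runs skip the counting step:
#
#   from pyspark.sql import SparkSession
#   spark = SparkSession.builder.appName('vocab').getOrCreate()
#   field = NLCodecField.train('bpe', 8000, 'vocab.model',
#                              ['train.src', 'train.tgt'], spark=spark)
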