def main():
    args = parse_args()
    task = args.pop('task')
    if task == 'learn':
        args.pop('out')      # No output
        args.pop('indices')  # No output
        assert args.get('level'), 'argument --level is required for "learn" task'
        import time
        from datetime import timedelta
        from nlcodec.utils import max_RSS
        st = time.time()
        st_mem = max_RSS()[1]
        learn_vocab(**args)
        delta = timedelta(seconds=time.time() - st)
        et_mem = max_RSS()[1]
        log.info(f"Time taken: {delta}; Memory: {st_mem} --> {et_mem}")
    elif task in ('encode', 'decode'):
        scheme = load_scheme(args.pop('model'))
        inp, out, indices = args['inp'], args['out'], args.get('indices', False)
        if task == 'encode':
            recs = encode(inp, scheme, indices=indices)
            if indices:
                recs = ([str(idx) for idx in seq] for seq in recs)
            recs = (' '.join(seq) for seq in recs)
        else:
            recs = decode(inp, scheme, indices=indices)
        write_lines(recs, out)
    elif task == 'estimate':
        from nlcodec.qestim import estimate
        estimate(codec_path=args['model'], data=args['inp'])
    else:
        raise NotImplementedError(task + ' not implemented')
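# A minimal usage sketch of the learn -> encode pipeline that the CLI main() above
# drives, assuming the same learn_vocab, load_scheme, encode and write_lines helpers
# are in scope; the file names and vocab size here are hypothetical examples only.
def _example_learn_then_encode():
    with open('train.txt', encoding='utf-8') as inp:
        learn_vocab(inp=inp, level='bpe', model='bpe.model', vocab_size=8000)
    scheme = load_scheme('bpe.model')
    with open('train.txt', encoding='utf-8') as inp, \
            open('train.bpe.txt', 'w', encoding='utf-8') as out:
        recs = encode(inp, scheme, indices=False)          # generator of token sequences
        write_lines((' '.join(seq) for seq in recs), out)  # one segmented line per input line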
def main():
    args = parse_args()
    model_path: Path = args['model']
    words_file = model_path.with_suffix(".wordfreq.gz")
    chars_file = model_path.with_suffix(".charfreq.gz")
    stats_file = chars_file if args['level'] == 'char' else words_file
    with utils.log_resources(name="extract stats"):
        master = args.pop('spark_master')
        driver_mem = args.pop('driver_mem')
        dedup = args.pop('dedup')
        inp = args.pop('inp')  # raw text paths; pop here so **args below stays clean
        if stats_file.exists():
            log.warning(f"{stats_file} exists, reusing it. Please delete it if this is wrong.")
        else:
            with spark.session(master=master, driver_mem=driver_mem) as session:
                words, chars, line_count = word_counts(paths=inp, dedup=dedup, spark=session)
            with utils.IO.writer(words_file) as out:
                write_stats(words, out, line_count=line_count)
            with utils.IO.writer(chars_file) as out:
                write_stats(chars, out, line_count=line_count)
    assert stats_file.exists()
    with utils.log_resources(name=f"learning {args['level']} vocab"):
        with utils.IO.reader(stats_file) as inp:
            learn_vocab(inp=inp, term_freqs=True, **args)
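# Because main() reuses an existing stats file, re-learning at a different vocab size
# can skip the Spark stage entirely; a small sketch assuming the same utils.IO.reader
# and learn_vocab used above, with hypothetical paths and sizes.
def _example_relearn_from_stats():
    stats_file = Path('vocab.model.wordfreq.gz')  # produced by an earlier run
    with utils.IO.reader(stats_file) as inp:
        learn_vocab(inp=inp, term_freqs=True, level='bpe',
                    model=Path('vocab.8k.model'), vocab_size=8000)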
def main():
    args = parse_args()
    task = args.pop('task')
    if task == 'learn':
        args.pop('out')      # No output
        args.pop('indices')  # No output
        assert args.get('level'), 'argument --level is required for "learn" task'
        learn_vocab(**args)
    elif task in ('encode', 'decode'):
        scheme = load_scheme(args.pop('model'))
        inp, out, indices = args['inp'], args['out'], args.get('indices', False)
        if task == 'encode':
            recs = encode(inp, scheme, indices=indices)
            if indices:
                recs = ([str(idx) for idx in seq] for seq in recs)
            recs = (' '.join(seq) for seq in recs)
        else:
            recs = decode(inp, scheme, indices=indices)
        write_lines(recs, out)
    elif task == 'estimate':
        from nlcodec.qestim import estimate
        estimate(codec_path=args['model'], data=args['inp'])
    else:
        raise NotImplementedError(task + ' not implemented')
@classmethod
def train(cls, model_type: str, vocab_size: int, model_path: str, files: List[str],
          no_split_toks: Optional[List[str]] = None, char_coverage: float = 0,
          dedup=True, spark=None):
    """
    :param model_type: word, char, bpe
    :param vocab_size: vocabulary size
    :param model_path: where to store vocabulary model
    :param files: text for creating vocabulary
    :param no_split_toks: tokens that should not be split (not supported in nlcodec yet)
    :param char_coverage: character coverage (0, 1]. value <= 0 => default coverage
    :return: an instance of cls backed by the trained model at model_path
    """
    assert not no_split_toks, 'not supported in nlcodec yet'
    from nlcodec import learn_vocab, term_freq
    kwargs = dict(char_coverage=char_coverage) if char_coverage > 0 else {}
    if not spark:
        inp = IO.get_lines(*files)
    else:
        # extract and store frequencies to this file
        stats_file = model_path + '.termfreqs'
        if not Path(stats_file).exists():
            log.info("Extracting term frequencies... ")
            paths = [f if isinstance(f, Path) else Path(f) for f in files]
            wfs, chfs, n_lines = term_freq.word_counts(paths=paths, dedup=dedup, spark=spark)
            log.info(f"Lines = {n_lines:,}, Word Types: {len(wfs):,} Char Types: {len(chfs):,}")
            stats = chfs if model_type == 'char' else wfs
            log.info(f"Writing frequencies to {stats_file}")
            with IO.writer(stats_file) as out:
                term_freq.write_stats(stats=stats, out=out, line_count=n_lines)
        kwargs['term_freqs'] = True
        inp = IO.get_lines(stats_file, delim='\n')
    learn_vocab(inp=inp, level=model_type, model=model_path, vocab_size=vocab_size, **kwargs)
    return cls(model_path)
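# A hedged usage sketch for the train() classmethod above; "Codec" is a stand-in name
# for whatever class defines it, and the file names and vocab size are made up.
codec = Codec.train(model_type='bpe', vocab_size=8000, model_path='bpe.model',
                    files=['train.en', 'train.fr'])
# Without spark, the raw files are streamed directly; with a live SparkSession the
# word/char frequencies are computed once and cached at 'bpe.model.termfreqs' so
# later runs can reuse them.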
def test_bpe():
    vocab_size = 6000
    args = dict(inp=IO.read_as_stream(paths=[en_txt, fr_txt]), level='bpe',
                vocab_size=vocab_size, min_freq=1, term_freqs=False,
                char_coverage=0.99999, min_co_ev=2)
    with tempfile.TemporaryDirectory() as tmpdir:
        model_file = Path(tmpdir) / 'model.tsv'
        args['model'] = model_file
        table = nlc.learn_vocab(**args)
        assert len(table) == vocab_size

        table2, meta = nlc.Type.read_vocab(model_file)
        assert len(table2) == len(table)
        table_str = '\n'.join(x.format() for x in table)
        table2_str = '\n'.join(x.format() for x in table2)
        assert table_str == table2_str
def test_shrink():
    vocab_size = 6000
    args = dict(inp=IO.read_as_stream(paths=[en_txt, fr_txt]), level='bpe',
                vocab_size=vocab_size, min_freq=1, term_freqs=False,
                char_coverage=0.99999, min_co_ev=2)
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        model_file = tmpdir / 'model.tsv'
        en_model_file = tmpdir / 'model.en.tsv'
        args['model'] = model_file
        table = nlc.learn_vocab(**args)
        assert len(table) == vocab_size

        scheme = nlc.load_scheme(model_file)
        mapping = scheme.shrink_vocab(files=[en_txt], min_freq=1, save_at=en_model_file)
        assert len(mapping) > 0

        model2 = nlc.load_scheme(en_model_file)
        assert len(model2.table) == len(mapping)
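# The shrunk, English-only model can be loaded and used like the full one; a sketch
# assuming the module-level encode() helper shown in the CLI main() above and a
# hypothetical input file 'dev.en'.
def _example_encode_with_shrunk_model(en_model_file):
    scheme = nlc.load_scheme(en_model_file)
    with open('dev.en', encoding='utf-8') as inp:
        for seq in encode(inp, scheme, indices=False):
            print(' '.join(seq))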