Example 1
def main():
    args = parse_args()
    task = args.pop('task')
    if task == 'learn':
        args.pop('out')  # No output
        args.pop('indices')  # No output
        assert args.get('level'), 'argument --level is required for "learn" task'
        import time
        from datetime import timedelta
        from nlcodec.utils import max_RSS
        st = time.time()
        st_mem = max_RSS()[1]
        learn_vocab(**args)
        delta = timedelta(seconds=time.time() - st)
        et_mem = max_RSS()[1]
        log.info(f"Time taken: {delta}; Memory: {st_mem} --> {et_mem}")
    elif task in ('encode', 'decode'):
        scheme = load_scheme(args.pop('model'))
        inp, out, indices = args['inp'], args['out'], args.get('indices', False)
        if task == 'encode':
            recs = encode(inp, scheme, indices=indices)
            if indices:
                recs = ([str(idx) for idx in seq] for seq in recs)
            recs = (' '.join(seq) for seq in recs)
        else:
            recs = decode(inp, scheme, indices=indices)
        write_lines(recs, out)
    elif task == 'estimate':
        from nlcodec.qestim import estimate
        estimate(codec_path=args['model'], data=args['inp'])
    else:
        raise NotImplementedError(task + ' not implemented')
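For reference, the 'learn' task above can also be driven without the argument parser by calling learn_vocab directly. A minimal sketch; train.txt and bpe.8k.model are made-up paths, and the hyper-parameter values are borrowed from the tests further down this page:

from nlcodec import learn_vocab

# Learn a BPE vocabulary of 8,000 types from a raw text file. The resulting
# model file is what load_scheme() later reads back for the encode/decode tasks.
with open('train.txt', encoding='utf-8') as inp:
    learn_vocab(inp=inp, level='bpe', model='bpe.8k.model', vocab_size=8000,
                min_freq=1, term_freqs=False, char_coverage=0.99999, min_co_ev=2)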
Example 2
def main():
    args = parse_args()
    model_path: Path = args['model']
    words_file = model_path.with_suffix(".wordfreq.gz")
    chars_file = model_path.with_suffix(".charfreq.gz")
    stats_file = chars_file if args['level'] == 'char' else words_file

    with utils.log_resources(name="extract stats"):
        master = args.pop('spark_master')
        driver_mem = args.pop('driver_mem')
        dedup = args.pop('dedup')
        if stats_file.exists():
            log.warning(f"{stats_file} exists, reusing it. please delete it if this is wrong.")
        else:
            inp = args.pop('inp')
            with spark.session(master=master, driver_mem=driver_mem) as session:
                words, chars, line_count = word_counts(paths=inp, dedup=dedup, spark=session)
            with utils.IO.writer(words_file) as out:
                write_stats(words, out, line_count=line_count)
            with utils.IO.writer(chars_file) as out:
                write_stats(chars, out, line_count=line_count)

    assert stats_file.exists()
    with utils.log_resources(name=f"learning {args['level']} vocab"):
        with utils.IO.reader(stats_file) as inp:
            learn_vocab(inp=inp, term_freqs=True, **args)
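When the corpus is small enough that Spark is overkill, the statistics that word_counts() produces can be approximated in plain Python. The helper below is a hypothetical single-process stand-in (not part of nlcodec), returning the same (word_freqs, char_freqs, line_count) triple computed above:

from collections import Counter

def simple_counts(paths, dedup=False):
    """Single-process stand-in for term_freq.word_counts():
    returns (word_freqs, char_freqs, line_count)."""
    words, chars = Counter(), Counter()
    lines = []
    for p in paths:
        with open(p, encoding='utf-8') as f:
            lines.extend(line.strip() for line in f)
    if dedup:                       # dedup=True drops duplicate sentences
        lines = list(set(lines))
    for line in lines:
        toks = line.split()
        words.update(toks)
        chars.update(ch for tok in toks for ch in tok)
    return words, chars, len(lines)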
Example 3
def main():
    args = parse_args()
    task = args.pop('task')
    if task == 'learn':
        args.pop('out')  # No output
        args.pop('indices')  # No output
        assert args.get('level'), 'argument --level is required for "learn" task'
        learn_vocab(**args)
    elif task in ('encode', 'decode'):
        scheme = load_scheme(args.pop('model'))
        inp, out, indices = args['inp'], args['out'], args.get('indices', False)
        if task == 'encode':
            recs = encode(inp, scheme, indices=indices)
            if indices:
                recs = ([str(idx) for idx in seq] for seq in recs)
            recs = (' '.join(seq) for seq in recs)
        else:
            recs = decode(inp, scheme, indices=indices)
        write_lines(recs, out)
    elif task == 'estimate':
        from nlcodec.qestim import estimate
        estimate(codec_path=args['model'], data=args['inp'])
    else:
        raise NotImplementedError(task + ' not implemented')
Example 4
File: codec.py Project: isi-nlp/rtg
    def train(cls,
              model_type: str,
              vocab_size: int,
              model_path: str,
              files: List[str],
              no_split_toks: Optional[List[str]] = None,
              char_coverage: float = 0,
              dedup=True,
              spark=None):
        """
        :param model_type: word, char, bpe
        :param vocab_size: vocabulary size
        :param model_path: where to store vocabulary model
        :param files: text files for creating the vocabulary
        :param no_split_toks: tokens that should not be split (not yet supported by nlcodec)
        :param char_coverage: character coverage (0, 1]. value <= 0 => default coverage
        :param dedup: drop duplicate lines before counting term frequencies (spark path)
        :param spark: optional SparkSession; when given, term frequencies are pre-computed with spark
        :return: an instance of this class backed by model_path
        """
        assert not no_split_toks, 'not supported in nlcodec yet'
        from nlcodec import learn_vocab, term_freq
        kwargs = dict(char_coverage=char_coverage) if char_coverage > 0 else {}
        if not spark:
            inp = IO.get_liness(*files)
        else:
            # extract and store frequencies to this file
            stats_file = model_path + '.termfreqs'
            if not Path(stats_file).exists():
                log.info("Extracting term frequencies... ")
                paths = [f if isinstance(f, Path) else Path(f) for f in files]
                wfs, chfs, n_lines = term_freq.word_counts(paths=paths,
                                                           dedup=dedup,
                                                           spark=spark)
                log.info(
                    f"Lines = {n_lines:,}, Word Types: {len(wfs):,} Char Types:{len(chfs):,}"
                )
                stats = chfs if model_type == 'char' else wfs
                log.info(f"Writing frequencies to {stats_file}")
                with IO.writer(stats_file) as out:
                    term_freq.write_stats(stats=stats,
                                          out=out,
                                          line_count=n_lines)
            # the stats file holds term frequencies rather than raw text, so tell
            # learn_vocab to interpret its input accordingly; this also applies
            # when an existing stats file is being reused
            kwargs['term_freqs'] = True
            inp = IO.get_lines(stats_file, delim='\n')

        learn_vocab(inp=inp,
                    level=model_type,
                    model=model_path,
                    vocab_size=vocab_size,
                    **kwargs)
        return cls(model_path)
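A hypothetical call to this classmethod for a non-Spark run; the snippet does not show the enclosing class, so Codec below is only a placeholder name, and the paths are made up:

# Hypothetical usage; `Codec` stands in for whichever class in rtg's codec.py
# actually defines train() above.
codec = Codec.train(model_type='bpe',
                    vocab_size=8000,
                    model_path='work/data/nlcodec.bpe.8k.model',
                    files=['work/data/train.src', 'work/data/train.tgt'])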
Example 5
def test_bpe():
    vocab_size = 6000

    args = dict(inp=IO.read_as_stream(paths=[en_txt, fr_txt]),
                level='bpe',
                vocab_size=vocab_size,
                min_freq=1,
                term_freqs=False,
                char_coverage=0.99999,
                min_co_ev=2)
    with tempfile.TemporaryDirectory() as tmpdir:
        model_file = Path(tmpdir) / 'model.tsv'
        args['model'] = model_file
        table = nlc.learn_vocab(**args)
        assert len(table) == vocab_size
        table2, meta = nlc.Type.read_vocab(model_file)
        assert len(table2) == len(table)
        table_str = '\n'.join(x.format() for x in table)
        table2_str = '\n'.join(x.format() for x in table2)
        assert table_str == table2_str
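Once learned, the model file can be reloaded and applied to new text, as the next example also does. A minimal sketch; the encode_str()/decode_str() method names on the scheme object are an assumption, not shown on this page:

scheme = nlc.load_scheme(model_file)
pieces = scheme.encode_str('a quick test sentence')   # list of subword pieces
restored = scheme.decode_str(pieces)                   # back to a plain string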
Example 6
def test_shrink():
    vocab_size = 6000
    args = dict(inp=IO.read_as_stream(paths=[en_txt, fr_txt]),
                level='bpe',
                vocab_size=vocab_size,
                min_freq=1,
                term_freqs=False,
                char_coverage=0.99999,
                min_co_ev=2)
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        model_file = tmpdir / 'model.tsv'
        en_model_file = tmpdir / 'model.en.tsv'
        args['model'] = model_file
        table = nlc.learn_vocab(**args)
        assert len(table) == vocab_size
        scheme = nlc.load_scheme(model_file)
        mapping = scheme.shrink_vocab(files=[en_txt], min_freq=1, save_at=en_model_file)
        assert len(mapping) > 0
        model2 = nlc.load_scheme(en_model_file)
        assert len(model2.table) == len(mapping)
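One motivation for shrink_vocab() is reusing a parent model's embeddings with a smaller, data-specific vocabulary. A hypothetical sketch of that use, continuing from the variables of test_shrink() above and assuming mapping is a sequence whose i-th entry is the original index of the i-th retained type (the test does not show this layout explicitly):

import numpy as np

# Hypothetical follow-up: slice a pretrained embedding matrix down to the
# shrunk vocabulary using the returned mapping.
full_emb = np.random.rand(vocab_size, 512)   # stand-in for trained embeddings
small_emb = full_emb[list(mapping)]          # select/reorder rows per mapping
assert small_emb.shape[0] == len(mapping)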