Example #1
def main(args=None):
    args = args or parse_args()
    src_codec = load_scheme(args.src_model)
    tgt_codec = src_codec  # share the source codec unless a separate target model is given
    if args.tgt_model:
        tgt_codec = load_scheme(args.tgt_model)

    with spark_util.session() as session:
        df = spark_util.read_raw_bitext_tok(
            spark=session, src_path=args.src_text, tgt_path=args.tgt_text, truncate=args.truncate,
            src_len=args.src_len, tgt_len=args.tgt_len,
            src_tokenizer=src_codec.encode, tgt_tokenizer=tgt_codec.encode)
        # re-key rows as (id, (src_ids, tgt_ids)) pairs for storage
        rdd = df.rdd.map(lambda r: (r.id, (r.x, r.y)))
        db = spark_util.rdd_as_db(rdd=rdd, db_path=args.db_path, field_names=('x', 'y'),
                                  repartition=args.num_parts)
        log.info(f"stored {len(db)} recs at {args.db_path}")
Example #2
def main():
    args = parse_args()
    task = args.pop('task')
    if task == 'learn':
        args.pop('out')  # No output
        args.pop('indices')  # No output
        assert args.get('level'), 'argument --level is required for "learn" task'
        import time
        from datetime import timedelta
        from nlcodec.utils import max_RSS
        st = time.time()
        st_mem = max_RSS()[1]
        learn_vocab(**args)
        delta = timedelta(seconds=time.time() - st)
        et_mem = max_RSS()[1]
        log.info(f"Time taken: {delta}; Memory: {st_mem} --> {et_mem}")
    elif task in ('encode', 'decode'):
        scheme = load_scheme(args.pop('model'))
        inp, out, indices = args['inp'], args['out'], args.get('indices', False)
        if task == 'encode':
            recs = encode(inp, scheme, indices=indices)
            if indices:
                recs = ([str(idx) for idx in seq] for seq in recs)
            recs = (' '.join(seq) for seq in recs)
        else:
            recs = decode(inp, scheme, indices=indices)
        write_lines(recs, out)
    elif task == 'estimate':
        from nlcodec.qestim import estimate
        estimate(codec_path=args['model'], data=args['inp'])
    else:
        raise NotImplementedError(task + ' not implemented')
Example #3
def main():
    args = parse_args()
    task = args.pop('task')
    if task == 'learn':
        args.pop('out')  # No output
        args.pop('indices')  # No output
        assert args.get('level'), 'argument --level is required for "learn" task'
        learn_vocab(**args)
    elif task in ('encode', 'decode'):
        scheme = load_scheme(args.pop('model'))
        inp, out, indices = args['inp'], args['out'], args.get('indices', False)
        if task == 'encode':
            recs = encode(inp, scheme, indices=indices)
            if indices:
                recs = ([str(idx) for idx in seq] for seq in recs)
            recs = (' '.join(seq) for seq in recs)
        else:
            recs = decode(inp, scheme, indices=indices)
        write_lines(recs, out)
    elif task == 'estimate':
        from nlcodec.qestim import estimate
        estimate(codec_path=args['model'], data=args['inp'])
    else:
        raise NotImplementedError(task + ' not implemented')
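
Note: the encode and decode tasks above reduce to nlcodec's scheme API. A minimal sketch of the round trip, assuming a model already produced by the 'learn' task ('bpe.model.tsv' is a hypothetical path):

from nlcodec import load_scheme

scheme = load_scheme('bpe.model.tsv')  # hypothetical model path
line = 'hello world'
ids = scheme.encode(line)              # list of integer ids (the --indices form)
pieces = scheme.encode_str(line)       # list of subword strings
print(scheme.decode(ids))              # back to a detokenized string
print(scheme.decode_str(pieces))
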
Example #4
def __init__(self, path: Union[str, Path]):
    # this is experimental
    from nlcodec import load_scheme, EncoderScheme, Type
    self.codec: EncoderScheme = load_scheme(path)
    self.vocab: List[Type] = self.codec.table
    log.info(f'Loaded {len(self.codec)} types from {path}')
    for tok, idx in self.reserved():  # sanity check: reserved types must sit at their reserved indices
        # TODO: swap this with nlcodec.Reserved
        assert self.vocab[idx].name == tok
Example #5
def main(inp, out, n, model_path):
    scheme = load_scheme(model_path)
    seqs = scheme.encode_parallel(inp)
    freqs = make_n_grams_all(seqs, n)
    freqs = sorted(freqs.items(), key=lambda x: x[1], reverse=True)
    for gram, freq in freqs:
        gram = list(gram)
        names = [scheme.table[g].name for g in gram]
        line = json.dumps([gram, freq, names], ensure_ascii=False)
        out.write(line + '\n')
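
make_n_grams_all is not defined in this excerpt; judging by its use above, it tallies n-gram frequencies over the encoded id sequences. A minimal Counter-based sketch under that assumption (the name and signature come from the call above; the body is a guess, not the library's code):

from collections import Counter
from typing import Iterable, List

def make_n_grams_all(seqs: Iterable[List[int]], n: int) -> Counter:
    # count every contiguous n-gram of token ids across all sequences
    freqs = Counter()
    for seq in seqs:
        for i in range(len(seq) - n + 1):
            freqs[tuple(seq[i:i + n])] += 1
    return freqs
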
Example #6
def test_shrink():
    vocab_size = 6000
    args = dict(inp=IO.read_as_stream(paths=[en_txt, fr_txt]),
                level='bpe',
                vocab_size=vocab_size,
                min_freq=1,
                term_freqs=False,
                char_coverage=0.99999,
                min_co_ev=2)
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        model_file = tmpdir / 'model.tsv'
        en_model_file = tmpdir / 'model.en.tsv'
        args['model'] = model_file
        table = nlc.learn_vocab(**args)
        assert len(table) == vocab_size
        scheme = nlc.load_scheme(model_file)
        mapping = scheme.shrink_vocab(files=[en_txt], min_freq=1, save_at=en_model_file)
        assert len(mapping) > 0
        model2 = nlc.load_scheme(en_model_file)
        assert len(model2.table) == len(mapping)
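
The assertions above rely on shrink_vocab keeping only the types observed in the given files (here the English side), saving the reduced table at save_at and returning the old-to-new index mapping, so the shrunken model's table is exactly as large as the mapping. A hedged sketch of using the shrunken model afterwards (same API as above; the input line is a placeholder):

en_scheme = nlc.load_scheme(en_model_file)
ids = en_scheme.encode('a sample english sentence')  # ids now index the smaller table
print([en_scheme.table[i].name for i in ids])
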
Example #7
def main(model_path, cands, refs, n, out, freqs=None):
    codec = load_scheme(model_path)
    cands, refs = list(cands), list(refs)
    assert len(cands) == len(refs), f'cands: {len(cands)} but refs: {len(refs)} lines'

    cands = list(codec.encode_parallel(cands))
    refs = list(codec.encode_parallel(refs))
    gram_recalls, ref_gram_freqs, gram_precisions, cand_gram_freqs = count_gram_recall(cands, refs)
    if freqs:
        log.info(f"Loading precomputed gram freqs from {freqs}")
        freqs = [json.loads(l.strip()) for l in freqs]
        gram_freqs = {tuple(g): f for g, f, name in freqs}

        # subset of grams that are found in reference
        gram_freqs = {g: f for g, f in gram_freqs.items() if g in ref_gram_freqs or g in cand_gram_freqs}

        # these grams were not seen in training, but do appear in refs => OOVs => freq = -1
        oov_grams = {g: -1 for g in ref_gram_freqs if g not in gram_freqs}
        log.info(f"{len(oov_grams)} grams were OOV w.r.t. freqs => assigned freq = -1")
        gram_freqs.update(oov_grams)
    else:
        gram_freqs = ref_gram_freqs
    new_grams = {cand_gram: freq for cand_gram, freq in cand_gram_freqs.items()
                 if cand_gram not in gram_freqs}

    if new_grams:
        msg = f'Found {len(new_grams)} grams that are not found in refs or --freqs'
        log.warning(msg)
        if n == 1:
            for ng, f in new_grams.items():
                ng = ng[0]
                log.error(f'Not found:\t{ng}\t{codec.idx_to_str[ng]}\t{f}')
            # optionally escalate: raise Exception(msg)
        else:
            log.warning("TG, Come back and handle bigrams and above :)")

    gram_freqs = sorted(gram_freqs.items(), key=lambda t: t[1], reverse=True)
    out.write('Rank\tGram\tName\tRankF\tRefF\tCandF\tRecall\tPrecision\tF1\n')
    for i, (gram, rank_freq) in enumerate(gram_freqs):
        name = ','.join(codec.idx_to_str[g] for g in gram)
        idxs = ','.join(str(g) for g in gram)
        gram_recall = gram_recalls.get(gram, 0)
        gram_precision = gram_precisions.get(gram, 1) # should it be zero or one?
        f1 = f1_measure(gram_precision, gram_recall)
        ref_freq = ref_gram_freqs.get(gram, -1)
        cand_freq = cand_gram_freqs.get(gram, -1)
        out.write(f'{i+1}\t{idxs}\t{name}\t{rank_freq}\t{ref_freq}\t{cand_freq}\t{gram_recall:g}'
                  f'\t{gram_precision:g}\t{f1:g}\n')
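
f1_measure is not shown in this excerpt; presumably it is the usual harmonic mean of precision and recall. A minimal sketch under that assumption:

def f1_measure(precision: float, recall: float) -> float:
    # harmonic mean of precision and recall; defined as 0 when both are 0
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)
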
Example #8
def main(inp, model_path):
    codec = load_scheme(model_path)
    assert isinstance(codec, WordScheme)
    vocab = set(codec.str_to_idx.keys())
    iv_toks, oov_toks = partition_vocab_toks(inp, vocab)
    n_iv_types = len(iv_toks)
    n_iv_toks = sum(iv_toks.values())
    n_oov_types = len(oov_toks)
    n_oov_toks = sum(oov_toks.values())
    total_types = n_iv_types + n_oov_types
    total_toks = n_iv_toks + n_oov_toks

    print("*\tInVocab\tOOV")
    print(f"Types\t{n_iv_types}\t{n_oov_types}")
    print(f"Token Count\t{n_iv_toks}\t{n_oov_toks}")
    print(f"Token %\t{100*n_iv_toks/total_toks:.2f}\t{100*n_oov_toks/total_toks:.2f}")
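
partition_vocab_toks is likewise not shown; judging by the counts taken above, it returns two Counter-like mappings (token -> frequency) for in-vocabulary and out-of-vocabulary tokens. A plausible sketch, assuming whitespace-tokenized input lines (a guess, not the library's code):

from collections import Counter
from typing import Iterable, Set, Tuple

def partition_vocab_toks(lines: Iterable[str], vocab: Set[str]) -> Tuple[Counter, Counter]:
    # split token counts into in-vocabulary and out-of-vocabulary tallies
    iv, oov = Counter(), Counter()
    for line in lines:
        for tok in line.split():
            (iv if tok in vocab else oov)[tok] += 1
    return iv, oov
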
Example #9
def estimate(codec_path, data):
    codec = load_scheme(codec_path)
    estimator = QualityEstimator(codec)
    estimation = estimator.estimate(data)
    print(estimation)