def main(args=None):
    args = args or parse_args()
    src_codec = load_scheme(args.src_model)
    tgt_codec = src_codec
    if args.tgt_model:
        tgt_codec = load_scheme(args.tgt_model)
    with spark_util.session() as session:
        # tokenize both sides of the bitext with the loaded codecs
        df = spark_util.read_raw_bitext_tok(
            spark=session, src_path=args.src_text, tgt_path=args.tgt_text,
            truncate=args.truncate, src_len=args.src_len, tgt_len=args.tgt_len,
            src_tokenizer=src_codec.encode, tgt_tokenizer=tgt_codec.encode)
        rdd = df.rdd.map(lambda r: (r.id, (r.x, r.y)))
        # persist (id -> (x, y)) records as an on-disk database
        db = spark_util.rdd_as_db(rdd=rdd, db_path=args.db_path,
                                  field_names=('x', 'y'),
                                  repartition=args.num_parts)
        log.info(f"stored {len(db)} recs at {args.db_path}")
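# A minimal usage sketch (not part of the module): the attribute names below
# mirror those accessed in main() above, but the real flag spellings are
# defined by parse_args() elsewhere; all paths here are hypothetical.
def _demo_bitext_db():
    from argparse import Namespace
    main(Namespace(
        src_model='bpe.src.model', tgt_model=None,   # hypothetical model path
        src_text='train.src', tgt_text='train.tgt',  # hypothetical bitext
        truncate=True, src_len=256, tgt_len=256,
        db_path='train.db', num_parts=64))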
def main():
    args = parse_args()
    task = args.pop('task')
    if task == 'learn':
        args.pop('out')      # No output
        args.pop('indices')  # No output
        assert args.get('level'), 'argument --level is required for "learn" task'
        import time
        from datetime import timedelta
        from nlcodec.utils import max_RSS
        st = time.time()
        st_mem = max_RSS()[1]
        learn_vocab(**args)
        delta = timedelta(seconds=time.time() - st)
        et_mem = max_RSS()[1]
        log.info(f"Time taken: {delta}; Memory: {st_mem} --> {et_mem}")
    elif task in ('encode', 'decode'):
        scheme = load_scheme(args.pop('model'))
        inp, out, indices = args['inp'], args['out'], args.get('indices', False)
        if task == 'encode':
            recs = encode(inp, scheme, indices=indices)
            if indices:
                recs = ([str(idx) for idx in seq] for seq in recs)
            recs = (' '.join(seq) for seq in recs)
        else:
            recs = decode(inp, scheme, indices=indices)
        write_lines(recs, out)
    elif task == 'estimate':
        from nlcodec.qestim import estimate
        estimate(codec_path=args['model'], data=args['inp'])
    else:
        raise NotImplementedError(task + ' not implemented')
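# A minimal sketch of the 'encode' branch as a direct API call (hypothetical
# model path; encode() and write_lines() are the helpers used above, and the
# shape of their inputs/outputs is inferred from that branch).
def _demo_encode():
    import sys
    scheme = load_scheme('bpe.model')                     # hypothetical path
    recs = encode(['hello world'], scheme, indices=False)
    write_lines((' '.join(seq) for seq in recs), sys.stdout)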
def __init__(self, path: Union[str, Path]):
    # this is experimental
    from nlcodec import load_scheme, EncoderScheme, Type
    self.codec: EncoderScheme = load_scheme(path)
    self.vocab: List[Type] = self.codec.table
    log.info(f'Loaded {len(self.codec)} types from {path}')
    # verify that reserved types sit at their expected indices
    # TODO: swap this with nlcodec.Reserved
    for tok, idx in self.reserved():
        assert self.vocab[idx].name == tok
def main(inp, out, n, model_path):
    scheme = load_scheme(model_path)
    seqs = scheme.encode_parallel(inp)
    freqs = make_n_grams_all(seqs, n)
    freqs = sorted(freqs.items(), key=lambda x: x[1], reverse=True)
    for gram, freq in freqs:
        gram = list(gram)
        names = [scheme.table[g].name for g in gram]
        line = json.dumps([gram, freq, names], ensure_ascii=False)
        out.write(line + '\n')
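# Usage sketch (hypothetical paths): dump bigram frequencies of a corpus as
# JSON lines, one [ids, freq, names] record per n-gram, most frequent first.
def _demo_ngram_dump():
    import sys
    with open('train.txt') as inp:
        main(inp=inp, out=sys.stdout, n=2, model_path='bpe.model')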
def test_shrink():
    vocab_size = 6000
    args = dict(inp=IO.read_as_stream(paths=[en_txt, fr_txt]), level='bpe',
                vocab_size=vocab_size, min_freq=1, term_freqs=False,
                char_coverage=0.99999, min_co_ev=2)
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        model_file = tmpdir / 'model.tsv'
        en_model_file = tmpdir / 'model.en.tsv'
        args['model'] = model_file
        table = nlc.learn_vocab(**args)
        assert len(table) == vocab_size
        scheme = nlc.load_scheme(model_file)
        mapping = scheme.shrink_vocab(files=[en_txt], min_freq=1,
                                      save_at=en_model_file)
        assert len(mapping) > 0
        model2 = nlc.load_scheme(en_model_file)
        assert len(model2.table) == len(mapping)
def main(model_path, cands, refs, n, out, freqs=None):
    codec = load_scheme(model_path)
    cands, refs = list(cands), list(refs)
    assert len(cands) == len(refs), f'cands: {len(cands)} but refs: {len(refs)} lines'
    cands = list(codec.encode_parallel(cands))
    refs = list(codec.encode_parallel(refs))
    gram_recalls, ref_gram_freqs, gram_precisions, cand_gram_freqs = \
        count_gram_recall(cands, refs)
    if freqs:
        log.info(f"Loading precomputed gram freqs from {freqs}")
        freqs = [json.loads(l.strip()) for l in freqs]
        gram_freqs = {tuple(g): f for g, f, name in freqs}
        # keep only the grams that occur in the references or candidates
        gram_freqs = {g: f for g, f in gram_freqs.items()
                      if g in ref_gram_freqs or g in cand_gram_freqs}
        # grams in refs but absent from training freqs are OOVs => freq = -1
        oov_grams = {g: -1 for g in ref_gram_freqs if g not in gram_freqs}
        log.info(f"{len(oov_grams)} grams are OOV w.r.t. --freqs; assigned freq = -1")
        gram_freqs.update(oov_grams)
    else:
        gram_freqs = ref_gram_freqs
    new_grams = {cand_gram: freq for cand_gram, freq in cand_gram_freqs.items()
                 if cand_gram not in gram_freqs}
    if new_grams:
        msg = f'Found {len(new_grams)} grams that occur in neither refs nor --freqs'
        log.warning(msg)
        if n == 1:
            for ng, f in new_grams.items():
                ng = ng[0]
                log.error(f'Not found:\t{ng}\t{codec.idx_to_str[ng]}\t{f}')
        else:
            log.warning("TODO: handle unseen grams for n > 1")
    gram_freqs = sorted(gram_freqs.items(), key=lambda t: t[1], reverse=True)
    out.write('Rank\tGram\tName\tRankF\tRefF\tCandF\tRecall\tPrecision\tF1\n')
    for i, (gram, rank_freq) in enumerate(gram_freqs):
        name = ','.join(codec.idx_to_str[g] for g in gram)
        idxs = ','.join(str(g) for g in gram)
        gram_recall = gram_recalls.get(gram, 0)
        gram_precision = gram_precisions.get(gram, 1)  # should it be zero or one?
        f1 = f1_measure(gram_precision, gram_recall)
        ref_freq = ref_gram_freqs.get(gram, -1)
        cand_freq = cand_gram_freqs.get(gram, -1)
        out.write(f'{i+1}\t{idxs}\t{name}\t{rank_freq}\t{ref_freq}\t{cand_freq}'
                  f'\t{gram_recall:g}\t{gram_precision:g}\t{f1:g}\n')
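# Usage sketch (hypothetical paths): unigram recall/precision report for a
# candidate file against a reference file, without precomputed --freqs.
def _demo_gram_report():
    import sys
    with open('cands.txt') as cands, open('refs.txt') as refs:
        main(model_path='bpe.model', cands=cands, refs=refs, n=1, out=sys.stdout)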
def main(inp, model_path):
    codec = load_scheme(model_path)
    assert isinstance(codec, WordScheme)
    vocab = set(codec.str_to_idx.keys())
    iv_toks, oov_toks = partition_vocab_toks(inp, vocab)
    n_iv_types = len(iv_toks)
    n_iv_toks = sum(iv_toks.values())
    n_oov_types = len(oov_toks)
    n_oov_toks = sum(oov_toks.values())
    total_types = n_iv_types + n_oov_types
    total_toks = n_iv_toks + n_oov_toks
    print("*\tInVocab\tOOV")
    print(f"Types\t{n_iv_types}\t{n_oov_types}")
    print(f"Token Count\t{n_iv_toks}\t{n_oov_toks}")
    print(f"Token %\t{100*n_iv_toks/total_toks:.2f}\t{100*n_oov_toks/total_toks:.2f}")
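# Usage sketch (hypothetical paths): in-vocabulary vs OOV type/token report
# for a test file; the model must be word-level per the assertion above.
def _demo_oov_report():
    with open('test.txt') as inp:
        main(inp=inp, model_path='word.model')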
def estimate(codec_path, data):
    codec = load_scheme(codec_path)
    estimator = QualityEstimator(codec)
    estimation = estimator.estimate(data)
    print(estimation)
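# Usage sketch (hypothetical paths; the accepted type of `data` is defined by
# QualityEstimator.estimate in nlcodec.qestim, not shown here).
def _demo_estimate():
    estimate(codec_path='bpe.model', data='test.txt')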