Python Tokenizer.detokenize Examples

Programming Language: Python

Namespace/Package Name: src.tokenizer

Class/Type: Tokenizer

Method/Function: detokenize

Examples at hotexamples.com: 3

Python Tokenizer.detokenize - 3 examples found. These are the top-rated real-world Python examples of src.tokenizer.Tokenizer.detokenize, extracted from open-source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

Tokenizer(30)

next(14)

_generator(7)

tokenize(6)

finished(3)

detokenize(3)

format_data(2)

create_tokens(2)

load_state_dict(1)

show_tokens(1)

set_title(1)

set_doc_id(1)

scan_source(1)

save_state_dict(1)

peek(1)

normalize(1)

_status(1)

convert_text_to_number(1)

load_config(1)

iter_terms(1)

counter_tokenize(1)

get_doc_id(1)

convert_number_to_text(1)

fit_on_text(1)

filter_new_lines(1)

filter(1)

export_xml(1)

doTokenization(1)

get_title(1)

Example #1

Show file

File: bench_decode.py Project: StoneLin0708/fakenews

def main(args):
    """Time beam-search decoding and cross-check it against a stored result.

    If ``args.results`` does not exist yet, ``ds.generate`` is driven with a
    ``beam_search`` callback, the elapsed time is printed, and the
    detokenized ``ds.synthetic[1:]`` is written to ``args.results`` (one
    entry per line).

    Otherwise ``ds.generate`` is driven with a ``beam_search_v2`` callback,
    the detokenized output is written to ``args.results + '_2'``, and each
    entry is compared with the corresponding (stripped) line of
    ``args.results``; every mismatching pair is printed, followed by the
    elapsed time.

    Args:
        args: Namespace providing ``seed``, ``tokenizer``, ``device``,
            ``ckpt``, ``m_ratio``, ``results`` and ``beam``.

    Raises:
        Exception: If the number of freshly decoded entries differs from the
            number of lines in the stored ``args.results`` file.
    """
    set_seed(args.seed)

    tk = Tokenizer(args.tokenizer)

    model = TransformerModel(
        d_model=768,
        d_ff=1024,
        dropout=.1,
        layers=6,
        heads=8,
        d_emb=-1,
        pad_token_id=tk.pad_id,
        vocab_size=tk.vocab_size
    )

    ds = dataset(0)

    device = torch.device(args.device)

    model.load_state_dict(torch.load(args.ckpt, map_location=device)['model'])

    ds.set_mono_ratio(args.m_ratio)
    if not os.path.exists(args.results):
        # No reference yet: decode with the original beam search and store it.
        start = timeit.default_timer()
        ds.generate(lambda x: [beam_search(
            model=model.to(device),
            input_sequence=torch.LongTensor(tk.tokenize(x)).to(device),
            bos_id=tk.bos_id,
            eos_id=tk.eos_id,
            beam_width=args.beam,
            device=device,
            max_seq_len=64)],
            max_input_len=64)
        end = timeit.default_timer()
        print(f'{end-start:.2f} sec')
        # NOTE(review): the first entry of ds.synthetic is skipped; the
        # reason is not visible from this function -- confirm.
        # A context manager guarantees the file is flushed and closed.
        with open(args.results, 'w') as reference_file:
            reference_file.write('\n'.join(tk.detokenize(ds.synthetic[1:])))
    else:
        # Reference exists: decode with the v2 beam search and compare.
        start = timeit.default_timer()
        ds.generate(lambda x: beam_search_v2(
            model=model.to(device),
            input_sequence=tk.tokenize(x),
            tokenizer=tk,
            is_full=lambda b, nx, ny: (nx + ny * 1.5) * b > 256 * 64,
            beam_width=args.beam,
            device=device,
            max_seq_len=64),
            max_input_len=64,
            batch_size=64)
        end = timeit.default_timer()

        s = tk.detokenize(ds.synthetic[1:])
        with open(args.results + '_2', 'w') as second_file:
            second_file.write('\n'.join(s))
        with open(args.results) as reference_file:
            r = reference_file.readlines()
        if len(s) != len(r):
            raise Exception(f'result should be length of {len(r)} but got {len(s)}')
        for i, j in zip(s, r):
            # Reference lines still carry their trailing newline.
            if i != j.strip():
                print(f'---------------\n"{i}"\n!=\n"{j.strip()}"')

        print(f'{end-start:.2f} sec')

Example #2

Show file

File: tkt.py Project: StoneLin0708/fakenews

# Sum of exp(f) over the entries from index 39 onward.
# NOTE(review): `f` and `w` are defined before this excerpt; exponentiating
# suggests `f` holds log-values -- confirm.
f = np.array(f)
tail_total = np.exp(f[39:]).sum()
print(tail_total)

# Length histogram of w[39:], printed as `|length|count|` rows sorted by length.
l = [len(entry) for entry in w[39:]]
freq = sorted(Counter(l).items(), key=lambda pair: pair[0])
print('\n'.join(f'|{i}|{j}|' for i, j in freq))
# In[]
ds = NewsDataset('data/news_dataset_tag10_v2.1.db')
# ds = NewsDataset('data/wiki.db')
tk = Tokenizer('data/t2.1_c1')

# In[]
# Round-trip one sampled row's third field through the tokenizer.
from src.utils import peek
d = peek(ds.data, 1)
sample_text = d[0][2]
print(sample_text)
print(tk.detokenize(tk.tokenize(sample_text)))
# In[]
# Min, quartiles and max of the length of each row's third field.
ll = [len(row[2]) for row in ds.data]
sl = sorted(ll)
print(sl[0])
for fraction in (0.25, 0.5, 0.75):
    print(sl[int(len(sl) * fraction)])
print(sl[-1])

# In[]
# Token counts (without BOS/EOS) for the second and third field of every row.
def _token_count(text):
    return len(tk.tokenize(text, bos=False, eos=False))


tl = []
al = []
for idx, t, a in tqdm(ds.data):
    tl.append(_token_count(t))
    al.append(_token_count(a))

Example #3

Show file

File: run_decode.py Project: StoneLin0708/fakenews

def main(args):
    """Decode inputs with several strategies and tabulate the results.

    With ``args.inseq`` set, a single ``beam_search_v2`` call is made on the
    tokenized input and the function returns.  Otherwise rows are taken from
    a ``NewsDataset`` (either ``args.peek`` sampled rows or the rows whose
    id is listed in ``args.aids``), decoded once per beam width in
    ``args.beam`` and once per ``k`` in ``args.topk``, and the detokenized
    input, predictions and target are collected in a ``DataFrame`` that is
    printed or written to ``args.output`` as CSV.  The elapsed decoding time
    is printed last.

    Args:
        args: Namespace providing ``seed``, ``tokenizer``, ``layer``,
            ``heads``, ``inseq``, ``device``, ``maxlen``, ``peek``, ``data``,
            ``a``, ``b``, ``inplace``, ``sample``, ``ckpt``, ``model_dir``,
            ``ckpt_pattern``, ``aids``, ``beam``, ``topk`` and ``output``.

    Raises:
        Exception: If the number of dataset rows matching ``args.aids``
            differs from ``len(args.aids)``.
    """
    if args.seed is not None:
        set_seed(args.seed)

    tk = Tokenizer(args.tokenizer)

    model = TransformerModel(d_model=768,
                             d_ff=1024,
                             dropout=0,
                             layers=args.layer,
                             heads=args.heads,
                             d_emb=-1,
                             pad_token_id=tk.pad_id,
                             vocab_size=tk.vocab_size)

    if args.inseq is not None:
        # NOTE(review): on this path no checkpoint is loaded and `r` is never
        # used or printed -- confirm whether this branch is unfinished.
        r = beam_search_v2(model, tk.tokenize(args.inseq), tk,
                           lambda b, nx, ny: (nx + ny) * b > 128 * 64, 4,
                           args.device, args.maxlen)
        return

    if args.peek == 0:
        return
    ds = NewsDataset(args.data,
                     args.a,
                     args.b,
                     inplace=args.inplace,
                     sample=args.sample,
                     seed=args.seed)

    device = torch.device(args.device)

    if args.ckpt == 'latest':
        args.ckpt = find_latest_ckpt(args.model_dir,
                                     args.ckpt_pattern).path
    model.load_state_dict(
        torch.load(args.ckpt, map_location=device)['model'])
    model.to(device)
    if len(args.aids) == 0:
        ids, inseq, outseq = ds.get_collate_fn(tk, getid=True)(peek(
            ds, args.peek, args.seed))
    else:
        data = list(filter(lambda x: x[0] in args.aids, ds.data))
        if len(data) != len(args.aids):
            # Build the id tuple directly: `list(zip(*data))[0]` raises
            # IndexError when nothing matched, hiding this error message.
            found_ids = tuple(x[0] for x in data)
            raise Exception(f'only got {found_ids}')
        # Re-order the matched rows to follow the order given in args.aids.
        sdata = []
        for i in args.aids:
            for x in data:
                if x[0] == i:
                    sdata.append(x)
        ids, inseq, outseq = ds.get_collate_fn(tk, getid=True)(sdata)

    start = timeit.default_timer()
    preds = []
    for beam_n in args.beam:
        p = beam_search_v2(model, inseq, tk, lambda b, nx, ny:
                           (nx + ny) * b > 128 * 64, beam_n, args.device,
                           args.maxlen)
        preds.append((f'beam{beam_n}', p))

    for topk_k in args.topk:
        p = topk(model, inseq, tk, topk_k, args.device, args.maxlen)
        preds.append((f'topk{topk_k}', p))

    # One row per input: input text, one column per strategy, then the target.
    results = []
    for idx in range(len(inseq)):
        results.append((tk.detokenize(inseq[idx]),
                        *[tk.detokenize(p[idx]) for _, p in preds],
                        tk.detokenize(outseq[idx][:args.maxlen + 1])))
    df = pd.DataFrame(results,
                      columns=['input', *[n for n, _ in preds], 'target'])
    if args.output is None:
        print(df)
    else:
        df.to_csv(args.output)
    print((timeit.default_timer() - start))