Example #1
def parse(self):
    """
    Replace sentiment labels with 2: Positive, 1: Neutral, 0: Negative, 3: Unknown.
    """
    tk = Tokenizer(preserve_case=False)
    with open(self.trainingPath) as training:
        tsvRead = csv.reader(training, delimiter="\t")
        enum = {'positive': 2, 'neutral': 1, 'negative': 0, 'unknown': 3}
        for line in tsvRead:
            phrase = tk.tokenize(line[1])
            if not phrase:
                continue
            # Truncate the token list at the first label word that shows up
            # past position 50.
            for i, word in enumerate(phrase):
                if i > 50 and word in ["neutral", "positive", "negative", "unknown"]:
                    phrase = phrase[:i]
                    break
            self.data.append({'Sentiment': enum[line[0]], 'Tweet': phrase})
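A hedged sketch of the row layout this parse() appears to expect and the kind of entry it appends; the sample rows are hypothetical, and the lower-cased tokens assume the tokenizer folds case because preserve_case=False:

# Hypothetical TSV rows: sentiment label, tab, tweet text.
sample_rows = [
    ["positive", "I love this phone"],
    ["negative", "worst service ever"],
]
enum = {'positive': 2, 'neutral': 1, 'negative': 0, 'unknown': 3}
# With preserve_case=False the first row would yield roughly:
# {'Sentiment': 2, 'Tweet': ['i', 'love', 'this', 'phone']}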
Example #2
def run(src: str) -> None:
    global env

    # tokenization
    tkz = Tokenizer()
    tokens, err = tkz.tokenize(src)

    if tok_debug:
        for i in tokens:
            print(i)

    if display_errors(err, "LOX: SYNTAX ERROR"):
        return

    # Don't send a lone EOF token to the parser;
    # this allows the parser to make stricter assertions while generating the AST.
    if tokens[0].type == TokenType.EOF:
        return

    # parsing
    prs = Parser()
    program, err = prs.parse(tokens)

    if parse_debug:
        for tree in program:
            print(tree)

    if display_errors(err, "LOX: GRAMMAR ERROR"):
        return

    # interpretation
    itr = Interpreter(env)
    exit_status, err, env = itr.interpret(program)
    display_errors(err, "LOX: RUNTIME ERROR")

    if env_debug:
        print(env.map)
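A minimal driver sketch for run(); the prompt string is made up, and it assumes the module-level names run() relies on (env, tok_debug, parse_debug, env_debug) are initialised elsewhere:

if __name__ == "__main__":
    # Feed each input line through the tokenize/parse/interpret pipeline above.
    while True:
        try:
            line = input("lox> ")
        except EOFError:
            break
        run(line)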
Example #3
# Imports for these cells; f and w are assumed defined in earlier notebook cells.
import numpy as np
from collections import Counter
from tqdm import tqdm

f = np.array(f)
print(np.exp(f[39:]).sum())

lens = list(map(len, w[39:]))
freq = sorted(Counter(lens).items(), key=lambda x: x[0])
print('\n'.join(f'|{i}|{j}|' for i, j in freq))
# In[]
ds = NewsDataset('data/news_dataset_tag10_v2.1.db')
# ds = NewsDataset('data/wiki.db')
tk = Tokenizer('data/t2.1_c1')

# In[]
from src.utils import peek
d = peek(ds.data, 1)
print(d[0][2])
print(tk.detokenize(tk.tokenize(d[0][2])))
# In[]
# Text-length distribution: min, lower quartile, median, upper quartile, max.
ll = [len(x[2]) for x in ds.data]
sl = sorted(ll)
print(sl[0])
print(sl[int(len(sl) * 0.25)])
print(sl[int(len(sl) * 0.5)])
print(sl[int(len(sl) * 0.75)])
print(sl[-1])

# In[]
# Token lengths of the two text fields of each record, excluding BOS/EOS tokens.
tl = []
al = []
for idx, t, a in tqdm(ds.data):
    tl.append(len(tk.tokenize(t, bos=False, eos=False)))
    al.append(len(tk.tokenize(a, bos=False, eos=False)))
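A hedged follow-up sketch for summarising the token-length lists gathered above, assuming the previous cell has run; numpy.percentile is used purely for illustration:

import numpy as np

# Quartile summary (min, Q1, median, Q3, max) of each length list.
for name, lengths in [('tl', tl), ('al', al)]:
    q = np.percentile(lengths, [0, 25, 50, 75, 100])
    print(name, q)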
Example #4
def main(args):
    set_seed(args.seed)

    tk = Tokenizer(args.tokenizer)

    model = TransformerModel(
        d_model=768,
        d_ff=1024,
        dropout=0.1,
        layers=6,
        heads=8,
        d_emb=-1,
        pad_token_id=tk.pad_id,
        vocab_size=tk.vocab_size
    )

    ds = dataset(0)

    device = torch.device(args.device)

    model.load_state_dict(torch.load(args.ckpt, map_location=device)['model'])

    ds.set_mono_ratio(args.m_ratio)
    if not os.path.exists(args.results):
        start = timeit.default_timer()
        ds.generate(lambda x: [beam_search(
            model=model.to(device),
            input_sequence=torch.LongTensor(tk.tokenize(x)).to(device),
            bos_id=tk.bos_id,
            eos_id=tk.eos_id,
            beam_width=args.beam,
            device=device,
            max_seq_len=64)],
            max_input_len=64)
        end = timeit.default_timer()
        print(f'{end-start:.2f} sec')
        # Write the generated sequences, skipping the first element of ds.synthetic.
        with open(args.results, 'w') as out:
            out.write('\n'.join(tk.detokenize(ds.synthetic[1:])))
    else:
        start = timeit.default_timer()
        ds.generate(lambda x: beam_search_v2(
            model=model.to(device),
            input_sequence=tk.tokenize(x),
            tokenizer=tk,
            is_full=lambda b, nx, ny: (nx + ny * 1.5) * b > 256 * 64,
            beam_width=args.beam,
            device=device,
            max_seq_len=64),
            max_input_len=64,
            batch_size=64)
        end = timeit.default_timer()

        s = tk.detokenize(ds.synthetic[1:])
        with open(args.results + '_2', 'w') as out:
            out.write('\n'.join(s))
        # Compare against the previously written results file, line by line.
        with open(args.results) as ref:
            r = ref.readlines()
        if len(s) != len(r):
            raise Exception(f'result should have length {len(r)} but got {len(s)}')
        for i, j in zip(s, r):
            if i != j.strip():
                print(f'---------------\n"{i}"\n!=\n"{j.strip()}"')

        print(f'{end-start:.2f} sec')
Example #5
class WikipediaHandler(xml.sax.ContentHandler):
    def __init__(self):
        self.tokenizer = None
        self.tag = None
        # title, text and id (the document ID) are available as fields
        self.title = ""
        self.text = ""
        self.id = ""

        # The following fields need extra processing:
        # infobox, categories, external, references

        # There is another <id> tag nested inside <revision>;
        # track that nesting so we do NOT use it as the document id.
        self.insideRevision = False

    def startElement(self, tag, attributes):
        """
        Signals the start of an element in non-namespace mode.
        """
        self.tag = tag  # for identification in characters() method
        if tag == "title":
            log.debug("%s start", tag)
            self.title = ""  # reset for new title
        elif tag == "text":
            log.debug("%s start", tag)
            self.text = ""  # reset for new text
        elif tag == "revision":
            log.debug("%s start", tag)
            self.insideRevision = True
        elif tag == "id":
            log.debug("%s start", tag)

    def endElement(self, tag):
        """
        Signals the end of an element in non-namespace mode.
        """
        if tag == "title":
            # initialize a new document with title == self.title
            self.tokenizer = Tokenizer(self.title)
            self.tokenizer.set_title(self.title)

        elif tag == "text":
            # By now the document title and id fields must have been extracted
            Helpers.docid_docname_map[
                self.tokenizer.get_doc_id()] = self.tokenizer.get_title()
            # add text body to that document    # TODO: use append
            termid_freq_map = self.tokenizer.tokenize(self.text)

            # print("term_termid_map", Helpers.term_termid_map)
            for term in termid_freq_map:
                # accumulate (termid: docid) pairs
                Indexer.termid_docid_list.append(
                    (term, self.tokenizer.get_doc_id()))

        elif tag == "id" and not self.insideRevision:
            # Do NOT set the id when inside <revision><id>XXX</id>
            self.tokenizer.set_doc_id(self.id)

        elif tag == "revision":
            self.insideRevision = False  # </revision> encountered

        self.tag = None

    def characters(self, content):
        """
        Receive notification of character data.

        The Parser will call this method to report each chunk of character data.
        SAX parsers may return all contiguous character data in a single chunk,
        or they may split it into several chunks.
        """
        if self.tag == "title":
            # Accumulate rather than assign: the title may also arrive in chunks.
            self.title += content
        elif self.tag == "text":
            # Accumulate rather than assign, since character data may arrive in multiple chunks.
            self.text += content
        elif self.tag == "id" and not self.insideRevision:
            self.id = content
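A hedged sketch of how this handler might be driven with the standard library's SAX parser; the dump filename is hypothetical and the Indexer/Helpers bookkeeping around it is omitted:

import xml.sax

# Stream a Wikipedia XML dump through the handler defined above.
parser = xml.sax.make_parser()
parser.setContentHandler(WikipediaHandler())
parser.parse("enwiki-dump.xml")  # hypothetical path to the XML dump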
Example #6
def main(args):
    if args.seed is not None:
        set_seed(args.seed)

    tk = Tokenizer(args.tokenizer)

    model = TransformerModel(d_model=768,
                             d_ff=1024,
                             dropout=0,
                             layers=args.layer,
                             heads=args.heads,
                             d_emb=-1,
                             pad_token_id=tk.pad_id,
                             vocab_size=tk.vocab_size)

    if args.inseq is not None:
        r = beam_search_v2(model, tk.tokenize(args.inseq), tk,
                           lambda b, nx, ny: (nx + ny) * b > 128 * 64, 4,
                           args.device, args.maxlen)
    else:
        if args.peek == 0:
            return
        ds = NewsDataset(args.data,
                         args.a,
                         args.b,
                         inplace=args.inplace,
                         sample=args.sample,
                         seed=args.seed)

        device = torch.device(args.device)

        if args.ckpt == 'latest':
            args.ckpt = find_latest_ckpt(args.model_dir,
                                         args.ckpt_pattern).path
        model.load_state_dict(
            torch.load(args.ckpt, map_location=device)['model'])
        model.to(device)
        if len(args.aids) == 0:
            ids, inseq, outseq = ds.get_collate_fn(tk, getid=True)(peek(
                ds, args.peek, args.seed))
        else:
            # Keep only the requested ids and re-order them to match args.aids.
            data = [x for x in ds.data if x[0] in args.aids]
            if len(data) != len(args.aids):
                raise Exception(f'only got {list(zip(*data))[0]}')
            sdata = []
            for i in args.aids:
                for x in data:
                    if x[0] == i:
                        sdata.append(x)
            ids, inseq, outseq = ds.get_collate_fn(tk, getid=True)(sdata)

        start = timeit.default_timer()
        preds = []
        for beam_n in args.beam:
            p = beam_search_v2(model, inseq, tk, lambda b, nx, ny:
                               (nx + ny) * b > 128 * 64, beam_n, args.device,
                               args.maxlen)
            preds.append((f'beam{beam_n}', p))

        for topk_k in args.topk:
            p = topk(model, inseq, tk, topk_k, args.device, args.maxlen)
            preds.append((f'topk{topk_k}', p))

        results = []
        for idx in range(len(inseq)):
            results.append((tk.detokenize(inseq[idx]),
                            *[tk.detokenize(p[idx]) for _, p in preds],
                            tk.detokenize(outseq[idx][:args.maxlen + 1])))
        df = pd.DataFrame(results,
                          columns=['input', *[n for n, _ in preds], 'target'])
        if args.output is None:
            print(df)
        else:
            df.to_csv(args.output)
    print(timeit.default_timer() - start)