def parse(self):
    """
    Load the tab-separated training file at ``self.trainingPath`` into ``self.data``.

    Column 0 of every row is the sentiment label and column 1 is the tweet
    text. The label is replaced with an integer
    (2: Positive, 1: Neutral, 0: Negative, 3: Unknown) and the tweet is
    tokenized. For every row whose tweet produces at least one token, a
    ``{'Sentiment': <int>, 'Tweet': <tokens>}`` dict is appended to
    ``self.data``.

    Raises:
        KeyError: if a row's label is not one of the four known labels.
    """
    enum = {'positive': 2, 'neutral': 1, 'negative': 0, 'unknown': 3}
    label_words = ("neutral", "positive", "negative", "unknown")
    tk = Tokenizer(preserve_case=False)

    with open(self.trainingPath) as training:
        tsvRead = csv.reader(training, delimiter="\t")
        for line in tsvRead:
            # csv.reader yields an empty list for a blank line; such rows
            # have no tweet column and are skipped instead of crashing.
            if len(line) < 2:
                continue

            # Tokenize once and reuse the result for both the emptiness
            # check and the stored tweet.
            phrase = tk.tokenize(line[1])
            if not phrase:
                continue

            # A label word showing up after the 50th token cuts the tweet at
            # that position (presumably a following record that got merged
            # into this row -- TODO confirm against the data set).
            for i, word in enumerate(phrase):
                if i > 50 and word in label_words:
                    phrase = phrase[:i]
                    break

            self.data.append({'Sentiment': enum[line[0]], 'Tweet': phrase})
def run(src: str) -> None:
    """
    Push one piece of source text through tokenizer, parser and interpreter.

    Every stage returns an error collection that is handed to
    ``display_errors``. When that call returns a truthy value after the
    tokenizing or parsing stage, the run stops there. The module-level
    ``env`` is given to the interpreter and rebound to the environment the
    interpreter returns.

    The ``tok_debug``, ``parse_debug`` and ``env_debug`` module flags enable
    dumps of the tokens, the parsed trees and ``env.map`` respectively.
    """
    global env

    # Stage 1: tokenization
    tokens, lex_errors = Tokenizer().tokenize(src)
    if tok_debug:
        for token in tokens:
            print(token)
    if display_errors(lex_errors, "LOX: SYNTAX ERROR"):
        return

    # A token stream that starts with EOF is never forwarded to the parser;
    # this lets the parser make stricter assertions while building the AST.
    leading = tokens[0]
    if leading.type == TokenType.EOF:
        return

    # Stage 2: parsing
    program, grammar_errors = Parser().parse(tokens)
    if parse_debug:
        for node in program:
            print(node)
    if display_errors(grammar_errors, "LOX: GRAMMAR ERROR"):
        return

    # Stage 3: interpretation (the returned exit status is not used here)
    _status, runtime_errors, env = Interpreter(env).interpret(program)
    display_errors(runtime_errors, "LOX: RUNTIME ERROR")

    if env_debug:
        print(env.map)
# Sum of exp(f) over the entries from index 39 onward
# (NOTE(review): `f` looks like it holds log-values -- confirm upstream).
f = np.array(f)
print(np.exp(f[39:]).sum())

# Length histogram of the entries of `w` from index 39 onward,
# printed as "|length|count|" rows in ascending length order.
l = [len(entry) for entry in w[39:]]
freq = sorted(Counter(l).items(), key=lambda pair: pair[0])
print('\n'.join(f'|{i}|{j}|' for i, j in freq))

# In[]
ds = NewsDataset('data/news_dataset_tag10_v2.1.db')
# ds = NewsDataset('data/wiki.db')
tk = Tokenizer('data/t2.1_c1')

# In[]
from src.utils import peek

# Show field 2 of one sampled record, then the same field after a
# tokenize/detokenize round trip.
d = peek(ds.data, 1)
print(d[0][2])
print(tk.detokenize(tk.tokenize(d[0][2])))

# In[]
# Minimum, quartiles and maximum of the length of field 2 across the data set.
ll = [len(record[2]) for record in ds.data]
sl = sorted(ll)
print(sl[0])
print(sl[int(len(sl) * 0.25)])
print(sl[int(len(sl) * 0.5)])
print(sl[int(len(sl) * 0.75)])
print(sl[-1])

# In[]
# Token counts (without BOS/EOS) of the second and third field of every record.
tl, al = [], []
for idx, t, a in tqdm(ds.data):
    tl.append(len(tk.tokenize(t, bos=False, eos=False)))
    al.append(len(tk.tokenize(a, bos=False, eos=False)))
def main(args):
    """
    Generate synthetic sequences with beam search and cross-check two decoders.

    When ``args.results`` does not exist yet, the data set is decoded with
    ``beam_search`` and the detokenized output is written to
    ``args.results``. When it already exists, the data set is decoded with
    ``beam_search_v2`` instead, the output is written to
    ``args.results + '_2'``, and every line is compared with the existing
    file; differing pairs are printed. Both paths print the elapsed decode
    time in seconds.

    Args:
        args: options object; this function reads ``seed``, ``tokenizer``,
            ``device``, ``ckpt``, ``m_ratio``, ``results`` and ``beam``.

    Raises:
        Exception: if the new output and the existing results file do not
            contain the same number of lines.
    """
    set_seed(args.seed)
    tk = Tokenizer(args.tokenizer)
    model = TransformerModel(
        d_model=768,
        d_ff=1024,
        dropout=.1,
        layers=6,
        heads=8,
        d_emb=-1,
        pad_token_id=tk.pad_id,
        vocab_size=tk.vocab_size
    )
    ds = dataset(0)
    device = torch.device(args.device)
    model.load_state_dict(torch.load(args.ckpt, map_location=device)['model'])
    ds.set_mono_ratio(args.m_ratio)

    if not os.path.exists(args.results):
        # First run: produce the reference output with the original beam search.
        start = timeit.default_timer()
        ds.generate(
            lambda x: [beam_search(
                model=model.to(device),
                input_sequence=torch.LongTensor(tk.tokenize(x)).to(device),
                bos_id=tk.bos_id,
                eos_id=tk.eos_id,
                beam_width=args.beam,
                device=device,
                max_seq_len=64)],
            max_input_len=64)
        end = timeit.default_timer()
        print(f'{end-start:.2f} sec')
        # Context manager so the handle is flushed and closed deterministically.
        with open(args.results, 'w') as out:
            out.write('\n'.join(tk.detokenize(ds.synthetic[1:])))
    else:
        # Later runs: decode with the batched implementation and compare.
        start = timeit.default_timer()
        ds.generate(
            lambda x: beam_search_v2(
                model=model.to(device),
                input_sequence=tk.tokenize(x),
                tokenizer=tk,
                is_full=lambda b, nx, ny: (nx + ny * 1.5) * b > 256 * 64,
                beam_width=args.beam,
                device=device,
                max_seq_len=64),
            max_input_len=64,
            batch_size=64)
        end = timeit.default_timer()

        s = tk.detokenize(ds.synthetic[1:])
        with open(args.results + '_2', 'w') as out:
            out.write('\n'.join(s))
        with open(args.results) as ref:
            r = ref.readlines()

        if len(s) != len(r):
            raise Exception(f'result should be length of {len(r)} but got {len(s)}')
        for i, j in zip(s, r):
            if i != j.strip():
                print(f'---------------\n"{i}"\n!=\n"{j.strip()}"')
        print(f'{end-start:.2f} sec')
class WikipediaHandler(xml.sax.ContentHandler):
    """
    SAX content handler that feeds the pages of an XML dump to the indexer.

    For each page it collects the ``<title>``, the page-level ``<id>`` and
    the ``<text>`` body. Closing ``<title>`` creates a ``Tokenizer`` for the
    page; closing ``<text>`` records the page in
    ``Helpers.docid_docname_map`` and appends one ``(term, doc_id)`` pair per
    entry of the tokenizer result to ``Indexer.termid_docid_list``.
    """

    def __init__(self):
        super().__init__()
        self.tokenizer = None
        self.tag = None  # element currently open, consulted by characters()
        # title and text and id (document id) are available as fields
        self.title = ""
        self.text = ""
        self.id = ""
        # need extra processing for following fields:-
        # infobox , categories , external , references

        # Some booleans to determine nesting.
        # There is another tag named id which is nested inside the revision
        # tag; make sure we do NOT use that as the document id.
        self.insideRevision = False

    def startElement(self, tag, attributes):
        """
        Signals the start of an element in non-namespace mode.

        Remembers the tag for ``characters()`` and resets the buffer that
        belongs to it.
        """
        self.tag = tag  # for identification in characters() method
        if tag == "title":
            log.debug("%s start", tag)
            self.title = ""  # reset for new title
        elif tag == "text":
            log.debug("%s start", tag)
            self.text = ""  # reset for new text
        elif tag == "revision":
            log.debug("%s start", tag)
            self.insideRevision = True
        elif tag == "id":
            log.debug("%s start", tag)
            if not self.insideRevision:
                # characters() accumulates, so start from an empty id
                self.id = ""

    def endElement(self, tag):
        """
        Signals the end of an element in non-namespace mode.

        Acts on the data buffered for the element that just closed.
        """
        if tag == "title":
            # initialize a new document with title == self.title
            self.tokenizer = Tokenizer(self.title)
            self.tokenizer.set_title(self.title)
        elif tag == "text":
            # By now the document title and id fields must have been extracted
            doc_id = self.tokenizer.get_doc_id()
            Helpers.docid_docname_map[doc_id] = self.tokenizer.get_title()
            # add text body to that document
            termid_freq_map = self.tokenizer.tokenize(self.text)
            # accumulate (termid: docid) pairs
            Indexer.termid_docid_list.extend(
                (term, doc_id) for term in termid_freq_map)
        elif tag == "id" and not self.insideRevision:
            # Do NOT set id if inside <revision> <id>XXX</id>
            self.tokenizer.set_doc_id(self.id)
        elif tag == "revision":
            self.insideRevision = False  # </revision> encountered
        self.tag = None

    def characters(self, content):
        """
        Receive notification of character data.

        The parser calls this method to report each chunk of character data.
        SAX parsers may return all contiguous character data in a single
        chunk, or they may split it into several chunks, so every field is
        appended to rather than overwritten; the matching buffer is cleared
        in ``startElement()``.
        """
        if self.tag == "title":
            self.title += content
        elif self.tag == "text":
            self.text += content
        elif self.tag == "id" and not self.insideRevision:
            self.id += content
def main(args):
    """
    Decode a set of dataset records and tabulate predictions against targets.

    Records are either sampled with ``peek`` (``args.peek`` of them) or, when
    ``args.aids`` is non-empty, selected by id and arranged in the order of
    ``args.aids``. They are decoded once per beam width in ``args.beam`` and
    once per k in ``args.topk``. The detokenized input, each prediction and
    the target (cut to ``args.maxlen + 1`` items) form one row of a
    DataFrame, which is printed or, when ``args.output`` is set, written as
    CSV. The elapsed decode time in seconds is printed last.

    Args:
        args: options object; this function reads ``seed``, ``tokenizer``,
            ``layer``, ``heads``, ``inseq``, ``device``, ``maxlen``, ``peek``,
            ``data``, ``a``, ``b``, ``inplace``, ``sample``, ``ckpt``,
            ``model_dir``, ``ckpt_pattern``, ``aids``, ``beam``, ``topk`` and
            ``output``. ``args.ckpt`` is overwritten with the resolved path
            when it equals ``'latest'``.

    Raises:
        Exception: if the number of records matching ``args.aids`` differs
            from the number of requested ids.
    """
    if args.seed is not None:
        set_seed(args.seed)

    tk = Tokenizer(args.tokenizer)
    model = TransformerModel(
        d_model=768,
        d_ff=1024,
        dropout=0,
        layers=args.layer,
        heads=args.heads,
        d_emb=-1,
        pad_token_id=tk.pad_id,
        vocab_size=tk.vocab_size)

    def is_full(b, nx, ny):
        # Batch-size predicate handed to beam_search_v2.
        return (nx + ny) * b > 128 * 64

    if args.inseq is not None:
        # NOTE(review): this call's result is not used and no checkpoint has
        # been loaded into the model at this point -- confirm the intent.
        beam_search_v2(model, tk.tokenize(args.inseq), tk, is_full, 4,
                       args.device, args.maxlen)
        return

    if args.peek == 0:
        return

    ds = NewsDataset(args.data, args.a, args.b, inplace=args.inplace,
                     sample=args.sample, seed=args.seed)
    device = torch.device(args.device)
    if args.ckpt == 'latest':
        args.ckpt = find_latest_ckpt(args.model_dir, args.ckpt_pattern).path
    model.load_state_dict(torch.load(args.ckpt, map_location=device)['model'])
    model.to(device)

    if len(args.aids) == 0:
        _ids, inseq, outseq = ds.get_collate_fn(tk, getid=True)(
            peek(ds, args.peek, args.seed))
    else:
        data = [x for x in ds.data if x[0] in args.aids]
        if len(data) != len(args.aids):
            # Build the id tuple directly: zip(*data) is empty when nothing
            # matched, and indexing it would hide this error behind an
            # IndexError.
            raise Exception(f'only got {tuple(x[0] for x in data)}')
        # Arrange the selected records in the order the ids were requested.
        sdata = [x for i in args.aids for x in data if x[0] == i]
        _ids, inseq, outseq = ds.get_collate_fn(tk, getid=True)(sdata)

    start = timeit.default_timer()
    preds = []
    for beam_n in args.beam:
        p = beam_search_v2(model, inseq, tk, is_full, beam_n, args.device,
                           args.maxlen)
        preds.append((f'beam{beam_n}', p))
    for topk_k in args.topk:
        p = topk(model, inseq, tk, topk_k, args.device, args.maxlen)
        preds.append((f'topk{topk_k}', p))

    results = [
        (tk.detokenize(inseq[idx]),
         *[tk.detokenize(p[idx]) for _, p in preds],
         tk.detokenize(outseq[idx][:args.maxlen + 1]))
        for idx in range(len(inseq))
    ]
    df = pd.DataFrame(results,
                      columns=['input', *[n for n, _ in preds], 'target'])
    if args.output is None:
        print(df)
    else:
        df.to_csv(args.output)
    print((timeit.default_timer() - start))