    nl2ids = Tokenizer.lst2idx(tokenizer=Tokenizer.process_nl, vocab_words=tokenizer.sw2i,
                               unk_words=True, sos=False, eos=False)
    tokenizer.tw2i = lb2id_dict
    tokenizer.i2tw = id2lb_dict
    tg2ids = Tokenizer.lst2idx(tokenizer=Tokenizer.process_target, vocab_words=tokenizer.tw2i,
                               unk_words=False, sos=False, eos=False)
    pad_id = tokenizer.sw2i.get(PAD, 0)
else:
    vocab_file = "/media/data/review_response/tokens/bert_level-bpe-vocab.txt"
    tokenizer = BPE.load(vocab_file)
    tokenizer.add_tokens(sys_tokens)
    nl2ids = BPE.tokens2ids(tokenizer, sos=False, eos=False, add_special_tokens=False)
    tg2ids = BPE.tokens2ids(tokenizer, sos=False, eos=False, add_special_tokens=False)
    pad_id = tokenizer.token_to_id(BPAD) if tokenizer.token_to_id(BPAD) else 0
collate_fn = BPE.collate_fn(pad_id, True)
def build_data(args):
    if not args.tl:
        if not os.path.exists(args.model_dir):
            os.mkdir(args.model_dir)
        if args.timestamped_subdir:
            sub_folder = datetime.now().isoformat(sep='-', timespec='minutes').replace(":", "-").replace("-", "_")
        else:
            sub_folder = ''
        if not os.path.exists(os.path.join(args.model_dir, sub_folder)):
            os.mkdir(os.path.join(args.model_dir, sub_folder))
        args.model_dir = os.path.join(args.model_dir, sub_folder)
        args.log_file = os.path.join(args.model_dir, args.log_file)

        if args.tokenize_type != "bpe":
            s_paras = [args.wl_th, args.wcutoff]
            t_paras = [args.wl_th, args.wcutoff]
            print("INFO: - Build vocabulary...")
            tokenizer = Tokenizer(s_paras, t_paras)
            files = [args.train_file]
            if args.train_file != args.dev_file:
                files.append(args.dev_file)
            # Load datasets to build vocabulary
            data = Tokenizer.load_file(files, task=2)
            tokenizer.build(datasets=data)
            sw2i = tokenizer.sw2i
            tw2i = tokenizer.tw2i
            print("INFO: - Save vocabulary...")
            Tokenizer.save(tokenizer, os.path.join(args.model_dir, "tokenizer.vocab"))
        else:
            print("INFO: - Load vocabulary...")
            tokenizer = BPE.load(args.vocab_file)
            tokenizer.add_tokens(sys_tokens)
            sw2i = tokenizer.get_vocab()
            tw2i = tokenizer.get_vocab()
        # args.tokenizer = tokenizer

        # Source language
        args.swd_pretrained = None
        args.twd_pretrained = None
        if len(args.swd_embfile) != 0:
            scale = np.sqrt(3.0 / args.swd_dim)
            emb_reader = Embeddings(args.swd_embfile)
            args.swd_pretrained = emb_reader.get_W(args.swd_dim, sw2i, scale)
            if args.twd_embfile == args.swd_embfile:
                scale = np.sqrt(3.0 / args.twd_dim)
                args.twd_pretrained = emb_reader.get_W(args.twd_dim, tw2i, scale)

        # Target language
        if len(args.twd_embfile) != 0:
            scale = np.sqrt(3.0 / args.twd_dim)
            if args.twd_pretrained is None:
                emb_reader = Embeddings(args.swd_embfile)
                args.twd_pretrained = emb_reader.get_W(args.twd_dim, tw2i, scale)

        # directly integrate transfer learning if not updating new words
        SaveloadHP.save(args, os.path.join(args.model_dir, args.model_args))
        return args
    else:
        print("INFO: - Use transfer learning technique")
        assert os.path.exists(args.tlargs), "\t - There is no pre-trained argument file"
        # load the pre-trained argument file from a previous training folder
        margs = SaveloadHP.load(args.tlargs)
        # margs.tl = args.tl
        # margs.log_file = args.log_file
        # TODO: update the new vocab and all other new arguments used for the new training
        # 0. Read vocab
        # 1. Update schema
        # 2. Update vocab
        # args.tokenizer = margs.tokenizer
        # 3. Reuse all model file directories of the previous training run
        args.model_dir = margs.model_dir
        args.seq2seq_file = margs.seq2seq_file
        # 4. Keep the remaining current arguments
        # add a constraint at load time: if any model fails to load, just skip it
        args.swd_pretrained = margs.swd_pretrained
        args.twd_pretrained = margs.twd_pretrained
        return args
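# A hypothetical usage sketch (not in the original file), following the repo's
# convention of commented-out examples; `parser` is assumed to be the argparse
# parser that defines the arguments consumed by build_data() above.
# if __name__ == "__main__":
#     args = parser.parse_args()
#     args = build_data(args)  # builds/loads the vocab, prepares embeddings, saves model args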
# -*- coding: utf-8 -*-
"""
Created on 25/03/2020
@author duytinvo
"""
import torch
from mlmodels.utils.BPEtonkenizer import BPE

vocab_file = "/media/data/review_response/tokens/bert_level-bpe-vocab.txt"
tokenizer = BPE.load(vocab_file)

sent = "This is an example, where we test BPE, which stands for Byte-Pair Encoding. Happy coding!!!"
encoded = tokenizer.encode(sent)

# from torchtext.experimental.datasets import IMDB
# train1, = IMDB(data_select='train')
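# A minimal inspection sketch (assumed, not in the original script): the Encoding
# returned by tokenizer.encode() exposes .tokens and .ids (the .tokens attribute is
# also used elsewhere in this repo), and decode() maps ids back to text.
print(encoded.tokens)                  # sub-word pieces after BPE merges
print(encoded.ids)                     # corresponding vocabulary indices
print(tokenizer.decode(encoded.ids))   # approximate round-trip back to text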
def __init__(self, args=None):
    print("INFO: - Load the pre-built tokenizer...")
    if args.tokenize_type != "bpe":
        tokenizer = Tokenizer.load(os.path.join(args.model_dir, "tokenizer.vocab"))
    else:
        tokenizer = BPE.load(args.vocab_file)
        tokenizer.add_tokens(sys_tokens)
        tokenizer.tw2i = tokenizer.get_vocab()
        tokenizer.i2tw = Tokenizer.reversed_dict(tokenizer.tw2i)
    self.args = args
    self.tokenizer = tokenizer
    self.device = torch.device("cuda:0" if self.args.use_cuda else "cpu")
    # Include SOt, EOt if set_words is set, else ignore SOt, EOt
    # self.num_labels = len(self.tokenizer.tw2i)
    self.num_labels = self.tokenizer.get_vocab_size()
    if self.num_labels > 2:
        self.lossF = nn.CrossEntropyLoss().to(self.device)
    else:
        self.lossF = nn.BCEWithLogitsLoss().to(self.device)

    # Hyper-parameters at source language
    if self.args.tokenize_type != "bpe":
        self.source2idx = Tokenizer.lst2idx(tokenizer=self.tokenizer.process_nl,
                                            vocab_words=self.tokenizer.sw2i, unk_words=True,
                                            sos=self.args.ssos, eos=self.args.seos)
        # Hyper-parameters at target language
        self.target2idx = Tokenizer.lst2idx(tokenizer=self.tokenizer.process_target,
                                            vocab_words=self.tokenizer.tw2i, unk_words=True,
                                            sos=self.args.tsos, eos=self.args.teos)
        self.pad_id = self.tokenizer.sw2i.get(PAD, 0)
        self.unk_id = self.tokenizer.sw2i.get(UNK, UNK_id)
        sw_size = len(self.tokenizer.sw2i)
        # tw_size = len(self.tokenizer.tw2i)
        self.collate_fn = Tokenizer.collate_fn(self.pad_id, True)
    else:
        self.source2idx = BPE.tokens2ids(self.tokenizer, sos=self.args.ssos, eos=self.args.seos)
        self.target2idx = BPE.tokens2ids(self.tokenizer, sos=self.args.tsos, eos=self.args.teos)
        self.pad_id = self.tokenizer.token_to_id(BPAD) if self.tokenizer.token_to_id(BPAD) is not None \
            else self.tokenizer.token_to_id(PAD)
        self.unk_id = self.tokenizer.token_to_id(BUNK) if self.tokenizer.token_to_id(BUNK) is not None \
            else self.tokenizer.token_to_id(UNK)
        sw_size = self.tokenizer.get_vocab_size()
        # tw_size = self.tokenizer.get_vocab_size()
        self.collate_fn = BPE.collate_fn(self.pad_id, True)

    # Hyper-parameters at word-level source language
    # [size, dim, pre_embs, drop_rate, zero_padding, requires_grad] = HPs
    nlemb_HPs = [sw_size, self.args.swd_dim, self.args.swd_pretrained,
                 self.args.wd_dropout, self.args.wd_padding, self.args.snl_reqgrad]  # NL inputs
    # Encoder
    # [nn_mode, nn_inp_dim, nn_out_dim, nn_layers, nn_bidirect, nn_dropout] = HPs
    if self.args.enc_cnn:
        enc_HPs = ["cnn", self.args.swd_dim, self.args.ed_outdim,
                   self.args.ed_layers, self.args.ed_bidirect, self.args.kernel_size]
    else:
        enc_HPs = [self.args.ed_mode, self.args.swd_dim, self.args.ed_outdim,
                   self.args.ed_layers, self.args.ed_bidirect, self.args.ed_dropout]

    # Decoder
    # [size, dim, pre_embs, drop_rate, zero_padding, requires_grad] = HPs
    temb_HPs = [self.num_labels, self.args.twd_dim, self.args.twd_pretrained,
                self.args.wd_dropout, self.args.wd_padding, self.args.twd_reqgrad]
    # Hyper-parameters at word-level target language
    dec_HPs = [self.args.ed_mode, self.args.twd_dim, self.args.ed_outdim,
               self.args.ed_layers, self.args.ed_bidirect, self.args.ed_dropout]
    dec_HPs = [temb_HPs, dec_HPs]

    print("INFO: - Build model...")
    # self.seq2seq = Seq2seq(semb_HPs, sch_HPs, enc_HPs, dec_HPs, drop_rate=self.args.final_dropout,
    #                        num_labels=self.num_labels, enc_att=self.args.enc_att).to(self.device)
    self.seq2seq = Seq2seq(nlemb_HPs, enc_HPs, dec_HPs, drop_rate=self.args.final_dropout,
                           num_labels=self.num_labels, enc_att=self.args.enc_att)
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        # dim = 0: [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
        self.seq2seq = nn.DataParallel(self.seq2seq)
    self.seq2seq.to(self.device)

    self.seq2seq_optimizer = None
    if self.args.optimizer.lower() == "adamax":
        self.init_optimizers(optim.Adamax)
    elif self.args.optimizer.lower() == "adam":
        self.init_optimizers(optim.Adam)
    elif self.args.optimizer.lower() == "radam":
        self.init_optimizers(RAdam)
    elif self.args.optimizer.lower() == "adadelta":
        self.init_optimizers(optim.Adadelta)
    elif self.args.optimizer.lower() == "adagrad":
        self.init_optimizers(optim.Adagrad)
    else:
        self.init_optimizers(optim.SGD)
def __init__(self, args=None):
    print("INFO: - Load the pre-built tokenizer...")
    if args.tokenize_type != "bpe":
        tokenizer = Tokenizer.load(os.path.join(args.model_dir, "tokenizer.vocab"))
    else:
        tokenizer = BPE.load(args.vocab_file)
        tokenizer.add_tokens(sys_tokens)
        labels_list = TXT.read(args.label_file, firstline=False)
        tokenizer.tw2i = Tokenizer.list2dict(sys_tokens + labels_list)
        tokenizer.i2tw = Tokenizer.reversed_dict(tokenizer.tw2i)
    self.args = args
    self.tokenizer = tokenizer
    self.device = torch.device("cuda:0" if self.args.use_cuda else "cpu")
    self.num_labels = len(self.tokenizer.tw2i)

    # Hyper-parameters at target language
    self.target2idx = Tokenizer.lst2idx(tokenizer=Tokenizer.process_target,
                                        vocab_words=self.tokenizer.tw2i, unk_words=True,
                                        sos=self.args.ssos, eos=self.args.seos)
    if self.args.tokenize_type != "bpe":
        # Hyper-parameters at source language
        self.source2idx = Tokenizer.lst2idx(tokenizer=Tokenizer.process_nl,
                                            vocab_words=self.tokenizer.sw2i, unk_words=True,
                                            sos=self.args.ssos, eos=self.args.seos)
        self.pad_id = self.tokenizer.sw2i.get(PAD, PAD_id)
        self.unk_id = self.tokenizer.sw2i.get(UNK, UNK_id)
        sw_size = len(self.tokenizer.sw2i)
        # tw_size = len(self.tokenizer.tw2i)
        self.collate_fn = Tokenizer.collate_fn(self.pad_id, True)
    else:
        self.source2idx = BPE.tokens2ids(self.tokenizer, sos=self.args.ssos, eos=self.args.seos)
        self.pad_id = self.tokenizer.token_to_id(BPAD) if self.tokenizer.token_to_id(BPAD) is not None \
            else self.tokenizer.token_to_id(PAD)
        self.unk_id = self.tokenizer.token_to_id(BUNK) if self.tokenizer.token_to_id(BUNK) is not None \
            else self.tokenizer.token_to_id(UNK)
        sw_size = self.tokenizer.get_vocab_size()
        # tw_size = self.tokenizer.get_vocab_size()
        self.collate_fn = BPE.collate_fn(self.pad_id, True)

    # Hyper-parameters at word-level source language
    # [size, dim, pre_embs, drop_rate, zero_padding, requires_grad] = HPs
    nlemb_HPs = [sw_size, self.args.swd_dim, self.args.swd_pretrained,
                 self.args.wd_dropout, self.args.wd_padding, self.args.snl_reqgrad]
    # Encoder
    # [nn_mode, nn_inp_dim, nn_out_dim, nn_layers, nn_bidirect, nn_dropout] = HPs
    if self.args.enc_cnn:
        enc_HPs = ["cnn", self.args.swd_dim, self.args.ed_outdim,
                   self.args.ed_layers, self.args.ed_bidirect, self.args.kernel_size]
    else:
        if self.args.ed_mode == "self_attention":
            # use a maximum length 5 times larger than the input length
            nlemb_HPs += [self.tokenizer.swl * 5]
            # [nn_mode, ninp, nhid, nlayers, nhead, dropout, activation, norm, his_mask] = HPs
            enc_HPs = [self.args.ed_mode, self.args.swd_dim, self.args.ed_outdim,
                       self.args.ed_layers, self.args.ed_heads, self.args.ed_dropout,
                       self.args.ed_activation, None, self.args.ed_hismask]
        else:
            enc_HPs = [self.args.ed_mode, self.args.swd_dim, self.args.ed_outdim,
                       self.args.ed_layers, self.args.ed_bidirect, self.args.ed_dropout]

    crf_HPs = [self.args.use_crf, self.num_labels, self.args.se_transitions]

    print("INFO: - Build model...")
    self.labeler = Labeler(nlemb_HPs, enc_HPs, crf_HPs,
                           drop_rate=self.args.final_dropout, num_labels=self.num_labels)
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        self.labeler = nn.DataParallel(self.labeler)
    self.labeler.to(self.device)

    self.labeler_optimizer = None
    if self.args.optimizer.lower() == "adamax":
        self.init_optimizers(optim.Adamax)
    elif self.args.optimizer.lower() == "adam":
        self.init_optimizers(optim.Adam)
    elif self.args.optimizer.lower() == "radam":
        self.init_optimizers(RAdam)
    elif self.args.optimizer.lower() == "adadelta":
        self.init_optimizers(optim.Adadelta)
    elif self.args.optimizer.lower() == "adagrad":
        self.init_optimizers(optim.Adagrad)
    else:
        self.init_optimizers(optim.SGD)
help="The input training data file (a text file).") parser.add_argument( "--vocab_file", type=str, default="/media/data/review_response/tokens/bert_level-bpe-vocab.txt", help="Saved vocab file") args = parser.parse_args() # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) tokenizer = BPE.load(args.vocab_file) source2idx = tokens2ids(tokenizer) # data = CSV.read(args.train_data_file, firstline=True, slices=[0, 1]) # train_dataset = MapDataset(data, source2idx=source2idx, target2idx=source2idx) # # # train_sampler = RandomSampler(train_dataset) # train_sampler = SequentialSampler(train_dataset) # train_dataloader = DataLoader(train_dataset, sampler=train_sampler, pin_memory=True, # batch_size=16, collate_fn=collate_fn) # # for i, batch in enumerate(train_dataloader): # inputs, outputs = batch[0], batch[1] # break iterdata = CSV.get_iterator(args.train_data_file, firstline=True)
    # Load datasets to build vocabulary
    data = Tokenizer.load_file([filename], task=2)
    s_paras = [-1, 1]
    t_paras = [-1, 1]
    vocab = Tokenizer(s_paras, t_paras)
    vocab.build(data)
    nl2ids = Tokenizer.lst2idx(tokenizer=vocab.process_nl, vocab_words=vocab.sw2i,
                               unk_words=True, eos=True)
    tg2ids = Tokenizer.lst2idx(tokenizer=vocab.process_target, vocab_words=vocab.tw2i,
                               unk_words=False, sos=True, eos=True)
    pad_id = vocab.sw2i.get(PAD, 0)
    sw_size = len(vocab.sw2i)
    tw_size = len(vocab.tw2i)
else:
    vocab_file = "/media/data/review_response/tokens/bert_level-bpe-vocab.txt"
    vocab = BPE.load(vocab_file)
    vocab.add_tokens([SOT, EOT, NULL])
    nl2ids = BPE.tokens2ids(vocab)
    tg2ids = BPE.tokens2ids(vocab)
    pad_id = vocab.token_to_id(BPAD) if vocab.token_to_id(BPAD) else 0
    sw_size = vocab.get_vocab_size()
    tw_size = vocab.get_vocab_size()
collate_fn = BPE.collate_fn(pad_id, True)

# load datasets to map into indexes
train_data = JSON.get_iterator(filename)
num_lines = JSON._len(filename)
# train_data = CSV.get_iterator(filename, firstline=True)
# num_lines = CSV._len(filename)
train_iterdataset = IterDataset(train_data, source2idx=nl2ids, target2idx=tg2ids, num_lines=num_lines)
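# A minimal sketch (assumed, not in the original file) of draining the iterable
# dataset through a PyTorch DataLoader, mirroring the commented-out MapDataset
# example elsewhere in this repo; batch_size=16 is an arbitrary illustrative value.
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_iterdataset, pin_memory=True,
                              batch_size=16, collate_fn=collate_fn)
for i, batch in enumerate(train_dataloader):
    nl_tensor, tg_tensor = batch[0], batch[1]
    break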
argparser.add_argument('--out_file', help='output label file',
                       default="/media/data/disambiguator/corpus/bluelink_labels.txt", type=str)
argparser.add_argument('--vocab_file', help='pre-trained bpe vocab file',
                       default="/media/data/review_response/tokens/bert_level-bpe-vocab.txt", type=str)
argparser.add_argument('--tokenizer', help='tokenizer method', choices=["split", "antlr", "bpe"],
                       default="split", type=str)
args = argparser.parse_args()

if args.tokenizer == "split":
    tokenizer = Tokenizer.process_target
elif args.tokenizer == "antlr":
    tokenizer = Utilities.fast_tokenize
elif args.tokenizer == "bpe":
    pretrained_tokenizer = BPE.load(args.vocab_file)
    tokenizer = lambda x: pretrained_tokenizer.encode(x).tokens
else:
    raise Exception("Not implemented yet")

extract_label(inp_file=args.inp_file, out_file=args.out_file, tokenizer=tokenizer)

# filename = "/media/data/np6/dataset/generated_corpus_stored_train_human.csv"
# label_file = "/media/data/np6/dataset/labels2.txt"
# vocab_file = "/media/data/review_response/tokens/bert_level-bpe-vocab.txt"
# data = read_data(filename)
# twcnt, twl = Counter(), 0
# for line in data:
#     nl, target = line
#     # Tokenize target into tokens
#     target = Tokenizer.process_target(target)