# Yelp Review Full classification: build label/id maps and token-to-id converters
use_cuda = False
filename = "/media/data/classification/datasets/yelp_review_full_csv/train.csv"
label_file = "/media/data/classification/datasets/yelp_review_full_csv/labels.txt"

# Read the label set and build label <-> id lookup tables
labels_list = TXT.read(label_file, firstline=False)
lb2id_dict = Tokenizer.list2dict(sys_tokens + labels_list)
id2lb_dict = Tokenizer.reversed_dict(lb2id_dict)
lb2ids = Tokenizer.lst2idx(tokenizer=Tokenizer.process_target, vocab_words=lb2id_dict,
                           unk_words=False, sos=False, eos=False)

tokenize_type = "bpe"
if tokenize_type != "bpe":
    # Load datasets to build vocabulary
    data = Tokenizer.load_file([filename], task=1)
    s_paras = [-1, 1]
    t_paras = [-1, 1]
    tokenizer = Tokenizer(s_paras, t_paras)
    tokenizer.build(data)
    nl2ids = Tokenizer.lst2idx(tokenizer=Tokenizer.process_nl, vocab_words=tokenizer.sw2i,
                               unk_words=True, sos=False, eos=False)
    # Re-use the label vocabulary as the target-side vocabulary
    tokenizer.tw2i = lb2id_dict
    tokenizer.i2tw = id2lb_dict
    tg2ids = Tokenizer.lst2idx(tokenizer=Tokenizer.process_target, vocab_words=tokenizer.tw2i,
                               unk_words=False, sos=False, eos=False)
    pad_id = tokenizer.sw2i.get(PAD, 0)
    sw_size = len(tokenizer.sw2i)
    tw_size = len(tokenizer.tw2i)
    collate_fn = Tokenizer.collate_fn(pad_id, True)
else:
    # Use a pre-trained BPE vocabulary instead of building one from the corpus
    vocab_file = "/media/data/review_response/tokens/bert_level-bpe-vocab.txt"
    tokenizer = BPE.load(vocab_file)
    tokenizer.add_tokens(sys_tokens)
    nl2ids = BPE.tokens2ids(tokenizer, sos=False, eos=False, add_special_tokens=False)
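# Usage sketch: assuming the callables returned by Tokenizer.lst2idx / BPE.tokens2ids map a
# raw string (or pre-split token list) to a list of integer ids; the sample review and label
# below are hypothetical and only illustrate how the converters built above could be applied.
sample_review = "The food was great but the service was slow ."
sample_label = "3"  # yelp_review_full uses the star rating as the class label
review_ids = nl2ids(sample_review)   # source-side ids for the encoder
label_ids = lb2ids(sample_label)     # target-side id(s) for the classifier head
print(len(review_ids), label_ids)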
def build_data(args):
    """Build vocabularies, pre-trained embeddings and the run directory, or reuse them
    from a previous run when transfer learning (args.tl) is enabled."""
    if not args.tl:
        # Fresh training run: create the model directory (optionally timestamped)
        if not os.path.exists(args.model_dir):
            os.mkdir(args.model_dir)
        if args.timestamped_subdir:
            sub_folder = datetime.now().isoformat(sep='-', timespec='minutes').replace(":", "-").replace("-", "_")
        else:
            sub_folder = ''
        if not os.path.exists(os.path.join(args.model_dir, sub_folder)):
            os.mkdir(os.path.join(args.model_dir, sub_folder))
        args.model_dir = os.path.join(args.model_dir, sub_folder)
        args.log_file = os.path.join(args.model_dir, args.log_file)

        if args.tokenize_type != "bpe":
            s_paras = [args.wl_th, args.wcutoff]
            t_paras = [args.wl_th, args.wcutoff]
            print("INFO: - Build vocabulary...")
            tokenizer = Tokenizer(s_paras, t_paras)
            files = [args.train_file]
            if args.train_file != args.dev_file:
                files.append(args.dev_file)
            # Load datasets to build vocabulary
            data = Tokenizer.load_file(files, task=2)
            tokenizer.build(datasets=data)
            sw2i = tokenizer.sw2i
            tw2i = tokenizer.tw2i
            print("INFO: - Save vocabulary...")
            Tokenizer.save(tokenizer, os.path.join(args.model_dir, "tokenizer.vocab"))
        else:
            print("INFO: - Load vocabulary...")
            tokenizer = BPE.load(args.vocab_file)
            tokenizer.add_tokens(sys_tokens)
            sw2i = tokenizer.get_vocab()
            tw2i = tokenizer.get_vocab()
        # args.tokenizer = tokenizer

        # Source-language pre-trained embeddings
        args.swd_pretrained = None
        args.twd_pretrained = None
        if len(args.swd_embfile) != 0:
            scale = np.sqrt(3.0 / args.swd_dim)
            emb_reader = Embeddings(args.swd_embfile)
            args.swd_pretrained = emb_reader.get_W(args.swd_dim, sw2i, scale)
            if args.twd_embfile == args.swd_embfile:
                scale = np.sqrt(3.0 / args.twd_dim)
                args.twd_pretrained = emb_reader.get_W(args.twd_dim, tw2i, scale)

        # Target-language pre-trained embeddings
        if len(args.twd_embfile) != 0:
            scale = np.sqrt(3.0 / args.twd_dim)
            if args.twd_pretrained is None:
                emb_reader = Embeddings(args.twd_embfile)
                args.twd_pretrained = emb_reader.get_W(args.twd_dim, tw2i, scale)

        # Directly integrate transfer learning if no new words need updating
        SaveloadHP.save(args, os.path.join(args.model_dir, args.model_args))
        return args
    else:
        print("INFO: - Use transfer learning technique")
        assert os.path.exists(args.tlargs), "\t - There is no pre-trained argument file"
        # Load the pre-trained argument file from a previous training folder
        margs = SaveloadHP.load(args.tlargs)
        # margs.tl = args.tl
        # margs.log_file = args.log_file
        # TODO: update the new vocabulary and all other new arguments used for the new training run
        #   0. Read vocab
        #   1. Update schema
        #   2. Update vocab
        # args.tokenizer = margs.tokenizer
        #   3. Reuse the model file directory of the previous training run
        args.model_dir = margs.model_dir
        args.seq2seq_file = margs.seq2seq_file
        #   4. Keep the remaining current arguments
        # add a constraint at load time: if any model fails to load, just skip it
        args.swd_pretrained = margs.swd_pretrained
        args.twd_pretrained = margs.twd_pretrained
        return args
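# Minimal sketch of calling build_data. The field values are placeholders; in practice the
# argument set comes from the training script's argparse parser, and only the fields that
# build_data actually reads are shown here.
from argparse import Namespace

args = Namespace(
    tl=False,                        # fresh training, no transfer learning
    model_dir="./runs",
    timestamped_subdir=True,
    log_file="train.log",
    model_args="model_args.pkl",
    tokenize_type="bpe",
    vocab_file="/media/data/review_response/tokens/bert_level-bpe-vocab.txt",
    train_file="Train.json",
    dev_file="Dev.json",
    wl_th=-1, wcutoff=1,
    swd_embfile="", twd_embfile="",  # empty -> skip pre-trained embeddings
    swd_dim=300, twd_dim=300,
    tlargs="",
)
args = build_data(args)
print(args.model_dir)  # now points at the (possibly timestamped) run folder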
import torch
from mlmodels.utils.trad_tokenizer import Tokenizer
from mlmodels.utils.jsonIO import JSON

Data2tensor.set_randseed(12345)
device = torch.device("cpu")
dtype = torch.long
use_cuda = False
filename = "/media/data/review_response/Dev.json"

tokenize_type = "bpe"
if tokenize_type != "bpe":
    # Load datasets to build vocabulary
    data = Tokenizer.load_file([filename], task=2)
    s_paras = [-1, 1]
    t_paras = [-1, 1]
    vocab = Tokenizer(s_paras, t_paras)
    vocab.build(data)
    nl2ids = Tokenizer.lst2idx(tokenizer=vocab.process_nl, vocab_words=vocab.sw2i,
                               unk_words=True, eos=True)
    tg2ids = Tokenizer.lst2idx(tokenizer=vocab.process_target, vocab_words=vocab.tw2i,
                               unk_words=False, sos=True, eos=True)
    pad_id = vocab.sw2i.get(PAD, 0)
    sw_size = len(vocab.sw2i)
    tw_size = len(vocab.tw2i)
else:
    # Use a pre-trained byte-level BPE vocabulary extended with the system tokens
    vocab_file = "/media/data/review_response/tokens/bert_level-bpe-vocab.txt"
    vocab = BPE.load(vocab_file)
    vocab.add_tokens([SOT, EOT, NULL])
    nl2ids = BPE.tokens2ids(vocab)
    tg2ids = BPE.tokens2ids(vocab)
    pad_id = vocab.token_to_id(BPAD) if vocab.token_to_id(BPAD) is not None else 0
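# Sketch: turning one review/response pair into tensors with the converters built above.
# It assumes nl2ids/tg2ids return flat lists of integer ids for a single input; the example
# texts are hypothetical.
review = "Great stay , friendly staff !"
response = "Thank you for your kind words , we hope to see you again ."
src_ids = nl2ids(review)
tgt_ids = tg2ids(response)
src_tensor = torch.tensor(src_ids, dtype=dtype, device=device)   # encoder input
tgt_tensor = torch.tensor(tgt_ids, dtype=dtype, device=device)   # decoder target
print(src_tensor.shape, tgt_tensor.shape)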