def save_label(train_files, label_file, task=2, firstline=True):
    """Collect the distinct labels of the training files and write them out.

    Every example's label string is split on whitespace and each resulting
    token is treated as one label.

    Args:
        train_files: forwarded unchanged to ``Tokenizer.load_file``.
        label_file: destination forwarded to ``TXT.write``.
        task: forwarded to ``Tokenizer.load_file``.
        firstline: forwarded to ``Tokenizer.load_file``.
    """
    datasets = Tokenizer.load_file(train_files, firstline=firstline, task=task)
    label_set = set()
    for dataset in datasets:
        for _, label in dataset:
            label_set.update(label.split())
    # A set of strings iterates in a hash-dependent order that can differ
    # between interpreter runs; sorting keeps the label file (and any
    # label -> id mapping derived from its line order) reproducible.
    TXT.write(sorted(label_set), label_file)
def extract_label(inp_file, out_file, tokenizer=Tokenizer.process_target):
    """Write the distinct target tokens found in ``inp_file`` to ``out_file``.

    Each record read by ``read_data`` must unpack into a (source, target)
    pair. Both fields are lower-cased, the target is tokenized with
    ``tokenizer`` and the tokens are accumulated through
    ``Tokenizer.update_sent``. The keys of the resulting counter are written
    with ``TXT.write``.

    Args:
        inp_file: path handed to ``read_data``.
        out_file: destination handed to ``TXT.write``.
        tokenizer: callable applied to the lower-cased target text.
    """
    counts = Counter()
    length_stat = 0
    for record in read_data(inp_file):
        source_text, target_text = record
        # The source text is not used further; it is still lower-cased so a
        # non-string source fails here exactly as it always has.
        source_text, target_text = source_text.lower(), target_text.lower()
        target_tokens = tokenizer(target_text)
        counts, length_stat = Tokenizer.update_sent(
            target_tokens, counts, length_stat)
    TXT.write(list(counts.keys()), out_file)
def read_data(filename, firstline=True):
    """Read a dataset file, choosing the reader from the file extension.

    Args:
        filename: path of the file to read. The text after the last ``.``
            selects the reader and is matched case-insensitively:
            ``csv`` -> ``CSV.read`` (columns 0 and 1), ``txt`` -> ``TXT.read``,
            ``json`` -> ``JSON.load``.
        firstline: forwarded to the CSV and TXT readers; not used for JSON.

    Returns:
        Whatever the selected reader returns.

    Raises:
        NotImplementedError: if the extension is not one of csv/txt/json.
    """
    # Compute the extension once; lower-casing lets "DATA.CSV" be accepted.
    extension = filename.rsplit(".", 1)[-1].lower()
    if extension == "csv":
        return CSV.read(filename, firstline=firstline, slices=[0, 1])
    if extension == "txt":
        return TXT.read(filename, firstline=firstline)
    if extension == "json":
        return JSON.load(filename)
    # NotImplementedError is an Exception subclass, so existing
    # ``except Exception`` handlers keep working.
    raise NotImplementedError(
        "Not implemented yet: unsupported file type for {!r} "
        "(expected .csv, .txt or .json)".format(filename))
if __name__ == '__main__':
    # Ad-hoc driver: reads a label list, builds label <-> id lookup tables and,
    # when tokenize_type is not "bpe", builds a Tokenizer vocabulary from the
    # training CSV. `Tokenizer` and `sys_tokens` are not imported in this
    # block; they must already be defined at module level.
    import torch
    from mlmodels.utils.idx2tensor import Data2tensor, seqPAD
    from mlmodels.utils.dataset import IterDataset, collate_fn, tokens2ids
    from torch.utils.data import DataLoader, Dataset, IterableDataset, RandomSampler, SequentialSampler, TensorDataset
    from mlmodels.utils.BPEtonkenizer import BPE
    from mlmodels.utils.special_tokens import BPAD, PAD, NULL
    from mlmodels.utils.txtIO import TXT

    # Presumably seeds the random number generators for reproducibility --
    # confirm against Data2tensor.set_randseed.
    Data2tensor.set_randseed(12345)
    # device, dtype and use_cuda are assigned here but not read in the
    # visible part of this block.
    device = torch.device("cpu")
    dtype = torch.long
    use_cuda = False
    filename = "../../data/reviews/processed_csv/train_res4.csv"
    label_file = "../../data/reviews/processed_csv/labels.txt"
    # Label vocabulary: system tokens first, then the entries of label_file.
    labels_list = TXT.read(label_file, firstline=False)
    lb2id_dict = Tokenizer.list2dict(sys_tokens + labels_list)
    id2lb_dict = Tokenizer.reversed_dict(lb2id_dict)
    # Converter built over the label vocabulary with unk/sos/eos all disabled.
    lb2ids = Tokenizer.lst2idx(tokenizer=Tokenizer.process_target, vocab_words=lb2id_dict, unk_words=False, sos=False, eos=False)
    # Hard-coded to "bpe", so the vocabulary-building branch below is skipped
    # unless this value is edited.
    tokenize_type = "bpe"
    if tokenize_type != "bpe":
        # Load datasets to build vocabulary
        data = Tokenizer.load_file([filename], task=2)
        # Constructor parameters for the source and target sides; their
        # meaning is defined by Tokenizer and is not visible here.
        s_paras = [-1, 1]
        t_paras = [-1, 1]
        tokenizer = Tokenizer(s_paras, t_paras)
        tokenizer.build(data)
# -*- coding: utf-8 -*-
"""
Created on 2020-07-13
@author duytinvo
"""
from mlmodels.utils.txtIO import TXT
from mlmodels.utils.csvIO import CSV

# Convert the tab-separated paraphrase file into a CSV file.
filename = "/media/data/paraphrase/paralex-evaluation/data/train/paraphrases.txt"
data = TXT.read(filename, False)

# Split every line on tabs and drop its last field.
newdata = [line.split("\t")[:-1] for line in data]

CSV.write(newdata, "/media/data/paraphrase/paralex.csv")
# -*- coding: utf-8 -*-
"""
Created on 25/03/2020
@author duytinvo
"""
from mlmodels.utils.csvIO import CSV
from mlmodels.utils.txtIO import TXT

# Gather columns 0 and 1 of every CSV file into one text file.
csvfiles = [
    "/media/data/review_response/Train.csv",
    "/media/data/review_response/Dev.csv",
]
txtfile = "/media/data/review_response/raw_vocab.txt"

data = []
for csvfile in csvfiles:
    # Per file: all of column 0 first, then all of column 1.
    for column in (0, 1):
        data.extend(CSV.read(csvfile, True, [column]))

TXT.write(data, txtfile)
def __init__(self, args=None):
    """Load the tokenizer, build the labeler network and create its optimizer.

    Args:
        args: configuration object. Attributes read here: ``tokenize_type``,
            ``model_dir`` or ``vocab_file``/``label_file``, ``use_cuda``,
            ``ssos``, ``seos``, the ``swd_*``/``wd_*``/``snl_reqgrad``
            embedding settings, the ``enc_cnn``/``ed_*``/``kernel_size``
            encoder settings, ``use_crf``, ``se_transitions``,
            ``final_dropout`` and ``optimizer``.

    Raises:
        ValueError: if ``args`` is None (the default is only a placeholder;
            a configuration is required).
    """
    if args is None:
        raise ValueError("args must be provided to build the model")

    print("INFO: - Load the pre-built tokenizer...")
    if args.tokenize_type != "bpe":
        tokenizer = Tokenizer.load(
            os.path.join(args.model_dir, "tokenizer.vocab"))
    else:
        tokenizer = BPE.load(args.vocab_file)
        tokenizer.add_tokens(sys_tokens)
        # NOTE(review): block structure was reconstructed from a
        # whitespace-collapsed source; the label vocabulary loading below is
        # assumed to belong to the BPE branch only -- confirm.
        labels_list = TXT.read(args.label_file, firstline=False)
        tokenizer.tw2i = Tokenizer.list2dict(sys_tokens + labels_list)
        tokenizer.i2tw = Tokenizer.reversed_dict(tokenizer.tw2i)

    self.args = args
    self.tokenizer = tokenizer
    self.device = torch.device("cuda:0" if self.args.use_cuda else "cpu")
    self.num_labels = len(self.tokenizer.tw2i)

    # Hyper-parameters at target language
    self.target2idx = Tokenizer.lst2idx(tokenizer=Tokenizer.process_target,
                                        vocab_words=self.tokenizer.tw2i,
                                        unk_words=True,
                                        sos=self.args.ssos,
                                        eos=self.args.seos)

    if self.args.tokenize_type != "bpe":
        # Hyper-parameters at source language
        self.source2idx = Tokenizer.lst2idx(
            tokenizer=Tokenizer.process_nl,
            vocab_words=self.tokenizer.sw2i,
            unk_words=True,
            sos=self.args.ssos,
            eos=self.args.seos)
        self.pad_id = self.tokenizer.sw2i.get(PAD, PAD_id)
        self.unk_id = self.tokenizer.sw2i.get(UNK, UNK_id)
        sw_size = len(self.tokenizer.sw2i)
        self.collate_fn = Tokenizer.collate_fn(self.pad_id, True)
    else:
        self.source2idx = BPE.tokens2ids(self.tokenizer,
                                         sos=self.args.ssos,
                                         eos=self.args.seos)
        # Prefer the BPE-specific pad/unk tokens; fall back to the generic
        # ones when the vocabulary does not contain them.
        bpad_id = self.tokenizer.token_to_id(BPAD)
        self.pad_id = bpad_id if bpad_id is not None \
            else self.tokenizer.token_to_id(PAD)
        bunk_id = self.tokenizer.token_to_id(BUNK)
        self.unk_id = bunk_id if bunk_id is not None \
            else self.tokenizer.token_to_id(UNK)
        sw_size = self.tokenizer.get_vocab_size()
        self.collate_fn = BPE.collate_fn(self.pad_id, True)

    # Hyper-parameters at word-level source language
    # [size, dim, pre_embs, drop_rate, zero_padding, requires_grad] = HPs
    nlemb_HPs = [
        sw_size,
        self.args.swd_dim,
        self.args.swd_pretrained,
        self.args.wd_dropout,
        self.args.wd_padding,
        self.args.snl_reqgrad,
    ]

    # Encoder
    # [nn_mode, nn_inp_dim, nn_out_dim, nn_layers, nn_bidirect, nn_dropout] = HPs
    if self.args.enc_cnn:
        enc_HPs = [
            "cnn",
            self.args.swd_dim,
            self.args.ed_outdim,
            self.args.ed_layers,
            self.args.ed_bidirect,
            self.args.kernel_size,
        ]
    elif self.args.ed_mode == "self_attention":
        # use the maximum length 5 times larger than input length
        nlemb_HPs += [self.tokenizer.swl * 5]
        # nn_mode, ninp, nhid, nlayers, nhead, dropout, activation, norm, his_mask
        enc_HPs = [
            self.args.ed_mode,
            self.args.swd_dim,
            self.args.ed_outdim,
            self.args.ed_layers,
            self.args.ed_heads,
            self.args.ed_dropout,
            self.args.ed_activation,
            None,
            self.args.ed_hismask,
        ]
    else:
        enc_HPs = [
            self.args.ed_mode,
            self.args.swd_dim,
            self.args.ed_outdim,
            self.args.ed_layers,
            self.args.ed_bidirect,
            self.args.ed_dropout,
        ]

    crf_HPs = [
        self.args.use_crf,
        self.num_labels,
        self.args.se_transitions,
    ]

    print("INFO: - Build model...")
    self.labeler = Labeler(nlemb_HPs,
                           enc_HPs,
                           crf_HPs,
                           drop_rate=self.args.final_dropout,
                           num_labels=self.num_labels)

    # Only parallelise across GPUs when CUDA was actually requested.
    # DataParallel expects the module's parameters on its first GPU, so
    # wrapping a model that is then kept on the CPU fails at forward time.
    if self.args.use_cuda and torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        self.labeler = nn.DataParallel(self.labeler)
    self.labeler.to(self.device)

    self.labeler_optimizer = None
    # Any unrecognised optimizer name falls back to SGD.
    optimizer_name = self.args.optimizer.lower()
    if optimizer_name == "adamax":
        self.init_optimizers(optim.Adamax)
    elif optimizer_name == "adam":
        self.init_optimizers(optim.Adam)
    elif optimizer_name == "radam":
        self.init_optimizers(RAdam)
    elif optimizer_name == "adadelta":
        self.init_optimizers(optim.Adadelta)
    elif optimizer_name == "adagrad":
        self.init_optimizers(optim.Adagrad)
    else:
        self.init_optimizers(optim.SGD)