def __call__(self, args):
    super(Predict, self).__call__(args)

    print("Load the dataset")
    corpus = Corpus.load(args.fdata, self.fields)
    dataset = TextDataset(corpus, self.fields[:-1], args.buckets)
    # set the data loader
    dataset.loader = batchify(dataset, args.batch_size)
    print(f"{len(dataset)} sentences, "
          f"{len(dataset.loader)} batches")

    print("Load the model")
    self.model = Model.load(args.model)
    print(f"{self.model}\n")

    print("Make predictions on the dataset")
    start = datetime.now()
    pred_labels = self.predict(dataset.loader)
    total_time = datetime.now() - start
    # restore the order of sentences in the buckets
    indices = torch.tensor([i for bucket in dataset.buckets.values()
                            for i in bucket]).argsort()
    corpus.labels = [pred_labels[i] for i in indices]

    print(f"Save the predicted result to {args.fpred}")
    corpus.save(args.fpred)
    print(f"{total_time}s elapsed, "
          f"{len(dataset) / total_time.total_seconds():.2f} Sents/s")
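# A minimal, self-contained illustration (not part of the original code) of the
# bucket-order restoration trick used in the predict commands above: sentences are
# grouped into length buckets, so predictions come back in bucket order; argsort
# over the flattened bucket indices maps them back to corpus order. The bucket
# contents below are hypothetical.
import torch

buckets = {0: [2, 0], 1: [3, 1]}          # bucket id -> original sentence indices
preds = ['s2', 's0', 's3', 's1']          # predictions, in bucket (dataset) order
indices = torch.tensor([i for bucket in buckets.values()
                        for i in bucket]).argsort()
restored = [preds[i] for i in indices]    # ['s0', 's1', 's2', 's3']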
def __call__(self, args):
    logger.info("Load the model")
    self.model = Model.load(args.model)
    # override from CLI args
    args = self.model.args.update(vars(args))
    super().__call__(args)

    logger.info("Load the dataset")
    if args.prob:
        self.fields = self.fields._replace(PHEAD=Field('probs'))
    if args.text:
        corpus = TextCorpus.load(args.fdata, self.fields, args.text,
                                 args.tokenizer_dir,
                                 use_gpu=args.device != 1)
    else:
        corpus = Corpus.load(args.fdata, self.fields)
    dataset = TextDataset(corpus, [self.WORD, self.FEAT], args.buckets)
    # set the data loader
    dataset.loader = batchify(dataset, args.batch_size)
    logger.info(f"{len(dataset)} sentences, "
                f"{len(dataset.loader)} batches")

    logger.info("Make predictions on the dataset")
    start = datetime.now()
    pred_arcs, pred_rels, pred_probs = self.predict(dataset.loader)
    total_time = datetime.now() - start
    # restore the order of sentences in the buckets
    indices = torch.tensor([i for bucket in dataset.buckets.values()
                            for i in bucket]).argsort()
    corpus.arcs = [pred_arcs[i] for i in indices]
    corpus.rels = [pred_rels[i] for i in indices]
    if args.prob:
        corpus.probs = [pred_probs[i] for i in indices]

    logger.info(f"Save the predicted result to {args.fpred}")
    corpus.save(args.fpred)
    logger.info(f"{total_time}s elapsed, "
                f"{len(dataset) / total_time.total_seconds():.2f} Sents/s")
def __call__(self, args):
    super(Predict, self).__call__(args)

    print("Load the dataset")
    corpus = Corpus.load(args.fdata, self.fields)
    dataset = TextDataset(corpus, [self.WORD, self.FEAT])
    # set the data loader
    dataset.loader = batchify(dataset, args.batch_size)
    print(f"{len(dataset)} sentences, "
          f"{len(dataset.loader)} batches")

    print("Load the model")
    self.model = Model.load(args.model)
    print(f"{self.model}\n")

    print("Make predictions on the dataset")
    start = datetime.now()
    corpus.heads, corpus.rels = self.predict(dataset.loader)
    print(f"Save the predicted result to {args.fpred}")
    corpus.save(args.fpred)
    total_time = datetime.now() - start
    print(f"{total_time}s elapsed, "
          f"{len(dataset) / total_time.total_seconds():.2f} Sents/s")
def __call__(self, args):
    super(Evaluate, self).__call__(args)

    print("Load the dataset")
    corpus = Corpus.load(args.fdata, self.fields)
    dataset = TextDataset(corpus, self.fields, args.buckets)
    # set the data loader
    dataset.loader = batchify(dataset, args.batch_size)
    print(f"{len(dataset)} sentences, "
          f"{len(dataset.loader)} batches, "
          f"{len(dataset.buckets)} buckets")

    print("Load the model")
    self.model = Model.load(args.model)
    print(f"{self.model}\n")

    print("Evaluate the dataset")
    start = datetime.now()
    loss, metric = self.evaluate(dataset.loader)
    total_time = datetime.now() - start

    print(f"Loss: {loss:.4f} {metric}")
    print(f"{total_time}s elapsed, "
          f"{len(dataset) / total_time.total_seconds():.2f} Sents/s")
def __call__(self, args):
    super(Train, self).__call__(args)

    # reserve (most of) the free memory on the visible GPU by allocating and
    # immediately freeing a large tensor; FloatTensor(256, 1024, block_mem)
    # holds roughly block_mem MiB of float32 values
    rrr = os.popen(
        '"/usr/bin/nvidia-smi" --query-gpu=memory.total,memory.used --format=csv,nounits,noheader'
    )
    devices_info = rrr.read().strip().split("\n")
    total, used = devices_info[int(
        os.environ["CUDA_VISIBLE_DEVICES"])].split(',')
    total = int(total)
    used = int(used)
    max_mem = int(total * random.uniform(0.95, 0.97))
    block_mem = max_mem - used
    x = torch.cuda.FloatTensor(256, 1024, block_mem)
    del x
    rrr.close()

    logging.basicConfig(filename=args.output,
                        filemode='w',
                        format='%(asctime)s %(levelname)-8s %(message)s',
                        level=logging.INFO,
                        datefmt='%Y-%m-%d %H:%M:%S')

    train_corpus = Corpus.load(args.ftrain, self.fields, args.max_len)
    dev_corpus = Corpus.load(args.fdev, self.fields)
    dev40_corpus = Corpus.load(args.fdev, self.fields, args.max_len)
    test_corpus = Corpus.load(args.ftest, self.fields)
    test40_corpus = Corpus.load(args.ftest, self.fields, args.max_len)

    train = TextDataset(train_corpus, self.fields, args.buckets, crf=args.crf)
    dev = TextDataset(dev_corpus, self.fields, args.buckets, crf=args.crf)
    dev40 = TextDataset(dev40_corpus, self.fields, args.buckets, crf=args.crf)
    test = TextDataset(test_corpus, self.fields, args.buckets, crf=args.crf)
    test40 = TextDataset(test40_corpus, self.fields, args.buckets, crf=args.crf)

    # set the data loaders
    if args.self_train:
        train.loader = batchify(train, args.batch_size)
    else:
        train.loader = batchify(train, args.batch_size, True)
    dev.loader = batchify(dev, args.batch_size)
    dev40.loader = batchify(dev40, args.batch_size)
    test.loader = batchify(test, args.batch_size)
    test40.loader = batchify(test40, args.batch_size)

    logging.info(f"{'train:':6} {len(train):5} sentences, "
                 f"{len(train.loader):3} batches, "
                 f"{len(train.buckets)} buckets")
    logging.info(f"{'dev:':6} {len(dev):5} sentences, "
                 f"{len(dev.loader):3} batches, "
                 f"{len(dev.buckets)} buckets")
    logging.info(f"{'dev40:':6} {len(dev40):5} sentences, "
                 f"{len(dev40.loader):3} batches, "
                 f"{len(dev40.buckets)} buckets")
    logging.info(f"{'test:':6} {len(test):5} sentences, "
                 f"{len(test.loader):3} batches, "
                 f"{len(test.buckets)} buckets")
    logging.info(f"{'test40:':6} {len(test40):5} sentences, "
                 f"{len(test40.loader):3} batches, "
                 f"{len(test40.buckets)} buckets")

    logging.info("Create the model")
    self.model = Model(args)
    self.model = self.model.to(args.device)

    if args.E_Reg or args.T_Reg:
        source_model = Model(args)
        source_model = source_model.to(args.device)

    # load model
    if args.load != '':
        logging.info("Load source model")
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        state = torch.load(args.load, map_location=device)['state_dict']
        state_dict = self.model.state_dict()
        for k, v in state.items():
            if k in ['word_embed.weight']:
                continue
            state_dict.update({k: v})
        self.model.load_state_dict(state_dict)
        init_params = {}
        for name, param in self.model.named_parameters():
            init_params[name] = param.clone()
        self.model.init_params = init_params

        if args.E_Reg or args.T_Reg:
            state_dict = source_model.state_dict()
            for k, v in state.items():
                if k in ['word_embed.weight']:
                    continue
                state_dict.update({k: v})
            source_model.load_state_dict(state_dict)
            init_params = {}
            for name, param in source_model.named_parameters():
                init_params[name] = param.clone()
            source_model.init_params = init_params

    self.model = self.model.load_pretrained(self.WORD.embed)
    self.model = self.model.to(args.device)

    if args.self_train:
        train_arcs_preds = self.get_preds(train.loader)
        del self.model
        self.model = Model(args)
        self.model = self.model.load_pretrained(self.WORD.embed)
        self.model = self.model.to(args.device)

    if args.E_Reg or args.T_Reg:
        source_model = source_model.load_pretrained(self.WORD.embed)
        source_model = source_model.to(args.device)
        args.source_model = source_model

    self.optimizer = Adam(self.model.parameters(), args.lr,
                          (args.mu, args.nu), args.epsilon)
    self.scheduler = ExponentialLR(self.optimizer,
                                   args.decay**(1 / args.decay_steps))

    # test before train
    if args.load != '':
        logging.info('\n')
        dev_loss, dev_metric = self.evaluate(dev40.loader)
        test_loss, test_metric = self.evaluate(test40.loader)
        logging.info(f"{'dev40:':4} Loss: {dev_loss:.4f} {dev_metric}")
        logging.info(f"{'test40:':4} Loss: {test_loss:.4f} {test_metric}")
        dev_loss, dev_metric = self.evaluate(dev.loader)
        test_loss, test_metric = self.evaluate(test.loader)
        logging.info(f"{'dev:':4} Loss: {dev_loss:.4f} {dev_metric}")
        logging.info(f"{'test:':4} Loss: {test_loss:.4f} {test_metric}")

    total_time = timedelta()
    best_e, best_metric = 1, Metric()

    logging.info("Begin training")
    if args.unsupervised:
        max_uas = 0.
        cnt = 0
        for epoch in range(1, args.epochs + 1):
            start = datetime.now()
            self.train(train.loader)
            logging.info(f"Epoch {epoch} / {args.epochs}:")
            dev_loss, dev_metric = self.evaluate(dev40.loader)
            test_loss, test_metric = self.evaluate(test40.loader)
            logging.info(f"{'dev40:':4} Loss: {dev_loss:.4f} {dev_metric}")
            logging.info(f"{'test40:':4} Loss: {test_loss:.4f} {test_metric}")
            dev_loss, dev_metric = self.evaluate(dev.loader)
            test_loss, test_metric = self.evaluate(test.loader)
            logging.info(f"{'dev:':4} Loss: {dev_loss:.4f} {dev_metric}")
            logging.info(f"{'test:':4} Loss: {test_loss:.4f} {test_metric}")
            t = datetime.now() - start
            logging.info(f"{t}s elapsed\n")
    else:
        for epoch in range(1, args.epochs + 1):
            start = datetime.now()
            if args.self_train:
                self.train(train.loader, train_arcs_preds)
            else:
                self.train(train.loader)
            logging.info(f"Epoch {epoch} / {args.epochs}:")
            if args.self_train is False:
                dev_loss, dev_metric = self.evaluate(dev.loader)
                logging.info(f"{'dev:':4} Loss: {dev_loss:.4f} {dev_metric}")
            t = datetime.now() - start

            # save the model if it is the best so far
            if args.self_train:
                loss, test_metric = self.evaluate(test.loader)
                logging.info(f"{'test:':6} Loss: {loss:.4f} {test_metric}")
            else:
                if dev_metric > best_metric and epoch > args.patience:
                    loss, test_metric = self.evaluate(test.loader)
                    logging.info(f"{'test:':6} Loss: {loss:.4f} {test_metric}")
                    best_e, best_metric = epoch, dev_metric
                    if hasattr(self.model, 'module'):
                        self.model.module.save(args.model)
                    else:
                        self.model.save(args.model)
                    logging.info(f"{t}s elapsed, best epoch {best_e} {best_metric} (saved)\n")
                else:
                    logging.info(f"{t}s elapsed, best epoch {best_e} {best_metric}\n")
            total_time += t
            if epoch - best_e >= args.patience:
                break

    if args.self_train is False:
        self.model = Model.load(args.model)
    logging.info(f"max score of dev is {best_metric.score:.2%} at epoch {best_e}")
    loss, metric = self.evaluate(test.loader)
    logging.info(f"the score of test at epoch {best_e} is {metric.score:.2%}")
    logging.info(f"average time of each epoch is {total_time / epoch}s, {total_time}s elapsed")
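# Minimal sketch (hypothetical numbers) of the learning-rate schedule used above:
# ExponentialLR multiplies the lr by gamma after every scheduler step, so choosing
# gamma = decay ** (1 / decay_steps) shrinks the lr by a factor of `decay` once
# every `decay_steps` steps, e.g. with decay=0.75 and decay_steps=5000 the lr is
# 0.75x its starting value after 5000 steps.
import torch
from torch.optim import Adam
from torch.optim.lr_scheduler import ExponentialLR

decay, decay_steps = 0.75, 5000
optimizer = Adam([torch.nn.Parameter(torch.zeros(1))], lr=2e-3)
scheduler = ExponentialLR(optimizer, gamma=decay ** (1 / decay_steps))
for _ in range(decay_steps):
    optimizer.step()
    scheduler.step()
# optimizer.param_groups[0]['lr'] is now ~2e-3 * 0.75 = 1.5e-3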
def __call__(self, args):
    # override config from CLI parameters
    args = Config(args.conf).update(vars(args))
    args.n_attentions = args.use_attentions  # back compatibility

    # loads train corpus into self.trainset
    super().__call__(args)

    logger.info(f"Configuration parameters:\n{args}")

    # train = Corpus.load(args.ftrain, self.fields, args.max_sent_length)
    train = self.trainset
    dev = Corpus.load(args.fdev, self.fields, args.max_sent_length)
    if args.ftest:
        test = Corpus.load(args.ftest, self.fields, args.max_sent_length)

    train = TextDataset(train, self.fields, args.buckets)
    dev = TextDataset(dev, self.fields, args.buckets)
    if args.ftest:
        test = TextDataset(test, self.fields, args.buckets)
    # set the data loaders
    train.loader = batchify(train, args.batch_size, True)
    dev.loader = batchify(dev, args.batch_size)
    if args.ftest:
        test.loader = batchify(test, args.batch_size)
    logger.info(f"{'train:':6} {len(train):5} sentences, "
                f"{len(train.loader):3} batches, "
                f"{len(train.buckets)} buckets")
    logger.info(f"{'dev:':6} {len(dev):5} sentences, "
                f"{len(dev.loader):3} batches, "
                f"{len(dev.buckets)} buckets")
    if args.ftest:
        logger.info(f"{'test:':6} {len(test):5} sentences, "
                    f"{len(test.loader):3} batches, "
                    f"{len(test.buckets)} buckets")

    logger.info("Create the model")
    self.model = Model(args, mask_token_id=self.FEAT.mask_token_id)
    if self.WORD:
        self.model.load_pretrained(self.WORD.embed)
    self.model = self.model.to(args.device)
    if torch.cuda.device_count() > 1:
        self.model = TransparentDataParallel(self.model)
    logger.info(f"{self.model}\n")

    if args.optimizer == 'adamw':
        self.optimizer = AdamW(self.model.parameters(), args.lr,
                               (args.mu, args.nu), args.epsilon, args.decay)
        training_steps = len(train.loader) // self.args.accumulation_steps \
            * self.args.epochs
        warmup_steps = math.ceil(training_steps * self.args.warmup_steps_ratio)
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=training_steps)
    else:
        self.optimizer = Adam(self.model.parameters(), args.lr,
                              (args.mu, args.nu), args.epsilon)
        self.scheduler = ExponentialLR(self.optimizer,
                                       args.decay**(1 / args.decay_steps))

    total_time = timedelta()
    best_e, best_metric = 1, Metric()

    for epoch in range(1, args.epochs + 1):
        start = datetime.now()
        logger.info(f"Epoch {epoch} / {args.epochs}:")

        loss, train_metric = self.train(train.loader)
        logger.info(f"{'train:':6} Loss: {loss:.4f} {train_metric}")

        loss, dev_metric = self.evaluate(dev.loader)
        logger.info(f"{'dev:':6} Loss: {loss:.4f} {dev_metric}")
        if args.ftest:
            loss, test_metric = self.evaluate(test.loader)
            logger.info(f"{'test:':6} Loss: {loss:.4f} {test_metric}")

        t = datetime.now() - start
        # save the model if it is the best so far
        if dev_metric > best_metric and epoch > args.patience // 10:
            best_e, best_metric = epoch, dev_metric
            if hasattr(self.model, 'module'):
                self.model.module.save(args.model)
            else:
                self.model.save(args.model)
            logger.info(f"{t}s elapsed (saved)\n")
        else:
            logger.info(f"{t}s elapsed\n")
        total_time += t
        if epoch - best_e >= args.patience:
            break
    self.model = Model.load(args.model)
    if args.ftest:
        loss, metric = self.evaluate(test.loader)

    logger.info(f"max score of dev is {best_metric.score:.2%} at epoch {best_e}")
    if args.ftest:
        logger.info(f"the score of test at epoch {best_e} is {metric.score:.2%}")
    logger.info(f"average time of each epoch is {total_time / epoch}s")
    logger.info(f"{total_time}s elapsed")
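# Worked example (hypothetical numbers) of the AdamW schedule set up above:
# optimizer steps per epoch are batches // accumulation_steps, and warmup covers
# a fixed ratio of all training steps before the linear decay to zero.
import math

batches, accumulation_steps, epochs = 1000, 2, 10
warmup_steps_ratio = 0.1
training_steps = batches // accumulation_steps * epochs          # 500 * 10 = 5000
warmup_steps = math.ceil(training_steps * warmup_steps_ratio)    # 500
# get_linear_schedule_with_warmup(optimizer, num_warmup_steps=500,
#                                 num_training_steps=5000) then ramps the lr up
# over the first 500 steps and decays it linearly to 0 by step 5000.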
def __call__(self, args):
    super(Train, self).__call__(args)

    train = Corpus.load(args.ftrain, self.fields)
    dev = Corpus.load(args.fdev, self.fields)
    test = Corpus.load(args.ftest, self.fields)

    train = TextDataset(train, self.fields, args.buckets)
    dev = TextDataset(dev, self.fields, args.buckets)
    test = TextDataset(test, self.fields, args.buckets)
    # set the data loaders
    train.loader = batchify(train, args.batch_size, True)
    dev.loader = batchify(dev, args.batch_size)
    test.loader = batchify(test, args.batch_size)
    print(f"{'train:':6} {len(train):5} sentences, "
          f"{len(train.loader):3} batches, "
          f"{len(train.buckets)} buckets")
    print(f"{'dev:':6} {len(dev):5} sentences, "
          f"{len(dev.loader):3} batches, "
          f"{len(dev.buckets)} buckets")
    print(f"{'test:':6} {len(test):5} sentences, "
          f"{len(test.loader):3} batches, "
          f"{len(test.buckets)} buckets")

    print("Create the model")
    self.model = Model(args).load_pretrained(self.WORD.embed)
    print(f"{self.model}\n")
    self.model = self.model.to(args.device)
    if torch.cuda.device_count() > 1:
        self.model = nn.DataParallel(self.model)
    self.optimizer = Adam(self.model.parameters(), args.lr,
                          (args.mu, args.nu), args.epsilon)
    self.scheduler = ExponentialLR(self.optimizer,
                                   args.decay**(1 / args.decay_steps))

    total_time = timedelta()
    best_e, best_metric = 1, Metric()

    for epoch in range(1, args.epochs + 1):
        start = datetime.now()
        # train one epoch and update the parameters
        self.train(train.loader)

        print(f"Epoch {epoch} / {args.epochs}:")
        loss, train_metric = self.evaluate(train.loader)
        print(f"{'train:':6} Loss: {loss:.4f} {train_metric}")
        loss, dev_metric = self.evaluate(dev.loader)
        print(f"{'dev:':6} Loss: {loss:.4f} {dev_metric}")
        loss, test_metric = self.evaluate(test.loader)
        print(f"{'test:':6} Loss: {loss:.4f} {test_metric}")

        t = datetime.now() - start
        # save the model if it is the best so far
        if dev_metric > best_metric and epoch > args.patience:
            best_e, best_metric = epoch, dev_metric
            if hasattr(self.model, 'module'):
                self.model.module.save(args.model)
            else:
                self.model.save(args.model)
            print(f"{t}s elapsed (saved)\n")
        else:
            print(f"{t}s elapsed\n")
        total_time += t
        if epoch - best_e >= args.patience:
            break
    if hasattr(self.model, 'module'):
        self.model.module.save(args.model)
    else:
        self.model.save(args.model)
    print(f"{t}s elapsed (saved)\n")
    self.model = Model.load(args.model)
    loss, metric = self.evaluate(test.loader)

    print(f"max score of dev is {best_metric.score:.2%} at epoch {best_e}")
    print(f"the score of test at epoch {best_e} is {metric.score:.2%}")
    print(f"average time of each epoch is {total_time / epoch}s")
    print(f"{total_time}s elapsed")
def __call__(self, args):
    self.args = args
    if not hasattr(self.args, 'interpolation'):
        self.args.interpolation = 0.5
    if not os.path.exists(args.file):
        os.mkdir(args.file)
    if not os.path.exists(args.fields) or args.preprocess:
        print("Preprocess the data")
        self.WORD = Field('words', pad=pad, unk=unk, bos=bos, lower=True)
        # if args.feat == 'char':
        #     self.FEAT = CharField('chars', pad=pad, unk=unk, bos=bos,
        #                           fix_len=args.fix_len, tokenize=list)
        # elif args.feat == 'bert':
        #     tokenizer = BertTokenizer.from_pretrained(args.bert_model)
        #     self.FEAT = BertField('bert', pad='[PAD]', bos='[CLS]',
        #                           tokenize=tokenizer.encode)
        # else:
        #     self.FEAT = Field('tags', bos=bos)
        self.CHAR_FEAT = None
        self.POS_FEAT = None
        self.BERT_FEAT = None
        self.FEAT = [self.WORD]
        if args.use_char:
            self.CHAR_FEAT = CharField('chars', pad=pad, unk=unk, bos=bos,
                                       fix_len=args.fix_len, tokenize=list)
            self.FEAT.append(self.CHAR_FEAT)
        if args.use_pos:
            self.POS_FEAT = Field('tags', bos=bos)
        if args.use_bert:
            tokenizer = BertTokenizer.from_pretrained(args.bert_model)
            self.BERT_FEAT = BertField('bert', pad='[PAD]', bos='[CLS]',
                                       tokenize=tokenizer.encode)
            self.FEAT.append(self.BERT_FEAT)
        self.HEAD = Field('heads', bos=bos, use_vocab=False, fn=int)
        self.REL = Field('rels', bos=bos)
        self.fields = CoNLL(FORM=self.FEAT, CPOS=self.POS_FEAT,
                            HEAD=self.HEAD, DEPREL=self.REL)
        # if args.feat in ('char', 'bert'):
        #     self.fields = CoNLL(FORM=(self.WORD, self.FEAT),
        #                         HEAD=self.HEAD, DEPREL=self.REL)
        # else:
        #     self.fields = CoNLL(FORM=self.WORD, CPOS=self.FEAT,
        #                         HEAD=self.HEAD, DEPREL=self.REL)

        train = Corpus.load(args.ftrain, self.fields)
        if args.fembed:
            embed = Embedding.load(args.fembed, args.unk)
        else:
            embed = None
        self.WORD.build(train, args.min_freq, embed)
        if args.use_char:
            self.CHAR_FEAT.build(train)
        if args.use_pos:
            self.POS_FEAT.build(train)
        if args.use_bert:
            self.BERT_FEAT.build(train)
        # self.FEAT.build(train)
        self.REL.build(train)
        torch.save(self.fields, args.fields)
    else:
        self.fields = torch.load(args.fields)
        if args.feat in ('char', 'bert'):
            self.WORD, self.FEAT = self.fields.FORM
        else:
            self.WORD, self.FEAT = self.fields.FORM, self.fields.CPOS
        self.HEAD, self.REL = self.fields.HEAD, self.fields.DEPREL
    self.puncts = torch.tensor([
        i for s, i in self.WORD.vocab.stoi.items() if ispunct(s)
    ]).to(args.device)

    self.rel_criterion = nn.CrossEntropyLoss()
    self.arc_criterion = nn.CrossEntropyLoss()
    if args.binary:
        self.arc_criterion = nn.BCEWithLogitsLoss(reduction='none')

    # print(f"{self.WORD}\n{self.FEAT}\n{self.HEAD}\n{self.REL}")
    print(f"{self.WORD}\n{self.HEAD}\n{self.REL}")

    update_info = {}
    # pdb.set_trace()
    if args.use_char:
        update_info['n_char_feats'] = len(self.CHAR_FEAT.vocab)
    if args.use_pos:
        update_info['n_pos_feats'] = len(self.POS_FEAT.vocab)
    args.update({
        'n_words': self.WORD.vocab.n_init,
        # 'n_feats': len(self.FEAT.vocab),
        'n_rels': len(self.REL.vocab),
        'pad_index': self.WORD.pad_index,
        'unk_index': self.WORD.unk_index,
        'bos_index': self.WORD.bos_index
    })
    args.update(update_info)
def __call__(self, args):
    self.args = args
    if not os.path.exists(args.file):
        os.mkdir(args.file)
    if not os.path.exists(args.fields) or args.preprocess:
        logger.info("Preprocess the data")
        self.WORD = Field('words', pad=pad, unk=unk, bos=bos,
                          lower=args.lower)
        if args.feat == 'char':
            self.FEAT = SubwordField('chars', pad=pad, unk=unk, bos=bos,
                                     fix_len=args.fix_len, tokenize=list)
        elif args.feat == 'bert':
            tokenizer = SubwordField.tokenizer(args.bert_model)
            self.FEAT = SubwordField('bert', tokenizer=tokenizer,
                                     fix_len=args.fix_len)
            self.bos = self.FEAT.bos or bos
            if hasattr(tokenizer, 'vocab'):
                self.FEAT.vocab = tokenizer.vocab
            else:
                self.FEAT.vocab = FieldVocab(
                    tokenizer.unk_token_id,
                    {tokenizer._convert_id_to_token(i): i
                     for i in range(len(tokenizer))})
        else:
            self.FEAT = Field('tags', bos=self.bos)
        self.ARC = Field('arcs', bos=self.bos, use_vocab=False,
                         fn=numericalize)
        self.REL = Field('rels', bos=self.bos)
        if args.feat == 'bert':
            if args.n_embed:
                self.fields = CoNLL(FORM=(self.WORD, self.FEAT),
                                    HEAD=self.ARC, DEPREL=self.REL)
                self.WORD.bos = self.bos  # ensure representations of the same length
            else:
                self.fields = CoNLL(FORM=self.FEAT,
                                    HEAD=self.ARC, DEPREL=self.REL)
                self.WORD = None
        elif args.feat == 'char':
            self.fields = CoNLL(FORM=(self.WORD, self.FEAT),
                                HEAD=self.ARC, DEPREL=self.REL)
        else:
            self.fields = CoNLL(FORM=self.WORD, CPOS=self.FEAT,
                                HEAD=self.ARC, DEPREL=self.REL)

        train = Corpus.load(args.ftrain, self.fields, args.max_sent_length)
        if args.fembed:
            embed = Embedding.load(args.fembed, args.unk)
        else:
            embed = None
        if self.WORD:
            self.WORD.build(train, args.min_freq, embed)
        self.FEAT.build(train)
        self.REL.build(train)
        if args.feat == 'bert':
            # do not save the tokenize function, or else it might be incompatible with new releases
            tokenize = self.FEAT.tokenize  # save it
            self.FEAT.tokenize = None
        torch.save(self.fields, args.fields)
        if args.feat == 'bert':
            self.FEAT.tokenize = tokenize  # restore
        self.trainset = train  # pass it on to subclasses
    else:
        self.trainset = None
        self.fields = torch.load(args.fields)
        if args.feat == 'bert':
            tokenizer = SubwordField.tokenizer(args.bert_model)
            if args.n_embed:
                self.fields.FORM[1].tokenize = tokenizer.tokenize
            else:
                self.fields.FORM.tokenize = tokenizer.tokenize
    if args.feat in ('char', 'bert'):
        if isinstance(self.fields.FORM, tuple):
            self.WORD, self.FEAT = self.fields.FORM
        else:
            self.WORD, self.FEAT = None, self.fields.FORM
    else:
        self.WORD, self.FEAT = self.fields.FORM, self.fields.CPOS
    self.ARC, self.REL = self.fields.HEAD, self.fields.DEPREL
    self.puncts = torch.tensor(
        [i for s, i in self.WORD.vocab.stoi.items()
         if ispunct(s)]).to(args.device) if self.WORD else []

    # override parameters from embeddings:
    if self.WORD:
        args.update({
            'n_words': self.WORD.vocab.n_init,
            'pad_index': self.WORD.pad_index,
            'unk_index': self.WORD.unk_index,
            'bos_index': self.WORD.bos_index,
        })
    args.update({
        'n_feats': len(self.FEAT.vocab),
        'n_rels': len(self.REL.vocab),
        'feat_pad_index': self.FEAT.pad_index,
    })
    logger.info("Features:")
    if self.WORD:
        logger.info(f"  {self.WORD}")
    logger.info(f"  {self.FEAT}\n  {self.ARC}\n  {self.REL}")
def __call__(self, args):
    self.args = args
    if not os.path.exists(args.file):
        os.mkdir(args.file)
    if not os.path.exists(args.fields) or args.preprocess:
        print("Preprocess the data")
        self.WORD = Field('words', pad=pad, unk=unk, bos=bos, lower=True)
        if args.feat == 'char':
            self.FEAT = CharField('chars', pad=pad, unk=unk, bos=bos,
                                  fix_len=args.fix_len, tokenize=list)
        elif args.feat == 'bert':
            tokenizer = BertTokenizer.from_pretrained(args.bert_model)
            self.FEAT = BertField('bert', pad='[PAD]', bos='[CLS]',
                                  tokenize=tokenizer.encode)
        else:
            self.FEAT = Field('tags', bos=bos)
        self.HEAD = Field('heads', bos=bos, use_vocab=False, fn=int)
        self.REL = Field('rels', bos=bos)
        if args.feat in ('char', 'bert'):
            self.fields = CoNLL(FORM=(self.WORD, self.FEAT),
                                HEAD=self.HEAD, DEPREL=self.REL)
        else:
            self.fields = CoNLL(FORM=self.WORD, CPOS=self.FEAT,
                                HEAD=self.HEAD, DEPREL=self.REL)

        train = Corpus.load(args.ftrain, self.fields)
        if args.fembed:
            embed = Embedding.load(args.fembed, args.unk)
        else:
            embed = None
        self.WORD.build(train, args.min_freq, embed)
        self.FEAT.build(train)
        self.REL.build(train)
        torch.save(self.fields, args.fields)
    else:
        self.fields = torch.load(args.fields)
        if args.feat in ('char', 'bert'):
            self.WORD, self.FEAT = self.fields.FORM
        else:
            self.WORD, self.FEAT = self.fields.FORM, self.fields.CPOS
        self.HEAD, self.REL = self.fields.HEAD, self.fields.DEPREL
    self.puncts = torch.tensor([
        i for s, i in self.WORD.vocab.stoi.items() if ispunct(s)
    ]).to(args.device)
    self.criterion = nn.CrossEntropyLoss()

    print(f"{self.WORD}\n{self.FEAT}\n{self.HEAD}\n{self.REL}")
    args.update({
        'n_words': self.WORD.vocab.n_init,
        'n_feats': len(self.FEAT.vocab),
        'n_rels': len(self.REL.vocab),
        'pad_index': self.WORD.pad_index,
        'unk_index': self.WORD.unk_index,
        'bos_index': self.WORD.bos_index
    })
def __call__(self, args):
    self.args = args
    logging.basicConfig(filename=args.output,
                        filemode='w',
                        format='%(asctime)s %(levelname)-8s %(message)s',
                        level=logging.INFO,
                        datefmt='%Y-%m-%d %H:%M:%S')

    # each entry maps a language key to (train, dev, test, embeddings)
    args.ud_dataset = {
        'en': ("data/ud/UD_English-EWT/en_ewt-ud-train.conllx", "data/ud/UD_English-EWT/en_ewt-ud-dev.conllx", "data/ud/UD_English-EWT/en_ewt-ud-test.conllx", "data/fastText_data/wiki.en.ewt.vec.new"),
        'en20': ("data/ud/UD_English-EWT/en_ewt-ud-train20.conllx", "data/ud/UD_English-EWT/en_ewt-ud-dev.conllx", "data/ud/UD_English-EWT/en_ewt-ud-test.conllx", "data/fastText_data/wiki.en.ewt.vec.new"),
        'en40': ("data/ud/UD_English-EWT/en_ewt-ud-train40.conllx", "data/ud/UD_English-EWT/en_ewt-ud-dev.conllx", "data/ud/UD_English-EWT/en_ewt-ud-test.conllx", "data/fastText_data/wiki.en.ewt.vec.new"),
        'en60': ("data/ud/UD_English-EWT/en_ewt-ud-train60.conllx", "data/ud/UD_English-EWT/en_ewt-ud-dev.conllx", "data/ud/UD_English-EWT/en_ewt-ud-test.conllx", "data/fastText_data/wiki.en.ewt.vec.new"),
        'en80': ("data/ud/UD_English-EWT/en_ewt-ud-train80.conllx", "data/ud/UD_English-EWT/en_ewt-ud-dev.conllx", "data/ud/UD_English-EWT/en_ewt-ud-test.conllx", "data/fastText_data/wiki.en.ewt.vec.new"),
        'ar': ("data/ud/UD_Arabic-PADT/ar_padt-ud-train.conllx", "data/ud/UD_Arabic-PADT/ar_padt-ud-dev.conllx", "data/ud/UD_Arabic-PADT/ar_padt-ud-test.conllx", "data/fastText_data/wiki.ar.padt.vec.new"),
        'ar20': ("data/ud/UD_Arabic-PADT/ar_padt-ud-train20.conllx", "data/ud/UD_Arabic-PADT/ar_padt-ud-dev.conllx", "data/ud/UD_Arabic-PADT/ar_padt-ud-test.conllx", "data/fastText_data/wiki.ar.padt.vec.new"),
        'ar40': ("data/ud/UD_Arabic-PADT/ar_padt-ud-train40.conllx", "data/ud/UD_Arabic-PADT/ar_padt-ud-dev.conllx", "data/ud/UD_Arabic-PADT/ar_padt-ud-test.conllx", "data/fastText_data/wiki.ar.padt.vec.new"),
        'ar60': ("data/ud/UD_Arabic-PADT/ar_padt-ud-train60.conllx", "data/ud/UD_Arabic-PADT/ar_padt-ud-dev.conllx", "data/ud/UD_Arabic-PADT/ar_padt-ud-test.conllx", "data/fastText_data/wiki.ar.padt.vec.new"),
        'ar80': ("data/ud/UD_Arabic-PADT/ar_padt-ud-train80.conllx", "data/ud/UD_Arabic-PADT/ar_padt-ud-dev.conllx", "data/ud/UD_Arabic-PADT/ar_padt-ud-test.conllx", "data/fastText_data/wiki.ar.padt.vec.new"),
        'bg': ("data/ud/UD_Bulgarian-BTB/bg_btb-ud-train.conllx", "data/ud/UD_Bulgarian-BTB/bg_btb-ud-dev.conllx", "data/ud/UD_Bulgarian-BTB/bg_btb-ud-test.conllx", "data/fastText_data/wiki.bg.btb.vec.new"),
        'da': ("data/ud/UD_Danish-DDT/da_ddt-ud-train.conllx", "data/ud/UD_Danish-DDT/da_ddt-ud-dev.conllx", "data/ud/UD_Danish-DDT/da_ddt-ud-test.conllx", "data/fastText_data/wiki.da.ddt.vec.new"),
        'de': ("data/ud/UD_German-GSD/de_gsd-ud-train.conllx", "data/ud/UD_German-GSD/de_gsd-ud-dev.conllx", "data/ud/UD_German-GSD/de_gsd-ud-test.conllx", "data/fastText_data/wiki.de.gsd.vec.new"),
        'es': ("data/ud/UD_Spanish-GSDAnCora/es_gsdancora-ud-train.conllx", "data/ud/UD_Spanish-GSDAnCora/es_gsdancora-ud-dev.conllx", "data/ud/UD_Spanish-GSDAnCora/es_gsdancora-ud-test.conllx", "data/fastText_data/wiki.es.gsdancora.vec.new"),
        'es20': ("data/ud/UD_Spanish-GSDAnCora/es_gsdancora-ud-train20.conllx", "data/ud/UD_Spanish-GSDAnCora/es_gsdancora-ud-dev.conllx", "data/ud/UD_Spanish-GSDAnCora/es_gsdancora-ud-test.conllx", "data/fastText_data/wiki.es.gsdancora.vec.new"),
        'es40': ("data/ud/UD_Spanish-GSDAnCora/es_gsdancora-ud-train40.conllx", "data/ud/UD_Spanish-GSDAnCora/es_gsdancora-ud-dev.conllx", "data/ud/UD_Spanish-GSDAnCora/es_gsdancora-ud-test.conllx", "data/fastText_data/wiki.es.gsdancora.vec.new"),
        'es60': ("data/ud/UD_Spanish-GSDAnCora/es_gsdancora-ud-train60.conllx", "data/ud/UD_Spanish-GSDAnCora/es_gsdancora-ud-dev.conllx", "data/ud/UD_Spanish-GSDAnCora/es_gsdancora-ud-test.conllx", "data/fastText_data/wiki.es.gsdancora.vec.new"),
        'es80': ("data/ud/UD_Spanish-GSDAnCora/es_gsdancora-ud-train80.conllx", "data/ud/UD_Spanish-GSDAnCora/es_gsdancora-ud-dev.conllx", "data/ud/UD_Spanish-GSDAnCora/es_gsdancora-ud-test.conllx", "data/fastText_data/wiki.es.gsdancora.vec.new"),
        'fa': ("data/ud/UD_Persian-Seraji/fa_seraji-ud-train.conllx", "data/ud/UD_Persian-Seraji/fa_seraji-ud-dev.conllx", "data/ud/UD_Persian-Seraji/fa_seraji-ud-test.conllx", "data/fastText_data/wiki.fa.seraji.vec.new"),
        'fr': ("data/ud/UD_French-GSD/fr_gsd-ud-train.conllx", "data/ud/UD_French-GSD/fr_gsd-ud-dev.conllx", "data/ud/UD_French-GSD/fr_gsd-ud-test.conllx", "data/fastText_data/wiki.fr.gsd.vec.new"),
        'he': ("data/ud/UD_Hebrew-HTB/he_htb-ud-train.conllx", "data/ud/UD_Hebrew-HTB/he_htb-ud-dev.conllx", "data/ud/UD_Hebrew-HTB/he_htb-ud-test.conllx", "data/fastText_data/wiki.he.htb.vec.new"),
        'hi': ("data/ud/UD_Hindi-HDTB/hi_hdtb-ud-train.conllx", "data/ud/UD_Hindi-HDTB/hi_hdtb-ud-dev.conllx", "data/ud/UD_Hindi-HDTB/hi_hdtb-ud-test.conllx", "data/fastText_data/wiki.hi.hdtb.vec.new"),
        'hr': ("data/ud/UD_Croatian-SET/hr_set-ud-train.conllx", "data/ud/UD_Croatian-SET/hr_set-ud-dev.conllx", "data/ud/UD_Croatian-SET/hr_set-ud-test.conllx", "data/fastText_data/wiki.hr.set.vec.new"),
        'id': ("data/ud/UD_Indonesian-GSD/id_gsd-ud-train.conllx", "data/ud/UD_Indonesian-GSD/id_gsd-ud-dev.conllx", "data/ud/UD_Indonesian-GSD/id_gsd-ud-test.conllx", "data/fastText_data/wiki.id.gsd.vec.new"),
        'it': ("data/ud/UD_Italian-ISDT/it_isdt-ud-train.conllx", "data/ud/UD_Italian-ISDT/it_isdt-ud-dev.conllx", "data/ud/UD_Italian-ISDT/it_isdt-ud-test.conllx", "data/fastText_data/wiki.it.isdt.vec.new"),
        'ja': ("data/ud/UD_Japanese-GSD/ja_gsd-ud-train.conllx", "data/ud/UD_Japanese-GSD/ja_gsd-ud-dev.conllx", "data/ud/UD_Japanese-GSD/ja_gsd-ud-test.conllx", "data/fastText_data/wiki.ja.gsd.vec.new"),
        'ko': ("data/ud/UD_Korean-GSDKaist/ko_gsdkaist-ud-train.conllx", "data/ud/UD_Korean-GSDKaist/ko_gsdkaist-ud-dev.conllx", "data/ud/UD_Korean-GSDKaist/ko_gsdkaist-ud-test.conllx", "data/fastText_data/wiki.ko.gsdkaist.vec.new"),
        'nl': ("data/ud/UD_Dutch-AlpinoLassySmall/nl_alpinolassysmall-ud-train.conllx", "data/ud/UD_Dutch-AlpinoLassySmall/nl_alpinolassysmall-ud-dev.conllx", "data/ud/UD_Dutch-AlpinoLassySmall/nl_alpinolassysmall-ud-test.conllx", "data/fastText_data/wiki.nl.alpinolassysmall.vec.new"),
        'no': ("data/ud/UD_Norwegian-BokmaalNynorsk/no_bokmaalnynorsk-ud-train.conllx", "data/ud/UD_Norwegian-BokmaalNynorsk/no_bokmaalnynorsk-ud-dev.conllx", "data/ud/UD_Norwegian-BokmaalNynorsk/no_bokmaalnynorsk-ud-test.conllx", "data/fastText_data/wiki.no.bokmaalnynorsk.vec.new"),
        'pt': ("data/ud/UD_Portuguese-BosqueGSD/pt_bosquegsd-ud-train.conllx", "data/ud/UD_Portuguese-BosqueGSD/pt_bosquegsd-ud-dev.conllx", "data/ud/UD_Portuguese-BosqueGSD/pt_bosquegsd-ud-test.conllx", "data/fastText_data/wiki.pt.bosquegsd.vec.new"),
        'sv': ("data/ud/UD_Swedish-Talbanken/sv_talbanken-ud-train.conllx", "data/ud/UD_Swedish-Talbanken/sv_talbanken-ud-dev.conllx", "data/ud/UD_Swedish-Talbanken/sv_talbanken-ud-test.conllx", "data/fastText_data/wiki.sv.talbanken.vec.new"),
        'tr': ("data/ud/UD_Turkish-IMST/tr_imst-ud-train.conllx", "data/ud/UD_Turkish-IMST/tr_imst-ud-dev.conllx", "data/ud/UD_Turkish-IMST/tr_imst-ud-test.conllx", "data/fastText_data/wiki.tr.imst.vec.new"),
        'zh': ("data/ud/UD_Chinese-GSD/zh_gsd-ud-train.conllx", "data/ud/UD_Chinese-GSD/zh_gsd-ud-dev.conllx", "data/ud/UD_Chinese-GSD/zh_gsd-ud-test.conllx", "data/fastText_data/wiki.zh.gsd.vec.new"),
    }

    self.args.ftrain = args.ud_dataset[args.lang][0]
    self.args.fdev = args.ud_dataset[args.lang][1]
    self.args.ftest = args.ud_dataset[args.lang][2]
    self.args.fembed = args.ud_dataset[args.lang][3]

    if not os.path.exists(args.file):
        os.mkdir(args.file)
    if not os.path.exists(args.fields) or args.preprocess:
        logging.info("Preprocess the data")
        self.WORD = Field('words', pad=pad, unk=unk, bos=bos, lower=True)
        tokenizer = BertTokenizer.from_pretrained(args.bert_model)
        self.BERT = BertField('bert', pad='[PAD]', bos='[CLS]',
                              tokenize=tokenizer.encode)
        if args.feat == 'char':
            self.FEAT = CharField('chars', pad=pad, unk=unk, bos=bos,
                                  fix_len=args.fix_len, tokenize=list)
        elif args.feat == 'bert':
            tokenizer = BertTokenizer.from_pretrained(args.bert_model)
            self.FEAT = BertField('bert', pad='[PAD]', bos='[CLS]',
                                  tokenize=tokenizer.encode)
        else:
            self.FEAT = Field('tags', bos=bos)
        self.HEAD = Field('heads', bos=bos, use_vocab=False, fn=int)
        self.REL = Field('rels', bos=bos)
        if args.feat in ('char', 'bert'):
            self.fields = CoNLL(FORM=(self.WORD, self.BERT, self.FEAT),
                                HEAD=self.HEAD, DEPREL=self.REL)
        else:
            self.fields = CoNLL(FORM=(self.WORD, self.BERT),
                                CPOS=self.FEAT,
                                HEAD=self.HEAD, DEPREL=self.REL)

        train = Corpus.load(args.ftrain, self.fields, args.max_len)
        if args.fembed:
            if args.bert is False:  # fasttext
                embed = Embedding.load(args.fembed, args.lang, unk=args.unk)
            else:
                embed = None
        else:
            embed = None
        self.WORD.build(train, args.min_freq, embed)
        self.FEAT.build(train)
        self.BERT.build(train)
        self.REL.build(train)
        torch.save(self.fields, args.fields)
    else:
        self.fields = torch.load(args.fields)
        if args.feat in ('char', 'bert'):
            self.WORD, self.BERT, self.FEAT = self.fields.FORM
        else:
            # FORM holds (WORD, BERT) here, CPOS holds the tag feature
            (self.WORD, self.BERT), self.FEAT = self.fields.FORM, self.fields.CPOS
        self.HEAD, self.REL = self.fields.HEAD, self.fields.DEPREL
    self.puncts = torch.tensor([i for s, i in self.WORD.vocab.stoi.items()
                                if ispunct(s)]).to(args.device)
    self.criterion = nn.CrossEntropyLoss()

    logging.info(f"{self.WORD}\n{self.FEAT}\n{self.BERT}\n{self.HEAD}\n{self.REL}")
    args.update({
        'n_words': self.WORD.vocab.n_init,
        'n_feats': len(self.FEAT.vocab),
        'n_bert': len(self.BERT.vocab),
        'n_rels': len(self.REL.vocab),
        'pad_index': self.WORD.pad_index,
        'unk_index': self.WORD.unk_index,
        'bos_index': self.WORD.bos_index
    })
    logging.info(f"n_words {args.n_words} n_feats {args.n_feats} n_bert {args.n_bert} "
                 f"pad_index {args.pad_index} bos_index {args.bos_index}")
def __call__(self, args):
    self.args = args
    if not os.path.exists(args.file):
        os.mkdir(args.file)
    if not os.path.exists(args.fields) or args.preprocess:
        print("Preprocess the data")
        self.CHAR = Field('chars', pad=pad, unk=unk, bos=bos, eos=eos,
                          lower=True)
        # TODO span as label, modify chartfield to spanfield
        self.SEG = SegmentField('segs')
        if args.feat == 'bert':
            tokenizer = BertTokenizer.from_pretrained(args.bert_model)
            self.FEAT = BertField('bert', pad='[PAD]', bos='[CLS]',
                                  eos='[SEP]', tokenize=tokenizer.encode)
            self.fields = CoNLL(CHAR=(self.CHAR, self.FEAT), SEG=self.SEG)
        elif args.feat == 'bigram':
            self.BIGRAM = NGramField('bichar', n=2, pad=pad, unk=unk,
                                     bos=bos, eos=eos, lower=True)
            self.fields = CoNLL(CHAR=(self.CHAR, self.BIGRAM), SEG=self.SEG)
        elif args.feat == 'trigram':
            self.BIGRAM = NGramField('bichar', n=2, pad=pad, unk=unk,
                                     bos=bos, eos=eos, lower=True)
            self.TRIGRAM = NGramField('trichar', n=3, pad=pad, unk=unk,
                                      bos=bos, eos=eos, lower=True)
            self.fields = CoNLL(CHAR=(self.CHAR, self.BIGRAM, self.TRIGRAM),
                                SEG=self.SEG)
        else:
            self.fields = CoNLL(CHAR=self.CHAR, SEG=self.SEG)

        train = Corpus.load(args.ftrain, self.fields)
        embed = Embedding.load('data/tencent.char.200.txt',
                               args.unk) if args.embed else None
        self.CHAR.build(train, args.min_freq, embed)
        if hasattr(self, 'FEAT'):
            self.FEAT.build(train)
        if hasattr(self, 'BIGRAM'):
            embed = Embedding.load('data/tencent.bi.200.txt',
                                   args.unk) if args.embed else None
            self.BIGRAM.build(train, args.min_freq, embed=embed,
                              dict_file=args.dict_file)
        if hasattr(self, 'TRIGRAM'):
            embed = Embedding.load('data/tencent.tri.200.txt',
                                   args.unk) if args.embed else None
            self.TRIGRAM.build(train, args.min_freq, embed=embed,
                               dict_file=args.dict_file)
        # TODO
        self.SEG.build(train)
        torch.save(self.fields, args.fields)
    else:
        self.fields = torch.load(args.fields)
        if args.feat == 'bert':
            self.CHAR, self.FEAT = self.fields.CHAR
        elif args.feat == 'bigram':
            self.CHAR, self.BIGRAM = self.fields.CHAR
        elif args.feat == 'trigram':
            self.CHAR, self.BIGRAM, self.TRIGRAM = self.fields.CHAR
        else:
            self.CHAR = self.fields.CHAR
        # TODO
        self.SEG = self.fields.SEG

    # TODO loss function
    # (commented-out) BMES transition constraints over [B, E, M, S]:
    # only B/S may start, only E/S may end, B/M -> {M, E}, E/S -> {B, S}
    # self.criterion = nn.CrossEntropyLoss()
    # # [B, E, M, S]
    # self.trans = (torch.tensor([1., 0., 0., 1.]).log().to(args.device),
    #               torch.tensor([0., 1., 0., 1.]).log().to(args.device),
    #               torch.tensor([[0., 1., 1., 0.],
    #                             [1., 0., 0., 1.],
    #                             [0., 1., 1., 0.],
    #                             [1., 0., 0., 1.]]).log().to(args.device))

    args.update({
        'n_chars': self.CHAR.vocab.n_init,
        'pad_index': self.CHAR.pad_index,
        'unk_index': self.CHAR.unk_index
    })
    # TODO
    vocab = f"{self.CHAR}\n"
    if hasattr(self, 'FEAT'):
        args.update({
            'n_feats': self.FEAT.vocab.n_init,
        })
        vocab += f"{self.FEAT}\n"
    if hasattr(self, 'BIGRAM'):
        args.update({
            'n_bigrams': self.BIGRAM.vocab.n_init,
        })
        vocab += f"{self.BIGRAM}\n"
    if hasattr(self, 'TRIGRAM'):
        args.update({
            'n_trigrams': self.TRIGRAM.vocab.n_init,
        })
        vocab += f"{self.TRIGRAM}\n"

    print(f"Override the default configs\n{args}")
    print(vocab[:-1])