def __call__(self, args):
    """Prepare shared state for all sub-commands.

    Creates the output directory, then either builds the data fields
    (vocabularies, tokenizers) from the training data and caches them to
    ``args.fields``, or reloads the cached fields from disk.
    """
    self.args = args
    if not os.path.exists(args.file):
        os.mkdir(args.file)
    if not os.path.exists(args.fields) or args.preprocess:
        print("Preprocess the data")
        self.WORD = Field('words', pad=pad, unk=unk, bos=bos, lower=True)
        # The feature field depends on --feat: character-level, BERT
        # subword, or plain tag features.
        if args.feat == 'char':
            self.FEAT = CharField('chars', pad=pad, unk=unk, bos=bos,
                                  fix_len=args.fix_len, tokenize=list)
        elif args.feat == 'bert':
            tokenizer = BertTokenizer.from_pretrained(args.bert_model)
            self.FEAT = BertField('bert', pad='[PAD]', bos='[CLS]',
                                  tokenize=tokenizer.encode)
        else:
            self.FEAT = Field('tags', bos=bos)
        self.HEAD = Field('heads', bos=bos, use_vocab=False, fn=int)
        self.REL = Field('rels', bos=bos)
        # char/bert features share the FORM column with the word field;
        # otherwise the feature field occupies the CPOS column.
        if args.feat in ('char', 'bert'):
            self.fields = CoNLL(FORM=(self.WORD, self.FEAT),
                                HEAD=self.HEAD, DEPREL=self.REL)
        else:
            self.fields = CoNLL(FORM=self.WORD, CPOS=self.FEAT,
                                HEAD=self.HEAD, DEPREL=self.REL)
        train = Corpus.load(args.ftrain, self.fields)
        if args.fembed:
            embed = Embedding.load(args.fembed, args.unk)
        else:
            embed = None
        # Build vocabularies from the training corpus only
        # (HEAD uses use_vocab=False, so it needs no build).
        self.WORD.build(train, args.min_freq, embed)
        self.FEAT.build(train)
        self.REL.build(train)
        torch.save(self.fields, args.fields)
    else:
        # Reload cached fields and unpack them in the same layout
        # they were saved in (see the branch above).
        self.fields = torch.load(args.fields)
        if args.feat in ('char', 'bert'):
            self.WORD, self.FEAT = self.fields.FORM
        else:
            self.WORD, self.FEAT = self.fields.FORM, self.fields.CPOS
        self.HEAD, self.REL = self.fields.HEAD, self.fields.DEPREL
    # Vocabulary indices of punctuation tokens — presumably used to mask
    # punctuation during evaluation; confirm against the metric code.
    self.puncts = torch.tensor([i for s, i in self.WORD.vocab.stoi.items()
                                if ispunct(s)]).to(args.device)
    self.criterion = nn.CrossEntropyLoss()
    print(f"{self.WORD}\n{self.FEAT}\n{self.HEAD}\n{self.REL}")
    # Expose vocab sizes and special token indices to the rest of the run.
    args.update({
        'n_words': self.WORD.vocab.n_init,
        'n_feats': len(self.FEAT.vocab),
        'n_rels': len(self.REL.vocab),
        'pad_index': self.WORD.pad_index,
        'unk_index': self.WORD.unk_index,
        'bos_index': self.WORD.bos_index
    })
def __call__(self, args):
    """Run the trained parser over ``args.fdata`` and save predictions
    to ``args.fpred``, reporting throughput along the way."""
    super(Predict, self).__call__(args)

    print("Load the dataset")
    data = Corpus.load(args.fdata, self.fields)
    testset = TextDataset(data, [self.WORD, self.FEAT])
    # attach a loader so prediction can iterate in batches
    testset.loader = batchify(testset, args.batch_size)
    print(f"{len(testset)} sentences, "
          f"{len(testset.loader)} batches")

    print("Load the model")
    self.model = Model.load(args.model)
    print(f"{self.model}\n")

    print("Make predictions on the dataset")
    start = datetime.now()
    data.heads, data.rels = self.predict(testset.loader)
    print(f"Save the predicted result to {args.fpred}")
    data.save(args.fpred)
    elapsed = datetime.now() - start
    print(f"{elapsed}s elapsed, "
          f"{len(testset) / elapsed.total_seconds():.2f} Sents/s")
def __call__(self, args):
    """Evaluate the trained parser on ``args.fdata`` and report the loss,
    metric, and throughput."""
    super(Evaluate, self).__call__(args)

    print("Load the dataset")
    data = Corpus.load(args.fdata, self.fields)
    evalset = TextDataset(data, self.fields, args.buckets)
    # attach a loader so evaluation can iterate in batches
    evalset.loader = batchify(evalset, args.batch_size)
    print(f"{len(evalset)} sentences, "
          f"{len(evalset.loader)} batches, "
          f"{len(evalset.buckets)} buckets")

    print("Load the model")
    self.model = Model.load(args.model)
    print(f"{self.model}\n")

    print("Evaluate the dataset")
    start = datetime.now()
    loss, metric = self.evaluate(evalset.loader)
    elapsed = datetime.now() - start

    print(f"Loss: {loss:.4f} {metric}")
    print(f"{elapsed}s elapsed, "
          f"{len(evalset) / elapsed.total_seconds():.2f} Sents/s")
# NOTE(review): this chunk starts inside a file-reading loop whose header is
# not visible here (`line = fp.readline()` at the bottom implies a while loop
# over `fp`); the flat indentation below is reconstructed — confirm the loop
# nesting against the full file before relying on it.
# Write one "<id>\t<chosen alternative>" line per prediction, where the
# alternatives are '|'-separated in a['alternatives'].
if pred == 0:
    fpw.write(str(id))  # NOTE(review): `id` shadows the builtin
    fpw.write('\t')
    fpw.write(a['alternatives'].split('|')[0])
elif pred == 1:
    fpw.write(str(id))
    fpw.write('\t')
    fpw.write(a['alternatives'].split('|')[1])
else:
    # any other prediction value falls through to the third alternative
    fpw.write(str(id))
    fpw.write('\t')
    fpw.write(a['alternatives'].split('|')[2])
fpw.write('\n')
id += 1
line = fp.readline()

corpus = Corpus('RC')
# NOTE(review): eval() on a CLI-provided string resolves the model class by
# name — unsafe on untrusted input; an explicit name->class mapping is safer.
model = eval(args.model)(corpus, args)
model.train()
parameters = filter(lambda p: p.requires_grad, model.parameters())
# The freshly-built model above is immediately replaced by the saved
# checkpoint; `parameters` is presumably unused after this — TODO confirm.
model, optimizer, criterion = torch.load('trainedmodel/RC_save_best.pt')
evaluation(model, optimizer, criterion, corpus, args.cuda, args.batch_size, dataset='test')
    verbose=args.verbose)  # NOTE(review): tail of a call opened before this chunk
# Mark every excluded document with the `exclude` gold label.
Document.set_gold_labels(exclude_docs, Label.exclude, one_label=True)
all_docs = include_docs + exclude_docs
# filter out docs with no: 1) text or abstract data 2) gold labels (only if training)
if args.train:
    #all_docs = Document.filter_fields(all_docs, [Document.gold_label, Document.abstract])
    all_docs = Document.filter_gold_labels(all_docs)
    all_docs = Document.filter_failed_parses(all_docs)
    train_docs, test_docs = split_data(all_docs)
else:
    # inference-only runs: everything is test data
    train_docs, test_docs = [], all_docs
corpus = Corpus(train_docs=train_docs, test_docs=test_docs)
# Persist the corpus; location comes from config, with a pickle default.
save_loc = config["corpus"].get(SVMClassifier.SAVE_LOC_KEY, "corpus.pkl")
corpus.save(save_loc)
logging.info("{} Training and {} Test Documents".format(
    len(corpus.train), len(corpus.test)))
if args.train:
    # `labelled_corpus` is only used as a sanity check here; training still
    # receives the full corpus.train — TODO confirm that is intended.
    labelled_corpus = Document.filter_gold_labels(corpus.train)
    assert labelled_corpus, "cannot train a classifier without any gold labels in the corpus"
    logging.info("Reading classifier...")
    classifier = ClassifierStrategy.from_config(config)
    classifier.train_classifier(corpus.train)
args = parser.parse_args()

# Seed every RNG source for reproducibility.
seed_num = args.seed
random.seed(seed_num)
torch.manual_seed(seed_num)
np.random.seed(seed_num)

# '-1' disables CUDA; otherwise pin the run to the requested GPU id(s).
if args.gpu != '-1':
    use_cuda = True
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
else:
    use_cuda = False
torch.set_num_threads(args.cpu_thread)

train_samples = Corpus(args.train, do_lower=False, number_normal=True)
dev_samples = Corpus(args.dev, do_lower=False, number_normal=True)
test_samples = Corpus(args.test, do_lower=False, number_normal=True)

# Vocabularies are built over the union of train/dev/test samples.
word_vocab = Vocab(train_samples.samples + dev_samples.samples + test_samples.samples,
                   islabel=False, freq=1)
#word_vocab.add_embedding_file(args.pretrain_emb, embedding_dim=args.word_emb_dim)
char_vocab = Vocab(train_samples.samples + dev_samples.samples + test_samples.samples,
                   ischar=True, freq=1)
# NOTE(review): this call continues past the end of this chunk.
label_vocab = Vocab(train_samples.samples + dev_samples.samples + test_samples.samples,
                    islabel=True,
                    type=str, default='1', help='experiment index')  # NOTE(review): tail of an add_argument call opened before this chunk
parser.add_argument('--log', type=str, default='nothing', help='take note')
args = parser.parse_args()

torch.manual_seed(args.seed)
if torch.cuda.is_available():
    if not args.cuda:
        print(
            "WARNING: You have a CUDA device, so you should probably run with --cuda"
        )
    else:
        torch.cuda.manual_seed(args.seed)

corpus = Corpus(args.task)
# NOTE(review): eval() on a CLI-provided string resolves the model class by
# name — unsafe on untrusted input; an explicit name->class mapping is safer.
model = eval(args.model)(corpus, args)
model.train()
criterion = nn.NLLLoss()
# Optimize only parameters that require gradients.
parameters = filter(lambda p: p.requires_grad, model.parameters())
optimizer = optim.Adamax(parameters, lr=args.lr)
if args.cuda:
    model.cuda()
    criterion.cuda()

start_time = time.time()
total_loss = 0
interval = args.interval
# One "save interval" is presumably one pass over the training batches
# — TODO confirm against the training loop below.
save_interval = len(corpus.data_all['train']) // args.batch_size
def __call__(self, args):
    """Train the parser end to end.

    Loads train/dev/test corpora, builds bucketed datasets and loaders,
    creates the model and optimizer, then runs the epoch loop with
    dev-metric-based checkpointing and patience-based early stopping.
    Finally reloads the best checkpoint and reports test performance.

    Args:
        args: configuration namespace with data paths, optimization
            hyperparameters, and device settings (also consumed by the
            base command via ``super().__call__``).
    """
    super(Train, self).__call__(args)

    train = Corpus.load(args.ftrain, self.fields)
    dev = Corpus.load(args.fdev, self.fields)
    test = Corpus.load(args.ftest, self.fields)
    train = TextDataset(train, self.fields, args.buckets)
    dev = TextDataset(dev, self.fields, args.buckets)
    test = TextDataset(test, self.fields, args.buckets)
    # set the data loaders (only the training loader shuffles)
    train.loader = batchify(train, args.batch_size, True)
    dev.loader = batchify(dev, args.batch_size)
    test.loader = batchify(test, args.batch_size)
    # BUG FIX: the dev/test summaries previously printed len(train.buckets)
    # (copy-paste error); each split now reports its own bucket count.
    print(f"{'train:':6} {len(train):5} sentences, "
          f"{len(train.loader):3} batches, "
          f"{len(train.buckets)} buckets")
    print(f"{'dev:':6} {len(dev):5} sentences, "
          f"{len(dev.loader):3} batches, "
          f"{len(dev.buckets)} buckets")
    print(f"{'test:':6} {len(test):5} sentences, "
          f"{len(test.loader):3} batches, "
          f"{len(test.buckets)} buckets")

    print("Create the model")
    self.model = Model(args).load_pretrained(self.WORD.embed)
    print(f"{self.model}\n")
    self.model = self.model.to(args.device)
    if torch.cuda.device_count() > 1:
        self.model = nn.DataParallel(self.model)
    self.optimizer = Adam(self.model.parameters(),
                          args.lr,
                          (args.mu, args.nu),
                          args.epsilon)
    self.scheduler = ExponentialLR(self.optimizer,
                                   args.decay**(1 / args.decay_steps))

    total_time = timedelta()
    best_e, best_metric = 1, Metric()
    for epoch in range(1, args.epochs + 1):
        start = datetime.now()
        # train one epoch and update the parameters
        self.train(train.loader)

        print(f"Epoch {epoch} / {args.epochs}:")
        loss, train_metric = self.evaluate(train.loader)
        print(f"{'train:':6} Loss: {loss:.4f} {train_metric}")
        loss, dev_metric = self.evaluate(dev.loader)
        print(f"{'dev:':6} Loss: {loss:.4f} {dev_metric}")
        loss, test_metric = self.evaluate(test.loader)
        print(f"{'test:':6} Loss: {loss:.4f} {test_metric}")

        t = datetime.now() - start
        # save the model if it is the best so far
        # (checkpointing only starts after args.patience warm-up epochs)
        if dev_metric > best_metric and epoch > args.patience:
            best_e, best_metric = epoch, dev_metric
            if hasattr(self.model, 'module'):
                # unwrap DataParallel before saving
                self.model.module.save(args.model)
            else:
                self.model.save(args.model)
            print(f"{t}s elapsed (saved)\n")
        else:
            print(f"{t}s elapsed\n")
        total_time += t
        # early stopping: no dev improvement for args.patience epochs
        if epoch - best_e >= args.patience:
            break
    # reload the best checkpoint and report final test performance
    self.model = Model.load(args.model)
    loss, metric = self.evaluate(test.loader)

    print(f"max score of dev is {best_metric.score:.2%} at epoch {best_e}")
    print(f"the score of test at epoch {best_e} is {metric.score:.2%}")
    print(f"average time of each epoch is {total_time / epoch}s")
    print(f"{total_time}s elapsed")