def init_vocab(self, data):
    assert self.eval == False  # for eval, the vocab must already exist
    charvocab = CharVocab(data, idx=0)
    wordvocab = WordVocab(data, idx=1, cutoff=self.word_cutoff, lower=True)
    posvocab = WordVocab(data, idx=2)
    featsvocab = FeatureVocab(data, idx=3)
    vocab = MultiVocab({
        'char': charvocab,
        'word': wordvocab,
        'pos': posvocab,
        'feats': featsvocab
    })
    return vocab
def train(corpus_file, mode, dim_size, window, min_count, negative, epoch, workers):
    with bz2.BZ2File(corpus_file) as f:
        sentences = LineSentence(f)
        sg = int(mode == 'sg')
        # Note: uses the legacy gensim / Python 2 API (`size`, `iter`,
        # `model.vocab`, `iteritems`, `model[word]`).
        model = Word2Vec(sentences, size=dim_size, window=window,
                         min_count=min_count, workers=workers, iter=epoch,
                         negative=negative, sg=sg)

        words = []
        entities = []
        for (w, _) in model.vocab.iteritems():
            if w.startswith(MARKER):
                entities.append(w[len(MARKER):].replace(u'_', u' '))
            else:
                words.append(w)

        word_vocab = WordVocab(Trie(words), lowercase=True)
        entity_vocab = EntityVocab(Trie(entities))

        word_embedding = np.zeros((len(words), dim_size), dtype=np.float32)
        entity_embedding = np.zeros((len(entities), dim_size), dtype=np.float32)

        for word in words:
            ind = word_vocab.get_index(word)
            if ind is not None:
                word_embedding[ind] = model[word]

        for entity in entities:
            entity_embedding[entity_vocab.get_index(entity)] = model[
                MARKER + entity.replace(u' ', u'_')]

        return EmbeddingReader(word_embedding, entity_embedding,
                               word_vocab, entity_vocab)
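A hypothetical call of the train() function above; the corpus path and hyperparameter values are illustrative assumptions, not values taken from the project.

# Hypothetical invocation; path and hyperparameter values are placeholders.
reader = train(corpus_file='corpus.txt.bz2',  # bz2-compressed corpus, one sentence per line
               mode='sg',                     # 'sg' selects skip-gram, anything else CBOW
               dim_size=100,
               window=5,
               min_count=5,
               negative=5,
               epoch=5,
               workers=4)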
def __init__(self, filename, vocabs=None):
    self._filename = filename
    self._vocabs = vocabs if vocabs else [
        WordVocab(), LemmaVocab(), TagVocab(), PredictVocab(), SemTagVocab()
    ]
    self._input_vocabs = self.vocabs[:-1]
    self._target_vocabs = [self.vocabs[-1]]
    self._establish_vocab()
    self._data = []
    self._read_data(self._filename, self._data)
def decode(vocab_file, model_file, input_file, no_prog):
    d = pickle.load(open(vocab_file, "rb"))
    wv = WordVocab.from_dump(d["vocab"]["word"])
    tv = {k: TableVocab.from_dump(v) for k, v in d["vocab"]["table"].items()}
    writer = d["author"] if "writer" in model_file else None
    model = Reporter.parse_config(tv=tv, wv=wv, writer=writer,
                                  model_file=model_file)
    inputs = json.load(open(input_file))
    for ins in tqdm(inputs, total=len(inputs), ncols=80, disable=no_prog):
        print(
            model.decode(
                make_table(ins),
                writer=writer.get(ins.get("author"), 0) if writer else None))
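A hypothetical call of decode(); the file names are placeholders. Note that author conditioning is only enabled when the model file name contains the substring "writer".

# Hypothetical invocation; file names are placeholders.
decode(vocab_file="prep.pkl",
       model_file="reporter_writer_best.dy",  # "writer" in the name enables author conditioning
       input_file="test.json",
       no_prog=False)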
def prep(dir_path, annotation_path, out_path):
    desc = (str(datetime.datetime.now()) +
            " Overwrite the preprocessed data: {}? Y/n (default: n)".format(out_path))
    if os.path.exists(out_path) and input(desc) != "Y":
        print(str(datetime.datetime.now()) + " Exit.")
        exit()

    print(str(datetime.datetime.now()) + " Building dataset from " + dir_path)
    train = json.load(open(os.path.join(dir_path, "train.json")))
    tables, texts = [make_table(ins) for ins in train], list(
        make_text(train, annotation_path))
    authors = {ins.get("author", "UNK") for ins in train}
    assert len(tables) == len(texts)

    tv = {}
    for k in ("team", "player"):
        tv[k] = TableVocab([t[k] for t in tables], key=k)
    wv = WordVocab({w for doc in texts for sent, _ in doc for w in sent})

    print(str(datetime.datetime.now()) + " Saving dataset to " + out_path)
    pickle.dump(
        {
            "data": {"text": texts, "table": tables},
            "vocab": {
                "word": wv.__dict__,
                "table": {k: v.__dict__ for k, v in tv.items()}
            },
            "author": {k: i for i, k in enumerate(authors)}
        },
        open(out_path, "wb"))
epochs = 10
num_workers = 1
with_cuda = False
log_freq = 20
corpus_lines = None
cuda_devices = None
on_memory = True
lr = 1e-4
adam_weight_decay = 0.01
adam_beta1 = 0.9
adam_beta2 = 0.999

print("Loading Vocab", vocab_path)
vocab = WordVocab.load_vocab(vocab_path)
print("Vocab Size: ", len(vocab))

print("Loading Train Dataset", train_dataset)
train_dataset = BERTDataset(train_dataset, vocab, seq_len=seq_len,
                            corpus_lines=corpus_lines, on_memory=on_memory)

print("Loading Test Dataset", test_dataset)
test_dataset = BERTDataset(test_dataset, vocab, seq_len=seq_len, on_memory=on_memory) \
    if test_dataset is not None else None

print("Creating Dataloader")
train_data_loader = DataLoader(train_dataset,
import pickle

import numpy as np
import pandas as pd
from tqdm import tqdm

from vocab import CharVocab, WordVocab

VOCAB_SAVE_PATH = "./data/saves/vocab/"

CHAR_VOCAB = CharVocab()
WORD_VOCAB = WordVocab()
EMBEDD_MATRIX = None
VECTOR_DIM = 20


def save_():
    pickle_out = open(VOCAB_SAVE_PATH + 'char_vocab.pickle', 'wb')
    pickle.dump(CHAR_VOCAB, pickle_out)
    pickle_out.close()
    pickle_out = open('./data/saves/char_embedding.pickle', 'wb')
    pickle.dump(EMBEDD_MATRIX, pickle_out)
    pickle_out.close()


def load_word_vocab():
    global WORD_VOCAB
    WORD_VOCAB = pickle.load(open(VOCAB_SAVE_PATH + 'word_vocab.pickle', 'rb'))
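A hypothetical usage of the module above, assuming word_vocab.pickle was written by an earlier step and that CHAR_VOCAB and EMBEDD_MATRIX are populated elsewhere before saving.

# Hypothetical usage; assumes the pickles and matrices exist as described above.
load_word_vocab()   # refresh the module-level WORD_VOCAB from disk
save_()             # persist CHAR_VOCAB and the char embedding matrix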
                    type=int,
                    default=1,
                    help="number of batch_size")
# note: argparse's type=bool treats any non-empty string as True,
# so passing "False" on the command line still evaluates to True
parser.add_argument("--multi_segment",
                    type=bool,
                    default=False,
                    help="whether to use multiple segment_labels for entity types")
parser.add_argument("--sep_label",
                    type=bool,
                    default=False,
                    help="whether to insert <sep>")

args = parser.parse_args()

print("Loading Vocab", args.vocab_path)
vocab = WordVocab.load_vocab(args.vocab_path)
print("Vocab Size: ", len(vocab))

print("Loading Train Dataset", args.train_dataset)
train_dataset = BERTDataset(args.train_dataset, vocab, seq_len=args.seq_len,
                            corpus_lines=args.corpus_lines,
                            on_memory=args.on_memory,
                            multi_segment=args.multi_segment,
                            sep=args.sep_label)

print("Creating Dataloader")
train_data_loader = DataLoader(train_dataset,
                               batch_size=args.batch_size,
                               num_workers=args.num_workers)
def main(experiment_path, dataset_path, config_path, restore_path, workers):
    logging.basicConfig(level=logging.INFO)
    config = Config.from_json(config_path)
    fix_seed(config.seed)

    train_data = pd.concat([
        load_data(os.path.join(dataset_path, 'train-clean-100'), workers=workers),
        load_data(os.path.join(dataset_path, 'train-clean-360'), workers=workers),
    ])
    eval_data = pd.concat([
        load_data(os.path.join(dataset_path, 'dev-clean'), workers=workers),
    ])

    if config.vocab == 'char':
        vocab = CharVocab(CHAR_VOCAB)
    elif config.vocab == 'word':
        vocab = WordVocab(train_data['syms'], 30000)
    elif config.vocab == 'subword':
        vocab = SubWordVocab(10000)
    else:
        raise AssertionError('invalid config.vocab: {}'.format(config.vocab))

    train_transform = T.Compose([
        ApplyTo(['sig'], T.Compose([
            LoadSignal(SAMPLE_RATE),
            ToTensor(),
        ])),
        ApplyTo(['syms'], T.Compose([
            VocabEncode(vocab),
            ToTensor(),
        ])),
        Extract(['sig', 'syms']),
    ])
    eval_transform = train_transform

    train_dataset = TrainEvalDataset(train_data, transform=train_transform)
    eval_dataset = TrainEvalDataset(eval_data, transform=eval_transform)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_sampler=BatchSampler(train_data,
                                   batch_size=config.batch_size,
                                   shuffle=True,
                                   drop_last=True),
        num_workers=workers,
        collate_fn=collate_fn)
    eval_data_loader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_sampler=BatchSampler(eval_data, batch_size=config.batch_size),
        num_workers=workers,
        collate_fn=collate_fn)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = Model(SAMPLE_RATE, len(vocab))
    model_to_save = model
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    model.to(device)
    if restore_path is not None:
        load_weights(model_to_save, restore_path)

    if config.opt.type == 'adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     config.opt.lr,
                                     weight_decay=1e-4)
    elif config.opt.type == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(),
                                    config.opt.lr,
                                    momentum=0.9,
                                    weight_decay=1e-4)
    else:
        raise AssertionError('invalid config.opt.type {}'.format(config.opt.type))

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, len(train_data_loader) * config.epochs)

    # ==================================================================================================================
    # main loop
    train_writer = SummaryWriter(os.path.join(experiment_path, 'train'))
    eval_writer = SummaryWriter(os.path.join(experiment_path, 'eval'))
    best_wer = float('inf')

    for epoch in range(config.epochs):
        if epoch % 10 == 0:
            logging.info(experiment_path)

        # ==============================================================================================================
        # training
        metrics = {
            'loss': Mean(),
            'fps': Mean(),
        }

        model.train()
        t1 = time.time()
        for (sigs, labels), (sigs_mask, labels_mask) in tqdm(
                train_data_loader,
                desc='epoch {} training'.format(epoch),
                smoothing=0.01):
            sigs, labels = sigs.to(device), labels.to(device)
            sigs_mask, labels_mask = sigs_mask.to(device), labels_mask.to(device)

            logits, etc = model(sigs, labels[:, :-1], sigs_mask,
                                labels_mask[:, :-1])

            loss = compute_loss(input=logits,
                                target=labels[:, 1:],
                                mask=labels_mask[:, 1:],
                                smoothing=config.label_smoothing)
            metrics['loss'].update(loss.data.cpu().numpy())

            lr = np.squeeze(scheduler.get_lr())

            optimizer.zero_grad()
            loss.mean().backward()
            optimizer.step()
            scheduler.step()

            t2 = time.time()
            metrics['fps'].update(1 / ((t2 - t1) / sigs.size(0)))
            t1 = t2

        with torch.no_grad():
            metrics = {k: metrics[k].compute_and_reset() for k in metrics}
            print('[EPOCH {}][TRAIN] {}'.format(
                epoch, ', '.join(
                    '{}: {:.4f}'.format(k, metrics[k]) for k in metrics)))
            for k in metrics:
                train_writer.add_scalar(k, metrics[k], global_step=epoch)
            train_writer.add_scalar('learning_rate', lr, global_step=epoch)
            train_writer.add_image('spectras',
                                   torchvision.utils.make_grid(
                                       etc['spectras'],
                                       nrow=compute_nrow(etc['spectras']),
                                       normalize=True),
                                   global_step=epoch)
            for k in etc['weights']:
                w = etc['weights'][k]
                train_writer.add_image('weights/{}'.format(k),
                                       torchvision.utils.make_grid(
                                           w, nrow=compute_nrow(w),
                                           normalize=True),
                                       global_step=epoch)

            for i, (true, pred) in enumerate(
                    zip(labels[:, 1:][:4].detach().data.cpu().numpy(),
                        np.argmax(logits[:4].detach().data.cpu().numpy(), -1))):
                print('{}:'.format(i))
                text = vocab.decode(take_until_token(true.tolist(), vocab.eos_id))
                print(colored(text, 'green'))
                text = vocab.decode(take_until_token(pred.tolist(), vocab.eos_id))
                print(colored(text, 'yellow'))

        # ==============================================================================================================
        # evaluation
        metrics = {
            # 'loss': Mean(),
            'wer': Mean(),
        }

        model.eval()
        with torch.no_grad(), Pool(workers) as pool:
            for (sigs, labels), (sigs_mask, labels_mask) in tqdm(
                    eval_data_loader,
                    desc='epoch {} evaluating'.format(epoch),
                    smoothing=0.1):
                sigs, labels = sigs.to(device), labels.to(device)
                sigs_mask, labels_mask = sigs_mask.to(device), labels_mask.to(device)

                logits, etc = model.infer(sigs,
                                          sigs_mask,
                                          sos_id=vocab.sos_id,
                                          eos_id=vocab.eos_id,
                                          max_steps=labels.size(1) + 10)

                # loss = compute_loss(
                #     input=logits, target=labels[:, 1:], mask=labels_mask[:, 1:],
                #     smoothing=config.label_smoothing)
                # metrics['loss'].update(loss.data.cpu().numpy())

                wer = compute_wer(input=logits,
                                  target=labels[:, 1:],
                                  vocab=vocab,
                                  pool=pool)
                metrics['wer'].update(wer)

        with torch.no_grad():
            metrics = {k: metrics[k].compute_and_reset() for k in metrics}
            print('[EPOCH {}][EVAL] {}'.format(
                epoch, ', '.join(
                    '{}: {:.4f}'.format(k, metrics[k]) for k in metrics)))
            for k in metrics:
                eval_writer.add_scalar(k, metrics[k], global_step=epoch)
            eval_writer.add_image('spectras',
                                  torchvision.utils.make_grid(
                                      etc['spectras'],
                                      nrow=compute_nrow(etc['spectras']),
                                      normalize=True),
                                  global_step=epoch)
            for k in etc['weights']:
                w = etc['weights'][k]
                eval_writer.add_image('weights/{}'.format(k),
                                      torchvision.utils.make_grid(
                                          w, nrow=compute_nrow(w),
                                          normalize=True),
                                      global_step=epoch)

        save_model(model_to_save, experiment_path)
        if metrics['wer'] < best_wer:
            best_wer = metrics['wer']
            save_model(model_to_save, mkdir(os.path.join(experiment_path, 'best')))
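take_until_token() is used for decoding above but is not defined in this snippet; a minimal sketch, assuming it truncates a token-id list at the first end-of-sequence id:

# Sketch only: assumed behaviour of take_until_token, not the project's own code.
def take_until_token(ids, token):
    """Return the prefix of `ids` up to, but not including, `token`."""
    return ids[:ids.index(token)] if token in ids else ids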
def train(vocab_file, valid_file, nh_vocab, nh_rnn, writer, learning_rate,
          lr_decay, batch_size, n_epoch, log_dir):
    log_dir = os.path.join(log_dir, str(int(time.time())))

    # Initialize...
    print(str(datetime.datetime.now()) + " Log dir at {}".format(log_dir))
    os.mkdir(log_dir)
    print(str(datetime.datetime.now()) + " Loading dataset...")
    d = pickle.load(open(vocab_file, "rb"))
    texts, tables = d["data"]["text"], d["data"]["table"]
    wv = WordVocab.from_dump(d["vocab"]["word"])
    tv = {k: TableVocab.from_dump(v) for k, v in d["vocab"]["table"].items()}
    writer = d["author"] if writer else None

    print(str(datetime.datetime.now()) + " Vectorizing...")
    data = list(vectorize(texts, tables, wv, tv, writer))
    valid = json.load(open(valid_file)) if valid_file else None

    # Model
    model = Reporter(tv=tv, wv=wv, nh_vocab=nh_vocab, nh_rnn=nh_rnn,
                     writer=writer)
    print(str(datetime.datetime.now()) + " Model configurations...")
    print(str(datetime.datetime.now()) + " " + str(model))

    # Trainer
    trainer = Trainer(model, lr=learning_rate, decay=lr_decay,
                      batch_size=batch_size)
    print(str(datetime.datetime.now()) + " Trainer configurations...")
    print(str(datetime.datetime.now()) + " " + str(trainer))

    try:
        best = 0.
        print(str(datetime.datetime.now()) + " Start training...")
        for _ in range(n_epoch):
            trainer.fit_partial(data)
            pc_name = str(model) + "_{}.dy".format(trainer.iter)
            model.pc.save(os.path.join(log_dir, pc_name))
            if valid and trainer.iter >= 5:
                pred = []
                prog = tqdm(desc="Evaluation: ", total=len(valid) + 1, ncols=80)
                for ins in valid:
                    p = model.decode(
                        make_table(ins),
                        writer=writer.get(ins.get("author")) if writer else None)
                    pred.append(p.split())
                    prog.update()
                bleu = nltk.translate.bleu_score.corpus_bleu(
                    [[nltk.word_tokenize(' '.join(v["summary"]))] for v in valid],
                    pred)
                prog.set_postfix(BLEU=bleu)
                prog.update()
                prog.close()
                if bleu > best:
                    best = bleu
                    print(str(datetime.datetime.now()) + " Save best model...")
                    model.pc.save(os.path.join(log_dir, str(model) + "_best.dy"))
    except KeyboardInterrupt:
        print("KeyboardInterrupted...")
def get_vocab(data):
    char_vocab = CharVocab(data, idx=0)
    word_vocab = WordVocab(data, idx=0, cutoff=2)
    vocab = MultiVocab({'char': char_vocab, 'word': word_vocab})
    return vocab
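A hypothetical lookup on the returned MultiVocab, assuming it exposes its sub-vocabularies by the same keys used to construct it (as the dict passed above suggests):

# Assumption: MultiVocab supports dict-style access by key.
vocab = get_vocab(data)      # `data` prepared as expected by CharVocab/WordVocab
char_vocab = vocab['char']
word_vocab = vocab['word']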
def train():
    parser = argparse.ArgumentParser()

    parser.add_argument("-c", "--train_dataset", required=True, type=str,
                        help="train dataset for train bert")
    parser.add_argument("-t", "--test_dataset", type=str, default=None,
                        help="test set for evaluate train set")
    parser.add_argument("-v", "--vocab_path", required=True, type=str,
                        help="built vocab model path with bert-vocab")
    parser.add_argument("-o", "--output_path", required=True, type=str,
                        help="ex)output/bert.model")
    parser.add_argument("-bt", "--bert_path", type=str,
                        help="path of pretrained bert")

    parser.add_argument("-hs", "--hidden", type=int, default=256,
                        help="hidden size of transformer model")
    parser.add_argument("-l", "--layers", type=int, default=8,
                        help="number of layers")
    parser.add_argument("-a", "--attn_heads", type=int, default=8,
                        help="number of attention heads")
    parser.add_argument("-s", "--seq_len", type=int, default=100,
                        help="maximum sequence len")

    parser.add_argument("-b", "--batch_size", type=int, default=64,
                        help="number of batch_size")
    parser.add_argument("-e", "--epochs", type=int, default=10,
                        help="number of epochs")
    parser.add_argument("-w", "--num_workers", type=int, default=4,
                        help="dataloader worker size")

    # note: argparse's type=bool treats any non-empty string as True
    parser.add_argument("--with_cuda", type=bool, default=True,
                        help="training with CUDA: true, or false")
    parser.add_argument("--log_freq", type=int, default=10,
                        help="printing loss every n iter: setting n")
    parser.add_argument("--cuda_devices", type=int, nargs='+', default=None,
                        help="CUDA device ids")

    parser.add_argument("--lr", type=float, default=1e-3,
                        help="learning rate of adam")
    parser.add_argument("--adam_weight_decay", type=float, default=0.01,
                        help="weight_decay of adam")
    parser.add_argument("--adam_beta1", type=float, default=0.9,
                        help="adam first beta value")
    parser.add_argument("--adam_beta2", type=float, default=0.999,
                        help="adam second beta value")

    d = vars(parser.parse_args())
    args = Option(d)

    print("Loading Vocab", args.vocab_path)
    vocab = WordVocab.load_vocab(args.vocab_path)
    print("Vocab Size: ", len(vocab))

    print("Loading Train Dataset", args.train_dataset)
    train_dataset = Dataset(args.train_dataset, vocab, seq_len=args.seq_len)

    print("Loading Test Dataset", args.test_dataset)
    test_dataset = Dataset(args.test_dataset, vocab, seq_len=args.seq_len) \
        if args.test_dataset is not None else None

    print("Creating Dataloader")
    train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size,
                                   num_workers=args.num_workers)
    test_data_loader = DataLoader(test_dataset, batch_size=args.batch_size,
                                  num_workers=args.num_workers) \
        if test_dataset is not None else None

    print("Building model")
    bert = torch.load(args.bert_path)
    model = Readout(bert.hidden, args.hidden)
    # model = bert_fc(args.bert_path, args.hidden)

    print("Creating Trainer")
    trainer = Trainer(args, bert, model,
                      train_dataloader=train_data_loader,
                      test_dataloader=test_data_loader)

    print("Training Start")
    for epoch in range(args.epochs):
        trainer.train(epoch)
        trainer.save(epoch, args.output_path)
        if test_data_loader is not None:
            trainer.test(epoch)
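A hypothetical command line for this fine-tuning script; the script name and all paths are placeholders, not taken from the project.

# Hypothetical invocation (script name and paths are placeholders):
#   python finetune.py -c data/corpus.train.txt -t data/corpus.test.txt \
#       -v data/vocab.pkl -o output/readout.model -bt output/bert.model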