def main(args):
    if not Path(args.out_file).parent.exists():
        raise ValueError("Invalid out_file %s. Does the directory exist?" % (args.out_file,))
    corpus = Corpus(args.corpus_path)
    # Remove actual dataset -- just keep the vocabulary
    corpus.train = None
    corpus.valid = None
    corpus.test = None
    torch.save(corpus, args.out_file)
def train_model(self):
    args = self.args

    # Load data
    corpus = Corpus(args.file)
    train_data = train.batchify(corpus.train, args.batch_size, self.device)

    # Build the model
    ntokens = len(corpus.dictionary)
    model = RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied).to(self.device)
    # criterion = nn.NLLLoss()
    # criterion = nn.MSELoss()
    criterion = self.args.criterion
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    # Training code
    # Loop over epochs.
    lr = args.lr
    # At any point you can hit Ctrl + C to break out of training early.
    try:
        for epoch in range(1, args.epochs + 1):
            epoch_start_time = time.time()
            train.train(train_data, args, model, optimizer, criterion, corpus, epoch, lr, self.device)
            print('-' * 89)
            with open(args.save, 'wb') as f:
                torch.save(model, f)
            lr /= 4.0
    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')
    return model
def main():
    """main function"""
    ctx = mx.gpu(2)
    batch_size = 40
    bptt = 35
    corpus = Corpus('./data/ptb.')
    ntokens = len(corpus.dictionary)
    train_data = CorpusIter(corpus.train, batch_size, bptt)
    valid_data = CorpusIter(corpus.valid, batch_size, bptt)
    test_data = CorpusIter(corpus.test, batch_size, bptt)
    data_names = [x[0] for x in test_data.provide_data]
    label_names = [x[0] for x in test_data.provide_label]
    prefix = './output/model'
    epoch = 39
    model = mx.module.Module.load(prefix, epoch, label_names=label_names, data_names=data_names, context=ctx)
    model.bind(for_training=False, data_shapes=test_data.provide_data, label_shapes=test_data.provide_label)
    for batch in test_data:
        model.forward(batch, is_train=False)
        print(model.get_outputs())
def get_popular_first_words(args):
    corpus = Corpus(args.data_path)
    ntokens = len(corpus.dictionary)
    idx2word = corpus.dictionary.idx2word
    most_common_first_words_ids = [
        i[0] for i in Counter(corpus.train.tolist()).most_common()
        if idx2word[i[0]][0].isupper()
    ][:args.utterances_to_generate]
    return [corpus.dictionary.idx2word[i] for i in most_common_first_words_ids]
def plot(args):
    num_words = 1000

    # Set the random seed manually for reproducibility.
    torch.manual_seed(args.seed)

    # Load model.
    with open(args.checkpoint, 'rb') as f:
        try:
            model = torch.load(f)
        except:
            # Convert the model to CPU if the model is serialized on GPU.
            model = torch.load(f, map_location='cpu')
    model.eval()
    embeddings = model.embedding.weight.data

    # Load data.
    data_dir = os.path.expanduser(args.data_dir)
    corpus = Corpus(data_dir, headers=args.no_headers, lower=args.lower, chars=args.use_chars)
    ntokens = len(corpus.dictionary.w2i)

    # Some checks to see if data and model are consistent.
    model_data_checks(model, corpus, args)

    # Prepare embeddings from num_words most common words.
    most_common_idxs = Counter(corpus.train).most_common(num_words)
    most_common_idxs, _ = zip(*most_common_idxs)  # Discard counts
    most_common_words = [corpus.dictionary.i2w[i] for i in most_common_idxs]
    idxs = torch.LongTensor(most_common_idxs)
    embeddings = embeddings[idxs, :].numpy()

    # Make bokeh plot.
    emb_scatter(embeddings, most_common_words, model_name=args.name)
def save_lmbmet_ds():
    """Instantiate a Corpus object, add vocabulary and encoded data"""
    logger.info('Saving LMBMET training data...')
    corpus = Corpus()
    # Identify all the tokens except MeSH descriptors, which are included in
    # vocab via mesh_def
    words_wo_mesh_d = []
    for w, _ in words.most_common():
        if w in mesh_def and mesh_def[w]['descriptor']:
            continue
        words_wo_mesh_d.append(w)
    corpus.vocab.load_vocab(words_wo_mesh_d, mesh_def=mesh_def, specials=['<eos>', '<unk>'])
    corpus.mesh_def = mesh_def
    corpus.load_data(docs)
    pickle.dump(corpus, lm_out_file.open("wb"), protocol=4)
def main(args):
    data_dir = os.path.expanduser(args.data_dir)
    gold_path = os.path.expanduser(args.gold_path)

    corpus = Corpus(args.vocab_path, data_dir)
    model = torch.load(args.model_path)
    parser = Decoder(corpus, model)
    conll = parser.batch_eval()

    # Write the conll as text.
    conll.write(args.predict_path)

    # Evaluate the predicted conll.
    os.system('perl eval.pl -g {0} -s {1} > {2}'.format(gold_path, args.predict_path, args.result_path))
def test(data_path, model_path, options_path, dict_path):
    with open(model_path, 'rb') as f:
        model = torch.load(f)
    with open(options_path, 'rb') as f:
        model_params = pkl.load(f)

    print("Load data...")
    corpus = Corpus(data_path)
    test_data = batchify(corpus.test, model_params['batch_size'])
    print("Done")

    PPL = evaluatePTB(test_data, model, model_params, corpus.dictionary)
    print('test perplexity: ', PPL)
def main():
    set_random_seed(2020)
    show_device_info()
    data_path = Path('/media/bnu/data/nlp-practice/language-model')
    corpus = Corpus(data_path, sort_by_len=False)
    learner = LMLearner(corpus, n_embed=400, n_hidden=400, dropout=0.5, rnn_type='LSTM',
                        batch_size=128, learning_rate=1e-3)
    # Train the model; comment this out when a trained model exists and only error analysis is needed
    run(learner)
    test_loss, test_acc, test_words, test_result = learner.predict()
    print('Result in Test --> Loss: {:.3f}, Acc: {:.3f}, Words: {}'.format(test_loss, test_acc, test_words))
    show_test_sample(4, test_result, corpus)
    show_most_mistake(test_result, corpus)
def main(args):
    print(f'Loading corpus from `{args.data}`...')
    corpus = Corpus(args.data, order=args.order, lower=args.lower, max_lines=args.max_lines)
    model = Ngram(order=args.order)
    name = f'{args.name}.{args.order}gram'

    print('Example data:')
    print('Train:', corpus.train[:20])
    print('Valid:', corpus.valid[:20])

    print('Training model...')
    model.train(corpus.train, add_k=args.add_k, interpolate=args.interpolate, backoff=args.backoff)
    print(f'Vocab size: {len(model.vocab):,}')

    if args.save_arpa:
        print(f'Saving model to `{name}`...')
        model.save_arpa(name)

    assert model.sum_to_one(n=10)

    print('Generating text...')
    text = model.generate(100)
    text = ' '.join(text)
    path = os.path.join(args.out, f'generated.{name}.txt')
    print(text)
    with open(path, 'w') as f:
        print(text, file=f)

    if model.is_smoothed:
        print('\nPredicting test set NLL...')
        logprob = model(corpus.test)
        nll = -logprob / len(corpus.test)
        print(f'Test NLL: {nll:.2f} | Perplexity {exp(nll):.2f}')
        path = os.path.join(args.out, f'result.{name}.txt')
        with open(path, 'w') as f:
            print(f'Test NLL: {nll:.2f} | Perplexity {exp(nll):.2f}', file=f)
    else:
        exit('No evaluation with unsmoothed model: probability is probably 0 anyways.')
def main(data_dir):
    print(f'Reading and processing data from `{data_dir}`...')
    corpus = Corpus(data_dir)

    print('Collecting ngram counts...')
    print('Unigram...')
    unigrams = get_unigrams(corpus.train)
    print('Bigram...')
    bigrams = get_ngrams(corpus.train, history=1)
    print('Trigram...')
    trigrams = get_ngrams(corpus.train, history=2)
    print('Fourgram...')
    fourgrams = get_ngrams(corpus.train, history=3)

    for i, gram in enumerate((unigrams, bigrams, trigrams, fourgrams), 1):
        with open(f'data/wikitext.{i}gram.json', 'w') as f:
            json.dump(gram, f, indent=4)

    print('Done.')
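# Note: `get_unigrams` and `get_ngrams` are not defined in this snippet. Below is a
# minimal Counter-based sketch of what such helpers might look like (an assumption,
# not the snippet's actual implementation; `history` = number of preceding tokens,
# and the nested-dict return values are what makes json.dump above work).
from collections import Counter, defaultdict

def get_unigrams(tokens):
    # Relative frequency of every token in the corpus.
    counts = Counter(tokens)
    total = sum(counts.values())
    return {word: count / total for word, count in counts.items()}

def get_ngrams(tokens, history=1):
    # For each context of `history` tokens, count the tokens that follow it.
    ngrams = defaultdict(Counter)
    for i in range(history, len(tokens)):
        context = ' '.join(tokens[i - history:i])
        ngrams[context][tokens[i]] += 1
    return {context: dict(counter) for context, counter in ngrams.items()}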
def main(unused_argv):
    if len(unused_argv) != 1:
        raise Exception("There is a problem with how you entered flags: %s" % unused_argv)

    options, vocab, multisense_vocab, tf_config = init.init()
    model = polylm.PolyLM(vocab, options, multisense_vocab=multisense_vocab, training=True)
    test_words = options.test_words.split()

    if not os.path.exists(options.model_dir):
        os.makedirs(options.model_dir)
    src_dir = os.path.join(options.model_dir, 'src_%d' % int(time.time()))
    os.makedirs(src_dir)
    publish_source(src_dir)

    flags_str = options.flags_into_string()
    with open(os.path.join(options.model_dir, 'flags'), 'w') as f:
        f.write(flags_str)

    corpus = Corpus(options.corpus_path, vocab)

    with tf.Session(config=tf_config) as sess:
        model.attempt_restore(sess, options.model_dir, False)
        model.train(corpus, sess, test_words=test_words)
    return loss


if __name__ == '__main__':
    # args
    head = '%(asctime)-15s %(message)s'
    logging.basicConfig(level=logging.DEBUG, format=head)
    args = parser.parse_args()
    logging.info(args)
    ctx = mx.gpu()
    batch_size = args.batch_size
    bptt = args.bptt
    mx.random.seed(args.seed)

    # data
    corpus = Corpus(args.data)
    ntokens = len(corpus.dictionary)
    train_data = CorpusIter(corpus.train, batch_size, bptt)
    valid_data = CorpusIter(corpus.valid, batch_size, bptt)
    test_data = CorpusIter(corpus.test, batch_size, bptt)

    # model
    pred, states, state_names = rnn(bptt, ntokens, args.emsize, args.nhid, args.nlayers,
                                    args.dropout, batch_size, args.tied)
    loss = softmax_ce_loss(pred)

    # module
    module = CustomStatefulModule(loss, states, state_names=state_names,
def main():
    args = get_args()
    log.info(f'Parsed arguments: \n{pformat(args.__dict__)}')

    assert args.cond_type.lower() in ['none', 'platanios', 'oestling']

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    log.info('Using device {}.'.format(device))

    use_apex = False
    if torch.cuda.is_available() and args.fp16:
        log.info('Loading Nvidia Apex and using AMP')
        from apex import amp, optimizers
        use_apex = True
    else:
        log.info('Using FP32')
        amp = None

    log.info(f'Using time stamp {timestamp} to save models and logs.')

    if not args.no_seed:
        log.info(f'Setting random seed to {args.seed} for reproducibility.')
        torch.manual_seed(args.seed)
        random.seed(args.seed)

    data = Corpus(args.datadir)

    data_splits = [
        {
            'split': 'train',
            'languages': args.dev_langs + args.target_langs,
            'invert_include': True,
        },
        {
            'split': 'valid',
            'languages': args.dev_langs,
        },
        {
            'split': 'test',
            'languages': args.target_langs,
        },
    ]
    if args.refine:
        data_splits.append({
            'split': 'train_100',
            'languages': args.target_langs,
            'ignore_missing': True,
        })

    data_splits = data.make_datasets(data_splits, force_rebuild=args.rebuild)
    train_set, val_set, test_set = data_splits['train'], data_splits['valid'], data_splits['test']
    dictionary = data_splits['dictionary']

    train_language_distr = get_sampling_probabilities(train_set, 1.0)
    train_set = Dataset(train_set, batchsize=args.batchsize, bptt=args.bptt, reset_on_iter=True,
                        language_probabilities=train_language_distr)
    val_set = Dataset(val_set, make_config=True, batchsize=args.valid_batchsize, bptt=args.bptt, eval=True)
    test_set = Dataset(test_set, make_config=True, batchsize=args.test_batchsize, bptt=args.bptt, eval=True)

    train_loader = DataLoader(train_set, num_workers=args.workers)
    val_loader = DataLoader(val_set, num_workers=args.workers)
    test_loader = DataLoader(test_set, num_workers=args.workers)

    if args.refine:
        refine_set = dict()
        for lang, lang_d in data_splits['train_100'].items():
            refine_set[lang] = Dataset({lang: lang_d}, batchsize=args.valid_batchsize,
                                       bptt=args.bptt, make_config=True)

    n_token = len(dictionary.idx2tkn)

    # Load and preprocess matrix of typological features
    # TODO: implement this, the OEST
    # prior_matrix = load_prior(args.prior, corpus.dictionary.lang2idx)
    # n_components = min(50, *prior_matrix.shape)
    # pca = PCA(n_components=n_components, whiten=True)
    # prior_matrix = pca.fit_transform(prior_matrix)
    prior = None

    model = RNN(args.cond_type, prior, n_token, n_input=args.emsize, n_hidden=args.nhidden,
                n_layers=args.nlayers, dropout=args.dropouto, dropoute=args.dropoute,
                dropouth=args.dropouth, dropouti=args.dropouti, wdrop=args.wdrop,
                wdrop_layers=[0, 1, 2], tie_weights=True).to(device)

    if args.opt_level != 'O2':
        loss_function = SplitCrossEntropyLoss(args.emsize, splits=[]).to(device)
    else:
        loss_function = CrossEntropyLoss().to(device)  # Should be ok to use with a vocabulary of this small size

    if use_apex:
        optimizer = optimizers.FusedAdam(model.parameters(), lr=args.lr, weight_decay=args.wdecay)
    else:
        params = list(filter(lambda p: p.requires_grad, model.parameters())) + list(loss_function.parameters())
        optimizer = Adam(params, lr=args.lr, weight_decay=args.wdecay)

    if use_apex:
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level)

    parameters = {
        'model': model,
        'optimizer': optimizer,
        'loss_function': loss_function,
        'use_apex': use_apex,
        'amp': amp if use_apex else None,
        'clip': args.clip,
        'alpha': args.alpha,
        'beta': args.beta,
        'bptt': args.bptt,
        'device': device,
        'prior': args.prior,
    }

    # Add backward hook for gradient clipping
    if args.clip:
        if use_apex:
            for p in amp.master_params(optimizer):
                p.register_hook(lambda grad: torch.clamp(grad, -args.clip, args.clip))
        else:
            for p in model.parameters():
                p.register_hook(lambda grad: torch.clamp(grad, -args.clip, args.clip))

    if args.prior == 'vi':
        prior = VIPrior(model, device=device)
        parameters['prior'] = prior

        def sample_weights(module: torch.nn.Module, input: torch.Tensor):
            prior.sample_weights(module)

        sample_weights_hook = model.register_forward_pre_hook(sample_weights)

    # Load model checkpoint if available
    start_epoch = 1
    if args.resume:
        if args.checkpoint is None:
            log.error('No checkpoint passed. Specify it using the --checkpoint flag')
            checkpoint = None
        else:
            log.info('Loading the checkpoint at {}'.format(args.checkpoint))
            checkpoint = load_model(args.checkpoint, **parameters)
            start_epoch = checkpoint['epoch']

        if args.wdrop:
            for rnn in model.rnns:
                if isinstance(rnn, WeightDrop):
                    rnn.dropout = args.wdrop
                elif rnn.zoneout > 0:
                    rnn.zoneout = args.wdrop

    saved_models = list()

    result_str = '| Language {} | test loss {:5.2f} | test ppl {:8.2f} | test bpc {:8.3f}'

    def test():
        log.info('=' * 89)
        log.info('Running test set (zero-shot results)...')
        test_loss, avg_loss = evaluate(test_loader, **parameters)
        log.info('Test set finished | test loss {} | test bpc {}'.format(test_loss, test_loss / math.log(2)))
        for lang, avg_l_loss in avg_loss.items():
            langstr = dictionary.idx2lang[lang]
            log.info(result_str.format(langstr, avg_l_loss, math.exp(avg_l_loss), avg_l_loss / math.log(2)))
        log.info('=' * 89)

    if args.train:
        f = 1.
        stored_loss = 1e32
        epochs_no_improve = 0
        val_losses = list()

        # calculate specific language lr
        data_spec_count = sum([len(ds) for l, ds in train_set.data.items()])
        data_spec_avg = data_spec_count / len(train_set.data.items())
        data_spec_lrweights = dict([(l, data_spec_avg / len(ds)) for l, ds in train_set.data.items()])

        # estimate total number of steps
        total_steps = sum([len(ds) // args.bptt for l, ds in train_set.data.items()]) * args.no_epochs
        steps = 0

        try:
            pbar = tqdm.trange(start_epoch, args.no_epochs + 1, position=1, dynamic_ncols=True)
            for epoch in pbar:
                steps = train(train_loader, lr_weights=data_spec_lrweights, **parameters,
                              total_steps=total_steps, steps=steps, scaling=args.scaling,
                              n_samples=args.n_samples, tb_writer=tb_writer)

                val_loss, _ = evaluate(val_loader, **parameters)
                pbar.set_description('Epoch {} | Val loss {}'.format(epoch, val_loss))

                # Save model
                if args.prior == 'vi':
                    sample_weights_hook.remove()
                filename = path.join(args.checkpoint_dir,
                                     '{}_epoch{}{}_{}.pth'.format(timestamp, epoch,
                                                                  '_with_apex' if use_apex else '', args.prior))
                torch.save(make_checkpoint(epoch + 1, **parameters), filename)
                saved_models.append(filename)
                if args.prior == 'vi':
                    sample_weights_hook = model.register_forward_pre_hook(sample_weights)

                # Early stopping
                if val_loss < stored_loss:
                    epochs_no_improve = 0
                    stored_loss = val_loss
                else:
                    epochs_no_improve += 1
                if epochs_no_improve == args.patience:
                    log.info('Early stopping at epoch {}'.format(epoch))
                    break

                val_losses.append(val_loss)

                # Reduce lr every 1/3 total epochs
                if epoch - 1 > f / 3 * args.no_epochs:
                    log.info('Epoch {}/{}. Dividing LR by 10'.format(epoch, args.no_epochs))
                    for g in optimizer.param_groups:
                        g['lr'] = g['lr'] / 10
                    f += 1.
            test()
        except KeyboardInterrupt:
            log.info('Registered KeyboardInterrupt. Stopping training.')
            log.info('Saving last model to disk')

            if args.prior == 'vi':
                sample_weights_hook.remove()

            torch.save(make_checkpoint(epoch, **parameters),
                       path.join(args.checkpoint_dir,
                                 '{}_epoch{}{}_{}.pth'.format(timestamp, epoch,
                                                              '_with_apex' if use_apex else '', args.prior)))
            return
    elif args.test:
        test()

    # Only test on existing languages if there are no held out languages
    if not args.target_langs:
        exit(0)

    importance = 1e-5

    # If use UNIV, calculate informed prior, else use boring prior
    if args.prior == 'laplace':
        if not isinstance(prior, LaplacePrior):  # only calculate matrix if it is not supplied.
            log.info('Creating laplace approximation dataset')
            laplace_set = Dataset(data_splits['train'], batchsize=args.batchsize, bptt=100, reset_on_iter=True)
            laplace_loader = DataLoader(laplace_set, num_workers=args.workers)
            log.info('Creating Laplacian prior')
            prior = LaplacePrior(model, loss_function, laplace_loader, use_apex=use_apex, amp=amp, device=device)
            parameters['prior'] = prior

            torch.save(make_checkpoint('fisher_matrix', **parameters),
                       path.join(args.checkpoint_dir,
                                 '{}_fishers_matrix{}_{}.pth'.format(timestamp,
                                                                     '_with_apex' if use_apex else '', args.prior)))
        importance = 1e5
    elif args.prior == 'ninf':
        log.info('Creating non-informative Gaussian prior')
        parameters['prior'] = GaussianPrior()
    elif args.prior == 'vi':
        importance = 1e-5
    elif args.prior == 'hmc':
        raise NotImplementedError
    else:
        raise ValueError(f'Passed prior {args.prior} is not an implemented inference technique.')

    best_model = saved_models[-1] if not len(saved_models) == 0 else args.checkpoint

    # Remove sampling hook from model
    if args.prior == 'vi':
        sample_weights_hook.remove()

    # Refine on 100 samples on each target
    if args.refine:
        # reset learning rate
        optimizer.param_groups[0]['lr'] = args.lr

        loss = 0
        results = dict()

        # Create individual tests sets
        test_sets = dict()
        for lang, lang_d in data_splits['test'].items():
            test_sets[lang] = DataLoader(
                Dataset({lang: lang_d}, make_config=True, batchsize=args.test_batchsize,
                        bptt=args.bptt, eval=True),
                num_workers=args.workers)

        for lang, lang_data in tqdm.tqdm(refine_set.items()):
            final_loss = False
            refine_dataloader = DataLoader(lang_data, num_workers=args.workers)
            load_model(best_model, **parameters)

            log.info(f'Refining for language {dictionary.idx2lang[lang]}')
            for epoch in range(1, args.refine_epochs + 1):
                refine(refine_dataloader, **parameters, importance=importance)
                if epoch % 5 == 0:
                    final_loss = True
                    loss, avg_loss = evaluate(test_sets[lang], model, loss_function, only_l=lang,
                                              report_all=True, device=device)
                    for lang, avg_l_loss in avg_loss.items():
                        langstr = dictionary.idx2lang[lang]
                        log.debug(result_str.format(langstr, avg_l_loss, math.exp(avg_l_loss),
                                                    avg_l_loss / math.log(2)))

            if not final_loss:
                loss, avg_loss = evaluate(test_sets[lang], model, loss_function, only_l=lang,
                                          report_all=True, device=device)

            for lang, avg_l_loss in avg_loss.items():
                langstr = dictionary.idx2lang[lang]
                log.info(result_str.format(langstr, avg_l_loss, math.exp(avg_l_loss), avg_l_loss / math.log(2)))
                results[lang] = avg_l_loss

        log.info('=' * 89)
        log.info('FINAL FEW SHOT RESULTS: ')
        log.info('=' * 89)
        for lang, avg_l_loss in results.items():
            langstr = dictionary.idx2lang[lang]
            log.info(result_str.format(langstr, avg_l_loss, math.exp(avg_l_loss), avg_l_loss / math.log(2)))
        log.info('=' * 89)
import os
import argparse

import numpy as np
import torch

from data import Corpus
from model import RNNModel


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('-l', '--load_path', default=None)
    args = parser.parse_args()
    return args


cl_args = parse_args()

dataset = Corpus()
dataset.process_data()
sos = dataset.target_dict.word2idx['<sos>']
eos = dataset.target_dict.word2idx['<eos>']

args = np.load(os.path.join(cl_args.load_path, 'args.npy')).tolist()
model = RNNModel(args).cuda()
model.eval()

if cl_args.load_path:
    file = os.path.join(cl_args.load_path, 'model.pt')
    model.load_state_dict(torch.load(file))

itr = dataset.create_epoch_iterator('test', 1)
for i in range(50):
    source, target = next(itr)
    output = model.sample(source, sos, eos)
BPTT = args.bptt
BSZ = args.bsz
EVAL_BSZ = 10
LR = args.lr
CLIP = args.clip
########################################################################
PRINT_EVERY = args.log
CUDA = args.cuda
########################################################################

# save decoders
DECODER = open("decoder.json", "w")
ENCODER = open("encoder.json", "w")

# read data
corpus = Corpus(args.data, CUDA)
vocab_size = len(corpus.dictionary)
print("|V|", vocab_size)

# turn into batches
training_data = batchify(corpus.train, BSZ, CUDA)
validation_data = batchify(corpus.valid, EVAL_BSZ, CUDA)

# set loss function
loss_function = nn.CrossEntropyLoss()

# Load the best saved model or initialize new one
if args.load:
    print('loading')
    with open(args.save, 'rb') as f:
        model = torch.load(f)
def generate(args):
    cuda = torch.cuda.is_available()

    # Set the random seed manually for reproducibility.
    torch.manual_seed(args.seed)

    if args.temperature < 1e-3:
        parser.error("--temperature has to be greater or equal 1e-3")

    with open(args.checkpoint, 'rb') as f:
        try:
            model = torch.load(f)
        except:
            # Convert the model to CPU if the model is serialized on GPU.
            model = torch.load(f, map_location='cpu')
    model.eval()

    sos = SOS_CHAR if args.use_chars else SOS
    eos = EOS_CHAR if args.use_chars else EOS
    unk = UNK_CHAR if args.use_chars else UNK

    data_dir = os.path.expanduser(args.data_dir)
    corpus = Corpus(data_dir, headers=args.no_headers, lower=args.lower, chars=args.use_chars)
    ntokens = len(corpus.dictionary)

    model_data_checks(model, corpus, args)

    if args.start:
        start = list(args.start) if args.use_chars else args.start.split()
        input = start = [word.lower() for word in start] if args.lower else start
        if len(input) < model.order:
            input = (model.order - len(input)) * [sos] + input
        elif len(input) > model.order:
            input = input[-model.order:]
    else:
        start = input = [sos] * model.order

    input = [word if word in corpus.dictionary.w2i else unk for word in input]
    ids = [corpus.dictionary.w2i[word] for word in input]
    input = Variable(torch.LongTensor(ids).unsqueeze(0))
    input = input.cuda() if cuda else input

    glue = '' if args.use_chars else ' '
    with open(args.outf, 'w') as outf:
        if args.start:
            outf.write(glue.join(start) + glue)
        for i in range(args.num_samples):
            output = model(input)
            word_weights = output.squeeze().div(args.temperature).exp().cpu()
            if args.no_unk:
                word_weights[corpus.dictionary.w2i[unk]] = 0
            word_idx = torch.multinomial(word_weights, 1)[0]
            word_idx = word_idx.data[0]
            word = corpus.dictionary.i2w[word_idx]
            ids.append(word_idx)
            input = Variable(torch.LongTensor(ids[-model.order:]).unsqueeze(0))
            input = input.cuda() if cuda else input
            # Compare by value, not identity.
            if word == sos and args.no_sos:
                continue
            elif word == eos:
                outf.write('\n')
            else:
                outf.write(word + glue)
            if i % 100 == 0:
                print('| Generated {}/{} words'.format(i, args.num_samples))
    print(f'Results saved in `{args.outf}`.')
import os

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import numpy as np
from keras.models import load_model
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score

from util import data_generator, generator_y_true
from data import Vocab, EmojiVocab, Corpus

model = load_model(os.path.join('weight', args.model))

# load corpus and vocab
vocab = Vocab(20000)  # 20k
emoji_vocab = EmojiVocab(40)
corpus = Corpus(vocab, emoji_vocab, debug=False, eval=True)
encoded_test = corpus.encoded_test

# evaluation
y_pred = model.predict_generator(
    data_generator(encoded_test, args.batch_size, args.step_size, len(emoji_vocab)),
    len(encoded_test[0]) // (args.batch_size * args.step_size),
    verbose=1)

target_names = [emoji_vocab.decode(x) for x in range(len(emoji_vocab))]
y_true = list(
    np.array(generator_y_true(encoded_test, args.batch_size, args.step_size, len(emoji_vocab))).reshape(-1))
def load_corpus(self):
    self.corpus = Corpus(self.vocab, self.config.debug)
    self.encoded_train = np.array(self.corpus.encoded_train)
    self.encoded_dev = np.array(self.corpus.encoded_dev)
    self.encoded_test = np.array(self.corpus.encoded_test)
# Hyperparameters
nepoch = 6
batch_size = 20
eval_batch_size = 10
bptt_len = 20
emsize = 200
nhid = 256
nlayers = 2
rnn_type = 'LSTM'
lr = 20
clip_coefficient = 0.5
cuda = True
print_every = 200

# Prepare the data
corpus = Corpus('./data')
ntokens = len(corpus.dictionary)
train_data = batchify(corpus.train, batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

# Build the model
model = RNNModel(rnn_type, ntokens, emsize, nhid, nlayers)
if cuda:
    model.cuda()
loss_fn = nn.functional.cross_entropy

prev_val_loss = None
for epoch in range(1, nepoch + 1):
    epoch_start_time = time.time()
    train()
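# Note: `batchify` is not shown in this snippet. A minimal sketch in the style of the
# PyTorch word_language_model example (an assumption about this snippet's actual helper):
def batchify(data, bsz):
    # Drop tokens that do not fit into a whole number of columns, then reshape the
    # 1-D token tensor into (sequence_length, batch_size) so bptt-length slices can
    # be read column-wise during training.
    nbatch = data.size(0) // bsz
    data = data.narrow(0, 0, nbatch * bsz)
    return data.view(bsz, -1).t().contiguous()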
import os

from data import Corpus

path = "wikitext-2"
corpus = Corpus(path)

# Write each split back out, one token per line, closing each file handle.
with open(os.path.join(path, 'train_.txt'), 'w') as f:
    f.writelines("%s\n" % t for t in corpus.train)
with open(os.path.join(path, 'test_.txt'), 'w') as f:
    f.writelines("%s\n" % t for t in corpus.test)
with open(os.path.join(path, 'valid_.txt'), 'w') as f:
    f.writelines("%s\n" % t for t in corpus.valid)
def main():
    parser = argparse.ArgumentParser(description='Baseline RNN Language Model')
    parser.add_argument('--data', type=str, default='./data/', help='location of the data corpus')
    parser.add_argument('--test_path', type=str, default=None,
                        help='location of the test corpus to calculate word or character-level perplexity')
    parser.add_argument('--input', type=str, default='word',
                        help='input level (word, grapheme, bpe, syllable, morfessor, char)')
    parser.add_argument('--model', type=str, default='LSTM',
                        help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU)')
    parser.add_argument('--emsize', type=int, default=650, help='size of word embeddings')
    parser.add_argument('--nhid', type=int, default=650, help='number of hidden units per layer')
    parser.add_argument('--nlayers', type=int, default=2, help='number of layers')
    parser.add_argument('--lr', type=float, default=1, help='initial learning rate')
    parser.add_argument('--clip', type=float, default=10, help='gradient clipping')
    parser.add_argument('--epochs', type=int, default=40, help='upper epoch limit')
    parser.add_argument('--batch_size', type=int, default=20, metavar='N', help='batch size')
    parser.add_argument('--bptt', type=int, default=35, help='sequence length')
    parser.add_argument('--dropout', type=float, default=0.5, help='dropout applied to layers (0 = no dropout)')
    parser.add_argument('--seed', type=int, default=234, help='random seed')
    parser.add_argument('--cuda', action='store_true', help='use CUDA')
    parser.add_argument('--log-interval', type=int, default=200, metavar='N', help='report interval')
    parser.add_argument('--save', type=str, default='model.pt', help='path to save the final model')
    parser.add_argument('--onnx_export', type=str, default='', help='path to export the final model in onnx format')
    # =====
    parser.add_argument("--corr_type", type=str, default="word")  # word, char, bpe
    # =====
    args = parser.parse_args()

    # Set the random seed manually for reproducibility.
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        if not args.cuda:
            print("WARNING: You have a CUDA device, so you should probably run with --cuda")

    device = torch.device("cuda" if args.cuda else "cpu")

    ###############################################################################
    # Load data
    ###############################################################################

    # corpus = reader.Corpus(args.data, args.input)
    corpus = Corpus(args.data)

    eval_batch_size = 10
    # train_data = batchify(corpus.train, args.batch_size, device)
    # val_data = batchify(corpus.valid, eval_batch_size, device)
    test_data = batchify(corpus.test, eval_batch_size, device)

    ###############################################################################
    # Evaluation code
    ###############################################################################

    # Load the best saved model.
    with open(args.save, 'rb') as f:
        if args.cuda:
            model = torch.load(f)
        else:
            model = torch.load(f, map_location='cpu')
        # after load the rnn params are not a continuous chunk of memory
        # this makes them a continuous chunk, and will speed up forward pass
        model.rnn.flatten_parameters()

    # Run on test
    import numpy as np
    correctness = get_argmax_correctness(args, model, test_data, eval_batch_size)

    idx2word = corpus.dictionary.idx2word
    orig_words = [idx2word[z] for z in test_data.t().contiguous().view(-1).numpy()]
    assert len(orig_words) == len(correctness)

    my_eval(correctness, orig_words, args.corr_type)
args = p.parse_args()
logging.basicConfig(level=logging.INFO)

# Check whether GPU is present
if args.enable_cuda and torch.cuda.is_available():
    enable_cuda = True
    torch.cuda.set_device(1)
    logging.info("CUDA is enabled")
else:
    enable_cuda = False
    logging.info("CUDA is disabled")

# Prepare corpus, encoder and decoder
corpus = Corpus(args.english_train, args.french_train, args.batch_size, args.num_symbols,
                args.min_count, args.lower, args.enable_cuda)

if args.enc_type.lower() == "transformer":
    encoder = TransformerEncoder(args.dim, corpus.vocab_size_e, corpus.max_pos, enable_cuda)
else:
    encoder = Encoder(args.dim, corpus.vocab_size_e, corpus.max_pos, args.enc_type, enable_cuda)

valid = corpus.load_data(args.english_valid, args.french_valid)
eos = corpus.dict_f.word2index["</s>"]
decoder = Decoder(args.dim, corpus.vocab_size_f, eos, corpus.longest_english, args.dec_type, args.attention)

if enable_cuda:
    encoder.cuda()
    decoder.cuda()
hidden_size = cfg['model']['hidden_size']
nlayers = cfg['model']['nlayers']
batch_size = cfg['model']['batch_size']
input_size = cfg['model']['input_size']
epochs = cfg['model']['epochs']
lr = float(cfg['model']['lr'])
seq_len = cfg['model']['seq_len']

# (self, C, nlayers, vocab_size, input_size, hidden_size, lr)
corpus_loc = cfg['corpus_loc']
print('test')
print(corpus_loc)

corpus = Corpus(corpus_loc)
vocab_size = len(corpus.dict)

train_batch_size = cfg['model']['train_batch_size']
eval_batch_size = cfg['model']['eval_batch_size']
test_batch_size = cfg['model']['test_batch_size']

corpus_train = batchify(corpus.train, train_batch_size)
corpus_valid = batchify(corpus.val, eval_batch_size)
corpus_test = batchify(corpus.test, test_batch_size)

batch_no = 0 % corpus_train.shape[0]
batch = get_batch(corpus_train, batch_no, train_batch_size)
    heads = mst(S)

    # Predict labels
    select = torch.LongTensor(heads).unsqueeze(0).expand(S_lab.size(0), -1)
    select = Variable(select)
    selected = torch.gather(S_lab, 1, select.unsqueeze(1)).squeeze(1)
    _, labels = selected.max(dim=0)
    labels = labels.data.numpy()
    return heads, labels


if __name__ == '__main__':
    data_path = '../../stanford-ptb'
    vocab_path = 'vocab/train'
    model_path = 'models/model.pt'

    dictionary = Dictionary(vocab_path)
    corpus = Corpus(data_path=data_path, vocab_path=vocab_path)
    model = torch.load(model_path)

    batches = corpus.train.batches(1)
    words, tags, heads, labels = next(batches)

    S_arc, S_lab = model(words, tags)
    plot(S_arc, heads)

    words = tags = [1, 2, 3, 4]
    heads_pred, labels_pred = predict(model, words, tags)
    print(heads_pred, '\n', heads[0].data.numpy())
    print(labels_pred, '\n', labels[0].data.numpy())
import numpy as np

from config import load_config


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', '--path', required=True)
    parser.add_argument('-s', '--save_path', default='./')
    parser.add_argument('-l', '--load_path', default=None)
    args = parser.parse_args()
    return args


args = parse_args()
cf = load_config(args.path)

dataset = Corpus()
dataset.process_data()
cf.ntokens_source = len(dataset.source_dict)
cf.ntokens_target = len(dataset.target_dict)

if not os.path.exists(args.save_path):
    os.makedirs(args.save_path)

criterion = nn.CrossEntropyLoss(ignore_index=dataset.target_dict.word2idx['<pad>'])
model = RNNModel(cf).cuda()
optimizer = torch.optim.Adam(model.parameters(), weight_decay=1e-4)

if args.load_path:
from data import Corpus

corpus = Corpus.load_from_folder("data/docs")
corpus.auto_tags()
corpus.save_conllu("todo/vi_corpus_v1_todo.conllu", write_status=True)

tagged_corpus = Corpus.load_from_conllu_file("vi_corpus_v1.conllu")
print(0)

# content = "\n\n".join(tokenizes)
# open("tmp/tokenize_data.txt", "w").write(content)
# restoring model
savepath = params['filepath'].get('ckpt')
ckpt = torch.load(savepath)
vocab = ckpt['vocab']
model = SeNet(num_classes=params['num_classes'], vocab=vocab)
model.load_state_dict(ckpt['model_state_dict'])
model.eval()

# create dataset, dataloader
tagger = Okt()
padder = PadSequence(length=30)
tst_data = read_data(params['filepath'].get('tst'))
tst_data = remove_na(tst_data)
tst_dataset = Corpus(tst_data, vocab, tagger, padder)
tst_dataloader = DataLoader(tst_dataset, batch_size=128)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# evaluation
correct_count = 0
for x_mb, y_mb in tqdm(tst_dataloader):
    x_mb = x_mb.to(device)
    y_mb = y_mb.to(device)
    with torch.no_grad():
        y_mb_hat = model(x_mb)
        y_mb_hat = torch.max(y_mb_hat, 1)[1]
        correct_count += (y_mb_hat == y_mb).sum().item()
weightDecay = args.wd
K = args.K

torch.set_num_threads(1)

torch.manual_seed(seed)
random.seed(seed)
torch.cuda.set_device(gpuId)
torch.cuda.manual_seed(seed)

corpus = Corpus(sourceTrainFile=sourceTrainFile, sourceOrigTrainFile=sourceOrigTrainFile,
                targetTrainFile=targetTrainFile, sourceDevFile=sourceDevFile,
                sourceOrigDevFile=sourceOrigDevFile, targetDevFile=targetDevFile,
                minFreqSource=minFreqSource, minFreqTarget=minFreqTarget, maxTokenLen=maxLen)

print('Source vocabulary size: ' + str(corpus.sourceVoc.size()))
print('Target vocabulary size: ' + str(corpus.targetVoc.size()))
print()
print('# of training samples: ' + str(len(corpus.trainData)))
print('# of develop samples: ' + str(len(corpus.devData)))
print('Random seed: ', str(seed))

useSmallSoftmax = (K > 0 and K <= corpus.targetVoc.size())

if useSmallSoftmax:
torch.cuda.manual_seed(args.seed)

# Config to run
config = Config()

if os.path.isfile(args.save):
    checkpoint = torch.load(args.save)
    if 'config' in checkpoint:
        print("Loading saved config")
        config = checkpoint['config']

print(config)

# Dictionary and corpus
dictionary = Dictionary()
training_corpus = Corpus(args.data + "/train.txt", dictionary, create_dict=True, use_cuda=args.cuda,
                         n_gram=config.n_gram, context_mode=config.context_mode)
validation_corpus = Corpus(args.data + "/valid.txt", dictionary, create_dict=True, use_cuda=args.cuda,
                           n_gram=config.n_gram, context_mode=config.context_mode)

# TensorboardX object
writer = SummaryWriter("saved_runs/" + args.save)

# Word embeddings
embedding = nn.Embedding(len(dictionary), config.em_size, padding_idx=0)

if config.pre_trained: