def __init__(self, text=None):
    offset = 1
    super().__init__(kVocabStart - offset)
    save_path = path.join(kPrepDataDir, 'vocab.pt')
    # Reuse a cached vocabulary if one has already been pickled.
    if path.exists(save_path):
        with open(save_path, 'rb') as f:
            vocabulary = pickle.load(f)
        self.encoder = WhitespaceEncoder(vocabulary)
        return
    # Otherwise build the encoder from the raw text and cache its vocabulary.
    self.encoder = WhitespaceEncoder(text)
    with open(save_path, 'wb') as f:
        pickle.dump(self.vocab()[offset:], f)

class TestPrintRandomSample(unittest.TestCase):

    def setUp(self):
        self.output_text_encoder = WhitespaceEncoder(['a b c d e'], append_eos=False)
        self.input_text_encoder = WhitespaceEncoder(['a b c d e'], append_eos=False)
        predictions = [
            self.output_text_encoder.encode('a b c d d').tolist(),
            self.output_text_encoder.encode('a a a a a').tolist(),
            self.output_text_encoder.encode('b b b b b').tolist(),
        ]
        targets = [
            self.output_text_encoder.encode('a b c d e').tolist(),
            self.output_text_encoder.encode('a a a a a').tolist(),
            self.output_text_encoder.encode('b b b b b').tolist(),
        ]
        sources, targets, outputs = get_batch(
            predictions=predictions,
            targets=targets,
            vocab_size=self.output_text_encoder.vocab_size)
        self.sources = sources
        self.targets = targets
        self.outputs = outputs

    def test_ignore_index_none(self):
        print_random_sample(self.sources, self.targets, self.outputs,
                            self.input_text_encoder, self.output_text_encoder,
                            n_samples=1)

    def test_ignore_index(self):
        print_random_sample(self.sources, self.targets, self.outputs,
                            self.input_text_encoder, self.output_text_encoder,
                            n_samples=1,
                            ignore_index=self.output_text_encoder.stoi['e'])

    def test_n_samples_big(self):
        print_random_sample(self.sources, self.targets, self.outputs,
                            self.input_text_encoder, self.output_text_encoder,
                            n_samples=40)

def test_spacy_encoder():
    input_ = 'This is a sentence'
    encoder = WhitespaceEncoder([input_])
    tokens = encoder.encode(input_)
    assert encoder.decode(tokens) == input_

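# Added companion sketch (not from the original test file; the test name and
# assertions below are assumptions): the encoder's vocabulary holds the
# whitespace-split tokens plus the library's reserved tokens, and encode()
# maps each token to its vocabulary index, consistent with the stoi lookup
# and the round-trip test shown elsewhere in this section.
def test_whitespace_encoder_vocab():
    input_ = 'This is a sentence'
    encoder = WhitespaceEncoder([input_])
    assert 'sentence' in encoder.vocab
    assert encoder.encode(input_).tolist() == [
        encoder.stoi[token] for token in input_.split()
    ]
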
if args.gpu >= 0:
    torch.cuda.set_device(args.gpu)

# load dataset
train, dev, test = snli_dataset(train=True, dev=True, test=True)

# Preprocess
for row in datasets_iterator(train, dev, test):
    row['premise'] = row['premise'].lower()
    row['hypothesis'] = row['hypothesis'].lower()

# Make Encoders
sentence_corpus = [row['premise'] for row in datasets_iterator(train, dev, test)]
sentence_corpus += [row['hypothesis'] for row in datasets_iterator(train, dev, test)]
sentence_encoder = WhitespaceEncoder(sentence_corpus)
label_corpus = [row['label'] for row in datasets_iterator(train, dev, test)]
label_encoder = IdentityEncoder(label_corpus)

# Encode
for row in datasets_iterator(train, dev, test):
    row['premise'] = sentence_encoder.encode(row['premise'])
    row['hypothesis'] = sentence_encoder.encode(row['hypothesis'])
    row['label'] = label_encoder.encode(row['label'])

config = args
config.n_embed = sentence_encoder.vocab_size
config.d_out = label_encoder.vocab_size
config.n_cells = config.n_layers

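# Hedged follow-on sketch (not part of the original script): the encoded
# premises and hypotheses are variable-length LongTensors, so they must be
# padded before they can be stacked into a batch. torch's pad_sequence is one
# way to do this; padding_value=0 assumes index 0 is the encoder's padding index.
from torch.nn.utils.rnn import pad_sequence

example_premises = [train[i]['premise'] for i in range(8)]
padded_premises = pad_sequence(example_premises, batch_first=True, padding_value=0)
print(padded_premises.shape)  # (8, length of the longest premise in the batch)
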
from numpy import array  # needed for the label array below
from torchnlp.text_encoders import WhitespaceEncoder
from torchnlp.utils import pad_tensor
import matplotlib.pyplot as plt

# documents/words and their labels
docs = [
    'China', 'Italy', 'Germany', 'USA', 'Canada',
    'Beijing', 'Rome', 'Berlin', 'Washington DC', 'Ottawa'
]
# define class labels
labels = array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])

# we use integers to encode/represent each document's words,
# here with torchnlp's WhitespaceEncoder
t = WhitespaceEncoder(docs)
# t.vocab

# encode the whole document
encoded_docs = [t.encode(x) for x in docs]
print("encoded_docs is:")
print(encoded_docs)
# encoded_docs will look like this
# [tensor([5]),
#  tensor([6]),
#  tensor([7]),
#  tensor([8]),
#  tensor([9]),
#  tensor([10]),
#  tensor([11]),
#  tensor([12]),

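# Hedged follow-on sketch (not in the original snippet): 'Washington DC'
# encodes to two tokens while the other docs encode to one, so the encoded
# docs have different lengths. torchnlp's pad_tensor (already imported above)
# can pad them to a common length before stacking; the default padding index
# is assumed to be the library's padding token.
import torch

max_length = max(len(t_) for t_ in encoded_docs)
padded_docs = torch.stack([pad_tensor(t_, max_length) for t_ in encoded_docs])
print(padded_docs.shape)  # expected: torch.Size([10, 2])
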
def load_data(data_type, preprocessing=False, fine_grained=False, verbose=False,
              text_length=5000, encode=True, load_SLE=False):
    if data_type == 'imdb':
        train_data, test_data = imdb_dataset(preprocessing=preprocessing, verbose=verbose,
                                             text_length=text_length)
    elif data_type == 'newsgroups':
        train_data, test_data = newsgroups_dataset(preprocessing=preprocessing, verbose=verbose,
                                                   text_length=text_length)
    elif data_type == 'reuters':
        train_data, test_data = reuters_dataset(preprocessing=preprocessing, fine_grained=fine_grained,
                                                verbose=verbose, text_length=text_length)
    elif data_type == 'webkb':
        train_data, test_data = webkb_dataset(preprocessing=preprocessing, verbose=verbose,
                                              text_length=text_length)
    elif data_type == 'cade':
        train_data, test_data = cade_dataset(preprocessing=preprocessing, verbose=verbose,
                                             text_length=text_length)
    elif data_type == 'dbpedia':
        train_data, test_data = dbpedia_dataset(preprocessing=preprocessing, verbose=verbose,
                                                text_length=text_length)
    elif data_type == 'agnews':
        train_data, test_data = agnews_dataset(preprocessing=preprocessing, verbose=verbose,
                                               text_length=text_length)
    elif data_type == 'yahoo':
        train_data, test_data = yahoo_dataset(preprocessing=preprocessing, verbose=verbose,
                                              text_length=text_length)
    elif data_type == 'sogou':
        train_data, test_data = sogou_dataset(preprocessing=preprocessing, verbose=verbose,
                                              text_length=text_length)
    elif data_type == 'yelp':
        train_data, test_data = yelp_dataset(preprocessing=preprocessing, fine_grained=fine_grained,
                                             verbose=verbose, text_length=text_length)
    elif data_type == 'amazon':
        train_data, test_data = amazon_dataset(preprocessing=preprocessing, fine_grained=fine_grained,
                                               verbose=verbose, text_length=text_length)
    elif data_type == 'custom':
        test_data = custom_dataset(preprocessing=preprocessing, fine_grained=fine_grained,
                                   verbose=verbose, text_length=text_length)
        # reuse the previously saved sentence/label encoders for the custom split
        sentence_encoder = pickle.load(open('epochs/sentence_encoder', 'rb'))
        label_encoder = pickle.load(open('epochs/label_encoder', 'rb'))
        for row in datasets_iterator(test_data):
            row['text'] = sentence_encoder.encode(' '.join(row['text']))
            row['label'] = label_encoder.encode(row['label'])
        return sentence_encoder.vocab_size, label_encoder.vocab_size, test_data
    else:
        raise ValueError('{} data type not supported.'.format(data_type))

    if encode:
        if load_SLE:
            sentence_encoder = pickle.load(open('epochs/sentence_encoder', 'rb'))
            label_encoder = pickle.load(open('epochs/label_encoder', 'rb'))
        else:
            sentence_corpus = [row['text'] for row in datasets_iterator(train_data)]
            label_corpus = [row['label'] for row in datasets_iterator(train_data)]
            sentence_encoder = WhitespaceEncoder(sentence_corpus,
                                                 reserved_tokens=[PADDING_TOKEN, UNKNOWN_TOKEN])
            label_encoder = IdentityEncoder(label_corpus, reserved_tokens=[])
            with open('epochs/sentence_encoder', 'wb') as f:
                pickle.dump(sentence_encoder, f)
            with open('epochs/label_encoder', 'wb') as f:
                pickle.dump(label_encoder, f)

        # Encode
        for row in datasets_iterator(train_data, test_data):
            row['text'] = sentence_encoder.encode(row['text'])
            row['label'] = label_encoder.encode(row['label'])
        return sentence_encoder.vocab_size, label_encoder.vocab_size, train_data, test_data
    else:
        return train_data, test_data

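# Hedged usage sketch (call site assumed, not taken from the original repo):
# with encode=True, load_data returns the vocabulary size, the number of
# label classes, and the encoded train/test splits.
if __name__ == '__main__':
    vocab_size, num_class, train_data, test_data = load_data(
        'imdb', preprocessing=True, encode=True)
    print('vocab size: {}, classes: {}'.format(vocab_size, num_class))
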
def encoder(input_):
    return WhitespaceEncoder([input_])

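# Hedged sketch of how this helper might be exercised (test name and sample
# string are assumptions, not from the source): it checks the encode/decode
# round trip already demonstrated earlier in this section.
def test_encoder_roundtrip():
    sample = 'hello world'
    whitespace_encoder = encoder(sample)
    assert whitespace_encoder.decode(whitespace_encoder.encode(sample)) == sample
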
def main():
    ROOT_DIR = os.path.join(str(Path.home()), '.torchtext')

    # define parameters and hyperparameters
    args = {
        'data_dir': ROOT_DIR,
        'use_cuda': True,
        'test_batch_size': 128,
        'dev_size': 0.1,
        'checkpoint': True,
        'early_stopping': False,
        'epochs': 5,
        'd_embedding': 300,
        'word_vectors': 'glove.840B.300d',
        'word_vectors_freeze': True,
        'vector_cache_dir': os.path.join(ROOT_DIR, 'vector_cache'),
        'momentum': .9,
        'seed': 42,
        'visdom_env': 'main',
    }
    args = Args(**args)

    vis = visdom.Visdom()
    if not vis.check_connection():
        raise RuntimeError(
            "Visdom server not running. Please run python -m visdom.server")

    torch.manual_seed(args.seed)
    device = torch.device('cuda' if args.use_cuda else 'cpu')

    # Load dataset splits
    train, test = trec_dataset(train=True, test=True, directory=args.data_dir)

    # Create encoders (TODO: best way to persist those?)
    text_corpus = [row['text'] for row in datasets_iterator(train, test)]
    text_encoder = WhitespaceEncoder(text_corpus)
    label_corpus = [row['label'] for row in datasets_iterator(train, test)]
    label_encoder = LabelEncoder(label_corpus)

    # encode dataset splits
    for row in datasets_iterator(train, test):
        row['text'] = text_encoder.encode(row['text'])
        row['label'] = label_encoder.encode(row['label'])

    # create sampler for train / dev split used in dataloader
    train_sampler, dev_sampler = train_test_split_sampler(
        train, test_size=args.dev_size, random_state=args.seed)

    def delete_checkpoint(path):
        checkpoint_files = list(path.glob('checkpoint_model*.pth'))
        if checkpoint_files:
            os.remove(checkpoint_files[0])

    visdom_logger = VisdomRunSummaryLogger(env=args.visdom_env,
                                           clear_batch_summary=True)

    # TODO: abstract this part
    run_config = {'run': 0}

    # train function
    def train_f(config):
        run_name = 'run_%d' % run_config['run']
        run_config['run'] = run_config['run'] + 1
        visdom_logger.new_run(run_name)

        model_path = Path('/tmp/models/')
        delete_checkpoint(model_path)

        train_batch_sampler = FlexibleBucketBatchSampler(
            train, config.batch_size, sampler=train_sampler, drop_last=True,
            sort_key=lambda r: len(r['text']))
        train_loader = DataLoader(train,
                                  batch_sampler=train_batch_sampler,
                                  collate_fn=collate_fn,
                                  pin_memory=config.use_cuda,
                                  num_workers=0)

        dev_batch_sampler = FlexibleBucketBatchSampler(
            train, config.test_batch_size, drop_last=True, sampler=dev_sampler,
            sort_key=lambda r: len(r['text']))
        dev_loader = DataLoader(train,
                                batch_sampler=dev_batch_sampler,
                                collate_fn=collate_fn,
                                pin_memory=config.use_cuda,
                                num_workers=0)

        test_sampler = BucketBatchSampler(test, config.test_batch_size,
                                          drop_last=True,
                                          sort_key=lambda r: len(r['text']))
        test_loader = DataLoader(test,
                                 batch_sampler=test_sampler,
                                 collate_fn=collate_fn,
                                 pin_memory=config.use_cuda,
                                 num_workers=0)

        embedding = nn.Embedding(text_encoder.vocab_size, config.d_embedding)
        if config.word_vectors_freeze:
            embedding.weight.requires_grad = False
        if config.word_vectors:
            # Load word vectors
            word_vectors = word_to_vector.aliases[config.word_vectors](
                cache=config.vector_cache_dir)
            for i, token in enumerate(text_encoder.vocab):
                embedding.weight.data[i] = word_vectors[token]
            print('Found vectors for %d tokens in vocabulary' %
                  len([t for t in text_encoder.vocab if t in word_vectors.stoi]))

        model = LSTMClassifier(d_in=embedding.embedding_dim,
                               d_out=label_encoder.vocab_size,
                               d_hidden=config.d_hidden,
                               dropout=config.dropout,
                               embedding=embedding)
        model.to(device)

        optimizer_params = list(
            filter(lambda p: p.requires_grad, model.parameters()))
        optimizer = torch.optim.SGD(optimizer_params,
                                    lr=config.lr,
                                    momentum=config.momentum)

        trainer = create_supervised_trainer(model, optimizer, F.nll_loss,
                                            device=device)
        evaluator_train = create_supervised_evaluator(
            model,
            metrics={'accuracy': CategoricalAccuracy(),
                     'nll': Loss(F.nll_loss)},
            device=device)
        evaluator_dev = create_supervised_evaluator(
            model,
            metrics={'accuracy': CategoricalAccuracy(),
                     'nll': Loss(F.nll_loss)},
            device=device)

        visdom_logger.attach_trainer(trainer)
        visdom_logger.attach_evaluator(evaluator_train, trainer, phase='train')
        visdom_logger.attach_evaluator(evaluator_dev, trainer, phase='dev')

        lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
            optimizer, lambda epoch_: 1. / (1 + config.lr_decay * (epoch_ - 1)))

        # scoring function for early stopping and checkpointing
        def score_function(engine):
            dev_loss = engine.state.metrics['nll']
            return -dev_loss

        early_stopping = EarlyStopping(patience=15,
                                       score_function=score_function,
                                       trainer=trainer)

        def checkpoint_score_function(engine):
            dev_accuracy = engine.state.metrics['accuracy']
            return dev_accuracy

        checkpoint = ModelCheckpoint('/tmp/models', 'checkpoint',
                                     score_function=checkpoint_score_function,
                                     n_saved=1, create_dir=True,
                                     score_name="dev_accuracy")

        # let's train!
        train_model(
            model=model,
            trainer=trainer,
            epochs=config.epochs,
            evaluator_train=evaluator_train,
            evaluator_dev=evaluator_dev,
            train_loader=train_loader,
            dev_loader=dev_loader,
            lr_scheduler=lr_scheduler,
            early_stopping=early_stopping if config.early_stopping else None,
            checkpoint=checkpoint if config.checkpoint else None)

        # load checkpointed (best) model and evaluate on test loader
        model = torch.load(list(model_path.glob('checkpoint_model*.pth'))[0])
        test_evaluator = create_supervised_evaluator(
            model,
            metrics={'accuracy': CategoricalAccuracy(),
                     'nll': Loss(F.nll_loss)},
            device=device)
        test_evaluator.run(test_loader)
        metrics = test_evaluator.state.metrics
        print("Test Results: Avg accuracy: {:.2f} Avg loss: {:.2f}".format(
            metrics['accuracy'], metrics['nll']))

        test_evaluator.run(dev_loader)
        metrics = test_evaluator.state.metrics
        return metrics['nll']

    # hyperparameter tuning!
    hp_opt = HPOptimizer(args=args,
                         strategy='gp',
                         space=[
                             Real(0.1, 0.5, name='dropout'),
                             Categorical([50, 100, 150, 200], name='d_hidden'),
                             Real(1e-4, 1, prior='log-uniform', name='lr'),
                             Real(1e-3, 1, prior='log-uniform', name='lr_decay'),
                             Categorical([4, 8, 16, 32, 64, 128], name='batch_size')
                         ])
    hp_opt.add_callback(visdom_logger.run_summary)
    result = hp_opt.minimize(train_f, n_calls=10)
    print(result)