Code example #1
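This example wraps WhitespaceEncoder in a small vocabulary class: if a vocabulary pickled under kPrepDataDir already exists it is loaded and reused, otherwise the encoder is fitted on the supplied text and its vocabulary (minus the first offset entries) is pickled for later runs.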
    def __init__(self, text=None):
        # kVocabStart and kPrepDataDir are constants defined elsewhere in the project
        offset = 1
        super().__init__(kVocabStart - offset)

        # Reuse a previously pickled vocabulary if one exists
        save_path = path.join(kPrepDataDir, 'vocab.pt')
        if path.exists(save_path):
            with open(save_path, 'rb') as f:
                vocabulary = pickle.load(f)
            self.encoder = WhitespaceEncoder(vocabulary)
            return

        # Otherwise fit the encoder on the raw text and cache its vocabulary
        self.encoder = WhitespaceEncoder(text)
        with open(save_path, 'wb') as f:
            pickle.dump(self.vocab()[offset:], f)
Code example #2
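A unittest setUp method, apparently from the project's test suite: it fits WhitespaceEncoder instances on a toy corpus, encodes a few prediction and target sentences, and runs them through get_batch to produce the sources, targets, and outputs used by the tests.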
    def setUp(self):
        self.output_text_encoder = WhitespaceEncoder(['a b c d e'],
                                                     append_eos=False)
        self.input_text_encoder = WhitespaceEncoder(['a b c d e'],
                                                    append_eos=False)
        predictions = [
            self.output_text_encoder.encode('a b c d d').tolist(),
            self.output_text_encoder.encode('a a a a a').tolist(),
            self.output_text_encoder.encode('b b b b b').tolist(),
        ]
        targets = [
            self.output_text_encoder.encode('a b c d e').tolist(),
            self.output_text_encoder.encode('a a a a a').tolist(),
            self.output_text_encoder.encode('b b b b b').tolist(),
        ]
        sources, targets, outputs = get_batch(
            predictions=predictions,
            targets=targets,
            vocab_size=self.output_text_encoder.vocab_size)
        self.sources = sources
        self.targets = targets
        self.outputs = outputs
Code example #3
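A simple round-trip test: the encoder is fitted on a single sentence, and decoding the encoded tensor must reproduce the original input.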
def test_spacy_encoder():
    input_ = 'This is a sentence'
    encoder = WhitespaceEncoder([input_])
    tokens = encoder.encode(input_)
    assert encoder.decode(tokens) == input_
Code example #4
File: train.py  Project: zbyte64/PyTorch-NLP
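A training-script excerpt for the SNLI dataset: premises and hypotheses are lower-cased, a WhitespaceEncoder is fitted on all sentences and an IdentityEncoder on the labels, every row is encoded, and the resulting vocabulary sizes are written into the model configuration.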
if args.gpu >= 0:
    torch.cuda.set_device(args.gpu)

# load dataset
train, dev, test = snli_dataset(train=True, dev=True, test=True)

# Preprocess
for row in datasets_iterator(train, dev, test):
    row['premise'] = row['premise'].lower()
    row['hypothesis'] = row['hypothesis'].lower()

# Make Encoders
sentence_corpus = [row['premise'] for row in datasets_iterator(train, dev, test)]
sentence_corpus += [row['hypothesis'] for row in datasets_iterator(train, dev, test)]
sentence_encoder = WhitespaceEncoder(sentence_corpus)

label_corpus = [row['label'] for row in datasets_iterator(train, dev, test)]
label_encoder = IdentityEncoder(label_corpus)

# Encode
for row in datasets_iterator(train, dev, test):
    row['premise'] = sentence_encoder.encode(row['premise'])
    row['hypothesis'] = sentence_encoder.encode(row['hypothesis'])
    row['label'] = label_encoder.encode(row['label'])

config = args
config.n_embed = sentence_encoder.vocab_size
config.d_out = label_encoder.vocab_size
config.n_cells = config.n_layers
Code example #5
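A small word-classification example: ten one-word "documents" (countries and capitals) are paired with binary labels, a WhitespaceEncoder is fitted on them, and each document is encoded to an integer tensor.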
from numpy import array
from torchnlp.text_encoders import WhitespaceEncoder
from torchnlp.utils import pad_tensor
import matplotlib.pyplot as plt

# documents (single words here) and their labels
docs = [
    'China', 'Italy', 'Germany', 'USA', 'Canada', 'Beijing', 'Rome', 'Berlin',
    'Washington DC', 'Ottawa'
]

# define class labels
labels = array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])

# we use integers to encode/represent the documents' words;
# here torchnlp's WhitespaceEncoder serves as the tokenizer
t = WhitespaceEncoder(docs)
# t.vocab

# encode each document
encoded_docs = [t.encode(x) for x in docs]
print("encoded_docs is:")
print(encoded_docs)
# encoded_docs will look like this:
# [tensor([5]),
#  tensor([6]),
#  tensor([7]),
#  tensor([8]),
#  tensor([9]),
#  tensor([10]),
#  tensor([11]),
#  tensor([12]),
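The excerpt imports pad_tensor and matplotlib but ends before using them. Purely as a sketch that is not part of the original snippet, the encoded documents could be padded to a common length along these lines, assuming torchnlp.utils.pad_tensor(tensor, length) pads a 1-D tensor to the given length:

# Hypothetical continuation, not in the original snippet: pad every encoded
# document to the length of the longest one so they can be stacked later.
max_length = max(len(tensor) for tensor in encoded_docs)
padded_docs = [pad_tensor(tensor, max_length) for tensor in encoded_docs]
print(padded_docs)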
Code example #6
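A dataset-loading helper that dispatches on data_type to the matching dataset loader, then either reuses pickled encoders or fits a WhitespaceEncoder (with padding and unknown reserved tokens) and an IdentityEncoder on the training split, pickles them, encodes both splits, and returns them together with the vocabulary sizes (or returns the raw splits when encode is False).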
def load_data(data_type,
              preprocessing=False,
              fine_grained=False,
              verbose=False,
              text_length=5000,
              encode=True,
              load_SLE=False):
    if data_type == 'imdb':
        train_data, test_data = imdb_dataset(preprocessing=preprocessing,
                                             verbose=verbose,
                                             text_length=text_length)
    elif data_type == 'newsgroups':
        train_data, test_data = newsgroups_dataset(preprocessing=preprocessing,
                                                   verbose=verbose,
                                                   text_length=text_length)
    elif data_type == 'reuters':
        train_data, test_data = reuters_dataset(preprocessing=preprocessing,
                                                fine_grained=fine_grained,
                                                verbose=verbose,
                                                text_length=text_length)
    elif data_type == 'webkb':
        train_data, test_data = webkb_dataset(preprocessing=preprocessing,
                                              verbose=verbose,
                                              text_length=text_length)
    elif data_type == 'cade':
        train_data, test_data = cade_dataset(preprocessing=preprocessing,
                                             verbose=verbose,
                                             text_length=text_length)
    elif data_type == 'dbpedia':
        train_data, test_data = dbpedia_dataset(preprocessing=preprocessing,
                                                verbose=verbose,
                                                text_length=text_length)
    elif data_type == 'agnews':
        train_data, test_data = agnews_dataset(preprocessing=preprocessing,
                                               verbose=verbose,
                                               text_length=text_length)
    elif data_type == 'yahoo':
        train_data, test_data = yahoo_dataset(preprocessing=preprocessing,
                                              verbose=verbose,
                                              text_length=text_length)
    elif data_type == 'sogou':
        train_data, test_data = sogou_dataset(preprocessing=preprocessing,
                                              verbose=verbose,
                                              text_length=text_length)
    elif data_type == 'yelp':
        train_data, test_data = yelp_dataset(preprocessing=preprocessing,
                                             fine_grained=fine_grained,
                                             verbose=verbose,
                                             text_length=text_length)
    elif data_type == 'amazon':
        train_data, test_data = amazon_dataset(preprocessing=preprocessing,
                                               fine_grained=fine_grained,
                                               verbose=verbose,
                                               text_length=text_length)
    elif data_type == 'custom':
        test_data = custom_dataset(preprocessing=preprocessing,
                                   fine_grained=fine_grained,
                                   verbose=verbose,
                                   text_length=text_length)
        with open('epochs/sentence_encoder', 'rb') as f:
            sentence_encoder = pickle.load(f)
        with open('epochs/label_encoder', 'rb') as f:
            label_encoder = pickle.load(f)
        for row in datasets_iterator(test_data):
            row['text'] = sentence_encoder.encode(' '.join(row['text']))
            row['label'] = label_encoder.encode(row['label'])
        return sentence_encoder.vocab_size, label_encoder.vocab_size, test_data
    else:
        raise ValueError('{} data type not supported.'.format(data_type))

    if encode:
        if load_SLE:
            with open('epochs/sentence_encoder', 'rb') as f:
                sentence_encoder = pickle.load(f)
            with open('epochs/label_encoder', 'rb') as f:
                label_encoder = pickle.load(f)
        else:
            sentence_corpus = [
                row['text'] for row in datasets_iterator(train_data)
            ]
            label_corpus = [
                row['label'] for row in datasets_iterator(train_data)
            ]
            sentence_encoder = WhitespaceEncoder(
                sentence_corpus,
                reserved_tokens=[PADDING_TOKEN, UNKNOWN_TOKEN])
            label_encoder = IdentityEncoder(label_corpus, reserved_tokens=[])
            with open('epochs/sentence_encoder', 'wb') as f:
                pickle.dump(sentence_encoder, f)
            with open('epochs/label_encoder', 'wb') as f:
                pickle.dump(label_encoder, f)

        # Encode
        for row in datasets_iterator(train_data, test_data):
            row['text'] = sentence_encoder.encode(row['text'])
            row['label'] = label_encoder.encode(row['label'])
        return sentence_encoder.vocab_size, label_encoder.vocab_size, train_data, test_data
    else:
        return train_data, test_data
Code example #7
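A one-line helper, presumably a pytest fixture, that builds a WhitespaceEncoder over a single input string.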
def encoder(input_):
    return WhitespaceEncoder([input_])
Code example #8
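A complete training script for the TREC question-classification dataset: text and labels are encoded with WhitespaceEncoder and LabelEncoder, a dev split is sampled from the training set, an LSTM classifier is trained with Ignite (optionally with frozen GloVe vectors, early stopping, and checkpointing), progress is logged to Visdom, and dropout, hidden size, learning rate, decay, and batch size are tuned with a 'gp' (Gaussian-process) hyperparameter search.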
def main():
    ROOT_DIR = os.path.join(str(Path.home()), '.torchtext')

    # define parameters and hyperparameters
    args = {
        'data_dir': ROOT_DIR,
        'use_cuda': True,
        'test_batch_size': 128,
        'dev_size': 0.1,
        'checkpoint': True,
        'early_stopping': False,
        'epochs': 5,
        'd_embedding': 300,
        'word_vectors': 'glove.840B.300d',
        'word_vectors_freeze': True,
        'vector_cache_dir': os.path.join(ROOT_DIR, 'vector_cache'),
        'momentum': .9,
        'seed': 42,
        'visdom_env': 'main',
    }

    args = Args(**args)

    vis = visdom.Visdom()
    if not vis.check_connection():
        raise RuntimeError(
            "Visdom server not running. Please run python -m visdom.server")

    torch.manual_seed(args.seed)

    device = torch.device('cuda' if args.use_cuda else 'cpu')

    # Load dataset splits
    train, test = trec_dataset(train=True, test=True, directory=args.data_dir)

    # Create encoders (TODO: best way to persist those?)
    text_corpus = [row['text'] for row in datasets_iterator(train, test)]
    text_encoder = WhitespaceEncoder(text_corpus)

    label_corpus = [row['label'] for row in datasets_iterator(train, test)]
    label_encoder = LabelEncoder(label_corpus)

    # encode dataset splits
    for row in datasets_iterator(train, test):
        row['text'] = text_encoder.encode(row['text'])
        row['label'] = label_encoder.encode(row['label'])

    # create sampler for train / dev split used in dataloader
    train_sampler, dev_sampler = train_test_split_sampler(
        train, test_size=args.dev_size, random_state=args.seed)

    def delete_checkpoint(path):
        checkpoint_files = list(path.glob('checkpoint_model*.pth'))
        if checkpoint_files:
            os.remove(checkpoint_files[0])

    visdom_logger = VisdomRunSummaryLogger(env=args.visdom_env,
                                           clear_batch_summary=True)

    # TODO: abstract this part
    run_config = {'run': 0}

    # train function
    def train_f(config):

        run_name = 'run_%d' % run_config['run']
        run_config['run'] = run_config['run'] + 1

        visdom_logger.new_run(run_name)

        model_path = Path('/tmp/models/')

        delete_checkpoint(model_path)

        train_batch_sampler = FlexibleBucketBatchSampler(
            train,
            config.batch_size,
            sampler=train_sampler,
            drop_last=True,
            sort_key=lambda r: len(r['text']))

        train_loader = DataLoader(train,
                                  batch_sampler=train_batch_sampler,
                                  collate_fn=collate_fn,
                                  pin_memory=config.use_cuda,
                                  num_workers=0)

        dev_batch_sampler = FlexibleBucketBatchSampler(
            train,
            config.test_batch_size,
            drop_last=True,
            sampler=dev_sampler,
            sort_key=lambda r: len(r['text']))

        dev_loader = DataLoader(train,
                                batch_sampler=dev_batch_sampler,
                                collate_fn=collate_fn,
                                pin_memory=config.use_cuda,
                                num_workers=0)

        test_sampler = BucketBatchSampler(test,
                                          config.test_batch_size,
                                          drop_last=True,
                                          sort_key=lambda r: len(r['text']))

        test_loader = DataLoader(test,
                                 batch_sampler=test_sampler,
                                 collate_fn=collate_fn,
                                 pin_memory=config.use_cuda,
                                 num_workers=0)

        embedding = nn.Embedding(text_encoder.vocab_size, config.d_embedding)

        if config.word_vectors_freeze:
            embedding.weight.requires_grad = False

        if config.word_vectors:
            # Load word vectors
            word_vectors = word_to_vector.aliases[config.word_vectors](
                cache=config.vector_cache_dir)
            for i, token in enumerate(text_encoder.vocab):
                embedding.weight.data[i] = word_vectors[token]
            print(
                'Found vectors for %d tokens in vocabulary' %
                len([t for t in text_encoder.vocab if t in word_vectors.stoi]))

        model = LSTMClassifier(d_in=embedding.embedding_dim,
                               d_out=label_encoder.vocab_size,
                               d_hidden=config.d_hidden,
                               dropout=config.dropout,
                               embedding=embedding)
        model.to(device)

        optimizer_params = list(
            filter(lambda p: p.requires_grad, model.parameters()))

        optimizer = torch.optim.SGD(optimizer_params,
                                    lr=config.lr,
                                    momentum=config.momentum)

        trainer = create_supervised_trainer(model,
                                            optimizer,
                                            F.nll_loss,
                                            device=device)

        evaluator_train = \
            create_supervised_evaluator(model,
                                        metrics={
                                            'accuracy': CategoricalAccuracy(),
                                            'nll': Loss(F.nll_loss)},
                                        device=device)

        evaluator_dev = \
            create_supervised_evaluator(model,
                                        metrics={
                                            'accuracy': CategoricalAccuracy(),
                                            'nll': Loss(F.nll_loss)},
                                        device=device)

        visdom_logger.attach_trainer(trainer)
        visdom_logger.attach_evaluator(evaluator_train, trainer, phase='train')
        visdom_logger.attach_evaluator(evaluator_dev, trainer, phase='dev')

        lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
            optimizer, lambda epoch_: 1. / (1 + config.lr_decay *
                                            (epoch_ - 1)))

        # scoring function for early stopping and checkpointing
        def score_function(engine):
            dev_loss = engine.state.metrics['nll']
            return -dev_loss

        early_stopping = EarlyStopping(patience=15,
                                       score_function=score_function,
                                       trainer=trainer)

        def checkpoint_score_function(engine):
            dev_accuracy = engine.state.metrics['accuracy']
            return dev_accuracy

        checkpoint = ModelCheckpoint('/tmp/models',
                                     'checkpoint',
                                     score_function=checkpoint_score_function,
                                     n_saved=1,
                                     create_dir=True,
                                     score_name="dev_accuracy")

        # lets train!
        train_model(
            model=model,
            trainer=trainer,
            epochs=config.epochs,
            evaluator_train=evaluator_train,
            evaluator_dev=evaluator_dev,
            train_loader=train_loader,
            dev_loader=dev_loader,
            lr_scheduler=lr_scheduler,
            early_stopping=early_stopping if config.early_stopping else None,
            checkpoint=checkpoint if config.checkpoint else None)

        # load checkpointed (best) model and evaluate on test loader
        model = torch.load(list(model_path.glob('checkpoint_model*.pth'))[0])

        test_evaluator = \
            create_supervised_evaluator(model,
                                        metrics={
                                            'accuracy': CategoricalAccuracy(),
                                            'nll': Loss(F.nll_loss)},
                                        device=device)

        test_evaluator.run(test_loader)
        metrics = test_evaluator.state.metrics
        print("Test Results: Avg accuracy: {:.2f} Avg loss: {:.2f}".format(
            metrics['accuracy'], metrics['nll']))

        test_evaluator.run(dev_loader)
        metrics = test_evaluator.state.metrics
        return metrics['nll']

    # hyperparameter tuning!
    hp_opt = HPOptimizer(args=args,
                         strategy='gp',
                         space=[
                             Real(0.1, 0.5, name='dropout'),
                             Categorical([50, 100, 150, 200], name='d_hidden'),
                             Real(1e-4, 1, prior='log-uniform', name='lr'),
                             Real(1e-3,
                                  1,
                                  prior='log-uniform',
                                  name='lr_decay'),
                             Categorical([4, 8, 16, 32, 64, 128],
                                         name='batch_size')
                         ])

    hp_opt.add_callback(visdom_logger.run_summary)

    result = hp_opt.minimize(train_f, n_calls=10)
    print(result)