Example #1
def test_identity_encoder_unknown():
    sample = [
        'people/deceased_person/place_of_death',
        'symbols/name_source/namesakes'
    ]
    encoder = IdentityEncoder(sample)
    input_ = 'symbols/namesake/named_after'
    output = encoder.encode(input_)
    assert len(output) == 1
    assert encoder.decode(output) == UNKNOWN_TOKEN
Example #2
def test_identity_encoder_known():
    input_ = 'symbols/namesake/named_after'
    sample = [
        'people/deceased_person/place_of_death',
        'symbols/name_source/namesakes'
    ]
    sample.append(input_)
    encoder = IdentityEncoder(sample)
    output = encoder.encode(input_)
    assert len(output) == 1
    assert encoder.decode(output) == input_
Example #3
def __init__(self, is_char=False):
    self.mapped_data = dict()
    # Load the word-level or character-level Penn Treebank splits.
    if not is_char:
        self.train, self.valid, self.test = ptb(train=True,
                                                dev=True,
                                                test=True)
    else:
        self.train, self.valid, self.test = ptb(
            train=True,
            dev=True,
            test=True,
            train_filename="ptb.char.train.txt",
            dev_filename="ptb.char.valid.txt",
            test_filename="ptb.char.test.txt")
    self._map_data(self.train + self.valid + self.test)
    # Encode data: build the vocabulary over all splits, then encode each
    # split as a flat LongTensor of token indices.
    encoder = IdentityEncoder(self.train + self.valid + self.test)
    self.train = torch.LongTensor(encoder.encode(self.train))
    self.valid = torch.LongTensor(encoder.encode(self.valid))
    self.test = torch.LongTensor(encoder.encode(self.test))
    self.ntoken = encoder.vocab_size
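Example #3 leaves each encoded split as a flat 1-D LongTensor. A common next step for language modeling, not shown in the snippet and only assumed here, is to reshape each flat tensor into fixed-width batch columns:

def batchify(data, batch_size):
    # Trim off any remainder that does not fill a whole column, then lay the
    # token sequence out as `batch_size` columns of equal length.
    n_batches = data.size(0) // batch_size
    data = data[:n_batches * batch_size]
    return data.view(batch_size, -1).t().contiguous()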
Example #4
def test_identity_encoder_sequence():
    input_ = [
        'symbols/namesake/named_after', 'people/deceased_person/place_of_death'
    ]
    sample = [
        'people/deceased_person/place_of_death',
        'symbols/name_source/namesakes'
    ]
    encoder = IdentityEncoder(sample)
    output = encoder.encode(input_)
    assert len(output) == 2
    assert encoder.decode(output) == [
        UNKNOWN_TOKEN, 'people/deceased_person/place_of_death'
    ]
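Taken together, the test examples above show the full round trip: IdentityEncoder treats each whole string as a single vocabulary entry, maps strings it has not seen to UNKNOWN_TOKEN, and decodes indices back to the original strings. The standalone sketch below assumes the torchnlp.text_encoders import path used later on this page and that UNKNOWN_TOKEN is importable from the same package; the exact return type of encode (list vs. LongTensor) depends on the library version.

from torchnlp.text_encoders import IdentityEncoder, UNKNOWN_TOKEN

labels = [
    'people/deceased_person/place_of_death',
    'symbols/name_source/namesakes'
]
encoder = IdentityEncoder(labels)

known = encoder.encode('symbols/name_source/namesakes')    # label seen at build time
unknown = encoder.encode('symbols/namesake/named_after')   # out-of-vocabulary label

assert encoder.decode(known) == 'symbols/name_source/namesakes'
assert encoder.decode(unknown) == UNKNOWN_TOKEN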
Example #5
    # Restore a previously saved model, criterion, and optimizer from disk.
    with open(fn, 'rb') as f:
        model, criterion, optimizer = torch.load(f)


from torchnlp import datasets
from torchnlp.text_encoders import IdentityEncoder
from torchnlp.samplers import BPTTBatchSampler

print('Producing dataset...')
train, val, test = getattr(datasets, args.data)(train=True,
                                                dev=True,
                                                test=True)

encoder = IdentityEncoder(train + val + test)

train_data = encoder.encode(train)
val_data = encoder.encode(val)
test_data = encoder.encode(test)

eval_batch_size = 10
test_batch_size = 1

train_source_sampler, val_source_sampler, test_source_sampler = tuple([
    BPTTBatchSampler(d, args.bptt, args.batch_size, True, 'source')
    for d in (train, val, test)
])

train_target_sampler, val_target_sampler, test_target_sampler = tuple([
    BPTTBatchSampler(d, args.bptt, args.batch_size, True, 'target')
    for d in (train, val, test)
])
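The snippet stops after constructing the samplers. As a hedged sketch only (it assumes each BPTTBatchSampler batch is a list of slices into the corresponding flat encoded tensor, and that train_data has been wrapped in a torch.LongTensor as in Example #3; the exact items yielded depend on the library version), source and target batches could be assembled like this:

import torch

# Hypothetical consumption of the samplers defined above: pair source and
# target batches and stack the indexed slices into [seq_len, batch_size] tensors.
for source_batch, target_batch in zip(train_source_sampler, train_target_sampler):
    data = torch.stack([train_data[i] for i in source_batch]).t_().contiguous()
    targets = torch.stack([train_data[i] for i in target_batch]).t_().contiguous().view(-1)
    # `data` feeds the language model; `targets` holds the shifted next-token labels.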
Example #6
# Lower-case both sentence fields before building the vocabulary.
for row in datasets_iterator(train, dev, test):
    row['premise'] = row['premise'].lower()
    row['hypothesis'] = row['hypothesis'].lower()

# Make Encoders
sentence_corpus = [row['premise'] for row in datasets_iterator(train, dev, test)]
sentence_corpus += [row['hypothesis'] for row in datasets_iterator(train, dev, test)]
sentence_encoder = WhitespaceEncoder(sentence_corpus)

label_corpus = [row['label'] for row in datasets_iterator(train, dev, test)]
label_encoder = IdentityEncoder(label_corpus)

# Encode
for row in datasets_iterator(train, dev, test):
    row['premise'] = sentence_encoder.encode(row['premise'])
    row['hypothesis'] = sentence_encoder.encode(row['hypothesis'])
    row['label'] = label_encoder.encode(row['label'])

config = args
config.n_embed = sentence_encoder.vocab_size
config.d_out = label_encoder.vocab_size
config.n_cells = config.n_layers

# double the number of cells for bidirectional networks
if config.birnn:
    config.n_cells *= 2

if args.resume_snapshot:
    model = torch.load(
        args.resume_snapshot, map_location=lambda storage, location: storage.cuda(args.gpu))
else:
    model = SNLIClassifier(config)
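The config fields derived from the encoders are what size the model: n_embed is the sentence vocabulary size and d_out the number of labels. As a minimal illustrative sketch only (the embedding_dim value and the single linear head are placeholders, not taken from SNLIClassifier):

import torch.nn as nn

# Illustrative sizing only; SNLIClassifier's real architecture is not shown here.
embedding = nn.Embedding(num_embeddings=config.n_embed, embedding_dim=300)
output_head = nn.Linear(in_features=300, out_features=config.d_out)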
Example #7
def load_data(data_type,
              preprocessing=False,
              fine_grained=False,
              verbose=False,
              text_length=5000,
              encode=True,
              load_SLE=False):
    if data_type == 'imdb':
        train_data, test_data = imdb_dataset(preprocessing=preprocessing,
                                             verbose=verbose,
                                             text_length=text_length)
    elif data_type == 'newsgroups':
        train_data, test_data = newsgroups_dataset(preprocessing=preprocessing,
                                                   verbose=verbose,
                                                   text_length=text_length)
    elif data_type == 'reuters':
        train_data, test_data = reuters_dataset(preprocessing=preprocessing,
                                                fine_grained=fine_grained,
                                                verbose=verbose,
                                                text_length=text_length)
    elif data_type == 'webkb':
        train_data, test_data = webkb_dataset(preprocessing=preprocessing,
                                              verbose=verbose,
                                              text_length=text_length)
    elif data_type == 'cade':
        train_data, test_data = cade_dataset(preprocessing=preprocessing,
                                             verbose=verbose,
                                             text_length=text_length)
    elif data_type == 'dbpedia':
        train_data, test_data = dbpedia_dataset(preprocessing=preprocessing,
                                                verbose=verbose,
                                                text_length=text_length)
    elif data_type == 'agnews':
        train_data, test_data = agnews_dataset(preprocessing=preprocessing,
                                               verbose=verbose,
                                               text_length=text_length)
    elif data_type == 'yahoo':
        train_data, test_data = yahoo_dataset(preprocessing=preprocessing,
                                              verbose=verbose,
                                              text_length=text_length)
    elif data_type == 'sogou':
        train_data, test_data = sogou_dataset(preprocessing=preprocessing,
                                              verbose=verbose,
                                              text_length=text_length)
    elif data_type == 'yelp':
        train_data, test_data = yelp_dataset(preprocessing=preprocessing,
                                             fine_grained=fine_grained,
                                             verbose=verbose,
                                             text_length=text_length)
    elif data_type == 'amazon':
        train_data, test_data = amazon_dataset(preprocessing=preprocessing,
                                               fine_grained=fine_grained,
                                               verbose=verbose,
                                               text_length=text_length)
    elif data_type == 'custom':
        test_data = custom_dataset(preprocessing=preprocessing,
                                   fine_grained=fine_grained,
                                   verbose=verbose,
                                   text_length=text_length)
        sentence_encoder = pickle.load(open('epochs/sentence_encoder', 'rb'))
        label_encoder = pickle.load(open('epochs/label_encoder', 'rb'))
        for row in datasets_iterator(test_data):
            row['text'] = sentence_encoder.encode(' '.join(row['text']))
            row['label'] = label_encoder.encode(row['label'])
        return sentence_encoder.vocab_size, label_encoder.vocab_size, test_data
    else:
        raise ValueError('{} data type not supported.'.format(data_type))

    if encode:
        if load_SLE:
            sentence_encoder = pickle.load(
                open('epochs/sentence_encoder', 'rb'))
            label_encoder = pickle.load(open('epochs/label_encoder', 'rb'))
        else:
            sentence_corpus = [row['text'] for row in datasets_iterator(train_data)]
            label_corpus = [row['label'] for row in datasets_iterator(train_data)]
            sentence_encoder = WhitespaceEncoder(
                sentence_corpus,
                reserved_tokens=[PADDING_TOKEN, UNKNOWN_TOKEN])
            label_encoder = IdentityEncoder(label_corpus, reserved_tokens=[])
            with open('epochs/sentence_encoder', 'wb') as f:
                pickle.dump(sentence_encoder, f)
            with open('epochs/label_encoder', 'wb') as f:
                pickle.dump(label_encoder, f)

        # Encode
        for row in datasets_iterator(train_data, test_data):
            row['text'] = sentence_encoder.encode(row['text'])
            row['label'] = label_encoder.encode(row['label'])
        return sentence_encoder.vocab_size, label_encoder.vocab_size, train_data, test_data
    else:
        return train_data, test_data
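A hedged usage sketch, based only on the return values defined above: with encode=True the function returns the two vocabulary sizes followed by the encoded splits. 'imdb' is just one of the supported data_type strings, and the variable names below are illustrative.

# Hypothetical call site; the unpacking mirrors the encode=True return statement.
vocab_size, num_class, train_data, test_data = load_data('imdb',
                                                         preprocessing=True,
                                                         encode=True)
print(vocab_size, num_class, len(train_data), len(test_data))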