from torchnlp.text_encoders import IdentityEncoder
from torchnlp.text_encoders import UNKNOWN_TOKEN


def test_identity_encoder_unknown():
    sample = [
        'people/deceased_person/place_of_death',
        'symbols/name_source/namesakes'
    ]
    encoder = IdentityEncoder(sample)
    input_ = 'symbols/namesake/named_after'
    output = encoder.encode(input_)
    assert len(output) == 1
    assert encoder.decode(output) == UNKNOWN_TOKEN


def test_identity_encoder_known():
    input_ = 'symbols/namesake/named_after'
    sample = [
        'people/deceased_person/place_of_death',
        'symbols/name_source/namesakes'
    ]
    sample.append(input_)
    encoder = IdentityEncoder(sample)
    output = encoder.encode(input_)
    assert len(output) == 1
    assert encoder.decode(output) == input_


def test_identity_encoder_sequence():
    input_ = [
        'symbols/namesake/named_after', 'people/deceased_person/place_of_death'
    ]
    sample = [
        'people/deceased_person/place_of_death',
        'symbols/name_source/namesakes'
    ]
    encoder = IdentityEncoder(sample)
    output = encoder.encode(input_)
    assert len(output) == 2
    assert encoder.decode(output) == [
        UNKNOWN_TOKEN, 'people/deceased_person/place_of_death'
    ]
Example #4
    def __init__(self, data_path=None):
        super().__init__()

        # Reuse a previously cached label vocabulary when one exists.
        save_path = path.join(kPrepDataDir, 'labels.pt')
        if path.exists(save_path):
            with open(save_path, 'rb') as f:
                labels = pickle.load(f)
            self.encoder = IdentityEncoder(labels)
            return

        # Otherwise read one label per line: the last tab-separated field,
        # with the trailing newline stripped.
        labels = []
        with open(data_path, 'r') as f:
            for line in f:
                label = line.split('\t')[-1][:-1]
                labels.append(label)
        self.encoder = IdentityEncoder(labels)
        # Cache the vocabulary so later runs can skip the scan.
        with open(save_path, 'wb') as f:
            pickle.dump(self.vocab(), f)
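The example above caches the label vocabulary so later runs can skip the TSV scan. A minimal standalone sketch of the same caching pattern; the helper name load_or_build_labels and the file names are illustrative, not from the original code:

import os
import pickle


def load_or_build_labels(data_path, cache_path='labels.pkl'):
    """Return the label list, scanning the TSV only on the first call."""
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as f:
            return pickle.load(f)
    labels = []
    with open(data_path, 'r') as f:
        for line in f:
            # Last tab-separated field, trailing newline stripped.
            labels.append(line.rstrip('\n').split('\t')[-1])
    with open(cache_path, 'wb') as f:
        pickle.dump(labels, f)
    return labels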
Example #5
    def __init__(self, is_char=False):
        self.mapped_data = dict()
        if not is_char:
            # Word-level Penn Treebank splits.
            self.train, self.valid, self.test = ptb(train=True,
                                                    dev=True,
                                                    test=True)
        else:
            # Character-level Penn Treebank splits.
            self.train, self.valid, self.test = ptb(
                train=True,
                dev=True,
                test=True,
                train_filename="ptb.char.train.txt",
                dev_filename="ptb.char.valid.txt",
                test_filename="ptb.char.test.txt")
        self._map_data(self.train + self.valid + self.test)
        # Encode the data: build the vocabulary over all splits, then map
        # each split to a flat tensor of token ids.
        encoder = IdentityEncoder(self.train + self.valid + self.test)
        self.train = torch.LongTensor(encoder.encode(self.train))
        self.valid = torch.LongTensor(encoder.encode(self.valid))
        self.test = torch.LongTensor(encoder.encode(self.test))
        self.ntoken = encoder.vocab_size
        print('hello')
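Once the splits are flat id tensors, a common next step is to reshape them into [seq_len, batch_size] columns for BPTT training. A standard batchify sketch in the style of PyTorch's word-language-model example; it is not part of the class above:

import torch


def batchify(ids, batch_size):
    # Drop the remainder so the corpus divides evenly, then lay it out as
    # batch_size parallel columns of token ids.
    nbatch = ids.size(0) // batch_size
    ids = ids.narrow(0, 0, nbatch * batch_size)
    return ids.view(batch_size, -1).t().contiguous()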
Example #6
import torch

from torchnlp import datasets
from torchnlp.text_encoders import IdentityEncoder
from torchnlp.samplers import BPTTBatchSampler


def model_load(fn):
    # Restore a previously saved (model, criterion, optimizer) triple.
    global model, criterion, optimizer
    with open(fn, 'rb') as f:
        model, criterion, optimizer = torch.load(f)


print('Producing dataset...')
# `args` (data, bptt, batch_size) comes from the script's argparse setup,
# which is not shown in this excerpt.
train, val, test = getattr(datasets, args.data)(train=True,
                                                dev=True,
                                                test=True)

encoder = IdentityEncoder(train + val + test)

train_data = encoder.encode(train)
val_data = encoder.encode(val)
test_data = encoder.encode(test)

eval_batch_size = 10
test_batch_size = 1

train_source_sampler, val_source_sampler, test_source_sampler = tuple([
    BPTTBatchSampler(d, args.bptt, args.batch_size, True, 'source')
    for d in (train, val, test)
])

train_target_sampler, val_target_sampler, test_target_sampler = tuple([
    BPTTBatchSampler(d, args.bptt, args.batch_size, True, 'target')
    for d in (train, val, test)
])
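A hedged sketch of one way to consume these samplers, assuming each yields a list of slices into the flat encoded corpus (one slice per batch element); this loop is illustrative, not the original training loop:

for source_slices, target_slices in zip(train_source_sampler, train_target_sampler):
    # Stack one slice per batch element, then transpose to [bptt, batch].
    source = torch.stack([train_data[s] for s in source_slices]).t_().contiguous()
    targets = torch.stack([train_data[s] for s in target_slices]).t_().contiguous().view(-1)
    # ... feed (source, targets) to the language model here ...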
Example #7
# Imports for the older torchnlp API used throughout these examples; the
# datasets_iterator location is assumed to be torchnlp.utils here.
from torchnlp.datasets import snli_dataset
from torchnlp.text_encoders import WhitespaceEncoder, IdentityEncoder
from torchnlp.utils import datasets_iterator

# load dataset
train, dev, test = snli_dataset(train=True, dev=True, test=True)

# Preprocess
for row in datasets_iterator(train, dev, test):
    row['premise'] = row['premise'].lower()
    row['hypothesis'] = row['hypothesis'].lower()

# Make Encoders
sentence_corpus = [row['premise'] for row in datasets_iterator(train, dev, test)]
sentence_corpus += [row['hypothesis'] for row in datasets_iterator(train, dev, test)]
sentence_encoder = WhitespaceEncoder(sentence_corpus)

label_corpus = [row['label'] for row in datasets_iterator(train, dev, test)]
label_encoder = IdentityEncoder(label_corpus)

# Encode
for row in datasets_iterator(train, dev, test):
    row['premise'] = sentence_encoder.encode(row['premise'])
    row['hypothesis'] = sentence_encoder.encode(row['hypothesis'])
    row['label'] = label_encoder.encode(row['label'])

# `args` comes from the script's argparse configuration (not shown here).
config = args
config.n_embed = sentence_encoder.vocab_size
config.d_out = label_encoder.vocab_size
config.n_cells = config.n_layers

# double the number of cells for bidirectional networks
if config.birnn:
    config.n_cells *= 2
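After encoding, each premise/hypothesis is a variable-length LongTensor, so rows need padding before they can be batched. A minimal collation sketch using plain torch.nn.utils.rnn.pad_sequence; the original script may well batch differently (e.g. with torchnlp's own samplers):

import torch
from torch.nn.utils.rnn import pad_sequence


def collate(rows):
    # Pad premises and hypotheses to the longest sequence in this batch.
    premises = pad_sequence([row['premise'] for row in rows], batch_first=True)
    hypotheses = pad_sequence([row['hypothesis'] for row in rows], batch_first=True)
    labels = torch.cat([row['label'] for row in rows])
    return premises, hypotheses, labels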
Example #8
def load_data(data_type,
              preprocessing=False,
              fine_grained=False,
              verbose=False,
              text_length=5000,
              encode=True,
              load_SLE=False):
    if data_type == 'imdb':
        train_data, test_data = imdb_dataset(preprocessing=preprocessing,
                                             verbose=verbose,
                                             text_length=text_length)
    elif data_type == 'newsgroups':
        train_data, test_data = newsgroups_dataset(preprocessing=preprocessing,
                                                   verbose=verbose,
                                                   text_length=text_length)
    elif data_type == 'reuters':
        train_data, test_data = reuters_dataset(preprocessing=preprocessing,
                                                fine_grained=fine_grained,
                                                verbose=verbose,
                                                text_length=text_length)
    elif data_type == 'webkb':
        train_data, test_data = webkb_dataset(preprocessing=preprocessing,
                                              verbose=verbose,
                                              text_length=text_length)
    elif data_type == 'cade':
        train_data, test_data = cade_dataset(preprocessing=preprocessing,
                                             verbose=verbose,
                                             text_length=text_length)
    elif data_type == 'dbpedia':
        train_data, test_data = dbpedia_dataset(preprocessing=preprocessing,
                                                verbose=verbose,
                                                text_length=text_length)
    elif data_type == 'agnews':
        train_data, test_data = agnews_dataset(preprocessing=preprocessing,
                                               verbose=verbose,
                                               text_length=text_length)
    elif data_type == 'yahoo':
        train_data, test_data = yahoo_dataset(preprocessing=preprocessing,
                                              verbose=verbose,
                                              text_length=text_length)
    elif data_type == 'sogou':
        train_data, test_data = sogou_dataset(preprocessing=preprocessing,
                                              verbose=verbose,
                                              text_length=text_length)
    elif data_type == 'yelp':
        train_data, test_data = yelp_dataset(preprocessing=preprocessing,
                                             fine_grained=fine_grained,
                                             verbose=verbose,
                                             text_length=text_length)
    elif data_type == 'amazon':
        train_data, test_data = amazon_dataset(preprocessing=preprocessing,
                                               fine_grained=fine_grained,
                                               verbose=verbose,
                                               text_length=text_length)
    elif data_type == 'custom':
        test_data = custom_dataset(preprocessing=preprocessing,
                                   fine_grained=fine_grained,
                                   verbose=verbose,
                                   text_length=text_length)
        # Reuse the encoders pickled by a previous training run.
        with open('epochs/sentence_encoder', 'rb') as f:
            sentence_encoder = pickle.load(f)
        with open('epochs/label_encoder', 'rb') as f:
            label_encoder = pickle.load(f)
        for row in datasets_iterator(test_data):
            row['text'] = sentence_encoder.encode(' '.join(row['text']))
            row['label'] = label_encoder.encode(row['label'])
        return sentence_encoder.vocab_size, label_encoder.vocab_size, test_data
    else:
        raise ValueError('{} data type not supported.'.format(data_type))

    if encode:
        if load_SLE:
            # Load previously saved sentence/label encoders instead of
            # rebuilding them from the training data.
            with open('epochs/sentence_encoder', 'rb') as f:
                sentence_encoder = pickle.load(f)
            with open('epochs/label_encoder', 'rb') as f:
                label_encoder = pickle.load(f)
        else:
            sentence_corpus = [
                row['text'] for row in datasets_iterator(train_data)
            ]
            label_corpus = [
                row['label'] for row in datasets_iterator(train_data)
            ]
            sentence_encoder = WhitespaceEncoder(
                sentence_corpus,
                reserved_tokens=[PADDING_TOKEN, UNKNOWN_TOKEN])
            label_encoder = IdentityEncoder(label_corpus, reserved_tokens=[])
            with open('epochs/sentence_encoder', 'wb') as f:
                pickle.dump(sentence_encoder, f)
            with open('epochs/label_encoder', 'wb') as f:
                pickle.dump(label_encoder, f)

        # Encode
        for row in datasets_iterator(train_data, test_data):
            row['text'] = sentence_encoder.encode(row['text'])
            row['label'] = label_encoder.encode(row['label'])
        return sentence_encoder.vocab_size, label_encoder.vocab_size, train_data, test_data
    else:
        return train_data, test_data
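A hedged usage sketch for the loader above; 'imdb' is just one of the supported data_type values, and the unpacking order mirrors the return statement of the encode=True branch:

# Returns the two vocabulary sizes followed by the encoded datasets.
vocab_size, num_classes, train_data, test_data = load_data('imdb',
                                                           preprocessing=True,
                                                           encode=True)
print(vocab_size, num_classes, len(train_data), len(test_data))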
def encoder():
    sample = [
        'people/deceased_person/place_of_death',
        'symbols/name_source/namesakes'
    ]
    return IdentityEncoder(sample)
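To close, a minimal self-contained round trip showing the IdentityEncoder behaviour these snippets rely on; the import path follows the older torchnlp.text_encoders layout used throughout this page:

from torchnlp.text_encoders import IdentityEncoder, UNKNOWN_TOKEN

labels = [
    'people/deceased_person/place_of_death',
    'symbols/name_source/namesakes'
]
encoder = IdentityEncoder(labels)

known = encoder.encode('symbols/name_source/namesakes')   # 1-element LongTensor
unknown = encoder.encode('symbols/namesake/named_after')  # label not in the sample

assert encoder.decode(known) == 'symbols/name_source/namesakes'
assert encoder.decode(unknown) == UNKNOWN_TOKEN
print(encoder.vocab_size)  # the two labels plus the reserved tokens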