Example #1
def merge_data(self, pos, neg, device):
    # FIXME: maybe just Field?
    # RawField's postprocessing runs on the whole batch, so the list of
    # raw labels becomes a single LongTensor on the training device.
    label_field = RawField(
        postprocessing=lambda x: torch.tensor(x, dtype=torch.long, device=device))
    label_field.is_target = True
    examples = [self._attach_label(ex, POS_LABEL) for ex in pos] + \
        [self._attach_label(ex, NEG_LABEL) for ex in neg]
    return Dataset(examples, [('sent', self.sent_field),
                              ('label', label_field)])
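
For context, RawField applies its postprocessing function to the whole batch at process() time, so the lambda above turns a Python list of labels into one tensor. A minimal sketch of that behavior with the legacy torchtext API; the POS_LABEL/NEG_LABEL values here are assumed, not taken from the snippet:

import torch
from torchtext.data import RawField  # torchtext.legacy.data in torchtext >= 0.9

POS_LABEL, NEG_LABEL = 1, 0  # assumed integer encodings

label_field = RawField(
    postprocessing=lambda x: torch.tensor(x, dtype=torch.long, device='cpu'))
label_field.is_target = True
print(label_field.process([POS_LABEL, NEG_LABEL, POS_LABEL]))  # tensor([1, 0, 1])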
Example #2
# parse CoNLL dependency data
import torch
import torch.nn as nn
from torchtext.data import RawField, BucketIterator
from transformers import BertModel, BertTokenizer
# SubTokenizedField, TokenBucket, and ConllXDataset are the data utilities
# shipped with pytorch-struct (torch_struct.data).
from torch_struct.data import SubTokenizedField, TokenBucket, ConllXDataset

model_class, tokenizer_class, pretrained_weights = BertModel, BertTokenizer, config['bert']
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)

def batch_num(nums):
    # pad a list of variable-length integer sequences into a
    # (batch, max_len) LongTensor, zero-filled on the right
    lengths = torch.tensor([len(n) for n in nums]).long()
    max_len = lengths.max()
    out = torch.zeros(len(nums), max_len).long()
    for b, seq in enumerate(nums):
        out[b, :len(seq)] = torch.tensor(seq)
    return out, lengths
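# e.g. batch_num([[2, 0, 2], [1, 0]]) returns
#   (tensor([[2, 0, 2],
#            [1, 0, 0]]), tensor([3, 2]))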

HEAD = RawField(preprocessing=lambda x: [int(i) for i in x],
        postprocessing=batch_num)
HEAD.is_target = True
WORD = SubTokenizedField(tokenizer)

def len_filt(x): return 5 < len(x.word[0]) < 40

train = ConllXDataset('wsj.train.conllx', (('word', WORD), ('head', HEAD)),
        filter_pred=len_filt)
train_iter = TokenBucket(train, 750)
val = ConllXDataset('wsj.dev.conllx', (('word', WORD), ('head', HEAD)),
        filter_pred=len_filt)
val_iter = BucketIterator(val, batch_size=20, device='cuda:0')

# build a BERT model to compute arc potentials
H = config['H']
class Model(nn.Module):
    def __init__(self, hidden):
        ...  # truncated in the original example
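
The class body is cut off above. Purely as an illustration of the pattern the comment describes (BERT encodings turned into pairwise head-dependent potentials), one plausible shape follows; ArcScorer, the projection, and the bilinear scorer are assumptions, not the original model:

class ArcScorer(nn.Module):
    # Hypothetical: encode subwords with BERT, project to `hidden`, and
    # score every head-dependent pair bilinearly as an arc potential.
    def __init__(self, hidden):
        super().__init__()
        self.bert = model_class.from_pretrained(pretrained_weights)
        self.proj = nn.Linear(self.bert.config.hidden_size, hidden)

    def forward(self, input_ids):
        reps = self.proj(self.bert(input_ids)[0])        # (batch, len, hidden)
        return torch.einsum('bih,bjh->bij', reps, reps)  # (batch, len, len)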
Example #3
import itertools
from typing import Tuple

import torch
from torchtext.data import Field, RawField, Iterator, TabularDataset
from torchtext.vocab import Vocab

# Config, SeqType, SpecialToken, N_LONG_TERM, N_SHORT_TERM, and
# stringify_ric_seqtype are defined elsewhere in this project.

def create_dataset(config: Config, device: torch.device) -> Tuple[Vocab, Iterator, Iterator, Iterator]:

    fields = dict()
    raw_field = RawField()
    # In torchtext 0.3.1, RawField has no is_target attribute and batching
    # fails with: AttributeError: 'RawField' object has no attribute 'is_target'
    # Setting it explicitly works around this.
    raw_field.is_target = False
    fields[SeqType.ArticleID.value] = (SeqType.ArticleID.value, raw_field)

    time_field = Field(use_vocab=False, batch_first=True, sequential=False)
    fields['jst_hour'] = (SeqType.Time.value, time_field)

    token_field = \
        Field(use_vocab=True,
              init_token=SpecialToken.BOS.value,
              eos_token=SpecialToken.EOS.value,
              pad_token=SpecialToken.Padding.value,
              unk_token=SpecialToken.Unknown.value) \
        if config.use_init_token_tag \
        else Field(use_vocab=True,
                   eos_token=SpecialToken.EOS.value,
                   pad_token=SpecialToken.Padding.value,
                   unk_token=SpecialToken.Unknown.value)

    fields['processed_tokens'] = (SeqType.Token.value, token_field)

    seqtypes = [SeqType.RawShort, SeqType.RawLong,
                SeqType.MovRefShort, SeqType.MovRefLong,
                SeqType.NormMovRefShort, SeqType.NormMovRefLong,
                SeqType.StdShort, SeqType.StdLong]

    for (ric, seqtype) in itertools.product(config.rics, seqtypes):
        n = N_LONG_TERM \
            if seqtype.value.endswith('long') \
            else N_SHORT_TERM
        price_field = Field(use_vocab=False,
                            fix_length=n,
                            batch_first=True,
                            pad_token=0.0,
                            preprocessing=lambda xs: [float(x) for x in xs],
                            dtype=torch.float)
        key = stringify_ric_seqtype(ric, seqtype)
        fields[key] = (key, price_field)

    train, val, test = \
        TabularDataset.splits(path=str(config.dir_output),
                              format='json',
                              train='alignment-train.json',
                              validation='alignment-valid.json',
                              test='alignment-test.json',
                              fields=fields)

    token_field.build_vocab(train, min_freq=config.token_min_freq)

    batch_size = config.batch_size
    train_iter, val_iter, test_iter = \
        Iterator.splits((train, val, test),
                        batch_sizes=(batch_size, batch_size, batch_size),
                        device=-1 if device.type == 'cpu' else device,  # -1 means CPU in older torchtext
                        repeat=False,
                        sort=False)

    return (token_field.vocab, train_iter, val_iter, test_iter)
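
A minimal usage sketch; the `config` instance is hypothetical here, and batch attributes are named by the SeqType values used as field keys, so getattr is used rather than guessing them:

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
vocab, train_iter, val_iter, test_iter = create_dataset(config, device)
print(len(vocab))  # vocabulary built over 'processed_tokens'

for batch in train_iter:
    tokens = getattr(batch, SeqType.Token.value)           # token id matrix
    article_ids = getattr(batch, SeqType.ArticleID.value)  # raw, unbatched list
    break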