    CHARS = CharField(fix_length=config['l0'],
                      lower=not config['differ_uppercase'])
    LABEL = torchdata.Field(use_vocab=False,
                            sequential=False,
                            preprocessing=lambda x: int(x),
                            is_target=True)

    train_dataset, test_dataset = torchdata.TabularDataset.splits(
        path=config['dataset_path'],
        train=config['dataset_train'],
        test=config['dataset_test'],
        format='tsv',
        fields=[('label', LABEL), ('chars', CHARS)])

    train_iterator = torchdata.BucketIterator(train_dataset,
                                              batch_size=config['batch_size'],
                                              device=device)
    test_iterator = torchdata.BucketIterator(
        test_dataset, batch_size=config['test_batch_size'], device=device)

    num_classes, weights = utils.get_weights(
        [e.label for e in train_dataset.examples], config)

    alphabet = config['alphabet']
    # alphabet.append("'")
    CHARS.build_vocab(alphabet)
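    # Note: build_vocab is fed the alphabet itself here rather than the training data,
    # so the character vocabulary is fixed to config['alphabet'].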
    LABEL.build_vocab(train_dataset)

    charCNNModel = CharCNNModel(num_classes, alphabet=alphabet).to(device)
    if config['load_model']:
        charCNNModel.load_state_dict(
    def __init__(self,
                 model,
                 labeled,
                 unlabeled,
                 batch_size=64,
                 cap=None,
                 resume_from=None):
        self.model = model
        self.data_root = envs.DATA_DIR
        self.device = envs.CUDA_DEVICE

        self.model.to(self.device)
        # compute class weights

        train_set = BasicDS(path=os.path.join(self.data_root, 'train.json'),
                            text_field=TEXT,
                            label_field=LABEL,
                            samples=labeled,
                            cap=cap)

        test_set = BasicDS(path=os.path.join(self.data_root, 'test.json'),
                           text_field=TEXT,
                           label_field=LABEL,
                           samples=None,
                           cap=cap)

        infer_set = BasicDS(path=os.path.join(self.data_root, 'train.json'),
                            text_field=TEXT,
                            label_field=LABEL,
                            samples=unlabeled,
                            cap=cap)

        self.train_iterator = data.BucketIterator(
            train_set,
            batch_size=batch_size,
            device=self.device,
            shuffle=True,
            sort_key=lambda x: len(x.text),
            sort_within_batch=True)

        self.test_iterator, self.infer_iterator = data.BucketIterator.splits(
            (test_set, infer_set),
            batch_size=batch_size,
            device=self.device,
            shuffle=False,
            sort_key=lambda x: len(x.text),
            sort_within_batch=True)

        labels = []
        for i in range(len(train_set)):
            labels.append(train_set[i].label)

        class_weight = compute_class_weight(Counter(labels),
                                            num_classes=10,
                                            min_count=1)

        class_weight = torch.Tensor(class_weight).to(self.device)
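        # CrossEntropyLoss multiplies each sample's loss by the weight of its target
        # class, which counteracts class imbalance when the weights are inverse-frequency.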

        self.criterion = nn.CrossEntropyLoss(class_weight)
        self.optimizer = optim.Adam(self.model.parameters())

        if envs.RESUME_FROM:
            ckpt = torch.load(os.path.join(envs.EXPT_DIR, envs.RESUME_FROM))
            self.model.load_state_dict(ckpt['model'])
            self.optimizer.load_state_dict(ckpt['optimizer'])
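            # optimizer.load_state_dict can leave its internal tensors (e.g. Adam's
            # exp_avg buffers) on whatever device the checkpoint was saved from, so
            # they are moved explicitly to the training device below.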

            for state in self.optimizer.state.values():
                for k, v in state.items():
                    if isinstance(v, torch.Tensor):
                        state[k] = v.to(envs.CUDA_DEVICE)
Example n. 3
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s %(levelname)s: - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
args.logger = logger
args.device = torch.device('cuda')

# -- DATA
train_data, dev_data, test_data, SRC, TRG = load_iwslt(args)
tok2i, i2tok, SRC, TRG = load_iwslt_vocab(args, SRC, TRG, args.data_prefix)
SRC = copy.deepcopy(SRC)
for data_ in [train_data, dev_data, test_data]:
    if data_ is not None:
        data_.fields['src'] = SRC

sort_key = lambda x: len(x.src)
trainloader = data.BucketIterator(dataset=train_data, batch_size=args.batch_size, device=args.device,
                                  train=True, repeat=False, shuffle=True, sort_key=sort_key,
                                  sort_within_batch=True) if train_data is not None else None
validloader = data.BucketIterator(dataset=dev_data, batch_size=args.batch_size, device=args.device,
                                  train=False, repeat=False, shuffle=True, sort_key=sort_key,
                                  sort_within_batch=True) if dev_data is not None else None
testloader = data.BucketIterator(dataset=test_data, batch_size=args.batch_size, device=args.device,
                                 train=False, repeat=False, shuffle=False, sort_key=sort_key,
                                 sort_within_batch=True) if test_data is not None else None

args.n_classes = len(TRG.vocab.stoi)


# -- loss
loss_flags = {}
if 'multiset' in args.loss:
    loss_fn = sequential_set_loss
    if not args.transformer_auxiliary_end:
        loss_fn = sequential_set_no_stop_loss
    loss_flags['self_teach_beta'] = float(args.self_teach_beta)

Example n. 4
#
#             loss = trainer_G._train_batch(
#                 src_seq, src_length.tolist(), tgt_seq, G, teacher_forcing_ratio=0)
#             if step % 100 == 0:
#                 print('[step %d] loss_G %.4f' % (epoch * len(train_iter) + step, loss))
#     Checkpoint(model=G, optimizer=optim_G, epoch=0, step=0,
#                input_vocab=EN.vocab, output_vocab=EN.vocab).save(opt._load_G_from)

# Train SeqGAN
ALPHA = 0

for epoch in range(100):
    logging.info('[Epoch %d]' % epoch)
    train_iter = data.BucketIterator(dataset=train,
                                     batch_size=16,
                                     device=opt.device,
                                     sort_within_batch=True,
                                     sort_key=lambda x: len(x.src),
                                     repeat=False)

    for step, batch in enumerate(train_iter):
        src_seq = batch.src[0]
        src_length = batch.src[1]
        tgt_seq = src_seq.clone()
        # gold = tgt_seq[:, 1:]

        # reconstruction loss
        # loss_G.reset()
        # decoder_outputs, decoder_hidden, other = G(src_seq, src_length.tolist(), target_variable=None)
        # fake = torch.cat(other[DecoderRNN.KEY_SEQUENCE], dim=1)

        # (1) train D
    print("Number of src words (types):", len(src_field.vocab))
    print("Number of trg words (types):", len(trg_field.vocab), "\n")


print_data_info(train_data, valid_data, test_data, SRC, TRG)

# In[19]:

batch_size = 36

# In[20]:

train_iter = data.BucketIterator(train_data,
                                 batch_size=batch_size,
                                 train=True,
                                 sort_within_batch=True,
                                 sort_key=lambda x: (len(x.src), len(x.trg)),
                                 repeat=False,
                                 device=DEVICE)

# In[21]:

valid_iter = data.Iterator(valid_data,
                           batch_size=1,
                           train=False,
                           sort=False,
                           repeat=False,
                           device=DEVICE)

# In[22]:
Example n. 6
def train(args):
    train_data, val_data, test_data, SRC, TGT = prepare_data(args)

    BATCH_SIZE = args.batch_size
    best_bleu_loss = 0
    pad_idx = TGT.vocab.stoi["<pad>"]

    print("Size of source vocabulary:", len(SRC.vocab))
    print("Size of target vocabulary:", len(TGT.vocab))

    print("FC matrix:", args.hidden_dim, args.ff_dim)
    print(args.compress)
    model = transformer.make_model(len(SRC.vocab), len(TGT.vocab),
                                   d_model=args.hidden_dim, d_ff=args.ff_dim, N=args.num_blocks,
                                   compress=args.compress, compress_att=args.compress_attn,
                                   compress_mode=args.compress_mode,
                                   num_compress_enc=args.num_enc_blocks_comp,
                                   num_compress_dec=args.num_dec_blocks_comp
                                   )
    model.to(device)
    if args.load_model:
        print('load model from [%s]' % args.load_model, file=sys.stderr)
        params = torch.load(args.load_model, map_location=lambda storage, loc: storage)
        # TODO args = params['args']
        state_dict = params['model']
        # opts = params['']
        model.load_state_dict(state_dict)

    criterion = train_utils.LabelSmoothing(size=len(TGT.vocab), padding_idx=pad_idx, smoothing=0.1)
    # criterion = nn.NLLLoss(reduction="sum", ignore_index=0)
    criterion.to(device)
    train_iter = data.BucketIterator(train_data, batch_size=BATCH_SIZE, train=True,
                                     sort_within_batch=True,
                                     sort_key=lambda x: (len(x.src), len(x.trg)), repeat=False,
                                     device=device)
    valid_iter = data.Iterator(val_data, batch_size=BATCH_SIZE, train=False, sort=False, repeat=False,
                               device=device)

    model_opt = opt.WrapperOpt(model.src_embed[0].d_model, 2, 4000,
                               torch.optim.Adam(model.parameters(), lr=args.lr, betas=(0.9, 0.98), eps=1e-9))

    # train_time = begin_time = time.time()
    valid_params = (SRC, TGT, valid_iter)

    print("Number of examples in train: ", BATCH_SIZE * len([_ for _ in train_iter]))
    print("Number of examples in validation: ", BATCH_SIZE * len([_ for _ in valid_iter]))

    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum([np.prod(p.size()) for p in model_parameters])
    print("Number of parameters: ", params)
    if args.debug:
        model2 = transformer.make_model(len(SRC.vocab), len(TGT.vocab),
                                d_model=args.hidden_dim, d_ff=args.ff_dim,
                                N=args.num_blocks, compress=True, compress_att=True,
                                compress_mode=args.compress_mode,
                                num_compress_enc=args.num_enc_blocks_comp,
                                num_compress_dec=args.num_dec_blocks_comp)


        # print("Tranable parameters in fc module ", params2)
        debug_compress_info(model, model2)

        exit()

    os.makedirs(os.path.dirname(args.save_to), exist_ok=True)

    if args.multi_gpu:
        devices = list(np.arange(args.num_devices))
        model_parallel = nn.DataParallel(model, device_ids=devices)

    logger_file = {}  # Logger(name=args.exp_name)
    logger_file['bleu'] = []
    logger_file['loss'] = []

    for epoch in range(args.max_epoch):
        print("=" * 80)
        print("Epoch ", epoch + 1)
        print("=" * 80)
        print("Train...")
        if args.multi_gpu:
            model_parallel.train()
            train_loss_fn = MultiGPULossCompute(model.generator, criterion,
                                                      devices=devices, opt=model_opt)
            train_model = model_parallel

        else:
            train_loss_fn = train_utils.LossCompute(model.generator, criterion, model_opt)

            model.train()

        _, logger_file = train_utils.run_epoch(args, (train_utils.rebatch(pad_idx, b) for b in train_iter),
                                  model_parallel if args.multi_gpu else model, train_loss_fn,
                                  valid_params=valid_params,
                                  epoch_num=epoch, logger=logger_file)

        if args.multi_gpu:
            model_parallel.eval()
            val_loss_fn = MultiGPULossCompute(model.generator, criterion, devices=devices, opt=model_opt)
        else:
            model.eval()
            val_loss_fn = train_utils.LossCompute(model.generator, criterion, model_opt)

        print("Validation...")
        loss, bleu_loss = train_utils.run_epoch(args, (train_utils.rebatch(pad_idx, b) for b in valid_iter),\
                                        model_parallel if args.multi_gpu else model,
                                        val_loss_fn, valid_params=valid_params, is_valid=True)

        if bleu_loss > best_bleu_loss:
            best_bleu_loss = bleu_loss

            model_state_dict = model.state_dict()
            model_file = args.save_to + args.exp_name + 'valid.bin'
            checkpoint = {
                'model': model_state_dict,
            }

            print('save model without optimizer [%s]' % model_file, file=sys.stderr)

            torch.save(checkpoint, model_file)

        print()
        print("Validation perplexity ", np.exp(loss))

    with open("./logs/"+args.exp_name, 'wb') as f_out:
        pickle.dump(logger_file, f_out)
Example n. 7
def dyn_batch_without_padding(new, i, sofar):
    if args.distillation:
        return sofar + max(len(new.src), len(new.trg), len(new.dec))
    else:
        return sofar + max(len(new.src), len(new.trg))


if args.batch_size == 1:  # speed-test: one sentence per batch.
    batch_size_fn = lambda new, count, sofar: count
else:
    batch_size_fn = dyn_batch_with_padding # dyn_batch_without_padding
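# batch_size_fn(new_example, count_in_batch, size_so_far) lets the iterator measure a
# batch in tokens rather than sentences: examples are added until the value returned
# here reaches args.batch_size.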

train_real, dev_real = data.BucketIterator.splits(
    (train_data, dev_data), batch_sizes=(args.batch_size, args.batch_size), device=args.gpu, shuffle=False,
    batch_size_fn=batch_size_fn, repeat=None if args.mode == 'train' else False)
aux_reals = [data.BucketIterator(dataset, batch_size=args.batch_size, device=args.gpu, train=True, batch_size_fn=batch_size_fn, shuffle=False)
            for dataset in aux_data]
logger.info("build the dataset. done!")

# ----------------------------------------------------------------------------------------------------------------- #
# model hyper-params:
logger.info('use default parameters of t2t-base')
hparams = {'d_model': 512, 'd_hidden': 512, 'n_layers': 6,
            'n_heads': 8, 'drop_ratio': 0.1, 'warmup': 16000} # ~32
args.__dict__.update(hparams)

# ----------------------------------------------------------------------------------------------------------------- #
# show the arg:

hp_str = (f"{args.dataset}_subword_"
        f"{args.d_model}_{args.d_hidden}_{args.n_layers}_{args.n_heads}_"
    def __init__(self,
                 emb_dim=50,
                 mbsize=32,
                 custom_data=False,
                 eval=False,
                 train_data_path="",
                 eval_data_file="",
                 checkpoint_path=""):

        self.TEXT = data.Field(init_token='<start>',
                               eos_token='<eos>',
                               lower=True,
                               tokenize=self._tokenizer,
                               fix_length=16)
        self.LABEL = data.Field(sequential=False, unk_token=None)
        self.MAX_CHARS = 20000
        self.NLP = en_core_web_sm.load()

        if not eval:

            # Only take sentences with length <= 15
            f = lambda ex: len(ex.text) <= 15 and ex.label != 'neutral'

            if custom_data:

                # create tuples representing the columns
                fields = [(None, None), ('text', self.TEXT), (None, None),
                          (None, None), ('label', self.LABEL)]

                # load the dataset in CSV format
                train_data, validation_data, test_data = data.TabularDataset.splits(
                    path=train_data_path,
                    train='train_data.csv',
                    validation='validation_data.csv',
                    test='test_data.csv',
                    format='csv',
                    fields=fields,
                    skip_header=True)

            else:
                train_data, test_data = datasets.IMDB.splits(
                    self.TEXT, self.LABEL)

                train_data, validation_data = train_data.split()

            self.TEXT.build_vocab(train_data, vectors=GloVe('6B', dim=emb_dim))
            self.LABEL.build_vocab(train_data)

            self.n_vocab = len(self.TEXT.vocab.itos)
            self.emb_dim = emb_dim

            self.train_iter, self.val_iter, self.test_iter = data.BucketIterator.splits(
                (train_data, validation_data, test_data),
                batch_size=mbsize,
                device=-1,
                sort_key=lambda x: len(x.text),
                shuffle=True,
                repeat=True)

            self.train_loader = self.train_iter
            self.test_loader = self.test_iter
            self.validation_loader = self.val_iter

            self.train_iter = iter(self.train_iter)
            self.val_iter = iter(self.val_iter)
            self.test_iter = iter(self.test_iter)
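            # repeat=True makes these BucketIterators cycle over the data indefinitely;
            # wrapping them in iter() lets callers pull batches with next() instead of
            # looping once per epoch.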

        else:

            self.TEXT = data.Field(init_token='<start>',
                                   eos_token='<eos>',
                                   lower=True,
                                   tokenize=self._tokenizer,
                                   fix_length=16)
            self.TEXT.vocab = self._get_from_checkpoint(checkpoint_path)

            self.n_vocab = len(self.TEXT.vocab.itos)

            fields = [('text', self.TEXT)]

            # load the dataset in CSV format
            test_data = data.TabularDataset(path=eval_data_file,
                                            format='csv',
                                            fields=fields,
                                            skip_header=True)

            self.test_iter = data.BucketIterator(
                test_data,
                batch_size=mbsize,
                device=-1,
                sort_key=lambda x: len(x.text),
                shuffle=False,
                repeat=False)

            self.test_loader = self.test_iter
            self.test_iter = iter(self.test_iter)
Example n. 9
print('Train Example: {}'.format('\n'.join([
    '{} ---- {}'.format(example.text, example.label)
    for example in train_data.examples[:5]
])))
print('Valid Example: {}'.format('\n'.join([
    '{} ---- {}'.format(example.text, example.label)
    for example in valid_data.examples[:5]
])))
print('Test Example: {}'.format('\n'.join([
    '{} ---- {}'.format(example.text, example.label)
    for example in test_data.examples[:5]
])))

train_iter = data.BucketIterator(dataset=train_data,
                                 batch_size=BATCH_SIZE,
                                 sort_key=lambda x: len(x.text))
valid_iter = data.BucketIterator(dataset=valid_data,
                                 batch_size=BATCH_SIZE,
                                 sort_key=lambda x: len(x.text))
test_iter = data.Iterator(dataset=test_data, batch_size=BATCH_SIZE, sort=False)

# build model
from text_classify.model import RNN, WordAVGModel, TextCNN
from text_classify.transformer import Transformer
embedding_size = TEXT.vocab.vectors.shape[1] if USE_PRE_TRAIN_MODEL else EMBEDDING_SIZE

# model = RNN(input_size=len(TEXT.vocab), embedding_size=embedding_size, hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS, output_size=len(LABEL.vocab))
# model = TextCNN(input_size=len(TEXT.vocab), embedding_size=embedding_size, output_size=len(LABEL.vocab), pooling_method='avg')
model = WordAVGModel(vocab_size=len(TEXT.vocab),
Example n. 10
    def __init__(self, args):

        path = '../data/squad'

        logging.info(
            "Preprocessing Data - First Phase  :: Reading And Transforming")

        self.preprocess('{}/{}'.format(path, args.Train_File))
        self.preprocess('{}/{}'.format(path, args.Dev_File))

        self.RAW = data.RawField()
        self.RAW.is_target = False

        self.CHAR_NESTING = data.Field(batch_first=True,
                                       tokenize=list,
                                       lower=True)
        self.CHAR = data.NestedField(self.CHAR_NESTING, tokenize=word_tokenize)
        self.WORD = data.Field(batch_first=True,
                               tokenize=word_tokenize,
                               lower=True,
                               include_lengths=True)
        self.LABEL = data.Field(sequential=False,
                                unk_token=None,
                                use_vocab=False)

        dict_fields = {
            'qid': ('qid', self.RAW),
            'start_idx': ('start_idx', self.LABEL),
            'end_idx': ('end_idx', self.LABEL),
            'context': [('c_word', self.WORD), ('c_char', self.CHAR)],
            'question': [('q_word', self.WORD), ('q_char', self.CHAR)]
        }

        logging.info("Preprocessing Data - Second Phase :: To Torchtext")

        self.train, self.dev = data.TabularDataset.splits(path=path, train=args.Train_File + 'l',  \
                                                          validation=args.Dev_File + 'l', format='json', fields=dict_fields)
        if args.Max_Token_Length > 0:
            self.train.examples = [
                e for e in self.train.examples
                if len(e.c_word) <= args.Max_Token_Length
            ]

        logging.info(
            "Preprocessing Data - Third Phase  :: Building Vocabulary")

        self.CHAR.build_vocab(self.train, self.dev)
        self.WORD.build_vocab(self.train,
                              self.dev,
                              vectors=GloVe(name='6B', dim=args.Word_Dim))

        logging.info("Preprocessing Data - Fourth Phase :: Building Itertors")

        device = torch.device(
            "cuda:{}".format(args.GPU) if torch.cuda.is_available() else "cpu")

        self.train_iter = data.BucketIterator(
            dataset=self.train,
            batch_size=args.Batch_Size)  # sort_key = lambda x : len(x.c_word)

        self.dev_iter = data.BucketIterator(dataset=self.dev, batch_size=10)
    def getBucketIter(self, dataset, **kwargs):
        if 'device' not in kwargs:
            kwargs = dict(kwargs, device=self.device)
        else:
            kwargs = dict(kwargs)
        return data.BucketIterator(dataset, **kwargs)
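    # Minimal usage sketch (the loader instance and train_set dataset below are
    # hypothetical): any BucketIterator keyword can be passed through, and the device
    # defaults to self.device unless overridden.
    #
    #     train_iter = loader.getBucketIter(train_set, batch_size=32, shuffle=True,
    #                                       sort_key=lambda x: len(x.text))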
Example n. 12
                           fix_length=config['max_seq_length'])
    LABEL = torchdata.Field(use_vocab=False,
                            sequential=False,
                            preprocessing=lambda x: int(x),
                            is_target=True)

    train_dataset, test_dataset = torchdata.TabularDataset.splits(
        path=config['dataset_path'],
        train=config['dataset_train'],
        test=config['dataset_test'],
        format='tsv',
        fields=[('label', LABEL), ('text', TEXT)])

    train_iterator = torchdata.BucketIterator(train_dataset,
                                              batch_size=config['batch_size'],
                                              sort_key=lambda x: len(x.text),
                                              device=device,
                                              sort_within_batch=False)
    test_iterator = torchdata.BucketIterator(
        test_dataset,
        batch_size=config['test_batch_size'],
        sort_key=lambda x: len(x.text),
        device=device,
        sort_within_batch=False)

    TEXT.build_vocab(train_dataset)
    LABEL.build_vocab(train_dataset)

    num_classes, weights = get_weights(
        [e.label for e in train_dataset.examples], config)
    bert_config = BertConfig(vocab_size_or_config_json_file=32000,
Example n. 13
    test = data.Dataset(counter_test, fields=[('sentence', TEXT),
                                              ('adj', None), ('trigger', TEXT), ('trigger_index', TRIGGERINDEX),
                                              ('eep', EEP), ('index', INDEX)])

    for_vocab = data.Dataset(for_vocab, fields=[('sentence', TEXT),
                                                ('adj', None), ('trigger', None), ('trigger_index', None),
                                                ('eep', None), ('index', None)])

    TEXT.build_vocab(for_vocab, vectors='glove.6B.100d')  # , max_size=30000)
    TEXT.vocab.vectors.unk_init = init.xavier_uniform
    print(TEXT.vocab.vectors.shape)
    print()

    train_iter = data.BucketIterator(train, batch_size=64, train=True,
                                     sort_within_batch=True,
                                     sort_key=lambda x: (len(x.sentence)), repeat=False,
                                     device='cuda')

    for batch in train_iter:
        print(batch)
        for i in batch.index:
            print(len(counter[i].sentence))
            print(batch.sentence.shape[0])
            assert len(counter[i].sentence) <= batch.sentence.shape[0]

        x = batch.sentence.t()
        adj = []
        trigger = batch.trigger_index.t().flatten()

        count = 0
        for ind in batch.index:
Example n. 14
    def __init__(self, args):
        path = './data/squad'
        dataset_path = path + '/torchtext/'
        train_examples_path = dataset_path + 'train_examples.pt'
        dev_examples_path = dataset_path + 'dev_examples.pt'

        print("preprocessing data files...")
        if not os.path.exists('{}/{}l'.format(path, args.train_file)):
            self.preprocess_file('{}/{}'.format(path, args.train_file))
        if not os.path.exists('{}/{}l'.format(path, args.dev_file)):
            self.preprocess_file('{}/{}'.format(path, args.dev_file))

        self.RAW = data.RawField()
        # explicit declaration for torchtext compatibility
        self.RAW.is_target = False
        self.CHAR_NESTING = data.Field(batch_first=True,
                                       tokenize=list,
                                       lower=True)
        self.CHAR = data.NestedField(self.CHAR_NESTING, tokenize=word_tokenize)
        self.WORD = data.Field(batch_first=True,
                               tokenize=word_tokenize,
                               lower=True,
                               include_lengths=True)
        self.LABEL = data.Field(sequential=False,
                                unk_token=None,
                                use_vocab=False)

        dict_fields = {
            'id': ('id', self.RAW),
            's_idx': ('s_idx', self.LABEL),
            'e_idx': ('e_idx', self.LABEL),
            'context': [('c_word', self.WORD), ('c_char', self.CHAR)],
            'question': [('q_word', self.WORD), ('q_char', self.CHAR)]
        }

        list_fields = [('id', self.RAW), ('s_idx', self.LABEL),
                       ('e_idx', self.LABEL), ('c_word', self.WORD),
                       ('c_char', self.CHAR), ('q_word', self.WORD),
                       ('q_char', self.CHAR)]

        if os.path.exists(dataset_path):
            print("loading splits...")
            train_examples = torch.load(train_examples_path)
            dev_examples = torch.load(dev_examples_path)

            self.train = data.Dataset(examples=train_examples,
                                      fields=list_fields)
            self.dev = data.Dataset(examples=dev_examples, fields=list_fields)
        else:
            print("building splits...")
            self.train, self.dev = data.TabularDataset.splits(
                path=path,
                train='{}l'.format(args.train_file),
                validation='{}l'.format(args.dev_file),
                format='json',
                fields=dict_fields)

            os.makedirs(dataset_path)
            torch.save(self.train.examples, train_examples_path)
            torch.save(self.dev.examples, dev_examples_path)

        # cut overly long contexts from the training set for efficiency
        # print(self.train.examples[0].c_word)
        if args.context_threshold > 0:
            self.train.examples = [
                e for e in self.train.examples
                if len(e.c_word) <= args.context_threshold
            ]

        print("building vocab...")
        self.CHAR.build_vocab(self.train, self.dev)
        self.WORD.build_vocab(self.train,
                              self.dev,
                              vectors=GloVe(name='6B', dim=args.word_dim))

        print("building iterators...")
        device = torch.device(
            "cuda:{}".format(args.gpu) if torch.cuda.is_available() else "cpu")
        self.train_iter = data.BucketIterator(self.train,
                                              batch_size=args.train_batch_size,
                                              device=device,
                                              repeat=True,
                                              shuffle=True,
                                              sort_key=lambda x: len(x.c_word))

        self.dev_iter = data.BucketIterator(self.dev,
                                            batch_size=args.dev_batch_size,
                                            device=device,
                                            repeat=False,
                                            sort_key=lambda x: len(x.c_word))
Example n. 15
def predict():

    predict_cfg = get_predict_args()
    device = get_device()
    print(device)

    # load checkpoint
    ckpt_path = find_ckpt_in_directory(predict_cfg.ckpt)
    ckpt = torch.load(ckpt_path, map_location=device)
    cfg = ckpt["cfg"]

    # to know which words to map to UNK we need the GloVe vocabulary
    glove_words = load_glove_words(cfg.word_vectors)

    # load data sets
    print("Loading data... ", end="")
    input_field, label_field, not_in_glove = get_data_fields(glove_words)
    train_data, dev_data, test_data = SNLI.splits(input_field, label_field)
    print("Done")
    print("Words not in glove:", len(not_in_glove))

    # build vocabulary (deterministic so no need to load it)
    input_field.build_vocab(train_data,
                            dev_data,
                            test_data,
                            vectors=None,
                            vectors_cache=None)
    label_field.build_vocab(train_data)

    # construct model
    model = build_model(cfg, input_field.vocab)

    # load parameters from checkpoint into model
    print("Loading saved model..")
    model.load_state_dict(ckpt["model"])
    print("Done")

    train_iter = data.BucketIterator(
        train_data,
        batch_size=cfg.batch_size,
        train=False,
        repeat=False,
        device=device if torch.cuda.is_available() else -1)

    dev_iter = data.BucketIterator(
        dev_data,
        batch_size=cfg.batch_size,
        train=False,
        repeat=False,
        device=device if torch.cuda.is_available() else -1)

    test_iter = data.BucketIterator(
        test_data,
        batch_size=cfg.batch_size,
        train=False,
        repeat=False,
        device=device if torch.cuda.is_available() else -1)

    print_config(cfg)

    print("Embedding variance:", torch.var(model.embed.weight).item())
    model.to(device)

    print_parameters(model)
    print(model)

    # switch model to evaluation mode
    model.eval()
    train_iter.init_epoch()
    dev_iter.init_epoch()
    test_iter.init_epoch()

    criterion = nn.CrossEntropyLoss(reduction='sum')

    print("Starting evaluation..")
    eval_list = [("train", train_iter), ("dev", dev_iter), ("test", test_iter)]
    for name, it in eval_list:
        eval_result = evaluate(model, criterion, it)
        eval_str = make_kv_string(eval_result)
        print("# Evaluation {}: {}".format(name, eval_str))

    # extract attention maps and predictions on dev examples and save them
    dev_iter.init_epoch()
    p2h, h2p, prems, hypos, predictions, targets = extract_attention(
        model, dev_iter, input_field.vocab, label_field.vocab)
    np.savez(os.path.join(cfg.save_path, "dev_items"),
             p2h=p2h,
             h2p=h2p,
             prems=prems,
             hypos=hypos,
             predictions=predictions,
             targets=targets)

    # print dev examples for highscore
    dev_iter.init_epoch()
    dev_dir = os.path.join(cfg.save_path, "dev")
    if not os.path.exists(dev_dir):
        os.makedirs(dev_dir)
    print_examples(model,
                   dev_iter,
                   input_field.vocab,
                   label_field.vocab,
                   dev_dir,
                   0,
                   n=-1)
Example n. 16
     ('plot_score', PLOT_SCORE), ('image_score', IMAGE_SCORE), ('music_score', MUSIC_SCORE),
     ('actors_score', ACTORS_SCORE), ('name0', None)], skip_header=True)


train, val = get_dataset(union_toloka_result_proc_path).split()
golden_train = get_dataset(union_golden_proc_path2)
TEXT.build_vocab(train, max_size=30000)

model_path = "./models/model"
rnn_model = MultiModel(model=BiLSTMClassifier(300, len(TEXT.vocab.stoi), 256, 2).to(device))
# rnn_model.load_state_dict(torch.load(model_path))


batch_size = 32
train_iter, val_iter = data.BucketIterator.splits(
    (train, val), sort_key=lambda x: len(x.text),
    batch_sizes=(batch_size, batch_size), device=device)
golden_iter = data.BucketIterator(golden_train, sort_key=lambda x: len(x.text), batch_size=batch_size, device=device)

criterion_cls = nn.BCEWithLogitsLoss().to(device)
criterion_scores = nn.MSELoss(reduction='none').to(device)
criterion_scores_l1 = nn.L1Loss(reduction='none').to(device)

rnn_model = MultiModel(model=BiLSTMClassifier(300, len(TEXT.vocab.stoi), 256, 2).to(device))

optimizer = optim.Adam([param for param in rnn_model.model.parameters() if param.requires_grad])
fit(rnn_model, criterion_cls, criterion_scores, optimizer, train_iter, epochs_count=30, val_data=val_iter)
torch.save(rnn_model.model.state_dict(), model_path)

do_eval_epoch(rnn_model, None, criterion_scores_l1, val_iter)
Example n. 17

sentences = data.Field(lower=True, tokenize=tokenizer)
ans = data.Field(sequential=False)

train, dev, test = datasets.SNLI.splits(sentences, ans)

sentences.build_vocab(train, dev, test, min_freq=3)
ans.build_vocab(train, dev, test)
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

Batch_Size = 128
test_iter = data.BucketIterator(test, batch_size=Batch_Size, shuffle=False)

n_layer = 1


class My_RNN(nn.Module):
    def __init__(self, embed_dim, hidden_dim, drop_p):
        super(My_RNN, self).__init__()
        self.rnn = nn.LSTM(input_size=embed_dim,
                           hidden_size=hidden_dim,
                           num_layers=n_layer,
                           dropout=drop_p,
                           bidirectional=True)

    def forward(self, inputs):
        batch_size = inputs.size()[1]
Example n. 18
def get_data_iter():
    # tokenizer used only to build the character vocabulary
    def char_vocab_tokenizer(sentence):
        c_lists = [[c for c in word] for word in sentence.strip().split()]
        return list(_flatten(c_lists))

    def tag_tokenizer(x):
        rel = [int(tag) for tag in x.split()]
        return rel

    def _get_dataset(csv_data, char_to_idx, seq, tag, char_, char_len):
        examples = []
        fields = [('Seq', seq), ('Tag', tag), ('Char_', char_),
                  ('Char_len', char_len)]
        for seq, tag in zip(csv_data['Seq'], csv_data['Tag']):
            char_list = [[char_to_idx[c] for c in word]
                         for word in seq.strip().split()]
            char_len_list = [len(word) for word in seq.strip().split()]
            examples.append(
                data.Example.fromlist(
                    [seq, tag,
                     pad_char_list(char_list), char_len_list], fields))
        return examples, fileds

    seq = data.Field(sequential=True, use_vocab=True, lower=True)
    tag = data.Field(sequential=True,
                     lower=False,
                     use_vocab=False,
                     tokenize=tag_tokenizer)
    char_ = data.Field(sequential=True, use_vocab=False, batch_first=True)
    char_len = data.Field(sequential=True, use_vocab=False, batch_first=True)
    char_vocab = data.Field(sequential=True,
                            use_vocab=True,
                            tokenize=char_vocab_tokenizer)  # used only to build the character vocabulary
    get_charvocab_fields = [('Seq', char_vocab), ('None', None),
                            ('None', None)]
    train = data.TabularDataset.splits(path='./Dataset',
                                       train='train.csv',
                                       format='csv',
                                       skip_header=True,
                                       fields=get_charvocab_fields)[0]
    char_vocab.build_vocab(train)  # character vocabulary
    # build the Dataset objects
    train_data = pd.read_csv('./Dataset/train.csv')
    val_data = pd.read_csv('./Dataset/valid.csv')
    test_data = pd.read_csv('./Dataset/test.csv')
    train_dataset = data.Dataset(*_get_dataset(
        train_data, char_vocab.vocab.stoi, seq, tag, char_, char_len))
    val_dataset = data.Dataset(*_get_dataset(val_data, char_vocab.vocab.stoi,
                                             seq, tag, char_, char_len))
    test_dataset = data.Dataset(*_get_dataset(test_data, char_vocab.vocab.stoi,
                                              seq, tag, char_, char_len))
    # build the word vocabulary
    seq.build_vocab(
        train_dataset,
        vectors=torchtext.vocab.Vectors(name='./Dataset/glove.6B.200d.txt'))
    # build the data iterators
    train_iter = data.BucketIterator(train_dataset,
                                     batch_size=1,
                                     shuffle=True,
                                     sort_key=lambda x: len(x.Seq),
                                     device=tc.device('cpu'))
    val_iter, test_iter = data.BucketIterator.splits(
        (val_dataset, test_dataset),
        batch_sizes=(1, 1),
        shuffle=False,
        repeat=False,
        sort=False,
        device=tc.device('cpu'))
    return seq, char_vocab, train_iter, test_iter, val_iter
Example n. 19
    def load_data(self, train_file, test_file, val_file=None):
        '''
        Loads the data from files
        Sets up iterators for training, validation and test data
        Also create vocabulary and word embeddings based on the data

        Inputs:
            train_file (String): absolute path to training file
            test_file (String): absolute path to test file
            val_file (String): absolute path to validation file
        '''
        # Loading Tokenizer
        NLP = spacy.load('en')

        def tokenizer(sent):
            return list(x.text for x in NLP.tokenizer(sent) if x.text != " ")

        # Creating Filed for data
        TEXT = data.Field(sequential=True,
                          tokenize=tokenizer,
                          lower=True,
                          fix_length=self.config.max_sen_len)
        LABEL = data.Field(sequential=False, use_vocab=False)
        datafields = [("text", TEXT), ("label", LABEL)]

        # Load data from pd.DataFrame into torchtext.data.Dataset
        train_df = self.get_pandas_df(train_file)
        train_examples = [
            data.Example.fromlist(i, datafields)
            for i in train_df.values.tolist()
        ]
        train_data = data.Dataset(train_examples, datafields)

        test_df = self.get_pandas_df(test_file)
        test_examples = [
            data.Example.fromlist(i, datafields)
            for i in test_df.values.tolist()
        ]
        test_data = data.Dataset(test_examples, datafields)

        # If validation file exists, load it. Otherwise get validation data
        # from training data
        if val_file:
            val_df = self.get_pandas_df(val_file)
            val_examples = [
                data.Example.fromlist(i, datafields)
                for i in val_df.values.tolist()
            ]
            val_data = data.Dataset(val_examples, datafields)
        else:
            train_data, val_data = train_data.split(split_ratio=0.8)

        TEXT.build_vocab(train_data)
        self.vocab = TEXT.vocab

        self.train_iterator = data.BucketIterator(
            train_data,
            batch_size=self.config.batch_size,
            sort_key=lambda x: len(x.text),
            repeat=False,
            shuffle=True)

        self.val_iterator, self.test_iterator = data.BucketIterator.splits(
            (val_data, test_data),
            batch_size=self.config.batch_size,
            sort_key=lambda x: len(x.text),
            repeat=False,
            shuffle=False)

        print("Loaded {} training examples".format(len(train_data)))
        print("Loaded {} test examples".format(len(test_data)))
        print("Loaded {} validation examples".format(len(val_data)))
Example n. 20
def main():
    ###############################
    # PREPROCESSING
    ###############################
    datasets = ["train", "val", "test"]
    for dataset in datasets:
        if not os.path.exists(os.path.join("data", dataset + ".tsv")):
            print("Creating TSV for " + dataset)
            convert_to_tsv(dataset)

    print("Creating datasets", end='', flush=True)
    curr_time = datetime.now()

    article_field = data.ReversibleField(tensor_type=torch.cuda.LongTensor,
                                         lower=True,
                                         tokenize=tokenizer_in)
    summary_field = data.ReversibleField(tensor_type=torch.cuda.LongTensor,
                                         lower=True,
                                         tokenize=tokenizer_out,
                                         init_token='<sos>')
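    # ReversibleField retains enough information to map index tensors back to text via
    # field.reverse(batch); evaluate() relies on this through rev_field at the end.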

    train_set = data.TabularDataset(path='./data/train.tsv',
                                    format='tsv',
                                    fields=[('article', article_field),
                                            ('summary', summary_field)])
    val_set = data.TabularDataset(path='./data/val.tsv',
                                  format='tsv',
                                  fields=[('article', article_field),
                                          ('summary', summary_field)])

    diff_time, curr_time = get_time_diff(curr_time)
    print(", took {} min".format(diff_time))

    print("Building vocabulary and creating batches", end='', flush=True)
    article_field.build_vocab(train_set,
                              vectors="glove.6B.100d",
                              max_size=encoder_vocab_size)
    summary_field.build_vocab(train_set, max_size=decoder_vocab_size)

    train_iter = data.BucketIterator(dataset=train_set,
                                     batch_size=batch_size,
                                     sort_key=lambda x: len(x.article),
                                     repeat=False,
                                     device=DEVICE)
    val_iter = data.BucketIterator(dataset=val_set,
                                   batch_size=batch_size,
                                   sort_key=lambda x: len(x.article),
                                   repeat=False,
                                   device=DEVICE)

    diff_time, curr_time = get_time_diff(curr_time)
    print(", took {} min".format(diff_time))
    ###############################
    # MODEL CREATION
    ###############################
    print("Creating encoder and decoder models", end='', flush=True)
    encoder = EncoderLSTM(input_size=encoder_vocab_size,
                          embed_size=embed_size,
                          hidden_size=encoder_hidden_size,
                          use_gpu=True,
                          gpu_device=DEVICE,
                          batch_size=batch_size)
    encoder.embedding.weight.data = article_field.vocab.vectors
    encoder.cuda(device=DEVICE)

    decoder = AttnDecoderLSTM(input_size=encoder_vocab_size,
                              embed_size=embed_size,
                              hidden_size=decoder_hidden_size,
                              output_size=decoder_vocab_size,
                              use_gpu=True,
                              gpu_device=DEVICE,
                              batch_size=batch_size)
    decoder.embedding.weight.data = article_field.vocab.vectors
    decoder.cuda(device=DEVICE)
    diff_time, curr_time = get_time_diff(curr_time)
    print(", took {} min".format(diff_time))

    # Loss and SGD optimizers
    loss_func = nn.NLLLoss(ignore_index=1)  # Ignore <pad> token
    encoder_opt = optim.Adam(encoder.parameters(), lr=lr)
    decoder_opt = optim.Adam(decoder.parameters(), lr=lr)

    ###############################
    # TRAINING
    ###############################
    print("Beginning training")
    tqdm_epoch = tqdm(range(num_epochs), desc="Epoch")
    for epoch in tqdm_epoch:
        train_iter.init_epoch()
        tqdm_batch = tqdm(train_iter, desc="Batch")
        for b_id, batch in enumerate(tqdm_batch):
            encoder.batch_size = batch.batch_size  # Fixes weird bug where we get batch sizes that are not batch_size
            decoder.batch_size = batch.batch_size
            avg_loss = train(batch, encoder, decoder, encoder_opt, decoder_opt,
                             loss_func, teacher_forcing_ratio)

    ###############################
    # TESTING
    ###############################
    # Load test set
    print("Loading test set")
    test_set = data.TabularDataset(path='./data/test.tsv',
                                   format='tsv',
                                   fields=[('article', article_field),
                                           ('summary', summary_field)])
    test_iter = data.BucketIterator(dataset=test_set,
                                    batch_size=batch_size,
                                    sort_key=lambda x: len(x.article),
                                    repeat=False,
                                    device=DEVICE)
    print("Evaluating model")
    evaluate(encoder=encoder,
             decoder=decoder,
             dataset=test_iter,
             rev_field=article_field)
Example n. 21
                    max_size=30000,
                    vectors="glove.6B.300d",
                    unk_init=torch.Tensor.normal_)
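    # unk_init=torch.Tensor.normal_ fills the embeddings of words missing from the
    # pretrained GloVe vectors with random normals instead of zeros.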
    LABEL.build_vocab(train_data)

    PAD_INDEX = SRC.vocab.stoi[PAD_TOKEN]
    # SOS_INDEX = SRC.vocab.stoi[SOS_TOKEN]
    # EOS_INDEX = SRC.vocab.stoi[EOS_TOKEN]
    # print(LABEL.vocab.freqs.most_common(10))

    #############################
    # define iterator
    train_iter = data.BucketIterator(train_data,
                                     batch_size=params['BATCH_SIZE'],
                                     device=DEVICE,
                                     sort_within_batch=True,
                                     sort_key=lambda x: len(x.text),
                                     train=True,
                                     repeat=False)

    # train_iter = data.Iterator(train_data, batch_size=1, train=False, sort=False, repeat=False, device=DEVICE)

    valid_iter = data.Iterator(valid_data,
                               batch_size=1,
                               train=False,
                               sort=False,
                               repeat=False,
                               device=DEVICE)

    test_iter = data.Iterator(test_data,
                              batch_size=1,
Example n. 22
def load_pairs():

    TEXT1 = data.Field(fix_length=500)
    TEXT2 = data.Field(fix_length=500)
    LABEL = data.Field(sequential=False,
                       is_target=True,
                       use_vocab=False,
                       dtype=torch.float64)
    ID = data.Field(sequential=False,
                    is_target=True,
                    use_vocab=False,
                    dtype=torch.float64)
    ONEHOT = data.Field(sequential=False,
                        is_target=True,
                        use_vocab=False,
                        dtype=torch.float32)

    # TEXT1 is a Field object; the value placed in the fields mapping must be that object, not plain text
    field = {
        'label': ('label', LABEL),
        'text1': ('text1', TEXT1),
        'text2': ('text2', TEXT2),
        'onehot1': ('onehot1', ONEHOT),
        'onehot2': ('onehot2', ONEHOT)
    }
    field1 = {
        'id': ('id', ID),
        'text': ('text', TEXT1),
        'label': ('label', LABEL),
        'onehot': ('onehot', ONEHOT)
    }
    # train_pairs is built from the field dict, so each example has a text1 attribute processed by TEXT1
    train_pairs, valid_pairs = data.TabularDataset.splits(  # split the corpus
        path='./data/',
        train='train_pairs.json',
        validation='val_pairs.json',
        format='json',
        fields=field)
    train_data, test_data = data.TabularDataset.splits(
        path='./data/',
        train='compare_data_5.json',
        test='test_data.json',
        format='json',
        fields=field1)

    vectors = torchtext.vocab.Vectors(name='./data/fasttext.vec')
    TEXT1.build_vocab(train_pairs, vectors=vectors)
    # build_vocab builds the corpus vocabulary and loads the word embeddings:
    # the vectors for words in the current corpus are extracted from the pretrained
    # vectors and assembled into the embedding matrix automatically
    TEXT2.build_vocab(train_pairs, vectors=vectors)

    print('Length of TEXT1 Vocabulary:' + str(len(TEXT1.vocab)))
    print('Length of TEXT2 Vocabulary:' + str(len(TEXT2.vocab)))
    print('Dim of TEXT1,TEXT2:',
          TEXT1.vocab.vectors.size()[1],
          TEXT2.vocab.vectors.size()[1])

    train_pairs_iter, valid_pairs_iter = data.BucketIterator.splits(
        (train_pairs, valid_pairs),
        sort=False,
        batch_size=100,
        repeat=False,
        shuffle=True,
        device=torch.device('cuda:0'))
    train_data_iter = data.BucketIterator(train_data,
                                          sort=False,
                                          batch_size=5,
                                          repeat=False,
                                          shuffle=False,
                                          device=torch.device('cuda:0'))
    test_data_iter = data.BucketIterator(test_data,
                                         sort=False,
                                         batch_size=100,
                                         repeat=False,
                                         shuffle=True,
                                         device=torch.device('cuda:0'))

    return train_pairs_iter, valid_pairs_iter, train_data_iter, test_data_iter
    trg_sen_in = batch.trg[0][:,:-1] # skip eos
    trg_sen = batch.trg[0][:,1:] # skip sos
    preds = model(src_sen, batch.src[1].cpu().numpy(), trg_sen_in)
    return src_sen, trg_sen, preds

if __name__ == "__main__":
    eng_field, fren_field, (train, val, test) = load_data()
    model = Seq2Seq_Translation(eng_field, fren_field)
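    # Zeroing the <pad> class weight makes NLLLoss ignore padding positions in the
    # target, so padded timesteps contribute nothing to the loss.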
    trg_mask = torch.ones(len(eng_field.vocab))
    trg_mask[eng_field.vocab.stoi["<pad>"]] = 0
    criterion = nn.NLLLoss(weight=trg_mask)

    optimizer = optim.Adam(model.parameters(), lr=5e-4)
    scheduler = optim.lr_scheduler.StepLR(optimizer, 15)

    train_iter = data.BucketIterator(train, batch_size=64, sort_key=lambda ex: len(ex.src), sort_within_batch=True)
    examples = iter(data.BucketIterator(val, batch_size=1, train=False, shuffle=True, repeat=True))

    for epoch in range(20):
        scheduler.step()
        model.train()
        for i, batch in enumerate(train_iter):
            src_sen, trg_sen, preds = batch_forward(batch)
            loss = criterion(preds.contiguous().view(-1,preds.size(2)), trg_sen.contiguous().view(-1))
            # writer.add_scalar('data/train_loss', loss.data[0], len(train_iter)*epoch + i)
            optimizer.zero_grad()
            loss.backward()
            clip_grad_norm(model.parameters(), 5.0)
            optimizer.step()
            if i == len(train_iter)-1:
                break
Example n. 24
    def get_iterator(self, dataset):
        return data.BucketIterator(dataset,
                                   batch_size=self.params['batch_size'],
                                   shuffle=False)
Example n. 25
def load_dataset(config, train_pos='train.pos', train_neg='train.neg',
                 dev_pos='dev.pos', dev_neg='dev.neg',
                 test_pos='test.pos', test_neg='test.neg'):

    root = config.data_path
    
    roots = re.split(', +', root)
    if len(roots) > 1:
        logger.info("Combining datasets...")
        files = {'train.pos':[], 'train.neg':[], 'dev.pos':[], \
                 'dev.neg':[], 'test.pos':[], 'test.neg':[]}
        for dir_path in roots:
            for file in files.keys():
                with open(dir_path + file, 'r', encoding='utf8') as f:
                    files[file].extend(f.readlines())
        
        for file, sents in files.items():
            with open('./data/style_transfer/%s' % file, 'w', encoding='utf8') as f:
                for sent in sents:
                    f.write('%s' % sent)
        root = './data/style_transfer/'
    
    TEXT = data.Field(batch_first=True, eos_token='<eos>')
    
    dataset_fn = lambda name: data.TabularDataset(
        path=root + name,
        format='tsv',
        fields=[('text', TEXT)]
    )

    train_pos_set, train_neg_set = map(dataset_fn, [train_pos, train_neg])
    dev_pos_set, dev_neg_set = map(dataset_fn, [dev_pos, dev_neg])
    test_pos_set, test_neg_set = map(dataset_fn, [test_pos, test_neg])

    TEXT.build_vocab(train_pos_set, train_neg_set, min_freq=config.min_freq)

    if config.load_pretrained_embed:
        start = time.time()
        
        vectors=torchtext.vocab.GloVe('6B', dim=config.embed_size, cache=config.pretrained_embed_path)
        TEXT.vocab.set_vectors(vectors.stoi, vectors.vectors, vectors.dim)
        print('vectors', TEXT.vocab.vectors.size())
        
        print('load embedding took {:.2f} s.'.format(time.time() - start))

    vocab = TEXT.vocab
        
    dataiter_fn = lambda dataset, train: data.BucketIterator(
        dataset=dataset,
        batch_size=config.batch_size,
        shuffle=train,
        repeat=train,
        sort_key=lambda x: len(x.text),
        sort_within_batch=False,
        device=config.device
    )
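    # repeat=train keeps the two training iterators cycling indefinitely, while the
    # dev/test iterators make a single pass over their data.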

    train_pos_iter, train_neg_iter = map(lambda x: dataiter_fn(x, True), [train_pos_set, train_neg_set])
    dev_pos_iter, dev_neg_iter = map(lambda x: dataiter_fn(x, False), [dev_pos_set, dev_neg_set])
    test_pos_iter, test_neg_iter = map(lambda x: dataiter_fn(x, False), [test_pos_set, test_neg_set])

    train_iters = DatasetIterator(train_pos_iter, train_neg_iter)
    dev_iters = DatasetIterator(dev_pos_iter, dev_neg_iter)
    test_iters = DatasetIterator(test_pos_iter, test_neg_iter)
    
    return train_iters, dev_iters, test_iters, vocab
Example n. 26
    for split in ["train", "val", "test"]:
        my_data[split] = datasets.TranslationDataset(path="data/new_" + split,
                                                     exts=('.nl', '.amr'),
                                                     fields=(NL_SRC, AMR_SRC))
    MIN_FREQ = 5
    NL_SRC.build_vocab(my_data["train"].src, min_freq=MIN_FREQ)
    AMR_SRC.build_vocab(my_data["train"].trg, min_freq=MIN_FREQ)

    PAD_INDEX = AMR_SRC.vocab.stoi[PAD_TOKEN]

    print_data_info(my_data, NL_SRC, AMR_SRC)
    train_iter = data.BucketIterator(my_data["train"],
                                     batch_size=BATCH_SIZE,
                                     train=True,
                                     sort_within_batch=True,
                                     sort_key=lambda x:
                                     (len(x.src), len(x.trg)),
                                     repeat=False,
                                     device=DEVICE)

    valid_iter = data.Iterator(my_data["val"],
                               batch_size=1,
                               train=False,
                               sort=False,
                               repeat=False,
                               device=DEVICE)

    model = make_autoencoder(len(NL_SRC.vocab),
                             len(AMR_SRC.vocab),
                             emb_size=500,
                             hidden_size=500,
Example n. 27
def main():
    args_parser = argparse.ArgumentParser(description='Tuning with graph-based parsing')
    args_parser.add_argument('--cuda', action='store_true', help='using GPU')
    args_parser.add_argument('--num_epochs', type=int, default=200, help='Number of training epochs')
    args_parser.add_argument('--batch_size', type=int, default=64, help='Number of sentences in each batch')
    args_parser.add_argument('--hidden_size', type=int, default=256, help='Number of hidden units in RNN')
    args_parser.add_argument('--num_layers', type=int, default=1, help='Number of layers of RNN')
    args_parser.add_argument('--opt', choices=['adam', 'sgd', 'adamax'], help='optimization algorithm')
    args_parser.add_argument('--objective', choices=['cross_entropy', 'crf'], default='cross_entropy',
                             help='objective function of training procedure.')
    args_parser.add_argument('--learning_rate', type=float, default=0.01, help='Learning rate')
    args_parser.add_argument('--decay_rate', type=float, default=0.05, help='Decay rate of learning rate')
    args_parser.add_argument('--clip', type=float, default=5.0, help='gradient clipping')
    args_parser.add_argument('--gamma', type=float, default=0.0, help='weight for regularization')
    args_parser.add_argument('--epsilon', type=float, default=1e-8, help='epsilon for adam or adamax')
    args_parser.add_argument('--p_rnn', nargs=2, type=float, default=0.1, help='dropout rate for RNN')
    args_parser.add_argument('--p_in', type=float, default=0.33, help='dropout rate for input embeddings')
    args_parser.add_argument('--p_out', type=float, default=0.33, help='dropout rate for output layer')
    args_parser.add_argument('--schedule', type=int, help='schedule for learning rate decay')
    args_parser.add_argument('--unk_replace', type=float, default=0.,
                             help='The rate to replace a singleton word with UNK')
    #args_parser.add_argument('--punctuation', nargs='+', type=str, help='List of punctuations')
    args_parser.add_argument('--word_path', help='path for word embedding dict')
    args_parser.add_argument('--freeze', action='store_true', help='freeze the word embedding (disable fine-tuning).')
    # args_parser.add_argument('--char_path', help='path for character embedding dict')
    args_parser.add_argument('--train')  # "data/POS-penn/wsj/split1/wsj1.train.original"
    args_parser.add_argument('--dev')  # "data/POS-penn/wsj/split1/wsj1.dev.original"
    args_parser.add_argument('--test')  # "data/POS-penn/wsj/split1/wsj1.test.original"
    args_parser.add_argument('--model_path', help='path for saving model file.', default='models/temp')
    args_parser.add_argument('--model_name', help='name for saving model file.', default='generator')

    args_parser.add_argument('--seq2seq_save_path', default='checkpoints4/seq2seq_save_model', type=str,
                             help='seq2seq_save_path')
    args_parser.add_argument('--seq2seq_load_path', default='checkpoints4/seq2seq_save_model', type=str,
                             help='seq2seq_load_path')
    # args_parser.add_argument('--rl_finetune_seq2seq_save_path', default='models/rl_finetune/seq2seq_save_model',
    #                          type=str, help='rl_finetune_seq2seq_save_path')
    # args_parser.add_argument('--rl_finetune_network_save_path', default='models/rl_finetune/network_save_model',
    #                          type=str, help='rl_finetune_network_save_path')
    # args_parser.add_argument('--rl_finetune_seq2seq_load_path', default='models/rl_finetune/seq2seq_save_model',
    #                          type=str, help='rl_finetune_seq2seq_load_path')
    # args_parser.add_argument('--rl_finetune_network_load_path', default='models/rl_finetune/network_save_model',
    #                          type=str, help='rl_finetune_network_load_path')

    args_parser.add_argument('--direct_eval', action='store_true', help='direct eval without generation process')
    args = args_parser.parse_args()

    spacy_en = spacy.load('en_core_web_sm')  # python -m spacy download en_core_web_sm
    spacy_de = spacy.load('de_core_news_sm')  # python -m spacy download de_core_news_sm
    spacy_fr = spacy.load('fr_core_news_sm')  # python -m spacy download fr_core_news_sm

    SEED = 0
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def tokenizer_en(text):  # create a tokenizer function
        return [tok.text for tok in spacy_en.tokenizer(text)]
    def tokenizer_de(text):  # create a tokenizer function
        return [tok.text for tok in spacy_de.tokenizer(text)]
    def tokenizer_fr(text):  # create a tokenizer function
        return [tok.text for tok in spacy_fr.tokenizer(text)]

    en_field = data.Field(sequential=True, tokenize=tokenizer_en, lower=True, include_lengths=True, batch_first=True)  #use_vocab=False fix_length=10
    de_field = data.Field(sequential=True, tokenize=tokenizer_de, lower=True, include_lengths=True, batch_first=True)  #use_vocab=False
    fr_field = data.Field(sequential=True, tokenize=tokenizer_fr, lower=True, include_lengths=True, batch_first=True)  #use_vocab=False
    print('begin loading training data-----')
    # print('time: ', time.asctime( time.localtime(time.time()) ))
    seq2seq_train_data = MultiSourceTranslationDataset(
        path='wmt14_3/sample', exts=('.de', '.fr', '.en'),
        fields=(de_field, fr_field, en_field))
    print('begin loading validation data-----')
    # print('time: ', time.asctime( time.localtime(time.time()) ))
    seq2seq_dev_data = MultiSourceTranslationDataset(
        path='wmt14_3/test', exts=('.de', '.fr', '.en'),
        fields=(de_field, fr_field, en_field))
    print('end loading data-----')
    # print('time: ', time.asctime( time.localtime(time.time()) ))

    # en_train_data = datasets.TranslationDataset(path='wmt14_3/sample', exts=('.en', '.en'), fields=(en_field, en_field))
    # print('end en data-----')
    # print('time: ', time.asctime( time.localtime(time.time()) ))
    # de_train_data = datasets.TranslationDataset(path='wmt14_3/sample', exts=('.de', '.de'), fields=(de_field, de_field))
    # fr_train_data = datasets.TranslationDataset(path='wmt14_3/sample', exts=('.fr', '.fr'), fields=(fr_field, fr_field))
    # en_field.build_vocab(en_train_data, max_size=80000)  # ,vectors="glove.6B.100d"
    # de_field.build_vocab(de_train_data, max_size=80000)  # ,vectors="glove.6B.100d"
    # fr_field.build_vocab(fr_train_data, max_size=80000)  # ,vectors="glove.6B.100d"
    # vocab_thread = 20000+2
    # with open(str(vocab_thread)+'_vocab_en.pickle', 'rb') as f:
    #     en_field.vocab = pickle.load(f)
    # with open(str(vocab_thread)+'_vocab_de.pickle', 'rb') as f:
    #     de_field.vocab = pickle.load(f)
    # with open(str(vocab_thread)+'_vocab_fr.pickle', 'rb') as f:
    #     fr_field.vocab = pickle.load(f)
    with open('vocab_en.pickle', 'rb') as f:
        en_field.vocab = pickle.load(f)
    with open('vocab_de.pickle', 'rb') as f:
        de_field.vocab = pickle.load(f)
    with open('vocab_fr.pickle', 'rb') as f:
        fr_field.vocab = pickle.load(f)
    print('end build vocab-----')
    # print('time: ', time.asctime( time.localtime(time.time()) ))
    # trg_field.build_vocab(seq2seq_train_data, max_size=80000)
    # mt_dev shares the fields, so it shares their vocab objects

    train_iter = data.BucketIterator(
        dataset=seq2seq_train_data, batch_size=10,
        sort_key=lambda x: data.interleave_keys(len(x.src), len(x.trg)),
        device=device, shuffle=True)  # with this older torchtext API, pass device=-1 to run on CPU
    dev_iter = data.BucketIterator(
        dataset=seq2seq_dev_data, batch_size=10,
        sort_key=lambda x: data.interleave_keys(len(x.src), len(x.trg)),
        device=device, shuffle=False)
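    # data.interleave_keys(a, b) interleaves the bits of the two lengths so that
    # examples similar in both source and target length land in the same bucket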

    num_words_en = len(en_field.vocab.stoi)
    # Pretrain seq2seq model using denoising autoencoder. model name: seq2seq model
    
    EPOCHS = 100  # 150
    DECAY = 0.97
    # TODO: #len(en_field.vocab.stoi)  # ?? word_embedd ??
    word_dim = 300  # ??
    seq2seq = Seq2seq_Model(EMB=word_dim, HID=args.hidden_size, DPr=0.5,
                            vocab_size1=len(de_field.vocab.stoi),
                            vocab_size2=len(fr_field.vocab.stoi),
                            vocab_size3=len(en_field.vocab.stoi),
                            word_embedd=None, device=device).to(device)  # TODO: random init vocab
    # seq2seq.emb.weight.requires_grad = False
    print(seq2seq)

    loss_seq2seq = torch.nn.CrossEntropyLoss(reduction='none').to(device)
    parameters_need_update = filter(lambda p: p.requires_grad, seq2seq.parameters())
    optim_seq2seq = torch.optim.Adam(parameters_need_update, lr=0.0003)

    seq2seq.load_state_dict(torch.load(args.seq2seq_load_path +'_batch_'+ str(2000000) + '.pt'))  # TODO: 10.7
    # torch.save(seq2seq.state_dict(), args.seq2seq_save_path +'_batch_'+ str(ii) + '.pt')
    seq2seq.to(device)

    def count_parameters(model: torch.nn.Module):
        return sum(p.numel() for p in model.parameters() if p.requires_grad)

    print(f'The model has {count_parameters(seq2seq):,} trainable parameters')
    PAD_IDX = en_field.vocab.stoi['<pad>']
    # criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
    ii = 0  # 141500
 
    if True:  # i%1 == 0:
        seq2seq.eval()
        bleu_ep = 0
        acc_numerator_ep = 0
        acc_denominator_ep = 0
        testi = 0
        for _, batch in enumerate(dev_iter):  # for _ in range(1, num_batches + 1):  word, char, pos, heads, types, masks, lengths = conllx_data.get_batch_tensor(data_dev, batch_size, unk_replace=unk_replace)  # word:(32,50)  char:(32,50,35)
            src1, lengths_src1 = batch.src1  # word:(32,50)  150,64
            src2, lengths_src2 = batch.src2  # word:(32,50)  150,64
            trg, lengths_trg = batch.trg
            sel, _ = seq2seq(src1.long().to(device), src2.long().to(device), LEN=max(src1.size()[1], src2.size()[1]))  # TODO:
            sel = sel.detach().cpu().numpy()
            dec_out = trg.cpu().numpy()

            bleus = []


            for j in range(sel.shape[0]):
                bleu = get_bleu(sel[j], dec_out[j], num_words_en)  # sel
                bleus.append(bleu)
                numerator, denominator = get_correct(sel[j], dec_out[j], num_words_en)
                acc_numerator_ep += numerator
                acc_denominator_ep += denominator  # .detach().cpu().numpy() TODO: 10.8
            bleu_bh = np.average(bleus)
            bleu_ep += bleu_bh
            testi += 1
        bleu_ep /= testi  # num_batches
        print('testi: ', testi)
        print('Valid bleu: %.4f%%' % (bleu_ep * 100))
        # print(acc_denominator_ep)
        if acc_denominator_ep > 0:
            print('Valid acc: %.4f%%' % ((acc_numerator_ep * 1.0 / acc_denominator_ep) * 100))
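
get_bleu and get_correct above are project helpers whose implementations are not shown; a rough NLTK-based stand-in for the BLEU part (illustrative only, not the author's code) could be:

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def get_bleu_sketch(pred_ids, gold_ids, pad_id=None):
    # hypothetical stand-in for get_bleu: compare token-id sequences directly,
    # optionally stripping a padding id, with smoothing for short sequences
    pred = [int(t) for t in pred_ids if pad_id is None or int(t) != pad_id]
    gold = [int(t) for t in gold_ids if pad_id is None or int(t) != pad_id]
    return sentence_bleu([gold], pred, smoothing_function=SmoothingFunction().method1)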
Example 28
def load_data(opt):
    # pad/truncate every example to a fixed length of opt.max_text_len
    TEXT = data.Field(sequential=True, fix_length=opt.max_text_len)  # word- or character-level tokens
    LABEL = data.Field(sequential=False, use_vocab=False)

    # load
    # word/ or article/
    train_path = opt.data_path + opt.text_type + '/train_set.csv'
    val_path = opt.data_path + opt.text_type + '/val_set.csv'
    test_path = opt.data_path + opt.text_type + '/test_set.csv'
    # note: the overrides below point train/test/val at the same local file
    train_path = 'D:/git/dataset/val_set.csv'
    test_path = 'D:/git/dataset/val_set.csv'
    val_path = 'D:/git/dataset/val_set.csv'

    # aug for data augmentation
    if opt.aug:
        print('make augmentation datasets!')
    train = GrandDataset(train_path,
                         text_field=TEXT,
                         label_field=LABEL,
                         text_type=opt.text_type,
                         test=False,
                         aug=opt.aug)
    val = GrandDataset(val_path,
                       text_field=TEXT,
                       label_field=LABEL,
                       text_type=opt.text_type,
                       test=False)
    test = GrandDataset(test_path,
                        text_field=TEXT,
                        label_field=None,
                        text_type=opt.text_type,
                        test=True)

    cache = '.vector_cache'
    if not os.path.exists(cache):
        os.mkdir(cache)
    embedding_path = '{}/{}_{}.txt'.format(opt.embedding_path, opt.text_type,
                                           opt.embedding_dim)
    vectors = Vectors(name=embedding_path, cache=cache)
    print('load word2vec vectors from {}'.format(embedding_path))
    vectors.unk_init = init.xavier_uniform_  # initialization for tokens not covered by the pretrained vectors

    # build the vocabulary
    print('building {} vocabulary......'.format(opt.text_type))
    TEXT.build_vocab(train, val, test, min_freq=5, vectors=vectors)
    # LABEL.build_vocab(train)

    # build the iterators
    # for test_iter, shuffle, sort and repeat must all be False, otherwise torchtext scrambles the sample order
    # for variable-length inputs, set sort_within_batch=True so each batch is sorted in descending order by sort_key
    train_iter = data.BucketIterator(dataset=train,
                                     batch_size=opt.batch_size,
                                     shuffle=True,
                                     sort_within_batch=False,
                                     repeat=False,
                                     device=opt.device)
    # val_iter = data.BucketIterator(dataset=val, batch_size=opt.batch_size, sort_within_batch=False, repeat=False,
    #                                device=opt.device)
    # train_iter = data.Iterator(dataset=train, batch_size=opt.batch_size, train=True, repeat=False, device=opt.device)
    val_iter = data.Iterator(dataset=val,
                             batch_size=opt.batch_size,
                             shuffle=False,
                             sort=False,
                             repeat=False,
                             device=opt.device)
    test_iter = data.Iterator(dataset=test,
                              batch_size=opt.batch_size,
                              shuffle=False,
                              sort=False,
                              repeat=False,
                              device=opt.device)

    return train_iter, val_iter, test_iter, len(TEXT.vocab), TEXT.vocab.vectors
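
load_data returns the vocabulary size and the pretrained vector matrix aligned to it; a typical way to consume them on the model side (a sketch assuming an opt namespace like the one the function expects) is:

import torch.nn as nn

train_iter, val_iter, test_iter, vocab_size, vectors = load_data(opt)

# initialize an embedding layer from the word2vec matrix returned above
embedding = nn.Embedding(vocab_size, vectors.size(1))
embedding.weight.data.copy_(vectors)
# equivalently: embedding = nn.Embedding.from_pretrained(vectors, freeze=False)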
Example 29
def caption_iterator(start_token, end_token, pad_token, train_meta_path, val_1_meta_path,
                     val_2_meta_path, min_freq, batch_size, device, phase, use_categories, 
                     use_subs):
    spacy_en = spacy.load('en')
    print(f'Preparing dataset for {phase}')
    
    def tokenize_en(txt):
        return [token.text for token in spacy_en.tokenizer(txt)]
    
    CAPTION = data.ReversibleField(
        tokenize='spacy', init_token=start_token, 
        eos_token=end_token, pad_token=pad_token, lower=True, 
        batch_first=True, is_target=True
    )
    INDEX = data.Field(
        sequential=False, use_vocab=False, batch_first=True
    )
    if use_categories:
        # preprocessing: if there is no category replace with -1 (unique number)
        CATEGORY = data.Field(
            sequential=False, use_vocab=False, batch_first=True, 
            preprocessing=data.Pipeline(lambda x: -1 if len(x) == 0 else int(float(x)))
        )
        # drop examples whose category is missing (-1) or equal to 31 (a class with only one example)
        filter_pred = lambda x: vars(x)['category_32'] != -1 and vars(x)['category_32'] != 31
    else:
        CATEGORY = None
        filter_pred = None
    
    if use_subs:
        SUBS = data.ReversibleField(
            tokenize='spacy', init_token=start_token, 
            eos_token=end_token, pad_token=pad_token, lower=True, 
            batch_first=True
        )
    else:
        SUBS = None
    
    # the order has to be the same as in the table
    fields = [
        ('video_id', None),
        ('caption', CAPTION),
        ('start', None),
        ('end', None),
        ('duration', None),
        ('category_32', CATEGORY),
        ('subs', SUBS),
        ('phase', None),
        ('idx', INDEX),
    ]

    dataset = data.TabularDataset(
        path=train_meta_path, format='tsv', skip_header=True, fields=fields,
        filter_pred=filter_pred
    )
    CAPTION.build_vocab(dataset.caption, min_freq=min_freq)
    train_vocab = CAPTION.vocab
    
    train_subs_vocab = None
    if use_subs:
        SUBS.build_vocab(dataset.subs, min_freq=min_freq)
        train_subs_vocab = SUBS.vocab
        
    if phase == 'val_1':
        dataset = data.TabularDataset(
            path=val_1_meta_path, format='tsv', skip_header=True, fields=fields,
            filter_pred=filter_pred
        )
    elif phase == 'val_2':
        dataset = data.TabularDataset(
            path=val_2_meta_path, format='tsv', skip_header=True, fields=fields, 
            filter_pred=filter_pred
        )
    # sort_key = lambda x: data.interleave_keys(len(x.caption), len(x.caption))
    sort_key = lambda x: 0 #len(x.caption)
    datasetloader = data.BucketIterator(
        dataset, batch_size, sort_key=sort_key, device=device, repeat=False, shuffle=True
    )
    return train_vocab, train_subs_vocab, datasetloader
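
The CATEGORY preprocessing pipeline above maps an empty string to -1 and otherwise parses the value through float and then int; a quick standalone illustration of that behavior:

to_category = lambda x: -1 if len(x) == 0 else int(float(x))
print(to_category(''))      # -1  (missing category)
print(to_category('31.0'))  # 31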
Example 30
def train():
    # Logger.
    logger = helpers.get_logger('training')

    helpers.log_args(logger, args)

    # Prepare training and testing data.
    TEXT = data.Field(lower=True, tokenize=helpers.tokenize, batch_first=True)

    LABEL = data.Field(sequential=False)

    fields = [('label', LABEL), ('text', TEXT)]

    train_set = data.TabularDataset(args.train_file, 'csv', fields)

    logger.info(f'Loaded training data: {args.train_file}')

    TEXT.build_vocab(train_set,
                     max_size=args.max_size,
                     min_freq=args.min_freq,
                     vectors=args.pretrained_embeddings)

    LABEL.build_vocab(train_set)

    train_set, valid_set = helpers.split_data(train_set, fields,
                                              args.random_seed,
                                              args.valid_split)

    logger.info(f'Number of training examples: {len(train_set.examples)}')
    logger.info(f'Number of validation examples: {len(valid_set.examples)}')
    logger.info(f'Size of vocabulary: {len(TEXT.vocab)}')
    logger.info(f'Number of labels: {len(LABEL.vocab)}')

    # Initiate criterion, classifier, and optimizer.
    classifier = CNNClassifier(vocab_size=len(TEXT.vocab),
                               labelset_size=len(LABEL.vocab),
                               embedding_dim=args.embedding_dim,
                               num_layers=args.num_layers,
                               filter_mapping=eval(args.filter_mapping),
                               dropout_prob=args.dropout_prob,
                               pretrained_embeddings=TEXT.vocab.vectors)

    if args.cuda:
        classifier.cuda(device=args.device_id)

    criterion = nn.NLLLoss()
    optimizer = optim.Adam(classifier.parameters(), args.learning_rate)

    iterator = data.BucketIterator(dataset=train_set,
                                   batch_size=args.batch_size,
                                   sort_key=lambda x: len(x.text),
                                   device=args.device_id if args.cuda else -1)

    patience = args.patience
    min_valid_loss = None
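    # with the older torchtext API used here, an iterator built from a training set
    # repeats across epochs by default, so this single loop runs until the explicit
    # epoch check at the bottom breaks out of it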
    for batch in iterator:
        optimizer.zero_grad()

        log_probs = classifier(batch.text)
        loss = criterion(log_probs, batch.label)
        if args.beta > 0:
            loss = loss - args.beta * helpers.calc_entropy(log_probs)

        loss.backward()
        optimizer.step()

        progress, epoch = math.modf(iterator.epoch)

        if iterator.iterations % args.logging_interval == 0:
            valid_loss, accuracy = helpers.evaluate(
                valid_set, args.batch_size, classifier,
                args.device_id if args.cuda else -1)

            logger.info(f'Epoch {int(epoch):2} | '
                        f'progress: {progress:<6.2%} | '
                        f'training loss: {loss.data[0]:6.4f} | '
                        f'validation loss: {valid_loss:6.4f} | '
                        f'validation accuracy: {accuracy:<6.2%} |')

            classifier.train()

            if min_valid_loss is None:
                min_valid_loss = valid_loss

            if valid_loss < min_valid_loss + args.threshold:
                patience = args.patience
                min_valid_loss = min(valid_loss, min_valid_loss)
            else:
                patience -= 1
                if patience == 0:
                    logger.info(
                        f'Patience of {args.patience} reached, decaying learning rate'
                    )
                    helpers.decay_learning_rate(optimizer, args.decay_factor)
                    patience = args.patience

        if epoch == args.num_epochs:
            break

    # Optional testing after training is done.
    if args.test_file is not None:
        test_set = data.TabularDataset(args.test_file, 'csv', fields)

        logger.info(f'Loaded testing data {args.test_file}')

        test_loss, accuracy = helpers.evaluate(
            test_set, args.batch_size, classifier,
            args.device_id if args.cuda else -1)

        logger.info(f'Testing loss: {test_loss:6.4f}')
        logger.info(f'Testing accuracy: {accuracy:<6.2%}')
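
helpers.evaluate is not shown in this example; a rough stand-in, under the assumption that it mirrors the training setup (NLL loss over log-probabilities plus plain accuracy), might look like:

import torch
import torch.nn as nn
from torchtext import data

def evaluate_sketch(dataset, batch_size, classifier, device):
    # hypothetical replacement for helpers.evaluate
    iterator = data.Iterator(dataset, batch_size=batch_size, train=False,
                             sort=False, repeat=False, device=device)
    criterion = nn.NLLLoss()
    classifier.eval()
    total_loss, correct, total = 0.0, 0, 0
    with torch.no_grad():
        for batch in iterator:
            log_probs = classifier(batch.text)
            n = batch.label.size(0)
            total_loss += criterion(log_probs, batch.label).item() * n
            correct += (log_probs.argmax(dim=1) == batch.label).sum().item()
            total += n
    return total_loss / total, correct / total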