Example No. 1
def get_dataset(fix_length=25,
                lower=False,
                vectors=None,
                train_dir='train.csv',
                batch_size=1,
                device=None):

    train = pd.read_csv(train_dir, error_bad_lines=False)
    train['text'] = train['text'].apply(lambda x: remove_unnecessary(x))

    if vectors is not None:
        lower = True

    prepare_csv(train)

    TEXT = data.Field(sequential=True,
                      lower=True,
                      include_lengths=True,
                      batch_first=True,
                      fix_length=fix_length)
    LABEL = data.Field(use_vocab=True, sequential=False, dtype=torch.float16)
    ID = data.Field(use_vocab=False, sequential=False, dtype=torch.float16)
    train_temp, val_temp = data.TabularDataset.splits(
        path='cache/',
        format='csv',
        skip_header=True,
        train='dataset_train.csv',
        validation='dataset_val.csv',
        fields=[('id', ID), ('target', LABEL), ('text', TEXT)])

    TEXT.build_vocab(train_temp,
                     val_temp,
                     max_size=20000,
                     min_freq=10,
                     vectors=GloVe(name='6B', dim=300))
    LABEL.build_vocab(train_temp)
    ID.build_vocab(
        train_temp,
        val_temp,
    )

    word_embeddings = TEXT.vocab.vectors
    vocab_size = len(TEXT.vocab)

    train_iter = get_iterator(train_temp,
                              batch_size=batch_size,
                              train=True,
                              shuffle=True,
                              repeat=False,
                              device=device)
    val_iter = get_iterator(val_temp,
                            batch_size=batch_size,
                            train=True,
                            shuffle=True,
                            repeat=False,
                            device=device)

    print('Train samples:%d' % (len(train_temp)),
          'Valid samples:%d' % (len(val_temp)),
          'Train minibatch nb:%d' % (len(train_iter)),
          'Valid minibatch nb:%d' % (len(val_iter)))
    return vocab_size, word_embeddings, train_iter, val_iter
Example No. 2
        word = re.sub(r'\n', ' ', word)
        newSample.append(word)
    return newSample


def postprocessing(batch, vocab):
    """
    Called after numericalising but before vectorising.
    """

    return batch
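
# Hedged illustration (not part of the original file): this hook would typically be
# attached to a torchtext Field and, per the docstring above, receives the already-
# numericalised batch (a list of lists of vocab indices) together with the Field's
# Vocab. A non-trivial version could, for example, truncate every sequence to a
# fixed length. `truncating_postprocessing` is a hypothetical name for illustration.
def truncating_postprocessing(batch, vocab, max_len=100):
    return [indices[:max_len] for indices in batch]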


inputSize = 300
stopWords = {}
wordVectors = GloVe(name='6B', dim=inputSize)

################################################################################
####### The following determines the processing of label data (ratings) ########
################################################################################


def convertNetOutput(ratingOutput, categoryOutput):

    ratingOutputNew = torch.argmax(ratingOutput, dim=1, keepdim=False)
    categoryOutputNew = torch.argmax(categoryOutput, dim=1, keepdim=False)

    return ratingOutputNew, categoryOutputNew


################################################################################
Example No. 3
for dataset in (train, val, test):
    for example in dataset:
        example.text = [word.lower() for word in example.text]

TEXT.build_vocab(train, val, test)
LABEL.build_vocab(train, val, test)

train_iter, val_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train, val, test), batch_size=10)


# Build the vocabulary with word embeddings
url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.simple.vec'
TEXT.vocab.load_vectors(vectors=Vectors('wiki.simple.vec', url=url))

glove = GloVe(name="6B", dim=300)
glove.vectors = glove.vectors[torch.arange(len(TEXT.vocab) + 10)]

idx = torch.arange(len(TEXT.vocab.vectors))
EMBEDDINGS = torch.cat((TEXT.vocab.vectors[idx], glove.vectors[idx]), dim=1)


# generating a mapping from bigrams to indices
all_bigrams = set()
for dataset in (train, val, test):
    for example in dataset:
        idx = [TEXT.vocab.stoi[word] for word in example.text]
        all_bigrams |= set((i,) for i in idx)
        for i in range(len(idx) - 1):
            all_bigrams.add((idx[i], idx[i + 1]))
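
# Hedged continuation (not shown in the truncated snippet above): the set of unigrams
# and bigrams would typically be frozen into an explicit index mapping, e.g. for a
# bigram embedding table. `bigram_to_index` is an illustrative name only.
bigram_to_index = {bigram: i for i, bigram in enumerate(sorted(all_bigrams))}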
Example No. 4
def train(param):
    if not isinstance(param, dict):
        args = vars(param)
    else:
        args = param

    # GPUs
    if args['gpu_index'] is not None:
        args['gpus'] = str(args['gpu_index'])

    # DATASET
    ##########################################################
    dp_valid_kwargs = inspect.signature(DataProvider.__init__).parameters
    dp_kwargs = dict(
        (name, args[name]) for name in dp_valid_kwargs if name in args)

    data_provider = DataProvider(**dp_kwargs)
    training_dataset, training_dataloader = \
        data_provider.get_training_dataset_and_loader()
    validation_dataset, validation_dataloader = \
        data_provider.get_validation_dataset_and_loader()
    ##########################################################

    # Set Seed
    if args['resume_from_checkpoint'] is None:
        if args['seed'] is not None:
            seed_everything(args['seed'])

    # MODEL
    ##########################################################

    # Check using pretraining
    pre_trained_word_embedding = args['pre_trained_word_embedding']

    if pre_trained_word_embedding is None:
        pass
    elif pre_trained_word_embedding == 'glove.6B.100d':
        assert args['embedding_dim'] == 100
    else:
        raise ModuleNotFoundError

    # # # get framework
    framework = get_class_by_name(args['model'])
    if args['spec_type'] != 'magnitude':
        args['input_channels'] = 4

    # Model instantiation
    args['vocab_size'] = len(training_dataset.vocab)
    model = framework(**args)

    if pre_trained_word_embedding is None:
        pass
    elif pre_trained_word_embedding == 'glove.6B.100d':
        with torch.no_grad():
            from torchtext.vocab import GloVe
            vocab = training_dataset.vocab
            glove = GloVe(name='6B', dim=100)
            for token in vocab:
                if token in glove.stoi.keys():
                    glove_i = glove.stoi[token]
                    embedding_i = training_dataset.word_to_idx[token]
                    model.spec2spec.embedding.weight[
                        embedding_i] = glove.vectors[glove_i]
                    pass

    else:
        raise ModuleNotFoundError

    if args['last_activation'] != 'identity' and args[
            'spec_est_mode'] != 'masking':
        warn(
            'Please check if you really want to use a mapping-based spectrogram estimation method '
            'with a final activation function. ')
    ##########################################################

    # -- checkpoint
    ckpt_path = Path(args['ckpt_root_path'])
    mkdir_if_not_exists(ckpt_path)
    ckpt_path = ckpt_path.joinpath(args['model'])
    mkdir_if_not_exists(ckpt_path)
    run_id = args['run_id']
    ckpt_path = ckpt_path.joinpath(run_id)
    mkdir_if_not_exists(ckpt_path)
    save_top_k = args['save_top_k']

    checkpoint_callback = ModelCheckpoint(
        filepath=ckpt_path,
        save_top_k=save_top_k,
        verbose=False,
        monitor='val_loss',
        save_last=False,
        save_weights_only=args['save_weights_only'])
    args['checkpoint_callback'] = checkpoint_callback

    # -- early stop
    patience = args['patience']
    early_stop_callback = EarlyStopping(monitor='val_loss',
                                        min_delta=0.0,
                                        patience=patience,
                                        verbose=False)
    args['early_stop_callback'] = early_stop_callback

    # -- logger setting
    log = args['log']
    if log == 'False':
        args['logger'] = False
    elif log == 'wandb':
        args['logger'] = WandbLogger(project=args['task'],
                                     tags=args['model'],
                                     offline=False,
                                     id=run_id)
        args['logger'].log_hyperparams(model.hparams)
        args['logger'].watch(model, log='all')
    elif log == 'tensorboard':
        raise NotImplementedError
    else:
        args['logger'] = True  # default
        default_save_path = 'etc/lightning_logs'
        mkdir_if_not_exists(default_save_path)

    valid_kwargs = inspect.signature(Trainer.__init__).parameters
    trainer_kwargs = dict(
        (name, args[name]) for name in valid_kwargs if name in args)

    # Trainer
    trainer = Trainer(**trainer_kwargs)

    for key in args.keys():
        print('{}:{}'.format(key, args[key]))

    if args['auto_lr_find']:
        lr_find = trainer.tuner.lr_find(model,
                                        training_dataloader,
                                        validation_dataloader,
                                        early_stop_threshold=None,
                                        min_lr=1e-5)

        print(f"Found lr: {lr_find.suggestion()}")
        return 0

    if args['resume_from_checkpoint'] is not None:
        pass  # placeholder: resuming is expected to be handled via the Trainer kwargs collected above

    trainer.fit(
        model,
        training_dataloader,
        validation_dataloader,
    )

    return None
Example No. 5
    "should've", "shouldn", "so", "some", "such", "t", "than", "that",
    "that'll", "the", "their", "theirs", "them", "themselves", "then", "there",
    "these", "they", "this", "those", "through", "to", "too", "under", "until",
    "up", "ve", "very", "was", "wasn", "wasn't", "we", "were", "weren",
    "weren't", "what", "when", "where", "which", "while", "who", "whom", "why",
    "will", "with", "won", "wouldn", "y", "you", "you'd", "you'll", "you're",
    "you've", "your", "yours", "yourself", "yourselves", "could", "he'd",
    "he'll", "he's", "here's", "how's", "i'd", "i'll", "i'm", "i've", "let's",
    "ought", "she'd", "she'll", "that's", "there's", "they'd", "they'll",
    "they're", "they've", "we'd", "we'll", "we're", "we've", "what's",
    "when's", "where's", "who's", "why's", "would"
]

DIMENSION = 100

wordVectors = GloVe(name='6B', dim=DIMENSION)

###########################################################################
##### The following determines the processing of label data (ratings) #####
###########################################################################


def convertLabel(datasetLabel):
    """
    Labels (product ratings) from the dataset are provided to you as
    floats, taking the values 1.0, 2.0, 3.0, 4.0, or 5.0.
    You may wish to train with these as they are, or you may wish
    to convert them to another representation in this function.
    Consider regression vs classification.
    """
    # class1 = torch.Tensor([1., 0., 0., 0., 0.])
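
    # Hedged sketch of one possible body (not from the original file): treat the
    # task as 5-way classification by shifting the float ratings 1.0..5.0 down to
    # integer class indices 0..4, suitable for nn.CrossEntropyLoss.
    return (datasetLabel - 1).long()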
Example No. 6
    def _all_train(self, num_epochs, model_label=None):
        # Create all train dataset
        concat_train_datasets = self.train_datasets[0]
        for train_idx in range(1, self.split_num):
            concat_train_datasets += self.train_datasets[train_idx]
        all_examples = [example for example in concat_train_datasets]
        # Create field
        word = data.Field(
            include_lengths=True,
            batch_first=True,
            lower=True,
            preprocessing=data.Pipeline(lambda w: re.sub(r'\d', '0', w)
                                        if self.config.is_digit else w))
        char_nesting = data.Field(
            tokenize=list,
            batch_first=True,
            lower=self.config.is_lower,
            init_token=START_TAG,
            eos_token=STOP_TAG,
            preprocessing=data.Pipeline(lambda s: re.sub(r'\d', '0', s)
                                        if self.config.is_digit else s))
        char = data.NestedField(char_nesting, include_lengths=True)
        label = data.Field(unk_token=UNLABELED_TAG, batch_first=True)
        fields = [(('word', 'char'), (word, char)), ('label', label)]
        # Load train, valid, test datasets
        all_train_dataset = Conll2003Dataset(examples=all_examples,
                                             fields=fields)
        _, valid_dataset, test_dataset = Conll2003Dataset.splits(
            fields=fields,
            path=self.config.dataset_path,
            separator=" ",
            train="eng.train",
            validation="eng.testa",
            test="eng.testb")

        # Build vocab
        word.build_vocab(all_train_dataset,
                         valid_dataset,
                         test_dataset,
                         vectors=GloVe(name='6B', dim='100'))
        char.build_vocab(all_train_dataset, valid_dataset, test_dataset)
        label.build_vocab(all_train_dataset, valid_dataset, test_dataset)
        # UNKNOWN tag is -1
        label.vocab.stoi = Counter(
            {k: v - 1
             for k, v in label.vocab.stoi.items()})
        # Don't count UNKNOWN tag
        num_tags = len(label.vocab) - 1
        assert label.vocab.stoi[UNLABELED_TAG] == UNLABELED_ID
        # Create model
        model = BiLSTM_CRF(num_tags,
                           label.vocab,
                           char.vocab,
                           word.vocab,
                           self.config.emb_dict,
                           dropout_rate=self.config.dropout_rate,
                           inference_type=self.config.inference_type)
        if self.device != "cpu":
            model = model.to(self.device)
        self.trainer_config["path"] = self.base_save_path
        # Trainer
        if model_label is not None:
            self.trainer_config[
                "path"] = self.base_save_path + "/all_train/{}".format(
                    model_label)
        trainer = Trainer(model,
                          self.trainer_config,
                          all_train_dataset,
                          self.valid_dataset,
                          test_dataset=self.test_dataset,
                          label_dict=self.label_dict)
        trainer.train(num_epochs)
Example No. 7
def main():
    # Use a GPU if available, as it should be faster.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("Using device: " + str(device))

    # Load the training dataset, and create a data loader to generate a batch.
    textField = data.Field(lower=True, include_lengths=True, batch_first=True)
    labelField = data.Field(sequential=False)

    from imdb_dataloader import IMDB
    train, dev = IMDB.splits(textField, labelField, train="train", validation="dev")

    textField.build_vocab(train, dev, vectors=GloVe(name="6B", dim=50))
    labelField.build_vocab(train, dev)

    trainLoader, testLoader = data.BucketIterator.splits((train, dev), shuffle=True, batch_size=64,
                                                         sort_key=lambda x: len(x.text), sort_within_batch=True)

    # Create an instance of the network in memory (potentially GPU memory). Switch between NetworkLstm and NetworkCnn during development.
    #net = NetworkLstm().to(device)
    net = NetworkCnn().to(device)
    criterion = lossFunc()
    optimiser = topti.Adam(net.parameters(), lr=0.001)  # Minimise the loss using the Adam algorithm.

    for epoch in range(10):
        running_loss = 0
        # if epoch >= 1:
        #     break
        for i, batch in enumerate(trainLoader):
            # Get a batch and potentially send it to GPU memory.
            inputs, length, labels = textField.vocab.vectors[batch.text[0]].to(device), batch.text[1].to(
                device), batch.label.type(torch.FloatTensor).to(device)
            # print(inputs)
            # print(inputs.size())
            # print(length)
            # print(length.size())
            # print(labels.size())
            labels -= 1

            # PyTorch calculates gradients by accumulating contributions to them (useful for
            # RNNs).  Hence we must manually set them to zero before calculating them.
            optimiser.zero_grad()

            # Forward pass through the network.
            output = net(inputs, length)
            # print(output)
            # print(labels)
            loss = criterion(output, labels)

            # Calculate gradients.
            loss.backward()

            # Minimise the loss according to the gradient.
            optimiser.step()

            running_loss += loss.item()

            if i % 32 == 31:
                print("Epoch: %2d, Batch: %4d, Loss: %.3f" % (epoch + 1, i + 1, running_loss / 32))
                running_loss = 0

    true_pos, true_neg, false_pos, false_neg = 0, 0, 0, 0

    # Evaluate network on the test dataset.  We aren't calculating gradients, so disable autograd to speed up
    # computations and reduce memory usage.
    with torch.no_grad():
        for batch in testLoader:
            # Get a batch and potentially send it to GPU memory.
            inputs, length, labels = textField.vocab.vectors[batch.text[0]].to(device), batch.text[1].to(
                device), batch.label.type(torch.FloatTensor).to(device)

            labels -= 1

            outputs = net(inputs, length)

            tp_batch, tn_batch, fp_batch, fn_batch = measures(outputs, labels)
            true_pos += tp_batch
            true_neg += tn_batch
            false_pos += fp_batch
            false_neg += fn_batch

    accuracy = 100 * (true_pos + true_neg) / len(dev)
    matthews = MCC(true_pos, true_neg, false_pos, false_neg)

    print("Classification accuracy: %.2f%%\n"
          "Matthews Correlation Coefficient: %.2f" % (accuracy, matthews))
Example No. 8
def classic_train(args):
    """
    Train the model in the ol' fashioned way, just like grandma used to
    Args
        args (argparse.ArgumentParser)
    """
    if args.cuda and torch.cuda.is_available():
        print("Using cuda")
        use_cuda = True
    elif args.cuda and not torch.cuda.is_available():
        print("You do not have CUDA, turning cuda off")
        use_cuda = False
    else:
        use_cuda = False

    #Load the data
    print("\nLoading Vocab")
    vocab = du.load_vocab(args.vocab)
    print("Vocab Loaded, Size {}".format(len(vocab.stoi.keys())))

    if args.use_pretrained:
        pretrained = GloVe(name='6B',
                           dim=args.emb_size,
                           unk_init=torch.Tensor.normal_)
        vocab.load_vectors(pretrained)
        print("Vectors Loaded")

    # Set add_eos to False if you want to decode arbitrarily long sequences conditioned on the latents (done in the paper).
    # Setting it to False is recommended when generating event sequences (length is not that important and we don't need the latents capturing it);
    # if generating raw text, it is probably better to have it on.
    # The train() function in the DAVAE class also takes add_eos; it should match this one.
    print("Loading Dataset")
    dataset = du.SentenceDataset(args.train_data,
                                 vocab,
                                 args.src_seq_length,
                                 add_eos=False)
    print("Finished Loading Dataset {} examples".format(len(dataset)))
    batches = BatchIter(dataset,
                        args.batch_size,
                        sort_key=lambda x: len(x.text),
                        train=True,
                        sort_within_batch=True,
                        device=-1)
    data_len = len(dataset)

    if args.load_model:
        print("Loading the Model")
        model = torch.load(args.load_model)
    else:
        print("Creating the Model")
        bidir_mod = 2 if args.bidir else 1
        latents = example_tree(
            args.num_latent_values,
            (bidir_mod * args.enc_hid_size, args.latent_dim),
            use_cuda=use_cuda,
            nohier_mode=args.nohier)  #assume bidirectional
        hidsize = (args.enc_hid_size, args.dec_hid_size)
        model = DAVAE(args.emb_size,
                      hidsize,
                      vocab,
                      latents,
                      layers=args.nlayers,
                      use_cuda=use_cuda,
                      pretrained=args.use_pretrained,
                      dropout=args.dropout)

    #create the optimizer
    if args.load_opt:
        print("Loading the optimizer state")
        optimizer = torch.load(args.load_opt)
    else:
        print("Creating the optimizer anew")
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    start_time = time.time()  #start of epoch 1
    curr_epoch = 1
    valid_loss = [0.0]
    # BatchIter loops indefinitely (reshuffling every epoch) until the epoch check below breaks out
    for iteration, bl in enumerate(batches):
        batch, batch_lens = bl.text
        target, target_lens = bl.target

        if use_cuda:
            batch = Variable(batch.cuda())
        else:
            batch = Variable(batch)

        model.zero_grad()
        latent_values, latent_root, diff, dec_outputs = model(
            batch, batch_lens)
        # train set to True so returns total loss
        loss, _ = monolithic_compute_loss(iteration,
                                          model,
                                          target,
                                          target_lens,
                                          latent_values,
                                          latent_root,
                                          diff,
                                          dec_outputs,
                                          use_cuda,
                                          args=args)

        # backward propagation
        loss.backward()
        # Gradient clipping
        torch.nn.utils.clip_grad_norm(model.parameters(), args.clip)
        # Optimize
        optimizer.step()

        # End of an epoch - run validation
        if ((args.batch_size * iteration) % data_len == 0
                or iteration % args.validate_after == 0) and iteration != 0:
            print("\nFinished Training Epoch/iteration {}/{}".format(
                curr_epoch, iteration))

            # do validation
            print("Loading Validation Dataset.")
            val_dataset = du.SentenceDataset(args.valid_data,
                                             vocab,
                                             args.src_seq_length,
                                             add_eos=False)
            print("Finished Loading Validation Dataset {} examples.".format(
                len(val_dataset)))
            val_batches = BatchIter(val_dataset,
                                    args.batch_size,
                                    sort_key=lambda x: len(x.text),
                                    train=False,
                                    sort_within_batch=True,
                                    device=-1)
            valid_loss = 0.0
            for v_iteration, bl in enumerate(val_batches):
                batch, batch_lens = bl.text
                target, target_lens = bl.target
                batch_lens = batch_lens.cpu()
                if use_cuda:
                    batch = Variable(batch.cuda(), volatile=True)
                else:
                    batch = Variable(batch, volatile=True)

                latent_values, latent_root, diff, dec_outputs = model(
                    batch, batch_lens)
                # train set to False so returns only CE loss
                loss, ce_loss = monolithic_compute_loss(iteration,
                                                        model,
                                                        target,
                                                        target_lens,
                                                        latent_values,
                                                        latent_root,
                                                        diff,
                                                        dec_outputs,
                                                        use_cuda,
                                                        args=args,
                                                        train=False)
                valid_loss = valid_loss + ce_loss.data.clone()

            valid_loss = valid_loss / (v_iteration + 1)
            print("**Validation loss {:.2f}.**\n".format(valid_loss[0]))

            # Check max epochs and break
            if (args.batch_size * iteration) % data_len == 0:
                curr_epoch += 1
            if curr_epoch > args.epochs:
                print("Max epoch {}-{} reached. Exiting.\n".format(
                    curr_epoch, args.epochs))
                break

        # Save the checkpoint
        if iteration % args.save_after == 0 and iteration != 0:
            print("Saving checkpoint for epoch {} at {}.\n".format(
                curr_epoch, args.save_model))
            # curr_epoch and validation stats appended to the model name
            torch.save(
                model, "{}_{}_{}_.epoch_{}.loss_{:.2f}.pt".format(
                    args.save_model, args.commit_c, args.commit2_c, curr_epoch,
                    float(valid_loss[0])))
            torch.save(
                optimizer,
                "{}.{}.epoch_{}.loss_{:.2f}.pt".format(args.save_model,
                                                       "optimizer", curr_epoch,
                                                       float(valid_loss[0])))
Example No. 9
def train(mode='train',
          train_path='train.conllx',
          model='dozat',
          dev_path='dev.conllx',
          test_path='test.conllx',
          ud=True,
          output_dir='output',
          emb_dim=0,
          char_emb_dim=0,
          char_model=None,
          tagger=None,
          batch_size=5000,
          n_iters=10,
          dropout_p=0.33,
          num_layers=1,
          print_every=1,
          eval_every=100,
          bi=True,
          lr=0.001,
          adam_beta1=0.9,
          adam_beta2=0.999,
          weight_decay=0.,
          plateau=False,
          resume=False,
          lr_decay=1.0,
          lr_decay_steps=5000,
          clip=5.,
          momentum=0,
          optimizer='adam',
          glove=True,
          seed=42,
          dim=0,
          window_size=0,
          num_filters=0,
          **kwargs):

    device = torch.device(type='cuda') if use_cuda else torch.device(
        type='cpu')

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    cfg = locals().copy()

    torch.manual_seed(seed)
    np.random.seed(seed)

    # load data component
    dataset_obj = ConllXDataset
    fields = get_data_fields()
    _form = fields['form'][-1]
    _pos = fields['pos'][-1]
    _chars = fields['chars'][-1]

    train_dataset = dataset_obj(train_path, fields)
    dev_dataset = dataset_obj(dev_path, fields)
    test_dataset = dataset_obj(test_path, fields)

    logger.info("Loaded %d train examples" % len(train_dataset))
    logger.info("Loaded %d dev examples" % len(dev_dataset))
    logger.info("Loaded %d test examples" % len(test_dataset))

    form_vocab_path = os.path.join(output_dir, 'vocab.form.pth.tar')
    pos_vocab_path = os.path.join(output_dir, 'vocab.pos.pth.tar')
    char_vocab_path = os.path.join(output_dir, 'vocab.char.pth.tar')

    if not resume:
        # build vocabularies
        # words have a min frequency of 2 to be included; others become <unk>
        # words without a GloVe vector are initialized with unk_init below (zeros here; a small random init is left commented out)

        # Note: this requires the latest torchtext development version from Github.
        # - git clone https://github.com/pytorch/text.git torchtext
        # - cd torchtext
        # - python setup.py build
        # - python setup.py install

        def unk_init(x):
            # return 0.01 * torch.randn(x)
            return torch.zeros(x)

        if glove:
            logger.info("Using Glove vectors")
            glove_vectors = GloVe(name='6B', dim=100)
            _form.build_vocab(train_dataset,
                              min_freq=2,
                              unk_init=unk_init,
                              vectors=glove_vectors)
            n_unks = 0
            unk_set = set()
            # for now, set UNK words manually
            # (torchtext does not seem to support it yet)
            for i, token in enumerate(_form.vocab.itos):
                if token not in glove_vectors.stoi:
                    n_unks += 1
                    unk_set.add(token)
                    _form.vocab.vectors[i] = unk_init(emb_dim)
            # print(n_unks, unk_set)

        else:
            _form.build_vocab(train_dataset, min_freq=2)

        _pos.build_vocab(train_dataset)
        _chars.build_vocab(train_dataset)

        # save vocabularies
        torch.save(_form.vocab, form_vocab_path)
        torch.save(_pos.vocab, pos_vocab_path)
        torch.save(_chars.vocab, char_vocab_path)

    else:
        # load vocabularies
        _form.vocab = torch.load(form_vocab_path)
        _pos.vocab = torch.load(pos_vocab_path)
        _chars.vocab = torch.load(char_vocab_path)

    print("First 10 vocabulary entries, words: ",
          " ".join(_form.vocab.itos[:10]))
    print("First 10 vocabulary entries, pos tags: ",
          " ".join(_pos.vocab.itos[:10]))
    print("First 10 vocabulary entries, chars: ",
          " ".join(_chars.vocab.itos[:10]))

    n_words = len(_form.vocab)
    n_tags = len(_pos.vocab)
    n_chars = len(_chars.vocab)

    def batch_size_fn(new, count, sofar):
        return len(new.form) + 1 + sofar

    # iterators
    train_iter = Iterator(train_dataset,
                          batch_size,
                          train=True,
                          sort_within_batch=True,
                          batch_size_fn=batch_size_fn,
                          device=device)
    dev_iter = Iterator(dev_dataset,
                        32,
                        train=False,
                        sort_within_batch=True,
                        device=device)
    test_iter = Iterator(test_dataset,
                         32,
                         train=False,
                         sort_within_batch=True,
                         device=device)

    # uncomment to see what a mini-batch looks like numerically
    # e.g. some things are being inserted dynamically (ROOT at the start of seq,
    #   padding items, maybe UNKs..)
    # batch = next(iter(train_iter))
    # print("form", batch.form)
    # print("pos", batch.pos)
    # print("deprel", batch.deprel)
    # print("head", batch.head)

    # if n_iters or eval_every is negative, interpret it as that many epochs
    iters_per_epoch = (len(train_dataset) // batch_size) + 1
    if eval_every < 0:
        logger.info("Setting eval_every to %d epoch(s) = %d iters" %
                    (-1 * eval_every, -1 * eval_every * iters_per_epoch))
        eval_every = iters_per_epoch * eval_every

    if n_iters < 0:
        logger.info("Setting n_iters to %d epoch(s) = %d iters" %
                    (-1 * n_iters, -1 * n_iters * iters_per_epoch))
        n_iters = -1 * n_iters * iters_per_epoch

    # load up the model
    model = Tagger(n_words=n_words,
                   n_tags=n_tags,
                   n_chars=n_chars,
                   form_vocab=_form.vocab,
                   char_vocab=_chars.vocab,
                   pos_vocab=_pos.vocab,
                   **cfg)

    # set word vectors
    if glove:
        _form.vocab.vectors = _form.vocab.vectors / torch.std(
            _form.vocab.vectors)
        # print(torch.std(_form.vocab.vectors))
        model.encoder.embedding.weight.data.copy_(_form.vocab.vectors)
        model.encoder.embedding.weight.requires_grad = True

    model = model.cuda() if use_cuda else model

    start_iter = 1
    best_iter = 0
    best_pos_acc = -1.
    test_pos_acc = -1.

    # optimizer and learning rate scheduler
    trainable_parameters = [p for p in model.parameters() if p.requires_grad]
    if optimizer == 'sgd':
        optimizer = torch.optim.SGD(trainable_parameters,
                                    lr=lr,
                                    momentum=momentum)
    else:
        optimizer = torch.optim.Adam(trainable_parameters,
                                     lr=lr,
                                     betas=(adam_beta1, adam_beta2))

    # learning rate schedulers
    if not plateau:
        scheduler = LambdaLR(optimizer, lr_lambda=lambda t: lr_decay**t)
    else:
        scheduler = ReduceLROnPlateau(optimizer,
                                      mode='max',
                                      factor=0.75,
                                      patience=5,
                                      min_lr=1e-4)

    # load model and vocabularies if resuming
    if resume:
        if os.path.isfile(resume):
            print("=> loading checkpoint '{}'".format(resume))
            checkpoint = torch.load(resume)
            start_iter = checkpoint['iter_i']
            best_pos_acc = checkpoint['best_pos_acc']
            test_pos_acc = checkpoint['test_pos_acc']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (iter {})".format(
                resume, checkpoint['iter_i']))
        else:
            print("=> no checkpoint found at '{}'".format(resume))

    print_parameters(model)

    # print some stuff just for fun
    logger.info("Most common words: %s" % _form.vocab.freqs.most_common(20))
    logger.info("Word vocab size: %s" % n_words)
    logger.info("Most common XPOS-tags: %s" % _pos.vocab.freqs.most_common())
    logger.info("POS vocab size: %s" % n_tags)
    # logger.info("Most common chars: %s" % _chars.nesting_field.vocab.freqs.most_common())
    logger.info("Chars vocab size: %s" % n_chars)

    print("First training example:")
    print_example(train_dataset[0])

    print("First dev example:")
    print_example(dev_dataset[0])

    print("First test example:")
    print_example(test_dataset[0])

    logger.info("Training starts..")
    upos_var, morph_var = None, None
    for iter_i in range(start_iter, n_iters + 1):

        if not plateau and iter_i % (912344 // batch_size) == 0:
            scheduler.step()
        model.train()

        batch = next(iter(train_iter))
        form_var, lengths = batch.form

        pos_var = batch.pos
        char_var, sentence_lengths, word_lengths = batch.chars
        lengths = lengths.view(-1).tolist()

        result = model(form_var=form_var,
                       char_var=char_var,
                       pos_var=pos_var,
                       lengths=lengths,
                       word_lengths=word_lengths)

        # rows sum to 1
        # print(torch.exp(output_graph).sum(-1))

        # print sizes
        # print(head_logits.data.cpu().size())
        targets = dict(pos=batch.pos)

        all_losses = model.get_loss(scores=result, targets=targets)

        loss = all_losses['loss']

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()
        optimizer.zero_grad()

        if iter_i % print_every == 0:

            # get scores for this batch
            if model.tagger == "linear":
                pos_predictions = result['output'].max(2)[1]
            else:
                pos_predictions = result['sequence']
            predictions = dict(pos=pos_predictions)
            targets = dict(pos=batch.pos)

            pos_acc = model.get_accuracy(predictions=predictions,
                                         targets=targets)

            if not plateau:
                lr = scheduler.get_lr()[0]
            else:
                lr = [group['lr'] for group in optimizer.param_groups][0]

            fmt = "Iter %08d loss %8.4f pos-acc %5.2f lr %.5f"

            logger.info(fmt % (iter_i, loss, pos_acc, lr))

        if iter_i % eval_every == 0:

            # parse dev set and save to file for official evaluation
            dev_out_path = 'dev.iter%08d.conll' % iter_i
            dev_out_path = os.path.join(output_dir, dev_out_path)
            predict_and_save(dataset=dev_dataset,
                             model=model,
                             dataset_path=dev_path,
                             out_path=dev_out_path)

            _dev_pos_acc = get_pos_acc(dev_path, dev_out_path)

            logger.info("Evaluation dev Iter %08d "
                        "pos-acc %5.2f" % (iter_i, _dev_pos_acc))

            # parse test set and save to file for official evaluation
            test_out_path = 'test.iter%08d.conll' % iter_i
            test_out_path = os.path.join(output_dir, test_out_path)
            predict_and_save(dataset=test_dataset,
                             model=model,
                             dataset_path=test_path,
                             out_path=test_out_path)
            _test_pos_acc = get_pos_acc(test_path, test_out_path)

            logger.info("Evaluation test Iter %08d "
                        "pos-acc %5.2f" % (iter_i, _test_pos_acc))

            if plateau:
                scheduler.step(_dev_pos_acc)

            if _dev_pos_acc > best_pos_acc:
                best_iter = iter_i
                best_pos_acc = _dev_pos_acc
                test_pos_acc = _test_pos_acc
                is_best = True
            else:
                is_best = False

            save_checkpoint(
                output_dir, {
                    'iter_i': iter_i,
                    'state_dict': model.state_dict(),
                    'best_iter': best_iter,
                    'best_pos_acc': best_pos_acc,
                    'test_pos_acc': test_pos_acc,
                    'optimizer': optimizer.state_dict(),
                }, is_best)

    logger.info("Done Training")
    logger.info(
        "Best model Iter %08d Dev POS-acc %12.4f Test POS-acc %12.4f " %
        (best_iter, best_pos_acc, test_pos_acc))
Example No. 10
args = get_args()
torch.cuda.set_device(args.gpu)

inputs = data.Field(lower=args.lower)
answers = data.Field(sequential=False)

train, dev, test = datasets.SNLI.splits(inputs, answers)

inputs.build_vocab(train, dev, test)
if args.word_vectors:
    if os.path.isfile(args.vector_cache):
        inputs.vocab.vectors = torch.load(args.vector_cache)
    else:
        # FIXME: quick fix; pass the actual GloVe arguments here instead of hard-coding them
        inputs.vocab.load_vectors(vectors=GloVe(name='6B', dim=300))
        makedirs(os.path.dirname(args.vector_cache))
        torch.save(inputs.vocab.vectors, args.vector_cache)
answers.build_vocab(train)

train_iter, dev_iter, test_iter = data.BucketIterator.splits(
    (train, dev, test), batch_size=args.batch_size, device=args.gpu)

config = args
config.n_embed = len(inputs.vocab)
config.d_out = len(answers.vocab)
config.n_cells = config.n_layers

# double the number of cells for bidirectional networks
if config.birnn:
    config.n_cells *= 2
Example No. 11
    def to_cmdline_kwarg(key, value):
        if len(key) == 1:
            key = "-{}".format(key)
        else:
            key = "--{}".format(re.sub(r"_", "-", key))
        value = str(value)
        return key, value

    kwargs_pairs = (to_cmdline_kwarg(key, value)
                    for key, value in kwargs.items())
    cmdline_args = list(sum(kwargs_pairs, ()))
    args = parser.parse_args(cmdline_args)


VECTORS = {
    "GloVe": GloVe(name='6B', dim=300),
    # "FastText": FastText()
}


def create_data_loaders(args):
    LOG.info("importing IMDB dataset")
    train_dataset, eval_dataset, = \
        data.make_imdb_dataset(args.total_num_labeled, VECTORS[args.vectors], args.exclude_unlabeled, args.seed, args.use_gpu)

    LOG.info("building torchtext iterators")
    if args.total_num_labeled == -1:
        train_iter = tdata.BucketIterator(
            dataset=train_dataset,
            batch_size=args.batch_size,
            sort_key=lambda x: len(x.text),
Example No. 12
    def load(self):
        if self.run_mode == 'word':
            WORD_MODE_FIELD = data.Field(sequential=True, lower=True, include_lengths=True, batch_first=True)
            self.field = WORD_MODE_FIELD

        if self.run_mode == 'bert':
            # Load the BERT tokenizer.
            print('Loading BERT tokenizer...')
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

            self.max_input_length = self.tokenizer.max_model_input_sizes['bert-base-uncased']

            def tokenize_and_cut(sentence):
                tokens = self.tokenizer.tokenize(sentence) 
                # tokens = tokens[:max_input_length-2]
                return tokens
            BERT_FIELD = data.Field(batch_first = True,
                        use_vocab = False,
                        tokenize = tokenize_and_cut,
                        preprocessing = self.tokenizer.convert_tokens_to_ids,
                        init_token = self.tokenizer.cls_token_id,
                        eos_token = self.tokenizer.sep_token_id,
                        pad_token = self.tokenizer.pad_token_id,
                        unk_token = self.tokenizer.unk_token_id)
            self.field = BERT_FIELD
        
        if self.run_mode == 'sentence':
            WORD_FIELD = data.Field(sequential=True, lower=True, tokenize='spacy')
            SENTENCE_FIELD = data.NestedField(WORD_FIELD, tokenize=sent_tokenize, include_lengths=True)
            self.field = SENTENCE_FIELD
        
        def split_start(x, y):
            idx = x[0].split(",")[0]
            if idx == 'None':
                return int(-1)
            else:
                return int(idx)
        process_start = data.Pipeline(split_start)
        START_INDEX = data.Field(sequential=True, postprocessing=process_start, use_vocab=False)

        def split_end(x, y):
            if len(x[0].split(",")) > 1:
                idx = x[0].split(",")[len(x[0].split(",")) - 1]
                if idx == 'None':
                    return int(-1)
                else:
                    return int(idx)
            else:
                idx = x[0].split(",")[0]
                if idx == 'None':
                    return int(-1)
                else:
                    return int(idx)
        process_end = data.Pipeline(split_end)
        END_INDEX = data.Field(sequential=True, postprocessing=process_end, use_vocab=False)

        def floor_label(x, y):
            return math.floor(float(x[0]))
        process_answerable = data.Pipeline(floor_label)
        ANSWERABLE = data.Field(sequential=True, postprocessing=process_answerable, use_vocab=False)


        if self.run_mode == 'word':
            col_dict = {'story_text': WORD_MODE_FIELD, 'question': WORD_MODE_FIELD, 'word_start_index_1': START_INDEX, 'word_end_index_1': END_INDEX, 'is_answer_absent': ANSWERABLE}
        elif self.run_mode == 'bert':
            col_dict = {'story_text': BERT_FIELD, 'question': BERT_FIELD, 'word_start_index_1': START_INDEX, 'word_end_index_1': END_INDEX, 'is_answer_absent': ANSWERABLE}
        elif self.run_mode == 'sentence':
            col_dict = {'story_text': SENTENCE_FIELD, 'question': WORD_FIELD, 'word_start_index_1': START_INDEX, 'word_end_index_1': END_INDEX, 'is_answer_absent': ANSWERABLE}

        def populateDatafields(somedf, col_dict):
            datafields = []
            for col in somedf.columns:
                if col in col_dict.keys():
                    datafields.append((col, col_dict[col]))
                else:
                    datafields.append((col, None))
            return datafields
        newsqa_df = pd.read_csv(self.train_data_path)
        datafields = populateDatafields(newsqa_df, col_dict)

        print("Building Dataset...")
        self.training_data=data.TabularDataset(path = self.train_data_path,\
                                        format = 'csv',\
                                        fields = datafields,\
                                        skip_header = True)

        self.validation_data=data.TabularDataset(path = self.val_data_path,\
                                        format = 'csv',\
                                        fields = datafields,\
                                        skip_header = True)
        if self.verbose:
            count = 0
            for t in self.training_data:
                print("*******************************")
                print("Story Text: ", len(t.story_text), t.story_text)
                print("Question: ", t.question)
                print("Start Index: ", t.word_start_index_1)
                print("End Index: ", t.word_end_index_1)
                print("Unanswerable: ", t.is_answer_absent)

                if count > 5:
                    break
                count += 1

        print("Building Vocab...")
        if self.run_mode == 'word':
            WORD_MODE_FIELD.build_vocab(self.training_data, self.validation_data, min_freq = 3, vectors=GloVe(name = '6B', dim = 300))
            if self.verbose:
                print("Length of Vocab: ", len(WORD_MODE_FIELD.vocab))
        elif self.run_mode == 'sentence':
            SENTENCE_FIELD.build_vocab(self.training_data, self.validation_data, min_freq = 3, vectors=GloVe(name = '6B', dim = 300))
            if self.verbose:
                print("Length of Vocab: ", len(SENTENCE_FIELD.vocab))

        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  
        print("Initializing the iterator...")
        # Define the train iterator
        self.train_iterator = data.BucketIterator(
            self.training_data, 
            batch_size = self.batch_size,
            sort_key = lambda x: len(x.story_text),
            sort_within_batch = True,
            repeat=False, 
            shuffle=True,
            device = device)

        self.val_iterator = data.BucketIterator(
            self.validation_data, 
            batch_size = 1,
            sort_key = lambda x: len(x.story_text),
            sort_within_batch = False,
            sort=False,
            repeat=False,
            shuffle=False,
            device = device)

        if self.verbose:
            for batch in self.train_iterator:
                print("Story: ", batch.story_text[0].shape, batch.story_text[1].shape)
                print("Start/End: ", batch.word_start_index_1, batch.word_end_index_1, batch.is_answer_absent)
                break
Example No. 13
    def init_GloVe(self, name, dim, cache=None):
        return GloVe(name, dim, cache=cache)
Example No. 14
    tgt = TargetField(stop_words=stopwords)
    max_len = 100

    def len_filter(example):
        return len(example.src) <= max_len and len(example.tgt) <= max_len
    train = torchtext.data.TabularDataset(
        path=opt.train_path, format='tsv',
        fields=[('src', src), ('tgt', tgt)],
        filter_pred=len_filter
    )
    dev = torchtext.data.TabularDataset(
        path=opt.dev_path, format='tsv',
        fields=[('src', src), ('tgt', tgt)],
        filter_pred=len_filter
    )
    src.build_vocab(train, max_size=50000, vectors=GloVe(name='6B', dim=300))
    tgt.build_vocab(train, max_size=50000, vectors=GloVe(name='6B', dim=300))
    input_vocab = src.vocab
    output_vocab = tgt.vocab
    print(len(train), len(dev))

    # NOTE: If the source field name and the target field name
    # are different from 'src' and 'tgt' respectively, they have
    # to be set explicitly before any training or inference
    # seq2seq.src_field_name = 'src'
    # seq2seq.tgt_field_name = 'tgt'

    # Prepare loss
    weight = torch.ones(len(tgt.vocab))
    pad = tgt.vocab.stoi[tgt.pad_token]
    loss = Perplexity(weight, pad)
Example No. 15
    def __init__(self, args):
        path = '.data/squad'
        dataset_path = path + '/torchtext/'
        train_examples_path = dataset_path + 'train_examples.pt'
        dev_examples_path = dataset_path + 'dev_examples.pt'

        print("preprocessing data files...")
        if not os.path.exists(f'{path}/{args.train_file}l'):
            self.preprocess_file(f'{path}/{args.train_file}')
        if not os.path.exists(f'{path}/{args.dev_file}l'):
            self.preprocess_file(f'{path}/{args.dev_file}')

        self.RAW = data.RawField()
        # explicit declaration for torchtext compatibility
        self.RAW.is_target = False
        self.CHAR_NESTING = data.Field(batch_first=True,
                                       tokenize=list,
                                       lower=True)
        self.CHAR = data.NestedField(self.CHAR_NESTING, tokenize=word_tokenize)
        self.WORD = data.Field(batch_first=True,
                               tokenize=word_tokenize,
                               lower=True,
                               include_lengths=True)
        self.LABEL = data.Field(sequential=False,
                                unk_token=None,
                                use_vocab=False)

        dict_fields = {
            'id': ('id', self.RAW),
            's_idx': ('s_idx', self.LABEL),
            'e_idx': ('e_idx', self.LABEL),
            'context': [('c_word', self.WORD), ('c_char', self.CHAR)],
            'question': [('q_word', self.WORD), ('q_char', self.CHAR)]
        }

        list_fields = [('id', self.RAW), ('s_idx', self.LABEL),
                       ('e_idx', self.LABEL), ('c_word', self.WORD),
                       ('c_char', self.CHAR), ('q_word', self.WORD),
                       ('q_char', self.CHAR)]

        if os.path.exists(dataset_path):
            print("loading splits...")
            train_examples = torch.load(train_examples_path)
            dev_examples = torch.load(dev_examples_path)

            self.train = data.Dataset(examples=train_examples,
                                      fields=list_fields)
            self.dev = data.Dataset(examples=dev_examples, fields=list_fields)
        else:
            print("building splits...")
            self.train, self.dev = data.TabularDataset.splits(
                path=path,
                train=f'{args.train_file}l',
                validation=f'{args.dev_file}l',
                format='json',
                fields=dict_fields)

            os.makedirs(dataset_path)
            torch.save(self.train.examples, train_examples_path)
            torch.save(self.dev.examples, dev_examples_path)

        # drop training examples whose context is too long, for efficiency
        if args.context_threshold > 0:
            self.train.examples = [
                e for e in self.train.examples
                if len(e.c_word) <= args.context_threshold
            ]

        print("building vocab...")
        self.CHAR.build_vocab(self.train, self.dev)
        self.WORD.build_vocab(self.train,
                              self.dev,
                              vectors=GloVe(name='6B', dim=args.word_dim))

        print("building iterators...")
        device = torch.device(
            f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu")
        self.train_iter, self.dev_iter = \
            data.BucketIterator.splits((self.train, self.dev),
                                       batch_sizes=[args.train_batch_size, args.dev_batch_size],
                                       device=device,
                                       sort_key=lambda x: len(x.c_word))
Example No. 16
def load_dataset(test_sen=None):
    print("in load_dataset")
    """
    tokenizer : Breaks sentences into a list of words. If sequential=False, no tokenization is applied
    Field : A class that stores information about the way of preprocessing
    fix_length : An important property of TorchText is that we can let the input to be variable length, and TorchText will
                 dynamically pad each sequence to the longest sequence in that "batch". But here we are using fi_length which
                 will pad each sequence to have a fix length of 200.
                 
    build_vocab : It will first make a vocabulary or dictionary mapping all the unique words present in the train_data to an
                  idx and then after it will use GloVe word embedding to map the index to the corresponding word embedding.
                  
    vocab.vectors : This returns a torch tensor of shape (vocab_size x embedding_dim) containing the pre-trained word embeddings.
    BucketIterator : Defines an iterator that batches examples of similar lengths together to minimize the amount of padding needed.
    
    """

    #
    def tokenizer(text):  # create a tokenizer function
        return [tok.text for tok in spacy_en.tokenizer(text)]

    TEXT = data.Field(sequential=True,
                      tokenize=tokenizer,
                      lower=True,
                      fix_length=200)
    LABEL = data.Field(tensor_type=torch.FloatTensor, sequential=False)

    # tokenize = lambda x: x.split()
    # TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True, include_lengths=True, batch_first=True, fix_length=200)
    # LABEL = data.LabelField(tensor_type=torch.FloatTensor)
    # train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
    # print('data loaded')
    train_data, valid_data, test_data = data.TabularDataset.splits(
        path='../Github/Data/author_identification/',
        train='train.csv',
        validation='val.csv',
        test='test.csv',
        format='csv',
        fields=[('id', None), ('text', TEXT), ('author', LABEL)])
    #
    # TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))
    # LABEL.build_vocab(train_data)
    TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))
    LABEL.build_vocab(train_data)

    word_embeddings = TEXT.vocab.vectors
    print("Length of Text Vocabulary: " + str(len(TEXT.vocab)))
    print("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size())
    print("Label Length: " + str(len(LABEL.vocab)))

    from torchtext.data import Iterator, BucketIterator

    # train_iter, valid_iter = BucketIterator.splits(
    #     (train_data, valid_data),  # we pass in the datasets we want the iterator to draw data from
    #     batch_size=64,
    #     device=-1,  # if you want to use the GPU, specify the GPU number here
    #     sort_key=lambda x: len(x.text),
    #     # the BucketIterator needs to be told what function it should use to group the data.
    #     sort_within_batch=False,
    #     repeat=False  # we pass repeat=False because we want to wrap this Iterator layer.
    # )
    # test_iter = Iterator(test_data, batch_size=64, device=-1, sort=False, sort_within_batch=False, repeat=False)
    # train_data, valid_data = train_data.split() # Further splitting of training_data to create new training_data & validation_data

    train_iter, valid_iter, test_iter = data.BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=32,
        sort_key=lambda x: len(x.text),
        repeat=False,
        shuffle=True)
    '''Alternatively we can also use the default configurations'''
    # train_iter, test_iter = datasets.IMDB.iters(batch_size=32)

    vocab_size = len(TEXT.vocab)

    return TEXT, LABEL, vocab_size, word_embeddings, train_iter, valid_iter, test_iter
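
# Hedged usage sketch (not part of the original snippet): as the docstring notes,
# word_embeddings is a (vocab_size x 300) tensor of pre-trained GloVe vectors,
# which would typically seed a model's embedding layer like this:
import torch.nn as nn

TEXT, LABEL, vocab_size, word_embeddings, train_iter, valid_iter, test_iter = load_dataset()
embedding = nn.Embedding(vocab_size, 300)
embedding.weight.data.copy_(word_embeddings)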
Example No. 17
def train(data_path, train_path, val_path, test_path, hidden_size, num_classes,
          num_layers, num_dir, batch_size, emb_dim, dropout, net_type, embfix):

    print('Training...')

    # define fields
    TEXT = data.Field(lower=True, init_token="<start>", eos_token="<end>")
    LABEL = data.Field(sequential=False, unk_token=None)

    # build dataset splits
    train, val, test = data.TabularDataset.splits(path=data_path,
                                                  train=train_path,
                                                  validation=val_path,
                                                  test=test_path,
                                                  format='tsv',
                                                  fields=[('text', TEXT),
                                                          ('label', LABEL)])

    # build vocabs
    TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=emb_dim), min_freq=2)
    prevecs = TEXT.vocab.vectors
    #TEXT.build_vocab(train, min_freq=3)
    LABEL.build_vocab(train)

    # build iterators
    train_iter = data.BucketIterator(train,
                                     batch_size=batch_size,
                                     sort_key=lambda x: len(x.text),
                                     train=True)
    val_iter = data.Iterator(val,
                             batch_size=batch_size,
                             repeat=False,
                             train=False,
                             sort=False,
                             shuffle=False)
    test_iter = data.Iterator(test,
                              batch_size=batch_size,
                              repeat=False,
                              train=False,
                              sort=False,
                              shuffle=False)

    # print info
    print(max(LABEL.vocab.freqs.values()))
    print('num_classes: ', len(LABEL.vocab))
    print('input_size: ', len(TEXT.vocab))

    print('majority class acc:', max(LABEL.vocab.freqs.values()) / len(train))
    print('random guess acc:',
          (max(LABEL.vocab.freqs.values()) / len(train))**2 +
          (min(LABEL.vocab.freqs.values()) / len(train))**2)

    num_classes = len(LABEL.vocab)
    input_size = len(TEXT.vocab)

    model = RNN(input_size=input_size,
                hidden_size=hidden_size,
                num_classes=num_classes,
                prevecs=prevecs,
                num_layers=num_layers,
                num_dir=num_dir,
                batch_size=batch_size,
                emb_dim=emb_dim,
                embfix=embfix,
                dropout=dropout,
                net_type=net_type)

    epochs = 100
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adamax(model.parameters())
    #optimizer = torch.optim.SGD(model.parameters(),lr=0.1, momentum=0.5)
    if torch.cuda.is_available():
        model = model.cuda()

    # train
    model.train()
    best_val_acc = 0
    for e in range(epochs):
        print('Epoch:', e)
        tot_loss = 0
        corrects = 0
        train_iter.repeat = False
        for batch_count, batch in enumerate(train_iter):
            #print('Batch:', batch_count)
            #print(batch.text)
            #print(batch.label)
            model.zero_grad()

            inp = batch.text.t()
            preds = model(inp)
            target = batch.label

            #print(preds, batch.label)
            loss = criterion(preds, batch.label)
            loss.backward()
            optimizer.step()

            _, preds = torch.max(preds, 1)
            corrects += int(preds.data.eq(target.data).sum())
            tot_loss += loss.item()

        print('acc (train):', 100 * corrects / len(train_iter.dataset))
        print('loss (train):', tot_loss)
        val_acc, _, val_loss = evaluate(val_iter, model, TEXT, LABEL)
        print('acc (val):', val_acc)
        print('loss (val):', val_loss)
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            test_acc, test_preds, test_loss = evaluate(test_iter, model, TEXT,
                                                       LABEL)
            #print('Test acc:', test_acc)
            with open('./preds/preds_' + str(e) + '.txt', 'w') as f:
                for x in test_preds:
                    f.write(str(int(x)) + '\n')
            torch.save(model.state_dict(),
                       './models/e' + str(e) + '_' + str(val_acc) + '.pt')
Example No. 18
0
# We'll use NestedField to tokenize each word into list of chars
CHAR_NESTING = data.Field(tokenize=list, init_token="<bos>", eos_token="<eos>")
CHAR = data.NestedField(CHAR_NESTING, init_token="<bos>", eos_token="<eos>")

fields = [(('word', 'char'), (WORD, CHAR)), (None, None), ('ptbtag', PTB_TAG)]
train, val, test = datasets.UDPOS.splits(fields=fields)

print(train.fields)
print(len(train))
print(vars(train[0]))

WORD.build_vocab(train.word,
                 val.word,
                 test.word,
                 vectors=[GloVe(name='6B', dim='300')])
CHAR.build_vocab(train.char, val.char, test.char)
PTB_TAG.build_vocab(train.ptbtag)

print(CHAR.vocab.freqs)
train_iter, val_iter = data.BucketIterator.splits((train, val), batch_size=3)

batch = next(iter(train_iter))

print("words", batch.word)
print("chars", batch.char)
print("ptbtags", batch.ptbtag)

# Using the CoNLL 2000 Chunking dataset:
INPUTS = data.Field(init_token="<bos>", eos_token="<eos>")
CHUNK_TAGS = data.Field(init_token="<bos>", eos_token="<eos>")
Example No. 19
0
import torch
import torch.nn as tnn
import torch.optim as topti

from torchtext import data
from torchtext.vocab import GloVe

textField = data.Field(lower=True, include_lengths=True, batch_first=True)
labelField = data.Field(sequential=False)
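# The stock torchtext IMDB dataset ships only train and test splits, so IMDB is
# assumed here to be a project-specific dataset class (or customized loader)
# imported elsewhere that also provides a "dev" split.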
train, dev = IMDB.splits(textField,
                         labelField,
                         train="train",
                         validation="dev")
print(textField)

textField.build_vocab(train, dev, vectors=GloVe(name="6B", dim=50))
labelField.build_vocab(train, dev)

trainLoader, testLoader = data.BucketIterator.splits(
    (train, dev),
    shuffle=True,
    batch_size=64,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# j = 0
# for i, batch in enumerate(trainLoader):
#     # Get a batch and potentially send it to GPU memory.
#     inputs, length, labels = textField.vocab.vectors[batch.text[0]].to(device), batch.text[1].to(
#         device), batch.label.type(torch.FloatTensor).to(device)
Example No. 20
0
def return_data(args):

    name = args.dataset
    root = args.root
    batch_size = args.batch_size
    data_loader = dict()
    device = 0 if args.cuda else -1

    if name in ['mnist', 'MNIST']:

        transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.5, ), (0.5, )),
        ])

        train_kwargs = {
            'root': root,
            'mode': 'train',
            'transform': transform,
            'download': True,
            'load_pred': args.load_pred,
            'model_name': args.model_name
        }
        valid_kwargs = {
            'root': root,
            'mode': 'valid',
            'transform': transform,
            'download': True,
            'load_pred': args.load_pred,
            'model_name': args.model_name
        }
        test_kwargs = {
            'root': root,
            'mode': 'test',
            'transform': transform,
            'download': False,
            'load_pred': args.load_pred,
            'model_name': args.model_name
        }
        dset = MNIST_modified

        train_data = dset(**train_kwargs)
        valid_data = dset(**valid_kwargs)
        test_data = dset(**test_kwargs)

        # data loader
        num_workers = 0
        train_loader = DataLoader(train_data,
                                  batch_size=batch_size,
                                  shuffle=True,
                                  num_workers=num_workers,
                                  drop_last=True,
                                  pin_memory=True)

        valid_loader = DataLoader(valid_data,
                                  batch_size=batch_size,
                                  shuffle=False,
                                  num_workers=num_workers,
                                  drop_last=False,
                                  pin_memory=True)

        test_loader = DataLoader(test_data,
                                 batch_size=batch_size,
                                 shuffle=False,
                                 num_workers=num_workers,
                                 drop_last=False,
                                 pin_memory=True)

        data_loader[
            'x_type'] = torch.cuda.FloatTensor if args.cuda else torch.FloatTensor
        data_loader[
            'y_type'] = torch.cuda.LongTensor if args.cuda else torch.LongTensor

    elif name in ['imdb', 'IMDB']:

        embedding_dim = 100
        max_total_num_words = 20000
        text = data.Field(tokenize=tokenizer_twolevel, batch_first=True)
        label = data.Field(lower=True)
        label_pred = data.Field(use_vocab=False, fix_length=1)
        fname = data.Field(use_vocab=False, fix_length=1)

        train, valid, test = IMDB_modified.splits(text,
                                                  label,
                                                  label_pred,
                                                  fname,
                                                  root=root,
                                                  model_name=args.model_name,
                                                  load_pred=args.load_pred)
        print("build vocab...")
        text.build_vocab(train,
                         vectors=GloVe(name='6B',
                                       dim=embedding_dim,
                                       cache=root),
                         max_size=max_total_num_words)
        label.build_vocab(train)
        #label_pred.build_vocab(train)

        print("Create Iterator objects for multiple splits of a dataset...")
        train_loader, valid_loader, test_loader = data.Iterator.splits(
            (train, valid, test),
            batch_size=batch_size,
            device=device,
            repeat=False)

        data_loader['word_idx'] = text.vocab.itos
        data_loader[
            'x_type'] = torch.cuda.LongTensor if args.cuda else torch.LongTensor
        data_loader[
            'y_type'] = torch.cuda.LongTensor if args.cuda else torch.LongTensor
        data_loader['max_total_num_words'] = max_total_num_words
        data_loader['embedding_dim'] = embedding_dim
        data_loader['max_num_words'] = 50
        #_, (text, _, _, _) = next(iter(train_loader))
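        # max_num_sents is inferred from the flattened text length, assuming each
        # example is laid out as max_num_sents blocks of max_num_words tokens.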
        data_loader['max_num_sents'] = int(
            next(iter(train_loader)).text.size(-1) /
            data_loader['max_num_words'])

    else:
        raise UnknownDatasetError()

    data_loader['train'] = train_loader
    data_loader['valid'] = valid_loader
    data_loader['test'] = test_loader

    return data_loader
Example No. 21
0
def main():
    # Use a GPU if available, as it should be faster.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("Using device: " + str(device))

    # Load the training dataset, and create a data loader to generate a batch.
    textField = PreProcessing.text_field
    labelField = data.Field(sequential=False)
    #print(1234)
    train, dev = IMDB.splits(textField, labelField, train="train", validation="dev")
    #print(1234)
    textField.build_vocab(train, dev, vectors=GloVe(name="6B", dim=50))
    labelField.build_vocab(train, dev)

    trainLoader, testLoader = data.BucketIterator.splits((train, dev), shuffle=True, batch_size=64,
                                                         sort_key=lambda x: len(x.text), sort_within_batch=True)

    net = Network().to(device)
    criterion = lossFunc()
    optimiser = topti.Adam(net.parameters(), lr=0.001)  # Minimise the loss using the Adam algorithm.

    for epoch in range(10):
        running_loss = 0

        for i, batch in enumerate(trainLoader):
            # Get a batch and potentially send it to GPU memory.
            inputs, length, labels = textField.vocab.vectors[batch.text[0]].to(device), batch.text[1].to(
                device), batch.label.type(torch.FloatTensor).to(device)

            labels -= 1

            # PyTorch calculates gradients by accumulating contributions to them (useful for
            # RNNs).  Hence we must manually set them to zero before calculating them.
            optimiser.zero_grad()

            # Forward pass through the network.
            output = net(inputs, length)

            loss = criterion(output, labels)

            # Calculate gradients.
            loss.backward()

            # Minimise the loss according to the gradient.
            optimiser.step()

            running_loss += loss.item()

            if i % 32 == 31:
                print("Epoch: %2d, Batch: %4d, Loss: %.3f" % (epoch + 1, i + 1, running_loss / 32))
                running_loss = 0

    num_correct = 0

    # Save the trained model weights.
    torch.save(net.state_dict(), "./model.pth")
    print("Saved model")
    
    # Evaluate network on the test dataset.  We aren't calculating gradients, so disable autograd to speed up
    # computations and reduce memory usage.
    with torch.no_grad():
        for batch in testLoader:
            # Get a batch and potentially send it to GPU memory.
            inputs, length, labels = textField.vocab.vectors[batch.text[0]].to(device), batch.text[1].to(
                device), batch.label.type(torch.FloatTensor).to(device)

            labels -= 1

            # Get predictions
            outputs = torch.sigmoid(net(inputs, length))
            predicted = torch.round(outputs)

            num_correct += torch.sum(labels == predicted).item()

    accuracy = 100 * num_correct / len(dev)

    print(f"Classification accuracy: {accuracy}")
Example No. 22
0
def conll2003_dataset(tag_type,
                      batch_size,
                      root='./conll2003',
                      train_file='eng.train.txt',
                      validation_file='eng.testa.txt',
                      test_file='eng.testb.txt',
                      convert_digits=True):
    """
    conll2003: Conll 2003 (Parser only. You must place the files)
    Extract Conll2003 dataset using torchtext. Applies GloVe 6B.200d and Char N-gram
    pretrained vectors. Also sets up per word character Field
    Parameters:
        tag_type: Type of tag to pick as task [pos, chunk, ner]
        batch_size: Batch size to return from iterator
        root: Dataset root directory
        train_file: Train filename
        validation_file: Validation filename
        test_file: Test filename
        convert_digits: If True will convert numbers to single 0's
    Returns:
        A dict containing:
            task: 'conll2003.' + tag_type
            iters: (train iter, validation iter, test iter)
            vocabs: (Inputs word vocabulary, Inputs character vocabulary, 
                    Tag vocabulary )
    """

    # Setup fields with batch dimension first
    inputs_word = data.Field(init_token="<bos>",
                             eos_token="<eos>",
                             batch_first=True,
                             lower=True)

    inputs_char_nesting = data.Field(tokenize=list,
                                     init_token="<bos>",
                                     eos_token="<eos>",
                                     batch_first=True)

    inputs_char = data.NestedField(inputs_char_nesting,
                                   init_token="<bos>",
                                   eos_token="<eos>")

    labels = data.Field(init_token="<bos>",
                        eos_token="<eos>",
                        batch_first=True)

    fields = ([(('inputs_word', 'inputs_char'), (inputs_word, inputs_char))] +
              [('labels', labels) if label == tag_type else (None, None)
               for label in ['pos', 'chunk', 'ner']])

    # Load the data
    train, val, test = SequenceTaggingDataset.splits(
        path=root,
        train=train_file,
        validation=validation_file,
        test=test_file,
        separator=' ',
        fields=tuple(fields))

    # Build vocab
    inputs_char.build_vocab(train.inputs_char, val.inputs_char,
                            test.inputs_char)
    inputs_word.build_vocab(train.inputs_word,
                            val.inputs_word,
                            test.inputs_word,
                            max_size=50000,
                            vectors=[GloVe(name='6B', dim='200'),
                                     CharNGram()])

    labels.build_vocab(train.labels)

    # Get iterators
    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train, val, test),
        batch_size=batch_size,
        device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"))
    train_iter.repeat = False

    return {
        'task': 'conll2003.%s' % tag_type,
        'iters': (train_iter, val_iter, test_iter),
        'vocabs': (inputs_word.vocab, inputs_char.vocab, labels.vocab)
    }
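A brief usage sketch of the dict returned above; the file locations follow the function's defaults and are assumptions, not a verified setup.

# Usage sketch: attribute names come from the fields list above; shapes assume
# batch_first=True for words/labels and the NestedField layout for characters.
conll = conll2003_dataset('ner', batch_size=32, root='./conll2003')
train_iter, val_iter, test_iter = conll['iters']
word_vocab, char_vocab, tag_vocab = conll['vocabs']

batch = next(iter(train_iter))
words = batch.inputs_word   # (batch, seq_len) word indices
chars = batch.inputs_char   # (batch, seq_len, word_len) character indices
tags = batch.labels         # (batch, seq_len) tag indices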
Example No. 23
0
def train(**args):
    params = EasyDict(args)
    params.gpu = int(params.gpu)

    config = ConfigParser()
    config.read('config.ini')
    if params.datasets == ['all']:
        params.datasets = ['imdb', 'amazon', 'yelp', 'rottentomatoes', 'hotel']

    is_tokenizer_length_dataset_specific = Models(params.model) == Models.distilbert and (
            params.tokenizer_length is None or params.tokenizer_length)
    is_number_prototypes_dataset_specific = Models(params.model) == Models.protoconv and (
            params.pc_number_of_prototypes is None or params.pc_number_of_prototypes == -1)
    is_sep_loss_dataset_specific = Models(params.model) == Models.protoconv and (
            params.pc_sep_loss_weight is None or params.pc_sep_loss_weight == -1)
    if_ce_loss_dataset_specific = Models(params.model) == Models.protoconv and (
            params.pc_ce_loss_weight is None or params.pc_ce_loss_weight == -1)

    for dataset in params.datasets:
        params.data_set = dataset
        seed_everything(params.seed)

        if is_tokenizer_length_dataset_specific:
            params.tokenizer_length = dataset_tokens_length[params.data_set]

        if is_number_prototypes_dataset_specific:
            params.pc_number_of_prototypes = dataset_to_number_of_prototypes[params.data_set]

        if is_sep_loss_dataset_specific:
            params.pc_sep_loss_weight = dataset_to_separation_loss[params.data_set]

        if if_ce_loss_dataset_specific:
            weight = 1 - (params.pc_cls_loss_weight + params.pc_sep_loss_weight + params.pc_l1_loss_weight)
            assert weight > 0, f'Weight {weight} of cross entropy loss cannot be less or equal to 0'
            params.pc_ce_loss_weight = weight

        logger = DummyLogger()
        if params.logger:
            comet_config = EasyDict(config['cometml'])
            project_name = params.project_name if params.project_name else comet_config.projectname
            logger = CometLogger(api_key=comet_config.apikey, project_name=project_name,
                                 workspace=comet_config.workspace)

        # logger.experiment.log_code(folder='src')
        logger.log_hyperparams(params)
        base_callbacks = [LearningRateMonitor(logging_interval='epoch')]

        df_dataset = pd.read_csv(f'data/{params.data_set}/tokenized_data.csv')
        n_splits = get_n_splits(dataset=df_dataset, x_label='text', y_label='label', folds=params.fold)
        log_splits(n_splits, logger)

        embeddings = GloVe('42B', cache=params.cache) if Models(params.model) != Models.distilbert else None

        best_models_scores, number_of_prototypes = [], []
        for fold_id, (train_index, val_index, test_index) in enumerate(n_splits):
            i = str(fold_id)

            model_checkpoint = ModelCheckpoint(
                filepath='checkpoints/fold_' + i + '_{epoch:02d}-{val_loss_' + i + ':.4f}-{val_acc_' + i + ':.4f}',
                save_weights_only=True, save_top_k=1, monitor='val_acc_' + i,
                period=params.pc_project_prototypes_every_n
            )
            early_stop = EarlyStopping(monitor=f'val_loss_{i}', patience=10, verbose=True, mode='min', min_delta=0.005)
            callbacks = deepcopy(base_callbacks) + [model_checkpoint, early_stop]

            lit_module = model_to_litmodule[params.model]
            train_df, valid_df = df_dataset.iloc[train_index + val_index], df_dataset.iloc[test_index]
            model, train_loader, val_loader, *utils = lit_module.from_params_and_dataset(train_df, valid_df, params,
                                                                                         fold_id, embeddings)
            trainer = Trainer(auto_lr_find=params.find_lr, logger=logger, max_epochs=params.epoch, callbacks=callbacks,
                              gpus=params.gpu, deterministic=True, fast_dev_run=params.fast_dev_run,
                              num_sanity_val_steps=0)

            trainer.tune(model, train_dataloader=train_loader, val_dataloaders=val_loader)
            trainer.fit(model, train_dataloader=train_loader, val_dataloaders=val_loader)

            for absolute_path in model_checkpoint.best_k_models.keys():
                logger.experiment.log_model(Path(absolute_path).name, absolute_path)

            if model_checkpoint.best_model_score:
                best_models_scores.append(model_checkpoint.best_model_score.tolist())
                logger.log_metrics({'best_model_score_' + i: model_checkpoint.best_model_score.tolist()}, step=0)

            if Models(params.model) == Models.protoconv and model_checkpoint.best_model_path:
                best_model = lit_module.load_from_checkpoint(model_checkpoint.best_model_path)
                saved_number_of_prototypes = sum(best_model.enabled_prototypes_mask.tolist())
                number_of_prototypes.append(saved_number_of_prototypes)
                logger.log_hyperparams({
                    f'saved_prototypes_{fold_id}': saved_number_of_prototypes,
                    f'best_model_path_{fold_id}': str(Path(model_checkpoint.best_model_path).name)
                })

                if params.pc_visualize:
                    data_visualizer = DataVisualizer(best_model)
                    logger.experiment.log_html(f'<h1>Split {fold_id}</h1><br> <h3>Prototypes:</h3><br>'
                                               f'{data_visualizer.visualize_prototypes()}<br>')
                    logger.experiment.log_figure(f'Prototypes similarity_{fold_id}',
                                                 data_visualizer.visualize_similarity().figure)
                    logger.experiment.log_html(f'<h3>Random prediction explanations:</h3><br>'
                                               f'{data_visualizer.visualize_random_predictions(val_loader, n=15)}')

        if len(best_models_scores) >= 1:
            avg_best, std_best = float(np.mean(np.array(best_models_scores))), float(
                np.std(np.array(best_models_scores)))
            table_entry = rf'{avg_best:.3f} ($\pm${std_best:.3f})'

            logger.log_hyperparams({
                'avg_best_scores': avg_best,
                'std_best_scores': std_best,
                'table_entry': table_entry
            })

        if len(number_of_prototypes) >= 1:
            logger.log_hyperparams({'avg_saved_prototypes': float(np.mean(np.array(number_of_prototypes)))})

        logger.experiment.end()
Example No. 24
0
def load_dataset(test_sen=None):
    """
    tokenizer : Breaks sentences into a list of words. If sequential=False, no tokenization is applied
    Field : A class that stores information about the way of preprocessing
    fix_length : An important property of TorchText is that we can let the input to be variable length, and TorchText will
                 dynamically pad each sequence to the longest sequence in that "batch". But here we are using fi_length which
                 will pad each sequence to have a fix length of 200.

    build_vocab : It will first make a vocabulary or dictionary mapping all the unique words present in the train_data to an
                  idx and then after it will use GloVe word embedding to map the index to the corresponding word embedding.

    vocab.vectors : This returns a torch tensor of shape (vocab_size x embedding_dim) containing the pre-trained word embeddings.
    BucketIterator : Defines an iterator that batches examples of similar lengths together to minimize the amount of padding needed.

    """

    #    TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True, include_lengths=True, batch_first=True, fix_length=200)
    LABEL = data.LabelField(tensor_type=torch.cuda.FloatTensor)
    INDEX = data.Field(tensor_type=torch.cuda.LongTensor)

    TEXT = data.Field(sequential=True,
                      fix_length=20000,
                      tokenize=tokenizer,
                      pad_first=True,
                      tensor_type=torch.cuda.LongTensor,
                      lower=True,
                      batch_first=True)

    train_data, test_data = data.TabularDataset.splits(
        path='.',
        format='csv',
        skip_header=True,
        train='blogs_training.csv',
        validation='blogs_testing.csv',
        fields=[('index', None), ('text', TEXT), ('fileIndex', None),
                ('label', LABEL), ('age', None), ('industry', None),
                ('hscope', None)])

    #    train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
    TEXT.build_vocab(train_data, vectors=GloVe(name='twitter.27B', dim=100))
    LABEL.build_vocab(train_data)

    pickle.dump(TEXT, open("TEXT.pickle", "wb"))

    word_embeddings = TEXT.vocab.vectors
    print("Length of Text Vocabulary: " + str(len(TEXT.vocab)))
    print("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size())
    print("Label Length: " + str(len(LABEL.vocab)))

    train_data, valid_data = train_data.split(
    )  # Further splitting of training_data to create new training_data & validation_data
    train_iter, valid_iter, test_iter = data.BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=32,
        sort_key=lambda x: len(x.text),
        repeat=False,
        shuffle=True)
    '''Alternatively we can also use the default configurations'''
    # train_iter, test_iter = datasets.IMDB.iters(batch_size=32)

    vocab_size = len(TEXT.vocab)

    return TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter
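A hedged sketch of how the returned objects are typically consumed; the embedding layer shown here is an illustration, not part of the original snippet.

# Sketch: seed a trainable embedding layer from the pre-trained GloVe vectors
# and read one batch from the BucketIterator (the TEXT field is batch_first).
import torch.nn as nn

TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter = load_dataset()

batch = next(iter(train_iter))
token_ids = batch.text                 # (batch, fix_length) token indices
labels = batch.label                   # one label per example

embedding = nn.Embedding.from_pretrained(word_embeddings, freeze=False).to(token_ids.device)
embedded = embedding(token_ids)        # (batch, fix_length, embedding_dim)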
Example No. 25
0
INPUT = data.Field(fix_length=50, batch_first=False)
LABEL = data.Field(sequential=False)

fields = {'label': ('label', LABEL), 'input': ('input', INPUT)}

train, valid = data.TabularDataset.splits(path='/jet/prs/workspace',
                                          train='train.json',
                                          validation='valid.json',
                                          format='json',
                                          fields=fields)

# In[8]:

print(vars(train[0]))

# In[9]:

INPUT.build_vocab(train,
                  vectors=GloVe(name='6B', dim=300),
                  max_size=10000,
                  min_freq=10)
LABEL.build_vocab(train, )

# In[ ]:

# FOR DEBUGGING ONLY

#print(INPUT.vocab.freqs)
#print(INPUT.vocab.vectors)
#print(INPUT.vocab.stoi)

Example No. 26
0
inputs = data.Field(lower=True, include_lengths=True, batch_first=True)

print('Generating train, dev, test splits')
train, dev, test = datasets.IWSLT.splits(root=args.data,
                                         exts=['.en', '.de'],
                                         fields=[inputs, inputs])
train_iter, dev_iter, test_iter = data.Iterator.splits(
    (train, dev, test),
    batch_size=100,
    device=torch.device(args.device) if args.device >= 0 else None)

print('Building vocabulary')
inputs.build_vocab(train, dev, test)
inputs.vocab.load_vectors(
    vectors=GloVe(name='840B', dim=300, cache=args.embeddings))

outputs_last_layer_cove = MTLSTM(n_vocab=len(inputs.vocab),
                                 vectors=inputs.vocab.vectors)
outputs_both_layer_cove = MTLSTM(n_vocab=len(inputs.vocab),
                                 vectors=inputs.vocab.vectors,
                                 layer0=True)
outputs_both_layer_cove_with_glove = MTLSTM(n_vocab=len(inputs.vocab),
                                            vectors=inputs.vocab.vectors,
                                            layer0=True,
                                            residual_embeddings=True)

if args.device >= 0:
    outputs_last_layer_cove.cuda()
    outputs_both_layer_cove.cuda()
    outputs_both_layer_cove_with_glove.cuda()
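A hedged sketch of encoding one batch with the CoVe variants above; it assumes MTLSTM.forward takes (token ids, lengths), as in the cove reference example, and that batch.src carries both because include_lengths=True.

# Hedged sketch: batch.src is (ids, lengths) since include_lengths=True, and
# MTLSTM.forward(ids, lengths) is assumed from the cove reference example.
train_iter.init_epoch()
batch = next(iter(train_iter))
ids, lengths = batch.src
cove_top = outputs_last_layer_cove(ids, lengths)                     # top BiLSTM layer only
cove_both = outputs_both_layer_cove(ids, lengths)                    # both BiLSTM layers
cove_with_glove = outputs_both_layer_cove_with_glove(ids, lengths)   # GloVe + CoVe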
Example No. 27
0
def main():
    patience_counter = 0
    # handle and display arguments
    args = parser.parse_args()
    pprint.PrettyPrinter().pprint(args.__dict__)

    # handling timestamp:
    cur_date = datetime.now()
    now_str = '%d-%d-%d_%d:%d' % (cur_date.year, cur_date.month, cur_date.day, cur_date.hour, cur_date.minute)
    model_path, learning_curve_path, roc_curve_path, conf_mat_path, norm_conf_mat_path, args_path, roc_curve_path_ext_swapped, roc_curve_path_ext_regular = set_plots_model_names(
        now_str, args)
    args.norm_conf_mat_path = norm_conf_mat_path
    args.roc_curve_path_ext_swapped = roc_curve_path_ext_swapped
    args.roc_curve_path_ext_regular = roc_curve_path_ext_regular

    # handle cuda usage
    args.use_cuda = args.yes_cuda > 0 and torch.cuda.is_available()
    device = torch.device("cuda" if args.use_cuda else "cpu")

    # set a seed to ensure deterministic start
    torch.manual_seed(args.seed)
    if args.use_cuda:
        torch.cuda.manual_seed(args.seed)
    # print type of execution
    print('CUDA device_count {0}'.format(torch.cuda.device_count())
          if args.use_cuda else 'CPU')

    # to get the right dataset
    if args.task_type == "nsp":
        train, val, test, TEXT, LABELS = get_nsp_dataset(args)
    elif args.task_type == "snli":
        train, val, test, TEXT, LABELS = get_snli_dataset(args)

    # create batches:
    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train, val, test), batch_sizes=(args.batch_size, args.batch_size, args.batch_size),
        sort_key=lambda x: len(x.premise), device=device, repeat=False)

    TEXT.build_vocab(train, vectors=GloVe(name='840B', dim=args.embedding_dim))
    LABELS.build_vocab(train)


    print('#examples', len(train_iter.dataset), len(val_iter.dataset),
          len(test_iter.dataset))

    model = LSTM_for_SNLI(args, TEXT, LABELS).to(device)

    optimizer = optim.Adam(model.req_grad_params, lr=args.lr,
                           betas=(0.9, 0.999), amsgrad=True)

    loss_func = nn.CrossEntropyLoss().to(device)

    best_loss = float('inf')
    best_valid_acc = float('-inf')
    best_acc = 0.
    best_epoch = 0

    test_losses = []
    test_accuracies = []

    valid_losses = []
    valid_accuracies = []

    train_losses = []
    train_accuracies = []
    is_last_one = False

    for epoch in range(1, args.epochs + 1):

        train_loss, train_acc = train_epoch(device, train_iter, model, epoch, optimizer, loss_func, args)
        train_losses.append(train_loss)
        train_accuracies.append(train_acc)

        valid_loss, valid_acc = evaluate_epoch(device, val_iter, model, epoch, loss_func, 'Valid', args)
        valid_losses.append(valid_loss)
        valid_accuracies.append(valid_acc)

        if valid_acc >= best_valid_acc:
            patience_counter = 0
            best_valid_acc = valid_acc
        else:
            patience_counter += 1

        if valid_loss < best_loss:
            best_loss = valid_loss
            best_acc = valid_acc
            best_epoch = epoch
        print('\tLowest Valid Loss {:.6f}, Acc. {:.1f}%, Epoch {}'.
              format(best_loss, 100 * best_acc, best_epoch))
        if patience_counter > args.patience:
            is_last_one = True
        iter_test_loss, iter_test_accuracy = evaluate_epoch(device, test_iter, model, epoch, loss_func, 'Test', args, finish=is_last_one)
        test_losses.append(iter_test_loss)
        test_accuracies.append(iter_test_accuracy)

        # forced finish in case of overfitting
        if patience_counter > args.patience:
            print('Training terminated: PATIENCE exceeded')
            break

        # learning rate decay
        for param_group in optimizer.param_groups:
            print('lr: {:.6f} -> {:.6f}'
                  .format(param_group['lr'], param_group['lr'] * args.lr_decay))
            param_group['lr'] *= args.lr_decay

    # draw_results
    draw_learning_curve(train_accuracies, valid_accuracies, path=learning_curve_path)

    # Saving the full model below requires torchtext newer than 0.4.0, but the DTU
    # server only has 0.3.1; ask the admins?

    # Save model
    #if args.save_model:
    #    torch.save(model, model_path)
    #    print('Model saved: ', str(model_path))


    # external dataset evaluation:

    if args.eed == True:

        evaluation_iter, eval_dataset = load_evaluation_dataset(TEXT, LABELS, path=args.eed_regular, )

        test_targs, test_preds, raw_outputs_class_one, raw_outputs_class_two = [], [], [], []

        ### Evaluate test set
        model.eval()
        for batch_idx, batch in enumerate(evaluation_iter):
            output = model(batch.premise[0], batch.premise[1], batch.hypothesis[0], batch.hypothesis[1])
            preds = torch.max(output, 1)[1]

            if (args.use_cuda):
                raw_outputs_class_one += list(get_numpy(output[:, 0]))
                raw_outputs_class_two += list(get_numpy(output[:, 1]))
                test_targs += list(get_numpy(batch.label))
                test_preds += list(get_numpy(preds.data))
            else:
                raw_outputs_class_one += list(get_numpy(output[:, 0]))
                raw_outputs_class_two += list(get_numpy(output[:, 1]))
                test_preds += list(preds.data.numpy())
                test_targs += list(batch.label.numpy())
        test_accuracy = accuracy_score(test_targs, test_preds)

        print("\nEvaluation set Acc:  %f" % (test_accuracy))
        print('size of evaluation dataset: %d sentence pairs' % (len(eval_dataset)))

        y_test_preds = np.array(raw_outputs_class_two)
        y_test_targs = np.array(test_targs)

        draw_roc_curve(y_test_preds, y_test_targs, path=args.roc_curve_path_ext_regular)

    if args.eed == True:

        evaluation_iter, eval_dataset = load_evaluation_dataset(TEXT, LABELS, path=args.eed_swapped)

        test_targs, test_preds, raw_outputs_class_one, raw_outputs_class_two = [], [], [], []

        ### Evaluate test set
        model.eval()
        for batch_idx, batch in enumerate(evaluation_iter):
            output = model(batch.premise[0], batch.premise[1], batch.hypothesis[0], batch.hypothesis[1])
            preds = torch.max(output, 1)[1]

            if (args.use_cuda):
                raw_outputs_class_one += list(get_numpy(output[:, 0]))
                raw_outputs_class_two += list(get_numpy(output[:, 1]))
                test_targs += list(get_numpy(batch.label))
                test_preds += list(get_numpy(preds.data))
            else:
                raw_outputs_class_one += list(get_numpy(output[:, 0]))
                raw_outputs_class_two += list(get_numpy(output[:, 1]))
                test_preds += list(preds.data.numpy())
                test_targs += list(batch.label.numpy())
        test_accuracy = accuracy_score(test_targs, test_preds)
        print("\nEvaluation set Acc:  %f" % (test_accuracy))
        print('size of evaluation dataset: %d sentence pairs' % (len(eval_dataset)))

        y_test_preds = np.array(raw_outputs_class_two)
        y_test_targs = np.array(test_targs)

        draw_roc_curve(y_test_preds, y_test_targs, path=args.roc_curve_path_ext_swapped  )

    if args.test_con == True:
        evaluation_iter, eval_dataset = load_evaluation_dataset(TEXT, LABELS, path=args.con_data_path)

        test_targs, test_preds, raw_outputs_class_one, raw_outputs_class_two = [], [], [], []

        ### Evaluate test set
        model.eval()
        for batch_idx, batch in enumerate(evaluation_iter):
            output = model(batch.premise[0], batch.premise[1], batch.hypothesis[0], batch.hypothesis[1])
            preds = torch.max(output, 1)[1]

            if (args.use_cuda):
                raw_outputs_class_one += list(get_numpy(output[:, 0]))
                raw_outputs_class_two += list(get_numpy(output[:, 1]))
                test_targs += list(get_numpy(batch.label))
                test_preds += list(get_numpy(preds.data))
            else:
                raw_outputs_class_one += list(get_numpy(output[:, 0]))
                raw_outputs_class_two += list(get_numpy(output[:, 1]))
                test_preds += list(preds.data.numpy())
                test_targs += list(batch.label.numpy())
        test_accuracy = accuracy_score(test_targs, test_preds)
        print("\nEvaluation of Consecutive data set Acc:  %f" % (test_accuracy))
        print('size of evaluation dataset: %d sentence pairs' % (len(eval_dataset)))

    if args.test_rand == True:
        evaluation_iter, eval_dataset = load_evaluation_dataset(TEXT, LABELS, path=args.rand_data_path)

        test_targs, test_preds, raw_outputs_class_one, raw_outputs_class_two = [], [], [], []

        ### Evaluate test set
        model.eval()
        for batch_idx, batch in enumerate(evaluation_iter):
            output = model(batch.premise[0], batch.premise[1], batch.hypothesis[0], batch.hypothesis[1])
            preds = torch.max(output, 1)[1]

            if (args.use_cuda):
                raw_outputs_class_one += list(get_numpy(output[:, 0]))
                raw_outputs_class_two += list(get_numpy(output[:, 1]))
                test_targs += list(get_numpy(batch.label))
                test_preds += list(get_numpy(preds.data))
            else:
                raw_outputs_class_one += list(get_numpy(output[:, 0]))
                raw_outputs_class_two += list(get_numpy(output[:, 1]))
                test_preds += list(preds.data.numpy())
                test_targs += list(batch.label.numpy())
        test_accuracy = accuracy_score(test_targs, test_preds)
        print("\nEvaluation of Random data set Acc:  %f" % (test_accuracy))
        print('size of evaluation dataset: %d sentence pairs' % (len(eval_dataset)))

    if args.test_swap == True:
        evaluation_iter, eval_dataset = load_evaluation_dataset(TEXT, LABELS, path=args.swap_data_path)

        test_targs, test_preds, raw_outputs_class_one, raw_outputs_class_two = [], [], [], []

        ### Evaluate test set
        model.eval()
        for batch_idx, batch in enumerate(evaluation_iter):
            output = model(batch.premise[0], batch.premise[1], batch.hypothesis[0], batch.hypothesis[1])
            preds = torch.max(output, 1)[1]

            if (args.use_cuda):
                raw_outputs_class_one += list(get_numpy(output[:, 0]))
                raw_outputs_class_two += list(get_numpy(output[:, 1]))
                test_targs += list(get_numpy(batch.label))
                test_preds += list(get_numpy(preds.data))
            else:
                raw_outputs_class_one += list(get_numpy(output[:, 0]))
                raw_outputs_class_two += list(get_numpy(output[:, 1]))
                test_preds += list(preds.data.numpy())
                test_targs += list(batch.label.numpy())
        test_accuracy = accuracy_score(test_targs, test_preds)
        print("\nEvaluation of Swapped data set Acc:  %f" % (test_accuracy))
        print('size of evaluation dataset: %d sentence pairs' % (len(eval_dataset)))
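The four external-evaluation blocks above repeat the same loop; the sketch below factors out that shared logic (the helper name is hypothetical, and torch.no_grad() is an addition for inference only).

# Hedged sketch of the shared evaluation loop used in the blocks above; the
# helper name is hypothetical and torch.no_grad() is added for inference only.
def evaluate_external(model, evaluation_iter, use_cuda):
    model.eval()
    test_targs, test_preds, raw_outputs_class_two = [], [], []
    with torch.no_grad():
        for batch in evaluation_iter:
            output = model(batch.premise[0], batch.premise[1],
                           batch.hypothesis[0], batch.hypothesis[1])
            preds = torch.max(output, 1)[1]
            raw_outputs_class_two += list(get_numpy(output[:, 1]))
            if use_cuda:
                test_targs += list(get_numpy(batch.label))
                test_preds += list(get_numpy(preds.data))
            else:
                test_targs += list(batch.label.numpy())
                test_preds += list(preds.data.numpy())
    accuracy = accuracy_score(test_targs, test_preds)
    return accuracy, np.array(raw_outputs_class_two), np.array(test_targs)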
Example No. 28
0
TEXT = Field(batch_first=True,
             stop_words=set(stopwords.words('english')))
#preprocessing=lambda x: [porter.stem(word) for word in x])
LABEL = Field(sequential=True,
              lower=True,
              use_vocab=True,
              is_target=True,
              unk_token=None,
              pad_token=None,
              batch_first=True)

# make splits for data
train, test = datasets.IMDB.splits(TEXT, LABEL)

# build the vocabulary
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
LABEL.build_vocab(train)

# make iterator for splits
train_iter, test_iter = BucketIterator.splits((train, test),
                                              batch_sizes=(64, 64),
                                              device=device,
                                              sort_key=lambda x: len(x.text),
                                              sort_within_batch=False,
                                              repeat=False,
                                              shuffle=True)
''' Define model '''
model = LSTM(vocab_size=len(TEXT.vocab.stoi),
             embed_size=300,
             hidden_dim=400,
             batch_size=64)

stopWords = [
    "mightn", "more", "most", "my", "myself", "needn", "now", "o", "of", "off",
    "on", "once", "only", "or", "other", "our", "ours", "ourselves", "out",
    "over", "own", "re", "s", "same", "shan", "she", "she's", "should",
    "should've", "shouldn", "so", "some", "such", "t", "than", "that",
    "that'll", "the", "their", "theirs", "them", "themselves", "then", "there",
    "these", "they", "this", "those", "through", "to", "too", "under", "until",
    "up", "ve", "very", "was", "wasn", "we", "were", "weren", "what", "when",
    "where", "which", "while", "who", "whom", "why", "will", "with", "won",
    "wouldn", "y", "you", "you'd", "you'll", "you're", "you've", "your",
    "yours", "yourself", "yourselves", "could", "he'd", "he'll", "he's",
    "here's", "how's", "i'd", "i'll", "i'm", "i've", "let's", "ought", "she'd",
    "she'll", "that's", "there's", "they'd", "they'll", "they're", "they've",
    "we'd", "we'll", "we're", "we've", "what's", "when's", "where's", "who's",
    "why's", "would"
]
wordVectors = GloVe(name='6B', dim=300)

###########################################################################
##### The following determines the processing of label data (ratings) #####
###########################################################################


def convertLabel(datasetLabel):
    """
    Labels (product ratings) from the dataset are provided to you as
    floats, taking the values 1.0, 2.0, 3.0, 4.0, or 5.0.
    You may wish to train with these as they are, or you may wish
    to convert them to another representation in this function.
    Consider regression vs classification.
    """
    # label = datasetLabel.view((1,-1))
    # Returning the labels unchanged keeps the task as regression on the raw ratings.
    return datasetLabel
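If classification with nn.CrossEntropyLoss is preferred instead, one hedged option (an illustration, not the snippet's required scheme) is to shift the 1.0 to 5.0 ratings into integer class indices 0 to 4:

# Hypothetical alternative: map float ratings 1.0..5.0 to class indices 0..4
# for use with a 5-way classifier and nn.CrossEntropyLoss.
def convertLabelToClasses(datasetLabel):
    return (datasetLabel - 1).long()
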
def load_dataset(batch_size, test_sen=None):

    office_actions = pd.read_csv('/mnt/data/training-patent-data4144f61d-a15b-421e-9346-659741ee1c22/office_actions.csv', usecols=['app_id', 'ifw_number', 'rejection_102', 'rejection_103'], nrows=100000)

    abstractList = []
    idList = []
    rejectionColumn = []
    for num in range(10000):

        app_id = str(office_actions.app_id[num])
        filename = "/mnt/data/training-patent-data4144f61d-a15b-421e-9346-659741ee1c22/json_files_1/oa_"+app_id+".json"

        try:
            jfile = open(filename, 'r')
        except FileNotFoundError:
            print("File Not Found")
            continue

        parsed_json = json.load(jfile)
        jfile.close()

        try:
            abstractList.append(parsed_json[0]['abstract_full'])
            idList.append(parsed_json[0]['application_number'])
        except IndexError:
            print("WARNING: file "+filename+" is empty!\n")
            continue

        n = int(office_actions.rejection_102[num])
        o = int(office_actions.rejection_103[num])
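        # Collapse the two rejection flags into a binary target: 1 if the office
        # action includes a 103 (obviousness) rejection, 0 otherwise.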

        if n == 0 and o == 0:
            rejType = 0 #neither
        elif n == 0 and o == 1:
            rejType = 1 #obvious
        elif n == 1 and o == 0:
            rejType = 0 #novelty
        elif n == 1 and o == 1:
            rejType = 1 #both
        else:
            # rejection_102 / rejection_103 are expected to be 0 or 1
            raise ValueError("unexpected rejection flags: 102=%d, 103=%d" % (n, o))

        rejectionColumn.append(rejType)

    all_data = {'text': abstractList, 'label': rejectionColumn}
    df = pd.DataFrame(all_data, index = idList)

    tokenize = lambda x: x.split()
    TEXT = Field(sequential=True, tokenize=tokenize, lower=True, include_lengths=True, batch_first=True, fix_length=200)
    LABEL = LabelField(sequential=False)
    #fields={'Abstract': ('text', TEXT), 'RejectionType': ('labels', LABEL)}
    fields={'text': TEXT, 'label': LABEL}


    ds = DataFrameDataset(df, fields)

    TEXT.build_vocab(ds, vectors=GloVe(name='6B', dim=300))
    LABEL.build_vocab(ds)

    train_data, test_data = ds.split()
    train_data, valid_data = train_data.split() # Further splitting of training_data to create new training_data & validation_data

    word_embeddings = TEXT.vocab.vectors
    print ("Length of Text Vocabulary: " + str(len(TEXT.vocab)))
    print ("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size())
    print ("Label Length: " + str(len(LABEL.vocab)))

    train_iter, valid_iter, test_iter = BucketIterator.splits((train_data, valid_data, test_data), batch_size=batch_size, sort_key=lambda x: len(x.text), repeat=False, shuffle=True)

    vocab_size = len(TEXT.vocab)

    return TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter