def create_and_check_transfo_xl_for_sequence_classification(self, config, input_ids_1, input_ids_2, lm_labels):
    # Attach a sequence-classification head with the tester's label count
    config.num_labels = self.num_labels
    model = TransfoXLForSequenceClassification(config)
    model.to(torch_device)
    model.eval()
    # Forward pass without labels; only the logits shape is verified
    result = model(input_ids_1)
    self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
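A checker like this is normally driven from a test method on the model test class. Below is a minimal sketch of that call, assuming the usual model_tester / prepare_config_and_inputs() pattern from the Transformers test suite, which returns the (config, input_ids_1, input_ids_2, lm_labels) tuple the checker expects:

def test_transfo_xl_sequence_classification_model(self):
    config_and_inputs = self.model_tester.prepare_config_and_inputs()
    self.model_tester.create_and_check_transfo_xl_for_sequence_classification(*config_and_inputs)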
Example #2
import torch
from transformers import TransfoXLConfig, TransfoXLForSequenceClassification

def configure(pytorch_chkp=None):
    """
    Configure and load the model.
    Expects n_token, input_length, and device to be defined at module level.
    :param pytorch_chkp: Optional path to a pre-trained PyTorch model checkpoint
    :return: The model, moved to the target device
    """

    # Quick toggle between two training configurations
    config1 = True
    if config1:
        n_head = 8
        n_layer = 12
        d_inner = 2048
        dropout = 0.1
    else:
        n_head = 6
        n_layer = 8
        d_inner = 1024
        dropout = 0.3

    # Initialise the Transformer XL configuration
    configuration = TransfoXLConfig(
        # Number of tokens in the vocabulary
        n_token=n_token,
        # Number of self-attention layers for encoder and decoder
        n_layer=n_layer,
        # Number of attention heads for each attention layer in encoder
        n_head=n_head,
        # Dimensionality of the model's hidden states
        d_model=input_length,
        # Dimensionality of each attention head
        d_head=input_length // n_head,
        # Inner dimension in feed-forward layer
        d_inner=d_inner,
        # Dropout probability
        dropout=dropout,
        # Dropout for attention probabilities
        dropatt=dropout,
        # Length of the retained previous hidden states (memory)
        mem_len=input_length,
        # Dimensionality of the embeddings
        d_embed=input_length,
        # Number of target tokens to predict
        tgt_len=input_length,
        # Length of the extended context
        ext_len=input_length,
        # Cutoffs for the adaptive softmax
        cutoffs=[],
        # Divisor value for the adaptive input and softmax
        div_val=-1,
        # Use the same positional embeddings after clamp_len
        clamp_len=-1,
        # Whether to use the same attention length for all tokens
        same_length=False,
        # Number of samples in the sampled softmax
        sample_softmax=1,
        # Tie input and output embedding weights
        tie_weight=True,
        tie_encoder_decoder=True,
        tie_word_embeddings=True,
        # Untie the relative position biases across layers
        untie_r=True,
        # Number of labels used for classification in the last layer
        num_labels=308,
        # Share all but the first projection in the adaptive softmax
        proj_share_all_but_first=False,
        # Make sure that this is greater than n_token!
        pad_token_id=309)

    # Initialise the model from the configuration
    model = TransfoXLForSequenceClassification(configuration)

    # Load a pre-trained checkpoint if it exists
    if pytorch_chkp is not None:
        model.load_state_dict(torch.load(pytorch_chkp, map_location=device))
        print("Loaded model checkpoint ", pytorch_chkp)

        # Apply dynamic quantisation to speed up inference when testing/generating on CPU
        if device.type != 'cuda':
            # Suppress a known warning raised when quantising ParameterList attributes
            import warnings
            warnings.filterwarnings(
                "ignore",
                message="Setting attributes on ParameterList is not supported."
            )
            # Only quantise the model for testing/generation, never during training;
            # quantisation also doesn't play well with all Nvidia GPUs
            model = torch.quantization.quantize_dynamic(
                model,
                {torch.nn.Linear, torch.nn.Softmax, torch.nn.Embedding, torch.nn.Dropout},
                dtype=torch.qint8,
            )

    return model.to(device)  # Move the model to the compute device (CPU/GPU)
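For context, here is a minimal sketch of how configure() might be driven end to end. The values chosen for the module-level globals n_token, input_length, and device are illustrative assumptions, not part of the original script:

import torch

# Assumed module-level globals that configure() reads
n_token = 308        # vocabulary size; kept below pad_token_id (309) per the note above
input_length = 512   # shared sequence/embedding length used throughout configure()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = configure(pytorch_chkp=None)  # or pass a path to a saved state_dict
model.eval()

with torch.no_grad():
    # Random token ids just to exercise the forward pass
    dummy_ids = torch.randint(0, n_token, (1, input_length), device=device)
    logits = model(dummy_ids).logits  # shape: (1, num_labels) == (1, 308)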