def test_encoder_decoder_model(self):
    # Load only the encoder half of a seq2seq model by passing sub_module.
    token_embedder = PretrainedTransformerEmbedder("facebook/bart-large",
                                                   sub_module="encoder")
    token_ids = torch.LongTensor([[1, 2, 3], [2, 3, 4]])
    mask = torch.ones_like(token_ids).bool()
    token_embedder(token_ids, mask)
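For context, the forward call above returns a tensor of shape (batch_size, num_tokens, embedding_dim). A self-contained version of the same check (a sketch assuming allennlp is installed; bart-large's hidden size is 1024):

import torch
from allennlp.modules.token_embedders import PretrainedTransformerEmbedder

embedder = PretrainedTransformerEmbedder("facebook/bart-large", sub_module="encoder")
token_ids = torch.LongTensor([[1, 2, 3], [2, 3, 4]])
mask = torch.ones_like(token_ids).bool()
embeddings = embedder(token_ids, mask)
print(embeddings.shape)  # torch.Size([2, 3, 1024])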
Example 2
def __init__(self, model_name: str, max_length: int = None) -> None:
    super().__init__()
    # The "matched" embedder works on wordpieces; this wrapper later pools
    # them back to the original ("mismatched") tokens.
    self._matched_embedder = PretrainedTransformerEmbedder(model_name,
                                                           max_length=max_length)
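This looks like the start of AllenNLP's PretrainedTransformerMismatchedEmbedder, which embeds the wordpieces with the matched embedder and then pools them back to one vector per original token. A hedged usage sketch (the model name is just an example):

from allennlp.data import Token, Vocabulary
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import PretrainedTransformerMismatchedIndexer
from allennlp.modules.token_embedders import PretrainedTransformerMismatchedEmbedder

indexer = PretrainedTransformerMismatchedIndexer("bert-base-uncased")
field = TextField([Token("transformers"), Token("rock")], {"tokens": indexer})
field.index(Vocabulary())
tensors = field.batch_tensors([field.as_tensor(field.get_padding_lengths())])

embedder = PretrainedTransformerMismatchedEmbedder("bert-base-uncased")
embeddings = embedder(**tensors["tokens"])
print(embeddings.shape)  # (1, 2, 768): one vector per original token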
Example 3
    def __init__(
        self,
        vocab: Vocabulary,
        transformer_model_name: str = "bert-base-uncased",
        feedforward: Optional[FeedForward] = None,
        smoothing: bool = False,
        smooth_alpha: float = 0.7,
        sentiment_task: bool = False,
        sentiment_task_weight: float = 1.0,
        sentiment_classification_with_label: bool = True,
        sentiment_seq2vec: Optional[Seq2VecEncoder] = None,
        candidate_span_task: bool = False,
        candidate_span_task_weight: float = 1.0,
        candidate_delay: int = 30000,
        candidate_span_num: int = 5,
        candidate_classification_layer_units: int = 128,
        candidate_span_extractor: Optional[SpanExtractor] = None,
        candidate_span_with_logits: bool = False,
        dropout: Optional[float] = None,
        **kwargs,
    ) -> None:
        super().__init__(vocab, **kwargs)
        if "BERTweet" not in transformer_model_name:
            self._text_field_embedder = BasicTextFieldEmbedder({
                "tokens":
                PretrainedTransformerEmbedder(transformer_model_name)
            })
        else:
            self._text_field_embedder = BasicTextFieldEmbedder(
                {"tokens": TweetBertEmbedder(transformer_model_name)})
        # span start & end task
        if feedforward is None:
            self._linear_layer = nn.Sequential(
                nn.Linear(self._text_field_embedder.get_output_dim(), 128),
                nn.ReLU(),
                nn.Linear(128, 2),
            )
        else:
            self._linear_layer = feedforward
        self._span_start_accuracy = CategoricalAccuracy()
        self._span_end_accuracy = CategoricalAccuracy()
        self._span_accuracy = BooleanAccuracy()
        self._jaccard = Jaccard()
        self._candidate_delay = candidate_delay
        self._delay = 0

        self._smoothing = smoothing
        self._smooth_alpha = smooth_alpha
        if smoothing:
            self._loss = nn.KLDivLoss(reduction="batchmean")
        else:
            self._loss = nn.CrossEntropyLoss()
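        # nn.KLDivLoss expects log-probabilities as input and a (smoothed)
        # target distribution, while nn.CrossEntropyLoss takes raw logits and
        # integer class indices; the forward pass presumably prepares the
        # span logits accordingly.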

        # sentiment task
        self._sentiment_task = sentiment_task
        if self._sentiment_task:
            self._sentiment_classification_accuracy = CategoricalAccuracy()
            self._sentiment_loss_log = LossLog()
            self.register_buffer("sentiment_task_weight",
                                 torch.tensor(sentiment_task_weight))
            self._sentiment_classification_with_label = (
                sentiment_classification_with_label)
            if sentiment_seq2vec is None:
                raise ConfigurationError(
                    "sentiment_task is True, so a sentiment seq2vec encoder is required"
                )
            else:
                self._sentiment_encoder = sentiment_seq2vec
                self._sentiment_linear = nn.Linear(
                    self._sentiment_encoder.get_output_dim(),
                    vocab.get_vocab_size("labels"),
                )

        # candidate span task
        self._candidate_span_task = candidate_span_task
        if candidate_span_task:
            assert candidate_span_num > 0
            assert candidate_span_task_weight > 0
            assert candidate_classification_layer_units > 0
            self._candidate_span_num = candidate_span_num
            self.register_buffer("candidate_span_task_weight",
                                 torch.tensor(candidate_span_task_weight))
            self._candidate_classification_layer_units = (
                candidate_classification_layer_units)
            self._span_classification_accuracy = CategoricalAccuracy()
            self._candidate_loss_log = LossLog()
            self._candidate_span_linear = nn.Linear(
                self._text_field_embedder.get_output_dim(),
                self._candidate_classification_layer_units,
            )

            if candidate_span_extractor is None:
                self._candidate_span_extractor = EndpointSpanExtractor(
                    input_dim=self._candidate_classification_layer_units)
            else:
                self._candidate_span_extractor = candidate_span_extractor

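            # The `+ 1` below accounts for one extra input feature when the
            # candidate span's logit is carried along with its extracted
            # representation (an inference from the flag name).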
            if candidate_span_with_logits:
                self._candidate_with_logits = True
                self._candidate_span_vec_linear = nn.Linear(
                    self._candidate_span_extractor.get_output_dim() + 1, 1)
            else:
                self._candidate_with_logits = False
                self._candidate_span_vec_linear = nn.Linear(
                    self._candidate_span_extractor.get_output_dim(), 1)

            self._candidate_jaccard = Jaccard()

        if sentiment_task or candidate_span_task:
            self._base_loss_log = LossLog()
        else:
            self._base_loss_log = None

        if dropout is not None:
            self._dropout = nn.Dropout(dropout)
        else:
            self._dropout = None
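A note on the span-extractor default used above: EndpointSpanExtractor's default combination is "x,y" (concatenate each span's start and end vectors), so its output dimension is twice its input dimension, which is what the candidate-span linear layers rely on. A minimal standalone check, assuming allennlp is installed:

import torch
from allennlp.modules.span_extractors import EndpointSpanExtractor

extractor = EndpointSpanExtractor(input_dim=128)  # default combination: "x,y"
print(extractor.get_output_dim())  # 256

sequence = torch.randn(2, 10, 128)           # (batch, seq_len, input_dim)
spans = torch.LongTensor([[[0, 3], [2, 5]],  # (batch, num_spans, 2), with
                          [[1, 1], [4, 9]]])  # inclusive start/end indices
print(extractor(sequence, spans).shape)      # torch.Size([2, 2, 256])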
Example 4

    parser.add_argument('--cache-path', type=str)
    parser.add_argument('--batch-size', type=int, default=12)
    parser.add_argument('--model', type=str, default='bert-base-cased')
    parser.add_argument('--file-suffix',
                        type=str,
                        choices=['.json', '.comm', '.concrete'],
                        default='.json')
    parser.add_argument('--normalize-token', action='store_true')
    parser.add_argument('--cuda', action='store_true')
    args = parser.parse_args()

    tokenizer = PretrainedTransformerTokenizer(model_name=args.model)
    token_indexers = {
        'token': PretrainedTransformerIndexer(model_name=args.model)
    }
    model = PretrainedTransformerEmbedder(model_name=args.model)
    if args.cuda:
        device = 'cuda'
        model.to('cuda')
    else:
        device = 'cpu'

    o = h5py.File(args.cache_path, 'w')
    write_to_hdf5(
        cache_handler=o,
        embed_stream=debatch_embedding_stream(
            embed_stream=get_embedding_stream(
                embedder=model,
                batch_stream=compose_batch_stream(
                    ins_stream=instance_stream(file_path=args.input_path,
                                               tokenizer=tokenizer,
Example 5

# Again, it's easier to just run the data code to get the right output.

# We're using the smallest transformer model we can here, so that it runs on
# binder.
transformer_model = 'google/reformer-crime-and-punishment'
tokenizer = PretrainedTransformerTokenizer(model_name=transformer_model)
token_indexer = PretrainedTransformerIndexer(model_name=transformer_model)
text = "Some text with an extraordinarily long identifier."
tokens = tokenizer.tokenize(text)
print("Transformer tokens:", tokens)
text_field = TextField(tokens, {'bert_tokens': token_indexer})
# `vocab` is assumed to have been built earlier in the guide this excerpt
# comes from.
text_field.index(vocab)
token_tensor = text_field.as_tensor(text_field.get_padding_lengths())
print("Transformer tensors:", token_tensor)

embedding = PretrainedTransformerEmbedder(model_name=transformer_model)

embedder = BasicTextFieldEmbedder(token_embedders={'bert_tokens': embedding})

tensor_dict = text_field.batch_tensors([token_tensor])
embedded_tokens = embedder(tensor_dict)
print("Transformer embedded tokens:", embedded_tokens)

# Use GloVe
# This is what gets created by TextField.as_tensor with a SingleIdTokenIndexer;
# see the exercises above.
token_tensor = {'tokens': {'tokens': torch.LongTensor([1, 3, 2, 1, 4, 3])}}

vocab = Vocabulary()
vocab.add_tokens_to_namespace(['This', 'is', 'some', 'text', '.'],
                              namespace='token_vocab')
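The excerpt cuts off here; for context, the GloVe lookup would typically be built from an AllenNLP Embedding with a pretrained_file. A hedged sketch (the file path is a placeholder):

from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding

# "path/to/glove.6B.50d.txt" is a placeholder; point it at a real GloVe file.
glove_embedding = Embedding(
    embedding_dim=50,
    pretrained_file="path/to/glove.6B.50d.txt",
    vocab=vocab,
    vocab_namespace="token_vocab",
)
glove_embedder = BasicTextFieldEmbedder(token_embedders={"tokens": glove_embedding})
print(glove_embedder(token_tensor).shape)  # (6, 50): one vector per token id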