Example #1
 def __init__(self, pretrained_tokenizer: str, max_instances: int = None):
     super().__init__(max_instances=max_instances)
     self.tokenizer = PretrainedTransformerTokenizer(pretrained_tokenizer,
                                                     max_length=2000)
     self.token_indexers = {
         "tokens": PretrainedTransformerIndexer(pretrained_tokenizer)
     }
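The reader above only wires up the tokenizer/indexer pair. As a minimal standalone sketch (not part of the example; the model name is chosen arbitrarily for illustration), this is how such a pair is typically used to turn raw text into an indexable field:

from allennlp.data.fields import TextField
from allennlp.data.token_indexers import PretrainedTransformerIndexer
from allennlp.data.tokenizers import PretrainedTransformerTokenizer

# Tokenize raw text into wordpieces (special tokens are added by default) ...
tokenizer = PretrainedTransformerTokenizer("bert-base-uncased", max_length=2000)
tokens = tokenizer.tokenize("AllenNLP wraps HuggingFace tokenizers.")

# ... and wrap them in a TextField so the matching indexer can map them to
# wordpiece ids at indexing time.
token_indexers = {"tokens": PretrainedTransformerIndexer("bert-base-uncased")}
field = TextField(tokens, token_indexers)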
Example #2
    def __init__(self) -> None:
        self.config: Config = Config().parse_args(known_only=True)

        bert_token_indexers = PretrainedTransformerIndexer(
            model_name=self.config.model_name)
        bert_tokenizer = PretrainedTransformerTokenizer(
            model_name=self.config.model_name)
        reader = TextClassificationJsonReader(
            token_indexers={"tokens": bert_token_indexers},
            tokenizer=bert_tokenizer)

        train_instances = list(reader.read(self.config.train_file))
        dev_instances = list(reader.read(self.config.dev_file))
        test_instances = list(reader.read(self.config.test_file))

        self.vocab: Vocabulary = Vocabulary.from_instances(train_instances)

        # 2. init the data loader
        self.train_data_loader = SimpleDataLoader(train_instances,
                                                  self.config.batch_size,
                                                  shuffle=True)
        self.dev_data_loader = SimpleDataLoader(dev_instances,
                                                self.config.batch_size,
                                                shuffle=False)
        self.train_data_loader.index_with(self.vocab)
        self.dev_data_loader.index_with(self.vocab)

        # 3. init the model
        self.model = self.init_model()
        self.trainer = self.init_trainer()
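The init_model and init_trainer helpers called at the end are not shown in this example. A hypothetical sketch of what init_model could build, written here as a free function over the config's model name and the vocabulary (the function name, arguments, and choice of BasicClassifier are assumptions, not the original code):

from allennlp.data import Vocabulary
from allennlp.models import BasicClassifier
from allennlp.modules.seq2vec_encoders import ClsPooler
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import PretrainedTransformerEmbedder


def init_model(model_name: str, vocab: Vocabulary) -> BasicClassifier:
    # Embed the wordpiece ids with the same pretrained transformer and
    # classify from the [CLS] vector.
    embedder = PretrainedTransformerEmbedder(model_name=model_name)
    return BasicClassifier(
        vocab=vocab,
        text_field_embedder=BasicTextFieldEmbedder({"tokens": embedder}),
        seq2vec_encoder=ClsPooler(embedding_dim=embedder.get_output_dim()),
    )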
Example #3
 def __init__(
     self,
     sent1_col: str,
     sent2_col: str = None,
     label_col: str = 'label',
     bert_model: str = 'bert-base-uncased',
     max_sequence_length: int = 500,
     skip_label_indexing: bool = False,
     lower: bool = True,
     lazy: bool = False,
 ) -> None:
     super().__init__(lazy=lazy)
     self._sent1_col = sent1_col
     self._sent2_col = sent2_col
     self._label_col = label_col
     self._tokenizer = PretrainedTransformerTokenizer(
         bert_model,
         add_special_tokens=False,
         max_length=max_sequence_length
     )  # type: PretrainedTransformerTokenizer
     self._max_sequence_length = max_sequence_length
     self._skip_label_indexing = skip_label_indexing
     self._lower = lower
     self._token_indexers = {
         "tokens": PretrainedTransformerIndexer(model_name=bert_model)
     }
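Because the tokenizer above is built with add_special_tokens=False, the special tokens have to be added when the two sentences are combined. A hypothetical sketch of this reader's text_to_instance (not shown in the example), assuming a recent AllenNLP where PretrainedTransformerTokenizer.add_special_tokens is available:

from allennlp.data import Instance
from allennlp.data.fields import LabelField, TextField


def text_to_instance(self, sent1: str, sent2: str = None, label: str = None) -> Instance:
    # Belongs on the reader above: tokenize each sentence without special
    # tokens, then let the tokenizer insert [CLS]/[SEP] for the single
    # sentence or the sentence pair in one place.
    tokens1 = self._tokenizer.tokenize(sent1)
    tokens2 = self._tokenizer.tokenize(sent2) if sent2 is not None else None
    tokens = self._tokenizer.add_special_tokens(tokens1, tokens2)
    fields = {"tokens": TextField(tokens, self._token_indexers)}
    if label is not None:
        fields["label"] = LabelField(label, skip_indexing=self._skip_label_indexing)
    return Instance(fields)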
Example #4
    def __init__(
        self,
        model_name: str,
        vocab: Vocabulary,
        indexer: PretrainedTransformerIndexer = None,
        max_decoding_steps: int = 140,
        beam_size: int = 4,
        encoder: Seq2SeqEncoder = None,
    ):
        """
        # Parameters

        model_name : `str`, required
            Name of the pre-trained BART model to use. Available options can be found in
            `transformers.models.bart.modeling_bart.BART_PRETRAINED_MODEL_ARCHIVE_MAP`.
        vocab : `Vocabulary`, required
            Vocabulary containing source and target vocabularies.
        indexer : `PretrainedTransformerIndexer`, optional (default = `None`)
            Indexer to be used for converting decoded sequences of ids to sequences of tokens.
        max_decoding_steps : `int`, optional (default = `140`)
            Number of decoding steps during beam search.
        beam_size : `int`, optional (default = `4`)
            Number of beams to use in beam search. The default is from the BART paper.
        encoder : `Seq2SeqEncoder`, optional (default = `None`)
            Encoder to be used in BART. By default, the original BART encoder is used.
        """
        super().__init__(vocab)
        self.bart = BartForConditionalGeneration.from_pretrained(model_name)
        self._indexer = indexer or PretrainedTransformerIndexer(
            model_name, namespace="tokens")

        self._start_id = self.bart.config.bos_token_id  # CLS
        self._decoder_start_id = self.bart.config.decoder_start_token_id or self._start_id
        self._end_id = self.bart.config.eos_token_id  # SEP
        self._pad_id = self.bart.config.pad_token_id  # PAD

        self._max_decoding_steps = max_decoding_steps
        self._beam_search = BeamSearch(self._end_id,
                                       max_steps=max_decoding_steps,
                                       beam_size=beam_size or 1)

        self._rouge = ROUGE(
            exclude_indices={self._start_id, self._pad_id, self._end_id})
        self._bleu = BLEU(
            exclude_indices={self._start_id, self._pad_id, self._end_id})

        # Replace bart encoder with given encoder. We need to extract the two embedding layers so that
        # we can use them in the encoder wrapper
        if encoder is not None:
            assert (encoder.get_input_dim() == encoder.get_output_dim() ==
                    self.bart.config.hidden_size)
            self.bart.model.encoder = _BartEncoderWrapper(
                encoder,
                self.bart.model.encoder.embed_tokens,
                self.bart.model.encoder.embed_positions,
            )
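The assertion in the encoder branch requires a Seq2SeqEncoder whose input and output dimensions both equal BART's hidden size. A small standalone sketch of an encoder that satisfies that constraint (the model name "facebook/bart-base" is chosen only for illustration):

from allennlp.modules.seq2seq_encoders import PassThroughEncoder
from transformers import BartForConditionalGeneration

bart = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
# PassThroughEncoder returns its input unchanged, so both dims equal hidden_size.
encoder = PassThroughEncoder(input_dim=bart.config.hidden_size)
assert encoder.get_input_dim() == encoder.get_output_dim() == bart.config.hidden_size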
Example #5
 def __init__(
     self,
     hf_pretrained_tokenizer: str,
     cache_directory: str = 'data/cache',
     clue_prefix="cryptic crossword clue: ",
 ):
     super().__init__(cache_directory=cache_directory)
     self.clue_prefix = clue_prefix
     self.tokenizer = PretrainedTransformerTokenizer(
         hf_pretrained_tokenizer)
     self.token_indexers = {
         "tokens": PretrainedTransformerIndexer(hf_pretrained_tokenizer)
     }
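The reader's _read and text_to_instance methods are not shown; a hypothetical sketch of how clue_prefix would typically be applied (T5-style task prefixing) before tokenization:

from allennlp.data import Instance
from allennlp.data.fields import TextField


def text_to_instance(self, clue: str) -> Instance:
    # Belongs on the reader above: prepend the task prefix to the raw clue,
    # then tokenize and wrap the result in a TextField.
    tokens = self.tokenizer.tokenize(self.clue_prefix + clue)
    return Instance({"tokens": TextField(tokens, self.token_indexers)})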
Example #6
    def __init__(
        self,
        model_name: str,
        vocab: Vocabulary,
        beam_search: Lazy[BeamSearch] = Lazy(BeamSearch),
        indexer: PretrainedTransformerIndexer = None,
        encoder: Seq2SeqEncoder = None,
        **kwargs,
    ):
        super().__init__(vocab)
        self.bart = BartForConditionalGeneration.from_pretrained(model_name)
        self._indexer = indexer or PretrainedTransformerIndexer(model_name, namespace="tokens")

        self._start_id = self.bart.config.bos_token_id  # CLS
        self._decoder_start_id = self.bart.config.decoder_start_token_id or self._start_id
        self._end_id = self.bart.config.eos_token_id  # SEP
        self._pad_id = self.bart.config.pad_token_id  # PAD

        # At prediction time, we'll use a beam search to find the best target sequence.
        # For backwards compatibility, check if beam_size or max_decoding_steps were passed in as
        # kwargs. If so, update the BeamSearch arguments before constructing it and issue a DeprecationWarning.
        deprecation_warning = (
            "The parameter {} has been deprecated."
            " Provide this parameter as argument to beam_search instead."
        )
        beam_search_extras = {}
        if "beam_size" in kwargs:
            beam_search_extras["beam_size"] = kwargs["beam_size"]
            warnings.warn(deprecation_warning.format("beam_size"), DeprecationWarning)
        if "max_decoding_steps" in kwargs:
            beam_search_extras["max_steps"] = kwargs["max_decoding_steps"]
            warnings.warn(deprecation_warning.format("max_decoding_steps"), DeprecationWarning)
        self._beam_search = beam_search.construct(
            end_index=self._end_id, vocab=self.vocab, **beam_search_extras
        )

        self._rouge = ROUGE(exclude_indices={self._start_id, self._pad_id, self._end_id})
        self._bleu = BLEU(exclude_indices={self._start_id, self._pad_id, self._end_id})

        # Replace bart encoder with given encoder. We need to extract the two embedding layers so that
        # we can use them in the encoder wrapper
        if encoder is not None:
            assert (
                encoder.get_input_dim() == encoder.get_output_dim() == self.bart.config.hidden_size
            )
            self.bart.model.encoder = _BartEncoderWrapper(
                encoder,
                self.bart.model.encoder.embed_tokens,
                self.bart.model.encoder.embed_positions,
            )
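For reference, the deprecated kwargs map directly onto BeamSearch's own arguments (beam_size stays beam_size, max_decoding_steps becomes max_steps). A standalone sketch with illustrative values (the end_id here is only an example value):

from allennlp.nn.beam_search import BeamSearch

end_id = 2  # illustrative eos_token_id; BART configs typically use 2
beam_search = BeamSearch(end_index=end_id, max_steps=140, beam_size=4)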
Example #7
    def __init__(
            self,
            lazy: bool = True,
            bert_model_name: str = 'bert-base-uncased',
            max_bpe: int = None,
            # token_indexers: Dict[str, TokenIndexer] = PretrainedTransformerIndexer(bert_model_name),
            debug: bool = False,
            bertsum_oracle: bool = True,
            semantic_red_map: bool = True,
            semantic_red_map_key: List[str] = None) -> None:
        super().__init__(lazy=lazy)

        self._token_indexers = PretrainedTransformerIndexer(
            bert_model_name, True)
        if 'roberta' in bert_model_name:
            from transformers import RobertaTokenizer
            self._token_indexers.tokenizer = RobertaTokenizer.from_pretrained(
                bert_model_name)
            self.generic_tokenizer = RobertaTokenizer.from_pretrained(
                bert_model_name)
        elif 'bert' in bert_model_name:
            from transformers import BertTokenizer
            self._token_indexers.tokenizer = BertTokenizer.from_pretrained(
                bert_model_name)
            self.generic_tokenizer = BertTokenizer.from_pretrained(
                bert_model_name)
        # if max_bpe is not None:
        #     self._token_indexers['bert'].max_pieces = max_bpe
        self._debug = debug

        # self.bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name)
        # self.bert_tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
        logger.info("Finished initializing the dataset reader")
        if 'roberta' in bert_model_name:
            self.bert_lut = self.generic_tokenizer.decoder
        else:
            self.bert_lut = self.generic_tokenizer.ids_to_tokens
        self.max_bpe = max_bpe
        self.train_pts = []
        self.bertsum_oracle = bertsum_oracle

        self.semantic_red_map = semantic_red_map
        if semantic_red_map:
            self.map_kiosk = MapKiosk(semantic_red_map_key)

        random.seed(1112)
Example #8
def test_sequence_tagging_reader():
    model_name = 'bert-base-chinese'

    bert_token_indexers = PretrainedTransformerIndexer(model_name=model_name)
    reader = SequenceTaggingDatasetReader(
        token_indexers={"tokens": bert_token_indexers})

    train_file = './data/weibo/train.corpus'
    dev_file = './data/weibo/dev.corpus'
    test_file = './data/weibo/dev.corpus'
    train_instances = list(reader.read(train_file))
    dev_instances = list(reader.read(dev_file))
    test_instances = list(reader.read(test_file))

    vocab: Vocabulary = Vocabulary.from_instances(train_instances)
    assert vocab.get_namespaces() is not None

    bert_text_field_embedder = PretrainedTransformerEmbedder(
        model_name=model_name)
    tagger = SimpleTagger(
        vocab=vocab,
        text_field_embedder=BasicTextFieldEmbedder(
            token_embedders={'tokens': bert_text_field_embedder}),
        encoder=PassThroughEncoder(bert_text_field_embedder.get_output_dim()),
        calculate_span_f1=True,
        label_encoding="BMES",
        # verbose_metrics=True
    )

    train_data_loader, dev_data_loader = build_data_loaders(
        train_instances, dev_instances)
    train_data_loader.index_with(vocab)
    dev_data_loader.index_with(vocab)

    trainer = build_trainer(model=tagger,
                            serialization_dir='./output',
                            train_loader=train_data_loader,
                            dev_loader=dev_data_loader)
    print("Starting training")
    trainer.train()
    print("Finished training")
Example #10
def test_dataset_reader():
    model_name = 'bert-base-uncased'
    source_tokenizer = PretrainedTransformerTokenizer(model_name=model_name,
                                                      do_lowercase=True)
    target_tokenizer = PretrainedTransformerTokenizer(model_name=model_name,
                                                      do_lowercase=True)
    source_token_indexers = {
        "tokens":
        PretrainedTransformerIndexer(model_name=model_name,
                                     do_lowercase=True,
                                     namespace='bert')
    }

    ds = CopySeq2MultiSeqNetDatasetReader(
        target_namespace='bert',
        source_tokenizer=source_tokenizer,
        target_tokenizer=target_tokenizer,
        source_token_indexers=source_token_indexers,
        lazy=True,
        max_tokens=500,
        bert=True,
        max_extractions=10)

    instances = ds._read(
        "/Users/mostafa/git/deep/openie/imojie/data/train/4cr_qpbo_extractions.tsv"
    )

    for instance in list(instances)[:10]:
        print(instance)
        print('*' * 70)
    ds._validation = True
    instances2 = ds._read(
        "/Users/mostafa/git/deep/openie/imojie/data/dev/carb_sentences.txt")
    for instance in list(instances2)[:10]:
        print(instance)
        print('*' * 70)
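Note that do_lowercase comes from the 0.9-era AllenNLP API; in AllenNLP 1.x and later the argument was removed because lowercasing is handled by the underlying HuggingFace tokenizer. Under that assumption, the equivalent modern construction is simply:

from allennlp.data.token_indexers import PretrainedTransformerIndexer
from allennlp.data.tokenizers import PretrainedTransformerTokenizer

model_name = 'bert-base-uncased'
source_tokenizer = PretrainedTransformerTokenizer(model_name=model_name)
target_tokenizer = PretrainedTransformerTokenizer(model_name=model_name)
source_token_indexers = {
    "tokens": PretrainedTransformerIndexer(model_name=model_name, namespace='bert')
}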
Example #11

if __name__ == '__main__':

    set_seed(1234)

    # TODO: make this tokenizer initialization a method?
    uncased = True
    tokenizer = PretrainedTransformerTokenizer(model_name="bert-base-uncased",
                                               do_lowercase=uncased)
    start_tokens = tokenizer._start_tokens
    end_tokens = tokenizer._end_tokens
    tokenizer._start_tokens = []
    tokenizer._end_tokens = []

    token_indexer = PretrainedTransformerIndexer(model_name="bert-base-cased",
                                                 do_lowercase=uncased)

    reader = SemTagDatasetReader(tokenizer, {"model_tokens": token_indexer},
                                 start_tokens, end_tokens)

    train_dataset = reader.read('sem-0.1.0/data/gold/train')
    val_dataset = reader.read('sem-0.1.0/data/gold/val')

    # NOTE: PretrainedTransformerIndexer does not implement the
    # count_vocab_items method, so this vocabulary reflects only the new
    # dataset, not the pretrained model's vocabulary
    # see: https://github.com/allenai/allennlp/blob/master/allennlp/data/
    # token_indexers/pretrained_transformer_indexer.py#L47-L50
    data_vocab = Vocabulary.from_instances(train_dataset + val_dataset)

    bert_token_embedder = PretrainedTransformerEmbedder("bert-base-uncased")
Example #12
from allennlp.training.trainer import Trainer


def set_seed(seed):
    torch.manual_seed(seed)
    np.random.seed(seed)


if __name__ == '__main__':

    set_seed(1234)

    model_string = "bert-base-uncased"

    tokenizer = PretrainedTransformerTokenizer(model_string, do_lowercase=True)
    token_indexer = PretrainedTransformerIndexer(model_string,
                                                 do_lowercase=True)

    reader = SSTDatasetReader(tokenizer, {"tokens": token_indexer})

    train_dataset = reader.read('sst/trees/train.txt')
    val_dataset = reader.read('sst/trees/dev.txt')

    print(train_dataset[0])

    vocab = Vocabulary.from_instances(train_dataset + val_dataset)

    bert_token_embedder = PretrainedTransformerEmbedder(model_string)
    bert_textfield_embedder = BasicTextFieldEmbedder(
        {"tokens": bert_token_embedder})

    model = BertClassifier(vocab,
Example #13
def build_dataset_reader() -> DatasetReader:
    tokenizer = PretrainedTransformerTokenizer(model_name=PRETRAIN_MODEL)
    token_indexers = PretrainedTransformerIndexer(model_name=PRETRAIN_MODEL)
    return ActionCLSTrainDataReader(tokenizer=tokenizer,
                                    token_indexers={'tokens': token_indexers},
                                    max_tokens=150,)
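A usage sketch for the factory above (PRETRAIN_MODEL and ActionCLSTrainDataReader come from the surrounding project; the data path below is purely illustrative):

from allennlp.data import Vocabulary

reader = build_dataset_reader()
train_instances = list(reader.read("data/action_cls/train.jsonl"))  # hypothetical path
vocab = Vocabulary.from_instances(train_instances)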