Example #1
    def __init__(self, pretrained_tokenizer: str, max_instances: int = None):
        super().__init__(max_instances=max_instances)
        self.tokenizer = PretrainedTransformerTokenizer(pretrained_tokenizer,
                                                        max_length=2000)
        self.token_indexers = {
            "tokens": PretrainedTransformerIndexer(pretrained_tokenizer)
        }
Example #2
class InliDatasetReaderBase(DatasetReader):
    def __init__(self, pretrained_tokenizer: str, max_instances: int = None):
        super().__init__(max_instances=max_instances)
        self.tokenizer = PretrainedTransformerTokenizer(pretrained_tokenizer,
                                                        max_length=2000)
        self.token_indexers = {
            "tokens": PretrainedTransformerIndexer(pretrained_tokenizer)
        }

    def text_to_instance(self, data_chunk) -> Instance:
        raise NotImplementedError

    def _read(self, file_path: str) -> Iterator[Instance]:

        with open(file_path, encoding="utf8") as jsonl_file:
            dataset_list = list(jsonl_file)

        for data_chunk in dataset_list:
            yield self.text_to_instance(json.loads(data_chunk))

    def prepare_fields(self, task_name: str, label: str, sentence1: str,
                       sentence2: str):
        label_and_first_sentence_tokens = self.tokenizer.tokenize(
            f'{task_name} {label}: {sentence1} '
            f'{self.tokenizer.tokenizer.eos_token}')
        second_sentence_tokens = self.tokenizer.tokenize(
            f'{sentence2} {self.tokenizer.tokenizer.eos_token}')
        fields = {
            "label_and_first_sentence":
            TextField(label_and_first_sentence_tokens, self.token_indexers),
            "second_sentence":
            TextField(second_sentence_tokens, self.token_indexers),
        }
        return fields
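
InliDatasetReaderBase.text_to_instance is left unimplemented, so a task-specific
subclass is expected to provide it. A minimal sketch of such a subclass (hypothetical,
not from the original repository; the JSON keys "label", "sentence1" and "sentence2"
are assumptions):

class ExampleInliReader(InliDatasetReaderBase):
    def text_to_instance(self, data_chunk) -> Instance:
        # Build the two TextFields from one parsed JSONL record using the
        # helper defined on the base class.
        fields = self.prepare_fields(task_name="nli",
                                     label=data_chunk["label"],
                                     sentence1=data_chunk["sentence1"],
                                     sentence2=data_chunk["sentence2"])
        return Instance(fields)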
Example #3
    def __init__(
        self,
        hf_pretrained_tokenizer: str,
        cache_directory: str = 'data/cache',
        clue_prefix="cryptic crossword clue: ",
    ):
        super().__init__(cache_directory=cache_directory)
        self.clue_prefix = clue_prefix
        self.tokenizer = PretrainedTransformerTokenizer(
            hf_pretrained_tokenizer)
        self.token_indexers = {
            "tokens": PretrainedTransformerIndexer(hf_pretrained_tokenizer)
        }
Example #4
    def __init__(self) -> None:
        self.config: Config = Config().parse_args(known_only=True)

        bert_token_indexers = PretrainedTransformerIndexer(
            model_name=self.config.model_name)
        bert_tokenizer = PretrainedTransformerTokenizer(
            model_name=self.config.model_name)
        reader = TextClassificationJsonReader(
            token_indexers={"tokens": bert_token_indexers},
            tokenizer=bert_tokenizer)

        # 1. read the datasets and build the vocabulary
        train_instances = list(reader.read(self.config.train_file))
        dev_instances = list(reader.read(self.config.dev_file))
        test_instances = list(reader.read(self.config.test_file))

        self.vocab: Vocabulary = Vocabulary.from_instances(train_instances)

        # 2. init the data loader
        self.train_data_loader = SimpleDataLoader(train_instances,
                                                  self.config.batch_size,
                                                  shuffle=True)
        self.dev_data_loader = SimpleDataLoader(dev_instances,
                                                self.config.batch_size,
                                                shuffle=False)
        self.train_data_loader.index_with(self.vocab)
        self.dev_data_loader.index_with(self.vocab)

        # 3. init the model
        self.model = self.init_model()
        self.trainer = self.init_trainer()
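
init_model() and init_trainer() are not shown in this snippet. As an illustration only
(an assumption, not the original implementation), init_model could assemble a standard
AllenNLP classifier from the pretrained transformer; BasicClassifier, BertPooler,
PretrainedTransformerEmbedder and BasicTextFieldEmbedder are the stock AllenNLP
components assumed here:

    def init_model(self) -> Model:
        # Embed "tokens" with the pretrained transformer and classify from the
        # pooled [CLS] representation.
        embedder = BasicTextFieldEmbedder(
            {"tokens": PretrainedTransformerEmbedder(self.config.model_name)})
        encoder = BertPooler(self.config.model_name)
        return BasicClassifier(vocab=self.vocab,
                               text_field_embedder=embedder,
                               seq2vec_encoder=encoder)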
Example #5
def test_dataset_reader():
    model_name = 'bert-base-uncased'
    source_tokenizer = PretrainedTransformerTokenizer(model_name=model_name,
                                                      do_lowercase=True)
    target_tokenizer = PretrainedTransformerTokenizer(model_name=model_name,
                                                      do_lowercase=True)
    source_token_indexers = {
        "tokens":
        PretrainedTransformerIndexer(model_name=model_name,
                                     do_lowercase=True,
                                     namespace='bert')
    }

    ds = CopySeq2MultiSeqNetDatasetReader(
        target_namespace='bert',
        source_tokenizer=source_tokenizer,
        target_tokenizer=target_tokenizer,
        source_token_indexers=source_token_indexers,
        lazy=True,
        max_tokens=500,
        bert=True,
        max_extractions=10)

    instances = ds._read(
        "/Users/mostafa/git/deep/openie/imojie/data/train/4cr_qpbo_extractions.tsv"
    )

    for instance in list(instances)[:10]:
        print(instance)
        print('*' * 70)
    ds._validation = True
    instances2 = ds._read(
        "/Users/mostafa/git/deep/openie/imojie/data/dev/carb_sentences.txt")
    for instance in list(instances2)[:10]:
        print(instance)
        print('*' * 70)
Example #6
class CrypticCrosswordReader(DatasetReader):
    def __init__(
        self,
        hf_pretrained_tokenizer: str,
        cache_directory: str = 'data/cache',
        clue_prefix="cryptic crossword clue: ",
    ):
        super().__init__(cache_directory=cache_directory)
        self.clue_prefix = clue_prefix
        self.tokenizer = PretrainedTransformerTokenizer(
            hf_pretrained_tokenizer)
        self.token_indexers = {
            "tokens": PretrainedTransformerIndexer(hf_pretrained_tokenizer)
        }

    @overrides
    def _read(self, file_path: str) -> Iterable[Instance]:

        with open(file_path) as jsonl_file:
            dataset_list = list(jsonl_file)

        for example in dataset_list:
            yield self.text_to_instance(json.loads(example))

    def text_to_instance(self, example) -> Instance:

        fields: Dict[str, Field] = {}

        original_clue = example['clue']
        answer = example.get('answer')

        clue = f"{self.clue_prefix}{original_clue}"
        clue_tokens = self.tokenizer.tokenize(clue)
        fields['clue'] = TextField(clue_tokens, self.token_indexers)

        if answer is not None:
            answer_tokens = self.tokenizer.tokenize(answer)
            fields['answer'] = TextField(answer_tokens, self.token_indexers)

        enumeration = example['enumeration']

        publisher = example['publisher']
        crossword_id = example['crossword_id']
        number = example['number']
        orientation = example['orientation']
        id_ = f"{publisher}-{crossword_id}-{number}{orientation}"

        date = example['date']
        quick = example['quick']

        fields['metadata'] = MetadataField({
            'clue': original_clue,
            'answer': answer,
            'enumeration': enumeration,
            'publisher': publisher,
            'date': date,
            'quick': quick,
            'id': id_
        })
        return Instance(fields)

    @overrides
    def _instances_from_cache_file(self,
                                   cache_filename: str) -> Iterable[Instance]:
        print('reading instances from', cache_filename)
        with open(cache_filename, 'rb') as cache_file:
            instances = pickle.load(cache_file)
            for instance in instances:
                yield instance

    @overrides
    def _instances_to_cache_file(self, cache_filename, instances) -> None:
        print('writing instance to', cache_filename)
        with open(cache_filename, 'wb') as cache_file:
            pickle.dump(instances, cache_file)
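
A possible call site for the reader above (a sketch; the model name and file path are
placeholders, not taken from the source):

reader = CrypticCrosswordReader(hf_pretrained_tokenizer="t5-base")
for instance in reader.read("data/cryptic/train.jsonl"):
    print(instance)
    break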
Example #7
from allennlp.training.trainer import Trainer


def set_seed(seed):
    torch.manual_seed(seed)
    np.random.seed(seed)


if __name__ == '__main__':

    set_seed(1234)

    # TODO: make this tokenizer initialization a method?
    uncased = True
    tokenizer = PretrainedTransformerTokenizer(model_name="bert-base-uncased",
                                               do_lowercase=uncased)
    # Remove the tokenizer's special start/end tokens (e.g. [CLS]/[SEP]) and
    # hand them to the dataset reader so it can add them itself.
    start_tokens = tokenizer._start_tokens
    end_tokens = tokenizer._end_tokens
    tokenizer._start_tokens = []
    tokenizer._end_tokens = []

    token_indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased",
                                                 do_lowercase=uncased)

    reader = SemTagDatasetReader(tokenizer, {"model_tokens": token_indexer},
                                 start_tokens, end_tokens)

    train_dataset = reader.read('sem-0.1.0/data/gold/train')
    val_dataset = reader.read('sem-0.1.0/data/gold/val')

    # NOTE: PretrainedTransformerIndexer does not implement the
Example #8
from examples_allennlp.entity_typing.reader import EntityTypingReader
from allennlp.data.tokenizers.pretrained_transformer_tokenizer import PretrainedTransformerTokenizer
from allennlp.data.token_indexers import PretrainedTransformerIndexer
from examples_allennlp.utils.util import ENT


# Register the entity marker as an additional special token so the tokenizer
# and indexer keep it as a single, un-split token.
extra_tokens = [ENT]
# transformer_model_name = "roberta-base"
transformer_model_name = "bert-base-multilingual-cased"
tokenizer = PretrainedTransformerTokenizer(
    transformer_model_name, tokenizer_kwargs={"additional_special_tokens": extra_tokens}
)
token_indexers = {
    "tokens": PretrainedTransformerIndexer(
        transformer_model_name, tokenizer_kwargs={"additional_special_tokens": extra_tokens}
    )
}
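
A quick, illustrative check (not part of the original test) that the extra marker
survives tokenization as a single token:

def test_ent_marker_is_kept():
    tokens = tokenizer.tokenize(f"{ENT} is a city.")
    assert any(token.text == ENT for token in tokens)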

test_data_path = "examples_allennlp/entity_typing/tests/fixtures/test.json"


def test_read():

    reader = EntityTypingReader(tokenizer, token_indexers)

    instances = [i for i in reader.read(test_data_path)]
    assert len(instances) == 1

    instance = instances[0]
    expected = [
        "[CLS]",
from allennlp.training.trainer import Trainer


def set_seed(seed):
    torch.manual_seed(seed)
    np.random.seed(seed)


if __name__ == '__main__':

    set_seed(1234)

    model_string = "bert-base-uncased"

    tokenizer = PretrainedTransformerTokenizer(model_string, do_lowercase=True)
    token_indexer = PretrainedTransformerIndexer(model_string,
                                                 do_lowercase=True)

    reader = SSTDatasetReader(tokenizer, {"tokens": token_indexer})

    train_dataset = reader.read('sst/trees/train.txt')
    val_dataset = reader.read('sst/trees/dev.txt')

    print(train_dataset[0])

    vocab = Vocabulary.from_instances(train_dataset + val_dataset)

    bert_token_embedder = PretrainedTransformerEmbedder(model_string)
    bert_textfield_embedder = BasicTextFieldEmbedder(
        {"tokens": bert_token_embedder})
Example #10
def build_dataset_reader() -> DatasetReader:
    tokenizer = PretrainedTransformerTokenizer(model_name=PRETRAIN_MODEL)
    token_indexers = PretrainedTransformerIndexer(model_name=PRETRAIN_MODEL)
    return ActionCLSTrainDataReader(tokenizer=tokenizer,
                                    token_indexers={'tokens': token_indexers},
                                    max_tokens=150,)
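
A hypothetical call site for the factory above (ActionCLSTrainDataReader, PRETRAIN_MODEL
and the data path come from the surrounding project; the path below is illustrative only):

reader = build_dataset_reader()
instances = list(reader.read("data/action_cls/train.jsonl"))
print(len(instances))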