Example #1
def create_task_dict(task_config_dict: dict,
                     verbose: bool = True) -> Dict[str, tasks.Task]:
    """Make map of task name to task instances from map of task name to task config file paths.

    Args:
        task_config_dict (Dict): map from task name to task config filepath.
        verbose (bool): True to print task config info.

    Returns:
        Dict mapping from task name to task instance.

    """
    task_dict = {}
    for task_name, task_config_path in task_config_dict.items():
        task = tasks.create_task_from_config_path(config_path=task_config_path,
                                                  verbose=False)
        if task.name != task_name:
            warnings.warn(
                "task {} from {} has conflicting names: {}/{}. Using {}".
                format(
                    task_name,
                    task_config_path,
                    task_name,
                    task.name,
                    task_name,
                ))
            task.name = task_name
        task_dict[task_name] = task
    if verbose:
        print("Creating Tasks:")
        for task_name, task_config_path in task_config_dict.items():
            task_class = task_dict[task_name].__class__.__name__
            print(f"    {task_name} ({task_class}): {task_config_path}")
    return task_dict
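
A minimal, self-contained sketch of the name-conflict policy above, using a hypothetical FakeTask stand-in instead of a real jiant task and a made-up config path: when the config's declared name disagrees with the dictionary key, the key wins.

import warnings
from dataclasses import dataclass


@dataclass
class FakeTask:
    # hypothetical stand-in for a jiant Task; only the name attribute matters here
    name: str


def fake_create_task_from_config_path(config_path: str, verbose: bool = False) -> FakeTask:
    # pretend the config file declares a task named "mnli", regardless of the key used
    return FakeTask(name="mnli")


task_config_dict = {"mnli_mismatched": "/path/to/mnli.json"}  # key differs from config name
task_dict = {}
for task_name, task_config_path in task_config_dict.items():
    task = fake_create_task_from_config_path(task_config_path)
    if task.name != task_name:
        # same policy as create_task_dict: warn, then trust the dict key
        warnings.warn(f"task {task_name} from {task_config_path} has conflicting names: "
                      f"{task_name}/{task.name}. Using {task_name}")
        task.name = task_name
    task_dict[task_name] = task

print(task_dict)  # {'mnli_mismatched': FakeTask(name='mnli_mismatched')}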
Example #2
def test_featurization_of_task_data():
    # Test reading the task-specific toy dataset into examples.
    task = create_task_from_config_path(os.path.join(os.path.dirname(__file__),
                                                     "resources/mnli.json"),
                                        verbose=False)
    # Test getting train, val, and test examples. Only the contents of train are checked.
    train_examples = task.get_train_examples()
    val_examples = task.get_val_examples()
    test_examples = task.get_test_examples()
    for train_example_dataclass, raw_example_dict in zip(
            train_examples, TRAIN_EXAMPLES):
        assert train_example_dataclass.to_dict() == raw_example_dict
    assert val_examples
    assert test_examples

    # Testing conversion of examples into tokenized examples
    # the dummy tokenizer requires a vocab; use a Counter here to build that vocab from the data:
    token_counter = Counter()
    for example in train_examples:
        token_counter.update(example.premise.split())
        token_counter.update(example.hypothesis.split())
    token_vocab = list(token_counter.keys())
    tokenizer = SimpleSpaceTokenizer(vocabulary=token_vocab)
    tokenized_examples = [
        example.tokenize(tokenizer) for example in train_examples
    ]
    for tokenized_example, expected_tokenized_example in zip(
            tokenized_examples, TOKENIZED_TRAIN_EXAMPLES):
        assert tokenized_example.to_dict() == expected_tokenized_example

    # Testing conversion of a tokenized example to a featurized example
    train_example_0_length = len(tokenized_examples[0].premise) + len(
        tokenized_examples[0].hypothesis)
    feat_spec = model_resolution.build_featurization_spec(
        model_type="bert-", max_seq_length=train_example_0_length)
    featurized_examples = [
        tokenized_example.featurize(tokenizer=tokenizer, feat_spec=feat_spec)
        for tokenized_example in tokenized_examples
    ]
    featurized_example_0_dict = featurized_examples[0].to_dict()
    # not bothering to compare the input_ids because they were made by a dummy tokenizer.
    assert "input_ids" in featurized_example_0_dict
    assert featurized_example_0_dict["guid"] == FEATURIZED_TRAIN_EXAMPLE_0[
        "guid"]
    assert (featurized_example_0_dict["input_mask"] ==
            FEATURIZED_TRAIN_EXAMPLE_0["input_mask"]).all()
    assert (featurized_example_0_dict["segment_ids"] ==
            FEATURIZED_TRAIN_EXAMPLE_0["segment_ids"]).all()
    assert featurized_example_0_dict["label_id"] == FEATURIZED_TRAIN_EXAMPLE_0[
        "label_id"]
    assert featurized_example_0_dict["tokens"] == FEATURIZED_TRAIN_EXAMPLE_0[
        "tokens"]
Example #3
def test_featurization_of_task_data():
    # Test reading the task-specific toy dataset into examples.
    task = create_task_from_config_path(os.path.join(os.path.dirname(__file__),
                                                     "resources/sst.json"),
                                        verbose=True)
    examples = task.get_train_examples()
    for example_dataclass, raw_example_dict in zip(examples, TRAIN_EXAMPLES):
        assert example_dataclass.to_dict() == raw_example_dict

    # Testing conversion of examples into tokenized examples
    # the dummy tokenizer requires a vocab; use a Counter here to build that vocab from the data:
    token_counter = Counter()
    for example in examples:
        token_counter.update(example.text.split())
    token_vocab = list(token_counter.keys())
    tokenizer = SimpleSpaceTokenizer(vocabulary=token_vocab)
    tokenized_examples = [example.tokenize(tokenizer) for example in examples]
    for tokenized_example, expected_tokenized_example in zip(
            tokenized_examples, TOKENIZED_TRAIN_EXAMPLES):
        assert tokenized_example.to_dict() == expected_tokenized_example

    # Testing conversion of a tokenized example to a featurized example
    feat_spec = tokenizer.get_feat_spec(max_seq_length=10)
    featurized_examples = [
        tokenized_example.featurize(tokenizer=tokenizer, feat_spec=feat_spec)
        for tokenized_example in tokenized_examples
    ]
    featurized_example_0_dict = featurized_examples[0].to_dict()
    # not bothering to compare the input_ids because they were made by a dummy tokenizer.
    assert "input_ids" in featurized_example_0_dict
    assert featurized_example_0_dict["guid"] == FEATURIZED_TRAIN_EXAMPLE_0[
        "guid"]
    assert (featurized_example_0_dict["input_mask"] ==
            FEATURIZED_TRAIN_EXAMPLE_0["input_mask"]).all()
    assert (featurized_example_0_dict["segment_ids"] ==
            FEATURIZED_TRAIN_EXAMPLE_0["segment_ids"]).all()
    assert featurized_example_0_dict["label_id"] == FEATURIZED_TRAIN_EXAMPLE_0[
        "label_id"]
    assert featurized_example_0_dict["tokens"] == FEATURIZED_TRAIN_EXAMPLE_0[
        "tokens"]
Example #4
def create_task_dict(task_config_dict: dict,
                     verbose: bool = True) -> Dict[str, tasks.Task]:
    """Make map of task name to task instances from map of task name to task config file paths.

    Args:
        task_config_dict (Dict): map from task name to task config filepath.
        verbose (bool): True to print task config info.

    Returns:
        Dict mapping from task name to task instance.

    """
    task_dict = {
        task_name:
        tasks.create_task_from_config_path(config_path=task_config_path,
                                           verbose=False)
        for task_name, task_config_path in task_config_dict.items()
    }
    if verbose:
        print("Creating Tasks:")
        for task_name, task_config_path in task_config_dict.items():
            task_class = task_dict[task_name].__class__.__name__
            print(f"    {task_name} ({task_class}): {task_config_path}")
    return task_dict
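
One plausible way to assemble the task_config_dict argument is to key each JSON config file in a directory by its file stem; the directory layout used here is an assumption, not something jiant prescribes.

import glob
import os

task_config_base_path = "/path/to/task_configs"  # hypothetical directory of per-task JSON configs
task_config_dict = {
    os.path.splitext(os.path.basename(path))[0]: path
    for path in sorted(glob.glob(os.path.join(task_config_base_path, "*.json")))
}
# e.g. {"mnli": ".../mnli.json", "rte": ".../rte.json"}, ready to pass to create_task_dict(...)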
Example #5
def main(args: RunConfiguration):
    task = tasks.create_task_from_config_path(
        config_path=args.task_config_path, verbose=True)
    feat_spec = model_resolution.build_featurization_spec(
        model_type=args.model_type,
        max_seq_length=args.max_seq_length,
    )
    tokenizer = model_setup.get_tokenizer(
        model_type=args.model_type,
        tokenizer_path=args.model_tokenizer_path,
    )
    if isinstance(args.phases, str):
        phases = args.phases.split(",")
    else:
        phases = args.phases
    assert set(phases) <= {PHASE.TRAIN, PHASE.VAL, PHASE.TEST}

    paths_dict = {}
    os.makedirs(args.output_dir, exist_ok=True)

    if PHASE.TRAIN in phases:
        chunk_and_save(
            task=task,
            phase=PHASE.TRAIN,
            examples=task.get_train_examples(),
            feat_spec=feat_spec,
            tokenizer=tokenizer,
            args=args,
        )
        paths_dict["train"] = os.path.join(args.output_dir, PHASE.TRAIN)

    if PHASE.VAL in phases:
        val_examples = task.get_val_examples()
        chunk_and_save(
            task=task,
            phase=PHASE.VAL,
            examples=val_examples,
            feat_spec=feat_spec,
            tokenizer=tokenizer,
            args=args,
        )
        evaluation_scheme = evaluate.get_evaluation_scheme_for_task(task)
        shared_caching.chunk_and_save(
            data=evaluation_scheme.get_labels_from_cache_and_examples(
                task=task,
                cache=shared_caching.ChunkedFilesDataCache(
                    os.path.join(args.output_dir, PHASE.VAL)),
                examples=val_examples,
            ),
            chunk_size=args.chunk_size,
            data_args=args.to_dict(),
            output_dir=os.path.join(args.output_dir, "val_labels"),
        )
        paths_dict[PHASE.VAL] = os.path.join(args.output_dir, PHASE.VAL)
        paths_dict["val_labels"] = os.path.join(args.output_dir, "val_labels")

    if PHASE.TEST in phases:
        chunk_and_save(
            task=task,
            phase=PHASE.TEST,
            examples=task.get_test_examples(),
            feat_spec=feat_spec,
            tokenizer=tokenizer,
            args=args,
        )
        paths_dict[PHASE.TEST] = os.path.join(args.output_dir, PHASE.TEST)

    if not args.skip_write_output_paths:
        py_io.write_json(data=paths_dict,
                         path=os.path.join(args.output_dir, "paths.json"))
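
The chunk_and_save calls above write examples to disk in fixed-size chunks and then record the cache locations in paths.json. A toy, self-contained version of that pattern (pickle-based, and not jiant's shared_caching implementation) might look like:

import json
import os
import pickle

def toy_chunk_and_save(data, chunk_size, output_dir):
    # illustrative only: split `data` into fixed-size chunks and pickle each one;
    # the real chunk_and_save also records metadata about the run
    os.makedirs(output_dir, exist_ok=True)
    for chunk_start in range(0, len(data), chunk_size):
        chunk = data[chunk_start:chunk_start + chunk_size]
        chunk_path = os.path.join(output_dir, f"data_{chunk_start // chunk_size:05d}.p")
        with open(chunk_path, "wb") as f:
            pickle.dump(chunk, f)

examples = list(range(25))
output_dir = "/tmp/toy_cache/train"
toy_chunk_and_save(examples, chunk_size=10, output_dir=output_dir)

paths_dict = {"train": output_dir}
with open("/tmp/toy_cache/paths.json", "w") as f:
    json.dump(paths_dict, f)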
Example #6
def test_featurization_of_task_data():
    # Test reading the task-specific toy dataset into examples.
    task = create_task_from_config_path(os.path.join(os.path.dirname(__file__),
                                                     "resources/spr1.json"),
                                        verbose=True)
    # Test getting train, val, and test examples. Only the contents of train are checked.
    train_examples = task.get_train_examples()
    val_examples = task.get_val_examples()
    for train_example_dataclass, raw_example_dict in zip(
            train_examples, TRAIN_EXAMPLES):
        assert train_example_dataclass.to_dict() == raw_example_dict
    assert val_examples

    # Testing conversion of examples into tokenized examples
    # the dummy tokenizer requires a vocab; use a Counter here to build that vocab from the data:
    token_counter = Counter()
    for example in train_examples:
        token_counter.update(example.text.split())
    token_vocab = list(token_counter.keys())
    space_tokenizer = SimpleSpaceTokenizer(vocabulary=token_vocab)

    # Mock the tokenizer so the isinstance check in normalize_tokenizations passes during example.tokenize(tokenizer)
    tokenizer = Mock(spec_set=transformers.RobertaTokenizer)
    tokenizer.tokenize.side_effect = space_tokenizer.tokenize
    tokenized_examples = [
        example.tokenize(tokenizer) for example in train_examples
    ]
    for tokenized_example, expected_tokenized_example in zip(
            tokenized_examples, TOKENIZED_TRAIN_EXAMPLES):
        assert tokenized_example.to_dict() == expected_tokenized_example
    # Dropping the mock and continuing the test with the space tokenizer
    tokenizer = space_tokenizer

    # Testing conversion of a tokenized example to a featurized example
    train_example_0_length = len(tokenized_examples[0].tokens) + 4
    feat_spec = model_resolution.build_featurization_spec(
        model_type="bert-", max_seq_length=train_example_0_length)
    featurized_examples = [
        tokenized_example.featurize(tokenizer=tokenizer, feat_spec=feat_spec)
        for tokenized_example in tokenized_examples
    ]
    featurized_example_0_dict = featurized_examples[0].to_dict()

    # not bothering to compare the input_ids because they were made by a dummy tokenizer.
    assert "input_ids" in featurized_example_0_dict
    assert featurized_example_0_dict["guid"] == FEATURIZED_TRAIN_EXAMPLE_0[
        "guid"]
    assert (featurized_example_0_dict["input_mask"] ==
            FEATURIZED_TRAIN_EXAMPLE_0["input_mask"]).all()
    assert (featurized_example_0_dict["segment_ids"] ==
            FEATURIZED_TRAIN_EXAMPLE_0["segment_ids"]).all()
    assert (featurized_example_0_dict["label_ids"] ==
            FEATURIZED_TRAIN_EXAMPLE_0["label_ids"]).all()
    assert featurized_example_0_dict["tokens"] == FEATURIZED_TRAIN_EXAMPLE_0[
        "tokens"]
    assert featurized_example_0_dict[
        "span1_text"] == FEATURIZED_TRAIN_EXAMPLE_0["span1_text"]
    assert featurized_example_0_dict[
        "span2_text"] == FEATURIZED_TRAIN_EXAMPLE_0["span2_text"]
    assert (featurized_example_0_dict["spans"] ==
            FEATURIZED_TRAIN_EXAMPLE_0["spans"]).all()
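
The Mock(spec_set=...) trick above works because a spec'd mock reports the spec class as its __class__, so isinstance checks pass, while side_effect forwards calls to the simple tokenizer. A self-contained illustration with a local stand-in class (no transformers install required):

from unittest.mock import Mock

class BaseTokenizer:
    # local stand-in for transformers.RobertaTokenizer in this illustration
    def tokenize(self, text):
        raise NotImplementedError

def space_tokenize(text):
    return text.split()

tokenizer = Mock(spec_set=BaseTokenizer)
tokenizer.tokenize.side_effect = space_tokenize

# isinstance passes because the mock's __class__ is set to the spec class,
# and each tokenize() call is delegated to the whitespace tokenizer
assert isinstance(tokenizer, BaseTokenizer)
assert tokenizer.tokenize("the cat sat") == ["the", "cat", "sat"]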
Example #7
def main(args: RunConfiguration):
    task = tasks.create_task_from_config_path(config_path=args.task_config_path, verbose=True)
    feat_spec = model_resolution.build_featurization_spec(
        model_type=args.model_type, max_seq_length=args.max_seq_length,
    )
    tokenizer = model_setup.get_tokenizer(
        model_type=args.model_type, tokenizer_path=args.model_tokenizer_path,
    )
    if isinstance(args.phases, str):
        phases = args.phases.split(",")
    else:
        phases = args.phases
    assert set(phases) <= {PHASE.TRAIN, PHASE.VAL, PHASE.TEST}

    paths_dict = {}
    os.makedirs(args.output_dir, exist_ok=True)

    def do_tokenize(phase: str):
        evaluation_scheme = evaluate.get_evaluation_scheme_for_task(task)
        output_dir = os.path.join(args.output_dir, f"{phase}")
        labels_output_dir = os.path.join(args.output_dir, f"{phase}_labels")
        if phase == PHASE.TRAIN:
            get_examples_func = task.get_train_examples
        elif phase == PHASE.VAL:
            get_examples_func = task.get_val_examples
        elif phase == PHASE.TEST:
            # get_examples_func = task.get_test_examples

            def get_examples_func():
                try:
                    return task.get_examples('test')
                except NotImplementedError:
                    logger.warning(
                        'The labels for the "test" split were not retrieved, so '
                        'metrics for the "test" split will not be evaluated properly.'
                    )
                return task.get_test_examples()

        chunk_and_save(
            task=task,
            phase=phase,
            examples=get_examples_func(),
            feat_spec=feat_spec,
            tokenizer=tokenizer,
            args=args,
        )
        paths_dict[phase] = output_dir

        shared_caching.chunk_and_save(
            data=evaluation_scheme.get_labels_from_cache_and_examples(
                task=task,
                cache=shared_caching.ChunkedFilesDataCache(output_dir),
                examples=get_examples_func(),
            ),
            chunk_size=args.chunk_size,
            data_args=args.to_dict(),
            output_dir=labels_output_dir,
        )
        paths_dict[f"{phase}_labels"] = labels_output_dir

    if PHASE.TRAIN in phases:
        do_tokenize(PHASE.TRAIN)
        
    if PHASE.VAL in phases:
        do_tokenize(PHASE.VAL)

    if PHASE.TEST in phases:
        do_tokenize(PHASE.TEST)

    if not args.skip_write_output_paths:
        py_io.write_json(data=paths_dict, path=os.path.join(args.output_dir, "paths.json"))
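
The test-split fallback inside do_tokenize can be illustrated on its own. The ToyTask below is hypothetical; it only mirrors the assumption that get_examples('test') may raise NotImplementedError while unlabeled test examples remain available.

import logging

logger = logging.getLogger(__name__)

class ToyTask:
    def get_examples(self, split):
        if split == "test":
            raise NotImplementedError("no labels for the test split")
        return [{"split": split, "label": 1}]

    def get_test_examples(self):
        # unlabeled test examples are still available
        return [{"split": "test", "label": None}]

def get_test_examples_with_fallback(task):
    # same shape as the fallback inside do_tokenize above
    try:
        return task.get_examples("test")
    except NotImplementedError:
        logger.warning('The labels for the "test" split were not retrieved; '
                       'test metrics will not be meaningful.')
    return task.get_test_examples()

print(get_test_examples_with_fallback(ToyTask()))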
Example #8
def main(args: RunConfiguration):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # === Shared model components setup === #
    model_type = "roberta-base"
    model_arch = ModelArchitectures.from_model_type(model_type=model_type)
    transformers_class_spec = model_setup.TRANSFORMERS_CLASS_SPEC_DICT[
        model_arch]
    ancestor_model = model_setup.get_ancestor_model(
        transformers_class_spec=transformers_class_spec,
        model_config_path=args.model_config_path,
    )
    encoder = model_setup.get_encoder(
        model_arch=model_arch,
        ancestor_model=ancestor_model,
    )
    tokenizer = shared_model_setup.get_tokenizer(
        model_type=model_type,
        tokenizer_path=args.model_tokenizer_path,
    )

    # === Taskmodels setup === #
    task_dict = {
        "mnli":
        tasks.create_task_from_config_path(
            os.path.join(
                args.task_config_base_path,
                "mnli.json",
            )),
        "qnli":
        tasks.create_task_from_config_path(
            os.path.join(
                args.task_config_base_path,
                "qnli.json",
            )),
        "rte":
        tasks.create_task_from_config_path(
            os.path.join(
                args.task_config_base_path,
                "qnli.json",
            ))
    }
    taskmodels_dict = {
        "nli":
        taskmodels.ClassificationModel(
            encoder=encoder,
            classification_head=heads.ClassificationHead(
                hidden_size=encoder.config.hidden_size,
                hidden_dropout_prob=encoder.config.hidden_dropout_prob,
                num_labels=len(task_dict["mnli"].LABELS),
            ),
        ),
        "rte":
        taskmodels.ClassificationModel(
            encoder=encoder,
            classification_head=heads.ClassificationHead(
                hidden_size=encoder.config.hidden_size,
                hidden_dropout_prob=encoder.config.hidden_dropout_prob,
                num_labels=len(task_dict["rte"].LABELS),
            ),
        ),
    }
    task_to_taskmodel_map = {
        "mnli": "nli",
        "qnli": "nli",
        "rte": "rte",
    }

    # === Final === #
    jiant_model = JiantModel(
        task_dict=task_dict,
        encoder=encoder,
        taskmodels_dict=taskmodels_dict,
        task_to_taskmodel_map=task_to_taskmodel_map,
        tokenizer=tokenizer,
    )
    jiant_model = jiant_model.to(device)

    # === Run === #
    task_dataloader_dict = {}
    for task_name, task in task_dict.items():
        train_cache = caching.ChunkedFilesDataCache(
            cache_fol_path=os.path.join(args.task_cache_base_path, task_name,
                                        "train"), )
        train_dataset = train_cache.get_iterable_dataset(buffer_size=10000,
                                                         shuffle=True)
        train_dataloader = torch_utils.DataLoaderWithLength(
            dataset=train_dataset,
            batch_size=4,
            collate_fn=task.collate_fn,
        )
        task_dataloader_dict[task_name] = train_dataloader

    for task_name, task in task_dict.items():
        batch, batch_metadata = next(iter(task_dataloader_dict[task_name]))
        batch = batch.to(device)
        with torch.no_grad():
            model_output = wrap_jiant_forward(
                jiant_model=jiant_model,
                batch=batch,
                task=task,
                compute_loss=True,
            )
        print(task_name)
        print(model_output)
        print()
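
The interesting part of this setup is that mnli and qnli share the "nli" taskmodel while rte gets its own head, via task_to_taskmodel_map. A minimal sketch of that routing idea (one shared encoder, one head per taskmodel), which is not jiant's JiantModel:

import torch
import torch.nn as nn

class ToyMultiTaskModel(nn.Module):
    # one shared encoder, a head per taskmodel, and a task -> taskmodel map
    def __init__(self, hidden_size=16):
        super().__init__()
        self.encoder = nn.Linear(8, hidden_size)
        self.heads = nn.ModuleDict({
            "nli": nn.Linear(hidden_size, 3),  # shared by mnli and qnli
            "rte": nn.Linear(hidden_size, 2),
        })
        self.task_to_taskmodel_map = {"mnli": "nli", "qnli": "nli", "rte": "rte"}

    def forward(self, features, task_name):
        taskmodel_name = self.task_to_taskmodel_map[task_name]
        return self.heads[taskmodel_name](self.encoder(features))

model = ToyMultiTaskModel()
batch = torch.randn(4, 8)
for task_name in ["mnli", "qnli", "rte"]:
    with torch.no_grad():
        print(task_name, model(batch, task_name).shape)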