Example #1
0
def full_chunk_and_save(task, phase, examples, feat_spec, tokenizer, args: RunConfiguration):
    """Featurize ``examples`` into a dataset and write it to disk in chunks.

    The examples are first converted with
    ``preprocessing.convert_examples_to_dataset``. When
    ``args.smart_truncate`` is truthy, the dataset is additionally passed
    through ``preprocessing.smart_truncate`` and the length it reports is
    recorded in ``<args.output_dir>/<phase>/smart_truncate.json``. Finally the
    dataset's ``data`` attribute is handed to ``shared_caching.chunk_and_save``
    together with the serialized run configuration.

    Args:
        task: Task object, forwarded to the dataset conversion.
        phase (str): name of the data subset (e.g., train, val or test); also
            used as the sub-directory name under ``args.output_dir``.
        examples (list[Example]): task Examples to convert.
        feat_spec (FeaturizationSpec): tokenization-related metadata.
        tokenizer: tokenizer object, forwarded to the dataset conversion.
        args (RunConfiguration): run configuration; this function reads
            ``smart_truncate``, ``max_seq_length``, ``chunk_size``,
            ``output_dir`` and ``to_dict()``.

    """
    converted = preprocessing.convert_examples_to_dataset(
        task=task,
        examples=examples,
        feat_spec=feat_spec,
        tokenizer=tokenizer,
        phase=phase,
        verbose=True,
    )

    if args.smart_truncate:
        truncation_result = preprocessing.smart_truncate(
            dataset=converted, max_seq_length=args.max_seq_length, verbose=True,
        )
        converted, truncated_length = truncation_result

        # The phase directory has to exist before the metadata file is written.
        phase_dir = os.path.join(args.output_dir, phase)
        os.makedirs(phase_dir, exist_ok=True)
        py_io.write_json(
            data={"truncated_to": int(truncated_length)},
            path=os.path.join(phase_dir, "smart_truncate.json"),
        )

    shared_caching.chunk_and_save(
        data=converted.data,
        chunk_size=args.chunk_size,
        data_args=args.to_dict(),
        output_dir=os.path.join(args.output_dir, phase),
    )
Example #2
0
    def do_tokenize(phase: str):
        """Tokenize and cache one data split, then cache its labels.

        Reads ``task``, ``args``, ``feat_spec`` and ``tokenizer`` from the
        enclosing scope and records the resulting directories in the enclosing
        ``paths_dict`` under ``phase`` and ``f"{phase}_labels"``.

        For the test split, ``task.get_examples('test')`` is tried first; if
        it raises ``NotImplementedError`` a warning is logged and
        ``task.get_test_examples()`` is used instead.

        Args:
            phase: one of ``PHASE.TRAIN``, ``PHASE.VAL`` or ``PHASE.TEST``.

        Raises:
            ValueError: if ``phase`` is not one of the supported phases.
        """
        if phase == PHASE.TRAIN:
            get_examples_func = task.get_train_examples
        elif phase == PHASE.VAL:
            get_examples_func = task.get_val_examples
        elif phase == PHASE.TEST:

            def get_examples_func():
                try:
                    return task.get_examples('test')
                except NotImplementedError:
                    logger.warning('The labels for "test" split is not retrieved, so, metrics for the "test" split will not be evaluated properly.')
                return task.get_test_examples()

        else:
            # Fail with a clear message rather than an UnboundLocalError on
            # the first use of get_examples_func below.
            raise ValueError(f"Unsupported phase: {phase!r}")

        evaluation_scheme = evaluate.get_evaluation_scheme_for_task(task)
        output_dir = os.path.join(args.output_dir, f"{phase}")
        labels_output_dir = os.path.join(args.output_dir, f"{phase}_labels")

        # Load the split once and reuse it for both the feature cache and the
        # label cache; this avoids reading the data twice and logging the
        # test-split warning twice.
        # NOTE(review): assumes the returned examples are a re-iterable
        # collection such as a list — confirm against the task implementations.
        examples = get_examples_func()

        chunk_and_save(
            task=task,
            phase=phase,
            examples=examples,
            feat_spec=feat_spec,
            tokenizer=tokenizer,
            args=args,
        )
        paths_dict[phase] = output_dir

        # The label cache is built from the feature cache written just above.
        shared_caching.chunk_and_save(
            data=evaluation_scheme.get_labels_from_cache_and_examples(
                task=task,
                cache=shared_caching.ChunkedFilesDataCache(output_dir),
                examples=examples,
            ),
            chunk_size=args.chunk_size,
            data_args=args.to_dict(),
            output_dir=labels_output_dir,
        )
        paths_dict[f"{phase}_labels"] = labels_output_dir
Example #3
0
def main(args: RunConfiguration):
    """Tokenize and cache the requested phases of a task.

    Builds the task, featurization spec and tokenizer from ``args``, then for
    each requested phase writes a chunked feature cache under
    ``<args.output_dir>/<phase>``. For the validation phase a label cache is
    additionally written under ``<args.output_dir>/val_labels``; no label
    cache is written for the train or test phases. Unless
    ``args.skip_write_output_paths`` is set, the collected output directories
    are written to ``<args.output_dir>/paths.json``.

    Args:
        args (RunConfiguration): run configuration. ``args.phases`` may be a
            comma-separated string (whitespace around each name is ignored)
            or a collection of phase names.

    Raises:
        ValueError: if ``args.phases`` contains anything other than
            ``PHASE.TRAIN``, ``PHASE.VAL`` or ``PHASE.TEST``.
    """
    task = tasks.create_task_from_config_path(
        config_path=args.task_config_path, verbose=True)
    feat_spec = model_resolution.build_featurization_spec(
        model_type=args.model_type,
        max_seq_length=args.max_seq_length,
    )
    tokenizer = model_setup.get_tokenizer(
        model_type=args.model_type,
        tokenizer_path=args.model_tokenizer_path,
    )

    if isinstance(args.phases, str):
        # Tolerate "train, val" as well as "train,val".
        phases = [name.strip() for name in args.phases.split(",")]
    else:
        phases = args.phases

    # Explicit check instead of `assert`: assertions are removed under
    # `python -O`, which would let an invalid phase list pass silently.
    unsupported = set(phases) - {PHASE.TRAIN, PHASE.VAL, PHASE.TEST}
    if unsupported:
        raise ValueError(
            f"Unsupported phases: {sorted(unsupported, key=repr)!r}"
        )

    paths_dict = {}
    os.makedirs(args.output_dir, exist_ok=True)

    if PHASE.TRAIN in phases:
        chunk_and_save(
            task=task,
            phase=PHASE.TRAIN,
            examples=task.get_train_examples(),
            feat_spec=feat_spec,
            tokenizer=tokenizer,
            args=args,
        )
        paths_dict["train"] = os.path.join(args.output_dir, PHASE.TRAIN)

    if PHASE.VAL in phases:
        val_dir = os.path.join(args.output_dir, PHASE.VAL)
        val_labels_dir = os.path.join(args.output_dir, "val_labels")
        # Loaded once and shared by the feature cache and the label cache.
        val_examples = task.get_val_examples()
        chunk_and_save(
            task=task,
            phase=PHASE.VAL,
            examples=val_examples,
            feat_spec=feat_spec,
            tokenizer=tokenizer,
            args=args,
        )
        evaluation_scheme = evaluate.get_evaluation_scheme_for_task(task)
        # The label cache is built from the feature cache written just above.
        shared_caching.chunk_and_save(
            data=evaluation_scheme.get_labels_from_cache_and_examples(
                task=task,
                cache=shared_caching.ChunkedFilesDataCache(val_dir),
                examples=val_examples,
            ),
            chunk_size=args.chunk_size,
            data_args=args.to_dict(),
            output_dir=val_labels_dir,
        )
        paths_dict[PHASE.VAL] = val_dir
        paths_dict["val_labels"] = val_labels_dir

    if PHASE.TEST in phases:
        chunk_and_save(
            task=task,
            phase=PHASE.TEST,
            examples=task.get_test_examples(),
            feat_spec=feat_spec,
            tokenizer=tokenizer,
            args=args,
        )
        paths_dict[PHASE.TEST] = os.path.join(args.output_dir, PHASE.TEST)

    if not args.skip_write_output_paths:
        py_io.write_json(data=paths_dict,
                         path=os.path.join(args.output_dir, "paths.json"))