Example #1
0
def convert_hf_local_dataset_to_examples(path,
                                         name=None,
                                         version=None,
                                         field_map=None,
                                         label_map=None,
                                         phase_map=None,
                                         phase_list=None):
    """Helper function for reading a dataset saved on disk and converting to examples

    The dataset is read with datasets.load_from_disk from a location resolved
    against the directory that contains this module.

    Args:
        path: dataset location, joined onto this module's directory
            (an absolute path is used as-is)
        name: unused by this function
        version: unused by this function
        field_map: dictionary for renaming fields, non-exhaustive
        label_map: dictionary or callable for replacing labels; a dictionary
            is non-exhaustive (labels it does not contain are kept as-is)
        phase_map: dictionary for replacing phase names, non-exhaustive
        phase_list: phases to keep (after phase_map); defaults to all phases

    Returns:
        Dict[phase] -> list[examples]

    Raises:
        TypeError: if an example has a "label" field and label_map is truthy
            but is neither a dict nor a callable
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    dataset = datasets.load_from_disk(os.path.join(module_dir, path))

    # Phases are renamed first so that phase_list can use the new names.
    if phase_map:
        for old_phase_name, new_phase_name in phase_map.items():
            replace_key(dataset,
                        old_key=old_phase_name,
                        new_key=new_phase_name)
    if phase_list is None:
        phase_list = dataset.keys()

    examples_dict = {}
    for phase in phase_list:
        phase_examples = []
        for raw_example in dataset[phase]:
            if field_map:
                for old_field_name, new_field_name in field_map.items():
                    replace_key(raw_example,
                                old_key=old_field_name,
                                new_key=new_field_name)
            if label_map and "label" in raw_example:
                label = raw_example["label"]
                if isinstance(label_map, dict):
                    # Labels missing from the mapping pass through unchanged.
                    label = label_map.get(label, label)
                elif callable(label_map):
                    label = label_map(label)
                else:
                    raise TypeError(label_map)
                raw_example["label"] = label
            phase_examples.append(raw_example)
        examples_dict[phase] = phase_examples
    return examples_dict
Example #2
0
def convert_nlp_dataset_to_examples(path,
                                    name=None,
                                    version=None,
                                    field_map=None,
                                    label_map=None,
                                    phase_map=None,
                                    phase_list=None):
    """Load a dataset with nlp.load_dataset and return its examples per phase

    Args:
        path: path argument (forwarded to nlp.load_dataset)
        name: name argument (forwarded to nlp.load_dataset)
        version: version argument (forwarded to nlp.load_dataset)
        field_map: dictionary of old field name -> new field name; fields
            it does not mention are left alone
        label_map: dictionary of old label -> new label; labels it does not
            contain are left alone
        phase_map: dictionary of old phase name -> new phase name; phases
            it does not mention are left alone
        phase_list: phases to keep (named after phase_map is applied);
            all phases of the loaded dataset when None

    Returns:
        Dict[phase] -> list[examples]
    """
    loaded = nlp.load_dataset(path=path, name=name, version=version)

    # Phase renaming happens before phase selection.
    for src_phase, dst_phase in (phase_map or {}).items():
        replace_key(loaded, old_key=src_phase, new_key=dst_phase)

    def normalize(row):
        # Rename fields, then remap the label when the mapping knows it.
        for src_field, dst_field in (field_map or {}).items():
            replace_key(row, old_key=src_field, new_key=dst_field)
        if not label_map or "label" not in row:
            return row
        current = row["label"]
        if current in label_map:
            row["label"] = label_map[current]
        return row

    phases = loaded.keys() if phase_list is None else phase_list
    return {
        phase: [normalize(row) for row in loaded[phase]]
        for phase in phases
    }
Example #3
0
def load_hf_dataset(path,
                    name=None,
                    version=None,
                    phase_map=None,
                    n_fold: int = None,
                    fold: int = None,
                    split_type: str = None):
    """Load a dataset with load_dataset, optionally replacing train/val by a CV fold

    Without n_fold, the loaded dataset dict is returned after phase renaming.
    With n_fold, a pool of examples is loaded again via the ``split``
    argument, split into folds by build_cv_splits, and the requested fold's
    train/val parts overwrite the 'train' and 'val' entries.

    Args:
        path: path argument (forwarded to load_dataset)
        name: name argument (forwarded to load_dataset)
        version: version argument (forwarded to load_dataset)
        phase_map: dictionary of original phase name -> new phase name,
            non-exhaustive
        n_fold: number of cross-validation folds; None disables CV
        fold: which fold to select; required when n_fold is given
        split_type: only consulted when n_fold is given. One of
            'local_evaluation' (default): folds are built from the train
            phase and the 'val' entry is copied to 'test'; or
            'submission': folds are built from the train and val phases
            combined

    Returns:
        The dataset dict, keyed by phase name (after phase_map)

    Raises:
        ValueError: if n_fold is given and split_type is unknown or fold
            is missing
    """
    phase_map = phase_map or {}
    split_type = split_type or 'local_evaluation'

    use_cv = n_fold is not None
    if use_cv:
        # Fail fast on bad arguments, before any dataset is loaded.
        if split_type not in ('local_evaluation', 'submission'):
            raise ValueError(f'Unknown split type "{split_type}"')
        if fold is None:
            raise ValueError('fold must be provided when n_fold is set')

    dataset_dict = load_dataset(path=path, name=name, version=version)

    for old_phase_name, new_phase_name in phase_map.items():
        replace_key(dataset_dict,
                    old_key=old_phase_name,
                    new_key=new_phase_name)

    if not use_cv:
        return dataset_dict

    # The second load addresses splits by their original names, so invert
    # the renaming to recover them.
    jiant2hf_phase_map = {val: key for key, val in phase_map.items()}
    hf_train_phase = jiant2hf_phase_map.get('train', 'train')
    hf_val_phase = jiant2hf_phase_map.get('val', 'val')

    if split_type == 'local_evaluation':
        hf_local_phase = hf_train_phase
        # The 'val' entry is about to be replaced by the fold's val part;
        # keep the original one available as 'test'.
        dataset_dict['test'] = dataset_dict['val']
    else:  # 'submission'
        hf_local_phase = '+'.join([hf_train_phase, hf_val_phase])

    hf_local_dataset = load_dataset(path=path,
                                    name=name,
                                    version=version,
                                    split=hf_local_phase)
    cv_dataset = build_cv_splits(
        hf_local_dataset,
        n_fold,
        stratify=False,
    )
    dataset_dict['train'] = cv_dataset[f'fold-{fold}.train']
    dataset_dict['val'] = cv_dataset[f'fold-{fold}.val']

    return dataset_dict
Example #4
0
def convert_hf_dataset_to_examples(
    path,
    name=None,
    version=None,
    field_map=None,
    label_map=None,
    phase_map=None,
    phase_list=None,
    n_fold: int = None,
    fold: int = None,
    return_all: bool = False,
    experiment_id_for_metric=None,
    cache_dir_for_metric=None,
):
    """Helper function for reading via load_hf_dataset and converting to examples

    Args:
        path: path argument (forwarded to load_hf_dataset, and to load_metric
            when return_all is set)
        name: name argument (forwarded to load_hf_dataset, and to load_metric
            as config_name when return_all is set)
        version: version argument (forwarded to load_hf_dataset)
        field_map: dictionary for renaming fields, non-exhaustive
        label_map: dictionary or callable for replacing labels; a dictionary
            is non-exhaustive (labels it does not contain are kept as-is)
        phase_map: dictionary for replacing phase names, non-exhaustive
        phase_list: phases to keep (after phase_map); defaults to all phases
        n_fold: number of cross-validation folds (forwarded to load_hf_dataset)
        fold: fold to select (forwarded to load_hf_dataset)
        return_all: when True, also return the loaded dataset and the metric
        experiment_id_for_metric: experiment_id argument for load_metric
        cache_dir_for_metric: cache_dir argument for load_metric

    Returns:
        Dict[phase] -> list[examples] when return_all is False; otherwise a
        tuple of that dict and a dict with keys 'hf_fold_dataset' and
        'hf_metric'

    Raises:
        TypeError: if an example has a "label" field and label_map is truthy
            but is neither a dict nor a callable
    """
    # NOTE(review): the original carried a bare comment "mrpc.cv-5-0" here,
    # presumably a task-name pattern encoding n_fold and fold; confirm.
    fold_dataset = load_hf_dataset(path=path,
                                   name=name,
                                   version=version,
                                   phase_map=phase_map,
                                   n_fold=n_fold,
                                   fold=fold)

    if phase_list is None:
        phase_list = fold_dataset.keys()

    examples_dict = {}
    for phase in phase_list:
        phase_examples = []
        for raw_example in fold_dataset[phase]:
            if field_map:
                for old_field_name, new_field_name in field_map.items():
                    replace_key(raw_example,
                                old_key=old_field_name,
                                new_key=new_field_name)
            if label_map and "label" in raw_example:
                label = raw_example["label"]
                if isinstance(label_map, dict):
                    # Labels missing from the mapping pass through unchanged.
                    label = label_map.get(label, label)
                elif callable(label_map):
                    label = label_map(label)
                else:
                    raise TypeError(label_map)
                raw_example["label"] = label
            phase_examples.append(raw_example)
        examples_dict[phase] = phase_examples

    if not return_all:
        return examples_dict

    # The metric is only part of the extended return value, so it is loaded
    # only when the caller asked for it.
    metric = load_metric(path,
                         config_name=name,
                         experiment_id=experiment_id_for_metric,
                         cache_dir=cache_dir_for_metric)
    return examples_dict, {
        'hf_fold_dataset': fold_dataset,
        'hf_metric': metric,
    }