def convert_hf_local_dataset_to_examples(path, name=None, version=None, field_map=None,
                                         label_map=None, phase_map=None, phase_list=None):
    """Load a dataset saved on disk and convert it to per-phase example lists.

    The dataset is read with ``datasets.load_from_disk`` from ``path``
    resolved against the directory that contains this module.

    Args:
        path: location of the saved dataset, relative to this module's
            directory. An absolute path is used as-is (``os.path.join``
            semantics).
        name: accepted but not used by this function.
        version: accepted but not used by this function.
        field_map: dictionary {old_field_name: new_field_name} for renaming
            example fields, non-exhaustive.
        label_map: either a dict for replacing labels (labels that are not
            keys of the dict are left unchanged) or a callable applied to
            every label.
        phase_map: dictionary {old_phase_name: new_phase_name} for renaming
            phases, non-exhaustive.
        phase_list: phases to keep (after phase_map). Defaults to every
            phase present in the loaded dataset.

    Returns:
        Dict[phase] -> list[examples]

    Raises:
        TypeError: if ``label_map`` is truthy but neither a dict nor a
            callable, and an example carries a "label" field.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    dataset = datasets.load_from_disk(os.path.join(module_dir, path))

    if phase_map:
        for old_phase_name, new_phase_name in phase_map.items():
            replace_key(dataset, old_key=old_phase_name, new_key=new_phase_name)

    # Phase names are read only after the renaming above has been applied.
    if phase_list is None:
        phase_list = dataset.keys()

    examples_dict = {}
    for phase in phase_list:
        phase_examples = []
        for raw_example in dataset[phase]:
            if field_map:
                for old_field_name, new_field_name in field_map.items():
                    replace_key(raw_example, old_key=old_field_name, new_key=new_field_name)
            if label_map and "label" in raw_example:
                label = raw_example["label"]
                if isinstance(label_map, dict):
                    # Non-exhaustive mapping: unknown labels pass through.
                    label = label_map.get(label, label)
                elif callable(label_map):
                    label = label_map(label)
                else:
                    raise TypeError(label_map)
                raw_example["label"] = label
            phase_examples.append(raw_example)
        examples_dict[phase] = phase_examples
    return examples_dict
def convert_nlp_dataset_to_examples(path, name=None, version=None, field_map=None,
                                    label_map=None, phase_map=None, phase_list=None):
    """Read a dataset through nlp.load_dataset and turn it into example lists.

    Args:
        path: forwarded to nlp.load_dataset as ``path``
        name: forwarded to nlp.load_dataset as ``name``
        version: forwarded to nlp.load_dataset as ``version``
        field_map: {old_field_name: new_field_name} renames applied to each
            example, non-exhaustive
        label_map: mapping used to replace labels; a label that is not
            contained in it is kept unchanged
        phase_map: {old_phase_name: new_phase_name} renames applied to the
            loaded dataset, non-exhaustive
        phase_list: phases to keep (after phase_map); all phases of the
            loaded dataset when None

    Returns:
        Dict[phase] -> list[examples]
    """
    dataset = nlp.load_dataset(path=path, name=name, version=version)

    if phase_map:
        for source_phase, target_phase in phase_map.items():
            replace_key(dataset, old_key=source_phase, new_key=target_phase)

    # Resolved after the phase renaming so the new phase names are used.
    selected_phases = dataset.keys() if phase_list is None else phase_list

    def _prepare(example):
        # Rename fields first so a field renamed to "label" gets mapped too.
        if field_map:
            for source_field, target_field in field_map.items():
                replace_key(example, old_key=source_field, new_key=target_field)
        if label_map and "label" in example:
            current_label = example["label"]
            if current_label in label_map:
                example["label"] = label_map[current_label]
        return example

    examples_dict = {}
    for phase in selected_phases:
        examples_dict[phase] = [_prepare(example) for example in dataset[phase]]
    return examples_dict
def load_hf_dataset(path, name=None, version=None, phase_map=None, n_fold: int = None,
                    fold: int = None, split_type: str = None):
    """Load a dataset with load_dataset, rename phases, optionally swap in a CV fold.

    Args:
        path: forwarded to load_dataset as ``path``
        name: forwarded to load_dataset as ``name``
        version: forwarded to load_dataset as ``version``
        phase_map: {old_phase_name: new_phase_name} renames applied to the
            loaded dataset dict; treated as empty when falsy
        n_fold: number of folds passed to build_cv_splits. When None, no
            cross-validation splitting is done and the renamed dataset dict
            is returned directly.
        fold: fold identifier used to pick the 'fold-{fold}.train' and
            'fold-{fold}.val' entries from the build_cv_splits result
        split_type: 'local_evaluation' (default when falsy) or 'submission';
            only consulted when n_fold is not None.
            - 'local_evaluation': folds are built from the train split only
              and the existing 'val' phase is copied to 'test'.
            - 'submission': folds are built from train and val splits
              joined with '+'.

    Returns:
        The dataset dict, with 'train' and 'val' replaced by the selected
        fold's splits when n_fold is given.

    Raises:
        ValueError: if n_fold is given and split_type is not one of the two
            known values.
    """
    if not phase_map:
        phase_map = {}
    if not split_type:
        split_type = 'local_evaluation'

    dataset_dict = load_dataset(path=path, name=name, version=version)
    for source_phase, target_phase in phase_map.items():
        replace_key(dataset_dict, old_key=source_phase, new_key=target_phase)

    # Without a fold count there is nothing further to do.
    if n_fold is None:
        return dataset_dict

    # Invert the phase map to recover the original split names that the
    # second load_dataset call needs.
    renamed_to_original = {renamed: original for original, renamed in phase_map.items()}
    hf_train_phase = renamed_to_original.get('train', 'train')
    hf_val_phase = renamed_to_original.get('val', 'val')

    if split_type == 'local_evaluation':
        pooled_phases = [hf_train_phase]
        # The original validation data takes the role of the test phase.
        dataset_dict['test'] = dataset_dict['val']
    elif split_type == 'submission':
        pooled_phases = [hf_train_phase, hf_val_phase]
    else:
        raise ValueError(f'Unknown split type "{split_type}"')

    hf_local_dataset = load_dataset(
        path=path,
        name=name,
        version=version,
        split='+'.join(pooled_phases),
    )
    cv_dataset = build_cv_splits(hf_local_dataset, n_fold, stratify=False)

    dataset_dict['train'] = cv_dataset[f'fold-{fold}.train']
    dataset_dict['val'] = cv_dataset[f'fold-{fold}.val']
    return dataset_dict
def convert_hf_dataset_to_examples(
    path,
    name=None,
    version=None,
    field_map=None,
    label_map=None,
    phase_map=None,
    phase_list=None,
    n_fold: int = None,
    fold: int = None,
    return_all: bool = False,
    experiment_id_for_metric=None,
    cache_dir_for_metric=None,
):
    """Helper function for loading a dataset via load_hf_dataset and converting to examples

    Args:
        path: path argument (forwarded to load_hf_dataset; also passed to
            load_metric when return_all is True)
        name: name argument (forwarded to load_hf_dataset; used as the
            metric's config_name when return_all is True)
        version: version argument (forwarded to load_hf_dataset)
        field_map: dictionary for renaming fields, non-exhaustive
        label_map: either a dict for replacing labels (labels that are not
            keys of the dict are left unchanged) or a callable applied to
            every label
        phase_map: dictionary for replacing phase names, non-exhaustive
            (forwarded to load_hf_dataset)
        phase_list: phases to keep (after phase_map); all phases of the
            loaded dataset when None
        n_fold: forwarded to load_hf_dataset
        fold: forwarded to load_hf_dataset
        return_all: when True, also return the loaded dataset and metric
        experiment_id_for_metric: passed to load_metric as experiment_id
        cache_dir_for_metric: passed to load_metric as cache_dir

    Returns:
        Dict[phase] -> list[examples] when return_all is False; otherwise a
        tuple ``(examples_dict, extras)`` where ``extras`` is a dict with
        keys 'hf_fold_dataset' and 'hf_metric'.

    Raises:
        TypeError: if ``label_map`` is truthy but neither a dict nor a
            callable, and an example carries a "label" field.
    """
    # NOTE(review): the original carried the bare comment "mrpc.cv-5-0" here,
    # presumably an example of a task/fold identifier - confirm with callers.
    fold_dataset = load_hf_dataset(
        path=path,
        name=name,
        version=version,
        phase_map=phase_map,
        n_fold=n_fold,
        fold=fold,
    )
    if phase_list is None:
        phase_list = fold_dataset.keys()

    examples_dict = {}
    for phase in phase_list:
        phase_examples = []
        for raw_example in fold_dataset[phase]:
            if field_map:
                for old_field_name, new_field_name in field_map.items():
                    replace_key(raw_example, old_key=old_field_name, new_key=new_field_name)
            if label_map and "label" in raw_example:
                label = raw_example["label"]
                if isinstance(label_map, dict):
                    # Non-exhaustive mapping: unknown labels pass through.
                    label = label_map.get(label, label)
                elif callable(label_map):
                    label = label_map(label)
                else:
                    raise TypeError(label_map)
                raw_example["label"] = label
            phase_examples.append(raw_example)
        examples_dict[phase] = phase_examples

    if not return_all:
        return examples_dict

    # The metric is only part of the extended return value, so it is loaded
    # only on this path; callers that want just the examples neither pay for
    # it nor fail when no metric can be loaded for `path`.
    metric = load_metric(
        path,
        config_name=name,
        experiment_id=experiment_id_for_metric,
        cache_dir=cache_dir_for_metric,
    )
    return examples_dict, {
        'hf_fold_dataset': fold_dataset,
        'hf_metric': metric,
    }