def setup_jiant_model(
    model_type: str,
    model_config_path: str,
    tokenizer_path: str,
    task_dict: Dict[str, Task],
    taskmodels_config: container_setup.TaskmodelsConfig,
):
    """Sets up tokenizer, encoder, and task models, and instantiates and returns a JiantModel.

    Args:
        model_type (str): model shortcut name.
        model_config_path (str): path to the JSON file containing the configuration parameters.
        tokenizer_path (str): path to tokenizer directory.
        task_dict (Dict[str, tasks.Task]): map from task name to task instance.
        taskmodels_config: config specifying the task-to-taskmodel mapping and task-model kwargs.

    Returns:
        JiantModel nn.Module.
    """
    model_arch = ModelArchitectures.from_model_type(model_type)
    transformers_class_spec = TRANSFORMERS_CLASS_SPEC_DICT[model_arch]
    tokenizer = model_setup.get_tokenizer(model_type=model_type, tokenizer_path=tokenizer_path)
    ancestor_model = get_ancestor_model(
        transformers_class_spec=transformers_class_spec,
        model_config_path=model_config_path,
    )
    encoder = get_encoder(model_arch=model_arch, ancestor_model=ancestor_model)
    taskmodels_dict = {
        taskmodel_name: create_taskmodel(
            # All tasks sharing a taskmodel share a head, so the first task is representative.
            task=task_dict[task_name_list[0]],
            model_arch=model_arch,
            encoder=encoder,
            taskmodel_kwargs=taskmodels_config.get_taskmodel_kwargs(taskmodel_name),
        )
        for taskmodel_name, task_name_list in get_taskmodel_and_task_names(
            taskmodels_config.task_to_taskmodel_map
        ).items()
    }
    return primary.JiantModel(
        task_dict=task_dict,
        encoder=encoder,
        taskmodels_dict=taskmodels_dict,
        task_to_taskmodel_map=taskmodels_config.task_to_taskmodel_map,
        tokenizer=tokenizer,
    )
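# A minimal usage sketch for setup_jiant_model (not part of the source above).
# All paths are hypothetical placeholders, and TaskmodelsConfig is constructed
# here with only the task_to_taskmodel_map field that setup_jiant_model
# actually reads; real configs may carry additional per-taskmodel kwargs.
example_task_dict = {
    "mnli": tasks.create_task_from_config_path("/path/to/configs/mnli.json"),
}
example_jiant_model = setup_jiant_model(
    model_type="roberta-base",
    model_config_path="/path/to/model/config.json",
    tokenizer_path="/path/to/tokenizer/",
    task_dict=example_task_dict,
    taskmodels_config=container_setup.TaskmodelsConfig(
        task_to_taskmodel_map={"mnli": "mnli"},
    ),
)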
def main(args: RunConfiguration):
    task = tasks.create_task_from_config_path(config_path=args.task_config_path, verbose=True)
    feat_spec = model_resolution.build_featurization_spec(
        model_type=args.model_type,
        max_seq_length=args.max_seq_length,
    )
    tokenizer = model_setup.get_tokenizer(
        model_type=args.model_type,
        tokenizer_path=args.model_tokenizer_path,
    )
    if isinstance(args.phases, str):
        phases = args.phases.split(",")
    else:
        phases = args.phases
    assert set(phases) <= {PHASE.TRAIN, PHASE.VAL, PHASE.TEST}

    paths_dict = {}
    os.makedirs(args.output_dir, exist_ok=True)

    if PHASE.TRAIN in phases:
        chunk_and_save(
            task=task,
            phase=PHASE.TRAIN,
            examples=task.get_train_examples(),
            feat_spec=feat_spec,
            tokenizer=tokenizer,
            args=args,
        )
        paths_dict[PHASE.TRAIN] = os.path.join(args.output_dir, PHASE.TRAIN)

    if PHASE.VAL in phases:
        val_examples = task.get_val_examples()
        chunk_and_save(
            task=task,
            phase=PHASE.VAL,
            examples=val_examples,
            feat_spec=feat_spec,
            tokenizer=tokenizer,
            args=args,
        )
        # Val labels are cached separately so evaluation can run without re-tokenizing.
        evaluation_scheme = evaluate.get_evaluation_scheme_for_task(task)
        shared_caching.chunk_and_save(
            data=evaluation_scheme.get_labels_from_cache_and_examples(
                task=task,
                cache=shared_caching.ChunkedFilesDataCache(
                    os.path.join(args.output_dir, PHASE.VAL)
                ),
                examples=val_examples,
            ),
            chunk_size=args.chunk_size,
            data_args=args.to_dict(),
            output_dir=os.path.join(args.output_dir, "val_labels"),
        )
        paths_dict[PHASE.VAL] = os.path.join(args.output_dir, PHASE.VAL)
        paths_dict["val_labels"] = os.path.join(args.output_dir, "val_labels")

    if PHASE.TEST in phases:
        chunk_and_save(
            task=task,
            phase=PHASE.TEST,
            examples=task.get_test_examples(),
            feat_spec=feat_spec,
            tokenizer=tokenizer,
            args=args,
        )
        paths_dict[PHASE.TEST] = os.path.join(args.output_dir, PHASE.TEST)

    if not args.skip_write_output_paths:
        py_io.write_json(data=paths_dict, path=os.path.join(args.output_dir, "paths.json"))
def main(args: RunConfiguration):
    task = tasks.create_task_from_config_path(config_path=args.task_config_path, verbose=True)
    feat_spec = model_resolution.build_featurization_spec(
        model_type=args.model_type,
        max_seq_length=args.max_seq_length,
    )
    tokenizer = model_setup.get_tokenizer(
        model_type=args.model_type,
        tokenizer_path=args.model_tokenizer_path,
    )
    if isinstance(args.phases, str):
        phases = args.phases.split(",")
    else:
        phases = args.phases
    assert set(phases) <= {PHASE.TRAIN, PHASE.VAL, PHASE.TEST}

    paths_dict = {}
    os.makedirs(args.output_dir, exist_ok=True)

    def do_tokenize(phase: str):
        """Tokenize-and-cache one phase, then cache its labels alongside."""
        evaluation_scheme = evaluate.get_evaluation_scheme_for_task(task)
        output_dir = os.path.join(args.output_dir, phase)
        labels_output_dir = os.path.join(args.output_dir, f"{phase}_labels")
        if phase == PHASE.TRAIN:
            get_examples_func = task.get_train_examples
        elif phase == PHASE.VAL:
            get_examples_func = task.get_val_examples
        elif phase == PHASE.TEST:
            def get_examples_func():
                # Prefer labeled test examples when the task provides them;
                # fall back to the unlabeled test set otherwise.
                try:
                    return task.get_examples("test")
                except NotImplementedError:
                    logger.warning(
                        'Labels for the "test" split could not be retrieved, so metrics '
                        'for the "test" split will not be evaluated properly.'
                    )
                    return task.get_test_examples()

        chunk_and_save(
            task=task,
            phase=phase,
            examples=get_examples_func(),
            feat_spec=feat_spec,
            tokenizer=tokenizer,
            args=args,
        )
        paths_dict[phase] = output_dir

        shared_caching.chunk_and_save(
            data=evaluation_scheme.get_labels_from_cache_and_examples(
                task=task,
                cache=shared_caching.ChunkedFilesDataCache(output_dir),
                examples=get_examples_func(),
            ),
            chunk_size=args.chunk_size,
            data_args=args.to_dict(),
            output_dir=labels_output_dir,
        )
        paths_dict[f"{phase}_labels"] = labels_output_dir

    for phase in (PHASE.TRAIN, PHASE.VAL, PHASE.TEST):
        if phase in phases:
            do_tokenize(phase)

    if not args.skip_write_output_paths:
        py_io.write_json(data=paths_dict, path=os.path.join(args.output_dir, "paths.json"))
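# A hedged usage sketch for the revised main above. All paths are hypothetical
# placeholders, and this assumes RunConfiguration accepts the fields the
# function reads (task_config_path, model_type, model_tokenizer_path,
# max_seq_length, phases, output_dir, chunk_size, skip_write_output_paths)
# as keyword arguments.
example_args = RunConfiguration(
    task_config_path="/path/to/configs/rte.json",
    model_type="roberta-base",
    model_tokenizer_path="/path/to/tokenizer/",
    max_seq_length=256,
    phases="train,val,test",
    output_dir="/path/to/cache/rte",
    chunk_size=10000,
    skip_write_output_paths=False,
)
main(example_args)
# With all three phases requested, paths.json should map "train", "val", and
# "test", plus the corresponding "*_labels" keys, to their cache directories.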
def main(args: RunConfiguration):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # === Shared model components setup === #
    model_type = "roberta-base"
    model_arch = ModelArchitectures.from_model_type(model_type=model_type)
    transformers_class_spec = model_setup.TRANSFORMERS_CLASS_SPEC_DICT[model_arch]
    ancestor_model = model_setup.get_ancestor_model(
        transformers_class_spec=transformers_class_spec,
        model_config_path=args.model_config_path,
    )
    encoder = model_setup.get_encoder(
        model_arch=model_arch,
        ancestor_model=ancestor_model,
    )
    tokenizer = shared_model_setup.get_tokenizer(
        model_type=model_type,
        tokenizer_path=args.model_tokenizer_path,
    )

    # === Taskmodels setup === #
    task_dict = {
        "mnli": tasks.create_task_from_config_path(
            os.path.join(args.task_config_base_path, "mnli.json")
        ),
        "qnli": tasks.create_task_from_config_path(
            os.path.join(args.task_config_base_path, "qnli.json")
        ),
        "rte": tasks.create_task_from_config_path(
            os.path.join(args.task_config_base_path, "rte.json")
        ),
    }
    taskmodels_dict = {
        # MNLI and QNLI share a single "nli" classification head; RTE gets its own.
        "nli": taskmodels.ClassificationModel(
            encoder=encoder,
            classification_head=heads.ClassificationHead(
                hidden_size=encoder.config.hidden_size,
                hidden_dropout_prob=encoder.config.hidden_dropout_prob,
                num_labels=len(task_dict["mnli"].LABELS),
            ),
        ),
        "rte": taskmodels.ClassificationModel(
            encoder=encoder,
            classification_head=heads.ClassificationHead(
                hidden_size=encoder.config.hidden_size,
                hidden_dropout_prob=encoder.config.hidden_dropout_prob,
                num_labels=len(task_dict["rte"].LABELS),
            ),
        ),
    }
    task_to_taskmodel_map = {
        "mnli": "nli",
        "qnli": "nli",
        "rte": "rte",
    }

    # === Final === #
    jiant_model = JiantModel(
        task_dict=task_dict,
        encoder=encoder,
        taskmodels_dict=taskmodels_dict,
        task_to_taskmodel_map=task_to_taskmodel_map,
        tokenizer=tokenizer,
    )
    jiant_model = jiant_model.to(device)

    # === Run === #
    task_dataloader_dict = {}
    for task_name, task in task_dict.items():
        train_cache = caching.ChunkedFilesDataCache(
            cache_fol_path=os.path.join(args.task_cache_base_path, task_name, "train"),
        )
        train_dataset = train_cache.get_iterable_dataset(buffer_size=10000, shuffle=True)
        train_dataloader = torch_utils.DataLoaderWithLength(
            dataset=train_dataset,
            batch_size=4,
            collate_fn=task.collate_fn,
        )
        task_dataloader_dict[task_name] = train_dataloader

    # Run one forward pass per task to verify that batches route through the
    # shared encoder and the correct per-task head.
    for task_name, task in task_dict.items():
        batch, batch_metadata = next(iter(task_dataloader_dict[task_name]))
        batch = batch.to(device)
        with torch.no_grad():
            model_output = wrap_jiant_forward(
                jiant_model=jiant_model,
                batch=batch,
                task=task,
                compute_loss=True,
            )
        print(task_name)
        print(model_output)
        print()
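# The demo's RunConfiguration is not shown above; a minimal stand-in carrying
# just the four fields main() reads might look like this. This is a sketch,
# not the actual jiant config class, and the paths are hypothetical.
from dataclasses import dataclass

@dataclass
class DemoRunConfiguration:
    model_config_path: str
    model_tokenizer_path: str
    task_config_base_path: str
    task_cache_base_path: str

main(DemoRunConfiguration(
    model_config_path="/path/to/model/config.json",
    model_tokenizer_path="/path/to/tokenizer/",
    task_config_base_path="/path/to/task_configs/",
    task_cache_base_path="/path/to/task_caches/",
))
# Design note: task_to_taskmodel_map routes "mnli" and "qnli" to the same "nli"
# taskmodel, so those tasks share one classification head on the shared
# encoder, while "rte" keeps its own head.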