def run(arguments):
    """Train a Graph2Class model from docopt-style CLI `arguments`, then evaluate it.

    Expects TRAIN/VALID/TEST_DATA_PATH and MODEL_FILENAME keys plus the
    --aml / --amp / --quiet / --sequential-run / --restore-path /
    --max-num-epochs / --minibatch-size / --azure-info options.  Side
    effects: trains and saves a model to MODEL_FILENAME, logs metrics
    (to AzureML when --aml), and prints the final test accuracy.
    """
    if arguments["--aml"]:
        from azureml.core.run import Run

        aml_ctx = Run.get_context()
        # Fail fast on CPU-only AML nodes.  An explicit raise is used instead
        # of `assert`, which would be silently stripped under `python -O`.
        if not torch.cuda.is_available():
            raise RuntimeError("No CUDA available. Aborting training.")
    else:
        aml_ctx = None

    log_path = configure_logging(aml_ctx)

    azure_info_path = arguments.get("--azure-info", None)
    training_data_path = RichPath.create(arguments["TRAIN_DATA_PATH"], azure_info_path)
    training_data = LazyDataIterable(
        lambda: load_from_folder(training_data_path, shuffle=True)
    )

    validation_data_path = RichPath.create(arguments["VALID_DATA_PATH"], azure_info_path)
    validation_data = LazyDataIterable(
        lambda: load_from_folder(validation_data_path, shuffle=False)
    )

    model_path = Path(arguments["MODEL_FILENAME"])
    # Validate CLI input with a raise rather than `assert` (stripped under -O).
    if not model_path.name.endswith(".pkl.gz"):
        raise ValueError("MODEL_FILENAME must have a `.pkl.gz` suffix.")

    initialize_metadata = True
    restore_path = arguments.get("--restore-path", None)
    if restore_path:
        # Explicit warm start from a user-supplied checkpoint.
        initialize_metadata = False
        model, nn = Graph2Class.restore_model(Path(restore_path))
    elif arguments["--aml"] and model_path.exists():
        # A pre-empted/restarted AML run: resume from the partially trained
        # model that an earlier incarnation of this job already saved.
        initialize_metadata = False
        model, nn = Graph2Class.restore_model(model_path)
    else:
        nn = None
        model = create_graph2class_gnn_model()

    def create_optimizer(parameters):
        # Fixed learning rate; presumably tuned for this task — TODO confirm.
        return torch.optim.Adam(parameters, lr=0.00025)

    trainer = ModelTrainer(
        model,
        model_path,
        max_num_epochs=int(arguments["--max-num-epochs"]),
        minibatch_size=int(arguments["--minibatch-size"]),
        optimizer_creator=create_optimizer,
        clip_gradient_norm=1,
        target_validation_metric="Accuracy",
        target_validation_metric_higher_is_better=True,
        enable_amp=arguments["--amp"],
    )
    if nn is not None:
        # Reattach the restored neural module so training continues from it.
        trainer.neural_module = nn

    trainer.register_train_epoch_end_hook(
        lambda model, nn, epoch, metrics: log_run(aml_ctx, "train", model, epoch, metrics)
    )
    trainer.register_validation_epoch_end_hook(
        lambda model, nn, epoch, metrics: log_run(aml_ctx, "valid", model, epoch, metrics)
    )

    trainer.train(
        training_data,
        validation_data,
        show_progress_bar=not arguments["--quiet"],
        initialize_metadata=initialize_metadata,
        parallelize=not arguments["--sequential-run"],
        patience=10,
        store_tensorized_data_in_memory=True,
    )

    # Final held-out evaluation on whichever device is available.
    test_data_path = RichPath.create(arguments["TEST_DATA_PATH"], azure_info_path)
    test_data = LazyDataIterable(lambda: load_from_folder(test_data_path, shuffle=False))
    acc = model.report_accuracy(
        test_data,
        trainer.neural_module,
        device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    )
    print(f"Test accuracy: {acc:%}")

    if aml_ctx is not None:
        # Persist the metric and artifacts on the AzureML run record.
        aml_ctx.log("Test Accuracy", acc)
        aml_ctx.upload_file(name="model.pkl.gz", path_or_stream=str(model_path))
        aml_ctx.upload_file(name="full.log", path_or_stream=log_path)
def train(cfg):
    """Train a VarNaming GNN model as described by the config object `cfg`.

    Reads dataset folders under `cfg.dataset.path`, optionally resumes from a
    checkpoint, trains with early validation-metric tracking, and (when
    `cfg.show_in_wandb`) mirrors per-epoch metrics to Weights & Biases.
    """
    if cfg.show_in_wandb:
        # Quieten wandb's own chatty logger before starting the tracking run.
        logger = logging.getLogger("wandb")
        logger.setLevel(logging.WARNING)
        # Initializing wandb logger.
        name_of_run = cfg.model.name_of_run
        wandb.init(project="IdTransformer", config=cfg, group="GNN", name=name_of_run)

    training_data = LazyDataIterable(
        lambda: load_from_folder(join(cfg.dataset.path, "train"), shuffle=True)
    )
    validation_data = LazyDataIterable(
        lambda: load_from_folder(join(cfg.dataset.path, "validation"), shuffle=False)
    )
    # NOTE(review): `test_data` is constructed but never used in this function —
    # evaluation presumably happens elsewhere; confirm before removing.
    test_data = LazyDataIterable(
        lambda: load_from_folder(join(cfg.dataset.path, "test"), shuffle=False)
    )

    model_path = Path(cfg.model.filename)
    # Validate config input with a raise rather than `assert` (stripped under -O).
    if not model_path.name.endswith(".pkl.gz"):
        raise ValueError("model filename must have a `.pkl.gz` suffix.")

    initialize_metadata = True
    if cfg.model.use_checkpoint and cfg.model.restore_path:
        # Warm start: reuse both the model wrapper and its trained module.
        initialize_metadata = False
        model, nn = VarNamingModel.restore_model(Path(cfg.model.restore_path))
    else:
        nn = None
        model = create_var_naming_gnn_model(cfg.model)

    def create_optimizer(parameters):
        return torch.optim.Adam(parameters, lr=cfg.model.max_lr)

    trainer = ModelTrainer(
        model,
        model_path,
        max_num_epochs=int(cfg.model.max_epochs),
        minibatch_size=int(cfg.model.minibatch_size),
        optimizer_creator=create_optimizer,
        clip_gradient_norm=1,
        target_validation_metric="accuracy",
        target_validation_metric_higher_is_better=True,
    )
    if nn is not None:
        # Reattach the restored neural module so training continues from it.
        trainer.neural_module = nn

    if cfg.show_in_wandb:
        trainer.register_train_epoch_end_hook(
            lambda model, nn, epoch, metrics: log_run("train", model, epoch, metrics)
        )
        trainer.register_validation_epoch_end_hook(
            lambda model, nn, epoch, metrics: log_run("val", model, epoch, metrics)
        )

    trainer.train(
        training_data,
        validation_data,
        validate_on_start=cfg.model.validate_on_start,
        show_progress_bar=True,
        initialize_metadata=initialize_metadata,
        parallelize=cfg.model.parallelize,
        use_multiprocessing=cfg.model.use_multiprocessing,
    )
def run(arguments):
    """Train a Graph2Seq (GNN encoder + GRU copying decoder) model from CLI `arguments`.

    Reads JSONL training/validation data, optionally resumes from
    --restore-path, trains, and (when --aml) uploads the model and log as
    AzureML run artifacts.
    """
    if arguments["--aml"]:
        import torch
        from azureml.core.run import Run

        aml_ctx = Run.get_context()
        # Fail fast on CPU-only AML nodes.  An explicit raise is used instead
        # of `assert`, which would be silently stripped under `python -O`.
        if not torch.cuda.is_available():
            raise RuntimeError("No CUDA available. Aborting training.")
    else:
        aml_ctx = None

    log_path = configure_logging(aml_ctx)

    azure_info_path = arguments.get("--azure-info", None)
    training_data_path = RichPath.create(arguments["TRAIN_DATA_PATH"], azure_info_path)
    training_data = LazyDataIterable(lambda: training_data_path.read_as_jsonl())

    validation_data_path = RichPath.create(arguments["VALID_DATA_PATH"], azure_info_path)
    validation_data = LazyDataIterable(lambda: validation_data_path.read_as_jsonl())

    model_path = Path(arguments["MODEL_FILENAME"])
    # Validate CLI input with a raise rather than `assert` (stripped under -O).
    if not model_path.name.endswith(".pkl.gz"):
        raise ValueError("MODEL_FILENAME must have a `.pkl.gz` suffix.")

    initialize_metadata = True
    restore_path = arguments.get("--restore-path", None)
    if restore_path:
        initialize_metadata = False
        model, nn = AbstractNeuralModel.restore_model(Path(restore_path))
    else:
        embedding_size = 128
        dropout_rate = 0.1
        nn = None

        def create_mp_layers(num_edges: int):
            # One shared gated message-passing layer applied 7 times (the same
            # object is deliberately reused, i.e. its weights are tied across
            # those steps), wrapped in a mean-residual connection, followed by
            # a final independent message-passing layer.
            ggnn_mp = GatedMessagePassingLayer(
                state_dimension=embedding_size,
                message_dimension=embedding_size,
                num_edge_types=num_edges,
                message_aggregation_function="sum",
                dropout_rate=dropout_rate,
            )
            r1 = MeanResidualLayer(embedding_size)
            return [
                r1.pass_through_dummy_layer(),
                *([ggnn_mp] * 7),  # same layer object repeated: tied weights
                r1,
                GatedMessagePassingLayer(
                    state_dimension=embedding_size,
                    message_dimension=embedding_size,
                    num_edge_types=num_edges,
                    message_aggregation_function="sum",
                    dropout_rate=dropout_rate,
                ),
            ]

        model = Graph2Seq(
            gnn_model=GraphNeuralNetworkModel(
                node_representation_model=StrElementRepresentationModel(
                    token_splitting="token",
                    embedding_size=embedding_size,
                ),
                message_passing_layer_creator=create_mp_layers,
            ),
            decoder=GruCopyingDecoderModel(
                hidden_size=128,
                embedding_size=256,
                memories_hidden_dim=embedding_size,
            ),
        )

    trainer = ModelTrainer(
        model,
        model_path,
        max_num_epochs=int(arguments["--max-num-epochs"]),
        minibatch_size=int(arguments["--minibatch-size"]),
        enable_amp=arguments["--amp"],
    )
    if nn is not None:
        # Reattach the restored neural module so training continues from it.
        trainer.neural_module = nn

    trainer.register_train_epoch_end_hook(
        lambda model, nn, epoch, metrics: log_run(aml_ctx, "train", model, epoch, metrics)
    )
    trainer.register_validation_epoch_end_hook(
        lambda model, nn, epoch, metrics: log_run(aml_ctx, "valid", model, epoch, metrics)
    )

    trainer.train(
        training_data,
        validation_data,
        show_progress_bar=not arguments["--quiet"],
        initialize_metadata=initialize_metadata,
        parallelize=not arguments["--sequential-run"],
    )

    if aml_ctx is not None:
        # Persist artifacts on the AzureML run record.
        aml_ctx.upload_file(name="model.pkl.gz", path_or_stream=str(model_path))
        aml_ctx.upload_file(name="full.log", path_or_stream=log_path)