def run(args):
    """Load a trained multi-task CRF tagger/classifier and prepare it for inference.

    Restores the saved vocabulary and the best model weights from
    ``args.model_dir``, moves the model to the requested CUDA device (if any),
    switches it to eval/inference mode, and returns everything the caller
    needs to run prediction.

    Args:
        args: Parsed command-line namespace (task, proj_dim, dropout,
            batch_size, model_dir, cuda, ...).

    Returns:
        Tuple of ``(TASKS, vocab, model, readers, test_iterator)``.
    """
    SELECTED_TASK_NAMES = args.task
    PROJECTION_DIM = args.proj_dim
    DROPOUT = args.dropout
    BATCH_SIZE = args.batch_size
    SERIALIZATION_DIR = args.model_dir
    CUDA_DEVICE = cuda_device(args.cuda)

    TASKS = [TASK_CONFIGS[task_name] for task_name in SELECTED_TASK_NAMES]

    # Stable integer id per tag namespace; derived from the full TASK_CONFIGS
    # key order so the mapping is identical across training and inference runs.
    tag_namespace_hashing_fn = {
        tag_namespace: i
        for i, tag_namespace in enumerate(TASK_CONFIGS.keys())
    }.get

    elmo_token_indexer = ELMoTokenCharactersIndexer()
    token_indexers = {"tokens": elmo_token_indexer}
    readers = {
        task.tag_namespace: JSONDatasetReader(
            task.tag_namespace,
            token_indexers=token_indexers,
            tag_namespace_hashing_fn=tag_namespace_hashing_fn,
        )
        for task in TASKS
    }

    # Frozen ELMo embeddings, optionally projected down to PROJECTION_DIM.
    elmo_embedder = ElmoTokenEmbedder(
        options_file,
        weight_file,
        requires_grad=False,
        dropout=DROPOUT,
        projection_dim=PROJECTION_DIM,
    )
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})
    ELMO_EMBEDDING_DIM = elmo_embedder.get_output_dim()

    # Task suffix (e.g. "pos", "ner") selects which encoder stack to build.
    task_suffixes = {
        task_name.rsplit("_", 1)[-1] for task_name in SELECTED_TASK_NAMES
    }
    encoders = get_task_encoder_dict(args, task_suffixes, ELMO_EMBEDDING_DIM)

    vocab = Vocabulary.from_files(os.path.join(SERIALIZATION_DIR, "vocabulary"))
    model = MultiTaskCRFTaggerAndClassifier(word_embeddings, encoders, vocab,
                                            TASKS)

    # Remap saved weights onto CPU when CUDA was not requested; otherwise let
    # torch restore them to their original device.
    map_location = "cpu" if not args.cuda else None
    model.load_state_dict(
        torch.load(os.path.join(SERIALIZATION_DIR, "best.th"),
                   map_location=map_location))
    if args.cuda:
        model = model.cuda(device=CUDA_DEVICE)
        # Empty cache to ensure a larger batch can be loaded for testing.
        torch.cuda.empty_cache()

    logger.info("Evaluating on test data")
    # Inference keeps no gradients, so twice the training batch size fits.
    test_iterator = CustomHomogeneousBatchIterator(partition_key="dataset",
                                                   batch_size=BATCH_SIZE * 2)
    test_iterator.index_with(vocab)
    model = model.eval()
    model.set_inference_mode(True)
    return TASKS, vocab, model, readers, test_iterator
def run(args):
    """Train a multi-task CRF tagger (or load one in test mode) and evaluate it.

    In training mode: builds the vocabulary from the train/dev splits,
    serializes arguments and vocabulary to ``args.model_dir``, trains with
    early stopping, and writes ``training_stats.json``.  In test mode
    (``args.test_mode``): restores the saved vocabulary and best weights
    instead.  Either way, the model is finally evaluated on every task's
    test split and ``test_stats.json`` is written to ``args.model_dir``.

    Args:
        args: Parsed command-line namespace (dataset paths, task list,
            model/optimizer hyper-parameters, serialization options, ...).

    Side effects:
        Reads datasets from disk, may delete/recreate ``args.model_dir``,
        and writes JSON stats plus model artifacts under it. Requires CUDA.
    """
    ALL_DATASET_PATHS = get_all_dataset_paths(args.dataset_paths_file,
                                              args.dataset_path_prefix)
    SELECTED_TASK_NAMES = args.task
    PROJECTION_DIM = args.proj_dim
    DROPOUT = args.dropout
    LR = args.lr
    WEIGHT_DECAY = args.weight_decay
    BATCH_SIZE = args.batch_size
    NUM_EPOCHS = args.epochs
    PATIENCE = args.patience  # fixed misspelled local (was PATIENTCE)
    SERIALIZATION_DIR = args.model_dir
    CLEAN_MODEL_DIR = args.clean_model_dir
    CUDA_DEVICE = cuda_device(args.cuda)
    TEST_MODE = args.test_mode

    TASKS = [TASK_CONFIGS[task_name] for task_name in SELECTED_TASK_NAMES]
    dataset_paths = {
        task_name: ALL_DATASET_PATHS[task_name]
        for task_name in SELECTED_TASK_NAMES
    }
    # Stable integer id per tag namespace; derived from the full TASK_CONFIGS
    # key order so the mapping is identical across runs.
    tag_namespace_hashing_fn = {
        tag_namespace: i
        for i, tag_namespace in enumerate(TASK_CONFIGS.keys())
    }.get

    elmo_token_indexer = ELMoTokenCharactersIndexer()
    token_indexers = {"tokens": elmo_token_indexer}
    readers = {
        task.tag_namespace: ConLLDatasetReader(
            task.tag_namespace,
            token_indexers=token_indexers,
            tag_namespace_hashing_fn=tag_namespace_hashing_fn,
        )
        for task in TASKS
    }

    # Frozen ELMo embeddings, optionally projected down to PROJECTION_DIM.
    elmo_embedder = ElmoTokenEmbedder(
        options_file,
        weight_file,
        requires_grad=False,
        dropout=DROPOUT,
        projection_dim=PROJECTION_DIM,
    )
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})
    ELMO_EMBEDDING_DIM = elmo_embedder.get_output_dim()

    # Task suffix (e.g. "pos", "ner") selects which encoder stack to build.
    task_suffixes = {
        task_name.rsplit("_", 1)[-1] for task_name in SELECTED_TASK_NAMES
    }
    encoders = get_task_encoder_dict(args, task_suffixes, ELMO_EMBEDDING_DIM)

    if not TEST_MODE:
        train_dataset = read_datasets(dataset_paths, readers, data_split="train")
        validation_dataset = read_datasets(dataset_paths, readers, data_split="dev")
        vocab = create_vocab([train_dataset, validation_dataset])
        # Special case for CCG / UD-POS: pre-register tags that may be absent
        # from the training split so the label namespace covers them at
        # decode time.
        if "ccg" in task_suffixes or "pos" in task_suffixes:
            for task in TASKS:
                if task.task_type == "ccg":
                    for tag in ["B-NOUN.SHAPE", "I-NOUN.PROCESS"]:
                        vocab.add_token_to_namespace(tag, task.tag_namespace)
                if task.tag_namespace == "ud_pos":
                    for tag in ["CONJ"]:
                        vocab.add_token_to_namespace(tag, task.tag_namespace)
    else:
        vocab = Vocabulary.from_files(
            os.path.join(SERIALIZATION_DIR, "vocabulary"))

    model = MultiTaskCRFTagger(word_embeddings, encoders, vocab, TASKS)
    model = model.cuda(device=CUDA_DEVICE)

    if not TEST_MODE:
        iterator = CustomHomogeneousBatchIterator(partition_key="dataset",
                                                  batch_size=BATCH_SIZE,
                                                  cache_instances=True)
        iterator.index_with(vocab)

        if CLEAN_MODEL_DIR:
            if os.path.exists(SERIALIZATION_DIR):
                logger.info(f"Deleting {SERIALIZATION_DIR}")
                shutil.rmtree(SERIALIZATION_DIR)
            logger.info(f"Creating {SERIALIZATION_DIR}")
            os.makedirs(SERIALIZATION_DIR)

        logger.info(
            f"Writing arguments to arguments.json in {SERIALIZATION_DIR}")
        with open(os.path.join(SERIALIZATION_DIR, "arguments.json"), "w+") as fp:
            json.dump(vars(args), fp, indent=2)
        logger.info(f"Writing vocabulary in {SERIALIZATION_DIR}")
        vocab.save_to_files(os.path.join(SERIALIZATION_DIR, "vocabulary"))

        # Use list to ensure each epoch is a full pass through the data.
        combined_training_dataset = list(
            roundrobin_iterator(*train_dataset.values()))
        combined_validation_dataset = list(
            roundrobin_iterator(*validation_dataset.values()))

        optimizer = optim.Adam(model.parameters(),
                               lr=LR,
                               weight_decay=WEIGHT_DECAY)
        training_stats = []
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            iterator=iterator,
            train_dataset=combined_training_dataset,
            validation_dataset=combined_validation_dataset,
            patience=PATIENCE,
            num_epochs=NUM_EPOCHS,
            cuda_device=CUDA_DEVICE,
            serialization_dir=SERIALIZATION_DIR,
        )
        stats = trainer.train()
        training_stats.append(stats)
        with open(os.path.join(SERIALIZATION_DIR, "training_stats.json"),
                  "w+") as fp:
            json.dump(training_stats, fp, indent=2)
    else:
        model.load_state_dict(
            torch.load(os.path.join(SERIALIZATION_DIR, "best.th")))
        model = model.cuda(device=CUDA_DEVICE)
        # Empty cache to ensure a larger batch can be loaded for testing.
        torch.cuda.empty_cache()

    test_filepaths = {
        task.tag_namespace: dataset_paths[task.tag_namespace]["test"]
        for task in TASKS
    }
    logger.info("Evaluating on test data")
    # Evaluation keeps no gradients, so twice the training batch size fits.
    test_iterator = CustomHomogeneousBatchIterator(partition_key="dataset",
                                                   batch_size=BATCH_SIZE * 2)
    test_iterator.index_with(vocab)
    model = model.eval()
    test_stats = evaluate_multiple_data(model,
                                        readers,
                                        test_iterator,
                                        test_filepaths,
                                        cuda_device=CUDA_DEVICE)
    with open(os.path.join(SERIALIZATION_DIR, "test_stats.json"), "w+") as fp:
        json.dump(test_stats, fp, indent=2)