def test_simple_vc_infer():
    """Load a checkpointed model and run zygosity inference on the test dataset.

    Consumes the checkpoint written by ``test_simple_vc_trainer`` (in
    ``.test_model``) and removes it afterwards.
    """
    test_data_dir = get_data_folder()
    model_dir = os.path.join(test_data_dir, ".test_model")

    # Create neural factory pointing at the checkpoint directory so NeMo can
    # restore the pre-trained weights.
    nf = nemo.core.NeuralModuleFactory(
        placement=nemo.core.neural_factory.DeviceType.GPU, checkpoint_dir=model_dir)

    # Generate dataset. Use the same VCFReader construction style as the
    # trainer test (vcf= / bams= keyword form) for consistency.
    bam = os.path.join(test_data_dir, "small_bam.bam")
    labels = os.path.join(test_data_dir, "candidates.vcf.gz")
    vcf_loader = VCFReader(vcf=labels, bams=[bam], is_fp=False)
    test_dataset = ReadPileupDataLoader(ReadPileupDataLoader.Type.TEST, [vcf_loader],
                                        batch_size=32, shuffle=False)

    # Neural Network
    alexnet = AlexNet(num_input_channels=1, num_output_logits=3)

    # Create inference DAG
    encoding = test_dataset()
    vz = alexnet(encoding=encoding)

    # Invoke the "infer" action.
    results = nf.infer([vz], checkpoint_dir=model_dir, verbose=True)

    # Decode inference results to zygosity labels.
    zyg_decoder = ZygosityLabelDecoder()
    for tensor_batches in results:
        for batch in tensor_batches:
            predicted_classes = torch.argmax(batch, dim=1)
            inferred_zygosity = [zyg_decoder(pred) for pred in predicted_classes]

    # NOTE(review): only the last batch is retained, so this check assumes the
    # test dataset fits in a single batch (batch_size=32) — confirm fixture size.
    assert (len(inferred_zygosity) == len(vcf_loader))

    shutil.rmtree(model_dir)
def test_simple_vc_trainer():
    """Train a sample variant-zygosity model on test data for one epoch.

    Asserts that a checkpoint file is written to the ``.test_model`` folder.
    """
    model_dir = os.path.join(get_data_folder(), ".test_model")

    # Neural factory rooted at the checkpoint directory.
    nf = nemo.core.NeuralModuleFactory(
        placement=nemo.core.neural_factory.DeviceType.GPU, checkpoint_dir=model_dir)

    # Dataset: candidate variants plus the BAM they came from.
    bam = os.path.join(get_data_folder(), "small_bam.bam")
    labels = os.path.join(get_data_folder(), "candidates.vcf.gz")
    vcf_loader = VCFReader(vcf=labels, bams=[bam], is_fp=False)

    # Network under training.
    network = AlexNet(num_input_channels=1, num_output_logits=3)

    # Training DAG.
    train_dataset = ReadPileupDataLoader(ReadPileupDataLoader.Type.TRAIN, [vcf_loader],
                                         batch_size=32, shuffle=True)
    train_loss_fn = CrossEntropyLossNM(logits_ndim=2)
    train_labels, train_encoding = train_dataset()
    train_logits = network(encoding=train_encoding)
    train_loss = train_loss_fn(logits=train_logits, labels=train_labels)

    # Evaluation DAG — deliberately re-uses the same dataset as training.
    eval_dataset = ReadPileupDataLoader(ReadPileupDataLoader.Type.EVAL, [vcf_loader],
                                        batch_size=32, shuffle=False)
    eval_loss_fn = CrossEntropyLossNM(logits_ndim=2)
    eval_labels, eval_encoding = eval_dataset()
    eval_logits = network(encoding=eval_encoding)
    eval_loss = eval_loss_fn(logits=eval_logits, labels=eval_labels)

    # Callback: log the training loss every step.
    logger_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[train_loss, train_logits, train_labels],
        step_freq=1,
    )

    # Callback: run evaluation every step.
    evaluator_callback = nemo.core.EvaluatorCallback(
        eval_tensors=[eval_loss, eval_logits, eval_labels],
        user_iter_callback=eval_iter_callback,
        user_epochs_done_callback=eval_epochs_done_callback,
        eval_step=1,
    )

    # Callback: checkpoint once per epoch through NeMo.
    checkpoint_callback = nemo.core.CheckpointCallback(
        folder=nf.checkpoint_dir,
        load_from_folder=None,
        # Checkpointing frequency in steps (-1 disables step-based saves).
        step_freq=-1,
        # Checkpointing frequency in epochs.
        epoch_freq=1,
        # Number of checkpoints to keep.
        checkpoints_to_keep=1,
        # If True, CheckpointCallback will raise an Error if restoring fails.
        force_load=False)

    # Invoke the "train" action for a single epoch.
    nf.train(
        [train_loss],
        callbacks=[logger_callback, checkpoint_callback, evaluator_callback],
        optimization_params={
            "num_epochs": 1,
            "lr": 0.001
        },
        optimizer="adam")

    assert (os.path.exists(os.path.join(model_dir, "AlexNet-EPOCH-1.pt")))
repo_root_dir = pathlib.Path(__file__).parent.parent.parent.parent.absolute() # Create neural factory. In this case, the checkpoint_dir has to be set for NeMo to pick # up a pre-trained model. nf = nemo.core.NeuralModuleFactory( placement=nemo.core.neural_factory.DeviceType.GPU, checkpoint_dir="./") # Dataset generation is done in a similar manner. It's important to note that the encoder used # for inference much match that for training. encoding_layers = [PileupEncoder.Layer.READ, PileupEncoder.Layer.BASE_QUALITY] pileup_encoder = PileupEncoder(window_size=100, max_reads=100, layers=encoding_layers) # Neural Network model = AlexNet(num_input_channels=len(encoding_layers), num_output_logits=3) # Similar to training, a dataloader needs to be setup for the relevant datasets. In the case of # inference, it doesn't matter if the files are tagged as false positive or not. Each example will be # evaluated by the network. For simplicity the example is using the same dataset from training. # Note: No label encoder is required in inference. data_folder = os.path.join(repo_root_dir, "tests", "data") bam = os.path.join(data_folder, "small_bam.bam") labels = os.path.join(data_folder, "candidates.vcf.gz") vcf_loader = VCFReader(vcf=labels, bams=[bam], is_fp=False) test_dataset = ReadPileupDataLoader(ReadPileupDataLoader.Type.TEST, [vcf_loader], batch_size=32, shuffle=False, sample_encoder=pileup_encoder)
def create_model():
    """Construct the AlexNet network (2 input channels, 3 zygosity logits) to train."""
    return AlexNet(num_input_channels=2, num_output_logits=3)