def test_labels_correctness():
    """Haploid labels produced for the test pileup must match the stored reference array."""
    pileup_path = os.path.join(get_data_folder(), "subreads_and_truth.pileup")
    region = FileRegion(start_pos=0, end_pos=14460, file_path=pileup_path)

    encoder = HaploidLabelEncoder(exclude_no_coverage_positions=False)
    haploid_labels, _ = encoder(region)

    expected = np.load(os.path.join(get_data_folder(), "sample_haploid_labels.npy"))
    assert haploid_labels.shape == expected.shape
    assert np.allclose(haploid_labels, expected)
def test_counts_correctness():
    """Normalized pileup counts for the test region must match the stored reference array."""
    pileup_path = os.path.join(get_data_folder(), "subreads_and_truth.pileup")
    region = FileRegion(start_pos=0, end_pos=14460, file_path=pileup_path)

    encoder = SummaryEncoder(exclude_no_coverage_positions=False,
                             normalize_counts=True)
    pileup_counts, _ = encoder(region)

    expected = np.load(os.path.join(get_data_folder(), "sample_counts.npy"))
    assert pileup_counts.shape == expected.shape
    assert np.allclose(pileup_counts, expected)
def deletion_variant():
    """Fixture: heterozygous CTTTA->C deletion at chr1:10163457 backed by the indel test BAM."""
    bam_path = os.path.join(get_data_folder(), "some_indels.bam")
    return Variant(
        chrom="1",
        pos=10163457,
        id=None,
        ref='CTTTA',
        allele='C',
        quality=50,
        filter=[],
        info={},
        format=['GT', 'PS', 'DP', 'ADALL', 'AD', 'GQ'],
        samples=[['1/0', None, 177, [0, 0, 0], [0, 0, 0], 160]],
        zygosity=[VariantZygosity.HETEROZYGOUS],
        type=VariantType.DELETION,
        vcf='null.vcf',
        bams=[bam_path],
    )
def insertion_variant():
    """Fixture: homozygous T->TG insertion (rs57037935) at chr1:10122622 backed by the indel test BAM."""
    bam_path = os.path.join(get_data_folder(), "some_indels.bam")
    return Variant(
        chrom="1",
        pos=10122622,
        id="rs57037935",
        ref='T',
        allele='TG',
        quality=50,
        filter=[],
        info={},
        format=['GT', 'PS', 'DP', 'ADALL', 'AD', 'GQ'],
        samples=[['1/1', None, 546, [0, 246], [25, 25], 330]],
        zygosity=[VariantZygosity.HOMOZYGOUS],
        type=VariantType.INSERTION,
        vcf='null.vcf',
        bams=[bam_path],
    )
def snp_variant():
    """Fixture: homozygous T->A SNP at chr1:240000 backed by the small test BAM."""
    bam_path = os.path.join(get_data_folder(), "small_bam.bam")
    return Variant(
        chrom="1",
        pos=240000,
        id="GL000235",
        ref='T',
        allele='A',
        quality=60,
        filter=None,
        info={'DP': 35, 'AF': 0.0185714},
        format=['GT', 'GQ'],
        samples=[['1/1', '50']],
        zygosity=[VariantZygosity.HOMOZYGOUS],
        type=VariantType.SNP,
        vcf='null.vcf',
        bams=[bam_path],
    )
def test_simple_vc_infer():
    """Run inference with a previously checkpointed AlexNet model and decode zygosity labels.

    Relies on a checkpoint left in ``.test_model`` by the training test; cleans the
    checkpoint directory up at the end.
    """
    # Load checkpointed model and run inference
    test_data_dir = get_data_folder()
    model_dir = os.path.join(test_data_dir, ".test_model")

    # Create neural factory pointed at the existing checkpoint directory.
    # NOTE(review): placement is hard-coded to GPU — this test cannot run on a CPU-only host.
    nf = nemo.core.NeuralModuleFactory(
        placement=nemo.core.neural_factory.DeviceType.GPU,
        checkpoint_dir=model_dir)

    # Generate dataset from the candidate VCF + BAM pair.
    bam = os.path.join(test_data_dir, "small_bam.bam")
    labels = os.path.join(test_data_dir, "candidates.vcf.gz")
    vcf_bam_tuple = VCFReader.VcfBamPath(vcf=labels, bam=bam, is_fp=False)
    vcf_loader = VCFReader([vcf_bam_tuple])
    test_dataset = ReadPileupDataLoader(ReadPileupDataLoader.Type.TEST,
                                        vcf_loader,
                                        batch_size=32,
                                        shuffle=False)

    # Neural Network: 1 input channel (pileup encoding), 3 output logits (zygosity classes).
    alexnet = AlexNet(num_input_channels=1, num_output_logits=3)

    # Create inference DAG: dataset -> encoding -> logits.
    encoding = test_dataset()
    vz = alexnet(encoding=encoding)

    # Invoke the "infer" action, restoring weights from the checkpoint.
    results = nf.infer([vz], checkpoint_dir=model_dir, verbose=True)

    # Decode inference results to zygosity labels.
    zyg_decoder = ZygosityLabelDecoder()
    for tensor_batches in results:
        for batch in tensor_batches:
            # argmax over the logit dimension gives the predicted class id per sample.
            predicted_classes = torch.argmax(batch, dim=1)
            inferred_zygosity = [
                zyg_decoder(pred) for pred in predicted_classes
            ]
            # NOTE(review): this assumes the whole dataset fits in a single batch
            # (len(vcf_loader) <= batch_size); otherwise per-batch length would
            # not equal the full loader length — confirm dataset size.
            assert (len(inferred_zygosity) == len(vcf_loader))

    # Remove the checkpoint directory so repeated runs start clean.
    shutil.rmtree(model_dir)
def test_zygosity_encoder():
    """ZygosityLabelEncoder maps variant zygosity to class ids.

    Expected mapping (pinned by this test):
        NO_VARIANT   -> 0
        HOMOZYGOUS   -> 1
        HETEROZYGOUS -> 2

    The original test triplicated the Variant construction, differing only in
    the sample genotype string and zygosity enum; the cases are data-driven here.
    """
    encoder = ZygosityLabelEncoder()
    bam = os.path.join(get_data_folder(), "small_bam.bam")

    # (sample GT string, zygosity enum, expected encoded class id)
    cases = [
        ('1/1', VariantZygosity.HOMOZYGOUS, 1),
        ('0/0', VariantZygosity.NO_VARIANT, 0),
        ('0/1', VariantZygosity.HETEROZYGOUS, 2),
    ]
    for gt, zygosity, expected in cases:
        variant = Variant(chrom="1",
                          pos=240000,
                          id="GL000235",
                          ref='T',
                          allele='A',
                          quality=60,
                          filter=None,
                          info={
                              'DP': 35,
                              'AF': 0.0185714
                          },
                          format=['GT', 'GQ'],
                          samples=[[gt, '50']],
                          zygosity=[zygosity],
                          type=VariantType.SNP,
                          vcf='null.vcf',
                          bams=[bam])
        encoding = encoder(variant)
        # The encoder should return a scalar (0-dim) tensor for every variant.
        assert (encoding.size() == torch.Size([]))
        assert (encoding == expected)
end_pos=14460, file_path=os.path.join(get_data_folder(), "subreads_and_truth.pileup")) encoder = SummaryEncoder(exclude_no_coverage_positions=False, normalize_counts=True) pileup_counts, _ = encoder(region) correct_counts = np.load( os.path.join(get_data_folder(), "sample_counts.npy")) assert (pileup_counts.shape == correct_counts.shape) assert (np.allclose(pileup_counts, correct_counts)) @pytest.mark.parametrize( "start_pos,end_pos,shape,pileup_file,truth_positions", [(0, 1, (1, 10), os.path.join(get_data_folder(), "subreads_and_truth.pileup"), torch.IntTensor([[0, 0]])), (1, 4, (3, 10), os.path.join(get_data_folder(), "subreads_and_truth.pileup"), torch.IntTensor([[1, 0], [2, 0], [3, 0]])), (14459, 14460, (1, 10), os.path.join(get_data_folder(), "subreads_and_truth.pileup"), torch.IntTensor([[14459, 0]])), (5, 6, (2, 10), os.path.join(get_data_folder(), "subreads_and_truth.pileup"), torch.IntTensor([[5, 0], [5, 1]]))], ) def test_encoder_region_bounds(start_pos, end_pos, shape, pileup_file, truth_positions): encoder = SummaryEncoder(exclude_no_coverage_positions=False, normalize_counts=True)
def test_simple_vc_trainer():
    """Train a sample AlexNet zygosity model for one epoch on test data.

    Builds parallel train and eval DAGs over the same candidate VCF/BAM pair,
    attaches logging/eval/checkpoint callbacks, runs ``nf.train`` and asserts
    that an epoch-1 checkpoint file was written.
    """
    # Train a sample model with test data

    # Create neural factory
    # NOTE(review): placement is hard-coded to GPU — this test cannot run on a CPU-only host.
    model_dir = os.path.join(get_data_folder(), ".test_model")
    nf = nemo.core.NeuralModuleFactory(
        placement=nemo.core.neural_factory.DeviceType.GPU,
        checkpoint_dir=model_dir)

    # Generate dataset
    bam = os.path.join(get_data_folder(), "small_bam.bam")
    labels = os.path.join(get_data_folder(), "candidates.vcf.gz")
    # NOTE(review): the inference test constructs VCFReader from a list of
    # VCFReader.VcfBamPath tuples, while this call uses keyword arguments —
    # confirm both forms are supported by the current VCFReader API.
    vcf_loader = VCFReader(vcf=labels, bams=[bam], is_fp=False)

    # Neural Network: 1 input channel (pileup encoding), 3 output logits (zygosity classes).
    alexnet = AlexNet(num_input_channels=1, num_output_logits=3)

    # Create train DAG: dataset -> encoding -> logits -> cross-entropy loss.
    dataset_train = ReadPileupDataLoader(ReadPileupDataLoader.Type.TRAIN,
                                         [vcf_loader],
                                         batch_size=32,
                                         shuffle=True)
    vz_ce_loss = CrossEntropyLossNM(logits_ndim=2)
    vz_labels, encoding = dataset_train()
    vz = alexnet(encoding=encoding)
    vz_loss = vz_ce_loss(logits=vz, labels=vz_labels)

    # Create evaluation DAG using same dataset as training
    dataset_eval = ReadPileupDataLoader(ReadPileupDataLoader.Type.EVAL,
                                        [vcf_loader],
                                        batch_size=32,
                                        shuffle=False)
    vz_ce_loss_eval = CrossEntropyLossNM(logits_ndim=2)
    vz_labels_eval, encoding_eval = dataset_eval()
    vz_eval = alexnet(encoding=encoding_eval)
    vz_loss_eval = vz_ce_loss_eval(logits=vz_eval, labels=vz_labels_eval)

    # Logger callback: print loss tensors every step.
    logger_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[vz_loss, vz, vz_labels],
        step_freq=1,
    )

    # Evaluator callback: run the eval DAG every step via the project's eval hooks.
    evaluator_callback = nemo.core.EvaluatorCallback(
        eval_tensors=[vz_loss_eval, vz_eval, vz_labels_eval],
        user_iter_callback=eval_iter_callback,
        user_epochs_done_callback=eval_epochs_done_callback,
        eval_step=1,
    )

    # Checkpointing models through NeMo callback
    checkpoint_callback = nemo.core.CheckpointCallback(
        folder=nf.checkpoint_dir,
        load_from_folder=None,
        # Checkpointing frequency in steps (-1 disables step-based checkpoints)
        step_freq=-1,
        # Checkpointing frequency in epochs
        epoch_freq=1,
        # Number of checkpoints to keep
        checkpoints_to_keep=1,
        # If True, CheckpointCallback will raise an Error if restoring fails
        force_load=False)

    # Invoke the "train" action.
    nf.train(
        [vz_loss],
        callbacks=[logger_callback, checkpoint_callback, evaluator_callback],
        optimization_params={
            "num_epochs": 1,
            "lr": 0.001
        },
        optimizer="adam")

    # The epoch-frequency checkpoint callback should have written an epoch-1 model file.
    assert (os.path.exists(os.path.join(model_dir, "AlexNet-EPOCH-1.pt")))