Example no. 1
0
def test_labels_correctness():
    """Verify HaploidLabelEncoder output matches the stored reference labels."""
    pileup_path = os.path.join(get_data_folder(), "subreads_and_truth.pileup")
    region = FileRegion(start_pos=0, end_pos=14460, file_path=pileup_path)

    encoder = HaploidLabelEncoder(exclude_no_coverage_positions=False)
    haploid_labels, _ = encoder(region)

    # Golden labels checked into the test-data folder.
    expected = np.load(os.path.join(get_data_folder(), "sample_haploid_labels.npy"))
    assert haploid_labels.shape == expected.shape
    assert np.allclose(haploid_labels, expected)
Example no. 2
0
def test_counts_correctness():
    """Verify SummaryEncoder pileup counts match the stored reference counts."""
    pileup_path = os.path.join(get_data_folder(), "subreads_and_truth.pileup")
    region = FileRegion(start_pos=0, end_pos=14460, file_path=pileup_path)

    encoder = SummaryEncoder(exclude_no_coverage_positions=False,
                             normalize_counts=True)
    pileup_counts, _ = encoder(region)

    # Golden counts checked into the test-data folder.
    expected = np.load(os.path.join(get_data_folder(), "sample_counts.npy"))
    assert pileup_counts.shape == expected.shape
    assert np.allclose(pileup_counts, expected)
def deletion_variant():
    """Build a heterozygous deletion Variant (CTTTA -> C) backed by the test BAM."""
    bam = os.path.join(get_data_folder(), "some_indels.bam")
    return Variant(
        chrom="1",
        pos=10163457,
        id=None,
        ref='CTTTA',
        allele='C',
        quality=50,
        filter=[],
        info={},
        format=['GT', 'PS', 'DP', 'ADALL', 'AD', 'GQ'],
        samples=[['1/0', None, 177, [0, 0, 0], [0, 0, 0], 160]],
        zygosity=[VariantZygosity.HETEROZYGOUS],
        type=VariantType.DELETION,
        vcf='null.vcf',
        bams=[bam],
    )
def insertion_variant():
    """Build a homozygous insertion Variant (T -> TG) backed by the test BAM."""
    bam = os.path.join(get_data_folder(), "some_indels.bam")
    return Variant(
        chrom="1",
        pos=10122622,
        id="rs57037935",
        ref='T',
        allele='TG',
        quality=50,
        filter=[],
        info={},
        format=['GT', 'PS', 'DP', 'ADALL', 'AD', 'GQ'],
        samples=[['1/1', None, 546, [0, 246], [25, 25], 330]],
        zygosity=[VariantZygosity.HOMOZYGOUS],
        type=VariantType.INSERTION,
        vcf='null.vcf',
        bams=[bam],
    )
def snp_variant():
    """Build a homozygous SNP Variant (T -> A) backed by the small test BAM."""
    bam = os.path.join(get_data_folder(), "small_bam.bam")
    return Variant(
        chrom="1",
        pos=240000,
        id="GL000235",
        ref='T',
        allele='A',
        quality=60,
        filter=None,
        info={'DP': 35, 'AF': 0.0185714},
        format=['GT', 'GQ'],
        samples=[['1/1', '50']],
        zygosity=[VariantZygosity.HOMOZYGOUS],
        type=VariantType.SNP,
        vcf='null.vcf',
        bams=[bam],
    )
Example no. 6
0
def test_simple_vc_infer():
    """Run inference with a checkpointed model and check that exactly one
    zygosity prediction is decoded per variant in the input VCF."""
    # Load checkpointed model and run inference
    test_data_dir = get_data_folder()
    model_dir = os.path.join(test_data_dir, ".test_model")

    # Create neural factory
    nf = nemo.core.NeuralModuleFactory(
        placement=nemo.core.neural_factory.DeviceType.GPU,
        checkpoint_dir=model_dir)

    # Generate dataset
    bam = os.path.join(test_data_dir, "small_bam.bam")
    labels = os.path.join(test_data_dir, "candidates.vcf.gz")
    vcf_bam_tuple = VCFReader.VcfBamPath(vcf=labels, bam=bam, is_fp=False)
    vcf_loader = VCFReader([vcf_bam_tuple])
    test_dataset = ReadPileupDataLoader(ReadPileupDataLoader.Type.TEST,
                                        vcf_loader,
                                        batch_size=32,
                                        shuffle=False)

    # Neural Network
    alexnet = AlexNet(num_input_channels=1, num_output_logits=3)

    # Create inference DAG
    encoding = test_dataset()
    vz = alexnet(encoding=encoding)

    # Invoke the "infer" action.
    results = nf.infer([vz], checkpoint_dir=model_dir, verbose=True)

    # Decode inference results to labels.
    # FIX: accumulate predictions across ALL batches. The original code
    # rebuilt `inferred_zygosity` on every batch, so the final assertion
    # only counted the last batch (and raised NameError when `results`
    # was empty).
    zyg_decoder = ZygosityLabelDecoder()
    inferred_zygosity = []
    for tensor_batches in results:
        for batch in tensor_batches:
            predicted_classes = torch.argmax(batch, dim=1)
            inferred_zygosity.extend(
                zyg_decoder(pred) for pred in predicted_classes)

    assert len(inferred_zygosity) == len(vcf_loader)

    shutil.rmtree(model_dir)
Example no. 7
0
def test_zygosity_encoder():
    """ZygosityLabelEncoder maps NO_VARIANT/HOMOZYGOUS/HETEROZYGOUS to 0/1/2."""
    encoder = ZygosityLabelEncoder()
    bam = os.path.join(get_data_folder(), "small_bam.bam")

    def make_variant(gt, zygosity):
        # Identical SNP record throughout; only the sample genotype and the
        # zygosity label vary between the three cases below.
        return Variant(chrom="1",
                       pos=240000,
                       id="GL000235",
                       ref='T',
                       allele='A',
                       quality=60,
                       filter=None,
                       info={'DP': 35, 'AF': 0.0185714},
                       format=['GT', 'GQ'],
                       samples=[[gt, '50']],
                       zygosity=[zygosity],
                       type=VariantType.SNP,
                       vcf='null.vcf',
                       bams=[bam])

    encoding = encoder(make_variant('1/1', VariantZygosity.HOMOZYGOUS))
    # Since it should return a scalar
    assert encoding.size() == torch.Size([])
    assert encoding == 1

    encoding = encoder(make_variant('0/0', VariantZygosity.NO_VARIANT))
    assert encoding == 0

    encoding = encoder(make_variant('0/1', VariantZygosity.HETEROZYGOUS))
    assert encoding == 2
Example no. 8
0
                        end_pos=14460,
                        file_path=os.path.join(get_data_folder(),
                                               "subreads_and_truth.pileup"))
    encoder = SummaryEncoder(exclude_no_coverage_positions=False,
                             normalize_counts=True)
    pileup_counts, _ = encoder(region)
    correct_counts = np.load(
        os.path.join(get_data_folder(), "sample_counts.npy"))
    assert (pileup_counts.shape == correct_counts.shape)
    assert (np.allclose(pileup_counts, correct_counts))


@pytest.mark.parametrize(
    "start_pos,end_pos,shape,pileup_file,truth_positions",
    [(0, 1,
      (1, 10), os.path.join(get_data_folder(), "subreads_and_truth.pileup"),
      torch.IntTensor([[0, 0]])),
     (1, 4,
      (3, 10), os.path.join(get_data_folder(), "subreads_and_truth.pileup"),
      torch.IntTensor([[1, 0], [2, 0], [3, 0]])),
     (14459, 14460,
      (1, 10), os.path.join(get_data_folder(), "subreads_and_truth.pileup"),
      torch.IntTensor([[14459, 0]])),
     (5, 6,
      (2, 10), os.path.join(get_data_folder(), "subreads_and_truth.pileup"),
      torch.IntTensor([[5, 0], [5, 1]]))],
)
def test_encoder_region_bounds(start_pos, end_pos, shape, pileup_file,
                               truth_positions):
    """Check SummaryEncoder behavior for regions at pileup boundaries.

    NOTE(review): the visible body only constructs the encoder and never
    uses ``start_pos``/``end_pos``/``shape``/``pileup_file``/
    ``truth_positions`` -- the remainder of this test appears to be
    truncated in this excerpt; confirm against the full file.
    """
    encoder = SummaryEncoder(exclude_no_coverage_positions=False,
                             normalize_counts=True)
Example no. 9
0
def test_simple_vc_trainer():
    """Train a sample AlexNet zygosity model for one epoch on test data and
    verify that a checkpoint file is written by the NeMo checkpoint callback.
    """
    # Train a sample model with test data

    # Create neural factory (checkpoints are written under model_dir)
    model_dir = os.path.join(get_data_folder(), ".test_model")
    nf = nemo.core.NeuralModuleFactory(
        placement=nemo.core.neural_factory.DeviceType.GPU,
        checkpoint_dir=model_dir)

    # Generate dataset
    bam = os.path.join(get_data_folder(), "small_bam.bam")
    labels = os.path.join(get_data_folder(), "candidates.vcf.gz")
    # NOTE(review): this constructs VCFReader with vcf=/bams=/is_fp= keywords,
    # unlike the list-of-VcfBamPath form used elsewhere in this file —
    # confirm both call signatures are supported.
    vcf_loader = VCFReader(vcf=labels, bams=[bam], is_fp=False)

    # Neural Network: 1 input channel, 3 output logits (one per zygosity class)
    alexnet = AlexNet(num_input_channels=1, num_output_logits=3)

    # Create train DAG
    dataset_train = ReadPileupDataLoader(ReadPileupDataLoader.Type.TRAIN,
                                         [vcf_loader],
                                         batch_size=32,
                                         shuffle=True)
    vz_ce_loss = CrossEntropyLossNM(logits_ndim=2)
    vz_labels, encoding = dataset_train()
    vz = alexnet(encoding=encoding)
    vz_loss = vz_ce_loss(logits=vz, labels=vz_labels)

    # Create evaluation DAG using same dataset as training
    dataset_eval = ReadPileupDataLoader(ReadPileupDataLoader.Type.EVAL,
                                        [vcf_loader],
                                        batch_size=32,
                                        shuffle=False)
    vz_ce_loss_eval = CrossEntropyLossNM(logits_ndim=2)
    vz_labels_eval, encoding_eval = dataset_eval()
    vz_eval = alexnet(encoding=encoding_eval)
    vz_loss_eval = vz_ce_loss_eval(logits=vz_eval, labels=vz_labels_eval)

    # Logger callback: log loss/outputs/labels every step
    logger_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[vz_loss, vz, vz_labels],
        step_freq=1,
    )

    # Evaluation callback: run the eval DAG every step
    evaluator_callback = nemo.core.EvaluatorCallback(
        eval_tensors=[vz_loss_eval, vz_eval, vz_labels_eval],
        user_iter_callback=eval_iter_callback,
        user_epochs_done_callback=eval_epochs_done_callback,
        eval_step=1,
    )

    # Checkpointing models through NeMo callback
    checkpoint_callback = nemo.core.CheckpointCallback(
        folder=nf.checkpoint_dir,
        load_from_folder=None,
        # Checkpointing frequency in steps (-1 disables step-based saves)
        step_freq=-1,
        # Checkpointing frequency in epochs
        epoch_freq=1,
        # Number of checkpoints to keep
        checkpoints_to_keep=1,
        # If True, CheckpointCallback will raise an Error if restoring fails
        force_load=False)

    # Invoke the "train" action.
    nf.train(
        [vz_loss],
        callbacks=[logger_callback, checkpoint_callback, evaluator_callback],
        optimization_params={
            "num_epochs": 1,
            "lr": 0.001
        },
        optimizer="adam")

    # One epoch with epoch_freq=1 should leave an EPOCH-1 checkpoint behind.
    assert (os.path.exists(os.path.join(model_dir, "AlexNet-EPOCH-1.pt")))