Code Example #1
def test_vcf_load_variant_from_multiple_files(get_created_vcf_tabix_files):
    """Get variants from multiple mocked VCF files.
    """
    vcf_file_path, tabix_file_path = get_created_vcf_tabix_files(mock_file_input())
    vcf_reader = VCFReader(vcf=vcf_file_path, bams=[], is_fp=False)
    vcf_reader_2x = VCFReader(vcf=vcf_file_path, bams=[], is_fp=False)
    assert (len(vcf_reader) == len(vcf_reader_2x))
Code Example #2
File: test_vcfio.py Project: rahulmohan/VariantWorks
def test_vcf_load_variant_from_multiple_files(get_created_vcf_tabix_files):
    """Get variants from multiple mocked VCF files.
    """
    vcf_file_path, tabix_file_path = get_created_vcf_tabix_files(mock_file_input())
    first_vcf_bam_tuple = VCFReader.VcfBamPath(vcf=vcf_file_path, bam=tabix_file_path, is_fp=False)
    second_vcf_bam_tuple = VCFReader.VcfBamPath(vcf=vcf_file_path, bam=tabix_file_path, is_fp=False)
    vcf_loader = VCFReader([first_vcf_bam_tuple])
    vcf_loader_2x = VCFReader([first_vcf_bam_tuple, second_vcf_bam_tuple])
    assert (2 * len(vcf_loader) == len(vcf_loader_2x))
Code Example #3
def generate_hdf5(args):
    """Serialize encodings to HDF5.

    Generate encodings in multiprocess loop and save tensors to HDF5.
    """
    # Get list of files from arguments
    # and generate the variant entries using VCF reader.
    bam = args.bam
    vcf_readers = []
    for tp_file in args.tp_files:
        vcf_readers.append(VCFReader(vcf=tp_file, bams=[bam], is_fp=False))
    for fp_file in args.fp_files:
        vcf_readers.append(VCFReader(vcf=fp_file, bams=[bam], is_fp=True))
    total_labels = sum([len(reader) for reader in vcf_readers])

    # Setup encoder for samples and labels.
    sample_encoder = PileupEncoder(
        window_size=100,
        max_reads=100,
        layers=[PileupEncoder.Layer.READ, PileupEncoder.Layer.BASE_QUALITY])
    label_encoder = ZygosityLabelEncoder()

    encode_func = partial(encode, sample_encoder, label_encoder)

    # Create HDF5 datasets.
    h5_file = h5py.File(args.output_file, "w")
    encoded_data = h5_file.create_dataset(
        "encodings",
        shape=(total_labels, sample_encoder.depth, sample_encoder.height,
               sample_encoder.width),
        dtype=np.float32,
        fillvalue=0)
    label_data = h5_file.create_dataset("labels",
                                        shape=(total_labels, ),
                                        dtype=np.int64,
                                        fillvalue=0)

    pool = mp.Pool(args.threads)
    print("Serializing {} entries...".format(total_labels))
    # Keep one running index across all readers so entries from later
    # readers do not overwrite earlier ones.
    label_idx = 0
    for vcf_reader in vcf_readers:
        for out in pool.imap(encode_func, vcf_reader):
            if label_idx % 1000 == 0:
                print("Saved {} entries".format(label_idx))
            encoding, label = out
            encoded_data[label_idx] = encoding
            label_data[label_idx] = label
            label_idx += 1
    print("Saved {} entries".format(total_labels))

    h5_file.close()
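To sanity-check the serialized file, the two datasets can be read back with h5py. A minimal sketch, assuming the output written above (the filename is a placeholder for args.output_file):

import h5py

with h5py.File("encodings.hdf5", "r") as h5_file:
    encodings = h5_file["encodings"]  # (total_labels, depth, height, width), float32
    labels = h5_file["labels"]        # (total_labels,), int64
    assert encodings.shape[0] == labels.shape[0]
    print("Read back {} entries".format(labels.shape[0]))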
Code Example #4
def test_vcf_reader_to_df(get_created_vcf_tabix_files):
    """Get all variants from parsed file into dataframe.
    """
    vcf_file_path, tabix_file_path = get_created_vcf_tabix_files(mock_file_input())
    vcf_reader = VCFReader(vcf=vcf_file_path, bams=[], is_fp=False)
    df = vcf_reader.dataframe
    assert(len(vcf_reader) == len(df))
Code Example #5
File: test_vcfio.py Project: tijyojwad/VariantWorks
def get_invalid_vcf(mp, vcf_bam_list):
    with mp.context() as m:
        # Mock vcf.Reader.__init__() return value
        m.setattr(vcf.Reader, "__init__",
                  MockPyVCFReader.new_bad_vcf_reader_init)
        vcf_loader = VCFReader(vcf_bam_list)
    return vcf_loader
Code Example #6
File: test_vcfio.py Project: rahulmohan/VariantWorks
def test_vcf_loader_snps(get_created_vcf_tabix_files):
    """Get all variants from mocked file stream, filter SNPs, multi allele & multi samples
    """
    vcf_file_path, tabix_file_path = get_created_vcf_tabix_files(mock_file_input())
    vcf_bam_tuple = VCFReader.VcfBamPath(vcf=vcf_file_path, bam=tabix_file_path, is_fp=False)
    vcf_loader = VCFReader([vcf_bam_tuple])
    assert(len(vcf_loader) == 13)
Code Example #7
def test_vcf_outputting(monkeypatch):
    """Write inference output into vcf files
    """
    first_vcf_bam_tuple = VCFReader.VcfBamPath(vcf="/dummy/path1.gz",
                                               bam="temp.bam",
                                               is_fp=False)
    second_vcf_bam_tuple = VCFReader.VcfBamPath(vcf="/dummy/path2.gz",
                                                bam="temp.bam",
                                                is_fp=False)
    with monkeypatch.context() as mp:
        mp.setattr(vcf.Reader, "__init__", MockPyVCFReader.new_vcf_reader_init)
        vcf_loader = VCFReader([first_vcf_bam_tuple, second_vcf_bam_tuple])
    inferred_results = [
        VariantZygosity.HOMOZYGOUS, VariantZygosity.HOMOZYGOUS,
        VariantZygosity.HETEROZYGOUS, VariantZygosity.HETEROZYGOUS,
        VariantZygosity.HOMOZYGOUS, VariantZygosity.HETEROZYGOUS
    ]
    assert (len(inferred_results) == len(vcf_loader))
    with monkeypatch.context() as mp:
        mp.setattr(vcf.Reader, "__init__", MockPyVCFReader.new_vcf_reader_init)
        result_writer = VCFResultWriter(vcf_loader, inferred_results)
        result_writer.write_output()
    # Validate the output file format and make sure the genotype written for each record matches the network output
    i = 0
    for f in ['inferred_path1.vcf', 'inferred_path2.vcf']:
        vcf_reader = vcf.Reader(
            filename=os.path.join(result_writer.output_location, f))
        for record in vcf_reader:
            assert (record.samples[0]['GT'] == result_writer.
                    zygosity_to_vcf_genotype[inferred_results[i]])
            i += 1
    assert (i == 6)
    # Clean up files
    shutil.rmtree(result_writer.output_location)
Code Example #8
File: test_vcfio.py Project: rahulmohan/VariantWorks
def test_load_vcf_content_with_wrong_format(get_created_vcf_tabix_files):
    """ parse vcf file with wrong format
    """
    vcf_file_path, tabix_file_path = get_created_vcf_tabix_files(mock_invalid_file_input())
    vcf_bam_tuple = VCFReader.VcfBamPath(vcf=vcf_file_path, bam=tabix_file_path, is_fp=False)
    with pytest.raises(RuntimeError):
        VCFReader([vcf_bam_tuple])
Code Example #9
def test_vcf_outputting(get_created_vcf_tabix_files):
    """Write inference output into vcf files
    """
    first_vcf_file_path, first_tabix_file_path = get_created_vcf_tabix_files(mock_small_filtered_file_input())
    vcf_loader = VCFReader(vcf=first_vcf_file_path, bams=[], is_fp=False)

    inferred_results = [VariantZygosity.HOMOZYGOUS, VariantZygosity.HOMOZYGOUS, VariantZygosity.HETEROZYGOUS]
    assert (len(inferred_results) == len(vcf_loader))

    result_writer = VCFResultWriter(vcf_loader, inferred_results)
    result_writer.write_output()

    # Validate the output file format and make sure the genotype written for each record matches the network output
    first_output_file_name = \
        '{}_{}.{}'.format("inferred", "".join(os.path.basename(first_vcf_file_path).split('.')[0:-2]), 'vcf')
    i = 0
    for f in [first_output_file_name]:
        vcf_reader = vcf.Reader(filename=os.path.join(
            result_writer.output_location, f))
        for record in vcf_reader:
            assert(record.samples[0]['GT'] == result_writer.zygosity_to_vcf_genotype[inferred_results[i]])
            i += 1
    assert (i == 3)
    # Clean up files
    shutil.rmtree(result_writer.output_location)
Code Example #10
def test_vcf_outputting(get_created_vcf_tabix_files):
    """Write inference output into vcf files
    """
    orig_vcf_file_path, orig_vcf_tabix = get_created_vcf_tabix_files(mock_small_filtered_file_input())
    vcf_reader = VCFReader(orig_vcf_file_path,
                           bams=[],
                           is_fp=False,
                           format_keys=["*"],
                           info_keys=["*"],
                           filter_keys=["*"],
                           sort=True)

    inferred_results = [int(VariantZygosity.NO_VARIANT),
                        int(VariantZygosity.NO_VARIANT),
                        int(VariantZygosity.NO_VARIANT)]
    assert (len(inferred_results) == len(vcf_reader))

    input_vcf_df = vcf_reader.dataframe
    gt_col = "{}_GT".format(vcf_reader.samples[0])
    assert(gt_col in input_vcf_df)

    # Update GT column data
    input_vcf_df[gt_col] = inferred_results

    output_path = '{}_{}.{}'.format("inferred", "".join(os.path.basename(orig_vcf_file_path).split('.')[0:-2]), 'vcf')
    vcf_writer = VCFWriter(input_vcf_df, output_path=output_path, sample_names=vcf_reader.samples)
    vcf_writer.write_output(input_vcf_df)

    # Tabix index output file
    with open(output_path, "rb") as in_file:
        data = in_file.read()
    indexed_output_file_path, _ = get_created_vcf_tabix_files(data)

    # Validate the output file format and make sure the genotype written for each record matches the network output
    vcf_reader_updated = VCFReader(indexed_output_file_path,
                                   is_fp=False,
                                   format_keys=["*"],
                                   info_keys=["*"],
                                   filter_keys=["*"],
                                   sort=True)
    assert(len(vcf_reader) == len(vcf_reader_updated))
    for i, record in enumerate(vcf_reader_updated):
        assert(record.zygosity[0] == inferred_results[i])

    # Clean up files
    os.remove(output_path)
Code Example #11
File: test_vcfio.py Project: rahulmohan/VariantWorks
def test_vcf_load_fp(get_created_vcf_tabix_files):
    """Get first variant from false positive mocked VCF file stream and check zygosity.
    """
    vcf_file_path, tabix_file_path = get_created_vcf_tabix_files(mock_file_input())
    vcf_bam_tuple = VCFReader.VcfBamPath(vcf=vcf_file_path, bam=tabix_file_path, is_fp=True)
    vcf_loader = VCFReader([vcf_bam_tuple])
    for v in vcf_loader:
        assert(v.zygosity == VariantZygosity.NO_VARIANT)
Code Example #12
def test_vcf_load_fp(get_created_vcf_tabix_files):
    """Get first variant from false positive mocked VCF file stream and check zygosity.
    """
    vcf_file_path, tabix_file_path = get_created_vcf_tabix_files(mock_file_input())
    vcf_reader = VCFReader(vcf=vcf_file_path, bams=[], is_fp=True, format_keys=["GT"])
    for v in vcf_reader:
        for i in range(len(v.samples)):
            assert(v.zygosity[i] == VariantZygosity.NO_VARIANT)
Code Example #13
def test_vcf_fetch_variant(get_created_vcf_tabix_files):
    """Get first variant from mocked VCF file stream.
    """
    vcf_file_path, tabix_file_path = get_created_vcf_tabix_files(mock_file_input())
    vcf_reader = VCFReader(vcf=vcf_file_path, bams=[], is_fp=False)
    try:
        assert (type(vcf_reader[0]) == Variant)
    except IndexError:
        pytest.fail("Can not retrieve first element from VCFReader")
Code Example #14
def generate_hdf5(args):
    """Serialize encodings to HDF5.

    Generate encodings in multiprocess loop and save tensors to HDF5.
    """
    # Get list of files from arguments.
    bam = args.bam
    file_list = []
    for tp_file in args.tp_files:
        file_list.append(VCFReader.VcfBamPath(
            vcf=tp_file, bam=bam, is_fp=False))
    for fp_file in args.fp_files:
        file_list.append(VCFReader.VcfBamPath(
            vcf=fp_file, bam=bam, is_fp=True))

    # Generate the variant entries using VCF reader.
    vcf_reader = VCFReader(file_list)

    # Setup encoder for samples and labels.
    sample_encoder = PileupEncoder(window_size=100, max_reads=100,
                                   layers=[PileupEncoder.Layer.READ, PileupEncoder.Layer.BASE_QUALITY])
    label_encoder = ZygosityLabelEncoder()

    encode_func = partial(encode, sample_encoder, label_encoder)

    # Create HDF5 datasets.
    h5_file = h5py.File(args.output_file, "w")
    encoded_data = h5_file.create_dataset("encodings",
                                          shape=(len(vcf_reader), sample_encoder.depth,
                                                 sample_encoder.height, sample_encoder.width),
                                          dtype=np.float32, fillvalue=0)
    label_data = h5_file.create_dataset("labels",
                                        shape=(len(vcf_reader),), dtype=np.int64, fillvalue=0)

    pool = mp.Pool(args.threads)
    print("Serializing {} entries...".format(len(vcf_reader)))
    for i, out in enumerate(pool.imap(encode_func, vcf_reader)):
        if i % 1000 == 0:
            print("Saved {} entries".format(i))
        encoding, label = out
        encoded_data[i] = encoding
        label_data[i] = label
    print("Saved {} entries".format(len(vcf_reader)))

    h5_file.close()
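A hypothetical invocation of generate_hdf5; the Namespace fields mirror exactly the attributes the function reads (bam, tp_files, fp_files, output_file, threads), and every path is a placeholder:

from argparse import Namespace

args = Namespace(bam="sample.bam",
                 tp_files=["true_positives.vcf.gz"],
                 fp_files=["false_positives.vcf.gz"],
                 output_file="encodings.hdf5",
                 threads=4)
generate_hdf5(args)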
Code Example #15
def test_simple_vc_infer():
    # Load checkpointed model and run inference
    test_data_dir = get_data_folder()
    model_dir = os.path.join(test_data_dir, ".test_model")

    # Create neural factory
    nf = nemo.core.NeuralModuleFactory(
        placement=nemo.core.neural_factory.DeviceType.GPU,
        checkpoint_dir=model_dir)

    # Generate dataset
    bam = os.path.join(test_data_dir, "small_bam.bam")
    labels = os.path.join(test_data_dir, "candidates.vcf.gz")
    vcf_bam_tuple = VCFReader.VcfBamPath(vcf=labels, bam=bam, is_fp=False)
    vcf_loader = VCFReader([vcf_bam_tuple])
    test_dataset = ReadPileupDataLoader(ReadPileupDataLoader.Type.TEST,
                                        vcf_loader,
                                        batch_size=32,
                                        shuffle=False)

    # Neural Network
    alexnet = AlexNet(num_input_channels=1, num_output_logits=3)

    # Create inference DAG
    encoding = test_dataset()
    vz = alexnet(encoding=encoding)

    # Invoke the "train" action.
    results = nf.infer([vz], checkpoint_dir=model_dir, verbose=True)

    # Decode inference results to labels
    zyg_decoder = ZygosityLabelDecoder()
    for tensor_batches in results:
        for batch in tensor_batches:
            predicted_classes = torch.argmax(batch, dim=1)
            inferred_zygosity = [
                zyg_decoder(pred) for pred in predicted_classes
            ]

    assert (len(inferred_zygosity) == len(vcf_loader))

    shutil.rmtree(model_dir)
Code Example #16
File: io.py Project: ayushdg/GPU-GWAS
def load_vcf_variantworks(
    vcf_file=None,
    num_threads=os.cpu_count(),
    require_genotype=True,
    info_keys=None,
    format_keys=None,
):
    try:
        from variantworks.io.vcfio import VCFReader
    except ImportError:
        print(
            "Install VariantWorks from https://github.com/clara-parabricks/VariantWorks"
        )
        # Re-raise so execution does not continue with VCFReader undefined.
        raise

    vcf = VCFReader(
        vcf_file,
        num_threads=num_threads,
        require_genotype=require_genotype,
        info_keys=info_keys,
        format_keys=format_keys,
    )
    vcf_df = vcf.dataframe

    vcf_df_2 = _transform_df(
        vcf_df,
        sample_key_cols=list(vcf_df.columns[14:]),
        common_key_cols=list(vcf_df.columns[7:14]),
        common_cols=list(vcf_df.columns[0:7]),
        drop_cols=[
            "id",
            "variant_type",
            "AC-1",
            "AC-2",
            "AF-1",
            "AF-2",
            "end_pos",
        ],
    )

    return vcf_df_2
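A hypothetical call to the helper above; the file path is a placeholder, and AC and AF are requested so that the AC-1/AC-2/AF-1/AF-2 columns dropped inside _transform_df actually exist:

df = load_vcf_variantworks(vcf_file="cohort.vcf.gz",
                           num_threads=4,
                           require_genotype=True,
                           info_keys=["AC", "AF"],
                           format_keys=["GT"])
print(df.head())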
Code Example #17
zyg_encoder = ZygosityLabelEncoder()

# Create neural network that receives 2 channel inputs (encoding layers defined above)
# and outputs a logit over three classes (no variant, homozygous variant, heterozygous variant).
model = AlexNet(num_input_channels=len(encoding_layers), num_output_logits=3)

# Get datasets to train on.
# NOTE: To train a neural network well, the model needs to see samples from all types of classes.
# The example here shows a file that has true variants (either homozygous or heterozygous),
# but in practice one also needs to pass a set of false positive samples so the model can learn to
# ignore them. False positive samples can be marked with `is_fp` so the reader can appropriately
# assign their variant types; a hypothetical false-positive reader is sketched below.
data_folder = os.path.join(repo_root_dir, "tests", "data")
bam = os.path.join(data_folder, "small_bam.bam")
samples = os.path.join(data_folder, "candidates.vcf.gz")
vcf_loader = VCFReader(vcf=samples, bams=[bam], is_fp=False)
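
# Hypothetical false-positive counterpart illustrating the NOTE above (the path
# is a placeholder): a reader created with is_fp=True would be passed to the
# data loader together with vcf_loader so the model also sees negative samples.
# fp_samples = os.path.join(data_folder, "false_positives.vcf.gz")
# fp_loader = VCFReader(vcf=fp_samples, bams=[bam], is_fp=True)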

# Create a data loader with custom sample and label encoder.
dataset_train = ReadPileupDataLoader(ReadPileupDataLoader.Type.TRAIN,
                                     [vcf_loader],
                                     batch_size=32,
                                     shuffle=True,
                                     sample_encoder=pileup_encoder,
                                     label_encoder=zyg_encoder)

# Use CrossEntropyLoss to train.
vz_ce_loss = nemo.backends.pytorch.common.losses.CrossEntropyLossNM(
    logits_ndim=2)

# Create NeMo training DAG.
vz_labels, encoding = dataset_train()
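The training DAG above stops after unpacking the dataset outputs. A sketch of the remaining wiring, mirroring the loss and training pattern of Code Example #19 and assuming the NeuralModuleFactory nf has been created as shown there:

vz = model(encoding=encoding)
vz_loss = vz_ce_loss(logits=vz, labels=vz_labels)

# Invoke the "train" action (nf is assumed, as in Code Example #19).
nf.train([vz_loss],
         callbacks=[],
         optimization_params={"num_epochs": 1, "lr": 0.001},
         optimizer="adam")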
Code Example #18
encoding_layers = [PileupEncoder.Layer.READ, PileupEncoder.Layer.BASE_QUALITY]
pileup_encoder = PileupEncoder(window_size=100,
                               max_reads=100,
                               layers=encoding_layers)

# Neural Network
model = AlexNet(num_input_channels=len(encoding_layers), num_output_logits=3)

# Similar to training, a dataloader needs to be setup for the relevant datasets. In the case of
# inference, it doesn't matter if the files are tagged as false positive or not. Each example will be
# evaluated by the network. For simplicity this example uses the same dataset as training.
# Note: No label encoder is required in inference.
data_folder = os.path.join(repo_root_dir, "tests", "data")
bam = os.path.join(data_folder, "small_bam.bam")
labels = os.path.join(data_folder, "candidates.vcf.gz")
vcf_loader = VCFReader(vcf=labels, bams=[bam], is_fp=False)
test_dataset = ReadPileupDataLoader(ReadPileupDataLoader.Type.TEST,
                                    [vcf_loader],
                                    batch_size=32,
                                    shuffle=False,
                                    sample_encoder=pileup_encoder)

# Create inference DAG
encoding = test_dataset()
vz = model(encoding=encoding)

# Invoke the "infer" action.
results = nf.infer([vz], checkpoint_dir="./", verbose=True)

# Instantiate a decoder that converts the predicted output of the network to
# a zygosity enum.
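The example ends before the decoder is used. A minimal continuation, following the same decoding loop as Code Example #15 (it assumes torch is imported):

zyg_decoder = ZygosityLabelDecoder()
for tensor_batches in results:
    for batch in tensor_batches:
        predicted_classes = torch.argmax(batch, dim=1)
        inferred_zygosity = [zyg_decoder(pred) for pred in predicted_classes]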
Code Example #19
def test_simple_vc_trainer():
    # Train a sample model with test data

    # Create neural factory
    model_dir = os.path.join(get_data_folder(), ".test_model")
    nf = nemo.core.NeuralModuleFactory(
        placement=nemo.core.neural_factory.DeviceType.GPU,
        checkpoint_dir=model_dir)

    # Generate dataset
    bam = os.path.join(get_data_folder(), "small_bam.bam")
    labels = os.path.join(get_data_folder(), "candidates.vcf.gz")
    vcf_loader = VCFReader(vcf=labels, bams=[bam], is_fp=False)

    # Neural Network
    alexnet = AlexNet(num_input_channels=1, num_output_logits=3)

    # Create train DAG
    dataset_train = ReadPileupDataLoader(ReadPileupDataLoader.Type.TRAIN,
                                         [vcf_loader],
                                         batch_size=32,
                                         shuffle=True)
    vz_ce_loss = CrossEntropyLossNM(logits_ndim=2)
    vz_labels, encoding = dataset_train()
    vz = alexnet(encoding=encoding)
    vz_loss = vz_ce_loss(logits=vz, labels=vz_labels)

    # Create evaluation DAG using same dataset as training
    dataset_eval = ReadPileupDataLoader(ReadPileupDataLoader.Type.EVAL,
                                        [vcf_loader],
                                        batch_size=32,
                                        shuffle=False)
    vz_ce_loss_eval = CrossEntropyLossNM(logits_ndim=2)
    vz_labels_eval, encoding_eval = dataset_eval()
    vz_eval = alexnet(encoding=encoding_eval)
    vz_loss_eval = vz_ce_loss_eval(logits=vz_eval, labels=vz_labels_eval)

    # Logger callback
    logger_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[vz_loss, vz, vz_labels],
        step_freq=1,
    )

    evaluator_callback = nemo.core.EvaluatorCallback(
        eval_tensors=[vz_loss_eval, vz_eval, vz_labels_eval],
        user_iter_callback=eval_iter_callback,
        user_epochs_done_callback=eval_epochs_done_callback,
        eval_step=1,
    )

    # Checkpointing models through NeMo callback
    checkpoint_callback = nemo.core.CheckpointCallback(
        folder=nf.checkpoint_dir,
        load_from_folder=None,
        # Checkpointing frequency in steps
        step_freq=-1,
        # Checkpointing frequency in epochs
        epoch_freq=1,
        # Number of checkpoints to keep
        checkpoints_to_keep=1,
        # If True, CheckpointCallback will raise an Error if restoring fails
        force_load=False)

    # Invoke the "train" action.
    nf.train(
        [vz_loss],
        callbacks=[logger_callback, checkpoint_callback, evaluator_callback],
        optimization_params={
            "num_epochs": 1,
            "lr": 0.001
        },
        optimizer="adam")

    assert (os.path.exists(os.path.join(model_dir, "AlexNet-EPOCH-1.pt")))
Code Example #20
"""Sample showing utilization of VCFReader to generate dataframe."""

import os
import pandas as pd
import time

from variantworks.io.vcfio import VCFReader

pd.set_option('display.max_columns', 100)

sample_folder = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
repo_root_folder = os.path.dirname(sample_folder)
tests_data_folder = os.path.join(repo_root_folder, "tests", "data")
test_vcf_file = os.path.join(tests_data_folder, "candidates_multisample.vcf.gz")

t = time.time()
reader = VCFReader(test_vcf_file,
                   bams=[],
                   tags={"custom_tag": 1},
                   info_keys=["AF"],
                   filter_keys=[],
                   format_keys=[],
                   num_threads=4,
                   regions=[],
                   require_genotype=True,
                   sort=True)
read_time = time.time() - t
print(reader.dataframe)

print("Elapsed time for reading VCF (seconds): ", read_time)
Code Example #21
def test_vcf_reader(get_created_vcf_tabix_files):
    """Get all variants from mocked file stream, filter SNPs, multi allele & multi samples
    """
    vcf_file_path, tabix_file_path = get_created_vcf_tabix_files(mock_file_input())
    vcf_reader = VCFReader(vcf_file_path, bams=[], is_fp=False)
    assert(len(vcf_reader) == 17)
Code Example #22
"""Sample showing utilization of VCFReader to generate dataframe."""

import os
import pandas as pd

from variantworks.io.vcfio import VCFReader, VCFWriter

pd.set_option('display.max_columns', 100)

cwd = os.path.dirname(os.path.realpath(__file__))
sample_folder = os.path.dirname(cwd)
repo_root_folder = os.path.dirname(sample_folder)
tests_data_folder = os.path.join(repo_root_folder, "tests", "data")
test_vcf_file = os.path.join(tests_data_folder, "candidates_multisample.vcf.gz")

reader = VCFReader(test_vcf_file,
                   bams=[],
                   tags={"custom_tag": 1},
                   info_keys=["*"],
                   filter_keys=["*"],
                   format_keys=["*"],
                   num_threads=4,
                   regions=[],
                   require_genotype=False,
                   sort=True,
                   unbounded_val_max_cols=2)
print(reader.dataframe)

writer = VCFWriter(reader.dataframe, os.path.join(cwd, "test_out.vcf"), sample_names=reader.samples, num_threads=4)
writer.write_output(reader.dataframe)