def test_vcf_load_variant_from_multiple_files(get_created_vcf_tabix_files):
    """Get variants from multiple mocked VCF files."""
    vcf_file_path, tabix_file_path = get_created_vcf_tabix_files(mock_file_input())
    vcf_reader = VCFReader(vcf=vcf_file_path, bams=[], is_fp=False)
    vcf_reader_2x = VCFReader(vcf=vcf_file_path, bams=[], is_fp=False)
    assert (len(vcf_reader) == len(vcf_reader_2x))
def test_vcf_load_variant_from_multiple_files(get_created_vcf_tabix_files):
    """Get variants from multiple mocked VCF files."""
    vcf_file_path, tabix_file_path = get_created_vcf_tabix_files(mock_file_input())
    first_vcf_bam_tuple = VCFReader.VcfBamPath(vcf=vcf_file_path, bam=tabix_file_path, is_fp=False)
    second_vcf_bam_tuple = VCFReader.VcfBamPath(vcf=vcf_file_path, bam=tabix_file_path, is_fp=False)
    vcf_loader = VCFReader([first_vcf_bam_tuple])
    vcf_loader_2x = VCFReader([first_vcf_bam_tuple, second_vcf_bam_tuple])
    assert (2 * len(vcf_loader) == len(vcf_loader_2x))
def generate_hdf5(args):
    """Serialize encodings to HDF5.

    Generate encodings in multiprocess loop and save tensors to HDF5.
    """
    # Get list of files from arguments
    # and generate the variant entries using VCF reader.
    bam = args.bam
    vcf_readers = []
    for tp_file in args.tp_files:
        vcf_readers.append(VCFReader(vcf=tp_file, bams=[bam], is_fp=False))
    for fp_file in args.fp_files:
        vcf_readers.append(VCFReader(vcf=fp_file, bams=[bam], is_fp=True))
    total_labels = sum([len(reader) for reader in vcf_readers])

    # Setup encoder for samples and labels.
    sample_encoder = PileupEncoder(window_size=100, max_reads=100,
                                   layers=[PileupEncoder.Layer.READ, PileupEncoder.Layer.BASE_QUALITY])
    label_encoder = ZygosityLabelEncoder()

    encode_func = partial(encode, sample_encoder, label_encoder)

    # Create HDF5 datasets.
    h5_file = h5py.File(args.output_file, "w")
    encoded_data = h5_file.create_dataset("encodings",
                                          shape=(total_labels, sample_encoder.depth,
                                                 sample_encoder.height, sample_encoder.width),
                                          dtype=np.float32, fillvalue=0)
    label_data = h5_file.create_dataset("labels", shape=(total_labels,), dtype=np.int64, fillvalue=0)

    pool = mp.Pool(args.threads)
    print("Serializing {} entries...".format(total_labels))
    # Use a single running index across all readers so entries from later readers
    # do not overwrite earlier ones in the HDF5 datasets.
    label_idx = 0
    for vcf_reader in vcf_readers:
        for out in pool.imap(encode_func, vcf_reader):
            if label_idx % 1000 == 0:
                print("Saved {} entries".format(label_idx))
            encoding, label = out
            encoded_data[label_idx] = encoding
            label_data[label_idx] = label
            label_idx += 1
    print("Saved {} entries".format(total_labels))
    h5_file.close()
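# Hedged sketch, not taken from the original file: generate_hdf5 above assumes an
# `encode` helper that is bound with functools.partial and applied to each variant
# yielded by the VCF reader. A minimal version, assuming both encoders are callable
# on a single variant entry, might look like this:
def encode(sample_encoder, label_encoder, variant):
    """Return the (pileup encoding, zygosity label) pair for one variant."""
    encoding = sample_encoder(variant)
    label = label_encoder(variant)
    return encoding, label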
def test_vcf_reader_to_df(get_created_vcf_tabix_files):
    """Get all variants from parsed file into dataframe."""
    vcf_file_path, tabix_file_path = get_created_vcf_tabix_files(mock_file_input())
    vcf_reader = VCFReader(vcf=vcf_file_path, bams=[], is_fp=False)
    df = vcf_reader.dataframe
    assert(len(vcf_reader) == len(df))
def get_invalid_vcf(mp, vcf_bam_list):
    with mp.context() as m:
        # Mock vcf.Reader.__init__() return value
        m.setattr(vcf.Reader, "__init__", MockPyVCFReader.new_bad_vcf_reader_init)
        vcf_loader = VCFReader(vcf_bam_list)
    return vcf_loader
def test_vcf_loader_snps(get_created_vcf_tabix_files):
    """Get all variants from mocked file stream, filter SNPs, multi allele & multi samples."""
    vcf_file_path, tabix_file_path = get_created_vcf_tabix_files(mock_file_input())
    vcf_bam_tuple = VCFReader.VcfBamPath(vcf=vcf_file_path, bam=tabix_file_path, is_fp=False)
    vcf_loader = VCFReader([vcf_bam_tuple])
    assert(len(vcf_loader) == 13)
def test_vcf_outputting(monkeypatch):
    """Write inference output into VCF files."""
    first_vcf_bam_tuple = VCFReader.VcfBamPath(vcf="/dummy/path1.gz", bam="temp.bam", is_fp=False)
    second_vcf_bam_tuple = VCFReader.VcfBamPath(vcf="/dummy/path2.gz", bam="temp.bam", is_fp=False)
    with monkeypatch.context() as mp:
        mp.setattr(vcf.Reader, "__init__", MockPyVCFReader.new_vcf_reader_init)
        vcf_loader = VCFReader([first_vcf_bam_tuple, second_vcf_bam_tuple])

    inferred_results = [VariantZygosity.HOMOZYGOUS, VariantZygosity.HOMOZYGOUS, VariantZygosity.HETEROZYGOUS,
                        VariantZygosity.HETEROZYGOUS, VariantZygosity.HOMOZYGOUS, VariantZygosity.HETEROZYGOUS]
    assert (len(inferred_results) == len(vcf_loader))

    with monkeypatch.context() as mp:
        mp.setattr(vcf.Reader, "__init__", MockPyVCFReader.new_vcf_reader_init)
        result_writer = VCFResultWriter(vcf_loader, inferred_results)
        result_writer.write_output()

    # Validate the output file format and make sure the output genotype for each record
    # matches the network output.
    i = 0
    for f in ['inferred_path1.vcf', 'inferred_path2.vcf']:
        vcf_reader = vcf.Reader(filename=os.path.join(result_writer.output_location, f))
        for record in vcf_reader:
            assert (record.samples[0]['GT'] == result_writer.zygosity_to_vcf_genotype[inferred_results[i]])
            i += 1
    assert (i == 6)

    # Clean up files
    shutil.rmtree(result_writer.output_location)
def test_load_vcf_content_with_wrong_format(get_created_vcf_tabix_files):
    """Parse a VCF file with wrong format."""
    vcf_file_path, tabix_file_path = get_created_vcf_tabix_files(mock_invalid_file_input())
    vcf_bam_tuple = VCFReader.VcfBamPath(vcf=vcf_file_path, bam=tabix_file_path, is_fp=False)
    with pytest.raises(RuntimeError):
        VCFReader([vcf_bam_tuple])
def test_vcf_outputting(get_created_vcf_tabix_files):
    """Write inference output into VCF files."""
    first_vcf_file_path, first_tabix_file_path = get_created_vcf_tabix_files(mock_small_filtered_file_input())
    vcf_loader = VCFReader(vcf=first_vcf_file_path, bams=[], is_fp=False)
    inferred_results = [VariantZygosity.HOMOZYGOUS, VariantZygosity.HOMOZYGOUS, VariantZygosity.HETEROZYGOUS]
    assert (len(inferred_results) == len(vcf_loader))

    result_writer = VCFResultWriter(vcf_loader, inferred_results)
    result_writer.write_output()

    # Validate the output file format and make sure the output genotype for each record
    # matches the network output.
    first_output_file_name = '{}_{}.{}'.format(
        "inferred", "".join(os.path.basename(first_vcf_file_path).split('.')[0:-2]), 'vcf')
    i = 0
    for f in [first_output_file_name]:
        vcf_reader = vcf.Reader(filename=os.path.join(result_writer.output_location, f))
        for record in vcf_reader:
            assert(record.samples[0]['GT'] == result_writer.zygosity_to_vcf_genotype[inferred_results[i]])
            i += 1
    assert (i == 3)

    # Clean up files
    shutil.rmtree(result_writer.output_location)
def test_vcf_outputting(get_created_vcf_tabix_files):
    """Write inference output into VCF files."""
    orig_vcf_file_path, orig_vcf_tabix = get_created_vcf_tabix_files(mock_small_filtered_file_input())
    vcf_reader = VCFReader(orig_vcf_file_path, bams=[], is_fp=False,
                           format_keys=["*"], info_keys=["*"], filter_keys=["*"], sort=True)
    inferred_results = [int(VariantZygosity.NO_VARIANT), int(VariantZygosity.NO_VARIANT), int(VariantZygosity.NO_VARIANT)]
    assert (len(inferred_results) == len(vcf_reader))

    input_vcf_df = vcf_reader.dataframe
    gt_col = "{}_GT".format(vcf_reader.samples[0])
    assert(gt_col in input_vcf_df)

    # Update GT column data
    input_vcf_df[gt_col] = inferred_results

    output_path = '{}_{}.{}'.format(
        "inferred", "".join(os.path.basename(orig_vcf_file_path).split('.')[0:-2]), 'vcf')
    vcf_writer = VCFWriter(input_vcf_df, output_path=output_path, sample_names=vcf_reader.samples)
    vcf_writer.write_output(input_vcf_df)

    # Tabix index output file
    with open(output_path, "rb") as in_file:
        data = in_file.read()
    indexed_output_file_path, _ = get_created_vcf_tabix_files(data)

    # Validate the output file format and make sure the output genotype for each record
    # matches the network output.
    vcf_reader_updated = VCFReader(indexed_output_file_path, is_fp=False,
                                   format_keys=["*"], info_keys=["*"], filter_keys=["*"], sort=True)
    assert(len(vcf_reader) == len(vcf_reader_updated))
    for i, record in enumerate(vcf_reader_updated):
        assert(record.zygosity[0] == inferred_results[i])

    # Clean up files
    os.remove(output_path)
def test_vcf_load_fp(get_created_vcf_tabix_files):
    """Load variants from a false positive mocked VCF file stream and check zygosity."""
    vcf_file_path, tabix_file_path = get_created_vcf_tabix_files(mock_file_input())
    vcf_bam_tuple = VCFReader.VcfBamPath(vcf=vcf_file_path, bam=tabix_file_path, is_fp=True)
    vcf_loader = VCFReader([vcf_bam_tuple])
    for v in vcf_loader:
        assert(v.zygosity == VariantZygosity.NO_VARIANT)
def test_vcf_load_fp(get_created_vcf_tabix_files):
    """Load variants from a false positive mocked VCF file stream and check zygosity."""
    vcf_file_path, tabix_file_path = get_created_vcf_tabix_files(mock_file_input())
    vcf_reader = VCFReader(vcf=vcf_file_path, bams=[], is_fp=True, format_keys=["GT"])
    for v in vcf_reader:
        for i in range(len(v.samples)):
            assert(v.zygosity[i] == VariantZygosity.NO_VARIANT)
def test_vcf_fetch_variant(get_created_vcf_tabix_files):
    """Get first variant from mocked VCF file stream."""
    vcf_file_path, tabix_file_path = get_created_vcf_tabix_files(mock_file_input())
    vcf_reader = VCFReader(vcf=vcf_file_path, bams=[], is_fp=False)
    try:
        assert (type(vcf_reader[0]) == Variant)
    except IndexError:
        pytest.fail("Cannot retrieve first element from VCFReader")
def generate_hdf5(args):
    """Serialize encodings to HDF5.

    Generate encodings in multiprocess loop and save tensors to HDF5.
    """
    # Get list of files from arguments.
    bam = args.bam
    file_list = []
    for tp_file in args.tp_files:
        file_list.append(VCFReader.VcfBamPath(vcf=tp_file, bam=bam, is_fp=False))
    for fp_file in args.fp_files:
        file_list.append(VCFReader.VcfBamPath(vcf=fp_file, bam=bam, is_fp=True))

    # Generate the variant entries using VCF reader.
    vcf_reader = VCFReader(file_list)

    # Setup encoder for samples and labels.
    sample_encoder = PileupEncoder(window_size=100, max_reads=100,
                                   layers=[PileupEncoder.Layer.READ, PileupEncoder.Layer.BASE_QUALITY])
    label_encoder = ZygosityLabelEncoder()

    encode_func = partial(encode, sample_encoder, label_encoder)

    # Create HDF5 datasets.
    h5_file = h5py.File(args.output_file, "w")
    encoded_data = h5_file.create_dataset("encodings",
                                          shape=(len(vcf_reader), sample_encoder.depth,
                                                 sample_encoder.height, sample_encoder.width),
                                          dtype=np.float32, fillvalue=0)
    label_data = h5_file.create_dataset("labels", shape=(len(vcf_reader),), dtype=np.int64, fillvalue=0)

    pool = mp.Pool(args.threads)
    print("Serializing {} entries...".format(len(vcf_reader)))
    for i, out in enumerate(pool.imap(encode_func, vcf_reader)):
        if i % 1000 == 0:
            print("Saved {} entries".format(i))
        encoding, label = out
        encoded_data[i] = encoding
        label_data[i] = label
    print("Saved {} entries".format(len(vcf_reader)))
    h5_file.close()
def test_simple_vc_infer():
    # Load checkpointed model and run inference
    test_data_dir = get_data_folder()
    model_dir = os.path.join(test_data_dir, ".test_model")

    # Create neural factory
    nf = nemo.core.NeuralModuleFactory(
        placement=nemo.core.neural_factory.DeviceType.GPU, checkpoint_dir=model_dir)

    # Generate dataset
    bam = os.path.join(test_data_dir, "small_bam.bam")
    labels = os.path.join(test_data_dir, "candidates.vcf.gz")
    vcf_bam_tuple = VCFReader.VcfBamPath(vcf=labels, bam=bam, is_fp=False)
    vcf_loader = VCFReader([vcf_bam_tuple])
    test_dataset = ReadPileupDataLoader(ReadPileupDataLoader.Type.TEST, vcf_loader,
                                        batch_size=32, shuffle=False)

    # Neural Network
    alexnet = AlexNet(num_input_channels=1, num_output_logits=3)

    # Create inference DAG
    encoding = test_dataset()
    vz = alexnet(encoding=encoding)

    # Invoke the "infer" action.
    results = nf.infer([vz], checkpoint_dir=model_dir, verbose=True)

    # Decode inference results to labels
    zyg_decoder = ZygosityLabelDecoder()
    for tensor_batches in results:
        for batch in tensor_batches:
            predicted_classes = torch.argmax(batch, dim=1)
            inferred_zygosity = [zyg_decoder(pred) for pred in predicted_classes]
            assert (len(inferred_zygosity) == len(vcf_loader))

    shutil.rmtree(model_dir)
def load_vcf_variantworks(
    vcf_file=None,
    num_threads=os.cpu_count(),
    require_genotype=True,
    info_keys=None,
    format_keys=None,
):
    try:
        from variantworks.io.vcfio import VCFReader
    except ImportError:
        print("Install VariantWorks from https://github.com/clara-parabricks/VariantWorks")
        # Re-raise so the caller does not fail later with an undefined VCFReader.
        raise
    vcf = VCFReader(
        vcf_file,
        num_threads=num_threads,
        require_genotype=require_genotype,
        info_keys=info_keys,
        format_keys=format_keys,
    )
    vcf_df = vcf.dataframe
    vcf_df_2 = _transform_df(
        vcf_df,
        sample_key_cols=list(vcf_df.columns[14:]),
        common_key_cols=list(vcf_df.columns[7:14]),
        common_cols=list(vcf_df.columns[0:7]),
        drop_cols=[
            "id",
            "variant_type",
            "AC-1",
            "AC-2",
            "AF-1",
            "AF-2",
            "end_pos",
        ],
    )
    return vcf_df_2
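# Hypothetical usage of the loader above; the file path and key lists are
# illustrative placeholders, not values from the original code.
if __name__ == "__main__":
    df = load_vcf_variantworks(
        vcf_file="candidates.vcf.gz",
        require_genotype=True,
        info_keys=["*"],
        format_keys=["*"],
    )
    print(df.head())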
zyg_encoder = ZygosityLabelEncoder()

# Create neural network that receives 2 channel inputs (encoding layers defined above)
# and outputs a logit over three classes (no variant, homozygous variant, heterozygous variant).
model = AlexNet(num_input_channels=len(encoding_layers), num_output_logits=3)

# Get datasets to train on.
# NOTE: To train a neural network well, the model needs to see samples from all types of classes.
# The example here shows a file that has true variants (either homozygous or heterozygous),
# but in practice one also needs to pass a set of false positive samples so the model can learn to
# ignore them. False positive samples can be marked with `is_fp` so the reader can appropriately
# assign their variant types (see the illustrative sketch after this snippet).
data_folder = os.path.join(repo_root_dir, "tests", "data")
bam = os.path.join(data_folder, "small_bam.bam")
samples = os.path.join(data_folder, "candidates.vcf.gz")
vcf_loader = VCFReader(vcf=samples, bams=[bam], is_fp=False)

# Create a data loader with custom sample and label encoder.
dataset_train = ReadPileupDataLoader(ReadPileupDataLoader.Type.TRAIN, [vcf_loader],
                                     batch_size=32, shuffle=True,
                                     sample_encoder=pileup_encoder, label_encoder=zyg_encoder)

# Use CrossEntropyLoss to train.
vz_ce_loss = nemo.backends.pytorch.common.losses.CrossEntropyLossNM(logits_ndim=2)

# Create NeMo training DAG.
vz_labels, encoding = dataset_train()
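# Hedged illustration, not part of the original sample: how a false positive VCF
# could be loaded alongside the true positive loader above so the model also sees
# negative examples. The file name "fp_candidates.vcf.gz" is a hypothetical
# placeholder; data_folder, bam, vcf_loader, pileup_encoder and zyg_encoder are
# assumed to be defined as in the preceding snippet.
fp_samples = os.path.join(data_folder, "fp_candidates.vcf.gz")
fp_vcf_loader = VCFReader(vcf=fp_samples, bams=[bam], is_fp=True)
dataset_train = ReadPileupDataLoader(ReadPileupDataLoader.Type.TRAIN, [vcf_loader, fp_vcf_loader],
                                     batch_size=32, shuffle=True,
                                     sample_encoder=pileup_encoder, label_encoder=zyg_encoder)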
encoding_layers = [PileupEncoder.Layer.READ, PileupEncoder.Layer.BASE_QUALITY]
pileup_encoder = PileupEncoder(window_size=100, max_reads=100, layers=encoding_layers)

# Neural Network
model = AlexNet(num_input_channels=len(encoding_layers), num_output_logits=3)

# Similar to training, a data loader needs to be set up for the relevant datasets. In the case of
# inference, it doesn't matter whether the files are tagged as false positive or not; each example is
# evaluated by the network. For simplicity this example reuses the training dataset.
# Note: No label encoder is required in inference.
data_folder = os.path.join(repo_root_dir, "tests", "data")
bam = os.path.join(data_folder, "small_bam.bam")
labels = os.path.join(data_folder, "candidates.vcf.gz")
vcf_loader = VCFReader(vcf=labels, bams=[bam], is_fp=False)
test_dataset = ReadPileupDataLoader(ReadPileupDataLoader.Type.TEST, [vcf_loader],
                                    batch_size=32, shuffle=False,
                                    sample_encoder=pileup_encoder)

# Create inference DAG
encoding = test_dataset()
vz = model(encoding=encoding)

# Invoke the "infer" action.
results = nf.infer([vz], checkpoint_dir="./", verbose=True)

# Instantiate a decoder that converts the predicted output of the network to
# a zygosity enum.
def test_simple_vc_trainer():
    # Train a sample model with test data

    # Create neural factory
    model_dir = os.path.join(get_data_folder(), ".test_model")
    nf = nemo.core.NeuralModuleFactory(
        placement=nemo.core.neural_factory.DeviceType.GPU, checkpoint_dir=model_dir)

    # Generate dataset
    bam = os.path.join(get_data_folder(), "small_bam.bam")
    labels = os.path.join(get_data_folder(), "candidates.vcf.gz")
    vcf_loader = VCFReader(vcf=labels, bams=[bam], is_fp=False)

    # Neural Network
    alexnet = AlexNet(num_input_channels=1, num_output_logits=3)

    # Create train DAG
    dataset_train = ReadPileupDataLoader(ReadPileupDataLoader.Type.TRAIN, [vcf_loader],
                                         batch_size=32, shuffle=True)
    vz_ce_loss = CrossEntropyLossNM(logits_ndim=2)
    vz_labels, encoding = dataset_train()
    vz = alexnet(encoding=encoding)
    vz_loss = vz_ce_loss(logits=vz, labels=vz_labels)

    # Create evaluation DAG using same dataset as training
    dataset_eval = ReadPileupDataLoader(ReadPileupDataLoader.Type.EVAL, [vcf_loader],
                                        batch_size=32, shuffle=False)
    vz_ce_loss_eval = CrossEntropyLossNM(logits_ndim=2)
    vz_labels_eval, encoding_eval = dataset_eval()
    vz_eval = alexnet(encoding=encoding_eval)
    vz_loss_eval = vz_ce_loss_eval(logits=vz_eval, labels=vz_labels_eval)

    # Logger callback
    logger_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[vz_loss, vz, vz_labels],
        step_freq=1,
    )

    evaluator_callback = nemo.core.EvaluatorCallback(
        eval_tensors=[vz_loss_eval, vz_eval, vz_labels_eval],
        user_iter_callback=eval_iter_callback,
        user_epochs_done_callback=eval_epochs_done_callback,
        eval_step=1,
    )

    # Checkpointing models through NeMo callback
    checkpoint_callback = nemo.core.CheckpointCallback(
        folder=nf.checkpoint_dir,
        load_from_folder=None,
        # Checkpointing frequency in steps
        step_freq=-1,
        # Checkpointing frequency in epochs
        epoch_freq=1,
        # Number of checkpoints to keep
        checkpoints_to_keep=1,
        # If True, CheckpointCallback will raise an Error if restoring fails
        force_load=False)

    # Invoke the "train" action.
    nf.train([vz_loss],
             callbacks=[logger_callback, checkpoint_callback, evaluator_callback],
             optimization_params={"num_epochs": 1, "lr": 0.001},
             optimizer="adam")

    assert (os.path.exists(os.path.join(model_dir, "AlexNet-EPOCH-1.pt")))
"""Sample showing utilization of VCFReader to generate dataframe.""" import os import pandas as pd import time from variantworks.io.vcfio import VCFReader pd.set_option('max_columns', 100) sample_folder = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) repo_root_folder = os.path.dirname(sample_folder) tests_data_folder = os.path.join(repo_root_folder, "tests", "data") test_vcf_file = os.path.join(tests_data_folder, "candidates_multisample.vcf.gz") t = time.time() reader = VCFReader(test_vcf_file, bams=[], tags={"custom_tag": 1}, info_keys=["AF"], filter_keys=[], format_keys=[], num_threads=4, regions=[], require_genotype=True, sort=True) read_time = time.time() - t print(reader.dataframe) print("Elapsed time for reading VCF (seconds): ", read_time)
def test_vcf_reader(get_created_vcf_tabix_files):
    """Get all variants from mocked file stream, filter SNPs, multi allele & multi samples."""
    vcf_file_path, tabix_file_path = get_created_vcf_tabix_files(mock_file_input())
    vcf_reader = VCFReader(vcf_file_path, bams=[], is_fp=False)
    assert(len(vcf_reader) == 17)
"""Sample showing utilization of VCFReader to generate dataframe.""" import os import pandas as pd from variantworks.io.vcfio import VCFReader, VCFWriter pd.set_option('max_columns', 100) cwd = os.path.dirname(os.path.realpath(__file__)) sample_folder = os.path.dirname(cwd) repo_root_folder = os.path.dirname(sample_folder) tests_data_folder = os.path.join(repo_root_folder, "tests", "data") test_vcf_file = os.path.join(tests_data_folder, "candidates_multisample.vcf.gz") reader = VCFReader(test_vcf_file, bams=[], tags={"custom_tag": 1}, info_keys=["*"], filter_keys=["*"], format_keys=["*"], num_threads=4, regions=[], require_genotype=False, sort=True, unbounded_val_max_cols=2) print(reader.dataframe) writer = VCFWriter(reader.dataframe, os.path.join(cwd, "test_out.vcf"), sample_names=reader.samples, num_threads=4) writer.write_output(reader.dataframe)