def _load_single_sample_variants(self, sample_idx, file_format_funcs, variant_type, merge_type):
    """Load variant calls for one sample and merge the per-caller VCFs.

    Results are cached per (variant_type, merge_type); a cache hit returns
    immediately without re-reading any VCF.
    """
    sample_id = self.sample_ids[sample_idx]
    normal_bam_id = (self.normal_bam_ids[sample_idx]
                     if self.normal_bam_ids is not None else None)
    tumor_bam_id = (self.tumor_bam_ids[sample_idx]
                    if self.tumor_bam_ids is not None else None)

    cached_file_name = "%s-%s-variants.pkl" % (variant_type, merge_type)
    cached = self.load_from_cache(
        self.cache_names["variant"], sample_id, cached_file_name)
    if cached is not None:
        return cached

    # One set of variants per caller-specific VCF.
    variant_sets = []
    for format_func in file_format_funcs:
        file_name = format_func(sample_id, normal_bam_id, tumor_bam_id)
        collection = varcode.load_vcf_fast(path.join(self.data_dir, file_name))
        variant_sets.append(set(collection.elements))

    if len(variant_sets) == 1:
        # Single caller: nothing to merge.
        merged_variants = VariantCollection(variant_sets[0])
    else:
        assert merge_type in ["union", "intersection"], "Unknown merge type: %s" % merge_type
        if merge_type == "union":
            merged_variants = VariantCollection(set.union(*variant_sets))
        elif merge_type == "intersection":
            merged_variants = VariantCollection(set.intersection(*variant_sets))

    self.save_to_cache(
        merged_variants, self.cache_names["variant"], sample_id, cached_file_name)
    return merged_variants
def do_test(kwargs):
    """Check that the pandas-based and pyvcf-based VCF loaders agree."""
    vcf_pandas = load_vcf_fast(**kwargs)
    vcf_pyvcf = load_vcf(**kwargs)
    # Whole-collection equality plus piecewise checks for clearer failures.
    eq_(vcf_pandas, vcf_pyvcf)
    eq_(len(vcf_pandas), len(vcf_pyvcf))
    eq_(vcf_pandas.elements, vcf_pyvcf.elements)
    eq_(vcf_pandas.metadata, vcf_pyvcf.metadata)
    # Guard against a trivially-empty fixture making the test vacuous.
    assert len(vcf_pandas) > 1
    assert len(vcf_pyvcf) > 1
def loader(filename):
    """Load a VCF and flatten it into a dataframe (metadata columns prefixed)."""
    load_kwargs = dict(
        genome=genome,
        max_variants=max_variants,
        only_passing=only_passing,
        allow_extended_nucleotides=True)
    collection = varcode.load_vcf_fast(filename, **load_kwargs)
    df = variants_to_dataframe(
        collection,
        collection.metadata,
        metadata_column_prefix=metadata_column_prefix)
    return df
def run():
    """CLI entry point: time VCF loading with the chosen parser backend."""
    args = parser.parse_args()
    extra_args = {}
    if not args.info_field:
        # Skip INFO-field parsing unless explicitly requested.
        extra_args["include_info"] = False
    start = time.time()
    if args.pyvcf:
        result = varcode.load_vcf(args.path, allow_extended_nucleotides=True)
    else:
        result = varcode.load_vcf_fast(
            args.path, allow_extended_nucleotides=True, **extra_args)
    elapsed = time.time() - start
    print("Loaded %d variants in %0.3f sec. " % (len(result), elapsed))
    print(result.to_string(limit=5))
def __init__(
        self,
        ranked_variants_with_vaccine_peptides,
        patient_info,
        final_review,
        reviewers,
        args_for_report,
        input_json_file,
        cosmic_vcf_filename=None):
    """
    Construct a TemplateDataCreator object, from the output of the vaxrank pipeline.
    """
    self.ranked_variants_with_vaccine_peptides = ranked_variants_with_vaccine_peptides
    self.patient_info = patient_info

    # Hide output-related command-line args; everything else is displayed.
    displayed_args = {}
    for key, value in args_for_report.items():
        if not key.startswith("output"):
            displayed_args[key] = value

    reviewer_list = reviewers.split(',') if reviewers else []
    self.template_data = {
        'args': sorted(displayed_args.items()),
        'reviewers': reviewer_list,
        'final_review': final_review,
        'input_json_file': input_json_file,
        # these report sections are optional
        'include_manufacturability': args_for_report['manufacturability'],
        'include_wt_epitopes': args_for_report['wt_epitopes'],
    }

    # map from peptide objects to their COSMIC IDs if they exist
    if cosmic_vcf_filename:
        logger.info('Loading COSMIC data...')
        self.cosmic_variant_collection = load_vcf_fast(
            cosmic_vcf_filename,
            allow_extended_nucleotides=True,
            include_info=False)
        logger.info('COSMIC data loaded.')
    else:
        self.cosmic_variant_collection = None
def run():
    """Parse CLI args, load the VCF with the selected backend, report timing."""
    args = parser.parse_args()
    # Keyword args for the fast loader; pyvcf path ignores info-field options.
    fast_kwargs = {"allow_extended_nucleotides": True}
    if not args.info_field:
        fast_kwargs["include_info"] = False
    start = time.time()
    if args.pyvcf:
        result = varcode.load_vcf(args.path, allow_extended_nucleotides=True)
    else:
        result = varcode.load_vcf_fast(args.path, **fast_kwargs)
    print("Loaded %d variants in %0.3f sec. " % (
        len(result), time.time() - start))
    print(result.to_string(limit=5))
def __init__(
        self,
        ranked_variants_with_vaccine_peptides,
        patient_info,
        final_review,
        reviewers,
        args_for_report,
        input_json_file,
        cosmic_vcf_filename=None):
    """
    Construct a TemplateDataCreator object, from the output of the vaxrank pipeline.
    """
    self.ranked_variants_with_vaccine_peptides = ranked_variants_with_vaccine_peptides
    self.patient_info = patient_info

    # filter output-related command-line args: we want to display everything else
    non_output_args = {
        key: value
        for key, value in args_for_report.items()
        if not key.startswith("output")
    }

    template_data = {}
    template_data['args'] = sorted(non_output_args.items())
    template_data['reviewers'] = reviewers.split(',') if reviewers else []
    template_data['final_review'] = final_review
    template_data['input_json_file'] = input_json_file
    # these report sections are optional
    template_data['include_manufacturability'] = args_for_report['manufacturability']
    template_data['include_wt_epitopes'] = args_for_report['wt_epitopes']
    self.template_data = template_data

    # map from peptide objects to their COSMIC IDs if they exist
    if cosmic_vcf_filename:
        logger.info('Loading COSMIC data...')
        self.cosmic_variant_collection = load_vcf_fast(
            cosmic_vcf_filename,
            allow_extended_nucleotides=True,
            include_info=False)
        logger.info('COSMIC data loaded.')
    else:
        self.cosmic_variant_collection = None
def generate_vcfs(id_to_mutation_count, file_format, template_name):
    """
    Generate cropped VCFs from a template, for each sample.

    Parameters
    ----------
    id_to_mutation_count : dict
        sample ID to number of mutations we want to generate for that sample
    file_format : str
        %-style format string producing a VCF file name from a sample ID
    template_name : str
        name of the template VCF, resolved via data_path

    Returns
    -------
    str
        Path to the generated VCF directory

    Raises
    ------
    ValueError
        If id_to_mutation_count is empty (previously this fell through to
        a confusing NameError on the leaked loop variable `f`).
    """
    if not id_to_mutation_count:
        raise ValueError("id_to_mutation_count must contain at least one sample")

    # The template is identical for every sample: resolve and count it once
    # instead of re-parsing the whole VCF on each loop iteration.
    template_path = data_path(template_name)
    num_records_in_template = len(load_vcf_fast(template_path))

    file_dir = None
    for sample_id, num_records_to_generate in id_to_mutation_count.items():
        # Validate before creating the output file, so a failed run doesn't
        # leave a partially-written VCF behind.
        assert num_records_in_template >= num_records_to_generate, (
            "Cannot generate more records than exist in the template: %d is less than %d" % (
                num_records_in_template, num_records_to_generate))
        vcf_reader = vcf.Reader(filename=template_path)
        file_path = generated_data_path(
            path.join("vcfs", file_format % sample_id))
        file_dir = path.dirname(file_path)
        if not path.exists(file_dir):
            makedirs(file_dir)
        with open(file_path, "w") as f:
            vcf_writer = vcf.Writer(f, vcf_reader)
            # Copy only the first num_records_to_generate records.
            for i, record in enumerate(vcf_reader):
                if i >= num_records_to_generate:
                    break
                vcf_writer.write_record(record)
    # All samples share the same "vcfs" directory; return it explicitly
    # instead of relying on the with-block's leaked file handle.
    return file_dir
def generate_vcfs(id_to_mutation_count, file_format_func, template_name):
    """
    Generate cropped VCFs from a template, for each sample.

    Parameters
    ----------
    id_to_mutation_count : dict
        sample ID to number of mutations we want to generate for that sample
    file_format_func : callable
        called as file_format_func(sample_id, None, None) to build each
        sample's VCF file name
    template_name : str
        name of the template VCF, resolved via data_path

    Returns
    -------
    str
        Path to the generated VCF directory

    Raises
    ------
    ValueError
        If id_to_mutation_count is empty (previously this fell through to
        a confusing NameError on the leaked loop variable `f`).
    """
    if not id_to_mutation_count:
        raise ValueError("id_to_mutation_count must contain at least one sample")

    # The template is identical for every sample: resolve and count it once
    # instead of re-parsing the whole VCF on each loop iteration.
    template_path = data_path(template_name)
    num_records_in_template = len(load_vcf_fast(template_path))

    file_dir = None
    for sample_id, num_records_to_generate in id_to_mutation_count.items():
        # Validate before creating the output file, so a failed run doesn't
        # leave a partially-written VCF behind.
        assert num_records_in_template >= num_records_to_generate, (
            "Cannot generate more records than exist in the template: %d is less than %d"
            % (num_records_in_template, num_records_to_generate))
        vcf_reader = vcf.Reader(filename=template_path)
        file_path = generated_data_path(
            path.join("vcfs", file_format_func(sample_id, None, None)))
        file_dir = path.dirname(file_path)
        if not path.exists(file_dir):
            makedirs(file_dir)
        with open(file_path, "w") as f:
            vcf_writer = vcf.Writer(f, vcf_reader)
            # Copy only the first num_records_to_generate records.
            for i, record in enumerate(vcf_reader):
                if i >= num_records_to_generate:
                    break
                vcf_writer.write_record(record)
    # All samples share the same "vcfs" directory; return it explicitly
    # instead of relying on the with-block's leaked file handle.
    return file_dir
def _load_single_sample_variants(self, sample_idx, file_format_funcs, variant_type, merge_type):
    """Load per-caller VCFs for one sample, merge them, and cache the result."""
    sample_id = self.sample_ids[sample_idx]
    if self.normal_bam_ids is None:
        normal_bam_id = None
    else:
        normal_bam_id = self.normal_bam_ids[sample_idx]
    if self.tumor_bam_ids is None:
        tumor_bam_id = None
    else:
        tumor_bam_id = self.tumor_bam_ids[sample_idx]

    cached_file_name = "%s-%s-variants.pkl" % (variant_type, merge_type)
    cached = self.load_from_cache(
        self.cache_names["variant"], sample_id, cached_file_name)
    if cached is not None:
        return cached

    # One set of variants per VCF-naming function (one per caller).
    per_caller_sets = [
        set(varcode.load_vcf_fast(
            path.join(self.data_dir,
                      fmt(sample_id, normal_bam_id, tumor_bam_id))).elements)
        for fmt in file_format_funcs
    ]

    if len(per_caller_sets) == 1:
        # There is nothing to merge
        merged_variants = VariantCollection(per_caller_sets[0])
    else:
        assert merge_type in ["union", "intersection"
                              ], "Unknown merge type: %s" % merge_type
        combine = set.union if merge_type == "union" else set.intersection
        merged_variants = VariantCollection(combine(*per_caller_sets))

    self.save_to_cache(
        merged_variants, self.cache_names["variant"], sample_id, cached_file_name)
    return merged_variants
def test_load_vcf_mouse_with_explicit_urls():
    """Both loaders should find all 217 mouse variants given an explicit-URL genome."""
    for load in (load_vcf, load_vcf_fast):
        eq_(len(load(MOUSE_VCF, genome=explicit_url_genome)), 217)
def test_load_vcf_mouse_with_ensembl_release():
    """Both loaders should find all 217 mouse variants given an Ensembl-release genome."""
    for load in (load_vcf, load_vcf_fast):
        eq_(len(load(MOUSE_VCF, genome=ensembl_mouse_genome)), 217)
def test_load_vcf_mouse_with_inferred_genome():
    """Both loaders should find all 217 mouse variants with the genome inferred from the VCF."""
    for load in (load_vcf, load_vcf_fast):
        eq_(len(load(MOUSE_VCF)), 217)