def get_distributions_by_type(records, variant_types, field, bins):
    """Build a per-variant-type histogram of an INFO field.

    Args:
        records: Iterable of variant records; each is passed to
            ``vu.get_sv_type`` and ``vu.get_info_field``.
        variant_types: Variant types to create histograms for. Every entry
            becomes a key in the result, even when no record has that type.
        field: Name of the INFO field handed to ``vu.get_info_field``.
        bins: Bin definition forwarded to ``get_distribution_index``.

    Returns:
        Dict mapping each variant type to a list of ``len(bins) + 1``
        integer counts, indexed by the value ``get_distribution_index``
        returns for each record.
    """
    n_bins = len(bins)
    allowed_types = set(variant_types)
    # One slot more than there are bins; presumably the extra slot catches
    # values past the last bin edge -- confirm against get_distribution_index.
    histograms = {sv_type: [0] * (n_bins + 1) for sv_type in variant_types}
    for rec in records:
        sv_type = vu.get_sv_type(rec, allowed_types)
        value = vu.get_info_field(rec, field)
        bin_idx = get_distribution_index(value, bins, n_bins)
        histograms[sv_type][bin_idx] += 1
    return histograms
def create_trees_from_records(records, variant_types, contigs, padding=0):
    """Index record intervals in one IntervalTree per (variant type, contig).

    Args:
        records: Iterable of variant records exposing ``chrom`` and ``start``;
            each is also passed to ``vu.get_sv_type`` and
            ``vu.get_record_length``.
        variant_types: Variant types to create trees for.
        contigs: Contig names to create trees for under every variant type.
        padding: Amount subtracted from the interval start and added to the
            interval end before insertion (default 0).

    Returns:
        Nested dict ``{variant_type: {contig: IntervalTree}}``. A tree exists
        for every requested type/contig pair, including pairs with no records.

    Raises:
        KeyError: If a record's type or ``chrom`` has no pre-built tree.
    """
    allowed_types = set(variant_types)
    forest = {
        sv_type: {contig: IntervalTree() for contig in contigs}
        for sv_type in variant_types
    }
    for rec in records:
        sv_type = vu.get_sv_type(rec, allowed_types)
        chrom = rec.chrom
        span = vu.get_record_length(rec)
        # The interval end is derived from start + length, widened by padding
        # on both sides.
        begin = rec.start - padding
        end = rec.start + span + padding
        forest[sv_type][chrom].addi(begin, end)
    return forest
def collect_evidence_fields(records, variant_types):
    """Count evidence-type occurrences per variant type.

    Args:
        records: Iterable of variant records; each is passed to
            ``vu.get_sv_type`` and ``vu.get_evidence_types``.
        variant_types: Variant types to create counters for.

    Returns:
        Nested dict ``{variant_type: {evidence_type: count}}`` with an entry
        for every combination of ``variant_types`` and ``EVIDENCE_TYPES``,
        zero where nothing was observed. A count is incremented once for each
        evidence type that ``vu.get_evidence_types`` yields for a record.
    """
    tallies = {
        sv_type: {ev_type: 0 for ev_type in EVIDENCE_TYPES}
        for sv_type in variant_types
    }
    allowed_types = set(variant_types)
    allowed_evidence = set(EVIDENCE_TYPES)
    for rec in records:
        sv_type = vu.get_sv_type(rec, allowed_types)
        for ev_type in vu.get_evidence_types(rec, allowed_evidence):
            tallies[sv_type][ev_type] += 1
    return tallies
def get_allele_frequency_counts(records, header, variant_types):
    """Bin per-record allele frequencies and count singletons by variant type.

    For each record the number of non-reference alleles is the count of
    ``GT`` entries, over all samples, that are not ``None`` and greater than
    zero. The frequency is that count divided by the number of samples in
    ``header`` (the sample count, not the number of called alleles).

    Args:
        records: Iterable of variant records exposing ``samples``; each is
            also passed to ``vu.get_sv_type``.
        header: Object whose ``samples`` attribute has a length equal to the
            number of samples.
        variant_types: Variant types to consider. ``"CNV"`` is excluded from
            both outputs.

    Returns:
        Tuple ``(allele_freq_counts, num_singletons)``:
        ``allele_freq_counts`` maps each non-CNV type to a list of
        ``len(AF_BINS) + 1`` counts indexed by ``get_distribution_index``;
        ``num_singletons`` maps each non-CNV type to the number of records
        with exactly one non-reference allele.

    Raises:
        ZeroDivisionError: If ``header`` has no samples and at least one
            non-CNV record is processed.
    """
    sample_total = float(len(header.samples))
    known_types = set(variant_types)
    # Don't calculate MCNV AF since non-ref alleles cannot be determined without chromosome ploidy
    af_types = known_types - {"CNV"}
    freqs_by_type = {sv_type: [] for sv_type in af_types}
    singleton_totals = {sv_type: 0 for sv_type in af_types}

    for rec in records:
        sv_type = vu.get_sv_type(rec, known_types)
        if sv_type in af_types:
            non_ref = sum(
                1
                for sample in rec.samples.values()
                for allele in sample["GT"]
                if allele is not None and allele > 0
            )
            if non_ref == 1:
                singleton_totals[sv_type] += 1
            freqs_by_type[sv_type].append(non_ref / sample_total)

    n_bins = len(AF_BINS)
    binned = {}
    for sv_type in af_types:
        histogram = [0] * (n_bins + 1)
        binned[sv_type] = histogram
        for freq in freqs_by_type[sv_type]:
            bin_idx = get_distribution_index(freq, AF_BINS, n_bins)
            histogram[bin_idx] += 1
    return binned, singleton_totals
def get_count_by_type(records, variant_types):
    """Count records per variant type.

    Args:
        records: Iterable of variant records; each is passed to
            ``vu.get_sv_type``.
        variant_types: Variant types to count. Every entry appears in the
            result, with 0 when no record has that type.

    Returns:
        Dict mapping each variant type to its number of records.

    Raises:
        KeyError: If ``vu.get_sv_type`` returns a type that is not in
            ``variant_types``.
    """
    allowed_types = set(variant_types)
    tally = dict.fromkeys(variant_types, 0)
    for rec in records:
        tally[vu.get_sv_type(rec, allowed_types)] += 1
    return tally