def merge_linked_depth_calls(vcf, ID_links):
    """
    Merge linked depth calls into single records.

    Records whose ID appears in no link are yielded unchanged while the VCF
    is being read. Linked records are held back, handed to ``slink`` and
    every returned cluster with more than one record is collapsed into a
    copy of its first record that spans the cluster's smallest ``pos`` to
    its largest ``stop``.

    Parameters
    ----------
    vcf : pysam.VariantFile
    ID_links : list of (str, str)
        Pairs of record IDs to be merged. Iterated more than once, so it
        must not be a one-shot iterator.

    Yields
    ------
    record
        Unlinked records first (in input order), then one record per
        cluster.
    """

    # A set keeps the per-record membership test O(1) while streaming
    linked_IDs = {ID for link in ID_links for ID in link}
    record_map = {}

    # If a record wasn't linked with a bedtools merge, just return it
    for record in vcf:
        if record.id in linked_IDs:
            record_map[record.id] = record
        else:
            yield record

    # Keep only links whose two records were both seen in this VCF
    # (e.g. drops links on other chromosomes)
    usable_links = [
        link for link in ID_links
        if link[0] in record_map and link[1] in record_map
    ]

    # Convert links from pairs of IDs to pairs of records
    record_links = np.empty([len(usable_links), 2], dtype=object)
    for i, link in enumerate(usable_links):
        record_links[i, 0] = record_map[link[0]]
        record_links[i, 1] = record_map[link[1]]

    clusters = slink(record_links, record_map)

    # Merge clusters
    for cluster in clusters:
        if len(cluster) == 1:
            yield cluster[0]
            continue

        # Take maximal region; builtins keep the coordinates plain ints
        start = min(member.pos for member in cluster)
        end = max(member.stop for member in cluster)

        merged_record = cluster[0].copy()
        merged_record.pos = start
        merged_record.stop = end
        merged_record.info['SVLEN'] = end - start

        # Gather MEMBERS from the records of *this* cluster. The previous
        # code read them from the leftover loop variable of the VCF pass,
        # i.e. from whichever record happened to be read last.
        members = []
        for member in cluster:
            members.extend(member.info.get('MEMBERS', None) or ())
        members.extend(member.id for member in cluster)
        # dict.fromkeys drops repeats while keeping first-seen order
        merged_record.info['MEMBERS'] = list(dict.fromkeys(members))

        # Take union of called samples
        svu.update_best_genotypes(merged_record, cluster,
                                  preserve_multiallelic=True)

        yield merged_record
def _merge_pair(record_a, record_b):
    """
    Fold the depth record of a PE/SR + depth pair into the PE/SR record.

    ``_record_is_depth`` decides which argument is the depth record; the
    other one is updated in place (ALGORITHMS, MEMBERS, genotypes, varGQ
    and per-sample EV). Nothing is returned.

    Raises
    ------
    ValueError
        If both records are depth records, or neither is.
    """
    a_is_depth = _record_is_depth(record_a)
    b_is_depth = _record_is_depth(record_b)
    if a_is_depth == b_is_depth:
        raise ValueError(
            "Attempted to write pesr/pesr or depth/depth pair")

    if a_is_depth:
        depth_record, pesr_record = record_a, record_b
    else:
        depth_record, pesr_record = record_b, record_a

    # Tag the PE/SR record with the extra algorithm (sorted, de-duplicated)
    algorithms = set(pesr_record.info['ALGORITHMS'] + ('depth', ))
    pesr_record.info['ALGORITHMS'] = tuple(sorted(algorithms))

    # Pool the members of both records (sorted, de-duplicated)
    members = set(
        pesr_record.info['MEMBERS'] + depth_record.info['MEMBERS'])
    pesr_record.info['MEMBERS'] = tuple(sorted(members))

    svu.update_best_genotypes(pesr_record, [pesr_record, depth_record],
                              preserve_multiallelic=True)

    # Keep the larger varGQ when both records carry one
    both_have_var_gq = ('varGQ' in pesr_record.info.keys()
                        and 'varGQ' in depth_record.info.keys())
    if both_have_var_gq:
        pesr_record.info['varGQ'] = max(pesr_record.info['varGQ'],
                                        depth_record.info['varGQ'])

    for sample in pesr_record.samples:
        pesr_has_ev = 'EV' in pesr_record.samples[sample].keys()
        # NOTE(review): this guard inspects depth_record.info, yet the value
        # below is read from depth_record.samples[sample]. Kept unchanged to
        # preserve behavior -- confirm which container was intended.
        if not (pesr_has_ev and 'EV' in depth_record.info.keys()):
            continue
        pesr_ev = pesr_record.samples[sample]['EV']
        depth_ev = depth_record.samples[sample]['EV']
        combined_ev = set(pesr_ev).union(depth_ev)
        pesr_record.samples[sample]['EV'] = tuple(sorted(combined_ev))

    # Refresh the cached sample overlap now that the record has changed
    _cache_sample_overlap(pesr_record, force=True)
def make_record(self):
    """
    Build ``self.vcf_record`` from the records held in ``self.records``.

    The new record starts as a copy of the first entry of ``self.records``
    and is then passed to ``svu.update_best_genotypes`` together with all
    of ``self.records`` (``preserve_multiallelic=False``).
    """
    template = self.records[0]
    self.vcf_record = template.copy()
    svu.update_best_genotypes(
        self.vcf_record,
        self.records,
        preserve_multiallelic=False,
    )
def merge_pesr_depth(pesr_vcf, depth_vcf, frac=0.5, sample_overlap=0.5):
    """
    Merge a PE/SR VCF with a depth VCF and yield the combined records.

    A depth record is folded into a PE/SR record when the two intersect
    with reciprocal overlap ``frac``, their SV type columns match and
    ``svu.samples_overlap`` accepts their sample lists. Folded depth
    records are dropped; all remaining records are yielded in sorted order,
    skipping records that have no called samples.

    Parameters
    ----------
    pesr_vcf, depth_vcf
        VCF readers; each is iterated, ``reset()`` and read again.
    frac : float
        Passed to the BED intersection as the reciprocal overlap fraction.
    sample_overlap : float
        NOTE(review): not referenced anywhere in this function;
        ``svu.samples_overlap`` is called without a threshold. Confirm
        whether it was meant to be forwarded.

    Notes
    -----
    The filtered BED files are saved as ``filtered_pesr.bed`` and
    ``filtered_depth.bed``.
    """
    # Memory inefficient but simple; acceptable now that the variants have
    # already been filtered down
    pesr_by_id = {record.id: record for record in pesr_vcf}
    depth_by_id = {record.id: record for record in depth_vcf}

    # Wipe MEMBERS from prior clustering
    for by_id in (pesr_by_id, depth_by_id):
        for ID, record in by_id.items():
            record.info['MEMBERS'] = [ID]

    # Rewind to grab the first PE/SR record, used later as the template
    # handed to clean_depth_record.
    # NOTE(review): next() raises here if the PE/SR VCF has no records.
    pesr_vcf.reset()
    base_record = next(pesr_vcf)

    # Rewind both readers for bedtool creation
    pesr_vcf.reset()
    depth_vcf.reset()

    bed_options = dict(split_bnd=False, include_samples=True,
                       include_strands=False, report_alt=False)
    pesr_bed = svu.vcf2bedtool(pesr_vcf, **bed_options)
    depth_bed = svu.vcf2bedtool(depth_vcf, **bed_options)

    # Remove records with no samples
    def _has_called_samples(feature):
        "True when the feature has six fields and the sixth is not '.' or ''"
        return (len(feature.fields) == 6
                and feature.fields[5] not in ['.', ''])

    pesr_bed = pesr_bed.filter(_has_called_samples)
    pesr_bed = pesr_bed.saveas('filtered_pesr.bed')
    depth_bed = depth_bed.filter(_has_called_samples)
    depth_bed = depth_bed.saveas('filtered_depth.bed')

    # Pair depth records with PE/SR records sharing `frac` reciprocal overlap
    sect = pesr_bed.intersect(depth_bed, wa=True, wb=True, r=True, f=frac)

    # IDs of depth records that were folded into a PE/SR record
    absorbed_depth_IDs = set()
    for pair in sect.intervals:
        fields = pair.fields

        # Check SV types match
        if fields[4] != fields[10]:
            continue

        # Get vcf records
        pesr_record = pesr_by_id[fields[3]]
        depth_id = fields[9]
        depth_record = depth_by_id[depth_id]

        # Check sample overlap between the two calls
        if not svu.samples_overlap(samplesA=fields[5].split(','),
                                   samplesB=fields[11].split(',')):
            continue

        # Note removal of depth ID
        absorbed_depth_IDs.add(depth_id)

        # Update metadata and samples
        prior_members = pesr_record.info.get('MEMBERS', ())
        pesr_record.info['MEMBERS'] = prior_members + (depth_record.id, )
        pesr_record.info['ALGORITHMS'] = (
            pesr_record.info['ALGORITHMS'] + ('depth', ))

        svu.update_best_genotypes(pesr_record, [pesr_record, depth_record],
                                  preserve_multiallelic=True)

        # Keep the larger varGQ when both records carry one
        if ('varGQ' in pesr_record.info.keys()
                and 'varGQ' in depth_record.info.keys()):
            pesr_record.info['varGQ'] = max(pesr_record.info['varGQ'],
                                            depth_record.info['varGQ'])

        for sample in pesr_record.samples:
            pesr_has_ev = 'EV' in pesr_record.samples[sample].keys()
            # NOTE(review): this guard inspects depth_record.info, yet the
            # value below is read from depth_record.samples[sample]. Kept
            # unchanged to preserve behavior -- confirm the intent.
            if not (pesr_has_ev and 'EV' in depth_record.info.keys()):
                continue
            pesr_ev = pesr_record.samples[sample]['EV']
            depth_ev = depth_record.samples[sample]['EV']
            pesr_record.samples[sample]['EV'] = tuple(
                sorted(set(pesr_ev).union(depth_ev)))

    # Remove overlapping depth records (done after the loop so that a depth
    # record overlapping several PE/SR records is available to each of them)
    # TODO: handle double overlap of depth calls
    for ID in absorbed_depth_IDs:
        depth_by_id.pop(ID)

    # A former SFARI-only step that added samples from the remaining
    # depth-only calls to a PE/SR record covering 90% of the call was
    # removed for other analyses.

    # Merge records together
    def _sort_key(record):
        return (record.chrom, record.pos, record.info['CHR2'], record.stop)

    pesr_records = sorted(pesr_by_id.values(), key=_sort_key)
    depth_records = [
        clean_depth_record(base_record, record)
        for record in sorted(depth_by_id.values(), key=_sort_key)
    ]

    for record in heapq.merge(pesr_records, depth_records, key=_sort_key):
        # Unwanted FORMAT keys are no longer stripped here; per the original
        # note this should be handled upstream by add_genotypes.
        record.info['ALGORITHMS'] = sorted(set(record.info['ALGORITHMS']))
        record.info['MEMBERS'] = sorted(set(record.info.get('MEMBERS', ())))

        # Emit only records that still have called samples (this skips
        # emptied depth records)
        if len(svu.get_called_samples(record)) != 0:
            yield record