# Shared imports for the examples in this listing, inferred from usage
# (`svu` is assumed to be svtk.utils, as used throughout the svtk codebase).
import argparse
import heapq
import itertools
import subprocess
import sys
from collections import deque

import natsort
import numpy as np
import pybedtools
import pysam
import scipy.sparse as sps

import svtk.utils as svu


def filter_on_reciprocal_overlap(single_sample_vcf_file, ref_vcf_file, svtype,
                                 case_sample, overlap_frac,
                                 variant_gts_allowed):
    single_sample_vcf = single_sample_vcf_file
    ref_vcf = ref_vcf_file

    single_sample_bed = svu.vcf2bedtool(single_sample_vcf,
                                        annotate_ins=False,
                                        include_samples=True,
                                        svtypes=[svtype])
    ref_bed = svu.vcf2bedtool(ref_vcf,
                              annotate_ins=False,
                              include_samples=True,
                              svtypes=[svtype])
    # BedTool.filter() is lazy and returns a new BedTool; assign (and
    # materialize) the result, otherwise the allele-count filter is
    # silently discarded.
    ref_bed = ref_bed.filter(ac_filter,
                             variant_gts_allowed=variant_gts_allowed,
                             sample_to_exclude=case_sample).saveas()

    # v=True inverts the match: only case calls with no reciprocal overlap
    # (>= overlap_frac in both directions) against the filtered reference
    # panel are reported.
    intersection = single_sample_bed.intersect(ref_bed,
                                               wa=True,
                                               f=overlap_frac,
                                               r=True,
                                               v=True)

    filtered_variant_ids = [intx.name for intx in intersection]
    return filtered_variant_ids
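
# A minimal usage sketch for filter_on_reciprocal_overlap. The file names and
# case sample ID are hypothetical, and ac_filter is assumed to be defined
# elsewhere in this module.
def _demo_filter_on_reciprocal_overlap():
    ids = filter_on_reciprocal_overlap('case.vcf.gz', 'ref_panel.vcf.gz',
                                       svtype='DEL', case_sample='CASE01',
                                       overlap_frac=0.5,
                                       variant_gts_allowed=1)
    print(len(ids), 'case calls lack reciprocal overlap in the panel')
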
def integrate_melt(cxsv, melt, fout, window=100):
    cxsv_bed = svu.vcf2bedtool(cxsv, annotate_ins=False, include_samples=True)
    melt_bed = svu.vcf2bedtool(melt, annotate_ins=False, include_samples=True)

    sect = cxsv_bed.window(melt_bed, w=window)

    # Check breakpoints are within window
    def close_enough(interval):
        startA, endA = [int(x) for x in interval.fields[1:3]]
        startB, endB = [int(x) for x in interval.fields[8:10]]
        return abs(startA - startB) < window and abs(endA - endB) < window

    excluded_cxsv = deque()
    for interval in sect.intervals:
        samplesA = interval.fields[6].split(',')
        samplesB = interval.fields[13].split(',')

        if (samples_overlap(samplesA, samplesB) and close_enough(interval)
                and interval.fields[4] == 'INS'):
            excluded_cxsv.append(interval.fields[3])

    cxsv.reset()
    melt.reset()

    # Convert to a set for O(1) membership tests during the merge
    excluded_cxsv = set(excluded_cxsv)

    for record in heapq.merge(cxsv, melt, key=lambda record: record.pos):
        if record.id in excluded_cxsv:
            continue
        fout.write(record)
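
# integrate_melt (above) and the link_* functions below assume a module-level
# samples_overlap helper. This is a sketch consistent with how it is called
# here, modeled on svtk.utils.samples_overlap; the original's thresholds may
# differ.
def samples_overlap_sketch(samplesA, samplesB,
                           upper_thresh=0.5, lower_thresh=0.5):
    """True if two calls share enough of their called samples."""
    setA, setB = set(samplesA), set(samplesB)
    if not setA or not setB:
        return False
    shared = setA & setB
    lo, hi = sorted([len(shared) / len(setA), len(shared) / len(setB)])
    return lo >= lower_thresh and hi >= upper_thresh
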
def link_inv(vcf, bkpt_window=300, cpx_dist=2000):
    bt = svu.vcf2bedtool(vcf.filename, annotate_ins=False)
    overlap = bt.window(bt, w=bkpt_window).saveas()
    overlap = overlap.filter(
        lambda b: not (b.fields[4] == "DEL" and b.fields[10] == "DEL")).saveas()
    overlap = overlap.filter(
        lambda b: not (b.fields[4] == "DUP" and b.fields[10] == "DUP")).saveas()
    links = [(b[3], b[9]) for b in overlap.intervals]
    linked_IDs = natsort.natsorted(set(itertools.chain.from_iterable(links)))
    linked_IDs = np.array(linked_IDs)
    bkpt_idxs = {ID: i for i, ID in enumerate(linked_IDs)}
    indexed_links = np.array([(bkpt_idxs[a], bkpt_idxs[b]) for a, b in links])
    n_bkpts = len(linked_IDs)
    bkpts = extract_breakpoints(vcf, bkpt_idxs)
    # Exclude wildly disparate overlaps
    G = sps.eye(n_bkpts, dtype=np.uint16, format='lil')
    for i, j in indexed_links:
        if (samples_overlap(bkpts[i], bkpts[j])
                and ro_calu(bkpts[i], bkpts[j]) > 0):
            G[i, j] = 1
    # Generate lists of clustered breakpoints
    n_comp, comp_list = sps.csgraph.connected_components(G)
    clusters = [deque() for x in range(n_comp)]
    for i, c_label in enumerate(comp_list):
        clusters[c_label].append(bkpts[i])
    return clusters
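
# link_inv assumes a module-level ro_calu helper. This sketch computes a
# reciprocal-overlap fraction in a way consistent with the call above
# (overlap divided by the larger span equals the smaller of the two
# per-record overlap fractions); the original implementation may differ.
def ro_calu_sketch(recordA, recordB):
    """Reciprocal overlap fraction of two VariantRecords."""
    if recordA.chrom != recordB.chrom:
        return 0.0
    overlap = min(recordA.stop, recordB.stop) - max(recordA.pos, recordB.pos)
    spans = (recordA.stop - recordA.pos, recordB.stop - recordB.pos)
    if overlap <= 0 or min(spans) <= 0:
        return 0.0
    return overlap / max(spans)
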
def link_cpx(vcf, bkpt_window=300, cpx_dist=2000):
    """
    Parameters
    ----------
    vcfpath : str
        Path to breakpoint VCF
    """

    bt = svu.vcf2bedtool(vcf.filename, annotate_ins=False)

    # Identify breakpoints which overlap within specified window
    overlap = bt.window(bt, w=bkpt_window).saveas()

    # Exclude self-hits
    #  overlap = overlap.filter(lambda b: b.fields[3] != b.fields[9]).saveas()

    # Exclude intersections where two DELs or two DUPs cluster together
    # cnvtypes = 'DEL DUP'.split()
    overlap = overlap.filter(lambda b: not (b.fields[4] == "DEL" and b.fields[10] == "DEL")).saveas()
    overlap = overlap.filter(lambda b: not (b.fields[4] == "DUP" and b.fields[10] == "DUP")).saveas()

    # # Exclude intersections with annotated mobile elements (rather than BNDs)
    # overlap = overlap.filter(lambda b: b.fields[4] is not re.match(re.compile('INS\:ME\:*'), b.fields[4])).saveas()    

    # Get linked variant IDs
    links = [(b[3], b[9]) for b in overlap.intervals]
    linked_IDs = natsort.natsorted(set(itertools.chain.from_iterable(links)))
    linked_IDs = np.array(linked_IDs)

    # Map variant IDs to indices
    bkpt_idxs = {ID: i for i, ID in enumerate(linked_IDs)}
    indexed_links = np.array([(bkpt_idxs[a], bkpt_idxs[b]) for a, b in links])

    # Extract VariantRecords corresponding to breakpoints
    n_bkpts = len(linked_IDs)
    bkpts = extract_breakpoints(vcf, bkpt_idxs)

    # Exclude wildly disparate overlaps
    # Build sparse graph from links
    G = sps.eye(n_bkpts, dtype=np.uint16, format='lil')
    for i, j in indexed_links:
        if (samples_overlap(bkpts[i], bkpts[j]) and
                close_enough(bkpts[i], bkpts[j])):
            G[i, j] = 1

    # Generate lists of clustered breakpoints
    n_comp, comp_list = sps.csgraph.connected_components(G)
    clusters = [deque() for x in range(n_comp)]
    for i, c_label in enumerate(comp_list):
        clusters[c_label].append(bkpts[i])

    # # Remove clusters of only CNV - leftover from shared sample filtering
    # def _ok_cluster(cluster):
    #     ok = any([record.info['SVTYPE'] not in cnvtypes for record in cluster])
    #     return ok

    # clusters = [c for c in clusters if _ok_cluster(c)]
    #  clusters = [c for c in clusters if len(c) > 1]

    return clusters
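
# link_cpx assumes a module-level close_enough helper taking two
# VariantRecords. This sketch mirrors the interval-based version inside
# integrate_melt above, using the cluster distance as the window; the
# original's exact criterion may differ.
def close_enough_sketch(recordA, recordB, cpx_dist=2000):
    """True if both breakpoints of two records fall within cpx_dist."""
    return (abs(recordA.pos - recordB.pos) < cpx_dist
            and abs(recordA.stop - recordB.stop) < cpx_dist)
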
def link_cpx(vcf, bkpt_window=300):
    """
    Parameters
    ----------
    vcfpath : str
        Path to breakpoint VCF
    """

    bt = svu.vcf2bedtool(vcf.filename, annotate_ins=False)

    # Identify breakpoints which overlap within specified window
    overlap = bt.window(bt, w=bkpt_window).saveas()

    # Exclude intersections where two DELs or two DUPs cluster together
    overlap = overlap.filter(
        lambda b: not (b.fields[4] == "DEL" and b.fields[10] == "DEL")).saveas()
    overlap = overlap.filter(
        lambda b: not (b.fields[4] == "DUP" and b.fields[10] == "DUP")).saveas()

    # Get linked variant IDs
    links = [(b[3], b[9]) for b in overlap.intervals]
    linked_IDs = natsort.natsorted(set(itertools.chain.from_iterable(links)))
    linked_IDs = np.array(linked_IDs)

    # Map variant IDs to indices
    bkpt_idxs = {ID: i for i, ID in enumerate(linked_IDs)}
    indexed_links = np.array([(bkpt_idxs[a], bkpt_idxs[b]) for a, b in links])

    # Extract VariantRecords corresponding to breakpoints
    n_bkpts = len(linked_IDs)
    bkpts = extract_breakpoints(vcf, bkpt_idxs)

    # Build called sample index
    # Get lists of called samples for each record
    sample_sets_dict = {
        idx: set(svu.get_called_samples(bkpts[idx]))
        for idx in set(indexed_links.flatten().tolist())
    }

    # Exclude wildly disparate overlaps
    # Build sparse graph from links
    G = sps.eye(n_bkpts, dtype=np.uint16, format='lil')
    for i, j in indexed_links:
        if (samples_overlap(sample_sets_dict[i], sample_sets_dict[j])
                and close_enough(bkpts[i], bkpts[j])):
            G[i, j] = 1

    # Generate lists of clustered breakpoints
    n_comp, comp_list = sps.csgraph.connected_components(G)
    clusters = [deque() for x in range(n_comp)]
    for i, c_label in enumerate(comp_list):
        clusters[c_label].append(bkpts[i])

    return clusters
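
# Both link_cpx variants assume an extract_breakpoints(vcf, bkpt_idxs) helper.
# This sketch provides the behavior the callers rely on: a list of
# VariantRecords positioned by their index in bkpt_idxs.
def extract_breakpoints_sketch(vcf, bkpt_idxs):
    """Collect the VariantRecords named in bkpt_idxs, ordered by index."""
    bkpts = [None] * len(bkpt_idxs)
    vcf.reset()
    for record in vcf:
        idx = bkpt_idxs.get(record.id)
        if idx is not None:
            bkpts[idx] = record
    return bkpts
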
def annotate_vcf(vcf, gencode, noncoding, annotated_vcf):
    """
    Parameters
    ----------
    vcf : pysam.VariantFile
    gencode : pbt.BedTool
        Gencode gene annotations
    noncoding : pbt.BedTool
        Noncoding elements
    annotated_vcf : str
        Path to output VCF
    """

    # Add metadata lines for annotations
    header = vcf.header

    if gencode is not None:
        for line in GENCODE_INFO:
            header.add_line(line)
    if noncoding is not None:
        for line in NONCODING_INFO:
            header.add_line(line)

    # Open output file
    fout = pysam.VariantFile(annotated_vcf, 'w', header=header)

    # Annotate genic hits
    if isinstance(vcf.filename, bytes):
        fname = vcf.filename.decode()
    else:
        fname = vcf.filename
    sv = svu.vcf2bedtool(fname,
                         split_bnd=True,
                         split_cpx=True,
                         simple_sinks=True,
                         include_unresolved=False)

    effects = annotate(sv, gencode, noncoding)
    effects = effects.to_dict(orient='index')
    # Add results to variant records and save
    for record in vcf:
        anno = effects.get(record.id)
        if anno is None:
            fout.write(record)
            continue

        # Handle general catch-all intersection for MULTIALLELIC variants
        if 'MULTIALLELIC' in record.filter:
            multi_ovr = []
            for info, genelist in anno.items():
                if info in 'LOF DUP_LOF COPY_GAIN DUP_PARTIAL'.split():
                    if genelist != 'NA':
                        for gene in genelist.split(','):
                            if gene not in multi_ovr:
                                multi_ovr.append(gene)
                else:
                    if genelist != 'NA':
                        record.info[info] = genelist
            if len(multi_ovr) > 0:
                record.info['MSV_EXON_OVR'] = ','.join(multi_ovr)
        else:
            for info, genelist in anno.items():
                if genelist != 'NA':
                    record.info[info] = genelist

        if 'NEAREST_TSS' in record.info:
            record.info['INTERGENIC'] = True

        fout.write(record)

    fout.close()
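
# A minimal driver for annotate_vcf. File names are hypothetical;
# GENCODE_INFO, NONCODING_INFO, and annotate() are assumed to be defined
# elsewhere in this module.
def _demo_annotate_vcf():
    vcf = pysam.VariantFile('input.vcf.gz')
    gencode = pybedtools.BedTool('gencode.bed.gz')
    noncoding = pybedtools.BedTool('noncoding_elements.bed.gz')
    annotate_vcf(vcf, gencode, noncoding, 'annotated.vcf.gz')
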
def filter_cnv_on_coverage(single_sample_vcf_file, ref_vcf_file, svtype,
                           case_sample, overlap_frac, variant_gts_allowed):
    single_sample_vcf = single_sample_vcf_file
    ref_vcf = ref_vcf_file

    single_sample_bed = svu.vcf2bedtool(single_sample_vcf,
                                        annotate_ins=False,
                                        include_samples=True,
                                        svtypes=[svtype])
    ref_bed = svu.vcf2bedtool(ref_vcf,
                              annotate_ins=False,
                              include_samples=True,
                              svtypes=[svtype])

    # in bash bedtools this gets the results we want:
    # bedtools coverage -a single_sample_calls.bed -b ref_panel_calls.bed -d \ # compute per-base coverage of query by intervals in ref
    #  | awk '{OFS="\t"; print $1,$2,$3,$8,$9}' \ # slim down the data by removing sample list, extra fields
    #  | bedtools groupby -g 1,2,3,5 -c 4 -o min,max \ # group together regions with the same coverage value
    #  | awk '$5 > 0 {OFS="\t"; print $1,$2+$5-1,$2+$6}' \ # make these regions into new bed intervals
    #  | bedtools intersect -a stdin -b ref_panel_calls.bed -wb \ # print out the ref intervals that overlapped these regions
    #  | bedtools groupby -g 1,2,3 -c 10 -o distinct \ # condense the sample lists
    #  | bedtools intersect -a single_sample_calls.bed -b stdin -wao # intersect with the query, printing the amount of overlap
    #
    # pybedtools is unable to handle this pipeline without blowing up disk
    # space, due to its lack of working streaming support
    #
    # subprocess streaming equivalent:
    single_sample_bed.saveas('single_sample_calls.bed')
    ref_bed.saveas('ref_panel_calls.bed')

    cov_hist = subprocess.Popen([
        'bedtools', 'coverage', '-a', 'single_sample_calls.bed', '-b',
        'ref_panel_calls.bed', '-d'
    ],
                                stdout=subprocess.PIPE)
    cov_hist_slim = subprocess.Popen(
        ['awk', '{OFS="\t"; print $1,$2,$3,$8,$9}'],
        stdin=cov_hist.stdout,
        stdout=subprocess.PIPE)
    cov_reg_grouped = subprocess.Popen(
        ['bedtools', 'groupby', '-g', '1,2,3,5', '-c', '4', '-o', 'min,max'],
        stdin=cov_hist_slim.stdout,
        stdout=subprocess.PIPE)
    cov_reg_grp_fix = subprocess.Popen(
        ['awk', '$5 > 0 {OFS="\t"; print $1,$2+$5-1,$2+$6}'],
        stdin=cov_reg_grouped.stdout,
        stdout=subprocess.PIPE)
    cov_reg_ref_ovl = subprocess.Popen([
        'bedtools', 'intersect', '-a', 'stdin', '-b', 'ref_panel_calls.bed',
        '-wb'
    ],
                                       stdin=cov_reg_grp_fix.stdout,
                                       stdout=subprocess.PIPE)
    cov_reg_ref_cds = subprocess.Popen(
        ['bedtools', 'groupby', '-g', '1,2,3', '-c', '10', '-o', 'distinct'],
        stdin=cov_reg_ref_ovl.stdout,
        stdout=subprocess.PIPE)
    final_intersect_process = subprocess.Popen(
        [
            'bedtools', 'intersect', '-a', 'single_sample_calls.bed', '-b',
            'stdin', '-wao'
        ],
        stdin=cov_reg_ref_cds.stdout,
        stdout=open('final_merged_intersection.bed', 'w'))

    data = final_intersect_process.communicate()[0]  # expect this to be empty
    return_code = final_intersect_process.returncode
    if return_code != 0:
        raise Exception(
            'intersection pipeline process exited with return code ' +
            str(return_code))

    intersection = pybedtools.BedTool('final_merged_intersection.bed')
    filtered_variant_ids = []

    current_case_id = ''
    has_ref_panel_gts = False
    bases_covered_by_matching_calls = 0
    current_case_length = -1

    for intx in intersection:
        new_case_id = intx.name

        if new_case_id != current_case_id:
            if current_case_id != '':
                covered_by_matching_case_calls = (
                    bases_covered_by_matching_calls /
                    current_case_length) > overlap_frac
                if has_ref_panel_gts and not covered_by_matching_case_calls:
                    filtered_variant_ids.append(current_case_id)
            current_case_id = new_case_id
            has_ref_panel_gts = False
            bases_covered_by_matching_calls = 0
            current_case_length = intx.end - intx.start

        variant_samples = set(intx.fields[6].split(','))
        if case_sample in variant_samples:
            variant_samples.remove(case_sample)
        if len(variant_samples) > variant_gts_allowed:
            has_ref_panel_gts = True

        if intx.fields[7] != ".":
            ref_panel_gts = set(intx.fields[10].split(','))
            if len(variant_samples - ref_panel_gts) <= variant_gts_allowed:
                bases_covered_by_matching_calls += int(intx.fields[11])
    # Evaluate the final case: the ID-change branch above never fires after
    # the last interval
    if current_case_id != '':
        covered_by_matching_case_calls = (
            bases_covered_by_matching_calls /
            current_case_length) > overlap_frac
        if has_ref_panel_gts and not covered_by_matching_case_calls:
            filtered_variant_ids.append(current_case_id)
    return filtered_variant_ids
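
# Usage mirrors filter_on_reciprocal_overlap, but this variant shells out to
# bedtools (which must be on PATH) and writes intermediate BED files to the
# working directory. File names and sample ID below are hypothetical.
def _demo_filter_cnv_on_coverage():
    ids = filter_cnv_on_coverage('case.vcf.gz', 'ref_panel.vcf.gz',
                                 svtype='DUP', case_sample='CASE01',
                                 overlap_frac=0.5, variant_gts_allowed=1)
    print(len(ids), 'case CNVs filtered on reference-panel coverage')
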
def vcf2bed(argv):
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('vcf', help='VCF to convert.')
    parser.add_argument('bed', help='Converted bed. Specify `-` or `stdout` '
                        'to write to stdout.')
    parser.add_argument('--no-samples', dest='include_samples',
                        action='store_false', default=True,
                        help='Don\'t include comma-delimited list of called '
                        'samples for each variant.')
    parser.add_argument('-i', '--info', action='append',
                        help='INFO field to include as column in output. '
                        'May be specified more than once. To include all INFO '
                        'fields, specify `--info ALL`. INFO fields are '
                        'reported in the order in which they are requested. '
                        'If ALL INFO fields are requested, they are reported '
                        'in the order in which they appear in the VCF header.')
    parser.add_argument('--include-filters', action='store_true', default=False,
                        help='Include FILTER status in output, with the same '
                             'behavior as an INFO field.')
    parser.add_argument('--split-bnd', action='store_true', default=False,
                        help='Report two entries in bed file for each BND.')
    parser.add_argument('--split-cpx', action='store_true', default=False,
                        help='Report entries for each CPX rearrangement interval.')
    parser.add_argument('--no-header', dest='header', action='store_false',
                        default=True, help='Suppress header.')
    parser.add_argument('--no-sort-coords', dest='no_sort_coords', action='store_true',
                        default=False, help='Do not sort start/end coordinates '
                        'per record before writing to bed.')
    parser.add_argument('--no-unresolved', dest='no_unresolved', action='store_true',
                        default=False, help='Do not output unresolved variants.')
    parser.add_argument('--simple-sinks', dest='simple_sinks', action='store_true',
                        default=False, help='Report all INS sinks as 1bp intervals.')

    # Print help if no arguments specified
    if len(argv) == 0:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(argv)

    if args.vcf in '- stdin'.split():
        vcf = pysam.VariantFile(sys.stdin)
    else:
        vcf = pysam.VariantFile(args.vcf)

    header = '#chrom start end name svtype'.split()
    if args.include_samples:
        header.append('samples')
    if args.info:
        if 'ALL' in args.info:
            header = header + list(vcf.header.info.keys())
        else:
            header = header + args.info
    if args.include_filters:
        header = header + ['FILTER']
    header = '\t'.join(header)

    include_unresolved = not args.no_unresolved

    bt = svu.vcf2bedtool(vcf,
                         split_bnd=args.split_bnd,
                         include_samples=args.include_samples,
                         include_strands=False,
                         split_cpx=args.split_cpx,
                         include_infos=args.info,
                         annotate_ins=False,
                         report_alt=True,
                         no_sort_coords=args.no_sort_coords,
                         simple_sinks=args.simple_sinks,
                         include_unresolved=include_unresolved,
                         include_filters=args.include_filters)

    if args.bed in 'stdout -'.split():
        if args.header:
            sys.stdout.write(header + '\n')
        sys.stdout.write(str(bt))
    else:
        if args.header:
            bt.saveas(args.bed, trackline=header)
        else:
            bt.saveas(args.bed)
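
# A minimal driver for the vcf2bed CLI entry point (hypothetical file names);
# in svtk this is exposed as the `svtk vcf2bed` subcommand.
def _demo_vcf2bed():
    vcf2bed(['input.vcf.gz', 'output.bed',
             '--split-bnd', '--info', 'SVLEN', '--info', 'SVTYPE'])
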
def merge_pesr_depth(pesr_vcf, depth_vcf, frac=0.8):
    # Memory inefficient but it's easier and shouldn't matter too much
    # now that the variants have been filtered down
    records = dict()
    records['pesr'] = {record.id: record for record in pesr_vcf}
    records['depth'] = {record.id: record for record in depth_vcf}

    # Wipe MEMBERS from prior clustering
    for source in 'pesr depth'.split():
        for ID, record in records[source].items():
            record.info['MEMBERS'] = [ID]

    # Reset for bedtool creation
    pesr_vcf.reset()
    depth_vcf.reset()
    pesr_bed = svu.vcf2bedtool(pesr_vcf, split_bnd=False,
                               include_strands=False)
    depth_bed = svu.vcf2bedtool(depth_vcf, split_bnd=False,
                                include_strands=False)

    # Merge depth records with PE/SR records if they share 80% recip overlap
    sect = pesr_bed.intersect(depth_bed, wa=True, wb=True, r=True, f=frac)

    filtered_depth_IDs = deque()
    for pair in sect.intervals:
        # Check SV types match
        if pair.fields[4] != pair.fields[9]:
            continue

        pesr_id, depth_id = pair.fields[3], pair.fields[8]

        # Add depth record's samples to PE/SR
        filtered_depth_IDs.append(depth_id)
        pesr_record = records['pesr'][pesr_id]
        depth_record = records['depth'][depth_id]

        # Update metadata and samples
        pesr_record.info['MEMBERS'] = (pesr_record.info['MEMBERS'] +
                                       (depth_record.id, ))
        pesr_record.info['SOURCES'] = pesr_record.info['SOURCES'] + ('depth', )
        add_samples(pesr_record, depth_record)

    # Remove overlapping depth records (not performed inside the loop above,
    # to account for double overlaps)
    # TODO: handle double overlap of depth calls
    filtered_depth_IDs = set(filtered_depth_IDs)
    for ID in filtered_depth_IDs:
        records['depth'].pop(ID)

    # For each remaining depth-only call, add its samples to a PE/SR record
    # if that record covers 90% of the depth call.
    sect = pesr_bed.intersect(depth_bed, wa=True, wb=True, F=0.9)

    for pair in sect.intervals:
        # Check SV types match
        if pair.fields[4] != pair.fields[9]:
            continue

        pesr_id, depth_id = pair.fields[3], pair.fields[8]

        # Skip depth records we already added with 80% reciprocal
        if depth_id in filtered_depth_IDs:
            continue

        # If sample is in both depth record and pe/sr record, remove it from
        # depth record
        depth_record = records['depth'][depth_id]
        pesr_record = records['pesr'][pesr_id]

        merge_nested_depth_record(pesr_record, depth_record)

    # Merge records together
    def _sort_key(record):
        return (record.chrom, record.pos, record.info['CHR2'], record.stop)

    pesr_records = sorted(records['pesr'].values(), key=_sort_key)
    depth_records = sorted(records['depth'].values(), key=_sort_key)
    for record in heapq.merge(pesr_records, depth_records, key=_sort_key):
        # Clean out unwanted format keys (copy the keys first; deleting while
        # iterating over the live view is unsafe)
        for key in list(record.format.keys()):
            if key != 'GT':
                del record.format[key]

        record.info['SOURCES'] = sorted(set(record.info['SOURCES']))
        record.info['MEMBERS'] = sorted(set(record.info['MEMBERS']))

        # Skip emptied depth records
        if len(svu.get_called_samples(record)) == 0:
            continue

        yield record
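
# merge_pesr_depth (above) assumes an add_samples helper. This is only a
# sketch of the behavior relied on here, marking the depth record's called
# samples as carriers on the PE/SR record; the original may merge genotypes
# more carefully.
def add_samples_sketch(pesr_record, depth_record):
    for sample in svu.get_called_samples(depth_record):
        # Hypothetical: treat every depth carrier as heterozygous
        pesr_record.samples[sample]['GT'] = (0, 1)
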
def merge_pesr_depth(pesr_vcf, depth_vcf, frac=0.5, sample_overlap=0.5):
    # Memory inefficient but it's easier and shouldn't matter too much
    # now that the variants have been filtered down
    records = dict()
    records['pesr'] = {record.id: record for record in pesr_vcf}
    records['depth'] = {record.id: record for record in depth_vcf}

    # Wipe MEMBERS from prior clustering
    for source in 'pesr depth'.split():
        for ID, record in records[source].items():
            record.info['MEMBERS'] = [ID]

    # Reset for bedtool creation
    pesr_vcf.reset()
    base_record = next(pesr_vcf)

    # Reset for bedtool creation
    pesr_vcf.reset()
    depth_vcf.reset()
    pesr_bed = svu.vcf2bedtool(pesr_vcf, split_bnd=False,
                               include_samples=True,
                               include_strands=False, 
                               report_alt=False)
    depth_bed = svu.vcf2bedtool(depth_vcf, split_bnd=False,
                                include_samples=True,
                                include_strands=False, 
                                report_alt=False)

    # Remove records with no called samples
    def _filter_allref(feature):
        "Returns False if feature has no called samples"
        keep = False
        if len(feature.fields) == 6:
            samples = feature.fields[5]
            if samples not in ['.', '']:
                keep = True
        return keep

    pesr_bed = pesr_bed.filter(_filter_allref).saveas('filtered_pesr.bed')
    depth_bed = depth_bed.filter(_filter_allref).saveas('filtered_depth.bed')
    

    # Merge depth records with PE/SR records if they share 50% recip overlap
    sect = pesr_bed.intersect(depth_bed, wa=True, wb=True, r=True, f=frac)

    filtered_depth_IDs = deque()
    for pair in sect.intervals:
        # Check SV types match
        if pair.fields[4] != pair.fields[10]:
            continue

        # Get vcf records
        pesr_id, depth_id = pair.fields[3], pair.fields[9]
        pesr_record = records['pesr'][pesr_id]
        depth_record = records['depth'][depth_id]

        # Check for >=50% sample overlap
        samp_ovr = svu.samples_overlap(samplesA=pair.fields[5].split(','),
                                       samplesB=pair.fields[11].split(','))
        if not samp_ovr:
            continue

        # Note removal of depth ID
        filtered_depth_IDs.append(depth_id)
        
        # Update metadata and samples
        pesr_record.info['MEMBERS'] = (pesr_record.info.get('MEMBERS', ()) +
                                       (depth_record.id, ))
        pesr_record.info['ALGORITHMS'] = pesr_record.info['ALGORITHMS'] + ('depth', )

        svu.update_best_genotypes(pesr_record,
                                  [pesr_record, depth_record],
                                  preserve_multiallelic=True)

        if 'varGQ' in pesr_record.info.keys() and 'varGQ' in depth_record.info.keys():
            pesr_record.info['varGQ'] = max(pesr_record.info['varGQ'],
                                            depth_record.info['varGQ'])

        # Union the evidence (EV) values per sample; guard on each record's
        # per-sample FORMAT keys, since EV is a FORMAT field
        for sample in pesr_record.samples:
            if ('EV' in pesr_record.samples[sample].keys()
                    and 'EV' in depth_record.samples[sample].keys()):
                pesr_ev = pesr_record.samples[sample]['EV']
                depth_ev = depth_record.samples[sample]['EV']
                pesr_record.samples[sample]['EV'] = tuple(
                    sorted(set(pesr_ev).union(depth_ev)))

    # Remove overlapping depth records (not performed inside the loop above,
    # to account for double overlaps)
    # TODO: handle double overlap of depth calls
    filtered_depth_IDs = set(filtered_depth_IDs)
    for ID in filtered_depth_IDs:
        records['depth'].pop(ID)

    # In remaining depth-only calls, add samples to PE/SR record if the
    # record covers 90% of the depth-only call.
    # SFARI ONLY - REMOVED FOR OTHER ANALYSES
#    sect = pesr_bed.intersect(depth_bed, wa=True, wb=True, F=0.9)
#
#    for pair in sect.intervals:
#        # Check SV types match
#        if pair.fields[4] != pair.fields[10]:
#            continue
#
#        pesr_id, depth_id = pair.fields[3], pair.fields[9]
#
#        # Skip depth records we already added with 50% reciprocal
#        if depth_id in filtered_depth_IDs:
#            continue
#
#        # If sample is in both depth record and pe/sr record, remove it from
#        # depth record
#        depth_record = records['depth'][depth_id]
#        pesr_record = records['pesr'][pesr_id]
#
#        merge_nested_depth_record(pesr_record, depth_record)
    
    # Merge records together
    def _sort_key(record):
        return (record.chrom, record.pos, record.info['CHR2'], record.stop)

    pesr_records = sorted(records['pesr'].values(), key=_sort_key)
    depth_records = sorted(records['depth'].values(), key=_sort_key)
    depth_records = [clean_depth_record(base_record, r) for r in depth_records]

    for record in heapq.merge(pesr_records, depth_records, key=_sort_key):
        # Clean out unwanted format keys
        # EDIT - this should be handled upstream by add_genotypes
        #  FORMATS = 'GT GQ RD_CN RD_GQ PE_GT PE_GQ SR_GT SR_GQ EV'.split()
        #  for key in record.format.keys():
            #  if key not in FORMATS:
                #  del record.format[key]

        record.info['ALGORITHMS'] = sorted(set(record.info['ALGORITHMS']))
        record.info['MEMBERS'] = sorted(set(record.info.get('MEMBERS', ())))

        # Skip emptied depth records
        if len(svu.get_called_samples(record)) == 0:
            continue

        yield record
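
# merge_pesr_depth is a generator; a minimal driver (hypothetical file names;
# clean_depth_record is assumed to be defined elsewhere in this module).
def _demo_merge_pesr_depth():
    pesr = pysam.VariantFile('pesr.vcf.gz')
    depth = pysam.VariantFile('depth.vcf.gz')
    fout = pysam.VariantFile('merged.vcf', 'w', header=pesr.header)
    for record in merge_pesr_depth(pesr, depth, frac=0.5):
        fout.write(record)
    fout.close()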