Example #1
import numpy as np

import svtk.utils as svu  # assumed alias: svu refers to svtk.utils in these excerpts


def merge_linked_depth_calls(vcf, ID_links):
    """
    Merge depth calls whose IDs were linked by an upstream bedtools merge.

    vcf : pysam.VariantFile
    ID_links : list of (str, str)
        Pairs of record IDs to cluster and merge.
    """

    # Make list of linked IDs and build map to corresponding records
    linked_IDs = sorted(set([ID for link in ID_links for ID in link]))
    record_map = {}

    # If a record wasn't linked by the bedtools merge, just yield it unchanged
    for record in vcf:
        if record.id not in linked_IDs:
            yield record
        else:
            record_map[record.id] = record

    # Ignore links on other chromosomes
    linked_IDs = sorted(record_map.keys())
    ID_links = [
        l for l in ID_links if l[0] in linked_IDs and l[1] in linked_IDs
    ]

    # Convert links from pairs of IDs to pairs of records
    record_links = np.empty([len(ID_links), 2], dtype=object)
    for i, link in enumerate(ID_links):
        record_links[i, 0] = record_map[link[0]]
        record_links[i, 1] = record_map[link[1]]

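    # Cluster the linked records; slink (presumably single-linkage clustering,
    # judging by the name) is a helper defined elsewhere in the source module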
    clusters = slink(record_links, record_map)

    # Merge clusters
    for cluster in clusters:
        if len(cluster) == 1:
            yield cluster[0]
            continue

        # Take maximal region
        start = np.min([record.pos for record in cluster])
        end = np.max([record.stop for record in cluster])

        merged_record = cluster[0].copy()
        merged_record.pos = start
        merged_record.stop = end
        merged_record.info['SVLEN'] = end - start

        members = list(merged_record.info['MEMBERS']) + [r.id for r in cluster]
        merged_record.info['MEMBERS'] = members

        # Take union of called samples
        svu.update_best_genotypes(merged_record,
                                  cluster,
                                  preserve_multiallelic=True)

        yield merged_record
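A minimal usage sketch, assuming the surrounding module provides slink and that the link pairs come from an upstream bedtools merge step (file names here are hypothetical):

import pysam

vcf = pysam.VariantFile('depth_calls.vcf.gz')
fout = pysam.VariantFile('merged_depth_calls.vcf', 'w', header=vcf.header)

# Hypothetical ID pairs, e.g. parsed from bedtools merge output
ID_links = [('depth_call_1', 'depth_call_2'), ('depth_call_2', 'depth_call_3')]

for record in merge_linked_depth_calls(vcf, ID_links):
    fout.write(record)
fout.close()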
Example #2
    def _merge_pair(record_a, record_b):
        is_depth_a = _record_is_depth(record_a)
        is_depth_b = _record_is_depth(record_b)
        if is_depth_a == is_depth_b:
            raise ValueError(
                "Attempted to merge a pesr/pesr or depth/depth pair")
        if is_depth_a:
            depth_record = record_a
            pesr_record = record_b
        else:
            pesr_record = record_a
            depth_record = record_b

        pesr_record.info['ALGORITHMS'] = tuple(
            sorted(set(pesr_record.info['ALGORITHMS'] + ('depth', ))))
        pesr_record.info['MEMBERS'] = tuple(
            sorted(
                set(pesr_record.info['MEMBERS'] +
                    depth_record.info['MEMBERS'])))

        svu.update_best_genotypes(pesr_record, [pesr_record, depth_record],
                                  preserve_multiallelic=True)

        if 'varGQ' in pesr_record.info and 'varGQ' in depth_record.info:
            pesr_record.info['varGQ'] = max(pesr_record.info['varGQ'],
                                            depth_record.info['varGQ'])

        for sample in pesr_record.samples:
            if ('EV' in pesr_record.samples[sample].keys()
                    and 'EV' in depth_record.samples[sample].keys()):
                pesr_ev = pesr_record.samples[sample]['EV']
                depth_ev = depth_record.samples[sample]['EV']
                pesr_record.samples[sample]['EV'] = tuple(
                    sorted(set(pesr_ev).union(depth_ev)))

        _cache_sample_overlap(pesr_record, force=True)
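Both _record_is_depth and _cache_sample_overlap are helpers defined elsewhere in the source module. A minimal sketch of the depth check, under the assumption that a depth-only call is one whose ALGORITHMS field contains only 'depth':

def _record_is_depth(record):
    # Assumed convention: depth-only calls carry 'depth' as their sole algorithm
    algorithms = record.info.get('ALGORITHMS', ())
    return tuple(algorithms) == ('depth',)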
Example #3
    def make_record(self):
        self.vcf_record = self.records[0].copy()
        svu.update_best_genotypes(self.vcf_record,
                                   self.records,
                                   preserve_multiallelic=False)
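For context, make_record reads like a method of a small cluster wrapper; a minimal sketch of such a wrapper, with the class name and constructor being assumptions rather than part of the source:

import svtk.utils as svu


class VariantCluster:
    """Hypothetical container for a group of pysam.VariantRecord objects."""

    def __init__(self, records):
        self.records = records
        self.vcf_record = None

    def make_record(self):
        # Use the first record as a template, then take the best genotypes
        # observed across all records in the cluster
        self.vcf_record = self.records[0].copy()
        svu.update_best_genotypes(self.vcf_record,
                                  self.records,
                                  preserve_multiallelic=False)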
Example #4
import heapq
from collections import deque

import svtk.utils as svu


def merge_pesr_depth(pesr_vcf, depth_vcf, frac=0.5, sample_overlap=0.5):
    # Memory inefficient but it's easier and shouldn't matter too much
    # now that the variants have been filtered down
    records = dict()
    records['pesr'] = {record.id: record for record in pesr_vcf}
    records['depth'] = {record.id: record for record in depth_vcf}

    # Wipe MEMBERS from prior clustering
    for source in 'pesr depth'.split():
        for ID, record in records[source].items():
            record.info['MEMBERS'] = [ID]

    # Reset and grab a base record (used below when cleaning depth-only records)
    pesr_vcf.reset()
    base_record = next(pesr_vcf)

    # Reset for bedtool creation
    pesr_vcf.reset()
    depth_vcf.reset()
    pesr_bed = svu.vcf2bedtool(pesr_vcf, split_bnd=False,
                               include_samples=True,
                               include_strands=False, 
                               report_alt=False)
    depth_bed = svu.vcf2bedtool(depth_vcf, split_bnd=False,
                                include_samples=True,
                                include_strands=False, 
                                report_alt=False)

    # Remove records with no called samples
    def _filter_allref(feature):
        "Returns True if the feature has called samples, False otherwise"
        if len(feature.fields) == 6:
            samples = feature.fields[5]
            if samples not in ['.', '']:
                return True
        return False

    pesr_bed = pesr_bed.filter(_filter_allref).saveas('filtered_pesr.bed')
    depth_bed = depth_bed.filter(_filter_allref).saveas('filtered_depth.bed')
    

    # Merge depth records with PE/SR records if they share 50% recip overlap
    sect = pesr_bed.intersect(depth_bed, wa=True, wb=True, r=True, f=frac)

    filtered_depth_IDs = deque()
    for pair in sect.intervals:
        # Check SV types match
        if pair.fields[4] != pair.fields[10]:
            continue

        # Get vcf records
        pesr_id, depth_id = pair.fields[3], pair.fields[9]
        pesr_record = records['pesr'][pesr_id]
        depth_record = records['depth'][depth_id]

        # Check for >=50% sample overlap
        samp_ovr = svu.samples_overlap(samplesA=pair.fields[5].split(','),
                                       samplesB=pair.fields[11].split(','))
        if not samp_ovr:
            continue

        # Note removal of depth ID
        filtered_depth_IDs.append(depth_id)
        
        # Update metadata and samples
        pesr_record.info['MEMBERS'] = (pesr_record.info.get('MEMBERS', ()) +
                                       (depth_record.id, ))
        pesr_record.info['ALGORITHMS'] = pesr_record.info['ALGORITHMS'] + ('depth', )

        svu.update_best_genotypes(pesr_record,
                                  [pesr_record, depth_record],
                                  preserve_multiallelic=True)

        if 'varGQ' in pesr_record.info and 'varGQ' in depth_record.info:
            pesr_record.info['varGQ'] = max(pesr_record.info['varGQ'],
                                            depth_record.info['varGQ'])

        for sample in pesr_record.samples:
            if ('EV' in pesr_record.samples[sample].keys()
                    and 'EV' in depth_record.samples[sample].keys()):
                pesr_ev = pesr_record.samples[sample]['EV']
                depth_ev = depth_record.samples[sample]['EV']
                pesr_record.samples[sample]['EV'] = tuple(sorted(set(pesr_ev).union(depth_ev)))

    # Remove overlapping depth records (not performed in the for loop to account
    # for double overlaps)
    # TODO: handle double overlap of depth calls
    for ID in set(filtered_depth_IDs):
        records['depth'].pop(ID)

    # In remaining depth-only calls, add samples to PE/SR record if the
    # record covers 90% of the depth-only call.
    # SFARI ONLY - REMOVED FOR OTHER ANALYSES
#    sect = pesr_bed.intersect(depth_bed, wa=True, wb=True, F=0.9)
#
#    for pair in sect.intervals:
#        # Check SV types match
#        if pair.fields[4] != pair.fields[10]:
#            continue
#
#        pesr_id, depth_id = pair.fields[3], pair.fields[9]
#
#        # Skip depth records we already added with 50% reciprocal
#        if depth_id in filtered_depth_IDs:
#            continue
#
#        # If sample is in both depth record and pe/sr record, remove it from
#        # depth record
#        depth_record = records['depth'][depth_id]
#        pesr_record = records['pesr'][pesr_id]
#
#        merge_nested_depth_record(pesr_record, depth_record)
    
    # Merge records together
    def _sort_key(record):
        return (record.chrom, record.pos, record.info['CHR2'], record.stop)

    pesr_records = sorted(records['pesr'].values(), key=_sort_key)
    depth_records = sorted(records['depth'].values(), key=_sort_key)
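    # clean_depth_record is a helper defined elsewhere in the source module;
    # judging by the call, it rewrites each depth-only record against base_record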
    depth_records = [clean_depth_record(base_record, r) for r in depth_records]

    for record in heapq.merge(pesr_records, depth_records, key=_sort_key):
        # Clean out unwanted format keys
        # EDIT - this should be handled upstream by add_genotypes
        #  FORMATS = 'GT GQ RD_CN RD_GQ PE_GT PE_GQ SR_GT SR_GQ EV'.split()
        #  for key in record.format.keys():
            #  if key not in FORMATS:
                #  del record.format[key]

        record.info['ALGORITHMS'] = sorted(set(record.info['ALGORITHMS']))
        record.info['MEMBERS'] = sorted(set(record.info.get('MEMBERS', ())))

        # Skip emptied depth records
        if len(svu.get_called_samples(record)) == 0:
            continue

        yield record
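A minimal usage sketch, assuming the PE/SR header already declares the INFO and FORMAT fields touched above and that clean_depth_record is available in the module (paths are hypothetical):

import pysam

pesr_vcf = pysam.VariantFile('pesr.vcf.gz')
depth_vcf = pysam.VariantFile('depth.vcf.gz')
fout = pysam.VariantFile('pesr_depth_merged.vcf', 'w', header=pesr_vcf.header)

for record in merge_pesr_depth(pesr_vcf, depth_vcf, frac=0.5):
    fout.write(record)
fout.close()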