Example #1
def samples_overlap(recA, recB, upper_thresh=0.5, lower_thresh=0.5):
    """
    Report whether the samples called in two VCF records overlap sufficiently.

    The fraction of each record's samples shared with the other record is
    calculated. The record with the greater fraction of shared samples must
    meet the upper threshold AND the record with the lesser fraction must
    meet the lower threshold. This is intended to maximize sensitivity for
    rare variants with a false negative call in one of the two breakpoints.

    Parameters
    ----------
    recA : pysam.VariantRecord
    recB : pysam.VariantRecord
    upper_thresh : float, optional
        Minimum sample overlap in record with greater overlap
    lower_thresh : float, optional
        Minimum sample overlap in record with lesser overlap
    Returns
    -------
    samples_overlap : bool
        True if the fractions of shared samples meet both thresholds.
    """
    # Get lists of called samples for each record
    samplesA = set(svu.get_called_samples(recA))
    samplesB = set(svu.get_called_samples(recB))
    # Compute fraction of each record's samples which are shared
    if len(samplesA) > 0 and len(samplesB) > 0:
        shared = samplesA & samplesB
        fracA = len(shared) / len(samplesA)
        fracB = len(shared) / len(samplesB)
        min_frac, max_frac = sorted([fracA, fracB])
    else:
        min_frac, max_frac = [0, 0]
    return min_frac >= lower_thresh and max_frac >= upper_thresh
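The asymmetric thresholds are easiest to see on concrete sets. Below is a minimal, self-contained sketch of the same rule, with literal sample sets standing in for the svu.get_called_samples call:

def _overlap_fractions(samplesA, samplesB):
    # Return (min_frac, max_frac) of shared samples, as in samples_overlap
    if not samplesA or not samplesB:
        return 0.0, 0.0
    shared = samplesA & samplesB
    fracs = sorted((len(shared) / len(samplesA), len(shared) / len(samplesB)))
    return fracs[0], fracs[1]

# Record A: 4 samples, 3 shared (0.75); record B: 3 samples, all shared (1.0).
# With the default upper_thresh=0.5 and lower_thresh=0.5, the records overlap.
min_frac, max_frac = _overlap_fractions({'s1', 's2', 's3', 's4'},
                                        {'s1', 's2', 's3'})
assert (min_frac, max_frac) == (0.75, 1.0)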
Example #2
def samples_overlap_records(recA,
                            recB,
                            called_samples_dict,
                            upper_thresh=0.5,
                            lower_thresh=0.5):
    if recA.id not in called_samples_dict:
        called_samples_dict[recA.id] = set(svu.get_called_samples(recA))
    if recB.id not in called_samples_dict:
        called_samples_dict[recB.id] = set(svu.get_called_samples(recB))
    return samples_overlap(called_samples_dict[recA.id],
                           called_samples_dict[recB.id],
                           upper_thresh=upper_thresh,
                           lower_thresh=lower_thresh)
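The wrapper's job is lazy memoization: each record's called-sample set is computed at most once per pass, which matters during repeated pairwise comparisons. A toy illustration of the caching pattern, with plain dicts standing in for pysam records and get_called as a hypothetical stand-in for svu.get_called_samples:

def get_called(rec):  # hypothetical stand-in for svu.get_called_samples
    return rec['called']

def cached_samples(rec, cache):
    # Compute and store the sample set only on first sight of this record
    if rec['id'] not in cache:
        cache[rec['id']] = set(get_called(rec))
    return cache[rec['id']]

cache = {}
recA = {'id': 'var1', 'called': ['quadA.p1', 'quadA.fa']}
assert cached_samples(recA, cache) == {'quadA.p1', 'quadA.fa'}
assert 'var1' in cache  # later lookups skip the expensive extraction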
Example #3
def samples_overlap(recordA, recordB, upper_thresh=0.8, lower_thresh=0.5):
    # Get lists of called samples for each record
    samplesA = set(svu.get_called_samples(recordA))
    samplesB = set(svu.get_called_samples(recordB))

    # Compute fraction of each record's samples which are shared
    shared = samplesA & samplesB
    fracA = len(shared) / len(samplesA)
    fracB = len(shared) / len(samplesB)

    min_frac, max_frac = sorted([fracA, fracB])

    return min_frac >= lower_thresh and max_frac >= upper_thresh
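Note that this variant drops the empty-set guard from Example #1, so a record with no called samples raises ZeroDivisionError. A hedged wrapper, assuming the same svu helper and the samples_overlap defined above:

def samples_overlap_safe(recordA, recordB, **kwargs):
    # Treat records with no called samples as non-overlapping instead of erroring
    if not svu.get_called_samples(recordA) or not svu.get_called_samples(recordB):
        return False
    return samples_overlap(recordA, recordB, **kwargs)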
Example #4
def filter_dn_variants(vcf, filterfile, fam, fout):
    """
    Add parent false negatives and remove child false positives from dn filter
    Arguments
    ---------
    vcf : pysam.VariantFile
    filterfile : file
    fam : parsed pedigree (FAM) data, passed through to parse_filtered
    fout : pysam.VariantFile
    """

    # Get dictionaries of samples to add and remove from each variant
    add, remove = parse_filtered(filterfile, fam)

    for record in vcf:
        # Write records unaltered if they weren't included in the de novo check
        if record.id not in add.keys() and record.id not in remove.keys():
            fout.write(record)

        # Otherwise set samples appropriately
        else:
            for sample in add.get(record.id, []):
                record.samples[sample]['GT'] = (0, 1)

            for sample in remove.get(record.id, []):
                set_null(record, sample)

            # Only report record if any samples made it through de novo check
            if len(svu.get_called_samples(record)) > 0:
                fout.write(record)
Example #5
    def choose_background(self, record, whitelist=None, blacklist=None):
        # Select called and background samples
        called = svu.get_called_samples(record)
        background = [s for s in self.samples if s not in called]

        # Permit override of specified white/blacklists
        whitelist = whitelist if whitelist is not None else self.whitelist
        blacklist = blacklist if blacklist is not None else self.blacklist

        def _filter_whitelist(samples):
            return [s for s in samples if s in whitelist]

        def _filter_blacklist(samples):
            return [s for s in samples if s not in blacklist]

        called = _filter_whitelist(called)
        background = _filter_whitelist(background)

        called = _filter_blacklist(called)
        background = _filter_blacklist(background)

        if len(background) >= self.n_background:
            background = np.random.choice(background,
                                          self.n_background,
                                          replace=False).tolist()

        return called, background
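The method combines an override pattern with two filter passes and a random subsample. A standalone sketch of the same chain on plain lists; all names are illustrative, and unlike the original the RNG here is seeded for reproducibility:

import numpy as np

def choose_background_lists(called, all_samples, n_background,
                            whitelist=None, blacklist=None, seed=None):
    called_set = set(called)
    background = [s for s in all_samples if s not in called_set]
    if whitelist is not None:
        called = [s for s in called if s in whitelist]
        background = [s for s in background if s in whitelist]
    if blacklist is not None:
        called = [s for s in called if s not in blacklist]
        background = [s for s in background if s not in blacklist]
    if len(background) >= n_background:
        rng = np.random.default_rng(seed)
        background = rng.choice(background, n_background, replace=False).tolist()
    return called, background

called, background = choose_background_lists(
    ['s1'], ['s1', 's2', 's3', 's4'], n_background=2, seed=0)
assert called == ['s1'] and len(background) == 2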
Example #6
def sfari_filters(vcf):
    for record in vcf:
        called = set(svu.get_called_samples(record))

        # Remove allosomal calls in aneuploidy samples
        if record.chrom in 'X Y'.split():
            if called.issubset(ANEUPLOIDIES):
                continue
            else:
                for sample in called.intersection(ANEUPLOIDIES):
                    svu.set_null(record, sample)

        # Remove depth-only events in Robertsonian translocation cases
        if record.info['SOURCES'] == ('depth', ):
            if called.issubset(DEPTH_EXCLUDED):
                continue

        # Check variant wasn't only in aneuploidies and Robertsonians
        #  called = set(svu.get_called_samples(record))
        #  if len(called) == 0:
        #  continue

        # Remove variants specific to dosage outliers
        #  if called.issubset(OUTLIERS):
        #  continue

        yield record
Example #7
    def scrape_sample_stats(self, record):
        name = record.id
        chrom = record.chrom
        svtype = record.info['SVTYPE']
        if record.info['ALGORITHMS'] == ('depth', ):
            source = 'depth-only'
        elif 'depth' in record.info['ALGORITHMS']:
            source = 'pesr+depth'
        else:
            source = 'pesr-only'

        called = svu.get_called_samples(record)
        inh_status = get_inh(called, self.fam)
        inh_map = {}
        for status, samples in inh_status.items():
            for s in samples:
                inh_map[s] = status

        fmt = ('{sample}\t{name}\t{chrom}\t{svtype}\t{source}\t{inh}\n')

        for sample in called:
            if self.fam.samples[sample].has_parents:
                inh = inh_map[sample]
            else:
                inh = 'parent'

            self.obs_fout.write(fmt.format(**locals()))
Example #8
    def _cache_sample_overlap(record, force=False):
        if force or record.id not in sample_overlap_cache:
            _samples = svu.get_called_samples(record)
            sample_overlap_cache[record.id] = set(sample_id_to_index_dict[s]
                                                  for s in _samples)
            return _samples
        else:
            return sample_overlap_cache[record.id]
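This helper assumes two module-level names: sample_overlap_cache and sample_id_to_index_dict. The point of the index mapping is that sets of small integers are cheaper to intersect than sets of long sample-ID strings. A toy illustration:

sample_id_to_index_dict = {'quadA.p1': 0, 'quadA.fa': 1, 'quadB.p1': 2}

idx_a = {sample_id_to_index_dict[s] for s in ('quadA.p1', 'quadA.fa')}
idx_b = {sample_id_to_index_dict[s] for s in ('quadA.p1', 'quadB.p1')}
assert idx_a & idx_b == {0}  # only quadA.p1 is shared

Example #9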
def is_pilot_denovo(svrecord):
    record = svrecord.record
    is_pilot = record.id.startswith('Pilot')

    called = svu.get_called_samples(record)
    is_private = len(called) == 1
    is_child = called[0].endswith('p1') or called[0].endswith('s1')

    return is_pilot and is_private and is_child
Example #10
def link_cpx(vcf, bkpt_window=300):
    """
    Parameters
    ----------
    vcf : pysam.VariantFile
        Breakpoint VCF
    """

    bt = svu.vcf2bedtool(vcf.filename, annotate_ins=False)

    # Identify breakpoints which overlap within specified window
    overlap = bt.window(bt, w=bkpt_window).saveas()

    # Exclude intersections where two DELs or two DUPs cluster together
    overlap = overlap.filter(
        lambda b: not (b.fields[4] == "DEL" and b.fields[10] == "DEL")).saveas()
    overlap = overlap.filter(
        lambda b: not (b.fields[4] == "DUP" and b.fields[10] == "DUP")).saveas()

    # Get linked variant IDs
    links = [(b[3], b[9]) for b in overlap.intervals]
    linked_IDs = natsort.natsorted(set(itertools.chain.from_iterable(links)))
    linked_IDs = np.array(linked_IDs)

    # Map variant IDs to indices
    bkpt_idxs = {ID: i for i, ID in enumerate(linked_IDs)}
    indexed_links = np.array([(bkpt_idxs[a], bkpt_idxs[b]) for a, b in links])

    # Extract VariantRecords corresponding to breakpoints
    n_bkpts = len(linked_IDs)
    bkpts = extract_breakpoints(vcf, bkpt_idxs)

    # Build called sample index
    # Get lists of called samples for each record
    sample_sets_dict = {
        idx: set(svu.get_called_samples(bkpts[idx]))
        for idx in set(indexed_links.flatten().tolist())
    }

    # Exclude wildly disparate overlaps
    # Build sparse graph from links
    G = sps.eye(n_bkpts, dtype=np.uint16, format='lil')
    for i, j in indexed_links:
        if (samples_overlap(sample_sets_dict[i], sample_sets_dict[j])
                and close_enough(bkpts[i], bkpts[j])):
            G[i, j] = 1

    # Generate lists of clustered breakpoints
    n_comp, comp_list = sps.csgraph.connected_components(G)
    clusters = [deque() for x in range(n_comp)]
    for i, c_label in enumerate(comp_list):
        clusters[c_label].append(bkpts[i])

    return clusters
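The sparse graph step is the heart of link_cpx: breakpoints become nodes, overlap-compatible pairs become edges, and scipy's connected components yield the clusters. A self-contained toy version of that core, with three breakpoints of which only the first two are linked:

import numpy as np
import scipy.sparse as sps
from scipy.sparse import csgraph

n_bkpts = 3
G = sps.eye(n_bkpts, dtype=np.uint16, format='lil')  # self-edges keep singletons
G[0, 1] = 1  # breakpoints 0 and 1 passed the sample-overlap/distance checks

n_comp, comp_list = csgraph.connected_components(G.tocsr())
assert n_comp == 2
assert list(comp_list) == [0, 0, 1]  # {bkpt 0, bkpt 1} cluster; bkpt 2 is alone

Example #11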
def merge_nested_depth_record(pesr_record, depth_record):
    """Add samples from nested depth record to PE/SR record"""

    pesr_record.info['MEMBERS'] = (pesr_record.info['MEMBERS'] +
                                   (depth_record.id, ))
    pesr_record.info['SOURCES'] = pesr_record.info['SOURCES'] + ('depth', )

    def _quad(s):
        return s.split('.')[0]

    depth_samples = svu.get_called_samples(depth_record)
    pesr_samples = svu.get_called_samples(pesr_record)

    # If a sample is called in both the pe/sr record and the nested depth
    # record, move any relatives called in the depth record to the pe/sr record
    for quad, samples in itertools.groupby(depth_samples, _quad):
        samples = list(samples)
        if any([s in pesr_samples for s in samples]):
            for sample in samples:
                svu.set_null(depth_record, sample)
                pesr_record.samples[sample]['GT'] = (0, 1)
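Example #12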
def to_bed(record, cluster):
    fmt = ('{chrom}\t{start}\t{end}\t{name}\t{svtype}\t{samples}\t{sources}\t'
           '{batch}\t{cluster}\n')

    batch = 'Pilot' if 'Pilot' in record.id else 'Phase1'

    return fmt.format(chrom=record.chrom, start=record.pos, end=record.stop,
                      name=record.id + '__' + record.chrom,
                      svtype=record.info['SVTYPE'],
                      samples=','.join(svu.get_called_samples(record)),
                      sources=','.join(record.info['SOURCES']),
                      batch=batch, cluster=cluster)
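Example #13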
def overlap_fail(phase1, pilot, header, dist=300, frac=0.1):
    svc = VCFCluster([phase1, pilot], dist=dist, frac=frac, preserve_ids=True)

    for cluster in svc.cluster(merge=False):
        # Keep any pilot de novo variants
        other_records = []
        for record in cluster.records:
            if is_pilot_denovo(record):
                yield make_new_record(SVRecordCluster([record]), header)
            else:
                other_records.append(record)

        if len(other_records) > 0:
            cluster.records = other_records
        else:
            continue

        # skip phase1-only variants and pilot variants that overlap with
        # rejected phase1 variants
        if is_batch_only(cluster, 'Pilot'):
            records = cluster.records
            depth_only = all([is_depth_only(r) for r in records])

            if len(records) == 1:
                record = make_new_record(cluster, header)
                yield record

            # check that we're not overclustering Pilot variants
            elif len(records) == 2:
                samples = [svu.get_called_samples(r.record) for r in records]

                if samples_overlap(*samples) or depth_only:
                    record = make_new_record(cluster, header)
                    yield record
                else:
                    for svrecord in cluster.records:
                        newcluster = SVRecordCluster([svrecord])
                        record = make_new_record(newcluster, header)
                        yield record

            elif depth_only:
                record = make_new_record(cluster, header)
                yield record

            else:
                raise Exception('Multiple Pilot variants clustered')
Example #14
    def choose_background(self, record, candidates):
        # Exclude called samples and all candidate families from background
        quads = sorted(set([s.split('.')[0] for s in candidates]))
        members = 'fa mo p1 s1'.split()
        related = ['.'.join(s) for s in itertools.product(quads, members)]

        called = set(svu.get_called_samples(record))
        blacklist = called.union(related)

        background = [s for s in self.samples if s not in blacklist]

        if len(background) >= self.n_background:
            background = np.random.choice(background,
                                          self.n_background,
                                          replace=False).tolist()

        return background
Example #15
def process_metadata(variants, bed=False, batch_list=None):
    if bed:
        samples = [s.strip() for s in batch_list.readlines()]
    else:
        samples = list(variants.header.samples)

    parents = [s for s in samples if _is_parent(s)]
    children = [s for s in samples if _is_child(s)]
    n_parents = len(parents)
    n_children = len(children)

    metadata = deque()
    for variant in variants:
        # bed record
        if bed:
            if variant.startswith('#'):
                continue
            data = variant.strip().split()
            called = data[4].split(',')
            name = data[3]
            svtype = data[5]
        # VCF record
        else:
            called = svu.get_called_samples(variant)
            name = variant.id
            svtype = variant.info['SVTYPE']

        # Calculate parental VF
        parents = [s for s in called if _is_parent(s)]
        parental_vf = len(parents) / n_parents

        children = [s for s in called if _is_child(s)]
        child_vf = len(children) / n_children

        if child_vf > 0:
            inh_rate = get_inh_rate(called)
        else:
            inh_rate = 0

        dat = [name, svtype, parental_vf, child_vf, inh_rate]
        metadata.append(dat)

    metadata = np.array(metadata)
    cols = 'name svtype parental_vf child_vf inh_rate'.split()
    metadata = pd.DataFrame(metadata, columns=cols)
    return metadata
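One subtlety in the final conversion: np.array on mixed string/float rows coerces every value to a string, so the numeric columns arrive as text. A small demonstration, plus a hedged alternative that builds the DataFrame directly from the rows:

import numpy as np
import pandas as pd

rows = [['var1', 'DEL', 0.01, 0.02, 0.5]]
assert np.array(rows).dtype.kind == 'U'  # all values coerced to unicode strings

cols = 'name svtype parental_vf child_vf inh_rate'.split()
df = pd.DataFrame(rows, columns=cols)  # preserves per-column dtypes
assert df['parental_vf'].dtype.kind == 'f'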
Example #16
def get_denovo_candidates(record, fam, max_parents=10):
    """
    Obtain list of samples which are putatively called de novo
    """
    called = svu.get_called_samples(record)
    parents = [s for s in called if fam.samples[s].is_parent]

    if len(parents) > max_parents:
        return []

    denovo = []
    for ID in called:
        sample = fam.samples[ID]
        if sample.has_parents:
            if sample.mother not in called and sample.father not in called:
                denovo.append(sample.ID)

    return denovo
Example #17
    def run(self):
        for record in self.vcf:
            # Skip records without any Mendelian violations
            candidates = get_denovo_candidates(record, self.max_parents)
            if len(candidates) == 0:
                continue

            # Restrict to rare (parental VF<0.1%) variants
            c = svu.get_called_samples(record)
            parents = [s for s in c if s.endswith('fa') or s.endswith('mo')]
            if len(parents) > self.max_parents:
                continue

            # Skip non-stranded (wham)
            if record.info['STRANDS'] not in '+- -+ ++ --'.split():
                continue

            self.test_record(record)
Example #18
def main():
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('vcf')
    parser.add_argument('dn_filter')
    parser.add_argument('fout')
    args = parser.parse_args()

    vcf = pysam.VariantFile(args.vcf)

    if args.fout in '- stdout'.split():
        fout = pysam.VariantFile(sys.stdout, 'w', header=vcf.header)
    else:
        fout = pysam.VariantFile(args.fout, 'w', header=vcf.header)

    dn_filter = pd.read_table(args.dn_filter)
    dn_filter = dn_filter.pivot_table(index='name',
                                      columns='operation',
                                      values='sample',
                                      aggfunc=lambda s: ','.join(s))
    for record in vcf:
        if record.id not in dn_filter.index:
            fout.write(record)
            continue

        # Add false negative parents
        parents = dn_filter.loc[record.id, 'add']
        if pd.notnull(parents):
            for sample in parents.split(','):
                record.samples[sample]['GT'] = (0, 1)

        # Remove false positive children
        children = dn_filter.loc[record.id, 'remove']
        if pd.notnull(children):
            for sample in children.split(','):
                svu.set_null(record, sample)

        # Skip variant if the child was only sample
        called = svu.get_called_samples(record)
        if len(called) == 0:
            continue

        fout.write(record)
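The pivot_table call reshapes the long (name, operation, sample) table into one row per variant with comma-joined sample lists; absent combinations come back as NaN, which is what the pd.notnull checks above guard against. A toy demonstration of the reshaping:

import pandas as pd

dn_filter = pd.DataFrame({
    'name': ['var1', 'var1', 'var1', 'var2'],
    'operation': ['add', 'remove', 'remove', 'add'],
    'sample': ['quadA.fa', 'quadA.p1', 'quadA.s1', 'quadB.mo'],
})
table = dn_filter.pivot_table(index='name', columns='operation',
                              values='sample', aggfunc=lambda s: ','.join(s))
assert table.loc['var1', 'remove'] == 'quadA.p1,quadA.s1'
assert pd.isna(table.loc['var2', 'remove'])  # var2 has no 'remove' entries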
Example #19
def scrape_record_stats(record, fam):
    """
    record : pysam.VariantRecord
    fam : parsed pedigree (FAM) data with per-sample has_parents flags
    """
    name = record.id
    svtype = record.info['SVTYPE']

    #  svsize = record.stop - record.pos
    svsize = record.info['SVLEN']
    algorithms = ','.join(record.info['ALGORITHMS'])

    called = svu.get_called_samples(record)
    parents = [s for s in called if not fam.samples[s].has_parents]
    children = [s for s in called if fam.samples[s].has_parents]

    n_called = len(called)
    n_parents = len(parents)
    n_children = len(children)

    homs = [s for s in called if record.samples[s]['GT'] == (1, 1)]
    hom_parents = [s for s in homs if s in parents]
    hets = [s for s in called if record.samples[s]['GT'] == (0, 1)]

    n_homs = len(homs)
    n_hets = len(hets)

    inh_status = get_inh(called, fam)
    n_denovo = len(inh_status['denovo'])
    n_maternal = len(inh_status['maternal'])
    n_paternal = len(inh_status['paternal'])
    n_biparental = len(inh_status['biparental'])

    chrom, start, end = record.chrom, record.pos, record.stop

    statline = ('{chrom}\t{start}\t{end}\t'
                '{name}\t{svtype}\t{svsize}\t{algorithms}\t'
                '{n_called}\t{n_parents}\t{n_children}\t'
                '{n_homs}\t{n_hets}\t'
                '{n_denovo}\t{n_maternal}\t{n_paternal}\t{n_biparental}')

    statline = statline.format(**locals())
    return statline
Example #20
def filter_denovo_records(vcf, fam, max_parents=10):
    for record in vcf:
        # Skip records without any Mendelian violations
        candidates = get_denovo_candidates(record, fam, max_parents)
        if len(candidates) == 0:
            continue

        # Restrict to rare (parental VF<0.1%) variants
        called = svu.get_called_samples(record)
        parents = [s for s in called if fam.samples[s].is_parent]
        if len(parents) > max_parents:
            continue

        # Skip non-stranded (wham)
        if 'STRANDS' in record.info.keys():
            if record.info['STRANDS'] not in '+- -+ ++ --'.split():
                continue

        yield record
Example #21
def get_denovo_candidates(record, max_parents=20):
    """
    Obtain list of samples which are putatively called de novo
    """
    called = svu.get_called_samples(record)
    parents = [s for s in called if s.endswith('fa') or s.endswith('mo')]

    if len(parents) > max_parents:
        return []

    denovo = []
    for quad, samples in itertools.groupby(called, lambda s: s.split('.')[0]):
        # Add putative de novo calls
        samples = list(samples)
        members = [s.split('.')[1] for s in samples]
        if 'fa' not in members and 'mo' not in members:
            denovo += samples

    return denovo
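itertools.groupby only merges consecutive elements, so this function relies on the called-sample list arriving grouped (e.g. sorted) by quad. A small check of that behavior:

import itertools

called = ['quadA.p1', 'quadA.s1', 'quadB.mo', 'quadB.p1']  # grouped by quad
groups = {quad: list(samples) for quad, samples
          in itertools.groupby(called, lambda s: s.split('.')[0])}
assert groups == {'quadA': ['quadA.p1', 'quadA.s1'],
                  'quadB': ['quadB.mo', 'quadB.p1']}
# With an interleaved list like ['quadA.p1', 'quadB.mo', 'quadA.s1'],
# 'quadA' would surface as two separate groups.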
Example #22
def count_svtypes(vcf):
    """
    Count instances of each SVTYPE in each sample in a VCF.

    Parameters
    ----------
    vcf : pysam.VariantFile

    Returns
    -------
    counts : pd.DataFrame
        Columns: sample, svtype, count
    """

    samples = list(vcf.header.samples)

    # Initialize counts per sample - each dict is keyed on svtype
    count_dict = {}
    for sample in samples:
        count_dict[sample] = defaultdict(int)

    for record in vcf:
        for sample in svu.get_called_samples(record):
            # Count the SVTYPE if it's present, otherwise increment NO_SVTYPE
            if 'SVTYPE' in record.info.keys():
                count_dict[sample][record.info['SVTYPE']] += 1
            else:
                count_dict[sample]['NO_SVTYPE'] += 1

    # Convert to dataframe, adding zeros to samples with no instances of a
    # given svtype
    counts = pd.DataFrame.from_dict(count_dict, orient='index')\
                         .fillna(0).astype(int)\
                         .reset_index().rename(columns={'index': 'sample'})

    # Tidy data from "column-per-svtype" format
    counts = pd.melt(counts,
                     id_vars=['sample'],
                     var_name='svtype',
                     value_name='count')

    return counts
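The from_dict / melt round-trip is easiest to verify on toy data. A minimal sketch with two samples and hand-built counts:

import pandas as pd
from collections import defaultdict

count_dict = {'sampleA': defaultdict(int, {'DEL': 2}),
              'sampleB': defaultdict(int, {'DUP': 1})}
counts = pd.DataFrame.from_dict(count_dict, orient='index')\
                     .fillna(0).astype(int)\
                     .reset_index().rename(columns={'index': 'sample'})
tidy = pd.melt(counts, id_vars=['sample'],
               var_name='svtype', value_name='count')
# tidy holds one row per (sample, svtype) pair, with an explicit 0 where a
# sample had no calls of that svtype.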
Example #23
def select_mosaic_candidates(vcf, metrics, mode, cutoffs):
    candidates = metrics.loc[metrics.svtype.isin('DEL DUP'.split())]
    candidates = candidates.loc[candidates.svsize >= 5000]
    candidates = candidates.loc[candidates.poor_region_cov < 0.3]

    if mode == 'pesr':
        candidates = candidates.loc[candidates.RD_log_pval >= cutoffs.min_pval]
        candidates = candidates.loc[candidates.RD_log_2ndMaxP >= cutoffs.min_secondp]
    else:
        pval_filter = (((candidates.svtype == 'DEL') & (candidates.RD_log_pval >= cutoffs.del_min_pval)) |
                       ((candidates.svtype == 'DUP') & (candidates.RD_log_pval >= cutoffs.dup_min_pval)))
        candidates = candidates.loc[pval_filter]

    candidate_IDs = candidates.name.values

    for record in vcf:
        called = svu.get_called_samples(record)
        if len(called) == 1 and record.id in candidate_IDs and record.info['SVTYPE'] in 'DEL DUP'.split():
            yield record
Example #24
    def from_vcf(cls, record, whitelist=None):
        """
        Parameters
        ----------
        record : pysam.VariantRecord
        """

        chrA = record.chrom
        posA = record.pos
        chrB = record.info['CHR2']
        posB = record.stop

        name = record.id
        strands = record.info['STRANDS']

        samples = svu.get_called_samples(record)
        if whitelist is not None:
            samples = [s for s in samples if s in whitelist]

        return cls(chrA, posA, chrB, posB, name, samples, strands)
Example #25
    def choose_background(self, record, candidates):
        # Exclude called samples and all candidate families from background
        related = [c for c in candidates]
        for s in candidates:
            if self.fam.samples[s].has_parents:
                related.append(self.fam.samples[s].father)
                related.append(self.fam.samples[s].mother)

                related += self.fam.samples[self.fam.samples[s].father].children

        called = set(svu.get_called_samples(record))
        blacklist = called.union(related)

        background = [s for s in self.samples if s not in blacklist]

        if len(background) >= self.n_background:
            background = np.random.choice(background, self.n_background,
                                          replace=False).tolist()

        return background
Example #26
    def __init__(self, record):
        self.id = record.id
        if 'EVIDENCE' in record.info:
            ev = set(record.info['EVIDENCE'])
        else:
            ev = set()
        if 'PE' in ev and 'SR' in ev and 'RD' in ev:
            self.level_of_support = 1
        elif 'PE' in ev and 'RD' in ev:
            self.level_of_support = 2
        elif 'PE' in ev and 'SR' in ev:
            self.level_of_support = 3
        elif 'RD' in ev and 'SR' in ev:
            self.level_of_support = 4
        elif 'PE' in ev:
            self.level_of_support = 5
        elif 'RD' in ev:
            self.level_of_support = 6
        elif 'SR' in ev:
            self.level_of_support = 7
        elif len(ev) == 0:
            self.level_of_support = 8
        else:
            raise ValueError("Uninterpretable evidence: {}".format(ev))
        if record.id in bothside_pass:
            self.both_end_support = bothside_pass[record.id]
        else:
            self.both_end_support = 0
        self.sr_fail = record.id in background_fail
        self.is_bnd = record.info['SVTYPE'] == 'BND'
        self.vargq = record.info['varGQ']
        self.called_samples = [
            sample_id_to_idx[s] for s in svu.get_called_samples(record)
        ]
        self.freq = len(self.called_samples)
        self.length = record.info['SVLEN']
        self.gt_50bp = self.length >= 50
        self.is_mei = 'melt' in record.info['ALGORITHMS']
Example #27
def main():
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('vcf', help='Input vcf (supports "stdin").')
    parser.add_argument('minGQtable', help='Tab-delimited minGQ filtering lookup' + 
                        ' table generated by create_minGQ_lookup_table.R.')
    parser.add_argument('fout', help='Output file (supports "stdout").')
    parser.add_argument('-m', '--minGQ', help='Global min GQ', type=int, 
                        default=0, dest='globalMin')
    parser.add_argument('--multiallelics', default=False, action='store_true',
                        help='Also apply filtering to multiallelic sites ' + 
                        '(default: skip multiallelics).')
    parser.add_argument('--dropEmpties', default=False, action='store_true',
                        help='After GT reassignments, drop any SV with no remaining ' + 
                        ' non-ref samples (default: keep all SV).')
    parser.add_argument('--simplify-INS-SVTYPEs', default=False, action='store_true',
                        help='Resets the SVTYPE of all INS variants, including MEIs, ' + 
                        'to be SVTYPE=INS (default: keep original SVTYPEs).')    
    parser.add_argument('--maxNCR', help='Max no-call rate among all ' + 
                        'samples before adding a flag to the record\'s FILTER field' + 
                        ' (default: 0.005)', 
                        type=float, default=0.005, dest='maxNCR')
    parser.add_argument('--cleanAFinfo', help='Remove all AF-related terms from ' + 
                        ' the INFO field and VCF header (default: keep all terms).', 
                        default=False, action='store_true')
    parser.add_argument('--prefix', help='Cohort label to append to NCR FILTER.', 
                        default='COHORT', dest='prefix')

    args = parser.parse_args()

    if args.vcf in '- stdin'.split():
        vcf = pysam.VariantFile(sys.stdin) 
    else:
        vcf = pysam.VariantFile(args.vcf)

    #Add HIGH_NOCALL_RATE filter to vcf header
    NEW_FILTER = '##FILTER=<ID=HIGH_{0}_NOCALL_RATE,Description="More than '.format(args.prefix) + \
                 '{:.2%}'.format(args.maxNCR) + ' of {0} sample GTs were '.format(args.prefix) + \
                 'masked as no-call GTs due to low GQ. Indicates a possibly noisy locus ' + \
                 'in {0} samples.>'.format(args.prefix)
    header = vcf.header
    header.add_line(NEW_FILTER)
    filter_text = 'HIGH_{0}_NOCALL_RATE'.format(args.prefix)

    if args.fout in '- stdout'.split():
        fout = pysam.VariantFile(sys.stdout, 'w', header=vcf.header)
    else:
        fout = pysam.VariantFile(args.fout, 'w', header=vcf.header)

    #Make dummy lookup tables for SVLEN, AF, SVTYPE, FILTER, and EV
    SVLEN_table = _make_SVLEN_interval_dict(args.minGQtable)
    AF_table = _make_AF_interval_dict(args.minGQtable)
    SVTYPE_table = _make_SVTYPE_dict(args.minGQtable)
    FILTER_table = _make_FILTER_dict(args.minGQtable, vcf)
    EV_table = _make_EV_dict(args.minGQtable)

    #Make minGQ lookup table
    minGQ_dict = make_minGQ_dict(args.minGQtable, SVLEN_table, AF_table, 
                                 SVTYPE_table, FILTER_table, EV_table)

    #Iterate over records in vcf and apply filter
    for record in vcf.fetch():
        #Do not process multiallelic variants, unless optioned
        if args.multiallelics or not _is_multiallelic(record):
            apply_minGQ_filter(record, minGQ_dict, SVLEN_table, AF_table, 
                               SVTYPE_table, FILTER_table, EV_table, 
                               globalMin=args.globalMin, maxNCR=args.maxNCR, 
                               highNCR_filter=filter_text)

        if args.cleanAFinfo:
            # Clean biallelic AF annotation
            for key in 'AN AC AF N_BI_GENOS N_HOMREF N_HET N_HOMALT FREQ_HOMREF FREQ_HET FREQ_HOMALT'.split(' '):
                if key in record.info.keys():
                    record.info.pop(key)
            # Clean CN frequency annotation
            for key in 'CN_NUMBER CN_COUNT CN_FREQ CN_NONREF_COUNT CN_NONREF_FREQ'.split():
                if key in record.info.keys():
                    record.info.pop(key)

        # Standardize SVTYPE for all INS variants, if optioned
        if any([keyword in record.info['SVTYPE'].split(':') for keyword in 'INS MEI'.split()]):
            if args.simplify_INS_SVTYPEs:
                record.info['SVTYPE'] = 'INS'

        if args.dropEmpties:
            samps = svu.get_called_samples(record, include_null=False)
            if len(samps) > 0:
                fout.write(record)
        else: 
            fout.write(record)

    fout.close()
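Example #28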
def add_samples(pesr_record, depth_record):
    # TODO: add pesr/depth FORMAT fields
    for sample in svu.get_called_samples(depth_record):
        pesr_record.samples[sample]['GT'] = (0, 1)
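Example #29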
def merge_pesr_depth(pesr_vcf, depth_vcf, frac=0.8):
    # Memory inefficient but it's easier and shouldn't matter too much
    # now that the variants have been filtered down
    records = dict()
    records['pesr'] = {record.id: record for record in pesr_vcf}
    records['depth'] = {record.id: record for record in depth_vcf}

    # Wipe MEMBERS from prior clustering
    for source in 'pesr depth'.split():
        for ID, record in records[source].items():
            record.info['MEMBERS'] = [ID]

    # Reset for bedtool creation
    pesr_vcf.reset()
    depth_vcf.reset()
    pesr_bed = svu.vcf2bedtool(pesr_vcf, split_bnd=False,
                               include_strands=False)
    depth_bed = svu.vcf2bedtool(depth_vcf, split_bnd=False,
                                include_strands=False)

    # Merge depth records with PE/SR records if they share 80% recip overlap
    sect = pesr_bed.intersect(depth_bed, wa=True, wb=True, r=True, f=frac)

    filtered_depth_IDs = deque()
    for pair in sect.intervals:
        # Check SV types match
        if pair.fields[4] != pair.fields[9]:
            continue

        pesr_id, depth_id = pair.fields[3], pair.fields[8]

        # Add depth record's samples to PE/SR
        filtered_depth_IDs.append(depth_id)
        pesr_record = records['pesr'][pesr_id]
        depth_record = records['depth'][depth_id]

        # Update metadata and samples
        pesr_record.info['MEMBERS'] = (pesr_record.info['MEMBERS'] +
                                       (depth_record.id, ))
        pesr_record.info['SOURCES'] = pesr_record.info['SOURCES'] + ('depth', )
        add_samples(pesr_record, depth_record)

    # Remove overlapping depth records (not performed in the for loop to
    # account for double overlaps)
    # TODO: handle double overlap of depth calls
    for ID in set(filtered_depth_IDs):
        records['depth'].pop(ID)

    # In remaining depth-only calls, add samples to PE/SR record if the
    # record covers 90% of the depth-only call.
    sect = pesr_bed.intersect(depth_bed, wa=True, wb=True, F=0.9)

    for pair in sect.intervals:
        # Check SV types match
        if pair.fields[4] != pair.fields[9]:
            continue

        pesr_id, depth_id = pair.fields[3], pair.fields[8]

        # Skip depth records we already added with 80% reciprocal
        if depth_id in filtered_depth_IDs:
            continue

        # If sample is in both depth record and pe/sr record, remove it from
        # depth record
        depth_record = records['depth'][depth_id]
        pesr_record = records['pesr'][pesr_id]

        merge_nested_depth_record(pesr_record, depth_record)

    # Merge records together
    def _sort_key(record):
        return (record.chrom, record.pos, record.info['CHR2'], record.stop)

    pesr_records = sorted(records['pesr'].values(), key=_sort_key)
    depth_records = sorted(records['depth'].values(), key=_sort_key)
    for record in heapq.merge(pesr_records, depth_records, key=_sort_key):
        # Clean out unwanted format keys
        for key in record.format.keys():
            if key != 'GT':
                del record.format[key]

        record.info['SOURCES'] = sorted(set(record.info['SOURCES']))
        record.info['MEMBERS'] = sorted(set(record.info['MEMBERS']))

        # Skip emptied depth records
        if len(svu.get_called_samples(record)) == 0:
            continue

        yield record
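heapq.merge performs the final interleave in sorted order, but only if every input iterable is already sorted by the same key, which the two sorted() calls above guarantee. A tiny demonstration of merging by key:

import heapq

# Stand-ins for VariantRecords already sorted by (chrom, pos)
pesr = [{'chrom': 'chr1', 'pos': 100}, {'chrom': 'chr1', 'pos': 500}]
depth = [{'chrom': 'chr1', 'pos': 300}, {'chrom': 'chr2', 'pos': 50}]
merged = heapq.merge(pesr, depth, key=lambda r: (r['chrom'], r['pos']))
assert [r['pos'] for r in merged] == [100, 300, 500, 50]

Example #30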
def merge_linked_depth_calls(vcf, ID_links, flagged=[]):
    """
    vcf : pysam.VariantFile
    ID_links : list of (str, str)
    flagged : list of str
        Multiallelic sites
    """

    # Make list of linked IDs and build map to corresponding records
    linked_IDs = sorted(set([ID for link in ID_links for ID in link]))
    record_map = {}

    # If a record wasn't linked by the bedtools merge, just yield it unchanged
    for record in vcf:
        if record.id not in linked_IDs:
            yield record
        else:
            record_map[record.id] = record

    # Ignore links on other chromosomes
    linked_IDs = sorted(record_map.keys())
    ID_links = [
        l for l in ID_links if l[0] in linked_IDs and l[1] in linked_IDs
    ]

    # Convert links from pairs of IDs to pairs of records
    record_links = np.empty([len(ID_links), 2], dtype=object)
    for i, link in enumerate(ID_links):
        record_links[i, 0] = record_map[link[0]]
        record_links[i, 1] = record_map[link[1]]

    clusters = slink(record_links, record_map)

    # Merge clusters
    for cluster in clusters:
        if len(cluster) == 1:
            yield cluster[0]
            continue

        # Take maximal region
        start = np.min([record.pos for record in cluster])
        end = np.max([record.stop for record in cluster])

        merged_record = cluster[0].copy()
        merged_record.pos = start
        merged_record.stop = end

        # members = list(record.info['MEMBERS']) + [r.id for r in cluster]
        # merged_record.info['MEMBERS'] = members

        # Take union of called samples
        for record in cluster:
            called = svu.get_called_samples(record)
            for sample in called:
                merged_record.samples[sample]['GT'] = (0, 1)

        # Flag multiallelic
        if any([record.id in flagged for record in cluster]):
            merged_record.info['MULTIALLELIC'] = True

        yield merged_record