def remove_phased_hets(person, vcf_path, bam_path, output_vcf_path):
    ''' screen out putative compound hets in phase from a VCF.
    
    We identify putative compound hets when we lack parental data, but sequence
    reads can identify candidates where both sites are in the same read, which
    means both sites are inherited from the same parent, and excludes the site
    as being a compound het.
    
    Args:
        person: sample ID for the person, passed through to get_compound()
        vcf_path: path to probands VCF
        bam_path: path to probands BAM sequence
        output_vcf_path: path to write the filtered VCF to. The output is
            always written gzip-compressed.
    '''
    
    # collect every variant which belongs to a compound het group that the
    # sequence reads show to be in phase
    phased = set()
    for group in get_compound(vcf_path, person):
        if in_phase(bam_path, group):
            phased.update(group)
    
    # use context managers so both handles are closed (and the gzip stream is
    # flushed) even if a line fails to parse part way through the VCF
    with open_vcf(vcf_path) as initial_vcf:
        header = get_vcf_header(initial_vcf)
        exclude_header(initial_vcf)
        
        with gzip.open(output_vcf_path, 'wt') as output_vcf:
            output_vcf.writelines(header)
            
            for line in initial_vcf:
                record = construct_variant(line.split('\t'), 'F')
                # only write out variants which are not 'compound hets' in phase
                key = (record.chrom, record.position, record.ref_allele,
                    record.alt_alleles)
                if key not in phased:
                    output_vcf.write(line)
Exemple #2
0
def open_individual(individual, child_variants=None, mnvs=None, sum_x_lr2=None, parents=None):
    """ Load the variants for a single sample from its VCF.
    
    Obtains the VCF data for a single sample. Only the lines of the VCF which
    pass include_variant() are converted to variant objects and retained, in
    order to reduce memory usage.
    
    Args:
        individual: Person object for individual, or None if the person is
            not available (in which case no variants are returned).
        child_variants: True/False for whether variants have been filtered
            for the proband (if so, we can simply check the parent's
            variants for matches in the child's variants).
        mnvs: dictionary, passed through to include_variant() and
            construct_variant()
        sum_x_lr2: Sum of mean lr2 for proband X chromosome for filtering CNVs
        parents: does the family have both parents?
    
    Returns:
        A list of variants for the individual. This is an empty list if
        individual is None.
    """
    
    if individual is None:
        return []
    
    path = individual.get_path()
    logging.info("sample path: {}".format(path))
    gender = individual.get_gender()
    
    variants = []
    # the context manager makes sure the handle is closed even if a line
    # raises something other than the ValueError handled below
    with open_vcf(path) as vcf:
        # adjust the position in the file to immediately after the header, so
        # we can run through the variants
        exclude_header(vcf)
        
        for line in vcf:
            line = line.strip().split("\t")
            
            try:
                # check if we want to include the variant or not
                if include_variant(line, child_variants, gender, mnvs, sum_x_lr2, parents):
                    var = construct_variant(line, gender, mnvs, sum_x_lr2, parents)
                    var.add_vcf_line(line)
                    variants.append(var)
            except ValueError:
                # we only get ValueError when the genotype cannot be set, which
                # occurs for x chrom male heterozygotes (an impossible genotype)
                if line[0] == SNV.debug_chrom and int(line[1]) == SNV.debug_pos:
                    print("failed as heterozygous genotype in male on chrX")
                continue
    
    return variants
Exemple #3
0
 def test_find_nearby_variants_separated(self):
     ''' check find_nearby_variants() skips variants which are far apart
     '''
     
     # write a VCF with one variant at position 1 and another at position 4
     vcf_lines = make_vcf_header()
     for position in (1, 4):
         vcf_lines.append(make_vcf_line(pos=position))
     self.write_vcf(vcf_lines)
     
     # skip past the header, so only the variant lines are examined
     handle = open_vcf(self.path)
     exclude_header(handle)
     
     nearby = find_nearby_variants(handle)
     self.assertEqual(nearby, [])
Exemple #4
0
 def test_find_nearby_variants(self):
     ''' check find_nearby_variants() pairs up variants at adjacent positions
     '''
     
     # write a VCF with variants at positions 1 and 2
     vcf_lines = make_vcf_header()
     for position in (1, 2):
         vcf_lines.append(make_vcf_line(pos=position))
     self.write_vcf(vcf_lines)
     
     # skip past the header, so only the variant lines are examined
     handle = open_vcf(self.path)
     exclude_header(handle)
     
     # the two variants should be reported together as a single pair
     expected = [[('1', 1), ('1', 2)]]
     self.assertEqual(find_nearby_variants(handle), expected)
 def test_open_vcf(self):
     """ check open_vcf() gives a usable file handle, or raises an error
     """
     
     contents = make_minimal_vcf()
     
     # a plain VCF should be opened as a text handle
     plain_path = os.path.join(self.temp_dir, "temp.vcf")
     write_temp_vcf(plain_path, contents)
     plain_handle = open_vcf(plain_path)
     self.assertEqual(type(plain_handle), io.TextIOWrapper)
     plain_handle.close()
     
     # the handle type for a gzipped VCF depends on the python version
     gzipped_path = os.path.join(self.temp_dir, "temp.vcf.gz")
     write_gzipped_vcf(gzipped_path, contents)
     gzipped_handle = open_vcf(gzipped_path)
     expected_type = io.TextIOWrapper if IS_PYTHON3 else gzip.GzipFile
     self.assertEqual(type(gzipped_handle), expected_type)
     gzipped_handle.close()
     
     # a path which has not been written to should raise an error
     missing_path = os.path.join(self.temp_dir, "zzz.txt")
     with self.assertRaises(OSError):
         open_vcf(missing_path)
     
     # a file which exists, but has an unknown extension, should also raise
     unknown_path = os.path.join(self.temp_dir, "temp.zzz")
     write_temp_vcf(unknown_path, contents)
     with self.assertRaises(OSError):
         open_vcf(unknown_path)
Exemple #6
0
    def test_open_vcf(self):
        """ check open_vcf() gives a usable file handle, or raises an error
        """

        contents = make_minimal_vcf()

        # a plain VCF should be opened as a text handle
        plain_path = os.path.join(self.temp_dir, "temp.vcf")
        write_temp_vcf(plain_path, contents)
        plain_handle = open_vcf(plain_path)
        self.assertEqual(type(plain_handle), io.TextIOWrapper)
        plain_handle.close()

        # the handle type for a gzipped VCF depends on the python version
        gzipped_path = os.path.join(self.temp_dir, "temp.vcf.gz")
        write_gzipped_vcf(gzipped_path, contents)
        gzipped_handle = open_vcf(gzipped_path)
        expected_type = io.TextIOWrapper if IS_PYTHON3 else gzip.GzipFile
        self.assertEqual(type(gzipped_handle), expected_type)
        gzipped_handle.close()

        # a path which has not been written to should raise an error
        missing_path = os.path.join(self.temp_dir, "zzz.txt")
        with self.assertRaises(OSError):
            open_vcf(missing_path)

        # a file which exists, but has an unknown extension, should also raise
        unknown_path = os.path.join(self.temp_dir, "temp.zzz")
        write_temp_vcf(unknown_path, contents)
        with self.assertRaises(OSError):
            open_vcf(unknown_path)
Exemple #7
0
 def test_find_nearby_variants_different_chroms(self):
     ''' test that find_nearby_variants() works correctly with successive
     variants on different chroms, but at the same position.
     '''
     
     # make two variants at the same position, but on different chroms
     lines = make_vcf_header()
     lines.append(make_vcf_line(chrom='1', pos=1))
     lines.append(make_vcf_line(chrom='2', pos=1))
     # the lines have to be written to disk, otherwise the VCF opened below
     # does not contain the variants this test is meant to check
     self.write_vcf(lines)
     
     vcf = open_vcf(self.path)
     exclude_header(vcf)
     self.assertEqual(find_nearby_variants(vcf), [])
Exemple #8
0
 def test_find_nearby_variants_different_threshold(self):
     ''' test that find_nearby_variants() works correctly when we change the threshold distance.
     '''
     
     # make two variants at adjacent positions
     lines = make_vcf_header()
     lines.append(make_vcf_line(pos=1))
     lines.append(make_vcf_line(pos=2))
     # the lines have to be written to disk, otherwise the VCF opened below
     # does not contain the variants this test is meant to check
     self.write_vcf(lines)
     
     vcf = open_vcf(self.path)
     exclude_header(vcf)
     
     # using a lower threshold shouldn't allow any of the variants to pass
     self.assertEqual(find_nearby_variants(vcf, threshold=0), [])
Exemple #9
0
 def test_find_nearby_variants_duplicate_position(self):
     ''' check find_nearby_variants() copes with a repeated variant position
     '''
     
     # variants at positions 1 and 2, then a third variant which repeats
     # the position of the second
     vcf_lines = make_vcf_header()
     for position in (1, 2, 2):
         vcf_lines.append(make_vcf_line(pos=position))
     self.write_vcf(vcf_lines)
     
     # skip past the header, so only the variant lines are examined
     handle = open_vcf(self.path)
     exclude_header(handle)
     
     # the repeated position should not give rise to any extra pairs
     expected = [[('1', 1), ('1', 2)]]
     self.assertEqual(find_nearby_variants(handle), expected)
Exemple #10
0
def get_mnv_candidates(path):
    ''' find the MNV candidates in a VCF, along with their MNV consequences.
    
    Args:
        path: path to VCF. The path is also opened with tabix.
    
    Returns:
        dictionary of MNV consequences, indexed by the variants from each
        candidate pair. Both variants of a pair map to the same consequence.
        The variant keys are presumably (chrom, pos) tuples, as produced by
        find_nearby_variants().
    '''

    # first pass: scan the variant lines for pairs of nearby variants
    with open_vcf(path) as handle:
        exclude_header(handle)
        # NOTE(review): the header is not used below; the call is retained in
        # case get_vcf_header() repositions the file handle - confirm
        header = get_vcf_header(handle)
        nearby = find_nearby_variants(handle)

    # second pass: only keep pairs where neither variant is an indel, both
    # are coding, and both alter the same amino acid position
    indexed = tabix.open(path)
    for screen in (is_not_indel, is_coding):
        nearby = screen_pairs(indexed, nearby, screen)
    nearby = same_aa(indexed, nearby)

    bases = re.compile('[ACGT]')

    consequences = {}
    for pair in nearby:
        var1, var2 = list(get_matches(indexed, pair))
        try:
            consequence = check_mnv_consequence(var1, var2, bases)
        except AssertionError:
            # report the pairs which cannot be assessed, then carry on
            print('{0}:{1} and {0}:{2} in {3} have multiple alternative ' \
                'transcripts or odd codon sequences'.format(var1.chrom,
                var1.pos, var2.pos, path))
        else:
            consequences[pair[0]] = consequence
            consequences[pair[1]] = consequence

    return consequences
def get_mnv_candidates(path):
    ''' find the MNV candidates in a VCF, along with their MNV consequences.
    
    Args:
        path: path to VCF. The path is also opened with tabix.
    
    Returns:
        dictionary of MNV consequences, indexed by the variants from each
        candidate pair. Both variants of a pair map to the same consequence.
        The variant keys are presumably (chrom, pos) tuples, as produced by
        find_nearby_variants().
    '''
    
    # first pass: scan the variant lines for pairs of nearby variants
    with open_vcf(path) as handle:
        exclude_header(handle)
        # NOTE(review): the header is not used below; the call is retained in
        # case get_vcf_header() repositions the file handle - confirm
        header = get_vcf_header(handle)
        nearby = find_nearby_variants(handle)
    
    # second pass: only keep pairs where neither variant is an indel, both
    # are coding, and both alter the same amino acid position
    indexed = tabix.open(path)
    for screen in (is_not_indel, is_coding):
        nearby = screen_pairs(indexed, nearby, screen)
    nearby = same_aa(indexed, nearby)
    
    bases = re.compile('[ACGT]')
    
    consequences = {}
    for pair in nearby:
        var1, var2 = list(get_matches(indexed, pair))
        try:
            consequence = check_mnv_consequence(var1, var2, bases)
        except AssertionError:
            # report the pairs which cannot be assessed, then carry on
            print('{0}:{1} and {0}:{2} in {3} have multiple alternative ' \
                'transcripts or odd codon sequences'.format(var1.chrom,
                var1.pos, var2.pos, path))
        else:
            consequences[pair[0]] = consequence
            consequences[pair[1]] = consequence
    
    return consequences
def get_compound(vcf_path, sample_id):
    ''' pull out the compound hets, grouped by gene
    
    Args:
        vcf_path: path to VCF
        sample_id: sample ID for individual in VCF. This is currently unused,
            since the sample data is always taken from the first sample
            column of the VCF.
    
    Returns:
        list of lists of (chrom, pos, ref, alts) tuples for the variants in a
        compound het. There is one inner list per gene symbol.
    '''
    
    genes = {}
    # the context manager makes sure the handle is closed once we have run
    # through the variants, even if a line fails to parse
    with open_vcf(vcf_path) as vcf:
        exclude_header(vcf)
        
        for line in vcf:
            line = line.split('\t')
            variant = construct_variant(line, 'F')
            # columns 8 and 9 hold the FORMAT keys and the first sample's values
            variant.add_format(line[8], line[9])
            
            if 'compound_het' not in variant.info['ClinicalFilterType']:
                continue
            
            # only check sites in singletons, which always have
            # inheritance=unknown
            if variant.format['INHERITANCE'] != 'unknown':
                continue
            
            key = (variant.chrom, variant.position, variant.ref_allele,
                variant.alt_alleles)
            for symbol in variant.info['ClinicalFilterReportableHGNC']:
                genes.setdefault(symbol, []).append(key)
    
    # return a real list (rather than a dict view), as documented
    return list(genes.values())