def test_exclude_header(self):
    """ test that exclude_header() works correctly

    Writes the minimal VCF to disk as plain text and as gzip, then checks
    for each that the first line read after exclude_header() is vcf[4].
    """

    vcf = make_minimal_vcf()

    # make sure we drop the header, and only the header from the file
    # check this by reading the file, and making sure the first line
    # is the line we expect from the VCF
    path = os.path.join(self.temp_dir, "temp.vcf")
    write_temp_vcf(path, vcf)

    # a context manager closes the handle even if the assertion fails
    with open(path, "r") as handler:
        exclude_header(handler)
        self.assertEqual(handler.readline(), vcf[4])

    # also check for gzipped VCF files.
    path = os.path.join(self.temp_dir, "temp.vcf.gz")
    write_gzipped_vcf(path, vcf)

    # under python 3 gzip.open() has to be asked for text mode explicitly,
    # otherwise the lines come back as bytes
    mode = 'rt' if IS_PYTHON3 else 'r'

    with gzip.open(path, mode) as handler:
        exclude_header(handler)
        self.assertEqual(handler.readline(), vcf[4])
def remove_phased_hets(person, vcf_path, bam_path, output_vcf_path):
    ''' screen out putative compound hets in phase from a VCF.

    We identify putative compound hets when we lack parental data, but sequence
    reads can identify candidates where both sites are in the same read, which
    means both sites are inherited from the same parent, and excludes the site
    as being a compound het.

    Args:
        person: sample ID for the person, passed on to get_compound()
        vcf_path: path to probands VCF
        bam_path: path to probands BAM sequence
        output_vcf_path: path to write the filtered (gzip-compressed) VCF to
    '''

    # keep the compound het groups which in_phase() flags, then flatten them
    # into a set of (chrom, pos, ref, alts) keys for quick membership checks
    phased = [ x for x in get_compound(vcf_path, person) if in_phase(bam_path, x) ]
    phased = {x for sublist in phased for x in sublist}

    # both handles are managed by 'with' so they are closed (and the gzip
    # stream is finalised) even if parsing a line raises part way through
    with open_vcf(vcf_path) as initial_vcf:
        header = get_vcf_header(initial_vcf)
        exclude_header(initial_vcf)

        with gzip.open(output_vcf_path, 'wt') as output_vcf:
            output_vcf.writelines(header)

            for line in initial_vcf:
                record = construct_variant(line.split('\t'), 'F')

                # only write out variants which are not 'compound hets' in phase
                key = (record.chrom, record.position, record.ref_allele,
                    record.alt_alleles)
                if key not in phased:
                    output_vcf.write(line)
def open_individual(individual, child_variants=None, mnvs=None, sum_x_lr2=None, parents=None):
    """ Load the variants for a single individual from their VCF.

    Reads the individual's single-sample VCF and builds a variant object for
    every line that include_variant() accepts. Filtering the lines while
    reading keeps memory usage down.

    Args:
        individual: Person object for individual (None gives an empty list)
        child_variants: True/False for whether variants have been filtered for
            the proband (if so, we can simply check the parent's variants for
            matches in the child's variants).
        mnvs: dictionary, passed through to include_variant() and
            construct_variant()
        sum_x_lr2: Sum of mean lr2 for proband X chromosome for filtering CNVs
        parents: does the family have both parents?

    Returns:
        A list of variants for the individual.
    """

    if individual is None:
        return []

    path = individual.get_path()
    logging.info("sample path: {}".format(path))
    gender = individual.get_gender()

    variants = []

    # open the vcf, and adjust the position in the file to immediately after
    # the header, so we can run through the variants. The context manager makes
    # sure the handle is closed even if a line raises an unexpected error.
    with open_vcf(path) as vcf:
        exclude_header(vcf)

        for line in vcf:
            line = line.strip().split("\t")

            try:
                # check if we want to include the variant or not
                if include_variant(line, child_variants, gender, mnvs, sum_x_lr2, parents):
                    var = construct_variant(line, gender, mnvs, sum_x_lr2, parents)
                    var.add_vcf_line(line)
                    variants.append(var)
            except ValueError:
                # we only get ValueError when the genotype cannot be set, which
                # occurs for x chrom male heterozygotes (an impossible genotype)
                # The variant is skipped; only the debug site is reported.
                if line[0] == SNV.debug_chrom and int(line[1]) == SNV.debug_pos:
                    print("failed as heterozygous genotype in male on chrX")

    return variants
def test_find_nearby_variants_separated(self):
    ''' test that find_nearby_variants() doesn't include vars far apart
    '''

    # two variants at positions 1 and 4, which should not be paired
    lines = make_vcf_header()
    lines.append(make_vcf_line(pos=1))
    lines.append(make_vcf_line(pos=4))
    self.write_vcf(lines)

    # the context manager closes the VCF even if the assertion fails
    with open_vcf(self.path) as vcf:
        exclude_header(vcf)
        self.assertEqual(find_nearby_variants(vcf), [])
def test_find_nearby_variants(self):
    ''' test that find_nearby_variants() works correctly
    '''

    # two variants at adjacent positions, which should be reported as a pair
    lines = make_vcf_header()
    lines.append(make_vcf_line(pos=1))
    lines.append(make_vcf_line(pos=2))
    self.write_vcf(lines)

    # the context manager closes the VCF even if the assertion fails
    with open_vcf(self.path) as vcf:
        exclude_header(vcf)
        self.assertEqual(find_nearby_variants(vcf), [[('1', 1), ('1', 2)]])
def test_find_nearby_variants_different_chroms(self):
    ''' test that find_nearby_variants() works correctly with successive
    variants on different chroms, but at the same position.
    '''

    # two variants at the same position, but on different chromosomes
    lines = make_vcf_header()
    lines.append(make_vcf_line(chrom='1', pos=1))
    lines.append(make_vcf_line(chrom='2', pos=1))

    # the lines have to be written out, otherwise the check below runs against
    # whatever self.path already holds rather than the variants made here
    self.write_vcf(lines)

    # the context manager closes the VCF even if the assertion fails
    with open_vcf(self.path) as vcf:
        exclude_header(vcf)
        self.assertEqual(find_nearby_variants(vcf), [])
def test_find_nearby_variants_different_threshold(self):
    ''' test that find_nearby_variants() works correctly when we change the
    threshold distance.
    '''

    # get the default two variants
    lines = make_vcf_header()
    lines.append(make_vcf_line(pos=1))
    lines.append(make_vcf_line(pos=2))

    # the lines have to be written out, otherwise the check below runs against
    # whatever self.path already holds rather than the variants made here
    self.write_vcf(lines)

    # the context manager closes the VCF even if the assertion fails
    with open_vcf(self.path) as vcf:
        exclude_header(vcf)

        # using a lower threshold shouldn't allow any of the variants to pass
        self.assertEqual(find_nearby_variants(vcf, threshold=0), [])
def test_find_nearby_variants_duplicate_position(self):
    ''' test that find_nearby_variants() works correctly with a duplicate var
    '''

    # get the default two variants
    lines = make_vcf_header()
    lines.append(make_vcf_line(pos=1))
    lines.append(make_vcf_line(pos=2))

    # make a third variant, but at the same position as the second
    lines.append(make_vcf_line(pos=2))
    self.write_vcf(lines)

    # the context manager closes the VCF even if the assertion fails
    with open_vcf(self.path) as vcf:
        exclude_header(vcf)

        # the duplicated position should only be reported once
        self.assertEqual(find_nearby_variants(vcf), [[('1', 1), ('1', 2)]])
def get_mnv_candidates(path):
    ''' identify MNV candidates, and their MNV consequences within a VCF.

    Args:
        path: path to VCF

    Returns:
        dictionary of MNV consequences, keyed by variant, where each variant
        is a (chrom, pos) tuple. Both variants of a pair map to the same
        consequence. Pairs which fail the consequence check are left out.
    '''

    # first pass: scan the variant lines for pairs of nearby variants. The
    # header is skipped, and is not needed for anything else here.
    with open_vcf(path) as vcf:
        exclude_header(vcf)
        pairs = find_nearby_variants(vcf)

    # ensure variants are not indels, are coding, and pairs alter the same amino
    # acid position. This pass goes through a separate tabix handle on the
    # same file.
    tabix_vcf = tabix.open(path)
    pairs = screen_pairs(tabix_vcf, pairs, is_not_indel)
    pairs = screen_pairs(tabix_vcf, pairs, is_coding)
    pairs = same_aa(tabix_vcf, pairs)

    pattern = re.compile('[ACGT]')

    candidates = {}
    for pair in pairs:
        var1, var2 = list(get_matches(tabix_vcf, pair))
        try:
            cq = check_mnv_consequence(var1, var2, pattern)
            candidates[pair[0]] = cq
            candidates[pair[1]] = cq
        except AssertionError:
            # report the pair, and carry on without adding it to the candidates
            print('{0}:{1} and {0}:{2} in {3} have multiple alternative ' \
                'transcripts or odd codon sequences'.format(var1.chrom,
                var1.pos, var2.pos, path))

    return candidates
def get_compound(vcf_path, sample_id):
    ''' pull out the compound hets, grouped by gene

    Args:
        vcf_path: path to VCF
        sample_id: sample ID for individual in VCF. Not currently used by this
            function; the sample data is read from the tenth VCF column.

    Returns:
        list of lists of (chrom, pos, ref, alts) tuples for the variants in a
        compound het.
    '''

    genes = {}

    # the context manager closes the VCF even if a line fails to parse
    with open_vcf(vcf_path) as vcf:
        exclude_header(vcf)

        for line in vcf:
            # drop the line terminator before splitting, so the final (sample)
            # field does not carry a trailing newline into the format values
            line = line.rstrip('\r\n').split('\t')
            variant = construct_variant(line, 'F')
            variant.add_format(line[8], line[9])

            if 'compound_het' not in variant.info['ClinicalFilterType']:
                continue

            # only check sites in singletons, which always have inheritance=unknown
            if variant.format['INHERITANCE'] != 'unknown':
                continue

            key = (variant.chrom, variant.position, variant.ref_allele,
                variant.alt_alleles)
            for symbol in variant.info['ClinicalFilterReportableHGNC']:
                genes.setdefault(symbol, []).append(key)

    # return a real list (not a dict view under python 3), as documented
    return list(genes.values())