def test_pagb():
    """Exercise ``VariantGroup.set_filter_fq_pagb`` on a four-record group."""
    variants = [
        _Record(number, 0, str(number), '', '', '', '', '', '', '')
        for number in range(1, 5)
    ]
    group = vg.VariantGroup('1', variants)

    # Each array is rebuilt using the current length of ``coverage_array``.
    group.coverage_array = populate_array(len(group.coverage_array), 400)
    group.existence_array = populate_array(len(group.coverage_array), 100)
    group.a_not_b_array = populate_array(len(group.coverage_array), 25)

    # Override individual cells: (row, column, count).
    for row, col, count in ((1, 0, 100), (2, 0, 25), (3, 1, 100)):
        group.a_not_b_array[row][col] = count
    for row, col, count in ((3, 0, 25), (2, 1, 100), (3, 1, 100)):
        group.b_not_a_array[row][col] = count

    group.set_filter_fq_pagb(50, True)

    # Force the diagonal and everything above it to True so that only the
    # cells below the diagonal still hold what the filter computed.
    size = len(group.filter)
    for row in range(size):
        for col in range(row, size):
            group.filter[row][col] = True

    # At this point at least one cell is expected to still be False ...
    assert_true(not group.filter.all())

    # ... and setting cell [3][1] is expected to make the whole filter True.
    group.filter[3][1] = True
    assert_true(group.filter.all())
def test_add_min_read_filter():
    """Check ``add_filter_min_reads`` against hand-built expected masks."""
    variants = [
        _Record(number, 0, str(number), '', '', '', '', '', '', '')
        for number in range(1, 5)
    ]
    group = vg.VariantGroup('1', variants)
    size = len(variants)
    group.existence_array = np.zeros((size, size))

    # (row, column, read count) entries written into the existence array.
    read_counts = (
        (0, 1, 30), (0, 2, 40), (0, 3, 50),
        (1, 2, 5), (1, 3, 10),
        (2, 3, 1),
    )
    for row, col, reads in read_counts:
        group.existence_array[row][col] = reads

    # Cells expected to be True in the filter for each scenario. Note that a
    # cell whose count equals the threshold (30 for '1', 5 for '2') is not
    # listed as passing.
    passing_cells = {
        '1': ((0, 2), (0, 3)),
        '2': ((0, 1), (0, 2), (0, 3), (1, 3)),
    }
    expected = {}
    for label, cells in passing_cells.items():
        mask = np.zeros((size, size))
        for row, col in cells:
            mask[row][col] = 1
        # Convert the 0/1 array into a boolean mask.
        expected[label] = mask > 0

    group.add_filter_min_reads(30)
    assert_true((expected['1'] == group.filter).all())

    group.reset_filter()
    group.add_filter_min_reads(5)
    assert_true((expected['2'] == group.filter).all())
def test_coordinates_for_insert_and_snp(self):
    """Check coordinates for REF 'C' with ALTs 'GTA' and 'G', in both ALT orders."""
    cases = (
        ('id6', ('GTA', 'G')),
        ('id7', ('G', 'GTA')),
    )
    for identifier, alt_sequences in cases:
        alts = [model._Substitution(sequence) for sequence in alt_sequences]
        record = model._Record(
            '1', 10, identifier, 'C', alts,
            None, None, {}, None, {}, None
        )
        # The expected tuples are the same regardless of ALT order.
        self.assert_has_expected_coordinates(record, (9, 10), (9, 10))
def test_coordinates_for_snp_and_deletion(self):
    """Check coordinates for REF 'CTA' with ALTs 'C' and 'CTG', in both ALT orders."""
    cases = (
        ('id8', ('C', 'CTG')),
        ('id9', ('CTG', 'C')),
    )
    for identifier, alt_sequences in cases:
        alts = [model._Substitution(sequence) for sequence in alt_sequences]
        record = model._Record(
            '1', 10, identifier, 'CTA', alts,
            None, None, {}, None, {}, None
        )
        # The expected tuples are the same regardless of ALT order.
        self.assert_has_expected_coordinates(record, (9, 12), (10, 12))
def test_coordinates_for_insertion_and_deletion(self):
    """Check coordinates for REF 'CT' with ALTs 'CA' and 'CTT', in both ALT orders."""
    cases = (
        ('id10', ('CA', 'CTT')),
        ('id11', ('CTT', 'CA')),
    )
    for identifier, alt_sequences in cases:
        alts = [model._Substitution(sequence) for sequence in alt_sequences]
        record = model._Record(
            '1', 10, identifier, 'CT', alts,
            None, None, {}, None, {}, None
        )
        # The expected tuples are the same regardless of ALT order.
        self.assert_has_expected_coordinates(record, (9, 11), (10, 11))
def test_coordinates_for_insertion_and_deletion(self):
    """Records with REF 'CT' and ALTs 'CA'/'CTT' report the same coordinates in either order."""

    def build(identifier, *alt_sequences):
        # Only the ID and the ALT order differ between the two records.
        alts = [model._Substitution(sequence) for sequence in alt_sequences]
        return model._Record('1', 10, identifier, 'CT', alts,
                             None, None, {}, None, {}, None)

    first = build('id10', 'CA', 'CTT')
    self.assert_has_expected_coordinates(first, (9, 11), (10, 11))

    second = build('id11', 'CTT', 'CA')
    self.assert_has_expected_coordinates(second, (9, 11), (10, 11))
def test_coordinates_for_snp_and_deletion(self):
    """Records with REF 'CTA' and ALTs 'C'/'CTG' report the same coordinates in either order."""

    def build(identifier, *alt_sequences):
        # Only the ID and the ALT order differ between the two records.
        alts = [model._Substitution(sequence) for sequence in alt_sequences]
        return model._Record('1', 10, identifier, 'CTA', alts,
                             None, None, {}, None, {}, None)

    first = build('id8', 'C', 'CTG')
    self.assert_has_expected_coordinates(first, (9, 12), (10, 12))

    second = build('id9', 'CTG', 'C')
    self.assert_has_expected_coordinates(second, (9, 12), (10, 12))
def test_coordinates_for_insert_and_snp(self):
    """Records with REF 'C' and ALTs 'GTA'/'G' report the same coordinates in either order."""

    def build(identifier, *alt_sequences):
        # Only the ID and the ALT order differ between the two records.
        alts = [model._Substitution(sequence) for sequence in alt_sequences]
        return model._Record('1', 10, identifier, 'C', alts,
                             None, None, {}, None, {}, None)

    first = build('id6', 'GTA', 'G')
    self.assert_has_expected_coordinates(first, (9, 10), (9, 10))

    second = build('id7', 'G', 'GTA')
    self.assert_has_expected_coordinates(second, (9, 10), (9, 10))
def test_coordinates_for_multiple_snps(self):
    """Check coordinates for REF 'C' with three single-base ALTs (A, G, T)."""
    alts = [model._Substitution(base) for base in ('A', 'G', 'T')]
    record = model._Record('1', 10, 'id5', 'C', alts,
                           None, None, {}, None, {}, None)
    self.assert_has_expected_coordinates(record, (9, 10), (9, 10))
def _get_record_for_indel(self, bpm_record_group):
    """
    Build the VCF record that represents a group of indel BPM records

    Args:
        bpm_record_group (list(BPMRecord)) : BPM records for the group; every
            record must have the same is_deletion value as the first one

    Returns:
        vcf._Record : The new VCF record definition
    """
    (qual, filt, info, sample_indexes) = VcfRecordFactory._get_record_defaults()
    identifier = self._get_identifier(bpm_record_group)

    # A group may not mix insertions and deletions.
    for candidate in bpm_record_group:
        assert candidate.is_deletion == bpm_record_group[0].is_deletion

    lead = bpm_record_group[0]
    (_, indel_sequence, _) = lead.get_indel_source_sequences(RefStrand.Plus)
    start_index = lead.pos - 1

    chrom = lead.chromosome
    if chrom in ("XX", "XY"):
        # Both of these labels are looked up and reported as "X".
        chrom = "X"

    is_deletion = lead.is_deletion
    if is_deletion:
        # Deletions read the base one position earlier and report POS as
        # start_index.
        window_start, window_end = start_index - 1, start_index
        vcf_pos = start_index
    else:
        # Insertions read the base at start_index and report POS as
        # start_index + 1.
        window_start, window_end = start_index, start_index + 1
        vcf_pos = start_index + 1

    reference_base = self._genome_reader.get_reference_bases(
        chrom, window_start, window_end)
    extended_allele = reference_base + indel_sequence

    # The longer allele is REF for a deletion and ALT for an insertion.
    if is_deletion:
        reference_allele, alternate_allele = extended_allele, reference_base
    else:
        reference_allele, alternate_allele = reference_base, extended_allele

    return _Record(chrom, vcf_pos, identifier, reference_allele,
                   [_Substitution(alternate_allele)], qual, filt, info,
                   self._format_factory.get_format_id_string(), sample_indexes)
def _grid_item_to_vcf_record(info_dict, obj, sample_ids, sample_names):
    """
    Build a ``_Record`` (with optional per-sample calls) from one grid row.

    Args:
        info_dict: mapping of INFO id -> dict holding a
            'column__variant_column' entry that names the key to read
            from ``obj``.
        obj: mapping holding the row's values (locus, alt, annotation and
            per-sample "<sample_id>_samples_*" keys).
        sample_ids: ids used to build the per-sample keys; when empty/None
            the record gets no FORMAT and no calls.
        sample_names: names paired positionally (via ``zip``) with
            ``sample_ids``; used as the call's sample and as the key in
            ``sample_indexes``.

    Returns:
        The populated ``_Record``.
    """
    chrom = obj.get("locus__contig__name", ".")
    pos = obj.get("locus__position", ".")
    variant_id = obj.get("variantannotation__dbsnp_rs_id")
    ref = obj.get("locus__ref__seq", ".")
    alt_seq = obj.get("alt__seq", ".")

    # Only truthy values are written to INFO (so None, '', 0 are skipped).
    looked_up = (
        (info_id, obj.get(data['column__variant_column']))
        for info_id, data in info_dict.items()
    )
    info = {info_id: value for info_id, value in looked_up if value}

    format_keys = ['GT', 'AD', 'AF', 'PL', 'DP', 'GQ']
    call_data_type = make_calldata_tuple(format_keys)
    format_string = ':'.join(format_keys) if sample_ids else None

    sample_indexes = {}
    calls = []

    # QUAL is always written as '.', FILTER is always None.
    record = _Record(chrom, pos, variant_id, ref, [_Substitution(alt_seq)],
                     '.', None, info, format_string, sample_indexes)

    if sample_ids:
        for index, (sample_id, sample_name) in enumerate(zip(sample_ids, sample_names)):
            # These four keys are required (KeyError if missing).
            allele_depth = obj[f"{sample_id}_samples_allele_depth"]
            zygosity = obj[f"{sample_id}_samples_zygosity"]
            genotype = Zygosity.get_genotype_from_expanded_zygosity(zygosity)
            read_depth = obj[f"{sample_id}_samples_read_depth"]
            allele_frequency = obj[f"{sample_id}_samples_allele_frequency"]

            # GQ/PL are optional and fall back to '.'.
            # TODO: Ideally, we'd not write them out
            phred_likelihood = obj.get(f"{sample_id}_samples_phred_likelihood", ".")
            genotype_quality = obj.get(f"{sample_id}_samples_genotype_quality", ".")

            # TODO: Need to grab information for reference base to be able
            # to properly fill in this data (first slot is '.' for now).
            call_data = call_data_type(
                AD=['.', allele_depth],
                GT=genotype,
                PL=['.', phred_likelihood],
                DP=['.', read_depth],
                GQ=['.', genotype_quality],
                AF=['.', allele_frequency],
            )
            calls.append(_Call(record, sample_name, call_data))
            sample_indexes[sample_name] = index

    record.samples = calls
    return record
def tab_to_vcf(input_file, output_file, reference_file, columns, info_fields, convert_iupac=False):
    """
    Write the rows of a tab-delimited file as VCF records.

    Supports the fixed VCF fields:
        #CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO

    PyVCF's _Record class requires the following arguments:
        CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, sample_indexes

    columns : mapping from each name in VCF_COLUMN_ORDER to the tab-file
        column that supplies it; missing values become ".".

    info_fields : mapping from VCF INFO key to tab-file column name; only
        columns present in the row are copied. A falsy value gives an empty
        INFO.

    convert_iupac (bool) : When set, convert IUPAC codes to the
        non-reference allele. This is only possible when the reference and
        IUPAC-determined alternates share at least one allele. Tri-allelic
        conversion is not supported and will emit a warning.

    IUPAC codes: http://www.bioinformatics.org/sms/iupac.html
    """
    reference_dict = FastaHack(reference_file)

    with open(input_file, "r") as input_fh, open(TEMPLATE_VCF_FILE, "r") as template_fh:
        reader = csv.DictReader(input_fh, delimiter="\t")
        vcf_reader = vcf.Reader(template_fh)

        # The output file is opened only after the template has been read.
        with open(output_file, "w") as output_fh:
            vcf_writer = vcf.Writer(output_fh, vcf_reader, lineterminator='\n')

            for row in reader:
                fields = [row.get(columns.get(name), ".") for name in VCF_COLUMN_ORDER]

                # Position must be an integer.
                fields[POSITION_INDEX] = int(fields[POSITION_INDEX])

                # GATK-style indels ("+..."/"-..." without a "/") are
                # rewritten into VCF format.
                alt = fields[ALT_INDEX]
                if alt.startswith(("+", "-")) and "/" not in alt:
                    fields = gatk_indel_to_vcf(fields, reference_dict)

                # Optionally convert IUPAC code.
                if convert_iupac:
                    fields = _convert_iupac(fields)

                # ALT must be a list rather than a scalar.
                fields[ALT_INDEX] = [fields[ALT_INDEX]]

                if info_fields:
                    info = {
                        vcf_field: row[tab_field]
                        for vcf_field, tab_field in info_fields.items()
                        if tab_field in row
                    }
                else:
                    info = {}

                # Append INFO, FORMAT and sample_indexes.
                fields.extend([info, ".", []])
                vcf_writer.write_record(_Record(*fields))
def test_split_and_trim():
    """Check that ``split_and_trim`` separates out variants that fail the filter."""
    variants = [
        _Record(number, 0, str(number), '', '', '', '', '', '', '')
        for number in range(1, 5)
    ]
    # Note: both groups are constructed with the name '1'.
    strict = vg.VariantGroup('1', variants)
    lenient = vg.VariantGroup('1', variants)

    size = len(variants)
    strict.coverage_array = np.zeros((size, size))
    strict.existence_array = np.zeros((size, size))
    # Cell [-1][0]: existence 50 out of coverage 100.
    strict.coverage_array[-1][0] = 100.
    strict.existence_array[-1][0] = 50.
    # Cell [-2][1]: existence 1 out of coverage 1000.
    strict.coverage_array[-2][1] = 1000.
    strict.existence_array[-2][1] = 1.

    # The second group starts from identical data and differs only in the
    # threshold handed to the filter.
    lenient.coverage_array = strict.coverage_array.copy()
    lenient.existence_array = strict.existence_array.copy()

    strict.set_filter_fq_pab(25)
    lenient.set_filter_fq_pab(0)

    strict_kept, strict_rejected = strict.split_and_trim()
    lenient_kept, lenient_rejected = lenient.split_and_trim()

    # Only the first kept group is inspected for the strict case; for the
    # lenient case the remaining kept groups are merged back into the first.
    strict_survivor = strict_kept[0]
    lenient_kept[0].unsplit(lenient_kept[1:])

    expected_kept = {'1': ['1', '4'], '2': ['1', '2', '3', '4']}
    expected_removed = {'1': ['2', '3'], '2': []}

    observed_removed = {
        '1': [variant.ID for variant in strict_rejected],
        '2': [variant.ID for variant in lenient_rejected],
    }
    observed_kept = {
        '1': sorted(variant.ID for variant in strict_survivor.variant_list),
        '2': sorted(variant.ID for variant in lenient_kept[0].variant_list),
    }

    assert_dict_equal(expected_kept, observed_kept)
    assert_dict_equal(expected_removed, observed_removed)
def tab_to_vcf(input_file, output_file, reference_file, convert_iupac=False, info_fields=None):
    """
    Write the rows of a tab-delimited file as VCF records.

    Supports the fixed VCF fields:
        #CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO

    PyVCF's _Record class requires the following arguments:
        CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, sample_indexes

    convert_iupac (bool) : When set, convert IUPAC codes to the
        non-reference allele. This is only possible when the reference and
        IUPAC-determined alternates share at least one allele. Tri-allelic
        conversion is not supported and will emit a warning.

    info_fields : optional mapping from tab-file column name to VCF INFO
        key; only columns present in the row are copied. A falsy value
        gives an empty INFO.

    IUPAC codes: http://www.bioinformatics.org/sms/iupac.html
    """
    reference_dict = FastaHack(reference_file)

    with open(input_file, "r") as input_fh, open(TEMPLATE_VCF_FILE, "r") as template_fh:
        reader = csv.DictReader(input_fh, delimiter="\t")
        vcf_reader = vcf.Reader(template_fh)

        # The output file is opened only after the template has been read.
        with open(output_file, "w") as output_fh:
            vcf_writer = vcf.Writer(output_fh, vcf_reader, lineterminator="\n")

            for row in reader:
                # Missing columns are filled with ".".
                fields = [row.get(tab_field, ".") for _, tab_field in VCF_TO_FIELDS]

                # Position must be an integer.
                fields[POSITION_INDEX] = int(fields[POSITION_INDEX])

                # GATK-style indels ("+..."/"-..." without a "/") are
                # rewritten into VCF format.
                alt = fields[ALT_INDEX]
                if alt.startswith(("+", "-")) and "/" not in alt:
                    fields = gatk_indel_to_vcf(fields, reference_dict)

                # Optionally convert IUPAC code.
                if convert_iupac:
                    fields = _convert_iupac(fields)

                # ALT must be a list rather than a scalar.
                fields[ALT_INDEX] = [fields[ALT_INDEX]]

                if info_fields:
                    info = {
                        info_key: row[tab_column]
                        for tab_column, info_key in info_fields.items()
                        if tab_column in row
                    }
                else:
                    info = {}

                # Append INFO, FORMAT and sample_indexes.
                fields.extend([info, ".", []])
                vcf_writer.write_record(_Record(*fields))
def test_coordinates_for_breakend(self):
    """Check coordinates for REF 'CTA' whose only ALT is a ``_Breakend``."""
    breakend = model._Breakend('1', 500, False, True, 'GGTC', True)
    record = model._Record('1', 10, 'id12', 'CTA', [breakend],
                           None, None, {}, None, {}, None)
    self.assert_has_expected_coordinates(record, (9, 12), (9, 12))
def test_coordinates_for_None_alt(self):
    """Check coordinates for REF 'C' when the ALT list holds a single ``None``."""
    alts = [None]
    record = model._Record('1', 10, 'id4', 'C', alts,
                           None, None, {}, None, {}, None)
    self.assert_has_expected_coordinates(record, (9, 10), (9, 10))
def test_is_snp_for_n_alt(self):
    """A record with REF 'C' and ALT 'N' must report ``is_snp`` as true."""
    alts = [model._Substitution('N')]
    record = model._Record('1', 10, 'id1', 'C', alts,
                           None, None, {}, None, {}, None)
    self.assertTrue(record.is_snp)
def test_coordinates_for_insertion(self):
    """Check coordinates for REF 'C' with the single ALT 'CTA'."""
    alts = [model._Substitution('CTA')]
    record = model._Record('1', 10, 'id2', 'C', alts,
                           None, None, {}, None, {}, None)
    self.assert_has_expected_coordinates(record, (9, 10), (10, 10))
def __next__(self):
    """Parse the next line from ``self.reader`` and return it as a ``_Record``.

    Columns after the fifth (QUAL, FILTER, INFO, FORMAT, samples) are
    optional; absent ones fall back to ``None`` (or ``{}`` for INFO).
    """
    raw_line = next(self.reader)
    fields = self._row_pattern.split(raw_line.rstrip())
    field_count = len(fields)

    chrom = 'chr' + fields[0] if self._prepend_chr else fields[0]
    pos = int(fields[1])
    # '.' in the ID column means "no identifier".
    variant_id = None if fields[2] == '.' else fields[2]
    ref = fields[3]
    alt = self._map(self._parse_alt, fields[4].split(','))

    # QUAL: prefer int, fall back to float, otherwise (or if the column is
    # missing) leave it as None.
    qual = None
    if field_count > 5:
        for convert in (int, float):
            try:
                qual = convert(fields[5])
            except ValueError:
                continue
            break

    filt = self._parse_filter(fields[6]) if field_count > 6 else None
    info = self._parse_info(fields[7]) if field_count > 7 else {}

    # A FORMAT of '.' is treated the same as a missing FORMAT column.
    fmt = None
    if field_count > 8 and fields[8] != '.':
        fmt = fields[8]

    record = _Record(chrom, pos, variant_id, ref, alt, qual, filt, info, fmt,
                     self._sample_indexes)

    # Sample columns are parsed only when there is a FORMAT to read them with.
    if fmt is not None:
        record.samples = self._parse_samples(fields[9:], fmt, record)

    return record
def _get_record_for_auxiliary(self, bpm_record_group, auxiliary_record):
    """
    Build a new VCF record from an auxiliary locus definition
    NOTE: THE BPM RECORD IN THE GROUP IS MODIFIED IN PLACE

    Args:
        bpm_record_group (list(BPMRecord)) : BPM records for the group (must be length 1)
        auxiliary_record (vcf._Record) : Auxiliary locus definition

    Returns:
        vcf._Record : The new VCF record definition
    """
    # The BPM record's plus-strand alleles are replaced by the matching
    # alleles of the auxiliary record, keeping their original order (so the
    # first one is not necessarily the reference). The ref strand decides
    # which end of the allele is compared: first base for Plus, last base
    # for Minus.
    identifier = self._get_identifier(bpm_record_group)
    assert len(bpm_record_group) == 1
    bpm_record = bpm_record_group[0]
    (qual, filt, info, sample_indexes) = VcfRecordFactory._get_record_defaults()

    original_alleles = bpm_record.plus_strand_alleles

    if bpm_record.ref_strand == RefStrand.Plus:
        anchor = 0
    else:
        assert bpm_record.ref_strand == RefStrand.Minus
        anchor = -1

    matched_alleles = []
    for original_allele in original_alleles:
        # Take the first auxiliary allele agreeing at the anchor base.
        for candidate in auxiliary_record.alleles:
            candidate_text = str(candidate)
            if candidate_text[anchor] == original_allele[anchor]:
                matched_alleles.append(candidate_text)
                break

    # Exactly two distinct replacement alleles are required.
    assert len(matched_alleles) == 2
    assert matched_alleles[0] != matched_alleles[1]
    bpm_record.plus_strand_alleles = matched_alleles

    return _Record(auxiliary_record.CHROM, auxiliary_record.POS, identifier,
                   auxiliary_record.REF, auxiliary_record.ALT, qual, filt, info,
                   self._format_factory.get_format_id_string(), sample_indexes)
def tab_to_vcf(input_file, output_file, reference_file):
    """
    Write the rows of a tab-delimited file as VCF records.

    Supports the fixed VCF fields:
        #CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO

    PyVCF's _Record class requires the following arguments:
        CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, sample_indexes
    """
    reference_dict = FastaHack(reference_file)

    with open(input_file, "r") as input_fh, open(TEMPLATE_VCF_FILE, "r") as template_fh:
        reader = csv.DictReader(input_fh, delimiter="\t")
        vcf_reader = vcf.Reader(template_fh)

        # The output file is opened only after the template has been read.
        with open(output_file, "w") as output_fh:
            vcf_writer = vcf.Writer(output_fh, vcf_reader, lineterminator='\n')

            for row in reader:
                # Missing columns are filled with ".".
                fields = [row.get(tab_field, ".") for _, tab_field in VCF_TO_FIELDS]

                # Position must be an integer.
                fields[POSITION_INDEX] = int(fields[POSITION_INDEX])

                # GATK-style indels ("+..."/"-..." without a "/") are
                # rewritten into VCF format.
                alt = fields[ALT_INDEX]
                if alt.startswith(("+", "-")) and "/" not in alt:
                    fields = gatk_indel_to_vcf(fields, reference_dict)

                # ALT must be a list rather than a scalar.
                fields[ALT_INDEX] = [fields[ALT_INDEX]]

                # Append empty INFO, FORMAT and sample_indexes.
                fields.extend([{}, ".", []])
                vcf_writer.write_record(_Record(*fields))
def test_coordinates_for_multiple_snps(self):
    """REF 'C' with ALTs A, G and T: verify the record's expected coordinates."""
    # Everything after ALT is the same filler used throughout these tests.
    trailing = (None, None, {}, None, {}, None)
    alts = [
        model._Substitution('A'),
        model._Substitution('G'),
        model._Substitution('T'),
    ]
    record = model._Record('1', 10, 'id5', 'C', alts, *trailing)
    self.assert_has_expected_coordinates(record, (9, 10), (9, 10))
def tab_to_vcf(input_file, output_file, reference_file):
    """
    Convert a tab-delimited file into a VCF file, one record per row.

    Supports the fixed VCF fields:
        #CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO

    PyVCF's _Record class requires the following arguments:
        CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, sample_indexes
    """
    reference_dict = FastaHack(reference_file)

    with open(input_file, "r") as input_fh, open(TEMPLATE_VCF_FILE, "r") as template_fh:
        rows = csv.DictReader(input_fh, delimiter="\t")
        template = vcf.Reader(template_fh)

        # The output file is opened only after the template has been read.
        with open(output_file, "w") as output_fh:
            writer = vcf.Writer(output_fh, template, lineterminator='\n')

            for row in rows:
                # One value per fixed VCF column; "." when the row lacks it.
                values = [row.get(tab_field, ".") for _, tab_field in VCF_TO_FIELDS]

                # POS is stored as an int.
                values[POSITION_INDEX] = int(values[POSITION_INDEX])

                # An ALT starting with "+" or "-" and containing no "/" is
                # a GATK-style indel and is converted to VCF format.
                raw_alt = values[ALT_INDEX]
                is_gatk_indel = raw_alt.startswith(("+", "-")) and "/" not in raw_alt
                if is_gatk_indel:
                    values = gatk_indel_to_vcf(values, reference_dict)

                # Wrap the single alternate allele in a list.
                values[ALT_INDEX] = [values[ALT_INDEX]]

                # Empty INFO, FORMAT "." and empty sample_indexes.
                values.extend([{}, ".", []])
                writer.write_record(_Record(*values))
def _get_record_for_snv(self, bpm_record_group):
    """
    Create a new VCF record for an SNV

    Args:
        bpm_record_group (list(BPMRecord)) : BPM records for the group (must all be SNV records)

    Returns:
        vcf._Record : The new VCF record definition
    """
    assert not bpm_record_group[0].is_indel()
    identifier = self._get_identifier(bpm_record_group)
    bpm_record = bpm_record_group[0]
    (qual, filt, info, sample_indexes) = VcfRecordFactory._get_record_defaults()

    start_index = bpm_record.pos - 1
    chrom = bpm_record.chromosome
    if chrom in ("XX", "XY"):
        # Both of these labels are looked up and reported as "X".
        chrom = "X"

    reference_base = self._genome_reader.get_reference_bases(
        chrom, start_index, start_index + 1)

    if not check_reference_allele(reference_base, bpm_record_group):
        # Logger.warn is a deprecated alias (removed in Python 3.13);
        # warning() is the supported spelling.
        self._logger.warning(
            "Reference allele is not queried for locus: " + identifier)

    # Collect every distinct non-reference plus-strand allele across the
    # group, in first-seen order.
    alts = []
    for record in bpm_record_group:
        for nucleotide in record.plus_strand_alleles:
            if nucleotide != reference_base:
                substitution = _Substitution(nucleotide)
                if substitution not in alts:
                    # Reuse the object just built instead of constructing
                    # a second, equal one.
                    alts.append(substitution)

    return _Record(chrom, bpm_record.pos, identifier, reference_base, alts,
                   qual, filt, info,
                   self._format_factory.get_format_id_string(), sample_indexes)
def vcf_record(self, *args, **kwargs):
    """Build a ``_Record``, forwarding all positional and keyword arguments unchanged."""
    record = _Record(*args, **kwargs)
    return record
def vcf_record(self, *args, **kwargs):
    """Return a new ``_Record`` constructed from exactly the arguments given."""
    new_record = _Record(*args, **kwargs)
    return new_record
def test_is_snp_for_n_alt(self):
    """``is_snp`` is expected to be true for REF 'C' with the single ALT 'N'."""
    # Everything after ALT is the same filler used throughout these tests.
    trailing = (None, None, {}, None, {}, None)
    n_allele = model._Substitution('N')
    record = model._Record('1', 10, 'id1', 'C', [n_allele], *trailing)
    self.assertTrue(record.is_snp)
def test_coordinates_for_insertion(self):
    """REF 'C' with ALT 'CTA': verify the record's expected coordinates."""
    # Everything after ALT is the same filler used throughout these tests.
    trailing = (None, None, {}, None, {}, None)
    inserted = model._Substitution('CTA')
    record = model._Record('1', 10, 'id2', 'C', [inserted], *trailing)
    self.assert_has_expected_coordinates(record, (9, 10), (10, 10))
def test_coordinates_for_None_alt(self):
    """REF 'C' with ``None`` as the only ALT: verify the record's expected coordinates."""
    # Everything after ALT is the same filler used throughout these tests.
    trailing = (None, None, {}, None, {}, None)
    record = model._Record('1', 10, 'id4', 'C', [None], *trailing)
    self.assert_has_expected_coordinates(record, (9, 10), (9, 10))
def test_coordinates_for_breakend(self):
    """REF 'CTA' with a single ``_Breakend`` ALT: verify the record's expected coordinates."""
    # Everything after ALT is the same filler used throughout these tests.
    trailing = (None, None, {}, None, {}, None)
    alts = [model._Breakend('1', 500, False, True, 'GGTC', True)]
    record = model._Record('1', 10, 'id12', 'CTA', alts, *trailing)
    self.assert_has_expected_coordinates(record, (9, 12), (9, 12))