def test_coordinates_for_insertion_and_deletion(self):
    """Check a record with REF 'CT' and ALTs 'CA'/'CTT' in both ALT orders.

    Each record is handed to ``assert_has_expected_coordinates`` together
    with the expected pairs ``(9, 11)`` and ``(10, 11)``.
    """
    orderings = (
        ('id10', ('CA', 'CTT')),
        ('id11', ('CTT', 'CA')),
    )
    for record_id, alt_sequences in orderings:
        alts = [model._Substitution(sequence) for sequence in alt_sequences]
        record = model._Record(
            '1', 10, record_id, 'CT', alts,
            None, None, {}, None, {}, None
        )
        self.assert_has_expected_coordinates(record, (9, 11), (10, 11))
def test_coordinates_for_insert_and_snp(self):
    """Check a record with REF 'C' and ALTs 'GTA'/'G' in both ALT orders.

    Each record is handed to ``assert_has_expected_coordinates`` together
    with the expected pairs ``(9, 10)`` and ``(9, 10)``.
    """
    orderings = (
        ('id6', ('GTA', 'G')),
        ('id7', ('G', 'GTA')),
    )
    for record_id, alt_sequences in orderings:
        alts = [model._Substitution(sequence) for sequence in alt_sequences]
        record = model._Record(
            '1', 10, record_id, 'C', alts,
            None, None, {}, None, {}, None
        )
        self.assert_has_expected_coordinates(record, (9, 10), (9, 10))
def test_coordinates_for_snp_and_deletion(self):
    """Check a record with REF 'CTA' and ALTs 'C'/'CTG' in both ALT orders.

    Each record is handed to ``assert_has_expected_coordinates`` together
    with the expected pairs ``(9, 12)`` and ``(10, 12)``.
    """
    orderings = (
        ('id8', ('C', 'CTG')),
        ('id9', ('CTG', 'C')),
    )
    for record_id, alt_sequences in orderings:
        alts = [model._Substitution(sequence) for sequence in alt_sequences]
        record = model._Record(
            '1', 10, record_id, 'CTA', alts,
            None, None, {}, None, {}, None
        )
        self.assert_has_expected_coordinates(record, (9, 12), (10, 12))
def test_coordinates_for_multiple_snps(self):
    """Check a record with REF 'C' and the three single-base ALTs A, G, T.

    The record is handed to ``assert_has_expected_coordinates`` together
    with the expected pairs ``(9, 10)`` and ``(9, 10)``.
    """
    alts = [model._Substitution(base) for base in ('A', 'G', 'T')]
    record = model._Record(
        '1', 10, 'id5', 'C', alts,
        None, None, {}, None, {}, None
    )
    self.assert_has_expected_coordinates(record, (9, 10), (9, 10))
def test_coordinates_for_insertion_and_deletion(self):
    """Verify coordinates for REF 'CT' with ALTs 'CA' and 'CTT'.

    The two ALTs are supplied in both orders; the same expected pairs,
    ``(9, 11)`` and ``(10, 11)``, are asserted each time.
    """
    def make_record(record_id, *alt_sequences):
        # Only the id and the ALT order differ between the two cases.
        alts = [model._Substitution(sequence) for sequence in alt_sequences]
        return model._Record('1', 10, record_id, 'CT', alts,
                             None, None, {}, None, {}, None)

    first = make_record('id10', 'CA', 'CTT')
    self.assert_has_expected_coordinates(first, (9, 11), (10, 11))

    second = make_record('id11', 'CTT', 'CA')
    self.assert_has_expected_coordinates(second, (9, 11), (10, 11))
def test_coordinates_for_snp_and_deletion(self):
    """Verify coordinates for REF 'CTA' with ALTs 'C' and 'CTG'.

    The two ALTs are supplied in both orders; the same expected pairs,
    ``(9, 12)`` and ``(10, 12)``, are asserted each time.
    """
    def make_record(record_id, *alt_sequences):
        # Only the id and the ALT order differ between the two cases.
        alts = [model._Substitution(sequence) for sequence in alt_sequences]
        return model._Record('1', 10, record_id, 'CTA', alts,
                             None, None, {}, None, {}, None)

    first = make_record('id8', 'C', 'CTG')
    self.assert_has_expected_coordinates(first, (9, 12), (10, 12))

    second = make_record('id9', 'CTG', 'C')
    self.assert_has_expected_coordinates(second, (9, 12), (10, 12))
def test_coordinates_for_insert_and_snp(self):
    """Verify coordinates for REF 'C' with ALTs 'GTA' and 'G'.

    The two ALTs are supplied in both orders; the same expected pairs,
    ``(9, 10)`` and ``(9, 10)``, are asserted each time.
    """
    def make_record(record_id, *alt_sequences):
        # Only the id and the ALT order differ between the two cases.
        alts = [model._Substitution(sequence) for sequence in alt_sequences]
        return model._Record('1', 10, record_id, 'C', alts,
                             None, None, {}, None, {}, None)

    first = make_record('id6', 'GTA', 'G')
    self.assert_has_expected_coordinates(first, (9, 10), (9, 10))

    second = make_record('id7', 'G', 'GTA')
    self.assert_has_expected_coordinates(second, (9, 10), (9, 10))
def _parse_alt(self, str):
    """Turn one ALT allele string into the matching allele object.

    Args:
        str: A single, non-empty ALT allele string. The parameter name
            shadows the builtin; it is kept unchanged so existing callers
            (including keyword callers) keep working.

    Returns:
        * ``_Breakend`` when ``self._alt_pattern`` matches somewhere in the
          string (paired breakend),
        * ``_SingleBreakend`` when the string is longer than one character
          and starts or ends with ``'.'``,
        * ``_SV`` when the string is wrapped in ``<`` ... ``>``,
        * ``_Substitution`` otherwise.

    Raises:
        IndexError: if the string is empty, or if a paired-breakend string
            does not split into the expected pieces.
    """
    # NOTE(review): self._alt_pattern is presumably a compiled regex for the
    # bracketed mate coordinates; only its search()/split() are used here.
    if self._alt_pattern.search(str) is not None:
        # Paired breakend
        items = self._alt_pattern.split(str)
        remote_coords = items[1].split(':')
        remote_chrom = remote_coords[0]
        # A remote contig written as <name> is flagged as lying outside the
        # main assembly; the angle brackets are stripped from the name.
        within_main_assembly = remote_chrom[0] != '<'
        if not within_main_assembly:
            remote_chrom = remote_chrom[1:-1]
        pos = remote_coords[1]
        orientation = str[0] in ('[', ']')
        # Plain substring test: the previous re.search('\[', ...) relied on
        # an invalid escape sequence in a non-raw string literal.
        remote_orientation = '[' in str
        # When the string opens with a bracket the connecting sequence is
        # the piece after the mate coordinates, otherwise the piece before.
        connecting_sequence = items[2] if orientation else items[0]
        return _Breakend(remote_chrom, pos, orientation, remote_orientation,
                         connecting_sequence, within_main_assembly)
    elif str[0] == '.' and len(str) > 1:
        return _SingleBreakend(True, str[1:])
    elif str[-1] == '.' and len(str) > 1:
        return _SingleBreakend(False, str[:-1])
    elif str[0] == "<" and str[-1] == ">":
        return _SV(str[1:-1])
    else:
        return _Substitution(str)
def _get_record_for_indel(self, bpm_record_group):
    """
    Build the VCF record for a group of indel BPM records

    Args:
        bpm_record_group (list(BPMRecord)) : BPM records for the group (every
            record must agree with the first one on is_deletion)

    Returns:
        vcf._Record : The new VCF record definition
    """
    qual, filt, info, sample_indexes = VcfRecordFactory._get_record_defaults()
    identifier = self._get_identifier(bpm_record_group)

    for candidate in bpm_record_group:
        assert candidate.is_deletion == bpm_record_group[0].is_deletion

    # The first record of the group supplies sequence, position and contig.
    lead = bpm_record_group[0]
    _, indel_sequence, _ = lead.get_indel_source_sequences(RefStrand.Plus)
    zero_based_pos = lead.pos - 1

    chrom = lead.chromosome
    if chrom in ("XX", "XY"):
        chrom = "X"

    deleting = lead.is_deletion

    # A deletion is anchored on the base one position before zero_based_pos;
    # an insertion on the base at zero_based_pos itself.
    anchor_begin = zero_based_pos - 1 if deleting else zero_based_pos
    anchor_base = self._genome_reader.get_reference_bases(
        chrom, anchor_begin, anchor_begin + 1)
    anchor_plus_indel = anchor_base + indel_sequence

    if deleting:
        record_pos = zero_based_pos
        reference_allele, alternate_allele = anchor_plus_indel, anchor_base
    else:
        record_pos = zero_based_pos + 1
        reference_allele, alternate_allele = anchor_base, anchor_plus_indel

    return _Record(
        chrom,
        record_pos,
        identifier,
        reference_allele,
        [_Substitution(alternate_allele)],
        qual,
        filt,
        info,
        self._format_factory.get_format_id_string(),
        sample_indexes,
    )
def _grid_item_to_vcf_record(info_dict, obj, sample_ids, sample_names):
    """Build a ``_Record`` (plus one ``_Call`` per sample) from one grid item.

    Args:
        info_dict: Mapping of INFO id -> dict holding a
            ``'column__variant_column'`` entry that names the key to read
            from ``obj``.
        obj: Mapping for one grid row; read through ``.get()`` for the
            variant-level fields and ``[]`` for the per-sample fields.
        sample_ids: Sample ids used to build the per-sample keys in ``obj``.
            When falsy, no FORMAT string and no calls are produced.
        sample_names: Sample names, paired with ``sample_ids`` by position.

    Returns:
        The populated ``_Record``; its ``samples`` attribute holds the list
        of created ``_Call`` objects (empty when there are no samples).
    """
    chrom = obj.get("locus__contig__name", ".")
    pos = obj.get("locus__position", ".")
    identifier = obj.get("variantannotation__dbsnp_rs_id")
    ref = obj.get("locus__ref__seq", ".")
    alt_seq = obj.get("alt__seq", ".")

    # Only truthy values are copied into INFO.
    info = {}
    for info_id, column_data in info_dict.items():
        value = obj.get(column_data['column__variant_column'])
        if value:
            info[info_id] = value

    format_fields = ['GT', 'AD', 'AF', 'PL', 'DP', 'GQ']
    call_data_factory = make_calldata_tuple(format_fields)
    format_string = ':'.join(format_fields) if sample_ids else None

    # sample_indexes is filled in below, after the record already holds a
    # reference to it.
    sample_indexes = {}
    # QUAL is always written as '.', FILTER as None.
    record = _Record(chrom, pos, identifier, ref, [_Substitution(alt_seq)],
                     '.', None, info, format_string, sample_indexes)

    calls = []
    if sample_ids:
        for index, (sample_id, sample_name) in enumerate(zip(sample_ids, sample_names)):
            allele_depth = obj[f"{sample_id}_samples_allele_depth"]
            zygosity = obj[f"{sample_id}_samples_zygosity"]
            genotype = Zygosity.get_genotype_from_expanded_zygosity(zygosity)
            read_depth = obj[f"{sample_id}_samples_read_depth"]
            allele_frequency = obj[f"{sample_id}_samples_allele_frequency"]
            # GQ/PL/FT are optional now
            # TODO: Ideally, we'd not write them out
            phred_likelihood = obj.get(f"{sample_id}_samples_phred_likelihood", ".")
            genotype_quality = obj.get(f"{sample_id}_samples_genotype_quality", ".")

            # TODO: Need to grab information for reference base to be able
            # to properly fill in this data.
            call_data = call_data_factory(
                AD=['.', allele_depth],
                GT=genotype,
                PL=['.', phred_likelihood],
                DP=['.', read_depth],
                GQ=['.', genotype_quality],
                AF=['.', allele_frequency],
            )
            calls.append(_Call(record, sample_name, call_data))
            sample_indexes[sample_name] = index

    record.samples = calls
    return record
def test_coordinates_for_multiple_snps(self):
    """Verify coordinates for REF 'C' with single-base ALTs 'A', 'G' and 'T'.

    The expected pairs passed to the helper are ``(9, 10)`` and ``(9, 10)``.
    """
    substitutions = []
    for base in ('A', 'G', 'T'):
        substitutions.append(model._Substitution(base))

    record = model._Record('1', 10, 'id5', 'C', substitutions,
                           None, None, {}, None, {}, None)
    expected_first, expected_second = (9, 10), (9, 10)
    self.assert_has_expected_coordinates(record, expected_first, expected_second)
def _get_record_for_snv(self, bpm_record_group):
    """
    Create a new VCF record for an SNV

    Args:
        bpm_record_group (list(BPMRecord)) : BPM records for the group (must all be SNV records)

    Returns:
        vcf._Record : The new VCF record definition
    """
    assert not bpm_record_group[0].is_indel()
    identifier = self._get_identifier(bpm_record_group)
    # Position and contig are taken from the first record of the group.
    bpm_record = bpm_record_group[0]
    (qual, filt, info, sample_indexes) = VcfRecordFactory._get_record_defaults()
    start_index = bpm_record.pos - 1

    chrom = bpm_record.chromosome
    if chrom in ("XX", "XY"):
        chrom = "X"

    reference_base = self._genome_reader.get_reference_bases(
        chrom, start_index, start_index + 1)
    if not check_reference_allele(reference_base, bpm_record_group):
        # Logger.warn is a deprecated alias that was removed in Python 3.13;
        # warning() is the supported spelling.
        # NOTE(review): assumes self._logger is a standard logging.Logger.
        self._logger.warning(
            "Reference allele is not queried for locus: " + identifier)

    # Collect every distinct non-reference plus-strand allele across the
    # group, keeping first-seen order.
    alts = []
    for record in bpm_record_group:
        for nucleotide in record.plus_strand_alleles:
            if nucleotide != reference_base:
                substitution = _Substitution(nucleotide)
                if substitution not in alts:
                    # Reuse the object just built instead of constructing a
                    # second, identical _Substitution.
                    alts.append(substitution)

    return _Record(chrom, bpm_record.pos, identifier, reference_base, alts,
                   qual, filt, info,
                   self._format_factory.get_format_id_string(), sample_indexes)
def test_coordinates_for_insertion(self):
    """Check a record with REF 'C' and the single ALT 'CTA'.

    The record is handed to ``assert_has_expected_coordinates`` together
    with the expected pairs ``(9, 10)`` and ``(10, 10)``.
    """
    alts = [model._Substitution('CTA')]
    record = model._Record('1', 10, 'id2', 'C', alts,
                           None, None, {}, None, {}, None)
    self.assert_has_expected_coordinates(record, (9, 10), (10, 10))
def test_is_snp_for_n_alt(self):
    """A record with REF 'C' and the single ALT 'N' must report ``is_snp``."""
    alts = [model._Substitution('N')]
    record = model._Record('1', 10, 'id1', 'C', alts,
                           None, None, {}, None, {}, None)
    self.assertTrue(record.is_snp)
def test_coordinates_for_insertion(self):
    """Verify coordinates for REF 'C' with the single ALT 'CTA'.

    The expected pairs passed to the helper are ``(9, 10)`` and ``(10, 10)``.
    """
    inserted_allele = model._Substitution('CTA')
    record = model._Record(
        '1', 10, 'id2', 'C', [inserted_allele],
        None, None, {}, None, {}, None,
    )
    expected_first, expected_second = (9, 10), (10, 10)
    self.assert_has_expected_coordinates(record, expected_first, expected_second)
def test_is_snp_for_n_alt(self):
    """Verify ``is_snp`` is truthy for REF 'C' with the single ALT 'N'."""
    n_allele = model._Substitution('N')
    record = model._Record(
        '1', 10, 'id1', 'C', [n_allele],
        None, None, {}, None, {}, None,
    )
    is_snp = record.is_snp
    self.assertTrue(is_snp)