def test_extract_sequence_vcf_plus(interval_plus, fasta, vcf):
    """Check ``VCFMutator.mutate_sequence`` output for the plus-strand interval.

    The mutator is built with the ``vcf`` fixture and must return exactly the
    list below, in this order. Each entry pairs a sequence with a tuple of
    integers (presumably identifying the variants applied to that sequence --
    confirm against ``VCFMutator``).
    """
    expected = [
        ("CCCTGAGGCGTCCTTGC", (12, 14)),
        ("CCCCAAGGCGTCCTTGC", (12, 14, 8)),
        ("CCCTGAAGCGTCCTTGC", (12, 14, 11)),
        ("CCCTGAGGCGTCTTTGC", (12, 14, 17)),
        ("CCCCAAAGCGTCCTTGC", (12, 14, 8, 11)),
        ("CCCCAAGGCGTCTTTGC", (12, 14, 8, 17)),
        ("CCCTGAAGCGTCTTTGC", (12, 14, 11, 17)),
        ("CCCCAAAGCGTCTTTGC", (12, 14, 8, 11, 17)),
    ]

    vcf_mutator = VCFMutator(True, False, vcf, None)
    observed = vcf_mutator.mutate_sequence(interval_plus, fasta, None)

    assert observed == expected
def test_extract_sequence_vcf_minus(interval_minus, fasta, vcf):
    """Check ``VCFMutator.mutate_sequence`` output for the minus-strand interval.

    The mutator is built with the ``vcf`` fixture and must return exactly the
    list below, in this order. Each entry pairs a sequence with a tuple of
    integers (presumably identifying the variants applied to that sequence --
    confirm against ``VCFMutator``).
    """
    expected = [
        ("GCAAGGACGCCTCAGGG", (12, 14)),
        ("GCAAGGACGCCTTGGGG", (12, 14, 8)),
        ("GCAAGGACGCTTCAGGG", (12, 14, 11)),
        ("GCAAAGACGCCTCAGGG", (12, 14, 17)),
        ("GCAAGGACGCTTTGGGG", (12, 14, 8, 11)),
        ("GCAAAGACGCCTTGGGG", (12, 14, 8, 17)),
        ("GCAAAGACGCTTCAGGG", (12, 14, 11, 17)),
        ("GCAAAGACGCTTTGGGG", (12, 14, 8, 11, 17)),
    ]

    vcf_mutator = VCFMutator(True, False, vcf, None)
    observed = vcf_mutator.mutate_sequence(interval_minus, fasta, None)

    assert observed == expected
def test_extract_sequence_no_vcf(interval_plus, fasta):
    """Check ``VCFMutator.mutate_sequence`` when no VCF is supplied.

    ``None`` is passed where the other tests pass the ``vcf`` fixture; the
    result must be a single one-element tuple holding just a sequence.
    """
    expected = [("CCCTGAGTCATCCTTGC", )]

    vcf_mutator = VCFMutator(True, False, None, None)
    observed = vcf_mutator.mutate_sequence(interval_plus, fasta, None)

    assert observed == expected
def get_nucleobase_mutation_table(self, vcf):
    """
    Get a table which shows whether a certain nucleobase in Kozak sequence
    or stop codon context was mutated or not.

    Transcripts are visited contig by contig ('1'..'22', 'X', 'Y'); within a
    contig all '+' strand transcripts come first, then all '-' strand ones.
    Only transcripts that contain both a stop codon and a start codon
    contribute a row.

    :param vcf: path to the vcf.gz or file opened using cyvcf2
    :type vcf: string or an "opened" file
    :return: pd.DataFrame,
        column names –
            K_i – where i shows position in Kozak sequence;
            S_i – where i shows position in stop codon context;
            gene_id;
            transcript_id
        rows –
            NaN – no variant,
            1 – heterozygous variant,
            2 – homozygous variant
        If no transcript qualifies, an empty DataFrame with the K_0..K_14,
        S_0..S_14, transcript_id and gene_id columns is returned.
    """
    # Length of the window built around each codon (start - 6 .. start + 9).
    context_length = 15

    # Column layout used when there is nothing to concatenate; the "name"
    # column is left out because it is dropped from the result anyway.
    empty_columns = (
        ["K_" + str(i) for i in range(context_length)]
        + ["S_" + str(i) for i in range(context_length)]
        + ["transcript_id", "gene_id"]
    )

    contigs = [str(number) for number in range(1, 23)] + ['X', 'Y']

    mutator = VCFMutator(False, True, vcf, True)
    nucleobases_lines = []

    for contig in contigs:
        for strand in ('+', '-'):
            on_minus_strand = strand == '-'
            for transcript in self.transcripts(contig, strand):
                if not (transcript.contains_stop_codon
                        and transcript.contains_start_codon):
                    continue

                Kozak_seq = transcript.get_Kozak_seq()
                if on_minus_strand:
                    Kozak_seq = reverse_complement(Kozak_seq)
                # NOTE(review): both strands use the same coordinate
                # arithmetic around positions[0] -- confirm this is intended
                # for '-' strand transcripts.
                Interval_Kozak = Interval(
                    "chr" + transcript.contig,
                    transcript.start_codon_positions[0] - 6,
                    transcript.start_codon_positions[0] + 9,
                    "NA", 0, strand)

                stop_codon_context = transcript.get_stop_codon_context()
                if on_minus_strand:
                    stop_codon_context = reverse_complement(
                        stop_codon_context)
                Interval_stop = Interval(
                    "chr" + transcript.contig,
                    transcript.stop_codon_positions[0] - 6,
                    transcript.stop_codon_positions[0] + 9,
                    "NA", 0, strand)

                df_nucleobases_line = mutator.mutate_codon_context(
                    [Interval_Kozak, Interval_stop],
                    [Kozak_seq, stop_codon_context],
                    ["K_", "S_"])

                # A Kozak sequence shorter than the full window gets its K_
                # column indices shifted up by the number of missing bases,
                # so the last base still lands on K_14.
                missing = context_length - len(Kozak_seq)
                if missing > 0:
                    df_nucleobases_line.columns = [
                        "K_" + str(int(column[2:]) + missing)
                        if "K_" in column else column
                        for column in df_nucleobases_line.columns
                    ]

                df_nucleobases_line["transcript_id"] = transcript.id
                df_nucleobases_line["gene_id"] = transcript.gene_id
                nucleobases_lines.append(df_nucleobases_line)

    # pd.concat raises ValueError on an empty list, so return an empty table
    # with the documented columns instead of crashing.
    if not nucleobases_lines:
        return pd.DataFrame(columns=empty_columns)

    df_nucleobases = pd.concat(nucleobases_lines, ignore_index=True)
    # Each per-transcript frame carries a "name" column that is not part of
    # the documented result.
    return df_nucleobases.drop(['name'], axis=1)