コード例 #1
0
def test_extract_sequence_vcf_plus(interval_plus, fasta, vcf):
    """Check ``mutate_sequence`` for the plus-strand interval with a VCF.

    Every expected entry pairs a sequence with a tuple that starts with
    ``(12, 14)``; the remaining entries append each non-empty subset of
    ``8``, ``11`` and ``17`` (presumably variant ids taken from the ``vcf``
    fixture -- confirm against the fixture data).
    """
    observed = VCFMutator(True, False, vcf, None).mutate_sequence(
        interval_plus, fasta, None)
    expected = [
        ("CCCTGAGGCGTCCTTGC", (12, 14)),
        ("CCCCAAGGCGTCCTTGC", (12, 14, 8)),
        ("CCCTGAAGCGTCCTTGC", (12, 14, 11)),
        ("CCCTGAGGCGTCTTTGC", (12, 14, 17)),
        ("CCCCAAAGCGTCCTTGC", (12, 14, 8, 11)),
        ("CCCCAAGGCGTCTTTGC", (12, 14, 8, 17)),
        ("CCCTGAAGCGTCTTTGC", (12, 14, 11, 17)),
        ("CCCCAAAGCGTCTTTGC", (12, 14, 8, 11, 17)),
    ]
    assert observed == expected
コード例 #2
0
def test_extract_sequence_vcf_minus(interval_minus, fasta, vcf):
    """Check ``mutate_sequence`` for the minus-strand interval with a VCF.

    Every expected entry pairs a sequence with a tuple that starts with
    ``(12, 14)``; the remaining entries append each non-empty subset of
    ``8``, ``11`` and ``17`` (presumably variant ids taken from the ``vcf``
    fixture -- confirm against the fixture data).
    """
    observed = VCFMutator(True, False, vcf, None).mutate_sequence(
        interval_minus, fasta, None)
    expected = [
        ("GCAAGGACGCCTCAGGG", (12, 14)),
        ("GCAAGGACGCCTTGGGG", (12, 14, 8)),
        ("GCAAGGACGCTTCAGGG", (12, 14, 11)),
        ("GCAAAGACGCCTCAGGG", (12, 14, 17)),
        ("GCAAGGACGCTTTGGGG", (12, 14, 8, 11)),
        ("GCAAAGACGCCTTGGGG", (12, 14, 8, 17)),
        ("GCAAAGACGCTTCAGGG", (12, 14, 11, 17)),
        ("GCAAAGACGCTTTGGGG", (12, 14, 8, 11, 17)),
    ]
    assert observed == expected
コード例 #3
0
def test_extract_sequence_no_vcf(interval_plus, fasta):
    """Check ``mutate_sequence`` when the mutator is built without a VCF.

    The result is expected to hold a single one-element tuple containing
    only the sequence string.
    """
    vcf_free_mutator = VCFMutator(True, False, None, None)
    observed = vcf_free_mutator.mutate_sequence(interval_plus, fasta, None)
    expected = [("CCCTGAGTCATCCTTGC", )]
    assert observed == expected
コード例 #4
0
ファイル: genome.py プロジェクト: gagneurlab/gfeat
    def get_nucleobase_mutation_table(self, vcf):
        """
        Get a table which shows whether a certain nucleobase in Kozak sequence or stop codon context was mutated or not.

        Transcripts are taken from contigs 1-22, X and Y; for every contig the
        '+' strand transcripts are processed before the '-' strand ones. Only
        transcripts that contain both a start and a stop codon contribute a
        row. For '-' strand transcripts the Kozak sequence and the stop codon
        context are reverse-complemented before being handed to the mutator.

        :param vcf: path to the vcf.gz or file opened using cyvcf2
        :type vcf: string or an "opened" file
        :return: pd.DataFrame,

                column names - K_i - where i shows position in Kozak sequence;
                S_i - where i shows position in stop codon context; gene_id; transcript_id

                rows - NaN - no variant, 1 - heterozygous variant, 2 - homozygous variant

                If no transcript qualifies, an empty frame with the K_i, S_i,
                transcript_id and gene_id columns is returned.
        """
        # Full length of a codon context; shorter Kozak sequences get their
        # K_i columns shifted so that they stay right-aligned within K_0..K_14.
        context_length = 15

        mutator = VCFMutator(False, True, vcf, True)

        contigs = [str(number) for number in range(1, 23)] + ['X', 'Y']

        nucleobases_lines = []
        for contig in contigs:
            # '+' before '-' keeps the row order stable per contig.
            for strand in ('+', '-'):
                for transcript in self.transcripts(contig, strand):
                    if not (transcript.contains_stop_codon
                            and transcript.contains_start_codon):
                        continue

                    chromosome = "chr" + transcript.contig

                    kozak_seq = transcript.get_Kozak_seq()
                    if strand == '-':
                        kozak_seq = reverse_complement(kozak_seq)
                    # NOTE(review): the same -6/+9 window around the first
                    # codon position is used for both strands -- confirm this
                    # is intended for '-' strand transcripts.
                    interval_kozak = Interval(
                        chromosome,
                        transcript.start_codon_positions[0] - 6,
                        transcript.start_codon_positions[0] + 9,
                        "NA", 0, strand)

                    stop_codon_context = transcript.get_stop_codon_context()
                    if strand == '-':
                        stop_codon_context = reverse_complement(
                            stop_codon_context)
                    interval_stop = Interval(
                        chromosome,
                        transcript.stop_codon_positions[0] - 6,
                        transcript.stop_codon_positions[0] + 9,
                        "NA", 0, strand)

                    line = mutator.mutate_codon_context(
                        [interval_kozak, interval_stop],
                        [kozak_seq, stop_codon_context], ["K_", "S_"])

                    if len(kozak_seq) < context_length:
                        # Renumber only the Kozak columns; a prefix test is
                        # used because the index is parsed from column[2:].
                        shift = context_length - len(kozak_seq)
                        line.columns = [
                            "K_" + str(int(column[2:]) + shift)
                            if column.startswith("K_") else column
                            for column in line.columns
                        ]

                    line["transcript_id"] = transcript.id
                    line["gene_id"] = transcript.gene_id
                    nucleobases_lines.append(line)

        if not nucleobases_lines:
            # pd.concat raises ValueError on an empty list, so return an
            # empty table with the documented columns instead.
            empty_columns = (
                ["K_" + str(i) for i in range(context_length)]
                + ["S_" + str(i) for i in range(context_length)]
                + ["transcript_id", "gene_id"])
            return pd.DataFrame(columns=empty_columns)

        # Concatenate once after all contigs were visited instead of
        # rebuilding the whole table after every contig.
        df_nucleobases = pd.concat(nucleobases_lines, ignore_index=True)
        return df_nucleobases.drop(['name'], axis=1)