Exemple #1
0
    def testChromosomeConversionHG19(self):
        """Check hg19 chromosome conversion: "23" becomes "X", "24" becomes "Y", "2" and "4" stay as they are.
        """
        convert = MutUtils.convertChromosomeStringToMutationDataFormat

        # (input chromosome, expected result, prefix of the failure message)
        cases = (
            ("23", "X", "chrom of 23 did not produce X: "),
            ("24", "Y", "chrom of 24 did not produce Y: "),
            ("2", "2", "chrom of 2 yielded different value: "),
            ("4", "4", "chrom of 4 yielded different value: "),
        )
        for chrom, expected, prefix in cases:
            # The failure message is completed with the value that was actually produced.
            self.assertEqual(convert(chrom, build="hg19"), expected,
                             prefix + convert(chrom, build="hg19"))
Exemple #2
0
    def _determine_matching_alt_indices(self, mut, record, build):
        """
        Find which alternate alleles of a variant record match a mutation.

        In "exact" match mode the chromosome, alleles and coordinates must all agree.
        In any other mode only the chromosome and a positional relationship are checked.

        :param mut: mutation to match; its chr, start, end, ref_allele and alt_allele are read
        :param record: variant record; its CHROM, POS, REF, ALT and is_monomorphic are read
        :param build: genome build, handed to MutUtils.initializeMutFromAttributes
        :return: list of indices into record.ALT that match, [-1] when a monomorphic
            record matches, or an empty list when nothing matches
        """
        is_exact = self.match_mode == "exact"
        # The chromosome does not depend on the alternate allele, so convert it once.
        chrom = MutUtils.convertChromosomeStringToMutationDataFormat(
            record.CHROM)

        if record.is_monomorphic:
            if mut.chr != chrom:
                return []
            if is_exact:
                matched = mut.ref_allele == record.REF
            else:
                # The mutation has to span the record position.
                matched = int(mut.start) <= record.POS and int(
                    mut.end) >= record.POS
            return [-1] if matched else []

        indices = []
        for index, alt_allele in enumerate(record.ALT):
            ds_mut = MutUtils.initializeMutFromAttributes(
                chrom, record.POS, record.POS, str(record.REF),
                str(alt_allele), build)

            # Every match, in either mode, needs the same chromosome.
            if mut.chr != ds_mut.chr:
                continue

            if is_exact:
                if mut.ref_allele == ds_mut.ref_allele \
                        and mut.alt_allele == ds_mut.alt_allele \
                        and int(mut.start) == int(ds_mut.start) \
                        and int(mut.end) == int(ds_mut.end):
                    indices.append(index)
                continue

            m_start, m_end = int(mut.start), int(mut.end)
            d_start, d_end = int(ds_mut.start), int(ds_mut.end)

            same_span = m_start == d_start and m_end == d_end
            # mutation starts inside the record span and runs to or past its end
            overlaps_end = m_start >= d_start and m_end >= d_end and m_start <= d_end
            # mutation covers the whole record span
            covers = m_start <= d_start and m_end >= d_end
            # mutation starts at or before the record span and ends inside it
            overlaps_start = m_start <= d_start and m_end <= d_end and m_end >= d_start
            # NOTE(review): a mutation lying strictly inside the record span satisfies
            # none of these tests and is therefore not reported -- confirm this is intended.
            if same_span or overlaps_end or covers or overlaps_start:
                indices.append(index)

        return indices
    def _determine_matching_alt_indices(self, mut, record, build):
        """
        Report the alternate alleles of a variant record that correspond to a mutation.

        :param mut: mutation being matched (chr, start, end, ref_allele and alt_allele are read)
        :param record: variant record (CHROM, POS, REF, ALT and is_monomorphic are read)
        :param build: genome build passed through to MutUtils.initializeMutFromAttributes
        :return: indices into record.ALT for every matching alternate; [-1] if the record is
            monomorphic and matches; [] if there is no match
        """
        exact = self.match_mode == "exact"
        # Loop-invariant: the converted chromosome is the same for every alternate.
        chrom = MutUtils.convertChromosomeStringToMutationDataFormat(record.CHROM)

        if record.is_monomorphic:
            pos = record.POS
            if exact:
                hit = mut.chr == chrom and mut.ref_allele == record.REF
            else:
                # Non-exact mode: the mutation interval must contain the record position.
                hit = mut.chr == chrom and int(mut.start) <= pos and int(mut.end) >= pos
            if hit:
                return [-1]
            return []

        indices = []
        ref = str(record.REF)
        # Iterate over all alternates in the record
        for index, alternate in enumerate(record.ALT):
            ds_mut = MutUtils.initializeMutFromAttributes(chrom, record.POS, record.POS, ref, str(alternate), build)
            if mut.chr != ds_mut.chr:
                continue

            if exact:
                alleles_agree = mut.ref_allele == ds_mut.ref_allele and mut.alt_allele == ds_mut.alt_allele
                # Coordinates are only converted once the alleles agree.
                if alleles_agree and int(mut.start) == int(ds_mut.start) and int(mut.end) == int(ds_mut.end):
                    indices.append(index)
            else:
                m_start, m_end = int(mut.start), int(mut.end)
                d_start, d_end = int(ds_mut.start), int(ds_mut.end)
                # NOTE(review): a mutation strictly inside the record span matches none of the
                # four relationships below -- confirm that this gap is deliberate.
                if ((m_start, m_end) == (d_start, d_end)            # identical span
                        or d_start <= m_start <= d_end <= m_end     # starts inside, ends at/after the end
                        or (m_start <= d_start and d_end <= m_end)  # covers the whole span
                        or m_start <= d_start <= m_end <= d_end):   # starts at/before, ends inside
                    indices.append(index)

        return indices
Exemple #4
0
    def testChromosomeConversionHG19(self):
        """Verify that, for an hg19 build, chrom 23 and 24 are converted to X and Y
        while ordinary chromosome numbers pass through unchanged.
        """

        def check(chrom, expected, complaint):
            # The converter is called once for the value under test and once more
            # to complete the failure message.
            produced = MutUtils.convertChromosomeStringToMutationDataFormat(
                chrom, build="hg19")
            message = complaint + MutUtils.convertChromosomeStringToMutationDataFormat(
                chrom, build="hg19")
            self.assertEqual(produced, expected, message)

        check("23", "X", "chrom of 23 did not produce X: ")
        check("24", "Y", "chrom of 24 did not produce Y: ")

        check("2", "2", "chrom of 2 yielded different value: ")
        check("4", "4", "chrom of 4 yielded different value: ")
Exemple #5
0
    def createMutations(self):
        """ No inputs.
        Returns a generator of mutations built from the specified maflite file.

        Every specified column becomes an 'INPUT' annotation on the mutation.  A column
        that is an alias is stored under the real field name it maps to, and values of
        the "chr" field are converted through MutUtils before being stored. """

        alias_to_field = self._reverseAlternativeDict
        columns = self._specified_fields

        for line in self._tsvReader:

            mut = self._mutation_data_factory.create(build=self._build)

            for col in columns:
                # A column is either an alias for a mutation data field (look up the real
                # name first) or is stored under its own name.
                if col in alias_to_field:
                    field = alias_to_field[col]
                    self.logger.debug(field + " found from " + col)
                else:
                    field = col

                val = line[col]
                # Make sure to convert chromosome values.
                if field == "chr":
                    val = MutUtils.convertChromosomeStringToMutationDataFormat(
                        val)
                mut.createAnnotation(field, val, 'INPUT')

            # remove any surrounding whitespace if present
            mut.ref_allele = mut.ref_allele.strip()
            mut.alt_allele = mut.alt_allele.strip()

            # if the alt allele == ref_allele, check that this is not a case where there is an alt_allele2 that is different.
            if mut.alt_allele == mut.ref_allele:
                mut.alt_allele = self._find_alt_allele_in_other_field(
                    line, mut.ref_allele)

            # FIXME: Support more than one alias in the reverse dictionary.  Then this line can be removed.
            # Compare by value: identity checks against "" depend on string interning.
            if mut.start != "" and mut.end == "":
                mut.end = mut.start
            if mut.end != "" and mut.start == "":
                mut.start = mut.end

            yield mut
Exemple #6
0
    def retrieveExons(self, gene, padding=10, isCodingOnly=False):
        """Return the padded exon intervals of every transcript of a gene.

        :param gene: gene name, looked up in self.gene_id_idx and compared with each transcript's "gene" entry
        :param padding: amount subtracted from each interval start and added to each interval end
        :param isCodingOnly: when True, non-coding transcripts are skipped and the coordinates come
            from self.getCodingTranscriptCoords instead of the transcript's "genomic_coords"
        :return: set of (gene, chr, start, end) tuples, with start and end as strings;
            an empty set when the gene is not in the index
        """
        result = set()
        geneTuple = self.gene_id_idx.get(gene, None)
        if geneTuple is None:
            return result

        contig = MutUtils.convertChromosomeStringToMutationDataFormat(geneTuple[0])
        for b in self.Transcripts.get(contig, []):
            for transcript in self.Transcripts[contig][b]:
                if transcript["gene"] != gene:
                    continue

                if isCodingOnly:
                    if gaf_annotation.is_non_coding_transcript(transcript, self):
                        continue
                    genomic_coords = self.getCodingTranscriptCoords(transcript)
                else:
                    genomic_coords = transcript["genomic_coords"]

                for coord in genomic_coords:
                    # A coordinate pair may be stored in either order, so normalize it.
                    start = min(coord[0], coord[1])
                    end = max(coord[0], coord[1])
                    result.add((gene, transcript["chr"], str(start - padding), str(end + padding)))
        return result
Exemple #7
0
    def retrieveExons(self, gene, padding=10, isCodingOnly=False):
        """Collect padded exon intervals for all transcripts that belong to a gene.

        :param gene: gene name; must be a key of self.gene_id_idx for anything to be returned
        :param padding: value subtracted from every start and added to every end
        :param isCodingOnly: if True, skip non-coding transcripts and use the coordinates
            returned by self.getCodingTranscriptCoords
        :return: set of (gene, chr, start, end) tuples whose start and end are strings
            (empty if the gene is unknown)
        """
        exons = set()
        gene_entry = self.gene_id_idx.get(gene, None)
        if gene_entry is None:
            return exons

        contig = MutUtils.convertChromosomeStringToMutationDataFormat(
            gene_entry[0])
        for bin_key in self.Transcripts.get(contig, []):
            for tx in self.Transcripts[contig][bin_key]:
                if tx['gene'] != gene:
                    continue
                if isCodingOnly and gaf_annotation.is_non_coding_transcript(
                        tx, self):
                    continue

                if isCodingOnly:
                    coords = self.getCodingTranscriptCoords(tx)
                else:
                    coords = tx['genomic_coords']

                for coord in coords:
                    # The pair is not guaranteed to be ordered; take the smaller as start.
                    low = min(coord[0], coord[1])
                    high = max(coord[0], coord[1])
                    exons.add((gene, tx['chr'], str(low - padding),
                               str(high + padding)))
        return exons
Exemple #8
0
    def _createMutation(self, record, alt_index, build):
        """Create an annotated mutation for one allele of a variant record.

        :param record: variant record; CHROM, POS, REF, ALT, ID, QUAL and is_monomorphic are read
        :param alt_index: index into record.ALT of the alternate allele to use
        :param build: genome build handed to MutUtils.initializeMutFromAttributes
        :return: the mutation, after the filter and info annotations have been added
        """
        contig = MutUtils.convertChromosomeStringToMutationDataFormat(
            record.CHROM)
        start = int(record.POS)
        end = int(record.POS)

        # A reference of "." is stored as an empty string.
        reference = record.REF.strip()
        if reference == ".":
            reference = ""

        # A monomorphic record has no alternate, so the reference doubles as the alt.
        if record.is_monomorphic:
            alternate = reference
        else:
            alternate = str(record.ALT[alt_index]).strip()

        mutation = MutUtils.initializeMutFromAttributes(
            contig, start, end, reference, alternate, build,
            self._mutation_data_factory)

        record_id = record.ID if record.ID is not None else ""
        mutation.createAnnotation("id", record_id, "INPUT",
                                  tags=[TagConstants.ID])
        mutation.createAnnotation("qual", str(record.QUAL), "INPUT",
                                  tags=[TagConstants.QUAL])
        mutation.createAnnotation("alt_allele_seen", str(True), "INPUT")

        # Pick the filter-annotation strategy configured on this instance.
        if self.collapse_filter_fields:
            add_filter_data = self._add_filter_data_2_mutation_single_field
        else:
            add_filter_data = self._addFilterData2Mutation
        mutation = add_filter_data(mutation, record)

        return self._addInfoData2Mutation(mutation, record, alt_index)
    def createMutations(self):
        """ No inputs.
        Returns a generator of mutations built from the specified maflite file.

        Each specified column is stored on the mutation as an 'INPUT' annotation; alias
        columns are stored under the real field name, and "chr" values are converted
        through MutUtils first. """

        aliases = self._reverseAlternativeDict
        allColumns = self._specified_fields

        for line in self._tsvReader:

            # We only need to assign fields that are mutation attributes and have a different name in the maflite file.
            mut = self._mutation_data_factory.create(build=self._build)

            for col in allColumns:
                # Resolve the name the value is stored under: the real field for an alias,
                # otherwise the column name itself.
                realKey = col
                if col in aliases:
                    realKey = aliases[col]
                    self.logger.debug(realKey + " found from " + col)

                val = line[col]
                if realKey == "chr":
                    # Make sure to convert chromosome values.
                    val = MutUtils.convertChromosomeStringToMutationDataFormat(val)
                mut.createAnnotation(realKey, val, 'INPUT')

            # remove any surrounding whitespace if present
            mut.ref_allele, mut.alt_allele = mut.ref_allele.strip(), mut.alt_allele.strip()

            # if the alt allele == ref_allele, check that this is not a case where there is an alt_allele2 that is different.
            if mut.alt_allele == mut.ref_allele:
                mut.alt_allele = self._find_alt_allele_in_other_field(line, mut.ref_allele)

            # FIXME: Support more than one alias in the reverse dictionary.  Then this line can be removed.
            # Equality, not identity: `is ""` only works when the empty string happens to be interned.
            if mut.start != "" and mut.end == "":
                mut.end = mut.start
            elif mut.end != "" and mut.start == "":
                mut.start = mut.end

            yield mut
    def _createMutation(self, record, alt_index, build):
        """Turn one allele of a variant record into an annotated mutation.

        :param record: variant record; CHROM, POS, REF, ALT, ID, QUAL and is_monomorphic are read
        :param alt_index: position in record.ALT of the alternate allele to use
        :param build: genome build passed to MutUtils.initializeMutFromAttributes
        :return: the mutation once filter and info data have been attached
        """
        contig = MutUtils.convertChromosomeStringToMutationDataFormat(record.CHROM)
        begin = int(record.POS)
        finish = int(record.POS)

        # "." as the reference is represented by an empty string.
        reference = record.REF
        if reference == ".":
            reference = ""

        # Without an alternate (monomorphic record) the alt allele equals the reference.
        alternate = reference if record.is_monomorphic else str(record.ALT[alt_index])

        result = MutUtils.initializeMutFromAttributes(contig, begin, finish, reference, alternate, build)

        identifier = record.ID if record.ID is not None else ""
        result.createAnnotation("id", identifier, "INPUT", tags=[TagConstants.ID])
        result.createAnnotation("qual", str(record.QUAL), "INPUT", tags=[TagConstants.QUAL])
        result.createAnnotation("alt_allele_seen", str(True), "INPUT")

        result = self._addFilterData2Mutation(result, record)
        return self._addInfoData2Mutation(result, record, alt_index)
    def _convertGFFRecordToTranscript(self, gff_record, seq_dict, seq_dict_keys, tx_to_protein_mapping):
        """

        :param gff_record:
        :param seq_dict:
        :return: None if the record is a gene record or otherwise does not represent a transcript, CDS, *_codon, or exon
        """
        types_of_interest = ["exon", "CDS", "start_codon", "stop_codon"]
        if gff_record['type'] not in types_of_interest:
            return None

        quals = gff_record['quals']
        transcript_id = quals['transcript_id'][0]

        try:
            tx = self._transcript_index[transcript_id]
        except KeyError:

            # Create the initial record for this transcript.
            contig = MutUtils.convertChromosomeStringToMutationDataFormat(gff_record['rec_id'])
            tx = Transcript(transcript_id, gene=quals['gene_name'][0], gene_id=quals['gene_id'][0], contig=contig)
            self._transcript_index[transcript_id] = tx

            # Set the gene_type based on gene_type or gene_biotype
            key = "gene_biotype"
            if key not in quals.keys():
                key = "gene_type"
            self._transcript_index[transcript_id].set_gene_type(quals.get(key, [""])[0])

            if gff_record['strand'] == 1:
                self._transcript_index[transcript_id].set_strand("+")
            else:
                self._transcript_index[transcript_id].set_strand("-")
            qual_keys = quals.keys()
            for attribute in GenomeBuildFactory.QUALS_TO_CHECK:
                if attribute in qual_keys:
                    self._transcript_index[transcript_id].add_other_attribute(attribute, "|".join(quals[attribute]))

            seq = seq_dict.get(transcript_id, None)

            if seq is not None:
                genome_seq_as_str = str(seq.seq)
            else:
                genome_seq_as_str = ""

            self._transcript_index[transcript_id].set_seq(genome_seq_as_str)

            tx_id_for_protein_lookup = transcript_id
            if '.' in transcript_id:
                tx_id_for_protein_lookup = tx_id_for_protein_lookup[:tx_id_for_protein_lookup.index('.')]
            self._transcript_index[transcript_id].set_protein_id(tx_to_protein_mapping.get(tx_id_for_protein_lookup, ""))

            tx = self._transcript_index[transcript_id]

        gff_type = gff_record['type']
        if gff_type == 'exon':
            tx.add_exon(gff_record['location'][0], gff_record['location'][1], quals['exon_number'][0])
        elif gff_type == 'CDS':
            tx.add_cds(gff_record['location'][0], gff_record['location'][1])
        elif gff_type == 'start_codon':
            tx.set_start_codon(gff_record['location'][0], gff_record['location'][1])
        elif gff_type == 'stop_codon':
            tx.set_stop_codon(gff_record['location'][0], gff_record['location'][1])
    def _convertGFFRecordToTranscript(self, gff_record, seq_dict,
                                      seq_dict_keys, tx_to_protein_mapping):
        """

        :param gff_record:
        :param seq_dict:
        :return: None if the record is a gene record or otherwise does not represent a transcript, CDS, *_codon, or exon
        """
        types_of_interest = ["exon", "CDS", "start_codon", "stop_codon"]
        if gff_record['type'] not in types_of_interest:
            return None

        quals = gff_record['quals']
        transcript_id = quals['transcript_id'][0]

        try:
            tx = self._transcript_index[transcript_id]
        except KeyError:

            # Create the initial record for this transcript.
            contig = MutUtils.convertChromosomeStringToMutationDataFormat(
                gff_record['rec_id'])
            tx = Transcript(transcript_id,
                            gene=quals['gene_name'][0],
                            gene_id=quals['gene_id'][0],
                            contig=contig)
            self._transcript_index[transcript_id] = tx

            # Set the gene_type based on gene_type or gene_biotype
            key = "gene_biotype"
            if key not in quals.keys():
                key = "gene_type"
            self._transcript_index[transcript_id].set_gene_type(
                quals.get(key, [""])[0])

            if gff_record['strand'] == 1:
                self._transcript_index[transcript_id].set_strand("+")
            else:
                self._transcript_index[transcript_id].set_strand("-")
            qual_keys = quals.keys()
            for attribute in GenomeBuildFactory.QUALS_TO_CHECK:
                if attribute in qual_keys:
                    self._transcript_index[transcript_id].add_other_attribute(
                        attribute, "|".join(quals[attribute]))

            seq = seq_dict.get(transcript_id, None)

            if seq is not None:
                genome_seq_as_str = str(seq.seq)
            else:
                genome_seq_as_str = ""

            self._transcript_index[transcript_id].set_seq(genome_seq_as_str)

            tx_id_for_protein_lookup = transcript_id
            if '.' in transcript_id:
                tx_id_for_protein_lookup = tx_id_for_protein_lookup[:
                                                                    tx_id_for_protein_lookup
                                                                    .index('.'
                                                                           )]
            self._transcript_index[transcript_id].set_protein_id(
                tx_to_protein_mapping.get(tx_id_for_protein_lookup, ""))

            tx = self._transcript_index[transcript_id]

        gff_type = gff_record['type']
        if gff_type == 'exon':
            tx.add_exon(gff_record['location'][0], gff_record['location'][1],
                        quals['exon_number'][0])
        elif gff_type == 'CDS':
            tx.add_cds(gff_record['location'][0], gff_record['location'][1])
        elif gff_type == 'start_codon':
            tx.set_start_codon(gff_record['location'][0],
                               gff_record['location'][1])
        elif gff_type == 'stop_codon':
            tx.set_stop_codon(gff_record['location'][0],
                              gff_record['location'][1])