Exemple #1
0
    def test_getCDSAlignment(self):
        """Test GeneLink.getCDSAlignment on two duplicated gene pairs.

        Two scenarios are checked against the tuple returned by
        getCDSAlignment(), which holds two alignment strings followed by
        two Region objects:

        1. both genes and both duplication regions carry strand 1;
        2. the second gene carries strand -1 and the Chr1 duplication
           region is flagged -1.
        """

        # --- Scenario 1: everything on strand 1 ---
        gene1 = Gene('G00001', 'Chr1', 1, 27, 1, [
            Transcript('G00001.1', 'Chr1', 1, 27, 1, 'G00001', [
                CDS('G00001.1_cds_1', 'Chr1', 1, 6, 1, 'G00001.1'),
                CDS('G00001.1_cds_1', 'Chr1', 13, 21, 1, 'G00001.1')
            ])
        ])
        gene2 = Gene('G00002', 'Chr5', 1, 27, 1, [
            Transcript('G00002.1', 'Chr5', 1, 27, 1, 'G00002', [
                CDS('G00002.1_cds_1', 'Chr5', 1, 6, 1, 'G00002.1'),
                CDS('G00002.1_cds_1', 'Chr5', 13, 27, 1, 'G00002.1')
            ])
        ])

        gl = GeneLink(
            Duplication('Chr1', 1, 58, 'Chr5', 1, 60, [
                (Region('Chr1', 1, 58, 1), Region('Chr5', 1, 60, 1))
            ], [(
                'ATGTATTCTATCTCATGTTAATGCTAATACTAGTCATGATCAGATACGATGATGAT--TA',
                'ATGTATTCTATCTCATGTTACTGCTAATACTAGTCATGATCAGATACGATGATGATCATA')
                ]), gene1, gene2)

        # NOTE(review): the lower-case stretches in the expected strings
        # presumably mark positions outside the CDS features - confirm
        # against GeneLink.getCDSAlignment.
        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed from unittest in Python 3.12.
        self.assertEqual(
            ('ATGTATtctatcTCATGTTAAtgctaa', 'ATGTATtctatcTCATGTTACTGCTAA',
             Region('Chr1', 1, 27, 1), Region('Chr5', 1, 27, 1)),
            gl.getCDSAlignment())

        # --- Scenario 2: gene2 on strand -1, Chr1 region flagged -1 ---
        gene1 = Gene('G00001', 'Chr1', 1, 27, 1, [
            Transcript('G00001.1', 'Chr1', 1, 27, 1, 'G00001', [
                CDS('G00001.1_cds_1', 'Chr1', 1, 6, 1, 'G00001.1'),
                CDS('G00001.1_cds_1', 'Chr1', 13, 21, 1, 'G00001.1')
            ])
        ])
        gene2 = Gene('G00002', 'Chr5', 27, 60, -1, [
            Transcript('G00002.1', 'Chr5', 27, 60, -1, 'G00002', [
                CDS('G00002.1_cds_1', 'Chr5', 27, 39, -1, 'G00002.1'),
                CDS('G00002.1_cds_1', 'Chr5', 48, 60, -1, 'G00002.1')
            ])
        ])

        gl = GeneLink(
            Duplication('Chr1', 1, 58, 'Chr5', 1, 60, [
                (Region('Chr1', 1, 58, -1), Region('Chr5', 1, 60, 1))
            ], [(
                'ATGTATTCTATCTCATGTTAATGCTAATACTAGTCATGATCAGATACGATGATGAG--TA',
                'ATGTATTCTATCTCATGTTACTGCTAATACTAGTCATGATCAGATACGATGATGATCATA')
                ]), gene1, gene2)

        self.assertEqual(
            ('atactagtcatGATCAGATAcgatgaTGAG--TA',
             'ATACTAGTCATGAtcagatacGATGATGATCATA', Region(
                 'Chr1', 1, 32, -1), Region('Chr5', 27, 60, 1)),
            gl.getCDSAlignment())
Exemple #2
0
    def test_getlGenesFromCoordinates(self):
        """Test getlGenesFromCoordinates.

        Two genes are inserted on Chr1; only the second one lies inside the
        queried window (230000-250000), so it must be the sole result.
        """

        gene1 = Gene('G00001', 'Chr1', 23988, 24919, -1, [
            Transcript('G00001.1', 'Chr1', 23988, 24919, -1, 'G00001', [
                CDS('G00001.1_cds_1', 'Chr1', 23988, 24083, -1, 'G00001.1'),
                CDS('G00001.1_cds_1', 'Chr1', 24274, 24427, -1, 'G00001.1'),
                CDS('G00001.1_cds_1', 'Chr1', 24489, 24919, -1, 'G00001.1')
            ])
        ])
        gene2 = Gene('G00002', 'Chr1', 239880, 249190, -1, [
            Transcript('G00002.1', 'Chr1', 239880, 249190, -1, 'G00002', [
                CDS('G00002.1_cds_1', 'Chr1', 239880, 240830, -1, 'G00002.1'),
                CDS('G00002.1_cds_1', 'Chr1', 242740, 244270, -1, 'G00002.1'),
                CDS('G00002.1_cds_1', 'Chr1', 244890, 249190, -1, 'G00002.1')
            ])
        ])

        lGenes = [gene1, gene2]
        # self.db.deleteAllGenes()  TODO
        self.db.insertlGenes(lGenes)

        # list(...) keeps the comparison valid whether the database layer
        # returns a list or a dict view (a dict view never equals a list).
        # assertEqual replaces the assertEquals alias removed in Python 3.12.
        self.assertEqual(
            [gene2],
            list(self.db.getlGenesFromCoordinates('Chr1', 230000, 250000)))
Exemple #3
0
    def test_selectAllGenes(self):
        """Test selectAllGenes.

        A single gene (one transcript, three CDS) is inserted and must be
        read back unchanged.
        """

        gene1 = Gene('G00001', 'Chr1', 23988, 24919, -1, [
            Transcript('G00001.1', 'Chr1', 23988, 24919, -1, 'G00001', [
                CDS('G00001.1_cds_1', 'Chr1', 23988, 24083, -1, 'G00001.1'),
                CDS('G00001.1_cds_1', 'Chr1', 24274, 24427, -1, 'G00001.1'),
                CDS('G00001.1_cds_1', 'Chr1', 24489, 24919, -1, 'G00001.1')
            ])
        ])

        lGenes = [gene1]
        # self.db.deleteAllGenes()  TODO
        self.db.insertlGenes(lGenes)

        # list(...) keeps the comparison valid whether the database layer
        # returns a list or a dict view (a dict view never equals a list).
        # assertEqual replaces the assertEquals alias removed in Python 3.12.
        self.assertEqual([gene1], list(self.db.selectAllGenes()))
Exemple #4
0
    def getlGenesFromCoordinates(self, seqid, start, end):
        """Get genes included in a defined region.

        A feature is selected when it sits on ``seqid`` and lies strictly
        inside the window, i.e. ``feature.start > start`` and
        ``feature.end < end``. Transcripts are attached to the selected
        genes and CDS to the selected transcripts.

        Args:
            seqid: sequence identifier to match against the ``seqid`` column.
            start: lower bound of the window (exclusive).
            end: upper bound of the window (exclusive).

        Returns:
            A list of Gene objects in the order the genes were read from the
            ``order by start`` query (on Python versions whose dicts keep
            insertion order), or an empty list when no gene matches.
        """

        # Values are bound by the driver instead of being formatted into
        # the SQL text, so a quote in seqid can neither break nor alter
        # the statement.
        # NOTE(review): assumes self.conn is an sqlite3 connection ("?"
        # placeholders) and that start/end are numbers - confirm.
        bounds = (seqid, start, end)

        dGenes = {}
        cursor = self.conn.execute(
            '''select id, seqid, start, end, strand from gene where seqid = ? and start > ? and end < ? order by start''',
            bounds)
        for row in cursor:
            dGenes[row[0]] = Gene(row[0], row[1], row[2], row[3], row[4])

        if not dGenes:
            return []

        dTranscripts = {}
        cursor = self.conn.execute(
            '''select id, seqid, start,end,strand,gene_id from transcript where seqid = ? and start > ? and end < ? order by start''',
            bounds)
        for row in cursor:
            transcript = Transcript(row[0], row[1], row[2], row[3], row[4],
                                    row[5])

            # Ignore transcripts whose gene was not selected above.
            if transcript.gene_id not in dGenes:
                continue

            dTranscripts[row[0]] = transcript
            gene = dGenes[transcript.gene_id]
            # The first child is assigned as a fresh list rather than
            # appended. NOTE(review): presumably this avoids mutating a
            # list shared between instances - keep unless Gene is checked.
            if len(gene.lTranscripts) > 0:
                gene.lTranscripts.append(transcript)
            else:
                gene.lTranscripts = [transcript]

        if dTranscripts:
            cursor = self.conn.execute(
                '''select cds_id, seqid, start,end,strand,transcript_id from cds where seqid = ? and start > ? and end < ? order by start''',
                bounds)
            for row in cursor:
                cds = CDS(row[0], row[1], row[2], row[3], row[4], row[5])

                # Ignore CDS whose transcript was not selected above.
                if cds.transcript_id not in dTranscripts:
                    continue

                transcript = dTranscripts[cds.transcript_id]
                # Same "assign the first child" pattern as for transcripts.
                if len(transcript.lCDS) > 0:
                    transcript.lCDS.append(cds)
                else:
                    transcript.lCDS = [cds]

        # Return a real list: a dict view never compares equal to a list.
        return list(dGenes.values())
    def _parse(self):
        """Parse the gff file"""

        dGenes = {}
        dTranscripts = {}
        dCDS = {}

        with open(self.inputGffFile, 'r') as input:
            for line in input:
                if not re.match('^#', line):
                    line = line.rstrip('\n')
                    values = line.split('\t')

                    if values[2] == 'gene':
                        id = self._getFeatureTagValue('ID',values[8])
                        currentGene = Gene(id, values[0], int(values[3]), int(values[4]), self._getStrand(values[6]))
                        dGenes[id] = currentGene
                        self.lGenes.append(currentGene)
                        
                    if values[2] == 'mRNA':
                        id = self._getFeatureTagValue('ID',values[8])
                        gene_id = self._getFeatureTagValue('Parent', values[8])
                        #gene_id = '{}_G'.format(id)
                        #currentGene = Gene(gene_id, values[0], int(values[3]), int(values[4]), self._getStrand(values[6]))
                        #dGenes[gene_id] = currentGene
                        #self.lGenes.append(currentGene)
                        currentTranscript = Transcript(id, values[0], int(values[3]), int(values[4]), self._getStrand(values[6]), gene_id)
                        dTranscripts[id] = currentTranscript

                        if len(dGenes[gene_id].lTranscripts) > 0:
                            dGenes[gene_id].lTranscripts.append(currentTranscript)
                        else:
                            dGenes[gene_id].lTranscripts = [currentTranscript]

                    if values[2] == 'CDS':
                        id = self._getFeatureTagValue('ID',values[8])
                        transcript_id = self._getFeatureTagValue('Parent', values[8])
                        #id = '{}_CDS'.format(transcript_id)
                        currentCDS = CDS(id, values[0], int(values[3]), int(values[4]), self._getStrand(values[6]), transcript_id)
                        if len(dTranscripts[transcript_id].lCDS) > 0:
                            dTranscripts[transcript_id].lCDS.append(currentCDS)
                        else:
                            dTranscripts[transcript_id].lCDS = [currentCDS]
Exemple #6
0
    def selectAllGenes(self):
        """Select all genes.

        Loads every row of the ``gene``, ``transcript`` and ``cds`` tables
        and rebuilds the gene -> transcript -> CDS hierarchy.

        Returns:
            A list of Gene objects, each carrying its transcripts, which in
            turn carry their CDS (CDS are read ordered by start).

        Raises:
            KeyError: if a transcript refers to an unknown gene id, or a CDS
                to an unknown transcript id.
        """

        dGenes = {}
        dTranscripts = {}

        cursor = self.conn.execute(
            '''select id, seqid, start, end, strand from gene''')
        for row in cursor:
            dGenes[row[0]] = Gene(row[0], row[1], row[2], row[3], row[4])

        cursor = self.conn.execute(
            '''select id, seqid, start,end,strand,gene_id from transcript''')
        for row in cursor:
            transcript = Transcript(row[0], row[1], row[2], row[3], row[4],
                                    row[5])
            dTranscripts[row[0]] = transcript

            gene = dGenes[transcript.gene_id]
            # The first child is assigned as a fresh list rather than
            # appended. NOTE(review): presumably this avoids mutating a
            # list shared between instances - keep unless Gene is checked.
            if len(gene.lTranscripts) > 0:
                gene.lTranscripts.append(transcript)
            else:
                gene.lTranscripts = [transcript]

        cursor = self.conn.execute(
            '''select cds_id, seqid, start,end,strand,transcript_id from cds order by start'''
        )
        for row in cursor:
            cds = CDS(row[0], row[1], row[2], row[3], row[4], row[5])

            parentTranscript = dTranscripts[cds.transcript_id]
            # Same "assign the first child" pattern as for transcripts.
            if len(parentTranscript.lCDS) > 0:
                parentTranscript.lCDS.append(cds)
            else:
                parentTranscript.lCDS = [cds]

        # Return a real list: a dict view never compares equal to a list.
        return list(dGenes.values())
Exemple #7
0
    def test_getAllGenes(self):
        """Check that the first gene parsed from the GFF3 fixture matches
        the hand-built G00001 gene (one transcript, three CDS)."""
        iGffGeneParser = GffGeneParser("test-data/gene.gff3")

        expectedCDS = [
            CDS('G00001.1_cds_1', 'Chr1', 23988, 24083, -1, 'G00001.1'),
            CDS('G00001.1_cds_1', 'Chr1', 24274, 24427, -1, 'G00001.1'),
            CDS('G00001.1_cds_1', 'Chr1', 24489, 24919, -1, 'G00001.1'),
        ]
        expectedTranscript = Transcript('G00001.1', 'Chr1', 23988, 24919, -1,
                                        'G00001', expectedCDS)
        expectedGene = Gene('G00001', 'Chr1', 23988, 24919, -1,
                            [expectedTranscript])

        firstParsedGene = iGffGeneParser.getAllGenes()[0]
        self.assertEqual(firstParsedGene, expectedGene)