Ejemplo n.º 1
0
    def _load_genes(self):
        """Rebuild the gene, transcript and exon collections from the GTF file.

        The 'genes', 'transcripts' and 'exons' collections are dropped and
        their indexes re-created, then every record yielded by
        get_data_from_gencode_gtf() for the gzipped file at
        self.settings_module.gencode_gtf_file is stored:

        * 'gene' records get 'symbol' (copied from 'gene_name'), an empty
          'tags' dict and a placeholder 'coding_size' of 0.
        * 'transcript' records get an empty 'tags' dict.
        * 'exon' records are stored once per exon_id; the transcripts that
          use an exon are accumulated in its 'transcripts' list.
        * 'cds' records only annotate the matching exon document with the
          cds_start / cds_stop / cds_xstart / cds_xstop fields.
        """
        # Start from empty collections; indexes are created after each drop
        # so that they exist on the fresh collection.
        self._db.drop_collection('genes')
        self._db.genes.ensure_index('gene_id')

        self._db.drop_collection('transcripts')
        self._db.transcripts.ensure_index('transcript_id')
        self._db.transcripts.ensure_index('gene_id')

        self._db.drop_collection('exons')
        self._db.exons.ensure_index('exon_id')
        self._db.exons.ensure_index('gene_id')

        gtf_path = self.settings_module.gencode_gtf_file
        # The progress total is the on-disk (compressed) size of the file.
        size = os.path.getsize(gtf_path)
        progress = get_progressbar(size, 'Loading gene definitions from GTF')

        # The context manager closes the gzip handle even when parsing or a
        # database call raises part-way through the load.
        with gzip.open(gtf_path) as gencode_file:
            for datatype, obj in get_data_from_gencode_gtf(gencode_file):
                # Report the position in the underlying file object, which is
                # what the on-disk size above is measured against.
                progress.update(gencode_file.fileobj.tell())

                if datatype == 'gene':
                    obj['symbol'] = obj['gene_name']
                    obj['tags'] = {}

                    # TODO: compute the real value, e.g. with
                    # loading_utils.get_coding_size_from_gene_structure(obj)
                    obj['coding_size'] = 0

                    self._db.genes.insert(obj)

                elif datatype == 'transcript':
                    obj['tags'] = {}
                    self._db.transcripts.insert(obj)

                elif datatype == 'exon':
                    exon_id = obj['exon_id']
                    # The exon document does not keep a single transcript_id;
                    # it keeps the list of all transcripts that reference it.
                    transcript_id = obj.pop('transcript_id')
                    if self._db.exons.find_one({'exon_id': exon_id}):
                        self._db.exons.update(
                            {'exon_id': exon_id},
                            {'$push': {'transcripts': transcript_id}},
                        )
                    else:
                        obj['transcripts'] = [transcript_id]
                        obj['tags'] = {}
                        self._db.exons.insert(obj)

                elif datatype == 'cds':
                    # This works because cds always comes after its exon.
                    # HACK: relies on record order - all the gtf parsing
                    # should be improved.
                    self._db.exons.update({'exon_id': obj['exon_id']}, {'$set': {
                        'cds_start': obj['start'],
                        'cds_stop': obj['stop'],
                        'cds_xstart': obj['xstart'],
                        'cds_xstop': obj['xstop'],
                    }})
Ejemplo n.º 2
0
    def _load_genes(self):
        """Rebuild the gene, transcript and exon collections from the GTF file.

        The 'genes', 'transcripts' and 'exons' collections are dropped and
        their indexes re-created, then every record yielded by
        get_data_from_gencode_gtf() for the gzipped file at
        self.settings_module.gencode_gtf_file is stored:

        * 'gene' records get 'symbol' (copied from 'gene_name'), an empty
          'tags' dict and a placeholder 'coding_size' of 0.
        * 'transcript' records get an empty 'tags' dict.
        * 'exon' records are stored once per exon_id; the transcripts that
          use an exon are accumulated in its 'transcripts' list.
        * 'cds' records only annotate the matching exon document with the
          cds_start / cds_stop / cds_xstart / cds_xstop fields.
        """
        # Start from empty collections; indexes are created after each drop
        # so that they exist on the fresh collection.
        self._db.drop_collection('genes')
        self._db.genes.ensure_index('gene_id')

        self._db.drop_collection('transcripts')
        self._db.transcripts.ensure_index('transcript_id')
        self._db.transcripts.ensure_index('gene_id')

        self._db.drop_collection('exons')
        self._db.exons.ensure_index('exon_id')
        self._db.exons.ensure_index('gene_id')

        gtf_path = self.settings_module.gencode_gtf_file
        # The progress total is the on-disk (compressed) size of the file.
        size = os.path.getsize(gtf_path)
        progress = get_progressbar(size, 'Loading gene definitions from GTF')

        # The context manager closes the gzip handle even when parsing or a
        # database call raises part-way through the load.
        with gzip.open(gtf_path) as gencode_file:
            for datatype, obj in get_data_from_gencode_gtf(gencode_file):
                # Report the position in the underlying file object, which is
                # what the on-disk size above is measured against.
                progress.update(gencode_file.fileobj.tell())

                if datatype == 'gene':
                    obj['symbol'] = obj['gene_name']
                    obj['tags'] = {}

                    # TODO: compute the real value, e.g. with
                    # loading_utils.get_coding_size_from_gene_structure(obj)
                    obj['coding_size'] = 0

                    self._db.genes.insert(obj)

                elif datatype == 'transcript':
                    obj['tags'] = {}
                    self._db.transcripts.insert(obj)

                elif datatype == 'exon':
                    exon_id = obj['exon_id']
                    # The exon document does not keep a single transcript_id;
                    # it keeps the list of all transcripts that reference it.
                    transcript_id = obj.pop('transcript_id')
                    if self._db.exons.find_one({'exon_id': exon_id}):
                        self._db.exons.update(
                            {'exon_id': exon_id},
                            {'$push': {'transcripts': transcript_id}},
                        )
                    else:
                        obj['transcripts'] = [transcript_id]
                        obj['tags'] = {}
                        self._db.exons.insert(obj)

                elif datatype == 'cds':
                    # This works because cds always comes after its exon.
                    # HACK: relies on record order - all the gtf parsing
                    # should be improved.
                    self._db.exons.update({'exon_id': obj['exon_id']}, {'$set': {
                        'cds_start': obj['start'],
                        'cds_stop': obj['stop'],
                        'cds_xstart': obj['xstart'],
                        'cds_xstop': obj['xstop'],
                    }})