Example #1
    def astuple(self, encoding=None):
        """
        Return a tuple suitable for import into a database, with the
        `attributes` and `extra` fields JSON-serialized into strings.

        If `encoding` is not None, convert string fields to unicode using
        the provided encoding.
        """
        if not encoding:
            return (
                self.id, self.seqid, self.source, self.featuretype, self.start,
                self.end, self.score, self.strand, self.frame,
                helpers._jsonify(self.attributes),
                helpers._jsonify(self.extra), self.bin
            )
        return (
            self.id.decode(encoding), self.seqid.decode(encoding),
            self.source.decode(encoding), self.featuretype.decode(encoding),
            self.start, self.end, self.score.decode(encoding),
            self.strand.decode(encoding), self.frame.decode(encoding),
            helpers._jsonify(self.attributes).decode(encoding),
            helpers._jsonify(self.extra).decode(encoding), self.bin
        )
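
A minimal usage sketch for astuple(): bulk-loading Feature objects into SQLite. The bulk_insert helper and the 12-column features table it assumes are illustrative, not part of the example above; they simply mirror the 12 values astuple() returns.

    import sqlite3

    def bulk_insert(conn, features, encoding=None):
        # astuple() yields 12 values per feature, in table-column order:
        # id, seqid, source, featuretype, start, end, score, strand,
        # frame, attributes (JSON), extra (JSON), bin
        c = conn.cursor()
        c.executemany(
            'INSERT INTO features VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
            (f.astuple(encoding) for f in features))
        conn.commit()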
Example #2
    def _finalize(self):
        """
        Various last-minute tasks to perform after the file has been parsed
        and imported.

        In general, anything that needs to be added to the meta table should
        be done here.
        """
        c = self.conn.cursor()
        c.executemany('''
                      INSERT INTO directives VALUES (?)
                      ''', ((i,) for i in self.iterator.directives))
        c.execute(
            '''
            INSERT INTO meta (version, dialect)
            VALUES (:version, :dialect)''',
            dict(version=version.version,
                 dialect=helpers._jsonify(self.iterator.dialect))
        )

        c.executemany(
            '''
            INSERT OR REPLACE INTO autoincrements VALUES (?, ?)
            ''', self._autoincrements.items())

        # These indexes are *well* worth the effort and extra storage: over
        # 500x speedup on code like this:
        #
        #   genes = []
        #   for i in db.features_of_type('snoRNA'):
        #       for k in db.parents(i, level=1, featuretype='gene'):
        #           genes.append(k.id)
        #
        logger.info("Creating relations(parent) index")
        c.execute('DROP INDEX IF EXISTS relationsparent')
        c.execute('CREATE INDEX relationsparent ON relations (parent)')
        logger.info("Creating relations(child) index")
        c.execute('DROP INDEX IF EXISTS relationschild')
        c.execute('CREATE INDEX relationschild ON relations (child)')
        logger.info("Creating features(featuretype) index")
        c.execute('DROP INDEX IF EXISTS featuretype')
        c.execute('CREATE INDEX featuretype ON features (featuretype)')

        self.conn.commit()

        self.warnings = self.iterator.warnings
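
The indexes built in _finalize() serve lookups like the db.parents() call in the benchmark comment, which filter relations by child and level. Below is a simplified sketch of that lookup; the level1_parents helper is hypothetical, though the table and column names match the schema used above:

    def level1_parents(conn, child_id):
        # Without the relationschild index this is a full scan of the
        # relations table; with it, SQLite seeks straight to the rows.
        c = conn.cursor()
        c.execute(
            '''
            SELECT parent FROM relations
            WHERE child = ? AND level = 1
            ''', (child_id,))
        return [row[0] for row in c]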
Example #3
    def _update_relations(self):

        if not self.infer_gene_extent:
            return

        # TODO: do any indexes speed this up?
        c = self.conn.cursor()
        c2 = self.conn.cursor()

        logger.info("Creating relations(parent) index")
        c.execute('DROP INDEX IF EXISTS relationsparent')
        c.execute('CREATE INDEX relationsparent ON relations (parent)')
        logger.info("Creating relations(child) index")
        c.execute('DROP INDEX IF EXISTS relationschild')
        c.execute('CREATE INDEX relationschild ON relations (child)')

        logger.info('Inferring gene and transcript extents, '
                    'and writing to tempfile')
        # Use a uniquely named tempfile so that concurrent runs cannot
        # clobber each other's results.
        tmp = tempfile.NamedTemporaryFile(delete=False).name
        fout = open(tmp, 'w')

        self._tmpfile = tmp

        # This takes some explanation...
        #
        # First, the nested subquery gets the level-1 parents of
        # self.subfeature featuretypes.  For an on-spec GTF file,
        # self.subfeature = "exon". So this subquery translates to getting the
        # distinct level-1 parents of exons -- which are transcripts.
        #
        # OK, so this first subquery is now a list of transcripts; call it
        # "firstlevel".
        #
        # Then join firstlevel on relations, but the trick is to now consider
        # each transcript a *child* -- so that relations.parent (on the first
        # line of the query) will be the first-level parent of the transcript
        # (the gene).
        #
        # The result is something like:
        #
        #   transcript1     gene1
        #   transcript2     gene1
        #   transcript3     gene2
        #
        # Note that genes are repeated; below we need to ensure that only one
        # is added.  To ensure this, the results are ordered by the gene ID.

        c.execute(
            '''
            SELECT DISTINCT firstlevel.parent, relations.parent
            FROM (
                SELECT DISTINCT parent
                FROM relations
                JOIN features ON features.id = relations.child
                WHERE features.featuretype = ?
                AND relations.level = 1
            )
            AS firstlevel
            JOIN relations ON firstlevel.parent = child
            WHERE relations.level = 1
            ORDER BY relations.parent
            ''', (self.subfeature,))

        # Now we iterate through those results (using a new cursor) to infer
        # the extent of transcripts and genes.

        last_gene_id = None
        n_features = 0
        for transcript_id, gene_id in c:
            # transcript extent
            c2.execute(
                '''
                SELECT MIN(start), MAX(end), strand, seqid
                FROM features
                JOIN relations ON
                features.id = relations.child
                WHERE parent = ? AND featuretype = ?
                ''', (transcript_id, self.subfeature))
            transcript_start, transcript_end, strand, seqid = c2.fetchone()
            transcript_attributes = {
                self.transcript_key: [transcript_id],
                self.gene_key: [gene_id]
            }
            transcript_bin = bins.bins(
                transcript_start, transcript_end, one=True)

            # Write out to file; we'll be reading it back in shortly.  Omit
            # score, frame, source, and extra since they will always have the
            # same default values (".", ".", "gffutils_derived", and []
            # respectively)

            fout.write('\t'.join(map(str, [
                transcript_id,
                seqid,
                transcript_start,
                transcript_end,
                strand,
                'transcript',
                transcript_bin,
                helpers._jsonify(transcript_attributes)
            ])) + '\n')

            n_features += 1

            # Infer gene extent, but only if we haven't done so already.
            if gene_id != last_gene_id:
                c2.execute(
                    '''
                    SELECT MIN(start), MAX(end), strand, seqid
                    FROM features
                    JOIN relations ON
                    features.id = relations.child
                    WHERE parent = ? AND featuretype = ?
                    ''', (gene_id, self.subfeature))
                gene_start, gene_end, strand, seqid = c2.fetchone()
                gene_attributes = {self.gene_key: [gene_id]}
                gene_bin = bins.bins(gene_start, gene_end, one=True)

                fout.write('\t'.join(map(str, [
                    gene_id,
                    seqid,
                    gene_start,
                    gene_end,
                    strand,
                    'gene',
                    gene_bin,
                    helpers._jsonify(gene_attributes)
                ])) + '\n')

            last_gene_id = gene_id
            n_features += 1

        fout.close()
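
        # For reference, each row just written is one tab-delimited line of
        # the form (values illustrative):
        #
        #   tx1  chr1  100  900  +  transcript  585  {"transcript_id": ["tx1"], "gene_id": ["g1"]}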

        def derived_feature_generator():
            """
            Generate Feature objects from the tempfile written above.
            """
            keys = ['parent', 'seqid', 'start', 'end', 'strand',
                    'featuretype', 'bin', 'attributes']
            with open(fout.name) as fh:
                for line in fh:
                    d = dict(zip(keys, line.strip().split('\t')))
                    d.pop('parent')
                    d['score'] = '.'
                    d['source'] = 'gffutils_derived'
                    d['frame'] = '.'
                    d['extra'] = []
                    d['attributes'] = helpers._unjsonify(d['attributes'])
                    f = feature.Feature(**d)
                    f.id = self._id_handler(f)
                    yield f

        # Drop the indexes so the inserts are faster
        c.execute('DROP INDEX IF EXISTS relationsparent')
        c.execute('DROP INDEX IF EXISTS relationschild')

        # Insert the just-inferred transcripts and genes.  TODO: should we
        # *always* use "merge" here for the merge_strategy?
        logger.info("Importing inferred features into db")
        last_perc = None
        for i, f in enumerate(derived_feature_generator()):
            perc = int(i / float(n_features) * 100)
            if perc != last_perc:
                sys.stderr.write('%s of %s (%s%%)\r' % (i, n_features, perc))
                sys.stderr.flush()
            last_perc = perc
            try:
                self._insert(f, c)
            except sqlite3.IntegrityError:
                fixed, final_strategy = self._do_merge(f, 'merge')
                c.execute(
                    '''
                    UPDATE features SET attributes = ?
                    WHERE id = ?
                    ''', (helpers._jsonify(fixed.attributes),
                          fixed.id))

        logger.info("Committing changes")
        self.conn.commit()
        os.unlink(fout.name)
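
The extent inference above boils down to a MIN(start)/MAX(end) aggregation over a parent's children. A standalone sketch of the same idea, using illustrative exon coordinates rather than a real database:

    # Illustrative only: a transcript's extent is the minimum start and
    # maximum end over its exons, exactly what the SELECT MIN/MAX
    # queries above compute.
    exons = [(100, 250), (400, 520), (700, 900)]
    transcript_start = min(start for start, end in exons)  # 100
    transcript_end = max(end for start, end in exons)      # 900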
Example #4
    def _populate_from_lines(self, lines):
        msg = (
            "Populating features table and first-order relations: %d "
            "features\r"
        )

        c = self.conn.cursor()

        for i, f in enumerate(lines):

            # Percent complete
            if self.verbose:

                if i % 1000 == 0:
                    sys.stderr.write(msg % i)
                    sys.stderr.flush()

            f.id = self._id_handler(f)

            # Insert the feature itself...
            try:
                self._insert(f, c)
            except sqlite3.IntegrityError:
                fixed, final_strategy = self._do_merge(f, self.merge_strategy)
                if final_strategy in ['merge', 'replace']:
                    c.execute(
                        '''
                        UPDATE features SET attributes = ?
                        WHERE id = ?
                        ''', (helpers._jsonify(fixed.attributes),
                              fixed.id))
                    # For any additional fields we're merging, update those as
                    # well.
                    if self.force_merge_fields:
                        _set_clause = ', '.join(
                            ['%s = ?' % field
                             for field in self.force_merge_fields])
                        values = [
                            getattr(fixed, field)
                            for field in self.force_merge_fields] + [fixed.id]
                        c.execute(
                            '''
                            UPDATE features SET %s
                            WHERE id = ?
                            ''' % _set_clause, values)

                elif final_strategy == 'create_unique':
                    self._insert(f, c)

            # For an on-spec GTF file,
            # self.transcript_key = "transcript_id"
            # self.gene_key = "gene_id"
            relations = []
            parent = None
            grandparent = None
            if self.transcript_key in f.attributes:
                parent = f.attributes[self.transcript_key][0]
                relations.append((parent, f.id, 1))

            if self.gene_key in f.attributes:
                grandparent = f.attributes[self.gene_key]
                if len(grandparent) > 0:
                    grandparent = grandparent[0]
                    relations.append((grandparent, f.id, 2))
                    if parent is not None:
                        relations.append((grandparent, parent, 1))

            # Note the IGNORE, so relationships defined many times in the file
            # (e.g., the transcript-gene relation on pretty much every line in
            # a GTF) will only be included once.
            c.executemany(
                '''
                INSERT OR IGNORE INTO relations (parent, child, level)
                VALUES (?, ?, ?)
                ''', relations
            )

        logger.info('Committing changes')
        self.conn.commit()
        if self.verbose:
            sys.stderr.write((msg % i) + '\n')
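
To make the relations bookkeeping in the loop above concrete: a single on-spec GTF exon line carrying transcript_id "tx1" and gene_id "g1", with feature ID "exon_1" (all values illustrative), contributes three tuples:

    relations = [
        ('tx1', 'exon_1', 1),  # transcript -> exon, level 1
        ('g1', 'exon_1', 2),   # gene -> exon, level 2
        ('g1', 'tx1', 1),      # gene -> transcript, level 1
    ]

Because the same gene-transcript pair appears on nearly every line of a GTF file, the INSERT OR IGNORE collapses these duplicates to a single row.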
Example #5
    def _populate_from_lines(self, lines):
        c = self.conn.cursor()
        self._drop_indexes()
        logger.info("Populating features")
        msg = ("Populating features table and first-order relations: "
               "%d features\r")

        # c.executemany() was not as much of an improvement as I had expected.
        #
        # Compared to a benchmark of doing each insert separately:
        # executemany using a list of dicts to iterate over is ~15% slower
        # executemany using a list of tuples to iterate over is ~8% faster

        for i, f in enumerate(lines):

            # Percent complete
            if self.verbose:
                if i % 1000 == 0:
                    logger.info(msg % i)

            # TODO: handle ID creation here...should be combined with the
            # INSERT below (that is, don't IGNORE below but catch the error and
            # re-try with a new ID).  However, is this doable with an
            # execute-many?
            f.id = self._id_handler(f)
            try:
                self._insert(f, c)
            except sqlite3.IntegrityError:
                fixed, final_strategy = self._do_merge(f, self.merge_strategy)
                if final_strategy in ['merge', 'replace']:
                    c.execute(
                        '''
                        UPDATE features SET attributes = ?
                        WHERE id = ?
                        ''', (helpers._jsonify(fixed.attributes),
                              fixed.id))

                    # For any additional fields we're merging, update those as
                    # well.
                    if self.force_merge_fields:
                        _set_clause = ', '.join(
                            ['%s = ?' % field
                             for field in self.force_merge_fields])
                        values = [
                            getattr(fixed, field)
                            for field in self.force_merge_fields] + [fixed.id]
                        c.execute(
                            '''
                            UPDATE features SET %s
                            WHERE id = ?
                            ''' % _set_clause, tuple(values))

                elif final_strategy == 'create_unique':
                    self._insert(f, c)

            if 'Parent' in f.attributes:
                for parent in f.attributes['Parent']:
                    c.execute(
                        '''
                        INSERT OR IGNORE INTO relations VALUES
                        (?, ?, 1)
                        ''', (parent, f.id))

        self.conn.commit()
        if self.verbose:
            logger.info(msg % i)
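
A minimal sketch of the two insert styles compared in the benchmark comment above; the in-memory database and toy schema are assumptions for illustration:

    import sqlite3

    conn = sqlite3.connect(':memory:')
    conn.execute('CREATE TABLE t (a INTEGER, b TEXT)')
    rows = [(i, 'x') for i in range(10000)]

    # Baseline: one execute() per row.
    for row in rows:
        conn.execute('INSERT INTO t VALUES (?, ?)', row)

    # executemany() over a list of tuples (~8% faster per the note above).
    conn.executemany('INSERT INTO t VALUES (?, ?)', rows)
    conn.commit()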