コード例 #1
0
ファイル: feature.py プロジェクト: arnikz/gffutils
    def astuple(self, encoding=None):
        """
        Build the row tuple used to import this feature into a database.

        The attributes and extra fields are serialized to strings with
        ``helpers._jsonify``. Fields are ordered so the tuple can be supplied
        as the arguments for the query defined in
        :attr:`gffutils.constants._INSERT`.

        Parameters
        ----------
        encoding : str, optional
            When truthy, every string field (including the two JSON strings)
            is converted by calling ``.decode(encoding)`` on it; `start`,
            `end` and the bin are passed through untouched. When falsy (the
            default), all fields are returned as-is.

        Returns
        -------
        Tuple
        """
        if encoding:
            def _decoded(value):
                # Single place for the per-field conversion.
                return value.decode(encoding)

            return (
                _decoded(self.id),
                _decoded(self.seqid),
                _decoded(self.source),
                _decoded(self.featuretype),
                self.start,
                self.end,
                _decoded(self.score),
                _decoded(self.strand),
                _decoded(self.frame),
                _decoded(helpers._jsonify(self.attributes)),
                _decoded(helpers._jsonify(self.extra)),
                self.calc_bin(),
            )

        # No encoding requested: hand the raw field values back.
        return (
            self.id,
            self.seqid,
            self.source,
            self.featuretype,
            self.start,
            self.end,
            self.score,
            self.strand,
            self.frame,
            helpers._jsonify(self.attributes),
            helpers._jsonify(self.extra),
            self.calc_bin(),
        )
コード例 #2
0
ファイル: feature_test.py プロジェクト: DHatziioanou/gffutils
def test_unjsonify():
    """Round-trip a parsed attribute string through _jsonify/_unjsonify."""
    # _split_keyvals returns a pair; only the parsed attributes matter here.
    parsed, _dialect = parser._split_keyvals('transcript_id "mRNA1"')
    expected = {'transcript_id': ['mRNA1']}
    assert parsed == expected, parsed

    serialized = helpers._jsonify(parsed)
    assert serialized == '{"transcript_id":["mRNA1"]}', serialized

    # Deserializing must give back exactly what was serialized.
    restored = helpers._unjsonify(serialized, isattributes=True)
    assert restored == parsed
コード例 #3
0
ファイル: create.py プロジェクト: DHatziioanou/gffutils
    def _finalize(self):
        """
        Perform the last-minute bookkeeping once the file has been parsed and
        imported.

        Writes the iterator's directives, the version/dialect metadata and the
        autoincrement counters, (re)builds the lookup indexes, runs ANALYZE on
        the features table, commits, and copies the iterator's warnings onto
        ``self.warnings``.

        In general, if you'll be adding stuff to the meta table, do it here.
        """
        cursor = self.conn.cursor()

        directive_rows = ((d,) for d in self.iterator.directives)
        cursor.executemany('''
                      INSERT INTO directives VALUES (?)
                      ''', directive_rows)

        meta_values = {
            'version': version.version,
            'dialect': helpers._jsonify(self.iterator.dialect),
        }
        cursor.execute(
            '''
            INSERT INTO meta (version, dialect)
            VALUES (:version, :dialect)''',
            meta_values
        )

        autoincrement_rows = list(self._autoincrements.items())
        cursor.executemany(
            '''
            INSERT OR REPLACE INTO autoincrements VALUES (?, ?)
            ''', autoincrement_rows)

        # These indexes are *well* worth the effort and extra storage: over
        # 500x speedup on code like this:
        #
        #   genes = []
        #   for i in db.features_of_type('snoRNA'):
        #       for k in db.parents(i, level=1, featuretype='gene'):
        #           genes.append(k.id)
        #
        # Each entry: (log message, DROP statement, CREATE statement).
        index_steps = (
            ("Creating relations(parent) index",
             'DROP INDEX IF EXISTS relationsparent',
             'CREATE INDEX relationsparent ON relations (parent)'),
            ("Creating relations(child) index",
             'DROP INDEX IF EXISTS relationschild',
             'CREATE INDEX relationschild ON relations (child)'),
            ("Creating features(featuretype) index",
             'DROP INDEX IF EXISTS featuretype',
             'CREATE INDEX featuretype ON features (featuretype)'),
            ("Creating features (seqid, start, end) index",
             'DROP INDEX IF EXISTS seqidstartend',
             'CREATE INDEX seqidstartend ON features (seqid, start, end)'),
            ("Creating features (seqid, start, end, strand) index",
             'DROP INDEX IF EXISTS seqidstartendstrand',
             'CREATE INDEX seqidstartendstrand ON features (seqid, start, end, strand)'),
        )
        for message, drop_sql, create_sql in index_steps:
            logger.info(message)
            cursor.execute(drop_sql)
            cursor.execute(create_sql)

        # speeds computation 1000x in some cases
        logger.info("Running ANALYSE features")
        cursor.execute('ANALYZE features')

        self.conn.commit()

        self.warnings = self.iterator.warnings
コード例 #4
0
ファイル: feature.py プロジェクト: rbeagrie/gffutils
    def astuple(self, encoding=None):
        """
        Build the row tuple used to import this feature into a database.

        The attributes and extra fields are serialized to strings with
        ``helpers._jsonify``; the last element is ``self.bin``.

        If `encoding` is truthy, every string field (including the two JSON
        strings) is converted by calling ``.decode(encoding)`` on it, while
        `start`, `end` and the bin are passed through untouched. Otherwise
        all fields are returned as-is.
        """
        if encoding:
            def _decoded(value):
                # Single place for the per-field conversion.
                return value.decode(encoding)

            return (
                _decoded(self.id),
                _decoded(self.seqid),
                _decoded(self.source),
                _decoded(self.featuretype),
                self.start,
                self.end,
                _decoded(self.score),
                _decoded(self.strand),
                _decoded(self.frame),
                _decoded(helpers._jsonify(self.attributes)),
                _decoded(helpers._jsonify(self.extra)),
                self.bin,
            )

        # No encoding requested: hand the raw field values back.
        return (
            self.id,
            self.seqid,
            self.source,
            self.featuretype,
            self.start,
            self.end,
            self.score,
            self.strand,
            self.frame,
            helpers._jsonify(self.attributes),
            helpers._jsonify(self.extra),
            self.bin,
        )
コード例 #5
0
ファイル: create.py プロジェクト: linsson/gffutils
    def _update_relations(self):
        """
        Infer transcript and gene extents and import them as new features.

        Does nothing unless ``self.infer_gene_extent`` is truthy. Otherwise:

        1. (Re)creates the parent/child indexes on the relations table.
        2. For every (transcript, gene) pair found via features of type
           ``self.subfeature``, computes MIN(start)/MAX(end) over those
           subfeatures and writes one tab-delimited line per inferred
           transcript and per inferred gene to a temporary file.
        3. Drops the two indexes again, reads the file back, and inserts each
           derived feature; when the insert raises
           ``sqlite3.IntegrityError`` the attributes are merged into the
           existing row instead.
        4. Commits and deletes the temporary file.

        The temporary file's path is also stored in ``self._tmpfile``.
        """
        if not self.infer_gene_extent:
            return

        # TODO: do any indexes speed this up?
        c = self.conn.cursor()
        c2 = self.conn.cursor()

        logger.info("Creating relations(parent) index")
        c.execute('DROP INDEX IF EXISTS relationsparent')
        c.execute('CREATE INDEX relationsparent ON relations (parent)')
        logger.info("Creating relations(child) index")
        c.execute('DROP INDEX IF EXISTS relationschild')
        c.execute('CREATE INDEX relationschild ON relations (child)')

        logger.info('Inferring gene and transcript extents, '
                    'and writing to tempfile')

        # Every run gets its own uniquely named temp file. A fixed shared path
        # would be clobbered by concurrent runs and would leave the file
        # created here behind.
        tmp = tempfile.NamedTemporaryFile(delete=False).name
        self._tmpfile = tmp

        def subfeature_extent(parent_id):
            # MIN(start), MAX(end), strand, seqid over the subfeatures that
            # are children of `parent_id`.
            c2.execute(
                '''
                SELECT MIN(start), MAX(end), strand, seqid
                FROM features
                JOIN relations ON
                features.id = relations.child
                WHERE parent = ? AND featuretype == ?
                ''', (parent_id, self.subfeature))
            return c2.fetchone()

        def write_derived(fout, feature_id, featuretype, attributes):
            # Write out to file; we'll be reading it back in shortly.  Omit
            # score, frame, source, and extra since they will always have the
            # same default values (".", ".", "gffutils_derived", and []
            # respectively)
            start, end, strand, seqid = subfeature_extent(feature_id)
            fout.write('\t'.join(map(str, [
                feature_id,
                seqid,
                start,
                end,
                strand,
                featuretype,
                bins.bins(start, end, one=True),
                helpers._jsonify(attributes)
            ])) + '\n')

        # This takes some explanation...
        #
        # First, the nested subquery gets the level-1 parents of
        # self.subfeature featuretypes.  For an on-spec GTF file,
        # self.subfeature = "exon". So this subquery translates to getting the
        # distinct level-1 parents of exons -- which are transcripts.
        #
        # OK, so this first subquery is now a list of transcripts; call it
        # "firstlevel".
        #
        # Then join firstlevel on relations, but the trick is to now consider
        # each transcript a *child* -- so that relations.parent (on the first
        # line of the query) will be the first-level parent of the transcript
        # (the gene).
        #
        # The result is something like:
        #
        #   transcript1     gene1
        #   transcript2     gene1
        #   transcript3     gene2
        #
        # Note that genes are repeated; below we need to ensure that only one
        # is added.  To ensure this, the results are ordered by the gene ID.

        c.execute(
            '''
            SELECT DISTINCT firstlevel.parent, relations.parent
            FROM (
                SELECT DISTINCT parent
                FROM relations
                JOIN features ON features.id = relations.child
                WHERE features.featuretype = ?
                AND relations.level = 1
            )
            AS firstlevel
            JOIN relations ON firstlevel.parent = child
            WHERE relations.level = 1
            ORDER BY relations.parent
            ''', (self.subfeature,))

        # Now we iterate through those results (using a second cursor for the
        # extent queries) to infer the extent of transcripts and genes.  The
        # `with` block guarantees the file is flushed and closed even if a
        # query fails part-way through.
        last_gene_id = None
        n_features = 0
        with open(tmp, 'w') as fout:
            for transcript_id, gene_id in c:
                write_derived(fout, transcript_id, 'transcript', {
                    self.transcript_key: [transcript_id],
                    self.gene_key: [gene_id]
                })
                n_features += 1

                # Infer gene extent, but only if we haven't done so already.
                # Count it only when a line is actually written so the
                # progress percentage below reflects the real total.
                if gene_id != last_gene_id:
                    write_derived(fout, gene_id, 'gene',
                                  {self.gene_key: [gene_id]})
                    n_features += 1
                    last_gene_id = gene_id

        def derived_feature_generator():
            """
            Generator of items from the file that was just created...
            """
            keys = ['parent', 'seqid', 'start', 'end', 'strand',
                    'featuretype', 'bin', 'attributes']
            with open(tmp) as fin:
                for line in fin:
                    d = dict(zip(keys, line.strip().split('\t')))
                    d.pop('parent')
                    d['score'] = '.'
                    d['source'] = 'gffutils_derived'
                    d['frame'] = '.'
                    d['extra'] = []
                    d['attributes'] = helpers._unjsonify(d['attributes'])
                    f = feature.Feature(**d)
                    f.id = self._id_handler(f)
                    yield f

        # Drop the indexes so the inserts are faster
        c.execute('DROP INDEX IF EXISTS relationsparent')
        c.execute('DROP INDEX IF EXISTS relationschild')

        # Insert the just-inferred transcripts and genes.  TODO: should we
        # *always* use "merge" here for the merge_strategy?
        logger.info("Importing inferred features into db")
        last_perc = None
        for i, f in enumerate(derived_feature_generator()):
            perc = int(i / float(n_features) * 100)
            if perc != last_perc:
                sys.stderr.write('%s of %s (%s%%)\r' % (i, n_features, perc))
                sys.stderr.flush()
            last_perc = perc
            try:
                self._insert(f, c)
            except sqlite3.IntegrityError:
                # ID already present: merge attributes into the existing row.
                fixed, final_strategy = self._do_merge(f, 'merge')
                c.execute(
                    '''
                    UPDATE features SET attributes = ?
                    WHERE id = ?
                    ''', (helpers._jsonify(fixed.attributes),
                          fixed.id))

        logger.info("Committing changes")
        self.conn.commit()
        os.unlink(tmp)
コード例 #6
0
ファイル: create.py プロジェクト: linsson/gffutils
    def _populate_from_lines(self, lines):
        """
        Insert features and their first-order relations into the database.

        For each feature yielded by `lines`, an ID is assigned with
        ``self._id_handler`` and the feature is inserted.  If the insert
        raises ``sqlite3.IntegrityError``, ``self._do_merge`` decides what
        happens next: for "merge"/"replace" the stored attributes (and any
        ``self.force_merge_fields``) are updated; for "create_unique" the
        feature is inserted again.  Relations are derived from the
        ``self.transcript_key`` and ``self.gene_key`` attributes.

        Parameters
        ----------
        lines : iterable
            Feature objects to import.

        Raises
        ------
        ValueError
            If `lines` yields nothing.
        """
        msg = (
            "Populating features table and first-order relations: %d "
            "features\r"
        )

        c = self.conn.cursor()

        # Number of features consumed so far (a count, not the last index).
        lines_seen = 0
        for i, f in enumerate(lines):
            lines_seen = i + 1

            # Progress report every 1000 features.
            if self.verbose and i % 1000 == 0:
                sys.stderr.write(msg % i)
                sys.stderr.flush()

            f.id = self._id_handler(f)

            # Insert the feature itself...
            try:
                self._insert(f, c)
            except sqlite3.IntegrityError:
                fixed, final_strategy = self._do_merge(f, self.merge_strategy)
                if final_strategy in ['merge', 'replace']:
                    c.execute(
                        '''
                        UPDATE features SET attributes = ?
                        WHERE id = ?
                        ''', (helpers._jsonify(fixed.attributes),
                              fixed.id))
                    # For any additional fields we're merging, update those as
                    # well.
                    if self.force_merge_fields:
                        _set_clause = ', '.join(
                            ['%s = ?' % field
                             for field in self.force_merge_fields])
                        values = [getattr(fixed, field)
                                  for field in self.force_merge_fields]\
                            + [fixed.id]
                        c.execute(
                            '''
                            UPDATE features SET %s
                            WHERE id = ?
                            ''' % _set_clause, values)

                elif final_strategy == 'create_unique':
                    self._insert(f, c)

            # For an on-spec GTF file,
            # self.transcript_key = "transcript_id"
            # self.gene_key = "gene_id"
            relations = []
            parent = None
            if self.transcript_key in f.attributes:
                transcripts = f.attributes[self.transcript_key]
                # Guard against an empty value list, same as for genes below.
                if len(transcripts) > 0:
                    parent = transcripts[0]
                    relations.append((parent, f.id, 1))

            if self.gene_key in f.attributes:
                genes = f.attributes[self.gene_key]
                if len(genes) > 0:
                    grandparent = genes[0]
                    relations.append((grandparent, f.id, 2))
                    if parent is not None:
                        relations.append((grandparent, parent, 1))

            # Note the IGNORE, so relationships defined many times in the file
            # (e.g., the transcript-gene relation on pretty much every line in
            # a GTF) will only be included once.
            c.executemany(
                '''
                INSERT OR IGNORE INTO relations (parent, child, level)
                VALUES (?, ?, ?)
                ''', relations
            )

        if not lines_seen:
            raise ValueError("No lines parsed -- was an empty file provided?")
        logger.info('Committing changes')
        self.conn.commit()
        if self.verbose:
            # Report the total count rather than the last zero-based index.
            logger.info(msg % lines_seen)
コード例 #7
0
ファイル: create.py プロジェクト: linsson/gffutils
    def _populate_from_lines(self, lines):
        """
        Insert features and their "Parent" relations into the database.

        Indexes are dropped first via ``self._drop_indexes()``.  For each
        feature yielded by `lines`, an ID is assigned with
        ``self._id_handler`` and the feature is inserted.  If the insert
        raises ``sqlite3.IntegrityError``, ``self._do_merge`` decides what
        happens next: for "merge"/"replace" the stored attributes (and any
        ``self.force_merge_fields``) are updated; for "create_unique" the
        feature is inserted again.  Every value of the feature's ``Parent``
        attribute becomes a level-1 relation.

        Parameters
        ----------
        lines : iterable
            Feature objects to import.

        Raises
        ------
        ValueError
            If `lines` yields nothing.
        """
        c = self.conn.cursor()
        self._drop_indexes()
        logger.info("Populating features")
        msg = ("Populating features table and first-order relations: "
               "%d features\r")

        # c.executemany() was not as much of an improvement as I had expected.
        #
        # Compared to a benchmark of doing each insert separately:
        # executemany using a list of dicts to iterate over is ~15% slower
        # executemany using a list of tuples to iterate over is ~8% faster

        # Number of features consumed so far (a count, not the last index).
        features_seen = 0
        for i, f in enumerate(lines):
            features_seen = i + 1

            # Progress report every 1000 features.
            if self.verbose and i % 1000 == 0:
                sys.stderr.write(msg % i)
                sys.stderr.flush()

            # TODO: handle ID creation here...should be combined with the
            # INSERT below (that is, don't IGNORE below but catch the error and
            # re-try with a new ID).  However, is this doable with an
            # execute-many?
            f.id = self._id_handler(f)
            try:
                self._insert(f, c)
            except sqlite3.IntegrityError:
                fixed, final_strategy = self._do_merge(f, self.merge_strategy)
                if final_strategy in ['merge', 'replace']:
                    c.execute(
                        '''
                        UPDATE features SET attributes = ?
                        WHERE id = ?
                        ''', (helpers._jsonify(fixed.attributes),
                              fixed.id))

                    # For any additional fields we're merging, update those as
                    # well.
                    if self.force_merge_fields:
                        _set_clause = ', '.join(
                            ['%s = ?' % field
                             for field in self.force_merge_fields])
                        values = [
                            getattr(fixed, field)
                            for field in self.force_merge_fields] + [fixed.id]
                        c.execute(
                            '''
                            UPDATE features SET %s
                            WHERE id = ?
                            ''' % _set_clause, tuple(values))

                elif final_strategy == 'create_unique':
                    self._insert(f, c)

            if 'Parent' in f.attributes:
                for parent in f.attributes['Parent']:
                    c.execute(
                        '''
                        INSERT OR IGNORE INTO relations VALUES
                        (?, ?, 1)
                        ''', (parent, f.id))

        if not features_seen:
            raise ValueError("No lines parsed -- was an empty file provided?")

        self.conn.commit()
        if self.verbose:
            # Report the total count rather than the last zero-based index.
            logger.info(msg % features_seen)
コード例 #8
0
ファイル: create.py プロジェクト: DHatziioanou/gffutils
    def _populate_from_lines(self, lines):
        """
        Insert features and their first-order relations into the database.

        For each feature yielded by `lines`, an ID is assigned with
        ``self._id_handler`` and the feature is inserted.  If the insert
        raises ``sqlite3.IntegrityError``, ``self._do_merge`` decides what
        happens next: "merge" updates the stored attributes (and any
        ``self.force_merge_fields``), "replace" calls ``self._replace`` and
        "create_unique" inserts the feature again.  Relations are derived
        from the ``self.transcript_key`` and ``self.gene_key`` attributes.

        While fewer than 1000 features have been seen, a warning is issued
        for any "transcript" or "gene" feature whose inference has not been
        disabled.

        Parameters
        ----------
        lines : iterable
            Feature objects to import.

        Raises
        ------
        ValueError
            If `lines` yields nothing.
        """
        msg = (
            "Populating features table and first-order relations: %d "
            "features\r"
        )

        c = self.conn.cursor()

        # Only check this many features to see if it's a gene or transcript and
        # issue the appropriate warning.
        gene_and_transcript_check_limit = 1000

        # Number of features consumed so far (a count, not the last index).
        lines_seen = 0
        for i, f in enumerate(lines):

            # See issues #48 and #20.
            if lines_seen < gene_and_transcript_check_limit:
                if (
                    f.featuretype == 'transcript' and
                    not self.disable_infer_transcripts
                ):
                    warnings.warn(
                        "It appears you have a transcript feature in your GTF "
                        "file. You may want to use the "
                        "`disable_infer_transcripts` "
                        "option to speed up database creation")
                elif (
                    f.featuretype == 'gene' and
                    not self.disable_infer_genes
                ):
                    warnings.warn(
                        "It appears you have a gene feature in your GTF "
                        "file. You may want to use the "
                        "`disable_infer_genes` "
                        "option to speed up database creation")

            lines_seen = i + 1

            # Progress report every 1000 features.
            if self.verbose and i % 1000 == 0:
                sys.stderr.write(msg % i)
                sys.stderr.flush()

            f.id = self._id_handler(f)

            # Insert the feature itself...
            try:
                self._insert(f, c)
            except sqlite3.IntegrityError:
                fixed, final_strategy = self._do_merge(f, self.merge_strategy)
                if final_strategy == 'merge':
                    c.execute(
                        '''
                        UPDATE features SET attributes = ?
                        WHERE id = ?
                        ''', (helpers._jsonify(fixed.attributes),
                              fixed.id))
                    # For any additional fields we're merging, update those as
                    # well.
                    if self.force_merge_fields:
                        _set_clause = ', '.join(
                            ['%s = ?' % field
                             for field in self.force_merge_fields])
                        values = [getattr(fixed, field)
                                  for field in self.force_merge_fields]\
                            + [fixed.id]
                        c.execute(
                            '''
                            UPDATE features SET %s
                            WHERE id = ?
                            ''' % _set_clause, values)
                elif final_strategy == 'replace':
                    self._replace(f, c)
                elif final_strategy == 'create_unique':
                    self._insert(f, c)

            # For an on-spec GTF file,
            # self.transcript_key = "transcript_id"
            # self.gene_key = "gene_id"
            relations = []
            parent = None
            if self.transcript_key in f.attributes:
                transcripts = f.attributes[self.transcript_key]
                # Guard against an empty value list, same as for genes below.
                if len(transcripts) > 0:
                    parent = transcripts[0]
                    relations.append((parent, f.id, 1))

            if self.gene_key in f.attributes:
                genes = f.attributes[self.gene_key]
                if len(genes) > 0:
                    grandparent = genes[0]
                    relations.append((grandparent, f.id, 2))
                    if parent is not None:
                        relations.append((grandparent, parent, 1))

            # Note the IGNORE, so relationships defined many times in the file
            # (e.g., the transcript-gene relation on pretty much every line in
            # a GTF) will only be included once.
            c.executemany(
                '''
                INSERT OR IGNORE INTO relations (parent, child, level)
                VALUES (?, ?, ?)
                ''', relations
            )

        if lines_seen == 0:
            raise ValueError("No lines parsed -- was an empty file provided?")
        logger.info('Committing changes')
        self.conn.commit()
        if self.verbose:
            # Report the total count rather than the last zero-based index.
            logger.info(msg % lines_seen)
コード例 #9
0
ファイル: create.py プロジェクト: rbeagrie/gffutils
    def _populate_from_lines(self, lines):
        """
        Insert features and their first-order relations into the database.

        For each feature yielded by `lines`, an ID is assigned with
        ``self._id_handler`` and the feature is inserted.  If the insert
        raises ``sqlite3.IntegrityError``, ``self._do_merge`` decides what
        happens next: for "merge"/"replace" the stored attributes (and any
        ``self.force_merge_fields``) are updated; for "create_unique" the
        feature is inserted again.  Relations are derived from the
        ``self.transcript_key`` and ``self.gene_key`` attributes.

        Parameters
        ----------
        lines : iterable
            Feature objects to import.  An empty iterable is accepted; the
            final verbose message then reports 0 features.
        """
        msg = "Populating features table and first-order relations: %d " "features\r"

        c = self.conn.cursor()

        # Number of features consumed so far.  Tracked separately from the
        # loop index so the summary below works when `lines` is empty.
        n_seen = 0
        for i, f in enumerate(lines):
            n_seen = i + 1

            # Progress report every 1000 features.
            if self.verbose and i % 1000 == 0:
                sys.stderr.write(msg % i)
                sys.stderr.flush()

            f.id = self._id_handler(f)

            # Insert the feature itself...
            try:
                self._insert(f, c)
            except sqlite3.IntegrityError:
                fixed, final_strategy = self._do_merge(f, self.merge_strategy)
                if final_strategy in ["merge", "replace"]:
                    c.execute(
                        """
                        UPDATE features SET attributes = ?
                        WHERE id = ?
                        """,
                        (helpers._jsonify(fixed.attributes), fixed.id),
                    )
                    # For any additional fields we're merging, update those as
                    # well.
                    if self.force_merge_fields:
                        _set_clause = ", ".join(["%s = ?" % field for field in self.force_merge_fields])
                        values = [getattr(fixed, field) for field in self.force_merge_fields] + [fixed.id]
                        c.execute(
                            """
                            UPDATE features SET %s
                            WHERE id = ?
                            """
                            % _set_clause,
                            values,
                        )

                elif final_strategy == "create_unique":
                    self._insert(f, c)

            # For an on-spec GTF file,
            # self.transcript_key = "transcript_id"
            # self.gene_key = "gene_id"
            relations = []
            parent = None
            if self.transcript_key in f.attributes:
                transcripts = f.attributes[self.transcript_key]
                # Guard against an empty value list, same as for genes below.
                if len(transcripts) > 0:
                    parent = transcripts[0]
                    relations.append((parent, f.id, 1))

            if self.gene_key in f.attributes:
                genes = f.attributes[self.gene_key]
                if len(genes) > 0:
                    grandparent = genes[0]
                    relations.append((grandparent, f.id, 2))
                    if parent is not None:
                        relations.append((grandparent, parent, 1))

            # Note the IGNORE, so relationships defined many times in the file
            # (e.g., the transcript-gene relation on pretty much every line in
            # a GTF) will only be included once.
            c.executemany(
                """
                INSERT OR IGNORE INTO relations (parent, child, level)
                VALUES (?, ?, ?)
                """,
                relations,
            )

        logger.info("Committing changes")
        self.conn.commit()
        if self.verbose:
            # Report the total count rather than the last zero-based index.
            sys.stderr.write((msg % n_seen) + "\n")
コード例 #10
0
ファイル: create.py プロジェクト: rbeagrie/gffutils
    def _populate_from_lines(self, lines):
        """
        Insert features and their "Parent" relations into the database.

        Indexes are dropped first via ``self._drop_indexes()``.  For each
        feature yielded by `lines`, an ID is assigned with
        ``self._id_handler`` and the feature is inserted.  If the insert
        raises ``sqlite3.IntegrityError``, ``self._do_merge`` decides what
        happens next: for "merge"/"replace" the stored attributes (and any
        ``self.force_merge_fields``) are updated; for "create_unique" the
        feature is inserted again.  Every value of the feature's ``Parent``
        attribute becomes a level-1 relation.

        Parameters
        ----------
        lines : iterable
            Feature objects to import.  An empty iterable is accepted; the
            final verbose message then reports 0 features.
        """
        c = self.conn.cursor()
        self._drop_indexes()
        logger.info("Populating features")
        msg = "Populating features table and first-order relations: " "%d features\r"

        # c.executemany() was not as much of an improvement as I had expected.
        #
        # Compared to a benchmark of doing each insert separately:
        # executemany using a list of dicts to iterate over is ~15% slower
        # executemany using a list of tuples to iterate over is ~8% faster

        # Number of features consumed so far.  Tracked separately from the
        # loop index so the summary below works when `lines` is empty.
        n_seen = 0
        for i, f in enumerate(lines):
            n_seen = i + 1

            # Progress report every 1000 features.
            if self.verbose and i % 1000 == 0:
                logger.info(msg % i)

            # TODO: handle ID creation here...should be combined with the
            # INSERT below (that is, don't IGNORE below but catch the error and
            # re-try with a new ID).  However, is this doable with an
            # execute-many?
            f.id = self._id_handler(f)
            try:
                self._insert(f, c)
            except sqlite3.IntegrityError:
                fixed, final_strategy = self._do_merge(f, self.merge_strategy)
                if final_strategy in ["merge", "replace"]:
                    c.execute(
                        """
                        UPDATE features SET attributes = ?
                        WHERE id = ?
                        """,
                        (helpers._jsonify(fixed.attributes), fixed.id),
                    )

                    # For any additional fields we're merging, update those as
                    # well.
                    if self.force_merge_fields:
                        _set_clause = ", ".join(["%s = ?" % field for field in self.force_merge_fields])
                        values = [getattr(fixed, field) for field in self.force_merge_fields] + [fixed.id]
                        c.execute(
                            """
                            UPDATE features SET %s
                            WHERE id = ?
                            """
                            % _set_clause,
                            tuple(values),
                        )

                elif final_strategy == "create_unique":
                    self._insert(f, c)

            if "Parent" in f.attributes:
                for parent in f.attributes["Parent"]:
                    c.execute(
                        """
                        INSERT OR IGNORE INTO relations VALUES
                        (?, ?, 1)
                        """,
                        (parent, f.id),
                    )

        self.conn.commit()
        if self.verbose:
            # Report the total count rather than the last zero-based index.
            logger.info(msg % n_seen)
コード例 #11
0
ファイル: create.py プロジェクト: DHatziioanou/gffutils
    def _update_relations(self):

        if self.disable_infer_genes and self.disable_infer_transcripts:
            return

        # TODO: do any indexes speed this up?
        c = self.conn.cursor()
        c2 = self.conn.cursor()

        logger.info("Creating relations(parent) index")
        c.execute('DROP INDEX IF EXISTS relationsparent')
        c.execute('CREATE INDEX relationsparent ON relations (parent)')
        logger.info("Creating relations(child) index")
        c.execute('DROP INDEX IF EXISTS relationschild')
        c.execute('CREATE INDEX relationschild ON relations (child)')

        if not (self.disable_infer_genes or self.disable_infer_transcripts):
            msg = 'gene and transcript'
        elif self.disable_infer_transcripts:
            msg = 'gene'
        elif self.disable_infer_genes:
            msg = 'transcript'
        logger.info('Inferring %s extents ' 'and writing to tempfile' % msg)

        if isinstance(self._keep_tempfiles, six.string_types):
            suffix = self._keep_tempfiles
        else:
            suffix = '.gffutils'
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix).name
        fout = open(tmp, 'w')

        self._tmpfile = tmp

        # This takes some explanation...
        #
        # First, the nested subquery gets the level-1 parents of
        # self.subfeature featuretypes.  For an on-spec GTF file,
        # self.subfeature = "exon". So this subquery translates to getting the
        # distinct level-1 parents of exons -- which are transcripts.
        #
        # OK, so this first subquery is now a list of transcripts; call it
        # "firstlevel".
        #
        # Then join firstlevel on relations, but the trick is to now consider
        # each transcript a *child* -- so that relations.parent (on the first
        # line of the query) will be the first-level parent of the transcript
        # (the gene).
        #
        #
        # The result is something like:
        #
        #   transcript1     gene1
        #   transcript2     gene1
        #   transcript3     gene2
        #
        # Note that genes are repeated; below we need to ensure that only one
        # is added.  To ensure this, the results are ordered by the gene ID.
        #
        # By the way, we do this even if we're only looking for transcripts or
        # only looking for genes.

        c.execute(
            '''
            SELECT DISTINCT firstlevel.parent, relations.parent
            FROM (
                SELECT DISTINCT parent
                FROM relations
                JOIN features ON features.id = relations.child
                WHERE features.featuretype = ?
                AND relations.level = 1
            )
            AS firstlevel
            JOIN relations ON firstlevel.parent = child
            WHERE relations.level = 1
            ORDER BY relations.parent
            ''', (self.subfeature, ))

        # Now we iterate through those results (using a new cursor) to infer
        # the extent of transcripts and/or genes.

        last_gene_id = None
        n_features = 0
        for transcript_id, gene_id in c:

            if not self.disable_infer_transcripts:
                # transcript extent
                c2.execute(
                    '''
                    SELECT MIN(start), MAX(end), strand, seqid
                    FROM features
                    JOIN relations ON
                    features.id = relations.child
                    WHERE parent = ? AND featuretype == ?
                    ''', (transcript_id, self.subfeature))
                transcript_start, transcript_end, strand, seqid = c2.fetchone()
                transcript_attributes = {
                    self.transcript_key: [transcript_id],
                    self.gene_key: [gene_id]
                }
                transcript_bin = bins.bins(transcript_start,
                                           transcript_end,
                                           one=True)

                # Write out to file; we'll be reading it back in shortly.  Omit
                # score, frame, source, and extra since they will always have
                # the same default values (".", ".", "gffutils_derived", and []
                # respectively)

                fout.write('\t'.join(
                    map(str, [
                        transcript_id, seqid, transcript_start, transcript_end,
                        strand, 'transcript', transcript_bin,
                        helpers._jsonify(transcript_attributes)
                    ])) + '\n')

                n_features += 1

            if not self.disable_infer_genes:
                # Infer gene extent, but only if we haven't done so already
                if gene_id != last_gene_id:
                    c2.execute(
                        '''
                        SELECT MIN(start), MAX(end), strand, seqid
                        FROM features
                        JOIN relations ON
                        features.id = relations.child
                        WHERE parent = ? AND featuretype == ?
                        ''', (gene_id, self.subfeature))
                    gene_start, gene_end, strand, seqid = c2.fetchone()
                    gene_attributes = {self.gene_key: [gene_id]}
                    gene_bin = bins.bins(gene_start, gene_end, one=True)

                    fout.write('\t'.join(
                        map(str, [
                            gene_id, seqid, gene_start, gene_end, strand,
                            'gene', gene_bin,
                            helpers._jsonify(gene_attributes)
                        ])) + '\n')

                last_gene_id = gene_id
                n_features += 1

        fout.close()

        def derived_feature_generator():
            """
            Yield Feature objects rebuilt from the temp file written above.

            Each line of that file holds eight tab-separated fields, in the
            order given by `keys` below.  The first field (written as the
            transcript or gene ID) is discarded; the feature's ID is instead
            assigned by `self._id_handler`.  Score, source, frame and extra
            are not stored in the file and are filled in with fixed values.
            """
            keys = [
                'parent', 'seqid', 'start', 'end', 'strand', 'featuretype',
                'bin', 'attributes'
            ]
            # Use a context manager so the handle is closed deterministically
            # once the generator is exhausted (or closed early), rather than
            # whenever the file object happens to be garbage collected.
            with open(fout.name) as derived:
                for line in derived:
                    d = dict(zip(keys, line.strip().split('\t')))
                    d.pop('parent')
                    d['score'] = '.'
                    d['source'] = 'gffutils_derived'
                    d['frame'] = '.'
                    d['extra'] = []
                    d['attributes'] = helpers._unjsonify(d['attributes'])
                    f = feature.Feature(**d)
                    f.id = self._id_handler(f)
                    yield f

        # Drop the indexes so the inserts are faster
        c.execute('DROP INDEX IF EXISTS relationsparent')
        c.execute('DROP INDEX IF EXISTS relationschild')

        # Insert the just-inferred transcripts and genes.  TODO: should we
        # *always* use "merge" here for the merge_strategy?
        logger.info("Importing inferred features into db")
        last_perc = None
        for i, f in enumerate(derived_feature_generator()):
            perc = int(i / float(n_features) * 100)
            if perc != last_perc:
                sys.stderr.write('%s of %s (%s%%)\r' % (i, n_features, perc))
                sys.stderr.flush()
            last_perc = perc
            try:
                self._insert(f, c)
            except sqlite3.IntegrityError:
                fixed, final_strategy = self._do_merge(f, 'merge')
                c.execute(
                    '''
                    UPDATE features SET attributes = ?
                    WHERE id = ?
                    ''', (helpers._jsonify(fixed.attributes), fixed.id))

        logger.info("Committing changes")
        self.conn.commit()
        if not self._keep_tempfiles:
            os.unlink(fout.name)
コード例 #12
0
ファイル: create.py プロジェクト: DHatziioanou/gffutils
    def _populate_from_lines(self, lines):
        msg = ("Populating features table and first-order relations: %d "
               "features\r")

        c = self.conn.cursor()

        # Only check this many features to see if it's a gene or transcript and
        # issue the appropriate warning.
        gene_and_transcript_check_limit = 1000

        last_perc = 0
        lines_seen = 0
        for i, f in enumerate(lines):

            # See issues #48 and #20.
            if lines_seen < gene_and_transcript_check_limit:
                if (f.featuretype == 'transcript'
                        and not self.disable_infer_transcripts):
                    warnings.warn(
                        "It appears you have a transcript feature in your GTF "
                        "file. You may want to use the "
                        "`disable_infer_transcripts` "
                        "option to speed up database creation")
                elif (f.featuretype == 'gene'
                      and not self.disable_infer_genes):
                    warnings.warn(
                        "It appears you have a gene feature in your GTF "
                        "file. You may want to use the "
                        "`disable_infer_genes` "
                        "option to speed up database creation")

            lines_seen = i + 1

            # Percent complete
            if self.verbose:

                if i % 1000 == 0:
                    sys.stderr.write(msg % i)
                    sys.stderr.flush()

            f.id = self._id_handler(f)

            # Insert the feature itself...
            try:
                self._insert(f, c)
            except sqlite3.IntegrityError:
                fixed, final_strategy = self._do_merge(f, self.merge_strategy)
                if final_strategy == 'merge':
                    c.execute(
                        '''
                        UPDATE features SET attributes = ?
                        WHERE id = ?
                        ''', (helpers._jsonify(fixed.attributes), fixed.id))
                    # For any additional fields we're merging, update those as
                    # well.
                    if self.force_merge_fields:
                        _set_clause = ', '.join([
                            '%s = ?' % field
                            for field in self.force_merge_fields
                        ])
                        values = [getattr(fixed, field)
                                  for field in self.force_merge_fields]\
                            + [fixed.id]
                        c.execute(
                            '''
                            UPDATE features SET %s
                            WHERE id = ?
                            ''' % _set_clause, values)
                elif final_strategy == 'replace':
                    self._replace(f, c)
                elif final_strategy == 'create_unique':
                    self._insert(f, c)

            # For an on-spec GTF file,
            # self.transcript_key = "transcript_id"
            # self.gene_key = "gene_id"
            relations = []
            parent = None
            grandparent = None
            if self.transcript_key in f.attributes:
                parent = f.attributes[self.transcript_key][0]
                relations.append((parent, f.id, 1))

            if self.gene_key in f.attributes:
                grandparent = f.attributes[self.gene_key]
                if len(grandparent) > 0:
                    grandparent = grandparent[0]
                    relations.append((grandparent, f.id, 2))
                    if parent is not None:
                        relations.append((grandparent, parent, 1))

            # Note the IGNORE, so relationships defined many times in the file
            # (e.g., the transcript-gene relation on pretty much every line in
            # a GTF) will only be included once.
            c.executemany(
                '''
                INSERT OR IGNORE INTO relations (parent, child, level)
                VALUES (?, ?, ?)
                ''', relations)

        if lines_seen == 0:
            raise ValueError("No lines parsed -- was an empty file provided?")
        logger.info('Committing changes')
        self.conn.commit()
        if self.verbose:
            logger.info(msg % i)
コード例 #13
0
ファイル: create.py プロジェクト: DHatziioanou/gffutils
    def _populate_from_lines(self, lines):
        """
        Insert features and their `Parent` relations into the database.

        Indexes are dropped first via `self._drop_indexes`.  For every
        feature an ID is assigned via `self._id_handler` and the feature is
        inserted.  If the insert raises `sqlite3.IntegrityError`,
        `self._do_merge` is consulted and the strategy it returns ('merge',
        'replace' or 'create_unique') decides how the conflict is resolved.
        Each value of the feature's `Parent` attribute is then recorded as a
        level-1 relation, ignoring duplicates.

        Parameters
        ----------
        lines : iterable
            Iterable of feature objects exposing at least `attributes` and a
            settable `id`.

        Raises
        ------
        ValueError
            If `lines` yields no features at all.
        """
        c = self.conn.cursor()
        self._drop_indexes()
        logger.info("Populating features")
        msg = ("Populating features table and first-order relations: "
               "%d features\r")

        # Features are inserted one at a time on purpose: c.executemany() was
        # not as much of an improvement as expected.
        #
        # Compared to a benchmark of doing each insert separately:
        # executemany using a list of dicts to iterate over is ~15% slower
        # executemany using a list of tuples to iterate over is ~8% faster
        features_seen = 0
        for i, f in enumerate(lines):
            # True count of features consumed so far (i is 0-based).
            features_seen = i + 1

            # Progress report, every 1000 features
            if self.verbose and i % 1000 == 0:
                sys.stderr.write(msg % i)
                sys.stderr.flush()

            # TODO: handle ID creation here...should be combined with the
            # INSERT below (that is, don't IGNORE below but catch the error and
            # re-try with a new ID).  However, is this doable with an
            # execute-many?
            f.id = self._id_handler(f)
            try:
                self._insert(f, c)
            except sqlite3.IntegrityError:
                # On a conflict let the merge strategy decide.
                fixed, final_strategy = self._do_merge(f, self.merge_strategy)
                if final_strategy == 'merge':
                    c.execute(
                        '''
                        UPDATE features SET attributes = ?
                        WHERE id = ?
                        ''', (helpers._jsonify(fixed.attributes), fixed.id))

                    # For any additional fields we're merging, update those as
                    # well.  Only the column names are interpolated; values
                    # are still bound as parameters.
                    if self.force_merge_fields:
                        _set_clause = ', '.join([
                            '%s = ?' % field
                            for field in self.force_merge_fields
                        ])
                        values = [
                            getattr(fixed, field)
                            for field in self.force_merge_fields
                        ] + [fixed.id]
                        c.execute(
                            '''
                            UPDATE features SET %s
                            WHERE id = ?
                            ''' % _set_clause, tuple(values))

                elif final_strategy == 'replace':
                    self._replace(f, c)

                elif final_strategy == 'create_unique':
                    self._insert(f, c)

            # The IGNORE means a relation defined more than once is only
            # stored once.
            if 'Parent' in f.attributes:
                for parent in f.attributes['Parent']:
                    c.execute(
                        '''
                        INSERT OR IGNORE INTO relations VALUES
                        (?, ?, 1)
                        ''', (parent, f.id))
        if features_seen == 0:
            raise ValueError("No lines parsed -- was an empty file provided?")

        self.conn.commit()
        if self.verbose:
            # Report the number of features, not the last 0-based index.
            logger.info(msg % features_seen)