Esempio n. 1
0
 def derived_feature_generator():
     """
     Yield a Feature for every line of the file that was just created.

     The file at ``fout.name`` is read line by line.  Each line is stripped,
     split on tabs, and the fields are paired positionally with:
     parent, seqid, start, end, strand, featuretype, bin, attributes.

     The ``parent`` field is discarded.  Fields that the file does not
     carry are filled with placeholders (score ".", frame ".", no extra
     fields) and the source is set to "gffutils_derived".  The attributes
     field is decoded with ``helpers._unjsonify`` before the Feature is
     built, and the feature's ``id`` is assigned by ``self._id_handler``.

     The file is opened in a ``with`` block so the handle is closed when
     the generator is exhausted or closed, rather than being left for the
     garbage collector.
     """
     keys = ['parent', 'seqid', 'start', 'end', 'strand',
             'featuretype', 'bin', 'attributes']
     with open(fout.name) as derived_file:
         for line in derived_file:
             d = dict(zip(keys, line.strip().split('\t')))

             # 'parent' is not an argument of Feature, so drop it.
             d.pop('parent')

             # Placeholders for the columns not stored in the file.
             d['score'] = '.'
             d['source'] = 'gffutils_derived'
             d['frame'] = '.'
             d['extra'] = []

             d['attributes'] = helpers._unjsonify(d['attributes'])
             f = feature.Feature(**d)
             f.id = self._id_handler(f)
             yield f
Esempio n. 2
0
    def __init__(self, seqid=".", source=".", featuretype=".",
                 start=".", end=".", score=".", strand=".", frame=".",
                 attributes=None, extra=None, bin=None, id=None, dialect=None,
                 file_order=None, keep_order=False):
        """
        A single feature, as stored in the database.

        Printing an instance is meant to reproduce the original line of the
        file as closely as possible, guided by `dialect`.

        This constructor carries implementation details needed when working
        with FeatureDB objects, so it is usually not called directly; the
        :func:`feature_from_line` function is the recommended entry point.

        Parameters
        ----------

        seqid : string
            Sequence name (often a chromosome).

        source : string
            Where the feature came from; typically a database or the program
            that predicted it.

        featuretype : string
            Kind of feature, e.g. "gene", "exon", "TSS".

        start, end : int-like, "." or None
            1-based coordinates with start <= end.  "." (the GFF placeholder
            and the default here) and None are both stored as None; any other
            value is converted with ``int()``.

        score : string
            Kept as given.

        strand : "+" | "-" | "."
            Strand of the feature; "." when strand does not apply.

        frame : "0" | "1" | "2"
            Coding frame: 0 is in-frame, 1 means one extra base precedes the
            first codon, 2 means two extra bases.  The interpretation is
            strand specific: for a minus-strand feature the "beginning" is at
            the end coordinate.

        attributes : string or dict
            A falsy value is replaced by an empty ``dict_class()``.  A string
            is first decoded as serialized JSON; if that raises
            ``simplejson.JSONDecodeError`` it is parsed as an original
            key/value attributes string instead, and the dialect detected
            during that parse is used when no `dialect` was supplied.  Any
            other value (e.g. a dict) is stored as-is.

        extra : string or list
            Fields that follow the canonical 9 GFF/GTF fields.  A falsy value
            becomes an empty list.  A string is first decoded as serialized
            JSON; if that fails it is split on tabs.  A list is stored as-is.

        bin : int
            UCSC genomic bin.  When None it is computed from start/end; if
            that computation raises TypeError (e.g. because start or end is
            None) the bin stays None.

        id : None or string
            Database-specific primary key.  Expected to be None unless the
            feature comes from a database.

        dialect : dict or None
            Dialect used when rebuilding attribute strings.  Falls back to the
            dialect detected from `attributes` (see above) and finally to
            ``constants.dialect``.

        file_order : int
            The `rowid` special field of the sqlite3 database, supplied by
            FeatureDB.

        keep_order : bool
            If True, printed attributes follow the order given in the dialect.
            Off by default because the sorting is time-consuming over many
            features.

        """
        def _coordinate(value):
            # "." and None both mean "no coordinate"; everything else must be
            # convertible to int.
            if value == "." or value is None:
                return None
            return int(value)

        start = _coordinate(start)
        end = _coordinate(end)

        # Attributes always end up as a dict-like object.  dict_class is
        # defined at module level so that alternative dict implementations
        # can be swapped in for testing.
        if not attributes:
            attributes = dict_class()

        if isinstance(attributes, basestring):
            try:
                attributes = helpers._unjsonify(attributes, isattributes=True)
            except simplejson.JSONDecodeError:
                # Not JSON, so treat it as the raw attributes string from the
                # file.  Parsing it also tells us which dialect it uses.
                attributes, detected_dialect = parser._split_keyvals(
                    attributes)

                # An explicitly supplied dialect takes precedence.
                if not dialect:
                    dialect = detected_dialect

        # Extra fields always end up as a list: JSON first, tab-delimited
        # text as the fallback.
        if not extra:
            extra = []

        if isinstance(extra, basestring):
            try:
                extra = helpers._unjsonify(extra)
            except simplejson.JSONDecodeError:
                extra = extra.split('\t')

        # Derive the bin from the coordinates when the caller gave none.
        if bin is None:
            try:
                bin = bins.bins(start, end, one=True)
            except TypeError:
                # Coordinates unusable for binning; leave bin as None.
                pass

        self.seqid = seqid
        self.source = source
        self.featuretype = featuretype
        self.start = start
        self.end = end
        self.score = score
        self.strand = strand
        self.frame = frame
        self.attributes = attributes
        self.extra = extra
        self.bin = bin
        self.id = id
        self.dialect = dialect if dialect else constants.dialect
        self.file_order = file_order
        self.keep_order = keep_order
Esempio n. 3
0
    def __init__(self, dbfn, text_factory=None, default_encoding='utf-8',
                 keep_order=False):
        """
        Connect to a database created by :func:`gffutils.create_db`.

        Parameters
        ----------

        dbfn : str, :class:`_DBCreator` instance, or sqlite3.Connection

            Path to a database created by :func:`gffutils.create_db`.

            An instance of :class:`_DBCreator` or an open
            ``sqlite3.Connection`` may be given instead, in which case its
            existing connection is reused.  This is the way to access an
            in-memory database, e.g. one made by passing ``dbfn=":memory:"``
            to :func:`gffutils.create_db`.  When a ``sqlite3.Connection`` is
            given, the `dbfn` attribute of this instance is that connection
            object rather than a path.

        text_factory : callable or None

            Optionally set the way sqlite3 handles strings.  If None (the
            default), the connection's `text_factory` is left unchanged.

        default_encoding : str

            When non-ASCII characters are encountered, assume they are in this
            encoding.

        keep_order : bool

            If True, all features returned from this instance will have the
            order of their attributes maintained.  This can be turned on or off
            database-wide by setting the `keep_order` attribute or with this
            kwarg, or on a feature-by-feature basis by setting the `keep_order`
            attribute of an individual feature.

            Default is False, since this includes a sorting step that can get
            time-consuming for many features.

        Raises
        ------

        ValueError
            If `dbfn` is the string ``":memory:"``.

        """
        # Since specifying the string ":memory:" will actually try to connect
        # to a new, separate (and empty) db in memory, we can alternatively
        # pass in a _DBCreator or a sqlite connection instance to use its
        # existing, in-memory db.
        if isinstance(dbfn, create._DBCreator):
            self.conn = dbfn.conn
            self.dbfn = dbfn.dbfn

        elif isinstance(dbfn, sqlite3.Connection):
            self.conn = dbfn
            self.dbfn = dbfn

        # otherwise assume dbfn is a string.
        elif dbfn == ':memory:':
            raise ValueError(
                "cannot connect to memory db; please provide the connection")
        else:
            self.dbfn = dbfn
            self.conn = sqlite3.connect(self.dbfn)

        # Only override the connection's string handling when asked to.
        if text_factory is not None:
            self.conn.text_factory = text_factory
        self.conn.row_factory = sqlite3.Row

        self.default_encoding = default_encoding
        self.keep_order = keep_order
        c = self.conn.cursor()

        # Load some meta info
        # TODO: this is a good place to check for previous versions, and offer
        # to upgrade...
        c.execute(
            '''
            SELECT version, dialect FROM meta
            ''')
        version, dialect = c.fetchone()
        self.version = version
        # The dialect is passed through helpers._unjsonify before use.
        self.dialect = helpers._unjsonify(dialect)

        # Load directives from db
        c.execute(
            '''
            SELECT directive FROM directives
            ''')
        self.directives = [row[0] for row in c if row]

        # Load autoincrements so that when we add new features, we can start
        # autoincrementing from where we last left off (instead of from 1,
        # which would cause name collisions)
        c.execute(
            '''
            SELECT base, n FROM autoincrements
            ''')
        # Each (base, n) row becomes one key/value pair.
        self._autoincrements = dict(c)