def derived_feature_generator():
    """
    Generator of items from the file that was just created...

    Reads the tab-delimited file at ``fout.name`` line by line and yields
    one ``feature.Feature`` per line.  ``fout`` and ``self`` are not
    defined in this function; they are taken from the enclosing scope.

    Each line is expected to carry the columns listed in ``keys``, in that
    order.  The ``parent`` column is discarded, and the fields the file
    does not provide are filled in with fixed values (``score`` and
    ``frame`` are ".", ``source`` is "gffutils_derived", ``extra`` is an
    empty list).

    Yields
    ------
    feature.Feature
        Feature whose ``id`` has been set from ``self._id_handler``.
    """
    keys = ['parent', 'seqid', 'start', 'end', 'strand', 'featuretype',
            'bin', 'attributes']

    # Open the file in a context manager so the handle is closed as soon as
    # the generator is exhausted or closed, rather than whenever the garbage
    # collector gets around to it.
    with open(fout.name) as handle:
        for line in handle:
            d = dict(zip(keys, line.strip().split('\t')))

            # The parent column is not a Feature field.
            d.pop('parent')

            # Fields that are not stored in the file.
            d['score'] = '.'
            d['source'] = 'gffutils_derived'
            d['frame'] = '.'
            d['extra'] = []

            # Attributes are stored serialized; decode them before building
            # the Feature.
            d['attributes'] = helpers._unjsonify(d['attributes'])

            f = feature.Feature(**d)
            f.id = self._id_handler(f)
            yield f
def __init__(self, seqid=".", source=".", featuretype=".",
             start=".", end=".", score=".", strand=".", frame=".",
             attributes=None, extra=None, bin=None, id=None, dialect=None,
             file_order=None, keep_order=False):
    """
    Represents a feature from the database.

    The constructor normalizes its arguments (coordinates, attributes,
    extra fields, bin) and stores them on the instance.  It carries
    implementation details needed for working with FeatureDB objects, so
    for everyday use the :func:`feature_from_line` function is usually the
    more convenient entry point.

    Parameters
    ----------
    seqid : string
        Name of the sequence (often chromosome).

    source : string
        Source of the feature; typically the originating database or
        program that predicted the feature.

    featuretype : string
        Type of feature.  For example "gene", "exon", "TSS", etc.

    start, end : int-like, "." or None
        1-based coordinates; start must be <= end.  "." (the GFF
        placeholder, and the default) and None are both stored as None;
        anything else is passed through ``int()``.

    score : string
        Stored as given.

    strand : "+" | "-" | "."
        Strand of the feature; "." when strand is not relevant.

    frame : "0" | "1" | "2"
        Coding frame.  0 means in-frame; 1 means there is one extra base
        at the beginning, so the first codon starts at the second base; 2
        means two extra bases at the beginning.  Interpretation is strand
        specific; "beginning" for a minus-strand feature is at the end
        coordinate.

    attributes : string, dict or None
        Anything falsy is replaced by a new ``dict_class()``.  A string is
        first decoded as JSON; if that raises
        ``simplejson.JSONDecodeError`` it is parsed as a key/value
        attributes string instead, and the dialect detected during that
        parse is adopted when no `dialect` was supplied.  Any other value
        is stored as-is.

    extra : string, list or None
        Additional fields after the canonical 9 fields for GFF/GTF.
        Anything falsy becomes an empty list.  A string is first decoded
        as JSON; if that raises ``simplejson.JSONDecodeError`` it is split
        on tabs.  Any other value is stored as-is.

    bin : int
        UCSC genomic bin.  If None, it is computed from the normalized
        start/end; if that computation raises TypeError (e.g. start or end
        was "."), bin stays None.

    id : None or string
        Database-specific primary key for this feature.  Stored as given.

    dialect : dict or None
        The dialect to use when reconstructing attribute strings.  When
        nothing is supplied and nothing was detected from the attributes
        string, ``constants.dialect`` is used.

    file_order : int
        The `rowid` special field used in a sqlite3 database; provided by
        FeatureDB.  Stored as given.

    keep_order : bool
        If True, the attributes in the printed string will be in the order
        specified in the dialect.  Disabled by default, since the sorting
        step is time-consuming over many features.
    """

    def _as_coordinate(value):
        # "." and None both mean "no coordinate"; everything else must be
        # convertible to int.
        if value == "." or value is None:
            return None
        return int(value)

    start = _as_coordinate(start)
    end = _as_coordinate(end)

    # Attributes always end up as a dict-like object.  dict_class is a
    # module-level name so that different dict implementations can be
    # swapped in.
    if not attributes:
        attributes = dict_class()
    if isinstance(attributes, basestring):
        try:
            attributes = helpers._unjsonify(attributes, isattributes=True)
        except simplejson.JSONDecodeError:
            # Not JSON, so treat it as the original attributes string.
            # Parsing it also reports the dialect it was written in.
            attributes, detected_dialect = parser._split_keyvals(attributes)
            if not dialect:
                dialect = detected_dialect

    # Extra fields always end up as a list.
    if not extra:
        extra = []
    if isinstance(extra, basestring):
        try:
            extra = helpers._unjsonify(extra)
        except simplejson.JSONDecodeError:
            # Not JSON, so treat it as tab-delimited text.
            extra = extra.split('\t')

    # Only compute the bin when the caller did not supply one.
    if bin is None:
        try:
            bin = bins.bins(start, end, one=True)
        except TypeError:
            bin = None

    # The canonical GFF/GTF columns, in file order.
    self.seqid = seqid
    self.source = source
    self.featuretype = featuretype
    self.start = start
    self.end = end
    self.score = score
    self.strand = strand
    self.frame = frame
    self.attributes = attributes

    # Bookkeeping that is not part of the nine canonical columns.
    self.extra = extra
    self.bin = bin
    self.id = id
    self.dialect = dialect or constants.dialect
    self.file_order = file_order
    self.keep_order = keep_order
def __init__(self, dbfn, text_factory=None, default_encoding='utf-8',
             keep_order=False):
    """
    Connect to a database created by :func:`gffutils.create_db`.

    Parameters
    ----------
    dbfn : str, :class:`_DBCreator` instance or sqlite3.Connection
        Path to a database created by :func:`gffutils.create_db`.  An
        existing ``sqlite3.Connection`` or a :class:`_DBCreator` instance
        is also accepted, in which case its connection is reused; this is
        the way to reach an in-memory database.  The literal string
        ``":memory:"`` is rejected, because connecting to it would open a
        new, separate and empty in-memory database.

    text_factory : callable or None
        Optionally set the way sqlite3 handles strings.  If None, the
        connection's ``text_factory`` is left untouched.

    default_encoding : str
        Stored as ``self.default_encoding``; intended as the encoding to
        assume when non-ASCII characters are encountered.

    keep_order : bool
        If True, all features returned from this instance will have the
        order of their attributes maintained.  This can be turned on or
        off database-wide by setting the `keep_order` attribute or with
        this kwarg, or on a feature-by-feature basis by setting the
        `keep_order` attribute of an individual feature.  Default is
        False, since this includes a sorting step that can get
        time-consuming for many features.

    Raises
    ------
    ValueError
        If `dbfn` is the string ``":memory:"``.
    """
    # Work out which connection to use.
    if isinstance(dbfn, create._DBCreator):
        # Reuse the creator's connection and remember its database name.
        self.conn = dbfn.conn
        self.dbfn = dbfn.dbfn
    elif isinstance(dbfn, sqlite3.Connection):
        # An already-open connection; there is no filename to record, so
        # the connection itself is stored in both places.
        self.conn = dbfn
        self.dbfn = dbfn
    elif dbfn == ':memory:':
        raise ValueError(
            "cannot connect to memory db; please provide the connection")
    else:
        # Anything else is assumed to be a path on disk.
        self.dbfn = dbfn
        self.conn = sqlite3.connect(self.dbfn)

    if text_factory is not None:
        self.conn.text_factory = text_factory
    self.conn.row_factory = sqlite3.Row

    self.default_encoding = default_encoding
    self.keep_order = keep_order

    cursor = self.conn.cursor()

    # Meta information.
    # TODO: this is a good place to check for previous versions, and offer
    # to upgrade...
    cursor.execute(''' SELECT version, dialect FROM meta ''')
    version, serialized_dialect = cursor.fetchone()
    self.version = version
    self.dialect = helpers._unjsonify(serialized_dialect)

    # Directives stored in the database.
    cursor.execute(''' SELECT directive FROM directives ''')
    directives = []
    for row in cursor:
        if row:
            directives.append(row[0])
    self.directives = directives

    # Autoincrement counters, so that new features continue numbering from
    # where the database left off (restarting from 1 would cause name
    # collisions).
    cursor.execute(''' SELECT base, n FROM autoincrements ''')
    self._autoincrements = dict(cursor)