def infer_dialect(attributes):
    """
    Infer the dialect based on the attributes.

    Parameters
    ----------
    attributes : str or iterable
        A single attributes string from a GTF or GFF line, or an iterable of
        such strings.

    Returns
    -------
    Whatever `_choose_dialect` returns when given the list of dialects
    reported by `parser._split_keyvals`, one per attributes string.
    """
    # Normalize the single-string case so the loop below handles both forms.
    if isinstance(attributes, basestring):
        attributes = [attributes]

    observed_dialects = []
    for attr_string in attributes:
        parsed = parser._split_keyvals(attr_string)
        # Item 1 of the parser result is the dialect observed for this string.
        observed_dialects.append(parsed[1])

    return _choose_dialect(observed_dialects)
def feature_from_line(line, dialect=None, strict=True, keep_order=False):
    """
    Given a line from a GFF file, return a Feature object

    Parameters
    ----------
    line : string

    dialect : dict or None
        Passed through to `parser._split_keyvals` when parsing the attributes
        field. If None, the dialect reported by the parser is attached to the
        returned Feature instead.

    strict : bool
        If True (default), assume `line` is a single, tab-delimited string
        that has at least 9 fields.

        If False, then the input can have a more flexible format, useful for
        creating single ad hoc features or for writing tests. In this case,
        `line` can be a multi-line string (as long as it has a single
        non-empty line), and, as long as there are only 9 fields (standard
        GFF/GTF), then it's OK to use spaces instead of tabs to separate
        fields in `line`. But if >9 fields are to be used, then tabs must be
        used.

    keep_order : bool
        Forwarded unchanged to the Feature constructor.

    Returns
    -------
    Feature

    Raises
    ------
    AssertionError
        If `strict` is False and `line` does not contain exactly one
        non-empty line.
    """
    if strict:
        fields = line.rstrip('\n\r').split('\t')
    else:
        # Drop surrounding whitespace and blank lines; exactly one line of
        # content must remain.
        stripped = [chunk.strip() for chunk in line.splitlines(False)]
        _lines = [chunk for chunk in stripped if len(chunk) > 0]
        assert len(_lines) == 1, _lines
        line = _lines[0]

        trimmed = line.rstrip('\n\r')
        if '\t' in line:
            fields = trimmed.split('\t')
        else:
            # Whitespace-delimited: cap at 9 fields so the attributes field
            # keeps any internal spaces.
            fields = trimmed.split(None, 8)

    # A line with fewer than 9 fields has no attributes column.
    attr_string = fields[8] if len(fields) > 8 else ""
    attrs, _dialect = parser._split_keyvals(attr_string, dialect=dialect)

    d = dict(zip(constants._gffkeys, fields))
    d.update(attributes=attrs, extra=fields[9:], keep_order=keep_order)

    return Feature(dialect=_dialect if dialect is None else dialect, **d)
def __init__(self, seqid=".", source=".", featuretype=".",
             start=".", end=".", score=".", strand=".", frame=".",
             attributes=None, extra=None, bin=None, id=None, dialect=None,
             file_order=None, keep_order=False):
    """
    Represents a feature from the database.

    When printed, reproduces the original line from the file as faithfully
    as possible using `dialect`.

    Usually you won't want to use this directly, since it has various
    implementation details needed for operating in the context of FeatureDB
    objects. Instead, try the :func:`feature_from_line` function.

    Parameters
    ----------
    seqid : string
        Name of the sequence (often chromosome)

    source : string
        Source of the feature; typically the originating database or
        program that predicted the feature

    featuretype : string
        Type of feature. For example "gene", "exon", "TSS", etc

    start, end : int or "."
        1-based coordinates; start must be <= end. If "." (the default
        placeholder for GFF files), then the corresponding attribute will
        be None. Any other non-None value is converted with `int()`.

    score : string
        Stored as a string.

    strand : "+" | "-" | "."
        Strand of the feature; "." when strand is not relevant.

    frame : "0" | "1" | "2"
        Coding frame. 0 means in-frame; 1 means there is one extra base at
        the beginning, so the first codon starts at the second base;
        2 means two extra bases at the beginning. Interpretation is strand
        specific; "beginning" for a minus-strand feature is at the end
        coordinate.

    attributes : string or dict
        If a string, first assume it is serialized JSON; if this fails then
        assume it's the original key/vals string. If it's a dictionary
        already, then use as-is.

        The end result is that this instance's `attributes` attribute will
        always be a dictionary.

        Upon printing, the attributes will be reconstructed based on this
        dictionary and the dialect -- except if the original attributes
        string was provided, in which case that will be used directly.

    extra : string or list
        Additional fields after the canonical 9 fields for GFF/GTF.

        If a string, then first assume it's serialized JSON; if this fails
        then assume it's a tab-delimited string of additional fields. If
        it's a list already, then use as-is.

    bin : int
        UCSC genomic bin. If None, will be created based on provided
        start/end; if start or end is "." then bin will be None.

    id : None or string
        Database-specific primary key for this feature. The only time this
        should not be None is if this feature is coming from a database, in
        which case it will be filled in automatically.

    dialect : dict or None
        The dialect to use when reconstructing attribute strings; defaults
        to the GFF3 spec. :class:`FeatureDB` objects will automatically
        attach the dialect from the original file.

    file_order : int
        This is the `rowid` special field used in a sqlite3 database; this
        is provided by FeatureDB.

    keep_order : bool
        If True, then the attributes in the printed string will be in the
        order specified in the dialect. Disabled by default, since this
        sorting step is time-consuming over many features.
    """
    # Coordinates arrive as int-like values, the "." placeholder, or None;
    # normalize them to int or None.
    start = None if start == "." or start is None else int(start)
    end = None if end == "." or end is None else int(end)

    # Attributes may be a dict (used as-is), a JSON string, or the original
    # key/vals string from the file. Either way the result is a dict.
    #
    # dict_class is set at the module level so different dict
    # implementations (ordered, defaultdict, etc) can be swapped in for
    # testing.
    if not attributes:
        attributes = dict_class()
    if isinstance(attributes, basestring):
        try:
            attributes = helpers._unjsonify(attributes, isattributes=True)
        except simplejson.JSONDecodeError:
            # Not JSON, so treat it as the original attributes string; the
            # parser also reports the dialect it observed.
            attributes, _dialect = parser._split_keyvals(attributes)
            # Only fall back to the observed dialect if none was provided.
            if not dialect:
                dialect = _dialect

    # Extra fields may be a list (used as-is), a JSON string, or a
    # tab-delimited string.
    if not extra:
        extra = []
    if isinstance(extra, basestring):
        try:
            extra = helpers._unjsonify(extra)
        except simplejson.JSONDecodeError:
            extra = extra.split('\t')

    # Compute the bin only when the caller did not supply one; a TypeError
    # from the bin calculation leaves it as None.
    if bin is None:
        try:
            bin = bins.bins(start, end, one=True)
        except TypeError:
            bin = None

    self.seqid = seqid
    self.source = source
    self.featuretype = featuretype
    self.start = start
    self.end = end
    self.score = score
    self.strand = strand
    self.frame = frame
    self.attributes = attributes
    self.extra = extra
    self.bin = bin
    self.id = id
    self.dialect = dialect or constants.dialect
    self.file_order = file_order
    self.keep_order = keep_order