Example #1
0
def infer_dialect(attributes):
    """
    Work out which dialect a set of attribute strings is written in.

    Each attribute string is run through ``parser._split_keyvals`` and the
    dialect it reports is collected; the collected dialects are then handed
    to ``_choose_dialect``, whose result is returned.

    Parameters
    ----------
    attributes : str or iterable
        Either one attributes string (the attributes field of a GTF or GFF
        line) or an iterable of such strings.  A lone string is treated as
        a one-item collection.
    """
    # A bare string must be wrapped, otherwise it would be iterated
    # character by character below.
    attr_strings = (
        [attributes] if isinstance(attributes, basestring) else attributes
    )

    observed = []
    for attr_string in attr_strings:
        # _split_keyvals returns a sequence whose second item is the dialect.
        observed.append(parser._split_keyvals(attr_string)[1])
    return _choose_dialect(observed)
Example #2
0
def feature_from_line(line, dialect=None, strict=True, keep_order=False):
    """
    Given a line from a GFF file, return a Feature object

    Parameters
    ----------
    line : string
        The GFF/GTF line to parse.

    dialect : dict or None
        Passed to ``parser._split_keyvals`` when parsing the attributes
        field.  If None, the dialect reported by that parser is the one
        attached to the returned Feature.

    strict : bool
        If True (default), assume `line` is a single, tab-delimited string that
        has at least 9 fields.

        If False, then the input can have a more flexible format, useful for
        creating single ad hoc features or for writing tests.  In this case,
        `line` can be a multi-line string (as long as it has a single non-empty
        line), and, as long as there are only 9 fields (standard GFF/GTF), then
        it's OK to use spaces instead of tabs to separate fields in `line`.
        But if >9 fields are to be used, then tabs must be used.

    keep_order : bool
        Passed through unchanged to the Feature constructor.

    Raises
    ------
    AssertionError
        If `strict` is False and `line` does not contain exactly one
        non-empty line.  The exception argument is the list of non-empty
        (stripped) lines that were found.
    """
    if not strict:
        # Keep only lines that still have content after stripping whitespace.
        stripped = (raw.strip() for raw in line.splitlines(False))
        _lines = [i for i in stripped if i]

        # Raised explicitly rather than via an `assert` statement so that the
        # check is still enforced under `python -O`.  AssertionError (with
        # the same payload) is kept so existing callers see no difference.
        if len(_lines) != 1:
            raise AssertionError(_lines)
        line = _lines[0]

        # `line` is already stripped here, so no trailing newline remains.
        if '\t' in line:
            fields = line.split('\t')
        else:
            # Whitespace-delimited: split at most 8 times so the attributes
            # field (which may itself contain spaces) stays in one piece.
            fields = line.split(None, 8)
    else:
        fields = line.rstrip('\n\r').split('\t')

    # Lines with fewer than 9 fields simply have no attributes.
    attr_string = fields[8] if len(fields) > 8 else ""
    attrs, _dialect = parser._split_keyvals(attr_string, dialect=dialect)

    # zip() stops at the shorter input, so missing trailing fields are left
    # out of `d` and fall back to the Feature constructor's defaults.
    d = dict(zip(constants._gffkeys, fields))
    d['attributes'] = attrs
    d['extra'] = fields[9:]
    d['keep_order'] = keep_order
    if dialect is None:
        dialect = _dialect
    return Feature(dialect=dialect, **d)
Example #3
0
    def __init__(self, seqid=".", source=".", featuretype=".",
                 start=".", end=".", score=".", strand=".", frame=".",
                 attributes=None, extra=None, bin=None, id=None, dialect=None,
                 file_order=None, keep_order=False):
        """
        A single feature, typically one that came out of the database.

        Printing a feature aims to reproduce the original file line as
        faithfully as possible, guided by `dialect`.

        This constructor carries implementation details needed when working
        with FeatureDB objects, so it is usually not called directly; the
        :func:`feature_from_line` function is the friendlier entry point.

        Parameters
        ----------

        seqid : string
            Sequence name (often a chromosome).

        source : string
            Where the feature came from; typically the originating database
            or the program that predicted it.

        featuretype : string
            Kind of feature, e.g. "gene", "exon", "TSS".

        start, end : int or "."
            1-based coordinates, with start expected to be <= end (this is
            not checked here).  "." (the GFF placeholder and the default)
            and None are both stored as None; anything else is passed
            through ``int()``.

        score : string
            Stored as given.

        strand : "+" | "-" | "."
            Strand of the feature; "." when strand is not relevant.

        frame : "0" | "1" | "2"
            Coding frame.  0 means in-frame; 1 means one extra base precedes
            the first codon, so the codon starts at the second base; 2 means
            two extra bases.  Interpretation is strand specific: the
            "beginning" of a minus-strand feature is its end coordinate.

        attributes : string or dict
            A string is first tried as serialized JSON; if that fails it is
            parsed as an original key/vals attributes string (and the
            dialect detected while parsing is used when no `dialect` was
            given).  A non-empty dictionary is used as-is.  Any falsy value
            (None, "", an empty dict) is replaced by a new ``dict_class()``
            instance.

            Either way, this instance's `attributes` attribute ends up as
            a dictionary.

            Upon printing, the attributes will be reconstructed based on this
            dictionary and the dialect -- except if the original attributes
            string was provided, in which case that will be used directly.

        extra : string or list
            Fields beyond the canonical 9 GFF/GTF fields.

            A string is first tried as serialized JSON; if that fails it is
            split on tabs.  A non-empty list is used as-is, and any falsy
            value becomes a new empty list.

        bin : int
            UCSC genomic bin.  When None, it is computed from start/end; if
            that computation raises TypeError (e.g. start or end is None),
            bin stays None.

        id : None or string
            Database-specific primary key for this feature.  Expected to be
            None unless the feature is coming from a database, in which case
            it is filled in automatically.

        dialect : dict or None

            Dialect used when reconstructing attribute strings.  Falls back
            to ``constants.dialect`` when nothing else is available.
            :class:`FeatureDB` objects will automatically attach the dialect
            from the original file.

        file_order : int
            The `rowid` special field of the sqlite3 database; supplied by
            FeatureDB.

        keep_order : bool
            If True, then the attributes in the printed string will be in the
            order specified in the dialect.  Disabled by default, since this
            sorting step is time-consuming over many features.

        """
        # Coordinates arrive as something int-like, ".", or None; store them
        # as int or None.
        if start is not None:
            start = None if start == "." else int(start)
        if end is not None:
            end = None if end == "." else int(end)

        # Attributes may be a dict, a JSON string, or a raw key/vals string;
        # the stored value is always a dict.
        #
        # dict_class is a module-level name (defined outside this method) so
        # that different dict implementations can be swapped in for testing.
        if not attributes:
            attributes = dict_class()

        if isinstance(attributes, basestring):
            try:
                attributes = helpers._unjsonify(attributes, isattributes=True)
            except simplejson.JSONDecodeError:
                # Not JSON, so treat it as the original attributes string and
                # parse it into a dict.
                parsed, detected_dialect = parser._split_keyvals(attributes)
                attributes = parsed

                # The detected dialect only applies when none was supplied.
                if not dialect:
                    dialect = detected_dialect

        # Extra fields may be a list, a JSON string, or a tab-delimited
        # string; the stored value is always a list.
        if not extra:
            extra = []
        if isinstance(extra, basestring):
            try:
                extra = helpers._unjsonify(extra)
            except simplejson.JSONDecodeError:
                extra = extra.split('\t')

        # Work out the bin only when the caller did not supply one.
        if bin is None:
            try:
                bin = bins.bins(start, end, one=True)
            except TypeError:
                # Leave bin as None.
                pass

        # Fall back to the module-wide default dialect as a last resort.
        if not dialect:
            dialect = constants.dialect

        self.seqid = seqid
        self.source = source
        self.featuretype = featuretype
        self.start = start
        self.end = end
        self.score = score
        self.strand = strand
        self.frame = frame
        self.attributes = attributes
        self.extra = extra
        self.bin = bin
        self.id = id
        self.dialect = dialect
        self.file_order = file_order
        self.keep_order = keep_order