def update(self, data, make_backup=True, **kwargs):
        """
        Update database with features in `data`.

        Parameters
        ----------
        data : str, iterable, FeatureDB instance
            If FeatureDB, all data will be used. If string, assume it's
            a filename of a GFF or GTF file.  Otherwise, assume it's an
            iterable of Feature objects.  The classes in gffutils.iterators may
            be helpful in this case.

        make_backup : bool
            If True, and the database you're about to update is a file on disk,
            makes a copy of the existing database and saves it with a .bak
            extension.

        Notes
        -----
        Other kwargs are used in the same way as in gffutils.create_db; see the
        help for that function for details.

        Returns
        -------
        FeatureDB with updated features.
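
        Examples
        --------
        A minimal usage sketch (the filenames here are illustrative)::

            import gffutils
            db = gffutils.FeatureDB('annotations.db')
            db = db.update('new_features.gff3', make_backup=True)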
        """
        from gffutils import create
        from gffutils import iterators
        if make_backup:
            if isinstance(self.dbfn, six.string_types):
                shutil.copy2(self.dbfn, self.dbfn + '.bak')

        # get iterator-specific kwargs
        _iterator_kwargs = {}
        for k, v in kwargs.items():
            if k in constants._iterator_kwargs:
                _iterator_kwargs[k] = v

        # Handle all sorts of input
        data = iterators.DataIterator(data, **_iterator_kwargs)

        if self.dialect['fmt'] == 'gtf':
            if 'id_spec' not in kwargs:
                kwargs['id_spec'] = {
                    'gene': 'gene_id', 'transcript': 'transcript_id'}
            db = create._GTFDBCreator(
                data=data, dbfn=self.dbfn, dialect=self.dialect, **kwargs)
        elif self.dialect['fmt'] == 'gff3':
            if 'id_spec' not in kwargs:
                kwargs['id_spec'] = 'ID'
            db = create._GFFDBCreator(
                data=data, dbfn=self.dbfn, dialect=self.dialect, **kwargs)

        else:
            raise ValueError(
                "Don't know how to update a database with dialect %r"
                % self.dialect['fmt'])

        db._populate_from_lines(data)
        db._update_relations()
        db._finalize()
        return db
Example #2
    def __init__(self, data, dbfn, force=False, verbose=False, id_spec=None,
                 merge_strategy='merge', checklines=10, transform=None,
                 force_dialect_check=False, from_string=False, dialect=None,
                 default_encoding='utf-8',
                 infer_gene_extent=True,
                 force_merge_fields=None,
                 text_factory=sqlite3.OptimizedUnicode,
                 pragmas=constants.default_pragmas):
        """
        Base class for _GFFDBCreator and _GTFDBCreator; see create_db()
        function for docs
        """
        if force_merge_fields is None:
            force_merge_fields = []
        if merge_strategy == 'merge':
            if set(['start', 'end']).intersection(force_merge_fields):
                raise ValueError("Can't merge start/end fields since "
                                 "they must be integers")
            warn = set(force_merge_fields)\
                .intersection(['frame', 'strand'])
            for w in warn:
                warnings.warn(
                    "%s field will be merged for features with the same ID; "
                    "this may result in unusable features." % w)

        self.force_merge_fields = force_merge_fields
        self.pragmas = pragmas
        self.merge_strategy = merge_strategy
        self.default_encoding = default_encoding
        self.infer_gene_extent = infer_gene_extent
        self._autoincrements = collections.defaultdict(int)
        if force:
            if os.path.exists(dbfn):
                os.unlink(dbfn)
        self.dbfn = dbfn
        self.id_spec = id_spec
        if isinstance(dbfn, six.string_types):
            conn = sqlite3.connect(dbfn)
        else:
            conn = dbfn
        self.conn = conn
        self.conn.row_factory = sqlite3.Row
        self.set_verbose(verbose)

        if text_factory is not None:
            if self.verbose == 'debug':
                logger.debug('setting text factory to %s' % text_factory)
            self.conn.text_factory = text_factory
        self._data = data

        self._orig_logger_level = logger.level

        self.iterator = iterators.DataIterator(
            data=data, checklines=checklines, transform=transform,
            force_dialect_check=force_dialect_check, from_string=from_string,
            dialect=dialect
        )
Example #3
def create_db(data, dbfn, id_spec=None, force=False, verbose=False,
              checklines=10, merge_strategy='error', transform=None,
              gtf_transcript_key='transcript_id', gtf_gene_key='gene_id',
              gtf_subfeature='exon', force_gff=False,
              force_dialect_check=False, from_string=False, keep_order=False,
              text_factory=sqlite3.OptimizedUnicode, infer_gene_extent=True,
              force_merge_fields=None, pragmas=constants.default_pragmas,
              sort_attribute_values=False):
    """
    Create a database from a GFF or GTF file.

    Parameters
    ----------
    data : string or iterable

        If a string (and `from_string` is False), then `data` is the path to
        the original GFF or GTF file.

        If a string and `from_string` is True, then assume `data` is the actual
        data to use.

        Otherwise, it's an iterable of Feature objects.

    dbfn : string

        Path to the database that will be created.  Can be the special string
        ":memory:" to create an in-memory database.

    id_spec : string, list, dict, callable, or None

        This parameter guides what will be used as the primary key for the
        database, which in turn determines how you will access individual
        features by name from the database.

        If `id_spec=None`, then auto-increment primary keys based on the
        feature type (e.g., "gene_1", "gene_2").  This is also the fallback
        behavior for the other values below.

        If `id_spec` is a string, then look for this key in the attributes.  If
        it exists, then use its value as the primary key, otherwise
        autoincrement based on the feature type.  For many GFF3 files, "ID"
        usually works well.

        If `id_spec` is a list or tuple of keys, then check for each one in
        order, using the first one found.  For GFF3, this might be ["ID",
        "Name"], which would use the ID if it exists, otherwise the Name,
        otherwise autoincrement based on the feature type.

        If `id_spec` is a dictionary, then it is a mapping of feature types to
        what should be used as the ID.  For example, for GTF files, `{'gene':
        'gene_id', 'transcript': 'transcript_id'}` may be useful.  The values
        of this dictionary can also be a list, e.g., `{'gene': ['gene_id',
        'geneID']}`

        If `id_spec` is a callable object, then it accepts a dictionary from
        the iterator and returns one of the following:

            * None (in which case the feature type will be auto-incremented)
            * string (which will be used as the primary key)
            * special string starting with "autoincrement:X", where "X" is
              a string that will be used for auto-incrementing.  For example,
              if "autoincrement:chr10", then the first feature will be
              "chr10_1", the second "chr10_2", and so on.

    force : bool

        If `False` (default), then raise an exception if `dbfn` already exists.
        Use `force=True` to overwrite any existing databases.

    verbose : bool

        Report percent complete and other feedback on how the db creation is
        progressing.

        In order to report percent complete, the entire file needs to be read
        once to see how many items there are; for large files you may want to
        use `verbose=False` to avoid this.

    checklines : int

        Number of lines to examine when inferring the dialect.

    merge_strategy : { "merge", "create_unique", "error", "warning" }

        This parameter specifies the behavior when two items have an identical
        primary key.

        Using `merge_strategy="merge"`, then there will be a single entry in
        the database, but the attributes of all features with the same primary
        key will be merged.

        Using `merge_strategy="create_unique"`, then the first entry will use
        the original primary key, but the second entry will have a unique,
        autoincremented primary key assigned to it

        Using `merge_strategy="error"`, a :class:`gffutils.DuplicateID`
        exception will be raised.  This means you will have to edit the file
        yourself to fix the duplicated IDs.

        Using `merge_strategy="warning"`, a warning will be printed to the
        logger, and the duplicate feature will be skipped.
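
        As an illustration of "merge" (a hedged sketch; the values are made
        up): two GFF lines sharing `ID=gene1`, one with `Name=abc` and the
        other with `Name=xyz`, would collapse into a single database entry
        whose `Name` attribute holds both values.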

    transform : callable

        Function (or other callable object) that accepts a dictionary and
        returns a dictionary.
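
        A minimal sketch (assuming the dictionary carries a 'featuretype'
        key, as produced by the iterators in this package)::

            def lowercase_featuretype(d):
                d['featuretype'] = d['featuretype'].lower()
                return d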

    gtf_transcript_key, gtf_gene_key : string

        Which attribute to use as the transcript ID and gene ID respectively
        for GTF files.  Default is `transcript_id` and `gene_id` according to
        the GTF spec.

    gtf_subfeature : string

        Feature type to use as a "gene component" when inferring gene and
        transcript extents for GTF files.  Default is `exon` according to the
        GTF spec.

    force_gff : bool
        If True, do not do automatic format detection -- only use GFF.

    force_dialect_check : bool
        If True, the dialect will be checked for every feature (instead of just
        `checklines` features).  This can be slow, but may be necessary for
        inconsistently-formatted input files.

    from_string : bool
        If True, then treat `data` as actual data (rather than the path to
        a file).


    keep_order : bool

        If True, all features returned from this instance will have the
        order of their attributes maintained.  This can be turned on or off
        database-wide by setting the `keep_order` attribute or with this
        kwarg, or on a feature-by-feature basis by setting the `keep_order`
        attribute of an individual feature.

        Default is False, since this includes a sorting step that can get
        time-consuming for many features.

    infer_gene_extent : bool
        Only used for GTF files, set this to False in order to disable the
        inference of gene and transcript extents.  Use this if you don't care
        about having gene and transcript features in the database, or if the
        input GTF file already has "gene" and "transcript" featuretypes.

    force_merge_fields : list
        If merge_strategy="merge", then features will only be merged if their
        non-attribute values are identical (same chrom, source, start, stop,
        score, strand, phase).  Using `force_merge_fields`, you can override
        this behavior to allow merges even when fields are different.  This
        list can contain one or more of ['seqid', 'source', 'featuretype',
        'score', 'strand', 'frame'].  The resulting merged fields will be
        strings of comma-separated values.  Note that 'start' and 'end' are not
        available, since these fields need to be integers.
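
        A hedged sketch of the kwarg (filenames are illustrative)::

            db = create_db('in.gff3', 'out.db', merge_strategy='merge',
                           force_merge_fields=['source'])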

    text_factory : callable
        Text factory to use for the sqlite3 database.  See
        https://docs.python.org/2/library/sqlite3.html#sqlite3.Connection.text_factory
        for details. The default sqlite3.OptimizedUnicode will return Unicode
        objects only for non-ASCII data, and bytestrings otherwise.

    pragmas : dict
        Dictionary of pragmas used when creating the sqlite3 database. See
        http://www.sqlite.org/pragma.html for a list of available pragmas.  The
        defaults are stored in constants.default_pragmas, which can be used as
        a template for supplying a custom dictionary.
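
        For example, a sketch of customizing the defaults (the pragma value
        shown is only an illustration)::

            from gffutils import constants
            my_pragmas = dict(constants.default_pragmas)
            my_pragmas['synchronous'] = 'OFF'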

    sort_attribute_values : bool
        All features returned from the database will have their attribute
        values sorted.  Typically this is only useful for testing, since this
        can get time-consuming for large numbers of features.
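
    Examples
    --------
    A minimal sketch (the filenames are illustrative)::

        import gffutils
        db = gffutils.create_db('annotations.gff3', 'annotations.db',
                                force=True)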
    """
    kwargs = dict(
        data=data, checklines=checklines, transform=transform,
        force_dialect_check=force_dialect_check, from_string=from_string)

    # First construct an iterator so that we can identify the file format.
    # DataIterator figures out what kind of data was provided (string of lines,
    # filename, or iterable of Features) and checks `checklines` lines to
    # identify the dialect.
    iterator = iterators.DataIterator(**kwargs)
    dialect = iterator.dialect

    if isinstance(iterator, iterators.FeatureIterator):
        # However, a side-effect of this is that if `data` was a generator,
        # then we've just consumed `checklines` items (see
        # iterators.BaseIterator.__init__, which calls iterators.peek).
        #
        # But it also chains those consumed items back onto the beginning, and
        # the result is available as iterator._iter.
        #
        # That's what we should be using now for `data`:
        kwargs['data'] = iterator._iter

    # Since we've already checked lines, we don't want to do it again
    kwargs['checklines'] = 0

    if force_gff or (dialect['fmt'] == 'gff3'):
        cls = _GFFDBCreator
        id_spec = id_spec or 'ID'
        add_kwargs = {}
    elif dialect['fmt'] == 'gtf':
        cls = _GTFDBCreator
        id_spec = id_spec or {'gene': 'gene_id', 'transcript': 'transcript_id'}
        add_kwargs = dict(
            transcript_key=gtf_transcript_key,
            gene_key=gtf_gene_key,
            subfeature=gtf_subfeature)
    else:
        raise ValueError("Unknown dialect format: %r" % dialect['fmt'])

    kwargs.update(**add_kwargs)
    kwargs['dialect'] = dialect
    c = cls(dbfn=dbfn, id_spec=id_spec, force=force, verbose=verbose,
            merge_strategy=merge_strategy, text_factory=text_factory,
            infer_gene_extent=infer_gene_extent,
            force_merge_fields=force_merge_fields, pragmas=pragmas, **kwargs)

    c.create()
    if dbfn == ':memory:':
        db = interface.FeatureDB(c.conn, keep_order=keep_order,
                                 pragmas=pragmas,
                                 sort_attribute_values=sort_attribute_values,
                                 text_factory=text_factory)
    else:
        db = interface.FeatureDB(c, keep_order=keep_order, pragmas=pragmas,
                                 sort_attribute_values=sort_attribute_values,
                                 text_factory=text_factory)

    return db


def inspect(data, look_for=['featuretype', 'chrom', 'attribute_keys',
                            'feature_count'], limit=None, verbose=True):
    """
    Inspect a GFF or GTF data source.

    This function is useful for figuring out the different featuretypes found
    in a file (for potential removal before creating a FeatureDB).

    Returns a dictionary with a key for each item in `look_for` and
    a corresponding value that is a dictionary of how many of each unique item
    were found.

    There will always be a `feature_count` key, indicating how many features
    were looked at (if `limit` is provided, then `feature_count` will be the
    same as `limit`).

    For example, if `look_for` is ['chrom', 'featuretype'], then the result
    will be a dictionary like::

        {
            'chrom': {
                'chr1': 500,
                'chr2': 435,
                'chr3': 200,
                ...
                ...
            },

            'featuretype': {
                'gene': 150,
                'exon': 324,
                ...
            },

            'feature_count': 5000

        }


    Parameters
    ----------
    data : str, FeatureDB instance, or iterator of Features
        If `data` is a string, assume it's a GFF or GTF filename.  If it's
        a FeatureDB instance, then its `all_features()` method will be
        automatically called. Otherwise, assume it's an iterable of Feature
        objects.

    look_for : list
        List of things to keep track of. Options are:

            - any attribute of a Feature object, such as chrom, source, start,
              stop, strand.

            - "attribute_keys", which will look at all the individual
              attribute keys of each feature

    limit : int
        Number of features to look at.  Default is no limit.

    verbose : bool
        Report how many features have been processed.

    Returns
    -------
    dict
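
    Examples
    --------
    A minimal sketch (the filename is illustrative)::

        info = inspect('annotations.gff3', limit=1000)
        print(info['featuretype'])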
    """

    results = {}
    obj_attrs = []
    for i in look_for:
        if i not in ['attribute_keys', 'feature_count']:
            obj_attrs.append(i)
        results[i] = Counter()

    attr_keys = 'attribute_keys' in look_for

    d = iterators.DataIterator(data)
    feature_count = 0
    for f in d:
        if verbose:
            sys.stderr.write('\r%s features inspected' % feature_count)
            sys.stderr.flush()

        for obj_attr in obj_attrs:
            results[obj_attr].update([getattr(f, obj_attr)])

        if attr_keys:
            results['attribute_keys'].update(f.attributes.keys())

        feature_count += 1
        if limit and feature_count == limit:
            break

    new_results = {}
    for k, v in results.items():
        new_results[k] = dict(v)

    new_results['feature_count'] = feature_count
    return new_results