def update(self, data, make_backup=True, **kwargs):
    """
    Update database with features in `data`.

    data : str, iterable, FeatureDB instance
        If FeatureDB, all data will be used. If string, assume it's
        a filename of a GFF or GTF file.  Otherwise, assume it's an
        iterable of Feature objects.  The classes in gffutils.iterators
        may be helpful in this case.

    make_backup : bool
        If True, and the database you're about to update is a file on
        disk, makes a copy of the existing database and saves it with
        a .bak extension.

    Notes
    -----
    Other kwargs are used in the same way as in gffutils.create_db; see
    the help for that function for details.

    Returns
    -------
    FeatureDB with updated features.
    """
    from gffutils import create
    from gffutils import iterators

    if make_backup:
        if isinstance(self.dbfn, six.string_types):
            shutil.copy2(self.dbfn, self.dbfn + '.bak')

    # get iterator-specific kwargs
    _iterator_kwargs = {}
    for k, v in kwargs.items():
        if k in constants._iterator_kwargs:
            _iterator_kwargs[k] = v

    # Handle all sorts of input
    data = iterators.DataIterator(data, **_iterator_kwargs)

    if self.dialect['fmt'] == 'gtf':
        if 'id_spec' not in kwargs:
            kwargs['id_spec'] = {
                'gene': 'gene_id', 'transcript': 'transcript_id'}
        db = create._GTFDBCreator(
            data=data, dbfn=self.dbfn, dialect=self.dialect, **kwargs)

    elif self.dialect['fmt'] == 'gff3':
        if 'id_spec' not in kwargs:
            kwargs['id_spec'] = 'ID'
        db = create._GFFDBCreator(
            data=data, dbfn=self.dbfn, dialect=self.dialect, **kwargs)

    else:
        raise ValueError

    db._populate_from_lines(data)
    db._update_relations()
    db._finalize()
    return db
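# Hedged usage sketch (not part of the library source): updating an existing
# FeatureDB with features from another file.  "existing.db" and
# "extra_features.gff3" are hypothetical filenames; FeatureDB is assumed to
# be importable from the top-level gffutils package.
import gffutils

db = gffutils.FeatureDB('existing.db')
# make_backup=True copies existing.db to existing.db.bak before updating;
# remaining kwargs (e.g. merge_strategy) are passed through as in create_db().
db = db.update('extra_features.gff3', make_backup=True,
               merge_strategy='merge')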
def __init__(self, data, dbfn, force=False, verbose=False, id_spec=None,
             merge_strategy='merge', checklines=10, transform=None,
             force_dialect_check=False, from_string=False, dialect=None,
             default_encoding='utf-8', infer_gene_extent=True,
             force_merge_fields=None,
             text_factory=sqlite3.OptimizedUnicode,
             pragmas=constants.default_pragmas):
    """
    Base class for _GFFDBCreator and _GTFDBCreator; see create_db()
    function for docs
    """
    if force_merge_fields is None:
        force_merge_fields = []

    if merge_strategy == 'merge':
        if set(['start', 'end']).intersection(force_merge_fields):
            raise ValueError("Can't merge start/end fields since "
                             "they must be integers")
        warn = set(force_merge_fields).intersection(['frame', 'strand'])
        for w in warn:
            warnings.warn(
                "%s field will be merged for features with the same ID; "
                "this may result in unusable features." % w)

    self.force_merge_fields = force_merge_fields
    self.pragmas = pragmas
    self.merge_strategy = merge_strategy
    self.default_encoding = default_encoding
    self.infer_gene_extent = infer_gene_extent
    self._autoincrements = collections.defaultdict(int)
    if force:
        if os.path.exists(dbfn):
            os.unlink(dbfn)
    self.dbfn = dbfn
    self.id_spec = id_spec
    if isinstance(dbfn, six.string_types):
        conn = sqlite3.connect(dbfn)
    else:
        conn = dbfn
    self.conn = conn
    self.conn.row_factory = sqlite3.Row
    self.set_verbose(verbose)

    if text_factory is not None:
        if self.verbose == 'debug':
            logger.debug('setting text factory to %s' % text_factory)
        self.conn.text_factory = text_factory
    self._data = data

    self._orig_logger_level = logger.level

    self.iterator = iterators.DataIterator(
        data=data, checklines=checklines, transform=transform,
        force_dialect_check=force_dialect_check, from_string=from_string,
        dialect=dialect
    )
def create_db(data, dbfn, id_spec=None, force=False, verbose=False,
              checklines=10, merge_strategy='error', transform=None,
              gtf_transcript_key='transcript_id', gtf_gene_key='gene_id',
              gtf_subfeature='exon', force_gff=False,
              force_dialect_check=False, from_string=False, keep_order=False,
              text_factory=sqlite3.OptimizedUnicode, infer_gene_extent=True,
              force_merge_fields=None, pragmas=constants.default_pragmas,
              sort_attribute_values=False):
    """
    Create a database from a GFF or GTF file.

    Parameters
    ----------
    data : string or iterable

        If a string (and `from_string` is False), then `data` is the path to
        the original GFF or GTF file.

        If a string and `from_string` is True, then assume `data` is the
        actual data to use.

        Otherwise, it's an iterable of Feature objects.

    dbfn : string

        Path to the database that will be created.  Can be the special string
        ":memory:" to create an in-memory database.

    id_spec : string, list, dict, callable, or None

        This parameter guides what will be used as the primary key for the
        database, which in turn determines how you will access individual
        features by name from the database.

        If `id_spec=None`, then auto-increment primary keys based on the
        feature type (e.g., "gene_1", "gene_2").  This is also the fallback
        behavior for the other values below.

        If `id_spec` is a string, then look for this key in the attributes.
        If it exists, then use its value as the primary key, otherwise
        autoincrement based on the feature type.  For many GFF3 files, "ID"
        usually works well.

        If `id_spec` is a list or tuple of keys, then check for each one in
        order, using the first one found.  For GFF3, this might be
        ["ID", "Name"], which would use the ID if it exists, otherwise the
        Name, otherwise autoincrement based on the feature type.

        If `id_spec` is a dictionary, then it is a mapping of feature types
        to what should be used as the ID.  For example, for GTF files,
        `{'gene': 'gene_id', 'transcript': 'transcript_id'}` may be useful.
        The values of this dictionary can also be a list, e.g.,
        `{'gene': ['gene_id', 'geneID']}`

        If `id_spec` is a callable object, then it accepts a dictionary from
        the iterator and returns one of the following:

            * None (in which case the feature type will be auto-incremented)
            * string (which will be used as the primary key)
            * special string starting with "autoincrement:X", where "X" is
              a string that will be used for auto-incrementing.  For example,
              if "autoincrement:chr10", then the first feature will be
              "chr10_1", the second "chr10_2", and so on.

    force : bool

        If `False` (default), then raise an exception if `dbfn` already
        exists.  Use `force=True` to overwrite any existing databases.

    verbose : bool

        Report percent complete and other feedback on how the db creation is
        progressing.

        In order to report percent complete, the entire file needs to be read
        once to see how many items there are; for large files you may want to
        use `verbose=False` to avoid this.

    checklines : int

        Number of lines to check the dialect.

    merge_strategy : { "merge", "create_unique", "error", "warning" }

        This parameter specifies the behavior when two items have an
        identical primary key.

        Using `merge_strategy="merge"`, then there will be a single entry in
        the database, but the attributes of all features with the same
        primary key will be merged.

        Using `merge_strategy="create_unique"`, then the first entry will
        use the original primary key, but the second entry will have a
        unique, autoincremented primary key assigned to it.

        Using `merge_strategy="error"`, a :class:`gffutils.DuplicateID`
        exception will be raised.
        This means you will have to edit the file yourself to fix the
        duplicated IDs.

        Using `merge_strategy="warning"`, a warning will be printed to the
        logger, and the duplicate feature will be skipped.

    transform : callable

        Function (or other callable object) that accepts a dictionary and
        returns a dictionary.

    gtf_transcript_key, gtf_gene_key : string

        Which attribute to use as the transcript ID and gene ID respectively
        for GTF files.  Default is `transcript_id` and `gene_id` according to
        the GTF spec.

    gtf_subfeature : string

        Feature type to use as a "gene component" when inferring gene and
        transcript extents for GTF files.  Default is `exon` according to the
        GTF spec.

    force_gff : bool
        If True, do not do automatic format detection -- only use GFF.

    force_dialect_check : bool
        If True, the dialect will be checked for every feature (instead of
        just `checklines` features).  This can be slow, but may be necessary
        for inconsistently-formatted input files.

    from_string : bool
        If True, then treat `data` as actual data (rather than the path to
        a file).

    keep_order : bool

        If True, all features returned from this instance will have the
        order of their attributes maintained.  This can be turned on or off
        database-wide by setting the `keep_order` attribute or with this
        kwarg, or on a feature-by-feature basis by setting the `keep_order`
        attribute of an individual feature.

        Default is False, since this includes a sorting step that can get
        time-consuming for many features.

    infer_gene_extent : bool
        Only used for GTF files, set this to False in order to disable the
        inference of gene and transcript extents.  Use this if you don't
        care about having gene and transcript features in the database, or
        if the input GTF file already has "gene" and "transcript"
        featuretypes.

    force_merge_fields : list
        If merge_strategy="merge", then features will only be merged if
        their non-attribute values are identical (same chrom, source, start,
        stop, score, strand, phase).  Using `force_merge_fields`, you can
        override this behavior to allow merges even when fields are
        different.  This list can contain one or more of ['seqid', 'source',
        'featuretype', 'score', 'strand', 'frame'].  The resulting merged
        fields will be strings of comma-separated values.  Note that 'start'
        and 'end' are not available, since these fields need to be integers.

    text_factory : callable
        Text factory to use for the sqlite3 database.  See
        https://docs.python.org/2/library/sqlite3.html#sqlite3.Connection.text_factory
        for details.  The default sqlite3.OptimizedUnicode will return
        Unicode objects only for non-ASCII data, and bytestrings otherwise.

    pragmas : dict
        Dictionary of pragmas used when creating the sqlite3 database.  See
        http://www.sqlite.org/pragma.html for a list of available pragmas.
        The defaults are stored in constants.default_pragmas, which can be
        used as a template for supplying a custom dictionary.

    sort_attribute_values : bool
        All features returned from the database will have their attribute
        values sorted.  Typically this is only useful for testing, since this
        can get time-consuming for large numbers of features.
    """
    kwargs = dict(
        data=data, checklines=checklines, transform=transform,
        force_dialect_check=force_dialect_check, from_string=from_string)

    # First construct an iterator so that we can identify the file format.
    # DataIterator figures out what kind of data was provided (string of
    # lines, filename, or iterable of Features) and checks `checklines`
    # lines to identify the dialect.
    iterator = iterators.DataIterator(**kwargs)
    dialect = iterator.dialect

    if isinstance(iterator, iterators.FeatureIterator):
        # However, a side-effect of this is that if `data` was a generator,
        # then we've just consumed `checklines` items (see
        # iterators.BaseIterator.__init__, which calls iterators.peek).
        #
        # But it also chains those consumed items back onto the beginning,
        # and the result is available as iterator._iter.
        #
        # That's what we should be using now for `data`:
        kwargs['data'] = iterator._iter

    # Since we've already checked lines, we don't want to do it again
    kwargs['checklines'] = 0

    if force_gff or (dialect['fmt'] == 'gff3'):
        cls = _GFFDBCreator
        id_spec = id_spec or 'ID'
        add_kwargs = {}

    elif dialect['fmt'] == 'gtf':
        cls = _GTFDBCreator
        id_spec = id_spec or {'gene': 'gene_id',
                              'transcript': 'transcript_id'}
        add_kwargs = dict(
            transcript_key=gtf_transcript_key,
            gene_key=gtf_gene_key,
            subfeature=gtf_subfeature)

    kwargs.update(**add_kwargs)
    kwargs['dialect'] = dialect
    c = cls(dbfn=dbfn, id_spec=id_spec, force=force, verbose=verbose,
            merge_strategy=merge_strategy, text_factory=text_factory,
            infer_gene_extent=infer_gene_extent,
            force_merge_fields=force_merge_fields, pragmas=pragmas,
            **kwargs)

    c.create()
    if dbfn == ':memory:':
        db = interface.FeatureDB(c.conn, keep_order=keep_order,
                                 pragmas=pragmas,
                                 sort_attribute_values=sort_attribute_values,
                                 text_factory=text_factory)
    else:
        db = interface.FeatureDB(c, keep_order=keep_order, pragmas=pragmas,
                                 sort_attribute_values=sort_attribute_values,
                                 text_factory=text_factory)
    return db
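# Hedged usage sketch (not part of the library source): a typical call to
# create_db() on a hypothetical GFF3 file, matching the parameters documented
# above.  "annotation.gff3" and "annotation.db" are placeholder filenames.
import gffutils

db = gffutils.create_db(
    'annotation.gff3',
    dbfn='annotation.db',
    force=True,               # overwrite annotation.db if it already exists
    id_spec='ID',             # use the GFF3 "ID" attribute as the primary key
    merge_strategy='merge',   # merge attributes of features sharing an ID
)
# The returned FeatureDB can then be queried, e.g.:
print(db.count_features_of_type('gene'))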
def inspect(data, look_for=['featuretype', 'chrom', 'attribute_keys',
                            'feature_count'], limit=None, verbose=True):
    """
    Inspect a GFF or GTF data source.

    This function is useful for figuring out the different featuretypes
    found in a file (for potential removal before creating a FeatureDB).

    Returns a dictionary with a key for each item in `look_for` and
    a corresponding value that is a dictionary of how many of each unique
    item were found.

    There will always be a `feature_count` key, indicating how many features
    were looked at (if `limit` is provided, then `feature_count` will be the
    same as `limit`).

    For example, if `look_for` is ['chrom', 'featuretype'], then the result
    will be a dictionary like::

        {
            'chrom': {
                'chr1': 500,
                'chr2': 435,
                'chr3': 200,
                ...
                ...
            },

            'featuretype': {
                'gene': 150,
                'exon': 324,
                ...
            },

            'feature_count': 5000
        }


    Parameters
    ----------
    data : str, FeatureDB instance, or iterator of Features
        If `data` is a string, assume it's a GFF or GTF filename.  If it's
        a FeatureDB instance, then its `all_features()` method will be
        automatically called.  Otherwise, assume it's an iterable of Feature
        objects.

    look_for : list
        List of things to keep track of.  Options are:

            - any attribute of a Feature object, such as chrom, source,
              start, stop, strand.

            - "attribute_keys", which will look at all the individual
              attribute keys of each feature

    limit : int
        Number of features to look at.  Default is no limit.

    verbose : bool
        Report how many features have been processed.

    Returns
    -------
    dict
    """
    results = {}
    obj_attrs = []
    for i in look_for:
        if i not in ['attribute_keys', 'feature_count']:
            obj_attrs.append(i)
        results[i] = Counter()

    attr_keys = 'attribute_keys' in look_for

    d = iterators.DataIterator(data)
    feature_count = 0
    for f in d:
        if verbose:
            sys.stderr.write('\r%s features inspected' % feature_count)
            sys.stderr.flush()

        for obj_attr in obj_attrs:
            results[obj_attr].update([getattr(f, obj_attr)])

        if attr_keys:
            results['attribute_keys'].update(f.attributes.keys())

        feature_count += 1
        if limit and feature_count == limit:
            break

    new_results = {}
    for k, v in results.items():
        new_results[k] = dict(v)

    new_results['feature_count'] = feature_count
    return new_results
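# Hedged usage sketch (not part of the library source): calling inspect() as
# defined above to summarize a hypothetical "annotation.gtf" before building
# a database.  limit=10000 keeps the scan fast on a large file.
report = inspect('annotation.gtf',
                 look_for=['featuretype', 'chrom'],
                 limit=10000, verbose=False)
print(report['featuretype'])    # e.g. {'exon': 8000, 'CDS': 1500, ...}
print(report['feature_count'])  # equals the limit if one was given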